1 // SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause 2 /* 3 * Copyright(c) 2015 - 2020 Intel Corporation. 4 */ 5 6 #include <linux/topology.h> 7 #include <linux/cpumask.h> 8 #include <linux/interrupt.h> 9 #include <linux/numa.h> 10 11 #include "hfi.h" 12 #include "affinity.h" 13 #include "sdma.h" 14 #include "trace.h" 15 16 struct hfi1_affinity_node_list node_affinity = { 17 .list = LIST_HEAD_INIT(node_affinity.list), 18 .lock = __MUTEX_INITIALIZER(node_affinity.lock) 19 }; 20 21 /* Name of IRQ types, indexed by enum irq_type */ 22 static const char * const irq_type_names[] = { 23 "SDMA", 24 "RCVCTXT", 25 "NETDEVCTXT", 26 "GENERAL", 27 "OTHER", 28 }; 29 30 /* Per NUMA node count of HFI devices */ 31 static unsigned int *hfi1_per_node_cntr; 32 33 static inline void init_cpu_mask_set(struct cpu_mask_set *set) 34 { 35 cpumask_clear(&set->mask); 36 cpumask_clear(&set->used); 37 set->gen = 0; 38 } 39 40 /* Increment generation of CPU set if needed */ 41 static void _cpu_mask_set_gen_inc(struct cpu_mask_set *set) 42 { 43 if (cpumask_equal(&set->mask, &set->used)) { 44 /* 45 * We've used up all the CPUs, bump up the generation 46 * and reset the 'used' map 47 */ 48 set->gen++; 49 cpumask_clear(&set->used); 50 } 51 } 52 53 static void _cpu_mask_set_gen_dec(struct cpu_mask_set *set) 54 { 55 if (cpumask_empty(&set->used) && set->gen) { 56 set->gen--; 57 cpumask_copy(&set->used, &set->mask); 58 } 59 } 60 61 /* Get the first CPU from the list of unused CPUs in a CPU set data structure */ 62 static int cpu_mask_set_get_first(struct cpu_mask_set *set, cpumask_var_t diff) 63 { 64 int cpu; 65 66 if (!diff || !set) 67 return -EINVAL; 68 69 _cpu_mask_set_gen_inc(set); 70 71 /* Find out CPUs left in CPU mask */ 72 cpumask_andnot(diff, &set->mask, &set->used); 73 74 cpu = cpumask_first(diff); 75 if (cpu >= nr_cpu_ids) /* empty */ 76 cpu = -EINVAL; 77 else 78 cpumask_set_cpu(cpu, &set->used); 79 80 return cpu; 81 } 82 83 static void cpu_mask_set_put(struct cpu_mask_set *set, int cpu) 84 { 85 if (!set) 86 return; 87 88 cpumask_clear_cpu(cpu, &set->used); 89 _cpu_mask_set_gen_dec(set); 90 } 91 92 /* Initialize non-HT cpu cores mask */ 93 void init_real_cpu_mask(void) 94 { 95 int possible, curr_cpu, ht; 96 97 /* Start with cpu online mask as the real cpu mask */ 98 cpumask_copy(&node_affinity.real_cpu_mask, cpu_online_mask); 99 100 /* 101 * Remove HT cores from the real cpu mask. Do this in two steps below. 102 */ 103 possible = cpumask_weight(&node_affinity.real_cpu_mask); 104 ht = cpumask_weight(topology_sibling_cpumask( 105 cpumask_first(&node_affinity.real_cpu_mask))); 106 /* 107 * Step 1. Skip over the first N HT siblings and use them as the 108 * "real" cores. Assumes that HT cores are not enumerated in 109 * succession (except in the single core case). 110 */ 111 curr_cpu = cpumask_nth(possible / ht, &node_affinity.real_cpu_mask) + 1; 112 113 /* Step 2. Remove the remaining HT siblings. */ 114 cpumask_clear_cpus(&node_affinity.real_cpu_mask, curr_cpu, nr_cpu_ids - curr_cpu); 115 } 116 117 int node_affinity_init(void) 118 { 119 int node; 120 struct pci_dev *dev = NULL; 121 const struct pci_device_id *ids = hfi1_pci_tbl; 122 123 cpumask_clear(&node_affinity.proc.used); 124 cpumask_copy(&node_affinity.proc.mask, cpu_online_mask); 125 126 node_affinity.proc.gen = 0; 127 node_affinity.num_core_siblings = 128 cpumask_weight(topology_sibling_cpumask( 129 cpumask_first(&node_affinity.proc.mask) 130 )); 131 node_affinity.num_possible_nodes = num_possible_nodes(); 132 node_affinity.num_online_nodes = num_online_nodes(); 133 node_affinity.num_online_cpus = num_online_cpus(); 134 135 /* 136 * The real cpu mask is part of the affinity struct but it has to be 137 * initialized early. It is needed to calculate the number of user 138 * contexts in set_up_context_variables(). 139 */ 140 init_real_cpu_mask(); 141 142 hfi1_per_node_cntr = kcalloc(node_affinity.num_possible_nodes, 143 sizeof(*hfi1_per_node_cntr), GFP_KERNEL); 144 if (!hfi1_per_node_cntr) 145 return -ENOMEM; 146 147 while (ids->vendor) { 148 dev = NULL; 149 while ((dev = pci_get_device(ids->vendor, ids->device, dev))) { 150 node = pcibus_to_node(dev->bus); 151 if (node < 0) 152 goto out; 153 154 hfi1_per_node_cntr[node]++; 155 } 156 ids++; 157 } 158 159 return 0; 160 161 out: 162 /* 163 * Invalid PCI NUMA node information found, note it, and populate 164 * our database 1:1. 165 */ 166 pr_err("HFI: Invalid PCI NUMA node. Performance may be affected\n"); 167 pr_err("HFI: System BIOS may need to be upgraded\n"); 168 for (node = 0; node < node_affinity.num_possible_nodes; node++) 169 hfi1_per_node_cntr[node] = 1; 170 171 pci_dev_put(dev); 172 173 return 0; 174 } 175 176 static void node_affinity_destroy(struct hfi1_affinity_node *entry) 177 { 178 free_percpu(entry->comp_vect_affinity); 179 kfree(entry); 180 } 181 182 void node_affinity_destroy_all(void) 183 { 184 struct list_head *pos, *q; 185 struct hfi1_affinity_node *entry; 186 187 mutex_lock(&node_affinity.lock); 188 list_for_each_safe(pos, q, &node_affinity.list) { 189 entry = list_entry(pos, struct hfi1_affinity_node, 190 list); 191 list_del(pos); 192 node_affinity_destroy(entry); 193 } 194 mutex_unlock(&node_affinity.lock); 195 kfree(hfi1_per_node_cntr); 196 } 197 198 static struct hfi1_affinity_node *node_affinity_allocate(int node) 199 { 200 struct hfi1_affinity_node *entry; 201 202 entry = kzalloc_obj(*entry); 203 if (!entry) 204 return NULL; 205 entry->node = node; 206 entry->comp_vect_affinity = alloc_percpu(u16); 207 INIT_LIST_HEAD(&entry->list); 208 209 return entry; 210 } 211 212 /* 213 * It appends an entry to the list. 214 * It *must* be called with node_affinity.lock held. 215 */ 216 static void node_affinity_add_tail(struct hfi1_affinity_node *entry) 217 { 218 list_add_tail(&entry->list, &node_affinity.list); 219 } 220 221 /* It must be called with node_affinity.lock held */ 222 static struct hfi1_affinity_node *node_affinity_lookup(int node) 223 { 224 struct hfi1_affinity_node *entry; 225 226 list_for_each_entry(entry, &node_affinity.list, list) { 227 if (entry->node == node) 228 return entry; 229 } 230 231 return NULL; 232 } 233 234 static int per_cpu_affinity_get(cpumask_var_t possible_cpumask, 235 u16 __percpu *comp_vect_affinity) 236 { 237 int curr_cpu; 238 u16 cntr; 239 u16 prev_cntr; 240 int ret_cpu; 241 242 if (!possible_cpumask) { 243 ret_cpu = -EINVAL; 244 goto fail; 245 } 246 247 if (!comp_vect_affinity) { 248 ret_cpu = -EINVAL; 249 goto fail; 250 } 251 252 ret_cpu = cpumask_first(possible_cpumask); 253 if (ret_cpu >= nr_cpu_ids) { 254 ret_cpu = -EINVAL; 255 goto fail; 256 } 257 258 prev_cntr = *per_cpu_ptr(comp_vect_affinity, ret_cpu); 259 for_each_cpu(curr_cpu, possible_cpumask) { 260 cntr = *per_cpu_ptr(comp_vect_affinity, curr_cpu); 261 262 if (cntr < prev_cntr) { 263 ret_cpu = curr_cpu; 264 prev_cntr = cntr; 265 } 266 } 267 268 *per_cpu_ptr(comp_vect_affinity, ret_cpu) += 1; 269 270 fail: 271 return ret_cpu; 272 } 273 274 static int per_cpu_affinity_put_max(cpumask_var_t possible_cpumask, 275 u16 __percpu *comp_vect_affinity) 276 { 277 int curr_cpu; 278 int max_cpu; 279 u16 cntr; 280 u16 prev_cntr; 281 282 if (!possible_cpumask) 283 return -EINVAL; 284 285 if (!comp_vect_affinity) 286 return -EINVAL; 287 288 max_cpu = cpumask_first(possible_cpumask); 289 if (max_cpu >= nr_cpu_ids) 290 return -EINVAL; 291 292 prev_cntr = *per_cpu_ptr(comp_vect_affinity, max_cpu); 293 for_each_cpu(curr_cpu, possible_cpumask) { 294 cntr = *per_cpu_ptr(comp_vect_affinity, curr_cpu); 295 296 if (cntr > prev_cntr) { 297 max_cpu = curr_cpu; 298 prev_cntr = cntr; 299 } 300 } 301 302 *per_cpu_ptr(comp_vect_affinity, max_cpu) -= 1; 303 304 return max_cpu; 305 } 306 307 /* 308 * Non-interrupt CPUs are used first, then interrupt CPUs. 309 * Two already allocated cpu masks must be passed. 310 */ 311 static int _dev_comp_vect_cpu_get(struct hfi1_devdata *dd, 312 struct hfi1_affinity_node *entry, 313 cpumask_var_t non_intr_cpus, 314 cpumask_var_t available_cpus) 315 __must_hold(&node_affinity.lock) 316 { 317 int cpu; 318 struct cpu_mask_set *set = dd->comp_vect; 319 320 lockdep_assert_held(&node_affinity.lock); 321 if (!non_intr_cpus) { 322 cpu = -1; 323 goto fail; 324 } 325 326 if (!available_cpus) { 327 cpu = -1; 328 goto fail; 329 } 330 331 /* Available CPUs for pinning completion vectors */ 332 _cpu_mask_set_gen_inc(set); 333 cpumask_andnot(available_cpus, &set->mask, &set->used); 334 335 /* Available CPUs without SDMA engine interrupts */ 336 cpumask_andnot(non_intr_cpus, available_cpus, 337 &entry->def_intr.used); 338 339 /* If there are non-interrupt CPUs available, use them first */ 340 cpu = cpumask_first(non_intr_cpus); 341 342 /* Otherwise, use interrupt CPUs */ 343 if (cpu >= nr_cpu_ids) 344 cpu = cpumask_first(available_cpus); 345 346 if (cpu >= nr_cpu_ids) { /* empty */ 347 cpu = -1; 348 goto fail; 349 } 350 cpumask_set_cpu(cpu, &set->used); 351 352 fail: 353 return cpu; 354 } 355 356 static void _dev_comp_vect_cpu_put(struct hfi1_devdata *dd, int cpu) 357 { 358 struct cpu_mask_set *set = dd->comp_vect; 359 360 if (cpu < 0) 361 return; 362 363 cpu_mask_set_put(set, cpu); 364 } 365 366 /* _dev_comp_vect_mappings_destroy() is reentrant */ 367 static void _dev_comp_vect_mappings_destroy(struct hfi1_devdata *dd) 368 { 369 int i, cpu; 370 371 if (!dd->comp_vect_mappings) 372 return; 373 374 for (i = 0; i < dd->comp_vect_possible_cpus; i++) { 375 cpu = dd->comp_vect_mappings[i]; 376 _dev_comp_vect_cpu_put(dd, cpu); 377 dd->comp_vect_mappings[i] = -1; 378 hfi1_cdbg(AFFINITY, 379 "[%s] Release CPU %d from completion vector %d", 380 rvt_get_ibdev_name(&(dd)->verbs_dev.rdi), cpu, i); 381 } 382 383 kfree(dd->comp_vect_mappings); 384 dd->comp_vect_mappings = NULL; 385 } 386 387 /* 388 * This function creates the table for looking up CPUs for completion vectors. 389 * num_comp_vectors needs to have been initilized before calling this function. 390 */ 391 static int _dev_comp_vect_mappings_create(struct hfi1_devdata *dd, 392 struct hfi1_affinity_node *entry) 393 __must_hold(&node_affinity.lock) 394 { 395 int i, cpu, ret; 396 cpumask_var_t non_intr_cpus; 397 cpumask_var_t available_cpus; 398 399 lockdep_assert_held(&node_affinity.lock); 400 401 if (!zalloc_cpumask_var(&non_intr_cpus, GFP_KERNEL)) 402 return -ENOMEM; 403 404 if (!zalloc_cpumask_var(&available_cpus, GFP_KERNEL)) { 405 free_cpumask_var(non_intr_cpus); 406 return -ENOMEM; 407 } 408 409 dd->comp_vect_mappings = kzalloc_objs(*dd->comp_vect_mappings, 410 dd->comp_vect_possible_cpus); 411 if (!dd->comp_vect_mappings) { 412 ret = -ENOMEM; 413 goto fail; 414 } 415 for (i = 0; i < dd->comp_vect_possible_cpus; i++) 416 dd->comp_vect_mappings[i] = -1; 417 418 for (i = 0; i < dd->comp_vect_possible_cpus; i++) { 419 cpu = _dev_comp_vect_cpu_get(dd, entry, non_intr_cpus, 420 available_cpus); 421 if (cpu < 0) { 422 ret = -EINVAL; 423 goto fail; 424 } 425 426 dd->comp_vect_mappings[i] = cpu; 427 hfi1_cdbg(AFFINITY, 428 "[%s] Completion Vector %d -> CPU %d", 429 rvt_get_ibdev_name(&(dd)->verbs_dev.rdi), i, cpu); 430 } 431 432 free_cpumask_var(available_cpus); 433 free_cpumask_var(non_intr_cpus); 434 return 0; 435 436 fail: 437 free_cpumask_var(available_cpus); 438 free_cpumask_var(non_intr_cpus); 439 _dev_comp_vect_mappings_destroy(dd); 440 441 return ret; 442 } 443 444 int hfi1_comp_vectors_set_up(struct hfi1_devdata *dd) 445 { 446 int ret; 447 struct hfi1_affinity_node *entry; 448 449 mutex_lock(&node_affinity.lock); 450 entry = node_affinity_lookup(dd->node); 451 if (!entry) { 452 ret = -EINVAL; 453 goto unlock; 454 } 455 ret = _dev_comp_vect_mappings_create(dd, entry); 456 unlock: 457 mutex_unlock(&node_affinity.lock); 458 459 return ret; 460 } 461 462 void hfi1_comp_vectors_clean_up(struct hfi1_devdata *dd) 463 { 464 _dev_comp_vect_mappings_destroy(dd); 465 } 466 467 int hfi1_comp_vect_mappings_lookup(struct rvt_dev_info *rdi, int comp_vect) 468 { 469 struct hfi1_ibdev *verbs_dev = dev_from_rdi(rdi); 470 struct hfi1_devdata *dd = dd_from_dev(verbs_dev); 471 472 if (!dd->comp_vect_mappings) 473 return -EINVAL; 474 if (comp_vect >= dd->comp_vect_possible_cpus) 475 return -EINVAL; 476 477 return dd->comp_vect_mappings[comp_vect]; 478 } 479 480 /* 481 * It assumes dd->comp_vect_possible_cpus is available. 482 */ 483 static int _dev_comp_vect_cpu_mask_init(struct hfi1_devdata *dd, 484 struct hfi1_affinity_node *entry, 485 bool first_dev_init) 486 __must_hold(&node_affinity.lock) 487 { 488 int i, j, curr_cpu; 489 int possible_cpus_comp_vect = 0; 490 struct cpumask *dev_comp_vect_mask = &dd->comp_vect->mask; 491 492 lockdep_assert_held(&node_affinity.lock); 493 /* 494 * If there's only one CPU available for completion vectors, then 495 * there will only be one completion vector available. Othewise, 496 * the number of completion vector available will be the number of 497 * available CPUs divide it by the number of devices in the 498 * local NUMA node. 499 */ 500 if (cpumask_weight(&entry->comp_vect_mask) == 1) { 501 possible_cpus_comp_vect = 1; 502 dd_dev_warn(dd, 503 "Number of kernel receive queues is too large for completion vector affinity to be effective\n"); 504 } else { 505 possible_cpus_comp_vect += 506 cpumask_weight(&entry->comp_vect_mask) / 507 hfi1_per_node_cntr[dd->node]; 508 509 /* 510 * If the completion vector CPUs available doesn't divide 511 * evenly among devices, then the first device device to be 512 * initialized gets an extra CPU. 513 */ 514 if (first_dev_init && 515 cpumask_weight(&entry->comp_vect_mask) % 516 hfi1_per_node_cntr[dd->node] != 0) 517 possible_cpus_comp_vect++; 518 } 519 520 dd->comp_vect_possible_cpus = possible_cpus_comp_vect; 521 522 /* Reserving CPUs for device completion vector */ 523 for (i = 0; i < dd->comp_vect_possible_cpus; i++) { 524 curr_cpu = per_cpu_affinity_get(&entry->comp_vect_mask, 525 entry->comp_vect_affinity); 526 if (curr_cpu < 0) 527 goto fail; 528 529 cpumask_set_cpu(curr_cpu, dev_comp_vect_mask); 530 } 531 532 hfi1_cdbg(AFFINITY, 533 "[%s] Completion vector affinity CPU set(s) %*pbl", 534 rvt_get_ibdev_name(&(dd)->verbs_dev.rdi), 535 cpumask_pr_args(dev_comp_vect_mask)); 536 537 return 0; 538 539 fail: 540 for (j = 0; j < i; j++) 541 per_cpu_affinity_put_max(&entry->comp_vect_mask, 542 entry->comp_vect_affinity); 543 544 return curr_cpu; 545 } 546 547 /* 548 * It assumes dd->comp_vect_possible_cpus is available. 549 */ 550 static void _dev_comp_vect_cpu_mask_clean_up(struct hfi1_devdata *dd, 551 struct hfi1_affinity_node *entry) 552 __must_hold(&node_affinity.lock) 553 { 554 int i, cpu; 555 556 lockdep_assert_held(&node_affinity.lock); 557 if (!dd->comp_vect_possible_cpus) 558 return; 559 560 for (i = 0; i < dd->comp_vect_possible_cpus; i++) { 561 cpu = per_cpu_affinity_put_max(&dd->comp_vect->mask, 562 entry->comp_vect_affinity); 563 /* Clearing CPU in device completion vector cpu mask */ 564 if (cpu >= 0) 565 cpumask_clear_cpu(cpu, &dd->comp_vect->mask); 566 } 567 568 dd->comp_vect_possible_cpus = 0; 569 } 570 571 /* 572 * Interrupt affinity. 573 * 574 * non-rcv avail gets a default mask that 575 * starts as possible cpus with threads reset 576 * and each rcv avail reset. 577 * 578 * rcv avail gets node relative 1 wrapping back 579 * to the node relative 1 as necessary. 580 * 581 */ 582 int hfi1_dev_affinity_init(struct hfi1_devdata *dd) 583 { 584 struct hfi1_affinity_node *entry; 585 const struct cpumask *local_mask; 586 int curr_cpu, possible, i, ret; 587 bool new_entry = false; 588 589 local_mask = cpumask_of_node(dd->node); 590 if (cpumask_first(local_mask) >= nr_cpu_ids) 591 local_mask = topology_core_cpumask(0); 592 593 mutex_lock(&node_affinity.lock); 594 entry = node_affinity_lookup(dd->node); 595 596 /* 597 * If this is the first time this NUMA node's affinity is used, 598 * create an entry in the global affinity structure and initialize it. 599 */ 600 if (!entry) { 601 entry = node_affinity_allocate(dd->node); 602 if (!entry) { 603 dd_dev_err(dd, 604 "Unable to allocate global affinity node\n"); 605 ret = -ENOMEM; 606 goto fail; 607 } 608 new_entry = true; 609 610 init_cpu_mask_set(&entry->def_intr); 611 init_cpu_mask_set(&entry->rcv_intr); 612 cpumask_clear(&entry->comp_vect_mask); 613 cpumask_clear(&entry->general_intr_mask); 614 /* Use the "real" cpu mask of this node as the default */ 615 cpumask_and(&entry->def_intr.mask, &node_affinity.real_cpu_mask, 616 local_mask); 617 618 /* fill in the receive list */ 619 possible = cpumask_weight(&entry->def_intr.mask); 620 curr_cpu = cpumask_first(&entry->def_intr.mask); 621 622 if (possible == 1) { 623 /* only one CPU, everyone will use it */ 624 cpumask_set_cpu(curr_cpu, &entry->rcv_intr.mask); 625 cpumask_set_cpu(curr_cpu, &entry->general_intr_mask); 626 } else { 627 /* 628 * The general/control context will be the first CPU in 629 * the default list, so it is removed from the default 630 * list and added to the general interrupt list. 631 */ 632 cpumask_clear_cpu(curr_cpu, &entry->def_intr.mask); 633 cpumask_set_cpu(curr_cpu, &entry->general_intr_mask); 634 curr_cpu = cpumask_next(curr_cpu, 635 &entry->def_intr.mask); 636 637 /* 638 * Remove the remaining kernel receive queues from 639 * the default list and add them to the receive list. 640 */ 641 for (i = 0; 642 i < (dd->n_krcv_queues - 1) * 643 hfi1_per_node_cntr[dd->node]; 644 i++) { 645 cpumask_clear_cpu(curr_cpu, 646 &entry->def_intr.mask); 647 cpumask_set_cpu(curr_cpu, 648 &entry->rcv_intr.mask); 649 curr_cpu = cpumask_next(curr_cpu, 650 &entry->def_intr.mask); 651 if (curr_cpu >= nr_cpu_ids) 652 break; 653 } 654 655 /* 656 * If there ends up being 0 CPU cores leftover for SDMA 657 * engines, use the same CPU cores as general/control 658 * context. 659 */ 660 if (cpumask_empty(&entry->def_intr.mask)) 661 cpumask_copy(&entry->def_intr.mask, 662 &entry->general_intr_mask); 663 } 664 665 /* Determine completion vector CPUs for the entire node */ 666 cpumask_and(&entry->comp_vect_mask, 667 &node_affinity.real_cpu_mask, local_mask); 668 cpumask_andnot(&entry->comp_vect_mask, 669 &entry->comp_vect_mask, 670 &entry->rcv_intr.mask); 671 cpumask_andnot(&entry->comp_vect_mask, 672 &entry->comp_vect_mask, 673 &entry->general_intr_mask); 674 675 /* 676 * If there ends up being 0 CPU cores leftover for completion 677 * vectors, use the same CPU core as the general/control 678 * context. 679 */ 680 if (cpumask_empty(&entry->comp_vect_mask)) 681 cpumask_copy(&entry->comp_vect_mask, 682 &entry->general_intr_mask); 683 } 684 685 ret = _dev_comp_vect_cpu_mask_init(dd, entry, new_entry); 686 if (ret < 0) 687 goto fail; 688 689 if (new_entry) 690 node_affinity_add_tail(entry); 691 692 dd->affinity_entry = entry; 693 mutex_unlock(&node_affinity.lock); 694 695 return 0; 696 697 fail: 698 if (new_entry) 699 node_affinity_destroy(entry); 700 mutex_unlock(&node_affinity.lock); 701 return ret; 702 } 703 704 void hfi1_dev_affinity_clean_up(struct hfi1_devdata *dd) 705 { 706 struct hfi1_affinity_node *entry; 707 708 mutex_lock(&node_affinity.lock); 709 if (!dd->affinity_entry) 710 goto unlock; 711 entry = node_affinity_lookup(dd->node); 712 if (!entry) 713 goto unlock; 714 715 /* 716 * Free device completion vector CPUs to be used by future 717 * completion vectors 718 */ 719 _dev_comp_vect_cpu_mask_clean_up(dd, entry); 720 unlock: 721 dd->affinity_entry = NULL; 722 mutex_unlock(&node_affinity.lock); 723 } 724 725 /* 726 * Function updates the irq affinity hint for msix after it has been changed 727 * by the user using the /proc/irq interface. This function only accepts 728 * one cpu in the mask. 729 */ 730 static void hfi1_update_sdma_affinity(struct hfi1_msix_entry *msix, int cpu) 731 { 732 struct sdma_engine *sde = msix->arg; 733 struct hfi1_devdata *dd = sde->dd; 734 struct hfi1_affinity_node *entry; 735 struct cpu_mask_set *set; 736 int i, old_cpu; 737 738 if (cpu > num_online_cpus() || cpu == sde->cpu) 739 return; 740 741 mutex_lock(&node_affinity.lock); 742 entry = node_affinity_lookup(dd->node); 743 if (!entry) 744 goto unlock; 745 746 old_cpu = sde->cpu; 747 sde->cpu = cpu; 748 cpumask_clear(&msix->mask); 749 cpumask_set_cpu(cpu, &msix->mask); 750 dd_dev_dbg(dd, "IRQ: %u, type %s engine %u -> cpu: %d\n", 751 msix->irq, irq_type_names[msix->type], 752 sde->this_idx, cpu); 753 irq_set_affinity_hint(msix->irq, &msix->mask); 754 755 /* 756 * Set the new cpu in the hfi1_affinity_node and clean 757 * the old cpu if it is not used by any other IRQ 758 */ 759 set = &entry->def_intr; 760 cpumask_set_cpu(cpu, &set->mask); 761 cpumask_set_cpu(cpu, &set->used); 762 for (i = 0; i < dd->msix_info.max_requested; i++) { 763 struct hfi1_msix_entry *other_msix; 764 765 other_msix = &dd->msix_info.msix_entries[i]; 766 if (other_msix->type != IRQ_SDMA || other_msix == msix) 767 continue; 768 769 if (cpumask_test_cpu(old_cpu, &other_msix->mask)) 770 goto unlock; 771 } 772 cpumask_clear_cpu(old_cpu, &set->mask); 773 cpumask_clear_cpu(old_cpu, &set->used); 774 unlock: 775 mutex_unlock(&node_affinity.lock); 776 } 777 778 static void hfi1_irq_notifier_notify(struct irq_affinity_notify *notify, 779 const cpumask_t *mask) 780 { 781 int cpu = cpumask_first(mask); 782 struct hfi1_msix_entry *msix = container_of(notify, 783 struct hfi1_msix_entry, 784 notify); 785 786 /* Only one CPU configuration supported currently */ 787 hfi1_update_sdma_affinity(msix, cpu); 788 } 789 790 static void hfi1_irq_notifier_release(struct kref *ref) 791 { 792 /* 793 * This is required by affinity notifier. We don't have anything to 794 * free here. 795 */ 796 } 797 798 static void hfi1_setup_sdma_notifier(struct hfi1_msix_entry *msix) 799 { 800 struct irq_affinity_notify *notify = &msix->notify; 801 802 notify->irq = msix->irq; 803 notify->notify = hfi1_irq_notifier_notify; 804 notify->release = hfi1_irq_notifier_release; 805 806 if (irq_set_affinity_notifier(notify->irq, notify)) 807 pr_err("Failed to register sdma irq affinity notifier for irq %d\n", 808 notify->irq); 809 } 810 811 static void hfi1_cleanup_sdma_notifier(struct hfi1_msix_entry *msix) 812 { 813 struct irq_affinity_notify *notify = &msix->notify; 814 815 if (irq_set_affinity_notifier(notify->irq, NULL)) 816 pr_err("Failed to cleanup sdma irq affinity notifier for irq %d\n", 817 notify->irq); 818 } 819 820 /* 821 * Function sets the irq affinity for msix. 822 * It *must* be called with node_affinity.lock held. 823 */ 824 static int get_irq_affinity(struct hfi1_devdata *dd, 825 struct hfi1_msix_entry *msix) 826 { 827 cpumask_var_t diff; 828 struct hfi1_affinity_node *entry; 829 struct cpu_mask_set *set = NULL; 830 struct sdma_engine *sde = NULL; 831 struct hfi1_ctxtdata *rcd = NULL; 832 char extra[64]; 833 int cpu = -1; 834 835 extra[0] = '\0'; 836 cpumask_clear(&msix->mask); 837 838 entry = node_affinity_lookup(dd->node); 839 840 switch (msix->type) { 841 case IRQ_SDMA: 842 sde = (struct sdma_engine *)msix->arg; 843 scnprintf(extra, 64, "engine %u", sde->this_idx); 844 set = &entry->def_intr; 845 break; 846 case IRQ_GENERAL: 847 cpu = cpumask_first(&entry->general_intr_mask); 848 break; 849 case IRQ_RCVCTXT: 850 rcd = (struct hfi1_ctxtdata *)msix->arg; 851 if (rcd->ctxt == HFI1_CTRL_CTXT) 852 cpu = cpumask_first(&entry->general_intr_mask); 853 else 854 set = &entry->rcv_intr; 855 scnprintf(extra, 64, "ctxt %u", rcd->ctxt); 856 break; 857 case IRQ_NETDEVCTXT: 858 rcd = (struct hfi1_ctxtdata *)msix->arg; 859 set = &entry->def_intr; 860 scnprintf(extra, 64, "ctxt %u", rcd->ctxt); 861 break; 862 default: 863 dd_dev_err(dd, "Invalid IRQ type %d\n", msix->type); 864 return -EINVAL; 865 } 866 867 /* 868 * The general and control contexts are placed on a particular 869 * CPU, which is set above. Skip accounting for it. Everything else 870 * finds its CPU here. 871 */ 872 if (cpu == -1 && set) { 873 if (!zalloc_cpumask_var(&diff, GFP_KERNEL)) 874 return -ENOMEM; 875 876 cpu = cpu_mask_set_get_first(set, diff); 877 if (cpu < 0) { 878 free_cpumask_var(diff); 879 dd_dev_err(dd, "Failure to obtain CPU for IRQ\n"); 880 return cpu; 881 } 882 883 free_cpumask_var(diff); 884 } 885 886 cpumask_set_cpu(cpu, &msix->mask); 887 dd_dev_info(dd, "IRQ: %u, type %s %s -> cpu: %d\n", 888 msix->irq, irq_type_names[msix->type], 889 extra, cpu); 890 irq_set_affinity_hint(msix->irq, &msix->mask); 891 892 if (msix->type == IRQ_SDMA) { 893 sde->cpu = cpu; 894 hfi1_setup_sdma_notifier(msix); 895 } 896 897 return 0; 898 } 899 900 int hfi1_get_irq_affinity(struct hfi1_devdata *dd, struct hfi1_msix_entry *msix) 901 { 902 int ret; 903 904 mutex_lock(&node_affinity.lock); 905 ret = get_irq_affinity(dd, msix); 906 mutex_unlock(&node_affinity.lock); 907 return ret; 908 } 909 910 void hfi1_put_irq_affinity(struct hfi1_devdata *dd, 911 struct hfi1_msix_entry *msix) 912 { 913 struct cpu_mask_set *set = NULL; 914 struct hfi1_affinity_node *entry; 915 916 mutex_lock(&node_affinity.lock); 917 entry = node_affinity_lookup(dd->node); 918 919 switch (msix->type) { 920 case IRQ_SDMA: 921 set = &entry->def_intr; 922 hfi1_cleanup_sdma_notifier(msix); 923 break; 924 case IRQ_GENERAL: 925 /* Don't do accounting for general contexts */ 926 break; 927 case IRQ_RCVCTXT: { 928 struct hfi1_ctxtdata *rcd = msix->arg; 929 930 /* Don't do accounting for control contexts */ 931 if (rcd->ctxt != HFI1_CTRL_CTXT) 932 set = &entry->rcv_intr; 933 break; 934 } 935 case IRQ_NETDEVCTXT: 936 set = &entry->def_intr; 937 break; 938 default: 939 mutex_unlock(&node_affinity.lock); 940 return; 941 } 942 943 if (set) { 944 cpumask_andnot(&set->used, &set->used, &msix->mask); 945 _cpu_mask_set_gen_dec(set); 946 } 947 948 irq_set_affinity_hint(msix->irq, NULL); 949 cpumask_clear(&msix->mask); 950 mutex_unlock(&node_affinity.lock); 951 } 952 953 /* This should be called with node_affinity.lock held */ 954 static void find_hw_thread_mask(uint hw_thread_no, cpumask_var_t hw_thread_mask, 955 struct hfi1_affinity_node_list *affinity) 956 { 957 int curr_cpu; 958 uint num_cores; 959 960 cpumask_copy(hw_thread_mask, &affinity->proc.mask); 961 962 if (affinity->num_core_siblings == 0) 963 return; 964 965 num_cores = rounddown(node_affinity.num_online_cpus / affinity->num_core_siblings, 966 node_affinity.num_online_nodes); 967 968 /* Removing other siblings not needed for now */ 969 curr_cpu = cpumask_nth(num_cores * node_affinity.num_online_nodes, hw_thread_mask) + 1; 970 cpumask_clear_cpus(hw_thread_mask, curr_cpu, nr_cpu_ids - curr_cpu); 971 972 /* Identifying correct HW threads within physical cores */ 973 cpumask_shift_left(hw_thread_mask, hw_thread_mask, num_cores * hw_thread_no); 974 } 975 976 int hfi1_get_proc_affinity(int node) 977 { 978 int cpu = -1, ret, i; 979 struct hfi1_affinity_node *entry; 980 cpumask_var_t diff, hw_thread_mask, available_mask, intrs_mask; 981 const struct cpumask *node_mask, 982 *proc_mask = current->cpus_ptr; 983 struct hfi1_affinity_node_list *affinity = &node_affinity; 984 struct cpu_mask_set *set = &affinity->proc; 985 986 /* 987 * check whether process/context affinity has already 988 * been set 989 */ 990 if (current->nr_cpus_allowed == 1) { 991 hfi1_cdbg(PROC, "PID %u %s affinity set to CPU %*pbl", 992 current->pid, current->comm, 993 cpumask_pr_args(proc_mask)); 994 /* 995 * Mark the pre-set CPU as used. This is atomic so we don't 996 * need the lock 997 */ 998 cpu = cpumask_first(proc_mask); 999 cpumask_set_cpu(cpu, &set->used); 1000 goto done; 1001 } else if (current->nr_cpus_allowed < cpumask_weight(&set->mask)) { 1002 hfi1_cdbg(PROC, "PID %u %s affinity set to CPU set(s) %*pbl", 1003 current->pid, current->comm, 1004 cpumask_pr_args(proc_mask)); 1005 goto done; 1006 } 1007 1008 /* 1009 * The process does not have a preset CPU affinity so find one to 1010 * recommend using the following algorithm: 1011 * 1012 * For each user process that is opening a context on HFI Y: 1013 * a) If all cores are filled, reinitialize the bitmask 1014 * b) Fill real cores first, then HT cores (First set of HT 1015 * cores on all physical cores, then second set of HT core, 1016 * and, so on) in the following order: 1017 * 1018 * 1. Same NUMA node as HFI Y and not running an IRQ 1019 * handler 1020 * 2. Same NUMA node as HFI Y and running an IRQ handler 1021 * 3. Different NUMA node to HFI Y and not running an IRQ 1022 * handler 1023 * 4. Different NUMA node to HFI Y and running an IRQ 1024 * handler 1025 * c) Mark core as filled in the bitmask. As user processes are 1026 * done, clear cores from the bitmask. 1027 */ 1028 1029 ret = zalloc_cpumask_var(&diff, GFP_KERNEL); 1030 if (!ret) 1031 goto done; 1032 ret = zalloc_cpumask_var(&hw_thread_mask, GFP_KERNEL); 1033 if (!ret) 1034 goto free_diff; 1035 ret = zalloc_cpumask_var(&available_mask, GFP_KERNEL); 1036 if (!ret) 1037 goto free_hw_thread_mask; 1038 ret = zalloc_cpumask_var(&intrs_mask, GFP_KERNEL); 1039 if (!ret) 1040 goto free_available_mask; 1041 1042 mutex_lock(&affinity->lock); 1043 /* 1044 * If we've used all available HW threads, clear the mask and start 1045 * overloading. 1046 */ 1047 _cpu_mask_set_gen_inc(set); 1048 1049 /* 1050 * If NUMA node has CPUs used by interrupt handlers, include them in the 1051 * interrupt handler mask. 1052 */ 1053 entry = node_affinity_lookup(node); 1054 if (entry) { 1055 cpumask_copy(intrs_mask, (entry->def_intr.gen ? 1056 &entry->def_intr.mask : 1057 &entry->def_intr.used)); 1058 cpumask_or(intrs_mask, intrs_mask, (entry->rcv_intr.gen ? 1059 &entry->rcv_intr.mask : 1060 &entry->rcv_intr.used)); 1061 cpumask_or(intrs_mask, intrs_mask, &entry->general_intr_mask); 1062 } 1063 hfi1_cdbg(PROC, "CPUs used by interrupts: %*pbl", 1064 cpumask_pr_args(intrs_mask)); 1065 1066 cpumask_copy(hw_thread_mask, &set->mask); 1067 1068 /* 1069 * If HT cores are enabled, identify which HW threads within the 1070 * physical cores should be used. 1071 */ 1072 for (i = 0; i < affinity->num_core_siblings; i++) { 1073 find_hw_thread_mask(i, hw_thread_mask, affinity); 1074 1075 /* 1076 * If there's at least one available core for this HW 1077 * thread number, stop looking for a core. 1078 * 1079 * diff will always be not empty at least once in this 1080 * loop as the used mask gets reset when 1081 * (set->mask == set->used) before this loop. 1082 */ 1083 if (cpumask_andnot(diff, hw_thread_mask, &set->used)) 1084 break; 1085 } 1086 hfi1_cdbg(PROC, "Same available HW thread on all physical CPUs: %*pbl", 1087 cpumask_pr_args(hw_thread_mask)); 1088 1089 node_mask = cpumask_of_node(node); 1090 hfi1_cdbg(PROC, "Device on NUMA %u, CPUs %*pbl", node, 1091 cpumask_pr_args(node_mask)); 1092 1093 /* Get cpumask of available CPUs on preferred NUMA */ 1094 cpumask_and(available_mask, hw_thread_mask, node_mask); 1095 cpumask_andnot(available_mask, available_mask, &set->used); 1096 hfi1_cdbg(PROC, "Available CPUs on NUMA %u: %*pbl", node, 1097 cpumask_pr_args(available_mask)); 1098 1099 /* 1100 * At first, we don't want to place processes on the same 1101 * CPUs as interrupt handlers. Then, CPUs running interrupt 1102 * handlers are used. 1103 * 1104 * 1) If diff is not empty, then there are CPUs not running 1105 * non-interrupt handlers available, so diff gets copied 1106 * over to available_mask. 1107 * 2) If diff is empty, then all CPUs not running interrupt 1108 * handlers are taken, so available_mask contains all 1109 * available CPUs running interrupt handlers. 1110 * 3) If available_mask is empty, then all CPUs on the 1111 * preferred NUMA node are taken, so other NUMA nodes are 1112 * used for process assignments using the same method as 1113 * the preferred NUMA node. 1114 */ 1115 if (cpumask_andnot(diff, available_mask, intrs_mask)) 1116 cpumask_copy(available_mask, diff); 1117 1118 /* If we don't have CPUs on the preferred node, use other NUMA nodes */ 1119 if (cpumask_empty(available_mask)) { 1120 cpumask_andnot(available_mask, hw_thread_mask, &set->used); 1121 /* Excluding preferred NUMA cores */ 1122 cpumask_andnot(available_mask, available_mask, node_mask); 1123 hfi1_cdbg(PROC, 1124 "Preferred NUMA node cores are taken, cores available in other NUMA nodes: %*pbl", 1125 cpumask_pr_args(available_mask)); 1126 1127 /* 1128 * At first, we don't want to place processes on the same 1129 * CPUs as interrupt handlers. 1130 */ 1131 if (cpumask_andnot(diff, available_mask, intrs_mask)) 1132 cpumask_copy(available_mask, diff); 1133 } 1134 hfi1_cdbg(PROC, "Possible CPUs for process: %*pbl", 1135 cpumask_pr_args(available_mask)); 1136 1137 cpu = cpumask_first(available_mask); 1138 if (cpu >= nr_cpu_ids) /* empty */ 1139 cpu = -1; 1140 else 1141 cpumask_set_cpu(cpu, &set->used); 1142 1143 mutex_unlock(&affinity->lock); 1144 hfi1_cdbg(PROC, "Process assigned to CPU %d", cpu); 1145 1146 free_cpumask_var(intrs_mask); 1147 free_available_mask: 1148 free_cpumask_var(available_mask); 1149 free_hw_thread_mask: 1150 free_cpumask_var(hw_thread_mask); 1151 free_diff: 1152 free_cpumask_var(diff); 1153 done: 1154 return cpu; 1155 } 1156 1157 void hfi1_put_proc_affinity(int cpu) 1158 { 1159 struct hfi1_affinity_node_list *affinity = &node_affinity; 1160 struct cpu_mask_set *set = &affinity->proc; 1161 1162 if (cpu < 0) 1163 return; 1164 1165 mutex_lock(&affinity->lock); 1166 cpu_mask_set_put(set, cpu); 1167 hfi1_cdbg(PROC, "Returning CPU %d for future process assignment", cpu); 1168 mutex_unlock(&affinity->lock); 1169 } 1170