// SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
/*
 * Copyright(c) 2015 - 2020 Intel Corporation.
 */

#include <linux/topology.h>
#include <linux/cpumask.h>
#include <linux/interrupt.h>
#include <linux/numa.h>

#include "hfi.h"
#include "affinity.h"
#include "sdma.h"
#include "trace.h"

struct hfi1_affinity_node_list node_affinity = {
	.list = LIST_HEAD_INIT(node_affinity.list),
	.lock = __MUTEX_INITIALIZER(node_affinity.lock)
};

/* Name of IRQ types, indexed by enum irq_type */
static const char * const irq_type_names[] = {
	"SDMA",
	"RCVCTXT",
	"NETDEVCTXT",
	"GENERAL",
	"OTHER",
};

/* Per NUMA node count of HFI devices */
static unsigned int *hfi1_per_node_cntr;

static inline void init_cpu_mask_set(struct cpu_mask_set *set)
{
	cpumask_clear(&set->mask);
	cpumask_clear(&set->used);
	set->gen = 0;
}

/* Increment generation of CPU set if needed */
static void _cpu_mask_set_gen_inc(struct cpu_mask_set *set)
{
	if (cpumask_equal(&set->mask, &set->used)) {
		/*
		 * We've used up all the CPUs, bump up the generation
		 * and reset the 'used' map
		 */
		set->gen++;
		cpumask_clear(&set->used);
	}
}

static void _cpu_mask_set_gen_dec(struct cpu_mask_set *set)
{
	if (cpumask_empty(&set->used) && set->gen) {
		set->gen--;
		cpumask_copy(&set->used, &set->mask);
	}
}

/* Get the first CPU from the list of unused CPUs in a CPU set data structure */
static int cpu_mask_set_get_first(struct cpu_mask_set *set, cpumask_var_t diff)
{
	int cpu;

	if (!diff || !set)
		return -EINVAL;

	_cpu_mask_set_gen_inc(set);

	/* Find out CPUs left in CPU mask */
	cpumask_andnot(diff, &set->mask, &set->used);

	cpu = cpumask_first(diff);
	if (cpu >= nr_cpu_ids) /* empty */
		cpu = -EINVAL;
	else
		cpumask_set_cpu(cpu, &set->used);

	return cpu;
}

static void cpu_mask_set_put(struct cpu_mask_set *set, int cpu)
{
	if (!set)
		return;

	cpumask_clear_cpu(cpu, &set->used);
	_cpu_mask_set_gen_dec(set);
}

/* Initialize non-HT cpu cores mask */
void init_real_cpu_mask(void)
{
	int possible, curr_cpu, i, ht;

	cpumask_clear(&node_affinity.real_cpu_mask);

	/* Start with cpu online mask as the real cpu mask */
	cpumask_copy(&node_affinity.real_cpu_mask, cpu_online_mask);

	/*
	 * Remove HT cores from the real cpu mask. Do this in two steps below.
	 */
	possible = cpumask_weight(&node_affinity.real_cpu_mask);
	ht = cpumask_weight(topology_sibling_cpumask(
				cpumask_first(&node_affinity.real_cpu_mask)));
	/*
	 * Step 1. Skip over the first N HT siblings and use them as the
	 * "real" cores. Assumes that HT cores are not enumerated in
	 * succession (except in the single core case).
	 */
	curr_cpu = cpumask_first(&node_affinity.real_cpu_mask);
	for (i = 0; i < possible / ht; i++)
		curr_cpu = cpumask_next(curr_cpu, &node_affinity.real_cpu_mask);
	/*
	 * Step 2. Remove the remaining HT siblings. Use cpumask_next() to
	 * skip any gaps.
	 */
	for (; i < possible; i++) {
		cpumask_clear_cpu(curr_cpu, &node_affinity.real_cpu_mask);
		curr_cpu = cpumask_next(curr_cpu, &node_affinity.real_cpu_mask);
	}
}

int node_affinity_init(void)
{
	int node;
	struct pci_dev *dev = NULL;
	const struct pci_device_id *ids = hfi1_pci_tbl;

	cpumask_clear(&node_affinity.proc.used);
	cpumask_copy(&node_affinity.proc.mask, cpu_online_mask);

	node_affinity.proc.gen = 0;
	node_affinity.num_core_siblings =
		cpumask_weight(topology_sibling_cpumask(
			cpumask_first(&node_affinity.proc.mask)
			));
	node_affinity.num_possible_nodes = num_possible_nodes();
	node_affinity.num_online_nodes = num_online_nodes();
	node_affinity.num_online_cpus = num_online_cpus();

	/*
	 * The real cpu mask is part of the affinity struct but it has to be
	 * initialized early. It is needed to calculate the number of user
	 * contexts in set_up_context_variables().
	 */
	init_real_cpu_mask();

	hfi1_per_node_cntr = kcalloc(node_affinity.num_possible_nodes,
				     sizeof(*hfi1_per_node_cntr), GFP_KERNEL);
	if (!hfi1_per_node_cntr)
		return -ENOMEM;

	while (ids->vendor) {
		dev = NULL;
		while ((dev = pci_get_device(ids->vendor, ids->device, dev))) {
			node = pcibus_to_node(dev->bus);
			if (node < 0)
				goto out;

			hfi1_per_node_cntr[node]++;
		}
		ids++;
	}

	return 0;

out:
	/*
	 * Invalid PCI NUMA node information found, note it, and populate
	 * our database 1:1.
	 */
	pr_err("HFI: Invalid PCI NUMA node. Performance may be affected\n");
	pr_err("HFI: System BIOS may need to be upgraded\n");
	for (node = 0; node < node_affinity.num_possible_nodes; node++)
		hfi1_per_node_cntr[node] = 1;

	pci_dev_put(dev);

	return 0;
}

static void node_affinity_destroy(struct hfi1_affinity_node *entry)
{
	free_percpu(entry->comp_vect_affinity);
	kfree(entry);
}

void node_affinity_destroy_all(void)
{
	struct list_head *pos, *q;
	struct hfi1_affinity_node *entry;

	mutex_lock(&node_affinity.lock);
	list_for_each_safe(pos, q, &node_affinity.list) {
		entry = list_entry(pos, struct hfi1_affinity_node,
				   list);
		list_del(pos);
		node_affinity_destroy(entry);
	}
	mutex_unlock(&node_affinity.lock);
	kfree(hfi1_per_node_cntr);
}

static struct hfi1_affinity_node *node_affinity_allocate(int node)
{
	struct hfi1_affinity_node *entry;

	entry = kzalloc(sizeof(*entry), GFP_KERNEL);
	if (!entry)
		return NULL;
	entry->node = node;
	entry->comp_vect_affinity = alloc_percpu(u16);
	INIT_LIST_HEAD(&entry->list);

	return entry;
}

/*
 * It appends an entry to the list.
 * It *must* be called with node_affinity.lock held.
 */
static void node_affinity_add_tail(struct hfi1_affinity_node *entry)
{
	list_add_tail(&entry->list, &node_affinity.list);
}

/* It must be called with node_affinity.lock held */
static struct hfi1_affinity_node *node_affinity_lookup(int node)
{
	struct hfi1_affinity_node *entry;

	list_for_each_entry(entry, &node_affinity.list, list) {
		if (entry->node == node)
			return entry;
	}

	return NULL;
}

static int per_cpu_affinity_get(cpumask_var_t possible_cpumask,
				u16 __percpu *comp_vect_affinity)
{
	int curr_cpu;
	u16 cntr;
	u16 prev_cntr;
	int ret_cpu;

	if (!possible_cpumask) {
		ret_cpu = -EINVAL;
		goto fail;
	}

	if (!comp_vect_affinity) {
		ret_cpu = -EINVAL;
		goto fail;
	}

	ret_cpu = cpumask_first(possible_cpumask);
	if (ret_cpu >= nr_cpu_ids) {
		ret_cpu = -EINVAL;
		goto fail;
	}

	prev_cntr = *per_cpu_ptr(comp_vect_affinity, ret_cpu);
	for_each_cpu(curr_cpu, possible_cpumask) {
		cntr = *per_cpu_ptr(comp_vect_affinity, curr_cpu);

		if (cntr < prev_cntr) {
			ret_cpu = curr_cpu;
			prev_cntr = cntr;
		}
	}

	*per_cpu_ptr(comp_vect_affinity, ret_cpu) += 1;

fail:
	return ret_cpu;
}

static int per_cpu_affinity_put_max(cpumask_var_t possible_cpumask,
				    u16 __percpu *comp_vect_affinity)
{
	int curr_cpu;
	int max_cpu;
	u16 cntr;
	u16 prev_cntr;

	if (!possible_cpumask)
		return -EINVAL;

	if (!comp_vect_affinity)
		return -EINVAL;

	max_cpu = cpumask_first(possible_cpumask);
	if (max_cpu >= nr_cpu_ids)
		return -EINVAL;

	prev_cntr = *per_cpu_ptr(comp_vect_affinity, max_cpu);
	for_each_cpu(curr_cpu, possible_cpumask) {
		cntr = *per_cpu_ptr(comp_vect_affinity, curr_cpu);

		if (cntr > prev_cntr) {
			max_cpu = curr_cpu;
			prev_cntr = cntr;
		}
	}

	*per_cpu_ptr(comp_vect_affinity, max_cpu) -= 1;

	return max_cpu;
}

/*
 * Non-interrupt CPUs are used first, then interrupt CPUs.
 * Two already allocated cpu masks must be passed.
 */
static int _dev_comp_vect_cpu_get(struct hfi1_devdata *dd,
				  struct hfi1_affinity_node *entry,
				  cpumask_var_t non_intr_cpus,
				  cpumask_var_t available_cpus)
	__must_hold(&node_affinity.lock)
{
	int cpu;
	struct cpu_mask_set *set = dd->comp_vect;

	lockdep_assert_held(&node_affinity.lock);
	if (!non_intr_cpus) {
		cpu = -1;
		goto fail;
	}

	if (!available_cpus) {
		cpu = -1;
		goto fail;
	}

	/* Available CPUs for pinning completion vectors */
	_cpu_mask_set_gen_inc(set);
	cpumask_andnot(available_cpus, &set->mask, &set->used);

	/* Available CPUs without SDMA engine interrupts */
	cpumask_andnot(non_intr_cpus, available_cpus,
		       &entry->def_intr.used);

	/* If there are non-interrupt CPUs available, use them first */
	if (!cpumask_empty(non_intr_cpus))
		cpu = cpumask_first(non_intr_cpus);
	else /* Otherwise, use interrupt CPUs */
		cpu = cpumask_first(available_cpus);

	if (cpu >= nr_cpu_ids) { /* empty */
		cpu = -1;
		goto fail;
	}
	cpumask_set_cpu(cpu, &set->used);

fail:
	return cpu;
}

static void _dev_comp_vect_cpu_put(struct hfi1_devdata *dd, int cpu)
{
	struct cpu_mask_set *set = dd->comp_vect;

	if (cpu < 0)
		return;

	cpu_mask_set_put(set, cpu);
}

/* _dev_comp_vect_mappings_destroy() is reentrant */
static void _dev_comp_vect_mappings_destroy(struct hfi1_devdata *dd)
{
	int i, cpu;

	if (!dd->comp_vect_mappings)
		return;

	for (i = 0; i < dd->comp_vect_possible_cpus; i++) {
		cpu = dd->comp_vect_mappings[i];
		_dev_comp_vect_cpu_put(dd, cpu);
		dd->comp_vect_mappings[i] = -1;
		hfi1_cdbg(AFFINITY,
			  "[%s] Release CPU %d from completion vector %d",
			  rvt_get_ibdev_name(&(dd)->verbs_dev.rdi), cpu, i);
	}

	kfree(dd->comp_vect_mappings);
	dd->comp_vect_mappings = NULL;
}

/*
 * This function creates the table for looking up CPUs for completion vectors.
 * num_comp_vectors needs to have been initialized before calling this function.
 */
static int _dev_comp_vect_mappings_create(struct hfi1_devdata *dd,
					  struct hfi1_affinity_node *entry)
	__must_hold(&node_affinity.lock)
{
	int i, cpu, ret;
	cpumask_var_t non_intr_cpus;
	cpumask_var_t available_cpus;

	lockdep_assert_held(&node_affinity.lock);

	if (!zalloc_cpumask_var(&non_intr_cpus, GFP_KERNEL))
		return -ENOMEM;

	if (!zalloc_cpumask_var(&available_cpus, GFP_KERNEL)) {
		free_cpumask_var(non_intr_cpus);
		return -ENOMEM;
	}

	dd->comp_vect_mappings = kcalloc(dd->comp_vect_possible_cpus,
					 sizeof(*dd->comp_vect_mappings),
					 GFP_KERNEL);
	if (!dd->comp_vect_mappings) {
		ret = -ENOMEM;
		goto fail;
	}
	for (i = 0; i < dd->comp_vect_possible_cpus; i++)
		dd->comp_vect_mappings[i] = -1;

	for (i = 0; i < dd->comp_vect_possible_cpus; i++) {
		cpu = _dev_comp_vect_cpu_get(dd, entry, non_intr_cpus,
					     available_cpus);
		if (cpu < 0) {
			ret = -EINVAL;
			goto fail;
		}

		dd->comp_vect_mappings[i] = cpu;
		hfi1_cdbg(AFFINITY,
			  "[%s] Completion Vector %d -> CPU %d",
			  rvt_get_ibdev_name(&(dd)->verbs_dev.rdi), i, cpu);
	}

	free_cpumask_var(available_cpus);
	free_cpumask_var(non_intr_cpus);
	return 0;

fail:
	free_cpumask_var(available_cpus);
	free_cpumask_var(non_intr_cpus);
	_dev_comp_vect_mappings_destroy(dd);

	return ret;
}

int hfi1_comp_vectors_set_up(struct hfi1_devdata *dd)
{
	int ret;
	struct hfi1_affinity_node *entry;

	mutex_lock(&node_affinity.lock);
	entry = node_affinity_lookup(dd->node);
	if (!entry) {
		ret = -EINVAL;
		goto unlock;
	}
	ret = _dev_comp_vect_mappings_create(dd, entry);
unlock:
	mutex_unlock(&node_affinity.lock);

	return ret;
}

void hfi1_comp_vectors_clean_up(struct hfi1_devdata *dd)
{
	_dev_comp_vect_mappings_destroy(dd);
}

int hfi1_comp_vect_mappings_lookup(struct rvt_dev_info *rdi, int comp_vect)
{
	struct hfi1_ibdev *verbs_dev = dev_from_rdi(rdi);
	struct hfi1_devdata *dd = dd_from_dev(verbs_dev);

	if (!dd->comp_vect_mappings)
		return -EINVAL;
	if (comp_vect >= dd->comp_vect_possible_cpus)
		return -EINVAL;

	return dd->comp_vect_mappings[comp_vect];
}

/*
 * It assumes dd->comp_vect_possible_cpus is available.
 */
static int _dev_comp_vect_cpu_mask_init(struct hfi1_devdata *dd,
					struct hfi1_affinity_node *entry,
					bool first_dev_init)
	__must_hold(&node_affinity.lock)
{
	int i, j, curr_cpu;
	int possible_cpus_comp_vect = 0;
	struct cpumask *dev_comp_vect_mask = &dd->comp_vect->mask;

	lockdep_assert_held(&node_affinity.lock);
	/*
	 * If there's only one CPU available for completion vectors, then
	 * there will only be one completion vector available. Otherwise,
	 * the number of completion vectors available will be the number of
	 * available CPUs divided by the number of devices in the
	 * local NUMA node.
	 */
	if (cpumask_weight(&entry->comp_vect_mask) == 1) {
		possible_cpus_comp_vect = 1;
		dd_dev_warn(dd,
			    "Number of kernel receive queues is too large for completion vector affinity to be effective\n");
	} else {
		possible_cpus_comp_vect +=
			cpumask_weight(&entry->comp_vect_mask) /
						hfi1_per_node_cntr[dd->node];

		/*
		 * If the available completion vector CPUs don't divide
		 * evenly among the devices, then the first device to be
		 * initialized gets an extra CPU.
		 */
		if (first_dev_init &&
		    cpumask_weight(&entry->comp_vect_mask) %
		    hfi1_per_node_cntr[dd->node] != 0)
			possible_cpus_comp_vect++;
	}

	dd->comp_vect_possible_cpus = possible_cpus_comp_vect;

	/* Reserving CPUs for device completion vector */
	for (i = 0; i < dd->comp_vect_possible_cpus; i++) {
		curr_cpu = per_cpu_affinity_get(&entry->comp_vect_mask,
						entry->comp_vect_affinity);
		if (curr_cpu < 0)
			goto fail;

		cpumask_set_cpu(curr_cpu, dev_comp_vect_mask);
	}

	hfi1_cdbg(AFFINITY,
		  "[%s] Completion vector affinity CPU set(s) %*pbl",
		  rvt_get_ibdev_name(&(dd)->verbs_dev.rdi),
		  cpumask_pr_args(dev_comp_vect_mask));

	return 0;

fail:
	for (j = 0; j < i; j++)
		per_cpu_affinity_put_max(&entry->comp_vect_mask,
					 entry->comp_vect_affinity);

	return curr_cpu;
}

/*
 * It assumes dd->comp_vect_possible_cpus is available.
 */
static void _dev_comp_vect_cpu_mask_clean_up(struct hfi1_devdata *dd,
					     struct hfi1_affinity_node *entry)
	__must_hold(&node_affinity.lock)
{
	int i, cpu;

	lockdep_assert_held(&node_affinity.lock);
	if (!dd->comp_vect_possible_cpus)
		return;

	for (i = 0; i < dd->comp_vect_possible_cpus; i++) {
		cpu = per_cpu_affinity_put_max(&dd->comp_vect->mask,
					       entry->comp_vect_affinity);
		/* Clearing CPU in device completion vector cpu mask */
		if (cpu >= 0)
			cpumask_clear_cpu(cpu, &dd->comp_vect->mask);
	}

	dd->comp_vect_possible_cpus = 0;
}

/*
 * Interrupt affinity.
 *
 * Non-receive (SDMA/general) interrupts get a default mask that starts
 * as the node's CPUs with HT siblings removed and with the CPUs claimed
 * by the receive contexts removed.
 *
 * Receive interrupts get node-relative CPUs starting at node-relative
 * CPU 1, wrapping back to node-relative CPU 1 as necessary.
 *
 */
int hfi1_dev_affinity_init(struct hfi1_devdata *dd)
{
	struct hfi1_affinity_node *entry;
	const struct cpumask *local_mask;
	int curr_cpu, possible, i, ret;
	bool new_entry = false;

	local_mask = cpumask_of_node(dd->node);
	if (cpumask_first(local_mask) >= nr_cpu_ids)
		local_mask = topology_core_cpumask(0);

	mutex_lock(&node_affinity.lock);
	entry = node_affinity_lookup(dd->node);

	/*
	 * If this is the first time this NUMA node's affinity is used,
	 * create an entry in the global affinity structure and initialize it.
	 */
	if (!entry) {
		entry = node_affinity_allocate(dd->node);
		if (!entry) {
			dd_dev_err(dd,
				   "Unable to allocate global affinity node\n");
			ret = -ENOMEM;
			goto fail;
		}
		new_entry = true;

		init_cpu_mask_set(&entry->def_intr);
		init_cpu_mask_set(&entry->rcv_intr);
		cpumask_clear(&entry->comp_vect_mask);
		cpumask_clear(&entry->general_intr_mask);
		/* Use the "real" cpu mask of this node as the default */
		cpumask_and(&entry->def_intr.mask, &node_affinity.real_cpu_mask,
			    local_mask);

		/* fill in the receive list */
		possible = cpumask_weight(&entry->def_intr.mask);
		curr_cpu = cpumask_first(&entry->def_intr.mask);

		if (possible == 1) {
			/* only one CPU, everyone will use it */
			cpumask_set_cpu(curr_cpu, &entry->rcv_intr.mask);
			cpumask_set_cpu(curr_cpu, &entry->general_intr_mask);
		} else {
			/*
			 * The general/control context will be the first CPU in
			 * the default list, so it is removed from the default
			 * list and added to the general interrupt list.
			 */
			cpumask_clear_cpu(curr_cpu, &entry->def_intr.mask);
			cpumask_set_cpu(curr_cpu, &entry->general_intr_mask);
			curr_cpu = cpumask_next(curr_cpu,
						&entry->def_intr.mask);

			/*
			 * Remove the remaining kernel receive queues from
			 * the default list and add them to the receive list.
			 */
			for (i = 0;
			     i < (dd->n_krcv_queues - 1) *
				  hfi1_per_node_cntr[dd->node];
			     i++) {
				cpumask_clear_cpu(curr_cpu,
						  &entry->def_intr.mask);
				cpumask_set_cpu(curr_cpu,
						&entry->rcv_intr.mask);
				curr_cpu = cpumask_next(curr_cpu,
							&entry->def_intr.mask);
				if (curr_cpu >= nr_cpu_ids)
					break;
			}

			/*
			 * If there ends up being 0 CPU cores leftover for SDMA
			 * engines, use the same CPU cores as general/control
			 * context.
			 */
			if (cpumask_empty(&entry->def_intr.mask))
				cpumask_copy(&entry->def_intr.mask,
					     &entry->general_intr_mask);
		}

		/* Determine completion vector CPUs for the entire node */
		cpumask_and(&entry->comp_vect_mask,
			    &node_affinity.real_cpu_mask, local_mask);
		cpumask_andnot(&entry->comp_vect_mask,
			       &entry->comp_vect_mask,
			       &entry->rcv_intr.mask);
		cpumask_andnot(&entry->comp_vect_mask,
			       &entry->comp_vect_mask,
			       &entry->general_intr_mask);

		/*
		 * If there ends up being 0 CPU cores leftover for completion
		 * vectors, use the same CPU core as the general/control
		 * context.
		 */
		if (cpumask_empty(&entry->comp_vect_mask))
			cpumask_copy(&entry->comp_vect_mask,
				     &entry->general_intr_mask);
	}

	ret = _dev_comp_vect_cpu_mask_init(dd, entry, new_entry);
	if (ret < 0)
		goto fail;

	if (new_entry)
		node_affinity_add_tail(entry);

	dd->affinity_entry = entry;
	mutex_unlock(&node_affinity.lock);

	return 0;

fail:
	if (new_entry)
		node_affinity_destroy(entry);
	mutex_unlock(&node_affinity.lock);
	return ret;
}

void hfi1_dev_affinity_clean_up(struct hfi1_devdata *dd)
{
	struct hfi1_affinity_node *entry;

	mutex_lock(&node_affinity.lock);
	if (!dd->affinity_entry)
		goto unlock;
	entry = node_affinity_lookup(dd->node);
	if (!entry)
		goto unlock;

	/*
	 * Free device completion vector CPUs to be used by future
	 * completion vectors
	 */
	_dev_comp_vect_cpu_mask_clean_up(dd, entry);
unlock:
	dd->affinity_entry = NULL;
	mutex_unlock(&node_affinity.lock);
}

/*
 * Update the irq affinity hint for an SDMA MSI-X vector after it has been
 * changed by the user via the /proc/irq interface. This function only
 * accepts one CPU in the mask.
 */
static void hfi1_update_sdma_affinity(struct hfi1_msix_entry *msix, int cpu)
{
	struct sdma_engine *sde = msix->arg;
	struct hfi1_devdata *dd = sde->dd;
	struct hfi1_affinity_node *entry;
	struct cpu_mask_set *set;
	int i, old_cpu;

	if (cpu > num_online_cpus() || cpu == sde->cpu)
		return;

	mutex_lock(&node_affinity.lock);
	entry = node_affinity_lookup(dd->node);
	if (!entry)
		goto unlock;

	old_cpu = sde->cpu;
	sde->cpu = cpu;
	cpumask_clear(&msix->mask);
	cpumask_set_cpu(cpu, &msix->mask);
	dd_dev_dbg(dd, "IRQ: %u, type %s engine %u -> cpu: %d\n",
		   msix->irq, irq_type_names[msix->type],
		   sde->this_idx, cpu);
	irq_set_affinity_hint(msix->irq, &msix->mask);

	/*
	 * Set the new cpu in the hfi1_affinity_node and clean
	 * the old cpu if it is not used by any other IRQ
	 */
	set = &entry->def_intr;
	cpumask_set_cpu(cpu, &set->mask);
	cpumask_set_cpu(cpu, &set->used);
	for (i = 0; i < dd->msix_info.max_requested; i++) {
		struct hfi1_msix_entry *other_msix;

		other_msix = &dd->msix_info.msix_entries[i];
		if (other_msix->type != IRQ_SDMA || other_msix == msix)
			continue;

		if (cpumask_test_cpu(old_cpu, &other_msix->mask))
			goto unlock;
	}
	cpumask_clear_cpu(old_cpu, &set->mask);
	cpumask_clear_cpu(old_cpu, &set->used);
unlock:
	mutex_unlock(&node_affinity.lock);
}

static void hfi1_irq_notifier_notify(struct irq_affinity_notify *notify,
				     const cpumask_t *mask)
{
	int cpu = cpumask_first(mask);
	struct hfi1_msix_entry *msix = container_of(notify,
						    struct hfi1_msix_entry,
						    notify);

	/* Only one CPU configuration supported currently */
	hfi1_update_sdma_affinity(msix, cpu);
}

static void hfi1_irq_notifier_release(struct kref *ref)
{
	/*
	 * This is required by affinity notifier. We don't have anything to
	 * free here.
	 */
}

static void hfi1_setup_sdma_notifier(struct hfi1_msix_entry *msix)
{
	struct irq_affinity_notify *notify = &msix->notify;

	notify->irq = msix->irq;
	notify->notify = hfi1_irq_notifier_notify;
	notify->release = hfi1_irq_notifier_release;

	if (irq_set_affinity_notifier(notify->irq, notify))
		pr_err("Failed to register sdma irq affinity notifier for irq %d\n",
		       notify->irq);
}

static void hfi1_cleanup_sdma_notifier(struct hfi1_msix_entry *msix)
{
	struct irq_affinity_notify *notify = &msix->notify;

	if (irq_set_affinity_notifier(notify->irq, NULL))
		pr_err("Failed to cleanup sdma irq affinity notifier for irq %d\n",
		       notify->irq);
}

/*
 * Function sets the irq affinity for msix.
 * It *must* be called with node_affinity.lock held.
 */
static int get_irq_affinity(struct hfi1_devdata *dd,
			    struct hfi1_msix_entry *msix)
{
	cpumask_var_t diff;
	struct hfi1_affinity_node *entry;
	struct cpu_mask_set *set = NULL;
	struct sdma_engine *sde = NULL;
	struct hfi1_ctxtdata *rcd = NULL;
	char extra[64];
	int cpu = -1;

	extra[0] = '\0';
	cpumask_clear(&msix->mask);

	entry = node_affinity_lookup(dd->node);

	switch (msix->type) {
	case IRQ_SDMA:
		sde = (struct sdma_engine *)msix->arg;
		scnprintf(extra, 64, "engine %u", sde->this_idx);
		set = &entry->def_intr;
		break;
	case IRQ_GENERAL:
		cpu = cpumask_first(&entry->general_intr_mask);
		break;
	case IRQ_RCVCTXT:
		rcd = (struct hfi1_ctxtdata *)msix->arg;
		if (rcd->ctxt == HFI1_CTRL_CTXT)
			cpu = cpumask_first(&entry->general_intr_mask);
		else
			set = &entry->rcv_intr;
		scnprintf(extra, 64, "ctxt %u", rcd->ctxt);
		break;
	case IRQ_NETDEVCTXT:
		rcd = (struct hfi1_ctxtdata *)msix->arg;
		set = &entry->def_intr;
		scnprintf(extra, 64, "ctxt %u", rcd->ctxt);
		break;
	default:
		dd_dev_err(dd, "Invalid IRQ type %d\n", msix->type);
		return -EINVAL;
	}

	/*
	 * The general and control contexts are placed on a particular
	 * CPU, which is set above. Skip accounting for it. Everything else
	 * finds its CPU here.
	 */
	if (cpu == -1 && set) {
		if (!zalloc_cpumask_var(&diff, GFP_KERNEL))
			return -ENOMEM;

		cpu = cpu_mask_set_get_first(set, diff);
		if (cpu < 0) {
			free_cpumask_var(diff);
			dd_dev_err(dd, "Failure to obtain CPU for IRQ\n");
			return cpu;
		}

		free_cpumask_var(diff);
	}

	cpumask_set_cpu(cpu, &msix->mask);
	dd_dev_info(dd, "IRQ: %u, type %s %s -> cpu: %d\n",
		    msix->irq, irq_type_names[msix->type],
		    extra, cpu);
	irq_set_affinity_hint(msix->irq, &msix->mask);

	if (msix->type == IRQ_SDMA) {
		sde->cpu = cpu;
		hfi1_setup_sdma_notifier(msix);
	}

	return 0;
}

int hfi1_get_irq_affinity(struct hfi1_devdata *dd, struct hfi1_msix_entry *msix)
{
	int ret;

	mutex_lock(&node_affinity.lock);
	ret = get_irq_affinity(dd, msix);
	mutex_unlock(&node_affinity.lock);
	return ret;
}

void hfi1_put_irq_affinity(struct hfi1_devdata *dd,
			   struct hfi1_msix_entry *msix)
{
	struct cpu_mask_set *set = NULL;
	struct hfi1_affinity_node *entry;

	mutex_lock(&node_affinity.lock);
	entry = node_affinity_lookup(dd->node);

	switch (msix->type) {
	case IRQ_SDMA:
		set = &entry->def_intr;
		hfi1_cleanup_sdma_notifier(msix);
		break;
	case IRQ_GENERAL:
		/* Don't do accounting for general contexts */
		break;
	case IRQ_RCVCTXT: {
		struct hfi1_ctxtdata *rcd = msix->arg;

		/* Don't do accounting for control contexts */
		if (rcd->ctxt != HFI1_CTRL_CTXT)
			set = &entry->rcv_intr;
		break;
	}
	case IRQ_NETDEVCTXT:
		set = &entry->def_intr;
		break;
	default:
		mutex_unlock(&node_affinity.lock);
		return;
	}

	if (set) {
		cpumask_andnot(&set->used, &set->used, &msix->mask);
		_cpu_mask_set_gen_dec(set);
	}

	irq_set_affinity_hint(msix->irq, NULL);
	cpumask_clear(&msix->mask);
	mutex_unlock(&node_affinity.lock);
}

/* This should be called with node_affinity.lock held */
static void find_hw_thread_mask(uint hw_thread_no, cpumask_var_t hw_thread_mask,
				struct hfi1_affinity_node_list *affinity)
{
	int possible, curr_cpu, i;
	uint num_cores_per_socket = node_affinity.num_online_cpus /
					affinity->num_core_siblings /
						node_affinity.num_online_nodes;

	cpumask_copy(hw_thread_mask, &affinity->proc.mask);
	if (affinity->num_core_siblings > 0) {
		/* Removing other siblings not needed for now */
		possible = cpumask_weight(hw_thread_mask);
		curr_cpu = cpumask_first(hw_thread_mask);
		for (i = 0;
		     i < num_cores_per_socket * node_affinity.num_online_nodes;
		     i++)
			curr_cpu = cpumask_next(curr_cpu, hw_thread_mask);

		for (; i < possible; i++) {
			cpumask_clear_cpu(curr_cpu, hw_thread_mask);
			curr_cpu = cpumask_next(curr_cpu, hw_thread_mask);
		}

		/* Identifying correct HW threads within physical cores */
		cpumask_shift_left(hw_thread_mask, hw_thread_mask,
				   num_cores_per_socket *
				   node_affinity.num_online_nodes *
				   hw_thread_no);
	}
}

int hfi1_get_proc_affinity(int node)
{
	int cpu = -1, ret, i;
	struct hfi1_affinity_node *entry;
	cpumask_var_t diff, hw_thread_mask, available_mask, intrs_mask;
	const struct cpumask *node_mask,
		*proc_mask = current->cpus_ptr;
	struct hfi1_affinity_node_list *affinity = &node_affinity;
	struct cpu_mask_set *set = &affinity->proc;

	/*
	 * check whether process/context affinity has already
	 * been set
	 */
	if (current->nr_cpus_allowed == 1) {
		hfi1_cdbg(PROC, "PID %u %s affinity set to CPU %*pbl",
			  current->pid, current->comm,
			  cpumask_pr_args(proc_mask));
		/*
		 * Mark the pre-set CPU as used. This is atomic so we don't
		 * need the lock
		 */
		cpu = cpumask_first(proc_mask);
		cpumask_set_cpu(cpu, &set->used);
		goto done;
	} else if (current->nr_cpus_allowed < cpumask_weight(&set->mask)) {
		hfi1_cdbg(PROC, "PID %u %s affinity set to CPU set(s) %*pbl",
			  current->pid, current->comm,
			  cpumask_pr_args(proc_mask));
		goto done;
	}

	/*
	 * The process does not have a preset CPU affinity so find one to
	 * recommend using the following algorithm:
	 *
	 * For each user process that is opening a context on HFI Y:
	 *  a) If all cores are filled, reinitialize the bitmask
	 *  b) Fill real cores first, then HT cores (first set of HT
	 *     cores on all physical cores, then the second set of HT
	 *     cores, and so on) in the following order:
	 *
	 *     1. Same NUMA node as HFI Y and not running an IRQ
	 *        handler
	 *     2. Same NUMA node as HFI Y and running an IRQ handler
	 *     3. Different NUMA node to HFI Y and not running an IRQ
	 *        handler
	 *     4. Different NUMA node to HFI Y and running an IRQ
	 *        handler
	 *  c) Mark core as filled in the bitmask. As user processes are
	 *     done, clear cores from the bitmask.
	 */

	ret = zalloc_cpumask_var(&diff, GFP_KERNEL);
	if (!ret)
		goto done;
	ret = zalloc_cpumask_var(&hw_thread_mask, GFP_KERNEL);
	if (!ret)
		goto free_diff;
	ret = zalloc_cpumask_var(&available_mask, GFP_KERNEL);
	if (!ret)
		goto free_hw_thread_mask;
	ret = zalloc_cpumask_var(&intrs_mask, GFP_KERNEL);
	if (!ret)
		goto free_available_mask;

	mutex_lock(&affinity->lock);
	/*
	 * If we've used all available HW threads, clear the mask and start
	 * overloading.
	 */
	_cpu_mask_set_gen_inc(set);

	/*
	 * If the NUMA node has CPUs used by interrupt handlers, include them
	 * in the interrupt handler mask.
	 */
	entry = node_affinity_lookup(node);
	if (entry) {
		cpumask_copy(intrs_mask, (entry->def_intr.gen ?
					  &entry->def_intr.mask :
					  &entry->def_intr.used));
		cpumask_or(intrs_mask, intrs_mask, (entry->rcv_intr.gen ?
						    &entry->rcv_intr.mask :
						    &entry->rcv_intr.used));
		cpumask_or(intrs_mask, intrs_mask, &entry->general_intr_mask);
	}
	hfi1_cdbg(PROC, "CPUs used by interrupts: %*pbl",
		  cpumask_pr_args(intrs_mask));

	cpumask_copy(hw_thread_mask, &set->mask);

	/*
	 * If HT cores are enabled, identify which HW threads within the
	 * physical cores should be used.
	 */
	if (affinity->num_core_siblings > 0) {
		for (i = 0; i < affinity->num_core_siblings; i++) {
			find_hw_thread_mask(i, hw_thread_mask, affinity);

			/*
			 * If there's at least one available core for this HW
			 * thread number, stop looking for a core.
			 *
			 * diff will be non-empty at least once in this loop
			 * because the used mask is reset when
			 * (set->mask == set->used) before this loop.
			 */
			cpumask_andnot(diff, hw_thread_mask, &set->used);
			if (!cpumask_empty(diff))
				break;
		}
	}
	hfi1_cdbg(PROC, "Same available HW thread on all physical CPUs: %*pbl",
		  cpumask_pr_args(hw_thread_mask));

	node_mask = cpumask_of_node(node);
	hfi1_cdbg(PROC, "Device on NUMA %u, CPUs %*pbl", node,
		  cpumask_pr_args(node_mask));

	/* Get cpumask of available CPUs on preferred NUMA */
	cpumask_and(available_mask, hw_thread_mask, node_mask);
	cpumask_andnot(available_mask, available_mask, &set->used);
	hfi1_cdbg(PROC, "Available CPUs on NUMA %u: %*pbl", node,
		  cpumask_pr_args(available_mask));

	/*
	 * At first, we don't want to place processes on the same
	 * CPUs as interrupt handlers. Then, CPUs running interrupt
	 * handlers are used.
	 *
	 * 1) If diff is not empty, then there are CPUs not running
	 *    interrupt handlers available, so diff gets copied
	 *    over to available_mask.
	 * 2) If diff is empty, then all CPUs not running interrupt
	 *    handlers are taken, so available_mask contains all
	 *    available CPUs running interrupt handlers.
	 * 3) If available_mask is empty, then all CPUs on the
	 *    preferred NUMA node are taken, so other NUMA nodes are
	 *    used for process assignments using the same method as
	 *    the preferred NUMA node.
	 */
	cpumask_andnot(diff, available_mask, intrs_mask);
	if (!cpumask_empty(diff))
		cpumask_copy(available_mask, diff);

	/* If we don't have CPUs on the preferred node, use other NUMA nodes */
	if (cpumask_empty(available_mask)) {
		cpumask_andnot(available_mask, hw_thread_mask, &set->used);
		/* Excluding preferred NUMA cores */
		cpumask_andnot(available_mask, available_mask, node_mask);
		hfi1_cdbg(PROC,
			  "Preferred NUMA node cores are taken, cores available in other NUMA nodes: %*pbl",
			  cpumask_pr_args(available_mask));

		/*
		 * At first, we don't want to place processes on the same
		 * CPUs as interrupt handlers.
		 */
		cpumask_andnot(diff, available_mask, intrs_mask);
		if (!cpumask_empty(diff))
			cpumask_copy(available_mask, diff);
	}
	hfi1_cdbg(PROC, "Possible CPUs for process: %*pbl",
		  cpumask_pr_args(available_mask));

	cpu = cpumask_first(available_mask);
	if (cpu >= nr_cpu_ids) /* empty */
		cpu = -1;
	else
		cpumask_set_cpu(cpu, &set->used);

	mutex_unlock(&affinity->lock);
	hfi1_cdbg(PROC, "Process assigned to CPU %d", cpu);

	free_cpumask_var(intrs_mask);
free_available_mask:
	free_cpumask_var(available_mask);
free_hw_thread_mask:
	free_cpumask_var(hw_thread_mask);
free_diff:
	free_cpumask_var(diff);
done:
	return cpu;
}

void hfi1_put_proc_affinity(int cpu)
{
	struct hfi1_affinity_node_list *affinity = &node_affinity;
	struct cpu_mask_set *set = &affinity->proc;

	if (cpu < 0)
		return;

	mutex_lock(&affinity->lock);
	cpu_mask_set_put(set, cpu);
	hfi1_cdbg(PROC, "Returning CPU %d for future process assignment", cpu);
	mutex_unlock(&affinity->lock);
}
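
/*
 * Usage sketch (illustrative only, not part of this file): callers are
 * expected to pair hfi1_get_proc_affinity() with hfi1_put_proc_affinity()
 * and to treat a return value of -1 as "no recommendation". The
 * fd->rec_cpu_num field below is hypothetical and only stands in for
 * wherever the caller saves the recommended CPU; passing -1 back to
 * hfi1_put_proc_affinity() is safe since it ignores negative CPUs.
 *
 *	// on user context open, ask for a CPU near the device's NUMA node
 *	fd->rec_cpu_num = hfi1_get_proc_affinity(dd->node);
 *	if (fd->rec_cpu_num != -1) {
 *		// optionally bind the process to the recommended CPU
 *	}
 *	...
 *	// on context close, release the CPU for future assignments
 *	hfi1_put_proc_affinity(fd->rec_cpu_num);
 */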