1 // SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
2 /*
3 * Copyright(c) 2015 - 2020 Intel Corporation.
4 */
5
6 #include <linux/topology.h>
7 #include <linux/cpumask.h>
8 #include <linux/interrupt.h>
9 #include <linux/numa.h>
10
11 #include "hfi.h"
12 #include "affinity.h"
13 #include "sdma.h"
14 #include "trace.h"
15
/*
 * Global NUMA-node affinity state shared by all HFI devices.
 * Per-node entries live on .list; .lock serializes all access to the
 * list, the entries, and the process CPU pool.
 */
struct hfi1_affinity_node_list node_affinity = {
	.list = LIST_HEAD_INIT(node_affinity.list),
	.lock = __MUTEX_INITIALIZER(node_affinity.lock)
};
20
/*
 * Name of IRQ types, indexed by enum irq_type; used only to build
 * human-readable log messages.
 */
static const char * const irq_type_names[] = {
	"SDMA",
	"RCVCTXT",
	"NETDEVCTXT",
	"GENERAL",
	"OTHER",
};
29
/*
 * Per NUMA node count of HFI devices, indexed by node id.
 * Allocated in node_affinity_init(), freed in node_affinity_destroy_all().
 */
static unsigned int *hfi1_per_node_cntr;
32
init_cpu_mask_set(struct cpu_mask_set * set)33 static inline void init_cpu_mask_set(struct cpu_mask_set *set)
34 {
35 cpumask_clear(&set->mask);
36 cpumask_clear(&set->used);
37 set->gen = 0;
38 }
39
40 /* Increment generation of CPU set if needed */
_cpu_mask_set_gen_inc(struct cpu_mask_set * set)41 static void _cpu_mask_set_gen_inc(struct cpu_mask_set *set)
42 {
43 if (cpumask_equal(&set->mask, &set->used)) {
44 /*
45 * We've used up all the CPUs, bump up the generation
46 * and reset the 'used' map
47 */
48 set->gen++;
49 cpumask_clear(&set->used);
50 }
51 }
52
_cpu_mask_set_gen_dec(struct cpu_mask_set * set)53 static void _cpu_mask_set_gen_dec(struct cpu_mask_set *set)
54 {
55 if (cpumask_empty(&set->used) && set->gen) {
56 set->gen--;
57 cpumask_copy(&set->used, &set->mask);
58 }
59 }
60
61 /* Get the first CPU from the list of unused CPUs in a CPU set data structure */
cpu_mask_set_get_first(struct cpu_mask_set * set,cpumask_var_t diff)62 static int cpu_mask_set_get_first(struct cpu_mask_set *set, cpumask_var_t diff)
63 {
64 int cpu;
65
66 if (!diff || !set)
67 return -EINVAL;
68
69 _cpu_mask_set_gen_inc(set);
70
71 /* Find out CPUs left in CPU mask */
72 cpumask_andnot(diff, &set->mask, &set->used);
73
74 cpu = cpumask_first(diff);
75 if (cpu >= nr_cpu_ids) /* empty */
76 cpu = -EINVAL;
77 else
78 cpumask_set_cpu(cpu, &set->used);
79
80 return cpu;
81 }
82
cpu_mask_set_put(struct cpu_mask_set * set,int cpu)83 static void cpu_mask_set_put(struct cpu_mask_set *set, int cpu)
84 {
85 if (!set)
86 return;
87
88 cpumask_clear_cpu(cpu, &set->used);
89 _cpu_mask_set_gen_dec(set);
90 }
91
/* Initialize non-HT cpu cores mask */
void init_real_cpu_mask(void)
{
	int possible, curr_cpu, ht;

	/* Start with cpu online mask as the real cpu mask */
	cpumask_copy(&node_affinity.real_cpu_mask, cpu_online_mask);

	/*
	 * Remove HT cores from the real cpu mask. Do this in two steps below.
	 */
	possible = cpumask_weight(&node_affinity.real_cpu_mask);
	/* HW threads per core, sampled from the first online CPU */
	ht = cpumask_weight(topology_sibling_cpumask(
				cpumask_first(&node_affinity.real_cpu_mask)));
	/*
	 * Step 1. Skip over the first N HT siblings and use them as the
	 * "real" cores. Assumes that HT cores are not enumerated in
	 * succession (except in the single core case).
	 */
	curr_cpu = cpumask_nth(possible / ht, &node_affinity.real_cpu_mask) + 1;

	/* Step 2. Remove the remaining HT siblings. */
	cpumask_clear_cpus(&node_affinity.real_cpu_mask, curr_cpu, nr_cpu_ids - curr_cpu);
}
116
/*
 * Initialize the global affinity state: the process CPU pool, node and
 * CPU counts, the non-HT "real" CPU mask, and a per-NUMA-node count of
 * HFI devices found on the PCI bus.
 *
 * Returns 0 on success, -ENOMEM if the per-node counter array cannot
 * be allocated.
 */
int node_affinity_init(void)
{
	int node;
	struct pci_dev *dev = NULL;
	const struct pci_device_id *ids = hfi1_pci_tbl;

	cpumask_clear(&node_affinity.proc.used);
	cpumask_copy(&node_affinity.proc.mask, cpu_online_mask);

	node_affinity.proc.gen = 0;
	/* HW threads per core, sampled from the first CPU in the pool */
	node_affinity.num_core_siblings =
		cpumask_weight(topology_sibling_cpumask(
			cpumask_first(&node_affinity.proc.mask)
			));
	node_affinity.num_possible_nodes = num_possible_nodes();
	node_affinity.num_online_nodes = num_online_nodes();
	node_affinity.num_online_cpus = num_online_cpus();

	/*
	 * The real cpu mask is part of the affinity struct but it has to be
	 * initialized early. It is needed to calculate the number of user
	 * contexts in set_up_context_variables().
	 */
	init_real_cpu_mask();

	hfi1_per_node_cntr = kcalloc(node_affinity.num_possible_nodes,
				     sizeof(*hfi1_per_node_cntr), GFP_KERNEL);
	if (!hfi1_per_node_cntr)
		return -ENOMEM;

	/* Count HFI devices per NUMA node by walking the PCI id table */
	while (ids->vendor) {
		dev = NULL;
		while ((dev = pci_get_device(ids->vendor, ids->device, dev))) {
			node = pcibus_to_node(dev->bus);
			if (node < 0)
				goto out;

			hfi1_per_node_cntr[node]++;
		}
		ids++;
	}

	return 0;

out:
	/*
	 * Invalid PCI NUMA node information found, note it, and populate
	 * our database 1:1.
	 */
	pr_err("HFI: Invalid PCI NUMA node. Performance may be affected\n");
	pr_err("HFI: System BIOS may need to be upgraded\n");
	for (node = 0; node < node_affinity.num_possible_nodes; node++)
		hfi1_per_node_cntr[node] = 1;

	/* Drop the device reference still held when we broke out of the loop */
	pci_dev_put(dev);

	return 0;
}
175
node_affinity_destroy(struct hfi1_affinity_node * entry)176 static void node_affinity_destroy(struct hfi1_affinity_node *entry)
177 {
178 free_percpu(entry->comp_vect_affinity);
179 kfree(entry);
180 }
181
node_affinity_destroy_all(void)182 void node_affinity_destroy_all(void)
183 {
184 struct list_head *pos, *q;
185 struct hfi1_affinity_node *entry;
186
187 mutex_lock(&node_affinity.lock);
188 list_for_each_safe(pos, q, &node_affinity.list) {
189 entry = list_entry(pos, struct hfi1_affinity_node,
190 list);
191 list_del(pos);
192 node_affinity_destroy(entry);
193 }
194 mutex_unlock(&node_affinity.lock);
195 kfree(hfi1_per_node_cntr);
196 }
197
node_affinity_allocate(int node)198 static struct hfi1_affinity_node *node_affinity_allocate(int node)
199 {
200 struct hfi1_affinity_node *entry;
201
202 entry = kzalloc_obj(*entry);
203 if (!entry)
204 return NULL;
205 entry->node = node;
206 entry->comp_vect_affinity = alloc_percpu(u16);
207 INIT_LIST_HEAD(&entry->list);
208
209 return entry;
210 }
211
212 /*
213 * It appends an entry to the list.
214 * It *must* be called with node_affinity.lock held.
215 */
node_affinity_add_tail(struct hfi1_affinity_node * entry)216 static void node_affinity_add_tail(struct hfi1_affinity_node *entry)
217 {
218 list_add_tail(&entry->list, &node_affinity.list);
219 }
220
221 /* It must be called with node_affinity.lock held */
node_affinity_lookup(int node)222 static struct hfi1_affinity_node *node_affinity_lookup(int node)
223 {
224 struct hfi1_affinity_node *entry;
225
226 list_for_each_entry(entry, &node_affinity.list, list) {
227 if (entry->node == node)
228 return entry;
229 }
230
231 return NULL;
232 }
233
/*
 * Pick the CPU in @possible_cpumask with the lowest per-cpu counter
 * (first such CPU on a tie) and bump its counter.
 *
 * Returns the chosen CPU, or -EINVAL on bad arguments or an empty mask.
 */
static int per_cpu_affinity_get(cpumask_var_t possible_cpumask,
				u16 __percpu *comp_vect_affinity)
{
	int cpu, best;
	u16 best_cntr, cntr;

	if (!possible_cpumask || !comp_vect_affinity)
		return -EINVAL;

	best = cpumask_first(possible_cpumask);
	if (best >= nr_cpu_ids)
		return -EINVAL;

	best_cntr = *per_cpu_ptr(comp_vect_affinity, best);
	for_each_cpu(cpu, possible_cpumask) {
		cntr = *per_cpu_ptr(comp_vect_affinity, cpu);

		if (cntr < best_cntr) {
			best = cpu;
			best_cntr = cntr;
		}
	}

	*per_cpu_ptr(comp_vect_affinity, best) += 1;

	return best;
}
273
/*
 * Find the CPU in @possible_cpumask with the highest per-cpu counter
 * (first such CPU on a tie) and decrement its counter.
 *
 * Returns that CPU, or -EINVAL on bad arguments or an empty mask.
 */
static int per_cpu_affinity_put_max(cpumask_var_t possible_cpumask,
				    u16 __percpu *comp_vect_affinity)
{
	int cpu, max_cpu;
	u16 max_cntr, cntr;

	if (!possible_cpumask || !comp_vect_affinity)
		return -EINVAL;

	max_cpu = cpumask_first(possible_cpumask);
	if (max_cpu >= nr_cpu_ids)
		return -EINVAL;

	max_cntr = *per_cpu_ptr(comp_vect_affinity, max_cpu);
	for_each_cpu(cpu, possible_cpumask) {
		cntr = *per_cpu_ptr(comp_vect_affinity, cpu);

		if (cntr > max_cntr) {
			max_cpu = cpu;
			max_cntr = cntr;
		}
	}

	*per_cpu_ptr(comp_vect_affinity, max_cpu) -= 1;

	return max_cpu;
}
306
307 /*
308 * Non-interrupt CPUs are used first, then interrupt CPUs.
309 * Two already allocated cpu masks must be passed.
310 */
_dev_comp_vect_cpu_get(struct hfi1_devdata * dd,struct hfi1_affinity_node * entry,cpumask_var_t non_intr_cpus,cpumask_var_t available_cpus)311 static int _dev_comp_vect_cpu_get(struct hfi1_devdata *dd,
312 struct hfi1_affinity_node *entry,
313 cpumask_var_t non_intr_cpus,
314 cpumask_var_t available_cpus)
315 __must_hold(&node_affinity.lock)
316 {
317 int cpu;
318 struct cpu_mask_set *set = dd->comp_vect;
319
320 lockdep_assert_held(&node_affinity.lock);
321 if (!non_intr_cpus) {
322 cpu = -1;
323 goto fail;
324 }
325
326 if (!available_cpus) {
327 cpu = -1;
328 goto fail;
329 }
330
331 /* Available CPUs for pinning completion vectors */
332 _cpu_mask_set_gen_inc(set);
333 cpumask_andnot(available_cpus, &set->mask, &set->used);
334
335 /* Available CPUs without SDMA engine interrupts */
336 cpumask_andnot(non_intr_cpus, available_cpus,
337 &entry->def_intr.used);
338
339 /* If there are non-interrupt CPUs available, use them first */
340 cpu = cpumask_first(non_intr_cpus);
341
342 /* Otherwise, use interrupt CPUs */
343 if (cpu >= nr_cpu_ids)
344 cpu = cpumask_first(available_cpus);
345
346 if (cpu >= nr_cpu_ids) { /* empty */
347 cpu = -1;
348 goto fail;
349 }
350 cpumask_set_cpu(cpu, &set->used);
351
352 fail:
353 return cpu;
354 }
355
_dev_comp_vect_cpu_put(struct hfi1_devdata * dd,int cpu)356 static void _dev_comp_vect_cpu_put(struct hfi1_devdata *dd, int cpu)
357 {
358 struct cpu_mask_set *set = dd->comp_vect;
359
360 if (cpu < 0)
361 return;
362
363 cpu_mask_set_put(set, cpu);
364 }
365
366 /* _dev_comp_vect_mappings_destroy() is reentrant */
_dev_comp_vect_mappings_destroy(struct hfi1_devdata * dd)367 static void _dev_comp_vect_mappings_destroy(struct hfi1_devdata *dd)
368 {
369 int i, cpu;
370
371 if (!dd->comp_vect_mappings)
372 return;
373
374 for (i = 0; i < dd->comp_vect_possible_cpus; i++) {
375 cpu = dd->comp_vect_mappings[i];
376 _dev_comp_vect_cpu_put(dd, cpu);
377 dd->comp_vect_mappings[i] = -1;
378 hfi1_cdbg(AFFINITY,
379 "[%s] Release CPU %d from completion vector %d",
380 rvt_get_ibdev_name(&(dd)->verbs_dev.rdi), cpu, i);
381 }
382
383 kfree(dd->comp_vect_mappings);
384 dd->comp_vect_mappings = NULL;
385 }
386
/*
 * This function creates the table for looking up CPUs for completion vectors.
 * num_comp_vectors needs to have been initialized before calling this function.
 *
 * On any failure the partially built table is torn down again, so the
 * function either fully succeeds or leaves no mappings behind.
 */
static int _dev_comp_vect_mappings_create(struct hfi1_devdata *dd,
					  struct hfi1_affinity_node *entry)
	__must_hold(&node_affinity.lock)
{
	int i, cpu, ret;
	cpumask_var_t non_intr_cpus;
	cpumask_var_t available_cpus;

	lockdep_assert_held(&node_affinity.lock);

	/* Scratch masks consumed by _dev_comp_vect_cpu_get() below */
	if (!zalloc_cpumask_var(&non_intr_cpus, GFP_KERNEL))
		return -ENOMEM;

	if (!zalloc_cpumask_var(&available_cpus, GFP_KERNEL)) {
		free_cpumask_var(non_intr_cpus);
		return -ENOMEM;
	}

	dd->comp_vect_mappings = kzalloc_objs(*dd->comp_vect_mappings,
					      dd->comp_vect_possible_cpus);
	if (!dd->comp_vect_mappings) {
		ret = -ENOMEM;
		goto fail;
	}
	/* -1 marks "no CPU assigned"; the destroy path relies on this */
	for (i = 0; i < dd->comp_vect_possible_cpus; i++)
		dd->comp_vect_mappings[i] = -1;

	for (i = 0; i < dd->comp_vect_possible_cpus; i++) {
		cpu = _dev_comp_vect_cpu_get(dd, entry, non_intr_cpus,
					     available_cpus);
		if (cpu < 0) {
			ret = -EINVAL;
			goto fail;
		}

		dd->comp_vect_mappings[i] = cpu;
		hfi1_cdbg(AFFINITY,
			  "[%s] Completion Vector %d -> CPU %d",
			  rvt_get_ibdev_name(&(dd)->verbs_dev.rdi), i, cpu);
	}

	free_cpumask_var(available_cpus);
	free_cpumask_var(non_intr_cpus);
	return 0;

fail:
	free_cpumask_var(available_cpus);
	free_cpumask_var(non_intr_cpus);
	/* Releases any CPUs already placed in the mappings table */
	_dev_comp_vect_mappings_destroy(dd);

	return ret;
}
443
hfi1_comp_vectors_set_up(struct hfi1_devdata * dd)444 int hfi1_comp_vectors_set_up(struct hfi1_devdata *dd)
445 {
446 int ret;
447 struct hfi1_affinity_node *entry;
448
449 mutex_lock(&node_affinity.lock);
450 entry = node_affinity_lookup(dd->node);
451 if (!entry) {
452 ret = -EINVAL;
453 goto unlock;
454 }
455 ret = _dev_comp_vect_mappings_create(dd, entry);
456 unlock:
457 mutex_unlock(&node_affinity.lock);
458
459 return ret;
460 }
461
/* Undo hfi1_comp_vectors_set_up(): drop all completion vector mappings. */
void hfi1_comp_vectors_clean_up(struct hfi1_devdata *dd)
{
	_dev_comp_vect_mappings_destroy(dd);
}
466
hfi1_comp_vect_mappings_lookup(struct rvt_dev_info * rdi,int comp_vect)467 int hfi1_comp_vect_mappings_lookup(struct rvt_dev_info *rdi, int comp_vect)
468 {
469 struct hfi1_ibdev *verbs_dev = dev_from_rdi(rdi);
470 struct hfi1_devdata *dd = dd_from_dev(verbs_dev);
471
472 if (!dd->comp_vect_mappings)
473 return -EINVAL;
474 if (comp_vect >= dd->comp_vect_possible_cpus)
475 return -EINVAL;
476
477 return dd->comp_vect_mappings[comp_vect];
478 }
479
/*
 * Compute how many completion-vector CPUs this device may use (stored
 * in dd->comp_vect_possible_cpus) and reserve that many CPUs from the
 * node-wide comp_vect_mask, balancing load via the node's per-cpu
 * counters.
 */
static int _dev_comp_vect_cpu_mask_init(struct hfi1_devdata *dd,
					struct hfi1_affinity_node *entry,
					bool first_dev_init)
	__must_hold(&node_affinity.lock)
{
	int i, j, curr_cpu;
	int possible_cpus_comp_vect = 0;
	struct cpumask *dev_comp_vect_mask = &dd->comp_vect->mask;

	lockdep_assert_held(&node_affinity.lock);
	/*
	 * If there's only one CPU available for completion vectors, then
	 * there will only be one completion vector available. Otherwise,
	 * the number of completion vectors available will be the number of
	 * available CPUs divided by the number of devices in the
	 * local NUMA node.
	 */
	if (cpumask_weight(&entry->comp_vect_mask) == 1) {
		possible_cpus_comp_vect = 1;
		dd_dev_warn(dd,
			    "Number of kernel receive queues is too large for completion vector affinity to be effective\n");
	} else {
		possible_cpus_comp_vect +=
			cpumask_weight(&entry->comp_vect_mask) /
						hfi1_per_node_cntr[dd->node];

		/*
		 * If the completion vector CPUs available doesn't divide
		 * evenly among devices, then the first device to be
		 * initialized gets an extra CPU.
		 */
		if (first_dev_init &&
		    cpumask_weight(&entry->comp_vect_mask) %
		    hfi1_per_node_cntr[dd->node] != 0)
			possible_cpus_comp_vect++;
	}

	dd->comp_vect_possible_cpus = possible_cpus_comp_vect;

	/* Reserving CPUs for device completion vector */
	for (i = 0; i < dd->comp_vect_possible_cpus; i++) {
		curr_cpu = per_cpu_affinity_get(&entry->comp_vect_mask,
						entry->comp_vect_affinity);
		if (curr_cpu < 0)
			goto fail;

		cpumask_set_cpu(curr_cpu, dev_comp_vect_mask);
	}

	hfi1_cdbg(AFFINITY,
		  "[%s] Completion vector affinity CPU set(s) %*pbl",
		  rvt_get_ibdev_name(&(dd)->verbs_dev.rdi),
		  cpumask_pr_args(dev_comp_vect_mask));

	return 0;

fail:
	/* Undo the i reservations already made before the failure */
	for (j = 0; j < i; j++)
		per_cpu_affinity_put_max(&entry->comp_vect_mask,
					 entry->comp_vect_affinity);

	/* Propagate the error code from per_cpu_affinity_get() */
	return curr_cpu;
}
546
547 /*
548 * It assumes dd->comp_vect_possible_cpus is available.
549 */
_dev_comp_vect_cpu_mask_clean_up(struct hfi1_devdata * dd,struct hfi1_affinity_node * entry)550 static void _dev_comp_vect_cpu_mask_clean_up(struct hfi1_devdata *dd,
551 struct hfi1_affinity_node *entry)
552 __must_hold(&node_affinity.lock)
553 {
554 int i, cpu;
555
556 lockdep_assert_held(&node_affinity.lock);
557 if (!dd->comp_vect_possible_cpus)
558 return;
559
560 for (i = 0; i < dd->comp_vect_possible_cpus; i++) {
561 cpu = per_cpu_affinity_put_max(&dd->comp_vect->mask,
562 entry->comp_vect_affinity);
563 /* Clearing CPU in device completion vector cpu mask */
564 if (cpu >= 0)
565 cpumask_clear_cpu(cpu, &dd->comp_vect->mask);
566 }
567
568 dd->comp_vect_possible_cpus = 0;
569 }
570
/*
 * Interrupt affinity.
 *
 * non-rcv avail gets a default mask that
 * starts as possible cpus with threads reset
 * and each rcv avail reset.
 *
 * rcv avail gets node relative 1 wrapping back
 * to the node relative 1 as necessary.
 *
 * Returns 0 on success, or a negative errno if the per-node affinity
 * entry or the device completion-vector CPUs cannot be set up.
 */
int hfi1_dev_affinity_init(struct hfi1_devdata *dd)
{
	struct hfi1_affinity_node *entry;
	const struct cpumask *local_mask;
	int curr_cpu, possible, i, ret;
	bool new_entry = false;

	local_mask = cpumask_of_node(dd->node);
	/* Node has no CPUs: fall back to CPU 0's core mask */
	if (cpumask_first(local_mask) >= nr_cpu_ids)
		local_mask = topology_core_cpumask(0);

	mutex_lock(&node_affinity.lock);
	entry = node_affinity_lookup(dd->node);

	/*
	 * If this is the first time this NUMA node's affinity is used,
	 * create an entry in the global affinity structure and initialize it.
	 */
	if (!entry) {
		entry = node_affinity_allocate(dd->node);
		if (!entry) {
			dd_dev_err(dd,
				   "Unable to allocate global affinity node\n");
			ret = -ENOMEM;
			goto fail;
		}
		new_entry = true;

		init_cpu_mask_set(&entry->def_intr);
		init_cpu_mask_set(&entry->rcv_intr);
		cpumask_clear(&entry->comp_vect_mask);
		cpumask_clear(&entry->general_intr_mask);
		/* Use the "real" cpu mask of this node as the default */
		cpumask_and(&entry->def_intr.mask, &node_affinity.real_cpu_mask,
			    local_mask);

		/* fill in the receive list */
		possible = cpumask_weight(&entry->def_intr.mask);
		curr_cpu = cpumask_first(&entry->def_intr.mask);

		if (possible == 1) {
			/* only one CPU, everyone will use it */
			cpumask_set_cpu(curr_cpu, &entry->rcv_intr.mask);
			cpumask_set_cpu(curr_cpu, &entry->general_intr_mask);
		} else {
			/*
			 * The general/control context will be the first CPU in
			 * the default list, so it is removed from the default
			 * list and added to the general interrupt list.
			 */
			cpumask_clear_cpu(curr_cpu, &entry->def_intr.mask);
			cpumask_set_cpu(curr_cpu, &entry->general_intr_mask);
			curr_cpu = cpumask_next(curr_cpu,
						&entry->def_intr.mask);

			/*
			 * Remove the remaining kernel receive queues from
			 * the default list and add them to the receive list.
			 */
			for (i = 0;
			     i < (dd->n_krcv_queues - 1) *
				  hfi1_per_node_cntr[dd->node];
			     i++) {
				cpumask_clear_cpu(curr_cpu,
						  &entry->def_intr.mask);
				cpumask_set_cpu(curr_cpu,
						&entry->rcv_intr.mask);
				curr_cpu = cpumask_next(curr_cpu,
							&entry->def_intr.mask);
				/* Stop early if the node runs out of CPUs */
				if (curr_cpu >= nr_cpu_ids)
					break;
			}

			/*
			 * If there ends up being 0 CPU cores leftover for SDMA
			 * engines, use the same CPU cores as general/control
			 * context.
			 */
			if (cpumask_empty(&entry->def_intr.mask))
				cpumask_copy(&entry->def_intr.mask,
					     &entry->general_intr_mask);
		}

		/* Determine completion vector CPUs for the entire node */
		cpumask_and(&entry->comp_vect_mask,
			    &node_affinity.real_cpu_mask, local_mask);
		cpumask_andnot(&entry->comp_vect_mask,
			       &entry->comp_vect_mask,
			       &entry->rcv_intr.mask);
		cpumask_andnot(&entry->comp_vect_mask,
			       &entry->comp_vect_mask,
			       &entry->general_intr_mask);

		/*
		 * If there ends up being 0 CPU cores leftover for completion
		 * vectors, use the same CPU core as the general/control
		 * context.
		 */
		if (cpumask_empty(&entry->comp_vect_mask))
			cpumask_copy(&entry->comp_vect_mask,
				     &entry->general_intr_mask);
	}

	ret = _dev_comp_vect_cpu_mask_init(dd, entry, new_entry);
	if (ret < 0)
		goto fail;

	if (new_entry)
		node_affinity_add_tail(entry);

	dd->affinity_entry = entry;
	mutex_unlock(&node_affinity.lock);

	return 0;

fail:
	/* A freshly allocated (still unlisted) entry must be freed on error */
	if (new_entry)
		node_affinity_destroy(entry);
	mutex_unlock(&node_affinity.lock);
	return ret;
}
703
hfi1_dev_affinity_clean_up(struct hfi1_devdata * dd)704 void hfi1_dev_affinity_clean_up(struct hfi1_devdata *dd)
705 {
706 struct hfi1_affinity_node *entry;
707
708 mutex_lock(&node_affinity.lock);
709 if (!dd->affinity_entry)
710 goto unlock;
711 entry = node_affinity_lookup(dd->node);
712 if (!entry)
713 goto unlock;
714
715 /*
716 * Free device completion vector CPUs to be used by future
717 * completion vectors
718 */
719 _dev_comp_vect_cpu_mask_clean_up(dd, entry);
720 unlock:
721 dd->affinity_entry = NULL;
722 mutex_unlock(&node_affinity.lock);
723 }
724
/*
 * Function updates the irq affinity hint for msix after it has been changed
 * by the user using the /proc/irq interface. This function only accepts
 * one cpu in the mask.
 */
static void hfi1_update_sdma_affinity(struct hfi1_msix_entry *msix, int cpu)
{
	struct sdma_engine *sde = msix->arg;
	struct hfi1_devdata *dd = sde->dd;
	struct hfi1_affinity_node *entry;
	struct cpu_mask_set *set;
	int i, old_cpu;

	/*
	 * NOTE(review): this validity check compares the cpu id against the
	 * *count* of online CPUs; with sparse cpu numbering a valid id can
	 * exceed num_online_cpus() — confirm whether cpu >= nr_cpu_ids was
	 * intended.
	 */
	if (cpu > num_online_cpus() || cpu == sde->cpu)
		return;

	mutex_lock(&node_affinity.lock);
	entry = node_affinity_lookup(dd->node);
	if (!entry)
		goto unlock;

	old_cpu = sde->cpu;
	sde->cpu = cpu;
	cpumask_clear(&msix->mask);
	cpumask_set_cpu(cpu, &msix->mask);
	dd_dev_dbg(dd, "IRQ: %u, type %s engine %u -> cpu: %d\n",
		   msix->irq, irq_type_names[msix->type],
		   sde->this_idx, cpu);
	irq_set_affinity_hint(msix->irq, &msix->mask);

	/*
	 * Set the new cpu in the hfi1_affinity_node and clean
	 * the old cpu if it is not used by any other IRQ
	 */
	set = &entry->def_intr;
	cpumask_set_cpu(cpu, &set->mask);
	cpumask_set_cpu(cpu, &set->used);
	for (i = 0; i < dd->msix_info.max_requested; i++) {
		struct hfi1_msix_entry *other_msix;

		other_msix = &dd->msix_info.msix_entries[i];
		if (other_msix->type != IRQ_SDMA || other_msix == msix)
			continue;

		/* old_cpu still serves another SDMA IRQ: keep it accounted */
		if (cpumask_test_cpu(old_cpu, &other_msix->mask))
			goto unlock;
	}
	cpumask_clear_cpu(old_cpu, &set->mask);
	cpumask_clear_cpu(old_cpu, &set->used);
unlock:
	mutex_unlock(&node_affinity.lock);
}
777
hfi1_irq_notifier_notify(struct irq_affinity_notify * notify,const cpumask_t * mask)778 static void hfi1_irq_notifier_notify(struct irq_affinity_notify *notify,
779 const cpumask_t *mask)
780 {
781 int cpu = cpumask_first(mask);
782 struct hfi1_msix_entry *msix = container_of(notify,
783 struct hfi1_msix_entry,
784 notify);
785
786 /* Only one CPU configuration supported currently */
787 hfi1_update_sdma_affinity(msix, cpu);
788 }
789
/*
 * Release hook required by the affinity notifier interface; nothing is
 * dynamically allocated here, so there is nothing to free.
 */
static void hfi1_irq_notifier_release(struct kref *ref)
{
}
797
hfi1_setup_sdma_notifier(struct hfi1_msix_entry * msix)798 static void hfi1_setup_sdma_notifier(struct hfi1_msix_entry *msix)
799 {
800 struct irq_affinity_notify *notify = &msix->notify;
801
802 notify->irq = msix->irq;
803 notify->notify = hfi1_irq_notifier_notify;
804 notify->release = hfi1_irq_notifier_release;
805
806 if (irq_set_affinity_notifier(notify->irq, notify))
807 pr_err("Failed to register sdma irq affinity notifier for irq %d\n",
808 notify->irq);
809 }
810
hfi1_cleanup_sdma_notifier(struct hfi1_msix_entry * msix)811 static void hfi1_cleanup_sdma_notifier(struct hfi1_msix_entry *msix)
812 {
813 struct irq_affinity_notify *notify = &msix->notify;
814
815 if (irq_set_affinity_notifier(notify->irq, NULL))
816 pr_err("Failed to cleanup sdma irq affinity notifier for irq %d\n",
817 notify->irq);
818 }
819
820 /*
821 * Function sets the irq affinity for msix.
822 * It *must* be called with node_affinity.lock held.
823 */
get_irq_affinity(struct hfi1_devdata * dd,struct hfi1_msix_entry * msix)824 static int get_irq_affinity(struct hfi1_devdata *dd,
825 struct hfi1_msix_entry *msix)
826 {
827 cpumask_var_t diff;
828 struct hfi1_affinity_node *entry;
829 struct cpu_mask_set *set = NULL;
830 struct sdma_engine *sde = NULL;
831 struct hfi1_ctxtdata *rcd = NULL;
832 char extra[64];
833 int cpu = -1;
834
835 extra[0] = '\0';
836 cpumask_clear(&msix->mask);
837
838 entry = node_affinity_lookup(dd->node);
839
840 switch (msix->type) {
841 case IRQ_SDMA:
842 sde = (struct sdma_engine *)msix->arg;
843 scnprintf(extra, 64, "engine %u", sde->this_idx);
844 set = &entry->def_intr;
845 break;
846 case IRQ_GENERAL:
847 cpu = cpumask_first(&entry->general_intr_mask);
848 break;
849 case IRQ_RCVCTXT:
850 rcd = (struct hfi1_ctxtdata *)msix->arg;
851 if (rcd->ctxt == HFI1_CTRL_CTXT)
852 cpu = cpumask_first(&entry->general_intr_mask);
853 else
854 set = &entry->rcv_intr;
855 scnprintf(extra, 64, "ctxt %u", rcd->ctxt);
856 break;
857 case IRQ_NETDEVCTXT:
858 rcd = (struct hfi1_ctxtdata *)msix->arg;
859 set = &entry->def_intr;
860 scnprintf(extra, 64, "ctxt %u", rcd->ctxt);
861 break;
862 default:
863 dd_dev_err(dd, "Invalid IRQ type %d\n", msix->type);
864 return -EINVAL;
865 }
866
867 /*
868 * The general and control contexts are placed on a particular
869 * CPU, which is set above. Skip accounting for it. Everything else
870 * finds its CPU here.
871 */
872 if (cpu == -1 && set) {
873 if (!zalloc_cpumask_var(&diff, GFP_KERNEL))
874 return -ENOMEM;
875
876 cpu = cpu_mask_set_get_first(set, diff);
877 if (cpu < 0) {
878 free_cpumask_var(diff);
879 dd_dev_err(dd, "Failure to obtain CPU for IRQ\n");
880 return cpu;
881 }
882
883 free_cpumask_var(diff);
884 }
885
886 cpumask_set_cpu(cpu, &msix->mask);
887 dd_dev_info(dd, "IRQ: %u, type %s %s -> cpu: %d\n",
888 msix->irq, irq_type_names[msix->type],
889 extra, cpu);
890 irq_set_affinity_hint(msix->irq, &msix->mask);
891
892 if (msix->type == IRQ_SDMA) {
893 sde->cpu = cpu;
894 hfi1_setup_sdma_notifier(msix);
895 }
896
897 return 0;
898 }
899
hfi1_get_irq_affinity(struct hfi1_devdata * dd,struct hfi1_msix_entry * msix)900 int hfi1_get_irq_affinity(struct hfi1_devdata *dd, struct hfi1_msix_entry *msix)
901 {
902 int ret;
903
904 mutex_lock(&node_affinity.lock);
905 ret = get_irq_affinity(dd, msix);
906 mutex_unlock(&node_affinity.lock);
907 return ret;
908 }
909
/*
 * Release the CPU accounting for an MSI-X vector and clear its affinity
 * hint. Inverse of hfi1_get_irq_affinity().
 */
void hfi1_put_irq_affinity(struct hfi1_devdata *dd,
			   struct hfi1_msix_entry *msix)
{
	struct cpu_mask_set *set = NULL;
	struct hfi1_affinity_node *entry;

	mutex_lock(&node_affinity.lock);
	entry = node_affinity_lookup(dd->node);

	/*
	 * NOTE(review): entry is dereferenced below without a NULL check;
	 * this relies on hfi1_dev_affinity_init() having created the entry
	 * for this node — confirm that invariant holds on all call paths.
	 */
	switch (msix->type) {
	case IRQ_SDMA:
		set = &entry->def_intr;
		hfi1_cleanup_sdma_notifier(msix);
		break;
	case IRQ_GENERAL:
		/* Don't do accounting for general contexts */
		break;
	case IRQ_RCVCTXT: {
		struct hfi1_ctxtdata *rcd = msix->arg;

		/* Don't do accounting for control contexts */
		if (rcd->ctxt != HFI1_CTRL_CTXT)
			set = &entry->rcv_intr;
		break;
	}
	case IRQ_NETDEVCTXT:
		set = &entry->def_intr;
		break;
	default:
		mutex_unlock(&node_affinity.lock);
		return;
	}

	if (set) {
		/* Drop every CPU this vector held from the 'used' map */
		cpumask_andnot(&set->used, &set->used, &msix->mask);
		_cpu_mask_set_gen_dec(set);
	}

	irq_set_affinity_hint(msix->irq, NULL);
	cpumask_clear(&msix->mask);
	mutex_unlock(&node_affinity.lock);
}
952
/*
 * Build, from the process CPU pool, the mask of CPUs that belong to HW
 * thread number @hw_thread_no of each physical core.
 * This should be called with node_affinity.lock held.
 */
static void find_hw_thread_mask(uint hw_thread_no, cpumask_var_t hw_thread_mask,
				struct hfi1_affinity_node_list *affinity)
{
	int curr_cpu;
	uint num_cores;

	/* Start from the full process CPU pool */
	cpumask_copy(hw_thread_mask, &affinity->proc.mask);

	/* No sibling information: use the pool as-is */
	if (affinity->num_core_siblings == 0)
		return;

	/*
	 * NOTE(review): presumably num_cores is the per-node physical core
	 * count and the index below spans one full set of first HW threads —
	 * verify the rounddown/multiply arithmetic against the intended
	 * topology assumptions.
	 */
	num_cores = rounddown(node_affinity.num_online_cpus / affinity->num_core_siblings,
			      node_affinity.num_online_nodes);

	/* Removing other siblings not needed for now */
	curr_cpu = cpumask_nth(num_cores * node_affinity.num_online_nodes, hw_thread_mask) + 1;
	cpumask_clear_cpus(hw_thread_mask, curr_cpu, nr_cpu_ids - curr_cpu);

	/* Identifying correct HW threads within physical cores */
	cpumask_shift_left(hw_thread_mask, hw_thread_mask, num_cores * hw_thread_no);
}
975
/*
 * Recommend a CPU for a user process opening a context on the device on
 * NUMA node @node, and mark that CPU as used.
 *
 * Returns the chosen CPU id, or -1 when no recommendation is made
 * (process already restricted to a CPU subset, allocation failure, or
 * no CPU available).
 */
int hfi1_get_proc_affinity(int node)
{
	int cpu = -1, ret, i;
	struct hfi1_affinity_node *entry;
	cpumask_var_t diff, hw_thread_mask, available_mask, intrs_mask;
	const struct cpumask *node_mask,
		*proc_mask = current->cpus_ptr;
	struct hfi1_affinity_node_list *affinity = &node_affinity;
	struct cpu_mask_set *set = &affinity->proc;

	/*
	 * check whether process/context affinity has already
	 * been set
	 */
	if (current->nr_cpus_allowed == 1) {
		hfi1_cdbg(PROC, "PID %u %s affinity set to CPU %*pbl",
			  current->pid, current->comm,
			  cpumask_pr_args(proc_mask));
		/*
		 * Mark the pre-set CPU as used. This is atomic so we don't
		 * need the lock
		 */
		cpu = cpumask_first(proc_mask);
		cpumask_set_cpu(cpu, &set->used);
		goto done;
	} else if (current->nr_cpus_allowed < cpumask_weight(&set->mask)) {
		hfi1_cdbg(PROC, "PID %u %s affinity set to CPU set(s) %*pbl",
			  current->pid, current->comm,
			  cpumask_pr_args(proc_mask));
		goto done;
	}

	/*
	 * The process does not have a preset CPU affinity so find one to
	 * recommend using the following algorithm:
	 *
	 * For each user process that is opening a context on HFI Y:
	 *  a) If all cores are filled, reinitialize the bitmask
	 *  b) Fill real cores first, then HT cores (First set of HT
	 *     cores on all physical cores, then second set of HT core,
	 *     and, so on) in the following order:
	 *
	 *     1. Same NUMA node as HFI Y and not running an IRQ
	 *        handler
	 *     2. Same NUMA node as HFI Y and running an IRQ handler
	 *     3. Different NUMA node to HFI Y and not running an IRQ
	 *        handler
	 *     4. Different NUMA node to HFI Y and running an IRQ
	 *        handler
	 *  c) Mark core as filled in the bitmask. As user processes are
	 *     done, clear cores from the bitmask.
	 */

	/* Scratch masks; on any allocation failure return -1 (no pick) */
	ret = zalloc_cpumask_var(&diff, GFP_KERNEL);
	if (!ret)
		goto done;
	ret = zalloc_cpumask_var(&hw_thread_mask, GFP_KERNEL);
	if (!ret)
		goto free_diff;
	ret = zalloc_cpumask_var(&available_mask, GFP_KERNEL);
	if (!ret)
		goto free_hw_thread_mask;
	ret = zalloc_cpumask_var(&intrs_mask, GFP_KERNEL);
	if (!ret)
		goto free_available_mask;

	mutex_lock(&affinity->lock);
	/*
	 * If we've used all available HW threads, clear the mask and start
	 * overloading.
	 */
	_cpu_mask_set_gen_inc(set);

	/*
	 * If NUMA node has CPUs used by interrupt handlers, include them in the
	 * interrupt handler mask.
	 */
	entry = node_affinity_lookup(node);
	if (entry) {
		/* On a fresh generation the full mask counts as in use */
		cpumask_copy(intrs_mask, (entry->def_intr.gen ?
					  &entry->def_intr.mask :
					  &entry->def_intr.used));
		cpumask_or(intrs_mask, intrs_mask, (entry->rcv_intr.gen ?
						    &entry->rcv_intr.mask :
						    &entry->rcv_intr.used));
		cpumask_or(intrs_mask, intrs_mask, &entry->general_intr_mask);
	}
	hfi1_cdbg(PROC, "CPUs used by interrupts: %*pbl",
		  cpumask_pr_args(intrs_mask));

	cpumask_copy(hw_thread_mask, &set->mask);

	/*
	 * If HT cores are enabled, identify which HW threads within the
	 * physical cores should be used.
	 */
	for (i = 0; i < affinity->num_core_siblings; i++) {
		find_hw_thread_mask(i, hw_thread_mask, affinity);

		/*
		 * If there's at least one available core for this HW
		 * thread number, stop looking for a core.
		 *
		 * diff will always be not empty at least once in this
		 * loop as the used mask gets reset when
		 * (set->mask == set->used) before this loop.
		 */
		if (cpumask_andnot(diff, hw_thread_mask, &set->used))
			break;
	}
	hfi1_cdbg(PROC, "Same available HW thread on all physical CPUs: %*pbl",
		  cpumask_pr_args(hw_thread_mask));

	node_mask = cpumask_of_node(node);
	hfi1_cdbg(PROC, "Device on NUMA %u, CPUs %*pbl", node,
		  cpumask_pr_args(node_mask));

	/* Get cpumask of available CPUs on preferred NUMA */
	cpumask_and(available_mask, hw_thread_mask, node_mask);
	cpumask_andnot(available_mask, available_mask, &set->used);
	hfi1_cdbg(PROC, "Available CPUs on NUMA %u: %*pbl", node,
		  cpumask_pr_args(available_mask));

	/*
	 * At first, we don't want to place processes on the same
	 * CPUs as interrupt handlers. Then, CPUs running interrupt
	 * handlers are used.
	 *
	 * 1) If diff is not empty, then there are CPUs not running
	 *    non-interrupt handlers available, so diff gets copied
	 *    over to available_mask.
	 * 2) If diff is empty, then all CPUs not running interrupt
	 *    handlers are taken, so available_mask contains all
	 *    available CPUs running interrupt handlers.
	 * 3) If available_mask is empty, then all CPUs on the
	 *    preferred NUMA node are taken, so other NUMA nodes are
	 *    used for process assignments using the same method as
	 *    the preferred NUMA node.
	 */
	if (cpumask_andnot(diff, available_mask, intrs_mask))
		cpumask_copy(available_mask, diff);

	/* If we don't have CPUs on the preferred node, use other NUMA nodes */
	if (cpumask_empty(available_mask)) {
		cpumask_andnot(available_mask, hw_thread_mask, &set->used);
		/* Excluding preferred NUMA cores */
		cpumask_andnot(available_mask, available_mask, node_mask);
		hfi1_cdbg(PROC,
			  "Preferred NUMA node cores are taken, cores available in other NUMA nodes: %*pbl",
			  cpumask_pr_args(available_mask));

		/*
		 * At first, we don't want to place processes on the same
		 * CPUs as interrupt handlers.
		 */
		if (cpumask_andnot(diff, available_mask, intrs_mask))
			cpumask_copy(available_mask, diff);
	}
	hfi1_cdbg(PROC, "Possible CPUs for process: %*pbl",
		  cpumask_pr_args(available_mask));

	cpu = cpumask_first(available_mask);
	if (cpu >= nr_cpu_ids) /* empty */
		cpu = -1;
	else
		cpumask_set_cpu(cpu, &set->used);

	mutex_unlock(&affinity->lock);
	hfi1_cdbg(PROC, "Process assigned to CPU %d", cpu);

	free_cpumask_var(intrs_mask);
free_available_mask:
	free_cpumask_var(available_mask);
free_hw_thread_mask:
	free_cpumask_var(hw_thread_mask);
free_diff:
	free_cpumask_var(diff);
done:
	return cpu;
}
1156
hfi1_put_proc_affinity(int cpu)1157 void hfi1_put_proc_affinity(int cpu)
1158 {
1159 struct hfi1_affinity_node_list *affinity = &node_affinity;
1160 struct cpu_mask_set *set = &affinity->proc;
1161
1162 if (cpu < 0)
1163 return;
1164
1165 mutex_lock(&affinity->lock);
1166 cpu_mask_set_put(set, cpu);
1167 hfi1_cdbg(PROC, "Returning CPU %d for future process assignment", cpu);
1168 mutex_unlock(&affinity->lock);
1169 }
1170