xref: /linux/drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c (revision fa2852a28c5bb05b9ce7b0f6227d0b276b78c07e)
1 // SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
2 /* Copyright (c) 2019 Mellanox Technologies. */
3 
4 #include <linux/pci.h>
5 #include <linux/interrupt.h>
6 #include <linux/notifier.h>
7 #include <linux/mlx5/driver.h>
8 #include <linux/mlx5/vport.h>
9 #include "mlx5_core.h"
10 #include "mlx5_irq.h"
11 #include "pci_irq.h"
12 #include "lib/sf.h"
13 #include "lib/eq.h"
14 #ifdef CONFIG_RFS_ACCEL
15 #include <linux/cpu_rmap.h>
16 #endif
17 
/* Number of SFs accounted per SF control IRQ when sizing the SF ctrl pool */
#define MLX5_SFS_PER_CTRL_IRQ 64
/* Per-SF MSI-X budget used when estimating the total vector count */
#define MLX5_MAX_MSIX_PER_SF 256
/* Upper bound on the number of IRQs placed in the SF ctrl pool */
#define MLX5_IRQ_CTRL_SF_MAX 8
/* min num of vectors for SFs to be enabled */
#define MLX5_IRQ_VEC_COMP_BASE_SF 2
/* Offset of the first completion vector in the PCI function pool; index 0
 * is named as the async vector (see irq_set_name()).
 */
#define MLX5_IRQ_VEC_COMP_BASE 1

/* Min/max sharing thresholds handed to irq_pool_alloc(); each is scaled there
 * by MLX5_EQ_REFS_PER_IRQ.
 */
#define MLX5_EQ_SHARE_IRQ_MAX_COMP (8)
#define MLX5_EQ_SHARE_IRQ_MAX_CTRL (UINT_MAX)
#define MLX5_EQ_SHARE_IRQ_MIN_COMP (1)
#define MLX5_EQ_SHARE_IRQ_MIN_CTRL (4)
29 
30 struct mlx5_irq {
31 	struct atomic_notifier_head nh;
32 	cpumask_var_t mask;
33 	char name[MLX5_MAX_IRQ_FORMATTED_NAME];
34 	struct mlx5_irq_pool *pool;
35 	int refcount;
36 	struct msi_map map;
37 	u32 pool_index;
38 };
39 
/* Per-device set of IRQ pools. The two SF pools are only created when
 * irq_pools_init() finds SFs supported and enough vectors available;
 * otherwise they stay NULL.
 */
struct mlx5_irq_table {
	/* pool used by the PCI function itself; fallback for SFs */
	struct mlx5_irq_pool *pcif_pool;
	/* control IRQs of SFs */
	struct mlx5_irq_pool *sf_ctrl_pool;
	/* completion IRQs of SFs */
	struct mlx5_irq_pool *sf_comp_pool;
};
45 
46 static int mlx5_core_func_to_vport(const struct mlx5_core_dev *dev,
47 				   int func,
48 				   bool ec_vf_func)
49 {
50 	if (!ec_vf_func)
51 		return func;
52 	return mlx5_core_ec_vf_vport_base(dev) + func - 1;
53 }
54 
55 /**
56  * mlx5_get_default_msix_vec_count - Get the default number of MSI-X vectors
57  *                                   to be assigned to each VF.
58  * @dev: PF to work on
59  * @num_vfs: Number of enabled VFs
60  */
61 int mlx5_get_default_msix_vec_count(struct mlx5_core_dev *dev, int num_vfs)
62 {
63 	int num_vf_msix, min_msix, max_msix;
64 
65 	num_vf_msix = MLX5_CAP_GEN_MAX(dev, num_total_dynamic_vf_msix);
66 	if (!num_vf_msix)
67 		return 0;
68 
69 	min_msix = MLX5_CAP_GEN(dev, min_dynamic_vf_msix_table_size);
70 	max_msix = MLX5_CAP_GEN(dev, max_dynamic_vf_msix_table_size);
71 
72 	/* Limit maximum number of MSI-X vectors so the default configuration
73 	 * has some available in the pool. This will allow the user to increase
74 	 * the number of vectors in a VF without having to first size-down other
75 	 * VFs.
76 	 */
77 	return max(min(num_vf_msix / num_vfs, max_msix / 2), min_msix);
78 }
79 
80 /**
81  * mlx5_set_msix_vec_count - Set dynamically allocated MSI-X on the VF
82  * @dev: PF to work on
83  * @function_id: Internal PCI VF function IDd
84  * @msix_vec_count: Number of MSI-X vectors to set
85  */
86 int mlx5_set_msix_vec_count(struct mlx5_core_dev *dev, int function_id,
87 			    int msix_vec_count)
88 {
89 	int query_sz = MLX5_ST_SZ_BYTES(query_hca_cap_out);
90 	int num_vf_msix, min_msix, max_msix;
91 	void *query_cap, *hca_caps;
92 	bool ec_vf_function;
93 	int vport;
94 	int ret;
95 
96 	num_vf_msix = MLX5_CAP_GEN_MAX(dev, num_total_dynamic_vf_msix);
97 	if (!num_vf_msix)
98 		return 0;
99 
100 	if (!MLX5_CAP_GEN(dev, vport_group_manager) || !mlx5_core_is_pf(dev))
101 		return -EOPNOTSUPP;
102 
103 	min_msix = MLX5_CAP_GEN(dev, min_dynamic_vf_msix_table_size);
104 	max_msix = MLX5_CAP_GEN(dev, max_dynamic_vf_msix_table_size);
105 
106 	if (msix_vec_count < min_msix)
107 		return -EINVAL;
108 
109 	if (msix_vec_count > max_msix)
110 		return -EOVERFLOW;
111 
112 	query_cap = kvzalloc(query_sz, GFP_KERNEL);
113 	if (!query_cap)
114 		return -ENOMEM;
115 
116 	ec_vf_function = mlx5_core_ec_sriov_enabled(dev);
117 	vport = mlx5_core_func_to_vport(dev, function_id, ec_vf_function);
118 	ret = mlx5_vport_get_other_func_general_cap(dev, vport, query_cap);
119 	if (ret)
120 		goto out;
121 
122 	hca_caps = MLX5_ADDR_OF(query_hca_cap_out, query_cap, capability);
123 	MLX5_SET(cmd_hca_cap, hca_caps, dynamic_msix_table_size,
124 		 msix_vec_count);
125 
126 	ret = mlx5_vport_set_other_func_general_cap(dev, hca_caps, vport);
127 out:
128 	kvfree(query_cap);
129 	return ret;
130 }
131 
132 /* mlx5_system_free_irq - Free an IRQ
133  * @irq: IRQ to free
134  *
135  * Free the IRQ and other resources such as rmap from the system.
136  * BUT doesn't free or remove reference from mlx5.
137  * This function is very important for the shutdown flow, where we need to
138  * cleanup system resources but keep mlx5 objects alive,
139  * see mlx5_irq_table_free_irqs().
140  */
141 static void mlx5_system_free_irq(struct mlx5_irq *irq)
142 {
143 	struct mlx5_irq_pool *pool = irq->pool;
144 #ifdef CONFIG_RFS_ACCEL
145 	struct cpu_rmap *rmap;
146 #endif
147 
148 	/* free_irq requires that affinity_hint and rmap will be cleared before
149 	 * calling it. To satisfy this requirement, we call
150 	 * irq_cpu_rmap_remove() to remove the notifier
151 	 */
152 	irq_update_affinity_hint(irq->map.virq, NULL);
153 #ifdef CONFIG_RFS_ACCEL
154 	rmap = mlx5_eq_table_get_rmap(pool->dev);
155 	if (rmap)
156 		irq_cpu_rmap_remove(rmap, irq->map.virq);
157 #endif
158 
159 	free_irq(irq->map.virq, &irq->nh);
160 	if (irq->map.index && pci_msix_can_alloc_dyn(pool->dev->pdev))
161 		pci_msix_free_irq(pool->dev->pdev, irq->map);
162 }
163 
164 static void irq_release(struct mlx5_irq *irq)
165 {
166 	struct mlx5_irq_pool *pool = irq->pool;
167 
168 	xa_erase(&pool->irqs, irq->pool_index);
169 	mlx5_system_free_irq(irq);
170 	free_cpumask_var(irq->mask);
171 	kfree(irq);
172 }
173 
174 int mlx5_irq_put(struct mlx5_irq *irq)
175 {
176 	struct mlx5_irq_pool *pool = irq->pool;
177 	int ret = 0;
178 
179 	mutex_lock(&pool->lock);
180 	irq->refcount--;
181 	if (!irq->refcount) {
182 		irq_release(irq);
183 		ret = 1;
184 	}
185 	mutex_unlock(&pool->lock);
186 	return ret;
187 }
188 
189 int mlx5_irq_read_locked(struct mlx5_irq *irq)
190 {
191 	lockdep_assert_held(&irq->pool->lock);
192 	return irq->refcount;
193 }
194 
195 int mlx5_irq_get_locked(struct mlx5_irq *irq)
196 {
197 	lockdep_assert_held(&irq->pool->lock);
198 	if (WARN_ON_ONCE(!irq->refcount))
199 		return 0;
200 	irq->refcount++;
201 	return 1;
202 }
203 
204 static int irq_get(struct mlx5_irq *irq)
205 {
206 	int err;
207 
208 	mutex_lock(&irq->pool->lock);
209 	err = mlx5_irq_get_locked(irq);
210 	mutex_unlock(&irq->pool->lock);
211 	return err;
212 }
213 
214 static irqreturn_t irq_int_handler(int irq, void *nh)
215 {
216 	atomic_notifier_call_chain(nh, 0, NULL);
217 	return IRQ_HANDLED;
218 }
219 
220 static void irq_sf_set_name(struct mlx5_irq_pool *pool, char *name, int vecidx)
221 {
222 	snprintf(name, MLX5_MAX_IRQ_NAME, "%s%d", pool->name, vecidx);
223 }
224 
225 static void irq_set_name(struct mlx5_irq_pool *pool, char *name, int vecidx)
226 {
227 	if (!pool->xa_num_irqs.max) {
228 		/* in case we only have a single irq for the device */
229 		snprintf(name, MLX5_MAX_IRQ_NAME, "mlx5_combined%d", vecidx);
230 		return;
231 	}
232 
233 	if (!vecidx) {
234 		snprintf(name, MLX5_MAX_IRQ_NAME, "mlx5_async%d", vecidx);
235 		return;
236 	}
237 
238 	vecidx -= MLX5_IRQ_VEC_COMP_BASE;
239 	snprintf(name, MLX5_MAX_IRQ_NAME, "mlx5_comp%d", vecidx);
240 }
241 
242 struct mlx5_irq *mlx5_irq_alloc(struct mlx5_irq_pool *pool, int i,
243 				struct irq_affinity_desc *af_desc,
244 				struct cpu_rmap **rmap)
245 {
246 	struct mlx5_core_dev *dev = pool->dev;
247 	char name[MLX5_MAX_IRQ_NAME];
248 	struct mlx5_irq *irq;
249 	int err;
250 
251 	irq = kzalloc_obj(*irq);
252 	if (!irq || !zalloc_cpumask_var(&irq->mask, GFP_KERNEL)) {
253 		kfree(irq);
254 		return ERR_PTR(-ENOMEM);
255 	}
256 
257 	if (!i || !pci_msix_can_alloc_dyn(dev->pdev)) {
258 		/* The vector at index 0 is always statically allocated. If
259 		 * dynamic irq is not supported all vectors are statically
260 		 * allocated. In both cases just get the irq number and set
261 		 * the index.
262 		 */
263 		irq->map.virq = pci_irq_vector(dev->pdev, i);
264 		irq->map.index = i;
265 	} else {
266 		irq->map = pci_msix_alloc_irq_at(dev->pdev, MSI_ANY_INDEX, af_desc);
267 		if (!irq->map.virq) {
268 			err = irq->map.index;
269 			goto err_alloc_irq;
270 		}
271 	}
272 
273 	if (i && rmap && *rmap) {
274 #ifdef CONFIG_RFS_ACCEL
275 		err = irq_cpu_rmap_add(*rmap, irq->map.virq);
276 		if (err)
277 			goto err_irq_rmap;
278 #endif
279 	}
280 	if (!mlx5_irq_pool_is_sf_pool(pool))
281 		irq_set_name(pool, name, i);
282 	else
283 		irq_sf_set_name(pool, name, i);
284 	ATOMIC_INIT_NOTIFIER_HEAD(&irq->nh);
285 	snprintf(irq->name, MLX5_MAX_IRQ_FORMATTED_NAME,
286 		 MLX5_IRQ_NAME_FORMAT_STR, name, pci_name(dev->pdev));
287 	err = request_irq(irq->map.virq, irq_int_handler, 0, irq->name,
288 			  &irq->nh);
289 	if (err) {
290 		mlx5_core_err(dev, "Failed to request irq. err = %d\n", err);
291 		goto err_req_irq;
292 	}
293 
294 	if (af_desc) {
295 		cpumask_copy(irq->mask, &af_desc->mask);
296 		irq_set_affinity_and_hint(irq->map.virq, irq->mask);
297 	}
298 	irq->pool = pool;
299 	irq->refcount = 1;
300 	irq->pool_index = i;
301 	err = xa_err(xa_store(&pool->irqs, irq->pool_index, irq, GFP_KERNEL));
302 	if (err) {
303 		mlx5_core_err(dev, "Failed to alloc xa entry for irq(%u). err = %d\n",
304 			      irq->pool_index, err);
305 		goto err_xa;
306 	}
307 	return irq;
308 err_xa:
309 	if (af_desc)
310 		irq_update_affinity_hint(irq->map.virq, NULL);
311 	free_irq(irq->map.virq, &irq->nh);
312 err_req_irq:
313 #ifdef CONFIG_RFS_ACCEL
314 	if (i && rmap && *rmap)
315 		irq_cpu_rmap_remove(*rmap, irq->map.virq);
316 err_irq_rmap:
317 #endif
318 	if (i && pci_msix_can_alloc_dyn(dev->pdev))
319 		pci_msix_free_irq(dev->pdev, irq->map);
320 err_alloc_irq:
321 	free_cpumask_var(irq->mask);
322 	kfree(irq);
323 	return ERR_PTR(err);
324 }
325 
326 int mlx5_irq_attach_nb(struct mlx5_irq *irq, struct notifier_block *nb)
327 {
328 	int ret;
329 
330 	ret = irq_get(irq);
331 	if (!ret)
332 		/* Something very bad happens here, we are enabling EQ
333 		 * on non-existing IRQ.
334 		 */
335 		return -ENOENT;
336 	ret = atomic_notifier_chain_register(&irq->nh, nb);
337 	if (ret)
338 		mlx5_irq_put(irq);
339 	return ret;
340 }
341 
342 int mlx5_irq_detach_nb(struct mlx5_irq *irq, struct notifier_block *nb)
343 {
344 	int err = 0;
345 
346 	err = atomic_notifier_chain_unregister(&irq->nh, nb);
347 	mlx5_irq_put(irq);
348 	return err;
349 }
350 
351 struct cpumask *mlx5_irq_get_affinity_mask(struct mlx5_irq *irq)
352 {
353 	return irq->mask;
354 }
355 
356 int mlx5_irq_get_irq(const struct mlx5_irq *irq)
357 {
358 	return irq->map.virq;
359 }
360 
361 int mlx5_irq_get_index(struct mlx5_irq *irq)
362 {
363 	return irq->map.index;
364 }
365 
366 struct mlx5_irq_pool *mlx5_irq_get_pool(struct mlx5_irq *irq)
367 {
368 	return irq->pool;
369 }
370 
371 /* irq_pool API */
372 
373 /* requesting an irq from a given pool according to given index */
374 static struct mlx5_irq *
375 irq_pool_request_vector(struct mlx5_irq_pool *pool, int vecidx,
376 			struct irq_affinity_desc *af_desc,
377 			struct cpu_rmap **rmap)
378 {
379 	struct mlx5_irq *irq;
380 
381 	mutex_lock(&pool->lock);
382 	irq = xa_load(&pool->irqs, vecidx);
383 	if (irq) {
384 		mlx5_irq_get_locked(irq);
385 		goto unlock;
386 	}
387 	irq = mlx5_irq_alloc(pool, vecidx, af_desc, rmap);
388 unlock:
389 	mutex_unlock(&pool->lock);
390 	return irq;
391 }
392 
393 static struct mlx5_irq_pool *sf_ctrl_irq_pool_get(struct mlx5_irq_table *irq_table)
394 {
395 	return irq_table->sf_ctrl_pool;
396 }
397 
398 static struct mlx5_irq_pool *
399 sf_comp_irq_pool_get(struct mlx5_irq_table *irq_table)
400 {
401 	return irq_table->sf_comp_pool;
402 }
403 
404 struct mlx5_irq_pool *
405 mlx5_irq_table_get_comp_irq_pool(struct mlx5_core_dev *dev)
406 {
407 	struct mlx5_irq_table *irq_table = mlx5_irq_table_get(dev);
408 	struct mlx5_irq_pool *pool = NULL;
409 
410 	if (mlx5_core_is_sf(dev))
411 		pool = sf_comp_irq_pool_get(irq_table);
412 
413 	/* In some configs, there won't be a pool of SFs IRQs. Hence, returning
414 	 * the PF IRQs pool in case the SF pool doesn't exist.
415 	 */
416 	return pool ? pool : irq_table->pcif_pool;
417 }
418 
419 static struct mlx5_irq_pool *ctrl_irq_pool_get(struct mlx5_core_dev *dev)
420 {
421 	struct mlx5_irq_table *irq_table = mlx5_irq_table_get(dev);
422 	struct mlx5_irq_pool *pool = NULL;
423 
424 	if (mlx5_core_is_sf(dev))
425 		pool = sf_ctrl_irq_pool_get(irq_table);
426 
427 	/* In some configs, there won't be a pool of SFs IRQs. Hence, returning
428 	 * the PF IRQs pool in case the SF pool doesn't exist.
429 	 */
430 	return pool ? pool : irq_table->pcif_pool;
431 }
432 
433 static void _mlx5_irq_release(struct mlx5_irq *irq)
434 {
435 	synchronize_irq(irq->map.virq);
436 	mlx5_irq_put(irq);
437 }
438 
/**
 * mlx5_ctrl_irq_release - release a ctrl IRQ back to the system.
 * @dev: mlx5 device that is releasing the IRQ.
 * @ctrl_irq: ctrl IRQ to be released.
 *
 * Thin wrapper that forwards to mlx5_irq_affinity_irq_release().
 */
void mlx5_ctrl_irq_release(struct mlx5_core_dev *dev, struct mlx5_irq *ctrl_irq)
{
	struct mlx5_irq *irq = ctrl_irq;

	mlx5_irq_affinity_irq_release(dev, irq);
}
448 
449 /**
450  * mlx5_ctrl_irq_request - request a ctrl IRQ for mlx5 device.
451  * @dev: mlx5 device that requesting the IRQ.
452  *
453  * This function returns a pointer to IRQ, or ERR_PTR in case of error.
454  */
455 struct mlx5_irq *mlx5_ctrl_irq_request(struct mlx5_core_dev *dev)
456 {
457 	struct mlx5_irq_pool *pool = ctrl_irq_pool_get(dev);
458 	struct irq_affinity_desc *af_desc;
459 	struct mlx5_irq *irq;
460 
461 	af_desc = kvzalloc_obj(*af_desc);
462 	if (!af_desc)
463 		return ERR_PTR(-ENOMEM);
464 
465 	cpumask_copy(&af_desc->mask, cpu_online_mask);
466 	af_desc->is_managed = false;
467 	if (!mlx5_irq_pool_is_sf_pool(pool)) {
468 		/* In case we are allocating a control IRQ from a pci device's pool.
469 		 * This can happen also for a SF if the SFs pool is empty.
470 		 */
471 		if (!pool->xa_num_irqs.max) {
472 			cpumask_clear(&af_desc->mask);
473 			/* In case we only have a single IRQ for PF/VF */
474 			cpumask_set_cpu(cpumask_first(cpu_online_mask), &af_desc->mask);
475 		}
476 		/* Allocate the IRQ in index 0. The vector was already allocated */
477 		irq = irq_pool_request_vector(pool, 0, af_desc, NULL);
478 	} else {
479 		irq = mlx5_irq_affinity_request(dev, pool, af_desc);
480 	}
481 
482 	kvfree(af_desc);
483 
484 	return irq;
485 }
486 
487 /**
488  * mlx5_irq_request - request an IRQ for mlx5 PF/VF device.
489  * @dev: mlx5 device that requesting the IRQ.
490  * @vecidx: vector index of the IRQ. This argument is ignore if affinity is
491  * provided.
492  * @af_desc: affinity descriptor for this IRQ.
493  * @rmap: pointer to reverse map pointer for completion interrupts
494  *
495  * This function returns a pointer to IRQ, or ERR_PTR in case of error.
496  */
497 struct mlx5_irq *mlx5_irq_request(struct mlx5_core_dev *dev, u16 vecidx,
498 				  struct irq_affinity_desc *af_desc,
499 				  struct cpu_rmap **rmap)
500 {
501 	struct mlx5_irq_table *irq_table = mlx5_irq_table_get(dev);
502 	struct mlx5_irq_pool *pool;
503 	struct mlx5_irq *irq;
504 
505 	pool = irq_table->pcif_pool;
506 	irq = irq_pool_request_vector(pool, vecidx, af_desc, rmap);
507 	if (IS_ERR(irq))
508 		return irq;
509 	mlx5_core_dbg(dev, "irq %u mapped to cpu %*pbl, %u EQs on this irq\n",
510 		      irq->map.virq, cpumask_pr_args(&af_desc->mask),
511 		      irq->refcount / MLX5_EQ_REFS_PER_IRQ);
512 	return irq;
513 }
514 
/**
 * mlx5_irq_release_vector - release one IRQ back to the system.
 * @irq: the irq to release.
 *
 * Thin wrapper around _mlx5_irq_release().
 */
void mlx5_irq_release_vector(struct mlx5_irq *irq)
{
	struct mlx5_irq *victim = irq;

	_mlx5_irq_release(victim);
}
523 
524 /**
525  * mlx5_irq_request_vector - request one IRQ for mlx5 device.
526  * @dev: mlx5 device that is requesting the IRQ.
527  * @cpu: CPU to bind the IRQ to.
528  * @vecidx: vector index to request an IRQ for.
529  * @rmap: pointer to reverse map pointer for completion interrupts
530  *
531  * Each IRQ is bound to at most 1 CPU.
532  * This function is requests one IRQ, for the given @vecidx.
533  *
534  * This function returns a pointer to the irq on success, or an error pointer
535  * in case of an error.
536  */
537 struct mlx5_irq *mlx5_irq_request_vector(struct mlx5_core_dev *dev, u16 cpu,
538 					 u16 vecidx, struct cpu_rmap **rmap)
539 {
540 	struct mlx5_irq_table *table = mlx5_irq_table_get(dev);
541 	struct mlx5_irq_pool *pool = table->pcif_pool;
542 	int offset = MLX5_IRQ_VEC_COMP_BASE;
543 	struct irq_affinity_desc *af_desc;
544 	struct mlx5_irq *irq;
545 
546 	af_desc = kvzalloc_obj(*af_desc);
547 	if (!af_desc)
548 		return ERR_PTR(-ENOMEM);
549 
550 	if (!pool->xa_num_irqs.max)
551 		offset = 0;
552 
553 	af_desc->is_managed = false;
554 	cpumask_clear(&af_desc->mask);
555 	cpumask_set_cpu(cpu, &af_desc->mask);
556 
557 	irq = mlx5_irq_request(dev, vecidx + offset, af_desc, rmap);
558 
559 	kvfree(af_desc);
560 
561 	return irq;
562 }
563 
564 static struct mlx5_irq_pool *
565 irq_pool_alloc(struct mlx5_core_dev *dev, int start, int size, char *name,
566 	       u32 min_threshold, u32 max_threshold)
567 {
568 	struct mlx5_irq_pool *pool = kvzalloc_obj(*pool);
569 
570 	if (!pool)
571 		return ERR_PTR(-ENOMEM);
572 	pool->dev = dev;
573 	mutex_init(&pool->lock);
574 	xa_init_flags(&pool->irqs, XA_FLAGS_ALLOC);
575 	pool->xa_num_irqs.min = start;
576 	pool->xa_num_irqs.max = start + size - 1;
577 	if (name)
578 		snprintf(pool->name, MLX5_MAX_IRQ_NAME - MLX5_MAX_IRQ_IDX_CHARS,
579 			 "%s", name);
580 	pool->min_threshold = min_threshold * MLX5_EQ_REFS_PER_IRQ;
581 	pool->max_threshold = max_threshold * MLX5_EQ_REFS_PER_IRQ;
582 	mlx5_core_dbg(dev, "pool->name = %s, pool->size = %d, pool->start = %d",
583 		      name ? name : "mlx5_pcif_pool", size, start);
584 	return pool;
585 }
586 
587 static void irq_pool_free(struct mlx5_irq_pool *pool)
588 {
589 	struct mlx5_irq *irq;
590 	unsigned long index;
591 
592 	/* There are cases in which we are destroying the irq_table before
593 	 * freeing all the IRQs, fast teardown for example. Hence, free the irqs
594 	 * which might not have been freed.
595 	 */
596 	xa_for_each(&pool->irqs, index, irq)
597 		irq_release(irq);
598 	xa_destroy(&pool->irqs);
599 	mutex_destroy(&pool->lock);
600 	kfree(pool->irqs_per_cpu);
601 	kvfree(pool);
602 }
603 
604 static int irq_pools_init(struct mlx5_core_dev *dev, int sf_vec, int pcif_vec,
605 			  bool dynamic_vec)
606 {
607 	struct mlx5_irq_table *table = dev->priv.irq_table;
608 	int sf_vec_available = sf_vec;
609 	int num_sf_ctrl;
610 	int err;
611 
612 	/* init pcif_pool */
613 	table->pcif_pool = irq_pool_alloc(dev, 0, pcif_vec, NULL,
614 					  MLX5_EQ_SHARE_IRQ_MIN_COMP,
615 					  MLX5_EQ_SHARE_IRQ_MAX_COMP);
616 	if (IS_ERR(table->pcif_pool))
617 		return PTR_ERR(table->pcif_pool);
618 	if (!mlx5_sf_max_functions(dev))
619 		return 0;
620 	if (sf_vec < MLX5_IRQ_VEC_COMP_BASE_SF) {
621 		mlx5_core_dbg(dev, "Not enough IRQs for SFs. SF may run at lower performance\n");
622 		return 0;
623 	}
624 
625 	/* init sf_ctrl_pool */
626 	num_sf_ctrl = DIV_ROUND_UP(mlx5_sf_max_functions(dev),
627 				   MLX5_SFS_PER_CTRL_IRQ);
628 	num_sf_ctrl = min_t(int, MLX5_IRQ_CTRL_SF_MAX, num_sf_ctrl);
629 	if (!dynamic_vec && (num_sf_ctrl + 1) > sf_vec_available) {
630 		mlx5_core_dbg(dev,
631 			      "Not enough IRQs for SFs control and completion pool, required=%d avail=%d\n",
632 			      num_sf_ctrl + 1, sf_vec_available);
633 		return 0;
634 	}
635 
636 	table->sf_ctrl_pool = irq_pool_alloc(dev, pcif_vec, num_sf_ctrl,
637 					     "mlx5_sf_ctrl",
638 					     MLX5_EQ_SHARE_IRQ_MIN_CTRL,
639 					     MLX5_EQ_SHARE_IRQ_MAX_CTRL);
640 	if (IS_ERR(table->sf_ctrl_pool)) {
641 		err = PTR_ERR(table->sf_ctrl_pool);
642 		goto err_pf;
643 	}
644 	sf_vec_available -= num_sf_ctrl;
645 
646 	/* init sf_comp_pool, remaining vectors are for the SF completions */
647 	table->sf_comp_pool = irq_pool_alloc(dev, pcif_vec + num_sf_ctrl,
648 					     sf_vec_available, "mlx5_sf_comp",
649 					     MLX5_EQ_SHARE_IRQ_MIN_COMP,
650 					     MLX5_EQ_SHARE_IRQ_MAX_COMP);
651 	if (IS_ERR(table->sf_comp_pool)) {
652 		err = PTR_ERR(table->sf_comp_pool);
653 		goto err_sf_ctrl;
654 	}
655 
656 	table->sf_comp_pool->irqs_per_cpu = kcalloc(nr_cpu_ids, sizeof(u16), GFP_KERNEL);
657 	if (!table->sf_comp_pool->irqs_per_cpu) {
658 		err = -ENOMEM;
659 		goto err_irqs_per_cpu;
660 	}
661 
662 	return 0;
663 
664 err_irqs_per_cpu:
665 	irq_pool_free(table->sf_comp_pool);
666 err_sf_ctrl:
667 	irq_pool_free(table->sf_ctrl_pool);
668 err_pf:
669 	irq_pool_free(table->pcif_pool);
670 	return err;
671 }
672 
673 static void irq_pools_destroy(struct mlx5_irq_table *table)
674 {
675 	if (table->sf_ctrl_pool) {
676 		irq_pool_free(table->sf_comp_pool);
677 		irq_pool_free(table->sf_ctrl_pool);
678 	}
679 	irq_pool_free(table->pcif_pool);
680 }
681 
682 static void mlx5_irq_pool_free_irqs(struct mlx5_irq_pool *pool)
683 {
684 	struct mlx5_irq *irq;
685 	unsigned long index;
686 
687 	xa_for_each(&pool->irqs, index, irq)
688 		mlx5_system_free_irq(irq);
689 
690 }
691 
692 static void mlx5_irq_pools_free_irqs(struct mlx5_irq_table *table)
693 {
694 	if (table->sf_ctrl_pool) {
695 		mlx5_irq_pool_free_irqs(table->sf_comp_pool);
696 		mlx5_irq_pool_free_irqs(table->sf_ctrl_pool);
697 	}
698 	mlx5_irq_pool_free_irqs(table->pcif_pool);
699 }
700 
701 /* irq_table API */
702 
703 int mlx5_irq_table_init(struct mlx5_core_dev *dev)
704 {
705 	struct mlx5_irq_table *irq_table;
706 
707 	if (mlx5_core_is_sf(dev))
708 		return 0;
709 
710 	irq_table = kvzalloc_node(sizeof(*irq_table), GFP_KERNEL,
711 				  dev->priv.numa_node);
712 	if (!irq_table)
713 		return -ENOMEM;
714 
715 	dev->priv.irq_table = irq_table;
716 	return 0;
717 }
718 
719 void mlx5_irq_table_cleanup(struct mlx5_core_dev *dev)
720 {
721 	if (mlx5_core_is_sf(dev))
722 		return;
723 
724 	kvfree(dev->priv.irq_table);
725 }
726 
727 int mlx5_irq_table_get_num_comp(struct mlx5_irq_table *table)
728 {
729 	if (!table->pcif_pool->xa_num_irqs.max)
730 		return 1;
731 	return table->pcif_pool->xa_num_irqs.max - table->pcif_pool->xa_num_irqs.min;
732 }
733 
734 int mlx5_irq_table_create(struct mlx5_core_dev *dev)
735 {
736 	int num_eqs = mlx5_max_eq_cap_get(dev);
737 	bool dynamic_vec;
738 	int total_vec;
739 	int pcif_vec;
740 	int req_vec;
741 	int err;
742 	int n;
743 
744 	if (mlx5_core_is_sf(dev))
745 		return 0;
746 
747 	/* PCI PF vectors usage is limited by online cpus, device EQs and
748 	 * PCI MSI-X capability.
749 	 */
750 	pcif_vec = MLX5_CAP_GEN(dev, num_ports) * num_online_cpus() + 1;
751 	pcif_vec = min_t(int, pcif_vec, num_eqs);
752 	pcif_vec = min_t(int, pcif_vec, pci_msix_vec_count(dev->pdev));
753 
754 	total_vec = pcif_vec;
755 	if (mlx5_sf_max_functions(dev))
756 		total_vec += MLX5_MAX_MSIX_PER_SF * mlx5_sf_max_functions(dev);
757 	total_vec = min_t(int, total_vec, pci_msix_vec_count(dev->pdev));
758 
759 	req_vec = pci_msix_can_alloc_dyn(dev->pdev) ? 1 : total_vec;
760 	n = pci_alloc_irq_vectors(dev->pdev, 1, req_vec, PCI_IRQ_MSIX);
761 	if (n < 0)
762 		return n;
763 
764 	/* Further limit vectors of the pools based on platform for non dynamic case */
765 	dynamic_vec = pci_msix_can_alloc_dyn(dev->pdev);
766 	if (!dynamic_vec) {
767 		pcif_vec = min_t(int, n, pcif_vec);
768 		total_vec = min_t(int, n, total_vec);
769 	}
770 
771 	err = irq_pools_init(dev, total_vec - pcif_vec, pcif_vec, dynamic_vec);
772 	if (err)
773 		pci_free_irq_vectors(dev->pdev);
774 
775 	return err;
776 }
777 
778 void mlx5_irq_table_destroy(struct mlx5_core_dev *dev)
779 {
780 	struct mlx5_irq_table *table = dev->priv.irq_table;
781 
782 	if (mlx5_core_is_sf(dev))
783 		return;
784 
785 	/* There are cases where IRQs still will be in used when we reaching
786 	 * to here. Hence, making sure all the irqs are released.
787 	 */
788 	irq_pools_destroy(table);
789 	pci_free_irq_vectors(dev->pdev);
790 }
791 
792 void mlx5_irq_table_free_irqs(struct mlx5_core_dev *dev)
793 {
794 	struct mlx5_irq_table *table = dev->priv.irq_table;
795 
796 	if (mlx5_core_is_sf(dev))
797 		return;
798 
799 	mlx5_irq_pools_free_irqs(table);
800 	pci_free_irq_vectors(dev->pdev);
801 }
802 
803 int mlx5_irq_table_get_sfs_vec(struct mlx5_irq_table *table)
804 {
805 	if (table->sf_comp_pool)
806 		return min_t(int, num_online_cpus(),
807 			     table->sf_comp_pool->xa_num_irqs.max -
808 			     table->sf_comp_pool->xa_num_irqs.min + 1);
809 	else
810 		return mlx5_irq_table_get_num_comp(table);
811 }
812 
813 struct mlx5_irq_table *mlx5_irq_table_get(struct mlx5_core_dev *dev)
814 {
815 #ifdef CONFIG_MLX5_SF
816 	if (mlx5_core_is_sf(dev))
817 		return dev->priv.parent_mdev->priv.irq_table;
818 #endif
819 	return dev->priv.irq_table;
820 }
821