xref: /linux/drivers/net/ethernet/microsoft/mana/gdma_main.c (revision d755d45bc08a57a3b845b850f8760de922a499bf)
1 // SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
2 /* Copyright (c) 2021, Microsoft Corporation. */
3 
4 #include <linux/debugfs.h>
5 #include <linux/module.h>
6 #include <linux/pci.h>
7 #include <linux/sizes.h>
8 #include <linux/utsname.h>
9 #include <linux/version.h>
10 #include <linux/msi.h>
11 #include <linux/irqdomain.h>
12 #include <linux/export.h>
13 
14 #include <net/mana/mana.h>
15 #include <net/mana/hw_channel.h>
16 
17 struct dentry *mana_debugfs_root;
18 
19 struct mana_dev_recovery {
20 	struct list_head list;
21 	struct pci_dev *pdev;
22 	enum gdma_eqe_type type;
23 };
24 
25 static struct mana_dev_recovery_work {
26 	struct list_head dev_list;
27 	struct delayed_work work;
28 
29 	/* Lock for dev_list above */
30 	spinlock_t lock;
31 } mana_dev_recovery_work;
32 
33 static u32 mana_gd_r32(struct gdma_context *g, u64 offset)
34 {
35 	return readl(g->bar0_va + offset);
36 }
37 
38 static u64 mana_gd_r64(struct gdma_context *g, u64 offset)
39 {
40 	return readq(g->bar0_va + offset);
41 }
42 
43 static int mana_gd_init_pf_regs(struct pci_dev *pdev)
44 {
45 	struct gdma_context *gc = pci_get_drvdata(pdev);
46 	u64 remaining_barsize;
47 	u64 sriov_base_off;
48 	u64 sriov_shm_off;
49 
50 	gc->db_page_size = mana_gd_r32(gc, GDMA_PF_REG_DB_PAGE_SIZE) & 0xFFFF;
51 
52 	/* mana_gd_ring_doorbell() accesses offsets up to DOORBELL_OFFSET_EQ
53 	 * (0xFF8) + 8 bytes = 4KB within each doorbell page, so the page
54 	 * size must be at least SZ_4K.
55 	 */
56 	if (gc->db_page_size < SZ_4K) {
57 		dev_err(gc->dev,
58 			"Doorbell page size %llu too small (min %u)\n",
59 			gc->db_page_size, SZ_4K);
60 		return -EPROTO;
61 	}
62 
63 	gc->db_page_off = mana_gd_r64(gc, GDMA_PF_REG_DB_PAGE_OFF);
64 
65 	/* Validate doorbell offset is within BAR0 */
66 	if (gc->db_page_off >= gc->bar0_size) {
67 		dev_err(gc->dev,
68 			"Doorbell offset 0x%llx exceeds BAR0 size 0x%llx\n",
69 			gc->db_page_off, (u64)gc->bar0_size);
70 		return -EPROTO;
71 	}
72 
73 	gc->db_page_base = gc->bar0_va + gc->db_page_off;
74 	gc->phys_db_page_base = gc->bar0_pa + gc->db_page_off;
75 
76 	sriov_base_off = mana_gd_r64(gc, GDMA_SRIOV_REG_CFG_BASE_OFF);
77 	if (sriov_base_off >= gc->bar0_size ||
78 	    gc->bar0_size - sriov_base_off <
79 		GDMA_PF_REG_SHM_OFF + sizeof(u64) ||
80 	    !IS_ALIGNED(sriov_base_off, sizeof(u64))) {
81 		dev_err(gc->dev,
82 			"SRIOV base offset 0x%llx out of range or unaligned (BAR0 size 0x%llx)\n",
83 			sriov_base_off, (u64)gc->bar0_size);
84 		return -EPROTO;
85 	}
86 
87 	remaining_barsize = gc->bar0_size - sriov_base_off;
88 	sriov_shm_off = mana_gd_r64(gc, sriov_base_off + GDMA_PF_REG_SHM_OFF);
89 	if (sriov_shm_off >= remaining_barsize ||
90 	    remaining_barsize - sriov_shm_off < SMC_APERTURE_SIZE ||
91 	    !IS_ALIGNED(sriov_shm_off, sizeof(u32))) {
92 		dev_err(gc->dev,
93 			"SRIOV SHM offset 0x%llx out of range or unaligned (BAR0 size 0x%llx)\n",
94 			sriov_shm_off, (u64)gc->bar0_size);
95 		return -EPROTO;
96 	}
97 
98 	gc->shm_base = gc->bar0_va + sriov_base_off + sriov_shm_off;
99 
100 	return 0;
101 }
102 
103 static int mana_gd_init_vf_regs(struct pci_dev *pdev)
104 {
105 	struct gdma_context *gc = pci_get_drvdata(pdev);
106 	u64 shm_off;
107 
108 	gc->db_page_size = mana_gd_r32(gc, GDMA_REG_DB_PAGE_SIZE) & 0xFFFF;
109 
110 	/* mana_gd_ring_doorbell() accesses offsets up to DOORBELL_OFFSET_EQ
111 	 * (0xFF8) + 8 bytes = 4KB within each doorbell page, so the page
112 	 * size must be at least SZ_4K.
113 	 */
114 	if (gc->db_page_size < SZ_4K) {
115 		dev_err(gc->dev,
116 			"Doorbell page size %llu too small (min %u)\n",
117 			gc->db_page_size, SZ_4K);
118 		return -EPROTO;
119 	}
120 
121 	gc->db_page_off = mana_gd_r64(gc, GDMA_REG_DB_PAGE_OFFSET);
122 
123 	/* Validate doorbell offset is within BAR0 */
124 	if (gc->db_page_off >= gc->bar0_size) {
125 		dev_err(gc->dev,
126 			"Doorbell offset 0x%llx exceeds BAR0 size 0x%llx\n",
127 			gc->db_page_off, (u64)gc->bar0_size);
128 		return -EPROTO;
129 	}
130 
131 	gc->db_page_base = gc->bar0_va + gc->db_page_off;
132 	gc->phys_db_page_base = gc->bar0_pa + gc->db_page_off;
133 
134 	shm_off = mana_gd_r64(gc, GDMA_REG_SHM_OFFSET);
135 	if (shm_off >= gc->bar0_size ||
136 	    gc->bar0_size - shm_off < SMC_APERTURE_SIZE ||
137 	    !IS_ALIGNED(shm_off, sizeof(u32))) {
138 		dev_err(gc->dev,
139 			"SHM offset 0x%llx out of range or unaligned (BAR0 size 0x%llx)\n",
140 			shm_off, (u64)gc->bar0_size);
141 		return -EPROTO;
142 	}
143 
144 	gc->shm_base = gc->bar0_va + shm_off;
145 
146 	return 0;
147 }
148 
149 static int mana_gd_init_registers(struct pci_dev *pdev)
150 {
151 	struct gdma_context *gc = pci_get_drvdata(pdev);
152 
153 	if (gc->is_pf && !gc->is_pf2)
154 		return mana_gd_init_pf_regs(pdev);
155 	else
156 		return mana_gd_init_vf_regs(pdev);
157 }
158 
159 /* Suppress logging when we set timeout to zero */
160 bool mana_need_log(struct gdma_context *gc, int err)
161 {
162 	struct hw_channel_context *hwc;
163 
164 	if (err != -ETIMEDOUT)
165 		return true;
166 
167 	if (!gc)
168 		return true;
169 
170 	hwc = gc->hwc.driver_data;
171 	if (hwc && hwc->hwc_timeout == 0)
172 		return false;
173 
174 	return true;
175 }
176 
177 static int mana_gd_query_max_resources(struct pci_dev *pdev)
178 {
179 	struct gdma_context *gc = pci_get_drvdata(pdev);
180 	struct gdma_query_max_resources_resp resp = {};
181 	struct gdma_general_req req = {};
182 	unsigned int max_num_queues;
183 	u8 bm_hostmode;
184 	u16 num_ports;
185 	int err;
186 
187 	/* Reset msi_sharing so it is recomputed from current hardware
188 	 * state. On resume, num_online_cpus() or num_msix_usable may
189 	 * have changed, making dedicated MSI-X feasible where it was
190 	 * not before. Only reset on platforms that support dynamic
191 	 * MSI-X allocation; on non-dyn platforms msi_sharing is
192 	 * unconditionally true (set in mana_gd_setup_hwc_irqs).
193 	 */
194 	if (pci_msix_can_alloc_dyn(to_pci_dev(gc->dev)))
195 		gc->msi_sharing = false;
196 
197 	mana_gd_init_req_hdr(&req.hdr, GDMA_QUERY_MAX_RESOURCES,
198 			     sizeof(req), sizeof(resp));
199 
200 	err = mana_gd_send_request(gc, sizeof(req), &req, sizeof(resp), &resp);
201 	if (err || resp.hdr.status) {
202 		dev_err(gc->dev, "Failed to query resource info: %d, 0x%x\n",
203 			err, resp.hdr.status);
204 		return err ? err : -EPROTO;
205 	}
206 
207 	if (!pci_msix_can_alloc_dyn(pdev)) {
208 		if (gc->num_msix_usable > resp.max_msix)
209 			gc->num_msix_usable = resp.max_msix;
210 	} else {
211 		/* If dynamic allocation is enabled we have already allocated
212 		 * hwc msi
213 		 */
214 		gc->num_msix_usable = min(resp.max_msix, num_online_cpus() + 1);
215 	}
216 
217 	if (gc->num_msix_usable <= 1)
218 		return -ENOSPC;
219 
220 	gc->max_num_queues = num_online_cpus();
221 	if (gc->max_num_queues > MANA_MAX_NUM_QUEUES)
222 		gc->max_num_queues = MANA_MAX_NUM_QUEUES;
223 
224 	if (gc->max_num_queues > resp.max_eq)
225 		gc->max_num_queues = resp.max_eq;
226 
227 	if (gc->max_num_queues > resp.max_cq)
228 		gc->max_num_queues = resp.max_cq;
229 
230 	if (gc->max_num_queues > resp.max_sq)
231 		gc->max_num_queues = resp.max_sq;
232 
233 	if (gc->max_num_queues > resp.max_rq)
234 		gc->max_num_queues = resp.max_rq;
235 
236 	/* The Hardware Channel (HWC) used 1 MSI-X */
237 	if (gc->max_num_queues > gc->num_msix_usable - 1)
238 		gc->max_num_queues = gc->num_msix_usable - 1;
239 
240 	if (gc->max_num_queues == 0)
241 		return -ENOSPC;
242 
243 	debugfs_create_u32("num_msix_usable", 0400, gc->mana_pci_debugfs,
244 			   &gc->num_msix_usable);
245 	debugfs_create_u32("max_num_queues", 0400, gc->mana_pci_debugfs,
246 			   &gc->max_num_queues);
247 
248 	err = mana_gd_query_device_cfg(gc, MANA_MAJOR_VERSION,
249 				       MANA_MINOR_VERSION,
250 				       MANA_MICRO_VERSION,
251 				       &num_ports, &bm_hostmode);
252 	if (err)
253 		return err;
254 
255 	if (!num_ports) {
256 		dev_err(gc->dev, "Failed to detect any vPort\n");
257 		return -EINVAL;
258 	}
259 
260 	/* Cap to the same limit used by mana_probe() for port instantiation,
261 	 * so MSI-X and queue budgeting matches the actual port count.
262 	 */
263 	if (num_ports > MAX_PORTS_IN_MANA_DEV)
264 		num_ports = MAX_PORTS_IN_MANA_DEV;
265 
266 	/*
267 	 * Adjust the per-vPort max queue count to allow dedicated
268 	 * MSIx for each vPort. Prefer at least MANA_DEF_NUM_QUEUES,
269 	 * but the hardware max (gc->max_num_queues) takes precedence.
270 	 */
271 	max_num_queues = (gc->num_msix_usable - 1) / num_ports;
272 	max_num_queues = rounddown_pow_of_two(max(max_num_queues, 1U));
273 	if (max_num_queues < MANA_DEF_NUM_QUEUES)
274 		max_num_queues = MANA_DEF_NUM_QUEUES;
275 
276 	/*
277 	 * Use dedicated MSIx for EQs whenever possible, use MSIx sharing for
278 	 * Ethernet EQs when (max_num_queues * num_ports > num_msix_usable - 1).
279 	 */
280 	max_num_queues = min(gc->max_num_queues, max_num_queues);
281 	if (max_num_queues * num_ports > gc->num_msix_usable - 1)
282 		gc->msi_sharing = true;
283 
284 	/* If MSI is shared, use max allowed value */
285 	if (gc->msi_sharing)
286 		gc->max_num_queues_vport = min(gc->num_msix_usable - 1,
287 					       gc->max_num_queues);
288 	else
289 		gc->max_num_queues_vport = max_num_queues;
290 
291 	dev_info(gc->dev, "MSI sharing mode %u max queues %u\n",
292 		 gc->msi_sharing, gc->max_num_queues_vport);
293 
294 	return 0;
295 }
296 
297 static int mana_gd_query_hwc_timeout(struct pci_dev *pdev, u32 *timeout_val)
298 {
299 	struct gdma_context *gc = pci_get_drvdata(pdev);
300 	struct gdma_query_hwc_timeout_resp resp = {};
301 	struct gdma_query_hwc_timeout_req req = {};
302 	int err;
303 
304 	mana_gd_init_req_hdr(&req.hdr, GDMA_QUERY_HWC_TIMEOUT,
305 			     sizeof(req), sizeof(resp));
306 	req.timeout_ms = *timeout_val;
307 	err = mana_gd_send_request(gc, sizeof(req), &req, sizeof(resp), &resp);
308 	if (err || resp.hdr.status)
309 		return err ? err : -EPROTO;
310 
311 	*timeout_val = resp.timeout_ms;
312 
313 	return 0;
314 }
315 
316 static int mana_gd_detect_devices(struct pci_dev *pdev)
317 {
318 	struct gdma_context *gc = pci_get_drvdata(pdev);
319 	struct gdma_list_devices_resp resp = {};
320 	struct gdma_general_req req = {};
321 	struct gdma_dev_id dev;
322 	int found_dev = 0;
323 	u16 dev_type;
324 	int err;
325 	u32 i;
326 
327 	mana_gd_init_req_hdr(&req.hdr, GDMA_LIST_DEVICES, sizeof(req),
328 			     sizeof(resp));
329 
330 	err = mana_gd_send_request(gc, sizeof(req), &req, sizeof(resp), &resp);
331 	if (err || resp.hdr.status) {
332 		dev_err(gc->dev, "Failed to detect devices: %d, 0x%x\n", err,
333 			resp.hdr.status);
334 		return err ? err : -EPROTO;
335 	}
336 
337 	for (i = 0; i < GDMA_DEV_LIST_SIZE &&
338 	     found_dev < resp.num_of_devs; i++) {
339 		dev = resp.devs[i];
340 		dev_type = dev.type;
341 
342 		/* Skip empty devices */
343 		if (dev.as_uint32 == 0)
344 			continue;
345 
346 		found_dev++;
347 
348 		/* HWC is already detected in mana_hwc_create_channel(). */
349 		if (dev_type == GDMA_DEVICE_HWC)
350 			continue;
351 
352 		if (dev_type == GDMA_DEVICE_MANA) {
353 			gc->mana.gdma_context = gc;
354 			gc->mana.dev_id = dev;
355 		} else if (dev_type == GDMA_DEVICE_MANA_IB) {
356 			gc->mana_ib.dev_id = dev;
357 			gc->mana_ib.gdma_context = gc;
358 		}
359 	}
360 
361 	return gc->mana.dev_id.type == 0 ? -ENODEV : 0;
362 }
363 
364 int mana_gd_send_request(struct gdma_context *gc, u32 req_len, const void *req,
365 			 u32 resp_len, void *resp)
366 {
367 	struct hw_channel_context *hwc = gc->hwc.driver_data;
368 
369 	return mana_hwc_send_request(hwc, req_len, req, resp_len, resp);
370 }
371 EXPORT_SYMBOL_NS(mana_gd_send_request, "NET_MANA");
372 
373 int mana_gd_alloc_memory(struct gdma_context *gc, unsigned int length,
374 			 struct gdma_mem_info *gmi)
375 {
376 	dma_addr_t dma_handle;
377 	void *buf;
378 
379 	if (length < MANA_PAGE_SIZE || !is_power_of_2(length))
380 		return -EINVAL;
381 
382 	gmi->dev = gc->dev;
383 	buf = dma_alloc_coherent(gmi->dev, length, &dma_handle, GFP_KERNEL);
384 	if (!buf)
385 		return -ENOMEM;
386 
387 	gmi->dma_handle = dma_handle;
388 	gmi->virt_addr = buf;
389 	gmi->length = length;
390 
391 	return 0;
392 }
393 
394 void mana_gd_free_memory(struct gdma_mem_info *gmi)
395 {
396 	dma_free_coherent(gmi->dev, gmi->length, gmi->virt_addr,
397 			  gmi->dma_handle);
398 }
399 
400 static int mana_gd_create_hw_eq(struct gdma_context *gc,
401 				struct gdma_queue *queue)
402 {
403 	struct gdma_create_queue_resp resp = {};
404 	struct gdma_create_queue_req req = {};
405 	int err;
406 
407 	if (queue->type != GDMA_EQ)
408 		return -EINVAL;
409 
410 	mana_gd_init_req_hdr(&req.hdr, GDMA_CREATE_QUEUE,
411 			     sizeof(req), sizeof(resp));
412 
413 	req.hdr.dev_id = queue->gdma_dev->dev_id;
414 	req.type = queue->type;
415 	req.pdid = queue->gdma_dev->pdid;
416 	req.doolbell_id = queue->gdma_dev->doorbell;
417 	req.gdma_region = queue->mem_info.dma_region_handle;
418 	req.queue_size = queue->queue_size;
419 	req.log2_throttle_limit = queue->eq.log2_throttle_limit;
420 	req.eq_pci_msix_index = queue->eq.msix_index;
421 
422 	err = mana_gd_send_request(gc, sizeof(req), &req, sizeof(resp), &resp);
423 	if (err || resp.hdr.status) {
424 		dev_err(gc->dev, "Failed to create queue: %d, 0x%x\n", err,
425 			resp.hdr.status);
426 		return err ? err : -EPROTO;
427 	}
428 
429 	queue->id = resp.queue_index;
430 	queue->eq.disable_needed = true;
431 	queue->mem_info.dma_region_handle = GDMA_INVALID_DMA_REGION;
432 	return 0;
433 }
434 
435 static int mana_gd_disable_queue(struct gdma_queue *queue)
436 {
437 	struct gdma_context *gc = queue->gdma_dev->gdma_context;
438 	struct gdma_disable_queue_req req = {};
439 	struct gdma_general_resp resp = {};
440 	int err;
441 
442 	WARN_ON(queue->type != GDMA_EQ);
443 
444 	mana_gd_init_req_hdr(&req.hdr, GDMA_DISABLE_QUEUE,
445 			     sizeof(req), sizeof(resp));
446 
447 	req.hdr.dev_id = queue->gdma_dev->dev_id;
448 	req.type = queue->type;
449 	req.queue_index =  queue->id;
450 	req.alloc_res_id_on_creation = 1;
451 
452 	err = mana_gd_send_request(gc, sizeof(req), &req, sizeof(resp), &resp);
453 	if (err || resp.hdr.status) {
454 		if (mana_need_log(gc, err))
455 			dev_err(gc->dev, "Failed to disable queue: %d, 0x%x\n", err,
456 				resp.hdr.status);
457 		return err ? err : -EPROTO;
458 	}
459 
460 	return 0;
461 }
462 
463 #define DOORBELL_OFFSET_SQ	0x0
464 #define DOORBELL_OFFSET_RQ	0x400
465 #define DOORBELL_OFFSET_CQ	0x800
466 #define DOORBELL_OFFSET_EQ	0xFF8
467 
468 static void mana_gd_ring_doorbell(struct gdma_context *gc, u32 db_index,
469 				  enum gdma_queue_type q_type, u32 qid,
470 				  u32 tail_ptr, u8 num_req)
471 {
472 	void __iomem *addr = gc->db_page_base + gc->db_page_size * db_index;
473 	union gdma_doorbell_entry e = {};
474 
475 	switch (q_type) {
476 	case GDMA_EQ:
477 		e.eq.id = qid;
478 		e.eq.tail_ptr = tail_ptr;
479 		e.eq.arm = num_req;
480 
481 		addr += DOORBELL_OFFSET_EQ;
482 		break;
483 
484 	case GDMA_CQ:
485 		e.cq.id = qid;
486 		e.cq.tail_ptr = tail_ptr;
487 		e.cq.arm = num_req;
488 
489 		addr += DOORBELL_OFFSET_CQ;
490 		break;
491 
492 	case GDMA_RQ:
493 		e.rq.id = qid;
494 		e.rq.tail_ptr = tail_ptr;
495 		e.rq.wqe_cnt = num_req;
496 
497 		addr += DOORBELL_OFFSET_RQ;
498 		break;
499 
500 	case GDMA_SQ:
501 		e.sq.id = qid;
502 		e.sq.tail_ptr = tail_ptr;
503 
504 		addr += DOORBELL_OFFSET_SQ;
505 		break;
506 
507 	default:
508 		WARN_ON(1);
509 		return;
510 	}
511 
512 	/* Ensure all writes are done before ring doorbell */
513 	wmb();
514 
515 	writeq(e.as_uint64, addr);
516 }
517 
518 void mana_gd_wq_ring_doorbell(struct gdma_context *gc, struct gdma_queue *queue)
519 {
520 	/* Hardware Spec specifies that software client should set 0 for
521 	 * wqe_cnt for Receive Queues. This value is not used in Send Queues.
522 	 */
523 	mana_gd_ring_doorbell(gc, queue->gdma_dev->doorbell, queue->type,
524 			      queue->id, queue->head * GDMA_WQE_BU_SIZE, 0);
525 }
526 EXPORT_SYMBOL_NS(mana_gd_wq_ring_doorbell, "NET_MANA");
527 
528 void mana_gd_ring_cq(struct gdma_queue *cq, u8 arm_bit)
529 {
530 	struct gdma_context *gc = cq->gdma_dev->gdma_context;
531 
532 	u32 num_cqe = cq->queue_size / GDMA_CQE_SIZE;
533 
534 	u32 head = cq->head % (num_cqe << GDMA_CQE_OWNER_BITS);
535 
536 	mana_gd_ring_doorbell(gc, cq->gdma_dev->doorbell, cq->type, cq->id,
537 			      head, arm_bit);
538 }
539 EXPORT_SYMBOL_NS(mana_gd_ring_cq, "NET_MANA");
540 
541 #define MANA_SERVICE_PERIOD 10
542 
543 static void mana_serv_rescan(struct pci_dev *pdev)
544 {
545 	struct pci_bus *parent;
546 
547 	pci_lock_rescan_remove();
548 
549 	parent = pdev->bus;
550 	if (!parent) {
551 		dev_err(&pdev->dev, "MANA service: no parent bus\n");
552 		goto out;
553 	}
554 
555 	pci_stop_and_remove_bus_device(pdev);
556 	pci_rescan_bus(parent);
557 
558 out:
559 	pci_unlock_rescan_remove();
560 }
561 
562 static void mana_serv_fpga(struct pci_dev *pdev)
563 {
564 	struct pci_bus *bus, *parent;
565 
566 	pci_lock_rescan_remove();
567 
568 	bus = pdev->bus;
569 	if (!bus) {
570 		dev_err(&pdev->dev, "MANA service: no bus\n");
571 		goto out;
572 	}
573 
574 	parent = bus->parent;
575 	if (!parent) {
576 		dev_err(&pdev->dev, "MANA service: no parent bus\n");
577 		goto out;
578 	}
579 
580 	pci_stop_and_remove_bus_device(bus->self);
581 
582 	msleep(MANA_SERVICE_PERIOD * 1000);
583 
584 	pci_rescan_bus(parent);
585 
586 out:
587 	pci_unlock_rescan_remove();
588 }
589 
590 static void mana_serv_reset(struct pci_dev *pdev)
591 {
592 	struct gdma_context *gc = pci_get_drvdata(pdev);
593 	struct hw_channel_context *hwc;
594 	int ret;
595 
596 	if (!gc) {
597 		/* Perform PCI rescan on device if GC is not set up */
598 		dev_err(&pdev->dev, "MANA service: GC not setup, rescanning\n");
599 		mana_serv_rescan(pdev);
600 		return;
601 	}
602 
603 	hwc = gc->hwc.driver_data;
604 	if (!hwc) {
605 		dev_err(&pdev->dev, "MANA service: no HWC\n");
606 		goto out;
607 	}
608 
609 	/* HWC is not responding in this case, so don't wait */
610 	hwc->hwc_timeout = 0;
611 
612 	dev_info(&pdev->dev, "MANA reset cycle start\n");
613 
614 	mana_gd_suspend(pdev, PMSG_SUSPEND);
615 
616 	msleep(MANA_SERVICE_PERIOD * 1000);
617 
618 	ret = mana_gd_resume(pdev);
619 	if (ret == -ETIMEDOUT || ret == -EPROTO) {
620 		/* Perform PCI rescan on device if we failed on HWC */
621 		dev_err(&pdev->dev, "MANA service: resume failed, rescanning\n");
622 		mana_serv_rescan(pdev);
623 		return;
624 	}
625 
626 	if (ret)
627 		dev_info(&pdev->dev, "MANA reset cycle failed err %d\n", ret);
628 	else
629 		dev_info(&pdev->dev, "MANA reset cycle completed\n");
630 
631 out:
632 	clear_bit(GC_IN_SERVICE, &gc->flags);
633 }
634 
635 static void mana_do_service(enum gdma_eqe_type type, struct pci_dev *pdev)
636 {
637 	switch (type) {
638 	case GDMA_EQE_HWC_FPGA_RECONFIG:
639 		mana_serv_fpga(pdev);
640 		break;
641 
642 	case GDMA_EQE_HWC_RESET_REQUEST:
643 		mana_serv_reset(pdev);
644 		break;
645 
646 	default:
647 		dev_err(&pdev->dev, "MANA service: unknown type %d\n", type);
648 		break;
649 	}
650 }
651 
652 static void mana_recovery_delayed_func(struct work_struct *w)
653 {
654 	struct mana_dev_recovery_work *work;
655 	struct mana_dev_recovery *dev;
656 	unsigned long flags;
657 
658 	work = container_of(w, struct mana_dev_recovery_work, work.work);
659 
660 	spin_lock_irqsave(&work->lock, flags);
661 
662 	while (!list_empty(&work->dev_list)) {
663 		dev = list_first_entry(&work->dev_list,
664 				       struct mana_dev_recovery, list);
665 		list_del(&dev->list);
666 		spin_unlock_irqrestore(&work->lock, flags);
667 
668 		mana_do_service(dev->type, dev->pdev);
669 		pci_dev_put(dev->pdev);
670 		kfree(dev);
671 
672 		spin_lock_irqsave(&work->lock, flags);
673 	}
674 
675 	spin_unlock_irqrestore(&work->lock, flags);
676 }
677 
678 static void mana_serv_func(struct work_struct *w)
679 {
680 	struct mana_serv_work *mns_wk;
681 	struct pci_dev *pdev;
682 
683 	mns_wk = container_of(w, struct mana_serv_work, serv_work);
684 	pdev = mns_wk->pdev;
685 
686 	if (pdev)
687 		mana_do_service(mns_wk->type, pdev);
688 
689 	pci_dev_put(pdev);
690 	kfree(mns_wk);
691 	module_put(THIS_MODULE);
692 }
693 
694 int mana_schedule_serv_work(struct gdma_context *gc, enum gdma_eqe_type type)
695 {
696 	struct mana_serv_work *mns_wk;
697 
698 	if (test_and_set_bit(GC_IN_SERVICE, &gc->flags)) {
699 		dev_info(gc->dev, "Already in service\n");
700 		return -EBUSY;
701 	}
702 
703 	if (!try_module_get(THIS_MODULE)) {
704 		dev_info(gc->dev, "Module is unloading\n");
705 		clear_bit(GC_IN_SERVICE, &gc->flags);
706 		return -ENODEV;
707 	}
708 
709 	mns_wk = kzalloc(sizeof(*mns_wk), GFP_ATOMIC);
710 	if (!mns_wk) {
711 		module_put(THIS_MODULE);
712 		clear_bit(GC_IN_SERVICE, &gc->flags);
713 		return -ENOMEM;
714 	}
715 
716 	dev_info(gc->dev, "Start MANA service type:%d\n", type);
717 	mns_wk->pdev = to_pci_dev(gc->dev);
718 	mns_wk->type = type;
719 	pci_dev_get(mns_wk->pdev);
720 	INIT_WORK(&mns_wk->serv_work, mana_serv_func);
721 	schedule_work(&mns_wk->serv_work);
722 	return 0;
723 }
724 
725 static void mana_gd_process_eqe(struct gdma_queue *eq)
726 {
727 	u32 head = eq->head % (eq->queue_size / GDMA_EQE_SIZE);
728 	struct gdma_context *gc = eq->gdma_dev->gdma_context;
729 	struct gdma_eqe *eq_eqe_ptr = eq->queue_mem_ptr;
730 	union gdma_eqe_info eqe_info;
731 	enum gdma_eqe_type type;
732 	struct gdma_event event;
733 	struct gdma_queue *cq;
734 	struct gdma_eqe *eqe;
735 	u32 cq_id;
736 
737 	eqe = &eq_eqe_ptr[head];
738 	eqe_info.as_uint32 = eqe->eqe_info;
739 	type = eqe_info.type;
740 
741 	switch (type) {
742 	case GDMA_EQE_COMPLETION:
743 		cq_id = eqe->details[0] & 0xFFFFFF;
744 		if (WARN_ON_ONCE(cq_id >= gc->max_num_cqs))
745 			break;
746 
747 		cq = gc->cq_table[cq_id];
748 		if (WARN_ON_ONCE(!cq || cq->type != GDMA_CQ || cq->id != cq_id))
749 			break;
750 
751 		if (cq->cq.callback)
752 			cq->cq.callback(cq->cq.context, cq);
753 
754 		break;
755 
756 	case GDMA_EQE_TEST_EVENT:
757 		gc->test_event_eq_id = eq->id;
758 		complete(&gc->eq_test_event);
759 		break;
760 
761 	case GDMA_EQE_HWC_INIT_EQ_ID_DB:
762 	case GDMA_EQE_HWC_INIT_DATA:
763 	case GDMA_EQE_HWC_INIT_DONE:
764 	case GDMA_EQE_HWC_SOC_SERVICE:
765 	case GDMA_EQE_RNIC_QP_FATAL:
766 	case GDMA_EQE_HWC_SOC_RECONFIG_DATA:
767 		if (!eq->eq.callback)
768 			break;
769 
770 		event.type = type;
771 		memcpy(&event.details, &eqe->details, GDMA_EVENT_DATA_SIZE);
772 		eq->eq.callback(eq->eq.context, eq, &event);
773 		break;
774 
775 	case GDMA_EQE_HWC_FPGA_RECONFIG:
776 	case GDMA_EQE_HWC_RESET_REQUEST:
777 		dev_info(gc->dev, "Recv MANA service type:%d\n", type);
778 
779 		if (!test_and_set_bit(GC_PROBE_SUCCEEDED, &gc->flags)) {
780 			/*
781 			 * Device is in probe and we received a hardware reset
782 			 * event, the probe function will detect that the flag
783 			 * has changed and perform service procedure.
784 			 */
785 			dev_info(gc->dev,
786 				 "Service is to be processed in probe\n");
787 			break;
788 		}
789 		mana_schedule_serv_work(gc, type);
790 		break;
791 
792 	default:
793 		break;
794 	}
795 }
796 
797 static void mana_gd_process_eq_events(void *arg)
798 {
799 	u32 owner_bits, new_bits, old_bits;
800 	union gdma_eqe_info eqe_info;
801 	struct gdma_eqe *eq_eqe_ptr;
802 	struct gdma_queue *eq = arg;
803 	struct gdma_context *gc;
804 	struct gdma_eqe *eqe;
805 	u32 head, num_eqe;
806 	int i;
807 
808 	gc = eq->gdma_dev->gdma_context;
809 
810 	num_eqe = eq->queue_size / GDMA_EQE_SIZE;
811 	eq_eqe_ptr = eq->queue_mem_ptr;
812 
813 	/* Process up to 5 EQEs at a time, and update the HW head. */
814 	for (i = 0; i < 5; i++) {
815 		eqe = &eq_eqe_ptr[eq->head % num_eqe];
816 		eqe_info.as_uint32 = eqe->eqe_info;
817 		owner_bits = eqe_info.owner_bits;
818 
819 		old_bits = (eq->head / num_eqe - 1) & GDMA_EQE_OWNER_MASK;
820 		/* No more entries */
821 		if (owner_bits == old_bits) {
822 			/* return here without ringing the doorbell */
823 			if (i == 0)
824 				return;
825 			break;
826 		}
827 
828 		new_bits = (eq->head / num_eqe) & GDMA_EQE_OWNER_MASK;
829 		if (owner_bits != new_bits) {
830 			dev_err(gc->dev, "EQ %d: overflow detected\n", eq->id);
831 			break;
832 		}
833 
834 		/* Per GDMA spec, rmb is necessary after checking owner_bits, before
835 		 * reading eqe.
836 		 */
837 		rmb();
838 
839 		mana_gd_process_eqe(eq);
840 
841 		eq->head++;
842 	}
843 
844 	head = eq->head % (num_eqe << GDMA_EQE_OWNER_BITS);
845 
846 	mana_gd_ring_doorbell(gc, eq->gdma_dev->doorbell, eq->type, eq->id,
847 			      head, SET_ARM_BIT);
848 }
849 
850 static int mana_gd_register_irq(struct gdma_queue *queue,
851 				const struct gdma_queue_spec *spec)
852 {
853 	struct gdma_dev *gd = queue->gdma_dev;
854 	struct gdma_irq_context *gic;
855 	struct gdma_context *gc;
856 	unsigned int msi_index;
857 	unsigned long flags;
858 	struct device *dev;
859 	int err = 0;
860 
861 	gc = gd->gdma_context;
862 	dev = gc->dev;
863 	msi_index = spec->eq.msix_index;
864 
865 	if (msi_index >= gc->num_msix_usable) {
866 		err = -ENOSPC;
867 		dev_err(dev, "Register IRQ err:%d, msi:%u nMSI:%u",
868 			err, msi_index, gc->num_msix_usable);
869 
870 		return err;
871 	}
872 
873 	queue->eq.msix_index = msi_index;
874 	/* The caller acquired a GIC reference via mana_gd_get_gic().
875 	 * That refcount prevents mana_gd_put_gic() from erasing this
876 	 * irq_contexts entry concurrently.
877 	 */
878 	gic = xa_load(&gc->irq_contexts, msi_index);
879 	if (WARN_ON(!gic))
880 		return -EINVAL;
881 
882 	spin_lock_irqsave(&gic->lock, flags);
883 	list_add_rcu(&queue->entry, &gic->eq_list);
884 	spin_unlock_irqrestore(&gic->lock, flags);
885 
886 	return 0;
887 }
888 
889 static void mana_gd_deregister_irq(struct gdma_queue *queue)
890 {
891 	struct gdma_dev *gd = queue->gdma_dev;
892 	struct gdma_irq_context *gic;
893 	struct gdma_context *gc;
894 	unsigned int msix_index;
895 	unsigned long flags;
896 	struct gdma_queue *eq;
897 
898 	gc = gd->gdma_context;
899 
900 	/* At most num_online_cpus() + 1 interrupts are used. */
901 	msix_index = queue->eq.msix_index;
902 	if (WARN_ON(msix_index >= gc->num_msix_usable))
903 		return;
904 
905 	/* The caller releases the GIC reference via mana_gd_put_gic()
906 	 * after this function returns. The refcount guarantees this
907 	 * irq_contexts entry is still valid.
908 	 */
909 	gic = xa_load(&gc->irq_contexts, msix_index);
910 	if (WARN_ON(!gic))
911 		return;
912 
913 	spin_lock_irqsave(&gic->lock, flags);
914 	list_for_each_entry_rcu(eq, &gic->eq_list, entry) {
915 		if (queue == eq) {
916 			list_del_rcu(&eq->entry);
917 			break;
918 		}
919 	}
920 	spin_unlock_irqrestore(&gic->lock, flags);
921 
922 	synchronize_rcu();
923 }
924 
925 int mana_gd_test_eq(struct gdma_context *gc, struct gdma_queue *eq)
926 {
927 	struct gdma_generate_test_event_req req = {};
928 	struct gdma_general_resp resp = {};
929 	struct device *dev = gc->dev;
930 	int err;
931 
932 	mutex_lock(&gc->eq_test_event_mutex);
933 
934 	init_completion(&gc->eq_test_event);
935 	gc->test_event_eq_id = INVALID_QUEUE_ID;
936 
937 	mana_gd_init_req_hdr(&req.hdr, GDMA_GENERATE_TEST_EQE,
938 			     sizeof(req), sizeof(resp));
939 
940 	req.hdr.dev_id = eq->gdma_dev->dev_id;
941 	req.queue_index = eq->id;
942 
943 	err = mana_gd_send_request(gc, sizeof(req), &req, sizeof(resp), &resp);
944 	if (err) {
945 		if (mana_need_log(gc, err))
946 			dev_err(dev, "test_eq failed: %d\n", err);
947 		goto out;
948 	}
949 
950 	err = -EPROTO;
951 
952 	if (resp.hdr.status) {
953 		dev_err(dev, "test_eq failed: 0x%x\n", resp.hdr.status);
954 		goto out;
955 	}
956 
957 	if (!wait_for_completion_timeout(&gc->eq_test_event, 30 * HZ)) {
958 		dev_err(dev, "test_eq timed out on queue %d\n", eq->id);
959 		goto out;
960 	}
961 
962 	if (eq->id != gc->test_event_eq_id) {
963 		dev_err(dev, "test_eq got an event on wrong queue %d (%d)\n",
964 			gc->test_event_eq_id, eq->id);
965 		goto out;
966 	}
967 
968 	err = 0;
969 out:
970 	mutex_unlock(&gc->eq_test_event_mutex);
971 	return err;
972 }
973 
974 static void mana_gd_destroy_eq(struct gdma_context *gc, bool flush_evenets,
975 			       struct gdma_queue *queue)
976 {
977 	int err;
978 
979 	if (flush_evenets) {
980 		err = mana_gd_test_eq(gc, queue);
981 		if (err && mana_need_log(gc, err))
982 			dev_warn(gc->dev, "Failed to flush EQ: %d\n", err);
983 	}
984 
985 	mana_gd_deregister_irq(queue);
986 
987 	if (queue->eq.disable_needed)
988 		mana_gd_disable_queue(queue);
989 }
990 
991 static int mana_gd_create_eq(struct gdma_dev *gd,
992 			     const struct gdma_queue_spec *spec,
993 			     bool create_hwq, struct gdma_queue *queue)
994 {
995 	struct gdma_context *gc = gd->gdma_context;
996 	struct device *dev = gc->dev;
997 	u32 log2_num_entries;
998 	int err;
999 
1000 	queue->eq.msix_index = INVALID_PCI_MSIX_INDEX;
1001 	queue->id = INVALID_QUEUE_ID;
1002 
1003 	log2_num_entries = ilog2(queue->queue_size / GDMA_EQE_SIZE);
1004 
1005 	if (spec->eq.log2_throttle_limit > log2_num_entries) {
1006 		dev_err(dev, "EQ throttling limit (%lu) > maximum EQE (%u)\n",
1007 			spec->eq.log2_throttle_limit, log2_num_entries);
1008 		return -EINVAL;
1009 	}
1010 
1011 	err = mana_gd_register_irq(queue, spec);
1012 	if (err) {
1013 		dev_err(dev, "Failed to register irq: %d\n", err);
1014 		return err;
1015 	}
1016 
1017 	queue->eq.callback = spec->eq.callback;
1018 	queue->eq.context = spec->eq.context;
1019 	queue->head |= INITIALIZED_OWNER_BIT(log2_num_entries);
1020 	queue->eq.log2_throttle_limit = spec->eq.log2_throttle_limit ?: 1;
1021 
1022 	if (create_hwq) {
1023 		err = mana_gd_create_hw_eq(gc, queue);
1024 		if (err)
1025 			goto out;
1026 
1027 		err = mana_gd_test_eq(gc, queue);
1028 		if (err)
1029 			goto out;
1030 	}
1031 
1032 	return 0;
1033 out:
1034 	dev_err(dev, "Failed to create EQ: %d\n", err);
1035 	mana_gd_destroy_eq(gc, false, queue);
1036 	queue->eq.msix_index = INVALID_PCI_MSIX_INDEX;
1037 	return err;
1038 }
1039 
1040 static void mana_gd_create_cq(const struct gdma_queue_spec *spec,
1041 			      struct gdma_queue *queue)
1042 {
1043 	u32 log2_num_entries = ilog2(spec->queue_size / GDMA_CQE_SIZE);
1044 
1045 	queue->head |= INITIALIZED_OWNER_BIT(log2_num_entries);
1046 	queue->cq.parent = spec->cq.parent_eq;
1047 	queue->cq.context = spec->cq.context;
1048 	queue->cq.callback = spec->cq.callback;
1049 }
1050 
1051 static void mana_gd_destroy_cq(struct gdma_context *gc,
1052 			       struct gdma_queue *queue)
1053 {
1054 	u32 id = queue->id;
1055 
1056 	if (id >= gc->max_num_cqs)
1057 		return;
1058 
1059 	if (!gc->cq_table[id])
1060 		return;
1061 
1062 	gc->cq_table[id] = NULL;
1063 }
1064 
1065 int mana_gd_create_hwc_queue(struct gdma_dev *gd,
1066 			     const struct gdma_queue_spec *spec,
1067 			     struct gdma_queue **queue_ptr)
1068 {
1069 	struct gdma_context *gc = gd->gdma_context;
1070 	struct gdma_mem_info *gmi;
1071 	struct gdma_queue *queue;
1072 	int err;
1073 
1074 	queue = kzalloc_obj(*queue);
1075 	if (!queue)
1076 		return -ENOMEM;
1077 
1078 	gmi = &queue->mem_info;
1079 	err = mana_gd_alloc_memory(gc, spec->queue_size, gmi);
1080 	if (err) {
1081 		dev_err(gc->dev, "GDMA queue type: %d, size: %u, gdma memory allocation err: %d\n",
1082 			spec->type, spec->queue_size, err);
1083 		goto free_q;
1084 	}
1085 
1086 	queue->head = 0;
1087 	queue->tail = 0;
1088 	queue->queue_mem_ptr = gmi->virt_addr;
1089 	queue->queue_size = spec->queue_size;
1090 	queue->monitor_avl_buf = spec->monitor_avl_buf;
1091 	queue->type = spec->type;
1092 	queue->gdma_dev = gd;
1093 
1094 	if (spec->type == GDMA_EQ)
1095 		err = mana_gd_create_eq(gd, spec, false, queue);
1096 	else if (spec->type == GDMA_CQ)
1097 		mana_gd_create_cq(spec, queue);
1098 
1099 	if (err)
1100 		goto out;
1101 
1102 	*queue_ptr = queue;
1103 	return 0;
1104 out:
1105 	dev_err(gc->dev, "Failed to create queue type %d of size %u, err: %d\n",
1106 		spec->type, spec->queue_size, err);
1107 	mana_gd_free_memory(gmi);
1108 free_q:
1109 	kfree(queue);
1110 	return err;
1111 }
1112 
1113 int mana_gd_destroy_dma_region(struct gdma_context *gc, u64 dma_region_handle)
1114 {
1115 	struct gdma_destroy_dma_region_req req = {};
1116 	struct gdma_general_resp resp = {};
1117 	int err;
1118 
1119 	if (dma_region_handle == GDMA_INVALID_DMA_REGION)
1120 		return 0;
1121 
1122 	mana_gd_init_req_hdr(&req.hdr, GDMA_DESTROY_DMA_REGION, sizeof(req),
1123 			     sizeof(resp));
1124 	req.dma_region_handle = dma_region_handle;
1125 
1126 	err = mana_gd_send_request(gc, sizeof(req), &req, sizeof(resp), &resp);
1127 	if (err || resp.hdr.status) {
1128 		if (mana_need_log(gc, err))
1129 			dev_err(gc->dev, "Failed to destroy DMA region: %d, 0x%x\n",
1130 				err, resp.hdr.status);
1131 		return -EPROTO;
1132 	}
1133 
1134 	return 0;
1135 }
1136 EXPORT_SYMBOL_NS(mana_gd_destroy_dma_region, "NET_MANA");
1137 
1138 static int mana_gd_create_dma_region(struct gdma_dev *gd,
1139 				     struct gdma_mem_info *gmi)
1140 {
1141 	unsigned int num_page = gmi->length / MANA_PAGE_SIZE;
1142 	struct gdma_create_dma_region_req *req = NULL;
1143 	struct gdma_create_dma_region_resp resp = {};
1144 	struct gdma_context *gc = gd->gdma_context;
1145 	struct hw_channel_context *hwc;
1146 	u32 length = gmi->length;
1147 	size_t req_msg_size;
1148 	int err;
1149 	int i;
1150 
1151 	if (length < MANA_PAGE_SIZE || !is_power_of_2(length))
1152 		return -EINVAL;
1153 
1154 	if (!MANA_PAGE_ALIGNED(gmi->virt_addr))
1155 		return -EINVAL;
1156 
1157 	hwc = gc->hwc.driver_data;
1158 	req_msg_size = struct_size(req, page_addr_list, num_page);
1159 	if (req_msg_size > hwc->max_req_msg_size)
1160 		return -EINVAL;
1161 
1162 	req = kzalloc(req_msg_size, GFP_KERNEL);
1163 	if (!req)
1164 		return -ENOMEM;
1165 
1166 	mana_gd_init_req_hdr(&req->hdr, GDMA_CREATE_DMA_REGION,
1167 			     req_msg_size, sizeof(resp));
1168 	req->length = length;
1169 	req->offset_in_page = 0;
1170 	req->gdma_page_type = GDMA_PAGE_TYPE_4K;
1171 	req->page_count = num_page;
1172 	req->page_addr_list_len = num_page;
1173 
1174 	for (i = 0; i < num_page; i++)
1175 		req->page_addr_list[i] = gmi->dma_handle +  i * MANA_PAGE_SIZE;
1176 
1177 	err = mana_gd_send_request(gc, req_msg_size, req, sizeof(resp), &resp);
1178 	if (err)
1179 		goto out;
1180 
1181 	if (resp.hdr.status ||
1182 	    resp.dma_region_handle == GDMA_INVALID_DMA_REGION) {
1183 		dev_err(gc->dev, "Failed to create DMA region: 0x%x\n",
1184 			resp.hdr.status);
1185 		err = -EPROTO;
1186 		goto out;
1187 	}
1188 
1189 	gmi->dma_region_handle = resp.dma_region_handle;
1190 	dev_dbg(gc->dev, "Created DMA region handle 0x%llx\n",
1191 		gmi->dma_region_handle);
1192 out:
1193 	if (err)
1194 		dev_dbg(gc->dev,
1195 			"Failed to create DMA region of length: %u, page_type: %d, status: 0x%x, err: %d\n",
1196 			length, req->gdma_page_type, resp.hdr.status, err);
1197 	kfree(req);
1198 	return err;
1199 }
1200 
1201 int mana_gd_create_mana_eq(struct gdma_dev *gd,
1202 			   const struct gdma_queue_spec *spec,
1203 			   struct gdma_queue **queue_ptr)
1204 {
1205 	struct gdma_context *gc = gd->gdma_context;
1206 	struct gdma_mem_info *gmi;
1207 	struct gdma_queue *queue;
1208 	int err;
1209 
1210 	if (spec->type != GDMA_EQ)
1211 		return -EINVAL;
1212 
1213 	queue = kzalloc_obj(*queue);
1214 	if (!queue)
1215 		return -ENOMEM;
1216 
1217 	gmi = &queue->mem_info;
1218 	err = mana_gd_alloc_memory(gc, spec->queue_size, gmi);
1219 	if (err) {
1220 		dev_err(gc->dev, "GDMA queue type: %d, size: %u, gdma memory allocation err: %d\n",
1221 			spec->type, spec->queue_size, err);
1222 		goto free_q;
1223 	}
1224 
1225 	err = mana_gd_create_dma_region(gd, gmi);
1226 	if (err)
1227 		goto out;
1228 
1229 	queue->head = 0;
1230 	queue->tail = 0;
1231 	queue->queue_mem_ptr = gmi->virt_addr;
1232 	queue->queue_size = spec->queue_size;
1233 	queue->monitor_avl_buf = spec->monitor_avl_buf;
1234 	queue->type = spec->type;
1235 	queue->gdma_dev = gd;
1236 
1237 	err = mana_gd_create_eq(gd, spec, true, queue);
1238 	if (err)
1239 		goto out;
1240 
1241 	*queue_ptr = queue;
1242 	return 0;
1243 out:
1244 	dev_err(gc->dev, "Failed to create queue type %d of size: %u, err: %d\n",
1245 		spec->type, spec->queue_size, err);
1246 	mana_gd_free_memory(gmi);
1247 free_q:
1248 	kfree(queue);
1249 	return err;
1250 }
1251 EXPORT_SYMBOL_NS(mana_gd_create_mana_eq, "NET_MANA");
1252 
1253 int mana_gd_create_mana_wq_cq(struct gdma_dev *gd,
1254 			      const struct gdma_queue_spec *spec,
1255 			      struct gdma_queue **queue_ptr)
1256 {
1257 	struct gdma_context *gc = gd->gdma_context;
1258 	struct gdma_mem_info *gmi;
1259 	struct gdma_queue *queue;
1260 	int err;
1261 
1262 	if (spec->type != GDMA_CQ && spec->type != GDMA_SQ &&
1263 	    spec->type != GDMA_RQ)
1264 		return -EINVAL;
1265 
1266 	queue = kzalloc_obj(*queue);
1267 	if (!queue)
1268 		return -ENOMEM;
1269 
1270 	queue->id = INVALID_QUEUE_ID;
1271 
1272 	gmi = &queue->mem_info;
1273 	err = mana_gd_alloc_memory(gc, spec->queue_size, gmi);
1274 	if (err) {
1275 		dev_err(gc->dev, "GDMA queue type: %d, size: %u, memory allocation err: %d\n",
1276 			spec->type, spec->queue_size, err);
1277 		goto free_q;
1278 	}
1279 
1280 	err = mana_gd_create_dma_region(gd, gmi);
1281 	if (err)
1282 		goto out;
1283 
1284 	queue->head = 0;
1285 	queue->tail = 0;
1286 	queue->queue_mem_ptr = gmi->virt_addr;
1287 	queue->queue_size = spec->queue_size;
1288 	queue->monitor_avl_buf = spec->monitor_avl_buf;
1289 	queue->type = spec->type;
1290 	queue->gdma_dev = gd;
1291 
1292 	if (spec->type == GDMA_CQ)
1293 		mana_gd_create_cq(spec, queue);
1294 
1295 	*queue_ptr = queue;
1296 	return 0;
1297 out:
1298 	dev_err(gc->dev, "Failed to create queue type %d of size: %u, err: %d\n",
1299 		spec->type, spec->queue_size, err);
1300 	mana_gd_free_memory(gmi);
1301 free_q:
1302 	kfree(queue);
1303 	return err;
1304 }
1305 EXPORT_SYMBOL_NS(mana_gd_create_mana_wq_cq, "NET_MANA");
1306 
1307 void mana_gd_destroy_queue(struct gdma_context *gc, struct gdma_queue *queue)
1308 {
1309 	struct gdma_mem_info *gmi = &queue->mem_info;
1310 
1311 	switch (queue->type) {
1312 	case GDMA_EQ:
1313 		mana_gd_destroy_eq(gc, queue->eq.disable_needed, queue);
1314 		break;
1315 
1316 	case GDMA_CQ:
1317 		mana_gd_destroy_cq(gc, queue);
1318 		break;
1319 
1320 	case GDMA_RQ:
1321 		break;
1322 
1323 	case GDMA_SQ:
1324 		break;
1325 
1326 	default:
1327 		dev_err(gc->dev, "Can't destroy unknown queue: type=%d\n",
1328 			queue->type);
1329 		return;
1330 	}
1331 
1332 	mana_gd_destroy_dma_region(gc, gmi->dma_region_handle);
1333 	mana_gd_free_memory(gmi);
1334 	kfree(queue);
1335 }
1336 EXPORT_SYMBOL_NS(mana_gd_destroy_queue, "NET_MANA");
1337 
1338 int mana_gd_verify_vf_version(struct pci_dev *pdev)
1339 {
1340 	struct gdma_context *gc = pci_get_drvdata(pdev);
1341 	struct gdma_verify_ver_resp resp = {};
1342 	struct gdma_verify_ver_req req = {};
1343 	struct hw_channel_context *hwc;
1344 	int err;
1345 
1346 	hwc = gc->hwc.driver_data;
1347 	mana_gd_init_req_hdr(&req.hdr, GDMA_VERIFY_VF_DRIVER_VERSION,
1348 			     sizeof(req), sizeof(resp));
1349 
1350 	req.protocol_ver_min = GDMA_PROTOCOL_FIRST;
1351 	req.protocol_ver_max = GDMA_PROTOCOL_LAST;
1352 
1353 	req.gd_drv_cap_flags1 = GDMA_DRV_CAP_FLAGS1;
1354 	req.gd_drv_cap_flags2 = GDMA_DRV_CAP_FLAGS2;
1355 	req.gd_drv_cap_flags3 = GDMA_DRV_CAP_FLAGS3;
1356 	req.gd_drv_cap_flags4 = GDMA_DRV_CAP_FLAGS4;
1357 
1358 	req.drv_ver = 0;	/* Unused*/
1359 	req.os_type = 0x10;	/* Linux */
1360 	req.os_ver_major = LINUX_VERSION_MAJOR;
1361 	req.os_ver_minor = LINUX_VERSION_PATCHLEVEL;
1362 	req.os_ver_build = LINUX_VERSION_SUBLEVEL;
1363 	strscpy(req.os_ver_str1, utsname()->sysname, sizeof(req.os_ver_str1));
1364 	strscpy(req.os_ver_str2, utsname()->release, sizeof(req.os_ver_str2));
1365 	strscpy(req.os_ver_str3, utsname()->version, sizeof(req.os_ver_str3));
1366 
1367 	err = mana_gd_send_request(gc, sizeof(req), &req, sizeof(resp), &resp);
1368 	if (err || resp.hdr.status) {
1369 		dev_err(gc->dev, "VfVerifyVersionOutput: %d, status=0x%x\n",
1370 			err, resp.hdr.status);
1371 		return err ? err : -EPROTO;
1372 	}
1373 	gc->pf_cap_flags1 = resp.pf_cap_flags1;
1374 	gc->gdma_protocol_ver = resp.gdma_protocol_ver;
1375 
1376 	debugfs_create_x64("gdma_protocol_ver", 0400, gc->mana_pci_debugfs,
1377 			   &gc->gdma_protocol_ver);
1378 	debugfs_create_x64("pf_cap_flags1", 0400, gc->mana_pci_debugfs,
1379 			   &gc->pf_cap_flags1);
1380 
1381 	if (resp.pf_cap_flags1 & GDMA_DRV_CAP_FLAG_1_HWC_TIMEOUT_RECONFIG) {
1382 		err = mana_gd_query_hwc_timeout(pdev, &hwc->hwc_timeout);
1383 		if (err) {
1384 			dev_err(gc->dev, "Failed to set the hwc timeout %d\n", err);
1385 			return err;
1386 		}
1387 		dev_dbg(gc->dev, "set the hwc timeout to %u\n", hwc->hwc_timeout);
1388 	}
1389 	return 0;
1390 }
1391 
1392 int mana_gd_register_device(struct gdma_dev *gd)
1393 {
1394 	struct gdma_context *gc = gd->gdma_context;
1395 	struct gdma_register_device_resp resp = {};
1396 	struct gdma_general_req req = {};
1397 	int err;
1398 
1399 	gd->pdid = INVALID_PDID;
1400 	gd->doorbell = INVALID_DOORBELL;
1401 	gd->gpa_mkey = INVALID_MEM_KEY;
1402 
1403 	mana_gd_init_req_hdr(&req.hdr, GDMA_REGISTER_DEVICE, sizeof(req),
1404 			     sizeof(resp));
1405 
1406 	req.hdr.dev_id = gd->dev_id;
1407 
1408 	err = mana_gd_send_request(gc, sizeof(req), &req, sizeof(resp), &resp);
1409 	if (err || resp.hdr.status) {
1410 		dev_err(gc->dev, "gdma_register_device_resp failed: %d, 0x%x\n",
1411 			err, resp.hdr.status);
1412 		return err ? err : -EPROTO;
1413 	}
1414 
1415 	/* Validate that doorbell page for db_id is within the BAR0 region.
1416 	 * In mana_gd_ring_doorbell(), the address is calculated as:
1417 	 *   addr = db_page_base + db_page_size * db_id
1418 	 *        = (bar0_va + db_page_off) + (db_page_size * db_id)
1419 	 * So we need: db_page_off + db_page_size * (db_id + 1) <= bar0_size
1420 	 */
1421 	if (gc->db_page_off + gc->db_page_size * ((u64)resp.db_id + 1) > gc->bar0_size) {
1422 		dev_err(gc->dev, "Doorbell ID %u out of range\n", resp.db_id);
1423 		return -EPROTO;
1424 	}
1425 
1426 	gd->pdid = resp.pdid;
1427 	gd->gpa_mkey = resp.gpa_mkey;
1428 	gd->doorbell = resp.db_id;
1429 
1430 	return 0;
1431 }
1432 
1433 int mana_gd_deregister_device(struct gdma_dev *gd)
1434 {
1435 	struct gdma_context *gc = gd->gdma_context;
1436 	struct gdma_general_resp resp = {};
1437 	struct gdma_general_req req = {};
1438 	int err;
1439 
1440 	if (gd->pdid == INVALID_PDID)
1441 		return -EINVAL;
1442 
1443 	mana_gd_init_req_hdr(&req.hdr, GDMA_DEREGISTER_DEVICE, sizeof(req),
1444 			     sizeof(resp));
1445 
1446 	req.hdr.dev_id = gd->dev_id;
1447 
1448 	err = mana_gd_send_request(gc, sizeof(req), &req, sizeof(resp), &resp);
1449 	if (err || resp.hdr.status) {
1450 		if (mana_need_log(gc, err))
1451 			dev_err(gc->dev, "Failed to deregister device: %d, 0x%x\n",
1452 				err, resp.hdr.status);
1453 		if (!err)
1454 			err = -EPROTO;
1455 	}
1456 
1457 	gd->pdid = INVALID_PDID;
1458 	gd->doorbell = INVALID_DOORBELL;
1459 	gd->gpa_mkey = INVALID_MEM_KEY;
1460 
1461 	return err;
1462 }
1463 
1464 u32 mana_gd_wq_avail_space(struct gdma_queue *wq)
1465 {
1466 	u32 used_space = (wq->head - wq->tail) * GDMA_WQE_BU_SIZE;
1467 	u32 wq_size = wq->queue_size;
1468 
1469 	WARN_ON_ONCE(used_space > wq_size);
1470 
1471 	return wq_size - used_space;
1472 }
1473 
1474 u8 *mana_gd_get_wqe_ptr(const struct gdma_queue *wq, u32 wqe_offset)
1475 {
1476 	u32 offset = (wqe_offset * GDMA_WQE_BU_SIZE) & (wq->queue_size - 1);
1477 
1478 	WARN_ON_ONCE((offset + GDMA_WQE_BU_SIZE) > wq->queue_size);
1479 
1480 	return wq->queue_mem_ptr + offset;
1481 }
1482 
1483 static u32 mana_gd_write_client_oob(const struct gdma_wqe_request *wqe_req,
1484 				    enum gdma_queue_type q_type,
1485 				    u32 client_oob_size, u32 sgl_data_size,
1486 				    u8 *wqe_ptr)
1487 {
1488 	bool oob_in_sgl = !!(wqe_req->flags & GDMA_WR_OOB_IN_SGL);
1489 	bool pad_data = !!(wqe_req->flags & GDMA_WR_PAD_BY_SGE0);
1490 	struct gdma_wqe *header = (struct gdma_wqe *)wqe_ptr;
1491 	u8 *ptr;
1492 
1493 	memset(header, 0, sizeof(struct gdma_wqe));
1494 	header->num_sge = wqe_req->num_sge;
1495 	header->inline_oob_size_div4 = client_oob_size / sizeof(u32);
1496 
1497 	if (oob_in_sgl) {
1498 		WARN_ON_ONCE(wqe_req->num_sge < 2);
1499 
1500 		header->client_oob_in_sgl = 1;
1501 
1502 		if (pad_data)
1503 			header->last_vbytes = wqe_req->sgl[0].size;
1504 	}
1505 
1506 	if (q_type == GDMA_SQ)
1507 		header->client_data_unit = wqe_req->client_data_unit;
1508 
1509 	/* The size of gdma_wqe + client_oob_size must be less than or equal
1510 	 * to one Basic Unit (i.e. 32 bytes), so the pointer can't go beyond
1511 	 * the queue memory buffer boundary.
1512 	 */
1513 	ptr = wqe_ptr + sizeof(header);
1514 
1515 	if (wqe_req->inline_oob_data && wqe_req->inline_oob_size > 0) {
1516 		memcpy(ptr, wqe_req->inline_oob_data, wqe_req->inline_oob_size);
1517 
1518 		if (client_oob_size > wqe_req->inline_oob_size)
1519 			memset(ptr + wqe_req->inline_oob_size, 0,
1520 			       client_oob_size - wqe_req->inline_oob_size);
1521 	}
1522 
1523 	return sizeof(header) + client_oob_size;
1524 }
1525 
1526 static void mana_gd_write_sgl(struct gdma_queue *wq, u8 *wqe_ptr,
1527 			      const struct gdma_wqe_request *wqe_req)
1528 {
1529 	u32 sgl_size = sizeof(struct gdma_sge) * wqe_req->num_sge;
1530 	const u8 *address = (u8 *)wqe_req->sgl;
1531 	u8 *base_ptr, *end_ptr;
1532 	u32 size_to_end;
1533 
1534 	base_ptr = wq->queue_mem_ptr;
1535 	end_ptr = base_ptr + wq->queue_size;
1536 	size_to_end = (u32)(end_ptr - wqe_ptr);
1537 
1538 	if (size_to_end < sgl_size) {
1539 		memcpy(wqe_ptr, address, size_to_end);
1540 
1541 		wqe_ptr = base_ptr;
1542 		address += size_to_end;
1543 		sgl_size -= size_to_end;
1544 	}
1545 
1546 	memcpy(wqe_ptr, address, sgl_size);
1547 }
1548 
1549 int mana_gd_post_work_request(struct gdma_queue *wq,
1550 			      const struct gdma_wqe_request *wqe_req,
1551 			      struct gdma_posted_wqe_info *wqe_info)
1552 {
1553 	u32 client_oob_size = wqe_req->inline_oob_size;
1554 	u32 sgl_data_size;
1555 	u32 max_wqe_size;
1556 	u32 wqe_size;
1557 	u8 *wqe_ptr;
1558 
1559 	if (wqe_req->num_sge == 0)
1560 		return -EINVAL;
1561 
1562 	if (wq->type == GDMA_RQ) {
1563 		if (client_oob_size != 0)
1564 			return -EINVAL;
1565 
1566 		client_oob_size = INLINE_OOB_SMALL_SIZE;
1567 
1568 		max_wqe_size = GDMA_MAX_RQE_SIZE;
1569 	} else {
1570 		if (client_oob_size != INLINE_OOB_SMALL_SIZE &&
1571 		    client_oob_size != INLINE_OOB_LARGE_SIZE)
1572 			return -EINVAL;
1573 
1574 		max_wqe_size = GDMA_MAX_SQE_SIZE;
1575 	}
1576 
1577 	sgl_data_size = sizeof(struct gdma_sge) * wqe_req->num_sge;
1578 	wqe_size = ALIGN(sizeof(struct gdma_wqe) + client_oob_size +
1579 			 sgl_data_size, GDMA_WQE_BU_SIZE);
1580 	if (wqe_size > max_wqe_size)
1581 		return -EINVAL;
1582 
1583 	if (wq->monitor_avl_buf && wqe_size > mana_gd_wq_avail_space(wq))
1584 		return -ENOSPC;
1585 
1586 	if (wqe_info)
1587 		wqe_info->wqe_size_in_bu = wqe_size / GDMA_WQE_BU_SIZE;
1588 
1589 	wqe_ptr = mana_gd_get_wqe_ptr(wq, wq->head);
1590 	wqe_ptr += mana_gd_write_client_oob(wqe_req, wq->type, client_oob_size,
1591 					    sgl_data_size, wqe_ptr);
1592 	if (wqe_ptr >= (u8 *)wq->queue_mem_ptr + wq->queue_size)
1593 		wqe_ptr -= wq->queue_size;
1594 
1595 	mana_gd_write_sgl(wq, wqe_ptr, wqe_req);
1596 
1597 	wq->head += wqe_size / GDMA_WQE_BU_SIZE;
1598 
1599 	return 0;
1600 }
1601 EXPORT_SYMBOL_NS(mana_gd_post_work_request, "NET_MANA");
1602 
1603 int mana_gd_post_and_ring(struct gdma_queue *queue,
1604 			  const struct gdma_wqe_request *wqe_req,
1605 			  struct gdma_posted_wqe_info *wqe_info)
1606 {
1607 	struct gdma_context *gc = queue->gdma_dev->gdma_context;
1608 	int err;
1609 
1610 	err = mana_gd_post_work_request(queue, wqe_req, wqe_info);
1611 	if (err) {
1612 		dev_err(gc->dev, "Failed to post work req from queue type %d of size %u (err=%d)\n",
1613 			queue->type, queue->queue_size, err);
1614 		return err;
1615 	}
1616 
1617 	mana_gd_wq_ring_doorbell(gc, queue);
1618 
1619 	return 0;
1620 }
1621 
/* Read the CQE at the current head of @cq into @comp without advancing
 * the head.
 *
 * The entry's owner bits are compared against the values derived from how
 * many times the head has wrapped around the queue:
 *  - equal to the previous pass's value: nothing new, return 0;
 *  - not equal to the current pass's value: overflow, warn once and
 *    return -1;
 *  - otherwise the entry is copied into @comp and 1 is returned.
 */
static int mana_gd_read_cqe(struct gdma_queue *cq, struct gdma_comp *comp)
{
	unsigned int num_cqe = cq->queue_size / sizeof(struct gdma_cqe);
	struct gdma_cqe *cq_cqe = cq->queue_mem_ptr;
	u32 owner_bits, new_bits, old_bits;
	struct gdma_cqe *cqe;

	/* head counts entries without wrapping; reduce it to a slot index. */
	cqe = &cq_cqe[cq->head % num_cqe];
	owner_bits = cqe->cqe_info.owner_bits;

	/* Owner value that an entry from the previous pass would carry. */
	old_bits = (cq->head / num_cqe - 1) & GDMA_CQE_OWNER_MASK;
	/* Return 0 if no more entries. */
	if (owner_bits == old_bits)
		return 0;

	/* Owner value expected for an entry written in the current pass. */
	new_bits = (cq->head / num_cqe) & GDMA_CQE_OWNER_MASK;
	/* Return -1 if overflow detected. */
	if (WARN_ON_ONCE(owner_bits != new_bits))
		return -1;

	/* Per GDMA spec, rmb is necessary after checking owner_bits, before
	 * reading completion info
	 */
	rmb();

	comp->wq_num = cqe->cqe_info.wq_num;
	comp->is_sq = cqe->cqe_info.is_sq;
	memcpy(comp->cqe_data, cqe->cqe_data, GDMA_COMP_DATA_SIZE);

	return 1;
}
1653 
1654 int mana_gd_poll_cq(struct gdma_queue *cq, struct gdma_comp *comp, int num_cqe)
1655 {
1656 	int cqe_idx;
1657 	int ret;
1658 
1659 	for (cqe_idx = 0; cqe_idx < num_cqe; cqe_idx++) {
1660 		ret = mana_gd_read_cqe(cq, &comp[cqe_idx]);
1661 
1662 		if (ret < 0) {
1663 			cq->head -= cqe_idx;
1664 			return ret;
1665 		}
1666 
1667 		if (ret == 0)
1668 			break;
1669 
1670 		cq->head++;
1671 	}
1672 
1673 	return cqe_idx;
1674 }
1675 EXPORT_SYMBOL_NS(mana_gd_poll_cq, "NET_MANA");
1676 
1677 static irqreturn_t mana_gd_intr(int irq, void *arg)
1678 {
1679 	struct gdma_irq_context *gic = arg;
1680 	struct list_head *eq_list = &gic->eq_list;
1681 	struct gdma_queue *eq;
1682 
1683 	rcu_read_lock();
1684 	list_for_each_entry_rcu(eq, eq_list, entry) {
1685 		gic->handler(eq);
1686 	}
1687 	rcu_read_unlock();
1688 
1689 	return IRQ_HANDLED;
1690 }
1691 
1692 void mana_gd_put_gic(struct gdma_context *gc, bool use_msi_bitmap, int msi)
1693 {
1694 	struct pci_dev *dev = to_pci_dev(gc->dev);
1695 	struct gdma_irq_context *gic;
1696 	struct msi_map irq_map;
1697 	int irq;
1698 
1699 	mutex_lock(&gc->gic_mutex);
1700 
1701 	gic = xa_load(&gc->irq_contexts, msi);
1702 	if (WARN_ON(!gic)) {
1703 		mutex_unlock(&gc->gic_mutex);
1704 		return;
1705 	}
1706 
1707 	if (use_msi_bitmap)
1708 		gic->bitmap_refs--;
1709 
1710 	if (use_msi_bitmap && gic->bitmap_refs == 0)
1711 		clear_bit(msi, gc->msi_bitmap);
1712 
1713 	if (!refcount_dec_and_test(&gic->refcount))
1714 		goto out;
1715 
1716 	irq = gic->irq;
1717 
1718 	irq_update_affinity_hint(irq, NULL);
1719 	free_irq(irq, gic);
1720 
1721 	if (gic->dyn_msix) {
1722 		irq_map.virq = irq;
1723 		irq_map.index = msi;
1724 		pci_msix_free_irq(dev, irq_map);
1725 	}
1726 
1727 	xa_erase(&gc->irq_contexts, msi);
1728 	kfree(gic);
1729 
1730 out:
1731 	mutex_unlock(&gc->gic_mutex);
1732 }
1733 EXPORT_SYMBOL_NS(mana_gd_put_gic, "NET_MANA");
1734 
/*
 * Get a GIC (GDMA IRQ Context) on an MSI vector.
 *
 * An MSI can be shared between different EQs. This function supports
 * either picking a free MSI from a bitmap or using an MSI index chosen by
 * the caller. If a context already exists for the MSI, a reference is
 * taken on it; otherwise a new context is created and its IRQ requested.
 *
 * @use_msi_bitmap:
 * True if the MSI is assigned by this function from the free slots in the
 * bitmap; the chosen index is written to *msi_requested.
 * False if the MSI index is passed in through *msi_requested.
 *
 * @msi_requested:
 * MSI index, input or output as described above. It is also updated with
 * the index reported by pci_msix_alloc_irq_at() when a vector has to be
 * allocated dynamically.
 *
 * Returns the context on success or an ERR_PTR() on failure.
 */
struct gdma_irq_context *mana_gd_get_gic(struct gdma_context *gc,
					 bool use_msi_bitmap,
					 int *msi_requested)
{
	struct pci_dev *dev = to_pci_dev(gc->dev);
	struct gdma_irq_context *gic;
	struct msi_map irq_map = { };
	int irq;
	int msi;
	int err;

	mutex_lock(&gc->gic_mutex);

	if (use_msi_bitmap) {
		/* Pick the first MSI whose bit is not set in the bitmap. */
		msi = find_first_zero_bit(gc->msi_bitmap, gc->num_msix_usable);
		if (msi >= gc->num_msix_usable) {
			dev_err(gc->dev, "No free MSI vectors available\n");
			gic = ERR_PTR(-ENOSPC);
			goto out;
		}
		*msi_requested = msi;
	} else {
		msi = *msi_requested;
	}

	/* Reuse an existing context for this MSI if there is one. */
	gic = xa_load(&gc->irq_contexts, msi);
	if (gic) {
		refcount_inc(&gic->refcount);
		if (use_msi_bitmap) {
			gic->bitmap_refs++;
			set_bit(msi, gc->msi_bitmap);
		}
		goto out;
	}

	/* -EINVAL from pci_irq_vector() is taken to mean that no vector
	 * exists at this index yet, so one is allocated dynamically.
	 */
	irq = pci_irq_vector(dev, msi);
	if (irq == -EINVAL) {
		irq_map = pci_msix_alloc_irq_at(dev, msi, NULL);
		if (!irq_map.virq) {
			/* On failure the index field holds the error code. */
			err = irq_map.index;
			dev_err(gc->dev,
				"Failed to alloc irq_map msi %d err %d\n",
				msi, err);
			gic = ERR_PTR(err);
			goto out;
		}
		irq = irq_map.virq;
		msi = irq_map.index;
		*msi_requested = msi;
	}

	/* From here on, irq_map.virq being non-zero marks a dynamically
	 * allocated vector that must be released on every error path.
	 */
	gic = kzalloc(sizeof(*gic), GFP_KERNEL);
	if (!gic) {
		gic = ERR_PTR(-ENOMEM);
		if (irq_map.virq)
			pci_msix_free_irq(dev, irq_map);
		goto out;
	}

	gic->handler = mana_gd_process_eq_events;
	gic->msi = msi;
	gic->irq = irq;
	INIT_LIST_HEAD(&gic->eq_list);
	spin_lock_init(&gic->lock);

	/* MSI index 0 gets the HWC name, all others are numbered. */
	if (!gic->msi)
		snprintf(gic->name, MANA_IRQ_NAME_SZ, "mana_hwc@pci:%s",
			 pci_name(dev));
	else
		snprintf(gic->name, MANA_IRQ_NAME_SZ, "mana_msi%d@pci:%s",
			 gic->msi, pci_name(dev));

	err = request_irq(irq, mana_gd_intr, 0, gic->name, gic);
	if (err) {
		dev_err(gc->dev, "Failed to request irq %d %s\n",
			irq, gic->name);
		kfree(gic);
		gic = ERR_PTR(err);
		if (irq_map.virq)
			pci_msix_free_irq(dev, irq_map);
		goto out;
	}

	gic->dyn_msix = !!irq_map.virq;
	refcount_set(&gic->refcount, 1);
	gic->bitmap_refs = use_msi_bitmap ? 1 : 0;

	err = xa_err(xa_store(&gc->irq_contexts, msi, gic, GFP_KERNEL));
	if (err) {
		dev_err(gc->dev, "Failed to store irq context for msi %d: %d\n",
			msi, err);
		free_irq(irq, gic);
		kfree(gic);
		gic = ERR_PTR(err);
		if (irq_map.virq)
			pci_msix_free_irq(dev, irq_map);
		goto out;
	}

	/* Mark the MSI as taken only once the context is fully set up. */
	if (use_msi_bitmap)
		set_bit(msi, gc->msi_bitmap);

out:
	mutex_unlock(&gc->gic_mutex);
	return gic;
}
EXPORT_SYMBOL_NS(mana_gd_get_gic, "NET_MANA");
1851 
1852 int mana_gd_alloc_res_map(u32 res_avail, struct gdma_resource *r)
1853 {
1854 	r->map = bitmap_zalloc(res_avail, GFP_KERNEL);
1855 	if (!r->map)
1856 		return -ENOMEM;
1857 
1858 	r->size = res_avail;
1859 	spin_lock_init(&r->lock);
1860 
1861 	return 0;
1862 }
1863 
1864 void mana_gd_free_res_map(struct gdma_resource *r)
1865 {
1866 	bitmap_free(r->map);
1867 	r->map = NULL;
1868 	r->size = 0;
1869 }
1870 
1871 /*
1872  * Spread on CPUs with the following heuristics:
1873  *
1874  * 1. No more than one IRQ per CPU, if possible;
1875  * 2. NUMA locality is the second priority;
1876  * 3. Sibling dislocality is the last priority.
1877  *
1878  * Let's consider this topology:
1879  *
1880  * Node            0               1
1881  * Core        0       1       2       3
1882  * CPU       0   1   2   3   4   5   6   7
1883  *
1884  * The most performant IRQ distribution based on the above topology
1885  * and heuristics may look like this:
1886  *
1887  * IRQ     Nodes   Cores   CPUs
1888  * 0       1       0       0-1
1889  * 1       1       1       2-3
1890  * 2       1       0       0-1
1891  * 3       1       1       2-3
1892  * 4       2       2       4-5
1893  * 5       2       3       6-7
1894  * 6       2       2       4-5
1895  * 7       2       3       6-7
1896  *
1897  * The heuristics is implemented as follows.
1898  *
1899  * The outer for_each() loop resets the 'weight' to the actual number
1900  * of CPUs in the hop. Then inner for_each() loop decrements it by the
1901  * number of sibling groups (cores) while assigning first set of IRQs
1902  * to each group. IRQs 0 and 1 above are distributed this way.
1903  *
 * Now, because NUMA locality is more important, we should walk the
 * same set of siblings again and assign the 2nd set of IRQs (2 and 3);
 * this is implemented by the middle while() loop. We keep doing this
 * until the number of IRQs assigned on this hop equals the number of
 * CPUs in the hop (weight == 0). Then we switch to the next hop and
 * do the same thing.
1910  */
1911 
/* Set affinity and affinity hints for @len IRQs, walking the NUMA hop
 * masks outwards from @node.
 *
 * @irqs: array of IRQ numbers to set up.
 * @len: number of entries in @irqs.
 * @node: NUMA node at which the hop walk starts.
 * @skip_first_cpu: if true, the first sibling group visited gets no IRQ.
 *
 * Returns 0 on success or -ENOMEM if the scratch cpumask cannot be
 * allocated.
 */
static int irq_setup(unsigned int *irqs, unsigned int len, int node,
		     bool skip_first_cpu)
{
	const struct cpumask *next, *prev = cpu_none_mask;
	/* Scratch mask; the __free() annotation releases it on return. */
	cpumask_var_t cpus __free(free_cpumask_var);
	int cpu, weight;

	if (!alloc_cpumask_var(&cpus, GFP_KERNEL))
		return -ENOMEM;

	rcu_read_lock();
	for_each_numa_hop_mask(next, node) {
		/* Number of CPUs in this hop that were not in the last one. */
		weight = cpumask_weight_andnot(next, prev);
		while (weight > 0) {
			/* Start each pass from all CPUs new in this hop. */
			cpumask_andnot(cpus, next, prev);
			for_each_cpu(cpu, cpus) {
				/* Drop this CPU's sibling group from the
				 * scratch mask so the group is visited only
				 * once per pass.
				 */
				cpumask_andnot(cpus, cpus, topology_sibling_cpumask(cpu));
				--weight;

				/* Leave the first group without an IRQ. */
				if (unlikely(skip_first_cpu)) {
					skip_first_cpu = false;
					continue;
				}

				/* All IRQs have been assigned. */
				if (len-- == 0)
					goto done;

				irq_set_affinity_and_hint(*irqs++, topology_sibling_cpumask(cpu));
			}
		}
		prev = next;
	}
done:
	rcu_read_unlock();
	return 0;
}
1948 
1949 static int mana_gd_setup_dyn_irqs(struct pci_dev *pdev, int nvec)
1950 {
1951 	struct gdma_context *gc = pci_get_drvdata(pdev);
1952 	struct gdma_irq_context *gic;
1953 	bool skip_first_cpu = false;
1954 	int *irqs, err, i, msi;
1955 
1956 	irqs = kmalloc_objs(int, nvec);
1957 	if (!irqs)
1958 		return -ENOMEM;
1959 
1960 	/*
1961 	 * While processing the next pci irq vector, we start with index 1,
1962 	 * as IRQ vector at index 0 is already processed for HWC.
1963 	 * However, the population of irqs array starts with index 0, to be
1964 	 * further used in irq_setup()
1965 	 */
1966 	for (i = 1; i <= nvec; i++) {
1967 		msi = i;
1968 		gic = mana_gd_get_gic(gc, false, &msi);
1969 		if (IS_ERR(gic)) {
1970 			err = PTR_ERR(gic);
1971 			goto free_irq;
1972 		}
1973 
1974 		irqs[i - 1] = gic->irq;
1975 	}
1976 
1977 	/*
1978 	 * When calling irq_setup() for dynamically added IRQs, if number of
1979 	 * CPUs is more than or equal to allocated MSI-X, we need to skip the
1980 	 * first CPU sibling group since they are already affinitized to HWC IRQ
1981 	 */
1982 	cpus_read_lock();
1983 	if (gc->num_msix_usable <= num_online_cpus())
1984 		skip_first_cpu = true;
1985 
1986 	err = irq_setup(irqs, nvec, gc->numa_node, skip_first_cpu);
1987 	if (err) {
1988 		cpus_read_unlock();
1989 		goto free_irq;
1990 	}
1991 
1992 	cpus_read_unlock();
1993 	kfree(irqs);
1994 	return 0;
1995 
1996 free_irq:
1997 	for (i -= 1; i > 0; i--)
1998 		mana_gd_put_gic(gc, false, i);
1999 	kfree(irqs);
2000 	return err;
2001 }
2002 
2003 static int mana_gd_setup_irqs(struct pci_dev *pdev, int nvec)
2004 {
2005 	struct gdma_context *gc = pci_get_drvdata(pdev);
2006 	struct gdma_irq_context *gic;
2007 	int *irqs, *start_irqs;
2008 	unsigned int cpu;
2009 	int err, i, msi;
2010 
2011 	irqs = kmalloc_objs(int, nvec);
2012 	if (!irqs)
2013 		return -ENOMEM;
2014 
2015 	start_irqs = irqs;
2016 
2017 	for (i = 0; i < nvec; i++) {
2018 		msi = i;
2019 		gic = mana_gd_get_gic(gc, false, &msi);
2020 		if (IS_ERR(gic)) {
2021 			err = PTR_ERR(gic);
2022 			goto free_irq;
2023 		}
2024 
2025 		irqs[i] = gic->irq;
2026 	}
2027 
2028 	/* If number of IRQ is one extra than number of online CPUs,
2029 	 * then we need to assign IRQ0 (hwc irq) and IRQ1 to
2030 	 * same CPU.
2031 	 * Else we will use different CPUs for IRQ0 and IRQ1.
2032 	 * Also we are using cpumask_local_spread instead of
2033 	 * cpumask_first for the node, because the node can be
2034 	 * mem only.
2035 	 */
2036 	cpus_read_lock();
2037 	if (nvec > num_online_cpus()) {
2038 		cpu = cpumask_local_spread(0, gc->numa_node);
2039 		irq_set_affinity_and_hint(irqs[0], cpumask_of(cpu));
2040 		irqs++;
2041 		nvec -= 1;
2042 	}
2043 
2044 	err = irq_setup(irqs, nvec, gc->numa_node, false);
2045 	if (err) {
2046 		cpus_read_unlock();
2047 		goto free_irq;
2048 	}
2049 
2050 	cpus_read_unlock();
2051 	kfree(start_irqs);
2052 	return 0;
2053 
2054 free_irq:
2055 	for (i -= 1; i >= 0; i--)
2056 		mana_gd_put_gic(gc, false, i);
2057 
2058 	kfree(start_irqs);
2059 	return err;
2060 }
2061 
2062 static int mana_gd_setup_hwc_irqs(struct pci_dev *pdev)
2063 {
2064 	struct gdma_context *gc = pci_get_drvdata(pdev);
2065 	unsigned int max_irqs, min_irqs;
2066 	int nvec, err;
2067 
2068 	if (pci_msix_can_alloc_dyn(pdev)) {
2069 		max_irqs = 1;
2070 		min_irqs = 1;
2071 	} else {
2072 		/* Need 1 interrupt for HWC */
2073 		max_irqs = min(num_online_cpus(), MANA_MAX_NUM_QUEUES) + 1;
2074 		min_irqs = 2;
2075 		gc->msi_sharing = true;
2076 	}
2077 
2078 	nvec = pci_alloc_irq_vectors(pdev, min_irqs, max_irqs, PCI_IRQ_MSIX);
2079 	if (nvec < 0)
2080 		return nvec;
2081 
2082 	err = mana_gd_setup_irqs(pdev, nvec);
2083 	if (err) {
2084 		pci_free_irq_vectors(pdev);
2085 		return err;
2086 	}
2087 
2088 	gc->num_msix_usable = nvec;
2089 	gc->max_num_msix = nvec;
2090 
2091 	return 0;
2092 }
2093 
2094 static int mana_gd_setup_remaining_irqs(struct pci_dev *pdev)
2095 {
2096 	struct gdma_context *gc = pci_get_drvdata(pdev);
2097 	struct msi_map irq_map;
2098 	int max_irqs, i, err;
2099 
2100 	if (!pci_msix_can_alloc_dyn(pdev))
2101 		/* remain irqs are already allocated with HWC IRQ */
2102 		return 0;
2103 
2104 	/* allocate only remaining IRQs*/
2105 	max_irqs = gc->num_msix_usable - 1;
2106 
2107 	for (i = 1; i <= max_irqs; i++) {
2108 		irq_map = pci_msix_alloc_irq_at(pdev, i, NULL);
2109 		if (!irq_map.virq) {
2110 			err = irq_map.index;
2111 			/* caller will handle cleaning up all allocated
2112 			 * irqs, after HWC is destroyed
2113 			 */
2114 			return err;
2115 		}
2116 	}
2117 
2118 	err = mana_gd_setup_dyn_irqs(pdev, max_irqs);
2119 	if (err)
2120 		return err;
2121 
2122 	gc->max_num_msix = gc->max_num_msix + max_irqs;
2123 
2124 	return 0;
2125 }
2126 
2127 static void mana_gd_remove_irqs(struct pci_dev *pdev)
2128 {
2129 	struct gdma_context *gc = pci_get_drvdata(pdev);
2130 	int i;
2131 
2132 	if (gc->max_num_msix < 1)
2133 		return;
2134 
2135 	for (i = 0; i < gc->max_num_msix; i++) {
2136 		if (!xa_load(&gc->irq_contexts, i))
2137 			continue;
2138 
2139 		mana_gd_put_gic(gc, false, i);
2140 	}
2141 
2142 	WARN_ON(!xa_empty(&gc->irq_contexts));
2143 
2144 	pci_free_irq_vectors(pdev);
2145 
2146 	bitmap_free(gc->msi_bitmap);
2147 	gc->msi_bitmap = NULL;
2148 	gc->max_num_msix = 0;
2149 	gc->num_msix_usable = 0;
2150 }
2151 
2152 static int mana_gd_setup(struct pci_dev *pdev)
2153 {
2154 	struct gdma_context *gc = pci_get_drvdata(pdev);
2155 	int err;
2156 
2157 	gc->mana_pci_debugfs = debugfs_create_dir(pci_name(pdev),
2158 						  mana_debugfs_root);
2159 
2160 	err = mana_gd_init_registers(pdev);
2161 	if (err)
2162 		goto remove_debugfs;
2163 
2164 	mana_smc_init(&gc->shm_channel, gc->dev, gc->shm_base);
2165 
2166 	gc->service_wq = alloc_ordered_workqueue("gdma_service_wq", 0);
2167 	if (!gc->service_wq) {
2168 		err = -ENOMEM;
2169 		goto remove_debugfs;
2170 	}
2171 
2172 	err = mana_gd_setup_hwc_irqs(pdev);
2173 	if (err) {
2174 		dev_err(gc->dev, "Failed to setup IRQs for HWC creation: %d\n",
2175 			err);
2176 		goto free_workqueue;
2177 	}
2178 
2179 	err = mana_hwc_create_channel(gc);
2180 	if (err)
2181 		goto remove_irq;
2182 
2183 	err = mana_gd_verify_vf_version(pdev);
2184 	if (err)
2185 		goto destroy_hwc;
2186 
2187 	err = mana_gd_detect_devices(pdev);
2188 	if (err)
2189 		goto destroy_hwc;
2190 
2191 	err = mana_gd_query_max_resources(pdev);
2192 	if (err)
2193 		goto destroy_hwc;
2194 
2195 	err = mana_gd_setup_remaining_irqs(pdev);
2196 	if (err) {
2197 		dev_err(gc->dev, "Failed to setup remaining IRQs: %d", err);
2198 		goto destroy_hwc;
2199 	}
2200 
2201 	if (!gc->msi_sharing) {
2202 		gc->msi_bitmap = bitmap_zalloc(gc->num_msix_usable, GFP_KERNEL);
2203 		if (!gc->msi_bitmap) {
2204 			err = -ENOMEM;
2205 			goto destroy_hwc;
2206 		}
2207 		/* Set bit for HWC */
2208 		set_bit(0, gc->msi_bitmap);
2209 	}
2210 
2211 	dev_dbg(&pdev->dev, "mana gdma setup successful\n");
2212 	return 0;
2213 
2214 destroy_hwc:
2215 	mana_hwc_destroy_channel(gc);
2216 remove_irq:
2217 	mana_gd_remove_irqs(pdev);
2218 free_workqueue:
2219 	destroy_workqueue(gc->service_wq);
2220 	gc->service_wq = NULL;
2221 remove_debugfs:
2222 	debugfs_remove_recursive(gc->mana_pci_debugfs);
2223 	gc->mana_pci_debugfs = NULL;
2224 	dev_err(&pdev->dev, "%s failed (error %d)\n", __func__, err);
2225 	return err;
2226 }
2227 
2228 static void mana_gd_cleanup_device(struct pci_dev *pdev)
2229 {
2230 	struct gdma_context *gc = pci_get_drvdata(pdev);
2231 
2232 	mana_hwc_destroy_channel(gc);
2233 
2234 	mana_gd_remove_irqs(pdev);
2235 
2236 	if (gc->service_wq) {
2237 		destroy_workqueue(gc->service_wq);
2238 		gc->service_wq = NULL;
2239 	}
2240 
2241 	debugfs_remove_recursive(gc->mana_pci_debugfs);
2242 	gc->mana_pci_debugfs = NULL;
2243 
2244 	dev_dbg(&pdev->dev, "mana gdma cleanup successful\n");
2245 }
2246 
2247 static bool mana_is_pf(unsigned short dev_id)
2248 {
2249 	return dev_id == MANA_PF_DEVICE_ID || dev_id == MANA_PF2_DEVICE_ID;
2250 }
2251 
2252 static int mana_gd_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
2253 {
2254 	struct gdma_context *gc;
2255 	void __iomem *bar0_va;
2256 	int bar = 0;
2257 	int err;
2258 
2259 	/* Each port has 2 CQs, each CQ has at most 1 EQE at a time */
2260 	BUILD_BUG_ON(2 * MAX_PORTS_IN_MANA_DEV * GDMA_EQE_SIZE > EQ_SIZE);
2261 
2262 	err = pci_enable_device(pdev);
2263 	if (err) {
2264 		dev_err(&pdev->dev, "Failed to enable pci device (err=%d)\n", err);
2265 		return -ENXIO;
2266 	}
2267 
2268 	pci_set_master(pdev);
2269 
2270 	err = pci_request_regions(pdev, "mana");
2271 	if (err)
2272 		goto disable_dev;
2273 
2274 	err = dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(64));
2275 	if (err) {
2276 		dev_err(&pdev->dev, "DMA set mask failed: %d\n", err);
2277 		goto release_region;
2278 	}
2279 	dma_set_max_seg_size(&pdev->dev, UINT_MAX);
2280 
2281 	err = -ENOMEM;
2282 	gc = vzalloc(sizeof(*gc));
2283 	if (!gc)
2284 		goto release_region;
2285 
2286 	mutex_init(&gc->eq_test_event_mutex);
2287 	mutex_init(&gc->gic_mutex);
2288 	pci_set_drvdata(pdev, gc);
2289 	gc->bar0_pa = pci_resource_start(pdev, 0);
2290 	gc->bar0_size = pci_resource_len(pdev, 0);
2291 
2292 	bar0_va = pci_iomap(pdev, bar, 0);
2293 	if (!bar0_va)
2294 		goto free_gc;
2295 
2296 	gc->numa_node = dev_to_node(&pdev->dev);
2297 	gc->is_pf = mana_is_pf(pdev->device);
2298 	gc->is_pf2 = (pdev->device == MANA_PF2_DEVICE_ID);
2299 
2300 	gc->bar0_va = bar0_va;
2301 	gc->dev = &pdev->dev;
2302 	xa_init(&gc->irq_contexts);
2303 
2304 	err = mana_gd_setup(pdev);
2305 	if (err)
2306 		goto unmap_bar;
2307 
2308 	err = mana_probe(&gc->mana, false);
2309 	if (err)
2310 		goto cleanup_gd;
2311 
2312 	err = mana_rdma_probe(&gc->mana_ib);
2313 	if (err)
2314 		goto cleanup_mana;
2315 
2316 	/*
2317 	 * If a hardware reset event has occurred over HWC during probe,
2318 	 * rollback and perform hardware reset procedure.
2319 	 */
2320 	if (test_and_set_bit(GC_PROBE_SUCCEEDED, &gc->flags)) {
2321 		err = -EPROTO;
2322 		goto cleanup_mana_rdma;
2323 	}
2324 
2325 	return 0;
2326 
2327 cleanup_mana_rdma:
2328 	mana_rdma_remove(&gc->mana_ib);
2329 cleanup_mana:
2330 	mana_remove(&gc->mana, false);
2331 cleanup_gd:
2332 	mana_gd_cleanup_device(pdev);
2333 unmap_bar:
2334 	xa_destroy(&gc->irq_contexts);
2335 	pci_iounmap(pdev, bar0_va);
2336 free_gc:
2337 	pci_set_drvdata(pdev, NULL);
2338 	vfree(gc);
2339 release_region:
2340 	pci_release_regions(pdev);
2341 disable_dev:
2342 	pci_disable_device(pdev);
2343 	dev_err(&pdev->dev, "gdma probe failed: err = %d\n", err);
2344 
2345 	/*
2346 	 * Hardware could be in recovery mode and the HWC returns TIMEDOUT or
2347 	 * EPROTO from mana_gd_setup(), mana_probe() or mana_rdma_probe(), or
2348 	 * we received a hardware reset event over HWC interrupt. In this case,
2349 	 * perform the device recovery procedure after MANA_SERVICE_PERIOD
2350 	 * seconds.
2351 	 */
2352 	if (err == -ETIMEDOUT || err == -EPROTO) {
2353 		struct mana_dev_recovery *dev;
2354 		unsigned long flags;
2355 
2356 		dev_info(&pdev->dev, "Start MANA recovery mode\n");
2357 
2358 		dev = kzalloc_obj(*dev);
2359 		if (!dev)
2360 			return err;
2361 
2362 		dev->pdev = pci_dev_get(pdev);
2363 		dev->type = GDMA_EQE_HWC_RESET_REQUEST;
2364 
2365 		spin_lock_irqsave(&mana_dev_recovery_work.lock, flags);
2366 		list_add_tail(&dev->list, &mana_dev_recovery_work.dev_list);
2367 		spin_unlock_irqrestore(&mana_dev_recovery_work.lock, flags);
2368 
2369 		schedule_delayed_work(&mana_dev_recovery_work.work,
2370 				      secs_to_jiffies(MANA_SERVICE_PERIOD));
2371 	}
2372 
2373 	return err;
2374 }
2375 
2376 static void mana_gd_remove(struct pci_dev *pdev)
2377 {
2378 	struct gdma_context *gc = pci_get_drvdata(pdev);
2379 
2380 	mana_rdma_remove(&gc->mana_ib);
2381 	mana_remove(&gc->mana, false);
2382 
2383 	mana_gd_cleanup_device(pdev);
2384 
2385 	xa_destroy(&gc->irq_contexts);
2386 
2387 	pci_iounmap(pdev, gc->bar0_va);
2388 
2389 	vfree(gc);
2390 
2391 	pci_release_regions(pdev);
2392 	pci_disable_device(pdev);
2393 
2394 	dev_dbg(&pdev->dev, "mana gdma remove successful\n");
2395 }
2396 
2397 /* The 'state' parameter is not used. */
2398 int mana_gd_suspend(struct pci_dev *pdev, pm_message_t state)
2399 {
2400 	struct gdma_context *gc = pci_get_drvdata(pdev);
2401 
2402 	mana_rdma_remove(&gc->mana_ib);
2403 	mana_remove(&gc->mana, true);
2404 
2405 	mana_gd_cleanup_device(pdev);
2406 
2407 	return 0;
2408 }
2409 
2410 int mana_gd_resume(struct pci_dev *pdev)
2411 {
2412 	struct gdma_context *gc = pci_get_drvdata(pdev);
2413 	int err;
2414 
2415 	err = mana_gd_setup(pdev);
2416 	if (err)
2417 		return err;
2418 
2419 	err = mana_probe(&gc->mana, true);
2420 	if (err)
2421 		goto cleanup_gd;
2422 
2423 	err = mana_rdma_probe(&gc->mana_ib);
2424 	if (err)
2425 		mana_rdma_remove(&gc->mana_ib);
2426 
2427 	return err;
2428 
2429 cleanup_gd:
2430 	mana_gd_cleanup_device(pdev);
2431 	return err;
2432 }
2433 
2434 /* Quiesce the device for kexec. This is also called upon reboot/shutdown. */
2435 static void mana_gd_shutdown(struct pci_dev *pdev)
2436 {
2437 	struct gdma_context *gc = pci_get_drvdata(pdev);
2438 
2439 	dev_info(&pdev->dev, "Shutdown was called\n");
2440 
2441 	mana_rdma_remove(&gc->mana_ib);
2442 	mana_remove(&gc->mana, true);
2443 
2444 	mana_gd_cleanup_device(pdev);
2445 
2446 	pci_disable_device(pdev);
2447 }
2448 
2449 static const struct pci_device_id mana_id_table[] = {
2450 	{ PCI_DEVICE(PCI_VENDOR_ID_MICROSOFT, MANA_PF_DEVICE_ID) },
2451 	{ PCI_DEVICE(PCI_VENDOR_ID_MICROSOFT, MANA_PF2_DEVICE_ID) },
2452 	{ PCI_DEVICE(PCI_VENDOR_ID_MICROSOFT, MANA_VF_DEVICE_ID) },
2453 	{ }
2454 };
2455 
2456 static struct pci_driver mana_driver = {
2457 	.name		= "mana",
2458 	.id_table	= mana_id_table,
2459 	.probe		= mana_gd_probe,
2460 	.remove		= mana_gd_remove,
2461 	.suspend	= mana_gd_suspend,
2462 	.resume		= mana_gd_resume,
2463 	.shutdown	= mana_gd_shutdown,
2464 };
2465 
2466 static int __init mana_driver_init(void)
2467 {
2468 	int err;
2469 
2470 	INIT_LIST_HEAD(&mana_dev_recovery_work.dev_list);
2471 	spin_lock_init(&mana_dev_recovery_work.lock);
2472 	INIT_DELAYED_WORK(&mana_dev_recovery_work.work, mana_recovery_delayed_func);
2473 
2474 	mana_debugfs_root = debugfs_create_dir("mana", NULL);
2475 
2476 	err = pci_register_driver(&mana_driver);
2477 	if (err) {
2478 		debugfs_remove(mana_debugfs_root);
2479 		mana_debugfs_root = NULL;
2480 	}
2481 
2482 	return err;
2483 }
2484 
2485 static void __exit mana_driver_exit(void)
2486 {
2487 	struct mana_dev_recovery *dev;
2488 	unsigned long flags;
2489 
2490 	disable_delayed_work_sync(&mana_dev_recovery_work.work);
2491 
2492 	spin_lock_irqsave(&mana_dev_recovery_work.lock, flags);
2493 	while (!list_empty(&mana_dev_recovery_work.dev_list)) {
2494 		dev = list_first_entry(&mana_dev_recovery_work.dev_list,
2495 				       struct mana_dev_recovery, list);
2496 		list_del(&dev->list);
2497 		pci_dev_put(dev->pdev);
2498 		kfree(dev);
2499 	}
2500 	spin_unlock_irqrestore(&mana_dev_recovery_work.lock, flags);
2501 
2502 	pci_unregister_driver(&mana_driver);
2503 
2504 	debugfs_remove(mana_debugfs_root);
2505 
2506 	mana_debugfs_root = NULL;
2507 }
2508 
2509 module_init(mana_driver_init);
2510 module_exit(mana_driver_exit);
2511 
2512 MODULE_DEVICE_TABLE(pci, mana_id_table);
2513 
2514 MODULE_LICENSE("Dual BSD/GPL");
2515 MODULE_DESCRIPTION("Microsoft Azure Network Adapter driver");
2516