xref: /linux/drivers/vdpa/mlx5/net/mlx5_vnet.c (revision eb01fe7abbe2d0b38824d2a93fdb4cc3eaf2ccc1)
1 // SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
2 /* Copyright (c) 2020 Mellanox Technologies Ltd. */
3 
4 #include <linux/module.h>
5 #include <linux/vdpa.h>
6 #include <linux/vringh.h>
7 #include <uapi/linux/virtio_net.h>
8 #include <uapi/linux/virtio_ids.h>
9 #include <uapi/linux/vdpa.h>
10 #include <uapi/linux/vhost_types.h>
11 #include <linux/virtio_config.h>
12 #include <linux/auxiliary_bus.h>
13 #include <linux/mlx5/cq.h>
14 #include <linux/mlx5/qp.h>
15 #include <linux/mlx5/device.h>
16 #include <linux/mlx5/driver.h>
17 #include <linux/mlx5/vport.h>
18 #include <linux/mlx5/fs.h>
19 #include <linux/mlx5/mlx5_ifc_vdpa.h>
20 #include <linux/mlx5/mpfs.h>
21 #include "mlx5_vdpa.h"
22 #include "mlx5_vnet.h"
23 
/* Module metadata. */
MODULE_AUTHOR("Eli Cohen <eli@mellanox.com>");
MODULE_DESCRIPTION("Mellanox VDPA driver");
MODULE_LICENSE("Dual BSD/GPL");
27 
/* Set of virtio feature bits this file knows by name. print_features()
 * warns about any bit outside this mask.
 */
#define VALID_FEATURES_MASK                                                                        \
	(BIT_ULL(VIRTIO_NET_F_CSUM) | BIT_ULL(VIRTIO_NET_F_GUEST_CSUM) |                                   \
	 BIT_ULL(VIRTIO_NET_F_CTRL_GUEST_OFFLOADS) | BIT_ULL(VIRTIO_NET_F_MTU) | BIT_ULL(VIRTIO_NET_F_MAC) |   \
	 BIT_ULL(VIRTIO_NET_F_GUEST_TSO4) | BIT_ULL(VIRTIO_NET_F_GUEST_TSO6) |                             \
	 BIT_ULL(VIRTIO_NET_F_GUEST_ECN) | BIT_ULL(VIRTIO_NET_F_GUEST_UFO) | BIT_ULL(VIRTIO_NET_F_HOST_TSO4) | \
	 BIT_ULL(VIRTIO_NET_F_HOST_TSO6) | BIT_ULL(VIRTIO_NET_F_HOST_ECN) | BIT_ULL(VIRTIO_NET_F_HOST_UFO) |   \
	 BIT_ULL(VIRTIO_NET_F_MRG_RXBUF) | BIT_ULL(VIRTIO_NET_F_STATUS) | BIT_ULL(VIRTIO_NET_F_CTRL_VQ) |      \
	 BIT_ULL(VIRTIO_NET_F_CTRL_RX) | BIT_ULL(VIRTIO_NET_F_CTRL_VLAN) |                                 \
	 BIT_ULL(VIRTIO_NET_F_CTRL_RX_EXTRA) | BIT_ULL(VIRTIO_NET_F_GUEST_ANNOUNCE) |                      \
	 BIT_ULL(VIRTIO_NET_F_MQ) | BIT_ULL(VIRTIO_NET_F_CTRL_MAC_ADDR) | BIT_ULL(VIRTIO_NET_F_HASH_REPORT) |  \
	 BIT_ULL(VIRTIO_NET_F_RSS) | BIT_ULL(VIRTIO_NET_F_RSC_EXT) | BIT_ULL(VIRTIO_NET_F_STANDBY) |           \
	 BIT_ULL(VIRTIO_NET_F_SPEED_DUPLEX) | BIT_ULL(VIRTIO_F_NOTIFY_ON_EMPTY) |                          \
	 BIT_ULL(VIRTIO_F_ANY_LAYOUT) | BIT_ULL(VIRTIO_F_VERSION_1) | BIT_ULL(VIRTIO_F_ACCESS_PLATFORM) |      \
	 BIT_ULL(VIRTIO_F_RING_PACKED) | BIT_ULL(VIRTIO_F_ORDER_PLATFORM) | BIT_ULL(VIRTIO_F_SR_IOV))
42 
/* Set of virtio device status bits this file knows by name. print_status()
 * warns about any bit outside this mask.
 */
#define VALID_STATUS_MASK                                                                          \
	(VIRTIO_CONFIG_S_ACKNOWLEDGE | VIRTIO_CONFIG_S_DRIVER | VIRTIO_CONFIG_S_DRIVER_OK |        \
	 VIRTIO_CONFIG_S_FEATURES_OK | VIRTIO_CONFIG_S_NEEDS_RESET | VIRTIO_CONFIG_S_FAILED)
46 
/* Evaluates to 1 when bit @_feature is set in (_mvdev)->actual_features, 0 otherwise. */
#define MLX5_FEATURE(_mvdev, _feature) (!!((_mvdev)->actual_features & BIT_ULL(_feature)))
48 
/* NOTE(review): not referenced in this part of the file; presumably a
 * sentinel meaning "no VLAN tag" (0x1000 is above the 12-bit VID range) -
 * confirm against its users.
 */
#define MLX5V_UNTAGGED 0x1000
50 
/* Backing storage of a completion queue; filled in by cq_frag_buf_alloc(). */
struct mlx5_vdpa_cq_buf {
	struct mlx5_frag_buf_ctrl fbc;	/* set up by mlx5_init_fbc(); used to locate CQEs */
	struct mlx5_frag_buf frag_buf;	/* fragmented buffer holding the CQEs */
	int cqe_size;			/* set to MLX5_VDPA_CQE_SIZE */
	int nent;			/* number of CQEs the buffer was sized for */
};
57 
/* Completion queue used for the device-to-driver notification channel. */
struct mlx5_vdpa_cq {
	struct mlx5_core_cq mcq;	/* mlx5 core CQ object */
	struct mlx5_vdpa_cq_buf buf;	/* CQE storage */
	struct mlx5_db db;		/* doorbell record: db[0] = consumer index, db[1] = arm */
	int cqe;			/* number of entries; set to num_ent by cq_create() */
};
64 
/* One of the three umem objects handed to the device for a virtqueue. */
struct mlx5_vdpa_umem {
	struct mlx5_frag_buf_ctrl fbc;	/* NOTE(review): not touched in the visible code */
	struct mlx5_frag_buf frag_buf;	/* memory backing the umem */
	int size;			/* bytes; computed by set_umem_size() */
	u32 id;				/* umem_id returned by the CREATE_UMEM command */
};
71 
/* One end of the RC connection used for virtqueue notifications. */
struct mlx5_vdpa_qp {
	struct mlx5_core_qp mqp;	/* core QP; qpn and uid are filled in by qp_create() */
	struct mlx5_frag_buf frag_buf;	/* receive queue buffer (driver-side QP only) */
	struct mlx5_db db;		/* doorbell record (driver-side QP only) */
	u16 head;			/* running count of posted receives, written to db.db[0] */
	bool fw;			/* true for the firmware end of the connection */
};
79 
/* Snapshot of virtqueue parameters. The fields mirror those of
 * struct mlx5_vdpa_virtqueue; the code that saves and restores them is not
 * in this part of the file.
 */
struct mlx5_vq_restore_info {
	u32 num_ent;
	u64 desc_addr;
	u64 device_addr;
	u64 driver_addr;
	u16 avail_index;
	u16 used_index;
	struct msi_map map;
	bool ready;
	bool restore;
};
91 
/* Per-virtqueue driver state. */
struct mlx5_vdpa_virtqueue {
	bool ready;
	u64 desc_addr;		/* programmed as desc_addr of the device queue object */
	u64 device_addr;	/* programmed as used_addr */
	u64 driver_addr;	/* programmed as available_addr */
	u32 num_ent;		/* queue size; also sizes the CQ, RQ and umems */

	/* Resources for implementing the notification channel from the device
	 * to the driver. fwqp is the firmware end of an RC connection; the
	 * other end is vqqp used by the driver. cq is where completions are
	 * reported.
	 */
	struct mlx5_vdpa_cq cq;
	struct mlx5_vdpa_qp fwqp;
	struct mlx5_vdpa_qp vqqp;

	/* umem resources are required for the virtqueue operation. Their use
	 * is internal and they must be provided by the driver.
	 */
	struct mlx5_vdpa_umem umem1;
	struct mlx5_vdpa_umem umem2;
	struct mlx5_vdpa_umem umem3;

	u32 counter_set_id;
	bool initialized;
	int index;		/* virtqueue index; odd indices are TX (see vq_is_tx()) */
	u32 virtq_id;		/* object id returned when the device queue is created */
	struct mlx5_vdpa_net *ndev;
	u16 avail_idx;		/* programmed as hw_available_index at creation */
	u16 used_idx;		/* programmed as hw_used_index at creation */
	int fw_state;		/* MLX5_VIRTIO_NET_Q_OBJECT_* state tracked by the driver */

	u64 modified_fields;

	/* Memory regions recorded by create_virtqueue() and released by
	 * destroy_virtqueue().
	 */
	struct mlx5_vdpa_mr *vq_mr;
	struct mlx5_vdpa_mr *desc_mr;

	/* When map.virq is non-zero the queue is created in MSI-X event mode
	 * using map.index; otherwise QP event mode is used.
	 */
	struct msi_map map;

	/* keep last in the struct */
	struct mlx5_vq_restore_info ri;
};
134 
135 static bool is_index_valid(struct mlx5_vdpa_dev *mvdev, u16 idx)
136 {
137 	if (!(mvdev->actual_features & BIT_ULL(VIRTIO_NET_F_MQ))) {
138 		if (!(mvdev->actual_features & BIT_ULL(VIRTIO_NET_F_CTRL_VQ)))
139 			return idx < 2;
140 		else
141 			return idx < 3;
142 	}
143 
144 	return idx <= mvdev->max_idx;
145 }
146 
/* Forward declarations of helpers that are not defined in this part of the file. */
static void free_resources(struct mlx5_vdpa_net *ndev);
static void init_mvqs(struct mlx5_vdpa_net *ndev);
static int setup_driver(struct mlx5_vdpa_dev *mvdev);
static void teardown_driver(struct mlx5_vdpa_net *ndev);

/* Gates the verbose dumps in print_status() and print_features(); warnings
 * about unknown bits are emitted regardless.
 */
static bool mlx5_vdpa_debug;
153 
/* Log the name of @_feature if its bit is set. Expects variables named
 * `features` and `mvdev` to be in scope at the expansion site.
 */
#define MLX5_LOG_VIO_FLAG(_feature)                                                                \
	do {                                                                                       \
		if (features & BIT_ULL(_feature))                                                  \
			mlx5_vdpa_info(mvdev, "%s\n", #_feature);                                  \
	} while (0)
159 
/* Log the name of @_status if that bit is set. Expects variables named
 * `status` and `mvdev` to be in scope at the expansion site.
 */
#define MLX5_LOG_VIO_STAT(_status)                                                                 \
	do {                                                                                       \
		if (status & (_status))                                                            \
			mlx5_vdpa_info(mvdev, "%s\n", #_status);                                   \
	} while (0)
165 
166 /* TODO: cross-endian support */
167 static inline bool mlx5_vdpa_is_little_endian(struct mlx5_vdpa_dev *mvdev)
168 {
169 	return virtio_legacy_is_little_endian() ||
170 		(mvdev->actual_features & BIT_ULL(VIRTIO_F_VERSION_1));
171 }
172 
173 static u16 mlx5vdpa16_to_cpu(struct mlx5_vdpa_dev *mvdev, __virtio16 val)
174 {
175 	return __virtio16_to_cpu(mlx5_vdpa_is_little_endian(mvdev), val);
176 }
177 
178 static __virtio16 cpu_to_mlx5vdpa16(struct mlx5_vdpa_dev *mvdev, u16 val)
179 {
180 	return __cpu_to_virtio16(mlx5_vdpa_is_little_endian(mvdev), val);
181 }
182 
183 static u16 ctrl_vq_idx(struct mlx5_vdpa_dev *mvdev)
184 {
185 	if (!(mvdev->actual_features & BIT_ULL(VIRTIO_NET_F_MQ)))
186 		return 2;
187 
188 	return mvdev->max_vqs;
189 }
190 
191 static bool is_ctrl_vq_idx(struct mlx5_vdpa_dev *mvdev, u16 idx)
192 {
193 	return idx == ctrl_vq_idx(mvdev);
194 }
195 
196 static void print_status(struct mlx5_vdpa_dev *mvdev, u8 status, bool set)
197 {
198 	if (status & ~VALID_STATUS_MASK)
199 		mlx5_vdpa_warn(mvdev, "Warning: there are invalid status bits 0x%x\n",
200 			       status & ~VALID_STATUS_MASK);
201 
202 	if (!mlx5_vdpa_debug)
203 		return;
204 
205 	mlx5_vdpa_info(mvdev, "driver status %s", set ? "set" : "get");
206 	if (set && !status) {
207 		mlx5_vdpa_info(mvdev, "driver resets the device\n");
208 		return;
209 	}
210 
211 	MLX5_LOG_VIO_STAT(VIRTIO_CONFIG_S_ACKNOWLEDGE);
212 	MLX5_LOG_VIO_STAT(VIRTIO_CONFIG_S_DRIVER);
213 	MLX5_LOG_VIO_STAT(VIRTIO_CONFIG_S_DRIVER_OK);
214 	MLX5_LOG_VIO_STAT(VIRTIO_CONFIG_S_FEATURES_OK);
215 	MLX5_LOG_VIO_STAT(VIRTIO_CONFIG_S_NEEDS_RESET);
216 	MLX5_LOG_VIO_STAT(VIRTIO_CONFIG_S_FAILED);
217 }
218 
219 static void print_features(struct mlx5_vdpa_dev *mvdev, u64 features, bool set)
220 {
221 	if (features & ~VALID_FEATURES_MASK)
222 		mlx5_vdpa_warn(mvdev, "There are invalid feature bits 0x%llx\n",
223 			       features & ~VALID_FEATURES_MASK);
224 
225 	if (!mlx5_vdpa_debug)
226 		return;
227 
228 	mlx5_vdpa_info(mvdev, "driver %s feature bits:\n", set ? "sets" : "reads");
229 	if (!features)
230 		mlx5_vdpa_info(mvdev, "all feature bits are cleared\n");
231 
232 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_CSUM);
233 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_GUEST_CSUM);
234 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_CTRL_GUEST_OFFLOADS);
235 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_MTU);
236 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_MAC);
237 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_GUEST_TSO4);
238 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_GUEST_TSO6);
239 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_GUEST_ECN);
240 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_GUEST_UFO);
241 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_HOST_TSO4);
242 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_HOST_TSO6);
243 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_HOST_ECN);
244 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_HOST_UFO);
245 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_MRG_RXBUF);
246 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_STATUS);
247 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_CTRL_VQ);
248 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_CTRL_RX);
249 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_CTRL_VLAN);
250 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_CTRL_RX_EXTRA);
251 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_GUEST_ANNOUNCE);
252 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_MQ);
253 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_CTRL_MAC_ADDR);
254 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_HASH_REPORT);
255 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_RSS);
256 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_RSC_EXT);
257 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_STANDBY);
258 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_SPEED_DUPLEX);
259 	MLX5_LOG_VIO_FLAG(VIRTIO_F_NOTIFY_ON_EMPTY);
260 	MLX5_LOG_VIO_FLAG(VIRTIO_F_ANY_LAYOUT);
261 	MLX5_LOG_VIO_FLAG(VIRTIO_F_VERSION_1);
262 	MLX5_LOG_VIO_FLAG(VIRTIO_F_ACCESS_PLATFORM);
263 	MLX5_LOG_VIO_FLAG(VIRTIO_F_RING_PACKED);
264 	MLX5_LOG_VIO_FLAG(VIRTIO_F_ORDER_PLATFORM);
265 	MLX5_LOG_VIO_FLAG(VIRTIO_F_SR_IOV);
266 }
267 
268 static int create_tis(struct mlx5_vdpa_net *ndev)
269 {
270 	struct mlx5_vdpa_dev *mvdev = &ndev->mvdev;
271 	u32 in[MLX5_ST_SZ_DW(create_tis_in)] = {};
272 	void *tisc;
273 	int err;
274 
275 	tisc = MLX5_ADDR_OF(create_tis_in, in, ctx);
276 	MLX5_SET(tisc, tisc, transport_domain, ndev->res.tdn);
277 	err = mlx5_vdpa_create_tis(mvdev, in, &ndev->res.tisn);
278 	if (err)
279 		mlx5_vdpa_warn(mvdev, "create TIS (%d)\n", err);
280 
281 	return err;
282 }
283 
284 static void destroy_tis(struct mlx5_vdpa_net *ndev)
285 {
286 	mlx5_vdpa_destroy_tis(&ndev->mvdev, ndev->res.tisn);
287 }
288 
/* Size in bytes of one completion queue entry, and its base-2 logarithm. */
#define MLX5_VDPA_CQE_SIZE 64
#define MLX5_VDPA_LOG_CQE_SIZE ilog2(MLX5_VDPA_CQE_SIZE)
291 
292 static int cq_frag_buf_alloc(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_cq_buf *buf, int nent)
293 {
294 	struct mlx5_frag_buf *frag_buf = &buf->frag_buf;
295 	u8 log_wq_stride = MLX5_VDPA_LOG_CQE_SIZE;
296 	u8 log_wq_sz = MLX5_VDPA_LOG_CQE_SIZE;
297 	int err;
298 
299 	err = mlx5_frag_buf_alloc_node(ndev->mvdev.mdev, nent * MLX5_VDPA_CQE_SIZE, frag_buf,
300 				       ndev->mvdev.mdev->priv.numa_node);
301 	if (err)
302 		return err;
303 
304 	mlx5_init_fbc(frag_buf->frags, log_wq_stride, log_wq_sz, &buf->fbc);
305 
306 	buf->cqe_size = MLX5_VDPA_CQE_SIZE;
307 	buf->nent = nent;
308 
309 	return 0;
310 }
311 
312 static int umem_frag_buf_alloc(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_umem *umem, int size)
313 {
314 	struct mlx5_frag_buf *frag_buf = &umem->frag_buf;
315 
316 	return mlx5_frag_buf_alloc_node(ndev->mvdev.mdev, size, frag_buf,
317 					ndev->mvdev.mdev->priv.numa_node);
318 }
319 
320 static void cq_frag_buf_free(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_cq_buf *buf)
321 {
322 	mlx5_frag_buf_free(ndev->mvdev.mdev, &buf->frag_buf);
323 }
324 
325 static void *get_cqe(struct mlx5_vdpa_cq *vcq, int n)
326 {
327 	return mlx5_frag_buf_get_wqe(&vcq->buf.fbc, n);
328 }
329 
330 static void cq_frag_buf_init(struct mlx5_vdpa_cq *vcq, struct mlx5_vdpa_cq_buf *buf)
331 {
332 	struct mlx5_cqe64 *cqe64;
333 	void *cqe;
334 	int i;
335 
336 	for (i = 0; i < buf->nent; i++) {
337 		cqe = get_cqe(vcq, i);
338 		cqe64 = cqe;
339 		cqe64->op_own = MLX5_CQE_INVALID << 4;
340 	}
341 }
342 
343 static void *get_sw_cqe(struct mlx5_vdpa_cq *cq, int n)
344 {
345 	struct mlx5_cqe64 *cqe64 = get_cqe(cq, n & (cq->cqe - 1));
346 
347 	if (likely(get_cqe_opcode(cqe64) != MLX5_CQE_INVALID) &&
348 	    !((cqe64->op_own & MLX5_CQE_OWNER_MASK) ^ !!(n & cq->cqe)))
349 		return cqe64;
350 
351 	return NULL;
352 }
353 
354 static void rx_post(struct mlx5_vdpa_qp *vqp, int n)
355 {
356 	vqp->head += n;
357 	vqp->db.db[0] = cpu_to_be32(vqp->head);
358 }
359 
360 static void qp_prepare(struct mlx5_vdpa_net *ndev, bool fw, void *in,
361 		       struct mlx5_vdpa_virtqueue *mvq, u32 num_ent)
362 {
363 	struct mlx5_vdpa_qp *vqp;
364 	__be64 *pas;
365 	void *qpc;
366 
367 	vqp = fw ? &mvq->fwqp : &mvq->vqqp;
368 	MLX5_SET(create_qp_in, in, uid, ndev->mvdev.res.uid);
369 	qpc = MLX5_ADDR_OF(create_qp_in, in, qpc);
370 	if (vqp->fw) {
371 		/* Firmware QP is allocated by the driver for the firmware's
372 		 * use so we can skip part of the params as they will be chosen by firmware
373 		 */
374 		qpc = MLX5_ADDR_OF(create_qp_in, in, qpc);
375 		MLX5_SET(qpc, qpc, rq_type, MLX5_ZERO_LEN_RQ);
376 		MLX5_SET(qpc, qpc, no_sq, 1);
377 		return;
378 	}
379 
380 	MLX5_SET(qpc, qpc, st, MLX5_QP_ST_RC);
381 	MLX5_SET(qpc, qpc, pm_state, MLX5_QP_PM_MIGRATED);
382 	MLX5_SET(qpc, qpc, pd, ndev->mvdev.res.pdn);
383 	MLX5_SET(qpc, qpc, mtu, MLX5_QPC_MTU_256_BYTES);
384 	MLX5_SET(qpc, qpc, uar_page, ndev->mvdev.res.uar->index);
385 	MLX5_SET(qpc, qpc, log_page_size, vqp->frag_buf.page_shift - MLX5_ADAPTER_PAGE_SHIFT);
386 	MLX5_SET(qpc, qpc, no_sq, 1);
387 	MLX5_SET(qpc, qpc, cqn_rcv, mvq->cq.mcq.cqn);
388 	MLX5_SET(qpc, qpc, log_rq_size, ilog2(num_ent));
389 	MLX5_SET(qpc, qpc, rq_type, MLX5_NON_ZERO_RQ);
390 	pas = (__be64 *)MLX5_ADDR_OF(create_qp_in, in, pas);
391 	mlx5_fill_page_frag_array(&vqp->frag_buf, pas);
392 }
393 
394 static int rq_buf_alloc(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_qp *vqp, u32 num_ent)
395 {
396 	return mlx5_frag_buf_alloc_node(ndev->mvdev.mdev,
397 					num_ent * sizeof(struct mlx5_wqe_data_seg), &vqp->frag_buf,
398 					ndev->mvdev.mdev->priv.numa_node);
399 }
400 
401 static void rq_buf_free(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_qp *vqp)
402 {
403 	mlx5_frag_buf_free(ndev->mvdev.mdev, &vqp->frag_buf);
404 }
405 
/* Create one QP of @mvq's notification connection.
 *
 * For the driver-side QP (vqp->fw false) a receive buffer and a doorbell
 * record are allocated first and the command buffer is enlarged to carry
 * the buffer's page list. For the firmware-side QP no host memory is
 * allocated.
 *
 * On success vqp->mqp.qpn and vqp->mqp.uid are filled in, and for the
 * driver-side QP the whole receive queue is posted.
 *
 * Return: 0 on success, -ENOMEM or the error from the failing allocation
 * or CREATE_QP command. Resources allocated here are released on failure.
 */
static int qp_create(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq,
		     struct mlx5_vdpa_qp *vqp)
{
	struct mlx5_core_dev *mdev = ndev->mvdev.mdev;
	int inlen = MLX5_ST_SZ_BYTES(create_qp_in);
	u32 out[MLX5_ST_SZ_DW(create_qp_out)] = {};
	void *qpc;
	void *in;
	int err;

	if (!vqp->fw) {
		/* A non-firmware QP is always the virtqueue's vqqp. */
		vqp = &mvq->vqqp;
		err = rq_buf_alloc(ndev, vqp, mvq->num_ent);
		if (err)
			return err;

		err = mlx5_db_alloc(ndev->mvdev.mdev, &vqp->db);
		if (err)
			goto err_db;
		/* Room for one physical address per page of the RQ buffer. */
		inlen += vqp->frag_buf.npages * sizeof(__be64);
	}

	in = kzalloc(inlen, GFP_KERNEL);
	if (!in) {
		err = -ENOMEM;
		goto err_kzalloc;
	}

	qp_prepare(ndev, vqp->fw, in, mvq, mvq->num_ent);
	/* These attributes are set for both kinds of QP; qp_prepare() returns
	 * early for the firmware QP and does not set them in that case.
	 */
	qpc = MLX5_ADDR_OF(create_qp_in, in, qpc);
	MLX5_SET(qpc, qpc, st, MLX5_QP_ST_RC);
	MLX5_SET(qpc, qpc, pm_state, MLX5_QP_PM_MIGRATED);
	MLX5_SET(qpc, qpc, pd, ndev->mvdev.res.pdn);
	MLX5_SET(qpc, qpc, mtu, MLX5_QPC_MTU_256_BYTES);
	if (!vqp->fw)
		MLX5_SET64(qpc, qpc, dbr_addr, vqp->db.dma);
	MLX5_SET(create_qp_in, in, opcode, MLX5_CMD_OP_CREATE_QP);
	err = mlx5_cmd_exec(mdev, in, inlen, out, sizeof(out));
	/* The command buffer is no longer needed whether or not it succeeded. */
	kfree(in);
	if (err)
		goto err_kzalloc;

	vqp->mqp.uid = ndev->mvdev.res.uid;
	vqp->mqp.qpn = MLX5_GET(create_qp_out, out, qpn);

	if (!vqp->fw)
		rx_post(vqp, mvq->num_ent);

	return 0;

	/* Unwind in reverse order; only the driver-side QP owns host resources. */
err_kzalloc:
	if (!vqp->fw)
		mlx5_db_free(ndev->mvdev.mdev, &vqp->db);
err_db:
	if (!vqp->fw)
		rq_buf_free(ndev, vqp);

	return err;
}
465 
466 static void qp_destroy(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_qp *vqp)
467 {
468 	u32 in[MLX5_ST_SZ_DW(destroy_qp_in)] = {};
469 
470 	MLX5_SET(destroy_qp_in, in, opcode, MLX5_CMD_OP_DESTROY_QP);
471 	MLX5_SET(destroy_qp_in, in, qpn, vqp->mqp.qpn);
472 	MLX5_SET(destroy_qp_in, in, uid, ndev->mvdev.res.uid);
473 	if (mlx5_cmd_exec_in(ndev->mvdev.mdev, destroy_qp, in))
474 		mlx5_vdpa_warn(&ndev->mvdev, "destroy qp 0x%x\n", vqp->mqp.qpn);
475 	if (!vqp->fw) {
476 		mlx5_db_free(ndev->mvdev.mdev, &vqp->db);
477 		rq_buf_free(ndev, vqp);
478 	}
479 }
480 
481 static void *next_cqe_sw(struct mlx5_vdpa_cq *cq)
482 {
483 	return get_sw_cqe(cq, cq->mcq.cons_index);
484 }
485 
486 static int mlx5_vdpa_poll_one(struct mlx5_vdpa_cq *vcq)
487 {
488 	struct mlx5_cqe64 *cqe64;
489 
490 	cqe64 = next_cqe_sw(vcq);
491 	if (!cqe64)
492 		return -EAGAIN;
493 
494 	vcq->mcq.cons_index++;
495 	return 0;
496 }
497 
498 static void mlx5_vdpa_handle_completions(struct mlx5_vdpa_virtqueue *mvq, int num)
499 {
500 	struct mlx5_vdpa_net *ndev = mvq->ndev;
501 	struct vdpa_callback *event_cb;
502 
503 	event_cb = &ndev->event_cbs[mvq->index];
504 	mlx5_cq_set_ci(&mvq->cq.mcq);
505 
506 	/* make sure CQ cosumer update is visible to the hardware before updating
507 	 * RX doorbell record.
508 	 */
509 	dma_wmb();
510 	rx_post(&mvq->vqqp, num);
511 	if (event_cb->callback)
512 		event_cb->callback(event_cb->private);
513 }
514 
515 static void mlx5_vdpa_cq_comp(struct mlx5_core_cq *mcq, struct mlx5_eqe *eqe)
516 {
517 	struct mlx5_vdpa_virtqueue *mvq = container_of(mcq, struct mlx5_vdpa_virtqueue, cq.mcq);
518 	struct mlx5_vdpa_net *ndev = mvq->ndev;
519 	void __iomem *uar_page = ndev->mvdev.res.uar->map;
520 	int num = 0;
521 
522 	while (!mlx5_vdpa_poll_one(&mvq->cq)) {
523 		num++;
524 		if (num > mvq->num_ent / 2) {
525 			/* If completions keep coming while we poll, we want to
526 			 * let the hardware know that we consumed them by
527 			 * updating the doorbell record.  We also let vdpa core
528 			 * know about this so it passes it on the virtio driver
529 			 * on the guest.
530 			 */
531 			mlx5_vdpa_handle_completions(mvq, num);
532 			num = 0;
533 		}
534 	}
535 
536 	if (num)
537 		mlx5_vdpa_handle_completions(mvq, num);
538 
539 	mlx5_cq_arm(&mvq->cq.mcq, MLX5_CQ_DB_REQ_NOT, uar_page, mvq->cq.mcq.cons_index);
540 }
541 
/* Create and arm the completion queue of virtqueue @idx with @num_ent
 * entries.
 *
 * Allocates the doorbell record and the CQE buffer, marks all CQEs invalid,
 * issues the create-CQ command on completion vector 0, installs
 * mlx5_vdpa_cq_comp() as the completion handler and arms the CQ.
 *
 * Return: 0 on success; otherwise the error of the failing step, with
 * everything allocated here released.
 */
static int cq_create(struct mlx5_vdpa_net *ndev, u16 idx, u32 num_ent)
{
	struct mlx5_vdpa_virtqueue *mvq = &ndev->vqs[idx];
	struct mlx5_core_dev *mdev = ndev->mvdev.mdev;
	void __iomem *uar_page = ndev->mvdev.res.uar->map;
	u32 out[MLX5_ST_SZ_DW(create_cq_out)];
	struct mlx5_vdpa_cq *vcq = &mvq->cq;
	__be64 *pas;
	int inlen;
	void *cqc;
	void *in;
	int err;
	int eqn;

	err = mlx5_db_alloc(mdev, &vcq->db);
	if (err)
		return err;

	/* First doorbell word holds the consumer index, the second the arm state. */
	vcq->mcq.set_ci_db = vcq->db.db;
	vcq->mcq.arm_db = vcq->db.db + 1;
	vcq->mcq.cqe_sz = 64;

	err = cq_frag_buf_alloc(ndev, &vcq->buf, num_ent);
	if (err)
		goto err_db;

	cq_frag_buf_init(vcq, &vcq->buf);

	/* Command buffer: fixed part plus one physical address per buffer page. */
	inlen = MLX5_ST_SZ_BYTES(create_cq_in) +
		MLX5_FLD_SZ_BYTES(create_cq_in, pas[0]) * vcq->buf.frag_buf.npages;
	in = kzalloc(inlen, GFP_KERNEL);
	if (!in) {
		err = -ENOMEM;
		goto err_vzalloc;
	}

	MLX5_SET(create_cq_in, in, uid, ndev->mvdev.res.uid);
	pas = (__be64 *)MLX5_ADDR_OF(create_cq_in, in, pas);
	mlx5_fill_page_frag_array(&vcq->buf.frag_buf, pas);

	cqc = MLX5_ADDR_OF(create_cq_in, in, cq_context);
	MLX5_SET(cqc, cqc, log_page_size, vcq->buf.frag_buf.page_shift - MLX5_ADAPTER_PAGE_SHIFT);

	/* Use vector 0 by default. Consider adding code to choose least used
	 * vector.
	 */
	err = mlx5_comp_eqn_get(mdev, 0, &eqn);
	if (err)
		goto err_vec;

	cqc = MLX5_ADDR_OF(create_cq_in, in, cq_context);
	MLX5_SET(cqc, cqc, log_cq_size, ilog2(num_ent));
	MLX5_SET(cqc, cqc, uar_page, ndev->mvdev.res.uar->index);
	MLX5_SET(cqc, cqc, c_eqn_or_apu_element, eqn);
	MLX5_SET64(cqc, cqc, dbr_addr, vcq->db.dma);

	err = mlx5_core_create_cq(mdev, &vcq->mcq, in, inlen, out, sizeof(out));
	if (err)
		goto err_vec;

	/* NOTE(review): the handler and doorbell pointers are (re)assigned
	 * after the CQ exists but before it is armed below - confirm that
	 * mlx5_core_create_cq() does not require them beforehand.
	 */
	vcq->mcq.comp = mlx5_vdpa_cq_comp;
	vcq->cqe = num_ent;
	vcq->mcq.set_ci_db = vcq->db.db;
	vcq->mcq.arm_db = vcq->db.db + 1;
	mlx5_cq_arm(&mvq->cq.mcq, MLX5_CQ_DB_REQ_NOT, uar_page, mvq->cq.mcq.cons_index);
	kfree(in);
	return 0;

	/* Unwind in reverse order of allocation. */
err_vec:
	kfree(in);
err_vzalloc:
	cq_frag_buf_free(ndev, &vcq->buf);
err_db:
	mlx5_db_free(ndev->mvdev.mdev, &vcq->db);
	return err;
}
618 
619 static void cq_destroy(struct mlx5_vdpa_net *ndev, u16 idx)
620 {
621 	struct mlx5_vdpa_virtqueue *mvq = &ndev->vqs[idx];
622 	struct mlx5_core_dev *mdev = ndev->mvdev.mdev;
623 	struct mlx5_vdpa_cq *vcq = &mvq->cq;
624 
625 	if (mlx5_core_destroy_cq(mdev, &vcq->mcq)) {
626 		mlx5_vdpa_warn(&ndev->mvdev, "destroy CQ 0x%x\n", vcq->mcq.cqn);
627 		return;
628 	}
629 	cq_frag_buf_free(ndev, &vcq->buf);
630 	mlx5_db_free(ndev->mvdev.mdev, &vcq->db);
631 }
632 
633 static int read_umem_params(struct mlx5_vdpa_net *ndev)
634 {
635 	u32 in[MLX5_ST_SZ_DW(query_hca_cap_in)] = {};
636 	u16 opmod = (MLX5_CAP_VDPA_EMULATION << 1) | (HCA_CAP_OPMOD_GET_CUR & 0x01);
637 	struct mlx5_core_dev *mdev = ndev->mvdev.mdev;
638 	int out_size;
639 	void *caps;
640 	void *out;
641 	int err;
642 
643 	out_size = MLX5_ST_SZ_BYTES(query_hca_cap_out);
644 	out = kzalloc(out_size, GFP_KERNEL);
645 	if (!out)
646 		return -ENOMEM;
647 
648 	MLX5_SET(query_hca_cap_in, in, opcode, MLX5_CMD_OP_QUERY_HCA_CAP);
649 	MLX5_SET(query_hca_cap_in, in, op_mod, opmod);
650 	err = mlx5_cmd_exec_inout(mdev, query_hca_cap, in, out);
651 	if (err) {
652 		mlx5_vdpa_warn(&ndev->mvdev,
653 			"Failed reading vdpa umem capabilities with err %d\n", err);
654 		goto out;
655 	}
656 
657 	caps =  MLX5_ADDR_OF(query_hca_cap_out, out, capability);
658 
659 	ndev->umem_1_buffer_param_a = MLX5_GET(virtio_emulation_cap, caps, umem_1_buffer_param_a);
660 	ndev->umem_1_buffer_param_b = MLX5_GET(virtio_emulation_cap, caps, umem_1_buffer_param_b);
661 
662 	ndev->umem_2_buffer_param_a = MLX5_GET(virtio_emulation_cap, caps, umem_2_buffer_param_a);
663 	ndev->umem_2_buffer_param_b = MLX5_GET(virtio_emulation_cap, caps, umem_2_buffer_param_b);
664 
665 	ndev->umem_3_buffer_param_a = MLX5_GET(virtio_emulation_cap, caps, umem_3_buffer_param_a);
666 	ndev->umem_3_buffer_param_b = MLX5_GET(virtio_emulation_cap, caps, umem_3_buffer_param_b);
667 
668 out:
669 	kfree(out);
670 	return 0;
671 }
672 
673 static void set_umem_size(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq, int num,
674 			  struct mlx5_vdpa_umem **umemp)
675 {
676 	u32 p_a;
677 	u32 p_b;
678 
679 	switch (num) {
680 	case 1:
681 		p_a = ndev->umem_1_buffer_param_a;
682 		p_b = ndev->umem_1_buffer_param_b;
683 		*umemp = &mvq->umem1;
684 		break;
685 	case 2:
686 		p_a = ndev->umem_2_buffer_param_a;
687 		p_b = ndev->umem_2_buffer_param_b;
688 		*umemp = &mvq->umem2;
689 		break;
690 	case 3:
691 		p_a = ndev->umem_3_buffer_param_a;
692 		p_b = ndev->umem_3_buffer_param_b;
693 		*umemp = &mvq->umem3;
694 		break;
695 	}
696 
697 	(*umemp)->size = p_a * mvq->num_ent + p_b;
698 }
699 
700 static void umem_frag_buf_free(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_umem *umem)
701 {
702 	mlx5_frag_buf_free(ndev->mvdev.mdev, &umem->frag_buf);
703 }
704 
705 static int create_umem(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq, int num)
706 {
707 	int inlen;
708 	u32 out[MLX5_ST_SZ_DW(create_umem_out)] = {};
709 	void *um;
710 	void *in;
711 	int err;
712 	__be64 *pas;
713 	struct mlx5_vdpa_umem *umem;
714 
715 	set_umem_size(ndev, mvq, num, &umem);
716 	err = umem_frag_buf_alloc(ndev, umem, umem->size);
717 	if (err)
718 		return err;
719 
720 	inlen = MLX5_ST_SZ_BYTES(create_umem_in) + MLX5_ST_SZ_BYTES(mtt) * umem->frag_buf.npages;
721 
722 	in = kzalloc(inlen, GFP_KERNEL);
723 	if (!in) {
724 		err = -ENOMEM;
725 		goto err_in;
726 	}
727 
728 	MLX5_SET(create_umem_in, in, opcode, MLX5_CMD_OP_CREATE_UMEM);
729 	MLX5_SET(create_umem_in, in, uid, ndev->mvdev.res.uid);
730 	um = MLX5_ADDR_OF(create_umem_in, in, umem);
731 	MLX5_SET(umem, um, log_page_size, umem->frag_buf.page_shift - MLX5_ADAPTER_PAGE_SHIFT);
732 	MLX5_SET64(umem, um, num_of_mtt, umem->frag_buf.npages);
733 
734 	pas = (__be64 *)MLX5_ADDR_OF(umem, um, mtt[0]);
735 	mlx5_fill_page_frag_array_perm(&umem->frag_buf, pas, MLX5_MTT_PERM_RW);
736 
737 	err = mlx5_cmd_exec(ndev->mvdev.mdev, in, inlen, out, sizeof(out));
738 	if (err) {
739 		mlx5_vdpa_warn(&ndev->mvdev, "create umem(%d)\n", err);
740 		goto err_cmd;
741 	}
742 
743 	kfree(in);
744 	umem->id = MLX5_GET(create_umem_out, out, umem_id);
745 
746 	return 0;
747 
748 err_cmd:
749 	kfree(in);
750 err_in:
751 	umem_frag_buf_free(ndev, umem);
752 	return err;
753 }
754 
755 static void umem_destroy(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq, int num)
756 {
757 	u32 in[MLX5_ST_SZ_DW(destroy_umem_in)] = {};
758 	u32 out[MLX5_ST_SZ_DW(destroy_umem_out)] = {};
759 	struct mlx5_vdpa_umem *umem;
760 
761 	switch (num) {
762 	case 1:
763 		umem = &mvq->umem1;
764 		break;
765 	case 2:
766 		umem = &mvq->umem2;
767 		break;
768 	case 3:
769 		umem = &mvq->umem3;
770 		break;
771 	}
772 
773 	MLX5_SET(destroy_umem_in, in, opcode, MLX5_CMD_OP_DESTROY_UMEM);
774 	MLX5_SET(destroy_umem_in, in, umem_id, umem->id);
775 	if (mlx5_cmd_exec(ndev->mvdev.mdev, in, sizeof(in), out, sizeof(out)))
776 		return;
777 
778 	umem_frag_buf_free(ndev, umem);
779 }
780 
/* Create umems 1, 2 and 3 of @mvq in that order.
 *
 * Return: 0 on success. On failure the umems already created are destroyed
 * in reverse order and the error of the failing create_umem() is returned.
 */
static int umems_create(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq)
{
	int created;
	int err = 0;

	for (created = 0; created < 3; created++) {
		err = create_umem(ndev, mvq, created + 1);
		if (err)
			break;
	}

	if (!err)
		return 0;

	/* `created` counts the umems that succeeded; they are numbered 1..created. */
	while (created > 0) {
		umem_destroy(ndev, mvq, created);
		created--;
	}

	return err;
}
799 
/* Destroy umems 3, 2 and 1 of @mvq, in that order. */
static void umems_destroy(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq)
{
	int num = 3;

	while (num >= 1) {
		umem_destroy(ndev, mvq, num);
		num--;
	}
}
807 
808 static int get_queue_type(struct mlx5_vdpa_net *ndev)
809 {
810 	u32 type_mask;
811 
812 	type_mask = MLX5_CAP_DEV_VDPA_EMULATION(ndev->mvdev.mdev, virtio_queue_type);
813 
814 	/* prefer split queue */
815 	if (type_mask & MLX5_VIRTIO_EMULATION_CAP_VIRTIO_QUEUE_TYPE_SPLIT)
816 		return MLX5_VIRTIO_EMULATION_VIRTIO_QUEUE_TYPE_SPLIT;
817 
818 	WARN_ON(!(type_mask & MLX5_VIRTIO_EMULATION_CAP_VIRTIO_QUEUE_TYPE_PACKED));
819 
820 	return MLX5_VIRTIO_EMULATION_VIRTIO_QUEUE_TYPE_PACKED;
821 }
822 
823 static bool vq_is_tx(u16 idx)
824 {
825 	return idx % 2;
826 }
827 
/* Bit positions of virtio-net features in the 13-bit mask that
 * get_features() builds and create_virtqueue() writes to the
 * queue_feature_bit_mask_12_3 / queue_feature_bit_mask_2_0 fields.
 */
enum {
	MLX5_VIRTIO_NET_F_MRG_RXBUF = 2,
	MLX5_VIRTIO_NET_F_HOST_ECN = 4,
	MLX5_VIRTIO_NET_F_GUEST_ECN = 6,
	MLX5_VIRTIO_NET_F_GUEST_TSO6 = 7,
	MLX5_VIRTIO_NET_F_GUEST_TSO4 = 8,
	MLX5_VIRTIO_NET_F_GUEST_CSUM = 9,
	MLX5_VIRTIO_NET_F_CSUM = 10,
	MLX5_VIRTIO_NET_F_HOST_TSO6 = 11,
	MLX5_VIRTIO_NET_F_HOST_TSO4 = 12,
};
839 
840 static u16 get_features(u64 features)
841 {
842 	return (!!(features & BIT_ULL(VIRTIO_NET_F_MRG_RXBUF)) << MLX5_VIRTIO_NET_F_MRG_RXBUF) |
843 	       (!!(features & BIT_ULL(VIRTIO_NET_F_HOST_ECN)) << MLX5_VIRTIO_NET_F_HOST_ECN) |
844 	       (!!(features & BIT_ULL(VIRTIO_NET_F_GUEST_ECN)) << MLX5_VIRTIO_NET_F_GUEST_ECN) |
845 	       (!!(features & BIT_ULL(VIRTIO_NET_F_GUEST_TSO6)) << MLX5_VIRTIO_NET_F_GUEST_TSO6) |
846 	       (!!(features & BIT_ULL(VIRTIO_NET_F_GUEST_TSO4)) << MLX5_VIRTIO_NET_F_GUEST_TSO4) |
847 	       (!!(features & BIT_ULL(VIRTIO_NET_F_CSUM)) << MLX5_VIRTIO_NET_F_CSUM) |
848 	       (!!(features & BIT_ULL(VIRTIO_NET_F_HOST_TSO6)) << MLX5_VIRTIO_NET_F_HOST_TSO6) |
849 	       (!!(features & BIT_ULL(VIRTIO_NET_F_HOST_TSO4)) << MLX5_VIRTIO_NET_F_HOST_TSO4);
850 }
851 
852 static bool counters_supported(const struct mlx5_vdpa_dev *mvdev)
853 {
854 	return MLX5_CAP_GEN_64(mvdev->mdev, general_obj_types) &
855 	       BIT_ULL(MLX5_OBJ_TYPE_VIRTIO_Q_COUNTERS);
856 }
857 
858 static bool msix_mode_supported(struct mlx5_vdpa_dev *mvdev)
859 {
860 	return MLX5_CAP_DEV_VDPA_EMULATION(mvdev->mdev, event_mode) &
861 		(1 << MLX5_VIRTIO_Q_EVENT_MODE_MSIX_MODE) &&
862 		pci_msix_can_alloc_dyn(mvdev->mdev->pdev);
863 }
864 
/* Create the device-side virtio-net queue object for @mvq.
 *
 * Creates the three umems, then builds and executes a
 * CREATE_GENERAL_OBJECT command describing the queue: ring indices,
 * offload feature mask, queue type, event mode, ring addresses, memory
 * keys, umems, PD and (if supported) the counter set.
 *
 * On success the object id is stored in mvq->virtq_id, fw_state becomes
 * MLX5_VIRTIO_NET_Q_OBJECT_STATE_INIT and the memory regions used are
 * recorded in mvq->vq_mr / mvq->desc_mr.
 *
 * Return: 0 on success; -ENOMEM or the error of umems_create() or of the
 * command otherwise, with the umems destroyed.
 */
static int create_virtqueue(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq)
{
	int inlen = MLX5_ST_SZ_BYTES(create_virtio_net_q_in);
	u32 out[MLX5_ST_SZ_DW(create_virtio_net_q_out)] = {};
	struct mlx5_vdpa_dev *mvdev = &ndev->mvdev;
	struct mlx5_vdpa_mr *vq_mr;
	struct mlx5_vdpa_mr *vq_desc_mr;
	void *obj_context;
	u16 mlx_features;
	void *cmd_hdr;
	void *vq_ctx;
	void *in;
	int err;

	err = umems_create(ndev, mvq);
	if (err)
		return err;

	in = kzalloc(inlen, GFP_KERNEL);
	if (!in) {
		err = -ENOMEM;
		goto err_alloc;
	}

	mlx_features = get_features(ndev->mvdev.actual_features);
	cmd_hdr = MLX5_ADDR_OF(create_virtio_net_q_in, in, general_obj_in_cmd_hdr);

	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, opcode, MLX5_CMD_OP_CREATE_GENERAL_OBJECT);
	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, obj_type, MLX5_OBJ_TYPE_VIRTIO_NET_Q);
	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, uid, ndev->mvdev.res.uid);

	obj_context = MLX5_ADDR_OF(create_virtio_net_q_in, in, obj_context);
	MLX5_SET(virtio_net_q_object, obj_context, hw_available_index, mvq->avail_idx);
	MLX5_SET(virtio_net_q_object, obj_context, hw_used_index, mvq->used_idx);
	/* The feature mask is split across two fields: bits 12..3 and bits 2..0. */
	MLX5_SET(virtio_net_q_object, obj_context, queue_feature_bit_mask_12_3,
		 mlx_features >> 3);
	MLX5_SET(virtio_net_q_object, obj_context, queue_feature_bit_mask_2_0,
		 mlx_features & 7);
	vq_ctx = MLX5_ADDR_OF(virtio_net_q_object, obj_context, virtio_q_context);
	MLX5_SET(virtio_q, vq_ctx, virtio_q_type, get_queue_type(ndev));

	/* Only transmit queues are attached to the TIS. */
	if (vq_is_tx(mvq->index))
		MLX5_SET(virtio_net_q_object, obj_context, tisn_or_qpn, ndev->res.tisn);

	/* Notify through MSI-X when a vector was mapped for this queue,
	 * otherwise through the firmware QP of the notification channel.
	 */
	if (mvq->map.virq) {
		MLX5_SET(virtio_q, vq_ctx, event_mode, MLX5_VIRTIO_Q_EVENT_MODE_MSIX_MODE);
		MLX5_SET(virtio_q, vq_ctx, event_qpn_or_msix, mvq->map.index);
	} else {
		MLX5_SET(virtio_q, vq_ctx, event_mode, MLX5_VIRTIO_Q_EVENT_MODE_QP_MODE);
		MLX5_SET(virtio_q, vq_ctx, event_qpn_or_msix, mvq->fwqp.mqp.qpn);
	}

	MLX5_SET(virtio_q, vq_ctx, queue_index, mvq->index);
	MLX5_SET(virtio_q, vq_ctx, queue_size, mvq->num_ent);
	MLX5_SET(virtio_q, vq_ctx, virtio_version_1_0,
		 !!(ndev->mvdev.actual_features & BIT_ULL(VIRTIO_F_VERSION_1)));
	MLX5_SET64(virtio_q, vq_ctx, desc_addr, mvq->desc_addr);
	MLX5_SET64(virtio_q, vq_ctx, used_addr, mvq->device_addr);
	MLX5_SET64(virtio_q, vq_ctx, available_addr, mvq->driver_addr);
	/* Memory regions are looked up through the address space assigned to each group. */
	vq_mr = mvdev->mr[mvdev->group2asid[MLX5_VDPA_DATAVQ_GROUP]];
	if (vq_mr)
		MLX5_SET(virtio_q, vq_ctx, virtio_q_mkey, vq_mr->mkey);

	vq_desc_mr = mvdev->mr[mvdev->group2asid[MLX5_VDPA_DATAVQ_DESC_GROUP]];
	if (vq_desc_mr && MLX5_CAP_DEV_VDPA_EMULATION(mvdev->mdev, desc_group_mkey_supported))
		MLX5_SET(virtio_q, vq_ctx, desc_group_mkey, vq_desc_mr->mkey);

	MLX5_SET(virtio_q, vq_ctx, umem_1_id, mvq->umem1.id);
	MLX5_SET(virtio_q, vq_ctx, umem_1_size, mvq->umem1.size);
	MLX5_SET(virtio_q, vq_ctx, umem_2_id, mvq->umem2.id);
	MLX5_SET(virtio_q, vq_ctx, umem_2_size, mvq->umem2.size);
	MLX5_SET(virtio_q, vq_ctx, umem_3_id, mvq->umem3.id);
	MLX5_SET(virtio_q, vq_ctx, umem_3_size, mvq->umem3.size);
	MLX5_SET(virtio_q, vq_ctx, pd, ndev->mvdev.res.pdn);
	if (counters_supported(&ndev->mvdev))
		MLX5_SET(virtio_q, vq_ctx, counter_set_id, mvq->counter_set_id);

	err = mlx5_cmd_exec(ndev->mvdev.mdev, in, inlen, out, sizeof(out));
	if (err)
		goto err_cmd;

	mvq->fw_state = MLX5_VIRTIO_NET_Q_OBJECT_STATE_INIT;
	kfree(in);
	mvq->virtq_id = MLX5_GET(general_obj_out_cmd_hdr, out, obj_id);

	/* Record the MRs the queue was created with; destroy_virtqueue()
	 * releases them with mlx5_vdpa_put_mr().
	 * NOTE(review): vq_mr may be NULL here - presumably
	 * mlx5_vdpa_get_mr() tolerates that; confirm.
	 */
	mlx5_vdpa_get_mr(mvdev, vq_mr);
	mvq->vq_mr = vq_mr;

	if (vq_desc_mr && MLX5_CAP_DEV_VDPA_EMULATION(mvdev->mdev, desc_group_mkey_supported)) {
		mlx5_vdpa_get_mr(mvdev, vq_desc_mr);
		mvq->desc_mr = vq_desc_mr;
	}

	return 0;

err_cmd:
	kfree(in);
err_alloc:
	umems_destroy(ndev, mvq);
	return err;
}
966 
967 static void destroy_virtqueue(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq)
968 {
969 	u32 in[MLX5_ST_SZ_DW(destroy_virtio_net_q_in)] = {};
970 	u32 out[MLX5_ST_SZ_DW(destroy_virtio_net_q_out)] = {};
971 
972 	MLX5_SET(destroy_virtio_net_q_in, in, general_obj_out_cmd_hdr.opcode,
973 		 MLX5_CMD_OP_DESTROY_GENERAL_OBJECT);
974 	MLX5_SET(destroy_virtio_net_q_in, in, general_obj_out_cmd_hdr.obj_id, mvq->virtq_id);
975 	MLX5_SET(destroy_virtio_net_q_in, in, general_obj_out_cmd_hdr.uid, ndev->mvdev.res.uid);
976 	MLX5_SET(destroy_virtio_net_q_in, in, general_obj_out_cmd_hdr.obj_type,
977 		 MLX5_OBJ_TYPE_VIRTIO_NET_Q);
978 	if (mlx5_cmd_exec(ndev->mvdev.mdev, in, sizeof(in), out, sizeof(out))) {
979 		mlx5_vdpa_warn(&ndev->mvdev, "destroy virtqueue 0x%x\n", mvq->virtq_id);
980 		return;
981 	}
982 	mvq->fw_state = MLX5_VIRTIO_NET_Q_OBJECT_NONE;
983 	umems_destroy(ndev, mvq);
984 
985 	mlx5_vdpa_put_mr(&ndev->mvdev, mvq->vq_mr);
986 	mvq->vq_mr = NULL;
987 
988 	mlx5_vdpa_put_mr(&ndev->mvdev, mvq->desc_mr);
989 	mvq->desc_mr = NULL;
990 }
991 
992 static u32 get_rqpn(struct mlx5_vdpa_virtqueue *mvq, bool fw)
993 {
994 	return fw ? mvq->vqqp.mqp.qpn : mvq->fwqp.mqp.qpn;
995 }
996 
997 static u32 get_qpn(struct mlx5_vdpa_virtqueue *mvq, bool fw)
998 {
999 	return fw ? mvq->fwqp.mqp.qpn : mvq->vqqp.mqp.qpn;
1000 }
1001 
1002 static void alloc_inout(struct mlx5_vdpa_net *ndev, int cmd, void **in, int *inlen, void **out,
1003 			int *outlen, u32 qpn, u32 rqpn)
1004 {
1005 	void *qpc;
1006 	void *pp;
1007 
1008 	switch (cmd) {
1009 	case MLX5_CMD_OP_2RST_QP:
1010 		*inlen = MLX5_ST_SZ_BYTES(qp_2rst_in);
1011 		*outlen = MLX5_ST_SZ_BYTES(qp_2rst_out);
1012 		*in = kzalloc(*inlen, GFP_KERNEL);
1013 		*out = kzalloc(*outlen, GFP_KERNEL);
1014 		if (!*in || !*out)
1015 			goto outerr;
1016 
1017 		MLX5_SET(qp_2rst_in, *in, opcode, cmd);
1018 		MLX5_SET(qp_2rst_in, *in, uid, ndev->mvdev.res.uid);
1019 		MLX5_SET(qp_2rst_in, *in, qpn, qpn);
1020 		break;
1021 	case MLX5_CMD_OP_RST2INIT_QP:
1022 		*inlen = MLX5_ST_SZ_BYTES(rst2init_qp_in);
1023 		*outlen = MLX5_ST_SZ_BYTES(rst2init_qp_out);
1024 		*in = kzalloc(*inlen, GFP_KERNEL);
1025 		*out = kzalloc(MLX5_ST_SZ_BYTES(rst2init_qp_out), GFP_KERNEL);
1026 		if (!*in || !*out)
1027 			goto outerr;
1028 
1029 		MLX5_SET(rst2init_qp_in, *in, opcode, cmd);
1030 		MLX5_SET(rst2init_qp_in, *in, uid, ndev->mvdev.res.uid);
1031 		MLX5_SET(rst2init_qp_in, *in, qpn, qpn);
1032 		qpc = MLX5_ADDR_OF(rst2init_qp_in, *in, qpc);
1033 		MLX5_SET(qpc, qpc, remote_qpn, rqpn);
1034 		MLX5_SET(qpc, qpc, rwe, 1);
1035 		pp = MLX5_ADDR_OF(qpc, qpc, primary_address_path);
1036 		MLX5_SET(ads, pp, vhca_port_num, 1);
1037 		break;
1038 	case MLX5_CMD_OP_INIT2RTR_QP:
1039 		*inlen = MLX5_ST_SZ_BYTES(init2rtr_qp_in);
1040 		*outlen = MLX5_ST_SZ_BYTES(init2rtr_qp_out);
1041 		*in = kzalloc(*inlen, GFP_KERNEL);
1042 		*out = kzalloc(MLX5_ST_SZ_BYTES(init2rtr_qp_out), GFP_KERNEL);
1043 		if (!*in || !*out)
1044 			goto outerr;
1045 
1046 		MLX5_SET(init2rtr_qp_in, *in, opcode, cmd);
1047 		MLX5_SET(init2rtr_qp_in, *in, uid, ndev->mvdev.res.uid);
1048 		MLX5_SET(init2rtr_qp_in, *in, qpn, qpn);
1049 		qpc = MLX5_ADDR_OF(rst2init_qp_in, *in, qpc);
1050 		MLX5_SET(qpc, qpc, mtu, MLX5_QPC_MTU_256_BYTES);
1051 		MLX5_SET(qpc, qpc, log_msg_max, 30);
1052 		MLX5_SET(qpc, qpc, remote_qpn, rqpn);
1053 		pp = MLX5_ADDR_OF(qpc, qpc, primary_address_path);
1054 		MLX5_SET(ads, pp, fl, 1);
1055 		break;
1056 	case MLX5_CMD_OP_RTR2RTS_QP:
1057 		*inlen = MLX5_ST_SZ_BYTES(rtr2rts_qp_in);
1058 		*outlen = MLX5_ST_SZ_BYTES(rtr2rts_qp_out);
1059 		*in = kzalloc(*inlen, GFP_KERNEL);
1060 		*out = kzalloc(MLX5_ST_SZ_BYTES(rtr2rts_qp_out), GFP_KERNEL);
1061 		if (!*in || !*out)
1062 			goto outerr;
1063 
1064 		MLX5_SET(rtr2rts_qp_in, *in, opcode, cmd);
1065 		MLX5_SET(rtr2rts_qp_in, *in, uid, ndev->mvdev.res.uid);
1066 		MLX5_SET(rtr2rts_qp_in, *in, qpn, qpn);
1067 		qpc = MLX5_ADDR_OF(rst2init_qp_in, *in, qpc);
1068 		pp = MLX5_ADDR_OF(qpc, qpc, primary_address_path);
1069 		MLX5_SET(ads, pp, ack_timeout, 14);
1070 		MLX5_SET(qpc, qpc, retry_count, 7);
1071 		MLX5_SET(qpc, qpc, rnr_retry, 7);
1072 		break;
1073 	default:
1074 		goto outerr_nullify;
1075 	}
1076 
1077 	return;
1078 
1079 outerr:
1080 	kfree(*in);
1081 	kfree(*out);
1082 outerr_nullify:
1083 	*in = NULL;
1084 	*out = NULL;
1085 }
1086 
/* Release a pair of command mailboxes obtained from alloc_inout() */
static void free_inout(void *in, void *out)
{
	kfree(out);
	kfree(in);
}
1092 
1093 /* Two QPs are used by each virtqueue. One is used by the driver and one by
1094  * firmware. The fw argument indicates whether the subjected QP is the one used
1095  * by firmware.
1096  */
1097 static int modify_qp(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq, bool fw, int cmd)
1098 {
1099 	int outlen;
1100 	int inlen;
1101 	void *out;
1102 	void *in;
1103 	int err;
1104 
1105 	alloc_inout(ndev, cmd, &in, &inlen, &out, &outlen, get_qpn(mvq, fw), get_rqpn(mvq, fw));
1106 	if (!in || !out)
1107 		return -ENOMEM;
1108 
1109 	err = mlx5_cmd_exec(ndev->mvdev.mdev, in, inlen, out, outlen);
1110 	free_inout(in, out);
1111 	return err;
1112 }
1113 
1114 static int connect_qps(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq)
1115 {
1116 	int err;
1117 
1118 	err = modify_qp(ndev, mvq, true, MLX5_CMD_OP_2RST_QP);
1119 	if (err)
1120 		return err;
1121 
1122 	err = modify_qp(ndev, mvq, false, MLX5_CMD_OP_2RST_QP);
1123 	if (err)
1124 		return err;
1125 
1126 	err = modify_qp(ndev, mvq, true, MLX5_CMD_OP_RST2INIT_QP);
1127 	if (err)
1128 		return err;
1129 
1130 	err = modify_qp(ndev, mvq, false, MLX5_CMD_OP_RST2INIT_QP);
1131 	if (err)
1132 		return err;
1133 
1134 	err = modify_qp(ndev, mvq, true, MLX5_CMD_OP_INIT2RTR_QP);
1135 	if (err)
1136 		return err;
1137 
1138 	err = modify_qp(ndev, mvq, false, MLX5_CMD_OP_INIT2RTR_QP);
1139 	if (err)
1140 		return err;
1141 
1142 	return modify_qp(ndev, mvq, true, MLX5_CMD_OP_RTR2RTS_QP);
1143 }
1144 
/* Values read back from a firmware virtqueue object by query_virtqueue() */
struct mlx5_virtq_attr {
	u8 state;		/* "state" field of the queue object */
	u16 available_index;	/* "hw_available_index" field of the queue object */
	u16 used_index;		/* "hw_used_index" field of the queue object */
};
1150 
1151 static int query_virtqueue(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq,
1152 			   struct mlx5_virtq_attr *attr)
1153 {
1154 	int outlen = MLX5_ST_SZ_BYTES(query_virtio_net_q_out);
1155 	u32 in[MLX5_ST_SZ_DW(query_virtio_net_q_in)] = {};
1156 	void *out;
1157 	void *obj_context;
1158 	void *cmd_hdr;
1159 	int err;
1160 
1161 	out = kzalloc(outlen, GFP_KERNEL);
1162 	if (!out)
1163 		return -ENOMEM;
1164 
1165 	cmd_hdr = MLX5_ADDR_OF(query_virtio_net_q_in, in, general_obj_in_cmd_hdr);
1166 
1167 	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, opcode, MLX5_CMD_OP_QUERY_GENERAL_OBJECT);
1168 	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, obj_type, MLX5_OBJ_TYPE_VIRTIO_NET_Q);
1169 	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, obj_id, mvq->virtq_id);
1170 	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, uid, ndev->mvdev.res.uid);
1171 	err = mlx5_cmd_exec(ndev->mvdev.mdev, in, sizeof(in), out, outlen);
1172 	if (err)
1173 		goto err_cmd;
1174 
1175 	obj_context = MLX5_ADDR_OF(query_virtio_net_q_out, out, obj_context);
1176 	memset(attr, 0, sizeof(*attr));
1177 	attr->state = MLX5_GET(virtio_net_q_object, obj_context, state);
1178 	attr->available_index = MLX5_GET(virtio_net_q_object, obj_context, hw_available_index);
1179 	attr->used_index = MLX5_GET(virtio_net_q_object, obj_context, hw_used_index);
1180 	kfree(out);
1181 	return 0;
1182 
1183 err_cmd:
1184 	kfree(out);
1185 	return err;
1186 }
1187 
1188 static bool is_resumable(struct mlx5_vdpa_net *ndev)
1189 {
1190 	return ndev->mvdev.vdev.config->resume;
1191 }
1192 
1193 static bool is_valid_state_change(int oldstate, int newstate, bool resumable)
1194 {
1195 	switch (oldstate) {
1196 	case MLX5_VIRTIO_NET_Q_OBJECT_STATE_INIT:
1197 		return newstate == MLX5_VIRTIO_NET_Q_OBJECT_STATE_RDY;
1198 	case MLX5_VIRTIO_NET_Q_OBJECT_STATE_RDY:
1199 		return newstate == MLX5_VIRTIO_NET_Q_OBJECT_STATE_SUSPEND;
1200 	case MLX5_VIRTIO_NET_Q_OBJECT_STATE_SUSPEND:
1201 		return resumable ? newstate == MLX5_VIRTIO_NET_Q_OBJECT_STATE_RDY : false;
1202 	case MLX5_VIRTIO_NET_Q_OBJECT_STATE_ERR:
1203 	default:
1204 		return false;
1205 	}
1206 }
1207 
1208 static bool modifiable_virtqueue_fields(struct mlx5_vdpa_virtqueue *mvq)
1209 {
1210 	/* Only state is always modifiable */
1211 	if (mvq->modified_fields & ~MLX5_VIRTQ_MODIFY_MASK_STATE)
1212 		return mvq->fw_state == MLX5_VIRTIO_NET_Q_OBJECT_STATE_INIT ||
1213 		       mvq->fw_state == MLX5_VIRTIO_NET_Q_OBJECT_STATE_SUSPEND;
1214 
1215 	return true;
1216 }
1217 
1218 static int modify_virtqueue(struct mlx5_vdpa_net *ndev,
1219 			    struct mlx5_vdpa_virtqueue *mvq,
1220 			    int state)
1221 {
1222 	int inlen = MLX5_ST_SZ_BYTES(modify_virtio_net_q_in);
1223 	u32 out[MLX5_ST_SZ_DW(modify_virtio_net_q_out)] = {};
1224 	struct mlx5_vdpa_dev *mvdev = &ndev->mvdev;
1225 	struct mlx5_vdpa_mr *desc_mr = NULL;
1226 	struct mlx5_vdpa_mr *vq_mr = NULL;
1227 	bool state_change = false;
1228 	void *obj_context;
1229 	void *cmd_hdr;
1230 	void *vq_ctx;
1231 	void *in;
1232 	int err;
1233 
1234 	if (mvq->fw_state == MLX5_VIRTIO_NET_Q_OBJECT_NONE)
1235 		return 0;
1236 
1237 	if (!modifiable_virtqueue_fields(mvq))
1238 		return -EINVAL;
1239 
1240 	in = kzalloc(inlen, GFP_KERNEL);
1241 	if (!in)
1242 		return -ENOMEM;
1243 
1244 	cmd_hdr = MLX5_ADDR_OF(modify_virtio_net_q_in, in, general_obj_in_cmd_hdr);
1245 
1246 	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, opcode, MLX5_CMD_OP_MODIFY_GENERAL_OBJECT);
1247 	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, obj_type, MLX5_OBJ_TYPE_VIRTIO_NET_Q);
1248 	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, obj_id, mvq->virtq_id);
1249 	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, uid, ndev->mvdev.res.uid);
1250 
1251 	obj_context = MLX5_ADDR_OF(modify_virtio_net_q_in, in, obj_context);
1252 	vq_ctx = MLX5_ADDR_OF(virtio_net_q_object, obj_context, virtio_q_context);
1253 
1254 	if (mvq->modified_fields & MLX5_VIRTQ_MODIFY_MASK_STATE) {
1255 		if (!is_valid_state_change(mvq->fw_state, state, is_resumable(ndev))) {
1256 			err = -EINVAL;
1257 			goto done;
1258 		}
1259 
1260 		MLX5_SET(virtio_net_q_object, obj_context, state, state);
1261 		state_change = true;
1262 	}
1263 
1264 	if (mvq->modified_fields & MLX5_VIRTQ_MODIFY_MASK_VIRTIO_Q_ADDRS) {
1265 		MLX5_SET64(virtio_q, vq_ctx, desc_addr, mvq->desc_addr);
1266 		MLX5_SET64(virtio_q, vq_ctx, used_addr, mvq->device_addr);
1267 		MLX5_SET64(virtio_q, vq_ctx, available_addr, mvq->driver_addr);
1268 	}
1269 
1270 	if (mvq->modified_fields & MLX5_VIRTQ_MODIFY_MASK_VIRTIO_Q_AVAIL_IDX)
1271 		MLX5_SET(virtio_net_q_object, obj_context, hw_available_index, mvq->avail_idx);
1272 
1273 	if (mvq->modified_fields & MLX5_VIRTQ_MODIFY_MASK_VIRTIO_Q_USED_IDX)
1274 		MLX5_SET(virtio_net_q_object, obj_context, hw_used_index, mvq->used_idx);
1275 
1276 	if (mvq->modified_fields & MLX5_VIRTQ_MODIFY_MASK_VIRTIO_Q_MKEY) {
1277 		vq_mr = mvdev->mr[mvdev->group2asid[MLX5_VDPA_DATAVQ_GROUP]];
1278 
1279 		if (vq_mr)
1280 			MLX5_SET(virtio_q, vq_ctx, virtio_q_mkey, vq_mr->mkey);
1281 		else
1282 			mvq->modified_fields &= ~MLX5_VIRTQ_MODIFY_MASK_VIRTIO_Q_MKEY;
1283 	}
1284 
1285 	if (mvq->modified_fields & MLX5_VIRTQ_MODIFY_MASK_DESC_GROUP_MKEY) {
1286 		desc_mr = mvdev->mr[mvdev->group2asid[MLX5_VDPA_DATAVQ_DESC_GROUP]];
1287 
1288 		if (desc_mr && MLX5_CAP_DEV_VDPA_EMULATION(mvdev->mdev, desc_group_mkey_supported))
1289 			MLX5_SET(virtio_q, vq_ctx, desc_group_mkey, desc_mr->mkey);
1290 		else
1291 			mvq->modified_fields &= ~MLX5_VIRTQ_MODIFY_MASK_DESC_GROUP_MKEY;
1292 	}
1293 
1294 	MLX5_SET64(virtio_net_q_object, obj_context, modify_field_select, mvq->modified_fields);
1295 	err = mlx5_cmd_exec(ndev->mvdev.mdev, in, inlen, out, sizeof(out));
1296 	if (err)
1297 		goto done;
1298 
1299 	if (state_change)
1300 		mvq->fw_state = state;
1301 
1302 	if (mvq->modified_fields & MLX5_VIRTQ_MODIFY_MASK_VIRTIO_Q_MKEY) {
1303 		mlx5_vdpa_put_mr(mvdev, mvq->vq_mr);
1304 		mlx5_vdpa_get_mr(mvdev, vq_mr);
1305 		mvq->vq_mr = vq_mr;
1306 	}
1307 
1308 	if (mvq->modified_fields & MLX5_VIRTQ_MODIFY_MASK_DESC_GROUP_MKEY) {
1309 		mlx5_vdpa_put_mr(mvdev, mvq->desc_mr);
1310 		mlx5_vdpa_get_mr(mvdev, desc_mr);
1311 		mvq->desc_mr = desc_mr;
1312 	}
1313 
1314 	mvq->modified_fields = 0;
1315 
1316 done:
1317 	kfree(in);
1318 	return err;
1319 }
1320 
1321 static int modify_virtqueue_state(struct mlx5_vdpa_net *ndev,
1322 				  struct mlx5_vdpa_virtqueue *mvq,
1323 				  unsigned int state)
1324 {
1325 	mvq->modified_fields |= MLX5_VIRTQ_MODIFY_MASK_STATE;
1326 	return modify_virtqueue(ndev, mvq, state);
1327 }
1328 
1329 static int counter_set_alloc(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq)
1330 {
1331 	u32 in[MLX5_ST_SZ_DW(create_virtio_q_counters_in)] = {};
1332 	u32 out[MLX5_ST_SZ_DW(create_virtio_q_counters_out)] = {};
1333 	void *cmd_hdr;
1334 	int err;
1335 
1336 	if (!counters_supported(&ndev->mvdev))
1337 		return 0;
1338 
1339 	cmd_hdr = MLX5_ADDR_OF(create_virtio_q_counters_in, in, hdr);
1340 
1341 	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, opcode, MLX5_CMD_OP_CREATE_GENERAL_OBJECT);
1342 	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, obj_type, MLX5_OBJ_TYPE_VIRTIO_Q_COUNTERS);
1343 	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, uid, ndev->mvdev.res.uid);
1344 
1345 	err = mlx5_cmd_exec(ndev->mvdev.mdev, in, sizeof(in), out, sizeof(out));
1346 	if (err)
1347 		return err;
1348 
1349 	mvq->counter_set_id = MLX5_GET(general_obj_out_cmd_hdr, out, obj_id);
1350 
1351 	return 0;
1352 }
1353 
1354 static void counter_set_dealloc(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq)
1355 {
1356 	u32 in[MLX5_ST_SZ_DW(destroy_virtio_q_counters_in)] = {};
1357 	u32 out[MLX5_ST_SZ_DW(destroy_virtio_q_counters_out)] = {};
1358 
1359 	if (!counters_supported(&ndev->mvdev))
1360 		return;
1361 
1362 	MLX5_SET(destroy_virtio_q_counters_in, in, hdr.opcode, MLX5_CMD_OP_DESTROY_GENERAL_OBJECT);
1363 	MLX5_SET(destroy_virtio_q_counters_in, in, hdr.obj_id, mvq->counter_set_id);
1364 	MLX5_SET(destroy_virtio_q_counters_in, in, hdr.uid, ndev->mvdev.res.uid);
1365 	MLX5_SET(destroy_virtio_q_counters_in, in, hdr.obj_type, MLX5_OBJ_TYPE_VIRTIO_Q_COUNTERS);
1366 	if (mlx5_cmd_exec(ndev->mvdev.mdev, in, sizeof(in), out, sizeof(out)))
1367 		mlx5_vdpa_warn(&ndev->mvdev, "dealloc counter set 0x%x\n", mvq->counter_set_id);
1368 }
1369 
1370 static irqreturn_t mlx5_vdpa_int_handler(int irq, void *priv)
1371 {
1372 	struct vdpa_callback *cb = priv;
1373 
1374 	if (cb->callback)
1375 		return cb->callback(cb->private);
1376 
1377 	return IRQ_HANDLED;
1378 }
1379 
1380 static void alloc_vector(struct mlx5_vdpa_net *ndev,
1381 			 struct mlx5_vdpa_virtqueue *mvq)
1382 {
1383 	struct mlx5_vdpa_irq_pool *irqp = &ndev->irqp;
1384 	struct mlx5_vdpa_irq_pool_entry *ent;
1385 	int err;
1386 	int i;
1387 
1388 	for (i = 0; i < irqp->num_ent; i++) {
1389 		ent = &irqp->entries[i];
1390 		if (!ent->used) {
1391 			snprintf(ent->name, MLX5_VDPA_IRQ_NAME_LEN, "%s-vq-%d",
1392 				 dev_name(&ndev->mvdev.vdev.dev), mvq->index);
1393 			ent->dev_id = &ndev->event_cbs[mvq->index];
1394 			err = request_irq(ent->map.virq, mlx5_vdpa_int_handler, 0,
1395 					  ent->name, ent->dev_id);
1396 			if (err)
1397 				return;
1398 
1399 			ent->used = true;
1400 			mvq->map = ent->map;
1401 			return;
1402 		}
1403 	}
1404 }
1405 
1406 static void dealloc_vector(struct mlx5_vdpa_net *ndev,
1407 			   struct mlx5_vdpa_virtqueue *mvq)
1408 {
1409 	struct mlx5_vdpa_irq_pool *irqp = &ndev->irqp;
1410 	int i;
1411 
1412 	for (i = 0; i < irqp->num_ent; i++)
1413 		if (mvq->map.virq == irqp->entries[i].map.virq) {
1414 			free_irq(mvq->map.virq, irqp->entries[i].dev_id);
1415 			irqp->entries[i].used = false;
1416 			return;
1417 		}
1418 }
1419 
1420 static int setup_vq(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq)
1421 {
1422 	u16 idx = mvq->index;
1423 	int err;
1424 
1425 	if (!mvq->num_ent)
1426 		return 0;
1427 
1428 	if (mvq->initialized)
1429 		return 0;
1430 
1431 	err = cq_create(ndev, idx, mvq->num_ent);
1432 	if (err)
1433 		return err;
1434 
1435 	err = qp_create(ndev, mvq, &mvq->fwqp);
1436 	if (err)
1437 		goto err_fwqp;
1438 
1439 	err = qp_create(ndev, mvq, &mvq->vqqp);
1440 	if (err)
1441 		goto err_vqqp;
1442 
1443 	err = connect_qps(ndev, mvq);
1444 	if (err)
1445 		goto err_connect;
1446 
1447 	err = counter_set_alloc(ndev, mvq);
1448 	if (err)
1449 		goto err_connect;
1450 
1451 	alloc_vector(ndev, mvq);
1452 	err = create_virtqueue(ndev, mvq);
1453 	if (err)
1454 		goto err_vq;
1455 
1456 	if (mvq->ready) {
1457 		err = modify_virtqueue_state(ndev, mvq, MLX5_VIRTIO_NET_Q_OBJECT_STATE_RDY);
1458 		if (err) {
1459 			mlx5_vdpa_warn(&ndev->mvdev, "failed to modify to ready vq idx %d(%d)\n",
1460 				       idx, err);
1461 			goto err_modify;
1462 		}
1463 	}
1464 
1465 	mvq->initialized = true;
1466 	return 0;
1467 
1468 err_modify:
1469 	destroy_virtqueue(ndev, mvq);
1470 err_vq:
1471 	dealloc_vector(ndev, mvq);
1472 	counter_set_dealloc(ndev, mvq);
1473 err_connect:
1474 	qp_destroy(ndev, &mvq->vqqp);
1475 err_vqqp:
1476 	qp_destroy(ndev, &mvq->fwqp);
1477 err_fwqp:
1478 	cq_destroy(ndev, idx);
1479 	return err;
1480 }
1481 
1482 static void suspend_vq(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq)
1483 {
1484 	struct mlx5_virtq_attr attr;
1485 
1486 	if (!mvq->initialized)
1487 		return;
1488 
1489 	if (mvq->fw_state != MLX5_VIRTIO_NET_Q_OBJECT_STATE_RDY)
1490 		return;
1491 
1492 	if (modify_virtqueue_state(ndev, mvq, MLX5_VIRTIO_NET_Q_OBJECT_STATE_SUSPEND))
1493 		mlx5_vdpa_warn(&ndev->mvdev, "modify to suspend failed\n");
1494 
1495 	if (query_virtqueue(ndev, mvq, &attr)) {
1496 		mlx5_vdpa_warn(&ndev->mvdev, "failed to query virtqueue\n");
1497 		return;
1498 	}
1499 	mvq->avail_idx = attr.available_index;
1500 	mvq->used_idx = attr.used_index;
1501 }
1502 
1503 static void suspend_vqs(struct mlx5_vdpa_net *ndev)
1504 {
1505 	int i;
1506 
1507 	for (i = 0; i < ndev->mvdev.max_vqs; i++)
1508 		suspend_vq(ndev, &ndev->vqs[i]);
1509 }
1510 
1511 static void resume_vq(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq)
1512 {
1513 	if (!mvq->initialized || !is_resumable(ndev))
1514 		return;
1515 
1516 	if (mvq->fw_state != MLX5_VIRTIO_NET_Q_OBJECT_STATE_SUSPEND)
1517 		return;
1518 
1519 	if (modify_virtqueue_state(ndev, mvq, MLX5_VIRTIO_NET_Q_OBJECT_STATE_RDY))
1520 		mlx5_vdpa_warn(&ndev->mvdev, "modify to resume failed for vq %u\n", mvq->index);
1521 }
1522 
1523 static void resume_vqs(struct mlx5_vdpa_net *ndev)
1524 {
1525 	for (int i = 0; i < ndev->mvdev.max_vqs; i++)
1526 		resume_vq(ndev, &ndev->vqs[i]);
1527 }
1528 
1529 static void teardown_vq(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq)
1530 {
1531 	if (!mvq->initialized)
1532 		return;
1533 
1534 	suspend_vq(ndev, mvq);
1535 	mvq->modified_fields = 0;
1536 	destroy_virtqueue(ndev, mvq);
1537 	dealloc_vector(ndev, mvq);
1538 	counter_set_dealloc(ndev, mvq);
1539 	qp_destroy(ndev, &mvq->vqqp);
1540 	qp_destroy(ndev, &mvq->fwqp);
1541 	cq_destroy(ndev, mvq->index);
1542 	mvq->initialized = false;
1543 }
1544 
1545 static int create_rqt(struct mlx5_vdpa_net *ndev)
1546 {
1547 	int rqt_table_size = roundup_pow_of_two(ndev->rqt_size);
1548 	int act_sz = roundup_pow_of_two(ndev->cur_num_vqs / 2);
1549 	__be32 *list;
1550 	void *rqtc;
1551 	int inlen;
1552 	void *in;
1553 	int i, j;
1554 	int err;
1555 
1556 	inlen = MLX5_ST_SZ_BYTES(create_rqt_in) + rqt_table_size * MLX5_ST_SZ_BYTES(rq_num);
1557 	in = kzalloc(inlen, GFP_KERNEL);
1558 	if (!in)
1559 		return -ENOMEM;
1560 
1561 	MLX5_SET(create_rqt_in, in, uid, ndev->mvdev.res.uid);
1562 	rqtc = MLX5_ADDR_OF(create_rqt_in, in, rqt_context);
1563 
1564 	MLX5_SET(rqtc, rqtc, list_q_type, MLX5_RQTC_LIST_Q_TYPE_VIRTIO_NET_Q);
1565 	MLX5_SET(rqtc, rqtc, rqt_max_size, rqt_table_size);
1566 	list = MLX5_ADDR_OF(rqtc, rqtc, rq_num[0]);
1567 	for (i = 0, j = 0; i < act_sz; i++, j += 2)
1568 		list[i] = cpu_to_be32(ndev->vqs[j % ndev->cur_num_vqs].virtq_id);
1569 
1570 	MLX5_SET(rqtc, rqtc, rqt_actual_size, act_sz);
1571 	err = mlx5_vdpa_create_rqt(&ndev->mvdev, in, inlen, &ndev->res.rqtn);
1572 	kfree(in);
1573 	if (err)
1574 		return err;
1575 
1576 	return 0;
1577 }
1578 
1579 #define MLX5_MODIFY_RQT_NUM_RQS ((u64)1)
1580 
1581 static int modify_rqt(struct mlx5_vdpa_net *ndev, int num)
1582 {
1583 	int act_sz = roundup_pow_of_two(num / 2);
1584 	__be32 *list;
1585 	void *rqtc;
1586 	int inlen;
1587 	void *in;
1588 	int i, j;
1589 	int err;
1590 
1591 	inlen = MLX5_ST_SZ_BYTES(modify_rqt_in) + act_sz * MLX5_ST_SZ_BYTES(rq_num);
1592 	in = kzalloc(inlen, GFP_KERNEL);
1593 	if (!in)
1594 		return -ENOMEM;
1595 
1596 	MLX5_SET(modify_rqt_in, in, uid, ndev->mvdev.res.uid);
1597 	MLX5_SET64(modify_rqt_in, in, bitmask, MLX5_MODIFY_RQT_NUM_RQS);
1598 	rqtc = MLX5_ADDR_OF(modify_rqt_in, in, ctx);
1599 	MLX5_SET(rqtc, rqtc, list_q_type, MLX5_RQTC_LIST_Q_TYPE_VIRTIO_NET_Q);
1600 
1601 	list = MLX5_ADDR_OF(rqtc, rqtc, rq_num[0]);
1602 	for (i = 0, j = 0; i < act_sz; i++, j = j + 2)
1603 		list[i] = cpu_to_be32(ndev->vqs[j % num].virtq_id);
1604 
1605 	MLX5_SET(rqtc, rqtc, rqt_actual_size, act_sz);
1606 	err = mlx5_vdpa_modify_rqt(&ndev->mvdev, in, inlen, ndev->res.rqtn);
1607 	kfree(in);
1608 	if (err)
1609 		return err;
1610 
1611 	return 0;
1612 }
1613 
1614 static void destroy_rqt(struct mlx5_vdpa_net *ndev)
1615 {
1616 	mlx5_vdpa_destroy_rqt(&ndev->mvdev, ndev->res.rqtn);
1617 }
1618 
1619 static int create_tir(struct mlx5_vdpa_net *ndev)
1620 {
1621 #define HASH_IP_L4PORTS                                                                            \
1622 	(MLX5_HASH_FIELD_SEL_SRC_IP | MLX5_HASH_FIELD_SEL_DST_IP | MLX5_HASH_FIELD_SEL_L4_SPORT |  \
1623 	 MLX5_HASH_FIELD_SEL_L4_DPORT)
1624 	static const u8 rx_hash_toeplitz_key[] = { 0x2c, 0xc6, 0x81, 0xd1, 0x5b, 0xdb, 0xf4, 0xf7,
1625 						   0xfc, 0xa2, 0x83, 0x19, 0xdb, 0x1a, 0x3e, 0x94,
1626 						   0x6b, 0x9e, 0x38, 0xd9, 0x2c, 0x9c, 0x03, 0xd1,
1627 						   0xad, 0x99, 0x44, 0xa7, 0xd9, 0x56, 0x3d, 0x59,
1628 						   0x06, 0x3c, 0x25, 0xf3, 0xfc, 0x1f, 0xdc, 0x2a };
1629 	void *rss_key;
1630 	void *outer;
1631 	void *tirc;
1632 	void *in;
1633 	int err;
1634 
1635 	in = kzalloc(MLX5_ST_SZ_BYTES(create_tir_in), GFP_KERNEL);
1636 	if (!in)
1637 		return -ENOMEM;
1638 
1639 	MLX5_SET(create_tir_in, in, uid, ndev->mvdev.res.uid);
1640 	tirc = MLX5_ADDR_OF(create_tir_in, in, ctx);
1641 	MLX5_SET(tirc, tirc, disp_type, MLX5_TIRC_DISP_TYPE_INDIRECT);
1642 
1643 	MLX5_SET(tirc, tirc, rx_hash_symmetric, 1);
1644 	MLX5_SET(tirc, tirc, rx_hash_fn, MLX5_RX_HASH_FN_TOEPLITZ);
1645 	rss_key = MLX5_ADDR_OF(tirc, tirc, rx_hash_toeplitz_key);
1646 	memcpy(rss_key, rx_hash_toeplitz_key, sizeof(rx_hash_toeplitz_key));
1647 
1648 	outer = MLX5_ADDR_OF(tirc, tirc, rx_hash_field_selector_outer);
1649 	MLX5_SET(rx_hash_field_select, outer, l3_prot_type, MLX5_L3_PROT_TYPE_IPV4);
1650 	MLX5_SET(rx_hash_field_select, outer, l4_prot_type, MLX5_L4_PROT_TYPE_TCP);
1651 	MLX5_SET(rx_hash_field_select, outer, selected_fields, HASH_IP_L4PORTS);
1652 
1653 	MLX5_SET(tirc, tirc, indirect_table, ndev->res.rqtn);
1654 	MLX5_SET(tirc, tirc, transport_domain, ndev->res.tdn);
1655 
1656 	err = mlx5_vdpa_create_tir(&ndev->mvdev, in, &ndev->res.tirn);
1657 	kfree(in);
1658 	if (err)
1659 		return err;
1660 
1661 	mlx5_vdpa_add_tirn(ndev);
1662 	return err;
1663 }
1664 
1665 static void destroy_tir(struct mlx5_vdpa_net *ndev)
1666 {
1667 	mlx5_vdpa_remove_tirn(ndev);
1668 	mlx5_vdpa_destroy_tir(&ndev->mvdev, ndev->res.tirn);
1669 }
1670 
/* Limits given to the RX flow table in setup_steering(): maximum number of
 * flow table entries and maximum number of auto-created flow groups.
 */
#define MAX_STEERING_ENT 0x8000
#define MAX_STEERING_GROUPS 2

/* Number of destinations per steering rule. With
 * CONFIG_MLX5_VDPA_STEERING_DEBUG a flow-counter destination is added after
 * the TIR destination.
 */
#if defined(CONFIG_MLX5_VDPA_STEERING_DEBUG)
       #define NUM_DESTS 2
#else
       #define NUM_DESTS 1
#endif
1679 
/* With CONFIG_MLX5_VDPA_STEERING_DEBUG: create the unicast and multicast
 * flow counters of @node, mark dests[1] as a counter destination and add the
 * COUNT action to @flow_act. The counter ids themselves are filled in by the
 * caller. Returns 0 or the error from mlx5_fc_create(); on failure no counter
 * is left allocated.
 *
 * Without that option this is a no-op returning 0.
 */
static int add_steering_counters(struct mlx5_vdpa_net *ndev,
				 struct macvlan_node *node,
				 struct mlx5_flow_act *flow_act,
				 struct mlx5_flow_destination *dests)
{
#if defined(CONFIG_MLX5_VDPA_STEERING_DEBUG)
	int err;

	node->ucast_counter.counter = mlx5_fc_create(ndev->mvdev.mdev, false);
	if (IS_ERR(node->ucast_counter.counter))
		return PTR_ERR(node->ucast_counter.counter);

	node->mcast_counter.counter = mlx5_fc_create(ndev->mvdev.mdev, false);
	if (IS_ERR(node->mcast_counter.counter)) {
		err = PTR_ERR(node->mcast_counter.counter);
		/* Roll back the unicast counter created above */
		mlx5_fc_destroy(ndev->mvdev.mdev, node->ucast_counter.counter);
		return err;
	}

	dests[1].type = MLX5_FLOW_DESTINATION_TYPE_COUNTER;
	flow_act->action |= MLX5_FLOW_CONTEXT_ACTION_COUNT;
	return 0;
#else
	return 0;
#endif
}
1709 
/* Destroy the multicast and unicast flow counters of @node. Compiles to
 * nothing unless CONFIG_MLX5_VDPA_STEERING_DEBUG is set.
 */
static void remove_steering_counters(struct mlx5_vdpa_net *ndev,
				     struct macvlan_node *node)
{
#if defined(CONFIG_MLX5_VDPA_STEERING_DEBUG)
	struct mlx5_core_dev *mdev = ndev->mvdev.mdev;

	mlx5_fc_destroy(mdev, node->mcast_counter.counter);
	mlx5_fc_destroy(mdev, node->ucast_counter.counter);
#endif
}
1718 
1719 static int mlx5_vdpa_add_mac_vlan_rules(struct mlx5_vdpa_net *ndev, u8 *mac,
1720 					struct macvlan_node *node)
1721 {
1722 	struct mlx5_flow_destination dests[NUM_DESTS] = {};
1723 	struct mlx5_flow_act flow_act = {};
1724 	struct mlx5_flow_spec *spec;
1725 	void *headers_c;
1726 	void *headers_v;
1727 	u8 *dmac_c;
1728 	u8 *dmac_v;
1729 	int err;
1730 	u16 vid;
1731 
1732 	spec = kvzalloc(sizeof(*spec), GFP_KERNEL);
1733 	if (!spec)
1734 		return -ENOMEM;
1735 
1736 	vid = key2vid(node->macvlan);
1737 	spec->match_criteria_enable = MLX5_MATCH_OUTER_HEADERS;
1738 	headers_c = MLX5_ADDR_OF(fte_match_param, spec->match_criteria, outer_headers);
1739 	headers_v = MLX5_ADDR_OF(fte_match_param, spec->match_value, outer_headers);
1740 	dmac_c = MLX5_ADDR_OF(fte_match_param, headers_c, outer_headers.dmac_47_16);
1741 	dmac_v = MLX5_ADDR_OF(fte_match_param, headers_v, outer_headers.dmac_47_16);
1742 	eth_broadcast_addr(dmac_c);
1743 	ether_addr_copy(dmac_v, mac);
1744 	if (ndev->mvdev.actual_features & BIT_ULL(VIRTIO_NET_F_CTRL_VLAN)) {
1745 		MLX5_SET(fte_match_set_lyr_2_4, headers_c, cvlan_tag, 1);
1746 		MLX5_SET_TO_ONES(fte_match_set_lyr_2_4, headers_c, first_vid);
1747 	}
1748 	if (node->tagged) {
1749 		MLX5_SET(fte_match_set_lyr_2_4, headers_v, cvlan_tag, 1);
1750 		MLX5_SET(fte_match_set_lyr_2_4, headers_v, first_vid, vid);
1751 	}
1752 	flow_act.action = MLX5_FLOW_CONTEXT_ACTION_FWD_DEST;
1753 	dests[0].type = MLX5_FLOW_DESTINATION_TYPE_TIR;
1754 	dests[0].tir_num = ndev->res.tirn;
1755 	err = add_steering_counters(ndev, node, &flow_act, dests);
1756 	if (err)
1757 		goto out_free;
1758 
1759 #if defined(CONFIG_MLX5_VDPA_STEERING_DEBUG)
1760 	dests[1].counter_id = mlx5_fc_id(node->ucast_counter.counter);
1761 #endif
1762 	node->ucast_rule = mlx5_add_flow_rules(ndev->rxft, spec, &flow_act, dests, NUM_DESTS);
1763 	if (IS_ERR(node->ucast_rule)) {
1764 		err = PTR_ERR(node->ucast_rule);
1765 		goto err_ucast;
1766 	}
1767 
1768 #if defined(CONFIG_MLX5_VDPA_STEERING_DEBUG)
1769 	dests[1].counter_id = mlx5_fc_id(node->mcast_counter.counter);
1770 #endif
1771 
1772 	memset(dmac_c, 0, ETH_ALEN);
1773 	memset(dmac_v, 0, ETH_ALEN);
1774 	dmac_c[0] = 1;
1775 	dmac_v[0] = 1;
1776 	node->mcast_rule = mlx5_add_flow_rules(ndev->rxft, spec, &flow_act, dests, NUM_DESTS);
1777 	if (IS_ERR(node->mcast_rule)) {
1778 		err = PTR_ERR(node->mcast_rule);
1779 		goto err_mcast;
1780 	}
1781 	kvfree(spec);
1782 	mlx5_vdpa_add_rx_counters(ndev, node);
1783 	return 0;
1784 
1785 err_mcast:
1786 	mlx5_del_flow_rules(node->ucast_rule);
1787 err_ucast:
1788 	remove_steering_counters(ndev, node);
1789 out_free:
1790 	kvfree(spec);
1791 	return err;
1792 }
1793 
1794 static void mlx5_vdpa_del_mac_vlan_rules(struct mlx5_vdpa_net *ndev,
1795 					 struct macvlan_node *node)
1796 {
1797 	mlx5_vdpa_remove_rx_counters(ndev, node);
1798 	mlx5_del_flow_rules(node->ucast_rule);
1799 	mlx5_del_flow_rules(node->mcast_rule);
1800 }
1801 
1802 static u64 search_val(u8 *mac, u16 vlan, bool tagged)
1803 {
1804 	u64 val;
1805 
1806 	if (!tagged)
1807 		vlan = MLX5V_UNTAGGED;
1808 
1809 	val = (u64)vlan << 48 |
1810 	      (u64)mac[0] << 40 |
1811 	      (u64)mac[1] << 32 |
1812 	      (u64)mac[2] << 24 |
1813 	      (u64)mac[3] << 16 |
1814 	      (u64)mac[4] << 8 |
1815 	      (u64)mac[5];
1816 
1817 	return val;
1818 }
1819 
1820 static struct macvlan_node *mac_vlan_lookup(struct mlx5_vdpa_net *ndev, u64 value)
1821 {
1822 	struct macvlan_node *pos;
1823 	u32 idx;
1824 
1825 	idx = hash_64(value, 8); // tbd 8
1826 	hlist_for_each_entry(pos, &ndev->macvlan_hash[idx], hlist) {
1827 		if (pos->macvlan == value)
1828 			return pos;
1829 	}
1830 	return NULL;
1831 }
1832 
1833 static int mac_vlan_add(struct mlx5_vdpa_net *ndev, u8 *mac, u16 vid, bool tagged)
1834 {
1835 	struct macvlan_node *ptr;
1836 	u64 val;
1837 	u32 idx;
1838 	int err;
1839 
1840 	val = search_val(mac, vid, tagged);
1841 	if (mac_vlan_lookup(ndev, val))
1842 		return -EEXIST;
1843 
1844 	ptr = kzalloc(sizeof(*ptr), GFP_KERNEL);
1845 	if (!ptr)
1846 		return -ENOMEM;
1847 
1848 	ptr->tagged = tagged;
1849 	ptr->macvlan = val;
1850 	ptr->ndev = ndev;
1851 	err = mlx5_vdpa_add_mac_vlan_rules(ndev, ndev->config.mac, ptr);
1852 	if (err)
1853 		goto err_add;
1854 
1855 	idx = hash_64(val, 8);
1856 	hlist_add_head(&ptr->hlist, &ndev->macvlan_hash[idx]);
1857 	return 0;
1858 
1859 err_add:
1860 	kfree(ptr);
1861 	return err;
1862 }
1863 
1864 static void mac_vlan_del(struct mlx5_vdpa_net *ndev, u8 *mac, u16 vlan, bool tagged)
1865 {
1866 	struct macvlan_node *ptr;
1867 
1868 	ptr = mac_vlan_lookup(ndev, search_val(mac, vlan, tagged));
1869 	if (!ptr)
1870 		return;
1871 
1872 	hlist_del(&ptr->hlist);
1873 	mlx5_vdpa_del_mac_vlan_rules(ndev, ptr);
1874 	remove_steering_counters(ndev, ptr);
1875 	kfree(ptr);
1876 }
1877 
1878 static void clear_mac_vlan_table(struct mlx5_vdpa_net *ndev)
1879 {
1880 	struct macvlan_node *pos;
1881 	struct hlist_node *n;
1882 	int i;
1883 
1884 	for (i = 0; i < MLX5V_MACVLAN_SIZE; i++) {
1885 		hlist_for_each_entry_safe(pos, n, &ndev->macvlan_hash[i], hlist) {
1886 			hlist_del(&pos->hlist);
1887 			mlx5_vdpa_del_mac_vlan_rules(ndev, pos);
1888 			remove_steering_counters(ndev, pos);
1889 			kfree(pos);
1890 		}
1891 	}
1892 }
1893 
1894 static int setup_steering(struct mlx5_vdpa_net *ndev)
1895 {
1896 	struct mlx5_flow_table_attr ft_attr = {};
1897 	struct mlx5_flow_namespace *ns;
1898 	int err;
1899 
1900 	ft_attr.max_fte = MAX_STEERING_ENT;
1901 	ft_attr.autogroup.max_num_groups = MAX_STEERING_GROUPS;
1902 
1903 	ns = mlx5_get_flow_namespace(ndev->mvdev.mdev, MLX5_FLOW_NAMESPACE_BYPASS);
1904 	if (!ns) {
1905 		mlx5_vdpa_warn(&ndev->mvdev, "failed to get flow namespace\n");
1906 		return -EOPNOTSUPP;
1907 	}
1908 
1909 	ndev->rxft = mlx5_create_auto_grouped_flow_table(ns, &ft_attr);
1910 	if (IS_ERR(ndev->rxft)) {
1911 		mlx5_vdpa_warn(&ndev->mvdev, "failed to create flow table\n");
1912 		return PTR_ERR(ndev->rxft);
1913 	}
1914 	mlx5_vdpa_add_rx_flow_table(ndev);
1915 
1916 	err = mac_vlan_add(ndev, ndev->config.mac, 0, false);
1917 	if (err)
1918 		goto err_add;
1919 
1920 	return 0;
1921 
1922 err_add:
1923 	mlx5_vdpa_remove_rx_flow_table(ndev);
1924 	mlx5_destroy_flow_table(ndev->rxft);
1925 	return err;
1926 }
1927 
1928 static void teardown_steering(struct mlx5_vdpa_net *ndev)
1929 {
1930 	clear_mac_vlan_table(ndev);
1931 	mlx5_vdpa_remove_rx_flow_table(ndev);
1932 	mlx5_destroy_flow_table(ndev->rxft);
1933 }
1934 
1935 static virtio_net_ctrl_ack handle_ctrl_mac(struct mlx5_vdpa_dev *mvdev, u8 cmd)
1936 {
1937 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
1938 	struct mlx5_control_vq *cvq = &mvdev->cvq;
1939 	virtio_net_ctrl_ack status = VIRTIO_NET_ERR;
1940 	struct mlx5_core_dev *pfmdev;
1941 	size_t read;
1942 	u8 mac[ETH_ALEN], mac_back[ETH_ALEN];
1943 
1944 	pfmdev = pci_get_drvdata(pci_physfn(mvdev->mdev->pdev));
1945 	switch (cmd) {
1946 	case VIRTIO_NET_CTRL_MAC_ADDR_SET:
1947 		read = vringh_iov_pull_iotlb(&cvq->vring, &cvq->riov, (void *)mac, ETH_ALEN);
1948 		if (read != ETH_ALEN)
1949 			break;
1950 
1951 		if (!memcmp(ndev->config.mac, mac, 6)) {
1952 			status = VIRTIO_NET_OK;
1953 			break;
1954 		}
1955 
1956 		if (is_zero_ether_addr(mac))
1957 			break;
1958 
1959 		if (!is_zero_ether_addr(ndev->config.mac)) {
1960 			if (mlx5_mpfs_del_mac(pfmdev, ndev->config.mac)) {
1961 				mlx5_vdpa_warn(mvdev, "failed to delete old MAC %pM from MPFS table\n",
1962 					       ndev->config.mac);
1963 				break;
1964 			}
1965 		}
1966 
1967 		if (mlx5_mpfs_add_mac(pfmdev, mac)) {
1968 			mlx5_vdpa_warn(mvdev, "failed to insert new MAC %pM into MPFS table\n",
1969 				       mac);
1970 			break;
1971 		}
1972 
1973 		/* backup the original mac address so that if failed to add the forward rules
1974 		 * we could restore it
1975 		 */
1976 		memcpy(mac_back, ndev->config.mac, ETH_ALEN);
1977 
1978 		memcpy(ndev->config.mac, mac, ETH_ALEN);
1979 
1980 		/* Need recreate the flow table entry, so that the packet could forward back
1981 		 */
1982 		mac_vlan_del(ndev, mac_back, 0, false);
1983 
1984 		if (mac_vlan_add(ndev, ndev->config.mac, 0, false)) {
1985 			mlx5_vdpa_warn(mvdev, "failed to insert forward rules, try to restore\n");
1986 
1987 			/* Although it hardly run here, we still need double check */
1988 			if (is_zero_ether_addr(mac_back)) {
1989 				mlx5_vdpa_warn(mvdev, "restore mac failed: Original MAC is zero\n");
1990 				break;
1991 			}
1992 
1993 			/* Try to restore original mac address to MFPS table, and try to restore
1994 			 * the forward rule entry.
1995 			 */
1996 			if (mlx5_mpfs_del_mac(pfmdev, ndev->config.mac)) {
1997 				mlx5_vdpa_warn(mvdev, "restore mac failed: delete MAC %pM from MPFS table failed\n",
1998 					       ndev->config.mac);
1999 			}
2000 
2001 			if (mlx5_mpfs_add_mac(pfmdev, mac_back)) {
2002 				mlx5_vdpa_warn(mvdev, "restore mac failed: insert old MAC %pM into MPFS table failed\n",
2003 					       mac_back);
2004 			}
2005 
2006 			memcpy(ndev->config.mac, mac_back, ETH_ALEN);
2007 
2008 			if (mac_vlan_add(ndev, ndev->config.mac, 0, false))
2009 				mlx5_vdpa_warn(mvdev, "restore forward rules failed: insert forward rules failed\n");
2010 
2011 			break;
2012 		}
2013 
2014 		status = VIRTIO_NET_OK;
2015 		break;
2016 
2017 	default:
2018 		break;
2019 	}
2020 
2021 	return status;
2022 }
2023 
2024 static int change_num_qps(struct mlx5_vdpa_dev *mvdev, int newqps)
2025 {
2026 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
2027 	int cur_qps = ndev->cur_num_vqs / 2;
2028 	int err;
2029 	int i;
2030 
2031 	if (cur_qps > newqps) {
2032 		err = modify_rqt(ndev, 2 * newqps);
2033 		if (err)
2034 			return err;
2035 
2036 		for (i = ndev->cur_num_vqs - 1; i >= 2 * newqps; i--)
2037 			teardown_vq(ndev, &ndev->vqs[i]);
2038 
2039 		ndev->cur_num_vqs = 2 * newqps;
2040 	} else {
2041 		ndev->cur_num_vqs = 2 * newqps;
2042 		for (i = cur_qps * 2; i < 2 * newqps; i++) {
2043 			err = setup_vq(ndev, &ndev->vqs[i]);
2044 			if (err)
2045 				goto clean_added;
2046 		}
2047 		err = modify_rqt(ndev, 2 * newqps);
2048 		if (err)
2049 			goto clean_added;
2050 	}
2051 	return 0;
2052 
2053 clean_added:
2054 	for (--i; i >= 2 * cur_qps; --i)
2055 		teardown_vq(ndev, &ndev->vqs[i]);
2056 
2057 	ndev->cur_num_vqs = 2 * cur_qps;
2058 
2059 	return err;
2060 }
2061 
2062 static virtio_net_ctrl_ack handle_ctrl_mq(struct mlx5_vdpa_dev *mvdev, u8 cmd)
2063 {
2064 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
2065 	virtio_net_ctrl_ack status = VIRTIO_NET_ERR;
2066 	struct mlx5_control_vq *cvq = &mvdev->cvq;
2067 	struct virtio_net_ctrl_mq mq;
2068 	size_t read;
2069 	u16 newqps;
2070 
2071 	switch (cmd) {
2072 	case VIRTIO_NET_CTRL_MQ_VQ_PAIRS_SET:
2073 		/* This mq feature check aligns with pre-existing userspace
2074 		 * implementation.
2075 		 *
2076 		 * Without it, an untrusted driver could fake a multiqueue config
2077 		 * request down to a non-mq device that may cause kernel to
2078 		 * panic due to uninitialized resources for extra vqs. Even with
2079 		 * a well behaving guest driver, it is not expected to allow
2080 		 * changing the number of vqs on a non-mq device.
2081 		 */
2082 		if (!MLX5_FEATURE(mvdev, VIRTIO_NET_F_MQ))
2083 			break;
2084 
2085 		read = vringh_iov_pull_iotlb(&cvq->vring, &cvq->riov, (void *)&mq, sizeof(mq));
2086 		if (read != sizeof(mq))
2087 			break;
2088 
2089 		newqps = mlx5vdpa16_to_cpu(mvdev, mq.virtqueue_pairs);
2090 		if (newqps < VIRTIO_NET_CTRL_MQ_VQ_PAIRS_MIN ||
2091 		    newqps > ndev->rqt_size)
2092 			break;
2093 
2094 		if (ndev->cur_num_vqs == 2 * newqps) {
2095 			status = VIRTIO_NET_OK;
2096 			break;
2097 		}
2098 
2099 		if (!change_num_qps(mvdev, newqps))
2100 			status = VIRTIO_NET_OK;
2101 
2102 		break;
2103 	default:
2104 		break;
2105 	}
2106 
2107 	return status;
2108 }
2109 
2110 static virtio_net_ctrl_ack handle_ctrl_vlan(struct mlx5_vdpa_dev *mvdev, u8 cmd)
2111 {
2112 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
2113 	virtio_net_ctrl_ack status = VIRTIO_NET_ERR;
2114 	struct mlx5_control_vq *cvq = &mvdev->cvq;
2115 	__virtio16 vlan;
2116 	size_t read;
2117 	u16 id;
2118 
2119 	if (!(ndev->mvdev.actual_features & BIT_ULL(VIRTIO_NET_F_CTRL_VLAN)))
2120 		return status;
2121 
2122 	switch (cmd) {
2123 	case VIRTIO_NET_CTRL_VLAN_ADD:
2124 		read = vringh_iov_pull_iotlb(&cvq->vring, &cvq->riov, &vlan, sizeof(vlan));
2125 		if (read != sizeof(vlan))
2126 			break;
2127 
2128 		id = mlx5vdpa16_to_cpu(mvdev, vlan);
2129 		if (mac_vlan_add(ndev, ndev->config.mac, id, true))
2130 			break;
2131 
2132 		status = VIRTIO_NET_OK;
2133 		break;
2134 	case VIRTIO_NET_CTRL_VLAN_DEL:
2135 		read = vringh_iov_pull_iotlb(&cvq->vring, &cvq->riov, &vlan, sizeof(vlan));
2136 		if (read != sizeof(vlan))
2137 			break;
2138 
2139 		id = mlx5vdpa16_to_cpu(mvdev, vlan);
2140 		mac_vlan_del(ndev, ndev->config.mac, id, true);
2141 		status = VIRTIO_NET_OK;
2142 		break;
2143 	default:
2144 		break;
2145 	}
2146 
2147 	return status;
2148 }
2149 
2150 static void mlx5_cvq_kick_handler(struct work_struct *work)
2151 {
2152 	virtio_net_ctrl_ack status = VIRTIO_NET_ERR;
2153 	struct virtio_net_ctrl_hdr ctrl;
2154 	struct mlx5_vdpa_wq_ent *wqent;
2155 	struct mlx5_vdpa_dev *mvdev;
2156 	struct mlx5_control_vq *cvq;
2157 	struct mlx5_vdpa_net *ndev;
2158 	size_t read, write;
2159 	int err;
2160 
2161 	wqent = container_of(work, struct mlx5_vdpa_wq_ent, work);
2162 	mvdev = wqent->mvdev;
2163 	ndev = to_mlx5_vdpa_ndev(mvdev);
2164 	cvq = &mvdev->cvq;
2165 
2166 	down_write(&ndev->reslock);
2167 
2168 	if (!(mvdev->status & VIRTIO_CONFIG_S_DRIVER_OK))
2169 		goto out;
2170 
2171 	if (!(ndev->mvdev.actual_features & BIT_ULL(VIRTIO_NET_F_CTRL_VQ)))
2172 		goto out;
2173 
2174 	if (!cvq->ready)
2175 		goto out;
2176 
2177 	while (true) {
2178 		err = vringh_getdesc_iotlb(&cvq->vring, &cvq->riov, &cvq->wiov, &cvq->head,
2179 					   GFP_ATOMIC);
2180 		if (err <= 0)
2181 			break;
2182 
2183 		read = vringh_iov_pull_iotlb(&cvq->vring, &cvq->riov, &ctrl, sizeof(ctrl));
2184 		if (read != sizeof(ctrl))
2185 			break;
2186 
2187 		cvq->received_desc++;
2188 		switch (ctrl.class) {
2189 		case VIRTIO_NET_CTRL_MAC:
2190 			status = handle_ctrl_mac(mvdev, ctrl.cmd);
2191 			break;
2192 		case VIRTIO_NET_CTRL_MQ:
2193 			status = handle_ctrl_mq(mvdev, ctrl.cmd);
2194 			break;
2195 		case VIRTIO_NET_CTRL_VLAN:
2196 			status = handle_ctrl_vlan(mvdev, ctrl.cmd);
2197 			break;
2198 		default:
2199 			break;
2200 		}
2201 
2202 		/* Make sure data is written before advancing index */
2203 		smp_wmb();
2204 
2205 		write = vringh_iov_push_iotlb(&cvq->vring, &cvq->wiov, &status, sizeof(status));
2206 		vringh_complete_iotlb(&cvq->vring, cvq->head, write);
2207 		vringh_kiov_cleanup(&cvq->riov);
2208 		vringh_kiov_cleanup(&cvq->wiov);
2209 
2210 		if (vringh_need_notify_iotlb(&cvq->vring))
2211 			vringh_notify(&cvq->vring);
2212 
2213 		cvq->completed_desc++;
2214 		queue_work(mvdev->wq, &wqent->work);
2215 		break;
2216 	}
2217 
2218 out:
2219 	up_write(&ndev->reslock);
2220 }
2221 
2222 static void mlx5_vdpa_kick_vq(struct vdpa_device *vdev, u16 idx)
2223 {
2224 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
2225 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
2226 	struct mlx5_vdpa_virtqueue *mvq;
2227 
2228 	if (!is_index_valid(mvdev, idx))
2229 		return;
2230 
2231 	if (unlikely(is_ctrl_vq_idx(mvdev, idx))) {
2232 		if (!mvdev->wq || !mvdev->cvq.ready)
2233 			return;
2234 
2235 		queue_work(mvdev->wq, &ndev->cvq_ent.work);
2236 		return;
2237 	}
2238 
2239 	mvq = &ndev->vqs[idx];
2240 	if (unlikely(!mvq->ready))
2241 		return;
2242 
2243 	iowrite16(idx, ndev->mvdev.res.kick_addr);
2244 }
2245 
2246 static int mlx5_vdpa_set_vq_address(struct vdpa_device *vdev, u16 idx, u64 desc_area,
2247 				    u64 driver_area, u64 device_area)
2248 {
2249 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
2250 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
2251 	struct mlx5_vdpa_virtqueue *mvq;
2252 
2253 	if (!is_index_valid(mvdev, idx))
2254 		return -EINVAL;
2255 
2256 	if (is_ctrl_vq_idx(mvdev, idx)) {
2257 		mvdev->cvq.desc_addr = desc_area;
2258 		mvdev->cvq.device_addr = device_area;
2259 		mvdev->cvq.driver_addr = driver_area;
2260 		return 0;
2261 	}
2262 
2263 	mvq = &ndev->vqs[idx];
2264 	mvq->desc_addr = desc_area;
2265 	mvq->device_addr = device_area;
2266 	mvq->driver_addr = driver_area;
2267 	mvq->modified_fields |= MLX5_VIRTQ_MODIFY_MASK_VIRTIO_Q_ADDRS;
2268 	return 0;
2269 }
2270 
2271 static void mlx5_vdpa_set_vq_num(struct vdpa_device *vdev, u16 idx, u32 num)
2272 {
2273 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
2274 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
2275 	struct mlx5_vdpa_virtqueue *mvq;
2276 
2277 	if (!is_index_valid(mvdev, idx))
2278 		return;
2279 
2280         if (is_ctrl_vq_idx(mvdev, idx)) {
2281                 struct mlx5_control_vq *cvq = &mvdev->cvq;
2282 
2283                 cvq->vring.vring.num = num;
2284                 return;
2285         }
2286 
2287 	mvq = &ndev->vqs[idx];
2288 	mvq->num_ent = num;
2289 }
2290 
2291 static void mlx5_vdpa_set_vq_cb(struct vdpa_device *vdev, u16 idx, struct vdpa_callback *cb)
2292 {
2293 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
2294 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
2295 
2296 	ndev->event_cbs[idx] = *cb;
2297 	if (is_ctrl_vq_idx(mvdev, idx))
2298 		mvdev->cvq.event_cb = *cb;
2299 }
2300 
2301 static void mlx5_cvq_notify(struct vringh *vring)
2302 {
2303 	struct mlx5_control_vq *cvq = container_of(vring, struct mlx5_control_vq, vring);
2304 
2305 	if (!cvq->event_cb.callback)
2306 		return;
2307 
2308 	cvq->event_cb.callback(cvq->event_cb.private);
2309 }
2310 
2311 static void set_cvq_ready(struct mlx5_vdpa_dev *mvdev, bool ready)
2312 {
2313 	struct mlx5_control_vq *cvq = &mvdev->cvq;
2314 
2315 	cvq->ready = ready;
2316 	if (!ready)
2317 		return;
2318 
2319 	cvq->vring.notify = mlx5_cvq_notify;
2320 }
2321 
2322 static void mlx5_vdpa_set_vq_ready(struct vdpa_device *vdev, u16 idx, bool ready)
2323 {
2324 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
2325 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
2326 	struct mlx5_vdpa_virtqueue *mvq;
2327 	int err;
2328 
2329 	if (!mvdev->actual_features)
2330 		return;
2331 
2332 	if (!is_index_valid(mvdev, idx))
2333 		return;
2334 
2335 	if (is_ctrl_vq_idx(mvdev, idx)) {
2336 		set_cvq_ready(mvdev, ready);
2337 		return;
2338 	}
2339 
2340 	mvq = &ndev->vqs[idx];
2341 	if (!ready) {
2342 		suspend_vq(ndev, mvq);
2343 	} else {
2344 		err = modify_virtqueue_state(ndev, mvq, MLX5_VIRTIO_NET_Q_OBJECT_STATE_RDY);
2345 		if (err) {
2346 			mlx5_vdpa_warn(mvdev, "modify VQ %d to ready failed (%d)\n", idx, err);
2347 			ready = false;
2348 		}
2349 	}
2350 
2351 
2352 	mvq->ready = ready;
2353 }
2354 
2355 static bool mlx5_vdpa_get_vq_ready(struct vdpa_device *vdev, u16 idx)
2356 {
2357 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
2358 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
2359 
2360 	if (!is_index_valid(mvdev, idx))
2361 		return false;
2362 
2363 	if (is_ctrl_vq_idx(mvdev, idx))
2364 		return mvdev->cvq.ready;
2365 
2366 	return ndev->vqs[idx].ready;
2367 }
2368 
2369 static int mlx5_vdpa_set_vq_state(struct vdpa_device *vdev, u16 idx,
2370 				  const struct vdpa_vq_state *state)
2371 {
2372 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
2373 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
2374 	struct mlx5_vdpa_virtqueue *mvq;
2375 
2376 	if (!is_index_valid(mvdev, idx))
2377 		return -EINVAL;
2378 
2379 	if (is_ctrl_vq_idx(mvdev, idx)) {
2380 		mvdev->cvq.vring.last_avail_idx = state->split.avail_index;
2381 		return 0;
2382 	}
2383 
2384 	mvq = &ndev->vqs[idx];
2385 	if (mvq->fw_state == MLX5_VIRTIO_NET_Q_OBJECT_STATE_RDY) {
2386 		mlx5_vdpa_warn(mvdev, "can't modify available index\n");
2387 		return -EINVAL;
2388 	}
2389 
2390 	mvq->used_idx = state->split.avail_index;
2391 	mvq->avail_idx = state->split.avail_index;
2392 	mvq->modified_fields |= MLX5_VIRTQ_MODIFY_MASK_VIRTIO_Q_AVAIL_IDX |
2393 				MLX5_VIRTQ_MODIFY_MASK_VIRTIO_Q_USED_IDX;
2394 	return 0;
2395 }
2396 
2397 static int mlx5_vdpa_get_vq_state(struct vdpa_device *vdev, u16 idx, struct vdpa_vq_state *state)
2398 {
2399 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
2400 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
2401 	struct mlx5_vdpa_virtqueue *mvq;
2402 	struct mlx5_virtq_attr attr;
2403 	int err;
2404 
2405 	if (!is_index_valid(mvdev, idx))
2406 		return -EINVAL;
2407 
2408 	if (is_ctrl_vq_idx(mvdev, idx)) {
2409 		state->split.avail_index = mvdev->cvq.vring.last_avail_idx;
2410 		return 0;
2411 	}
2412 
2413 	mvq = &ndev->vqs[idx];
2414 	/* If the virtq object was destroyed, use the value saved at
2415 	 * the last minute of suspend_vq. This caters for userspace
2416 	 * that cares about emulating the index after vq is stopped.
2417 	 */
2418 	if (!mvq->initialized) {
2419 		/* Firmware returns a wrong value for the available index.
2420 		 * Since both values should be identical, we take the value of
2421 		 * used_idx which is reported correctly.
2422 		 */
2423 		state->split.avail_index = mvq->used_idx;
2424 		return 0;
2425 	}
2426 
2427 	err = query_virtqueue(ndev, mvq, &attr);
2428 	if (err) {
2429 		mlx5_vdpa_warn(mvdev, "failed to query virtqueue\n");
2430 		return err;
2431 	}
2432 	state->split.avail_index = attr.used_index;
2433 	return 0;
2434 }
2435 
2436 static u32 mlx5_vdpa_get_vq_align(struct vdpa_device *vdev)
2437 {
2438 	return PAGE_SIZE;
2439 }
2440 
2441 static u32 mlx5_vdpa_get_vq_group(struct vdpa_device *vdev, u16 idx)
2442 {
2443 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
2444 
2445 	if (is_ctrl_vq_idx(mvdev, idx))
2446 		return MLX5_VDPA_CVQ_GROUP;
2447 
2448 	return MLX5_VDPA_DATAVQ_GROUP;
2449 }
2450 
2451 static u32 mlx5_vdpa_get_vq_desc_group(struct vdpa_device *vdev, u16 idx)
2452 {
2453 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
2454 
2455 	if (is_ctrl_vq_idx(mvdev, idx))
2456 		return MLX5_VDPA_CVQ_GROUP;
2457 
2458 	return MLX5_VDPA_DATAVQ_DESC_GROUP;
2459 }
2460 
2461 static u64 mlx_to_vritio_features(u16 dev_features)
2462 {
2463 	u64 result = 0;
2464 
2465 	if (dev_features & BIT_ULL(MLX5_VIRTIO_NET_F_MRG_RXBUF))
2466 		result |= BIT_ULL(VIRTIO_NET_F_MRG_RXBUF);
2467 	if (dev_features & BIT_ULL(MLX5_VIRTIO_NET_F_HOST_ECN))
2468 		result |= BIT_ULL(VIRTIO_NET_F_HOST_ECN);
2469 	if (dev_features & BIT_ULL(MLX5_VIRTIO_NET_F_GUEST_ECN))
2470 		result |= BIT_ULL(VIRTIO_NET_F_GUEST_ECN);
2471 	if (dev_features & BIT_ULL(MLX5_VIRTIO_NET_F_GUEST_TSO6))
2472 		result |= BIT_ULL(VIRTIO_NET_F_GUEST_TSO6);
2473 	if (dev_features & BIT_ULL(MLX5_VIRTIO_NET_F_GUEST_TSO4))
2474 		result |= BIT_ULL(VIRTIO_NET_F_GUEST_TSO4);
2475 	if (dev_features & BIT_ULL(MLX5_VIRTIO_NET_F_GUEST_CSUM))
2476 		result |= BIT_ULL(VIRTIO_NET_F_GUEST_CSUM);
2477 	if (dev_features & BIT_ULL(MLX5_VIRTIO_NET_F_CSUM))
2478 		result |= BIT_ULL(VIRTIO_NET_F_CSUM);
2479 	if (dev_features & BIT_ULL(MLX5_VIRTIO_NET_F_HOST_TSO6))
2480 		result |= BIT_ULL(VIRTIO_NET_F_HOST_TSO6);
2481 	if (dev_features & BIT_ULL(MLX5_VIRTIO_NET_F_HOST_TSO4))
2482 		result |= BIT_ULL(VIRTIO_NET_F_HOST_TSO4);
2483 
2484 	return result;
2485 }
2486 
2487 static u64 get_supported_features(struct mlx5_core_dev *mdev)
2488 {
2489 	u64 mlx_vdpa_features = 0;
2490 	u16 dev_features;
2491 
2492 	dev_features = MLX5_CAP_DEV_VDPA_EMULATION(mdev, device_features_bits_mask);
2493 	mlx_vdpa_features |= mlx_to_vritio_features(dev_features);
2494 	if (MLX5_CAP_DEV_VDPA_EMULATION(mdev, virtio_version_1_0))
2495 		mlx_vdpa_features |= BIT_ULL(VIRTIO_F_VERSION_1);
2496 	mlx_vdpa_features |= BIT_ULL(VIRTIO_F_ACCESS_PLATFORM);
2497 	mlx_vdpa_features |= BIT_ULL(VIRTIO_NET_F_CTRL_VQ);
2498 	mlx_vdpa_features |= BIT_ULL(VIRTIO_NET_F_CTRL_MAC_ADDR);
2499 	mlx_vdpa_features |= BIT_ULL(VIRTIO_NET_F_MQ);
2500 	mlx_vdpa_features |= BIT_ULL(VIRTIO_NET_F_STATUS);
2501 	mlx_vdpa_features |= BIT_ULL(VIRTIO_NET_F_MTU);
2502 	mlx_vdpa_features |= BIT_ULL(VIRTIO_NET_F_CTRL_VLAN);
2503 	mlx_vdpa_features |= BIT_ULL(VIRTIO_NET_F_MAC);
2504 
2505 	return mlx_vdpa_features;
2506 }
2507 
2508 static u64 mlx5_vdpa_get_device_features(struct vdpa_device *vdev)
2509 {
2510 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
2511 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
2512 
2513 	print_features(mvdev, ndev->mvdev.mlx_features, false);
2514 	return ndev->mvdev.mlx_features;
2515 }
2516 
2517 static int verify_driver_features(struct mlx5_vdpa_dev *mvdev, u64 features)
2518 {
2519 	/* Minimum features to expect */
2520 	if (!(features & BIT_ULL(VIRTIO_F_ACCESS_PLATFORM)))
2521 		return -EOPNOTSUPP;
2522 
2523 	/* Double check features combination sent down by the driver.
2524 	 * Fail invalid features due to absence of the depended feature.
2525 	 *
2526 	 * Per VIRTIO v1.1 specification, section 5.1.3.1 Feature bit
2527 	 * requirements: "VIRTIO_NET_F_MQ Requires VIRTIO_NET_F_CTRL_VQ".
2528 	 * By failing the invalid features sent down by untrusted drivers,
2529 	 * we're assured the assumption made upon is_index_valid() and
2530 	 * is_ctrl_vq_idx() will not be compromised.
2531 	 */
2532 	if ((features & (BIT_ULL(VIRTIO_NET_F_MQ) | BIT_ULL(VIRTIO_NET_F_CTRL_VQ))) ==
2533             BIT_ULL(VIRTIO_NET_F_MQ))
2534 		return -EINVAL;
2535 
2536 	return 0;
2537 }
2538 
2539 static int setup_virtqueues(struct mlx5_vdpa_dev *mvdev)
2540 {
2541 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
2542 	int err;
2543 	int i;
2544 
2545 	for (i = 0; i < mvdev->max_vqs; i++) {
2546 		err = setup_vq(ndev, &ndev->vqs[i]);
2547 		if (err)
2548 			goto err_vq;
2549 	}
2550 
2551 	return 0;
2552 
2553 err_vq:
2554 	for (--i; i >= 0; i--)
2555 		teardown_vq(ndev, &ndev->vqs[i]);
2556 
2557 	return err;
2558 }
2559 
2560 static void teardown_virtqueues(struct mlx5_vdpa_net *ndev)
2561 {
2562 	struct mlx5_vdpa_virtqueue *mvq;
2563 	int i;
2564 
2565 	for (i = ndev->mvdev.max_vqs - 1; i >= 0; i--) {
2566 		mvq = &ndev->vqs[i];
2567 		if (!mvq->initialized)
2568 			continue;
2569 
2570 		teardown_vq(ndev, mvq);
2571 	}
2572 }
2573 
2574 static void update_cvq_info(struct mlx5_vdpa_dev *mvdev)
2575 {
2576 	if (MLX5_FEATURE(mvdev, VIRTIO_NET_F_CTRL_VQ)) {
2577 		if (MLX5_FEATURE(mvdev, VIRTIO_NET_F_MQ)) {
2578 			/* MQ supported. CVQ index is right above the last data virtqueue's */
2579 			mvdev->max_idx = mvdev->max_vqs;
2580 		} else {
2581 			/* Only CVQ supportted. data virtqueues occupy indices 0 and 1.
2582 			 * CVQ gets index 2
2583 			 */
2584 			mvdev->max_idx = 2;
2585 		}
2586 	} else {
2587 		/* Two data virtqueues only: one for rx and one for tx */
2588 		mvdev->max_idx = 1;
2589 	}
2590 }
2591 
2592 static u8 query_vport_state(struct mlx5_core_dev *mdev, u8 opmod, u16 vport)
2593 {
2594 	u32 out[MLX5_ST_SZ_DW(query_vport_state_out)] = {};
2595 	u32 in[MLX5_ST_SZ_DW(query_vport_state_in)] = {};
2596 	int err;
2597 
2598 	MLX5_SET(query_vport_state_in, in, opcode, MLX5_CMD_OP_QUERY_VPORT_STATE);
2599 	MLX5_SET(query_vport_state_in, in, op_mod, opmod);
2600 	MLX5_SET(query_vport_state_in, in, vport_number, vport);
2601 	if (vport)
2602 		MLX5_SET(query_vport_state_in, in, other_vport, 1);
2603 
2604 	err = mlx5_cmd_exec_inout(mdev, query_vport_state, in, out);
2605 	if (err)
2606 		return 0;
2607 
2608 	return MLX5_GET(query_vport_state_out, out, state);
2609 }
2610 
2611 static bool get_link_state(struct mlx5_vdpa_dev *mvdev)
2612 {
2613 	if (query_vport_state(mvdev->mdev, MLX5_VPORT_STATE_OP_MOD_VNIC_VPORT, 0) ==
2614 	    VPORT_STATE_UP)
2615 		return true;
2616 
2617 	return false;
2618 }
2619 
2620 static void update_carrier(struct work_struct *work)
2621 {
2622 	struct mlx5_vdpa_wq_ent *wqent;
2623 	struct mlx5_vdpa_dev *mvdev;
2624 	struct mlx5_vdpa_net *ndev;
2625 
2626 	wqent = container_of(work, struct mlx5_vdpa_wq_ent, work);
2627 	mvdev = wqent->mvdev;
2628 	ndev = to_mlx5_vdpa_ndev(mvdev);
2629 	if (get_link_state(mvdev))
2630 		ndev->config.status |= cpu_to_mlx5vdpa16(mvdev, VIRTIO_NET_S_LINK_UP);
2631 	else
2632 		ndev->config.status &= cpu_to_mlx5vdpa16(mvdev, ~VIRTIO_NET_S_LINK_UP);
2633 
2634 	if (ndev->config_cb.callback)
2635 		ndev->config_cb.callback(ndev->config_cb.private);
2636 
2637 	kfree(wqent);
2638 }
2639 
2640 static int queue_link_work(struct mlx5_vdpa_net *ndev)
2641 {
2642 	struct mlx5_vdpa_wq_ent *wqent;
2643 
2644 	wqent = kzalloc(sizeof(*wqent), GFP_ATOMIC);
2645 	if (!wqent)
2646 		return -ENOMEM;
2647 
2648 	wqent->mvdev = &ndev->mvdev;
2649 	INIT_WORK(&wqent->work, update_carrier);
2650 	queue_work(ndev->mvdev.wq, &wqent->work);
2651 	return 0;
2652 }
2653 
2654 static int event_handler(struct notifier_block *nb, unsigned long event, void *param)
2655 {
2656 	struct mlx5_vdpa_net *ndev = container_of(nb, struct mlx5_vdpa_net, nb);
2657 	struct mlx5_eqe *eqe = param;
2658 	int ret = NOTIFY_DONE;
2659 
2660 	if (event == MLX5_EVENT_TYPE_PORT_CHANGE) {
2661 		switch (eqe->sub_type) {
2662 		case MLX5_PORT_CHANGE_SUBTYPE_DOWN:
2663 		case MLX5_PORT_CHANGE_SUBTYPE_ACTIVE:
2664 			if (queue_link_work(ndev))
2665 				return NOTIFY_DONE;
2666 
2667 			ret = NOTIFY_OK;
2668 			break;
2669 		default:
2670 			return NOTIFY_DONE;
2671 		}
2672 		return ret;
2673 	}
2674 	return ret;
2675 }
2676 
2677 static void register_link_notifier(struct mlx5_vdpa_net *ndev)
2678 {
2679 	if (!(ndev->mvdev.actual_features & BIT_ULL(VIRTIO_NET_F_STATUS)))
2680 		return;
2681 
2682 	ndev->nb.notifier_call = event_handler;
2683 	mlx5_notifier_register(ndev->mvdev.mdev, &ndev->nb);
2684 	ndev->nb_registered = true;
2685 	queue_link_work(ndev);
2686 }
2687 
2688 static void unregister_link_notifier(struct mlx5_vdpa_net *ndev)
2689 {
2690 	if (!ndev->nb_registered)
2691 		return;
2692 
2693 	ndev->nb_registered = false;
2694 	mlx5_notifier_unregister(ndev->mvdev.mdev, &ndev->nb);
2695 	if (ndev->mvdev.wq)
2696 		flush_workqueue(ndev->mvdev.wq);
2697 }
2698 
2699 static u64 mlx5_vdpa_get_backend_features(const struct vdpa_device *vdpa)
2700 {
2701 	return BIT_ULL(VHOST_BACKEND_F_ENABLE_AFTER_DRIVER_OK);
2702 }
2703 
2704 static int mlx5_vdpa_set_driver_features(struct vdpa_device *vdev, u64 features)
2705 {
2706 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
2707 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
2708 	int err;
2709 
2710 	print_features(mvdev, features, true);
2711 
2712 	err = verify_driver_features(mvdev, features);
2713 	if (err)
2714 		return err;
2715 
2716 	ndev->mvdev.actual_features = features & ndev->mvdev.mlx_features;
2717 	if (ndev->mvdev.actual_features & BIT_ULL(VIRTIO_NET_F_MQ))
2718 		ndev->rqt_size = mlx5vdpa16_to_cpu(mvdev, ndev->config.max_virtqueue_pairs);
2719 	else
2720 		ndev->rqt_size = 1;
2721 
2722 	/* Device must start with 1 queue pair, as per VIRTIO v1.2 spec, section
2723 	 * 5.1.6.5.5 "Device operation in multiqueue mode":
2724 	 *
2725 	 * Multiqueue is disabled by default.
2726 	 * The driver enables multiqueue by sending a command using class
2727 	 * VIRTIO_NET_CTRL_MQ. The command selects the mode of multiqueue
2728 	 * operation, as follows: ...
2729 	 */
2730 	ndev->cur_num_vqs = 2;
2731 
2732 	update_cvq_info(mvdev);
2733 	return err;
2734 }
2735 
2736 static void mlx5_vdpa_set_config_cb(struct vdpa_device *vdev, struct vdpa_callback *cb)
2737 {
2738 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
2739 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
2740 
2741 	ndev->config_cb = *cb;
2742 }
2743 
2744 #define MLX5_VDPA_MAX_VQ_ENTRIES 256
2745 static u16 mlx5_vdpa_get_vq_num_max(struct vdpa_device *vdev)
2746 {
2747 	return MLX5_VDPA_MAX_VQ_ENTRIES;
2748 }
2749 
2750 static u32 mlx5_vdpa_get_device_id(struct vdpa_device *vdev)
2751 {
2752 	return VIRTIO_ID_NET;
2753 }
2754 
2755 static u32 mlx5_vdpa_get_vendor_id(struct vdpa_device *vdev)
2756 {
2757 	return PCI_VENDOR_ID_MELLANOX;
2758 }
2759 
2760 static u8 mlx5_vdpa_get_status(struct vdpa_device *vdev)
2761 {
2762 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
2763 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
2764 
2765 	print_status(mvdev, ndev->mvdev.status, false);
2766 	return ndev->mvdev.status;
2767 }
2768 
2769 static int save_channel_info(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq)
2770 {
2771 	struct mlx5_vq_restore_info *ri = &mvq->ri;
2772 	struct mlx5_virtq_attr attr = {};
2773 	int err;
2774 
2775 	if (mvq->initialized) {
2776 		err = query_virtqueue(ndev, mvq, &attr);
2777 		if (err)
2778 			return err;
2779 	}
2780 
2781 	ri->avail_index = attr.available_index;
2782 	ri->used_index = attr.used_index;
2783 	ri->ready = mvq->ready;
2784 	ri->num_ent = mvq->num_ent;
2785 	ri->desc_addr = mvq->desc_addr;
2786 	ri->device_addr = mvq->device_addr;
2787 	ri->driver_addr = mvq->driver_addr;
2788 	ri->map = mvq->map;
2789 	ri->restore = true;
2790 	return 0;
2791 }
2792 
2793 static int save_channels_info(struct mlx5_vdpa_net *ndev)
2794 {
2795 	int i;
2796 
2797 	for (i = 0; i < ndev->mvdev.max_vqs; i++) {
2798 		memset(&ndev->vqs[i].ri, 0, sizeof(ndev->vqs[i].ri));
2799 		save_channel_info(ndev, &ndev->vqs[i]);
2800 	}
2801 	return 0;
2802 }
2803 
2804 static void mlx5_clear_vqs(struct mlx5_vdpa_net *ndev)
2805 {
2806 	int i;
2807 
2808 	for (i = 0; i < ndev->mvdev.max_vqs; i++)
2809 		memset(&ndev->vqs[i], 0, offsetof(struct mlx5_vdpa_virtqueue, ri));
2810 }
2811 
2812 static void restore_channels_info(struct mlx5_vdpa_net *ndev)
2813 {
2814 	struct mlx5_vdpa_virtqueue *mvq;
2815 	struct mlx5_vq_restore_info *ri;
2816 	int i;
2817 
2818 	mlx5_clear_vqs(ndev);
2819 	init_mvqs(ndev);
2820 	for (i = 0; i < ndev->mvdev.max_vqs; i++) {
2821 		mvq = &ndev->vqs[i];
2822 		ri = &mvq->ri;
2823 		if (!ri->restore)
2824 			continue;
2825 
2826 		mvq->avail_idx = ri->avail_index;
2827 		mvq->used_idx = ri->used_index;
2828 		mvq->ready = ri->ready;
2829 		mvq->num_ent = ri->num_ent;
2830 		mvq->desc_addr = ri->desc_addr;
2831 		mvq->device_addr = ri->device_addr;
2832 		mvq->driver_addr = ri->driver_addr;
2833 		mvq->map = ri->map;
2834 	}
2835 }
2836 
2837 static int mlx5_vdpa_change_map(struct mlx5_vdpa_dev *mvdev,
2838 				struct mlx5_vdpa_mr *new_mr,
2839 				unsigned int asid)
2840 {
2841 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
2842 	bool teardown = !is_resumable(ndev);
2843 	int err;
2844 
2845 	suspend_vqs(ndev);
2846 	if (teardown) {
2847 		err = save_channels_info(ndev);
2848 		if (err)
2849 			return err;
2850 
2851 		teardown_driver(ndev);
2852 	}
2853 
2854 	mlx5_vdpa_update_mr(mvdev, new_mr, asid);
2855 
2856 	for (int i = 0; i < ndev->cur_num_vqs; i++)
2857 		ndev->vqs[i].modified_fields |= MLX5_VIRTQ_MODIFY_MASK_VIRTIO_Q_MKEY |
2858 						MLX5_VIRTQ_MODIFY_MASK_DESC_GROUP_MKEY;
2859 
2860 	if (!(mvdev->status & VIRTIO_CONFIG_S_DRIVER_OK) || mvdev->suspended)
2861 		return 0;
2862 
2863 	if (teardown) {
2864 		restore_channels_info(ndev);
2865 		err = setup_driver(mvdev);
2866 		if (err)
2867 			return err;
2868 	}
2869 
2870 	resume_vqs(ndev);
2871 
2872 	return 0;
2873 }
2874 
2875 /* reslock must be held for this function */
2876 static int setup_driver(struct mlx5_vdpa_dev *mvdev)
2877 {
2878 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
2879 	int err;
2880 
2881 	WARN_ON(!rwsem_is_locked(&ndev->reslock));
2882 
2883 	if (ndev->setup) {
2884 		mlx5_vdpa_warn(mvdev, "setup driver called for already setup driver\n");
2885 		err = 0;
2886 		goto out;
2887 	}
2888 	mlx5_vdpa_add_debugfs(ndev);
2889 
2890 	err = read_umem_params(ndev);
2891 	if (err)
2892 		goto err_setup;
2893 
2894 	err = setup_virtqueues(mvdev);
2895 	if (err) {
2896 		mlx5_vdpa_warn(mvdev, "setup_virtqueues\n");
2897 		goto err_setup;
2898 	}
2899 
2900 	err = create_rqt(ndev);
2901 	if (err) {
2902 		mlx5_vdpa_warn(mvdev, "create_rqt\n");
2903 		goto err_rqt;
2904 	}
2905 
2906 	err = create_tir(ndev);
2907 	if (err) {
2908 		mlx5_vdpa_warn(mvdev, "create_tir\n");
2909 		goto err_tir;
2910 	}
2911 
2912 	err = setup_steering(ndev);
2913 	if (err) {
2914 		mlx5_vdpa_warn(mvdev, "setup_steering\n");
2915 		goto err_fwd;
2916 	}
2917 	ndev->setup = true;
2918 
2919 	return 0;
2920 
2921 err_fwd:
2922 	destroy_tir(ndev);
2923 err_tir:
2924 	destroy_rqt(ndev);
2925 err_rqt:
2926 	teardown_virtqueues(ndev);
2927 err_setup:
2928 	mlx5_vdpa_remove_debugfs(ndev);
2929 out:
2930 	return err;
2931 }
2932 
2933 /* reslock must be held for this function */
2934 static void teardown_driver(struct mlx5_vdpa_net *ndev)
2935 {
2936 
2937 	WARN_ON(!rwsem_is_locked(&ndev->reslock));
2938 
2939 	if (!ndev->setup)
2940 		return;
2941 
2942 	mlx5_vdpa_remove_debugfs(ndev);
2943 	teardown_steering(ndev);
2944 	destroy_tir(ndev);
2945 	destroy_rqt(ndev);
2946 	teardown_virtqueues(ndev);
2947 	ndev->setup = false;
2948 }
2949 
2950 static void clear_vqs_ready(struct mlx5_vdpa_net *ndev)
2951 {
2952 	int i;
2953 
2954 	for (i = 0; i < ndev->mvdev.max_vqs; i++) {
2955 		ndev->vqs[i].ready = false;
2956 		ndev->vqs[i].modified_fields = 0;
2957 	}
2958 
2959 	ndev->mvdev.cvq.ready = false;
2960 }
2961 
2962 static int setup_cvq_vring(struct mlx5_vdpa_dev *mvdev)
2963 {
2964 	struct mlx5_control_vq *cvq = &mvdev->cvq;
2965 	int err = 0;
2966 
2967 	if (mvdev->actual_features & BIT_ULL(VIRTIO_NET_F_CTRL_VQ)) {
2968 		u16 idx = cvq->vring.last_avail_idx;
2969 
2970 		err = vringh_init_iotlb(&cvq->vring, mvdev->actual_features,
2971 					cvq->vring.vring.num, false,
2972 					(struct vring_desc *)(uintptr_t)cvq->desc_addr,
2973 					(struct vring_avail *)(uintptr_t)cvq->driver_addr,
2974 					(struct vring_used *)(uintptr_t)cvq->device_addr);
2975 
2976 		if (!err)
2977 			cvq->vring.last_avail_idx = cvq->vring.last_used_idx = idx;
2978 	}
2979 	return err;
2980 }
2981 
2982 static void mlx5_vdpa_set_status(struct vdpa_device *vdev, u8 status)
2983 {
2984 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
2985 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
2986 	int err;
2987 
2988 	print_status(mvdev, status, true);
2989 
2990 	down_write(&ndev->reslock);
2991 
2992 	if ((status ^ ndev->mvdev.status) & VIRTIO_CONFIG_S_DRIVER_OK) {
2993 		if (status & VIRTIO_CONFIG_S_DRIVER_OK) {
2994 			err = setup_cvq_vring(mvdev);
2995 			if (err) {
2996 				mlx5_vdpa_warn(mvdev, "failed to setup control VQ vring\n");
2997 				goto err_setup;
2998 			}
2999 			register_link_notifier(ndev);
3000 			err = setup_driver(mvdev);
3001 			if (err) {
3002 				mlx5_vdpa_warn(mvdev, "failed to setup driver\n");
3003 				goto err_driver;
3004 			}
3005 		} else {
3006 			mlx5_vdpa_warn(mvdev, "did not expect DRIVER_OK to be cleared\n");
3007 			goto err_clear;
3008 		}
3009 	}
3010 
3011 	ndev->mvdev.status = status;
3012 	up_write(&ndev->reslock);
3013 	return;
3014 
3015 err_driver:
3016 	unregister_link_notifier(ndev);
3017 err_setup:
3018 	mlx5_vdpa_destroy_mr_resources(&ndev->mvdev);
3019 	ndev->mvdev.status |= VIRTIO_CONFIG_S_FAILED;
3020 err_clear:
3021 	up_write(&ndev->reslock);
3022 }
3023 
3024 static void init_group_to_asid_map(struct mlx5_vdpa_dev *mvdev)
3025 {
3026 	int i;
3027 
3028 	/* default mapping all groups are mapped to asid 0 */
3029 	for (i = 0; i < MLX5_VDPA_NUMVQ_GROUPS; i++)
3030 		mvdev->group2asid[i] = 0;
3031 }
3032 
3033 static int mlx5_vdpa_compat_reset(struct vdpa_device *vdev, u32 flags)
3034 {
3035 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
3036 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
3037 
3038 	print_status(mvdev, 0, true);
3039 	mlx5_vdpa_info(mvdev, "performing device reset\n");
3040 
3041 	down_write(&ndev->reslock);
3042 	unregister_link_notifier(ndev);
3043 	teardown_driver(ndev);
3044 	clear_vqs_ready(ndev);
3045 	if (flags & VDPA_RESET_F_CLEAN_MAP)
3046 		mlx5_vdpa_destroy_mr_resources(&ndev->mvdev);
3047 	ndev->mvdev.status = 0;
3048 	ndev->mvdev.suspended = false;
3049 	ndev->cur_num_vqs = 0;
3050 	ndev->mvdev.cvq.received_desc = 0;
3051 	ndev->mvdev.cvq.completed_desc = 0;
3052 	memset(ndev->event_cbs, 0, sizeof(*ndev->event_cbs) * (mvdev->max_vqs + 1));
3053 	ndev->mvdev.actual_features = 0;
3054 	init_group_to_asid_map(mvdev);
3055 	++mvdev->generation;
3056 
3057 	if ((flags & VDPA_RESET_F_CLEAN_MAP) &&
3058 	    MLX5_CAP_GEN(mvdev->mdev, umem_uid_0)) {
3059 		if (mlx5_vdpa_create_dma_mr(mvdev))
3060 			mlx5_vdpa_warn(mvdev, "create MR failed\n");
3061 	}
3062 	up_write(&ndev->reslock);
3063 
3064 	return 0;
3065 }
3066 
3067 static int mlx5_vdpa_reset(struct vdpa_device *vdev)
3068 {
3069 	return mlx5_vdpa_compat_reset(vdev, 0);
3070 }
3071 
3072 static size_t mlx5_vdpa_get_config_size(struct vdpa_device *vdev)
3073 {
3074 	return sizeof(struct virtio_net_config);
3075 }
3076 
3077 static void mlx5_vdpa_get_config(struct vdpa_device *vdev, unsigned int offset, void *buf,
3078 				 unsigned int len)
3079 {
3080 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
3081 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
3082 
3083 	if (offset + len <= sizeof(struct virtio_net_config))
3084 		memcpy(buf, (u8 *)&ndev->config + offset, len);
3085 }
3086 
/*
 * vdpa .set_config callback. Writing the config space is not supported, so
 * the request is dropped without touching any state.
 */
static void mlx5_vdpa_set_config(struct vdpa_device *vdev, unsigned int offset, const void *buf,
				 unsigned int len)
{
}
3092 
3093 static u32 mlx5_vdpa_get_generation(struct vdpa_device *vdev)
3094 {
3095 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
3096 
3097 	return mvdev->generation;
3098 }
3099 
3100 static int set_map_data(struct mlx5_vdpa_dev *mvdev, struct vhost_iotlb *iotlb,
3101 			unsigned int asid)
3102 {
3103 	struct mlx5_vdpa_mr *new_mr;
3104 	int err;
3105 
3106 	if (asid >= MLX5_VDPA_NUM_AS)
3107 		return -EINVAL;
3108 
3109 	if (vhost_iotlb_itree_first(iotlb, 0, U64_MAX)) {
3110 		new_mr = mlx5_vdpa_create_mr(mvdev, iotlb);
3111 		if (IS_ERR(new_mr)) {
3112 			err = PTR_ERR(new_mr);
3113 			mlx5_vdpa_warn(mvdev, "create map failed(%d)\n", err);
3114 			return err;
3115 		}
3116 	} else {
3117 		/* Empty iotlbs don't have an mr but will clear the previous mr. */
3118 		new_mr = NULL;
3119 	}
3120 
3121 	if (!mvdev->mr[asid]) {
3122 		mlx5_vdpa_update_mr(mvdev, new_mr, asid);
3123 	} else {
3124 		err = mlx5_vdpa_change_map(mvdev, new_mr, asid);
3125 		if (err) {
3126 			mlx5_vdpa_warn(mvdev, "change map failed(%d)\n", err);
3127 			goto out_err;
3128 		}
3129 	}
3130 
3131 	return mlx5_vdpa_update_cvq_iotlb(mvdev, iotlb, asid);
3132 
3133 out_err:
3134 	mlx5_vdpa_put_mr(mvdev, new_mr);
3135 	return err;
3136 }
3137 
3138 static int mlx5_vdpa_set_map(struct vdpa_device *vdev, unsigned int asid,
3139 			     struct vhost_iotlb *iotlb)
3140 {
3141 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
3142 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
3143 	int err = -EINVAL;
3144 
3145 	down_write(&ndev->reslock);
3146 	err = set_map_data(mvdev, iotlb, asid);
3147 	up_write(&ndev->reslock);
3148 	return err;
3149 }
3150 
3151 static int mlx5_vdpa_reset_map(struct vdpa_device *vdev, unsigned int asid)
3152 {
3153 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
3154 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
3155 	int err;
3156 
3157 	down_write(&ndev->reslock);
3158 	err = mlx5_vdpa_reset_mr(mvdev, asid);
3159 	up_write(&ndev->reslock);
3160 	return err;
3161 }
3162 
3163 static struct device *mlx5_get_vq_dma_dev(struct vdpa_device *vdev, u16 idx)
3164 {
3165 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
3166 
3167 	if (is_ctrl_vq_idx(mvdev, idx))
3168 		return &vdev->dev;
3169 
3170 	return mvdev->vdev.dma_dev;
3171 }
3172 
3173 static void free_irqs(struct mlx5_vdpa_net *ndev)
3174 {
3175 	struct mlx5_vdpa_irq_pool_entry *ent;
3176 	int i;
3177 
3178 	if (!msix_mode_supported(&ndev->mvdev))
3179 		return;
3180 
3181 	if (!ndev->irqp.entries)
3182 		return;
3183 
3184 	for (i = ndev->irqp.num_ent - 1; i >= 0; i--) {
3185 		ent = ndev->irqp.entries + i;
3186 		if (ent->map.virq)
3187 			pci_msix_free_irq(ndev->mvdev.mdev->pdev, ent->map);
3188 	}
3189 	kfree(ndev->irqp.entries);
3190 }
3191 
3192 static void mlx5_vdpa_free(struct vdpa_device *vdev)
3193 {
3194 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
3195 	struct mlx5_core_dev *pfmdev;
3196 	struct mlx5_vdpa_net *ndev;
3197 
3198 	ndev = to_mlx5_vdpa_ndev(mvdev);
3199 
3200 	free_resources(ndev);
3201 	mlx5_vdpa_destroy_mr_resources(mvdev);
3202 	if (!is_zero_ether_addr(ndev->config.mac)) {
3203 		pfmdev = pci_get_drvdata(pci_physfn(mvdev->mdev->pdev));
3204 		mlx5_mpfs_del_mac(pfmdev, ndev->config.mac);
3205 	}
3206 	mlx5_vdpa_free_resources(&ndev->mvdev);
3207 	free_irqs(ndev);
3208 	kfree(ndev->event_cbs);
3209 	kfree(ndev->vqs);
3210 }
3211 
3212 static struct vdpa_notification_area mlx5_get_vq_notification(struct vdpa_device *vdev, u16 idx)
3213 {
3214 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
3215 	struct vdpa_notification_area ret = {};
3216 	struct mlx5_vdpa_net *ndev;
3217 	phys_addr_t addr;
3218 
3219 	if (!is_index_valid(mvdev, idx) || is_ctrl_vq_idx(mvdev, idx))
3220 		return ret;
3221 
3222 	/* If SF BAR size is smaller than PAGE_SIZE, do not use direct
3223 	 * notification to avoid the risk of mapping pages that contain BAR of more
3224 	 * than one SF
3225 	 */
3226 	if (MLX5_CAP_GEN(mvdev->mdev, log_min_sf_size) + 12 < PAGE_SHIFT)
3227 		return ret;
3228 
3229 	ndev = to_mlx5_vdpa_ndev(mvdev);
3230 	addr = (phys_addr_t)ndev->mvdev.res.phys_kick_addr;
3231 	ret.addr = addr;
3232 	ret.size = PAGE_SIZE;
3233 	return ret;
3234 }
3235 
3236 static int mlx5_get_vq_irq(struct vdpa_device *vdev, u16 idx)
3237 {
3238 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
3239 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
3240 	struct mlx5_vdpa_virtqueue *mvq;
3241 
3242 	if (!is_index_valid(mvdev, idx))
3243 		return -EINVAL;
3244 
3245 	if (is_ctrl_vq_idx(mvdev, idx))
3246 		return -EOPNOTSUPP;
3247 
3248 	mvq = &ndev->vqs[idx];
3249 	if (!mvq->map.virq)
3250 		return -EOPNOTSUPP;
3251 
3252 	return mvq->map.virq;
3253 }
3254 
3255 static u64 mlx5_vdpa_get_driver_features(struct vdpa_device *vdev)
3256 {
3257 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
3258 
3259 	return mvdev->actual_features;
3260 }
3261 
3262 static int counter_set_query(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq,
3263 			     u64 *received_desc, u64 *completed_desc)
3264 {
3265 	u32 in[MLX5_ST_SZ_DW(query_virtio_q_counters_in)] = {};
3266 	u32 out[MLX5_ST_SZ_DW(query_virtio_q_counters_out)] = {};
3267 	void *cmd_hdr;
3268 	void *ctx;
3269 	int err;
3270 
3271 	if (!counters_supported(&ndev->mvdev))
3272 		return -EOPNOTSUPP;
3273 
3274 	if (mvq->fw_state != MLX5_VIRTIO_NET_Q_OBJECT_STATE_RDY)
3275 		return -EAGAIN;
3276 
3277 	cmd_hdr = MLX5_ADDR_OF(query_virtio_q_counters_in, in, hdr);
3278 
3279 	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, opcode, MLX5_CMD_OP_QUERY_GENERAL_OBJECT);
3280 	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, obj_type, MLX5_OBJ_TYPE_VIRTIO_Q_COUNTERS);
3281 	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, uid, ndev->mvdev.res.uid);
3282 	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, obj_id, mvq->counter_set_id);
3283 
3284 	err = mlx5_cmd_exec(ndev->mvdev.mdev, in, sizeof(in), out, sizeof(out));
3285 	if (err)
3286 		return err;
3287 
3288 	ctx = MLX5_ADDR_OF(query_virtio_q_counters_out, out, counters);
3289 	*received_desc = MLX5_GET64(virtio_q_counters, ctx, received_desc);
3290 	*completed_desc = MLX5_GET64(virtio_q_counters, ctx, completed_desc);
3291 	return 0;
3292 }
3293 
3294 static int mlx5_vdpa_get_vendor_vq_stats(struct vdpa_device *vdev, u16 idx,
3295 					 struct sk_buff *msg,
3296 					 struct netlink_ext_ack *extack)
3297 {
3298 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
3299 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
3300 	struct mlx5_vdpa_virtqueue *mvq;
3301 	struct mlx5_control_vq *cvq;
3302 	u64 received_desc;
3303 	u64 completed_desc;
3304 	int err = 0;
3305 
3306 	down_read(&ndev->reslock);
3307 	if (!is_index_valid(mvdev, idx)) {
3308 		NL_SET_ERR_MSG_MOD(extack, "virtqueue index is not valid");
3309 		err = -EINVAL;
3310 		goto out_err;
3311 	}
3312 
3313 	if (idx == ctrl_vq_idx(mvdev)) {
3314 		cvq = &mvdev->cvq;
3315 		received_desc = cvq->received_desc;
3316 		completed_desc = cvq->completed_desc;
3317 		goto out;
3318 	}
3319 
3320 	mvq = &ndev->vqs[idx];
3321 	err = counter_set_query(ndev, mvq, &received_desc, &completed_desc);
3322 	if (err) {
3323 		NL_SET_ERR_MSG_MOD(extack, "failed to query hardware");
3324 		goto out_err;
3325 	}
3326 
3327 out:
3328 	err = -EMSGSIZE;
3329 	if (nla_put_string(msg, VDPA_ATTR_DEV_VENDOR_ATTR_NAME, "received_desc"))
3330 		goto out_err;
3331 
3332 	if (nla_put_u64_64bit(msg, VDPA_ATTR_DEV_VENDOR_ATTR_VALUE, received_desc,
3333 			      VDPA_ATTR_PAD))
3334 		goto out_err;
3335 
3336 	if (nla_put_string(msg, VDPA_ATTR_DEV_VENDOR_ATTR_NAME, "completed_desc"))
3337 		goto out_err;
3338 
3339 	if (nla_put_u64_64bit(msg, VDPA_ATTR_DEV_VENDOR_ATTR_VALUE, completed_desc,
3340 			      VDPA_ATTR_PAD))
3341 		goto out_err;
3342 
3343 	err = 0;
3344 out_err:
3345 	up_read(&ndev->reslock);
3346 	return err;
3347 }
3348 
3349 static void mlx5_vdpa_cvq_suspend(struct mlx5_vdpa_dev *mvdev)
3350 {
3351 	struct mlx5_control_vq *cvq;
3352 
3353 	if (!(mvdev->actual_features & BIT_ULL(VIRTIO_NET_F_CTRL_VQ)))
3354 		return;
3355 
3356 	cvq = &mvdev->cvq;
3357 	cvq->ready = false;
3358 }
3359 
3360 static int mlx5_vdpa_suspend(struct vdpa_device *vdev)
3361 {
3362 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
3363 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
3364 	struct mlx5_vdpa_virtqueue *mvq;
3365 	int i;
3366 
3367 	mlx5_vdpa_info(mvdev, "suspending device\n");
3368 
3369 	down_write(&ndev->reslock);
3370 	unregister_link_notifier(ndev);
3371 	for (i = 0; i < ndev->cur_num_vqs; i++) {
3372 		mvq = &ndev->vqs[i];
3373 		suspend_vq(ndev, mvq);
3374 	}
3375 	mlx5_vdpa_cvq_suspend(mvdev);
3376 	mvdev->suspended = true;
3377 	up_write(&ndev->reslock);
3378 	return 0;
3379 }
3380 
3381 static int mlx5_vdpa_resume(struct vdpa_device *vdev)
3382 {
3383 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
3384 	struct mlx5_vdpa_net *ndev;
3385 
3386 	ndev = to_mlx5_vdpa_ndev(mvdev);
3387 
3388 	mlx5_vdpa_info(mvdev, "resuming device\n");
3389 
3390 	down_write(&ndev->reslock);
3391 	mvdev->suspended = false;
3392 	resume_vqs(ndev);
3393 	register_link_notifier(ndev);
3394 	up_write(&ndev->reslock);
3395 	return 0;
3396 }
3397 
3398 static int mlx5_set_group_asid(struct vdpa_device *vdev, u32 group,
3399 			       unsigned int asid)
3400 {
3401 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
3402 	int err = 0;
3403 
3404 	if (group >= MLX5_VDPA_NUMVQ_GROUPS)
3405 		return -EINVAL;
3406 
3407 	mvdev->group2asid[group] = asid;
3408 
3409 	mutex_lock(&mvdev->mr_mtx);
3410 	if (group == MLX5_VDPA_CVQ_GROUP && mvdev->mr[asid])
3411 		err = mlx5_vdpa_update_cvq_iotlb(mvdev, mvdev->mr[asid]->iotlb, asid);
3412 	mutex_unlock(&mvdev->mr_mtx);
3413 
3414 	return err;
3415 }
3416 
3417 static const struct vdpa_config_ops mlx5_vdpa_ops = {
3418 	.set_vq_address = mlx5_vdpa_set_vq_address,
3419 	.set_vq_num = mlx5_vdpa_set_vq_num,
3420 	.kick_vq = mlx5_vdpa_kick_vq,
3421 	.set_vq_cb = mlx5_vdpa_set_vq_cb,
3422 	.set_vq_ready = mlx5_vdpa_set_vq_ready,
3423 	.get_vq_ready = mlx5_vdpa_get_vq_ready,
3424 	.set_vq_state = mlx5_vdpa_set_vq_state,
3425 	.get_vq_state = mlx5_vdpa_get_vq_state,
3426 	.get_vendor_vq_stats = mlx5_vdpa_get_vendor_vq_stats,
3427 	.get_vq_notification = mlx5_get_vq_notification,
3428 	.get_vq_irq = mlx5_get_vq_irq,
3429 	.get_vq_align = mlx5_vdpa_get_vq_align,
3430 	.get_vq_group = mlx5_vdpa_get_vq_group,
3431 	.get_vq_desc_group = mlx5_vdpa_get_vq_desc_group, /* Op disabled if not supported. */
3432 	.get_device_features = mlx5_vdpa_get_device_features,
3433 	.get_backend_features = mlx5_vdpa_get_backend_features,
3434 	.set_driver_features = mlx5_vdpa_set_driver_features,
3435 	.get_driver_features = mlx5_vdpa_get_driver_features,
3436 	.set_config_cb = mlx5_vdpa_set_config_cb,
3437 	.get_vq_num_max = mlx5_vdpa_get_vq_num_max,
3438 	.get_device_id = mlx5_vdpa_get_device_id,
3439 	.get_vendor_id = mlx5_vdpa_get_vendor_id,
3440 	.get_status = mlx5_vdpa_get_status,
3441 	.set_status = mlx5_vdpa_set_status,
3442 	.reset = mlx5_vdpa_reset,
3443 	.compat_reset = mlx5_vdpa_compat_reset,
3444 	.get_config_size = mlx5_vdpa_get_config_size,
3445 	.get_config = mlx5_vdpa_get_config,
3446 	.set_config = mlx5_vdpa_set_config,
3447 	.get_generation = mlx5_vdpa_get_generation,
3448 	.set_map = mlx5_vdpa_set_map,
3449 	.reset_map = mlx5_vdpa_reset_map,
3450 	.set_group_asid = mlx5_set_group_asid,
3451 	.get_vq_dma_dev = mlx5_get_vq_dma_dev,
3452 	.free = mlx5_vdpa_free,
3453 	.suspend = mlx5_vdpa_suspend,
3454 	.resume = mlx5_vdpa_resume, /* Op disabled if not supported. */
3455 };
3456 
3457 static int query_mtu(struct mlx5_core_dev *mdev, u16 *mtu)
3458 {
3459 	u16 hw_mtu;
3460 	int err;
3461 
3462 	err = mlx5_query_nic_vport_mtu(mdev, &hw_mtu);
3463 	if (err)
3464 		return err;
3465 
3466 	*mtu = hw_mtu - MLX5V_ETH_HARD_MTU;
3467 	return 0;
3468 }
3469 
3470 static int alloc_resources(struct mlx5_vdpa_net *ndev)
3471 {
3472 	struct mlx5_vdpa_net_resources *res = &ndev->res;
3473 	int err;
3474 
3475 	if (res->valid) {
3476 		mlx5_vdpa_warn(&ndev->mvdev, "resources already allocated\n");
3477 		return -EEXIST;
3478 	}
3479 
3480 	err = mlx5_vdpa_alloc_transport_domain(&ndev->mvdev, &res->tdn);
3481 	if (err)
3482 		return err;
3483 
3484 	err = create_tis(ndev);
3485 	if (err)
3486 		goto err_tis;
3487 
3488 	res->valid = true;
3489 
3490 	return 0;
3491 
3492 err_tis:
3493 	mlx5_vdpa_dealloc_transport_domain(&ndev->mvdev, res->tdn);
3494 	return err;
3495 }
3496 
3497 static void free_resources(struct mlx5_vdpa_net *ndev)
3498 {
3499 	struct mlx5_vdpa_net_resources *res = &ndev->res;
3500 
3501 	if (!res->valid)
3502 		return;
3503 
3504 	destroy_tis(ndev);
3505 	mlx5_vdpa_dealloc_transport_domain(&ndev->mvdev, res->tdn);
3506 	res->valid = false;
3507 }
3508 
3509 static void init_mvqs(struct mlx5_vdpa_net *ndev)
3510 {
3511 	struct mlx5_vdpa_virtqueue *mvq;
3512 	int i;
3513 
3514 	for (i = 0; i < ndev->mvdev.max_vqs; ++i) {
3515 		mvq = &ndev->vqs[i];
3516 		memset(mvq, 0, offsetof(struct mlx5_vdpa_virtqueue, ri));
3517 		mvq->index = i;
3518 		mvq->ndev = ndev;
3519 		mvq->fwqp.fw = true;
3520 		mvq->fw_state = MLX5_VIRTIO_NET_Q_OBJECT_NONE;
3521 	}
3522 	for (; i < ndev->mvdev.max_vqs; i++) {
3523 		mvq = &ndev->vqs[i];
3524 		memset(mvq, 0, offsetof(struct mlx5_vdpa_virtqueue, ri));
3525 		mvq->index = i;
3526 		mvq->ndev = ndev;
3527 	}
3528 }
3529 
3530 struct mlx5_vdpa_mgmtdev {
3531 	struct vdpa_mgmt_dev mgtdev;
3532 	struct mlx5_adev *madev;
3533 	struct mlx5_vdpa_net *ndev;
3534 	struct vdpa_config_ops vdpa_ops;
3535 };
3536 
3537 static int config_func_mtu(struct mlx5_core_dev *mdev, u16 mtu)
3538 {
3539 	int inlen = MLX5_ST_SZ_BYTES(modify_nic_vport_context_in);
3540 	void *in;
3541 	int err;
3542 
3543 	in = kvzalloc(inlen, GFP_KERNEL);
3544 	if (!in)
3545 		return -ENOMEM;
3546 
3547 	MLX5_SET(modify_nic_vport_context_in, in, field_select.mtu, 1);
3548 	MLX5_SET(modify_nic_vport_context_in, in, nic_vport_context.mtu,
3549 		 mtu + MLX5V_ETH_HARD_MTU);
3550 	MLX5_SET(modify_nic_vport_context_in, in, opcode,
3551 		 MLX5_CMD_OP_MODIFY_NIC_VPORT_CONTEXT);
3552 
3553 	err = mlx5_cmd_exec_in(mdev, modify_nic_vport_context, in);
3554 
3555 	kvfree(in);
3556 	return err;
3557 }
3558 
3559 static void allocate_irqs(struct mlx5_vdpa_net *ndev)
3560 {
3561 	struct mlx5_vdpa_irq_pool_entry *ent;
3562 	int i;
3563 
3564 	if (!msix_mode_supported(&ndev->mvdev))
3565 		return;
3566 
3567 	if (!ndev->mvdev.mdev->pdev)
3568 		return;
3569 
3570 	ndev->irqp.entries = kcalloc(ndev->mvdev.max_vqs, sizeof(*ndev->irqp.entries), GFP_KERNEL);
3571 	if (!ndev->irqp.entries)
3572 		return;
3573 
3574 
3575 	for (i = 0; i < ndev->mvdev.max_vqs; i++) {
3576 		ent = ndev->irqp.entries + i;
3577 		snprintf(ent->name, MLX5_VDPA_IRQ_NAME_LEN, "%s-vq-%d",
3578 			 dev_name(&ndev->mvdev.vdev.dev), i);
3579 		ent->map = pci_msix_alloc_irq_at(ndev->mvdev.mdev->pdev, MSI_ANY_INDEX, NULL);
3580 		if (!ent->map.virq)
3581 			return;
3582 
3583 		ndev->irqp.num_ent++;
3584 	}
3585 }
3586 
3587 static int mlx5_vdpa_dev_add(struct vdpa_mgmt_dev *v_mdev, const char *name,
3588 			     const struct vdpa_dev_set_config *add_config)
3589 {
3590 	struct mlx5_vdpa_mgmtdev *mgtdev = container_of(v_mdev, struct mlx5_vdpa_mgmtdev, mgtdev);
3591 	struct virtio_net_config *config;
3592 	struct mlx5_core_dev *pfmdev;
3593 	struct mlx5_vdpa_dev *mvdev;
3594 	struct mlx5_vdpa_net *ndev;
3595 	struct mlx5_core_dev *mdev;
3596 	u64 device_features;
3597 	u32 max_vqs;
3598 	u16 mtu;
3599 	int err;
3600 
3601 	if (mgtdev->ndev)
3602 		return -ENOSPC;
3603 
3604 	mdev = mgtdev->madev->mdev;
3605 	device_features = mgtdev->mgtdev.supported_features;
3606 	if (add_config->mask & BIT_ULL(VDPA_ATTR_DEV_FEATURES)) {
3607 		if (add_config->device_features & ~device_features) {
3608 			dev_warn(mdev->device,
3609 				 "The provisioned features 0x%llx are not supported by this device with features 0x%llx\n",
3610 				 add_config->device_features, device_features);
3611 			return -EINVAL;
3612 		}
3613 		device_features &= add_config->device_features;
3614 	} else {
3615 		device_features &= ~BIT_ULL(VIRTIO_NET_F_MRG_RXBUF);
3616 	}
3617 	if (!(device_features & BIT_ULL(VIRTIO_F_VERSION_1) &&
3618 	      device_features & BIT_ULL(VIRTIO_F_ACCESS_PLATFORM))) {
3619 		dev_warn(mdev->device,
3620 			 "Must provision minimum features 0x%llx for this device",
3621 			 BIT_ULL(VIRTIO_F_VERSION_1) | BIT_ULL(VIRTIO_F_ACCESS_PLATFORM));
3622 		return -EOPNOTSUPP;
3623 	}
3624 
3625 	if (!(MLX5_CAP_DEV_VDPA_EMULATION(mdev, virtio_queue_type) &
3626 	    MLX5_VIRTIO_EMULATION_CAP_VIRTIO_QUEUE_TYPE_SPLIT)) {
3627 		dev_warn(mdev->device, "missing support for split virtqueues\n");
3628 		return -EOPNOTSUPP;
3629 	}
3630 
3631 	max_vqs = min_t(int, MLX5_CAP_DEV_VDPA_EMULATION(mdev, max_num_virtio_queues),
3632 			1 << MLX5_CAP_GEN(mdev, log_max_rqt_size));
3633 	if (max_vqs < 2) {
3634 		dev_warn(mdev->device,
3635 			 "%d virtqueues are supported. At least 2 are required\n",
3636 			 max_vqs);
3637 		return -EAGAIN;
3638 	}
3639 
3640 	if (add_config->mask & BIT_ULL(VDPA_ATTR_DEV_NET_CFG_MAX_VQP)) {
3641 		if (add_config->net.max_vq_pairs > max_vqs / 2)
3642 			return -EINVAL;
3643 		max_vqs = min_t(u32, max_vqs, 2 * add_config->net.max_vq_pairs);
3644 	} else {
3645 		max_vqs = 2;
3646 	}
3647 
3648 	ndev = vdpa_alloc_device(struct mlx5_vdpa_net, mvdev.vdev, mdev->device, &mgtdev->vdpa_ops,
3649 				 MLX5_VDPA_NUMVQ_GROUPS, MLX5_VDPA_NUM_AS, name, false);
3650 	if (IS_ERR(ndev))
3651 		return PTR_ERR(ndev);
3652 
3653 	ndev->mvdev.max_vqs = max_vqs;
3654 	mvdev = &ndev->mvdev;
3655 	mvdev->mdev = mdev;
3656 
3657 	ndev->vqs = kcalloc(max_vqs, sizeof(*ndev->vqs), GFP_KERNEL);
3658 	ndev->event_cbs = kcalloc(max_vqs + 1, sizeof(*ndev->event_cbs), GFP_KERNEL);
3659 	if (!ndev->vqs || !ndev->event_cbs) {
3660 		err = -ENOMEM;
3661 		goto err_alloc;
3662 	}
3663 
3664 	init_mvqs(ndev);
3665 	allocate_irqs(ndev);
3666 	init_rwsem(&ndev->reslock);
3667 	config = &ndev->config;
3668 
3669 	if (add_config->mask & BIT_ULL(VDPA_ATTR_DEV_NET_CFG_MTU)) {
3670 		err = config_func_mtu(mdev, add_config->net.mtu);
3671 		if (err)
3672 			goto err_alloc;
3673 	}
3674 
3675 	if (device_features & BIT_ULL(VIRTIO_NET_F_MTU)) {
3676 		err = query_mtu(mdev, &mtu);
3677 		if (err)
3678 			goto err_alloc;
3679 
3680 		ndev->config.mtu = cpu_to_mlx5vdpa16(mvdev, mtu);
3681 	}
3682 
3683 	if (device_features & BIT_ULL(VIRTIO_NET_F_STATUS)) {
3684 		if (get_link_state(mvdev))
3685 			ndev->config.status |= cpu_to_mlx5vdpa16(mvdev, VIRTIO_NET_S_LINK_UP);
3686 		else
3687 			ndev->config.status &= cpu_to_mlx5vdpa16(mvdev, ~VIRTIO_NET_S_LINK_UP);
3688 	}
3689 
3690 	if (add_config->mask & (1 << VDPA_ATTR_DEV_NET_CFG_MACADDR)) {
3691 		memcpy(ndev->config.mac, add_config->net.mac, ETH_ALEN);
3692 	/* No bother setting mac address in config if not going to provision _F_MAC */
3693 	} else if ((add_config->mask & BIT_ULL(VDPA_ATTR_DEV_FEATURES)) == 0 ||
3694 		   device_features & BIT_ULL(VIRTIO_NET_F_MAC)) {
3695 		err = mlx5_query_nic_vport_mac_address(mdev, 0, 0, config->mac);
3696 		if (err)
3697 			goto err_alloc;
3698 	}
3699 
3700 	if (!is_zero_ether_addr(config->mac)) {
3701 		pfmdev = pci_get_drvdata(pci_physfn(mdev->pdev));
3702 		err = mlx5_mpfs_add_mac(pfmdev, config->mac);
3703 		if (err)
3704 			goto err_alloc;
3705 	} else if ((add_config->mask & BIT_ULL(VDPA_ATTR_DEV_FEATURES)) == 0) {
3706 		/*
3707 		 * We used to clear _F_MAC feature bit if seeing
3708 		 * zero mac address when device features are not
3709 		 * specifically provisioned. Keep the behaviour
3710 		 * so old scripts do not break.
3711 		 */
3712 		device_features &= ~BIT_ULL(VIRTIO_NET_F_MAC);
3713 	} else if (device_features & BIT_ULL(VIRTIO_NET_F_MAC)) {
3714 		/* Don't provision zero mac address for _F_MAC */
3715 		mlx5_vdpa_warn(&ndev->mvdev,
3716 			       "No mac address provisioned?\n");
3717 		err = -EINVAL;
3718 		goto err_alloc;
3719 	}
3720 
3721 	if (device_features & BIT_ULL(VIRTIO_NET_F_MQ))
3722 		config->max_virtqueue_pairs = cpu_to_mlx5vdpa16(mvdev, max_vqs / 2);
3723 
3724 	ndev->mvdev.mlx_features = device_features;
3725 	mvdev->vdev.dma_dev = &mdev->pdev->dev;
3726 	err = mlx5_vdpa_alloc_resources(&ndev->mvdev);
3727 	if (err)
3728 		goto err_mpfs;
3729 
3730 	INIT_LIST_HEAD(&mvdev->mr_list_head);
3731 
3732 	if (MLX5_CAP_GEN(mvdev->mdev, umem_uid_0)) {
3733 		err = mlx5_vdpa_create_dma_mr(mvdev);
3734 		if (err)
3735 			goto err_res;
3736 	}
3737 
3738 	err = alloc_resources(ndev);
3739 	if (err)
3740 		goto err_mr;
3741 
3742 	ndev->cvq_ent.mvdev = mvdev;
3743 	INIT_WORK(&ndev->cvq_ent.work, mlx5_cvq_kick_handler);
3744 	mvdev->wq = create_singlethread_workqueue("mlx5_vdpa_wq");
3745 	if (!mvdev->wq) {
3746 		err = -ENOMEM;
3747 		goto err_res2;
3748 	}
3749 
3750 	mvdev->vdev.mdev = &mgtdev->mgtdev;
3751 	err = _vdpa_register_device(&mvdev->vdev, max_vqs + 1);
3752 	if (err)
3753 		goto err_reg;
3754 
3755 	mgtdev->ndev = ndev;
3756 	return 0;
3757 
3758 err_reg:
3759 	destroy_workqueue(mvdev->wq);
3760 err_res2:
3761 	free_resources(ndev);
3762 err_mr:
3763 	mlx5_vdpa_destroy_mr_resources(mvdev);
3764 err_res:
3765 	mlx5_vdpa_free_resources(&ndev->mvdev);
3766 err_mpfs:
3767 	if (!is_zero_ether_addr(config->mac))
3768 		mlx5_mpfs_del_mac(pfmdev, config->mac);
3769 err_alloc:
3770 	put_device(&mvdev->vdev.dev);
3771 	return err;
3772 }
3773 
3774 static void mlx5_vdpa_dev_del(struct vdpa_mgmt_dev *v_mdev, struct vdpa_device *dev)
3775 {
3776 	struct mlx5_vdpa_mgmtdev *mgtdev = container_of(v_mdev, struct mlx5_vdpa_mgmtdev, mgtdev);
3777 	struct mlx5_vdpa_dev *mvdev = to_mvdev(dev);
3778 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
3779 	struct workqueue_struct *wq;
3780 
3781 	unregister_link_notifier(ndev);
3782 	_vdpa_unregister_device(dev);
3783 	wq = mvdev->wq;
3784 	mvdev->wq = NULL;
3785 	destroy_workqueue(wq);
3786 	mgtdev->ndev = NULL;
3787 }
3788 
3789 static const struct vdpa_mgmtdev_ops mdev_ops = {
3790 	.dev_add = mlx5_vdpa_dev_add,
3791 	.dev_del = mlx5_vdpa_dev_del,
3792 };
3793 
3794 static struct virtio_device_id id_table[] = {
3795 	{ VIRTIO_ID_NET, VIRTIO_DEV_ANY_ID },
3796 	{ 0 },
3797 };
3798 
3799 static int mlx5v_probe(struct auxiliary_device *adev,
3800 		       const struct auxiliary_device_id *id)
3801 
3802 {
3803 	struct mlx5_adev *madev = container_of(adev, struct mlx5_adev, adev);
3804 	struct mlx5_core_dev *mdev = madev->mdev;
3805 	struct mlx5_vdpa_mgmtdev *mgtdev;
3806 	int err;
3807 
3808 	mgtdev = kzalloc(sizeof(*mgtdev), GFP_KERNEL);
3809 	if (!mgtdev)
3810 		return -ENOMEM;
3811 
3812 	mgtdev->mgtdev.ops = &mdev_ops;
3813 	mgtdev->mgtdev.device = mdev->device;
3814 	mgtdev->mgtdev.id_table = id_table;
3815 	mgtdev->mgtdev.config_attr_mask = BIT_ULL(VDPA_ATTR_DEV_NET_CFG_MACADDR) |
3816 					  BIT_ULL(VDPA_ATTR_DEV_NET_CFG_MAX_VQP) |
3817 					  BIT_ULL(VDPA_ATTR_DEV_NET_CFG_MTU) |
3818 					  BIT_ULL(VDPA_ATTR_DEV_FEATURES);
3819 	mgtdev->mgtdev.max_supported_vqs =
3820 		MLX5_CAP_DEV_VDPA_EMULATION(mdev, max_num_virtio_queues) + 1;
3821 	mgtdev->mgtdev.supported_features = get_supported_features(mdev);
3822 	mgtdev->madev = madev;
3823 	mgtdev->vdpa_ops = mlx5_vdpa_ops;
3824 
3825 	if (!MLX5_CAP_DEV_VDPA_EMULATION(mdev, desc_group_mkey_supported))
3826 		mgtdev->vdpa_ops.get_vq_desc_group = NULL;
3827 
3828 	if (!MLX5_CAP_DEV_VDPA_EMULATION(mdev, freeze_to_rdy_supported))
3829 		mgtdev->vdpa_ops.resume = NULL;
3830 
3831 	err = vdpa_mgmtdev_register(&mgtdev->mgtdev);
3832 	if (err)
3833 		goto reg_err;
3834 
3835 	auxiliary_set_drvdata(adev, mgtdev);
3836 
3837 	return 0;
3838 
3839 reg_err:
3840 	kfree(mgtdev);
3841 	return err;
3842 }
3843 
3844 static void mlx5v_remove(struct auxiliary_device *adev)
3845 {
3846 	struct mlx5_vdpa_mgmtdev *mgtdev;
3847 
3848 	mgtdev = auxiliary_get_drvdata(adev);
3849 	vdpa_mgmtdev_unregister(&mgtdev->mgtdev);
3850 	kfree(mgtdev);
3851 }
3852 
3853 static const struct auxiliary_device_id mlx5v_id_table[] = {
3854 	{ .name = MLX5_ADEV_NAME ".vnet", },
3855 	{},
3856 };
3857 
3858 MODULE_DEVICE_TABLE(auxiliary, mlx5v_id_table);
3859 
3860 static struct auxiliary_driver mlx5v_driver = {
3861 	.name = "vnet",
3862 	.probe = mlx5v_probe,
3863 	.remove = mlx5v_remove,
3864 	.id_table = mlx5v_id_table,
3865 };
3866 
3867 module_auxiliary_driver(mlx5v_driver);
3868