xref: /linux/drivers/vdpa/mlx5/net/mlx5_vnet.c (revision e60e9eeb3659776d3e450a5a86ca8b6f6594bced)
1 // SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
2 /* Copyright (c) 2020 Mellanox Technologies Ltd. */
3 
4 #include <linux/module.h>
5 #include <linux/vdpa.h>
6 #include <linux/vringh.h>
7 #include <uapi/linux/virtio_net.h>
8 #include <uapi/linux/virtio_ids.h>
9 #include <uapi/linux/vdpa.h>
10 #include <uapi/linux/vhost_types.h>
11 #include <linux/virtio_config.h>
12 #include <linux/auxiliary_bus.h>
13 #include <linux/mlx5/cq.h>
14 #include <linux/mlx5/qp.h>
15 #include <linux/mlx5/device.h>
16 #include <linux/mlx5/driver.h>
17 #include <linux/mlx5/vport.h>
18 #include <linux/mlx5/fs.h>
19 #include <linux/mlx5/mlx5_ifc_vdpa.h>
20 #include <linux/mlx5/mpfs.h>
21 #include "mlx5_vdpa.h"
22 #include "mlx5_vnet.h"
23 
24 MODULE_AUTHOR("Eli Cohen <eli@mellanox.com>");
25 MODULE_DESCRIPTION("Mellanox VDPA driver");
26 MODULE_LICENSE("Dual BSD/GPL");
27 
28 #define VALID_FEATURES_MASK                                                                        \
29 	(BIT_ULL(VIRTIO_NET_F_CSUM) | BIT_ULL(VIRTIO_NET_F_GUEST_CSUM) |                                   \
30 	 BIT_ULL(VIRTIO_NET_F_CTRL_GUEST_OFFLOADS) | BIT_ULL(VIRTIO_NET_F_MTU) | BIT_ULL(VIRTIO_NET_F_MAC) |   \
31 	 BIT_ULL(VIRTIO_NET_F_GUEST_TSO4) | BIT_ULL(VIRTIO_NET_F_GUEST_TSO6) |                             \
32 	 BIT_ULL(VIRTIO_NET_F_GUEST_ECN) | BIT_ULL(VIRTIO_NET_F_GUEST_UFO) | BIT_ULL(VIRTIO_NET_F_HOST_TSO4) | \
33 	 BIT_ULL(VIRTIO_NET_F_HOST_TSO6) | BIT_ULL(VIRTIO_NET_F_HOST_ECN) | BIT_ULL(VIRTIO_NET_F_HOST_UFO) |   \
34 	 BIT_ULL(VIRTIO_NET_F_MRG_RXBUF) | BIT_ULL(VIRTIO_NET_F_STATUS) | BIT_ULL(VIRTIO_NET_F_CTRL_VQ) |      \
35 	 BIT_ULL(VIRTIO_NET_F_CTRL_RX) | BIT_ULL(VIRTIO_NET_F_CTRL_VLAN) |                                 \
36 	 BIT_ULL(VIRTIO_NET_F_CTRL_RX_EXTRA) | BIT_ULL(VIRTIO_NET_F_GUEST_ANNOUNCE) |                      \
37 	 BIT_ULL(VIRTIO_NET_F_MQ) | BIT_ULL(VIRTIO_NET_F_CTRL_MAC_ADDR) | BIT_ULL(VIRTIO_NET_F_HASH_REPORT) |  \
38 	 BIT_ULL(VIRTIO_NET_F_RSS) | BIT_ULL(VIRTIO_NET_F_RSC_EXT) | BIT_ULL(VIRTIO_NET_F_STANDBY) |           \
39 	 BIT_ULL(VIRTIO_NET_F_SPEED_DUPLEX) | BIT_ULL(VIRTIO_F_NOTIFY_ON_EMPTY) |                          \
40 	 BIT_ULL(VIRTIO_F_ANY_LAYOUT) | BIT_ULL(VIRTIO_F_VERSION_1) | BIT_ULL(VIRTIO_F_ACCESS_PLATFORM) |      \
41 	 BIT_ULL(VIRTIO_F_RING_PACKED) | BIT_ULL(VIRTIO_F_ORDER_PLATFORM) | BIT_ULL(VIRTIO_F_SR_IOV))
42 
43 #define VALID_STATUS_MASK                                                                          \
44 	(VIRTIO_CONFIG_S_ACKNOWLEDGE | VIRTIO_CONFIG_S_DRIVER | VIRTIO_CONFIG_S_DRIVER_OK |        \
45 	 VIRTIO_CONFIG_S_FEATURES_OK | VIRTIO_CONFIG_S_NEEDS_RESET | VIRTIO_CONFIG_S_FAILED)
46 
47 #define MLX5_FEATURE(_mvdev, _feature) (!!((_mvdev)->actual_features & BIT_ULL(_feature)))
48 
49 #define MLX5V_UNTAGGED 0x1000
50 
51 /* Device must start with 1 queue pair, as per VIRTIO v1.2 spec, section
52  * 5.1.6.5.5 "Device operation in multiqueue mode":
53  *
54  * Multiqueue is disabled by default.
55  * The driver enables multiqueue by sending a command using class
56  * VIRTIO_NET_CTRL_MQ. The command selects the mode of multiqueue
57  * operation, as follows: ...
58  */
59 #define MLX5V_DEFAULT_VQ_COUNT 2
60 
61 #define MLX5V_DEFAULT_VQ_SIZE 256
62 
63 struct mlx5_vdpa_cq_buf {
64 	struct mlx5_frag_buf_ctrl fbc;
65 	struct mlx5_frag_buf frag_buf;
66 	int cqe_size;
67 	int nent;
68 };
69 
70 struct mlx5_vdpa_cq {
71 	struct mlx5_core_cq mcq;
72 	struct mlx5_vdpa_cq_buf buf;
73 	struct mlx5_db db;
74 	int cqe;
75 };
76 
77 struct mlx5_vdpa_umem {
78 	struct mlx5_frag_buf_ctrl fbc;
79 	struct mlx5_frag_buf frag_buf;
80 	int size;
81 	u32 id;
82 };
83 
84 struct mlx5_vdpa_qp {
85 	struct mlx5_core_qp mqp;
86 	struct mlx5_frag_buf frag_buf;
87 	struct mlx5_db db;
88 	u16 head;
89 	bool fw;
90 };
91 
92 struct mlx5_vq_restore_info {
93 	u32 num_ent;
94 	u64 desc_addr;
95 	u64 device_addr;
96 	u64 driver_addr;
97 	u16 avail_index;
98 	u16 used_index;
99 	struct msi_map map;
100 	bool ready;
101 	bool restore;
102 };
103 
104 struct mlx5_vdpa_virtqueue {
105 	bool ready;
106 	u64 desc_addr;
107 	u64 device_addr;
108 	u64 driver_addr;
109 	u32 num_ent;
110 
111 	/* Resources for implementing the notification channel from the device
112 	 * to the driver. fwqp is the firmware end of an RC connection; the
113 	 * other end is vqqp used by the driver. cq is where completions are
114 	 * reported.
115 	 */
116 	struct mlx5_vdpa_cq cq;
117 	struct mlx5_vdpa_qp fwqp;
118 	struct mlx5_vdpa_qp vqqp;
119 
120 	/* umem resources are required for the virtqueue operation. Their use
121 	 * is internal and they must be provided by the driver.
122 	 */
123 	struct mlx5_vdpa_umem umem1;
124 	struct mlx5_vdpa_umem umem2;
125 	struct mlx5_vdpa_umem umem3;
126 
127 	u32 counter_set_id;
128 	bool initialized;
129 	int index;
130 	u32 virtq_id;
131 	struct mlx5_vdpa_net *ndev;
132 	u16 avail_idx;
133 	u16 used_idx;
134 	int fw_state;
135 
136 	u64 modified_fields;
137 
138 	struct mlx5_vdpa_mr *vq_mr;
139 	struct mlx5_vdpa_mr *desc_mr;
140 
141 	struct msi_map map;
142 
143 	/* keep last in the struct */
144 	struct mlx5_vq_restore_info ri;
145 };
146 
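/* With VIRTIO_NET_F_MQ not negotiated only the first data queue pair
 * (indices 0 and 1) is valid, plus index 2 for the control VQ when
 * VIRTIO_NET_F_CTRL_VQ is negotiated. With MQ, any index up to max_idx
 * is accepted.
 */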
147 static bool is_index_valid(struct mlx5_vdpa_dev *mvdev, u16 idx)
148 {
149 	if (!(mvdev->actual_features & BIT_ULL(VIRTIO_NET_F_MQ))) {
150 		if (!(mvdev->actual_features & BIT_ULL(VIRTIO_NET_F_CTRL_VQ)))
151 			return idx < 2;
152 		else
153 			return idx < 3;
154 	}
155 
156 	return idx <= mvdev->max_idx;
157 }
158 
159 static void free_fixed_resources(struct mlx5_vdpa_net *ndev);
160 static void mvqs_set_defaults(struct mlx5_vdpa_net *ndev);
161 static int setup_vq_resources(struct mlx5_vdpa_net *ndev, bool filled);
162 static void teardown_vq_resources(struct mlx5_vdpa_net *ndev);
163 
164 static bool mlx5_vdpa_debug;
165 
166 #define MLX5_LOG_VIO_FLAG(_feature)                                                                \
167 	do {                                                                                       \
168 		if (features & BIT_ULL(_feature))                                                  \
169 			mlx5_vdpa_info(mvdev, "%s\n", #_feature);                                  \
170 	} while (0)
171 
172 #define MLX5_LOG_VIO_STAT(_status)                                                                 \
173 	do {                                                                                       \
174 		if (status & (_status))                                                            \
175 			mlx5_vdpa_info(mvdev, "%s\n", #_status);                                   \
176 	} while (0)
177 
178 /* TODO: cross-endian support */
179 static inline bool mlx5_vdpa_is_little_endian(struct mlx5_vdpa_dev *mvdev)
180 {
181 	return virtio_legacy_is_little_endian() ||
182 		(mvdev->actual_features & BIT_ULL(VIRTIO_F_VERSION_1));
183 }
184 
185 static u16 mlx5vdpa16_to_cpu(struct mlx5_vdpa_dev *mvdev, __virtio16 val)
186 {
187 	return __virtio16_to_cpu(mlx5_vdpa_is_little_endian(mvdev), val);
188 }
189 
190 static __virtio16 cpu_to_mlx5vdpa16(struct mlx5_vdpa_dev *mvdev, u16 val)
191 {
192 	return __cpu_to_virtio16(mlx5_vdpa_is_little_endian(mvdev), val);
193 }
194 
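/* The control VQ always follows the data virtqueues: index 2 when
 * multiqueue is not negotiated, index max_vqs otherwise.
 */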
195 static u16 ctrl_vq_idx(struct mlx5_vdpa_dev *mvdev)
196 {
197 	if (!(mvdev->actual_features & BIT_ULL(VIRTIO_NET_F_MQ)))
198 		return 2;
199 
200 	return mvdev->max_vqs;
201 }
202 
203 static bool is_ctrl_vq_idx(struct mlx5_vdpa_dev *mvdev, u16 idx)
204 {
205 	return idx == ctrl_vq_idx(mvdev);
206 }
207 
208 static void print_status(struct mlx5_vdpa_dev *mvdev, u8 status, bool set)
209 {
210 	if (status & ~VALID_STATUS_MASK)
211 		mlx5_vdpa_warn(mvdev, "Warning: there are invalid status bits 0x%x\n",
212 			       status & ~VALID_STATUS_MASK);
213 
214 	if (!mlx5_vdpa_debug)
215 		return;
216 
217 	mlx5_vdpa_info(mvdev, "driver status %s", set ? "set" : "get");
218 	if (set && !status) {
219 		mlx5_vdpa_info(mvdev, "driver resets the device\n");
220 		return;
221 	}
222 
223 	MLX5_LOG_VIO_STAT(VIRTIO_CONFIG_S_ACKNOWLEDGE);
224 	MLX5_LOG_VIO_STAT(VIRTIO_CONFIG_S_DRIVER);
225 	MLX5_LOG_VIO_STAT(VIRTIO_CONFIG_S_DRIVER_OK);
226 	MLX5_LOG_VIO_STAT(VIRTIO_CONFIG_S_FEATURES_OK);
227 	MLX5_LOG_VIO_STAT(VIRTIO_CONFIG_S_NEEDS_RESET);
228 	MLX5_LOG_VIO_STAT(VIRTIO_CONFIG_S_FAILED);
229 }
230 
231 static void print_features(struct mlx5_vdpa_dev *mvdev, u64 features, bool set)
232 {
233 	if (features & ~VALID_FEATURES_MASK)
234 		mlx5_vdpa_warn(mvdev, "There are invalid feature bits 0x%llx\n",
235 			       features & ~VALID_FEATURES_MASK);
236 
237 	if (!mlx5_vdpa_debug)
238 		return;
239 
240 	mlx5_vdpa_info(mvdev, "driver %s feature bits:\n", set ? "sets" : "reads");
241 	if (!features)
242 		mlx5_vdpa_info(mvdev, "all feature bits are cleared\n");
243 
244 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_CSUM);
245 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_GUEST_CSUM);
246 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_CTRL_GUEST_OFFLOADS);
247 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_MTU);
248 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_MAC);
249 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_GUEST_TSO4);
250 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_GUEST_TSO6);
251 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_GUEST_ECN);
252 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_GUEST_UFO);
253 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_HOST_TSO4);
254 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_HOST_TSO6);
255 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_HOST_ECN);
256 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_HOST_UFO);
257 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_MRG_RXBUF);
258 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_STATUS);
259 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_CTRL_VQ);
260 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_CTRL_RX);
261 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_CTRL_VLAN);
262 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_CTRL_RX_EXTRA);
263 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_GUEST_ANNOUNCE);
264 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_MQ);
265 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_CTRL_MAC_ADDR);
266 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_HASH_REPORT);
267 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_RSS);
268 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_RSC_EXT);
269 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_STANDBY);
270 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_SPEED_DUPLEX);
271 	MLX5_LOG_VIO_FLAG(VIRTIO_F_NOTIFY_ON_EMPTY);
272 	MLX5_LOG_VIO_FLAG(VIRTIO_F_ANY_LAYOUT);
273 	MLX5_LOG_VIO_FLAG(VIRTIO_F_VERSION_1);
274 	MLX5_LOG_VIO_FLAG(VIRTIO_F_ACCESS_PLATFORM);
275 	MLX5_LOG_VIO_FLAG(VIRTIO_F_RING_PACKED);
276 	MLX5_LOG_VIO_FLAG(VIRTIO_F_ORDER_PLATFORM);
277 	MLX5_LOG_VIO_FLAG(VIRTIO_F_SR_IOV);
278 }
279 
280 static int create_tis(struct mlx5_vdpa_net *ndev)
281 {
282 	struct mlx5_vdpa_dev *mvdev = &ndev->mvdev;
283 	u32 in[MLX5_ST_SZ_DW(create_tis_in)] = {};
284 	void *tisc;
285 	int err;
286 
287 	tisc = MLX5_ADDR_OF(create_tis_in, in, ctx);
288 	MLX5_SET(tisc, tisc, transport_domain, ndev->res.tdn);
289 	err = mlx5_vdpa_create_tis(mvdev, in, &ndev->res.tisn);
290 	if (err)
291 		mlx5_vdpa_warn(mvdev, "create TIS (%d)\n", err);
292 
293 	return err;
294 }
295 
296 static void destroy_tis(struct mlx5_vdpa_net *ndev)
297 {
298 	mlx5_vdpa_destroy_tis(&ndev->mvdev, ndev->res.tisn);
299 }
300 
301 #define MLX5_VDPA_CQE_SIZE 64
302 #define MLX5_VDPA_LOG_CQE_SIZE ilog2(MLX5_VDPA_CQE_SIZE)
303 
304 static int cq_frag_buf_alloc(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_cq_buf *buf, int nent)
305 {
306 	struct mlx5_frag_buf *frag_buf = &buf->frag_buf;
307 	u8 log_wq_stride = MLX5_VDPA_LOG_CQE_SIZE;
308 	u8 log_wq_sz = MLX5_VDPA_LOG_CQE_SIZE;
309 	int err;
310 
311 	err = mlx5_frag_buf_alloc_node(ndev->mvdev.mdev, nent * MLX5_VDPA_CQE_SIZE, frag_buf,
312 				       ndev->mvdev.mdev->priv.numa_node);
313 	if (err)
314 		return err;
315 
316 	mlx5_init_fbc(frag_buf->frags, log_wq_stride, log_wq_sz, &buf->fbc);
317 
318 	buf->cqe_size = MLX5_VDPA_CQE_SIZE;
319 	buf->nent = nent;
320 
321 	return 0;
322 }
323 
324 static int umem_frag_buf_alloc(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_umem *umem, int size)
325 {
326 	struct mlx5_frag_buf *frag_buf = &umem->frag_buf;
327 
328 	return mlx5_frag_buf_alloc_node(ndev->mvdev.mdev, size, frag_buf,
329 					ndev->mvdev.mdev->priv.numa_node);
330 }
331 
332 static void cq_frag_buf_free(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_cq_buf *buf)
333 {
334 	mlx5_frag_buf_free(ndev->mvdev.mdev, &buf->frag_buf);
335 }
336 
337 static void *get_cqe(struct mlx5_vdpa_cq *vcq, int n)
338 {
339 	return mlx5_frag_buf_get_wqe(&vcq->buf.fbc, n);
340 }
341 
342 static void cq_frag_buf_init(struct mlx5_vdpa_cq *vcq, struct mlx5_vdpa_cq_buf *buf)
343 {
344 	struct mlx5_cqe64 *cqe64;
345 	void *cqe;
346 	int i;
347 
348 	for (i = 0; i < buf->nent; i++) {
349 		cqe = get_cqe(vcq, i);
350 		cqe64 = cqe;
351 		cqe64->op_own = MLX5_CQE_INVALID << 4;
352 	}
353 }
354 
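/* Return the CQE at index n if it is owned by software, i.e. its opcode is
 * valid and its ownership bit matches the parity of the current polling
 * round.
 */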
355 static void *get_sw_cqe(struct mlx5_vdpa_cq *cq, int n)
356 {
357 	struct mlx5_cqe64 *cqe64 = get_cqe(cq, n & (cq->cqe - 1));
358 
359 	if (likely(get_cqe_opcode(cqe64) != MLX5_CQE_INVALID) &&
360 	    !((cqe64->op_own & MLX5_CQE_OWNER_MASK) ^ !!(n & cq->cqe)))
361 		return cqe64;
362 
363 	return NULL;
364 }
365 
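/* Advance the receive queue head and publish it through the doorbell
 * record, giving the device n more receive credits on the notification
 * channel.
 */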
366 static void rx_post(struct mlx5_vdpa_qp *vqp, int n)
367 {
368 	vqp->head += n;
369 	vqp->db.db[0] = cpu_to_be32(vqp->head);
370 }
371 
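/* Fill the create_qp input for one end of the notification channel. The
 * firmware-owned QP only needs a zero-length RQ; the driver-owned QP gets
 * an RQ backed by the fragment buffer and attached to the virtqueue CQ.
 */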
372 static void qp_prepare(struct mlx5_vdpa_net *ndev, bool fw, void *in,
373 		       struct mlx5_vdpa_virtqueue *mvq, u32 num_ent)
374 {
375 	struct mlx5_vdpa_qp *vqp;
376 	__be64 *pas;
377 	void *qpc;
378 
379 	vqp = fw ? &mvq->fwqp : &mvq->vqqp;
380 	MLX5_SET(create_qp_in, in, uid, ndev->mvdev.res.uid);
381 	qpc = MLX5_ADDR_OF(create_qp_in, in, qpc);
382 	if (vqp->fw) {
383 		/* The firmware QP is allocated by the driver for the firmware's use,
384 		 * so we can skip some of the parameters as they will be chosen by firmware.
385 		 */
386 		qpc = MLX5_ADDR_OF(create_qp_in, in, qpc);
387 		MLX5_SET(qpc, qpc, rq_type, MLX5_ZERO_LEN_RQ);
388 		MLX5_SET(qpc, qpc, no_sq, 1);
389 		return;
390 	}
391 
392 	MLX5_SET(qpc, qpc, st, MLX5_QP_ST_RC);
393 	MLX5_SET(qpc, qpc, pm_state, MLX5_QP_PM_MIGRATED);
394 	MLX5_SET(qpc, qpc, pd, ndev->mvdev.res.pdn);
395 	MLX5_SET(qpc, qpc, mtu, MLX5_QPC_MTU_256_BYTES);
396 	MLX5_SET(qpc, qpc, uar_page, ndev->mvdev.res.uar->index);
397 	MLX5_SET(qpc, qpc, log_page_size, vqp->frag_buf.page_shift - MLX5_ADAPTER_PAGE_SHIFT);
398 	MLX5_SET(qpc, qpc, no_sq, 1);
399 	MLX5_SET(qpc, qpc, cqn_rcv, mvq->cq.mcq.cqn);
400 	MLX5_SET(qpc, qpc, log_rq_size, ilog2(num_ent));
401 	MLX5_SET(qpc, qpc, rq_type, MLX5_NON_ZERO_RQ);
402 	pas = (__be64 *)MLX5_ADDR_OF(create_qp_in, in, pas);
403 	mlx5_fill_page_frag_array(&vqp->frag_buf, pas);
404 }
405 
406 static int rq_buf_alloc(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_qp *vqp, u32 num_ent)
407 {
408 	return mlx5_frag_buf_alloc_node(ndev->mvdev.mdev,
409 					num_ent * sizeof(struct mlx5_wqe_data_seg), &vqp->frag_buf,
410 					ndev->mvdev.mdev->priv.numa_node);
411 }
412 
413 static void rq_buf_free(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_qp *vqp)
414 {
415 	mlx5_frag_buf_free(ndev->mvdev.mdev, &vqp->frag_buf);
416 }
417 
418 static int qp_create(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq,
419 		     struct mlx5_vdpa_qp *vqp)
420 {
421 	struct mlx5_core_dev *mdev = ndev->mvdev.mdev;
422 	int inlen = MLX5_ST_SZ_BYTES(create_qp_in);
423 	u32 out[MLX5_ST_SZ_DW(create_qp_out)] = {};
424 	void *qpc;
425 	void *in;
426 	int err;
427 
428 	if (!vqp->fw) {
429 		vqp = &mvq->vqqp;
430 		err = rq_buf_alloc(ndev, vqp, mvq->num_ent);
431 		if (err)
432 			return err;
433 
434 		err = mlx5_db_alloc(ndev->mvdev.mdev, &vqp->db);
435 		if (err)
436 			goto err_db;
437 		inlen += vqp->frag_buf.npages * sizeof(__be64);
438 	}
439 
440 	in = kzalloc(inlen, GFP_KERNEL);
441 	if (!in) {
442 		err = -ENOMEM;
443 		goto err_kzalloc;
444 	}
445 
446 	qp_prepare(ndev, vqp->fw, in, mvq, mvq->num_ent);
447 	qpc = MLX5_ADDR_OF(create_qp_in, in, qpc);
448 	MLX5_SET(qpc, qpc, st, MLX5_QP_ST_RC);
449 	MLX5_SET(qpc, qpc, pm_state, MLX5_QP_PM_MIGRATED);
450 	MLX5_SET(qpc, qpc, pd, ndev->mvdev.res.pdn);
451 	MLX5_SET(qpc, qpc, mtu, MLX5_QPC_MTU_256_BYTES);
452 	if (!vqp->fw)
453 		MLX5_SET64(qpc, qpc, dbr_addr, vqp->db.dma);
454 	MLX5_SET(create_qp_in, in, opcode, MLX5_CMD_OP_CREATE_QP);
455 	err = mlx5_cmd_exec(mdev, in, inlen, out, sizeof(out));
456 	kfree(in);
457 	if (err)
458 		goto err_kzalloc;
459 
460 	vqp->mqp.uid = ndev->mvdev.res.uid;
461 	vqp->mqp.qpn = MLX5_GET(create_qp_out, out, qpn);
462 
463 	if (!vqp->fw)
464 		rx_post(vqp, mvq->num_ent);
465 
466 	return 0;
467 
468 err_kzalloc:
469 	if (!vqp->fw)
470 		mlx5_db_free(ndev->mvdev.mdev, &vqp->db);
471 err_db:
472 	if (!vqp->fw)
473 		rq_buf_free(ndev, vqp);
474 
475 	return err;
476 }
477 
478 static void qp_destroy(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_qp *vqp)
479 {
480 	u32 in[MLX5_ST_SZ_DW(destroy_qp_in)] = {};
481 
482 	MLX5_SET(destroy_qp_in, in, opcode, MLX5_CMD_OP_DESTROY_QP);
483 	MLX5_SET(destroy_qp_in, in, qpn, vqp->mqp.qpn);
484 	MLX5_SET(destroy_qp_in, in, uid, ndev->mvdev.res.uid);
485 	if (mlx5_cmd_exec_in(ndev->mvdev.mdev, destroy_qp, in))
486 		mlx5_vdpa_warn(&ndev->mvdev, "destroy qp 0x%x\n", vqp->mqp.qpn);
487 	if (!vqp->fw) {
488 		mlx5_db_free(ndev->mvdev.mdev, &vqp->db);
489 		rq_buf_free(ndev, vqp);
490 	}
491 }
492 
493 static void *next_cqe_sw(struct mlx5_vdpa_cq *cq)
494 {
495 	return get_sw_cqe(cq, cq->mcq.cons_index);
496 }
497 
498 static int mlx5_vdpa_poll_one(struct mlx5_vdpa_cq *vcq)
499 {
500 	struct mlx5_cqe64 *cqe64;
501 
502 	cqe64 = next_cqe_sw(vcq);
503 	if (!cqe64)
504 		return -EAGAIN;
505 
506 	vcq->mcq.cons_index++;
507 	return 0;
508 }
509 
510 static void mlx5_vdpa_handle_completions(struct mlx5_vdpa_virtqueue *mvq, int num)
511 {
512 	struct mlx5_vdpa_net *ndev = mvq->ndev;
513 	struct vdpa_callback *event_cb;
514 
515 	event_cb = &ndev->event_cbs[mvq->index];
516 	mlx5_cq_set_ci(&mvq->cq.mcq);
517 
518 	/* make sure the CQ consumer update is visible to the hardware before
519 	 * updating the RX doorbell record.
520 	 */
521 	dma_wmb();
522 	rx_post(&mvq->vqqp, num);
523 	if (event_cb->callback)
524 		event_cb->callback(event_cb->private);
525 }
526 
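/* Completion handler for the notification channel CQ: drain the CQ, flush
 * the doorbells and the vdpa event callback whenever more than num_ent / 2
 * completions accumulate (and once more for the remainder), then re-arm.
 */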
527 static void mlx5_vdpa_cq_comp(struct mlx5_core_cq *mcq, struct mlx5_eqe *eqe)
528 {
529 	struct mlx5_vdpa_virtqueue *mvq = container_of(mcq, struct mlx5_vdpa_virtqueue, cq.mcq);
530 	struct mlx5_vdpa_net *ndev = mvq->ndev;
531 	void __iomem *uar_page = ndev->mvdev.res.uar->map;
532 	int num = 0;
533 
534 	while (!mlx5_vdpa_poll_one(&mvq->cq)) {
535 		num++;
536 		if (num > mvq->num_ent / 2) {
537 			/* If completions keep coming while we poll, we want to
538 			 * let the hardware know that we consumed them by
539 			 * updating the doorbell record.  We also let vdpa core
540 			 * know about this so it passes it on to the virtio
541 			 * driver in the guest.
542 			 */
543 			mlx5_vdpa_handle_completions(mvq, num);
544 			num = 0;
545 		}
546 	}
547 
548 	if (num)
549 		mlx5_vdpa_handle_completions(mvq, num);
550 
551 	mlx5_cq_arm(&mvq->cq.mcq, MLX5_CQ_DB_REQ_NOT, uar_page, mvq->cq.mcq.cons_index);
552 }
553 
554 static int cq_create(struct mlx5_vdpa_net *ndev, u16 idx, u32 num_ent)
555 {
556 	struct mlx5_vdpa_virtqueue *mvq = &ndev->vqs[idx];
557 	struct mlx5_core_dev *mdev = ndev->mvdev.mdev;
558 	void __iomem *uar_page = ndev->mvdev.res.uar->map;
559 	u32 out[MLX5_ST_SZ_DW(create_cq_out)];
560 	struct mlx5_vdpa_cq *vcq = &mvq->cq;
561 	__be64 *pas;
562 	int inlen;
563 	void *cqc;
564 	void *in;
565 	int err;
566 	int eqn;
567 
568 	err = mlx5_db_alloc(mdev, &vcq->db);
569 	if (err)
570 		return err;
571 
572 	vcq->mcq.set_ci_db = vcq->db.db;
573 	vcq->mcq.arm_db = vcq->db.db + 1;
574 	vcq->mcq.cqe_sz = 64;
575 
576 	err = cq_frag_buf_alloc(ndev, &vcq->buf, num_ent);
577 	if (err)
578 		goto err_db;
579 
580 	cq_frag_buf_init(vcq, &vcq->buf);
581 
582 	inlen = MLX5_ST_SZ_BYTES(create_cq_in) +
583 		MLX5_FLD_SZ_BYTES(create_cq_in, pas[0]) * vcq->buf.frag_buf.npages;
584 	in = kzalloc(inlen, GFP_KERNEL);
585 	if (!in) {
586 		err = -ENOMEM;
587 		goto err_vzalloc;
588 	}
589 
590 	MLX5_SET(create_cq_in, in, uid, ndev->mvdev.res.uid);
591 	pas = (__be64 *)MLX5_ADDR_OF(create_cq_in, in, pas);
592 	mlx5_fill_page_frag_array(&vcq->buf.frag_buf, pas);
593 
594 	cqc = MLX5_ADDR_OF(create_cq_in, in, cq_context);
595 	MLX5_SET(cqc, cqc, log_page_size, vcq->buf.frag_buf.page_shift - MLX5_ADAPTER_PAGE_SHIFT);
596 
597 	/* Use vector 0 by default. Consider adding code to choose the least used
598 	 * vector.
599 	 */
600 	err = mlx5_comp_eqn_get(mdev, 0, &eqn);
601 	if (err)
602 		goto err_vec;
603 
604 	cqc = MLX5_ADDR_OF(create_cq_in, in, cq_context);
605 	MLX5_SET(cqc, cqc, log_cq_size, ilog2(num_ent));
606 	MLX5_SET(cqc, cqc, uar_page, ndev->mvdev.res.uar->index);
607 	MLX5_SET(cqc, cqc, c_eqn_or_apu_element, eqn);
608 	MLX5_SET64(cqc, cqc, dbr_addr, vcq->db.dma);
609 
610 	err = mlx5_core_create_cq(mdev, &vcq->mcq, in, inlen, out, sizeof(out));
611 	if (err)
612 		goto err_vec;
613 
614 	vcq->mcq.comp = mlx5_vdpa_cq_comp;
615 	vcq->cqe = num_ent;
616 	vcq->mcq.set_ci_db = vcq->db.db;
617 	vcq->mcq.arm_db = vcq->db.db + 1;
618 	mlx5_cq_arm(&mvq->cq.mcq, MLX5_CQ_DB_REQ_NOT, uar_page, mvq->cq.mcq.cons_index);
619 	kfree(in);
620 	return 0;
621 
622 err_vec:
623 	kfree(in);
624 err_vzalloc:
625 	cq_frag_buf_free(ndev, &vcq->buf);
626 err_db:
627 	mlx5_db_free(ndev->mvdev.mdev, &vcq->db);
628 	return err;
629 }
630 
631 static void cq_destroy(struct mlx5_vdpa_net *ndev, u16 idx)
632 {
633 	struct mlx5_vdpa_virtqueue *mvq = &ndev->vqs[idx];
634 	struct mlx5_core_dev *mdev = ndev->mvdev.mdev;
635 	struct mlx5_vdpa_cq *vcq = &mvq->cq;
636 
637 	if (mlx5_core_destroy_cq(mdev, &vcq->mcq)) {
638 		mlx5_vdpa_warn(&ndev->mvdev, "destroy CQ 0x%x\n", vcq->mcq.cqn);
639 		return;
640 	}
641 	cq_frag_buf_free(ndev, &vcq->buf);
642 	mlx5_db_free(ndev->mvdev.mdev, &vcq->db);
643 }
644 
645 static int read_umem_params(struct mlx5_vdpa_net *ndev)
646 {
647 	u32 in[MLX5_ST_SZ_DW(query_hca_cap_in)] = {};
648 	u16 opmod = (MLX5_CAP_VDPA_EMULATION << 1) | (HCA_CAP_OPMOD_GET_CUR & 0x01);
649 	struct mlx5_core_dev *mdev = ndev->mvdev.mdev;
650 	int out_size;
651 	void *caps;
652 	void *out;
653 	int err;
654 
655 	out_size = MLX5_ST_SZ_BYTES(query_hca_cap_out);
656 	out = kzalloc(out_size, GFP_KERNEL);
657 	if (!out)
658 		return -ENOMEM;
659 
660 	MLX5_SET(query_hca_cap_in, in, opcode, MLX5_CMD_OP_QUERY_HCA_CAP);
661 	MLX5_SET(query_hca_cap_in, in, op_mod, opmod);
662 	err = mlx5_cmd_exec_inout(mdev, query_hca_cap, in, out);
663 	if (err) {
664 		mlx5_vdpa_warn(&ndev->mvdev,
665 			"Failed reading vdpa umem capabilities with err %d\n", err);
666 		goto out;
667 	}
668 
669 	caps =  MLX5_ADDR_OF(query_hca_cap_out, out, capability);
670 
671 	ndev->umem_1_buffer_param_a = MLX5_GET(virtio_emulation_cap, caps, umem_1_buffer_param_a);
672 	ndev->umem_1_buffer_param_b = MLX5_GET(virtio_emulation_cap, caps, umem_1_buffer_param_b);
673 
674 	ndev->umem_2_buffer_param_a = MLX5_GET(virtio_emulation_cap, caps, umem_2_buffer_param_a);
675 	ndev->umem_2_buffer_param_b = MLX5_GET(virtio_emulation_cap, caps, umem_2_buffer_param_b);
676 
677 	ndev->umem_3_buffer_param_a = MLX5_GET(virtio_emulation_cap, caps, umem_3_buffer_param_a);
678 	ndev->umem_3_buffer_param_b = MLX5_GET(virtio_emulation_cap, caps, umem_3_buffer_param_b);
679 
680 out:
681 	kfree(out);
682 	return 0;
683 }
684 
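/* Each virtqueue needs three umem buffers whose sizes are derived from the
 * per-umem parameters reported by the device:
 * size = buffer_param_a * queue_size + buffer_param_b.
 */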
685 static void set_umem_size(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq, int num,
686 			  struct mlx5_vdpa_umem **umemp)
687 {
688 	u32 p_a;
689 	u32 p_b;
690 
691 	switch (num) {
692 	case 1:
693 		p_a = ndev->umem_1_buffer_param_a;
694 		p_b = ndev->umem_1_buffer_param_b;
695 		*umemp = &mvq->umem1;
696 		break;
697 	case 2:
698 		p_a = ndev->umem_2_buffer_param_a;
699 		p_b = ndev->umem_2_buffer_param_b;
700 		*umemp = &mvq->umem2;
701 		break;
702 	case 3:
703 		p_a = ndev->umem_3_buffer_param_a;
704 		p_b = ndev->umem_3_buffer_param_b;
705 		*umemp = &mvq->umem3;
706 		break;
707 	}
708 
709 	(*umemp)->size = p_a * mvq->num_ent + p_b;
710 }
711 
712 static void umem_frag_buf_free(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_umem *umem)
713 {
714 	mlx5_frag_buf_free(ndev->mvdev.mdev, &umem->frag_buf);
715 }
716 
717 static int create_umem(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq, int num)
718 {
719 	int inlen;
720 	u32 out[MLX5_ST_SZ_DW(create_umem_out)] = {};
721 	void *um;
722 	void *in;
723 	int err;
724 	__be64 *pas;
725 	struct mlx5_vdpa_umem *umem;
726 
727 	set_umem_size(ndev, mvq, num, &umem);
728 	err = umem_frag_buf_alloc(ndev, umem, umem->size);
729 	if (err)
730 		return err;
731 
732 	inlen = MLX5_ST_SZ_BYTES(create_umem_in) + MLX5_ST_SZ_BYTES(mtt) * umem->frag_buf.npages;
733 
734 	in = kzalloc(inlen, GFP_KERNEL);
735 	if (!in) {
736 		err = -ENOMEM;
737 		goto err_in;
738 	}
739 
740 	MLX5_SET(create_umem_in, in, opcode, MLX5_CMD_OP_CREATE_UMEM);
741 	MLX5_SET(create_umem_in, in, uid, ndev->mvdev.res.uid);
742 	um = MLX5_ADDR_OF(create_umem_in, in, umem);
743 	MLX5_SET(umem, um, log_page_size, umem->frag_buf.page_shift - MLX5_ADAPTER_PAGE_SHIFT);
744 	MLX5_SET64(umem, um, num_of_mtt, umem->frag_buf.npages);
745 
746 	pas = (__be64 *)MLX5_ADDR_OF(umem, um, mtt[0]);
747 	mlx5_fill_page_frag_array_perm(&umem->frag_buf, pas, MLX5_MTT_PERM_RW);
748 
749 	err = mlx5_cmd_exec(ndev->mvdev.mdev, in, inlen, out, sizeof(out));
750 	if (err) {
751 		mlx5_vdpa_warn(&ndev->mvdev, "create umem(%d)\n", err);
752 		goto err_cmd;
753 	}
754 
755 	kfree(in);
756 	umem->id = MLX5_GET(create_umem_out, out, umem_id);
757 
758 	return 0;
759 
760 err_cmd:
761 	kfree(in);
762 err_in:
763 	umem_frag_buf_free(ndev, umem);
764 	return err;
765 }
766 
767 static void umem_destroy(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq, int num)
768 {
769 	u32 in[MLX5_ST_SZ_DW(destroy_umem_in)] = {};
770 	u32 out[MLX5_ST_SZ_DW(destroy_umem_out)] = {};
771 	struct mlx5_vdpa_umem *umem;
772 
773 	switch (num) {
774 	case 1:
775 		umem = &mvq->umem1;
776 		break;
777 	case 2:
778 		umem = &mvq->umem2;
779 		break;
780 	case 3:
781 		umem = &mvq->umem3;
782 		break;
783 	}
784 
785 	MLX5_SET(destroy_umem_in, in, opcode, MLX5_CMD_OP_DESTROY_UMEM);
786 	MLX5_SET(destroy_umem_in, in, umem_id, umem->id);
787 	if (mlx5_cmd_exec(ndev->mvdev.mdev, in, sizeof(in), out, sizeof(out)))
788 		return;
789 
790 	umem_frag_buf_free(ndev, umem);
791 }
792 
793 static int umems_create(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq)
794 {
795 	int num;
796 	int err;
797 
798 	for (num = 1; num <= 3; num++) {
799 		err = create_umem(ndev, mvq, num);
800 		if (err)
801 			goto err_umem;
802 	}
803 	return 0;
804 
805 err_umem:
806 	for (num--; num > 0; num--)
807 		umem_destroy(ndev, mvq, num);
808 
809 	return err;
810 }
811 
812 static void umems_destroy(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq)
813 {
814 	int num;
815 
816 	for (num = 3; num > 0; num--)
817 		umem_destroy(ndev, mvq, num);
818 }
819 
820 static int get_queue_type(struct mlx5_vdpa_net *ndev)
821 {
822 	u32 type_mask;
823 
824 	type_mask = MLX5_CAP_DEV_VDPA_EMULATION(ndev->mvdev.mdev, virtio_queue_type);
825 
826 	/* prefer split queue */
827 	if (type_mask & MLX5_VIRTIO_EMULATION_CAP_VIRTIO_QUEUE_TYPE_SPLIT)
828 		return MLX5_VIRTIO_EMULATION_VIRTIO_QUEUE_TYPE_SPLIT;
829 
830 	WARN_ON(!(type_mask & MLX5_VIRTIO_EMULATION_CAP_VIRTIO_QUEUE_TYPE_PACKED));
831 
832 	return MLX5_VIRTIO_EMULATION_VIRTIO_QUEUE_TYPE_PACKED;
833 }
834 
835 static bool vq_is_tx(u16 idx)
836 {
837 	return idx % 2;
838 }
839 
840 enum {
841 	MLX5_VIRTIO_NET_F_MRG_RXBUF = 2,
842 	MLX5_VIRTIO_NET_F_HOST_ECN = 4,
843 	MLX5_VIRTIO_NET_F_GUEST_ECN = 6,
844 	MLX5_VIRTIO_NET_F_GUEST_TSO6 = 7,
845 	MLX5_VIRTIO_NET_F_GUEST_TSO4 = 8,
846 	MLX5_VIRTIO_NET_F_GUEST_CSUM = 9,
847 	MLX5_VIRTIO_NET_F_CSUM = 10,
848 	MLX5_VIRTIO_NET_F_HOST_TSO6 = 11,
849 	MLX5_VIRTIO_NET_F_HOST_TSO4 = 12,
850 };
851 
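/* Translate negotiated virtio-net offload feature bits into the 16-bit
 * feature mask layout used by the virtqueue object (see the
 * MLX5_VIRTIO_NET_F_* enum above).
 */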
852 static u16 get_features(u64 features)
853 {
854 	return (!!(features & BIT_ULL(VIRTIO_NET_F_MRG_RXBUF)) << MLX5_VIRTIO_NET_F_MRG_RXBUF) |
855 	       (!!(features & BIT_ULL(VIRTIO_NET_F_HOST_ECN)) << MLX5_VIRTIO_NET_F_HOST_ECN) |
856 	       (!!(features & BIT_ULL(VIRTIO_NET_F_GUEST_ECN)) << MLX5_VIRTIO_NET_F_GUEST_ECN) |
857 	       (!!(features & BIT_ULL(VIRTIO_NET_F_GUEST_TSO6)) << MLX5_VIRTIO_NET_F_GUEST_TSO6) |
858 	       (!!(features & BIT_ULL(VIRTIO_NET_F_GUEST_TSO4)) << MLX5_VIRTIO_NET_F_GUEST_TSO4) |
859 	       (!!(features & BIT_ULL(VIRTIO_NET_F_CSUM)) << MLX5_VIRTIO_NET_F_CSUM) |
860 	       (!!(features & BIT_ULL(VIRTIO_NET_F_HOST_TSO6)) << MLX5_VIRTIO_NET_F_HOST_TSO6) |
861 	       (!!(features & BIT_ULL(VIRTIO_NET_F_HOST_TSO4)) << MLX5_VIRTIO_NET_F_HOST_TSO4);
862 }
863 
864 static bool counters_supported(const struct mlx5_vdpa_dev *mvdev)
865 {
866 	return MLX5_CAP_GEN_64(mvdev->mdev, general_obj_types) &
867 	       BIT_ULL(MLX5_OBJ_TYPE_VIRTIO_Q_COUNTERS);
868 }
869 
870 static bool msix_mode_supported(struct mlx5_vdpa_dev *mvdev)
871 {
872 	return MLX5_CAP_DEV_VDPA_EMULATION(mvdev->mdev, event_mode) &
873 		(1 << MLX5_VIRTIO_Q_EVENT_MODE_MSIX_MODE) &&
874 		pci_msix_can_alloc_dyn(mvdev->mdev->pdev);
875 }
876 
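/* Create the VIRTIO_NET_Q object in firmware. When filled is true, the ring
 * addresses, indices and memory keys are programmed immediately; otherwise
 * the existing memory keys are only flagged in modified_fields so that a
 * later modify command programs them before the queue is made ready.
 */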
877 static int create_virtqueue(struct mlx5_vdpa_net *ndev,
878 			    struct mlx5_vdpa_virtqueue *mvq,
879 			    bool filled)
880 {
881 	int inlen = MLX5_ST_SZ_BYTES(create_virtio_net_q_in);
882 	u32 out[MLX5_ST_SZ_DW(create_virtio_net_q_out)] = {};
883 	struct mlx5_vdpa_dev *mvdev = &ndev->mvdev;
884 	struct mlx5_vdpa_mr *vq_mr;
885 	struct mlx5_vdpa_mr *vq_desc_mr;
886 	u64 features = filled ? mvdev->actual_features : mvdev->mlx_features;
887 	void *obj_context;
888 	u16 mlx_features;
889 	void *cmd_hdr;
890 	void *vq_ctx;
891 	void *in;
892 	int err;
893 
894 	err = umems_create(ndev, mvq);
895 	if (err)
896 		return err;
897 
898 	in = kzalloc(inlen, GFP_KERNEL);
899 	if (!in) {
900 		err = -ENOMEM;
901 		goto err_alloc;
902 	}
903 
904 	mlx_features = get_features(features);
905 	cmd_hdr = MLX5_ADDR_OF(create_virtio_net_q_in, in, general_obj_in_cmd_hdr);
906 
907 	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, opcode, MLX5_CMD_OP_CREATE_GENERAL_OBJECT);
908 	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, obj_type, MLX5_OBJ_TYPE_VIRTIO_NET_Q);
909 	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, uid, ndev->mvdev.res.uid);
910 
911 	obj_context = MLX5_ADDR_OF(create_virtio_net_q_in, in, obj_context);
912 	MLX5_SET(virtio_net_q_object, obj_context, queue_feature_bit_mask_12_3,
913 		 mlx_features >> 3);
914 	MLX5_SET(virtio_net_q_object, obj_context, queue_feature_bit_mask_2_0,
915 		 mlx_features & 7);
916 	vq_ctx = MLX5_ADDR_OF(virtio_net_q_object, obj_context, virtio_q_context);
917 	MLX5_SET(virtio_q, vq_ctx, virtio_q_type, get_queue_type(ndev));
918 
919 	if (vq_is_tx(mvq->index))
920 		MLX5_SET(virtio_net_q_object, obj_context, tisn_or_qpn, ndev->res.tisn);
921 
922 	if (mvq->map.virq) {
923 		MLX5_SET(virtio_q, vq_ctx, event_mode, MLX5_VIRTIO_Q_EVENT_MODE_MSIX_MODE);
924 		MLX5_SET(virtio_q, vq_ctx, event_qpn_or_msix, mvq->map.index);
925 	} else {
926 		MLX5_SET(virtio_q, vq_ctx, event_mode, MLX5_VIRTIO_Q_EVENT_MODE_QP_MODE);
927 		MLX5_SET(virtio_q, vq_ctx, event_qpn_or_msix, mvq->fwqp.mqp.qpn);
928 	}
929 
930 	MLX5_SET(virtio_q, vq_ctx, queue_index, mvq->index);
931 	MLX5_SET(virtio_q, vq_ctx, queue_size, mvq->num_ent);
932 	MLX5_SET(virtio_q, vq_ctx, virtio_version_1_0,
933 		 !!(features & BIT_ULL(VIRTIO_F_VERSION_1)));
934 
935 	if (filled) {
936 		MLX5_SET(virtio_net_q_object, obj_context, hw_available_index, mvq->avail_idx);
937 		MLX5_SET(virtio_net_q_object, obj_context, hw_used_index, mvq->used_idx);
938 
939 		MLX5_SET64(virtio_q, vq_ctx, desc_addr, mvq->desc_addr);
940 		MLX5_SET64(virtio_q, vq_ctx, used_addr, mvq->device_addr);
941 		MLX5_SET64(virtio_q, vq_ctx, available_addr, mvq->driver_addr);
942 
943 		vq_mr = mvdev->mr[mvdev->group2asid[MLX5_VDPA_DATAVQ_GROUP]];
944 		if (vq_mr)
945 			MLX5_SET(virtio_q, vq_ctx, virtio_q_mkey, vq_mr->mkey);
946 
947 		vq_desc_mr = mvdev->mr[mvdev->group2asid[MLX5_VDPA_DATAVQ_DESC_GROUP]];
948 		if (vq_desc_mr &&
949 		    MLX5_CAP_DEV_VDPA_EMULATION(mvdev->mdev, desc_group_mkey_supported))
950 			MLX5_SET(virtio_q, vq_ctx, desc_group_mkey, vq_desc_mr->mkey);
951 	} else {
952 		/* If there is no mr update now, make sure the existing mkeys are
953 		 * programmed when the queue is later modified to ready.
954 		 */
955 		vq_mr = mvdev->mr[mvdev->group2asid[MLX5_VDPA_DATAVQ_GROUP]];
956 		if (vq_mr)
957 			mvq->modified_fields |= MLX5_VIRTQ_MODIFY_MASK_VIRTIO_Q_MKEY;
958 
959 		vq_desc_mr = mvdev->mr[mvdev->group2asid[MLX5_VDPA_DATAVQ_DESC_GROUP]];
960 		if (vq_desc_mr)
961 			mvq->modified_fields |= MLX5_VIRTQ_MODIFY_MASK_DESC_GROUP_MKEY;
962 	}
963 
964 	MLX5_SET(virtio_q, vq_ctx, umem_1_id, mvq->umem1.id);
965 	MLX5_SET(virtio_q, vq_ctx, umem_1_size, mvq->umem1.size);
966 	MLX5_SET(virtio_q, vq_ctx, umem_2_id, mvq->umem2.id);
967 	MLX5_SET(virtio_q, vq_ctx, umem_2_size, mvq->umem2.size);
968 	MLX5_SET(virtio_q, vq_ctx, umem_3_id, mvq->umem3.id);
969 	MLX5_SET(virtio_q, vq_ctx, umem_3_size, mvq->umem3.size);
970 	MLX5_SET(virtio_q, vq_ctx, pd, ndev->mvdev.res.pdn);
971 	if (counters_supported(&ndev->mvdev))
972 		MLX5_SET(virtio_q, vq_ctx, counter_set_id, mvq->counter_set_id);
973 
974 	err = mlx5_cmd_exec(ndev->mvdev.mdev, in, inlen, out, sizeof(out));
975 	if (err)
976 		goto err_cmd;
977 
978 	mvq->fw_state = MLX5_VIRTIO_NET_Q_OBJECT_STATE_INIT;
979 	kfree(in);
980 	mvq->virtq_id = MLX5_GET(general_obj_out_cmd_hdr, out, obj_id);
981 
982 	if (filled) {
983 		mlx5_vdpa_get_mr(mvdev, vq_mr);
984 		mvq->vq_mr = vq_mr;
985 
986 		if (vq_desc_mr &&
987 		    MLX5_CAP_DEV_VDPA_EMULATION(mvdev->mdev, desc_group_mkey_supported)) {
988 			mlx5_vdpa_get_mr(mvdev, vq_desc_mr);
989 			mvq->desc_mr = vq_desc_mr;
990 		}
991 	}
992 
993 	return 0;
994 
995 err_cmd:
996 	kfree(in);
997 err_alloc:
998 	umems_destroy(ndev, mvq);
999 	return err;
1000 }
1001 
1002 static void destroy_virtqueue(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq)
1003 {
1004 	u32 in[MLX5_ST_SZ_DW(destroy_virtio_net_q_in)] = {};
1005 	u32 out[MLX5_ST_SZ_DW(destroy_virtio_net_q_out)] = {};
1006 
1007 	MLX5_SET(destroy_virtio_net_q_in, in, general_obj_out_cmd_hdr.opcode,
1008 		 MLX5_CMD_OP_DESTROY_GENERAL_OBJECT);
1009 	MLX5_SET(destroy_virtio_net_q_in, in, general_obj_out_cmd_hdr.obj_id, mvq->virtq_id);
1010 	MLX5_SET(destroy_virtio_net_q_in, in, general_obj_out_cmd_hdr.uid, ndev->mvdev.res.uid);
1011 	MLX5_SET(destroy_virtio_net_q_in, in, general_obj_out_cmd_hdr.obj_type,
1012 		 MLX5_OBJ_TYPE_VIRTIO_NET_Q);
1013 	if (mlx5_cmd_exec(ndev->mvdev.mdev, in, sizeof(in), out, sizeof(out))) {
1014 		mlx5_vdpa_warn(&ndev->mvdev, "destroy virtqueue 0x%x\n", mvq->virtq_id);
1015 		return;
1016 	}
1017 	mvq->fw_state = MLX5_VIRTIO_NET_Q_OBJECT_NONE;
1018 	umems_destroy(ndev, mvq);
1019 
1020 	mlx5_vdpa_put_mr(&ndev->mvdev, mvq->vq_mr);
1021 	mvq->vq_mr = NULL;
1022 
1023 	mlx5_vdpa_put_mr(&ndev->mvdev, mvq->desc_mr);
1024 	mvq->desc_mr = NULL;
1025 }
1026 
1027 static u32 get_rqpn(struct mlx5_vdpa_virtqueue *mvq, bool fw)
1028 {
1029 	return fw ? mvq->vqqp.mqp.qpn : mvq->fwqp.mqp.qpn;
1030 }
1031 
1032 static u32 get_qpn(struct mlx5_vdpa_virtqueue *mvq, bool fw)
1033 {
1034 	return fw ? mvq->fwqp.mqp.qpn : mvq->vqqp.mqp.qpn;
1035 }
1036 
1037 static void alloc_inout(struct mlx5_vdpa_net *ndev, int cmd, void **in, int *inlen, void **out,
1038 			int *outlen, u32 qpn, u32 rqpn)
1039 {
1040 	void *qpc;
1041 	void *pp;
1042 
1043 	switch (cmd) {
1044 	case MLX5_CMD_OP_2RST_QP:
1045 		*inlen = MLX5_ST_SZ_BYTES(qp_2rst_in);
1046 		*outlen = MLX5_ST_SZ_BYTES(qp_2rst_out);
1047 		*in = kzalloc(*inlen, GFP_KERNEL);
1048 		*out = kzalloc(*outlen, GFP_KERNEL);
1049 		if (!*in || !*out)
1050 			goto outerr;
1051 
1052 		MLX5_SET(qp_2rst_in, *in, opcode, cmd);
1053 		MLX5_SET(qp_2rst_in, *in, uid, ndev->mvdev.res.uid);
1054 		MLX5_SET(qp_2rst_in, *in, qpn, qpn);
1055 		break;
1056 	case MLX5_CMD_OP_RST2INIT_QP:
1057 		*inlen = MLX5_ST_SZ_BYTES(rst2init_qp_in);
1058 		*outlen = MLX5_ST_SZ_BYTES(rst2init_qp_out);
1059 		*in = kzalloc(*inlen, GFP_KERNEL);
1060 		*out = kzalloc(MLX5_ST_SZ_BYTES(rst2init_qp_out), GFP_KERNEL);
1061 		if (!*in || !*out)
1062 			goto outerr;
1063 
1064 		MLX5_SET(rst2init_qp_in, *in, opcode, cmd);
1065 		MLX5_SET(rst2init_qp_in, *in, uid, ndev->mvdev.res.uid);
1066 		MLX5_SET(rst2init_qp_in, *in, qpn, qpn);
1067 		qpc = MLX5_ADDR_OF(rst2init_qp_in, *in, qpc);
1068 		MLX5_SET(qpc, qpc, remote_qpn, rqpn);
1069 		MLX5_SET(qpc, qpc, rwe, 1);
1070 		pp = MLX5_ADDR_OF(qpc, qpc, primary_address_path);
1071 		MLX5_SET(ads, pp, vhca_port_num, 1);
1072 		break;
1073 	case MLX5_CMD_OP_INIT2RTR_QP:
1074 		*inlen = MLX5_ST_SZ_BYTES(init2rtr_qp_in);
1075 		*outlen = MLX5_ST_SZ_BYTES(init2rtr_qp_out);
1076 		*in = kzalloc(*inlen, GFP_KERNEL);
1077 		*out = kzalloc(MLX5_ST_SZ_BYTES(init2rtr_qp_out), GFP_KERNEL);
1078 		if (!*in || !*out)
1079 			goto outerr;
1080 
1081 		MLX5_SET(init2rtr_qp_in, *in, opcode, cmd);
1082 		MLX5_SET(init2rtr_qp_in, *in, uid, ndev->mvdev.res.uid);
1083 		MLX5_SET(init2rtr_qp_in, *in, qpn, qpn);
1084 		qpc = MLX5_ADDR_OF(rst2init_qp_in, *in, qpc);
1085 		MLX5_SET(qpc, qpc, mtu, MLX5_QPC_MTU_256_BYTES);
1086 		MLX5_SET(qpc, qpc, log_msg_max, 30);
1087 		MLX5_SET(qpc, qpc, remote_qpn, rqpn);
1088 		pp = MLX5_ADDR_OF(qpc, qpc, primary_address_path);
1089 		MLX5_SET(ads, pp, fl, 1);
1090 		break;
1091 	case MLX5_CMD_OP_RTR2RTS_QP:
1092 		*inlen = MLX5_ST_SZ_BYTES(rtr2rts_qp_in);
1093 		*outlen = MLX5_ST_SZ_BYTES(rtr2rts_qp_out);
1094 		*in = kzalloc(*inlen, GFP_KERNEL);
1095 		*out = kzalloc(MLX5_ST_SZ_BYTES(rtr2rts_qp_out), GFP_KERNEL);
1096 		if (!*in || !*out)
1097 			goto outerr;
1098 
1099 		MLX5_SET(rtr2rts_qp_in, *in, opcode, cmd);
1100 		MLX5_SET(rtr2rts_qp_in, *in, uid, ndev->mvdev.res.uid);
1101 		MLX5_SET(rtr2rts_qp_in, *in, qpn, qpn);
1102 		qpc = MLX5_ADDR_OF(rst2init_qp_in, *in, qpc);
1103 		pp = MLX5_ADDR_OF(qpc, qpc, primary_address_path);
1104 		MLX5_SET(ads, pp, ack_timeout, 14);
1105 		MLX5_SET(qpc, qpc, retry_count, 7);
1106 		MLX5_SET(qpc, qpc, rnr_retry, 7);
1107 		break;
1108 	default:
1109 		goto outerr_nullify;
1110 	}
1111 
1112 	return;
1113 
1114 outerr:
1115 	kfree(*in);
1116 	kfree(*out);
1117 outerr_nullify:
1118 	*in = NULL;
1119 	*out = NULL;
1120 }
1121 
1122 static void free_inout(void *in, void *out)
1123 {
1124 	kfree(in);
1125 	kfree(out);
1126 }
1127 
1128 /* Two QPs are used by each virtqueue. One is used by the driver and one by
1129  * firmware. The fw argument indicates whether the QP being modified is the
1130  * one used by firmware.
1131  */
1132 static int modify_qp(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq, bool fw, int cmd)
1133 {
1134 	int outlen;
1135 	int inlen;
1136 	void *out;
1137 	void *in;
1138 	int err;
1139 
1140 	alloc_inout(ndev, cmd, &in, &inlen, &out, &outlen, get_qpn(mvq, fw), get_rqpn(mvq, fw));
1141 	if (!in || !out)
1142 		return -ENOMEM;
1143 
1144 	err = mlx5_cmd_exec(ndev->mvdev.mdev, in, inlen, out, outlen);
1145 	free_inout(in, out);
1146 	return err;
1147 }
1148 
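/* Walk both QPs of the notification channel through the RC state machine
 * (RST -> INIT -> RTR, plus RTS for the firmware QP), each one pointing at
 * the other as its remote, so they form a loopback connection.
 */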
1149 static int connect_qps(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq)
1150 {
1151 	int err;
1152 
1153 	err = modify_qp(ndev, mvq, true, MLX5_CMD_OP_2RST_QP);
1154 	if (err)
1155 		return err;
1156 
1157 	err = modify_qp(ndev, mvq, false, MLX5_CMD_OP_2RST_QP);
1158 	if (err)
1159 		return err;
1160 
1161 	err = modify_qp(ndev, mvq, true, MLX5_CMD_OP_RST2INIT_QP);
1162 	if (err)
1163 		return err;
1164 
1165 	err = modify_qp(ndev, mvq, false, MLX5_CMD_OP_RST2INIT_QP);
1166 	if (err)
1167 		return err;
1168 
1169 	err = modify_qp(ndev, mvq, true, MLX5_CMD_OP_INIT2RTR_QP);
1170 	if (err)
1171 		return err;
1172 
1173 	err = modify_qp(ndev, mvq, false, MLX5_CMD_OP_INIT2RTR_QP);
1174 	if (err)
1175 		return err;
1176 
1177 	return modify_qp(ndev, mvq, true, MLX5_CMD_OP_RTR2RTS_QP);
1178 }
1179 
1180 struct mlx5_virtq_attr {
1181 	u8 state;
1182 	u16 available_index;
1183 	u16 used_index;
1184 };
1185 
1186 static int query_virtqueue(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq,
1187 			   struct mlx5_virtq_attr *attr)
1188 {
1189 	int outlen = MLX5_ST_SZ_BYTES(query_virtio_net_q_out);
1190 	u32 in[MLX5_ST_SZ_DW(query_virtio_net_q_in)] = {};
1191 	void *out;
1192 	void *obj_context;
1193 	void *cmd_hdr;
1194 	int err;
1195 
1196 	out = kzalloc(outlen, GFP_KERNEL);
1197 	if (!out)
1198 		return -ENOMEM;
1199 
1200 	cmd_hdr = MLX5_ADDR_OF(query_virtio_net_q_in, in, general_obj_in_cmd_hdr);
1201 
1202 	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, opcode, MLX5_CMD_OP_QUERY_GENERAL_OBJECT);
1203 	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, obj_type, MLX5_OBJ_TYPE_VIRTIO_NET_Q);
1204 	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, obj_id, mvq->virtq_id);
1205 	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, uid, ndev->mvdev.res.uid);
1206 	err = mlx5_cmd_exec(ndev->mvdev.mdev, in, sizeof(in), out, outlen);
1207 	if (err)
1208 		goto err_cmd;
1209 
1210 	obj_context = MLX5_ADDR_OF(query_virtio_net_q_out, out, obj_context);
1211 	memset(attr, 0, sizeof(*attr));
1212 	attr->state = MLX5_GET(virtio_net_q_object, obj_context, state);
1213 	attr->available_index = MLX5_GET(virtio_net_q_object, obj_context, hw_available_index);
1214 	attr->used_index = MLX5_GET(virtio_net_q_object, obj_context, hw_used_index);
1215 	kfree(out);
1216 	return 0;
1217 
1218 err_cmd:
1219 	kfree(out);
1220 	return err;
1221 }
1222 
1223 static bool is_resumable(struct mlx5_vdpa_net *ndev)
1224 {
1225 	return ndev->mvdev.vdev.config->resume;
1226 }
1227 
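/* Virtqueue objects may only move INIT -> RDY -> SUSPEND, and back from
 * SUSPEND to RDY when the device supports resume.
 */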
1228 static bool is_valid_state_change(int oldstate, int newstate, bool resumable)
1229 {
1230 	switch (oldstate) {
1231 	case MLX5_VIRTIO_NET_Q_OBJECT_STATE_INIT:
1232 		return newstate == MLX5_VIRTIO_NET_Q_OBJECT_STATE_RDY;
1233 	case MLX5_VIRTIO_NET_Q_OBJECT_STATE_RDY:
1234 		return newstate == MLX5_VIRTIO_NET_Q_OBJECT_STATE_SUSPEND;
1235 	case MLX5_VIRTIO_NET_Q_OBJECT_STATE_SUSPEND:
1236 		return resumable ? newstate == MLX5_VIRTIO_NET_Q_OBJECT_STATE_RDY : false;
1237 	case MLX5_VIRTIO_NET_Q_OBJECT_STATE_ERR:
1238 	default:
1239 		return false;
1240 	}
1241 }
1242 
1243 static bool modifiable_virtqueue_fields(struct mlx5_vdpa_virtqueue *mvq)
1244 {
1245 	/* Only state is always modifiable */
1246 	if (mvq->modified_fields & ~MLX5_VIRTQ_MODIFY_MASK_STATE)
1247 		return mvq->fw_state == MLX5_VIRTIO_NET_Q_OBJECT_STATE_INIT ||
1248 		       mvq->fw_state == MLX5_VIRTIO_NET_Q_OBJECT_STATE_SUSPEND;
1249 
1250 	return true;
1251 }
1252 
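/* Modify the virtqueue object in firmware. Only the fields flagged in
 * mvq->modified_fields are written; a requested state change is validated
 * against the state machine first, and MR references are swapped only after
 * the command succeeds.
 */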
1253 static int modify_virtqueue(struct mlx5_vdpa_net *ndev,
1254 			    struct mlx5_vdpa_virtqueue *mvq,
1255 			    int state)
1256 {
1257 	int inlen = MLX5_ST_SZ_BYTES(modify_virtio_net_q_in);
1258 	u32 out[MLX5_ST_SZ_DW(modify_virtio_net_q_out)] = {};
1259 	struct mlx5_vdpa_dev *mvdev = &ndev->mvdev;
1260 	struct mlx5_vdpa_mr *desc_mr = NULL;
1261 	struct mlx5_vdpa_mr *vq_mr = NULL;
1262 	bool state_change = false;
1263 	void *obj_context;
1264 	void *cmd_hdr;
1265 	void *vq_ctx;
1266 	void *in;
1267 	int err;
1268 
1269 	if (mvq->fw_state == MLX5_VIRTIO_NET_Q_OBJECT_NONE)
1270 		return 0;
1271 
1272 	if (!modifiable_virtqueue_fields(mvq))
1273 		return -EINVAL;
1274 
1275 	in = kzalloc(inlen, GFP_KERNEL);
1276 	if (!in)
1277 		return -ENOMEM;
1278 
1279 	cmd_hdr = MLX5_ADDR_OF(modify_virtio_net_q_in, in, general_obj_in_cmd_hdr);
1280 
1281 	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, opcode, MLX5_CMD_OP_MODIFY_GENERAL_OBJECT);
1282 	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, obj_type, MLX5_OBJ_TYPE_VIRTIO_NET_Q);
1283 	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, obj_id, mvq->virtq_id);
1284 	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, uid, ndev->mvdev.res.uid);
1285 
1286 	obj_context = MLX5_ADDR_OF(modify_virtio_net_q_in, in, obj_context);
1287 	vq_ctx = MLX5_ADDR_OF(virtio_net_q_object, obj_context, virtio_q_context);
1288 
1289 	if (mvq->modified_fields & MLX5_VIRTQ_MODIFY_MASK_STATE) {
1290 		if (!is_valid_state_change(mvq->fw_state, state, is_resumable(ndev))) {
1291 			err = -EINVAL;
1292 			goto done;
1293 		}
1294 
1295 		MLX5_SET(virtio_net_q_object, obj_context, state, state);
1296 		state_change = true;
1297 	}
1298 
1299 	if (mvq->modified_fields & MLX5_VIRTQ_MODIFY_MASK_VIRTIO_Q_ADDRS) {
1300 		MLX5_SET64(virtio_q, vq_ctx, desc_addr, mvq->desc_addr);
1301 		MLX5_SET64(virtio_q, vq_ctx, used_addr, mvq->device_addr);
1302 		MLX5_SET64(virtio_q, vq_ctx, available_addr, mvq->driver_addr);
1303 	}
1304 
1305 	if (mvq->modified_fields & MLX5_VIRTQ_MODIFY_MASK_VIRTIO_Q_AVAIL_IDX)
1306 		MLX5_SET(virtio_net_q_object, obj_context, hw_available_index, mvq->avail_idx);
1307 
1308 	if (mvq->modified_fields & MLX5_VIRTQ_MODIFY_MASK_VIRTIO_Q_USED_IDX)
1309 		MLX5_SET(virtio_net_q_object, obj_context, hw_used_index, mvq->used_idx);
1310 
1311 	if (mvq->modified_fields & MLX5_VIRTQ_MODIFY_MASK_QUEUE_VIRTIO_VERSION)
1312 		MLX5_SET(virtio_q, vq_ctx, virtio_version_1_0,
1313 			!!(ndev->mvdev.actual_features & BIT_ULL(VIRTIO_F_VERSION_1)));
1314 
1315 	if (mvq->modified_fields & MLX5_VIRTQ_MODIFY_MASK_QUEUE_FEATURES) {
1316 		u16 mlx_features = get_features(ndev->mvdev.actual_features);
1317 
1318 		MLX5_SET(virtio_net_q_object, obj_context, queue_feature_bit_mask_12_3,
1319 			 mlx_features >> 3);
1320 		MLX5_SET(virtio_net_q_object, obj_context, queue_feature_bit_mask_2_0,
1321 			 mlx_features & 7);
1322 	}
1323 
1324 	if (mvq->modified_fields & MLX5_VIRTQ_MODIFY_MASK_VIRTIO_Q_MKEY) {
1325 		vq_mr = mvdev->mr[mvdev->group2asid[MLX5_VDPA_DATAVQ_GROUP]];
1326 
1327 		if (vq_mr)
1328 			MLX5_SET(virtio_q, vq_ctx, virtio_q_mkey, vq_mr->mkey);
1329 		else
1330 			mvq->modified_fields &= ~MLX5_VIRTQ_MODIFY_MASK_VIRTIO_Q_MKEY;
1331 	}
1332 
1333 	if (mvq->modified_fields & MLX5_VIRTQ_MODIFY_MASK_DESC_GROUP_MKEY) {
1334 		desc_mr = mvdev->mr[mvdev->group2asid[MLX5_VDPA_DATAVQ_DESC_GROUP]];
1335 
1336 		if (desc_mr && MLX5_CAP_DEV_VDPA_EMULATION(mvdev->mdev, desc_group_mkey_supported))
1337 			MLX5_SET(virtio_q, vq_ctx, desc_group_mkey, desc_mr->mkey);
1338 		else
1339 			mvq->modified_fields &= ~MLX5_VIRTQ_MODIFY_MASK_DESC_GROUP_MKEY;
1340 	}
1341 
1342 	MLX5_SET64(virtio_net_q_object, obj_context, modify_field_select, mvq->modified_fields);
1343 	err = mlx5_cmd_exec(ndev->mvdev.mdev, in, inlen, out, sizeof(out));
1344 	if (err)
1345 		goto done;
1346 
1347 	if (state_change)
1348 		mvq->fw_state = state;
1349 
1350 	if (mvq->modified_fields & MLX5_VIRTQ_MODIFY_MASK_VIRTIO_Q_MKEY) {
1351 		mlx5_vdpa_put_mr(mvdev, mvq->vq_mr);
1352 		mlx5_vdpa_get_mr(mvdev, vq_mr);
1353 		mvq->vq_mr = vq_mr;
1354 	}
1355 
1356 	if (mvq->modified_fields & MLX5_VIRTQ_MODIFY_MASK_DESC_GROUP_MKEY) {
1357 		mlx5_vdpa_put_mr(mvdev, mvq->desc_mr);
1358 		mlx5_vdpa_get_mr(mvdev, desc_mr);
1359 		mvq->desc_mr = desc_mr;
1360 	}
1361 
1362 	mvq->modified_fields = 0;
1363 
1364 done:
1365 	kfree(in);
1366 	return err;
1367 }
1368 
1369 static int modify_virtqueue_state(struct mlx5_vdpa_net *ndev,
1370 				  struct mlx5_vdpa_virtqueue *mvq,
1371 				  unsigned int state)
1372 {
1373 	mvq->modified_fields |= MLX5_VIRTQ_MODIFY_MASK_STATE;
1374 	return modify_virtqueue(ndev, mvq, state);
1375 }
1376 
1377 static int counter_set_alloc(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq)
1378 {
1379 	u32 in[MLX5_ST_SZ_DW(create_virtio_q_counters_in)] = {};
1380 	u32 out[MLX5_ST_SZ_DW(create_virtio_q_counters_out)] = {};
1381 	void *cmd_hdr;
1382 	int err;
1383 
1384 	if (!counters_supported(&ndev->mvdev))
1385 		return 0;
1386 
1387 	cmd_hdr = MLX5_ADDR_OF(create_virtio_q_counters_in, in, hdr);
1388 
1389 	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, opcode, MLX5_CMD_OP_CREATE_GENERAL_OBJECT);
1390 	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, obj_type, MLX5_OBJ_TYPE_VIRTIO_Q_COUNTERS);
1391 	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, uid, ndev->mvdev.res.uid);
1392 
1393 	err = mlx5_cmd_exec(ndev->mvdev.mdev, in, sizeof(in), out, sizeof(out));
1394 	if (err)
1395 		return err;
1396 
1397 	mvq->counter_set_id = MLX5_GET(general_obj_out_cmd_hdr, out, obj_id);
1398 
1399 	return 0;
1400 }
1401 
1402 static void counter_set_dealloc(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq)
1403 {
1404 	u32 in[MLX5_ST_SZ_DW(destroy_virtio_q_counters_in)] = {};
1405 	u32 out[MLX5_ST_SZ_DW(destroy_virtio_q_counters_out)] = {};
1406 
1407 	if (!counters_supported(&ndev->mvdev))
1408 		return;
1409 
1410 	MLX5_SET(destroy_virtio_q_counters_in, in, hdr.opcode, MLX5_CMD_OP_DESTROY_GENERAL_OBJECT);
1411 	MLX5_SET(destroy_virtio_q_counters_in, in, hdr.obj_id, mvq->counter_set_id);
1412 	MLX5_SET(destroy_virtio_q_counters_in, in, hdr.uid, ndev->mvdev.res.uid);
1413 	MLX5_SET(destroy_virtio_q_counters_in, in, hdr.obj_type, MLX5_OBJ_TYPE_VIRTIO_Q_COUNTERS);
1414 	if (mlx5_cmd_exec(ndev->mvdev.mdev, in, sizeof(in), out, sizeof(out)))
1415 		mlx5_vdpa_warn(&ndev->mvdev, "dealloc counter set 0x%x\n", mvq->counter_set_id);
1416 }
1417 
1418 static irqreturn_t mlx5_vdpa_int_handler(int irq, void *priv)
1419 {
1420 	struct vdpa_callback *cb = priv;
1421 
1422 	if (cb->callback)
1423 		return cb->callback(cb->private);
1424 
1425 	return IRQ_HANDLED;
1426 }
1427 
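/* Take a free MSI-X vector from the pool and attach the virtqueue's event
 * callback to it. Failure is not fatal: the queue simply falls back to the
 * QP/CQ based notification channel.
 */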
1428 static void alloc_vector(struct mlx5_vdpa_net *ndev,
1429 			 struct mlx5_vdpa_virtqueue *mvq)
1430 {
1431 	struct mlx5_vdpa_irq_pool *irqp = &ndev->irqp;
1432 	struct mlx5_vdpa_irq_pool_entry *ent;
1433 	int err;
1434 	int i;
1435 
1436 	for (i = 0; i < irqp->num_ent; i++) {
1437 		ent = &irqp->entries[i];
1438 		if (!ent->used) {
1439 			snprintf(ent->name, MLX5_VDPA_IRQ_NAME_LEN, "%s-vq-%d",
1440 				 dev_name(&ndev->mvdev.vdev.dev), mvq->index);
1441 			ent->dev_id = &ndev->event_cbs[mvq->index];
1442 			err = request_irq(ent->map.virq, mlx5_vdpa_int_handler, 0,
1443 					  ent->name, ent->dev_id);
1444 			if (err)
1445 				return;
1446 
1447 			ent->used = true;
1448 			mvq->map = ent->map;
1449 			return;
1450 		}
1451 	}
1452 }
1453 
1454 static void dealloc_vector(struct mlx5_vdpa_net *ndev,
1455 			   struct mlx5_vdpa_virtqueue *mvq)
1456 {
1457 	struct mlx5_vdpa_irq_pool *irqp = &ndev->irqp;
1458 	int i;
1459 
1460 	for (i = 0; i < irqp->num_ent; i++)
1461 		if (mvq->map.virq == irqp->entries[i].map.virq) {
1462 			free_irq(mvq->map.virq, irqp->entries[i].dev_id);
1463 			irqp->entries[i].used = false;
1464 			return;
1465 		}
1466 }
1467 
1468 static int setup_vq(struct mlx5_vdpa_net *ndev,
1469 		    struct mlx5_vdpa_virtqueue *mvq,
1470 		    bool filled)
1471 {
1472 	u16 idx = mvq->index;
1473 	int err;
1474 
1475 	if (mvq->initialized)
1476 		return 0;
1477 
1478 	err = cq_create(ndev, idx, mvq->num_ent);
1479 	if (err)
1480 		return err;
1481 
1482 	err = qp_create(ndev, mvq, &mvq->fwqp);
1483 	if (err)
1484 		goto err_fwqp;
1485 
1486 	err = qp_create(ndev, mvq, &mvq->vqqp);
1487 	if (err)
1488 		goto err_vqqp;
1489 
1490 	err = connect_qps(ndev, mvq);
1491 	if (err)
1492 		goto err_connect;
1493 
1494 	err = counter_set_alloc(ndev, mvq);
1495 	if (err)
1496 		goto err_connect;
1497 
1498 	alloc_vector(ndev, mvq);
1499 	err = create_virtqueue(ndev, mvq, filled);
1500 	if (err)
1501 		goto err_vq;
1502 
1503 	if (mvq->ready) {
1504 		err = modify_virtqueue_state(ndev, mvq, MLX5_VIRTIO_NET_Q_OBJECT_STATE_RDY);
1505 		if (err) {
1506 			mlx5_vdpa_warn(&ndev->mvdev, "failed to modify to ready vq idx %d(%d)\n",
1507 				       idx, err);
1508 			goto err_modify;
1509 		}
1510 	}
1511 
1512 	mvq->initialized = true;
1513 	return 0;
1514 
1515 err_modify:
1516 	destroy_virtqueue(ndev, mvq);
1517 err_vq:
1518 	dealloc_vector(ndev, mvq);
1519 	counter_set_dealloc(ndev, mvq);
1520 err_connect:
1521 	qp_destroy(ndev, &mvq->vqqp);
1522 err_vqqp:
1523 	qp_destroy(ndev, &mvq->fwqp);
1524 err_fwqp:
1525 	cq_destroy(ndev, idx);
1526 	return err;
1527 }
1528 
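/* Move a ready virtqueue to the SUSPEND state and cache its available and
 * used indices so they can be queried or restored later.
 */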
1529 static void suspend_vq(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq)
1530 {
1531 	struct mlx5_virtq_attr attr;
1532 
1533 	if (!mvq->initialized)
1534 		return;
1535 
1536 	if (mvq->fw_state != MLX5_VIRTIO_NET_Q_OBJECT_STATE_RDY)
1537 		return;
1538 
1539 	if (modify_virtqueue_state(ndev, mvq, MLX5_VIRTIO_NET_Q_OBJECT_STATE_SUSPEND))
1540 		mlx5_vdpa_warn(&ndev->mvdev, "modify to suspend failed\n");
1541 
1542 	if (query_virtqueue(ndev, mvq, &attr)) {
1543 		mlx5_vdpa_warn(&ndev->mvdev, "failed to query virtqueue\n");
1544 		return;
1545 	}
1546 	mvq->avail_idx = attr.available_index;
1547 	mvq->used_idx = attr.used_index;
1548 }
1549 
1550 static void suspend_vqs(struct mlx5_vdpa_net *ndev)
1551 {
1552 	int i;
1553 
1554 	for (i = 0; i < ndev->cur_num_vqs; i++)
1555 		suspend_vq(ndev, &ndev->vqs[i]);
1556 }
1557 
1558 static void resume_vq(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq)
1559 {
1560 	if (!mvq->initialized || !is_resumable(ndev))
1561 		return;
1562 
1563 	if (mvq->fw_state != MLX5_VIRTIO_NET_Q_OBJECT_STATE_SUSPEND)
1564 		return;
1565 
1566 	if (modify_virtqueue_state(ndev, mvq, MLX5_VIRTIO_NET_Q_OBJECT_STATE_RDY))
1567 		mlx5_vdpa_warn(&ndev->mvdev, "modify to resume failed for vq %u\n", mvq->index);
1568 }
1569 
1570 static void resume_vqs(struct mlx5_vdpa_net *ndev)
1571 {
1572 	for (int i = 0; i < ndev->cur_num_vqs; i++)
1573 		resume_vq(ndev, &ndev->vqs[i]);
1574 }
1575 
1576 static void teardown_vq(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq)
1577 {
1578 	if (!mvq->initialized)
1579 		return;
1580 
1581 	suspend_vq(ndev, mvq);
1582 	mvq->modified_fields = 0;
1583 	destroy_virtqueue(ndev, mvq);
1584 	dealloc_vector(ndev, mvq);
1585 	counter_set_dealloc(ndev, mvq);
1586 	qp_destroy(ndev, &mvq->vqqp);
1587 	qp_destroy(ndev, &mvq->fwqp);
1588 	cq_destroy(ndev, mvq->index);
1589 	mvq->initialized = false;
1590 }
1591 
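/* Create the RQ table that spreads received traffic over the even-numbered
 * (receive) virtqueues. The table maximum size comes from rqt_size; only
 * roundup_pow_of_two(cur_num_vqs / 2) entries are active.
 */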
1592 static int create_rqt(struct mlx5_vdpa_net *ndev)
1593 {
1594 	int rqt_table_size = roundup_pow_of_two(ndev->rqt_size);
1595 	int act_sz = roundup_pow_of_two(ndev->cur_num_vqs / 2);
1596 	__be32 *list;
1597 	void *rqtc;
1598 	int inlen;
1599 	void *in;
1600 	int i, j;
1601 	int err;
1602 
1603 	inlen = MLX5_ST_SZ_BYTES(create_rqt_in) + rqt_table_size * MLX5_ST_SZ_BYTES(rq_num);
1604 	in = kzalloc(inlen, GFP_KERNEL);
1605 	if (!in)
1606 		return -ENOMEM;
1607 
1608 	MLX5_SET(create_rqt_in, in, uid, ndev->mvdev.res.uid);
1609 	rqtc = MLX5_ADDR_OF(create_rqt_in, in, rqt_context);
1610 
1611 	MLX5_SET(rqtc, rqtc, list_q_type, MLX5_RQTC_LIST_Q_TYPE_VIRTIO_NET_Q);
1612 	MLX5_SET(rqtc, rqtc, rqt_max_size, rqt_table_size);
1613 	list = MLX5_ADDR_OF(rqtc, rqtc, rq_num[0]);
1614 	for (i = 0, j = 0; i < act_sz; i++, j += 2)
1615 		list[i] = cpu_to_be32(ndev->vqs[j % ndev->cur_num_vqs].virtq_id);
1616 
1617 	MLX5_SET(rqtc, rqtc, rqt_actual_size, act_sz);
1618 	err = mlx5_vdpa_create_rqt(&ndev->mvdev, in, inlen, &ndev->res.rqtn);
1619 	kfree(in);
1620 	if (err)
1621 		return err;
1622 
1623 	return 0;
1624 }
1625 
1626 #define MLX5_MODIFY_RQT_NUM_RQS ((u64)1)
1627 
1628 static int modify_rqt(struct mlx5_vdpa_net *ndev, int num)
1629 {
1630 	int act_sz = roundup_pow_of_two(num / 2);
1631 	__be32 *list;
1632 	void *rqtc;
1633 	int inlen;
1634 	void *in;
1635 	int i, j;
1636 	int err;
1637 
1638 	inlen = MLX5_ST_SZ_BYTES(modify_rqt_in) + act_sz * MLX5_ST_SZ_BYTES(rq_num);
1639 	in = kzalloc(inlen, GFP_KERNEL);
1640 	if (!in)
1641 		return -ENOMEM;
1642 
1643 	MLX5_SET(modify_rqt_in, in, uid, ndev->mvdev.res.uid);
1644 	MLX5_SET64(modify_rqt_in, in, bitmask, MLX5_MODIFY_RQT_NUM_RQS);
1645 	rqtc = MLX5_ADDR_OF(modify_rqt_in, in, ctx);
1646 	MLX5_SET(rqtc, rqtc, list_q_type, MLX5_RQTC_LIST_Q_TYPE_VIRTIO_NET_Q);
1647 
1648 	list = MLX5_ADDR_OF(rqtc, rqtc, rq_num[0]);
1649 	for (i = 0, j = 0; i < act_sz; i++, j = j + 2)
1650 		list[i] = cpu_to_be32(ndev->vqs[j % num].virtq_id);
1651 
1652 	MLX5_SET(rqtc, rqtc, rqt_actual_size, act_sz);
1653 	err = mlx5_vdpa_modify_rqt(&ndev->mvdev, in, inlen, ndev->res.rqtn);
1654 	kfree(in);
1655 	if (err)
1656 		return err;
1657 
1658 	return 0;
1659 }
1660 
1661 static void destroy_rqt(struct mlx5_vdpa_net *ndev)
1662 {
1663 	mlx5_vdpa_destroy_rqt(&ndev->mvdev, ndev->res.rqtn);
1664 }
1665 
1666 static int create_tir(struct mlx5_vdpa_net *ndev)
1667 {
1668 #define HASH_IP_L4PORTS                                                                            \
1669 	(MLX5_HASH_FIELD_SEL_SRC_IP | MLX5_HASH_FIELD_SEL_DST_IP | MLX5_HASH_FIELD_SEL_L4_SPORT |  \
1670 	 MLX5_HASH_FIELD_SEL_L4_DPORT)
1671 	static const u8 rx_hash_toeplitz_key[] = { 0x2c, 0xc6, 0x81, 0xd1, 0x5b, 0xdb, 0xf4, 0xf7,
1672 						   0xfc, 0xa2, 0x83, 0x19, 0xdb, 0x1a, 0x3e, 0x94,
1673 						   0x6b, 0x9e, 0x38, 0xd9, 0x2c, 0x9c, 0x03, 0xd1,
1674 						   0xad, 0x99, 0x44, 0xa7, 0xd9, 0x56, 0x3d, 0x59,
1675 						   0x06, 0x3c, 0x25, 0xf3, 0xfc, 0x1f, 0xdc, 0x2a };
1676 	void *rss_key;
1677 	void *outer;
1678 	void *tirc;
1679 	void *in;
1680 	int err;
1681 
1682 	in = kzalloc(MLX5_ST_SZ_BYTES(create_tir_in), GFP_KERNEL);
1683 	if (!in)
1684 		return -ENOMEM;
1685 
1686 	MLX5_SET(create_tir_in, in, uid, ndev->mvdev.res.uid);
1687 	tirc = MLX5_ADDR_OF(create_tir_in, in, ctx);
1688 	MLX5_SET(tirc, tirc, disp_type, MLX5_TIRC_DISP_TYPE_INDIRECT);
1689 
1690 	MLX5_SET(tirc, tirc, rx_hash_symmetric, 1);
1691 	MLX5_SET(tirc, tirc, rx_hash_fn, MLX5_RX_HASH_FN_TOEPLITZ);
1692 	rss_key = MLX5_ADDR_OF(tirc, tirc, rx_hash_toeplitz_key);
1693 	memcpy(rss_key, rx_hash_toeplitz_key, sizeof(rx_hash_toeplitz_key));
1694 
1695 	outer = MLX5_ADDR_OF(tirc, tirc, rx_hash_field_selector_outer);
1696 	MLX5_SET(rx_hash_field_select, outer, l3_prot_type, MLX5_L3_PROT_TYPE_IPV4);
1697 	MLX5_SET(rx_hash_field_select, outer, l4_prot_type, MLX5_L4_PROT_TYPE_TCP);
1698 	MLX5_SET(rx_hash_field_select, outer, selected_fields, HASH_IP_L4PORTS);
1699 
1700 	MLX5_SET(tirc, tirc, indirect_table, ndev->res.rqtn);
1701 	MLX5_SET(tirc, tirc, transport_domain, ndev->res.tdn);
1702 
1703 	err = mlx5_vdpa_create_tir(&ndev->mvdev, in, &ndev->res.tirn);
1704 	kfree(in);
1705 	if (err)
1706 		return err;
1707 
1708 	mlx5_vdpa_add_tirn(ndev);
1709 	return err;
1710 }
1711 
1712 static void destroy_tir(struct mlx5_vdpa_net *ndev)
1713 {
1714 	mlx5_vdpa_remove_tirn(ndev);
1715 	mlx5_vdpa_destroy_tir(&ndev->mvdev, ndev->res.tirn);
1716 }
1717 
1718 #define MAX_STEERING_ENT 0x8000
1719 #define MAX_STEERING_GROUPS 2
1720 
1721 #if defined(CONFIG_MLX5_VDPA_STEERING_DEBUG)
1722        #define NUM_DESTS 2
1723 #else
1724        #define NUM_DESTS 1
1725 #endif
1726 
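/* With CONFIG_MLX5_VDPA_STEERING_DEBUG, attach flow counters as a second
 * destination of the steering rules so per-rule hit counts can be read via
 * the counters registered by mlx5_vdpa_add_rx_counters(); otherwise this
 * is a no-op.
 */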
1727 static int add_steering_counters(struct mlx5_vdpa_net *ndev,
1728 				 struct macvlan_node *node,
1729 				 struct mlx5_flow_act *flow_act,
1730 				 struct mlx5_flow_destination *dests)
1731 {
1732 #if defined(CONFIG_MLX5_VDPA_STEERING_DEBUG)
1733 	int err;
1734 
1735 	node->ucast_counter.counter = mlx5_fc_create(ndev->mvdev.mdev, false);
1736 	if (IS_ERR(node->ucast_counter.counter))
1737 		return PTR_ERR(node->ucast_counter.counter);
1738 
1739 	node->mcast_counter.counter = mlx5_fc_create(ndev->mvdev.mdev, false);
1740 	if (IS_ERR(node->mcast_counter.counter)) {
1741 		err = PTR_ERR(node->mcast_counter.counter);
1742 		goto err_mcast_counter;
1743 	}
1744 
1745 	dests[1].type = MLX5_FLOW_DESTINATION_TYPE_COUNTER;
1746 	flow_act->action |= MLX5_FLOW_CONTEXT_ACTION_COUNT;
1747 	return 0;
1748 
1749 err_mcast_counter:
1750 	mlx5_fc_destroy(ndev->mvdev.mdev, node->ucast_counter.counter);
1751 	return err;
1752 #else
1753 	return 0;
1754 #endif
1755 }
1756 
1757 static void remove_steering_counters(struct mlx5_vdpa_net *ndev,
1758 				     struct macvlan_node *node)
1759 {
1760 #if defined(CONFIG_MLX5_VDPA_STEERING_DEBUG)
1761 	mlx5_fc_destroy(ndev->mvdev.mdev, node->mcast_counter.counter);
1762 	mlx5_fc_destroy(ndev->mvdev.mdev, node->ucast_counter.counter);
1763 #endif
1764 }
1765 
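/* Install the RX steering rules for one MAC/VLAN entry: a unicast rule
 * matching the full destination MAC and a multicast rule matching only the
 * multicast bit of the DMAC. Both rules forward to the TIR, and optionally
 * to the debug counters set up above.
 */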
1766 static int mlx5_vdpa_add_mac_vlan_rules(struct mlx5_vdpa_net *ndev, u8 *mac,
1767 					struct macvlan_node *node)
1768 {
1769 	struct mlx5_flow_destination dests[NUM_DESTS] = {};
1770 	struct mlx5_flow_act flow_act = {};
1771 	struct mlx5_flow_spec *spec;
1772 	void *headers_c;
1773 	void *headers_v;
1774 	u8 *dmac_c;
1775 	u8 *dmac_v;
1776 	int err;
1777 	u16 vid;
1778 
1779 	spec = kvzalloc(sizeof(*spec), GFP_KERNEL);
1780 	if (!spec)
1781 		return -ENOMEM;
1782 
1783 	vid = key2vid(node->macvlan);
1784 	spec->match_criteria_enable = MLX5_MATCH_OUTER_HEADERS;
1785 	headers_c = MLX5_ADDR_OF(fte_match_param, spec->match_criteria, outer_headers);
1786 	headers_v = MLX5_ADDR_OF(fte_match_param, spec->match_value, outer_headers);
1787 	dmac_c = MLX5_ADDR_OF(fte_match_param, headers_c, outer_headers.dmac_47_16);
1788 	dmac_v = MLX5_ADDR_OF(fte_match_param, headers_v, outer_headers.dmac_47_16);
1789 	eth_broadcast_addr(dmac_c);
1790 	ether_addr_copy(dmac_v, mac);
1791 	if (ndev->mvdev.actual_features & BIT_ULL(VIRTIO_NET_F_CTRL_VLAN)) {
1792 		MLX5_SET(fte_match_set_lyr_2_4, headers_c, cvlan_tag, 1);
1793 		MLX5_SET_TO_ONES(fte_match_set_lyr_2_4, headers_c, first_vid);
1794 	}
1795 	if (node->tagged) {
1796 		MLX5_SET(fte_match_set_lyr_2_4, headers_v, cvlan_tag, 1);
1797 		MLX5_SET(fte_match_set_lyr_2_4, headers_v, first_vid, vid);
1798 	}
1799 	flow_act.action = MLX5_FLOW_CONTEXT_ACTION_FWD_DEST;
1800 	dests[0].type = MLX5_FLOW_DESTINATION_TYPE_TIR;
1801 	dests[0].tir_num = ndev->res.tirn;
1802 	err = add_steering_counters(ndev, node, &flow_act, dests);
1803 	if (err)
1804 		goto out_free;
1805 
1806 #if defined(CONFIG_MLX5_VDPA_STEERING_DEBUG)
1807 	dests[1].counter_id = mlx5_fc_id(node->ucast_counter.counter);
1808 #endif
1809 	node->ucast_rule = mlx5_add_flow_rules(ndev->rxft, spec, &flow_act, dests, NUM_DESTS);
1810 	if (IS_ERR(node->ucast_rule)) {
1811 		err = PTR_ERR(node->ucast_rule);
1812 		goto err_ucast;
1813 	}
1814 
1815 #if defined(CONFIG_MLX5_VDPA_STEERING_DEBUG)
1816 	dests[1].counter_id = mlx5_fc_id(node->mcast_counter.counter);
1817 #endif
1818 
1819 	memset(dmac_c, 0, ETH_ALEN);
1820 	memset(dmac_v, 0, ETH_ALEN);
1821 	dmac_c[0] = 1;
1822 	dmac_v[0] = 1;
1823 	node->mcast_rule = mlx5_add_flow_rules(ndev->rxft, spec, &flow_act, dests, NUM_DESTS);
1824 	if (IS_ERR(node->mcast_rule)) {
1825 		err = PTR_ERR(node->mcast_rule);
1826 		goto err_mcast;
1827 	}
1828 	kvfree(spec);
1829 	mlx5_vdpa_add_rx_counters(ndev, node);
1830 	return 0;
1831 
1832 err_mcast:
1833 	mlx5_del_flow_rules(node->ucast_rule);
1834 err_ucast:
1835 	remove_steering_counters(ndev, node);
1836 out_free:
1837 	kvfree(spec);
1838 	return err;
1839 }
1840 
1841 static void mlx5_vdpa_del_mac_vlan_rules(struct mlx5_vdpa_net *ndev,
1842 					 struct macvlan_node *node)
1843 {
1844 	mlx5_vdpa_remove_rx_counters(ndev, node);
1845 	mlx5_del_flow_rules(node->ucast_rule);
1846 	mlx5_del_flow_rules(node->mcast_rule);
1847 }
1848 
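/* Pack a MAC/VLAN pair into the 64-bit key used by the macvlan hash table:
 *
 *  63            48 47                             0
 * +----------------+-------------------------------+
 * |    vlan id     |       mac[0] .. mac[5]        |
 * +----------------+-------------------------------+
 *
 * Untagged entries use the out-of-range vlan id MLX5V_UNTAGGED.
 */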
1849 static u64 search_val(u8 *mac, u16 vlan, bool tagged)
1850 {
1851 	u64 val;
1852 
1853 	if (!tagged)
1854 		vlan = MLX5V_UNTAGGED;
1855 
1856 	val = (u64)vlan << 48 |
1857 	      (u64)mac[0] << 40 |
1858 	      (u64)mac[1] << 32 |
1859 	      (u64)mac[2] << 24 |
1860 	      (u64)mac[3] << 16 |
1861 	      (u64)mac[4] << 8 |
1862 	      (u64)mac[5];
1863 
1864 	return val;
1865 }
1866 
1867 static struct macvlan_node *mac_vlan_lookup(struct mlx5_vdpa_net *ndev, u64 value)
1868 {
1869 	struct macvlan_node *pos;
1870 	u32 idx;
1871 
1872 	idx = hash_64(value, 8); /* 8-bit bucket index into macvlan_hash */
1873 	hlist_for_each_entry(pos, &ndev->macvlan_hash[idx], hlist) {
1874 		if (pos->macvlan == value)
1875 			return pos;
1876 	}
1877 	return NULL;
1878 }
1879 
1880 static int mac_vlan_add(struct mlx5_vdpa_net *ndev, u8 *mac, u16 vid, bool tagged)
1881 {
1882 	struct macvlan_node *ptr;
1883 	u64 val;
1884 	u32 idx;
1885 	int err;
1886 
1887 	val = search_val(mac, vid, tagged);
1888 	if (mac_vlan_lookup(ndev, val))
1889 		return -EEXIST;
1890 
1891 	ptr = kzalloc(sizeof(*ptr), GFP_KERNEL);
1892 	if (!ptr)
1893 		return -ENOMEM;
1894 
1895 	ptr->tagged = tagged;
1896 	ptr->macvlan = val;
1897 	ptr->ndev = ndev;
1898 	err = mlx5_vdpa_add_mac_vlan_rules(ndev, ndev->config.mac, ptr);
1899 	if (err)
1900 		goto err_add;
1901 
1902 	idx = hash_64(val, 8);
1903 	hlist_add_head(&ptr->hlist, &ndev->macvlan_hash[idx]);
1904 	return 0;
1905 
1906 err_add:
1907 	kfree(ptr);
1908 	return err;
1909 }
1910 
1911 static void mac_vlan_del(struct mlx5_vdpa_net *ndev, u8 *mac, u16 vlan, bool tagged)
1912 {
1913 	struct macvlan_node *ptr;
1914 
1915 	ptr = mac_vlan_lookup(ndev, search_val(mac, vlan, tagged));
1916 	if (!ptr)
1917 		return;
1918 
1919 	hlist_del(&ptr->hlist);
1920 	mlx5_vdpa_del_mac_vlan_rules(ndev, ptr);
1921 	remove_steering_counters(ndev, ptr);
1922 	kfree(ptr);
1923 }
1924 
1925 static void clear_mac_vlan_table(struct mlx5_vdpa_net *ndev)
1926 {
1927 	struct macvlan_node *pos;
1928 	struct hlist_node *n;
1929 	int i;
1930 
1931 	for (i = 0; i < MLX5V_MACVLAN_SIZE; i++) {
1932 		hlist_for_each_entry_safe(pos, n, &ndev->macvlan_hash[i], hlist) {
1933 			hlist_del(&pos->hlist);
1934 			mlx5_vdpa_del_mac_vlan_rules(ndev, pos);
1935 			remove_steering_counters(ndev, pos);
1936 			kfree(pos);
1937 		}
1938 	}
1939 }
1940 
1941 static int setup_steering(struct mlx5_vdpa_net *ndev)
1942 {
1943 	struct mlx5_flow_table_attr ft_attr = {};
1944 	struct mlx5_flow_namespace *ns;
1945 	int err;
1946 
1947 	ft_attr.max_fte = MAX_STEERING_ENT;
1948 	ft_attr.autogroup.max_num_groups = MAX_STEERING_GROUPS;
1949 
1950 	ns = mlx5_get_flow_namespace(ndev->mvdev.mdev, MLX5_FLOW_NAMESPACE_BYPASS);
1951 	if (!ns) {
1952 		mlx5_vdpa_warn(&ndev->mvdev, "failed to get flow namespace\n");
1953 		return -EOPNOTSUPP;
1954 	}
1955 
1956 	ndev->rxft = mlx5_create_auto_grouped_flow_table(ns, &ft_attr);
1957 	if (IS_ERR(ndev->rxft)) {
1958 		mlx5_vdpa_warn(&ndev->mvdev, "failed to create flow table\n");
1959 		return PTR_ERR(ndev->rxft);
1960 	}
1961 	mlx5_vdpa_add_rx_flow_table(ndev);
1962 
1963 	err = mac_vlan_add(ndev, ndev->config.mac, 0, false);
1964 	if (err)
1965 		goto err_add;
1966 
1967 	return 0;
1968 
1969 err_add:
1970 	mlx5_vdpa_remove_rx_flow_table(ndev);
1971 	mlx5_destroy_flow_table(ndev->rxft);
1972 	return err;
1973 }
1974 
1975 static void teardown_steering(struct mlx5_vdpa_net *ndev)
1976 {
1977 	clear_mac_vlan_table(ndev);
1978 	mlx5_vdpa_remove_rx_flow_table(ndev);
1979 	mlx5_destroy_flow_table(ndev->rxft);
1980 }
1981 
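/* Handle VIRTIO_NET_CTRL_MAC_ADDR_SET from the control VQ: replace the MAC
 * in the physical function's MPFS table and recreate the steering rules for
 * the new address, rolling back to the old MAC if any step fails.
 */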
1982 static virtio_net_ctrl_ack handle_ctrl_mac(struct mlx5_vdpa_dev *mvdev, u8 cmd)
1983 {
1984 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
1985 	struct mlx5_control_vq *cvq = &mvdev->cvq;
1986 	virtio_net_ctrl_ack status = VIRTIO_NET_ERR;
1987 	struct mlx5_core_dev *pfmdev;
1988 	size_t read;
1989 	u8 mac[ETH_ALEN], mac_back[ETH_ALEN];
1990 
1991 	pfmdev = pci_get_drvdata(pci_physfn(mvdev->mdev->pdev));
1992 	switch (cmd) {
1993 	case VIRTIO_NET_CTRL_MAC_ADDR_SET:
1994 		read = vringh_iov_pull_iotlb(&cvq->vring, &cvq->riov, (void *)mac, ETH_ALEN);
1995 		if (read != ETH_ALEN)
1996 			break;
1997 
1998 		if (!memcmp(ndev->config.mac, mac, ETH_ALEN)) {
1999 			status = VIRTIO_NET_OK;
2000 			break;
2001 		}
2002 
2003 		if (is_zero_ether_addr(mac))
2004 			break;
2005 
2006 		if (!is_zero_ether_addr(ndev->config.mac)) {
2007 			if (mlx5_mpfs_del_mac(pfmdev, ndev->config.mac)) {
2008 				mlx5_vdpa_warn(mvdev, "failed to delete old MAC %pM from MPFS table\n",
2009 					       ndev->config.mac);
2010 				break;
2011 			}
2012 		}
2013 
2014 		if (mlx5_mpfs_add_mac(pfmdev, mac)) {
2015 			mlx5_vdpa_warn(mvdev, "failed to insert new MAC %pM into MPFS table\n",
2016 				       mac);
2017 			break;
2018 		}
2019 
2020 		/* Back up the original MAC address so that it can be restored
2021 		 * if adding the forward rules fails.
2022 		 */
2023 		memcpy(mac_back, ndev->config.mac, ETH_ALEN);
2024 
2025 		memcpy(ndev->config.mac, mac, ETH_ALEN);
2026 
2027 		/* Recreate the flow table entry so that packets can be forwarded to the new MAC
2028 		 */
2029 		mac_vlan_del(ndev, mac_back, 0, false);
2030 
2031 		if (mac_vlan_add(ndev, ndev->config.mac, 0, false)) {
2032 			mlx5_vdpa_warn(mvdev, "failed to insert forward rules, try to restore\n");
2033 
2034 			/* This path is rarely taken, but double check anyway */
2035 			if (is_zero_ether_addr(mac_back)) {
2036 				mlx5_vdpa_warn(mvdev, "restore mac failed: Original MAC is zero\n");
2037 				break;
2038 			}
2039 
2040 			/* Try to restore the original MAC address in the MPFS table, and try
2041 			 * to restore the forward rule entry.
2042 			 */
2043 			if (mlx5_mpfs_del_mac(pfmdev, ndev->config.mac)) {
2044 				mlx5_vdpa_warn(mvdev, "restore mac failed: delete MAC %pM from MPFS table failed\n",
2045 					       ndev->config.mac);
2046 			}
2047 
2048 			if (mlx5_mpfs_add_mac(pfmdev, mac_back)) {
2049 				mlx5_vdpa_warn(mvdev, "restore mac failed: insert old MAC %pM into MPFS table failed\n",
2050 					       mac_back);
2051 			}
2052 
2053 			memcpy(ndev->config.mac, mac_back, ETH_ALEN);
2054 
2055 			if (mac_vlan_add(ndev, ndev->config.mac, 0, false))
2056 				mlx5_vdpa_warn(mvdev, "restore forward rules failed: insert forward rules failed\n");
2057 
2058 			break;
2059 		}
2060 
2061 		status = VIRTIO_NET_OK;
2062 		break;
2063 
2064 	default:
2065 		break;
2066 	}
2067 
2068 	return status;
2069 }
2070 
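/* Change the number of enabled queue pairs. When shrinking, the RQT is
 * resized first so that no traffic is steered to virtqueues about to be
 * torn down; when growing, the new virtqueues are set up before the RQT is
 * expanded to include them.
 */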
2071 static int change_num_qps(struct mlx5_vdpa_dev *mvdev, int newqps)
2072 {
2073 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
2074 	int cur_qps = ndev->cur_num_vqs / 2;
2075 	int err;
2076 	int i;
2077 
2078 	if (cur_qps > newqps) {
2079 		err = modify_rqt(ndev, 2 * newqps);
2080 		if (err)
2081 			return err;
2082 
2083 		for (i = ndev->cur_num_vqs - 1; i >= 2 * newqps; i--)
2084 			teardown_vq(ndev, &ndev->vqs[i]);
2085 
2086 		ndev->cur_num_vqs = 2 * newqps;
2087 	} else {
2088 		ndev->cur_num_vqs = 2 * newqps;
2089 		for (i = cur_qps * 2; i < 2 * newqps; i++) {
2090 			err = setup_vq(ndev, &ndev->vqs[i], true);
2091 			if (err)
2092 				goto clean_added;
2093 		}
2094 		err = modify_rqt(ndev, 2 * newqps);
2095 		if (err)
2096 			goto clean_added;
2097 	}
2098 	return 0;
2099 
2100 clean_added:
2101 	for (--i; i >= 2 * cur_qps; --i)
2102 		teardown_vq(ndev, &ndev->vqs[i]);
2103 
2104 	ndev->cur_num_vqs = 2 * cur_qps;
2105 
2106 	return err;
2107 }
2108 
2109 static virtio_net_ctrl_ack handle_ctrl_mq(struct mlx5_vdpa_dev *mvdev, u8 cmd)
2110 {
2111 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
2112 	virtio_net_ctrl_ack status = VIRTIO_NET_ERR;
2113 	struct mlx5_control_vq *cvq = &mvdev->cvq;
2114 	struct virtio_net_ctrl_mq mq;
2115 	size_t read;
2116 	u16 newqps;
2117 
2118 	switch (cmd) {
2119 	case VIRTIO_NET_CTRL_MQ_VQ_PAIRS_SET:
2120 		/* This mq feature check aligns with the pre-existing userspace
2121 		 * implementation.
2122 		 *
2123 		 * Without it, an untrusted driver could fake a multiqueue config
2124 		 * request down to a non-mq device, which may cause the kernel to
2125 		 * panic due to uninitialized resources for the extra vqs. Even
2126 		 * with a well behaving guest driver, changing the number of vqs
2127 		 * on a non-mq device is not expected to be allowed.
2128 		 */
2129 		if (!MLX5_FEATURE(mvdev, VIRTIO_NET_F_MQ))
2130 			break;
2131 
2132 		read = vringh_iov_pull_iotlb(&cvq->vring, &cvq->riov, (void *)&mq, sizeof(mq));
2133 		if (read != sizeof(mq))
2134 			break;
2135 
2136 		newqps = mlx5vdpa16_to_cpu(mvdev, mq.virtqueue_pairs);
2137 		if (newqps < VIRTIO_NET_CTRL_MQ_VQ_PAIRS_MIN ||
2138 		    newqps > ndev->rqt_size)
2139 			break;
2140 
2141 		if (ndev->cur_num_vqs == 2 * newqps) {
2142 			status = VIRTIO_NET_OK;
2143 			break;
2144 		}
2145 
2146 		if (!change_num_qps(mvdev, newqps))
2147 			status = VIRTIO_NET_OK;
2148 
2149 		break;
2150 	default:
2151 		break;
2152 	}
2153 
2154 	return status;
2155 }
2156 
2157 static virtio_net_ctrl_ack handle_ctrl_vlan(struct mlx5_vdpa_dev *mvdev, u8 cmd)
2158 {
2159 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
2160 	virtio_net_ctrl_ack status = VIRTIO_NET_ERR;
2161 	struct mlx5_control_vq *cvq = &mvdev->cvq;
2162 	__virtio16 vlan;
2163 	size_t read;
2164 	u16 id;
2165 
2166 	if (!(ndev->mvdev.actual_features & BIT_ULL(VIRTIO_NET_F_CTRL_VLAN)))
2167 		return status;
2168 
2169 	switch (cmd) {
2170 	case VIRTIO_NET_CTRL_VLAN_ADD:
2171 		read = vringh_iov_pull_iotlb(&cvq->vring, &cvq->riov, &vlan, sizeof(vlan));
2172 		if (read != sizeof(vlan))
2173 			break;
2174 
2175 		id = mlx5vdpa16_to_cpu(mvdev, vlan);
2176 		if (mac_vlan_add(ndev, ndev->config.mac, id, true))
2177 			break;
2178 
2179 		status = VIRTIO_NET_OK;
2180 		break;
2181 	case VIRTIO_NET_CTRL_VLAN_DEL:
2182 		read = vringh_iov_pull_iotlb(&cvq->vring, &cvq->riov, &vlan, sizeof(vlan));
2183 		if (read != sizeof(vlan))
2184 			break;
2185 
2186 		id = mlx5vdpa16_to_cpu(mvdev, vlan);
2187 		mac_vlan_del(ndev, ndev->config.mac, id, true);
2188 		status = VIRTIO_NET_OK;
2189 		break;
2190 	default:
2191 		break;
2192 	}
2193 
2194 	return status;
2195 }
2196 
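/* Control VQ work handler. Each invocation handles at most one control
 * command and then re-queues itself, which bounds the time reslock is held
 * when the driver posts many commands at once.
 */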
2197 static void mlx5_cvq_kick_handler(struct work_struct *work)
2198 {
2199 	virtio_net_ctrl_ack status = VIRTIO_NET_ERR;
2200 	struct virtio_net_ctrl_hdr ctrl;
2201 	struct mlx5_vdpa_wq_ent *wqent;
2202 	struct mlx5_vdpa_dev *mvdev;
2203 	struct mlx5_control_vq *cvq;
2204 	struct mlx5_vdpa_net *ndev;
2205 	size_t read, write;
2206 	int err;
2207 
2208 	wqent = container_of(work, struct mlx5_vdpa_wq_ent, work);
2209 	mvdev = wqent->mvdev;
2210 	ndev = to_mlx5_vdpa_ndev(mvdev);
2211 	cvq = &mvdev->cvq;
2212 
2213 	down_write(&ndev->reslock);
2214 
2215 	if (!(mvdev->status & VIRTIO_CONFIG_S_DRIVER_OK))
2216 		goto out;
2217 
2218 	if (!(ndev->mvdev.actual_features & BIT_ULL(VIRTIO_NET_F_CTRL_VQ)))
2219 		goto out;
2220 
2221 	if (!cvq->ready)
2222 		goto out;
2223 
2224 	while (true) {
2225 		err = vringh_getdesc_iotlb(&cvq->vring, &cvq->riov, &cvq->wiov, &cvq->head,
2226 					   GFP_ATOMIC);
2227 		if (err <= 0)
2228 			break;
2229 
2230 		read = vringh_iov_pull_iotlb(&cvq->vring, &cvq->riov, &ctrl, sizeof(ctrl));
2231 		if (read != sizeof(ctrl))
2232 			break;
2233 
2234 		cvq->received_desc++;
2235 		switch (ctrl.class) {
2236 		case VIRTIO_NET_CTRL_MAC:
2237 			status = handle_ctrl_mac(mvdev, ctrl.cmd);
2238 			break;
2239 		case VIRTIO_NET_CTRL_MQ:
2240 			status = handle_ctrl_mq(mvdev, ctrl.cmd);
2241 			break;
2242 		case VIRTIO_NET_CTRL_VLAN:
2243 			status = handle_ctrl_vlan(mvdev, ctrl.cmd);
2244 			break;
2245 		default:
2246 			break;
2247 		}
2248 
2249 		/* Make sure data is written before advancing index */
2250 		smp_wmb();
2251 
2252 		write = vringh_iov_push_iotlb(&cvq->vring, &cvq->wiov, &status, sizeof(status));
2253 		vringh_complete_iotlb(&cvq->vring, cvq->head, write);
2254 		vringh_kiov_cleanup(&cvq->riov);
2255 		vringh_kiov_cleanup(&cvq->wiov);
2256 
2257 		if (vringh_need_notify_iotlb(&cvq->vring))
2258 			vringh_notify(&cvq->vring);
2259 
2260 		cvq->completed_desc++;
2261 		queue_work(mvdev->wq, &wqent->work);
2262 		break;
2263 	}
2264 
2265 out:
2266 	up_write(&ndev->reslock);
2267 }
2268 
2269 static void mlx5_vdpa_kick_vq(struct vdpa_device *vdev, u16 idx)
2270 {
2271 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
2272 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
2273 	struct mlx5_vdpa_virtqueue *mvq;
2274 
2275 	if (!is_index_valid(mvdev, idx))
2276 		return;
2277 
2278 	if (unlikely(is_ctrl_vq_idx(mvdev, idx))) {
2279 		if (!mvdev->wq || !mvdev->cvq.ready)
2280 			return;
2281 
2282 		queue_work(mvdev->wq, &ndev->cvq_ent.work);
2283 		return;
2284 	}
2285 
2286 	mvq = &ndev->vqs[idx];
2287 	if (unlikely(!mvq->ready))
2288 		return;
2289 
2290 	iowrite16(idx, ndev->mvdev.res.kick_addr);
2291 }
2292 
2293 static int mlx5_vdpa_set_vq_address(struct vdpa_device *vdev, u16 idx, u64 desc_area,
2294 				    u64 driver_area, u64 device_area)
2295 {
2296 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
2297 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
2298 	struct mlx5_vdpa_virtqueue *mvq;
2299 
2300 	if (!is_index_valid(mvdev, idx))
2301 		return -EINVAL;
2302 
2303 	if (is_ctrl_vq_idx(mvdev, idx)) {
2304 		mvdev->cvq.desc_addr = desc_area;
2305 		mvdev->cvq.device_addr = device_area;
2306 		mvdev->cvq.driver_addr = driver_area;
2307 		return 0;
2308 	}
2309 
2310 	mvq = &ndev->vqs[idx];
2311 	mvq->desc_addr = desc_area;
2312 	mvq->device_addr = device_area;
2313 	mvq->driver_addr = driver_area;
2314 	mvq->modified_fields |= MLX5_VIRTQ_MODIFY_MASK_VIRTIO_Q_ADDRS;
2315 	return 0;
2316 }
2317 
2318 static void mlx5_vdpa_set_vq_num(struct vdpa_device *vdev, u16 idx, u32 num)
2319 {
2320 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
2321 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
2322 	struct mlx5_vdpa_virtqueue *mvq;
2323 
2324 	if (!is_index_valid(mvdev, idx))
2325 		return;
2326 
2327 	if (is_ctrl_vq_idx(mvdev, idx)) {
2328 		struct mlx5_control_vq *cvq = &mvdev->cvq;
2329 
2330 		cvq->vring.vring.num = num;
2331 		return;
2332 	}
2333 
2334 	mvq = &ndev->vqs[idx];
2335 	mvq->num_ent = num;
2336 }
2337 
2338 static void mlx5_vdpa_set_vq_cb(struct vdpa_device *vdev, u16 idx, struct vdpa_callback *cb)
2339 {
2340 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
2341 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
2342 
2343 	ndev->event_cbs[idx] = *cb;
2344 	if (is_ctrl_vq_idx(mvdev, idx))
2345 		mvdev->cvq.event_cb = *cb;
2346 }
2347 
2348 static void mlx5_cvq_notify(struct vringh *vring)
2349 {
2350 	struct mlx5_control_vq *cvq = container_of(vring, struct mlx5_control_vq, vring);
2351 
2352 	if (!cvq->event_cb.callback)
2353 		return;
2354 
2355 	cvq->event_cb.callback(cvq->event_cb.private);
2356 }
2357 
2358 static void set_cvq_ready(struct mlx5_vdpa_dev *mvdev, bool ready)
2359 {
2360 	struct mlx5_control_vq *cvq = &mvdev->cvq;
2361 
2362 	cvq->ready = ready;
2363 	if (!ready)
2364 		return;
2365 
2366 	cvq->vring.notify = mlx5_cvq_notify;
2367 }
2368 
2369 static void mlx5_vdpa_set_vq_ready(struct vdpa_device *vdev, u16 idx, bool ready)
2370 {
2371 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
2372 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
2373 	struct mlx5_vdpa_virtqueue *mvq;
2374 	int err;
2375 
2376 	if (!mvdev->actual_features)
2377 		return;
2378 
2379 	if (!is_index_valid(mvdev, idx))
2380 		return;
2381 
2382 	if (is_ctrl_vq_idx(mvdev, idx)) {
2383 		set_cvq_ready(mvdev, ready);
2384 		return;
2385 	}
2386 
2387 	mvq = &ndev->vqs[idx];
2388 	if (!ready) {
2389 		suspend_vq(ndev, mvq);
2390 	} else {
2391 		err = modify_virtqueue_state(ndev, mvq, MLX5_VIRTIO_NET_Q_OBJECT_STATE_RDY);
2392 		if (err) {
2393 			mlx5_vdpa_warn(mvdev, "modify VQ %d to ready failed (%d)\n", idx, err);
2394 			ready = false;
2395 		}
2396 	}
2397 
2399 	mvq->ready = ready;
2400 }
2401 
2402 static bool mlx5_vdpa_get_vq_ready(struct vdpa_device *vdev, u16 idx)
2403 {
2404 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
2405 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
2406 
2407 	if (!is_index_valid(mvdev, idx))
2408 		return false;
2409 
2410 	if (is_ctrl_vq_idx(mvdev, idx))
2411 		return mvdev->cvq.ready;
2412 
2413 	return ndev->vqs[idx].ready;
2414 }
2415 
2416 static int mlx5_vdpa_set_vq_state(struct vdpa_device *vdev, u16 idx,
2417 				  const struct vdpa_vq_state *state)
2418 {
2419 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
2420 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
2421 	struct mlx5_vdpa_virtqueue *mvq;
2422 
2423 	if (!is_index_valid(mvdev, idx))
2424 		return -EINVAL;
2425 
2426 	if (is_ctrl_vq_idx(mvdev, idx)) {
2427 		mvdev->cvq.vring.last_avail_idx = state->split.avail_index;
2428 		return 0;
2429 	}
2430 
2431 	mvq = &ndev->vqs[idx];
2432 	if (mvq->fw_state == MLX5_VIRTIO_NET_Q_OBJECT_STATE_RDY) {
2433 		mlx5_vdpa_warn(mvdev, "can't modify available index\n");
2434 		return -EINVAL;
2435 	}
2436 
2437 	mvq->used_idx = state->split.avail_index;
2438 	mvq->avail_idx = state->split.avail_index;
2439 	mvq->modified_fields |= MLX5_VIRTQ_MODIFY_MASK_VIRTIO_Q_AVAIL_IDX |
2440 				MLX5_VIRTQ_MODIFY_MASK_VIRTIO_Q_USED_IDX;
2441 	return 0;
2442 }
2443 
2444 static int mlx5_vdpa_get_vq_state(struct vdpa_device *vdev, u16 idx, struct vdpa_vq_state *state)
2445 {
2446 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
2447 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
2448 	struct mlx5_vdpa_virtqueue *mvq;
2449 	struct mlx5_virtq_attr attr;
2450 	int err;
2451 
2452 	if (!is_index_valid(mvdev, idx))
2453 		return -EINVAL;
2454 
2455 	if (is_ctrl_vq_idx(mvdev, idx)) {
2456 		state->split.avail_index = mvdev->cvq.vring.last_avail_idx;
2457 		return 0;
2458 	}
2459 
2460 	mvq = &ndev->vqs[idx];
2461 	/* If the virtq object was destroyed, use the value saved at
2462 	 * the last minute of suspend_vq. This caters for userspace
2463 	 * that cares about emulating the index after vq is stopped.
2464 	 */
2465 	if (!mvq->initialized) {
2466 		/* Firmware returns a wrong value for the available index.
2467 		 * Since both values should be identical, we take the value of
2468 		 * used_idx which is reported correctly.
2469 		 */
2470 		state->split.avail_index = mvq->used_idx;
2471 		return 0;
2472 	}
2473 
2474 	err = query_virtqueue(ndev, mvq, &attr);
2475 	if (err) {
2476 		mlx5_vdpa_warn(mvdev, "failed to query virtqueue\n");
2477 		return err;
2478 	}
2479 	state->split.avail_index = attr.used_index;
2480 	return 0;
2481 }
2482 
2483 static u32 mlx5_vdpa_get_vq_align(struct vdpa_device *vdev)
2484 {
2485 	return PAGE_SIZE;
2486 }
2487 
2488 static u32 mlx5_vdpa_get_vq_group(struct vdpa_device *vdev, u16 idx)
2489 {
2490 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
2491 
2492 	if (is_ctrl_vq_idx(mvdev, idx))
2493 		return MLX5_VDPA_CVQ_GROUP;
2494 
2495 	return MLX5_VDPA_DATAVQ_GROUP;
2496 }
2497 
2498 static u32 mlx5_vdpa_get_vq_desc_group(struct vdpa_device *vdev, u16 idx)
2499 {
2500 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
2501 
2502 	if (is_ctrl_vq_idx(mvdev, idx))
2503 		return MLX5_VDPA_CVQ_GROUP;
2504 
2505 	return MLX5_VDPA_DATAVQ_DESC_GROUP;
2506 }
2507 
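/* Translate the device_features_bits_mask reported by the VDPA emulation
 * capability into the corresponding VIRTIO_NET_F_* bits.
 */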
2508 static u64 mlx_to_virtio_features(u16 dev_features)
2509 {
2510 	u64 result = 0;
2511 
2512 	if (dev_features & BIT_ULL(MLX5_VIRTIO_NET_F_MRG_RXBUF))
2513 		result |= BIT_ULL(VIRTIO_NET_F_MRG_RXBUF);
2514 	if (dev_features & BIT_ULL(MLX5_VIRTIO_NET_F_HOST_ECN))
2515 		result |= BIT_ULL(VIRTIO_NET_F_HOST_ECN);
2516 	if (dev_features & BIT_ULL(MLX5_VIRTIO_NET_F_GUEST_ECN))
2517 		result |= BIT_ULL(VIRTIO_NET_F_GUEST_ECN);
2518 	if (dev_features & BIT_ULL(MLX5_VIRTIO_NET_F_GUEST_TSO6))
2519 		result |= BIT_ULL(VIRTIO_NET_F_GUEST_TSO6);
2520 	if (dev_features & BIT_ULL(MLX5_VIRTIO_NET_F_GUEST_TSO4))
2521 		result |= BIT_ULL(VIRTIO_NET_F_GUEST_TSO4);
2522 	if (dev_features & BIT_ULL(MLX5_VIRTIO_NET_F_GUEST_CSUM))
2523 		result |= BIT_ULL(VIRTIO_NET_F_GUEST_CSUM);
2524 	if (dev_features & BIT_ULL(MLX5_VIRTIO_NET_F_CSUM))
2525 		result |= BIT_ULL(VIRTIO_NET_F_CSUM);
2526 	if (dev_features & BIT_ULL(MLX5_VIRTIO_NET_F_HOST_TSO6))
2527 		result |= BIT_ULL(VIRTIO_NET_F_HOST_TSO6);
2528 	if (dev_features & BIT_ULL(MLX5_VIRTIO_NET_F_HOST_TSO4))
2529 		result |= BIT_ULL(VIRTIO_NET_F_HOST_TSO4);
2530 
2531 	return result;
2532 }
2533 
2534 static u64 get_supported_features(struct mlx5_core_dev *mdev)
2535 {
2536 	u64 mlx_vdpa_features = 0;
2537 	u16 dev_features;
2538 
2539 	dev_features = MLX5_CAP_DEV_VDPA_EMULATION(mdev, device_features_bits_mask);
2540 	mlx_vdpa_features |= mlx_to_virtio_features(dev_features);
2541 	if (MLX5_CAP_DEV_VDPA_EMULATION(mdev, virtio_version_1_0))
2542 		mlx_vdpa_features |= BIT_ULL(VIRTIO_F_VERSION_1);
2543 	mlx_vdpa_features |= BIT_ULL(VIRTIO_F_ACCESS_PLATFORM);
2544 	mlx_vdpa_features |= BIT_ULL(VIRTIO_NET_F_CTRL_VQ);
2545 	mlx_vdpa_features |= BIT_ULL(VIRTIO_NET_F_CTRL_MAC_ADDR);
2546 	mlx_vdpa_features |= BIT_ULL(VIRTIO_NET_F_MQ);
2547 	mlx_vdpa_features |= BIT_ULL(VIRTIO_NET_F_STATUS);
2548 	mlx_vdpa_features |= BIT_ULL(VIRTIO_NET_F_MTU);
2549 	mlx_vdpa_features |= BIT_ULL(VIRTIO_NET_F_CTRL_VLAN);
2550 	mlx_vdpa_features |= BIT_ULL(VIRTIO_NET_F_MAC);
2551 
2552 	return mlx_vdpa_features;
2553 }
2554 
2555 static u64 mlx5_vdpa_get_device_features(struct vdpa_device *vdev)
2556 {
2557 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
2558 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
2559 
2560 	print_features(mvdev, ndev->mvdev.mlx_features, false);
2561 	return ndev->mvdev.mlx_features;
2562 }
2563 
2564 static int verify_driver_features(struct mlx5_vdpa_dev *mvdev, u64 features)
2565 {
2566 	/* Minimum features to expect */
2567 	if (!(features & BIT_ULL(VIRTIO_F_ACCESS_PLATFORM)))
2568 		return -EOPNOTSUPP;
2569 
2570 	/* Double check the feature combination sent down by the driver.
2571 	 * Fail invalid combinations where a required feature is absent.
2572 	 *
2573 	 * Per the VIRTIO v1.1 specification, section 5.1.3.1 Feature bit
2574 	 * requirements: "VIRTIO_NET_F_MQ Requires VIRTIO_NET_F_CTRL_VQ".
2575 	 * By failing invalid features sent down by untrusted drivers, we
2576 	 * ensure the assumptions made by is_index_valid() and
2577 	 * is_ctrl_vq_idx() are not compromised.
2578 	 */
2579 	if ((features & (BIT_ULL(VIRTIO_NET_F_MQ) | BIT_ULL(VIRTIO_NET_F_CTRL_VQ))) ==
2580 	    BIT_ULL(VIRTIO_NET_F_MQ))
2581 		return -EINVAL;
2582 
2583 	return 0;
2584 }
2585 
2586 static int setup_virtqueues(struct mlx5_vdpa_dev *mvdev, bool filled)
2587 {
2588 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
2589 	int err;
2590 	int i;
2591 
2592 	for (i = 0; i < mvdev->max_vqs; i++) {
2593 		err = setup_vq(ndev, &ndev->vqs[i], filled);
2594 		if (err)
2595 			goto err_vq;
2596 	}
2597 
2598 	return 0;
2599 
2600 err_vq:
2601 	for (--i; i >= 0; i--)
2602 		teardown_vq(ndev, &ndev->vqs[i]);
2603 
2604 	return err;
2605 }
2606 
2607 static void teardown_virtqueues(struct mlx5_vdpa_net *ndev)
2608 {
2609 	int i;
2610 
2611 	for (i = ndev->mvdev.max_vqs - 1; i >= 0; i--)
2612 		teardown_vq(ndev, &ndev->vqs[i]);
2613 }
2614 
2615 static void update_cvq_info(struct mlx5_vdpa_dev *mvdev)
2616 {
2617 	if (MLX5_FEATURE(mvdev, VIRTIO_NET_F_CTRL_VQ)) {
2618 		if (MLX5_FEATURE(mvdev, VIRTIO_NET_F_MQ)) {
2619 			/* MQ supported. CVQ index is right above the last data virtqueue's */
2620 			mvdev->max_idx = mvdev->max_vqs;
2621 		} else {
2622 			/* Only CVQ is supported. Data virtqueues occupy indices 0 and 1;
2623 			 * the CVQ gets index 2.
2624 			 */
2625 			mvdev->max_idx = 2;
2626 		}
2627 	} else {
2628 		/* Two data virtqueues only: one for rx and one for tx */
2629 		mvdev->max_idx = 1;
2630 	}
2631 }
2632 
2633 static u8 query_vport_state(struct mlx5_core_dev *mdev, u8 opmod, u16 vport)
2634 {
2635 	u32 out[MLX5_ST_SZ_DW(query_vport_state_out)] = {};
2636 	u32 in[MLX5_ST_SZ_DW(query_vport_state_in)] = {};
2637 	int err;
2638 
2639 	MLX5_SET(query_vport_state_in, in, opcode, MLX5_CMD_OP_QUERY_VPORT_STATE);
2640 	MLX5_SET(query_vport_state_in, in, op_mod, opmod);
2641 	MLX5_SET(query_vport_state_in, in, vport_number, vport);
2642 	if (vport)
2643 		MLX5_SET(query_vport_state_in, in, other_vport, 1);
2644 
2645 	err = mlx5_cmd_exec_inout(mdev, query_vport_state, in, out);
2646 	if (err)
2647 		return 0;
2648 
2649 	return MLX5_GET(query_vport_state_out, out, state);
2650 }
2651 
2652 static bool get_link_state(struct mlx5_vdpa_dev *mvdev)
2653 {
2654 	if (query_vport_state(mvdev->mdev, MLX5_VPORT_STATE_OP_MOD_VNIC_VPORT, 0) ==
2655 	    VPORT_STATE_UP)
2656 		return true;
2657 
2658 	return false;
2659 }
2660 
2661 static void update_carrier(struct work_struct *work)
2662 {
2663 	struct mlx5_vdpa_wq_ent *wqent;
2664 	struct mlx5_vdpa_dev *mvdev;
2665 	struct mlx5_vdpa_net *ndev;
2666 
2667 	wqent = container_of(work, struct mlx5_vdpa_wq_ent, work);
2668 	mvdev = wqent->mvdev;
2669 	ndev = to_mlx5_vdpa_ndev(mvdev);
2670 	if (get_link_state(mvdev))
2671 		ndev->config.status |= cpu_to_mlx5vdpa16(mvdev, VIRTIO_NET_S_LINK_UP);
2672 	else
2673 		ndev->config.status &= cpu_to_mlx5vdpa16(mvdev, ~VIRTIO_NET_S_LINK_UP);
2674 
2675 	if (ndev->config_cb.callback)
2676 		ndev->config_cb.callback(ndev->config_cb.private);
2677 
2678 	kfree(wqent);
2679 }
2680 
2681 static int queue_link_work(struct mlx5_vdpa_net *ndev)
2682 {
2683 	struct mlx5_vdpa_wq_ent *wqent;
2684 
2685 	wqent = kzalloc(sizeof(*wqent), GFP_ATOMIC);
2686 	if (!wqent)
2687 		return -ENOMEM;
2688 
2689 	wqent->mvdev = &ndev->mvdev;
2690 	INIT_WORK(&wqent->work, update_carrier);
2691 	queue_work(ndev->mvdev.wq, &wqent->work);
2692 	return 0;
2693 }
2694 
2695 static int event_handler(struct notifier_block *nb, unsigned long event, void *param)
2696 {
2697 	struct mlx5_vdpa_net *ndev = container_of(nb, struct mlx5_vdpa_net, nb);
2698 	struct mlx5_eqe *eqe = param;
2699 	int ret = NOTIFY_DONE;
2700 
2701 	if (event == MLX5_EVENT_TYPE_PORT_CHANGE) {
2702 		switch (eqe->sub_type) {
2703 		case MLX5_PORT_CHANGE_SUBTYPE_DOWN:
2704 		case MLX5_PORT_CHANGE_SUBTYPE_ACTIVE:
2705 			if (queue_link_work(ndev))
2706 				return NOTIFY_DONE;
2707 
2708 			ret = NOTIFY_OK;
2709 			break;
2710 		default:
2711 			return NOTIFY_DONE;
2712 		}
2713 		return ret;
2714 	}
2715 	return ret;
2716 }
2717 
2718 static void register_link_notifier(struct mlx5_vdpa_net *ndev)
2719 {
2720 	if (!(ndev->mvdev.actual_features & BIT_ULL(VIRTIO_NET_F_STATUS)))
2721 		return;
2722 
2723 	ndev->nb.notifier_call = event_handler;
2724 	mlx5_notifier_register(ndev->mvdev.mdev, &ndev->nb);
2725 	ndev->nb_registered = true;
2726 	queue_link_work(ndev);
2727 }
2728 
2729 static void unregister_link_notifier(struct mlx5_vdpa_net *ndev)
2730 {
2731 	if (!ndev->nb_registered)
2732 		return;
2733 
2734 	ndev->nb_registered = false;
2735 	mlx5_notifier_unregister(ndev->mvdev.mdev, &ndev->nb);
2736 	if (ndev->mvdev.wq)
2737 		flush_workqueue(ndev->mvdev.wq);
2738 }
2739 
2740 static u64 mlx5_vdpa_get_backend_features(const struct vdpa_device *vdpa)
2741 {
2742 	return BIT_ULL(VHOST_BACKEND_F_ENABLE_AFTER_DRIVER_OK);
2743 }
2744 
2745 static int mlx5_vdpa_set_driver_features(struct vdpa_device *vdev, u64 features)
2746 {
2747 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
2748 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
2749 	u64 old_features = mvdev->actual_features;
2750 	int err;
2751 
2752 	print_features(mvdev, features, true);
2753 
2754 	err = verify_driver_features(mvdev, features);
2755 	if (err)
2756 		return err;
2757 
2758 	ndev->mvdev.actual_features = features & ndev->mvdev.mlx_features;
2759 
2760 	/* Interested in changes of vq features only. */
2761 	if (get_features(old_features) != get_features(mvdev->actual_features)) {
2762 		for (int i = 0; i < mvdev->max_vqs; ++i) {
2763 			struct mlx5_vdpa_virtqueue *mvq = &ndev->vqs[i];
2764 
2765 			mvq->modified_fields |= (
2766 				MLX5_VIRTQ_MODIFY_MASK_QUEUE_VIRTIO_VERSION |
2767 				MLX5_VIRTQ_MODIFY_MASK_QUEUE_FEATURES
2768 			);
2769 		}
2770 	}
2771 
2772 	update_cvq_info(mvdev);
2773 	return err;
2774 }
2775 
2776 static void mlx5_vdpa_set_config_cb(struct vdpa_device *vdev, struct vdpa_callback *cb)
2777 {
2778 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
2779 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
2780 
2781 	ndev->config_cb = *cb;
2782 }
2783 
2784 #define MLX5_VDPA_MAX_VQ_ENTRIES 256
2785 static u16 mlx5_vdpa_get_vq_num_max(struct vdpa_device *vdev)
2786 {
2787 	return MLX5_VDPA_MAX_VQ_ENTRIES;
2788 }
2789 
2790 static u32 mlx5_vdpa_get_device_id(struct vdpa_device *vdev)
2791 {
2792 	return VIRTIO_ID_NET;
2793 }
2794 
2795 static u32 mlx5_vdpa_get_vendor_id(struct vdpa_device *vdev)
2796 {
2797 	return PCI_VENDOR_ID_MELLANOX;
2798 }
2799 
2800 static u8 mlx5_vdpa_get_status(struct vdpa_device *vdev)
2801 {
2802 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
2803 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
2804 
2805 	print_status(mvdev, ndev->mvdev.status, false);
2806 	return ndev->mvdev.status;
2807 }
2808 
2809 static int save_channel_info(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq)
2810 {
2811 	struct mlx5_vq_restore_info *ri = &mvq->ri;
2812 	struct mlx5_virtq_attr attr = {};
2813 	int err;
2814 
2815 	if (mvq->initialized) {
2816 		err = query_virtqueue(ndev, mvq, &attr);
2817 		if (err)
2818 			return err;
2819 	}
2820 
2821 	ri->avail_index = attr.available_index;
2822 	ri->used_index = attr.used_index;
2823 	ri->ready = mvq->ready;
2824 	ri->num_ent = mvq->num_ent;
2825 	ri->desc_addr = mvq->desc_addr;
2826 	ri->device_addr = mvq->device_addr;
2827 	ri->driver_addr = mvq->driver_addr;
2828 	ri->map = mvq->map;
2829 	ri->restore = true;
2830 	return 0;
2831 }
2832 
2833 static int save_channels_info(struct mlx5_vdpa_net *ndev)
2834 {
2835 	int i;
2836 
2837 	for (i = 0; i < ndev->mvdev.max_vqs; i++) {
2838 		memset(&ndev->vqs[i].ri, 0, sizeof(ndev->vqs[i].ri));
2839 		save_channel_info(ndev, &ndev->vqs[i]);
2840 	}
2841 	return 0;
2842 }
2843 
2844 static void mlx5_clear_vqs(struct mlx5_vdpa_net *ndev)
2845 {
2846 	int i;
2847 
2848 	for (i = 0; i < ndev->mvdev.max_vqs; i++)
2849 		memset(&ndev->vqs[i], 0, offsetof(struct mlx5_vdpa_virtqueue, ri));
2850 }
2851 
2852 static void restore_channels_info(struct mlx5_vdpa_net *ndev)
2853 {
2854 	struct mlx5_vdpa_virtqueue *mvq;
2855 	struct mlx5_vq_restore_info *ri;
2856 	int i;
2857 
2858 	mlx5_clear_vqs(ndev);
2859 	mvqs_set_defaults(ndev);
2860 	for (i = 0; i < ndev->mvdev.max_vqs; i++) {
2861 		mvq = &ndev->vqs[i];
2862 		ri = &mvq->ri;
2863 		if (!ri->restore)
2864 			continue;
2865 
2866 		mvq->avail_idx = ri->avail_index;
2867 		mvq->used_idx = ri->used_index;
2868 		mvq->ready = ri->ready;
2869 		mvq->num_ent = ri->num_ent;
2870 		mvq->desc_addr = ri->desc_addr;
2871 		mvq->device_addr = ri->device_addr;
2872 		mvq->driver_addr = ri->driver_addr;
2873 		mvq->map = ri->map;
2874 	}
2875 }
2876 
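/* Switch a live device to a new memory mapping. All virtqueues are flagged
 * for an MKEY update; a device that is not resumable must additionally tear
 * down and recreate its virtqueue objects around the MR change, saving and
 * restoring the ring state.
 */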
2877 static int mlx5_vdpa_change_map(struct mlx5_vdpa_dev *mvdev,
2878 				struct mlx5_vdpa_mr *new_mr,
2879 				unsigned int asid)
2880 {
2881 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
2882 	bool teardown = !is_resumable(ndev);
2883 	int err;
2884 
2885 	suspend_vqs(ndev);
2886 	if (teardown) {
2887 		err = save_channels_info(ndev);
2888 		if (err)
2889 			return err;
2890 
2891 		teardown_vq_resources(ndev);
2892 	}
2893 
2894 	mlx5_vdpa_update_mr(mvdev, new_mr, asid);
2895 
2896 	for (int i = 0; i < mvdev->max_vqs; i++)
2897 		ndev->vqs[i].modified_fields |= MLX5_VIRTQ_MODIFY_MASK_VIRTIO_Q_MKEY |
2898 						MLX5_VIRTQ_MODIFY_MASK_DESC_GROUP_MKEY;
2899 
2900 	if (!(mvdev->status & VIRTIO_CONFIG_S_DRIVER_OK) || mvdev->suspended)
2901 		return 0;
2902 
2903 	if (teardown) {
2904 		restore_channels_info(ndev);
2905 		err = setup_vq_resources(ndev, true);
2906 		if (err)
2907 			return err;
2908 	}
2909 
2910 	resume_vqs(ndev);
2911 
2912 	return 0;
2913 }
2914 
2915 /* reslock must be held for this function */
2916 static int setup_vq_resources(struct mlx5_vdpa_net *ndev, bool filled)
2917 {
2918 	struct mlx5_vdpa_dev *mvdev = &ndev->mvdev;
2919 	int err;
2920 
2921 	WARN_ON(!rwsem_is_locked(&ndev->reslock));
2922 
2923 	if (ndev->setup) {
2924 		mlx5_vdpa_warn(mvdev, "setup driver called for already setup driver\n");
2925 		err = 0;
2926 		goto out;
2927 	}
2928 	mlx5_vdpa_add_debugfs(ndev);
2929 
2930 	err = read_umem_params(ndev);
2931 	if (err)
2932 		goto err_setup;
2933 
2934 	err = setup_virtqueues(mvdev, filled);
2935 	if (err) {
2936 		mlx5_vdpa_warn(mvdev, "setup_virtqueues\n");
2937 		goto err_setup;
2938 	}
2939 
2940 	err = create_rqt(ndev);
2941 	if (err) {
2942 		mlx5_vdpa_warn(mvdev, "create_rqt\n");
2943 		goto err_rqt;
2944 	}
2945 
2946 	err = create_tir(ndev);
2947 	if (err) {
2948 		mlx5_vdpa_warn(mvdev, "create_tir\n");
2949 		goto err_tir;
2950 	}
2951 
2952 	err = setup_steering(ndev);
2953 	if (err) {
2954 		mlx5_vdpa_warn(mvdev, "setup_steering\n");
2955 		goto err_fwd;
2956 	}
2957 	ndev->setup = true;
2958 
2959 	return 0;
2960 
2961 err_fwd:
2962 	destroy_tir(ndev);
2963 err_tir:
2964 	destroy_rqt(ndev);
2965 err_rqt:
2966 	teardown_virtqueues(ndev);
2967 err_setup:
2968 	mlx5_vdpa_remove_debugfs(ndev);
2969 out:
2970 	return err;
2971 }
2972 
2973 /* reslock must be held for this function */
2974 static void teardown_vq_resources(struct mlx5_vdpa_net *ndev)
2975 {
2977 	WARN_ON(!rwsem_is_locked(&ndev->reslock));
2978 
2979 	if (!ndev->setup)
2980 		return;
2981 
2982 	mlx5_vdpa_remove_debugfs(ndev);
2983 	teardown_steering(ndev);
2984 	destroy_tir(ndev);
2985 	destroy_rqt(ndev);
2986 	teardown_virtqueues(ndev);
2987 	ndev->setup = false;
2988 }
2989 
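/* (Re)initialize the vringh backing the control VQ from the addresses
 * programmed by the driver, preserving the last_avail_idx that may have
 * been set earlier through set_vq_state.
 */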
2990 static int setup_cvq_vring(struct mlx5_vdpa_dev *mvdev)
2991 {
2992 	struct mlx5_control_vq *cvq = &mvdev->cvq;
2993 	int err = 0;
2994 
2995 	if (mvdev->actual_features & BIT_ULL(VIRTIO_NET_F_CTRL_VQ)) {
2996 		u16 idx = cvq->vring.last_avail_idx;
2997 
2998 		err = vringh_init_iotlb(&cvq->vring, mvdev->actual_features,
2999 					cvq->vring.vring.num, false,
3000 					(struct vring_desc *)(uintptr_t)cvq->desc_addr,
3001 					(struct vring_avail *)(uintptr_t)cvq->driver_addr,
3002 					(struct vring_used *)(uintptr_t)cvq->device_addr);
3003 
3004 		if (!err)
3005 			cvq->vring.last_avail_idx = cvq->vring.last_used_idx = idx;
3006 	}
3007 	return err;
3008 }
3009 
3010 static void mlx5_vdpa_set_status(struct vdpa_device *vdev, u8 status)
3011 {
3012 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
3013 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
3014 	int err;
3015 
3016 	print_status(mvdev, status, true);
3017 
3018 	down_write(&ndev->reslock);
3019 
3020 	if ((status ^ ndev->mvdev.status) & VIRTIO_CONFIG_S_DRIVER_OK) {
3021 		if (status & VIRTIO_CONFIG_S_DRIVER_OK) {
3022 			err = setup_cvq_vring(mvdev);
3023 			if (err) {
3024 				mlx5_vdpa_warn(mvdev, "failed to setup control VQ vring\n");
3025 				goto err_setup;
3026 			}
3027 			register_link_notifier(ndev);
3028 			err = setup_vq_resources(ndev, true);
3029 			if (err) {
3030 				mlx5_vdpa_warn(mvdev, "failed to setup driver\n");
3031 				goto err_driver;
3032 			}
3033 		} else {
3034 			mlx5_vdpa_warn(mvdev, "did not expect DRIVER_OK to be cleared\n");
3035 			goto err_clear;
3036 		}
3037 	}
3038 
3039 	ndev->mvdev.status = status;
3040 	up_write(&ndev->reslock);
3041 	return;
3042 
3043 err_driver:
3044 	unregister_link_notifier(ndev);
3045 err_setup:
3046 	mlx5_vdpa_destroy_mr_resources(&ndev->mvdev);
3047 	ndev->mvdev.status |= VIRTIO_CONFIG_S_FAILED;
3048 err_clear:
3049 	up_write(&ndev->reslock);
3050 }
3051 
3052 static void init_group_to_asid_map(struct mlx5_vdpa_dev *mvdev)
3053 {
3054 	int i;
3055 
3056 	/* By default all groups are mapped to asid 0 */
3057 	for (i = 0; i < MLX5_VDPA_NUMVQ_GROUPS; i++)
3058 		mvdev->group2asid[i] = 0;
3059 }
3060 
3061 static int mlx5_vdpa_compat_reset(struct vdpa_device *vdev, u32 flags)
3062 {
3063 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
3064 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
3065 
3066 	print_status(mvdev, 0, true);
3067 	mlx5_vdpa_info(mvdev, "performing device reset\n");
3068 
3069 	down_write(&ndev->reslock);
3070 	unregister_link_notifier(ndev);
3071 	teardown_vq_resources(ndev);
3072 	mvqs_set_defaults(ndev);
3073 
3074 	if (flags & VDPA_RESET_F_CLEAN_MAP)
3075 		mlx5_vdpa_destroy_mr_resources(&ndev->mvdev);
3076 	ndev->mvdev.status = 0;
3077 	ndev->mvdev.suspended = false;
3078 	ndev->cur_num_vqs = MLX5V_DEFAULT_VQ_COUNT;
3079 	ndev->mvdev.cvq.ready = false;
3080 	ndev->mvdev.cvq.received_desc = 0;
3081 	ndev->mvdev.cvq.completed_desc = 0;
3082 	memset(ndev->event_cbs, 0, sizeof(*ndev->event_cbs) * (mvdev->max_vqs + 1));
3083 	ndev->mvdev.actual_features = 0;
3084 	init_group_to_asid_map(mvdev);
3085 	++mvdev->generation;
3086 
3087 	if ((flags & VDPA_RESET_F_CLEAN_MAP) &&
3088 	    MLX5_CAP_GEN(mvdev->mdev, umem_uid_0)) {
3089 		if (mlx5_vdpa_create_dma_mr(mvdev))
3090 			mlx5_vdpa_warn(mvdev, "create MR failed\n");
3091 	}
3092 	up_write(&ndev->reslock);
3093 
3094 	return 0;
3095 }
3096 
3097 static int mlx5_vdpa_reset(struct vdpa_device *vdev)
3098 {
3099 	return mlx5_vdpa_compat_reset(vdev, 0);
3100 }
3101 
3102 static size_t mlx5_vdpa_get_config_size(struct vdpa_device *vdev)
3103 {
3104 	return sizeof(struct virtio_net_config);
3105 }
3106 
3107 static void mlx5_vdpa_get_config(struct vdpa_device *vdev, unsigned int offset, void *buf,
3108 				 unsigned int len)
3109 {
3110 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
3111 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
3112 
3113 	if (offset + len <= sizeof(struct virtio_net_config))
3114 		memcpy(buf, (u8 *)&ndev->config + offset, len);
3115 }
3116 
3117 static void mlx5_vdpa_set_config(struct vdpa_device *vdev, unsigned int offset, const void *buf,
3118 				 unsigned int len)
3119 {
3120 	/* not supported */
3121 }
3122 
3123 static u32 mlx5_vdpa_get_generation(struct vdpa_device *vdev)
3124 {
3125 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
3126 
3127 	return mvdev->generation;
3128 }
3129 
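/* Install a new mapping for an address space: create an MR for a non-empty
 * iotlb (an empty iotlb clears the previous MR), then either set the MR
 * directly if none exists for this ASID, or go through the heavier
 * change-map path for a live device.
 */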
3130 static int set_map_data(struct mlx5_vdpa_dev *mvdev, struct vhost_iotlb *iotlb,
3131 			unsigned int asid)
3132 {
3133 	struct mlx5_vdpa_mr *new_mr;
3134 	int err;
3135 
3136 	if (asid >= MLX5_VDPA_NUM_AS)
3137 		return -EINVAL;
3138 
3139 	if (vhost_iotlb_itree_first(iotlb, 0, U64_MAX)) {
3140 		new_mr = mlx5_vdpa_create_mr(mvdev, iotlb);
3141 		if (IS_ERR(new_mr)) {
3142 			err = PTR_ERR(new_mr);
3143 			mlx5_vdpa_warn(mvdev, "create map failed(%d)\n", err);
3144 			return err;
3145 		}
3146 	} else {
3147 		/* Empty iotlbs don't have an mr but will clear the previous mr. */
3148 		new_mr = NULL;
3149 	}
3150 
3151 	if (!mvdev->mr[asid]) {
3152 		mlx5_vdpa_update_mr(mvdev, new_mr, asid);
3153 	} else {
3154 		err = mlx5_vdpa_change_map(mvdev, new_mr, asid);
3155 		if (err) {
3156 			mlx5_vdpa_warn(mvdev, "change map failed(%d)\n", err);
3157 			goto out_err;
3158 		}
3159 	}
3160 
3161 	return mlx5_vdpa_update_cvq_iotlb(mvdev, iotlb, asid);
3162 
3163 out_err:
3164 	mlx5_vdpa_put_mr(mvdev, new_mr);
3165 	return err;
3166 }
3167 
3168 static int mlx5_vdpa_set_map(struct vdpa_device *vdev, unsigned int asid,
3169 			     struct vhost_iotlb *iotlb)
3170 {
3171 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
3172 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
3173 	int err;
3174 
3175 	down_write(&ndev->reslock);
3176 	err = set_map_data(mvdev, iotlb, asid);
3177 	up_write(&ndev->reslock);
3178 	return err;
3179 }
3180 
3181 static int mlx5_vdpa_reset_map(struct vdpa_device *vdev, unsigned int asid)
3182 {
3183 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
3184 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
3185 	int err;
3186 
3187 	down_write(&ndev->reslock);
3188 	err = mlx5_vdpa_reset_mr(mvdev, asid);
3189 	up_write(&ndev->reslock);
3190 	return err;
3191 }
3192 
3193 static struct device *mlx5_get_vq_dma_dev(struct vdpa_device *vdev, u16 idx)
3194 {
3195 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
3196 
3197 	if (is_ctrl_vq_idx(mvdev, idx))
3198 		return &vdev->dev;
3199 
3200 	return mvdev->vdev.dma_dev;
3201 }
3202 
3203 static void free_irqs(struct mlx5_vdpa_net *ndev)
3204 {
3205 	struct mlx5_vdpa_irq_pool_entry *ent;
3206 	int i;
3207 
3208 	if (!msix_mode_supported(&ndev->mvdev))
3209 		return;
3210 
3211 	if (!ndev->irqp.entries)
3212 		return;
3213 
3214 	for (i = ndev->irqp.num_ent - 1; i >= 0; i--) {
3215 		ent = ndev->irqp.entries + i;
3216 		if (ent->map.virq)
3217 			pci_msix_free_irq(ndev->mvdev.mdev->pdev, ent->map);
3218 	}
3219 	kfree(ndev->irqp.entries);
3220 }
3221 
3222 static void mlx5_vdpa_free(struct vdpa_device *vdev)
3223 {
3224 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
3225 	struct mlx5_core_dev *pfmdev;
3226 	struct mlx5_vdpa_net *ndev;
3227 
3228 	ndev = to_mlx5_vdpa_ndev(mvdev);
3229 
3230 	free_fixed_resources(ndev);
3231 	mlx5_vdpa_destroy_mr_resources(mvdev);
3232 	if (!is_zero_ether_addr(ndev->config.mac)) {
3233 		pfmdev = pci_get_drvdata(pci_physfn(mvdev->mdev->pdev));
3234 		mlx5_mpfs_del_mac(pfmdev, ndev->config.mac);
3235 	}
3236 	mlx5_vdpa_free_resources(&ndev->mvdev);
3237 	free_irqs(ndev);
3238 	kfree(ndev->event_cbs);
3239 	kfree(ndev->vqs);
3240 }
3241 
3242 static struct vdpa_notification_area mlx5_get_vq_notification(struct vdpa_device *vdev, u16 idx)
3243 {
3244 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
3245 	struct vdpa_notification_area ret = {};
3246 	struct mlx5_vdpa_net *ndev;
3247 	phys_addr_t addr;
3248 
3249 	if (!is_index_valid(mvdev, idx) || is_ctrl_vq_idx(mvdev, idx))
3250 		return ret;
3251 
3252 	/* If the SF BAR size is smaller than PAGE_SIZE, do not use direct
3253 	 * notification, to avoid the risk of mapping pages that contain the
3254 	 * BARs of more than one SF.
3255 	 */
3256 	if (MLX5_CAP_GEN(mvdev->mdev, log_min_sf_size) + 12 < PAGE_SHIFT)
3257 		return ret;
3258 
3259 	ndev = to_mlx5_vdpa_ndev(mvdev);
3260 	addr = (phys_addr_t)ndev->mvdev.res.phys_kick_addr;
3261 	ret.addr = addr;
3262 	ret.size = PAGE_SIZE;
3263 	return ret;
3264 }
3265 
3266 static int mlx5_get_vq_irq(struct vdpa_device *vdev, u16 idx)
3267 {
3268 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
3269 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
3270 	struct mlx5_vdpa_virtqueue *mvq;
3271 
3272 	if (!is_index_valid(mvdev, idx))
3273 		return -EINVAL;
3274 
3275 	if (is_ctrl_vq_idx(mvdev, idx))
3276 		return -EOPNOTSUPP;
3277 
3278 	mvq = &ndev->vqs[idx];
3279 	if (!mvq->map.virq)
3280 		return -EOPNOTSUPP;
3281 
3282 	return mvq->map.virq;
3283 }
3284 
3285 static u64 mlx5_vdpa_get_driver_features(struct vdpa_device *vdev)
3286 {
3287 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
3288 
3289 	return mvdev->actual_features;
3290 }
3291 
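/* Query the virtio_q_counters object of a virtqueue for the number of
 * descriptors the device has received and completed. Only valid while the
 * virtqueue object is in the RDY state.
 */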
3292 static int counter_set_query(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq,
3293 			     u64 *received_desc, u64 *completed_desc)
3294 {
3295 	u32 in[MLX5_ST_SZ_DW(query_virtio_q_counters_in)] = {};
3296 	u32 out[MLX5_ST_SZ_DW(query_virtio_q_counters_out)] = {};
3297 	void *cmd_hdr;
3298 	void *ctx;
3299 	int err;
3300 
3301 	if (!counters_supported(&ndev->mvdev))
3302 		return -EOPNOTSUPP;
3303 
3304 	if (mvq->fw_state != MLX5_VIRTIO_NET_Q_OBJECT_STATE_RDY)
3305 		return -EAGAIN;
3306 
3307 	cmd_hdr = MLX5_ADDR_OF(query_virtio_q_counters_in, in, hdr);
3308 
3309 	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, opcode, MLX5_CMD_OP_QUERY_GENERAL_OBJECT);
3310 	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, obj_type, MLX5_OBJ_TYPE_VIRTIO_Q_COUNTERS);
3311 	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, uid, ndev->mvdev.res.uid);
3312 	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, obj_id, mvq->counter_set_id);
3313 
3314 	err = mlx5_cmd_exec(ndev->mvdev.mdev, in, sizeof(in), out, sizeof(out));
3315 	if (err)
3316 		return err;
3317 
3318 	ctx = MLX5_ADDR_OF(query_virtio_q_counters_out, out, counters);
3319 	*received_desc = MLX5_GET64(virtio_q_counters, ctx, received_desc);
3320 	*completed_desc = MLX5_GET64(virtio_q_counters, ctx, completed_desc);
3321 	return 0;
3322 }
3323 
3324 static int mlx5_vdpa_get_vendor_vq_stats(struct vdpa_device *vdev, u16 idx,
3325 					 struct sk_buff *msg,
3326 					 struct netlink_ext_ack *extack)
3327 {
3328 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
3329 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
3330 	struct mlx5_vdpa_virtqueue *mvq;
3331 	struct mlx5_control_vq *cvq;
3332 	u64 received_desc;
3333 	u64 completed_desc;
3334 	int err = 0;
3335 
3336 	down_read(&ndev->reslock);
3337 	if (!is_index_valid(mvdev, idx)) {
3338 		NL_SET_ERR_MSG_MOD(extack, "virtqueue index is not valid");
3339 		err = -EINVAL;
3340 		goto out_err;
3341 	}
3342 
3343 	if (idx == ctrl_vq_idx(mvdev)) {
3344 		cvq = &mvdev->cvq;
3345 		received_desc = cvq->received_desc;
3346 		completed_desc = cvq->completed_desc;
3347 		goto out;
3348 	}
3349 
3350 	mvq = &ndev->vqs[idx];
3351 	err = counter_set_query(ndev, mvq, &received_desc, &completed_desc);
3352 	if (err) {
3353 		NL_SET_ERR_MSG_MOD(extack, "failed to query hardware");
3354 		goto out_err;
3355 	}
3356 
3357 out:
3358 	err = -EMSGSIZE;
3359 	if (nla_put_string(msg, VDPA_ATTR_DEV_VENDOR_ATTR_NAME, "received_desc"))
3360 		goto out_err;
3361 
3362 	if (nla_put_u64_64bit(msg, VDPA_ATTR_DEV_VENDOR_ATTR_VALUE, received_desc,
3363 			      VDPA_ATTR_PAD))
3364 		goto out_err;
3365 
3366 	if (nla_put_string(msg, VDPA_ATTR_DEV_VENDOR_ATTR_NAME, "completed_desc"))
3367 		goto out_err;
3368 
3369 	if (nla_put_u64_64bit(msg, VDPA_ATTR_DEV_VENDOR_ATTR_VALUE, completed_desc,
3370 			      VDPA_ATTR_PAD))
3371 		goto out_err;
3372 
3373 	err = 0;
3374 out_err:
3375 	up_read(&ndev->reslock);
3376 	return err;
3377 }
3378 
3379 static void mlx5_vdpa_cvq_suspend(struct mlx5_vdpa_dev *mvdev)
3380 {
3381 	struct mlx5_control_vq *cvq;
3382 
3383 	if (!(mvdev->actual_features & BIT_ULL(VIRTIO_NET_F_CTRL_VQ)))
3384 		return;
3385 
3386 	cvq = &mvdev->cvq;
3387 	cvq->ready = false;
3388 }
3389 
3390 static int mlx5_vdpa_suspend(struct vdpa_device *vdev)
3391 {
3392 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
3393 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
3394 
3395 	mlx5_vdpa_info(mvdev, "suspending device\n");
3396 
3397 	down_write(&ndev->reslock);
3398 	unregister_link_notifier(ndev);
3399 	suspend_vqs(ndev);
3400 	mlx5_vdpa_cvq_suspend(mvdev);
3401 	mvdev->suspended = true;
3402 	up_write(&ndev->reslock);
3403 	return 0;
3404 }
3405 
3406 static int mlx5_vdpa_resume(struct vdpa_device *vdev)
3407 {
3408 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
3409 	struct mlx5_vdpa_net *ndev;
3410 
3411 	ndev = to_mlx5_vdpa_ndev(mvdev);
3412 
3413 	mlx5_vdpa_info(mvdev, "resuming device\n");
3414 
3415 	down_write(&ndev->reslock);
3416 	mvdev->suspended = false;
3417 	resume_vqs(ndev);
3418 	register_link_notifier(ndev);
3419 	up_write(&ndev->reslock);
3420 	return 0;
3421 }
3422 
3423 static int mlx5_set_group_asid(struct vdpa_device *vdev, u32 group,
3424 			       unsigned int asid)
3425 {
3426 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
3427 	int err = 0;
3428 
3429 	if (group >= MLX5_VDPA_NUMVQ_GROUPS)
3430 		return -EINVAL;
3431 
3432 	mvdev->group2asid[group] = asid;
3433 
3434 	mutex_lock(&mvdev->mr_mtx);
3435 	if (group == MLX5_VDPA_CVQ_GROUP && mvdev->mr[asid])
3436 		err = mlx5_vdpa_update_cvq_iotlb(mvdev, mvdev->mr[asid]->iotlb, asid);
3437 	mutex_unlock(&mvdev->mr_mtx);
3438 
3439 	return err;
3440 }
3441 
3442 static const struct vdpa_config_ops mlx5_vdpa_ops = {
3443 	.set_vq_address = mlx5_vdpa_set_vq_address,
3444 	.set_vq_num = mlx5_vdpa_set_vq_num,
3445 	.kick_vq = mlx5_vdpa_kick_vq,
3446 	.set_vq_cb = mlx5_vdpa_set_vq_cb,
3447 	.set_vq_ready = mlx5_vdpa_set_vq_ready,
3448 	.get_vq_ready = mlx5_vdpa_get_vq_ready,
3449 	.set_vq_state = mlx5_vdpa_set_vq_state,
3450 	.get_vq_state = mlx5_vdpa_get_vq_state,
3451 	.get_vendor_vq_stats = mlx5_vdpa_get_vendor_vq_stats,
3452 	.get_vq_notification = mlx5_get_vq_notification,
3453 	.get_vq_irq = mlx5_get_vq_irq,
3454 	.get_vq_align = mlx5_vdpa_get_vq_align,
3455 	.get_vq_group = mlx5_vdpa_get_vq_group,
3456 	.get_vq_desc_group = mlx5_vdpa_get_vq_desc_group, /* Op disabled if not supported. */
3457 	.get_device_features = mlx5_vdpa_get_device_features,
3458 	.get_backend_features = mlx5_vdpa_get_backend_features,
3459 	.set_driver_features = mlx5_vdpa_set_driver_features,
3460 	.get_driver_features = mlx5_vdpa_get_driver_features,
3461 	.set_config_cb = mlx5_vdpa_set_config_cb,
3462 	.get_vq_num_max = mlx5_vdpa_get_vq_num_max,
3463 	.get_device_id = mlx5_vdpa_get_device_id,
3464 	.get_vendor_id = mlx5_vdpa_get_vendor_id,
3465 	.get_status = mlx5_vdpa_get_status,
3466 	.set_status = mlx5_vdpa_set_status,
3467 	.reset = mlx5_vdpa_reset,
3468 	.compat_reset = mlx5_vdpa_compat_reset,
3469 	.get_config_size = mlx5_vdpa_get_config_size,
3470 	.get_config = mlx5_vdpa_get_config,
3471 	.set_config = mlx5_vdpa_set_config,
3472 	.get_generation = mlx5_vdpa_get_generation,
3473 	.set_map = mlx5_vdpa_set_map,
3474 	.reset_map = mlx5_vdpa_reset_map,
3475 	.set_group_asid = mlx5_set_group_asid,
3476 	.get_vq_dma_dev = mlx5_get_vq_dma_dev,
3477 	.free = mlx5_vdpa_free,
3478 	.suspend = mlx5_vdpa_suspend,
3479 	.resume = mlx5_vdpa_resume, /* Op disabled if not supported. */
3480 };
3481 
3482 static int query_mtu(struct mlx5_core_dev *mdev, u16 *mtu)
3483 {
3484 	u16 hw_mtu;
3485 	int err;
3486 
3487 	err = mlx5_query_nic_vport_mtu(mdev, &hw_mtu);
3488 	if (err)
3489 		return err;
3490 
3491 	*mtu = hw_mtu - MLX5V_ETH_HARD_MTU;
3492 	return 0;
3493 }
3494 
3495 static int alloc_fixed_resources(struct mlx5_vdpa_net *ndev)
3496 {
3497 	struct mlx5_vdpa_net_resources *res = &ndev->res;
3498 	int err;
3499 
3500 	if (res->valid) {
3501 		mlx5_vdpa_warn(&ndev->mvdev, "resources already allocated\n");
3502 		return -EEXIST;
3503 	}
3504 
3505 	err = mlx5_vdpa_alloc_transport_domain(&ndev->mvdev, &res->tdn);
3506 	if (err)
3507 		return err;
3508 
3509 	err = create_tis(ndev);
3510 	if (err)
3511 		goto err_tis;
3512 
3513 	res->valid = true;
3514 
3515 	return 0;
3516 
3517 err_tis:
3518 	mlx5_vdpa_dealloc_transport_domain(&ndev->mvdev, res->tdn);
3519 	return err;
3520 }
3521 
3522 static void free_fixed_resources(struct mlx5_vdpa_net *ndev)
3523 {
3524 	struct mlx5_vdpa_net_resources *res = &ndev->res;
3525 
3526 	if (!res->valid)
3527 		return;
3528 
3529 	destroy_tis(ndev);
3530 	mlx5_vdpa_dealloc_transport_domain(&ndev->mvdev, res->tdn);
3531 	res->valid = false;
3532 }
3533 
3534 static void mvqs_set_defaults(struct mlx5_vdpa_net *ndev)
3535 {
3536 	struct mlx5_vdpa_virtqueue *mvq;
3537 	int i;
3538 
3539 	for (i = 0; i < ndev->mvdev.max_vqs; ++i) {
3540 		mvq = &ndev->vqs[i];
3541 		memset(mvq, 0, offsetof(struct mlx5_vdpa_virtqueue, ri));
3542 		mvq->index = i;
3543 		mvq->ndev = ndev;
3544 		mvq->fwqp.fw = true;
3545 		mvq->fw_state = MLX5_VIRTIO_NET_Q_OBJECT_NONE;
3546 		mvq->num_ent = MLX5V_DEFAULT_VQ_SIZE;
3547 	}
3548 }
3549 
3550 struct mlx5_vdpa_mgmtdev {
3551 	struct vdpa_mgmt_dev mgtdev;
3552 	struct mlx5_adev *madev;
3553 	struct mlx5_vdpa_net *ndev;
3554 	struct vdpa_config_ops vdpa_ops;
3555 };
3556 
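/* Program the NIC vport context MTU to the requested virtio MTU plus the
 * Ethernet hard header overhead (MLX5V_ETH_HARD_MTU).
 */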
3557 static int config_func_mtu(struct mlx5_core_dev *mdev, u16 mtu)
3558 {
3559 	int inlen = MLX5_ST_SZ_BYTES(modify_nic_vport_context_in);
3560 	void *in;
3561 	int err;
3562 
3563 	in = kvzalloc(inlen, GFP_KERNEL);
3564 	if (!in)
3565 		return -ENOMEM;
3566 
3567 	MLX5_SET(modify_nic_vport_context_in, in, field_select.mtu, 1);
3568 	MLX5_SET(modify_nic_vport_context_in, in, nic_vport_context.mtu,
3569 		 mtu + MLX5V_ETH_HARD_MTU);
3570 	MLX5_SET(modify_nic_vport_context_in, in, opcode,
3571 		 MLX5_CMD_OP_MODIFY_NIC_VPORT_CONTEXT);
3572 
3573 	err = mlx5_cmd_exec_in(mdev, modify_nic_vport_context, in);
3574 
3575 	kvfree(in);
3576 	return err;
3577 }
3578 
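/* Best-effort pre-allocation of one dynamic MSI-X vector per virtqueue, done
 * only when MSI-X vq mode is supported. Allocation stops at the first
 * failure; irqp.num_ent records how many vectors were obtained and failure
 * is not treated as fatal.
 */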
3579 static void allocate_irqs(struct mlx5_vdpa_net *ndev)
3580 {
3581 	struct mlx5_vdpa_irq_pool_entry *ent;
3582 	int i;
3583 
3584 	if (!msix_mode_supported(&ndev->mvdev))
3585 		return;
3586 
3587 	if (!ndev->mvdev.mdev->pdev)
3588 		return;
3589 
3590 	ndev->irqp.entries = kcalloc(ndev->mvdev.max_vqs, sizeof(*ndev->irqp.entries), GFP_KERNEL);
3591 	if (!ndev->irqp.entries)
3592 		return;
3593 
3595 	for (i = 0; i < ndev->mvdev.max_vqs; i++) {
3596 		ent = ndev->irqp.entries + i;
3597 		snprintf(ent->name, MLX5_VDPA_IRQ_NAME_LEN, "%s-vq-%d",
3598 			 dev_name(&ndev->mvdev.vdev.dev), i);
3599 		ent->map = pci_msix_alloc_irq_at(ndev->mvdev.mdev->pdev, MSI_ANY_INDEX, NULL);
3600 		if (!ent->map.virq)
3601 			return;
3602 
3603 		ndev->irqp.num_ent++;
3604 	}
3605 }
3606 
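/* Management .dev_add callback: validate the provisioned features against
 * device capabilities, size the virtqueue array, allocate the vdpa net
 * device, fill the virtio_net config space (mtu, mac, status, max VQ pairs)
 * and register the device with the vdpa core. Only a single vdpa device per
 * management device is supported (mgtdev->ndev).
 */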
3607 static int mlx5_vdpa_dev_add(struct vdpa_mgmt_dev *v_mdev, const char *name,
3608 			     const struct vdpa_dev_set_config *add_config)
3609 {
3610 	struct mlx5_vdpa_mgmtdev *mgtdev = container_of(v_mdev, struct mlx5_vdpa_mgmtdev, mgtdev);
3611 	struct virtio_net_config *config;
3612 	struct mlx5_core_dev *pfmdev;
3613 	struct mlx5_vdpa_dev *mvdev;
3614 	struct mlx5_vdpa_net *ndev;
3615 	struct mlx5_core_dev *mdev;
3616 	u64 device_features;
3617 	u32 max_vqs;
3618 	u16 mtu;
3619 	int err;
3620 
3621 	if (mgtdev->ndev)
3622 		return -ENOSPC;
3623 
3624 	mdev = mgtdev->madev->mdev;
3625 	device_features = mgtdev->mgtdev.supported_features;
3626 	if (add_config->mask & BIT_ULL(VDPA_ATTR_DEV_FEATURES)) {
3627 		if (add_config->device_features & ~device_features) {
3628 			dev_warn(mdev->device,
3629 				 "The provisioned features 0x%llx are not supported by this device with features 0x%llx\n",
3630 				 add_config->device_features, device_features);
3631 			return -EINVAL;
3632 		}
3633 		device_features &= add_config->device_features;
3634 	} else {
3635 		device_features &= ~BIT_ULL(VIRTIO_NET_F_MRG_RXBUF);
3636 	}
3637 	if (!(device_features & BIT_ULL(VIRTIO_F_VERSION_1) &&
3638 	      device_features & BIT_ULL(VIRTIO_F_ACCESS_PLATFORM))) {
3639 		dev_warn(mdev->device,
3640 			 "Must provision minimum features 0x%llx for this device\n",
3641 			 BIT_ULL(VIRTIO_F_VERSION_1) | BIT_ULL(VIRTIO_F_ACCESS_PLATFORM));
3642 		return -EOPNOTSUPP;
3643 	}
3644 
3645 	if (!(MLX5_CAP_DEV_VDPA_EMULATION(mdev, virtio_queue_type) &
3646 	    MLX5_VIRTIO_EMULATION_CAP_VIRTIO_QUEUE_TYPE_SPLIT)) {
3647 		dev_warn(mdev->device, "missing support for split virtqueues\n");
3648 		return -EOPNOTSUPP;
3649 	}
3650 
3651 	max_vqs = min_t(int, MLX5_CAP_DEV_VDPA_EMULATION(mdev, max_num_virtio_queues),
3652 			1 << MLX5_CAP_GEN(mdev, log_max_rqt_size));
3653 	if (max_vqs < 2) {
3654 		dev_warn(mdev->device,
3655 			 "%d virtqueues are supported. At least 2 are required\n",
3656 			 max_vqs);
3657 		return -EAGAIN;
3658 	}
3659 
3660 	if (add_config->mask & BIT_ULL(VDPA_ATTR_DEV_NET_CFG_MAX_VQP)) {
3661 		if (add_config->net.max_vq_pairs > max_vqs / 2)
3662 			return -EINVAL;
3663 		max_vqs = min_t(u32, max_vqs, 2 * add_config->net.max_vq_pairs);
3664 	} else {
3665 		max_vqs = 2;
3666 	}
3667 
3668 	ndev = vdpa_alloc_device(struct mlx5_vdpa_net, mvdev.vdev, mdev->device, &mgtdev->vdpa_ops,
3669 				 MLX5_VDPA_NUMVQ_GROUPS, MLX5_VDPA_NUM_AS, name, false);
3670 	if (IS_ERR(ndev))
3671 		return PTR_ERR(ndev);
3672 
3673 	ndev->mvdev.max_vqs = max_vqs;
3674 	mvdev = &ndev->mvdev;
3675 	mvdev->mdev = mdev;
3676 
3677 	ndev->vqs = kcalloc(max_vqs, sizeof(*ndev->vqs), GFP_KERNEL);
3678 	ndev->event_cbs = kcalloc(max_vqs + 1, sizeof(*ndev->event_cbs), GFP_KERNEL);
3679 	if (!ndev->vqs || !ndev->event_cbs) {
3680 		err = -ENOMEM;
3681 		goto err_alloc;
3682 	}
3683 	ndev->cur_num_vqs = MLX5V_DEFAULT_VQ_COUNT;
3684 
3685 	mvqs_set_defaults(ndev);
3686 	allocate_irqs(ndev);
3687 	init_rwsem(&ndev->reslock);
3688 	config = &ndev->config;
3689 
3690 	if (add_config->mask & BIT_ULL(VDPA_ATTR_DEV_NET_CFG_MTU)) {
3691 		err = config_func_mtu(mdev, add_config->net.mtu);
3692 		if (err)
3693 			goto err_alloc;
3694 	}
3695 
3696 	if (device_features & BIT_ULL(VIRTIO_NET_F_MTU)) {
3697 		err = query_mtu(mdev, &mtu);
3698 		if (err)
3699 			goto err_alloc;
3700 
3701 		ndev->config.mtu = cpu_to_mlx5vdpa16(mvdev, mtu);
3702 	}
3703 
3704 	if (device_features & BIT_ULL(VIRTIO_NET_F_STATUS)) {
3705 		if (get_link_state(mvdev))
3706 			ndev->config.status |= cpu_to_mlx5vdpa16(mvdev, VIRTIO_NET_S_LINK_UP);
3707 		else
3708 			ndev->config.status &= cpu_to_mlx5vdpa16(mvdev, ~VIRTIO_NET_S_LINK_UP);
3709 	}
3710 
3711 	if (add_config->mask & BIT_ULL(VDPA_ATTR_DEV_NET_CFG_MACADDR)) {
3712 		memcpy(ndev->config.mac, add_config->net.mac, ETH_ALEN);
3713 	/* Don't bother setting a mac address in config if _F_MAC won't be provisioned */
3714 	} else if ((add_config->mask & BIT_ULL(VDPA_ATTR_DEV_FEATURES)) == 0 ||
3715 		   device_features & BIT_ULL(VIRTIO_NET_F_MAC)) {
3716 		err = mlx5_query_nic_vport_mac_address(mdev, 0, 0, config->mac);
3717 		if (err)
3718 			goto err_alloc;
3719 	}
3720 
3721 	if (!is_zero_ether_addr(config->mac)) {
3722 		pfmdev = pci_get_drvdata(pci_physfn(mdev->pdev));
3723 		err = mlx5_mpfs_add_mac(pfmdev, config->mac);
3724 		if (err)
3725 			goto err_alloc;
3726 	} else if ((add_config->mask & BIT_ULL(VDPA_ATTR_DEV_FEATURES)) == 0) {
3727 		/*
3728 		 * We used to clear the _F_MAC feature bit when a zero
3729 		 * mac address was seen and device features were not
3730 		 * explicitly provisioned. Keep that behaviour so old
3731 		 * scripts do not break.
3732 		 */
3733 		device_features &= ~BIT_ULL(VIRTIO_NET_F_MAC);
3734 	} else if (device_features & BIT_ULL(VIRTIO_NET_F_MAC)) {
3735 		/* Don't provision zero mac address for _F_MAC */
3736 		mlx5_vdpa_warn(&ndev->mvdev,
3737 			       "No mac address provisioned?\n");
3738 		err = -EINVAL;
3739 		goto err_alloc;
3740 	}
3741 
3742 	if (device_features & BIT_ULL(VIRTIO_NET_F_MQ)) {
3743 		config->max_virtqueue_pairs = cpu_to_mlx5vdpa16(mvdev, max_vqs / 2);
3744 		ndev->rqt_size = max_vqs / 2;
3745 	} else {
3746 		ndev->rqt_size = 1;
3747 	}
3748 
3749 	ndev->mvdev.mlx_features = device_features;
3750 	mvdev->vdev.dma_dev = &mdev->pdev->dev;
3751 	err = mlx5_vdpa_alloc_resources(&ndev->mvdev);
3752 	if (err)
3753 		goto err_mpfs;
3754 
3755 	INIT_LIST_HEAD(&mvdev->mr_list_head);
3756 
3757 	if (MLX5_CAP_GEN(mvdev->mdev, umem_uid_0)) {
3758 		err = mlx5_vdpa_create_dma_mr(mvdev);
3759 		if (err)
3760 			goto err_res;
3761 	}
3762 
3763 	err = alloc_fixed_resources(ndev);
3764 	if (err)
3765 		goto err_mr;
3766 
3767 	ndev->cvq_ent.mvdev = mvdev;
3768 	INIT_WORK(&ndev->cvq_ent.work, mlx5_cvq_kick_handler);
3769 	mvdev->wq = create_singlethread_workqueue("mlx5_vdpa_wq");
3770 	if (!mvdev->wq) {
3771 		err = -ENOMEM;
3772 		goto err_res2;
3773 	}
3774 
3775 	mvdev->vdev.mdev = &mgtdev->mgtdev;
3776 	err = _vdpa_register_device(&mvdev->vdev, max_vqs + 1);
3777 	if (err)
3778 		goto err_reg;
3779 
3780 	mgtdev->ndev = ndev;
3781 	return 0;
3782 
3783 err_reg:
3784 	destroy_workqueue(mvdev->wq);
3785 err_res2:
3786 	free_fixed_resources(ndev);
3787 err_mr:
3788 	mlx5_vdpa_destroy_mr_resources(mvdev);
3789 err_res:
3790 	mlx5_vdpa_free_resources(&ndev->mvdev);
3791 err_mpfs:
3792 	if (!is_zero_ether_addr(config->mac))
3793 		mlx5_mpfs_del_mac(pfmdev, config->mac);
3794 err_alloc:
3795 	put_device(&mvdev->vdev.dev);
3796 	return err;
3797 }
3798 
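/* Management .dev_del callback: unregister the link notifier and the vdpa
 * device, then destroy the control-VQ workqueue. The remaining resources are
 * released by mlx5_vdpa_free() when the device is released.
 */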
3799 static void mlx5_vdpa_dev_del(struct vdpa_mgmt_dev *v_mdev, struct vdpa_device *dev)
3800 {
3801 	struct mlx5_vdpa_mgmtdev *mgtdev = container_of(v_mdev, struct mlx5_vdpa_mgmtdev, mgtdev);
3802 	struct mlx5_vdpa_dev *mvdev = to_mvdev(dev);
3803 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
3804 	struct workqueue_struct *wq;
3805 
3806 	unregister_link_notifier(ndev);
3807 	_vdpa_unregister_device(dev);
3808 	wq = mvdev->wq;
3809 	mvdev->wq = NULL;
3810 	destroy_workqueue(wq);
3811 	mgtdev->ndev = NULL;
3812 }
3813 
3814 static const struct vdpa_mgmtdev_ops mdev_ops = {
3815 	.dev_add = mlx5_vdpa_dev_add,
3816 	.dev_del = mlx5_vdpa_dev_del,
3817 };
3818 
3819 static struct virtio_device_id id_table[] = {
3820 	{ VIRTIO_ID_NET, VIRTIO_DEV_ANY_ID },
3821 	{ 0 },
3822 };
3823 
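/* Auxiliary bus probe: set up and register the vdpa management device.
 * Optional vdpa ops (get_vq_desc_group, resume) are cleared from the
 * per-mgmtdev ops copy when firmware does not advertise the corresponding
 * capability.
 */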
3824 static int mlx5v_probe(struct auxiliary_device *adev,
3825 		       const struct auxiliary_device_id *id)
3827 {
3828 	struct mlx5_adev *madev = container_of(adev, struct mlx5_adev, adev);
3829 	struct mlx5_core_dev *mdev = madev->mdev;
3830 	struct mlx5_vdpa_mgmtdev *mgtdev;
3831 	int err;
3832 
3833 	mgtdev = kzalloc(sizeof(*mgtdev), GFP_KERNEL);
3834 	if (!mgtdev)
3835 		return -ENOMEM;
3836 
3837 	mgtdev->mgtdev.ops = &mdev_ops;
3838 	mgtdev->mgtdev.device = mdev->device;
3839 	mgtdev->mgtdev.id_table = id_table;
3840 	mgtdev->mgtdev.config_attr_mask = BIT_ULL(VDPA_ATTR_DEV_NET_CFG_MACADDR) |
3841 					  BIT_ULL(VDPA_ATTR_DEV_NET_CFG_MAX_VQP) |
3842 					  BIT_ULL(VDPA_ATTR_DEV_NET_CFG_MTU) |
3843 					  BIT_ULL(VDPA_ATTR_DEV_FEATURES);
3844 	mgtdev->mgtdev.max_supported_vqs =
3845 		MLX5_CAP_DEV_VDPA_EMULATION(mdev, max_num_virtio_queues) + 1;
3846 	mgtdev->mgtdev.supported_features = get_supported_features(mdev);
3847 	mgtdev->madev = madev;
3848 	mgtdev->vdpa_ops = mlx5_vdpa_ops;
3849 
3850 	if (!MLX5_CAP_DEV_VDPA_EMULATION(mdev, desc_group_mkey_supported))
3851 		mgtdev->vdpa_ops.get_vq_desc_group = NULL;
3852 
3853 	if (!MLX5_CAP_DEV_VDPA_EMULATION(mdev, freeze_to_rdy_supported))
3854 		mgtdev->vdpa_ops.resume = NULL;
3855 
3856 	err = vdpa_mgmtdev_register(&mgtdev->mgtdev);
3857 	if (err)
3858 		goto reg_err;
3859 
3860 	auxiliary_set_drvdata(adev, mgtdev);
3861 
3862 	return 0;
3863 
3864 reg_err:
3865 	kfree(mgtdev);
3866 	return err;
3867 }
3868 
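/* Auxiliary bus remove: unregister the management device and free it. */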
3869 static void mlx5v_remove(struct auxiliary_device *adev)
3870 {
3871 	struct mlx5_vdpa_mgmtdev *mgtdev;
3872 
3873 	mgtdev = auxiliary_get_drvdata(adev);
3874 	vdpa_mgmtdev_unregister(&mgtdev->mgtdev);
3875 	kfree(mgtdev);
3876 }
3877 
3878 static const struct auxiliary_device_id mlx5v_id_table[] = {
3879 	{ .name = MLX5_ADEV_NAME ".vnet", },
3880 	{},
3881 };
3882 
3883 MODULE_DEVICE_TABLE(auxiliary, mlx5v_id_table);
3884 
3885 static struct auxiliary_driver mlx5v_driver = {
3886 	.name = "vnet",
3887 	.probe = mlx5v_probe,
3888 	.remove = mlx5v_remove,
3889 	.id_table = mlx5v_id_table,
3890 };
3891 
3892 module_auxiliary_driver(mlx5v_driver);
3893