xref: /linux/drivers/vdpa/mlx5/net/mlx5_vnet.c (revision 3663e2c4bc45fcdc71931fcbfcbfbf9b71f55c83)
1 // SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
2 /* Copyright (c) 2020 Mellanox Technologies Ltd. */
3 
4 #include <linux/module.h>
5 #include <linux/vdpa.h>
6 #include <linux/vringh.h>
7 #include <uapi/linux/virtio_net.h>
8 #include <uapi/linux/virtio_ids.h>
9 #include <uapi/linux/vdpa.h>
10 #include <uapi/linux/vhost_types.h>
11 #include <linux/virtio_config.h>
12 #include <linux/auxiliary_bus.h>
13 #include <linux/mlx5/cq.h>
14 #include <linux/mlx5/qp.h>
15 #include <linux/mlx5/device.h>
16 #include <linux/mlx5/driver.h>
17 #include <linux/mlx5/vport.h>
18 #include <linux/mlx5/fs.h>
19 #include <linux/mlx5/mlx5_ifc_vdpa.h>
20 #include <linux/mlx5/mpfs.h>
21 #include "mlx5_vdpa.h"
22 #include "mlx5_vnet.h"
23 
24 MODULE_AUTHOR("Eli Cohen <eli@mellanox.com>");
25 MODULE_DESCRIPTION("Mellanox VDPA driver");
26 MODULE_LICENSE("Dual BSD/GPL");
27 
28 #define VALID_FEATURES_MASK                                                                        \
29 	(BIT_ULL(VIRTIO_NET_F_CSUM) | BIT_ULL(VIRTIO_NET_F_GUEST_CSUM) |                                   \
30 	 BIT_ULL(VIRTIO_NET_F_CTRL_GUEST_OFFLOADS) | BIT_ULL(VIRTIO_NET_F_MTU) | BIT_ULL(VIRTIO_NET_F_MAC) |   \
31 	 BIT_ULL(VIRTIO_NET_F_GUEST_TSO4) | BIT_ULL(VIRTIO_NET_F_GUEST_TSO6) |                             \
32 	 BIT_ULL(VIRTIO_NET_F_GUEST_ECN) | BIT_ULL(VIRTIO_NET_F_GUEST_UFO) | BIT_ULL(VIRTIO_NET_F_HOST_TSO4) | \
33 	 BIT_ULL(VIRTIO_NET_F_HOST_TSO6) | BIT_ULL(VIRTIO_NET_F_HOST_ECN) | BIT_ULL(VIRTIO_NET_F_HOST_UFO) |   \
34 	 BIT_ULL(VIRTIO_NET_F_MRG_RXBUF) | BIT_ULL(VIRTIO_NET_F_STATUS) | BIT_ULL(VIRTIO_NET_F_CTRL_VQ) |      \
35 	 BIT_ULL(VIRTIO_NET_F_CTRL_RX) | BIT_ULL(VIRTIO_NET_F_CTRL_VLAN) |                                 \
36 	 BIT_ULL(VIRTIO_NET_F_CTRL_RX_EXTRA) | BIT_ULL(VIRTIO_NET_F_GUEST_ANNOUNCE) |                      \
37 	 BIT_ULL(VIRTIO_NET_F_MQ) | BIT_ULL(VIRTIO_NET_F_CTRL_MAC_ADDR) | BIT_ULL(VIRTIO_NET_F_HASH_REPORT) |  \
38 	 BIT_ULL(VIRTIO_NET_F_RSS) | BIT_ULL(VIRTIO_NET_F_RSC_EXT) | BIT_ULL(VIRTIO_NET_F_STANDBY) |           \
39 	 BIT_ULL(VIRTIO_NET_F_SPEED_DUPLEX) | BIT_ULL(VIRTIO_F_NOTIFY_ON_EMPTY) |                          \
40 	 BIT_ULL(VIRTIO_F_ANY_LAYOUT) | BIT_ULL(VIRTIO_F_VERSION_1) | BIT_ULL(VIRTIO_F_ACCESS_PLATFORM) |      \
41 	 BIT_ULL(VIRTIO_F_RING_PACKED) | BIT_ULL(VIRTIO_F_ORDER_PLATFORM) | BIT_ULL(VIRTIO_F_SR_IOV))
42 
43 #define VALID_STATUS_MASK                                                                          \
44 	(VIRTIO_CONFIG_S_ACKNOWLEDGE | VIRTIO_CONFIG_S_DRIVER | VIRTIO_CONFIG_S_DRIVER_OK |        \
45 	 VIRTIO_CONFIG_S_FEATURES_OK | VIRTIO_CONFIG_S_NEEDS_RESET | VIRTIO_CONFIG_S_FAILED)
46 
47 #define MLX5_FEATURE(_mvdev, _feature) (!!((_mvdev)->actual_features & BIT_ULL(_feature)))
48 
49 #define MLX5V_UNTAGGED 0x1000
50 
51 /* Device must start with 1 queue pair, as per VIRTIO v1.2 spec, section
52  * 5.1.6.5.5 "Device operation in multiqueue mode":
53  *
54  * Multiqueue is disabled by default.
55  * The driver enables multiqueue by sending a command using class
56  * VIRTIO_NET_CTRL_MQ. The command selects the mode of multiqueue
57  * operation, as follows: ...
58  */
59 #define MLX5V_DEFAULT_VQ_COUNT 2
60 
61 #define MLX5V_DEFAULT_VQ_SIZE 256
62 
63 struct mlx5_vdpa_cq_buf {
64 	struct mlx5_frag_buf_ctrl fbc;
65 	struct mlx5_frag_buf frag_buf;
66 	int cqe_size;
67 	int nent;
68 };
69 
70 struct mlx5_vdpa_cq {
71 	struct mlx5_core_cq mcq;
72 	struct mlx5_vdpa_cq_buf buf;
73 	struct mlx5_db db;
74 	int cqe;
75 };
76 
77 struct mlx5_vdpa_umem {
78 	struct mlx5_frag_buf_ctrl fbc;
79 	struct mlx5_frag_buf frag_buf;
80 	int size;
81 	u32 id;
82 };
83 
84 struct mlx5_vdpa_qp {
85 	struct mlx5_core_qp mqp;
86 	struct mlx5_frag_buf frag_buf;
87 	struct mlx5_db db;
88 	u16 head;
89 	bool fw;
90 };
91 
92 struct mlx5_vq_restore_info {
93 	u32 num_ent;
94 	u64 desc_addr;
95 	u64 device_addr;
96 	u64 driver_addr;
97 	u16 avail_index;
98 	u16 used_index;
99 	struct msi_map map;
100 	bool ready;
101 	bool restore;
102 };
103 
104 struct mlx5_vdpa_virtqueue {
105 	bool ready;
106 	u64 desc_addr;
107 	u64 device_addr;
108 	u64 driver_addr;
109 	u32 num_ent;
110 
111 	/* Resources for implementing the notification channel from the device
112 	 * to the driver. fwqp is the firmware end of an RC connection; the
113 	 * other end is vqqp used by the driver. cq is where completions are
114 	 * reported.
115 	 */
116 	struct mlx5_vdpa_cq cq;
117 	struct mlx5_vdpa_qp fwqp;
118 	struct mlx5_vdpa_qp vqqp;
119 
120 	/* umem resources are required for the virtqueue operation. Their use
121 	 * is internal and they must be provided by the driver.
122 	 */
123 	struct mlx5_vdpa_umem umem1;
124 	struct mlx5_vdpa_umem umem2;
125 	struct mlx5_vdpa_umem umem3;
126 
127 	u32 counter_set_id;
128 	bool initialized;
129 	int index;
130 	u32 virtq_id;
131 	struct mlx5_vdpa_net *ndev;
132 	u16 avail_idx;
133 	u16 used_idx;
134 	int fw_state;
135 
136 	u64 modified_fields;
137 
138 	struct mlx5_vdpa_mr *vq_mr;
139 	struct mlx5_vdpa_mr *desc_mr;
140 
141 	struct msi_map map;
142 
143 	/* keep last in the struct */
144 	struct mlx5_vq_restore_info ri;
145 };
146 
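/* Virtqueue index layout: data VQs come in RX/TX pairs (even index = RX,
 * odd index = TX). Without VIRTIO_NET_F_MQ only indexes 0 and 1 are valid,
 * plus index 2 for the control VQ when VIRTIO_NET_F_CTRL_VQ is negotiated.
 * With MQ the control VQ uses index max_vqs (see ctrl_vq_idx()).
 */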
147 static bool is_index_valid(struct mlx5_vdpa_dev *mvdev, u16 idx)
148 {
149 	if (!(mvdev->actual_features & BIT_ULL(VIRTIO_NET_F_MQ))) {
150 		if (!(mvdev->actual_features & BIT_ULL(VIRTIO_NET_F_CTRL_VQ)))
151 			return idx < 2;
152 		else
153 			return idx < 3;
154 	}
155 
156 	return idx <= mvdev->max_idx;
157 }
158 
159 static void free_fixed_resources(struct mlx5_vdpa_net *ndev);
160 static void mvqs_set_defaults(struct mlx5_vdpa_net *ndev);
161 static int setup_vq_resources(struct mlx5_vdpa_net *ndev, bool filled);
162 static void teardown_vq_resources(struct mlx5_vdpa_net *ndev);
163 static int resume_vq(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq);
164 
165 static bool mlx5_vdpa_debug;
166 
167 #define MLX5_LOG_VIO_FLAG(_feature)                                                                \
168 	do {                                                                                       \
169 		if (features & BIT_ULL(_feature))                                                  \
170 			mlx5_vdpa_info(mvdev, "%s\n", #_feature);                                  \
171 	} while (0)
172 
173 #define MLX5_LOG_VIO_STAT(_status)                                                                 \
174 	do {                                                                                       \
175 		if (status & (_status))                                                            \
176 			mlx5_vdpa_info(mvdev, "%s\n", #_status);                                   \
177 	} while (0)
178 
179 /* TODO: cross-endian support */
180 static inline bool mlx5_vdpa_is_little_endian(struct mlx5_vdpa_dev *mvdev)
181 {
182 	return virtio_legacy_is_little_endian() ||
183 		(mvdev->actual_features & BIT_ULL(VIRTIO_F_VERSION_1));
184 }
185 
186 static u16 mlx5vdpa16_to_cpu(struct mlx5_vdpa_dev *mvdev, __virtio16 val)
187 {
188 	return __virtio16_to_cpu(mlx5_vdpa_is_little_endian(mvdev), val);
189 }
190 
191 static __virtio16 cpu_to_mlx5vdpa16(struct mlx5_vdpa_dev *mvdev, u16 val)
192 {
193 	return __cpu_to_virtio16(mlx5_vdpa_is_little_endian(mvdev), val);
194 }
195 
196 static u16 ctrl_vq_idx(struct mlx5_vdpa_dev *mvdev)
197 {
198 	if (!(mvdev->actual_features & BIT_ULL(VIRTIO_NET_F_MQ)))
199 		return 2;
200 
201 	return mvdev->max_vqs;
202 }
203 
204 static bool is_ctrl_vq_idx(struct mlx5_vdpa_dev *mvdev, u16 idx)
205 {
206 	return idx == ctrl_vq_idx(mvdev);
207 }
208 
209 static void print_status(struct mlx5_vdpa_dev *mvdev, u8 status, bool set)
210 {
211 	if (status & ~VALID_STATUS_MASK)
212 		mlx5_vdpa_warn(mvdev, "Warning: there are invalid status bits 0x%x\n",
213 			       status & ~VALID_STATUS_MASK);
214 
215 	if (!mlx5_vdpa_debug)
216 		return;
217 
218 	mlx5_vdpa_info(mvdev, "driver status %s", set ? "set" : "get");
219 	if (set && !status) {
220 		mlx5_vdpa_info(mvdev, "driver resets the device\n");
221 		return;
222 	}
223 
224 	MLX5_LOG_VIO_STAT(VIRTIO_CONFIG_S_ACKNOWLEDGE);
225 	MLX5_LOG_VIO_STAT(VIRTIO_CONFIG_S_DRIVER);
226 	MLX5_LOG_VIO_STAT(VIRTIO_CONFIG_S_DRIVER_OK);
227 	MLX5_LOG_VIO_STAT(VIRTIO_CONFIG_S_FEATURES_OK);
228 	MLX5_LOG_VIO_STAT(VIRTIO_CONFIG_S_NEEDS_RESET);
229 	MLX5_LOG_VIO_STAT(VIRTIO_CONFIG_S_FAILED);
230 }
231 
232 static void print_features(struct mlx5_vdpa_dev *mvdev, u64 features, bool set)
233 {
234 	if (features & ~VALID_FEATURES_MASK)
235 		mlx5_vdpa_warn(mvdev, "There are invalid feature bits 0x%llx\n",
236 			       features & ~VALID_FEATURES_MASK);
237 
238 	if (!mlx5_vdpa_debug)
239 		return;
240 
241 	mlx5_vdpa_info(mvdev, "driver %s feature bits:\n", set ? "sets" : "reads");
242 	if (!features)
243 		mlx5_vdpa_info(mvdev, "all feature bits are cleared\n");
244 
245 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_CSUM);
246 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_GUEST_CSUM);
247 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_CTRL_GUEST_OFFLOADS);
248 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_MTU);
249 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_MAC);
250 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_GUEST_TSO4);
251 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_GUEST_TSO6);
252 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_GUEST_ECN);
253 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_GUEST_UFO);
254 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_HOST_TSO4);
255 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_HOST_TSO6);
256 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_HOST_ECN);
257 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_HOST_UFO);
258 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_MRG_RXBUF);
259 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_STATUS);
260 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_CTRL_VQ);
261 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_CTRL_RX);
262 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_CTRL_VLAN);
263 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_CTRL_RX_EXTRA);
264 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_GUEST_ANNOUNCE);
265 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_MQ);
266 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_CTRL_MAC_ADDR);
267 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_HASH_REPORT);
268 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_RSS);
269 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_RSC_EXT);
270 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_STANDBY);
271 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_SPEED_DUPLEX);
272 	MLX5_LOG_VIO_FLAG(VIRTIO_F_NOTIFY_ON_EMPTY);
273 	MLX5_LOG_VIO_FLAG(VIRTIO_F_ANY_LAYOUT);
274 	MLX5_LOG_VIO_FLAG(VIRTIO_F_VERSION_1);
275 	MLX5_LOG_VIO_FLAG(VIRTIO_F_ACCESS_PLATFORM);
276 	MLX5_LOG_VIO_FLAG(VIRTIO_F_RING_PACKED);
277 	MLX5_LOG_VIO_FLAG(VIRTIO_F_ORDER_PLATFORM);
278 	MLX5_LOG_VIO_FLAG(VIRTIO_F_SR_IOV);
279 }
280 
281 static int create_tis(struct mlx5_vdpa_net *ndev)
282 {
283 	struct mlx5_vdpa_dev *mvdev = &ndev->mvdev;
284 	u32 in[MLX5_ST_SZ_DW(create_tis_in)] = {};
285 	void *tisc;
286 	int err;
287 
288 	tisc = MLX5_ADDR_OF(create_tis_in, in, ctx);
289 	MLX5_SET(tisc, tisc, transport_domain, ndev->res.tdn);
290 	err = mlx5_vdpa_create_tis(mvdev, in, &ndev->res.tisn);
291 	if (err)
292 		mlx5_vdpa_warn(mvdev, "create TIS (%d)\n", err);
293 
294 	return err;
295 }
296 
297 static void destroy_tis(struct mlx5_vdpa_net *ndev)
298 {
299 	mlx5_vdpa_destroy_tis(&ndev->mvdev, ndev->res.tisn);
300 }
301 
302 #define MLX5_VDPA_CQE_SIZE 64
303 #define MLX5_VDPA_LOG_CQE_SIZE ilog2(MLX5_VDPA_CQE_SIZE)
304 
305 static int cq_frag_buf_alloc(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_cq_buf *buf, int nent)
306 {
307 	struct mlx5_frag_buf *frag_buf = &buf->frag_buf;
308 	u8 log_wq_stride = MLX5_VDPA_LOG_CQE_SIZE;
309 	u8 log_wq_sz = MLX5_VDPA_LOG_CQE_SIZE;
310 	int err;
311 
312 	err = mlx5_frag_buf_alloc_node(ndev->mvdev.mdev, nent * MLX5_VDPA_CQE_SIZE, frag_buf,
313 				       ndev->mvdev.mdev->priv.numa_node);
314 	if (err)
315 		return err;
316 
317 	mlx5_init_fbc(frag_buf->frags, log_wq_stride, log_wq_sz, &buf->fbc);
318 
319 	buf->cqe_size = MLX5_VDPA_CQE_SIZE;
320 	buf->nent = nent;
321 
322 	return 0;
323 }
324 
325 static int umem_frag_buf_alloc(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_umem *umem, int size)
326 {
327 	struct mlx5_frag_buf *frag_buf = &umem->frag_buf;
328 
329 	return mlx5_frag_buf_alloc_node(ndev->mvdev.mdev, size, frag_buf,
330 					ndev->mvdev.mdev->priv.numa_node);
331 }
332 
333 static void cq_frag_buf_free(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_cq_buf *buf)
334 {
335 	mlx5_frag_buf_free(ndev->mvdev.mdev, &buf->frag_buf);
336 }
337 
338 static void *get_cqe(struct mlx5_vdpa_cq *vcq, int n)
339 {
340 	return mlx5_frag_buf_get_wqe(&vcq->buf.fbc, n);
341 }
342 
343 static void cq_frag_buf_init(struct mlx5_vdpa_cq *vcq, struct mlx5_vdpa_cq_buf *buf)
344 {
345 	struct mlx5_cqe64 *cqe64;
346 	void *cqe;
347 	int i;
348 
349 	for (i = 0; i < buf->nent; i++) {
350 		cqe = get_cqe(vcq, i);
351 		cqe64 = cqe;
352 		cqe64->op_own = MLX5_CQE_INVALID << 4;
353 	}
354 }
355 
356 static void *get_sw_cqe(struct mlx5_vdpa_cq *cq, int n)
357 {
358 	struct mlx5_cqe64 *cqe64 = get_cqe(cq, n & (cq->cqe - 1));
359 
360 	if (likely(get_cqe_opcode(cqe64) != MLX5_CQE_INVALID) &&
361 	    !((cqe64->op_own & MLX5_CQE_OWNER_MASK) ^ !!(n & cq->cqe)))
362 		return cqe64;
363 
364 	return NULL;
365 }
366 
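/* Advance the receive queue head by @n and publish the new head through the
 * doorbell record so the firmware end of the notification channel can keep
 * generating completions.
 */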
367 static void rx_post(struct mlx5_vdpa_qp *vqp, int n)
368 {
369 	vqp->head += n;
370 	vqp->db.db[0] = cpu_to_be32(vqp->head);
371 }
372 
373 static void qp_prepare(struct mlx5_vdpa_net *ndev, bool fw, void *in,
374 		       struct mlx5_vdpa_virtqueue *mvq, u32 num_ent)
375 {
376 	struct mlx5_vdpa_qp *vqp;
377 	__be64 *pas;
378 	void *qpc;
379 
380 	vqp = fw ? &mvq->fwqp : &mvq->vqqp;
381 	MLX5_SET(create_qp_in, in, uid, ndev->mvdev.res.uid);
382 	qpc = MLX5_ADDR_OF(create_qp_in, in, qpc);
383 	if (vqp->fw) {
384 		/* The firmware QP is allocated by the driver for the firmware's use,
385 		 * so we can skip some of the params as they will be chosen by the firmware.
386 		 */
387 		qpc = MLX5_ADDR_OF(create_qp_in, in, qpc);
388 		MLX5_SET(qpc, qpc, rq_type, MLX5_ZERO_LEN_RQ);
389 		MLX5_SET(qpc, qpc, no_sq, 1);
390 		return;
391 	}
392 
393 	MLX5_SET(qpc, qpc, st, MLX5_QP_ST_RC);
394 	MLX5_SET(qpc, qpc, pm_state, MLX5_QP_PM_MIGRATED);
395 	MLX5_SET(qpc, qpc, pd, ndev->mvdev.res.pdn);
396 	MLX5_SET(qpc, qpc, mtu, MLX5_QPC_MTU_256_BYTES);
397 	MLX5_SET(qpc, qpc, uar_page, ndev->mvdev.res.uar->index);
398 	MLX5_SET(qpc, qpc, log_page_size, vqp->frag_buf.page_shift - MLX5_ADAPTER_PAGE_SHIFT);
399 	MLX5_SET(qpc, qpc, no_sq, 1);
400 	MLX5_SET(qpc, qpc, cqn_rcv, mvq->cq.mcq.cqn);
401 	MLX5_SET(qpc, qpc, log_rq_size, ilog2(num_ent));
402 	MLX5_SET(qpc, qpc, rq_type, MLX5_NON_ZERO_RQ);
403 	pas = (__be64 *)MLX5_ADDR_OF(create_qp_in, in, pas);
404 	mlx5_fill_page_frag_array(&vqp->frag_buf, pas);
405 }
406 
407 static int rq_buf_alloc(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_qp *vqp, u32 num_ent)
408 {
409 	return mlx5_frag_buf_alloc_node(ndev->mvdev.mdev,
410 					num_ent * sizeof(struct mlx5_wqe_data_seg), &vqp->frag_buf,
411 					ndev->mvdev.mdev->priv.numa_node);
412 }
413 
414 static void rq_buf_free(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_qp *vqp)
415 {
416 	mlx5_frag_buf_free(ndev->mvdev.mdev, &vqp->frag_buf);
417 }
418 
419 static int qp_create(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq,
420 		     struct mlx5_vdpa_qp *vqp)
421 {
422 	struct mlx5_core_dev *mdev = ndev->mvdev.mdev;
423 	int inlen = MLX5_ST_SZ_BYTES(create_qp_in);
424 	u32 out[MLX5_ST_SZ_DW(create_qp_out)] = {};
425 	void *qpc;
426 	void *in;
427 	int err;
428 
429 	if (!vqp->fw) {
430 		vqp = &mvq->vqqp;
431 		err = rq_buf_alloc(ndev, vqp, mvq->num_ent);
432 		if (err)
433 			return err;
434 
435 		err = mlx5_db_alloc(ndev->mvdev.mdev, &vqp->db);
436 		if (err)
437 			goto err_db;
438 		inlen += vqp->frag_buf.npages * sizeof(__be64);
439 	}
440 
441 	in = kzalloc(inlen, GFP_KERNEL);
442 	if (!in) {
443 		err = -ENOMEM;
444 		goto err_kzalloc;
445 	}
446 
447 	qp_prepare(ndev, vqp->fw, in, mvq, mvq->num_ent);
448 	qpc = MLX5_ADDR_OF(create_qp_in, in, qpc);
449 	MLX5_SET(qpc, qpc, st, MLX5_QP_ST_RC);
450 	MLX5_SET(qpc, qpc, pm_state, MLX5_QP_PM_MIGRATED);
451 	MLX5_SET(qpc, qpc, pd, ndev->mvdev.res.pdn);
452 	MLX5_SET(qpc, qpc, mtu, MLX5_QPC_MTU_256_BYTES);
453 	if (!vqp->fw)
454 		MLX5_SET64(qpc, qpc, dbr_addr, vqp->db.dma);
455 	MLX5_SET(create_qp_in, in, opcode, MLX5_CMD_OP_CREATE_QP);
456 	err = mlx5_cmd_exec(mdev, in, inlen, out, sizeof(out));
457 	kfree(in);
458 	if (err)
459 		goto err_kzalloc;
460 
461 	vqp->mqp.uid = ndev->mvdev.res.uid;
462 	vqp->mqp.qpn = MLX5_GET(create_qp_out, out, qpn);
463 
464 	if (!vqp->fw)
465 		rx_post(vqp, mvq->num_ent);
466 
467 	return 0;
468 
469 err_kzalloc:
470 	if (!vqp->fw)
471 		mlx5_db_free(ndev->mvdev.mdev, &vqp->db);
472 err_db:
473 	if (!vqp->fw)
474 		rq_buf_free(ndev, vqp);
475 
476 	return err;
477 }
478 
479 static void qp_destroy(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_qp *vqp)
480 {
481 	u32 in[MLX5_ST_SZ_DW(destroy_qp_in)] = {};
482 
483 	MLX5_SET(destroy_qp_in, in, opcode, MLX5_CMD_OP_DESTROY_QP);
484 	MLX5_SET(destroy_qp_in, in, qpn, vqp->mqp.qpn);
485 	MLX5_SET(destroy_qp_in, in, uid, ndev->mvdev.res.uid);
486 	if (mlx5_cmd_exec_in(ndev->mvdev.mdev, destroy_qp, in))
487 		mlx5_vdpa_warn(&ndev->mvdev, "destroy qp 0x%x\n", vqp->mqp.qpn);
488 	if (!vqp->fw) {
489 		mlx5_db_free(ndev->mvdev.mdev, &vqp->db);
490 		rq_buf_free(ndev, vqp);
491 	}
492 }
493 
494 static void *next_cqe_sw(struct mlx5_vdpa_cq *cq)
495 {
496 	return get_sw_cqe(cq, cq->mcq.cons_index);
497 }
498 
499 static int mlx5_vdpa_poll_one(struct mlx5_vdpa_cq *vcq)
500 {
501 	struct mlx5_cqe64 *cqe64;
502 
503 	cqe64 = next_cqe_sw(vcq);
504 	if (!cqe64)
505 		return -EAGAIN;
506 
507 	vcq->mcq.cons_index++;
508 	return 0;
509 }
510 
511 static void mlx5_vdpa_handle_completions(struct mlx5_vdpa_virtqueue *mvq, int num)
512 {
513 	struct mlx5_vdpa_net *ndev = mvq->ndev;
514 	struct vdpa_callback *event_cb;
515 
516 	event_cb = &ndev->event_cbs[mvq->index];
517 	mlx5_cq_set_ci(&mvq->cq.mcq);
518 
519 	/* Make sure the CQ consumer update is visible to the hardware before
520 	 * updating the RX doorbell record.
521 	 */
522 	dma_wmb();
523 	rx_post(&mvq->vqqp, num);
524 	if (event_cb->callback)
525 		event_cb->callback(event_cb->private);
526 }
527 
528 static void mlx5_vdpa_cq_comp(struct mlx5_core_cq *mcq, struct mlx5_eqe *eqe)
529 {
530 	struct mlx5_vdpa_virtqueue *mvq = container_of(mcq, struct mlx5_vdpa_virtqueue, cq.mcq);
531 	struct mlx5_vdpa_net *ndev = mvq->ndev;
532 	void __iomem *uar_page = ndev->mvdev.res.uar->map;
533 	int num = 0;
534 
535 	while (!mlx5_vdpa_poll_one(&mvq->cq)) {
536 		num++;
537 		if (num > mvq->num_ent / 2) {
538 			/* If completions keep coming while we poll, we want to
539 			 * let the hardware know that we consumed them by
540 			 * updating the doorbell record.  We also let vdpa core
541 			 * know about this so it passes it on to the virtio driver
542 			 * in the guest.
543 			 */
544 			mlx5_vdpa_handle_completions(mvq, num);
545 			num = 0;
546 		}
547 	}
548 
549 	if (num)
550 		mlx5_vdpa_handle_completions(mvq, num);
551 
552 	mlx5_cq_arm(&mvq->cq.mcq, MLX5_CQ_DB_REQ_NOT, uar_page, mvq->cq.mcq.cons_index);
553 }
554 
555 static int cq_create(struct mlx5_vdpa_net *ndev, u16 idx, u32 num_ent)
556 {
557 	struct mlx5_vdpa_virtqueue *mvq = &ndev->vqs[idx];
558 	struct mlx5_core_dev *mdev = ndev->mvdev.mdev;
559 	void __iomem *uar_page = ndev->mvdev.res.uar->map;
560 	u32 out[MLX5_ST_SZ_DW(create_cq_out)];
561 	struct mlx5_vdpa_cq *vcq = &mvq->cq;
562 	__be64 *pas;
563 	int inlen;
564 	void *cqc;
565 	void *in;
566 	int err;
567 	int eqn;
568 
569 	err = mlx5_db_alloc(mdev, &vcq->db);
570 	if (err)
571 		return err;
572 
573 	vcq->mcq.set_ci_db = vcq->db.db;
574 	vcq->mcq.arm_db = vcq->db.db + 1;
575 	vcq->mcq.cqe_sz = 64;
576 
577 	err = cq_frag_buf_alloc(ndev, &vcq->buf, num_ent);
578 	if (err)
579 		goto err_db;
580 
581 	cq_frag_buf_init(vcq, &vcq->buf);
582 
583 	inlen = MLX5_ST_SZ_BYTES(create_cq_in) +
584 		MLX5_FLD_SZ_BYTES(create_cq_in, pas[0]) * vcq->buf.frag_buf.npages;
585 	in = kzalloc(inlen, GFP_KERNEL);
586 	if (!in) {
587 		err = -ENOMEM;
588 		goto err_vzalloc;
589 	}
590 
591 	MLX5_SET(create_cq_in, in, uid, ndev->mvdev.res.uid);
592 	pas = (__be64 *)MLX5_ADDR_OF(create_cq_in, in, pas);
593 	mlx5_fill_page_frag_array(&vcq->buf.frag_buf, pas);
594 
595 	cqc = MLX5_ADDR_OF(create_cq_in, in, cq_context);
596 	MLX5_SET(cqc, cqc, log_page_size, vcq->buf.frag_buf.page_shift - MLX5_ADAPTER_PAGE_SHIFT);
597 
598 	/* Use vector 0 by default. Consider adding code to choose the least
599 	 * used vector.
600 	 */
601 	err = mlx5_comp_eqn_get(mdev, 0, &eqn);
602 	if (err)
603 		goto err_vec;
604 
605 	cqc = MLX5_ADDR_OF(create_cq_in, in, cq_context);
606 	MLX5_SET(cqc, cqc, log_cq_size, ilog2(num_ent));
607 	MLX5_SET(cqc, cqc, uar_page, ndev->mvdev.res.uar->index);
608 	MLX5_SET(cqc, cqc, c_eqn_or_apu_element, eqn);
609 	MLX5_SET64(cqc, cqc, dbr_addr, vcq->db.dma);
610 
611 	err = mlx5_core_create_cq(mdev, &vcq->mcq, in, inlen, out, sizeof(out));
612 	if (err)
613 		goto err_vec;
614 
615 	vcq->mcq.comp = mlx5_vdpa_cq_comp;
616 	vcq->cqe = num_ent;
617 	vcq->mcq.set_ci_db = vcq->db.db;
618 	vcq->mcq.arm_db = vcq->db.db + 1;
619 	mlx5_cq_arm(&mvq->cq.mcq, MLX5_CQ_DB_REQ_NOT, uar_page, mvq->cq.mcq.cons_index);
620 	kfree(in);
621 	return 0;
622 
623 err_vec:
624 	kfree(in);
625 err_vzalloc:
626 	cq_frag_buf_free(ndev, &vcq->buf);
627 err_db:
628 	mlx5_db_free(ndev->mvdev.mdev, &vcq->db);
629 	return err;
630 }
631 
632 static void cq_destroy(struct mlx5_vdpa_net *ndev, u16 idx)
633 {
634 	struct mlx5_vdpa_virtqueue *mvq = &ndev->vqs[idx];
635 	struct mlx5_core_dev *mdev = ndev->mvdev.mdev;
636 	struct mlx5_vdpa_cq *vcq = &mvq->cq;
637 
638 	if (mlx5_core_destroy_cq(mdev, &vcq->mcq)) {
639 		mlx5_vdpa_warn(&ndev->mvdev, "destroy CQ 0x%x\n", vcq->mcq.cqn);
640 		return;
641 	}
642 	cq_frag_buf_free(ndev, &vcq->buf);
643 	mlx5_db_free(ndev->mvdev.mdev, &vcq->db);
644 }
645 
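/* Query and cache the umem buffer size parameters advertised by the device.
 * Each umem size is computed later as param_a * queue_size + param_b, see
 * set_umem_size().
 */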
646 static int read_umem_params(struct mlx5_vdpa_net *ndev)
647 {
648 	u32 in[MLX5_ST_SZ_DW(query_hca_cap_in)] = {};
649 	u16 opmod = (MLX5_CAP_VDPA_EMULATION << 1) | (HCA_CAP_OPMOD_GET_CUR & 0x01);
650 	struct mlx5_core_dev *mdev = ndev->mvdev.mdev;
651 	int out_size;
652 	void *caps;
653 	void *out;
654 	int err;
655 
656 	out_size = MLX5_ST_SZ_BYTES(query_hca_cap_out);
657 	out = kzalloc(out_size, GFP_KERNEL);
658 	if (!out)
659 		return -ENOMEM;
660 
661 	MLX5_SET(query_hca_cap_in, in, opcode, MLX5_CMD_OP_QUERY_HCA_CAP);
662 	MLX5_SET(query_hca_cap_in, in, op_mod, opmod);
663 	err = mlx5_cmd_exec_inout(mdev, query_hca_cap, in, out);
664 	if (err) {
665 		mlx5_vdpa_warn(&ndev->mvdev,
666 			"Failed reading vdpa umem capabilities with err %d\n", err);
667 		goto out;
668 	}
669 
670 	caps =  MLX5_ADDR_OF(query_hca_cap_out, out, capability);
671 
672 	ndev->umem_1_buffer_param_a = MLX5_GET(virtio_emulation_cap, caps, umem_1_buffer_param_a);
673 	ndev->umem_1_buffer_param_b = MLX5_GET(virtio_emulation_cap, caps, umem_1_buffer_param_b);
674 
675 	ndev->umem_2_buffer_param_a = MLX5_GET(virtio_emulation_cap, caps, umem_2_buffer_param_a);
676 	ndev->umem_2_buffer_param_b = MLX5_GET(virtio_emulation_cap, caps, umem_2_buffer_param_b);
677 
678 	ndev->umem_3_buffer_param_a = MLX5_GET(virtio_emulation_cap, caps, umem_3_buffer_param_a);
679 	ndev->umem_3_buffer_param_b = MLX5_GET(virtio_emulation_cap, caps, umem_3_buffer_param_b);
680 
681 out:
682 	kfree(out);
683 	return 0;
684 }
685 
686 static void set_umem_size(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq, int num,
687 			  struct mlx5_vdpa_umem **umemp)
688 {
689 	u32 p_a;
690 	u32 p_b;
691 
692 	switch (num) {
693 	case 1:
694 		p_a = ndev->umem_1_buffer_param_a;
695 		p_b = ndev->umem_1_buffer_param_b;
696 		*umemp = &mvq->umem1;
697 		break;
698 	case 2:
699 		p_a = ndev->umem_2_buffer_param_a;
700 		p_b = ndev->umem_2_buffer_param_b;
701 		*umemp = &mvq->umem2;
702 		break;
703 	case 3:
704 		p_a = ndev->umem_3_buffer_param_a;
705 		p_b = ndev->umem_3_buffer_param_b;
706 		*umemp = &mvq->umem3;
707 		break;
708 	}
709 
710 	(*umemp)->size = p_a * mvq->num_ent + p_b;
711 }
712 
713 static void umem_frag_buf_free(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_umem *umem)
714 {
715 	mlx5_frag_buf_free(ndev->mvdev.mdev, &umem->frag_buf);
716 }
717 
718 static int create_umem(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq, int num)
719 {
720 	int inlen;
721 	u32 out[MLX5_ST_SZ_DW(create_umem_out)] = {};
722 	void *um;
723 	void *in;
724 	int err;
725 	__be64 *pas;
726 	struct mlx5_vdpa_umem *umem;
727 
728 	set_umem_size(ndev, mvq, num, &umem);
729 	err = umem_frag_buf_alloc(ndev, umem, umem->size);
730 	if (err)
731 		return err;
732 
733 	inlen = MLX5_ST_SZ_BYTES(create_umem_in) + MLX5_ST_SZ_BYTES(mtt) * umem->frag_buf.npages;
734 
735 	in = kzalloc(inlen, GFP_KERNEL);
736 	if (!in) {
737 		err = -ENOMEM;
738 		goto err_in;
739 	}
740 
741 	MLX5_SET(create_umem_in, in, opcode, MLX5_CMD_OP_CREATE_UMEM);
742 	MLX5_SET(create_umem_in, in, uid, ndev->mvdev.res.uid);
743 	um = MLX5_ADDR_OF(create_umem_in, in, umem);
744 	MLX5_SET(umem, um, log_page_size, umem->frag_buf.page_shift - MLX5_ADAPTER_PAGE_SHIFT);
745 	MLX5_SET64(umem, um, num_of_mtt, umem->frag_buf.npages);
746 
747 	pas = (__be64 *)MLX5_ADDR_OF(umem, um, mtt[0]);
748 	mlx5_fill_page_frag_array_perm(&umem->frag_buf, pas, MLX5_MTT_PERM_RW);
749 
750 	err = mlx5_cmd_exec(ndev->mvdev.mdev, in, inlen, out, sizeof(out));
751 	if (err) {
752 		mlx5_vdpa_warn(&ndev->mvdev, "create umem(%d)\n", err);
753 		goto err_cmd;
754 	}
755 
756 	kfree(in);
757 	umem->id = MLX5_GET(create_umem_out, out, umem_id);
758 
759 	return 0;
760 
761 err_cmd:
762 	kfree(in);
763 err_in:
764 	umem_frag_buf_free(ndev, umem);
765 	return err;
766 }
767 
768 static void umem_destroy(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq, int num)
769 {
770 	u32 in[MLX5_ST_SZ_DW(destroy_umem_in)] = {};
771 	u32 out[MLX5_ST_SZ_DW(destroy_umem_out)] = {};
772 	struct mlx5_vdpa_umem *umem;
773 
774 	switch (num) {
775 	case 1:
776 		umem = &mvq->umem1;
777 		break;
778 	case 2:
779 		umem = &mvq->umem2;
780 		break;
781 	case 3:
782 		umem = &mvq->umem3;
783 		break;
784 	}
785 
786 	MLX5_SET(destroy_umem_in, in, opcode, MLX5_CMD_OP_DESTROY_UMEM);
787 	MLX5_SET(destroy_umem_in, in, umem_id, umem->id);
788 	if (mlx5_cmd_exec(ndev->mvdev.mdev, in, sizeof(in), out, sizeof(out)))
789 		return;
790 
791 	umem_frag_buf_free(ndev, umem);
792 }
793 
794 static int umems_create(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq)
795 {
796 	int num;
797 	int err;
798 
799 	for (num = 1; num <= 3; num++) {
800 		err = create_umem(ndev, mvq, num);
801 		if (err)
802 			goto err_umem;
803 	}
804 	return 0;
805 
806 err_umem:
807 	for (num--; num > 0; num--)
808 		umem_destroy(ndev, mvq, num);
809 
810 	return err;
811 }
812 
813 static void umems_destroy(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq)
814 {
815 	int num;
816 
817 	for (num = 3; num > 0; num--)
818 		umem_destroy(ndev, mvq, num);
819 }
820 
821 static int get_queue_type(struct mlx5_vdpa_net *ndev)
822 {
823 	u32 type_mask;
824 
825 	type_mask = MLX5_CAP_DEV_VDPA_EMULATION(ndev->mvdev.mdev, virtio_queue_type);
826 
827 	/* prefer split queue */
828 	if (type_mask & MLX5_VIRTIO_EMULATION_CAP_VIRTIO_QUEUE_TYPE_SPLIT)
829 		return MLX5_VIRTIO_EMULATION_VIRTIO_QUEUE_TYPE_SPLIT;
830 
831 	WARN_ON(!(type_mask & MLX5_VIRTIO_EMULATION_CAP_VIRTIO_QUEUE_TYPE_PACKED));
832 
833 	return MLX5_VIRTIO_EMULATION_VIRTIO_QUEUE_TYPE_PACKED;
834 }
835 
836 static bool vq_is_tx(u16 idx)
837 {
838 	return idx % 2;
839 }
840 
841 enum {
842 	MLX5_VIRTIO_NET_F_MRG_RXBUF = 2,
843 	MLX5_VIRTIO_NET_F_HOST_ECN = 4,
844 	MLX5_VIRTIO_NET_F_GUEST_ECN = 6,
845 	MLX5_VIRTIO_NET_F_GUEST_TSO6 = 7,
846 	MLX5_VIRTIO_NET_F_GUEST_TSO4 = 8,
847 	MLX5_VIRTIO_NET_F_GUEST_CSUM = 9,
848 	MLX5_VIRTIO_NET_F_CSUM = 10,
849 	MLX5_VIRTIO_NET_F_HOST_TSO6 = 11,
850 	MLX5_VIRTIO_NET_F_HOST_TSO4 = 12,
851 };
852 
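/* Pack the negotiated virtio-net feature bits into the bit positions expected
 * by the virtio_net_q object's queue_feature_bit_mask fields (see the enum
 * above).
 */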
853 static u16 get_features(u64 features)
854 {
855 	return (!!(features & BIT_ULL(VIRTIO_NET_F_MRG_RXBUF)) << MLX5_VIRTIO_NET_F_MRG_RXBUF) |
856 	       (!!(features & BIT_ULL(VIRTIO_NET_F_HOST_ECN)) << MLX5_VIRTIO_NET_F_HOST_ECN) |
857 	       (!!(features & BIT_ULL(VIRTIO_NET_F_GUEST_ECN)) << MLX5_VIRTIO_NET_F_GUEST_ECN) |
858 	       (!!(features & BIT_ULL(VIRTIO_NET_F_GUEST_TSO6)) << MLX5_VIRTIO_NET_F_GUEST_TSO6) |
859 	       (!!(features & BIT_ULL(VIRTIO_NET_F_GUEST_TSO4)) << MLX5_VIRTIO_NET_F_GUEST_TSO4) |
860 	       (!!(features & BIT_ULL(VIRTIO_NET_F_CSUM)) << MLX5_VIRTIO_NET_F_CSUM) |
861 	       (!!(features & BIT_ULL(VIRTIO_NET_F_HOST_TSO6)) << MLX5_VIRTIO_NET_F_HOST_TSO6) |
862 	       (!!(features & BIT_ULL(VIRTIO_NET_F_HOST_TSO4)) << MLX5_VIRTIO_NET_F_HOST_TSO4);
863 }
864 
865 static bool counters_supported(const struct mlx5_vdpa_dev *mvdev)
866 {
867 	return MLX5_CAP_GEN_64(mvdev->mdev, general_obj_types) &
868 	       BIT_ULL(MLX5_OBJ_TYPE_VIRTIO_Q_COUNTERS);
869 }
870 
871 static bool msix_mode_supported(struct mlx5_vdpa_dev *mvdev)
872 {
873 	return MLX5_CAP_DEV_VDPA_EMULATION(mvdev->mdev, event_mode) &
874 		(1 << MLX5_VIRTIO_Q_EVENT_MODE_MSIX_MODE) &&
875 		pci_msix_can_alloc_dyn(mvdev->mdev->pdev);
876 }
877 
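/* Create the VIRTIO_NET_Q object for this virtqueue. When @filled is true, the
 * ring addresses, indexes and mkeys are programmed at create time; otherwise
 * they are left for a later modify command.
 */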
878 static int create_virtqueue(struct mlx5_vdpa_net *ndev,
879 			    struct mlx5_vdpa_virtqueue *mvq,
880 			    bool filled)
881 {
882 	int inlen = MLX5_ST_SZ_BYTES(create_virtio_net_q_in);
883 	u32 out[MLX5_ST_SZ_DW(create_virtio_net_q_out)] = {};
884 	struct mlx5_vdpa_dev *mvdev = &ndev->mvdev;
885 	struct mlx5_vdpa_mr *vq_mr;
886 	struct mlx5_vdpa_mr *vq_desc_mr;
887 	u64 features = filled ? mvdev->actual_features : mvdev->mlx_features;
888 	void *obj_context;
889 	u16 mlx_features;
890 	void *cmd_hdr;
891 	void *vq_ctx;
892 	void *in;
893 	int err;
894 
895 	err = umems_create(ndev, mvq);
896 	if (err)
897 		return err;
898 
899 	in = kzalloc(inlen, GFP_KERNEL);
900 	if (!in) {
901 		err = -ENOMEM;
902 		goto err_alloc;
903 	}
904 
905 	mlx_features = get_features(features);
906 	cmd_hdr = MLX5_ADDR_OF(create_virtio_net_q_in, in, general_obj_in_cmd_hdr);
907 
908 	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, opcode, MLX5_CMD_OP_CREATE_GENERAL_OBJECT);
909 	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, obj_type, MLX5_OBJ_TYPE_VIRTIO_NET_Q);
910 	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, uid, ndev->mvdev.res.uid);
911 
912 	obj_context = MLX5_ADDR_OF(create_virtio_net_q_in, in, obj_context);
913 	MLX5_SET(virtio_net_q_object, obj_context, queue_feature_bit_mask_12_3,
914 		 mlx_features >> 3);
915 	MLX5_SET(virtio_net_q_object, obj_context, queue_feature_bit_mask_2_0,
916 		 mlx_features & 7);
917 	vq_ctx = MLX5_ADDR_OF(virtio_net_q_object, obj_context, virtio_q_context);
918 	MLX5_SET(virtio_q, vq_ctx, virtio_q_type, get_queue_type(ndev));
919 
920 	if (vq_is_tx(mvq->index))
921 		MLX5_SET(virtio_net_q_object, obj_context, tisn_or_qpn, ndev->res.tisn);
922 
923 	if (mvq->map.virq) {
924 		MLX5_SET(virtio_q, vq_ctx, event_mode, MLX5_VIRTIO_Q_EVENT_MODE_MSIX_MODE);
925 		MLX5_SET(virtio_q, vq_ctx, event_qpn_or_msix, mvq->map.index);
926 	} else {
927 		MLX5_SET(virtio_q, vq_ctx, event_mode, MLX5_VIRTIO_Q_EVENT_MODE_QP_MODE);
928 		MLX5_SET(virtio_q, vq_ctx, event_qpn_or_msix, mvq->fwqp.mqp.qpn);
929 	}
930 
931 	MLX5_SET(virtio_q, vq_ctx, queue_index, mvq->index);
932 	MLX5_SET(virtio_q, vq_ctx, queue_size, mvq->num_ent);
933 	MLX5_SET(virtio_q, vq_ctx, virtio_version_1_0,
934 		 !!(features & BIT_ULL(VIRTIO_F_VERSION_1)));
935 
936 	if (filled) {
937 		MLX5_SET(virtio_net_q_object, obj_context, hw_available_index, mvq->avail_idx);
938 		MLX5_SET(virtio_net_q_object, obj_context, hw_used_index, mvq->used_idx);
939 
940 		MLX5_SET64(virtio_q, vq_ctx, desc_addr, mvq->desc_addr);
941 		MLX5_SET64(virtio_q, vq_ctx, used_addr, mvq->device_addr);
942 		MLX5_SET64(virtio_q, vq_ctx, available_addr, mvq->driver_addr);
943 
944 		vq_mr = mvdev->mr[mvdev->group2asid[MLX5_VDPA_DATAVQ_GROUP]];
945 		if (vq_mr)
946 			MLX5_SET(virtio_q, vq_ctx, virtio_q_mkey, vq_mr->mkey);
947 
948 		vq_desc_mr = mvdev->mr[mvdev->group2asid[MLX5_VDPA_DATAVQ_DESC_GROUP]];
949 		if (vq_desc_mr &&
950 		    MLX5_CAP_DEV_VDPA_EMULATION(mvdev->mdev, desc_group_mkey_supported))
951 			MLX5_SET(virtio_q, vq_ctx, desc_group_mkey, vq_desc_mr->mkey);
952 	} else {
953 		/* If no MR is supplied at create time, flag the existing ones so they
954 		 * get programmed later, when the virtqueue is modified to ready.
955 		 */
956 		vq_mr = mvdev->mr[mvdev->group2asid[MLX5_VDPA_DATAVQ_GROUP]];
957 		if (vq_mr)
958 			mvq->modified_fields |= MLX5_VIRTQ_MODIFY_MASK_VIRTIO_Q_MKEY;
959 
960 		vq_desc_mr = mvdev->mr[mvdev->group2asid[MLX5_VDPA_DATAVQ_DESC_GROUP]];
961 		if (vq_desc_mr)
962 			mvq->modified_fields |= MLX5_VIRTQ_MODIFY_MASK_DESC_GROUP_MKEY;
963 	}
964 
965 	MLX5_SET(virtio_q, vq_ctx, umem_1_id, mvq->umem1.id);
966 	MLX5_SET(virtio_q, vq_ctx, umem_1_size, mvq->umem1.size);
967 	MLX5_SET(virtio_q, vq_ctx, umem_2_id, mvq->umem2.id);
968 	MLX5_SET(virtio_q, vq_ctx, umem_2_size, mvq->umem2.size);
969 	MLX5_SET(virtio_q, vq_ctx, umem_3_id, mvq->umem3.id);
970 	MLX5_SET(virtio_q, vq_ctx, umem_3_size, mvq->umem3.size);
971 	MLX5_SET(virtio_q, vq_ctx, pd, ndev->mvdev.res.pdn);
972 	if (counters_supported(&ndev->mvdev))
973 		MLX5_SET(virtio_q, vq_ctx, counter_set_id, mvq->counter_set_id);
974 
975 	err = mlx5_cmd_exec(ndev->mvdev.mdev, in, inlen, out, sizeof(out));
976 	if (err)
977 		goto err_cmd;
978 
979 	mvq->fw_state = MLX5_VIRTIO_NET_Q_OBJECT_STATE_INIT;
980 	kfree(in);
981 	mvq->virtq_id = MLX5_GET(general_obj_out_cmd_hdr, out, obj_id);
982 
983 	if (filled) {
984 		mlx5_vdpa_get_mr(mvdev, vq_mr);
985 		mvq->vq_mr = vq_mr;
986 
987 		if (vq_desc_mr &&
988 		    MLX5_CAP_DEV_VDPA_EMULATION(mvdev->mdev, desc_group_mkey_supported)) {
989 			mlx5_vdpa_get_mr(mvdev, vq_desc_mr);
990 			mvq->desc_mr = vq_desc_mr;
991 		}
992 	}
993 
994 	return 0;
995 
996 err_cmd:
997 	kfree(in);
998 err_alloc:
999 	umems_destroy(ndev, mvq);
1000 	return err;
1001 }
1002 
1003 static void destroy_virtqueue(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq)
1004 {
1005 	u32 in[MLX5_ST_SZ_DW(destroy_virtio_net_q_in)] = {};
1006 	u32 out[MLX5_ST_SZ_DW(destroy_virtio_net_q_out)] = {};
1007 
1008 	MLX5_SET(destroy_virtio_net_q_in, in, general_obj_out_cmd_hdr.opcode,
1009 		 MLX5_CMD_OP_DESTROY_GENERAL_OBJECT);
1010 	MLX5_SET(destroy_virtio_net_q_in, in, general_obj_out_cmd_hdr.obj_id, mvq->virtq_id);
1011 	MLX5_SET(destroy_virtio_net_q_in, in, general_obj_out_cmd_hdr.uid, ndev->mvdev.res.uid);
1012 	MLX5_SET(destroy_virtio_net_q_in, in, general_obj_out_cmd_hdr.obj_type,
1013 		 MLX5_OBJ_TYPE_VIRTIO_NET_Q);
1014 	if (mlx5_cmd_exec(ndev->mvdev.mdev, in, sizeof(in), out, sizeof(out))) {
1015 		mlx5_vdpa_warn(&ndev->mvdev, "destroy virtqueue 0x%x\n", mvq->virtq_id);
1016 		return;
1017 	}
1018 	mvq->fw_state = MLX5_VIRTIO_NET_Q_OBJECT_NONE;
1019 	umems_destroy(ndev, mvq);
1020 
1021 	mlx5_vdpa_put_mr(&ndev->mvdev, mvq->vq_mr);
1022 	mvq->vq_mr = NULL;
1023 
1024 	mlx5_vdpa_put_mr(&ndev->mvdev, mvq->desc_mr);
1025 	mvq->desc_mr = NULL;
1026 }
1027 
1028 static u32 get_rqpn(struct mlx5_vdpa_virtqueue *mvq, bool fw)
1029 {
1030 	return fw ? mvq->vqqp.mqp.qpn : mvq->fwqp.mqp.qpn;
1031 }
1032 
1033 static u32 get_qpn(struct mlx5_vdpa_virtqueue *mvq, bool fw)
1034 {
1035 	return fw ? mvq->fwqp.mqp.qpn : mvq->vqqp.mqp.qpn;
1036 }
1037 
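/* Allocate and fill the command mailboxes for the requested QP state
 * transition. On failure both *in and *out are set to NULL.
 */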
1038 static void alloc_inout(struct mlx5_vdpa_net *ndev, int cmd, void **in, int *inlen, void **out,
1039 			int *outlen, u32 qpn, u32 rqpn)
1040 {
1041 	void *qpc;
1042 	void *pp;
1043 
1044 	switch (cmd) {
1045 	case MLX5_CMD_OP_2RST_QP:
1046 		*inlen = MLX5_ST_SZ_BYTES(qp_2rst_in);
1047 		*outlen = MLX5_ST_SZ_BYTES(qp_2rst_out);
1048 		*in = kzalloc(*inlen, GFP_KERNEL);
1049 		*out = kzalloc(*outlen, GFP_KERNEL);
1050 		if (!*in || !*out)
1051 			goto outerr;
1052 
1053 		MLX5_SET(qp_2rst_in, *in, opcode, cmd);
1054 		MLX5_SET(qp_2rst_in, *in, uid, ndev->mvdev.res.uid);
1055 		MLX5_SET(qp_2rst_in, *in, qpn, qpn);
1056 		break;
1057 	case MLX5_CMD_OP_RST2INIT_QP:
1058 		*inlen = MLX5_ST_SZ_BYTES(rst2init_qp_in);
1059 		*outlen = MLX5_ST_SZ_BYTES(rst2init_qp_out);
1060 		*in = kzalloc(*inlen, GFP_KERNEL);
1061 		*out = kzalloc(MLX5_ST_SZ_BYTES(rst2init_qp_out), GFP_KERNEL);
1062 		if (!*in || !*out)
1063 			goto outerr;
1064 
1065 		MLX5_SET(rst2init_qp_in, *in, opcode, cmd);
1066 		MLX5_SET(rst2init_qp_in, *in, uid, ndev->mvdev.res.uid);
1067 		MLX5_SET(rst2init_qp_in, *in, qpn, qpn);
1068 		qpc = MLX5_ADDR_OF(rst2init_qp_in, *in, qpc);
1069 		MLX5_SET(qpc, qpc, remote_qpn, rqpn);
1070 		MLX5_SET(qpc, qpc, rwe, 1);
1071 		pp = MLX5_ADDR_OF(qpc, qpc, primary_address_path);
1072 		MLX5_SET(ads, pp, vhca_port_num, 1);
1073 		break;
1074 	case MLX5_CMD_OP_INIT2RTR_QP:
1075 		*inlen = MLX5_ST_SZ_BYTES(init2rtr_qp_in);
1076 		*outlen = MLX5_ST_SZ_BYTES(init2rtr_qp_out);
1077 		*in = kzalloc(*inlen, GFP_KERNEL);
1078 		*out = kzalloc(MLX5_ST_SZ_BYTES(init2rtr_qp_out), GFP_KERNEL);
1079 		if (!*in || !*out)
1080 			goto outerr;
1081 
1082 		MLX5_SET(init2rtr_qp_in, *in, opcode, cmd);
1083 		MLX5_SET(init2rtr_qp_in, *in, uid, ndev->mvdev.res.uid);
1084 		MLX5_SET(init2rtr_qp_in, *in, qpn, qpn);
1085 		qpc = MLX5_ADDR_OF(rst2init_qp_in, *in, qpc);
1086 		MLX5_SET(qpc, qpc, mtu, MLX5_QPC_MTU_256_BYTES);
1087 		MLX5_SET(qpc, qpc, log_msg_max, 30);
1088 		MLX5_SET(qpc, qpc, remote_qpn, rqpn);
1089 		pp = MLX5_ADDR_OF(qpc, qpc, primary_address_path);
1090 		MLX5_SET(ads, pp, fl, 1);
1091 		break;
1092 	case MLX5_CMD_OP_RTR2RTS_QP:
1093 		*inlen = MLX5_ST_SZ_BYTES(rtr2rts_qp_in);
1094 		*outlen = MLX5_ST_SZ_BYTES(rtr2rts_qp_out);
1095 		*in = kzalloc(*inlen, GFP_KERNEL);
1096 		*out = kzalloc(MLX5_ST_SZ_BYTES(rtr2rts_qp_out), GFP_KERNEL);
1097 		if (!*in || !*out)
1098 			goto outerr;
1099 
1100 		MLX5_SET(rtr2rts_qp_in, *in, opcode, cmd);
1101 		MLX5_SET(rtr2rts_qp_in, *in, uid, ndev->mvdev.res.uid);
1102 		MLX5_SET(rtr2rts_qp_in, *in, qpn, qpn);
1103 		qpc = MLX5_ADDR_OF(rst2init_qp_in, *in, qpc);
1104 		pp = MLX5_ADDR_OF(qpc, qpc, primary_address_path);
1105 		MLX5_SET(ads, pp, ack_timeout, 14);
1106 		MLX5_SET(qpc, qpc, retry_count, 7);
1107 		MLX5_SET(qpc, qpc, rnr_retry, 7);
1108 		break;
1109 	default:
1110 		goto outerr_nullify;
1111 	}
1112 
1113 	return;
1114 
1115 outerr:
1116 	kfree(*in);
1117 	kfree(*out);
1118 outerr_nullify:
1119 	*in = NULL;
1120 	*out = NULL;
1121 }
1122 
1123 static void free_inout(void *in, void *out)
1124 {
1125 	kfree(in);
1126 	kfree(out);
1127 }
1128 
1129 /* Two QPs are used by each virtqueue. One is used by the driver and one by
1130  * firmware. The fw argument indicates whether the QP being modified is the
1131  * one used by firmware.
1132  */
1133 static int modify_qp(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq, bool fw, int cmd)
1134 {
1135 	int outlen;
1136 	int inlen;
1137 	void *out;
1138 	void *in;
1139 	int err;
1140 
1141 	alloc_inout(ndev, cmd, &in, &inlen, &out, &outlen, get_qpn(mvq, fw), get_rqpn(mvq, fw));
1142 	if (!in || !out)
1143 		return -ENOMEM;
1144 
1145 	err = mlx5_cmd_exec(ndev->mvdev.mdev, in, inlen, out, outlen);
1146 	free_inout(in, out);
1147 	return err;
1148 }
1149 
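/* Drive both QPs of the notification channel through the RC state machine:
 * RESET -> INIT -> RTR for each side, then move the firmware QP to RTS so it
 * can post notifications towards the driver QP.
 */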
1150 static int connect_qps(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq)
1151 {
1152 	int err;
1153 
1154 	err = modify_qp(ndev, mvq, true, MLX5_CMD_OP_2RST_QP);
1155 	if (err)
1156 		return err;
1157 
1158 	err = modify_qp(ndev, mvq, false, MLX5_CMD_OP_2RST_QP);
1159 	if (err)
1160 		return err;
1161 
1162 	err = modify_qp(ndev, mvq, true, MLX5_CMD_OP_RST2INIT_QP);
1163 	if (err)
1164 		return err;
1165 
1166 	err = modify_qp(ndev, mvq, false, MLX5_CMD_OP_RST2INIT_QP);
1167 	if (err)
1168 		return err;
1169 
1170 	err = modify_qp(ndev, mvq, true, MLX5_CMD_OP_INIT2RTR_QP);
1171 	if (err)
1172 		return err;
1173 
1174 	err = modify_qp(ndev, mvq, false, MLX5_CMD_OP_INIT2RTR_QP);
1175 	if (err)
1176 		return err;
1177 
1178 	return modify_qp(ndev, mvq, true, MLX5_CMD_OP_RTR2RTS_QP);
1179 }
1180 
1181 struct mlx5_virtq_attr {
1182 	u8 state;
1183 	u16 available_index;
1184 	u16 used_index;
1185 };
1186 
1187 static int query_virtqueue(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq,
1188 			   struct mlx5_virtq_attr *attr)
1189 {
1190 	int outlen = MLX5_ST_SZ_BYTES(query_virtio_net_q_out);
1191 	u32 in[MLX5_ST_SZ_DW(query_virtio_net_q_in)] = {};
1192 	void *out;
1193 	void *obj_context;
1194 	void *cmd_hdr;
1195 	int err;
1196 
1197 	out = kzalloc(outlen, GFP_KERNEL);
1198 	if (!out)
1199 		return -ENOMEM;
1200 
1201 	cmd_hdr = MLX5_ADDR_OF(query_virtio_net_q_in, in, general_obj_in_cmd_hdr);
1202 
1203 	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, opcode, MLX5_CMD_OP_QUERY_GENERAL_OBJECT);
1204 	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, obj_type, MLX5_OBJ_TYPE_VIRTIO_NET_Q);
1205 	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, obj_id, mvq->virtq_id);
1206 	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, uid, ndev->mvdev.res.uid);
1207 	err = mlx5_cmd_exec(ndev->mvdev.mdev, in, sizeof(in), out, outlen);
1208 	if (err)
1209 		goto err_cmd;
1210 
1211 	obj_context = MLX5_ADDR_OF(query_virtio_net_q_out, out, obj_context);
1212 	memset(attr, 0, sizeof(*attr));
1213 	attr->state = MLX5_GET(virtio_net_q_object, obj_context, state);
1214 	attr->available_index = MLX5_GET(virtio_net_q_object, obj_context, hw_available_index);
1215 	attr->used_index = MLX5_GET(virtio_net_q_object, obj_context, hw_used_index);
1216 	kfree(out);
1217 	return 0;
1218 
1219 err_cmd:
1220 	kfree(out);
1221 	return err;
1222 }
1223 
1224 static bool is_resumable(struct mlx5_vdpa_net *ndev)
1225 {
1226 	return ndev->mvdev.vdev.config->resume;
1227 }
1228 
1229 static bool is_valid_state_change(int oldstate, int newstate, bool resumable)
1230 {
1231 	switch (oldstate) {
1232 	case MLX5_VIRTIO_NET_Q_OBJECT_STATE_INIT:
1233 		return newstate == MLX5_VIRTIO_NET_Q_OBJECT_STATE_RDY;
1234 	case MLX5_VIRTIO_NET_Q_OBJECT_STATE_RDY:
1235 		return newstate == MLX5_VIRTIO_NET_Q_OBJECT_STATE_SUSPEND;
1236 	case MLX5_VIRTIO_NET_Q_OBJECT_STATE_SUSPEND:
1237 		return resumable ? newstate == MLX5_VIRTIO_NET_Q_OBJECT_STATE_RDY : false;
1238 	case MLX5_VIRTIO_NET_Q_OBJECT_STATE_ERR:
1239 	default:
1240 		return false;
1241 	}
1242 }
1243 
1244 static bool modifiable_virtqueue_fields(struct mlx5_vdpa_virtqueue *mvq)
1245 {
1246 	/* Only state is always modifiable */
1247 	if (mvq->modified_fields & ~MLX5_VIRTQ_MODIFY_MASK_STATE)
1248 		return mvq->fw_state == MLX5_VIRTIO_NET_Q_OBJECT_STATE_INIT ||
1249 		       mvq->fw_state == MLX5_VIRTIO_NET_Q_OBJECT_STATE_SUSPEND;
1250 
1251 	return true;
1252 }
1253 
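/* Apply the fields accumulated in mvq->modified_fields to the firmware
 * virtqueue object with a single modify command. Anything other than the
 * state can only be changed while the queue is in INIT or SUSPEND state.
 */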
1254 static int modify_virtqueue(struct mlx5_vdpa_net *ndev,
1255 			    struct mlx5_vdpa_virtqueue *mvq,
1256 			    int state)
1257 {
1258 	int inlen = MLX5_ST_SZ_BYTES(modify_virtio_net_q_in);
1259 	u32 out[MLX5_ST_SZ_DW(modify_virtio_net_q_out)] = {};
1260 	struct mlx5_vdpa_dev *mvdev = &ndev->mvdev;
1261 	struct mlx5_vdpa_mr *desc_mr = NULL;
1262 	struct mlx5_vdpa_mr *vq_mr = NULL;
1263 	bool state_change = false;
1264 	void *obj_context;
1265 	void *cmd_hdr;
1266 	void *vq_ctx;
1267 	void *in;
1268 	int err;
1269 
1270 	if (mvq->fw_state == MLX5_VIRTIO_NET_Q_OBJECT_NONE)
1271 		return 0;
1272 
1273 	if (!modifiable_virtqueue_fields(mvq))
1274 		return -EINVAL;
1275 
1276 	in = kzalloc(inlen, GFP_KERNEL);
1277 	if (!in)
1278 		return -ENOMEM;
1279 
1280 	cmd_hdr = MLX5_ADDR_OF(modify_virtio_net_q_in, in, general_obj_in_cmd_hdr);
1281 
1282 	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, opcode, MLX5_CMD_OP_MODIFY_GENERAL_OBJECT);
1283 	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, obj_type, MLX5_OBJ_TYPE_VIRTIO_NET_Q);
1284 	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, obj_id, mvq->virtq_id);
1285 	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, uid, ndev->mvdev.res.uid);
1286 
1287 	obj_context = MLX5_ADDR_OF(modify_virtio_net_q_in, in, obj_context);
1288 	vq_ctx = MLX5_ADDR_OF(virtio_net_q_object, obj_context, virtio_q_context);
1289 
1290 	if (mvq->modified_fields & MLX5_VIRTQ_MODIFY_MASK_STATE) {
1291 		if (!is_valid_state_change(mvq->fw_state, state, is_resumable(ndev))) {
1292 			err = -EINVAL;
1293 			goto done;
1294 		}
1295 
1296 		MLX5_SET(virtio_net_q_object, obj_context, state, state);
1297 		state_change = true;
1298 	}
1299 
1300 	if (mvq->modified_fields & MLX5_VIRTQ_MODIFY_MASK_VIRTIO_Q_ADDRS) {
1301 		MLX5_SET64(virtio_q, vq_ctx, desc_addr, mvq->desc_addr);
1302 		MLX5_SET64(virtio_q, vq_ctx, used_addr, mvq->device_addr);
1303 		MLX5_SET64(virtio_q, vq_ctx, available_addr, mvq->driver_addr);
1304 	}
1305 
1306 	if (mvq->modified_fields & MLX5_VIRTQ_MODIFY_MASK_VIRTIO_Q_AVAIL_IDX)
1307 		MLX5_SET(virtio_net_q_object, obj_context, hw_available_index, mvq->avail_idx);
1308 
1309 	if (mvq->modified_fields & MLX5_VIRTQ_MODIFY_MASK_VIRTIO_Q_USED_IDX)
1310 		MLX5_SET(virtio_net_q_object, obj_context, hw_used_index, mvq->used_idx);
1311 
1312 	if (mvq->modified_fields & MLX5_VIRTQ_MODIFY_MASK_QUEUE_VIRTIO_VERSION)
1313 		MLX5_SET(virtio_q, vq_ctx, virtio_version_1_0,
1314 			!!(ndev->mvdev.actual_features & BIT_ULL(VIRTIO_F_VERSION_1)));
1315 
1316 	if (mvq->modified_fields & MLX5_VIRTQ_MODIFY_MASK_QUEUE_FEATURES) {
1317 		u16 mlx_features = get_features(ndev->mvdev.actual_features);
1318 
1319 		MLX5_SET(virtio_net_q_object, obj_context, queue_feature_bit_mask_12_3,
1320 			 mlx_features >> 3);
1321 		MLX5_SET(virtio_net_q_object, obj_context, queue_feature_bit_mask_2_0,
1322 			 mlx_features & 7);
1323 	}
1324 
1325 	if (mvq->modified_fields & MLX5_VIRTQ_MODIFY_MASK_VIRTIO_Q_MKEY) {
1326 		vq_mr = mvdev->mr[mvdev->group2asid[MLX5_VDPA_DATAVQ_GROUP]];
1327 
1328 		if (vq_mr)
1329 			MLX5_SET(virtio_q, vq_ctx, virtio_q_mkey, vq_mr->mkey);
1330 		else
1331 			mvq->modified_fields &= ~MLX5_VIRTQ_MODIFY_MASK_VIRTIO_Q_MKEY;
1332 	}
1333 
1334 	if (mvq->modified_fields & MLX5_VIRTQ_MODIFY_MASK_DESC_GROUP_MKEY) {
1335 		desc_mr = mvdev->mr[mvdev->group2asid[MLX5_VDPA_DATAVQ_DESC_GROUP]];
1336 
1337 		if (desc_mr && MLX5_CAP_DEV_VDPA_EMULATION(mvdev->mdev, desc_group_mkey_supported))
1338 			MLX5_SET(virtio_q, vq_ctx, desc_group_mkey, desc_mr->mkey);
1339 		else
1340 			mvq->modified_fields &= ~MLX5_VIRTQ_MODIFY_MASK_DESC_GROUP_MKEY;
1341 	}
1342 
1343 	MLX5_SET64(virtio_net_q_object, obj_context, modify_field_select, mvq->modified_fields);
1344 	err = mlx5_cmd_exec(ndev->mvdev.mdev, in, inlen, out, sizeof(out));
1345 	if (err)
1346 		goto done;
1347 
1348 	if (state_change)
1349 		mvq->fw_state = state;
1350 
1351 	if (mvq->modified_fields & MLX5_VIRTQ_MODIFY_MASK_VIRTIO_Q_MKEY) {
1352 		mlx5_vdpa_put_mr(mvdev, mvq->vq_mr);
1353 		mlx5_vdpa_get_mr(mvdev, vq_mr);
1354 		mvq->vq_mr = vq_mr;
1355 	}
1356 
1357 	if (mvq->modified_fields & MLX5_VIRTQ_MODIFY_MASK_DESC_GROUP_MKEY) {
1358 		mlx5_vdpa_put_mr(mvdev, mvq->desc_mr);
1359 		mlx5_vdpa_get_mr(mvdev, desc_mr);
1360 		mvq->desc_mr = desc_mr;
1361 	}
1362 
1363 	mvq->modified_fields = 0;
1364 
1365 done:
1366 	kfree(in);
1367 	return err;
1368 }
1369 
1370 static int modify_virtqueue_state(struct mlx5_vdpa_net *ndev,
1371 				  struct mlx5_vdpa_virtqueue *mvq,
1372 				  unsigned int state)
1373 {
1374 	mvq->modified_fields |= MLX5_VIRTQ_MODIFY_MASK_STATE;
1375 	return modify_virtqueue(ndev, mvq, state);
1376 }
1377 
1378 static int counter_set_alloc(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq)
1379 {
1380 	u32 in[MLX5_ST_SZ_DW(create_virtio_q_counters_in)] = {};
1381 	u32 out[MLX5_ST_SZ_DW(create_virtio_q_counters_out)] = {};
1382 	void *cmd_hdr;
1383 	int err;
1384 
1385 	if (!counters_supported(&ndev->mvdev))
1386 		return 0;
1387 
1388 	cmd_hdr = MLX5_ADDR_OF(create_virtio_q_counters_in, in, hdr);
1389 
1390 	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, opcode, MLX5_CMD_OP_CREATE_GENERAL_OBJECT);
1391 	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, obj_type, MLX5_OBJ_TYPE_VIRTIO_Q_COUNTERS);
1392 	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, uid, ndev->mvdev.res.uid);
1393 
1394 	err = mlx5_cmd_exec(ndev->mvdev.mdev, in, sizeof(in), out, sizeof(out));
1395 	if (err)
1396 		return err;
1397 
1398 	mvq->counter_set_id = MLX5_GET(general_obj_out_cmd_hdr, out, obj_id);
1399 
1400 	return 0;
1401 }
1402 
1403 static void counter_set_dealloc(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq)
1404 {
1405 	u32 in[MLX5_ST_SZ_DW(destroy_virtio_q_counters_in)] = {};
1406 	u32 out[MLX5_ST_SZ_DW(destroy_virtio_q_counters_out)] = {};
1407 
1408 	if (!counters_supported(&ndev->mvdev))
1409 		return;
1410 
1411 	MLX5_SET(destroy_virtio_q_counters_in, in, hdr.opcode, MLX5_CMD_OP_DESTROY_GENERAL_OBJECT);
1412 	MLX5_SET(destroy_virtio_q_counters_in, in, hdr.obj_id, mvq->counter_set_id);
1413 	MLX5_SET(destroy_virtio_q_counters_in, in, hdr.uid, ndev->mvdev.res.uid);
1414 	MLX5_SET(destroy_virtio_q_counters_in, in, hdr.obj_type, MLX5_OBJ_TYPE_VIRTIO_Q_COUNTERS);
1415 	if (mlx5_cmd_exec(ndev->mvdev.mdev, in, sizeof(in), out, sizeof(out)))
1416 		mlx5_vdpa_warn(&ndev->mvdev, "dealloc counter set 0x%x\n", mvq->counter_set_id);
1417 }
1418 
1419 static irqreturn_t mlx5_vdpa_int_handler(int irq, void *priv)
1420 {
1421 	struct vdpa_callback *cb = priv;
1422 
1423 	if (cb->callback)
1424 		return cb->callback(cb->private);
1425 
1426 	return IRQ_HANDLED;
1427 }
1428 
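/* Try to take a free entry from the device MSI-X pool and attach its interrupt
 * to this virtqueue's event callback. Failure is not fatal: the virtqueue then
 * falls back to QP (completion queue) based notification.
 */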
1429 static void alloc_vector(struct mlx5_vdpa_net *ndev,
1430 			 struct mlx5_vdpa_virtqueue *mvq)
1431 {
1432 	struct mlx5_vdpa_irq_pool *irqp = &ndev->irqp;
1433 	struct mlx5_vdpa_irq_pool_entry *ent;
1434 	int err;
1435 	int i;
1436 
1437 	for (i = 0; i < irqp->num_ent; i++) {
1438 		ent = &irqp->entries[i];
1439 		if (!ent->used) {
1440 			snprintf(ent->name, MLX5_VDPA_IRQ_NAME_LEN, "%s-vq-%d",
1441 				 dev_name(&ndev->mvdev.vdev.dev), mvq->index);
1442 			ent->dev_id = &ndev->event_cbs[mvq->index];
1443 			err = request_irq(ent->map.virq, mlx5_vdpa_int_handler, 0,
1444 					  ent->name, ent->dev_id);
1445 			if (err)
1446 				return;
1447 
1448 			ent->used = true;
1449 			mvq->map = ent->map;
1450 			return;
1451 		}
1452 	}
1453 }
1454 
1455 static void dealloc_vector(struct mlx5_vdpa_net *ndev,
1456 			   struct mlx5_vdpa_virtqueue *mvq)
1457 {
1458 	struct mlx5_vdpa_irq_pool *irqp = &ndev->irqp;
1459 	int i;
1460 
1461 	for (i = 0; i < irqp->num_ent; i++)
1462 		if (mvq->map.virq == irqp->entries[i].map.virq) {
1463 			free_irq(mvq->map.virq, irqp->entries[i].dev_id);
1464 			irqp->entries[i].used = false;
1465 			return;
1466 		}
1467 }
1468 
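/* Set up all per-virtqueue resources: the CQ, the firmware/driver QP pair used
 * as a notification channel, the counter set, an MSI-X vector when available,
 * and finally the virtqueue object itself. On error everything is torn down in
 * reverse order.
 */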
1469 static int setup_vq(struct mlx5_vdpa_net *ndev,
1470 		    struct mlx5_vdpa_virtqueue *mvq,
1471 		    bool filled)
1472 {
1473 	u16 idx = mvq->index;
1474 	int err;
1475 
1476 	if (mvq->initialized)
1477 		return 0;
1478 
1479 	err = cq_create(ndev, idx, mvq->num_ent);
1480 	if (err)
1481 		return err;
1482 
1483 	err = qp_create(ndev, mvq, &mvq->fwqp);
1484 	if (err)
1485 		goto err_fwqp;
1486 
1487 	err = qp_create(ndev, mvq, &mvq->vqqp);
1488 	if (err)
1489 		goto err_vqqp;
1490 
1491 	err = connect_qps(ndev, mvq);
1492 	if (err)
1493 		goto err_connect;
1494 
1495 	err = counter_set_alloc(ndev, mvq);
1496 	if (err)
1497 		goto err_connect;
1498 
1499 	alloc_vector(ndev, mvq);
1500 	err = create_virtqueue(ndev, mvq, filled);
1501 	if (err)
1502 		goto err_vq;
1503 
1504 	mvq->initialized = true;
1505 
1506 	if (mvq->ready) {
1507 		err = resume_vq(ndev, mvq);
1508 		if (err)
1509 			goto err_modify;
1510 	}
1511 
1512 	return 0;
1513 
1514 err_modify:
1515 	destroy_virtqueue(ndev, mvq);
1516 err_vq:
1517 	dealloc_vector(ndev, mvq);
1518 	counter_set_dealloc(ndev, mvq);
1519 err_connect:
1520 	qp_destroy(ndev, &mvq->vqqp);
1521 err_vqqp:
1522 	qp_destroy(ndev, &mvq->fwqp);
1523 err_fwqp:
1524 	cq_destroy(ndev, idx);
1525 	return err;
1526 }
1527 
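/* Suspend a ready virtqueue and snapshot its available/used indexes so they
 * can be restored when the queue is resumed or re-created.
 */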
1528 static int suspend_vq(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq)
1529 {
1530 	struct mlx5_virtq_attr attr;
1531 	int err;
1532 
1533 	if (!mvq->initialized)
1534 		return 0;
1535 
1536 	if (mvq->fw_state != MLX5_VIRTIO_NET_Q_OBJECT_STATE_RDY)
1537 		return 0;
1538 
1539 	err = modify_virtqueue_state(ndev, mvq, MLX5_VIRTIO_NET_Q_OBJECT_STATE_SUSPEND);
1540 	if (err) {
1541 		mlx5_vdpa_warn(&ndev->mvdev, "modify to suspend failed, err: %d\n", err);
1542 		return err;
1543 	}
1544 
1545 	err = query_virtqueue(ndev, mvq, &attr);
1546 	if (err) {
1547 		mlx5_vdpa_warn(&ndev->mvdev, "failed to query virtqueue, err: %d\n", err);
1548 		return err;
1549 	}
1550 
1551 	mvq->avail_idx = attr.available_index;
1552 	mvq->used_idx = attr.used_index;
1553 
1554 	return 0;
1555 }
1556 
1557 static int suspend_vqs(struct mlx5_vdpa_net *ndev)
1558 {
1559 	int err = 0;
1560 	int i;
1561 
1562 	for (i = 0; i < ndev->cur_num_vqs; i++) {
1563 		int local_err = suspend_vq(ndev, &ndev->vqs[i]);
1564 
1565 		err = local_err ? local_err : err;
1566 	}
1567 
1568 	return err;
1569 }
1570 
1571 static int resume_vq(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq)
1572 {
1573 	int err;
1574 
1575 	if (!mvq->initialized)
1576 		return 0;
1577 
1578 	if (mvq->index >= ndev->cur_num_vqs)
1579 		return 0;
1580 
1581 	switch (mvq->fw_state) {
1582 	case MLX5_VIRTIO_NET_Q_OBJECT_STATE_INIT:
1583 		/* Due to a FW quirk we need to modify the VQ fields first, then change state.
1584 		 * This should be fixed soon. After that, a single command can be used.
1585 		 */
1586 		err = modify_virtqueue(ndev, mvq, 0);
1587 		if (err) {
1588 			mlx5_vdpa_warn(&ndev->mvdev,
1589 				"modify vq properties failed for vq %u, err: %d\n",
1590 				mvq->index, err);
1591 			return err;
1592 		}
1593 		break;
1594 	case MLX5_VIRTIO_NET_Q_OBJECT_STATE_SUSPEND:
1595 		if (!is_resumable(ndev)) {
1596 			mlx5_vdpa_warn(&ndev->mvdev, "vq %d is not resumable\n", mvq->index);
1597 			return -EINVAL;
1598 		}
1599 		break;
1600 	case MLX5_VIRTIO_NET_Q_OBJECT_STATE_RDY:
1601 		return 0;
1602 	default:
1603 		mlx5_vdpa_warn(&ndev->mvdev, "resume vq %u called from bad state %d\n",
1604 			       mvq->index, mvq->fw_state);
1605 		return -EINVAL;
1606 	}
1607 
1608 	err = modify_virtqueue_state(ndev, mvq, MLX5_VIRTIO_NET_Q_OBJECT_STATE_RDY);
1609 	if (err)
1610 		mlx5_vdpa_warn(&ndev->mvdev, "modify to resume failed for vq %u, err: %d\n",
1611 			       mvq->index, err);
1612 
1613 	return err;
1614 }
1615 
1616 static int resume_vqs(struct mlx5_vdpa_net *ndev)
1617 {
1618 	int err = 0;
1619 
1620 	for (int i = 0; i < ndev->cur_num_vqs; i++) {
1621 		int local_err = resume_vq(ndev, &ndev->vqs[i]);
1622 
1623 		err = local_err ? local_err : err;
1624 	}
1625 
1626 	return err;
1627 }
1628 
1629 static void teardown_vq(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq)
1630 {
1631 	if (!mvq->initialized)
1632 		return;
1633 
1634 	suspend_vq(ndev, mvq);
1635 	mvq->modified_fields = 0;
1636 	destroy_virtqueue(ndev, mvq);
1637 	dealloc_vector(ndev, mvq);
1638 	counter_set_dealloc(ndev, mvq);
1639 	qp_destroy(ndev, &mvq->vqqp);
1640 	qp_destroy(ndev, &mvq->fwqp);
1641 	cq_destroy(ndev, mvq->index);
1642 	mvq->initialized = false;
1643 }
1644 
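/* Build the RQT from the RX virtqueue ids (even indexes). The list is padded
 * to a power of two by wrapping around the currently active queues.
 */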
1645 static int create_rqt(struct mlx5_vdpa_net *ndev)
1646 {
1647 	int rqt_table_size = roundup_pow_of_two(ndev->rqt_size);
1648 	int act_sz = roundup_pow_of_two(ndev->cur_num_vqs / 2);
1649 	__be32 *list;
1650 	void *rqtc;
1651 	int inlen;
1652 	void *in;
1653 	int i, j;
1654 	int err;
1655 
1656 	inlen = MLX5_ST_SZ_BYTES(create_rqt_in) + rqt_table_size * MLX5_ST_SZ_BYTES(rq_num);
1657 	in = kzalloc(inlen, GFP_KERNEL);
1658 	if (!in)
1659 		return -ENOMEM;
1660 
1661 	MLX5_SET(create_rqt_in, in, uid, ndev->mvdev.res.uid);
1662 	rqtc = MLX5_ADDR_OF(create_rqt_in, in, rqt_context);
1663 
1664 	MLX5_SET(rqtc, rqtc, list_q_type, MLX5_RQTC_LIST_Q_TYPE_VIRTIO_NET_Q);
1665 	MLX5_SET(rqtc, rqtc, rqt_max_size, rqt_table_size);
1666 	list = MLX5_ADDR_OF(rqtc, rqtc, rq_num[0]);
1667 	for (i = 0, j = 0; i < act_sz; i++, j += 2)
1668 		list[i] = cpu_to_be32(ndev->vqs[j % ndev->cur_num_vqs].virtq_id);
1669 
1670 	MLX5_SET(rqtc, rqtc, rqt_actual_size, act_sz);
1671 	err = mlx5_vdpa_create_rqt(&ndev->mvdev, in, inlen, &ndev->res.rqtn);
1672 	kfree(in);
1673 	if (err)
1674 		return err;
1675 
1676 	return 0;
1677 }
1678 
1679 #define MLX5_MODIFY_RQT_NUM_RQS ((u64)1)
1680 
1681 static int modify_rqt(struct mlx5_vdpa_net *ndev, int num)
1682 {
1683 	int act_sz = roundup_pow_of_two(num / 2);
1684 	__be32 *list;
1685 	void *rqtc;
1686 	int inlen;
1687 	void *in;
1688 	int i, j;
1689 	int err;
1690 
1691 	inlen = MLX5_ST_SZ_BYTES(modify_rqt_in) + act_sz * MLX5_ST_SZ_BYTES(rq_num);
1692 	in = kzalloc(inlen, GFP_KERNEL);
1693 	if (!in)
1694 		return -ENOMEM;
1695 
1696 	MLX5_SET(modify_rqt_in, in, uid, ndev->mvdev.res.uid);
1697 	MLX5_SET64(modify_rqt_in, in, bitmask, MLX5_MODIFY_RQT_NUM_RQS);
1698 	rqtc = MLX5_ADDR_OF(modify_rqt_in, in, ctx);
1699 	MLX5_SET(rqtc, rqtc, list_q_type, MLX5_RQTC_LIST_Q_TYPE_VIRTIO_NET_Q);
1700 
1701 	list = MLX5_ADDR_OF(rqtc, rqtc, rq_num[0]);
1702 	for (i = 0, j = 0; i < act_sz; i++, j = j + 2)
1703 		list[i] = cpu_to_be32(ndev->vqs[j % num].virtq_id);
1704 
1705 	MLX5_SET(rqtc, rqtc, rqt_actual_size, act_sz);
1706 	err = mlx5_vdpa_modify_rqt(&ndev->mvdev, in, inlen, ndev->res.rqtn);
1707 	kfree(in);
1708 	if (err)
1709 		return err;
1710 
1711 	return 0;
1712 }
1713 
1714 static void destroy_rqt(struct mlx5_vdpa_net *ndev)
1715 {
1716 	mlx5_vdpa_destroy_rqt(&ndev->mvdev, ndev->res.rqtn);
1717 }
1718 
1719 static int create_tir(struct mlx5_vdpa_net *ndev)
1720 {
1721 #define HASH_IP_L4PORTS                                                                            \
1722 	(MLX5_HASH_FIELD_SEL_SRC_IP | MLX5_HASH_FIELD_SEL_DST_IP | MLX5_HASH_FIELD_SEL_L4_SPORT |  \
1723 	 MLX5_HASH_FIELD_SEL_L4_DPORT)
1724 	static const u8 rx_hash_toeplitz_key[] = { 0x2c, 0xc6, 0x81, 0xd1, 0x5b, 0xdb, 0xf4, 0xf7,
1725 						   0xfc, 0xa2, 0x83, 0x19, 0xdb, 0x1a, 0x3e, 0x94,
1726 						   0x6b, 0x9e, 0x38, 0xd9, 0x2c, 0x9c, 0x03, 0xd1,
1727 						   0xad, 0x99, 0x44, 0xa7, 0xd9, 0x56, 0x3d, 0x59,
1728 						   0x06, 0x3c, 0x25, 0xf3, 0xfc, 0x1f, 0xdc, 0x2a };
1729 	void *rss_key;
1730 	void *outer;
1731 	void *tirc;
1732 	void *in;
1733 	int err;
1734 
1735 	in = kzalloc(MLX5_ST_SZ_BYTES(create_tir_in), GFP_KERNEL);
1736 	if (!in)
1737 		return -ENOMEM;
1738 
1739 	MLX5_SET(create_tir_in, in, uid, ndev->mvdev.res.uid);
1740 	tirc = MLX5_ADDR_OF(create_tir_in, in, ctx);
1741 	MLX5_SET(tirc, tirc, disp_type, MLX5_TIRC_DISP_TYPE_INDIRECT);
1742 
1743 	MLX5_SET(tirc, tirc, rx_hash_symmetric, 1);
1744 	MLX5_SET(tirc, tirc, rx_hash_fn, MLX5_RX_HASH_FN_TOEPLITZ);
1745 	rss_key = MLX5_ADDR_OF(tirc, tirc, rx_hash_toeplitz_key);
1746 	memcpy(rss_key, rx_hash_toeplitz_key, sizeof(rx_hash_toeplitz_key));
1747 
1748 	outer = MLX5_ADDR_OF(tirc, tirc, rx_hash_field_selector_outer);
1749 	MLX5_SET(rx_hash_field_select, outer, l3_prot_type, MLX5_L3_PROT_TYPE_IPV4);
1750 	MLX5_SET(rx_hash_field_select, outer, l4_prot_type, MLX5_L4_PROT_TYPE_TCP);
1751 	MLX5_SET(rx_hash_field_select, outer, selected_fields, HASH_IP_L4PORTS);
1752 
1753 	MLX5_SET(tirc, tirc, indirect_table, ndev->res.rqtn);
1754 	MLX5_SET(tirc, tirc, transport_domain, ndev->res.tdn);
1755 
1756 	err = mlx5_vdpa_create_tir(&ndev->mvdev, in, &ndev->res.tirn);
1757 	kfree(in);
1758 	if (err)
1759 		return err;
1760 
1761 	mlx5_vdpa_add_tirn(ndev);
1762 	return err;
1763 }
1764 
1765 static void destroy_tir(struct mlx5_vdpa_net *ndev)
1766 {
1767 	mlx5_vdpa_remove_tirn(ndev);
1768 	mlx5_vdpa_destroy_tir(&ndev->mvdev, ndev->res.tirn);
1769 }
1770 
1771 #define MAX_STEERING_ENT 0x8000
1772 #define MAX_STEERING_GROUPS 2
1773 
1774 #if defined(CONFIG_MLX5_VDPA_STEERING_DEBUG)
1775        #define NUM_DESTS 2
1776 #else
1777        #define NUM_DESTS 1
1778 #endif
1779 
1780 static int add_steering_counters(struct mlx5_vdpa_net *ndev,
1781 				 struct macvlan_node *node,
1782 				 struct mlx5_flow_act *flow_act,
1783 				 struct mlx5_flow_destination *dests)
1784 {
1785 #if defined(CONFIG_MLX5_VDPA_STEERING_DEBUG)
1786 	int err;
1787 
1788 	node->ucast_counter.counter = mlx5_fc_create(ndev->mvdev.mdev, false);
1789 	if (IS_ERR(node->ucast_counter.counter))
1790 		return PTR_ERR(node->ucast_counter.counter);
1791 
1792 	node->mcast_counter.counter = mlx5_fc_create(ndev->mvdev.mdev, false);
1793 	if (IS_ERR(node->mcast_counter.counter)) {
1794 		err = PTR_ERR(node->mcast_counter.counter);
1795 		goto err_mcast_counter;
1796 	}
1797 
1798 	dests[1].type = MLX5_FLOW_DESTINATION_TYPE_COUNTER;
1799 	flow_act->action |= MLX5_FLOW_CONTEXT_ACTION_COUNT;
1800 	return 0;
1801 
1802 err_mcast_counter:
1803 	mlx5_fc_destroy(ndev->mvdev.mdev, node->ucast_counter.counter);
1804 	return err;
1805 #else
1806 	return 0;
1807 #endif
1808 }
1809 
1810 static void remove_steering_counters(struct mlx5_vdpa_net *ndev,
1811 				     struct macvlan_node *node)
1812 {
1813 #if defined(CONFIG_MLX5_VDPA_STEERING_DEBUG)
1814 	mlx5_fc_destroy(ndev->mvdev.mdev, node->mcast_counter.counter);
1815 	mlx5_fc_destroy(ndev->mvdev.mdev, node->ucast_counter.counter);
1816 #endif
1817 }
1818 
1819 static int mlx5_vdpa_add_mac_vlan_rules(struct mlx5_vdpa_net *ndev, u8 *mac,
1820 					struct macvlan_node *node)
1821 {
1822 	struct mlx5_flow_destination dests[NUM_DESTS] = {};
1823 	struct mlx5_flow_act flow_act = {};
1824 	struct mlx5_flow_spec *spec;
1825 	void *headers_c;
1826 	void *headers_v;
1827 	u8 *dmac_c;
1828 	u8 *dmac_v;
1829 	int err;
1830 	u16 vid;
1831 
1832 	spec = kvzalloc(sizeof(*spec), GFP_KERNEL);
1833 	if (!spec)
1834 		return -ENOMEM;
1835 
1836 	vid = key2vid(node->macvlan);
1837 	spec->match_criteria_enable = MLX5_MATCH_OUTER_HEADERS;
1838 	headers_c = MLX5_ADDR_OF(fte_match_param, spec->match_criteria, outer_headers);
1839 	headers_v = MLX5_ADDR_OF(fte_match_param, spec->match_value, outer_headers);
1840 	dmac_c = MLX5_ADDR_OF(fte_match_param, headers_c, outer_headers.dmac_47_16);
1841 	dmac_v = MLX5_ADDR_OF(fte_match_param, headers_v, outer_headers.dmac_47_16);
1842 	eth_broadcast_addr(dmac_c);
1843 	ether_addr_copy(dmac_v, mac);
1844 	if (ndev->mvdev.actual_features & BIT_ULL(VIRTIO_NET_F_CTRL_VLAN)) {
1845 		MLX5_SET(fte_match_set_lyr_2_4, headers_c, cvlan_tag, 1);
1846 		MLX5_SET_TO_ONES(fte_match_set_lyr_2_4, headers_c, first_vid);
1847 	}
1848 	if (node->tagged) {
1849 		MLX5_SET(fte_match_set_lyr_2_4, headers_v, cvlan_tag, 1);
1850 		MLX5_SET(fte_match_set_lyr_2_4, headers_v, first_vid, vid);
1851 	}
1852 	flow_act.action = MLX5_FLOW_CONTEXT_ACTION_FWD_DEST;
1853 	dests[0].type = MLX5_FLOW_DESTINATION_TYPE_TIR;
1854 	dests[0].tir_num = ndev->res.tirn;
1855 	err = add_steering_counters(ndev, node, &flow_act, dests);
1856 	if (err)
1857 		goto out_free;
1858 
1859 #if defined(CONFIG_MLX5_VDPA_STEERING_DEBUG)
1860 	dests[1].counter_id = mlx5_fc_id(node->ucast_counter.counter);
1861 #endif
1862 	node->ucast_rule = mlx5_add_flow_rules(ndev->rxft, spec, &flow_act, dests, NUM_DESTS);
1863 	if (IS_ERR(node->ucast_rule)) {
1864 		err = PTR_ERR(node->ucast_rule);
1865 		goto err_ucast;
1866 	}
1867 
1868 #if defined(CONFIG_MLX5_VDPA_STEERING_DEBUG)
1869 	dests[1].counter_id = mlx5_fc_id(node->mcast_counter.counter);
1870 #endif
1871 
1872 	memset(dmac_c, 0, ETH_ALEN);
1873 	memset(dmac_v, 0, ETH_ALEN);
1874 	dmac_c[0] = 1;
1875 	dmac_v[0] = 1;
1876 	node->mcast_rule = mlx5_add_flow_rules(ndev->rxft, spec, &flow_act, dests, NUM_DESTS);
1877 	if (IS_ERR(node->mcast_rule)) {
1878 		err = PTR_ERR(node->mcast_rule);
1879 		goto err_mcast;
1880 	}
1881 	kvfree(spec);
1882 	mlx5_vdpa_add_rx_counters(ndev, node);
1883 	return 0;
1884 
1885 err_mcast:
1886 	mlx5_del_flow_rules(node->ucast_rule);
1887 err_ucast:
1888 	remove_steering_counters(ndev, node);
1889 out_free:
1890 	kvfree(spec);
1891 	return err;
1892 }
1893 
1894 static void mlx5_vdpa_del_mac_vlan_rules(struct mlx5_vdpa_net *ndev,
1895 					 struct macvlan_node *node)
1896 {
1897 	mlx5_vdpa_remove_rx_counters(ndev, node);
1898 	mlx5_del_flow_rules(node->ucast_rule);
1899 	mlx5_del_flow_rules(node->mcast_rule);
1900 }
1901 
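/* Build the 64-bit MAC/VLAN lookup key: the VLAN ID (or MLX5V_UNTAGGED for
 * untagged traffic) occupies bits 48..63 and the MAC address the low 48 bits.
 */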
1902 static u64 search_val(u8 *mac, u16 vlan, bool tagged)
1903 {
1904 	u64 val;
1905 
1906 	if (!tagged)
1907 		vlan = MLX5V_UNTAGGED;
1908 
1909 	val = (u64)vlan << 48 |
1910 	      (u64)mac[0] << 40 |
1911 	      (u64)mac[1] << 32 |
1912 	      (u64)mac[2] << 24 |
1913 	      (u64)mac[3] << 16 |
1914 	      (u64)mac[4] << 8 |
1915 	      (u64)mac[5];
1916 
1917 	return val;
1918 }
1919 
1920 static struct macvlan_node *mac_vlan_lookup(struct mlx5_vdpa_net *ndev, u64 value)
1921 {
1922 	struct macvlan_node *pos;
1923 	u32 idx;
1924 
1925 	idx = hash_64(value, 8); /* 8-bit hash selects a macvlan_hash bucket */
1926 	hlist_for_each_entry(pos, &ndev->macvlan_hash[idx], hlist) {
1927 		if (pos->macvlan == value)
1928 			return pos;
1929 	}
1930 	return NULL;
1931 }
1932 
1933 static int mac_vlan_add(struct mlx5_vdpa_net *ndev, u8 *mac, u16 vid, bool tagged)
1934 {
1935 	struct macvlan_node *ptr;
1936 	u64 val;
1937 	u32 idx;
1938 	int err;
1939 
1940 	val = search_val(mac, vid, tagged);
1941 	if (mac_vlan_lookup(ndev, val))
1942 		return -EEXIST;
1943 
1944 	ptr = kzalloc(sizeof(*ptr), GFP_KERNEL);
1945 	if (!ptr)
1946 		return -ENOMEM;
1947 
1948 	ptr->tagged = tagged;
1949 	ptr->macvlan = val;
1950 	ptr->ndev = ndev;
1951 	err = mlx5_vdpa_add_mac_vlan_rules(ndev, ndev->config.mac, ptr);
1952 	if (err)
1953 		goto err_add;
1954 
1955 	idx = hash_64(val, 8);
1956 	hlist_add_head(&ptr->hlist, &ndev->macvlan_hash[idx]);
1957 	return 0;
1958 
1959 err_add:
1960 	kfree(ptr);
1961 	return err;
1962 }
1963 
1964 static void mac_vlan_del(struct mlx5_vdpa_net *ndev, u8 *mac, u16 vlan, bool tagged)
1965 {
1966 	struct macvlan_node *ptr;
1967 
1968 	ptr = mac_vlan_lookup(ndev, search_val(mac, vlan, tagged));
1969 	if (!ptr)
1970 		return;
1971 
1972 	hlist_del(&ptr->hlist);
1973 	mlx5_vdpa_del_mac_vlan_rules(ndev, ptr);
1974 	remove_steering_counters(ndev, ptr);
1975 	kfree(ptr);
1976 }
1977 
1978 static void clear_mac_vlan_table(struct mlx5_vdpa_net *ndev)
1979 {
1980 	struct macvlan_node *pos;
1981 	struct hlist_node *n;
1982 	int i;
1983 
1984 	for (i = 0; i < MLX5V_MACVLAN_SIZE; i++) {
1985 		hlist_for_each_entry_safe(pos, n, &ndev->macvlan_hash[i], hlist) {
1986 			hlist_del(&pos->hlist);
1987 			mlx5_vdpa_del_mac_vlan_rules(ndev, pos);
1988 			remove_steering_counters(ndev, pos);
1989 			kfree(pos);
1990 		}
1991 	}
1992 }
1993 
1994 static int setup_steering(struct mlx5_vdpa_net *ndev)
1995 {
1996 	struct mlx5_flow_table_attr ft_attr = {};
1997 	struct mlx5_flow_namespace *ns;
1998 	int err;
1999 
2000 	ft_attr.max_fte = MAX_STEERING_ENT;
2001 	ft_attr.autogroup.max_num_groups = MAX_STEERING_GROUPS;
2002 
2003 	ns = mlx5_get_flow_namespace(ndev->mvdev.mdev, MLX5_FLOW_NAMESPACE_BYPASS);
2004 	if (!ns) {
2005 		mlx5_vdpa_warn(&ndev->mvdev, "failed to get flow namespace\n");
2006 		return -EOPNOTSUPP;
2007 	}
2008 
2009 	ndev->rxft = mlx5_create_auto_grouped_flow_table(ns, &ft_attr);
2010 	if (IS_ERR(ndev->rxft)) {
2011 		mlx5_vdpa_warn(&ndev->mvdev, "failed to create flow table\n");
2012 		return PTR_ERR(ndev->rxft);
2013 	}
2014 	mlx5_vdpa_add_rx_flow_table(ndev);
2015 
2016 	err = mac_vlan_add(ndev, ndev->config.mac, 0, false);
2017 	if (err)
2018 		goto err_add;
2019 
2020 	return 0;
2021 
2022 err_add:
2023 	mlx5_vdpa_remove_rx_flow_table(ndev);
2024 	mlx5_destroy_flow_table(ndev->rxft);
2025 	return err;
2026 }
2027 
2028 static void teardown_steering(struct mlx5_vdpa_net *ndev)
2029 {
2030 	clear_mac_vlan_table(ndev);
2031 	mlx5_vdpa_remove_rx_flow_table(ndev);
2032 	mlx5_destroy_flow_table(ndev->rxft);
2033 }
2034 
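/* Handle VIRTIO_NET_CTRL_MAC control commands. For MAC_ADDR_SET, the MPFS
 * entry and the RX steering rules are moved to the new MAC; on failure, the
 * original MAC and rules are restored as far as possible.
 */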
2035 static virtio_net_ctrl_ack handle_ctrl_mac(struct mlx5_vdpa_dev *mvdev, u8 cmd)
2036 {
2037 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
2038 	struct mlx5_control_vq *cvq = &mvdev->cvq;
2039 	virtio_net_ctrl_ack status = VIRTIO_NET_ERR;
2040 	struct mlx5_core_dev *pfmdev;
2041 	size_t read;
2042 	u8 mac[ETH_ALEN], mac_back[ETH_ALEN];
2043 
2044 	pfmdev = pci_get_drvdata(pci_physfn(mvdev->mdev->pdev));
2045 	switch (cmd) {
2046 	case VIRTIO_NET_CTRL_MAC_ADDR_SET:
2047 		read = vringh_iov_pull_iotlb(&cvq->vring, &cvq->riov, (void *)mac, ETH_ALEN);
2048 		if (read != ETH_ALEN)
2049 			break;
2050 
2051 		if (!memcmp(ndev->config.mac, mac, ETH_ALEN)) {
2052 			status = VIRTIO_NET_OK;
2053 			break;
2054 		}
2055 
2056 		if (is_zero_ether_addr(mac))
2057 			break;
2058 
2059 		if (!is_zero_ether_addr(ndev->config.mac)) {
2060 			if (mlx5_mpfs_del_mac(pfmdev, ndev->config.mac)) {
2061 				mlx5_vdpa_warn(mvdev, "failed to delete old MAC %pM from MPFS table\n",
2062 					       ndev->config.mac);
2063 				break;
2064 			}
2065 		}
2066 
2067 		if (mlx5_mpfs_add_mac(pfmdev, mac)) {
2068 			mlx5_vdpa_warn(mvdev, "failed to insert new MAC %pM into MPFS table\n",
2069 				       mac);
2070 			break;
2071 		}
2072 
2073 		/* Back up the original MAC address so that it can be restored if
2074 		 * adding the forwarding rules fails.
2075 		 */
2076 		memcpy(mac_back, ndev->config.mac, ETH_ALEN);
2077 
2078 		memcpy(ndev->config.mac, mac, ETH_ALEN);
2079 
2080 		/* Re-create the flow table entry so that traffic to the new MAC is forwarded.
2081 		 */
2082 		mac_vlan_del(ndev, mac_back, 0, false);
2083 
2084 		if (mac_vlan_add(ndev, ndev->config.mac, 0, false)) {
2085 			mlx5_vdpa_warn(mvdev, "failed to insert forward rules, try to restore\n");
2086 
2087 			/* This path is unlikely to be taken, but double-check anyway. */
2088 			if (is_zero_ether_addr(mac_back)) {
2089 				mlx5_vdpa_warn(mvdev, "restore mac failed: Original MAC is zero\n");
2090 				break;
2091 			}
2092 
2093 			/* Try to restore the original MAC address in the MPFS table and to
2094 			 * restore the forwarding rule entry.
2095 			 */
2096 			if (mlx5_mpfs_del_mac(pfmdev, ndev->config.mac)) {
2097 				mlx5_vdpa_warn(mvdev, "restore mac failed: delete MAC %pM from MPFS table failed\n",
2098 					       ndev->config.mac);
2099 			}
2100 
2101 			if (mlx5_mpfs_add_mac(pfmdev, mac_back)) {
2102 				mlx5_vdpa_warn(mvdev, "restore mac failed: insert old MAC %pM into MPFS table failed\n",
2103 					       mac_back);
2104 			}
2105 
2106 			memcpy(ndev->config.mac, mac_back, ETH_ALEN);
2107 
2108 			if (mac_vlan_add(ndev, ndev->config.mac, 0, false))
2109 				mlx5_vdpa_warn(mvdev, "restore forward rules failed: insert forward rules failed\n");
2110 
2111 			break;
2112 		}
2113 
2114 		status = VIRTIO_NET_OK;
2115 		break;
2116 
2117 	default:
2118 		break;
2119 	}
2120 
2121 	return status;
2122 }
2123 
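/* Change the number of enabled VQ pairs. When shrinking, the RQT is reduced
 * first and the excess VQs are then suspended (or torn down if the device is
 * not resumable). When growing, the additional VQs are resumed or created
 * first and the RQT is enlarged afterwards; newly added VQs are torn down if
 * any step fails.
 */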
2124 static int change_num_qps(struct mlx5_vdpa_dev *mvdev, int newqps)
2125 {
2126 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
2127 	int cur_qps = ndev->cur_num_vqs / 2;
2128 	int err;
2129 	int i;
2130 
2131 	if (cur_qps > newqps) {
2132 		err = modify_rqt(ndev, 2 * newqps);
2133 		if (err)
2134 			return err;
2135 
2136 		for (i = ndev->cur_num_vqs - 1; i >= 2 * newqps; i--) {
2137 			struct mlx5_vdpa_virtqueue *mvq = &ndev->vqs[i];
2138 
2139 			if (is_resumable(ndev))
2140 				suspend_vq(ndev, mvq);
2141 			else
2142 				teardown_vq(ndev, mvq);
2143 		}
2144 
2145 		ndev->cur_num_vqs = 2 * newqps;
2146 	} else {
2147 		ndev->cur_num_vqs = 2 * newqps;
2148 		for (i = cur_qps * 2; i < 2 * newqps; i++) {
2149 			struct mlx5_vdpa_virtqueue *mvq = &ndev->vqs[i];
2150 
2151 			err = mvq->initialized ? resume_vq(ndev, mvq) : setup_vq(ndev, mvq, true);
2152 			if (err)
2153 				goto clean_added;
2154 		}
2155 		err = modify_rqt(ndev, 2 * newqps);
2156 		if (err)
2157 			goto clean_added;
2158 	}
2159 	return 0;
2160 
2161 clean_added:
2162 	for (--i; i >= 2 * cur_qps; --i)
2163 		teardown_vq(ndev, &ndev->vqs[i]);
2164 
2165 	ndev->cur_num_vqs = 2 * cur_qps;
2166 
2167 	return err;
2168 }
2169 
2170 static virtio_net_ctrl_ack handle_ctrl_mq(struct mlx5_vdpa_dev *mvdev, u8 cmd)
2171 {
2172 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
2173 	virtio_net_ctrl_ack status = VIRTIO_NET_ERR;
2174 	struct mlx5_control_vq *cvq = &mvdev->cvq;
2175 	struct virtio_net_ctrl_mq mq;
2176 	size_t read;
2177 	u16 newqps;
2178 
2179 	switch (cmd) {
2180 	case VIRTIO_NET_CTRL_MQ_VQ_PAIRS_SET:
2181 		/* This mq feature check aligns with the pre-existing userspace
2182 		 * implementation.
2183 		 *
2184 		 * Without it, an untrusted driver could fake a multiqueue config
2185 		 * request down to a non-mq device, which may cause the kernel to
2186 		 * panic due to uninitialized resources for the extra vqs. Even
2187 		 * with a well-behaved guest driver, changing the number of vqs
2188 		 * on a non-mq device is not expected to be allowed.
2189 		 */
2190 		if (!MLX5_FEATURE(mvdev, VIRTIO_NET_F_MQ))
2191 			break;
2192 
2193 		read = vringh_iov_pull_iotlb(&cvq->vring, &cvq->riov, (void *)&mq, sizeof(mq));
2194 		if (read != sizeof(mq))
2195 			break;
2196 
2197 		newqps = mlx5vdpa16_to_cpu(mvdev, mq.virtqueue_pairs);
2198 		if (newqps < VIRTIO_NET_CTRL_MQ_VQ_PAIRS_MIN ||
2199 		    newqps > ndev->rqt_size)
2200 			break;
2201 
2202 		if (ndev->cur_num_vqs == 2 * newqps) {
2203 			status = VIRTIO_NET_OK;
2204 			break;
2205 		}
2206 
2207 		if (!change_num_qps(mvdev, newqps))
2208 			status = VIRTIO_NET_OK;
2209 
2210 		break;
2211 	default:
2212 		break;
2213 	}
2214 
2215 	return status;
2216 }
2217 
2218 static virtio_net_ctrl_ack handle_ctrl_vlan(struct mlx5_vdpa_dev *mvdev, u8 cmd)
2219 {
2220 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
2221 	virtio_net_ctrl_ack status = VIRTIO_NET_ERR;
2222 	struct mlx5_control_vq *cvq = &mvdev->cvq;
2223 	__virtio16 vlan;
2224 	size_t read;
2225 	u16 id;
2226 
2227 	if (!(ndev->mvdev.actual_features & BIT_ULL(VIRTIO_NET_F_CTRL_VLAN)))
2228 		return status;
2229 
2230 	switch (cmd) {
2231 	case VIRTIO_NET_CTRL_VLAN_ADD:
2232 		read = vringh_iov_pull_iotlb(&cvq->vring, &cvq->riov, &vlan, sizeof(vlan));
2233 		if (read != sizeof(vlan))
2234 			break;
2235 
2236 		id = mlx5vdpa16_to_cpu(mvdev, vlan);
2237 		if (mac_vlan_add(ndev, ndev->config.mac, id, true))
2238 			break;
2239 
2240 		status = VIRTIO_NET_OK;
2241 		break;
2242 	case VIRTIO_NET_CTRL_VLAN_DEL:
2243 		read = vringh_iov_pull_iotlb(&cvq->vring, &cvq->riov, &vlan, sizeof(vlan));
2244 		if (read != sizeof(vlan))
2245 			break;
2246 
2247 		id = mlx5vdpa16_to_cpu(mvdev, vlan);
2248 		mac_vlan_del(ndev, ndev->config.mac, id, true);
2249 		status = VIRTIO_NET_OK;
2250 		break;
2251 	default:
2252 		break;
2253 	}
2254 
2255 	return status;
2256 }
2257 
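/* Control VQ kick work handler. A single control command is processed per
 * invocation and the work is re-queued, so a long stream of commands does
 * not monopolize the workqueue.
 */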
2258 static void mlx5_cvq_kick_handler(struct work_struct *work)
2259 {
2260 	virtio_net_ctrl_ack status = VIRTIO_NET_ERR;
2261 	struct virtio_net_ctrl_hdr ctrl;
2262 	struct mlx5_vdpa_wq_ent *wqent;
2263 	struct mlx5_vdpa_dev *mvdev;
2264 	struct mlx5_control_vq *cvq;
2265 	struct mlx5_vdpa_net *ndev;
2266 	size_t read, write;
2267 	int err;
2268 
2269 	wqent = container_of(work, struct mlx5_vdpa_wq_ent, work);
2270 	mvdev = wqent->mvdev;
2271 	ndev = to_mlx5_vdpa_ndev(mvdev);
2272 	cvq = &mvdev->cvq;
2273 
2274 	down_write(&ndev->reslock);
2275 
2276 	if (!(mvdev->status & VIRTIO_CONFIG_S_DRIVER_OK))
2277 		goto out;
2278 
2279 	if (!(ndev->mvdev.actual_features & BIT_ULL(VIRTIO_NET_F_CTRL_VQ)))
2280 		goto out;
2281 
2282 	if (!cvq->ready)
2283 		goto out;
2284 
2285 	while (true) {
2286 		err = vringh_getdesc_iotlb(&cvq->vring, &cvq->riov, &cvq->wiov, &cvq->head,
2287 					   GFP_ATOMIC);
2288 		if (err <= 0)
2289 			break;
2290 
2291 		read = vringh_iov_pull_iotlb(&cvq->vring, &cvq->riov, &ctrl, sizeof(ctrl));
2292 		if (read != sizeof(ctrl))
2293 			break;
2294 
2295 		cvq->received_desc++;
2296 		switch (ctrl.class) {
2297 		case VIRTIO_NET_CTRL_MAC:
2298 			status = handle_ctrl_mac(mvdev, ctrl.cmd);
2299 			break;
2300 		case VIRTIO_NET_CTRL_MQ:
2301 			status = handle_ctrl_mq(mvdev, ctrl.cmd);
2302 			break;
2303 		case VIRTIO_NET_CTRL_VLAN:
2304 			status = handle_ctrl_vlan(mvdev, ctrl.cmd);
2305 			break;
2306 		default:
2307 			break;
2308 		}
2309 
2310 		/* Make sure data is written before advancing index */
2311 		smp_wmb();
2312 
2313 		write = vringh_iov_push_iotlb(&cvq->vring, &cvq->wiov, &status, sizeof(status));
2314 		vringh_complete_iotlb(&cvq->vring, cvq->head, write);
2315 		vringh_kiov_cleanup(&cvq->riov);
2316 		vringh_kiov_cleanup(&cvq->wiov);
2317 
2318 		if (vringh_need_notify_iotlb(&cvq->vring))
2319 			vringh_notify(&cvq->vring);
2320 
2321 		cvq->completed_desc++;
2322 		queue_work(mvdev->wq, &wqent->work);
2323 		break;
2324 	}
2325 
2326 out:
2327 	up_write(&ndev->reslock);
2328 }
2329 
2330 static void mlx5_vdpa_kick_vq(struct vdpa_device *vdev, u16 idx)
2331 {
2332 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
2333 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
2334 	struct mlx5_vdpa_virtqueue *mvq;
2335 
2336 	if (!is_index_valid(mvdev, idx))
2337 		return;
2338 
2339 	if (unlikely(is_ctrl_vq_idx(mvdev, idx))) {
2340 		if (!mvdev->wq || !mvdev->cvq.ready)
2341 			return;
2342 
2343 		queue_work(mvdev->wq, &ndev->cvq_ent.work);
2344 		return;
2345 	}
2346 
2347 	mvq = &ndev->vqs[idx];
2348 	if (unlikely(!mvq->ready))
2349 		return;
2350 
2351 	iowrite16(idx, ndev->mvdev.res.kick_addr);
2352 }
2353 
2354 static int mlx5_vdpa_set_vq_address(struct vdpa_device *vdev, u16 idx, u64 desc_area,
2355 				    u64 driver_area, u64 device_area)
2356 {
2357 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
2358 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
2359 	struct mlx5_vdpa_virtqueue *mvq;
2360 
2361 	if (!is_index_valid(mvdev, idx))
2362 		return -EINVAL;
2363 
2364 	if (is_ctrl_vq_idx(mvdev, idx)) {
2365 		mvdev->cvq.desc_addr = desc_area;
2366 		mvdev->cvq.device_addr = device_area;
2367 		mvdev->cvq.driver_addr = driver_area;
2368 		return 0;
2369 	}
2370 
2371 	mvq = &ndev->vqs[idx];
2372 	mvq->desc_addr = desc_area;
2373 	mvq->device_addr = device_area;
2374 	mvq->driver_addr = driver_area;
2375 	mvq->modified_fields |= MLX5_VIRTQ_MODIFY_MASK_VIRTIO_Q_ADDRS;
2376 	return 0;
2377 }
2378 
2379 static void mlx5_vdpa_set_vq_num(struct vdpa_device *vdev, u16 idx, u32 num)
2380 {
2381 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
2382 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
2383 	struct mlx5_vdpa_virtqueue *mvq;
2384 
2385 	if (!is_index_valid(mvdev, idx))
2386 		return;
2387 
2388 	if (is_ctrl_vq_idx(mvdev, idx)) {
2389 		struct mlx5_control_vq *cvq = &mvdev->cvq;
2390 
2391 		cvq->vring.vring.num = num;
2392 		return;
2393 	}
2394 
2395 	mvq = &ndev->vqs[idx];
2396 	ndev->needs_teardown = num != mvq->num_ent;
2397 	mvq->num_ent = num;
2398 }
2399 
2400 static void mlx5_vdpa_set_vq_cb(struct vdpa_device *vdev, u16 idx, struct vdpa_callback *cb)
2401 {
2402 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
2403 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
2404 
2405 	ndev->event_cbs[idx] = *cb;
2406 	if (is_ctrl_vq_idx(mvdev, idx))
2407 		mvdev->cvq.event_cb = *cb;
2408 }
2409 
2410 static void mlx5_cvq_notify(struct vringh *vring)
2411 {
2412 	struct mlx5_control_vq *cvq = container_of(vring, struct mlx5_control_vq, vring);
2413 
2414 	if (!cvq->event_cb.callback)
2415 		return;
2416 
2417 	cvq->event_cb.callback(cvq->event_cb.private);
2418 }
2419 
2420 static void set_cvq_ready(struct mlx5_vdpa_dev *mvdev, bool ready)
2421 {
2422 	struct mlx5_control_vq *cvq = &mvdev->cvq;
2423 
2424 	cvq->ready = ready;
2425 	if (!ready)
2426 		return;
2427 
2428 	cvq->vring.notify = mlx5_cvq_notify;
2429 }
2430 
2431 static void mlx5_vdpa_set_vq_ready(struct vdpa_device *vdev, u16 idx, bool ready)
2432 {
2433 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
2434 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
2435 	struct mlx5_vdpa_virtqueue *mvq;
2436 
2437 	if (!mvdev->actual_features)
2438 		return;
2439 
2440 	if (!is_index_valid(mvdev, idx))
2441 		return;
2442 
2443 	if (is_ctrl_vq_idx(mvdev, idx)) {
2444 		set_cvq_ready(mvdev, ready);
2445 		return;
2446 	}
2447 
2448 	mvq = &ndev->vqs[idx];
2449 	if (!ready) {
2450 		suspend_vq(ndev, mvq);
2451 	} else if (mvdev->status & VIRTIO_CONFIG_S_DRIVER_OK) {
2452 		if (resume_vq(ndev, mvq))
2453 			ready = false;
2454 	}
2455 
2456 	mvq->ready = ready;
2457 }
2458 
2459 static bool mlx5_vdpa_get_vq_ready(struct vdpa_device *vdev, u16 idx)
2460 {
2461 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
2462 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
2463 
2464 	if (!is_index_valid(mvdev, idx))
2465 		return false;
2466 
2467 	if (is_ctrl_vq_idx(mvdev, idx))
2468 		return mvdev->cvq.ready;
2469 
2470 	return ndev->vqs[idx].ready;
2471 }
2472 
2473 static int mlx5_vdpa_set_vq_state(struct vdpa_device *vdev, u16 idx,
2474 				  const struct vdpa_vq_state *state)
2475 {
2476 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
2477 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
2478 	struct mlx5_vdpa_virtqueue *mvq;
2479 
2480 	if (!is_index_valid(mvdev, idx))
2481 		return -EINVAL;
2482 
2483 	if (is_ctrl_vq_idx(mvdev, idx)) {
2484 		mvdev->cvq.vring.last_avail_idx = state->split.avail_index;
2485 		return 0;
2486 	}
2487 
2488 	mvq = &ndev->vqs[idx];
2489 	if (mvq->fw_state == MLX5_VIRTIO_NET_Q_OBJECT_STATE_RDY) {
2490 		mlx5_vdpa_warn(mvdev, "can't modify available index\n");
2491 		return -EINVAL;
2492 	}
2493 
2494 	mvq->used_idx = state->split.avail_index;
2495 	mvq->avail_idx = state->split.avail_index;
2496 	mvq->modified_fields |= MLX5_VIRTQ_MODIFY_MASK_VIRTIO_Q_AVAIL_IDX |
2497 				MLX5_VIRTQ_MODIFY_MASK_VIRTIO_Q_USED_IDX;
2498 	return 0;
2499 }
2500 
2501 static int mlx5_vdpa_get_vq_state(struct vdpa_device *vdev, u16 idx, struct vdpa_vq_state *state)
2502 {
2503 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
2504 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
2505 	struct mlx5_vdpa_virtqueue *mvq;
2506 	struct mlx5_virtq_attr attr;
2507 	int err;
2508 
2509 	if (!is_index_valid(mvdev, idx))
2510 		return -EINVAL;
2511 
2512 	if (is_ctrl_vq_idx(mvdev, idx)) {
2513 		state->split.avail_index = mvdev->cvq.vring.last_avail_idx;
2514 		return 0;
2515 	}
2516 
2517 	mvq = &ndev->vqs[idx];
2518 	/* If the virtq object was destroyed, use the value saved at
2519 	 * the last minute of suspend_vq. This caters for userspace
2520 	 * that cares about emulating the index after vq is stopped.
2521 	 */
2522 	if (!mvq->initialized) {
2523 		/* Firmware returns a wrong value for the available index.
2524 		 * Since both values should be identical, we take the value of
2525 		 * used_idx which is reported correctly.
2526 		 */
2527 		state->split.avail_index = mvq->used_idx;
2528 		return 0;
2529 	}
2530 
2531 	err = query_virtqueue(ndev, mvq, &attr);
2532 	if (err) {
2533 		mlx5_vdpa_warn(mvdev, "failed to query virtqueue\n");
2534 		return err;
2535 	}
2536 	state->split.avail_index = attr.used_index;
2537 	return 0;
2538 }
2539 
2540 static u32 mlx5_vdpa_get_vq_align(struct vdpa_device *vdev)
2541 {
2542 	return PAGE_SIZE;
2543 }
2544 
2545 static u32 mlx5_vdpa_get_vq_group(struct vdpa_device *vdev, u16 idx)
2546 {
2547 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
2548 
2549 	if (is_ctrl_vq_idx(mvdev, idx))
2550 		return MLX5_VDPA_CVQ_GROUP;
2551 
2552 	return MLX5_VDPA_DATAVQ_GROUP;
2553 }
2554 
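/* Data VQ descriptors are reported in a separate group so that userspace may
 * map them through a different address space than the buffers.
 */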
2555 static u32 mlx5_vdpa_get_vq_desc_group(struct vdpa_device *vdev, u16 idx)
2556 {
2557 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
2558 
2559 	if (is_ctrl_vq_idx(mvdev, idx))
2560 		return MLX5_VDPA_CVQ_GROUP;
2561 
2562 	return MLX5_VDPA_DATAVQ_DESC_GROUP;
2563 }
2564 
2565 static u64 mlx_to_virtio_features(u16 dev_features)
2566 {
2567 	u64 result = 0;
2568 
2569 	if (dev_features & BIT_ULL(MLX5_VIRTIO_NET_F_MRG_RXBUF))
2570 		result |= BIT_ULL(VIRTIO_NET_F_MRG_RXBUF);
2571 	if (dev_features & BIT_ULL(MLX5_VIRTIO_NET_F_HOST_ECN))
2572 		result |= BIT_ULL(VIRTIO_NET_F_HOST_ECN);
2573 	if (dev_features & BIT_ULL(MLX5_VIRTIO_NET_F_GUEST_ECN))
2574 		result |= BIT_ULL(VIRTIO_NET_F_GUEST_ECN);
2575 	if (dev_features & BIT_ULL(MLX5_VIRTIO_NET_F_GUEST_TSO6))
2576 		result |= BIT_ULL(VIRTIO_NET_F_GUEST_TSO6);
2577 	if (dev_features & BIT_ULL(MLX5_VIRTIO_NET_F_GUEST_TSO4))
2578 		result |= BIT_ULL(VIRTIO_NET_F_GUEST_TSO4);
2579 	if (dev_features & BIT_ULL(MLX5_VIRTIO_NET_F_GUEST_CSUM))
2580 		result |= BIT_ULL(VIRTIO_NET_F_GUEST_CSUM);
2581 	if (dev_features & BIT_ULL(MLX5_VIRTIO_NET_F_CSUM))
2582 		result |= BIT_ULL(VIRTIO_NET_F_CSUM);
2583 	if (dev_features & BIT_ULL(MLX5_VIRTIO_NET_F_HOST_TSO6))
2584 		result |= BIT_ULL(VIRTIO_NET_F_HOST_TSO6);
2585 	if (dev_features & BIT_ULL(MLX5_VIRTIO_NET_F_HOST_TSO4))
2586 		result |= BIT_ULL(VIRTIO_NET_F_HOST_TSO4);
2587 
2588 	return result;
2589 }
2590 
2591 static u64 get_supported_features(struct mlx5_core_dev *mdev)
2592 {
2593 	u64 mlx_vdpa_features = 0;
2594 	u16 dev_features;
2595 
2596 	dev_features = MLX5_CAP_DEV_VDPA_EMULATION(mdev, device_features_bits_mask);
2597 	mlx_vdpa_features |= mlx_to_virtio_features(dev_features);
2598 	if (MLX5_CAP_DEV_VDPA_EMULATION(mdev, virtio_version_1_0))
2599 		mlx_vdpa_features |= BIT_ULL(VIRTIO_F_VERSION_1);
2600 	mlx_vdpa_features |= BIT_ULL(VIRTIO_F_ACCESS_PLATFORM);
2601 	mlx_vdpa_features |= BIT_ULL(VIRTIO_NET_F_CTRL_VQ);
2602 	mlx_vdpa_features |= BIT_ULL(VIRTIO_NET_F_CTRL_MAC_ADDR);
2603 	mlx_vdpa_features |= BIT_ULL(VIRTIO_NET_F_MQ);
2604 	mlx_vdpa_features |= BIT_ULL(VIRTIO_NET_F_STATUS);
2605 	mlx_vdpa_features |= BIT_ULL(VIRTIO_NET_F_MTU);
2606 	mlx_vdpa_features |= BIT_ULL(VIRTIO_NET_F_CTRL_VLAN);
2607 	mlx_vdpa_features |= BIT_ULL(VIRTIO_NET_F_MAC);
2608 
2609 	return mlx_vdpa_features;
2610 }
2611 
2612 static u64 mlx5_vdpa_get_device_features(struct vdpa_device *vdev)
2613 {
2614 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
2615 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
2616 
2617 	print_features(mvdev, ndev->mvdev.mlx_features, false);
2618 	return ndev->mvdev.mlx_features;
2619 }
2620 
2621 static int verify_driver_features(struct mlx5_vdpa_dev *mvdev, u64 features)
2622 {
2623 	/* Minimum features to expect */
2624 	if (!(features & BIT_ULL(VIRTIO_F_ACCESS_PLATFORM)))
2625 		return -EOPNOTSUPP;
2626 
2627 	/* Double check features combination sent down by the driver.
2628 	 * Fail invalid features due to absence of the depended feature.
2629 	 *
2630 	 * Per VIRTIO v1.1 specification, section 5.1.3.1 Feature bit
2631 	 * requirements: "VIRTIO_NET_F_MQ Requires VIRTIO_NET_F_CTRL_VQ".
2632 	 * By failing the invalid features sent down by untrusted drivers,
2633 	 * we're assured the assumption made upon is_index_valid() and
2634 	 * is_ctrl_vq_idx() will not be compromised.
2635 	 */
2636 	if ((features & (BIT_ULL(VIRTIO_NET_F_MQ) | BIT_ULL(VIRTIO_NET_F_CTRL_VQ))) ==
2637 	    BIT_ULL(VIRTIO_NET_F_MQ))
2638 		return -EINVAL;
2639 
2640 	return 0;
2641 }
2642 
2643 static int setup_virtqueues(struct mlx5_vdpa_dev *mvdev, bool filled)
2644 {
2645 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
2646 	int err;
2647 	int i;
2648 
2649 	for (i = 0; i < mvdev->max_vqs; i++) {
2650 		err = setup_vq(ndev, &ndev->vqs[i], filled);
2651 		if (err)
2652 			goto err_vq;
2653 	}
2654 
2655 	return 0;
2656 
2657 err_vq:
2658 	for (--i; i >= 0; i--)
2659 		teardown_vq(ndev, &ndev->vqs[i]);
2660 
2661 	return err;
2662 }
2663 
2664 static void teardown_virtqueues(struct mlx5_vdpa_net *ndev)
2665 {
2666 	int i;
2667 
2668 	for (i = ndev->mvdev.max_vqs - 1; i >= 0; i--)
2669 		teardown_vq(ndev, &ndev->vqs[i]);
2670 }
2671 
2672 static void update_cvq_info(struct mlx5_vdpa_dev *mvdev)
2673 {
2674 	if (MLX5_FEATURE(mvdev, VIRTIO_NET_F_CTRL_VQ)) {
2675 		if (MLX5_FEATURE(mvdev, VIRTIO_NET_F_MQ)) {
2676 			/* MQ supported. CVQ index is right above the last data virtqueue's */
2677 			mvdev->max_idx = mvdev->max_vqs;
2678 		} else {
2679 			/* Only CVQ supported. Data virtqueues occupy indices 0 and 1.
2680 			 * CVQ gets index 2
2681 			 */
2682 			mvdev->max_idx = 2;
2683 		}
2684 	} else {
2685 		/* Two data virtqueues only: one for rx and one for tx */
2686 		mvdev->max_idx = 1;
2687 	}
2688 }
2689 
2690 static u8 query_vport_state(struct mlx5_core_dev *mdev, u8 opmod, u16 vport)
2691 {
2692 	u32 out[MLX5_ST_SZ_DW(query_vport_state_out)] = {};
2693 	u32 in[MLX5_ST_SZ_DW(query_vport_state_in)] = {};
2694 	int err;
2695 
2696 	MLX5_SET(query_vport_state_in, in, opcode, MLX5_CMD_OP_QUERY_VPORT_STATE);
2697 	MLX5_SET(query_vport_state_in, in, op_mod, opmod);
2698 	MLX5_SET(query_vport_state_in, in, vport_number, vport);
2699 	if (vport)
2700 		MLX5_SET(query_vport_state_in, in, other_vport, 1);
2701 
2702 	err = mlx5_cmd_exec_inout(mdev, query_vport_state, in, out);
2703 	if (err)
2704 		return 0;
2705 
2706 	return MLX5_GET(query_vport_state_out, out, state);
2707 }
2708 
2709 static bool get_link_state(struct mlx5_vdpa_dev *mvdev)
2710 {
2711 	if (query_vport_state(mvdev->mdev, MLX5_VPORT_STATE_OP_MOD_VNIC_VPORT, 0) ==
2712 	    VPORT_STATE_UP)
2713 		return true;
2714 
2715 	return false;
2716 }
2717 
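/* Work handler that refreshes VIRTIO_NET_S_LINK_UP in the device config
 * according to the vport state and signals a config change to the driver.
 */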
2718 static void update_carrier(struct work_struct *work)
2719 {
2720 	struct mlx5_vdpa_wq_ent *wqent;
2721 	struct mlx5_vdpa_dev *mvdev;
2722 	struct mlx5_vdpa_net *ndev;
2723 
2724 	wqent = container_of(work, struct mlx5_vdpa_wq_ent, work);
2725 	mvdev = wqent->mvdev;
2726 	ndev = to_mlx5_vdpa_ndev(mvdev);
2727 	if (get_link_state(mvdev))
2728 		ndev->config.status |= cpu_to_mlx5vdpa16(mvdev, VIRTIO_NET_S_LINK_UP);
2729 	else
2730 		ndev->config.status &= cpu_to_mlx5vdpa16(mvdev, ~VIRTIO_NET_S_LINK_UP);
2731 
2732 	if (ndev->config_cb.callback)
2733 		ndev->config_cb.callback(ndev->config_cb.private);
2734 
2735 	kfree(wqent);
2736 }
2737 
2738 static int queue_link_work(struct mlx5_vdpa_net *ndev)
2739 {
2740 	struct mlx5_vdpa_wq_ent *wqent;
2741 
2742 	wqent = kzalloc(sizeof(*wqent), GFP_ATOMIC);
2743 	if (!wqent)
2744 		return -ENOMEM;
2745 
2746 	wqent->mvdev = &ndev->mvdev;
2747 	INIT_WORK(&wqent->work, update_carrier);
2748 	queue_work(ndev->mvdev.wq, &wqent->work);
2749 	return 0;
2750 }
2751 
2752 static int event_handler(struct notifier_block *nb, unsigned long event, void *param)
2753 {
2754 	struct mlx5_vdpa_net *ndev = container_of(nb, struct mlx5_vdpa_net, nb);
2755 	struct mlx5_eqe *eqe = param;
2756 	int ret = NOTIFY_DONE;
2757 
2758 	if (event == MLX5_EVENT_TYPE_PORT_CHANGE) {
2759 		switch (eqe->sub_type) {
2760 		case MLX5_PORT_CHANGE_SUBTYPE_DOWN:
2761 		case MLX5_PORT_CHANGE_SUBTYPE_ACTIVE:
2762 			if (queue_link_work(ndev))
2763 				return NOTIFY_DONE;
2764 
2765 			ret = NOTIFY_OK;
2766 			break;
2767 		default:
2768 			return NOTIFY_DONE;
2769 		}
2770 		return ret;
2771 	}
2772 	return ret;
2773 }
2774 
2775 static void register_link_notifier(struct mlx5_vdpa_net *ndev)
2776 {
2777 	if (!(ndev->mvdev.actual_features & BIT_ULL(VIRTIO_NET_F_STATUS)))
2778 		return;
2779 
2780 	ndev->nb.notifier_call = event_handler;
2781 	mlx5_notifier_register(ndev->mvdev.mdev, &ndev->nb);
2782 	ndev->nb_registered = true;
2783 	queue_link_work(ndev);
2784 }
2785 
2786 static void unregister_link_notifier(struct mlx5_vdpa_net *ndev)
2787 {
2788 	if (!ndev->nb_registered)
2789 		return;
2790 
2791 	ndev->nb_registered = false;
2792 	mlx5_notifier_unregister(ndev->mvdev.mdev, &ndev->nb);
2793 	if (ndev->mvdev.wq)
2794 		flush_workqueue(ndev->mvdev.wq);
2795 }
2796 
2797 static u64 mlx5_vdpa_get_backend_features(const struct vdpa_device *vdpa)
2798 {
2799 	return BIT_ULL(VHOST_BACKEND_F_ENABLE_AFTER_DRIVER_OK);
2800 }
2801 
2802 static int mlx5_vdpa_set_driver_features(struct vdpa_device *vdev, u64 features)
2803 {
2804 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
2805 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
2806 	u64 old_features = mvdev->actual_features;
2807 	u64 diff_features;
2808 	int err;
2809 
2810 	print_features(mvdev, features, true);
2811 
2812 	err = verify_driver_features(mvdev, features);
2813 	if (err)
2814 		return err;
2815 
2816 	ndev->mvdev.actual_features = features & ndev->mvdev.mlx_features;
2817 
2818 	/* Interested in changes of vq features only. */
2819 	if (get_features(old_features) != get_features(mvdev->actual_features)) {
2820 		for (int i = 0; i < mvdev->max_vqs; ++i) {
2821 			struct mlx5_vdpa_virtqueue *mvq = &ndev->vqs[i];
2822 
2823 			mvq->modified_fields |= (
2824 				MLX5_VIRTQ_MODIFY_MASK_QUEUE_VIRTIO_VERSION |
2825 				MLX5_VIRTQ_MODIFY_MASK_QUEUE_FEATURES
2826 			);
2827 		}
2828 	}
2829 
2830 	/* When the features below diverge from the initial device features, the VQs need a full teardown. */
2831 #define NEEDS_TEARDOWN_MASK (BIT_ULL(VIRTIO_NET_F_MRG_RXBUF) | \
2832 			     BIT_ULL(VIRTIO_NET_F_CSUM) | \
2833 			     BIT_ULL(VIRTIO_F_VERSION_1))
2834 
2835 	diff_features = mvdev->mlx_features ^ mvdev->actual_features;
2836 	ndev->needs_teardown = !!(diff_features & NEEDS_TEARDOWN_MASK);
2837 
2838 	update_cvq_info(mvdev);
2839 	return err;
2840 }
2841 
2842 static void mlx5_vdpa_set_config_cb(struct vdpa_device *vdev, struct vdpa_callback *cb)
2843 {
2844 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
2845 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
2846 
2847 	ndev->config_cb = *cb;
2848 }
2849 
2850 #define MLX5_VDPA_MAX_VQ_ENTRIES 256
2851 static u16 mlx5_vdpa_get_vq_num_max(struct vdpa_device *vdev)
2852 {
2853 	return MLX5_VDPA_MAX_VQ_ENTRIES;
2854 }
2855 
2856 static u32 mlx5_vdpa_get_device_id(struct vdpa_device *vdev)
2857 {
2858 	return VIRTIO_ID_NET;
2859 }
2860 
2861 static u32 mlx5_vdpa_get_vendor_id(struct vdpa_device *vdev)
2862 {
2863 	return PCI_VENDOR_ID_MELLANOX;
2864 }
2865 
2866 static u8 mlx5_vdpa_get_status(struct vdpa_device *vdev)
2867 {
2868 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
2869 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
2870 
2871 	print_status(mvdev, ndev->mvdev.status, false);
2872 	return ndev->mvdev.status;
2873 }
2874 
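/* Snapshot the addresses, size and hardware indices of a virtqueue so that
 * it can be restored after its resources are re-created, e.g. on a map
 * change for devices that cannot suspend/resume.
 */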
2875 static int save_channel_info(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq)
2876 {
2877 	struct mlx5_vq_restore_info *ri = &mvq->ri;
2878 	struct mlx5_virtq_attr attr = {};
2879 	int err;
2880 
2881 	if (mvq->initialized) {
2882 		err = query_virtqueue(ndev, mvq, &attr);
2883 		if (err)
2884 			return err;
2885 	}
2886 
2887 	ri->avail_index = attr.available_index;
2888 	ri->used_index = attr.used_index;
2889 	ri->ready = mvq->ready;
2890 	ri->num_ent = mvq->num_ent;
2891 	ri->desc_addr = mvq->desc_addr;
2892 	ri->device_addr = mvq->device_addr;
2893 	ri->driver_addr = mvq->driver_addr;
2894 	ri->map = mvq->map;
2895 	ri->restore = true;
2896 	return 0;
2897 }
2898 
2899 static int save_channels_info(struct mlx5_vdpa_net *ndev)
2900 {
2901 	int i;
2902 
2903 	for (i = 0; i < ndev->mvdev.max_vqs; i++) {
2904 		memset(&ndev->vqs[i].ri, 0, sizeof(ndev->vqs[i].ri));
2905 		save_channel_info(ndev, &ndev->vqs[i]);
2906 	}
2907 	return 0;
2908 }
2909 
2910 static void mlx5_clear_vqs(struct mlx5_vdpa_net *ndev)
2911 {
2912 	int i;
2913 
2914 	for (i = 0; i < ndev->mvdev.max_vqs; i++)
2915 		memset(&ndev->vqs[i], 0, offsetof(struct mlx5_vdpa_virtqueue, ri));
2916 }
2917 
2918 static void restore_channels_info(struct mlx5_vdpa_net *ndev)
2919 {
2920 	struct mlx5_vdpa_virtqueue *mvq;
2921 	struct mlx5_vq_restore_info *ri;
2922 	int i;
2923 
2924 	mlx5_clear_vqs(ndev);
2925 	mvqs_set_defaults(ndev);
2926 	for (i = 0; i < ndev->mvdev.max_vqs; i++) {
2927 		mvq = &ndev->vqs[i];
2928 		ri = &mvq->ri;
2929 		if (!ri->restore)
2930 			continue;
2931 
2932 		mvq->avail_idx = ri->avail_index;
2933 		mvq->used_idx = ri->used_index;
2934 		mvq->ready = ri->ready;
2935 		mvq->num_ent = ri->num_ent;
2936 		mvq->desc_addr = ri->desc_addr;
2937 		mvq->device_addr = ri->device_addr;
2938 		mvq->driver_addr = ri->driver_addr;
2939 		mvq->map = ri->map;
2940 	}
2941 }
2942 
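/* Apply a new memory mapping. Resumable devices only need their VQs
 * suspended and their MKEYs updated; other devices must save the VQ state,
 * then tear down and re-create the VQ resources under the new mapping. VQs
 * are resumed afterwards if the device is in DRIVER_OK and not suspended.
 */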
2943 static int mlx5_vdpa_change_map(struct mlx5_vdpa_dev *mvdev,
2944 				struct mlx5_vdpa_mr *new_mr,
2945 				unsigned int asid)
2946 {
2947 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
2948 	bool teardown = !is_resumable(ndev);
2949 	int err;
2950 
2951 	suspend_vqs(ndev);
2952 	if (teardown) {
2953 		err = save_channels_info(ndev);
2954 		if (err)
2955 			return err;
2956 
2957 		teardown_vq_resources(ndev);
2958 	}
2959 
2960 	mlx5_vdpa_update_mr(mvdev, new_mr, asid);
2961 
2962 	for (int i = 0; i < mvdev->max_vqs; i++)
2963 		ndev->vqs[i].modified_fields |= MLX5_VIRTQ_MODIFY_MASK_VIRTIO_Q_MKEY |
2964 						MLX5_VIRTQ_MODIFY_MASK_DESC_GROUP_MKEY;
2965 
2966 	if (!(mvdev->status & VIRTIO_CONFIG_S_DRIVER_OK) || mvdev->suspended)
2967 		return 0;
2968 
2969 	if (teardown) {
2970 		restore_channels_info(ndev);
2971 		err = setup_vq_resources(ndev, true);
2972 		if (err)
2973 			return err;
2974 	}
2975 
2976 	resume_vqs(ndev);
2977 
2978 	return 0;
2979 }
2980 
2981 /* reslock must be held for this function */
2982 static int setup_vq_resources(struct mlx5_vdpa_net *ndev, bool filled)
2983 {
2984 	struct mlx5_vdpa_dev *mvdev = &ndev->mvdev;
2985 	int err;
2986 
2987 	WARN_ON(!rwsem_is_locked(&ndev->reslock));
2988 
2989 	if (ndev->setup) {
2990 		mlx5_vdpa_warn(mvdev, "setup driver called for already setup driver\n");
2991 		err = 0;
2992 		goto out;
2993 	}
2994 	mlx5_vdpa_add_debugfs(ndev);
2995 
2996 	err = read_umem_params(ndev);
2997 	if (err)
2998 		goto err_setup;
2999 
3000 	err = setup_virtqueues(mvdev, filled);
3001 	if (err) {
3002 		mlx5_vdpa_warn(mvdev, "setup_virtqueues\n");
3003 		goto err_setup;
3004 	}
3005 
3006 	err = create_rqt(ndev);
3007 	if (err) {
3008 		mlx5_vdpa_warn(mvdev, "create_rqt\n");
3009 		goto err_rqt;
3010 	}
3011 
3012 	err = create_tir(ndev);
3013 	if (err) {
3014 		mlx5_vdpa_warn(mvdev, "create_tir\n");
3015 		goto err_tir;
3016 	}
3017 
3018 	err = setup_steering(ndev);
3019 	if (err) {
3020 		mlx5_vdpa_warn(mvdev, "setup_steering\n");
3021 		goto err_fwd;
3022 	}
3023 	ndev->setup = true;
3024 
3025 	return 0;
3026 
3027 err_fwd:
3028 	destroy_tir(ndev);
3029 err_tir:
3030 	destroy_rqt(ndev);
3031 err_rqt:
3032 	teardown_virtqueues(ndev);
3033 err_setup:
3034 	mlx5_vdpa_remove_debugfs(ndev);
3035 out:
3036 	return err;
3037 }
3038 
3039 /* reslock must be held for this function */
3040 static void teardown_vq_resources(struct mlx5_vdpa_net *ndev)
3041 {
3042 
3043 	WARN_ON(!rwsem_is_locked(&ndev->reslock));
3044 
3045 	if (!ndev->setup)
3046 		return;
3047 
3048 	mlx5_vdpa_remove_debugfs(ndev);
3049 	teardown_steering(ndev);
3050 	destroy_tir(ndev);
3051 	destroy_rqt(ndev);
3052 	teardown_virtqueues(ndev);
3053 	ndev->setup = false;
3054 	ndev->needs_teardown = false;
3055 }
3056 
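/* Initialize the control VQ vringh instance from the ring addresses and size
 * programmed by the driver, preserving the last available index across
 * re-initialization.
 */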
3057 static int setup_cvq_vring(struct mlx5_vdpa_dev *mvdev)
3058 {
3059 	struct mlx5_control_vq *cvq = &mvdev->cvq;
3060 	int err = 0;
3061 
3062 	if (mvdev->actual_features & BIT_ULL(VIRTIO_NET_F_CTRL_VQ)) {
3063 		u16 idx = cvq->vring.last_avail_idx;
3064 
3065 		err = vringh_init_iotlb(&cvq->vring, mvdev->actual_features,
3066 					cvq->vring.vring.num, false,
3067 					(struct vring_desc *)(uintptr_t)cvq->desc_addr,
3068 					(struct vring_avail *)(uintptr_t)cvq->driver_addr,
3069 					(struct vring_used *)(uintptr_t)cvq->device_addr);
3070 
3071 		if (!err)
3072 			cvq->vring.last_avail_idx = cvq->vring.last_used_idx = idx;
3073 	}
3074 	return err;
3075 }
3076 
3077 static void mlx5_vdpa_set_status(struct vdpa_device *vdev, u8 status)
3078 {
3079 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
3080 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
3081 	int err;
3082 
3083 	print_status(mvdev, status, true);
3084 
3085 	down_write(&ndev->reslock);
3086 
3087 	if ((status ^ ndev->mvdev.status) & VIRTIO_CONFIG_S_DRIVER_OK) {
3088 		if (status & VIRTIO_CONFIG_S_DRIVER_OK) {
3089 			err = setup_cvq_vring(mvdev);
3090 			if (err) {
3091 				mlx5_vdpa_warn(mvdev, "failed to setup control VQ vring\n");
3092 				goto err_setup;
3093 			}
3094 			register_link_notifier(ndev);
3095 
3096 			if (ndev->needs_teardown)
3097 				teardown_vq_resources(ndev);
3098 
3099 			if (ndev->setup) {
3100 				err = resume_vqs(ndev);
3101 				if (err) {
3102 					mlx5_vdpa_warn(mvdev, "failed to resume VQs\n");
3103 					goto err_driver;
3104 				}
3105 			} else {
3106 				err = setup_vq_resources(ndev, true);
3107 				if (err) {
3108 					mlx5_vdpa_warn(mvdev, "failed to setup driver\n");
3109 					goto err_driver;
3110 				}
3111 			}
3112 		} else {
3113 			mlx5_vdpa_warn(mvdev, "did not expect DRIVER_OK to be cleared\n");
3114 			goto err_clear;
3115 		}
3116 	}
3117 
3118 	ndev->mvdev.status = status;
3119 	up_write(&ndev->reslock);
3120 	return;
3121 
3122 err_driver:
3123 	unregister_link_notifier(ndev);
3124 err_setup:
3125 	mlx5_vdpa_destroy_mr_resources(&ndev->mvdev);
3126 	ndev->mvdev.status |= VIRTIO_CONFIG_S_FAILED;
3127 err_clear:
3128 	up_write(&ndev->reslock);
3129 }
3130 
3131 static void init_group_to_asid_map(struct mlx5_vdpa_dev *mvdev)
3132 {
3133 	int i;
3134 
3135 	/* By default, all groups are mapped to ASID 0. */
3136 	for (i = 0; i < MLX5_VDPA_NUMVQ_GROUPS; i++)
3137 		mvdev->group2asid[i] = 0;
3138 }
3139 
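/* A reset must tear down and re-create the VQs if the driver reached
 * DRIVER_OK, if the first VQ left its initial firmware state, or if there
 * are pending modifications to the VQ state, ring addresses or avail/used
 * indices.
 */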
3140 static bool needs_vqs_reset(const struct mlx5_vdpa_dev *mvdev)
3141 {
3142 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
3143 	struct mlx5_vdpa_virtqueue *mvq = &ndev->vqs[0];
3144 
3145 	if (mvdev->status & VIRTIO_CONFIG_S_DRIVER_OK)
3146 		return true;
3147 
3148 	if (mvq->fw_state != MLX5_VIRTIO_NET_Q_OBJECT_STATE_INIT)
3149 		return true;
3150 
3151 	return mvq->modified_fields & (
3152 		MLX5_VIRTQ_MODIFY_MASK_STATE |
3153 		MLX5_VIRTQ_MODIFY_MASK_VIRTIO_Q_ADDRS |
3154 		MLX5_VIRTQ_MODIFY_MASK_VIRTIO_Q_AVAIL_IDX |
3155 		MLX5_VIRTQ_MODIFY_MASK_VIRTIO_Q_USED_IDX
3156 	);
3157 }
3158 
3159 static int mlx5_vdpa_compat_reset(struct vdpa_device *vdev, u32 flags)
3160 {
3161 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
3162 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
3163 	bool vq_reset;
3164 
3165 	print_status(mvdev, 0, true);
3166 	mlx5_vdpa_info(mvdev, "performing device reset\n");
3167 
3168 	down_write(&ndev->reslock);
3169 	unregister_link_notifier(ndev);
3170 	vq_reset = needs_vqs_reset(mvdev);
3171 	if (vq_reset) {
3172 		teardown_vq_resources(ndev);
3173 		mvqs_set_defaults(ndev);
3174 	}
3175 
3176 	if (flags & VDPA_RESET_F_CLEAN_MAP)
3177 		mlx5_vdpa_destroy_mr_resources(&ndev->mvdev);
3178 	ndev->mvdev.status = 0;
3179 	ndev->mvdev.suspended = false;
3180 	ndev->cur_num_vqs = MLX5V_DEFAULT_VQ_COUNT;
3181 	ndev->mvdev.cvq.ready = false;
3182 	ndev->mvdev.cvq.received_desc = 0;
3183 	ndev->mvdev.cvq.completed_desc = 0;
3184 	memset(ndev->event_cbs, 0, sizeof(*ndev->event_cbs) * (mvdev->max_vqs + 1));
3185 	ndev->mvdev.actual_features = 0;
3186 	init_group_to_asid_map(mvdev);
3187 	++mvdev->generation;
3188 
3189 	if ((flags & VDPA_RESET_F_CLEAN_MAP) &&
3190 	    MLX5_CAP_GEN(mvdev->mdev, umem_uid_0)) {
3191 		if (mlx5_vdpa_create_dma_mr(mvdev))
3192 			mlx5_vdpa_warn(mvdev, "create MR failed\n");
3193 	}
3194 	if (vq_reset)
3195 		setup_vq_resources(ndev, false);
3196 	up_write(&ndev->reslock);
3197 
3198 	return 0;
3199 }
3200 
3201 static int mlx5_vdpa_reset(struct vdpa_device *vdev)
3202 {
3203 	return mlx5_vdpa_compat_reset(vdev, 0);
3204 }
3205 
3206 static size_t mlx5_vdpa_get_config_size(struct vdpa_device *vdev)
3207 {
3208 	return sizeof(struct virtio_net_config);
3209 }
3210 
3211 static void mlx5_vdpa_get_config(struct vdpa_device *vdev, unsigned int offset, void *buf,
3212 				 unsigned int len)
3213 {
3214 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
3215 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
3216 
3217 	if (offset + len <= sizeof(struct virtio_net_config))
3218 		memcpy(buf, (u8 *)&ndev->config + offset, len);
3219 }
3220 
3221 static void mlx5_vdpa_set_config(struct vdpa_device *vdev, unsigned int offset, const void *buf,
3222 				 unsigned int len)
3223 {
3224 	/* not supported */
3225 }
3226 
3227 static u32 mlx5_vdpa_get_generation(struct vdpa_device *vdev)
3228 {
3229 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
3230 
3231 	return mvdev->generation;
3232 }
3233 
3234 static int set_map_data(struct mlx5_vdpa_dev *mvdev, struct vhost_iotlb *iotlb,
3235 			unsigned int asid)
3236 {
3237 	struct mlx5_vdpa_mr *new_mr;
3238 	int err;
3239 
3240 	if (asid >= MLX5_VDPA_NUM_AS)
3241 		return -EINVAL;
3242 
3243 	if (vhost_iotlb_itree_first(iotlb, 0, U64_MAX)) {
3244 		new_mr = mlx5_vdpa_create_mr(mvdev, iotlb);
3245 		if (IS_ERR(new_mr)) {
3246 			err = PTR_ERR(new_mr);
3247 			mlx5_vdpa_warn(mvdev, "create map failed(%d)\n", err);
3248 			return err;
3249 		}
3250 	} else {
3251 		/* Empty iotlbs don't have an mr but will clear the previous mr. */
3252 		new_mr = NULL;
3253 	}
3254 
3255 	if (!mvdev->mr[asid]) {
3256 		mlx5_vdpa_update_mr(mvdev, new_mr, asid);
3257 	} else {
3258 		err = mlx5_vdpa_change_map(mvdev, new_mr, asid);
3259 		if (err) {
3260 			mlx5_vdpa_warn(mvdev, "change map failed(%d)\n", err);
3261 			goto out_err;
3262 		}
3263 	}
3264 
3265 	return mlx5_vdpa_update_cvq_iotlb(mvdev, iotlb, asid);
3266 
3267 out_err:
3268 	mlx5_vdpa_put_mr(mvdev, new_mr);
3269 	return err;
3270 }
3271 
3272 static int mlx5_vdpa_set_map(struct vdpa_device *vdev, unsigned int asid,
3273 			     struct vhost_iotlb *iotlb)
3274 {
3275 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
3276 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
3277 	int err = -EINVAL;
3278 
3279 	down_write(&ndev->reslock);
3280 	err = set_map_data(mvdev, iotlb, asid);
3281 	up_write(&ndev->reslock);
3282 	return err;
3283 }
3284 
3285 static int mlx5_vdpa_reset_map(struct vdpa_device *vdev, unsigned int asid)
3286 {
3287 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
3288 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
3289 	int err;
3290 
3291 	down_write(&ndev->reslock);
3292 	err = mlx5_vdpa_reset_mr(mvdev, asid);
3293 	up_write(&ndev->reslock);
3294 	return err;
3295 }
3296 
3297 static struct device *mlx5_get_vq_dma_dev(struct vdpa_device *vdev, u16 idx)
3298 {
3299 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
3300 
3301 	if (is_ctrl_vq_idx(mvdev, idx))
3302 		return &vdev->dev;
3303 
3304 	return mvdev->vdev.dma_dev;
3305 }
3306 
3307 static void free_irqs(struct mlx5_vdpa_net *ndev)
3308 {
3309 	struct mlx5_vdpa_irq_pool_entry *ent;
3310 	int i;
3311 
3312 	if (!msix_mode_supported(&ndev->mvdev))
3313 		return;
3314 
3315 	if (!ndev->irqp.entries)
3316 		return;
3317 
3318 	for (i = ndev->irqp.num_ent - 1; i >= 0; i--) {
3319 		ent = ndev->irqp.entries + i;
3320 		if (ent->map.virq)
3321 			pci_msix_free_irq(ndev->mvdev.mdev->pdev, ent->map);
3322 	}
3323 	kfree(ndev->irqp.entries);
3324 }
3325 
3326 static void mlx5_vdpa_free(struct vdpa_device *vdev)
3327 {
3328 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
3329 	struct mlx5_core_dev *pfmdev;
3330 	struct mlx5_vdpa_net *ndev;
3331 
3332 	ndev = to_mlx5_vdpa_ndev(mvdev);
3333 
3334 	free_fixed_resources(ndev);
3335 	mlx5_vdpa_destroy_mr_resources(mvdev);
3336 	if (!is_zero_ether_addr(ndev->config.mac)) {
3337 		pfmdev = pci_get_drvdata(pci_physfn(mvdev->mdev->pdev));
3338 		mlx5_mpfs_del_mac(pfmdev, ndev->config.mac);
3339 	}
3340 	mlx5_vdpa_free_resources(&ndev->mvdev);
3341 	free_irqs(ndev);
3342 	kfree(ndev->event_cbs);
3343 	kfree(ndev->vqs);
3344 }
3345 
3346 static struct vdpa_notification_area mlx5_get_vq_notification(struct vdpa_device *vdev, u16 idx)
3347 {
3348 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
3349 	struct vdpa_notification_area ret = {};
3350 	struct mlx5_vdpa_net *ndev;
3351 	phys_addr_t addr;
3352 
3353 	if (!is_index_valid(mvdev, idx) || is_ctrl_vq_idx(mvdev, idx))
3354 		return ret;
3355 
3356 	/* If the SF BAR size is smaller than PAGE_SIZE, do not use direct
3357 	 * notification, to avoid the risk of mapping pages that contain the BARs
3358 	 * of more than one SF.
3359 	 */
3360 	if (MLX5_CAP_GEN(mvdev->mdev, log_min_sf_size) + 12 < PAGE_SHIFT)
3361 		return ret;
3362 
3363 	ndev = to_mlx5_vdpa_ndev(mvdev);
3364 	addr = (phys_addr_t)ndev->mvdev.res.phys_kick_addr;
3365 	ret.addr = addr;
3366 	ret.size = PAGE_SIZE;
3367 	return ret;
3368 }
3369 
3370 static int mlx5_get_vq_irq(struct vdpa_device *vdev, u16 idx)
3371 {
3372 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
3373 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
3374 	struct mlx5_vdpa_virtqueue *mvq;
3375 
3376 	if (!is_index_valid(mvdev, idx))
3377 		return -EINVAL;
3378 
3379 	if (is_ctrl_vq_idx(mvdev, idx))
3380 		return -EOPNOTSUPP;
3381 
3382 	mvq = &ndev->vqs[idx];
3383 	if (!mvq->map.virq)
3384 		return -EOPNOTSUPP;
3385 
3386 	return mvq->map.virq;
3387 }
3388 
3389 static u64 mlx5_vdpa_get_driver_features(struct vdpa_device *vdev)
3390 {
3391 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
3392 
3393 	return mvdev->actual_features;
3394 }
3395 
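/* Query the received/completed descriptor counters of a virtqueue from its
 * virtio_q_counters object. Only valid while the VQ is in the RDY state.
 */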
3396 static int counter_set_query(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq,
3397 			     u64 *received_desc, u64 *completed_desc)
3398 {
3399 	u32 in[MLX5_ST_SZ_DW(query_virtio_q_counters_in)] = {};
3400 	u32 out[MLX5_ST_SZ_DW(query_virtio_q_counters_out)] = {};
3401 	void *cmd_hdr;
3402 	void *ctx;
3403 	int err;
3404 
3405 	if (!counters_supported(&ndev->mvdev))
3406 		return -EOPNOTSUPP;
3407 
3408 	if (mvq->fw_state != MLX5_VIRTIO_NET_Q_OBJECT_STATE_RDY)
3409 		return -EAGAIN;
3410 
3411 	cmd_hdr = MLX5_ADDR_OF(query_virtio_q_counters_in, in, hdr);
3412 
3413 	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, opcode, MLX5_CMD_OP_QUERY_GENERAL_OBJECT);
3414 	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, obj_type, MLX5_OBJ_TYPE_VIRTIO_Q_COUNTERS);
3415 	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, uid, ndev->mvdev.res.uid);
3416 	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, obj_id, mvq->counter_set_id);
3417 
3418 	err = mlx5_cmd_exec(ndev->mvdev.mdev, in, sizeof(in), out, sizeof(out));
3419 	if (err)
3420 		return err;
3421 
3422 	ctx = MLX5_ADDR_OF(query_virtio_q_counters_out, out, counters);
3423 	*received_desc = MLX5_GET64(virtio_q_counters, ctx, received_desc);
3424 	*completed_desc = MLX5_GET64(virtio_q_counters, ctx, completed_desc);
3425 	return 0;
3426 }
3427 
3428 static int mlx5_vdpa_get_vendor_vq_stats(struct vdpa_device *vdev, u16 idx,
3429 					 struct sk_buff *msg,
3430 					 struct netlink_ext_ack *extack)
3431 {
3432 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
3433 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
3434 	struct mlx5_vdpa_virtqueue *mvq;
3435 	struct mlx5_control_vq *cvq;
3436 	u64 received_desc;
3437 	u64 completed_desc;
3438 	int err = 0;
3439 
3440 	down_read(&ndev->reslock);
3441 	if (!is_index_valid(mvdev, idx)) {
3442 		NL_SET_ERR_MSG_MOD(extack, "virtqueue index is not valid");
3443 		err = -EINVAL;
3444 		goto out_err;
3445 	}
3446 
3447 	if (idx == ctrl_vq_idx(mvdev)) {
3448 		cvq = &mvdev->cvq;
3449 		received_desc = cvq->received_desc;
3450 		completed_desc = cvq->completed_desc;
3451 		goto out;
3452 	}
3453 
3454 	mvq = &ndev->vqs[idx];
3455 	err = counter_set_query(ndev, mvq, &received_desc, &completed_desc);
3456 	if (err) {
3457 		NL_SET_ERR_MSG_MOD(extack, "failed to query hardware");
3458 		goto out_err;
3459 	}
3460 
3461 out:
3462 	err = -EMSGSIZE;
3463 	if (nla_put_string(msg, VDPA_ATTR_DEV_VENDOR_ATTR_NAME, "received_desc"))
3464 		goto out_err;
3465 
3466 	if (nla_put_u64_64bit(msg, VDPA_ATTR_DEV_VENDOR_ATTR_VALUE, received_desc,
3467 			      VDPA_ATTR_PAD))
3468 		goto out_err;
3469 
3470 	if (nla_put_string(msg, VDPA_ATTR_DEV_VENDOR_ATTR_NAME, "completed_desc"))
3471 		goto out_err;
3472 
3473 	if (nla_put_u64_64bit(msg, VDPA_ATTR_DEV_VENDOR_ATTR_VALUE, completed_desc,
3474 			      VDPA_ATTR_PAD))
3475 		goto out_err;
3476 
3477 	err = 0;
3478 out_err:
3479 	up_read(&ndev->reslock);
3480 	return err;
3481 }
3482 
3483 static void mlx5_vdpa_cvq_suspend(struct mlx5_vdpa_dev *mvdev)
3484 {
3485 	struct mlx5_control_vq *cvq;
3486 
3487 	if (!(mvdev->actual_features & BIT_ULL(VIRTIO_NET_F_CTRL_VQ)))
3488 		return;
3489 
3490 	cvq = &mvdev->cvq;
3491 	cvq->ready = false;
3492 }
3493 
3494 static int mlx5_vdpa_suspend(struct vdpa_device *vdev)
3495 {
3496 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
3497 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
3498 	int err;
3499 
3500 	mlx5_vdpa_info(mvdev, "suspending device\n");
3501 
3502 	down_write(&ndev->reslock);
3503 	unregister_link_notifier(ndev);
3504 	err = suspend_vqs(ndev);
3505 	mlx5_vdpa_cvq_suspend(mvdev);
3506 	mvdev->suspended = true;
3507 	up_write(&ndev->reslock);
3508 
3509 	return err;
3510 }
3511 
3512 static int mlx5_vdpa_resume(struct vdpa_device *vdev)
3513 {
3514 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
3515 	struct mlx5_vdpa_net *ndev;
3516 	int err;
3517 
3518 	ndev = to_mlx5_vdpa_ndev(mvdev);
3519 
3520 	mlx5_vdpa_info(mvdev, "resuming device\n");
3521 
3522 	down_write(&ndev->reslock);
3523 	mvdev->suspended = false;
3524 	err = resume_vqs(ndev);
3525 	register_link_notifier(ndev);
3526 	up_write(&ndev->reslock);
3527 
3528 	return err;
3529 }
3530 
3531 static int mlx5_set_group_asid(struct vdpa_device *vdev, u32 group,
3532 			       unsigned int asid)
3533 {
3534 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
3535 	int err = 0;
3536 
3537 	if (group >= MLX5_VDPA_NUMVQ_GROUPS)
3538 		return -EINVAL;
3539 
3540 	mvdev->group2asid[group] = asid;
3541 
3542 	mutex_lock(&mvdev->mr_mtx);
3543 	if (group == MLX5_VDPA_CVQ_GROUP && mvdev->mr[asid])
3544 		err = mlx5_vdpa_update_cvq_iotlb(mvdev, mvdev->mr[asid]->iotlb, asid);
3545 	mutex_unlock(&mvdev->mr_mtx);
3546 
3547 	return err;
3548 }
3549 
3550 static const struct vdpa_config_ops mlx5_vdpa_ops = {
3551 	.set_vq_address = mlx5_vdpa_set_vq_address,
3552 	.set_vq_num = mlx5_vdpa_set_vq_num,
3553 	.kick_vq = mlx5_vdpa_kick_vq,
3554 	.set_vq_cb = mlx5_vdpa_set_vq_cb,
3555 	.set_vq_ready = mlx5_vdpa_set_vq_ready,
3556 	.get_vq_ready = mlx5_vdpa_get_vq_ready,
3557 	.set_vq_state = mlx5_vdpa_set_vq_state,
3558 	.get_vq_state = mlx5_vdpa_get_vq_state,
3559 	.get_vendor_vq_stats = mlx5_vdpa_get_vendor_vq_stats,
3560 	.get_vq_notification = mlx5_get_vq_notification,
3561 	.get_vq_irq = mlx5_get_vq_irq,
3562 	.get_vq_align = mlx5_vdpa_get_vq_align,
3563 	.get_vq_group = mlx5_vdpa_get_vq_group,
3564 	.get_vq_desc_group = mlx5_vdpa_get_vq_desc_group, /* Op disabled if not supported. */
3565 	.get_device_features = mlx5_vdpa_get_device_features,
3566 	.get_backend_features = mlx5_vdpa_get_backend_features,
3567 	.set_driver_features = mlx5_vdpa_set_driver_features,
3568 	.get_driver_features = mlx5_vdpa_get_driver_features,
3569 	.set_config_cb = mlx5_vdpa_set_config_cb,
3570 	.get_vq_num_max = mlx5_vdpa_get_vq_num_max,
3571 	.get_device_id = mlx5_vdpa_get_device_id,
3572 	.get_vendor_id = mlx5_vdpa_get_vendor_id,
3573 	.get_status = mlx5_vdpa_get_status,
3574 	.set_status = mlx5_vdpa_set_status,
3575 	.reset = mlx5_vdpa_reset,
3576 	.compat_reset = mlx5_vdpa_compat_reset,
3577 	.get_config_size = mlx5_vdpa_get_config_size,
3578 	.get_config = mlx5_vdpa_get_config,
3579 	.set_config = mlx5_vdpa_set_config,
3580 	.get_generation = mlx5_vdpa_get_generation,
3581 	.set_map = mlx5_vdpa_set_map,
3582 	.reset_map = mlx5_vdpa_reset_map,
3583 	.set_group_asid = mlx5_set_group_asid,
3584 	.get_vq_dma_dev = mlx5_get_vq_dma_dev,
3585 	.free = mlx5_vdpa_free,
3586 	.suspend = mlx5_vdpa_suspend,
3587 	.resume = mlx5_vdpa_resume, /* Op disabled if not supported. */
3588 };
3589 
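/*
 * Derive the MTU exposed in the virtio config space from the NIC vport
 * MTU, which includes the Ethernet hard header overhead
 * (MLX5V_ETH_HARD_MTU).
 */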
3590 static int query_mtu(struct mlx5_core_dev *mdev, u16 *mtu)
3591 {
3592 	u16 hw_mtu;
3593 	int err;
3594 
3595 	err = mlx5_query_nic_vport_mtu(mdev, &hw_mtu);
3596 	if (err)
3597 		return err;
3598 
3599 	*mtu = hw_mtu - MLX5V_ETH_HARD_MTU;
3600 	return 0;
3601 }
3602 
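/*
 * Allocate resources kept for the lifetime of the vdpa device: a
 * transport domain and a TIS. res->valid guards against double
 * allocation.
 */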
3603 static int alloc_fixed_resources(struct mlx5_vdpa_net *ndev)
3604 {
3605 	struct mlx5_vdpa_net_resources *res = &ndev->res;
3606 	int err;
3607 
3608 	if (res->valid) {
3609 		mlx5_vdpa_warn(&ndev->mvdev, "resources already allocated\n");
3610 		return -EEXIST;
3611 	}
3612 
3613 	err = mlx5_vdpa_alloc_transport_domain(&ndev->mvdev, &res->tdn);
3614 	if (err)
3615 		return err;
3616 
3617 	err = create_tis(ndev);
3618 	if (err)
3619 		goto err_tis;
3620 
3621 	res->valid = true;
3622 
3623 	return 0;
3624 
3625 err_tis:
3626 	mlx5_vdpa_dealloc_transport_domain(&ndev->mvdev, res->tdn);
3627 	return err;
3628 }
3629 
3630 static void free_fixed_resources(struct mlx5_vdpa_net *ndev)
3631 {
3632 	struct mlx5_vdpa_net_resources *res = &ndev->res;
3633 
3634 	if (!res->valid)
3635 		return;
3636 
3637 	destroy_tis(ndev);
3638 	mlx5_vdpa_dealloc_transport_domain(&ndev->mvdev, res->tdn);
3639 	res->valid = false;
3640 }
3641 
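/*
 * Reset every virtqueue to its default state: firmware-owned QP, no
 * firmware object and the default ring size. The memset deliberately
 * stops at the 'ri' member, leaving it and any later fields untouched.
 */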
3642 static void mvqs_set_defaults(struct mlx5_vdpa_net *ndev)
3643 {
3644 	struct mlx5_vdpa_virtqueue *mvq;
3645 	int i;
3646 
3647 	for (i = 0; i < ndev->mvdev.max_vqs; ++i) {
3648 		mvq = &ndev->vqs[i];
3649 		memset(mvq, 0, offsetof(struct mlx5_vdpa_virtqueue, ri));
3650 		mvq->index = i;
3651 		mvq->ndev = ndev;
3652 		mvq->fwqp.fw = true;
3653 		mvq->fw_state = MLX5_VIRTIO_NET_Q_OBJECT_NONE;
3654 		mvq->num_ent = MLX5V_DEFAULT_VQ_SIZE;
3655 	}
3656 }
3657 
3658 struct mlx5_vdpa_mgmtdev {
3659 	struct vdpa_mgmt_dev mgtdev;
3660 	struct mlx5_adev *madev;
3661 	struct mlx5_vdpa_net *ndev;
3662 	struct vdpa_config_ops vdpa_ops;
3663 };
3664 
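/*
 * Program the requested MTU into the NIC vport context. The Ethernet
 * hard header overhead is added back here, mirroring the subtraction
 * done in query_mtu().
 */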
3665 static int config_func_mtu(struct mlx5_core_dev *mdev, u16 mtu)
3666 {
3667 	int inlen = MLX5_ST_SZ_BYTES(modify_nic_vport_context_in);
3668 	void *in;
3669 	int err;
3670 
3671 	in = kvzalloc(inlen, GFP_KERNEL);
3672 	if (!in)
3673 		return -ENOMEM;
3674 
3675 	MLX5_SET(modify_nic_vport_context_in, in, field_select.mtu, 1);
3676 	MLX5_SET(modify_nic_vport_context_in, in, nic_vport_context.mtu,
3677 		 mtu + MLX5V_ETH_HARD_MTU);
3678 	MLX5_SET(modify_nic_vport_context_in, in, opcode,
3679 		 MLX5_CMD_OP_MODIFY_NIC_VPORT_CONTEXT);
3680 
3681 	err = mlx5_cmd_exec_in(mdev, modify_nic_vport_context, in);
3682 
3683 	kvfree(in);
3684 	return err;
3685 }
3686 
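/*
 * Best-effort pre-allocation of one dynamic MSI-X vector per virtqueue.
 * Allocation stops quietly at the first failure; irqp.num_ent reflects
 * how many vectors were actually obtained.
 */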
3687 static void allocate_irqs(struct mlx5_vdpa_net *ndev)
3688 {
3689 	struct mlx5_vdpa_irq_pool_entry *ent;
3690 	int i;
3691 
3692 	if (!msix_mode_supported(&ndev->mvdev))
3693 		return;
3694 
3695 	if (!ndev->mvdev.mdev->pdev)
3696 		return;
3697 
3698 	ndev->irqp.entries = kcalloc(ndev->mvdev.max_vqs, sizeof(*ndev->irqp.entries), GFP_KERNEL);
3699 	if (!ndev->irqp.entries)
3700 		return;
3701 
3703 	for (i = 0; i < ndev->mvdev.max_vqs; i++) {
3704 		ent = ndev->irqp.entries + i;
3705 		snprintf(ent->name, MLX5_VDPA_IRQ_NAME_LEN, "%s-vq-%d",
3706 			 dev_name(&ndev->mvdev.vdev.dev), i);
3707 		ent->map = pci_msix_alloc_irq_at(ndev->mvdev.mdev->pdev, MSI_ANY_INDEX, NULL);
3708 		if (!ent->map.virq)
3709 			return;
3710 
3711 		ndev->irqp.num_ent++;
3712 	}
3713 }
3714 
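/*
 * Management device dev_add callback: refuse a second instance, validate
 * the provisioned features against device capabilities, size the
 * virtqueue arrays, fill in the virtio_net config space (mtu, mac, link
 * status, max queue pairs), allocate device resources and register the
 * new vdpa device.
 */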
3715 static int mlx5_vdpa_dev_add(struct vdpa_mgmt_dev *v_mdev, const char *name,
3716 			     const struct vdpa_dev_set_config *add_config)
3717 {
3718 	struct mlx5_vdpa_mgmtdev *mgtdev = container_of(v_mdev, struct mlx5_vdpa_mgmtdev, mgtdev);
3719 	struct virtio_net_config *config;
3720 	struct mlx5_core_dev *pfmdev;
3721 	struct mlx5_vdpa_dev *mvdev;
3722 	struct mlx5_vdpa_net *ndev;
3723 	struct mlx5_core_dev *mdev;
3724 	u64 device_features;
3725 	u32 max_vqs;
3726 	u16 mtu;
3727 	int err;
3728 
3729 	if (mgtdev->ndev)
3730 		return -ENOSPC;
3731 
3732 	mdev = mgtdev->madev->mdev;
3733 	device_features = mgtdev->mgtdev.supported_features;
3734 	if (add_config->mask & BIT_ULL(VDPA_ATTR_DEV_FEATURES)) {
3735 		if (add_config->device_features & ~device_features) {
3736 			dev_warn(mdev->device,
3737 				 "The provisioned features 0x%llx are not supported by this device with features 0x%llx\n",
3738 				 add_config->device_features, device_features);
3739 			return -EINVAL;
3740 		}
3741 		device_features &= add_config->device_features;
3742 	} else {
3743 		device_features &= ~BIT_ULL(VIRTIO_NET_F_MRG_RXBUF);
3744 	}
3745 	if (!(device_features & BIT_ULL(VIRTIO_F_VERSION_1) &&
3746 	      device_features & BIT_ULL(VIRTIO_F_ACCESS_PLATFORM))) {
3747 		dev_warn(mdev->device,
3748 			 "Must provision minimum features 0x%llx for this device",
3749 			 "Must provision minimum features 0x%llx for this device\n",
3750 		return -EOPNOTSUPP;
3751 	}
3752 
3753 	if (!(MLX5_CAP_DEV_VDPA_EMULATION(mdev, virtio_queue_type) &
3754 	    MLX5_VIRTIO_EMULATION_CAP_VIRTIO_QUEUE_TYPE_SPLIT)) {
3755 		dev_warn(mdev->device, "missing support for split virtqueues\n");
3756 		return -EOPNOTSUPP;
3757 	}
3758 
3759 	max_vqs = min_t(int, MLX5_CAP_DEV_VDPA_EMULATION(mdev, max_num_virtio_queues),
3760 			1 << MLX5_CAP_GEN(mdev, log_max_rqt_size));
3761 	if (max_vqs < 2) {
3762 		dev_warn(mdev->device,
3763 			 "%d virtqueues are supported. At least 2 are required\n",
3764 			 max_vqs);
3765 		return -EAGAIN;
3766 	}
3767 
3768 	if (add_config->mask & BIT_ULL(VDPA_ATTR_DEV_NET_CFG_MAX_VQP)) {
3769 		if (add_config->net.max_vq_pairs > max_vqs / 2)
3770 			return -EINVAL;
3771 		max_vqs = min_t(u32, max_vqs, 2 * add_config->net.max_vq_pairs);
3772 	} else {
3773 		max_vqs = 2;
3774 	}
3775 
3776 	ndev = vdpa_alloc_device(struct mlx5_vdpa_net, mvdev.vdev, mdev->device, &mgtdev->vdpa_ops,
3777 				 MLX5_VDPA_NUMVQ_GROUPS, MLX5_VDPA_NUM_AS, name, false);
3778 	if (IS_ERR(ndev))
3779 		return PTR_ERR(ndev);
3780 
3781 	ndev->mvdev.max_vqs = max_vqs;
3782 	mvdev = &ndev->mvdev;
3783 	mvdev->mdev = mdev;
3784 
3785 	ndev->vqs = kcalloc(max_vqs, sizeof(*ndev->vqs), GFP_KERNEL);
3786 	ndev->event_cbs = kcalloc(max_vqs + 1, sizeof(*ndev->event_cbs), GFP_KERNEL);
3787 	if (!ndev->vqs || !ndev->event_cbs) {
3788 		err = -ENOMEM;
3789 		goto err_alloc;
3790 	}
3791 	ndev->cur_num_vqs = MLX5V_DEFAULT_VQ_COUNT;
3792 
3793 	mvqs_set_defaults(ndev);
3794 	allocate_irqs(ndev);
3795 	init_rwsem(&ndev->reslock);
3796 	config = &ndev->config;
3797 
3798 	if (add_config->mask & BIT_ULL(VDPA_ATTR_DEV_NET_CFG_MTU)) {
3799 		err = config_func_mtu(mdev, add_config->net.mtu);
3800 		if (err)
3801 			goto err_alloc;
3802 	}
3803 
3804 	if (device_features & BIT_ULL(VIRTIO_NET_F_MTU)) {
3805 		err = query_mtu(mdev, &mtu);
3806 		if (err)
3807 			goto err_alloc;
3808 
3809 		ndev->config.mtu = cpu_to_mlx5vdpa16(mvdev, mtu);
3810 	}
3811 
3812 	if (device_features & BIT_ULL(VIRTIO_NET_F_STATUS)) {
3813 		if (get_link_state(mvdev))
3814 			ndev->config.status |= cpu_to_mlx5vdpa16(mvdev, VIRTIO_NET_S_LINK_UP);
3815 		else
3816 			ndev->config.status &= cpu_to_mlx5vdpa16(mvdev, ~VIRTIO_NET_S_LINK_UP);
3817 	}
3818 
3819 	if (add_config->mask & BIT_ULL(VDPA_ATTR_DEV_NET_CFG_MACADDR)) {
3820 		memcpy(ndev->config.mac, add_config->net.mac, ETH_ALEN);
3821 	/* Don't bother setting a mac address in config if _F_MAC is not going to be provisioned */
3822 	} else if ((add_config->mask & BIT_ULL(VDPA_ATTR_DEV_FEATURES)) == 0 ||
3823 		   device_features & BIT_ULL(VIRTIO_NET_F_MAC)) {
3824 		err = mlx5_query_nic_vport_mac_address(mdev, 0, 0, config->mac);
3825 		if (err)
3826 			goto err_alloc;
3827 	}
3828 
3829 	if (!is_zero_ether_addr(config->mac)) {
3830 		pfmdev = pci_get_drvdata(pci_physfn(mdev->pdev));
3831 		err = mlx5_mpfs_add_mac(pfmdev, config->mac);
3832 		if (err)
3833 			goto err_alloc;
3834 	} else if ((add_config->mask & BIT_ULL(VDPA_ATTR_DEV_FEATURES)) == 0) {
3835 		/*
3836 		 * Historically, the _F_MAC feature bit was cleared when a
3837 		 * zero mac address was seen and device features were not
3838 		 * explicitly provisioned. Keep that behaviour so old
3839 		 * scripts do not break.
3840 		 */
3841 		device_features &= ~BIT_ULL(VIRTIO_NET_F_MAC);
3842 	} else if (device_features & BIT_ULL(VIRTIO_NET_F_MAC)) {
3843 		/* Don't provision zero mac address for _F_MAC */
3844 		mlx5_vdpa_warn(&ndev->mvdev,
3845 			       "No mac address provisioned?\n");
3846 		err = -EINVAL;
3847 		goto err_alloc;
3848 	}
3849 
3850 	if (device_features & BIT_ULL(VIRTIO_NET_F_MQ)) {
3851 		config->max_virtqueue_pairs = cpu_to_mlx5vdpa16(mvdev, max_vqs / 2);
3852 		ndev->rqt_size = max_vqs / 2;
3853 	} else {
3854 		ndev->rqt_size = 1;
3855 	}
3856 
3857 	ndev->mvdev.mlx_features = device_features;
3858 	mvdev->vdev.dma_dev = &mdev->pdev->dev;
3859 	err = mlx5_vdpa_alloc_resources(&ndev->mvdev);
3860 	if (err)
3861 		goto err_mpfs;
3862 
3863 	INIT_LIST_HEAD(&mvdev->mr_list_head);
3864 
3865 	if (MLX5_CAP_GEN(mvdev->mdev, umem_uid_0)) {
3866 		err = mlx5_vdpa_create_dma_mr(mvdev);
3867 		if (err)
3868 			goto err_res;
3869 	}
3870 
3871 	err = alloc_fixed_resources(ndev);
3872 	if (err)
3873 		goto err_mr;
3874 
3875 	ndev->cvq_ent.mvdev = mvdev;
3876 	INIT_WORK(&ndev->cvq_ent.work, mlx5_cvq_kick_handler);
3877 	mvdev->wq = create_singlethread_workqueue("mlx5_vdpa_wq");
3878 	if (!mvdev->wq) {
3879 		err = -ENOMEM;
3880 		goto err_res2;
3881 	}
3882 
3883 	mvdev->vdev.mdev = &mgtdev->mgtdev;
3884 	err = _vdpa_register_device(&mvdev->vdev, max_vqs + 1);
3885 	if (err)
3886 		goto err_reg;
3887 
3888 	mgtdev->ndev = ndev;
3889 
3890 	/* For virtio-vdpa, the device was already set up during device registration. */
3891 	if (ndev->setup)
3892 		return 0;
3893 
3894 	down_write(&ndev->reslock);
3895 	err = setup_vq_resources(ndev, false);
3896 	up_write(&ndev->reslock);
3897 	if (err)
3898 		goto err_setup_vq_res;
3899 
3900 	return 0;
3901 
3902 err_setup_vq_res:
3903 	_vdpa_unregister_device(&mvdev->vdev);
3904 err_reg:
3905 	destroy_workqueue(mvdev->wq);
3906 err_res2:
3907 	free_fixed_resources(ndev);
3908 err_mr:
3909 	mlx5_vdpa_destroy_mr_resources(mvdev);
3910 err_res:
3911 	mlx5_vdpa_free_resources(&ndev->mvdev);
3912 err_mpfs:
3913 	if (!is_zero_ether_addr(config->mac))
3914 		mlx5_mpfs_del_mac(pfmdev, config->mac);
3915 err_alloc:
3916 	put_device(&mvdev->vdev.dev);
3917 	return err;
3918 }
3919 
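/*
 * Management device dev_del callback: stop link notifications, unregister
 * the vdpa device, tear down virtqueue resources under reslock and destroy
 * the workqueue, then clear mgtdev->ndev so a new instance can be added.
 */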
3920 static void mlx5_vdpa_dev_del(struct vdpa_mgmt_dev *v_mdev, struct vdpa_device *dev)
3921 {
3922 	struct mlx5_vdpa_mgmtdev *mgtdev = container_of(v_mdev, struct mlx5_vdpa_mgmtdev, mgtdev);
3923 	struct mlx5_vdpa_dev *mvdev = to_mvdev(dev);
3924 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
3925 	struct workqueue_struct *wq;
3926 
3927 	unregister_link_notifier(ndev);
3928 	_vdpa_unregister_device(dev);
3929 
3930 	down_write(&ndev->reslock);
3931 	teardown_vq_resources(ndev);
3932 	up_write(&ndev->reslock);
3933 
3934 	wq = mvdev->wq;
3935 	mvdev->wq = NULL;
3936 	destroy_workqueue(wq);
3937 	mgtdev->ndev = NULL;
3938 }
3939 
3940 static const struct vdpa_mgmtdev_ops mdev_ops = {
3941 	.dev_add = mlx5_vdpa_dev_add,
3942 	.dev_del = mlx5_vdpa_dev_del,
3943 };
3944 
3945 static struct virtio_device_id id_table[] = {
3946 	{ VIRTIO_ID_NET, VIRTIO_DEV_ANY_ID },
3947 	{ 0 },
3948 };
3949 
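/*
 * Auxiliary bus probe: set up the vdpa management device for this mlx5
 * function, take a private copy of the config ops so unsupported ops can
 * be cleared according to device capabilities, and register with the
 * vdpa core.
 */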
3950 static int mlx5v_probe(struct auxiliary_device *adev,
3951 		       const struct auxiliary_device_id *id)
3953 {
3954 	struct mlx5_adev *madev = container_of(adev, struct mlx5_adev, adev);
3955 	struct mlx5_core_dev *mdev = madev->mdev;
3956 	struct mlx5_vdpa_mgmtdev *mgtdev;
3957 	int err;
3958 
3959 	mgtdev = kzalloc(sizeof(*mgtdev), GFP_KERNEL);
3960 	if (!mgtdev)
3961 		return -ENOMEM;
3962 
3963 	mgtdev->mgtdev.ops = &mdev_ops;
3964 	mgtdev->mgtdev.device = mdev->device;
3965 	mgtdev->mgtdev.id_table = id_table;
3966 	mgtdev->mgtdev.config_attr_mask = BIT_ULL(VDPA_ATTR_DEV_NET_CFG_MACADDR) |
3967 					  BIT_ULL(VDPA_ATTR_DEV_NET_CFG_MAX_VQP) |
3968 					  BIT_ULL(VDPA_ATTR_DEV_NET_CFG_MTU) |
3969 					  BIT_ULL(VDPA_ATTR_DEV_FEATURES);
3970 	mgtdev->mgtdev.max_supported_vqs =
3971 		MLX5_CAP_DEV_VDPA_EMULATION(mdev, max_num_virtio_queues) + 1;
3972 	mgtdev->mgtdev.supported_features = get_supported_features(mdev);
3973 	mgtdev->madev = madev;
3974 	mgtdev->vdpa_ops = mlx5_vdpa_ops;
3975 
3976 	if (!MLX5_CAP_DEV_VDPA_EMULATION(mdev, desc_group_mkey_supported))
3977 		mgtdev->vdpa_ops.get_vq_desc_group = NULL;
3978 
3979 	if (!MLX5_CAP_DEV_VDPA_EMULATION(mdev, freeze_to_rdy_supported))
3980 		mgtdev->vdpa_ops.resume = NULL;
3981 
3982 	err = vdpa_mgmtdev_register(&mgtdev->mgtdev);
3983 	if (err)
3984 		goto reg_err;
3985 
3986 	auxiliary_set_drvdata(adev, mgtdev);
3987 
3988 	return 0;
3989 
3990 reg_err:
3991 	kfree(mgtdev);
3992 	return err;
3993 }
3994 
3995 static void mlx5v_remove(struct auxiliary_device *adev)
3996 {
3997 	struct mlx5_vdpa_mgmtdev *mgtdev;
3998 
3999 	mgtdev = auxiliary_get_drvdata(adev);
4000 	vdpa_mgmtdev_unregister(&mgtdev->mgtdev);
4001 	kfree(mgtdev);
4002 }
4003 
4004 static const struct auxiliary_device_id mlx5v_id_table[] = {
4005 	{ .name = MLX5_ADEV_NAME ".vnet", },
4006 	{},
4007 };
4008 
4009 MODULE_DEVICE_TABLE(auxiliary, mlx5v_id_table);
4010 
4011 static struct auxiliary_driver mlx5v_driver = {
4012 	.name = "vnet",
4013 	.probe = mlx5v_probe,
4014 	.remove = mlx5v_remove,
4015 	.id_table = mlx5v_id_table,
4016 };
4017 
4018 module_auxiliary_driver(mlx5v_driver);
4019