xref: /freebsd/sys/dev/mlx5/mlx5_ib/mlx5_ib_main.c (revision 5c831a5bd61576cacb48b39f8eeb47b92707a355)
1 /*-
2  * Copyright (c) 2013-2015, Mellanox Technologies, Ltd.  All rights reserved.
3  *
4  * Redistribution and use in source and binary forms, with or without
5  * modification, are permitted provided that the following conditions
6  * are met:
7  * 1. Redistributions of source code must retain the above copyright
8  *    notice, this list of conditions and the following disclaimer.
9  * 2. Redistributions in binary form must reproduce the above copyright
10  *    notice, this list of conditions and the following disclaimer in the
11  *    documentation and/or other materials provided with the distribution.
12  *
13  * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS `AS IS' AND
14  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
15  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
16  * ARE DISCLAIMED.  IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE
17  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
18  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
19  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
20  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
21  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
22  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
23  * SUCH DAMAGE.
24  *
25  * $FreeBSD$
26  */
27 
28 #include <linux/errno.h>
29 #include <linux/pci.h>
30 #include <linux/dma-mapping.h>
31 #include <linux/slab.h>
32 #include <linux/io-mapping.h>
33 #include <linux/sched.h>
34 #include <linux/netdevice.h>
35 #include <linux/etherdevice.h>
36 #include <linux/list.h>
37 #include <dev/mlx5/driver.h>
38 #include <dev/mlx5/vport.h>
39 #include <asm/pgtable.h>
40 #include <linux/fs.h>
41 #undef inode
42 
43 #include <rdma/ib_user_verbs.h>
44 #include <rdma/ib_smi.h>
45 #include <rdma/ib_umem.h>
46 #include "user.h"
47 #include "mlx5_ib.h"
48 
49 #include <sys/unistd.h>
50 #include <sys/kthread.h>
51 
52 #define DRIVER_NAME "mlx5_ib"
53 #define DRIVER_VERSION "3.2-rc1"
54 #define DRIVER_RELDATE	"May 2016"
55 
56 MODULE_AUTHOR("Eli Cohen <eli@mellanox.com>");
57 MODULE_DESCRIPTION("Mellanox Connect-IB HCA IB driver");
58 MODULE_LICENSE("Dual BSD/GPL");
59 MODULE_DEPEND(mlx5ib, linuxkpi, 1, 1, 1);
60 MODULE_DEPEND(mlx5ib, mlx5, 1, 1, 1);
61 MODULE_DEPEND(mlx5ib, ibcore, 1, 1, 1);
62 MODULE_VERSION(mlx5ib, 1);
63 
64 static int deprecated_prof_sel = 2;
65 module_param_named(prof_sel, deprecated_prof_sel, int, 0444);
66 MODULE_PARM_DESC(prof_sel, "profile selector. Deprecated here. Moved to module mlx5_core");
67 
68 enum {
69 	MLX5_STANDARD_ATOMIC_SIZE = 0x8,
70 };
71 
72 struct workqueue_struct *mlx5_ib_wq;
73 
74 static char mlx5_version[] =
75 	DRIVER_NAME ": Mellanox Connect-IB Infiniband driver v"
76 	DRIVER_VERSION " (" DRIVER_RELDATE ")\n";
77 
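/*
 * Derive the IB atomic capability flags from the HCA atomic caps.  The
 * device is reported as IB_ATOMIC_HCA only when it supports both
 * compare-swap and fetch-add at the standard 8-byte operand size and
 * either supports the 8B requester-endianness mode or the host is
 * big-endian; otherwise the capability is reported as IB_ATOMIC_NONE.
 */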
78 static void get_atomic_caps(struct mlx5_ib_dev *dev,
79 			    struct ib_device_attr *props)
80 {
81 	int tmp;
82 	u8 atomic_operations;
83 	u8 atomic_size_qp;
84 	u8 atomic_req_endianess;
85 
86 	atomic_operations = MLX5_CAP_ATOMIC(dev->mdev, atomic_operations);
87 	atomic_size_qp = MLX5_CAP_ATOMIC(dev->mdev, atomic_size_qp);
88 	atomic_req_endianess = MLX5_CAP_ATOMIC(dev->mdev,
89 					       atomic_req_8B_endianess_mode) ||
90 			       !mlx5_host_is_le();
91 
92 	tmp = MLX5_ATOMIC_OPS_CMP_SWAP | MLX5_ATOMIC_OPS_FETCH_ADD;
93 	if (((atomic_operations & tmp) == tmp)
94 	    && (atomic_size_qp & 8)) {
95 		if (atomic_req_endianess) {
96 			props->atomic_cap = IB_ATOMIC_HCA;
97 		} else {
98 			props->atomic_cap = IB_ATOMIC_NONE;
99 		}
100 	} else {
101 		props->atomic_cap = IB_ATOMIC_NONE;
102 	}
103 
104 	tmp = MLX5_ATOMIC_OPS_MASKED_CMP_SWAP | MLX5_ATOMIC_OPS_MASKED_FETCH_ADD;
105 	if (((atomic_operations & tmp) == tmp)
106 	    && (atomic_size_qp & 8)) {
107 		if (atomic_req_endianess) {
108 			props->masked_atomic_cap = IB_ATOMIC_HCA;
109 		} else {
110 			props->masked_atomic_cap = IB_ATOMIC_NONE;
111 		}
112 	} else {
113 		props->masked_atomic_cap = IB_ATOMIC_NONE;
114 	}
115 }
116 
117 static enum rdma_link_layer
118 mlx5_ib_port_link_layer(struct ib_device *device, u8 port_num)
119 {
120 	struct mlx5_ib_dev *dev = to_mdev(device);
121 
122 	switch (MLX5_CAP_GEN(dev->mdev, port_type)) {
123 	case MLX5_CAP_PORT_TYPE_IB:
124 		return IB_LINK_LAYER_INFINIBAND;
125 	case MLX5_CAP_PORT_TYPE_ETH:
126 		return IB_LINK_LAYER_ETHERNET;
127 	default:
128 		return IB_LINK_LAYER_UNSPECIFIED;
129 	}
130 }
131 
132 static int mlx5_use_mad_ifc(struct mlx5_ib_dev *dev)
133 {
134 	return !dev->mdev->issi;
135 }
136 
137 enum {
138 	MLX5_VPORT_ACCESS_METHOD_MAD,
139 	MLX5_VPORT_ACCESS_METHOD_HCA,
140 	MLX5_VPORT_ACCESS_METHOD_NIC,
141 };
142 
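/*
 * Select how vport attributes are queried: legacy devices (ISSI 0) go
 * through MAD commands, Ethernet/RoCE ports use NIC vport commands, and
 * everything else uses HCA vport commands.  The query helpers below all
 * switch on this value.
 */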
143 static int mlx5_get_vport_access_method(struct ib_device *ibdev)
144 {
145 	if (mlx5_use_mad_ifc(to_mdev(ibdev)))
146 		return MLX5_VPORT_ACCESS_METHOD_MAD;
147 
148 	if (mlx5_ib_port_link_layer(ibdev, 1) ==
149 	    IB_LINK_LAYER_ETHERNET)
150 		return MLX5_VPORT_ACCESS_METHOD_NIC;
151 
152 	return MLX5_VPORT_ACCESS_METHOD_HCA;
153 }
154 
155 static int mlx5_query_system_image_guid(struct ib_device *ibdev,
156 					__be64 *sys_image_guid)
157 {
158 	struct mlx5_ib_dev *dev = to_mdev(ibdev);
159 	struct mlx5_core_dev *mdev = dev->mdev;
160 	u64 tmp;
161 	int err;
162 
163 	switch (mlx5_get_vport_access_method(ibdev)) {
164 	case MLX5_VPORT_ACCESS_METHOD_MAD:
165 		return mlx5_query_system_image_guid_mad_ifc(ibdev,
166 							    sys_image_guid);
167 
168 	case MLX5_VPORT_ACCESS_METHOD_HCA:
169 		err = mlx5_query_hca_vport_system_image_guid(mdev, &tmp);
170 		if (!err)
171 			*sys_image_guid = cpu_to_be64(tmp);
172 		return err;
173 
174 	case MLX5_VPORT_ACCESS_METHOD_NIC:
175 		err = mlx5_query_nic_vport_system_image_guid(mdev, &tmp);
176 		if (!err)
177 			*sys_image_guid = cpu_to_be64(tmp);
178 		return err;
179 
180 	default:
181 		return -EINVAL;
182 	}
183 }
184 
185 static int mlx5_query_max_pkeys(struct ib_device *ibdev,
186 				u16 *max_pkeys)
187 {
188 	struct mlx5_ib_dev *dev = to_mdev(ibdev);
189 	struct mlx5_core_dev *mdev = dev->mdev;
190 
191 	switch (mlx5_get_vport_access_method(ibdev)) {
192 	case MLX5_VPORT_ACCESS_METHOD_MAD:
193 		return mlx5_query_max_pkeys_mad_ifc(ibdev, max_pkeys);
194 
195 	case MLX5_VPORT_ACCESS_METHOD_HCA:
196 	case MLX5_VPORT_ACCESS_METHOD_NIC:
197 		*max_pkeys = mlx5_to_sw_pkey_sz(MLX5_CAP_GEN(mdev,
198 						pkey_table_size));
199 		return 0;
200 
201 	default:
202 		return -EINVAL;
203 	}
204 }
205 
206 static int mlx5_query_vendor_id(struct ib_device *ibdev,
207 				u32 *vendor_id)
208 {
209 	struct mlx5_ib_dev *dev = to_mdev(ibdev);
210 
211 	switch (mlx5_get_vport_access_method(ibdev)) {
212 	case MLX5_VPORT_ACCESS_METHOD_MAD:
213 		return mlx5_query_vendor_id_mad_ifc(ibdev, vendor_id);
214 
215 	case MLX5_VPORT_ACCESS_METHOD_HCA:
216 	case MLX5_VPORT_ACCESS_METHOD_NIC:
217 		return mlx5_core_query_vendor_id(dev->mdev, vendor_id);
218 
219 	default:
220 		return -EINVAL;
221 	}
222 }
223 
224 static int mlx5_query_node_guid(struct mlx5_ib_dev *dev,
225 				__be64 *node_guid)
226 {
227 	u64 tmp;
228 	int err;
229 
230 	switch (mlx5_get_vport_access_method(&dev->ib_dev)) {
231 	case MLX5_VPORT_ACCESS_METHOD_MAD:
232 		return mlx5_query_node_guid_mad_ifc(dev, node_guid);
233 
234 	case MLX5_VPORT_ACCESS_METHOD_HCA:
235 		err = mlx5_query_hca_vport_node_guid(dev->mdev, &tmp);
236 		if (!err)
237 			*node_guid = cpu_to_be64(tmp);
238 		return err;
239 
240 	case MLX5_VPORT_ACCESS_METHOD_NIC:
241 		err = mlx5_query_nic_vport_node_guid(dev->mdev, &tmp);
242 		if (!err)
243 			*node_guid = cpu_to_be64(tmp);
244 		return err;
245 
246 	default:
247 		return -EINVAL;
248 	}
249 }
250 
251 struct mlx5_reg_node_desc {
252 	u8	desc[64];
253 };
254 
255 static int mlx5_query_node_desc(struct mlx5_ib_dev *dev, char *node_desc)
256 {
257 	struct mlx5_reg_node_desc in;
258 
259 	if (mlx5_use_mad_ifc(dev))
260 		return mlx5_query_node_desc_mad_ifc(dev, node_desc);
261 
262 	memset(&in, 0, sizeof(in));
263 
264 	return mlx5_core_access_reg(dev->mdev, &in, sizeof(in), node_desc,
265 				    sizeof(struct mlx5_reg_node_desc),
266 				    MLX5_REG_NODE_DESC, 0, 0);
267 }
268 
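/*
 * Build ib_device_attr from the general HCA capabilities.  Most limits
 * are reported by firmware as log2 values and expanded here.  The send
 * SGE count comes from the WQE budget: with the send descriptor capped
 * at 512 bytes and, on this hardware family, 16-byte control, raddr and
 * data segments, that works out to roughly (512 - 16 - 16) / 16 = 30
 * send SGEs.  The exact segment sizes come from the mlx5 WQE
 * definitions, so treat that arithmetic as illustrative.
 */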
269 static int mlx5_ib_query_device(struct ib_device *ibdev,
270 				struct ib_device_attr *props)
271 {
272 	struct mlx5_ib_dev *dev = to_mdev(ibdev);
273 	struct mlx5_core_dev *mdev = dev->mdev;
274 	int max_sq_desc;
275 	int max_rq_sg;
276 	int max_sq_sg;
277 	int err;
278 
279 
280 	memset(props, 0, sizeof(*props));
281 
282 	err = mlx5_query_system_image_guid(ibdev,
283 					   &props->sys_image_guid);
284 	if (err)
285 		return err;
286 
287 	err = mlx5_query_max_pkeys(ibdev, &props->max_pkeys);
288 	if (err)
289 		return err;
290 
291 	err = mlx5_query_vendor_id(ibdev, &props->vendor_id);
292 	if (err)
293 		return err;
294 
295 	props->fw_ver = ((u64)fw_rev_maj(dev->mdev) << 32) |
296 		((u64)fw_rev_min(dev->mdev) << 16) |
297 		fw_rev_sub(dev->mdev);
298 	props->device_cap_flags    = IB_DEVICE_CHANGE_PHY_PORT |
299 		IB_DEVICE_PORT_ACTIVE_EVENT		|
300 		IB_DEVICE_SYS_IMAGE_GUID		|
301 		IB_DEVICE_RC_RNR_NAK_GEN;
302 
303 	if (MLX5_CAP_GEN(mdev, pkv))
304 		props->device_cap_flags |= IB_DEVICE_BAD_PKEY_CNTR;
305 	if (MLX5_CAP_GEN(mdev, qkv))
306 		props->device_cap_flags |= IB_DEVICE_BAD_QKEY_CNTR;
307 	if (MLX5_CAP_GEN(mdev, apm))
308 		props->device_cap_flags |= IB_DEVICE_AUTO_PATH_MIG;
309 	props->device_cap_flags |= IB_DEVICE_LOCAL_DMA_LKEY;
310 	if (MLX5_CAP_GEN(mdev, xrc))
311 		props->device_cap_flags |= IB_DEVICE_XRC;
312 	props->device_cap_flags |= IB_DEVICE_MEM_MGT_EXTENSIONS;
313 	if (MLX5_CAP_GEN(mdev, block_lb_mc))
314 		props->device_cap_flags |= IB_DEVICE_BLOCK_MULTICAST_LOOPBACK;
315 
316 	props->vendor_part_id	   = mdev->pdev->device;
317 	props->hw_ver		   = mdev->pdev->revision;
318 
319 	props->max_mr_size	   = ~0ull;
320 	props->page_size_cap	   = ~(u32)((1ull << MLX5_CAP_GEN(mdev, log_pg_sz)) -1);
321 	props->max_qp		   = 1 << MLX5_CAP_GEN(mdev, log_max_qp);
322 	props->max_qp_wr	   = 1 << MLX5_CAP_GEN(mdev, log_max_qp_sz);
323 	max_rq_sg =  MLX5_CAP_GEN(mdev, max_wqe_sz_rq) /
324 		     sizeof(struct mlx5_wqe_data_seg);
325 	max_sq_desc = min((int)MLX5_CAP_GEN(mdev, max_wqe_sz_sq), 512);
326 	max_sq_sg = (max_sq_desc -
327 		     sizeof(struct mlx5_wqe_ctrl_seg) -
328 		     sizeof(struct mlx5_wqe_raddr_seg)) / sizeof(struct mlx5_wqe_data_seg);
329 	props->max_sge = min(max_rq_sg, max_sq_sg);
330 	props->max_cq		   = 1 << MLX5_CAP_GEN(mdev, log_max_cq);
331 	props->max_cqe = (1 << MLX5_CAP_GEN(mdev, log_max_cq_sz)) - 1;
332 	props->max_mr		   = 1 << MLX5_CAP_GEN(mdev, log_max_mkey);
333 	props->max_pd		   = 1 << MLX5_CAP_GEN(mdev, log_max_pd);
334 	props->max_qp_rd_atom	   = 1 << MLX5_CAP_GEN(mdev, log_max_ra_req_qp);
335 	props->max_qp_init_rd_atom = 1 << MLX5_CAP_GEN(mdev, log_max_ra_res_qp);
336 	props->max_srq		   = 1 << MLX5_CAP_GEN(mdev, log_max_srq);
337 	props->max_srq_wr = (1 << MLX5_CAP_GEN(mdev, log_max_srq_sz)) - 1;
338 	props->local_ca_ack_delay  = MLX5_CAP_GEN(mdev, local_ca_ack_delay);
339 	props->max_res_rd_atom	   = props->max_qp_rd_atom * props->max_qp;
340 	props->max_srq_sge	   = max_rq_sg - 1;
341 	props->max_fast_reg_page_list_len = (unsigned int)-1;
342 	get_atomic_caps(dev, props);
343 	props->max_mcast_grp	   = 1 << MLX5_CAP_GEN(mdev, log_max_mcg);
344 	props->max_mcast_qp_attach = MLX5_CAP_GEN(mdev, max_qp_mcg);
345 	props->max_total_mcast_qp_attach = props->max_mcast_qp_attach *
346 					   props->max_mcast_grp;
347 	props->max_map_per_fmr = INT_MAX; /* no limit in ConnectIB */
348 	props->max_ah		= INT_MAX;
349 
350 	return 0;
351 }
352 
353 enum mlx5_ib_width {
354 	MLX5_IB_WIDTH_1X	= 1 << 0,
355 	MLX5_IB_WIDTH_2X	= 1 << 1,
356 	MLX5_IB_WIDTH_4X	= 1 << 2,
357 	MLX5_IB_WIDTH_8X	= 1 << 3,
358 	MLX5_IB_WIDTH_12X	= 1 << 4
359 };
360 
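/*
 * Map the PTYS link-width bitmask onto the IB verbs width enum.  A 2X
 * width may be reported by the device but has no encoding in the
 * classic IB port attributes, so it is rejected with a warning rather
 * than silently rounded to a neighboring width.
 */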
361 static int translate_active_width(struct ib_device *ibdev, u8 active_width,
362 				  u8 *ib_width)
363 {
364 	struct mlx5_ib_dev *dev = to_mdev(ibdev);
365 	int err = 0;
366 
367 	if (active_width & MLX5_IB_WIDTH_1X) {
368 		*ib_width = IB_WIDTH_1X;
369 	} else if (active_width & MLX5_IB_WIDTH_2X) {
370 		mlx5_ib_warn(dev, "active_width %d is not supported by IB spec\n",
371 			     (int)active_width);
372 		err = -EINVAL;
373 	} else if (active_width & MLX5_IB_WIDTH_4X) {
374 		*ib_width = IB_WIDTH_4X;
375 	} else if (active_width & MLX5_IB_WIDTH_8X) {
376 		*ib_width = IB_WIDTH_8X;
377 	} else if (active_width & MLX5_IB_WIDTH_12X) {
378 		*ib_width = IB_WIDTH_12X;
379 	} else {
380 		mlx5_ib_dbg(dev, "Invalid active_width %d\n",
381 			    (int)active_width);
382 		err = -EINVAL;
383 	}
384 
385 	return err;
386 }
387 
388 /*
389  * TODO: Move to IB core
390  */
391 enum ib_max_vl_num {
392 	__IB_MAX_VL_0		= 1,
393 	__IB_MAX_VL_0_1		= 2,
394 	__IB_MAX_VL_0_3		= 3,
395 	__IB_MAX_VL_0_7		= 4,
396 	__IB_MAX_VL_0_14	= 5,
397 };
398 
399 enum mlx5_vl_hw_cap {
400 	MLX5_VL_HW_0	= 1,
401 	MLX5_VL_HW_0_1	= 2,
402 	MLX5_VL_HW_0_2	= 3,
403 	MLX5_VL_HW_0_3	= 4,
404 	MLX5_VL_HW_0_4	= 5,
405 	MLX5_VL_HW_0_5	= 6,
406 	MLX5_VL_HW_0_6	= 7,
407 	MLX5_VL_HW_0_7	= 8,
408 	MLX5_VL_HW_0_14	= 15
409 };
410 
411 static int translate_max_vl_num(struct ib_device *ibdev, u8 vl_hw_cap,
412 				u8 *max_vl_num)
413 {
414 	switch (vl_hw_cap) {
415 	case MLX5_VL_HW_0:
416 		*max_vl_num = __IB_MAX_VL_0;
417 		break;
418 	case MLX5_VL_HW_0_1:
419 		*max_vl_num = __IB_MAX_VL_0_1;
420 		break;
421 	case MLX5_VL_HW_0_3:
422 		*max_vl_num = __IB_MAX_VL_0_3;
423 		break;
424 	case MLX5_VL_HW_0_7:
425 		*max_vl_num = __IB_MAX_VL_0_7;
426 		break;
427 	case MLX5_VL_HW_0_14:
428 		*max_vl_num = __IB_MAX_VL_0_14;
429 		break;
430 
431 	default:
432 		return -EINVAL;
433 	}
434 
435 	return 0;
436 }
437 
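/*
 * Fill ib_port_attr for an IB port by combining the HCA vport context
 * (LID, SM LID, state, violation counters) with three access registers:
 * PTYS for the operational width and speed, PMTU for the maximum and
 * operational MTU, and PVLC for the supported VL range.
 */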
438 static int mlx5_query_port_ib(struct ib_device *ibdev, u8 port,
439 			      struct ib_port_attr *props)
440 {
441 	struct mlx5_ib_dev *dev = to_mdev(ibdev);
442 	struct mlx5_core_dev *mdev = dev->mdev;
443 	u32 *rep;
444 	int outlen = MLX5_ST_SZ_BYTES(query_hca_vport_context_out);
445 	struct mlx5_ptys_reg *ptys;
446 	struct mlx5_pmtu_reg *pmtu;
447 	struct mlx5_pvlc_reg pvlc;
448 	void *ctx;
449 	int err;
450 
451 	rep = mlx5_vzalloc(outlen);
452 	ptys = kzalloc(sizeof(*ptys), GFP_KERNEL);
453 	pmtu = kzalloc(sizeof(*pmtu), GFP_KERNEL);
454 	if (!rep || !ptys || !pmtu) {
455 		err = -ENOMEM;
456 		goto out;
457 	}
458 
459 	memset(props, 0, sizeof(*props));
460 
461 	/* TODO: handle the case of a PF with dual ports */
462 	err = mlx5_query_hca_vport_context(mdev, port, 0, rep, outlen);
463 	if (err)
464 		goto out;
465 
466 	ctx = MLX5_ADDR_OF(query_hca_vport_context_out, rep, hca_vport_context);
467 
468 	props->lid		= MLX5_GET(hca_vport_context, ctx, lid);
469 	props->lmc		= MLX5_GET(hca_vport_context, ctx, lmc);
470 	props->sm_lid		= MLX5_GET(hca_vport_context, ctx, sm_lid);
471 	props->sm_sl		= MLX5_GET(hca_vport_context, ctx, sm_sl);
472 	props->state		= MLX5_GET(hca_vport_context, ctx, vport_state);
473 	props->phys_state	= MLX5_GET(hca_vport_context, ctx,
474 					port_physical_state);
475 	props->port_cap_flags	= MLX5_GET(hca_vport_context, ctx, cap_mask1);
476 	props->gid_tbl_len	= mlx5_get_gid_table_len(MLX5_CAP_GEN(mdev, gid_table_size));
477 	props->max_msg_sz	= 1 << MLX5_CAP_GEN(mdev, log_max_msg);
478 	props->pkey_tbl_len	= mlx5_to_sw_pkey_sz(MLX5_CAP_GEN(mdev, pkey_table_size));
479 	props->bad_pkey_cntr	= MLX5_GET(hca_vport_context, ctx,
480 					      pkey_violation_counter);
481 	props->qkey_viol_cntr	= MLX5_GET(hca_vport_context, ctx,
482 					      qkey_violation_counter);
483 	props->subnet_timeout	= MLX5_GET(hca_vport_context, ctx,
484 					      subnet_timeout);
485 	props->init_type_reply	= MLX5_GET(hca_vport_context, ctx,
486 					   init_type_reply);
487 
488 	ptys->proto_mask |= MLX5_PTYS_IB;
489 	ptys->local_port = port;
490 	err = mlx5_core_access_ptys(mdev, ptys, 0);
491 	if (err)
492 		goto out;
493 
494 	err = translate_active_width(ibdev, ptys->ib_link_width_oper,
495 				     &props->active_width);
496 	if (err)
497 		goto out;
498 
499 	props->active_speed	= (u8)ptys->ib_proto_oper;
500 
501 	pmtu->local_port = port;
502 	err = mlx5_core_access_pmtu(mdev, pmtu, 0);
503 	if (err)
504 		goto out;
505 
506 	props->max_mtu		= pmtu->max_mtu;
507 	props->active_mtu	= pmtu->oper_mtu;
508 
509 	memset(&pvlc, 0, sizeof(pvlc));
510 	pvlc.local_port = port;
511 	err = mlx5_core_access_pvlc(mdev, &pvlc, 0);
512 	if (err)
513 		goto out;
514 
515 	err = translate_max_vl_num(ibdev, pvlc.vl_hw_cap,
516 				   &props->max_vl_num);
517 out:
518 	kvfree(rep);
519 	kfree(ptys);
520 	kfree(pmtu);
521 	return err;
522 }
523 
524 int mlx5_ib_query_port(struct ib_device *ibdev, u8 port,
525 		       struct ib_port_attr *props)
526 {
527 	switch (mlx5_get_vport_access_method(ibdev)) {
528 	case MLX5_VPORT_ACCESS_METHOD_MAD:
529 		return mlx5_query_port_mad_ifc(ibdev, port, props);
530 
531 	case MLX5_VPORT_ACCESS_METHOD_HCA:
532 		return mlx5_query_port_ib(ibdev, port, props);
533 
534 	case MLX5_VPORT_ACCESS_METHOD_NIC:
535 		return mlx5_query_port_roce(ibdev, port, props);
536 
537 	default:
538 		return -EINVAL;
539 	}
540 }
541 
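/*
 * Build a modified EUI-64 interface identifier from the port MAC, used
 * for the default link-local RoCE GID: the 6-byte MAC is split in the
 * middle, 0xFF 0xFE is inserted, and the universal/local bit is
 * flipped.  For example (illustrative address only), the MAC
 * 00:25:8b:12:34:56 becomes the interface ID 02:25:8b:ff:fe:12:34:56.
 */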
542 static inline int
543 mlx5_addrconf_ifid_eui48(u8 *eui, struct net_device *dev)
544 {
545 	if (dev->if_addrlen != ETH_ALEN)
546 		return -1;
547 	memcpy(eui, IF_LLADDR(dev), 3);
548 	memcpy(eui + 5, IF_LLADDR(dev) + 3, 3);
549 
550 	/* NOTE: The scope ID is added by the GID to IP conversion */
551 
552 	eui[3] = 0xFF;
553 	eui[4] = 0xFE;
554 	eui[0] ^= 2;
555 	return 0;
556 }
557 
558 static void
559 mlx5_make_default_gid(struct net_device *dev, union ib_gid *gid)
560 {
561 	gid->global.subnet_prefix = cpu_to_be64(0xfe80000000000000LL);
562 	mlx5_addrconf_ifid_eui48(&gid->raw[8], dev);
563 }
564 
565 static inline int
566 mlx5_ip2gid(const struct sockaddr *addr, union ib_gid *gid)
567 {
568 	switch (addr->sa_family) {
569 	case AF_INET:
570 		ipv6_addr_set_v4mapped(((const struct sockaddr_in *)addr)->sin_addr.s_addr,
571 		    (struct in6_addr *)gid->raw);
572 		break;
573 	case AF_INET6:
574 		memcpy(gid->raw, &((const struct sockaddr_in6 *)addr)->sin6_addr, 16);
575 		/* clear SCOPE ID */
576 		gid->raw[2] = 0;
577 		gid->raw[3] = 0;
578 		break;
579 	default:
580 		return -EINVAL;
581 	}
582 	return 0;
583 }
584 
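/*
 * Per-port kthread that keeps the RoCE GID table in sync with the
 * network stack.  Once a second it recomputes the desired table:
 * entry 0 is the default link-local GID derived from the MAC, followed
 * by one GID per IPv4/IPv6 address configured on the Ethernet ifnet and
 * on any VLAN interfaces stacked on it.  Changed entries are pushed to
 * firmware via modify_gid_roce() and an IB_EVENT_GID_CHANGE event is
 * dispatched so ibcore refreshes its cache.  The loop exits once
 * port_gone is set by the teardown path.
 */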
585 static void
586 mlx5_ib_roce_port_update(void *arg)
587 {
588 	struct mlx5_ib_port *port = (struct mlx5_ib_port *)arg;
589 	struct mlx5_ib_dev *dev = port->dev;
590 	struct mlx5_core_dev *mdev = dev->mdev;
591 	struct net_device *xdev[MLX5_IB_GID_MAX];
592 	struct net_device *idev;
593 	struct net_device *ndev;
594 	struct ifaddr *ifa;
595 	union ib_gid gid_temp;
596 
597 	while (port->port_gone == 0) {
598 		int update = 0;
599 		int gid_index = 0;
600 		int j;
601 		int error;
602 
603 		ndev = mlx5_get_protocol_dev(mdev, MLX5_INTERFACE_PROTOCOL_ETH);
604 		if (ndev == NULL) {
605 			pause("W", hz);
606 			continue;
607 		}
608 
609 		CURVNET_SET_QUIET(ndev->if_vnet);
610 
611 		memset(&gid_temp, 0, sizeof(gid_temp));
612 		mlx5_make_default_gid(ndev, &gid_temp);
613 		if (bcmp(&gid_temp, &port->gid_table[gid_index], sizeof(gid_temp))) {
614 			port->gid_table[gid_index] = gid_temp;
615 			update = 1;
616 		}
617 		xdev[gid_index] = ndev;
618 		gid_index++;
619 
620 		IFNET_RLOCK();
621 		TAILQ_FOREACH(idev, &V_ifnet, if_link) {
622 			if (idev == ndev)
623 				break;
624 		}
625 		if (idev != NULL) {
626 		    TAILQ_FOREACH(idev, &V_ifnet, if_link) {
627 			if (idev != ndev) {
628 				if (idev->if_type != IFT_L2VLAN)
629 					continue;
630 				if (ndev != rdma_vlan_dev_real_dev(idev))
631 					continue;
632 			}
633 			/* clone address information for IPv4 and IPv6 */
634 			IF_ADDR_RLOCK(idev);
635 			TAILQ_FOREACH(ifa, &idev->if_addrhead, ifa_link) {
636 				if (ifa->ifa_addr == NULL ||
637 				    (ifa->ifa_addr->sa_family != AF_INET &&
638 				     ifa->ifa_addr->sa_family != AF_INET6) ||
639 				    gid_index >= MLX5_IB_GID_MAX)
640 					continue;
641 				memset(&gid_temp, 0, sizeof(gid_temp));
642 				mlx5_ip2gid(ifa->ifa_addr, &gid_temp);
643 				/* check for existing entry */
644 				for (j = 0; j != gid_index; j++) {
645 					if (bcmp(&gid_temp, &port->gid_table[j], sizeof(gid_temp)) == 0)
646 						break;
647 				}
648 				/* check if new entry must be added */
649 				if (j == gid_index) {
650 					if (bcmp(&gid_temp, &port->gid_table[gid_index], sizeof(gid_temp))) {
651 						port->gid_table[gid_index] = gid_temp;
652 						update = 1;
653 					}
654 					xdev[gid_index] = idev;
655 					gid_index++;
656 				}
657 			}
658 			IF_ADDR_RUNLOCK(idev);
659 		    }
660 		}
661 		IFNET_RUNLOCK();
662 		CURVNET_RESTORE();
663 
664 		if (update != 0 &&
665 		    mlx5_ib_port_link_layer(&dev->ib_dev, 1) == IB_LINK_LAYER_ETHERNET) {
666 			struct ib_event event = {
667 			    .device = &dev->ib_dev,
668 			    .element.port_num = port->port_num + 1,
669 			    .event = IB_EVENT_GID_CHANGE,
670 			};
671 
672 			/* add new entries, if any */
673 			for (j = 0; j != gid_index; j++) {
674 				error = modify_gid_roce(&dev->ib_dev, port->port_num, j,
675 				    port->gid_table + j, xdev[j]);
676 				if (error != 0)
677 					printf("mlx5_ib: Failed to update ROCE GID table: %d\n", error);
678 			}
679 			memset(&gid_temp, 0, sizeof(gid_temp));
680 
681 			/* clear old entries, if any */
682 			for (; j != MLX5_IB_GID_MAX; j++) {
683 				if (bcmp(&gid_temp, port->gid_table + j, sizeof(gid_temp)) == 0)
684 					continue;
685 				port->gid_table[j] = gid_temp;
686 				(void) modify_gid_roce(&dev->ib_dev, port->port_num, j,
687 				    port->gid_table + j, ndev);
688 			}
689 
690 			/* make sure ibcore gets updated */
691 			ib_dispatch_event(&event);
692 		}
693 		pause("W", hz);
694 	}
695 	do {
696 		struct ib_event event = {
697 			.device = &dev->ib_dev,
698 			.element.port_num = port->port_num + 1,
699 			.event = IB_EVENT_GID_CHANGE,
700 		};
701 		/* make sure ibcore gets updated */
702 		ib_dispatch_event(&event);
703 
704 		/* wait a bit */
705 		pause("W", hz);
706 	} while (0);
707 	port->port_gone = 2;
708 	kthread_exit();
709 }
710 
711 static int mlx5_ib_query_gid(struct ib_device *ibdev, u8 port, int index,
712 			     union ib_gid *gid)
713 {
714 	struct mlx5_ib_dev *dev = to_mdev(ibdev);
715 	struct mlx5_core_dev *mdev = dev->mdev;
716 
717 	switch (mlx5_get_vport_access_method(ibdev)) {
718 	case MLX5_VPORT_ACCESS_METHOD_MAD:
719 		return mlx5_query_gids_mad_ifc(ibdev, port, index, gid);
720 
721 	case MLX5_VPORT_ACCESS_METHOD_HCA:
722 		return mlx5_query_hca_vport_gid(mdev, port, 0, index, gid);
723 
724 	case MLX5_VPORT_ACCESS_METHOD_NIC:
725 		if (port == 0 || port > MLX5_CAP_GEN(mdev, num_ports) ||
726 		    index < 0 || index >= MLX5_IB_GID_MAX ||
727 		    dev->port[port - 1].port_gone != 0)
728 			memset(gid, 0, sizeof(*gid));
729 		else
730 			*gid = dev->port[port - 1].gid_table[index];
731 		return 0;
732 
733 	default:
734 		return -EINVAL;
735 	}
736 }
737 
738 static int mlx5_ib_query_pkey(struct ib_device *ibdev, u8 port, u16 index,
739 			      u16 *pkey)
740 {
741 	struct mlx5_ib_dev *dev = to_mdev(ibdev);
742 	struct mlx5_core_dev *mdev = dev->mdev;
743 
744 	switch (mlx5_get_vport_access_method(ibdev)) {
745 	case MLX5_VPORT_ACCESS_METHOD_MAD:
746 		return mlx5_query_pkey_mad_ifc(ibdev, port, index, pkey);
747 
748 	case MLX5_VPORT_ACCESS_METHOD_HCA:
749 	case MLX5_VPORT_ACCESS_METHOD_NIC:
750 		return mlx5_query_hca_vport_pkey(mdev, 0, port, 0, index,
751 						 pkey);
752 
753 	default:
754 		return -EINVAL;
755 	}
756 }
757 
758 static int mlx5_ib_modify_device(struct ib_device *ibdev, int mask,
759 				 struct ib_device_modify *props)
760 {
761 	struct mlx5_ib_dev *dev = to_mdev(ibdev);
762 	struct mlx5_reg_node_desc in;
763 	struct mlx5_reg_node_desc out;
764 	int err;
765 
766 	if (mask & ~IB_DEVICE_MODIFY_NODE_DESC)
767 		return -EOPNOTSUPP;
768 
769 	if (!(mask & IB_DEVICE_MODIFY_NODE_DESC))
770 		return 0;
771 
772 	/*
773 	 * If possible, pass node desc to FW, so it can generate
774 	 * a 144 trap.  If cmd fails, just ignore.
775 	 */
776 	memcpy(&in, props->node_desc, 64);
777 	err = mlx5_core_access_reg(dev->mdev, &in, sizeof(in), &out,
778 				   sizeof(out), MLX5_REG_NODE_DESC, 0, 1);
779 	if (err)
780 		return err;
781 
782 	memcpy(ibdev->node_desc, props->node_desc, 64);
783 
784 	return err;
785 }
786 
787 static int mlx5_ib_modify_port(struct ib_device *ibdev, u8 port, int mask,
788 			       struct ib_port_modify *props)
789 {
790 	u8 is_eth = (mlx5_ib_port_link_layer(ibdev, port) ==
791 		     IB_LINK_LAYER_ETHERNET);
792 	struct mlx5_ib_dev *dev = to_mdev(ibdev);
793 	struct ib_port_attr attr;
794 	u32 tmp;
795 	int err;
796 
797 	/* return OK if this is RoCE. CM calls ib_modify_port() regardless
798 	 * of whether port link layer is ETH or IB. For ETH ports, qkey
799 	 * violations and port capabilities are not valid.
800 	 */
801 	if (is_eth)
802 		return 0;
803 
804 	mutex_lock(&dev->cap_mask_mutex);
805 
806 	err = mlx5_ib_query_port(ibdev, port, &attr);
807 	if (err)
808 		goto out;
809 
810 	tmp = (attr.port_cap_flags | props->set_port_cap_mask) &
811 		~props->clr_port_cap_mask;
812 
813 	err = mlx5_set_port_caps(dev->mdev, port, tmp);
814 
815 out:
816 	mutex_unlock(&dev->cap_mask_mutex);
817 	return err;
818 }
819 
820 enum mlx5_cap_flags {
821 	MLX5_CAP_COMPACT_AV = 1 << 0,
822 };
823 
824 static void set_mlx5_flags(u32 *flags, struct mlx5_core_dev *dev)
825 {
826 	*flags |= MLX5_CAP_GEN(dev, compact_address_vector) ?
827 		  MLX5_CAP_COMPACT_AV : 0;
828 }
829 
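/*
 * Allocate a user context.  The requested number of UUARs is rounded up
 * to a whole number of UAR pages, the UARs are allocated from firmware,
 * and a bitmap tracks which blue-flame registers are free.  Within each
 * group of four registers the last two are set aside for fast-path
 * (low-latency) use, so they are pre-marked in the bitmap and never
 * handed out by the general allocator.  The response also carries the
 * device limits (QP table size, BF register size, maximum WQE sizes)
 * that the userspace provider needs; RoCE contexts additionally get a
 * transport domain.
 */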
830 static struct ib_ucontext *mlx5_ib_alloc_ucontext(struct ib_device *ibdev,
831 						  struct ib_udata *udata)
832 {
833 	struct mlx5_ib_dev *dev = to_mdev(ibdev);
834 	struct mlx5_ib_alloc_ucontext_req_v2 req;
835 	struct mlx5_ib_alloc_ucontext_resp resp;
836 	struct mlx5_ib_ucontext *context;
837 	struct mlx5_uuar_info *uuari;
838 	struct mlx5_uar *uars;
839 	int gross_uuars;
840 	int num_uars;
841 	int ver;
842 	int uuarn;
843 	int err;
844 	int i;
845 	size_t reqlen;
846 
847 	if (!dev->ib_active)
848 		return ERR_PTR(-EAGAIN);
849 
850 	memset(&req, 0, sizeof(req));
851 	memset(&resp, 0, sizeof(resp));
852 
853 	reqlen = udata->inlen - sizeof(struct ib_uverbs_cmd_hdr);
854 	if (reqlen == sizeof(struct mlx5_ib_alloc_ucontext_req))
855 		ver = 0;
856 	else if (reqlen == sizeof(struct mlx5_ib_alloc_ucontext_req_v2))
857 		ver = 2;
858 	else {
859 		mlx5_ib_err(dev, "request malformed, reqlen: %ld\n", (long)reqlen);
860 		return ERR_PTR(-EINVAL);
861 	}
862 
863 	err = ib_copy_from_udata(&req, udata, reqlen);
864 	if (err) {
865 		mlx5_ib_err(dev, "copy failed\n");
866 		return ERR_PTR(err);
867 	}
868 
869 	if (req.reserved) {
870 		mlx5_ib_err(dev, "request corrupted\n");
871 		return ERR_PTR(-EINVAL);
872 	}
873 
874 	if (req.total_num_uuars == 0 || req.total_num_uuars > MLX5_MAX_UUARS) {
875 		mlx5_ib_warn(dev, "wrong num_uuars: %d\n", req.total_num_uuars);
876 		return ERR_PTR(-ENOMEM);
877 	}
878 
879 	req.total_num_uuars = ALIGN(req.total_num_uuars,
880 				    MLX5_NON_FP_BF_REGS_PER_PAGE);
881 	if (req.num_low_latency_uuars > req.total_num_uuars - 1) {
882 		mlx5_ib_warn(dev, "wrong num_low_latency_uuars: %d ( > %d)\n",
883 			     req.num_low_latency_uuars, req.total_num_uuars - 1);
884 		return ERR_PTR(-EINVAL);
885 	}
886 
887 	num_uars = req.total_num_uuars / MLX5_NON_FP_BF_REGS_PER_PAGE;
888 	gross_uuars = num_uars * MLX5_BF_REGS_PER_PAGE;
889 	resp.qp_tab_size = 1 << MLX5_CAP_GEN(dev->mdev, log_max_qp);
890 	if (mlx5_core_is_pf(dev->mdev) && MLX5_CAP_GEN(dev->mdev, bf))
891 		resp.bf_reg_size = 1 << MLX5_CAP_GEN(dev->mdev, log_bf_reg_size);
892 	resp.cache_line_size = L1_CACHE_BYTES;
893 	resp.max_sq_desc_sz = MLX5_CAP_GEN(dev->mdev, max_wqe_sz_sq);
894 	resp.max_rq_desc_sz = MLX5_CAP_GEN(dev->mdev, max_wqe_sz_rq);
895 	resp.max_send_wqebb = 1 << MLX5_CAP_GEN(dev->mdev, log_max_qp_sz);
896 	resp.max_recv_wr = 1 << MLX5_CAP_GEN(dev->mdev, log_max_qp_sz);
897 	resp.max_srq_recv_wr = 1 << MLX5_CAP_GEN(dev->mdev, log_max_srq_sz);
898 	set_mlx5_flags(&resp.flags, dev->mdev);
899 
900 	if (offsetof(struct mlx5_ib_alloc_ucontext_resp, max_desc_sz_sq_dc) < udata->outlen)
901 		resp.max_desc_sz_sq_dc = MLX5_CAP_GEN(dev->mdev, max_wqe_sz_sq_dc);
902 
903 	if (offsetof(struct mlx5_ib_alloc_ucontext_resp, atomic_arg_sizes_dc) < udata->outlen)
904 		resp.atomic_arg_sizes_dc = MLX5_CAP_ATOMIC(dev->mdev, atomic_size_dc);
905 
906 	context = kzalloc(sizeof(*context), GFP_KERNEL);
907 	if (!context)
908 		return ERR_PTR(-ENOMEM);
909 
910 	uuari = &context->uuari;
911 	mutex_init(&uuari->lock);
912 	uars = kcalloc(num_uars, sizeof(*uars), GFP_KERNEL);
913 	if (!uars) {
914 		err = -ENOMEM;
915 		goto out_ctx;
916 	}
917 
918 	uuari->bitmap = kcalloc(BITS_TO_LONGS(gross_uuars),
919 				sizeof(*uuari->bitmap),
920 				GFP_KERNEL);
921 	if (!uuari->bitmap) {
922 		err = -ENOMEM;
923 		goto out_uar_ctx;
924 	}
925 	/*
926 	 * reserve all fast path uuars by marking them used in the bitmap
927 	 */
928 	for (i = 0; i < gross_uuars; i++) {
929 		uuarn = i & 3;
930 		if (uuarn == 2 || uuarn == 3)
931 			set_bit(i, uuari->bitmap);
932 	}
933 
934 	uuari->count = kcalloc(gross_uuars, sizeof(*uuari->count), GFP_KERNEL);
935 	if (!uuari->count) {
936 		err = -ENOMEM;
937 		goto out_bitmap;
938 	}
939 
940 	for (i = 0; i < num_uars; i++) {
941 		err = mlx5_cmd_alloc_uar(dev->mdev, &uars[i].index);
942 		if (err) {
943 			mlx5_ib_err(dev, "uar alloc failed at %d\n", i);
944 			goto out_uars;
945 		}
946 	}
947 	for (i = 0; i < MLX5_IB_MAX_CTX_DYNAMIC_UARS; i++)
948 		context->dynamic_wc_uar_index[i] = MLX5_IB_INVALID_UAR_INDEX;
949 
950 	INIT_LIST_HEAD(&context->db_page_list);
951 	mutex_init(&context->db_page_mutex);
952 
953 	resp.tot_uuars = req.total_num_uuars;
954 	resp.num_ports = MLX5_CAP_GEN(dev->mdev, num_ports);
955 	err = ib_copy_to_udata(udata, &resp,
956 			       min_t(size_t, udata->outlen, sizeof(resp)));
957 	if (err)
958 		goto out_uars;
959 
960 	uuari->ver = ver;
961 	uuari->num_low_latency_uuars = req.num_low_latency_uuars;
962 	uuari->uars = uars;
963 	uuari->num_uars = num_uars;
964 
965 	if (mlx5_ib_port_link_layer(&dev->ib_dev, 1) ==
966 	    IB_LINK_LAYER_ETHERNET) {
967 		err = mlx5_alloc_transport_domain(dev->mdev, &context->tdn);
968 		if (err)
969 			goto out_uars;
970 	}
971 
972 	return &context->ibucontext;
973 
974 out_uars:
975 	for (i--; i >= 0; i--)
976 		mlx5_cmd_free_uar(dev->mdev, uars[i].index);
977 	kfree(uuari->count);
978 
979 out_bitmap:
980 	kfree(uuari->bitmap);
981 
982 out_uar_ctx:
983 	kfree(uars);
984 
985 out_ctx:
986 	kfree(context);
987 	return ERR_PTR(err);
988 }
989 
990 static int mlx5_ib_dealloc_ucontext(struct ib_ucontext *ibcontext)
991 {
992 	struct mlx5_ib_ucontext *context = to_mucontext(ibcontext);
993 	struct mlx5_ib_dev *dev = to_mdev(ibcontext->device);
994 	struct mlx5_uuar_info *uuari = &context->uuari;
995 	int i;
996 
997 	if (mlx5_ib_port_link_layer(&dev->ib_dev, 1) ==
998 	    IB_LINK_LAYER_ETHERNET)
999 		mlx5_dealloc_transport_domain(dev->mdev, context->tdn);
1000 
1001 	for (i = 0; i < uuari->num_uars; i++) {
1002 		if (mlx5_cmd_free_uar(dev->mdev, uuari->uars[i].index))
1003 			mlx5_ib_warn(dev, "failed to free UAR 0x%x\n", uuari->uars[i].index);
1004 	}
1005 	for (i = 0; i < MLX5_IB_MAX_CTX_DYNAMIC_UARS; i++) {
1006 		if (context->dynamic_wc_uar_index[i] != MLX5_IB_INVALID_UAR_INDEX)
1007 			mlx5_cmd_free_uar(dev->mdev, context->dynamic_wc_uar_index[i]);
1008 	}
1009 
1010 	kfree(uuari->count);
1011 	kfree(uuari->bitmap);
1012 	kfree(uuari->uars);
1013 	kfree(context);
1014 
1015 	return 0;
1016 }
1017 
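/*
 * mmap support: the mmap offset encodes a command in the bits above
 * MLX5_IB_MMAP_CMD_SHIFT and an argument (here the index into the
 * context's UAR array) in the bits below it.  uar_index2pfn() turns a
 * firmware UAR index into a page frame number inside BAR 0, and
 * uar_mmap() maps exactly one such page, write-combining or uncached
 * depending on the command.
 */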
1018 static phys_addr_t uar_index2pfn(struct mlx5_ib_dev *dev, int index)
1019 {
1020 	return (pci_resource_start(dev->mdev->pdev, 0) >> PAGE_SHIFT) + index;
1021 }
1022 
1023 static int get_command(unsigned long offset)
1024 {
1025 	return (offset >> MLX5_IB_MMAP_CMD_SHIFT) & MLX5_IB_MMAP_CMD_MASK;
1026 }
1027 
1028 static int get_arg(unsigned long offset)
1029 {
1030 	return offset & ((1 << MLX5_IB_MMAP_CMD_SHIFT) - 1);
1031 }
1032 
1033 static int get_index(unsigned long offset)
1034 {
1035 	return get_arg(offset);
1036 }
1037 
1038 static int uar_mmap(struct vm_area_struct *vma, pgprot_t prot, bool is_wc,
1039 		    struct mlx5_uuar_info *uuari, struct mlx5_ib_dev *dev,
1040 		    struct mlx5_ib_ucontext *context)
1041 {
1042 	unsigned long idx;
1043 	phys_addr_t pfn;
1044 
1045 	if (vma->vm_end - vma->vm_start != PAGE_SIZE) {
1046 		mlx5_ib_warn(dev, "wrong size, expected PAGE_SIZE(%ld) got %ld\n",
1047 			     (long)PAGE_SIZE, (long)(vma->vm_end - vma->vm_start));
1048 		return -EINVAL;
1049 	}
1050 
1051 	idx = get_index(vma->vm_pgoff);
1052 	if (idx >= uuari->num_uars) {
1053 		mlx5_ib_warn(dev, "wrong offset, idx:%ld num_uars:%d\n",
1054 			     idx, uuari->num_uars);
1055 		return -EINVAL;
1056 	}
1057 
1058 	pfn = uar_index2pfn(dev, uuari->uars[idx].index);
1059 	mlx5_ib_dbg(dev, "uar idx 0x%lx, pfn 0x%llx\n", idx,
1060 		    (unsigned long long)pfn);
1061 
1062 	vma->vm_page_prot = prot;
1063 	if (io_remap_pfn_range(vma, vma->vm_start, pfn,
1064 			       PAGE_SIZE, vma->vm_page_prot)) {
1065 		mlx5_ib_err(dev, "io remap failed\n");
1066 		return -EAGAIN;
1067 	}
1068 
1069 	mlx5_ib_dbg(dev, "mapped %s at 0x%lx, PA 0x%llx\n", is_wc ? "WC" : "NC",
1070 		    (long)vma->vm_start, (unsigned long long)pfn << PAGE_SHIFT);
1071 
1072 	return 0;
1073 }
1074 
1075 static int mlx5_ib_mmap(struct ib_ucontext *ibcontext, struct vm_area_struct *vma)
1076 {
1077 	struct mlx5_ib_ucontext *context = to_mucontext(ibcontext);
1078 	struct mlx5_ib_dev *dev = to_mdev(ibcontext->device);
1079 	struct mlx5_uuar_info *uuari = &context->uuari;
1080 	unsigned long command;
1081 
1082 	command = get_command(vma->vm_pgoff);
1083 	switch (command) {
1084 	case MLX5_IB_MMAP_REGULAR_PAGE:
1085 		return uar_mmap(vma, pgprot_writecombine(vma->vm_page_prot),
1086 				true,
1087 				uuari, dev, context);
1088 
1089 		break;
1090 
1091 	case MLX5_IB_MMAP_WC_PAGE:
1092 		return uar_mmap(vma, pgprot_writecombine(vma->vm_page_prot),
1093 				true, uuari, dev, context);
1094 		break;
1095 
1096 	case MLX5_IB_MMAP_NC_PAGE:
1097 		return uar_mmap(vma, pgprot_noncached(vma->vm_page_prot),
1098 				false, uuari, dev, context);
1099 		break;
1100 
1101 	default:
1102 		return -EINVAL;
1103 	}
1104 
1105 	return 0;
1106 }
1107 
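/*
 * Create a physical-address memory key for a kernel PD: access mode PA
 * together with MLX5_MKEY_LEN64 and a zero start address makes the key
 * cover the whole physical address space for local read, so kernel
 * consumers of the PD get a local lkey without per-buffer registration.
 */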
1108 static int alloc_pa_mkey(struct mlx5_ib_dev *dev, u32 *key, u32 pdn)
1109 {
1110 	struct mlx5_create_mkey_mbox_in *in;
1111 	struct mlx5_mkey_seg *seg;
1112 	struct mlx5_core_mr mr;
1113 	int err;
1114 
1115 	in = kzalloc(sizeof(*in), GFP_KERNEL);
1116 	if (!in)
1117 		return -ENOMEM;
1118 
1119 	seg = &in->seg;
1120 	seg->flags = MLX5_PERM_LOCAL_READ | MLX5_ACCESS_MODE_PA;
1121 	seg->flags_pd = cpu_to_be32(pdn | MLX5_MKEY_LEN64);
1122 	seg->qpn_mkey7_0 = cpu_to_be32(0xffffff << 8);
1123 	seg->start_addr = 0;
1124 
1125 	err = mlx5_core_create_mkey(dev->mdev, &mr, in, sizeof(*in),
1126 				    NULL, NULL, NULL);
1127 	if (err) {
1128 		mlx5_ib_warn(dev, "failed to create mkey, %d\n", err);
1129 		goto err_in;
1130 	}
1131 
1132 	kfree(in);
1133 	*key = mr.key;
1134 
1135 	return 0;
1136 
1137 err_in:
1138 	kfree(in);
1139 
1140 	return err;
1141 }
1142 
1143 static void free_pa_mkey(struct mlx5_ib_dev *dev, u32 key)
1144 {
1145 	struct mlx5_core_mr mr;
1146 	int err;
1147 
1148 	memset(&mr, 0, sizeof(mr));
1149 	mr.key = key;
1150 	err = mlx5_core_destroy_mkey(dev->mdev, &mr);
1151 	if (err)
1152 		mlx5_ib_warn(dev, "failed to destroy mkey 0x%x\n", key);
1153 }
1154 
1155 static struct ib_pd *mlx5_ib_alloc_pd(struct ib_device *ibdev,
1156 				      struct ib_ucontext *context,
1157 				      struct ib_udata *udata)
1158 {
1159 	struct mlx5_ib_dev *dev = to_mdev(ibdev);
1160 	struct mlx5_ib_alloc_pd_resp resp;
1161 	struct mlx5_ib_pd *pd;
1162 	int err;
1163 
1164 	pd = kmalloc(sizeof(*pd), GFP_KERNEL);
1165 	if (!pd)
1166 		return ERR_PTR(-ENOMEM);
1167 
1168 	err = mlx5_core_alloc_pd(to_mdev(ibdev)->mdev, &pd->pdn);
1169 	if (err) {
1170 		mlx5_ib_warn(dev, "pd alloc failed\n");
1171 		kfree(pd);
1172 		return ERR_PTR(err);
1173 	}
1174 
1175 	if (context) {
1176 		resp.pdn = pd->pdn;
1177 		if (ib_copy_to_udata(udata, &resp, sizeof(resp))) {
1178 			mlx5_ib_err(dev, "copy failed\n");
1179 			mlx5_core_dealloc_pd(to_mdev(ibdev)->mdev, pd->pdn);
1180 			kfree(pd);
1181 			return ERR_PTR(-EFAULT);
1182 		}
1183 	} else {
1184 		err = alloc_pa_mkey(to_mdev(ibdev), &pd->pa_lkey, pd->pdn);
1185 		if (err) {
1186 			mlx5_ib_err(dev, "alloc mkey failed\n");
1187 			mlx5_core_dealloc_pd(to_mdev(ibdev)->mdev, pd->pdn);
1188 			kfree(pd);
1189 			return ERR_PTR(err);
1190 		}
1191 	}
1192 
1193 	return &pd->ibpd;
1194 }
1195 
1196 static int mlx5_ib_dealloc_pd(struct ib_pd *pd)
1197 {
1198 	struct mlx5_ib_dev *mdev = to_mdev(pd->device);
1199 	struct mlx5_ib_pd *mpd = to_mpd(pd);
1200 
1201 	if (!pd->uobject)
1202 		free_pa_mkey(mdev, mpd->pa_lkey);
1203 
1204 	mlx5_core_dealloc_pd(mdev->mdev, mpd->pdn);
1205 	kfree(mpd);
1206 
1207 	return 0;
1208 }
1209 
1210 static int mlx5_ib_mcg_attach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid)
1211 {
1212 	struct mlx5_ib_dev *dev = to_mdev(ibqp->device);
1213 	int err;
1214 
1215 	if (ibqp->qp_type == IB_QPT_RAW_PACKET)
1216 		err = -EOPNOTSUPP;
1217 	else
1218 		err = mlx5_core_attach_mcg(dev->mdev, gid, ibqp->qp_num);
1219 	if (err)
1220 		mlx5_ib_warn(dev, "failed attaching QPN 0x%x, MGID %pI6\n",
1221 			     ibqp->qp_num, gid->raw);
1222 
1223 	return err;
1224 }
1225 
1226 static int mlx5_ib_mcg_detach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid)
1227 {
1228 	struct mlx5_ib_dev *dev = to_mdev(ibqp->device);
1229 	int err;
1230 
1231 	if (ibqp->qp_type == IB_QPT_RAW_PACKET)
1232 		err = -EOPNOTSUPP;
1233 	else
1234 		err = mlx5_core_detach_mcg(dev->mdev, gid, ibqp->qp_num);
1235 	if (err)
1236 		mlx5_ib_warn(dev, "failed detaching QPN 0x%x, MGID %pI6\n",
1237 			     ibqp->qp_num, gid->raw);
1238 
1239 	return err;
1240 }
1241 
1242 static int init_node_data(struct mlx5_ib_dev *dev)
1243 {
1244 	int err;
1245 
1246 	err = mlx5_query_node_desc(dev, dev->ib_dev.node_desc);
1247 	if (err)
1248 		return err;
1249 
1250 	return mlx5_query_node_guid(dev, &dev->ib_dev.node_guid);
1251 }
1252 
1253 static ssize_t show_fw_pages(struct device *device, struct device_attribute *attr,
1254 			     char *buf)
1255 {
1256 	struct mlx5_ib_dev *dev =
1257 		container_of(device, struct mlx5_ib_dev, ib_dev.dev);
1258 
1259 	return sprintf(buf, "%lld\n", (long long)dev->mdev->priv.fw_pages);
1260 }
1261 
1262 static ssize_t show_reg_pages(struct device *device,
1263 			      struct device_attribute *attr, char *buf)
1264 {
1265 	struct mlx5_ib_dev *dev =
1266 		container_of(device, struct mlx5_ib_dev, ib_dev.dev);
1267 
1268 	return sprintf(buf, "%d\n", atomic_read(&dev->mdev->priv.reg_pages));
1269 }
1270 
1271 static ssize_t show_hca(struct device *device, struct device_attribute *attr,
1272 			char *buf)
1273 {
1274 	struct mlx5_ib_dev *dev =
1275 		container_of(device, struct mlx5_ib_dev, ib_dev.dev);
1276 	return sprintf(buf, "MT%d\n", dev->mdev->pdev->device);
1277 }
1278 
1279 static ssize_t show_fw_ver(struct device *device, struct device_attribute *attr,
1280 			   char *buf)
1281 {
1282 	struct mlx5_ib_dev *dev =
1283 		container_of(device, struct mlx5_ib_dev, ib_dev.dev);
1284 	return sprintf(buf, "%d.%d.%04d\n", fw_rev_maj(dev->mdev),
1285 		       fw_rev_min(dev->mdev), fw_rev_sub(dev->mdev));
1286 }
1287 
1288 static ssize_t show_rev(struct device *device, struct device_attribute *attr,
1289 			char *buf)
1290 {
1291 	struct mlx5_ib_dev *dev =
1292 		container_of(device, struct mlx5_ib_dev, ib_dev.dev);
1293 	return sprintf(buf, "%x\n", (unsigned)dev->mdev->pdev->revision);
1294 }
1295 
1296 static ssize_t show_board(struct device *device, struct device_attribute *attr,
1297 			  char *buf)
1298 {
1299 	struct mlx5_ib_dev *dev =
1300 		container_of(device, struct mlx5_ib_dev, ib_dev.dev);
1301 	return sprintf(buf, "%.*s\n", MLX5_BOARD_ID_LEN,
1302 		       dev->mdev->board_id);
1303 }
1304 
1305 static DEVICE_ATTR(hw_rev,   S_IRUGO, show_rev,    NULL);
1306 static DEVICE_ATTR(fw_ver,   S_IRUGO, show_fw_ver, NULL);
1307 static DEVICE_ATTR(hca_type, S_IRUGO, show_hca,    NULL);
1308 static DEVICE_ATTR(board_id, S_IRUGO, show_board,  NULL);
1309 static DEVICE_ATTR(fw_pages, S_IRUGO, show_fw_pages, NULL);
1310 static DEVICE_ATTR(reg_pages, S_IRUGO, show_reg_pages, NULL);
1311 
1312 static struct device_attribute *mlx5_class_attributes[] = {
1313 	&dev_attr_hw_rev,
1314 	&dev_attr_fw_ver,
1315 	&dev_attr_hca_type,
1316 	&dev_attr_board_id,
1317 	&dev_attr_fw_pages,
1318 	&dev_attr_reg_pages,
1319 };
1320 
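/*
 * Fatal-error recovery helper: walk every QP on this device (under
 * reset_flow_resource_lock, which serializes against QP create and
 * destroy) and, for each send or receive queue that still has
 * outstanding work, collect its CQ on a local list.  After the scan,
 * each collected CQ's completion handler is invoked so consumers see
 * their pending work flushed with errors instead of hanging.
 */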
1321 static void mlx5_ib_handle_internal_error(struct mlx5_ib_dev *ibdev)
1322 {
1323 	struct mlx5_ib_qp *mqp;
1324 	struct mlx5_ib_cq *send_mcq, *recv_mcq;
1325 	struct mlx5_core_cq *mcq;
1326 	struct list_head cq_armed_list;
1327 	unsigned long flags_qp;
1328 	unsigned long flags_cq;
1329 	unsigned long flags;
1330 
1331 	mlx5_ib_warn(ibdev, " started\n");
1332 	INIT_LIST_HEAD(&cq_armed_list);
1333 
1334 	/* Go over the QP list residing on this ibdev, synchronizing with QP create/destroy. */
1335 	spin_lock_irqsave(&ibdev->reset_flow_resource_lock, flags);
1336 	list_for_each_entry(mqp, &ibdev->qp_list, qps_list) {
1337 		spin_lock_irqsave(&mqp->sq.lock, flags_qp);
1338 		if (mqp->sq.tail != mqp->sq.head) {
1339 			send_mcq = to_mcq(mqp->ibqp.send_cq);
1340 			spin_lock_irqsave(&send_mcq->lock, flags_cq);
1341 			if (send_mcq->mcq.comp &&
1342 			    mqp->ibqp.send_cq->comp_handler) {
1343 				if (!send_mcq->mcq.reset_notify_added) {
1344 					send_mcq->mcq.reset_notify_added = 1;
1345 					list_add_tail(&send_mcq->mcq.reset_notify,
1346 						      &cq_armed_list);
1347 				}
1348 			}
1349 			spin_unlock_irqrestore(&send_mcq->lock, flags_cq);
1350 		}
1351 		spin_unlock_irqrestore(&mqp->sq.lock, flags_qp);
1352 		spin_lock_irqsave(&mqp->rq.lock, flags_qp);
1353 		/* no handling is needed for SRQ */
1354 		if (!mqp->ibqp.srq) {
1355 			if (mqp->rq.tail != mqp->rq.head) {
1356 				recv_mcq = to_mcq(mqp->ibqp.recv_cq);
1357 				spin_lock_irqsave(&recv_mcq->lock, flags_cq);
1358 				if (recv_mcq->mcq.comp &&
1359 				    mqp->ibqp.recv_cq->comp_handler) {
1360 					if (!recv_mcq->mcq.reset_notify_added) {
1361 						recv_mcq->mcq.reset_notify_added = 1;
1362 						list_add_tail(&recv_mcq->mcq.reset_notify,
1363 							      &cq_armed_list);
1364 					}
1365 				}
1366 				spin_unlock_irqrestore(&recv_mcq->lock,
1367 						       flags_cq);
1368 			}
1369 		}
1370 		spin_unlock_irqrestore(&mqp->rq.lock, flags_qp);
1371 	}
1372 	/* At this point all in-flight post_send calls have been flushed out
1373 	 * by taking and releasing the locks above.  Now arm all involved CQs.
1374 	 */
1375 	list_for_each_entry(mcq, &cq_armed_list, reset_notify) {
1376 		mcq->comp(mcq);
1377 	}
1378 	spin_unlock_irqrestore(&ibdev->reset_flow_resource_lock, flags);
1379 	mlx5_ib_warn(ibdev, " ended\n");
1380 	return;
1381 }
1382 
1383 static void mlx5_ib_event(struct mlx5_core_dev *dev, void *context,
1384 			  enum mlx5_dev_event event, unsigned long param)
1385 {
1386 	struct mlx5_ib_dev *ibdev = (struct mlx5_ib_dev *)context;
1387 	struct ib_event ibev;
1388 
1389 	u8 port = 0;
1390 
1391 	switch (event) {
1392 	case MLX5_DEV_EVENT_SYS_ERROR:
1393 		ibdev->ib_active = false;
1394 		ibev.event = IB_EVENT_DEVICE_FATAL;
1395 		mlx5_ib_handle_internal_error(ibdev);
1396 		break;
1397 
1398 	case MLX5_DEV_EVENT_PORT_UP:
1399 		ibev.event = IB_EVENT_PORT_ACTIVE;
1400 		port = (u8)param;
1401 		break;
1402 
1403 	case MLX5_DEV_EVENT_PORT_DOWN:
1404 	case MLX5_DEV_EVENT_PORT_INITIALIZED:
1405 		ibev.event = IB_EVENT_PORT_ERR;
1406 		port = (u8)param;
1407 		break;
1408 
1409 	case MLX5_DEV_EVENT_LID_CHANGE:
1410 		ibev.event = IB_EVENT_LID_CHANGE;
1411 		port = (u8)param;
1412 		break;
1413 
1414 	case MLX5_DEV_EVENT_PKEY_CHANGE:
1415 		ibev.event = IB_EVENT_PKEY_CHANGE;
1416 		port = (u8)param;
1417 		break;
1418 
1419 	case MLX5_DEV_EVENT_GUID_CHANGE:
1420 		ibev.event = IB_EVENT_GID_CHANGE;
1421 		port = (u8)param;
1422 		break;
1423 
1424 	case MLX5_DEV_EVENT_CLIENT_REREG:
1425 		ibev.event = IB_EVENT_CLIENT_REREGISTER;
1426 		port = (u8)param;
1427 		break;
1428 
1429 	default:
1430 		break;
1431 	}
1432 
1433 	ibev.device	      = &ibdev->ib_dev;
1434 	ibev.element.port_num = port;
1435 
1436 	if ((event != MLX5_DEV_EVENT_SYS_ERROR) &&
1437 	    (port < 1 || port > ibdev->num_ports)) {
1438 		mlx5_ib_warn(ibdev, "warning: event on port %d\n", port);
1439 		return;
1440 	}
1441 
1442 	if (ibdev->ib_active)
1443 		ib_dispatch_event(&ibev);
1444 }
1445 
1446 static void get_ext_port_caps(struct mlx5_ib_dev *dev)
1447 {
1448 	int port;
1449 
1450 	for (port = 1; port <= MLX5_CAP_GEN(dev->mdev, num_ports); port++)
1451 		mlx5_query_ext_port_caps(dev, port);
1452 }
1453 
1454 static void config_atomic_responder(struct mlx5_ib_dev *dev,
1455 				    struct ib_device_attr *props)
1456 {
1457 	enum ib_atomic_cap cap = props->atomic_cap;
1458 
1459 #if 0
1460 	if (cap == IB_ATOMIC_HCA ||
1461 	    cap == IB_ATOMIC_GLOB)
1462 #endif
1463 		dev->enable_atomic_resp = 1;
1464 
1465 	dev->atomic_cap = cap;
1466 }
1467 
1468 enum mlx5_addr_align {
1469 	MLX5_ADDR_ALIGN_0	= 0,
1470 	MLX5_ADDR_ALIGN_64	= 64,
1471 	MLX5_ADDR_ALIGN_128	= 128,
1472 };
1473 
1474 static int get_port_caps(struct mlx5_ib_dev *dev)
1475 {
1476 	struct ib_device_attr *dprops = NULL;
1477 	struct ib_port_attr *pprops = NULL;
1478 	int err = -ENOMEM;
1479 	int port;
1480 
1481 	pprops = kmalloc(sizeof(*pprops), GFP_KERNEL);
1482 	if (!pprops)
1483 		goto out;
1484 
1485 	dprops = kmalloc(sizeof(*dprops), GFP_KERNEL);
1486 	if (!dprops)
1487 		goto out;
1488 
1489 	err = mlx5_ib_query_device(&dev->ib_dev, dprops);
1490 	if (err) {
1491 		mlx5_ib_warn(dev, "query_device failed %d\n", err);
1492 		goto out;
1493 	}
1494 	config_atomic_responder(dev, dprops);
1495 
1496 	for (port = 1; port <= MLX5_CAP_GEN(dev->mdev, num_ports); port++) {
1497 		err = mlx5_ib_query_port(&dev->ib_dev, port, pprops);
1498 		if (err) {
1499 			mlx5_ib_warn(dev, "query_port %d failed %d\n",
1500 				     port, err);
1501 			break;
1502 		}
1503 		dev->mdev->port_caps[port - 1].pkey_table_len = dprops->max_pkeys;
1504 		dev->mdev->port_caps[port - 1].gid_table_len = pprops->gid_tbl_len;
1505 		mlx5_ib_dbg(dev, "pkey_table_len %d, gid_table_len %d\n",
1506 			    dprops->max_pkeys, pprops->gid_tbl_len);
1507 	}
1508 
1509 out:
1510 	kfree(pprops);
1511 	kfree(dprops);
1512 
1513 	return err;
1514 }
1515 
1516 static void destroy_umrc_res(struct mlx5_ib_dev *dev)
1517 {
1518 	int err;
1519 
1520 	err = mlx5_mr_cache_cleanup(dev);
1521 	if (err)
1522 		mlx5_ib_warn(dev, "mr cache cleanup failed\n");
1523 
1524 	ib_dereg_mr(dev->umrc.mr);
1525 	ib_dealloc_pd(dev->umrc.pd);
1526 }
1527 
1528 enum {
1529 	MAX_UMR_WR = 128,
1530 };
1531 
1532 static int create_umr_res(struct mlx5_ib_dev *dev)
1533 {
1534 	struct ib_pd *pd;
1535 	struct ib_mr *mr;
1536 	int ret;
1537 
1538 	pd = ib_alloc_pd(&dev->ib_dev);
1539 	if (IS_ERR(pd)) {
1540 		mlx5_ib_dbg(dev, "Couldn't create PD for sync UMR QP\n");
1541 		ret = PTR_ERR(pd);
1542 		goto error_0;
1543 	}
1544 
1545 	mr = ib_get_dma_mr(pd,  IB_ACCESS_LOCAL_WRITE);
1546 	if (IS_ERR(mr)) {
1547 		mlx5_ib_dbg(dev, "Couldn't create DMA MR for sync UMR QP\n");
1548 		ret = PTR_ERR(mr);
1549 		goto error_1;
1550 	}
1551 
1552 	dev->umrc.mr = mr;
1553 	dev->umrc.pd = pd;
1554 
1555 	ret = mlx5_mr_cache_init(dev);
1556 	if (ret) {
1557 		mlx5_ib_warn(dev, "mr cache init failed %d\n", ret);
1558 		goto error_4;
1559 	}
1560 
1561 	return 0;
1562 
1563 error_4:
1564 	ib_dereg_mr(mr);
1565 error_1:
1566 	ib_dealloc_pd(pd);
1567 error_0:
1568 	return ret;
1569 }
1570 
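/*
 * Allocate the driver-internal verbs objects (p0/c0/x0/x1/s0/s1): a PD,
 * a one-entry CQ, two XRC domains and two SRQs (one XRC, one basic).
 * They are created by calling the verbs entry points directly, so the
 * fields that the uverbs layer would normally initialize (device,
 * uobject, usecnt and friends) are filled in by hand here.
 */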
1571 static int create_dev_resources(struct mlx5_ib_resources *devr)
1572 {
1573 	struct ib_srq_init_attr attr;
1574 	struct mlx5_ib_dev *dev;
1575 	int ret = 0;
1576 	struct ib_cq_init_attr cq_attr = { .cqe = 1 };
1577 
1578 	dev = container_of(devr, struct mlx5_ib_dev, devr);
1579 
1580 	devr->p0 = mlx5_ib_alloc_pd(&dev->ib_dev, NULL, NULL);
1581 	if (IS_ERR(devr->p0)) {
1582 		ret = PTR_ERR(devr->p0);
1583 		goto error0;
1584 	}
1585 	devr->p0->device  = &dev->ib_dev;
1586 	devr->p0->uobject = NULL;
1587 	atomic_set(&devr->p0->usecnt, 0);
1588 
1589 	devr->c0 = mlx5_ib_create_cq(&dev->ib_dev, &cq_attr, NULL, NULL);
1590 	if (IS_ERR(devr->c0)) {
1591 		ret = PTR_ERR(devr->c0);
1592 		goto error1;
1593 	}
1594 	devr->c0->device        = &dev->ib_dev;
1595 	devr->c0->uobject       = NULL;
1596 	devr->c0->comp_handler  = NULL;
1597 	devr->c0->event_handler = NULL;
1598 	devr->c0->cq_context    = NULL;
1599 	atomic_set(&devr->c0->usecnt, 0);
1600 
1601 	devr->x0 = mlx5_ib_alloc_xrcd(&dev->ib_dev, NULL, NULL);
1602 	if (IS_ERR(devr->x0)) {
1603 		ret = PTR_ERR(devr->x0);
1604 		goto error2;
1605 	}
1606 	devr->x0->device = &dev->ib_dev;
1607 	devr->x0->inode = NULL;
1608 	atomic_set(&devr->x0->usecnt, 0);
1609 	mutex_init(&devr->x0->tgt_qp_mutex);
1610 	INIT_LIST_HEAD(&devr->x0->tgt_qp_list);
1611 
1612 	devr->x1 = mlx5_ib_alloc_xrcd(&dev->ib_dev, NULL, NULL);
1613 	if (IS_ERR(devr->x1)) {
1614 		ret = PTR_ERR(devr->x1);
1615 		goto error3;
1616 	}
1617 	devr->x1->device = &dev->ib_dev;
1618 	devr->x1->inode = NULL;
1619 	atomic_set(&devr->x1->usecnt, 0);
1620 	mutex_init(&devr->x1->tgt_qp_mutex);
1621 	INIT_LIST_HEAD(&devr->x1->tgt_qp_list);
1622 
1623 	memset(&attr, 0, sizeof(attr));
1624 	attr.attr.max_sge = 1;
1625 	attr.attr.max_wr = 1;
1626 	attr.srq_type = IB_SRQT_XRC;
1627 	attr.ext.xrc.cq = devr->c0;
1628 	attr.ext.xrc.xrcd = devr->x0;
1629 
1630 	devr->s0 = mlx5_ib_create_srq(devr->p0, &attr, NULL);
1631 	if (IS_ERR(devr->s0)) {
1632 		ret = PTR_ERR(devr->s0);
1633 		goto error4;
1634 	}
1635 	devr->s0->device	= &dev->ib_dev;
1636 	devr->s0->pd		= devr->p0;
1637 	devr->s0->uobject       = NULL;
1638 	devr->s0->event_handler = NULL;
1639 	devr->s0->srq_context   = NULL;
1640 	devr->s0->srq_type      = IB_SRQT_XRC;
1641 	devr->s0->ext.xrc.xrcd  = devr->x0;
1642 	devr->s0->ext.xrc.cq	= devr->c0;
1643 	atomic_inc(&devr->s0->ext.xrc.xrcd->usecnt);
1644 	atomic_inc(&devr->s0->ext.xrc.cq->usecnt);
1645 	atomic_inc(&devr->p0->usecnt);
1646 	atomic_set(&devr->s0->usecnt, 0);
1647 
1648 	memset(&attr, 0, sizeof(attr));
1649 	attr.attr.max_sge = 1;
1650 	attr.attr.max_wr = 1;
1651 	attr.srq_type = IB_SRQT_BASIC;
1652 	devr->s1 = mlx5_ib_create_srq(devr->p0, &attr, NULL);
1653 	if (IS_ERR(devr->s1)) {
1654 		ret = PTR_ERR(devr->s1);
1655 		goto error5;
1656 	}
1657 	devr->s1->device	= &dev->ib_dev;
1658 	devr->s1->pd		= devr->p0;
1659 	devr->s1->uobject       = NULL;
1660 	devr->s1->event_handler = NULL;
1661 	devr->s1->srq_context   = NULL;
1662 	devr->s1->srq_type      = IB_SRQT_BASIC;
1663 	devr->s1->ext.xrc.cq	= devr->c0;
1664 	atomic_inc(&devr->p0->usecnt);
1665 	atomic_set(&devr->s1->usecnt, 0);
1666 
1667 	return 0;
1668 
1669 error5:
1670 	mlx5_ib_destroy_srq(devr->s0);
1671 error4:
1672 	mlx5_ib_dealloc_xrcd(devr->x1);
1673 error3:
1674 	mlx5_ib_dealloc_xrcd(devr->x0);
1675 error2:
1676 	mlx5_ib_destroy_cq(devr->c0);
1677 error1:
1678 	mlx5_ib_dealloc_pd(devr->p0);
1679 error0:
1680 	return ret;
1681 }
1682 
1683 static void destroy_dev_resources(struct mlx5_ib_resources *devr)
1684 {
1685 	mlx5_ib_destroy_srq(devr->s1);
1686 	mlx5_ib_destroy_srq(devr->s0);
1687 	mlx5_ib_dealloc_xrcd(devr->x0);
1688 	mlx5_ib_dealloc_xrcd(devr->x1);
1689 	mlx5_ib_destroy_cq(devr->c0);
1690 	mlx5_ib_dealloc_pd(devr->p0);
1691 }
1692 
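/*
 * DC CNAK tracing: allocate a small buffer (4 KiB per port, at most two
 * pages here), fill it with 0xff (presumably so untouched entries are
 * easy to distinguish), map it for device writes and point firmware at
 * it with mlx5_core_set_dc_cnak_trace().  Only PFs that report the
 * dc_cnak_trace capability enable this; see init_dc_improvements().
 */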
1693 static void enable_dc_tracer(struct mlx5_ib_dev *dev)
1694 {
1695 	struct device *device = dev->ib_dev.dma_device;
1696 	struct mlx5_dc_tracer *dct = &dev->dctr;
1697 	int order;
1698 	void *tmp;
1699 	int size;
1700 	int err;
1701 
1702 	size = MLX5_CAP_GEN(dev->mdev, num_ports) * 4096;
1703 	if (size <= PAGE_SIZE)
1704 		order = 0;
1705 	else
1706 		order = 1;
1707 
1708 	dct->pg = alloc_pages(GFP_KERNEL, order);
1709 	if (!dct->pg) {
1710 		mlx5_ib_err(dev, "failed to allocate order-%d page block\n", order);
1711 		return;
1712 	}
1713 
1714 	tmp = page_address(dct->pg);
1715 	memset(tmp, 0xff, size);
1716 
1717 	dct->size = size;
1718 	dct->order = order;
1719 	dct->dma = dma_map_page(device, dct->pg, 0, size, DMA_FROM_DEVICE);
1720 	if (dma_mapping_error(device, dct->dma)) {
1721 		mlx5_ib_err(dev, "dma mapping error\n");
1722 		goto map_err;
1723 	}
1724 
1725 	err = mlx5_core_set_dc_cnak_trace(dev->mdev, 1, dct->dma);
1726 	if (err) {
1727 		mlx5_ib_warn(dev, "failed to enable DC tracer\n");
1728 		goto cmd_err;
1729 	}
1730 
1731 	return;
1732 
1733 cmd_err:
1734 	dma_unmap_page(device, dct->dma, size, DMA_FROM_DEVICE);
1735 map_err:
1736 	__free_pages(dct->pg, dct->order);
1737 	dct->pg = NULL;
1738 }
1739 
1740 static void disable_dc_tracer(struct mlx5_ib_dev *dev)
1741 {
1742 	struct device *device = dev->ib_dev.dma_device;
1743 	struct mlx5_dc_tracer *dct = &dev->dctr;
1744 	int err;
1745 
1746 	if (!dct->pg)
1747 		return;
1748 
1749 	err = mlx5_core_set_dc_cnak_trace(dev->mdev, 0, dct->dma);
1750 	if (err) {
1751 		mlx5_ib_warn(dev, "failed to disable DC tracer\n");
1752 		return;
1753 	}
1754 
1755 	dma_unmap_page(device, dct->dma, dct->size, DMA_FROM_DEVICE);
1756 	__free_pages(dct->pg, dct->order);
1757 	dct->pg = NULL;
1758 }
1759 
1760 enum {
1761 	MLX5_DC_CNAK_SIZE		= 128,
1762 	MLX5_NUM_BUF_IN_PAGE		= PAGE_SIZE / MLX5_DC_CNAK_SIZE,
1763 	MLX5_CNAK_TX_CQ_SIGNAL_FACTOR	= 128,
1764 	MLX5_DC_CNAK_SL			= 0,
1765 	MLX5_DC_CNAK_VL			= 0,
1766 };
1767 
1768 static int init_dc_improvements(struct mlx5_ib_dev *dev)
1769 {
1770 	if (!mlx5_core_is_pf(dev->mdev))
1771 		return 0;
1772 
1773 	if (!(MLX5_CAP_GEN(dev->mdev, dc_cnak_trace)))
1774 		return 0;
1775 
1776 	enable_dc_tracer(dev);
1777 
1778 	return 0;
1779 }
1780 
1781 static void cleanup_dc_improvements(struct mlx5_ib_dev *dev)
1782 {
1783 
1784 	disable_dc_tracer(dev);
1785 }
1786 
1787 static void mlx5_ib_dealloc_q_port_counter(struct mlx5_ib_dev *dev, u8 port_num)
1788 {
1789 	mlx5_vport_dealloc_q_counter(dev->mdev,
1790 				     MLX5_INTERFACE_PROTOCOL_IB,
1791 				     dev->port[port_num].q_cnt_id);
1792 	dev->port[port_num].q_cnt_id = 0;
1793 }
1794 
1795 static void mlx5_ib_dealloc_q_counters(struct mlx5_ib_dev *dev)
1796 {
1797 	unsigned int i;
1798 
1799 	for (i = 0; i < dev->num_ports; i++)
1800 		mlx5_ib_dealloc_q_port_counter(dev, i);
1801 }
1802 
1803 static int mlx5_ib_alloc_q_counters(struct mlx5_ib_dev *dev)
1804 {
1805 	int i;
1806 	int ret;
1807 
1808 	for (i = 0; i < dev->num_ports; i++) {
1809 		ret = mlx5_vport_alloc_q_counter(dev->mdev,
1810 						 MLX5_INTERFACE_PROTOCOL_IB,
1811 						 &dev->port[i].q_cnt_id);
1812 		if (ret) {
1813 			mlx5_ib_warn(dev,
1814 				     "couldn't allocate queue counter for port %d\n",
1815 				     i + 1);
1816 			goto dealloc_counters;
1817 		}
1818 	}
1819 
1820 	return 0;
1821 
1822 dealloc_counters:
1823 	while (--i >= 0)
1824 		mlx5_ib_dealloc_q_port_counter(dev, i);
1825 
1826 	return ret;
1827 }
1828 
1829 struct port_attribute {
1830 	struct attribute attr;
1831 	ssize_t (*show)(struct mlx5_ib_port *,
1832 			struct port_attribute *, char *buf);
1833 	ssize_t (*store)(struct mlx5_ib_port *,
1834 			 struct port_attribute *,
1835 			 const char *buf, size_t count);
1836 };
1837 
1838 struct port_counter_attribute {
1839 	struct port_attribute	attr;
1840 	size_t			offset;
1841 };
1842 
1843 static ssize_t port_attr_show(struct kobject *kobj,
1844 			      struct attribute *attr, char *buf)
1845 {
1846 	struct port_attribute *port_attr =
1847 		container_of(attr, struct port_attribute, attr);
1848 	struct mlx5_ib_port_sysfs_group *p =
1849 		container_of(kobj, struct mlx5_ib_port_sysfs_group,
1850 			     kobj);
1851 	struct mlx5_ib_port *mibport = container_of(p, struct mlx5_ib_port,
1852 						    group);
1853 
1854 	if (!port_attr->show)
1855 		return -EIO;
1856 
1857 	return port_attr->show(mibport, port_attr, buf);
1858 }
1859 
1860 static ssize_t show_port_counter(struct mlx5_ib_port *p,
1861 				 struct port_attribute *port_attr,
1862 				 char *buf)
1863 {
1864 	int outlen = MLX5_ST_SZ_BYTES(query_q_counter_out);
1865 	struct port_counter_attribute *counter_attr =
1866 		container_of(port_attr, struct port_counter_attribute, attr);
1867 	void *out;
1868 	int ret;
1869 
1870 	out = mlx5_vzalloc(outlen);
1871 	if (!out)
1872 		return -ENOMEM;
1873 
1874 	ret = mlx5_vport_query_q_counter(p->dev->mdev,
1875 					 p->q_cnt_id, 0,
1876 					 out, outlen);
1877 	if (ret)
1878 		goto free;
1879 
1880 	ret = sprintf(buf, "%d\n",
1881 		      be32_to_cpu(*(__be32 *)(out + counter_attr->offset)));
1882 
1883 free:
1884 	kfree(out);
1885 	return ret;
1886 }
1887 
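/*
 * Define a read-only sysfs attribute that reports a single field of the
 * QUERY_Q_COUNTER output, identified by its byte offset within the
 * query_q_counter_out layout.
 */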
1888 #define PORT_COUNTER_ATTR(_name)					\
1889 struct port_counter_attribute port_counter_attr_##_name = {		\
1890 	.attr  = __ATTR(_name, S_IRUGO, show_port_counter, NULL),	\
1891 	.offset = MLX5_BYTE_OFF(query_q_counter_out, _name)		\
1892 }
1893 
1894 static PORT_COUNTER_ATTR(rx_write_requests);
1895 static PORT_COUNTER_ATTR(rx_read_requests);
1896 static PORT_COUNTER_ATTR(rx_atomic_requests);
1897 static PORT_COUNTER_ATTR(rx_dct_connect);
1898 static PORT_COUNTER_ATTR(out_of_buffer);
1899 static PORT_COUNTER_ATTR(out_of_sequence);
1900 static PORT_COUNTER_ATTR(duplicate_request);
1901 static PORT_COUNTER_ATTR(rnr_nak_retry_err);
1902 static PORT_COUNTER_ATTR(packet_seq_err);
1903 static PORT_COUNTER_ATTR(implied_nak_seq_err);
1904 static PORT_COUNTER_ATTR(local_ack_timeout_err);
1905 
1906 static struct attribute *counter_attrs[] = {
1907 	&port_counter_attr_rx_write_requests.attr.attr,
1908 	&port_counter_attr_rx_read_requests.attr.attr,
1909 	&port_counter_attr_rx_atomic_requests.attr.attr,
1910 	&port_counter_attr_rx_dct_connect.attr.attr,
1911 	&port_counter_attr_out_of_buffer.attr.attr,
1912 	&port_counter_attr_out_of_sequence.attr.attr,
1913 	&port_counter_attr_duplicate_request.attr.attr,
1914 	&port_counter_attr_rnr_nak_retry_err.attr.attr,
1915 	&port_counter_attr_packet_seq_err.attr.attr,
1916 	&port_counter_attr_implied_nak_seq_err.attr.attr,
1917 	&port_counter_attr_local_ack_timeout_err.attr.attr,
1918 	NULL
1919 };
1920 
1921 static struct attribute_group port_counters_group = {
1922 	.name  = "counters",
1923 	.attrs  = counter_attrs
1924 };
1925 
1926 static const struct sysfs_ops port_sysfs_ops = {
1927 	.show = port_attr_show
1928 };
1929 
1930 static struct kobj_type port_type = {
1931 	.sysfs_ops     = &port_sysfs_ops,
1932 };
1933 
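/*
 * Create the per-port kobject under "mlx5_ports" and, when the firmware
 * exposes both out_of_seq_cnt and retransmission_q_counters, attach the
 * "counters" attribute group to it.
 */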
1934 static int add_port_attrs(struct mlx5_ib_dev *dev,
1935 			  struct kobject *parent,
1936 			  struct mlx5_ib_port_sysfs_group *port,
1937 			  u8 port_num)
1938 {
1939 	int ret;
1940 
1941 	ret = kobject_init_and_add(&port->kobj, &port_type,
1942 				   parent,
1943 				   "%d", port_num);
1944 	if (ret)
1945 		return ret;
1946 
1947 	if (MLX5_CAP_GEN(dev->mdev, out_of_seq_cnt) &&
1948 	    MLX5_CAP_GEN(dev->mdev, retransmission_q_counters)) {
1949 		ret = sysfs_create_group(&port->kobj, &port_counters_group);
1950 		if (ret)
1951 			goto put_kobj;
1952 	}
1953 
1954 	port->enabled = true;
1955 	return ret;
1956 
1957 put_kobj:
1958 	kobject_put(&port->kobj);
1959 	return ret;
1960 }
1961 
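/*
 * Tear down the per-port sysfs entries created by create_port_attrs()
 * and drop the "mlx5_ports" parent kobject.
 */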
1962 static void destroy_ports_attrs(struct mlx5_ib_dev *dev,
1963 				unsigned int num_ports)
1964 {
1965 	unsigned int i;
1966 
1967 	for (i = 0; i < num_ports; i++) {
1968 		struct mlx5_ib_port_sysfs_group *port =
1969 			&dev->port[i].group;
1970 
1971 		if (!port->enabled)
1972 			continue;
1973 
1974 		if (MLX5_CAP_GEN(dev->mdev, out_of_seq_cnt) &&
1975 		    MLX5_CAP_GEN(dev->mdev, retransmission_q_counters))
1976 			sysfs_remove_group(&port->kobj,
1977 					   &port_counters_group);
1978 		kobject_put(&port->kobj);
1979 		port->enabled = false;
1980 	}
1981 
1982 	if (dev->ports_parent) {
1983 		kobject_put(dev->ports_parent);
1984 		dev->ports_parent = NULL;
1985 	}
1986 }
1987 
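/*
 * Create the "mlx5_ports" sysfs hierarchy for the device and populate
 * one numbered entry per port; partially created entries are torn down
 * on failure.
 */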
1988 static int create_port_attrs(struct mlx5_ib_dev *dev)
1989 {
1990 	int ret = 0;
1991 	unsigned int i = 0;
1992 	struct device *device = &dev->ib_dev.dev;
1993 
1994 	dev->ports_parent = kobject_create_and_add("mlx5_ports",
1995 						   &device->kobj);
1996 	if (!dev->ports_parent)
1997 		return -ENOMEM;
1998 
1999 	for (i = 0; i < dev->num_ports; i++) {
2000 		ret = add_port_attrs(dev,
2001 				     dev->ports_parent,
2002 				     &dev->port[i].group,
2003 				     i + 1);
2004 
2005 		if (ret)
2006 			goto _destroy_ports_attrs;
2007 	}
2008 
2009 	return 0;
2010 
2011 _destroy_ports_attrs:
2012 	destroy_ports_attrs(dev, i);
2013 	return ret;
2014 }
2015 
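/*
 * Main attach callback: allocate and populate the ib_device, enable
 * RoCE when the first port is Ethernet, register the verbs device, and
 * set up UMR resources, queue counters, DC improvements, sysfs
 * attributes and the per-port RoCE update threads.
 */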
2016 static void *mlx5_ib_add(struct mlx5_core_dev *mdev)
2017 {
2018 	struct mlx5_ib_dev *dev;
2019 	int err;
2020 	int i;
2021 
2022 	printk_once(KERN_INFO "%s", mlx5_version);
2023 
2024 	dev = (struct mlx5_ib_dev *)ib_alloc_device(sizeof(*dev));
2025 	if (!dev)
2026 		return NULL;
2027 
2028 	dev->mdev = mdev;
2029 
2030 	dev->port = kcalloc(MLX5_CAP_GEN(mdev, num_ports), sizeof(*dev->port),
2031 			     GFP_KERNEL);
2032 	if (!dev->port)
2033 		goto err_dealloc;
2034 
2035 	for (i = 0; i < MLX5_CAP_GEN(mdev, num_ports); i++) {
2036 		dev->port[i].dev = dev;
2037 		dev->port[i].port_num = i;
2038 		dev->port[i].port_gone = 0;
2039 		memset(dev->port[i].gid_table, 0, sizeof(dev->port[i].gid_table));
2040 	}
2041 
2042 	err = get_port_caps(dev);
2043 	if (err)
2044 		goto err_free_port;
2045 
2046 	if (mlx5_use_mad_ifc(dev))
2047 		get_ext_port_caps(dev);
2048 
2049 	if (mlx5_ib_port_link_layer(&dev->ib_dev, 1) ==
2050 	    IB_LINK_LAYER_ETHERNET) {
2051 		if (MLX5_CAP_GEN(mdev, roce)) {
2052 			err = mlx5_nic_vport_enable_roce(mdev);
2053 			if (err)
2054 				goto err_free_port;
2055 		} else {
2056 			goto err_free_port;
2057 		}
2058 	}
2059 
2060 	MLX5_INIT_DOORBELL_LOCK(&dev->uar_lock);
2061 
2062 	strlcpy(dev->ib_dev.name, "mlx5_%d", IB_DEVICE_NAME_MAX);
2063 	dev->ib_dev.owner		= THIS_MODULE;
2064 	dev->ib_dev.node_type		= RDMA_NODE_IB_CA;
2065 	dev->ib_dev.local_dma_lkey	= mdev->special_contexts.resd_lkey;
2066 	dev->num_ports		= MLX5_CAP_GEN(mdev, num_ports);
2067 	dev->ib_dev.phys_port_cnt     = dev->num_ports;
2068 	dev->ib_dev.num_comp_vectors    =
2069 		dev->mdev->priv.eq_table.num_comp_vectors;
2070 	dev->ib_dev.dma_device	= &mdev->pdev->dev;
2071 
2072 	dev->ib_dev.uverbs_abi_ver	= MLX5_IB_UVERBS_ABI_VERSION;
2073 	dev->ib_dev.uverbs_cmd_mask	=
2074 		(1ull << IB_USER_VERBS_CMD_GET_CONTEXT)		|
2075 		(1ull << IB_USER_VERBS_CMD_QUERY_DEVICE)	|
2076 		(1ull << IB_USER_VERBS_CMD_QUERY_PORT)		|
2077 		(1ull << IB_USER_VERBS_CMD_ALLOC_PD)		|
2078 		(1ull << IB_USER_VERBS_CMD_DEALLOC_PD)		|
2079 		(1ull << IB_USER_VERBS_CMD_REG_MR)		|
2080 		(1ull << IB_USER_VERBS_CMD_DEREG_MR)		|
2081 		(1ull << IB_USER_VERBS_CMD_CREATE_COMP_CHANNEL)	|
2082 		(1ull << IB_USER_VERBS_CMD_CREATE_CQ)		|
2083 		(1ull << IB_USER_VERBS_CMD_RESIZE_CQ)		|
2084 		(1ull << IB_USER_VERBS_CMD_DESTROY_CQ)		|
2085 		(1ull << IB_USER_VERBS_CMD_CREATE_QP)		|
2086 		(1ull << IB_USER_VERBS_CMD_MODIFY_QP)		|
2087 		(1ull << IB_USER_VERBS_CMD_QUERY_QP)		|
2088 		(1ull << IB_USER_VERBS_CMD_DESTROY_QP)		|
2089 		(1ull << IB_USER_VERBS_CMD_ATTACH_MCAST)	|
2090 		(1ull << IB_USER_VERBS_CMD_DETACH_MCAST)	|
2091 		(1ull << IB_USER_VERBS_CMD_CREATE_SRQ)		|
2092 		(1ull << IB_USER_VERBS_CMD_MODIFY_SRQ)		|
2093 		(1ull << IB_USER_VERBS_CMD_QUERY_SRQ)		|
2094 		(1ull << IB_USER_VERBS_CMD_DESTROY_SRQ)		|
2095 		(1ull << IB_USER_VERBS_CMD_CREATE_XSRQ)		|
2096 		(1ull << IB_USER_VERBS_CMD_OPEN_QP);
2097 
2098 	dev->ib_dev.query_device	= mlx5_ib_query_device;
2099 	dev->ib_dev.query_port		= mlx5_ib_query_port;
2100 	dev->ib_dev.get_link_layer	= mlx5_ib_port_link_layer;
2101 	dev->ib_dev.get_netdev		= mlx5_ib_get_netdev;
2102 	dev->ib_dev.query_gid		= mlx5_ib_query_gid;
2103 	dev->ib_dev.query_pkey		= mlx5_ib_query_pkey;
2104 	dev->ib_dev.modify_device	= mlx5_ib_modify_device;
2105 	dev->ib_dev.modify_port		= mlx5_ib_modify_port;
2106 	dev->ib_dev.alloc_ucontext	= mlx5_ib_alloc_ucontext;
2107 	dev->ib_dev.dealloc_ucontext	= mlx5_ib_dealloc_ucontext;
2108 	dev->ib_dev.mmap		= mlx5_ib_mmap;
2109 	dev->ib_dev.alloc_pd		= mlx5_ib_alloc_pd;
2110 	dev->ib_dev.dealloc_pd		= mlx5_ib_dealloc_pd;
2111 	dev->ib_dev.create_ah		= mlx5_ib_create_ah;
2112 	dev->ib_dev.query_ah		= mlx5_ib_query_ah;
2113 	dev->ib_dev.destroy_ah		= mlx5_ib_destroy_ah;
2114 	dev->ib_dev.create_srq		= mlx5_ib_create_srq;
2115 	dev->ib_dev.modify_srq		= mlx5_ib_modify_srq;
2116 	dev->ib_dev.query_srq		= mlx5_ib_query_srq;
2117 	dev->ib_dev.destroy_srq		= mlx5_ib_destroy_srq;
2118 	dev->ib_dev.post_srq_recv	= mlx5_ib_post_srq_recv;
2119 	dev->ib_dev.create_qp		= mlx5_ib_create_qp;
2120 	dev->ib_dev.modify_qp		= mlx5_ib_modify_qp;
2121 	dev->ib_dev.query_qp		= mlx5_ib_query_qp;
2122 	dev->ib_dev.destroy_qp		= mlx5_ib_destroy_qp;
2123 	dev->ib_dev.post_send		= mlx5_ib_post_send;
2124 	dev->ib_dev.post_recv		= mlx5_ib_post_recv;
2125 	dev->ib_dev.create_cq		= mlx5_ib_create_cq;
2126 	dev->ib_dev.modify_cq		= mlx5_ib_modify_cq;
2127 	dev->ib_dev.resize_cq		= mlx5_ib_resize_cq;
2128 	dev->ib_dev.destroy_cq		= mlx5_ib_destroy_cq;
2129 	dev->ib_dev.poll_cq		= mlx5_ib_poll_cq;
2130 	dev->ib_dev.req_notify_cq	= mlx5_ib_arm_cq;
2131 	dev->ib_dev.get_dma_mr		= mlx5_ib_get_dma_mr;
2132 	dev->ib_dev.reg_user_mr		= mlx5_ib_reg_user_mr;
2133 	dev->ib_dev.reg_phys_mr		= mlx5_ib_reg_phys_mr;
2134 	dev->ib_dev.dereg_mr		= mlx5_ib_dereg_mr;
2135 	dev->ib_dev.attach_mcast	= mlx5_ib_mcg_attach;
2136 	dev->ib_dev.detach_mcast	= mlx5_ib_mcg_detach;
2137 	dev->ib_dev.process_mad		= mlx5_ib_process_mad;
2138 	dev->ib_dev.alloc_fast_reg_mr	= mlx5_ib_alloc_fast_reg_mr;
2139 	dev->ib_dev.alloc_fast_reg_page_list = mlx5_ib_alloc_fast_reg_page_list;
2140 	dev->ib_dev.free_fast_reg_page_list  = mlx5_ib_free_fast_reg_page_list;
2141 
2142 	if (MLX5_CAP_GEN(mdev, xrc)) {
2143 		dev->ib_dev.alloc_xrcd = mlx5_ib_alloc_xrcd;
2144 		dev->ib_dev.dealloc_xrcd = mlx5_ib_dealloc_xrcd;
2145 		dev->ib_dev.uverbs_cmd_mask |=
2146 			(1ull << IB_USER_VERBS_CMD_OPEN_XRCD) |
2147 			(1ull << IB_USER_VERBS_CMD_CLOSE_XRCD);
2148 	}
2149 
2150 	err = init_node_data(dev);
2151 	if (err)
2152 		goto err_disable_roce;
2153 
2154 	mutex_init(&dev->cap_mask_mutex);
2155 	INIT_LIST_HEAD(&dev->qp_list);
2156 	spin_lock_init(&dev->reset_flow_resource_lock);
2157 
2158 	err = create_dev_resources(&dev->devr);
2159 	if (err)
2160 		goto err_disable_roce;
2161 
2162 
2163 	err = mlx5_ib_alloc_q_counters(dev);
2164 	if (err)
2165 		goto err_odp;
2166 
2167 	err = ib_register_device(&dev->ib_dev, NULL);
2168 	if (err)
2169 		goto err_q_cnt;
2170 
2171 	err = create_umr_res(dev);
2172 	if (err)
2173 		goto err_dev;
2174 
2175 	if (MLX5_CAP_GEN(dev->mdev, port_type) ==
2176 	    MLX5_CAP_PORT_TYPE_IB) {
2177 		if (init_dc_improvements(dev))
2178 			mlx5_ib_dbg(dev, "init_dc_improvements - continuing\n");
2179 	}
2180 
2181 	err = create_port_attrs(dev);
2182 	if (err)
2183 		goto err_dc;
2184 
2185 	for (i = 0; i < ARRAY_SIZE(mlx5_class_attributes); i++) {
2186 		err = device_create_file(&dev->ib_dev.dev,
2187 					 mlx5_class_attributes[i]);
2188 		if (err)
2189 			goto err_port_attrs;
2190 	}
2191 
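	/*
	 * Spawn one kernel thread per port running
	 * mlx5_ib_roce_port_update(); mlx5_ib_remove() later signals
	 * these threads through port_gone and waits for each of them to
	 * acknowledge before tearing the device down.
	 */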
2192 	if (1) {
2193 		struct thread *rl_thread = NULL;
2194 		struct proc *rl_proc = NULL;
2195 
2196 		for (i = 0; i < MLX5_CAP_GEN(mdev, num_ports); i++) {
2197 			(void) kproc_kthread_add(mlx5_ib_roce_port_update, dev->port + i, &rl_proc, &rl_thread,
2198 			    RFHIGHPID, 0, "mlx5-ib-roce-port", "mlx5-ib-roce_port-%d", i);
2199 		}
2200 	}
2201 
2202 	dev->ib_active = true;
2203 
2204 	return dev;
2205 
2206 err_port_attrs:
2207 	destroy_ports_attrs(dev, dev->num_ports);
2208 
2209 err_dc:
2210 	if (MLX5_CAP_GEN(dev->mdev, port_type) ==
2211 	    MLX5_CAP_PORT_TYPE_IB)
2212 		cleanup_dc_improvements(dev);
2213 	destroy_umrc_res(dev);
2214 
2215 err_dev:
2216 	ib_unregister_device(&dev->ib_dev);
2217 
2218 err_q_cnt:
2219 	mlx5_ib_dealloc_q_counters(dev);
2220 
2221 err_odp:
2222 	destroy_dev_resources(&dev->devr);
2223 
2224 err_disable_roce:
2225 	if (mlx5_ib_port_link_layer(&dev->ib_dev, 1) ==
2226 	    IB_LINK_LAYER_ETHERNET && MLX5_CAP_GEN(mdev, roce))
2227 		mlx5_nic_vport_disable_roce(mdev);
2228 err_free_port:
2229 	kfree(dev->port);
2230 
2231 err_dealloc:
2232 	ib_dealloc_device((struct ib_device *)dev);
2233 
2234 	return NULL;
2235 }
2236 
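/*
 * Detach callback: stop the per-port RoCE update threads, remove the
 * sysfs attributes, and release the resources set up by mlx5_ib_add().
 */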
2237 static void mlx5_ib_remove(struct mlx5_core_dev *mdev, void *context)
2238 {
2239 	struct mlx5_ib_dev *dev = context;
2240 	int i;
2241 
2242 	for (i = 0; i < MLX5_CAP_GEN(mdev, num_ports); i++) {
2243 		dev->port[i].port_gone = 1;
2244 		while (dev->port[i].port_gone != 2)
2245 			pause("W", hz);
2246 	}
2247 
2248 	for (i = 0; i < ARRAY_SIZE(mlx5_class_attributes); i++) {
2249 		device_remove_file(&dev->ib_dev.dev,
2250 		    mlx5_class_attributes[i]);
2251 	}
2252 
2253 	destroy_ports_attrs(dev, dev->num_ports);
2254 	if (MLX5_CAP_GEN(dev->mdev, port_type) ==
2255 	    MLX5_CAP_PORT_TYPE_IB)
2256 		cleanup_dc_improvements(dev);
2257 	mlx5_ib_dealloc_q_counters(dev);
2258 	ib_unregister_device(&dev->ib_dev);
2259 	destroy_umrc_res(dev);
2260 	destroy_dev_resources(&dev->devr);
2261 
2262 	if (mlx5_ib_port_link_layer(&dev->ib_dev, 1) ==
2263 	    IB_LINK_LAYER_ETHERNET && MLX5_CAP_GEN(mdev, roce))
2264 		mlx5_nic_vport_disable_roce(mdev);
2265 
2266 	kfree(dev->port);
2267 	ib_dealloc_device(&dev->ib_dev);
2268 }
2269 
2270 static struct mlx5_interface mlx5_ib_interface = {
2271 	.add            = mlx5_ib_add,
2272 	.remove         = mlx5_ib_remove,
2273 	.event          = mlx5_ib_event,
2274 	.protocol	= MLX5_INTERFACE_PROTOCOL_IB,
2275 };
2276 
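/*
 * Module load: register the driver with mlx5_core and create the global
 * mlx5_ib work queue.
 */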
2277 static int __init mlx5_ib_init(void)
2278 {
2279 	int err;
2280 
2281 	if (deprecated_prof_sel != 2)
2282 		printf("mlx5_ib: WARN: prof_sel is deprecated for mlx5_ib, set it for mlx5_core\n");
2283 
2284 	err = mlx5_register_interface(&mlx5_ib_interface);
2285 	if (err)
2286 		goto clean_odp;
2287 
2288 	mlx5_ib_wq = create_singlethread_workqueue("mlx5_ib_wq");
2289 	if (!mlx5_ib_wq) {
2290 		printf("mlx5_ib: ERR: %s: failed to create mlx5_ib_wq\n", __func__);
		err = -ENOMEM;	/* don't report success without the work queue */
2291 		goto err_unreg;
2292 	}
2293 
2294 	return err;
2295 
2296 err_unreg:
2297 	mlx5_unregister_interface(&mlx5_ib_interface);
2298 
2299 clean_odp:
2300 	return err;
2301 }
2302 
2303 static void __exit mlx5_ib_cleanup(void)
2304 {
2305 	destroy_workqueue(mlx5_ib_wq);
2306 	mlx5_unregister_interface(&mlx5_ib_interface);
2307 }
2308 
2309 module_init_order(mlx5_ib_init, SI_ORDER_THIRD);
2310 module_exit_order(mlx5_ib_cleanup, SI_ORDER_THIRD);
2311