xref: /freebsd/sys/dev/mlx5/mlx5_ib/mlx5_ib_main.c (revision ea825d02749f382c3f7e17f28247f20a48733eab)
1 /*-
2  * Copyright (c) 2013-2015, Mellanox Technologies, Ltd.  All rights reserved.
3  *
4  * Redistribution and use in source and binary forms, with or without
5  * modification, are permitted provided that the following conditions
6  * are met:
7  * 1. Redistributions of source code must retain the above copyright
8  *    notice, this list of conditions and the following disclaimer.
9  * 2. Redistributions in binary form must reproduce the above copyright
10  *    notice, this list of conditions and the following disclaimer in the
11  *    documentation and/or other materials provided with the distribution.
12  *
13  * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS `AS IS' AND
14  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
15  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
16  * ARE DISCLAIMED.  IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE
17  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
18  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
19  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
20  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
21  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
22  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
23  * SUCH DAMAGE.
24  *
25  * $FreeBSD$
26  */
27 
28 #include <linux/errno.h>
29 #include <linux/pci.h>
30 #include <linux/dma-mapping.h>
31 #include <linux/slab.h>
32 #include <linux/io-mapping.h>
33 #include <linux/sched.h>
34 #include <linux/netdevice.h>
35 #include <linux/etherdevice.h>
36 #include <linux/list.h>
37 #include <dev/mlx5/driver.h>
38 #include <dev/mlx5/vport.h>
39 #include <asm/pgtable.h>
40 #include <linux/fs.h>
41 #undef inode
42 
43 #include <rdma/ib_user_verbs.h>
44 #include <rdma/ib_smi.h>
45 #include <rdma/ib_umem.h>
46 #include "user.h"
47 #include "mlx5_ib.h"
48 
49 #include <sys/unistd.h>
50 #include <sys/kthread.h>
51 
52 #define DRIVER_NAME "mlx5_ib"
53 #define DRIVER_VERSION "3.2-rc1"
54 #define DRIVER_RELDATE	"May 2016"
55 
56 MODULE_AUTHOR("Eli Cohen <eli@mellanox.com>");
57 MODULE_DESCRIPTION("Mellanox Connect-IB HCA IB driver");
58 MODULE_LICENSE("Dual BSD/GPL");
59 MODULE_DEPEND(mlx5ib, linuxkpi, 1, 1, 1);
60 MODULE_DEPEND(mlx5ib, mlx5, 1, 1, 1);
61 MODULE_DEPEND(mlx5ib, ibcore, 1, 1, 1);
62 MODULE_VERSION(mlx5ib, 1);
63 
64 static int deprecated_prof_sel = 2;
65 module_param_named(prof_sel, deprecated_prof_sel, int, 0444);
66 MODULE_PARM_DESC(prof_sel, "profile selector. Deprecated here. Moved to module mlx5_core");
67 
68 enum {
69 	MLX5_STANDARD_ATOMIC_SIZE = 0x8,
70 };
71 
72 struct workqueue_struct *mlx5_ib_wq;
73 
74 static char mlx5_version[] =
75 	DRIVER_NAME ": Mellanox Connect-IB Infiniband driver v"
76 	DRIVER_VERSION " (" DRIVER_RELDATE ")\n";
77 
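/*
 * Derive the IB atomic capabilities from the HCA atomic caps: 8-byte
 * compare-and-swap and fetch-and-add (or their masked variants) must be
 * supported, and the requester endianness mode must be usable on this host.
 */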
78 static void get_atomic_caps(struct mlx5_ib_dev *dev,
79 			    struct ib_device_attr *props)
80 {
81 	int tmp;
82 	u8 atomic_operations;
83 	u8 atomic_size_qp;
84 	u8 atomic_req_endianess;
85 
86 	atomic_operations = MLX5_CAP_ATOMIC(dev->mdev, atomic_operations);
87 	atomic_size_qp = MLX5_CAP_ATOMIC(dev->mdev, atomic_size_qp);
88 	atomic_req_endianess = MLX5_CAP_ATOMIC(dev->mdev,
89 					       atomic_req_8B_endianess_mode) ||
90 			       !mlx5_host_is_le();
91 
92 	tmp = MLX5_ATOMIC_OPS_CMP_SWAP | MLX5_ATOMIC_OPS_FETCH_ADD;
93 	if (((atomic_operations & tmp) == tmp)
94 	    && (atomic_size_qp & MLX5_STANDARD_ATOMIC_SIZE)) {
95 		if (atomic_req_endianess) {
96 			props->atomic_cap = IB_ATOMIC_HCA;
97 		} else {
98 			props->atomic_cap = IB_ATOMIC_NONE;
99 		}
100 	} else {
101 		props->atomic_cap = IB_ATOMIC_NONE;
102 	}
103 
104 	tmp = MLX5_ATOMIC_OPS_MASKED_CMP_SWAP | MLX5_ATOMIC_OPS_MASKED_FETCH_ADD;
105 	if (((atomic_operations & tmp) == tmp)
106 	    && (atomic_size_qp & MLX5_STANDARD_ATOMIC_SIZE)) {
107 		if (atomic_req_endianess) {
108 			props->masked_atomic_cap = IB_ATOMIC_HCA;
109 		} else {
110 			props->masked_atomic_cap = IB_ATOMIC_NONE;
111 		}
112 	} else {
113 		props->masked_atomic_cap = IB_ATOMIC_NONE;
114 	}
115 }
116 
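/*
 * Return the link layer (InfiniBand or Ethernet) of the given port, based
 * on the HCA port_type capability.
 */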
117 static enum rdma_link_layer
118 mlx5_ib_port_link_layer(struct ib_device *device, u8 port_num)
119 {
120 	struct mlx5_ib_dev *dev = to_mdev(device);
121 
122 	switch (MLX5_CAP_GEN(dev->mdev, port_type)) {
123 	case MLX5_CAP_PORT_TYPE_IB:
124 		return IB_LINK_LAYER_INFINIBAND;
125 	case MLX5_CAP_PORT_TYPE_ETH:
126 		return IB_LINK_LAYER_ETHERNET;
127 	default:
128 		return IB_LINK_LAYER_UNSPECIFIED;
129 	}
130 }
131 
132 static int mlx5_use_mad_ifc(struct mlx5_ib_dev *dev)
133 {
134 	return !dev->mdev->issi;
135 }
136 
137 enum {
138 	MLX5_VPORT_ACCESS_METHOD_MAD,
139 	MLX5_VPORT_ACCESS_METHOD_HCA,
140 	MLX5_VPORT_ACCESS_METHOD_NIC,
141 };
142 
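/*
 * Select how vport attributes are queried: via MADs on pre-ISSI firmware,
 * via NIC vport commands on Ethernet (RoCE) ports, or via HCA vport
 * commands otherwise.
 */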
143 static int mlx5_get_vport_access_method(struct ib_device *ibdev)
144 {
145 	if (mlx5_use_mad_ifc(to_mdev(ibdev)))
146 		return MLX5_VPORT_ACCESS_METHOD_MAD;
147 
148 	if (mlx5_ib_port_link_layer(ibdev, 1) ==
149 	    IB_LINK_LAYER_ETHERNET)
150 		return MLX5_VPORT_ACCESS_METHOD_NIC;
151 
152 	return MLX5_VPORT_ACCESS_METHOD_HCA;
153 }
154 
155 static int mlx5_query_system_image_guid(struct ib_device *ibdev,
156 					__be64 *sys_image_guid)
157 {
158 	struct mlx5_ib_dev *dev = to_mdev(ibdev);
159 	struct mlx5_core_dev *mdev = dev->mdev;
160 	u64 tmp;
161 	int err;
162 
163 	switch (mlx5_get_vport_access_method(ibdev)) {
164 	case MLX5_VPORT_ACCESS_METHOD_MAD:
165 		return mlx5_query_system_image_guid_mad_ifc(ibdev,
166 							    sys_image_guid);
167 
168 	case MLX5_VPORT_ACCESS_METHOD_HCA:
169 		err = mlx5_query_hca_vport_system_image_guid(mdev, &tmp);
170 		if (!err)
171 			*sys_image_guid = cpu_to_be64(tmp);
172 		return err;
173 
174 	case MLX5_VPORT_ACCESS_METHOD_NIC:
175 		err = mlx5_query_nic_vport_system_image_guid(mdev, &tmp);
176 		if (!err)
177 			*sys_image_guid = cpu_to_be64(tmp);
178 		return err;
179 
180 	default:
181 		return -EINVAL;
182 	}
183 }
184 
185 static int mlx5_query_max_pkeys(struct ib_device *ibdev,
186 				u16 *max_pkeys)
187 {
188 	struct mlx5_ib_dev *dev = to_mdev(ibdev);
189 	struct mlx5_core_dev *mdev = dev->mdev;
190 
191 	switch (mlx5_get_vport_access_method(ibdev)) {
192 	case MLX5_VPORT_ACCESS_METHOD_MAD:
193 		return mlx5_query_max_pkeys_mad_ifc(ibdev, max_pkeys);
194 
195 	case MLX5_VPORT_ACCESS_METHOD_HCA:
196 	case MLX5_VPORT_ACCESS_METHOD_NIC:
197 		*max_pkeys = mlx5_to_sw_pkey_sz(MLX5_CAP_GEN(mdev,
198 						pkey_table_size));
199 		return 0;
200 
201 	default:
202 		return -EINVAL;
203 	}
204 }
205 
206 static int mlx5_query_vendor_id(struct ib_device *ibdev,
207 				u32 *vendor_id)
208 {
209 	struct mlx5_ib_dev *dev = to_mdev(ibdev);
210 
211 	switch (mlx5_get_vport_access_method(ibdev)) {
212 	case MLX5_VPORT_ACCESS_METHOD_MAD:
213 		return mlx5_query_vendor_id_mad_ifc(ibdev, vendor_id);
214 
215 	case MLX5_VPORT_ACCESS_METHOD_HCA:
216 	case MLX5_VPORT_ACCESS_METHOD_NIC:
217 		return mlx5_core_query_vendor_id(dev->mdev, vendor_id);
218 
219 	default:
220 		return -EINVAL;
221 	}
222 }
223 
224 static int mlx5_query_node_guid(struct mlx5_ib_dev *dev,
225 				__be64 *node_guid)
226 {
227 	u64 tmp;
228 	int err;
229 
230 	switch (mlx5_get_vport_access_method(&dev->ib_dev)) {
231 	case MLX5_VPORT_ACCESS_METHOD_MAD:
232 		return mlx5_query_node_guid_mad_ifc(dev, node_guid);
233 
234 	case MLX5_VPORT_ACCESS_METHOD_HCA:
235 		err = mlx5_query_hca_vport_node_guid(dev->mdev, &tmp);
236 		if (!err)
237 			*node_guid = cpu_to_be64(tmp);
238 		return err;
239 
240 	case MLX5_VPORT_ACCESS_METHOD_NIC:
241 		err = mlx5_query_nic_vport_node_guid(dev->mdev, &tmp);
242 		if (!err)
243 			*node_guid = cpu_to_be64(tmp);
244 		return err;
245 
246 	default:
247 		return -EINVAL;
248 	}
249 }
250 
251 struct mlx5_reg_node_desc {
252 	u8	desc[64];
253 };
254 
255 static int mlx5_query_node_desc(struct mlx5_ib_dev *dev, char *node_desc)
256 {
257 	struct mlx5_reg_node_desc in;
258 
259 	if (mlx5_use_mad_ifc(dev))
260 		return mlx5_query_node_desc_mad_ifc(dev, node_desc);
261 
262 	memset(&in, 0, sizeof(in));
263 
264 	return mlx5_core_access_reg(dev->mdev, &in, sizeof(in), node_desc,
265 				    sizeof(struct mlx5_reg_node_desc),
266 				    MLX5_REG_NODE_DESC, 0, 0);
267 }
268 
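/*
 * Fill in the ib_device_attr structure from the HCA general, atomic and
 * vport capabilities.
 */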
269 static int mlx5_ib_query_device(struct ib_device *ibdev,
270 				struct ib_device_attr *props)
271 {
272 	struct mlx5_ib_dev *dev = to_mdev(ibdev);
273 	struct mlx5_core_dev *mdev = dev->mdev;
274 	int max_sq_desc;
275 	int max_rq_sg;
276 	int max_sq_sg;
277 	int err;
278 
279 
280 	memset(props, 0, sizeof(*props));
281 
282 	err = mlx5_query_system_image_guid(ibdev,
283 					   &props->sys_image_guid);
284 	if (err)
285 		return err;
286 
287 	err = mlx5_query_max_pkeys(ibdev, &props->max_pkeys);
288 	if (err)
289 		return err;
290 
291 	err = mlx5_query_vendor_id(ibdev, &props->vendor_id);
292 	if (err)
293 		return err;
294 
295 	props->fw_ver = ((u64)fw_rev_maj(dev->mdev) << 32) |
296 		((u64)fw_rev_min(dev->mdev) << 16) |
297 		fw_rev_sub(dev->mdev);
298 	props->device_cap_flags    = IB_DEVICE_CHANGE_PHY_PORT |
299 		IB_DEVICE_PORT_ACTIVE_EVENT		|
300 		IB_DEVICE_SYS_IMAGE_GUID		|
301 		IB_DEVICE_RC_RNR_NAK_GEN;
302 
303 	if (MLX5_CAP_GEN(mdev, pkv))
304 		props->device_cap_flags |= IB_DEVICE_BAD_PKEY_CNTR;
305 	if (MLX5_CAP_GEN(mdev, qkv))
306 		props->device_cap_flags |= IB_DEVICE_BAD_QKEY_CNTR;
307 	if (MLX5_CAP_GEN(mdev, apm))
308 		props->device_cap_flags |= IB_DEVICE_AUTO_PATH_MIG;
309 	props->device_cap_flags |= IB_DEVICE_LOCAL_DMA_LKEY;
310 	if (MLX5_CAP_GEN(mdev, xrc))
311 		props->device_cap_flags |= IB_DEVICE_XRC;
312 	props->device_cap_flags |= IB_DEVICE_MEM_MGT_EXTENSIONS;
313 	if (MLX5_CAP_GEN(mdev, block_lb_mc))
314 		props->device_cap_flags |= IB_DEVICE_BLOCK_MULTICAST_LOOPBACK;
315 
316 	props->vendor_part_id	   = mdev->pdev->device;
317 	props->hw_ver		   = mdev->pdev->revision;
318 
319 	props->max_mr_size	   = ~0ull;
320 	props->page_size_cap	   = ~(u32)((1ull << MLX5_CAP_GEN(mdev, log_pg_sz)) - 1);
321 	props->max_qp		   = 1 << MLX5_CAP_GEN(mdev, log_max_qp);
322 	props->max_qp_wr	   = 1 << MLX5_CAP_GEN(mdev, log_max_qp_sz);
323 	max_rq_sg =  MLX5_CAP_GEN(mdev, max_wqe_sz_rq) /
324 		     sizeof(struct mlx5_wqe_data_seg);
325 	max_sq_desc = min((int)MLX5_CAP_GEN(mdev, max_wqe_sz_sq), 512);
326 	max_sq_sg = (max_sq_desc -
327 		     sizeof(struct mlx5_wqe_ctrl_seg) -
328 		     sizeof(struct mlx5_wqe_raddr_seg)) / sizeof(struct mlx5_wqe_data_seg);
329 	props->max_sge = min(max_rq_sg, max_sq_sg);
330 	props->max_cq		   = 1 << MLX5_CAP_GEN(mdev, log_max_cq);
331 	props->max_cqe = (1 << MLX5_CAP_GEN(mdev, log_max_cq_sz)) - 1;
332 	props->max_mr		   = 1 << MLX5_CAP_GEN(mdev, log_max_mkey);
333 	props->max_pd		   = 1 << MLX5_CAP_GEN(mdev, log_max_pd);
334 	props->max_qp_rd_atom	   = 1 << MLX5_CAP_GEN(mdev, log_max_ra_req_qp);
335 	props->max_qp_init_rd_atom = 1 << MLX5_CAP_GEN(mdev, log_max_ra_res_qp);
336 	props->max_srq		   = 1 << MLX5_CAP_GEN(mdev, log_max_srq);
337 	props->max_srq_wr = (1 << MLX5_CAP_GEN(mdev, log_max_srq_sz)) - 1;
338 	props->local_ca_ack_delay  = MLX5_CAP_GEN(mdev, local_ca_ack_delay);
339 	props->max_res_rd_atom	   = props->max_qp_rd_atom * props->max_qp;
340 	props->max_srq_sge	   = max_rq_sg - 1;
341 	props->max_fast_reg_page_list_len = (unsigned int)-1;
342 	get_atomic_caps(dev, props);
343 	props->max_mcast_grp	   = 1 << MLX5_CAP_GEN(mdev, log_max_mcg);
344 	props->max_mcast_qp_attach = MLX5_CAP_GEN(mdev, max_qp_mcg);
345 	props->max_total_mcast_qp_attach = props->max_mcast_qp_attach *
346 					   props->max_mcast_grp;
347 	props->max_map_per_fmr = INT_MAX; /* no limit in ConnectIB */
348 	props->max_ah		= INT_MAX;
349 
350 	return 0;
351 }
352 
353 enum mlx5_ib_width {
354 	MLX5_IB_WIDTH_1X	= 1 << 0,
355 	MLX5_IB_WIDTH_2X	= 1 << 1,
356 	MLX5_IB_WIDTH_4X	= 1 << 2,
357 	MLX5_IB_WIDTH_8X	= 1 << 3,
358 	MLX5_IB_WIDTH_12X	= 1 << 4
359 };
360 
361 static int translate_active_width(struct ib_device *ibdev, u8 active_width,
362 				  u8 *ib_width)
363 {
364 	struct mlx5_ib_dev *dev = to_mdev(ibdev);
365 	int err = 0;
366 
367 	if (active_width & MLX5_IB_WIDTH_1X) {
368 		*ib_width = IB_WIDTH_1X;
369 	} else if (active_width & MLX5_IB_WIDTH_2X) {
370 		mlx5_ib_warn(dev, "active_width %d is not supported by IB spec\n",
371 			     (int)active_width);
372 		err = -EINVAL;
373 	} else if (active_width & MLX5_IB_WIDTH_4X) {
374 		*ib_width = IB_WIDTH_4X;
375 	} else if (active_width & MLX5_IB_WIDTH_8X) {
376 		*ib_width = IB_WIDTH_8X;
377 	} else if (active_width & MLX5_IB_WIDTH_12X) {
378 		*ib_width = IB_WIDTH_12X;
379 	} else {
380 		mlx5_ib_dbg(dev, "Invalid active_width %d\n",
381 			    (int)active_width);
382 		err = -EINVAL;
383 	}
384 
385 	return err;
386 }
387 
388 /*
389  * TODO: Move to IB core
390  */
391 enum ib_max_vl_num {
392 	__IB_MAX_VL_0		= 1,
393 	__IB_MAX_VL_0_1		= 2,
394 	__IB_MAX_VL_0_3		= 3,
395 	__IB_MAX_VL_0_7		= 4,
396 	__IB_MAX_VL_0_14	= 5,
397 };
398 
399 enum mlx5_vl_hw_cap {
400 	MLX5_VL_HW_0	= 1,
401 	MLX5_VL_HW_0_1	= 2,
402 	MLX5_VL_HW_0_2	= 3,
403 	MLX5_VL_HW_0_3	= 4,
404 	MLX5_VL_HW_0_4	= 5,
405 	MLX5_VL_HW_0_5	= 6,
406 	MLX5_VL_HW_0_6	= 7,
407 	MLX5_VL_HW_0_7	= 8,
408 	MLX5_VL_HW_0_14	= 15
409 };
410 
411 static int translate_max_vl_num(struct ib_device *ibdev, u8 vl_hw_cap,
412 				u8 *max_vl_num)
413 {
414 	switch (vl_hw_cap) {
415 	case MLX5_VL_HW_0:
416 		*max_vl_num = __IB_MAX_VL_0;
417 		break;
418 	case MLX5_VL_HW_0_1:
419 		*max_vl_num = __IB_MAX_VL_0_1;
420 		break;
421 	case MLX5_VL_HW_0_3:
422 		*max_vl_num = __IB_MAX_VL_0_3;
423 		break;
424 	case MLX5_VL_HW_0_7:
425 		*max_vl_num = __IB_MAX_VL_0_7;
426 		break;
427 	case MLX5_VL_HW_0_14:
428 		*max_vl_num = __IB_MAX_VL_0_14;
429 		break;
430 
431 	default:
432 		return -EINVAL;
433 	}
434 
435 	return 0;
436 }
437 
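/*
 * Query IB port attributes from the HCA vport context and the PTYS, PMTU
 * and PVLC access registers.
 */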
438 static int mlx5_query_port_ib(struct ib_device *ibdev, u8 port,
439 			      struct ib_port_attr *props)
440 {
441 	struct mlx5_ib_dev *dev = to_mdev(ibdev);
442 	struct mlx5_core_dev *mdev = dev->mdev;
443 	u32 *rep;
444 	int outlen = MLX5_ST_SZ_BYTES(query_hca_vport_context_out);
445 	struct mlx5_ptys_reg *ptys;
446 	struct mlx5_pmtu_reg *pmtu;
447 	struct mlx5_pvlc_reg pvlc;
448 	void *ctx;
449 	int err;
450 
451 	rep = mlx5_vzalloc(outlen);
452 	ptys = kzalloc(sizeof(*ptys), GFP_KERNEL);
453 	pmtu = kzalloc(sizeof(*pmtu), GFP_KERNEL);
454 	if (!rep || !ptys || !pmtu) {
455 		err = -ENOMEM;
456 		goto out;
457 	}
458 
459 	memset(props, 0, sizeof(*props));
460 
461 	/* TODO: what should happen if this is a PF with a dual-port device? */
462 	err = mlx5_query_hca_vport_context(mdev, port, 0, rep, outlen);
463 	if (err)
464 		goto out;
465 
466 	ctx = MLX5_ADDR_OF(query_hca_vport_context_out, rep, hca_vport_context);
467 
468 	props->lid		= MLX5_GET(hca_vport_context, ctx, lid);
469 	props->lmc		= MLX5_GET(hca_vport_context, ctx, lmc);
470 	props->sm_lid		= MLX5_GET(hca_vport_context, ctx, sm_lid);
471 	props->sm_sl		= MLX5_GET(hca_vport_context, ctx, sm_sl);
472 	props->state		= MLX5_GET(hca_vport_context, ctx, vport_state);
473 	props->phys_state	= MLX5_GET(hca_vport_context, ctx,
474 					port_physical_state);
475 	props->port_cap_flags	= MLX5_GET(hca_vport_context, ctx, cap_mask1);
476 	props->gid_tbl_len	= mlx5_get_gid_table_len(MLX5_CAP_GEN(mdev, gid_table_size));
477 	props->max_msg_sz	= 1 << MLX5_CAP_GEN(mdev, log_max_msg);
478 	props->pkey_tbl_len	= mlx5_to_sw_pkey_sz(MLX5_CAP_GEN(mdev, pkey_table_size));
479 	props->bad_pkey_cntr	= MLX5_GET(hca_vport_context, ctx,
480 					      pkey_violation_counter);
481 	props->qkey_viol_cntr	= MLX5_GET(hca_vport_context, ctx,
482 					      qkey_violation_counter);
483 	props->subnet_timeout	= MLX5_GET(hca_vport_context, ctx,
484 					      subnet_timeout);
485 	props->init_type_reply	= MLX5_GET(hca_vport_context, ctx,
486 					   init_type_reply);
487 
488 	ptys->proto_mask |= MLX5_PTYS_IB;
489 	ptys->local_port = port;
490 	err = mlx5_core_access_ptys(mdev, ptys, 0);
491 	if (err)
492 		goto out;
493 
494 	err = translate_active_width(ibdev, ptys->ib_link_width_oper,
495 				     &props->active_width);
496 	if (err)
497 		goto out;
498 
499 	props->active_speed	= (u8)ptys->ib_proto_oper;
500 
501 	pmtu->local_port = port;
502 	err = mlx5_core_access_pmtu(mdev, pmtu, 0);
503 	if (err)
504 		goto out;
505 
506 	props->max_mtu		= pmtu->max_mtu;
507 	props->active_mtu	= pmtu->oper_mtu;
508 
509 	memset(&pvlc, 0, sizeof(pvlc));
510 	pvlc.local_port = port;
511 	err = mlx5_core_access_pvlc(mdev, &pvlc, 0);
512 	if (err)
513 		goto out;
514 
515 	err = translate_max_vl_num(ibdev, pvlc.vl_hw_cap,
516 				   &props->max_vl_num);
517 out:
518 	kvfree(rep);
519 	kfree(ptys);
520 	kfree(pmtu);
521 	return err;
522 }
523 
524 int mlx5_ib_query_port(struct ib_device *ibdev, u8 port,
525 		       struct ib_port_attr *props)
526 {
527 	switch (mlx5_get_vport_access_method(ibdev)) {
528 	case MLX5_VPORT_ACCESS_METHOD_MAD:
529 		return mlx5_query_port_mad_ifc(ibdev, port, props);
530 
531 	case MLX5_VPORT_ACCESS_METHOD_HCA:
532 		return mlx5_query_port_ib(ibdev, port, props);
533 
534 	case MLX5_VPORT_ACCESS_METHOD_NIC:
535 		return mlx5_query_port_roce(ibdev, port, props);
536 
537 	default:
538 		return -EINVAL;
539 	}
540 }
541 
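/*
 * Build a modified EUI-64 interface identifier from the interface's 6-byte
 * Ethernet address: insert 0xFFFE in the middle and flip the
 * universal/local bit.
 */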
542 static inline int
543 mlx5_addrconf_ifid_eui48(u8 *eui, struct net_device *dev)
544 {
545 	if (dev->if_addrlen != ETH_ALEN)
546 		return -1;
547 	memcpy(eui, IF_LLADDR(dev), 3);
548 	memcpy(eui + 5, IF_LLADDR(dev) + 3, 3);
549 
550 	/* NOTE: The scope ID is added by the GID to IP conversion */
551 
552 	eui[3] = 0xFF;
553 	eui[4] = 0xFE;
554 	eui[0] ^= 2;
555 	return 0;
556 }
557 
558 static void
559 mlx5_make_default_gid(struct net_device *dev, union ib_gid *gid)
560 {
561 	gid->global.subnet_prefix = cpu_to_be64(0xfe80000000000000LL);
562 	mlx5_addrconf_ifid_eui48(&gid->raw[8], dev);
563 }
564 
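/*
 * Kernel thread which periodically rebuilds the RoCE GID table of a port
 * from the default (link-local) GID and the IPv4/IPv6 addresses configured
 * on the Ethernet interface and its VLANs, dispatching IB_EVENT_GID_CHANGE
 * whenever the table changes.  The thread exits once port_gone is set.
 */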
565 static void
566 mlx5_ib_roce_port_update(void *arg)
567 {
568 	struct mlx5_ib_port *port = (struct mlx5_ib_port *)arg;
569 	struct mlx5_ib_dev *dev = port->dev;
570 	struct mlx5_core_dev *mdev = dev->mdev;
571 	struct net_device *xdev[MLX5_IB_GID_MAX];
572 	struct net_device *idev;
573 	struct net_device *ndev;
574 	struct ifaddr *ifa;
575 	union ib_gid gid_temp;
576 
577 	while (port->port_gone == 0) {
578 		int update = 0;
579 		int gid_index = 0;
580 		int j;
581 		int error;
582 
583 		ndev = mlx5_get_protocol_dev(mdev, MLX5_INTERFACE_PROTOCOL_ETH);
584 		if (ndev == NULL) {
585 			pause("W", hz);
586 			continue;
587 		}
588 
589 		CURVNET_SET_QUIET(ndev->if_vnet);
590 
591 		memset(&gid_temp, 0, sizeof(gid_temp));
592 		mlx5_make_default_gid(ndev, &gid_temp);
593 		if (bcmp(&gid_temp, &port->gid_table[gid_index], sizeof(gid_temp))) {
594 			port->gid_table[gid_index] = gid_temp;
595 			update = 1;
596 		}
597 		xdev[gid_index] = ndev;
598 		gid_index++;
599 
600 		IFNET_RLOCK();
601 		TAILQ_FOREACH(idev, &V_ifnet, if_link) {
602 			if (idev == ndev)
603 				break;
604 		}
605 		if (idev != NULL) {
606 		    TAILQ_FOREACH(idev, &V_ifnet, if_link) {
607 			if (idev != ndev) {
608 				if (idev->if_type != IFT_L2VLAN)
609 					continue;
610 				if (ndev != rdma_vlan_dev_real_dev(idev))
611 					continue;
612 			}
613 			/* clone address information for IPv4 and IPv6 */
614 			IF_ADDR_RLOCK(idev);
615 			TAILQ_FOREACH(ifa, &idev->if_addrhead, ifa_link) {
616 				if (ifa->ifa_addr == NULL ||
617 				    (ifa->ifa_addr->sa_family != AF_INET &&
618 				     ifa->ifa_addr->sa_family != AF_INET6) ||
619 				    gid_index >= MLX5_IB_GID_MAX)
620 					continue;
621 				memset(&gid_temp, 0, sizeof(gid_temp));
622 				rdma_ip2gid(ifa->ifa_addr, &gid_temp);
623 				/* check for existing entry */
624 				for (j = 0; j != gid_index; j++) {
625 					if (bcmp(&gid_temp, &port->gid_table[j], sizeof(gid_temp)) == 0)
626 						break;
627 				}
628 				/* check if new entry must be added */
629 				if (j == gid_index) {
630 					if (bcmp(&gid_temp, &port->gid_table[gid_index], sizeof(gid_temp))) {
631 						port->gid_table[gid_index] = gid_temp;
632 						update = 1;
633 					}
634 					xdev[gid_index] = idev;
635 					gid_index++;
636 				}
637 			}
638 			IF_ADDR_RUNLOCK(idev);
639 		    }
640 		}
641 		IFNET_RUNLOCK();
642 		CURVNET_RESTORE();
643 
644 		if (update != 0 &&
645 		    mlx5_ib_port_link_layer(&dev->ib_dev, 1) == IB_LINK_LAYER_ETHERNET) {
646 			struct ib_event event = {
647 			    .device = &dev->ib_dev,
648 			    .element.port_num = port->port_num + 1,
649 			    .event = IB_EVENT_GID_CHANGE,
650 			};
651 
652 			/* add new entries, if any */
653 			for (j = 0; j != gid_index; j++) {
654 				error = modify_gid_roce(&dev->ib_dev, port->port_num, j,
655 				    port->gid_table + j, xdev[j]);
656 				if (error != 0)
657 					printf("mlx5_ib: Failed to update RoCE GID table: %d\n", error);
658 			}
659 			memset(&gid_temp, 0, sizeof(gid_temp));
660 
661 			/* clear old entries, if any */
662 			for (; j != MLX5_IB_GID_MAX; j++) {
663 				if (bcmp(&gid_temp, port->gid_table + j, sizeof(gid_temp)) == 0)
664 					continue;
665 				port->gid_table[j] = gid_temp;
666 				(void) modify_gid_roce(&dev->ib_dev, port->port_num, j,
667 				    port->gid_table + j, ndev);
668 			}
669 
670 			/* make sure ibcore gets updated */
671 			ib_dispatch_event(&event);
672 		}
673 		pause("W", hz);
674 	}
675 	do {
676 		struct ib_event event = {
677 			.device = &dev->ib_dev,
678 			.element.port_num = port->port_num + 1,
679 			.event = IB_EVENT_GID_CHANGE,
680 		};
681 		/* make sure ibcore gets updated */
682 		ib_dispatch_event(&event);
683 
684 		/* wait a bit */
685 		pause("W", hz);
686 	} while (0);
687 	port->port_gone = 2;
688 	kthread_exit();
689 }
690 
691 static int mlx5_ib_query_gid(struct ib_device *ibdev, u8 port, int index,
692 			     union ib_gid *gid)
693 {
694 	struct mlx5_ib_dev *dev = to_mdev(ibdev);
695 	struct mlx5_core_dev *mdev = dev->mdev;
696 
697 	switch (mlx5_get_vport_access_method(ibdev)) {
698 	case MLX5_VPORT_ACCESS_METHOD_MAD:
699 		return mlx5_query_gids_mad_ifc(ibdev, port, index, gid);
700 
701 	case MLX5_VPORT_ACCESS_METHOD_HCA:
702 		return mlx5_query_hca_vport_gid(mdev, port, 0, index, gid);
703 
704 	case MLX5_VPORT_ACCESS_METHOD_NIC:
705 		if (port == 0 || port > MLX5_CAP_GEN(mdev, num_ports) ||
706 		    index < 0 || index >= MLX5_IB_GID_MAX ||
707 		    dev->port[port - 1].port_gone != 0)
708 			memset(gid, 0, sizeof(*gid));
709 		else
710 			*gid = dev->port[port - 1].gid_table[index];
711 		return 0;
712 
713 	default:
714 		return -EINVAL;
715 	}
716 }
717 
718 static int mlx5_ib_query_pkey(struct ib_device *ibdev, u8 port, u16 index,
719 			      u16 *pkey)
720 {
721 	struct mlx5_ib_dev *dev = to_mdev(ibdev);
722 	struct mlx5_core_dev *mdev = dev->mdev;
723 
724 	switch (mlx5_get_vport_access_method(ibdev)) {
725 	case MLX5_VPORT_ACCESS_METHOD_MAD:
726 		return mlx5_query_pkey_mad_ifc(ibdev, port, index, pkey);
727 
728 	case MLX5_VPORT_ACCESS_METHOD_HCA:
729 	case MLX5_VPORT_ACCESS_METHOD_NIC:
730 		return mlx5_query_hca_vport_pkey(mdev, 0, port, 0, index,
731 						 pkey);
732 
733 	default:
734 		return -EINVAL;
735 	}
736 }
737 
738 static int mlx5_ib_modify_device(struct ib_device *ibdev, int mask,
739 				 struct ib_device_modify *props)
740 {
741 	struct mlx5_ib_dev *dev = to_mdev(ibdev);
742 	struct mlx5_reg_node_desc in;
743 	struct mlx5_reg_node_desc out;
744 	int err;
745 
746 	if (mask & ~IB_DEVICE_MODIFY_NODE_DESC)
747 		return -EOPNOTSUPP;
748 
749 	if (!(mask & IB_DEVICE_MODIFY_NODE_DESC))
750 		return 0;
751 
752 	/*
753 	 * If possible, pass the node descriptor to firmware so it can generate an
754 	 * SM trap 144 (node description changed).  If the command fails, just ignore.
755 	 */
756 	memcpy(&in, props->node_desc, 64);
757 	err = mlx5_core_access_reg(dev->mdev, &in, sizeof(in), &out,
758 				   sizeof(out), MLX5_REG_NODE_DESC, 0, 1);
759 	if (err)
760 		return err;
761 
762 	memcpy(ibdev->node_desc, props->node_desc, 64);
763 
764 	return err;
765 }
766 
767 static int mlx5_ib_modify_port(struct ib_device *ibdev, u8 port, int mask,
768 			       struct ib_port_modify *props)
769 {
770 	u8 is_eth = (mlx5_ib_port_link_layer(ibdev, port) ==
771 		     IB_LINK_LAYER_ETHERNET);
772 	struct mlx5_ib_dev *dev = to_mdev(ibdev);
773 	struct ib_port_attr attr;
774 	u32 tmp;
775 	int err;
776 
777 	/* return OK if this is RoCE. CM calls ib_modify_port() regardless
778 	 * of whether port link layer is ETH or IB. For ETH ports, qkey
779 	 * violations and port capabilities are not valid.
780 	 */
781 	if (is_eth)
782 		return 0;
783 
784 	mutex_lock(&dev->cap_mask_mutex);
785 
786 	err = mlx5_ib_query_port(ibdev, port, &attr);
787 	if (err)
788 		goto out;
789 
790 	tmp = (attr.port_cap_flags | props->set_port_cap_mask) &
791 		~props->clr_port_cap_mask;
792 
793 	err = mlx5_set_port_caps(dev->mdev, port, tmp);
794 
795 out:
796 	mutex_unlock(&dev->cap_mask_mutex);
797 	return err;
798 }
799 
800 enum mlx5_cap_flags {
801 	MLX5_CAP_COMPACT_AV = 1 << 0,
802 };
803 
804 static void set_mlx5_flags(u32 *flags, struct mlx5_core_dev *dev)
805 {
806 	*flags |= MLX5_CAP_GEN(dev, compact_address_vector) ?
807 		  MLX5_CAP_COMPACT_AV : 0;
808 }
809 
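/*
 * Allocate a user context: validate the v0/v2 request, allocate the UAR
 * pages and uuar bookkeeping and, for RoCE, a transport domain.
 */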
810 static struct ib_ucontext *mlx5_ib_alloc_ucontext(struct ib_device *ibdev,
811 						  struct ib_udata *udata)
812 {
813 	struct mlx5_ib_dev *dev = to_mdev(ibdev);
814 	struct mlx5_ib_alloc_ucontext_req_v2 req;
815 	struct mlx5_ib_alloc_ucontext_resp resp;
816 	struct mlx5_ib_ucontext *context;
817 	struct mlx5_uuar_info *uuari;
818 	struct mlx5_uar *uars;
819 	int gross_uuars;
820 	int num_uars;
821 	int ver;
822 	int uuarn;
823 	int err;
824 	int i;
825 	size_t reqlen;
826 
827 	if (!dev->ib_active)
828 		return ERR_PTR(-EAGAIN);
829 
830 	memset(&req, 0, sizeof(req));
831 	memset(&resp, 0, sizeof(resp));
832 
833 	reqlen = udata->inlen - sizeof(struct ib_uverbs_cmd_hdr);
834 	if (reqlen == sizeof(struct mlx5_ib_alloc_ucontext_req))
835 		ver = 0;
836 	else if (reqlen == sizeof(struct mlx5_ib_alloc_ucontext_req_v2))
837 		ver = 2;
838 	else {
839 		mlx5_ib_err(dev, "request malformed, reqlen: %ld\n", (long)reqlen);
840 		return ERR_PTR(-EINVAL);
841 	}
842 
843 	err = ib_copy_from_udata(&req, udata, reqlen);
844 	if (err) {
845 		mlx5_ib_err(dev, "copy failed\n");
846 		return ERR_PTR(err);
847 	}
848 
849 	if (req.reserved) {
850 		mlx5_ib_err(dev, "request corrupted\n");
851 		return ERR_PTR(-EINVAL);
852 	}
853 
854 	if (req.total_num_uuars == 0 || req.total_num_uuars > MLX5_MAX_UUARS) {
855 		mlx5_ib_warn(dev, "wrong num_uuars: %d\n", req.total_num_uuars);
856 		return ERR_PTR(-ENOMEM);
857 	}
858 
859 	req.total_num_uuars = ALIGN(req.total_num_uuars,
860 				    MLX5_NON_FP_BF_REGS_PER_PAGE);
861 	if (req.num_low_latency_uuars > req.total_num_uuars - 1) {
862 		mlx5_ib_warn(dev, "wrong num_low_latency_uuars: %d ( > %d)\n",
863 			     req.num_low_latency_uuars, req.total_num_uuars - 1);
864 		return ERR_PTR(-EINVAL);
865 	}
866 
867 	num_uars = req.total_num_uuars / MLX5_NON_FP_BF_REGS_PER_PAGE;
868 	gross_uuars = num_uars * MLX5_BF_REGS_PER_PAGE;
869 	resp.qp_tab_size = 1 << MLX5_CAP_GEN(dev->mdev, log_max_qp);
870 	if (mlx5_core_is_pf(dev->mdev) && MLX5_CAP_GEN(dev->mdev, bf))
871 		resp.bf_reg_size = 1 << MLX5_CAP_GEN(dev->mdev, log_bf_reg_size);
872 	resp.cache_line_size = L1_CACHE_BYTES;
873 	resp.max_sq_desc_sz = MLX5_CAP_GEN(dev->mdev, max_wqe_sz_sq);
874 	resp.max_rq_desc_sz = MLX5_CAP_GEN(dev->mdev, max_wqe_sz_rq);
875 	resp.max_send_wqebb = 1 << MLX5_CAP_GEN(dev->mdev, log_max_qp_sz);
876 	resp.max_recv_wr = 1 << MLX5_CAP_GEN(dev->mdev, log_max_qp_sz);
877 	resp.max_srq_recv_wr = 1 << MLX5_CAP_GEN(dev->mdev, log_max_srq_sz);
878 	set_mlx5_flags(&resp.flags, dev->mdev);
879 
880 	if (offsetof(struct mlx5_ib_alloc_ucontext_resp, max_desc_sz_sq_dc) < udata->outlen)
881 		resp.max_desc_sz_sq_dc = MLX5_CAP_GEN(dev->mdev, max_wqe_sz_sq_dc);
882 
883 	if (offsetof(struct mlx5_ib_alloc_ucontext_resp, atomic_arg_sizes_dc) < udata->outlen)
884 		resp.atomic_arg_sizes_dc = MLX5_CAP_ATOMIC(dev->mdev, atomic_size_dc);
885 
886 	context = kzalloc(sizeof(*context), GFP_KERNEL);
887 	if (!context)
888 		return ERR_PTR(-ENOMEM);
889 
890 	uuari = &context->uuari;
891 	mutex_init(&uuari->lock);
892 	uars = kcalloc(num_uars, sizeof(*uars), GFP_KERNEL);
893 	if (!uars) {
894 		err = -ENOMEM;
895 		goto out_ctx;
896 	}
897 
898 	uuari->bitmap = kcalloc(BITS_TO_LONGS(gross_uuars),
899 				sizeof(*uuari->bitmap),
900 				GFP_KERNEL);
901 	if (!uuari->bitmap) {
902 		err = -ENOMEM;
903 		goto out_uar_ctx;
904 	}
905 	/*
906 	 * clear all fast path uuars
907 	 */
908 	for (i = 0; i < gross_uuars; i++) {
909 		uuarn = i & 3;
910 		if (uuarn == 2 || uuarn == 3)
911 			set_bit(i, uuari->bitmap);
912 	}
913 
914 	uuari->count = kcalloc(gross_uuars, sizeof(*uuari->count), GFP_KERNEL);
915 	if (!uuari->count) {
916 		err = -ENOMEM;
917 		goto out_bitmap;
918 	}
919 
920 	for (i = 0; i < num_uars; i++) {
921 		err = mlx5_cmd_alloc_uar(dev->mdev, &uars[i].index);
922 		if (err) {
923 			mlx5_ib_err(dev, "uar alloc failed at %d\n", i);
924 			goto out_uars;
925 		}
926 	}
927 	for (i = 0; i < MLX5_IB_MAX_CTX_DYNAMIC_UARS; i++)
928 		context->dynamic_wc_uar_index[i] = MLX5_IB_INVALID_UAR_INDEX;
929 
930 	INIT_LIST_HEAD(&context->db_page_list);
931 	mutex_init(&context->db_page_mutex);
932 
933 	resp.tot_uuars = req.total_num_uuars;
934 	resp.num_ports = MLX5_CAP_GEN(dev->mdev, num_ports);
935 	err = ib_copy_to_udata(udata, &resp,
936 			       min_t(size_t, udata->outlen, sizeof(resp)));
937 	if (err)
938 		goto out_uars;
939 
940 	uuari->ver = ver;
941 	uuari->num_low_latency_uuars = req.num_low_latency_uuars;
942 	uuari->uars = uars;
943 	uuari->num_uars = num_uars;
944 
945 	if (mlx5_ib_port_link_layer(&dev->ib_dev, 1) ==
946 	    IB_LINK_LAYER_ETHERNET) {
947 		err = mlx5_alloc_transport_domain(dev->mdev, &context->tdn);
948 		if (err)
949 			goto out_uars;
950 	}
951 
952 	return &context->ibucontext;
953 
954 out_uars:
955 	for (i--; i >= 0; i--)
956 		mlx5_cmd_free_uar(dev->mdev, uars[i].index);
957 	kfree(uuari->count);
958 
959 out_bitmap:
960 	kfree(uuari->bitmap);
961 
962 out_uar_ctx:
963 	kfree(uars);
964 
965 out_ctx:
966 	kfree(context);
967 	return ERR_PTR(err);
968 }
969 
970 static int mlx5_ib_dealloc_ucontext(struct ib_ucontext *ibcontext)
971 {
972 	struct mlx5_ib_ucontext *context = to_mucontext(ibcontext);
973 	struct mlx5_ib_dev *dev = to_mdev(ibcontext->device);
974 	struct mlx5_uuar_info *uuari = &context->uuari;
975 	int i;
976 
977 	if (mlx5_ib_port_link_layer(&dev->ib_dev, 1) ==
978 	    IB_LINK_LAYER_ETHERNET)
979 		mlx5_dealloc_transport_domain(dev->mdev, context->tdn);
980 
981 	for (i = 0; i < uuari->num_uars; i++) {
982 		if (mlx5_cmd_free_uar(dev->mdev, uuari->uars[i].index))
983 			mlx5_ib_warn(dev, "failed to free UAR 0x%x\n", uuari->uars[i].index);
984 	}
985 	for (i = 0; i < MLX5_IB_MAX_CTX_DYNAMIC_UARS; i++) {
986 		if (context->dynamic_wc_uar_index[i] != MLX5_IB_INVALID_UAR_INDEX)
987 			mlx5_cmd_free_uar(dev->mdev, context->dynamic_wc_uar_index[i]);
988 	}
989 
990 	kfree(uuari->count);
991 	kfree(uuari->bitmap);
992 	kfree(uuari->uars);
993 	kfree(context);
994 
995 	return 0;
996 }
997 
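/* Translate a UAR index into a page frame number within PCI BAR 0. */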
998 static phys_addr_t uar_index2pfn(struct mlx5_ib_dev *dev, int index)
999 {
1000 	return (pci_resource_start(dev->mdev->pdev, 0) >> PAGE_SHIFT) + index;
1001 }
1002 
1003 static int get_command(unsigned long offset)
1004 {
1005 	return (offset >> MLX5_IB_MMAP_CMD_SHIFT) & MLX5_IB_MMAP_CMD_MASK;
1006 }
1007 
1008 static int get_arg(unsigned long offset)
1009 {
1010 	return offset & ((1 << MLX5_IB_MMAP_CMD_SHIFT) - 1);
1011 }
1012 
1013 static int get_index(unsigned long offset)
1014 {
1015 	return get_arg(offset);
1016 }
1017 
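/*
 * Map a single UAR page into user space with the requested caching
 * attribute (write-combining or non-cached).
 */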
1018 static int uar_mmap(struct vm_area_struct *vma, pgprot_t prot, bool is_wc,
1019 		    struct mlx5_uuar_info *uuari, struct mlx5_ib_dev *dev,
1020 		    struct mlx5_ib_ucontext *context)
1021 {
1022 	unsigned long idx;
1023 	phys_addr_t pfn;
1024 
1025 	if (vma->vm_end - vma->vm_start != PAGE_SIZE) {
1026 		mlx5_ib_warn(dev, "wrong size, expected PAGE_SIZE(%ld) got %ld\n",
1027 			     (long)PAGE_SIZE, (long)(vma->vm_end - vma->vm_start));
1028 		return -EINVAL;
1029 	}
1030 
1031 	idx = get_index(vma->vm_pgoff);
1032 	if (idx >= uuari->num_uars) {
1033 		mlx5_ib_warn(dev, "wrong offset, idx:%ld num_uars:%d\n",
1034 			     idx, uuari->num_uars);
1035 		return -EINVAL;
1036 	}
1037 
1038 	pfn = uar_index2pfn(dev, uuari->uars[idx].index);
1039 	mlx5_ib_dbg(dev, "uar idx 0x%lx, pfn 0x%llx\n", idx,
1040 		    (unsigned long long)pfn);
1041 
1042 	vma->vm_page_prot = prot;
1043 	if (io_remap_pfn_range(vma, vma->vm_start, pfn,
1044 			       PAGE_SIZE, vma->vm_page_prot)) {
1045 		mlx5_ib_err(dev, "io remap failed\n");
1046 		return -EAGAIN;
1047 	}
1048 
1049 	mlx5_ib_dbg(dev, "mapped %s at 0x%lx, PA 0x%llx\n", is_wc ? "WC" : "NC",
1050 		    (long)vma->vm_start, (unsigned long long)pfn << PAGE_SHIFT);
1051 
1052 	return 0;
1053 }
1054 
1055 static int mlx5_ib_mmap(struct ib_ucontext *ibcontext, struct vm_area_struct *vma)
1056 {
1057 	struct mlx5_ib_ucontext *context = to_mucontext(ibcontext);
1058 	struct mlx5_ib_dev *dev = to_mdev(ibcontext->device);
1059 	struct mlx5_uuar_info *uuari = &context->uuari;
1060 	unsigned long command;
1061 
1062 	command = get_command(vma->vm_pgoff);
1063 	switch (command) {
1064 	case MLX5_IB_MMAP_REGULAR_PAGE:
1065 		return uar_mmap(vma, pgprot_writecombine(vma->vm_page_prot),
1066 				true,
1067 				uuari, dev, context);
1068 
1069 		break;
1070 
1071 	case MLX5_IB_MMAP_WC_PAGE:
1072 		return uar_mmap(vma, pgprot_writecombine(vma->vm_page_prot),
1073 				true, uuari, dev, context);
1074 		break;
1075 
1076 	case MLX5_IB_MMAP_NC_PAGE:
1077 		return uar_mmap(vma, pgprot_noncached(vma->vm_page_prot),
1078 				false, uuari, dev, context);
1079 		break;
1080 
1081 	default:
1082 		return -EINVAL;
1083 	}
1084 
1085 	return 0;
1086 }
1087 
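/*
 * Create a physical-address (PA) mkey with local-read permission covering
 * the whole address space; used for protection domains allocated from the
 * kernel.
 */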
1088 static int alloc_pa_mkey(struct mlx5_ib_dev *dev, u32 *key, u32 pdn)
1089 {
1090 	struct mlx5_create_mkey_mbox_in *in;
1091 	struct mlx5_mkey_seg *seg;
1092 	struct mlx5_core_mr mr;
1093 	int err;
1094 
1095 	in = kzalloc(sizeof(*in), GFP_KERNEL);
1096 	if (!in)
1097 		return -ENOMEM;
1098 
1099 	seg = &in->seg;
1100 	seg->flags = MLX5_PERM_LOCAL_READ | MLX5_ACCESS_MODE_PA;
1101 	seg->flags_pd = cpu_to_be32(pdn | MLX5_MKEY_LEN64);
1102 	seg->qpn_mkey7_0 = cpu_to_be32(0xffffff << 8);
1103 	seg->start_addr = 0;
1104 
1105 	err = mlx5_core_create_mkey(dev->mdev, &mr, in, sizeof(*in),
1106 				    NULL, NULL, NULL);
1107 	if (err) {
1108 		mlx5_ib_warn(dev, "failed to create mkey, %d\n", err);
1109 		goto err_in;
1110 	}
1111 
1112 	kfree(in);
1113 	*key = mr.key;
1114 
1115 	return 0;
1116 
1117 err_in:
1118 	kfree(in);
1119 
1120 	return err;
1121 }
1122 
1123 static void free_pa_mkey(struct mlx5_ib_dev *dev, u32 key)
1124 {
1125 	struct mlx5_core_mr mr;
1126 	int err;
1127 
1128 	memset(&mr, 0, sizeof(mr));
1129 	mr.key = key;
1130 	err = mlx5_core_destroy_mkey(dev->mdev, &mr);
1131 	if (err)
1132 		mlx5_ib_warn(dev, "failed to destroy mkey 0x%x\n", key);
1133 }
1134 
1135 static struct ib_pd *mlx5_ib_alloc_pd(struct ib_device *ibdev,
1136 				      struct ib_ucontext *context,
1137 				      struct ib_udata *udata)
1138 {
1139 	struct mlx5_ib_dev *dev = to_mdev(ibdev);
1140 	struct mlx5_ib_alloc_pd_resp resp;
1141 	struct mlx5_ib_pd *pd;
1142 	int err;
1143 
1144 	pd = kmalloc(sizeof(*pd), GFP_KERNEL);
1145 	if (!pd)
1146 		return ERR_PTR(-ENOMEM);
1147 
1148 	err = mlx5_core_alloc_pd(to_mdev(ibdev)->mdev, &pd->pdn);
1149 	if (err) {
1150 		mlx5_ib_warn(dev, "pd alloc failed\n");
1151 		kfree(pd);
1152 		return ERR_PTR(err);
1153 	}
1154 
1155 	if (context) {
1156 		resp.pdn = pd->pdn;
1157 		if (ib_copy_to_udata(udata, &resp, sizeof(resp))) {
1158 			mlx5_ib_err(dev, "copy failed\n");
1159 			mlx5_core_dealloc_pd(to_mdev(ibdev)->mdev, pd->pdn);
1160 			kfree(pd);
1161 			return ERR_PTR(-EFAULT);
1162 		}
1163 	} else {
1164 		err = alloc_pa_mkey(to_mdev(ibdev), &pd->pa_lkey, pd->pdn);
1165 		if (err) {
1166 			mlx5_ib_err(dev, "alloc mkey failed\n");
1167 			mlx5_core_dealloc_pd(to_mdev(ibdev)->mdev, pd->pdn);
1168 			kfree(pd);
1169 			return ERR_PTR(err);
1170 		}
1171 	}
1172 
1173 	return &pd->ibpd;
1174 }
1175 
1176 static int mlx5_ib_dealloc_pd(struct ib_pd *pd)
1177 {
1178 	struct mlx5_ib_dev *mdev = to_mdev(pd->device);
1179 	struct mlx5_ib_pd *mpd = to_mpd(pd);
1180 
1181 	if (!pd->uobject)
1182 		free_pa_mkey(mdev, mpd->pa_lkey);
1183 
1184 	mlx5_core_dealloc_pd(mdev->mdev, mpd->pdn);
1185 	kfree(mpd);
1186 
1187 	return 0;
1188 }
1189 
1190 static int mlx5_ib_mcg_attach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid)
1191 {
1192 	struct mlx5_ib_dev *dev = to_mdev(ibqp->device);
1193 	int err;
1194 
1195 	if (ibqp->qp_type == IB_QPT_RAW_PACKET)
1196 		err = -EOPNOTSUPP;
1197 	else
1198 		err = mlx5_core_attach_mcg(dev->mdev, gid, ibqp->qp_num);
1199 	if (err)
1200 		mlx5_ib_warn(dev, "failed attaching QPN 0x%x, MGID %pI6\n",
1201 			     ibqp->qp_num, gid->raw);
1202 
1203 	return err;
1204 }
1205 
1206 static int mlx5_ib_mcg_detach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid)
1207 {
1208 	struct mlx5_ib_dev *dev = to_mdev(ibqp->device);
1209 	int err;
1210 
1211 	if (ibqp->qp_type == IB_QPT_RAW_PACKET)
1212 		err = -EOPNOTSUPP;
1213 	else
1214 		err = mlx5_core_detach_mcg(dev->mdev, gid, ibqp->qp_num);
1215 	if (err)
1216 		mlx5_ib_warn(dev, "failed detaching QPN 0x%x, MGID %pI6\n",
1217 			     ibqp->qp_num, gid->raw);
1218 
1219 	return err;
1220 }
1221 
1222 static int init_node_data(struct mlx5_ib_dev *dev)
1223 {
1224 	int err;
1225 
1226 	err = mlx5_query_node_desc(dev, dev->ib_dev.node_desc);
1227 	if (err)
1228 		return err;
1229 
1230 	return mlx5_query_node_guid(dev, &dev->ib_dev.node_guid);
1231 }
1232 
1233 static ssize_t show_fw_pages(struct device *device, struct device_attribute *attr,
1234 			     char *buf)
1235 {
1236 	struct mlx5_ib_dev *dev =
1237 		container_of(device, struct mlx5_ib_dev, ib_dev.dev);
1238 
1239 	return sprintf(buf, "%lld\n", (long long)dev->mdev->priv.fw_pages);
1240 }
1241 
1242 static ssize_t show_reg_pages(struct device *device,
1243 			      struct device_attribute *attr, char *buf)
1244 {
1245 	struct mlx5_ib_dev *dev =
1246 		container_of(device, struct mlx5_ib_dev, ib_dev.dev);
1247 
1248 	return sprintf(buf, "%d\n", atomic_read(&dev->mdev->priv.reg_pages));
1249 }
1250 
1251 static ssize_t show_hca(struct device *device, struct device_attribute *attr,
1252 			char *buf)
1253 {
1254 	struct mlx5_ib_dev *dev =
1255 		container_of(device, struct mlx5_ib_dev, ib_dev.dev);
1256 	return sprintf(buf, "MT%d\n", dev->mdev->pdev->device);
1257 }
1258 
1259 static ssize_t show_fw_ver(struct device *device, struct device_attribute *attr,
1260 			   char *buf)
1261 {
1262 	struct mlx5_ib_dev *dev =
1263 		container_of(device, struct mlx5_ib_dev, ib_dev.dev);
1264 	return sprintf(buf, "%d.%d.%04d\n", fw_rev_maj(dev->mdev),
1265 		       fw_rev_min(dev->mdev), fw_rev_sub(dev->mdev));
1266 }
1267 
1268 static ssize_t show_rev(struct device *device, struct device_attribute *attr,
1269 			char *buf)
1270 {
1271 	struct mlx5_ib_dev *dev =
1272 		container_of(device, struct mlx5_ib_dev, ib_dev.dev);
1273 	return sprintf(buf, "%x\n", (unsigned)dev->mdev->pdev->revision);
1274 }
1275 
1276 static ssize_t show_board(struct device *device, struct device_attribute *attr,
1277 			  char *buf)
1278 {
1279 	struct mlx5_ib_dev *dev =
1280 		container_of(device, struct mlx5_ib_dev, ib_dev.dev);
1281 	return sprintf(buf, "%.*s\n", MLX5_BOARD_ID_LEN,
1282 		       dev->mdev->board_id);
1283 }
1284 
1285 static DEVICE_ATTR(hw_rev,   S_IRUGO, show_rev,    NULL);
1286 static DEVICE_ATTR(fw_ver,   S_IRUGO, show_fw_ver, NULL);
1287 static DEVICE_ATTR(hca_type, S_IRUGO, show_hca,    NULL);
1288 static DEVICE_ATTR(board_id, S_IRUGO, show_board,  NULL);
1289 static DEVICE_ATTR(fw_pages, S_IRUGO, show_fw_pages, NULL);
1290 static DEVICE_ATTR(reg_pages, S_IRUGO, show_reg_pages, NULL);
1291 
1292 static struct device_attribute *mlx5_class_attributes[] = {
1293 	&dev_attr_hw_rev,
1294 	&dev_attr_fw_ver,
1295 	&dev_attr_hca_type,
1296 	&dev_attr_board_id,
1297 	&dev_attr_fw_pages,
1298 	&dev_attr_reg_pages,
1299 };
1300 
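/*
 * On a fatal device error, walk all QPs on this ibdev and invoke the
 * completion callback of every CQ that may still have outstanding work,
 * so that consumers can flush their queues.
 */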
1301 static void mlx5_ib_handle_internal_error(struct mlx5_ib_dev *ibdev)
1302 {
1303 	struct mlx5_ib_qp *mqp;
1304 	struct mlx5_ib_cq *send_mcq, *recv_mcq;
1305 	struct mlx5_core_cq *mcq;
1306 	struct list_head cq_armed_list;
1307 	unsigned long flags_qp;
1308 	unsigned long flags_cq;
1309 	unsigned long flags;
1310 
1311 	mlx5_ib_warn(ibdev, " started\n");
1312 	INIT_LIST_HEAD(&cq_armed_list);
1313 
1314 	/* Go over the QP list residing on this ibdev; synchronize with QP create/destroy. */
1315 	spin_lock_irqsave(&ibdev->reset_flow_resource_lock, flags);
1316 	list_for_each_entry(mqp, &ibdev->qp_list, qps_list) {
1317 		spin_lock_irqsave(&mqp->sq.lock, flags_qp);
1318 		if (mqp->sq.tail != mqp->sq.head) {
1319 			send_mcq = to_mcq(mqp->ibqp.send_cq);
1320 			spin_lock_irqsave(&send_mcq->lock, flags_cq);
1321 			if (send_mcq->mcq.comp &&
1322 			    mqp->ibqp.send_cq->comp_handler) {
1323 				if (!send_mcq->mcq.reset_notify_added) {
1324 					send_mcq->mcq.reset_notify_added = 1;
1325 					list_add_tail(&send_mcq->mcq.reset_notify,
1326 						      &cq_armed_list);
1327 				}
1328 			}
1329 			spin_unlock_irqrestore(&send_mcq->lock, flags_cq);
1330 		}
1331 		spin_unlock_irqrestore(&mqp->sq.lock, flags_qp);
1332 		spin_lock_irqsave(&mqp->rq.lock, flags_qp);
1333 		/* no handling is needed for SRQ */
1334 		if (!mqp->ibqp.srq) {
1335 			if (mqp->rq.tail != mqp->rq.head) {
1336 				recv_mcq = to_mcq(mqp->ibqp.recv_cq);
1337 				spin_lock_irqsave(&recv_mcq->lock, flags_cq);
1338 				if (recv_mcq->mcq.comp &&
1339 				    mqp->ibqp.recv_cq->comp_handler) {
1340 					if (!recv_mcq->mcq.reset_notify_added) {
1341 						recv_mcq->mcq.reset_notify_added = 1;
1342 						list_add_tail(&recv_mcq->mcq.reset_notify,
1343 							      &cq_armed_list);
1344 					}
1345 				}
1346 				spin_unlock_irqrestore(&recv_mcq->lock,
1347 						       flags_cq);
1348 			}
1349 		}
1350 		spin_unlock_irqrestore(&mqp->rq.lock, flags_qp);
1351 	}
1352 	/* At this point all in-flight post_send requests have been observed,
1353 	 * since we took and released the locks above.  Now arm all involved CQs.
1354 	 */
1355 	list_for_each_entry(mcq, &cq_armed_list, reset_notify) {
1356 		mcq->comp(mcq);
1357 	}
1358 	spin_unlock_irqrestore(&ibdev->reset_flow_resource_lock, flags);
1359 	mlx5_ib_warn(ibdev, " ended\n");
1360 	return;
1361 }
1362 
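/*
 * Translate mlx5 core events into IB events and dispatch them to ibcore.
 */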
1363 static void mlx5_ib_event(struct mlx5_core_dev *dev, void *context,
1364 			  enum mlx5_dev_event event, unsigned long param)
1365 {
1366 	struct mlx5_ib_dev *ibdev = (struct mlx5_ib_dev *)context;
1367 	struct ib_event ibev;
1368 
1369 	u8 port = 0;
1370 
1371 	switch (event) {
1372 	case MLX5_DEV_EVENT_SYS_ERROR:
1373 		ibdev->ib_active = false;
1374 		ibev.event = IB_EVENT_DEVICE_FATAL;
1375 		mlx5_ib_handle_internal_error(ibdev);
1376 		break;
1377 
1378 	case MLX5_DEV_EVENT_PORT_UP:
1379 		ibev.event = IB_EVENT_PORT_ACTIVE;
1380 		port = (u8)param;
1381 		break;
1382 
1383 	case MLX5_DEV_EVENT_PORT_DOWN:
1384 	case MLX5_DEV_EVENT_PORT_INITIALIZED:
1385 		ibev.event = IB_EVENT_PORT_ERR;
1386 		port = (u8)param;
1387 		break;
1388 
1389 	case MLX5_DEV_EVENT_LID_CHANGE:
1390 		ibev.event = IB_EVENT_LID_CHANGE;
1391 		port = (u8)param;
1392 		break;
1393 
1394 	case MLX5_DEV_EVENT_PKEY_CHANGE:
1395 		ibev.event = IB_EVENT_PKEY_CHANGE;
1396 		port = (u8)param;
1397 		break;
1398 
1399 	case MLX5_DEV_EVENT_GUID_CHANGE:
1400 		ibev.event = IB_EVENT_GID_CHANGE;
1401 		port = (u8)param;
1402 		break;
1403 
1404 	case MLX5_DEV_EVENT_CLIENT_REREG:
1405 		ibev.event = IB_EVENT_CLIENT_REREGISTER;
1406 		port = (u8)param;
1407 		break;
1408 
1409 	default:
1410 		break;
1411 	}
1412 
1413 	ibev.device	      = &ibdev->ib_dev;
1414 	ibev.element.port_num = port;
1415 
1416 	if ((event != MLX5_DEV_EVENT_SYS_ERROR) &&
1417 	    (port < 1 || port > ibdev->num_ports)) {
1418 		mlx5_ib_warn(ibdev, "warning: event on port %d\n", port);
1419 		return;
1420 	}
1421 
1422 	if (ibdev->ib_active)
1423 		ib_dispatch_event(&ibev);
1424 }
1425 
1426 static void get_ext_port_caps(struct mlx5_ib_dev *dev)
1427 {
1428 	int port;
1429 
1430 	for (port = 1; port <= MLX5_CAP_GEN(dev->mdev, num_ports); port++)
1431 		mlx5_query_ext_port_caps(dev, port);
1432 }
1433 
1434 static void config_atomic_responder(struct mlx5_ib_dev *dev,
1435 				    struct ib_device_attr *props)
1436 {
1437 	enum ib_atomic_cap cap = props->atomic_cap;
1438 
1439 #if 0
1440 	if (cap == IB_ATOMIC_HCA ||
1441 	    cap == IB_ATOMIC_GLOB)
1442 #endif
1443 		dev->enable_atomic_resp = 1;
1444 
1445 	dev->atomic_cap = cap;
1446 }
1447 
1448 enum mlx5_addr_align {
1449 	MLX5_ADDR_ALIGN_0	= 0,
1450 	MLX5_ADDR_ALIGN_64	= 64,
1451 	MLX5_ADDR_ALIGN_128	= 128,
1452 };
1453 
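/*
 * Query the device and per-port attributes at initialization time and
 * cache the pkey and GID table lengths in the core port caps.
 */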
1454 static int get_port_caps(struct mlx5_ib_dev *dev)
1455 {
1456 	struct ib_device_attr *dprops = NULL;
1457 	struct ib_port_attr *pprops = NULL;
1458 	int err = -ENOMEM;
1459 	int port;
1460 
1461 	pprops = kmalloc(sizeof(*pprops), GFP_KERNEL);
1462 	if (!pprops)
1463 		goto out;
1464 
1465 	dprops = kmalloc(sizeof(*dprops), GFP_KERNEL);
1466 	if (!dprops)
1467 		goto out;
1468 
1469 	err = mlx5_ib_query_device(&dev->ib_dev, dprops);
1470 	if (err) {
1471 		mlx5_ib_warn(dev, "query_device failed %d\n", err);
1472 		goto out;
1473 	}
1474 	config_atomic_responder(dev, dprops);
1475 
1476 	for (port = 1; port <= MLX5_CAP_GEN(dev->mdev, num_ports); port++) {
1477 		err = mlx5_ib_query_port(&dev->ib_dev, port, pprops);
1478 		if (err) {
1479 			mlx5_ib_warn(dev, "query_port %d failed %d\n",
1480 				     port, err);
1481 			break;
1482 		}
1483 		dev->mdev->port_caps[port - 1].pkey_table_len = dprops->max_pkeys;
1484 		dev->mdev->port_caps[port - 1].gid_table_len = pprops->gid_tbl_len;
1485 		mlx5_ib_dbg(dev, "pkey_table_len %d, gid_table_len %d\n",
1486 			    dprops->max_pkeys, pprops->gid_tbl_len);
1487 	}
1488 
1489 out:
1490 	kfree(pprops);
1491 	kfree(dprops);
1492 
1493 	return err;
1494 }
1495 
1496 static void destroy_umrc_res(struct mlx5_ib_dev *dev)
1497 {
1498 	int err;
1499 
1500 	err = mlx5_mr_cache_cleanup(dev);
1501 	if (err)
1502 		mlx5_ib_warn(dev, "mr cache cleanup failed\n");
1503 
1504 	ib_dereg_mr(dev->umrc.mr);
1505 	ib_dealloc_pd(dev->umrc.pd);
1506 }
1507 
1508 enum {
1509 	MAX_UMR_WR = 128,
1510 };
1511 
1512 static int create_umr_res(struct mlx5_ib_dev *dev)
1513 {
1514 	struct ib_pd *pd;
1515 	struct ib_mr *mr;
1516 	int ret;
1517 
1518 	pd = ib_alloc_pd(&dev->ib_dev);
1519 	if (IS_ERR(pd)) {
1520 		mlx5_ib_dbg(dev, "Couldn't create PD for sync UMR QP\n");
1521 		ret = PTR_ERR(pd);
1522 		goto error_0;
1523 	}
1524 
1525 	mr = ib_get_dma_mr(pd,  IB_ACCESS_LOCAL_WRITE);
1526 	if (IS_ERR(mr)) {
1527 		mlx5_ib_dbg(dev, "Couldn't create DMA MR for sync UMR QP\n");
1528 		ret = PTR_ERR(mr);
1529 		goto error_1;
1530 	}
1531 
1532 	dev->umrc.mr = mr;
1533 	dev->umrc.pd = pd;
1534 
1535 	ret = mlx5_mr_cache_init(dev);
1536 	if (ret) {
1537 		mlx5_ib_warn(dev, "mr cache init failed %d\n", ret);
1538 		goto error_4;
1539 	}
1540 
1541 	return 0;
1542 
1543 error_4:
1544 	ib_dereg_mr(mr);
1545 error_1:
1546 	ib_dealloc_pd(pd);
1547 error_0:
1548 	return ret;
1549 }
1550 
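/*
 * Create the device-wide verbs resources (PD p0, CQ c0, XRC domains x0/x1
 * and SRQs s0/s1) used internally by the driver.
 */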
1551 static int create_dev_resources(struct mlx5_ib_resources *devr)
1552 {
1553 	struct ib_srq_init_attr attr;
1554 	struct mlx5_ib_dev *dev;
1555 	int ret = 0;
1556 	struct ib_cq_init_attr cq_attr = { .cqe = 1 };
1557 
1558 	dev = container_of(devr, struct mlx5_ib_dev, devr);
1559 
1560 	devr->p0 = mlx5_ib_alloc_pd(&dev->ib_dev, NULL, NULL);
1561 	if (IS_ERR(devr->p0)) {
1562 		ret = PTR_ERR(devr->p0);
1563 		goto error0;
1564 	}
1565 	devr->p0->device  = &dev->ib_dev;
1566 	devr->p0->uobject = NULL;
1567 	atomic_set(&devr->p0->usecnt, 0);
1568 
1569 	devr->c0 = mlx5_ib_create_cq(&dev->ib_dev, &cq_attr, NULL, NULL);
1570 	if (IS_ERR(devr->c0)) {
1571 		ret = PTR_ERR(devr->c0);
1572 		goto error1;
1573 	}
1574 	devr->c0->device        = &dev->ib_dev;
1575 	devr->c0->uobject       = NULL;
1576 	devr->c0->comp_handler  = NULL;
1577 	devr->c0->event_handler = NULL;
1578 	devr->c0->cq_context    = NULL;
1579 	atomic_set(&devr->c0->usecnt, 0);
1580 
1581 	devr->x0 = mlx5_ib_alloc_xrcd(&dev->ib_dev, NULL, NULL);
1582 	if (IS_ERR(devr->x0)) {
1583 		ret = PTR_ERR(devr->x0);
1584 		goto error2;
1585 	}
1586 	devr->x0->device = &dev->ib_dev;
1587 	devr->x0->inode = NULL;
1588 	atomic_set(&devr->x0->usecnt, 0);
1589 	mutex_init(&devr->x0->tgt_qp_mutex);
1590 	INIT_LIST_HEAD(&devr->x0->tgt_qp_list);
1591 
1592 	devr->x1 = mlx5_ib_alloc_xrcd(&dev->ib_dev, NULL, NULL);
1593 	if (IS_ERR(devr->x1)) {
1594 		ret = PTR_ERR(devr->x1);
1595 		goto error3;
1596 	}
1597 	devr->x1->device = &dev->ib_dev;
1598 	devr->x1->inode = NULL;
1599 	atomic_set(&devr->x1->usecnt, 0);
1600 	mutex_init(&devr->x1->tgt_qp_mutex);
1601 	INIT_LIST_HEAD(&devr->x1->tgt_qp_list);
1602 
1603 	memset(&attr, 0, sizeof(attr));
1604 	attr.attr.max_sge = 1;
1605 	attr.attr.max_wr = 1;
1606 	attr.srq_type = IB_SRQT_XRC;
1607 	attr.ext.xrc.cq = devr->c0;
1608 	attr.ext.xrc.xrcd = devr->x0;
1609 
1610 	devr->s0 = mlx5_ib_create_srq(devr->p0, &attr, NULL);
1611 	if (IS_ERR(devr->s0)) {
1612 		ret = PTR_ERR(devr->s0);
1613 		goto error4;
1614 	}
1615 	devr->s0->device	= &dev->ib_dev;
1616 	devr->s0->pd		= devr->p0;
1617 	devr->s0->uobject       = NULL;
1618 	devr->s0->event_handler = NULL;
1619 	devr->s0->srq_context   = NULL;
1620 	devr->s0->srq_type      = IB_SRQT_XRC;
1621 	devr->s0->ext.xrc.xrcd  = devr->x0;
1622 	devr->s0->ext.xrc.cq	= devr->c0;
1623 	atomic_inc(&devr->s0->ext.xrc.xrcd->usecnt);
1624 	atomic_inc(&devr->s0->ext.xrc.cq->usecnt);
1625 	atomic_inc(&devr->p0->usecnt);
1626 	atomic_set(&devr->s0->usecnt, 0);
1627 
1628 	memset(&attr, 0, sizeof(attr));
1629 	attr.attr.max_sge = 1;
1630 	attr.attr.max_wr = 1;
1631 	attr.srq_type = IB_SRQT_BASIC;
1632 	devr->s1 = mlx5_ib_create_srq(devr->p0, &attr, NULL);
1633 	if (IS_ERR(devr->s1)) {
1634 		ret = PTR_ERR(devr->s1);
1635 		goto error5;
1636 	}
1637 	devr->s1->device	= &dev->ib_dev;
1638 	devr->s1->pd		= devr->p0;
1639 	devr->s1->uobject       = NULL;
1640 	devr->s1->event_handler = NULL;
1641 	devr->s1->srq_context   = NULL;
1642 	devr->s1->srq_type      = IB_SRQT_BASIC;
1643 	devr->s1->ext.xrc.cq	= devr->c0;
1644 	atomic_inc(&devr->p0->usecnt);
1645 	atomic_set(&devr->s1->usecnt, 0);
1646 
1647 	return 0;
1648 
1649 error5:
1650 	mlx5_ib_destroy_srq(devr->s0);
1651 error4:
1652 	mlx5_ib_dealloc_xrcd(devr->x1);
1653 error3:
1654 	mlx5_ib_dealloc_xrcd(devr->x0);
1655 error2:
1656 	mlx5_ib_destroy_cq(devr->c0);
1657 error1:
1658 	mlx5_ib_dealloc_pd(devr->p0);
1659 error0:
1660 	return ret;
1661 }
1662 
1663 static void destroy_dev_resources(struct mlx5_ib_resources *devr)
1664 {
1665 	mlx5_ib_destroy_srq(devr->s1);
1666 	mlx5_ib_destroy_srq(devr->s0);
1667 	mlx5_ib_dealloc_xrcd(devr->x0);
1668 	mlx5_ib_dealloc_xrcd(devr->x1);
1669 	mlx5_ib_destroy_cq(devr->c0);
1670 	mlx5_ib_dealloc_pd(devr->p0);
1671 }
1672 
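/*
 * Compute the RDMA core port capability flags: plain IB for an InfiniBand
 * link layer; for Ethernet, raw packet support plus RoCE v1/v2 when both
 * IPv4 and IPv6 L3 types are supported.
 */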
1673 static u32 get_core_cap_flags(struct ib_device *ibdev)
1674 {
1675 	struct mlx5_ib_dev *dev = to_mdev(ibdev);
1676 	enum rdma_link_layer ll = mlx5_ib_port_link_layer(ibdev, 1);
1677 	u8 l3_type_cap = MLX5_CAP_ROCE(dev->mdev, l3_type);
1678 	u8 roce_version_cap = MLX5_CAP_ROCE(dev->mdev, roce_version);
1679 	u32 ret = 0;
1680 
1681 	if (ll == IB_LINK_LAYER_INFINIBAND)
1682 		return RDMA_CORE_PORT_IBA_IB;
1683 
1684 	ret = RDMA_CORE_PORT_RAW_PACKET;
1685 
1686 	if (!(l3_type_cap & MLX5_ROCE_L3_TYPE_IPV4_CAP))
1687 		return ret;
1688 
1689 	if (!(l3_type_cap & MLX5_ROCE_L3_TYPE_IPV6_CAP))
1690 		return ret;
1691 
1692 	if (roce_version_cap & MLX5_ROCE_VERSION_1_CAP)
1693 		ret |= RDMA_CORE_PORT_IBA_ROCE;
1694 
1695 	if (roce_version_cap & MLX5_ROCE_VERSION_2_CAP)
1696 		ret |= RDMA_CORE_PORT_IBA_ROCE_UDP_ENCAP;
1697 
1698 	return ret;
1699 }
1700 
1701 static int mlx5_port_immutable(struct ib_device *ibdev, u8 port_num,
1702 			       struct ib_port_immutable *immutable)
1703 {
1704 	struct ib_port_attr attr;
1705 	struct mlx5_ib_dev *dev = to_mdev(ibdev);
1706 	enum rdma_link_layer ll = mlx5_ib_port_link_layer(ibdev, port_num);
1707 	int err;
1708 
1709 	immutable->core_cap_flags = get_core_cap_flags(ibdev);
1710 
1711 	err = ib_query_port(ibdev, port_num, &attr);
1712 	if (err)
1713 		return err;
1714 
1715 	immutable->pkey_tbl_len = attr.pkey_tbl_len;
1716 	immutable->gid_tbl_len = attr.gid_tbl_len;
1717 	immutable->core_cap_flags = get_core_cap_flags(ibdev);
1718 	if ((ll == IB_LINK_LAYER_INFINIBAND) || MLX5_CAP_GEN(dev->mdev, roce))
1719 		immutable->max_mad_size = IB_MGMT_MAD_SIZE;
1720 
1721 	return 0;
1722 }
1723 
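/*
 * Allocate and DMA-map a 4KB-per-port trace buffer and enable DC CNAK
 * tracing in firmware.
 */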
1724 static void enable_dc_tracer(struct mlx5_ib_dev *dev)
1725 {
1726 	struct device *device = dev->ib_dev.dma_device;
1727 	struct mlx5_dc_tracer *dct = &dev->dctr;
1728 	int order;
1729 	void *tmp;
1730 	int size;
1731 	int err;
1732 
1733 	size = MLX5_CAP_GEN(dev->mdev, num_ports) * 4096;
1734 	if (size <= PAGE_SIZE)
1735 		order = 0;
1736 	else
1737 		order = 1;
1738 
1739 	dct->pg = alloc_pages(GFP_KERNEL, order);
1740 	if (!dct->pg) {
1741 		mlx5_ib_err(dev, "failed to allocate pages (order %d)\n", order);
1742 		return;
1743 	}
1744 
1745 	tmp = page_address(dct->pg);
1746 	memset(tmp, 0xff, size);
1747 
1748 	dct->size = size;
1749 	dct->order = order;
1750 	dct->dma = dma_map_page(device, dct->pg, 0, size, DMA_FROM_DEVICE);
1751 	if (dma_mapping_error(device, dct->dma)) {
1752 		mlx5_ib_err(dev, "dma mapping error\n");
1753 		goto map_err;
1754 	}
1755 
1756 	err = mlx5_core_set_dc_cnak_trace(dev->mdev, 1, dct->dma);
1757 	if (err) {
1758 		mlx5_ib_warn(dev, "failed to enable DC tracer\n");
1759 		goto cmd_err;
1760 	}
1761 
1762 	return;
1763 
1764 cmd_err:
1765 	dma_unmap_page(device, dct->dma, size, DMA_FROM_DEVICE);
1766 map_err:
1767 	__free_pages(dct->pg, dct->order);
1768 	dct->pg = NULL;
1769 }
1770 
1771 static void disable_dc_tracer(struct mlx5_ib_dev *dev)
1772 {
1773 	struct device *device = dev->ib_dev.dma_device;
1774 	struct mlx5_dc_tracer *dct = &dev->dctr;
1775 	int err;
1776 
1777 	if (!dct->pg)
1778 		return;
1779 
1780 	err = mlx5_core_set_dc_cnak_trace(dev->mdev, 0, dct->dma);
1781 	if (err) {
1782 		mlx5_ib_warn(dev, "failed to disable DC tracer\n");
1783 		return;
1784 	}
1785 
1786 	dma_unmap_page(device, dct->dma, dct->size, DMA_FROM_DEVICE);
1787 	__free_pages(dct->pg, dct->order);
1788 	dct->pg = NULL;
1789 }
1790 
1791 enum {
1792 	MLX5_DC_CNAK_SIZE		= 128,
1793 	MLX5_NUM_BUF_IN_PAGE		= PAGE_SIZE / MLX5_DC_CNAK_SIZE,
1794 	MLX5_CNAK_TX_CQ_SIGNAL_FACTOR	= 128,
1795 	MLX5_DC_CNAK_SL			= 0,
1796 	MLX5_DC_CNAK_VL			= 0,
1797 };
1798 
1799 static int init_dc_improvements(struct mlx5_ib_dev *dev)
1800 {
1801 	if (!mlx5_core_is_pf(dev->mdev))
1802 		return 0;
1803 
1804 	if (!(MLX5_CAP_GEN(dev->mdev, dc_cnak_trace)))
1805 		return 0;
1806 
1807 	enable_dc_tracer(dev);
1808 
1809 	return 0;
1810 }
1811 
1812 static void cleanup_dc_improvements(struct mlx5_ib_dev *dev)
1813 {
1814 
1815 	disable_dc_tracer(dev);
1816 }
1817 
1818 static void mlx5_ib_dealloc_q_port_counter(struct mlx5_ib_dev *dev, u8 port_num)
1819 {
1820 	mlx5_vport_dealloc_q_counter(dev->mdev,
1821 				     MLX5_INTERFACE_PROTOCOL_IB,
1822 				     dev->port[port_num].q_cnt_id);
1823 	dev->port[port_num].q_cnt_id = 0;
1824 }
1825 
1826 static void mlx5_ib_dealloc_q_counters(struct mlx5_ib_dev *dev)
1827 {
1828 	unsigned int i;
1829 
1830 	for (i = 0; i < dev->num_ports; i++)
1831 		mlx5_ib_dealloc_q_port_counter(dev, i);
1832 }
1833 
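/*
 * Allocate one queue counter per port; on failure, release the counters
 * allocated so far.
 */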
1834 static int mlx5_ib_alloc_q_counters(struct mlx5_ib_dev *dev)
1835 {
1836 	int i;
1837 	int ret;
1838 
1839 	for (i = 0; i < dev->num_ports; i++) {
1840 		ret = mlx5_vport_alloc_q_counter(dev->mdev,
1841 						 MLX5_INTERFACE_PROTOCOL_IB,
1842 						 &dev->port[i].q_cnt_id);
1843 		if (ret) {
1844 			mlx5_ib_warn(dev,
1845 				     "couldn't allocate queue counter for port %d\n",
1846 				     i + 1);
1847 			goto dealloc_counters;
1848 		}
1849 	}
1850 
1851 	return 0;
1852 
1853 dealloc_counters:
1854 	while (--i >= 0)
1855 		mlx5_ib_dealloc_q_port_counter(dev, i);
1856 
1857 	return ret;
1858 }
1859 
1860 struct port_attribute {
1861 	struct attribute attr;
1862 	ssize_t (*show)(struct mlx5_ib_port *,
1863 			struct port_attribute *, char *buf);
1864 	ssize_t (*store)(struct mlx5_ib_port *,
1865 			 struct port_attribute *,
1866 			 const char *buf, size_t count);
1867 };
1868 
1869 struct port_counter_attribute {
1870 	struct port_attribute	attr;
1871 	size_t			offset;
1872 };
1873 
1874 static ssize_t port_attr_show(struct kobject *kobj,
1875 			      struct attribute *attr, char *buf)
1876 {
1877 	struct port_attribute *port_attr =
1878 		container_of(attr, struct port_attribute, attr);
1879 	struct mlx5_ib_port_sysfs_group *p =
1880 		container_of(kobj, struct mlx5_ib_port_sysfs_group,
1881 			     kobj);
1882 	struct mlx5_ib_port *mibport = container_of(p, struct mlx5_ib_port,
1883 						    group);
1884 
1885 	if (!port_attr->show)
1886 		return -EIO;
1887 
1888 	return port_attr->show(mibport, port_attr, buf);
1889 }
1890 
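/*
 * Query the firmware queue counter set for this port and print the
 * single 32-bit counter selected by the attribute's offset.
 */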
1891 static ssize_t show_port_counter(struct mlx5_ib_port *p,
1892 				 struct port_attribute *port_attr,
1893 				 char *buf)
1894 {
1895 	int outlen = MLX5_ST_SZ_BYTES(query_q_counter_out);
1896 	struct port_counter_attribute *counter_attr =
1897 		container_of(port_attr, struct port_counter_attribute, attr);
1898 	void *out;
1899 	int ret;
1900 
1901 	out = mlx5_vzalloc(outlen);
1902 	if (!out)
1903 		return -ENOMEM;
1904 
1905 	ret = mlx5_vport_query_q_counter(p->dev->mdev,
1906 					 p->q_cnt_id, 0,
1907 					 out, outlen);
1908 	if (ret)
1909 		goto free;
1910 
1911 	ret = sprintf(buf, "%d\n",
1912 		      be32_to_cpu(*(__be32 *)(out + counter_attr->offset)));
1913 
1914 free:
1915 	kfree(out);
1916 	return ret;
1917 }
1918 
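/*
 * Declare a read-only counter attribute whose offset matches the field of
 * the same name in query_q_counter_out.
 */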
1919 #define PORT_COUNTER_ATTR(_name)					\
1920 struct port_counter_attribute port_counter_attr_##_name = {		\
1921 	.attr  = __ATTR(_name, S_IRUGO, show_port_counter, NULL),	\
1922 	.offset = MLX5_BYTE_OFF(query_q_counter_out, _name)		\
1923 }
1924 
1925 static PORT_COUNTER_ATTR(rx_write_requests);
1926 static PORT_COUNTER_ATTR(rx_read_requests);
1927 static PORT_COUNTER_ATTR(rx_atomic_requests);
1928 static PORT_COUNTER_ATTR(rx_dct_connect);
1929 static PORT_COUNTER_ATTR(out_of_buffer);
1930 static PORT_COUNTER_ATTR(out_of_sequence);
1931 static PORT_COUNTER_ATTR(duplicate_request);
1932 static PORT_COUNTER_ATTR(rnr_nak_retry_err);
1933 static PORT_COUNTER_ATTR(packet_seq_err);
1934 static PORT_COUNTER_ATTR(implied_nak_seq_err);
1935 static PORT_COUNTER_ATTR(local_ack_timeout_err);
1936 
1937 static struct attribute *counter_attrs[] = {
1938 	&port_counter_attr_rx_write_requests.attr.attr,
1939 	&port_counter_attr_rx_read_requests.attr.attr,
1940 	&port_counter_attr_rx_atomic_requests.attr.attr,
1941 	&port_counter_attr_rx_dct_connect.attr.attr,
1942 	&port_counter_attr_out_of_buffer.attr.attr,
1943 	&port_counter_attr_out_of_sequence.attr.attr,
1944 	&port_counter_attr_duplicate_request.attr.attr,
1945 	&port_counter_attr_rnr_nak_retry_err.attr.attr,
1946 	&port_counter_attr_packet_seq_err.attr.attr,
1947 	&port_counter_attr_implied_nak_seq_err.attr.attr,
1948 	&port_counter_attr_local_ack_timeout_err.attr.attr,
1949 	NULL
1950 };
1951 
1952 static struct attribute_group port_counters_group = {
1953 	.name  = "counters",
1954 	.attrs  = counter_attrs
1955 };
1956 
1957 static const struct sysfs_ops port_sysfs_ops = {
1958 	.show = port_attr_show
1959 };
1960 
1961 static struct kobj_type port_type = {
1962 	.sysfs_ops     = &port_sysfs_ops,
1963 };
1964 
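/*
 * Create the sysfs node for one port and, when the device exposes the
 * out-of-sequence and retransmission counters, attach the "counters"
 * attribute group underneath it.
 */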
1965 static int add_port_attrs(struct mlx5_ib_dev *dev,
1966 			  struct kobject *parent,
1967 			  struct mlx5_ib_port_sysfs_group *port,
1968 			  u8 port_num)
1969 {
1970 	int ret;
1971 
1972 	ret = kobject_init_and_add(&port->kobj, &port_type,
1973 				   parent,
1974 				   "%d", port_num);
1975 	if (ret)
1976 		return ret;
1977 
1978 	if (MLX5_CAP_GEN(dev->mdev, out_of_seq_cnt) &&
1979 	    MLX5_CAP_GEN(dev->mdev, retransmission_q_counters)) {
1980 		ret = sysfs_create_group(&port->kobj, &port_counters_group);
1981 		if (ret)
1982 			goto put_kobj;
1983 	}
1984 
1985 	port->enabled = true;
1986 	return ret;
1987 
1988 put_kobj:
1989 	kobject_put(&port->kobj);
1990 	return ret;
1991 }
1992 
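/*
 * Undo add_port_attrs() for the first num_ports ports, then drop the
 * "mlx5_ports" parent kobject.
 */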
1993 static void destroy_ports_attrs(struct mlx5_ib_dev *dev,
1994 				unsigned int num_ports)
1995 {
1996 	unsigned int i;
1997 
1998 	for (i = 0; i < num_ports; i++) {
1999 		struct mlx5_ib_port_sysfs_group *port =
2000 			&dev->port[i].group;
2001 
2002 		if (!port->enabled)
2003 			continue;
2004 
2005 		if (MLX5_CAP_GEN(dev->mdev, out_of_seq_cnt) &&
2006 		    MLX5_CAP_GEN(dev->mdev, retransmission_q_counters))
2007 			sysfs_remove_group(&port->kobj,
2008 					   &port_counters_group);
2009 		kobject_put(&port->kobj);
2010 		port->enabled = false;
2011 	}
2012 
2013 	if (dev->ports_parent) {
2014 		kobject_put(dev->ports_parent);
2015 		dev->ports_parent = NULL;
2016 	}
2017 }
2018 
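/*
 * Build the "mlx5_ports/<N>" sysfs hierarchy under the IB device; if any
 * port fails, the ports created so far are torn down.
 */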
2019 static int create_port_attrs(struct mlx5_ib_dev *dev)
2020 {
2021 	int ret = 0;
2022 	unsigned int i = 0;
2023 	struct device *device = &dev->ib_dev.dev;
2024 
2025 	dev->ports_parent = kobject_create_and_add("mlx5_ports",
2026 						   &device->kobj);
2027 	if (!dev->ports_parent)
2028 		return -ENOMEM;
2029 
2030 	for (i = 0; i < dev->num_ports; i++) {
2031 		ret = add_port_attrs(dev,
2032 				     dev->ports_parent,
2033 				     &dev->port[i].group,
2034 				     i + 1);
2035 
2036 		if (ret)
2037 			goto _destroy_ports_attrs;
2038 	}
2039 
2040 	return 0;
2041 
2042 _destroy_ports_attrs:
2043 	destroy_ports_attrs(dev, i);
2044 	return ret;
2045 }
2046 
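/*
 * mlx5_core attach callback: allocate and populate the ib_device, query
 * port capabilities, enable RoCE when the first port is Ethernet, fill in
 * the verbs entry points, register with the IB core, and stand up the
 * auxiliary resources (UMR, queue counters, DC improvements, sysfs
 * attributes, per-port update threads).  Returns the new device context,
 * or NULL after unwinding on any failure.
 */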
2047 static void *mlx5_ib_add(struct mlx5_core_dev *mdev)
2048 {
2049 	struct mlx5_ib_dev *dev;
2050 	int err;
2051 	int i;
2052 
2053 	printk_once(KERN_INFO "%s", mlx5_version);
2054 
2055 	dev = (struct mlx5_ib_dev *)ib_alloc_device(sizeof(*dev));
2056 	if (!dev)
2057 		return NULL;
2058 
2059 	dev->mdev = mdev;
2060 
2061 	dev->port = kcalloc(MLX5_CAP_GEN(mdev, num_ports), sizeof(*dev->port),
2062 			     GFP_KERNEL);
2063 	if (!dev->port)
2064 		goto err_dealloc;
2065 
2066 	for (i = 0; i < MLX5_CAP_GEN(mdev, num_ports); i++) {
2067 		dev->port[i].dev = dev;
2068 		dev->port[i].port_num = i;
2069 		dev->port[i].port_gone = 0;
2070 		memset(dev->port[i].gid_table, 0, sizeof(dev->port[i].gid_table));
2071 	}
2072 
2073 	err = get_port_caps(dev);
2074 	if (err)
2075 		goto err_free_port;
2076 
2077 	if (mlx5_use_mad_ifc(dev))
2078 		get_ext_port_caps(dev);
2079 
2080 	if (mlx5_ib_port_link_layer(&dev->ib_dev, 1) ==
2081 	    IB_LINK_LAYER_ETHERNET) {
2082 		if (MLX5_CAP_GEN(mdev, roce)) {
2083 			err = mlx5_nic_vport_enable_roce(mdev);
2084 			if (err)
2085 				goto err_free_port;
2086 		} else {
2087 			goto err_free_port;
2088 		}
2089 	}
2090 
2091 	MLX5_INIT_DOORBELL_LOCK(&dev->uar_lock);
2092 
2093 	strlcpy(dev->ib_dev.name, "mlx5_%d", IB_DEVICE_NAME_MAX);
2094 	dev->ib_dev.owner		= THIS_MODULE;
2095 	dev->ib_dev.node_type		= RDMA_NODE_IB_CA;
2096 	dev->ib_dev.local_dma_lkey	= mdev->special_contexts.resd_lkey;
2097 	dev->num_ports		= MLX5_CAP_GEN(mdev, num_ports);
2098 	dev->ib_dev.phys_port_cnt     = dev->num_ports;
2099 	dev->ib_dev.num_comp_vectors    =
2100 		dev->mdev->priv.eq_table.num_comp_vectors;
2101 	dev->ib_dev.dma_device	= &mdev->pdev->dev;
2102 
2103 	dev->ib_dev.uverbs_abi_ver	= MLX5_IB_UVERBS_ABI_VERSION;
2104 	dev->ib_dev.uverbs_cmd_mask	=
2105 		(1ull << IB_USER_VERBS_CMD_GET_CONTEXT)		|
2106 		(1ull << IB_USER_VERBS_CMD_QUERY_DEVICE)	|
2107 		(1ull << IB_USER_VERBS_CMD_QUERY_PORT)		|
2108 		(1ull << IB_USER_VERBS_CMD_ALLOC_PD)		|
2109 		(1ull << IB_USER_VERBS_CMD_DEALLOC_PD)		|
2110 		(1ull << IB_USER_VERBS_CMD_REG_MR)		|
2111 		(1ull << IB_USER_VERBS_CMD_DEREG_MR)		|
2112 		(1ull << IB_USER_VERBS_CMD_CREATE_COMP_CHANNEL)	|
2113 		(1ull << IB_USER_VERBS_CMD_CREATE_CQ)		|
2114 		(1ull << IB_USER_VERBS_CMD_RESIZE_CQ)		|
2115 		(1ull << IB_USER_VERBS_CMD_DESTROY_CQ)		|
2116 		(1ull << IB_USER_VERBS_CMD_CREATE_QP)		|
2117 		(1ull << IB_USER_VERBS_CMD_MODIFY_QP)		|
2118 		(1ull << IB_USER_VERBS_CMD_QUERY_QP)		|
2119 		(1ull << IB_USER_VERBS_CMD_DESTROY_QP)		|
2120 		(1ull << IB_USER_VERBS_CMD_ATTACH_MCAST)	|
2121 		(1ull << IB_USER_VERBS_CMD_DETACH_MCAST)	|
2122 		(1ull << IB_USER_VERBS_CMD_CREATE_SRQ)		|
2123 		(1ull << IB_USER_VERBS_CMD_MODIFY_SRQ)		|
2124 		(1ull << IB_USER_VERBS_CMD_QUERY_SRQ)		|
2125 		(1ull << IB_USER_VERBS_CMD_DESTROY_SRQ)		|
2126 		(1ull << IB_USER_VERBS_CMD_CREATE_XSRQ)		|
2127 		(1ull << IB_USER_VERBS_CMD_OPEN_QP);
2128 
2129 	dev->ib_dev.query_device	= mlx5_ib_query_device;
2130 	dev->ib_dev.query_port		= mlx5_ib_query_port;
2131 	dev->ib_dev.get_link_layer	= mlx5_ib_port_link_layer;
2132 	dev->ib_dev.get_netdev		= mlx5_ib_get_netdev;
2133 	dev->ib_dev.query_gid		= mlx5_ib_query_gid;
2134 	dev->ib_dev.query_pkey		= mlx5_ib_query_pkey;
2135 	dev->ib_dev.modify_device	= mlx5_ib_modify_device;
2136 	dev->ib_dev.modify_port		= mlx5_ib_modify_port;
2137 	dev->ib_dev.alloc_ucontext	= mlx5_ib_alloc_ucontext;
2138 	dev->ib_dev.dealloc_ucontext	= mlx5_ib_dealloc_ucontext;
2139 	dev->ib_dev.mmap		= mlx5_ib_mmap;
2140 	dev->ib_dev.alloc_pd		= mlx5_ib_alloc_pd;
2141 	dev->ib_dev.dealloc_pd		= mlx5_ib_dealloc_pd;
2142 	dev->ib_dev.create_ah		= mlx5_ib_create_ah;
2143 	dev->ib_dev.query_ah		= mlx5_ib_query_ah;
2144 	dev->ib_dev.destroy_ah		= mlx5_ib_destroy_ah;
2145 	dev->ib_dev.create_srq		= mlx5_ib_create_srq;
2146 	dev->ib_dev.modify_srq		= mlx5_ib_modify_srq;
2147 	dev->ib_dev.query_srq		= mlx5_ib_query_srq;
2148 	dev->ib_dev.destroy_srq		= mlx5_ib_destroy_srq;
2149 	dev->ib_dev.post_srq_recv	= mlx5_ib_post_srq_recv;
2150 	dev->ib_dev.create_qp		= mlx5_ib_create_qp;
2151 	dev->ib_dev.modify_qp		= mlx5_ib_modify_qp;
2152 	dev->ib_dev.query_qp		= mlx5_ib_query_qp;
2153 	dev->ib_dev.destroy_qp		= mlx5_ib_destroy_qp;
2154 	dev->ib_dev.post_send		= mlx5_ib_post_send;
2155 	dev->ib_dev.post_recv		= mlx5_ib_post_recv;
2156 	dev->ib_dev.create_cq		= mlx5_ib_create_cq;
2157 	dev->ib_dev.modify_cq		= mlx5_ib_modify_cq;
2158 	dev->ib_dev.resize_cq		= mlx5_ib_resize_cq;
2159 	dev->ib_dev.destroy_cq		= mlx5_ib_destroy_cq;
2160 	dev->ib_dev.poll_cq		= mlx5_ib_poll_cq;
2161 	dev->ib_dev.req_notify_cq	= mlx5_ib_arm_cq;
2162 	dev->ib_dev.get_dma_mr		= mlx5_ib_get_dma_mr;
2163 	dev->ib_dev.reg_user_mr		= mlx5_ib_reg_user_mr;
2164 	dev->ib_dev.reg_phys_mr		= mlx5_ib_reg_phys_mr;
2165 	dev->ib_dev.dereg_mr		= mlx5_ib_dereg_mr;
2166 	dev->ib_dev.attach_mcast	= mlx5_ib_mcg_attach;
2167 	dev->ib_dev.detach_mcast	= mlx5_ib_mcg_detach;
2168 	dev->ib_dev.process_mad		= mlx5_ib_process_mad;
2169 	dev->ib_dev.get_port_immutable  = mlx5_port_immutable;
2170 	dev->ib_dev.alloc_fast_reg_mr	= mlx5_ib_alloc_fast_reg_mr;
2171 	dev->ib_dev.alloc_fast_reg_page_list = mlx5_ib_alloc_fast_reg_page_list;
2172 	dev->ib_dev.free_fast_reg_page_list  = mlx5_ib_free_fast_reg_page_list;
2173 
2174 	if (MLX5_CAP_GEN(mdev, xrc)) {
2175 		dev->ib_dev.alloc_xrcd = mlx5_ib_alloc_xrcd;
2176 		dev->ib_dev.dealloc_xrcd = mlx5_ib_dealloc_xrcd;
2177 		dev->ib_dev.uverbs_cmd_mask |=
2178 			(1ull << IB_USER_VERBS_CMD_OPEN_XRCD) |
2179 			(1ull << IB_USER_VERBS_CMD_CLOSE_XRCD);
2180 	}
2181 
2182 	err = init_node_data(dev);
2183 	if (err)
2184 		goto err_disable_roce;
2185 
2186 	mutex_init(&dev->cap_mask_mutex);
2187 	INIT_LIST_HEAD(&dev->qp_list);
2188 	spin_lock_init(&dev->reset_flow_resource_lock);
2189 
2190 	err = create_dev_resources(&dev->devr);
2191 	if (err)
2192 		goto err_disable_roce;
2193 
2195 	err = mlx5_ib_alloc_q_counters(dev);
2196 	if (err)
2197 		goto err_odp;
2198 
2199 	err = ib_register_device(&dev->ib_dev, NULL);
2200 	if (err)
2201 		goto err_q_cnt;
2202 
2203 	err = create_umr_res(dev);
2204 	if (err)
2205 		goto err_dev;
2206 
2207 	if (MLX5_CAP_GEN(dev->mdev, port_type) ==
2208 	    MLX5_CAP_PORT_TYPE_IB) {
2209 		if (init_dc_improvements(dev))
2210 			mlx5_ib_dbg(dev, "init_dc_improvements - continuing\n");
2211 	}
2212 
2213 	err = create_port_attrs(dev);
2214 	if (err)
2215 		goto err_dc;
2216 
2217 	for (i = 0; i < ARRAY_SIZE(mlx5_class_attributes); i++) {
2218 		err = device_create_file(&dev->ib_dev.dev,
2219 					 mlx5_class_attributes[i]);
2220 		if (err)
2221 			goto err_port_attrs;
2222 	}
2223 
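	/*
	 * Start one kernel thread per port running mlx5_ib_roce_port_update();
	 * the threads run until mlx5_ib_remove() sets port_gone.
	 */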
2224 	if (1) {
2225 		struct thread *rl_thread = NULL;
2226 		struct proc *rl_proc = NULL;
2227 
2228 		for (i = 0; i < MLX5_CAP_GEN(mdev, num_ports); i++) {
2229 			(void) kproc_kthread_add(mlx5_ib_roce_port_update, dev->port + i, &rl_proc, &rl_thread,
2230 			    RFHIGHPID, 0, "mlx5-ib-roce-port", "mlx5-ib-roce_port-%d", i);
2231 		}
2232 	}
2233 
2234 	dev->ib_active = true;
2235 
2236 	return dev;
2237 
2238 err_port_attrs:
2239 	destroy_ports_attrs(dev, dev->num_ports);
2240 
2241 err_dc:
2242 	if (MLX5_CAP_GEN(dev->mdev, port_type) ==
2243 	    MLX5_CAP_PORT_TYPE_IB)
2244 		cleanup_dc_improvements(dev);
2245 	destroy_umrc_res(dev);
2246 
2247 err_dev:
2248 	ib_unregister_device(&dev->ib_dev);
2249 
2250 err_q_cnt:
2251 	mlx5_ib_dealloc_q_counters(dev);
2252 
2253 err_odp:
2254 	destroy_dev_resources(&dev->devr);
2255 
2256 err_disable_roce:
2257 	if (mlx5_ib_port_link_layer(&dev->ib_dev, 1) ==
2258 	    IB_LINK_LAYER_ETHERNET && MLX5_CAP_GEN(mdev, roce))
2259 		mlx5_nic_vport_disable_roce(mdev);
2260 err_free_port:
2261 	kfree(dev->port);
2262 
2263 err_dealloc:
2264 	ib_dealloc_device((struct ib_device *)dev);
2265 
2266 	return NULL;
2267 }
2268 
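/*
 * mlx5_core detach callback: signal each per-port update thread to exit
 * via port_gone and wait for it to acknowledge, then tear down everything
 * mlx5_ib_add() created, in reverse order.
 */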
2269 static void mlx5_ib_remove(struct mlx5_core_dev *mdev, void *context)
2270 {
2271 	struct mlx5_ib_dev *dev = context;
2272 	int i;
2273 
2274 	for (i = 0; i < MLX5_CAP_GEN(mdev, num_ports); i++) {
2275 		dev->port[i].port_gone = 1;
2276 		while (dev->port[i].port_gone != 2)
2277 			pause("W", hz);
2278 	}
2279 
2280 	for (i = 0; i < ARRAY_SIZE(mlx5_class_attributes); i++) {
2281 		device_remove_file(&dev->ib_dev.dev,
2282 		    mlx5_class_attributes[i]);
2283 	}
2284 
2285 	destroy_ports_attrs(dev, dev->num_ports);
2286 	if (MLX5_CAP_GEN(dev->mdev, port_type) ==
2287 	    MLX5_CAP_PORT_TYPE_IB)
2288 		cleanup_dc_improvements(dev);
2289 	mlx5_ib_dealloc_q_counters(dev);
2290 	ib_unregister_device(&dev->ib_dev);
2291 	destroy_umrc_res(dev);
2292 	destroy_dev_resources(&dev->devr);
2293 
2294 	if (mlx5_ib_port_link_layer(&dev->ib_dev, 1) ==
2295 	    IB_LINK_LAYER_ETHERNET && MLX5_CAP_GEN(mdev, roce))
2296 		mlx5_nic_vport_disable_roce(mdev);
2297 
2298 	kfree(dev->port);
2299 	ib_dealloc_device(&dev->ib_dev);
2300 }
2301 
2302 static struct mlx5_interface mlx5_ib_interface = {
2303 	.add            = mlx5_ib_add,
2304 	.remove         = mlx5_ib_remove,
2305 	.event          = mlx5_ib_event,
2306 	.protocol	= MLX5_INTERFACE_PROTOCOL_IB,
2307 };
2308 
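/*
 * Module load: warn about the deprecated prof_sel parameter, register the
 * IB interface with mlx5_core, and create the driver's work queue.
 */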
2309 static int __init mlx5_ib_init(void)
2310 {
2311 	int err;
2312 
2313 	if (deprecated_prof_sel != 2)
2314 		printf("mlx5_ib: WARN: ""prof_sel is deprecated for mlx5_ib, set it for mlx5_core\n");
2315 
2316 	err = mlx5_register_interface(&mlx5_ib_interface);
2317 	if (err)
2318 		goto clean_odp;
2319 
2320 	mlx5_ib_wq = create_singlethread_workqueue("mlx5_ib_wq");
2321 	if (!mlx5_ib_wq) {
2322 		printf("mlx5_ib: ERR: ""%s: failed to create mlx5_ib_wq\n", __func__);
		err = -ENOMEM;
2323 		goto err_unreg;
2324 	}
2325 
2326 	return err;
2327 
2328 err_unreg:
2329 	mlx5_unregister_interface(&mlx5_ib_interface);
2330 
2331 clean_odp:
2332 	return err;
2333 }
2334 
2335 static void __exit mlx5_ib_cleanup(void)
2336 {
2337 	destroy_workqueue(mlx5_ib_wq);
2338 	mlx5_unregister_interface(&mlx5_ib_interface);
2339 }
2340 
2341 module_init_order(mlx5_ib_init, SI_ORDER_THIRD);
2342 module_exit_order(mlx5_ib_cleanup, SI_ORDER_THIRD);
2343