xref: /linux/drivers/infiniband/hw/mlx5/main.c (revision 17ade5366345656e1a7f4e9da16863a7499da21b)
1  // SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
2  /*
3   * Copyright (c) 2013-2020, Mellanox Technologies inc. All rights reserved.
4   * Copyright (c) 2020, Intel Corporation. All rights reserved.
5   */
6  
7  #include <linux/debugfs.h>
8  #include <linux/highmem.h>
9  #include <linux/module.h>
10  #include <linux/init.h>
11  #include <linux/errno.h>
12  #include <linux/pci.h>
13  #include <linux/dma-mapping.h>
14  #include <linux/slab.h>
15  #include <linux/bitmap.h>
16  #include <linux/sched.h>
17  #include <linux/sched/mm.h>
18  #include <linux/sched/task.h>
19  #include <linux/delay.h>
20  #include <rdma/ib_user_verbs.h>
21  #include <rdma/ib_addr.h>
22  #include <rdma/ib_cache.h>
23  #include <linux/mlx5/port.h>
24  #include <linux/mlx5/vport.h>
25  #include <linux/mlx5/fs.h>
26  #include <linux/mlx5/eswitch.h>
27  #include <linux/mlx5/driver.h>
28  #include <linux/list.h>
29  #include <rdma/ib_smi.h>
30  #include <rdma/ib_umem_odp.h>
31  #include <rdma/lag.h>
32  #include <linux/in.h>
33  #include <linux/etherdevice.h>
34  #include "mlx5_ib.h"
35  #include "ib_rep.h"
36  #include "cmd.h"
37  #include "devx.h"
38  #include "dm.h"
39  #include "fs.h"
40  #include "srq.h"
41  #include "qp.h"
42  #include "wr.h"
43  #include "restrack.h"
44  #include "counters.h"
45  #include "umr.h"
46  #include <rdma/uverbs_std_types.h>
47  #include <rdma/uverbs_ioctl.h>
48  #include <rdma/mlx5_user_ioctl_verbs.h>
49  #include <rdma/mlx5_user_ioctl_cmds.h>
50  #include <rdma/ib_ucaps.h>
51  #include "macsec.h"
52  #include "data_direct.h"
53  
54  #define UVERBS_MODULE_NAME mlx5_ib
55  #include <rdma/uverbs_named_ioctl.h>
56  
57  MODULE_AUTHOR("Eli Cohen <eli@mellanox.com>");
58  MODULE_DESCRIPTION("Mellanox 5th generation network adapters (ConnectX series) IB driver");
59  MODULE_LICENSE("Dual BSD/GPL");
60  
61  struct mlx5_ib_event_work {
62  	struct work_struct	work;
63  	union {
64  		struct mlx5_ib_dev	      *dev;
65  		struct mlx5_ib_multiport_info *mpi;
66  	};
67  	bool			is_slave;
68  	unsigned int		event;
69  	void			*param;
70  };
71  
72  enum {
73  	MLX5_ATOMIC_SIZE_QP_8BYTES = 1 << 3,
74  };
75  
76  static struct workqueue_struct *mlx5_ib_event_wq;
77  static LIST_HEAD(mlx5_ib_unaffiliated_port_list);
78  static LIST_HEAD(mlx5_ib_dev_list);
79  /*
80   * This mutex should be held when accessing either of the above lists
81   */
82  static DEFINE_MUTEX(mlx5_ib_multiport_mutex);
83  
84  struct mlx5_ib_dev *mlx5_ib_get_ibdev_from_mpi(struct mlx5_ib_multiport_info *mpi)
85  {
86  	struct mlx5_ib_dev *dev;
87  
88  	mutex_lock(&mlx5_ib_multiport_mutex);
89  	dev = mpi->ibdev;
90  	mutex_unlock(&mlx5_ib_multiport_mutex);
91  	return dev;
92  }
93  
94  static enum rdma_link_layer
95  mlx5_port_type_cap_to_rdma_ll(int port_type_cap)
96  {
97  	switch (port_type_cap) {
98  	case MLX5_CAP_PORT_TYPE_IB:
99  		return IB_LINK_LAYER_INFINIBAND;
100  	case MLX5_CAP_PORT_TYPE_ETH:
101  		return IB_LINK_LAYER_ETHERNET;
102  	default:
103  		return IB_LINK_LAYER_UNSPECIFIED;
104  	}
105  }
106  
107  static enum rdma_link_layer
108  mlx5_ib_port_link_layer(struct ib_device *device, u32 port_num)
109  {
110  	struct mlx5_ib_dev *dev = to_mdev(device);
111  	int port_type_cap = MLX5_CAP_GEN(dev->mdev, port_type);
112  
113  	return mlx5_port_type_cap_to_rdma_ll(port_type_cap);
114  }
115  
116  static int get_port_state(struct ib_device *ibdev,
117  			  u32 port_num,
118  			  enum ib_port_state *state)
119  {
120  	struct ib_port_attr attr;
121  	int ret;
122  
123  	memset(&attr, 0, sizeof(attr));
124  	ret = ibdev->ops.query_port(ibdev, port_num, &attr);
125  	if (!ret)
126  		*state = attr.state;
127  	return ret;
128  }
129  
130  static struct mlx5_roce *mlx5_get_rep_roce(struct mlx5_ib_dev *dev,
131  					   struct net_device *ndev,
132  					   struct net_device *upper,
133  					   u32 *port_num)
134  {
135  	struct net_device *rep_ndev;
136  	struct mlx5_ib_port *port;
137  	int i;
138  
139  	for (i = 0; i < dev->num_ports; i++) {
140  		port  = &dev->port[i];
141  		if (!port->rep)
142  			continue;
143  
144  		if (upper == ndev && port->rep->vport == MLX5_VPORT_UPLINK) {
145  			*port_num = i + 1;
146  			return &port->roce;
147  		}
148  
149  		if (upper && port->rep->vport == MLX5_VPORT_UPLINK)
150  			continue;
151  		rep_ndev = ib_device_get_netdev(&dev->ib_dev, i + 1);
152  		if (rep_ndev && rep_ndev == ndev) {
153  			dev_put(rep_ndev);
154  			*port_num = i + 1;
155  			return &port->roce;
156  		}
157  
158  		dev_put(rep_ndev);
159  	}
160  
161  	return NULL;
162  }
163  
164  static bool mlx5_netdev_send_event(struct mlx5_ib_dev *dev,
165  				   struct net_device *ndev,
166  				   struct net_device *upper,
167  				   struct net_device *ib_ndev)
168  {
169  	if (!dev->ib_active)
170  		return false;
171  
172  	/* Event is about our upper device */
173  	if (upper == ndev)
174  		return true;
175  
176  	/* RDMA device is not in lag and not in switchdev */
177  	if (!dev->is_rep && !upper && ndev == ib_ndev)
178  		return true;
179  
180  	/* RDMA device is in switchdev */
181  	if (dev->is_rep && ndev == ib_ndev)
182  		return true;
183  
184  	return false;
185  }
186  
187  static struct net_device *mlx5_ib_get_rep_uplink_netdev(struct mlx5_ib_dev *ibdev)
188  {
189  	struct mlx5_ib_port *port;
190  	int i;
191  
192  	for (i = 0; i < ibdev->num_ports; i++) {
193  		port = &ibdev->port[i];
194  		if (port->rep && port->rep->vport == MLX5_VPORT_UPLINK) {
195  			return ib_device_get_netdev(&ibdev->ib_dev, i + 1);
196  		}
197  	}
198  
199  	return NULL;
200  }
201  
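/*
 * Netdev notifier handler: keeps the IB port's netdev binding in sync on
 * NETDEV_REGISTER/UNREGISTER and, on link/carrier changes, dispatches
 * IB_EVENT_PORT_ACTIVE or IB_EVENT_PORT_ERR once the derived port state
 * changes.
 */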
202  static int mlx5_netdev_event(struct notifier_block *this,
203  			     unsigned long event, void *ptr)
204  {
205  	struct mlx5_roce *roce = container_of(this, struct mlx5_roce, nb);
206  	struct net_device *ndev = netdev_notifier_info_to_dev(ptr);
207  	u32 port_num = roce->native_port_num;
208  	struct net_device *ib_ndev = NULL;
209  	struct mlx5_core_dev *mdev;
210  	struct mlx5_ib_dev *ibdev;
211  
212  	ibdev = roce->dev;
213  	mdev = mlx5_ib_get_native_port_mdev(ibdev, port_num, NULL);
214  	if (!mdev)
215  		return NOTIFY_DONE;
216  
217  	switch (event) {
218  	case NETDEV_REGISTER:
219  		/* Should already be registered during the load */
220  		if (ibdev->is_rep)
221  			break;
222  
223  		ib_ndev = ib_device_get_netdev(&ibdev->ib_dev, port_num);
224  		/* Exit if already registered */
225  		if (ib_ndev)
226  			goto put_ndev;
227  
228  		if (ndev->dev.parent == mdev->device)
229  			ib_device_set_netdev(&ibdev->ib_dev, ndev, port_num);
230  		break;
231  
232  	case NETDEV_UNREGISTER:
233  		/* In case of reps, ib device goes away before the netdevs */
234  		if (ibdev->is_rep)
235  			break;
236  		ib_ndev = ib_device_get_netdev(&ibdev->ib_dev, port_num);
237  		if (ib_ndev == ndev)
238  			ib_device_set_netdev(&ibdev->ib_dev, NULL, port_num);
239  		goto put_ndev;
240  
241  	case NETDEV_CHANGE:
242  	case NETDEV_UP:
243  	case NETDEV_DOWN: {
244  		struct net_device *upper = NULL;
245  
246  		if (!netif_is_lag_master(ndev) && !netif_is_lag_port(ndev) &&
247  		    !mlx5_core_mp_enabled(mdev))
248  			return NOTIFY_DONE;
249  
250  		if (mlx5_lag_is_roce(mdev) || mlx5_lag_is_sriov(mdev)) {
251  			struct net_device *lag_ndev;
252  
253  			if (mlx5_lag_is_roce(mdev))
254  				lag_ndev = ib_device_get_netdev(&ibdev->ib_dev, 1);
255  			else /* sriov lag */
256  				lag_ndev = mlx5_ib_get_rep_uplink_netdev(ibdev);
257  
258  			if (lag_ndev) {
259  				upper = netdev_master_upper_dev_get(lag_ndev);
260  				dev_put(lag_ndev);
261  			} else {
262  				goto done;
263  			}
264  		}
265  
266  		if (ibdev->is_rep)
267  			roce = mlx5_get_rep_roce(ibdev, ndev, upper, &port_num);
268  		if (!roce)
269  			return NOTIFY_DONE;
270  
271  		ib_ndev = ib_device_get_netdev(&ibdev->ib_dev, port_num);
272  
273  		if (mlx5_netdev_send_event(ibdev, ndev, upper, ib_ndev)) {
274  			struct ib_event ibev = { };
275  			enum ib_port_state port_state;
276  
277  			if (get_port_state(&ibdev->ib_dev, port_num,
278  					   &port_state))
279  				goto put_ndev;
280  
281  			if (roce->last_port_state == port_state)
282  				goto put_ndev;
283  
284  			roce->last_port_state = port_state;
285  			ibev.device = &ibdev->ib_dev;
286  			if (port_state == IB_PORT_DOWN)
287  				ibev.event = IB_EVENT_PORT_ERR;
288  			else if (port_state == IB_PORT_ACTIVE)
289  				ibev.event = IB_EVENT_PORT_ACTIVE;
290  			else
291  				goto put_ndev;
292  
293  			ibev.element.port_num = port_num;
294  			ib_dispatch_event(&ibev);
295  		}
296  		break;
297  	}
298  
299  	default:
300  		break;
301  	}
302  put_ndev:
303  	dev_put(ib_ndev);
304  done:
305  	mlx5_ib_put_native_port_mdev(ibdev, port_num);
306  	return NOTIFY_DONE;
307  }
308  
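/*
 * Resolve the mlx5_core_dev backing @ib_port_num. For multiport RoCE the
 * affiliated (non-master) mpi->mdev is refcounted, so callers pair a
 * successful lookup with mlx5_ib_put_native_port_mdev().
 */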
309  struct mlx5_core_dev *mlx5_ib_get_native_port_mdev(struct mlx5_ib_dev *ibdev,
310  						   u32 ib_port_num,
311  						   u32 *native_port_num)
312  {
313  	enum rdma_link_layer ll = mlx5_ib_port_link_layer(&ibdev->ib_dev,
314  							  ib_port_num);
315  	struct mlx5_core_dev *mdev = NULL;
316  	struct mlx5_ib_multiport_info *mpi;
317  	struct mlx5_ib_port *port;
318  
319  	if (ibdev->ib_dev.type == RDMA_DEVICE_TYPE_SMI) {
320  		if (native_port_num)
321  			*native_port_num = smi_to_native_portnum(ibdev,
322  								 ib_port_num);
323  		return ibdev->mdev;
324  
325  	}
326  
327  	if (!mlx5_core_mp_enabled(ibdev->mdev) ||
328  	    ll != IB_LINK_LAYER_ETHERNET) {
329  		if (native_port_num)
330  			*native_port_num = ib_port_num;
331  		return ibdev->mdev;
332  	}
333  
334  	if (native_port_num)
335  		*native_port_num = 1;
336  
337  	port = &ibdev->port[ib_port_num - 1];
338  	spin_lock(&port->mp.mpi_lock);
339  	mpi = ibdev->port[ib_port_num - 1].mp.mpi;
340  	if (mpi && !mpi->unaffiliate) {
341  		mdev = mpi->mdev;
342  		/* If it's the master, no need to refcount; it'll exist
343  		 * as long as the ib_dev exists.
344  		 */
345  		if (!mpi->is_master)
346  			mpi->mdev_refcnt++;
347  	}
348  	spin_unlock(&port->mp.mpi_lock);
349  
350  	return mdev;
351  }
352  
353  void mlx5_ib_put_native_port_mdev(struct mlx5_ib_dev *ibdev, u32 port_num)
354  {
355  	enum rdma_link_layer ll = mlx5_ib_port_link_layer(&ibdev->ib_dev,
356  							  port_num);
357  	struct mlx5_ib_multiport_info *mpi;
358  	struct mlx5_ib_port *port;
359  
360  	if (!mlx5_core_mp_enabled(ibdev->mdev) || ll != IB_LINK_LAYER_ETHERNET)
361  		return;
362  
363  	port = &ibdev->port[port_num - 1];
364  
365  	spin_lock(&port->mp.mpi_lock);
366  	mpi = ibdev->port[port_num - 1].mp.mpi;
367  	if (mpi->is_master)
368  		goto out;
369  
370  	mpi->mdev_refcnt--;
371  	if (mpi->unaffiliate)
372  		complete(&mpi->unref_comp);
373  out:
374  	spin_unlock(&port->mp.mpi_lock);
375  }
376  
377  static int translate_eth_legacy_proto_oper(u32 eth_proto_oper,
378  					   u16 *active_speed, u8 *active_width)
379  {
380  	switch (eth_proto_oper) {
381  	case MLX5E_PROT_MASK(MLX5E_1000BASE_CX_SGMII):
382  	case MLX5E_PROT_MASK(MLX5E_1000BASE_KX):
383  	case MLX5E_PROT_MASK(MLX5E_100BASE_TX):
384  	case MLX5E_PROT_MASK(MLX5E_1000BASE_T):
385  		*active_width = IB_WIDTH_1X;
386  		*active_speed = IB_SPEED_SDR;
387  		break;
388  	case MLX5E_PROT_MASK(MLX5E_10GBASE_T):
389  	case MLX5E_PROT_MASK(MLX5E_10GBASE_CX4):
390  	case MLX5E_PROT_MASK(MLX5E_10GBASE_KX4):
391  	case MLX5E_PROT_MASK(MLX5E_10GBASE_KR):
392  	case MLX5E_PROT_MASK(MLX5E_10GBASE_CR):
393  	case MLX5E_PROT_MASK(MLX5E_10GBASE_SR):
394  	case MLX5E_PROT_MASK(MLX5E_10GBASE_ER):
395  		*active_width = IB_WIDTH_1X;
396  		*active_speed = IB_SPEED_QDR;
397  		break;
398  	case MLX5E_PROT_MASK(MLX5E_25GBASE_CR):
399  	case MLX5E_PROT_MASK(MLX5E_25GBASE_KR):
400  	case MLX5E_PROT_MASK(MLX5E_25GBASE_SR):
401  		*active_width = IB_WIDTH_1X;
402  		*active_speed = IB_SPEED_EDR;
403  		break;
404  	case MLX5E_PROT_MASK(MLX5E_40GBASE_CR4):
405  	case MLX5E_PROT_MASK(MLX5E_40GBASE_KR4):
406  	case MLX5E_PROT_MASK(MLX5E_40GBASE_SR4):
407  	case MLX5E_PROT_MASK(MLX5E_40GBASE_LR4):
408  		*active_width = IB_WIDTH_4X;
409  		*active_speed = IB_SPEED_QDR;
410  		break;
411  	case MLX5E_PROT_MASK(MLX5E_50GBASE_CR2):
412  	case MLX5E_PROT_MASK(MLX5E_50GBASE_KR2):
413  	case MLX5E_PROT_MASK(MLX5E_50GBASE_SR2):
414  		*active_width = IB_WIDTH_1X;
415  		*active_speed = IB_SPEED_HDR;
416  		break;
417  	case MLX5E_PROT_MASK(MLX5E_56GBASE_R4):
418  		*active_width = IB_WIDTH_4X;
419  		*active_speed = IB_SPEED_FDR;
420  		break;
421  	case MLX5E_PROT_MASK(MLX5E_100GBASE_CR4):
422  	case MLX5E_PROT_MASK(MLX5E_100GBASE_SR4):
423  	case MLX5E_PROT_MASK(MLX5E_100GBASE_KR4):
424  	case MLX5E_PROT_MASK(MLX5E_100GBASE_LR4):
425  		*active_width = IB_WIDTH_4X;
426  		*active_speed = IB_SPEED_EDR;
427  		break;
428  	default:
429  		return -EINVAL;
430  	}
431  
432  	return 0;
433  }
434  
435  static int translate_eth_ext_proto_oper(u32 eth_proto_oper, u16 *active_speed,
436  					u8 *active_width)
437  {
438  	switch (eth_proto_oper) {
439  	case MLX5E_PROT_MASK(MLX5E_SGMII_100M):
440  	case MLX5E_PROT_MASK(MLX5E_1000BASE_X_SGMII):
441  		*active_width = IB_WIDTH_1X;
442  		*active_speed = IB_SPEED_SDR;
443  		break;
444  	case MLX5E_PROT_MASK(MLX5E_5GBASE_R):
445  		*active_width = IB_WIDTH_1X;
446  		*active_speed = IB_SPEED_DDR;
447  		break;
448  	case MLX5E_PROT_MASK(MLX5E_10GBASE_XFI_XAUI_1):
449  		*active_width = IB_WIDTH_1X;
450  		*active_speed = IB_SPEED_QDR;
451  		break;
452  	case MLX5E_PROT_MASK(MLX5E_40GBASE_XLAUI_4_XLPPI_4):
453  		*active_width = IB_WIDTH_4X;
454  		*active_speed = IB_SPEED_QDR;
455  		break;
456  	case MLX5E_PROT_MASK(MLX5E_25GAUI_1_25GBASE_CR_KR):
457  		*active_width = IB_WIDTH_1X;
458  		*active_speed = IB_SPEED_EDR;
459  		break;
460  	case MLX5E_PROT_MASK(MLX5E_50GAUI_2_LAUI_2_50GBASE_CR2_KR2):
461  		*active_width = IB_WIDTH_2X;
462  		*active_speed = IB_SPEED_EDR;
463  		break;
464  	case MLX5E_PROT_MASK(MLX5E_50GAUI_1_LAUI_1_50GBASE_CR_KR):
465  		*active_width = IB_WIDTH_1X;
466  		*active_speed = IB_SPEED_HDR;
467  		break;
468  	case MLX5E_PROT_MASK(MLX5E_CAUI_4_100GBASE_CR4_KR4):
469  		*active_width = IB_WIDTH_4X;
470  		*active_speed = IB_SPEED_EDR;
471  		break;
472  	case MLX5E_PROT_MASK(MLX5E_100GAUI_2_100GBASE_CR2_KR2):
473  		*active_width = IB_WIDTH_2X;
474  		*active_speed = IB_SPEED_HDR;
475  		break;
476  	case MLX5E_PROT_MASK(MLX5E_100GAUI_1_100GBASE_CR_KR):
477  		*active_width = IB_WIDTH_1X;
478  		*active_speed = IB_SPEED_NDR;
479  		break;
480  	case MLX5E_PROT_MASK(MLX5E_200GAUI_4_200GBASE_CR4_KR4):
481  		*active_width = IB_WIDTH_4X;
482  		*active_speed = IB_SPEED_HDR;
483  		break;
484  	case MLX5E_PROT_MASK(MLX5E_200GAUI_2_200GBASE_CR2_KR2):
485  		*active_width = IB_WIDTH_2X;
486  		*active_speed = IB_SPEED_NDR;
487  		break;
488  	case MLX5E_PROT_MASK(MLX5E_400GAUI_8_400GBASE_CR8):
489  		*active_width = IB_WIDTH_8X;
490  		*active_speed = IB_SPEED_HDR;
491  		break;
492  	case MLX5E_PROT_MASK(MLX5E_400GAUI_4_400GBASE_CR4_KR4):
493  		*active_width = IB_WIDTH_4X;
494  		*active_speed = IB_SPEED_NDR;
495  		break;
496  	case MLX5E_PROT_MASK(MLX5E_800GAUI_8_800GBASE_CR8_KR8):
497  		*active_width = IB_WIDTH_8X;
498  		*active_speed = IB_SPEED_NDR;
499  		break;
500  	default:
501  		return -EINVAL;
502  	}
503  
504  	return 0;
505  }
506  
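/*
 * Map a PTYS eth_proto_oper bit onto the closest IB speed/width pair;
 * 'ext' selects between the legacy and extended protocol namespaces.
 */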
507  static int translate_eth_proto_oper(u32 eth_proto_oper, u16 *active_speed,
508  				    u8 *active_width, bool ext)
509  {
510  	return ext ?
511  		translate_eth_ext_proto_oper(eth_proto_oper, active_speed,
512  					     active_width) :
513  		translate_eth_legacy_proto_oper(eth_proto_oper, active_speed,
514  						active_width);
515  }
516  
517  static int mlx5_query_port_roce(struct ib_device *device, u32 port_num,
518  				struct ib_port_attr *props)
519  {
520  	struct mlx5_ib_dev *dev = to_mdev(device);
521  	u32 out[MLX5_ST_SZ_DW(ptys_reg)] = {0};
522  	struct mlx5_core_dev *mdev;
523  	struct net_device *ndev, *upper;
524  	enum ib_mtu ndev_ib_mtu;
525  	bool put_mdev = true;
526  	u32 eth_prot_oper;
527  	u32 mdev_port_num;
528  	bool ext;
529  	int err;
530  
531  	mdev = mlx5_ib_get_native_port_mdev(dev, port_num, &mdev_port_num);
532  	if (!mdev) {
533  		/* This means the port isn't affiliated yet. Get the
534  		 * info for the master port instead.
535  		 */
536  		put_mdev = false;
537  		mdev = dev->mdev;
538  		mdev_port_num = 1;
539  		port_num = 1;
540  	}
541  
542  	/* Possible bad flows are checked before filling out props, so in
543  	 * case of an error props will still be zeroed out.
544  	 * Use the native port in case of reps.
545  	 */
546  	if (dev->is_rep)
547  		err = mlx5_query_port_ptys(mdev, out, sizeof(out), MLX5_PTYS_EN,
548  					   1, 0);
549  	else
550  		err = mlx5_query_port_ptys(mdev, out, sizeof(out), MLX5_PTYS_EN,
551  					   mdev_port_num, 0);
552  	if (err)
553  		goto out;
554  	ext = !!MLX5_GET_ETH_PROTO(ptys_reg, out, true, eth_proto_capability);
555  	eth_prot_oper = MLX5_GET_ETH_PROTO(ptys_reg, out, ext, eth_proto_oper);
556  
557  	props->active_width     = IB_WIDTH_4X;
558  	props->active_speed     = IB_SPEED_QDR;
559  
560  	translate_eth_proto_oper(eth_prot_oper, &props->active_speed,
561  				 &props->active_width, ext);
562  
563  	if (!dev->is_rep && dev->mdev->roce.roce_en) {
564  		u16 qkey_viol_cntr;
565  
566  		props->port_cap_flags |= IB_PORT_CM_SUP;
567  		props->ip_gids = true;
568  		props->gid_tbl_len = MLX5_CAP_ROCE(dev->mdev,
569  						   roce_address_table_size);
570  		mlx5_query_nic_vport_qkey_viol_cntr(mdev, &qkey_viol_cntr);
571  		props->qkey_viol_cntr = qkey_viol_cntr;
572  	}
573  	props->max_mtu          = IB_MTU_4096;
574  	props->max_msg_sz       = 1 << MLX5_CAP_GEN(dev->mdev, log_max_msg);
575  	props->pkey_tbl_len     = 1;
576  	props->state            = IB_PORT_DOWN;
577  	props->phys_state       = IB_PORT_PHYS_STATE_DISABLED;
578  
579  	/* If this is a stub query for an unaffiliated port stop here */
580  	if (!put_mdev)
581  		goto out;
582  
583  	ndev = ib_device_get_netdev(device, port_num);
584  	if (!ndev)
585  		goto out;
586  
587  	if (mlx5_lag_is_roce(mdev) || mlx5_lag_is_sriov(mdev)) {
588  		rcu_read_lock();
589  		upper = netdev_master_upper_dev_get_rcu(ndev);
590  		if (upper) {
591  			dev_put(ndev);
592  			ndev = upper;
593  			dev_hold(ndev);
594  		}
595  		rcu_read_unlock();
596  	}
597  
598  	if (netif_running(ndev) && netif_carrier_ok(ndev)) {
599  		props->state      = IB_PORT_ACTIVE;
600  		props->phys_state = IB_PORT_PHYS_STATE_LINK_UP;
601  	}
602  
603  	ndev_ib_mtu = iboe_get_mtu(ndev->mtu);
604  
605  	dev_put(ndev);
606  
607  	props->active_mtu	= min(props->max_mtu, ndev_ib_mtu);
608  out:
609  	if (put_mdev)
610  		mlx5_ib_put_native_port_mdev(dev, port_num);
611  	return err;
612  }
613  
614  int set_roce_addr(struct mlx5_ib_dev *dev, u32 port_num,
615  		  unsigned int index, const union ib_gid *gid,
616  		  const struct ib_gid_attr *attr)
617  {
618  	enum ib_gid_type gid_type;
619  	u16 vlan_id = 0xffff;
620  	u8 roce_version = 0;
621  	u8 roce_l3_type = 0;
622  	u8 mac[ETH_ALEN];
623  	int ret;
624  
625  	gid_type = attr->gid_type;
626  	if (gid) {
627  		ret = rdma_read_gid_l2_fields(attr, &vlan_id, &mac[0]);
628  		if (ret)
629  			return ret;
630  	}
631  
632  	switch (gid_type) {
633  	case IB_GID_TYPE_ROCE:
634  		roce_version = MLX5_ROCE_VERSION_1;
635  		break;
636  	case IB_GID_TYPE_ROCE_UDP_ENCAP:
637  		roce_version = MLX5_ROCE_VERSION_2;
638  		if (gid && ipv6_addr_v4mapped((void *)gid))
639  			roce_l3_type = MLX5_ROCE_L3_TYPE_IPV4;
640  		else
641  			roce_l3_type = MLX5_ROCE_L3_TYPE_IPV6;
642  		break;
643  
644  	default:
645  		mlx5_ib_warn(dev, "Unexpected GID type %u\n", gid_type);
646  	}
647  
648  	return mlx5_core_roce_gid_set(dev->mdev, index, roce_version,
649  				      roce_l3_type, gid->raw, mac,
650  				      vlan_id < VLAN_CFI_MASK, vlan_id,
651  				      port_num);
652  }
653  
654  static int mlx5_ib_add_gid(const struct ib_gid_attr *attr,
655  			   __always_unused void **context)
656  {
657  	int ret;
658  
659  	ret = mlx5r_add_gid_macsec_operations(attr);
660  	if (ret)
661  		return ret;
662  
663  	return set_roce_addr(to_mdev(attr->device), attr->port_num,
664  			     attr->index, &attr->gid, attr);
665  }
666  
667  static int mlx5_ib_del_gid(const struct ib_gid_attr *attr,
668  			   __always_unused void **context)
669  {
670  	int ret;
671  
672  	ret = set_roce_addr(to_mdev(attr->device), attr->port_num,
673  			    attr->index, NULL, attr);
674  	if (ret)
675  		return ret;
676  
677  	mlx5r_del_gid_macsec_operations(attr);
678  	return 0;
679  }
680  
681  __be16 mlx5_get_roce_udp_sport_min(const struct mlx5_ib_dev *dev,
682  				   const struct ib_gid_attr *attr)
683  {
684  	if (attr->gid_type != IB_GID_TYPE_ROCE_UDP_ENCAP)
685  		return 0;
686  
687  	return cpu_to_be16(MLX5_CAP_ROCE(dev->mdev, r_roce_min_src_udp_port));
688  }
689  
690  static int mlx5_use_mad_ifc(struct mlx5_ib_dev *dev)
691  {
692  	if (MLX5_CAP_GEN(dev->mdev, port_type) == MLX5_CAP_PORT_TYPE_IB)
693  		return !MLX5_CAP_GEN(dev->mdev, ib_virt);
694  	return 0;
695  }
696  
697  enum {
698  	MLX5_VPORT_ACCESS_METHOD_MAD,
699  	MLX5_VPORT_ACCESS_METHOD_HCA,
700  	MLX5_VPORT_ACCESS_METHOD_NIC,
701  };
702  
703  static int mlx5_get_vport_access_method(struct ib_device *ibdev)
704  {
705  	if (mlx5_use_mad_ifc(to_mdev(ibdev)))
706  		return MLX5_VPORT_ACCESS_METHOD_MAD;
707  
708  	if (mlx5_ib_port_link_layer(ibdev, 1) ==
709  	    IB_LINK_LAYER_ETHERNET)
710  		return MLX5_VPORT_ACCESS_METHOD_NIC;
711  
712  	return MLX5_VPORT_ACCESS_METHOD_HCA;
713  }
714  
715  static void get_atomic_caps(struct mlx5_ib_dev *dev,
716  			    u8 atomic_size_qp,
717  			    struct ib_device_attr *props)
718  {
719  	u8 tmp;
720  	u8 atomic_operations = MLX5_CAP_ATOMIC(dev->mdev, atomic_operations);
721  	u8 atomic_req_8B_endianness_mode =
722  		MLX5_CAP_ATOMIC(dev->mdev, atomic_req_8B_endianness_mode);
723  
724  	/* Check if HW supports 8-byte standard atomic operations and is
725  	 * capable of responding in host endianness
726  	 */
727  	tmp = MLX5_ATOMIC_OPS_CMP_SWAP | MLX5_ATOMIC_OPS_FETCH_ADD;
728  	if (((atomic_operations & tmp) == tmp) &&
729  	    (atomic_size_qp & MLX5_ATOMIC_SIZE_QP_8BYTES) &&
730  	    (atomic_req_8B_endianness_mode)) {
731  		props->atomic_cap = IB_ATOMIC_HCA;
732  	} else {
733  		props->atomic_cap = IB_ATOMIC_NONE;
734  	}
735  }
736  
737  static void get_atomic_caps_qp(struct mlx5_ib_dev *dev,
738  			       struct ib_device_attr *props)
739  {
740  	u8 atomic_size_qp = MLX5_CAP_ATOMIC(dev->mdev, atomic_size_qp);
741  
742  	get_atomic_caps(dev, atomic_size_qp, props);
743  }
744  
745  static int mlx5_query_system_image_guid(struct ib_device *ibdev,
746  					__be64 *sys_image_guid)
747  {
748  	struct mlx5_ib_dev *dev = to_mdev(ibdev);
749  	struct mlx5_core_dev *mdev = dev->mdev;
750  	u64 tmp;
751  	int err;
752  
753  	switch (mlx5_get_vport_access_method(ibdev)) {
754  	case MLX5_VPORT_ACCESS_METHOD_MAD:
755  		return mlx5_query_mad_ifc_system_image_guid(ibdev,
756  							    sys_image_guid);
757  
758  	case MLX5_VPORT_ACCESS_METHOD_HCA:
759  		err = mlx5_query_hca_vport_system_image_guid(mdev, &tmp);
760  		break;
761  
762  	case MLX5_VPORT_ACCESS_METHOD_NIC:
763  		err = mlx5_query_nic_vport_system_image_guid(mdev, &tmp);
764  		break;
765  
766  	default:
767  		return -EINVAL;
768  	}
769  
770  	if (!err)
771  		*sys_image_guid = cpu_to_be64(tmp);
772  
773  	return err;
774  
775  }
776  
777  static int mlx5_query_max_pkeys(struct ib_device *ibdev,
778  				u16 *max_pkeys)
779  {
780  	struct mlx5_ib_dev *dev = to_mdev(ibdev);
781  	struct mlx5_core_dev *mdev = dev->mdev;
782  
783  	switch (mlx5_get_vport_access_method(ibdev)) {
784  	case MLX5_VPORT_ACCESS_METHOD_MAD:
785  		return mlx5_query_mad_ifc_max_pkeys(ibdev, max_pkeys);
786  
787  	case MLX5_VPORT_ACCESS_METHOD_HCA:
788  	case MLX5_VPORT_ACCESS_METHOD_NIC:
789  		*max_pkeys = mlx5_to_sw_pkey_sz(MLX5_CAP_GEN(mdev,
790  						pkey_table_size));
791  		return 0;
792  
793  	default:
794  		return -EINVAL;
795  	}
796  }
797  
798  static int mlx5_query_vendor_id(struct ib_device *ibdev,
799  				u32 *vendor_id)
800  {
801  	struct mlx5_ib_dev *dev = to_mdev(ibdev);
802  
803  	switch (mlx5_get_vport_access_method(ibdev)) {
804  	case MLX5_VPORT_ACCESS_METHOD_MAD:
805  		return mlx5_query_mad_ifc_vendor_id(ibdev, vendor_id);
806  
807  	case MLX5_VPORT_ACCESS_METHOD_HCA:
808  	case MLX5_VPORT_ACCESS_METHOD_NIC:
809  		return mlx5_core_query_vendor_id(dev->mdev, vendor_id);
810  
811  	default:
812  		return -EINVAL;
813  	}
814  }
815  
816  static int mlx5_query_node_guid(struct mlx5_ib_dev *dev,
817  				__be64 *node_guid)
818  {
819  	u64 tmp;
820  	int err;
821  
822  	switch (mlx5_get_vport_access_method(&dev->ib_dev)) {
823  	case MLX5_VPORT_ACCESS_METHOD_MAD:
824  		return mlx5_query_mad_ifc_node_guid(dev, node_guid);
825  
826  	case MLX5_VPORT_ACCESS_METHOD_HCA:
827  		err = mlx5_query_hca_vport_node_guid(dev->mdev, &tmp);
828  		break;
829  
830  	case MLX5_VPORT_ACCESS_METHOD_NIC:
831  		err = mlx5_query_nic_vport_node_guid(dev->mdev, &tmp);
832  		break;
833  
834  	default:
835  		return -EINVAL;
836  	}
837  
838  	if (!err)
839  		*node_guid = cpu_to_be64(tmp);
840  
841  	return err;
842  }
843  
844  struct mlx5_reg_node_desc {
845  	u8	desc[IB_DEVICE_NODE_DESC_MAX];
846  };
847  
848  static int mlx5_query_node_desc(struct mlx5_ib_dev *dev, char *node_desc)
849  {
850  	struct mlx5_reg_node_desc in;
851  
852  	if (mlx5_use_mad_ifc(dev))
853  		return mlx5_query_mad_ifc_node_desc(dev, node_desc);
854  
855  	memset(&in, 0, sizeof(in));
856  
857  	return mlx5_core_access_reg(dev->mdev, &in, sizeof(in), node_desc,
858  				    sizeof(struct mlx5_reg_node_desc),
859  				    MLX5_REG_NODE_DESC, 0, 0);
860  }
861  
862  static void fill_esw_mgr_reg_c0(struct mlx5_core_dev *mdev,
863  				struct mlx5_ib_query_device_resp *resp)
864  {
865  	struct mlx5_eswitch *esw = mdev->priv.eswitch;
866  	u16 vport = mlx5_eswitch_manager_vport(mdev);
867  
868  	resp->reg_c0.value = mlx5_eswitch_get_vport_metadata_for_match(esw,
869  								      vport);
870  	resp->reg_c0.mask = mlx5_eswitch_get_vport_metadata_mask();
871  }
872  
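/*
 * Fill struct ib_device_attr from HCA capabilities. When invoked through
 * uverbs (uhw provided), optional response fields are appended only if the
 * user buffer can hold them, tracked via resp.response_length.
 */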
873  static int mlx5_ib_query_device(struct ib_device *ibdev,
874  				struct ib_device_attr *props,
875  				struct ib_udata *uhw)
876  {
877  	size_t uhw_outlen = (uhw) ? uhw->outlen : 0;
878  	struct mlx5_ib_dev *dev = to_mdev(ibdev);
879  	struct mlx5_core_dev *mdev = dev->mdev;
880  	int err = -ENOMEM;
881  	int max_sq_desc;
882  	int max_rq_sg;
883  	int max_sq_sg;
884  	u64 min_page_size = 1ull << MLX5_CAP_GEN(mdev, log_pg_sz);
885  	bool raw_support = !mlx5_core_mp_enabled(mdev);
886  	struct mlx5_ib_query_device_resp resp = {};
887  	size_t resp_len;
888  	u64 max_tso;
889  
890  	resp_len = sizeof(resp.comp_mask) + sizeof(resp.response_length);
891  	if (uhw_outlen && uhw_outlen < resp_len)
892  		return -EINVAL;
893  
894  	resp.response_length = resp_len;
895  
896  	if (uhw && uhw->inlen && !ib_is_udata_cleared(uhw, 0, uhw->inlen))
897  		return -EINVAL;
898  
899  	memset(props, 0, sizeof(*props));
900  	err = mlx5_query_system_image_guid(ibdev,
901  					   &props->sys_image_guid);
902  	if (err)
903  		return err;
904  
905  	props->max_pkeys = dev->pkey_table_len;
906  
907  	err = mlx5_query_vendor_id(ibdev, &props->vendor_id);
908  	if (err)
909  		return err;
910  
911  	props->fw_ver = ((u64)fw_rev_maj(dev->mdev) << 32) |
912  		(fw_rev_min(dev->mdev) << 16) |
913  		fw_rev_sub(dev->mdev);
914  	props->device_cap_flags    = IB_DEVICE_CHANGE_PHY_PORT |
915  		IB_DEVICE_PORT_ACTIVE_EVENT		|
916  		IB_DEVICE_SYS_IMAGE_GUID		|
917  		IB_DEVICE_RC_RNR_NAK_GEN;
918  
919  	if (MLX5_CAP_GEN(mdev, pkv))
920  		props->device_cap_flags |= IB_DEVICE_BAD_PKEY_CNTR;
921  	if (MLX5_CAP_GEN(mdev, qkv))
922  		props->device_cap_flags |= IB_DEVICE_BAD_QKEY_CNTR;
923  	if (MLX5_CAP_GEN(mdev, apm))
924  		props->device_cap_flags |= IB_DEVICE_AUTO_PATH_MIG;
925  	if (MLX5_CAP_GEN(mdev, xrc))
926  		props->device_cap_flags |= IB_DEVICE_XRC;
927  	if (MLX5_CAP_GEN(mdev, imaicl)) {
928  		props->device_cap_flags |= IB_DEVICE_MEM_WINDOW |
929  					   IB_DEVICE_MEM_WINDOW_TYPE_2B;
930  		props->max_mw = 1 << MLX5_CAP_GEN(mdev, log_max_mkey);
931  		/* We support 'Gappy' memory registration too */
932  		props->kernel_cap_flags |= IBK_SG_GAPS_REG;
933  	}
934  	/* IB_WR_REG_MR always requires changing the entity size with UMR */
935  	if (!MLX5_CAP_GEN(dev->mdev, umr_modify_entity_size_disabled))
936  		props->device_cap_flags |= IB_DEVICE_MEM_MGT_EXTENSIONS;
937  	if (MLX5_CAP_GEN(mdev, sho)) {
938  		props->kernel_cap_flags |= IBK_INTEGRITY_HANDOVER;
939  		/* At this stage no support for signature handover */
940  		props->sig_prot_cap = IB_PROT_T10DIF_TYPE_1 |
941  				      IB_PROT_T10DIF_TYPE_2 |
942  				      IB_PROT_T10DIF_TYPE_3;
943  		props->sig_guard_cap = IB_GUARD_T10DIF_CRC |
944  				       IB_GUARD_T10DIF_CSUM;
945  	}
946  	if (MLX5_CAP_GEN(mdev, block_lb_mc))
947  		props->kernel_cap_flags |= IBK_BLOCK_MULTICAST_LOOPBACK;
948  
949  	if (MLX5_CAP_GEN(dev->mdev, eth_net_offloads) && raw_support) {
950  		if (MLX5_CAP_ETH(mdev, csum_cap)) {
951  			/* Legacy bit to support old userspace libraries */
952  			props->device_cap_flags |= IB_DEVICE_RAW_IP_CSUM;
953  			props->raw_packet_caps |= IB_RAW_PACKET_CAP_IP_CSUM;
954  		}
955  
956  		if (MLX5_CAP_ETH(dev->mdev, vlan_cap))
957  			props->raw_packet_caps |=
958  				IB_RAW_PACKET_CAP_CVLAN_STRIPPING;
959  
960  		if (offsetofend(typeof(resp), tso_caps) <= uhw_outlen) {
961  			max_tso = MLX5_CAP_ETH(mdev, max_lso_cap);
962  			if (max_tso) {
963  				resp.tso_caps.max_tso = 1 << max_tso;
964  				resp.tso_caps.supported_qpts |=
965  					1 << IB_QPT_RAW_PACKET;
966  				resp.response_length += sizeof(resp.tso_caps);
967  			}
968  		}
969  
970  		if (offsetofend(typeof(resp), rss_caps) <= uhw_outlen) {
971  			resp.rss_caps.rx_hash_function =
972  						MLX5_RX_HASH_FUNC_TOEPLITZ;
973  			resp.rss_caps.rx_hash_fields_mask =
974  						MLX5_RX_HASH_SRC_IPV4 |
975  						MLX5_RX_HASH_DST_IPV4 |
976  						MLX5_RX_HASH_SRC_IPV6 |
977  						MLX5_RX_HASH_DST_IPV6 |
978  						MLX5_RX_HASH_SRC_PORT_TCP |
979  						MLX5_RX_HASH_DST_PORT_TCP |
980  						MLX5_RX_HASH_SRC_PORT_UDP |
981  						MLX5_RX_HASH_DST_PORT_UDP |
982  						MLX5_RX_HASH_INNER;
983  			resp.response_length += sizeof(resp.rss_caps);
984  		}
985  	} else {
986  		if (offsetofend(typeof(resp), tso_caps) <= uhw_outlen)
987  			resp.response_length += sizeof(resp.tso_caps);
988  		if (offsetofend(typeof(resp), rss_caps) <= uhw_outlen)
989  			resp.response_length += sizeof(resp.rss_caps);
990  	}
991  
992  	if (MLX5_CAP_GEN(mdev, ipoib_basic_offloads)) {
993  		props->device_cap_flags |= IB_DEVICE_UD_IP_CSUM;
994  		props->kernel_cap_flags |= IBK_UD_TSO;
995  	}
996  
997  	if (MLX5_CAP_GEN(dev->mdev, rq_delay_drop) &&
998  	    MLX5_CAP_GEN(dev->mdev, general_notification_event) &&
999  	    raw_support)
1000  		props->raw_packet_caps |= IB_RAW_PACKET_CAP_DELAY_DROP;
1001  
1002  	if (MLX5_CAP_GEN(mdev, ipoib_enhanced_offloads) &&
1003  	    MLX5_CAP_IPOIB_ENHANCED(mdev, csum_cap))
1004  		props->device_cap_flags |= IB_DEVICE_UD_IP_CSUM;
1005  
1006  	if (MLX5_CAP_GEN(dev->mdev, eth_net_offloads) &&
1007  	    MLX5_CAP_ETH(dev->mdev, scatter_fcs) &&
1008  	    raw_support) {
1009  		/* Legacy bit to support old userspace libraries */
1010  		props->device_cap_flags |= IB_DEVICE_RAW_SCATTER_FCS;
1011  		props->raw_packet_caps |= IB_RAW_PACKET_CAP_SCATTER_FCS;
1012  	}
1013  
1014  	if (MLX5_CAP_DEV_MEM(mdev, memic)) {
1015  		props->max_dm_size =
1016  			MLX5_CAP_DEV_MEM(mdev, max_memic_size);
1017  	}
1018  
1019  	if (mlx5_get_flow_namespace(dev->mdev, MLX5_FLOW_NAMESPACE_BYPASS))
1020  		props->device_cap_flags |= IB_DEVICE_MANAGED_FLOW_STEERING;
1021  
1022  	if (MLX5_CAP_GEN(mdev, end_pad))
1023  		props->device_cap_flags |= IB_DEVICE_PCI_WRITE_END_PADDING;
1024  
1025  	props->vendor_part_id	   = mdev->pdev->device;
1026  	props->hw_ver		   = mdev->pdev->revision;
1027  
1028  	props->max_mr_size	   = ~0ull;
1029  	props->page_size_cap	   = ~(min_page_size - 1);
1030  	props->max_qp		   = 1 << MLX5_CAP_GEN(mdev, log_max_qp);
1031  	props->max_qp_wr	   = 1 << MLX5_CAP_GEN(mdev, log_max_qp_sz);
1032  	max_rq_sg =  MLX5_CAP_GEN(mdev, max_wqe_sz_rq) /
1033  		     sizeof(struct mlx5_wqe_data_seg);
1034  	max_sq_desc = min_t(int, MLX5_CAP_GEN(mdev, max_wqe_sz_sq), 512);
1035  	max_sq_sg = (max_sq_desc - sizeof(struct mlx5_wqe_ctrl_seg) -
1036  		     sizeof(struct mlx5_wqe_raddr_seg)) /
1037  		sizeof(struct mlx5_wqe_data_seg);
1038  	props->max_send_sge = max_sq_sg;
1039  	props->max_recv_sge = max_rq_sg;
1040  	props->max_sge_rd	   = MLX5_MAX_SGE_RD;
1041  	props->max_cq		   = 1 << MLX5_CAP_GEN(mdev, log_max_cq);
1042  	props->max_cqe = (1 << MLX5_CAP_GEN(mdev, log_max_cq_sz)) - 1;
1043  	props->max_mr		   = 1 << MLX5_CAP_GEN(mdev, log_max_mkey);
1044  	props->max_pd		   = 1 << MLX5_CAP_GEN(mdev, log_max_pd);
1045  	props->max_qp_rd_atom	   = 1 << MLX5_CAP_GEN(mdev, log_max_ra_req_qp);
1046  	props->max_qp_init_rd_atom = 1 << MLX5_CAP_GEN(mdev, log_max_ra_res_qp);
1047  	props->max_srq		   = 1 << MLX5_CAP_GEN(mdev, log_max_srq);
1048  	props->max_srq_wr = (1 << MLX5_CAP_GEN(mdev, log_max_srq_sz)) - 1;
1049  	props->local_ca_ack_delay  = MLX5_CAP_GEN(mdev, local_ca_ack_delay);
1050  	props->max_res_rd_atom	   = props->max_qp_rd_atom * props->max_qp;
1051  	props->max_srq_sge	   = max_rq_sg - 1;
1052  	props->max_fast_reg_page_list_len =
1053  		1 << MLX5_CAP_GEN(mdev, log_max_klm_list_size);
1054  	props->max_pi_fast_reg_page_list_len =
1055  		props->max_fast_reg_page_list_len / 2;
1056  	props->max_sgl_rd =
1057  		MLX5_CAP_GEN(mdev, max_sgl_for_optimized_performance);
1058  	get_atomic_caps_qp(dev, props);
1059  	props->masked_atomic_cap   = IB_ATOMIC_NONE;
1060  	props->max_mcast_grp	   = 1 << MLX5_CAP_GEN(mdev, log_max_mcg);
1061  	props->max_mcast_qp_attach = MLX5_CAP_GEN(mdev, max_qp_mcg);
1062  	props->max_total_mcast_qp_attach = props->max_mcast_qp_attach *
1063  					   props->max_mcast_grp;
1064  	props->max_ah = INT_MAX;
1065  	props->hca_core_clock = MLX5_CAP_GEN(mdev, device_frequency_khz);
1066  	props->timestamp_mask = 0x7FFFFFFFFFFFFFFFULL;
1067  
1068  	if (IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING)) {
1069  		if (dev->odp_caps.general_caps & IB_ODP_SUPPORT)
1070  			props->kernel_cap_flags |= IBK_ON_DEMAND_PAGING;
1071  		props->odp_caps = dev->odp_caps;
1072  		if (!uhw) {
1073  			/* ODP for kernel QPs is not implemented for receive
1074  			 * WQEs and SRQ WQEs
1075  			 */
1076  			props->odp_caps.per_transport_caps.rc_odp_caps &=
1077  				~(IB_ODP_SUPPORT_READ |
1078  				  IB_ODP_SUPPORT_SRQ_RECV);
1079  			props->odp_caps.per_transport_caps.uc_odp_caps &=
1080  				~(IB_ODP_SUPPORT_READ |
1081  				  IB_ODP_SUPPORT_SRQ_RECV);
1082  			props->odp_caps.per_transport_caps.ud_odp_caps &=
1083  				~(IB_ODP_SUPPORT_READ |
1084  				  IB_ODP_SUPPORT_SRQ_RECV);
1085  			props->odp_caps.per_transport_caps.xrc_odp_caps &=
1086  				~(IB_ODP_SUPPORT_READ |
1087  				  IB_ODP_SUPPORT_SRQ_RECV);
1088  		}
1089  	}
1090  
1091  	if (mlx5_core_is_vf(mdev))
1092  		props->kernel_cap_flags |= IBK_VIRTUAL_FUNCTION;
1093  
1094  	if (mlx5_ib_port_link_layer(ibdev, 1) ==
1095  	    IB_LINK_LAYER_ETHERNET && raw_support) {
1096  		props->rss_caps.max_rwq_indirection_tables =
1097  			1 << MLX5_CAP_GEN(dev->mdev, log_max_rqt);
1098  		props->rss_caps.max_rwq_indirection_table_size =
1099  			1 << MLX5_CAP_GEN(dev->mdev, log_max_rqt_size);
1100  		props->rss_caps.supported_qpts = 1 << IB_QPT_RAW_PACKET;
1101  		props->max_wq_type_rq =
1102  			1 << MLX5_CAP_GEN(dev->mdev, log_max_rq);
1103  	}
1104  
1105  	if (MLX5_CAP_GEN(mdev, tag_matching)) {
1106  		props->tm_caps.max_num_tags =
1107  			(1 << MLX5_CAP_GEN(mdev, log_tag_matching_list_sz)) - 1;
1108  		props->tm_caps.max_ops =
1109  			1 << MLX5_CAP_GEN(mdev, log_max_qp_sz);
1110  		props->tm_caps.max_sge = MLX5_TM_MAX_SGE;
1111  	}
1112  
1113  	if (MLX5_CAP_GEN(mdev, tag_matching) &&
1114  	    MLX5_CAP_GEN(mdev, rndv_offload_rc)) {
1115  		props->tm_caps.flags = IB_TM_CAP_RNDV_RC;
1116  		props->tm_caps.max_rndv_hdr_size = MLX5_TM_MAX_RNDV_MSG_SIZE;
1117  	}
1118  
1119  	if (MLX5_CAP_GEN(dev->mdev, cq_moderation)) {
1120  		props->cq_caps.max_cq_moderation_count =
1121  						MLX5_MAX_CQ_COUNT;
1122  		props->cq_caps.max_cq_moderation_period =
1123  						MLX5_MAX_CQ_PERIOD;
1124  	}
1125  
1126  	if (offsetofend(typeof(resp), cqe_comp_caps) <= uhw_outlen) {
1127  		resp.response_length += sizeof(resp.cqe_comp_caps);
1128  
1129  		if (MLX5_CAP_GEN(dev->mdev, cqe_compression)) {
1130  			resp.cqe_comp_caps.max_num =
1131  				MLX5_CAP_GEN(dev->mdev,
1132  					     cqe_compression_max_num);
1133  
1134  			resp.cqe_comp_caps.supported_format =
1135  				MLX5_IB_CQE_RES_FORMAT_HASH |
1136  				MLX5_IB_CQE_RES_FORMAT_CSUM;
1137  
1138  			if (MLX5_CAP_GEN(dev->mdev, mini_cqe_resp_stride_index))
1139  				resp.cqe_comp_caps.supported_format |=
1140  					MLX5_IB_CQE_RES_FORMAT_CSUM_STRIDX;
1141  		}
1142  	}
1143  
1144  	if (offsetofend(typeof(resp), packet_pacing_caps) <= uhw_outlen &&
1145  	    raw_support) {
1146  		if (MLX5_CAP_QOS(mdev, packet_pacing) &&
1147  		    MLX5_CAP_GEN(mdev, qos)) {
1148  			resp.packet_pacing_caps.qp_rate_limit_max =
1149  				MLX5_CAP_QOS(mdev, packet_pacing_max_rate);
1150  			resp.packet_pacing_caps.qp_rate_limit_min =
1151  				MLX5_CAP_QOS(mdev, packet_pacing_min_rate);
1152  			resp.packet_pacing_caps.supported_qpts |=
1153  				1 << IB_QPT_RAW_PACKET;
1154  			if (MLX5_CAP_QOS(mdev, packet_pacing_burst_bound) &&
1155  			    MLX5_CAP_QOS(mdev, packet_pacing_typical_size))
1156  				resp.packet_pacing_caps.cap_flags |=
1157  					MLX5_IB_PP_SUPPORT_BURST;
1158  		}
1159  		resp.response_length += sizeof(resp.packet_pacing_caps);
1160  	}
1161  
1162  	if (offsetofend(typeof(resp), mlx5_ib_support_multi_pkt_send_wqes) <=
1163  	    uhw_outlen) {
1164  		if (MLX5_CAP_ETH(mdev, multi_pkt_send_wqe))
1165  			resp.mlx5_ib_support_multi_pkt_send_wqes =
1166  				MLX5_IB_ALLOW_MPW;
1167  
1168  		if (MLX5_CAP_ETH(mdev, enhanced_multi_pkt_send_wqe))
1169  			resp.mlx5_ib_support_multi_pkt_send_wqes |=
1170  				MLX5_IB_SUPPORT_EMPW;
1171  
1172  		resp.response_length +=
1173  			sizeof(resp.mlx5_ib_support_multi_pkt_send_wqes);
1174  	}
1175  
1176  	if (offsetofend(typeof(resp), flags) <= uhw_outlen) {
1177  		resp.response_length += sizeof(resp.flags);
1178  
1179  		if (MLX5_CAP_GEN(mdev, cqe_compression_128))
1180  			resp.flags |=
1181  				MLX5_IB_QUERY_DEV_RESP_FLAGS_CQE_128B_COMP;
1182  
1183  		if (MLX5_CAP_GEN(mdev, cqe_128_always))
1184  			resp.flags |= MLX5_IB_QUERY_DEV_RESP_FLAGS_CQE_128B_PAD;
1185  		if (MLX5_CAP_GEN(mdev, qp_packet_based))
1186  			resp.flags |=
1187  				MLX5_IB_QUERY_DEV_RESP_PACKET_BASED_CREDIT_MODE;
1188  
1189  		resp.flags |= MLX5_IB_QUERY_DEV_RESP_FLAGS_SCAT2CQE_DCT;
1190  
1191  		if (MLX5_CAP_GEN_2(mdev, dp_ordering_force) &&
1192  		    (MLX5_CAP_GEN(mdev, dp_ordering_ooo_all_xrc) ||
1193  		    MLX5_CAP_GEN(mdev, dp_ordering_ooo_all_dc) ||
1194  		    MLX5_CAP_GEN(mdev, dp_ordering_ooo_all_rc) ||
1195  		    MLX5_CAP_GEN(mdev, dp_ordering_ooo_all_ud) ||
1196  		    MLX5_CAP_GEN(mdev, dp_ordering_ooo_all_uc)))
1197  			resp.flags |= MLX5_IB_QUERY_DEV_RESP_FLAGS_OOO_DP;
1198  	}
1199  
1200  	if (offsetofend(typeof(resp), sw_parsing_caps) <= uhw_outlen) {
1201  		resp.response_length += sizeof(resp.sw_parsing_caps);
1202  		if (MLX5_CAP_ETH(mdev, swp)) {
1203  			resp.sw_parsing_caps.sw_parsing_offloads |=
1204  				MLX5_IB_SW_PARSING;
1205  
1206  			if (MLX5_CAP_ETH(mdev, swp_csum))
1207  				resp.sw_parsing_caps.sw_parsing_offloads |=
1208  					MLX5_IB_SW_PARSING_CSUM;
1209  
1210  			if (MLX5_CAP_ETH(mdev, swp_lso))
1211  				resp.sw_parsing_caps.sw_parsing_offloads |=
1212  					MLX5_IB_SW_PARSING_LSO;
1213  
1214  			if (resp.sw_parsing_caps.sw_parsing_offloads)
1215  				resp.sw_parsing_caps.supported_qpts =
1216  					BIT(IB_QPT_RAW_PACKET);
1217  		}
1218  	}
1219  
1220  	if (offsetofend(typeof(resp), striding_rq_caps) <= uhw_outlen &&
1221  	    raw_support) {
1222  		resp.response_length += sizeof(resp.striding_rq_caps);
1223  		if (MLX5_CAP_GEN(mdev, striding_rq)) {
1224  			resp.striding_rq_caps.min_single_stride_log_num_of_bytes =
1225  				MLX5_MIN_SINGLE_STRIDE_LOG_NUM_BYTES;
1226  			resp.striding_rq_caps.max_single_stride_log_num_of_bytes =
1227  				MLX5_MAX_SINGLE_STRIDE_LOG_NUM_BYTES;
1228  			if (MLX5_CAP_GEN(dev->mdev, ext_stride_num_range))
1229  				resp.striding_rq_caps
1230  					.min_single_wqe_log_num_of_strides =
1231  					MLX5_EXT_MIN_SINGLE_WQE_LOG_NUM_STRIDES;
1232  			else
1233  				resp.striding_rq_caps
1234  					.min_single_wqe_log_num_of_strides =
1235  					MLX5_MIN_SINGLE_WQE_LOG_NUM_STRIDES;
1236  			resp.striding_rq_caps.max_single_wqe_log_num_of_strides =
1237  				MLX5_MAX_SINGLE_WQE_LOG_NUM_STRIDES;
1238  			resp.striding_rq_caps.supported_qpts =
1239  				BIT(IB_QPT_RAW_PACKET);
1240  		}
1241  	}
1242  
1243  	if (offsetofend(typeof(resp), tunnel_offloads_caps) <= uhw_outlen) {
1244  		resp.response_length += sizeof(resp.tunnel_offloads_caps);
1245  		if (MLX5_CAP_ETH(mdev, tunnel_stateless_vxlan))
1246  			resp.tunnel_offloads_caps |=
1247  				MLX5_IB_TUNNELED_OFFLOADS_VXLAN;
1248  		if (MLX5_CAP_ETH(mdev, tunnel_stateless_geneve_rx))
1249  			resp.tunnel_offloads_caps |=
1250  				MLX5_IB_TUNNELED_OFFLOADS_GENEVE;
1251  		if (MLX5_CAP_ETH(mdev, tunnel_stateless_gre))
1252  			resp.tunnel_offloads_caps |=
1253  				MLX5_IB_TUNNELED_OFFLOADS_GRE;
1254  		if (MLX5_CAP_ETH(mdev, tunnel_stateless_mpls_over_gre))
1255  			resp.tunnel_offloads_caps |=
1256  				MLX5_IB_TUNNELED_OFFLOADS_MPLS_GRE;
1257  		if (MLX5_CAP_ETH(mdev, tunnel_stateless_mpls_over_udp))
1258  			resp.tunnel_offloads_caps |=
1259  				MLX5_IB_TUNNELED_OFFLOADS_MPLS_UDP;
1260  	}
1261  
1262  	if (offsetofend(typeof(resp), dci_streams_caps) <= uhw_outlen) {
1263  		resp.response_length += sizeof(resp.dci_streams_caps);
1264  
1265  		resp.dci_streams_caps.max_log_num_concurent =
1266  			MLX5_CAP_GEN(mdev, log_max_dci_stream_channels);
1267  
1268  		resp.dci_streams_caps.max_log_num_errored =
1269  			MLX5_CAP_GEN(mdev, log_max_dci_errored_streams);
1270  	}
1271  
1272  	if (offsetofend(typeof(resp), reserved) <= uhw_outlen)
1273  		resp.response_length += sizeof(resp.reserved);
1274  
1275  	if (offsetofend(typeof(resp), reg_c0) <= uhw_outlen) {
1276  		struct mlx5_eswitch *esw = mdev->priv.eswitch;
1277  
1278  		resp.response_length += sizeof(resp.reg_c0);
1279  
1280  		if (mlx5_eswitch_mode(mdev) == MLX5_ESWITCH_OFFLOADS &&
1281  		    mlx5_eswitch_vport_match_metadata_enabled(esw))
1282  			fill_esw_mgr_reg_c0(mdev, &resp);
1283  	}
1284  
1285  	if (uhw_outlen) {
1286  		err = ib_copy_to_udata(uhw, &resp, resp.response_length);
1287  
1288  		if (err)
1289  			return err;
1290  	}
1291  
1292  	return 0;
1293  }
1294  
1295  static void translate_active_width(struct ib_device *ibdev, u16 active_width,
1296  				   u8 *ib_width)
1297  {
1298  	struct mlx5_ib_dev *dev = to_mdev(ibdev);
1299  
1300  	if (active_width & MLX5_PTYS_WIDTH_1X)
1301  		*ib_width = IB_WIDTH_1X;
1302  	else if (active_width & MLX5_PTYS_WIDTH_2X)
1303  		*ib_width = IB_WIDTH_2X;
1304  	else if (active_width & MLX5_PTYS_WIDTH_4X)
1305  		*ib_width = IB_WIDTH_4X;
1306  	else if (active_width & MLX5_PTYS_WIDTH_8X)
1307  		*ib_width = IB_WIDTH_8X;
1308  	else if (active_width & MLX5_PTYS_WIDTH_12X)
1309  		*ib_width = IB_WIDTH_12X;
1310  	else {
1311  		mlx5_ib_dbg(dev, "Invalid active_width %d, setting width to default value: 4x\n",
1312  			    active_width);
1313  		*ib_width = IB_WIDTH_4X;
1314  	}
1315  
1316  	return;
1317  }
1318  
1319  static int mlx5_mtu_to_ib_mtu(int mtu)
1320  {
1321  	switch (mtu) {
1322  	case 256: return 1;
1323  	case 512: return 2;
1324  	case 1024: return 3;
1325  	case 2048: return 4;
1326  	case 4096: return 5;
1327  	default:
1328  		pr_warn("invalid mtu\n");
1329  		return -1;
1330  	}
1331  }
1332  
1333  enum ib_max_vl_num {
1334  	__IB_MAX_VL_0		= 1,
1335  	__IB_MAX_VL_0_1		= 2,
1336  	__IB_MAX_VL_0_3		= 3,
1337  	__IB_MAX_VL_0_7		= 4,
1338  	__IB_MAX_VL_0_14	= 5,
1339  };
1340  
1341  enum mlx5_vl_hw_cap {
1342  	MLX5_VL_HW_0	= 1,
1343  	MLX5_VL_HW_0_1	= 2,
1344  	MLX5_VL_HW_0_2	= 3,
1345  	MLX5_VL_HW_0_3	= 4,
1346  	MLX5_VL_HW_0_4	= 5,
1347  	MLX5_VL_HW_0_5	= 6,
1348  	MLX5_VL_HW_0_6	= 7,
1349  	MLX5_VL_HW_0_7	= 8,
1350  	MLX5_VL_HW_0_14	= 15
1351  };
1352  
1353  static int translate_max_vl_num(struct ib_device *ibdev, u8 vl_hw_cap,
1354  				u8 *max_vl_num)
1355  {
1356  	switch (vl_hw_cap) {
1357  	case MLX5_VL_HW_0:
1358  		*max_vl_num = __IB_MAX_VL_0;
1359  		break;
1360  	case MLX5_VL_HW_0_1:
1361  		*max_vl_num = __IB_MAX_VL_0_1;
1362  		break;
1363  	case MLX5_VL_HW_0_3:
1364  		*max_vl_num = __IB_MAX_VL_0_3;
1365  		break;
1366  	case MLX5_VL_HW_0_7:
1367  		*max_vl_num = __IB_MAX_VL_0_7;
1368  		break;
1369  	case MLX5_VL_HW_0_14:
1370  		*max_vl_num = __IB_MAX_VL_0_14;
1371  		break;
1372  
1373  	default:
1374  		return -EINVAL;
1375  	}
1376  
1377  	return 0;
1378  }
1379  
1380  static int mlx5_query_hca_port(struct ib_device *ibdev, u32 port,
1381  			       struct ib_port_attr *props)
1382  {
1383  	struct mlx5_ib_dev *dev = to_mdev(ibdev);
1384  	struct mlx5_core_dev *mdev = dev->mdev;
1385  	struct mlx5_hca_vport_context *rep;
1386  	u8 vl_hw_cap, plane_index = 0;
1387  	u16 max_mtu;
1388  	u16 oper_mtu;
1389  	int err;
1390  	u16 ib_link_width_oper;
1391  
1392  	rep = kzalloc(sizeof(*rep), GFP_KERNEL);
1393  	if (!rep) {
1394  		err = -ENOMEM;
1395  		goto out;
1396  	}
1397  
1398  	/* props being zeroed by the caller, avoid zeroing it here */
1399  
1400  	if (ibdev->type == RDMA_DEVICE_TYPE_SMI) {
1401  		plane_index = port;
1402  		port = smi_to_native_portnum(dev, port);
1403  	}
1404  
1405  	err = mlx5_query_hca_vport_context(mdev, 0, port, 0, rep);
1406  	if (err)
1407  		goto out;
1408  
1409  	props->lid		= rep->lid;
1410  	props->lmc		= rep->lmc;
1411  	props->sm_lid		= rep->sm_lid;
1412  	props->sm_sl		= rep->sm_sl;
1413  	props->state		= rep->vport_state;
1414  	props->phys_state	= rep->port_physical_state;
1415  
1416  	props->port_cap_flags = rep->cap_mask1;
1417  	if (dev->num_plane) {
1418  		props->port_cap_flags |= IB_PORT_SM_DISABLED;
1419  		props->port_cap_flags &= ~IB_PORT_SM;
1420  	} else if (ibdev->type == RDMA_DEVICE_TYPE_SMI)
1421  		props->port_cap_flags &= ~IB_PORT_CM_SUP;
1422  
1423  	props->gid_tbl_len	= mlx5_get_gid_table_len(MLX5_CAP_GEN(mdev, gid_table_size));
1424  	props->max_msg_sz	= 1 << MLX5_CAP_GEN(mdev, log_max_msg);
1425  	props->pkey_tbl_len	= mlx5_to_sw_pkey_sz(MLX5_CAP_GEN(mdev, pkey_table_size));
1426  	props->bad_pkey_cntr	= rep->pkey_violation_counter;
1427  	props->qkey_viol_cntr	= rep->qkey_violation_counter;
1428  	props->subnet_timeout	= rep->subnet_timeout;
1429  	props->init_type_reply	= rep->init_type_reply;
1430  
1431  	if (props->port_cap_flags & IB_PORT_CAP_MASK2_SUP)
1432  		props->port_cap_flags2 = rep->cap_mask2;
1433  
1434  	err = mlx5_query_ib_port_oper(mdev, &ib_link_width_oper,
1435  				      &props->active_speed, port, plane_index);
1436  	if (err)
1437  		goto out;
1438  
1439  	translate_active_width(ibdev, ib_link_width_oper, &props->active_width);
1440  
1441  	mlx5_query_port_max_mtu(mdev, &max_mtu, port);
1442  
1443  	props->max_mtu = mlx5_mtu_to_ib_mtu(max_mtu);
1444  
1445  	mlx5_query_port_oper_mtu(mdev, &oper_mtu, port);
1446  
1447  	props->active_mtu = mlx5_mtu_to_ib_mtu(oper_mtu);
1448  
1449  	err = mlx5_query_port_vl_hw_cap(mdev, &vl_hw_cap, port);
1450  	if (err)
1451  		goto out;
1452  
1453  	err = translate_max_vl_num(ibdev, vl_hw_cap,
1454  				   &props->max_vl_num);
1455  out:
1456  	kfree(rep);
1457  	return err;
1458  }
1459  
1460  int mlx5_ib_query_port(struct ib_device *ibdev, u32 port,
1461  		       struct ib_port_attr *props)
1462  {
1463  	unsigned int count;
1464  	int ret;
1465  
1466  	switch (mlx5_get_vport_access_method(ibdev)) {
1467  	case MLX5_VPORT_ACCESS_METHOD_MAD:
1468  		ret = mlx5_query_mad_ifc_port(ibdev, port, props);
1469  		break;
1470  
1471  	case MLX5_VPORT_ACCESS_METHOD_HCA:
1472  		ret = mlx5_query_hca_port(ibdev, port, props);
1473  		break;
1474  
1475  	case MLX5_VPORT_ACCESS_METHOD_NIC:
1476  		ret = mlx5_query_port_roce(ibdev, port, props);
1477  		break;
1478  
1479  	default:
1480  		ret = -EINVAL;
1481  	}
1482  
1483  	if (!ret && props) {
1484  		struct mlx5_ib_dev *dev = to_mdev(ibdev);
1485  		struct mlx5_core_dev *mdev;
1486  		bool put_mdev = true;
1487  
1488  		mdev = mlx5_ib_get_native_port_mdev(dev, port, NULL);
1489  		if (!mdev) {
1490  			/* If the port isn't affiliated yet query the master.
1491  			 * The master and slave will have the same values.
1492  			 */
1493  			mdev = dev->mdev;
1494  			port = 1;
1495  			put_mdev = false;
1496  		}
1497  		count = mlx5_core_reserved_gids_count(mdev);
1498  		if (put_mdev)
1499  			mlx5_ib_put_native_port_mdev(dev, port);
1500  		props->gid_tbl_len -= count;
1501  	}
1502  	return ret;
1503  }
1504  
1505  static int mlx5_ib_rep_query_port(struct ib_device *ibdev, u32 port,
1506  				  struct ib_port_attr *props)
1507  {
1508  	return mlx5_query_port_roce(ibdev, port, props);
1509  }
1510  
1511  static int mlx5_ib_rep_query_pkey(struct ib_device *ibdev, u32 port, u16 index,
1512  				  u16 *pkey)
1513  {
1514  	/* Default special Pkey for representor device port as per the
1515  	 * IB specification 1.3 section 10.9.1.2.
1516  	 */
1517  	*pkey = 0xffff;
1518  	return 0;
1519  }
1520  
1521  static int mlx5_ib_query_gid(struct ib_device *ibdev, u32 port, int index,
1522  			     union ib_gid *gid)
1523  {
1524  	struct mlx5_ib_dev *dev = to_mdev(ibdev);
1525  	struct mlx5_core_dev *mdev = dev->mdev;
1526  
1527  	switch (mlx5_get_vport_access_method(ibdev)) {
1528  	case MLX5_VPORT_ACCESS_METHOD_MAD:
1529  		return mlx5_query_mad_ifc_gids(ibdev, port, index, gid);
1530  
1531  	case MLX5_VPORT_ACCESS_METHOD_HCA:
1532  		return mlx5_query_hca_vport_gid(mdev, 0, port, 0, index, gid);
1533  
1534  	default:
1535  		return -EINVAL;
1536  	}
1537  
1538  }
1539  
1540  static int mlx5_query_hca_nic_pkey(struct ib_device *ibdev, u32 port,
1541  				   u16 index, u16 *pkey)
1542  {
1543  	struct mlx5_ib_dev *dev = to_mdev(ibdev);
1544  	struct mlx5_core_dev *mdev;
1545  	bool put_mdev = true;
1546  	u32 mdev_port_num;
1547  	int err;
1548  
1549  	mdev = mlx5_ib_get_native_port_mdev(dev, port, &mdev_port_num);
1550  	if (!mdev) {
1551  		/* The port isn't affiliated yet, get the PKey from the master
1552  		 * port. For RoCE the PKey tables will be the same.
1553  		 */
1554  		put_mdev = false;
1555  		mdev = dev->mdev;
1556  		mdev_port_num = 1;
1557  	}
1558  
1559  	err = mlx5_query_hca_vport_pkey(mdev, 0, mdev_port_num, 0,
1560  					index, pkey);
1561  	if (put_mdev)
1562  		mlx5_ib_put_native_port_mdev(dev, port);
1563  
1564  	return err;
1565  }
1566  
1567  static int mlx5_ib_query_pkey(struct ib_device *ibdev, u32 port, u16 index,
1568  			      u16 *pkey)
1569  {
1570  	switch (mlx5_get_vport_access_method(ibdev)) {
1571  	case MLX5_VPORT_ACCESS_METHOD_MAD:
1572  		return mlx5_query_mad_ifc_pkey(ibdev, port, index, pkey);
1573  
1574  	case MLX5_VPORT_ACCESS_METHOD_HCA:
1575  	case MLX5_VPORT_ACCESS_METHOD_NIC:
1576  		return mlx5_query_hca_nic_pkey(ibdev, port, index, pkey);
1577  	default:
1578  		return -EINVAL;
1579  	}
1580  }
1581  
1582  static int mlx5_ib_modify_device(struct ib_device *ibdev, int mask,
1583  				 struct ib_device_modify *props)
1584  {
1585  	struct mlx5_ib_dev *dev = to_mdev(ibdev);
1586  	struct mlx5_reg_node_desc in;
1587  	struct mlx5_reg_node_desc out;
1588  	int err;
1589  
1590  	if (mask & ~IB_DEVICE_MODIFY_NODE_DESC)
1591  		return -EOPNOTSUPP;
1592  
1593  	if (!(mask & IB_DEVICE_MODIFY_NODE_DESC))
1594  		return 0;
1595  
1596  	/*
1597  	 * If possible, pass node desc to FW, so it can generate
1598  	 * a 144 trap.  If cmd fails, just ignore.
1599  	 */
1600  	memcpy(&in, props->node_desc, IB_DEVICE_NODE_DESC_MAX);
1601  	err = mlx5_core_access_reg(dev->mdev, &in, sizeof(in), &out,
1602  				   sizeof(out), MLX5_REG_NODE_DESC, 0, 1);
1603  	if (err)
1604  		return err;
1605  
1606  	memcpy(ibdev->node_desc, props->node_desc, IB_DEVICE_NODE_DESC_MAX);
1607  
1608  	return err;
1609  }
1610  
1611  static int set_port_caps_atomic(struct mlx5_ib_dev *dev, u32 port_num, u32 mask,
1612  				u32 value)
1613  {
1614  	struct mlx5_hca_vport_context ctx = {};
1615  	struct mlx5_core_dev *mdev;
1616  	u32 mdev_port_num;
1617  	int err;
1618  
1619  	mdev = mlx5_ib_get_native_port_mdev(dev, port_num, &mdev_port_num);
1620  	if (!mdev)
1621  		return -ENODEV;
1622  
1623  	err = mlx5_query_hca_vport_context(mdev, 0, mdev_port_num, 0, &ctx);
1624  	if (err)
1625  		goto out;
1626  
1627  	if (~ctx.cap_mask1_perm & mask) {
1628  		mlx5_ib_warn(dev, "trying to change bitmask 0x%X but change supported 0x%X\n",
1629  			     mask, ctx.cap_mask1_perm);
1630  		err = -EINVAL;
1631  		goto out;
1632  	}
1633  
1634  	ctx.cap_mask1 = value;
1635  	ctx.cap_mask1_perm = mask;
1636  	err = mlx5_core_modify_hca_vport_context(mdev, 0, mdev_port_num,
1637  						 0, &ctx);
1638  
1639  out:
1640  	mlx5_ib_put_native_port_mdev(dev, port_num);
1641  
1642  	return err;
1643  }
1644  
1645  static int mlx5_ib_modify_port(struct ib_device *ibdev, u32 port, int mask,
1646  			       struct ib_port_modify *props)
1647  {
1648  	struct mlx5_ib_dev *dev = to_mdev(ibdev);
1649  	struct ib_port_attr attr;
1650  	u32 tmp;
1651  	int err;
1652  	u32 change_mask;
1653  	u32 value;
1654  	bool is_ib = (mlx5_ib_port_link_layer(ibdev, port) ==
1655  		      IB_LINK_LAYER_INFINIBAND);
1656  
1657  	/* CM layer calls ib_modify_port() regardless of the link layer. For
1658  	 * Ethernet ports, qkey violation and Port capabilities are meaningless.
1659  	 */
1660  	if (!is_ib)
1661  		return 0;
1662  
1663  	if (MLX5_CAP_GEN(dev->mdev, ib_virt) && is_ib) {
1664  		change_mask = props->clr_port_cap_mask | props->set_port_cap_mask;
1665  		value = ~props->clr_port_cap_mask | props->set_port_cap_mask;
1666  		return set_port_caps_atomic(dev, port, change_mask, value);
1667  	}
1668  
1669  	mutex_lock(&dev->cap_mask_mutex);
1670  
1671  	err = ib_query_port(ibdev, port, &attr);
1672  	if (err)
1673  		goto out;
1674  
1675  	tmp = (attr.port_cap_flags | props->set_port_cap_mask) &
1676  		~props->clr_port_cap_mask;
1677  
1678  	err = mlx5_set_port_caps(dev->mdev, port, tmp);
1679  
1680  out:
1681  	mutex_unlock(&dev->cap_mask_mutex);
1682  	return err;
1683  }
1684  
1685  static void print_lib_caps(struct mlx5_ib_dev *dev, u64 caps)
1686  {
1687  	mlx5_ib_dbg(dev, "MLX5_LIB_CAP_4K_UAR = %s\n",
1688  		    caps & MLX5_LIB_CAP_4K_UAR ? "y" : "n");
1689  }
1690  
1691  static u16 calc_dynamic_bfregs(int uars_per_sys_page)
1692  {
1693  	/* Large page with non 4k uar support might limit the dynamic size */
1694  	if (uars_per_sys_page == 1  && PAGE_SIZE > 4096)
1695  		return MLX5_MIN_DYN_BFREGS;
1696  
1697  	return MLX5_MAX_DYN_BFREGS;
1698  }
1699  
1700  static int calc_total_bfregs(struct mlx5_ib_dev *dev, bool lib_uar_4k,
1701  			     struct mlx5_ib_alloc_ucontext_req_v2 *req,
1702  			     struct mlx5_bfreg_info *bfregi)
1703  {
1704  	int uars_per_sys_page;
1705  	int bfregs_per_sys_page;
1706  	int ref_bfregs = req->total_num_bfregs;
1707  
1708  	if (req->total_num_bfregs == 0)
1709  		return -EINVAL;
1710  
1711  	BUILD_BUG_ON(MLX5_MAX_BFREGS % MLX5_NON_FP_BFREGS_IN_PAGE);
1712  	BUILD_BUG_ON(MLX5_MAX_BFREGS < MLX5_NON_FP_BFREGS_IN_PAGE);
1713  
1714  	if (req->total_num_bfregs > MLX5_MAX_BFREGS)
1715  		return -ENOMEM;
1716  
1717  	uars_per_sys_page = get_uars_per_sys_page(dev, lib_uar_4k);
1718  	bfregs_per_sys_page = uars_per_sys_page * MLX5_NON_FP_BFREGS_PER_UAR;
1719  	/* This holds the static allocation requested by the user */
1720  	req->total_num_bfregs = ALIGN(req->total_num_bfregs, bfregs_per_sys_page);
1721  	if (req->num_low_latency_bfregs > req->total_num_bfregs - 1)
1722  		return -EINVAL;
1723  
1724  	bfregi->num_static_sys_pages = req->total_num_bfregs / bfregs_per_sys_page;
1725  	bfregi->num_dyn_bfregs = ALIGN(calc_dynamic_bfregs(uars_per_sys_page), bfregs_per_sys_page);
1726  	bfregi->total_num_bfregs = req->total_num_bfregs + bfregi->num_dyn_bfregs;
1727  	bfregi->num_sys_pages = bfregi->total_num_bfregs / bfregs_per_sys_page;
1728  
1729  	mlx5_ib_dbg(dev, "uar_4k: fw support %s, lib support %s, user requested %d bfregs, allocated %d, total bfregs %d, using %d sys pages\n",
1730  		    MLX5_CAP_GEN(dev->mdev, uar_4k) ? "yes" : "no",
1731  		    lib_uar_4k ? "yes" : "no", ref_bfregs,
1732  		    req->total_num_bfregs, bfregi->total_num_bfregs,
1733  		    bfregi->num_sys_pages);
1734  
1735  	return 0;
1736  }
1737  
1738  static int allocate_uars(struct mlx5_ib_dev *dev, struct mlx5_ib_ucontext *context)
1739  {
1740  	struct mlx5_bfreg_info *bfregi;
1741  	int err;
1742  	int i;
1743  
1744  	bfregi = &context->bfregi;
1745  	for (i = 0; i < bfregi->num_static_sys_pages; i++) {
1746  		err = mlx5_cmd_uar_alloc(dev->mdev, &bfregi->sys_pages[i],
1747  					 context->devx_uid);
1748  		if (err)
1749  			goto error;
1750  
1751  		mlx5_ib_dbg(dev, "allocated uar %d\n", bfregi->sys_pages[i]);
1752  	}
1753  
1754  	for (i = bfregi->num_static_sys_pages; i < bfregi->num_sys_pages; i++)
1755  		bfregi->sys_pages[i] = MLX5_IB_INVALID_UAR_INDEX;
1756  
1757  	return 0;
1758  
1759  error:
1760  	for (--i; i >= 0; i--)
1761  		if (mlx5_cmd_uar_dealloc(dev->mdev, bfregi->sys_pages[i],
1762  					 context->devx_uid))
1763  			mlx5_ib_warn(dev, "failed to free uar %d\n", i);
1764  
1765  	return err;
1766  }
1767  
1768  static void deallocate_uars(struct mlx5_ib_dev *dev,
1769  			    struct mlx5_ib_ucontext *context)
1770  {
1771  	struct mlx5_bfreg_info *bfregi;
1772  	int i;
1773  
1774  	bfregi = &context->bfregi;
1775  	for (i = 0; i < bfregi->num_sys_pages; i++)
1776  		if (i < bfregi->num_static_sys_pages ||
1777  		    bfregi->sys_pages[i] != MLX5_IB_INVALID_UAR_INDEX)
1778  			mlx5_cmd_uar_dealloc(dev->mdev, bfregi->sys_pages[i],
1779  					     context->devx_uid);
1780  }
1781  
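/*
 * Reference-count the users of NIC-level local loopback. Loopback is enabled
 * once the user transport-domain count reaches two or the first
 * loopback-dependent QP is created, presumably so that traffic between
 * separate contexts/objects can be looped back locally. dev->lb.mutex
 * protects the counters.
 */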
1782  int mlx5_ib_enable_lb(struct mlx5_ib_dev *dev, bool td, bool qp)
1783  {
1784  	int err = 0;
1785  
1786  	mutex_lock(&dev->lb.mutex);
1787  	if (td)
1788  		dev->lb.user_td++;
1789  	if (qp)
1790  		dev->lb.qps++;
1791  
1792  	if (dev->lb.user_td == 2 ||
1793  	    dev->lb.qps == 1) {
1794  		if (!dev->lb.enabled) {
1795  			err = mlx5_nic_vport_update_local_lb(dev->mdev, true);
1796  			dev->lb.enabled = true;
1797  		}
1798  	}
1799  
1800  	mutex_unlock(&dev->lb.mutex);
1801  
1802  	return err;
1803  }
1804  
1805  void mlx5_ib_disable_lb(struct mlx5_ib_dev *dev, bool td, bool qp)
1806  {
1807  	mutex_lock(&dev->lb.mutex);
1808  	if (td)
1809  		dev->lb.user_td--;
1810  	if (qp)
1811  		dev->lb.qps--;
1812  
1813  	if (dev->lb.user_td == 1 &&
1814  	    dev->lb.qps == 0) {
1815  		if (dev->lb.enabled) {
1816  			mlx5_nic_vport_update_local_lb(dev->mdev, false);
1817  			dev->lb.enabled = false;
1818  		}
1819  	}
1820  
1821  	mutex_unlock(&dev->lb.mutex);
1822  }
1823  
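/*
 * Allocate a transport domain for a user context. On Ethernet ports where the
 * firmware is able to disable local loopback (disable_local_lb_uc/mc
 * capabilities), each TD also takes a loopback reference via
 * mlx5_ib_enable_lb() so loopback stays enabled while several TDs coexist.
 */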
1824  static int mlx5_ib_alloc_transport_domain(struct mlx5_ib_dev *dev, u32 *tdn,
1825  					  u16 uid)
1826  {
1827  	int err;
1828  
1829  	if (!MLX5_CAP_GEN(dev->mdev, log_max_transport_domain))
1830  		return 0;
1831  
1832  	err = mlx5_cmd_alloc_transport_domain(dev->mdev, tdn, uid);
1833  	if (err)
1834  		return err;
1835  
1836  	if ((MLX5_CAP_GEN(dev->mdev, port_type) != MLX5_CAP_PORT_TYPE_ETH) ||
1837  	    (!MLX5_CAP_GEN(dev->mdev, disable_local_lb_uc) &&
1838  	     !MLX5_CAP_GEN(dev->mdev, disable_local_lb_mc)))
1839  		return err;
1840  
1841  	return mlx5_ib_enable_lb(dev, true, false);
1842  }
1843  
1844  static void mlx5_ib_dealloc_transport_domain(struct mlx5_ib_dev *dev, u32 tdn,
1845  					     u16 uid)
1846  {
1847  	if (!MLX5_CAP_GEN(dev->mdev, log_max_transport_domain))
1848  		return;
1849  
1850  	mlx5_cmd_dealloc_transport_domain(dev->mdev, tdn, uid);
1851  
1852  	if ((MLX5_CAP_GEN(dev->mdev, port_type) != MLX5_CAP_PORT_TYPE_ETH) ||
1853  	    (!MLX5_CAP_GEN(dev->mdev, disable_local_lb_uc) &&
1854  	     !MLX5_CAP_GEN(dev->mdev, disable_local_lb_mc)))
1855  		return;
1856  
1857  	mlx5_ib_disable_lb(dev, true, false);
1858  }
1859  
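/*
 * Fill the alloc_ucontext response from device capabilities: queue/WQE
 * limits, UAR/bfreg layout, CQE version, and the optional comp_mask features
 * (dump-fill mkey, core-clock offset, ECE, real-time timestamps, etc.) that
 * this kernel/device combination supports.
 */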
1860  static int set_ucontext_resp(struct ib_ucontext *uctx,
1861  			     struct mlx5_ib_alloc_ucontext_resp *resp)
1862  {
1863  	struct ib_device *ibdev = uctx->device;
1864  	struct mlx5_ib_dev *dev = to_mdev(ibdev);
1865  	struct mlx5_ib_ucontext *context = to_mucontext(uctx);
1866  	struct mlx5_bfreg_info *bfregi = &context->bfregi;
1867  
1868  	if (MLX5_CAP_GEN(dev->mdev, dump_fill_mkey)) {
1869  		resp->dump_fill_mkey = dev->mkeys.dump_fill_mkey;
1870  		resp->comp_mask |=
1871  			MLX5_IB_ALLOC_UCONTEXT_RESP_MASK_DUMP_FILL_MKEY;
1872  	}
1873  
1874  	resp->qp_tab_size = 1 << MLX5_CAP_GEN(dev->mdev, log_max_qp);
1875  	if (mlx5_wc_support_get(dev->mdev))
1876  		resp->bf_reg_size = 1 << MLX5_CAP_GEN(dev->mdev,
1877  						      log_bf_reg_size);
1878  	resp->cache_line_size = cache_line_size();
1879  	resp->max_sq_desc_sz = MLX5_CAP_GEN(dev->mdev, max_wqe_sz_sq);
1880  	resp->max_rq_desc_sz = MLX5_CAP_GEN(dev->mdev, max_wqe_sz_rq);
1881  	resp->max_send_wqebb = 1 << MLX5_CAP_GEN(dev->mdev, log_max_qp_sz);
1882  	resp->max_recv_wr = 1 << MLX5_CAP_GEN(dev->mdev, log_max_qp_sz);
1883  	resp->max_srq_recv_wr = 1 << MLX5_CAP_GEN(dev->mdev, log_max_srq_sz);
1884  	resp->cqe_version = context->cqe_version;
1885  	resp->log_uar_size = MLX5_CAP_GEN(dev->mdev, uar_4k) ?
1886  				MLX5_ADAPTER_PAGE_SHIFT : PAGE_SHIFT;
1887  	resp->num_uars_per_page = MLX5_CAP_GEN(dev->mdev, uar_4k) ?
1888  					MLX5_CAP_GEN(dev->mdev,
1889  						     num_of_uars_per_page) : 1;
1890  	resp->tot_bfregs = bfregi->lib_uar_dyn ? 0 :
1891  			bfregi->total_num_bfregs - bfregi->num_dyn_bfregs;
1892  	resp->num_ports = dev->num_ports;
1893  	resp->cmds_supp_uhw |= MLX5_USER_CMDS_SUPP_UHW_QUERY_DEVICE |
1894  				      MLX5_USER_CMDS_SUPP_UHW_CREATE_AH;
1895  
1896  	if (mlx5_ib_port_link_layer(ibdev, 1) == IB_LINK_LAYER_ETHERNET) {
1897  		mlx5_query_min_inline(dev->mdev, &resp->eth_min_inline);
1898  		resp->eth_min_inline++;
1899  	}
1900  
1901  	if (dev->mdev->clock_info)
1902  		resp->clock_info_versions = BIT(MLX5_IB_CLOCK_INFO_V1);
1903  
1904  	/*
1905  	 * We don't want to expose information from the PCI BAR that is located
1906  	 * beyond the first 4096 bytes, so if the architecture only supports
1907  	 * larger pages, pretend we don't support reading the HCA's core clock.
1908  	 * The same restriction is enforced in the mmap handler.
1909  	 */
1910  	if (PAGE_SIZE <= 4096) {
1911  		resp->comp_mask |=
1912  			MLX5_IB_ALLOC_UCONTEXT_RESP_MASK_CORE_CLOCK_OFFSET;
1913  		resp->hca_core_clock_offset =
1914  			offsetof(struct mlx5_init_seg,
1915  				 internal_timer_h) % PAGE_SIZE;
1916  	}
1917  
1918  	if (MLX5_CAP_GEN(dev->mdev, ece_support))
1919  		resp->comp_mask |= MLX5_IB_ALLOC_UCONTEXT_RESP_MASK_ECE;
1920  
1921  	if (rt_supported(MLX5_CAP_GEN(dev->mdev, sq_ts_format)) &&
1922  	    rt_supported(MLX5_CAP_GEN(dev->mdev, rq_ts_format)) &&
1923  	    rt_supported(MLX5_CAP_ROCE(dev->mdev, qp_ts_format)))
1924  		resp->comp_mask |=
1925  			MLX5_IB_ALLOC_UCONTEXT_RESP_MASK_REAL_TIME_TS;
1926  
1927  	resp->num_dyn_bfregs = bfregi->num_dyn_bfregs;
1928  
1929  	if (MLX5_CAP_GEN(dev->mdev, drain_sigerr))
1930  		resp->comp_mask |= MLX5_IB_ALLOC_UCONTEXT_RESP_MASK_SQD2RTS;
1931  
1932  	resp->comp_mask |=
1933  		MLX5_IB_ALLOC_UCONTEXT_RESP_MASK_MKEY_UPDATE_TAG;
1934  
1935  	return 0;
1936  }
1937  
1938  static bool uctx_rdma_ctrl_is_enabled(u64 enabled_caps)
1939  {
1940  	return UCAP_ENABLED(enabled_caps, RDMA_UCAP_MLX5_CTRL_LOCAL) ||
1941  	       UCAP_ENABLED(enabled_caps, RDMA_UCAP_MLX5_CTRL_OTHER_VHCA);
1942  }
1943  
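/*
 * Allocate a user context: parse the v0/v2 request, optionally create a DEVX
 * uid (and a privileged uid when RDMA control capabilities were granted),
 * size and allocate the static UARs unless the library asked for fully
 * dynamic UARs (MLX5_LIB_CAP_DYN_UAR), allocate a transport domain, and
 * finally copy the response back to userspace.
 */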
1944  static int mlx5_ib_alloc_ucontext(struct ib_ucontext *uctx,
1945  				  struct ib_udata *udata)
1946  {
1947  	struct ib_device *ibdev = uctx->device;
1948  	struct mlx5_ib_dev *dev = to_mdev(ibdev);
1949  	struct mlx5_ib_alloc_ucontext_req_v2 req = {};
1950  	struct mlx5_ib_alloc_ucontext_resp resp = {};
1951  	struct mlx5_ib_ucontext *context = to_mucontext(uctx);
1952  	struct mlx5_bfreg_info *bfregi;
1953  	int ver;
1954  	int err;
1955  	size_t min_req_v2 = offsetof(struct mlx5_ib_alloc_ucontext_req_v2,
1956  				     max_cqe_version);
1957  	bool lib_uar_4k;
1958  	bool lib_uar_dyn;
1959  
1960  	if (!dev->ib_active)
1961  		return -EAGAIN;
1962  
1963  	if (udata->inlen == sizeof(struct mlx5_ib_alloc_ucontext_req))
1964  		ver = 0;
1965  	else if (udata->inlen >= min_req_v2)
1966  		ver = 2;
1967  	else
1968  		return -EINVAL;
1969  
1970  	err = ib_copy_from_udata(&req, udata, min(udata->inlen, sizeof(req)));
1971  	if (err)
1972  		return err;
1973  
1974  	if (req.flags & ~MLX5_IB_ALLOC_UCTX_DEVX)
1975  		return -EOPNOTSUPP;
1976  
1977  	if (req.comp_mask || req.reserved0 || req.reserved1 || req.reserved2)
1978  		return -EOPNOTSUPP;
1979  
1980  	req.total_num_bfregs = ALIGN(req.total_num_bfregs,
1981  				    MLX5_NON_FP_BFREGS_PER_UAR);
1982  	if (req.num_low_latency_bfregs > req.total_num_bfregs - 1)
1983  		return -EINVAL;
1984  
1985  	if (req.flags & MLX5_IB_ALLOC_UCTX_DEVX) {
1986  		err = mlx5_ib_devx_create(dev, true, uctx->enabled_caps);
1987  		if (err < 0)
1988  			goto out_ctx;
1989  		context->devx_uid = err;
1990  
1991  		if (uctx_rdma_ctrl_is_enabled(uctx->enabled_caps)) {
1992  			err = mlx5_cmd_add_privileged_uid(dev->mdev,
1993  							  context->devx_uid);
1994  			if (err)
1995  				goto out_devx;
1996  		}
1997  	}
1998  
1999  	lib_uar_4k = req.lib_caps & MLX5_LIB_CAP_4K_UAR;
2000  	lib_uar_dyn = req.lib_caps & MLX5_LIB_CAP_DYN_UAR;
2001  	bfregi = &context->bfregi;
2002  
2003  	if (lib_uar_dyn) {
2004  		bfregi->lib_uar_dyn = lib_uar_dyn;
2005  		goto uar_done;
2006  	}
2007  
2008  	/* updates req->total_num_bfregs */
2009  	err = calc_total_bfregs(dev, lib_uar_4k, &req, bfregi);
2010  	if (err)
2011  		goto out_ucap;
2012  
2013  	mutex_init(&bfregi->lock);
2014  	bfregi->lib_uar_4k = lib_uar_4k;
2015  	bfregi->count = kcalloc(bfregi->total_num_bfregs, sizeof(*bfregi->count),
2016  				GFP_KERNEL);
2017  	if (!bfregi->count) {
2018  		err = -ENOMEM;
2019  		goto out_ucap;
2020  	}
2021  
2022  	bfregi->sys_pages = kcalloc(bfregi->num_sys_pages,
2023  				    sizeof(*bfregi->sys_pages),
2024  				    GFP_KERNEL);
2025  	if (!bfregi->sys_pages) {
2026  		err = -ENOMEM;
2027  		goto out_count;
2028  	}
2029  
2030  	err = allocate_uars(dev, context);
2031  	if (err)
2032  		goto out_sys_pages;
2033  
2034  uar_done:
2035  	err = mlx5_ib_alloc_transport_domain(dev, &context->tdn,
2036  					     context->devx_uid);
2037  	if (err)
2038  		goto out_uars;
2039  
2040  	INIT_LIST_HEAD(&context->db_page_list);
2041  	mutex_init(&context->db_page_mutex);
2042  
2043  	context->cqe_version = min_t(__u8,
2044  				 (__u8)MLX5_CAP_GEN(dev->mdev, cqe_version),
2045  				 req.max_cqe_version);
2046  
2047  	err = set_ucontext_resp(uctx, &resp);
2048  	if (err)
2049  		goto out_mdev;
2050  
2051  	resp.response_length = min(udata->outlen, sizeof(resp));
2052  	err = ib_copy_to_udata(udata, &resp, resp.response_length);
2053  	if (err)
2054  		goto out_mdev;
2055  
2056  	bfregi->ver = ver;
2057  	bfregi->num_low_latency_bfregs = req.num_low_latency_bfregs;
2058  	context->lib_caps = req.lib_caps;
2059  	print_lib_caps(dev, context->lib_caps);
2060  
2061  	if (mlx5_ib_lag_should_assign_affinity(dev)) {
2062  		u32 port = mlx5_core_native_port_num(dev->mdev) - 1;
2063  
2064  		atomic_set(&context->tx_port_affinity,
2065  			   atomic_add_return(
2066  				   1, &dev->port[port].roce.tx_port_affinity));
2067  	}
2068  
2069  	return 0;
2070  
2071  out_mdev:
2072  	mlx5_ib_dealloc_transport_domain(dev, context->tdn, context->devx_uid);
2073  
2074  out_uars:
2075  	deallocate_uars(dev, context);
2076  
2077  out_sys_pages:
2078  	kfree(bfregi->sys_pages);
2079  
2080  out_count:
2081  	kfree(bfregi->count);
2082  
2083  out_ucap:
2084  	if (req.flags & MLX5_IB_ALLOC_UCTX_DEVX &&
2085  	    uctx_rdma_ctrl_is_enabled(uctx->enabled_caps))
2086  		mlx5_cmd_remove_privileged_uid(dev->mdev, context->devx_uid);
2087  
2088  out_devx:
2089  	if (req.flags & MLX5_IB_ALLOC_UCTX_DEVX)
2090  		mlx5_ib_devx_destroy(dev, context->devx_uid);
2091  
2092  out_ctx:
2093  	return err;
2094  }
2095  
2096  static int mlx5_ib_query_ucontext(struct ib_ucontext *ibcontext,
2097  				  struct uverbs_attr_bundle *attrs)
2098  {
2099  	struct mlx5_ib_alloc_ucontext_resp uctx_resp = {};
2100  	int ret;
2101  
2102  	ret = set_ucontext_resp(ibcontext, &uctx_resp);
2103  	if (ret)
2104  		return ret;
2105  
2106  	uctx_resp.response_length =
2107  		min_t(size_t,
2108  		      uverbs_attr_get_len(attrs,
2109  				MLX5_IB_ATTR_QUERY_CONTEXT_RESP_UCTX),
2110  		      sizeof(uctx_resp));
2111  
2112  	ret = uverbs_copy_to_struct_or_zero(attrs,
2113  					MLX5_IB_ATTR_QUERY_CONTEXT_RESP_UCTX,
2114  					&uctx_resp,
2115  					sizeof(uctx_resp));
2116  	return ret;
2117  }
2118  
2119  static void mlx5_ib_dealloc_ucontext(struct ib_ucontext *ibcontext)
2120  {
2121  	struct mlx5_ib_ucontext *context = to_mucontext(ibcontext);
2122  	struct mlx5_ib_dev *dev = to_mdev(ibcontext->device);
2123  	struct mlx5_bfreg_info *bfregi;
2124  
2125  	bfregi = &context->bfregi;
2126  	mlx5_ib_dealloc_transport_domain(dev, context->tdn, context->devx_uid);
2127  
2128  	deallocate_uars(dev, context);
2129  	kfree(bfregi->sys_pages);
2130  	kfree(bfregi->count);
2131  
2132  	if (context->devx_uid) {
2133  		if (uctx_rdma_ctrl_is_enabled(ibcontext->enabled_caps))
2134  			mlx5_cmd_remove_privileged_uid(dev->mdev,
2135  						       context->devx_uid);
2136  		mlx5_ib_devx_destroy(dev, context->devx_uid);
2137  	}
2138  }
2139  
2140  static phys_addr_t uar_index2pfn(struct mlx5_ib_dev *dev,
2141  				 int uar_idx)
2142  {
2143  	int fw_uars_per_page;
2144  
2145  	fw_uars_per_page = MLX5_CAP_GEN(dev->mdev, uar_4k) ? MLX5_UARS_IN_PAGE : 1;
2146  
2147  	return (dev->mdev->bar_addr >> PAGE_SHIFT) + uar_idx / fw_uars_per_page;
2148  }
2149  
2150  static u64 uar_index2paddress(struct mlx5_ib_dev *dev,
2151  				 int uar_idx)
2152  {
2153  	unsigned int fw_uars_per_page;
2154  
2155  	fw_uars_per_page = MLX5_CAP_GEN(dev->mdev, uar_4k) ?
2156  				MLX5_UARS_IN_PAGE : 1;
2157  
2158  	return (dev->mdev->bar_addr + (uar_idx / fw_uars_per_page) * PAGE_SIZE);
2159  }
2160  
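/*
 * The mmap offset (vm_pgoff) multiplexes the different mapping types: the
 * command lives above MLX5_IB_MMAP_CMD_SHIFT and the argument/index sits
 * below it, so for small indices pgoff is simply
 * (cmd << MLX5_IB_MMAP_CMD_SHIFT) | index. get_extended_index() additionally
 * pulls one extra byte from bits 16..23 so indices above 255 can be encoded.
 */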
2161  static int get_command(unsigned long offset)
2162  {
2163  	return (offset >> MLX5_IB_MMAP_CMD_SHIFT) & MLX5_IB_MMAP_CMD_MASK;
2164  }
2165  
2166  static int get_arg(unsigned long offset)
2167  {
2168  	return offset & ((1 << MLX5_IB_MMAP_CMD_SHIFT) - 1);
2169  }
2170  
2171  static int get_index(unsigned long offset)
2172  {
2173  	return get_arg(offset);
2174  }
2175  
2176  /* The index carries an extra byte so values larger than 255 can be encoded */
2177  static int get_extended_index(unsigned long offset)
2178  {
2179  	return get_arg(offset) | ((offset >> 16) & 0xff) << 8;
2180  }
2181  
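/*
 * Intentionally empty: presumably the RDMA core already zaps the user
 * mappings created through rdma_user_mmap_io() on disassociation, so no
 * driver-private teardown is needed here.
 */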
2183  static void mlx5_ib_disassociate_ucontext(struct ib_ucontext *ibcontext)
2184  {
2185  }
2186  
2187  static inline char *mmap_cmd2str(enum mlx5_ib_mmap_cmd cmd)
2188  {
2189  	switch (cmd) {
2190  	case MLX5_IB_MMAP_WC_PAGE:
2191  		return "WC";
2192  	case MLX5_IB_MMAP_REGULAR_PAGE:
2193  		return "best effort WC";
2194  	case MLX5_IB_MMAP_NC_PAGE:
2195  		return "NC";
2196  	case MLX5_IB_MMAP_DEVICE_MEM:
2197  		return "Device Memory";
2198  	default:
2199  		return "Unknown";
2200  	}
2201  }
2202  
2203  static int mlx5_ib_mmap_clock_info_page(struct mlx5_ib_dev *dev,
2204  					struct vm_area_struct *vma,
2205  					struct mlx5_ib_ucontext *context)
2206  {
2207  	if ((vma->vm_end - vma->vm_start != PAGE_SIZE) ||
2208  	    !(vma->vm_flags & VM_SHARED))
2209  		return -EINVAL;
2210  
2211  	if (get_index(vma->vm_pgoff) != MLX5_IB_CLOCK_INFO_V1)
2212  		return -EOPNOTSUPP;
2213  
2214  	if (vma->vm_flags & (VM_WRITE | VM_EXEC))
2215  		return -EPERM;
2216  	vm_flags_clear(vma, VM_MAYWRITE);
2217  
2218  	if (!dev->mdev->clock_info)
2219  		return -EOPNOTSUPP;
2220  
2221  	return vm_insert_page(vma, vma->vm_start,
2222  			      virt_to_page(dev->mdev->clock_info));
2223  }
2224  
2225  static void mlx5_ib_mmap_free(struct rdma_user_mmap_entry *entry)
2226  {
2227  	struct mlx5_user_mmap_entry *mentry = to_mmmap(entry);
2228  	struct mlx5_ib_dev *dev = to_mdev(entry->ucontext->device);
2229  	struct mlx5_var_table *var_table = &dev->var_table;
2230  	struct mlx5_ib_ucontext *context = to_mucontext(entry->ucontext);
2231  
2232  	switch (mentry->mmap_flag) {
2233  	case MLX5_IB_MMAP_TYPE_MEMIC:
2234  	case MLX5_IB_MMAP_TYPE_MEMIC_OP:
2235  		mlx5_ib_dm_mmap_free(dev, mentry);
2236  		break;
2237  	case MLX5_IB_MMAP_TYPE_VAR:
2238  		mutex_lock(&var_table->bitmap_lock);
2239  		clear_bit(mentry->page_idx, var_table->bitmap);
2240  		mutex_unlock(&var_table->bitmap_lock);
2241  		kfree(mentry);
2242  		break;
2243  	case MLX5_IB_MMAP_TYPE_UAR_WC:
2244  	case MLX5_IB_MMAP_TYPE_UAR_NC:
2245  		mlx5_cmd_uar_dealloc(dev->mdev, mentry->page_idx,
2246  				     context->devx_uid);
2247  		kfree(mentry);
2248  		break;
2249  	default:
2250  		WARN_ON(true);
2251  	}
2252  }
2253  
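/*
 * Map a UAR page into userspace. For MLX5_IB_MMAP_ALLOC_WC the UAR is
 * allocated on demand (dynamic bfregs); otherwise the index selects one of
 * the statically allocated system pages. The caching attribute (WC vs NC) is
 * chosen from the mmap command.
 */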
2254  static int uar_mmap(struct mlx5_ib_dev *dev, enum mlx5_ib_mmap_cmd cmd,
2255  		    struct vm_area_struct *vma,
2256  		    struct mlx5_ib_ucontext *context)
2257  {
2258  	struct mlx5_bfreg_info *bfregi = &context->bfregi;
2259  	int err;
2260  	unsigned long idx;
2261  	phys_addr_t pfn;
2262  	pgprot_t prot;
2263  	u32 bfreg_dyn_idx = 0;
2264  	u32 uar_index;
2265  	int dyn_uar = (cmd == MLX5_IB_MMAP_ALLOC_WC);
2266  	int max_valid_idx = dyn_uar ? bfregi->num_sys_pages :
2267  				bfregi->num_static_sys_pages;
2268  
2269  	if (bfregi->lib_uar_dyn)
2270  		return -EINVAL;
2271  
2272  	if (vma->vm_end - vma->vm_start != PAGE_SIZE)
2273  		return -EINVAL;
2274  
2275  	if (dyn_uar)
2276  		idx = get_extended_index(vma->vm_pgoff) + bfregi->num_static_sys_pages;
2277  	else
2278  		idx = get_index(vma->vm_pgoff);
2279  
2280  	if (idx >= max_valid_idx) {
2281  		mlx5_ib_warn(dev, "invalid uar index %lu, max=%d\n",
2282  			     idx, max_valid_idx);
2283  		return -EINVAL;
2284  	}
2285  
2286  	switch (cmd) {
2287  	case MLX5_IB_MMAP_WC_PAGE:
2288  	case MLX5_IB_MMAP_ALLOC_WC:
2289  	case MLX5_IB_MMAP_REGULAR_PAGE:
2290  		/* For MLX5_IB_MMAP_REGULAR_PAGE, make a best effort to get WC */
2291  		prot = pgprot_writecombine(vma->vm_page_prot);
2292  		break;
2293  	case MLX5_IB_MMAP_NC_PAGE:
2294  		prot = pgprot_noncached(vma->vm_page_prot);
2295  		break;
2296  	default:
2297  		return -EINVAL;
2298  	}
2299  
2300  	if (dyn_uar) {
2301  		int uars_per_page;
2302  
2303  		uars_per_page = get_uars_per_sys_page(dev, bfregi->lib_uar_4k);
2304  		bfreg_dyn_idx = idx * (uars_per_page * MLX5_NON_FP_BFREGS_PER_UAR);
2305  		if (bfreg_dyn_idx >= bfregi->total_num_bfregs) {
2306  			mlx5_ib_warn(dev, "invalid bfreg_dyn_idx %u, max=%u\n",
2307  				     bfreg_dyn_idx, bfregi->total_num_bfregs);
2308  			return -EINVAL;
2309  		}
2310  
2311  		mutex_lock(&bfregi->lock);
2312  		/* Fail if the UAR is already allocated; the first bfreg index of
2313  		 * each page holds that page's allocation count.
2314  		 */
2315  		if (bfregi->count[bfreg_dyn_idx]) {
2316  			mlx5_ib_warn(dev, "wrong offset, idx %lu is busy, bfregn=%u\n", idx, bfreg_dyn_idx);
2317  			mutex_unlock(&bfregi->lock);
2318  			return -EINVAL;
2319  		}
2320  
2321  		bfregi->count[bfreg_dyn_idx]++;
2322  		mutex_unlock(&bfregi->lock);
2323  
2324  		err = mlx5_cmd_uar_alloc(dev->mdev, &uar_index,
2325  					 context->devx_uid);
2326  		if (err) {
2327  			mlx5_ib_warn(dev, "UAR alloc failed\n");
2328  			goto free_bfreg;
2329  		}
2330  	} else {
2331  		uar_index = bfregi->sys_pages[idx];
2332  	}
2333  
2334  	pfn = uar_index2pfn(dev, uar_index);
2335  	mlx5_ib_dbg(dev, "uar idx 0x%lx, pfn %pa\n", idx, &pfn);
2336  
2337  	err = rdma_user_mmap_io(&context->ibucontext, vma, pfn, PAGE_SIZE,
2338  				prot, NULL);
2339  	if (err) {
2340  		mlx5_ib_err(dev,
2341  			    "rdma_user_mmap_io failed with error=%d, mmap_cmd=%s\n",
2342  			    err, mmap_cmd2str(cmd));
2343  		goto err;
2344  	}
2345  
2346  	if (dyn_uar)
2347  		bfregi->sys_pages[idx] = uar_index;
2348  	return 0;
2349  
2350  err:
2351  	if (!dyn_uar)
2352  		return err;
2353  
2354  	mlx5_cmd_uar_dealloc(dev->mdev, idx, context->devx_uid);
2355  
2356  free_bfreg:
2357  	mlx5_ib_free_bfreg(dev, bfregi, bfreg_dyn_idx);
2358  
2359  	return err;
2360  }
2361  
2362  static unsigned long mlx5_vma_to_pgoff(struct vm_area_struct *vma)
2363  {
2364  	unsigned long idx;
2365  	u8 command;
2366  
2367  	command = get_command(vma->vm_pgoff);
2368  	idx = get_extended_index(vma->vm_pgoff);
2369  
2370  	return (command << 16 | idx);
2371  }
2372  
2373  static int mlx5_ib_mmap_offset(struct mlx5_ib_dev *dev,
2374  			       struct vm_area_struct *vma,
2375  			       struct ib_ucontext *ucontext)
2376  {
2377  	struct mlx5_user_mmap_entry *mentry;
2378  	struct rdma_user_mmap_entry *entry;
2379  	unsigned long pgoff;
2380  	pgprot_t prot;
2381  	phys_addr_t pfn;
2382  	int ret;
2383  
2384  	pgoff = mlx5_vma_to_pgoff(vma);
2385  	entry = rdma_user_mmap_entry_get_pgoff(ucontext, pgoff);
2386  	if (!entry)
2387  		return -EINVAL;
2388  
2389  	mentry = to_mmmap(entry);
2390  	pfn = (mentry->address >> PAGE_SHIFT);
2391  	if (mentry->mmap_flag == MLX5_IB_MMAP_TYPE_VAR ||
2392  	    mentry->mmap_flag == MLX5_IB_MMAP_TYPE_UAR_NC)
2393  		prot = pgprot_noncached(vma->vm_page_prot);
2394  	else
2395  		prot = pgprot_writecombine(vma->vm_page_prot);
2396  	ret = rdma_user_mmap_io(ucontext, vma, pfn,
2397  				entry->npages * PAGE_SIZE,
2398  				prot,
2399  				entry);
2400  	rdma_user_mmap_entry_put(&mentry->rdma_entry);
2401  	return ret;
2402  }
2403  
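/*
 * Reverse of the pgoff packing done by mlx5_vma_to_pgoff()/
 * get_extended_index(): rebuild the byte-granular mmap offset handed to
 * userspace from the rdma_user_mmap entry's start_pgoff (command in the high
 * bits, extended index split across the low bits).
 */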
2404  static u64 mlx5_entry_to_mmap_offset(struct mlx5_user_mmap_entry *entry)
2405  {
2406  	u64 cmd = (entry->rdma_entry.start_pgoff >> 16) & 0xFFFF;
2407  	u64 index = entry->rdma_entry.start_pgoff & 0xFFFF;
2408  
2409  	return (((index >> 8) << 16) | (cmd << MLX5_IB_MMAP_CMD_SHIFT) |
2410  		(index & 0xFF)) << PAGE_SHIFT;
2411  }
2412  
2413  static int mlx5_ib_mmap(struct ib_ucontext *ibcontext, struct vm_area_struct *vma)
2414  {
2415  	struct mlx5_ib_ucontext *context = to_mucontext(ibcontext);
2416  	struct mlx5_ib_dev *dev = to_mdev(ibcontext->device);
2417  	unsigned long command;
2418  	phys_addr_t pfn;
2419  
2420  	command = get_command(vma->vm_pgoff);
2421  	switch (command) {
2422  	case MLX5_IB_MMAP_WC_PAGE:
2423  	case MLX5_IB_MMAP_ALLOC_WC:
2424  		if (!mlx5_wc_support_get(dev->mdev))
2425  			return -EPERM;
2426  		fallthrough;
2427  	case MLX5_IB_MMAP_NC_PAGE:
2428  	case MLX5_IB_MMAP_REGULAR_PAGE:
2429  		return uar_mmap(dev, command, vma, context);
2430  
2431  	case MLX5_IB_MMAP_GET_CONTIGUOUS_PAGES:
2432  		return -ENOSYS;
2433  
2434  	case MLX5_IB_MMAP_CORE_CLOCK:
2435  		if (vma->vm_end - vma->vm_start != PAGE_SIZE)
2436  			return -EINVAL;
2437  
2438  		if (vma->vm_flags & VM_WRITE)
2439  			return -EPERM;
2440  		vm_flags_clear(vma, VM_MAYWRITE);
2441  
2442  		/* Don't expose to user-space information it shouldn't have */
2443  		if (PAGE_SIZE > 4096)
2444  			return -EOPNOTSUPP;
2445  
2446  		pfn = (dev->mdev->iseg_base +
2447  		       offsetof(struct mlx5_init_seg, internal_timer_h)) >>
2448  			PAGE_SHIFT;
2449  		return rdma_user_mmap_io(&context->ibucontext, vma, pfn,
2450  					 PAGE_SIZE,
2451  					 pgprot_noncached(vma->vm_page_prot),
2452  					 NULL);
2453  	case MLX5_IB_MMAP_CLOCK_INFO:
2454  		return mlx5_ib_mmap_clock_info_page(dev, vma, context);
2455  
2456  	default:
2457  		return mlx5_ib_mmap_offset(dev, vma, ibcontext);
2458  	}
2459  
2460  	return 0;
2461  }
2462  
2463  static int mlx5_ib_alloc_pd(struct ib_pd *ibpd, struct ib_udata *udata)
2464  {
2465  	struct mlx5_ib_pd *pd = to_mpd(ibpd);
2466  	struct ib_device *ibdev = ibpd->device;
2467  	struct mlx5_ib_alloc_pd_resp resp;
2468  	int err;
2469  	u32 out[MLX5_ST_SZ_DW(alloc_pd_out)] = {};
2470  	u32 in[MLX5_ST_SZ_DW(alloc_pd_in)] = {};
2471  	u16 uid = 0;
2472  	struct mlx5_ib_ucontext *context = rdma_udata_to_drv_context(
2473  		udata, struct mlx5_ib_ucontext, ibucontext);
2474  
2475  	uid = context ? context->devx_uid : 0;
2476  	MLX5_SET(alloc_pd_in, in, opcode, MLX5_CMD_OP_ALLOC_PD);
2477  	MLX5_SET(alloc_pd_in, in, uid, uid);
2478  	err = mlx5_cmd_exec_inout(to_mdev(ibdev)->mdev, alloc_pd, in, out);
2479  	if (err)
2480  		return err;
2481  
2482  	pd->pdn = MLX5_GET(alloc_pd_out, out, pd);
2483  	pd->uid = uid;
2484  	if (udata) {
2485  		resp.pdn = pd->pdn;
2486  		if (ib_copy_to_udata(udata, &resp, sizeof(resp))) {
2487  			mlx5_cmd_dealloc_pd(to_mdev(ibdev)->mdev, pd->pdn, uid);
2488  			return -EFAULT;
2489  		}
2490  	}
2491  
2492  	return 0;
2493  }
2494  
2495  static int mlx5_ib_dealloc_pd(struct ib_pd *pd, struct ib_udata *udata)
2496  {
2497  	struct mlx5_ib_dev *mdev = to_mdev(pd->device);
2498  	struct mlx5_ib_pd *mpd = to_mpd(pd);
2499  
2500  	return mlx5_cmd_dealloc_pd(mdev->mdev, mpd->pdn, mpd->uid);
2501  }
2502  
2503  static int mlx5_ib_mcg_attach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid)
2504  {
2505  	struct mlx5_ib_dev *dev = to_mdev(ibqp->device);
2506  	struct mlx5_ib_qp *mqp = to_mqp(ibqp);
2507  	int err;
2508  	u16 uid;
2509  
2510  	uid = ibqp->pd ?
2511  		to_mpd(ibqp->pd)->uid : 0;
2512  
2513  	if (mqp->flags & IB_QP_CREATE_SOURCE_QPN) {
2514  		mlx5_ib_dbg(dev, "Attaching a multi cast group to underlay QP is not supported\n");
2515  		return -EOPNOTSUPP;
2516  	}
2517  
2518  	err = mlx5_cmd_attach_mcg(dev->mdev, gid, ibqp->qp_num, uid);
2519  	if (err)
2520  		mlx5_ib_warn(dev, "failed attaching QPN 0x%x, MGID %pI6\n",
2521  			     ibqp->qp_num, gid->raw);
2522  
2523  	return err;
2524  }
2525  
2526  static int mlx5_ib_mcg_detach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid)
2527  {
2528  	struct mlx5_ib_dev *dev = to_mdev(ibqp->device);
2529  	int err;
2530  	u16 uid;
2531  
2532  	uid = ibqp->pd ?
2533  		to_mpd(ibqp->pd)->uid : 0;
2534  	err = mlx5_cmd_detach_mcg(dev->mdev, gid, ibqp->qp_num, uid);
2535  	if (err)
2536  		mlx5_ib_warn(dev, "failed detaching QPN 0x%x, MGID %pI6\n",
2537  			     ibqp->qp_num, gid->raw);
2538  
2539  	return err;
2540  }
2541  
2542  static int init_node_data(struct mlx5_ib_dev *dev)
2543  {
2544  	int err;
2545  
2546  	err = mlx5_query_node_desc(dev, dev->ib_dev.node_desc);
2547  	if (err)
2548  		return err;
2549  
2550  	dev->mdev->rev_id = dev->mdev->pdev->revision;
2551  
2552  	return mlx5_query_node_guid(dev, &dev->ib_dev.node_guid);
2553  }
2554  
2555  static ssize_t fw_pages_show(struct device *device,
2556  			     struct device_attribute *attr, char *buf)
2557  {
2558  	struct mlx5_ib_dev *dev =
2559  		rdma_device_to_drv_device(device, struct mlx5_ib_dev, ib_dev);
2560  
2561  	return sysfs_emit(buf, "%d\n", dev->mdev->priv.fw_pages);
2562  }
2563  static DEVICE_ATTR_RO(fw_pages);
2564  
2565  static ssize_t reg_pages_show(struct device *device,
2566  			      struct device_attribute *attr, char *buf)
2567  {
2568  	struct mlx5_ib_dev *dev =
2569  		rdma_device_to_drv_device(device, struct mlx5_ib_dev, ib_dev);
2570  
2571  	return sysfs_emit(buf, "%d\n", atomic_read(&dev->mdev->priv.reg_pages));
2572  }
2573  static DEVICE_ATTR_RO(reg_pages);
2574  
2575  static ssize_t hca_type_show(struct device *device,
2576  			     struct device_attribute *attr, char *buf)
2577  {
2578  	struct mlx5_ib_dev *dev =
2579  		rdma_device_to_drv_device(device, struct mlx5_ib_dev, ib_dev);
2580  
2581  	return sysfs_emit(buf, "MT%d\n", dev->mdev->pdev->device);
2582  }
2583  static DEVICE_ATTR_RO(hca_type);
2584  
2585  static ssize_t hw_rev_show(struct device *device,
2586  			   struct device_attribute *attr, char *buf)
2587  {
2588  	struct mlx5_ib_dev *dev =
2589  		rdma_device_to_drv_device(device, struct mlx5_ib_dev, ib_dev);
2590  
2591  	return sysfs_emit(buf, "%x\n", dev->mdev->rev_id);
2592  }
2593  static DEVICE_ATTR_RO(hw_rev);
2594  
2595  static ssize_t board_id_show(struct device *device,
2596  			     struct device_attribute *attr, char *buf)
2597  {
2598  	struct mlx5_ib_dev *dev =
2599  		rdma_device_to_drv_device(device, struct mlx5_ib_dev, ib_dev);
2600  
2601  	return sysfs_emit(buf, "%.*s\n", MLX5_BOARD_ID_LEN,
2602  			  dev->mdev->board_id);
2603  }
2604  static DEVICE_ATTR_RO(board_id);
2605  
2606  static struct attribute *mlx5_class_attributes[] = {
2607  	&dev_attr_hw_rev.attr,
2608  	&dev_attr_hca_type.attr,
2609  	&dev_attr_board_id.attr,
2610  	&dev_attr_fw_pages.attr,
2611  	&dev_attr_reg_pages.attr,
2612  	NULL,
2613  };
2614  
2615  static const struct attribute_group mlx5_attr_group = {
2616  	.attrs = mlx5_class_attributes,
2617  };
2618  
2619  static void pkey_change_handler(struct work_struct *work)
2620  {
2621  	struct mlx5_ib_port_resources *ports =
2622  		container_of(work, struct mlx5_ib_port_resources,
2623  			     pkey_change_work);
2624  
2625  	if (!ports->gsi)
2626  		/*
2627  		 * We got this event before device was fully configured
2628  		 * and MAD registration code wasn't called/finished yet.
2629  		 */
2630  		return;
2631  
2632  	mlx5_ib_gsi_pkey_change(ports->gsi);
2633  }
2634  
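/*
 * On a fatal device error, collect every CQ attached to a QP that still has
 * posted work and invoke its completion handler, presumably so that ULPs
 * polling those CQs are woken even though the hardware will no longer
 * generate events.
 */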
2635  static void mlx5_ib_handle_internal_error(struct mlx5_ib_dev *ibdev)
2636  {
2637  	struct mlx5_ib_qp *mqp;
2638  	struct mlx5_ib_cq *send_mcq, *recv_mcq;
2639  	struct mlx5_core_cq *mcq;
2640  	struct list_head cq_armed_list;
2641  	unsigned long flags_qp;
2642  	unsigned long flags_cq;
2643  	unsigned long flags;
2644  
2645  	INIT_LIST_HEAD(&cq_armed_list);
2646  
2647  	/* Walk the QP list of this ibdev, synchronized against QP create/destroy. */
2648  	spin_lock_irqsave(&ibdev->reset_flow_resource_lock, flags);
2649  	list_for_each_entry(mqp, &ibdev->qp_list, qps_list) {
2650  		spin_lock_irqsave(&mqp->sq.lock, flags_qp);
2651  		if (mqp->sq.tail != mqp->sq.head) {
2652  			send_mcq = to_mcq(mqp->ibqp.send_cq);
2653  			spin_lock_irqsave(&send_mcq->lock, flags_cq);
2654  			if (send_mcq->mcq.comp &&
2655  			    mqp->ibqp.send_cq->comp_handler) {
2656  				if (!send_mcq->mcq.reset_notify_added) {
2657  					send_mcq->mcq.reset_notify_added = 1;
2658  					list_add_tail(&send_mcq->mcq.reset_notify,
2659  						      &cq_armed_list);
2660  				}
2661  			}
2662  			spin_unlock_irqrestore(&send_mcq->lock, flags_cq);
2663  		}
2664  		spin_unlock_irqrestore(&mqp->sq.lock, flags_qp);
2665  		spin_lock_irqsave(&mqp->rq.lock, flags_qp);
2666  		/* no handling is needed for SRQ */
2667  		if (!mqp->ibqp.srq) {
2668  			if (mqp->rq.tail != mqp->rq.head) {
2669  				recv_mcq = to_mcq(mqp->ibqp.recv_cq);
2670  				spin_lock_irqsave(&recv_mcq->lock, flags_cq);
2671  				if (recv_mcq->mcq.comp &&
2672  				    mqp->ibqp.recv_cq->comp_handler) {
2673  					if (!recv_mcq->mcq.reset_notify_added) {
2674  						recv_mcq->mcq.reset_notify_added = 1;
2675  						list_add_tail(&recv_mcq->mcq.reset_notify,
2676  							      &cq_armed_list);
2677  					}
2678  				}
2679  				spin_unlock_irqrestore(&recv_mcq->lock,
2680  						       flags_cq);
2681  			}
2682  		}
2683  		spin_unlock_irqrestore(&mqp->rq.lock, flags_qp);
2684  	}
2685  	/* At this point all in-flight post_send operations have been observed
2686  	 * (the locks above were taken and released); now invoke the completion
2687  	 * handler of every involved CQ. */
2688  	list_for_each_entry(mcq, &cq_armed_list, reset_notify) {
2689  		mcq->comp(mcq, NULL);
2690  	}
2691  	spin_unlock_irqrestore(&ibdev->reset_flow_resource_lock, flags);
2692  }
2693  
2694  static void delay_drop_handler(struct work_struct *work)
2695  {
2696  	int err;
2697  	struct mlx5_ib_delay_drop *delay_drop =
2698  		container_of(work, struct mlx5_ib_delay_drop,
2699  			     delay_drop_work);
2700  
2701  	atomic_inc(&delay_drop->events_cnt);
2702  
2703  	mutex_lock(&delay_drop->lock);
2704  	err = mlx5_core_set_delay_drop(delay_drop->dev, delay_drop->timeout);
2705  	if (err) {
2706  		mlx5_ib_warn(delay_drop->dev, "Failed to set delay drop, timeout=%u\n",
2707  			     delay_drop->timeout);
2708  		delay_drop->activate = false;
2709  	}
2710  	mutex_unlock(&delay_drop->lock);
2711  }
2712  
2713  static void handle_general_event(struct mlx5_ib_dev *ibdev, struct mlx5_eqe *eqe,
2714  				 struct ib_event *ibev)
2715  {
2716  	u32 port = (eqe->data.port.port >> 4) & 0xf;
2717  
2718  	switch (eqe->sub_type) {
2719  	case MLX5_GENERAL_SUBTYPE_DELAY_DROP_TIMEOUT:
2720  		if (mlx5_ib_port_link_layer(&ibdev->ib_dev, port) ==
2721  					    IB_LINK_LAYER_ETHERNET)
2722  			schedule_work(&ibdev->delay_drop.delay_drop_work);
2723  		break;
2724  	default: /* do nothing */
2725  		return;
2726  	}
2727  }
2728  
2729  static int handle_port_change(struct mlx5_ib_dev *ibdev, struct mlx5_eqe *eqe,
2730  			      struct ib_event *ibev)
2731  {
2732  	u32 port = (eqe->data.port.port >> 4) & 0xf;
2733  
2734  	ibev->element.port_num = port;
2735  
2736  	switch (eqe->sub_type) {
2737  	case MLX5_PORT_CHANGE_SUBTYPE_ACTIVE:
2738  	case MLX5_PORT_CHANGE_SUBTYPE_DOWN:
2739  	case MLX5_PORT_CHANGE_SUBTYPE_INITIALIZED:
2740  		/* In RoCE, port up/down events are handled in
2741  		 * mlx5_netdev_event().
2742  		 */
2743  		if (mlx5_ib_port_link_layer(&ibdev->ib_dev, port) ==
2744  					    IB_LINK_LAYER_ETHERNET)
2745  			return -EINVAL;
2746  
2747  		ibev->event = (eqe->sub_type == MLX5_PORT_CHANGE_SUBTYPE_ACTIVE) ?
2748  				IB_EVENT_PORT_ACTIVE : IB_EVENT_PORT_ERR;
2749  		break;
2750  
2751  	case MLX5_PORT_CHANGE_SUBTYPE_LID:
2752  		ibev->event = IB_EVENT_LID_CHANGE;
2753  		break;
2754  
2755  	case MLX5_PORT_CHANGE_SUBTYPE_PKEY:
2756  		ibev->event = IB_EVENT_PKEY_CHANGE;
2757  		schedule_work(&ibdev->devr.ports[port - 1].pkey_change_work);
2758  		break;
2759  
2760  	case MLX5_PORT_CHANGE_SUBTYPE_GUID:
2761  		ibev->event = IB_EVENT_GID_CHANGE;
2762  		break;
2763  
2764  	case MLX5_PORT_CHANGE_SUBTYPE_CLIENT_REREG:
2765  		ibev->event = IB_EVENT_CLIENT_REREGISTER;
2766  		break;
2767  	default:
2768  		return -EINVAL;
2769  	}
2770  
2771  	return 0;
2772  }
2773  
2774  static void mlx5_ib_handle_event(struct work_struct *_work)
2775  {
2776  	struct mlx5_ib_event_work *work =
2777  		container_of(_work, struct mlx5_ib_event_work, work);
2778  	struct mlx5_ib_dev *ibdev;
2779  	struct ib_event ibev;
2780  	bool fatal = false;
2781  
2782  	if (work->is_slave) {
2783  		ibdev = mlx5_ib_get_ibdev_from_mpi(work->mpi);
2784  		if (!ibdev)
2785  			goto out;
2786  	} else {
2787  		ibdev = work->dev;
2788  	}
2789  
2790  	switch (work->event) {
2791  	case MLX5_DEV_EVENT_SYS_ERROR:
2792  		ibev.event = IB_EVENT_DEVICE_FATAL;
2793  		mlx5_ib_handle_internal_error(ibdev);
2794  		ibev.element.port_num  = (u8)(unsigned long)work->param;
2795  		fatal = true;
2796  		break;
2797  	case MLX5_EVENT_TYPE_PORT_CHANGE:
2798  		if (handle_port_change(ibdev, work->param, &ibev))
2799  			goto out;
2800  		break;
2801  	case MLX5_EVENT_TYPE_GENERAL_EVENT:
2802  		handle_general_event(ibdev, work->param, &ibev);
2803  		fallthrough;
2804  	default:
2805  		goto out;
2806  	}
2807  
2808  	ibev.device = &ibdev->ib_dev;
2809  
2810  	if (!rdma_is_port_valid(&ibdev->ib_dev, ibev.element.port_num)) {
2811  		mlx5_ib_warn(ibdev, "warning: event on port %d\n",  ibev.element.port_num);
2812  		goto out;
2813  	}
2814  
2815  	if (ibdev->ib_active)
2816  		ib_dispatch_event(&ibev);
2817  
2818  	if (fatal)
2819  		ibdev->ib_active = false;
2820  out:
2821  	kfree(work);
2822  }
2823  
2824  static int mlx5_ib_event(struct notifier_block *nb,
2825  			 unsigned long event, void *param)
2826  {
2827  	struct mlx5_ib_event_work *work;
2828  
2829  	work = kmalloc(sizeof(*work), GFP_ATOMIC);
2830  	if (!work)
2831  		return NOTIFY_DONE;
2832  
2833  	INIT_WORK(&work->work, mlx5_ib_handle_event);
2834  	work->dev = container_of(nb, struct mlx5_ib_dev, mdev_events);
2835  	work->is_slave = false;
2836  	work->param = param;
2837  	work->event = event;
2838  
2839  	queue_work(mlx5_ib_event_wq, &work->work);
2840  
2841  	return NOTIFY_OK;
2842  }
2843  
2844  static int mlx5_ib_event_slave_port(struct notifier_block *nb,
2845  				    unsigned long event, void *param)
2846  {
2847  	struct mlx5_ib_event_work *work;
2848  
2849  	work = kmalloc(sizeof(*work), GFP_ATOMIC);
2850  	if (!work)
2851  		return NOTIFY_DONE;
2852  
2853  	INIT_WORK(&work->work, mlx5_ib_handle_event);
2854  	work->mpi = container_of(nb, struct mlx5_ib_multiport_info, mdev_events);
2855  	work->is_slave = true;
2856  	work->param = param;
2857  	work->event = event;
2858  	queue_work(mlx5_ib_event_wq, &work->work);
2859  
2860  	return NOTIFY_OK;
2861  }
2862  
2863  static int mlx5_ib_get_plane_num(struct mlx5_core_dev *mdev, u8 *num_plane)
2864  {
2865  	struct mlx5_hca_vport_context vport_ctx;
2866  	int err;
2867  
2868  	*num_plane = 0;
2869  	if (!MLX5_CAP_GEN(mdev, ib_virt) || !MLX5_CAP_GEN_2(mdev, multiplane))
2870  		return 0;
2871  
2872  	err = mlx5_query_hca_vport_context(mdev, 0, 1, 0, &vport_ctx);
2873  	if (err)
2874  		return err;
2875  
2876  	*num_plane = vport_ctx.num_plane;
2877  	return 0;
2878  }
2879  
2880  static int set_has_smi_cap(struct mlx5_ib_dev *dev)
2881  {
2882  	struct mlx5_hca_vport_context vport_ctx;
2883  	int err;
2884  	int port;
2885  
2886  	if (MLX5_CAP_GEN(dev->mdev, port_type) != MLX5_CAP_PORT_TYPE_IB)
2887  		return 0;
2888  
2889  	for (port = 1; port <= dev->num_ports; port++) {
2890  		if (dev->num_plane) {
2891  			dev->port_caps[port - 1].has_smi = false;
2892  			continue;
2893  		} else if (!MLX5_CAP_GEN(dev->mdev, ib_virt) ||
2894  			dev->ib_dev.type == RDMA_DEVICE_TYPE_SMI) {
2895  			dev->port_caps[port - 1].has_smi = true;
2896  			continue;
2897  		}
2898  
2899  		err = mlx5_query_hca_vport_context(dev->mdev, 0, port, 0,
2900  						   &vport_ctx);
2901  		if (err) {
2902  			mlx5_ib_err(dev, "query_hca_vport_context for port=%d failed %d\n",
2903  				    port, err);
2904  			return err;
2905  		}
2906  		dev->port_caps[port - 1].has_smi = vport_ctx.has_smi;
2907  	}
2908  
2909  	return 0;
2910  }
2911  
2912  static void get_ext_port_caps(struct mlx5_ib_dev *dev)
2913  {
2914  	unsigned int port;
2915  
2916  	rdma_for_each_port (&dev->ib_dev, port)
2917  		mlx5_query_ext_port_caps(dev, port);
2918  }
2919  
2920  static u8 mlx5_get_umr_fence(u8 umr_fence_cap)
2921  {
2922  	switch (umr_fence_cap) {
2923  	case MLX5_CAP_UMR_FENCE_NONE:
2924  		return MLX5_FENCE_MODE_NONE;
2925  	case MLX5_CAP_UMR_FENCE_SMALL:
2926  		return MLX5_FENCE_MODE_INITIATOR_SMALL;
2927  	default:
2928  		return MLX5_FENCE_MODE_STRONG_ORDERING;
2929  	}
2930  }
2931  
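/*
 * Lazily create the shared PD (devr->p0) and CQ (devr->c0) used by the
 * driver's internal resources (e.g. the SRQs below). The double-checked
 * devr->c0 test avoids taking the mutex once initialization has completed,
 * since the pointers are never cleared until device unload.
 */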
2932  int mlx5_ib_dev_res_cq_init(struct mlx5_ib_dev *dev)
2933  {
2934  	struct mlx5_ib_resources *devr = &dev->devr;
2935  	struct ib_cq_init_attr cq_attr = {.cqe = 1};
2936  	struct ib_device *ibdev;
2937  	struct ib_pd *pd;
2938  	struct ib_cq *cq;
2939  	int ret = 0;
2940  
2942  	/*
2943  	 * devr->c0 is set once, never changed until device unload.
2944  	 * Avoid taking the mutex if initialization is already done.
2945  	 */
2946  	if (devr->c0)
2947  		return 0;
2948  
2949  	mutex_lock(&devr->cq_lock);
2950  	if (devr->c0)
2951  		goto unlock;
2952  
2953  	ibdev = &dev->ib_dev;
2954  	pd = ib_alloc_pd(ibdev, 0);
2955  	if (IS_ERR(pd)) {
2956  		ret = PTR_ERR(pd);
2957  		mlx5_ib_err(dev, "Couldn't allocate PD for res init, err=%d\n", ret);
2958  		goto unlock;
2959  	}
2960  
2961  	cq = ib_create_cq(ibdev, NULL, NULL, NULL, &cq_attr);
2962  	if (IS_ERR(cq)) {
2963  		ret = PTR_ERR(cq);
2964  		mlx5_ib_err(dev, "Couldn't create CQ for res init, err=%d\n", ret);
2965  		ib_dealloc_pd(pd);
2966  		goto unlock;
2967  	}
2968  
2969  	devr->p0 = pd;
2970  	devr->c0 = cq;
2971  
2972  unlock:
2973  	mutex_unlock(&devr->cq_lock);
2974  	return ret;
2975  }
2976  
2977  int mlx5_ib_dev_res_srq_init(struct mlx5_ib_dev *dev)
2978  {
2979  	struct mlx5_ib_resources *devr = &dev->devr;
2980  	struct ib_srq_init_attr attr;
2981  	struct ib_srq *s0, *s1;
2982  	int ret = 0;
2983  
2984  	/*
2985  	 * devr->s1 is set once, never changed until device unload.
2986  	 * Avoid taking the mutex if initialization is already done.
2987  	 */
2988  	if (devr->s1)
2989  		return 0;
2990  
2991  	mutex_lock(&devr->srq_lock);
2992  	if (devr->s1)
2993  		goto unlock;
2994  
2995  	ret = mlx5_ib_dev_res_cq_init(dev);
2996  	if (ret)
2997  		goto unlock;
2998  
2999  	memset(&attr, 0, sizeof(attr));
3000  	attr.attr.max_sge = 1;
3001  	attr.attr.max_wr = 1;
3002  	attr.srq_type = IB_SRQT_XRC;
3003  	attr.ext.cq = devr->c0;
3004  
3005  	s0 = ib_create_srq(devr->p0, &attr);
3006  	if (IS_ERR(s0)) {
3007  		ret = PTR_ERR(s0);
3008  		mlx5_ib_err(dev, "Couldn't create SRQ 0 for res init, err=%d\n", ret);
3009  		goto unlock;
3010  	}
3011  
3012  	memset(&attr, 0, sizeof(attr));
3013  	attr.attr.max_sge = 1;
3014  	attr.attr.max_wr = 1;
3015  	attr.srq_type = IB_SRQT_BASIC;
3016  
3017  	s1 = ib_create_srq(devr->p0, &attr);
3018  	if (IS_ERR(s1)) {
3019  		ret = PTR_ERR(s1);
3020  		mlx5_ib_err(dev, "Couldn't create SRQ 1 for res init, err=%d\n", ret);
3021  		ib_destroy_srq(s0);
		goto unlock;
3022  	}
3023  
3024  	devr->s0 = s0;
3025  	devr->s1 = s1;
3026  
3027  unlock:
3028  	mutex_unlock(&devr->srq_lock);
3029  	return ret;
3030  }
3031  
3032  static int mlx5_ib_dev_res_init(struct mlx5_ib_dev *dev)
3033  {
3034  	struct mlx5_ib_resources *devr = &dev->devr;
3035  	int ret;
3036  
3037  	if (!MLX5_CAP_GEN(dev->mdev, xrc))
3038  		return -EOPNOTSUPP;
3039  
3040  	ret = mlx5_cmd_xrcd_alloc(dev->mdev, &devr->xrcdn0, 0);
3041  	if (ret)
3042  		return ret;
3043  
3044  	ret = mlx5_cmd_xrcd_alloc(dev->mdev, &devr->xrcdn1, 0);
3045  	if (ret) {
3046  		mlx5_cmd_xrcd_dealloc(dev->mdev, devr->xrcdn0, 0);
3047  		return ret;
3048  	}
3049  
3050  	mutex_init(&devr->cq_lock);
3051  	mutex_init(&devr->srq_lock);
3052  
3053  	return 0;
3054  }
3055  
3056  static void mlx5_ib_dev_res_cleanup(struct mlx5_ib_dev *dev)
3057  {
3058  	struct mlx5_ib_resources *devr = &dev->devr;
3059  
3060  	/* After s0/s1 init, they are not unset during the device lifetime. */
3061  	if (devr->s1) {
3062  		ib_destroy_srq(devr->s1);
3063  		ib_destroy_srq(devr->s0);
3064  	}
3065  	mlx5_cmd_xrcd_dealloc(dev->mdev, devr->xrcdn1, 0);
3066  	mlx5_cmd_xrcd_dealloc(dev->mdev, devr->xrcdn0, 0);
3067  	/* After p0/c0 init, they are not unset during the device lifetime. */
3068  	if (devr->c0) {
3069  		ib_destroy_cq(devr->c0);
3070  		ib_dealloc_pd(devr->p0);
3071  	}
3072  	mutex_destroy(&devr->cq_lock);
3073  	mutex_destroy(&devr->srq_lock);
3074  }
3075  
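/*
 * Data-direct support: allocate a dedicated PD and a physical-address-mode
 * mkey (length64, so it spans the whole address space) with full local,
 * remote and atomic access. The mkey/PD pair is kept in dev->ddr, presumably
 * for consumption by data-direct MRs (see dev->data_direct_mr_list).
 */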
3076  static int
3077  mlx5_ib_create_data_direct_resources(struct mlx5_ib_dev *dev)
3078  {
3079  	int inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
3080  	struct mlx5_core_dev *mdev = dev->mdev;
3081  	void *mkc;
3082  	u32 mkey;
3083  	u32 pdn;
3084  	u32 *in;
3085  	int err;
3086  
3087  	err = mlx5_core_alloc_pd(mdev, &pdn);
3088  	if (err)
3089  		return err;
3090  
3091  	in = kvzalloc(inlen, GFP_KERNEL);
3092  	if (!in) {
3093  		err = -ENOMEM;
3094  		goto err;
3095  	}
3096  
3097  	MLX5_SET(create_mkey_in, in, data_direct, 1);
3098  	mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
3099  	MLX5_SET(mkc, mkc, access_mode_1_0, MLX5_MKC_ACCESS_MODE_PA);
3100  	MLX5_SET(mkc, mkc, lw, 1);
3101  	MLX5_SET(mkc, mkc, lr, 1);
3102  	MLX5_SET(mkc, mkc, rw, 1);
3103  	MLX5_SET(mkc, mkc, rr, 1);
3104  	MLX5_SET(mkc, mkc, a, 1);
3105  	MLX5_SET(mkc, mkc, pd, pdn);
3106  	MLX5_SET(mkc, mkc, length64, 1);
3107  	MLX5_SET(mkc, mkc, qpn, 0xffffff);
3108  	err = mlx5_core_create_mkey(mdev, &mkey, in, inlen);
3109  	kvfree(in);
3110  	if (err)
3111  		goto err;
3112  
3113  	dev->ddr.mkey = mkey;
3114  	dev->ddr.pdn = pdn;
3115  	return 0;
3116  
3117  err:
3118  	mlx5_core_dealloc_pd(mdev, pdn);
3119  	return err;
3120  }
3121  
3122  static void
3123  mlx5_ib_free_data_direct_resources(struct mlx5_ib_dev *dev)
3124  {
3125  	mlx5_core_destroy_mkey(dev->mdev, dev->ddr.mkey);
3126  	mlx5_core_dealloc_pd(dev->mdev, dev->ddr.pdn);
3127  }
3128  
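/*
 * Derive the rdma-core capability bits for a port: multi-plane and SMI
 * devices get fixed IB-management profiles, classic IB ports report
 * RDMA_CORE_PORT_IBA_IB, and Ethernet ports advertise raw-packet support plus
 * RoCE v1/v2 according to the reported l3_type (both IPv4 and IPv6 are
 * required) and roce_version capabilities.
 */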
3129  static u32 get_core_cap_flags(struct ib_device *ibdev,
3130  			      struct mlx5_hca_vport_context *rep)
3131  {
3132  	struct mlx5_ib_dev *dev = to_mdev(ibdev);
3133  	enum rdma_link_layer ll = mlx5_ib_port_link_layer(ibdev, 1);
3134  	u8 l3_type_cap = MLX5_CAP_ROCE(dev->mdev, l3_type);
3135  	u8 roce_version_cap = MLX5_CAP_ROCE(dev->mdev, roce_version);
3136  	bool raw_support = !mlx5_core_mp_enabled(dev->mdev);
3137  	u32 ret = 0;
3138  
3139  	if (rep->grh_required)
3140  		ret |= RDMA_CORE_CAP_IB_GRH_REQUIRED;
3141  
3142  	if (dev->num_plane)
3143  		return ret | RDMA_CORE_CAP_PROT_IB | RDMA_CORE_CAP_IB_MAD |
3144  			RDMA_CORE_CAP_IB_CM | RDMA_CORE_CAP_IB_SA |
3145  			RDMA_CORE_CAP_AF_IB;
3146  	else if (ibdev->type == RDMA_DEVICE_TYPE_SMI)
3147  		return ret | RDMA_CORE_CAP_IB_MAD | RDMA_CORE_CAP_IB_SMI;
3148  
3149  	if (ll == IB_LINK_LAYER_INFINIBAND)
3150  		return ret | RDMA_CORE_PORT_IBA_IB;
3151  
3152  	if (raw_support)
3153  		ret |= RDMA_CORE_PORT_RAW_PACKET;
3154  
3155  	if (!(l3_type_cap & MLX5_ROCE_L3_TYPE_IPV4_CAP))
3156  		return ret;
3157  
3158  	if (!(l3_type_cap & MLX5_ROCE_L3_TYPE_IPV6_CAP))
3159  		return ret;
3160  
3161  	if (roce_version_cap & MLX5_ROCE_VERSION_1_CAP)
3162  		ret |= RDMA_CORE_PORT_IBA_ROCE;
3163  
3164  	if (roce_version_cap & MLX5_ROCE_VERSION_2_CAP)
3165  		ret |= RDMA_CORE_PORT_IBA_ROCE_UDP_ENCAP;
3166  
3167  	return ret;
3168  }
3169  
3170  static int mlx5_port_immutable(struct ib_device *ibdev, u32 port_num,
3171  			       struct ib_port_immutable *immutable)
3172  {
3173  	struct ib_port_attr attr;
3174  	struct mlx5_ib_dev *dev = to_mdev(ibdev);
3175  	enum rdma_link_layer ll = mlx5_ib_port_link_layer(ibdev, port_num);
3176  	struct mlx5_hca_vport_context rep = {0};
3177  	int err;
3178  
3179  	err = ib_query_port(ibdev, port_num, &attr);
3180  	if (err)
3181  		return err;
3182  
3183  	if (ll == IB_LINK_LAYER_INFINIBAND) {
3184  		if (ibdev->type == RDMA_DEVICE_TYPE_SMI)
3185  			port_num = smi_to_native_portnum(dev, port_num);
3186  
3187  		err = mlx5_query_hca_vport_context(dev->mdev, 0, port_num, 0,
3188  						   &rep);
3189  		if (err)
3190  			return err;
3191  	}
3192  
3193  	immutable->pkey_tbl_len = attr.pkey_tbl_len;
3194  	immutable->gid_tbl_len = attr.gid_tbl_len;
3195  	immutable->core_cap_flags = get_core_cap_flags(ibdev, &rep);
3196  	immutable->max_mad_size = IB_MGMT_MAD_SIZE;
3197  
3198  	return 0;
3199  }
3200  
3201  static int mlx5_port_rep_immutable(struct ib_device *ibdev, u32 port_num,
3202  				   struct ib_port_immutable *immutable)
3203  {
3204  	struct ib_port_attr attr;
3205  	int err;
3206  
3207  	immutable->core_cap_flags = RDMA_CORE_PORT_RAW_PACKET;
3208  
3209  	err = ib_query_port(ibdev, port_num, &attr);
3210  	if (err)
3211  		return err;
3212  
3213  	immutable->pkey_tbl_len = attr.pkey_tbl_len;
3214  	immutable->gid_tbl_len = attr.gid_tbl_len;
3215  	immutable->core_cap_flags = RDMA_CORE_PORT_RAW_PACKET;
3216  
3217  	return 0;
3218  }
3219  
3220  static void get_dev_fw_str(struct ib_device *ibdev, char *str)
3221  {
3222  	struct mlx5_ib_dev *dev =
3223  		container_of(ibdev, struct mlx5_ib_dev, ib_dev);
3224  	snprintf(str, IB_FW_VERSION_NAME_MAX, "%d.%d.%04d",
3225  		 fw_rev_maj(dev->mdev), fw_rev_min(dev->mdev),
3226  		 fw_rev_sub(dev->mdev));
3227  }
3228  
3229  static int lag_event(struct notifier_block *nb, unsigned long event, void *data)
3230  {
3231  	struct mlx5_ib_dev *dev = container_of(nb, struct mlx5_ib_dev,
3232  					       lag_events);
3233  	struct mlx5_core_dev *mdev = dev->mdev;
3234  	struct ib_device *ibdev = &dev->ib_dev;
3235  	struct net_device *old_ndev = NULL;
3236  	struct mlx5_ib_port *port;
3237  	struct net_device *ndev;
3238  	u32 portnum = 0;
3239  	int ret = 0;
3240  	int i;
3241  
3242  	switch (event) {
3243  	case MLX5_DRIVER_EVENT_ACTIVE_BACKUP_LAG_CHANGE_LOWERSTATE:
3244  		ndev = data;
3245  		if (ndev) {
3246  			if (!mlx5_lag_is_roce(mdev)) {
3247  				/* SR-IOV LAG: find the uplink representor's port */
3248  				for (i = 0; i < dev->num_ports; i++) {
3249  					port = &dev->port[i];
3250  					if (port->rep && port->rep->vport ==
3251  					    MLX5_VPORT_UPLINK) {
3252  						portnum = i;
3253  						break;
3254  					}
3255  				}
3256  			}
3257  			old_ndev = ib_device_get_netdev(ibdev, portnum + 1);
3258  			ret = ib_device_set_netdev(ibdev, ndev, portnum + 1);
3259  			if (ret)
3260  				goto out;
3261  
3262  			if (old_ndev)
3263  				roce_del_all_netdev_gids(ibdev, portnum + 1,
3264  							 old_ndev);
3265  			rdma_roce_rescan_port(ibdev, portnum + 1);
3266  		}
3267  		break;
3268  	default:
3269  		return NOTIFY_DONE;
3270  	}
3271  
3272  out:
3273  	dev_put(old_ndev);
3274  	return notifier_from_errno(ret);
3275  }
3276  
3277  static void mlx5e_lag_event_register(struct mlx5_ib_dev *dev)
3278  {
3279  	dev->lag_events.notifier_call = lag_event;
3280  	blocking_notifier_chain_register(&dev->mdev->priv.lag_nh,
3281  					 &dev->lag_events);
3282  }
3283  
3284  static void mlx5e_lag_event_unregister(struct mlx5_ib_dev *dev)
3285  {
3286  	blocking_notifier_chain_unregister(&dev->mdev->priv.lag_nh,
3287  					   &dev->lag_events);
3288  }
3289  
3290  static int mlx5_eth_lag_init(struct mlx5_ib_dev *dev)
3291  {
3292  	struct mlx5_core_dev *mdev = dev->mdev;
3293  	struct mlx5_flow_namespace *ns = mlx5_get_flow_namespace(mdev,
3294  								 MLX5_FLOW_NAMESPACE_LAG);
3295  	struct mlx5_flow_table *ft;
3296  	int err;
3297  
3298  	if (!ns || !mlx5_lag_is_active(mdev))
3299  		return 0;
3300  
3301  	err = mlx5_cmd_create_vport_lag(mdev);
3302  	if (err)
3303  		return err;
3304  
3305  	ft = mlx5_create_lag_demux_flow_table(ns, 0, 0);
3306  	if (IS_ERR(ft)) {
3307  		err = PTR_ERR(ft);
3308  		goto err_destroy_vport_lag;
3309  	}
3310  
3311  	mlx5e_lag_event_register(dev);
3312  	dev->flow_db->lag_demux_ft = ft;
3313  	dev->lag_ports = mlx5_lag_get_num_ports(mdev);
3314  	dev->lag_active = true;
3315  	return 0;
3316  
3317  err_destroy_vport_lag:
3318  	mlx5_cmd_destroy_vport_lag(mdev);
3319  	return err;
3320  }
3321  
3322  static void mlx5_eth_lag_cleanup(struct mlx5_ib_dev *dev)
3323  {
3324  	struct mlx5_core_dev *mdev = dev->mdev;
3325  
3326  	if (dev->lag_active) {
3327  		dev->lag_active = false;
3328  
3329  		mlx5e_lag_event_unregister(dev);
3330  		mlx5_destroy_flow_table(dev->flow_db->lag_demux_ft);
3331  		dev->flow_db->lag_demux_ft = NULL;
3332  
3333  		mlx5_cmd_destroy_vport_lag(mdev);
3334  	}
3335  }
3336  
3337  static void mlx5_netdev_notifier_register(struct mlx5_roce *roce,
3338  					  struct net_device *netdev)
3339  {
3340  	int err;
3341  
3342  	if (roce->tracking_netdev)
3343  		return;
3344  	roce->tracking_netdev = netdev;
3345  	roce->nb.notifier_call = mlx5_netdev_event;
3346  	err = register_netdevice_notifier_dev_net(netdev, &roce->nb, &roce->nn);
3347  	WARN_ON(err);
3348  }
3349  
3350  static void mlx5_netdev_notifier_unregister(struct mlx5_roce *roce)
3351  {
3352  	if (!roce->tracking_netdev)
3353  		return;
3354  	unregister_netdevice_notifier_dev_net(roce->tracking_netdev, &roce->nb,
3355  					      &roce->nn);
3356  	roce->tracking_netdev = NULL;
3357  }
3358  
3359  static int mlx5e_mdev_notifier_event(struct notifier_block *nb,
3360  				     unsigned long event, void *data)
3361  {
3362  	struct mlx5_roce *roce = container_of(nb, struct mlx5_roce, mdev_nb);
3363  	struct net_device *netdev = data;
3364  
3365  	switch (event) {
3366  	case MLX5_DRIVER_EVENT_UPLINK_NETDEV:
3367  		if (netdev)
3368  			mlx5_netdev_notifier_register(roce, netdev);
3369  		else
3370  			mlx5_netdev_notifier_unregister(roce);
3371  		break;
3372  	default:
3373  		return NOTIFY_DONE;
3374  	}
3375  
3376  	return NOTIFY_OK;
3377  }
3378  
3379  static void mlx5_mdev_netdev_track(struct mlx5_ib_dev *dev, u32 port_num)
3380  {
3381  	struct mlx5_roce *roce = &dev->port[port_num].roce;
3382  
3383  	roce->mdev_nb.notifier_call = mlx5e_mdev_notifier_event;
3384  	mlx5_blocking_notifier_register(dev->mdev, &roce->mdev_nb);
3385  	mlx5_core_uplink_netdev_event_replay(dev->mdev);
3386  }
3387  
3388  static void mlx5_mdev_netdev_untrack(struct mlx5_ib_dev *dev, u32 port_num)
3389  {
3390  	struct mlx5_roce *roce = &dev->port[port_num].roce;
3391  
3392  	mlx5_blocking_notifier_unregister(dev->mdev, &roce->mdev_nb);
3393  	mlx5_netdev_notifier_unregister(roce);
3394  }
3395  
3396  static int mlx5_enable_eth(struct mlx5_ib_dev *dev)
3397  {
3398  	int err;
3399  
3400  	if (!dev->is_rep && dev->profile != &raw_eth_profile) {
3401  		err = mlx5_nic_vport_enable_roce(dev->mdev);
3402  		if (err)
3403  			return err;
3404  	}
3405  
3406  	err = mlx5_eth_lag_init(dev);
3407  	if (err)
3408  		goto err_disable_roce;
3409  
3410  	return 0;
3411  
3412  err_disable_roce:
3413  	if (!dev->is_rep && dev->profile != &raw_eth_profile)
3414  		mlx5_nic_vport_disable_roce(dev->mdev);
3415  
3416  	return err;
3417  }
3418  
3419  static void mlx5_disable_eth(struct mlx5_ib_dev *dev)
3420  {
3421  	mlx5_eth_lag_cleanup(dev);
3422  	if (!dev->is_rep && dev->profile != &raw_eth_profile)
3423  		mlx5_nic_vport_disable_roce(dev->mdev);
3424  }
3425  
3426  static int mlx5_ib_rn_get_params(struct ib_device *device, u32 port_num,
3427  				 enum rdma_netdev_t type,
3428  				 struct rdma_netdev_alloc_params *params)
3429  {
3430  	if (type != RDMA_NETDEV_IPOIB)
3431  		return -EOPNOTSUPP;
3432  
3433  	return mlx5_rdma_rn_get_params(to_mdev(device)->mdev, device, params);
3434  }
3435  
3436  static ssize_t delay_drop_timeout_read(struct file *filp, char __user *buf,
3437  				       size_t count, loff_t *pos)
3438  {
3439  	struct mlx5_ib_delay_drop *delay_drop = filp->private_data;
3440  	char lbuf[20];
3441  	int len;
3442  
3443  	len = snprintf(lbuf, sizeof(lbuf), "%u\n", delay_drop->timeout);
3444  	return simple_read_from_buffer(buf, count, pos, lbuf, len);
3445  }
3446  
3447  static ssize_t delay_drop_timeout_write(struct file *filp, const char __user *buf,
3448  					size_t count, loff_t *pos)
3449  {
3450  	struct mlx5_ib_delay_drop *delay_drop = filp->private_data;
3451  	u32 timeout;
3452  	u32 var;
3453  
3454  	if (kstrtouint_from_user(buf, count, 0, &var))
3455  		return -EFAULT;
3456  
3457  	timeout = min_t(u32, roundup(var, 100), MLX5_MAX_DELAY_DROP_TIMEOUT_MS *
3458  			1000);
3459  	if (timeout != var)
3460  		mlx5_ib_dbg(delay_drop->dev, "Round delay drop timeout to %u usec\n",
3461  			    timeout);
3462  
3463  	delay_drop->timeout = timeout;
3464  
3465  	return count;
3466  }
3467  
3468  static const struct file_operations fops_delay_drop_timeout = {
3469  	.owner	= THIS_MODULE,
3470  	.open	= simple_open,
3471  	.write	= delay_drop_timeout_write,
3472  	.read	= delay_drop_timeout_read,
3473  };
3474  
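/*
 * Tear down the affiliation between a slave port's mlx5_core device and the
 * IB device: replay the removal event on both devices, stop netdev tracking
 * and event notifications, wait for outstanding users of the slave mdev to
 * drop their references, and finally unaffiliate the vport in firmware.
 */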
3475  static void mlx5_ib_unbind_slave_port(struct mlx5_ib_dev *ibdev,
3476  				      struct mlx5_ib_multiport_info *mpi)
3477  {
3478  	u32 port_num = mlx5_core_native_port_num(mpi->mdev) - 1;
3479  	struct mlx5_ib_port *port = &ibdev->port[port_num];
3480  	int comps;
3481  	int err;
3482  	int i;
3483  
3484  	lockdep_assert_held(&mlx5_ib_multiport_mutex);
3485  
3486  	mlx5_core_mp_event_replay(ibdev->mdev,
3487  				  MLX5_DRIVER_EVENT_AFFILIATION_REMOVED,
3488  				  NULL);
3489  	mlx5_core_mp_event_replay(mpi->mdev,
3490  				  MLX5_DRIVER_EVENT_AFFILIATION_REMOVED,
3491  				  NULL);
3492  
3493  	mlx5_ib_cleanup_cong_debugfs(ibdev, port_num);
3494  
3495  	spin_lock(&port->mp.mpi_lock);
3496  	if (!mpi->ibdev) {
3497  		spin_unlock(&port->mp.mpi_lock);
3498  		return;
3499  	}
3500  
3501  	mpi->ibdev = NULL;
3502  
3503  	spin_unlock(&port->mp.mpi_lock);
3504  	if (mpi->mdev_events.notifier_call)
3505  		mlx5_notifier_unregister(mpi->mdev, &mpi->mdev_events);
3506  	mpi->mdev_events.notifier_call = NULL;
3507  	mlx5_mdev_netdev_untrack(ibdev, port_num);
3508  	spin_lock(&port->mp.mpi_lock);
3509  
3510  	comps = mpi->mdev_refcnt;
3511  	if (comps) {
3512  		mpi->unaffiliate = true;
3513  		init_completion(&mpi->unref_comp);
3514  		spin_unlock(&port->mp.mpi_lock);
3515  
3516  		for (i = 0; i < comps; i++)
3517  			wait_for_completion(&mpi->unref_comp);
3518  
3519  		spin_lock(&port->mp.mpi_lock);
3520  		mpi->unaffiliate = false;
3521  	}
3522  
3523  	port->mp.mpi = NULL;
3524  
3525  	spin_unlock(&port->mp.mpi_lock);
3526  
3527  	err = mlx5_nic_vport_unaffiliate_multiport(mpi->mdev);
3528  
3529  	mlx5_ib_dbg(ibdev, "unaffiliated port %u\n", port_num + 1);
3530  	/* Only log an error on failure; the pointers still need to be cleaned
3531  	 * up and the mpi added back to the unaffiliated list.
3532  	 */
3533  	if (err)
3534  		mlx5_ib_err(ibdev, "Failed to unaffiliate port %u\n",
3535  			    port_num + 1);
3536  
3537  	ibdev->port[port_num].roce.last_port_state = IB_PORT_DOWN;
3538  }
3539  
3540  static bool mlx5_ib_bind_slave_port(struct mlx5_ib_dev *ibdev,
3541  				    struct mlx5_ib_multiport_info *mpi)
3542  {
3543  	u32 port_num = mlx5_core_native_port_num(mpi->mdev) - 1;
3544  	u64 key;
3545  	int err;
3546  
3547  	lockdep_assert_held(&mlx5_ib_multiport_mutex);
3548  
3549  	spin_lock(&ibdev->port[port_num].mp.mpi_lock);
3550  	if (ibdev->port[port_num].mp.mpi) {
3551  		mlx5_ib_dbg(ibdev, "port %u already affiliated.\n",
3552  			    port_num + 1);
3553  		spin_unlock(&ibdev->port[port_num].mp.mpi_lock);
3554  		return false;
3555  	}
3556  
3557  	ibdev->port[port_num].mp.mpi = mpi;
3558  	mpi->ibdev = ibdev;
3559  	mpi->mdev_events.notifier_call = NULL;
3560  	spin_unlock(&ibdev->port[port_num].mp.mpi_lock);
3561  
3562  	err = mlx5_nic_vport_affiliate_multiport(ibdev->mdev, mpi->mdev);
3563  	if (err)
3564  		goto unbind;
3565  
3566  	mlx5_mdev_netdev_track(ibdev, port_num);
3567  
3568  	mpi->mdev_events.notifier_call = mlx5_ib_event_slave_port;
3569  	mlx5_notifier_register(mpi->mdev, &mpi->mdev_events);
3570  
3571  	mlx5_ib_init_cong_debugfs(ibdev, port_num);
3572  
3573  	key = mpi->mdev->priv.adev_idx;
3574  	mlx5_core_mp_event_replay(mpi->mdev,
3575  				  MLX5_DRIVER_EVENT_AFFILIATION_DONE,
3576  				  &key);
3577  	mlx5_core_mp_event_replay(ibdev->mdev,
3578  				  MLX5_DRIVER_EVENT_AFFILIATION_DONE,
3579  				  &key);
3580  
3581  	return true;
3582  
3583  unbind:
3584  	mlx5_ib_unbind_slave_port(ibdev, mpi);
3585  	return false;
3586  }
3587  
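/*
 * When the device exposes the data_direct and query_vuid capabilities,
 * fetch the VUID, create the data-direct resources and register this IB
 * device with the data-direct driver under that VUID.
 */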
3588  static int mlx5_ib_data_direct_init(struct mlx5_ib_dev *dev)
3589  {
3590  	char vuid[MLX5_ST_SZ_BYTES(array1024_auto) + 1] = {};
3591  	int ret;
3592  
3593  	if (!MLX5_CAP_GEN(dev->mdev, data_direct) ||
3594  	    !MLX5_CAP_GEN_2(dev->mdev, query_vuid))
3595  		return 0;
3596  
3597  	ret = mlx5_cmd_query_vuid(dev->mdev, true, vuid);
3598  	if (ret)
3599  		return ret;
3600  
3601  	ret = mlx5_ib_create_data_direct_resources(dev);
3602  	if (ret)
3603  		return ret;
3604  
3605  	INIT_LIST_HEAD(&dev->data_direct_mr_list);
3606  	ret = mlx5_data_direct_ib_reg(dev, vuid);
3607  	if (ret)
3608  		mlx5_ib_free_data_direct_resources(dev);
3609  
3610  	return ret;
3611  }
3612  
3613  static void mlx5_ib_data_direct_cleanup(struct mlx5_ib_dev *dev)
3614  {
3615  	if (!MLX5_CAP_GEN(dev->mdev, data_direct) ||
3616  	    !MLX5_CAP_GEN_2(dev->mdev, query_vuid))
3617  		return;
3618  
3619  	mlx5_data_direct_ib_unreg(dev);
3620  	mlx5_ib_free_data_direct_resources(dev);
3621  }
3622  
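/*
 * On a multiport master with an Ethernet link layer, enable RoCE and
 * walk the ports: the native port gets a stub mpi, while the other
 * ports are bound to any matching entries already waiting on the
 * unaffiliated list. The device is then added to mlx5_ib_dev_list so
 * that slaves probed later can find it.
 */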
3623  static int mlx5_ib_init_multiport_master(struct mlx5_ib_dev *dev)
3624  {
3625  	u32 port_num = mlx5_core_native_port_num(dev->mdev) - 1;
3626  	enum rdma_link_layer ll = mlx5_ib_port_link_layer(&dev->ib_dev,
3627  							  port_num + 1);
3628  	struct mlx5_ib_multiport_info *mpi;
3629  	int err;
3630  	u32 i;
3631  
3632  	if (!mlx5_core_is_mp_master(dev->mdev) || ll != IB_LINK_LAYER_ETHERNET)
3633  		return 0;
3634  
3635  	err = mlx5_query_nic_vport_system_image_guid(dev->mdev,
3636  						     &dev->sys_image_guid);
3637  	if (err)
3638  		return err;
3639  
3640  	err = mlx5_nic_vport_enable_roce(dev->mdev);
3641  	if (err)
3642  		return err;
3643  
3644  	mutex_lock(&mlx5_ib_multiport_mutex);
3645  	for (i = 0; i < dev->num_ports; i++) {
3646  		bool bound = false;
3647  
3648  		/* Build a stub multiport info struct for the native port. */
3649  		if (i == port_num) {
3650  			mpi = kzalloc(sizeof(*mpi), GFP_KERNEL);
3651  			if (!mpi) {
3652  				mutex_unlock(&mlx5_ib_multiport_mutex);
3653  				mlx5_nic_vport_disable_roce(dev->mdev);
3654  				return -ENOMEM;
3655  			}
3656  
3657  			mpi->is_master = true;
3658  			mpi->mdev = dev->mdev;
3659  			mpi->sys_image_guid = dev->sys_image_guid;
3660  			dev->port[i].mp.mpi = mpi;
3661  			mpi->ibdev = dev;
3662  			mpi = NULL;
3663  			continue;
3664  		}
3665  
3666  		list_for_each_entry(mpi, &mlx5_ib_unaffiliated_port_list,
3667  				    list) {
3668  			if (dev->sys_image_guid == mpi->sys_image_guid &&
3669  			    (mlx5_core_native_port_num(mpi->mdev) - 1) == i &&
3670  			    mlx5_core_same_coredev_type(dev->mdev, mpi->mdev)) {
3671  				bound = mlx5_ib_bind_slave_port(dev, mpi);
3672  			}
3673  
3674  			if (bound) {
3675  				dev_dbg(mpi->mdev->device,
3676  					"removing port from unaffiliated list.\n");
3677  				mlx5_ib_dbg(dev, "port %d bound\n", i + 1);
3678  				list_del(&mpi->list);
3679  				break;
3680  			}
3681  		}
3682  		if (!bound)
3683  			mlx5_ib_dbg(dev, "no free port found for port %d\n",
3684  				    i + 1);
3685  	}
3686  
3687  	list_add_tail(&dev->ib_dev_list, &mlx5_ib_dev_list);
3688  	mutex_unlock(&mlx5_ib_multiport_mutex);
3689  	return err;
3690  }
3691  
3692  static void mlx5_ib_cleanup_multiport_master(struct mlx5_ib_dev *dev)
3693  {
3694  	u32 port_num = mlx5_core_native_port_num(dev->mdev) - 1;
3695  	enum rdma_link_layer ll = mlx5_ib_port_link_layer(&dev->ib_dev,
3696  							  port_num + 1);
3697  	u32 i;
3698  
3699  	if (!mlx5_core_is_mp_master(dev->mdev) || ll != IB_LINK_LAYER_ETHERNET)
3700  		return;
3701  
3702  	mutex_lock(&mlx5_ib_multiport_mutex);
3703  	for (i = 0; i < dev->num_ports; i++) {
3704  		if (dev->port[i].mp.mpi) {
3705  			/* Destroy the native port stub */
3706  			if (i == port_num) {
3707  				kfree(dev->port[i].mp.mpi);
3708  				dev->port[i].mp.mpi = NULL;
3709  			} else {
3710  				mlx5_ib_dbg(dev, "unbinding port_num: %u\n",
3711  					    i + 1);
3712  				list_add_tail(&dev->port[i].mp.mpi->list,
3713  					      &mlx5_ib_unaffiliated_port_list);
3714  				mlx5_ib_unbind_slave_port(dev,
3715  							  dev->port[i].mp.mpi);
3716  			}
3717  		}
3718  	}
3719  
3720  	mlx5_ib_dbg(dev, "removing from devlist\n");
3721  	list_del(&dev->ib_dev_list);
3722  	mutex_unlock(&mlx5_ib_multiport_mutex);
3723  
3724  	mlx5_nic_vport_disable_roce(dev->mdev);
3725  }
3726  
3727  static int mmap_obj_cleanup(struct ib_uobject *uobject,
3728  			    enum rdma_remove_reason why,
3729  			    struct uverbs_attr_bundle *attrs)
3730  {
3731  	struct mlx5_user_mmap_entry *obj = uobject->object;
3732  
3733  	rdma_user_mmap_entry_remove(&obj->rdma_entry);
3734  	return 0;
3735  }
3736  
3737  static int mlx5_rdma_user_mmap_entry_insert(struct mlx5_ib_ucontext *c,
3738  					    struct mlx5_user_mmap_entry *entry,
3739  					    size_t length)
3740  {
3741  	return rdma_user_mmap_entry_insert_range(
3742  		&c->ibucontext, &entry->rdma_entry, length,
3743  		(MLX5_IB_MMAP_OFFSET_START << 16),
3744  		((MLX5_IB_MMAP_OFFSET_END << 16) + (1UL << 16) - 1));
3745  }
3746  
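/*
 * Reserve a free VAR (doorbell) page from the device's var_table bitmap
 * and wrap it in an rdma_user_mmap entry of one stride so userspace can
 * mmap it; the bit is released again if the entry cannot be inserted.
 */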
3747  static struct mlx5_user_mmap_entry *
3748  alloc_var_entry(struct mlx5_ib_ucontext *c)
3749  {
3750  	struct mlx5_user_mmap_entry *entry;
3751  	struct mlx5_var_table *var_table;
3752  	u32 page_idx;
3753  	int err;
3754  
3755  	var_table = &to_mdev(c->ibucontext.device)->var_table;
3756  	entry = kzalloc(sizeof(*entry), GFP_KERNEL);
3757  	if (!entry)
3758  		return ERR_PTR(-ENOMEM);
3759  
3760  	mutex_lock(&var_table->bitmap_lock);
3761  	page_idx = find_first_zero_bit(var_table->bitmap,
3762  				       var_table->num_var_hw_entries);
3763  	if (page_idx >= var_table->num_var_hw_entries) {
3764  		err = -ENOSPC;
3765  		mutex_unlock(&var_table->bitmap_lock);
3766  		goto end;
3767  	}
3768  
3769  	set_bit(page_idx, var_table->bitmap);
3770  	mutex_unlock(&var_table->bitmap_lock);
3771  
3772  	entry->address = var_table->hw_start_addr +
3773  				(page_idx * var_table->stride_size);
3774  	entry->page_idx = page_idx;
3775  	entry->mmap_flag = MLX5_IB_MMAP_TYPE_VAR;
3776  
3777  	err = mlx5_rdma_user_mmap_entry_insert(c, entry,
3778  					       var_table->stride_size);
3779  	if (err)
3780  		goto err_insert;
3781  
3782  	return entry;
3783  
3784  err_insert:
3785  	mutex_lock(&var_table->bitmap_lock);
3786  	clear_bit(page_idx, var_table->bitmap);
3787  	mutex_unlock(&var_table->bitmap_lock);
3788  end:
3789  	kfree(entry);
3790  	return ERR_PTR(err);
3791  }
3792  
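/*
 * MLX5_IB_METHOD_VAR_OBJ_ALLOC: allocate a VAR entry for the calling
 * ucontext and return its mmap offset, page index and mmap length to
 * userspace.
 */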
3793  static int UVERBS_HANDLER(MLX5_IB_METHOD_VAR_OBJ_ALLOC)(
3794  	struct uverbs_attr_bundle *attrs)
3795  {
3796  	struct ib_uobject *uobj = uverbs_attr_get_uobject(
3797  		attrs, MLX5_IB_ATTR_VAR_OBJ_ALLOC_HANDLE);
3798  	struct mlx5_ib_ucontext *c;
3799  	struct mlx5_user_mmap_entry *entry;
3800  	u64 mmap_offset;
3801  	u32 length;
3802  	int err;
3803  
3804  	c = to_mucontext(ib_uverbs_get_ucontext(attrs));
3805  	if (IS_ERR(c))
3806  		return PTR_ERR(c);
3807  
3808  	entry = alloc_var_entry(c);
3809  	if (IS_ERR(entry))
3810  		return PTR_ERR(entry);
3811  
3812  	mmap_offset = mlx5_entry_to_mmap_offset(entry);
3813  	length = entry->rdma_entry.npages * PAGE_SIZE;
3814  	uobj->object = entry;
3815  	uverbs_finalize_uobj_create(attrs, MLX5_IB_ATTR_VAR_OBJ_ALLOC_HANDLE);
3816  
3817  	err = uverbs_copy_to(attrs, MLX5_IB_ATTR_VAR_OBJ_ALLOC_MMAP_OFFSET,
3818  			     &mmap_offset, sizeof(mmap_offset));
3819  	if (err)
3820  		return err;
3821  
3822  	err = uverbs_copy_to(attrs, MLX5_IB_ATTR_VAR_OBJ_ALLOC_PAGE_ID,
3823  			     &entry->page_idx, sizeof(entry->page_idx));
3824  	if (err)
3825  		return err;
3826  
3827  	err = uverbs_copy_to(attrs, MLX5_IB_ATTR_VAR_OBJ_ALLOC_MMAP_LENGTH,
3828  			     &length, sizeof(length));
3829  	return err;
3830  }
3831  
3832  DECLARE_UVERBS_NAMED_METHOD(
3833  	MLX5_IB_METHOD_VAR_OBJ_ALLOC,
3834  	UVERBS_ATTR_IDR(MLX5_IB_ATTR_VAR_OBJ_ALLOC_HANDLE,
3835  			MLX5_IB_OBJECT_VAR,
3836  			UVERBS_ACCESS_NEW,
3837  			UA_MANDATORY),
3838  	UVERBS_ATTR_PTR_OUT(MLX5_IB_ATTR_VAR_OBJ_ALLOC_PAGE_ID,
3839  			   UVERBS_ATTR_TYPE(u32),
3840  			   UA_MANDATORY),
3841  	UVERBS_ATTR_PTR_OUT(MLX5_IB_ATTR_VAR_OBJ_ALLOC_MMAP_LENGTH,
3842  			   UVERBS_ATTR_TYPE(u32),
3843  			   UA_MANDATORY),
3844  	UVERBS_ATTR_PTR_OUT(MLX5_IB_ATTR_VAR_OBJ_ALLOC_MMAP_OFFSET,
3845  			    UVERBS_ATTR_TYPE(u64),
3846  			    UA_MANDATORY));
3847  
3848  DECLARE_UVERBS_NAMED_METHOD_DESTROY(
3849  	MLX5_IB_METHOD_VAR_OBJ_DESTROY,
3850  	UVERBS_ATTR_IDR(MLX5_IB_ATTR_VAR_OBJ_DESTROY_HANDLE,
3851  			MLX5_IB_OBJECT_VAR,
3852  			UVERBS_ACCESS_DESTROY,
3853  			UA_MANDATORY));
3854  
3855  DECLARE_UVERBS_NAMED_OBJECT(MLX5_IB_OBJECT_VAR,
3856  			    UVERBS_TYPE_ALLOC_IDR(mmap_obj_cleanup),
3857  			    &UVERBS_METHOD(MLX5_IB_METHOD_VAR_OBJ_ALLOC),
3858  			    &UVERBS_METHOD(MLX5_IB_METHOD_VAR_OBJ_DESTROY));
3859  
3860  static bool var_is_supported(struct ib_device *device)
3861  {
3862  	struct mlx5_ib_dev *dev = to_mdev(device);
3863  
3864  	return (MLX5_CAP_GEN_64(dev->mdev, general_obj_types) &
3865  			MLX5_GENERAL_OBJ_TYPES_CAP_VIRTIO_NET_Q);
3866  }
3867  
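/*
 * Allocate a UAR page from firmware for this ucontext and expose it via
 * an rdma_user_mmap entry: write-combining for the BF alloc type,
 * non-cached otherwise. The UAR is released again if the entry cannot
 * be inserted.
 */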
3868  static struct mlx5_user_mmap_entry *
3869  alloc_uar_entry(struct mlx5_ib_ucontext *c,
3870  		enum mlx5_ib_uapi_uar_alloc_type alloc_type)
3871  {
3872  	struct mlx5_user_mmap_entry *entry;
3873  	struct mlx5_ib_dev *dev;
3874  	u32 uar_index;
3875  	int err;
3876  
3877  	entry = kzalloc(sizeof(*entry), GFP_KERNEL);
3878  	if (!entry)
3879  		return ERR_PTR(-ENOMEM);
3880  
3881  	dev = to_mdev(c->ibucontext.device);
3882  	err = mlx5_cmd_uar_alloc(dev->mdev, &uar_index, c->devx_uid);
3883  	if (err)
3884  		goto end;
3885  
3886  	entry->page_idx = uar_index;
3887  	entry->address = uar_index2paddress(dev, uar_index);
3888  	if (alloc_type == MLX5_IB_UAPI_UAR_ALLOC_TYPE_BF)
3889  		entry->mmap_flag = MLX5_IB_MMAP_TYPE_UAR_WC;
3890  	else
3891  		entry->mmap_flag = MLX5_IB_MMAP_TYPE_UAR_NC;
3892  
3893  	err = mlx5_rdma_user_mmap_entry_insert(c, entry, PAGE_SIZE);
3894  	if (err)
3895  		goto err_insert;
3896  
3897  	return entry;
3898  
3899  err_insert:
3900  	mlx5_cmd_uar_dealloc(dev->mdev, uar_index, c->devx_uid);
3901  end:
3902  	kfree(entry);
3903  	return ERR_PTR(err);
3904  }
3905  
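/*
 * MLX5_IB_METHOD_UAR_OBJ_ALLOC: validate the requested allocation type
 * (BF requires write-combining support), allocate a UAR entry and
 * return its mmap offset, page index and length to userspace.
 */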
3906  static int UVERBS_HANDLER(MLX5_IB_METHOD_UAR_OBJ_ALLOC)(
3907  	struct uverbs_attr_bundle *attrs)
3908  {
3909  	struct ib_uobject *uobj = uverbs_attr_get_uobject(
3910  		attrs, MLX5_IB_ATTR_UAR_OBJ_ALLOC_HANDLE);
3911  	enum mlx5_ib_uapi_uar_alloc_type alloc_type;
3912  	struct mlx5_ib_ucontext *c;
3913  	struct mlx5_user_mmap_entry *entry;
3914  	u64 mmap_offset;
3915  	u32 length;
3916  	int err;
3917  
3918  	c = to_mucontext(ib_uverbs_get_ucontext(attrs));
3919  	if (IS_ERR(c))
3920  		return PTR_ERR(c);
3921  
3922  	err = uverbs_get_const(&alloc_type, attrs,
3923  			       MLX5_IB_ATTR_UAR_OBJ_ALLOC_TYPE);
3924  	if (err)
3925  		return err;
3926  
3927  	if (alloc_type != MLX5_IB_UAPI_UAR_ALLOC_TYPE_BF &&
3928  	    alloc_type != MLX5_IB_UAPI_UAR_ALLOC_TYPE_NC)
3929  		return -EOPNOTSUPP;
3930  
3931  	if (!mlx5_wc_support_get(to_mdev(c->ibucontext.device)->mdev) &&
3932  	    alloc_type == MLX5_IB_UAPI_UAR_ALLOC_TYPE_BF)
3933  		return -EOPNOTSUPP;
3934  
3935  	entry = alloc_uar_entry(c, alloc_type);
3936  	if (IS_ERR(entry))
3937  		return PTR_ERR(entry);
3938  
3939  	mmap_offset = mlx5_entry_to_mmap_offset(entry);
3940  	length = entry->rdma_entry.npages * PAGE_SIZE;
3941  	uobj->object = entry;
3942  	uverbs_finalize_uobj_create(attrs, MLX5_IB_ATTR_UAR_OBJ_ALLOC_HANDLE);
3943  
3944  	err = uverbs_copy_to(attrs, MLX5_IB_ATTR_UAR_OBJ_ALLOC_MMAP_OFFSET,
3945  			     &mmap_offset, sizeof(mmap_offset));
3946  	if (err)
3947  		return err;
3948  
3949  	err = uverbs_copy_to(attrs, MLX5_IB_ATTR_UAR_OBJ_ALLOC_PAGE_ID,
3950  			     &entry->page_idx, sizeof(entry->page_idx));
3951  	if (err)
3952  		return err;
3953  
3954  	err = uverbs_copy_to(attrs, MLX5_IB_ATTR_UAR_OBJ_ALLOC_MMAP_LENGTH,
3955  			     &length, sizeof(length));
3956  	return err;
3957  }
3958  
3959  DECLARE_UVERBS_NAMED_METHOD(
3960  	MLX5_IB_METHOD_UAR_OBJ_ALLOC,
3961  	UVERBS_ATTR_IDR(MLX5_IB_ATTR_UAR_OBJ_ALLOC_HANDLE,
3962  			MLX5_IB_OBJECT_UAR,
3963  			UVERBS_ACCESS_NEW,
3964  			UA_MANDATORY),
3965  	UVERBS_ATTR_CONST_IN(MLX5_IB_ATTR_UAR_OBJ_ALLOC_TYPE,
3966  			     enum mlx5_ib_uapi_uar_alloc_type,
3967  			     UA_MANDATORY),
3968  	UVERBS_ATTR_PTR_OUT(MLX5_IB_ATTR_UAR_OBJ_ALLOC_PAGE_ID,
3969  			   UVERBS_ATTR_TYPE(u32),
3970  			   UA_MANDATORY),
3971  	UVERBS_ATTR_PTR_OUT(MLX5_IB_ATTR_UAR_OBJ_ALLOC_MMAP_LENGTH,
3972  			   UVERBS_ATTR_TYPE(u32),
3973  			   UA_MANDATORY),
3974  	UVERBS_ATTR_PTR_OUT(MLX5_IB_ATTR_UAR_OBJ_ALLOC_MMAP_OFFSET,
3975  			    UVERBS_ATTR_TYPE(u64),
3976  			    UA_MANDATORY));
3977  
3978  DECLARE_UVERBS_NAMED_METHOD_DESTROY(
3979  	MLX5_IB_METHOD_UAR_OBJ_DESTROY,
3980  	UVERBS_ATTR_IDR(MLX5_IB_ATTR_UAR_OBJ_DESTROY_HANDLE,
3981  			MLX5_IB_OBJECT_UAR,
3982  			UVERBS_ACCESS_DESTROY,
3983  			UA_MANDATORY));
3984  
3985  DECLARE_UVERBS_NAMED_OBJECT(MLX5_IB_OBJECT_UAR,
3986  			    UVERBS_TYPE_ALLOC_IDR(mmap_obj_cleanup),
3987  			    &UVERBS_METHOD(MLX5_IB_METHOD_UAR_OBJ_ALLOC),
3988  			    &UVERBS_METHOD(MLX5_IB_METHOD_UAR_OBJ_DESTROY));
3989  
3990  ADD_UVERBS_ATTRIBUTES_SIMPLE(
3991  	mlx5_ib_query_context,
3992  	UVERBS_OBJECT_DEVICE,
3993  	UVERBS_METHOD_QUERY_CONTEXT,
3994  	UVERBS_ATTR_PTR_OUT(
3995  		MLX5_IB_ATTR_QUERY_CONTEXT_RESP_UCTX,
3996  		UVERBS_ATTR_STRUCT(struct mlx5_ib_alloc_ucontext_resp,
3997  				   dump_fill_mkey),
3998  		UA_MANDATORY));
3999  
4000  ADD_UVERBS_ATTRIBUTES_SIMPLE(
4001  	mlx5_ib_reg_dmabuf_mr,
4002  	UVERBS_OBJECT_MR,
4003  	UVERBS_METHOD_REG_DMABUF_MR,
4004  	UVERBS_ATTR_FLAGS_IN(MLX5_IB_ATTR_REG_DMABUF_MR_ACCESS_FLAGS,
4005  			     enum mlx5_ib_uapi_reg_dmabuf_flags,
4006  			     UA_OPTIONAL));
4007  
4008  static const struct uapi_definition mlx5_ib_defs[] = {
4009  	UAPI_DEF_CHAIN(mlx5_ib_devx_defs),
4010  	UAPI_DEF_CHAIN(mlx5_ib_flow_defs),
4011  	UAPI_DEF_CHAIN(mlx5_ib_qos_defs),
4012  	UAPI_DEF_CHAIN(mlx5_ib_std_types_defs),
4013  	UAPI_DEF_CHAIN(mlx5_ib_dm_defs),
4014  	UAPI_DEF_CHAIN(mlx5_ib_create_cq_defs),
4015  
4016  	UAPI_DEF_CHAIN_OBJ_TREE(UVERBS_OBJECT_DEVICE, &mlx5_ib_query_context),
4017  	UAPI_DEF_CHAIN_OBJ_TREE(UVERBS_OBJECT_MR, &mlx5_ib_reg_dmabuf_mr),
4018  	UAPI_DEF_CHAIN_OBJ_TREE_NAMED(MLX5_IB_OBJECT_VAR,
4019  				UAPI_DEF_IS_OBJ_SUPPORTED(var_is_supported)),
4020  	UAPI_DEF_CHAIN_OBJ_TREE_NAMED(MLX5_IB_OBJECT_UAR),
4021  	{}
4022  };
4023  
4024  static void mlx5_ib_stage_init_cleanup(struct mlx5_ib_dev *dev)
4025  {
4026  	mlx5_ib_data_direct_cleanup(dev);
4027  	mlx5_ib_cleanup_multiport_master(dev);
4028  	WARN_ON(!xa_empty(&dev->odp_mkeys));
4029  	mutex_destroy(&dev->cap_mask_mutex);
4030  	WARN_ON(!xa_empty(&dev->sig_mrs));
4031  	WARN_ON(!bitmap_empty(dev->dm.memic_alloc_pages, MLX5_MAX_MEMIC_PAGES));
4032  	mlx5r_macsec_dealloc_gids(dev);
4033  }
4034  
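/*
 * INIT stage: set up per-port state, query the special mkeys, the
 * MACsec GIDs and the multiport master, then initialize the bookkeeping
 * structures (QP list, ODP/signature xarrays, DM lock) and the
 * data-direct support.
 */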
4035  static int mlx5_ib_stage_init_init(struct mlx5_ib_dev *dev)
4036  {
4037  	struct mlx5_core_dev *mdev = dev->mdev;
4038  	int err, i;
4039  
4040  	dev->ib_dev.node_type = RDMA_NODE_IB_CA;
4041  	dev->ib_dev.local_dma_lkey = 0 /* not supported for now */;
4042  	dev->ib_dev.dev.parent = mdev->device;
4043  	dev->ib_dev.lag_flags = RDMA_LAG_FLAGS_HASH_ALL_SLAVES;
4044  
4045  	for (i = 0; i < dev->num_ports; i++) {
4046  		spin_lock_init(&dev->port[i].mp.mpi_lock);
4047  		dev->port[i].roce.dev = dev;
4048  		dev->port[i].roce.native_port_num = i + 1;
4049  		dev->port[i].roce.last_port_state = IB_PORT_DOWN;
4050  	}
4051  
4052  	err = mlx5r_cmd_query_special_mkeys(dev);
4053  	if (err)
4054  		return err;
4055  
4056  	err = mlx5r_macsec_init_gids_and_devlist(dev);
4057  	if (err)
4058  		return err;
4059  
4060  	err = mlx5_ib_init_multiport_master(dev);
4061  	if (err)
4062  		goto err;
4063  
4064  	err = set_has_smi_cap(dev);
4065  	if (err)
4066  		goto err_mp;
4067  
4068  	err = mlx5_query_max_pkeys(&dev->ib_dev, &dev->pkey_table_len);
4069  	if (err)
4070  		goto err_mp;
4071  
4072  	if (mlx5_use_mad_ifc(dev))
4073  		get_ext_port_caps(dev);
4074  
4075  	dev->ib_dev.num_comp_vectors    = mlx5_comp_vectors_max(mdev);
4076  
4077  	mutex_init(&dev->cap_mask_mutex);
4078  	mutex_init(&dev->data_direct_lock);
4079  	INIT_LIST_HEAD(&dev->qp_list);
4080  	spin_lock_init(&dev->reset_flow_resource_lock);
4081  	xa_init(&dev->odp_mkeys);
4082  	xa_init(&dev->sig_mrs);
4083  	atomic_set(&dev->mkey_var, 0);
4084  
4085  	spin_lock_init(&dev->dm.lock);
4086  	dev->dm.dev = mdev;
4087  	err = mlx5_ib_data_direct_init(dev);
4088  	if (err)
4089  		goto err_mp;
4090  
4091  	return 0;
4092  err_mp:
4093  	mlx5_ib_cleanup_multiport_master(dev);
4094  err:
4095  	mlx5r_macsec_dealloc_gids(dev);
4096  	return err;
4097  }
4098  
4099  static struct ib_device *mlx5_ib_add_sub_dev(struct ib_device *parent,
4100  					     enum rdma_nl_dev_type type,
4101  					     const char *name);
4102  static void mlx5_ib_del_sub_dev(struct ib_device *sub_dev);
4103  
4104  static const struct ib_device_ops mlx5_ib_dev_ops = {
4105  	.owner = THIS_MODULE,
4106  	.driver_id = RDMA_DRIVER_MLX5,
4107  	.uverbs_abi_ver	= MLX5_IB_UVERBS_ABI_VERSION,
4108  
4109  	.add_gid = mlx5_ib_add_gid,
4110  	.add_sub_dev = mlx5_ib_add_sub_dev,
4111  	.alloc_mr = mlx5_ib_alloc_mr,
4112  	.alloc_mr_integrity = mlx5_ib_alloc_mr_integrity,
4113  	.alloc_pd = mlx5_ib_alloc_pd,
4114  	.alloc_ucontext = mlx5_ib_alloc_ucontext,
4115  	.attach_mcast = mlx5_ib_mcg_attach,
4116  	.check_mr_status = mlx5_ib_check_mr_status,
4117  	.create_ah = mlx5_ib_create_ah,
4118  	.create_cq = mlx5_ib_create_cq,
4119  	.create_qp = mlx5_ib_create_qp,
4120  	.create_srq = mlx5_ib_create_srq,
4121  	.create_user_ah = mlx5_ib_create_ah,
4122  	.dealloc_pd = mlx5_ib_dealloc_pd,
4123  	.dealloc_ucontext = mlx5_ib_dealloc_ucontext,
4124  	.del_gid = mlx5_ib_del_gid,
4125  	.del_sub_dev = mlx5_ib_del_sub_dev,
4126  	.dereg_mr = mlx5_ib_dereg_mr,
4127  	.destroy_ah = mlx5_ib_destroy_ah,
4128  	.destroy_cq = mlx5_ib_destroy_cq,
4129  	.destroy_qp = mlx5_ib_destroy_qp,
4130  	.destroy_srq = mlx5_ib_destroy_srq,
4131  	.detach_mcast = mlx5_ib_mcg_detach,
4132  	.disassociate_ucontext = mlx5_ib_disassociate_ucontext,
4133  	.drain_rq = mlx5_ib_drain_rq,
4134  	.drain_sq = mlx5_ib_drain_sq,
4135  	.device_group = &mlx5_attr_group,
4136  	.get_dev_fw_str = get_dev_fw_str,
4137  	.get_dma_mr = mlx5_ib_get_dma_mr,
4138  	.get_link_layer = mlx5_ib_port_link_layer,
4139  	.map_mr_sg = mlx5_ib_map_mr_sg,
4140  	.map_mr_sg_pi = mlx5_ib_map_mr_sg_pi,
4141  	.mmap = mlx5_ib_mmap,
4142  	.mmap_free = mlx5_ib_mmap_free,
4143  	.modify_cq = mlx5_ib_modify_cq,
4144  	.modify_device = mlx5_ib_modify_device,
4145  	.modify_port = mlx5_ib_modify_port,
4146  	.modify_qp = mlx5_ib_modify_qp,
4147  	.modify_srq = mlx5_ib_modify_srq,
4148  	.poll_cq = mlx5_ib_poll_cq,
4149  	.post_recv = mlx5_ib_post_recv_nodrain,
4150  	.post_send = mlx5_ib_post_send_nodrain,
4151  	.post_srq_recv = mlx5_ib_post_srq_recv,
4152  	.process_mad = mlx5_ib_process_mad,
4153  	.query_ah = mlx5_ib_query_ah,
4154  	.query_device = mlx5_ib_query_device,
4155  	.query_gid = mlx5_ib_query_gid,
4156  	.query_pkey = mlx5_ib_query_pkey,
4157  	.query_qp = mlx5_ib_query_qp,
4158  	.query_srq = mlx5_ib_query_srq,
4159  	.query_ucontext = mlx5_ib_query_ucontext,
4160  	.reg_user_mr = mlx5_ib_reg_user_mr,
4161  	.reg_user_mr_dmabuf = mlx5_ib_reg_user_mr_dmabuf,
4162  	.req_notify_cq = mlx5_ib_arm_cq,
4163  	.rereg_user_mr = mlx5_ib_rereg_user_mr,
4164  	.resize_cq = mlx5_ib_resize_cq,
4165  	.ufile_hw_cleanup = mlx5_ib_ufile_hw_cleanup,
4166  
4167  	INIT_RDMA_OBJ_SIZE(ib_ah, mlx5_ib_ah, ibah),
4168  	INIT_RDMA_OBJ_SIZE(ib_counters, mlx5_ib_mcounters, ibcntrs),
4169  	INIT_RDMA_OBJ_SIZE(ib_cq, mlx5_ib_cq, ibcq),
4170  	INIT_RDMA_OBJ_SIZE(ib_pd, mlx5_ib_pd, ibpd),
4171  	INIT_RDMA_OBJ_SIZE(ib_qp, mlx5_ib_qp, ibqp),
4172  	INIT_RDMA_OBJ_SIZE(ib_srq, mlx5_ib_srq, ibsrq),
4173  	INIT_RDMA_OBJ_SIZE(ib_ucontext, mlx5_ib_ucontext, ibucontext),
4174  };
4175  
4176  static const struct ib_device_ops mlx5_ib_dev_ipoib_enhanced_ops = {
4177  	.rdma_netdev_get_params = mlx5_ib_rn_get_params,
4178  };
4179  
4180  static const struct ib_device_ops mlx5_ib_dev_sriov_ops = {
4181  	.get_vf_config = mlx5_ib_get_vf_config,
4182  	.get_vf_guid = mlx5_ib_get_vf_guid,
4183  	.get_vf_stats = mlx5_ib_get_vf_stats,
4184  	.set_vf_guid = mlx5_ib_set_vf_guid,
4185  	.set_vf_link_state = mlx5_ib_set_vf_link_state,
4186  };
4187  
4188  static const struct ib_device_ops mlx5_ib_dev_mw_ops = {
4189  	.alloc_mw = mlx5_ib_alloc_mw,
4190  	.dealloc_mw = mlx5_ib_dealloc_mw,
4191  
4192  	INIT_RDMA_OBJ_SIZE(ib_mw, mlx5_ib_mw, ibmw),
4193  };
4194  
4195  static const struct ib_device_ops mlx5_ib_dev_xrc_ops = {
4196  	.alloc_xrcd = mlx5_ib_alloc_xrcd,
4197  	.dealloc_xrcd = mlx5_ib_dealloc_xrcd,
4198  
4199  	INIT_RDMA_OBJ_SIZE(ib_xrcd, mlx5_ib_xrcd, ibxrcd),
4200  };
4201  
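/*
 * Compute the VAR doorbell table geometry (start address, stride and
 * number of entries) from the VDPA emulation capabilities and allocate
 * the bitmap used to track which entries are in use.
 */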
4202  static int mlx5_ib_init_var_table(struct mlx5_ib_dev *dev)
4203  {
4204  	struct mlx5_core_dev *mdev = dev->mdev;
4205  	struct mlx5_var_table *var_table = &dev->var_table;
4206  	u8 log_doorbell_bar_size;
4207  	u8 log_doorbell_stride;
4208  	u64 bar_size;
4209  
4210  	log_doorbell_bar_size = MLX5_CAP_DEV_VDPA_EMULATION(mdev,
4211  					log_doorbell_bar_size);
4212  	log_doorbell_stride = MLX5_CAP_DEV_VDPA_EMULATION(mdev,
4213  					log_doorbell_stride);
4214  	var_table->hw_start_addr = dev->mdev->bar_addr +
4215  				MLX5_CAP64_DEV_VDPA_EMULATION(mdev,
4216  					doorbell_bar_offset);
4217  	bar_size = (1ULL << log_doorbell_bar_size) * 4096;
4218  	var_table->stride_size = 1ULL << log_doorbell_stride;
4219  	var_table->num_var_hw_entries = div_u64(bar_size,
4220  						var_table->stride_size);
4221  	mutex_init(&var_table->bitmap_lock);
4222  	var_table->bitmap = bitmap_zalloc(var_table->num_var_hw_entries,
4223  					  GFP_KERNEL);
4224  	return (var_table->bitmap) ? 0 : -ENOMEM;
4225  }
4226  
4227  static void mlx5_ib_cleanup_ucaps(struct mlx5_ib_dev *dev)
4228  {
4229  	if (MLX5_CAP_GEN(dev->mdev, uctx_cap) & MLX5_UCTX_CAP_RDMA_CTRL)
4230  		ib_remove_ucap(RDMA_UCAP_MLX5_CTRL_LOCAL);
4231  
4232  	if (MLX5_CAP_GEN(dev->mdev, uctx_cap) &
4233  	    MLX5_UCTX_CAP_RDMA_CTRL_OTHER_VHCA)
4234  		ib_remove_ucap(RDMA_UCAP_MLX5_CTRL_OTHER_VHCA);
4235  }
4236  
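/*
 * Register the RDMA user capabilities (local and other-VHCA control)
 * that this device advertises through its uctx capability bits.
 */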
4237  static int mlx5_ib_init_ucaps(struct mlx5_ib_dev *dev)
4238  {
4239  	int ret;
4240  
4241  	if (MLX5_CAP_GEN(dev->mdev, uctx_cap) & MLX5_UCTX_CAP_RDMA_CTRL) {
4242  		ret = ib_create_ucap(RDMA_UCAP_MLX5_CTRL_LOCAL);
4243  		if (ret)
4244  			return ret;
4245  	}
4246  
4247  	if (MLX5_CAP_GEN(dev->mdev, uctx_cap) &
4248  	    MLX5_UCTX_CAP_RDMA_CTRL_OTHER_VHCA) {
4249  		ret = ib_create_ucap(RDMA_UCAP_MLX5_CTRL_OTHER_VHCA);
4250  		if (ret)
4251  			goto remove_local;
4252  	}
4253  
4254  	return 0;
4255  
4256  remove_local:
4257  	if (MLX5_CAP_GEN(dev->mdev, uctx_cap) & MLX5_UCTX_CAP_RDMA_CTRL)
4258  		ib_remove_ucap(RDMA_UCAP_MLX5_CTRL_LOCAL);
4259  	return ret;
4260  }
4261  
4262  static void mlx5_ib_stage_caps_cleanup(struct mlx5_ib_dev *dev)
4263  {
4264  	if (MLX5_CAP_GEN_2_64(dev->mdev, general_obj_types_127_64) &
4265  	    MLX5_HCA_CAP_2_GENERAL_OBJECT_TYPES_RDMA_CTRL)
4266  		mlx5_ib_cleanup_ucaps(dev);
4267  
4268  	bitmap_free(dev->var_table.bitmap);
4269  }
4270  
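/*
 * CAPS stage: plug in the optional ib_device_ops groups (enhanced IPoIB
 * offloads, SR-IOV, MWs, XRC, DM) according to the device capabilities,
 * then initialize node data, the loopback lock, the VAR table and the
 * user capabilities where the corresponding features are advertised.
 */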
4271  static int mlx5_ib_stage_caps_init(struct mlx5_ib_dev *dev)
4272  {
4273  	struct mlx5_core_dev *mdev = dev->mdev;
4274  	int err;
4275  
4276  	if (MLX5_CAP_GEN(mdev, ipoib_enhanced_offloads) &&
4277  	    IS_ENABLED(CONFIG_MLX5_CORE_IPOIB))
4278  		ib_set_device_ops(&dev->ib_dev,
4279  				  &mlx5_ib_dev_ipoib_enhanced_ops);
4280  
4281  	if (mlx5_core_is_pf(mdev))
4282  		ib_set_device_ops(&dev->ib_dev, &mlx5_ib_dev_sriov_ops);
4283  
4284  	dev->umr_fence = mlx5_get_umr_fence(MLX5_CAP_GEN(mdev, umr_fence));
4285  
4286  	if (MLX5_CAP_GEN(mdev, imaicl))
4287  		ib_set_device_ops(&dev->ib_dev, &mlx5_ib_dev_mw_ops);
4288  
4289  	if (MLX5_CAP_GEN(mdev, xrc))
4290  		ib_set_device_ops(&dev->ib_dev, &mlx5_ib_dev_xrc_ops);
4291  
4292  	if (MLX5_CAP_DEV_MEM(mdev, memic) ||
4293  	    MLX5_CAP_GEN_64(dev->mdev, general_obj_types) &
4294  	    MLX5_GENERAL_OBJ_TYPES_CAP_SW_ICM)
4295  		ib_set_device_ops(&dev->ib_dev, &mlx5_ib_dev_dm_ops);
4296  
4297  	ib_set_device_ops(&dev->ib_dev, &mlx5_ib_dev_ops);
4298  
4299  	if (IS_ENABLED(CONFIG_INFINIBAND_USER_ACCESS))
4300  		dev->ib_dev.driver_def = mlx5_ib_defs;
4301  
4302  	err = init_node_data(dev);
4303  	if (err)
4304  		return err;
4305  
4306  	if ((MLX5_CAP_GEN(dev->mdev, port_type) == MLX5_CAP_PORT_TYPE_ETH) &&
4307  	    (MLX5_CAP_GEN(dev->mdev, disable_local_lb_uc) ||
4308  	     MLX5_CAP_GEN(dev->mdev, disable_local_lb_mc)))
4309  		mutex_init(&dev->lb.mutex);
4310  
4311  	if (MLX5_CAP_GEN_64(dev->mdev, general_obj_types) &
4312  			MLX5_GENERAL_OBJ_TYPES_CAP_VIRTIO_NET_Q) {
4313  		err = mlx5_ib_init_var_table(dev);
4314  		if (err)
4315  			return err;
4316  	}
4317  
4318  	if (MLX5_CAP_GEN_2_64(dev->mdev, general_obj_types_127_64) &
4319  	    MLX5_HCA_CAP_2_GENERAL_OBJECT_TYPES_RDMA_CTRL) {
4320  		err = mlx5_ib_init_ucaps(dev);
4321  		if (err)
4322  			return err;
4323  	}
4324  
4325  	dev->ib_dev.use_cq_dim = true;
4326  
4327  	return 0;
4328  }
4329  
4330  static const struct ib_device_ops mlx5_ib_dev_port_ops = {
4331  	.get_port_immutable = mlx5_port_immutable,
4332  	.query_port = mlx5_ib_query_port,
4333  };
4334  
4335  static int mlx5_ib_stage_non_default_cb(struct mlx5_ib_dev *dev)
4336  {
4337  	ib_set_device_ops(&dev->ib_dev, &mlx5_ib_dev_port_ops);
4338  	return 0;
4339  }
4340  
4341  static const struct ib_device_ops mlx5_ib_dev_port_rep_ops = {
4342  	.get_port_immutable = mlx5_port_rep_immutable,
4343  	.query_port = mlx5_ib_rep_query_port,
4344  	.query_pkey = mlx5_ib_rep_query_pkey,
4345  };
4346  
4347  static int mlx5_ib_stage_raw_eth_non_default_cb(struct mlx5_ib_dev *dev)
4348  {
4349  	ib_set_device_ops(&dev->ib_dev, &mlx5_ib_dev_port_rep_ops);
4350  	return 0;
4351  }
4352  
4353  static const struct ib_device_ops mlx5_ib_dev_common_roce_ops = {
4354  	.create_rwq_ind_table = mlx5_ib_create_rwq_ind_table,
4355  	.create_wq = mlx5_ib_create_wq,
4356  	.destroy_rwq_ind_table = mlx5_ib_destroy_rwq_ind_table,
4357  	.destroy_wq = mlx5_ib_destroy_wq,
4358  	.modify_wq = mlx5_ib_modify_wq,
4359  
4360  	INIT_RDMA_OBJ_SIZE(ib_rwq_ind_table, mlx5_ib_rwq_ind_table,
4361  			   ib_rwq_ind_tbl),
4362  };
4363  
4364  static int mlx5_ib_roce_init(struct mlx5_ib_dev *dev)
4365  {
4366  	struct mlx5_core_dev *mdev = dev->mdev;
4367  	enum rdma_link_layer ll;
4368  	int port_type_cap;
4369  	u32 port_num = 0;
4370  	int err;
4371  
4372  	port_type_cap = MLX5_CAP_GEN(mdev, port_type);
4373  	ll = mlx5_port_type_cap_to_rdma_ll(port_type_cap);
4374  
4375  	if (ll == IB_LINK_LAYER_ETHERNET) {
4376  		ib_set_device_ops(&dev->ib_dev, &mlx5_ib_dev_common_roce_ops);
4377  
4378  		port_num = mlx5_core_native_port_num(dev->mdev) - 1;
4379  
4380  		/* Register only for native ports */
4381  		mlx5_mdev_netdev_track(dev, port_num);
4382  
4383  		err = mlx5_enable_eth(dev);
4384  		if (err)
4385  			goto cleanup;
4386  	}
4387  
4388  	return 0;
4389  cleanup:
4390  	mlx5_mdev_netdev_untrack(dev, port_num);
4391  	return err;
4392  }
4393  
4394  static void mlx5_ib_roce_cleanup(struct mlx5_ib_dev *dev)
4395  {
4396  	struct mlx5_core_dev *mdev = dev->mdev;
4397  	enum rdma_link_layer ll;
4398  	int port_type_cap;
4399  	u32 port_num;
4400  
4401  	port_type_cap = MLX5_CAP_GEN(mdev, port_type);
4402  	ll = mlx5_port_type_cap_to_rdma_ll(port_type_cap);
4403  
4404  	if (ll == IB_LINK_LAYER_ETHERNET) {
4405  		mlx5_disable_eth(dev);
4406  
4407  		port_num = mlx5_core_native_port_num(dev->mdev) - 1;
4408  		mlx5_mdev_netdev_untrack(dev, port_num);
4409  	}
4410  }
4411  
4412  static int mlx5_ib_stage_cong_debugfs_init(struct mlx5_ib_dev *dev)
4413  {
4414  	mlx5_ib_init_cong_debugfs(dev,
4415  				  mlx5_core_native_port_num(dev->mdev) - 1);
4416  	return 0;
4417  }
4418  
4419  static void mlx5_ib_stage_cong_debugfs_cleanup(struct mlx5_ib_dev *dev)
4420  {
4421  	mlx5_ib_cleanup_cong_debugfs(dev,
4422  				     mlx5_core_native_port_num(dev->mdev) - 1);
4423  }
4424  
4425  static int mlx5_ib_stage_uar_init(struct mlx5_ib_dev *dev)
4426  {
4427  	dev->mdev->priv.uar = mlx5_get_uars_page(dev->mdev);
4428  	return PTR_ERR_OR_ZERO(dev->mdev->priv.uar);
4429  }
4430  
4431  static void mlx5_ib_stage_uar_cleanup(struct mlx5_ib_dev *dev)
4432  {
4433  	mlx5_put_uars_page(dev->mdev, dev->mdev->priv.uar);
4434  }
4435  
4436  static int mlx5_ib_stage_bfrag_init(struct mlx5_ib_dev *dev)
4437  {
4438  	int err;
4439  
4440  	err = mlx5_alloc_bfreg(dev->mdev, &dev->bfreg, false, false);
4441  	if (err)
4442  		return err;
4443  
4444  	err = mlx5_alloc_bfreg(dev->mdev, &dev->fp_bfreg, false, true);
4445  	if (err)
4446  		mlx5_free_bfreg(dev->mdev, &dev->bfreg);
4447  
4448  	return err;
4449  }
4450  
4451  static void mlx5_ib_stage_bfrag_cleanup(struct mlx5_ib_dev *dev)
4452  {
4453  	mlx5_free_bfreg(dev->mdev, &dev->fp_bfreg);
4454  	mlx5_free_bfreg(dev->mdev, &dev->bfreg);
4455  }
4456  
4457  static int mlx5_ib_stage_ib_reg_init(struct mlx5_ib_dev *dev)
4458  {
4459  	const char *name;
4460  
4461  	if (dev->sub_dev_name) {
4462  		name = dev->sub_dev_name;
4463  		ib_mark_name_assigned_by_user(&dev->ib_dev);
4464  	} else if (!mlx5_lag_is_active(dev->mdev))
4465  		name = "mlx5_%d";
4466  	else
4467  		name = "mlx5_bond_%d";
4468  	return ib_register_device(&dev->ib_dev, name, &dev->mdev->pdev->dev);
4469  }
4470  
4471  static void mlx5_ib_stage_pre_ib_reg_umr_cleanup(struct mlx5_ib_dev *dev)
4472  {
4473  	mlx5_mkey_cache_cleanup(dev);
4474  	mlx5r_umr_resource_cleanup(dev);
4475  	mlx5r_umr_cleanup(dev);
4476  }
4477  
4478  static void mlx5_ib_stage_ib_reg_cleanup(struct mlx5_ib_dev *dev)
4479  {
4480  	ib_unregister_device(&dev->ib_dev);
4481  }
4482  
4483  static int mlx5_ib_stage_post_ib_reg_umr_init(struct mlx5_ib_dev *dev)
4484  {
4485  	int ret;
4486  
4487  	ret = mlx5r_umr_init(dev);
4488  	if (ret)
4489  		return ret;
4490  
4491  	ret = mlx5_mkey_cache_init(dev);
4492  	if (ret)
4493  		mlx5_ib_warn(dev, "mr cache init failed %d\n", ret);
4494  	return ret;
4495  }
4496  
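/*
 * When delay-drop is supported for raw packet QPs, initialize the
 * delay_drop state with the maximum timeout and, if debugfs is
 * available, expose the event/RQ counters and the timeout control file.
 */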
4497  static int mlx5_ib_stage_delay_drop_init(struct mlx5_ib_dev *dev)
4498  {
4499  	struct dentry *root;
4500  
4501  	if (!(dev->ib_dev.attrs.raw_packet_caps & IB_RAW_PACKET_CAP_DELAY_DROP))
4502  		return 0;
4503  
4504  	mutex_init(&dev->delay_drop.lock);
4505  	dev->delay_drop.dev = dev;
4506  	dev->delay_drop.activate = false;
4507  	dev->delay_drop.timeout = MLX5_MAX_DELAY_DROP_TIMEOUT_MS * 1000;
4508  	INIT_WORK(&dev->delay_drop.delay_drop_work, delay_drop_handler);
4509  	atomic_set(&dev->delay_drop.rqs_cnt, 0);
4510  	atomic_set(&dev->delay_drop.events_cnt, 0);
4511  
4512  	if (!mlx5_debugfs_root)
4513  		return 0;
4514  
4515  	root = debugfs_create_dir("delay_drop", mlx5_debugfs_get_dev_root(dev->mdev));
4516  	dev->delay_drop.dir_debugfs = root;
4517  
4518  	debugfs_create_atomic_t("num_timeout_events", 0400, root,
4519  				&dev->delay_drop.events_cnt);
4520  	debugfs_create_atomic_t("num_rqs", 0400, root,
4521  				&dev->delay_drop.rqs_cnt);
4522  	debugfs_create_file("timeout", 0600, root, &dev->delay_drop,
4523  			    &fops_delay_drop_timeout);
4524  	return 0;
4525  }
4526  
4527  static void mlx5_ib_stage_delay_drop_cleanup(struct mlx5_ib_dev *dev)
4528  {
4529  	if (!(dev->ib_dev.attrs.raw_packet_caps & IB_RAW_PACKET_CAP_DELAY_DROP))
4530  		return;
4531  
4532  	cancel_work_sync(&dev->delay_drop.delay_drop_work);
4533  	if (!dev->delay_drop.dir_debugfs)
4534  		return;
4535  
4536  	debugfs_remove_recursive(dev->delay_drop.dir_debugfs);
4537  	dev->delay_drop.dir_debugfs = NULL;
4538  }
4539  
4540  static int mlx5_ib_stage_dev_notifier_init(struct mlx5_ib_dev *dev)
4541  {
4542  	struct mlx5_ib_resources *devr = &dev->devr;
4543  	int port;
4544  
4545  	for (port = 0; port < ARRAY_SIZE(devr->ports); ++port)
4546  		INIT_WORK(&devr->ports[port].pkey_change_work,
4547  			  pkey_change_handler);
4548  
4549  	dev->mdev_events.notifier_call = mlx5_ib_event;
4550  	mlx5_notifier_register(dev->mdev, &dev->mdev_events);
4551  
4552  	mlx5r_macsec_event_register(dev);
4553  
4554  	return 0;
4555  }
4556  
4557  static void mlx5_ib_stage_dev_notifier_cleanup(struct mlx5_ib_dev *dev)
4558  {
4559  	struct mlx5_ib_resources *devr = &dev->devr;
4560  	int port;
4561  
4562  	mlx5r_macsec_event_unregister(dev);
4563  	mlx5_notifier_unregister(dev->mdev, &dev->mdev_events);
4564  
4565  	for (port = 0; port < ARRAY_SIZE(devr->ports); ++port)
4566  		cancel_work_sync(&devr->ports[port].pkey_change_work);
4567  }
4568  
4569  void mlx5_ib_data_direct_bind(struct mlx5_ib_dev *ibdev,
4570  			      struct mlx5_data_direct_dev *dev)
4571  {
4572  	mutex_lock(&ibdev->data_direct_lock);
4573  	ibdev->data_direct_dev = dev;
4574  	mutex_unlock(&ibdev->data_direct_lock);
4575  }
4576  
4577  void mlx5_ib_data_direct_unbind(struct mlx5_ib_dev *ibdev)
4578  {
4579  	mutex_lock(&ibdev->data_direct_lock);
4580  	mlx5_ib_revoke_data_direct_mrs(ibdev);
4581  	ibdev->data_direct_dev = NULL;
4582  	mutex_unlock(&ibdev->data_direct_lock);
4583  }
4584  
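/*
 * Mark the device inactive, tear down 'stage' profile stages in reverse
 * order, then release the port array and the IB device itself.
 */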
4585  void __mlx5_ib_remove(struct mlx5_ib_dev *dev,
4586  		      const struct mlx5_ib_profile *profile,
4587  		      int stage)
4588  {
4589  	dev->ib_active = false;
4590  
4591  	/* 'stage' is the number of stages to clean up, unwound in reverse order */
4592  	while (stage) {
4593  		stage--;
4594  		if (profile->stage[stage].cleanup)
4595  			profile->stage[stage].cleanup(dev);
4596  	}
4597  
4598  	kfree(dev->port);
4599  	ib_dealloc_device(&dev->ib_dev);
4600  }
4601  
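/*
 * Run the init callback of every stage in the profile, in order. On
 * failure the already-initialized stages are cleaned up in reverse.
 */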
4602  int __mlx5_ib_add(struct mlx5_ib_dev *dev,
4603  		  const struct mlx5_ib_profile *profile)
4604  {
4605  	int err;
4606  	int i;
4607  
4608  	dev->profile = profile;
4609  
4610  	for (i = 0; i < MLX5_IB_STAGE_MAX; i++) {
4611  		if (profile->stage[i].init) {
4612  			err = profile->stage[i].init(dev);
4613  			if (err)
4614  				goto err_out;
4615  		}
4616  	}
4617  
4618  	dev->ib_active = true;
4619  	return 0;
4620  
4621  err_out:
4622  	/* Clean up stages which were initialized */
4623  	while (i) {
4624  		i--;
4625  		if (profile->stage[i].cleanup)
4626  			profile->stage[i].cleanup(dev);
4627  	}
4628  	return err;
4629  }
4630  
4631  static const struct mlx5_ib_profile pf_profile = {
4632  	STAGE_CREATE(MLX5_IB_STAGE_INIT,
4633  		     mlx5_ib_stage_init_init,
4634  		     mlx5_ib_stage_init_cleanup),
4635  	STAGE_CREATE(MLX5_IB_STAGE_FS,
4636  		     mlx5_ib_fs_init,
4637  		     mlx5_ib_fs_cleanup),
4638  	STAGE_CREATE(MLX5_IB_STAGE_CAPS,
4639  		     mlx5_ib_stage_caps_init,
4640  		     mlx5_ib_stage_caps_cleanup),
4641  	STAGE_CREATE(MLX5_IB_STAGE_NON_DEFAULT_CB,
4642  		     mlx5_ib_stage_non_default_cb,
4643  		     NULL),
4644  	STAGE_CREATE(MLX5_IB_STAGE_ROCE,
4645  		     mlx5_ib_roce_init,
4646  		     mlx5_ib_roce_cleanup),
4647  	STAGE_CREATE(MLX5_IB_STAGE_QP,
4648  		     mlx5_init_qp_table,
4649  		     mlx5_cleanup_qp_table),
4650  	STAGE_CREATE(MLX5_IB_STAGE_SRQ,
4651  		     mlx5_init_srq_table,
4652  		     mlx5_cleanup_srq_table),
4653  	STAGE_CREATE(MLX5_IB_STAGE_DEVICE_RESOURCES,
4654  		     mlx5_ib_dev_res_init,
4655  		     mlx5_ib_dev_res_cleanup),
4656  	STAGE_CREATE(MLX5_IB_STAGE_ODP,
4657  		     mlx5_ib_odp_init_one,
4658  		     mlx5_ib_odp_cleanup_one),
4659  	STAGE_CREATE(MLX5_IB_STAGE_COUNTERS,
4660  		     mlx5_ib_counters_init,
4661  		     mlx5_ib_counters_cleanup),
4662  	STAGE_CREATE(MLX5_IB_STAGE_CONG_DEBUGFS,
4663  		     mlx5_ib_stage_cong_debugfs_init,
4664  		     mlx5_ib_stage_cong_debugfs_cleanup),
4665  	STAGE_CREATE(MLX5_IB_STAGE_UAR,
4666  		     mlx5_ib_stage_uar_init,
4667  		     mlx5_ib_stage_uar_cleanup),
4668  	STAGE_CREATE(MLX5_IB_STAGE_BFREG,
4669  		     mlx5_ib_stage_bfrag_init,
4670  		     mlx5_ib_stage_bfrag_cleanup),
4671  	STAGE_CREATE(MLX5_IB_STAGE_PRE_IB_REG_UMR,
4672  		     NULL,
4673  		     mlx5_ib_stage_pre_ib_reg_umr_cleanup),
4674  	STAGE_CREATE(MLX5_IB_STAGE_WHITELIST_UID,
4675  		     mlx5_ib_devx_init,
4676  		     mlx5_ib_devx_cleanup),
4677  	STAGE_CREATE(MLX5_IB_STAGE_IB_REG,
4678  		     mlx5_ib_stage_ib_reg_init,
4679  		     mlx5_ib_stage_ib_reg_cleanup),
4680  	STAGE_CREATE(MLX5_IB_STAGE_DEVICE_NOTIFIER,
4681  		     mlx5_ib_stage_dev_notifier_init,
4682  		     mlx5_ib_stage_dev_notifier_cleanup),
4683  	STAGE_CREATE(MLX5_IB_STAGE_POST_IB_REG_UMR,
4684  		     mlx5_ib_stage_post_ib_reg_umr_init,
4685  		     NULL),
4686  	STAGE_CREATE(MLX5_IB_STAGE_DELAY_DROP,
4687  		     mlx5_ib_stage_delay_drop_init,
4688  		     mlx5_ib_stage_delay_drop_cleanup),
4689  	STAGE_CREATE(MLX5_IB_STAGE_RESTRACK,
4690  		     mlx5_ib_restrack_init,
4691  		     NULL),
4692  };
4693  
4694  const struct mlx5_ib_profile raw_eth_profile = {
4695  	STAGE_CREATE(MLX5_IB_STAGE_INIT,
4696  		     mlx5_ib_stage_init_init,
4697  		     mlx5_ib_stage_init_cleanup),
4698  	STAGE_CREATE(MLX5_IB_STAGE_FS,
4699  		     mlx5_ib_fs_init,
4700  		     mlx5_ib_fs_cleanup),
4701  	STAGE_CREATE(MLX5_IB_STAGE_CAPS,
4702  		     mlx5_ib_stage_caps_init,
4703  		     mlx5_ib_stage_caps_cleanup),
4704  	STAGE_CREATE(MLX5_IB_STAGE_NON_DEFAULT_CB,
4705  		     mlx5_ib_stage_raw_eth_non_default_cb,
4706  		     NULL),
4707  	STAGE_CREATE(MLX5_IB_STAGE_ROCE,
4708  		     mlx5_ib_roce_init,
4709  		     mlx5_ib_roce_cleanup),
4710  	STAGE_CREATE(MLX5_IB_STAGE_QP,
4711  		     mlx5_init_qp_table,
4712  		     mlx5_cleanup_qp_table),
4713  	STAGE_CREATE(MLX5_IB_STAGE_SRQ,
4714  		     mlx5_init_srq_table,
4715  		     mlx5_cleanup_srq_table),
4716  	STAGE_CREATE(MLX5_IB_STAGE_DEVICE_RESOURCES,
4717  		     mlx5_ib_dev_res_init,
4718  		     mlx5_ib_dev_res_cleanup),
4719  	STAGE_CREATE(MLX5_IB_STAGE_COUNTERS,
4720  		     mlx5_ib_counters_init,
4721  		     mlx5_ib_counters_cleanup),
4722  	STAGE_CREATE(MLX5_IB_STAGE_CONG_DEBUGFS,
4723  		     mlx5_ib_stage_cong_debugfs_init,
4724  		     mlx5_ib_stage_cong_debugfs_cleanup),
4725  	STAGE_CREATE(MLX5_IB_STAGE_UAR,
4726  		     mlx5_ib_stage_uar_init,
4727  		     mlx5_ib_stage_uar_cleanup),
4728  	STAGE_CREATE(MLX5_IB_STAGE_BFREG,
4729  		     mlx5_ib_stage_bfrag_init,
4730  		     mlx5_ib_stage_bfrag_cleanup),
4731  	STAGE_CREATE(MLX5_IB_STAGE_PRE_IB_REG_UMR,
4732  		     NULL,
4733  		     mlx5_ib_stage_pre_ib_reg_umr_cleanup),
4734  	STAGE_CREATE(MLX5_IB_STAGE_WHITELIST_UID,
4735  		     mlx5_ib_devx_init,
4736  		     mlx5_ib_devx_cleanup),
4737  	STAGE_CREATE(MLX5_IB_STAGE_IB_REG,
4738  		     mlx5_ib_stage_ib_reg_init,
4739  		     mlx5_ib_stage_ib_reg_cleanup),
4740  	STAGE_CREATE(MLX5_IB_STAGE_DEVICE_NOTIFIER,
4741  		     mlx5_ib_stage_dev_notifier_init,
4742  		     mlx5_ib_stage_dev_notifier_cleanup),
4743  	STAGE_CREATE(MLX5_IB_STAGE_POST_IB_REG_UMR,
4744  		     mlx5_ib_stage_post_ib_reg_umr_init,
4745  		     NULL),
4746  	STAGE_CREATE(MLX5_IB_STAGE_DELAY_DROP,
4747  		     mlx5_ib_stage_delay_drop_init,
4748  		     mlx5_ib_stage_delay_drop_cleanup),
4749  	STAGE_CREATE(MLX5_IB_STAGE_RESTRACK,
4750  		     mlx5_ib_restrack_init,
4751  		     NULL),
4752  };
4753  
4754  static const struct mlx5_ib_profile plane_profile = {
4755  	STAGE_CREATE(MLX5_IB_STAGE_INIT,
4756  		     mlx5_ib_stage_init_init,
4757  		     mlx5_ib_stage_init_cleanup),
4758  	STAGE_CREATE(MLX5_IB_STAGE_CAPS,
4759  		     mlx5_ib_stage_caps_init,
4760  		     mlx5_ib_stage_caps_cleanup),
4761  	STAGE_CREATE(MLX5_IB_STAGE_NON_DEFAULT_CB,
4762  		     mlx5_ib_stage_non_default_cb,
4763  		     NULL),
4764  	STAGE_CREATE(MLX5_IB_STAGE_QP,
4765  		     mlx5_init_qp_table,
4766  		     mlx5_cleanup_qp_table),
4767  	STAGE_CREATE(MLX5_IB_STAGE_SRQ,
4768  		     mlx5_init_srq_table,
4769  		     mlx5_cleanup_srq_table),
4770  	STAGE_CREATE(MLX5_IB_STAGE_DEVICE_RESOURCES,
4771  		     mlx5_ib_dev_res_init,
4772  		     mlx5_ib_dev_res_cleanup),
4773  	STAGE_CREATE(MLX5_IB_STAGE_BFREG,
4774  		     mlx5_ib_stage_bfrag_init,
4775  		     mlx5_ib_stage_bfrag_cleanup),
4776  	STAGE_CREATE(MLX5_IB_STAGE_IB_REG,
4777  		     mlx5_ib_stage_ib_reg_init,
4778  		     mlx5_ib_stage_ib_reg_cleanup),
4779  };
4780  
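/*
 * Create an SMI sub-device over the parent's planes. Only one such
 * sub-device is allowed, and only for a multi-plane InfiniBand parent
 * whose firmware supports multiplane UD QPs; the sub-device is built
 * with the reduced plane_profile.
 */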
4781  static struct ib_device *mlx5_ib_add_sub_dev(struct ib_device *parent,
4782  					     enum rdma_nl_dev_type type,
4783  					     const char *name)
4784  {
4785  	struct mlx5_ib_dev *mparent = to_mdev(parent), *mplane;
4786  	enum rdma_link_layer ll;
4787  	int ret;
4788  
4789  	if (mparent->smi_dev)
4790  		return ERR_PTR(-EEXIST);
4791  
4792  	ll = mlx5_port_type_cap_to_rdma_ll(MLX5_CAP_GEN(mparent->mdev,
4793  							port_type));
4794  	if (type != RDMA_DEVICE_TYPE_SMI || !mparent->num_plane ||
4795  	    ll != IB_LINK_LAYER_INFINIBAND ||
4796  	    !MLX5_CAP_GEN_2(mparent->mdev, multiplane_qp_ud))
4797  		return ERR_PTR(-EOPNOTSUPP);
4798  
4799  	mplane = ib_alloc_device(mlx5_ib_dev, ib_dev);
4800  	if (!mplane)
4801  		return ERR_PTR(-ENOMEM);
4802  
4803  	mplane->port = kcalloc(mparent->num_plane * mparent->num_ports,
4804  			       sizeof(*mplane->port), GFP_KERNEL);
4805  	if (!mplane->port) {
4806  		ret = -ENOMEM;
4807  		goto fail_kcalloc;
4808  	}
4809  
4810  	mplane->ib_dev.type = type;
4811  	mplane->mdev = mparent->mdev;
4812  	mplane->num_ports = mparent->num_plane;
4813  	mplane->sub_dev_name = name;
4814  	mplane->ib_dev.phys_port_cnt = mplane->num_ports;
4815  
4816  	ret = __mlx5_ib_add(mplane, &plane_profile);
4817  	if (ret)
4818  		goto fail_ib_add;
4819  
4820  	mparent->smi_dev = mplane;
4821  	return &mplane->ib_dev;
4822  
4823  fail_ib_add:
4824  	kfree(mplane->port);
4825  fail_kcalloc:
4826  	ib_dealloc_device(&mplane->ib_dev);
4827  	return ERR_PTR(ret);
4828  }
4829  
4830  static void mlx5_ib_del_sub_dev(struct ib_device *sub_dev)
4831  {
4832  	struct mlx5_ib_dev *mdev = to_mdev(sub_dev);
4833  
4834  	to_mdev(sub_dev->parent)->smi_dev = NULL;
4835  	__mlx5_ib_remove(mdev, mdev->profile, MLX5_IB_STAGE_MAX);
4836  }
4837  
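/*
 * Probe for the ".multiport" auxiliary device: allocate a multiport
 * info entry for this slave core device and try to bind it to an
 * already-registered IB device with the same system image GUID;
 * otherwise park it on the unaffiliated list for a later master.
 */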
4838  static int mlx5r_mp_probe(struct auxiliary_device *adev,
4839  			  const struct auxiliary_device_id *id)
4840  {
4841  	struct mlx5_adev *idev = container_of(adev, struct mlx5_adev, adev);
4842  	struct mlx5_core_dev *mdev = idev->mdev;
4843  	struct mlx5_ib_multiport_info *mpi;
4844  	struct mlx5_ib_dev *dev;
4845  	bool bound = false;
4846  	int err;
4847  
4848  	mpi = kzalloc(sizeof(*mpi), GFP_KERNEL);
4849  	if (!mpi)
4850  		return -ENOMEM;
4851  
4852  	mpi->mdev = mdev;
4853  	err = mlx5_query_nic_vport_system_image_guid(mdev,
4854  						     &mpi->sys_image_guid);
4855  	if (err) {
4856  		kfree(mpi);
4857  		return err;
4858  	}
4859  
4860  	mutex_lock(&mlx5_ib_multiport_mutex);
4861  	list_for_each_entry(dev, &mlx5_ib_dev_list, ib_dev_list) {
4862  		if (dev->sys_image_guid == mpi->sys_image_guid &&
4863  		    mlx5_core_same_coredev_type(dev->mdev, mpi->mdev))
4864  			bound = mlx5_ib_bind_slave_port(dev, mpi);
4865  
4866  		if (bound) {
4867  			rdma_roce_rescan_device(&dev->ib_dev);
4868  			mpi->ibdev->ib_active = true;
4869  			break;
4870  		}
4871  	}
4872  
4873  	if (!bound) {
4874  		list_add_tail(&mpi->list, &mlx5_ib_unaffiliated_port_list);
4875  		dev_dbg(mdev->device,
4876  			"no suitable IB device found to bind to, added to unaffiliated list.\n");
4877  	}
4878  	mutex_unlock(&mlx5_ib_multiport_mutex);
4879  
4880  	auxiliary_set_drvdata(adev, mpi);
4881  	return 0;
4882  }
4883  
4884  static void mlx5r_mp_remove(struct auxiliary_device *adev)
4885  {
4886  	struct mlx5_ib_multiport_info *mpi;
4887  
4888  	mpi = auxiliary_get_drvdata(adev);
4889  	mutex_lock(&mlx5_ib_multiport_mutex);
4890  	if (mpi->ibdev)
4891  		mlx5_ib_unbind_slave_port(mpi->ibdev, mpi);
4892  	else
4893  		list_del(&mpi->list);
4894  	mutex_unlock(&mlx5_ib_multiport_mutex);
4895  	kfree(mpi);
4896  }
4897  
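/*
 * Probe for the ".rdma" auxiliary device: allocate the IB device, size
 * its port array from num_ports/num_vhca_ports, and build it with the
 * raw Ethernet profile when RoCE is not enabled on an Ethernet port
 * (per mlx5_get_roce_state()), or with the full PF profile otherwise.
 */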
4898  static int mlx5r_probe(struct auxiliary_device *adev,
4899  		       const struct auxiliary_device_id *id)
4900  {
4901  	struct mlx5_adev *idev = container_of(adev, struct mlx5_adev, adev);
4902  	struct mlx5_core_dev *mdev = idev->mdev;
4903  	const struct mlx5_ib_profile *profile;
4904  	int port_type_cap, num_ports, ret;
4905  	enum rdma_link_layer ll;
4906  	struct mlx5_ib_dev *dev;
4907  
4908  	port_type_cap = MLX5_CAP_GEN(mdev, port_type);
4909  	ll = mlx5_port_type_cap_to_rdma_ll(port_type_cap);
4910  
4911  	num_ports = max(MLX5_CAP_GEN(mdev, num_ports),
4912  			MLX5_CAP_GEN(mdev, num_vhca_ports));
4913  	dev = ib_alloc_device(mlx5_ib_dev, ib_dev);
4914  	if (!dev)
4915  		return -ENOMEM;
4916  
4917  	if (ll == IB_LINK_LAYER_INFINIBAND) {
4918  		ret = mlx5_ib_get_plane_num(mdev, &dev->num_plane);
4919  		if (ret)
4920  			goto fail;
4921  	}
4922  
4923  	dev->port = kcalloc(num_ports, sizeof(*dev->port),
4924  			     GFP_KERNEL);
4925  	if (!dev->port) {
4926  		ret = -ENOMEM;
4927  		goto fail;
4928  	}
4929  
4930  	dev->mdev = mdev;
4931  	dev->num_ports = num_ports;
4932  	dev->ib_dev.phys_port_cnt = num_ports;
4933  
4934  	if (ll == IB_LINK_LAYER_ETHERNET && !mlx5_get_roce_state(mdev))
4935  		profile = &raw_eth_profile;
4936  	else
4937  		profile = &pf_profile;
4938  
4939  	ret = __mlx5_ib_add(dev, profile);
4940  	if (ret)
4941  		goto fail_ib_add;
4942  
4943  	auxiliary_set_drvdata(adev, dev);
4944  	return 0;
4945  
4946  fail_ib_add:
4947  	kfree(dev->port);
4948  fail:
4949  	ib_dealloc_device(&dev->ib_dev);
4950  	return ret;
4951  }
4952  
4953  static void mlx5r_remove(struct auxiliary_device *adev)
4954  {
4955  	struct mlx5_ib_dev *dev;
4956  
4957  	dev = auxiliary_get_drvdata(adev);
4958  	__mlx5_ib_remove(dev, dev->profile, MLX5_IB_STAGE_MAX);
4959  }
4960  
4961  static const struct auxiliary_device_id mlx5r_mp_id_table[] = {
4962  	{ .name = MLX5_ADEV_NAME ".multiport", },
4963  	{},
4964  };
4965  
4966  static const struct auxiliary_device_id mlx5r_id_table[] = {
4967  	{ .name = MLX5_ADEV_NAME ".rdma", },
4968  	{},
4969  };
4970  
4971  MODULE_DEVICE_TABLE(auxiliary, mlx5r_mp_id_table);
4972  MODULE_DEVICE_TABLE(auxiliary, mlx5r_id_table);
4973  
4974  static struct auxiliary_driver mlx5r_mp_driver = {
4975  	.name = "multiport",
4976  	.probe = mlx5r_mp_probe,
4977  	.remove = mlx5r_mp_remove,
4978  	.id_table = mlx5r_mp_id_table,
4979  };
4980  
4981  static struct auxiliary_driver mlx5r_driver = {
4982  	.name = "rdma",
4983  	.probe = mlx5r_probe,
4984  	.remove = mlx5r_remove,
4985  	.id_table = mlx5r_id_table,
4986  };
4987  
4988  static int __init mlx5_ib_init(void)
4989  {
4990  	int ret;
4991  
4992  	xlt_emergency_page = (void *)__get_free_page(GFP_KERNEL);
4993  	if (!xlt_emergency_page)
4994  		return -ENOMEM;
4995  
4996  	mlx5_ib_event_wq = alloc_ordered_workqueue("mlx5_ib_event_wq", 0);
4997  	if (!mlx5_ib_event_wq) {
4998  		free_page((unsigned long)xlt_emergency_page);
4999  		return -ENOMEM;
5000  	}
5001  
5002  	ret = mlx5_ib_qp_event_init();
5003  	if (ret)
5004  		goto qp_event_err;
5005  
5006  	mlx5_ib_odp_init();
5007  	ret = mlx5r_rep_init();
5008  	if (ret)
5009  		goto rep_err;
5010  	ret = mlx5_data_direct_driver_register();
5011  	if (ret)
5012  		goto dd_err;
5013  	ret = auxiliary_driver_register(&mlx5r_mp_driver);
5014  	if (ret)
5015  		goto mp_err;
5016  	ret = auxiliary_driver_register(&mlx5r_driver);
5017  	if (ret)
5018  		goto drv_err;
5019  
5020  	return 0;
5021  
5022  drv_err:
5023  	auxiliary_driver_unregister(&mlx5r_mp_driver);
5024  mp_err:
5025  	mlx5_data_direct_driver_unregister();
5026  dd_err:
5027  	mlx5r_rep_cleanup();
5028  rep_err:
5029  	mlx5_ib_qp_event_cleanup();
5030  qp_event_err:
5031  	destroy_workqueue(mlx5_ib_event_wq);
5032  	free_page((unsigned long)xlt_emergency_page);
5033  	return ret;
5034  }
5035  
5036  static void __exit mlx5_ib_cleanup(void)
5037  {
5038  	mlx5_data_direct_driver_unregister();
5039  	auxiliary_driver_unregister(&mlx5r_driver);
5040  	auxiliary_driver_unregister(&mlx5r_mp_driver);
5041  	mlx5r_rep_cleanup();
5042  
5043  	mlx5_ib_qp_event_cleanup();
5044  	destroy_workqueue(mlx5_ib_event_wq);
5045  	free_page((unsigned long)xlt_emergency_page);
5046  }
5047  
5048  module_init(mlx5_ib_init);
5049  module_exit(mlx5_ib_cleanup);
5050