xref: /linux/net/rds/ib.c (revision 40589e74f7ba855f3a887c9d4abe9d100c5b039c)
/*
 * Copyright (c) 2006 Oracle.  All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses.  You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the
 * OpenIB.org BSD license below:
 *
 *     Redistribution and use in source and binary forms, with or
 *     without modification, are permitted provided that the following
 *     conditions are met:
 *
 *      - Redistributions of source code must retain the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer.
 *
 *      - Redistributions in binary form must reproduce the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer in the documentation and/or other materials
 *        provided with the distribution.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 */
33ec16227eSAndy Grover #include <linux/kernel.h>
34ec16227eSAndy Grover #include <linux/in.h>
35ec16227eSAndy Grover #include <linux/if.h>
36ec16227eSAndy Grover #include <linux/netdevice.h>
37ec16227eSAndy Grover #include <linux/inetdevice.h>
38ec16227eSAndy Grover #include <linux/if_arp.h>
39ec16227eSAndy Grover #include <linux/delay.h>
405a0e3ad6STejun Heo #include <linux/slab.h>
41ec16227eSAndy Grover 
42ec16227eSAndy Grover #include "rds.h"
43ec16227eSAndy Grover #include "ib.h"
44ec16227eSAndy Grover 
45ec16227eSAndy Grover unsigned int fmr_pool_size = RDS_FMR_POOL_SIZE;
46ec16227eSAndy Grover unsigned int fmr_message_size = RDS_FMR_SIZE + 1; /* +1 allows for unaligned MRs */
473ba23adeSAndy Grover unsigned int rds_ib_retry_count = RDS_IB_DEFAULT_RETRY_COUNT;
48ec16227eSAndy Grover 
49ec16227eSAndy Grover module_param(fmr_pool_size, int, 0444);
50ec16227eSAndy Grover MODULE_PARM_DESC(fmr_pool_size, " Max number of fmr per HCA");
51ec16227eSAndy Grover module_param(fmr_message_size, int, 0444);
52ec16227eSAndy Grover MODULE_PARM_DESC(fmr_message_size, " Max size of a RDMA transfer");
533ba23adeSAndy Grover module_param(rds_ib_retry_count, int, 0444);
543ba23adeSAndy Grover MODULE_PARM_DESC(rds_ib_retry_count, " Number of hw retries before reporting an error");
55ec16227eSAndy Grover 
56ec16227eSAndy Grover struct list_head rds_ib_devices;
57ec16227eSAndy Grover 
58745cbccaSAndy Grover /* NOTE: if also grabbing ibdev lock, grab this first */
59ec16227eSAndy Grover DEFINE_SPINLOCK(ib_nodev_conns_lock);
60ec16227eSAndy Grover LIST_HEAD(ib_nodev_conns);
61ec16227eSAndy Grover 
62ec16227eSAndy Grover void rds_ib_add_one(struct ib_device *device)
63ec16227eSAndy Grover {
64ec16227eSAndy Grover 	struct rds_ib_device *rds_ibdev;
65ec16227eSAndy Grover 	struct ib_device_attr *dev_attr;
66ec16227eSAndy Grover 
67ec16227eSAndy Grover 	/* Only handle IB (no iWARP) devices */
68ec16227eSAndy Grover 	if (device->node_type != RDMA_NODE_IB_CA)
69ec16227eSAndy Grover 		return;
70ec16227eSAndy Grover 
71ec16227eSAndy Grover 	dev_attr = kmalloc(sizeof *dev_attr, GFP_KERNEL);
72ec16227eSAndy Grover 	if (!dev_attr)
73ec16227eSAndy Grover 		return;
74ec16227eSAndy Grover 
75ec16227eSAndy Grover 	if (ib_query_device(device, dev_attr)) {
76ec16227eSAndy Grover 		rdsdebug("Query device failed for %s\n", device->name);
77ec16227eSAndy Grover 		goto free_attr;
78ec16227eSAndy Grover 	}
79ec16227eSAndy Grover 
80ec16227eSAndy Grover 	rds_ibdev = kmalloc(sizeof *rds_ibdev, GFP_KERNEL);
81ec16227eSAndy Grover 	if (!rds_ibdev)
82ec16227eSAndy Grover 		goto free_attr;
83ec16227eSAndy Grover 
84ec16227eSAndy Grover 	spin_lock_init(&rds_ibdev->spinlock);
85ec16227eSAndy Grover 
86ec16227eSAndy Grover 	rds_ibdev->max_wrs = dev_attr->max_qp_wr;
87ec16227eSAndy Grover 	rds_ibdev->max_sge = min(dev_attr->max_sge, RDS_IB_MAX_SGE);
88ec16227eSAndy Grover 
89ec16227eSAndy Grover 	rds_ibdev->fmr_max_remaps = dev_attr->max_map_per_fmr?: 32;
90ec16227eSAndy Grover 	rds_ibdev->max_fmrs = dev_attr->max_fmr ?
91ec16227eSAndy Grover 			min_t(unsigned int, dev_attr->max_fmr, fmr_pool_size) :
92ec16227eSAndy Grover 			fmr_pool_size;
93ec16227eSAndy Grover 
94*40589e74SAndy Grover 	rds_ibdev->max_initiator_depth = dev_attr->max_qp_init_rd_atom;
95*40589e74SAndy Grover 	rds_ibdev->max_responder_resources = dev_attr->max_qp_rd_atom;
96*40589e74SAndy Grover 
97ec16227eSAndy Grover 	rds_ibdev->dev = device;
98ec16227eSAndy Grover 	rds_ibdev->pd = ib_alloc_pd(device);
99ec16227eSAndy Grover 	if (IS_ERR(rds_ibdev->pd))
100ec16227eSAndy Grover 		goto free_dev;
101ec16227eSAndy Grover 
102ec16227eSAndy Grover 	rds_ibdev->mr = ib_get_dma_mr(rds_ibdev->pd,
103ec16227eSAndy Grover 				      IB_ACCESS_LOCAL_WRITE);
104ec16227eSAndy Grover 	if (IS_ERR(rds_ibdev->mr))
105ec16227eSAndy Grover 		goto err_pd;
106ec16227eSAndy Grover 
107ec16227eSAndy Grover 	rds_ibdev->mr_pool = rds_ib_create_mr_pool(rds_ibdev);
108ec16227eSAndy Grover 	if (IS_ERR(rds_ibdev->mr_pool)) {
109ec16227eSAndy Grover 		rds_ibdev->mr_pool = NULL;
110ec16227eSAndy Grover 		goto err_mr;
111ec16227eSAndy Grover 	}
112ec16227eSAndy Grover 
113ec16227eSAndy Grover 	INIT_LIST_HEAD(&rds_ibdev->ipaddr_list);
114ec16227eSAndy Grover 	INIT_LIST_HEAD(&rds_ibdev->conn_list);
115ec16227eSAndy Grover 	list_add_tail(&rds_ibdev->list, &rds_ib_devices);
116ec16227eSAndy Grover 
117ec16227eSAndy Grover 	ib_set_client_data(device, &rds_ib_client, rds_ibdev);
118ec16227eSAndy Grover 
119ec16227eSAndy Grover 	goto free_attr;
120ec16227eSAndy Grover 
121ec16227eSAndy Grover err_mr:
122ec16227eSAndy Grover 	ib_dereg_mr(rds_ibdev->mr);
123ec16227eSAndy Grover err_pd:
124ec16227eSAndy Grover 	ib_dealloc_pd(rds_ibdev->pd);
125ec16227eSAndy Grover free_dev:
126ec16227eSAndy Grover 	kfree(rds_ibdev);
127ec16227eSAndy Grover free_attr:
128ec16227eSAndy Grover 	kfree(dev_attr);
129ec16227eSAndy Grover }
130ec16227eSAndy Grover 
131ec16227eSAndy Grover void rds_ib_remove_one(struct ib_device *device)
132ec16227eSAndy Grover {
133ec16227eSAndy Grover 	struct rds_ib_device *rds_ibdev;
134ec16227eSAndy Grover 	struct rds_ib_ipaddr *i_ipaddr, *i_next;
135ec16227eSAndy Grover 
136ec16227eSAndy Grover 	rds_ibdev = ib_get_client_data(device, &rds_ib_client);
137ec16227eSAndy Grover 	if (!rds_ibdev)
138ec16227eSAndy Grover 		return;
139ec16227eSAndy Grover 
140ec16227eSAndy Grover 	list_for_each_entry_safe(i_ipaddr, i_next, &rds_ibdev->ipaddr_list, list) {
141ec16227eSAndy Grover 		list_del(&i_ipaddr->list);
142ec16227eSAndy Grover 		kfree(i_ipaddr);
143ec16227eSAndy Grover 	}
144ec16227eSAndy Grover 
145745cbccaSAndy Grover 	rds_ib_destroy_conns(rds_ibdev);
146ec16227eSAndy Grover 
147ec16227eSAndy Grover 	if (rds_ibdev->mr_pool)
148ec16227eSAndy Grover 		rds_ib_destroy_mr_pool(rds_ibdev->mr_pool);
149ec16227eSAndy Grover 
150ec16227eSAndy Grover 	ib_dereg_mr(rds_ibdev->mr);
151ec16227eSAndy Grover 
152ec16227eSAndy Grover 	while (ib_dealloc_pd(rds_ibdev->pd)) {
153ec16227eSAndy Grover 		rdsdebug("Failed to dealloc pd %p\n", rds_ibdev->pd);
154ec16227eSAndy Grover 		msleep(1);
155ec16227eSAndy Grover 	}
156ec16227eSAndy Grover 
157ec16227eSAndy Grover 	list_del(&rds_ibdev->list);
158ec16227eSAndy Grover 	kfree(rds_ibdev);
159ec16227eSAndy Grover }
160ec16227eSAndy Grover 
161ec16227eSAndy Grover struct ib_client rds_ib_client = {
162ec16227eSAndy Grover 	.name   = "rds_ib",
163ec16227eSAndy Grover 	.add    = rds_ib_add_one,
164ec16227eSAndy Grover 	.remove = rds_ib_remove_one
165ec16227eSAndy Grover };
166ec16227eSAndy Grover 
167ec16227eSAndy Grover static int rds_ib_conn_info_visitor(struct rds_connection *conn,
168ec16227eSAndy Grover 				    void *buffer)
169ec16227eSAndy Grover {
170ec16227eSAndy Grover 	struct rds_info_rdma_connection *iinfo = buffer;
171ec16227eSAndy Grover 	struct rds_ib_connection *ic;
172ec16227eSAndy Grover 
173ec16227eSAndy Grover 	/* We will only ever look at IB transports */
174ec16227eSAndy Grover 	if (conn->c_trans != &rds_ib_transport)
175ec16227eSAndy Grover 		return 0;
176ec16227eSAndy Grover 
177ec16227eSAndy Grover 	iinfo->src_addr = conn->c_laddr;
178ec16227eSAndy Grover 	iinfo->dst_addr = conn->c_faddr;
179ec16227eSAndy Grover 
180ec16227eSAndy Grover 	memset(&iinfo->src_gid, 0, sizeof(iinfo->src_gid));
181ec16227eSAndy Grover 	memset(&iinfo->dst_gid, 0, sizeof(iinfo->dst_gid));
182ec16227eSAndy Grover 	if (rds_conn_state(conn) == RDS_CONN_UP) {
183ec16227eSAndy Grover 		struct rds_ib_device *rds_ibdev;
184ec16227eSAndy Grover 		struct rdma_dev_addr *dev_addr;
185ec16227eSAndy Grover 
186ec16227eSAndy Grover 		ic = conn->c_transport_data;
187ec16227eSAndy Grover 		dev_addr = &ic->i_cm_id->route.addr.dev_addr;
188ec16227eSAndy Grover 
1896f8372b6SSean Hefty 		rdma_addr_get_sgid(dev_addr, (union ib_gid *) &iinfo->src_gid);
1906f8372b6SSean Hefty 		rdma_addr_get_dgid(dev_addr, (union ib_gid *) &iinfo->dst_gid);
191ec16227eSAndy Grover 
192ec16227eSAndy Grover 		rds_ibdev = ib_get_client_data(ic->i_cm_id->device, &rds_ib_client);
193ec16227eSAndy Grover 		iinfo->max_send_wr = ic->i_send_ring.w_nr;
194ec16227eSAndy Grover 		iinfo->max_recv_wr = ic->i_recv_ring.w_nr;
195ec16227eSAndy Grover 		iinfo->max_send_sge = rds_ibdev->max_sge;
196ec16227eSAndy Grover 		rds_ib_get_mr_info(rds_ibdev, iinfo);
197ec16227eSAndy Grover 	}
198ec16227eSAndy Grover 	return 1;
199ec16227eSAndy Grover }
200ec16227eSAndy Grover 
201ec16227eSAndy Grover static void rds_ib_ic_info(struct socket *sock, unsigned int len,
202ec16227eSAndy Grover 			   struct rds_info_iterator *iter,
203ec16227eSAndy Grover 			   struct rds_info_lengths *lens)
204ec16227eSAndy Grover {
205ec16227eSAndy Grover 	rds_for_each_conn_info(sock, len, iter, lens,
206ec16227eSAndy Grover 				rds_ib_conn_info_visitor,
207ec16227eSAndy Grover 				sizeof(struct rds_info_rdma_connection));
208ec16227eSAndy Grover }
209ec16227eSAndy Grover 
210ec16227eSAndy Grover 
211ec16227eSAndy Grover /*
212ec16227eSAndy Grover  * Early RDS/IB was built to only bind to an address if there is an IPoIB
213ec16227eSAndy Grover  * device with that address set.
214ec16227eSAndy Grover  *
215ec16227eSAndy Grover  * If it were me, I'd advocate for something more flexible.  Sending and
216ec16227eSAndy Grover  * receiving should be device-agnostic.  Transports would try and maintain
217ec16227eSAndy Grover  * connections between peers who have messages queued.  Userspace would be
218ec16227eSAndy Grover  * allowed to influence which paths have priority.  We could call userspace
219ec16227eSAndy Grover  * asserting this policy "routing".
220ec16227eSAndy Grover  */
221ec16227eSAndy Grover static int rds_ib_laddr_check(__be32 addr)
222ec16227eSAndy Grover {
223ec16227eSAndy Grover 	int ret;
224ec16227eSAndy Grover 	struct rdma_cm_id *cm_id;
225ec16227eSAndy Grover 	struct sockaddr_in sin;
226ec16227eSAndy Grover 
227ec16227eSAndy Grover 	/* Create a CMA ID and try to bind it. This catches both
228ec16227eSAndy Grover 	 * IB and iWARP capable NICs.
229ec16227eSAndy Grover 	 */
230ec16227eSAndy Grover 	cm_id = rdma_create_id(NULL, NULL, RDMA_PS_TCP);
23194713babSDan Carpenter 	if (IS_ERR(cm_id))
23294713babSDan Carpenter 		return PTR_ERR(cm_id);
233ec16227eSAndy Grover 
234ec16227eSAndy Grover 	memset(&sin, 0, sizeof(sin));
235ec16227eSAndy Grover 	sin.sin_family = AF_INET;
236ec16227eSAndy Grover 	sin.sin_addr.s_addr = addr;
237ec16227eSAndy Grover 
238ec16227eSAndy Grover 	/* rdma_bind_addr will only succeed for IB & iWARP devices */
239ec16227eSAndy Grover 	ret = rdma_bind_addr(cm_id, (struct sockaddr *)&sin);
240ec16227eSAndy Grover 	/* due to this, we will claim to support iWARP devices unless we
241ec16227eSAndy Grover 	   check node_type. */
242ec16227eSAndy Grover 	if (ret || cm_id->device->node_type != RDMA_NODE_IB_CA)
243ec16227eSAndy Grover 		ret = -EADDRNOTAVAIL;
244ec16227eSAndy Grover 
245ec16227eSAndy Grover 	rdsdebug("addr %pI4 ret %d node type %d\n",
246ec16227eSAndy Grover 		&addr, ret,
247ec16227eSAndy Grover 		cm_id->device ? cm_id->device->node_type : -1);
248ec16227eSAndy Grover 
249ec16227eSAndy Grover 	rdma_destroy_id(cm_id);
250ec16227eSAndy Grover 
251ec16227eSAndy Grover 	return ret;
252ec16227eSAndy Grover }
253ec16227eSAndy Grover 
254ec16227eSAndy Grover void rds_ib_exit(void)
255ec16227eSAndy Grover {
256ec16227eSAndy Grover 	rds_info_deregister_func(RDS_INFO_IB_CONNECTIONS, rds_ib_ic_info);
257745cbccaSAndy Grover 	rds_ib_destroy_nodev_conns();
258ec16227eSAndy Grover 	ib_unregister_client(&rds_ib_client);
259ec16227eSAndy Grover 	rds_ib_sysctl_exit();
260ec16227eSAndy Grover 	rds_ib_recv_exit();
261ec16227eSAndy Grover 	rds_trans_unregister(&rds_ib_transport);
262ec16227eSAndy Grover }
263ec16227eSAndy Grover 
264ec16227eSAndy Grover struct rds_transport rds_ib_transport = {
265ec16227eSAndy Grover 	.laddr_check		= rds_ib_laddr_check,
266ec16227eSAndy Grover 	.xmit_complete		= rds_ib_xmit_complete,
267ec16227eSAndy Grover 	.xmit			= rds_ib_xmit,
268ec16227eSAndy Grover 	.xmit_cong_map		= NULL,
269ec16227eSAndy Grover 	.xmit_rdma		= rds_ib_xmit_rdma,
27015133f6eSAndy Grover 	.xmit_atomic		= rds_ib_xmit_atomic,
271ec16227eSAndy Grover 	.recv			= rds_ib_recv,
272ec16227eSAndy Grover 	.conn_alloc		= rds_ib_conn_alloc,
273ec16227eSAndy Grover 	.conn_free		= rds_ib_conn_free,
274ec16227eSAndy Grover 	.conn_connect		= rds_ib_conn_connect,
275ec16227eSAndy Grover 	.conn_shutdown		= rds_ib_conn_shutdown,
276ec16227eSAndy Grover 	.inc_copy_to_user	= rds_ib_inc_copy_to_user,
277ec16227eSAndy Grover 	.inc_purge		= rds_ib_inc_purge,
278ec16227eSAndy Grover 	.inc_free		= rds_ib_inc_free,
279ec16227eSAndy Grover 	.cm_initiate_connect	= rds_ib_cm_initiate_connect,
280ec16227eSAndy Grover 	.cm_handle_connect	= rds_ib_cm_handle_connect,
281ec16227eSAndy Grover 	.cm_connect_complete	= rds_ib_cm_connect_complete,
282ec16227eSAndy Grover 	.stats_info_copy	= rds_ib_stats_info_copy,
283ec16227eSAndy Grover 	.exit			= rds_ib_exit,
284ec16227eSAndy Grover 	.get_mr			= rds_ib_get_mr,
285ec16227eSAndy Grover 	.sync_mr		= rds_ib_sync_mr,
286ec16227eSAndy Grover 	.free_mr		= rds_ib_free_mr,
287ec16227eSAndy Grover 	.flush_mrs		= rds_ib_flush_mrs,
288ec16227eSAndy Grover 	.t_owner		= THIS_MODULE,
289ec16227eSAndy Grover 	.t_name			= "infiniband",
290335776bdSAndy Grover 	.t_type			= RDS_TRANS_IB
291ec16227eSAndy Grover };
292ec16227eSAndy Grover 
293ec16227eSAndy Grover int __init rds_ib_init(void)
294ec16227eSAndy Grover {
295ec16227eSAndy Grover 	int ret;
296ec16227eSAndy Grover 
297ec16227eSAndy Grover 	INIT_LIST_HEAD(&rds_ib_devices);
298ec16227eSAndy Grover 
299ec16227eSAndy Grover 	ret = ib_register_client(&rds_ib_client);
300ec16227eSAndy Grover 	if (ret)
301ec16227eSAndy Grover 		goto out;
302ec16227eSAndy Grover 
303ec16227eSAndy Grover 	ret = rds_ib_sysctl_init();
304ec16227eSAndy Grover 	if (ret)
305ec16227eSAndy Grover 		goto out_ibreg;
306ec16227eSAndy Grover 
307ec16227eSAndy Grover 	ret = rds_ib_recv_init();
308ec16227eSAndy Grover 	if (ret)
309ec16227eSAndy Grover 		goto out_sysctl;
310ec16227eSAndy Grover 
311ec16227eSAndy Grover 	ret = rds_trans_register(&rds_ib_transport);
312ec16227eSAndy Grover 	if (ret)
313ec16227eSAndy Grover 		goto out_recv;
314ec16227eSAndy Grover 
315ec16227eSAndy Grover 	rds_info_register_func(RDS_INFO_IB_CONNECTIONS, rds_ib_ic_info);
316ec16227eSAndy Grover 
317ec16227eSAndy Grover 	goto out;
318ec16227eSAndy Grover 
319ec16227eSAndy Grover out_recv:
320ec16227eSAndy Grover 	rds_ib_recv_exit();
321ec16227eSAndy Grover out_sysctl:
322ec16227eSAndy Grover 	rds_ib_sysctl_exit();
323ec16227eSAndy Grover out_ibreg:
324ec16227eSAndy Grover 	ib_unregister_client(&rds_ib_client);
325ec16227eSAndy Grover out:
326ec16227eSAndy Grover 	return ret;
327ec16227eSAndy Grover }
328ec16227eSAndy Grover 
329ec16227eSAndy Grover MODULE_LICENSE("GPL");
330ec16227eSAndy Grover 
331