xref: /illumos-gate/usr/src/uts/common/io/ib/clients/rdsv3/ib.c (revision 134379c07d59b848341b71d3c4819af39ad347cc)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
23  */
24 
25 /*
26  * Copyright (c) 2006 Oracle.  All rights reserved.
27  *
28  * This software is available to you under a choice of one of two
29  * licenses.  You may choose to be licensed under the terms of the GNU
30  * General Public License (GPL) Version 2, available from the file
31  * COPYING in the main directory of this source tree, or the
32  * OpenIB.org BSD license below:
33  *
34  *     Redistribution and use in source and binary forms, with or
35  *     without modification, are permitted provided that the following
36  *     conditions are met:
37  *
38  *      - Redistributions of source code must retain the above
39  *        copyright notice, this list of conditions and the following
40  *        disclaimer.
41  *
42  *      - Redistributions in binary form must reproduce the above
43  *        copyright notice, this list of conditions and the following
44  *        disclaimer in the documentation and/or other materials
45  *        provided with the distribution.
46  *
47  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
48  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
49  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
50  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
51  * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
52  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
53  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
54  * SOFTWARE.
55  *
56  */
57 #include <sys/sysmacros.h>
58 #include <sys/rds.h>
59 
60 #include <sys/ib/ibtl/ibti.h>
61 #include <sys/ib/clients/rdsv3/rdsv3.h>
62 #include <sys/ib/clients/rdsv3/ib.h>
63 #include <sys/ib/clients/rdsv3/rdsv3_debug.h>
64 
65 unsigned int rdsv3_ib_retry_count = RDSV3_IB_DEFAULT_RETRY_COUNT;
66 
67 struct list	rdsv3_ib_devices;
68 
69 /* NOTE: if also grabbing ibdev lock, grab this first */
70 kmutex_t ib_nodev_conns_lock;
71 list_t ib_nodev_conns;
72 
73 void
74 rdsv3_ib_add_one(ib_device_t *device)
75 {
76 	struct rdsv3_ib_device *rds_ibdev;
77 	ibt_hca_attr_t *dev_attr;
78 
79 	RDSV3_DPRINTF4("rdsv3_ib_add_one", "device: %p", device);
80 
81 	/* Only handle IB (no iWARP) devices */
82 	if (device->node_type != RDMA_NODE_IB_CA)
83 		return;
84 
85 	dev_attr = (ibt_hca_attr_t *)kmem_alloc(sizeof (*dev_attr),
86 	    KM_NOSLEEP);
87 	if (!dev_attr)
88 		return;
89 
90 	if (ibt_query_hca(ib_get_ibt_hca_hdl(device), dev_attr)) {
91 		RDSV3_DPRINTF5("rdsv3_ib_add_one",
92 		    "Query device failed for %s", device->name);
93 		goto free_attr;
94 	}
95 
96 	/* We depend on Reserved Lkey */
97 	if (!(dev_attr->hca_flags2 & IBT_HCA2_RES_LKEY)) {
98 		RDSV3_DPRINTF5("rdsv3_ib_add_one",
99 		    "Reserved Lkey support is required: %s",
100 		    device->name);
101 		goto free_attr;
102 	}
103 
104 	rds_ibdev = kmem_zalloc(sizeof (*rds_ibdev), KM_NOSLEEP);
105 	if (!rds_ibdev)
106 		goto free_attr;
107 
108 	mutex_init(&rds_ibdev->spinlock, NULL, MUTEX_DRIVER, NULL);
109 
110 	rds_ibdev->max_wrs = dev_attr->hca_max_chan_sz;
111 	rds_ibdev->max_sge = min(dev_attr->hca_max_sgl, RDSV3_IB_MAX_SGE);
112 
113 	rds_ibdev->dev = device;
114 	rds_ibdev->pd = ib_alloc_pd(device);
115 	if (IS_ERR(rds_ibdev->pd))
116 		goto free_dev;
117 
118 	if (rdsv3_ib_create_mr_pool(rds_ibdev) != 0) {
119 		goto free_dev;
120 	}
121 
122 	list_create(&rds_ibdev->ipaddr_list, sizeof (struct rdsv3_ib_ipaddr),
123 	    offsetof(struct rdsv3_ib_ipaddr, list));
124 	list_create(&rds_ibdev->conn_list, sizeof (struct rdsv3_ib_connection),
125 	    offsetof(struct rdsv3_ib_connection, ib_node));
126 
127 	list_insert_tail(&rdsv3_ib_devices, rds_ibdev);
128 
129 	ib_set_client_data(device, &rdsv3_ib_client, rds_ibdev);
130 
131 	RDSV3_DPRINTF4("rdsv3_ib_add_one", "Return: device: %p", device);
132 
133 	goto free_attr;
134 
135 err_pd:
136 	(void) ib_dealloc_pd(rds_ibdev->pd);
137 free_dev:
138 	kmem_free(rds_ibdev, sizeof (*rds_ibdev));
139 free_attr:
140 	kmem_free(dev_attr, sizeof (*dev_attr));
141 }
142 
143 void
144 rdsv3_ib_remove_one(struct ib_device *device)
145 {
146 	struct rdsv3_ib_device *rds_ibdev;
147 	struct rdsv3_ib_ipaddr *i_ipaddr, *i_next;
148 
149 	RDSV3_DPRINTF4("rdsv3_ib_remove_one", "device: %p", device);
150 
151 	rds_ibdev = ib_get_client_data(device, &rdsv3_ib_client);
152 	if (!rds_ibdev)
153 		return;
154 
155 	RDSV3_FOR_EACH_LIST_NODE_SAFE(i_ipaddr, i_next, &rds_ibdev->ipaddr_list,
156 	    list) {
157 		list_remove_node(&i_ipaddr->list);
158 		kmem_free(i_ipaddr, sizeof (*i_ipaddr));
159 	}
160 
161 	rdsv3_ib_destroy_conns(rds_ibdev);
162 
163 	rdsv3_ib_destroy_mr_pool(rds_ibdev);
164 
165 #if 0
166 	while (ib_dealloc_pd(rds_ibdev->pd)) {
167 #ifndef __lock_lint
168 		RDSV3_DPRINTF5("rdsv3_ib_remove_one",
169 		    "%s-%d Failed to dealloc pd %p",
170 		    __func__, __LINE__, rds_ibdev->pd);
171 #endif
172 		delay(drv_usectohz(1000));
173 	}
174 #else
175 	if (ib_dealloc_pd(rds_ibdev->pd)) {
176 #ifndef __lock_lint
177 		RDSV3_DPRINTF2("rdsv3_ib_remove_one",
178 		    "Failed to dealloc pd %p\n", rds_ibdev->pd);
179 #endif
180 	}
181 #endif
182 
183 	list_destroy(&rds_ibdev->ipaddr_list);
184 	list_destroy(&rds_ibdev->conn_list);
185 	list_remove_node(&rds_ibdev->list);
186 	kmem_free(rds_ibdev, sizeof (*rds_ibdev));
187 
188 	RDSV3_DPRINTF4("rdsv3_ib_remove_one", "Return: device: %p", device);
189 }
190 
191 #ifndef __lock_lint
192 struct ib_client rdsv3_ib_client = {
193 	.name		= "rdsv3_ib",
194 	.add		= rdsv3_ib_add_one,
195 	.remove		= rdsv3_ib_remove_one,
196 	.clnt_hdl	= NULL,
197 	.state		= IB_CLNT_UNINITIALIZED
198 };
199 #else
200 struct ib_client rdsv3_ib_client = {
201 	"rdsv3_ib",
202 	rdsv3_ib_add_one,
203 	rdsv3_ib_remove_one,
204 	NULL,
205 	NULL,
206 	IB_CLNT_UNINITIALIZED
207 };
208 #endif
209 
210 static int
211 rds_ib_conn_info_visitor(struct rdsv3_connection *conn,
212     void *buffer)
213 {
214 	struct rdsv3_info_rdma_connection *iinfo = buffer;
215 	struct rdsv3_ib_connection *ic;
216 
217 	RDSV3_DPRINTF4("rds_ib_conn_info_visitor", "conn: %p buffer: %p",
218 	    conn, buffer);
219 
220 	/* We will only ever look at IB transports */
221 	if (conn->c_trans != &rdsv3_ib_transport)
222 		return (0);
223 
224 	iinfo->src_addr = conn->c_laddr;
225 	iinfo->dst_addr = conn->c_faddr;
226 
227 	(void) memset(&iinfo->src_gid, 0, sizeof (iinfo->src_gid));
228 	(void) memset(&iinfo->dst_gid, 0, sizeof (iinfo->dst_gid));
229 	if (rdsv3_conn_state(conn) == RDSV3_CONN_UP) {
230 		struct rdsv3_ib_device *rds_ibdev;
231 		struct rdma_dev_addr *dev_addr;
232 
233 		ic = conn->c_transport_data;
234 		dev_addr = &ic->i_cm_id->route.addr.dev_addr;
235 
236 		ib_addr_get_sgid(dev_addr, (union ib_gid *)&iinfo->src_gid);
237 		ib_addr_get_dgid(dev_addr, (union ib_gid *)&iinfo->dst_gid);
238 
239 		rds_ibdev = ib_get_client_data(ic->i_cm_id->device,
240 		    &rdsv3_ib_client);
241 		iinfo->max_send_wr = ic->i_send_ring.w_nr;
242 		iinfo->max_recv_wr = ic->i_recv_ring.w_nr;
243 		iinfo->max_send_sge = rds_ibdev->max_sge;
244 	}
245 
246 	RDSV3_DPRINTF4("rds_ib_conn_info_visitor", "conn: %p buffer: %p",
247 	    conn, buffer);
248 	return (1);
249 }
250 
251 static void
252 rds_ib_ic_info(struct rsock *sock, unsigned int len,
253     struct rdsv3_info_iterator *iter,
254     struct rdsv3_info_lengths *lens)
255 {
256 	RDSV3_DPRINTF4("rds_ib_ic_info", "sk: %p iter: %p, lens: %p, len: %d",
257 	    sock, iter, lens, len);
258 
259 	rdsv3_for_each_conn_info(sock, len, iter, lens,
260 	    rds_ib_conn_info_visitor,
261 	    sizeof (struct rdsv3_info_rdma_connection));
262 }
263 
264 /*
265  * Early RDS/IB was built to only bind to an address if there is an IPoIB
266  * device with that address set.
267  *
268  * If it were me, I'd advocate for something more flexible.  Sending and
269  * receiving should be device-agnostic.  Transports would try and maintain
270  * connections between peers who have messages queued.  Userspace would be
271  * allowed to influence which paths have priority.  We could call userspace
272  * asserting this policy "routing".
273  */
274 static int
275 rds_ib_laddr_check(uint32_be_t addr)
276 {
277 	int ret;
278 	struct rdma_cm_id *cm_id;
279 	struct sockaddr_in sin;
280 
281 	RDSV3_DPRINTF4("rds_ib_laddr_check", "addr: %x", ntohl(addr));
282 
283 	/*
284 	 * Create a CMA ID and try to bind it. This catches both
285 	 * IB and iWARP capable NICs.
286 	 */
287 	cm_id = rdma_create_id(NULL, NULL, RDMA_PS_TCP);
288 	if (!cm_id)
289 		return (-EADDRNOTAVAIL);
290 
291 	(void) memset(&sin, 0, sizeof (sin));
292 	sin.sin_family = AF_INET;
293 	sin.sin_addr.s_addr = rdsv3_scaddr_to_ibaddr(addr);
294 
295 	/* rdma_bind_addr will only succeed for IB & iWARP devices */
296 	ret = rdma_bind_addr(cm_id, (struct sockaddr *)&sin);
297 	/*
298 	 * due to this, we will claim to support iWARP devices unless we
299 	 * check node_type.
300 	 */
301 	if (ret || cm_id->device->node_type != RDMA_NODE_IB_CA)
302 		ret = -EADDRNOTAVAIL;
303 
304 	RDSV3_DPRINTF5("rds_ib_laddr_check",
305 	    "addr %u.%u.%u.%u ret %d node type %d",
306 	    NIPQUAD(addr), ret,
307 	    cm_id->device ? cm_id->device->node_type : -1);
308 
309 	rdma_destroy_id(cm_id);
310 
311 	return (ret);
312 }
313 
314 void
315 rdsv3_ib_exit(void)
316 {
317 	RDSV3_DPRINTF4("rds_ib_exit", "Enter");
318 
319 	rdsv3_info_deregister_func(RDSV3_INFO_IB_CONNECTIONS, rds_ib_ic_info);
320 	rdsv3_ib_destroy_nodev_conns();
321 	ib_unregister_client(&rdsv3_ib_client);
322 	rdsv3_ib_sysctl_exit();
323 	rdsv3_ib_recv_exit();
324 	rdsv3_trans_unregister(&rdsv3_ib_transport);
325 	mutex_destroy(&ib_nodev_conns_lock);
326 	list_destroy(&ib_nodev_conns);
327 	list_destroy(&rdsv3_ib_devices);
328 
329 	RDSV3_DPRINTF4("rds_ib_exit", "Return");
330 }
331 
332 #ifndef __lock_lint
333 struct rdsv3_transport rdsv3_ib_transport = {
334 	.laddr_check		= rds_ib_laddr_check,
335 	.xmit_complete		= rdsv3_ib_xmit_complete,
336 	.xmit			= rdsv3_ib_xmit,
337 	.xmit_cong_map		= NULL,
338 	.xmit_rdma		= rdsv3_ib_xmit_rdma,
339 	.recv			= rdsv3_ib_recv,
340 	.conn_alloc		= rdsv3_ib_conn_alloc,
341 	.conn_free		= rdsv3_ib_conn_free,
342 	.conn_connect		= rdsv3_ib_conn_connect,
343 	.conn_shutdown		= rdsv3_ib_conn_shutdown,
344 	.inc_copy_to_user	= rdsv3_ib_inc_copy_to_user,
345 	.inc_purge		= rdsv3_ib_inc_purge,
346 	.inc_free		= rdsv3_ib_inc_free,
347 	.cm_initiate_connect	= rdsv3_ib_cm_initiate_connect,
348 	.cm_handle_connect	= rdsv3_ib_cm_handle_connect,
349 	.cm_connect_complete	= rdsv3_ib_cm_connect_complete,
350 	.stats_info_copy	= rdsv3_ib_stats_info_copy,
351 	.exit			= rdsv3_ib_exit,
352 	.get_mr			= rdsv3_ib_get_mr,
353 	.sync_mr		= rdsv3_ib_sync_mr,
354 	.free_mr		= rdsv3_ib_free_mr,
355 	.flush_mrs		= rdsv3_ib_flush_mrs,
356 	.t_name			= "infiniband",
357 	.t_type			= RDS_TRANS_IB
358 };
359 #else
360 struct rdsv3_transport rdsv3_ib_transport;
361 #endif
362 
363 int
364 rdsv3_ib_init(void)
365 {
366 	int ret;
367 
368 	RDSV3_DPRINTF4("rds_ib_init", "Enter");
369 
370 	list_create(&rdsv3_ib_devices, sizeof (struct rdsv3_ib_device),
371 	    offsetof(struct rdsv3_ib_device, list));
372 	list_create(&ib_nodev_conns, sizeof (struct rdsv3_ib_connection),
373 	    offsetof(struct rdsv3_ib_connection, ib_node));
374 	mutex_init(&ib_nodev_conns_lock, NULL, MUTEX_DRIVER, NULL);
375 
376 	rdsv3_ib_client.dip = rdsv3_dev_info;
377 	ret = ib_register_client(&rdsv3_ib_client);
378 	if (ret)
379 		goto out;
380 
381 	ret = rdsv3_ib_sysctl_init();
382 	if (ret)
383 		goto out_ibreg;
384 
385 	ret = rdsv3_ib_recv_init();
386 	if (ret)
387 		goto out_sysctl;
388 
389 	ret = rdsv3_trans_register(&rdsv3_ib_transport);
390 	if (ret)
391 		goto out_recv;
392 
393 	rdsv3_info_register_func(RDSV3_INFO_IB_CONNECTIONS, rds_ib_ic_info);
394 
395 	RDSV3_DPRINTF4("rds_ib_init", "Return");
396 
397 	return (0);
398 
399 out_recv:
400 	rdsv3_ib_recv_exit();
401 out_sysctl:
402 	rdsv3_ib_sysctl_exit();
403 out_ibreg:
404 	ib_unregister_client(&rdsv3_ib_client);
405 out:
406 	mutex_destroy(&ib_nodev_conns_lock);
407 	list_destroy(&ib_nodev_conns);
408 	list_destroy(&rdsv3_ib_devices);
409 	return (ret);
410 }
411