xref: /illumos-gate/usr/src/uts/common/io/ib/clients/rdsv3/ib.c (revision 5002558f6bfef3915c7f3b4ecb7c19c7f044bf5b)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
23  */
24 
25 /*
26  * Copyright (c) 2006 Oracle.  All rights reserved.
27  *
28  * This software is available to you under a choice of one of two
29  * licenses.  You may choose to be licensed under the terms of the GNU
30  * General Public License (GPL) Version 2, available from the file
31  * COPYING in the main directory of this source tree, or the
32  * OpenIB.org BSD license below:
33  *
34  *     Redistribution and use in source and binary forms, with or
35  *     without modification, are permitted provided that the following
36  *     conditions are met:
37  *
38  *      - Redistributions of source code must retain the above
39  *        copyright notice, this list of conditions and the following
40  *        disclaimer.
41  *
42  *      - Redistributions in binary form must reproduce the above
43  *        copyright notice, this list of conditions and the following
44  *        disclaimer in the documentation and/or other materials
45  *        provided with the distribution.
46  *
47  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
48  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
49  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
50  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
51  * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
52  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
53  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
54  * SOFTWARE.
55  *
56  */
57 #include <sys/sysmacros.h>
58 #include <sys/rds.h>
59 
60 #include <sys/ib/ibtl/ibti.h>
61 #include <sys/ib/clients/rdsv3/rdsv3.h>
62 #include <sys/ib/clients/rdsv3/ib.h>
63 #include <sys/ib/clients/rdsv3/rdsv3_debug.h>
64 
/*
 * IB retry count, initialized to the compiled-in default.  It is not
 * referenced anywhere else in this file.
 */
unsigned int rdsv3_ib_retry_count = RDSV3_IB_DEFAULT_RETRY_COUNT;

/*
 * List of struct rdsv3_ib_device.  Created in rdsv3_ib_init(), appended to
 * by rdsv3_ib_add_one(), unlinked from by rdsv3_ib_remove_one() and
 * destroyed in rdsv3_ib_exit().
 */
struct list	rdsv3_ib_devices;

/* NOTE: if also grabbing ibdev lock, grab this first */
kmutex_t ib_nodev_conns_lock;
/*
 * List of struct rdsv3_ib_connection linked through their ib_node member
 * (see the list_create() call in rdsv3_ib_init()).
 */
list_t ib_nodev_conns;

/*
 * Constructor/destructor pair handed to kmem_cache_create() for the
 * per-device fragment cache in rdsv3_ib_add_one(); defined elsewhere.
 */
extern int rdsv3_ib_frag_constructor(void *buf, void *arg, int kmflags);
extern void rdsv3_ib_frag_destructor(void *buf, void *arg);
75 
76 void
77 rdsv3_ib_add_one(ib_device_t *device)
78 {
79 	struct rdsv3_ib_device *rds_ibdev;
80 	ibt_hca_attr_t *dev_attr;
81 	char name[64];
82 
83 	RDSV3_DPRINTF2("rdsv3_ib_add_one", "device: %p", device);
84 
85 	/* Only handle IB (no iWARP) devices */
86 	if (device->node_type != RDMA_NODE_IB_CA)
87 		return;
88 
89 	dev_attr = (ibt_hca_attr_t *)kmem_alloc(sizeof (*dev_attr),
90 	    KM_NOSLEEP);
91 	if (!dev_attr)
92 		return;
93 
94 	if (ibt_query_hca(ib_get_ibt_hca_hdl(device), dev_attr)) {
95 		RDSV3_DPRINTF2("rdsv3_ib_add_one",
96 		    "Query device failed for %s", device->name);
97 		goto free_attr;
98 	}
99 
100 	/* We depend on Reserved Lkey */
101 	if (!(dev_attr->hca_flags2 & IBT_HCA2_RES_LKEY)) {
102 		RDSV3_DPRINTF2("rdsv3_ib_add_one",
103 		    "Reserved Lkey support is required: %s",
104 		    device->name);
105 		goto free_attr;
106 	}
107 
108 	rds_ibdev = kmem_zalloc(sizeof (*rds_ibdev), KM_NOSLEEP);
109 	if (!rds_ibdev)
110 		goto free_attr;
111 
112 	rds_ibdev->ibt_hca_hdl = ib_get_ibt_hca_hdl(device);
113 	rds_ibdev->hca_attr =  *dev_attr;
114 
115 	rw_init(&rds_ibdev->rwlock, NULL, RW_DRIVER, NULL);
116 	mutex_init(&rds_ibdev->spinlock, NULL, MUTEX_DRIVER, NULL);
117 
118 	rds_ibdev->max_wrs = dev_attr->hca_max_chan_sz;
119 	rds_ibdev->max_sge = min(dev_attr->hca_max_sgl, RDSV3_IB_MAX_SGE);
120 
121 	rds_ibdev->max_initiator_depth = (uint_t)dev_attr->hca_max_rdma_in_qp;
122 	rds_ibdev->max_responder_resources =
123 	    (uint_t)dev_attr->hca_max_rdma_in_qp;
124 
125 	rds_ibdev->dev = device;
126 	rds_ibdev->pd = ib_alloc_pd(device);
127 	if (IS_ERR(rds_ibdev->pd))
128 		goto free_dev;
129 
130 	if (rdsv3_ib_create_mr_pool(rds_ibdev) != 0) {
131 		goto free_dev;
132 	}
133 
134 	if (rdsv3_ib_create_inc_pool(rds_ibdev) != 0) {
135 		rdsv3_ib_destroy_mr_pool(rds_ibdev);
136 		goto free_dev;
137 	}
138 
139 	(void) snprintf(name, 64, "RDSV3_IB_FRAG_%llx",
140 	    (longlong_t)htonll(dev_attr->hca_node_guid));
141 	rds_ibdev->ib_frag_slab = kmem_cache_create(name,
142 	    sizeof (struct rdsv3_page_frag), 0, rdsv3_ib_frag_constructor,
143 	    rdsv3_ib_frag_destructor, NULL, (void *)rds_ibdev, NULL, 0);
144 	if (rds_ibdev->ib_frag_slab == NULL) {
145 		RDSV3_DPRINTF2("rdsv3_ib_add_one",
146 		    "kmem_cache_create for ib_frag_slab failed for device: %s",
147 		    device->name);
148 		rdsv3_ib_destroy_mr_pool(rds_ibdev);
149 		rdsv3_ib_destroy_inc_pool(rds_ibdev);
150 		goto free_dev;
151 	}
152 
153 	rds_ibdev->aft_hcagp = rdsv3_af_grp_create(rds_ibdev->ibt_hca_hdl,
154 	    (uint64_t)rds_ibdev->hca_attr.hca_node_guid);
155 	if (rds_ibdev->aft_hcagp == NULL) {
156 		rdsv3_ib_destroy_mr_pool(rds_ibdev);
157 		rdsv3_ib_destroy_inc_pool(rds_ibdev);
158 		kmem_cache_destroy(rds_ibdev->ib_frag_slab);
159 		goto free_dev;
160 	}
161 	rds_ibdev->fmr_soft_cq = rdsv3_af_thr_create(rdsv3_ib_drain_mrlist_fn,
162 	    (void *)rds_ibdev->fmr_pool, SCQ_HCA_BIND_CPU,
163 	    rds_ibdev->aft_hcagp);
164 	if (rds_ibdev->fmr_soft_cq == NULL) {
165 		rdsv3_af_grp_destroy(rds_ibdev->aft_hcagp);
166 		rdsv3_ib_destroy_mr_pool(rds_ibdev);
167 		rdsv3_ib_destroy_inc_pool(rds_ibdev);
168 		kmem_cache_destroy(rds_ibdev->ib_frag_slab);
169 		goto free_dev;
170 	}
171 
172 	rds_ibdev->inc_soft_cq = rdsv3_af_thr_create(rdsv3_ib_drain_inclist,
173 	    (void *)rds_ibdev->inc_pool, SCQ_HCA_BIND_CPU,
174 	    rds_ibdev->aft_hcagp);
175 	if (rds_ibdev->inc_soft_cq == NULL) {
176 		rdsv3_af_thr_destroy(rds_ibdev->fmr_soft_cq);
177 		rdsv3_af_grp_destroy(rds_ibdev->aft_hcagp);
178 		rdsv3_ib_destroy_mr_pool(rds_ibdev);
179 		rdsv3_ib_destroy_inc_pool(rds_ibdev);
180 		kmem_cache_destroy(rds_ibdev->ib_frag_slab);
181 		goto free_dev;
182 	}
183 
184 	list_create(&rds_ibdev->ipaddr_list, sizeof (struct rdsv3_ib_ipaddr),
185 	    offsetof(struct rdsv3_ib_ipaddr, list));
186 	list_create(&rds_ibdev->conn_list, sizeof (struct rdsv3_ib_connection),
187 	    offsetof(struct rdsv3_ib_connection, ib_node));
188 
189 	list_insert_tail(&rdsv3_ib_devices, rds_ibdev);
190 
191 	ib_set_client_data(device, &rdsv3_ib_client, rds_ibdev);
192 
193 	RDSV3_DPRINTF2("rdsv3_ib_add_one", "Return: device: %p", device);
194 
195 	goto free_attr;
196 
197 err_pd:
198 	(void) ib_dealloc_pd(rds_ibdev->pd);
199 free_dev:
200 	mutex_destroy(&rds_ibdev->spinlock);
201 	rw_destroy(&rds_ibdev->rwlock);
202 	kmem_free(rds_ibdev, sizeof (*rds_ibdev));
203 free_attr:
204 	kmem_free(dev_attr, sizeof (*dev_attr));
205 }
206 
207 void
208 rdsv3_ib_remove_one(struct ib_device *device)
209 {
210 	struct rdsv3_ib_device *rds_ibdev;
211 	struct rdsv3_ib_ipaddr *i_ipaddr, *i_next;
212 
213 	RDSV3_DPRINTF2("rdsv3_ib_remove_one", "device: %p", device);
214 
215 	rds_ibdev = ib_get_client_data(device, &rdsv3_ib_client);
216 	if (!rds_ibdev)
217 		return;
218 
219 	RDSV3_FOR_EACH_LIST_NODE_SAFE(i_ipaddr, i_next, &rds_ibdev->ipaddr_list,
220 	    list) {
221 		list_remove_node(&i_ipaddr->list);
222 		kmem_free(i_ipaddr, sizeof (*i_ipaddr));
223 	}
224 
225 	rdsv3_ib_destroy_conns(rds_ibdev);
226 
227 	if (rds_ibdev->fmr_soft_cq)
228 		rdsv3_af_thr_destroy(rds_ibdev->fmr_soft_cq);
229 	if (rds_ibdev->inc_soft_cq)
230 		rdsv3_af_thr_destroy(rds_ibdev->inc_soft_cq);
231 
232 	rdsv3_ib_destroy_mr_pool(rds_ibdev);
233 	rdsv3_ib_destroy_inc_pool(rds_ibdev);
234 
235 	kmem_cache_destroy(rds_ibdev->ib_frag_slab);
236 
237 	rdsv3_af_grp_destroy(rds_ibdev->aft_hcagp);
238 
239 #if 0
240 	while (ib_dealloc_pd(rds_ibdev->pd)) {
241 #ifndef __lock_lint
242 		RDSV3_DPRINTF5("rdsv3_ib_remove_one",
243 		    "%s-%d Failed to dealloc pd %p",
244 		    __func__, __LINE__, rds_ibdev->pd);
245 #endif
246 		delay(drv_usectohz(1000));
247 	}
248 #else
249 	if (ib_dealloc_pd(rds_ibdev->pd)) {
250 #ifndef __lock_lint
251 		RDSV3_DPRINTF2("rdsv3_ib_remove_one",
252 		    "Failed to dealloc pd %p\n", rds_ibdev->pd);
253 #endif
254 	}
255 #endif
256 
257 	list_destroy(&rds_ibdev->ipaddr_list);
258 	list_destroy(&rds_ibdev->conn_list);
259 	list_remove_node(&rds_ibdev->list);
260 	mutex_destroy(&rds_ibdev->spinlock);
261 	rw_destroy(&rds_ibdev->rwlock);
262 	kmem_free(rds_ibdev, sizeof (*rds_ibdev));
263 
264 	RDSV3_DPRINTF2("rdsv3_ib_remove_one", "Return: device: %p", device);
265 }
266 
267 #ifndef __lock_lint
268 struct ib_client rdsv3_ib_client = {
269 	.name		= "rdsv3_ib",
270 	.add		= rdsv3_ib_add_one,
271 	.remove		= rdsv3_ib_remove_one,
272 	.clnt_hdl	= NULL,
273 	.state		= IB_CLNT_UNINITIALIZED
274 };
275 #else
276 struct ib_client rdsv3_ib_client = {
277 	"rdsv3_ib",
278 	rdsv3_ib_add_one,
279 	rdsv3_ib_remove_one,
280 	NULL,
281 	NULL,
282 	IB_CLNT_UNINITIALIZED
283 };
284 #endif
285 
286 static int
287 rds_ib_conn_info_visitor(struct rdsv3_connection *conn,
288     void *buffer)
289 {
290 	struct rdsv3_info_rdma_connection *iinfo = buffer;
291 	struct rdsv3_ib_connection *ic;
292 
293 	RDSV3_DPRINTF4("rds_ib_conn_info_visitor", "conn: %p buffer: %p",
294 	    conn, buffer);
295 
296 	/* We will only ever look at IB transports */
297 	if (conn->c_trans != &rdsv3_ib_transport)
298 		return (0);
299 
300 	iinfo->src_addr = conn->c_laddr;
301 	iinfo->dst_addr = conn->c_faddr;
302 
303 	(void) memset(&iinfo->src_gid, 0, sizeof (iinfo->src_gid));
304 	(void) memset(&iinfo->dst_gid, 0, sizeof (iinfo->dst_gid));
305 	if (rdsv3_conn_state(conn) == RDSV3_CONN_UP) {
306 		struct rdsv3_ib_device *rds_ibdev;
307 		struct rdma_dev_addr *dev_addr;
308 
309 		ic = conn->c_transport_data;
310 		dev_addr = &ic->i_cm_id->route.addr.dev_addr;
311 
312 		ib_addr_get_sgid(dev_addr, (union ib_gid *)&iinfo->src_gid);
313 		ib_addr_get_dgid(dev_addr, (union ib_gid *)&iinfo->dst_gid);
314 
315 		rds_ibdev = ib_get_client_data(ic->i_cm_id->device,
316 		    &rdsv3_ib_client);
317 		iinfo->max_send_wr = ic->i_send_ring.w_nr;
318 		iinfo->max_recv_wr = ic->i_recv_ring.w_nr;
319 		iinfo->max_send_sge = rds_ibdev->max_sge;
320 	}
321 
322 	RDSV3_DPRINTF4("rds_ib_conn_info_visitor", "conn: %p buffer: %p",
323 	    conn, buffer);
324 	return (1);
325 }
326 
327 static void
328 rds_ib_ic_info(struct rsock *sock, unsigned int len,
329     struct rdsv3_info_iterator *iter,
330     struct rdsv3_info_lengths *lens)
331 {
332 	RDSV3_DPRINTF4("rds_ib_ic_info", "sk: %p iter: %p, lens: %p, len: %d",
333 	    sock, iter, lens, len);
334 
335 	rdsv3_for_each_conn_info(sock, len, iter, lens,
336 	    rds_ib_conn_info_visitor,
337 	    sizeof (struct rdsv3_info_rdma_connection));
338 }
339 
340 /*
341  * Early RDS/IB was built to only bind to an address if there is an IPoIB
342  * device with that address set.
343  *
344  * If it were me, I'd advocate for something more flexible.  Sending and
345  * receiving should be device-agnostic.  Transports would try and maintain
346  * connections between peers who have messages queued.  Userspace would be
347  * allowed to influence which paths have priority.  We could call userspace
348  * asserting this policy "routing".
349  */
350 static int
351 rds_ib_laddr_check(uint32_be_t addr)
352 {
353 	int ret;
354 	struct rdma_cm_id *cm_id;
355 	struct sockaddr_in sin;
356 
357 	RDSV3_DPRINTF4("rds_ib_laddr_check", "addr: %x", ntohl(addr));
358 
359 	/*
360 	 * Create a CMA ID and try to bind it. This catches both
361 	 * IB and iWARP capable NICs.
362 	 */
363 	cm_id = rdma_create_id(NULL, NULL, RDMA_PS_TCP);
364 	if (!cm_id)
365 		return (-EADDRNOTAVAIL);
366 
367 	(void) memset(&sin, 0, sizeof (sin));
368 	sin.sin_family = AF_INET;
369 	sin.sin_addr.s_addr = rdsv3_scaddr_to_ibaddr(addr);
370 
371 	/* rdma_bind_addr will only succeed for IB & iWARP devices */
372 	ret = rdma_bind_addr(cm_id, (struct sockaddr *)&sin);
373 	/*
374 	 * due to this, we will claim to support iWARP devices unless we
375 	 * check node_type.
376 	 */
377 	if (ret || cm_id->device->node_type != RDMA_NODE_IB_CA)
378 		ret = -EADDRNOTAVAIL;
379 
380 	RDSV3_DPRINTF5("rds_ib_laddr_check",
381 	    "addr %u.%u.%u.%u ret %d node type %d",
382 	    NIPQUAD(addr), ret,
383 	    cm_id->device ? cm_id->device->node_type : -1);
384 
385 	rdma_destroy_id(cm_id);
386 
387 	return (ret);
388 }
389 
390 void
391 rdsv3_ib_exit(void)
392 {
393 	RDSV3_DPRINTF4("rds_ib_exit", "Enter");
394 
395 	rdsv3_info_deregister_func(RDSV3_INFO_IB_CONNECTIONS, rds_ib_ic_info);
396 	rdsv3_ib_destroy_nodev_conns();
397 	ib_unregister_client(&rdsv3_ib_client);
398 	rdsv3_ib_sysctl_exit();
399 	rdsv3_ib_recv_exit();
400 	rdsv3_trans_unregister(&rdsv3_ib_transport);
401 	mutex_destroy(&ib_nodev_conns_lock);
402 	list_destroy(&ib_nodev_conns);
403 	list_destroy(&rdsv3_ib_devices);
404 
405 	RDSV3_DPRINTF4("rds_ib_exit", "Return");
406 }
407 
408 #ifndef __lock_lint
409 struct rdsv3_transport rdsv3_ib_transport = {
410 	.laddr_check		= rds_ib_laddr_check,
411 	.xmit_complete		= rdsv3_ib_xmit_complete,
412 	.xmit			= rdsv3_ib_xmit,
413 	.xmit_cong_map		= NULL,
414 	.xmit_rdma		= rdsv3_ib_xmit_rdma,
415 	.recv			= rdsv3_ib_recv,
416 	.conn_alloc		= rdsv3_ib_conn_alloc,
417 	.conn_free		= rdsv3_ib_conn_free,
418 	.conn_connect		= rdsv3_ib_conn_connect,
419 	.conn_shutdown		= rdsv3_ib_conn_shutdown,
420 	.inc_copy_to_user	= rdsv3_ib_inc_copy_to_user,
421 	.inc_free		= rdsv3_ib_inc_free,
422 	.cm_initiate_connect	= rdsv3_ib_cm_initiate_connect,
423 	.cm_handle_connect	= rdsv3_ib_cm_handle_connect,
424 	.cm_connect_complete	= rdsv3_ib_cm_connect_complete,
425 	.stats_info_copy	= rdsv3_ib_stats_info_copy,
426 	.exit			= rdsv3_ib_exit,
427 	.get_mr			= rdsv3_ib_get_mr,
428 	.sync_mr		= rdsv3_ib_sync_mr,
429 	.free_mr		= rdsv3_ib_free_mr,
430 	.flush_mrs		= rdsv3_ib_flush_mrs,
431 	.t_name			= "infiniband",
432 	.t_type			= RDS_TRANS_IB
433 };
434 #else
435 struct rdsv3_transport rdsv3_ib_transport;
436 #endif
437 
438 int
439 rdsv3_ib_init(void)
440 {
441 	int ret;
442 
443 	RDSV3_DPRINTF4("rds_ib_init", "Enter");
444 
445 	list_create(&rdsv3_ib_devices, sizeof (struct rdsv3_ib_device),
446 	    offsetof(struct rdsv3_ib_device, list));
447 	list_create(&ib_nodev_conns, sizeof (struct rdsv3_ib_connection),
448 	    offsetof(struct rdsv3_ib_connection, ib_node));
449 	mutex_init(&ib_nodev_conns_lock, NULL, MUTEX_DRIVER, NULL);
450 
451 	rdsv3_ib_client.dip = rdsv3_dev_info;
452 	ret = ib_register_client(&rdsv3_ib_client);
453 	if (ret)
454 		goto out;
455 
456 	ret = rdsv3_ib_sysctl_init();
457 	if (ret)
458 		goto out_ibreg;
459 
460 	ret = rdsv3_ib_recv_init();
461 	if (ret)
462 		goto out_sysctl;
463 
464 	ret = rdsv3_trans_register(&rdsv3_ib_transport);
465 	if (ret)
466 		goto out_recv;
467 
468 	rdsv3_info_register_func(RDSV3_INFO_IB_CONNECTIONS, rds_ib_ic_info);
469 
470 	RDSV3_DPRINTF4("rds_ib_init", "Return");
471 
472 	return (0);
473 
474 out_recv:
475 	rdsv3_ib_recv_exit();
476 out_sysctl:
477 	rdsv3_ib_sysctl_exit();
478 out_ibreg:
479 	ib_unregister_client(&rdsv3_ib_client);
480 out:
481 	mutex_destroy(&ib_nodev_conns_lock);
482 	list_destroy(&ib_nodev_conns);
483 	list_destroy(&rdsv3_ib_devices);
484 	return (ret);
485 }
486