xref: /illumos-gate/usr/src/uts/common/io/ib/clients/rdsv3/ib.c (revision ca082a315a09d463643bfd5cae755e9a04b74904)
1 /*
2  * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
3  */
4 
5 /*
6  * This file contains code imported from the OFED rds source file ib.c
7  * Oracle elects to have and use the contents of ib.c under and governed
8  * by the OpenIB.org BSD license (see below for full license text). However,
9  * the following notice accompanied the original version of this file:
10  */
11 
12 /*
13  * Copyright (c) 2006 Oracle.  All rights reserved.
14  *
15  * This software is available to you under a choice of one of two
16  * licenses.  You may choose to be licensed under the terms of the GNU
17  * General Public License (GPL) Version 2, available from the file
18  * COPYING in the main directory of this source tree, or the
19  * OpenIB.org BSD license below:
20  *
21  *     Redistribution and use in source and binary forms, with or
22  *     without modification, are permitted provided that the following
23  *     conditions are met:
24  *
25  *      - Redistributions of source code must retain the above
26  *        copyright notice, this list of conditions and the following
27  *        disclaimer.
28  *
29  *      - Redistributions in binary form must reproduce the above
30  *        copyright notice, this list of conditions and the following
31  *        disclaimer in the documentation and/or other materials
32  *        provided with the distribution.
33  *
34  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
35  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
36  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
37  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
38  * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
39  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
40  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
41  * SOFTWARE.
42  *
43  */
44 #include <sys/sysmacros.h>
45 #include <sys/rds.h>
46 
47 #include <sys/ib/ibtl/ibti.h>
48 #include <sys/ib/clients/rdsv3/rdsv3.h>
49 #include <sys/ib/clients/rdsv3/ib.h>
50 #include <sys/ib/clients/rdsv3/rdsv3_debug.h>
51 
52 unsigned int rdsv3_ib_retry_count = RDSV3_IB_DEFAULT_RETRY_COUNT;
53 
54 struct list	rdsv3_ib_devices;
55 
56 /* NOTE: if also grabbing ibdev lock, grab this first */
57 kmutex_t ib_nodev_conns_lock;
58 list_t ib_nodev_conns;
59 
60 extern int rdsv3_ib_frag_constructor(void *buf, void *arg, int kmflags);
61 extern void rdsv3_ib_frag_destructor(void *buf, void *arg);
62 
63 void
64 rdsv3_ib_add_one(ib_device_t *device)
65 {
66 	struct rdsv3_ib_device *rds_ibdev;
67 	ibt_hca_attr_t *dev_attr;
68 	char name[64];
69 
70 	RDSV3_DPRINTF2("rdsv3_ib_add_one", "device: %p", device);
71 
72 	/* Only handle IB (no iWARP) devices */
73 	if (device->node_type != RDMA_NODE_IB_CA)
74 		return;
75 
76 	dev_attr = (ibt_hca_attr_t *)kmem_alloc(sizeof (*dev_attr),
77 	    KM_NOSLEEP);
78 	if (!dev_attr)
79 		return;
80 
81 	if (ibt_query_hca(ib_get_ibt_hca_hdl(device), dev_attr)) {
82 		RDSV3_DPRINTF2("rdsv3_ib_add_one",
83 		    "Query device failed for %s", device->name);
84 		goto free_attr;
85 	}
86 
87 	/* We depend on Reserved Lkey */
88 	if (!(dev_attr->hca_flags2 & IBT_HCA2_RES_LKEY)) {
89 		RDSV3_DPRINTF2("rdsv3_ib_add_one",
90 		    "Reserved Lkey support is required: %s",
91 		    device->name);
92 		goto free_attr;
93 	}
94 
95 	rds_ibdev = kmem_zalloc(sizeof (*rds_ibdev), KM_NOSLEEP);
96 	if (!rds_ibdev)
97 		goto free_attr;
98 
99 	rds_ibdev->ibt_hca_hdl = ib_get_ibt_hca_hdl(device);
100 	rds_ibdev->hca_attr =  *dev_attr;
101 
102 	rw_init(&rds_ibdev->rwlock, NULL, RW_DRIVER, NULL);
103 	mutex_init(&rds_ibdev->spinlock, NULL, MUTEX_DRIVER, NULL);
104 
105 	rds_ibdev->max_wrs = dev_attr->hca_max_chan_sz;
106 	rds_ibdev->max_sge = min(dev_attr->hca_max_sgl, RDSV3_IB_MAX_SGE);
107 
108 	rds_ibdev->max_initiator_depth = (uint_t)dev_attr->hca_max_rdma_in_qp;
109 	rds_ibdev->max_responder_resources =
110 	    (uint_t)dev_attr->hca_max_rdma_in_qp;
111 
112 	rds_ibdev->dev = device;
113 	rds_ibdev->pd = ib_alloc_pd(device);
114 	if (IS_ERR(rds_ibdev->pd))
115 		goto free_dev;
116 
117 	if (rdsv3_ib_create_mr_pool(rds_ibdev) != 0) {
118 		goto free_dev;
119 	}
120 
121 	if (rdsv3_ib_create_inc_pool(rds_ibdev) != 0) {
122 		rdsv3_ib_destroy_mr_pool(rds_ibdev);
123 		goto free_dev;
124 	}
125 
126 	(void) snprintf(name, 64, "RDSV3_IB_FRAG_%llx",
127 	    (longlong_t)htonll(dev_attr->hca_node_guid));
128 	rds_ibdev->ib_frag_slab = kmem_cache_create(name,
129 	    sizeof (struct rdsv3_page_frag), 0, rdsv3_ib_frag_constructor,
130 	    rdsv3_ib_frag_destructor, NULL, (void *)rds_ibdev, NULL, 0);
131 	if (rds_ibdev->ib_frag_slab == NULL) {
132 		RDSV3_DPRINTF2("rdsv3_ib_add_one",
133 		    "kmem_cache_create for ib_frag_slab failed for device: %s",
134 		    device->name);
135 		rdsv3_ib_destroy_mr_pool(rds_ibdev);
136 		rdsv3_ib_destroy_inc_pool(rds_ibdev);
137 		goto free_dev;
138 	}
139 
140 	rds_ibdev->aft_hcagp = rdsv3_af_grp_create(rds_ibdev->ibt_hca_hdl,
141 	    (uint64_t)rds_ibdev->hca_attr.hca_node_guid);
142 	if (rds_ibdev->aft_hcagp == NULL) {
143 		rdsv3_ib_destroy_mr_pool(rds_ibdev);
144 		rdsv3_ib_destroy_inc_pool(rds_ibdev);
145 		kmem_cache_destroy(rds_ibdev->ib_frag_slab);
146 		goto free_dev;
147 	}
148 	rds_ibdev->fmr_soft_cq = rdsv3_af_thr_create(rdsv3_ib_drain_mrlist_fn,
149 	    (void *)rds_ibdev->fmr_pool, SCQ_HCA_BIND_CPU,
150 	    rds_ibdev->aft_hcagp);
151 	if (rds_ibdev->fmr_soft_cq == NULL) {
152 		rdsv3_af_grp_destroy(rds_ibdev->aft_hcagp);
153 		rdsv3_ib_destroy_mr_pool(rds_ibdev);
154 		rdsv3_ib_destroy_inc_pool(rds_ibdev);
155 		kmem_cache_destroy(rds_ibdev->ib_frag_slab);
156 		goto free_dev;
157 	}
158 
159 	rds_ibdev->inc_soft_cq = rdsv3_af_thr_create(rdsv3_ib_drain_inclist,
160 	    (void *)rds_ibdev->inc_pool, SCQ_HCA_BIND_CPU,
161 	    rds_ibdev->aft_hcagp);
162 	if (rds_ibdev->inc_soft_cq == NULL) {
163 		rdsv3_af_thr_destroy(rds_ibdev->fmr_soft_cq);
164 		rdsv3_af_grp_destroy(rds_ibdev->aft_hcagp);
165 		rdsv3_ib_destroy_mr_pool(rds_ibdev);
166 		rdsv3_ib_destroy_inc_pool(rds_ibdev);
167 		kmem_cache_destroy(rds_ibdev->ib_frag_slab);
168 		goto free_dev;
169 	}
170 
171 	list_create(&rds_ibdev->ipaddr_list, sizeof (struct rdsv3_ib_ipaddr),
172 	    offsetof(struct rdsv3_ib_ipaddr, list));
173 	list_create(&rds_ibdev->conn_list, sizeof (struct rdsv3_ib_connection),
174 	    offsetof(struct rdsv3_ib_connection, ib_node));
175 
176 	list_insert_tail(&rdsv3_ib_devices, rds_ibdev);
177 
178 	ib_set_client_data(device, &rdsv3_ib_client, rds_ibdev);
179 
180 	RDSV3_DPRINTF2("rdsv3_ib_add_one", "Return: device: %p", device);
181 
182 	goto free_attr;
183 
184 err_pd:
185 	(void) ib_dealloc_pd(rds_ibdev->pd);
186 free_dev:
187 	mutex_destroy(&rds_ibdev->spinlock);
188 	rw_destroy(&rds_ibdev->rwlock);
189 	kmem_free(rds_ibdev, sizeof (*rds_ibdev));
190 free_attr:
191 	kmem_free(dev_attr, sizeof (*dev_attr));
192 }
193 
194 void
195 rdsv3_ib_remove_one(struct ib_device *device)
196 {
197 	struct rdsv3_ib_device *rds_ibdev;
198 	struct rdsv3_ib_ipaddr *i_ipaddr, *i_next;
199 
200 	RDSV3_DPRINTF2("rdsv3_ib_remove_one", "device: %p", device);
201 
202 	rds_ibdev = ib_get_client_data(device, &rdsv3_ib_client);
203 	if (!rds_ibdev)
204 		return;
205 
206 	RDSV3_FOR_EACH_LIST_NODE_SAFE(i_ipaddr, i_next, &rds_ibdev->ipaddr_list,
207 	    list) {
208 		list_remove_node(&i_ipaddr->list);
209 		kmem_free(i_ipaddr, sizeof (*i_ipaddr));
210 	}
211 
212 	rdsv3_ib_destroy_conns(rds_ibdev);
213 
214 	if (rds_ibdev->fmr_soft_cq)
215 		rdsv3_af_thr_destroy(rds_ibdev->fmr_soft_cq);
216 	if (rds_ibdev->inc_soft_cq)
217 		rdsv3_af_thr_destroy(rds_ibdev->inc_soft_cq);
218 
219 	rdsv3_ib_destroy_mr_pool(rds_ibdev);
220 	rdsv3_ib_destroy_inc_pool(rds_ibdev);
221 
222 	kmem_cache_destroy(rds_ibdev->ib_frag_slab);
223 
224 	rdsv3_af_grp_destroy(rds_ibdev->aft_hcagp);
225 
226 #if 0
227 	while (ib_dealloc_pd(rds_ibdev->pd)) {
228 #ifndef __lock_lint
229 		RDSV3_DPRINTF5("rdsv3_ib_remove_one",
230 		    "%s-%d Failed to dealloc pd %p",
231 		    __func__, __LINE__, rds_ibdev->pd);
232 #endif
233 		delay(drv_usectohz(1000));
234 	}
235 #else
236 	if (ib_dealloc_pd(rds_ibdev->pd)) {
237 #ifndef __lock_lint
238 		RDSV3_DPRINTF2("rdsv3_ib_remove_one",
239 		    "Failed to dealloc pd %p\n", rds_ibdev->pd);
240 #endif
241 	}
242 #endif
243 
244 	list_destroy(&rds_ibdev->ipaddr_list);
245 	list_destroy(&rds_ibdev->conn_list);
246 	list_remove_node(&rds_ibdev->list);
247 	mutex_destroy(&rds_ibdev->spinlock);
248 	rw_destroy(&rds_ibdev->rwlock);
249 	kmem_free(rds_ibdev, sizeof (*rds_ibdev));
250 
251 	RDSV3_DPRINTF2("rdsv3_ib_remove_one", "Return: device: %p", device);
252 }
253 
254 #ifndef __lock_lint
255 struct ib_client rdsv3_ib_client = {
256 	.name		= "rdsv3_ib",
257 	.add		= rdsv3_ib_add_one,
258 	.remove		= rdsv3_ib_remove_one,
259 	.clnt_hdl	= NULL,
260 	.state		= IB_CLNT_UNINITIALIZED
261 };
262 #else
263 struct ib_client rdsv3_ib_client = {
264 	"rdsv3_ib",
265 	rdsv3_ib_add_one,
266 	rdsv3_ib_remove_one,
267 	NULL,
268 	NULL,
269 	IB_CLNT_UNINITIALIZED
270 };
271 #endif
272 
273 static int
274 rds_ib_conn_info_visitor(struct rdsv3_connection *conn,
275     void *buffer)
276 {
277 	struct rds_info_rdma_connection *iinfo = buffer;
278 	struct rdsv3_ib_connection *ic;
279 
280 	RDSV3_DPRINTF4("rds_ib_conn_info_visitor", "conn: %p buffer: %p",
281 	    conn, buffer);
282 
283 	/* We will only ever look at IB transports */
284 	if (conn->c_trans != &rdsv3_ib_transport)
285 		return (0);
286 
287 	iinfo->src_addr = conn->c_laddr;
288 	iinfo->dst_addr = conn->c_faddr;
289 
290 	(void) memset(&iinfo->src_gid, 0, sizeof (iinfo->src_gid));
291 	(void) memset(&iinfo->dst_gid, 0, sizeof (iinfo->dst_gid));
292 	if (rdsv3_conn_state(conn) == RDSV3_CONN_UP) {
293 		struct rdsv3_ib_device *rds_ibdev;
294 		struct rdma_dev_addr *dev_addr;
295 
296 		ic = conn->c_transport_data;
297 		dev_addr = &ic->i_cm_id->route.addr.dev_addr;
298 
299 		ib_addr_get_sgid(dev_addr, (union ib_gid *)&iinfo->src_gid);
300 		ib_addr_get_dgid(dev_addr, (union ib_gid *)&iinfo->dst_gid);
301 
302 		rds_ibdev = ib_get_client_data(ic->i_cm_id->device,
303 		    &rdsv3_ib_client);
304 		iinfo->max_send_wr = ic->i_send_ring.w_nr;
305 		iinfo->max_recv_wr = ic->i_recv_ring.w_nr;
306 		iinfo->max_send_sge = rds_ibdev->max_sge;
307 	}
308 
309 	RDSV3_DPRINTF4("rds_ib_conn_info_visitor", "conn: %p buffer: %p",
310 	    conn, buffer);
311 	return (1);
312 }
313 
314 static void
315 rds_ib_ic_info(struct rsock *sock, unsigned int len,
316     struct rdsv3_info_iterator *iter,
317     struct rdsv3_info_lengths *lens)
318 {
319 	RDSV3_DPRINTF4("rds_ib_ic_info", "sk: %p iter: %p, lens: %p, len: %d",
320 	    sock, iter, lens, len);
321 
322 	rdsv3_for_each_conn_info(sock, len, iter, lens,
323 	    rds_ib_conn_info_visitor,
324 	    sizeof (struct rds_info_rdma_connection));
325 }
326 
327 /*
328  * Early RDS/IB was built to only bind to an address if there is an IPoIB
329  * device with that address set.
330  *
331  * If it were me, I'd advocate for something more flexible.  Sending and
332  * receiving should be device-agnostic.  Transports would try and maintain
333  * connections between peers who have messages queued.  Userspace would be
334  * allowed to influence which paths have priority.  We could call userspace
335  * asserting this policy "routing".
336  */
337 static int
338 rds_ib_laddr_check(uint32_be_t addr)
339 {
340 	int ret;
341 	struct rdma_cm_id *cm_id;
342 	struct sockaddr_in sin;
343 
344 	RDSV3_DPRINTF4("rds_ib_laddr_check", "addr: %x", ntohl(addr));
345 
346 	/*
347 	 * Create a CMA ID and try to bind it. This catches both
348 	 * IB and iWARP capable NICs.
349 	 */
350 	cm_id = rdma_create_id(NULL, NULL, RDMA_PS_TCP);
351 	if (!cm_id)
352 		return (-EADDRNOTAVAIL);
353 
354 	(void) memset(&sin, 0, sizeof (sin));
355 	sin.sin_family = AF_INET;
356 	sin.sin_addr.s_addr = rdsv3_scaddr_to_ibaddr(addr);
357 
358 	/* rdma_bind_addr will only succeed for IB & iWARP devices */
359 	ret = rdma_bind_addr(cm_id, (struct sockaddr *)&sin);
360 	/*
361 	 * due to this, we will claim to support iWARP devices unless we
362 	 * check node_type.
363 	 */
364 	if (ret || cm_id->device->node_type != RDMA_NODE_IB_CA)
365 		ret = -EADDRNOTAVAIL;
366 
367 	RDSV3_DPRINTF5("rds_ib_laddr_check",
368 	    "addr %u.%u.%u.%u ret %d node type %d",
369 	    NIPQUAD(addr), ret,
370 	    cm_id->device ? cm_id->device->node_type : -1);
371 
372 	rdma_destroy_id(cm_id);
373 
374 	return (ret);
375 }
376 
377 void
378 rdsv3_ib_exit(void)
379 {
380 	RDSV3_DPRINTF4("rds_ib_exit", "Enter");
381 
382 	rdsv3_info_deregister_func(RDS_INFO_IB_CONNECTIONS, rds_ib_ic_info);
383 	rdsv3_ib_destroy_nodev_conns();
384 	ib_unregister_client(&rdsv3_ib_client);
385 	rdsv3_ib_sysctl_exit();
386 	rdsv3_ib_recv_exit();
387 	rdsv3_trans_unregister(&rdsv3_ib_transport);
388 	kmem_free(rdsv3_ib_stats,
389 	    nr_cpus * sizeof (struct rdsv3_ib_statistics));
390 	mutex_destroy(&ib_nodev_conns_lock);
391 	list_destroy(&ib_nodev_conns);
392 	list_destroy(&rdsv3_ib_devices);
393 
394 	RDSV3_DPRINTF4("rds_ib_exit", "Return");
395 }
396 
397 #ifndef __lock_lint
398 struct rdsv3_transport rdsv3_ib_transport = {
399 	.laddr_check		= rds_ib_laddr_check,
400 	.xmit_complete		= rdsv3_ib_xmit_complete,
401 	.xmit			= rdsv3_ib_xmit,
402 	.xmit_cong_map		= NULL,
403 	.xmit_rdma		= rdsv3_ib_xmit_rdma,
404 	.recv			= rdsv3_ib_recv,
405 	.conn_alloc		= rdsv3_ib_conn_alloc,
406 	.conn_free		= rdsv3_ib_conn_free,
407 	.conn_connect		= rdsv3_ib_conn_connect,
408 	.conn_shutdown		= rdsv3_ib_conn_shutdown,
409 	.inc_copy_to_user	= rdsv3_ib_inc_copy_to_user,
410 	.inc_free		= rdsv3_ib_inc_free,
411 	.cm_initiate_connect	= rdsv3_ib_cm_initiate_connect,
412 	.cm_handle_connect	= rdsv3_ib_cm_handle_connect,
413 	.cm_connect_complete	= rdsv3_ib_cm_connect_complete,
414 	.stats_info_copy	= rdsv3_ib_stats_info_copy,
415 	.exit			= rdsv3_ib_exit,
416 	.get_mr			= rdsv3_ib_get_mr,
417 	.sync_mr		= rdsv3_ib_sync_mr,
418 	.free_mr		= rdsv3_ib_free_mr,
419 	.flush_mrs		= rdsv3_ib_flush_mrs,
420 	.t_name			= "infiniband",
421 	.t_type			= RDS_TRANS_IB
422 };
423 #else
424 struct rdsv3_transport rdsv3_ib_transport;
425 #endif
426 
427 int
428 rdsv3_ib_init(void)
429 {
430 	int ret;
431 
432 	RDSV3_DPRINTF4("rds_ib_init", "Enter");
433 
434 	list_create(&rdsv3_ib_devices, sizeof (struct rdsv3_ib_device),
435 	    offsetof(struct rdsv3_ib_device, list));
436 	list_create(&ib_nodev_conns, sizeof (struct rdsv3_ib_connection),
437 	    offsetof(struct rdsv3_ib_connection, ib_node));
438 	mutex_init(&ib_nodev_conns_lock, NULL, MUTEX_DRIVER, NULL);
439 
440 	/* allocate space for ib statistics */
441 	ASSERT(rdsv3_ib_stats == NULL);
442 	rdsv3_ib_stats = kmem_zalloc(nr_cpus *
443 	    sizeof (struct rdsv3_ib_statistics), KM_SLEEP);
444 
445 	rdsv3_ib_client.dip = rdsv3_dev_info;
446 	ret = ib_register_client(&rdsv3_ib_client);
447 	if (ret)
448 		goto out;
449 
450 	ret = rdsv3_ib_sysctl_init();
451 	if (ret)
452 		goto out_ibreg;
453 
454 	ret = rdsv3_ib_recv_init();
455 	if (ret)
456 		goto out_sysctl;
457 
458 	ret = rdsv3_trans_register(&rdsv3_ib_transport);
459 	if (ret)
460 		goto out_recv;
461 
462 	rdsv3_info_register_func(RDS_INFO_IB_CONNECTIONS, rds_ib_ic_info);
463 
464 	RDSV3_DPRINTF4("rds_ib_init", "Return");
465 
466 	return (0);
467 
468 out_recv:
469 	rdsv3_ib_recv_exit();
470 out_sysctl:
471 	rdsv3_ib_sysctl_exit();
472 out_ibreg:
473 	ib_unregister_client(&rdsv3_ib_client);
474 out:
475 	kmem_free(rdsv3_ib_stats,
476 	    nr_cpus * sizeof (struct rdsv3_ib_statistics));
477 	mutex_destroy(&ib_nodev_conns_lock);
478 	list_destroy(&ib_nodev_conns);
479 	list_destroy(&rdsv3_ib_devices);
480 	return (ret);
481 }
482