xref: /illumos-gate/usr/src/uts/common/io/ib/clients/rdsv3/ib.c (revision b1d7ec75953cd517f5b7c3d9cb427ff8ec5d7d07)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
23  */
24 
25 /*
26  * Copyright (c) 2006 Oracle.  All rights reserved.
27  *
28  * This software is available to you under a choice of one of two
29  * licenses.  You may choose to be licensed under the terms of the GNU
30  * General Public License (GPL) Version 2, available from the file
31  * COPYING in the main directory of this source tree, or the
32  * OpenIB.org BSD license below:
33  *
34  *     Redistribution and use in source and binary forms, with or
35  *     without modification, are permitted provided that the following
36  *     conditions are met:
37  *
38  *      - Redistributions of source code must retain the above
39  *        copyright notice, this list of conditions and the following
40  *        disclaimer.
41  *
42  *      - Redistributions in binary form must reproduce the above
43  *        copyright notice, this list of conditions and the following
44  *        disclaimer in the documentation and/or other materials
45  *        provided with the distribution.
46  *
47  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
48  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
49  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
50  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
51  * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
52  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
53  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
54  * SOFTWARE.
55  *
56  */
57 #include <sys/sysmacros.h>
58 #include <sys/rds.h>
59 
60 #include <sys/ib/ibtl/ibti.h>
61 #include <sys/ib/clients/rdsv3/rdsv3.h>
62 #include <sys/ib/clients/rdsv3/ib.h>
63 #include <sys/ib/clients/rdsv3/rdsv3_debug.h>
64 
/*
 * Tunable initialised to the compiled-in default.  It is not referenced
 * again in this file; presumably connection setup code consumes it --
 * TODO confirm.
 */
unsigned int rdsv3_ib_retry_count = RDSV3_IB_DEFAULT_RETRY_COUNT;

/*
 * List of struct rdsv3_ib_device, one per HCA accepted by
 * rdsv3_ib_add_one().  Created in rdsv3_ib_init() and destroyed in
 * rdsv3_ib_exit().
 */
struct list	rdsv3_ib_devices;

/* NOTE: if also grabbing ibdev lock, grab this first */
kmutex_t ib_nodev_conns_lock;
/*
 * List of struct rdsv3_ib_connection linked through ib_node; created in
 * rdsv3_ib_init().  Judging by the name it holds connections that are not
 * attached to any device -- confirm against the connection code.
 */
list_t ib_nodev_conns;

/*
 * Constructor/destructor pair for the per-device fragment kmem cache that
 * rdsv3_ib_add_one() creates; they are defined outside this file.
 */
extern int rdsv3_ib_frag_constructor(void *buf, void *arg, int kmflags);
extern void rdsv3_ib_frag_destructor(void *buf, void *arg);
75 
76 void
77 rdsv3_ib_add_one(ib_device_t *device)
78 {
79 	struct rdsv3_ib_device *rds_ibdev;
80 	ibt_hca_attr_t *dev_attr;
81 	char name[64];
82 
83 	RDSV3_DPRINTF4("rdsv3_ib_add_one", "device: %p", device);
84 
85 	/* Only handle IB (no iWARP) devices */
86 	if (device->node_type != RDMA_NODE_IB_CA)
87 		return;
88 
89 	dev_attr = (ibt_hca_attr_t *)kmem_alloc(sizeof (*dev_attr),
90 	    KM_NOSLEEP);
91 	if (!dev_attr)
92 		return;
93 
94 	if (ibt_query_hca(ib_get_ibt_hca_hdl(device), dev_attr)) {
95 		RDSV3_DPRINTF2("rdsv3_ib_add_one",
96 		    "Query device failed for %s", device->name);
97 		goto free_attr;
98 	}
99 
100 	/* We depend on Reserved Lkey */
101 	if (!(dev_attr->hca_flags2 & IBT_HCA2_RES_LKEY)) {
102 		RDSV3_DPRINTF2("rdsv3_ib_add_one",
103 		    "Reserved Lkey support is required: %s",
104 		    device->name);
105 		goto free_attr;
106 	}
107 
108 	rds_ibdev = kmem_zalloc(sizeof (*rds_ibdev), KM_NOSLEEP);
109 	if (!rds_ibdev)
110 		goto free_attr;
111 
112 	mutex_init(&rds_ibdev->spinlock, NULL, MUTEX_DRIVER, NULL);
113 
114 	rds_ibdev->max_wrs = dev_attr->hca_max_chan_sz;
115 	rds_ibdev->max_sge = min(dev_attr->hca_max_sgl, RDSV3_IB_MAX_SGE);
116 
117 	rds_ibdev->dev = device;
118 	rds_ibdev->pd = ib_alloc_pd(device);
119 	if (IS_ERR(rds_ibdev->pd))
120 		goto free_dev;
121 
122 	if (rdsv3_ib_create_mr_pool(rds_ibdev) != 0) {
123 		goto free_dev;
124 	}
125 
126 	(void) snprintf(name, 64, "RDSV3_IB_FRAG_%llx",
127 	    (longlong_t)htonll(dev_attr->hca_node_guid));
128 	rds_ibdev->ib_frag_slab = kmem_cache_create(name,
129 	    sizeof (struct rdsv3_page_frag), 0, rdsv3_ib_frag_constructor,
130 	    rdsv3_ib_frag_destructor, NULL, (void *)rds_ibdev, NULL, 0);
131 	if (rds_ibdev->ib_frag_slab == NULL) {
132 		RDSV3_DPRINTF2("rdsv3_ib_add_one",
133 		    "kmem_cache_create for ib_frag_slab failed for device: %s",
134 		    device->name);
135 		rdsv3_ib_destroy_mr_pool(rds_ibdev);
136 		goto free_dev;
137 	}
138 
139 
140 	list_create(&rds_ibdev->ipaddr_list, sizeof (struct rdsv3_ib_ipaddr),
141 	    offsetof(struct rdsv3_ib_ipaddr, list));
142 	list_create(&rds_ibdev->conn_list, sizeof (struct rdsv3_ib_connection),
143 	    offsetof(struct rdsv3_ib_connection, ib_node));
144 
145 	list_insert_tail(&rdsv3_ib_devices, rds_ibdev);
146 
147 	ib_set_client_data(device, &rdsv3_ib_client, rds_ibdev);
148 
149 	RDSV3_DPRINTF4("rdsv3_ib_add_one", "Return: device: %p", device);
150 
151 	goto free_attr;
152 
153 err_pd:
154 	(void) ib_dealloc_pd(rds_ibdev->pd);
155 free_dev:
156 	kmem_free(rds_ibdev, sizeof (*rds_ibdev));
157 free_attr:
158 	kmem_free(dev_attr, sizeof (*dev_attr));
159 }
160 
161 void
162 rdsv3_ib_remove_one(struct ib_device *device)
163 {
164 	struct rdsv3_ib_device *rds_ibdev;
165 	struct rdsv3_ib_ipaddr *i_ipaddr, *i_next;
166 
167 	RDSV3_DPRINTF4("rdsv3_ib_remove_one", "device: %p", device);
168 
169 	rds_ibdev = ib_get_client_data(device, &rdsv3_ib_client);
170 	if (!rds_ibdev)
171 		return;
172 
173 	RDSV3_FOR_EACH_LIST_NODE_SAFE(i_ipaddr, i_next, &rds_ibdev->ipaddr_list,
174 	    list) {
175 		list_remove_node(&i_ipaddr->list);
176 		kmem_free(i_ipaddr, sizeof (*i_ipaddr));
177 	}
178 
179 	rdsv3_ib_destroy_conns(rds_ibdev);
180 
181 	rdsv3_ib_destroy_mr_pool(rds_ibdev);
182 
183 	kmem_cache_destroy(rds_ibdev->ib_frag_slab);
184 
185 #if 0
186 	while (ib_dealloc_pd(rds_ibdev->pd)) {
187 #ifndef __lock_lint
188 		RDSV3_DPRINTF5("rdsv3_ib_remove_one",
189 		    "%s-%d Failed to dealloc pd %p",
190 		    __func__, __LINE__, rds_ibdev->pd);
191 #endif
192 		delay(drv_usectohz(1000));
193 	}
194 #else
195 	if (ib_dealloc_pd(rds_ibdev->pd)) {
196 #ifndef __lock_lint
197 		RDSV3_DPRINTF2("rdsv3_ib_remove_one",
198 		    "Failed to dealloc pd %p\n", rds_ibdev->pd);
199 #endif
200 	}
201 #endif
202 
203 	list_destroy(&rds_ibdev->ipaddr_list);
204 	list_destroy(&rds_ibdev->conn_list);
205 	list_remove_node(&rds_ibdev->list);
206 	kmem_free(rds_ibdev, sizeof (*rds_ibdev));
207 
208 	RDSV3_DPRINTF4("rdsv3_ib_remove_one", "Return: device: %p", device);
209 }
210 
211 #ifndef __lock_lint
212 struct ib_client rdsv3_ib_client = {
213 	.name		= "rdsv3_ib",
214 	.add		= rdsv3_ib_add_one,
215 	.remove		= rdsv3_ib_remove_one,
216 	.clnt_hdl	= NULL,
217 	.state		= IB_CLNT_UNINITIALIZED
218 };
219 #else
220 struct ib_client rdsv3_ib_client = {
221 	"rdsv3_ib",
222 	rdsv3_ib_add_one,
223 	rdsv3_ib_remove_one,
224 	NULL,
225 	NULL,
226 	IB_CLNT_UNINITIALIZED
227 };
228 #endif
229 
230 static int
231 rds_ib_conn_info_visitor(struct rdsv3_connection *conn,
232     void *buffer)
233 {
234 	struct rdsv3_info_rdma_connection *iinfo = buffer;
235 	struct rdsv3_ib_connection *ic;
236 
237 	RDSV3_DPRINTF4("rds_ib_conn_info_visitor", "conn: %p buffer: %p",
238 	    conn, buffer);
239 
240 	/* We will only ever look at IB transports */
241 	if (conn->c_trans != &rdsv3_ib_transport)
242 		return (0);
243 
244 	iinfo->src_addr = conn->c_laddr;
245 	iinfo->dst_addr = conn->c_faddr;
246 
247 	(void) memset(&iinfo->src_gid, 0, sizeof (iinfo->src_gid));
248 	(void) memset(&iinfo->dst_gid, 0, sizeof (iinfo->dst_gid));
249 	if (rdsv3_conn_state(conn) == RDSV3_CONN_UP) {
250 		struct rdsv3_ib_device *rds_ibdev;
251 		struct rdma_dev_addr *dev_addr;
252 
253 		ic = conn->c_transport_data;
254 		dev_addr = &ic->i_cm_id->route.addr.dev_addr;
255 
256 		ib_addr_get_sgid(dev_addr, (union ib_gid *)&iinfo->src_gid);
257 		ib_addr_get_dgid(dev_addr, (union ib_gid *)&iinfo->dst_gid);
258 
259 		rds_ibdev = ib_get_client_data(ic->i_cm_id->device,
260 		    &rdsv3_ib_client);
261 		iinfo->max_send_wr = ic->i_send_ring.w_nr;
262 		iinfo->max_recv_wr = ic->i_recv_ring.w_nr;
263 		iinfo->max_send_sge = rds_ibdev->max_sge;
264 	}
265 
266 	RDSV3_DPRINTF4("rds_ib_conn_info_visitor", "conn: %p buffer: %p",
267 	    conn, buffer);
268 	return (1);
269 }
270 
271 static void
272 rds_ib_ic_info(struct rsock *sock, unsigned int len,
273     struct rdsv3_info_iterator *iter,
274     struct rdsv3_info_lengths *lens)
275 {
276 	RDSV3_DPRINTF4("rds_ib_ic_info", "sk: %p iter: %p, lens: %p, len: %d",
277 	    sock, iter, lens, len);
278 
279 	rdsv3_for_each_conn_info(sock, len, iter, lens,
280 	    rds_ib_conn_info_visitor,
281 	    sizeof (struct rdsv3_info_rdma_connection));
282 }
283 
284 /*
285  * Early RDS/IB was built to only bind to an address if there is an IPoIB
286  * device with that address set.
287  *
288  * If it were me, I'd advocate for something more flexible.  Sending and
289  * receiving should be device-agnostic.  Transports would try and maintain
290  * connections between peers who have messages queued.  Userspace would be
291  * allowed to influence which paths have priority.  We could call userspace
292  * asserting this policy "routing".
293  */
294 static int
295 rds_ib_laddr_check(uint32_be_t addr)
296 {
297 	int ret;
298 	struct rdma_cm_id *cm_id;
299 	struct sockaddr_in sin;
300 
301 	RDSV3_DPRINTF4("rds_ib_laddr_check", "addr: %x", ntohl(addr));
302 
303 	/*
304 	 * Create a CMA ID and try to bind it. This catches both
305 	 * IB and iWARP capable NICs.
306 	 */
307 	cm_id = rdma_create_id(NULL, NULL, RDMA_PS_TCP);
308 	if (!cm_id)
309 		return (-EADDRNOTAVAIL);
310 
311 	(void) memset(&sin, 0, sizeof (sin));
312 	sin.sin_family = AF_INET;
313 	sin.sin_addr.s_addr = rdsv3_scaddr_to_ibaddr(addr);
314 
315 	/* rdma_bind_addr will only succeed for IB & iWARP devices */
316 	ret = rdma_bind_addr(cm_id, (struct sockaddr *)&sin);
317 	/*
318 	 * due to this, we will claim to support iWARP devices unless we
319 	 * check node_type.
320 	 */
321 	if (ret || cm_id->device->node_type != RDMA_NODE_IB_CA)
322 		ret = -EADDRNOTAVAIL;
323 
324 	RDSV3_DPRINTF5("rds_ib_laddr_check",
325 	    "addr %u.%u.%u.%u ret %d node type %d",
326 	    NIPQUAD(addr), ret,
327 	    cm_id->device ? cm_id->device->node_type : -1);
328 
329 	rdma_destroy_id(cm_id);
330 
331 	return (ret);
332 }
333 
/*
 * Shut down the IB transport; installed as the .exit hook of
 * rdsv3_ib_transport.  Releases what rdsv3_ib_init() set up: the info
 * callback, the IB client registration, the sysctl and receive subsystems,
 * the transport registration, and finally the global lock and lists.
 *
 * Note the order is not the exact reverse of rdsv3_ib_init(): the transport
 * is unregistered after the sysctl/receive teardown rather than before it.
 */
void
rdsv3_ib_exit(void)
{
	RDSV3_DPRINTF4("rds_ib_exit", "Enter");

	rdsv3_info_deregister_func(RDSV3_INFO_IB_CONNECTIONS, rds_ib_ic_info);
	rdsv3_ib_destroy_nodev_conns();
	/* NOTE(review): presumably runs the remove callback per device -- confirm. */
	ib_unregister_client(&rdsv3_ib_client);
	rdsv3_ib_sysctl_exit();
	rdsv3_ib_recv_exit();
	rdsv3_trans_unregister(&rdsv3_ib_transport);
	/* Globals created at the top of rdsv3_ib_init(). */
	mutex_destroy(&ib_nodev_conns_lock);
	list_destroy(&ib_nodev_conns);
	list_destroy(&rdsv3_ib_devices);

	RDSV3_DPRINTF4("rds_ib_exit", "Return");
}
351 
352 #ifndef __lock_lint
353 struct rdsv3_transport rdsv3_ib_transport = {
354 	.laddr_check		= rds_ib_laddr_check,
355 	.xmit_complete		= rdsv3_ib_xmit_complete,
356 	.xmit			= rdsv3_ib_xmit,
357 	.xmit_cong_map		= NULL,
358 	.xmit_rdma		= rdsv3_ib_xmit_rdma,
359 	.recv			= rdsv3_ib_recv,
360 	.conn_alloc		= rdsv3_ib_conn_alloc,
361 	.conn_free		= rdsv3_ib_conn_free,
362 	.conn_connect		= rdsv3_ib_conn_connect,
363 	.conn_shutdown		= rdsv3_ib_conn_shutdown,
364 	.inc_copy_to_user	= rdsv3_ib_inc_copy_to_user,
365 	.inc_purge		= rdsv3_ib_inc_purge,
366 	.inc_free		= rdsv3_ib_inc_free,
367 	.cm_initiate_connect	= rdsv3_ib_cm_initiate_connect,
368 	.cm_handle_connect	= rdsv3_ib_cm_handle_connect,
369 	.cm_connect_complete	= rdsv3_ib_cm_connect_complete,
370 	.stats_info_copy	= rdsv3_ib_stats_info_copy,
371 	.exit			= rdsv3_ib_exit,
372 	.get_mr			= rdsv3_ib_get_mr,
373 	.sync_mr		= rdsv3_ib_sync_mr,
374 	.free_mr		= rdsv3_ib_free_mr,
375 	.flush_mrs		= rdsv3_ib_flush_mrs,
376 	.t_name			= "infiniband",
377 	.t_type			= RDS_TRANS_IB
378 };
379 #else
380 struct rdsv3_transport rdsv3_ib_transport;
381 #endif
382 
383 int
384 rdsv3_ib_init(void)
385 {
386 	int ret;
387 
388 	RDSV3_DPRINTF4("rds_ib_init", "Enter");
389 
390 	list_create(&rdsv3_ib_devices, sizeof (struct rdsv3_ib_device),
391 	    offsetof(struct rdsv3_ib_device, list));
392 	list_create(&ib_nodev_conns, sizeof (struct rdsv3_ib_connection),
393 	    offsetof(struct rdsv3_ib_connection, ib_node));
394 	mutex_init(&ib_nodev_conns_lock, NULL, MUTEX_DRIVER, NULL);
395 
396 	rdsv3_ib_client.dip = rdsv3_dev_info;
397 	ret = ib_register_client(&rdsv3_ib_client);
398 	if (ret)
399 		goto out;
400 
401 	ret = rdsv3_ib_sysctl_init();
402 	if (ret)
403 		goto out_ibreg;
404 
405 	ret = rdsv3_ib_recv_init();
406 	if (ret)
407 		goto out_sysctl;
408 
409 	ret = rdsv3_trans_register(&rdsv3_ib_transport);
410 	if (ret)
411 		goto out_recv;
412 
413 	rdsv3_info_register_func(RDSV3_INFO_IB_CONNECTIONS, rds_ib_ic_info);
414 
415 	RDSV3_DPRINTF4("rds_ib_init", "Return");
416 
417 	return (0);
418 
419 out_recv:
420 	rdsv3_ib_recv_exit();
421 out_sysctl:
422 	rdsv3_ib_sysctl_exit();
423 out_ibreg:
424 	ib_unregister_client(&rdsv3_ib_client);
425 out:
426 	mutex_destroy(&ib_nodev_conns_lock);
427 	list_destroy(&ib_nodev_conns);
428 	list_destroy(&rdsv3_ib_devices);
429 	return (ret);
430 }
431