xref: /illumos-gate/usr/src/uts/common/io/ib/clients/rdsv3/ib.c (revision cc35afbc6e8770281f105081f860f422c2426b11)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
23  */
24 
25 /*
26  * Copyright (c) 2006 Oracle.  All rights reserved.
27  *
28  * This software is available to you under a choice of one of two
29  * licenses.  You may choose to be licensed under the terms of the GNU
30  * General Public License (GPL) Version 2, available from the file
31  * COPYING in the main directory of this source tree, or the
32  * OpenIB.org BSD license below:
33  *
34  *     Redistribution and use in source and binary forms, with or
35  *     without modification, are permitted provided that the following
36  *     conditions are met:
37  *
38  *      - Redistributions of source code must retain the above
39  *        copyright notice, this list of conditions and the following
40  *        disclaimer.
41  *
42  *      - Redistributions in binary form must reproduce the above
43  *        copyright notice, this list of conditions and the following
44  *        disclaimer in the documentation and/or other materials
45  *        provided with the distribution.
46  *
47  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
48  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
49  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
50  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
51  * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
52  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
53  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
54  * SOFTWARE.
55  *
56  */
57 #include <sys/sysmacros.h>
58 #include <sys/rds.h>
59 
60 #include <sys/ib/ibtl/ibti.h>
61 #include <sys/ib/clients/rdsv3/rdsv3.h>
62 #include <sys/ib/clients/rdsv3/ib.h>
63 #include <sys/ib/clients/rdsv3/rdsv3_debug.h>
64 
65 unsigned int rdsv3_ib_retry_count = RDSV3_IB_DEFAULT_RETRY_COUNT;
66 
67 struct list	rdsv3_ib_devices;
68 
69 /* NOTE: if also grabbing ibdev lock, grab this first */
70 kmutex_t ib_nodev_conns_lock;
71 list_t ib_nodev_conns;
72 
73 void
74 rdsv3_ib_add_one(ib_device_t *device)
75 {
76 	struct rdsv3_ib_device *rds_ibdev;
77 	ibt_hca_attr_t *dev_attr;
78 
79 	RDSV3_DPRINTF4("rdsv3_ib_add_one", "device: %p", device);
80 
81 	/* Only handle IB (no iWARP) devices */
82 	if (device->node_type != RDMA_NODE_IB_CA)
83 		return;
84 
85 	dev_attr = (ibt_hca_attr_t *)kmem_alloc(sizeof (*dev_attr),
86 	    KM_NOSLEEP);
87 	if (!dev_attr)
88 		return;
89 
90 	if (ibt_query_hca(ib_get_ibt_hca_hdl(device), dev_attr)) {
91 		RDSV3_DPRINTF5("rdsv3_ib_add_one",
92 		    "Query device failed for %s", device->name);
93 		goto free_attr;
94 	}
95 
96 	/* We depend on Reserved Lkey */
97 	if (!(dev_attr->hca_flags2 & IBT_HCA2_RES_LKEY)) {
98 		RDSV3_DPRINTF5("rdsv3_ib_add_one",
99 		    "Reserved Lkey support is required: %s",
100 		    device->name);
101 		goto free_attr;
102 	}
103 
104 	rds_ibdev = kmem_zalloc(sizeof (*rds_ibdev), KM_NOSLEEP);
105 	if (!rds_ibdev)
106 		goto free_attr;
107 
108 	mutex_init(&rds_ibdev->spinlock, NULL, MUTEX_DRIVER, NULL);
109 
110 	rds_ibdev->max_wrs = dev_attr->hca_max_chan_sz;
111 	rds_ibdev->max_sge = min(dev_attr->hca_max_sgl, RDSV3_IB_MAX_SGE);
112 
113 	rds_ibdev->dev = device;
114 	rds_ibdev->pd = ib_alloc_pd(device);
115 	if (IS_ERR(rds_ibdev->pd))
116 		goto free_dev;
117 
118 	if (rdsv3_ib_create_mr_pool(rds_ibdev) != 0) {
119 		goto free_dev;
120 	}
121 
122 	list_create(&rds_ibdev->ipaddr_list, sizeof (struct rdsv3_ib_ipaddr),
123 	    offsetof(struct rdsv3_ib_ipaddr, list));
124 	list_create(&rds_ibdev->conn_list, sizeof (struct rdsv3_ib_connection),
125 	    offsetof(struct rdsv3_ib_connection, ib_node));
126 
127 	list_insert_tail(&rdsv3_ib_devices, rds_ibdev);
128 
129 	ib_set_client_data(device, &rdsv3_ib_client, rds_ibdev);
130 
131 	RDSV3_DPRINTF4("rdsv3_ib_add_one", "Return: device: %p", device);
132 
133 	goto free_attr;
134 
135 err_pd:
136 	(void) ib_dealloc_pd(rds_ibdev->pd);
137 free_dev:
138 	kmem_free(rds_ibdev, sizeof (*rds_ibdev));
139 free_attr:
140 	kmem_free(dev_attr, sizeof (*dev_attr));
141 }
142 
143 void
144 rdsv3_ib_remove_one(struct ib_device *device)
145 {
146 	struct rdsv3_ib_device *rds_ibdev;
147 	struct rdsv3_ib_ipaddr *i_ipaddr, *i_next;
148 
149 	RDSV3_DPRINTF4("rdsv3_ib_remove_one", "device: %p", device);
150 
151 	rds_ibdev = ib_get_client_data(device, &rdsv3_ib_client);
152 	if (!rds_ibdev)
153 		return;
154 
155 	RDSV3_FOR_EACH_LIST_NODE_SAFE(i_ipaddr, i_next, &rds_ibdev->ipaddr_list,
156 	    list) {
157 		list_remove_node(&i_ipaddr->list);
158 		kmem_free(i_ipaddr, sizeof (*i_ipaddr));
159 	}
160 
161 	rdsv3_ib_destroy_conns(rds_ibdev);
162 
163 	rdsv3_ib_destroy_mr_pool(rds_ibdev);
164 
165 #if 0
166 	while (ib_dealloc_pd(rds_ibdev->pd)) {
167 #ifndef __lock_lint
168 		RDSV3_DPRINTF5("rdsv3_ib_remove_one",
169 		    "%s-%d Failed to dealloc pd %p",
170 		    __func__, __LINE__, rds_ibdev->pd);
171 #endif
172 		delay(drv_usectohz(1000));
173 	}
174 #else
175 	if (ib_dealloc_pd(rds_ibdev->pd)) {
176 #ifndef __lock_lint
177 		RDSV3_DPRINTF2("rdsv3_ib_remove_one",
178 		    "%s-%d Failed to dealloc pd %p",
179 		    __func__, __LINE__, rds_ibdev->pd);
180 #endif
181 	}
182 #endif
183 
184 	list_destroy(&rds_ibdev->ipaddr_list);
185 	list_destroy(&rds_ibdev->conn_list);
186 	list_remove_node(&rds_ibdev->list);
187 	kmem_free(rds_ibdev, sizeof (*rds_ibdev));
188 
189 	RDSV3_DPRINTF4("rdsv3_ib_remove_one", "Return: device: %p", device);
190 }
191 
192 #ifndef __lock_lint
193 struct ib_client rdsv3_ib_client = {
194 	.name		= "rdsv3_ib",
195 	.add		= rdsv3_ib_add_one,
196 	.remove		= rdsv3_ib_remove_one,
197 	.clnt_hdl	= NULL,
198 	.state		= IB_CLNT_UNINITIALIZED
199 };
200 #else
201 struct ib_client rdsv3_ib_client = {
202 	"rdsv3_ib",
203 	rdsv3_ib_add_one,
204 	rdsv3_ib_remove_one,
205 	NULL,
206 	NULL,
207 	IB_CLNT_UNINITIALIZED
208 };
209 #endif
210 
211 static int
212 rds_ib_conn_info_visitor(struct rdsv3_connection *conn,
213     void *buffer)
214 {
215 	struct rdsv3_info_rdma_connection *iinfo = buffer;
216 	struct rdsv3_ib_connection *ic;
217 
218 	RDSV3_DPRINTF4("rds_ib_conn_info_visitor", "conn: %p buffer: %p",
219 	    conn, buffer);
220 
221 	/* We will only ever look at IB transports */
222 	if (conn->c_trans != &rdsv3_ib_transport)
223 		return (0);
224 
225 	iinfo->src_addr = conn->c_laddr;
226 	iinfo->dst_addr = conn->c_faddr;
227 
228 	(void) memset(&iinfo->src_gid, 0, sizeof (iinfo->src_gid));
229 	(void) memset(&iinfo->dst_gid, 0, sizeof (iinfo->dst_gid));
230 	if (rdsv3_conn_state(conn) == RDSV3_CONN_UP) {
231 		struct rdsv3_ib_device *rds_ibdev;
232 		struct rdma_dev_addr *dev_addr;
233 
234 		ic = conn->c_transport_data;
235 		dev_addr = &ic->i_cm_id->route.addr.dev_addr;
236 
237 		ib_addr_get_sgid(dev_addr, (union ib_gid *)&iinfo->src_gid);
238 		ib_addr_get_dgid(dev_addr, (union ib_gid *)&iinfo->dst_gid);
239 
240 		rds_ibdev = ib_get_client_data(ic->i_cm_id->device,
241 		    &rdsv3_ib_client);
242 		iinfo->max_send_wr = ic->i_send_ring.w_nr;
243 		iinfo->max_recv_wr = ic->i_recv_ring.w_nr;
244 		iinfo->max_send_sge = rds_ibdev->max_sge;
245 	}
246 
247 	RDSV3_DPRINTF4("rds_ib_conn_info_visitor", "conn: %p buffer: %p",
248 	    conn, buffer);
249 	return (1);
250 }
251 
252 static void
253 rds_ib_ic_info(struct rsock *sock, unsigned int len,
254     struct rdsv3_info_iterator *iter,
255     struct rdsv3_info_lengths *lens)
256 {
257 	RDSV3_DPRINTF4("rds_ib_ic_info", "sk: %p iter: %p, lens: %p, len: %d",
258 	    sock, iter, lens, len);
259 
260 	rdsv3_for_each_conn_info(sock, len, iter, lens,
261 	    rds_ib_conn_info_visitor,
262 	    sizeof (struct rdsv3_info_rdma_connection));
263 }
264 
265 /*
266  * Early RDS/IB was built to only bind to an address if there is an IPoIB
267  * device with that address set.
268  *
269  * If it were me, I'd advocate for something more flexible.  Sending and
270  * receiving should be device-agnostic.  Transports would try and maintain
271  * connections between peers who have messages queued.  Userspace would be
272  * allowed to influence which paths have priority.  We could call userspace
273  * asserting this policy "routing".
274  */
275 static int
276 rds_ib_laddr_check(uint32_be_t addr)
277 {
278 	int ret;
279 	struct rdma_cm_id *cm_id;
280 	struct sockaddr_in sin;
281 
282 	RDSV3_DPRINTF4("rds_ib_laddr_check", "addr: %x", ntohl(addr));
283 
284 	/*
285 	 * Create a CMA ID and try to bind it. This catches both
286 	 * IB and iWARP capable NICs.
287 	 */
288 	cm_id = rdma_create_id(NULL, NULL, RDMA_PS_TCP);
289 	if (IS_ERR(cm_id))
290 		return (PTR_ERR(cm_id));
291 
292 	(void) memset(&sin, 0, sizeof (sin));
293 	sin.sin_family = AF_INET;
294 	sin.sin_addr.s_addr = rdsv3_scaddr_to_ibaddr(addr);
295 
296 	/* rdma_bind_addr will only succeed for IB & iWARP devices */
297 	ret = rdma_bind_addr(cm_id, (struct sockaddr *)&sin);
298 	/*
299 	 * due to this, we will claim to support iWARP devices unless we
300 	 * check node_type.
301 	 */
302 	if (ret || cm_id->device->node_type != RDMA_NODE_IB_CA)
303 		ret = -EADDRNOTAVAIL;
304 
305 	RDSV3_DPRINTF5("rds_ib_laddr_check",
306 	    "addr %u.%u.%u.%u ret %d node type %d",
307 	    NIPQUAD(addr), ret,
308 	    cm_id->device ? cm_id->device->node_type : -1);
309 
310 	rdma_destroy_id(cm_id);
311 
312 	return (ret);
313 }
314 
315 void
316 rdsv3_ib_exit(void)
317 {
318 	RDSV3_DPRINTF4("rds_ib_exit", "Enter");
319 
320 	rdsv3_info_deregister_func(RDSV3_INFO_IB_CONNECTIONS, rds_ib_ic_info);
321 	rdsv3_ib_destroy_nodev_conns();
322 	ib_unregister_client(&rdsv3_ib_client);
323 	rdsv3_ib_sysctl_exit();
324 	rdsv3_ib_recv_exit();
325 	rdsv3_trans_unregister(&rdsv3_ib_transport);
326 	mutex_destroy(&ib_nodev_conns_lock);
327 	list_destroy(&ib_nodev_conns);
328 	list_destroy(&rdsv3_ib_devices);
329 
330 	RDSV3_DPRINTF4("rds_ib_exit", "Return");
331 }
332 
333 #ifndef __lock_lint
334 struct rdsv3_transport rdsv3_ib_transport = {
335 	.laddr_check		= rds_ib_laddr_check,
336 	.xmit_complete		= rdsv3_ib_xmit_complete,
337 	.xmit			= rdsv3_ib_xmit,
338 	.xmit_cong_map		= NULL,
339 	.xmit_rdma		= rdsv3_ib_xmit_rdma,
340 	.recv			= rdsv3_ib_recv,
341 	.conn_alloc		= rdsv3_ib_conn_alloc,
342 	.conn_free		= rdsv3_ib_conn_free,
343 	.conn_connect		= rdsv3_ib_conn_connect,
344 	.conn_shutdown		= rdsv3_ib_conn_shutdown,
345 	.inc_copy_to_user	= rdsv3_ib_inc_copy_to_user,
346 	.inc_purge		= rdsv3_ib_inc_purge,
347 	.inc_free		= rdsv3_ib_inc_free,
348 	.cm_initiate_connect	= rdsv3_ib_cm_initiate_connect,
349 	.cm_handle_connect	= rdsv3_ib_cm_handle_connect,
350 	.cm_connect_complete	= rdsv3_ib_cm_connect_complete,
351 	.stats_info_copy	= rdsv3_ib_stats_info_copy,
352 	.exit			= rdsv3_ib_exit,
353 	.get_mr			= rdsv3_ib_get_mr,
354 	.sync_mr		= rdsv3_ib_sync_mr,
355 	.free_mr		= rdsv3_ib_free_mr,
356 	.flush_mrs		= rdsv3_ib_flush_mrs,
357 	.t_name			= "infiniband",
358 };
359 #else
360 struct rdsv3_transport rdsv3_ib_transport;
361 #endif
362 
363 int
364 rdsv3_ib_init(void)
365 {
366 	int ret;
367 
368 	RDSV3_DPRINTF4("rds_ib_init", "Enter");
369 
370 	list_create(&rdsv3_ib_devices, sizeof (struct rdsv3_ib_device),
371 	    offsetof(struct rdsv3_ib_device, list));
372 	list_create(&ib_nodev_conns, sizeof (struct rdsv3_ib_connection),
373 	    offsetof(struct rdsv3_ib_connection, ib_node));
374 	mutex_init(&ib_nodev_conns_lock, NULL, MUTEX_DRIVER, NULL);
375 
376 	rdsv3_ib_client.dip = rdsv3_dev_info;
377 	ret = ib_register_client(&rdsv3_ib_client);
378 	if (ret)
379 		goto out;
380 
381 	ret = rdsv3_ib_sysctl_init();
382 	if (ret)
383 		goto out_ibreg;
384 
385 	ret = rdsv3_ib_recv_init();
386 	if (ret)
387 		goto out_sysctl;
388 
389 	ret = rdsv3_trans_register(&rdsv3_ib_transport);
390 	if (ret)
391 		goto out_recv;
392 
393 	rdsv3_info_register_func(RDSV3_INFO_IB_CONNECTIONS, rds_ib_ic_info);
394 
395 	RDSV3_DPRINTF4("rds_ib_init", "Return");
396 
397 	return (0);
398 
399 out_recv:
400 	rdsv3_ib_recv_exit();
401 out_sysctl:
402 	rdsv3_ib_sysctl_exit();
403 out_ibreg:
404 	ib_unregister_client(&rdsv3_ib_client);
405 out:
406 	mutex_destroy(&ib_nodev_conns_lock);
407 	list_destroy(&ib_nodev_conns);
408 	list_destroy(&rdsv3_ib_devices);
409 	return (ret);
410 }
411