/*
 * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
 */

/*
 * This file contains code imported from the OFED rds source file ib.c
 * Oracle elects to have and use the contents of ib.c under and governed
 * by the OpenIB.org BSD license (see below for full license text). However,
 * the following notice accompanied the original version of this file:
 */

/*
 * Copyright (c) 2006 Oracle. All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses. You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the
 * OpenIB.org BSD license below:
 *
 *     Redistribution and use in source and binary forms, with or
 *     without modification, are permitted provided that the following
 *     conditions are met:
 *
 *      - Redistributions of source code must retain the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer.
 *
 *      - Redistributions in binary form must reproduce the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer in the documentation and/or other materials
 *        provided with the distribution.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 */
#include <sys/sysmacros.h>
#include <sys/rds.h>

#include <sys/ib/ibtl/ibti.h>
#include <sys/ib/clients/rdsv3/rdsv3.h>
#include <sys/ib/clients/rdsv3/ib.h>
#include <sys/ib/clients/rdsv3/rdsv3_debug.h>

unsigned int rdsv3_ib_retry_count = RDSV3_IB_DEFAULT_RETRY_COUNT;

struct list rdsv3_ib_devices;

/* NOTE: if also grabbing ibdev lock, grab this first */
kmutex_t ib_nodev_conns_lock;
list_t ib_nodev_conns;

extern int rdsv3_ib_frag_constructor(void *buf, void *arg, int kmflags);
extern void rdsv3_ib_frag_destructor(void *buf, void *arg);

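/*
 * Per-HCA "add" callback for rdsv3_ib_client.  Queries the HCA attributes,
 * allocates the per-device state (protection domain, MR and inc pools,
 * fragment kmem cache, soft CQ drain threads) and links the device onto
 * rdsv3_ib_devices.  On failure the partially built state is torn down
 * and the device is left unused.
 */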
void
rdsv3_ib_add_one(ib_device_t *device)
{
        struct rdsv3_ib_device *rds_ibdev;
        ibt_hca_attr_t *dev_attr;
        char name[64];

        RDSV3_DPRINTF2("rdsv3_ib_add_one", "device: %p", device);

        /* Only handle IB (no iWARP) devices */
        if (device->node_type != RDMA_NODE_IB_CA)
                return;

        dev_attr = (ibt_hca_attr_t *)kmem_alloc(sizeof (*dev_attr),
            KM_NOSLEEP);
        if (!dev_attr)
                return;

        if (ibt_query_hca(ib_get_ibt_hca_hdl(device), dev_attr)) {
                RDSV3_DPRINTF2("rdsv3_ib_add_one",
                    "Query device failed for %s", device->name);
                goto free_attr;
        }

        /* We depend on Reserved Lkey */
        if (!(dev_attr->hca_flags2 & IBT_HCA2_RES_LKEY)) {
                RDSV3_DPRINTF2("rdsv3_ib_add_one",
                    "Reserved Lkey support is required: %s",
                    device->name);
                goto free_attr;
        }

        rds_ibdev = kmem_zalloc(sizeof (*rds_ibdev), KM_NOSLEEP);
        if (!rds_ibdev)
                goto free_attr;

        rds_ibdev->ibt_hca_hdl = ib_get_ibt_hca_hdl(device);
        rds_ibdev->hca_attr = *dev_attr;

        rw_init(&rds_ibdev->rwlock, NULL, RW_DRIVER, NULL);
        mutex_init(&rds_ibdev->spinlock, NULL, MUTEX_DRIVER, NULL);

        rds_ibdev->max_wrs = dev_attr->hca_max_chan_sz;
        rds_ibdev->max_sge = min(dev_attr->hca_max_sgl, RDSV3_IB_MAX_SGE);

        rds_ibdev->max_initiator_depth = (uint_t)dev_attr->hca_max_rdma_in_qp;
        rds_ibdev->max_responder_resources =
            (uint_t)dev_attr->hca_max_rdma_in_qp;

        rds_ibdev->dev = device;
        rds_ibdev->pd = ib_alloc_pd(device);
        if (IS_ERR(rds_ibdev->pd))
                goto free_dev;

        if (rdsv3_ib_create_mr_pool(rds_ibdev) != 0) {
                goto free_dev;
        }

        if (rdsv3_ib_create_inc_pool(rds_ibdev) != 0) {
                rdsv3_ib_destroy_mr_pool(rds_ibdev);
                goto free_dev;
        }

        (void) snprintf(name, 64, "RDSV3_IB_FRAG_%llx",
            (longlong_t)htonll(dev_attr->hca_node_guid));
        rds_ibdev->ib_frag_slab = kmem_cache_create(name,
            sizeof (struct rdsv3_page_frag), 0, rdsv3_ib_frag_constructor,
            rdsv3_ib_frag_destructor, NULL, (void *)rds_ibdev, NULL, 0);
        if (rds_ibdev->ib_frag_slab == NULL) {
                RDSV3_DPRINTF2("rdsv3_ib_add_one",
                    "kmem_cache_create for ib_frag_slab failed for device: %s",
                    device->name);
                rdsv3_ib_destroy_mr_pool(rds_ibdev);
                rdsv3_ib_destroy_inc_pool(rds_ibdev);
                goto free_dev;
        }

        rds_ibdev->aft_hcagp = rdsv3_af_grp_create(rds_ibdev->ibt_hca_hdl,
            (uint64_t)rds_ibdev->hca_attr.hca_node_guid);
        if (rds_ibdev->aft_hcagp == NULL) {
                rdsv3_ib_destroy_mr_pool(rds_ibdev);
                rdsv3_ib_destroy_inc_pool(rds_ibdev);
                kmem_cache_destroy(rds_ibdev->ib_frag_slab);
                goto free_dev;
        }
        rds_ibdev->fmr_soft_cq = rdsv3_af_thr_create(rdsv3_ib_drain_mrlist_fn,
            (void *)rds_ibdev->fmr_pool, SCQ_HCA_BIND_CPU,
            rds_ibdev->aft_hcagp);
        if (rds_ibdev->fmr_soft_cq == NULL) {
                rdsv3_af_grp_destroy(rds_ibdev->aft_hcagp);
                rdsv3_ib_destroy_mr_pool(rds_ibdev);
                rdsv3_ib_destroy_inc_pool(rds_ibdev);
                kmem_cache_destroy(rds_ibdev->ib_frag_slab);
                goto free_dev;
        }

        rds_ibdev->inc_soft_cq = rdsv3_af_thr_create(rdsv3_ib_drain_inclist,
            (void *)rds_ibdev->inc_pool, SCQ_HCA_BIND_CPU,
            rds_ibdev->aft_hcagp);
        if (rds_ibdev->inc_soft_cq == NULL) {
                rdsv3_af_thr_destroy(rds_ibdev->fmr_soft_cq);
                rdsv3_af_grp_destroy(rds_ibdev->aft_hcagp);
                rdsv3_ib_destroy_mr_pool(rds_ibdev);
                rdsv3_ib_destroy_inc_pool(rds_ibdev);
                kmem_cache_destroy(rds_ibdev->ib_frag_slab);
                goto free_dev;
        }

        list_create(&rds_ibdev->ipaddr_list, sizeof (struct rdsv3_ib_ipaddr),
            offsetof(struct rdsv3_ib_ipaddr, list));
        list_create(&rds_ibdev->conn_list, sizeof (struct rdsv3_ib_connection),
            offsetof(struct rdsv3_ib_connection, ib_node));

        list_insert_tail(&rdsv3_ib_devices, rds_ibdev);

        ib_set_client_data(device, &rdsv3_ib_client, rds_ibdev);

        RDSV3_DPRINTF2("rdsv3_ib_add_one", "Return: device: %p", device);

        goto free_attr;

err_pd:
        (void) ib_dealloc_pd(rds_ibdev->pd);
free_dev:
        mutex_destroy(&rds_ibdev->spinlock);
        rw_destroy(&rds_ibdev->rwlock);
        kmem_free(rds_ibdev, sizeof (*rds_ibdev));
free_attr:
        kmem_free(dev_attr, sizeof (*dev_attr));
}

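/*
 * Per-HCA "remove" callback for rdsv3_ib_client: tears down everything set
 * up by rdsv3_ib_add_one() for this device (cached IP addresses,
 * connections, drain threads, MR/inc pools, the fragment cache and the
 * protection domain) and frees the rdsv3_ib_device itself.
 */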
void
rdsv3_ib_remove_one(struct ib_device *device)
{
        struct rdsv3_ib_device *rds_ibdev;
        struct rdsv3_ib_ipaddr *i_ipaddr, *i_next;

        RDSV3_DPRINTF2("rdsv3_ib_remove_one", "device: %p", device);

        rds_ibdev = ib_get_client_data(device, &rdsv3_ib_client);
        if (!rds_ibdev)
                return;

        RDSV3_FOR_EACH_LIST_NODE_SAFE(i_ipaddr, i_next, &rds_ibdev->ipaddr_list,
            list) {
                list_remove_node(&i_ipaddr->list);
                kmem_free(i_ipaddr, sizeof (*i_ipaddr));
        }

        rdsv3_ib_destroy_conns(rds_ibdev);

        if (rds_ibdev->fmr_soft_cq)
                rdsv3_af_thr_destroy(rds_ibdev->fmr_soft_cq);
        if (rds_ibdev->inc_soft_cq)
                rdsv3_af_thr_destroy(rds_ibdev->inc_soft_cq);

        rdsv3_ib_destroy_mr_pool(rds_ibdev);
        rdsv3_ib_destroy_inc_pool(rds_ibdev);

        kmem_cache_destroy(rds_ibdev->ib_frag_slab);

        rdsv3_af_grp_destroy(rds_ibdev->aft_hcagp);

#if 0
        while (ib_dealloc_pd(rds_ibdev->pd)) {
#ifndef __lock_lint
                RDSV3_DPRINTF5("rdsv3_ib_remove_one",
                    "%s-%d Failed to dealloc pd %p",
                    __func__, __LINE__, rds_ibdev->pd);
#endif
                delay(drv_usectohz(1000));
        }
#else
        if (ib_dealloc_pd(rds_ibdev->pd)) {
#ifndef __lock_lint
                RDSV3_DPRINTF2("rdsv3_ib_remove_one",
                    "Failed to dealloc pd %p\n", rds_ibdev->pd);
#endif
        }
#endif

        list_destroy(&rds_ibdev->ipaddr_list);
        list_destroy(&rds_ibdev->conn_list);
        list_remove_node(&rds_ibdev->list);
        mutex_destroy(&rds_ibdev->spinlock);
        rw_destroy(&rds_ibdev->rwlock);
        kmem_free(rds_ibdev, sizeof (*rds_ibdev));

        RDSV3_DPRINTF2("rdsv3_ib_remove_one", "Return: device: %p", device);
}

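/*
 * IB client registered via ib_register_client() in rdsv3_ib_init().
 * The positional initializer below is only used for the __lock_lint build.
 */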
#ifndef __lock_lint
struct ib_client rdsv3_ib_client = {
        .name = "rdsv3_ib",
        .add = rdsv3_ib_add_one,
        .remove = rdsv3_ib_remove_one,
        .clnt_hdl = NULL,
        .state = IB_CLNT_UNINITIALIZED
};
#else
struct ib_client rdsv3_ib_client = {
        "rdsv3_ib",
        rdsv3_ib_add_one,
        rdsv3_ib_remove_one,
        NULL,
        NULL,
        IB_CLNT_UNINITIALIZED
};
#endif

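/*
 * Fills in one rds_info_rdma_connection entry for an IB connection.
 * Called via rdsv3_for_each_conn_info(); returns 1 if the entry was
 * filled in, 0 if the connection does not use the IB transport.
 */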
static int
rds_ib_conn_info_visitor(struct rdsv3_connection *conn,
    void *buffer)
{
        struct rds_info_rdma_connection *iinfo = buffer;
        struct rdsv3_ib_connection *ic;

        RDSV3_DPRINTF4("rds_ib_conn_info_visitor", "conn: %p buffer: %p",
            conn, buffer);

        /* We will only ever look at IB transports */
        if (conn->c_trans != &rdsv3_ib_transport)
                return (0);

        iinfo->src_addr = conn->c_laddr;
        iinfo->dst_addr = conn->c_faddr;

        (void) memset(&iinfo->src_gid, 0, sizeof (iinfo->src_gid));
        (void) memset(&iinfo->dst_gid, 0, sizeof (iinfo->dst_gid));
        if (rdsv3_conn_state(conn) == RDSV3_CONN_UP) {
                struct rdsv3_ib_device *rds_ibdev;
                struct rdma_dev_addr *dev_addr;

                ic = conn->c_transport_data;
                dev_addr = &ic->i_cm_id->route.addr.dev_addr;

                ib_addr_get_sgid(dev_addr, (union ib_gid *)&iinfo->src_gid);
                ib_addr_get_dgid(dev_addr, (union ib_gid *)&iinfo->dst_gid);

                rds_ibdev = ib_get_client_data(ic->i_cm_id->device,
                    &rdsv3_ib_client);
                iinfo->max_send_wr = ic->i_send_ring.w_nr;
                iinfo->max_recv_wr = ic->i_recv_ring.w_nr;
                iinfo->max_send_sge = rds_ibdev->max_sge;
        }

        RDSV3_DPRINTF4("rds_ib_conn_info_visitor", "conn: %p buffer: %p",
            conn, buffer);
        return (1);
}

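/*
 * RDS_INFO_IB_CONNECTIONS handler: walks all connections and emits one
 * rds_info_rdma_connection record per IB connection.
 */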
static void
rds_ib_ic_info(struct rsock *sock, unsigned int len,
    struct rdsv3_info_iterator *iter,
    struct rdsv3_info_lengths *lens)
{
        RDSV3_DPRINTF4("rds_ib_ic_info", "sk: %p iter: %p, lens: %p, len: %d",
            sock, iter, lens, len);

        rdsv3_for_each_conn_info(sock, len, iter, lens,
            rds_ib_conn_info_visitor,
            sizeof (struct rds_info_rdma_connection));
}

/*
 * Early RDS/IB was built to only bind to an address if there is an IPoIB
 * device with that address set.
 *
 * If it were me, I'd advocate for something more flexible. Sending and
 * receiving should be device-agnostic. Transports would try and maintain
 * connections between peers who have messages queued. Userspace would be
 * allowed to influence which paths have priority. We could call userspace
 * asserting this policy "routing".
 */
static int
rds_ib_laddr_check(uint32_be_t addr)
{
        int ret;
        struct rdma_cm_id *cm_id;
        struct sockaddr_in sin;

        RDSV3_DPRINTF4("rds_ib_laddr_check", "addr: %x", ntohl(addr));

        /*
         * Create a CMA ID and try to bind it. This catches both
         * IB and iWARP capable NICs.
         */
        cm_id = rdma_create_id(NULL, NULL, RDMA_PS_TCP);
        if (!cm_id)
                return (-EADDRNOTAVAIL);

        (void) memset(&sin, 0, sizeof (sin));
        sin.sin_family = AF_INET;
        sin.sin_addr.s_addr = rdsv3_scaddr_to_ibaddr(addr);

        /* rdma_bind_addr will only succeed for IB & iWARP devices */
        ret = rdma_bind_addr(cm_id, (struct sockaddr *)&sin);
        /*
         * due to this, we will claim to support iWARP devices unless we
         * check node_type.
         */
        if (ret || cm_id->device->node_type != RDMA_NODE_IB_CA)
                ret = -EADDRNOTAVAIL;

        RDSV3_DPRINTF5("rds_ib_laddr_check",
            "addr %u.%u.%u.%u ret %d node type %d",
            NIPQUAD(addr), ret,
            cm_id->device ? cm_id->device->node_type : -1);

        rdma_destroy_id(cm_id);

        return (ret);
}

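/*
 * Tears down what rdsv3_ib_init() set up: deregisters the connection info
 * handler, the IB client and the transport, and frees the statistics area
 * and the global connection/device lists.
 */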
void
rdsv3_ib_exit(void)
{
        RDSV3_DPRINTF4("rds_ib_exit", "Enter");

        rdsv3_info_deregister_func(RDS_INFO_IB_CONNECTIONS, rds_ib_ic_info);
        rdsv3_ib_destroy_nodev_conns();
        ib_unregister_client(&rdsv3_ib_client);
        rdsv3_ib_sysctl_exit();
        rdsv3_ib_recv_exit();
        rdsv3_trans_unregister(&rdsv3_ib_transport);
        kmem_free(rdsv3_ib_stats,
            nr_cpus * sizeof (struct rdsv3_ib_statistics));
        mutex_destroy(&ib_nodev_conns_lock);
        list_destroy(&ib_nodev_conns);
        list_destroy(&rdsv3_ib_devices);

        RDSV3_DPRINTF4("rds_ib_exit", "Return");
}

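/*
 * Operations vector for the "infiniband" transport, registered with the
 * RDS core via rdsv3_trans_register() in rdsv3_ib_init().
 */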
#ifndef __lock_lint
struct rdsv3_transport rdsv3_ib_transport = {
        .laddr_check = rds_ib_laddr_check,
        .xmit_complete = rdsv3_ib_xmit_complete,
        .xmit = rdsv3_ib_xmit,
        .xmit_cong_map = NULL,
        .xmit_rdma = rdsv3_ib_xmit_rdma,
        .recv = rdsv3_ib_recv,
        .conn_alloc = rdsv3_ib_conn_alloc,
        .conn_free = rdsv3_ib_conn_free,
        .conn_connect = rdsv3_ib_conn_connect,
        .conn_shutdown = rdsv3_ib_conn_shutdown,
        .inc_copy_to_user = rdsv3_ib_inc_copy_to_user,
        .inc_free = rdsv3_ib_inc_free,
        .cm_initiate_connect = rdsv3_ib_cm_initiate_connect,
        .cm_handle_connect = rdsv3_ib_cm_handle_connect,
        .cm_connect_complete = rdsv3_ib_cm_connect_complete,
        .stats_info_copy = rdsv3_ib_stats_info_copy,
        .exit = rdsv3_ib_exit,
        .get_mr = rdsv3_ib_get_mr,
        .sync_mr = rdsv3_ib_sync_mr,
        .free_mr = rdsv3_ib_free_mr,
        .flush_mrs = rdsv3_ib_flush_mrs,
        .t_name = "infiniband",
        .t_type = RDS_TRANS_IB
};
#else
struct rdsv3_transport rdsv3_ib_transport;
#endif

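/*
 * Initializes the RDS/IB transport: creates the global device and nodev
 * connection lists, allocates the per-CPU statistics, registers the IB
 * client, sysctl and receive machinery, registers the transport, and
 * installs the connection info handler.  Returns 0 on success or the
 * error of the first failing step after unwinding the earlier ones.
 */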
int
rdsv3_ib_init(void)
{
        int ret;

        RDSV3_DPRINTF4("rds_ib_init", "Enter");

        list_create(&rdsv3_ib_devices, sizeof (struct rdsv3_ib_device),
            offsetof(struct rdsv3_ib_device, list));
        list_create(&ib_nodev_conns, sizeof (struct rdsv3_ib_connection),
            offsetof(struct rdsv3_ib_connection, ib_node));
        mutex_init(&ib_nodev_conns_lock, NULL, MUTEX_DRIVER, NULL);

        /* allocate space for ib statistics */
        ASSERT(rdsv3_ib_stats == NULL);
        rdsv3_ib_stats = kmem_zalloc(nr_cpus *
            sizeof (struct rdsv3_ib_statistics), KM_SLEEP);

        rdsv3_ib_client.dip = rdsv3_dev_info;
        ret = ib_register_client(&rdsv3_ib_client);
        if (ret)
                goto out;

        ret = rdsv3_ib_sysctl_init();
        if (ret)
                goto out_ibreg;

        ret = rdsv3_ib_recv_init();
        if (ret)
                goto out_sysctl;

        ret = rdsv3_trans_register(&rdsv3_ib_transport);
        if (ret)
                goto out_recv;

        rdsv3_info_register_func(RDS_INFO_IB_CONNECTIONS, rds_ib_ic_info);

        RDSV3_DPRINTF4("rds_ib_init", "Return");

        return (0);

out_recv:
        rdsv3_ib_recv_exit();
out_sysctl:
        rdsv3_ib_sysctl_exit();
out_ibreg:
        ib_unregister_client(&rdsv3_ib_client);
out:
        kmem_free(rdsv3_ib_stats,
            nr_cpus * sizeof (struct rdsv3_ib_statistics));
        mutex_destroy(&ib_nodev_conns_lock);
        list_destroy(&ib_nodev_conns);
        list_destroy(&rdsv3_ib_devices);
        return (ret);
}