1 /* 2 * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. 3 */ 4 5 /* 6 * This file contains code imported from the OFED rds source file ib.c 7 * Oracle elects to have and use the contents of ib.c under and governed 8 * by the OpenIB.org BSD license (see below for full license text). However, 9 * the following notice accompanied the original version of this file: 10 */ 11 12 /* 13 * Copyright (c) 2006 Oracle. All rights reserved. 14 * 15 * This software is available to you under a choice of one of two 16 * licenses. You may choose to be licensed under the terms of the GNU 17 * General Public License (GPL) Version 2, available from the file 18 * COPYING in the main directory of this source tree, or the 19 * OpenIB.org BSD license below: 20 * 21 * Redistribution and use in source and binary forms, with or 22 * without modification, are permitted provided that the following 23 * conditions are met: 24 * 25 * - Redistributions of source code must retain the above 26 * copyright notice, this list of conditions and the following 27 * disclaimer. 28 * 29 * - Redistributions in binary form must reproduce the above 30 * copyright notice, this list of conditions and the following 31 * disclaimer in the documentation and/or other materials 32 * provided with the distribution. 33 * 34 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 35 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 36 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 37 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 38 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN 39 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 40 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 41 * SOFTWARE. 42 * 43 */ 44 #include <sys/sysmacros.h> 45 #include <sys/rds.h> 46 47 #include <sys/ib/ibtl/ibti.h> 48 #include <sys/ib/clients/rdsv3/rdsv3.h> 49 #include <sys/ib/clients/rdsv3/ib.h> 50 #include <sys/ib/clients/rdsv3/rdsv3_debug.h> 51 52 unsigned int rdsv3_ib_retry_count = RDSV3_IB_DEFAULT_RETRY_COUNT; 53 54 struct list rdsv3_ib_devices; 55 56 /* NOTE: if also grabbing ibdev lock, grab this first */ 57 kmutex_t ib_nodev_conns_lock; 58 list_t ib_nodev_conns; 59 60 extern int rdsv3_ib_frag_constructor(void *buf, void *arg, int kmflags); 61 extern void rdsv3_ib_frag_destructor(void *buf, void *arg); 62 63 void 64 rdsv3_ib_add_one(ib_device_t *device) 65 { 66 struct rdsv3_ib_device *rds_ibdev; 67 ibt_hca_attr_t *dev_attr; 68 char name[64]; 69 70 RDSV3_DPRINTF2("rdsv3_ib_add_one", "device: %p", device); 71 72 /* Only handle IB (no iWARP) devices */ 73 if (device->node_type != RDMA_NODE_IB_CA) 74 return; 75 76 dev_attr = (ibt_hca_attr_t *)kmem_alloc(sizeof (*dev_attr), 77 KM_NOSLEEP); 78 if (!dev_attr) 79 return; 80 81 if (ibt_query_hca(ib_get_ibt_hca_hdl(device), dev_attr)) { 82 RDSV3_DPRINTF2("rdsv3_ib_add_one", 83 "Query device failed for %s", device->name); 84 goto free_attr; 85 } 86 87 /* We depend on Reserved Lkey */ 88 if (!(dev_attr->hca_flags2 & IBT_HCA2_RES_LKEY)) { 89 RDSV3_DPRINTF2("rdsv3_ib_add_one", 90 "Reserved Lkey support is required: %s", 91 device->name); 92 goto free_attr; 93 } 94 95 rds_ibdev = kmem_zalloc(sizeof (*rds_ibdev), KM_NOSLEEP); 96 if (!rds_ibdev) 97 goto free_attr; 98 99 rds_ibdev->ibt_hca_hdl = ib_get_ibt_hca_hdl(device); 100 rds_ibdev->hca_attr = *dev_attr; 101 102 rw_init(&rds_ibdev->rwlock, NULL, RW_DRIVER, NULL); 103 mutex_init(&rds_ibdev->spinlock, NULL, MUTEX_DRIVER, NULL); 104 105 rds_ibdev->max_wrs = dev_attr->hca_max_chan_sz; 106 rds_ibdev->max_sge = min(dev_attr->hca_max_sgl, RDSV3_IB_MAX_SGE); 107 108 rds_ibdev->max_initiator_depth = (uint_t)dev_attr->hca_max_rdma_in_qp; 109 rds_ibdev->max_responder_resources = 110 (uint_t)dev_attr->hca_max_rdma_in_qp; 111 112 rds_ibdev->dev = device; 113 rds_ibdev->pd = ib_alloc_pd(device); 114 if (IS_ERR(rds_ibdev->pd)) 115 goto free_dev; 116 117 if (rdsv3_ib_create_mr_pool(rds_ibdev) != 0) { 118 goto free_dev; 119 } 120 121 if (rdsv3_ib_create_inc_pool(rds_ibdev) != 0) { 122 rdsv3_ib_destroy_mr_pool(rds_ibdev); 123 goto free_dev; 124 } 125 126 (void) snprintf(name, 64, "RDSV3_IB_FRAG_%llx", 127 (longlong_t)htonll(dev_attr->hca_node_guid)); 128 rds_ibdev->ib_frag_slab = kmem_cache_create(name, 129 sizeof (struct rdsv3_page_frag), 0, rdsv3_ib_frag_constructor, 130 rdsv3_ib_frag_destructor, NULL, (void *)rds_ibdev, NULL, 0); 131 if (rds_ibdev->ib_frag_slab == NULL) { 132 RDSV3_DPRINTF2("rdsv3_ib_add_one", 133 "kmem_cache_create for ib_frag_slab failed for device: %s", 134 device->name); 135 rdsv3_ib_destroy_mr_pool(rds_ibdev); 136 rdsv3_ib_destroy_inc_pool(rds_ibdev); 137 goto free_dev; 138 } 139 140 rds_ibdev->aft_hcagp = rdsv3_af_grp_create(rds_ibdev->ibt_hca_hdl, 141 (uint64_t)rds_ibdev->hca_attr.hca_node_guid); 142 if (rds_ibdev->aft_hcagp == NULL) { 143 rdsv3_ib_destroy_mr_pool(rds_ibdev); 144 rdsv3_ib_destroy_inc_pool(rds_ibdev); 145 kmem_cache_destroy(rds_ibdev->ib_frag_slab); 146 goto free_dev; 147 } 148 rds_ibdev->fmr_soft_cq = rdsv3_af_thr_create(rdsv3_ib_drain_mrlist_fn, 149 (void *)rds_ibdev->fmr_pool, SCQ_HCA_BIND_CPU, 150 rds_ibdev->aft_hcagp); 151 if (rds_ibdev->fmr_soft_cq == NULL) { 152 rdsv3_af_grp_destroy(rds_ibdev->aft_hcagp); 153 rdsv3_ib_destroy_mr_pool(rds_ibdev); 154 rdsv3_ib_destroy_inc_pool(rds_ibdev); 155 kmem_cache_destroy(rds_ibdev->ib_frag_slab); 156 goto free_dev; 157 } 158 159 rds_ibdev->inc_soft_cq = rdsv3_af_thr_create(rdsv3_ib_drain_inclist, 160 (void *)rds_ibdev->inc_pool, SCQ_HCA_BIND_CPU, 161 rds_ibdev->aft_hcagp); 162 if (rds_ibdev->inc_soft_cq == NULL) { 163 rdsv3_af_thr_destroy(rds_ibdev->fmr_soft_cq); 164 rdsv3_af_grp_destroy(rds_ibdev->aft_hcagp); 165 rdsv3_ib_destroy_mr_pool(rds_ibdev); 166 rdsv3_ib_destroy_inc_pool(rds_ibdev); 167 kmem_cache_destroy(rds_ibdev->ib_frag_slab); 168 goto free_dev; 169 } 170 171 list_create(&rds_ibdev->ipaddr_list, sizeof (struct rdsv3_ib_ipaddr), 172 offsetof(struct rdsv3_ib_ipaddr, list)); 173 list_create(&rds_ibdev->conn_list, sizeof (struct rdsv3_ib_connection), 174 offsetof(struct rdsv3_ib_connection, ib_node)); 175 176 list_insert_tail(&rdsv3_ib_devices, rds_ibdev); 177 178 ib_set_client_data(device, &rdsv3_ib_client, rds_ibdev); 179 180 RDSV3_DPRINTF2("rdsv3_ib_add_one", "Return: device: %p", device); 181 182 goto free_attr; 183 184 err_pd: 185 (void) ib_dealloc_pd(rds_ibdev->pd); 186 free_dev: 187 mutex_destroy(&rds_ibdev->spinlock); 188 rw_destroy(&rds_ibdev->rwlock); 189 kmem_free(rds_ibdev, sizeof (*rds_ibdev)); 190 free_attr: 191 kmem_free(dev_attr, sizeof (*dev_attr)); 192 } 193 194 void 195 rdsv3_ib_remove_one(struct ib_device *device) 196 { 197 struct rdsv3_ib_device *rds_ibdev; 198 struct rdsv3_ib_ipaddr *i_ipaddr, *i_next; 199 200 RDSV3_DPRINTF2("rdsv3_ib_remove_one", "device: %p", device); 201 202 rds_ibdev = ib_get_client_data(device, &rdsv3_ib_client); 203 if (!rds_ibdev) 204 return; 205 206 RDSV3_FOR_EACH_LIST_NODE_SAFE(i_ipaddr, i_next, &rds_ibdev->ipaddr_list, 207 list) { 208 list_remove_node(&i_ipaddr->list); 209 kmem_free(i_ipaddr, sizeof (*i_ipaddr)); 210 } 211 212 rdsv3_ib_destroy_conns(rds_ibdev); 213 214 if (rds_ibdev->fmr_soft_cq) 215 rdsv3_af_thr_destroy(rds_ibdev->fmr_soft_cq); 216 if (rds_ibdev->inc_soft_cq) 217 rdsv3_af_thr_destroy(rds_ibdev->inc_soft_cq); 218 219 rdsv3_ib_destroy_mr_pool(rds_ibdev); 220 rdsv3_ib_destroy_inc_pool(rds_ibdev); 221 222 kmem_cache_destroy(rds_ibdev->ib_frag_slab); 223 224 rdsv3_af_grp_destroy(rds_ibdev->aft_hcagp); 225 226 #if 0 227 while (ib_dealloc_pd(rds_ibdev->pd)) { 228 #ifndef __lock_lint 229 RDSV3_DPRINTF5("rdsv3_ib_remove_one", 230 "%s-%d Failed to dealloc pd %p", 231 __func__, __LINE__, rds_ibdev->pd); 232 #endif 233 delay(drv_usectohz(1000)); 234 } 235 #else 236 if (ib_dealloc_pd(rds_ibdev->pd)) { 237 #ifndef __lock_lint 238 RDSV3_DPRINTF2("rdsv3_ib_remove_one", 239 "Failed to dealloc pd %p\n", rds_ibdev->pd); 240 #endif 241 } 242 #endif 243 244 list_destroy(&rds_ibdev->ipaddr_list); 245 list_destroy(&rds_ibdev->conn_list); 246 list_remove_node(&rds_ibdev->list); 247 mutex_destroy(&rds_ibdev->spinlock); 248 rw_destroy(&rds_ibdev->rwlock); 249 kmem_free(rds_ibdev, sizeof (*rds_ibdev)); 250 251 RDSV3_DPRINTF2("rdsv3_ib_remove_one", "Return: device: %p", device); 252 } 253 254 #ifndef __lock_lint 255 struct ib_client rdsv3_ib_client = { 256 .name = "rdsv3_ib", 257 .add = rdsv3_ib_add_one, 258 .remove = rdsv3_ib_remove_one, 259 .clnt_hdl = NULL, 260 .state = IB_CLNT_UNINITIALIZED 261 }; 262 #else 263 struct ib_client rdsv3_ib_client = { 264 "rdsv3_ib", 265 rdsv3_ib_add_one, 266 rdsv3_ib_remove_one, 267 NULL, 268 NULL, 269 IB_CLNT_UNINITIALIZED 270 }; 271 #endif 272 273 static int 274 rds_ib_conn_info_visitor(struct rdsv3_connection *conn, 275 void *buffer) 276 { 277 struct rds_info_rdma_connection *iinfo = buffer; 278 struct rdsv3_ib_connection *ic; 279 280 RDSV3_DPRINTF4("rds_ib_conn_info_visitor", "conn: %p buffer: %p", 281 conn, buffer); 282 283 /* We will only ever look at IB transports */ 284 if (conn->c_trans != &rdsv3_ib_transport) 285 return (0); 286 287 iinfo->src_addr = conn->c_laddr; 288 iinfo->dst_addr = conn->c_faddr; 289 290 (void) memset(&iinfo->src_gid, 0, sizeof (iinfo->src_gid)); 291 (void) memset(&iinfo->dst_gid, 0, sizeof (iinfo->dst_gid)); 292 if (rdsv3_conn_state(conn) == RDSV3_CONN_UP) { 293 struct rdsv3_ib_device *rds_ibdev; 294 struct rdma_dev_addr *dev_addr; 295 296 ic = conn->c_transport_data; 297 dev_addr = &ic->i_cm_id->route.addr.dev_addr; 298 299 ib_addr_get_sgid(dev_addr, (union ib_gid *)&iinfo->src_gid); 300 ib_addr_get_dgid(dev_addr, (union ib_gid *)&iinfo->dst_gid); 301 302 rds_ibdev = ib_get_client_data(ic->i_cm_id->device, 303 &rdsv3_ib_client); 304 iinfo->max_send_wr = ic->i_send_ring.w_nr; 305 iinfo->max_recv_wr = ic->i_recv_ring.w_nr; 306 iinfo->max_send_sge = rds_ibdev->max_sge; 307 } 308 309 RDSV3_DPRINTF4("rds_ib_conn_info_visitor", "conn: %p buffer: %p", 310 conn, buffer); 311 return (1); 312 } 313 314 static void 315 rds_ib_ic_info(struct rsock *sock, unsigned int len, 316 struct rdsv3_info_iterator *iter, 317 struct rdsv3_info_lengths *lens) 318 { 319 RDSV3_DPRINTF4("rds_ib_ic_info", "sk: %p iter: %p, lens: %p, len: %d", 320 sock, iter, lens, len); 321 322 rdsv3_for_each_conn_info(sock, len, iter, lens, 323 rds_ib_conn_info_visitor, 324 sizeof (struct rds_info_rdma_connection)); 325 } 326 327 /* 328 * Early RDS/IB was built to only bind to an address if there is an IPoIB 329 * device with that address set. 330 * 331 * If it were me, I'd advocate for something more flexible. Sending and 332 * receiving should be device-agnostic. Transports would try and maintain 333 * connections between peers who have messages queued. Userspace would be 334 * allowed to influence which paths have priority. We could call userspace 335 * asserting this policy "routing". 336 */ 337 static int 338 rds_ib_laddr_check(uint32_be_t addr) 339 { 340 int ret; 341 struct rdma_cm_id *cm_id; 342 struct sockaddr_in sin; 343 344 RDSV3_DPRINTF4("rds_ib_laddr_check", "addr: %x", ntohl(addr)); 345 346 /* 347 * Create a CMA ID and try to bind it. This catches both 348 * IB and iWARP capable NICs. 349 */ 350 cm_id = rdma_create_id(NULL, NULL, RDMA_PS_TCP); 351 if (!cm_id) 352 return (-EADDRNOTAVAIL); 353 354 (void) memset(&sin, 0, sizeof (sin)); 355 sin.sin_family = AF_INET; 356 sin.sin_addr.s_addr = rdsv3_scaddr_to_ibaddr(addr); 357 358 /* rdma_bind_addr will only succeed for IB & iWARP devices */ 359 ret = rdma_bind_addr(cm_id, (struct sockaddr *)&sin); 360 /* 361 * due to this, we will claim to support iWARP devices unless we 362 * check node_type. 363 */ 364 if (ret || cm_id->device->node_type != RDMA_NODE_IB_CA) 365 ret = -EADDRNOTAVAIL; 366 367 RDSV3_DPRINTF5("rds_ib_laddr_check", 368 "addr %u.%u.%u.%u ret %d node type %d", 369 NIPQUAD(addr), ret, 370 cm_id->device ? cm_id->device->node_type : -1); 371 372 rdma_destroy_id(cm_id); 373 374 return (ret); 375 } 376 377 void 378 rdsv3_ib_exit(void) 379 { 380 RDSV3_DPRINTF4("rds_ib_exit", "Enter"); 381 382 rdsv3_info_deregister_func(RDS_INFO_IB_CONNECTIONS, rds_ib_ic_info); 383 rdsv3_ib_destroy_nodev_conns(); 384 ib_unregister_client(&rdsv3_ib_client); 385 rdsv3_ib_sysctl_exit(); 386 rdsv3_ib_recv_exit(); 387 rdsv3_trans_unregister(&rdsv3_ib_transport); 388 kmem_free(rdsv3_ib_stats, 389 nr_cpus * sizeof (struct rdsv3_ib_statistics)); 390 mutex_destroy(&ib_nodev_conns_lock); 391 list_destroy(&ib_nodev_conns); 392 list_destroy(&rdsv3_ib_devices); 393 394 RDSV3_DPRINTF4("rds_ib_exit", "Return"); 395 } 396 397 #ifndef __lock_lint 398 struct rdsv3_transport rdsv3_ib_transport = { 399 .laddr_check = rds_ib_laddr_check, 400 .xmit_complete = rdsv3_ib_xmit_complete, 401 .xmit = rdsv3_ib_xmit, 402 .xmit_cong_map = NULL, 403 .xmit_rdma = rdsv3_ib_xmit_rdma, 404 .recv = rdsv3_ib_recv, 405 .conn_alloc = rdsv3_ib_conn_alloc, 406 .conn_free = rdsv3_ib_conn_free, 407 .conn_connect = rdsv3_ib_conn_connect, 408 .conn_shutdown = rdsv3_ib_conn_shutdown, 409 .inc_copy_to_user = rdsv3_ib_inc_copy_to_user, 410 .inc_free = rdsv3_ib_inc_free, 411 .cm_initiate_connect = rdsv3_ib_cm_initiate_connect, 412 .cm_handle_connect = rdsv3_ib_cm_handle_connect, 413 .cm_connect_complete = rdsv3_ib_cm_connect_complete, 414 .stats_info_copy = rdsv3_ib_stats_info_copy, 415 .exit = rdsv3_ib_exit, 416 .get_mr = rdsv3_ib_get_mr, 417 .sync_mr = rdsv3_ib_sync_mr, 418 .free_mr = rdsv3_ib_free_mr, 419 .flush_mrs = rdsv3_ib_flush_mrs, 420 .t_name = "infiniband", 421 .t_type = RDS_TRANS_IB 422 }; 423 #else 424 struct rdsv3_transport rdsv3_ib_transport; 425 #endif 426 427 int 428 rdsv3_ib_init(void) 429 { 430 int ret; 431 432 RDSV3_DPRINTF4("rds_ib_init", "Enter"); 433 434 list_create(&rdsv3_ib_devices, sizeof (struct rdsv3_ib_device), 435 offsetof(struct rdsv3_ib_device, list)); 436 list_create(&ib_nodev_conns, sizeof (struct rdsv3_ib_connection), 437 offsetof(struct rdsv3_ib_connection, ib_node)); 438 mutex_init(&ib_nodev_conns_lock, NULL, MUTEX_DRIVER, NULL); 439 440 /* allocate space for ib statistics */ 441 ASSERT(rdsv3_ib_stats == NULL); 442 rdsv3_ib_stats = kmem_zalloc(nr_cpus * 443 sizeof (struct rdsv3_ib_statistics), KM_SLEEP); 444 445 rdsv3_ib_client.dip = rdsv3_dev_info; 446 ret = ib_register_client(&rdsv3_ib_client); 447 if (ret) 448 goto out; 449 450 ret = rdsv3_ib_sysctl_init(); 451 if (ret) 452 goto out_ibreg; 453 454 ret = rdsv3_ib_recv_init(); 455 if (ret) 456 goto out_sysctl; 457 458 ret = rdsv3_trans_register(&rdsv3_ib_transport); 459 if (ret) 460 goto out_recv; 461 462 rdsv3_info_register_func(RDS_INFO_IB_CONNECTIONS, rds_ib_ic_info); 463 464 RDSV3_DPRINTF4("rds_ib_init", "Return"); 465 466 return (0); 467 468 out_recv: 469 rdsv3_ib_recv_exit(); 470 out_sysctl: 471 rdsv3_ib_sysctl_exit(); 472 out_ibreg: 473 ib_unregister_client(&rdsv3_ib_client); 474 out: 475 kmem_free(rdsv3_ib_stats, 476 nr_cpus * sizeof (struct rdsv3_ib_statistics)); 477 mutex_destroy(&ib_nodev_conns_lock); 478 list_destroy(&ib_nodev_conns); 479 list_destroy(&rdsv3_ib_devices); 480 return (ret); 481 } 482