/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
 */

/*
 * Copyright (c) 2006 Oracle.  All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses.  You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the
 * OpenIB.org BSD license below:
 *
 *     Redistribution and use in source and binary forms, with or
 *     without modification, are permitted provided that the following
 *     conditions are met:
 *
 *      - Redistributions of source code must retain the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer.
 *
 *      - Redistributions in binary form must reproduce the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer in the documentation and/or other materials
 *        provided with the distribution.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 */
#include <sys/sysmacros.h>
#include <sys/rds.h>

#include <sys/ib/ibtl/ibti.h>
#include <sys/ib/clients/rdsv3/rdsv3.h>
#include <sys/ib/clients/rdsv3/ib.h>
#include <sys/ib/clients/rdsv3/rdsv3_debug.h>

unsigned int rdsv3_ib_retry_count = RDSV3_IB_DEFAULT_RETRY_COUNT;

struct list rdsv3_ib_devices;

/* NOTE: if also grabbing ibdev lock, grab this first */
kmutex_t ib_nodev_conns_lock;
list_t ib_nodev_conns;

extern int rdsv3_ib_frag_constructor(void *buf, void *arg, int kmflags);
extern void rdsv3_ib_frag_destructor(void *buf, void *arg);

/*
 * HCA attach callback.  Allocates and initializes the per-device
 * rdsv3_ib_device state (protection domain, MR and inc pools, receive
 * fragment kmem cache and soft-CQ worker threads) and registers it as
 * IB client data for this device.
 */
void
rdsv3_ib_add_one(ib_device_t *device)
{
	struct rdsv3_ib_device *rds_ibdev;
	ibt_hca_attr_t *dev_attr;
	char name[64];

	RDSV3_DPRINTF2("rdsv3_ib_add_one", "device: %p", device);

	/* Only handle IB (no iWARP) devices */
	if (device->node_type != RDMA_NODE_IB_CA)
		return;

	dev_attr = (ibt_hca_attr_t *)kmem_alloc(sizeof (*dev_attr),
	    KM_NOSLEEP);
	if (!dev_attr)
		return;

	if (ibt_query_hca(ib_get_ibt_hca_hdl(device), dev_attr)) {
		RDSV3_DPRINTF2("rdsv3_ib_add_one",
		    "Query device failed for %s", device->name);
		goto free_attr;
	}

	/* We depend on Reserved Lkey */
	if (!(dev_attr->hca_flags2 & IBT_HCA2_RES_LKEY)) {
		RDSV3_DPRINTF2("rdsv3_ib_add_one",
		    "Reserved Lkey support is required: %s",
		    device->name);
		goto free_attr;
	}

	rds_ibdev = kmem_zalloc(sizeof (*rds_ibdev), KM_NOSLEEP);
	if (!rds_ibdev)
		goto free_attr;

	rds_ibdev->ibt_hca_hdl = ib_get_ibt_hca_hdl(device);
	rds_ibdev->hca_attr = *dev_attr;

	rw_init(&rds_ibdev->rwlock, NULL, RW_DRIVER, NULL);
	mutex_init(&rds_ibdev->spinlock, NULL, MUTEX_DRIVER, NULL);

	rds_ibdev->max_wrs = dev_attr->hca_max_chan_sz;
	rds_ibdev->max_sge = min(dev_attr->hca_max_sgl, RDSV3_IB_MAX_SGE);

	rds_ibdev->max_initiator_depth = (uint_t)dev_attr->hca_max_rdma_in_qp;
	rds_ibdev->max_responder_resources =
	    (uint_t)dev_attr->hca_max_rdma_in_qp;

	rds_ibdev->dev = device;
	rds_ibdev->pd = ib_alloc_pd(device);
	if (IS_ERR(rds_ibdev->pd))
		goto free_dev;

	if (rdsv3_ib_create_mr_pool(rds_ibdev) != 0) {
		goto free_dev;
	}

	if (rdsv3_ib_create_inc_pool(rds_ibdev) != 0) {
		rdsv3_ib_destroy_mr_pool(rds_ibdev);
		goto free_dev;
	}

	(void) snprintf(name, 64, "RDSV3_IB_FRAG_%llx",
	    (longlong_t)htonll(dev_attr->hca_node_guid));
	rds_ibdev->ib_frag_slab = kmem_cache_create(name,
	    sizeof (struct rdsv3_page_frag), 0, rdsv3_ib_frag_constructor,
	    rdsv3_ib_frag_destructor, NULL, (void *)rds_ibdev, NULL, 0);
	if (rds_ibdev->ib_frag_slab == NULL) {
		RDSV3_DPRINTF2("rdsv3_ib_add_one",
		    "kmem_cache_create for ib_frag_slab failed for device: %s",
		    device->name);
		rdsv3_ib_destroy_mr_pool(rds_ibdev);
		rdsv3_ib_destroy_inc_pool(rds_ibdev);
		goto free_dev;
	}

	rds_ibdev->aft_hcagp = rdsv3_af_grp_create(rds_ibdev->ibt_hca_hdl,
	    (uint64_t)rds_ibdev->hca_attr.hca_node_guid);
	if (rds_ibdev->aft_hcagp == NULL) {
		rdsv3_ib_destroy_mr_pool(rds_ibdev);
		rdsv3_ib_destroy_inc_pool(rds_ibdev);
		kmem_cache_destroy(rds_ibdev->ib_frag_slab);
		goto free_dev;
	}
	rds_ibdev->fmr_soft_cq = rdsv3_af_thr_create(rdsv3_ib_drain_mrlist_fn,
	    (void *)rds_ibdev->fmr_pool, SCQ_HCA_BIND_CPU,
	    rds_ibdev->aft_hcagp);
	if (rds_ibdev->fmr_soft_cq == NULL) {
		rdsv3_af_grp_destroy(rds_ibdev->aft_hcagp);
		rdsv3_ib_destroy_mr_pool(rds_ibdev);
		rdsv3_ib_destroy_inc_pool(rds_ibdev);
		kmem_cache_destroy(rds_ibdev->ib_frag_slab);
		goto free_dev;
	}

	rds_ibdev->inc_soft_cq = rdsv3_af_thr_create(rdsv3_ib_drain_inclist,
	    (void *)rds_ibdev->inc_pool, SCQ_HCA_BIND_CPU,
	    rds_ibdev->aft_hcagp);
	if (rds_ibdev->inc_soft_cq == NULL) {
		rdsv3_af_thr_destroy(rds_ibdev->fmr_soft_cq);
		rdsv3_af_grp_destroy(rds_ibdev->aft_hcagp);
		rdsv3_ib_destroy_mr_pool(rds_ibdev);
		rdsv3_ib_destroy_inc_pool(rds_ibdev);
		kmem_cache_destroy(rds_ibdev->ib_frag_slab);
		goto free_dev;
	}

	list_create(&rds_ibdev->ipaddr_list, sizeof (struct rdsv3_ib_ipaddr),
	    offsetof(struct rdsv3_ib_ipaddr, list));
	list_create(&rds_ibdev->conn_list, sizeof (struct rdsv3_ib_connection),
	    offsetof(struct rdsv3_ib_connection, ib_node));

	list_insert_tail(&rdsv3_ib_devices, rds_ibdev);

	ib_set_client_data(device, &rdsv3_ib_client, rds_ibdev);

	RDSV3_DPRINTF2("rdsv3_ib_add_one", "Return: device: %p", device);

	goto free_attr;

err_pd:
	(void) ib_dealloc_pd(rds_ibdev->pd);
free_dev:
	mutex_destroy(&rds_ibdev->spinlock);
	rw_destroy(&rds_ibdev->rwlock);
	kmem_free(rds_ibdev, sizeof (*rds_ibdev));
free_attr:
	kmem_free(dev_attr, sizeof (*dev_attr));
}

/*
 * HCA detach callback.  Undoes rdsv3_ib_add_one(): drops cached IP
 * addresses and connections for this device, then releases the soft-CQ
 * threads, pools, fragment cache, protection domain and the
 * rdsv3_ib_device itself.
 */
void
rdsv3_ib_remove_one(struct ib_device *device)
{
	struct rdsv3_ib_device *rds_ibdev;
	struct rdsv3_ib_ipaddr *i_ipaddr, *i_next;

	RDSV3_DPRINTF2("rdsv3_ib_remove_one", "device: %p", device);

	rds_ibdev = ib_get_client_data(device, &rdsv3_ib_client);
	if (!rds_ibdev)
		return;

	RDSV3_FOR_EACH_LIST_NODE_SAFE(i_ipaddr, i_next, &rds_ibdev->ipaddr_list,
	    list) {
		list_remove_node(&i_ipaddr->list);
		kmem_free(i_ipaddr, sizeof (*i_ipaddr));
	}

	rdsv3_ib_destroy_conns(rds_ibdev);

	if (rds_ibdev->fmr_soft_cq)
		rdsv3_af_thr_destroy(rds_ibdev->fmr_soft_cq);
	if (rds_ibdev->inc_soft_cq)
		rdsv3_af_thr_destroy(rds_ibdev->inc_soft_cq);

	rdsv3_ib_destroy_mr_pool(rds_ibdev);
	rdsv3_ib_destroy_inc_pool(rds_ibdev);

	kmem_cache_destroy(rds_ibdev->ib_frag_slab);

	rdsv3_af_grp_destroy(rds_ibdev->aft_hcagp);

#if 0
	while (ib_dealloc_pd(rds_ibdev->pd)) {
#ifndef __lock_lint
		RDSV3_DPRINTF5("rdsv3_ib_remove_one",
		    "%s-%d Failed to dealloc pd %p",
		    __func__, __LINE__, rds_ibdev->pd);
#endif
		delay(drv_usectohz(1000));
	}
#else
	if (ib_dealloc_pd(rds_ibdev->pd)) {
#ifndef __lock_lint
		RDSV3_DPRINTF2("rdsv3_ib_remove_one",
		    "Failed to dealloc pd %p\n", rds_ibdev->pd);
#endif
	}
#endif

	list_destroy(&rds_ibdev->ipaddr_list);
	list_destroy(&rds_ibdev->conn_list);
	list_remove_node(&rds_ibdev->list);
	mutex_destroy(&rds_ibdev->spinlock);
	rw_destroy(&rds_ibdev->rwlock);
	kmem_free(rds_ibdev, sizeof (*rds_ibdev));

	RDSV3_DPRINTF2("rdsv3_ib_remove_one", "Return: device: %p", device);
}

#ifndef __lock_lint
struct ib_client rdsv3_ib_client = {
	.name = "rdsv3_ib",
	.add = rdsv3_ib_add_one,
	.remove = rdsv3_ib_remove_one,
	.clnt_hdl = NULL,
	.state = IB_CLNT_UNINITIALIZED
};
#else
struct ib_client rdsv3_ib_client = {
	"rdsv3_ib",
	rdsv3_ib_add_one,
	rdsv3_ib_remove_one,
	NULL,
	NULL,
	IB_CLNT_UNINITIALIZED
};
#endif

/*
 * Fill in one rdsv3_info_rdma_connection entry; called for each
 * connection by rdsv3_for_each_conn_info().  Returns 0 to skip
 * non-IB connections, 1 once the entry has been filled in.
 */
static int
rds_ib_conn_info_visitor(struct rdsv3_connection *conn,
    void *buffer)
{
	struct rdsv3_info_rdma_connection *iinfo = buffer;
	struct rdsv3_ib_connection *ic;

	RDSV3_DPRINTF4("rds_ib_conn_info_visitor", "conn: %p buffer: %p",
	    conn, buffer);

	/* We will only ever look at IB transports */
	if (conn->c_trans != &rdsv3_ib_transport)
		return (0);

	iinfo->src_addr = conn->c_laddr;
	iinfo->dst_addr = conn->c_faddr;

	(void) memset(&iinfo->src_gid, 0, sizeof (iinfo->src_gid));
	(void) memset(&iinfo->dst_gid, 0, sizeof (iinfo->dst_gid));
	if (rdsv3_conn_state(conn) == RDSV3_CONN_UP) {
		struct rdsv3_ib_device *rds_ibdev;
		struct rdma_dev_addr *dev_addr;

		ic = conn->c_transport_data;
		dev_addr = &ic->i_cm_id->route.addr.dev_addr;

		ib_addr_get_sgid(dev_addr, (union ib_gid *)&iinfo->src_gid);
		ib_addr_get_dgid(dev_addr, (union ib_gid *)&iinfo->dst_gid);

		rds_ibdev = ib_get_client_data(ic->i_cm_id->device,
		    &rdsv3_ib_client);
		iinfo->max_send_wr = ic->i_send_ring.w_nr;
		iinfo->max_recv_wr = ic->i_recv_ring.w_nr;
		iinfo->max_send_sge = rds_ibdev->max_sge;
	}

	RDSV3_DPRINTF4("rds_ib_conn_info_visitor", "conn: %p buffer: %p",
	    conn, buffer);
	return (1);
}

static void
rds_ib_ic_info(struct rsock *sock, unsigned int len,
    struct rdsv3_info_iterator *iter,
    struct rdsv3_info_lengths *lens)
{
	RDSV3_DPRINTF4("rds_ib_ic_info", "sk: %p iter: %p, lens: %p, len: %d",
	    sock, iter, lens, len);

	rdsv3_for_each_conn_info(sock, len, iter, lens,
	    rds_ib_conn_info_visitor,
	    sizeof (struct rdsv3_info_rdma_connection));
}

/*
 * Early RDS/IB was built to only bind to an address if there is an IPoIB
 * device with that address set.
 *
 * If it were me, I'd advocate for something more flexible.  Sending and
 * receiving should be device-agnostic.  Transports would try and maintain
 * connections between peers who have messages queued.  Userspace would be
 * allowed to influence which paths have priority.  We could call userspace
 * asserting this policy "routing".
 */
static int
rds_ib_laddr_check(uint32_be_t addr)
{
	int ret;
	struct rdma_cm_id *cm_id;
	struct sockaddr_in sin;

	RDSV3_DPRINTF4("rds_ib_laddr_check", "addr: %x", ntohl(addr));

	/*
	 * Create a CMA ID and try to bind it. This catches both
	 * IB and iWARP capable NICs.
	 */
	cm_id = rdma_create_id(NULL, NULL, RDMA_PS_TCP);
	if (!cm_id)
		return (-EADDRNOTAVAIL);

	(void) memset(&sin, 0, sizeof (sin));
	sin.sin_family = AF_INET;
	sin.sin_addr.s_addr = rdsv3_scaddr_to_ibaddr(addr);

	/* rdma_bind_addr will only succeed for IB & iWARP devices */
	ret = rdma_bind_addr(cm_id, (struct sockaddr *)&sin);
	/*
	 * due to this, we will claim to support iWARP devices unless we
	 * check node_type.
	 */
	if (ret || cm_id->device->node_type != RDMA_NODE_IB_CA)
		ret = -EADDRNOTAVAIL;

	RDSV3_DPRINTF5("rds_ib_laddr_check",
	    "addr %u.%u.%u.%u ret %d node type %d",
	    NIPQUAD(addr), ret,
	    cm_id->device ? cm_id->device->node_type : -1);

	rdma_destroy_id(cm_id);

	return (ret);
}

/*
 * Transport teardown; the inverse of rdsv3_ib_init().
 */
void
rdsv3_ib_exit(void)
{
	RDSV3_DPRINTF4("rds_ib_exit", "Enter");

	rdsv3_info_deregister_func(RDSV3_INFO_IB_CONNECTIONS, rds_ib_ic_info);
	rdsv3_ib_destroy_nodev_conns();
	ib_unregister_client(&rdsv3_ib_client);
	rdsv3_ib_sysctl_exit();
	rdsv3_ib_recv_exit();
	rdsv3_trans_unregister(&rdsv3_ib_transport);
	mutex_destroy(&ib_nodev_conns_lock);
	list_destroy(&ib_nodev_conns);
	list_destroy(&rdsv3_ib_devices);

	RDSV3_DPRINTF4("rds_ib_exit", "Return");
}

#ifndef __lock_lint
struct rdsv3_transport rdsv3_ib_transport = {
	.laddr_check = rds_ib_laddr_check,
	.xmit_complete = rdsv3_ib_xmit_complete,
	.xmit = rdsv3_ib_xmit,
	.xmit_cong_map = NULL,
	.xmit_rdma = rdsv3_ib_xmit_rdma,
	.recv = rdsv3_ib_recv,
	.conn_alloc = rdsv3_ib_conn_alloc,
	.conn_free = rdsv3_ib_conn_free,
	.conn_connect = rdsv3_ib_conn_connect,
	.conn_shutdown = rdsv3_ib_conn_shutdown,
	.inc_copy_to_user = rdsv3_ib_inc_copy_to_user,
	.inc_free = rdsv3_ib_inc_free,
	.cm_initiate_connect = rdsv3_ib_cm_initiate_connect,
	.cm_handle_connect = rdsv3_ib_cm_handle_connect,
	.cm_connect_complete = rdsv3_ib_cm_connect_complete,
	.stats_info_copy = rdsv3_ib_stats_info_copy,
	.exit = rdsv3_ib_exit,
	.get_mr = rdsv3_ib_get_mr,
	.sync_mr = rdsv3_ib_sync_mr,
	.free_mr = rdsv3_ib_free_mr,
	.flush_mrs = rdsv3_ib_flush_mrs,
	.t_name = "infiniband",
	.t_type = RDS_TRANS_IB
};
#else
struct rdsv3_transport rdsv3_ib_transport;
#endif

/*
 * Transport setup: register as an IB client and as an RDS transport,
 * and hook up the connection-info handler.
 */
int
rdsv3_ib_init(void)
{
	int ret;

	RDSV3_DPRINTF4("rds_ib_init", "Enter");

	list_create(&rdsv3_ib_devices, sizeof (struct rdsv3_ib_device),
	    offsetof(struct rdsv3_ib_device, list));
	list_create(&ib_nodev_conns, sizeof (struct rdsv3_ib_connection),
	    offsetof(struct rdsv3_ib_connection, ib_node));
	mutex_init(&ib_nodev_conns_lock, NULL, MUTEX_DRIVER, NULL);

	rdsv3_ib_client.dip = rdsv3_dev_info;
	ret = ib_register_client(&rdsv3_ib_client);
	if (ret)
		goto out;

	ret = rdsv3_ib_sysctl_init();
	if (ret)
		goto out_ibreg;

	ret = rdsv3_ib_recv_init();
	if (ret)
		goto out_sysctl;

	ret = rdsv3_trans_register(&rdsv3_ib_transport);
	if (ret)
		goto out_recv;

	rdsv3_info_register_func(RDSV3_INFO_IB_CONNECTIONS, rds_ib_ic_info);

	RDSV3_DPRINTF4("rds_ib_init", "Return");

	return (0);

out_recv:
	rdsv3_ib_recv_exit();
out_sysctl:
	rdsv3_ib_sysctl_exit();
out_ibreg:
	ib_unregister_client(&rdsv3_ib_client);
out:
	mutex_destroy(&ib_nodev_conns_lock);
	list_destroy(&ib_nodev_conns);
	list_destroy(&rdsv3_ib_devices);
	return (ret);
}