/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
 */

/*
 * Copyright (c) 2006 Oracle.  All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses.  You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the
 * OpenIB.org BSD license below:
 *
 *     Redistribution and use in source and binary forms, with or
 *     without modification, are permitted provided that the following
 *     conditions are met:
 *
 *      - Redistributions of source code must retain the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer.
 *
 *      - Redistributions in binary form must reproduce the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer in the documentation and/or other materials
 *        provided with the distribution.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 */
#include <sys/sysmacros.h>
#include <sys/rds.h>

#include <sys/ib/ibtl/ibti.h>
#include <sys/ib/clients/rdsv3/rdsv3.h>
#include <sys/ib/clients/rdsv3/ib.h>
#include <sys/ib/clients/rdsv3/rdsv3_debug.h>

unsigned int rdsv3_ib_retry_count = RDSV3_IB_DEFAULT_RETRY_COUNT;

struct list rdsv3_ib_devices;

/* NOTE: if also grabbing ibdev lock, grab this first */
kmutex_t ib_nodev_conns_lock;
list_t ib_nodev_conns;

extern int rdsv3_ib_frag_constructor(void *buf, void *arg, int kmflags);
extern void rdsv3_ib_frag_destructor(void *buf, void *arg);

/*
 * rdsv3_ib_client "add" callback, invoked when an IB HCA is attached.
 * Sets up the per-device state (PD, MR pool, fragment cache) used by
 * RDSv3 connections on that HCA.
 */
void
rdsv3_ib_add_one(ib_device_t *device)
{
	struct rdsv3_ib_device *rds_ibdev;
	ibt_hca_attr_t *dev_attr;
	char name[64];

	RDSV3_DPRINTF4("rdsv3_ib_add_one", "device: %p", device);

	/* Only handle IB (no iWARP) devices */
	if (device->node_type != RDMA_NODE_IB_CA)
		return;

	dev_attr = (ibt_hca_attr_t *)kmem_alloc(sizeof (*dev_attr),
	    KM_NOSLEEP);
	if (!dev_attr)
		return;

	if (ibt_query_hca(ib_get_ibt_hca_hdl(device), dev_attr)) {
		RDSV3_DPRINTF2("rdsv3_ib_add_one",
		    "Query device failed for %s", device->name);
		goto free_attr;
	}

	/* We depend on Reserved Lkey */
	if (!(dev_attr->hca_flags2 & IBT_HCA2_RES_LKEY)) {
		RDSV3_DPRINTF2("rdsv3_ib_add_one",
		    "Reserved Lkey support is required: %s",
		    device->name);
		goto free_attr;
	}

	rds_ibdev = kmem_zalloc(sizeof (*rds_ibdev), KM_NOSLEEP);
	if (!rds_ibdev)
		goto free_attr;

	mutex_init(&rds_ibdev->spinlock, NULL, MUTEX_DRIVER, NULL);

	rds_ibdev->max_wrs = dev_attr->hca_max_chan_sz;
	rds_ibdev->max_sge = min(dev_attr->hca_max_sgl, RDSV3_IB_MAX_SGE);

	rds_ibdev->dev = device;
	rds_ibdev->pd = ib_alloc_pd(device);
	if (IS_ERR(rds_ibdev->pd))
		goto free_dev;

	if (rdsv3_ib_create_mr_pool(rds_ibdev) != 0) {
		goto err_pd;
	}

	(void) snprintf(name, 64, "RDSV3_IB_FRAG_%llx",
	    (longlong_t)htonll(dev_attr->hca_node_guid));
	rds_ibdev->ib_frag_slab = kmem_cache_create(name,
	    sizeof (struct rdsv3_page_frag), 0, rdsv3_ib_frag_constructor,
	    rdsv3_ib_frag_destructor, NULL, (void *)rds_ibdev, NULL, 0);
	if (rds_ibdev->ib_frag_slab == NULL) {
		RDSV3_DPRINTF2("rdsv3_ib_add_one",
		    "kmem_cache_create for ib_frag_slab failed for device: %s",
		    device->name);
		rdsv3_ib_destroy_mr_pool(rds_ibdev);
		goto err_pd;
	}

	list_create(&rds_ibdev->ipaddr_list, sizeof (struct rdsv3_ib_ipaddr),
	    offsetof(struct rdsv3_ib_ipaddr, list));
	list_create(&rds_ibdev->conn_list, sizeof (struct rdsv3_ib_connection),
	    offsetof(struct rdsv3_ib_connection, ib_node));

	list_insert_tail(&rdsv3_ib_devices, rds_ibdev);

	ib_set_client_data(device, &rdsv3_ib_client, rds_ibdev);

	RDSV3_DPRINTF4("rdsv3_ib_add_one", "Return: device: %p", device);

	goto free_attr;

err_pd:
	(void) ib_dealloc_pd(rds_ibdev->pd);
free_dev:
	kmem_free(rds_ibdev, sizeof (*rds_ibdev));
free_attr:
	kmem_free(dev_attr, sizeof (*dev_attr));
}

/*
 * rdsv3_ib_client "remove" callback, invoked when an IB HCA is detached.
 * Tears down the per-device state created in rdsv3_ib_add_one().
 */
void
rdsv3_ib_remove_one(struct ib_device *device)
{
	struct rdsv3_ib_device *rds_ibdev;
	struct rdsv3_ib_ipaddr *i_ipaddr, *i_next;

	RDSV3_DPRINTF4("rdsv3_ib_remove_one", "device: %p", device);

	rds_ibdev = ib_get_client_data(device, &rdsv3_ib_client);
	if (!rds_ibdev)
		return;

	RDSV3_FOR_EACH_LIST_NODE_SAFE(i_ipaddr, i_next, &rds_ibdev->ipaddr_list,
	    list) {
		list_remove_node(&i_ipaddr->list);
		kmem_free(i_ipaddr, sizeof (*i_ipaddr));
	}

	rdsv3_ib_destroy_conns(rds_ibdev);

	rdsv3_ib_destroy_mr_pool(rds_ibdev);

	kmem_cache_destroy(rds_ibdev->ib_frag_slab);

#if 0
	while (ib_dealloc_pd(rds_ibdev->pd)) {
#ifndef __lock_lint
		RDSV3_DPRINTF5("rdsv3_ib_remove_one",
		    "%s-%d Failed to dealloc pd %p",
		    __func__, __LINE__, rds_ibdev->pd);
#endif
		delay(drv_usectohz(1000));
	}
#else
	if (ib_dealloc_pd(rds_ibdev->pd)) {
#ifndef __lock_lint
		RDSV3_DPRINTF2("rdsv3_ib_remove_one",
		    "Failed to dealloc pd %p\n", rds_ibdev->pd);
#endif
	}
#endif

	list_destroy(&rds_ibdev->ipaddr_list);
	list_destroy(&rds_ibdev->conn_list);
	list_remove_node(&rds_ibdev->list);
	kmem_free(rds_ibdev, sizeof (*rds_ibdev));

	RDSV3_DPRINTF4("rdsv3_ib_remove_one", "Return: device: %p", device);
}

#ifndef __lock_lint
struct ib_client rdsv3_ib_client = {
	.name		= "rdsv3_ib",
	.add		= rdsv3_ib_add_one,
	.remove		= rdsv3_ib_remove_one,
	.clnt_hdl	= NULL,
	.state		= IB_CLNT_UNINITIALIZED
};
#else
struct ib_client rdsv3_ib_client = {
	"rdsv3_ib",
	rdsv3_ib_add_one,
	rdsv3_ib_remove_one,
	NULL,
	NULL,
	IB_CLNT_UNINITIALIZED
};
#endif

static int
rds_ib_conn_info_visitor(struct rdsv3_connection *conn,
    void *buffer)
{
	struct rdsv3_info_rdma_connection *iinfo = buffer;
	struct rdsv3_ib_connection *ic;

	RDSV3_DPRINTF4("rds_ib_conn_info_visitor", "conn: %p buffer: %p",
	    conn, buffer);

	/* We will only ever look at IB transports */
	if (conn->c_trans != &rdsv3_ib_transport)
		return (0);

	iinfo->src_addr = conn->c_laddr;
	iinfo->dst_addr = conn->c_faddr;

	(void) memset(&iinfo->src_gid, 0, sizeof (iinfo->src_gid));
	(void) memset(&iinfo->dst_gid, 0, sizeof (iinfo->dst_gid));
	if (rdsv3_conn_state(conn) == RDSV3_CONN_UP) {
		struct rdsv3_ib_device *rds_ibdev;
		struct rdma_dev_addr *dev_addr;

		ic = conn->c_transport_data;
		dev_addr = &ic->i_cm_id->route.addr.dev_addr;

		ib_addr_get_sgid(dev_addr, (union ib_gid *)&iinfo->src_gid);
		ib_addr_get_dgid(dev_addr, (union ib_gid *)&iinfo->dst_gid);

		rds_ibdev = ib_get_client_data(ic->i_cm_id->device,
		    &rdsv3_ib_client);
		iinfo->max_send_wr = ic->i_send_ring.w_nr;
		iinfo->max_recv_wr = ic->i_recv_ring.w_nr;
		iinfo->max_send_sge = rds_ibdev->max_sge;
	}

	RDSV3_DPRINTF4("rds_ib_conn_info_visitor", "conn: %p buffer: %p",
	    conn, buffer);
	return (1);
}

static void
rds_ib_ic_info(struct rsock *sock, unsigned int len,
    struct rdsv3_info_iterator *iter,
    struct rdsv3_info_lengths *lens)
{
	RDSV3_DPRINTF4("rds_ib_ic_info", "sk: %p iter: %p, lens: %p, len: %d",
	    sock, iter, lens, len);

	rdsv3_for_each_conn_info(sock, len, iter, lens,
	    rds_ib_conn_info_visitor,
	    sizeof (struct rdsv3_info_rdma_connection));
}

/*
 * Early RDS/IB was built to only bind to an address if there is an IPoIB
 * device with that address set.
 *
 * If it were me, I'd advocate for something more flexible.  Sending and
 * receiving should be device-agnostic.  Transports would try and maintain
 * connections between peers who have messages queued.  Userspace would be
 * allowed to influence which paths have priority.  We could call userspace
 * asserting this policy "routing".
 */
static int
rds_ib_laddr_check(uint32_be_t addr)
{
	int ret;
	struct rdma_cm_id *cm_id;
	struct sockaddr_in sin;

	RDSV3_DPRINTF4("rds_ib_laddr_check", "addr: %x", ntohl(addr));

	/*
	 * Create a CMA ID and try to bind it. This catches both
	 * IB and iWARP capable NICs.
	 */
	cm_id = rdma_create_id(NULL, NULL, RDMA_PS_TCP);
	if (!cm_id)
		return (-EADDRNOTAVAIL);

	(void) memset(&sin, 0, sizeof (sin));
	sin.sin_family = AF_INET;
	sin.sin_addr.s_addr = rdsv3_scaddr_to_ibaddr(addr);

	/* rdma_bind_addr will only succeed for IB & iWARP devices */
	ret = rdma_bind_addr(cm_id, (struct sockaddr *)&sin);
	/*
	 * due to this, we will claim to support iWARP devices unless we
	 * check node_type.
	 */
	if (ret || cm_id->device->node_type != RDMA_NODE_IB_CA)
		ret = -EADDRNOTAVAIL;

	RDSV3_DPRINTF5("rds_ib_laddr_check",
	    "addr %u.%u.%u.%u ret %d node type %d",
	    NIPQUAD(addr), ret,
	    cm_id->device ? cm_id->device->node_type : -1);

	rdma_destroy_id(cm_id);

	return (ret);
}

void
rdsv3_ib_exit(void)
{
	RDSV3_DPRINTF4("rds_ib_exit", "Enter");

	rdsv3_info_deregister_func(RDSV3_INFO_IB_CONNECTIONS, rds_ib_ic_info);
	rdsv3_ib_destroy_nodev_conns();
	ib_unregister_client(&rdsv3_ib_client);
	rdsv3_ib_sysctl_exit();
	rdsv3_ib_recv_exit();
	rdsv3_trans_unregister(&rdsv3_ib_transport);
	mutex_destroy(&ib_nodev_conns_lock);
	list_destroy(&ib_nodev_conns);
	list_destroy(&rdsv3_ib_devices);

	RDSV3_DPRINTF4("rds_ib_exit", "Return");
}

#ifndef __lock_lint
struct rdsv3_transport rdsv3_ib_transport = {
	.laddr_check = rds_ib_laddr_check,
	.xmit_complete = rdsv3_ib_xmit_complete,
	.xmit = rdsv3_ib_xmit,
	.xmit_cong_map = NULL,
	.xmit_rdma = rdsv3_ib_xmit_rdma,
	.recv = rdsv3_ib_recv,
	.conn_alloc = rdsv3_ib_conn_alloc,
	.conn_free = rdsv3_ib_conn_free,
	.conn_connect = rdsv3_ib_conn_connect,
	.conn_shutdown = rdsv3_ib_conn_shutdown,
	.inc_copy_to_user = rdsv3_ib_inc_copy_to_user,
	.inc_purge = rdsv3_ib_inc_purge,
	.inc_free = rdsv3_ib_inc_free,
	.cm_initiate_connect = rdsv3_ib_cm_initiate_connect,
	.cm_handle_connect = rdsv3_ib_cm_handle_connect,
	.cm_connect_complete = rdsv3_ib_cm_connect_complete,
	.stats_info_copy = rdsv3_ib_stats_info_copy,
	.exit = rdsv3_ib_exit,
	.get_mr = rdsv3_ib_get_mr,
	.sync_mr = rdsv3_ib_sync_mr,
	.free_mr = rdsv3_ib_free_mr,
	.flush_mrs = rdsv3_ib_flush_mrs,
	.t_name = "infiniband",
	.t_type = RDS_TRANS_IB
};
#else
struct rdsv3_transport rdsv3_ib_transport;
#endif

int
rdsv3_ib_init(void)
{
	int ret;

	RDSV3_DPRINTF4("rds_ib_init", "Enter");

	list_create(&rdsv3_ib_devices, sizeof (struct rdsv3_ib_device),
	    offsetof(struct rdsv3_ib_device, list));
	list_create(&ib_nodev_conns, sizeof (struct rdsv3_ib_connection),
	    offsetof(struct rdsv3_ib_connection, ib_node));
	mutex_init(&ib_nodev_conns_lock, NULL, MUTEX_DRIVER, NULL);

	rdsv3_ib_client.dip = rdsv3_dev_info;
	ret = ib_register_client(&rdsv3_ib_client);
	if (ret)
		goto out;

	ret = rdsv3_ib_sysctl_init();
	if (ret)
		goto out_ibreg;

	ret = rdsv3_ib_recv_init();
	if (ret)
		goto out_sysctl;

	ret = rdsv3_trans_register(&rdsv3_ib_transport);
	if (ret)
		goto out_recv;

	rdsv3_info_register_func(RDSV3_INFO_IB_CONNECTIONS, rds_ib_ic_info);
RDSV3_DPRINTF4("rds_ib_init", "Return"); 416 417 return (0); 418 419 out_recv: 420 rdsv3_ib_recv_exit(); 421 out_sysctl: 422 rdsv3_ib_sysctl_exit(); 423 out_ibreg: 424 ib_unregister_client(&rdsv3_ib_client); 425 out: 426 mutex_destroy(&ib_nodev_conns_lock); 427 list_destroy(&ib_nodev_conns); 428 list_destroy(&rdsv3_ib_devices); 429 return (ret); 430 } 431