1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. 23 */ 24 25 /* 26 * Copyright (c) 2006 Oracle. All rights reserved. 27 * 28 * This software is available to you under a choice of one of two 29 * licenses. You may choose to be licensed under the terms of the GNU 30 * General Public License (GPL) Version 2, available from the file 31 * COPYING in the main directory of this source tree, or the 32 * OpenIB.org BSD license below: 33 * 34 * Redistribution and use in source and binary forms, with or 35 * without modification, are permitted provided that the following 36 * conditions are met: 37 * 38 * - Redistributions of source code must retain the above 39 * copyright notice, this list of conditions and the following 40 * disclaimer. 41 * 42 * - Redistributions in binary form must reproduce the above 43 * copyright notice, this list of conditions and the following 44 * disclaimer in the documentation and/or other materials 45 * provided with the distribution. 46 * 47 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 48 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 49 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 50 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 51 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN 52 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 53 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 54 * SOFTWARE. 55 * 56 */ 57 #include <sys/sysmacros.h> 58 #include <sys/rds.h> 59 60 #include <sys/ib/ibtl/ibti.h> 61 #include <sys/ib/clients/rdsv3/rdsv3.h> 62 #include <sys/ib/clients/rdsv3/ib.h> 63 #include <sys/ib/clients/rdsv3/rdsv3_debug.h> 64 65 unsigned int rdsv3_ib_retry_count = RDSV3_IB_DEFAULT_RETRY_COUNT; 66 67 struct list rdsv3_ib_devices; 68 69 /* NOTE: if also grabbing ibdev lock, grab this first */ 70 kmutex_t ib_nodev_conns_lock; 71 list_t ib_nodev_conns; 72 73 void 74 rdsv3_ib_add_one(ib_device_t *device) 75 { 76 struct rdsv3_ib_device *rds_ibdev; 77 ibt_hca_attr_t *dev_attr; 78 79 RDSV3_DPRINTF4("rdsv3_ib_add_one", "device: %p", device); 80 81 /* Only handle IB (no iWARP) devices */ 82 if (device->node_type != RDMA_NODE_IB_CA) 83 return; 84 85 dev_attr = (ibt_hca_attr_t *)kmem_alloc(sizeof (*dev_attr), 86 KM_NOSLEEP); 87 if (!dev_attr) 88 return; 89 90 if (ibt_query_hca(ib_get_ibt_hca_hdl(device), dev_attr)) { 91 RDSV3_DPRINTF5("rdsv3_ib_add_one", 92 "Query device failed for %s", device->name); 93 goto free_attr; 94 } 95 96 /* We depend on Reserved Lkey */ 97 if (!(dev_attr->hca_flags2 & IBT_HCA2_RES_LKEY)) { 98 RDSV3_DPRINTF5("rdsv3_ib_add_one", 99 "Reserved Lkey support is required: %s", 100 device->name); 101 goto free_attr; 102 } 103 104 rds_ibdev = kmem_zalloc(sizeof (*rds_ibdev), KM_NOSLEEP); 105 if (!rds_ibdev) 106 goto free_attr; 107 108 mutex_init(&rds_ibdev->spinlock, NULL, MUTEX_DRIVER, NULL); 109 110 rds_ibdev->max_wrs = dev_attr->hca_max_chan_sz; 111 rds_ibdev->max_sge = min(dev_attr->hca_max_sgl, RDSV3_IB_MAX_SGE); 112 113 rds_ibdev->dev = device; 114 rds_ibdev->pd = ib_alloc_pd(device); 115 if (IS_ERR(rds_ibdev->pd)) 116 goto free_dev; 117 118 if (rdsv3_ib_create_mr_pool(rds_ibdev) != 0) { 119 goto free_dev; 120 } 121 122 list_create(&rds_ibdev->ipaddr_list, sizeof (struct rdsv3_ib_ipaddr), 123 offsetof(struct rdsv3_ib_ipaddr, list)); 124 list_create(&rds_ibdev->conn_list, sizeof (struct rdsv3_ib_connection), 125 offsetof(struct rdsv3_ib_connection, ib_node)); 126 127 list_insert_tail(&rdsv3_ib_devices, rds_ibdev); 128 129 ib_set_client_data(device, &rdsv3_ib_client, rds_ibdev); 130 131 RDSV3_DPRINTF4("rdsv3_ib_add_one", "Return: device: %p", device); 132 133 goto free_attr; 134 135 err_pd: 136 (void) ib_dealloc_pd(rds_ibdev->pd); 137 free_dev: 138 kmem_free(rds_ibdev, sizeof (*rds_ibdev)); 139 free_attr: 140 kmem_free(dev_attr, sizeof (*dev_attr)); 141 } 142 143 void 144 rdsv3_ib_remove_one(struct ib_device *device) 145 { 146 struct rdsv3_ib_device *rds_ibdev; 147 struct rdsv3_ib_ipaddr *i_ipaddr, *i_next; 148 149 RDSV3_DPRINTF4("rdsv3_ib_remove_one", "device: %p", device); 150 151 rds_ibdev = ib_get_client_data(device, &rdsv3_ib_client); 152 if (!rds_ibdev) 153 return; 154 155 RDSV3_FOR_EACH_LIST_NODE_SAFE(i_ipaddr, i_next, &rds_ibdev->ipaddr_list, 156 list) { 157 list_remove_node(&i_ipaddr->list); 158 kmem_free(i_ipaddr, sizeof (*i_ipaddr)); 159 } 160 161 rdsv3_ib_destroy_conns(rds_ibdev); 162 163 rdsv3_ib_destroy_mr_pool(rds_ibdev); 164 165 #if 0 166 while (ib_dealloc_pd(rds_ibdev->pd)) { 167 #ifndef __lock_lint 168 RDSV3_DPRINTF5("rdsv3_ib_remove_one", 169 "%s-%d Failed to dealloc pd %p", 170 __func__, __LINE__, rds_ibdev->pd); 171 #endif 172 delay(drv_usectohz(1000)); 173 } 174 #else 175 if (ib_dealloc_pd(rds_ibdev->pd)) { 176 #ifndef __lock_lint 177 RDSV3_DPRINTF2("rdsv3_ib_remove_one", 178 "%s-%d Failed to dealloc pd %p", 179 __func__, __LINE__, rds_ibdev->pd); 180 #endif 181 } 182 #endif 183 184 list_destroy(&rds_ibdev->ipaddr_list); 185 list_destroy(&rds_ibdev->conn_list); 186 list_remove_node(&rds_ibdev->list); 187 kmem_free(rds_ibdev, sizeof (*rds_ibdev)); 188 189 RDSV3_DPRINTF4("rdsv3_ib_remove_one", "Return: device: %p", device); 190 } 191 192 #ifndef __lock_lint 193 struct ib_client rdsv3_ib_client = { 194 .name = "rdsv3_ib", 195 .add = rdsv3_ib_add_one, 196 .remove = rdsv3_ib_remove_one, 197 .clnt_hdl = NULL, 198 .state = IB_CLNT_UNINITIALIZED 199 }; 200 #else 201 struct ib_client rdsv3_ib_client = { 202 "rdsv3_ib", 203 rdsv3_ib_add_one, 204 rdsv3_ib_remove_one, 205 NULL, 206 NULL, 207 IB_CLNT_UNINITIALIZED 208 }; 209 #endif 210 211 static int 212 rds_ib_conn_info_visitor(struct rdsv3_connection *conn, 213 void *buffer) 214 { 215 struct rdsv3_info_rdma_connection *iinfo = buffer; 216 struct rdsv3_ib_connection *ic; 217 218 RDSV3_DPRINTF4("rds_ib_conn_info_visitor", "conn: %p buffer: %p", 219 conn, buffer); 220 221 /* We will only ever look at IB transports */ 222 if (conn->c_trans != &rdsv3_ib_transport) 223 return (0); 224 225 iinfo->src_addr = conn->c_laddr; 226 iinfo->dst_addr = conn->c_faddr; 227 228 (void) memset(&iinfo->src_gid, 0, sizeof (iinfo->src_gid)); 229 (void) memset(&iinfo->dst_gid, 0, sizeof (iinfo->dst_gid)); 230 if (rdsv3_conn_state(conn) == RDSV3_CONN_UP) { 231 struct rdsv3_ib_device *rds_ibdev; 232 struct rdma_dev_addr *dev_addr; 233 234 ic = conn->c_transport_data; 235 dev_addr = &ic->i_cm_id->route.addr.dev_addr; 236 237 ib_addr_get_sgid(dev_addr, (union ib_gid *)&iinfo->src_gid); 238 ib_addr_get_dgid(dev_addr, (union ib_gid *)&iinfo->dst_gid); 239 240 rds_ibdev = ib_get_client_data(ic->i_cm_id->device, 241 &rdsv3_ib_client); 242 iinfo->max_send_wr = ic->i_send_ring.w_nr; 243 iinfo->max_recv_wr = ic->i_recv_ring.w_nr; 244 iinfo->max_send_sge = rds_ibdev->max_sge; 245 } 246 247 RDSV3_DPRINTF4("rds_ib_conn_info_visitor", "conn: %p buffer: %p", 248 conn, buffer); 249 return (1); 250 } 251 252 static void 253 rds_ib_ic_info(struct rsock *sock, unsigned int len, 254 struct rdsv3_info_iterator *iter, 255 struct rdsv3_info_lengths *lens) 256 { 257 RDSV3_DPRINTF4("rds_ib_ic_info", "sk: %p iter: %p, lens: %p, len: %d", 258 sock, iter, lens, len); 259 260 rdsv3_for_each_conn_info(sock, len, iter, lens, 261 rds_ib_conn_info_visitor, 262 sizeof (struct rdsv3_info_rdma_connection)); 263 } 264 265 /* 266 * Early RDS/IB was built to only bind to an address if there is an IPoIB 267 * device with that address set. 268 * 269 * If it were me, I'd advocate for something more flexible. Sending and 270 * receiving should be device-agnostic. Transports would try and maintain 271 * connections between peers who have messages queued. Userspace would be 272 * allowed to influence which paths have priority. We could call userspace 273 * asserting this policy "routing". 274 */ 275 static int 276 rds_ib_laddr_check(uint32_be_t addr) 277 { 278 int ret; 279 struct rdma_cm_id *cm_id; 280 struct sockaddr_in sin; 281 282 RDSV3_DPRINTF4("rds_ib_laddr_check", "addr: %x", ntohl(addr)); 283 284 /* 285 * Create a CMA ID and try to bind it. This catches both 286 * IB and iWARP capable NICs. 287 */ 288 cm_id = rdma_create_id(NULL, NULL, RDMA_PS_TCP); 289 if (IS_ERR(cm_id)) 290 return (PTR_ERR(cm_id)); 291 292 (void) memset(&sin, 0, sizeof (sin)); 293 sin.sin_family = AF_INET; 294 sin.sin_addr.s_addr = rdsv3_scaddr_to_ibaddr(addr); 295 296 /* rdma_bind_addr will only succeed for IB & iWARP devices */ 297 ret = rdma_bind_addr(cm_id, (struct sockaddr *)&sin); 298 /* 299 * due to this, we will claim to support iWARP devices unless we 300 * check node_type. 301 */ 302 if (ret || cm_id->device->node_type != RDMA_NODE_IB_CA) 303 ret = -EADDRNOTAVAIL; 304 305 RDSV3_DPRINTF5("rds_ib_laddr_check", 306 "addr %u.%u.%u.%u ret %d node type %d", 307 NIPQUAD(addr), ret, 308 cm_id->device ? cm_id->device->node_type : -1); 309 310 rdma_destroy_id(cm_id); 311 312 return (ret); 313 } 314 315 void 316 rdsv3_ib_exit(void) 317 { 318 RDSV3_DPRINTF4("rds_ib_exit", "Enter"); 319 320 rdsv3_info_deregister_func(RDSV3_INFO_IB_CONNECTIONS, rds_ib_ic_info); 321 rdsv3_ib_destroy_nodev_conns(); 322 ib_unregister_client(&rdsv3_ib_client); 323 rdsv3_ib_sysctl_exit(); 324 rdsv3_ib_recv_exit(); 325 rdsv3_trans_unregister(&rdsv3_ib_transport); 326 mutex_destroy(&ib_nodev_conns_lock); 327 list_destroy(&ib_nodev_conns); 328 list_destroy(&rdsv3_ib_devices); 329 330 RDSV3_DPRINTF4("rds_ib_exit", "Return"); 331 } 332 333 #ifndef __lock_lint 334 struct rdsv3_transport rdsv3_ib_transport = { 335 .laddr_check = rds_ib_laddr_check, 336 .xmit_complete = rdsv3_ib_xmit_complete, 337 .xmit = rdsv3_ib_xmit, 338 .xmit_cong_map = NULL, 339 .xmit_rdma = rdsv3_ib_xmit_rdma, 340 .recv = rdsv3_ib_recv, 341 .conn_alloc = rdsv3_ib_conn_alloc, 342 .conn_free = rdsv3_ib_conn_free, 343 .conn_connect = rdsv3_ib_conn_connect, 344 .conn_shutdown = rdsv3_ib_conn_shutdown, 345 .inc_copy_to_user = rdsv3_ib_inc_copy_to_user, 346 .inc_purge = rdsv3_ib_inc_purge, 347 .inc_free = rdsv3_ib_inc_free, 348 .cm_initiate_connect = rdsv3_ib_cm_initiate_connect, 349 .cm_handle_connect = rdsv3_ib_cm_handle_connect, 350 .cm_connect_complete = rdsv3_ib_cm_connect_complete, 351 .stats_info_copy = rdsv3_ib_stats_info_copy, 352 .exit = rdsv3_ib_exit, 353 .get_mr = rdsv3_ib_get_mr, 354 .sync_mr = rdsv3_ib_sync_mr, 355 .free_mr = rdsv3_ib_free_mr, 356 .flush_mrs = rdsv3_ib_flush_mrs, 357 .t_name = "infiniband", 358 }; 359 #else 360 struct rdsv3_transport rdsv3_ib_transport; 361 #endif 362 363 int 364 rdsv3_ib_init(void) 365 { 366 int ret; 367 368 RDSV3_DPRINTF4("rds_ib_init", "Enter"); 369 370 list_create(&rdsv3_ib_devices, sizeof (struct rdsv3_ib_device), 371 offsetof(struct rdsv3_ib_device, list)); 372 list_create(&ib_nodev_conns, sizeof (struct rdsv3_ib_connection), 373 offsetof(struct rdsv3_ib_connection, ib_node)); 374 mutex_init(&ib_nodev_conns_lock, NULL, MUTEX_DRIVER, NULL); 375 376 rdsv3_ib_client.dip = rdsv3_dev_info; 377 ret = ib_register_client(&rdsv3_ib_client); 378 if (ret) 379 goto out; 380 381 ret = rdsv3_ib_sysctl_init(); 382 if (ret) 383 goto out_ibreg; 384 385 ret = rdsv3_ib_recv_init(); 386 if (ret) 387 goto out_sysctl; 388 389 ret = rdsv3_trans_register(&rdsv3_ib_transport); 390 if (ret) 391 goto out_recv; 392 393 rdsv3_info_register_func(RDSV3_INFO_IB_CONNECTIONS, rds_ib_ic_info); 394 395 RDSV3_DPRINTF4("rds_ib_init", "Return"); 396 397 return (0); 398 399 out_recv: 400 rdsv3_ib_recv_exit(); 401 out_sysctl: 402 rdsv3_ib_sysctl_exit(); 403 out_ibreg: 404 ib_unregister_client(&rdsv3_ib_client); 405 out: 406 mutex_destroy(&ib_nodev_conns_lock); 407 list_destroy(&ib_nodev_conns); 408 list_destroy(&rdsv3_ib_devices); 409 return (ret); 410 } 411