1ec16227eSAndy Grover /* 2ec16227eSAndy Grover * Copyright (c) 2006 Oracle. All rights reserved. 3ec16227eSAndy Grover * 4ec16227eSAndy Grover * This software is available to you under a choice of one of two 5ec16227eSAndy Grover * licenses. You may choose to be licensed under the terms of the GNU 6ec16227eSAndy Grover * General Public License (GPL) Version 2, available from the file 7ec16227eSAndy Grover * COPYING in the main directory of this source tree, or the 8ec16227eSAndy Grover * OpenIB.org BSD license below: 9ec16227eSAndy Grover * 10ec16227eSAndy Grover * Redistribution and use in source and binary forms, with or 11ec16227eSAndy Grover * without modification, are permitted provided that the following 12ec16227eSAndy Grover * conditions are met: 13ec16227eSAndy Grover * 14ec16227eSAndy Grover * - Redistributions of source code must retain the above 15ec16227eSAndy Grover * copyright notice, this list of conditions and the following 16ec16227eSAndy Grover * disclaimer. 17ec16227eSAndy Grover * 18ec16227eSAndy Grover * - Redistributions in binary form must reproduce the above 19ec16227eSAndy Grover * copyright notice, this list of conditions and the following 20ec16227eSAndy Grover * disclaimer in the documentation and/or other materials 21ec16227eSAndy Grover * provided with the distribution. 22ec16227eSAndy Grover * 23ec16227eSAndy Grover * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 24ec16227eSAndy Grover * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 25ec16227eSAndy Grover * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 26ec16227eSAndy Grover * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 27ec16227eSAndy Grover * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN 28ec16227eSAndy Grover * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 29ec16227eSAndy Grover * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 30ec16227eSAndy Grover * SOFTWARE. 31ec16227eSAndy Grover * 32ec16227eSAndy Grover */ 33ec16227eSAndy Grover #include <linux/kernel.h> 34ec16227eSAndy Grover #include <linux/in.h> 35ec16227eSAndy Grover #include <linux/if.h> 36ec16227eSAndy Grover #include <linux/netdevice.h> 37ec16227eSAndy Grover #include <linux/inetdevice.h> 38ec16227eSAndy Grover #include <linux/if_arp.h> 39ec16227eSAndy Grover #include <linux/delay.h> 405a0e3ad6STejun Heo #include <linux/slab.h> 41ec16227eSAndy Grover 42ec16227eSAndy Grover #include "rds.h" 43ec16227eSAndy Grover #include "ib.h" 44ec16227eSAndy Grover 45ec16227eSAndy Grover unsigned int fmr_pool_size = RDS_FMR_POOL_SIZE; 46ec16227eSAndy Grover unsigned int fmr_message_size = RDS_FMR_SIZE + 1; /* +1 allows for unaligned MRs */ 473ba23adeSAndy Grover unsigned int rds_ib_retry_count = RDS_IB_DEFAULT_RETRY_COUNT; 48ec16227eSAndy Grover 49ec16227eSAndy Grover module_param(fmr_pool_size, int, 0444); 50ec16227eSAndy Grover MODULE_PARM_DESC(fmr_pool_size, " Max number of fmr per HCA"); 51ec16227eSAndy Grover module_param(fmr_message_size, int, 0444); 52ec16227eSAndy Grover MODULE_PARM_DESC(fmr_message_size, " Max size of a RDMA transfer"); 533ba23adeSAndy Grover module_param(rds_ib_retry_count, int, 0444); 543ba23adeSAndy Grover MODULE_PARM_DESC(rds_ib_retry_count, " Number of hw retries before reporting an error"); 55ec16227eSAndy Grover 56ec16227eSAndy Grover struct list_head rds_ib_devices; 57ec16227eSAndy Grover 58745cbccaSAndy Grover /* NOTE: if also grabbing ibdev lock, grab this first */ 59ec16227eSAndy Grover DEFINE_SPINLOCK(ib_nodev_conns_lock); 60ec16227eSAndy Grover LIST_HEAD(ib_nodev_conns); 61ec16227eSAndy Grover 62ec16227eSAndy Grover void rds_ib_add_one(struct ib_device *device) 63ec16227eSAndy Grover { 64ec16227eSAndy Grover struct rds_ib_device *rds_ibdev; 65ec16227eSAndy Grover struct ib_device_attr *dev_attr; 66ec16227eSAndy Grover 67ec16227eSAndy Grover /* Only handle IB (no iWARP) devices */ 68ec16227eSAndy Grover if (device->node_type != RDMA_NODE_IB_CA) 69ec16227eSAndy Grover return; 70ec16227eSAndy Grover 71ec16227eSAndy Grover dev_attr = kmalloc(sizeof *dev_attr, GFP_KERNEL); 72ec16227eSAndy Grover if (!dev_attr) 73ec16227eSAndy Grover return; 74ec16227eSAndy Grover 75ec16227eSAndy Grover if (ib_query_device(device, dev_attr)) { 76ec16227eSAndy Grover rdsdebug("Query device failed for %s\n", device->name); 77ec16227eSAndy Grover goto free_attr; 78ec16227eSAndy Grover } 79ec16227eSAndy Grover 80ec16227eSAndy Grover rds_ibdev = kmalloc(sizeof *rds_ibdev, GFP_KERNEL); 81ec16227eSAndy Grover if (!rds_ibdev) 82ec16227eSAndy Grover goto free_attr; 83ec16227eSAndy Grover 84ec16227eSAndy Grover spin_lock_init(&rds_ibdev->spinlock); 85ec16227eSAndy Grover 86ec16227eSAndy Grover rds_ibdev->max_wrs = dev_attr->max_qp_wr; 87ec16227eSAndy Grover rds_ibdev->max_sge = min(dev_attr->max_sge, RDS_IB_MAX_SGE); 88ec16227eSAndy Grover 89ec16227eSAndy Grover rds_ibdev->fmr_max_remaps = dev_attr->max_map_per_fmr?: 32; 90ec16227eSAndy Grover rds_ibdev->max_fmrs = dev_attr->max_fmr ? 91ec16227eSAndy Grover min_t(unsigned int, dev_attr->max_fmr, fmr_pool_size) : 92ec16227eSAndy Grover fmr_pool_size; 93ec16227eSAndy Grover 94*40589e74SAndy Grover rds_ibdev->max_initiator_depth = dev_attr->max_qp_init_rd_atom; 95*40589e74SAndy Grover rds_ibdev->max_responder_resources = dev_attr->max_qp_rd_atom; 96*40589e74SAndy Grover 97ec16227eSAndy Grover rds_ibdev->dev = device; 98ec16227eSAndy Grover rds_ibdev->pd = ib_alloc_pd(device); 99ec16227eSAndy Grover if (IS_ERR(rds_ibdev->pd)) 100ec16227eSAndy Grover goto free_dev; 101ec16227eSAndy Grover 102ec16227eSAndy Grover rds_ibdev->mr = ib_get_dma_mr(rds_ibdev->pd, 103ec16227eSAndy Grover IB_ACCESS_LOCAL_WRITE); 104ec16227eSAndy Grover if (IS_ERR(rds_ibdev->mr)) 105ec16227eSAndy Grover goto err_pd; 106ec16227eSAndy Grover 107ec16227eSAndy Grover rds_ibdev->mr_pool = rds_ib_create_mr_pool(rds_ibdev); 108ec16227eSAndy Grover if (IS_ERR(rds_ibdev->mr_pool)) { 109ec16227eSAndy Grover rds_ibdev->mr_pool = NULL; 110ec16227eSAndy Grover goto err_mr; 111ec16227eSAndy Grover } 112ec16227eSAndy Grover 113ec16227eSAndy Grover INIT_LIST_HEAD(&rds_ibdev->ipaddr_list); 114ec16227eSAndy Grover INIT_LIST_HEAD(&rds_ibdev->conn_list); 115ec16227eSAndy Grover list_add_tail(&rds_ibdev->list, &rds_ib_devices); 116ec16227eSAndy Grover 117ec16227eSAndy Grover ib_set_client_data(device, &rds_ib_client, rds_ibdev); 118ec16227eSAndy Grover 119ec16227eSAndy Grover goto free_attr; 120ec16227eSAndy Grover 121ec16227eSAndy Grover err_mr: 122ec16227eSAndy Grover ib_dereg_mr(rds_ibdev->mr); 123ec16227eSAndy Grover err_pd: 124ec16227eSAndy Grover ib_dealloc_pd(rds_ibdev->pd); 125ec16227eSAndy Grover free_dev: 126ec16227eSAndy Grover kfree(rds_ibdev); 127ec16227eSAndy Grover free_attr: 128ec16227eSAndy Grover kfree(dev_attr); 129ec16227eSAndy Grover } 130ec16227eSAndy Grover 131ec16227eSAndy Grover void rds_ib_remove_one(struct ib_device *device) 132ec16227eSAndy Grover { 133ec16227eSAndy Grover struct rds_ib_device *rds_ibdev; 134ec16227eSAndy Grover struct rds_ib_ipaddr *i_ipaddr, *i_next; 135ec16227eSAndy Grover 136ec16227eSAndy Grover rds_ibdev = ib_get_client_data(device, &rds_ib_client); 137ec16227eSAndy Grover if (!rds_ibdev) 138ec16227eSAndy Grover return; 139ec16227eSAndy Grover 140ec16227eSAndy Grover list_for_each_entry_safe(i_ipaddr, i_next, &rds_ibdev->ipaddr_list, list) { 141ec16227eSAndy Grover list_del(&i_ipaddr->list); 142ec16227eSAndy Grover kfree(i_ipaddr); 143ec16227eSAndy Grover } 144ec16227eSAndy Grover 145745cbccaSAndy Grover rds_ib_destroy_conns(rds_ibdev); 146ec16227eSAndy Grover 147ec16227eSAndy Grover if (rds_ibdev->mr_pool) 148ec16227eSAndy Grover rds_ib_destroy_mr_pool(rds_ibdev->mr_pool); 149ec16227eSAndy Grover 150ec16227eSAndy Grover ib_dereg_mr(rds_ibdev->mr); 151ec16227eSAndy Grover 152ec16227eSAndy Grover while (ib_dealloc_pd(rds_ibdev->pd)) { 153ec16227eSAndy Grover rdsdebug("Failed to dealloc pd %p\n", rds_ibdev->pd); 154ec16227eSAndy Grover msleep(1); 155ec16227eSAndy Grover } 156ec16227eSAndy Grover 157ec16227eSAndy Grover list_del(&rds_ibdev->list); 158ec16227eSAndy Grover kfree(rds_ibdev); 159ec16227eSAndy Grover } 160ec16227eSAndy Grover 161ec16227eSAndy Grover struct ib_client rds_ib_client = { 162ec16227eSAndy Grover .name = "rds_ib", 163ec16227eSAndy Grover .add = rds_ib_add_one, 164ec16227eSAndy Grover .remove = rds_ib_remove_one 165ec16227eSAndy Grover }; 166ec16227eSAndy Grover 167ec16227eSAndy Grover static int rds_ib_conn_info_visitor(struct rds_connection *conn, 168ec16227eSAndy Grover void *buffer) 169ec16227eSAndy Grover { 170ec16227eSAndy Grover struct rds_info_rdma_connection *iinfo = buffer; 171ec16227eSAndy Grover struct rds_ib_connection *ic; 172ec16227eSAndy Grover 173ec16227eSAndy Grover /* We will only ever look at IB transports */ 174ec16227eSAndy Grover if (conn->c_trans != &rds_ib_transport) 175ec16227eSAndy Grover return 0; 176ec16227eSAndy Grover 177ec16227eSAndy Grover iinfo->src_addr = conn->c_laddr; 178ec16227eSAndy Grover iinfo->dst_addr = conn->c_faddr; 179ec16227eSAndy Grover 180ec16227eSAndy Grover memset(&iinfo->src_gid, 0, sizeof(iinfo->src_gid)); 181ec16227eSAndy Grover memset(&iinfo->dst_gid, 0, sizeof(iinfo->dst_gid)); 182ec16227eSAndy Grover if (rds_conn_state(conn) == RDS_CONN_UP) { 183ec16227eSAndy Grover struct rds_ib_device *rds_ibdev; 184ec16227eSAndy Grover struct rdma_dev_addr *dev_addr; 185ec16227eSAndy Grover 186ec16227eSAndy Grover ic = conn->c_transport_data; 187ec16227eSAndy Grover dev_addr = &ic->i_cm_id->route.addr.dev_addr; 188ec16227eSAndy Grover 1896f8372b6SSean Hefty rdma_addr_get_sgid(dev_addr, (union ib_gid *) &iinfo->src_gid); 1906f8372b6SSean Hefty rdma_addr_get_dgid(dev_addr, (union ib_gid *) &iinfo->dst_gid); 191ec16227eSAndy Grover 192ec16227eSAndy Grover rds_ibdev = ib_get_client_data(ic->i_cm_id->device, &rds_ib_client); 193ec16227eSAndy Grover iinfo->max_send_wr = ic->i_send_ring.w_nr; 194ec16227eSAndy Grover iinfo->max_recv_wr = ic->i_recv_ring.w_nr; 195ec16227eSAndy Grover iinfo->max_send_sge = rds_ibdev->max_sge; 196ec16227eSAndy Grover rds_ib_get_mr_info(rds_ibdev, iinfo); 197ec16227eSAndy Grover } 198ec16227eSAndy Grover return 1; 199ec16227eSAndy Grover } 200ec16227eSAndy Grover 201ec16227eSAndy Grover static void rds_ib_ic_info(struct socket *sock, unsigned int len, 202ec16227eSAndy Grover struct rds_info_iterator *iter, 203ec16227eSAndy Grover struct rds_info_lengths *lens) 204ec16227eSAndy Grover { 205ec16227eSAndy Grover rds_for_each_conn_info(sock, len, iter, lens, 206ec16227eSAndy Grover rds_ib_conn_info_visitor, 207ec16227eSAndy Grover sizeof(struct rds_info_rdma_connection)); 208ec16227eSAndy Grover } 209ec16227eSAndy Grover 210ec16227eSAndy Grover 211ec16227eSAndy Grover /* 212ec16227eSAndy Grover * Early RDS/IB was built to only bind to an address if there is an IPoIB 213ec16227eSAndy Grover * device with that address set. 214ec16227eSAndy Grover * 215ec16227eSAndy Grover * If it were me, I'd advocate for something more flexible. Sending and 216ec16227eSAndy Grover * receiving should be device-agnostic. Transports would try and maintain 217ec16227eSAndy Grover * connections between peers who have messages queued. Userspace would be 218ec16227eSAndy Grover * allowed to influence which paths have priority. We could call userspace 219ec16227eSAndy Grover * asserting this policy "routing". 220ec16227eSAndy Grover */ 221ec16227eSAndy Grover static int rds_ib_laddr_check(__be32 addr) 222ec16227eSAndy Grover { 223ec16227eSAndy Grover int ret; 224ec16227eSAndy Grover struct rdma_cm_id *cm_id; 225ec16227eSAndy Grover struct sockaddr_in sin; 226ec16227eSAndy Grover 227ec16227eSAndy Grover /* Create a CMA ID and try to bind it. This catches both 228ec16227eSAndy Grover * IB and iWARP capable NICs. 229ec16227eSAndy Grover */ 230ec16227eSAndy Grover cm_id = rdma_create_id(NULL, NULL, RDMA_PS_TCP); 23194713babSDan Carpenter if (IS_ERR(cm_id)) 23294713babSDan Carpenter return PTR_ERR(cm_id); 233ec16227eSAndy Grover 234ec16227eSAndy Grover memset(&sin, 0, sizeof(sin)); 235ec16227eSAndy Grover sin.sin_family = AF_INET; 236ec16227eSAndy Grover sin.sin_addr.s_addr = addr; 237ec16227eSAndy Grover 238ec16227eSAndy Grover /* rdma_bind_addr will only succeed for IB & iWARP devices */ 239ec16227eSAndy Grover ret = rdma_bind_addr(cm_id, (struct sockaddr *)&sin); 240ec16227eSAndy Grover /* due to this, we will claim to support iWARP devices unless we 241ec16227eSAndy Grover check node_type. */ 242ec16227eSAndy Grover if (ret || cm_id->device->node_type != RDMA_NODE_IB_CA) 243ec16227eSAndy Grover ret = -EADDRNOTAVAIL; 244ec16227eSAndy Grover 245ec16227eSAndy Grover rdsdebug("addr %pI4 ret %d node type %d\n", 246ec16227eSAndy Grover &addr, ret, 247ec16227eSAndy Grover cm_id->device ? cm_id->device->node_type : -1); 248ec16227eSAndy Grover 249ec16227eSAndy Grover rdma_destroy_id(cm_id); 250ec16227eSAndy Grover 251ec16227eSAndy Grover return ret; 252ec16227eSAndy Grover } 253ec16227eSAndy Grover 254ec16227eSAndy Grover void rds_ib_exit(void) 255ec16227eSAndy Grover { 256ec16227eSAndy Grover rds_info_deregister_func(RDS_INFO_IB_CONNECTIONS, rds_ib_ic_info); 257745cbccaSAndy Grover rds_ib_destroy_nodev_conns(); 258ec16227eSAndy Grover ib_unregister_client(&rds_ib_client); 259ec16227eSAndy Grover rds_ib_sysctl_exit(); 260ec16227eSAndy Grover rds_ib_recv_exit(); 261ec16227eSAndy Grover rds_trans_unregister(&rds_ib_transport); 262ec16227eSAndy Grover } 263ec16227eSAndy Grover 264ec16227eSAndy Grover struct rds_transport rds_ib_transport = { 265ec16227eSAndy Grover .laddr_check = rds_ib_laddr_check, 266ec16227eSAndy Grover .xmit_complete = rds_ib_xmit_complete, 267ec16227eSAndy Grover .xmit = rds_ib_xmit, 268ec16227eSAndy Grover .xmit_cong_map = NULL, 269ec16227eSAndy Grover .xmit_rdma = rds_ib_xmit_rdma, 27015133f6eSAndy Grover .xmit_atomic = rds_ib_xmit_atomic, 271ec16227eSAndy Grover .recv = rds_ib_recv, 272ec16227eSAndy Grover .conn_alloc = rds_ib_conn_alloc, 273ec16227eSAndy Grover .conn_free = rds_ib_conn_free, 274ec16227eSAndy Grover .conn_connect = rds_ib_conn_connect, 275ec16227eSAndy Grover .conn_shutdown = rds_ib_conn_shutdown, 276ec16227eSAndy Grover .inc_copy_to_user = rds_ib_inc_copy_to_user, 277ec16227eSAndy Grover .inc_purge = rds_ib_inc_purge, 278ec16227eSAndy Grover .inc_free = rds_ib_inc_free, 279ec16227eSAndy Grover .cm_initiate_connect = rds_ib_cm_initiate_connect, 280ec16227eSAndy Grover .cm_handle_connect = rds_ib_cm_handle_connect, 281ec16227eSAndy Grover .cm_connect_complete = rds_ib_cm_connect_complete, 282ec16227eSAndy Grover .stats_info_copy = rds_ib_stats_info_copy, 283ec16227eSAndy Grover .exit = rds_ib_exit, 284ec16227eSAndy Grover .get_mr = rds_ib_get_mr, 285ec16227eSAndy Grover .sync_mr = rds_ib_sync_mr, 286ec16227eSAndy Grover .free_mr = rds_ib_free_mr, 287ec16227eSAndy Grover .flush_mrs = rds_ib_flush_mrs, 288ec16227eSAndy Grover .t_owner = THIS_MODULE, 289ec16227eSAndy Grover .t_name = "infiniband", 290335776bdSAndy Grover .t_type = RDS_TRANS_IB 291ec16227eSAndy Grover }; 292ec16227eSAndy Grover 293ec16227eSAndy Grover int __init rds_ib_init(void) 294ec16227eSAndy Grover { 295ec16227eSAndy Grover int ret; 296ec16227eSAndy Grover 297ec16227eSAndy Grover INIT_LIST_HEAD(&rds_ib_devices); 298ec16227eSAndy Grover 299ec16227eSAndy Grover ret = ib_register_client(&rds_ib_client); 300ec16227eSAndy Grover if (ret) 301ec16227eSAndy Grover goto out; 302ec16227eSAndy Grover 303ec16227eSAndy Grover ret = rds_ib_sysctl_init(); 304ec16227eSAndy Grover if (ret) 305ec16227eSAndy Grover goto out_ibreg; 306ec16227eSAndy Grover 307ec16227eSAndy Grover ret = rds_ib_recv_init(); 308ec16227eSAndy Grover if (ret) 309ec16227eSAndy Grover goto out_sysctl; 310ec16227eSAndy Grover 311ec16227eSAndy Grover ret = rds_trans_register(&rds_ib_transport); 312ec16227eSAndy Grover if (ret) 313ec16227eSAndy Grover goto out_recv; 314ec16227eSAndy Grover 315ec16227eSAndy Grover rds_info_register_func(RDS_INFO_IB_CONNECTIONS, rds_ib_ic_info); 316ec16227eSAndy Grover 317ec16227eSAndy Grover goto out; 318ec16227eSAndy Grover 319ec16227eSAndy Grover out_recv: 320ec16227eSAndy Grover rds_ib_recv_exit(); 321ec16227eSAndy Grover out_sysctl: 322ec16227eSAndy Grover rds_ib_sysctl_exit(); 323ec16227eSAndy Grover out_ibreg: 324ec16227eSAndy Grover ib_unregister_client(&rds_ib_client); 325ec16227eSAndy Grover out: 326ec16227eSAndy Grover return ret; 327ec16227eSAndy Grover } 328ec16227eSAndy Grover 329ec16227eSAndy Grover MODULE_LICENSE("GPL"); 330ec16227eSAndy Grover 331