1 /* 2 * Copyright (c) 2004 Topspin Communications. All rights reserved. 3 * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved. 4 * 5 * This software is available to you under a choice of one of two 6 * licenses. You may choose to be licensed under the terms of the GNU 7 * General Public License (GPL) Version 2, available from the file 8 * COPYING in the main directory of this source tree, or the 9 * OpenIB.org BSD license below: 10 * 11 * Redistribution and use in source and binary forms, with or 12 * without modification, are permitted provided that the following 13 * conditions are met: 14 * 15 * - Redistributions of source code must retain the above 16 * copyright notice, this list of conditions and the following 17 * disclaimer. 18 * 19 * - Redistributions in binary form must reproduce the above 20 * copyright notice, this list of conditions and the following 21 * disclaimer in the documentation and/or other materials 22 * provided with the distribution. 23 * 24 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 25 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 26 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 27 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 28 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN 29 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 30 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 31 * SOFTWARE. 32 */ 33 34 #include <linux/module.h> 35 #include <linux/string.h> 36 #include <linux/errno.h> 37 #include <linux/kernel.h> 38 #include <linux/slab.h> 39 #include <linux/init.h> 40 #include <linux/netdevice.h> 41 #include <net/net_namespace.h> 42 #include <linux/security.h> 43 #include <linux/notifier.h> 44 #include <linux/hashtable.h> 45 #include <rdma/rdma_netlink.h> 46 #include <rdma/ib_addr.h> 47 #include <rdma/ib_cache.h> 48 #include <rdma/rdma_counter.h> 49 50 #include "core_priv.h" 51 #include "restrack.h" 52 53 MODULE_AUTHOR("Roland Dreier"); 54 MODULE_DESCRIPTION("core kernel InfiniBand API"); 55 MODULE_LICENSE("Dual BSD/GPL"); 56 57 struct workqueue_struct *ib_comp_wq; 58 struct workqueue_struct *ib_comp_unbound_wq; 59 struct workqueue_struct *ib_wq; 60 EXPORT_SYMBOL_GPL(ib_wq); 61 static struct workqueue_struct *ib_unreg_wq; 62 63 /* 64 * Each of the three rwsem locks (devices, clients, client_data) protects the 65 * xarray of the same name. Specifically it allows the caller to assert that 66 * the MARK will/will not be changing under the lock, and for devices and 67 * clients, that the value in the xarray is still a valid pointer. Change of 68 * the MARK is linked to the object state, so holding the lock and testing the 69 * MARK also asserts that the contained object is in a certain state. 70 * 71 * This is used to build a two stage register/unregister flow where objects 72 * can continue to be in the xarray even though they are still in progress to 73 * register/unregister. 74 * 75 * The xarray itself provides additional locking, and restartable iteration, 76 * which is also relied on. 77 * 78 * Locks should not be nested, with the exception of client_data, which is 79 * allowed to nest under the read side of the other two locks. 80 * 81 * The devices_rwsem also protects the device name list, any change or 82 * assignment of device name must also hold the write side to guarantee unique 83 * names. 84 */ 85 86 /* 87 * devices contains devices that have had their names assigned. 
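 *
 * As a sketch of the rwsem/mark convention described above (illustrative
 * only, not a specific caller in this file), a reader that must only see
 * fully registered devices walks the xarray roughly as follows, where
 * use_device() stands in for whatever per-device work the caller does:
 *
 *	struct ib_device *device;
 *	unsigned long index;
 *
 *	down_read(&devices_rwsem);
 *	xa_for_each_marked (&devices, index, device, DEVICE_REGISTERED)
 *		use_device(device);
 *	up_read(&devices_rwsem);
 *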
The 88 * devices may not be registered. Users that care about the registration 89 * status need to call ib_device_try_get() on the device to ensure it is 90 * registered, and keep it registered, for the required duration. 91 * 92 */ 93 static DEFINE_XARRAY_FLAGS(devices, XA_FLAGS_ALLOC); 94 static DECLARE_RWSEM(devices_rwsem); 95 #define DEVICE_REGISTERED XA_MARK_1 96 97 static u32 highest_client_id; 98 #define CLIENT_REGISTERED XA_MARK_1 99 static DEFINE_XARRAY_FLAGS(clients, XA_FLAGS_ALLOC); 100 static DECLARE_RWSEM(clients_rwsem); 101 102 static void ib_client_put(struct ib_client *client) 103 { 104 if (refcount_dec_and_test(&client->uses)) 105 complete(&client->uses_zero); 106 } 107 108 /* 109 * If client_data is registered then the corresponding client must also still 110 * be registered. 111 */ 112 #define CLIENT_DATA_REGISTERED XA_MARK_1 113 114 unsigned int rdma_dev_net_id; 115 116 /* 117 * A list of net namespaces is maintained in an xarray. This is necessary 118 * because we can't get the locking right using the existing net ns list. We 119 * would require a init_net callback after the list is updated. 120 */ 121 static DEFINE_XARRAY_FLAGS(rdma_nets, XA_FLAGS_ALLOC); 122 /* 123 * rwsem to protect accessing the rdma_nets xarray entries. 124 */ 125 static DECLARE_RWSEM(rdma_nets_rwsem); 126 127 bool ib_devices_shared_netns = true; 128 module_param_named(netns_mode, ib_devices_shared_netns, bool, 0444); 129 MODULE_PARM_DESC(netns_mode, 130 "Share device among net namespaces; default=1 (shared)"); 131 /** 132 * rdma_dev_access_netns() - Return whether an rdma device can be accessed 133 * from a specified net namespace or not. 134 * @dev: Pointer to rdma device which needs to be checked 135 * @net: Pointer to net namesapce for which access to be checked 136 * 137 * When the rdma device is in shared mode, it ignores the net namespace. 138 * When the rdma device is exclusive to a net namespace, rdma device net 139 * namespace is checked against the specified one. 140 */ 141 bool rdma_dev_access_netns(const struct ib_device *dev, const struct net *net) 142 { 143 return (ib_devices_shared_netns || 144 net_eq(read_pnet(&dev->coredev.rdma_net), net)); 145 } 146 EXPORT_SYMBOL(rdma_dev_access_netns); 147 148 /** 149 * rdma_dev_has_raw_cap() - Returns whether a specified rdma device has 150 * CAP_NET_RAW capability or not. 151 * 152 * @dev: Pointer to rdma device whose capability to be checked 153 * 154 * Returns true if a rdma device's owning user namespace has CAP_NET_RAW 155 * capability, otherwise false. When rdma subsystem is in legacy shared network, 156 * namespace mode, the default net namespace is considered. 157 */ 158 bool rdma_dev_has_raw_cap(const struct ib_device *dev) 159 { 160 const struct net *net; 161 162 /* Network namespace is the resource whose user namespace 163 * to be considered. When in shared mode, there is no reliable 164 * network namespace resource, so consider the default net namespace. 165 */ 166 if (ib_devices_shared_netns) 167 net = &init_net; 168 else 169 net = read_pnet(&dev->coredev.rdma_net); 170 171 return ns_capable(net->user_ns, CAP_NET_RAW); 172 } 173 EXPORT_SYMBOL(rdma_dev_has_raw_cap); 174 175 /* 176 * xarray has this behavior where it won't iterate over NULL values stored in 177 * allocated arrays. So we need our own iterator to see all values stored in 178 * the array. This does the same thing as xa_for_each except that it also 179 * returns NULL valued entries if the array is allocating. Simplified to only 180 * work on simple xarrays. 
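 *
 * A hedged usage sketch, mirroring the callers later in this file: visit
 * every marked slot, keeping in mind that, unlike with xa_for_each_marked(),
 * the entry seen by the loop body may be NULL:
 *
 *	unsigned long index;
 *	void *entry;
 *
 *	xan_for_each_marked (&ibdev->client_data, index, entry,
 *			     CLIENT_DATA_REGISTERED) {
 *		if (!entry)
 *			continue;
 *		... use entry ...
 *	}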
181 */ 182 static void *xan_find_marked(struct xarray *xa, unsigned long *indexp, 183 xa_mark_t filter) 184 { 185 XA_STATE(xas, xa, *indexp); 186 void *entry; 187 188 rcu_read_lock(); 189 do { 190 entry = xas_find_marked(&xas, ULONG_MAX, filter); 191 if (xa_is_zero(entry)) 192 break; 193 } while (xas_retry(&xas, entry)); 194 rcu_read_unlock(); 195 196 if (entry) { 197 *indexp = xas.xa_index; 198 if (xa_is_zero(entry)) 199 return NULL; 200 return entry; 201 } 202 return XA_ERROR(-ENOENT); 203 } 204 #define xan_for_each_marked(xa, index, entry, filter) \ 205 for (index = 0, entry = xan_find_marked(xa, &(index), filter); \ 206 !xa_is_err(entry); \ 207 (index)++, entry = xan_find_marked(xa, &(index), filter)) 208 209 /* RCU hash table mapping netdevice pointers to struct ib_port_data */ 210 static DEFINE_SPINLOCK(ndev_hash_lock); 211 static DECLARE_HASHTABLE(ndev_hash, 5); 212 213 static void free_netdevs(struct ib_device *ib_dev); 214 static void ib_unregister_work(struct work_struct *work); 215 static void __ib_unregister_device(struct ib_device *device); 216 static int ib_security_change(struct notifier_block *nb, unsigned long event, 217 void *lsm_data); 218 static void ib_policy_change_task(struct work_struct *work); 219 static DECLARE_WORK(ib_policy_change_work, ib_policy_change_task); 220 221 static void __ibdev_printk(const char *level, const struct ib_device *ibdev, 222 struct va_format *vaf) 223 { 224 if (ibdev && ibdev->dev.parent) 225 dev_printk_emit(level[1] - '0', 226 ibdev->dev.parent, 227 "%s %s %s: %pV", 228 dev_driver_string(ibdev->dev.parent), 229 dev_name(ibdev->dev.parent), 230 dev_name(&ibdev->dev), 231 vaf); 232 else if (ibdev) 233 printk("%s%s: %pV", 234 level, dev_name(&ibdev->dev), vaf); 235 else 236 printk("%s(NULL ib_device): %pV", level, vaf); 237 } 238 239 #define define_ibdev_printk_level(func, level) \ 240 void func(const struct ib_device *ibdev, const char *fmt, ...) 
\ 241 { \ 242 struct va_format vaf; \ 243 va_list args; \ 244 \ 245 va_start(args, fmt); \ 246 \ 247 vaf.fmt = fmt; \ 248 vaf.va = &args; \ 249 \ 250 __ibdev_printk(level, ibdev, &vaf); \ 251 \ 252 va_end(args); \ 253 } \ 254 EXPORT_SYMBOL(func); 255 256 define_ibdev_printk_level(ibdev_emerg, KERN_EMERG); 257 define_ibdev_printk_level(ibdev_alert, KERN_ALERT); 258 define_ibdev_printk_level(ibdev_crit, KERN_CRIT); 259 define_ibdev_printk_level(ibdev_err, KERN_ERR); 260 define_ibdev_printk_level(ibdev_warn, KERN_WARNING); 261 define_ibdev_printk_level(ibdev_notice, KERN_NOTICE); 262 define_ibdev_printk_level(ibdev_info, KERN_INFO); 263 264 static struct notifier_block ibdev_lsm_nb = { 265 .notifier_call = ib_security_change, 266 }; 267 268 static int rdma_dev_change_netns(struct ib_device *device, struct net *cur_net, 269 struct net *net); 270 271 /* Pointer to the RCU head at the start of the ib_port_data array */ 272 struct ib_port_data_rcu { 273 struct rcu_head rcu_head; 274 struct ib_port_data pdata[]; 275 }; 276 277 static void ib_device_check_mandatory(struct ib_device *device) 278 { 279 #define IB_MANDATORY_FUNC(x) { offsetof(struct ib_device_ops, x), #x } 280 static const struct { 281 size_t offset; 282 char *name; 283 } mandatory_table[] = { 284 IB_MANDATORY_FUNC(query_device), 285 IB_MANDATORY_FUNC(query_port), 286 IB_MANDATORY_FUNC(alloc_pd), 287 IB_MANDATORY_FUNC(dealloc_pd), 288 IB_MANDATORY_FUNC(create_qp), 289 IB_MANDATORY_FUNC(modify_qp), 290 IB_MANDATORY_FUNC(destroy_qp), 291 IB_MANDATORY_FUNC(post_send), 292 IB_MANDATORY_FUNC(post_recv), 293 IB_MANDATORY_FUNC(create_cq), 294 IB_MANDATORY_FUNC(destroy_cq), 295 IB_MANDATORY_FUNC(poll_cq), 296 IB_MANDATORY_FUNC(req_notify_cq), 297 IB_MANDATORY_FUNC(get_dma_mr), 298 IB_MANDATORY_FUNC(reg_user_mr), 299 IB_MANDATORY_FUNC(dereg_mr), 300 IB_MANDATORY_FUNC(get_port_immutable) 301 }; 302 int i; 303 304 device->kverbs_provider = true; 305 for (i = 0; i < ARRAY_SIZE(mandatory_table); ++i) { 306 if (!*(void **) ((void *) &device->ops + 307 mandatory_table[i].offset)) { 308 device->kverbs_provider = false; 309 break; 310 } 311 } 312 } 313 314 /* 315 * Caller must perform ib_device_put() to return the device reference count 316 * when ib_device_get_by_index() returns valid device pointer. 317 */ 318 struct ib_device *ib_device_get_by_index(const struct net *net, u32 index) 319 { 320 struct ib_device *device; 321 322 down_read(&devices_rwsem); 323 device = xa_load(&devices, index); 324 if (device) { 325 if (!rdma_dev_access_netns(device, net)) { 326 device = NULL; 327 goto out; 328 } 329 330 if (!ib_device_try_get(device)) 331 device = NULL; 332 } 333 out: 334 up_read(&devices_rwsem); 335 return device; 336 } 337 338 /** 339 * ib_device_put - Release IB device reference 340 * @device: device whose reference to be released 341 * 342 * ib_device_put() releases reference to the IB device to allow it to be 343 * unregistered and eventually free. 
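 *
 * A minimal usage sketch (illustrative, not a verbatim caller): every
 * successful ib_device_try_get() must eventually be balanced by exactly one
 * ib_device_put(), where do_work() is a stand-in for the caller's logic:
 *
 *	if (ib_device_try_get(device)) {
 *		do_work(device);
 *		ib_device_put(device);
 *	}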
344 */ 345 void ib_device_put(struct ib_device *device) 346 { 347 if (refcount_dec_and_test(&device->refcount)) 348 complete(&device->unreg_completion); 349 } 350 EXPORT_SYMBOL(ib_device_put); 351 352 static struct ib_device *__ib_device_get_by_name(const char *name) 353 { 354 struct ib_device *device; 355 unsigned long index; 356 357 xa_for_each (&devices, index, device) 358 if (!strcmp(name, dev_name(&device->dev))) 359 return device; 360 361 return NULL; 362 } 363 364 static int rename_compat_devs(struct ib_device *device) 365 { 366 struct ib_core_device *cdev; 367 unsigned long index; 368 int ret = 0; 369 370 mutex_lock(&device->compat_devs_mutex); 371 xa_for_each (&device->compat_devs, index, cdev) { 372 ret = device_rename(&cdev->dev, dev_name(&device->dev)); 373 if (ret) { 374 dev_warn(&cdev->dev, 375 "Fail to rename compatdev to new name %s\n", 376 dev_name(&device->dev)); 377 break; 378 } 379 } 380 mutex_unlock(&device->compat_devs_mutex); 381 return ret; 382 } 383 384 int ib_device_rename(struct ib_device *ibdev, const char *name) 385 { 386 unsigned long index; 387 void *client_data; 388 int ret; 389 390 down_write(&devices_rwsem); 391 if (!strcmp(name, dev_name(&ibdev->dev))) { 392 up_write(&devices_rwsem); 393 return 0; 394 } 395 396 if (__ib_device_get_by_name(name)) { 397 up_write(&devices_rwsem); 398 return -EEXIST; 399 } 400 401 ret = device_rename(&ibdev->dev, name); 402 if (ret) { 403 up_write(&devices_rwsem); 404 return ret; 405 } 406 407 strscpy(ibdev->name, name, IB_DEVICE_NAME_MAX); 408 ret = rename_compat_devs(ibdev); 409 410 downgrade_write(&devices_rwsem); 411 down_read(&ibdev->client_data_rwsem); 412 xan_for_each_marked(&ibdev->client_data, index, client_data, 413 CLIENT_DATA_REGISTERED) { 414 struct ib_client *client = xa_load(&clients, index); 415 416 if (!client || !client->rename) 417 continue; 418 419 client->rename(ibdev, client_data); 420 } 421 up_read(&ibdev->client_data_rwsem); 422 rdma_nl_notify_event(ibdev, 0, RDMA_RENAME_EVENT); 423 up_read(&devices_rwsem); 424 return 0; 425 } 426 427 int ib_device_set_dim(struct ib_device *ibdev, u8 use_dim) 428 { 429 if (use_dim > 1) 430 return -EINVAL; 431 ibdev->use_cq_dim = use_dim; 432 433 return 0; 434 } 435 436 static int alloc_name(struct ib_device *ibdev, const char *name) 437 { 438 struct ib_device *device; 439 unsigned long index; 440 struct ida inuse; 441 int rc; 442 int i; 443 444 lockdep_assert_held_write(&devices_rwsem); 445 ida_init(&inuse); 446 xa_for_each (&devices, index, device) { 447 char buf[IB_DEVICE_NAME_MAX]; 448 449 if (sscanf(dev_name(&device->dev), name, &i) != 1) 450 continue; 451 if (i < 0 || i >= INT_MAX) 452 continue; 453 snprintf(buf, sizeof buf, name, i); 454 if (strcmp(buf, dev_name(&device->dev)) != 0) 455 continue; 456 457 rc = ida_alloc_range(&inuse, i, i, GFP_KERNEL); 458 if (rc < 0) 459 goto out; 460 } 461 462 rc = ida_alloc(&inuse, GFP_KERNEL); 463 if (rc < 0) 464 goto out; 465 466 rc = dev_set_name(&ibdev->dev, name, rc); 467 out: 468 ida_destroy(&inuse); 469 return rc; 470 } 471 472 static void ib_device_release(struct device *device) 473 { 474 struct ib_device *dev = container_of(device, struct ib_device, dev); 475 476 free_netdevs(dev); 477 WARN_ON(refcount_read(&dev->refcount)); 478 if (dev->hw_stats_data) 479 ib_device_release_hw_stats(dev->hw_stats_data); 480 if (dev->port_data) { 481 ib_cache_release_one(dev); 482 ib_security_release_port_pkey_list(dev); 483 rdma_counter_release(dev); 484 kfree_rcu(container_of(dev->port_data, struct ib_port_data_rcu, 485 pdata[0]), 486 
rcu_head); 487 } 488 489 mutex_destroy(&dev->subdev_lock); 490 mutex_destroy(&dev->unregistration_lock); 491 mutex_destroy(&dev->compat_devs_mutex); 492 493 xa_destroy(&dev->compat_devs); 494 xa_destroy(&dev->client_data); 495 kfree_rcu(dev, rcu_head); 496 } 497 498 static int ib_device_uevent(const struct device *device, 499 struct kobj_uevent_env *env) 500 { 501 if (add_uevent_var(env, "NAME=%s", dev_name(device))) 502 return -ENOMEM; 503 504 /* 505 * It would be nice to pass the node GUID with the event... 506 */ 507 508 return 0; 509 } 510 511 static const void *net_namespace(const struct device *d) 512 { 513 const struct ib_core_device *coredev = 514 container_of(d, struct ib_core_device, dev); 515 516 return read_pnet(&coredev->rdma_net); 517 } 518 519 static struct class ib_class = { 520 .name = "infiniband", 521 .dev_release = ib_device_release, 522 .dev_uevent = ib_device_uevent, 523 .ns_type = &net_ns_type_operations, 524 .namespace = net_namespace, 525 }; 526 527 static void rdma_init_coredev(struct ib_core_device *coredev, 528 struct ib_device *dev, struct net *net) 529 { 530 bool is_full_dev = &dev->coredev == coredev; 531 532 /* This BUILD_BUG_ON is intended to catch layout change 533 * of union of ib_core_device and device. 534 * dev must be the first element as ib_core and providers 535 * driver uses it. Adding anything in ib_core_device before 536 * device will break this assumption. 537 */ 538 BUILD_BUG_ON(offsetof(struct ib_device, coredev.dev) != 539 offsetof(struct ib_device, dev)); 540 541 coredev->dev.class = &ib_class; 542 coredev->dev.groups = dev->groups; 543 544 /* 545 * Don't expose hw counters outside of the init namespace. 546 */ 547 if (!is_full_dev && dev->hw_stats_attr_index) 548 coredev->dev.groups[dev->hw_stats_attr_index] = NULL; 549 550 device_initialize(&coredev->dev); 551 coredev->owner = dev; 552 INIT_LIST_HEAD(&coredev->port_list); 553 write_pnet(&coredev->rdma_net, net); 554 } 555 556 /** 557 * _ib_alloc_device - allocate an IB device struct 558 * @size:size of structure to allocate 559 * @net: network namespace device should be located in, namespace 560 * must stay valid until ib_register_device() is completed. 561 * 562 * Low-level drivers should use ib_alloc_device() to allocate &struct 563 * ib_device. @size is the size of the structure to be allocated, 564 * including any private data used by the low-level driver. 565 * ib_dealloc_device() must be used to free structures allocated with 566 * ib_alloc_device(). 567 */ 568 struct ib_device *_ib_alloc_device(size_t size, struct net *net) 569 { 570 struct ib_device *device; 571 unsigned int i; 572 573 if (WARN_ON(size < sizeof(struct ib_device))) 574 return NULL; 575 576 device = kzalloc(size, GFP_KERNEL); 577 if (!device) 578 return NULL; 579 580 if (rdma_restrack_init(device)) { 581 kfree(device); 582 return NULL; 583 } 584 585 /* ib_devices_shared_netns can't change while we have active namespaces 586 * in the system which means either init_net is passed or the user has 587 * no idea what they are doing. 588 * 589 * To avoid breaking backward compatibility, when in shared mode, 590 * force to init the device in the init_net. 591 */ 592 net = ib_devices_shared_netns ? 
&init_net : net; 593 rdma_init_coredev(&device->coredev, device, net); 594 595 INIT_LIST_HEAD(&device->event_handler_list); 596 spin_lock_init(&device->qp_open_list_lock); 597 init_rwsem(&device->event_handler_rwsem); 598 mutex_init(&device->unregistration_lock); 599 /* 600 * client_data needs to be alloc because we don't want our mark to be 601 * destroyed if the user stores NULL in the client data. 602 */ 603 xa_init_flags(&device->client_data, XA_FLAGS_ALLOC); 604 init_rwsem(&device->client_data_rwsem); 605 xa_init_flags(&device->compat_devs, XA_FLAGS_ALLOC); 606 mutex_init(&device->compat_devs_mutex); 607 init_completion(&device->unreg_completion); 608 INIT_WORK(&device->unregistration_work, ib_unregister_work); 609 610 spin_lock_init(&device->cq_pools_lock); 611 for (i = 0; i < ARRAY_SIZE(device->cq_pools); i++) 612 INIT_LIST_HEAD(&device->cq_pools[i]); 613 614 rwlock_init(&device->cache_lock); 615 616 device->uverbs_cmd_mask = 617 BIT_ULL(IB_USER_VERBS_CMD_ALLOC_MW) | 618 BIT_ULL(IB_USER_VERBS_CMD_ALLOC_PD) | 619 BIT_ULL(IB_USER_VERBS_CMD_ATTACH_MCAST) | 620 BIT_ULL(IB_USER_VERBS_CMD_CLOSE_XRCD) | 621 BIT_ULL(IB_USER_VERBS_CMD_CREATE_AH) | 622 BIT_ULL(IB_USER_VERBS_CMD_CREATE_COMP_CHANNEL) | 623 BIT_ULL(IB_USER_VERBS_CMD_CREATE_CQ) | 624 BIT_ULL(IB_USER_VERBS_CMD_CREATE_QP) | 625 BIT_ULL(IB_USER_VERBS_CMD_CREATE_SRQ) | 626 BIT_ULL(IB_USER_VERBS_CMD_CREATE_XSRQ) | 627 BIT_ULL(IB_USER_VERBS_CMD_DEALLOC_MW) | 628 BIT_ULL(IB_USER_VERBS_CMD_DEALLOC_PD) | 629 BIT_ULL(IB_USER_VERBS_CMD_DEREG_MR) | 630 BIT_ULL(IB_USER_VERBS_CMD_DESTROY_AH) | 631 BIT_ULL(IB_USER_VERBS_CMD_DESTROY_CQ) | 632 BIT_ULL(IB_USER_VERBS_CMD_DESTROY_QP) | 633 BIT_ULL(IB_USER_VERBS_CMD_DESTROY_SRQ) | 634 BIT_ULL(IB_USER_VERBS_CMD_DETACH_MCAST) | 635 BIT_ULL(IB_USER_VERBS_CMD_GET_CONTEXT) | 636 BIT_ULL(IB_USER_VERBS_CMD_MODIFY_QP) | 637 BIT_ULL(IB_USER_VERBS_CMD_MODIFY_SRQ) | 638 BIT_ULL(IB_USER_VERBS_CMD_OPEN_QP) | 639 BIT_ULL(IB_USER_VERBS_CMD_OPEN_XRCD) | 640 BIT_ULL(IB_USER_VERBS_CMD_QUERY_DEVICE) | 641 BIT_ULL(IB_USER_VERBS_CMD_QUERY_PORT) | 642 BIT_ULL(IB_USER_VERBS_CMD_QUERY_QP) | 643 BIT_ULL(IB_USER_VERBS_CMD_QUERY_SRQ) | 644 BIT_ULL(IB_USER_VERBS_CMD_REG_MR) | 645 BIT_ULL(IB_USER_VERBS_CMD_REREG_MR) | 646 BIT_ULL(IB_USER_VERBS_CMD_RESIZE_CQ); 647 648 mutex_init(&device->subdev_lock); 649 INIT_LIST_HEAD(&device->subdev_list_head); 650 INIT_LIST_HEAD(&device->subdev_list); 651 652 return device; 653 } 654 EXPORT_SYMBOL(_ib_alloc_device); 655 656 /** 657 * ib_dealloc_device - free an IB device struct 658 * @device:structure to free 659 * 660 * Free a structure allocated with ib_alloc_device(). 661 */ 662 void ib_dealloc_device(struct ib_device *device) 663 { 664 if (device->ops.dealloc_driver) 665 device->ops.dealloc_driver(device); 666 667 /* 668 * ib_unregister_driver() requires all devices to remain in the xarray 669 * while their ops are callable. The last op we call is dealloc_driver 670 * above. This is needed to create a fence on op callbacks prior to 671 * allowing the driver module to unload. 
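 *
 * For illustration only, the classic removal sequence of a provider that
 * does not implement dealloc_driver looks roughly like the sketch below,
 * where driver_cleanup() is a hypothetical stand-in for the driver's own
 * teardown:
 *
 *	ib_unregister_device(ibdev);
 *	driver_cleanup(drv_priv);
 *	ib_dealloc_device(ibdev);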
672 */ 673 down_write(&devices_rwsem); 674 if (xa_load(&devices, device->index) == device) 675 xa_erase(&devices, device->index); 676 up_write(&devices_rwsem); 677 678 /* Expedite releasing netdev references */ 679 free_netdevs(device); 680 681 WARN_ON(!xa_empty(&device->compat_devs)); 682 WARN_ON(!xa_empty(&device->client_data)); 683 WARN_ON(refcount_read(&device->refcount)); 684 rdma_restrack_clean(device); 685 /* Balances with device_initialize */ 686 put_device(&device->dev); 687 } 688 EXPORT_SYMBOL(ib_dealloc_device); 689 690 /* 691 * add_client_context() and remove_client_context() must be safe against 692 * parallel calls on the same device - registration/unregistration of both the 693 * device and client can be occurring in parallel. 694 * 695 * The routines need to be a fence, any caller must not return until the add 696 * or remove is fully completed. 697 */ 698 static int add_client_context(struct ib_device *device, 699 struct ib_client *client) 700 { 701 int ret = 0; 702 703 if (!device->kverbs_provider && !client->no_kverbs_req) 704 return 0; 705 706 down_write(&device->client_data_rwsem); 707 /* 708 * So long as the client is registered hold both the client and device 709 * unregistration locks. 710 */ 711 if (!refcount_inc_not_zero(&client->uses)) 712 goto out_unlock; 713 refcount_inc(&device->refcount); 714 715 /* 716 * Another caller to add_client_context got here first and has already 717 * completely initialized context. 718 */ 719 if (xa_get_mark(&device->client_data, client->client_id, 720 CLIENT_DATA_REGISTERED)) 721 goto out; 722 723 ret = xa_err(xa_store(&device->client_data, client->client_id, NULL, 724 GFP_KERNEL)); 725 if (ret) 726 goto out; 727 downgrade_write(&device->client_data_rwsem); 728 if (client->add) { 729 if (client->add(device)) { 730 /* 731 * If a client fails to add then the error code is 732 * ignored, but we won't call any more ops on this 733 * client. 734 */ 735 xa_erase(&device->client_data, client->client_id); 736 up_read(&device->client_data_rwsem); 737 ib_device_put(device); 738 ib_client_put(client); 739 return 0; 740 } 741 } 742 743 /* Readers shall not see a client until add has been completed */ 744 xa_set_mark(&device->client_data, client->client_id, 745 CLIENT_DATA_REGISTERED); 746 up_read(&device->client_data_rwsem); 747 return 0; 748 749 out: 750 ib_device_put(device); 751 ib_client_put(client); 752 out_unlock: 753 up_write(&device->client_data_rwsem); 754 return ret; 755 } 756 757 static void remove_client_context(struct ib_device *device, 758 unsigned int client_id) 759 { 760 struct ib_client *client; 761 void *client_data; 762 763 down_write(&device->client_data_rwsem); 764 if (!xa_get_mark(&device->client_data, client_id, 765 CLIENT_DATA_REGISTERED)) { 766 up_write(&device->client_data_rwsem); 767 return; 768 } 769 client_data = xa_load(&device->client_data, client_id); 770 xa_clear_mark(&device->client_data, client_id, CLIENT_DATA_REGISTERED); 771 client = xa_load(&clients, client_id); 772 up_write(&device->client_data_rwsem); 773 774 /* 775 * Notice we cannot be holding any exclusive locks when calling the 776 * remove callback as the remove callback can recurse back into any 777 * public functions in this module and thus try for any locks those 778 * functions take. 779 * 780 * For this reason clients and drivers should not call the 781 * unregistration functions will holdling any locks. 
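 *
 * As a hedged illustration of that constraint, a client's remove() callback
 * has this shape and may itself call back into ib_core (for example via
 * ib_get_client_data()), which is why no exclusive core lock may be held
 * while it runs; my_remove() and struct my_state are hypothetical:
 *
 *	static void my_remove(struct ib_device *device, void *client_data)
 *	{
 *		struct my_state *st = client_data;
 *
 *		... tear down st, possibly calling ib_core helpers ...
 *	}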
782 */ 783 if (client->remove) 784 client->remove(device, client_data); 785 786 xa_erase(&device->client_data, client_id); 787 ib_device_put(device); 788 ib_client_put(client); 789 } 790 791 static int alloc_port_data(struct ib_device *device) 792 { 793 struct ib_port_data_rcu *pdata_rcu; 794 u32 port; 795 796 if (device->port_data) 797 return 0; 798 799 /* This can only be called once the physical port range is defined */ 800 if (WARN_ON(!device->phys_port_cnt)) 801 return -EINVAL; 802 803 /* Reserve U32_MAX so the logic to go over all the ports is sane */ 804 if (WARN_ON(device->phys_port_cnt == U32_MAX)) 805 return -EINVAL; 806 807 /* 808 * device->port_data is indexed directly by the port number to make 809 * access to this data as efficient as possible. 810 * 811 * Therefore port_data is declared as a 1 based array with potential 812 * empty slots at the beginning. 813 */ 814 pdata_rcu = kzalloc(struct_size(pdata_rcu, pdata, 815 size_add(rdma_end_port(device), 1)), 816 GFP_KERNEL); 817 if (!pdata_rcu) 818 return -ENOMEM; 819 /* 820 * The rcu_head is put in front of the port data array and the stored 821 * pointer is adjusted since we never need to see that member until 822 * kfree_rcu. 823 */ 824 device->port_data = pdata_rcu->pdata; 825 826 rdma_for_each_port (device, port) { 827 struct ib_port_data *pdata = &device->port_data[port]; 828 829 pdata->ib_dev = device; 830 spin_lock_init(&pdata->pkey_list_lock); 831 INIT_LIST_HEAD(&pdata->pkey_list); 832 spin_lock_init(&pdata->netdev_lock); 833 INIT_HLIST_NODE(&pdata->ndev_hash_link); 834 } 835 return 0; 836 } 837 838 static int verify_immutable(const struct ib_device *dev, u32 port) 839 { 840 return WARN_ON(!rdma_cap_ib_mad(dev, port) && 841 rdma_max_mad_size(dev, port) != 0); 842 } 843 844 static int setup_port_data(struct ib_device *device) 845 { 846 u32 port; 847 int ret; 848 849 ret = alloc_port_data(device); 850 if (ret) 851 return ret; 852 853 rdma_for_each_port (device, port) { 854 struct ib_port_data *pdata = &device->port_data[port]; 855 856 ret = device->ops.get_port_immutable(device, port, 857 &pdata->immutable); 858 if (ret) 859 return ret; 860 861 if (verify_immutable(device, port)) 862 return -EINVAL; 863 } 864 return 0; 865 } 866 867 /** 868 * ib_port_immutable_read() - Read rdma port's immutable data 869 * @dev: IB device 870 * @port: port number whose immutable data to read. It starts with index 1 and 871 * is valid up to and including rdma_end_port().
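 *
 * Sketch of a typical caller (illustrative only):
 *
 *	u32 port;
 *
 *	rdma_for_each_port (dev, port) {
 *		const struct ib_port_immutable *im =
 *			ib_port_immutable_read(dev, port);
 *
 *		... inspect im->core_cap_flags, im->gid_tbl_len, etc. ...
 *	}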
872 */ 873 const struct ib_port_immutable* 874 ib_port_immutable_read(struct ib_device *dev, unsigned int port) 875 { 876 WARN_ON(!rdma_is_port_valid(dev, port)); 877 return &dev->port_data[port].immutable; 878 } 879 EXPORT_SYMBOL(ib_port_immutable_read); 880 881 void ib_get_device_fw_str(struct ib_device *dev, char *str) 882 { 883 if (dev->ops.get_dev_fw_str) 884 dev->ops.get_dev_fw_str(dev, str); 885 else 886 str[0] = '\0'; 887 } 888 EXPORT_SYMBOL(ib_get_device_fw_str); 889 890 static void ib_policy_change_task(struct work_struct *work) 891 { 892 struct ib_device *dev; 893 unsigned long index; 894 895 down_read(&devices_rwsem); 896 xa_for_each_marked (&devices, index, dev, DEVICE_REGISTERED) { 897 unsigned int i; 898 899 rdma_for_each_port (dev, i) { 900 u64 sp; 901 ib_get_cached_subnet_prefix(dev, i, &sp); 902 ib_security_cache_change(dev, i, sp); 903 } 904 } 905 up_read(&devices_rwsem); 906 } 907 908 static int ib_security_change(struct notifier_block *nb, unsigned long event, 909 void *lsm_data) 910 { 911 if (event != LSM_POLICY_CHANGE) 912 return NOTIFY_DONE; 913 914 schedule_work(&ib_policy_change_work); 915 ib_mad_agent_security_change(); 916 917 return NOTIFY_OK; 918 } 919 920 static void compatdev_release(struct device *dev) 921 { 922 struct ib_core_device *cdev = 923 container_of(dev, struct ib_core_device, dev); 924 925 kfree(cdev); 926 } 927 928 static int add_one_compat_dev(struct ib_device *device, 929 struct rdma_dev_net *rnet) 930 { 931 struct ib_core_device *cdev; 932 int ret; 933 934 lockdep_assert_held(&rdma_nets_rwsem); 935 if (!ib_devices_shared_netns) 936 return 0; 937 938 /* 939 * Create and add compat device in all namespaces other than where it 940 * is currently bound to. 941 */ 942 if (net_eq(read_pnet(&rnet->net), 943 read_pnet(&device->coredev.rdma_net))) 944 return 0; 945 946 /* 947 * The first of init_net() or ib_register_device() to take the 948 * compat_devs_mutex wins and gets to add the device. Others will wait 949 * for completion here. 
950 */ 951 mutex_lock(&device->compat_devs_mutex); 952 cdev = xa_load(&device->compat_devs, rnet->id); 953 if (cdev) { 954 ret = 0; 955 goto done; 956 } 957 ret = xa_reserve(&device->compat_devs, rnet->id, GFP_KERNEL); 958 if (ret) 959 goto done; 960 961 cdev = kzalloc(sizeof(*cdev), GFP_KERNEL); 962 if (!cdev) { 963 ret = -ENOMEM; 964 goto cdev_err; 965 } 966 967 cdev->dev.parent = device->dev.parent; 968 rdma_init_coredev(cdev, device, read_pnet(&rnet->net)); 969 cdev->dev.release = compatdev_release; 970 ret = dev_set_name(&cdev->dev, "%s", dev_name(&device->dev)); 971 if (ret) 972 goto add_err; 973 974 ret = device_add(&cdev->dev); 975 if (ret) 976 goto add_err; 977 ret = ib_setup_port_attrs(cdev); 978 if (ret) 979 goto port_err; 980 981 ret = xa_err(xa_store(&device->compat_devs, rnet->id, 982 cdev, GFP_KERNEL)); 983 if (ret) 984 goto insert_err; 985 986 mutex_unlock(&device->compat_devs_mutex); 987 return 0; 988 989 insert_err: 990 ib_free_port_attrs(cdev); 991 port_err: 992 device_del(&cdev->dev); 993 add_err: 994 put_device(&cdev->dev); 995 cdev_err: 996 xa_release(&device->compat_devs, rnet->id); 997 done: 998 mutex_unlock(&device->compat_devs_mutex); 999 return ret; 1000 } 1001 1002 static void remove_one_compat_dev(struct ib_device *device, u32 id) 1003 { 1004 struct ib_core_device *cdev; 1005 1006 mutex_lock(&device->compat_devs_mutex); 1007 cdev = xa_erase(&device->compat_devs, id); 1008 mutex_unlock(&device->compat_devs_mutex); 1009 if (cdev) { 1010 ib_free_port_attrs(cdev); 1011 device_del(&cdev->dev); 1012 put_device(&cdev->dev); 1013 } 1014 } 1015 1016 static void remove_compat_devs(struct ib_device *device) 1017 { 1018 struct ib_core_device *cdev; 1019 unsigned long index; 1020 1021 xa_for_each (&device->compat_devs, index, cdev) 1022 remove_one_compat_dev(device, index); 1023 } 1024 1025 static int add_compat_devs(struct ib_device *device) 1026 { 1027 struct rdma_dev_net *rnet; 1028 unsigned long index; 1029 int ret = 0; 1030 1031 lockdep_assert_held(&devices_rwsem); 1032 1033 down_read(&rdma_nets_rwsem); 1034 xa_for_each (&rdma_nets, index, rnet) { 1035 ret = add_one_compat_dev(device, rnet); 1036 if (ret) 1037 break; 1038 } 1039 up_read(&rdma_nets_rwsem); 1040 return ret; 1041 } 1042 1043 static void remove_all_compat_devs(void) 1044 { 1045 struct ib_compat_device *cdev; 1046 struct ib_device *dev; 1047 unsigned long index; 1048 1049 down_read(&devices_rwsem); 1050 xa_for_each (&devices, index, dev) { 1051 unsigned long c_index = 0; 1052 1053 /* Hold nets_rwsem so that any other thread modifying this 1054 * system param can sync with this thread. 1055 */ 1056 down_read(&rdma_nets_rwsem); 1057 xa_for_each (&dev->compat_devs, c_index, cdev) 1058 remove_one_compat_dev(dev, c_index); 1059 up_read(&rdma_nets_rwsem); 1060 } 1061 up_read(&devices_rwsem); 1062 } 1063 1064 static int add_all_compat_devs(void) 1065 { 1066 struct rdma_dev_net *rnet; 1067 struct ib_device *dev; 1068 unsigned long index; 1069 int ret = 0; 1070 1071 down_read(&devices_rwsem); 1072 xa_for_each_marked (&devices, index, dev, DEVICE_REGISTERED) { 1073 unsigned long net_index = 0; 1074 1075 /* Hold nets_rwsem so that any other thread modifying this 1076 * system param can sync with this thread. 
1077 */ 1078 down_read(&rdma_nets_rwsem); 1079 xa_for_each (&rdma_nets, net_index, rnet) { 1080 ret = add_one_compat_dev(dev, rnet); 1081 if (ret) 1082 break; 1083 } 1084 up_read(&rdma_nets_rwsem); 1085 } 1086 up_read(&devices_rwsem); 1087 if (ret) 1088 remove_all_compat_devs(); 1089 return ret; 1090 } 1091 1092 int rdma_compatdev_set(u8 enable) 1093 { 1094 struct rdma_dev_net *rnet; 1095 unsigned long index; 1096 int ret = 0; 1097 1098 down_write(&rdma_nets_rwsem); 1099 if (ib_devices_shared_netns == enable) { 1100 up_write(&rdma_nets_rwsem); 1101 return 0; 1102 } 1103 1104 /* enable/disable of compat devices is not supported 1105 * when more than default init_net exists. 1106 */ 1107 xa_for_each (&rdma_nets, index, rnet) { 1108 ret++; 1109 break; 1110 } 1111 if (!ret) 1112 ib_devices_shared_netns = enable; 1113 up_write(&rdma_nets_rwsem); 1114 if (ret) 1115 return -EBUSY; 1116 1117 if (enable) 1118 ret = add_all_compat_devs(); 1119 else 1120 remove_all_compat_devs(); 1121 return ret; 1122 } 1123 1124 static void rdma_dev_exit_net(struct net *net) 1125 { 1126 struct rdma_dev_net *rnet = rdma_net_to_dev_net(net); 1127 struct ib_device *dev; 1128 unsigned long index; 1129 int ret; 1130 1131 down_write(&rdma_nets_rwsem); 1132 /* 1133 * Prevent the ID from being re-used and hide the id from xa_for_each. 1134 */ 1135 ret = xa_err(xa_store(&rdma_nets, rnet->id, NULL, GFP_KERNEL)); 1136 WARN_ON(ret); 1137 up_write(&rdma_nets_rwsem); 1138 1139 down_read(&devices_rwsem); 1140 xa_for_each (&devices, index, dev) { 1141 get_device(&dev->dev); 1142 /* 1143 * Release the devices_rwsem so that pontentially blocking 1144 * device_del, doesn't hold the devices_rwsem for too long. 1145 */ 1146 up_read(&devices_rwsem); 1147 1148 remove_one_compat_dev(dev, rnet->id); 1149 1150 /* 1151 * If the real device is in the NS then move it back to init. 1152 */ 1153 rdma_dev_change_netns(dev, net, &init_net); 1154 1155 put_device(&dev->dev); 1156 down_read(&devices_rwsem); 1157 } 1158 up_read(&devices_rwsem); 1159 1160 rdma_nl_net_exit(rnet); 1161 xa_erase(&rdma_nets, rnet->id); 1162 } 1163 1164 static __net_init int rdma_dev_init_net(struct net *net) 1165 { 1166 struct rdma_dev_net *rnet = rdma_net_to_dev_net(net); 1167 unsigned long index; 1168 struct ib_device *dev; 1169 int ret; 1170 1171 write_pnet(&rnet->net, net); 1172 1173 ret = rdma_nl_net_init(rnet); 1174 if (ret) 1175 return ret; 1176 1177 /* No need to create any compat devices in default init_net. */ 1178 if (net_eq(net, &init_net)) 1179 return 0; 1180 1181 ret = xa_alloc(&rdma_nets, &rnet->id, rnet, xa_limit_32b, GFP_KERNEL); 1182 if (ret) { 1183 rdma_nl_net_exit(rnet); 1184 return ret; 1185 } 1186 1187 down_read(&devices_rwsem); 1188 xa_for_each_marked (&devices, index, dev, DEVICE_REGISTERED) { 1189 /* Hold nets_rwsem so that netlink command cannot change 1190 * system configuration for device sharing mode. 1191 */ 1192 down_read(&rdma_nets_rwsem); 1193 ret = add_one_compat_dev(dev, rnet); 1194 up_read(&rdma_nets_rwsem); 1195 if (ret) 1196 break; 1197 } 1198 up_read(&devices_rwsem); 1199 1200 if (ret) 1201 rdma_dev_exit_net(net); 1202 1203 return ret; 1204 } 1205 1206 /* 1207 * Assign the unique string device name and the unique device index. This is 1208 * undone by ib_dealloc_device. 
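 *
 * For illustration (the pattern string is hypothetical): a driver that
 * registers with ib_register_device(ibdev, "foo%d", dma_dev) gets the
 * lowest free index filled in by alloc_name(), producing "foo0", "foo1",
 * and so on, while passing a literal name that is already in use fails
 * with -ENFILE.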
1209 */ 1210 static int assign_name(struct ib_device *device, const char *name) 1211 { 1212 static u32 last_id; 1213 int ret; 1214 1215 down_write(&devices_rwsem); 1216 /* Assign a unique name to the device */ 1217 if (strchr(name, '%')) 1218 ret = alloc_name(device, name); 1219 else 1220 ret = dev_set_name(&device->dev, name); 1221 if (ret) 1222 goto out; 1223 1224 if (__ib_device_get_by_name(dev_name(&device->dev))) { 1225 ret = -ENFILE; 1226 goto out; 1227 } 1228 strscpy(device->name, dev_name(&device->dev), IB_DEVICE_NAME_MAX); 1229 1230 ret = xa_alloc_cyclic(&devices, &device->index, device, xa_limit_31b, 1231 &last_id, GFP_KERNEL); 1232 if (ret > 0) 1233 ret = 0; 1234 1235 out: 1236 up_write(&devices_rwsem); 1237 return ret; 1238 } 1239 1240 /* 1241 * setup_device() allocates memory and sets up data that requires calling the 1242 * device ops, this is the only reason these actions are not done during 1243 * ib_alloc_device. It is undone by ib_dealloc_device(). 1244 */ 1245 static int setup_device(struct ib_device *device) 1246 { 1247 struct ib_udata uhw = {.outlen = 0, .inlen = 0}; 1248 int ret; 1249 1250 ib_device_check_mandatory(device); 1251 1252 ret = setup_port_data(device); 1253 if (ret) { 1254 dev_warn(&device->dev, "Couldn't create per-port data\n"); 1255 return ret; 1256 } 1257 1258 memset(&device->attrs, 0, sizeof(device->attrs)); 1259 ret = device->ops.query_device(device, &device->attrs, &uhw); 1260 if (ret) { 1261 dev_warn(&device->dev, 1262 "Couldn't query the device attributes\n"); 1263 return ret; 1264 } 1265 1266 return 0; 1267 } 1268 1269 static void disable_device(struct ib_device *device) 1270 { 1271 u32 cid; 1272 1273 WARN_ON(!refcount_read(&device->refcount)); 1274 1275 down_write(&devices_rwsem); 1276 xa_clear_mark(&devices, device->index, DEVICE_REGISTERED); 1277 up_write(&devices_rwsem); 1278 1279 /* 1280 * Remove clients in LIFO order, see assign_client_id. This could be 1281 * more efficient if xarray learns to reverse iterate. Since no new 1282 * clients can be added to this ib_device past this point we only need 1283 * the maximum possible client_id value here. 1284 */ 1285 down_read(&clients_rwsem); 1286 cid = highest_client_id; 1287 up_read(&clients_rwsem); 1288 while (cid) { 1289 cid--; 1290 remove_client_context(device, cid); 1291 } 1292 1293 ib_cq_pool_cleanup(device); 1294 1295 /* Pairs with refcount_set in enable_device */ 1296 ib_device_put(device); 1297 wait_for_completion(&device->unreg_completion); 1298 1299 /* 1300 * compat devices must be removed after device refcount drops to zero. 1301 * Otherwise init_net() may add more compatdevs after removing compat 1302 * devices and before device is disabled. 1303 */ 1304 remove_compat_devs(device); 1305 } 1306 1307 /* 1308 * An enabled device is visible to all clients and to all the public facing 1309 * APIs that return a device pointer. This always returns with a new get, even 1310 * if it fails. 1311 */ 1312 static int enable_device_and_get(struct ib_device *device) 1313 { 1314 struct ib_client *client; 1315 unsigned long index; 1316 int ret = 0; 1317 1318 /* 1319 * One ref belongs to the xa and the other belongs to this 1320 * thread. This is needed to guard against parallel unregistration. 1321 */ 1322 refcount_set(&device->refcount, 2); 1323 down_write(&devices_rwsem); 1324 xa_set_mark(&devices, device->index, DEVICE_REGISTERED); 1325 1326 /* 1327 * By using downgrade_write() we ensure that no other thread can clear 1328 * DEVICE_REGISTERED while we are completing the client setup. 
1329 */ 1330 downgrade_write(&devices_rwsem); 1331 1332 if (device->ops.enable_driver) { 1333 ret = device->ops.enable_driver(device); 1334 if (ret) 1335 goto out; 1336 } 1337 1338 down_read(&clients_rwsem); 1339 xa_for_each_marked (&clients, index, client, CLIENT_REGISTERED) { 1340 ret = add_client_context(device, client); 1341 if (ret) 1342 break; 1343 } 1344 up_read(&clients_rwsem); 1345 if (!ret) 1346 ret = add_compat_devs(device); 1347 out: 1348 up_read(&devices_rwsem); 1349 return ret; 1350 } 1351 1352 static void prevent_dealloc_device(struct ib_device *ib_dev) 1353 { 1354 } 1355 1356 static void ib_device_notify_register(struct ib_device *device) 1357 { 1358 struct net_device *netdev; 1359 u32 port; 1360 int ret; 1361 1362 down_read(&devices_rwsem); 1363 1364 /* Mark for userspace that device is ready */ 1365 kobject_uevent(&device->dev.kobj, KOBJ_ADD); 1366 1367 ret = rdma_nl_notify_event(device, 0, RDMA_REGISTER_EVENT); 1368 if (ret) 1369 goto out; 1370 1371 rdma_for_each_port(device, port) { 1372 netdev = ib_device_get_netdev(device, port); 1373 if (!netdev) 1374 continue; 1375 1376 ret = rdma_nl_notify_event(device, port, 1377 RDMA_NETDEV_ATTACH_EVENT); 1378 dev_put(netdev); 1379 if (ret) 1380 goto out; 1381 } 1382 1383 out: 1384 up_read(&devices_rwsem); 1385 } 1386 1387 /** 1388 * ib_register_device - Register an IB device with IB core 1389 * @device: Device to register 1390 * @name: unique string device name. This may include a '%' which will 1391 * cause a unique index to be added to the passed device name. 1392 * @dma_device: pointer to a DMA-capable device. If %NULL, then the IB 1393 * device will be used. In this case the caller should fully 1394 * setup the ibdev for DMA. This usually means using dma_virt_ops. 1395 * 1396 * Low-level drivers use ib_register_device() to register their 1397 * devices with the IB core. All registered clients will receive a 1398 * callback for each device that is added. @device must be allocated 1399 * with ib_alloc_device(). 1400 * 1401 * If the driver uses ops.dealloc_driver and calls any ib_unregister_device() 1402 * asynchronously then the device pointer may become freed as soon as this 1403 * function returns. 1404 */ 1405 int ib_register_device(struct ib_device *device, const char *name, 1406 struct device *dma_device) 1407 { 1408 int ret; 1409 1410 ret = assign_name(device, name); 1411 if (ret) 1412 return ret; 1413 1414 /* 1415 * If the caller does not provide a DMA capable device then the IB core 1416 * will set up ib_sge and scatterlist structures that stash the kernel 1417 * virtual address into the address field. 1418 */ 1419 WARN_ON(dma_device && !dma_device->dma_parms); 1420 device->dma_device = dma_device; 1421 1422 ret = setup_device(device); 1423 if (ret) 1424 return ret; 1425 1426 ret = ib_cache_setup_one(device); 1427 if (ret) { 1428 dev_warn(&device->dev, 1429 "Couldn't set up InfiniBand P_Key/GID cache\n"); 1430 return ret; 1431 } 1432 1433 device->groups[0] = &ib_dev_attr_group; 1434 device->groups[1] = device->ops.device_group; 1435 ret = ib_setup_device_attrs(device); 1436 if (ret) 1437 goto cache_cleanup; 1438 1439 ib_device_register_rdmacg(device); 1440 1441 rdma_counter_init(device); 1442 1443 /* 1444 * Ensure that ADD uevent is not fired because it 1445 * is too early amd device is not initialized yet. 
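 *
 * (Descriptive note: the sequence used here is dev_set_uevent_suppress()
 * set to true before device_add(), then suppression lifted and an explicit
 * KOBJ_ADD uevent generated from ib_device_notify_register() only once
 * registration has fully succeeded.)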
1446 */ 1447 dev_set_uevent_suppress(&device->dev, true); 1448 ret = device_add(&device->dev); 1449 if (ret) 1450 goto cg_cleanup; 1451 1452 ret = ib_setup_port_attrs(&device->coredev); 1453 if (ret) { 1454 dev_warn(&device->dev, 1455 "Couldn't register device with driver model\n"); 1456 goto dev_cleanup; 1457 } 1458 1459 ret = enable_device_and_get(device); 1460 if (ret) { 1461 void (*dealloc_fn)(struct ib_device *); 1462 1463 /* 1464 * If we hit this error flow then we don't want to 1465 * automatically dealloc the device since the caller is 1466 * expected to call ib_dealloc_device() after 1467 * ib_register_device() fails. This is tricky due to the 1468 * possibility for a parallel unregistration along with this 1469 * error flow. Since we have a refcount here we know any 1470 * parallel flow is stopped in disable_device and will see the 1471 * special dealloc_driver pointer, causing the responsibility to 1472 * ib_dealloc_device() to revert back to this thread. 1473 */ 1474 dealloc_fn = device->ops.dealloc_driver; 1475 device->ops.dealloc_driver = prevent_dealloc_device; 1476 ib_device_put(device); 1477 __ib_unregister_device(device); 1478 device->ops.dealloc_driver = dealloc_fn; 1479 dev_set_uevent_suppress(&device->dev, false); 1480 return ret; 1481 } 1482 dev_set_uevent_suppress(&device->dev, false); 1483 1484 ib_device_notify_register(device); 1485 1486 ib_device_put(device); 1487 1488 return 0; 1489 1490 dev_cleanup: 1491 device_del(&device->dev); 1492 cg_cleanup: 1493 dev_set_uevent_suppress(&device->dev, false); 1494 ib_device_unregister_rdmacg(device); 1495 cache_cleanup: 1496 ib_cache_cleanup_one(device); 1497 return ret; 1498 } 1499 EXPORT_SYMBOL(ib_register_device); 1500 1501 /* Callers must hold a get on the device. */ 1502 static void __ib_unregister_device(struct ib_device *ib_dev) 1503 { 1504 struct ib_device *sub, *tmp; 1505 1506 mutex_lock(&ib_dev->subdev_lock); 1507 list_for_each_entry_safe_reverse(sub, tmp, 1508 &ib_dev->subdev_list_head, 1509 subdev_list) { 1510 list_del(&sub->subdev_list); 1511 ib_dev->ops.del_sub_dev(sub); 1512 ib_device_put(ib_dev); 1513 } 1514 mutex_unlock(&ib_dev->subdev_lock); 1515 1516 /* 1517 * We have a registration lock so that all the calls to unregister are 1518 * fully fenced, once any unregister returns the device is truly 1519 * unregistered even if multiple callers are unregistering it at the 1520 * same time. This also interacts with the registration flow and 1521 * provides sane semantics if register and unregister are racing. 1522 */ 1523 mutex_lock(&ib_dev->unregistration_lock); 1524 if (!refcount_read(&ib_dev->refcount)) 1525 goto out; 1526 1527 disable_device(ib_dev); 1528 rdma_nl_notify_event(ib_dev, 0, RDMA_UNREGISTER_EVENT); 1529 1530 /* Expedite removing unregistered pointers from the hash table */ 1531 free_netdevs(ib_dev); 1532 1533 ib_free_port_attrs(&ib_dev->coredev); 1534 device_del(&ib_dev->dev); 1535 ib_device_unregister_rdmacg(ib_dev); 1536 ib_cache_cleanup_one(ib_dev); 1537 1538 /* 1539 * Drivers using the new flow may not call ib_dealloc_device except 1540 * in error unwind prior to registration success. 1541 */ 1542 if (ib_dev->ops.dealloc_driver && 1543 ib_dev->ops.dealloc_driver != prevent_dealloc_device) { 1544 WARN_ON(kref_read(&ib_dev->dev.kobj.kref) <= 1); 1545 ib_dealloc_device(ib_dev); 1546 } 1547 out: 1548 mutex_unlock(&ib_dev->unregistration_lock); 1549 } 1550 1551 /** 1552 * ib_unregister_device - Unregister an IB device 1553 * @ib_dev: The device to unregister 1554 * 1555 * Unregister an IB device. 
All clients will receive a remove callback. 1556 * 1557 * Callers should call this routine only once, and protect against races with 1558 * registration. Typically it should only be called as part of a remove 1559 * callback in an implementation of driver core's struct device_driver and 1560 * related. 1561 * 1562 * If ops.dealloc_driver is used then ib_dev will be freed upon return from 1563 * this function. 1564 */ 1565 void ib_unregister_device(struct ib_device *ib_dev) 1566 { 1567 get_device(&ib_dev->dev); 1568 __ib_unregister_device(ib_dev); 1569 put_device(&ib_dev->dev); 1570 } 1571 EXPORT_SYMBOL(ib_unregister_device); 1572 1573 /** 1574 * ib_unregister_device_and_put - Unregister a device while holding a 'get' 1575 * @ib_dev: The device to unregister 1576 * 1577 * This is the same as ib_unregister_device(), except it includes an internal 1578 * ib_device_put() that should match a 'get' obtained by the caller. 1579 * 1580 * It is safe to call this routine concurrently from multiple threads while 1581 * holding the 'get'. When the function returns the device is fully 1582 * unregistered. 1583 * 1584 * Drivers using this flow MUST use the driver_unregister callback to clean up 1585 * their resources associated with the device and dealloc it. 1586 */ 1587 void ib_unregister_device_and_put(struct ib_device *ib_dev) 1588 { 1589 WARN_ON(!ib_dev->ops.dealloc_driver); 1590 get_device(&ib_dev->dev); 1591 ib_device_put(ib_dev); 1592 __ib_unregister_device(ib_dev); 1593 put_device(&ib_dev->dev); 1594 } 1595 EXPORT_SYMBOL(ib_unregister_device_and_put); 1596 1597 /** 1598 * ib_unregister_driver - Unregister all IB devices for a driver 1599 * @driver_id: The driver to unregister 1600 * 1601 * This implements a fence for device unregistration. It only returns once all 1602 * devices associated with the driver_id have fully completed their 1603 * unregistration and returned from ib_unregister_device*(). 1604 * 1605 * If devices are not yet unregistered, it goes ahead and starts unregistering 1606 * them. 1607 * 1608 * This does not block creation of new devices with the given driver_id, that 1609 * is the responsibility of the caller. 1610 */ 1611 void ib_unregister_driver(enum rdma_driver_id driver_id) 1612 { 1613 struct ib_device *ib_dev; 1614 unsigned long index; 1615 1616 down_read(&devices_rwsem); 1617 xa_for_each (&devices, index, ib_dev) { 1618 if (ib_dev->ops.driver_id != driver_id) 1619 continue; 1620 1621 get_device(&ib_dev->dev); 1622 up_read(&devices_rwsem); 1623 1624 WARN_ON(!ib_dev->ops.dealloc_driver); 1625 __ib_unregister_device(ib_dev); 1626 1627 put_device(&ib_dev->dev); 1628 down_read(&devices_rwsem); 1629 } 1630 up_read(&devices_rwsem); 1631 } 1632 EXPORT_SYMBOL(ib_unregister_driver); 1633 1634 static void ib_unregister_work(struct work_struct *work) 1635 { 1636 struct ib_device *ib_dev = 1637 container_of(work, struct ib_device, unregistration_work); 1638 1639 __ib_unregister_device(ib_dev); 1640 put_device(&ib_dev->dev); 1641 } 1642 1643 /** 1644 * ib_unregister_device_queued - Unregister a device using a work queue 1645 * @ib_dev: The device to unregister 1646 * 1647 * This schedules an asynchronous unregistration using a WQ for the device. A 1648 * driver should use this to avoid holding locks while doing unregistration, 1649 * such as holding the RTNL lock. 1650 * 1651 * Drivers using this API must use ib_unregister_driver before module unload 1652 * to ensure that all scheduled unregistrations have completed.
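 *
 * Sketch of the expected module teardown for such a driver (illustrative;
 * my_exit() is hypothetical and RDMA_DRIVER_SIW is just an example id):
 *
 *	static void __exit my_exit(void)
 *	{
 *		ib_unregister_driver(RDMA_DRIVER_SIW);
 *		// all work queued by ib_unregister_device_queued() is done
 *	}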
1653 */ 1654 void ib_unregister_device_queued(struct ib_device *ib_dev) 1655 { 1656 WARN_ON(!refcount_read(&ib_dev->refcount)); 1657 WARN_ON(!ib_dev->ops.dealloc_driver); 1658 get_device(&ib_dev->dev); 1659 if (!queue_work(ib_unreg_wq, &ib_dev->unregistration_work)) 1660 put_device(&ib_dev->dev); 1661 } 1662 EXPORT_SYMBOL(ib_unregister_device_queued); 1663 1664 /* 1665 * The caller must pass in a device that has the kref held and the refcount 1666 * released. If the device is in cur_net and still registered then it is moved 1667 * into net. 1668 */ 1669 static int rdma_dev_change_netns(struct ib_device *device, struct net *cur_net, 1670 struct net *net) 1671 { 1672 int ret2 = -EINVAL; 1673 int ret; 1674 1675 mutex_lock(&device->unregistration_lock); 1676 1677 /* 1678 * If a device not under ib_device_get() or if the unregistration_lock 1679 * is not held, the namespace can be changed, or it can be unregistered. 1680 * Check again under the lock. 1681 */ 1682 if (refcount_read(&device->refcount) == 0 || 1683 !net_eq(cur_net, read_pnet(&device->coredev.rdma_net))) { 1684 ret = -ENODEV; 1685 goto out; 1686 } 1687 1688 kobject_uevent(&device->dev.kobj, KOBJ_REMOVE); 1689 disable_device(device); 1690 1691 /* 1692 * At this point no one can be using the device, so it is safe to 1693 * change the namespace. 1694 */ 1695 write_pnet(&device->coredev.rdma_net, net); 1696 1697 down_read(&devices_rwsem); 1698 /* 1699 * Currently rdma devices are system wide unique. So the device name 1700 * is guaranteed free in the new namespace. Publish the new namespace 1701 * at the sysfs level. 1702 */ 1703 ret = device_rename(&device->dev, dev_name(&device->dev)); 1704 up_read(&devices_rwsem); 1705 if (ret) { 1706 dev_warn(&device->dev, 1707 "%s: Couldn't rename device after namespace change\n", 1708 __func__); 1709 /* Try and put things back and re-enable the device */ 1710 write_pnet(&device->coredev.rdma_net, cur_net); 1711 } 1712 1713 ret2 = enable_device_and_get(device); 1714 if (ret2) { 1715 /* 1716 * This shouldn't really happen, but if it does, let the user 1717 * retry at later point. So don't disable the device. 1718 */ 1719 dev_warn(&device->dev, 1720 "%s: Couldn't re-enable device after namespace change\n", 1721 __func__); 1722 } 1723 kobject_uevent(&device->dev.kobj, KOBJ_ADD); 1724 1725 ib_device_put(device); 1726 out: 1727 mutex_unlock(&device->unregistration_lock); 1728 if (ret) 1729 return ret; 1730 return ret2; 1731 } 1732 1733 int ib_device_set_netns_put(struct sk_buff *skb, 1734 struct ib_device *dev, u32 ns_fd) 1735 { 1736 struct net *net; 1737 int ret; 1738 1739 net = get_net_ns_by_fd(ns_fd); 1740 if (IS_ERR(net)) { 1741 ret = PTR_ERR(net); 1742 goto net_err; 1743 } 1744 1745 if (!netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN)) { 1746 ret = -EPERM; 1747 goto ns_err; 1748 } 1749 1750 /* 1751 * All the ib_clients, including uverbs, are reset when the namespace is 1752 * changed and this cannot be blocked waiting for userspace to do 1753 * something, so disassociation is mandatory. 
1754 */ 1755 if (!dev->ops.disassociate_ucontext || ib_devices_shared_netns) { 1756 ret = -EOPNOTSUPP; 1757 goto ns_err; 1758 } 1759 1760 get_device(&dev->dev); 1761 ib_device_put(dev); 1762 ret = rdma_dev_change_netns(dev, current->nsproxy->net_ns, net); 1763 put_device(&dev->dev); 1764 1765 put_net(net); 1766 return ret; 1767 1768 ns_err: 1769 put_net(net); 1770 net_err: 1771 ib_device_put(dev); 1772 return ret; 1773 } 1774 1775 static struct pernet_operations rdma_dev_net_ops = { 1776 .init = rdma_dev_init_net, 1777 .exit = rdma_dev_exit_net, 1778 .id = &rdma_dev_net_id, 1779 .size = sizeof(struct rdma_dev_net), 1780 }; 1781 1782 static int assign_client_id(struct ib_client *client) 1783 { 1784 int ret; 1785 1786 lockdep_assert_held(&clients_rwsem); 1787 /* 1788 * The add/remove callbacks must be called in FIFO/LIFO order. To 1789 * achieve this we assign client_ids so they are sorted in 1790 * registration order. 1791 */ 1792 client->client_id = highest_client_id; 1793 ret = xa_insert(&clients, client->client_id, client, GFP_KERNEL); 1794 if (ret) 1795 return ret; 1796 1797 highest_client_id++; 1798 xa_set_mark(&clients, client->client_id, CLIENT_REGISTERED); 1799 return 0; 1800 } 1801 1802 static void remove_client_id(struct ib_client *client) 1803 { 1804 down_write(&clients_rwsem); 1805 xa_erase(&clients, client->client_id); 1806 for (; highest_client_id; highest_client_id--) 1807 if (xa_load(&clients, highest_client_id - 1)) 1808 break; 1809 up_write(&clients_rwsem); 1810 } 1811 1812 /** 1813 * ib_register_client - Register an IB client 1814 * @client:Client to register 1815 * 1816 * Upper level users of the IB drivers can use ib_register_client() to 1817 * register callbacks for IB device addition and removal. When an IB 1818 * device is added, each registered client's add method will be called 1819 * (in the order the clients were registered), and when a device is 1820 * removed, each client's remove method will be called (in the reverse 1821 * order that clients were registered). In addition, when 1822 * ib_register_client() is called, the client will receive an add 1823 * callback for all devices already registered. 1824 */ 1825 int ib_register_client(struct ib_client *client) 1826 { 1827 struct ib_device *device; 1828 unsigned long index; 1829 bool need_unreg = false; 1830 int ret; 1831 1832 refcount_set(&client->uses, 1); 1833 init_completion(&client->uses_zero); 1834 1835 /* 1836 * The devices_rwsem is held in write mode to ensure that a racing 1837 * ib_register_device() sees a consisent view of clients and devices. 1838 */ 1839 down_write(&devices_rwsem); 1840 down_write(&clients_rwsem); 1841 ret = assign_client_id(client); 1842 if (ret) 1843 goto out; 1844 1845 need_unreg = true; 1846 xa_for_each_marked (&devices, index, device, DEVICE_REGISTERED) { 1847 ret = add_client_context(device, client); 1848 if (ret) 1849 goto out; 1850 } 1851 ret = 0; 1852 out: 1853 up_write(&clients_rwsem); 1854 up_write(&devices_rwsem); 1855 if (need_unreg && ret) 1856 ib_unregister_client(client); 1857 return ret; 1858 } 1859 EXPORT_SYMBOL(ib_register_client); 1860 1861 /** 1862 * ib_unregister_client - Unregister an IB client 1863 * @client:Client to unregister 1864 * 1865 * Upper level users use ib_unregister_client() to remove their client 1866 * registration. When ib_unregister_client() is called, the client 1867 * will receive a remove callback for each IB device still registered. 
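 *
 * Sketch of the usual pairing (illustrative; my_client, my_add_one and
 * my_remove_one are hypothetical):
 *
 *	static struct ib_client my_client = {
 *		.name	= "my_client",
 *		.add	= my_add_one,
 *		.remove	= my_remove_one,
 *	};
 *
 *	ret = ib_register_client(&my_client);
 *	...
 *	ib_unregister_client(&my_client);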
1868 * 1869 * This is a full fence, once it returns no client callbacks will be called, 1870 * or are running in another thread. 1871 */ 1872 void ib_unregister_client(struct ib_client *client) 1873 { 1874 struct ib_device *device; 1875 unsigned long index; 1876 1877 down_write(&clients_rwsem); 1878 ib_client_put(client); 1879 xa_clear_mark(&clients, client->client_id, CLIENT_REGISTERED); 1880 up_write(&clients_rwsem); 1881 1882 /* We do not want to have locks while calling client->remove() */ 1883 rcu_read_lock(); 1884 xa_for_each (&devices, index, device) { 1885 if (!ib_device_try_get(device)) 1886 continue; 1887 rcu_read_unlock(); 1888 1889 remove_client_context(device, client->client_id); 1890 1891 ib_device_put(device); 1892 rcu_read_lock(); 1893 } 1894 rcu_read_unlock(); 1895 1896 /* 1897 * remove_client_context() is not a fence, it can return even though a 1898 * removal is ongoing. Wait until all removals are completed. 1899 */ 1900 wait_for_completion(&client->uses_zero); 1901 remove_client_id(client); 1902 } 1903 EXPORT_SYMBOL(ib_unregister_client); 1904 1905 static int __ib_get_global_client_nl_info(const char *client_name, 1906 struct ib_client_nl_info *res) 1907 { 1908 struct ib_client *client; 1909 unsigned long index; 1910 int ret = -ENOENT; 1911 1912 down_read(&clients_rwsem); 1913 xa_for_each_marked (&clients, index, client, CLIENT_REGISTERED) { 1914 if (strcmp(client->name, client_name) != 0) 1915 continue; 1916 if (!client->get_global_nl_info) { 1917 ret = -EOPNOTSUPP; 1918 break; 1919 } 1920 ret = client->get_global_nl_info(res); 1921 if (WARN_ON(ret == -ENOENT)) 1922 ret = -EINVAL; 1923 if (!ret && res->cdev) 1924 get_device(res->cdev); 1925 break; 1926 } 1927 up_read(&clients_rwsem); 1928 return ret; 1929 } 1930 1931 static int __ib_get_client_nl_info(struct ib_device *ibdev, 1932 const char *client_name, 1933 struct ib_client_nl_info *res) 1934 { 1935 unsigned long index; 1936 void *client_data; 1937 int ret = -ENOENT; 1938 1939 down_read(&ibdev->client_data_rwsem); 1940 xan_for_each_marked (&ibdev->client_data, index, client_data, 1941 CLIENT_DATA_REGISTERED) { 1942 struct ib_client *client = xa_load(&clients, index); 1943 1944 if (!client || strcmp(client->name, client_name) != 0) 1945 continue; 1946 if (!client->get_nl_info) { 1947 ret = -EOPNOTSUPP; 1948 break; 1949 } 1950 ret = client->get_nl_info(ibdev, client_data, res); 1951 if (WARN_ON(ret == -ENOENT)) 1952 ret = -EINVAL; 1953 1954 /* 1955 * The cdev is guaranteed valid as long as we are inside the 1956 * client_data_rwsem as remove_one can't be called. Keep it 1957 * valid for the caller. 
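 *
 * (Descriptive note: the get_device() taken below is balanced by the
 * caller with put_device(res->cdev) once it has finished using the
 * returned device.)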
1958 */ 1959 if (!ret && res->cdev) 1960 get_device(res->cdev); 1961 break; 1962 } 1963 up_read(&ibdev->client_data_rwsem); 1964 1965 return ret; 1966 } 1967 1968 /** 1969 * ib_get_client_nl_info - Fetch the nl_info from a client 1970 * @ibdev: IB device 1971 * @client_name: Name of the client 1972 * @res: Result of the query 1973 */ 1974 int ib_get_client_nl_info(struct ib_device *ibdev, const char *client_name, 1975 struct ib_client_nl_info *res) 1976 { 1977 int ret; 1978 1979 if (ibdev) 1980 ret = __ib_get_client_nl_info(ibdev, client_name, res); 1981 else 1982 ret = __ib_get_global_client_nl_info(client_name, res); 1983 #ifdef CONFIG_MODULES 1984 if (ret == -ENOENT) { 1985 request_module("rdma-client-%s", client_name); 1986 if (ibdev) 1987 ret = __ib_get_client_nl_info(ibdev, client_name, res); 1988 else 1989 ret = __ib_get_global_client_nl_info(client_name, res); 1990 } 1991 #endif 1992 if (ret) { 1993 if (ret == -ENOENT) 1994 return -EOPNOTSUPP; 1995 return ret; 1996 } 1997 1998 if (WARN_ON(!res->cdev)) 1999 return -EINVAL; 2000 return 0; 2001 } 2002 2003 /** 2004 * ib_set_client_data - Set IB client context 2005 * @device:Device to set context for 2006 * @client:Client to set context for 2007 * @data:Context to set 2008 * 2009 * ib_set_client_data() sets client context data that can be retrieved with 2010 * ib_get_client_data(). This can only be called while the client is 2011 * registered to the device, once the ib_client remove() callback returns this 2012 * cannot be called. 2013 */ 2014 void ib_set_client_data(struct ib_device *device, struct ib_client *client, 2015 void *data) 2016 { 2017 void *rc; 2018 2019 if (WARN_ON(IS_ERR(data))) 2020 data = NULL; 2021 2022 rc = xa_store(&device->client_data, client->client_id, data, 2023 GFP_KERNEL); 2024 WARN_ON(xa_is_err(rc)); 2025 } 2026 EXPORT_SYMBOL(ib_set_client_data); 2027 2028 /** 2029 * ib_register_event_handler - Register an IB event handler 2030 * @event_handler:Handler to register 2031 * 2032 * ib_register_event_handler() registers an event handler that will be 2033 * called back when asynchronous IB events occur (as defined in 2034 * chapter 11 of the InfiniBand Architecture Specification). This 2035 * callback occurs in workqueue context. 2036 */ 2037 void ib_register_event_handler(struct ib_event_handler *event_handler) 2038 { 2039 down_write(&event_handler->device->event_handler_rwsem); 2040 list_add_tail(&event_handler->list, 2041 &event_handler->device->event_handler_list); 2042 up_write(&event_handler->device->event_handler_rwsem); 2043 } 2044 EXPORT_SYMBOL(ib_register_event_handler); 2045 2046 /** 2047 * ib_unregister_event_handler - Unregister an event handler 2048 * @event_handler:Handler to unregister 2049 * 2050 * Unregister an event handler registered with 2051 * ib_register_event_handler(). 
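 *
 * Illustration only (not part of the original source): registering and
 * unregistering an asynchronous event handler might look roughly like the
 * sketch below, where "ibdev", "my_event_cb" and "my_handler" are
 * hypothetical names:
 *
 *	static void my_event_cb(struct ib_event_handler *handler,
 *				struct ib_event *event)
 *	{
 *		pr_info("async event %d from %s\n", event->event,
 *			dev_name(&event->device->dev));
 *	}
 *
 *	static struct ib_event_handler my_handler;
 *
 *	INIT_IB_EVENT_HANDLER(&my_handler, ibdev, my_event_cb);
 *	ib_register_event_handler(&my_handler);
 *	...
 *	ib_unregister_event_handler(&my_handler);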
2052 */ 2053 void ib_unregister_event_handler(struct ib_event_handler *event_handler) 2054 { 2055 down_write(&event_handler->device->event_handler_rwsem); 2056 list_del(&event_handler->list); 2057 up_write(&event_handler->device->event_handler_rwsem); 2058 } 2059 EXPORT_SYMBOL(ib_unregister_event_handler); 2060 2061 void ib_dispatch_event_clients(struct ib_event *event) 2062 { 2063 struct ib_event_handler *handler; 2064 2065 down_read(&event->device->event_handler_rwsem); 2066 2067 list_for_each_entry(handler, &event->device->event_handler_list, list) 2068 handler->handler(handler, event); 2069 2070 up_read(&event->device->event_handler_rwsem); 2071 } 2072 2073 static int iw_query_port(struct ib_device *device, 2074 u32 port_num, 2075 struct ib_port_attr *port_attr) 2076 { 2077 struct in_device *inetdev; 2078 struct net_device *netdev; 2079 2080 memset(port_attr, 0, sizeof(*port_attr)); 2081 2082 netdev = ib_device_get_netdev(device, port_num); 2083 if (!netdev) 2084 return -ENODEV; 2085 2086 port_attr->max_mtu = IB_MTU_4096; 2087 port_attr->active_mtu = ib_mtu_int_to_enum(netdev->mtu); 2088 2089 if (!netif_carrier_ok(netdev)) { 2090 port_attr->state = IB_PORT_DOWN; 2091 port_attr->phys_state = IB_PORT_PHYS_STATE_DISABLED; 2092 } else { 2093 rcu_read_lock(); 2094 inetdev = __in_dev_get_rcu(netdev); 2095 2096 if (inetdev && inetdev->ifa_list) { 2097 port_attr->state = IB_PORT_ACTIVE; 2098 port_attr->phys_state = IB_PORT_PHYS_STATE_LINK_UP; 2099 } else { 2100 port_attr->state = IB_PORT_INIT; 2101 port_attr->phys_state = 2102 IB_PORT_PHYS_STATE_PORT_CONFIGURATION_TRAINING; 2103 } 2104 2105 rcu_read_unlock(); 2106 } 2107 2108 dev_put(netdev); 2109 return device->ops.query_port(device, port_num, port_attr); 2110 } 2111 2112 static int __ib_query_port(struct ib_device *device, 2113 u32 port_num, 2114 struct ib_port_attr *port_attr) 2115 { 2116 int err; 2117 2118 memset(port_attr, 0, sizeof(*port_attr)); 2119 2120 err = device->ops.query_port(device, port_num, port_attr); 2121 if (err || port_attr->subnet_prefix) 2122 return err; 2123 2124 if (rdma_port_get_link_layer(device, port_num) != 2125 IB_LINK_LAYER_INFINIBAND) 2126 return 0; 2127 2128 ib_get_cached_subnet_prefix(device, port_num, 2129 &port_attr->subnet_prefix); 2130 return 0; 2131 } 2132 2133 /** 2134 * ib_query_port - Query IB port attributes 2135 * @device:Device to query 2136 * @port_num:Port number to query 2137 * @port_attr:Port attributes 2138 * 2139 * ib_query_port() returns the attributes of a port through the 2140 * @port_attr pointer. 
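 *
 * Illustration only (not part of the original source): querying every valid
 * port of a hypothetical "ibdev" might look roughly like this:
 *
 *	struct ib_port_attr attr;
 *	u32 port;
 *
 *	rdma_for_each_port(ibdev, port) {
 *		if (!ib_query_port(ibdev, port, &attr))
 *			pr_info("port %u: state %d, active MTU enum %d\n",
 *				port, attr.state, attr.active_mtu);
 *	}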
2141 */ 2142 int ib_query_port(struct ib_device *device, 2143 u32 port_num, 2144 struct ib_port_attr *port_attr) 2145 { 2146 if (!rdma_is_port_valid(device, port_num)) 2147 return -EINVAL; 2148 2149 if (rdma_protocol_iwarp(device, port_num)) 2150 return iw_query_port(device, port_num, port_attr); 2151 else 2152 return __ib_query_port(device, port_num, port_attr); 2153 } 2154 EXPORT_SYMBOL(ib_query_port); 2155 2156 static void add_ndev_hash(struct ib_port_data *pdata) 2157 { 2158 unsigned long flags; 2159 2160 might_sleep(); 2161 2162 spin_lock_irqsave(&ndev_hash_lock, flags); 2163 if (hash_hashed(&pdata->ndev_hash_link)) { 2164 hash_del_rcu(&pdata->ndev_hash_link); 2165 spin_unlock_irqrestore(&ndev_hash_lock, flags); 2166 /* 2167 * We cannot do hash_add_rcu after a hash_del_rcu until the 2168 * grace period 2169 */ 2170 synchronize_rcu(); 2171 spin_lock_irqsave(&ndev_hash_lock, flags); 2172 } 2173 if (pdata->netdev) 2174 hash_add_rcu(ndev_hash, &pdata->ndev_hash_link, 2175 (uintptr_t)pdata->netdev); 2176 spin_unlock_irqrestore(&ndev_hash_lock, flags); 2177 } 2178 2179 /** 2180 * ib_device_set_netdev - Associate the ib_dev with an underlying net_device 2181 * @ib_dev: Device to modify 2182 * @ndev: net_device to affiliate, may be NULL 2183 * @port: IB port the net_device is connected to 2184 * 2185 * Drivers should use this to link the ib_device to a netdev so the netdev 2186 * shows up in interfaces like ib_enum_roce_netdev. Only one netdev may be 2187 * affiliated with any port. 2188 * 2189 * The caller must ensure that the given ndev is not unregistered or 2190 * unregistering, and that either the ib_device is unregistered or 2191 * ib_device_set_netdev() is called with NULL when the ndev sends a 2192 * NETDEV_UNREGISTER event. 2193 */ 2194 int ib_device_set_netdev(struct ib_device *ib_dev, struct net_device *ndev, 2195 u32 port) 2196 { 2197 enum rdma_nl_notify_event_type etype; 2198 struct net_device *old_ndev; 2199 struct ib_port_data *pdata; 2200 unsigned long flags; 2201 int ret; 2202 2203 if (!rdma_is_port_valid(ib_dev, port)) 2204 return -EINVAL; 2205 2206 /* 2207 * Drivers wish to call this before ib_register_driver, so we have to 2208 * setup the port data early. 2209 */ 2210 ret = alloc_port_data(ib_dev); 2211 if (ret) 2212 return ret; 2213 2214 pdata = &ib_dev->port_data[port]; 2215 spin_lock_irqsave(&pdata->netdev_lock, flags); 2216 old_ndev = rcu_dereference_protected( 2217 pdata->netdev, lockdep_is_held(&pdata->netdev_lock)); 2218 if (old_ndev == ndev) { 2219 spin_unlock_irqrestore(&pdata->netdev_lock, flags); 2220 return 0; 2221 } 2222 2223 rcu_assign_pointer(pdata->netdev, ndev); 2224 netdev_put(old_ndev, &pdata->netdev_tracker); 2225 netdev_hold(ndev, &pdata->netdev_tracker, GFP_ATOMIC); 2226 spin_unlock_irqrestore(&pdata->netdev_lock, flags); 2227 2228 add_ndev_hash(pdata); 2229 2230 /* Make sure that the device is registered before we send events */ 2231 if (xa_load(&devices, ib_dev->index) != ib_dev) 2232 return 0; 2233 2234 etype = ndev ? 
RDMA_NETDEV_ATTACH_EVENT : RDMA_NETDEV_DETACH_EVENT; 2235 rdma_nl_notify_event(ib_dev, port, etype); 2236 2237 return 0; 2238 } 2239 EXPORT_SYMBOL(ib_device_set_netdev); 2240 2241 static void free_netdevs(struct ib_device *ib_dev) 2242 { 2243 unsigned long flags; 2244 u32 port; 2245 2246 if (!ib_dev->port_data) 2247 return; 2248 2249 rdma_for_each_port (ib_dev, port) { 2250 struct ib_port_data *pdata = &ib_dev->port_data[port]; 2251 struct net_device *ndev; 2252 2253 spin_lock_irqsave(&pdata->netdev_lock, flags); 2254 ndev = rcu_dereference_protected( 2255 pdata->netdev, lockdep_is_held(&pdata->netdev_lock)); 2256 if (ndev) { 2257 spin_lock(&ndev_hash_lock); 2258 hash_del_rcu(&pdata->ndev_hash_link); 2259 spin_unlock(&ndev_hash_lock); 2260 2261 /* 2262 * If this is the last dev_put there is still a 2263 * synchronize_rcu before the netdev is kfreed, so we 2264 * can continue to rely on unlocked pointer 2265 * comparisons after the put 2266 */ 2267 rcu_assign_pointer(pdata->netdev, NULL); 2268 netdev_put(ndev, &pdata->netdev_tracker); 2269 } 2270 spin_unlock_irqrestore(&pdata->netdev_lock, flags); 2271 } 2272 } 2273 2274 struct net_device *ib_device_get_netdev(struct ib_device *ib_dev, 2275 u32 port) 2276 { 2277 struct ib_port_data *pdata; 2278 struct net_device *res; 2279 2280 if (!rdma_is_port_valid(ib_dev, port)) 2281 return NULL; 2282 2283 if (!ib_dev->port_data) 2284 return NULL; 2285 2286 pdata = &ib_dev->port_data[port]; 2287 2288 /* 2289 * New drivers should use ib_device_set_netdev() not the legacy 2290 * get_netdev(). 2291 */ 2292 if (ib_dev->ops.get_netdev) 2293 res = ib_dev->ops.get_netdev(ib_dev, port); 2294 else { 2295 spin_lock(&pdata->netdev_lock); 2296 res = rcu_dereference_protected( 2297 pdata->netdev, lockdep_is_held(&pdata->netdev_lock)); 2298 dev_hold(res); 2299 spin_unlock(&pdata->netdev_lock); 2300 } 2301 2302 return res; 2303 } 2304 EXPORT_SYMBOL(ib_device_get_netdev); 2305 2306 /** 2307 * ib_query_netdev_port - Query the port number of a net_device 2308 * associated with an ibdev 2309 * @ibdev: IB device 2310 * @ndev: Network device 2311 * @port: IB port the net_device is connected to 2312 */ 2313 int ib_query_netdev_port(struct ib_device *ibdev, struct net_device *ndev, 2314 u32 *port) 2315 { 2316 struct net_device *ib_ndev; 2317 u32 port_num; 2318 2319 rdma_for_each_port(ibdev, port_num) { 2320 ib_ndev = ib_device_get_netdev(ibdev, port_num); 2321 if (ndev == ib_ndev) { 2322 *port = port_num; 2323 dev_put(ib_ndev); 2324 return 0; 2325 } 2326 dev_put(ib_ndev); 2327 } 2328 2329 return -ENOENT; 2330 } 2331 EXPORT_SYMBOL(ib_query_netdev_port); 2332 2333 /** 2334 * ib_device_get_by_netdev - Find an IB device associated with a netdev 2335 * @ndev: netdev to locate 2336 * @driver_id: The driver ID that must match (RDMA_DRIVER_UNKNOWN matches all) 2337 * 2338 * Find and hold an ib_device that is associated with a netdev via 2339 * ib_device_set_netdev(). The caller must call ib_device_put() on the 2340 * returned pointer. 
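 *
 * Illustration only (not part of the original source): a lookup against a
 * hypothetical netdev "ndev" might look roughly like this:
 *
 *	struct ib_device *ibdev;
 *
 *	ibdev = ib_device_get_by_netdev(ndev, RDMA_DRIVER_UNKNOWN);
 *	if (ibdev) {
 *		... use ibdev while the reference is held ...
 *		ib_device_put(ibdev);
 *	}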
2341 */ 2342 struct ib_device *ib_device_get_by_netdev(struct net_device *ndev, 2343 enum rdma_driver_id driver_id) 2344 { 2345 struct ib_device *res = NULL; 2346 struct ib_port_data *cur; 2347 2348 rcu_read_lock(); 2349 hash_for_each_possible_rcu (ndev_hash, cur, ndev_hash_link, 2350 (uintptr_t)ndev) { 2351 if (rcu_access_pointer(cur->netdev) == ndev && 2352 (driver_id == RDMA_DRIVER_UNKNOWN || 2353 cur->ib_dev->ops.driver_id == driver_id) && 2354 ib_device_try_get(cur->ib_dev)) { 2355 res = cur->ib_dev; 2356 break; 2357 } 2358 } 2359 rcu_read_unlock(); 2360 2361 return res; 2362 } 2363 EXPORT_SYMBOL(ib_device_get_by_netdev); 2364 2365 /** 2366 * ib_enum_roce_netdev - enumerate all RoCE ports 2367 * @ib_dev: IB device we want to query 2368 * @filter: Should we call the callback? 2369 * @filter_cookie: Cookie passed to filter 2370 * @cb: Callback to call for each found RoCE port 2371 * @cookie: Cookie passed back to the callback 2372 * 2373 * Enumerates all of the physical RoCE ports of ib_dev 2374 * that are associated with a netdevice and calls the callback on each 2375 * port for which the filter() function returns non-zero. 2376 */ 2377 void ib_enum_roce_netdev(struct ib_device *ib_dev, 2378 roce_netdev_filter filter, 2379 void *filter_cookie, 2380 roce_netdev_callback cb, 2381 void *cookie) 2382 { 2383 u32 port; 2384 2385 rdma_for_each_port (ib_dev, port) 2386 if (rdma_protocol_roce(ib_dev, port)) { 2387 struct net_device *idev = 2388 ib_device_get_netdev(ib_dev, port); 2389 2390 if (filter(ib_dev, port, idev, filter_cookie)) 2391 cb(ib_dev, port, idev, cookie); 2392 dev_put(idev); 2393 } 2394 } 2395 2396 /** 2397 * ib_enum_all_roce_netdevs - enumerate all RoCE devices 2398 * @filter: Should we call the callback? 2399 * @filter_cookie: Cookie passed to filter 2400 * @cb: Callback to call for each found RoCE port 2401 * @cookie: Cookie passed back to the callback 2402 * 2403 * Enumerates all RoCE devices' physical ports that are associated 2404 * with a netdevice and calls the callback on each port for which 2405 * the filter() function returns non-zero. 2406 */ 2407 void ib_enum_all_roce_netdevs(roce_netdev_filter filter, 2408 void *filter_cookie, 2409 roce_netdev_callback cb, 2410 void *cookie) 2411 { 2412 struct ib_device *dev; 2413 unsigned long index; 2414 2415 down_read(&devices_rwsem); 2416 xa_for_each_marked (&devices, index, dev, DEVICE_REGISTERED) 2417 ib_enum_roce_netdev(dev, filter, filter_cookie, cb, cookie); 2418 up_read(&devices_rwsem); 2419 } 2420 2421 /* 2422 * ib_enum_all_devs - enumerate all ib_devices 2423 * @cb: Callback to call for each found ib_device 2424 * 2425 * Enumerates all ib_devices and calls callback() on each device. 2426 */ 2427 int ib_enum_all_devs(nldev_callback nldev_cb, struct sk_buff *skb, 2428 struct netlink_callback *cb) 2429 { 2430 unsigned long index; 2431 struct ib_device *dev; 2432 unsigned int idx = 0; 2433 int ret = 0; 2434 2435 down_read(&devices_rwsem); 2436 xa_for_each_marked (&devices, index, dev, DEVICE_REGISTERED) { 2437 if (!rdma_dev_access_netns(dev, sock_net(skb->sk))) 2438 continue; 2439 2440 ret = nldev_cb(dev, skb, cb, idx); 2441 if (ret) 2442 break; 2443 idx++; 2444 } 2445 up_read(&devices_rwsem); 2446 return ret; 2447 } 2448 2449 /** 2450 * ib_query_pkey - Get P_Key table entry 2451 * @device:Device to query 2452 * @port_num:Port number to query 2453 * @index:P_Key table index to query 2454 * @pkey:Returned P_Key 2455 * 2456 * ib_query_pkey() fetches the specified P_Key table entry.
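 *
 * Illustration only (not part of the original source): reading the first
 * P_Key table entry of a hypothetical "ibdev"/"port" pair:
 *
 *	u16 pkey;
 *
 *	if (!ib_query_pkey(ibdev, port, 0, &pkey))
 *		pr_info("P_Key[0] = 0x%04x\n", pkey);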
2457 */ 2458 int ib_query_pkey(struct ib_device *device, 2459 u32 port_num, u16 index, u16 *pkey) 2460 { 2461 if (!rdma_is_port_valid(device, port_num)) 2462 return -EINVAL; 2463 2464 if (!device->ops.query_pkey) 2465 return -EOPNOTSUPP; 2466 2467 return device->ops.query_pkey(device, port_num, index, pkey); 2468 } 2469 EXPORT_SYMBOL(ib_query_pkey); 2470 2471 /** 2472 * ib_modify_device - Change IB device attributes 2473 * @device:Device to modify 2474 * @device_modify_mask:Mask of attributes to change 2475 * @device_modify:New attribute values 2476 * 2477 * ib_modify_device() changes a device's attributes as specified by 2478 * the @device_modify_mask and @device_modify structure. 2479 */ 2480 int ib_modify_device(struct ib_device *device, 2481 int device_modify_mask, 2482 struct ib_device_modify *device_modify) 2483 { 2484 if (!device->ops.modify_device) 2485 return -EOPNOTSUPP; 2486 2487 return device->ops.modify_device(device, device_modify_mask, 2488 device_modify); 2489 } 2490 EXPORT_SYMBOL(ib_modify_device); 2491 2492 /** 2493 * ib_modify_port - Modifies the attributes for the specified port. 2494 * @device: The device to modify. 2495 * @port_num: The number of the port to modify. 2496 * @port_modify_mask: Mask used to specify which attributes of the port 2497 * to change. 2498 * @port_modify: New attribute values for the port. 2499 * 2500 * ib_modify_port() changes a port's attributes as specified by the 2501 * @port_modify_mask and @port_modify structure. 2502 */ 2503 int ib_modify_port(struct ib_device *device, 2504 u32 port_num, int port_modify_mask, 2505 struct ib_port_modify *port_modify) 2506 { 2507 int rc; 2508 2509 if (!rdma_is_port_valid(device, port_num)) 2510 return -EINVAL; 2511 2512 if (device->ops.modify_port) 2513 rc = device->ops.modify_port(device, port_num, 2514 port_modify_mask, 2515 port_modify); 2516 else if (rdma_protocol_roce(device, port_num) && 2517 ((port_modify->set_port_cap_mask & ~IB_PORT_CM_SUP) == 0 || 2518 (port_modify->clr_port_cap_mask & ~IB_PORT_CM_SUP) == 0)) 2519 rc = 0; 2520 else 2521 rc = -EOPNOTSUPP; 2522 return rc; 2523 } 2524 EXPORT_SYMBOL(ib_modify_port); 2525 2526 /** 2527 * ib_find_gid - Returns the port number and GID table index where 2528 * a specified GID value occurs. It searches only IB link-layer ports. 2529 * @device: The device to query. 2530 * @gid: The GID value to search for. 2531 * @port_num: The port number of the device where the GID value was found. 2532 * @index: The index into the GID table where the GID was found. This 2533 * parameter may be NULL. 2534 */ 2535 int ib_find_gid(struct ib_device *device, union ib_gid *gid, 2536 u32 *port_num, u16 *index) 2537 { 2538 union ib_gid tmp_gid; 2539 u32 port; 2540 int ret, i; 2541 2542 rdma_for_each_port (device, port) { 2543 if (!rdma_protocol_ib(device, port)) 2544 continue; 2545 2546 for (i = 0; i < device->port_data[port].immutable.gid_tbl_len; 2547 ++i) { 2548 ret = rdma_query_gid(device, port, i, &tmp_gid); 2549 if (ret) 2550 continue; 2551 2552 if (!memcmp(&tmp_gid, gid, sizeof *gid)) { 2553 *port_num = port; 2554 if (index) 2555 *index = i; 2556 return 0; 2557 } 2558 } 2559 } 2560 2561 return -ENOENT; 2562 } 2563 EXPORT_SYMBOL(ib_find_gid); 2564 2565 /** 2566 * ib_find_pkey - Returns the PKey table index where a specified 2567 * PKey value occurs. 2568 * @device: The device to query. 2569 * @port_num: The port number of the device to search for the PKey. 2570 * @pkey: The PKey value to search for. 2571 * @index: The index into the PKey table where the PKey was found.
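 *
 * Illustration only (not part of the original source): looking up the
 * default P_Key (0xffff) on a hypothetical "ibdev"/"port" pair:
 *
 *	u16 index;
 *
 *	if (!ib_find_pkey(ibdev, port, 0xffff, &index))
 *		pr_info("default P_Key found at index %u\n", index);
 *
 * Note that the comparison ignores the membership bit (bit 15) and that a
 * full-membership entry is preferred over a limited-membership one.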
2572 */ 2573 int ib_find_pkey(struct ib_device *device, 2574 u32 port_num, u16 pkey, u16 *index) 2575 { 2576 int ret, i; 2577 u16 tmp_pkey; 2578 int partial_ix = -1; 2579 2580 for (i = 0; i < device->port_data[port_num].immutable.pkey_tbl_len; 2581 ++i) { 2582 ret = ib_query_pkey(device, port_num, i, &tmp_pkey); 2583 if (ret) 2584 return ret; 2585 if ((pkey & 0x7fff) == (tmp_pkey & 0x7fff)) { 2586 /* if there is full-member pkey take it.*/ 2587 if (tmp_pkey & 0x8000) { 2588 *index = i; 2589 return 0; 2590 } 2591 if (partial_ix < 0) 2592 partial_ix = i; 2593 } 2594 } 2595 2596 /*no full-member, if exists take the limited*/ 2597 if (partial_ix >= 0) { 2598 *index = partial_ix; 2599 return 0; 2600 } 2601 return -ENOENT; 2602 } 2603 EXPORT_SYMBOL(ib_find_pkey); 2604 2605 /** 2606 * ib_get_net_dev_by_params() - Return the appropriate net_dev 2607 * for a received CM request 2608 * @dev: An RDMA device on which the request has been received. 2609 * @port: Port number on the RDMA device. 2610 * @pkey: The Pkey the request came on. 2611 * @gid: A GID that the net_dev uses to communicate. 2612 * @addr: Contains the IP address that the request specified as its 2613 * destination. 2614 * 2615 */ 2616 struct net_device *ib_get_net_dev_by_params(struct ib_device *dev, 2617 u32 port, 2618 u16 pkey, 2619 const union ib_gid *gid, 2620 const struct sockaddr *addr) 2621 { 2622 struct net_device *net_dev = NULL; 2623 unsigned long index; 2624 void *client_data; 2625 2626 if (!rdma_protocol_ib(dev, port)) 2627 return NULL; 2628 2629 /* 2630 * Holding the read side guarantees that the client will not become 2631 * unregistered while we are calling get_net_dev_by_params() 2632 */ 2633 down_read(&dev->client_data_rwsem); 2634 xan_for_each_marked (&dev->client_data, index, client_data, 2635 CLIENT_DATA_REGISTERED) { 2636 struct ib_client *client = xa_load(&clients, index); 2637 2638 if (!client || !client->get_net_dev_by_params) 2639 continue; 2640 2641 net_dev = client->get_net_dev_by_params(dev, port, pkey, gid, 2642 addr, client_data); 2643 if (net_dev) 2644 break; 2645 } 2646 up_read(&dev->client_data_rwsem); 2647 2648 return net_dev; 2649 } 2650 EXPORT_SYMBOL(ib_get_net_dev_by_params); 2651 2652 void ib_set_device_ops(struct ib_device *dev, const struct ib_device_ops *ops) 2653 { 2654 struct ib_device_ops *dev_ops = &dev->ops; 2655 #define SET_DEVICE_OP(ptr, name) \ 2656 do { \ 2657 if (ops->name) \ 2658 if (!((ptr)->name)) \ 2659 (ptr)->name = ops->name; \ 2660 } while (0) 2661 2662 #define SET_OBJ_SIZE(ptr, name) SET_DEVICE_OP(ptr, size_##name) 2663 2664 if (ops->driver_id != RDMA_DRIVER_UNKNOWN) { 2665 WARN_ON(dev_ops->driver_id != RDMA_DRIVER_UNKNOWN && 2666 dev_ops->driver_id != ops->driver_id); 2667 dev_ops->driver_id = ops->driver_id; 2668 } 2669 if (ops->owner) { 2670 WARN_ON(dev_ops->owner && dev_ops->owner != ops->owner); 2671 dev_ops->owner = ops->owner; 2672 } 2673 if (ops->uverbs_abi_ver) 2674 dev_ops->uverbs_abi_ver = ops->uverbs_abi_ver; 2675 2676 dev_ops->uverbs_no_driver_id_binding |= 2677 ops->uverbs_no_driver_id_binding; 2678 2679 SET_DEVICE_OP(dev_ops, add_gid); 2680 SET_DEVICE_OP(dev_ops, add_sub_dev); 2681 SET_DEVICE_OP(dev_ops, advise_mr); 2682 SET_DEVICE_OP(dev_ops, alloc_dm); 2683 SET_DEVICE_OP(dev_ops, alloc_dmah); 2684 SET_DEVICE_OP(dev_ops, alloc_hw_device_stats); 2685 SET_DEVICE_OP(dev_ops, alloc_hw_port_stats); 2686 SET_DEVICE_OP(dev_ops, alloc_mr); 2687 SET_DEVICE_OP(dev_ops, alloc_mr_integrity); 2688 SET_DEVICE_OP(dev_ops, alloc_mw); 2689 SET_DEVICE_OP(dev_ops, alloc_pd); 2690 
SET_DEVICE_OP(dev_ops, alloc_rdma_netdev); 2691 SET_DEVICE_OP(dev_ops, alloc_ucontext); 2692 SET_DEVICE_OP(dev_ops, alloc_xrcd); 2693 SET_DEVICE_OP(dev_ops, attach_mcast); 2694 SET_DEVICE_OP(dev_ops, check_mr_status); 2695 SET_DEVICE_OP(dev_ops, counter_alloc_stats); 2696 SET_DEVICE_OP(dev_ops, counter_bind_qp); 2697 SET_DEVICE_OP(dev_ops, counter_dealloc); 2698 SET_DEVICE_OP(dev_ops, counter_init); 2699 SET_DEVICE_OP(dev_ops, counter_unbind_qp); 2700 SET_DEVICE_OP(dev_ops, counter_update_stats); 2701 SET_DEVICE_OP(dev_ops, create_ah); 2702 SET_DEVICE_OP(dev_ops, create_counters); 2703 SET_DEVICE_OP(dev_ops, create_cq); 2704 SET_DEVICE_OP(dev_ops, create_cq_umem); 2705 SET_DEVICE_OP(dev_ops, create_flow); 2706 SET_DEVICE_OP(dev_ops, create_qp); 2707 SET_DEVICE_OP(dev_ops, create_rwq_ind_table); 2708 SET_DEVICE_OP(dev_ops, create_srq); 2709 SET_DEVICE_OP(dev_ops, create_user_ah); 2710 SET_DEVICE_OP(dev_ops, create_wq); 2711 SET_DEVICE_OP(dev_ops, dealloc_dm); 2712 SET_DEVICE_OP(dev_ops, dealloc_dmah); 2713 SET_DEVICE_OP(dev_ops, dealloc_driver); 2714 SET_DEVICE_OP(dev_ops, dealloc_mw); 2715 SET_DEVICE_OP(dev_ops, dealloc_pd); 2716 SET_DEVICE_OP(dev_ops, dealloc_ucontext); 2717 SET_DEVICE_OP(dev_ops, dealloc_xrcd); 2718 SET_DEVICE_OP(dev_ops, del_gid); 2719 SET_DEVICE_OP(dev_ops, del_sub_dev); 2720 SET_DEVICE_OP(dev_ops, dereg_mr); 2721 SET_DEVICE_OP(dev_ops, destroy_ah); 2722 SET_DEVICE_OP(dev_ops, destroy_counters); 2723 SET_DEVICE_OP(dev_ops, destroy_cq); 2724 SET_DEVICE_OP(dev_ops, destroy_flow); 2725 SET_DEVICE_OP(dev_ops, destroy_flow_action); 2726 SET_DEVICE_OP(dev_ops, destroy_qp); 2727 SET_DEVICE_OP(dev_ops, destroy_rwq_ind_table); 2728 SET_DEVICE_OP(dev_ops, destroy_srq); 2729 SET_DEVICE_OP(dev_ops, destroy_wq); 2730 SET_DEVICE_OP(dev_ops, device_group); 2731 SET_DEVICE_OP(dev_ops, detach_mcast); 2732 SET_DEVICE_OP(dev_ops, disassociate_ucontext); 2733 SET_DEVICE_OP(dev_ops, drain_rq); 2734 SET_DEVICE_OP(dev_ops, drain_sq); 2735 SET_DEVICE_OP(dev_ops, enable_driver); 2736 SET_DEVICE_OP(dev_ops, fill_res_cm_id_entry); 2737 SET_DEVICE_OP(dev_ops, fill_res_cq_entry); 2738 SET_DEVICE_OP(dev_ops, fill_res_cq_entry_raw); 2739 SET_DEVICE_OP(dev_ops, fill_res_mr_entry); 2740 SET_DEVICE_OP(dev_ops, fill_res_mr_entry_raw); 2741 SET_DEVICE_OP(dev_ops, fill_res_qp_entry); 2742 SET_DEVICE_OP(dev_ops, fill_res_qp_entry_raw); 2743 SET_DEVICE_OP(dev_ops, fill_res_srq_entry); 2744 SET_DEVICE_OP(dev_ops, fill_res_srq_entry_raw); 2745 SET_DEVICE_OP(dev_ops, fill_stat_mr_entry); 2746 SET_DEVICE_OP(dev_ops, get_dev_fw_str); 2747 SET_DEVICE_OP(dev_ops, get_dma_mr); 2748 SET_DEVICE_OP(dev_ops, get_hw_stats); 2749 SET_DEVICE_OP(dev_ops, get_link_layer); 2750 SET_DEVICE_OP(dev_ops, get_netdev); 2751 SET_DEVICE_OP(dev_ops, get_numa_node); 2752 SET_DEVICE_OP(dev_ops, get_port_immutable); 2753 SET_DEVICE_OP(dev_ops, get_vector_affinity); 2754 SET_DEVICE_OP(dev_ops, get_vf_config); 2755 SET_DEVICE_OP(dev_ops, get_vf_guid); 2756 SET_DEVICE_OP(dev_ops, get_vf_stats); 2757 SET_DEVICE_OP(dev_ops, iw_accept); 2758 SET_DEVICE_OP(dev_ops, iw_add_ref); 2759 SET_DEVICE_OP(dev_ops, iw_connect); 2760 SET_DEVICE_OP(dev_ops, iw_create_listen); 2761 SET_DEVICE_OP(dev_ops, iw_destroy_listen); 2762 SET_DEVICE_OP(dev_ops, iw_get_qp); 2763 SET_DEVICE_OP(dev_ops, iw_reject); 2764 SET_DEVICE_OP(dev_ops, iw_rem_ref); 2765 SET_DEVICE_OP(dev_ops, map_mr_sg); 2766 SET_DEVICE_OP(dev_ops, map_mr_sg_pi); 2767 SET_DEVICE_OP(dev_ops, mmap); 2768 SET_DEVICE_OP(dev_ops, mmap_get_pfns); 2769 SET_DEVICE_OP(dev_ops, mmap_free); 2770 
SET_DEVICE_OP(dev_ops, modify_ah); 2771 SET_DEVICE_OP(dev_ops, modify_cq); 2772 SET_DEVICE_OP(dev_ops, modify_device); 2773 SET_DEVICE_OP(dev_ops, modify_hw_stat); 2774 SET_DEVICE_OP(dev_ops, modify_port); 2775 SET_DEVICE_OP(dev_ops, modify_qp); 2776 SET_DEVICE_OP(dev_ops, modify_srq); 2777 SET_DEVICE_OP(dev_ops, modify_wq); 2778 SET_DEVICE_OP(dev_ops, peek_cq); 2779 SET_DEVICE_OP(dev_ops, pgoff_to_mmap_entry); 2780 SET_DEVICE_OP(dev_ops, pre_destroy_cq); 2781 SET_DEVICE_OP(dev_ops, poll_cq); 2782 SET_DEVICE_OP(dev_ops, port_groups); 2783 SET_DEVICE_OP(dev_ops, post_destroy_cq); 2784 SET_DEVICE_OP(dev_ops, post_recv); 2785 SET_DEVICE_OP(dev_ops, post_send); 2786 SET_DEVICE_OP(dev_ops, post_srq_recv); 2787 SET_DEVICE_OP(dev_ops, process_mad); 2788 SET_DEVICE_OP(dev_ops, query_ah); 2789 SET_DEVICE_OP(dev_ops, query_device); 2790 SET_DEVICE_OP(dev_ops, query_gid); 2791 SET_DEVICE_OP(dev_ops, query_pkey); 2792 SET_DEVICE_OP(dev_ops, query_port); 2793 SET_DEVICE_OP(dev_ops, query_port_speed); 2794 SET_DEVICE_OP(dev_ops, query_qp); 2795 SET_DEVICE_OP(dev_ops, query_srq); 2796 SET_DEVICE_OP(dev_ops, query_ucontext); 2797 SET_DEVICE_OP(dev_ops, rdma_netdev_get_params); 2798 SET_DEVICE_OP(dev_ops, read_counters); 2799 SET_DEVICE_OP(dev_ops, reg_dm_mr); 2800 SET_DEVICE_OP(dev_ops, reg_user_mr); 2801 SET_DEVICE_OP(dev_ops, reg_user_mr_dmabuf); 2802 SET_DEVICE_OP(dev_ops, req_notify_cq); 2803 SET_DEVICE_OP(dev_ops, rereg_user_mr); 2804 SET_DEVICE_OP(dev_ops, resize_cq); 2805 SET_DEVICE_OP(dev_ops, set_vf_guid); 2806 SET_DEVICE_OP(dev_ops, set_vf_link_state); 2807 SET_DEVICE_OP(dev_ops, ufile_hw_cleanup); 2808 SET_DEVICE_OP(dev_ops, report_port_event); 2809 2810 SET_OBJ_SIZE(dev_ops, ib_ah); 2811 SET_OBJ_SIZE(dev_ops, ib_counters); 2812 SET_OBJ_SIZE(dev_ops, ib_cq); 2813 SET_OBJ_SIZE(dev_ops, ib_dmah); 2814 SET_OBJ_SIZE(dev_ops, ib_mw); 2815 SET_OBJ_SIZE(dev_ops, ib_pd); 2816 SET_OBJ_SIZE(dev_ops, ib_qp); 2817 SET_OBJ_SIZE(dev_ops, ib_rwq_ind_table); 2818 SET_OBJ_SIZE(dev_ops, ib_srq); 2819 SET_OBJ_SIZE(dev_ops, ib_ucontext); 2820 SET_OBJ_SIZE(dev_ops, ib_xrcd); 2821 SET_OBJ_SIZE(dev_ops, rdma_counter); 2822 } 2823 EXPORT_SYMBOL(ib_set_device_ops); 2824 2825 int ib_add_sub_device(struct ib_device *parent, 2826 enum rdma_nl_dev_type type, 2827 const char *name) 2828 { 2829 struct ib_device *sub; 2830 int ret = 0; 2831 2832 if (!parent->ops.add_sub_dev || !parent->ops.del_sub_dev) 2833 return -EOPNOTSUPP; 2834 2835 if (!ib_device_try_get(parent)) 2836 return -EINVAL; 2837 2838 sub = parent->ops.add_sub_dev(parent, type, name); 2839 if (IS_ERR(sub)) { 2840 ib_device_put(parent); 2841 return PTR_ERR(sub); 2842 } 2843 2844 sub->type = type; 2845 sub->parent = parent; 2846 2847 mutex_lock(&parent->subdev_lock); 2848 list_add_tail(&parent->subdev_list_head, &sub->subdev_list); 2849 mutex_unlock(&parent->subdev_lock); 2850 2851 return ret; 2852 } 2853 2854 int ib_del_sub_device_and_put(struct ib_device *sub) 2855 { 2856 struct ib_device *parent = sub->parent; 2857 2858 if (!parent) { 2859 ib_device_put(sub); 2860 return -EOPNOTSUPP; 2861 } 2862 2863 mutex_lock(&parent->subdev_lock); 2864 list_del(&sub->subdev_list); 2865 mutex_unlock(&parent->subdev_lock); 2866 2867 ib_device_put(sub); 2868 parent->ops.del_sub_dev(sub); 2869 ib_device_put(parent); 2870 2871 return 0; 2872 } 2873 2874 #ifdef CONFIG_INFINIBAND_VIRT_DMA 2875 int ib_dma_virt_map_sg(struct ib_device *dev, struct scatterlist *sg, int nents) 2876 { 2877 struct scatterlist *s; 2878 int i; 2879 2880 for_each_sg(sg, s, nents, i) { 2881 sg_dma_address(s) 
= (uintptr_t)sg_virt(s); 2882 sg_dma_len(s) = s->length; 2883 } 2884 return nents; 2885 } 2886 EXPORT_SYMBOL(ib_dma_virt_map_sg); 2887 #endif /* CONFIG_INFINIBAND_VIRT_DMA */ 2888 2889 static const struct rdma_nl_cbs ibnl_ls_cb_table[RDMA_NL_LS_NUM_OPS] = { 2890 [RDMA_NL_LS_OP_RESOLVE] = { 2891 .doit = ib_nl_handle_resolve_resp, 2892 .flags = RDMA_NL_ADMIN_PERM, 2893 }, 2894 [RDMA_NL_LS_OP_SET_TIMEOUT] = { 2895 .doit = ib_nl_handle_set_timeout, 2896 .flags = RDMA_NL_ADMIN_PERM, 2897 }, 2898 [RDMA_NL_LS_OP_IP_RESOLVE] = { 2899 .doit = ib_nl_handle_ip_res_resp, 2900 .flags = RDMA_NL_ADMIN_PERM, 2901 }, 2902 }; 2903 2904 void ib_dispatch_port_state_event(struct ib_device *ibdev, struct net_device *ndev) 2905 { 2906 enum ib_port_state curr_state; 2907 struct ib_event ibevent = {}; 2908 u32 port; 2909 2910 if (ib_query_netdev_port(ibdev, ndev, &port)) 2911 return; 2912 2913 curr_state = ib_get_curr_port_state(ndev); 2914 2915 write_lock_irq(&ibdev->cache_lock); 2916 if (ibdev->port_data[port].cache.last_port_state == curr_state) { 2917 write_unlock_irq(&ibdev->cache_lock); 2918 return; 2919 } 2920 ibdev->port_data[port].cache.last_port_state = curr_state; 2921 write_unlock_irq(&ibdev->cache_lock); 2922 2923 ibevent.event = (curr_state == IB_PORT_DOWN) ? 2924 IB_EVENT_PORT_ERR : IB_EVENT_PORT_ACTIVE; 2925 ibevent.device = ibdev; 2926 ibevent.element.port_num = port; 2927 ib_dispatch_event(&ibevent); 2928 } 2929 EXPORT_SYMBOL(ib_dispatch_port_state_event); 2930 2931 static void handle_port_event(struct net_device *ndev, unsigned long event) 2932 { 2933 struct ib_device *ibdev; 2934 2935 /* Currently, link events in bonding scenarios are still 2936 * reported by drivers that support bonding. 2937 */ 2938 if (netif_is_lag_master(ndev) || netif_is_lag_port(ndev)) 2939 return; 2940 2941 ibdev = ib_device_get_by_netdev(ndev, RDMA_DRIVER_UNKNOWN); 2942 if (!ibdev) 2943 return; 2944 2945 if (ibdev->ops.report_port_event) { 2946 ibdev->ops.report_port_event(ibdev, ndev, event); 2947 goto put_ibdev; 2948 } 2949 2950 ib_dispatch_port_state_event(ibdev, ndev); 2951 2952 put_ibdev: 2953 ib_device_put(ibdev); 2954 }; 2955 2956 static int ib_netdevice_event(struct notifier_block *this, 2957 unsigned long event, void *ptr) 2958 { 2959 struct net_device *ndev = netdev_notifier_info_to_dev(ptr); 2960 struct ib_device *ibdev; 2961 u32 port; 2962 2963 switch (event) { 2964 case NETDEV_CHANGENAME: 2965 ibdev = ib_device_get_by_netdev(ndev, RDMA_DRIVER_UNKNOWN); 2966 if (!ibdev) 2967 return NOTIFY_DONE; 2968 2969 if (ib_query_netdev_port(ibdev, ndev, &port)) { 2970 ib_device_put(ibdev); 2971 break; 2972 } 2973 2974 rdma_nl_notify_event(ibdev, port, RDMA_NETDEV_RENAME_EVENT); 2975 ib_device_put(ibdev); 2976 break; 2977 2978 case NETDEV_UP: 2979 case NETDEV_CHANGE: 2980 case NETDEV_DOWN: 2981 handle_port_event(ndev, event); 2982 break; 2983 2984 default: 2985 break; 2986 } 2987 2988 return NOTIFY_DONE; 2989 } 2990 2991 static struct notifier_block nb_netdevice = { 2992 .notifier_call = ib_netdevice_event, 2993 }; 2994 2995 static int __init ib_core_init(void) 2996 { 2997 int ret = -ENOMEM; 2998 2999 ib_wq = alloc_workqueue("infiniband", WQ_PERCPU, 0); 3000 if (!ib_wq) 3001 return -ENOMEM; 3002 3003 ib_unreg_wq = alloc_workqueue("ib-unreg-wq", WQ_UNBOUND, 3004 WQ_UNBOUND_MAX_ACTIVE); 3005 if (!ib_unreg_wq) 3006 goto err; 3007 3008 ib_comp_wq = alloc_workqueue("ib-comp-wq", 3009 WQ_HIGHPRI | WQ_MEM_RECLAIM | WQ_SYSFS | WQ_PERCPU, 0); 3010 if (!ib_comp_wq) 3011 goto err_unbound; 3012 3013 ib_comp_unbound_wq = 3014 
alloc_workqueue("ib-comp-unb-wq", 3015 WQ_UNBOUND | WQ_HIGHPRI | WQ_MEM_RECLAIM | 3016 WQ_SYSFS, WQ_UNBOUND_MAX_ACTIVE); 3017 if (!ib_comp_unbound_wq) 3018 goto err_comp; 3019 3020 ret = class_register(&ib_class); 3021 if (ret) { 3022 pr_warn("Couldn't create InfiniBand device class\n"); 3023 goto err_comp_unbound; 3024 } 3025 3026 rdma_nl_init(); 3027 3028 ret = addr_init(); 3029 if (ret) { 3030 pr_warn("Couldn't init IB address resolution\n"); 3031 goto err_ibnl; 3032 } 3033 3034 ret = ib_mad_init(); 3035 if (ret) { 3036 pr_warn("Couldn't init IB MAD\n"); 3037 goto err_addr; 3038 } 3039 3040 ret = ib_sa_init(); 3041 if (ret) { 3042 pr_warn("Couldn't init SA\n"); 3043 goto err_mad; 3044 } 3045 3046 ret = register_blocking_lsm_notifier(&ibdev_lsm_nb); 3047 if (ret) { 3048 pr_warn("Couldn't register LSM notifier. ret %d\n", ret); 3049 goto err_sa; 3050 } 3051 3052 ret = register_pernet_device(&rdma_dev_net_ops); 3053 if (ret) { 3054 pr_warn("Couldn't init compat dev. ret %d\n", ret); 3055 goto err_compat; 3056 } 3057 3058 nldev_init(); 3059 rdma_nl_register(RDMA_NL_LS, ibnl_ls_cb_table); 3060 ret = roce_gid_mgmt_init(); 3061 if (ret) { 3062 pr_warn("Couldn't init RoCE GID management\n"); 3063 goto err_parent; 3064 } 3065 3066 register_netdevice_notifier(&nb_netdevice); 3067 3068 return 0; 3069 3070 err_parent: 3071 rdma_nl_unregister(RDMA_NL_LS); 3072 nldev_exit(); 3073 unregister_pernet_device(&rdma_dev_net_ops); 3074 err_compat: 3075 unregister_blocking_lsm_notifier(&ibdev_lsm_nb); 3076 err_sa: 3077 ib_sa_cleanup(); 3078 err_mad: 3079 ib_mad_cleanup(); 3080 err_addr: 3081 addr_cleanup(); 3082 err_ibnl: 3083 class_unregister(&ib_class); 3084 err_comp_unbound: 3085 destroy_workqueue(ib_comp_unbound_wq); 3086 err_comp: 3087 destroy_workqueue(ib_comp_wq); 3088 err_unbound: 3089 destroy_workqueue(ib_unreg_wq); 3090 err: 3091 destroy_workqueue(ib_wq); 3092 return ret; 3093 } 3094 3095 static void __exit ib_core_cleanup(void) 3096 { 3097 unregister_netdevice_notifier(&nb_netdevice); 3098 roce_gid_mgmt_cleanup(); 3099 rdma_nl_unregister(RDMA_NL_LS); 3100 nldev_exit(); 3101 unregister_pernet_device(&rdma_dev_net_ops); 3102 unregister_blocking_lsm_notifier(&ibdev_lsm_nb); 3103 ib_sa_cleanup(); 3104 ib_mad_cleanup(); 3105 addr_cleanup(); 3106 rdma_nl_exit(); 3107 class_unregister(&ib_class); 3108 destroy_workqueue(ib_comp_unbound_wq); 3109 destroy_workqueue(ib_comp_wq); 3110 /* Make sure that any pending umem accounting work is done. */ 3111 destroy_workqueue(ib_wq); 3112 destroy_workqueue(ib_unreg_wq); 3113 WARN_ON(!xa_empty(&clients)); 3114 WARN_ON(!xa_empty(&devices)); 3115 } 3116 3117 MODULE_ALIAS_RDMA_NETLINK(RDMA_NL_LS, 4); 3118 3119 /* ib core relies on netdev stack to first register net_ns_type_operations 3120 * ns kobject type before ib_core initialization. 3121 */ 3122 fs_initcall(ib_core_init); 3123 module_exit(ib_core_cleanup); 3124