1 /* 2 * Copyright (c) 2004 Topspin Communications. All rights reserved. 3 * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved. 4 * 5 * This software is available to you under a choice of one of two 6 * licenses. You may choose to be licensed under the terms of the GNU 7 * General Public License (GPL) Version 2, available from the file 8 * COPYING in the main directory of this source tree, or the 9 * OpenIB.org BSD license below: 10 * 11 * Redistribution and use in source and binary forms, with or 12 * without modification, are permitted provided that the following 13 * conditions are met: 14 * 15 * - Redistributions of source code must retain the above 16 * copyright notice, this list of conditions and the following 17 * disclaimer. 18 * 19 * - Redistributions in binary form must reproduce the above 20 * copyright notice, this list of conditions and the following 21 * disclaimer in the documentation and/or other materials 22 * provided with the distribution. 23 * 24 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 25 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 26 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 27 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 28 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN 29 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 30 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 31 * SOFTWARE. 32 */ 33 34 #include <linux/module.h> 35 #include <linux/string.h> 36 #include <linux/errno.h> 37 #include <linux/kernel.h> 38 #include <linux/slab.h> 39 #include <linux/init.h> 40 #include <linux/netdevice.h> 41 #include <net/net_namespace.h> 42 #include <linux/security.h> 43 #include <linux/notifier.h> 44 #include <linux/hashtable.h> 45 #include <rdma/rdma_netlink.h> 46 #include <rdma/ib_addr.h> 47 #include <rdma/ib_cache.h> 48 #include <rdma/rdma_counter.h> 49 50 #include "core_priv.h" 51 #include "restrack.h" 52 53 MODULE_AUTHOR("Roland Dreier"); 54 MODULE_DESCRIPTION("core kernel InfiniBand API"); 55 MODULE_LICENSE("Dual BSD/GPL"); 56 57 struct workqueue_struct *ib_comp_wq; 58 struct workqueue_struct *ib_comp_unbound_wq; 59 struct workqueue_struct *ib_wq; 60 EXPORT_SYMBOL_GPL(ib_wq); 61 static struct workqueue_struct *ib_unreg_wq; 62 63 /* 64 * Each of the three rwsem locks (devices, clients, client_data) protects the 65 * xarray of the same name. Specifically it allows the caller to assert that 66 * the MARK will/will not be changing under the lock, and for devices and 67 * clients, that the value in the xarray is still a valid pointer. Change of 68 * the MARK is linked to the object state, so holding the lock and testing the 69 * MARK also asserts that the contained object is in a certain state. 70 * 71 * This is used to build a two stage register/unregister flow where objects 72 * can continue to be in the xarray even though they are still in progress to 73 * register/unregister. 74 * 75 * The xarray itself provides additional locking, and restartable iteration, 76 * which is also relied on. 77 * 78 * Locks should not be nested, with the exception of client_data, which is 79 * allowed to nest under the read side of the other two locks. 80 * 81 * The devices_rwsem also protects the device name list, any change or 82 * assignment of device name must also hold the write side to guarantee unique 83 * names. 84 */ 85 86 /* 87 * devices contains devices that have had their names assigned. 
 * The devices may not be registered. Users that care about the registration
 * status need to call ib_device_try_get() on the device to ensure it is
 * registered, and keep it registered, for the required duration.
 *
 */
static DEFINE_XARRAY_FLAGS(devices, XA_FLAGS_ALLOC);
static DECLARE_RWSEM(devices_rwsem);
#define DEVICE_REGISTERED XA_MARK_1

static u32 highest_client_id;
#define CLIENT_REGISTERED XA_MARK_1
static DEFINE_XARRAY_FLAGS(clients, XA_FLAGS_ALLOC);
static DECLARE_RWSEM(clients_rwsem);

static void ib_client_put(struct ib_client *client)
{
        if (refcount_dec_and_test(&client->uses))
                complete(&client->uses_zero);
}

/*
 * If client_data is registered then the corresponding client must also still
 * be registered.
 */
#define CLIENT_DATA_REGISTERED XA_MARK_1

unsigned int rdma_dev_net_id;

/*
 * A list of net namespaces is maintained in an xarray. This is necessary
 * because we can't get the locking right using the existing net ns list. We
 * would require an init_net callback after the list is updated.
 */
static DEFINE_XARRAY_FLAGS(rdma_nets, XA_FLAGS_ALLOC);
/*
 * rwsem to protect accessing the rdma_nets xarray entries.
 */
static DECLARE_RWSEM(rdma_nets_rwsem);

bool ib_devices_shared_netns = true;
module_param_named(netns_mode, ib_devices_shared_netns, bool, 0444);
MODULE_PARM_DESC(netns_mode,
                 "Share device among net namespaces; default=1 (shared)");
/**
 * rdma_dev_access_netns() - Return whether an rdma device can be accessed
 *                           from a specified net namespace or not.
 * @dev: Pointer to rdma device which needs to be checked
 * @net: Pointer to net namespace for which access is to be checked
 *
 * When the rdma device is in shared mode, it ignores the net namespace.
 * When the rdma device is exclusive to a net namespace, rdma device net
 * namespace is checked against the specified one.
 */
bool rdma_dev_access_netns(const struct ib_device *dev, const struct net *net)
{
        return (ib_devices_shared_netns ||
                net_eq(read_pnet(&dev->coredev.rdma_net), net));
}
EXPORT_SYMBOL(rdma_dev_access_netns);

/*
 * xarray has this behavior where it won't iterate over NULL values stored in
 * allocated arrays. So we need our own iterator to see all values stored in
 * the array. This does the same thing as xa_for_each except that it also
 * returns NULL valued entries if the array is allocating. Simplified to only
 * work on simple xarrays.
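 *
 * A minimal usage sketch, mirroring how client_data is walked elsewhere in
 * this file (note that "entry" may legitimately be NULL, since a registered
 * client is allowed to store NULL as its data):
 *
 *      unsigned long index;
 *      void *entry;
 *
 *      xan_for_each_marked(&device->client_data, index, entry,
 *                          CLIENT_DATA_REGISTERED) {
 *              (entry is the data stored for the client with id "index")
 *      }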
154 */ 155 static void *xan_find_marked(struct xarray *xa, unsigned long *indexp, 156 xa_mark_t filter) 157 { 158 XA_STATE(xas, xa, *indexp); 159 void *entry; 160 161 rcu_read_lock(); 162 do { 163 entry = xas_find_marked(&xas, ULONG_MAX, filter); 164 if (xa_is_zero(entry)) 165 break; 166 } while (xas_retry(&xas, entry)); 167 rcu_read_unlock(); 168 169 if (entry) { 170 *indexp = xas.xa_index; 171 if (xa_is_zero(entry)) 172 return NULL; 173 return entry; 174 } 175 return XA_ERROR(-ENOENT); 176 } 177 #define xan_for_each_marked(xa, index, entry, filter) \ 178 for (index = 0, entry = xan_find_marked(xa, &(index), filter); \ 179 !xa_is_err(entry); \ 180 (index)++, entry = xan_find_marked(xa, &(index), filter)) 181 182 /* RCU hash table mapping netdevice pointers to struct ib_port_data */ 183 static DEFINE_SPINLOCK(ndev_hash_lock); 184 static DECLARE_HASHTABLE(ndev_hash, 5); 185 186 static void free_netdevs(struct ib_device *ib_dev); 187 static void ib_unregister_work(struct work_struct *work); 188 static void __ib_unregister_device(struct ib_device *device); 189 static int ib_security_change(struct notifier_block *nb, unsigned long event, 190 void *lsm_data); 191 static void ib_policy_change_task(struct work_struct *work); 192 static DECLARE_WORK(ib_policy_change_work, ib_policy_change_task); 193 194 static void __ibdev_printk(const char *level, const struct ib_device *ibdev, 195 struct va_format *vaf) 196 { 197 if (ibdev && ibdev->dev.parent) 198 dev_printk_emit(level[1] - '0', 199 ibdev->dev.parent, 200 "%s %s %s: %pV", 201 dev_driver_string(ibdev->dev.parent), 202 dev_name(ibdev->dev.parent), 203 dev_name(&ibdev->dev), 204 vaf); 205 else if (ibdev) 206 printk("%s%s: %pV", 207 level, dev_name(&ibdev->dev), vaf); 208 else 209 printk("%s(NULL ib_device): %pV", level, vaf); 210 } 211 212 void ibdev_printk(const char *level, const struct ib_device *ibdev, 213 const char *format, ...) 214 { 215 struct va_format vaf; 216 va_list args; 217 218 va_start(args, format); 219 220 vaf.fmt = format; 221 vaf.va = &args; 222 223 __ibdev_printk(level, ibdev, &vaf); 224 225 va_end(args); 226 } 227 EXPORT_SYMBOL(ibdev_printk); 228 229 #define define_ibdev_printk_level(func, level) \ 230 void func(const struct ib_device *ibdev, const char *fmt, ...) 
\ 231 { \ 232 struct va_format vaf; \ 233 va_list args; \ 234 \ 235 va_start(args, fmt); \ 236 \ 237 vaf.fmt = fmt; \ 238 vaf.va = &args; \ 239 \ 240 __ibdev_printk(level, ibdev, &vaf); \ 241 \ 242 va_end(args); \ 243 } \ 244 EXPORT_SYMBOL(func); 245 246 define_ibdev_printk_level(ibdev_emerg, KERN_EMERG); 247 define_ibdev_printk_level(ibdev_alert, KERN_ALERT); 248 define_ibdev_printk_level(ibdev_crit, KERN_CRIT); 249 define_ibdev_printk_level(ibdev_err, KERN_ERR); 250 define_ibdev_printk_level(ibdev_warn, KERN_WARNING); 251 define_ibdev_printk_level(ibdev_notice, KERN_NOTICE); 252 define_ibdev_printk_level(ibdev_info, KERN_INFO); 253 254 static struct notifier_block ibdev_lsm_nb = { 255 .notifier_call = ib_security_change, 256 }; 257 258 static int rdma_dev_change_netns(struct ib_device *device, struct net *cur_net, 259 struct net *net); 260 261 /* Pointer to the RCU head at the start of the ib_port_data array */ 262 struct ib_port_data_rcu { 263 struct rcu_head rcu_head; 264 struct ib_port_data pdata[]; 265 }; 266 267 static void ib_device_check_mandatory(struct ib_device *device) 268 { 269 #define IB_MANDATORY_FUNC(x) { offsetof(struct ib_device_ops, x), #x } 270 static const struct { 271 size_t offset; 272 char *name; 273 } mandatory_table[] = { 274 IB_MANDATORY_FUNC(query_device), 275 IB_MANDATORY_FUNC(query_port), 276 IB_MANDATORY_FUNC(alloc_pd), 277 IB_MANDATORY_FUNC(dealloc_pd), 278 IB_MANDATORY_FUNC(create_qp), 279 IB_MANDATORY_FUNC(modify_qp), 280 IB_MANDATORY_FUNC(destroy_qp), 281 IB_MANDATORY_FUNC(post_send), 282 IB_MANDATORY_FUNC(post_recv), 283 IB_MANDATORY_FUNC(create_cq), 284 IB_MANDATORY_FUNC(destroy_cq), 285 IB_MANDATORY_FUNC(poll_cq), 286 IB_MANDATORY_FUNC(req_notify_cq), 287 IB_MANDATORY_FUNC(get_dma_mr), 288 IB_MANDATORY_FUNC(reg_user_mr), 289 IB_MANDATORY_FUNC(dereg_mr), 290 IB_MANDATORY_FUNC(get_port_immutable) 291 }; 292 int i; 293 294 device->kverbs_provider = true; 295 for (i = 0; i < ARRAY_SIZE(mandatory_table); ++i) { 296 if (!*(void **) ((void *) &device->ops + 297 mandatory_table[i].offset)) { 298 device->kverbs_provider = false; 299 break; 300 } 301 } 302 } 303 304 /* 305 * Caller must perform ib_device_put() to return the device reference count 306 * when ib_device_get_by_index() returns valid device pointer. 307 */ 308 struct ib_device *ib_device_get_by_index(const struct net *net, u32 index) 309 { 310 struct ib_device *device; 311 312 down_read(&devices_rwsem); 313 device = xa_load(&devices, index); 314 if (device) { 315 if (!rdma_dev_access_netns(device, net)) { 316 device = NULL; 317 goto out; 318 } 319 320 if (!ib_device_try_get(device)) 321 device = NULL; 322 } 323 out: 324 up_read(&devices_rwsem); 325 return device; 326 } 327 328 /** 329 * ib_device_put - Release IB device reference 330 * @device: device whose reference to be released 331 * 332 * ib_device_put() releases reference to the IB device to allow it to be 333 * unregistered and eventually free. 
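 *
 * A typical pairing with ib_device_get_by_index() or ib_device_get_by_name()
 * looks like this (sketch):
 *
 *      device = ib_device_get_by_index(net, index);
 *      if (device) {
 *              (... use the device ...)
 *              ib_device_put(device);
 *      }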
334 */ 335 void ib_device_put(struct ib_device *device) 336 { 337 if (refcount_dec_and_test(&device->refcount)) 338 complete(&device->unreg_completion); 339 } 340 EXPORT_SYMBOL(ib_device_put); 341 342 static struct ib_device *__ib_device_get_by_name(const char *name) 343 { 344 struct ib_device *device; 345 unsigned long index; 346 347 xa_for_each (&devices, index, device) 348 if (!strcmp(name, dev_name(&device->dev))) 349 return device; 350 351 return NULL; 352 } 353 354 /** 355 * ib_device_get_by_name - Find an IB device by name 356 * @name: The name to look for 357 * @driver_id: The driver ID that must match (RDMA_DRIVER_UNKNOWN matches all) 358 * 359 * Find and hold an ib_device by its name. The caller must call 360 * ib_device_put() on the returned pointer. 361 */ 362 struct ib_device *ib_device_get_by_name(const char *name, 363 enum rdma_driver_id driver_id) 364 { 365 struct ib_device *device; 366 367 down_read(&devices_rwsem); 368 device = __ib_device_get_by_name(name); 369 if (device && driver_id != RDMA_DRIVER_UNKNOWN && 370 device->ops.driver_id != driver_id) 371 device = NULL; 372 373 if (device) { 374 if (!ib_device_try_get(device)) 375 device = NULL; 376 } 377 up_read(&devices_rwsem); 378 return device; 379 } 380 EXPORT_SYMBOL(ib_device_get_by_name); 381 382 static int rename_compat_devs(struct ib_device *device) 383 { 384 struct ib_core_device *cdev; 385 unsigned long index; 386 int ret = 0; 387 388 mutex_lock(&device->compat_devs_mutex); 389 xa_for_each (&device->compat_devs, index, cdev) { 390 ret = device_rename(&cdev->dev, dev_name(&device->dev)); 391 if (ret) { 392 dev_warn(&cdev->dev, 393 "Fail to rename compatdev to new name %s\n", 394 dev_name(&device->dev)); 395 break; 396 } 397 } 398 mutex_unlock(&device->compat_devs_mutex); 399 return ret; 400 } 401 402 int ib_device_rename(struct ib_device *ibdev, const char *name) 403 { 404 unsigned long index; 405 void *client_data; 406 int ret; 407 408 down_write(&devices_rwsem); 409 if (!strcmp(name, dev_name(&ibdev->dev))) { 410 up_write(&devices_rwsem); 411 return 0; 412 } 413 414 if (__ib_device_get_by_name(name)) { 415 up_write(&devices_rwsem); 416 return -EEXIST; 417 } 418 419 ret = device_rename(&ibdev->dev, name); 420 if (ret) { 421 up_write(&devices_rwsem); 422 return ret; 423 } 424 425 strscpy(ibdev->name, name, IB_DEVICE_NAME_MAX); 426 ret = rename_compat_devs(ibdev); 427 428 downgrade_write(&devices_rwsem); 429 down_read(&ibdev->client_data_rwsem); 430 xan_for_each_marked(&ibdev->client_data, index, client_data, 431 CLIENT_DATA_REGISTERED) { 432 struct ib_client *client = xa_load(&clients, index); 433 434 if (!client || !client->rename) 435 continue; 436 437 client->rename(ibdev, client_data); 438 } 439 up_read(&ibdev->client_data_rwsem); 440 up_read(&devices_rwsem); 441 return 0; 442 } 443 444 int ib_device_set_dim(struct ib_device *ibdev, u8 use_dim) 445 { 446 if (use_dim > 1) 447 return -EINVAL; 448 ibdev->use_cq_dim = use_dim; 449 450 return 0; 451 } 452 453 static int alloc_name(struct ib_device *ibdev, const char *name) 454 { 455 struct ib_device *device; 456 unsigned long index; 457 struct ida inuse; 458 int rc; 459 int i; 460 461 lockdep_assert_held_write(&devices_rwsem); 462 ida_init(&inuse); 463 xa_for_each (&devices, index, device) { 464 char buf[IB_DEVICE_NAME_MAX]; 465 466 if (sscanf(dev_name(&device->dev), name, &i) != 1) 467 continue; 468 if (i < 0 || i >= INT_MAX) 469 continue; 470 snprintf(buf, sizeof buf, name, i); 471 if (strcmp(buf, dev_name(&device->dev)) != 0) 472 continue; 473 474 rc = 
ida_alloc_range(&inuse, i, i, GFP_KERNEL); 475 if (rc < 0) 476 goto out; 477 } 478 479 rc = ida_alloc(&inuse, GFP_KERNEL); 480 if (rc < 0) 481 goto out; 482 483 rc = dev_set_name(&ibdev->dev, name, rc); 484 out: 485 ida_destroy(&inuse); 486 return rc; 487 } 488 489 static void ib_device_release(struct device *device) 490 { 491 struct ib_device *dev = container_of(device, struct ib_device, dev); 492 493 free_netdevs(dev); 494 WARN_ON(refcount_read(&dev->refcount)); 495 if (dev->hw_stats_data) 496 ib_device_release_hw_stats(dev->hw_stats_data); 497 if (dev->port_data) { 498 ib_cache_release_one(dev); 499 ib_security_release_port_pkey_list(dev); 500 rdma_counter_release(dev); 501 kfree_rcu(container_of(dev->port_data, struct ib_port_data_rcu, 502 pdata[0]), 503 rcu_head); 504 } 505 506 mutex_destroy(&dev->subdev_lock); 507 mutex_destroy(&dev->unregistration_lock); 508 mutex_destroy(&dev->compat_devs_mutex); 509 510 xa_destroy(&dev->compat_devs); 511 xa_destroy(&dev->client_data); 512 kfree_rcu(dev, rcu_head); 513 } 514 515 static int ib_device_uevent(const struct device *device, 516 struct kobj_uevent_env *env) 517 { 518 if (add_uevent_var(env, "NAME=%s", dev_name(device))) 519 return -ENOMEM; 520 521 /* 522 * It would be nice to pass the node GUID with the event... 523 */ 524 525 return 0; 526 } 527 528 static const void *net_namespace(const struct device *d) 529 { 530 const struct ib_core_device *coredev = 531 container_of(d, struct ib_core_device, dev); 532 533 return read_pnet(&coredev->rdma_net); 534 } 535 536 static struct class ib_class = { 537 .name = "infiniband", 538 .dev_release = ib_device_release, 539 .dev_uevent = ib_device_uevent, 540 .ns_type = &net_ns_type_operations, 541 .namespace = net_namespace, 542 }; 543 544 static void rdma_init_coredev(struct ib_core_device *coredev, 545 struct ib_device *dev, struct net *net) 546 { 547 /* This BUILD_BUG_ON is intended to catch layout change 548 * of union of ib_core_device and device. 549 * dev must be the first element as ib_core and providers 550 * driver uses it. Adding anything in ib_core_device before 551 * device will break this assumption. 552 */ 553 BUILD_BUG_ON(offsetof(struct ib_device, coredev.dev) != 554 offsetof(struct ib_device, dev)); 555 556 coredev->dev.class = &ib_class; 557 coredev->dev.groups = dev->groups; 558 device_initialize(&coredev->dev); 559 coredev->owner = dev; 560 INIT_LIST_HEAD(&coredev->port_list); 561 write_pnet(&coredev->rdma_net, net); 562 } 563 564 /** 565 * _ib_alloc_device - allocate an IB device struct 566 * @size:size of structure to allocate 567 * 568 * Low-level drivers should use ib_alloc_device() to allocate &struct 569 * ib_device. @size is the size of the structure to be allocated, 570 * including any private data used by the low-level driver. 571 * ib_dealloc_device() must be used to free structures allocated with 572 * ib_alloc_device(). 
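 *
 * A minimal allocation sketch; the driver structure and member names below
 * are hypothetical, and the embedded ib_device conventionally sits first in
 * the driver's private structure:
 *
 *      struct my_drv_device {
 *              struct ib_device ibdev;
 *              (... driver private data ...)
 *      };
 *
 *      struct my_drv_device *mydev = ib_alloc_device(my_drv_device, ibdev);
 *      if (!mydev)
 *              return -ENOMEM;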
573 */ 574 struct ib_device *_ib_alloc_device(size_t size) 575 { 576 struct ib_device *device; 577 unsigned int i; 578 579 if (WARN_ON(size < sizeof(struct ib_device))) 580 return NULL; 581 582 device = kzalloc(size, GFP_KERNEL); 583 if (!device) 584 return NULL; 585 586 if (rdma_restrack_init(device)) { 587 kfree(device); 588 return NULL; 589 } 590 591 rdma_init_coredev(&device->coredev, device, &init_net); 592 593 INIT_LIST_HEAD(&device->event_handler_list); 594 spin_lock_init(&device->qp_open_list_lock); 595 init_rwsem(&device->event_handler_rwsem); 596 mutex_init(&device->unregistration_lock); 597 /* 598 * client_data needs to be alloc because we don't want our mark to be 599 * destroyed if the user stores NULL in the client data. 600 */ 601 xa_init_flags(&device->client_data, XA_FLAGS_ALLOC); 602 init_rwsem(&device->client_data_rwsem); 603 xa_init_flags(&device->compat_devs, XA_FLAGS_ALLOC); 604 mutex_init(&device->compat_devs_mutex); 605 init_completion(&device->unreg_completion); 606 INIT_WORK(&device->unregistration_work, ib_unregister_work); 607 608 spin_lock_init(&device->cq_pools_lock); 609 for (i = 0; i < ARRAY_SIZE(device->cq_pools); i++) 610 INIT_LIST_HEAD(&device->cq_pools[i]); 611 612 rwlock_init(&device->cache_lock); 613 614 device->uverbs_cmd_mask = 615 BIT_ULL(IB_USER_VERBS_CMD_ALLOC_MW) | 616 BIT_ULL(IB_USER_VERBS_CMD_ALLOC_PD) | 617 BIT_ULL(IB_USER_VERBS_CMD_ATTACH_MCAST) | 618 BIT_ULL(IB_USER_VERBS_CMD_CLOSE_XRCD) | 619 BIT_ULL(IB_USER_VERBS_CMD_CREATE_AH) | 620 BIT_ULL(IB_USER_VERBS_CMD_CREATE_COMP_CHANNEL) | 621 BIT_ULL(IB_USER_VERBS_CMD_CREATE_CQ) | 622 BIT_ULL(IB_USER_VERBS_CMD_CREATE_QP) | 623 BIT_ULL(IB_USER_VERBS_CMD_CREATE_SRQ) | 624 BIT_ULL(IB_USER_VERBS_CMD_CREATE_XSRQ) | 625 BIT_ULL(IB_USER_VERBS_CMD_DEALLOC_MW) | 626 BIT_ULL(IB_USER_VERBS_CMD_DEALLOC_PD) | 627 BIT_ULL(IB_USER_VERBS_CMD_DEREG_MR) | 628 BIT_ULL(IB_USER_VERBS_CMD_DESTROY_AH) | 629 BIT_ULL(IB_USER_VERBS_CMD_DESTROY_CQ) | 630 BIT_ULL(IB_USER_VERBS_CMD_DESTROY_QP) | 631 BIT_ULL(IB_USER_VERBS_CMD_DESTROY_SRQ) | 632 BIT_ULL(IB_USER_VERBS_CMD_DETACH_MCAST) | 633 BIT_ULL(IB_USER_VERBS_CMD_GET_CONTEXT) | 634 BIT_ULL(IB_USER_VERBS_CMD_MODIFY_QP) | 635 BIT_ULL(IB_USER_VERBS_CMD_MODIFY_SRQ) | 636 BIT_ULL(IB_USER_VERBS_CMD_OPEN_QP) | 637 BIT_ULL(IB_USER_VERBS_CMD_OPEN_XRCD) | 638 BIT_ULL(IB_USER_VERBS_CMD_QUERY_DEVICE) | 639 BIT_ULL(IB_USER_VERBS_CMD_QUERY_PORT) | 640 BIT_ULL(IB_USER_VERBS_CMD_QUERY_QP) | 641 BIT_ULL(IB_USER_VERBS_CMD_QUERY_SRQ) | 642 BIT_ULL(IB_USER_VERBS_CMD_REG_MR) | 643 BIT_ULL(IB_USER_VERBS_CMD_REREG_MR) | 644 BIT_ULL(IB_USER_VERBS_CMD_RESIZE_CQ); 645 646 mutex_init(&device->subdev_lock); 647 INIT_LIST_HEAD(&device->subdev_list_head); 648 INIT_LIST_HEAD(&device->subdev_list); 649 650 return device; 651 } 652 EXPORT_SYMBOL(_ib_alloc_device); 653 654 /** 655 * ib_dealloc_device - free an IB device struct 656 * @device:structure to free 657 * 658 * Free a structure allocated with ib_alloc_device(). 659 */ 660 void ib_dealloc_device(struct ib_device *device) 661 { 662 if (device->ops.dealloc_driver) 663 device->ops.dealloc_driver(device); 664 665 /* 666 * ib_unregister_driver() requires all devices to remain in the xarray 667 * while their ops are callable. The last op we call is dealloc_driver 668 * above. This is needed to create a fence on op callbacks prior to 669 * allowing the driver module to unload. 
 */
        down_write(&devices_rwsem);
        if (xa_load(&devices, device->index) == device)
                xa_erase(&devices, device->index);
        up_write(&devices_rwsem);

        /* Expedite releasing netdev references */
        free_netdevs(device);

        WARN_ON(!xa_empty(&device->compat_devs));
        WARN_ON(!xa_empty(&device->client_data));
        WARN_ON(refcount_read(&device->refcount));
        rdma_restrack_clean(device);
        /* Balances with device_initialize */
        put_device(&device->dev);
}
EXPORT_SYMBOL(ib_dealloc_device);

/*
 * add_client_context() and remove_client_context() must be safe against
 * parallel calls on the same device - registration/unregistration of both the
 * device and client can be occurring in parallel.
 *
 * The routines need to be a fence; any caller must not return until the add
 * or remove is fully completed.
 */
static int add_client_context(struct ib_device *device,
                              struct ib_client *client)
{
        int ret = 0;

        if (!device->kverbs_provider && !client->no_kverbs_req)
                return 0;

        down_write(&device->client_data_rwsem);
        /*
         * So long as the client is registered hold both the client and device
         * unregistration locks.
         */
        if (!refcount_inc_not_zero(&client->uses))
                goto out_unlock;
        refcount_inc(&device->refcount);

        /*
         * Another caller to add_client_context got here first and has already
         * completely initialized context.
         */
        if (xa_get_mark(&device->client_data, client->client_id,
                        CLIENT_DATA_REGISTERED))
                goto out;

        ret = xa_err(xa_store(&device->client_data, client->client_id, NULL,
                              GFP_KERNEL));
        if (ret)
                goto out;
        downgrade_write(&device->client_data_rwsem);
        if (client->add) {
                if (client->add(device)) {
                        /*
                         * If a client fails to add then the error code is
                         * ignored, but we won't call any more ops on this
                         * client.
                         */
                        xa_erase(&device->client_data, client->client_id);
                        up_read(&device->client_data_rwsem);
                        ib_device_put(device);
                        ib_client_put(client);
                        return 0;
                }
        }

        /* Readers shall not see a client until add has been completed */
        xa_set_mark(&device->client_data, client->client_id,
                    CLIENT_DATA_REGISTERED);
        up_read(&device->client_data_rwsem);
        return 0;

out:
        ib_device_put(device);
        ib_client_put(client);
out_unlock:
        up_write(&device->client_data_rwsem);
        return ret;
}

static void remove_client_context(struct ib_device *device,
                                  unsigned int client_id)
{
        struct ib_client *client;
        void *client_data;

        down_write(&device->client_data_rwsem);
        if (!xa_get_mark(&device->client_data, client_id,
                         CLIENT_DATA_REGISTERED)) {
                up_write(&device->client_data_rwsem);
                return;
        }
        client_data = xa_load(&device->client_data, client_id);
        xa_clear_mark(&device->client_data, client_id, CLIENT_DATA_REGISTERED);
        client = xa_load(&clients, client_id);
        up_write(&device->client_data_rwsem);

        /*
         * Notice we cannot be holding any exclusive locks when calling the
         * remove callback as the remove callback can recurse back into any
         * public functions in this module and thus try for any locks those
         * functions take.
         *
         * For this reason clients and drivers should not call the
         * unregistration functions while holding any locks.
 */
        if (client->remove)
                client->remove(device, client_data);

        xa_erase(&device->client_data, client_id);
        ib_device_put(device);
        ib_client_put(client);
}

static int alloc_port_data(struct ib_device *device)
{
        struct ib_port_data_rcu *pdata_rcu;
        u32 port;

        if (device->port_data)
                return 0;

        /* This can only be called once the physical port range is defined */
        if (WARN_ON(!device->phys_port_cnt))
                return -EINVAL;

        /* Reserve U32_MAX so the logic to go over all the ports is sane */
        if (WARN_ON(device->phys_port_cnt == U32_MAX))
                return -EINVAL;

        /*
         * device->port_data is indexed directly by the port number to make
         * access to this data as efficient as possible.
         *
         * Therefore port_data is declared as a 1-based array with potential
         * empty slots at the beginning.
         */
        pdata_rcu = kzalloc(struct_size(pdata_rcu, pdata,
                                        size_add(rdma_end_port(device), 1)),
                            GFP_KERNEL);
        if (!pdata_rcu)
                return -ENOMEM;
        /*
         * The rcu_head is put in front of the port data array and the stored
         * pointer is adjusted since we never need to see that member until
         * kfree_rcu.
         */
        device->port_data = pdata_rcu->pdata;

        rdma_for_each_port (device, port) {
                struct ib_port_data *pdata = &device->port_data[port];

                pdata->ib_dev = device;
                spin_lock_init(&pdata->pkey_list_lock);
                INIT_LIST_HEAD(&pdata->pkey_list);
                spin_lock_init(&pdata->netdev_lock);
                INIT_HLIST_NODE(&pdata->ndev_hash_link);
        }
        return 0;
}

static int verify_immutable(const struct ib_device *dev, u32 port)
{
        return WARN_ON(!rdma_cap_ib_mad(dev, port) &&
                       rdma_max_mad_size(dev, port) != 0);
}

static int setup_port_data(struct ib_device *device)
{
        u32 port;
        int ret;

        ret = alloc_port_data(device);
        if (ret)
                return ret;

        rdma_for_each_port (device, port) {
                struct ib_port_data *pdata = &device->port_data[port];

                ret = device->ops.get_port_immutable(device, port,
                                                     &pdata->immutable);
                if (ret)
                        return ret;

                if (verify_immutable(device, port))
                        return -EINVAL;
        }
        return 0;
}

/**
 * ib_port_immutable_read() - Read rdma port's immutable data
 * @dev: IB device
 * @port: port number whose immutable data to read. It starts at index 1 and
 *        is valid up to and including rdma_end_port().
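 *
 * For example, a caller could walk every port like this (sketch):
 *
 *      u32 port;
 *
 *      rdma_for_each_port (dev, port) {
 *              const struct ib_port_immutable *imm =
 *                      ib_port_immutable_read(dev, port);
 *              (... consult imm->core_cap_flags, imm->gid_tbl_len, etc. ...)
 *      }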
870 */ 871 const struct ib_port_immutable* 872 ib_port_immutable_read(struct ib_device *dev, unsigned int port) 873 { 874 WARN_ON(!rdma_is_port_valid(dev, port)); 875 return &dev->port_data[port].immutable; 876 } 877 EXPORT_SYMBOL(ib_port_immutable_read); 878 879 void ib_get_device_fw_str(struct ib_device *dev, char *str) 880 { 881 if (dev->ops.get_dev_fw_str) 882 dev->ops.get_dev_fw_str(dev, str); 883 else 884 str[0] = '\0'; 885 } 886 EXPORT_SYMBOL(ib_get_device_fw_str); 887 888 static void ib_policy_change_task(struct work_struct *work) 889 { 890 struct ib_device *dev; 891 unsigned long index; 892 893 down_read(&devices_rwsem); 894 xa_for_each_marked (&devices, index, dev, DEVICE_REGISTERED) { 895 unsigned int i; 896 897 rdma_for_each_port (dev, i) { 898 u64 sp; 899 ib_get_cached_subnet_prefix(dev, i, &sp); 900 ib_security_cache_change(dev, i, sp); 901 } 902 } 903 up_read(&devices_rwsem); 904 } 905 906 static int ib_security_change(struct notifier_block *nb, unsigned long event, 907 void *lsm_data) 908 { 909 if (event != LSM_POLICY_CHANGE) 910 return NOTIFY_DONE; 911 912 schedule_work(&ib_policy_change_work); 913 ib_mad_agent_security_change(); 914 915 return NOTIFY_OK; 916 } 917 918 static void compatdev_release(struct device *dev) 919 { 920 struct ib_core_device *cdev = 921 container_of(dev, struct ib_core_device, dev); 922 923 kfree(cdev); 924 } 925 926 static int add_one_compat_dev(struct ib_device *device, 927 struct rdma_dev_net *rnet) 928 { 929 struct ib_core_device *cdev; 930 int ret; 931 932 lockdep_assert_held(&rdma_nets_rwsem); 933 if (!ib_devices_shared_netns) 934 return 0; 935 936 /* 937 * Create and add compat device in all namespaces other than where it 938 * is currently bound to. 939 */ 940 if (net_eq(read_pnet(&rnet->net), 941 read_pnet(&device->coredev.rdma_net))) 942 return 0; 943 944 /* 945 * The first of init_net() or ib_register_device() to take the 946 * compat_devs_mutex wins and gets to add the device. Others will wait 947 * for completion here. 
948 */ 949 mutex_lock(&device->compat_devs_mutex); 950 cdev = xa_load(&device->compat_devs, rnet->id); 951 if (cdev) { 952 ret = 0; 953 goto done; 954 } 955 ret = xa_reserve(&device->compat_devs, rnet->id, GFP_KERNEL); 956 if (ret) 957 goto done; 958 959 cdev = kzalloc(sizeof(*cdev), GFP_KERNEL); 960 if (!cdev) { 961 ret = -ENOMEM; 962 goto cdev_err; 963 } 964 965 cdev->dev.parent = device->dev.parent; 966 rdma_init_coredev(cdev, device, read_pnet(&rnet->net)); 967 cdev->dev.release = compatdev_release; 968 ret = dev_set_name(&cdev->dev, "%s", dev_name(&device->dev)); 969 if (ret) 970 goto add_err; 971 972 ret = device_add(&cdev->dev); 973 if (ret) 974 goto add_err; 975 ret = ib_setup_port_attrs(cdev); 976 if (ret) 977 goto port_err; 978 979 ret = xa_err(xa_store(&device->compat_devs, rnet->id, 980 cdev, GFP_KERNEL)); 981 if (ret) 982 goto insert_err; 983 984 mutex_unlock(&device->compat_devs_mutex); 985 return 0; 986 987 insert_err: 988 ib_free_port_attrs(cdev); 989 port_err: 990 device_del(&cdev->dev); 991 add_err: 992 put_device(&cdev->dev); 993 cdev_err: 994 xa_release(&device->compat_devs, rnet->id); 995 done: 996 mutex_unlock(&device->compat_devs_mutex); 997 return ret; 998 } 999 1000 static void remove_one_compat_dev(struct ib_device *device, u32 id) 1001 { 1002 struct ib_core_device *cdev; 1003 1004 mutex_lock(&device->compat_devs_mutex); 1005 cdev = xa_erase(&device->compat_devs, id); 1006 mutex_unlock(&device->compat_devs_mutex); 1007 if (cdev) { 1008 ib_free_port_attrs(cdev); 1009 device_del(&cdev->dev); 1010 put_device(&cdev->dev); 1011 } 1012 } 1013 1014 static void remove_compat_devs(struct ib_device *device) 1015 { 1016 struct ib_core_device *cdev; 1017 unsigned long index; 1018 1019 xa_for_each (&device->compat_devs, index, cdev) 1020 remove_one_compat_dev(device, index); 1021 } 1022 1023 static int add_compat_devs(struct ib_device *device) 1024 { 1025 struct rdma_dev_net *rnet; 1026 unsigned long index; 1027 int ret = 0; 1028 1029 lockdep_assert_held(&devices_rwsem); 1030 1031 down_read(&rdma_nets_rwsem); 1032 xa_for_each (&rdma_nets, index, rnet) { 1033 ret = add_one_compat_dev(device, rnet); 1034 if (ret) 1035 break; 1036 } 1037 up_read(&rdma_nets_rwsem); 1038 return ret; 1039 } 1040 1041 static void remove_all_compat_devs(void) 1042 { 1043 struct ib_compat_device *cdev; 1044 struct ib_device *dev; 1045 unsigned long index; 1046 1047 down_read(&devices_rwsem); 1048 xa_for_each (&devices, index, dev) { 1049 unsigned long c_index = 0; 1050 1051 /* Hold nets_rwsem so that any other thread modifying this 1052 * system param can sync with this thread. 1053 */ 1054 down_read(&rdma_nets_rwsem); 1055 xa_for_each (&dev->compat_devs, c_index, cdev) 1056 remove_one_compat_dev(dev, c_index); 1057 up_read(&rdma_nets_rwsem); 1058 } 1059 up_read(&devices_rwsem); 1060 } 1061 1062 static int add_all_compat_devs(void) 1063 { 1064 struct rdma_dev_net *rnet; 1065 struct ib_device *dev; 1066 unsigned long index; 1067 int ret = 0; 1068 1069 down_read(&devices_rwsem); 1070 xa_for_each_marked (&devices, index, dev, DEVICE_REGISTERED) { 1071 unsigned long net_index = 0; 1072 1073 /* Hold nets_rwsem so that any other thread modifying this 1074 * system param can sync with this thread. 
1075 */ 1076 down_read(&rdma_nets_rwsem); 1077 xa_for_each (&rdma_nets, net_index, rnet) { 1078 ret = add_one_compat_dev(dev, rnet); 1079 if (ret) 1080 break; 1081 } 1082 up_read(&rdma_nets_rwsem); 1083 } 1084 up_read(&devices_rwsem); 1085 if (ret) 1086 remove_all_compat_devs(); 1087 return ret; 1088 } 1089 1090 int rdma_compatdev_set(u8 enable) 1091 { 1092 struct rdma_dev_net *rnet; 1093 unsigned long index; 1094 int ret = 0; 1095 1096 down_write(&rdma_nets_rwsem); 1097 if (ib_devices_shared_netns == enable) { 1098 up_write(&rdma_nets_rwsem); 1099 return 0; 1100 } 1101 1102 /* enable/disable of compat devices is not supported 1103 * when more than default init_net exists. 1104 */ 1105 xa_for_each (&rdma_nets, index, rnet) { 1106 ret++; 1107 break; 1108 } 1109 if (!ret) 1110 ib_devices_shared_netns = enable; 1111 up_write(&rdma_nets_rwsem); 1112 if (ret) 1113 return -EBUSY; 1114 1115 if (enable) 1116 ret = add_all_compat_devs(); 1117 else 1118 remove_all_compat_devs(); 1119 return ret; 1120 } 1121 1122 static void rdma_dev_exit_net(struct net *net) 1123 { 1124 struct rdma_dev_net *rnet = rdma_net_to_dev_net(net); 1125 struct ib_device *dev; 1126 unsigned long index; 1127 int ret; 1128 1129 down_write(&rdma_nets_rwsem); 1130 /* 1131 * Prevent the ID from being re-used and hide the id from xa_for_each. 1132 */ 1133 ret = xa_err(xa_store(&rdma_nets, rnet->id, NULL, GFP_KERNEL)); 1134 WARN_ON(ret); 1135 up_write(&rdma_nets_rwsem); 1136 1137 down_read(&devices_rwsem); 1138 xa_for_each (&devices, index, dev) { 1139 get_device(&dev->dev); 1140 /* 1141 * Release the devices_rwsem so that pontentially blocking 1142 * device_del, doesn't hold the devices_rwsem for too long. 1143 */ 1144 up_read(&devices_rwsem); 1145 1146 remove_one_compat_dev(dev, rnet->id); 1147 1148 /* 1149 * If the real device is in the NS then move it back to init. 1150 */ 1151 rdma_dev_change_netns(dev, net, &init_net); 1152 1153 put_device(&dev->dev); 1154 down_read(&devices_rwsem); 1155 } 1156 up_read(&devices_rwsem); 1157 1158 rdma_nl_net_exit(rnet); 1159 xa_erase(&rdma_nets, rnet->id); 1160 } 1161 1162 static __net_init int rdma_dev_init_net(struct net *net) 1163 { 1164 struct rdma_dev_net *rnet = rdma_net_to_dev_net(net); 1165 unsigned long index; 1166 struct ib_device *dev; 1167 int ret; 1168 1169 write_pnet(&rnet->net, net); 1170 1171 ret = rdma_nl_net_init(rnet); 1172 if (ret) 1173 return ret; 1174 1175 /* No need to create any compat devices in default init_net. */ 1176 if (net_eq(net, &init_net)) 1177 return 0; 1178 1179 ret = xa_alloc(&rdma_nets, &rnet->id, rnet, xa_limit_32b, GFP_KERNEL); 1180 if (ret) { 1181 rdma_nl_net_exit(rnet); 1182 return ret; 1183 } 1184 1185 down_read(&devices_rwsem); 1186 xa_for_each_marked (&devices, index, dev, DEVICE_REGISTERED) { 1187 /* Hold nets_rwsem so that netlink command cannot change 1188 * system configuration for device sharing mode. 1189 */ 1190 down_read(&rdma_nets_rwsem); 1191 ret = add_one_compat_dev(dev, rnet); 1192 up_read(&rdma_nets_rwsem); 1193 if (ret) 1194 break; 1195 } 1196 up_read(&devices_rwsem); 1197 1198 if (ret) 1199 rdma_dev_exit_net(net); 1200 1201 return ret; 1202 } 1203 1204 /* 1205 * Assign the unique string device name and the unique device index. This is 1206 * undone by ib_dealloc_device. 
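 *
 * The name may be a printf-style pattern containing a single '%d', in which
 * case alloc_name() above picks the lowest free index; for instance, a
 * hypothetical "foo%d" becomes "foo0", "foo1", and so on. See also the
 * ib_register_device() kernel-doc below.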
1207 */ 1208 static int assign_name(struct ib_device *device, const char *name) 1209 { 1210 static u32 last_id; 1211 int ret; 1212 1213 down_write(&devices_rwsem); 1214 /* Assign a unique name to the device */ 1215 if (strchr(name, '%')) 1216 ret = alloc_name(device, name); 1217 else 1218 ret = dev_set_name(&device->dev, name); 1219 if (ret) 1220 goto out; 1221 1222 if (__ib_device_get_by_name(dev_name(&device->dev))) { 1223 ret = -ENFILE; 1224 goto out; 1225 } 1226 strscpy(device->name, dev_name(&device->dev), IB_DEVICE_NAME_MAX); 1227 1228 ret = xa_alloc_cyclic(&devices, &device->index, device, xa_limit_31b, 1229 &last_id, GFP_KERNEL); 1230 if (ret > 0) 1231 ret = 0; 1232 1233 out: 1234 up_write(&devices_rwsem); 1235 return ret; 1236 } 1237 1238 /* 1239 * setup_device() allocates memory and sets up data that requires calling the 1240 * device ops, this is the only reason these actions are not done during 1241 * ib_alloc_device. It is undone by ib_dealloc_device(). 1242 */ 1243 static int setup_device(struct ib_device *device) 1244 { 1245 struct ib_udata uhw = {.outlen = 0, .inlen = 0}; 1246 int ret; 1247 1248 ib_device_check_mandatory(device); 1249 1250 ret = setup_port_data(device); 1251 if (ret) { 1252 dev_warn(&device->dev, "Couldn't create per-port data\n"); 1253 return ret; 1254 } 1255 1256 memset(&device->attrs, 0, sizeof(device->attrs)); 1257 ret = device->ops.query_device(device, &device->attrs, &uhw); 1258 if (ret) { 1259 dev_warn(&device->dev, 1260 "Couldn't query the device attributes\n"); 1261 return ret; 1262 } 1263 1264 return 0; 1265 } 1266 1267 static void disable_device(struct ib_device *device) 1268 { 1269 u32 cid; 1270 1271 WARN_ON(!refcount_read(&device->refcount)); 1272 1273 down_write(&devices_rwsem); 1274 xa_clear_mark(&devices, device->index, DEVICE_REGISTERED); 1275 up_write(&devices_rwsem); 1276 1277 /* 1278 * Remove clients in LIFO order, see assign_client_id. This could be 1279 * more efficient if xarray learns to reverse iterate. Since no new 1280 * clients can be added to this ib_device past this point we only need 1281 * the maximum possible client_id value here. 1282 */ 1283 down_read(&clients_rwsem); 1284 cid = highest_client_id; 1285 up_read(&clients_rwsem); 1286 while (cid) { 1287 cid--; 1288 remove_client_context(device, cid); 1289 } 1290 1291 ib_cq_pool_cleanup(device); 1292 1293 /* Pairs with refcount_set in enable_device */ 1294 ib_device_put(device); 1295 wait_for_completion(&device->unreg_completion); 1296 1297 /* 1298 * compat devices must be removed after device refcount drops to zero. 1299 * Otherwise init_net() may add more compatdevs after removing compat 1300 * devices and before device is disabled. 1301 */ 1302 remove_compat_devs(device); 1303 } 1304 1305 /* 1306 * An enabled device is visible to all clients and to all the public facing 1307 * APIs that return a device pointer. This always returns with a new get, even 1308 * if it fails. 1309 */ 1310 static int enable_device_and_get(struct ib_device *device) 1311 { 1312 struct ib_client *client; 1313 unsigned long index; 1314 int ret = 0; 1315 1316 /* 1317 * One ref belongs to the xa and the other belongs to this 1318 * thread. This is needed to guard against parallel unregistration. 1319 */ 1320 refcount_set(&device->refcount, 2); 1321 down_write(&devices_rwsem); 1322 xa_set_mark(&devices, device->index, DEVICE_REGISTERED); 1323 1324 /* 1325 * By using downgrade_write() we ensure that no other thread can clear 1326 * DEVICE_REGISTERED while we are completing the client setup. 
1327 */ 1328 downgrade_write(&devices_rwsem); 1329 1330 if (device->ops.enable_driver) { 1331 ret = device->ops.enable_driver(device); 1332 if (ret) 1333 goto out; 1334 } 1335 1336 down_read(&clients_rwsem); 1337 xa_for_each_marked (&clients, index, client, CLIENT_REGISTERED) { 1338 ret = add_client_context(device, client); 1339 if (ret) 1340 break; 1341 } 1342 up_read(&clients_rwsem); 1343 if (!ret) 1344 ret = add_compat_devs(device); 1345 out: 1346 up_read(&devices_rwsem); 1347 return ret; 1348 } 1349 1350 static void prevent_dealloc_device(struct ib_device *ib_dev) 1351 { 1352 } 1353 1354 static void ib_device_notify_register(struct ib_device *device) 1355 { 1356 struct net_device *netdev; 1357 u32 port; 1358 int ret; 1359 1360 ret = rdma_nl_notify_event(device, 0, RDMA_REGISTER_EVENT); 1361 if (ret) 1362 return; 1363 1364 rdma_for_each_port(device, port) { 1365 netdev = ib_device_get_netdev(device, port); 1366 if (!netdev) 1367 continue; 1368 1369 ret = rdma_nl_notify_event(device, port, 1370 RDMA_NETDEV_ATTACH_EVENT); 1371 dev_put(netdev); 1372 if (ret) 1373 return; 1374 } 1375 } 1376 1377 /** 1378 * ib_register_device - Register an IB device with IB core 1379 * @device: Device to register 1380 * @name: unique string device name. This may include a '%' which will 1381 * cause a unique index to be added to the passed device name. 1382 * @dma_device: pointer to a DMA-capable device. If %NULL, then the IB 1383 * device will be used. In this case the caller should fully 1384 * setup the ibdev for DMA. This usually means using dma_virt_ops. 1385 * 1386 * Low-level drivers use ib_register_device() to register their 1387 * devices with the IB core. All registered clients will receive a 1388 * callback for each device that is added. @device must be allocated 1389 * with ib_alloc_device(). 1390 * 1391 * If the driver uses ops.dealloc_driver and calls any ib_unregister_device() 1392 * asynchronously then the device pointer may become freed as soon as this 1393 * function returns. 1394 */ 1395 int ib_register_device(struct ib_device *device, const char *name, 1396 struct device *dma_device) 1397 { 1398 int ret; 1399 1400 ret = assign_name(device, name); 1401 if (ret) 1402 return ret; 1403 1404 /* 1405 * If the caller does not provide a DMA capable device then the IB core 1406 * will set up ib_sge and scatterlist structures that stash the kernel 1407 * virtual address into the address field. 1408 */ 1409 WARN_ON(dma_device && !dma_device->dma_parms); 1410 device->dma_device = dma_device; 1411 1412 ret = setup_device(device); 1413 if (ret) 1414 return ret; 1415 1416 ret = ib_cache_setup_one(device); 1417 if (ret) { 1418 dev_warn(&device->dev, 1419 "Couldn't set up InfiniBand P_Key/GID cache\n"); 1420 return ret; 1421 } 1422 1423 device->groups[0] = &ib_dev_attr_group; 1424 device->groups[1] = device->ops.device_group; 1425 ret = ib_setup_device_attrs(device); 1426 if (ret) 1427 goto cache_cleanup; 1428 1429 ib_device_register_rdmacg(device); 1430 1431 rdma_counter_init(device); 1432 1433 /* 1434 * Ensure that ADD uevent is not fired because it 1435 * is too early amd device is not initialized yet. 
1436 */ 1437 dev_set_uevent_suppress(&device->dev, true); 1438 ret = device_add(&device->dev); 1439 if (ret) 1440 goto cg_cleanup; 1441 1442 ret = ib_setup_port_attrs(&device->coredev); 1443 if (ret) { 1444 dev_warn(&device->dev, 1445 "Couldn't register device with driver model\n"); 1446 goto dev_cleanup; 1447 } 1448 1449 ret = enable_device_and_get(device); 1450 if (ret) { 1451 void (*dealloc_fn)(struct ib_device *); 1452 1453 /* 1454 * If we hit this error flow then we don't want to 1455 * automatically dealloc the device since the caller is 1456 * expected to call ib_dealloc_device() after 1457 * ib_register_device() fails. This is tricky due to the 1458 * possibility for a parallel unregistration along with this 1459 * error flow. Since we have a refcount here we know any 1460 * parallel flow is stopped in disable_device and will see the 1461 * special dealloc_driver pointer, causing the responsibility to 1462 * ib_dealloc_device() to revert back to this thread. 1463 */ 1464 dealloc_fn = device->ops.dealloc_driver; 1465 device->ops.dealloc_driver = prevent_dealloc_device; 1466 ib_device_put(device); 1467 __ib_unregister_device(device); 1468 device->ops.dealloc_driver = dealloc_fn; 1469 dev_set_uevent_suppress(&device->dev, false); 1470 return ret; 1471 } 1472 dev_set_uevent_suppress(&device->dev, false); 1473 /* Mark for userspace that device is ready */ 1474 kobject_uevent(&device->dev.kobj, KOBJ_ADD); 1475 1476 ib_device_notify_register(device); 1477 ib_device_put(device); 1478 1479 return 0; 1480 1481 dev_cleanup: 1482 device_del(&device->dev); 1483 cg_cleanup: 1484 dev_set_uevent_suppress(&device->dev, false); 1485 ib_device_unregister_rdmacg(device); 1486 cache_cleanup: 1487 ib_cache_cleanup_one(device); 1488 return ret; 1489 } 1490 EXPORT_SYMBOL(ib_register_device); 1491 1492 /* Callers must hold a get on the device. */ 1493 static void __ib_unregister_device(struct ib_device *ib_dev) 1494 { 1495 struct ib_device *sub, *tmp; 1496 1497 mutex_lock(&ib_dev->subdev_lock); 1498 list_for_each_entry_safe_reverse(sub, tmp, 1499 &ib_dev->subdev_list_head, 1500 subdev_list) { 1501 list_del(&sub->subdev_list); 1502 ib_dev->ops.del_sub_dev(sub); 1503 ib_device_put(ib_dev); 1504 } 1505 mutex_unlock(&ib_dev->subdev_lock); 1506 1507 /* 1508 * We have a registration lock so that all the calls to unregister are 1509 * fully fenced, once any unregister returns the device is truely 1510 * unregistered even if multiple callers are unregistering it at the 1511 * same time. This also interacts with the registration flow and 1512 * provides sane semantics if register and unregister are racing. 1513 */ 1514 mutex_lock(&ib_dev->unregistration_lock); 1515 if (!refcount_read(&ib_dev->refcount)) 1516 goto out; 1517 1518 disable_device(ib_dev); 1519 rdma_nl_notify_event(ib_dev, 0, RDMA_UNREGISTER_EVENT); 1520 1521 /* Expedite removing unregistered pointers from the hash table */ 1522 free_netdevs(ib_dev); 1523 1524 ib_free_port_attrs(&ib_dev->coredev); 1525 device_del(&ib_dev->dev); 1526 ib_device_unregister_rdmacg(ib_dev); 1527 ib_cache_cleanup_one(ib_dev); 1528 1529 /* 1530 * Drivers using the new flow may not call ib_dealloc_device except 1531 * in error unwind prior to registration success. 
 */
        if (ib_dev->ops.dealloc_driver &&
            ib_dev->ops.dealloc_driver != prevent_dealloc_device) {
                WARN_ON(kref_read(&ib_dev->dev.kobj.kref) <= 1);
                ib_dealloc_device(ib_dev);
        }
out:
        mutex_unlock(&ib_dev->unregistration_lock);
}

/**
 * ib_unregister_device - Unregister an IB device
 * @ib_dev: The device to unregister
 *
 * Unregister an IB device. All clients will receive a remove callback.
 *
 * Callers should call this routine only once, and protect against races with
 * registration. Typically it should only be called as part of a remove
 * callback in an implementation of driver core's struct device_driver and
 * related.
 *
 * If ops.dealloc_driver is used then ib_dev will be freed upon return from
 * this function.
 */
void ib_unregister_device(struct ib_device *ib_dev)
{
        get_device(&ib_dev->dev);
        __ib_unregister_device(ib_dev);
        put_device(&ib_dev->dev);
}
EXPORT_SYMBOL(ib_unregister_device);

/**
 * ib_unregister_device_and_put - Unregister a device while holding a 'get'
 * @ib_dev: The device to unregister
 *
 * This is the same as ib_unregister_device(), except it includes an internal
 * ib_device_put() that should match a 'get' obtained by the caller.
 *
 * It is safe to call this routine concurrently from multiple threads while
 * holding the 'get'. When the function returns the device is fully
 * unregistered.
 *
 * Drivers using this flow MUST use the driver_unregister callback to clean up
 * their resources associated with the device and dealloc it.
 */
void ib_unregister_device_and_put(struct ib_device *ib_dev)
{
        WARN_ON(!ib_dev->ops.dealloc_driver);
        get_device(&ib_dev->dev);
        ib_device_put(ib_dev);
        __ib_unregister_device(ib_dev);
        put_device(&ib_dev->dev);
}
EXPORT_SYMBOL(ib_unregister_device_and_put);

/**
 * ib_unregister_driver - Unregister all IB devices for a driver
 * @driver_id: The driver to unregister
 *
 * This implements a fence for device unregistration. It only returns once all
 * devices associated with the driver_id have fully completed their
 * unregistration and returned from ib_unregister_device*().
 *
 * If devices are not yet unregistered, it goes ahead and starts unregistering
 * them.
 *
 * This does not block creation of new devices with the given driver_id; that
 * is the responsibility of the caller.
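 *
 * This is typically called from a driver's module_exit path so that none of
 * the driver's devices remain registered before the module is unloaded; for
 * example, a soft-RDMA driver exiting might call
 * ib_unregister_driver(RDMA_DRIVER_SIW) (illustrative only; the driver id is
 * whatever the driver registered its ops with).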
1601 */ 1602 void ib_unregister_driver(enum rdma_driver_id driver_id) 1603 { 1604 struct ib_device *ib_dev; 1605 unsigned long index; 1606 1607 down_read(&devices_rwsem); 1608 xa_for_each (&devices, index, ib_dev) { 1609 if (ib_dev->ops.driver_id != driver_id) 1610 continue; 1611 1612 get_device(&ib_dev->dev); 1613 up_read(&devices_rwsem); 1614 1615 WARN_ON(!ib_dev->ops.dealloc_driver); 1616 __ib_unregister_device(ib_dev); 1617 1618 put_device(&ib_dev->dev); 1619 down_read(&devices_rwsem); 1620 } 1621 up_read(&devices_rwsem); 1622 } 1623 EXPORT_SYMBOL(ib_unregister_driver); 1624 1625 static void ib_unregister_work(struct work_struct *work) 1626 { 1627 struct ib_device *ib_dev = 1628 container_of(work, struct ib_device, unregistration_work); 1629 1630 __ib_unregister_device(ib_dev); 1631 put_device(&ib_dev->dev); 1632 } 1633 1634 /** 1635 * ib_unregister_device_queued - Unregister a device using a work queue 1636 * @ib_dev: The device to unregister 1637 * 1638 * This schedules an asynchronous unregistration using a WQ for the device. A 1639 * driver should use this to avoid holding locks while doing unregistration, 1640 * such as holding the RTNL lock. 1641 * 1642 * Drivers using this API must use ib_unregister_driver before module unload 1643 * to ensure that all scheduled unregistrations have completed. 1644 */ 1645 void ib_unregister_device_queued(struct ib_device *ib_dev) 1646 { 1647 WARN_ON(!refcount_read(&ib_dev->refcount)); 1648 WARN_ON(!ib_dev->ops.dealloc_driver); 1649 get_device(&ib_dev->dev); 1650 if (!queue_work(ib_unreg_wq, &ib_dev->unregistration_work)) 1651 put_device(&ib_dev->dev); 1652 } 1653 EXPORT_SYMBOL(ib_unregister_device_queued); 1654 1655 /* 1656 * The caller must pass in a device that has the kref held and the refcount 1657 * released. If the device is in cur_net and still registered then it is moved 1658 * into net. 1659 */ 1660 static int rdma_dev_change_netns(struct ib_device *device, struct net *cur_net, 1661 struct net *net) 1662 { 1663 int ret2 = -EINVAL; 1664 int ret; 1665 1666 mutex_lock(&device->unregistration_lock); 1667 1668 /* 1669 * If a device not under ib_device_get() or if the unregistration_lock 1670 * is not held, the namespace can be changed, or it can be unregistered. 1671 * Check again under the lock. 1672 */ 1673 if (refcount_read(&device->refcount) == 0 || 1674 !net_eq(cur_net, read_pnet(&device->coredev.rdma_net))) { 1675 ret = -ENODEV; 1676 goto out; 1677 } 1678 1679 kobject_uevent(&device->dev.kobj, KOBJ_REMOVE); 1680 disable_device(device); 1681 1682 /* 1683 * At this point no one can be using the device, so it is safe to 1684 * change the namespace. 1685 */ 1686 write_pnet(&device->coredev.rdma_net, net); 1687 1688 down_read(&devices_rwsem); 1689 /* 1690 * Currently rdma devices are system wide unique. So the device name 1691 * is guaranteed free in the new namespace. Publish the new namespace 1692 * at the sysfs level. 1693 */ 1694 ret = device_rename(&device->dev, dev_name(&device->dev)); 1695 up_read(&devices_rwsem); 1696 if (ret) { 1697 dev_warn(&device->dev, 1698 "%s: Couldn't rename device after namespace change\n", 1699 __func__); 1700 /* Try and put things back and re-enable the device */ 1701 write_pnet(&device->coredev.rdma_net, cur_net); 1702 } 1703 1704 ret2 = enable_device_and_get(device); 1705 if (ret2) { 1706 /* 1707 * This shouldn't really happen, but if it does, let the user 1708 * retry at later point. So don't disable the device. 
1709 */ 1710 dev_warn(&device->dev, 1711 "%s: Couldn't re-enable device after namespace change\n", 1712 __func__); 1713 } 1714 kobject_uevent(&device->dev.kobj, KOBJ_ADD); 1715 1716 ib_device_put(device); 1717 out: 1718 mutex_unlock(&device->unregistration_lock); 1719 if (ret) 1720 return ret; 1721 return ret2; 1722 } 1723 1724 int ib_device_set_netns_put(struct sk_buff *skb, 1725 struct ib_device *dev, u32 ns_fd) 1726 { 1727 struct net *net; 1728 int ret; 1729 1730 net = get_net_ns_by_fd(ns_fd); 1731 if (IS_ERR(net)) { 1732 ret = PTR_ERR(net); 1733 goto net_err; 1734 } 1735 1736 if (!netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN)) { 1737 ret = -EPERM; 1738 goto ns_err; 1739 } 1740 1741 /* 1742 * All the ib_clients, including uverbs, are reset when the namespace is 1743 * changed and this cannot be blocked waiting for userspace to do 1744 * something, so disassociation is mandatory. 1745 */ 1746 if (!dev->ops.disassociate_ucontext || ib_devices_shared_netns) { 1747 ret = -EOPNOTSUPP; 1748 goto ns_err; 1749 } 1750 1751 get_device(&dev->dev); 1752 ib_device_put(dev); 1753 ret = rdma_dev_change_netns(dev, current->nsproxy->net_ns, net); 1754 put_device(&dev->dev); 1755 1756 put_net(net); 1757 return ret; 1758 1759 ns_err: 1760 put_net(net); 1761 net_err: 1762 ib_device_put(dev); 1763 return ret; 1764 } 1765 1766 static struct pernet_operations rdma_dev_net_ops = { 1767 .init = rdma_dev_init_net, 1768 .exit = rdma_dev_exit_net, 1769 .id = &rdma_dev_net_id, 1770 .size = sizeof(struct rdma_dev_net), 1771 }; 1772 1773 static int assign_client_id(struct ib_client *client) 1774 { 1775 int ret; 1776 1777 lockdep_assert_held(&clients_rwsem); 1778 /* 1779 * The add/remove callbacks must be called in FIFO/LIFO order. To 1780 * achieve this we assign client_ids so they are sorted in 1781 * registration order. 1782 */ 1783 client->client_id = highest_client_id; 1784 ret = xa_insert(&clients, client->client_id, client, GFP_KERNEL); 1785 if (ret) 1786 return ret; 1787 1788 highest_client_id++; 1789 xa_set_mark(&clients, client->client_id, CLIENT_REGISTERED); 1790 return 0; 1791 } 1792 1793 static void remove_client_id(struct ib_client *client) 1794 { 1795 down_write(&clients_rwsem); 1796 xa_erase(&clients, client->client_id); 1797 for (; highest_client_id; highest_client_id--) 1798 if (xa_load(&clients, highest_client_id - 1)) 1799 break; 1800 up_write(&clients_rwsem); 1801 } 1802 1803 /** 1804 * ib_register_client - Register an IB client 1805 * @client:Client to register 1806 * 1807 * Upper level users of the IB drivers can use ib_register_client() to 1808 * register callbacks for IB device addition and removal. When an IB 1809 * device is added, each registered client's add method will be called 1810 * (in the order the clients were registered), and when a device is 1811 * removed, each client's remove method will be called (in the reverse 1812 * order that clients were registered). In addition, when 1813 * ib_register_client() is called, the client will receive an add 1814 * callback for all devices already registered. 1815 */ 1816 int ib_register_client(struct ib_client *client) 1817 { 1818 struct ib_device *device; 1819 unsigned long index; 1820 bool need_unreg = false; 1821 int ret; 1822 1823 refcount_set(&client->uses, 1); 1824 init_completion(&client->uses_zero); 1825 1826 /* 1827 * The devices_rwsem is held in write mode to ensure that a racing 1828 * ib_register_device() sees a consisent view of clients and devices. 
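         *
         * For illustration, a minimal client looks roughly like this (names
         * are hypothetical; only .name and the callbacks the client actually
         * needs have to be set):
         *
         *      static int my_add(struct ib_device *device)
         *      {
         *              return 0;
         *      }
         *
         *      static void my_remove(struct ib_device *device,
         *                            void *client_data)
         *      {
         *      }
         *
         *      static struct ib_client my_client = {
         *              .name   = "my_client",
         *              .add    = my_add,
         *              .remove = my_remove,
         *      };
         *
         *      err = ib_register_client(&my_client);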
1829 */ 1830 down_write(&devices_rwsem); 1831 down_write(&clients_rwsem); 1832 ret = assign_client_id(client); 1833 if (ret) 1834 goto out; 1835 1836 need_unreg = true; 1837 xa_for_each_marked (&devices, index, device, DEVICE_REGISTERED) { 1838 ret = add_client_context(device, client); 1839 if (ret) 1840 goto out; 1841 } 1842 ret = 0; 1843 out: 1844 up_write(&clients_rwsem); 1845 up_write(&devices_rwsem); 1846 if (need_unreg && ret) 1847 ib_unregister_client(client); 1848 return ret; 1849 } 1850 EXPORT_SYMBOL(ib_register_client); 1851 1852 /** 1853 * ib_unregister_client - Unregister an IB client 1854 * @client:Client to unregister 1855 * 1856 * Upper level users use ib_unregister_client() to remove their client 1857 * registration. When ib_unregister_client() is called, the client 1858 * will receive a remove callback for each IB device still registered. 1859 * 1860 * This is a full fence, once it returns no client callbacks will be called, 1861 * or are running in another thread. 1862 */ 1863 void ib_unregister_client(struct ib_client *client) 1864 { 1865 struct ib_device *device; 1866 unsigned long index; 1867 1868 down_write(&clients_rwsem); 1869 ib_client_put(client); 1870 xa_clear_mark(&clients, client->client_id, CLIENT_REGISTERED); 1871 up_write(&clients_rwsem); 1872 1873 /* We do not want to have locks while calling client->remove() */ 1874 rcu_read_lock(); 1875 xa_for_each (&devices, index, device) { 1876 if (!ib_device_try_get(device)) 1877 continue; 1878 rcu_read_unlock(); 1879 1880 remove_client_context(device, client->client_id); 1881 1882 ib_device_put(device); 1883 rcu_read_lock(); 1884 } 1885 rcu_read_unlock(); 1886 1887 /* 1888 * remove_client_context() is not a fence, it can return even though a 1889 * removal is ongoing. Wait until all removals are completed. 1890 */ 1891 wait_for_completion(&client->uses_zero); 1892 remove_client_id(client); 1893 } 1894 EXPORT_SYMBOL(ib_unregister_client); 1895 1896 static int __ib_get_global_client_nl_info(const char *client_name, 1897 struct ib_client_nl_info *res) 1898 { 1899 struct ib_client *client; 1900 unsigned long index; 1901 int ret = -ENOENT; 1902 1903 down_read(&clients_rwsem); 1904 xa_for_each_marked (&clients, index, client, CLIENT_REGISTERED) { 1905 if (strcmp(client->name, client_name) != 0) 1906 continue; 1907 if (!client->get_global_nl_info) { 1908 ret = -EOPNOTSUPP; 1909 break; 1910 } 1911 ret = client->get_global_nl_info(res); 1912 if (WARN_ON(ret == -ENOENT)) 1913 ret = -EINVAL; 1914 if (!ret && res->cdev) 1915 get_device(res->cdev); 1916 break; 1917 } 1918 up_read(&clients_rwsem); 1919 return ret; 1920 } 1921 1922 static int __ib_get_client_nl_info(struct ib_device *ibdev, 1923 const char *client_name, 1924 struct ib_client_nl_info *res) 1925 { 1926 unsigned long index; 1927 void *client_data; 1928 int ret = -ENOENT; 1929 1930 down_read(&ibdev->client_data_rwsem); 1931 xan_for_each_marked (&ibdev->client_data, index, client_data, 1932 CLIENT_DATA_REGISTERED) { 1933 struct ib_client *client = xa_load(&clients, index); 1934 1935 if (!client || strcmp(client->name, client_name) != 0) 1936 continue; 1937 if (!client->get_nl_info) { 1938 ret = -EOPNOTSUPP; 1939 break; 1940 } 1941 ret = client->get_nl_info(ibdev, client_data, res); 1942 if (WARN_ON(ret == -ENOENT)) 1943 ret = -EINVAL; 1944 1945 /* 1946 * The cdev is guaranteed valid as long as we are inside the 1947 * client_data_rwsem as remove_one can't be called. Keep it 1948 * valid for the caller. 
1949 */ 1950 if (!ret && res->cdev) 1951 get_device(res->cdev); 1952 break; 1953 } 1954 up_read(&ibdev->client_data_rwsem); 1955 1956 return ret; 1957 } 1958 1959 /** 1960 * ib_get_client_nl_info - Fetch the nl_info from a client 1961 * @ibdev: IB device 1962 * @client_name: Name of the client 1963 * @res: Result of the query 1964 */ 1965 int ib_get_client_nl_info(struct ib_device *ibdev, const char *client_name, 1966 struct ib_client_nl_info *res) 1967 { 1968 int ret; 1969 1970 if (ibdev) 1971 ret = __ib_get_client_nl_info(ibdev, client_name, res); 1972 else 1973 ret = __ib_get_global_client_nl_info(client_name, res); 1974 #ifdef CONFIG_MODULES 1975 if (ret == -ENOENT) { 1976 request_module("rdma-client-%s", client_name); 1977 if (ibdev) 1978 ret = __ib_get_client_nl_info(ibdev, client_name, res); 1979 else 1980 ret = __ib_get_global_client_nl_info(client_name, res); 1981 } 1982 #endif 1983 if (ret) { 1984 if (ret == -ENOENT) 1985 return -EOPNOTSUPP; 1986 return ret; 1987 } 1988 1989 if (WARN_ON(!res->cdev)) 1990 return -EINVAL; 1991 return 0; 1992 } 1993 1994 /** 1995 * ib_set_client_data - Set IB client context 1996 * @device:Device to set context for 1997 * @client:Client to set context for 1998 * @data:Context to set 1999 * 2000 * ib_set_client_data() sets client context data that can be retrieved with 2001 * ib_get_client_data(). This can only be called while the client is 2002 * registered to the device, once the ib_client remove() callback returns this 2003 * cannot be called. 2004 */ 2005 void ib_set_client_data(struct ib_device *device, struct ib_client *client, 2006 void *data) 2007 { 2008 void *rc; 2009 2010 if (WARN_ON(IS_ERR(data))) 2011 data = NULL; 2012 2013 rc = xa_store(&device->client_data, client->client_id, data, 2014 GFP_KERNEL); 2015 WARN_ON(xa_is_err(rc)); 2016 } 2017 EXPORT_SYMBOL(ib_set_client_data); 2018 2019 /** 2020 * ib_register_event_handler - Register an IB event handler 2021 * @event_handler:Handler to register 2022 * 2023 * ib_register_event_handler() registers an event handler that will be 2024 * called back when asynchronous IB events occur (as defined in 2025 * chapter 11 of the InfiniBand Architecture Specification). This 2026 * callback occurs in workqueue context. 2027 */ 2028 void ib_register_event_handler(struct ib_event_handler *event_handler) 2029 { 2030 down_write(&event_handler->device->event_handler_rwsem); 2031 list_add_tail(&event_handler->list, 2032 &event_handler->device->event_handler_list); 2033 up_write(&event_handler->device->event_handler_rwsem); 2034 } 2035 EXPORT_SYMBOL(ib_register_event_handler); 2036 2037 /** 2038 * ib_unregister_event_handler - Unregister an event handler 2039 * @event_handler:Handler to unregister 2040 * 2041 * Unregister an event handler registered with 2042 * ib_register_event_handler(). 
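 *
 * For illustration only, a typical register/unregister pairing might look
 * like this (hypothetical handler name, assuming the ib_event_handler
 * helpers declared in <rdma/ib_verbs.h>):
 *
 *	static void my_event_handler(struct ib_event_handler *handler,
 *				     struct ib_event *event)
 *	{
 *		pr_info("async event %d on %s\n", event->event,
 *			dev_name(&event->device->dev));
 *	}
 *
 *	INIT_IB_EVENT_HANDLER(&my_handler, ibdev, my_event_handler);
 *	ib_register_event_handler(&my_handler);
 *	...
 *	ib_unregister_event_handler(&my_handler);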
2043 */ 2044 void ib_unregister_event_handler(struct ib_event_handler *event_handler) 2045 { 2046 down_write(&event_handler->device->event_handler_rwsem); 2047 list_del(&event_handler->list); 2048 up_write(&event_handler->device->event_handler_rwsem); 2049 } 2050 EXPORT_SYMBOL(ib_unregister_event_handler); 2051 2052 void ib_dispatch_event_clients(struct ib_event *event) 2053 { 2054 struct ib_event_handler *handler; 2055 2056 down_read(&event->device->event_handler_rwsem); 2057 2058 list_for_each_entry(handler, &event->device->event_handler_list, list) 2059 handler->handler(handler, event); 2060 2061 up_read(&event->device->event_handler_rwsem); 2062 } 2063 2064 static int iw_query_port(struct ib_device *device, 2065 u32 port_num, 2066 struct ib_port_attr *port_attr) 2067 { 2068 struct in_device *inetdev; 2069 struct net_device *netdev; 2070 2071 memset(port_attr, 0, sizeof(*port_attr)); 2072 2073 netdev = ib_device_get_netdev(device, port_num); 2074 if (!netdev) 2075 return -ENODEV; 2076 2077 port_attr->max_mtu = IB_MTU_4096; 2078 port_attr->active_mtu = ib_mtu_int_to_enum(netdev->mtu); 2079 2080 if (!netif_carrier_ok(netdev)) { 2081 port_attr->state = IB_PORT_DOWN; 2082 port_attr->phys_state = IB_PORT_PHYS_STATE_DISABLED; 2083 } else { 2084 rcu_read_lock(); 2085 inetdev = __in_dev_get_rcu(netdev); 2086 2087 if (inetdev && inetdev->ifa_list) { 2088 port_attr->state = IB_PORT_ACTIVE; 2089 port_attr->phys_state = IB_PORT_PHYS_STATE_LINK_UP; 2090 } else { 2091 port_attr->state = IB_PORT_INIT; 2092 port_attr->phys_state = 2093 IB_PORT_PHYS_STATE_PORT_CONFIGURATION_TRAINING; 2094 } 2095 2096 rcu_read_unlock(); 2097 } 2098 2099 dev_put(netdev); 2100 return device->ops.query_port(device, port_num, port_attr); 2101 } 2102 2103 static int __ib_query_port(struct ib_device *device, 2104 u32 port_num, 2105 struct ib_port_attr *port_attr) 2106 { 2107 int err; 2108 2109 memset(port_attr, 0, sizeof(*port_attr)); 2110 2111 err = device->ops.query_port(device, port_num, port_attr); 2112 if (err || port_attr->subnet_prefix) 2113 return err; 2114 2115 if (rdma_port_get_link_layer(device, port_num) != 2116 IB_LINK_LAYER_INFINIBAND) 2117 return 0; 2118 2119 ib_get_cached_subnet_prefix(device, port_num, 2120 &port_attr->subnet_prefix); 2121 return 0; 2122 } 2123 2124 /** 2125 * ib_query_port - Query IB port attributes 2126 * @device:Device to query 2127 * @port_num:Port number to query 2128 * @port_attr:Port attributes 2129 * 2130 * ib_query_port() returns the attributes of a port through the 2131 * @port_attr pointer. 
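 *
 * For illustration only, a minimal caller might look like this
 * (hypothetical variable names):
 *
 *	struct ib_port_attr attr;
 *	int err;
 *
 *	err = ib_query_port(ibdev, 1, &attr);
 *	if (!err)
 *		pr_info("port 1: state %d, active MTU enum %d\n",
 *			attr.state, attr.active_mtu);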
2132 */ 2133 int ib_query_port(struct ib_device *device, 2134 u32 port_num, 2135 struct ib_port_attr *port_attr) 2136 { 2137 if (!rdma_is_port_valid(device, port_num)) 2138 return -EINVAL; 2139 2140 if (rdma_protocol_iwarp(device, port_num)) 2141 return iw_query_port(device, port_num, port_attr); 2142 else 2143 return __ib_query_port(device, port_num, port_attr); 2144 } 2145 EXPORT_SYMBOL(ib_query_port); 2146 2147 static void add_ndev_hash(struct ib_port_data *pdata) 2148 { 2149 unsigned long flags; 2150 2151 might_sleep(); 2152 2153 spin_lock_irqsave(&ndev_hash_lock, flags); 2154 if (hash_hashed(&pdata->ndev_hash_link)) { 2155 hash_del_rcu(&pdata->ndev_hash_link); 2156 spin_unlock_irqrestore(&ndev_hash_lock, flags); 2157 /* 2158 * We cannot do hash_add_rcu after a hash_del_rcu until the 2159 * grace period 2160 */ 2161 synchronize_rcu(); 2162 spin_lock_irqsave(&ndev_hash_lock, flags); 2163 } 2164 if (pdata->netdev) 2165 hash_add_rcu(ndev_hash, &pdata->ndev_hash_link, 2166 (uintptr_t)pdata->netdev); 2167 spin_unlock_irqrestore(&ndev_hash_lock, flags); 2168 } 2169 2170 /** 2171 * ib_device_set_netdev - Associate the ib_dev with an underlying net_device 2172 * @ib_dev: Device to modify 2173 * @ndev: net_device to affiliate, may be NULL 2174 * @port: IB port the net_device is connected to 2175 * 2176 * Drivers should use this to link the ib_device to a netdev so the netdev 2177 * shows up in interfaces like ib_enum_roce_netdev. Only one netdev may be 2178 * affiliated with any port. 2179 * 2180 * The caller must ensure that the given ndev is not unregistered or 2181 * unregistering, and that either the ib_device is unregistered or 2182 * ib_device_set_netdev() is called with NULL when the ndev sends a 2183 * NETDEV_UNREGISTER event. 2184 */ 2185 int ib_device_set_netdev(struct ib_device *ib_dev, struct net_device *ndev, 2186 u32 port) 2187 { 2188 enum rdma_nl_notify_event_type etype; 2189 struct net_device *old_ndev; 2190 struct ib_port_data *pdata; 2191 unsigned long flags; 2192 int ret; 2193 2194 if (!rdma_is_port_valid(ib_dev, port)) 2195 return -EINVAL; 2196 2197 /* 2198 * Drivers wish to call this before ib_register_driver, so we have to 2199 * setup the port data early. 2200 */ 2201 ret = alloc_port_data(ib_dev); 2202 if (ret) 2203 return ret; 2204 2205 pdata = &ib_dev->port_data[port]; 2206 spin_lock_irqsave(&pdata->netdev_lock, flags); 2207 old_ndev = rcu_dereference_protected( 2208 pdata->netdev, lockdep_is_held(&pdata->netdev_lock)); 2209 if (old_ndev == ndev) { 2210 spin_unlock_irqrestore(&pdata->netdev_lock, flags); 2211 return 0; 2212 } 2213 2214 rcu_assign_pointer(pdata->netdev, ndev); 2215 netdev_put(old_ndev, &pdata->netdev_tracker); 2216 netdev_hold(ndev, &pdata->netdev_tracker, GFP_ATOMIC); 2217 spin_unlock_irqrestore(&pdata->netdev_lock, flags); 2218 2219 add_ndev_hash(pdata); 2220 2221 /* Make sure that the device is registered before we send events */ 2222 if (xa_load(&devices, ib_dev->index) != ib_dev) 2223 return 0; 2224 2225 etype = ndev ? 
RDMA_NETDEV_ATTACH_EVENT : RDMA_NETDEV_DETACH_EVENT;
2226 rdma_nl_notify_event(ib_dev, port, etype);
2227
2228 return 0;
2229 }
2230 EXPORT_SYMBOL(ib_device_set_netdev);
2231
2232 static void free_netdevs(struct ib_device *ib_dev)
2233 {
2234 unsigned long flags;
2235 u32 port;
2236
2237 if (!ib_dev->port_data)
2238 return;
2239
2240 rdma_for_each_port (ib_dev, port) {
2241 struct ib_port_data *pdata = &ib_dev->port_data[port];
2242 struct net_device *ndev;
2243
2244 spin_lock_irqsave(&pdata->netdev_lock, flags);
2245 ndev = rcu_dereference_protected(
2246 pdata->netdev, lockdep_is_held(&pdata->netdev_lock));
2247 if (ndev) {
2248 spin_lock(&ndev_hash_lock);
2249 hash_del_rcu(&pdata->ndev_hash_link);
2250 spin_unlock(&ndev_hash_lock);
2251
2252 /*
2253 * If this is the last dev_put there is still a
2254 * synchronize_rcu before the netdev is kfreed, so we
2255 * can continue to rely on unlocked pointer
2256 * comparisons after the put
2257 */
2258 rcu_assign_pointer(pdata->netdev, NULL);
2259 netdev_put(ndev, &pdata->netdev_tracker);
2260 }
2261 spin_unlock_irqrestore(&pdata->netdev_lock, flags);
2262 }
2263 }
2264
2265 struct net_device *ib_device_get_netdev(struct ib_device *ib_dev,
2266 u32 port)
2267 {
2268 struct ib_port_data *pdata;
2269 struct net_device *res;
2270
2271 if (!rdma_is_port_valid(ib_dev, port))
2272 return NULL;
2273
2274 if (!ib_dev->port_data)
2275 return NULL;
2276
2277 pdata = &ib_dev->port_data[port];
2278
2279 /*
2280 * New drivers should use ib_device_set_netdev() not the legacy
2281 * get_netdev().
2282 */
2283 if (ib_dev->ops.get_netdev)
2284 res = ib_dev->ops.get_netdev(ib_dev, port);
2285 else {
2286 spin_lock(&pdata->netdev_lock);
2287 res = rcu_dereference_protected(
2288 pdata->netdev, lockdep_is_held(&pdata->netdev_lock));
2289 dev_hold(res);
2290 spin_unlock(&pdata->netdev_lock);
2291 }
2292
2293 return res;
2294 }
2295 EXPORT_SYMBOL(ib_device_get_netdev);
2296
2297 /**
2298 * ib_device_get_by_netdev - Find an IB device associated with a netdev
2299 * @ndev: netdev to locate
2300 * @driver_id: The driver ID that must match (RDMA_DRIVER_UNKNOWN matches all)
2301 *
2302 * Find and hold an ib_device that is associated with a netdev via
2303 * ib_device_set_netdev(). The caller must call ib_device_put() on the
2304 * returned pointer.
2305 */
2306 struct ib_device *ib_device_get_by_netdev(struct net_device *ndev,
2307 enum rdma_driver_id driver_id)
2308 {
2309 struct ib_device *res = NULL;
2310 struct ib_port_data *cur;
2311
2312 rcu_read_lock();
2313 hash_for_each_possible_rcu (ndev_hash, cur, ndev_hash_link,
2314 (uintptr_t)ndev) {
2315 if (rcu_access_pointer(cur->netdev) == ndev &&
2316 (driver_id == RDMA_DRIVER_UNKNOWN ||
2317 cur->ib_dev->ops.driver_id == driver_id) &&
2318 ib_device_try_get(cur->ib_dev)) {
2319 res = cur->ib_dev;
2320 break;
2321 }
2322 }
2323 rcu_read_unlock();
2324
2325 return res;
2326 }
2327 EXPORT_SYMBOL(ib_device_get_by_netdev);
2328
2329 /**
2330 * ib_enum_roce_netdev - enumerate all RoCE ports
2331 * @ib_dev: IB device we want to query
2332 * @filter: Should we call the callback?
2333 * @filter_cookie: Cookie passed to filter
2334 * @cb: Callback to call for each found RoCE port
2335 * @cookie: Cookie passed back to the callback
2336 *
2337 * Enumerates all of the physical RoCE ports of ib_dev
2338 * that are associated with a netdevice, and calls cb() on each
2339 * port for which filter() returns a non-zero value.
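 *
 * For illustration only, a filter/callback pair might look like this
 * (hypothetical names, assuming the roce_netdev_filter and
 * roce_netdev_callback typedefs from <rdma/ib_verbs.h>):
 *
 *	static int match_ndev(struct ib_device *device, u32 port,
 *			      struct net_device *idev, void *cookie)
 *	{
 *		return idev && idev == cookie;
 *	}
 *
 *	static void report_port(struct ib_device *device, u32 port,
 *				struct net_device *idev, void *cookie)
 *	{
 *		pr_info("%s port %u is backed by %s\n",
 *			dev_name(&device->dev), port, idev->name);
 *	}
 *
 *	ib_enum_roce_netdev(ib_dev, match_ndev, target_ndev, report_port, NULL);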
2340 */
2341 void ib_enum_roce_netdev(struct ib_device *ib_dev,
2342 roce_netdev_filter filter,
2343 void *filter_cookie,
2344 roce_netdev_callback cb,
2345 void *cookie)
2346 {
2347 u32 port;
2348
2349 rdma_for_each_port (ib_dev, port)
2350 if (rdma_protocol_roce(ib_dev, port)) {
2351 struct net_device *idev =
2352 ib_device_get_netdev(ib_dev, port);
2353
2354 if (filter(ib_dev, port, idev, filter_cookie))
2355 cb(ib_dev, port, idev, cookie);
2356 dev_put(idev);
2357 }
2358 }
2359
2360 /**
2361 * ib_enum_all_roce_netdevs - enumerate all RoCE devices
2362 * @filter: Should we call the callback?
2363 * @filter_cookie: Cookie passed to filter
2364 * @cb: Callback to call for each found RoCE port
2365 * @cookie: Cookie passed back to the callback
2366 *
2367 * Enumerates the physical ports of all RoCE devices that are
2368 * associated with netdevices, and calls cb() on each port for which
2369 * filter() returns a non-zero value.
2370 */
2371 void ib_enum_all_roce_netdevs(roce_netdev_filter filter,
2372 void *filter_cookie,
2373 roce_netdev_callback cb,
2374 void *cookie)
2375 {
2376 struct ib_device *dev;
2377 unsigned long index;
2378
2379 down_read(&devices_rwsem);
2380 xa_for_each_marked (&devices, index, dev, DEVICE_REGISTERED)
2381 ib_enum_roce_netdev(dev, filter, filter_cookie, cb, cookie);
2382 up_read(&devices_rwsem);
2383 }
2384
2385 /*
2386 * ib_enum_all_devs - enumerate all ib_devices
2387 * @nldev_cb: Callback to call for each found ib_device
2388 *
2389 * Enumerates all ib_devices and calls callback() on each device.
2390 */
2391 int ib_enum_all_devs(nldev_callback nldev_cb, struct sk_buff *skb,
2392 struct netlink_callback *cb)
2393 {
2394 unsigned long index;
2395 struct ib_device *dev;
2396 unsigned int idx = 0;
2397 int ret = 0;
2398
2399 down_read(&devices_rwsem);
2400 xa_for_each_marked (&devices, index, dev, DEVICE_REGISTERED) {
2401 if (!rdma_dev_access_netns(dev, sock_net(skb->sk)))
2402 continue;
2403
2404 ret = nldev_cb(dev, skb, cb, idx);
2405 if (ret)
2406 break;
2407 idx++;
2408 }
2409 up_read(&devices_rwsem);
2410 return ret;
2411 }
2412
2413 /**
2414 * ib_query_pkey - Get P_Key table entry
2415 * @device:Device to query
2416 * @port_num:Port number to query
2417 * @index:P_Key table index to query
2418 * @pkey:Returned P_Key
2419 *
2420 * ib_query_pkey() fetches the specified P_Key table entry.
2421 */
2422 int ib_query_pkey(struct ib_device *device,
2423 u32 port_num, u16 index, u16 *pkey)
2424 {
2425 if (!rdma_is_port_valid(device, port_num))
2426 return -EINVAL;
2427
2428 if (!device->ops.query_pkey)
2429 return -EOPNOTSUPP;
2430
2431 return device->ops.query_pkey(device, port_num, index, pkey);
2432 }
2433 EXPORT_SYMBOL(ib_query_pkey);
2434
2435 /**
2436 * ib_modify_device - Change IB device attributes
2437 * @device:Device to modify
2438 * @device_modify_mask:Mask of attributes to change
2439 * @device_modify:New attribute values
2440 *
2441 * ib_modify_device() changes a device's attributes as specified by
2442 * the @device_modify_mask and @device_modify structure.
2443 */
2444 int ib_modify_device(struct ib_device *device,
2445 int device_modify_mask,
2446 struct ib_device_modify *device_modify)
2447 {
2448 if (!device->ops.modify_device)
2449 return -EOPNOTSUPP;
2450
2451 return device->ops.modify_device(device, device_modify_mask,
2452 device_modify);
2453 }
2454 EXPORT_SYMBOL(ib_modify_device);
2455
2456 /**
2457 * ib_modify_port - Modifies the attributes for the specified port.
2458 * @device: The device to modify.
2459 * @port_num: The number of the port to modify.
2460 * @port_modify_mask: Mask used to specify which attributes of the port
2461 * to change.
2462 * @port_modify: New attribute values for the port.
2463 *
2464 * ib_modify_port() changes a port's attributes as specified by the
2465 * @port_modify_mask and @port_modify structure.
2466 */
2467 int ib_modify_port(struct ib_device *device,
2468 u32 port_num, int port_modify_mask,
2469 struct ib_port_modify *port_modify)
2470 {
2471 int rc;
2472
2473 if (!rdma_is_port_valid(device, port_num))
2474 return -EINVAL;
2475
2476 if (device->ops.modify_port)
2477 rc = device->ops.modify_port(device, port_num,
2478 port_modify_mask,
2479 port_modify);
2480 else if (rdma_protocol_roce(device, port_num) &&
2481 ((port_modify->set_port_cap_mask & ~IB_PORT_CM_SUP) == 0 ||
2482 (port_modify->clr_port_cap_mask & ~IB_PORT_CM_SUP) == 0))
2483 rc = 0;
2484 else
2485 rc = -EOPNOTSUPP;
2486 return rc;
2487 }
2488 EXPORT_SYMBOL(ib_modify_port);
2489
2490 /**
2491 * ib_find_gid - Returns the port number and GID table index where
2492 * a specified GID value occurs. It searches only IB link layer ports.
2493 * @device: The device to query.
2494 * @gid: The GID value to search for.
2495 * @port_num: The port number of the device where the GID value was found.
2496 * @index: The index into the GID table where the GID was found. This
2497 * parameter may be NULL.
2498 */
2499 int ib_find_gid(struct ib_device *device, union ib_gid *gid,
2500 u32 *port_num, u16 *index)
2501 {
2502 union ib_gid tmp_gid;
2503 u32 port;
2504 int ret, i;
2505
2506 rdma_for_each_port (device, port) {
2507 if (!rdma_protocol_ib(device, port))
2508 continue;
2509
2510 for (i = 0; i < device->port_data[port].immutable.gid_tbl_len;
2511 ++i) {
2512 ret = rdma_query_gid(device, port, i, &tmp_gid);
2513 if (ret)
2514 continue;
2515
2516 if (!memcmp(&tmp_gid, gid, sizeof *gid)) {
2517 *port_num = port;
2518 if (index)
2519 *index = i;
2520 return 0;
2521 }
2522 }
2523 }
2524
2525 return -ENOENT;
2526 }
2527 EXPORT_SYMBOL(ib_find_gid);
2528
2529 /**
2530 * ib_find_pkey - Returns the PKey table index where a specified
2531 * PKey value occurs.
2532 * @device: The device to query.
2533 * @port_num: The port number of the device to search for the PKey.
2534 * @pkey: The PKey value to search for.
2535 * @index: The index into the PKey table where the PKey was found.
2536 */
2537 int ib_find_pkey(struct ib_device *device,
2538 u32 port_num, u16 pkey, u16 *index)
2539 {
2540 int ret, i;
2541 u16 tmp_pkey;
2542 int partial_ix = -1;
2543
2544 for (i = 0; i < device->port_data[port_num].immutable.pkey_tbl_len;
2545 ++i) {
2546 ret = ib_query_pkey(device, port_num, i, &tmp_pkey);
2547 if (ret)
2548 return ret;
2549 if ((pkey & 0x7fff) == (tmp_pkey & 0x7fff)) {
2550 /* if there is a full-member pkey, take it */
2551 if (tmp_pkey & 0x8000) {
2552 *index = i;
2553 return 0;
2554 }
2555 if (partial_ix < 0)
2556 partial_ix = i;
2557 }
2558 }
2559
2560 /* no full member; if a limited member exists, take it */
2561 if (partial_ix >= 0) {
2562 *index = partial_ix;
2563 return 0;
2564 }
2565 return -ENOENT;
2566 }
2567 EXPORT_SYMBOL(ib_find_pkey);
2568
2569 /**
2570 * ib_get_net_dev_by_params() - Return the appropriate net_dev
2571 * for a received CM request
2572 * @dev: An RDMA device on which the request has been received.
2573 * @port: Port number on the RDMA device.
2574 * @pkey: The Pkey the request came on.
2575 * @gid: A GID that the net_dev uses to communicate.
2576 * @addr: Contains the IP address that the request specified as its 2577 * destination. 2578 * 2579 */ 2580 struct net_device *ib_get_net_dev_by_params(struct ib_device *dev, 2581 u32 port, 2582 u16 pkey, 2583 const union ib_gid *gid, 2584 const struct sockaddr *addr) 2585 { 2586 struct net_device *net_dev = NULL; 2587 unsigned long index; 2588 void *client_data; 2589 2590 if (!rdma_protocol_ib(dev, port)) 2591 return NULL; 2592 2593 /* 2594 * Holding the read side guarantees that the client will not become 2595 * unregistered while we are calling get_net_dev_by_params() 2596 */ 2597 down_read(&dev->client_data_rwsem); 2598 xan_for_each_marked (&dev->client_data, index, client_data, 2599 CLIENT_DATA_REGISTERED) { 2600 struct ib_client *client = xa_load(&clients, index); 2601 2602 if (!client || !client->get_net_dev_by_params) 2603 continue; 2604 2605 net_dev = client->get_net_dev_by_params(dev, port, pkey, gid, 2606 addr, client_data); 2607 if (net_dev) 2608 break; 2609 } 2610 up_read(&dev->client_data_rwsem); 2611 2612 return net_dev; 2613 } 2614 EXPORT_SYMBOL(ib_get_net_dev_by_params); 2615 2616 void ib_set_device_ops(struct ib_device *dev, const struct ib_device_ops *ops) 2617 { 2618 struct ib_device_ops *dev_ops = &dev->ops; 2619 #define SET_DEVICE_OP(ptr, name) \ 2620 do { \ 2621 if (ops->name) \ 2622 if (!((ptr)->name)) \ 2623 (ptr)->name = ops->name; \ 2624 } while (0) 2625 2626 #define SET_OBJ_SIZE(ptr, name) SET_DEVICE_OP(ptr, size_##name) 2627 2628 if (ops->driver_id != RDMA_DRIVER_UNKNOWN) { 2629 WARN_ON(dev_ops->driver_id != RDMA_DRIVER_UNKNOWN && 2630 dev_ops->driver_id != ops->driver_id); 2631 dev_ops->driver_id = ops->driver_id; 2632 } 2633 if (ops->owner) { 2634 WARN_ON(dev_ops->owner && dev_ops->owner != ops->owner); 2635 dev_ops->owner = ops->owner; 2636 } 2637 if (ops->uverbs_abi_ver) 2638 dev_ops->uverbs_abi_ver = ops->uverbs_abi_ver; 2639 2640 dev_ops->uverbs_no_driver_id_binding |= 2641 ops->uverbs_no_driver_id_binding; 2642 2643 SET_DEVICE_OP(dev_ops, add_gid); 2644 SET_DEVICE_OP(dev_ops, add_sub_dev); 2645 SET_DEVICE_OP(dev_ops, advise_mr); 2646 SET_DEVICE_OP(dev_ops, alloc_dm); 2647 SET_DEVICE_OP(dev_ops, alloc_hw_device_stats); 2648 SET_DEVICE_OP(dev_ops, alloc_hw_port_stats); 2649 SET_DEVICE_OP(dev_ops, alloc_mr); 2650 SET_DEVICE_OP(dev_ops, alloc_mr_integrity); 2651 SET_DEVICE_OP(dev_ops, alloc_mw); 2652 SET_DEVICE_OP(dev_ops, alloc_pd); 2653 SET_DEVICE_OP(dev_ops, alloc_rdma_netdev); 2654 SET_DEVICE_OP(dev_ops, alloc_ucontext); 2655 SET_DEVICE_OP(dev_ops, alloc_xrcd); 2656 SET_DEVICE_OP(dev_ops, attach_mcast); 2657 SET_DEVICE_OP(dev_ops, check_mr_status); 2658 SET_DEVICE_OP(dev_ops, counter_alloc_stats); 2659 SET_DEVICE_OP(dev_ops, counter_bind_qp); 2660 SET_DEVICE_OP(dev_ops, counter_dealloc); 2661 SET_DEVICE_OP(dev_ops, counter_unbind_qp); 2662 SET_DEVICE_OP(dev_ops, counter_update_stats); 2663 SET_DEVICE_OP(dev_ops, create_ah); 2664 SET_DEVICE_OP(dev_ops, create_counters); 2665 SET_DEVICE_OP(dev_ops, create_cq); 2666 SET_DEVICE_OP(dev_ops, create_flow); 2667 SET_DEVICE_OP(dev_ops, create_qp); 2668 SET_DEVICE_OP(dev_ops, create_rwq_ind_table); 2669 SET_DEVICE_OP(dev_ops, create_srq); 2670 SET_DEVICE_OP(dev_ops, create_user_ah); 2671 SET_DEVICE_OP(dev_ops, create_wq); 2672 SET_DEVICE_OP(dev_ops, dealloc_dm); 2673 SET_DEVICE_OP(dev_ops, dealloc_driver); 2674 SET_DEVICE_OP(dev_ops, dealloc_mw); 2675 SET_DEVICE_OP(dev_ops, dealloc_pd); 2676 SET_DEVICE_OP(dev_ops, dealloc_ucontext); 2677 SET_DEVICE_OP(dev_ops, dealloc_xrcd); 2678 SET_DEVICE_OP(dev_ops, 
del_gid); 2679 SET_DEVICE_OP(dev_ops, del_sub_dev); 2680 SET_DEVICE_OP(dev_ops, dereg_mr); 2681 SET_DEVICE_OP(dev_ops, destroy_ah); 2682 SET_DEVICE_OP(dev_ops, destroy_counters); 2683 SET_DEVICE_OP(dev_ops, destroy_cq); 2684 SET_DEVICE_OP(dev_ops, destroy_flow); 2685 SET_DEVICE_OP(dev_ops, destroy_flow_action); 2686 SET_DEVICE_OP(dev_ops, destroy_qp); 2687 SET_DEVICE_OP(dev_ops, destroy_rwq_ind_table); 2688 SET_DEVICE_OP(dev_ops, destroy_srq); 2689 SET_DEVICE_OP(dev_ops, destroy_wq); 2690 SET_DEVICE_OP(dev_ops, device_group); 2691 SET_DEVICE_OP(dev_ops, detach_mcast); 2692 SET_DEVICE_OP(dev_ops, disassociate_ucontext); 2693 SET_DEVICE_OP(dev_ops, drain_rq); 2694 SET_DEVICE_OP(dev_ops, drain_sq); 2695 SET_DEVICE_OP(dev_ops, enable_driver); 2696 SET_DEVICE_OP(dev_ops, fill_res_cm_id_entry); 2697 SET_DEVICE_OP(dev_ops, fill_res_cq_entry); 2698 SET_DEVICE_OP(dev_ops, fill_res_cq_entry_raw); 2699 SET_DEVICE_OP(dev_ops, fill_res_mr_entry); 2700 SET_DEVICE_OP(dev_ops, fill_res_mr_entry_raw); 2701 SET_DEVICE_OP(dev_ops, fill_res_qp_entry); 2702 SET_DEVICE_OP(dev_ops, fill_res_qp_entry_raw); 2703 SET_DEVICE_OP(dev_ops, fill_res_srq_entry); 2704 SET_DEVICE_OP(dev_ops, fill_res_srq_entry_raw); 2705 SET_DEVICE_OP(dev_ops, fill_stat_mr_entry); 2706 SET_DEVICE_OP(dev_ops, get_dev_fw_str); 2707 SET_DEVICE_OP(dev_ops, get_dma_mr); 2708 SET_DEVICE_OP(dev_ops, get_hw_stats); 2709 SET_DEVICE_OP(dev_ops, get_link_layer); 2710 SET_DEVICE_OP(dev_ops, get_netdev); 2711 SET_DEVICE_OP(dev_ops, get_numa_node); 2712 SET_DEVICE_OP(dev_ops, get_port_immutable); 2713 SET_DEVICE_OP(dev_ops, get_vector_affinity); 2714 SET_DEVICE_OP(dev_ops, get_vf_config); 2715 SET_DEVICE_OP(dev_ops, get_vf_guid); 2716 SET_DEVICE_OP(dev_ops, get_vf_stats); 2717 SET_DEVICE_OP(dev_ops, iw_accept); 2718 SET_DEVICE_OP(dev_ops, iw_add_ref); 2719 SET_DEVICE_OP(dev_ops, iw_connect); 2720 SET_DEVICE_OP(dev_ops, iw_create_listen); 2721 SET_DEVICE_OP(dev_ops, iw_destroy_listen); 2722 SET_DEVICE_OP(dev_ops, iw_get_qp); 2723 SET_DEVICE_OP(dev_ops, iw_reject); 2724 SET_DEVICE_OP(dev_ops, iw_rem_ref); 2725 SET_DEVICE_OP(dev_ops, map_mr_sg); 2726 SET_DEVICE_OP(dev_ops, map_mr_sg_pi); 2727 SET_DEVICE_OP(dev_ops, mmap); 2728 SET_DEVICE_OP(dev_ops, mmap_free); 2729 SET_DEVICE_OP(dev_ops, modify_ah); 2730 SET_DEVICE_OP(dev_ops, modify_cq); 2731 SET_DEVICE_OP(dev_ops, modify_device); 2732 SET_DEVICE_OP(dev_ops, modify_hw_stat); 2733 SET_DEVICE_OP(dev_ops, modify_port); 2734 SET_DEVICE_OP(dev_ops, modify_qp); 2735 SET_DEVICE_OP(dev_ops, modify_srq); 2736 SET_DEVICE_OP(dev_ops, modify_wq); 2737 SET_DEVICE_OP(dev_ops, peek_cq); 2738 SET_DEVICE_OP(dev_ops, poll_cq); 2739 SET_DEVICE_OP(dev_ops, port_groups); 2740 SET_DEVICE_OP(dev_ops, post_recv); 2741 SET_DEVICE_OP(dev_ops, post_send); 2742 SET_DEVICE_OP(dev_ops, post_srq_recv); 2743 SET_DEVICE_OP(dev_ops, process_mad); 2744 SET_DEVICE_OP(dev_ops, query_ah); 2745 SET_DEVICE_OP(dev_ops, query_device); 2746 SET_DEVICE_OP(dev_ops, query_gid); 2747 SET_DEVICE_OP(dev_ops, query_pkey); 2748 SET_DEVICE_OP(dev_ops, query_port); 2749 SET_DEVICE_OP(dev_ops, query_qp); 2750 SET_DEVICE_OP(dev_ops, query_srq); 2751 SET_DEVICE_OP(dev_ops, query_ucontext); 2752 SET_DEVICE_OP(dev_ops, rdma_netdev_get_params); 2753 SET_DEVICE_OP(dev_ops, read_counters); 2754 SET_DEVICE_OP(dev_ops, reg_dm_mr); 2755 SET_DEVICE_OP(dev_ops, reg_user_mr); 2756 SET_DEVICE_OP(dev_ops, reg_user_mr_dmabuf); 2757 SET_DEVICE_OP(dev_ops, req_notify_cq); 2758 SET_DEVICE_OP(dev_ops, rereg_user_mr); 2759 SET_DEVICE_OP(dev_ops, resize_cq); 2760 
SET_DEVICE_OP(dev_ops, set_vf_guid); 2761 SET_DEVICE_OP(dev_ops, set_vf_link_state); 2762 2763 SET_OBJ_SIZE(dev_ops, ib_ah); 2764 SET_OBJ_SIZE(dev_ops, ib_counters); 2765 SET_OBJ_SIZE(dev_ops, ib_cq); 2766 SET_OBJ_SIZE(dev_ops, ib_mw); 2767 SET_OBJ_SIZE(dev_ops, ib_pd); 2768 SET_OBJ_SIZE(dev_ops, ib_qp); 2769 SET_OBJ_SIZE(dev_ops, ib_rwq_ind_table); 2770 SET_OBJ_SIZE(dev_ops, ib_srq); 2771 SET_OBJ_SIZE(dev_ops, ib_ucontext); 2772 SET_OBJ_SIZE(dev_ops, ib_xrcd); 2773 } 2774 EXPORT_SYMBOL(ib_set_device_ops); 2775 2776 int ib_add_sub_device(struct ib_device *parent, 2777 enum rdma_nl_dev_type type, 2778 const char *name) 2779 { 2780 struct ib_device *sub; 2781 int ret = 0; 2782 2783 if (!parent->ops.add_sub_dev || !parent->ops.del_sub_dev) 2784 return -EOPNOTSUPP; 2785 2786 if (!ib_device_try_get(parent)) 2787 return -EINVAL; 2788 2789 sub = parent->ops.add_sub_dev(parent, type, name); 2790 if (IS_ERR(sub)) { 2791 ib_device_put(parent); 2792 return PTR_ERR(sub); 2793 } 2794 2795 sub->type = type; 2796 sub->parent = parent; 2797 2798 mutex_lock(&parent->subdev_lock); 2799 list_add_tail(&parent->subdev_list_head, &sub->subdev_list); 2800 mutex_unlock(&parent->subdev_lock); 2801 2802 return ret; 2803 } 2804 EXPORT_SYMBOL(ib_add_sub_device); 2805 2806 int ib_del_sub_device_and_put(struct ib_device *sub) 2807 { 2808 struct ib_device *parent = sub->parent; 2809 2810 if (!parent) 2811 return -EOPNOTSUPP; 2812 2813 mutex_lock(&parent->subdev_lock); 2814 list_del(&sub->subdev_list); 2815 mutex_unlock(&parent->subdev_lock); 2816 2817 ib_device_put(sub); 2818 parent->ops.del_sub_dev(sub); 2819 ib_device_put(parent); 2820 2821 return 0; 2822 } 2823 EXPORT_SYMBOL(ib_del_sub_device_and_put); 2824 2825 #ifdef CONFIG_INFINIBAND_VIRT_DMA 2826 int ib_dma_virt_map_sg(struct ib_device *dev, struct scatterlist *sg, int nents) 2827 { 2828 struct scatterlist *s; 2829 int i; 2830 2831 for_each_sg(sg, s, nents, i) { 2832 sg_dma_address(s) = (uintptr_t)sg_virt(s); 2833 sg_dma_len(s) = s->length; 2834 } 2835 return nents; 2836 } 2837 EXPORT_SYMBOL(ib_dma_virt_map_sg); 2838 #endif /* CONFIG_INFINIBAND_VIRT_DMA */ 2839 2840 static const struct rdma_nl_cbs ibnl_ls_cb_table[RDMA_NL_LS_NUM_OPS] = { 2841 [RDMA_NL_LS_OP_RESOLVE] = { 2842 .doit = ib_nl_handle_resolve_resp, 2843 .flags = RDMA_NL_ADMIN_PERM, 2844 }, 2845 [RDMA_NL_LS_OP_SET_TIMEOUT] = { 2846 .doit = ib_nl_handle_set_timeout, 2847 .flags = RDMA_NL_ADMIN_PERM, 2848 }, 2849 [RDMA_NL_LS_OP_IP_RESOLVE] = { 2850 .doit = ib_nl_handle_ip_res_resp, 2851 .flags = RDMA_NL_ADMIN_PERM, 2852 }, 2853 }; 2854 2855 static int __init ib_core_init(void) 2856 { 2857 int ret = -ENOMEM; 2858 2859 ib_wq = alloc_workqueue("infiniband", 0, 0); 2860 if (!ib_wq) 2861 return -ENOMEM; 2862 2863 ib_unreg_wq = alloc_workqueue("ib-unreg-wq", WQ_UNBOUND, 2864 WQ_UNBOUND_MAX_ACTIVE); 2865 if (!ib_unreg_wq) 2866 goto err; 2867 2868 ib_comp_wq = alloc_workqueue("ib-comp-wq", 2869 WQ_HIGHPRI | WQ_MEM_RECLAIM | WQ_SYSFS, 0); 2870 if (!ib_comp_wq) 2871 goto err_unbound; 2872 2873 ib_comp_unbound_wq = 2874 alloc_workqueue("ib-comp-unb-wq", 2875 WQ_UNBOUND | WQ_HIGHPRI | WQ_MEM_RECLAIM | 2876 WQ_SYSFS, WQ_UNBOUND_MAX_ACTIVE); 2877 if (!ib_comp_unbound_wq) 2878 goto err_comp; 2879 2880 ret = class_register(&ib_class); 2881 if (ret) { 2882 pr_warn("Couldn't create InfiniBand device class\n"); 2883 goto err_comp_unbound; 2884 } 2885 2886 rdma_nl_init(); 2887 2888 ret = addr_init(); 2889 if (ret) { 2890 pr_warn("Couldn't init IB address resolution\n"); 2891 goto err_ibnl; 2892 } 2893 2894 ret = 
ib_mad_init(); 2895 if (ret) { 2896 pr_warn("Couldn't init IB MAD\n"); 2897 goto err_addr; 2898 } 2899 2900 ret = ib_sa_init(); 2901 if (ret) { 2902 pr_warn("Couldn't init SA\n"); 2903 goto err_mad; 2904 } 2905 2906 ret = register_blocking_lsm_notifier(&ibdev_lsm_nb); 2907 if (ret) { 2908 pr_warn("Couldn't register LSM notifier. ret %d\n", ret); 2909 goto err_sa; 2910 } 2911 2912 ret = register_pernet_device(&rdma_dev_net_ops); 2913 if (ret) { 2914 pr_warn("Couldn't init compat dev. ret %d\n", ret); 2915 goto err_compat; 2916 } 2917 2918 nldev_init(); 2919 rdma_nl_register(RDMA_NL_LS, ibnl_ls_cb_table); 2920 ret = roce_gid_mgmt_init(); 2921 if (ret) { 2922 pr_warn("Couldn't init RoCE GID management\n"); 2923 goto err_parent; 2924 } 2925 2926 return 0; 2927 2928 err_parent: 2929 rdma_nl_unregister(RDMA_NL_LS); 2930 nldev_exit(); 2931 unregister_pernet_device(&rdma_dev_net_ops); 2932 err_compat: 2933 unregister_blocking_lsm_notifier(&ibdev_lsm_nb); 2934 err_sa: 2935 ib_sa_cleanup(); 2936 err_mad: 2937 ib_mad_cleanup(); 2938 err_addr: 2939 addr_cleanup(); 2940 err_ibnl: 2941 class_unregister(&ib_class); 2942 err_comp_unbound: 2943 destroy_workqueue(ib_comp_unbound_wq); 2944 err_comp: 2945 destroy_workqueue(ib_comp_wq); 2946 err_unbound: 2947 destroy_workqueue(ib_unreg_wq); 2948 err: 2949 destroy_workqueue(ib_wq); 2950 return ret; 2951 } 2952 2953 static void __exit ib_core_cleanup(void) 2954 { 2955 roce_gid_mgmt_cleanup(); 2956 rdma_nl_unregister(RDMA_NL_LS); 2957 nldev_exit(); 2958 unregister_pernet_device(&rdma_dev_net_ops); 2959 unregister_blocking_lsm_notifier(&ibdev_lsm_nb); 2960 ib_sa_cleanup(); 2961 ib_mad_cleanup(); 2962 addr_cleanup(); 2963 rdma_nl_exit(); 2964 class_unregister(&ib_class); 2965 destroy_workqueue(ib_comp_unbound_wq); 2966 destroy_workqueue(ib_comp_wq); 2967 /* Make sure that any pending umem accounting work is done. */ 2968 destroy_workqueue(ib_wq); 2969 destroy_workqueue(ib_unreg_wq); 2970 WARN_ON(!xa_empty(&clients)); 2971 WARN_ON(!xa_empty(&devices)); 2972 } 2973 2974 MODULE_ALIAS_RDMA_NETLINK(RDMA_NL_LS, 4); 2975 2976 /* ib core relies on netdev stack to first register net_ns_type_operations 2977 * ns kobject type before ib_core initialization. 2978 */ 2979 fs_initcall(ib_core_init); 2980 module_exit(ib_core_cleanup); 2981