1 /* 2 * Copyright (c) 2004 Topspin Communications. All rights reserved. 3 * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved. 4 * 5 * This software is available to you under a choice of one of two 6 * licenses. You may choose to be licensed under the terms of the GNU 7 * General Public License (GPL) Version 2, available from the file 8 * COPYING in the main directory of this source tree, or the 9 * OpenIB.org BSD license below: 10 * 11 * Redistribution and use in source and binary forms, with or 12 * without modification, are permitted provided that the following 13 * conditions are met: 14 * 15 * - Redistributions of source code must retain the above 16 * copyright notice, this list of conditions and the following 17 * disclaimer. 18 * 19 * - Redistributions in binary form must reproduce the above 20 * copyright notice, this list of conditions and the following 21 * disclaimer in the documentation and/or other materials 22 * provided with the distribution. 23 * 24 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 25 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 26 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 27 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 28 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN 29 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 30 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 31 * SOFTWARE. 32 */ 33 34 #include <linux/module.h> 35 #include <linux/string.h> 36 #include <linux/errno.h> 37 #include <linux/kernel.h> 38 #include <linux/slab.h> 39 #include <linux/init.h> 40 #include <linux/netdevice.h> 41 #include <net/net_namespace.h> 42 #include <linux/security.h> 43 #include <linux/notifier.h> 44 #include <linux/hashtable.h> 45 #include <rdma/rdma_netlink.h> 46 #include <rdma/ib_addr.h> 47 #include <rdma/ib_cache.h> 48 #include <rdma/rdma_counter.h> 49 50 #include "core_priv.h" 51 #include "restrack.h" 52 53 MODULE_AUTHOR("Roland Dreier"); 54 MODULE_DESCRIPTION("core kernel InfiniBand API"); 55 MODULE_LICENSE("Dual BSD/GPL"); 56 57 struct workqueue_struct *ib_comp_wq; 58 struct workqueue_struct *ib_comp_unbound_wq; 59 struct workqueue_struct *ib_wq; 60 EXPORT_SYMBOL_GPL(ib_wq); 61 62 /* 63 * Each of the three rwsem locks (devices, clients, client_data) protects the 64 * xarray of the same name. Specifically it allows the caller to assert that 65 * the MARK will/will not be changing under the lock, and for devices and 66 * clients, that the value in the xarray is still a valid pointer. Change of 67 * the MARK is linked to the object state, so holding the lock and testing the 68 * MARK also asserts that the contained object is in a certain state. 69 * 70 * This is used to build a two stage register/unregister flow where objects 71 * can continue to be in the xarray even though they are still in progress to 72 * register/unregister. 73 * 74 * The xarray itself provides additional locking, and restartable iteration, 75 * which is also relied on. 76 * 77 * Locks should not be nested, with the exception of client_data, which is 78 * allowed to nest under the read side of the other two locks. 79 * 80 * The devices_rwsem also protects the device name list, any change or 81 * assignment of device name must also hold the write side to guarantee unique 82 * names. 83 */ 84 85 /* 86 * devices contains devices that have had their names assigned. The 87 * devices may not be registered. 
Users that care about the registration
 * status need to call ib_device_try_get() on the device to ensure it is
 * registered, and keep it registered, for the required duration.
 *
 */
static DEFINE_XARRAY_FLAGS(devices, XA_FLAGS_ALLOC);
static DECLARE_RWSEM(devices_rwsem);
#define DEVICE_REGISTERED XA_MARK_1

static u32 highest_client_id;
#define CLIENT_REGISTERED XA_MARK_1
static DEFINE_XARRAY_FLAGS(clients, XA_FLAGS_ALLOC);
static DECLARE_RWSEM(clients_rwsem);

static void ib_client_put(struct ib_client *client)
{
	if (refcount_dec_and_test(&client->uses))
		complete(&client->uses_zero);
}

/*
 * If client_data is registered then the corresponding client must also still
 * be registered.
 */
#define CLIENT_DATA_REGISTERED XA_MARK_1

unsigned int rdma_dev_net_id;

/*
 * A list of net namespaces is maintained in an xarray. This is necessary
 * because we can't get the locking right using the existing net ns list. We
 * would require an init_net callback after the list is updated.
 */
static DEFINE_XARRAY_FLAGS(rdma_nets, XA_FLAGS_ALLOC);
/*
 * rwsem to protect accessing the rdma_nets xarray entries.
 */
static DECLARE_RWSEM(rdma_nets_rwsem);

bool ib_devices_shared_netns = true;
module_param_named(netns_mode, ib_devices_shared_netns, bool, 0444);
MODULE_PARM_DESC(netns_mode,
		 "Share device among net namespaces; default=1 (shared)");
/**
 * rdma_dev_access_netns() - Return whether an rdma device can be accessed
 *			     from a specified net namespace or not.
 * @dev: Pointer to rdma device which needs to be checked
 * @net: Pointer to net namespace for which access to be checked
 *
 * When the rdma device is in shared mode, it ignores the net namespace.
 * When the rdma device is exclusive to a net namespace, rdma device net
 * namespace is checked against the specified one.
 */
bool rdma_dev_access_netns(const struct ib_device *dev, const struct net *net)
{
	return (ib_devices_shared_netns ||
		net_eq(read_pnet(&dev->coredev.rdma_net), net));
}
EXPORT_SYMBOL(rdma_dev_access_netns);

/*
 * xarray has this behavior where it won't iterate over NULL values stored in
 * allocated arrays. So we need our own iterator to see all values stored in
 * the array. This does the same thing as xa_for_each except that it also
 * returns NULL valued entries if the array is allocating. Simplified to only
 * work on simple xarrays.
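 *
 * A minimal usage sketch of the xan_for_each_marked() helper defined below,
 * mirroring its callers later in this file (it walks every client_data slot
 * carrying CLIENT_DATA_REGISTERED, including slots whose stored value is
 * NULL):
 *
 *	unsigned long index;
 *	void *client_data;
 *
 *	xan_for_each_marked(&ibdev->client_data, index, client_data,
 *			    CLIENT_DATA_REGISTERED) {
 *		struct ib_client *client = xa_load(&clients, index);
 *		...
 *	}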
153 */ 154 static void *xan_find_marked(struct xarray *xa, unsigned long *indexp, 155 xa_mark_t filter) 156 { 157 XA_STATE(xas, xa, *indexp); 158 void *entry; 159 160 rcu_read_lock(); 161 do { 162 entry = xas_find_marked(&xas, ULONG_MAX, filter); 163 if (xa_is_zero(entry)) 164 break; 165 } while (xas_retry(&xas, entry)); 166 rcu_read_unlock(); 167 168 if (entry) { 169 *indexp = xas.xa_index; 170 if (xa_is_zero(entry)) 171 return NULL; 172 return entry; 173 } 174 return XA_ERROR(-ENOENT); 175 } 176 #define xan_for_each_marked(xa, index, entry, filter) \ 177 for (index = 0, entry = xan_find_marked(xa, &(index), filter); \ 178 !xa_is_err(entry); \ 179 (index)++, entry = xan_find_marked(xa, &(index), filter)) 180 181 /* RCU hash table mapping netdevice pointers to struct ib_port_data */ 182 static DEFINE_SPINLOCK(ndev_hash_lock); 183 static DECLARE_HASHTABLE(ndev_hash, 5); 184 185 static void free_netdevs(struct ib_device *ib_dev); 186 static void ib_unregister_work(struct work_struct *work); 187 static void __ib_unregister_device(struct ib_device *device); 188 static int ib_security_change(struct notifier_block *nb, unsigned long event, 189 void *lsm_data); 190 static void ib_policy_change_task(struct work_struct *work); 191 static DECLARE_WORK(ib_policy_change_work, ib_policy_change_task); 192 193 static void __ibdev_printk(const char *level, const struct ib_device *ibdev, 194 struct va_format *vaf) 195 { 196 if (ibdev && ibdev->dev.parent) 197 dev_printk_emit(level[1] - '0', 198 ibdev->dev.parent, 199 "%s %s %s: %pV", 200 dev_driver_string(ibdev->dev.parent), 201 dev_name(ibdev->dev.parent), 202 dev_name(&ibdev->dev), 203 vaf); 204 else if (ibdev) 205 printk("%s%s: %pV", 206 level, dev_name(&ibdev->dev), vaf); 207 else 208 printk("%s(NULL ib_device): %pV", level, vaf); 209 } 210 211 void ibdev_printk(const char *level, const struct ib_device *ibdev, 212 const char *format, ...) 213 { 214 struct va_format vaf; 215 va_list args; 216 217 va_start(args, format); 218 219 vaf.fmt = format; 220 vaf.va = &args; 221 222 __ibdev_printk(level, ibdev, &vaf); 223 224 va_end(args); 225 } 226 EXPORT_SYMBOL(ibdev_printk); 227 228 #define define_ibdev_printk_level(func, level) \ 229 void func(const struct ib_device *ibdev, const char *fmt, ...) 
\ 230 { \ 231 struct va_format vaf; \ 232 va_list args; \ 233 \ 234 va_start(args, fmt); \ 235 \ 236 vaf.fmt = fmt; \ 237 vaf.va = &args; \ 238 \ 239 __ibdev_printk(level, ibdev, &vaf); \ 240 \ 241 va_end(args); \ 242 } \ 243 EXPORT_SYMBOL(func); 244 245 define_ibdev_printk_level(ibdev_emerg, KERN_EMERG); 246 define_ibdev_printk_level(ibdev_alert, KERN_ALERT); 247 define_ibdev_printk_level(ibdev_crit, KERN_CRIT); 248 define_ibdev_printk_level(ibdev_err, KERN_ERR); 249 define_ibdev_printk_level(ibdev_warn, KERN_WARNING); 250 define_ibdev_printk_level(ibdev_notice, KERN_NOTICE); 251 define_ibdev_printk_level(ibdev_info, KERN_INFO); 252 253 static struct notifier_block ibdev_lsm_nb = { 254 .notifier_call = ib_security_change, 255 }; 256 257 static int rdma_dev_change_netns(struct ib_device *device, struct net *cur_net, 258 struct net *net); 259 260 /* Pointer to the RCU head at the start of the ib_port_data array */ 261 struct ib_port_data_rcu { 262 struct rcu_head rcu_head; 263 struct ib_port_data pdata[]; 264 }; 265 266 static void ib_device_check_mandatory(struct ib_device *device) 267 { 268 #define IB_MANDATORY_FUNC(x) { offsetof(struct ib_device_ops, x), #x } 269 static const struct { 270 size_t offset; 271 char *name; 272 } mandatory_table[] = { 273 IB_MANDATORY_FUNC(query_device), 274 IB_MANDATORY_FUNC(query_port), 275 IB_MANDATORY_FUNC(alloc_pd), 276 IB_MANDATORY_FUNC(dealloc_pd), 277 IB_MANDATORY_FUNC(create_qp), 278 IB_MANDATORY_FUNC(modify_qp), 279 IB_MANDATORY_FUNC(destroy_qp), 280 IB_MANDATORY_FUNC(post_send), 281 IB_MANDATORY_FUNC(post_recv), 282 IB_MANDATORY_FUNC(create_cq), 283 IB_MANDATORY_FUNC(destroy_cq), 284 IB_MANDATORY_FUNC(poll_cq), 285 IB_MANDATORY_FUNC(req_notify_cq), 286 IB_MANDATORY_FUNC(get_dma_mr), 287 IB_MANDATORY_FUNC(reg_user_mr), 288 IB_MANDATORY_FUNC(dereg_mr), 289 IB_MANDATORY_FUNC(get_port_immutable) 290 }; 291 int i; 292 293 device->kverbs_provider = true; 294 for (i = 0; i < ARRAY_SIZE(mandatory_table); ++i) { 295 if (!*(void **) ((void *) &device->ops + 296 mandatory_table[i].offset)) { 297 device->kverbs_provider = false; 298 break; 299 } 300 } 301 } 302 303 /* 304 * Caller must perform ib_device_put() to return the device reference count 305 * when ib_device_get_by_index() returns valid device pointer. 306 */ 307 struct ib_device *ib_device_get_by_index(const struct net *net, u32 index) 308 { 309 struct ib_device *device; 310 311 down_read(&devices_rwsem); 312 device = xa_load(&devices, index); 313 if (device) { 314 if (!rdma_dev_access_netns(device, net)) { 315 device = NULL; 316 goto out; 317 } 318 319 if (!ib_device_try_get(device)) 320 device = NULL; 321 } 322 out: 323 up_read(&devices_rwsem); 324 return device; 325 } 326 327 /** 328 * ib_device_put - Release IB device reference 329 * @device: device whose reference to be released 330 * 331 * ib_device_put() releases reference to the IB device to allow it to be 332 * unregistered and eventually free. 
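 *
 * A minimal sketch of the expected pairing with ib_device_get_by_index()
 * above (error handling elided); holding the reference keeps an in-progress
 * unregistration from completing until the put:
 *
 *	struct ib_device *dev = ib_device_get_by_index(net, index);
 *
 *	if (dev) {
 *		...use dev...
 *		ib_device_put(dev);
 *	}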
333 */ 334 void ib_device_put(struct ib_device *device) 335 { 336 if (refcount_dec_and_test(&device->refcount)) 337 complete(&device->unreg_completion); 338 } 339 EXPORT_SYMBOL(ib_device_put); 340 341 static struct ib_device *__ib_device_get_by_name(const char *name) 342 { 343 struct ib_device *device; 344 unsigned long index; 345 346 xa_for_each (&devices, index, device) 347 if (!strcmp(name, dev_name(&device->dev))) 348 return device; 349 350 return NULL; 351 } 352 353 /** 354 * ib_device_get_by_name - Find an IB device by name 355 * @name: The name to look for 356 * @driver_id: The driver ID that must match (RDMA_DRIVER_UNKNOWN matches all) 357 * 358 * Find and hold an ib_device by its name. The caller must call 359 * ib_device_put() on the returned pointer. 360 */ 361 struct ib_device *ib_device_get_by_name(const char *name, 362 enum rdma_driver_id driver_id) 363 { 364 struct ib_device *device; 365 366 down_read(&devices_rwsem); 367 device = __ib_device_get_by_name(name); 368 if (device && driver_id != RDMA_DRIVER_UNKNOWN && 369 device->ops.driver_id != driver_id) 370 device = NULL; 371 372 if (device) { 373 if (!ib_device_try_get(device)) 374 device = NULL; 375 } 376 up_read(&devices_rwsem); 377 return device; 378 } 379 EXPORT_SYMBOL(ib_device_get_by_name); 380 381 static int rename_compat_devs(struct ib_device *device) 382 { 383 struct ib_core_device *cdev; 384 unsigned long index; 385 int ret = 0; 386 387 mutex_lock(&device->compat_devs_mutex); 388 xa_for_each (&device->compat_devs, index, cdev) { 389 ret = device_rename(&cdev->dev, dev_name(&device->dev)); 390 if (ret) { 391 dev_warn(&cdev->dev, 392 "Fail to rename compatdev to new name %s\n", 393 dev_name(&device->dev)); 394 break; 395 } 396 } 397 mutex_unlock(&device->compat_devs_mutex); 398 return ret; 399 } 400 401 int ib_device_rename(struct ib_device *ibdev, const char *name) 402 { 403 unsigned long index; 404 void *client_data; 405 int ret; 406 407 down_write(&devices_rwsem); 408 if (!strcmp(name, dev_name(&ibdev->dev))) { 409 up_write(&devices_rwsem); 410 return 0; 411 } 412 413 if (__ib_device_get_by_name(name)) { 414 up_write(&devices_rwsem); 415 return -EEXIST; 416 } 417 418 ret = device_rename(&ibdev->dev, name); 419 if (ret) { 420 up_write(&devices_rwsem); 421 return ret; 422 } 423 424 strlcpy(ibdev->name, name, IB_DEVICE_NAME_MAX); 425 ret = rename_compat_devs(ibdev); 426 427 downgrade_write(&devices_rwsem); 428 down_read(&ibdev->client_data_rwsem); 429 xan_for_each_marked(&ibdev->client_data, index, client_data, 430 CLIENT_DATA_REGISTERED) { 431 struct ib_client *client = xa_load(&clients, index); 432 433 if (!client || !client->rename) 434 continue; 435 436 client->rename(ibdev, client_data); 437 } 438 up_read(&ibdev->client_data_rwsem); 439 up_read(&devices_rwsem); 440 return 0; 441 } 442 443 int ib_device_set_dim(struct ib_device *ibdev, u8 use_dim) 444 { 445 if (use_dim > 1) 446 return -EINVAL; 447 ibdev->use_cq_dim = use_dim; 448 449 return 0; 450 } 451 452 static int alloc_name(struct ib_device *ibdev, const char *name) 453 { 454 struct ib_device *device; 455 unsigned long index; 456 struct ida inuse; 457 int rc; 458 int i; 459 460 lockdep_assert_held_write(&devices_rwsem); 461 ida_init(&inuse); 462 xa_for_each (&devices, index, device) { 463 char buf[IB_DEVICE_NAME_MAX]; 464 465 if (sscanf(dev_name(&device->dev), name, &i) != 1) 466 continue; 467 if (i < 0 || i >= INT_MAX) 468 continue; 469 snprintf(buf, sizeof buf, name, i); 470 if (strcmp(buf, dev_name(&device->dev)) != 0) 471 continue; 472 473 rc = 
ida_alloc_range(&inuse, i, i, GFP_KERNEL); 474 if (rc < 0) 475 goto out; 476 } 477 478 rc = ida_alloc(&inuse, GFP_KERNEL); 479 if (rc < 0) 480 goto out; 481 482 rc = dev_set_name(&ibdev->dev, name, rc); 483 out: 484 ida_destroy(&inuse); 485 return rc; 486 } 487 488 static void ib_device_release(struct device *device) 489 { 490 struct ib_device *dev = container_of(device, struct ib_device, dev); 491 492 free_netdevs(dev); 493 WARN_ON(refcount_read(&dev->refcount)); 494 if (dev->hw_stats_data) 495 ib_device_release_hw_stats(dev->hw_stats_data); 496 if (dev->port_data) { 497 ib_cache_release_one(dev); 498 ib_security_release_port_pkey_list(dev); 499 rdma_counter_release(dev); 500 kfree_rcu(container_of(dev->port_data, struct ib_port_data_rcu, 501 pdata[0]), 502 rcu_head); 503 } 504 505 mutex_destroy(&dev->unregistration_lock); 506 mutex_destroy(&dev->compat_devs_mutex); 507 508 xa_destroy(&dev->compat_devs); 509 xa_destroy(&dev->client_data); 510 kfree_rcu(dev, rcu_head); 511 } 512 513 static int ib_device_uevent(struct device *device, 514 struct kobj_uevent_env *env) 515 { 516 if (add_uevent_var(env, "NAME=%s", dev_name(device))) 517 return -ENOMEM; 518 519 /* 520 * It would be nice to pass the node GUID with the event... 521 */ 522 523 return 0; 524 } 525 526 static const void *net_namespace(struct device *d) 527 { 528 struct ib_core_device *coredev = 529 container_of(d, struct ib_core_device, dev); 530 531 return read_pnet(&coredev->rdma_net); 532 } 533 534 static struct class ib_class = { 535 .name = "infiniband", 536 .dev_release = ib_device_release, 537 .dev_uevent = ib_device_uevent, 538 .ns_type = &net_ns_type_operations, 539 .namespace = net_namespace, 540 }; 541 542 static void rdma_init_coredev(struct ib_core_device *coredev, 543 struct ib_device *dev, struct net *net) 544 { 545 /* This BUILD_BUG_ON is intended to catch layout change 546 * of union of ib_core_device and device. 547 * dev must be the first element as ib_core and providers 548 * driver uses it. Adding anything in ib_core_device before 549 * device will break this assumption. 550 */ 551 BUILD_BUG_ON(offsetof(struct ib_device, coredev.dev) != 552 offsetof(struct ib_device, dev)); 553 554 coredev->dev.class = &ib_class; 555 coredev->dev.groups = dev->groups; 556 device_initialize(&coredev->dev); 557 coredev->owner = dev; 558 INIT_LIST_HEAD(&coredev->port_list); 559 write_pnet(&coredev->rdma_net, net); 560 } 561 562 /** 563 * _ib_alloc_device - allocate an IB device struct 564 * @size:size of structure to allocate 565 * 566 * Low-level drivers should use ib_alloc_device() to allocate &struct 567 * ib_device. @size is the size of the structure to be allocated, 568 * including any private data used by the low-level driver. 569 * ib_dealloc_device() must be used to free structures allocated with 570 * ib_alloc_device(). 
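 *
 * A minimal driver-side sketch (struct and member names are hypothetical):
 * drivers normally go through the ib_alloc_device() wrapper macro from
 * <rdma/ib_verbs.h> rather than calling _ib_alloc_device() directly, and the
 * embedded struct ib_device is expected to be the first member:
 *
 *	struct hypo_dev {
 *		struct ib_device ibdev;
 *		...driver private state...
 *	};
 *
 *	struct hypo_dev *hdev = ib_alloc_device(hypo_dev, ibdev);
 *
 *	if (!hdev)
 *		return -ENOMEM;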
571 */ 572 struct ib_device *_ib_alloc_device(size_t size) 573 { 574 struct ib_device *device; 575 unsigned int i; 576 577 if (WARN_ON(size < sizeof(struct ib_device))) 578 return NULL; 579 580 device = kzalloc(size, GFP_KERNEL); 581 if (!device) 582 return NULL; 583 584 if (rdma_restrack_init(device)) { 585 kfree(device); 586 return NULL; 587 } 588 589 rdma_init_coredev(&device->coredev, device, &init_net); 590 591 INIT_LIST_HEAD(&device->event_handler_list); 592 spin_lock_init(&device->qp_open_list_lock); 593 init_rwsem(&device->event_handler_rwsem); 594 mutex_init(&device->unregistration_lock); 595 /* 596 * client_data needs to be alloc because we don't want our mark to be 597 * destroyed if the user stores NULL in the client data. 598 */ 599 xa_init_flags(&device->client_data, XA_FLAGS_ALLOC); 600 init_rwsem(&device->client_data_rwsem); 601 xa_init_flags(&device->compat_devs, XA_FLAGS_ALLOC); 602 mutex_init(&device->compat_devs_mutex); 603 init_completion(&device->unreg_completion); 604 INIT_WORK(&device->unregistration_work, ib_unregister_work); 605 606 spin_lock_init(&device->cq_pools_lock); 607 for (i = 0; i < ARRAY_SIZE(device->cq_pools); i++) 608 INIT_LIST_HEAD(&device->cq_pools[i]); 609 610 device->uverbs_cmd_mask = 611 BIT_ULL(IB_USER_VERBS_CMD_ALLOC_MW) | 612 BIT_ULL(IB_USER_VERBS_CMD_ALLOC_PD) | 613 BIT_ULL(IB_USER_VERBS_CMD_ATTACH_MCAST) | 614 BIT_ULL(IB_USER_VERBS_CMD_CLOSE_XRCD) | 615 BIT_ULL(IB_USER_VERBS_CMD_CREATE_AH) | 616 BIT_ULL(IB_USER_VERBS_CMD_CREATE_COMP_CHANNEL) | 617 BIT_ULL(IB_USER_VERBS_CMD_CREATE_CQ) | 618 BIT_ULL(IB_USER_VERBS_CMD_CREATE_QP) | 619 BIT_ULL(IB_USER_VERBS_CMD_CREATE_SRQ) | 620 BIT_ULL(IB_USER_VERBS_CMD_CREATE_XSRQ) | 621 BIT_ULL(IB_USER_VERBS_CMD_DEALLOC_MW) | 622 BIT_ULL(IB_USER_VERBS_CMD_DEALLOC_PD) | 623 BIT_ULL(IB_USER_VERBS_CMD_DEREG_MR) | 624 BIT_ULL(IB_USER_VERBS_CMD_DESTROY_AH) | 625 BIT_ULL(IB_USER_VERBS_CMD_DESTROY_CQ) | 626 BIT_ULL(IB_USER_VERBS_CMD_DESTROY_QP) | 627 BIT_ULL(IB_USER_VERBS_CMD_DESTROY_SRQ) | 628 BIT_ULL(IB_USER_VERBS_CMD_DETACH_MCAST) | 629 BIT_ULL(IB_USER_VERBS_CMD_GET_CONTEXT) | 630 BIT_ULL(IB_USER_VERBS_CMD_MODIFY_QP) | 631 BIT_ULL(IB_USER_VERBS_CMD_MODIFY_SRQ) | 632 BIT_ULL(IB_USER_VERBS_CMD_OPEN_QP) | 633 BIT_ULL(IB_USER_VERBS_CMD_OPEN_XRCD) | 634 BIT_ULL(IB_USER_VERBS_CMD_QUERY_DEVICE) | 635 BIT_ULL(IB_USER_VERBS_CMD_QUERY_PORT) | 636 BIT_ULL(IB_USER_VERBS_CMD_QUERY_QP) | 637 BIT_ULL(IB_USER_VERBS_CMD_QUERY_SRQ) | 638 BIT_ULL(IB_USER_VERBS_CMD_REG_MR) | 639 BIT_ULL(IB_USER_VERBS_CMD_REREG_MR) | 640 BIT_ULL(IB_USER_VERBS_CMD_RESIZE_CQ); 641 return device; 642 } 643 EXPORT_SYMBOL(_ib_alloc_device); 644 645 /** 646 * ib_dealloc_device - free an IB device struct 647 * @device:structure to free 648 * 649 * Free a structure allocated with ib_alloc_device(). 650 */ 651 void ib_dealloc_device(struct ib_device *device) 652 { 653 if (device->ops.dealloc_driver) 654 device->ops.dealloc_driver(device); 655 656 /* 657 * ib_unregister_driver() requires all devices to remain in the xarray 658 * while their ops are callable. The last op we call is dealloc_driver 659 * above. This is needed to create a fence on op callbacks prior to 660 * allowing the driver module to unload. 
	 */
	down_write(&devices_rwsem);
	if (xa_load(&devices, device->index) == device)
		xa_erase(&devices, device->index);
	up_write(&devices_rwsem);

	/* Expedite releasing netdev references */
	free_netdevs(device);

	WARN_ON(!xa_empty(&device->compat_devs));
	WARN_ON(!xa_empty(&device->client_data));
	WARN_ON(refcount_read(&device->refcount));
	rdma_restrack_clean(device);
	/* Balances with device_initialize */
	put_device(&device->dev);
}
EXPORT_SYMBOL(ib_dealloc_device);

/*
 * add_client_context() and remove_client_context() must be safe against
 * parallel calls on the same device - registration/unregistration of both the
 * device and client can be occurring in parallel.
 *
 * The routines need to be a fence; any caller must not return until the add
 * or remove is fully completed.
 */
static int add_client_context(struct ib_device *device,
			      struct ib_client *client)
{
	int ret = 0;

	if (!device->kverbs_provider && !client->no_kverbs_req)
		return 0;

	down_write(&device->client_data_rwsem);
	/*
	 * So long as the client is registered hold both the client and device
	 * unregistration locks.
	 */
	if (!refcount_inc_not_zero(&client->uses))
		goto out_unlock;
	refcount_inc(&device->refcount);

	/*
	 * Another caller to add_client_context got here first and has already
	 * completely initialized context.
	 */
	if (xa_get_mark(&device->client_data, client->client_id,
			CLIENT_DATA_REGISTERED))
		goto out;

	ret = xa_err(xa_store(&device->client_data, client->client_id, NULL,
			      GFP_KERNEL));
	if (ret)
		goto out;
	downgrade_write(&device->client_data_rwsem);
	if (client->add) {
		if (client->add(device)) {
			/*
			 * If a client fails to add then the error code is
			 * ignored, but we won't call any more ops on this
			 * client.
			 */
			xa_erase(&device->client_data, client->client_id);
			up_read(&device->client_data_rwsem);
			ib_device_put(device);
			ib_client_put(client);
			return 0;
		}
	}

	/* Readers shall not see a client until add has been completed */
	xa_set_mark(&device->client_data, client->client_id,
		    CLIENT_DATA_REGISTERED);
	up_read(&device->client_data_rwsem);
	return 0;

out:
	ib_device_put(device);
	ib_client_put(client);
out_unlock:
	up_write(&device->client_data_rwsem);
	return ret;
}

static void remove_client_context(struct ib_device *device,
				  unsigned int client_id)
{
	struct ib_client *client;
	void *client_data;

	down_write(&device->client_data_rwsem);
	if (!xa_get_mark(&device->client_data, client_id,
			 CLIENT_DATA_REGISTERED)) {
		up_write(&device->client_data_rwsem);
		return;
	}
	client_data = xa_load(&device->client_data, client_id);
	xa_clear_mark(&device->client_data, client_id, CLIENT_DATA_REGISTERED);
	client = xa_load(&clients, client_id);
	up_write(&device->client_data_rwsem);

	/*
	 * Notice we cannot be holding any exclusive locks when calling the
	 * remove callback as the remove callback can recurse back into any
	 * public functions in this module and thus try for any locks those
	 * functions take.
	 *
	 * For this reason clients and drivers should not call the
	 * unregistration functions while holding any locks.
	 */
	if (client->remove)
		client->remove(device, client_data);

	xa_erase(&device->client_data, client_id);
	ib_device_put(device);
	ib_client_put(client);
}

static int alloc_port_data(struct ib_device *device)
{
	struct ib_port_data_rcu *pdata_rcu;
	u32 port;

	if (device->port_data)
		return 0;

	/* This can only be called once the physical port range is defined */
	if (WARN_ON(!device->phys_port_cnt))
		return -EINVAL;

	/* Reserve U32_MAX so the logic to go over all the ports is sane */
	if (WARN_ON(device->phys_port_cnt == U32_MAX))
		return -EINVAL;

	/*
	 * device->port_data is indexed directly by the port number to make
	 * access to this data as efficient as possible.
	 *
	 * Therefore port_data is declared as a 1 based array with potential
	 * empty slots at the beginning.
	 */
	pdata_rcu = kzalloc(struct_size(pdata_rcu, pdata,
					rdma_end_port(device) + 1),
			    GFP_KERNEL);
	if (!pdata_rcu)
		return -ENOMEM;
	/*
	 * The rcu_head is put in front of the port data array and the stored
	 * pointer is adjusted since we never need to see that member until
	 * kfree_rcu.
	 */
	device->port_data = pdata_rcu->pdata;

	rdma_for_each_port (device, port) {
		struct ib_port_data *pdata = &device->port_data[port];

		pdata->ib_dev = device;
		spin_lock_init(&pdata->pkey_list_lock);
		INIT_LIST_HEAD(&pdata->pkey_list);
		spin_lock_init(&pdata->netdev_lock);
		INIT_HLIST_NODE(&pdata->ndev_hash_link);
	}
	return 0;
}

static int verify_immutable(const struct ib_device *dev, u32 port)
{
	return WARN_ON(!rdma_cap_ib_mad(dev, port) &&
		       rdma_max_mad_size(dev, port) != 0);
}

static int setup_port_data(struct ib_device *device)
{
	u32 port;
	int ret;

	ret = alloc_port_data(device);
	if (ret)
		return ret;

	rdma_for_each_port (device, port) {
		struct ib_port_data *pdata = &device->port_data[port];

		ret = device->ops.get_port_immutable(device, port,
						     &pdata->immutable);
		if (ret)
			return ret;

		if (verify_immutable(device, port))
			return -EINVAL;
	}
	return 0;
}

/**
 * ib_port_immutable_read() - Read rdma port's immutable data
 * @dev: IB device
 * @port: port number whose immutable data to read. It starts with index 1 and
 *        is valid up to and including rdma_end_port().
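 *
 * A minimal sketch of walking every port (assuming the caller already holds
 * a reference on @dev):
 *
 *	const struct ib_port_immutable *immutable;
 *	unsigned int port;
 *
 *	rdma_for_each_port(dev, port) {
 *		immutable = ib_port_immutable_read(dev, port);
 *		...use immutable->core_cap_flags, immutable->max_mad_size...
 *	}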
861 */ 862 const struct ib_port_immutable* 863 ib_port_immutable_read(struct ib_device *dev, unsigned int port) 864 { 865 WARN_ON(!rdma_is_port_valid(dev, port)); 866 return &dev->port_data[port].immutable; 867 } 868 EXPORT_SYMBOL(ib_port_immutable_read); 869 870 void ib_get_device_fw_str(struct ib_device *dev, char *str) 871 { 872 if (dev->ops.get_dev_fw_str) 873 dev->ops.get_dev_fw_str(dev, str); 874 else 875 str[0] = '\0'; 876 } 877 EXPORT_SYMBOL(ib_get_device_fw_str); 878 879 static void ib_policy_change_task(struct work_struct *work) 880 { 881 struct ib_device *dev; 882 unsigned long index; 883 884 down_read(&devices_rwsem); 885 xa_for_each_marked (&devices, index, dev, DEVICE_REGISTERED) { 886 unsigned int i; 887 888 rdma_for_each_port (dev, i) { 889 u64 sp; 890 ib_get_cached_subnet_prefix(dev, i, &sp); 891 ib_security_cache_change(dev, i, sp); 892 } 893 } 894 up_read(&devices_rwsem); 895 } 896 897 static int ib_security_change(struct notifier_block *nb, unsigned long event, 898 void *lsm_data) 899 { 900 if (event != LSM_POLICY_CHANGE) 901 return NOTIFY_DONE; 902 903 schedule_work(&ib_policy_change_work); 904 ib_mad_agent_security_change(); 905 906 return NOTIFY_OK; 907 } 908 909 static void compatdev_release(struct device *dev) 910 { 911 struct ib_core_device *cdev = 912 container_of(dev, struct ib_core_device, dev); 913 914 kfree(cdev); 915 } 916 917 static int add_one_compat_dev(struct ib_device *device, 918 struct rdma_dev_net *rnet) 919 { 920 struct ib_core_device *cdev; 921 int ret; 922 923 lockdep_assert_held(&rdma_nets_rwsem); 924 if (!ib_devices_shared_netns) 925 return 0; 926 927 /* 928 * Create and add compat device in all namespaces other than where it 929 * is currently bound to. 930 */ 931 if (net_eq(read_pnet(&rnet->net), 932 read_pnet(&device->coredev.rdma_net))) 933 return 0; 934 935 /* 936 * The first of init_net() or ib_register_device() to take the 937 * compat_devs_mutex wins and gets to add the device. Others will wait 938 * for completion here. 
939 */ 940 mutex_lock(&device->compat_devs_mutex); 941 cdev = xa_load(&device->compat_devs, rnet->id); 942 if (cdev) { 943 ret = 0; 944 goto done; 945 } 946 ret = xa_reserve(&device->compat_devs, rnet->id, GFP_KERNEL); 947 if (ret) 948 goto done; 949 950 cdev = kzalloc(sizeof(*cdev), GFP_KERNEL); 951 if (!cdev) { 952 ret = -ENOMEM; 953 goto cdev_err; 954 } 955 956 cdev->dev.parent = device->dev.parent; 957 rdma_init_coredev(cdev, device, read_pnet(&rnet->net)); 958 cdev->dev.release = compatdev_release; 959 ret = dev_set_name(&cdev->dev, "%s", dev_name(&device->dev)); 960 if (ret) 961 goto add_err; 962 963 ret = device_add(&cdev->dev); 964 if (ret) 965 goto add_err; 966 ret = ib_setup_port_attrs(cdev); 967 if (ret) 968 goto port_err; 969 970 ret = xa_err(xa_store(&device->compat_devs, rnet->id, 971 cdev, GFP_KERNEL)); 972 if (ret) 973 goto insert_err; 974 975 mutex_unlock(&device->compat_devs_mutex); 976 return 0; 977 978 insert_err: 979 ib_free_port_attrs(cdev); 980 port_err: 981 device_del(&cdev->dev); 982 add_err: 983 put_device(&cdev->dev); 984 cdev_err: 985 xa_release(&device->compat_devs, rnet->id); 986 done: 987 mutex_unlock(&device->compat_devs_mutex); 988 return ret; 989 } 990 991 static void remove_one_compat_dev(struct ib_device *device, u32 id) 992 { 993 struct ib_core_device *cdev; 994 995 mutex_lock(&device->compat_devs_mutex); 996 cdev = xa_erase(&device->compat_devs, id); 997 mutex_unlock(&device->compat_devs_mutex); 998 if (cdev) { 999 ib_free_port_attrs(cdev); 1000 device_del(&cdev->dev); 1001 put_device(&cdev->dev); 1002 } 1003 } 1004 1005 static void remove_compat_devs(struct ib_device *device) 1006 { 1007 struct ib_core_device *cdev; 1008 unsigned long index; 1009 1010 xa_for_each (&device->compat_devs, index, cdev) 1011 remove_one_compat_dev(device, index); 1012 } 1013 1014 static int add_compat_devs(struct ib_device *device) 1015 { 1016 struct rdma_dev_net *rnet; 1017 unsigned long index; 1018 int ret = 0; 1019 1020 lockdep_assert_held(&devices_rwsem); 1021 1022 down_read(&rdma_nets_rwsem); 1023 xa_for_each (&rdma_nets, index, rnet) { 1024 ret = add_one_compat_dev(device, rnet); 1025 if (ret) 1026 break; 1027 } 1028 up_read(&rdma_nets_rwsem); 1029 return ret; 1030 } 1031 1032 static void remove_all_compat_devs(void) 1033 { 1034 struct ib_compat_device *cdev; 1035 struct ib_device *dev; 1036 unsigned long index; 1037 1038 down_read(&devices_rwsem); 1039 xa_for_each (&devices, index, dev) { 1040 unsigned long c_index = 0; 1041 1042 /* Hold nets_rwsem so that any other thread modifying this 1043 * system param can sync with this thread. 1044 */ 1045 down_read(&rdma_nets_rwsem); 1046 xa_for_each (&dev->compat_devs, c_index, cdev) 1047 remove_one_compat_dev(dev, c_index); 1048 up_read(&rdma_nets_rwsem); 1049 } 1050 up_read(&devices_rwsem); 1051 } 1052 1053 static int add_all_compat_devs(void) 1054 { 1055 struct rdma_dev_net *rnet; 1056 struct ib_device *dev; 1057 unsigned long index; 1058 int ret = 0; 1059 1060 down_read(&devices_rwsem); 1061 xa_for_each_marked (&devices, index, dev, DEVICE_REGISTERED) { 1062 unsigned long net_index = 0; 1063 1064 /* Hold nets_rwsem so that any other thread modifying this 1065 * system param can sync with this thread. 
1066 */ 1067 down_read(&rdma_nets_rwsem); 1068 xa_for_each (&rdma_nets, net_index, rnet) { 1069 ret = add_one_compat_dev(dev, rnet); 1070 if (ret) 1071 break; 1072 } 1073 up_read(&rdma_nets_rwsem); 1074 } 1075 up_read(&devices_rwsem); 1076 if (ret) 1077 remove_all_compat_devs(); 1078 return ret; 1079 } 1080 1081 int rdma_compatdev_set(u8 enable) 1082 { 1083 struct rdma_dev_net *rnet; 1084 unsigned long index; 1085 int ret = 0; 1086 1087 down_write(&rdma_nets_rwsem); 1088 if (ib_devices_shared_netns == enable) { 1089 up_write(&rdma_nets_rwsem); 1090 return 0; 1091 } 1092 1093 /* enable/disable of compat devices is not supported 1094 * when more than default init_net exists. 1095 */ 1096 xa_for_each (&rdma_nets, index, rnet) { 1097 ret++; 1098 break; 1099 } 1100 if (!ret) 1101 ib_devices_shared_netns = enable; 1102 up_write(&rdma_nets_rwsem); 1103 if (ret) 1104 return -EBUSY; 1105 1106 if (enable) 1107 ret = add_all_compat_devs(); 1108 else 1109 remove_all_compat_devs(); 1110 return ret; 1111 } 1112 1113 static void rdma_dev_exit_net(struct net *net) 1114 { 1115 struct rdma_dev_net *rnet = rdma_net_to_dev_net(net); 1116 struct ib_device *dev; 1117 unsigned long index; 1118 int ret; 1119 1120 down_write(&rdma_nets_rwsem); 1121 /* 1122 * Prevent the ID from being re-used and hide the id from xa_for_each. 1123 */ 1124 ret = xa_err(xa_store(&rdma_nets, rnet->id, NULL, GFP_KERNEL)); 1125 WARN_ON(ret); 1126 up_write(&rdma_nets_rwsem); 1127 1128 down_read(&devices_rwsem); 1129 xa_for_each (&devices, index, dev) { 1130 get_device(&dev->dev); 1131 /* 1132 * Release the devices_rwsem so that pontentially blocking 1133 * device_del, doesn't hold the devices_rwsem for too long. 1134 */ 1135 up_read(&devices_rwsem); 1136 1137 remove_one_compat_dev(dev, rnet->id); 1138 1139 /* 1140 * If the real device is in the NS then move it back to init. 1141 */ 1142 rdma_dev_change_netns(dev, net, &init_net); 1143 1144 put_device(&dev->dev); 1145 down_read(&devices_rwsem); 1146 } 1147 up_read(&devices_rwsem); 1148 1149 rdma_nl_net_exit(rnet); 1150 xa_erase(&rdma_nets, rnet->id); 1151 } 1152 1153 static __net_init int rdma_dev_init_net(struct net *net) 1154 { 1155 struct rdma_dev_net *rnet = rdma_net_to_dev_net(net); 1156 unsigned long index; 1157 struct ib_device *dev; 1158 int ret; 1159 1160 write_pnet(&rnet->net, net); 1161 1162 ret = rdma_nl_net_init(rnet); 1163 if (ret) 1164 return ret; 1165 1166 /* No need to create any compat devices in default init_net. */ 1167 if (net_eq(net, &init_net)) 1168 return 0; 1169 1170 ret = xa_alloc(&rdma_nets, &rnet->id, rnet, xa_limit_32b, GFP_KERNEL); 1171 if (ret) { 1172 rdma_nl_net_exit(rnet); 1173 return ret; 1174 } 1175 1176 down_read(&devices_rwsem); 1177 xa_for_each_marked (&devices, index, dev, DEVICE_REGISTERED) { 1178 /* Hold nets_rwsem so that netlink command cannot change 1179 * system configuration for device sharing mode. 1180 */ 1181 down_read(&rdma_nets_rwsem); 1182 ret = add_one_compat_dev(dev, rnet); 1183 up_read(&rdma_nets_rwsem); 1184 if (ret) 1185 break; 1186 } 1187 up_read(&devices_rwsem); 1188 1189 if (ret) 1190 rdma_dev_exit_net(net); 1191 1192 return ret; 1193 } 1194 1195 /* 1196 * Assign the unique string device name and the unique device index. This is 1197 * undone by ib_dealloc_device. 
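 *
 * A brief illustration (names are hypothetical): a driver that passes a
 * printf-style pattern to ib_register_device(), e.g.
 *
 *	ret = ib_register_device(&hdev->ibdev, "hypo%d", dma_dev);
 *
 * ends up here, and alloc_name() picks the lowest free index, producing
 * "hypo0", "hypo1", and so on.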
1198 */ 1199 static int assign_name(struct ib_device *device, const char *name) 1200 { 1201 static u32 last_id; 1202 int ret; 1203 1204 down_write(&devices_rwsem); 1205 /* Assign a unique name to the device */ 1206 if (strchr(name, '%')) 1207 ret = alloc_name(device, name); 1208 else 1209 ret = dev_set_name(&device->dev, name); 1210 if (ret) 1211 goto out; 1212 1213 if (__ib_device_get_by_name(dev_name(&device->dev))) { 1214 ret = -ENFILE; 1215 goto out; 1216 } 1217 strlcpy(device->name, dev_name(&device->dev), IB_DEVICE_NAME_MAX); 1218 1219 ret = xa_alloc_cyclic(&devices, &device->index, device, xa_limit_31b, 1220 &last_id, GFP_KERNEL); 1221 if (ret > 0) 1222 ret = 0; 1223 1224 out: 1225 up_write(&devices_rwsem); 1226 return ret; 1227 } 1228 1229 /* 1230 * setup_device() allocates memory and sets up data that requires calling the 1231 * device ops, this is the only reason these actions are not done during 1232 * ib_alloc_device. It is undone by ib_dealloc_device(). 1233 */ 1234 static int setup_device(struct ib_device *device) 1235 { 1236 struct ib_udata uhw = {.outlen = 0, .inlen = 0}; 1237 int ret; 1238 1239 ib_device_check_mandatory(device); 1240 1241 ret = setup_port_data(device); 1242 if (ret) { 1243 dev_warn(&device->dev, "Couldn't create per-port data\n"); 1244 return ret; 1245 } 1246 1247 memset(&device->attrs, 0, sizeof(device->attrs)); 1248 ret = device->ops.query_device(device, &device->attrs, &uhw); 1249 if (ret) { 1250 dev_warn(&device->dev, 1251 "Couldn't query the device attributes\n"); 1252 return ret; 1253 } 1254 1255 return 0; 1256 } 1257 1258 static void disable_device(struct ib_device *device) 1259 { 1260 u32 cid; 1261 1262 WARN_ON(!refcount_read(&device->refcount)); 1263 1264 down_write(&devices_rwsem); 1265 xa_clear_mark(&devices, device->index, DEVICE_REGISTERED); 1266 up_write(&devices_rwsem); 1267 1268 /* 1269 * Remove clients in LIFO order, see assign_client_id. This could be 1270 * more efficient if xarray learns to reverse iterate. Since no new 1271 * clients can be added to this ib_device past this point we only need 1272 * the maximum possible client_id value here. 1273 */ 1274 down_read(&clients_rwsem); 1275 cid = highest_client_id; 1276 up_read(&clients_rwsem); 1277 while (cid) { 1278 cid--; 1279 remove_client_context(device, cid); 1280 } 1281 1282 ib_cq_pool_cleanup(device); 1283 1284 /* Pairs with refcount_set in enable_device */ 1285 ib_device_put(device); 1286 wait_for_completion(&device->unreg_completion); 1287 1288 /* 1289 * compat devices must be removed after device refcount drops to zero. 1290 * Otherwise init_net() may add more compatdevs after removing compat 1291 * devices and before device is disabled. 1292 */ 1293 remove_compat_devs(device); 1294 } 1295 1296 /* 1297 * An enabled device is visible to all clients and to all the public facing 1298 * APIs that return a device pointer. This always returns with a new get, even 1299 * if it fails. 1300 */ 1301 static int enable_device_and_get(struct ib_device *device) 1302 { 1303 struct ib_client *client; 1304 unsigned long index; 1305 int ret = 0; 1306 1307 /* 1308 * One ref belongs to the xa and the other belongs to this 1309 * thread. This is needed to guard against parallel unregistration. 1310 */ 1311 refcount_set(&device->refcount, 2); 1312 down_write(&devices_rwsem); 1313 xa_set_mark(&devices, device->index, DEVICE_REGISTERED); 1314 1315 /* 1316 * By using downgrade_write() we ensure that no other thread can clear 1317 * DEVICE_REGISTERED while we are completing the client setup. 
	 */
	downgrade_write(&devices_rwsem);

	if (device->ops.enable_driver) {
		ret = device->ops.enable_driver(device);
		if (ret)
			goto out;
	}

	down_read(&clients_rwsem);
	xa_for_each_marked (&clients, index, client, CLIENT_REGISTERED) {
		ret = add_client_context(device, client);
		if (ret)
			break;
	}
	up_read(&clients_rwsem);
	if (!ret)
		ret = add_compat_devs(device);
out:
	up_read(&devices_rwsem);
	return ret;
}

static void prevent_dealloc_device(struct ib_device *ib_dev)
{
}

/**
 * ib_register_device - Register an IB device with IB core
 * @device: Device to register
 * @name: unique string device name. This may include a '%' which will
 *        cause a unique index to be added to the passed device name.
 * @dma_device: pointer to a DMA-capable device. If %NULL, then the IB
 *              device will be used. In this case the caller should fully
 *              setup the ibdev for DMA. This usually means using dma_virt_ops.
 *
 * Low-level drivers use ib_register_device() to register their
 * devices with the IB core. All registered clients will receive a
 * callback for each device that is added. @device must be allocated
 * with ib_alloc_device().
 *
 * If the driver uses ops.dealloc_driver and calls any ib_unregister_device()
 * asynchronously then the device pointer may become freed as soon as this
 * function returns.
 */
int ib_register_device(struct ib_device *device, const char *name,
		       struct device *dma_device)
{
	int ret;

	ret = assign_name(device, name);
	if (ret)
		return ret;

	/*
	 * If the caller does not provide a DMA capable device then the IB core
	 * will set up ib_sge and scatterlist structures that stash the kernel
	 * virtual address into the address field.
	 */
	WARN_ON(dma_device && !dma_device->dma_parms);
	device->dma_device = dma_device;

	ret = setup_device(device);
	if (ret)
		return ret;

	ret = ib_cache_setup_one(device);
	if (ret) {
		dev_warn(&device->dev,
			 "Couldn't set up InfiniBand P_Key/GID cache\n");
		return ret;
	}

	device->groups[0] = &ib_dev_attr_group;
	device->groups[1] = device->ops.device_group;
	ret = ib_setup_device_attrs(device);
	if (ret)
		goto cache_cleanup;

	ib_device_register_rdmacg(device);

	rdma_counter_init(device);

	/*
	 * Ensure that ADD uevent is not fired because it
	 * is too early and the device is not initialized yet.
	 */
	dev_set_uevent_suppress(&device->dev, true);
	ret = device_add(&device->dev);
	if (ret)
		goto cg_cleanup;

	ret = ib_setup_port_attrs(&device->coredev);
	if (ret) {
		dev_warn(&device->dev,
			 "Couldn't register device with driver model\n");
		goto dev_cleanup;
	}

	ret = enable_device_and_get(device);
	if (ret) {
		void (*dealloc_fn)(struct ib_device *);

		/*
		 * If we hit this error flow then we don't want to
		 * automatically dealloc the device since the caller is
		 * expected to call ib_dealloc_device() after
		 * ib_register_device() fails. This is tricky due to the
		 * possibility for a parallel unregistration along with this
		 * error flow. Since we have a refcount here we know any
		 * parallel flow is stopped in disable_device and will see the
		 * special dealloc_driver pointer, causing the responsibility to
		 * ib_dealloc_device() to revert back to this thread.
		 */
		dealloc_fn = device->ops.dealloc_driver;
		device->ops.dealloc_driver = prevent_dealloc_device;
		ib_device_put(device);
		__ib_unregister_device(device);
		device->ops.dealloc_driver = dealloc_fn;
		dev_set_uevent_suppress(&device->dev, false);
		return ret;
	}
	dev_set_uevent_suppress(&device->dev, false);
	/* Mark for userspace that device is ready */
	kobject_uevent(&device->dev.kobj, KOBJ_ADD);
	ib_device_put(device);

	return 0;

dev_cleanup:
	device_del(&device->dev);
cg_cleanup:
	dev_set_uevent_suppress(&device->dev, false);
	ib_device_unregister_rdmacg(device);
cache_cleanup:
	ib_cache_cleanup_one(device);
	return ret;
}
EXPORT_SYMBOL(ib_register_device);

/* Callers must hold a get on the device. */
static void __ib_unregister_device(struct ib_device *ib_dev)
{
	/*
	 * We have a registration lock so that all the calls to unregister are
	 * fully fenced, once any unregister returns the device is truly
	 * unregistered even if multiple callers are unregistering it at the
	 * same time. This also interacts with the registration flow and
	 * provides sane semantics if register and unregister are racing.
	 */
	mutex_lock(&ib_dev->unregistration_lock);
	if (!refcount_read(&ib_dev->refcount))
		goto out;

	disable_device(ib_dev);

	/* Expedite removing unregistered pointers from the hash table */
	free_netdevs(ib_dev);

	ib_free_port_attrs(&ib_dev->coredev);
	device_del(&ib_dev->dev);
	ib_device_unregister_rdmacg(ib_dev);
	ib_cache_cleanup_one(ib_dev);

	/*
	 * Drivers using the new flow may not call ib_dealloc_device except
	 * in error unwind prior to registration success.
	 */
	if (ib_dev->ops.dealloc_driver &&
	    ib_dev->ops.dealloc_driver != prevent_dealloc_device) {
		WARN_ON(kref_read(&ib_dev->dev.kobj.kref) <= 1);
		ib_dealloc_device(ib_dev);
	}
out:
	mutex_unlock(&ib_dev->unregistration_lock);
}

/**
 * ib_unregister_device - Unregister an IB device
 * @ib_dev: The device to unregister
 *
 * Unregister an IB device. All clients will receive a remove callback.
 *
 * Callers should call this routine only once, and protect against races with
 * registration. Typically it should only be called as part of a remove
 * callback in an implementation of driver core's struct device_driver and
 * related.
 *
 * If ops.dealloc_driver is used then ib_dev will be freed upon return from
 * this function.
 */
void ib_unregister_device(struct ib_device *ib_dev)
{
	get_device(&ib_dev->dev);
	__ib_unregister_device(ib_dev);
	put_device(&ib_dev->dev);
}
EXPORT_SYMBOL(ib_unregister_device);

/**
 * ib_unregister_device_and_put - Unregister a device while holding a 'get'
 * @ib_dev: The device to unregister
 *
 * This is the same as ib_unregister_device(), except it includes an internal
 * ib_device_put() that should match a 'get' obtained by the caller.
 *
 * It is safe to call this routine concurrently from multiple threads while
 * holding the 'get'. When the function returns the device is fully
 * unregistered.
 *
 * Drivers using this flow MUST use the driver_unregister callback to clean up
 * their resources associated with the device and dealloc it.
 */
void ib_unregister_device_and_put(struct ib_device *ib_dev)
{
	WARN_ON(!ib_dev->ops.dealloc_driver);
	get_device(&ib_dev->dev);
	ib_device_put(ib_dev);
	__ib_unregister_device(ib_dev);
	put_device(&ib_dev->dev);
}
EXPORT_SYMBOL(ib_unregister_device_and_put);

/**
 * ib_unregister_driver - Unregister all IB devices for a driver
 * @driver_id: The driver to unregister
 *
 * This implements a fence for device unregistration. It only returns once all
 * devices associated with the driver_id have fully completed their
 * unregistration and returned from ib_unregister_device*().
 *
 * If devices are not yet unregistered, it goes ahead and starts unregistering
 * them.
 *
 * This does not block creation of new devices with the given driver_id, that
 * is the responsibility of the caller.
 */
void ib_unregister_driver(enum rdma_driver_id driver_id)
{
	struct ib_device *ib_dev;
	unsigned long index;

	down_read(&devices_rwsem);
	xa_for_each (&devices, index, ib_dev) {
		if (ib_dev->ops.driver_id != driver_id)
			continue;

		get_device(&ib_dev->dev);
		up_read(&devices_rwsem);

		WARN_ON(!ib_dev->ops.dealloc_driver);
		__ib_unregister_device(ib_dev);

		put_device(&ib_dev->dev);
		down_read(&devices_rwsem);
	}
	up_read(&devices_rwsem);
}
EXPORT_SYMBOL(ib_unregister_driver);

static void ib_unregister_work(struct work_struct *work)
{
	struct ib_device *ib_dev =
		container_of(work, struct ib_device, unregistration_work);

	__ib_unregister_device(ib_dev);
	put_device(&ib_dev->dev);
}

/**
 * ib_unregister_device_queued - Unregister a device using a work queue
 * @ib_dev: The device to unregister
 *
 * This schedules an asynchronous unregistration using a WQ for the device. A
 * driver should use this to avoid holding locks while doing unregistration,
 * such as holding the RTNL lock.
 *
 * Drivers using this API must use ib_unregister_driver before module unload
 * to ensure that all scheduled unregistrations have completed.
 */
void ib_unregister_device_queued(struct ib_device *ib_dev)
{
	WARN_ON(!refcount_read(&ib_dev->refcount));
	WARN_ON(!ib_dev->ops.dealloc_driver);
	get_device(&ib_dev->dev);
	if (!queue_work(system_unbound_wq, &ib_dev->unregistration_work))
		put_device(&ib_dev->dev);
}
EXPORT_SYMBOL(ib_unregister_device_queued);

/*
 * The caller must pass in a device that has the kref held and the refcount
 * released. If the device is in cur_net and still registered then it is moved
 * into net.
 */
static int rdma_dev_change_netns(struct ib_device *device, struct net *cur_net,
				 struct net *net)
{
	int ret2 = -EINVAL;
	int ret;

	mutex_lock(&device->unregistration_lock);

	/*
	 * If a device is not under ib_device_get() or if the unregistration_lock
	 * is not held, the namespace can be changed, or it can be unregistered.
	 * Check again under the lock.
1625 */ 1626 if (refcount_read(&device->refcount) == 0 || 1627 !net_eq(cur_net, read_pnet(&device->coredev.rdma_net))) { 1628 ret = -ENODEV; 1629 goto out; 1630 } 1631 1632 kobject_uevent(&device->dev.kobj, KOBJ_REMOVE); 1633 disable_device(device); 1634 1635 /* 1636 * At this point no one can be using the device, so it is safe to 1637 * change the namespace. 1638 */ 1639 write_pnet(&device->coredev.rdma_net, net); 1640 1641 down_read(&devices_rwsem); 1642 /* 1643 * Currently rdma devices are system wide unique. So the device name 1644 * is guaranteed free in the new namespace. Publish the new namespace 1645 * at the sysfs level. 1646 */ 1647 ret = device_rename(&device->dev, dev_name(&device->dev)); 1648 up_read(&devices_rwsem); 1649 if (ret) { 1650 dev_warn(&device->dev, 1651 "%s: Couldn't rename device after namespace change\n", 1652 __func__); 1653 /* Try and put things back and re-enable the device */ 1654 write_pnet(&device->coredev.rdma_net, cur_net); 1655 } 1656 1657 ret2 = enable_device_and_get(device); 1658 if (ret2) { 1659 /* 1660 * This shouldn't really happen, but if it does, let the user 1661 * retry at later point. So don't disable the device. 1662 */ 1663 dev_warn(&device->dev, 1664 "%s: Couldn't re-enable device after namespace change\n", 1665 __func__); 1666 } 1667 kobject_uevent(&device->dev.kobj, KOBJ_ADD); 1668 1669 ib_device_put(device); 1670 out: 1671 mutex_unlock(&device->unregistration_lock); 1672 if (ret) 1673 return ret; 1674 return ret2; 1675 } 1676 1677 int ib_device_set_netns_put(struct sk_buff *skb, 1678 struct ib_device *dev, u32 ns_fd) 1679 { 1680 struct net *net; 1681 int ret; 1682 1683 net = get_net_ns_by_fd(ns_fd); 1684 if (IS_ERR(net)) { 1685 ret = PTR_ERR(net); 1686 goto net_err; 1687 } 1688 1689 if (!netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN)) { 1690 ret = -EPERM; 1691 goto ns_err; 1692 } 1693 1694 /* 1695 * All the ib_clients, including uverbs, are reset when the namespace is 1696 * changed and this cannot be blocked waiting for userspace to do 1697 * something, so disassociation is mandatory. 1698 */ 1699 if (!dev->ops.disassociate_ucontext || ib_devices_shared_netns) { 1700 ret = -EOPNOTSUPP; 1701 goto ns_err; 1702 } 1703 1704 get_device(&dev->dev); 1705 ib_device_put(dev); 1706 ret = rdma_dev_change_netns(dev, current->nsproxy->net_ns, net); 1707 put_device(&dev->dev); 1708 1709 put_net(net); 1710 return ret; 1711 1712 ns_err: 1713 put_net(net); 1714 net_err: 1715 ib_device_put(dev); 1716 return ret; 1717 } 1718 1719 static struct pernet_operations rdma_dev_net_ops = { 1720 .init = rdma_dev_init_net, 1721 .exit = rdma_dev_exit_net, 1722 .id = &rdma_dev_net_id, 1723 .size = sizeof(struct rdma_dev_net), 1724 }; 1725 1726 static int assign_client_id(struct ib_client *client) 1727 { 1728 int ret; 1729 1730 down_write(&clients_rwsem); 1731 /* 1732 * The add/remove callbacks must be called in FIFO/LIFO order. To 1733 * achieve this we assign client_ids so they are sorted in 1734 * registration order. 
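	 *
	 * A minimal sketch of a client affected by this ordering (names are
	 * hypothetical; see ib_register_client() below): adds run in
	 * registration (FIFO) order, removes in reverse (LIFO) order.
	 *
	 *	static int hypo_add_one(struct ib_device *device)
	 *	{
	 *		...
	 *		return 0;
	 *	}
	 *
	 *	static void hypo_remove_one(struct ib_device *device,
	 *				    void *client_data)
	 *	{
	 *		...
	 *	}
	 *
	 *	static struct ib_client hypo_client = {
	 *		.name	= "hypo",
	 *		.add	= hypo_add_one,
	 *		.remove = hypo_remove_one,
	 *	};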
1735 */ 1736 client->client_id = highest_client_id; 1737 ret = xa_insert(&clients, client->client_id, client, GFP_KERNEL); 1738 if (ret) 1739 goto out; 1740 1741 highest_client_id++; 1742 xa_set_mark(&clients, client->client_id, CLIENT_REGISTERED); 1743 1744 out: 1745 up_write(&clients_rwsem); 1746 return ret; 1747 } 1748 1749 static void remove_client_id(struct ib_client *client) 1750 { 1751 down_write(&clients_rwsem); 1752 xa_erase(&clients, client->client_id); 1753 for (; highest_client_id; highest_client_id--) 1754 if (xa_load(&clients, highest_client_id - 1)) 1755 break; 1756 up_write(&clients_rwsem); 1757 } 1758 1759 /** 1760 * ib_register_client - Register an IB client 1761 * @client:Client to register 1762 * 1763 * Upper level users of the IB drivers can use ib_register_client() to 1764 * register callbacks for IB device addition and removal. When an IB 1765 * device is added, each registered client's add method will be called 1766 * (in the order the clients were registered), and when a device is 1767 * removed, each client's remove method will be called (in the reverse 1768 * order that clients were registered). In addition, when 1769 * ib_register_client() is called, the client will receive an add 1770 * callback for all devices already registered. 1771 */ 1772 int ib_register_client(struct ib_client *client) 1773 { 1774 struct ib_device *device; 1775 unsigned long index; 1776 int ret; 1777 1778 refcount_set(&client->uses, 1); 1779 init_completion(&client->uses_zero); 1780 ret = assign_client_id(client); 1781 if (ret) 1782 return ret; 1783 1784 down_read(&devices_rwsem); 1785 xa_for_each_marked (&devices, index, device, DEVICE_REGISTERED) { 1786 ret = add_client_context(device, client); 1787 if (ret) { 1788 up_read(&devices_rwsem); 1789 ib_unregister_client(client); 1790 return ret; 1791 } 1792 } 1793 up_read(&devices_rwsem); 1794 return 0; 1795 } 1796 EXPORT_SYMBOL(ib_register_client); 1797 1798 /** 1799 * ib_unregister_client - Unregister an IB client 1800 * @client:Client to unregister 1801 * 1802 * Upper level users use ib_unregister_client() to remove their client 1803 * registration. When ib_unregister_client() is called, the client 1804 * will receive a remove callback for each IB device still registered. 1805 * 1806 * This is a full fence, once it returns no client callbacks will be called, 1807 * or are running in another thread. 1808 */ 1809 void ib_unregister_client(struct ib_client *client) 1810 { 1811 struct ib_device *device; 1812 unsigned long index; 1813 1814 down_write(&clients_rwsem); 1815 ib_client_put(client); 1816 xa_clear_mark(&clients, client->client_id, CLIENT_REGISTERED); 1817 up_write(&clients_rwsem); 1818 1819 /* We do not want to have locks while calling client->remove() */ 1820 rcu_read_lock(); 1821 xa_for_each (&devices, index, device) { 1822 if (!ib_device_try_get(device)) 1823 continue; 1824 rcu_read_unlock(); 1825 1826 remove_client_context(device, client->client_id); 1827 1828 ib_device_put(device); 1829 rcu_read_lock(); 1830 } 1831 rcu_read_unlock(); 1832 1833 /* 1834 * remove_client_context() is not a fence, it can return even though a 1835 * removal is ongoing. Wait until all removals are completed. 
1836 */ 1837 wait_for_completion(&client->uses_zero); 1838 remove_client_id(client); 1839 } 1840 EXPORT_SYMBOL(ib_unregister_client); 1841 1842 static int __ib_get_global_client_nl_info(const char *client_name, 1843 struct ib_client_nl_info *res) 1844 { 1845 struct ib_client *client; 1846 unsigned long index; 1847 int ret = -ENOENT; 1848 1849 down_read(&clients_rwsem); 1850 xa_for_each_marked (&clients, index, client, CLIENT_REGISTERED) { 1851 if (strcmp(client->name, client_name) != 0) 1852 continue; 1853 if (!client->get_global_nl_info) { 1854 ret = -EOPNOTSUPP; 1855 break; 1856 } 1857 ret = client->get_global_nl_info(res); 1858 if (WARN_ON(ret == -ENOENT)) 1859 ret = -EINVAL; 1860 if (!ret && res->cdev) 1861 get_device(res->cdev); 1862 break; 1863 } 1864 up_read(&clients_rwsem); 1865 return ret; 1866 } 1867 1868 static int __ib_get_client_nl_info(struct ib_device *ibdev, 1869 const char *client_name, 1870 struct ib_client_nl_info *res) 1871 { 1872 unsigned long index; 1873 void *client_data; 1874 int ret = -ENOENT; 1875 1876 down_read(&ibdev->client_data_rwsem); 1877 xan_for_each_marked (&ibdev->client_data, index, client_data, 1878 CLIENT_DATA_REGISTERED) { 1879 struct ib_client *client = xa_load(&clients, index); 1880 1881 if (!client || strcmp(client->name, client_name) != 0) 1882 continue; 1883 if (!client->get_nl_info) { 1884 ret = -EOPNOTSUPP; 1885 break; 1886 } 1887 ret = client->get_nl_info(ibdev, client_data, res); 1888 if (WARN_ON(ret == -ENOENT)) 1889 ret = -EINVAL; 1890 1891 /* 1892 * The cdev is guaranteed valid as long as we are inside the 1893 * client_data_rwsem as remove_one can't be called. Keep it 1894 * valid for the caller. 1895 */ 1896 if (!ret && res->cdev) 1897 get_device(res->cdev); 1898 break; 1899 } 1900 up_read(&ibdev->client_data_rwsem); 1901 1902 return ret; 1903 } 1904 1905 /** 1906 * ib_get_client_nl_info - Fetch the nl_info from a client 1907 * @ibdev: IB device 1908 * @client_name: Name of the client 1909 * @res: Result of the query 1910 */ 1911 int ib_get_client_nl_info(struct ib_device *ibdev, const char *client_name, 1912 struct ib_client_nl_info *res) 1913 { 1914 int ret; 1915 1916 if (ibdev) 1917 ret = __ib_get_client_nl_info(ibdev, client_name, res); 1918 else 1919 ret = __ib_get_global_client_nl_info(client_name, res); 1920 #ifdef CONFIG_MODULES 1921 if (ret == -ENOENT) { 1922 request_module("rdma-client-%s", client_name); 1923 if (ibdev) 1924 ret = __ib_get_client_nl_info(ibdev, client_name, res); 1925 else 1926 ret = __ib_get_global_client_nl_info(client_name, res); 1927 } 1928 #endif 1929 if (ret) { 1930 if (ret == -ENOENT) 1931 return -EOPNOTSUPP; 1932 return ret; 1933 } 1934 1935 if (WARN_ON(!res->cdev)) 1936 return -EINVAL; 1937 return 0; 1938 } 1939 1940 /** 1941 * ib_set_client_data - Set IB client context 1942 * @device:Device to set context for 1943 * @client:Client to set context for 1944 * @data:Context to set 1945 * 1946 * ib_set_client_data() sets client context data that can be retrieved with 1947 * ib_get_client_data(). This can only be called while the client is 1948 * registered to the device, once the ib_client remove() callback returns this 1949 * cannot be called. 
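 *
 * A minimal sketch from a client's add() callback (names are hypothetical;
 * the matching lookup helper is ib_get_client_data() from <rdma/ib_verbs.h>):
 *
 *	static int hypo_add_one(struct ib_device *device)
 *	{
 *		struct hypo_ctx *ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
 *
 *		if (!ctx)
 *			return -ENOMEM;
 *		ib_set_client_data(device, &hypo_client, ctx);
 *		return 0;
 *	}
 *
 * and later code retrieves it with ib_get_client_data(device, &hypo_client).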
1950 */ 1951 void ib_set_client_data(struct ib_device *device, struct ib_client *client, 1952 void *data) 1953 { 1954 void *rc; 1955 1956 if (WARN_ON(IS_ERR(data))) 1957 data = NULL; 1958 1959 rc = xa_store(&device->client_data, client->client_id, data, 1960 GFP_KERNEL); 1961 WARN_ON(xa_is_err(rc)); 1962 } 1963 EXPORT_SYMBOL(ib_set_client_data); 1964 1965 /** 1966 * ib_register_event_handler - Register an IB event handler 1967 * @event_handler:Handler to register 1968 * 1969 * ib_register_event_handler() registers an event handler that will be 1970 * called back when asynchronous IB events occur (as defined in 1971 * chapter 11 of the InfiniBand Architecture Specification). This 1972 * callback occurs in workqueue context. 1973 */ 1974 void ib_register_event_handler(struct ib_event_handler *event_handler) 1975 { 1976 down_write(&event_handler->device->event_handler_rwsem); 1977 list_add_tail(&event_handler->list, 1978 &event_handler->device->event_handler_list); 1979 up_write(&event_handler->device->event_handler_rwsem); 1980 } 1981 EXPORT_SYMBOL(ib_register_event_handler); 1982 1983 /** 1984 * ib_unregister_event_handler - Unregister an event handler 1985 * @event_handler:Handler to unregister 1986 * 1987 * Unregister an event handler registered with 1988 * ib_register_event_handler(). 1989 */ 1990 void ib_unregister_event_handler(struct ib_event_handler *event_handler) 1991 { 1992 down_write(&event_handler->device->event_handler_rwsem); 1993 list_del(&event_handler->list); 1994 up_write(&event_handler->device->event_handler_rwsem); 1995 } 1996 EXPORT_SYMBOL(ib_unregister_event_handler); 1997 1998 void ib_dispatch_event_clients(struct ib_event *event) 1999 { 2000 struct ib_event_handler *handler; 2001 2002 down_read(&event->device->event_handler_rwsem); 2003 2004 list_for_each_entry(handler, &event->device->event_handler_list, list) 2005 handler->handler(handler, event); 2006 2007 up_read(&event->device->event_handler_rwsem); 2008 } 2009 2010 static int iw_query_port(struct ib_device *device, 2011 u32 port_num, 2012 struct ib_port_attr *port_attr) 2013 { 2014 struct in_device *inetdev; 2015 struct net_device *netdev; 2016 2017 memset(port_attr, 0, sizeof(*port_attr)); 2018 2019 netdev = ib_device_get_netdev(device, port_num); 2020 if (!netdev) 2021 return -ENODEV; 2022 2023 port_attr->max_mtu = IB_MTU_4096; 2024 port_attr->active_mtu = ib_mtu_int_to_enum(netdev->mtu); 2025 2026 if (!netif_carrier_ok(netdev)) { 2027 port_attr->state = IB_PORT_DOWN; 2028 port_attr->phys_state = IB_PORT_PHYS_STATE_DISABLED; 2029 } else { 2030 rcu_read_lock(); 2031 inetdev = __in_dev_get_rcu(netdev); 2032 2033 if (inetdev && inetdev->ifa_list) { 2034 port_attr->state = IB_PORT_ACTIVE; 2035 port_attr->phys_state = IB_PORT_PHYS_STATE_LINK_UP; 2036 } else { 2037 port_attr->state = IB_PORT_INIT; 2038 port_attr->phys_state = 2039 IB_PORT_PHYS_STATE_PORT_CONFIGURATION_TRAINING; 2040 } 2041 2042 rcu_read_unlock(); 2043 } 2044 2045 dev_put(netdev); 2046 return device->ops.query_port(device, port_num, port_attr); 2047 } 2048 2049 static int __ib_query_port(struct ib_device *device, 2050 u32 port_num, 2051 struct ib_port_attr *port_attr) 2052 { 2053 union ib_gid gid = {}; 2054 int err; 2055 2056 memset(port_attr, 0, sizeof(*port_attr)); 2057 2058 err = device->ops.query_port(device, port_num, port_attr); 2059 if (err || port_attr->subnet_prefix) 2060 return err; 2061 2062 if (rdma_port_get_link_layer(device, port_num) != 2063 IB_LINK_LAYER_INFINIBAND) 2064 return 0; 2065 2066 err = device->ops.query_gid(device, 
port_num, 0, &gid); 2067 if (err) 2068 return err; 2069 2070 port_attr->subnet_prefix = be64_to_cpu(gid.global.subnet_prefix); 2071 return 0; 2072 } 2073 2074 /** 2075 * ib_query_port - Query IB port attributes 2076 * @device:Device to query 2077 * @port_num:Port number to query 2078 * @port_attr:Port attributes 2079 * 2080 * ib_query_port() returns the attributes of a port through the 2081 * @port_attr pointer. 2082 */ 2083 int ib_query_port(struct ib_device *device, 2084 u32 port_num, 2085 struct ib_port_attr *port_attr) 2086 { 2087 if (!rdma_is_port_valid(device, port_num)) 2088 return -EINVAL; 2089 2090 if (rdma_protocol_iwarp(device, port_num)) 2091 return iw_query_port(device, port_num, port_attr); 2092 else 2093 return __ib_query_port(device, port_num, port_attr); 2094 } 2095 EXPORT_SYMBOL(ib_query_port); 2096 2097 static void add_ndev_hash(struct ib_port_data *pdata) 2098 { 2099 unsigned long flags; 2100 2101 might_sleep(); 2102 2103 spin_lock_irqsave(&ndev_hash_lock, flags); 2104 if (hash_hashed(&pdata->ndev_hash_link)) { 2105 hash_del_rcu(&pdata->ndev_hash_link); 2106 spin_unlock_irqrestore(&ndev_hash_lock, flags); 2107 /* 2108 * We cannot do hash_add_rcu after a hash_del_rcu until the 2109 * grace period 2110 */ 2111 synchronize_rcu(); 2112 spin_lock_irqsave(&ndev_hash_lock, flags); 2113 } 2114 if (pdata->netdev) 2115 hash_add_rcu(ndev_hash, &pdata->ndev_hash_link, 2116 (uintptr_t)pdata->netdev); 2117 spin_unlock_irqrestore(&ndev_hash_lock, flags); 2118 } 2119 2120 /** 2121 * ib_device_set_netdev - Associate the ib_dev with an underlying net_device 2122 * @ib_dev: Device to modify 2123 * @ndev: net_device to affiliate, may be NULL 2124 * @port: IB port the net_device is connected to 2125 * 2126 * Drivers should use this to link the ib_device to a netdev so the netdev 2127 * shows up in interfaces like ib_enum_roce_netdev. Only one netdev may be 2128 * affiliated with any port. 2129 * 2130 * The caller must ensure that the given ndev is not unregistered or 2131 * unregistering, and that either the ib_device is unregistered or 2132 * ib_device_set_netdev() is called with NULL when the ndev sends a 2133 * NETDEV_UNREGISTER event. 2134 */ 2135 int ib_device_set_netdev(struct ib_device *ib_dev, struct net_device *ndev, 2136 u32 port) 2137 { 2138 struct net_device *old_ndev; 2139 struct ib_port_data *pdata; 2140 unsigned long flags; 2141 int ret; 2142 2143 /* 2144 * Drivers wish to call this before ib_register_driver, so we have to 2145 * setup the port data early. 
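 *
 * Illustrative sketch (hypothetical driver code, not part of this file):
 * a single-port driver would typically pair the calls as
 *
 *    ib_device_set_netdev(ibdev, netdev, 1);  (from its probe path)
 *    ib_device_set_netdev(ibdev, NULL, 1);    (on NETDEV_UNREGISTER)
 *
 * so that the association never outlives the netdev.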
2146 */ 2147 ret = alloc_port_data(ib_dev); 2148 if (ret) 2149 return ret; 2150 2151 if (!rdma_is_port_valid(ib_dev, port)) 2152 return -EINVAL; 2153 2154 pdata = &ib_dev->port_data[port]; 2155 spin_lock_irqsave(&pdata->netdev_lock, flags); 2156 old_ndev = rcu_dereference_protected( 2157 pdata->netdev, lockdep_is_held(&pdata->netdev_lock)); 2158 if (old_ndev == ndev) { 2159 spin_unlock_irqrestore(&pdata->netdev_lock, flags); 2160 return 0; 2161 } 2162 2163 if (ndev) 2164 dev_hold(ndev); 2165 rcu_assign_pointer(pdata->netdev, ndev); 2166 spin_unlock_irqrestore(&pdata->netdev_lock, flags); 2167 2168 add_ndev_hash(pdata); 2169 if (old_ndev) 2170 dev_put(old_ndev); 2171 2172 return 0; 2173 } 2174 EXPORT_SYMBOL(ib_device_set_netdev); 2175 2176 static void free_netdevs(struct ib_device *ib_dev) 2177 { 2178 unsigned long flags; 2179 u32 port; 2180 2181 if (!ib_dev->port_data) 2182 return; 2183 2184 rdma_for_each_port (ib_dev, port) { 2185 struct ib_port_data *pdata = &ib_dev->port_data[port]; 2186 struct net_device *ndev; 2187 2188 spin_lock_irqsave(&pdata->netdev_lock, flags); 2189 ndev = rcu_dereference_protected( 2190 pdata->netdev, lockdep_is_held(&pdata->netdev_lock)); 2191 if (ndev) { 2192 spin_lock(&ndev_hash_lock); 2193 hash_del_rcu(&pdata->ndev_hash_link); 2194 spin_unlock(&ndev_hash_lock); 2195 2196 /* 2197 * If this is the last dev_put there is still a 2198 * synchronize_rcu before the netdev is kfreed, so we 2199 * can continue to rely on unlocked pointer 2200 * comparisons after the put 2201 */ 2202 rcu_assign_pointer(pdata->netdev, NULL); 2203 dev_put(ndev); 2204 } 2205 spin_unlock_irqrestore(&pdata->netdev_lock, flags); 2206 } 2207 } 2208 2209 struct net_device *ib_device_get_netdev(struct ib_device *ib_dev, 2210 u32 port) 2211 { 2212 struct ib_port_data *pdata; 2213 struct net_device *res; 2214 2215 if (!rdma_is_port_valid(ib_dev, port)) 2216 return NULL; 2217 2218 pdata = &ib_dev->port_data[port]; 2219 2220 /* 2221 * New drivers should use ib_device_set_netdev() not the legacy 2222 * get_netdev(). 2223 */ 2224 if (ib_dev->ops.get_netdev) 2225 res = ib_dev->ops.get_netdev(ib_dev, port); 2226 else { 2227 spin_lock(&pdata->netdev_lock); 2228 res = rcu_dereference_protected( 2229 pdata->netdev, lockdep_is_held(&pdata->netdev_lock)); 2230 if (res) 2231 dev_hold(res); 2232 spin_unlock(&pdata->netdev_lock); 2233 } 2234 2235 /* 2236 * If we are starting to unregister expedite things by preventing 2237 * propagation of an unregistering netdev. 2238 */ 2239 if (res && res->reg_state != NETREG_REGISTERED) { 2240 dev_put(res); 2241 return NULL; 2242 } 2243 2244 return res; 2245 } 2246 2247 /** 2248 * ib_device_get_by_netdev - Find an IB device associated with a netdev 2249 * @ndev: netdev to locate 2250 * @driver_id: The driver ID that must match (RDMA_DRIVER_UNKNOWN matches all) 2251 * 2252 * Find and hold an ib_device that is associated with a netdev via 2253 * ib_device_set_netdev(). The caller must call ib_device_put() on the 2254 * returned pointer. 
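 *
 * Illustrative sketch (hypothetical caller, not part of this file):
 *
 *    struct ib_device *ibdev;
 *
 *    ibdev = ib_device_get_by_netdev(ndev, RDMA_DRIVER_UNKNOWN);
 *    if (ibdev) {
 *            ... use ibdev ...
 *            ib_device_put(ibdev);
 *    }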
2255 */ 2256 struct ib_device *ib_device_get_by_netdev(struct net_device *ndev, 2257 enum rdma_driver_id driver_id) 2258 { 2259 struct ib_device *res = NULL; 2260 struct ib_port_data *cur; 2261 2262 rcu_read_lock(); 2263 hash_for_each_possible_rcu (ndev_hash, cur, ndev_hash_link, 2264 (uintptr_t)ndev) { 2265 if (rcu_access_pointer(cur->netdev) == ndev && 2266 (driver_id == RDMA_DRIVER_UNKNOWN || 2267 cur->ib_dev->ops.driver_id == driver_id) && 2268 ib_device_try_get(cur->ib_dev)) { 2269 res = cur->ib_dev; 2270 break; 2271 } 2272 } 2273 rcu_read_unlock(); 2274 2275 return res; 2276 } 2277 EXPORT_SYMBOL(ib_device_get_by_netdev); 2278 2279 /** 2280 * ib_enum_roce_netdev - enumerate all RoCE ports 2281 * @ib_dev: IB device we want to query 2282 * @filter: Should we call the callback? 2283 * @filter_cookie: Cookie passed to filter 2284 * @cb: Callback to call for each found RoCE port 2285 * @cookie: Cookie passed back to the callback 2286 * 2287 * Enumerates all of the physical RoCE ports of ib_dev 2288 * that are associated with a netdevice and calls the callback on each 2289 * port for which the filter() function returns a non-zero value. 2290 */ 2291 void ib_enum_roce_netdev(struct ib_device *ib_dev, 2292 roce_netdev_filter filter, 2293 void *filter_cookie, 2294 roce_netdev_callback cb, 2295 void *cookie) 2296 { 2297 u32 port; 2298 2299 rdma_for_each_port (ib_dev, port) 2300 if (rdma_protocol_roce(ib_dev, port)) { 2301 struct net_device *idev = 2302 ib_device_get_netdev(ib_dev, port); 2303 2304 if (filter(ib_dev, port, idev, filter_cookie)) 2305 cb(ib_dev, port, idev, cookie); 2306 2307 if (idev) 2308 dev_put(idev); 2309 } 2310 } 2311 2312 /** 2313 * ib_enum_all_roce_netdevs - enumerate all RoCE devices 2314 * @filter: Should we call the callback? 2315 * @filter_cookie: Cookie passed to filter 2316 * @cb: Callback to call for each found RoCE port 2317 * @cookie: Cookie passed back to the callback 2318 * 2319 * Enumerates the physical ports of all RoCE devices that are associated 2320 * with a netdevice and calls the callback on each port for which the 2321 * filter() function returns a non-zero value. 2322 */ 2323 void ib_enum_all_roce_netdevs(roce_netdev_filter filter, 2324 void *filter_cookie, 2325 roce_netdev_callback cb, 2326 void *cookie) 2327 { 2328 struct ib_device *dev; 2329 unsigned long index; 2330 2331 down_read(&devices_rwsem); 2332 xa_for_each_marked (&devices, index, dev, DEVICE_REGISTERED) 2333 ib_enum_roce_netdev(dev, filter, filter_cookie, cb, cookie); 2334 up_read(&devices_rwsem); 2335 } 2336 2337 /* 2338 * ib_enum_all_devs - enumerate all ib_devices 2339 * @cb: Callback to call for each found ib_device 2340 * 2341 * Enumerates all ib_devices and calls callback() on each device. 2342 */ 2343 int ib_enum_all_devs(nldev_callback nldev_cb, struct sk_buff *skb, 2344 struct netlink_callback *cb) 2345 { 2346 unsigned long index; 2347 struct ib_device *dev; 2348 unsigned int idx = 0; 2349 int ret = 0; 2350 2351 down_read(&devices_rwsem); 2352 xa_for_each_marked (&devices, index, dev, DEVICE_REGISTERED) { 2353 if (!rdma_dev_access_netns(dev, sock_net(skb->sk))) 2354 continue; 2355 2356 ret = nldev_cb(dev, skb, cb, idx); 2357 if (ret) 2358 break; 2359 idx++; 2360 } 2361 up_read(&devices_rwsem); 2362 return ret; 2363 } 2364 2365 /** 2366 * ib_query_pkey - Get P_Key table entry 2367 * @device:Device to query 2368 * @port_num:Port number to query 2369 * @index:P_Key table index to query 2370 * @pkey:Returned P_Key 2371 * 2372 * ib_query_pkey() fetches the specified P_Key table entry.
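 *
 * Illustrative sketch (hypothetical caller, not part of this file): read
 * the first P_Key table entry of port 1:
 *
 *    u16 pkey;
 *
 *    if (!ib_query_pkey(device, 1, 0, &pkey))
 *            pr_debug("port 1, index 0: P_Key 0x%04x\n", pkey);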
2373 */ 2374 int ib_query_pkey(struct ib_device *device, 2375 u32 port_num, u16 index, u16 *pkey) 2376 { 2377 if (!rdma_is_port_valid(device, port_num)) 2378 return -EINVAL; 2379 2380 if (!device->ops.query_pkey) 2381 return -EOPNOTSUPP; 2382 2383 return device->ops.query_pkey(device, port_num, index, pkey); 2384 } 2385 EXPORT_SYMBOL(ib_query_pkey); 2386 2387 /** 2388 * ib_modify_device - Change IB device attributes 2389 * @device:Device to modify 2390 * @device_modify_mask:Mask of attributes to change 2391 * @device_modify:New attribute values 2392 * 2393 * ib_modify_device() changes a device's attributes as specified by 2394 * the @device_modify_mask and @device_modify structure. 2395 */ 2396 int ib_modify_device(struct ib_device *device, 2397 int device_modify_mask, 2398 struct ib_device_modify *device_modify) 2399 { 2400 if (!device->ops.modify_device) 2401 return -EOPNOTSUPP; 2402 2403 return device->ops.modify_device(device, device_modify_mask, 2404 device_modify); 2405 } 2406 EXPORT_SYMBOL(ib_modify_device); 2407 2408 /** 2409 * ib_modify_port - Modifies the attributes for the specified port. 2410 * @device: The device to modify. 2411 * @port_num: The number of the port to modify. 2412 * @port_modify_mask: Mask used to specify which attributes of the port 2413 * to change. 2414 * @port_modify: New attribute values for the port. 2415 * 2416 * ib_modify_port() changes a port's attributes as specified by the 2417 * @port_modify_mask and @port_modify structure. 2418 */ 2419 int ib_modify_port(struct ib_device *device, 2420 u32 port_num, int port_modify_mask, 2421 struct ib_port_modify *port_modify) 2422 { 2423 int rc; 2424 2425 if (!rdma_is_port_valid(device, port_num)) 2426 return -EINVAL; 2427 2428 if (device->ops.modify_port) 2429 rc = device->ops.modify_port(device, port_num, 2430 port_modify_mask, 2431 port_modify); 2432 else if (rdma_protocol_roce(device, port_num) && 2433 ((port_modify->set_port_cap_mask & ~IB_PORT_CM_SUP) == 0 || 2434 (port_modify->clr_port_cap_mask & ~IB_PORT_CM_SUP) == 0)) 2435 rc = 0; 2436 else 2437 rc = -EOPNOTSUPP; 2438 return rc; 2439 } 2440 EXPORT_SYMBOL(ib_modify_port); 2441 2442 /** 2443 * ib_find_gid - Returns the port number and GID table index where 2444 * a specified GID value occurs. It searches only for the IB link layer. 2445 * @device: The device to query. 2446 * @gid: The GID value to search for. 2447 * @port_num: The port number of the device where the GID value was found. 2448 * @index: The index into the GID table where the GID was found. This 2449 * parameter may be NULL. 2450 */ 2451 int ib_find_gid(struct ib_device *device, union ib_gid *gid, 2452 u32 *port_num, u16 *index) 2453 { 2454 union ib_gid tmp_gid; 2455 u32 port; 2456 int ret, i; 2457 2458 rdma_for_each_port (device, port) { 2459 if (!rdma_protocol_ib(device, port)) 2460 continue; 2461 2462 for (i = 0; i < device->port_data[port].immutable.gid_tbl_len; 2463 ++i) { 2464 ret = rdma_query_gid(device, port, i, &tmp_gid); 2465 if (ret) 2466 return ret; 2467 if (!memcmp(&tmp_gid, gid, sizeof *gid)) { 2468 *port_num = port; 2469 if (index) 2470 *index = i; 2471 return 0; 2472 } 2473 } 2474 } 2475 2476 return -ENOENT; 2477 } 2478 EXPORT_SYMBOL(ib_find_gid); 2479 2480 /** 2481 * ib_find_pkey - Returns the PKey table index where a specified 2482 * PKey value occurs. 2483 * @device: The device to query. 2484 * @port_num: The port number of the device to search for the PKey. 2485 * @pkey: The PKey value to search for. 2486 * @index: The index into the PKey table where the PKey was found.
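 *
 * Illustrative sketch (hypothetical caller, not part of this file): look up
 * the table index of the default P_Key (0xffff, or its limited-member
 * variant 0x7fff):
 *
 *    u16 index;
 *
 *    if (!ib_find_pkey(device, port_num, 0xffff, &index))
 *            pr_debug("default P_Key at index %u\n", index);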
2487 */ 2488 int ib_find_pkey(struct ib_device *device, 2489 u32 port_num, u16 pkey, u16 *index) 2490 { 2491 int ret, i; 2492 u16 tmp_pkey; 2493 int partial_ix = -1; 2494 2495 for (i = 0; i < device->port_data[port_num].immutable.pkey_tbl_len; 2496 ++i) { 2497 ret = ib_query_pkey(device, port_num, i, &tmp_pkey); 2498 if (ret) 2499 return ret; 2500 if ((pkey & 0x7fff) == (tmp_pkey & 0x7fff)) { 2501 /* if there is full-member pkey take it.*/ 2502 if (tmp_pkey & 0x8000) { 2503 *index = i; 2504 return 0; 2505 } 2506 if (partial_ix < 0) 2507 partial_ix = i; 2508 } 2509 } 2510 2511 /*no full-member, if exists take the limited*/ 2512 if (partial_ix >= 0) { 2513 *index = partial_ix; 2514 return 0; 2515 } 2516 return -ENOENT; 2517 } 2518 EXPORT_SYMBOL(ib_find_pkey); 2519 2520 /** 2521 * ib_get_net_dev_by_params() - Return the appropriate net_dev 2522 * for a received CM request 2523 * @dev: An RDMA device on which the request has been received. 2524 * @port: Port number on the RDMA device. 2525 * @pkey: The Pkey the request came on. 2526 * @gid: A GID that the net_dev uses to communicate. 2527 * @addr: Contains the IP address that the request specified as its 2528 * destination. 2529 * 2530 */ 2531 struct net_device *ib_get_net_dev_by_params(struct ib_device *dev, 2532 u32 port, 2533 u16 pkey, 2534 const union ib_gid *gid, 2535 const struct sockaddr *addr) 2536 { 2537 struct net_device *net_dev = NULL; 2538 unsigned long index; 2539 void *client_data; 2540 2541 if (!rdma_protocol_ib(dev, port)) 2542 return NULL; 2543 2544 /* 2545 * Holding the read side guarantees that the client will not become 2546 * unregistered while we are calling get_net_dev_by_params() 2547 */ 2548 down_read(&dev->client_data_rwsem); 2549 xan_for_each_marked (&dev->client_data, index, client_data, 2550 CLIENT_DATA_REGISTERED) { 2551 struct ib_client *client = xa_load(&clients, index); 2552 2553 if (!client || !client->get_net_dev_by_params) 2554 continue; 2555 2556 net_dev = client->get_net_dev_by_params(dev, port, pkey, gid, 2557 addr, client_data); 2558 if (net_dev) 2559 break; 2560 } 2561 up_read(&dev->client_data_rwsem); 2562 2563 return net_dev; 2564 } 2565 EXPORT_SYMBOL(ib_get_net_dev_by_params); 2566 2567 void ib_set_device_ops(struct ib_device *dev, const struct ib_device_ops *ops) 2568 { 2569 struct ib_device_ops *dev_ops = &dev->ops; 2570 #define SET_DEVICE_OP(ptr, name) \ 2571 do { \ 2572 if (ops->name) \ 2573 if (!((ptr)->name)) \ 2574 (ptr)->name = ops->name; \ 2575 } while (0) 2576 2577 #define SET_OBJ_SIZE(ptr, name) SET_DEVICE_OP(ptr, size_##name) 2578 2579 if (ops->driver_id != RDMA_DRIVER_UNKNOWN) { 2580 WARN_ON(dev_ops->driver_id != RDMA_DRIVER_UNKNOWN && 2581 dev_ops->driver_id != ops->driver_id); 2582 dev_ops->driver_id = ops->driver_id; 2583 } 2584 if (ops->owner) { 2585 WARN_ON(dev_ops->owner && dev_ops->owner != ops->owner); 2586 dev_ops->owner = ops->owner; 2587 } 2588 if (ops->uverbs_abi_ver) 2589 dev_ops->uverbs_abi_ver = ops->uverbs_abi_ver; 2590 2591 dev_ops->uverbs_no_driver_id_binding |= 2592 ops->uverbs_no_driver_id_binding; 2593 2594 SET_DEVICE_OP(dev_ops, add_gid); 2595 SET_DEVICE_OP(dev_ops, advise_mr); 2596 SET_DEVICE_OP(dev_ops, alloc_dm); 2597 SET_DEVICE_OP(dev_ops, alloc_hw_device_stats); 2598 SET_DEVICE_OP(dev_ops, alloc_hw_port_stats); 2599 SET_DEVICE_OP(dev_ops, alloc_mr); 2600 SET_DEVICE_OP(dev_ops, alloc_mr_integrity); 2601 SET_DEVICE_OP(dev_ops, alloc_mw); 2602 SET_DEVICE_OP(dev_ops, alloc_pd); 2603 SET_DEVICE_OP(dev_ops, alloc_rdma_netdev); 2604 SET_DEVICE_OP(dev_ops, 
alloc_ucontext); 2605 SET_DEVICE_OP(dev_ops, alloc_xrcd); 2606 SET_DEVICE_OP(dev_ops, attach_mcast); 2607 SET_DEVICE_OP(dev_ops, check_mr_status); 2608 SET_DEVICE_OP(dev_ops, counter_alloc_stats); 2609 SET_DEVICE_OP(dev_ops, counter_bind_qp); 2610 SET_DEVICE_OP(dev_ops, counter_dealloc); 2611 SET_DEVICE_OP(dev_ops, counter_unbind_qp); 2612 SET_DEVICE_OP(dev_ops, counter_update_stats); 2613 SET_DEVICE_OP(dev_ops, create_ah); 2614 SET_DEVICE_OP(dev_ops, create_counters); 2615 SET_DEVICE_OP(dev_ops, create_cq); 2616 SET_DEVICE_OP(dev_ops, create_flow); 2617 SET_DEVICE_OP(dev_ops, create_flow_action_esp); 2618 SET_DEVICE_OP(dev_ops, create_qp); 2619 SET_DEVICE_OP(dev_ops, create_rwq_ind_table); 2620 SET_DEVICE_OP(dev_ops, create_srq); 2621 SET_DEVICE_OP(dev_ops, create_user_ah); 2622 SET_DEVICE_OP(dev_ops, create_wq); 2623 SET_DEVICE_OP(dev_ops, dealloc_dm); 2624 SET_DEVICE_OP(dev_ops, dealloc_driver); 2625 SET_DEVICE_OP(dev_ops, dealloc_mw); 2626 SET_DEVICE_OP(dev_ops, dealloc_pd); 2627 SET_DEVICE_OP(dev_ops, dealloc_ucontext); 2628 SET_DEVICE_OP(dev_ops, dealloc_xrcd); 2629 SET_DEVICE_OP(dev_ops, del_gid); 2630 SET_DEVICE_OP(dev_ops, dereg_mr); 2631 SET_DEVICE_OP(dev_ops, destroy_ah); 2632 SET_DEVICE_OP(dev_ops, destroy_counters); 2633 SET_DEVICE_OP(dev_ops, destroy_cq); 2634 SET_DEVICE_OP(dev_ops, destroy_flow); 2635 SET_DEVICE_OP(dev_ops, destroy_flow_action); 2636 SET_DEVICE_OP(dev_ops, destroy_qp); 2637 SET_DEVICE_OP(dev_ops, destroy_rwq_ind_table); 2638 SET_DEVICE_OP(dev_ops, destroy_srq); 2639 SET_DEVICE_OP(dev_ops, destroy_wq); 2640 SET_DEVICE_OP(dev_ops, device_group); 2641 SET_DEVICE_OP(dev_ops, detach_mcast); 2642 SET_DEVICE_OP(dev_ops, disassociate_ucontext); 2643 SET_DEVICE_OP(dev_ops, drain_rq); 2644 SET_DEVICE_OP(dev_ops, drain_sq); 2645 SET_DEVICE_OP(dev_ops, enable_driver); 2646 SET_DEVICE_OP(dev_ops, fill_res_cm_id_entry); 2647 SET_DEVICE_OP(dev_ops, fill_res_cq_entry); 2648 SET_DEVICE_OP(dev_ops, fill_res_cq_entry_raw); 2649 SET_DEVICE_OP(dev_ops, fill_res_mr_entry); 2650 SET_DEVICE_OP(dev_ops, fill_res_mr_entry_raw); 2651 SET_DEVICE_OP(dev_ops, fill_res_qp_entry); 2652 SET_DEVICE_OP(dev_ops, fill_res_qp_entry_raw); 2653 SET_DEVICE_OP(dev_ops, fill_stat_mr_entry); 2654 SET_DEVICE_OP(dev_ops, get_dev_fw_str); 2655 SET_DEVICE_OP(dev_ops, get_dma_mr); 2656 SET_DEVICE_OP(dev_ops, get_hw_stats); 2657 SET_DEVICE_OP(dev_ops, get_link_layer); 2658 SET_DEVICE_OP(dev_ops, get_netdev); 2659 SET_DEVICE_OP(dev_ops, get_port_immutable); 2660 SET_DEVICE_OP(dev_ops, get_vector_affinity); 2661 SET_DEVICE_OP(dev_ops, get_vf_config); 2662 SET_DEVICE_OP(dev_ops, get_vf_guid); 2663 SET_DEVICE_OP(dev_ops, get_vf_stats); 2664 SET_DEVICE_OP(dev_ops, iw_accept); 2665 SET_DEVICE_OP(dev_ops, iw_add_ref); 2666 SET_DEVICE_OP(dev_ops, iw_connect); 2667 SET_DEVICE_OP(dev_ops, iw_create_listen); 2668 SET_DEVICE_OP(dev_ops, iw_destroy_listen); 2669 SET_DEVICE_OP(dev_ops, iw_get_qp); 2670 SET_DEVICE_OP(dev_ops, iw_reject); 2671 SET_DEVICE_OP(dev_ops, iw_rem_ref); 2672 SET_DEVICE_OP(dev_ops, map_mr_sg); 2673 SET_DEVICE_OP(dev_ops, map_mr_sg_pi); 2674 SET_DEVICE_OP(dev_ops, mmap); 2675 SET_DEVICE_OP(dev_ops, mmap_free); 2676 SET_DEVICE_OP(dev_ops, modify_ah); 2677 SET_DEVICE_OP(dev_ops, modify_cq); 2678 SET_DEVICE_OP(dev_ops, modify_device); 2679 SET_DEVICE_OP(dev_ops, modify_flow_action_esp); 2680 SET_DEVICE_OP(dev_ops, modify_port); 2681 SET_DEVICE_OP(dev_ops, modify_qp); 2682 SET_DEVICE_OP(dev_ops, modify_srq); 2683 SET_DEVICE_OP(dev_ops, modify_wq); 2684 SET_DEVICE_OP(dev_ops, peek_cq); 2685 
SET_DEVICE_OP(dev_ops, poll_cq); 2686 SET_DEVICE_OP(dev_ops, port_groups); 2687 SET_DEVICE_OP(dev_ops, post_recv); 2688 SET_DEVICE_OP(dev_ops, post_send); 2689 SET_DEVICE_OP(dev_ops, post_srq_recv); 2690 SET_DEVICE_OP(dev_ops, process_mad); 2691 SET_DEVICE_OP(dev_ops, query_ah); 2692 SET_DEVICE_OP(dev_ops, query_device); 2693 SET_DEVICE_OP(dev_ops, query_gid); 2694 SET_DEVICE_OP(dev_ops, query_pkey); 2695 SET_DEVICE_OP(dev_ops, query_port); 2696 SET_DEVICE_OP(dev_ops, query_qp); 2697 SET_DEVICE_OP(dev_ops, query_srq); 2698 SET_DEVICE_OP(dev_ops, query_ucontext); 2699 SET_DEVICE_OP(dev_ops, rdma_netdev_get_params); 2700 SET_DEVICE_OP(dev_ops, read_counters); 2701 SET_DEVICE_OP(dev_ops, reg_dm_mr); 2702 SET_DEVICE_OP(dev_ops, reg_user_mr); 2703 SET_DEVICE_OP(dev_ops, reg_user_mr_dmabuf); 2704 SET_DEVICE_OP(dev_ops, req_notify_cq); 2705 SET_DEVICE_OP(dev_ops, rereg_user_mr); 2706 SET_DEVICE_OP(dev_ops, resize_cq); 2707 SET_DEVICE_OP(dev_ops, set_vf_guid); 2708 SET_DEVICE_OP(dev_ops, set_vf_link_state); 2709 2710 SET_OBJ_SIZE(dev_ops, ib_ah); 2711 SET_OBJ_SIZE(dev_ops, ib_counters); 2712 SET_OBJ_SIZE(dev_ops, ib_cq); 2713 SET_OBJ_SIZE(dev_ops, ib_mw); 2714 SET_OBJ_SIZE(dev_ops, ib_pd); 2715 SET_OBJ_SIZE(dev_ops, ib_rwq_ind_table); 2716 SET_OBJ_SIZE(dev_ops, ib_srq); 2717 SET_OBJ_SIZE(dev_ops, ib_ucontext); 2718 SET_OBJ_SIZE(dev_ops, ib_xrcd); 2719 } 2720 EXPORT_SYMBOL(ib_set_device_ops); 2721 2722 #ifdef CONFIG_INFINIBAND_VIRT_DMA 2723 int ib_dma_virt_map_sg(struct ib_device *dev, struct scatterlist *sg, int nents) 2724 { 2725 struct scatterlist *s; 2726 int i; 2727 2728 for_each_sg(sg, s, nents, i) { 2729 sg_dma_address(s) = (uintptr_t)sg_virt(s); 2730 sg_dma_len(s) = s->length; 2731 } 2732 return nents; 2733 } 2734 EXPORT_SYMBOL(ib_dma_virt_map_sg); 2735 #endif /* CONFIG_INFINIBAND_VIRT_DMA */ 2736 2737 static const struct rdma_nl_cbs ibnl_ls_cb_table[RDMA_NL_LS_NUM_OPS] = { 2738 [RDMA_NL_LS_OP_RESOLVE] = { 2739 .doit = ib_nl_handle_resolve_resp, 2740 .flags = RDMA_NL_ADMIN_PERM, 2741 }, 2742 [RDMA_NL_LS_OP_SET_TIMEOUT] = { 2743 .doit = ib_nl_handle_set_timeout, 2744 .flags = RDMA_NL_ADMIN_PERM, 2745 }, 2746 [RDMA_NL_LS_OP_IP_RESOLVE] = { 2747 .doit = ib_nl_handle_ip_res_resp, 2748 .flags = RDMA_NL_ADMIN_PERM, 2749 }, 2750 }; 2751 2752 static int __init ib_core_init(void) 2753 { 2754 int ret; 2755 2756 ib_wq = alloc_workqueue("infiniband", 0, 0); 2757 if (!ib_wq) 2758 return -ENOMEM; 2759 2760 ib_comp_wq = alloc_workqueue("ib-comp-wq", 2761 WQ_HIGHPRI | WQ_MEM_RECLAIM | WQ_SYSFS, 0); 2762 if (!ib_comp_wq) { 2763 ret = -ENOMEM; 2764 goto err; 2765 } 2766 2767 ib_comp_unbound_wq = 2768 alloc_workqueue("ib-comp-unb-wq", 2769 WQ_UNBOUND | WQ_HIGHPRI | WQ_MEM_RECLAIM | 2770 WQ_SYSFS, WQ_UNBOUND_MAX_ACTIVE); 2771 if (!ib_comp_unbound_wq) { 2772 ret = -ENOMEM; 2773 goto err_comp; 2774 } 2775 2776 ret = class_register(&ib_class); 2777 if (ret) { 2778 pr_warn("Couldn't create InfiniBand device class\n"); 2779 goto err_comp_unbound; 2780 } 2781 2782 rdma_nl_init(); 2783 2784 ret = addr_init(); 2785 if (ret) { 2786 pr_warn("Couldn't init IB address resolution\n"); 2787 goto err_ibnl; 2788 } 2789 2790 ret = ib_mad_init(); 2791 if (ret) { 2792 pr_warn("Couldn't init IB MAD\n"); 2793 goto err_addr; 2794 } 2795 2796 ret = ib_sa_init(); 2797 if (ret) { 2798 pr_warn("Couldn't init SA\n"); 2799 goto err_mad; 2800 } 2801 2802 ret = register_blocking_lsm_notifier(&ibdev_lsm_nb); 2803 if (ret) { 2804 pr_warn("Couldn't register LSM notifier. 
ret %d\n", ret); 2805 goto err_sa; 2806 } 2807 2808 ret = register_pernet_device(&rdma_dev_net_ops); 2809 if (ret) { 2810 pr_warn("Couldn't init compat dev. ret %d\n", ret); 2811 goto err_compat; 2812 } 2813 2814 nldev_init(); 2815 rdma_nl_register(RDMA_NL_LS, ibnl_ls_cb_table); 2816 roce_gid_mgmt_init(); 2817 2818 return 0; 2819 2820 err_compat: 2821 unregister_blocking_lsm_notifier(&ibdev_lsm_nb); 2822 err_sa: 2823 ib_sa_cleanup(); 2824 err_mad: 2825 ib_mad_cleanup(); 2826 err_addr: 2827 addr_cleanup(); 2828 err_ibnl: 2829 class_unregister(&ib_class); 2830 err_comp_unbound: 2831 destroy_workqueue(ib_comp_unbound_wq); 2832 err_comp: 2833 destroy_workqueue(ib_comp_wq); 2834 err: 2835 destroy_workqueue(ib_wq); 2836 return ret; 2837 } 2838 2839 static void __exit ib_core_cleanup(void) 2840 { 2841 roce_gid_mgmt_cleanup(); 2842 nldev_exit(); 2843 rdma_nl_unregister(RDMA_NL_LS); 2844 unregister_pernet_device(&rdma_dev_net_ops); 2845 unregister_blocking_lsm_notifier(&ibdev_lsm_nb); 2846 ib_sa_cleanup(); 2847 ib_mad_cleanup(); 2848 addr_cleanup(); 2849 rdma_nl_exit(); 2850 class_unregister(&ib_class); 2851 destroy_workqueue(ib_comp_unbound_wq); 2852 destroy_workqueue(ib_comp_wq); 2853 /* Make sure that any pending umem accounting work is done. */ 2854 destroy_workqueue(ib_wq); 2855 flush_workqueue(system_unbound_wq); 2856 WARN_ON(!xa_empty(&clients)); 2857 WARN_ON(!xa_empty(&devices)); 2858 } 2859 2860 MODULE_ALIAS_RDMA_NETLINK(RDMA_NL_LS, 4); 2861 2862 /* ib core relies on netdev stack to first register net_ns_type_operations 2863 * ns kobject type before ib_core initialization. 2864 */ 2865 fs_initcall(ib_core_init); 2866 module_exit(ib_core_cleanup); 2867
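/*
 * Illustrative end-to-end example (hypothetical code, kept inside a comment
 * so it is not built as part of ib_core): a minimal ULP module that uses the
 * client API exported above. All "demo" names are placeholders, not existing
 * kernel symbols.
 *
 *    struct demo_state {
 *            struct ib_device *device;
 *    };
 *
 *    static struct ib_client demo_client;
 *
 *    static int demo_add_one(struct ib_device *device)
 *    {
 *            struct demo_state *st = kzalloc(sizeof(*st), GFP_KERNEL);
 *
 *            if (!st)
 *                    return -ENOMEM;
 *            st->device = device;
 *            ib_set_client_data(device, &demo_client, st);
 *            return 0;
 *    }
 *
 *    static void demo_remove_one(struct ib_device *device, void *client_data)
 *    {
 *            kfree(client_data);
 *    }
 *
 *    static struct ib_client demo_client = {
 *            .name   = "demo",
 *            .add    = demo_add_one,
 *            .remove = demo_remove_one,
 *    };
 *
 *    static int __init demo_init(void)
 *    {
 *            return ib_register_client(&demo_client);
 *    }
 *
 *    static void __exit demo_exit(void)
 *    {
 *            ib_unregister_client(&demo_client);
 *    }
 *
 *    module_init(demo_init);
 *    module_exit(demo_exit);
 */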