/*
 * Copyright (c) 2004 Topspin Communications.  All rights reserved.
 * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved.
 * Copyright (c) 2004 Voltaire, Inc. All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses.  You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the
 * OpenIB.org BSD license below:
 *
 *     Redistribution and use in source and binary forms, with or
 *     without modification, are permitted provided that the following
 *     conditions are met:
 *
 *      - Redistributions of source code must retain the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer.
 *
 *      - Redistributions in binary form must reproduce the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer in the documentation and/or other materials
 *        provided with the distribution.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 * $Id: ipoib_main.c 1377 2004-12-23 19:57:12Z roland $
 */

#include "ipoib.h"

#include <linux/module.h>

#include <linux/init.h>
#include <linux/slab.h>
#include <linux/kernel.h>

#include <linux/if_arp.h>	/* For ARPHRD_xxx */

#include <linux/ip.h>
#include <linux/in.h>

#include <net/dst.h>

MODULE_AUTHOR("Roland Dreier");
MODULE_DESCRIPTION("IP-over-InfiniBand net driver");
MODULE_LICENSE("Dual BSD/GPL");

int ipoib_sendq_size __read_mostly = IPOIB_TX_RING_SIZE;
int ipoib_recvq_size __read_mostly = IPOIB_RX_RING_SIZE;

module_param_named(send_queue_size, ipoib_sendq_size, int, 0444);
MODULE_PARM_DESC(send_queue_size, "Number of descriptors in send queue");
module_param_named(recv_queue_size, ipoib_recvq_size, int, 0444);
MODULE_PARM_DESC(recv_queue_size, "Number of descriptors in receive queue");

#ifdef CONFIG_INFINIBAND_IPOIB_DEBUG
int ipoib_debug_level;

module_param_named(debug_level, ipoib_debug_level, int, 0644);
MODULE_PARM_DESC(debug_level, "Enable debug tracing if > 0");
#endif

struct ipoib_path_iter {
	struct net_device *dev;
	struct ipoib_path  path;
};

static const u8 ipv4_bcast_addr[] = {
	0x00, 0xff, 0xff, 0xff,
	0xff, 0x12, 0x40, 0x1b, 0x00, 0x00, 0x00, 0x00,
	0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff
};

struct workqueue_struct *ipoib_workqueue;

struct ib_sa_client ipoib_sa_client;

static void ipoib_add_one(struct ib_device *device);
static void ipoib_remove_one(struct ib_device *device);

static struct ib_client ipoib_client = {
	.name   = "ipoib",
	.add    = ipoib_add_one,
	.remove = ipoib_remove_one
};

int ipoib_open(struct net_device *dev)
{
	struct ipoib_dev_priv *priv = netdev_priv(dev);

	ipoib_dbg(priv, "bringing up interface\n");

	set_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags);

	if (ipoib_pkey_dev_delay_open(dev))
		return 0;

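	/*
	 * Bring up the IB transport first, then move the interface to
	 * the operational (up) state; undo the first step on failure.
	 */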
	if (ipoib_ib_dev_open(dev))
		return -EINVAL;

	if (ipoib_ib_dev_up(dev)) {
		ipoib_ib_dev_stop(dev);
		return -EINVAL;
	}

	if (!test_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags)) {
		struct ipoib_dev_priv *cpriv;

		/* Bring up any child interfaces too */
		mutex_lock(&priv->vlan_mutex);
		list_for_each_entry(cpriv, &priv->child_intfs, list) {
			int flags;

			flags = cpriv->dev->flags;
			if (flags & IFF_UP)
				continue;

			dev_change_flags(cpriv->dev, flags | IFF_UP);
		}
		mutex_unlock(&priv->vlan_mutex);
	}

	netif_start_queue(dev);

	return 0;
}

static int ipoib_stop(struct net_device *dev)
{
	struct ipoib_dev_priv *priv = netdev_priv(dev);

	ipoib_dbg(priv, "stopping interface\n");

	clear_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags);

	netif_stop_queue(dev);

	clear_bit(IPOIB_FLAG_NETIF_STOPPED, &priv->flags);

	/*
	 * Now flush workqueue to make sure a scheduled task doesn't
	 * bring our internal state back up.
	 */
	flush_workqueue(ipoib_workqueue);

	ipoib_ib_dev_down(dev, 1);
	ipoib_ib_dev_stop(dev);

	if (!test_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags)) {
		struct ipoib_dev_priv *cpriv;

		/* Bring down any child interfaces too */
		mutex_lock(&priv->vlan_mutex);
		list_for_each_entry(cpriv, &priv->child_intfs, list) {
			int flags;

			flags = cpriv->dev->flags;
			if (!(flags & IFF_UP))
				continue;

			dev_change_flags(cpriv->dev, flags & ~IFF_UP);
		}
		mutex_unlock(&priv->vlan_mutex);
	}

	return 0;
}

static int ipoib_change_mtu(struct net_device *dev, int new_mtu)
{
	struct ipoib_dev_priv *priv = netdev_priv(dev);

	/* dev->mtu > 2K ==> connected mode */
	if (ipoib_cm_admin_enabled(dev) && new_mtu <= IPOIB_CM_MTU) {
		if (new_mtu > priv->mcast_mtu)
			ipoib_warn(priv, "mtu > %d will cause multicast packet drops.\n",
				   priv->mcast_mtu);
		dev->mtu = new_mtu;
		return 0;
	}

	if (new_mtu > IPOIB_PACKET_SIZE - IPOIB_ENCAP_LEN)
		return -EINVAL;

	priv->admin_mtu = new_mtu;

	dev->mtu = min(priv->mcast_mtu, priv->admin_mtu);

	return 0;
}

static struct ipoib_path *__path_find(struct net_device *dev, void *gid)
{
	struct ipoib_dev_priv *priv = netdev_priv(dev);
	struct rb_node *n = priv->path_tree.rb_node;
	struct ipoib_path *path;
	int ret;

	while (n) {
		path = rb_entry(n, struct ipoib_path, rb_node);

		ret = memcmp(gid, path->pathrec.dgid.raw,
			     sizeof (union ib_gid));

		if (ret < 0)
			n = n->rb_left;
		else if (ret > 0)
			n = n->rb_right;
		else
			return path;
	}

	return NULL;
}

static int __path_add(struct net_device *dev, struct ipoib_path *path)
{
	struct ipoib_dev_priv *priv = netdev_priv(dev);
	struct rb_node **n = &priv->path_tree.rb_node;
	struct rb_node *pn = NULL;
	struct ipoib_path *tpath;
	int ret;

	while (*n) {
		pn = *n;
		tpath = rb_entry(pn, struct ipoib_path, rb_node);

		ret = memcmp(path->pathrec.dgid.raw, tpath->pathrec.dgid.raw,
			     sizeof (union ib_gid));
		if (ret < 0)
			n = &pn->rb_left;
		else if (ret > 0)
			n = &pn->rb_right;
		else
			return -EEXIST;
	}

	rb_link_node(&path->rb_node, pn, n);
	rb_insert_color(&path->rb_node, &priv->path_tree);

	list_add_tail(&path->list, &priv->path_list);

	return 0;
}

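/*
 * Free a path entry along with any skbs still queued on it, and drop
 * the address handle references held by the path and its neighbours.
 */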
static void path_free(struct net_device *dev, struct ipoib_path *path)
{
	struct ipoib_dev_priv *priv = netdev_priv(dev);
	struct ipoib_neigh *neigh, *tn;
	struct sk_buff *skb;
	unsigned long flags;

	while ((skb = __skb_dequeue(&path->queue)))
		dev_kfree_skb_irq(skb);

	spin_lock_irqsave(&priv->lock, flags);

	list_for_each_entry_safe(neigh, tn, &path->neigh_list, list) {
		/*
		 * It's safe to call ipoib_put_ah() inside priv->lock
		 * here, because we know that path->ah will always
		 * hold one more reference, so ipoib_put_ah() will
		 * never do more than decrement the ref count.
		 */
		if (neigh->ah)
			ipoib_put_ah(neigh->ah);

		ipoib_neigh_free(dev, neigh);
	}

	spin_unlock_irqrestore(&priv->lock, flags);

	if (path->ah)
		ipoib_put_ah(path->ah);

	kfree(path);
}

#ifdef CONFIG_INFINIBAND_IPOIB_DEBUG

struct ipoib_path_iter *ipoib_path_iter_init(struct net_device *dev)
{
	struct ipoib_path_iter *iter;

	iter = kmalloc(sizeof *iter, GFP_KERNEL);
	if (!iter)
		return NULL;

	iter->dev = dev;
	memset(iter->path.pathrec.dgid.raw, 0, 16);

	if (ipoib_path_iter_next(iter)) {
		kfree(iter);
		return NULL;
	}

	return iter;
}

int ipoib_path_iter_next(struct ipoib_path_iter *iter)
{
	struct ipoib_dev_priv *priv = netdev_priv(iter->dev);
	struct rb_node *n;
	struct ipoib_path *path;
	int ret = 1;

	spin_lock_irq(&priv->lock);

	n = rb_first(&priv->path_tree);

	while (n) {
		path = rb_entry(n, struct ipoib_path, rb_node);

		if (memcmp(iter->path.pathrec.dgid.raw, path->pathrec.dgid.raw,
			   sizeof (union ib_gid)) < 0) {
			iter->path = *path;
			ret = 0;
			break;
		}

		n = rb_next(n);
	}

	spin_unlock_irq(&priv->lock);

	return ret;
}

void ipoib_path_iter_read(struct ipoib_path_iter *iter,
			  struct ipoib_path *path)
{
	*path = iter->path;
}

#endif /* CONFIG_INFINIBAND_IPOIB_DEBUG */

void ipoib_flush_paths(struct net_device *dev)
{
	struct ipoib_dev_priv *priv = netdev_priv(dev);
	struct ipoib_path *path, *tp;
	LIST_HEAD(remove_list);

	spin_lock_irq(&priv->tx_lock);
	spin_lock(&priv->lock);

	list_splice(&priv->path_list, &remove_list);
	INIT_LIST_HEAD(&priv->path_list);

	list_for_each_entry(path, &remove_list, list)
		rb_erase(&path->rb_node, &priv->path_tree);

	list_for_each_entry_safe(path, tp, &remove_list, list) {
		if (path->query)
			ib_sa_cancel_query(path->query_id, path->query);
		spin_unlock(&priv->lock);
		spin_unlock_irq(&priv->tx_lock);
		wait_for_completion(&path->done);
		path_free(dev, path);
		spin_lock_irq(&priv->tx_lock);
		spin_lock(&priv->lock);
	}
	spin_unlock(&priv->lock);
	spin_unlock_irq(&priv->tx_lock);
}

static void path_rec_completion(int status,
				struct ib_sa_path_rec *pathrec,
				void *path_ptr)
{
	struct ipoib_path *path = path_ptr;
	struct net_device *dev = path->dev;
	struct ipoib_dev_priv *priv = netdev_priv(dev);
	struct ipoib_ah *ah = NULL;
	struct ipoib_neigh *neigh;
	struct sk_buff_head skqueue;
	struct sk_buff *skb;
	unsigned long flags;

	if (!status)
		ipoib_dbg(priv, "PathRec LID 0x%04x for GID " IPOIB_GID_FMT "\n",
			  be16_to_cpu(pathrec->dlid), IPOIB_GID_ARG(pathrec->dgid));
	else
		ipoib_dbg(priv, "PathRec status %d for GID " IPOIB_GID_FMT "\n",
			  status, IPOIB_GID_ARG(path->pathrec.dgid));

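	/*
	 * Collect skbs that were queued while the path record lookup
	 * was in flight; they are retransmitted below, after
	 * priv->lock is dropped.
	 */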
	skb_queue_head_init(&skqueue);

	if (!status) {
		struct ib_ah_attr av = {
			.dlid	     = be16_to_cpu(pathrec->dlid),
			.sl	     = pathrec->sl,
			.port_num    = priv->port,
			.static_rate = pathrec->rate
		};

		ah = ipoib_create_ah(dev, priv->pd, &av);
	}

	spin_lock_irqsave(&priv->lock, flags);

	path->ah = ah;

	if (ah) {
		path->pathrec = *pathrec;

		ipoib_dbg(priv, "created address handle %p for LID 0x%04x, SL %d\n",
			  ah, be16_to_cpu(pathrec->dlid), pathrec->sl);

		while ((skb = __skb_dequeue(&path->queue)))
			__skb_queue_tail(&skqueue, skb);

		list_for_each_entry(neigh, &path->neigh_list, list) {
			kref_get(&path->ah->ref);
			neigh->ah = path->ah;
			memcpy(&neigh->dgid.raw, &path->pathrec.dgid.raw,
			       sizeof(union ib_gid));

			if (ipoib_cm_enabled(dev, neigh->neighbour)) {
				if (!ipoib_cm_get(neigh))
					ipoib_cm_set(neigh, ipoib_cm_create_tx(dev,
									       path,
									       neigh));
				if (!ipoib_cm_get(neigh)) {
					list_del(&neigh->list);
					if (neigh->ah)
						ipoib_put_ah(neigh->ah);
					ipoib_neigh_free(dev, neigh);
					continue;
				}
			}

			while ((skb = __skb_dequeue(&neigh->queue)))
				__skb_queue_tail(&skqueue, skb);
		}
	}

	path->query = NULL;
	complete(&path->done);

	spin_unlock_irqrestore(&priv->lock, flags);

	while ((skb = __skb_dequeue(&skqueue))) {
		skb->dev = dev;
		if (dev_queue_xmit(skb))
			ipoib_warn(priv, "dev_queue_xmit failed "
				   "to requeue packet\n");
	}
}

static struct ipoib_path *path_rec_create(struct net_device *dev, void *gid)
{
	struct ipoib_dev_priv *priv = netdev_priv(dev);
	struct ipoib_path *path;

	path = kzalloc(sizeof *path, GFP_ATOMIC);
	if (!path)
		return NULL;

	path->dev = dev;

	skb_queue_head_init(&path->queue);

	INIT_LIST_HEAD(&path->neigh_list);

	memcpy(path->pathrec.dgid.raw, gid, sizeof (union ib_gid));
	path->pathrec.sgid	= priv->local_gid;
	path->pathrec.pkey	= cpu_to_be16(priv->pkey);
	path->pathrec.numb_path	= 1;

	return path;
}

static int path_rec_start(struct net_device *dev,
			  struct ipoib_path *path)
{
	struct ipoib_dev_priv *priv = netdev_priv(dev);

	ipoib_dbg(priv, "Start path record lookup for " IPOIB_GID_FMT "\n",
		  IPOIB_GID_ARG(path->pathrec.dgid));

	init_completion(&path->done);

	path->query_id =
		ib_sa_path_rec_get(&ipoib_sa_client, priv->ca, priv->port,
				   &path->pathrec,
				   IB_SA_PATH_REC_DGID	   |
				   IB_SA_PATH_REC_SGID	   |
				   IB_SA_PATH_REC_NUMB_PATH |
				   IB_SA_PATH_REC_PKEY,
				   1000, GFP_ATOMIC,
				   path_rec_completion,
				   path, &path->query);
	if (path->query_id < 0) {
		ipoib_warn(priv, "ib_sa_path_rec_get failed\n");
		path->query = NULL;
		return path->query_id;
	}

	return 0;
}

static void neigh_add_path(struct sk_buff *skb, struct net_device *dev)
{
	struct ipoib_dev_priv *priv = netdev_priv(dev);
	struct ipoib_path *path;
	struct ipoib_neigh *neigh;

	neigh = ipoib_neigh_alloc(skb->dst->neighbour);
	if (!neigh) {
		++priv->stats.tx_dropped;
		dev_kfree_skb_any(skb);
		return;
	}

	/*
	 * We can only be called from ipoib_start_xmit, so we're
	 * inside tx_lock -- no need to save/restore flags.
	 */
	spin_lock(&priv->lock);

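	/*
	 * The 16-byte destination GID lives at offset 4 of the
	 * 20-byte IPoIB hardware address.
	 */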
	path = __path_find(dev, skb->dst->neighbour->ha + 4);
	if (!path) {
		path = path_rec_create(dev, skb->dst->neighbour->ha + 4);
		if (!path)
			goto err_path;

		__path_add(dev, path);
	}

	list_add_tail(&neigh->list, &path->neigh_list);

	if (path->ah) {
		kref_get(&path->ah->ref);
		neigh->ah = path->ah;
		memcpy(&neigh->dgid.raw, &path->pathrec.dgid.raw,
		       sizeof(union ib_gid));

		if (ipoib_cm_enabled(dev, neigh->neighbour)) {
			if (!ipoib_cm_get(neigh))
				ipoib_cm_set(neigh, ipoib_cm_create_tx(dev, path, neigh));
			if (!ipoib_cm_get(neigh)) {
				list_del(&neigh->list);
				if (neigh->ah)
					ipoib_put_ah(neigh->ah);
				ipoib_neigh_free(dev, neigh);
				goto err_drop;
			}
			if (skb_queue_len(&neigh->queue) < IPOIB_MAX_PATH_REC_QUEUE)
				__skb_queue_tail(&neigh->queue, skb);
			else {
				ipoib_warn(priv, "queue length limit %d. Packet drop.\n",
					   skb_queue_len(&neigh->queue));
				goto err_drop;
			}
		} else
			ipoib_send(dev, skb, path->ah, IPOIB_QPN(skb->dst->neighbour->ha));
	} else {
		neigh->ah = NULL;

		if (!path->query && path_rec_start(dev, path))
			goto err_list;

		__skb_queue_tail(&neigh->queue, skb);
	}

	spin_unlock(&priv->lock);
	return;

err_list:
	list_del(&neigh->list);

err_path:
	ipoib_neigh_free(dev, neigh);
err_drop:
	++priv->stats.tx_dropped;
	dev_kfree_skb_any(skb);

	spin_unlock(&priv->lock);
}

static void ipoib_path_lookup(struct sk_buff *skb, struct net_device *dev)
{
	struct ipoib_dev_priv *priv = netdev_priv(skb->dev);

	/* Look up path record for unicasts */
	if (skb->dst->neighbour->ha[4] != 0xff) {
		neigh_add_path(skb, dev);
		return;
	}

	/* Add in the P_Key for multicasts */
	skb->dst->neighbour->ha[8] = (priv->pkey >> 8) & 0xff;
	skb->dst->neighbour->ha[9] = priv->pkey & 0xff;
	ipoib_mcast_send(dev, skb->dst->neighbour->ha + 4, skb);
}

static void unicast_arp_send(struct sk_buff *skb, struct net_device *dev,
			     struct ipoib_pseudoheader *phdr)
{
	struct ipoib_dev_priv *priv = netdev_priv(dev);
	struct ipoib_path *path;

	/*
	 * We can only be called from ipoib_start_xmit, so we're
	 * inside tx_lock -- no need to save/restore flags.
	 */
	spin_lock(&priv->lock);

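	/* Look up (or create) the path for the GID in the pseudoheader. */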
	path = __path_find(dev, phdr->hwaddr + 4);
	if (!path) {
		path = path_rec_create(dev, phdr->hwaddr + 4);
		if (path) {
			/* put pseudoheader back on for next time */
			skb_push(skb, sizeof *phdr);
			__skb_queue_tail(&path->queue, skb);

			if (path_rec_start(dev, path)) {
				spin_unlock(&priv->lock);
				path_free(dev, path);
				return;
			} else
				__path_add(dev, path);
		} else {
			++priv->stats.tx_dropped;
			dev_kfree_skb_any(skb);
		}

		spin_unlock(&priv->lock);
		return;
	}

	if (path->ah) {
		ipoib_dbg(priv, "Send unicast ARP to %04x\n",
			  be16_to_cpu(path->pathrec.dlid));

		ipoib_send(dev, skb, path->ah, IPOIB_QPN(phdr->hwaddr));
	} else if ((path->query || !path_rec_start(dev, path)) &&
		   skb_queue_len(&path->queue) < IPOIB_MAX_PATH_REC_QUEUE) {
		/* put pseudoheader back on for next time */
		skb_push(skb, sizeof *phdr);
		__skb_queue_tail(&path->queue, skb);
	} else {
		++priv->stats.tx_dropped;
		dev_kfree_skb_any(skb);
	}

	spin_unlock(&priv->lock);
}

static int ipoib_start_xmit(struct sk_buff *skb, struct net_device *dev)
{
	struct ipoib_dev_priv *priv = netdev_priv(dev);
	struct ipoib_neigh *neigh;
	unsigned long flags;

	if (unlikely(!spin_trylock_irqsave(&priv->tx_lock, flags)))
		return NETDEV_TX_LOCKED;

	/*
	 * Check if our queue is stopped.  Since we have the LLTX bit
	 * set, we can't rely on netif_stop_queue() preventing our
	 * xmit function from being called with a full queue.
	 */
	if (unlikely(netif_queue_stopped(dev))) {
		spin_unlock_irqrestore(&priv->tx_lock, flags);
		return NETDEV_TX_BUSY;
	}

	if (likely(skb->dst && skb->dst->neighbour)) {
		if (unlikely(!*to_ipoib_neigh(skb->dst->neighbour))) {
			ipoib_path_lookup(skb, dev);
			goto out;
		}

		neigh = *to_ipoib_neigh(skb->dst->neighbour);

		if (ipoib_cm_get(neigh)) {
			if (ipoib_cm_up(neigh)) {
				ipoib_cm_send(dev, skb, ipoib_cm_get(neigh));
				goto out;
			}
		} else if (neigh->ah) {
			if (unlikely(memcmp(&neigh->dgid.raw,
					    skb->dst->neighbour->ha + 4,
					    sizeof(union ib_gid)))) {
				spin_lock(&priv->lock);
				/*
				 * It's safe to call ipoib_put_ah() inside
				 * priv->lock here, because we know that
				 * path->ah will always hold one more reference,
				 * so ipoib_put_ah() will never do more than
				 * decrement the ref count.
				 */
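				/*
				 * The destination GID changed underneath
				 * this neighbour; drop the stale state and
				 * redo the path lookup.
				 */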
				ipoib_put_ah(neigh->ah);
				list_del(&neigh->list);
				ipoib_neigh_free(dev, neigh);
				spin_unlock(&priv->lock);
				ipoib_path_lookup(skb, dev);
				goto out;
			}

			ipoib_send(dev, skb, neigh->ah, IPOIB_QPN(skb->dst->neighbour->ha));
			goto out;
		}

		if (skb_queue_len(&neigh->queue) < IPOIB_MAX_PATH_REC_QUEUE) {
			spin_lock(&priv->lock);
			__skb_queue_tail(&neigh->queue, skb);
			spin_unlock(&priv->lock);
		} else {
			++priv->stats.tx_dropped;
			dev_kfree_skb_any(skb);
		}
	} else {
		struct ipoib_pseudoheader *phdr =
			(struct ipoib_pseudoheader *) skb->data;
		skb_pull(skb, sizeof *phdr);

		if (phdr->hwaddr[4] == 0xff) {
			/* Add in the P_Key for multicast */
			phdr->hwaddr[8] = (priv->pkey >> 8) & 0xff;
			phdr->hwaddr[9] = priv->pkey & 0xff;

			ipoib_mcast_send(dev, phdr->hwaddr + 4, skb);
		} else {
			/* unicast GID -- should be ARP or RARP reply */

			if ((be16_to_cpup((__be16 *) skb->data) != ETH_P_ARP) &&
			    (be16_to_cpup((__be16 *) skb->data) != ETH_P_RARP)) {
				ipoib_warn(priv, "Unicast, no %s: type %04x, QPN %06x "
					   IPOIB_GID_FMT "\n",
					   skb->dst ? "neigh" : "dst",
					   be16_to_cpup((__be16 *) skb->data),
					   IPOIB_QPN(phdr->hwaddr),
					   IPOIB_GID_RAW_ARG(phdr->hwaddr + 4));
				dev_kfree_skb_any(skb);
				++priv->stats.tx_dropped;
				goto out;
			}

			unicast_arp_send(skb, dev, phdr);
		}
	}

out:
	spin_unlock_irqrestore(&priv->tx_lock, flags);

	return NETDEV_TX_OK;
}

static struct net_device_stats *ipoib_get_stats(struct net_device *dev)
{
	struct ipoib_dev_priv *priv = netdev_priv(dev);

	return &priv->stats;
}

static void ipoib_timeout(struct net_device *dev)
{
	struct ipoib_dev_priv *priv = netdev_priv(dev);

	ipoib_warn(priv, "transmit timeout: latency %d msecs\n",
		   jiffies_to_msecs(jiffies - dev->trans_start));
	ipoib_warn(priv, "queue stopped %d, tx_head %u, tx_tail %u\n",
		   netif_queue_stopped(dev),
		   priv->tx_head, priv->tx_tail);
	/* XXX reset QP, etc. */
}

static int ipoib_hard_header(struct sk_buff *skb,
			     struct net_device *dev,
			     unsigned short type,
			     void *daddr, void *saddr, unsigned len)
{
	struct ipoib_header *header;

	header = (struct ipoib_header *) skb_push(skb, sizeof *header);

	header->proto = htons(type);
	header->reserved = 0;

	/*
	 * If we don't have a neighbour structure, stuff the
	 * destination address onto the front of the skb so we can
	 * figure out where to send the packet later.
	 */
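	/* ipoib_start_xmit() pulls this pseudoheader back off before sending. */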
	if ((!skb->dst || !skb->dst->neighbour) && daddr) {
		struct ipoib_pseudoheader *phdr =
			(struct ipoib_pseudoheader *) skb_push(skb, sizeof *phdr);
		memcpy(phdr->hwaddr, daddr, INFINIBAND_ALEN);
	}

	return 0;
}

static void ipoib_set_mcast_list(struct net_device *dev)
{
	struct ipoib_dev_priv *priv = netdev_priv(dev);

	if (!test_bit(IPOIB_FLAG_OPER_UP, &priv->flags)) {
		ipoib_dbg(priv, "IPOIB_FLAG_OPER_UP not set");
		return;
	}

	queue_work(ipoib_workqueue, &priv->restart_task);
}

static void ipoib_neigh_destructor(struct neighbour *n)
{
	struct ipoib_neigh *neigh;
	struct ipoib_dev_priv *priv = netdev_priv(n->dev);
	unsigned long flags;
	struct ipoib_ah *ah = NULL;

	ipoib_dbg(priv,
		  "neigh_destructor for %06x " IPOIB_GID_FMT "\n",
		  IPOIB_QPN(n->ha),
		  IPOIB_GID_RAW_ARG(n->ha + 4));

	spin_lock_irqsave(&priv->lock, flags);

	neigh = *to_ipoib_neigh(n);
	if (neigh) {
		if (neigh->ah)
			ah = neigh->ah;
		list_del(&neigh->list);
		ipoib_neigh_free(n->dev, neigh);
	}

	spin_unlock_irqrestore(&priv->lock, flags);

	if (ah)
		ipoib_put_ah(ah);
}

struct ipoib_neigh *ipoib_neigh_alloc(struct neighbour *neighbour)
{
	struct ipoib_neigh *neigh;

	neigh = kmalloc(sizeof *neigh, GFP_ATOMIC);
	if (!neigh)
		return NULL;

	neigh->neighbour = neighbour;
	*to_ipoib_neigh(neighbour) = neigh;
	skb_queue_head_init(&neigh->queue);
	ipoib_cm_set(neigh, NULL);

	return neigh;
}

void ipoib_neigh_free(struct net_device *dev, struct ipoib_neigh *neigh)
{
	struct ipoib_dev_priv *priv = netdev_priv(dev);
	struct sk_buff *skb;
	*to_ipoib_neigh(neigh->neighbour) = NULL;
	while ((skb = __skb_dequeue(&neigh->queue))) {
		++priv->stats.tx_dropped;
		dev_kfree_skb_any(skb);
	}
	if (ipoib_cm_get(neigh))
		ipoib_cm_destroy_tx(ipoib_cm_get(neigh));
	kfree(neigh);
}

static int ipoib_neigh_setup_dev(struct net_device *dev, struct neigh_parms *parms)
{
	parms->neigh_destructor = ipoib_neigh_destructor;

	return 0;
}

int ipoib_dev_init(struct net_device *dev, struct ib_device *ca, int port)
{
	struct ipoib_dev_priv *priv = netdev_priv(dev);

	/* Allocate RX/TX "rings" to hold queued skbs */
	priv->rx_ring = kzalloc(ipoib_recvq_size * sizeof *priv->rx_ring,
				GFP_KERNEL);
	if (!priv->rx_ring) {
		printk(KERN_WARNING "%s: failed to allocate RX ring (%d entries)\n",
		       ca->name, ipoib_recvq_size);
		goto out;
	}

	priv->tx_ring = kzalloc(ipoib_sendq_size * sizeof *priv->tx_ring,
				GFP_KERNEL);
	if (!priv->tx_ring) {
		printk(KERN_WARNING "%s: failed to allocate TX ring (%d entries)\n",
		       ca->name, ipoib_sendq_size);
		goto out_rx_ring_cleanup;
	}

	/* priv->tx_head & tx_tail are already 0 */

	if (ipoib_ib_dev_init(dev, ca, port))
		goto out_tx_ring_cleanup;

	return 0;

out_tx_ring_cleanup:
	kfree(priv->tx_ring);

out_rx_ring_cleanup:
	kfree(priv->rx_ring);

out:
	return -ENOMEM;
}

void ipoib_dev_cleanup(struct net_device *dev)
{
	struct ipoib_dev_priv *priv = netdev_priv(dev), *cpriv, *tcpriv;

	ipoib_delete_debug_files(dev);

	/* Delete any child interfaces first */
	list_for_each_entry_safe(cpriv, tcpriv, &priv->child_intfs, list) {
		unregister_netdev(cpriv->dev);
		ipoib_dev_cleanup(cpriv->dev);
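		/* The child's rings and IB resources are gone; release its netdev. */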
		free_netdev(cpriv->dev);
	}

	ipoib_ib_dev_cleanup(dev);

	kfree(priv->rx_ring);
	kfree(priv->tx_ring);

	priv->rx_ring = NULL;
	priv->tx_ring = NULL;
}

static void ipoib_setup(struct net_device *dev)
{
	struct ipoib_dev_priv *priv = netdev_priv(dev);

	dev->open		 = ipoib_open;
	dev->stop		 = ipoib_stop;
	dev->change_mtu		 = ipoib_change_mtu;
	dev->hard_start_xmit	 = ipoib_start_xmit;
	dev->get_stats		 = ipoib_get_stats;
	dev->tx_timeout		 = ipoib_timeout;
	dev->hard_header	 = ipoib_hard_header;
	dev->set_multicast_list	 = ipoib_set_mcast_list;
	dev->neigh_setup	 = ipoib_neigh_setup_dev;

	dev->watchdog_timeo	 = HZ;

	dev->flags		|= IFF_BROADCAST | IFF_MULTICAST;

	/*
	 * We add in INFINIBAND_ALEN to allow for the destination
	 * address "pseudoheader" for skbs without neighbour struct.
	 */
	dev->hard_header_len	 = IPOIB_ENCAP_LEN + INFINIBAND_ALEN;
	dev->addr_len		 = INFINIBAND_ALEN;
	dev->type		 = ARPHRD_INFINIBAND;
	dev->tx_queue_len	 = ipoib_sendq_size * 2;
	dev->features		 = NETIF_F_VLAN_CHALLENGED | NETIF_F_LLTX;

	/* MTU will be reset when mcast join happens */
	dev->mtu		 = IPOIB_PACKET_SIZE - IPOIB_ENCAP_LEN;
	priv->mcast_mtu		 = priv->admin_mtu = dev->mtu;

	memcpy(dev->broadcast, ipv4_bcast_addr, INFINIBAND_ALEN);

	netif_carrier_off(dev);

	SET_MODULE_OWNER(dev);

	priv->dev = dev;

	spin_lock_init(&priv->lock);
	spin_lock_init(&priv->tx_lock);

	mutex_init(&priv->mcast_mutex);
	mutex_init(&priv->vlan_mutex);

	INIT_LIST_HEAD(&priv->path_list);
	INIT_LIST_HEAD(&priv->child_intfs);
	INIT_LIST_HEAD(&priv->dead_ahs);
	INIT_LIST_HEAD(&priv->multicast_list);

	INIT_DELAYED_WORK(&priv->pkey_task,    ipoib_pkey_poll);
	INIT_DELAYED_WORK(&priv->mcast_task,   ipoib_mcast_join_task);
	INIT_WORK(&priv->flush_task,   ipoib_ib_dev_flush);
	INIT_WORK(&priv->restart_task, ipoib_mcast_restart_task);
	INIT_DELAYED_WORK(&priv->ah_reap_task, ipoib_reap_ah);
}

struct ipoib_dev_priv *ipoib_intf_alloc(const char *name)
{
	struct net_device *dev;

	dev = alloc_netdev((int) sizeof (struct ipoib_dev_priv), name,
			   ipoib_setup);
	if (!dev)
		return NULL;

	return netdev_priv(dev);
}

static ssize_t show_pkey(struct device *dev,
			 struct device_attribute *attr, char *buf)
{
	struct ipoib_dev_priv *priv = netdev_priv(to_net_dev(dev));

	return sprintf(buf, "0x%04x\n", priv->pkey);
}
static DEVICE_ATTR(pkey, S_IRUGO, show_pkey, NULL);

static ssize_t create_child(struct device *dev,
			    struct device_attribute *attr,
			    const char *buf, size_t count)
{
	int pkey;
	int ret;

	if (sscanf(buf, "%i", &pkey) != 1)
		return -EINVAL;

	if (pkey < 0 || pkey > 0xffff)
		return -EINVAL;

	/*
	 * Set the full membership bit, so that we join the right
	 * broadcast group, etc.
	 */
	pkey |= 0x8000;

	ret = ipoib_vlan_add(to_net_dev(dev), pkey);

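	/* A sysfs store returns the number of bytes consumed on success. */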
	return ret ? ret : count;
}
static DEVICE_ATTR(create_child, S_IWUGO, NULL, create_child);

static ssize_t delete_child(struct device *dev,
			    struct device_attribute *attr,
			    const char *buf, size_t count)
{
	int pkey;
	int ret;

	if (sscanf(buf, "%i", &pkey) != 1)
		return -EINVAL;

	if (pkey < 0 || pkey > 0xffff)
		return -EINVAL;

	ret = ipoib_vlan_delete(to_net_dev(dev), pkey);

	return ret ? ret : count;
}
static DEVICE_ATTR(delete_child, S_IWUGO, NULL, delete_child);

int ipoib_add_pkey_attr(struct net_device *dev)
{
	return device_create_file(&dev->dev, &dev_attr_pkey);
}

static struct net_device *ipoib_add_port(const char *format,
					 struct ib_device *hca, u8 port)
{
	struct ipoib_dev_priv *priv;
	int result = -ENOMEM;

	priv = ipoib_intf_alloc(format);
	if (!priv)
		goto alloc_mem_failed;

	SET_NETDEV_DEV(priv->dev, hca->dma_device);

	result = ib_query_pkey(hca, port, 0, &priv->pkey);
	if (result) {
		printk(KERN_WARNING "%s: ib_query_pkey port %d failed (ret = %d)\n",
		       hca->name, port, result);
		goto alloc_mem_failed;
	}

	/*
	 * Set the full membership bit, so that we join the right
	 * broadcast group, etc.
	 */
	priv->pkey |= 0x8000;

	priv->dev->broadcast[8] = priv->pkey >> 8;
	priv->dev->broadcast[9] = priv->pkey & 0xff;

	result = ib_query_gid(hca, port, 0, &priv->local_gid);
	if (result) {
		printk(KERN_WARNING "%s: ib_query_gid port %d failed (ret = %d)\n",
		       hca->name, port, result);
		goto alloc_mem_failed;
	} else
		memcpy(priv->dev->dev_addr + 4, priv->local_gid.raw, sizeof (union ib_gid));

	result = ipoib_dev_init(priv->dev, hca, port);
	if (result < 0) {
		printk(KERN_WARNING "%s: failed to initialize port %d (ret = %d)\n",
		       hca->name, port, result);
		goto device_init_failed;
	}

	INIT_IB_EVENT_HANDLER(&priv->event_handler,
			      priv->ca, ipoib_event);
	result = ib_register_event_handler(&priv->event_handler);
	if (result < 0) {
		printk(KERN_WARNING "%s: ib_register_event_handler failed for "
		       "port %d (ret = %d)\n",
		       hca->name, port, result);
		goto event_failed;
	}

	result = register_netdev(priv->dev);
	if (result) {
		printk(KERN_WARNING "%s: couldn't register ipoib port %d; error %d\n",
		       hca->name, port, result);
		goto register_failed;
	}

	ipoib_create_debug_files(priv->dev);

	if (ipoib_cm_add_mode_attr(priv->dev))
		goto sysfs_failed;
	if (ipoib_add_pkey_attr(priv->dev))
		goto sysfs_failed;
	if (device_create_file(&priv->dev->dev, &dev_attr_create_child))
		goto sysfs_failed;
	if (device_create_file(&priv->dev->dev, &dev_attr_delete_child))
		goto sysfs_failed;

	return priv->dev;

sysfs_failed:
	ipoib_delete_debug_files(priv->dev);
	unregister_netdev(priv->dev);

register_failed:
	ib_unregister_event_handler(&priv->event_handler);
	flush_scheduled_work();

event_failed:
	ipoib_dev_cleanup(priv->dev);

device_init_failed:
	free_netdev(priv->dev);

alloc_mem_failed:
	return ERR_PTR(result);
}

static void ipoib_add_one(struct ib_device *device)
{
	struct list_head *dev_list;
	struct net_device *dev;
	struct ipoib_dev_priv *priv;
	int s, e, p;

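	/* IPoIB runs only over IB transports; ignore other RDMA devices. */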
	if (rdma_node_get_transport(device->node_type) != RDMA_TRANSPORT_IB)
		return;

	dev_list = kmalloc(sizeof *dev_list, GFP_KERNEL);
	if (!dev_list)
		return;

	INIT_LIST_HEAD(dev_list);

	if (device->node_type == RDMA_NODE_IB_SWITCH) {
		s = 0;
		e = 0;
	} else {
		s = 1;
		e = device->phys_port_cnt;
	}

	for (p = s; p <= e; ++p) {
		dev = ipoib_add_port("ib%d", device, p);
		if (!IS_ERR(dev)) {
			priv = netdev_priv(dev);
			list_add_tail(&priv->list, dev_list);
		}
	}

	ib_set_client_data(device, &ipoib_client, dev_list);
}

static void ipoib_remove_one(struct ib_device *device)
{
	struct ipoib_dev_priv *priv, *tmp;
	struct list_head *dev_list;

	if (rdma_node_get_transport(device->node_type) != RDMA_TRANSPORT_IB)
		return;

	dev_list = ib_get_client_data(device, &ipoib_client);

	list_for_each_entry_safe(priv, tmp, dev_list, list) {
		ib_unregister_event_handler(&priv->event_handler);
		flush_scheduled_work();

		unregister_netdev(priv->dev);
		ipoib_dev_cleanup(priv->dev);
		free_netdev(priv->dev);
	}

	kfree(dev_list);
}

static int __init ipoib_init_module(void)
{
	int ret;

	ipoib_recvq_size = roundup_pow_of_two(ipoib_recvq_size);
	ipoib_recvq_size = min(ipoib_recvq_size, IPOIB_MAX_QUEUE_SIZE);
	ipoib_recvq_size = max(ipoib_recvq_size, IPOIB_MIN_QUEUE_SIZE);

	ipoib_sendq_size = roundup_pow_of_two(ipoib_sendq_size);
	ipoib_sendq_size = min(ipoib_sendq_size, IPOIB_MAX_QUEUE_SIZE);
	ipoib_sendq_size = max(ipoib_sendq_size, IPOIB_MIN_QUEUE_SIZE);

	ret = ipoib_register_debugfs();
	if (ret)
		return ret;

	/*
	 * We create our own workqueue mainly because we want to be
	 * able to flush it when devices are being removed.  We can't
	 * use schedule_work()/flush_scheduled_work() because both
	 * unregister_netdev() and linkwatch_event take the rtnl lock,
	 * so flush_scheduled_work() can deadlock during device
	 * removal.
	 */
	ipoib_workqueue = create_singlethread_workqueue("ipoib");
	if (!ipoib_workqueue) {
		ret = -ENOMEM;
		goto err_fs;
	}

	ib_sa_register_client(&ipoib_sa_client);

	ret = ib_register_client(&ipoib_client);
	if (ret)
		goto err_sa;

	return 0;

err_sa:
	ib_sa_unregister_client(&ipoib_sa_client);
	destroy_workqueue(ipoib_workqueue);

err_fs:
	ipoib_unregister_debugfs();

	return ret;
}

static void __exit ipoib_cleanup_module(void)
{
	ib_unregister_client(&ipoib_client);
	ib_sa_unregister_client(&ipoib_sa_client);
	ipoib_unregister_debugfs();
	destroy_workqueue(ipoib_workqueue);
}

module_init(ipoib_init_module);
module_exit(ipoib_cleanup_module);