/*
 * Copyright (c) 2004 Topspin Communications.  All rights reserved.
 * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved.
 * Copyright (c) 2004 Voltaire, Inc. All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses.  You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the
 * OpenIB.org BSD license below:
 *
 *     Redistribution and use in source and binary forms, with or
 *     without modification, are permitted provided that the following
 *     conditions are met:
 *
 *      - Redistributions of source code must retain the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer.
 *
 *      - Redistributions in binary form must reproduce the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer in the documentation and/or other materials
 *        provided with the distribution.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 * $Id: ipoib_main.c 1377 2004-12-23 19:57:12Z roland $
 */

#include "ipoib.h"

#include <linux/module.h>

#include <linux/init.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>

#include <linux/if_arp.h>	/* For ARPHRD_xxx */

#include <linux/ip.h>
#include <linux/in.h>

MODULE_AUTHOR("Roland Dreier");
MODULE_DESCRIPTION("IP-over-InfiniBand net driver");
MODULE_LICENSE("Dual BSD/GPL");

#ifdef CONFIG_INFINIBAND_IPOIB_DEBUG
int ipoib_debug_level;

module_param_named(debug_level, ipoib_debug_level, int, 0644);
MODULE_PARM_DESC(debug_level, "Enable debug tracing if > 0");
#endif

struct ipoib_path_iter {
	struct net_device *dev;
	struct ipoib_path  path;
};
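/*
 * An IPoIB hardware address is INFINIBAND_ALEN (20) bytes: the
 * destination QPN in the first four bytes (read with be32_to_cpup()
 * in the send paths below) followed by the 16-byte port GID, which is
 * why the code below treats "ha + 4" as a union ib_gid.  The
 * broadcast address is the all-ones multicast QPN plus the IPv4
 * broadcast-mapped multicast GID; bytes 8 and 9 (bytes 4 and 5 of the
 * GID) are patched with the interface's P_Key before use.
 */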
static const u8 ipv4_bcast_addr[] = {
	0x00, 0xff, 0xff, 0xff,
	0xff, 0x12, 0x40, 0x1b, 0x00, 0x00, 0x00, 0x00,
	0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff
};

struct workqueue_struct *ipoib_workqueue;

static void ipoib_add_one(struct ib_device *device);
static void ipoib_remove_one(struct ib_device *device);

static struct ib_client ipoib_client = {
	.name   = "ipoib",
	.add    = ipoib_add_one,
	.remove = ipoib_remove_one
};

int ipoib_open(struct net_device *dev)
{
	struct ipoib_dev_priv *priv = netdev_priv(dev);

	ipoib_dbg(priv, "bringing up interface\n");

	set_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags);

	if (ipoib_pkey_dev_delay_open(dev))
		return 0;

	if (ipoib_ib_dev_open(dev))
		return -EINVAL;

	if (ipoib_ib_dev_up(dev)) {
		ipoib_ib_dev_stop(dev);
		return -EINVAL;
	}

	if (!test_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags)) {
		struct ipoib_dev_priv *cpriv;

		/* Bring up any child interfaces too */
		down(&priv->vlan_mutex);
		list_for_each_entry(cpriv, &priv->child_intfs, list) {
			int flags;

			flags = cpriv->dev->flags;
			if (flags & IFF_UP)
				continue;

			dev_change_flags(cpriv->dev, flags | IFF_UP);
		}
		up(&priv->vlan_mutex);
	}

	netif_start_queue(dev);

	return 0;
}

static int ipoib_stop(struct net_device *dev)
{
	struct ipoib_dev_priv *priv = netdev_priv(dev);

	ipoib_dbg(priv, "stopping interface\n");

	clear_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags);

	netif_stop_queue(dev);

	ipoib_ib_dev_down(dev);
	ipoib_ib_dev_stop(dev);

	if (!test_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags)) {
		struct ipoib_dev_priv *cpriv;

		/* Bring down any child interfaces too */
		down(&priv->vlan_mutex);
		list_for_each_entry(cpriv, &priv->child_intfs, list) {
			int flags;

			flags = cpriv->dev->flags;
			if (!(flags & IFF_UP))
				continue;

			dev_change_flags(cpriv->dev, flags & ~IFF_UP);
		}
		up(&priv->vlan_mutex);
	}

	return 0;
}

static int ipoib_change_mtu(struct net_device *dev, int new_mtu)
{
	struct ipoib_dev_priv *priv = netdev_priv(dev);

	if (new_mtu > IPOIB_PACKET_SIZE - IPOIB_ENCAP_LEN)
		return -EINVAL;

	priv->admin_mtu = new_mtu;

	dev->mtu = min(priv->mcast_mtu, priv->admin_mtu);

	return 0;
}
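/*
 * Resolved paths are cached per device in an rb-tree
 * (priv->path_tree) keyed on destination GID in raw memcmp() order;
 * priv->path_list links the same entries so ipoib_flush_paths() can
 * tear them all down.  Both structures are protected by priv->lock.
 */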
static struct ipoib_path *__path_find(struct net_device *dev,
				      union ib_gid *gid)
{
	struct ipoib_dev_priv *priv = netdev_priv(dev);
	struct rb_node *n = priv->path_tree.rb_node;
	struct ipoib_path *path;
	int ret;

	while (n) {
		path = rb_entry(n, struct ipoib_path, rb_node);

		ret = memcmp(gid->raw, path->pathrec.dgid.raw,
			     sizeof (union ib_gid));

		if (ret < 0)
			n = n->rb_left;
		else if (ret > 0)
			n = n->rb_right;
		else
			return path;
	}

	return NULL;
}

static int __path_add(struct net_device *dev, struct ipoib_path *path)
{
	struct ipoib_dev_priv *priv = netdev_priv(dev);
	struct rb_node **n = &priv->path_tree.rb_node;
	struct rb_node *pn = NULL;
	struct ipoib_path *tpath;
	int ret;

	while (*n) {
		pn = *n;
		tpath = rb_entry(pn, struct ipoib_path, rb_node);

		ret = memcmp(path->pathrec.dgid.raw, tpath->pathrec.dgid.raw,
			     sizeof (union ib_gid));
		if (ret < 0)
			n = &pn->rb_left;
		else if (ret > 0)
			n = &pn->rb_right;
		else
			return -EEXIST;
	}

	rb_link_node(&path->rb_node, pn, n);
	rb_insert_color(&path->rb_node, &priv->path_tree);

	list_add_tail(&path->list, &priv->path_list);

	return 0;
}

static void path_free(struct net_device *dev, struct ipoib_path *path)
{
	struct ipoib_dev_priv *priv = netdev_priv(dev);
	struct ipoib_neigh *neigh, *tn;
	struct sk_buff *skb;
	unsigned long flags;

	while ((skb = __skb_dequeue(&path->queue)))
		dev_kfree_skb_irq(skb);

	spin_lock_irqsave(&priv->lock, flags);

	list_for_each_entry_safe(neigh, tn, &path->neigh_list, list) {
		/*
		 * It's safe to call ipoib_put_ah() inside priv->lock
		 * here, because we know that path->ah will always
		 * hold one more reference, so ipoib_put_ah() will
		 * never do more than decrement the ref count.
		 */
		if (neigh->ah)
			ipoib_put_ah(neigh->ah);
		*to_ipoib_neigh(neigh->neighbour) = NULL;
		neigh->neighbour->ops->destructor = NULL;
		kfree(neigh);
	}

	spin_unlock_irqrestore(&priv->lock, flags);

	if (path->ah)
		ipoib_put_ah(path->ah);

	kfree(path);
}

#ifdef CONFIG_INFINIBAND_IPOIB_DEBUG

struct ipoib_path_iter *ipoib_path_iter_init(struct net_device *dev)
{
	struct ipoib_path_iter *iter;

	iter = kmalloc(sizeof *iter, GFP_KERNEL);
	if (!iter)
		return NULL;

	iter->dev = dev;
	memset(iter->path.pathrec.dgid.raw, 0, 16);

	if (ipoib_path_iter_next(iter)) {
		kfree(iter);
		return NULL;
	}

	return iter;
}

int ipoib_path_iter_next(struct ipoib_path_iter *iter)
{
	struct ipoib_dev_priv *priv = netdev_priv(iter->dev);
	struct rb_node *n;
	struct ipoib_path *path;
	int ret = 1;

	spin_lock_irq(&priv->lock);

	n = rb_first(&priv->path_tree);

	while (n) {
		path = rb_entry(n, struct ipoib_path, rb_node);

		if (memcmp(iter->path.pathrec.dgid.raw, path->pathrec.dgid.raw,
			   sizeof (union ib_gid)) < 0) {
			iter->path = *path;
			ret = 0;
			break;
		}

		n = rb_next(n);
	}

	spin_unlock_irq(&priv->lock);

	return ret;
}

void ipoib_path_iter_read(struct ipoib_path_iter *iter,
			  struct ipoib_path *path)
{
	*path = iter->path;
}

#endif /* CONFIG_INFINIBAND_IPOIB_DEBUG */

void ipoib_flush_paths(struct net_device *dev)
{
	struct ipoib_dev_priv *priv = netdev_priv(dev);
	struct ipoib_path *path, *tp;
	LIST_HEAD(remove_list);
	unsigned long flags;

	spin_lock_irqsave(&priv->lock, flags);

	list_splice(&priv->path_list, &remove_list);
	INIT_LIST_HEAD(&priv->path_list);

	list_for_each_entry(path, &remove_list, list)
		rb_erase(&path->rb_node, &priv->path_tree);

	spin_unlock_irqrestore(&priv->lock, flags);

	list_for_each_entry_safe(path, tp, &remove_list, list) {
		if (path->query)
			ib_sa_cancel_query(path->query_id, path->query);
		wait_for_completion(&path->done);
		path_free(dev, path);
	}
}
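/*
 * path_rec_completion() runs from the SA query callback.  On success
 * it builds an address handle from the returned record and, under
 * priv->lock, hands it to the path and to any neighbours waiting on
 * it; skbs queued during resolution are moved to a local queue and
 * only pushed back through dev_queue_xmit() after the lock is
 * dropped.  The static_rate computed below is the IB inter-packet
 * delay used to throttle a fast local port down to a slower path:
 * e.g. (assuming a 4X local link and a 1X path, so local_rate 4 and
 * path_rate 1) it comes out to (4 - 1) / 1 = 3.
 */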
static void path_rec_completion(int status,
				struct ib_sa_path_rec *pathrec,
				void *path_ptr)
{
	struct ipoib_path *path = path_ptr;
	struct net_device *dev = path->dev;
	struct ipoib_dev_priv *priv = netdev_priv(dev);
	struct ipoib_ah *ah = NULL;
	struct ipoib_neigh *neigh;
	struct sk_buff_head skqueue;
	struct sk_buff *skb;
	unsigned long flags;

	if (pathrec)
		ipoib_dbg(priv, "PathRec LID 0x%04x for GID " IPOIB_GID_FMT "\n",
			  be16_to_cpu(pathrec->dlid), IPOIB_GID_ARG(pathrec->dgid));
	else
		ipoib_dbg(priv, "PathRec status %d for GID " IPOIB_GID_FMT "\n",
			  status, IPOIB_GID_ARG(path->pathrec.dgid));

	skb_queue_head_init(&skqueue);

	if (!status) {
		struct ib_ah_attr av = {
			.dlid     = be16_to_cpu(pathrec->dlid),
			.sl       = pathrec->sl,
			.port_num = priv->port
		};
		int path_rate = ib_sa_rate_enum_to_int(pathrec->rate);

		if (path_rate > 0 && priv->local_rate > path_rate)
			av.static_rate = (priv->local_rate - 1) / path_rate;

		ipoib_dbg(priv, "static_rate %d for local port %dX, path %dX\n",
			  av.static_rate, priv->local_rate,
			  ib_sa_rate_enum_to_int(pathrec->rate));

		ah = ipoib_create_ah(dev, priv->pd, &av);
	}

	spin_lock_irqsave(&priv->lock, flags);

	path->ah = ah;

	if (ah) {
		path->pathrec = *pathrec;

		ipoib_dbg(priv, "created address handle %p for LID 0x%04x, SL %d\n",
			  ah, be16_to_cpu(pathrec->dlid), pathrec->sl);

		while ((skb = __skb_dequeue(&path->queue)))
			__skb_queue_tail(&skqueue, skb);

		list_for_each_entry(neigh, &path->neigh_list, list) {
			kref_get(&path->ah->ref);
			neigh->ah = path->ah;

			while ((skb = __skb_dequeue(&neigh->queue)))
				__skb_queue_tail(&skqueue, skb);
		}
	}

	path->query = NULL;
	complete(&path->done);

	spin_unlock_irqrestore(&priv->lock, flags);

	while ((skb = __skb_dequeue(&skqueue))) {
		skb->dev = dev;
		if (dev_queue_xmit(skb))
			ipoib_warn(priv, "dev_queue_xmit failed "
				   "to requeue packet\n");
	}
}

static struct ipoib_path *path_rec_create(struct net_device *dev,
					  union ib_gid *gid)
{
	struct ipoib_dev_priv *priv = netdev_priv(dev);
	struct ipoib_path *path;

	path = kzalloc(sizeof *path, GFP_ATOMIC);
	if (!path)
		return NULL;

	path->dev = dev;

	skb_queue_head_init(&path->queue);

	INIT_LIST_HEAD(&path->neigh_list);

	memcpy(path->pathrec.dgid.raw, gid->raw, sizeof (union ib_gid));
	path->pathrec.sgid      = priv->local_gid;
	path->pathrec.pkey      = cpu_to_be16(priv->pkey);
	path->pathrec.numb_path = 1;

	return path;
}
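/*
 * Kick off an SA path record query for this path.  The component mask
 * tells the SA which fields of path->pathrec are valid inputs (DGID,
 * SGID, number of paths and P_Key); path_rec_completion() above runs
 * when the query finishes or times out, and path->query is what lets
 * ipoib_flush_paths() cancel a query still in flight.
 */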
static int path_rec_start(struct net_device *dev,
			  struct ipoib_path *path)
{
	struct ipoib_dev_priv *priv = netdev_priv(dev);

	ipoib_dbg(priv, "Start path record lookup for " IPOIB_GID_FMT "\n",
		  IPOIB_GID_ARG(path->pathrec.dgid));

	init_completion(&path->done);

	path->query_id =
		ib_sa_path_rec_get(priv->ca, priv->port,
				   &path->pathrec,
				   IB_SA_PATH_REC_DGID |
				   IB_SA_PATH_REC_SGID |
				   IB_SA_PATH_REC_NUMB_PATH |
				   IB_SA_PATH_REC_PKEY,
				   1000, GFP_ATOMIC,
				   path_rec_completion,
				   path, &path->query);
	if (path->query_id < 0) {
		ipoib_warn(priv, "ib_sa_path_rec_get failed\n");
		path->query = NULL;
		return path->query_id;
	}

	return 0;
}
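/*
 * Per-destination state (struct ipoib_neigh) is hung off the core
 * neighbour entry via to_ipoib_neigh(), so the fast path in
 * ipoib_start_xmit() can go straight from skb->dst->neighbour to an
 * address handle.  Until the path record resolves, outgoing skbs are
 * queued on the neigh, bounded by IPOIB_MAX_PATH_REC_QUEUE.
 */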
static void neigh_add_path(struct sk_buff *skb, struct net_device *dev)
{
	struct ipoib_dev_priv *priv = netdev_priv(dev);
	struct ipoib_path *path;
	struct ipoib_neigh *neigh;

	neigh = kmalloc(sizeof *neigh, GFP_ATOMIC);
	if (!neigh) {
		++priv->stats.tx_dropped;
		dev_kfree_skb_any(skb);
		return;
	}

	skb_queue_head_init(&neigh->queue);
	neigh->neighbour = skb->dst->neighbour;
	*to_ipoib_neigh(skb->dst->neighbour) = neigh;

	/*
	 * We can only be called from ipoib_start_xmit, so we're
	 * inside tx_lock -- no need to save/restore flags.
	 */
	spin_lock(&priv->lock);

	path = __path_find(dev, (union ib_gid *) (skb->dst->neighbour->ha + 4));
	if (!path) {
		path = path_rec_create(dev,
				       (union ib_gid *) (skb->dst->neighbour->ha + 4));
		if (!path)
			goto err;

		__path_add(dev, path);
	}

	list_add_tail(&neigh->list, &path->neigh_list);

	if (path->pathrec.dlid) {
		kref_get(&path->ah->ref);
		neigh->ah = path->ah;

		ipoib_send(dev, skb, path->ah,
			   be32_to_cpup((__be32 *) skb->dst->neighbour->ha));
	} else {
		neigh->ah = NULL;
		if (skb_queue_len(&neigh->queue) < IPOIB_MAX_PATH_REC_QUEUE) {
			__skb_queue_tail(&neigh->queue, skb);
		} else {
			++priv->stats.tx_dropped;
			dev_kfree_skb_any(skb);
		}

		if (!path->query && path_rec_start(dev, path))
			goto err;
	}

	spin_unlock(&priv->lock);
	return;

err:
	*to_ipoib_neigh(skb->dst->neighbour) = NULL;
	list_del(&neigh->list);
	neigh->neighbour->ops->destructor = NULL;
	kfree(neigh);

	++priv->stats.tx_dropped;
	dev_kfree_skb_any(skb);

	spin_unlock(&priv->lock);
}

static void ipoib_path_lookup(struct sk_buff *skb, struct net_device *dev)
{
	struct ipoib_dev_priv *priv = netdev_priv(skb->dev);

	/* Look up path record for unicasts */
	if (skb->dst->neighbour->ha[4] != 0xff) {
		neigh_add_path(skb, dev);
		return;
	}

	/* Add in the P_Key for multicasts */
	skb->dst->neighbour->ha[8] = (priv->pkey >> 8) & 0xff;
	skb->dst->neighbour->ha[9] = priv->pkey & 0xff;
	ipoib_mcast_send(dev, (union ib_gid *) (skb->dst->neighbour->ha + 4), skb);
}

static void unicast_arp_send(struct sk_buff *skb, struct net_device *dev,
			     struct ipoib_pseudoheader *phdr)
{
	struct ipoib_dev_priv *priv = netdev_priv(dev);
	struct ipoib_path *path;

	/*
	 * We can only be called from ipoib_start_xmit, so we're
	 * inside tx_lock -- no need to save/restore flags.
	 */
	spin_lock(&priv->lock);

	path = __path_find(dev, (union ib_gid *) (phdr->hwaddr + 4));
	if (!path) {
		path = path_rec_create(dev,
				       (union ib_gid *) (phdr->hwaddr + 4));
		if (path) {
			/* put pseudoheader back on for next time */
			skb_push(skb, sizeof *phdr);
			__skb_queue_tail(&path->queue, skb);

			if (path_rec_start(dev, path)) {
				spin_unlock(&priv->lock);
				path_free(dev, path);
				return;
			} else
				__path_add(dev, path);
		} else {
			++priv->stats.tx_dropped;
			dev_kfree_skb_any(skb);
		}

		spin_unlock(&priv->lock);
		return;
	}

	if (path->pathrec.dlid) {
		ipoib_dbg(priv, "Send unicast ARP to %04x\n",
			  be16_to_cpu(path->pathrec.dlid));

		ipoib_send(dev, skb, path->ah,
			   be32_to_cpup((__be32 *) phdr->hwaddr));
	} else if ((path->query || !path_rec_start(dev, path)) &&
		   skb_queue_len(&path->queue) < IPOIB_MAX_PATH_REC_QUEUE) {
		/* put pseudoheader back on for next time */
		skb_push(skb, sizeof *phdr);
		__skb_queue_tail(&path->queue, skb);
	} else {
		++priv->stats.tx_dropped;
		dev_kfree_skb_any(skb);
	}

	spin_unlock(&priv->lock);
}
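/*
 * Transmit dispatch: a neighbour with a resolved address handle is
 * the fast path; a neighbour without driver state triggers a path
 * lookup; a neighbour mid-resolution gets its skb queued.  Packets
 * with no neighbour at all fall back to the pseudoheader pushed by
 * ipoib_hard_header(), which covers multicasts and unicast ARP/RARP
 * replies.
 */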
static int ipoib_start_xmit(struct sk_buff *skb, struct net_device *dev)
{
	struct ipoib_dev_priv *priv = netdev_priv(dev);
	struct ipoib_neigh *neigh;
	unsigned long flags;

	if (!spin_trylock_irqsave(&priv->tx_lock, flags))
		return NETDEV_TX_LOCKED;

	/*
	 * Check if our queue is stopped.  Since we have the LLTX bit
	 * set, we can't rely on netif_stop_queue() preventing our
	 * xmit function from being called with a full queue.
	 */
	if (unlikely(netif_queue_stopped(dev))) {
		spin_unlock_irqrestore(&priv->tx_lock, flags);
		return NETDEV_TX_BUSY;
	}

	if (skb->dst && skb->dst->neighbour) {
		if (unlikely(!*to_ipoib_neigh(skb->dst->neighbour))) {
			ipoib_path_lookup(skb, dev);
			goto out;
		}

		neigh = *to_ipoib_neigh(skb->dst->neighbour);

		if (likely(neigh->ah)) {
			ipoib_send(dev, skb, neigh->ah,
				   be32_to_cpup((__be32 *) skb->dst->neighbour->ha));
			goto out;
		}

		if (skb_queue_len(&neigh->queue) < IPOIB_MAX_PATH_REC_QUEUE) {
			spin_lock(&priv->lock);
			__skb_queue_tail(&neigh->queue, skb);
			spin_unlock(&priv->lock);
		} else {
			++priv->stats.tx_dropped;
			dev_kfree_skb_any(skb);
		}
	} else {
		struct ipoib_pseudoheader *phdr =
			(struct ipoib_pseudoheader *) skb->data;
		skb_pull(skb, sizeof *phdr);

		if (phdr->hwaddr[4] == 0xff) {
			/* Add in the P_Key for multicasts */
			phdr->hwaddr[8] = (priv->pkey >> 8) & 0xff;
			phdr->hwaddr[9] = priv->pkey & 0xff;

			ipoib_mcast_send(dev, (union ib_gid *) (phdr->hwaddr + 4), skb);
		} else {
			/* unicast GID -- should be ARP or RARP reply */

			if ((be16_to_cpup((__be16 *) skb->data) != ETH_P_ARP) &&
			    (be16_to_cpup((__be16 *) skb->data) != ETH_P_RARP)) {
				ipoib_warn(priv, "Unicast, no %s: type %04x, QPN %06x "
					   IPOIB_GID_FMT "\n",
					   skb->dst ? "neigh" : "dst",
					   be16_to_cpup((__be16 *) skb->data),
					   be32_to_cpup((__be32 *) phdr->hwaddr),
					   IPOIB_GID_ARG(*(union ib_gid *) (phdr->hwaddr + 4)));
				dev_kfree_skb_any(skb);
				++priv->stats.tx_dropped;
				goto out;
			}

			unicast_arp_send(skb, dev, phdr);
		}
	}

out:
	spin_unlock_irqrestore(&priv->tx_lock, flags);

	return NETDEV_TX_OK;
}

static struct net_device_stats *ipoib_get_stats(struct net_device *dev)
{
	struct ipoib_dev_priv *priv = netdev_priv(dev);

	return &priv->stats;
}

static void ipoib_timeout(struct net_device *dev)
{
	struct ipoib_dev_priv *priv = netdev_priv(dev);

	ipoib_warn(priv, "transmit timeout: latency %d msecs\n",
		   jiffies_to_msecs(jiffies - dev->trans_start));
	ipoib_warn(priv, "queue stopped %d, tx_head %u, tx_tail %u\n",
		   netif_queue_stopped(dev),
		   priv->tx_head, priv->tx_tail);
	/* XXX reset QP, etc. */
}
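/*
 * Build the link-layer header: a 4-byte IPoIB encapsulation header
 * (protocol type plus reserved field).  When there is no neighbour to
 * carry the 20-byte destination address, it is instead pushed onto
 * the front of the skb as a "pseudoheader" that ipoib_start_xmit()
 * pulls back off; dev->hard_header_len in ipoib_setup() reserves room
 * for both.
 */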
static int ipoib_hard_header(struct sk_buff *skb,
			     struct net_device *dev,
			     unsigned short type,
			     void *daddr, void *saddr, unsigned len)
{
	struct ipoib_header *header;

	header = (struct ipoib_header *) skb_push(skb, sizeof *header);

	header->proto = htons(type);
	header->reserved = 0;

	/*
	 * If we don't have a neighbour structure, stuff the
	 * destination address onto the front of the skb so we can
	 * figure out where to send the packet later.
	 */
	if (!skb->dst || !skb->dst->neighbour) {
		struct ipoib_pseudoheader *phdr =
			(struct ipoib_pseudoheader *) skb_push(skb, sizeof *phdr);
		memcpy(phdr->hwaddr, daddr, INFINIBAND_ALEN);
	}

	return 0;
}

static void ipoib_set_mcast_list(struct net_device *dev)
{
	struct ipoib_dev_priv *priv = netdev_priv(dev);

	queue_work(ipoib_workqueue, &priv->restart_task);
}

static void ipoib_neigh_destructor(struct neighbour *n)
{
	struct ipoib_neigh *neigh;
	struct ipoib_dev_priv *priv = netdev_priv(n->dev);
	unsigned long flags;
	struct ipoib_ah *ah = NULL;

	ipoib_dbg(priv,
		  "neigh_destructor for %06x " IPOIB_GID_FMT "\n",
		  be32_to_cpup((__be32 *) n->ha),
		  IPOIB_GID_ARG(*((union ib_gid *) (n->ha + 4))));

	spin_lock_irqsave(&priv->lock, flags);

	neigh = *to_ipoib_neigh(n);
	if (neigh) {
		if (neigh->ah)
			ah = neigh->ah;
		list_del(&neigh->list);
		*to_ipoib_neigh(n) = NULL;
		kfree(neigh);
	}

	spin_unlock_irqrestore(&priv->lock, flags);

	if (ah)
		ipoib_put_ah(ah);
}

static int ipoib_neigh_setup(struct neighbour *neigh)
{
	/*
	 * Is this kosher? I can't find anybody in the kernel that
	 * sets neigh->destructor, so we should be able to set it here
	 * without trouble.
	 */
	neigh->ops->destructor = ipoib_neigh_destructor;

	return 0;
}

static int ipoib_neigh_setup_dev(struct net_device *dev, struct neigh_parms *parms)
{
	parms->neigh_setup = ipoib_neigh_setup;

	return 0;
}

int ipoib_dev_init(struct net_device *dev, struct ib_device *ca, int port)
{
	struct ipoib_dev_priv *priv = netdev_priv(dev);

	/* Allocate RX/TX "rings" to hold queued skbs */

	priv->rx_ring = kzalloc(IPOIB_RX_RING_SIZE * sizeof (struct ipoib_rx_buf),
				GFP_KERNEL);
	if (!priv->rx_ring) {
		printk(KERN_WARNING "%s: failed to allocate RX ring (%d entries)\n",
		       ca->name, IPOIB_RX_RING_SIZE);
		goto out;
	}

	priv->tx_ring = kzalloc(IPOIB_TX_RING_SIZE * sizeof (struct ipoib_tx_buf),
				GFP_KERNEL);
	if (!priv->tx_ring) {
		printk(KERN_WARNING "%s: failed to allocate TX ring (%d entries)\n",
		       ca->name, IPOIB_TX_RING_SIZE);
		goto out_rx_ring_cleanup;
	}

	/* priv->tx_head & tx_tail are already 0 */

	if (ipoib_ib_dev_init(dev, ca, port))
		goto out_tx_ring_cleanup;

	return 0;

out_tx_ring_cleanup:
	kfree(priv->tx_ring);

out_rx_ring_cleanup:
	kfree(priv->rx_ring);

out:
	return -ENOMEM;
}

void ipoib_dev_cleanup(struct net_device *dev)
{
	struct ipoib_dev_priv *priv = netdev_priv(dev), *cpriv, *tcpriv;

	ipoib_delete_debug_files(dev);

	/* Delete any child interfaces first */
	list_for_each_entry_safe(cpriv, tcpriv, &priv->child_intfs, list) {
		unregister_netdev(cpriv->dev);
		ipoib_dev_cleanup(cpriv->dev);
		free_netdev(cpriv->dev);
	}

	ipoib_ib_dev_cleanup(dev);

	kfree(priv->rx_ring);
	kfree(priv->tx_ring);

	priv->rx_ring = NULL;
	priv->tx_ring = NULL;
}
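/*
 * net_device initialization.  Note the MTU arithmetic:
 * IPOIB_PACKET_SIZE is the IB payload budget, so the initial MTU is
 * that minus the encapsulation header, and ipoib_change_mtu() later
 * clamps it to the broadcast group's MTU (priv->mcast_mtu).  The
 * NETIF_F_LLTX flag means the stack does not serialize calls into
 * ipoib_start_xmit(), which is why that function takes priv->tx_lock
 * itself.
 */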
static void ipoib_setup(struct net_device *dev)
{
	struct ipoib_dev_priv *priv = netdev_priv(dev);

	dev->open               = ipoib_open;
	dev->stop               = ipoib_stop;
	dev->change_mtu         = ipoib_change_mtu;
	dev->hard_start_xmit    = ipoib_start_xmit;
	dev->get_stats          = ipoib_get_stats;
	dev->tx_timeout         = ipoib_timeout;
	dev->hard_header        = ipoib_hard_header;
	dev->set_multicast_list = ipoib_set_mcast_list;
	dev->neigh_setup        = ipoib_neigh_setup_dev;

	dev->watchdog_timeo     = HZ;

	dev->flags             |= IFF_BROADCAST | IFF_MULTICAST;

	/*
	 * We add in INFINIBAND_ALEN to allow for the destination
	 * address "pseudoheader" for skbs without neighbour struct.
	 */
	dev->hard_header_len    = IPOIB_ENCAP_LEN + INFINIBAND_ALEN;
	dev->addr_len           = INFINIBAND_ALEN;
	dev->type               = ARPHRD_INFINIBAND;
	dev->tx_queue_len       = IPOIB_TX_RING_SIZE * 2;
	dev->features           = NETIF_F_VLAN_CHALLENGED | NETIF_F_LLTX;

	/* MTU will be reset when mcast join happens */
	dev->mtu                = IPOIB_PACKET_SIZE - IPOIB_ENCAP_LEN;
	priv->mcast_mtu         = priv->admin_mtu = dev->mtu;

	memcpy(dev->broadcast, ipv4_bcast_addr, INFINIBAND_ALEN);

	netif_carrier_off(dev);

	SET_MODULE_OWNER(dev);

	priv->dev = dev;

	spin_lock_init(&priv->lock);
	spin_lock_init(&priv->tx_lock);

	init_MUTEX(&priv->mcast_mutex);
	init_MUTEX(&priv->vlan_mutex);

	INIT_LIST_HEAD(&priv->path_list);
	INIT_LIST_HEAD(&priv->child_intfs);
	INIT_LIST_HEAD(&priv->dead_ahs);
	INIT_LIST_HEAD(&priv->multicast_list);

	INIT_WORK(&priv->pkey_task,    ipoib_pkey_poll,          priv->dev);
	INIT_WORK(&priv->mcast_task,   ipoib_mcast_join_task,    priv->dev);
	INIT_WORK(&priv->flush_task,   ipoib_ib_dev_flush,       priv->dev);
	INIT_WORK(&priv->restart_task, ipoib_mcast_restart_task, priv->dev);
	INIT_WORK(&priv->ah_reap_task, ipoib_reap_ah,            priv->dev);
}

struct ipoib_dev_priv *ipoib_intf_alloc(const char *name)
{
	struct net_device *dev;

	dev = alloc_netdev((int) sizeof (struct ipoib_dev_priv), name,
			   ipoib_setup);
	if (!dev)
		return NULL;

	return netdev_priv(dev);
}

static ssize_t show_pkey(struct class_device *cdev, char *buf)
{
	struct ipoib_dev_priv *priv =
		netdev_priv(container_of(cdev, struct net_device, class_dev));

	return sprintf(buf, "0x%04x\n", priv->pkey);
}
static CLASS_DEVICE_ATTR(pkey, S_IRUGO, show_pkey, NULL);

static ssize_t create_child(struct class_device *cdev,
			    const char *buf, size_t count)
{
	int pkey;
	int ret;

	if (sscanf(buf, "%i", &pkey) != 1)
		return -EINVAL;

	if (pkey < 0 || pkey > 0xffff)
		return -EINVAL;

	/*
	 * Set the full membership bit, so that we join the right
	 * broadcast group, etc.
	 */
	pkey |= 0x8000;

	ret = ipoib_vlan_add(container_of(cdev, struct net_device, class_dev),
			     pkey);

	return ret ? ret : count;
}
static CLASS_DEVICE_ATTR(create_child, S_IWUGO, NULL, create_child);

static ssize_t delete_child(struct class_device *cdev,
			    const char *buf, size_t count)
{
	int pkey;
	int ret;

	if (sscanf(buf, "%i", &pkey) != 1)
		return -EINVAL;

	if (pkey < 0 || pkey > 0xffff)
		return -EINVAL;

	ret = ipoib_vlan_delete(container_of(cdev, struct net_device, class_dev),
				pkey);

	return ret ? ret : count;
}
static CLASS_DEVICE_ATTR(delete_child, S_IWUGO, NULL, delete_child);

int ipoib_add_pkey_attr(struct net_device *dev)
{
	return class_device_create_file(&dev->class_dev,
					&class_device_attr_pkey);
}
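/*
 * Create one interface for an HCA port: read P_Key 0 and GID 0 from
 * the port, set the full-membership bit in the P_Key, patch it into
 * the broadcast address, and copy the GID into bytes 4..19 of the
 * hardware address.  (The leading QPN bytes of dev_addr are
 * presumably filled in elsewhere once the UD QP is created -- they
 * are not set here.)
 */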
static struct net_device *ipoib_add_port(const char *format,
					 struct ib_device *hca, u8 port)
{
	struct ipoib_dev_priv *priv;
	int result = -ENOMEM;

	priv = ipoib_intf_alloc(format);
	if (!priv)
		goto alloc_mem_failed;

	SET_NETDEV_DEV(priv->dev, hca->dma_device);

	result = ib_query_pkey(hca, port, 0, &priv->pkey);
	if (result) {
		printk(KERN_WARNING "%s: ib_query_pkey port %d failed (ret = %d)\n",
		       hca->name, port, result);
		goto alloc_mem_failed;
	}

	/*
	 * Set the full membership bit, so that we join the right
	 * broadcast group, etc.
	 */
	priv->pkey |= 0x8000;

	priv->dev->broadcast[8] = priv->pkey >> 8;
	priv->dev->broadcast[9] = priv->pkey & 0xff;

	result = ib_query_gid(hca, port, 0, &priv->local_gid);
	if (result) {
		printk(KERN_WARNING "%s: ib_query_gid port %d failed (ret = %d)\n",
		       hca->name, port, result);
		goto alloc_mem_failed;
	} else
		memcpy(priv->dev->dev_addr + 4, priv->local_gid.raw, sizeof (union ib_gid));

	result = ipoib_dev_init(priv->dev, hca, port);
	if (result < 0) {
		printk(KERN_WARNING "%s: failed to initialize port %d (ret = %d)\n",
		       hca->name, port, result);
		goto device_init_failed;
	}

	INIT_IB_EVENT_HANDLER(&priv->event_handler,
			      priv->ca, ipoib_event);
	result = ib_register_event_handler(&priv->event_handler);
	if (result < 0) {
		printk(KERN_WARNING "%s: ib_register_event_handler failed for "
		       "port %d (ret = %d)\n",
		       hca->name, port, result);
		goto event_failed;
	}

	result = register_netdev(priv->dev);
	if (result) {
		printk(KERN_WARNING "%s: couldn't register ipoib port %d; error %d\n",
		       hca->name, port, result);
		goto register_failed;
	}

	ipoib_create_debug_files(priv->dev);

	if (ipoib_add_pkey_attr(priv->dev))
		goto sysfs_failed;
	if (class_device_create_file(&priv->dev->class_dev,
				     &class_device_attr_create_child))
		goto sysfs_failed;
	if (class_device_create_file(&priv->dev->class_dev,
				     &class_device_attr_delete_child))
		goto sysfs_failed;

	return priv->dev;

sysfs_failed:
	ipoib_delete_debug_files(priv->dev);
	unregister_netdev(priv->dev);

register_failed:
	ib_unregister_event_handler(&priv->event_handler);
	flush_scheduled_work();

event_failed:
	ipoib_dev_cleanup(priv->dev);

device_init_failed:
	free_netdev(priv->dev);

alloc_mem_failed:
	return ERR_PTR(result);
}

static void ipoib_add_one(struct ib_device *device)
{
	struct list_head *dev_list;
	struct net_device *dev;
	struct ipoib_dev_priv *priv;
	int s, e, p;

	dev_list = kmalloc(sizeof *dev_list, GFP_KERNEL);
	if (!dev_list)
		return;

	INIT_LIST_HEAD(dev_list);

	if (device->node_type == IB_NODE_SWITCH) {
		s = 0;
		e = 0;
	} else {
		s = 1;
		e = device->phys_port_cnt;
	}

	for (p = s; p <= e; ++p) {
		dev = ipoib_add_port("ib%d", device, p);
		if (!IS_ERR(dev)) {
			priv = netdev_priv(dev);
			list_add_tail(&priv->list, dev_list);
		}
	}

	ib_set_client_data(device, &ipoib_client, dev_list);
}

static void ipoib_remove_one(struct ib_device *device)
{
	struct ipoib_dev_priv *priv, *tmp;
	struct list_head *dev_list;

	dev_list = ib_get_client_data(device, &ipoib_client);

	list_for_each_entry_safe(priv, tmp, dev_list, list) {
		ib_unregister_event_handler(&priv->event_handler);
		flush_scheduled_work();

		unregister_netdev(priv->dev);
		ipoib_dev_cleanup(priv->dev);
		free_netdev(priv->dev);
	}

	kfree(dev_list);
}

static int __init ipoib_init_module(void)
{
	int ret;

	ret = ipoib_register_debugfs();
	if (ret)
		return ret;

	/*
	 * We create our own workqueue mainly because we want to be
	 * able to flush it when devices are being removed.  We can't
	 * use schedule_work()/flush_scheduled_work() because both
	 * unregister_netdev() and linkwatch_event take the rtnl lock,
	 * so flush_scheduled_work() can deadlock during device
	 * removal.
	 */
	ipoib_workqueue = create_singlethread_workqueue("ipoib");
	if (!ipoib_workqueue) {
		ret = -ENOMEM;
		goto err_fs;
	}

	ret = ib_register_client(&ipoib_client);
	if (ret)
		goto err_wq;

	return 0;

err_wq:
	destroy_workqueue(ipoib_workqueue);

err_fs:
	ipoib_unregister_debugfs();

	return ret;
}

static void __exit ipoib_cleanup_module(void)
{
	ib_unregister_client(&ipoib_client);
	ipoib_unregister_debugfs();
	destroy_workqueue(ipoib_workqueue);
}

module_init(ipoib_init_module);
module_exit(ipoib_cleanup_module);