/*
 * Copyright (c) 2004 Topspin Communications.  All rights reserved.
 * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved.
 * Copyright (c) 2004 Voltaire, Inc. All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses.  You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the
 * OpenIB.org BSD license below:
 *
 *     Redistribution and use in source and binary forms, with or
 *     without modification, are permitted provided that the following
 *     conditions are met:
 *
 *      - Redistributions of source code must retain the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer.
 *
 *      - Redistributions in binary form must reproduce the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer in the documentation and/or other materials
 *        provided with the distribution.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include "ipoib.h"

#include <linux/module.h>

#include <linux/init.h>
#include <linux/slab.h>
#include <linux/kernel.h>
#include <linux/vmalloc.h>

#include <linux/if_arp.h>	/* For ARPHRD_xxx */

#include <linux/ip.h>
#include <linux/in.h>

#include <linux/jhash.h>
#include <net/arp.h>

MODULE_AUTHOR("Roland Dreier");
MODULE_DESCRIPTION("IP-over-InfiniBand net driver");
MODULE_LICENSE("Dual BSD/GPL");

int ipoib_sendq_size __read_mostly = IPOIB_TX_RING_SIZE;
int ipoib_recvq_size __read_mostly = IPOIB_RX_RING_SIZE;

module_param_named(send_queue_size, ipoib_sendq_size, int, 0444);
MODULE_PARM_DESC(send_queue_size, "Number of descriptors in send queue");
module_param_named(recv_queue_size, ipoib_recvq_size, int, 0444);
MODULE_PARM_DESC(recv_queue_size, "Number of descriptors in receive queue");
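
/*
 * Example (hypothetical values; the driver is typically loaded as the
 * ib_ipoib module):
 *
 *   modprobe ib_ipoib send_queue_size=128 recv_queue_size=256
 *
 * Both values are rounded up to a power of two and clamped to the
 * [IPOIB_MIN_QUEUE_SIZE, IPOIB_MAX_QUEUE_SIZE] range by
 * ipoib_init_module() below.
 */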

#ifdef CONFIG_INFINIBAND_IPOIB_DEBUG
int ipoib_debug_level;

module_param_named(debug_level, ipoib_debug_level, int, 0644);
MODULE_PARM_DESC(debug_level, "Enable debug tracing if > 0");
#endif

struct ipoib_path_iter {
	struct net_device *dev;
	struct ipoib_path  path;
};

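/*
 * An IPoIB hardware address is 20 bytes: one reserved/flags octet, a
 * 3-byte QPN, and a 16-byte GID.  The broadcast address below uses the
 * all-ones QPN (0xffffff) and the IPv4 broadcast MGID
 * (ff12:401b::ffff:ffff); octets 8 and 9 are overwritten with the
 * P_Key before use (see ipoib_add_port() and ipoib_start_xmit()).
 */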
static const u8 ipv4_bcast_addr[] = {
	0x00, 0xff, 0xff, 0xff,
	0xff, 0x12, 0x40, 0x1b,	0x00, 0x00, 0x00, 0x00,
	0x00, 0x00, 0x00, 0x00,	0xff, 0xff, 0xff, 0xff
};

struct workqueue_struct *ipoib_workqueue;

struct ib_sa_client ipoib_sa_client;

static void ipoib_add_one(struct ib_device *device);
static void ipoib_remove_one(struct ib_device *device);
static void ipoib_neigh_reclaim(struct rcu_head *rp);

static struct ib_client ipoib_client = {
	.name   = "ipoib",
	.add    = ipoib_add_one,
	.remove = ipoib_remove_one
};

int ipoib_open(struct net_device *dev)
{
	struct ipoib_dev_priv *priv = netdev_priv(dev);

	ipoib_dbg(priv, "bringing up interface\n");

	set_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags);

	if (ipoib_pkey_dev_delay_open(dev))
		return 0;

	if (ipoib_ib_dev_open(dev))
		goto err_disable;

	if (ipoib_ib_dev_up(dev))
		goto err_stop;

	if (!test_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags)) {
		struct ipoib_dev_priv *cpriv;

		/* Bring up any child interfaces too */
		mutex_lock(&priv->vlan_mutex);
		list_for_each_entry(cpriv, &priv->child_intfs, list) {
			int flags;

			flags = cpriv->dev->flags;
			if (flags & IFF_UP)
				continue;

			dev_change_flags(cpriv->dev, flags | IFF_UP);
		}
		mutex_unlock(&priv->vlan_mutex);
	}

	netif_start_queue(dev);

	return 0;

err_stop:
	ipoib_ib_dev_stop(dev, 1);

err_disable:
	clear_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags);

	return -EINVAL;
}

static int ipoib_stop(struct net_device *dev)
{
	struct ipoib_dev_priv *priv = netdev_priv(dev);

	ipoib_dbg(priv, "stopping interface\n");

	clear_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags);

	netif_stop_queue(dev);

	ipoib_ib_dev_down(dev, 0);
	ipoib_ib_dev_stop(dev, 0);

	if (!test_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags)) {
		struct ipoib_dev_priv *cpriv;

		/* Bring down any child interfaces too */
		mutex_lock(&priv->vlan_mutex);
		list_for_each_entry(cpriv, &priv->child_intfs, list) {
			int flags;

			flags = cpriv->dev->flags;
			if (!(flags & IFF_UP))
				continue;

			dev_change_flags(cpriv->dev, flags & ~IFF_UP);
		}
		mutex_unlock(&priv->vlan_mutex);
	}

	return 0;
}

static netdev_features_t ipoib_fix_features(struct net_device *dev, netdev_features_t features)
{
	struct ipoib_dev_priv *priv = netdev_priv(dev);

	if (test_bit(IPOIB_FLAG_ADMIN_CM, &priv->flags))
		features &= ~(NETIF_F_SG | NETIF_F_IP_CSUM | NETIF_F_TSO);

	return features;
}

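/*
 * MTU policy: in connected mode the MTU may be raised up to
 * ipoib_cm_max_mtu(); in datagram mode it is bounded by the port's IB
 * MTU, and the effective value is min(mcast_mtu, admin_mtu) so that
 * multicast traffic still fits in a single send.
 */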
static int ipoib_change_mtu(struct net_device *dev, int new_mtu)
{
	struct ipoib_dev_priv *priv = netdev_priv(dev);

	/* dev->mtu > 2K ==> connected mode */
	if (ipoib_cm_admin_enabled(dev)) {
		if (new_mtu > ipoib_cm_max_mtu(dev))
			return -EINVAL;

		if (new_mtu > priv->mcast_mtu)
			ipoib_warn(priv, "mtu > %d will cause multicast packet drops.\n",
				   priv->mcast_mtu);

		dev->mtu = new_mtu;
		return 0;
	}

	if (new_mtu > IPOIB_UD_MTU(priv->max_ib_mtu))
		return -EINVAL;

	priv->admin_mtu = new_mtu;

	dev->mtu = min(priv->mcast_mtu, priv->admin_mtu);

	return 0;
}

static struct ipoib_path *__path_find(struct net_device *dev, void *gid)
{
	struct ipoib_dev_priv *priv = netdev_priv(dev);
	struct rb_node *n = priv->path_tree.rb_node;
	struct ipoib_path *path;
	int ret;

	while (n) {
		path = rb_entry(n, struct ipoib_path, rb_node);

		ret = memcmp(gid, path->pathrec.dgid.raw,
			     sizeof (union ib_gid));

		if (ret < 0)
			n = n->rb_left;
		else if (ret > 0)
			n = n->rb_right;
		else
			return path;
	}

	return NULL;
}

static int __path_add(struct net_device *dev, struct ipoib_path *path)
{
	struct ipoib_dev_priv *priv = netdev_priv(dev);
	struct rb_node **n = &priv->path_tree.rb_node;
	struct rb_node *pn = NULL;
	struct ipoib_path *tpath;
	int ret;

	while (*n) {
		pn = *n;
		tpath = rb_entry(pn, struct ipoib_path, rb_node);

		ret = memcmp(path->pathrec.dgid.raw, tpath->pathrec.dgid.raw,
			     sizeof (union ib_gid));
		if (ret < 0)
			n = &pn->rb_left;
		else if (ret > 0)
			n = &pn->rb_right;
		else
			return -EEXIST;
	}

	rb_link_node(&path->rb_node, pn, n);
	rb_insert_color(&path->rb_node, &priv->path_tree);

	list_add_tail(&path->list, &priv->path_list);

	return 0;
}

static void path_free(struct net_device *dev, struct ipoib_path *path)
{
	struct sk_buff *skb;

	while ((skb = __skb_dequeue(&path->queue)))
		dev_kfree_skb_irq(skb);

	ipoib_dbg(netdev_priv(dev), "path_free\n");

	/* remove all neighs connected to this path */
	ipoib_del_neighs_by_gid(dev, path->pathrec.dgid.raw);

	if (path->ah)
		ipoib_put_ah(path->ah);

	kfree(path);
}

#ifdef CONFIG_INFINIBAND_IPOIB_DEBUG

struct ipoib_path_iter *ipoib_path_iter_init(struct net_device *dev)
{
	struct ipoib_path_iter *iter;

	iter = kmalloc(sizeof *iter, GFP_KERNEL);
	if (!iter)
		return NULL;

	iter->dev = dev;
	memset(iter->path.pathrec.dgid.raw, 0, 16);

	if (ipoib_path_iter_next(iter)) {
		kfree(iter);
		return NULL;
	}

	return iter;
}

int ipoib_path_iter_next(struct ipoib_path_iter *iter)
{
	struct ipoib_dev_priv *priv = netdev_priv(iter->dev);
	struct rb_node *n;
	struct ipoib_path *path;
	int ret = 1;

	spin_lock_irq(&priv->lock);

	n = rb_first(&priv->path_tree);

	while (n) {
		path = rb_entry(n, struct ipoib_path, rb_node);

		if (memcmp(iter->path.pathrec.dgid.raw, path->pathrec.dgid.raw,
			   sizeof (union ib_gid)) < 0) {
			iter->path = *path;
			ret = 0;
			break;
		}

		n = rb_next(n);
	}

	spin_unlock_irq(&priv->lock);

	return ret;
}

void ipoib_path_iter_read(struct ipoib_path_iter *iter,
			  struct ipoib_path *path)
{
	*path = iter->path;
}

#endif /* CONFIG_INFINIBAND_IPOIB_DEBUG */

void ipoib_mark_paths_invalid(struct net_device *dev)
{
	struct ipoib_dev_priv *priv = netdev_priv(dev);
	struct ipoib_path *path, *tp;

	spin_lock_irq(&priv->lock);

	list_for_each_entry_safe(path, tp, &priv->path_list, list) {
		ipoib_dbg(priv, "mark path LID 0x%04x GID %pI6 invalid\n",
			be16_to_cpu(path->pathrec.dlid),
			path->pathrec.dgid.raw);
		path->valid = 0;
	}

	spin_unlock_irq(&priv->lock);
}

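/*
 * Note the unlock/relock dance below: both the TX lock and priv->lock
 * are dropped around wait_for_completion(), which may sleep and which
 * is only completed from path_rec_completion() with priv->lock held.
 * The paths were already unlinked from the tree and list, so no one
 * else can reach them while the locks are dropped.
 */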
void ipoib_flush_paths(struct net_device *dev)
{
	struct ipoib_dev_priv *priv = netdev_priv(dev);
	struct ipoib_path *path, *tp;
	LIST_HEAD(remove_list);
	unsigned long flags;

	netif_tx_lock_bh(dev);
	spin_lock_irqsave(&priv->lock, flags);

	list_splice_init(&priv->path_list, &remove_list);

	list_for_each_entry(path, &remove_list, list)
		rb_erase(&path->rb_node, &priv->path_tree);

	list_for_each_entry_safe(path, tp, &remove_list, list) {
		if (path->query)
			ib_sa_cancel_query(path->query_id, path->query);
		spin_unlock_irqrestore(&priv->lock, flags);
		netif_tx_unlock_bh(dev);
		wait_for_completion(&path->done);
		path_free(dev, path);
		netif_tx_lock_bh(dev);
		spin_lock_irqsave(&priv->lock, flags);
	}

	spin_unlock_irqrestore(&priv->lock, flags);
	netif_tx_unlock_bh(dev);
}

static void path_rec_completion(int status,
				struct ib_sa_path_rec *pathrec,
				void *path_ptr)
{
	struct ipoib_path *path = path_ptr;
	struct net_device *dev = path->dev;
	struct ipoib_dev_priv *priv = netdev_priv(dev);
	struct ipoib_ah *ah = NULL;
	struct ipoib_ah *old_ah = NULL;
	struct ipoib_neigh *neigh, *tn;
	struct sk_buff_head skqueue;
	struct sk_buff *skb;
	unsigned long flags;

	if (!status)
		ipoib_dbg(priv, "PathRec LID 0x%04x for GID %pI6\n",
			  be16_to_cpu(pathrec->dlid), pathrec->dgid.raw);
	else
		ipoib_dbg(priv, "PathRec status %d for GID %pI6\n",
			  status, path->pathrec.dgid.raw);

	skb_queue_head_init(&skqueue);

	if (!status) {
		struct ib_ah_attr av;

		if (!ib_init_ah_from_path(priv->ca, priv->port, pathrec, &av))
			ah = ipoib_create_ah(dev, priv->pd, &av);
	}

	spin_lock_irqsave(&priv->lock, flags);

	if (!IS_ERR_OR_NULL(ah)) {
		path->pathrec = *pathrec;

		old_ah   = path->ah;
		path->ah = ah;

		ipoib_dbg(priv, "created address handle %p for LID 0x%04x, SL %d\n",
			  ah, be16_to_cpu(pathrec->dlid), pathrec->sl);

		while ((skb = __skb_dequeue(&path->queue)))
			__skb_queue_tail(&skqueue, skb);

		list_for_each_entry_safe(neigh, tn, &path->neigh_list, list) {
			if (neigh->ah) {
				WARN_ON(neigh->ah != old_ah);
				/*
				 * Dropping the ah reference inside
				 * priv->lock is safe here, because we
				 * will hold one more reference from
				 * the original value of path->ah (ie
				 * old_ah).
				 */
				ipoib_put_ah(neigh->ah);
			}
			kref_get(&path->ah->ref);
			neigh->ah = path->ah;

			if (ipoib_cm_enabled(dev, neigh->daddr)) {
				if (!ipoib_cm_get(neigh))
					ipoib_cm_set(neigh, ipoib_cm_create_tx(dev,
									       path,
									       neigh));
				if (!ipoib_cm_get(neigh)) {
					list_del(&neigh->list);
					ipoib_neigh_free(neigh);
					continue;
				}
			}

			while ((skb = __skb_dequeue(&neigh->queue)))
				__skb_queue_tail(&skqueue, skb);
		}
		path->valid = 1;
	}

	path->query = NULL;
	complete(&path->done);

	spin_unlock_irqrestore(&priv->lock, flags);

	if (old_ah)
		ipoib_put_ah(old_ah);

	while ((skb = __skb_dequeue(&skqueue))) {
		skb->dev = dev;
		if (dev_queue_xmit(skb))
			ipoib_warn(priv, "dev_queue_xmit failed "
				   "to requeue packet\n");
	}
}

static struct ipoib_path *path_rec_create(struct net_device *dev, void *gid)
{
	struct ipoib_dev_priv *priv = netdev_priv(dev);
	struct ipoib_path *path;

	if (!priv->broadcast)
		return NULL;

	path = kzalloc(sizeof *path, GFP_ATOMIC);
	if (!path)
		return NULL;

	path->dev = dev;

	skb_queue_head_init(&path->queue);

	INIT_LIST_HEAD(&path->neigh_list);

	memcpy(path->pathrec.dgid.raw, gid, sizeof (union ib_gid));
	path->pathrec.sgid	    = priv->local_gid;
	path->pathrec.pkey	    = cpu_to_be16(priv->pkey);
	path->pathrec.numb_path     = 1;
	path->pathrec.traffic_class = priv->broadcast->mcmember.traffic_class;

	return path;
}

static int path_rec_start(struct net_device *dev,
			  struct ipoib_path *path)
{
	struct ipoib_dev_priv *priv = netdev_priv(dev);

	ipoib_dbg(priv, "Start path record lookup for %pI6\n",
		  path->pathrec.dgid.raw);

	init_completion(&path->done);

	path->query_id =
		ib_sa_path_rec_get(&ipoib_sa_client, priv->ca, priv->port,
				   &path->pathrec,
				   IB_SA_PATH_REC_DGID		|
				   IB_SA_PATH_REC_SGID		|
				   IB_SA_PATH_REC_NUMB_PATH	|
				   IB_SA_PATH_REC_TRAFFIC_CLASS |
				   IB_SA_PATH_REC_PKEY,
				   1000, GFP_ATOMIC,
				   path_rec_completion,
				   path, &path->query);
	if (path->query_id < 0) {
		ipoib_warn(priv, "ib_sa_path_rec_get failed: %d\n", path->query_id);
		path->query = NULL;
		complete(&path->done);
		return path->query_id;
	}

	return 0;
}

static void neigh_add_path(struct sk_buff *skb, u8 *daddr,
			   struct net_device *dev)
{
	struct ipoib_dev_priv *priv = netdev_priv(dev);
	struct ipoib_path *path;
	struct ipoib_neigh *neigh;
	unsigned long flags;

	spin_lock_irqsave(&priv->lock, flags);
	neigh = ipoib_neigh_alloc(daddr, dev);
	if (!neigh) {
		spin_unlock_irqrestore(&priv->lock, flags);
		++dev->stats.tx_dropped;
		dev_kfree_skb_any(skb);
		return;
	}

	path = __path_find(dev, daddr + 4);
	if (!path) {
		path = path_rec_create(dev, daddr + 4);
		if (!path)
			goto err_path;

		__path_add(dev, path);
	}

	list_add_tail(&neigh->list, &path->neigh_list);

	if (path->ah) {
		kref_get(&path->ah->ref);
		neigh->ah = path->ah;

		if (ipoib_cm_enabled(dev, neigh->daddr)) {
			if (!ipoib_cm_get(neigh))
				ipoib_cm_set(neigh, ipoib_cm_create_tx(dev, path, neigh));
			if (!ipoib_cm_get(neigh)) {
				list_del(&neigh->list);
				ipoib_neigh_free(neigh);
				goto err_drop;
			}
			if (skb_queue_len(&neigh->queue) < IPOIB_MAX_PATH_REC_QUEUE)
				__skb_queue_tail(&neigh->queue, skb);
			else {
				ipoib_warn(priv, "queue length limit %d; dropping packet\n",
					   skb_queue_len(&neigh->queue));
				goto err_drop;
			}
		} else {
			spin_unlock_irqrestore(&priv->lock, flags);
			ipoib_send(dev, skb, path->ah, IPOIB_QPN(daddr));
			ipoib_neigh_put(neigh);
			return;
		}
	} else {
		neigh->ah  = NULL;

		if (!path->query && path_rec_start(dev, path))
			goto err_list;

		__skb_queue_tail(&neigh->queue, skb);
	}

	spin_unlock_irqrestore(&priv->lock, flags);
	ipoib_neigh_put(neigh);
	return;

err_list:
	list_del(&neigh->list);

err_path:
	ipoib_neigh_free(neigh);
err_drop:
	++dev->stats.tx_dropped;
	dev_kfree_skb_any(skb);

	spin_unlock_irqrestore(&priv->lock, flags);
	ipoib_neigh_put(neigh);
}

static void unicast_arp_send(struct sk_buff *skb, struct net_device *dev,
			     struct ipoib_cb *cb)
{
	struct ipoib_dev_priv *priv = netdev_priv(dev);
	struct ipoib_path *path;
	unsigned long flags;

	spin_lock_irqsave(&priv->lock, flags);

	path = __path_find(dev, cb->hwaddr + 4);
	if (!path || !path->valid) {
		int new_path = 0;

		if (!path) {
			path = path_rec_create(dev, cb->hwaddr + 4);
			new_path = 1;
		}
		if (path) {
			__skb_queue_tail(&path->queue, skb);

			if (!path->query && path_rec_start(dev, path)) {
				spin_unlock_irqrestore(&priv->lock, flags);
				if (new_path)
					path_free(dev, path);
				return;
			} else
				__path_add(dev, path);
		} else {
			++dev->stats.tx_dropped;
			dev_kfree_skb_any(skb);
		}

		spin_unlock_irqrestore(&priv->lock, flags);
		return;
	}

	if (path->ah) {
		ipoib_dbg(priv, "Send unicast ARP to %04x\n",
			  be16_to_cpu(path->pathrec.dlid));

		spin_unlock_irqrestore(&priv->lock, flags);
		ipoib_send(dev, skb, path->ah, IPOIB_QPN(cb->hwaddr));
		return;
	} else if ((path->query || !path_rec_start(dev, path)) &&
		   skb_queue_len(&path->queue) < IPOIB_MAX_PATH_REC_QUEUE) {
		__skb_queue_tail(&path->queue, skb);
	} else {
		++dev->stats.tx_dropped;
		dev_kfree_skb_any(skb);
	}

	spin_unlock_irqrestore(&priv->lock, flags);
}

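/*
 * TX dispatch: the 20-byte destination set up by ipoib_hard_header()
 * is recovered from skb->cb.  Multicast (hwaddr[4] == 0xff, i.e. a
 * multicast GID) is handed to the multicast machinery, unicast IP and
 * IPv6 use the neigh cache (with a path lookup on a miss), unicast
 * ARP/RARP always triggers a path record lookup, and anything else is
 * dropped.
 */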
static int ipoib_start_xmit(struct sk_buff *skb, struct net_device *dev)
{
	struct ipoib_dev_priv *priv = netdev_priv(dev);
	struct ipoib_neigh *neigh;
	struct ipoib_cb *cb = (struct ipoib_cb *) skb->cb;
	struct ipoib_header *header;
	unsigned long flags;

	header = (struct ipoib_header *) skb->data;

	if (unlikely(cb->hwaddr[4] == 0xff)) {
		/* multicast, arrange "if" according to probability */
		if ((header->proto != htons(ETH_P_IP)) &&
		    (header->proto != htons(ETH_P_IPV6)) &&
		    (header->proto != htons(ETH_P_ARP)) &&
		    (header->proto != htons(ETH_P_RARP))) {
			/* ethertype not supported by IPoIB */
			++dev->stats.tx_dropped;
			dev_kfree_skb_any(skb);
			return NETDEV_TX_OK;
		}
		/* Add in the P_Key for multicast */
		cb->hwaddr[8] = (priv->pkey >> 8) & 0xff;
		cb->hwaddr[9] = priv->pkey & 0xff;

		neigh = ipoib_neigh_get(dev, cb->hwaddr);
		if (likely(neigh))
			goto send_using_neigh;
		ipoib_mcast_send(dev, cb->hwaddr, skb);
		return NETDEV_TX_OK;
	}

	/* unicast, arrange "switch" according to probability */
	switch (header->proto) {
	case htons(ETH_P_IP):
	case htons(ETH_P_IPV6):
		neigh = ipoib_neigh_get(dev, cb->hwaddr);
		if (unlikely(!neigh)) {
			neigh_add_path(skb, cb->hwaddr, dev);
			return NETDEV_TX_OK;
		}
		break;
	case htons(ETH_P_ARP):
	case htons(ETH_P_RARP):
		/* for unicast ARP and RARP we should always perform a path lookup */
		unicast_arp_send(skb, dev, cb);
		return NETDEV_TX_OK;
	default:
		/* ethertype not supported by IPoIB */
		++dev->stats.tx_dropped;
		dev_kfree_skb_any(skb);
		return NETDEV_TX_OK;
	}

send_using_neigh:
	/* note we now hold a ref to neigh */
	if (ipoib_cm_get(neigh)) {
		if (ipoib_cm_up(neigh)) {
			ipoib_cm_send(dev, skb, ipoib_cm_get(neigh));
			goto unref;
		}
	} else if (neigh->ah) {
		ipoib_send(dev, skb, neigh->ah, IPOIB_QPN(cb->hwaddr));
		goto unref;
	}

	if (skb_queue_len(&neigh->queue) < IPOIB_MAX_PATH_REC_QUEUE) {
		spin_lock_irqsave(&priv->lock, flags);
		__skb_queue_tail(&neigh->queue, skb);
		spin_unlock_irqrestore(&priv->lock, flags);
	} else {
		++dev->stats.tx_dropped;
		dev_kfree_skb_any(skb);
	}

unref:
	ipoib_neigh_put(neigh);

	return NETDEV_TX_OK;
}

static void ipoib_timeout(struct net_device *dev)
{
	struct ipoib_dev_priv *priv = netdev_priv(dev);

	ipoib_warn(priv, "transmit timeout: latency %d msecs\n",
		   jiffies_to_msecs(jiffies - dev->trans_start));
	ipoib_warn(priv, "queue stopped %d, tx_head %u, tx_tail %u\n",
		   netif_queue_stopped(dev),
		   priv->tx_head, priv->tx_tail);
	/* XXX reset QP, etc. */
}

static int ipoib_hard_header(struct sk_buff *skb,
			     struct net_device *dev,
			     unsigned short type,
			     const void *daddr, const void *saddr, unsigned len)
{
	struct ipoib_header *header;
	struct ipoib_cb *cb = (struct ipoib_cb *) skb->cb;

	header = (struct ipoib_header *) skb_push(skb, sizeof *header);

	header->proto = htons(type);
	header->reserved = 0;

	/*
	 * We don't rely on the dst_entry structure; always stuff the
	 * destination address into skb->cb so we can figure out where
	 * to send the packet later.
	 */
	memcpy(cb->hwaddr, daddr, INFINIBAND_ALEN);

	return 0;
}

static void ipoib_set_mcast_list(struct net_device *dev)
{
	struct ipoib_dev_priv *priv = netdev_priv(dev);

	if (!test_bit(IPOIB_FLAG_OPER_UP, &priv->flags)) {
		ipoib_dbg(priv, "IPOIB_FLAG_OPER_UP not set");
		return;
	}

	queue_work(ipoib_workqueue, &priv->restart_task);
}

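/*
 * Destination address layout (20 octets), as hashed below:
 *
 *   octet  0      reserved/flags
 *   octets 1-3    QPN
 *   octets 4-11   GID subnet prefix (ignored)
 *   octets 12-19  GID port GUID
 */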
static u32 ipoib_addr_hash(struct ipoib_neigh_hash *htbl, u8 *daddr)
{
	/*
	 * Use only the address parts that contribute to spreading.
	 * The subnet prefix is not used, as one cannot connect to the
	 * same remote port (GUID) using the same remote QPN via two
	 * different subnets.
	 */
	/* qpn octets[1:4) & port GUID octets[12:20) */
	u32 *daddr_32 = (u32 *) daddr;
	u32 hv;

	hv = jhash_3words(daddr_32[3], daddr_32[4], 0xFFFFFF & daddr_32[0], 0);
	return hv & htbl->mask;
}

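/*
 * Lockless lookup under rcu_read_lock_bh(); a hit takes a reference
 * with atomic_inc_not_zero() so that an entry whose last reference is
 * concurrently being dropped is treated as deleted rather than
 * revived.
 */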
struct ipoib_neigh *ipoib_neigh_get(struct net_device *dev, u8 *daddr)
{
	struct ipoib_dev_priv *priv = netdev_priv(dev);
	struct ipoib_neigh_table *ntbl = &priv->ntbl;
	struct ipoib_neigh_hash *htbl;
	struct ipoib_neigh *neigh = NULL;
	u32 hash_val;

	rcu_read_lock_bh();

	htbl = rcu_dereference_bh(ntbl->htbl);

	if (!htbl)
		goto out_unlock;

	hash_val = ipoib_addr_hash(htbl, daddr);
	for (neigh = rcu_dereference_bh(htbl->buckets[hash_val]);
	     neigh != NULL;
	     neigh = rcu_dereference_bh(neigh->hnext)) {
		if (memcmp(daddr, neigh->daddr, INFINIBAND_ALEN) == 0) {
			/* found, take one ref on behalf of the caller */
			if (!atomic_inc_not_zero(&neigh->refcnt)) {
				/* deleted */
				neigh = NULL;
				goto out_unlock;
			}
			neigh->alive = jiffies;
			goto out_unlock;
		}
	}

out_unlock:
	rcu_read_unlock_bh();
	return neigh;
}

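/*
 * Garbage collection: ipoib_reap_neigh() reschedules itself every
 * arp_tbl.gc_interval, and entries not touched for two intervals are
 * unlinked here and freed via call_rcu() once readers are done.
 */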
static void __ipoib_reap_neigh(struct ipoib_dev_priv *priv)
{
	struct ipoib_neigh_table *ntbl = &priv->ntbl;
	struct ipoib_neigh_hash *htbl;
	unsigned long neigh_obsolete;
	unsigned long dt;
	unsigned long flags;
	int i;

	if (test_bit(IPOIB_STOP_NEIGH_GC, &priv->flags))
		return;

	spin_lock_irqsave(&priv->lock, flags);

	htbl = rcu_dereference_protected(ntbl->htbl,
					 lockdep_is_held(&priv->lock));

	if (!htbl)
		goto out_unlock;

	/* neigh is obsolete if it was idle for two GC periods */
	dt = 2 * arp_tbl.gc_interval;
	neigh_obsolete = jiffies - dt;
	/* handle possible race condition */
	if (test_bit(IPOIB_STOP_NEIGH_GC, &priv->flags))
		goto out_unlock;

	for (i = 0; i < htbl->size; i++) {
		struct ipoib_neigh *neigh;
		struct ipoib_neigh __rcu **np = &htbl->buckets[i];

		while ((neigh = rcu_dereference_protected(*np,
							  lockdep_is_held(&priv->lock))) != NULL) {
			/* was the neigh idle for two GC periods? */
			if (time_after(neigh_obsolete, neigh->alive)) {
				rcu_assign_pointer(*np,
						   rcu_dereference_protected(neigh->hnext,
									     lockdep_is_held(&priv->lock)));
				/* remove from path/mc list */
				list_del(&neigh->list);
				call_rcu(&neigh->rcu, ipoib_neigh_reclaim);
			} else {
				np = &neigh->hnext;
			}

		}
	}

out_unlock:
	spin_unlock_irqrestore(&priv->lock, flags);
}

static void ipoib_reap_neigh(struct work_struct *work)
{
	struct ipoib_dev_priv *priv =
		container_of(work, struct ipoib_dev_priv, neigh_reap_task.work);

	__ipoib_reap_neigh(priv);

	if (!test_bit(IPOIB_STOP_NEIGH_GC, &priv->flags))
		queue_delayed_work(ipoib_workqueue, &priv->neigh_reap_task,
				   arp_tbl.gc_interval);
}

static struct ipoib_neigh *ipoib_neigh_ctor(u8 *daddr,
				      struct net_device *dev)
{
	struct ipoib_neigh *neigh;

	neigh = kzalloc(sizeof *neigh, GFP_ATOMIC);
	if (!neigh)
		return NULL;

	neigh->dev = dev;
	memcpy(&neigh->daddr, daddr, sizeof(neigh->daddr));
	skb_queue_head_init(&neigh->queue);
	INIT_LIST_HEAD(&neigh->list);
	ipoib_cm_set(neigh, NULL);
	/* one ref on behalf of the caller */
	atomic_set(&neigh->refcnt, 1);

	return neigh;
}

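/*
 * Unlike ipoib_neigh_get(), this must be called with priv->lock held
 * (note the lockdep_is_held() annotations); it returns an existing
 * entry when one is found, so concurrent adders cannot race.
 */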
struct ipoib_neigh *ipoib_neigh_alloc(u8 *daddr,
				      struct net_device *dev)
{
	struct ipoib_dev_priv *priv = netdev_priv(dev);
	struct ipoib_neigh_table *ntbl = &priv->ntbl;
	struct ipoib_neigh_hash *htbl;
	struct ipoib_neigh *neigh;
	u32 hash_val;

	htbl = rcu_dereference_protected(ntbl->htbl,
					 lockdep_is_held(&priv->lock));
	if (!htbl) {
		neigh = NULL;
		goto out_unlock;
	}

	/* need to add a new neigh, but maybe some other thread succeeded?
	 * recalc the hash (a resize may have taken place) and search first
	 */
	hash_val = ipoib_addr_hash(htbl, daddr);
	for (neigh = rcu_dereference_protected(htbl->buckets[hash_val],
					       lockdep_is_held(&priv->lock));
	     neigh != NULL;
	     neigh = rcu_dereference_protected(neigh->hnext,
					       lockdep_is_held(&priv->lock))) {
		if (memcmp(daddr, neigh->daddr, INFINIBAND_ALEN) == 0) {
			/* found, take one ref on behalf of the caller */
			if (!atomic_inc_not_zero(&neigh->refcnt)) {
				/* deleted */
				neigh = NULL;
				break;
			}
			neigh->alive = jiffies;
			goto out_unlock;
		}
	}

	neigh = ipoib_neigh_ctor(daddr, dev);
	if (!neigh)
		goto out_unlock;

	/* one ref on behalf of the hash table */
	atomic_inc(&neigh->refcnt);
	neigh->alive = jiffies;
	/* put in hash */
	rcu_assign_pointer(neigh->hnext,
			   rcu_dereference_protected(htbl->buckets[hash_val],
						     lockdep_is_held(&priv->lock)));
	rcu_assign_pointer(htbl->buckets[hash_val], neigh);
	atomic_inc(&ntbl->entries);

out_unlock:

	return neigh;
}

void ipoib_neigh_dtor(struct ipoib_neigh *neigh)
{
	/* neigh reference count was dropped to zero */
	struct net_device *dev = neigh->dev;
	struct ipoib_dev_priv *priv = netdev_priv(dev);
	struct sk_buff *skb;
	if (neigh->ah)
		ipoib_put_ah(neigh->ah);
	while ((skb = __skb_dequeue(&neigh->queue))) {
		++dev->stats.tx_dropped;
		dev_kfree_skb_any(skb);
	}
	if (ipoib_cm_get(neigh))
		ipoib_cm_destroy_tx(ipoib_cm_get(neigh));
	ipoib_dbg(netdev_priv(dev),
		  "neigh free for %06x %pI6\n",
		  IPOIB_QPN(neigh->daddr),
		  neigh->daddr + 4);
	kfree(neigh);
	if (atomic_dec_and_test(&priv->ntbl.entries)) {
		if (test_bit(IPOIB_NEIGH_TBL_FLUSH, &priv->flags))
			complete(&priv->ntbl.flushed);
	}
}

static void ipoib_neigh_reclaim(struct rcu_head *rp)
{
	/* Called as a result of removal from hash table */
	struct ipoib_neigh *neigh = container_of(rp, struct ipoib_neigh, rcu);
	/* note TX context may hold another ref */
	ipoib_neigh_put(neigh);
}

void ipoib_neigh_free(struct ipoib_neigh *neigh)
{
	struct net_device *dev = neigh->dev;
	struct ipoib_dev_priv *priv = netdev_priv(dev);
	struct ipoib_neigh_table *ntbl = &priv->ntbl;
	struct ipoib_neigh_hash *htbl;
	struct ipoib_neigh __rcu **np;
	struct ipoib_neigh *n;
	u32 hash_val;

	htbl = rcu_dereference_protected(ntbl->htbl,
					lockdep_is_held(&priv->lock));
	if (!htbl)
		return;

	hash_val = ipoib_addr_hash(htbl, neigh->daddr);
	np = &htbl->buckets[hash_val];
	for (n = rcu_dereference_protected(*np,
					    lockdep_is_held(&priv->lock));
	     n != NULL;
	     n = rcu_dereference_protected(*np,
					lockdep_is_held(&priv->lock))) {
		if (n == neigh) {
			/* found */
			rcu_assign_pointer(*np,
					   rcu_dereference_protected(neigh->hnext,
								     lockdep_is_held(&priv->lock)));
			call_rcu(&neigh->rcu, ipoib_neigh_reclaim);
			return;
		} else {
			np = &n->hnext;
		}
	}
}

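/*
 * The hash table is sized to the next power of two above
 * arp_tbl.gc_thresh3, so that "hv & htbl->mask" in ipoib_addr_hash()
 * is a cheap modulo.
 */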
static int ipoib_neigh_hash_init(struct ipoib_dev_priv *priv)
{
	struct ipoib_neigh_table *ntbl = &priv->ntbl;
	struct ipoib_neigh_hash *htbl;
	struct ipoib_neigh **buckets;
	u32 size;

	clear_bit(IPOIB_NEIGH_TBL_FLUSH, &priv->flags);
	ntbl->htbl = NULL;
	htbl = kzalloc(sizeof(*htbl), GFP_KERNEL);
	if (!htbl)
		return -ENOMEM;
	set_bit(IPOIB_STOP_NEIGH_GC, &priv->flags);
	size = roundup_pow_of_two(arp_tbl.gc_thresh3);
	buckets = kzalloc(size * sizeof(*buckets), GFP_KERNEL);
	if (!buckets) {
		kfree(htbl);
		return -ENOMEM;
	}
	htbl->size = size;
	htbl->mask = (size - 1);
	htbl->buckets = buckets;
	ntbl->htbl = htbl;
	htbl->ntbl = ntbl;
	atomic_set(&ntbl->entries, 0);

	/* start garbage collection */
	clear_bit(IPOIB_STOP_NEIGH_GC, &priv->flags);
	queue_delayed_work(ipoib_workqueue, &priv->neigh_reap_task,
			   arp_tbl.gc_interval);

	return 0;
}

static void neigh_hash_free_rcu(struct rcu_head *head)
{
	struct ipoib_neigh_hash *htbl = container_of(head,
						    struct ipoib_neigh_hash,
						    rcu);
	struct ipoib_neigh __rcu **buckets = htbl->buckets;
	struct ipoib_neigh_table *ntbl = htbl->ntbl;

	kfree(buckets);
	kfree(htbl);
	complete(&ntbl->deleted);
}

void ipoib_del_neighs_by_gid(struct net_device *dev, u8 *gid)
{
	struct ipoib_dev_priv *priv = netdev_priv(dev);
	struct ipoib_neigh_table *ntbl = &priv->ntbl;
	struct ipoib_neigh_hash *htbl;
	unsigned long flags;
	int i;

	/* remove all neighs connected to a given path or mcast */
	spin_lock_irqsave(&priv->lock, flags);

	htbl = rcu_dereference_protected(ntbl->htbl,
					 lockdep_is_held(&priv->lock));

	if (!htbl)
		goto out_unlock;

	for (i = 0; i < htbl->size; i++) {
		struct ipoib_neigh *neigh;
		struct ipoib_neigh __rcu **np = &htbl->buckets[i];

		while ((neigh = rcu_dereference_protected(*np,
							  lockdep_is_held(&priv->lock))) != NULL) {
			/* delete neighs belonging to this parent */
			if (!memcmp(gid, neigh->daddr + 4, sizeof (union ib_gid))) {
				rcu_assign_pointer(*np,
						   rcu_dereference_protected(neigh->hnext,
									     lockdep_is_held(&priv->lock)));
				/* remove from parent list */
				list_del(&neigh->list);
				call_rcu(&neigh->rcu, ipoib_neigh_reclaim);
			} else {
				np = &neigh->hnext;
			}

		}
	}
out_unlock:
	spin_unlock_irqrestore(&priv->lock, flags);
}

static void ipoib_flush_neighs(struct ipoib_dev_priv *priv)
{
	struct ipoib_neigh_table *ntbl = &priv->ntbl;
	struct ipoib_neigh_hash *htbl;
	unsigned long flags;
	int i, wait_flushed = 0;

	init_completion(&priv->ntbl.flushed);

	spin_lock_irqsave(&priv->lock, flags);

	htbl = rcu_dereference_protected(ntbl->htbl,
					lockdep_is_held(&priv->lock));
	if (!htbl)
		goto out_unlock;

	wait_flushed = atomic_read(&priv->ntbl.entries);
	if (!wait_flushed)
		goto free_htbl;

	for (i = 0; i < htbl->size; i++) {
		struct ipoib_neigh *neigh;
		struct ipoib_neigh __rcu **np = &htbl->buckets[i];

		while ((neigh = rcu_dereference_protected(*np,
				       lockdep_is_held(&priv->lock))) != NULL) {
			rcu_assign_pointer(*np,
					   rcu_dereference_protected(neigh->hnext,
								     lockdep_is_held(&priv->lock)));
			/* remove from path/mc list */
			list_del(&neigh->list);
			call_rcu(&neigh->rcu, ipoib_neigh_reclaim);
		}
	}

free_htbl:
	rcu_assign_pointer(ntbl->htbl, NULL);
	call_rcu(&htbl->rcu, neigh_hash_free_rcu);

out_unlock:
	spin_unlock_irqrestore(&priv->lock, flags);
	if (wait_flushed)
		wait_for_completion(&priv->ntbl.flushed);
}

static void ipoib_neigh_hash_uninit(struct net_device *dev)
{
	struct ipoib_dev_priv *priv = netdev_priv(dev);
	int stopped;

	ipoib_dbg(priv, "ipoib_neigh_hash_uninit\n");
	init_completion(&priv->ntbl.deleted);
	set_bit(IPOIB_NEIGH_TBL_FLUSH, &priv->flags);

	/* Stop GC; if called from an init failure path the work still needs cancelling */
	stopped = test_and_set_bit(IPOIB_STOP_NEIGH_GC, &priv->flags);
	if (!stopped)
		cancel_delayed_work(&priv->neigh_reap_task);

	ipoib_flush_neighs(priv);

	wait_for_completion(&priv->ntbl.deleted);
}

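/*
 * Init order matters: neigh hash table, then RX/TX rings, then the IB
 * resources; the error labels below unwind in exactly the reverse
 * order.
 */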
int ipoib_dev_init(struct net_device *dev, struct ib_device *ca, int port)
{
	struct ipoib_dev_priv *priv = netdev_priv(dev);

	if (ipoib_neigh_hash_init(priv) < 0)
		goto out;
	/* Allocate RX/TX "rings" to hold queued skbs */
	priv->rx_ring = kzalloc(ipoib_recvq_size * sizeof *priv->rx_ring,
				GFP_KERNEL);
	if (!priv->rx_ring) {
		printk(KERN_WARNING "%s: failed to allocate RX ring (%d entries)\n",
		       ca->name, ipoib_recvq_size);
		goto out_neigh_hash_cleanup;
	}

	priv->tx_ring = vzalloc(ipoib_sendq_size * sizeof *priv->tx_ring);
	if (!priv->tx_ring) {
		printk(KERN_WARNING "%s: failed to allocate TX ring (%d entries)\n",
		       ca->name, ipoib_sendq_size);
		goto out_rx_ring_cleanup;
	}

	/* priv->tx_head, tx_tail & tx_outstanding are already 0 */

	if (ipoib_ib_dev_init(dev, ca, port))
		goto out_tx_ring_cleanup;

	return 0;

out_tx_ring_cleanup:
	vfree(priv->tx_ring);

out_rx_ring_cleanup:
	kfree(priv->rx_ring);

out_neigh_hash_cleanup:
	ipoib_neigh_hash_uninit(dev);
out:
	return -ENOMEM;
}

void ipoib_dev_cleanup(struct net_device *dev)
{
	struct ipoib_dev_priv *priv = netdev_priv(dev), *cpriv, *tcpriv;

	ipoib_delete_debug_files(dev);

	/* Delete any child interfaces first */
	list_for_each_entry_safe(cpriv, tcpriv, &priv->child_intfs, list) {
		/* Stop GC on child */
		set_bit(IPOIB_STOP_NEIGH_GC, &cpriv->flags);
		cancel_delayed_work(&cpriv->neigh_reap_task);
		unregister_netdev(cpriv->dev);
		ipoib_dev_cleanup(cpriv->dev);
		free_netdev(cpriv->dev);
	}

	ipoib_ib_dev_cleanup(dev);

	kfree(priv->rx_ring);
	vfree(priv->tx_ring);

	priv->rx_ring = NULL;
	priv->tx_ring = NULL;

	ipoib_neigh_hash_uninit(dev);
}

static const struct header_ops ipoib_header_ops = {
	.create	= ipoib_hard_header,
};

static const struct net_device_ops ipoib_netdev_ops = {
	.ndo_open		 = ipoib_open,
	.ndo_stop		 = ipoib_stop,
	.ndo_change_mtu		 = ipoib_change_mtu,
	.ndo_fix_features	 = ipoib_fix_features,
	.ndo_start_xmit		 = ipoib_start_xmit,
	.ndo_tx_timeout		 = ipoib_timeout,
	.ndo_set_rx_mode	 = ipoib_set_mcast_list,
};

static void ipoib_setup(struct net_device *dev)
{
	struct ipoib_dev_priv *priv = netdev_priv(dev);

	dev->netdev_ops		 = &ipoib_netdev_ops;
	dev->header_ops		 = &ipoib_header_ops;

	ipoib_set_ethtool_ops(dev);

	netif_napi_add(dev, &priv->napi, ipoib_poll, 100);

	dev->watchdog_timeo	 = HZ;

	dev->flags		|= IFF_BROADCAST | IFF_MULTICAST;

	dev->hard_header_len	 = IPOIB_ENCAP_LEN;
	dev->addr_len		 = INFINIBAND_ALEN;
	dev->type		 = ARPHRD_INFINIBAND;
	dev->tx_queue_len	 = ipoib_sendq_size * 2;
	dev->features		 = (NETIF_F_VLAN_CHALLENGED	|
				    NETIF_F_HIGHDMA);
	dev->priv_flags		&= ~IFF_XMIT_DST_RELEASE;

	memcpy(dev->broadcast, ipv4_bcast_addr, INFINIBAND_ALEN);

	netif_carrier_off(dev);

	priv->dev = dev;

	spin_lock_init(&priv->lock);

	mutex_init(&priv->vlan_mutex);

	INIT_LIST_HEAD(&priv->path_list);
	INIT_LIST_HEAD(&priv->child_intfs);
	INIT_LIST_HEAD(&priv->dead_ahs);
	INIT_LIST_HEAD(&priv->multicast_list);

	INIT_DELAYED_WORK(&priv->pkey_poll_task, ipoib_pkey_poll);
	INIT_DELAYED_WORK(&priv->mcast_task,   ipoib_mcast_join_task);
	INIT_WORK(&priv->carrier_on_task, ipoib_mcast_carrier_on_task);
	INIT_WORK(&priv->flush_light,   ipoib_ib_dev_flush_light);
	INIT_WORK(&priv->flush_normal,   ipoib_ib_dev_flush_normal);
	INIT_WORK(&priv->flush_heavy,   ipoib_ib_dev_flush_heavy);
	INIT_WORK(&priv->restart_task, ipoib_mcast_restart_task);
	INIT_DELAYED_WORK(&priv->ah_reap_task, ipoib_reap_ah);
	INIT_DELAYED_WORK(&priv->neigh_reap_task, ipoib_reap_neigh);
}

struct ipoib_dev_priv *ipoib_intf_alloc(const char *name)
{
	struct net_device *dev;

	dev = alloc_netdev((int) sizeof (struct ipoib_dev_priv), name,
			   ipoib_setup);
	if (!dev)
		return NULL;

	return netdev_priv(dev);
}

static ssize_t show_pkey(struct device *dev,
			 struct device_attribute *attr, char *buf)
{
	struct ipoib_dev_priv *priv = netdev_priv(to_net_dev(dev));

	return sprintf(buf, "0x%04x\n", priv->pkey);
}
static DEVICE_ATTR(pkey, S_IRUGO, show_pkey, NULL);

static ssize_t show_umcast(struct device *dev,
			   struct device_attribute *attr, char *buf)
{
	struct ipoib_dev_priv *priv = netdev_priv(to_net_dev(dev));

	return sprintf(buf, "%d\n", test_bit(IPOIB_FLAG_UMCAST, &priv->flags));
}

static ssize_t set_umcast(struct device *dev,
			  struct device_attribute *attr,
			  const char *buf, size_t count)
{
	struct ipoib_dev_priv *priv = netdev_priv(to_net_dev(dev));
	unsigned long umcast_val = simple_strtoul(buf, NULL, 0);

	if (umcast_val > 0) {
		set_bit(IPOIB_FLAG_UMCAST, &priv->flags);
		ipoib_warn(priv, "ignoring multicast groups joined directly "
				"by userspace\n");
	} else
		clear_bit(IPOIB_FLAG_UMCAST, &priv->flags);

	return count;
}
static DEVICE_ATTR(umcast, S_IWUSR | S_IRUGO, show_umcast, set_umcast);

int ipoib_add_umcast_attr(struct net_device *dev)
{
	return device_create_file(&dev->dev, &dev_attr_umcast);
}

static ssize_t create_child(struct device *dev,
			    struct device_attribute *attr,
			    const char *buf, size_t count)
{
	int pkey;
	int ret;

	if (sscanf(buf, "%i", &pkey) != 1)
		return -EINVAL;

	if (pkey < 0 || pkey > 0xffff)
		return -EINVAL;

	/*
	 * Set the full membership bit, so that we join the right
	 * broadcast group, etc.
	 */
	pkey |= 0x8000;

	ret = ipoib_vlan_add(to_net_dev(dev), pkey);

	return ret ? ret : count;
}
static DEVICE_ATTR(create_child, S_IWUSR, NULL, create_child);

static ssize_t delete_child(struct device *dev,
			    struct device_attribute *attr,
			    const char *buf, size_t count)
{
	int pkey;
	int ret;

	if (sscanf(buf, "%i", &pkey) != 1)
		return -EINVAL;

	if (pkey < 0 || pkey > 0xffff)
		return -EINVAL;

	ret = ipoib_vlan_delete(to_net_dev(dev), pkey);

	return ret ? ret : count;
}
static DEVICE_ATTR(delete_child, S_IWUSR, NULL, delete_child);
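
/*
 * Example usage of the sysfs attributes above (hypothetical P_Key and
 * interface name):
 *
 *   cat /sys/class/net/ib0/pkey
 *   echo 0x8001 > /sys/class/net/ib0/create_child
 *   echo 0x8001 > /sys/class/net/ib0/delete_child
 *
 * create_child() accepts any value sscanf() parses with "%i" and ORs
 * in the full membership bit (0x8000) before the child is created.
 */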

int ipoib_add_pkey_attr(struct net_device *dev)
{
	return device_create_file(&dev->dev, &dev_attr_pkey);
}

int ipoib_set_dev_features(struct ipoib_dev_priv *priv, struct ib_device *hca)
{
	struct ib_device_attr *device_attr;
	int result = -ENOMEM;

	device_attr = kmalloc(sizeof *device_attr, GFP_KERNEL);
	if (!device_attr) {
		printk(KERN_WARNING "%s: allocation of %zu bytes failed\n",
		       hca->name, sizeof *device_attr);
		return result;
	}

	result = ib_query_device(hca, device_attr);
	if (result) {
		printk(KERN_WARNING "%s: ib_query_device failed (ret = %d)\n",
		       hca->name, result);
		kfree(device_attr);
		return result;
	}
	priv->hca_caps = device_attr->device_cap_flags;

	kfree(device_attr);

	if (priv->hca_caps & IB_DEVICE_UD_IP_CSUM) {
		priv->dev->hw_features = NETIF_F_SG |
			NETIF_F_IP_CSUM | NETIF_F_RXCSUM;

		if (priv->hca_caps & IB_DEVICE_UD_TSO)
			priv->dev->hw_features |= NETIF_F_TSO;

		priv->dev->features |= priv->dev->hw_features;
	}

	return 0;
}

static struct net_device *ipoib_add_port(const char *format,
					 struct ib_device *hca, u8 port)
{
	struct ipoib_dev_priv *priv;
	struct ib_port_attr attr;
	int result = -ENOMEM;

	priv = ipoib_intf_alloc(format);
	if (!priv)
		goto alloc_mem_failed;

	SET_NETDEV_DEV(priv->dev, hca->dma_device);
	priv->dev->dev_id = port - 1;

	if (!ib_query_port(hca, port, &attr))
		priv->max_ib_mtu = ib_mtu_enum_to_int(attr.max_mtu);
	else {
		printk(KERN_WARNING "%s: ib_query_port %d failed\n",
		       hca->name, port);
		goto device_init_failed;
	}

	/* MTU will be reset when mcast join happens */
	priv->dev->mtu  = IPOIB_UD_MTU(priv->max_ib_mtu);
	priv->mcast_mtu  = priv->admin_mtu = priv->dev->mtu;

	priv->dev->neigh_priv_len = sizeof(struct ipoib_neigh);

	result = ib_query_pkey(hca, port, 0, &priv->pkey);
	if (result) {
		printk(KERN_WARNING "%s: ib_query_pkey port %d failed (ret = %d)\n",
		       hca->name, port, result);
		goto device_init_failed;
	}

	if (ipoib_set_dev_features(priv, hca))
		goto device_init_failed;

	/*
	 * Set the full membership bit, so that we join the right
	 * broadcast group, etc.
	 */
	priv->pkey |= 0x8000;

	priv->dev->broadcast[8] = priv->pkey >> 8;
	priv->dev->broadcast[9] = priv->pkey & 0xff;

	result = ib_query_gid(hca, port, 0, &priv->local_gid);
	if (result) {
		printk(KERN_WARNING "%s: ib_query_gid port %d failed (ret = %d)\n",
		       hca->name, port, result);
		goto device_init_failed;
	} else
		memcpy(priv->dev->dev_addr + 4, priv->local_gid.raw, sizeof (union ib_gid));

	result = ipoib_dev_init(priv->dev, hca, port);
	if (result < 0) {
		printk(KERN_WARNING "%s: failed to initialize port %d (ret = %d)\n",
		       hca->name, port, result);
		goto device_init_failed;
	}

	INIT_IB_EVENT_HANDLER(&priv->event_handler,
			      priv->ca, ipoib_event);
	result = ib_register_event_handler(&priv->event_handler);
	if (result < 0) {
		printk(KERN_WARNING "%s: ib_register_event_handler failed for "
		       "port %d (ret = %d)\n",
		       hca->name, port, result);
		goto event_failed;
	}

	result = register_netdev(priv->dev);
	if (result) {
		printk(KERN_WARNING "%s: couldn't register ipoib port %d; error %d\n",
		       hca->name, port, result);
		goto register_failed;
	}

	ipoib_create_debug_files(priv->dev);

	if (ipoib_cm_add_mode_attr(priv->dev))
		goto sysfs_failed;
	if (ipoib_add_pkey_attr(priv->dev))
		goto sysfs_failed;
	if (ipoib_add_umcast_attr(priv->dev))
		goto sysfs_failed;
	if (device_create_file(&priv->dev->dev, &dev_attr_create_child))
		goto sysfs_failed;
	if (device_create_file(&priv->dev->dev, &dev_attr_delete_child))
		goto sysfs_failed;

	return priv->dev;

sysfs_failed:
	ipoib_delete_debug_files(priv->dev);
	unregister_netdev(priv->dev);

register_failed:
	ib_unregister_event_handler(&priv->event_handler);
	/* Stop GC if started before flush */
	set_bit(IPOIB_STOP_NEIGH_GC, &priv->flags);
	cancel_delayed_work(&priv->neigh_reap_task);
	flush_workqueue(ipoib_workqueue);

event_failed:
	ipoib_dev_cleanup(priv->dev);

device_init_failed:
	free_netdev(priv->dev);

alloc_mem_failed:
	return ERR_PTR(result);
}

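/*
 * Per-device setup: switches expose only the management port 0, so
 * the port loop below runs over [0, 0] for RDMA_NODE_IB_SWITCH and
 * over [1, phys_port_cnt] for HCAs.
 */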
static void ipoib_add_one(struct ib_device *device)
{
	struct list_head *dev_list;
	struct net_device *dev;
	struct ipoib_dev_priv *priv;
	int s, e, p;

	if (rdma_node_get_transport(device->node_type) != RDMA_TRANSPORT_IB)
		return;

	dev_list = kmalloc(sizeof *dev_list, GFP_KERNEL);
	if (!dev_list)
		return;

	INIT_LIST_HEAD(dev_list);

	if (device->node_type == RDMA_NODE_IB_SWITCH) {
		s = 0;
		e = 0;
	} else {
		s = 1;
		e = device->phys_port_cnt;
	}

	for (p = s; p <= e; ++p) {
		if (rdma_port_get_link_layer(device, p) != IB_LINK_LAYER_INFINIBAND)
			continue;
		dev = ipoib_add_port("ib%d", device, p);
		if (!IS_ERR(dev)) {
			priv = netdev_priv(dev);
			list_add_tail(&priv->list, dev_list);
		}
	}

	ib_set_client_data(device, &ipoib_client, dev_list);
}

static void ipoib_remove_one(struct ib_device *device)
{
	struct ipoib_dev_priv *priv, *tmp;
	struct list_head *dev_list;

	if (rdma_node_get_transport(device->node_type) != RDMA_TRANSPORT_IB)
		return;

	dev_list = ib_get_client_data(device, &ipoib_client);

	list_for_each_entry_safe(priv, tmp, dev_list, list) {
		ib_unregister_event_handler(&priv->event_handler);

		rtnl_lock();
		dev_change_flags(priv->dev, priv->dev->flags & ~IFF_UP);
		rtnl_unlock();

		/* Stop GC */
		set_bit(IPOIB_STOP_NEIGH_GC, &priv->flags);
		cancel_delayed_work(&priv->neigh_reap_task);
		flush_workqueue(ipoib_workqueue);

		unregister_netdev(priv->dev);
		ipoib_dev_cleanup(priv->dev);
		free_netdev(priv->dev);
	}

	kfree(dev_list);
}

static int __init ipoib_init_module(void)
{
	int ret;

	ipoib_recvq_size = roundup_pow_of_two(ipoib_recvq_size);
	ipoib_recvq_size = min(ipoib_recvq_size, IPOIB_MAX_QUEUE_SIZE);
	ipoib_recvq_size = max(ipoib_recvq_size, IPOIB_MIN_QUEUE_SIZE);

	ipoib_sendq_size = roundup_pow_of_two(ipoib_sendq_size);
	ipoib_sendq_size = min(ipoib_sendq_size, IPOIB_MAX_QUEUE_SIZE);
	ipoib_sendq_size = max3(ipoib_sendq_size, 2 * MAX_SEND_CQE, IPOIB_MIN_QUEUE_SIZE);
#ifdef CONFIG_INFINIBAND_IPOIB_CM
	ipoib_max_conn_qp = min(ipoib_max_conn_qp, IPOIB_CM_MAX_CONN_QP);
#endif

	/*
	 * When copying small received packets, we only copy from the
	 * linear data part of the SKB, so we rely on this condition.
	 */
	BUILD_BUG_ON(IPOIB_CM_COPYBREAK > IPOIB_CM_HEAD_SIZE);

	ret = ipoib_register_debugfs();
	if (ret)
		return ret;

	/*
	 * We create our own workqueue mainly because we want to be
	 * able to flush it when devices are being removed.  We can't
	 * use schedule_work()/flush_scheduled_work() because both
	 * unregister_netdev() and linkwatch_event take the rtnl lock,
	 * so flush_scheduled_work() can deadlock during device
	 * removal.
	 */
	ipoib_workqueue = create_singlethread_workqueue("ipoib");
	if (!ipoib_workqueue) {
		ret = -ENOMEM;
		goto err_fs;
	}

	ib_sa_register_client(&ipoib_sa_client);

	ret = ib_register_client(&ipoib_client);
	if (ret)
		goto err_sa;

	return 0;

err_sa:
	ib_sa_unregister_client(&ipoib_sa_client);
	destroy_workqueue(ipoib_workqueue);

err_fs:
	ipoib_unregister_debugfs();

	return ret;
}

static void __exit ipoib_cleanup_module(void)
{
	ib_unregister_client(&ipoib_client);
	ib_sa_unregister_client(&ipoib_sa_client);
	ipoib_unregister_debugfs();
	destroy_workqueue(ipoib_workqueue);
}

module_init(ipoib_init_module);
module_exit(ipoib_cleanup_module);