xref: /freebsd/sys/ofed/drivers/infiniband/ulp/ipoib/ipoib_multicast.c (revision ebacd8013fe5f7fdf9f6a5b286f6680dd2891036)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0
3  *
4  * Copyright (c) 2004, 2005 Topspin Communications.  All rights reserved.
5  * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved.
6  * Copyright (c) 2004 Voltaire, Inc. All rights reserved.
7  *
8  * This software is available to you under a choice of one of two
9  * licenses.  You may choose to be licensed under the terms of the GNU
10  * General Public License (GPL) Version 2, available from the file
11  * COPYING in the main directory of this source tree, or the
12  * OpenIB.org BSD license below:
13  *
14  *     Redistribution and use in source and binary forms, with or
15  *     without modification, are permitted provided that the following
16  *     conditions are met:
17  *
18  *      - Redistributions of source code must retain the above
19  *        copyright notice, this list of conditions and the following
20  *        disclaimer.
21  *
22  *      - Redistributions in binary form must reproduce the above
23  *        copyright notice, this list of conditions and the following
24  *        disclaimer in the documentation and/or other materials
25  *        provided with the distribution.
26  *
27  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
28  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
29  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
30  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
31  * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
32  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
33  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
34  * SOFTWARE.
35  */
36 
37 #include <sys/cdefs.h>
38 __FBSDID("$FreeBSD$");
39 
40 #include "ipoib.h"
41 
42 #include <linux/delay.h>
43 #include <linux/completion.h>
44 
#ifdef CONFIG_INFINIBAND_IPOIB_DEBUG
/*
 * Level for multicast debug tracing.  It is registered below as an
 * integer module parameter with mode 0644; tracing is enabled when the
 * value is greater than zero.
 */
static int mcast_debug_level = 1;

module_param(mcast_debug_level, int, 0644);
MODULE_PARM_DESC(mcast_debug_level, "Enable multicast debug tracing if > 0");
#endif

/*
 * Held in this file around every test/set/clear of IPOIB_MCAST_RUN that
 * is paired with queueing or cancelling the delayed multicast join work.
 */
static DEFINE_MUTEX(mcast_mutex);
54 
55 struct ipoib_mcast_iter {
56 	struct ipoib_dev_priv *priv;
57 	union ib_gid       mgid;
58 	unsigned long      created;
59 	unsigned int       queuelen;
60 	unsigned int       complete;
61 	unsigned int       send_only;
62 };
63 
64 static void ipoib_mcast_free(struct ipoib_mcast *mcast)
65 {
66 	if_t dev = mcast->priv->dev;
67 	int tx_dropped = 0;
68 
69 	ipoib_dbg_mcast(mcast->priv, "deleting multicast group %16D\n",
70 			mcast->mcmember.mgid.raw, ":");
71 
72 	if (mcast->ah)
73 		ipoib_put_ah(mcast->ah);
74 
75 	tx_dropped = mcast->pkt_queue.ifq_len;
76 	_IF_DRAIN(&mcast->pkt_queue);	/* XXX Locking. */
77 
78 	if_inc_counter(dev, IFCOUNTER_OERRORS, tx_dropped);
79 
80 	kfree(mcast);
81 }
82 
83 static struct ipoib_mcast *ipoib_mcast_alloc(struct ipoib_dev_priv *priv,
84 					     int can_sleep)
85 {
86 	struct ipoib_mcast *mcast;
87 
88 	mcast = kzalloc(sizeof *mcast, can_sleep ? GFP_KERNEL : GFP_ATOMIC);
89 	if (!mcast)
90 		return NULL;
91 
92 	mcast->priv = priv;
93 	mcast->created = jiffies;
94 	mcast->backoff = 1;
95 
96 	INIT_LIST_HEAD(&mcast->list);
97 	bzero(&mcast->pkt_queue, sizeof(mcast->pkt_queue));
98 
99 	return mcast;
100 }
101 
102 static struct ipoib_mcast *__ipoib_mcast_find(struct ipoib_dev_priv *priv,
103     void *mgid)
104 {
105 	struct rb_node *n = priv->multicast_tree.rb_node;
106 
107 	while (n) {
108 		struct ipoib_mcast *mcast;
109 		int ret;
110 
111 		mcast = rb_entry(n, struct ipoib_mcast, rb_node);
112 
113 		ret = memcmp(mgid, mcast->mcmember.mgid.raw,
114 			     sizeof (union ib_gid));
115 		if (ret < 0)
116 			n = n->rb_left;
117 		else if (ret > 0)
118 			n = n->rb_right;
119 		else
120 			return mcast;
121 	}
122 
123 	return NULL;
124 }
125 
126 static int __ipoib_mcast_add(struct ipoib_dev_priv *priv,
127     struct ipoib_mcast *mcast)
128 {
129 	struct rb_node **n = &priv->multicast_tree.rb_node, *pn = NULL;
130 
131 	while (*n) {
132 		struct ipoib_mcast *tmcast;
133 		int ret;
134 
135 		pn = *n;
136 		tmcast = rb_entry(pn, struct ipoib_mcast, rb_node);
137 
138 		ret = memcmp(mcast->mcmember.mgid.raw, tmcast->mcmember.mgid.raw,
139 			     sizeof (union ib_gid));
140 		if (ret < 0)
141 			n = &pn->rb_left;
142 		else if (ret > 0)
143 			n = &pn->rb_right;
144 		else
145 			return -EEXIST;
146 	}
147 
148 	rb_link_node(&mcast->rb_node, pn, n);
149 	rb_insert_color(&mcast->rb_node, &priv->multicast_tree);
150 
151 	return 0;
152 }
153 
154 static int ipoib_mcast_join_finish(struct ipoib_mcast *mcast,
155 				   struct ib_sa_mcmember_rec *mcmember)
156 {
157 	struct ipoib_dev_priv *priv = mcast->priv;
158 	if_t dev = priv->dev;
159 	struct ipoib_ah *ah;
160 	struct epoch_tracker et;
161 	int ret;
162 	int set_qkey = 0;
163 
164 	mcast->mcmember = *mcmember;
165 
166 	/* Set the cached Q_Key before we attach if it's the broadcast group */
167 	if (!memcmp(mcast->mcmember.mgid.raw, if_getbroadcastaddr(dev) + 4,
168 		    sizeof (union ib_gid))) {
169 		spin_lock_irq(&priv->lock);
170 		if (!priv->broadcast) {
171 			spin_unlock_irq(&priv->lock);
172 			return -EAGAIN;
173 		}
174 		priv->qkey = be32_to_cpu(priv->broadcast->mcmember.qkey);
175 		spin_unlock_irq(&priv->lock);
176 		priv->tx_wr.remote_qkey = priv->qkey;
177 		set_qkey = 1;
178 	}
179 
180 	if (!test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags)) {
181 		if (test_and_set_bit(IPOIB_MCAST_FLAG_ATTACHED, &mcast->flags)) {
182 			ipoib_warn(priv, "multicast group %16D already attached\n",
183 				   mcast->mcmember.mgid.raw, ":");
184 
185 			return 0;
186 		}
187 
188 		ret = ipoib_mcast_attach(priv, be16_to_cpu(mcast->mcmember.mlid),
189 					 &mcast->mcmember.mgid, set_qkey);
190 		if (ret < 0) {
191 			ipoib_warn(priv, "couldn't attach QP to multicast group %16D\n",
192 				   mcast->mcmember.mgid.raw, ":");
193 
194 			clear_bit(IPOIB_MCAST_FLAG_ATTACHED, &mcast->flags);
195 			return ret;
196 		}
197 	}
198 
199 	{
200 		struct ib_ah_attr av = {
201 			.dlid	       = be16_to_cpu(mcast->mcmember.mlid),
202 			.port_num      = priv->port,
203 			.sl	       = mcast->mcmember.sl,
204 			.ah_flags      = IB_AH_GRH,
205 			.static_rate   = mcast->mcmember.rate,
206 			.grh	       = {
207 				.flow_label    = be32_to_cpu(mcast->mcmember.flow_label),
208 				.hop_limit     = mcast->mcmember.hop_limit,
209 				.sgid_index    = 0,
210 				.traffic_class = mcast->mcmember.traffic_class
211 			}
212 		};
213 		av.grh.dgid = mcast->mcmember.mgid;
214 
215 		ah = ipoib_create_ah(priv, priv->pd, &av);
216 		if (!ah) {
217 			ipoib_warn(priv, "ib_address_create failed\n");
218 		} else {
219 			spin_lock_irq(&priv->lock);
220 			mcast->ah = ah;
221 			spin_unlock_irq(&priv->lock);
222 
223 			ipoib_dbg_mcast(priv, "MGID %16D AV %p, LID 0x%04x, SL %d\n",
224 					mcast->mcmember.mgid.raw, ":",
225 					mcast->ah->ah,
226 					be16_to_cpu(mcast->mcmember.mlid),
227 					mcast->mcmember.sl);
228 		}
229 	}
230 
231 	NET_EPOCH_ENTER(et);
232 
233 	/* actually send any queued packets */
234 	while (mcast->pkt_queue.ifq_len) {
235 		struct mbuf *mb;
236 		_IF_DEQUEUE(&mcast->pkt_queue, mb);
237 		mb->m_pkthdr.rcvif = dev;
238 
239 		if (if_transmit(dev, mb))
240 			ipoib_warn(priv, "dev_queue_xmit failed to requeue packet\n");
241 	}
242 
243 	NET_EPOCH_EXIT(et);
244 	return 0;
245 }
246 
247 static int
248 ipoib_mcast_sendonly_join_complete(int status,
249 				   struct ib_sa_multicast *multicast)
250 {
251 	struct ipoib_mcast *mcast = multicast->context;
252 	struct ipoib_dev_priv *priv = mcast->priv;
253 
254 	/* We trap for port events ourselves. */
255 	if (status == -ENETRESET)
256 		return 0;
257 
258 	if (!status)
259 		status = ipoib_mcast_join_finish(mcast, &multicast->rec);
260 
261 	if (status) {
262 		if (mcast->logcount++ < 20)
263 			ipoib_dbg_mcast(priv, "multicast join failed for %16D, status %d\n",
264 					mcast->mcmember.mgid.raw, ":", status);
265 
266 		/* Flush out any queued packets */
267 		if_inc_counter(priv->dev, IFCOUNTER_OERRORS, mcast->pkt_queue.ifq_len);
268 		_IF_DRAIN(&mcast->pkt_queue);
269 
270 		/* Clear the busy flag so we try again */
271 		status = test_and_clear_bit(IPOIB_MCAST_FLAG_BUSY,
272 					    &mcast->flags);
273 	}
274 	return status;
275 }
276 
277 static int ipoib_mcast_sendonly_join(struct ipoib_mcast *mcast)
278 {
279 	struct ipoib_dev_priv *priv = mcast->priv;
280 	struct ib_sa_mcmember_rec rec = {
281 #if 0				/* Some SMs don't support send-only yet */
282 		.join_state = 4
283 #else
284 		.join_state = 1
285 #endif
286 	};
287 	int ret = 0;
288 
289 	if (!test_bit(IPOIB_FLAG_OPER_UP, &priv->flags)) {
290 		ipoib_dbg_mcast(priv, "device shutting down, no multicast joins\n");
291 		return -ENODEV;
292 	}
293 
294 	if (test_and_set_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags)) {
295 		ipoib_dbg_mcast(priv, "multicast entry busy, skipping\n");
296 		return -EBUSY;
297 	}
298 
299 	rec.mgid     = mcast->mcmember.mgid;
300 	rec.port_gid = priv->local_gid;
301 	rec.pkey     = cpu_to_be16(priv->pkey);
302 
303 	mcast->mc = ib_sa_join_multicast(&ipoib_sa_client, priv->ca,
304 					 priv->port, &rec,
305 					 IB_SA_MCMEMBER_REC_MGID	|
306 					 IB_SA_MCMEMBER_REC_PORT_GID	|
307 					 IB_SA_MCMEMBER_REC_PKEY	|
308 					 IB_SA_MCMEMBER_REC_JOIN_STATE,
309 					 GFP_ATOMIC,
310 					 ipoib_mcast_sendonly_join_complete,
311 					 mcast);
312 	if (IS_ERR(mcast->mc)) {
313 		ret = PTR_ERR(mcast->mc);
314 		clear_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags);
315 		ipoib_warn(priv, "ib_sa_join_multicast failed (ret = %d)\n",
316 			   ret);
317 	} else {
318 		ipoib_dbg_mcast(priv, "no multicast record for %16D, starting join\n",
319 				mcast->mcmember.mgid.raw, ":");
320 	}
321 
322 	return ret;
323 }
324 
325 void ipoib_mcast_carrier_on_task(struct work_struct *work)
326 {
327 	struct ipoib_dev_priv *priv = container_of(work, struct ipoib_dev_priv,
328 						   carrier_on_task);
329 	struct ib_port_attr attr;
330 
331 	/*
332 	 * Take rtnl_lock to avoid racing with ipoib_stop() and
333 	 * turning the carrier back on while a device is being
334 	 * removed.
335 	 */
336 	if (ib_query_port(priv->ca, priv->port, &attr) ||
337 	    attr.state != IB_PORT_ACTIVE) {
338 		ipoib_dbg(priv, "Keeping carrier off until IB port is active\n");
339 		return;
340 	}
341 	if_link_state_change(priv->dev, LINK_STATE_UP);
342 }
343 
344 static int ipoib_mcast_join_complete(int status,
345 				     struct ib_sa_multicast *multicast)
346 {
347 	struct ipoib_mcast *mcast = multicast->context;
348 	struct ipoib_dev_priv *priv = mcast->priv;
349 
350 	ipoib_dbg_mcast(priv, "join completion for %16D (status %d)\n",
351 			mcast->mcmember.mgid.raw, ":", status);
352 
353 	/* We trap for port events ourselves. */
354 	if (status == -ENETRESET)
355 		return 0;
356 
357 	if (!status)
358 		status = ipoib_mcast_join_finish(mcast, &multicast->rec);
359 
360 	if (!status) {
361 		mcast->backoff = 1;
362 		mutex_lock(&mcast_mutex);
363 		if (test_bit(IPOIB_MCAST_RUN, &priv->flags))
364 			queue_delayed_work(ipoib_workqueue,
365 					   &priv->mcast_task, 0);
366 		mutex_unlock(&mcast_mutex);
367 
368 		/*
369 		 * Defer carrier on work to ipoib_workqueue to avoid a
370 		 * deadlock on rtnl_lock here.
371 		 */
372 		if (mcast == priv->broadcast)
373 			queue_work(ipoib_workqueue, &priv->carrier_on_task);
374 
375 		return 0;
376 	}
377 
378 	if (mcast->logcount++ < 20) {
379 		if (status == -ETIMEDOUT || status == -EAGAIN) {
380 			ipoib_dbg_mcast(priv, "multicast join failed for %16D, status %d\n",
381 					mcast->mcmember.mgid.raw, ":", status);
382 		} else {
383 			ipoib_warn(priv, "multicast join failed for %16D, status %d\n",
384 				   mcast->mcmember.mgid.raw, ":", status);
385 		}
386 	}
387 
388 	mcast->backoff *= 2;
389 	if (mcast->backoff > IPOIB_MAX_BACKOFF_SECONDS)
390 		mcast->backoff = IPOIB_MAX_BACKOFF_SECONDS;
391 
392 	/* Clear the busy flag so we try again */
393 	status = test_and_clear_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags);
394 
395 	mutex_lock(&mcast_mutex);
396 	spin_lock_irq(&priv->lock);
397 	if (test_bit(IPOIB_MCAST_RUN, &priv->flags))
398 		queue_delayed_work(ipoib_workqueue, &priv->mcast_task,
399 				   mcast->backoff * HZ);
400 	spin_unlock_irq(&priv->lock);
401 	mutex_unlock(&mcast_mutex);
402 
403 	return status;
404 }
405 
406 static void ipoib_mcast_join(struct ipoib_dev_priv *priv,
407     struct ipoib_mcast *mcast, int create)
408 {
409 	struct ib_sa_mcmember_rec rec = {
410 		.join_state = 1
411 	};
412 	ib_sa_comp_mask comp_mask;
413 	int ret = 0;
414 
415 	ipoib_dbg_mcast(priv, "joining MGID %16D\n",
416 	    mcast->mcmember.mgid.raw, ":");
417 
418 	rec.mgid     = mcast->mcmember.mgid;
419 	rec.port_gid = priv->local_gid;
420 	rec.pkey     = cpu_to_be16(priv->pkey);
421 
422 	comp_mask =
423 		IB_SA_MCMEMBER_REC_MGID		|
424 		IB_SA_MCMEMBER_REC_PORT_GID	|
425 		IB_SA_MCMEMBER_REC_PKEY		|
426 		IB_SA_MCMEMBER_REC_JOIN_STATE;
427 
428 	if (create) {
429 		comp_mask |=
430 			IB_SA_MCMEMBER_REC_QKEY			|
431 			IB_SA_MCMEMBER_REC_MTU_SELECTOR		|
432 			IB_SA_MCMEMBER_REC_MTU			|
433 			IB_SA_MCMEMBER_REC_TRAFFIC_CLASS	|
434 			IB_SA_MCMEMBER_REC_RATE_SELECTOR	|
435 			IB_SA_MCMEMBER_REC_RATE			|
436 			IB_SA_MCMEMBER_REC_SL			|
437 			IB_SA_MCMEMBER_REC_FLOW_LABEL		|
438 			IB_SA_MCMEMBER_REC_HOP_LIMIT;
439 
440 		rec.qkey	  = priv->broadcast->mcmember.qkey;
441 		rec.mtu_selector  = IB_SA_EQ;
442 		rec.mtu		  = priv->broadcast->mcmember.mtu;
443 		rec.traffic_class = priv->broadcast->mcmember.traffic_class;
444 		rec.rate_selector = IB_SA_EQ;
445 		rec.rate	  = priv->broadcast->mcmember.rate;
446 		rec.sl		  = priv->broadcast->mcmember.sl;
447 		rec.flow_label	  = priv->broadcast->mcmember.flow_label;
448 		rec.hop_limit	  = priv->broadcast->mcmember.hop_limit;
449 	}
450 
451 	set_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags);
452 	mcast->mc = ib_sa_join_multicast(&ipoib_sa_client, priv->ca, priv->port,
453 					 &rec, comp_mask, GFP_KERNEL,
454 					 ipoib_mcast_join_complete, mcast);
455 	if (IS_ERR(mcast->mc)) {
456 		clear_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags);
457 		ret = PTR_ERR(mcast->mc);
458 		ipoib_warn(priv, "ib_sa_join_multicast failed, status %d\n", ret);
459 
460 		mcast->backoff *= 2;
461 		if (mcast->backoff > IPOIB_MAX_BACKOFF_SECONDS)
462 			mcast->backoff = IPOIB_MAX_BACKOFF_SECONDS;
463 
464 		mutex_lock(&mcast_mutex);
465 		if (test_bit(IPOIB_MCAST_RUN, &priv->flags))
466 			queue_delayed_work(ipoib_workqueue,
467 					   &priv->mcast_task,
468 					   mcast->backoff * HZ);
469 		mutex_unlock(&mcast_mutex);
470 	}
471 }
472 
473 void ipoib_mcast_join_task(struct work_struct *work)
474 {
475 	struct ipoib_dev_priv *priv =
476 		container_of(work, struct ipoib_dev_priv, mcast_task.work);
477 	if_t dev = priv->dev;
478 	struct ib_port_attr attr;
479 
480 	ipoib_dbg_mcast(priv, "Running join task. flags 0x%lX\n", priv->flags);
481 
482 	if (!test_bit(IPOIB_MCAST_RUN, &priv->flags))
483 		return;
484 
485 	if (ib_query_port(priv->ca, priv->port, &attr) ||
486             attr.state != IB_PORT_ACTIVE) {
487 		ipoib_dbg(priv, "%s: port state is not ACTIVE (state = %d) suspend task.\n",
488                           __func__, attr.state);
489 		return;
490 	}
491 
492 	if (ib_query_gid(priv->ca, priv->port, 0, &priv->local_gid, NULL))
493 		ipoib_warn(priv, "ib_query_gid() failed\n");
494 	else
495 		memcpy(if_getlladdr(dev) + 4, priv->local_gid.raw, sizeof (union ib_gid));
496 
497 	{
498 		struct ib_port_attr attr;
499 
500 		if (!ib_query_port(priv->ca, priv->port, &attr))
501 			priv->local_lid = attr.lid;
502 		else
503 			ipoib_warn(priv, "ib_query_port failed\n");
504 	}
505 
506 	if (!priv->broadcast) {
507 		struct ipoib_mcast *broadcast;
508 
509 		if (!test_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags))
510 			return;
511 
512 		broadcast = ipoib_mcast_alloc(priv, 1);
513 		if (!broadcast) {
514 			ipoib_warn(priv, "failed to allocate broadcast group\n");
515 			mutex_lock(&mcast_mutex);
516 			if (test_bit(IPOIB_MCAST_RUN, &priv->flags))
517 				queue_delayed_work(ipoib_workqueue,
518 						   &priv->mcast_task, HZ);
519 			mutex_unlock(&mcast_mutex);
520 			return;
521 		}
522 
523 		spin_lock_irq(&priv->lock);
524 		memcpy(broadcast->mcmember.mgid.raw, if_getbroadcastaddr(dev) + 4,
525 		       sizeof (union ib_gid));
526 		priv->broadcast = broadcast;
527 
528 		__ipoib_mcast_add(priv, priv->broadcast);
529 		spin_unlock_irq(&priv->lock);
530 	}
531 
532 	if (priv->broadcast &&
533 	    !test_bit(IPOIB_MCAST_FLAG_ATTACHED, &priv->broadcast->flags)) {
534 		if (priv->broadcast &&
535 		    !test_bit(IPOIB_MCAST_FLAG_BUSY, &priv->broadcast->flags))
536 			ipoib_mcast_join(priv, priv->broadcast, 0);
537 		return;
538 	}
539 
540 	while (1) {
541 		struct ipoib_mcast *mcast = NULL;
542 
543 		spin_lock_irq(&priv->lock);
544 		list_for_each_entry(mcast, &priv->multicast_list, list) {
545 			if (!test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags)
546 			    && !test_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags)
547 			    && !test_bit(IPOIB_MCAST_FLAG_ATTACHED, &mcast->flags)) {
548 				/* Found the next unjoined group */
549 				break;
550 			}
551 		}
552 		spin_unlock_irq(&priv->lock);
553 
554 		if (&mcast->list == &priv->multicast_list) {
555 			/* All done */
556 			break;
557 		}
558 
559 		ipoib_mcast_join(priv, mcast, 1);
560 		return;
561 	}
562 
563 	spin_lock_irq(&priv->lock);
564 	if (priv->broadcast)
565 		priv->mcast_mtu = IPOIB_UD_MTU(ib_mtu_enum_to_int(priv->broadcast->mcmember.mtu));
566 	else
567 		priv->mcast_mtu = priv->admin_mtu;
568 	spin_unlock_irq(&priv->lock);
569 
570 	if (!ipoib_cm_admin_enabled(priv))
571 		ipoib_change_mtu(priv, min(priv->mcast_mtu, priv->admin_mtu),
572 		    true);
573 
574 	ipoib_dbg_mcast(priv, "successfully joined all multicast groups\n");
575 
576 	clear_bit(IPOIB_MCAST_RUN, &priv->flags);
577 }
578 
579 int ipoib_mcast_start_thread(struct ipoib_dev_priv *priv)
580 {
581 	ipoib_dbg_mcast(priv, "starting multicast thread flags 0x%lX\n",
582 	    priv->flags);
583 
584 	mutex_lock(&mcast_mutex);
585 	if (!test_and_set_bit(IPOIB_MCAST_RUN, &priv->flags))
586 		queue_delayed_work(ipoib_workqueue, &priv->mcast_task, 0);
587 	mutex_unlock(&mcast_mutex);
588 
589 	return 0;
590 }
591 
592 int ipoib_mcast_stop_thread(struct ipoib_dev_priv *priv, int flush)
593 {
594 
595 	ipoib_dbg_mcast(priv, "stopping multicast thread\n");
596 
597 	mutex_lock(&mcast_mutex);
598 	clear_bit(IPOIB_MCAST_RUN, &priv->flags);
599 	cancel_delayed_work(&priv->mcast_task);
600 	mutex_unlock(&mcast_mutex);
601 
602 	if (flush)
603 		flush_workqueue(ipoib_workqueue);
604 
605 	return 0;
606 }
607 
608 static int ipoib_mcast_leave(struct ipoib_dev_priv *priv, struct ipoib_mcast *mcast)
609 {
610 	int ret = 0;
611 
612 	if (test_and_clear_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags))
613 		ib_sa_free_multicast(mcast->mc);
614 
615 	if (test_and_clear_bit(IPOIB_MCAST_FLAG_ATTACHED, &mcast->flags)) {
616 		ipoib_dbg_mcast(priv, "leaving MGID %16D\n",
617 				mcast->mcmember.mgid.raw, ":");
618 
619 		/* Remove ourselves from the multicast group */
620 		ret = ib_detach_mcast(priv->qp, &mcast->mcmember.mgid,
621 				      be16_to_cpu(mcast->mcmember.mlid));
622 		if (ret)
623 			ipoib_warn(priv, "ib_detach_mcast failed (result = %d)\n", ret);
624 	}
625 
626 	return 0;
627 }
628 
629 void
630 ipoib_mcast_send(struct ipoib_dev_priv *priv, void *mgid, struct mbuf *mb)
631 {
632 	if_t dev = priv->dev;
633 	struct ipoib_mcast *mcast;
634 
635 	if (!test_bit(IPOIB_FLAG_OPER_UP, &priv->flags)		||
636 	    !priv->broadcast					||
637 	    !test_bit(IPOIB_MCAST_FLAG_ATTACHED, &priv->broadcast->flags)) {
638 		if_inc_counter(dev, IFCOUNTER_OERRORS, 1);
639 		m_freem(mb);
640 		return;
641 	}
642 
643 	mcast = __ipoib_mcast_find(priv, mgid);
644 	if (!mcast) {
645 		/* Let's create a new send only group now */
646 		ipoib_dbg_mcast(priv, "setting up send only multicast group for %16D\n",
647 				mgid, ":");
648 
649 		mcast = ipoib_mcast_alloc(priv, 0);
650 		if (!mcast) {
651 			ipoib_warn(priv, "unable to allocate memory for "
652 				   "multicast structure\n");
653 			if_inc_counter(dev, IFCOUNTER_OERRORS, 1);
654 			m_freem(mb);
655 			goto out;
656 		}
657 
658 		set_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags);
659 		memcpy(mcast->mcmember.mgid.raw, mgid, sizeof (union ib_gid));
660 		__ipoib_mcast_add(priv, mcast);
661 		list_add_tail(&mcast->list, &priv->multicast_list);
662 	}
663 
664 	if (!mcast->ah) {
665 		if (mcast->pkt_queue.ifq_len < IPOIB_MAX_MCAST_QUEUE) {
666 			_IF_ENQUEUE(&mcast->pkt_queue, mb);
667 		} else {
668 			if_inc_counter(dev, IFCOUNTER_OERRORS, 1);
669 			m_freem(mb);
670 		}
671 
672 		if (test_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags))
673 			ipoib_dbg_mcast(priv, "no address vector, "
674 					"but multicast join already started\n");
675 		else if (test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags))
676 			ipoib_mcast_sendonly_join(mcast);
677 
678 		/*
679 		 * If lookup completes between here and out:, don't
680 		 * want to send packet twice.
681 		 */
682 		mcast = NULL;
683 	}
684 
685 out:
686 	if (mcast && mcast->ah)
687 		ipoib_send(priv, mb, mcast->ah, IB_MULTICAST_QPN);
688 }
689 
690 void ipoib_mcast_dev_flush(struct ipoib_dev_priv *priv)
691 {
692 	LIST_HEAD(remove_list);
693 	struct ipoib_mcast *mcast, *tmcast;
694 	unsigned long flags;
695 
696 	ipoib_dbg_mcast(priv, "flushing multicast list\n");
697 
698 	spin_lock_irqsave(&priv->lock, flags);
699 
700 	list_for_each_entry_safe(mcast, tmcast, &priv->multicast_list, list) {
701 		list_del(&mcast->list);
702 		rb_erase(&mcast->rb_node, &priv->multicast_tree);
703 		list_add_tail(&mcast->list, &remove_list);
704 	}
705 
706 	if (priv->broadcast) {
707 		rb_erase(&priv->broadcast->rb_node, &priv->multicast_tree);
708 		list_add_tail(&priv->broadcast->list, &remove_list);
709 		priv->broadcast = NULL;
710 	}
711 
712 	spin_unlock_irqrestore(&priv->lock, flags);
713 
714 	list_for_each_entry_safe(mcast, tmcast, &remove_list, list) {
715 		ipoib_mcast_leave(priv, mcast);
716 		ipoib_mcast_free(mcast);
717 	}
718 }
719 
720 static int ipoib_mcast_addr_is_valid(const u8 *addr, unsigned int addrlen,
721 				     const u8 *broadcast)
722 {
723 	if (addrlen != INFINIBAND_ALEN)
724 		return 0;
725 	/* reserved QPN, prefix, scope */
726 	if (memcmp(addr, broadcast, 6))
727 		return 0;
728 	/* signature lower, pkey */
729 	if (memcmp(addr + 7, broadcast + 7, 3))
730 		return 0;
731 	return 1;
732 }
733 
734 void ipoib_mcast_restart_task(struct work_struct *work)
735 {
736 	struct ipoib_dev_priv *priv =
737 		container_of(work, struct ipoib_dev_priv, restart_task);
738 	ipoib_mcast_restart(priv);
739 }
740 
741 struct ipoib_mcast_ctx {
742 	struct ipoib_dev_priv *priv;
743 	struct list_head remove_list;
744 };
745 
746 static u_int
747 ipoib_process_maddr(void *arg, struct sockaddr_dl *sdl, u_int cnt)
748 {
749 	struct ipoib_mcast_ctx *ctx = arg;
750 	struct ipoib_dev_priv *priv = ctx->priv;
751 	struct ipoib_mcast *mcast;
752 	struct ib_sa_mcmember_rec rec;
753 	union ib_gid mgid;
754 	uint8_t *addr;
755 	int addrlen;
756 
757 	addr = LLADDR(sdl);
758 	addrlen = sdl->sdl_alen;
759 	if (!ipoib_mcast_addr_is_valid(addr, addrlen,
760 	    if_getbroadcastaddr(priv->dev)))
761 		return (0);
762 
763 	memcpy(mgid.raw, addr + 4, sizeof mgid);
764 
765 	mcast = __ipoib_mcast_find(priv, &mgid);
766 	if (!mcast || test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags)) {
767 		struct ipoib_mcast *nmcast;
768 
769 		/* ignore group which is directly joined by userspace */
770 		if (test_bit(IPOIB_FLAG_UMCAST, &priv->flags) &&
771 		    !ib_sa_get_mcmember_rec(priv->ca, priv->port, &mgid, &rec)) {
772 			ipoib_dbg_mcast(priv, "ignoring multicast entry for mgid %16D\n",
773 					mgid.raw, ":");
774 			return (0);
775 		}
776 
777 		/* Not found or send-only group, let's add a new entry */
778 		ipoib_dbg_mcast(priv, "adding multicast entry for mgid %16D\n",
779 				mgid.raw, ":");
780 
781 		nmcast = ipoib_mcast_alloc(priv, 0);
782 		if (!nmcast) {
783 			ipoib_warn(priv, "unable to allocate memory for multicast structure\n");
784 			return (0);
785 		}
786 
787 		set_bit(IPOIB_MCAST_FLAG_FOUND, &nmcast->flags);
788 
789 		nmcast->mcmember.mgid = mgid;
790 
791 		if (mcast) {
792 			/* Destroy the send only entry */
793 			list_move_tail(&mcast->list, &ctx->remove_list);
794 
795 			rb_replace_node(&mcast->rb_node,
796 					&nmcast->rb_node,
797 					&priv->multicast_tree);
798 		} else
799 			__ipoib_mcast_add(priv, nmcast);
800 
801 		list_add_tail(&nmcast->list, &priv->multicast_list);
802 	}
803 
804 	if (mcast)
805 		set_bit(IPOIB_MCAST_FLAG_FOUND, &mcast->flags);
806 
807 	return (1);
808 }
809 
810 void ipoib_mcast_restart(struct ipoib_dev_priv *priv)
811 {
812 	struct ipoib_mcast_ctx ctx = { priv,
813 	    { &ctx.remove_list, &ctx.remove_list }};
814 	if_t dev = priv->dev;
815 	struct ipoib_mcast *mcast, *tmcast;
816 
817 	ipoib_dbg_mcast(priv, "restarting multicast task flags 0x%lX\n",
818 	    priv->flags);
819 
820 	ipoib_mcast_stop_thread(priv, 0);
821 
822 	spin_lock(&priv->lock);
823 
824 	/*
825 	 * Unfortunately, the networking core only gives us a list of all of
826 	 * the multicast hardware addresses. We need to figure out which ones
827 	 * are new and which ones have been removed
828 	 */
829 
830 	/* Clear out the found flag */
831 	list_for_each_entry(mcast, &priv->multicast_list, list)
832 		clear_bit(IPOIB_MCAST_FLAG_FOUND, &mcast->flags);
833 
834 	/* Mark all of the entries that are found or don't exist */
835 	ctx.priv = priv;
836 	if_foreach_llmaddr(dev, ipoib_process_maddr, &ctx);
837 
838 	/* Remove all of the entries don't exist anymore */
839 	list_for_each_entry_safe(mcast, tmcast, &priv->multicast_list, list) {
840 		if (!test_bit(IPOIB_MCAST_FLAG_FOUND, &mcast->flags) &&
841 		    !test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags)) {
842 			ipoib_dbg_mcast(priv, "deleting multicast group %16D\n",
843 					mcast->mcmember.mgid.raw, ":");
844 
845 			rb_erase(&mcast->rb_node, &priv->multicast_tree);
846 
847 			/* Move to the remove list */
848 			list_move_tail(&mcast->list, &ctx.remove_list);
849 		}
850 	}
851 
852 	spin_unlock(&priv->lock);
853 
854 	/* We have to cancel outside of the spinlock */
855 	list_for_each_entry_safe(mcast, tmcast, &ctx.remove_list, list) {
856 		ipoib_mcast_leave(mcast->priv, mcast);
857 		ipoib_mcast_free(mcast);
858 	}
859 
860 	if (test_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags))
861 		ipoib_mcast_start_thread(priv);
862 }
863 
864 #ifdef CONFIG_INFINIBAND_IPOIB_DEBUG
865 
866 struct ipoib_mcast_iter *ipoib_mcast_iter_init(struct ipoib_dev_priv *priv)
867 {
868 	struct ipoib_mcast_iter *iter;
869 
870 	iter = kmalloc(sizeof *iter, GFP_KERNEL);
871 	if (!iter)
872 		return NULL;
873 
874 	iter->priv = priv;
875 	memset(iter->mgid.raw, 0, 16);
876 
877 	if (ipoib_mcast_iter_next(iter)) {
878 		kfree(iter);
879 		return NULL;
880 	}
881 
882 	return iter;
883 }
884 
885 int ipoib_mcast_iter_next(struct ipoib_mcast_iter *iter)
886 {
887 	struct ipoib_dev_priv *priv = iter->priv;
888 	struct rb_node *n;
889 	struct ipoib_mcast *mcast;
890 	int ret = 1;
891 
892 	spin_lock_irq(&priv->lock);
893 
894 	n = rb_first(&priv->multicast_tree);
895 
896 	while (n) {
897 		mcast = rb_entry(n, struct ipoib_mcast, rb_node);
898 
899 		if (memcmp(iter->mgid.raw, mcast->mcmember.mgid.raw,
900 			   sizeof (union ib_gid)) < 0) {
901 			iter->mgid      = mcast->mcmember.mgid;
902 			iter->created   = mcast->created;
903 			iter->queuelen  = mcast->pkt_queue.ifq_len;
904 			iter->complete  = !!mcast->ah;
905 			iter->send_only = !!(mcast->flags & (1 << IPOIB_MCAST_FLAG_SENDONLY));
906 
907 			ret = 0;
908 
909 			break;
910 		}
911 
912 		n = rb_next(n);
913 	}
914 
915 	spin_unlock_irq(&priv->lock);
916 
917 	return ret;
918 }
919 
920 void ipoib_mcast_iter_read(struct ipoib_mcast_iter *iter,
921 			   union ib_gid *mgid,
922 			   unsigned long *created,
923 			   unsigned int *queuelen,
924 			   unsigned int *complete,
925 			   unsigned int *send_only)
926 {
927 	*mgid      = iter->mgid;
928 	*created   = iter->created;
929 	*queuelen  = iter->queuelen;
930 	*complete  = iter->complete;
931 	*send_only = iter->send_only;
932 }
933 
934 #endif /* CONFIG_INFINIBAND_IPOIB_DEBUG */
935