xref: /freebsd/sys/ofed/drivers/infiniband/ulp/ipoib/ipoib_ib.c (revision 184c1b943937986c81e1996d999d21626ec7a4ff)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0
3  *
4  * Copyright (c) 2004, 2005 Topspin Communications.  All rights reserved.
5  * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved.
6  * Copyright (c) 2005 Mellanox Technologies. All rights reserved.
7  * Copyright (c) 2004, 2005 Voltaire, Inc. All rights reserved.
8  *
9  * This software is available to you under a choice of one of two
10  * licenses.  You may choose to be licensed under the terms of the GNU
11  * General Public License (GPL) Version 2, available from the file
12  * COPYING in the main directory of this source tree, or the
13  * OpenIB.org BSD license below:
14  *
15  *     Redistribution and use in source and binary forms, with or
16  *     without modification, are permitted provided that the following
17  *     conditions are met:
18  *
19  *      - Redistributions of source code must retain the above
20  *        copyright notice, this list of conditions and the following
21  *        disclaimer.
22  *
23  *      - Redistributions in binary form must reproduce the above
24  *        copyright notice, this list of conditions and the following
25  *        disclaimer in the documentation and/or other materials
26  *        provided with the distribution.
27  *
28  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
29  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
30  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
31  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
32  * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
33  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
34  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
35  * SOFTWARE.
36  */
37 
38 #include <sys/cdefs.h>
39 __FBSDID("$FreeBSD$");
40 
41 #include "ipoib.h"
42 
43 #include <rdma/ib_cache.h>
44 
45 #include <security/mac/mac_framework.h>
46 
47 #include <linux/delay.h>
48 #include <linux/dma-mapping.h>
49 
#ifdef CONFIG_INFINIBAND_IPOIB_DEBUG_DATA
/* Data-path tracing knob; only present when debug-data support is built. */
static int data_debug_level;

module_param(data_debug_level, int, 0644);
MODULE_PARM_DESC(data_debug_level, "Enable data path debug tracing if > 0");
#endif

/*
 * Held in this file around setting/clearing IPOIB_PKEY_STOP and around
 * (re)queueing or cancelling the delayed P_Key poll work.
 */
static DEFINE_MUTEX(pkey_mutex);
59 
60 struct ipoib_ah *ipoib_create_ah(struct ipoib_dev_priv *priv,
61 				 struct ib_pd *pd, struct ib_ah_attr *attr)
62 {
63 	struct ipoib_ah *ah;
64 
65 	ah = kmalloc(sizeof *ah, GFP_KERNEL);
66 	if (!ah)
67 		return NULL;
68 
69 	ah->priv      = priv;
70 	ah->last_send = 0;
71 	kref_init(&ah->ref);
72 
73 	ah->ah = ib_create_ah(pd, attr);
74 	if (IS_ERR(ah->ah)) {
75 		kfree(ah);
76 		ah = NULL;
77 	} else
78 		ipoib_dbg(priv, "Created ah %p\n", ah->ah);
79 
80 	return ah;
81 }
82 
83 void ipoib_free_ah(struct kref *kref)
84 {
85 	struct ipoib_ah *ah = container_of(kref, struct ipoib_ah, ref);
86 	struct ipoib_dev_priv *priv = ah->priv;
87 
88 	unsigned long flags;
89 
90 	spin_lock_irqsave(&priv->lock, flags);
91 	list_add_tail(&ah->list, &priv->dead_ahs);
92 	spin_unlock_irqrestore(&priv->lock, flags);
93 }
94 
95 void
96 ipoib_dma_unmap_rx(struct ipoib_dev_priv *priv, struct ipoib_rx_buf *rx_req)
97 {
98 	struct mbuf *m;
99 	int i;
100 
101 	for (i = 0, m = rx_req->mb; m != NULL; m = m->m_next, i++)
102 		ib_dma_unmap_single(priv->ca, rx_req->mapping[i], m->m_len,
103 		    DMA_FROM_DEVICE);
104 }
105 
106 void
107 ipoib_dma_mb(struct ipoib_dev_priv *priv, struct mbuf *mb, unsigned int length)
108 {
109 
110 	m_adj(mb, -(mb->m_pkthdr.len - length));
111 }
112 
113 struct mbuf *
114 ipoib_alloc_map_mb(struct ipoib_dev_priv *priv, struct ipoib_rx_buf *rx_req,
115     int align, int size)
116 {
117 	struct mbuf *mb, *m;
118 	int i, j;
119 
120 	rx_req->mb = NULL;
121 	mb = m_getm2(NULL, align + size, M_NOWAIT, MT_DATA, M_PKTHDR);
122 	if (mb == NULL)
123 		return (NULL);
124 	for (i = 0, m = mb; m != NULL; m = m->m_next, i++) {
125 		m->m_len = M_SIZE(m) - align;
126 		m->m_data += align;
127 		align = 0;
128 		mb->m_pkthdr.len += m->m_len;
129 		rx_req->mapping[i] = ib_dma_map_single(priv->ca,
130 		    mtod(m, void *), m->m_len, DMA_FROM_DEVICE);
131 		if (unlikely(ib_dma_mapping_error(priv->ca,
132 		    rx_req->mapping[i])))
133 			goto error;
134 
135 	}
136 	rx_req->mb = mb;
137 	return (mb);
138 error:
139 	for (j = 0, m = mb; j < i; m = m->m_next, j++)
140 		ib_dma_unmap_single(priv->ca, rx_req->mapping[j], m->m_len,
141 		    DMA_FROM_DEVICE);
142 	m_freem(mb);
143 	return (NULL);
144 
145 }
146 
147 static int ipoib_ib_post_receive(struct ipoib_dev_priv *priv, int id)
148 {
149 	struct ipoib_rx_buf *rx_req;
150 	struct ib_recv_wr *bad_wr;
151 	struct mbuf *m;
152 	int ret;
153 	int i;
154 
155 	rx_req = &priv->rx_ring[id];
156 	for (m = rx_req->mb, i = 0; m != NULL; m = m->m_next, i++) {
157 		priv->rx_sge[i].addr = rx_req->mapping[i];
158 		priv->rx_sge[i].length = m->m_len;
159 	}
160 	priv->rx_wr.num_sge = i;
161 	priv->rx_wr.wr_id = id | IPOIB_OP_RECV;
162 
163 	ret = ib_post_recv(priv->qp, &priv->rx_wr, &bad_wr);
164 	if (unlikely(ret)) {
165 		ipoib_warn(priv, "receive failed for buf %d (%d)\n", id, ret);
166 		ipoib_dma_unmap_rx(priv, &priv->rx_ring[id]);
167 		m_freem(priv->rx_ring[id].mb);
168 		priv->rx_ring[id].mb = NULL;
169 	}
170 
171 	return ret;
172 }
173 
174 static struct mbuf *
175 ipoib_alloc_rx_mb(struct ipoib_dev_priv *priv, int id)
176 {
177 
178 	return ipoib_alloc_map_mb(priv, &priv->rx_ring[id],
179 	    0, priv->max_ib_mtu + IB_GRH_BYTES);
180 }
181 
182 static int ipoib_ib_post_receives(struct ipoib_dev_priv *priv)
183 {
184 	int i;
185 
186 	for (i = 0; i < ipoib_recvq_size; ++i) {
187 		if (!ipoib_alloc_rx_mb(priv, i)) {
188 			ipoib_warn(priv, "failed to allocate receive buffer %d\n", i);
189 			return -ENOMEM;
190 		}
191 		if (ipoib_ib_post_receive(priv, i)) {
192 			ipoib_warn(priv, "ipoib_ib_post_receive failed for buf %d\n", i);
193 			return -EIO;
194 		}
195 	}
196 
197 	return 0;
198 }
199 
200 static void
201 ipoib_ib_handle_rx_wc(struct ipoib_dev_priv *priv, struct ib_wc *wc)
202 {
203 	struct ipoib_rx_buf saverx;
204 	unsigned int wr_id = wc->wr_id & ~IPOIB_OP_RECV;
205 	struct ifnet *dev = priv->dev;
206 	struct ipoib_header *eh;
207 	struct mbuf *mb;
208 
209 	ipoib_dbg_data(priv, "recv completion: id %d, status: %d\n",
210 		       wr_id, wc->status);
211 
212 	if (unlikely(wr_id >= ipoib_recvq_size)) {
213 		ipoib_warn(priv, "recv completion event with wrid %d (> %d)\n",
214 			   wr_id, ipoib_recvq_size);
215 		return;
216 	}
217 
218 	mb  = priv->rx_ring[wr_id].mb;
219 
220 	if (unlikely(wc->status != IB_WC_SUCCESS)) {
221 		if (wc->status != IB_WC_WR_FLUSH_ERR) {
222 			ipoib_warn(priv, "failed recv event "
223 				   "(status=%d, wrid=%d vend_err %x)\n",
224 				   wc->status, wr_id, wc->vendor_err);
225 			goto repost;
226 		}
227 		if (mb) {
228 			ipoib_dma_unmap_rx(priv, &priv->rx_ring[wr_id]);
229 			m_freem(mb);
230 			priv->rx_ring[wr_id].mb = NULL;
231 		}
232 		return;
233 	}
234 
235 	/*
236 	 * Drop packets that this interface sent, ie multicast packets
237 	 * that the HCA has replicated.
238 	 */
239 	if (wc->slid == priv->local_lid && wc->src_qp == priv->qp->qp_num)
240 		goto repost;
241 
242 	memcpy(&saverx, &priv->rx_ring[wr_id], sizeof(saverx));
243 	/*
244 	 * If we can't allocate a new RX buffer, dump
245 	 * this packet and reuse the old buffer.
246 	 */
247 	if (unlikely(!ipoib_alloc_rx_mb(priv, wr_id))) {
248 		memcpy(&priv->rx_ring[wr_id], &saverx, sizeof(saverx));
249 		if_inc_counter(dev, IFCOUNTER_IQDROPS, 1);
250 		goto repost;
251 	}
252 
253 	ipoib_dbg_data(priv, "received %d bytes, SLID 0x%04x\n",
254 		       wc->byte_len, wc->slid);
255 
256 	ipoib_dma_unmap_rx(priv, &saverx);
257 	ipoib_dma_mb(priv, mb, wc->byte_len);
258 
259 	if_inc_counter(dev, IFCOUNTER_IPACKETS, 1);
260 	if_inc_counter(dev, IFCOUNTER_IBYTES, mb->m_pkthdr.len);
261 	mb->m_pkthdr.rcvif = dev;
262 	m_adj(mb, sizeof(struct ib_grh) - INFINIBAND_ALEN);
263 	eh = mtod(mb, struct ipoib_header *);
264 	bzero(eh->hwaddr, 4);	/* Zero the queue pair, only dgid is in grh */
265 
266 	if (test_bit(IPOIB_FLAG_CSUM, &priv->flags) && likely(wc->wc_flags & IB_WC_IP_CSUM_OK))
267 		mb->m_pkthdr.csum_flags = CSUM_IP_CHECKED | CSUM_IP_VALID;
268 
269 	dev->if_input(dev, mb);
270 
271 repost:
272 	if (unlikely(ipoib_ib_post_receive(priv, wr_id)))
273 		ipoib_warn(priv, "ipoib_ib_post_receive failed "
274 			   "for buf %d\n", wr_id);
275 }
276 
277 int ipoib_dma_map_tx(struct ib_device *ca, struct ipoib_tx_buf *tx_req, int max)
278 {
279 	struct mbuf *mb = tx_req->mb;
280 	u64 *mapping = tx_req->mapping;
281 	struct mbuf *m, *p;
282 	int error;
283 	int i;
284 
285 	for (m = mb, p = NULL, i = 0; m != NULL; p = m, m = m->m_next, i++) {
286 		if (m->m_len != 0)
287 			continue;
288 		if (p == NULL)
289 			panic("ipoib_dma_map_tx: First mbuf empty\n");
290 		p->m_next = m_free(m);
291 		m = p;
292 		i--;
293 	}
294 	i--;
295 	if (i >= max) {
296 		tx_req->mb = mb = m_defrag(mb, M_NOWAIT);
297 		if (mb == NULL)
298 			return -EIO;
299 		for (m = mb, i = 0; m != NULL; m = m->m_next, i++);
300 		if (i >= max)
301 			return -EIO;
302 	}
303 	error = 0;
304 	for (m = mb, i = 0; m != NULL; m = m->m_next, i++) {
305 		mapping[i] = ib_dma_map_single(ca, mtod(m, void *),
306 					       m->m_len, DMA_TO_DEVICE);
307 		if (unlikely(ib_dma_mapping_error(ca, mapping[i]))) {
308 			error = -EIO;
309 			break;
310 		}
311 	}
312 	if (error) {
313 		int end;
314 
315 		end = i;
316 		for (m = mb, i = 0; i < end; m = m->m_next, i++)
317 			ib_dma_unmap_single(ca, mapping[i], m->m_len,
318 					    DMA_TO_DEVICE);
319 	}
320 	return error;
321 }
322 
323 void ipoib_dma_unmap_tx(struct ib_device *ca, struct ipoib_tx_buf *tx_req)
324 {
325 	struct mbuf *mb = tx_req->mb;
326 	u64 *mapping = tx_req->mapping;
327 	struct mbuf *m;
328 	int i;
329 
330 	for (m = mb, i = 0; m != NULL; m = m->m_next, i++)
331 		ib_dma_unmap_single(ca, mapping[i], m->m_len, DMA_TO_DEVICE);
332 }
333 
334 static void ipoib_ib_handle_tx_wc(struct ipoib_dev_priv *priv, struct ib_wc *wc)
335 {
336 	struct ifnet *dev = priv->dev;
337 	unsigned int wr_id = wc->wr_id;
338 	struct ipoib_tx_buf *tx_req;
339 
340 	ipoib_dbg_data(priv, "send completion: id %d, status: %d\n",
341 		       wr_id, wc->status);
342 
343 	if (unlikely(wr_id >= ipoib_sendq_size)) {
344 		ipoib_warn(priv, "send completion event with wrid %d (> %d)\n",
345 			   wr_id, ipoib_sendq_size);
346 		return;
347 	}
348 
349 	tx_req = &priv->tx_ring[wr_id];
350 
351 	ipoib_dma_unmap_tx(priv->ca, tx_req);
352 
353 	if_inc_counter(dev, IFCOUNTER_OPACKETS, 1);
354 
355 	m_freem(tx_req->mb);
356 
357 	++priv->tx_tail;
358 	if (unlikely(--priv->tx_outstanding == ipoib_sendq_size >> 1) &&
359 	    (dev->if_drv_flags & IFF_DRV_OACTIVE) &&
360 	    test_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags))
361 		dev->if_drv_flags &= ~IFF_DRV_OACTIVE;
362 
363 	if (wc->status != IB_WC_SUCCESS &&
364 	    wc->status != IB_WC_WR_FLUSH_ERR)
365 		ipoib_warn(priv, "failed send event "
366 			   "(status=%d, wrid=%d vend_err %x)\n",
367 			   wc->status, wr_id, wc->vendor_err);
368 }
369 
370 int
371 ipoib_poll_tx(struct ipoib_dev_priv *priv, bool do_start)
372 {
373 	int n, i;
374 
375 	n = ib_poll_cq(priv->send_cq, MAX_SEND_CQE, priv->send_wc);
376 	for (i = 0; i < n; ++i) {
377 		struct ib_wc *wc = priv->send_wc + i;
378 		if (wc->wr_id & IPOIB_OP_CM)
379 			ipoib_cm_handle_tx_wc(priv, wc);
380 		else
381 			ipoib_ib_handle_tx_wc(priv, wc);
382 	}
383 
384 	if (do_start && n != 0)
385 		ipoib_start_locked(priv->dev, priv);
386 
387 	return n == MAX_SEND_CQE;
388 }
389 
390 static void
391 ipoib_poll(struct ipoib_dev_priv *priv)
392 {
393 	int n, i;
394 
395 poll_more:
396 	spin_lock(&priv->drain_lock);
397 	for (;;) {
398 		n = ib_poll_cq(priv->recv_cq, IPOIB_NUM_WC, priv->ibwc);
399 		for (i = 0; i < n; i++) {
400 			struct ib_wc *wc = priv->ibwc + i;
401 
402 			if ((wc->wr_id & IPOIB_OP_RECV) == 0)
403 				panic("ipoib_poll: Bad wr_id 0x%jX\n",
404 				    (intmax_t)wc->wr_id);
405 			if (wc->wr_id & IPOIB_OP_CM)
406 				ipoib_cm_handle_rx_wc(priv, wc);
407 			else
408 				ipoib_ib_handle_rx_wc(priv, wc);
409 		}
410 
411 		if (n != IPOIB_NUM_WC)
412 			break;
413 	}
414 	spin_unlock(&priv->drain_lock);
415 
416 	if (ib_req_notify_cq(priv->recv_cq,
417 	    IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS) > 0)
418 		goto poll_more;
419 }
420 
/*
 * CQ completion entry point: 'dev_ptr' is the ipoib_dev_priv whose receive
 * CQ is to be polled.  'cq' itself is not used.
 */
void ipoib_ib_completion(struct ib_cq *cq, void *dev_ptr)
{
	ipoib_poll((struct ipoib_dev_priv *)dev_ptr);
}
427 
428 static void drain_tx_cq(struct ipoib_dev_priv *priv)
429 {
430 	struct ifnet *dev = priv->dev;
431 
432 	spin_lock(&priv->lock);
433 	while (ipoib_poll_tx(priv, true))
434 		; /* nothing */
435 
436 	if (dev->if_drv_flags & IFF_DRV_OACTIVE)
437 		mod_timer(&priv->poll_timer, jiffies + 1);
438 
439 	spin_unlock(&priv->lock);
440 }
441 
442 void ipoib_send_comp_handler(struct ib_cq *cq, void *dev_ptr)
443 {
444 	struct ipoib_dev_priv *priv = dev_ptr;
445 
446 	mod_timer(&priv->poll_timer, jiffies);
447 }
448 
449 static inline int
450 post_send(struct ipoib_dev_priv *priv, unsigned int wr_id,
451     struct ib_ah *address, u32 qpn, struct ipoib_tx_buf *tx_req, void *head,
452     int hlen)
453 {
454 	struct ib_send_wr *bad_wr;
455 	struct mbuf *mb = tx_req->mb;
456 	u64 *mapping = tx_req->mapping;
457 	struct mbuf *m;
458 	int i;
459 
460 	for (m = mb, i = 0; m != NULL; m = m->m_next, i++) {
461 		priv->tx_sge[i].addr         = mapping[i];
462 		priv->tx_sge[i].length       = m->m_len;
463 	}
464 	priv->tx_wr.wr.num_sge	= i;
465 	priv->tx_wr.wr.wr_id	= wr_id;
466 	priv->tx_wr.remote_qpn	= qpn;
467 	priv->tx_wr.ah		= address;
468 
469 	if (head) {
470 		priv->tx_wr.mss		= 0; /* XXX mb_shinfo(mb)->gso_size; */
471 		priv->tx_wr.header	= head;
472 		priv->tx_wr.hlen	= hlen;
473 		priv->tx_wr.wr.opcode	= IB_WR_LSO;
474 	} else
475 		priv->tx_wr.wr.opcode	= IB_WR_SEND;
476 
477 	return ib_post_send(priv->qp, &priv->tx_wr.wr, &bad_wr);
478 }
479 
480 void
481 ipoib_send(struct ipoib_dev_priv *priv, struct mbuf *mb,
482     struct ipoib_ah *address, u32 qpn)
483 {
484 	struct ifnet *dev = priv->dev;
485 	struct ipoib_tx_buf *tx_req;
486 	int hlen;
487 	void *phead;
488 
489 	if (unlikely(priv->tx_outstanding > MAX_SEND_CQE))
490 		while (ipoib_poll_tx(priv, false))
491 			; /* nothing */
492 
493 	m_adj(mb, sizeof (struct ipoib_pseudoheader));
494 	if (0 /* XXX segment offload mb_is_gso(mb) */) {
495 		/* XXX hlen = mb_transport_offset(mb) + tcp_hdrlen(mb); */
496 		phead = mtod(mb, void *);
497 		if (mb->m_len < hlen) {
498 			ipoib_warn(priv, "linear data too small\n");
499 			if_inc_counter(dev, IFCOUNTER_OERRORS, 1);
500 			m_freem(mb);
501 			return;
502 		}
503 		m_adj(mb, hlen);
504 	} else {
505 		if (unlikely(mb->m_pkthdr.len - IPOIB_ENCAP_LEN > priv->mcast_mtu)) {
506 			ipoib_warn(priv, "packet len %d (> %d) too long to send, dropping\n",
507 				   mb->m_pkthdr.len, priv->mcast_mtu);
508 			if_inc_counter(dev, IFCOUNTER_OERRORS, 1);
509 			ipoib_cm_mb_too_long(priv, mb, priv->mcast_mtu);
510 			return;
511 		}
512 		phead = NULL;
513 		hlen  = 0;
514 	}
515 
516 	ipoib_dbg_data(priv, "sending packet, length=%d address=%p qpn=0x%06x\n",
517 		       mb->m_pkthdr.len, address, qpn);
518 
519 	/*
520 	 * We put the mb into the tx_ring _before_ we call post_send()
521 	 * because it's entirely possible that the completion handler will
522 	 * run before we execute anything after the post_send().  That
523 	 * means we have to make sure everything is properly recorded and
524 	 * our state is consistent before we call post_send().
525 	 */
526 	tx_req = &priv->tx_ring[priv->tx_head & (ipoib_sendq_size - 1)];
527 	tx_req->mb = mb;
528 	if (unlikely(ipoib_dma_map_tx(priv->ca, tx_req, IPOIB_UD_TX_SG))) {
529 		if_inc_counter(dev, IFCOUNTER_OERRORS, 1);
530 		if (tx_req->mb)
531 			m_freem(tx_req->mb);
532 		return;
533 	}
534 
535 	if (mb->m_pkthdr.csum_flags & (CSUM_IP|CSUM_TCP|CSUM_UDP))
536 		priv->tx_wr.wr.send_flags |= IB_SEND_IP_CSUM;
537 	else
538 		priv->tx_wr.wr.send_flags &= ~IB_SEND_IP_CSUM;
539 
540 	if (++priv->tx_outstanding == ipoib_sendq_size) {
541 		ipoib_dbg(priv, "TX ring full, stopping kernel net queue\n");
542 		if (ib_req_notify_cq(priv->send_cq, IB_CQ_NEXT_COMP))
543 			ipoib_warn(priv, "request notify on send CQ failed\n");
544 		dev->if_drv_flags |= IFF_DRV_OACTIVE;
545 	}
546 
547 	if (unlikely(post_send(priv,
548 	    priv->tx_head & (ipoib_sendq_size - 1), address->ah, qpn,
549 	    tx_req, phead, hlen))) {
550 		ipoib_warn(priv, "post_send failed\n");
551 		if_inc_counter(dev, IFCOUNTER_OERRORS, 1);
552 		--priv->tx_outstanding;
553 		ipoib_dma_unmap_tx(priv->ca, tx_req);
554 		m_freem(mb);
555 		if (dev->if_drv_flags & IFF_DRV_OACTIVE)
556 			dev->if_drv_flags &= ~IFF_DRV_OACTIVE;
557 	} else {
558 		address->last_send = priv->tx_head;
559 		++priv->tx_head;
560 	}
561 }
562 
563 static void __ipoib_reap_ah(struct ipoib_dev_priv *priv)
564 {
565 	struct ipoib_ah *ah, *tah;
566 	LIST_HEAD(remove_list);
567 	unsigned long flags;
568 
569 	spin_lock_irqsave(&priv->lock, flags);
570 
571 	list_for_each_entry_safe(ah, tah, &priv->dead_ahs, list)
572 		if ((int) priv->tx_tail - (int) ah->last_send >= 0) {
573 			list_del(&ah->list);
574 			ib_destroy_ah(ah->ah);
575 			kfree(ah);
576 		}
577 
578 	spin_unlock_irqrestore(&priv->lock, flags);
579 }
580 
581 void ipoib_reap_ah(struct work_struct *work)
582 {
583 	struct ipoib_dev_priv *priv =
584 		container_of(work, struct ipoib_dev_priv, ah_reap_task.work);
585 
586 	__ipoib_reap_ah(priv);
587 
588 	if (!test_bit(IPOIB_STOP_REAPER, &priv->flags))
589 		queue_delayed_work(ipoib_workqueue, &priv->ah_reap_task,
590 				   HZ);
591 }
592 
593 static void ipoib_ah_dev_cleanup(struct ipoib_dev_priv *priv)
594 {
595 	unsigned long begin;
596 
597 	begin = jiffies;
598 
599 	while (!list_empty(&priv->dead_ahs)) {
600 		__ipoib_reap_ah(priv);
601 
602 		if (time_after(jiffies, begin + HZ)) {
603 			ipoib_warn(priv, "timing out; will leak address handles\n");
604 			break;
605 		}
606 
607 		msleep(1);
608 	}
609 }
610 
/* poll_timer callback: 'ctx' carries the ipoib_dev_priv pointer. */
static void ipoib_ib_tx_timer_func(unsigned long ctx)
{
	struct ipoib_dev_priv *priv = (struct ipoib_dev_priv *)ctx;

	drain_tx_cq(priv);
}
615 
616 int ipoib_ib_dev_open(struct ipoib_dev_priv *priv)
617 {
618 	int ret;
619 
620 	if (ib_find_pkey(priv->ca, priv->port, priv->pkey, &priv->pkey_index)) {
621 		ipoib_warn(priv, "P_Key 0x%04x not found\n", priv->pkey);
622 		clear_bit(IPOIB_PKEY_ASSIGNED, &priv->flags);
623 		return -1;
624 	}
625 	set_bit(IPOIB_PKEY_ASSIGNED, &priv->flags);
626 
627 	ret = ipoib_init_qp(priv);
628 	if (ret) {
629 		ipoib_warn(priv, "ipoib_init_qp returned %d\n", ret);
630 		return -1;
631 	}
632 
633 	ret = ipoib_ib_post_receives(priv);
634 	if (ret) {
635 		ipoib_warn(priv, "ipoib_ib_post_receives returned %d\n", ret);
636 		ipoib_ib_dev_stop(priv, 1);
637 		return -1;
638 	}
639 
640 	ret = ipoib_cm_dev_open(priv);
641 	if (ret) {
642 		ipoib_warn(priv, "ipoib_cm_dev_open returned %d\n", ret);
643 		ipoib_ib_dev_stop(priv, 1);
644 		return -1;
645 	}
646 
647 	clear_bit(IPOIB_STOP_REAPER, &priv->flags);
648 	queue_delayed_work(ipoib_workqueue, &priv->ah_reap_task, HZ);
649 
650 	set_bit(IPOIB_FLAG_INITIALIZED, &priv->flags);
651 
652 	return 0;
653 }
654 
655 static void ipoib_pkey_dev_check_presence(struct ipoib_dev_priv *priv)
656 {
657 	u16 pkey_index = 0;
658 
659 	if (ib_find_pkey(priv->ca, priv->port, priv->pkey, &pkey_index))
660 		clear_bit(IPOIB_PKEY_ASSIGNED, &priv->flags);
661 	else
662 		set_bit(IPOIB_PKEY_ASSIGNED, &priv->flags);
663 }
664 
665 int ipoib_ib_dev_up(struct ipoib_dev_priv *priv)
666 {
667 
668 	ipoib_pkey_dev_check_presence(priv);
669 
670 	if (!test_bit(IPOIB_PKEY_ASSIGNED, &priv->flags)) {
671 		ipoib_dbg(priv, "PKEY is not assigned.\n");
672 		return 0;
673 	}
674 
675 	set_bit(IPOIB_FLAG_OPER_UP, &priv->flags);
676 
677 	return ipoib_mcast_start_thread(priv);
678 }
679 
680 int ipoib_ib_dev_down(struct ipoib_dev_priv *priv, int flush)
681 {
682 
683 	ipoib_dbg(priv, "downing ib_dev\n");
684 
685 	clear_bit(IPOIB_FLAG_OPER_UP, &priv->flags);
686 	if_link_state_change(priv->dev, LINK_STATE_DOWN);
687 
688 	/* Shutdown the P_Key thread if still active */
689 	if (!test_bit(IPOIB_PKEY_ASSIGNED, &priv->flags)) {
690 		mutex_lock(&pkey_mutex);
691 		set_bit(IPOIB_PKEY_STOP, &priv->flags);
692 		cancel_delayed_work(&priv->pkey_poll_task);
693 		mutex_unlock(&pkey_mutex);
694 		if (flush)
695 			flush_workqueue(ipoib_workqueue);
696 	}
697 
698 	ipoib_mcast_stop_thread(priv, flush);
699 	ipoib_mcast_dev_flush(priv);
700 
701 	ipoib_flush_paths(priv);
702 
703 	return 0;
704 }
705 
706 static int recvs_pending(struct ipoib_dev_priv *priv)
707 {
708 	int pending = 0;
709 	int i;
710 
711 	for (i = 0; i < ipoib_recvq_size; ++i)
712 		if (priv->rx_ring[i].mb)
713 			++pending;
714 
715 	return pending;
716 }
717 
718 static void check_qp_movement_and_print(struct ipoib_dev_priv *priv,
719 					struct ib_qp *qp,
720 					enum ib_qp_state new_state)
721 {
722 	struct ib_qp_attr qp_attr;
723 	struct ib_qp_init_attr query_init_attr;
724 	int ret;
725 
726 	ret = ib_query_qp(qp, &qp_attr, IB_QP_STATE, &query_init_attr);
727 	if (ret) {
728 		ipoib_warn(priv, "%s: Failed to query QP (%d)\n", __func__, ret);
729 		return;
730 	}
731 
732 	/* print according to the new-state and the previous state */
733 	if (new_state == IB_QPS_ERR && qp_attr.qp_state == IB_QPS_RESET) {
734 		ipoib_dbg(priv, "Failed to modify QP %d->%d, acceptable\n",
735 			  qp_attr.qp_state, new_state);
736 	} else {
737 		ipoib_warn(priv, "Failed to modify QP %d->%d\n",
738 			   qp_attr.qp_state, new_state);
739 	}
740 }
741 
742 void ipoib_drain_cq(struct ipoib_dev_priv *priv)
743 {
744 	int i, n;
745 
746 	spin_lock(&priv->drain_lock);
747 	do {
748 		n = ib_poll_cq(priv->recv_cq, IPOIB_NUM_WC, priv->ibwc);
749 		for (i = 0; i < n; ++i) {
750 			/*
751 			 * Convert any successful completions to flush
752 			 * errors to avoid passing packets up the
753 			 * stack after bringing the device down.
754 			 */
755 			if (priv->ibwc[i].status == IB_WC_SUCCESS)
756 				priv->ibwc[i].status = IB_WC_WR_FLUSH_ERR;
757 
758 			if ((priv->ibwc[i].wr_id & IPOIB_OP_RECV) == 0)
759 				panic("ipoib_drain_cq:  Bad wrid 0x%jX\n",
760 				    (intmax_t)priv->ibwc[i].wr_id);
761 			if (priv->ibwc[i].wr_id & IPOIB_OP_CM)
762 				ipoib_cm_handle_rx_wc(priv, priv->ibwc + i);
763 			else
764 				ipoib_ib_handle_rx_wc(priv, priv->ibwc + i);
765 		}
766 	} while (n == IPOIB_NUM_WC);
767 	spin_unlock(&priv->drain_lock);
768 
769 	spin_lock(&priv->lock);
770 	while (ipoib_poll_tx(priv, true))
771 		; /* nothing */
772 
773 	spin_unlock(&priv->lock);
774 }
775 
776 int ipoib_ib_dev_stop(struct ipoib_dev_priv *priv, int flush)
777 {
778 	struct ib_qp_attr qp_attr;
779 	unsigned long begin;
780 	struct ipoib_tx_buf *tx_req;
781 	int i;
782 
783 	clear_bit(IPOIB_FLAG_INITIALIZED, &priv->flags);
784 
785 	ipoib_cm_dev_stop(priv);
786 
787 	/*
788 	 * Move our QP to the error state and then reinitialize in
789 	 * when all work requests have completed or have been flushed.
790 	 */
791 	qp_attr.qp_state = IB_QPS_ERR;
792 	if (ib_modify_qp(priv->qp, &qp_attr, IB_QP_STATE))
793 		check_qp_movement_and_print(priv, priv->qp, IB_QPS_ERR);
794 
795 	/* Wait for all sends and receives to complete */
796 	begin = jiffies;
797 
798 	while (priv->tx_head != priv->tx_tail || recvs_pending(priv)) {
799 		if (time_after(jiffies, begin + 5 * HZ)) {
800 			ipoib_warn(priv, "timing out; %d sends %d receives not completed\n",
801 				   priv->tx_head - priv->tx_tail, recvs_pending(priv));
802 
803 			/*
804 			 * assume the HW is wedged and just free up
805 			 * all our pending work requests.
806 			 */
807 			while ((int) priv->tx_tail - (int) priv->tx_head < 0) {
808 				tx_req = &priv->tx_ring[priv->tx_tail &
809 							(ipoib_sendq_size - 1)];
810 				ipoib_dma_unmap_tx(priv->ca, tx_req);
811 				m_freem(tx_req->mb);
812 				++priv->tx_tail;
813 				--priv->tx_outstanding;
814 			}
815 
816 			for (i = 0; i < ipoib_recvq_size; ++i) {
817 				struct ipoib_rx_buf *rx_req;
818 
819 				rx_req = &priv->rx_ring[i];
820 				if (!rx_req->mb)
821 					continue;
822 				ipoib_dma_unmap_rx(priv, &priv->rx_ring[i]);
823 				m_freem(rx_req->mb);
824 				rx_req->mb = NULL;
825 			}
826 
827 			goto timeout;
828 		}
829 
830 		ipoib_drain_cq(priv);
831 
832 		msleep(1);
833 	}
834 
835 	ipoib_dbg(priv, "All sends and receives done.\n");
836 
837 timeout:
838 	del_timer_sync(&priv->poll_timer);
839 	qp_attr.qp_state = IB_QPS_RESET;
840 	if (ib_modify_qp(priv->qp, &qp_attr, IB_QP_STATE))
841 		ipoib_warn(priv, "Failed to modify QP to RESET state\n");
842 
843 	/* Wait for all AHs to be reaped */
844 	set_bit(IPOIB_STOP_REAPER, &priv->flags);
845 	cancel_delayed_work(&priv->ah_reap_task);
846 	if (flush)
847 		flush_workqueue(ipoib_workqueue);
848 
849 	ipoib_ah_dev_cleanup(priv);
850 
851 	ib_req_notify_cq(priv->recv_cq, IB_CQ_NEXT_COMP);
852 
853 	return 0;
854 }
855 
856 int ipoib_ib_dev_init(struct ipoib_dev_priv *priv, struct ib_device *ca, int port)
857 {
858 	struct ifnet *dev = priv->dev;
859 
860 	priv->ca = ca;
861 	priv->port = port;
862 	priv->qp = NULL;
863 
864 	if (ipoib_transport_dev_init(priv, ca)) {
865 		printk(KERN_WARNING "%s: ipoib_transport_dev_init failed\n", ca->name);
866 		return -ENODEV;
867 	}
868 
869 	setup_timer(&priv->poll_timer, ipoib_ib_tx_timer_func,
870 		    (unsigned long) priv);
871 
872 	if (dev->if_flags & IFF_UP) {
873 		if (ipoib_ib_dev_open(priv)) {
874 			ipoib_transport_dev_cleanup(priv);
875 			return -ENODEV;
876 		}
877 	}
878 
879 	return 0;
880 }
881 
882 static void __ipoib_ib_dev_flush(struct ipoib_dev_priv *priv,
883 				enum ipoib_flush_level level)
884 {
885 	struct ipoib_dev_priv *cpriv;
886 	u16 new_index;
887 
888 	mutex_lock(&priv->vlan_mutex);
889 
890 	/*
891 	 * Flush any child interfaces too -- they might be up even if
892 	 * the parent is down.
893 	 */
894 	list_for_each_entry(cpriv, &priv->child_intfs, list)
895 		__ipoib_ib_dev_flush(cpriv, level);
896 
897 	mutex_unlock(&priv->vlan_mutex);
898 
899 	if (!test_bit(IPOIB_FLAG_INITIALIZED, &priv->flags)) {
900 		ipoib_dbg(priv, "Not flushing - IPOIB_FLAG_INITIALIZED not set.\n");
901 		return;
902 	}
903 
904 	if (!test_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags)) {
905 		ipoib_dbg(priv, "Not flushing - IPOIB_FLAG_ADMIN_UP not set.\n");
906 		return;
907 	}
908 
909 	if (level == IPOIB_FLUSH_HEAVY) {
910 		if (ib_find_pkey(priv->ca, priv->port, priv->pkey, &new_index)) {
911 			clear_bit(IPOIB_PKEY_ASSIGNED, &priv->flags);
912 			ipoib_ib_dev_down(priv, 0);
913 			ipoib_ib_dev_stop(priv, 0);
914 			if (ipoib_pkey_dev_delay_open(priv))
915 				return;
916 		}
917 
918 		/* restart QP only if P_Key index is changed */
919 		if (test_and_set_bit(IPOIB_PKEY_ASSIGNED, &priv->flags) &&
920 		    new_index == priv->pkey_index) {
921 			ipoib_dbg(priv, "Not flushing - P_Key index not changed.\n");
922 			return;
923 		}
924 		priv->pkey_index = new_index;
925 	}
926 
927 	if (level == IPOIB_FLUSH_LIGHT) {
928 		ipoib_mark_paths_invalid(priv);
929 		ipoib_mcast_dev_flush(priv);
930 	}
931 
932 	if (level >= IPOIB_FLUSH_NORMAL)
933 		ipoib_ib_dev_down(priv, 0);
934 
935 	if (level == IPOIB_FLUSH_HEAVY) {
936 		ipoib_ib_dev_stop(priv, 0);
937 		ipoib_ib_dev_open(priv);
938 	}
939 
940 	/*
941 	 * The device could have been brought down between the start and when
942 	 * we get here, don't bring it back up if it's not configured up
943 	 */
944 	if (test_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags)) {
945 		if (level >= IPOIB_FLUSH_NORMAL)
946 			ipoib_ib_dev_up(priv);
947 		ipoib_mcast_restart_task(&priv->restart_task);
948 	}
949 }
950 
951 void ipoib_ib_dev_flush_light(struct work_struct *work)
952 {
953 	struct ipoib_dev_priv *priv =
954 		container_of(work, struct ipoib_dev_priv, flush_light);
955 
956 	__ipoib_ib_dev_flush(priv, IPOIB_FLUSH_LIGHT);
957 }
958 
959 void ipoib_ib_dev_flush_normal(struct work_struct *work)
960 {
961 	struct ipoib_dev_priv *priv =
962 		container_of(work, struct ipoib_dev_priv, flush_normal);
963 
964 	__ipoib_ib_dev_flush(priv, IPOIB_FLUSH_NORMAL);
965 }
966 
967 void ipoib_ib_dev_flush_heavy(struct work_struct *work)
968 {
969 	struct ipoib_dev_priv *priv =
970 		container_of(work, struct ipoib_dev_priv, flush_heavy);
971 
972 	__ipoib_ib_dev_flush(priv, IPOIB_FLUSH_HEAVY);
973 }
974 
/*
 * Final teardown of the IB side: stop the multicast thread (with flush),
 * flush multicast groups, reap outstanding address handles and release
 * the transport resources, in that order.
 */
void ipoib_ib_dev_cleanup(struct ipoib_dev_priv *priv)
{

	ipoib_dbg(priv, "cleaning up ib_dev\n");

	/* Multicast first ... */
	ipoib_mcast_stop_thread(priv, 1);
	ipoib_mcast_dev_flush(priv);

	/* ... then address handles, then the transport itself. */
	ipoib_ah_dev_cleanup(priv);
	ipoib_transport_dev_cleanup(priv);
}
986 
/*
 * Delayed P_Key Assignment Interim Support
 *
 * The following is an initial implementation of a delayed P_Key assignment
 * mechanism. It uses the same approach as the one implemented for the
 * multicast group join. The single goal of this implementation is to
 * quickly address Bug #2507. This implementation will probably be removed
 * when the P_Key change async notification is available.
 */
996 
997 void ipoib_pkey_poll(struct work_struct *work)
998 {
999 	struct ipoib_dev_priv *priv =
1000 		container_of(work, struct ipoib_dev_priv, pkey_poll_task.work);
1001 
1002 	ipoib_pkey_dev_check_presence(priv);
1003 
1004 	if (test_bit(IPOIB_PKEY_ASSIGNED, &priv->flags))
1005 		ipoib_open(priv);
1006 	else {
1007 		mutex_lock(&pkey_mutex);
1008 		if (!test_bit(IPOIB_PKEY_STOP, &priv->flags))
1009 			queue_delayed_work(ipoib_workqueue,
1010 					   &priv->pkey_poll_task,
1011 					   HZ);
1012 		mutex_unlock(&pkey_mutex);
1013 	}
1014 }
1015 
1016 int ipoib_pkey_dev_delay_open(struct ipoib_dev_priv *priv)
1017 {
1018 
1019 	/* Look for the interface pkey value in the IB Port P_Key table and */
1020 	/* set the interface pkey assigment flag                            */
1021 	ipoib_pkey_dev_check_presence(priv);
1022 
1023 	/* P_Key value not assigned yet - start polling */
1024 	if (!test_bit(IPOIB_PKEY_ASSIGNED, &priv->flags)) {
1025 		mutex_lock(&pkey_mutex);
1026 		clear_bit(IPOIB_PKEY_STOP, &priv->flags);
1027 		queue_delayed_work(ipoib_workqueue,
1028 				   &priv->pkey_poll_task,
1029 				   HZ);
1030 		mutex_unlock(&pkey_mutex);
1031 		return 1;
1032 	}
1033 
1034 	return 0;
1035 }
1036