/* /freebsd/sys/ofed/drivers/infiniband/ulp/ipoib/ipoib_ib.c (revision 4f52dfbb8d6c4d446500c5b097e3806ec219fbd4) */
/*-
 * SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0
 *
 * Copyright (c) 2004, 2005 Topspin Communications.  All rights reserved.
 * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved.
 * Copyright (c) 2005 Mellanox Technologies. All rights reserved.
 * Copyright (c) 2004, 2005 Voltaire, Inc. All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses.  You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the
 * OpenIB.org BSD license below:
 *
 *     Redistribution and use in source and binary forms, with or
 *     without modification, are permitted provided that the following
 *     conditions are met:
 *
 *      - Redistributions of source code must retain the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer.
 *
 *      - Redistributions in binary form must reproduce the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer in the documentation and/or other materials
 *        provided with the distribution.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include "ipoib.h"

#include <rdma/ib_cache.h>

#include <security/mac/mac_framework.h>

#include <linux/delay.h>
#include <linux/dma-mapping.h>

#ifdef CONFIG_INFINIBAND_IPOIB_DEBUG_DATA
static int data_debug_level;

module_param(data_debug_level, int, 0644);
MODULE_PARM_DESC(data_debug_level,
		 "Enable data path debug tracing if > 0");
#endif

static DEFINE_MUTEX(pkey_mutex);

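/*
 * Allocate an ipoib_ah wrapper and create the underlying ib_ah for the
 * given address attributes; returns NULL if either step fails.
 */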
struct ipoib_ah *ipoib_create_ah(struct ipoib_dev_priv *priv,
				 struct ib_pd *pd, struct ib_ah_attr *attr)
{
	struct ipoib_ah *ah;

	ah = kmalloc(sizeof *ah, GFP_KERNEL);
	if (!ah)
		return NULL;

	ah->priv      = priv;
	ah->last_send = 0;
	kref_init(&ah->ref);

	ah->ah = ib_create_ah(pd, attr);
	if (IS_ERR(ah->ah)) {
		kfree(ah);
		ah = NULL;
	} else
		ipoib_dbg(priv, "Created ah %p\n", ah->ah);

	return ah;
}

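/*
 * kref release callback: move the AH onto the dead_ahs list so the
 * periodic reaper can destroy it once its last posted send has completed.
 */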
void ipoib_free_ah(struct kref *kref)
{
	struct ipoib_ah *ah = container_of(kref, struct ipoib_ah, ref);
	struct ipoib_dev_priv *priv = ah->priv;

	unsigned long flags;

	spin_lock_irqsave(&priv->lock, flags);
	list_add_tail(&ah->list, &priv->dead_ahs);
	spin_unlock_irqrestore(&priv->lock, flags);
}

void
ipoib_dma_unmap_rx(struct ipoib_dev_priv *priv, struct ipoib_rx_buf *rx_req)
{
	struct mbuf *m;
	int i;

	for (i = 0, m = rx_req->mb; m != NULL; m = m->m_next, i++)
		ib_dma_unmap_single(priv->ca, rx_req->mapping[i], m->m_len,
		    DMA_FROM_DEVICE);
}

void
ipoib_dma_mb(struct ipoib_dev_priv *priv, struct mbuf *mb, unsigned int length)
{

	m_adj(mb, -(mb->m_pkthdr.len - length));
}

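/*
 * Allocate an mbuf chain large enough to hold "size" bytes and DMA-map
 * each segment for receive.  On success the chain is recorded in rx_req;
 * on failure everything is unmapped and freed and NULL is returned.
 */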
struct mbuf *
ipoib_alloc_map_mb(struct ipoib_dev_priv *priv, struct ipoib_rx_buf *rx_req,
    int size)
{
	struct mbuf *mb, *m;
	int i, j;

	rx_req->mb = NULL;
	mb = m_getm2(NULL, size, M_NOWAIT, MT_DATA, M_PKTHDR);
	if (mb == NULL)
		return (NULL);
	for (i = 0, m = mb; m != NULL; m = m->m_next, i++) {
		m->m_len = M_SIZE(m);
		mb->m_pkthdr.len += m->m_len;
		rx_req->mapping[i] = ib_dma_map_single(priv->ca,
		    mtod(m, void *), m->m_len, DMA_FROM_DEVICE);
		if (unlikely(ib_dma_mapping_error(priv->ca,
		    rx_req->mapping[i])))
			goto error;

	}
	rx_req->mb = mb;
	return (mb);
error:
	for (j = 0, m = mb; j < i; m = m->m_next, j++)
		ib_dma_unmap_single(priv->ca, rx_req->mapping[j], m->m_len,
		    DMA_FROM_DEVICE);
	m_freem(mb);
	return (NULL);
}

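/*
 * Build a scatter list covering the mapped RX mbuf chain for slot "id"
 * and post it on the receive queue.  On failure the buffer is unmapped
 * and freed.
 */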
static int ipoib_ib_post_receive(struct ipoib_dev_priv *priv, int id)
{
	struct ipoib_rx_buf *rx_req;
	struct ib_recv_wr *bad_wr;
	struct mbuf *m;
	int ret;
	int i;

	rx_req = &priv->rx_ring[id];
	for (m = rx_req->mb, i = 0; m != NULL; m = m->m_next, i++) {
		priv->rx_sge[i].addr = rx_req->mapping[i];
		priv->rx_sge[i].length = m->m_len;
	}
	priv->rx_wr.num_sge = i;
	priv->rx_wr.wr_id = id | IPOIB_OP_RECV;

	ret = ib_post_recv(priv->qp, &priv->rx_wr, &bad_wr);
	if (unlikely(ret)) {
		ipoib_warn(priv, "receive failed for buf %d (%d)\n", id, ret);
		ipoib_dma_unmap_rx(priv, &priv->rx_ring[id]);
		m_freem(priv->rx_ring[id].mb);
		priv->rx_ring[id].mb = NULL;
	}

	return ret;
}

static struct mbuf *
ipoib_alloc_rx_mb(struct ipoib_dev_priv *priv, int id)
{

	return ipoib_alloc_map_mb(priv, &priv->rx_ring[id],
	    priv->max_ib_mtu + IB_GRH_BYTES);
}

static int ipoib_ib_post_receives(struct ipoib_dev_priv *priv)
{
	int i;

	for (i = 0; i < ipoib_recvq_size; ++i) {
		if (!ipoib_alloc_rx_mb(priv, i)) {
			ipoib_warn(priv, "failed to allocate receive buffer %d\n", i);
			return -ENOMEM;
		}
		if (ipoib_ib_post_receive(priv, i)) {
			ipoib_warn(priv, "ipoib_ib_post_receive failed for buf %d\n", i);
			return -EIO;
		}
	}

	return 0;
}

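/*
 * Handle a receive completion: on error just release (or repost) the
 * buffer; on success replace the ring entry with a freshly mapped mbuf
 * chain, trim the old chain to the received length, hand it to the
 * network stack, and repost the slot.
 */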
static void
ipoib_ib_handle_rx_wc(struct ipoib_dev_priv *priv, struct ib_wc *wc)
{
	struct ipoib_rx_buf saverx;
	unsigned int wr_id = wc->wr_id & ~IPOIB_OP_RECV;
	struct ifnet *dev = priv->dev;
	struct ipoib_header *eh;
	struct mbuf *mb;

	ipoib_dbg_data(priv, "recv completion: id %d, status: %d\n",
		       wr_id, wc->status);

	if (unlikely(wr_id >= ipoib_recvq_size)) {
		ipoib_warn(priv, "recv completion event with wrid %d (> %d)\n",
			   wr_id, ipoib_recvq_size);
		return;
	}

	mb  = priv->rx_ring[wr_id].mb;

	if (unlikely(wc->status != IB_WC_SUCCESS)) {
		if (wc->status != IB_WC_WR_FLUSH_ERR) {
			ipoib_warn(priv, "failed recv event "
				   "(status=%d, wrid=%d vend_err %x)\n",
				   wc->status, wr_id, wc->vendor_err);
			goto repost;
		}
		if (mb) {
			ipoib_dma_unmap_rx(priv, &priv->rx_ring[wr_id]);
			m_freem(mb);
			priv->rx_ring[wr_id].mb = NULL;
		}
		return;
	}

	/*
	 * Drop packets that this interface sent, i.e., multicast packets
	 * that the HCA has replicated.
	 */
	if (wc->slid == priv->local_lid && wc->src_qp == priv->qp->qp_num)
		goto repost;

	memcpy(&saverx, &priv->rx_ring[wr_id], sizeof(saverx));
	/*
	 * If we can't allocate a new RX buffer, dump
	 * this packet and reuse the old buffer.
	 */
	if (unlikely(!ipoib_alloc_rx_mb(priv, wr_id))) {
		memcpy(&priv->rx_ring[wr_id], &saverx, sizeof(saverx));
		if_inc_counter(dev, IFCOUNTER_IQDROPS, 1);
		goto repost;
	}

	ipoib_dbg_data(priv, "received %d bytes, SLID 0x%04x\n",
		       wc->byte_len, wc->slid);

	ipoib_dma_unmap_rx(priv, &saverx);
	ipoib_dma_mb(priv, mb, wc->byte_len);

	if_inc_counter(dev, IFCOUNTER_IPACKETS, 1);
	if_inc_counter(dev, IFCOUNTER_IBYTES, mb->m_pkthdr.len);
	mb->m_pkthdr.rcvif = dev;
	m_adj(mb, sizeof(struct ib_grh) - INFINIBAND_ALEN);
	eh = mtod(mb, struct ipoib_header *);
	bzero(eh->hwaddr, 4);	/* Zero the queue pair, only dgid is in grh */

	if (test_bit(IPOIB_FLAG_CSUM, &priv->flags) && likely(wc->wc_flags & IB_WC_IP_CSUM_OK))
		mb->m_pkthdr.csum_flags = CSUM_IP_CHECKED | CSUM_IP_VALID;

	dev->if_input(dev, mb);

repost:
	if (unlikely(ipoib_ib_post_receive(priv, wr_id)))
		ipoib_warn(priv, "ipoib_ib_post_receive failed "
			   "for buf %d\n", wr_id);
}

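/*
 * Prepare an mbuf chain for transmission: drop zero-length mbufs, defrag
 * the chain if it has more than "max" segments, and DMA-map every segment.
 * Returns 0 on success or -EIO with everything unmapped on failure.
 */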
int ipoib_dma_map_tx(struct ib_device *ca, struct ipoib_tx_buf *tx_req, int max)
{
	struct mbuf *mb = tx_req->mb;
	u64 *mapping = tx_req->mapping;
	struct mbuf *m, *p;
	int error;
	int i;

	for (m = mb, p = NULL, i = 0; m != NULL; p = m, m = m->m_next, i++) {
		if (m->m_len != 0)
			continue;
		if (p == NULL)
			panic("ipoib_dma_map_tx: First mbuf empty\n");
		p->m_next = m_free(m);
		m = p;
		i--;
	}
	i--;
	if (i >= max) {
		tx_req->mb = mb = m_defrag(mb, M_NOWAIT);
		if (mb == NULL)
			return -EIO;
		for (m = mb, i = 0; m != NULL; m = m->m_next, i++);
		if (i >= max)
			return -EIO;
	}
	error = 0;
	for (m = mb, i = 0; m != NULL; m = m->m_next, i++) {
		mapping[i] = ib_dma_map_single(ca, mtod(m, void *),
					       m->m_len, DMA_TO_DEVICE);
		if (unlikely(ib_dma_mapping_error(ca, mapping[i]))) {
			error = -EIO;
			break;
		}
	}
	if (error) {
		int end;

		end = i;
		for (m = mb, i = 0; i < end; m = m->m_next, i++)
			ib_dma_unmap_single(ca, mapping[i], m->m_len,
					    DMA_TO_DEVICE);
	}
	return error;
}

void ipoib_dma_unmap_tx(struct ib_device *ca, struct ipoib_tx_buf *tx_req)
{
	struct mbuf *mb = tx_req->mb;
	u64 *mapping = tx_req->mapping;
	struct mbuf *m;
	int i;

	for (m = mb, i = 0; m != NULL; m = m->m_next, i++)
		ib_dma_unmap_single(ca, mapping[i], m->m_len, DMA_TO_DEVICE);
}

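/*
 * Handle a send completion: unmap and free the mbuf, advance the tail
 * pointer, and clear IFF_DRV_OACTIVE once the ring has drained to half
 * of its capacity so the interface can transmit again.
 */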
static void ipoib_ib_handle_tx_wc(struct ipoib_dev_priv *priv, struct ib_wc *wc)
{
	struct ifnet *dev = priv->dev;
	unsigned int wr_id = wc->wr_id;
	struct ipoib_tx_buf *tx_req;

	ipoib_dbg_data(priv, "send completion: id %d, status: %d\n",
		       wr_id, wc->status);

	if (unlikely(wr_id >= ipoib_sendq_size)) {
		ipoib_warn(priv, "send completion event with wrid %d (> %d)\n",
			   wr_id, ipoib_sendq_size);
		return;
	}

	tx_req = &priv->tx_ring[wr_id];

	ipoib_dma_unmap_tx(priv->ca, tx_req);

	if_inc_counter(dev, IFCOUNTER_OPACKETS, 1);

	m_freem(tx_req->mb);

	++priv->tx_tail;
	if (unlikely(--priv->tx_outstanding == ipoib_sendq_size >> 1) &&
	    (dev->if_drv_flags & IFF_DRV_OACTIVE) &&
	    test_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags))
		dev->if_drv_flags &= ~IFF_DRV_OACTIVE;

	if (wc->status != IB_WC_SUCCESS &&
	    wc->status != IB_WC_WR_FLUSH_ERR)
		ipoib_warn(priv, "failed send event "
			   "(status=%d, wrid=%d vend_err %x)\n",
			   wc->status, wr_id, wc->vendor_err);
}

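/*
 * Poll up to MAX_SEND_CQE completions from the send CQ and dispatch them;
 * returns nonzero when the CQ may still hold more entries.
 */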
int
ipoib_poll_tx(struct ipoib_dev_priv *priv)
{
	int n, i;

	n = ib_poll_cq(priv->send_cq, MAX_SEND_CQE, priv->send_wc);
	for (i = 0; i < n; ++i) {
		struct ib_wc *wc = priv->send_wc + i;
		if (wc->wr_id & IPOIB_OP_CM)
			ipoib_cm_handle_tx_wc(priv, wc);
		else
			ipoib_ib_handle_tx_wc(priv, wc);
	}

	return n == MAX_SEND_CQE;
}

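/*
 * Drain the receive CQ, dispatching connected-mode and datagram
 * completions, then re-arm it; loop again if missed events are reported.
 */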
static void
ipoib_poll(struct ipoib_dev_priv *priv)
{
	int n, i;

poll_more:
	spin_lock(&priv->drain_lock);
	for (;;) {
		n = ib_poll_cq(priv->recv_cq, IPOIB_NUM_WC, priv->ibwc);
		for (i = 0; i < n; i++) {
			struct ib_wc *wc = priv->ibwc + i;

			if ((wc->wr_id & IPOIB_OP_RECV) == 0)
				panic("ipoib_poll: Bad wr_id 0x%jX\n",
				    (intmax_t)wc->wr_id);
			if (wc->wr_id & IPOIB_OP_CM)
				ipoib_cm_handle_rx_wc(priv, wc);
			else
				ipoib_ib_handle_rx_wc(priv, wc);
		}

		if (n != IPOIB_NUM_WC)
			break;
	}
	spin_unlock(&priv->drain_lock);

	if (ib_req_notify_cq(priv->recv_cq,
	    IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS))
		goto poll_more;
}

void ipoib_ib_completion(struct ib_cq *cq, void *dev_ptr)
{
	struct ipoib_dev_priv *priv = dev_ptr;

	ipoib_poll(priv);
}

static void drain_tx_cq(struct ipoib_dev_priv *priv)
{
	struct ifnet *dev = priv->dev;

	spin_lock(&priv->lock);
	while (ipoib_poll_tx(priv))
		; /* nothing */

	if (dev->if_drv_flags & IFF_DRV_OACTIVE)
		mod_timer(&priv->poll_timer, jiffies + 1);

	spin_unlock(&priv->lock);
}

void ipoib_send_comp_handler(struct ib_cq *cq, void *dev_ptr)
{
	struct ipoib_dev_priv *priv = dev_ptr;

	mod_timer(&priv->poll_timer, jiffies);
}

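/*
 * Build the gather list for the mapped mbuf chain and post a single UD
 * send work request (as LSO when a header is supplied).
 */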
static inline int
post_send(struct ipoib_dev_priv *priv, unsigned int wr_id,
    struct ib_ah *address, u32 qpn, struct ipoib_tx_buf *tx_req, void *head,
    int hlen)
{
	struct ib_send_wr *bad_wr;
	struct mbuf *mb = tx_req->mb;
	u64 *mapping = tx_req->mapping;
	struct mbuf *m;
	int i;

	for (m = mb, i = 0; m != NULL; m = m->m_next, i++) {
		priv->tx_sge[i].addr         = mapping[i];
		priv->tx_sge[i].length       = m->m_len;
	}
	priv->tx_wr.wr.num_sge	= i;
	priv->tx_wr.wr.wr_id	= wr_id;
	priv->tx_wr.remote_qpn	= qpn;
	priv->tx_wr.ah		= address;

	if (head) {
		priv->tx_wr.mss		= 0; /* XXX mb_shinfo(mb)->gso_size; */
		priv->tx_wr.header	= head;
		priv->tx_wr.hlen	= hlen;
		priv->tx_wr.wr.opcode	= IB_WR_LSO;
	} else
		priv->tx_wr.wr.opcode	= IB_WR_SEND;

	return ib_post_send(priv->qp, &priv->tx_wr.wr, &bad_wr);
}

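/*
 * Transmit one mbuf to the given address handle and destination QPN.
 * The TX ring entry is filled in before post_send() so a racing
 * completion always sees consistent state; the interface queue is
 * stopped when the send ring becomes full.
 */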
void
ipoib_send(struct ipoib_dev_priv *priv, struct mbuf *mb,
    struct ipoib_ah *address, u32 qpn)
{
	struct ifnet *dev = priv->dev;
	struct ipoib_tx_buf *tx_req;
	int hlen;
	void *phead;

	if (unlikely(priv->tx_outstanding > MAX_SEND_CQE))
		while (ipoib_poll_tx(priv))
			; /* nothing */

	m_adj(mb, sizeof (struct ipoib_pseudoheader));
	if (0 /* XXX segment offload mb_is_gso(mb) */) {
		/* XXX hlen = mb_transport_offset(mb) + tcp_hdrlen(mb); */
		phead = mtod(mb, void *);
		if (mb->m_len < hlen) {
			ipoib_warn(priv, "linear data too small\n");
			if_inc_counter(dev, IFCOUNTER_OERRORS, 1);
			m_freem(mb);
			return;
		}
		m_adj(mb, hlen);
	} else {
		if (unlikely(mb->m_pkthdr.len - IPOIB_ENCAP_LEN > priv->mcast_mtu)) {
			ipoib_warn(priv, "packet len %d (> %d) too long to send, dropping\n",
				   mb->m_pkthdr.len, priv->mcast_mtu);
			if_inc_counter(dev, IFCOUNTER_OERRORS, 1);
			ipoib_cm_mb_too_long(priv, mb, priv->mcast_mtu);
			return;
		}
		phead = NULL;
		hlen  = 0;
	}

	ipoib_dbg_data(priv, "sending packet, length=%d address=%p qpn=0x%06x\n",
		       mb->m_pkthdr.len, address, qpn);

	/*
	 * We put the mb into the tx_ring _before_ we call post_send()
	 * because it's entirely possible that the completion handler will
	 * run before we execute anything after the post_send().  That
	 * means we have to make sure everything is properly recorded and
	 * our state is consistent before we call post_send().
	 */
	tx_req = &priv->tx_ring[priv->tx_head & (ipoib_sendq_size - 1)];
	tx_req->mb = mb;
	if (unlikely(ipoib_dma_map_tx(priv->ca, tx_req, IPOIB_UD_TX_SG))) {
		if_inc_counter(dev, IFCOUNTER_OERRORS, 1);
		if (tx_req->mb)
			m_freem(tx_req->mb);
		return;
	}

	if (mb->m_pkthdr.csum_flags & (CSUM_IP|CSUM_TCP|CSUM_UDP))
		priv->tx_wr.wr.send_flags |= IB_SEND_IP_CSUM;
	else
		priv->tx_wr.wr.send_flags &= ~IB_SEND_IP_CSUM;

	if (++priv->tx_outstanding == ipoib_sendq_size) {
		ipoib_dbg(priv, "TX ring full, stopping kernel net queue\n");
		if (ib_req_notify_cq(priv->send_cq, IB_CQ_NEXT_COMP))
			ipoib_warn(priv, "request notify on send CQ failed\n");
		dev->if_drv_flags |= IFF_DRV_OACTIVE;
	}

	if (unlikely(post_send(priv,
	    priv->tx_head & (ipoib_sendq_size - 1), address->ah, qpn,
	    tx_req, phead, hlen))) {
		ipoib_warn(priv, "post_send failed\n");
		if_inc_counter(dev, IFCOUNTER_OERRORS, 1);
		--priv->tx_outstanding;
		ipoib_dma_unmap_tx(priv->ca, tx_req);
		m_freem(mb);
		if (dev->if_drv_flags & IFF_DRV_OACTIVE)
			dev->if_drv_flags &= ~IFF_DRV_OACTIVE;
	} else {
		address->last_send = priv->tx_head;
		++priv->tx_head;
	}
}

static void __ipoib_reap_ah(struct ipoib_dev_priv *priv)
{
	struct ipoib_ah *ah, *tah;
	LIST_HEAD(remove_list);
	unsigned long flags;

	spin_lock_irqsave(&priv->lock, flags);

	list_for_each_entry_safe(ah, tah, &priv->dead_ahs, list)
		if ((int) priv->tx_tail - (int) ah->last_send >= 0) {
			list_del(&ah->list);
			ib_destroy_ah(ah->ah);
			kfree(ah);
		}

	spin_unlock_irqrestore(&priv->lock, flags);
}

void ipoib_reap_ah(struct work_struct *work)
{
	struct ipoib_dev_priv *priv =
		container_of(work, struct ipoib_dev_priv, ah_reap_task.work);

	__ipoib_reap_ah(priv);

	if (!test_bit(IPOIB_STOP_REAPER, &priv->flags))
		queue_delayed_work(ipoib_workqueue, &priv->ah_reap_task,
				   HZ);
}

static void ipoib_ah_dev_cleanup(struct ipoib_dev_priv *priv)
{
	unsigned long begin;

	begin = jiffies;

	while (!list_empty(&priv->dead_ahs)) {
		__ipoib_reap_ah(priv);

		if (time_after(jiffies, begin + HZ)) {
			ipoib_warn(priv, "timing out; will leak address handles\n");
			break;
		}

		msleep(1);
	}
}

static void ipoib_ib_tx_timer_func(unsigned long ctx)
{
	drain_tx_cq((struct ipoib_dev_priv *)ctx);
}

int ipoib_ib_dev_open(struct ipoib_dev_priv *priv)
{
	int ret;

	if (ib_find_pkey(priv->ca, priv->port, priv->pkey, &priv->pkey_index)) {
		ipoib_warn(priv, "P_Key 0x%04x not found\n", priv->pkey);
		clear_bit(IPOIB_PKEY_ASSIGNED, &priv->flags);
		return -1;
	}
	set_bit(IPOIB_PKEY_ASSIGNED, &priv->flags);

	ret = ipoib_init_qp(priv);
	if (ret) {
		ipoib_warn(priv, "ipoib_init_qp returned %d\n", ret);
		return -1;
	}

	ret = ipoib_ib_post_receives(priv);
	if (ret) {
		ipoib_warn(priv, "ipoib_ib_post_receives returned %d\n", ret);
		ipoib_ib_dev_stop(priv, 1);
		return -1;
	}

	ret = ipoib_cm_dev_open(priv);
	if (ret) {
		ipoib_warn(priv, "ipoib_cm_dev_open returned %d\n", ret);
		ipoib_ib_dev_stop(priv, 1);
		return -1;
	}

	clear_bit(IPOIB_STOP_REAPER, &priv->flags);
	queue_delayed_work(ipoib_workqueue, &priv->ah_reap_task, HZ);

	set_bit(IPOIB_FLAG_INITIALIZED, &priv->flags);

	return 0;
}

static void ipoib_pkey_dev_check_presence(struct ipoib_dev_priv *priv)
{
	u16 pkey_index = 0;

	if (ib_find_pkey(priv->ca, priv->port, priv->pkey, &pkey_index))
		clear_bit(IPOIB_PKEY_ASSIGNED, &priv->flags);
	else
		set_bit(IPOIB_PKEY_ASSIGNED, &priv->flags);
}

int ipoib_ib_dev_up(struct ipoib_dev_priv *priv)
{

	ipoib_pkey_dev_check_presence(priv);

	if (!test_bit(IPOIB_PKEY_ASSIGNED, &priv->flags)) {
		ipoib_dbg(priv, "PKEY is not assigned.\n");
		return 0;
	}

	set_bit(IPOIB_FLAG_OPER_UP, &priv->flags);

	return ipoib_mcast_start_thread(priv);
}

int ipoib_ib_dev_down(struct ipoib_dev_priv *priv, int flush)
{

	ipoib_dbg(priv, "downing ib_dev\n");

	clear_bit(IPOIB_FLAG_OPER_UP, &priv->flags);
	if_link_state_change(priv->dev, LINK_STATE_DOWN);

	/* Shut down the P_Key thread if it is still active */
	if (!test_bit(IPOIB_PKEY_ASSIGNED, &priv->flags)) {
		mutex_lock(&pkey_mutex);
		set_bit(IPOIB_PKEY_STOP, &priv->flags);
		cancel_delayed_work(&priv->pkey_poll_task);
		mutex_unlock(&pkey_mutex);
		if (flush)
			flush_workqueue(ipoib_workqueue);
	}

	ipoib_mcast_stop_thread(priv, flush);
	ipoib_mcast_dev_flush(priv);

	ipoib_flush_paths(priv);

	return 0;
}

static int recvs_pending(struct ipoib_dev_priv *priv)
{
	int pending = 0;
	int i;

	for (i = 0; i < ipoib_recvq_size; ++i)
		if (priv->rx_ring[i].mb)
			++pending;

	return pending;
}

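/*
 * Drain all outstanding receive completions while the device is being
 * brought down.  Successful completions are converted to flush errors so
 * no packets are handed to the stack, then any pending TX completions are
 * reaped as well.
 */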
void ipoib_drain_cq(struct ipoib_dev_priv *priv)
{
	int i, n;

	spin_lock(&priv->drain_lock);
	do {
		n = ib_poll_cq(priv->recv_cq, IPOIB_NUM_WC, priv->ibwc);
		for (i = 0; i < n; ++i) {
			/*
			 * Convert any successful completions to flush
			 * errors to avoid passing packets up the
			 * stack after bringing the device down.
			 */
			if (priv->ibwc[i].status == IB_WC_SUCCESS)
				priv->ibwc[i].status = IB_WC_WR_FLUSH_ERR;

			if ((priv->ibwc[i].wr_id & IPOIB_OP_RECV) == 0)
				panic("ipoib_drain_cq:  Bad wrid 0x%jX\n",
				    (intmax_t)priv->ibwc[i].wr_id);
			if (priv->ibwc[i].wr_id & IPOIB_OP_CM)
				ipoib_cm_handle_rx_wc(priv, priv->ibwc + i);
			else
				ipoib_ib_handle_rx_wc(priv, priv->ibwc + i);
		}
	} while (n == IPOIB_NUM_WC);
	spin_unlock(&priv->drain_lock);

	spin_lock(&priv->lock);
	while (ipoib_poll_tx(priv))
		; /* nothing */

	spin_unlock(&priv->lock);
}

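/*
 * Stop the IB side of the interface: move the QP to the error state,
 * wait up to five seconds for outstanding sends and receives to flush
 * (forcibly reclaiming them if they do not), reset the QP, and reap any
 * remaining address handles.
 */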
int ipoib_ib_dev_stop(struct ipoib_dev_priv *priv, int flush)
{
	struct ib_qp_attr qp_attr;
	unsigned long begin;
	struct ipoib_tx_buf *tx_req;
	int i;

	clear_bit(IPOIB_FLAG_INITIALIZED, &priv->flags);

	ipoib_cm_dev_stop(priv);

	/*
	 * Move our QP to the error state and then reinitialize it
	 * once all work requests have completed or have been flushed.
	 */
	qp_attr.qp_state = IB_QPS_ERR;
	if (ib_modify_qp(priv->qp, &qp_attr, IB_QP_STATE))
		ipoib_warn(priv, "Failed to modify QP to ERROR state\n");

	/* Wait for all sends and receives to complete */
	begin = jiffies;

	while (priv->tx_head != priv->tx_tail || recvs_pending(priv)) {
		if (time_after(jiffies, begin + 5 * HZ)) {
			ipoib_warn(priv, "timing out; %d sends %d receives not completed\n",
				   priv->tx_head - priv->tx_tail, recvs_pending(priv));

			/*
			 * Assume the HW is wedged and just free up
			 * all our pending work requests.
			 */
			while ((int) priv->tx_tail - (int) priv->tx_head < 0) {
				tx_req = &priv->tx_ring[priv->tx_tail &
							(ipoib_sendq_size - 1)];
				ipoib_dma_unmap_tx(priv->ca, tx_req);
				m_freem(tx_req->mb);
				++priv->tx_tail;
				--priv->tx_outstanding;
			}

			for (i = 0; i < ipoib_recvq_size; ++i) {
				struct ipoib_rx_buf *rx_req;

				rx_req = &priv->rx_ring[i];
				if (!rx_req->mb)
					continue;
				ipoib_dma_unmap_rx(priv, &priv->rx_ring[i]);
				m_freem(rx_req->mb);
				rx_req->mb = NULL;
			}

			goto timeout;
		}

		ipoib_drain_cq(priv);

		msleep(1);
	}

	ipoib_dbg(priv, "All sends and receives done.\n");

timeout:
	del_timer_sync(&priv->poll_timer);
	qp_attr.qp_state = IB_QPS_RESET;
	if (ib_modify_qp(priv->qp, &qp_attr, IB_QP_STATE))
		ipoib_warn(priv, "Failed to modify QP to RESET state\n");

	/* Wait for all AHs to be reaped */
	set_bit(IPOIB_STOP_REAPER, &priv->flags);
	cancel_delayed_work(&priv->ah_reap_task);
	if (flush)
		flush_workqueue(ipoib_workqueue);

	ipoib_ah_dev_cleanup(priv);

	ib_req_notify_cq(priv->recv_cq, IB_CQ_NEXT_COMP);

	return 0;
}

int ipoib_ib_dev_init(struct ipoib_dev_priv *priv, struct ib_device *ca, int port)
{
	struct ifnet *dev = priv->dev;

	priv->ca = ca;
	priv->port = port;
	priv->qp = NULL;

	if (ipoib_transport_dev_init(priv, ca)) {
		printk(KERN_WARNING "%s: ipoib_transport_dev_init failed\n", ca->name);
		return -ENODEV;
	}

	setup_timer(&priv->poll_timer, ipoib_ib_tx_timer_func,
		    (unsigned long) priv);

	if (dev->if_flags & IFF_UP) {
		if (ipoib_ib_dev_open(priv)) {
			ipoib_transport_dev_cleanup(priv);
			return -ENODEV;
		}
	}

	return 0;
}

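/*
 * Flush the device (and recursively any child interfaces) at the given
 * level: LIGHT invalidates paths and multicast state, NORMAL also brings
 * the IB device down, and HEAVY additionally restarts the QP, typically
 * after a P_Key index change.
 */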
static void __ipoib_ib_dev_flush(struct ipoib_dev_priv *priv,
				enum ipoib_flush_level level)
{
	struct ipoib_dev_priv *cpriv;
	u16 new_index;

	mutex_lock(&priv->vlan_mutex);

	/*
	 * Flush any child interfaces too -- they might be up even if
	 * the parent is down.
	 */
	list_for_each_entry(cpriv, &priv->child_intfs, list)
		__ipoib_ib_dev_flush(cpriv, level);

	mutex_unlock(&priv->vlan_mutex);

	if (!test_bit(IPOIB_FLAG_INITIALIZED, &priv->flags)) {
		ipoib_dbg(priv, "Not flushing - IPOIB_FLAG_INITIALIZED not set.\n");
		return;
	}

	if (!test_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags)) {
		ipoib_dbg(priv, "Not flushing - IPOIB_FLAG_ADMIN_UP not set.\n");
		return;
	}

	if (level == IPOIB_FLUSH_HEAVY) {
		if (ib_find_pkey(priv->ca, priv->port, priv->pkey, &new_index)) {
			clear_bit(IPOIB_PKEY_ASSIGNED, &priv->flags);
			ipoib_ib_dev_down(priv, 0);
			ipoib_ib_dev_stop(priv, 0);
			if (ipoib_pkey_dev_delay_open(priv))
				return;
		}

		/* restart QP only if P_Key index is changed */
		if (test_and_set_bit(IPOIB_PKEY_ASSIGNED, &priv->flags) &&
		    new_index == priv->pkey_index) {
			ipoib_dbg(priv, "Not flushing - P_Key index not changed.\n");
			return;
		}
		priv->pkey_index = new_index;
	}

	if (level == IPOIB_FLUSH_LIGHT) {
		ipoib_mark_paths_invalid(priv);
		ipoib_mcast_dev_flush(priv);
	}

	if (level >= IPOIB_FLUSH_NORMAL)
		ipoib_ib_dev_down(priv, 0);

	if (level == IPOIB_FLUSH_HEAVY) {
		ipoib_ib_dev_stop(priv, 0);
		ipoib_ib_dev_open(priv);
	}

	/*
	 * The device could have been brought down between the start and when
	 * we get here; don't bring it back up if it's not configured up.
	 */
	if (test_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags)) {
		if (level >= IPOIB_FLUSH_NORMAL)
			ipoib_ib_dev_up(priv);
		ipoib_mcast_restart_task(&priv->restart_task);
	}
}

void ipoib_ib_dev_flush_light(struct work_struct *work)
{
	struct ipoib_dev_priv *priv =
		container_of(work, struct ipoib_dev_priv, flush_light);

	__ipoib_ib_dev_flush(priv, IPOIB_FLUSH_LIGHT);
}

void ipoib_ib_dev_flush_normal(struct work_struct *work)
{
	struct ipoib_dev_priv *priv =
		container_of(work, struct ipoib_dev_priv, flush_normal);

	__ipoib_ib_dev_flush(priv, IPOIB_FLUSH_NORMAL);
}

void ipoib_ib_dev_flush_heavy(struct work_struct *work)
{
	struct ipoib_dev_priv *priv =
		container_of(work, struct ipoib_dev_priv, flush_heavy);

	__ipoib_ib_dev_flush(priv, IPOIB_FLUSH_HEAVY);
}

void ipoib_ib_dev_cleanup(struct ipoib_dev_priv *priv)
{

	ipoib_dbg(priv, "cleaning up ib_dev\n");

	ipoib_mcast_stop_thread(priv, 1);
	ipoib_mcast_dev_flush(priv);

	ipoib_ah_dev_cleanup(priv);
	ipoib_transport_dev_cleanup(priv);
}

/*
 * Delayed P_Key Assignment Interim Support
 *
 * The following is an initial implementation of the delayed P_Key
 * assignment mechanism. It uses the same approach implemented for the
 * multicast group join. The single goal of this implementation is to
 * quickly address Bug #2507. This implementation will probably be removed
 * when the P_Key change async notification is available.
 */

void ipoib_pkey_poll(struct work_struct *work)
{
	struct ipoib_dev_priv *priv =
		container_of(work, struct ipoib_dev_priv, pkey_poll_task.work);

	ipoib_pkey_dev_check_presence(priv);

	if (test_bit(IPOIB_PKEY_ASSIGNED, &priv->flags))
		ipoib_open(priv);
	else {
		mutex_lock(&pkey_mutex);
		if (!test_bit(IPOIB_PKEY_STOP, &priv->flags))
			queue_delayed_work(ipoib_workqueue,
					   &priv->pkey_poll_task,
					   HZ);
		mutex_unlock(&pkey_mutex);
	}
}

int ipoib_pkey_dev_delay_open(struct ipoib_dev_priv *priv)
{

	/*
	 * Look for the interface pkey value in the IB Port P_Key table and
	 * set the interface pkey assignment flag.
	 */
	ipoib_pkey_dev_check_presence(priv);

	/* P_Key value not assigned yet - start polling */
	if (!test_bit(IPOIB_PKEY_ASSIGNED, &priv->flags)) {
		mutex_lock(&pkey_mutex);
		clear_bit(IPOIB_PKEY_STOP, &priv->flags);
		queue_delayed_work(ipoib_workqueue,
				   &priv->pkey_poll_task,
				   HZ);
		mutex_unlock(&pkey_mutex);
		return 1;
	}

	return 0;
}
1004