/*-
 * SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0
 *
 * Copyright (c) 2006 Mellanox Technologies. All rights reserved
 *
 * This software is available to you under a choice of one of two
 * licenses.  You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the
 * OpenIB.org BSD license below:
 *
 *     Redistribution and use in source and binary forms, with or
 *     without modification, are permitted provided that the following
 *     conditions are met:
 *
 *      - Redistributions of source code must retain the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer.
 *
 *      - Redistributions in binary form must reproduce the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer in the documentation and/or other materials
 *        provided with the distribution.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include <sys/cdefs.h>
#include "ipoib.h"

#ifdef CONFIG_INFINIBAND_IPOIB_CM

#include <netinet/ip.h>
#include <netinet/ip_icmp.h>
#include <netinet/icmp6.h>

#include <rdma/ib_cm.h>
#include <rdma/ib_cache.h>
#include <linux/delay.h>

int ipoib_max_conn_qp = 128;

module_param_named(max_nonsrq_conn_qp, ipoib_max_conn_qp, int, 0444);
MODULE_PARM_DESC(max_nonsrq_conn_qp,
		 "Max number of connected-mode QPs per interface "
		 "(applied only if shared receive queue is not available)");

#ifdef CONFIG_INFINIBAND_IPOIB_DEBUG_DATA
static int data_debug_level;

module_param_named(cm_data_debug_level, data_debug_level, int, 0644);
MODULE_PARM_DESC(cm_data_debug_level,
		 "Enable data path debug tracing for connected mode if > 0");
#endif

#define IPOIB_CM_IETF_ID 0x1000000000000000ULL

#define IPOIB_CM_RX_UPDATE_TIME (256 * HZ)
#define IPOIB_CM_RX_TIMEOUT     (2 * 256 * HZ)
#define IPOIB_CM_RX_DELAY       (3 * 256 * HZ)
#define IPOIB_CM_RX_UPDATE_MASK (0x3)

static struct ib_qp_attr ipoib_cm_err_attr = {
	.qp_state = IB_QPS_ERR
};

#define IPOIB_CM_RX_DRAIN_WRID 0xffffffff

static struct ib_send_wr ipoib_cm_rx_drain_wr = {
	.wr_id = IPOIB_CM_RX_DRAIN_WRID,
	.opcode = IB_WR_SEND,
};

static int ipoib_cm_tx_handler(struct ib_cm_id *cm_id,
			       struct ib_cm_event *event);

static void ipoib_cm_dma_unmap_rx(struct ipoib_dev_priv *priv, struct ipoib_cm_rx_buf *rx_req)
{

	ipoib_dma_unmap_rx(priv, (struct ipoib_rx_buf *)rx_req);

}

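/*
 * Repost the receive buffer at index 'id' to the shared receive queue.
 * The scatter/gather list is rebuilt from the mbuf chain's DMA mappings;
 * on a post failure the buffer is unmapped and freed.
 */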
static int ipoib_cm_post_receive_srq(struct ipoib_dev_priv *priv, int id)
{
	const struct ib_recv_wr *bad_wr;
	struct ipoib_rx_buf *rx_req;
	struct mbuf *m;
	int ret;
	int i;

	rx_req = (struct ipoib_rx_buf *)&priv->cm.srq_ring[id];
	for (m = rx_req->mb, i = 0; m != NULL; m = m->m_next, i++) {
		priv->cm.rx_sge[i].addr = rx_req->mapping[i];
		priv->cm.rx_sge[i].length = m->m_len;
	}

	priv->cm.rx_wr.num_sge = i;
	priv->cm.rx_wr.wr_id = id | IPOIB_OP_CM | IPOIB_OP_RECV;

	ret = ib_post_srq_recv(priv->cm.srq, &priv->cm.rx_wr, &bad_wr);
	if (unlikely(ret)) {
		ipoib_warn(priv, "post srq failed for buf %d (%d)\n", id, ret);
		ipoib_dma_unmap_rx(priv, rx_req);
		m_freem(priv->cm.srq_ring[id].mb);
		priv->cm.srq_ring[id].mb = NULL;
	}

	return ret;
}

static int ipoib_cm_post_receive_nonsrq(struct ipoib_dev_priv *priv,
					struct ipoib_cm_rx *rx,
					struct ib_recv_wr *wr,
					struct ib_sge *sge, int id)
{
	struct ipoib_rx_buf *rx_req;
	const struct ib_recv_wr *bad_wr;
	struct mbuf *m;
	int ret;
	int i;

	rx_req = (struct ipoib_rx_buf *)&rx->rx_ring[id];
	for (m = rx_req->mb, i = 0; m != NULL; m = m->m_next, i++) {
		sge[i].addr = rx_req->mapping[i];
		sge[i].length = m->m_len;
	}

	wr->num_sge = i;
	wr->wr_id = id | IPOIB_OP_CM | IPOIB_OP_RECV;

	ret = ib_post_recv(rx->qp, wr, &bad_wr);
	if (unlikely(ret)) {
		ipoib_warn(priv, "post recv failed for buf %d (%d)\n", id, ret);
		ipoib_dma_unmap_rx(priv, rx_req);
		m_freem(rx->rx_ring[id].mb);
		rx->rx_ring[id].mb = NULL;
	}

	return ret;
}

static struct mbuf *
ipoib_cm_alloc_rx_mb(struct ipoib_dev_priv *priv, struct ipoib_cm_rx_buf *rx_req)
{
	return ipoib_alloc_map_mb(priv, (struct ipoib_rx_buf *)rx_req,
	    sizeof(struct ipoib_pseudoheader), priv->cm.max_cm_mtu, IPOIB_CM_RX_SG);
}

static void ipoib_cm_free_rx_ring(struct ipoib_dev_priv *priv,
				  struct ipoib_cm_rx_buf *rx_ring)
{
	int i;

	for (i = 0; i < ipoib_recvq_size; ++i)
		if (rx_ring[i].mb) {
			ipoib_cm_dma_unmap_rx(priv, &rx_ring[i]);
			m_freem(rx_ring[i].mb);
		}

	kfree(rx_ring);
}

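/*
 * Post a single drain WR on the first QP from the flush list.  Because the
 * QP is already in the error state, the WR completes with a flush error on
 * the shared receive CQ once all previously posted receives have been
 * flushed, which tells us the QPs on the drain list are safe to reap.
 */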
static void ipoib_cm_start_rx_drain(struct ipoib_dev_priv *priv)
{
	const struct ib_send_wr *bad_wr;
	struct ipoib_cm_rx *p;

	/* We only reserved 1 extra slot in CQ for drain WRs, so
	 * make sure we have at most 1 outstanding WR. */
	if (list_empty(&priv->cm.rx_flush_list) ||
	    !list_empty(&priv->cm.rx_drain_list))
		return;

	/*
	 * QPs on the flush list are in the error state.  This way, a "flush
	 * error" WC will be generated immediately for each WR we post.
	 */
	p = list_entry(priv->cm.rx_flush_list.next, typeof(*p), list);
	if (ib_post_send(p->qp, &ipoib_cm_rx_drain_wr, &bad_wr))
		ipoib_warn(priv, "failed to post drain wr\n");

	list_splice_init(&priv->cm.rx_flush_list, &priv->cm.rx_drain_list);
}

static void ipoib_cm_rx_event_handler(struct ib_event *event, void *ctx)
{
	struct ipoib_cm_rx *p = ctx;
	struct ipoib_dev_priv *priv = p->priv;
	unsigned long flags;

	if (event->event != IB_EVENT_QP_LAST_WQE_REACHED)
		return;

	spin_lock_irqsave(&priv->lock, flags);
	list_move(&p->list, &priv->cm.rx_flush_list);
	p->state = IPOIB_CM_RX_FLUSH;
	ipoib_cm_start_rx_drain(priv);
	spin_unlock_irqrestore(&priv->lock, flags);
}

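/*
 * Create the RC QP used for a passive (receive-side) connection.  The send
 * queue is sized for a single drain WR; private receive resources are only
 * requested when no SRQ is available.
 */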
static struct ib_qp *ipoib_cm_create_rx_qp(struct ipoib_dev_priv *priv,
					   struct ipoib_cm_rx *p)
{
	struct ib_qp_init_attr attr = {
		.event_handler = ipoib_cm_rx_event_handler,
		.send_cq = priv->recv_cq, /* For drain WR */
		.recv_cq = priv->recv_cq,
		.srq = priv->cm.srq,
		.cap.max_send_wr = 1,	/* For drain WR */
		.cap.max_send_sge = 1,
		.sq_sig_type = IB_SIGNAL_ALL_WR,
		.qp_type = IB_QPT_RC,
		.qp_context = p,
	};

	if (!ipoib_cm_has_srq(priv)) {
		attr.cap.max_recv_wr  = ipoib_recvq_size;
		attr.cap.max_recv_sge = priv->cm.num_frags;
	}

	return ib_create_qp(priv->pd, &attr);
}

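/*
 * Walk the passive-side QP through INIT and RTR (and, as a firmware
 * workaround, RTS) using the attributes supplied by the IB CM for this
 * connection.
 */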
static int ipoib_cm_modify_rx_qp(struct ipoib_dev_priv *priv,
				 struct ib_cm_id *cm_id, struct ib_qp *qp,
				 unsigned psn)
{
	struct ib_qp_attr qp_attr;
	int qp_attr_mask, ret;

	qp_attr.qp_state = IB_QPS_INIT;
	ret = ib_cm_init_qp_attr(cm_id, &qp_attr, &qp_attr_mask);
	if (ret) {
		ipoib_warn(priv, "failed to init QP attr for INIT: %d\n", ret);
		return ret;
	}
	ret = ib_modify_qp(qp, &qp_attr, qp_attr_mask);
	if (ret) {
		ipoib_warn(priv, "failed to modify QP to INIT: %d\n", ret);
		return ret;
	}
	qp_attr.qp_state = IB_QPS_RTR;
	ret = ib_cm_init_qp_attr(cm_id, &qp_attr, &qp_attr_mask);
	if (ret) {
		ipoib_warn(priv, "failed to init QP attr for RTR: %d\n", ret);
		return ret;
	}
	qp_attr.rq_psn = psn;
	ret = ib_modify_qp(qp, &qp_attr, qp_attr_mask);
	if (ret) {
		ipoib_warn(priv, "failed to modify QP to RTR: %d\n", ret);
		return ret;
	}

	/*
	 * Current Mellanox HCA firmware won't generate completions
	 * with error for drain WRs unless the QP has been moved to
	 * RTS first. This work-around leaves a window where a QP has
	 * moved to error asynchronously, but this will eventually get
	 * fixed in firmware, so let's not error out if modify QP
	 * fails.
	 */
	qp_attr.qp_state = IB_QPS_RTS;
	ret = ib_cm_init_qp_attr(cm_id, &qp_attr, &qp_attr_mask);
	if (ret) {
		ipoib_warn(priv, "failed to init QP attr for RTS: %d\n", ret);
		return 0;
	}
	ret = ib_modify_qp(qp, &qp_attr, qp_attr_mask);
	if (ret) {
		ipoib_warn(priv, "failed to modify QP to RTS: %d\n", ret);
		return 0;
	}

	return 0;
}

static void ipoib_cm_init_rx_wr(struct ipoib_dev_priv *priv,
				struct ib_recv_wr *wr,
				struct ib_sge *sge)
{
	int i;

	for (i = 0; i < IPOIB_CM_RX_SG; i++)
		sge[i].lkey = priv->pd->local_dma_lkey;

	wr->next = NULL;
	wr->sg_list = sge;
	wr->num_sge = 1;
}

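/*
 * Allocate and post a private receive ring for one connection when no SRQ
 * is available.  The request is rejected with IB_CM_REJ_NO_QP once
 * ipoib_max_conn_qp non-SRQ connections already exist on the interface.
 */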
static int ipoib_cm_nonsrq_init_rx(struct ipoib_dev_priv *priv,
	struct ib_cm_id *cm_id, struct ipoib_cm_rx *rx)
{
	struct {
		struct ib_recv_wr wr;
		struct ib_sge sge[IPOIB_CM_RX_SG];
	} *t;
	int ret;
	int i;

	rx->rx_ring = kzalloc(ipoib_recvq_size * sizeof *rx->rx_ring, GFP_KERNEL);
	if (!rx->rx_ring) {
		printk(KERN_WARNING "%s: failed to allocate CM non-SRQ ring (%d entries)\n",
		       priv->ca->name, ipoib_recvq_size);
		return -ENOMEM;
	}

	memset(rx->rx_ring, 0, ipoib_recvq_size * sizeof *rx->rx_ring);

	t = kmalloc(sizeof *t, GFP_KERNEL);
	if (!t) {
		ret = -ENOMEM;
		goto err_free;
	}

	ipoib_cm_init_rx_wr(priv, &t->wr, t->sge);

	spin_lock_irq(&priv->lock);

	if (priv->cm.nonsrq_conn_qp >= ipoib_max_conn_qp) {
		spin_unlock_irq(&priv->lock);
		ib_send_cm_rej(cm_id, IB_CM_REJ_NO_QP, NULL, 0, NULL, 0);
		ret = -EINVAL;
		goto err_free;
	} else
		++priv->cm.nonsrq_conn_qp;

	spin_unlock_irq(&priv->lock);

	for (i = 0; i < ipoib_recvq_size; ++i) {
		if (!ipoib_cm_alloc_rx_mb(priv, &rx->rx_ring[i])) {
			ipoib_warn(priv, "failed to allocate receive buffer %d\n", i);
			ret = -ENOMEM;
			goto err_count;
		}
		ret = ipoib_cm_post_receive_nonsrq(priv, rx, &t->wr, t->sge, i);
		if (ret) {
			ipoib_warn(priv, "ipoib_cm_post_receive_nonsrq "
				   "failed for buf %d\n", i);
			ret = -EIO;
			goto err_count;
		}
	}

	rx->recv_count = ipoib_recvq_size;

	kfree(t);

	return 0;

err_count:
	spin_lock_irq(&priv->lock);
	--priv->cm.nonsrq_conn_qp;
	spin_unlock_irq(&priv->lock);

err_free:
	kfree(t);
	ipoib_cm_free_rx_ring(priv, rx->rx_ring);

	return ret;
}

static int ipoib_cm_send_rep(struct ipoib_dev_priv *priv, struct ib_cm_id *cm_id,
			     struct ib_qp *qp, struct ib_cm_req_event_param *req,
			     unsigned psn)
{
	struct ipoib_cm_data data = {};
	struct ib_cm_rep_param rep = {};

	data.qpn = cpu_to_be32(priv->qp->qp_num);
	data.mtu = cpu_to_be32(priv->cm.max_cm_mtu);

	rep.private_data = &data;
	rep.private_data_len = sizeof data;
	rep.flow_control = 0;
	rep.rnr_retry_count = req->rnr_retry_count;
	rep.srq = ipoib_cm_has_srq(priv);
	rep.qp_num = qp->qp_num;
	rep.starting_psn = psn;
	return ib_send_cm_rep(cm_id, &rep);
}

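/*
 * Handle an incoming connection request (REQ): create and bring up the
 * receive QP, set up per-connection receive buffers when there is no SRQ,
 * add the connection to the passive list and answer with a REP.
 */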
static int ipoib_cm_req_handler(struct ib_cm_id *cm_id, struct ib_cm_event *event)
{
	struct ipoib_dev_priv *priv = cm_id->context;
	struct ipoib_cm_rx *p;
	unsigned psn;
	int ret;

	ipoib_dbg(priv, "REQ arrived\n");
	p = kzalloc(sizeof *p, GFP_KERNEL);
	if (!p)
		return -ENOMEM;
	p->priv = priv;
	p->id = cm_id;
	cm_id->context = p;
	p->state = IPOIB_CM_RX_LIVE;
	p->jiffies = jiffies;
	INIT_LIST_HEAD(&p->list);

	p->qp = ipoib_cm_create_rx_qp(priv, p);
	if (IS_ERR(p->qp)) {
		ret = PTR_ERR(p->qp);
		goto err_qp;
	}

	psn = random() & 0xffffff;
	ret = ipoib_cm_modify_rx_qp(priv, cm_id, p->qp, psn);
	if (ret)
		goto err_modify;

	if (!ipoib_cm_has_srq(priv)) {
		ret = ipoib_cm_nonsrq_init_rx(priv, cm_id, p);
		if (ret)
			goto err_modify;
	}

	spin_lock_irq(&priv->lock);
	queue_delayed_work(ipoib_workqueue,
			   &priv->cm.stale_task, IPOIB_CM_RX_DELAY);
	/* Add this entry to passive ids list head, but do not re-add it
	 * if IB_EVENT_QP_LAST_WQE_REACHED has moved it to flush list. */
	p->jiffies = jiffies;
	if (p->state == IPOIB_CM_RX_LIVE)
		list_move(&p->list, &priv->cm.passive_ids);
	spin_unlock_irq(&priv->lock);

	ret = ipoib_cm_send_rep(priv, cm_id, p->qp, &event->param.req_rcvd, psn);
	if (ret) {
		ipoib_warn(priv, "failed to send REP: %d\n", ret);
		if (ib_modify_qp(p->qp, &ipoib_cm_err_attr, IB_QP_STATE))
			ipoib_warn(priv, "unable to move qp to error state\n");
	}
	return 0;

err_modify:
	ib_destroy_qp(p->qp);
err_qp:
	kfree(p);
	return ret;
}

static int ipoib_cm_rx_handler(struct ib_cm_id *cm_id,
			       struct ib_cm_event *event)
{
	struct ipoib_cm_rx *p;
	struct ipoib_dev_priv *priv;

	switch (event->event) {
	case IB_CM_REQ_RECEIVED:
		return ipoib_cm_req_handler(cm_id, event);
	case IB_CM_DREQ_RECEIVED:
		p = cm_id->context;
		ib_send_cm_drep(cm_id, NULL, 0);
		/* Fall through */
	case IB_CM_REJ_RECEIVED:
		p = cm_id->context;
		priv = p->priv;
		if (ib_modify_qp(p->qp, &ipoib_cm_err_attr, IB_QP_STATE))
			ipoib_warn(priv, "unable to move qp to error state\n");
		/* Fall through */
	default:
		return 0;
	}
}

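/*
 * Receive completion handler for connected mode.  Distinguishes drain-WR
 * completions, error completions and normal receives; for a good
 * completion the mbuf is handed to the network stack and a fresh buffer is
 * posted in its place.
 */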
void ipoib_cm_handle_rx_wc(struct ipoib_dev_priv *priv, struct ib_wc *wc)
{
	struct ipoib_cm_rx_buf saverx;
	struct ipoib_cm_rx_buf *rx_ring;
	unsigned int wr_id = wc->wr_id & ~(IPOIB_OP_CM | IPOIB_OP_RECV);
	if_t dev = priv->dev;
	struct mbuf *mb, *newmb;
	struct ipoib_cm_rx *p;
	int has_srq;

	ipoib_dbg_data(priv, "cm recv completion: id %d, status: %d\n",
		       wr_id, wc->status);

	if (unlikely(wr_id >= ipoib_recvq_size)) {
		if (wr_id == (IPOIB_CM_RX_DRAIN_WRID & ~(IPOIB_OP_CM | IPOIB_OP_RECV))) {
			spin_lock(&priv->lock);
			list_splice_init(&priv->cm.rx_drain_list, &priv->cm.rx_reap_list);
			ipoib_cm_start_rx_drain(priv);
			if (priv->cm.id != NULL)
				queue_work(ipoib_workqueue,
					   &priv->cm.rx_reap_task);
			spin_unlock(&priv->lock);
		} else
			ipoib_warn(priv, "cm recv completion event with wrid %d (> %d)\n",
				   wr_id, ipoib_recvq_size);
		goto done;
	}

	p = wc->qp->qp_context;

	has_srq = ipoib_cm_has_srq(priv);
	rx_ring = has_srq ? priv->cm.srq_ring : p->rx_ring;

	mb = rx_ring[wr_id].mb;

	if (unlikely(wc->status != IB_WC_SUCCESS)) {
		ipoib_dbg(priv, "cm recv error "
			  "(status=%d, wrid=%d vend_err %x)\n",
			  wc->status, wr_id, wc->vendor_err);
		if_inc_counter(dev, IFCOUNTER_IERRORS, 1);
		if (has_srq)
			goto repost;
		else {
			if (!--p->recv_count) {
				spin_lock(&priv->lock);
				list_move(&p->list, &priv->cm.rx_reap_list);
				queue_work(ipoib_workqueue, &priv->cm.rx_reap_task);
				spin_unlock(&priv->lock);
			}
			goto done;
		}
	}

	if (unlikely(!(wr_id & IPOIB_CM_RX_UPDATE_MASK))) {
		if (p && time_after_eq(jiffies, p->jiffies + IPOIB_CM_RX_UPDATE_TIME)) {
			p->jiffies = jiffies;
			/* Move this entry to the list head, but do not re-add
			 * it if it has been moved out of the list. */
			if (p->state == IPOIB_CM_RX_LIVE)
				list_move(&p->list, &priv->cm.passive_ids);
		}
	}

	memcpy(&saverx, &rx_ring[wr_id], sizeof(saverx));
	newmb = ipoib_cm_alloc_rx_mb(priv, &rx_ring[wr_id]);
	if (unlikely(!newmb)) {
		/*
		 * If we can't allocate a new RX buffer, dump
		 * this packet and reuse the old buffer.
		 */
		ipoib_dbg(priv, "failed to allocate receive buffer %d\n", wr_id);
		if_inc_counter(dev, IFCOUNTER_IERRORS, 1);
		memcpy(&rx_ring[wr_id], &saverx, sizeof(saverx));
		goto repost;
	}

	ipoib_cm_dma_unmap_rx(priv, &saverx);

	ipoib_dbg_data(priv, "received %d bytes, SLID 0x%04x\n",
		       wc->byte_len, wc->slid);

	ipoib_dma_mb(priv, mb, wc->byte_len);

	mb->m_pkthdr.rcvif = dev;

	M_PREPEND(mb, sizeof(struct ipoib_pseudoheader), M_NOWAIT);
	if (likely(mb != NULL)) {
		struct ipoib_header *ibh;

		if_inc_counter(dev, IFCOUNTER_IPACKETS, 1);
		if_inc_counter(dev, IFCOUNTER_IBYTES, mb->m_pkthdr.len);

		/* fix up the destination InfiniBand address */
		ibh = mtod(mb, struct ipoib_header *);
		memset(ibh->hwaddr, 0, 4);
		memcpy(ibh->hwaddr + 4, priv->local_gid.raw, sizeof(union ib_gid));

		if_input(dev, mb);
	} else {
		if_inc_counter(dev, IFCOUNTER_IERRORS, 1);
	}
repost:
	if (has_srq) {
		if (unlikely(ipoib_cm_post_receive_srq(priv, wr_id)))
			ipoib_warn(priv, "ipoib_cm_post_receive_srq failed "
				   "for buf %d\n", wr_id);
	} else {
		if (unlikely(ipoib_cm_post_receive_nonsrq(priv, p,
							  &priv->cm.rx_wr,
							  priv->cm.rx_sge,
							  wr_id))) {
			--p->recv_count;
			ipoib_warn(priv, "ipoib_cm_post_receive_nonsrq failed "
				   "for buf %d\n", wr_id);
		}
	}
done:
	return;
}

static inline int post_send(struct ipoib_dev_priv *priv,
			    struct ipoib_cm_tx *tx,
			    struct ipoib_cm_tx_buf *tx_req,
			    unsigned int wr_id)
{
	const struct ib_send_wr *bad_wr;
	struct mbuf *mb = tx_req->mb;
	u64 *mapping = tx_req->mapping;
	struct mbuf *m;
	int i;

	for (m = mb, i = 0; m != NULL; m = m->m_next, i++) {
		priv->tx_sge[i].addr = mapping[i];
		priv->tx_sge[i].length = m->m_len;
	}
	priv->tx_wr.wr.num_sge = i;
	priv->tx_wr.wr.wr_id = wr_id | IPOIB_OP_CM;
	priv->tx_wr.wr.opcode = IB_WR_SEND;

	return ib_post_send(tx->qp, &priv->tx_wr.wr, &bad_wr);
}

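/*
 * Transmit one mbuf on a connected-mode QP.  Packets larger than the
 * negotiated MTU are queued so that an ICMP "packet too big" error can be
 * bounced back; the TX ring entry is filled in before post_send() (see the
 * comment below), and the interface is marked OACTIVE when the ring fills.
 */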
void ipoib_cm_send(struct ipoib_dev_priv *priv, struct mbuf *mb, struct ipoib_cm_tx *tx)
{
	struct ipoib_cm_tx_buf *tx_req;
	if_t dev = priv->dev;

	if (unlikely(priv->tx_outstanding > MAX_SEND_CQE)) {
		while (ipoib_poll_tx(priv, false))
			; /* nothing */
	}

	m_adj(mb, sizeof(struct ipoib_pseudoheader));
	if (unlikely(mb->m_pkthdr.len > tx->mtu)) {
		ipoib_warn(priv, "packet len %d (> %d) too long to send, dropping\n",
			   mb->m_pkthdr.len, tx->mtu);
		if_inc_counter(dev, IFCOUNTER_OERRORS, 1);
		ipoib_cm_mb_too_long(priv, mb, IPOIB_CM_MTU(tx->mtu));
		return;
	}

	ipoib_dbg_data(priv, "sending packet: head 0x%x length %d connection 0x%x\n",
		       tx->tx_head, mb->m_pkthdr.len, tx->qp->qp_num);

	/*
	 * We put the mb into the tx_ring _before_ we call post_send()
	 * because it's entirely possible that the completion handler will
	 * run before we execute anything after the post_send().  That
	 * means we have to make sure everything is properly recorded and
	 * our state is consistent before we call post_send().
	 */
	tx_req = &tx->tx_ring[tx->tx_head & (ipoib_sendq_size - 1)];
	tx_req->mb = mb;
	if (unlikely(ipoib_dma_map_tx(priv->ca, (struct ipoib_tx_buf *)tx_req,
	    priv->cm.num_frags))) {
		if_inc_counter(dev, IFCOUNTER_OERRORS, 1);
		if (tx_req->mb)
			m_freem(tx_req->mb);
		return;
	}

	if (unlikely(post_send(priv, tx, tx_req, tx->tx_head & (ipoib_sendq_size - 1)))) {
		ipoib_warn(priv, "post_send failed\n");
		if_inc_counter(dev, IFCOUNTER_OERRORS, 1);
		ipoib_dma_unmap_tx(priv->ca, (struct ipoib_tx_buf *)tx_req);
		m_freem(mb);
	} else {
		++tx->tx_head;

		if (++priv->tx_outstanding == ipoib_sendq_size) {
			ipoib_dbg(priv, "TX ring 0x%x full, stopping kernel net queue\n",
				  tx->qp->qp_num);
			if (ib_req_notify_cq(priv->send_cq, IB_CQ_NEXT_COMP))
				ipoib_warn(priv, "request notify on send CQ failed\n");
			if_setdrvflagbits(dev, IFF_DRV_OACTIVE, 0);
		}
	}
}

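/*
 * Send completion handler for connected mode: unmap and free the completed
 * mbuf, re-enable the send queue if it was stopped, and schedule the
 * connection for teardown on non-flush errors.
 */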
void ipoib_cm_handle_tx_wc(struct ipoib_dev_priv *priv, struct ib_wc *wc)
{
	struct ipoib_cm_tx *tx = wc->qp->qp_context;
	unsigned int wr_id = wc->wr_id & ~IPOIB_OP_CM;
	if_t dev = priv->dev;
	struct ipoib_cm_tx_buf *tx_req;

	ipoib_dbg_data(priv, "cm send completion: id %d, status: %d\n",
		       wr_id, wc->status);

	if (unlikely(wr_id >= ipoib_sendq_size)) {
		ipoib_warn(priv, "cm send completion event with wrid %d (> %d)\n",
			   wr_id, ipoib_sendq_size);
		return;
	}

	tx_req = &tx->tx_ring[wr_id];

	ipoib_dma_unmap_tx(priv->ca, (struct ipoib_tx_buf *)tx_req);

	/* FIXME: is this right? Shouldn't we only increment on success? */
	if_inc_counter(dev, IFCOUNTER_OPACKETS, 1);

	m_freem(tx_req->mb);

	++tx->tx_tail;
	if (unlikely(--priv->tx_outstanding == ipoib_sendq_size >> 1) &&
	    (if_getdrvflags(dev) & IFF_DRV_OACTIVE) != 0 &&
	    test_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags))
		if_setdrvflagbits(dev, 0, IFF_DRV_OACTIVE);

	if (wc->status != IB_WC_SUCCESS &&
	    wc->status != IB_WC_WR_FLUSH_ERR) {
		struct ipoib_path *path;

		ipoib_dbg(priv, "failed cm send event "
			  "(status=%d, wrid=%d vend_err %x)\n",
			  wc->status, wr_id, wc->vendor_err);

		path = tx->path;

		if (path) {
			path->cm = NULL;
			rb_erase(&path->rb_node, &priv->path_tree);
			list_del(&path->list);
		}

		if (test_and_clear_bit(IPOIB_FLAG_INITIALIZED, &tx->flags)) {
			list_move(&tx->list, &priv->cm.reap_list);
			queue_work(ipoib_workqueue, &priv->cm.reap_task);
		}

		clear_bit(IPOIB_FLAG_OPER_UP, &tx->flags);
	}
}

int ipoib_cm_dev_open(struct ipoib_dev_priv *priv)
{
	int ret;

	if (!IPOIB_CM_SUPPORTED(if_getlladdr(priv->dev)))
		return 0;

	priv->cm.id = ib_create_cm_id(priv->ca, ipoib_cm_rx_handler, priv);
	if (IS_ERR(priv->cm.id)) {
		printk(KERN_WARNING "%s: failed to create CM ID\n", priv->ca->name);
		ret = PTR_ERR(priv->cm.id);
		goto err_cm;
	}

	ret = ib_cm_listen(priv->cm.id, cpu_to_be64(IPOIB_CM_IETF_ID | priv->qp->qp_num), 0);
	if (ret) {
		printk(KERN_WARNING "%s: failed to listen on ID 0x%llx\n", priv->ca->name,
		       IPOIB_CM_IETF_ID | priv->qp->qp_num);
		goto err_listen;
	}

	return 0;

err_listen:
	ib_destroy_cm_id(priv->cm.id);
err_cm:
	priv->cm.id = NULL;
	return ret;
}

static void ipoib_cm_free_rx_reap_list(struct ipoib_dev_priv *priv)
{
	struct ipoib_cm_rx *rx, *n;
	LIST_HEAD(list);

	spin_lock_irq(&priv->lock);
	list_splice_init(&priv->cm.rx_reap_list, &list);
	spin_unlock_irq(&priv->lock);

	list_for_each_entry_safe(rx, n, &list, list) {
		ib_destroy_cm_id(rx->id);
		ib_destroy_qp(rx->qp);
		if (!ipoib_cm_has_srq(priv)) {
			ipoib_cm_free_rx_ring(priv, rx->rx_ring);
			spin_lock_irq(&priv->lock);
			--priv->cm.nonsrq_conn_qp;
			spin_unlock_irq(&priv->lock);
		}
		kfree(rx);
	}
}

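/*
 * Stop connected mode on an interface: destroy the listening CM ID, move
 * all passive connections to the error state, wait (up to 5 seconds) for
 * the receive queues to drain, then reap whatever is left.
 */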
void ipoib_cm_dev_stop(struct ipoib_dev_priv *priv)
{
	struct ipoib_cm_rx *p;
	unsigned long begin;
	int ret;

	if (!IPOIB_CM_SUPPORTED(if_getlladdr(priv->dev)) || !priv->cm.id)
		return;

	ib_destroy_cm_id(priv->cm.id);
	priv->cm.id = NULL;

	cancel_work_sync(&priv->cm.rx_reap_task);

	spin_lock_irq(&priv->lock);
	while (!list_empty(&priv->cm.passive_ids)) {
		p = list_entry(priv->cm.passive_ids.next, typeof(*p), list);
		list_move(&p->list, &priv->cm.rx_error_list);
		p->state = IPOIB_CM_RX_ERROR;
		spin_unlock_irq(&priv->lock);
		ret = ib_modify_qp(p->qp, &ipoib_cm_err_attr, IB_QP_STATE);
		if (ret)
			ipoib_warn(priv, "unable to move qp to error state: %d\n", ret);
		spin_lock_irq(&priv->lock);
	}

	/* Wait for all RX to be drained */
	begin = jiffies;

	while (!list_empty(&priv->cm.rx_error_list) ||
	       !list_empty(&priv->cm.rx_flush_list) ||
	       !list_empty(&priv->cm.rx_drain_list)) {
		if (time_after(jiffies, begin + 5 * HZ)) {
			ipoib_warn(priv, "RX drain timing out\n");

			/*
			 * assume the HW is wedged and just free up everything.
			 */
			list_splice_init(&priv->cm.rx_flush_list,
					 &priv->cm.rx_reap_list);
			list_splice_init(&priv->cm.rx_error_list,
					 &priv->cm.rx_reap_list);
			list_splice_init(&priv->cm.rx_drain_list,
					 &priv->cm.rx_reap_list);
			break;
		}
		spin_unlock_irq(&priv->lock);
		msleep(1);
		ipoib_drain_cq(priv);
		spin_lock_irq(&priv->lock);
	}

	spin_unlock_irq(&priv->lock);

	ipoib_cm_free_rx_reap_list(priv);

	cancel_delayed_work_sync(&priv->cm.stale_task);
}

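/*
 * Active-side handling of a REP: validate the peer's MTU, move the TX QP
 * to RTR and then RTS, flush any packets that were queued on the path
 * while the connection was being set up, and confirm with an RTU.
 */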
static int ipoib_cm_rep_handler(struct ib_cm_id *cm_id, struct ib_cm_event *event)
{
	struct ipoib_cm_tx *p = cm_id->context;
	struct ipoib_dev_priv *priv = p->priv;
	struct ipoib_cm_data *data = event->private_data;
	struct epoch_tracker et;
	struct ifqueue mbqueue;
	struct ib_qp_attr qp_attr;
	int qp_attr_mask, ret;
	struct mbuf *mb;

	ipoib_dbg(priv, "cm rep handler\n");
	p->mtu = be32_to_cpu(data->mtu);

	if (p->mtu <= IPOIB_ENCAP_LEN) {
		ipoib_warn(priv, "Rejecting connection: mtu %d <= %d\n",
			   p->mtu, IPOIB_ENCAP_LEN);
		return -EINVAL;
	}

	qp_attr.qp_state = IB_QPS_RTR;
	ret = ib_cm_init_qp_attr(cm_id, &qp_attr, &qp_attr_mask);
	if (ret) {
		ipoib_warn(priv, "failed to init QP attr for RTR: %d\n", ret);
		return ret;
	}

	qp_attr.rq_psn = 0 /* FIXME */;
	ret = ib_modify_qp(p->qp, &qp_attr, qp_attr_mask);
	if (ret) {
		ipoib_warn(priv, "failed to modify QP to RTR: %d\n", ret);
		return ret;
	}

	qp_attr.qp_state = IB_QPS_RTS;
	ret = ib_cm_init_qp_attr(cm_id, &qp_attr, &qp_attr_mask);
	if (ret) {
		ipoib_warn(priv, "failed to init QP attr for RTS: %d\n", ret);
		return ret;
	}
	ret = ib_modify_qp(p->qp, &qp_attr, qp_attr_mask);
	if (ret) {
		ipoib_warn(priv, "failed to modify QP to RTS: %d\n", ret);
		return ret;
	}

	bzero(&mbqueue, sizeof(mbqueue));

	spin_lock_irq(&priv->lock);
	set_bit(IPOIB_FLAG_OPER_UP, &p->flags);
	if (p->path)
		for (;;) {
			_IF_DEQUEUE(&p->path->queue, mb);
			if (mb == NULL)
				break;
			_IF_ENQUEUE(&mbqueue, mb);
		}
	spin_unlock_irq(&priv->lock);

	NET_EPOCH_ENTER(et);
	for (;;) {
		if_t dev = p->priv->dev;
		_IF_DEQUEUE(&mbqueue, mb);
		if (mb == NULL)
			break;
		mb->m_pkthdr.rcvif = dev;
		if (if_transmit(dev, mb))
			ipoib_warn(priv, "dev_queue_xmit failed "
				   "to requeue packet\n");
	}
	NET_EPOCH_EXIT(et);

	ret = ib_send_cm_rtu(cm_id, NULL, 0);
	if (ret) {
		ipoib_warn(priv, "failed to send RTU: %d\n", ret);
		return ret;
	}
	return 0;
}

static struct ib_qp *ipoib_cm_create_tx_qp(struct ipoib_dev_priv *priv,
					   struct ipoib_cm_tx *tx)
{
	struct ib_qp_init_attr attr = {
		.send_cq = priv->send_cq,
		.recv_cq = priv->recv_cq,
		.srq = priv->cm.srq,
		.cap.max_send_wr = ipoib_sendq_size,
		.cap.max_send_sge = priv->cm.num_frags,
		.sq_sig_type = IB_SIGNAL_ALL_WR,
		.qp_type = IB_QPT_RC,
		.qp_context = tx
	};

	return ib_create_qp(priv->pd, &attr);
}

static int ipoib_cm_send_req(struct ipoib_dev_priv *priv,
			     struct ib_cm_id *id, struct ib_qp *qp,
			     u32 qpn,
			     struct ib_sa_path_rec *pathrec)
{
	struct ipoib_cm_data data = {};
	struct ib_cm_req_param req = {};

	ipoib_dbg(priv, "cm send req\n");

	data.qpn = cpu_to_be32(priv->qp->qp_num);
	data.mtu = cpu_to_be32(priv->cm.max_cm_mtu);

	req.primary_path = pathrec;
	req.alternate_path = NULL;
	req.service_id = cpu_to_be64(IPOIB_CM_IETF_ID | qpn);
	req.qp_num = qp->qp_num;
	req.qp_type = qp->qp_type;
	req.private_data = &data;
	req.private_data_len = sizeof data;
	req.flow_control = 0;

	req.starting_psn = 0; /* FIXME */

	/*
	 * Pick some arbitrary defaults here; we could make these
	 * module parameters if anyone cared about setting them.
	 */
	req.responder_resources = 4;
	req.remote_cm_response_timeout = 20;
	req.local_cm_response_timeout = 20;
	req.retry_count = 0; /* RFC draft warns against retries */
	req.rnr_retry_count = 0; /* RFC draft warns against retries */
	req.max_cm_retries = 15;
	req.srq = ipoib_cm_has_srq(priv);
	return ib_send_cm_req(id, &req);
}

static int ipoib_cm_modify_tx_init(struct ipoib_dev_priv *priv,
				   struct ib_cm_id *cm_id, struct ib_qp *qp)
{
	struct ib_qp_attr qp_attr;
	int qp_attr_mask, ret;

	ret = ib_find_pkey(priv->ca, priv->port, priv->pkey, &qp_attr.pkey_index);
	if (ret) {
		ipoib_warn(priv, "pkey 0x%x not found: %d\n", priv->pkey, ret);
		return ret;
	}

	qp_attr.qp_state = IB_QPS_INIT;
	qp_attr.qp_access_flags = IB_ACCESS_LOCAL_WRITE;
	qp_attr.port_num = priv->port;
	qp_attr_mask = IB_QP_STATE | IB_QP_ACCESS_FLAGS | IB_QP_PKEY_INDEX | IB_QP_PORT;

	ret = ib_modify_qp(qp, &qp_attr, qp_attr_mask);
	if (ret) {
		ipoib_warn(priv, "failed to modify tx QP to INIT: %d\n", ret);
		return ret;
	}
	return 0;
}

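/*
 * Set up the active (transmit) side of a connection: allocate the TX ring,
 * create the QP and CM ID, move the QP to INIT and send the connection
 * request.
 */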
static int ipoib_cm_tx_init(struct ipoib_cm_tx *p, u32 qpn,
			    struct ib_sa_path_rec *pathrec)
{
	struct ipoib_dev_priv *priv = p->priv;
	int ret;

	p->tx_ring = kzalloc(ipoib_sendq_size * sizeof *p->tx_ring, GFP_KERNEL);
	if (!p->tx_ring) {
		ipoib_warn(priv, "failed to allocate tx ring\n");
		ret = -ENOMEM;
		goto err_tx;
	}
	memset(p->tx_ring, 0, ipoib_sendq_size * sizeof *p->tx_ring);

	p->qp = ipoib_cm_create_tx_qp(p->priv, p);
	if (IS_ERR(p->qp)) {
		ret = PTR_ERR(p->qp);
		ipoib_warn(priv, "failed to allocate tx qp: %d\n", ret);
		goto err_qp;
	}

	p->id = ib_create_cm_id(priv->ca, ipoib_cm_tx_handler, p);
	if (IS_ERR(p->id)) {
		ret = PTR_ERR(p->id);
		ipoib_warn(priv, "failed to create tx cm id: %d\n", ret);
		goto err_id;
	}

	ret = ipoib_cm_modify_tx_init(p->priv, p->id, p->qp);
	if (ret) {
		ipoib_warn(priv, "failed to modify tx qp to rtr: %d\n", ret);
		goto err_modify;
	}

	ret = ipoib_cm_send_req(p->priv, p->id, p->qp, qpn, pathrec);
	if (ret) {
		ipoib_warn(priv, "failed to send cm req: %d\n", ret);
		goto err_send_cm;
	}

	ipoib_dbg(priv, "Request connection 0x%x for gid %pI6 qpn 0x%x\n",
		  p->qp->qp_num, pathrec->dgid.raw, qpn);

	return 0;

err_send_cm:
err_modify:
	ib_destroy_cm_id(p->id);
err_id:
	p->id = NULL;
	ib_destroy_qp(p->qp);
err_qp:
	p->qp = NULL;
	kfree(p->tx_ring);
err_tx:
	return ret;
}

static void ipoib_cm_tx_destroy(struct ipoib_cm_tx *p)
{
	struct ipoib_dev_priv *priv = p->priv;
	if_t dev = priv->dev;
	struct ipoib_cm_tx_buf *tx_req;
	unsigned long begin;

	ipoib_dbg(priv, "Destroy active connection 0x%x head 0x%x tail 0x%x\n",
		  p->qp ? p->qp->qp_num : 0, p->tx_head, p->tx_tail);

	if (p->path)
		ipoib_path_free(priv, p->path);

	if (p->id)
		ib_destroy_cm_id(p->id);

	if (p->tx_ring) {
		/* Wait for all sends to complete */
		begin = jiffies;
		while ((int) p->tx_tail - (int) p->tx_head < 0) {
			if (time_after(jiffies, begin + 5 * HZ)) {
				ipoib_warn(priv, "timing out; %d sends not completed\n",
					   p->tx_head - p->tx_tail);
				goto timeout;
			}

			msleep(1);
		}
	}

timeout:

	while ((int) p->tx_tail - (int) p->tx_head < 0) {
		tx_req = &p->tx_ring[p->tx_tail & (ipoib_sendq_size - 1)];
		ipoib_dma_unmap_tx(priv->ca, (struct ipoib_tx_buf *)tx_req);
		m_freem(tx_req->mb);
		++p->tx_tail;
		if (unlikely(--priv->tx_outstanding == ipoib_sendq_size >> 1) &&
		    (if_getdrvflags(dev) & IFF_DRV_OACTIVE) != 0 &&
		    test_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags))
			if_setdrvflagbits(dev, 0, IFF_DRV_OACTIVE);
	}

	if (p->qp)
		ib_destroy_qp(p->qp);

	kfree(p->tx_ring);
	kfree(p);
}

static int ipoib_cm_tx_handler(struct ib_cm_id *cm_id,
			       struct ib_cm_event *event)
{
	struct ipoib_cm_tx *tx = cm_id->context;
	struct ipoib_dev_priv *priv = tx->priv;
	struct ipoib_path *path;
	unsigned long flags;
	int ret;

	switch (event->event) {
	case IB_CM_DREQ_RECEIVED:
		ipoib_dbg(priv, "DREQ received.\n");
		ib_send_cm_drep(cm_id, NULL, 0);
		break;
	case IB_CM_REP_RECEIVED:
		ipoib_dbg(priv, "REP received.\n");
		ret = ipoib_cm_rep_handler(cm_id, event);
		if (ret)
			ib_send_cm_rej(cm_id, IB_CM_REJ_CONSUMER_DEFINED,
				       NULL, 0, NULL, 0);
		break;
	case IB_CM_REQ_ERROR:
	case IB_CM_REJ_RECEIVED:
	case IB_CM_TIMEWAIT_EXIT:
		ipoib_dbg(priv, "CM error %d.\n", event->event);
		spin_lock_irqsave(&priv->lock, flags);
		path = tx->path;

		if (path) {
			path->cm = NULL;
			tx->path = NULL;
			rb_erase(&path->rb_node, &priv->path_tree);
			list_del(&path->list);
		}

		if (test_and_clear_bit(IPOIB_FLAG_INITIALIZED, &tx->flags)) {
			list_move(&tx->list, &priv->cm.reap_list);
			queue_work(ipoib_workqueue, &priv->cm.reap_task);
		}

		spin_unlock_irqrestore(&priv->lock, flags);
		if (path)
			ipoib_path_free(tx->priv, path);
		break;
	default:
		break;
	}

	return 0;
}

struct ipoib_cm_tx *ipoib_cm_create_tx(struct ipoib_dev_priv *priv,
				       struct ipoib_path *path)
{
	struct ipoib_cm_tx *tx;

	tx = kzalloc(sizeof *tx, GFP_ATOMIC);
	if (!tx)
		return NULL;

	ipoib_dbg(priv, "Creating cm tx\n");
	path->cm = tx;
	tx->path = path;
	tx->priv = priv;
	list_add(&tx->list, &priv->cm.start_list);
	set_bit(IPOIB_FLAG_INITIALIZED, &tx->flags);
	queue_work(ipoib_workqueue, &priv->cm.start_task);
	return tx;
}

void ipoib_cm_destroy_tx(struct ipoib_cm_tx *tx)
{
	struct ipoib_dev_priv *priv = tx->priv;

	if (test_and_clear_bit(IPOIB_FLAG_INITIALIZED, &tx->flags)) {
		spin_lock(&priv->lock);
		list_move(&tx->list, &priv->cm.reap_list);
		spin_unlock(&priv->lock);
		queue_work(ipoib_workqueue, &priv->cm.reap_task);
		ipoib_dbg(priv, "Reap connection for gid %pI6\n",
			  tx->path->pathrec.dgid.raw);
		tx->path = NULL;
	}
}

static void ipoib_cm_tx_start(struct work_struct *work)
{
	struct ipoib_dev_priv *priv = container_of(work, struct ipoib_dev_priv,
						   cm.start_task);
	struct ipoib_path *path;
	struct ipoib_cm_tx *p;
	unsigned long flags;
	int ret;

	struct ib_sa_path_rec pathrec;
	u32 qpn;

	ipoib_dbg(priv, "cm start task\n");
	spin_lock_irqsave(&priv->lock, flags);

	while (!list_empty(&priv->cm.start_list)) {
		p = list_entry(priv->cm.start_list.next, typeof(*p), list);
		list_del_init(&p->list);
		path = p->path;
		qpn = IPOIB_QPN(path->hwaddr);
		memcpy(&pathrec, &p->path->pathrec, sizeof pathrec);

		spin_unlock_irqrestore(&priv->lock, flags);

		ret = ipoib_cm_tx_init(p, qpn, &pathrec);

		spin_lock_irqsave(&priv->lock, flags);

		if (ret) {
			path = p->path;
			if (path) {
				path->cm = NULL;
				rb_erase(&path->rb_node, &priv->path_tree);
				list_del(&path->list);
				ipoib_path_free(priv, path);
			}
			list_del(&p->list);
			kfree(p);
		}
	}

	spin_unlock_irqrestore(&priv->lock, flags);
}

static void ipoib_cm_tx_reap(struct work_struct *work)
{
	struct ipoib_dev_priv *priv = container_of(work, struct ipoib_dev_priv,
						   cm.reap_task);
	struct ipoib_cm_tx *p;
	unsigned long flags;

	spin_lock_irqsave(&priv->lock, flags);

	while (!list_empty(&priv->cm.reap_list)) {
		p = list_entry(priv->cm.reap_list.next, typeof(*p), list);
		list_del(&p->list);
		spin_unlock_irqrestore(&priv->lock, flags);
		ipoib_cm_tx_destroy(p);
		spin_lock_irqsave(&priv->lock, flags);
	}

	spin_unlock_irqrestore(&priv->lock, flags);
}

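/*
 * Work handler that drains the queue of oversized mbufs collected by
 * ipoib_cm_mb_too_long() and bounces an ICMP "fragmentation needed" or
 * ICMPv6 "packet too big" error back for each of them.
 */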
static void ipoib_cm_mb_reap(struct work_struct *work)
{
	struct ipoib_dev_priv *priv = container_of(work, struct ipoib_dev_priv,
						   cm.mb_task);
	struct mbuf *mb;
	unsigned long flags;
#if defined(INET) || defined(INET6)
	unsigned mtu = priv->mcast_mtu;
#endif
	uint16_t proto;

	spin_lock_irqsave(&priv->lock, flags);

	CURVNET_SET_QUIET(if_getvnet(priv->dev));

	for (;;) {
		IF_DEQUEUE(&priv->cm.mb_queue, mb);
		if (mb == NULL)
			break;
		spin_unlock_irqrestore(&priv->lock, flags);

		proto = htons(*mtod(mb, uint16_t *));
		m_adj(mb, IPOIB_ENCAP_LEN);
		switch (proto) {
#if defined(INET)
		case ETHERTYPE_IP:
			icmp_error(mb, ICMP_UNREACH, ICMP_UNREACH_NEEDFRAG, 0, mtu);
			break;
#endif
#if defined(INET6)
		case ETHERTYPE_IPV6:
			icmp6_error(mb, ICMP6_PACKET_TOO_BIG, 0, mtu);
			break;
#endif
		default:
			m_freem(mb);
		}

		spin_lock_irqsave(&priv->lock, flags);
	}

	CURVNET_RESTORE();

	spin_unlock_irqrestore(&priv->lock, flags);
}

void
ipoib_cm_mb_too_long(struct ipoib_dev_priv *priv, struct mbuf *mb, unsigned int mtu)
{
	int e = priv->cm.mb_queue.ifq_len;

	IF_ENQUEUE(&priv->cm.mb_queue, mb);
	if (e == 0)
		queue_work(ipoib_workqueue, &priv->cm.mb_task);
}

static void ipoib_cm_rx_reap(struct work_struct *work)
{
	ipoib_cm_free_rx_reap_list(container_of(work, struct ipoib_dev_priv,
						cm.rx_reap_task));
}

static void ipoib_cm_stale_task(struct work_struct *work)
{
	struct ipoib_dev_priv *priv = container_of(work, struct ipoib_dev_priv,
						   cm.stale_task.work);
	struct ipoib_cm_rx *p;
	int ret;

	spin_lock_irq(&priv->lock);
	while (!list_empty(&priv->cm.passive_ids)) {
		/* List is sorted by LRU, start from tail,
		 * stop when we see a recently used entry */
		p = list_entry(priv->cm.passive_ids.prev, typeof(*p), list);
		if (time_before_eq(jiffies, p->jiffies + IPOIB_CM_RX_TIMEOUT))
			break;
		list_move(&p->list, &priv->cm.rx_error_list);
		p->state = IPOIB_CM_RX_ERROR;
		spin_unlock_irq(&priv->lock);
		ret = ib_modify_qp(p->qp, &ipoib_cm_err_attr, IB_QP_STATE);
		if (ret)
			ipoib_warn(priv, "unable to move qp to error state: %d\n", ret);
		spin_lock_irq(&priv->lock);
	}

	if (!list_empty(&priv->cm.passive_ids))
		queue_delayed_work(ipoib_workqueue,
				   &priv->cm.stale_task, IPOIB_CM_RX_DELAY);
	spin_unlock_irq(&priv->lock);
}

static void ipoib_cm_create_srq(struct ipoib_dev_priv *priv, int max_sge)
{
	struct ib_srq_init_attr srq_init_attr = {
		.attr = {
			.max_wr  = ipoib_recvq_size,
			.max_sge = max_sge
		}
	};

	priv->cm.srq = ib_create_srq(priv->pd, &srq_init_attr);
	if (IS_ERR(priv->cm.srq)) {
		if (PTR_ERR(priv->cm.srq) != -ENOSYS)
			printk(KERN_WARNING "%s: failed to allocate SRQ, error %ld\n",
			       priv->ca->name, PTR_ERR(priv->cm.srq));
		priv->cm.srq = NULL;
		return;
	}

	priv->cm.srq_ring = kzalloc(ipoib_recvq_size * sizeof *priv->cm.srq_ring, GFP_KERNEL);
	if (!priv->cm.srq_ring) {
		printk(KERN_WARNING "%s: failed to allocate CM SRQ ring (%d entries)\n",
		       priv->ca->name, ipoib_recvq_size);
		ib_destroy_srq(priv->cm.srq);
		priv->cm.srq = NULL;
		return;
	}

	memset(priv->cm.srq_ring, 0, ipoib_recvq_size * sizeof *priv->cm.srq_ring);
}

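/*
 * Per-interface connected-mode initialization: set up the lists, work
 * tasks and mbuf queue, create the SRQ when the HCA supports one, size
 * max_cm_mtu accordingly and pre-post the SRQ receive buffers.
 */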
int ipoib_cm_dev_init(struct ipoib_dev_priv *priv)
{
	if_t dev = priv->dev;
	int i;
	int max_srq_sge;

	INIT_LIST_HEAD(&priv->cm.passive_ids);
	INIT_LIST_HEAD(&priv->cm.reap_list);
	INIT_LIST_HEAD(&priv->cm.start_list);
	INIT_LIST_HEAD(&priv->cm.rx_error_list);
	INIT_LIST_HEAD(&priv->cm.rx_flush_list);
	INIT_LIST_HEAD(&priv->cm.rx_drain_list);
	INIT_LIST_HEAD(&priv->cm.rx_reap_list);
	INIT_WORK(&priv->cm.start_task, ipoib_cm_tx_start);
	INIT_WORK(&priv->cm.reap_task, ipoib_cm_tx_reap);
	INIT_WORK(&priv->cm.mb_task, ipoib_cm_mb_reap);
	INIT_WORK(&priv->cm.rx_reap_task, ipoib_cm_rx_reap);
	INIT_DELAYED_WORK(&priv->cm.stale_task, ipoib_cm_stale_task);

	bzero(&priv->cm.mb_queue, sizeof(priv->cm.mb_queue));
	mtx_init(&priv->cm.mb_queue.ifq_mtx,
	    if_name(dev), "if send queue", MTX_DEF);

	max_srq_sge = priv->ca->attrs.max_srq_sge;

	ipoib_dbg(priv, "max_srq_sge=%d\n", max_srq_sge);

	max_srq_sge = min_t(int, IPOIB_CM_RX_SG, max_srq_sge);
	ipoib_cm_create_srq(priv, max_srq_sge);
	if (ipoib_cm_has_srq(priv)) {
		priv->cm.max_cm_mtu = max_srq_sge * MJUMPAGESIZE;
		priv->cm.num_frags = max_srq_sge;
		ipoib_dbg(priv, "max_cm_mtu = 0x%x, num_frags=%d\n",
			  priv->cm.max_cm_mtu, priv->cm.num_frags);
	} else {
		priv->cm.max_cm_mtu = IPOIB_CM_MAX_MTU;
		priv->cm.num_frags = IPOIB_CM_RX_SG;
	}

	ipoib_cm_init_rx_wr(priv, &priv->cm.rx_wr, priv->cm.rx_sge);

	if (ipoib_cm_has_srq(priv)) {
		for (i = 0; i < ipoib_recvq_size; ++i) {
			if (!ipoib_cm_alloc_rx_mb(priv, &priv->cm.srq_ring[i])) {
				ipoib_warn(priv, "failed to allocate "
					   "receive buffer %d\n", i);
				ipoib_cm_dev_cleanup(priv);
				return -ENOMEM;
			}

			if (ipoib_cm_post_receive_srq(priv, i)) {
				ipoib_warn(priv, "ipoib_cm_post_receive_srq "
					   "failed for buf %d\n", i);
				ipoib_cm_dev_cleanup(priv);
				return -EIO;
			}
		}
	}

	if_getlladdr(priv->dev)[0] = IPOIB_FLAGS_RC;
	return 0;
}

void ipoib_cm_dev_cleanup(struct ipoib_dev_priv *priv)
{
	int ret;

	if (!priv->cm.srq)
		return;

	ipoib_dbg(priv, "Cleanup ipoib connected mode.\n");

	ret = ib_destroy_srq(priv->cm.srq);
	if (ret)
		ipoib_warn(priv, "ib_destroy_srq failed: %d\n", ret);

	priv->cm.srq = NULL;
	if (!priv->cm.srq_ring)
		return;

	ipoib_cm_free_rx_ring(priv, priv->cm.srq_ring);
	priv->cm.srq_ring = NULL;

	mtx_destroy(&priv->cm.mb_queue.ifq_mtx);
}

#endif /* CONFIG_INFINIBAND_IPOIB_CM */