/*-
 * SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0
 *
 * Copyright (c) 2004, 2005 Topspin Communications.  All rights reserved.
 * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved.
 * Copyright (c) 2005 Mellanox Technologies. All rights reserved.
 * Copyright (c) 2004, 2005 Voltaire, Inc. All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses.  You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the
 * OpenIB.org BSD license below:
 *
 *     Redistribution and use in source and binary forms, with or
 *     without modification, are permitted provided that the following
 *     conditions are met:
 *
 *      - Redistributions of source code must retain the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer.
 *
 *      - Redistributions in binary form must reproduce the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer in the documentation and/or other materials
 *        provided with the distribution.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include <sys/cdefs.h>
#include "ipoib.h"

#include <rdma/ib_cache.h>

#include <security/mac/mac_framework.h>

#include <linux/delay.h>
#include <linux/dma-mapping.h>

#ifdef CONFIG_INFINIBAND_IPOIB_DEBUG_DATA
static int data_debug_level;

module_param(data_debug_level, int, 0644);
MODULE_PARM_DESC(data_debug_level,
		 "Enable data path debug tracing if > 0");
#endif

static DEFINE_MUTEX(pkey_mutex);

struct ipoib_ah *ipoib_create_ah(struct ipoib_dev_priv *priv,
				 struct ib_pd *pd, struct ib_ah_attr *attr)
{
	struct ipoib_ah *ah;

	ah = kmalloc(sizeof *ah, GFP_KERNEL);
	if (!ah)
		return NULL;

	ah->priv = priv;
	ah->last_send = 0;
	kref_init(&ah->ref);

	ah->ah = ib_create_ah(pd, attr, RDMA_CREATE_AH_SLEEPABLE);
	if (IS_ERR(ah->ah)) {
		kfree(ah);
		ah = NULL;
	} else
		ipoib_dbg(priv, "Created ah %p\n", ah->ah);

	return ah;
}

void ipoib_free_ah(struct kref *kref)
{
	struct ipoib_ah *ah = container_of(kref, struct ipoib_ah, ref);
	struct ipoib_dev_priv *priv = ah->priv;

	unsigned long flags;

	spin_lock_irqsave(&priv->lock, flags);
	list_add_tail(&ah->list, &priv->dead_ahs);
	spin_unlock_irqrestore(&priv->lock, flags);
}

void
ipoib_dma_unmap_rx(struct ipoib_dev_priv *priv, struct ipoib_rx_buf *rx_req)
{
	struct mbuf *m;
	int i;

	for (i = 0, m = rx_req->mb; m != NULL; m = m->m_next, i++)
		ib_dma_unmap_single(priv->ca, rx_req->mapping[i], m->m_len,
		    DMA_FROM_DEVICE);
}

void
ipoib_dma_mb(struct ipoib_dev_priv *priv, struct mbuf *mb, unsigned int length)
{

	m_adj(mb, -(mb->m_pkthdr.len - length));
}

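/*
 * Allocate an mbuf chain big enough for 'size' bytes (plus 'align' bytes of
 * leading pad in the first mbuf) and DMA-map each segment for receive.  If
 * any mapping fails, the chain is unmapped and freed again.
 */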
struct mbuf *
ipoib_alloc_map_mb(struct ipoib_dev_priv *priv, struct ipoib_rx_buf *rx_req,
    int align, int size, int max_frags)
{
	struct mbuf *mb, *m;
	int i, j;

	rx_req->mb = NULL;
	mb = m_getm2(NULL, align + size, M_NOWAIT, MT_DATA, M_PKTHDR);
	if (mb == NULL)
		return (NULL);
	for (i = 0, m = mb; m != NULL; m = m->m_next, i++) {
		MPASS(i < max_frags);

		m->m_len = M_SIZE(m) - align;
		m->m_data += align;
		align = 0;
		mb->m_pkthdr.len += m->m_len;
		rx_req->mapping[i] = ib_dma_map_single(priv->ca,
		    mtod(m, void *), m->m_len, DMA_FROM_DEVICE);
		if (unlikely(ib_dma_mapping_error(priv->ca,
		    rx_req->mapping[i])))
			goto error;

	}
	rx_req->mb = mb;
	return (mb);
error:
	for (j = 0, m = mb; j < i; m = m->m_next, j++)
		ib_dma_unmap_single(priv->ca, rx_req->mapping[j], m->m_len,
		    DMA_FROM_DEVICE);
	m_freem(mb);
	return (NULL);

}

static int ipoib_ib_post_receive(struct ipoib_dev_priv *priv, int id)
{
	struct ipoib_rx_buf *rx_req;
	const struct ib_recv_wr *bad_wr;
	struct mbuf *m;
	int ret;
	int i;

	rx_req = &priv->rx_ring[id];
	for (m = rx_req->mb, i = 0; m != NULL; m = m->m_next, i++) {
		priv->rx_sge[i].addr = rx_req->mapping[i];
		priv->rx_sge[i].length = m->m_len;
	}
	priv->rx_wr.num_sge = i;
	priv->rx_wr.wr_id = id | IPOIB_OP_RECV;

	ret = ib_post_recv(priv->qp, &priv->rx_wr, &bad_wr);
	if (unlikely(ret)) {
		ipoib_warn(priv, "receive failed for buf %d (%d)\n", id, ret);
		ipoib_dma_unmap_rx(priv, &priv->rx_ring[id]);
		m_freem(priv->rx_ring[id].mb);
		priv->rx_ring[id].mb = NULL;
	}

	return ret;
}

static struct mbuf *
ipoib_alloc_rx_mb(struct ipoib_dev_priv *priv, int id)
{
	return ipoib_alloc_map_mb(priv, &priv->rx_ring[id],
	    0, priv->max_ib_mtu + IB_GRH_BYTES, IPOIB_UD_RX_SG);
}

static int ipoib_ib_post_receives(struct ipoib_dev_priv *priv)
{
	int i;

	for (i = 0; i < ipoib_recvq_size; ++i) {
		if (!ipoib_alloc_rx_mb(priv, i)) {
			ipoib_warn(priv, "failed to allocate receive buffer %d\n", i);
			return -ENOMEM;
		}
		if (ipoib_ib_post_receive(priv, i)) {
			ipoib_warn(priv, "ipoib_ib_post_receive failed for buf %d\n", i);
			return -EIO;
		}
	}

	return 0;
}

static void
ipoib_ib_handle_rx_wc(struct ipoib_dev_priv *priv, struct ib_wc *wc)
{
	struct ipoib_rx_buf saverx;
	unsigned int wr_id = wc->wr_id & ~IPOIB_OP_RECV;
	if_t dev = priv->dev;
	struct ipoib_header *eh;
	struct mbuf *mb;

	ipoib_dbg_data(priv, "recv completion: id %d, status: %d\n",
		       wr_id, wc->status);

	if (unlikely(wr_id >= ipoib_recvq_size)) {
		ipoib_warn(priv, "recv completion event with wrid %d (> %d)\n",
			   wr_id, ipoib_recvq_size);
		return;
	}

	mb = priv->rx_ring[wr_id].mb;

	if (unlikely(wc->status != IB_WC_SUCCESS)) {
		if (wc->status != IB_WC_WR_FLUSH_ERR) {
			ipoib_warn(priv, "failed recv event "
				   "(status=%d, wrid=%d vend_err %x)\n",
				   wc->status, wr_id, wc->vendor_err);
			goto repost;
		}
		if (mb) {
			ipoib_dma_unmap_rx(priv, &priv->rx_ring[wr_id]);
			m_freem(mb);
			priv->rx_ring[wr_id].mb = NULL;
		}
		return;
	}

	/*
	 * Drop packets that this interface sent, i.e., multicast packets
	 * that the HCA has replicated.
	 */
	if (wc->slid == priv->local_lid && wc->src_qp == priv->qp->qp_num)
		goto repost;

	memcpy(&saverx, &priv->rx_ring[wr_id], sizeof(saverx));
	/*
	 * If we can't allocate a new RX buffer, dump
	 * this packet and reuse the old buffer.
	 */
	if (unlikely(!ipoib_alloc_rx_mb(priv, wr_id))) {
		memcpy(&priv->rx_ring[wr_id], &saverx, sizeof(saverx));
		if_inc_counter(dev, IFCOUNTER_IQDROPS, 1);
		goto repost;
	}

	ipoib_dbg_data(priv, "received %d bytes, SLID 0x%04x\n",
		       wc->byte_len, wc->slid);

	ipoib_dma_unmap_rx(priv, &saverx);
	ipoib_dma_mb(priv, mb, wc->byte_len);

	if_inc_counter(dev, IFCOUNTER_IPACKETS, 1);
	if_inc_counter(dev, IFCOUNTER_IBYTES, mb->m_pkthdr.len);
	mb->m_pkthdr.rcvif = dev;
	m_adj(mb, sizeof(struct ib_grh) - INFINIBAND_ALEN);
	eh = mtod(mb, struct ipoib_header *);
	bzero(eh->hwaddr, 4);	/* Zero the queue pair, only dgid is in grh */

	if (test_bit(IPOIB_FLAG_CSUM, &priv->flags) && likely(wc->wc_flags & IB_WC_IP_CSUM_OK))
		mb->m_pkthdr.csum_flags = CSUM_IP_CHECKED | CSUM_IP_VALID;

	if_input(dev, mb);

repost:
	if (unlikely(ipoib_ib_post_receive(priv, wr_id)))
		ipoib_warn(priv, "ipoib_ib_post_receive failed "
			   "for buf %d\n", wr_id);
}

int ipoib_dma_map_tx(struct ib_device *ca, struct ipoib_tx_buf *tx_req, int max)
{
	struct mbuf *mb = tx_req->mb;
	u64 *mapping = tx_req->mapping;
	struct mbuf *m, *p;
	int error;
	int i;

	for (m = mb, p = NULL, i = 0; m != NULL; p = m, m = m->m_next, i++) {
		if (m->m_len != 0)
			continue;
		if (p == NULL)
			panic("ipoib_dma_map_tx: First mbuf empty\n");
		p->m_next = m_free(m);
		m = p;
		i--;
	}
	i--;
	if (i >= max) {
		tx_req->mb = mb = m_defrag(mb, M_NOWAIT);
		if (mb == NULL)
			return -EIO;
		for (m = mb, i = 0; m != NULL; m = m->m_next, i++);
		if (i >= max)
			return -EIO;
	}
	error = 0;
	for (m = mb, i = 0; m != NULL; m = m->m_next, i++) {
		mapping[i] = ib_dma_map_single(ca, mtod(m, void *),
					       m->m_len, DMA_TO_DEVICE);
		if (unlikely(ib_dma_mapping_error(ca, mapping[i]))) {
			error = -EIO;
			break;
		}
	}
	if (error) {
		int end;

		end = i;
		for (m = mb, i = 0; i < end; m = m->m_next, i++)
			ib_dma_unmap_single(ca, mapping[i], m->m_len,
					    DMA_TO_DEVICE);
	}
	return error;
}

void ipoib_dma_unmap_tx(struct ib_device *ca, struct ipoib_tx_buf *tx_req)
{
	struct mbuf *mb = tx_req->mb;
	u64 *mapping = tx_req->mapping;
	struct mbuf *m;
	int i;

	for (m = mb, i = 0; m != NULL; m = m->m_next, i++)
		ib_dma_unmap_single(ca, mapping[i], m->m_len, DMA_TO_DEVICE);
}

static void ipoib_ib_handle_tx_wc(struct ipoib_dev_priv *priv, struct ib_wc *wc)
{
	if_t dev = priv->dev;
	unsigned int wr_id = wc->wr_id;
	struct ipoib_tx_buf *tx_req;

	ipoib_dbg_data(priv, "send completion: id %d, status: %d\n",
		       wr_id, wc->status);

	if (unlikely(wr_id >= ipoib_sendq_size)) {
		ipoib_warn(priv, "send completion event with wrid %d (> %d)\n",
			   wr_id, ipoib_sendq_size);
		return;
	}

	tx_req = &priv->tx_ring[wr_id];

	ipoib_dma_unmap_tx(priv->ca, tx_req);

	if_inc_counter(dev, IFCOUNTER_OPACKETS, 1);

	m_freem(tx_req->mb);

	++priv->tx_tail;
	if (unlikely(--priv->tx_outstanding == ipoib_sendq_size >> 1) &&
	    (if_getdrvflags(dev) & IFF_DRV_OACTIVE) &&
	    test_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags))
		if_setdrvflagbits(dev, 0, IFF_DRV_OACTIVE);

	if (wc->status != IB_WC_SUCCESS &&
	    wc->status != IB_WC_WR_FLUSH_ERR)
		ipoib_warn(priv, "failed send event "
			   "(status=%d, wrid=%d vend_err %x)\n",
			   wc->status, wr_id, wc->vendor_err);
}

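/*
 * Reap up to MAX_SEND_CQE completions from the send CQ.  If 'do_start' is
 * set and anything completed, kick the transmit path again.  Returns
 * non-zero when the poll limit was hit, i.e. more completions may remain.
 */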
int
ipoib_poll_tx(struct ipoib_dev_priv *priv, bool do_start)
{
	int n, i;

	n = ib_poll_cq(priv->send_cq, MAX_SEND_CQE, priv->send_wc);
	for (i = 0; i < n; ++i) {
		struct ib_wc *wc = priv->send_wc + i;
		if (wc->wr_id & IPOIB_OP_CM)
			ipoib_cm_handle_tx_wc(priv, wc);
		else
			ipoib_ib_handle_tx_wc(priv, wc);
	}

	if (do_start && n != 0)
		ipoib_start_locked(priv->dev, priv);

	return n == MAX_SEND_CQE;
}

static void
ipoib_poll(struct ipoib_dev_priv *priv)
{
	int n, i;

poll_more:
	spin_lock(&priv->drain_lock);
	for (;;) {
		n = ib_poll_cq(priv->recv_cq, IPOIB_NUM_WC, priv->ibwc);
		for (i = 0; i < n; i++) {
			struct ib_wc *wc = priv->ibwc + i;

			if ((wc->wr_id & IPOIB_OP_RECV) == 0)
				panic("ipoib_poll: Bad wr_id 0x%jX\n",
				    (intmax_t)wc->wr_id);
			if (wc->wr_id & IPOIB_OP_CM)
				ipoib_cm_handle_rx_wc(priv, wc);
			else
				ipoib_ib_handle_rx_wc(priv, wc);
		}

		if (n != IPOIB_NUM_WC)
			break;
	}
	spin_unlock(&priv->drain_lock);

	if (ib_req_notify_cq(priv->recv_cq,
	    IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS) > 0)
		goto poll_more;
}

void ipoib_ib_completion(struct ib_cq *cq, void *dev_ptr)
{
	struct ipoib_dev_priv *priv = dev_ptr;

	ipoib_poll(priv);
}

static void drain_tx_cq(struct ipoib_dev_priv *priv)
{
	if_t dev = priv->dev;

	spin_lock(&priv->lock);
	while (ipoib_poll_tx(priv, true))
		; /* nothing */

	if (if_getdrvflags(dev) & IFF_DRV_OACTIVE)
		mod_timer(&priv->poll_timer, jiffies + 1);

	spin_unlock(&priv->lock);
}

void ipoib_send_comp_handler(struct ib_cq *cq, void *dev_ptr)
{
	struct ipoib_dev_priv *priv = dev_ptr;

	mod_timer(&priv->poll_timer, jiffies);
}

static inline int
post_send(struct ipoib_dev_priv *priv, unsigned int wr_id,
    struct ib_ah *address, u32 qpn, struct ipoib_tx_buf *tx_req, void *head,
    int hlen)
{
	const struct ib_send_wr *bad_wr;
	struct mbuf *mb = tx_req->mb;
	u64 *mapping = tx_req->mapping;
	struct mbuf *m;
	int i;

	for (m = mb, i = 0; m != NULL; m = m->m_next, i++) {
		priv->tx_sge[i].addr = mapping[i];
		priv->tx_sge[i].length = m->m_len;
	}
	priv->tx_wr.wr.num_sge = i;
	priv->tx_wr.wr.wr_id = wr_id;
	priv->tx_wr.remote_qpn = qpn;
	priv->tx_wr.ah = address;

	if (head) {
		priv->tx_wr.mss = 0; /* XXX mb_shinfo(mb)->gso_size; */
		priv->tx_wr.header = head;
		priv->tx_wr.hlen = hlen;
		priv->tx_wr.wr.opcode = IB_WR_LSO;
	} else
		priv->tx_wr.wr.opcode = IB_WR_SEND;

	return ib_post_send(priv->qp, &priv->tx_wr.wr, &bad_wr);
}

void
ipoib_send(struct ipoib_dev_priv *priv, struct mbuf *mb,
    struct ipoib_ah *address, u32 qpn)
{
	if_t dev = priv->dev;
	struct ipoib_tx_buf *tx_req;
	int hlen;
	void *phead;

	if (unlikely(priv->tx_outstanding > MAX_SEND_CQE))
		while (ipoib_poll_tx(priv, false))
			; /* nothing */

	m_adj(mb, sizeof (struct ipoib_pseudoheader));
	if (0 /* XXX segment offload mb_is_gso(mb) */) {
		/* XXX hlen = mb_transport_offset(mb) + tcp_hdrlen(mb); */
		phead = mtod(mb, void *);
		if (mb->m_len < hlen) {
			ipoib_warn(priv, "linear data too small\n");
			if_inc_counter(dev, IFCOUNTER_OERRORS, 1);
			m_freem(mb);
			return;
		}
		m_adj(mb, hlen);
	} else {
		if (unlikely(mb->m_pkthdr.len - IPOIB_ENCAP_LEN > priv->mcast_mtu)) {
			ipoib_warn(priv, "packet len %d (> %d) too long to send, dropping\n",
				   mb->m_pkthdr.len, priv->mcast_mtu);
			if_inc_counter(dev, IFCOUNTER_OERRORS, 1);
			ipoib_cm_mb_too_long(priv, mb, priv->mcast_mtu);
			return;
		}
		phead = NULL;
		hlen = 0;
	}

	ipoib_dbg_data(priv, "sending packet, length=%d address=%p qpn=0x%06x\n",
		       mb->m_pkthdr.len, address, qpn);

	/*
	 * We put the mb into the tx_ring _before_ we call post_send()
	 * because it's entirely possible that the completion handler will
	 * run before we execute anything after the post_send().  That
	 * means we have to make sure everything is properly recorded and
	 * our state is consistent before we call post_send().
	 */
	tx_req = &priv->tx_ring[priv->tx_head & (ipoib_sendq_size - 1)];
	tx_req->mb = mb;
	if (unlikely(ipoib_dma_map_tx(priv->ca, tx_req, IPOIB_UD_TX_SG))) {
		if_inc_counter(dev, IFCOUNTER_OERRORS, 1);
		if (tx_req->mb)
			m_freem(tx_req->mb);
		return;
	}

	if (mb->m_pkthdr.csum_flags & (CSUM_IP|CSUM_TCP|CSUM_UDP))
		priv->tx_wr.wr.send_flags |= IB_SEND_IP_CSUM;
	else
		priv->tx_wr.wr.send_flags &= ~IB_SEND_IP_CSUM;

	if (++priv->tx_outstanding == ipoib_sendq_size) {
		ipoib_dbg(priv, "TX ring full, stopping kernel net queue\n");
		if (ib_req_notify_cq(priv->send_cq, IB_CQ_NEXT_COMP))
			ipoib_warn(priv, "request notify on send CQ failed\n");
		if_setdrvflagbits(dev, IFF_DRV_OACTIVE, 0);
	}

	if (unlikely(post_send(priv,
	    priv->tx_head & (ipoib_sendq_size - 1), address->ah, qpn,
	    tx_req, phead, hlen))) {
		ipoib_warn(priv, "post_send failed\n");
		if_inc_counter(dev, IFCOUNTER_OERRORS, 1);
		--priv->tx_outstanding;
		ipoib_dma_unmap_tx(priv->ca, tx_req);
		m_freem(mb);
		if (if_getdrvflags(dev) & IFF_DRV_OACTIVE)
			if_setdrvflagbits(dev, 0, IFF_DRV_OACTIVE);
	} else {
		address->last_send = priv->tx_head;
		++priv->tx_head;
	}
}

static void __ipoib_reap_ah(struct ipoib_dev_priv *priv)
{
	struct ipoib_ah *ah, *tah;
	LIST_HEAD(remove_list);
	unsigned long flags;

	spin_lock_irqsave(&priv->lock, flags);

	list_for_each_entry_safe(ah, tah, &priv->dead_ahs, list)
		if ((int) priv->tx_tail - (int) ah->last_send >= 0) {
			list_del(&ah->list);
			ib_destroy_ah(ah->ah, 0);
			kfree(ah);
		}

	spin_unlock_irqrestore(&priv->lock, flags);
}

void ipoib_reap_ah(struct work_struct *work)
{
	struct ipoib_dev_priv *priv =
		container_of(work, struct ipoib_dev_priv, ah_reap_task.work);

	__ipoib_reap_ah(priv);

	if (!test_bit(IPOIB_STOP_REAPER, &priv->flags))
		queue_delayed_work(ipoib_workqueue, &priv->ah_reap_task,
				   HZ);
}

static void ipoib_ah_dev_cleanup(struct ipoib_dev_priv *priv)
{
	unsigned long begin;

	begin = jiffies;

	while (!list_empty(&priv->dead_ahs)) {
		__ipoib_reap_ah(priv);

		if (time_after(jiffies, begin + HZ)) {
			ipoib_warn(priv, "timing out; will leak address handles\n");
			break;
		}

		msleep(1);
	}
}

static void ipoib_ib_tx_timer_func(unsigned long ctx)
{
	drain_tx_cq((struct ipoib_dev_priv *)ctx);
}

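/*
 * Bring the IB side of the interface up: resolve our P_Key index,
 * initialize the QP, post the receive ring, and start the connected-mode
 * and address-handle reaper machinery.
 */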
int ipoib_ib_dev_open(struct ipoib_dev_priv *priv)
{
	int ret;

	if (ib_find_pkey(priv->ca, priv->port, priv->pkey, &priv->pkey_index)) {
		ipoib_warn(priv, "P_Key 0x%04x not found\n", priv->pkey);
		clear_bit(IPOIB_PKEY_ASSIGNED, &priv->flags);
		return -1;
	}
	set_bit(IPOIB_PKEY_ASSIGNED, &priv->flags);

	ret = ipoib_init_qp(priv);
	if (ret) {
		ipoib_warn(priv, "ipoib_init_qp returned %d\n", ret);
		return -1;
	}

	ret = ipoib_ib_post_receives(priv);
	if (ret) {
		ipoib_warn(priv, "ipoib_ib_post_receives returned %d\n", ret);
		ipoib_ib_dev_stop(priv, 1);
		return -1;
	}

	ret = ipoib_cm_dev_open(priv);
	if (ret) {
		ipoib_warn(priv, "ipoib_cm_dev_open returned %d\n", ret);
		ipoib_ib_dev_stop(priv, 1);
		return -1;
	}

	clear_bit(IPOIB_STOP_REAPER, &priv->flags);
	queue_delayed_work(ipoib_workqueue, &priv->ah_reap_task, HZ);

	set_bit(IPOIB_FLAG_INITIALIZED, &priv->flags);

	return 0;
}

static void ipoib_pkey_dev_check_presence(struct ipoib_dev_priv *priv)
{
	u16 pkey_index = 0;

	if (ib_find_pkey(priv->ca, priv->port, priv->pkey, &pkey_index))
		clear_bit(IPOIB_PKEY_ASSIGNED, &priv->flags);
	else
		set_bit(IPOIB_PKEY_ASSIGNED, &priv->flags);
}

int ipoib_ib_dev_up(struct ipoib_dev_priv *priv)
{

	ipoib_pkey_dev_check_presence(priv);

	if (!test_bit(IPOIB_PKEY_ASSIGNED, &priv->flags)) {
		ipoib_dbg(priv, "PKEY is not assigned.\n");
		return 0;
	}

	set_bit(IPOIB_FLAG_OPER_UP, &priv->flags);

	return ipoib_mcast_start_thread(priv);
}

int ipoib_ib_dev_down(struct ipoib_dev_priv *priv, int flush)
{

	ipoib_dbg(priv, "downing ib_dev\n");

	clear_bit(IPOIB_FLAG_OPER_UP, &priv->flags);
	if_link_state_change(priv->dev, LINK_STATE_DOWN);

	/* Shut down the P_Key thread if it is still active */
	if (!test_bit(IPOIB_PKEY_ASSIGNED, &priv->flags)) {
		mutex_lock(&pkey_mutex);
		set_bit(IPOIB_PKEY_STOP, &priv->flags);
		cancel_delayed_work(&priv->pkey_poll_task);
		mutex_unlock(&pkey_mutex);
		if (flush)
			flush_workqueue(ipoib_workqueue);
	}

	ipoib_mcast_stop_thread(priv, flush);
	ipoib_mcast_dev_flush(priv);

	ipoib_flush_paths(priv);

	return 0;
}

static int recvs_pending(struct ipoib_dev_priv *priv)
{
	int pending = 0;
	int i;

	for (i = 0; i < ipoib_recvq_size; ++i)
		if (priv->rx_ring[i].mb)
			++pending;

	return pending;
}

static void check_qp_movement_and_print(struct ipoib_dev_priv *priv,
					struct ib_qp *qp,
					enum ib_qp_state new_state)
{
	struct ib_qp_attr qp_attr;
	struct ib_qp_init_attr query_init_attr;
	int ret;

	ret = ib_query_qp(qp, &qp_attr, IB_QP_STATE, &query_init_attr);
	if (ret) {
		ipoib_warn(priv, "%s: Failed to query QP (%d)\n", __func__, ret);
		return;
	}

	/* print according to the new state and the previous state */
	if (new_state == IB_QPS_ERR && qp_attr.qp_state == IB_QPS_RESET) {
		ipoib_dbg(priv, "Failed to modify QP %d->%d, acceptable\n",
			  qp_attr.qp_state, new_state);
	} else {
		ipoib_warn(priv, "Failed to modify QP %d->%d\n",
			   qp_attr.qp_state, new_state);
	}
}

void ipoib_drain_cq(struct ipoib_dev_priv *priv)
{
	int i, n;

	spin_lock(&priv->drain_lock);
	do {
		n = ib_poll_cq(priv->recv_cq, IPOIB_NUM_WC, priv->ibwc);
		for (i = 0; i < n; ++i) {
			/*
			 * Convert any successful completions to flush
			 * errors to avoid passing packets up the
			 * stack after bringing the device down.
			 */
			if (priv->ibwc[i].status == IB_WC_SUCCESS)
				priv->ibwc[i].status = IB_WC_WR_FLUSH_ERR;

			if ((priv->ibwc[i].wr_id & IPOIB_OP_RECV) == 0)
				panic("ipoib_drain_cq: Bad wrid 0x%jX\n",
				    (intmax_t)priv->ibwc[i].wr_id);
			if (priv->ibwc[i].wr_id & IPOIB_OP_CM)
				ipoib_cm_handle_rx_wc(priv, priv->ibwc + i);
			else
				ipoib_ib_handle_rx_wc(priv, priv->ibwc + i);
		}
	} while (n == IPOIB_NUM_WC);
	spin_unlock(&priv->drain_lock);

	spin_lock(&priv->lock);
	while (ipoib_poll_tx(priv, true))
		; /* nothing */

	spin_unlock(&priv->lock);
}

int ipoib_ib_dev_stop(struct ipoib_dev_priv *priv, int flush)
{
	struct ib_qp_attr qp_attr;
	unsigned long begin;
	struct ipoib_tx_buf *tx_req;
	int i;

	clear_bit(IPOIB_FLAG_INITIALIZED, &priv->flags);

	ipoib_cm_dev_stop(priv);

	/*
	 * Move our QP to the error state and then reinitialize it once all
	 * work requests have completed or have been flushed.
	 */
	qp_attr.qp_state = IB_QPS_ERR;
	if (ib_modify_qp(priv->qp, &qp_attr, IB_QP_STATE))
		check_qp_movement_and_print(priv, priv->qp, IB_QPS_ERR);

	/* Wait for all sends and receives to complete */
	begin = jiffies;

	while (priv->tx_head != priv->tx_tail || recvs_pending(priv)) {
		if (time_after(jiffies, begin + 5 * HZ)) {
			ipoib_warn(priv, "timing out; %d sends %d receives not completed\n",
				   priv->tx_head - priv->tx_tail, recvs_pending(priv));

			/*
			 * Assume the HW is wedged and just free up
			 * all our pending work requests.
			 */
			while ((int) priv->tx_tail - (int) priv->tx_head < 0) {
				tx_req = &priv->tx_ring[priv->tx_tail &
					 (ipoib_sendq_size - 1)];
				ipoib_dma_unmap_tx(priv->ca, tx_req);
				m_freem(tx_req->mb);
				++priv->tx_tail;
				--priv->tx_outstanding;
			}

			for (i = 0; i < ipoib_recvq_size; ++i) {
				struct ipoib_rx_buf *rx_req;

				rx_req = &priv->rx_ring[i];
				if (!rx_req->mb)
					continue;
				ipoib_dma_unmap_rx(priv, &priv->rx_ring[i]);
				m_freem(rx_req->mb);
				rx_req->mb = NULL;
			}

			goto timeout;
		}

		ipoib_drain_cq(priv);

		msleep(1);
	}

	ipoib_dbg(priv, "All sends and receives done.\n");

timeout:
	del_timer_sync(&priv->poll_timer);
	qp_attr.qp_state = IB_QPS_RESET;
	if (ib_modify_qp(priv->qp, &qp_attr, IB_QP_STATE))
		ipoib_warn(priv, "Failed to modify QP to RESET state\n");

	/* Wait for all AHs to be reaped */
	set_bit(IPOIB_STOP_REAPER, &priv->flags);
	cancel_delayed_work(&priv->ah_reap_task);
	if (flush)
		flush_workqueue(ipoib_workqueue);

	ipoib_ah_dev_cleanup(priv);

	ib_req_notify_cq(priv->recv_cq, IB_CQ_NEXT_COMP);

	return 0;
}

int ipoib_ib_dev_init(struct ipoib_dev_priv *priv, struct ib_device *ca, int port)
{
	if_t dev = priv->dev;

	priv->ca = ca;
	priv->port = port;
	priv->qp = NULL;

	if (ipoib_transport_dev_init(priv, ca)) {
		printk(KERN_WARNING "%s: ipoib_transport_dev_init failed\n", ca->name);
		return -ENODEV;
	}

	setup_timer(&priv->poll_timer, ipoib_ib_tx_timer_func,
		    (unsigned long) priv);

	if (if_getflags(dev) & IFF_UP) {
		if (ipoib_ib_dev_open(priv)) {
			ipoib_transport_dev_cleanup(priv);
			return -ENODEV;
		}
	}

	return 0;
}

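/*
 * Common body of the light/normal/heavy flush handlers, applied to this
 * interface and, recursively, to its children.  A light flush invalidates
 * paths and multicast state, a normal flush also brings the IB device down,
 * and a heavy flush additionally re-checks the P_Key and restarts the QP.
 */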
static void __ipoib_ib_dev_flush(struct ipoib_dev_priv *priv,
				 enum ipoib_flush_level level)
{
	struct ipoib_dev_priv *cpriv;
	u16 new_index;

	mutex_lock(&priv->vlan_mutex);

	/*
	 * Flush any child interfaces too -- they might be up even if
	 * the parent is down.
	 */
	list_for_each_entry(cpriv, &priv->child_intfs, list)
		__ipoib_ib_dev_flush(cpriv, level);

	mutex_unlock(&priv->vlan_mutex);

	if (!test_bit(IPOIB_FLAG_INITIALIZED, &priv->flags)) {
		ipoib_dbg(priv, "Not flushing - IPOIB_FLAG_INITIALIZED not set.\n");
		return;
	}

	if (!test_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags)) {
		ipoib_dbg(priv, "Not flushing - IPOIB_FLAG_ADMIN_UP not set.\n");
		return;
	}

	if (level == IPOIB_FLUSH_HEAVY) {
		if (ib_find_pkey(priv->ca, priv->port, priv->pkey, &new_index)) {
			clear_bit(IPOIB_PKEY_ASSIGNED, &priv->flags);
			ipoib_ib_dev_down(priv, 0);
			ipoib_ib_dev_stop(priv, 0);
			if (ipoib_pkey_dev_delay_open(priv))
				return;
		}

		/* restart QP only if P_Key index is changed */
		if (test_and_set_bit(IPOIB_PKEY_ASSIGNED, &priv->flags) &&
		    new_index == priv->pkey_index) {
			ipoib_dbg(priv, "Not flushing - P_Key index not changed.\n");
			return;
		}
		priv->pkey_index = new_index;
	}

	if (level == IPOIB_FLUSH_LIGHT) {
		ipoib_mark_paths_invalid(priv);
		ipoib_mcast_dev_flush(priv);
	}

	if (level >= IPOIB_FLUSH_NORMAL)
		ipoib_ib_dev_down(priv, 0);

	if (level == IPOIB_FLUSH_HEAVY) {
		ipoib_ib_dev_stop(priv, 0);
		ipoib_ib_dev_open(priv);
	}

	/*
	 * The device could have been brought down between the start and when
	 * we get here; don't bring it back up if it's not configured up.
	 */
	if (test_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags)) {
		if (level >= IPOIB_FLUSH_NORMAL)
			ipoib_ib_dev_up(priv);
		ipoib_mcast_restart_task(&priv->restart_task);
	}
}

void ipoib_ib_dev_flush_light(struct work_struct *work)
{
	struct ipoib_dev_priv *priv =
		container_of(work, struct ipoib_dev_priv, flush_light);

	__ipoib_ib_dev_flush(priv, IPOIB_FLUSH_LIGHT);
}

void ipoib_ib_dev_flush_normal(struct work_struct *work)
{
	struct ipoib_dev_priv *priv =
		container_of(work, struct ipoib_dev_priv, flush_normal);

	__ipoib_ib_dev_flush(priv, IPOIB_FLUSH_NORMAL);
}

void ipoib_ib_dev_flush_heavy(struct work_struct *work)
{
	struct ipoib_dev_priv *priv =
		container_of(work, struct ipoib_dev_priv, flush_heavy);

	__ipoib_ib_dev_flush(priv, IPOIB_FLUSH_HEAVY);
}

void ipoib_ib_dev_cleanup(struct ipoib_dev_priv *priv)
{

	ipoib_dbg(priv, "cleaning up ib_dev\n");

	ipoib_mcast_stop_thread(priv, 1);
	ipoib_mcast_dev_flush(priv);

	ipoib_ah_dev_cleanup(priv);
	ipoib_transport_dev_cleanup(priv);
}

/*
 * Delayed P_Key Assignment Interim Support
 *
 * The following is an initial implementation of the delayed P_Key
 * assignment mechanism.  It uses the same approach implemented for the
 * multicast group join.  The single goal of this implementation is to
 * quickly address Bug #2507.  This implementation will probably be
 * removed when the P_Key change async notification is available.
 */

void ipoib_pkey_poll(struct work_struct *work)
{
	struct ipoib_dev_priv *priv =
		container_of(work, struct ipoib_dev_priv, pkey_poll_task.work);

	ipoib_pkey_dev_check_presence(priv);

	if (test_bit(IPOIB_PKEY_ASSIGNED, &priv->flags))
		ipoib_open(priv);
	else {
		mutex_lock(&pkey_mutex);
		if (!test_bit(IPOIB_PKEY_STOP, &priv->flags))
			queue_delayed_work(ipoib_workqueue,
					   &priv->pkey_poll_task,
					   HZ);
		mutex_unlock(&pkey_mutex);
	}
}

int ipoib_pkey_dev_delay_open(struct ipoib_dev_priv *priv)
{

	/* Look for the interface pkey value in the IB Port P_Key table and */
	/* set the interface pkey assignment flag */
	ipoib_pkey_dev_check_presence(priv);

	/* P_Key value not assigned yet - start polling */
	if (!test_bit(IPOIB_PKEY_ASSIGNED, &priv->flags)) {
		mutex_lock(&pkey_mutex);
		clear_bit(IPOIB_PKEY_STOP, &priv->flags);
		queue_delayed_work(ipoib_workqueue,
				   &priv->pkey_poll_task,
				   HZ);
		mutex_unlock(&pkey_mutex);
		return 1;
	}

	return 0;
}