1 // SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause 2 3 /* Authors: Bernard Metzler <bmt@zurich.ibm.com> */ 4 /* Copyright (c) 2008-2019, IBM Corporation */ 5 6 #include <linux/errno.h> 7 #include <linux/types.h> 8 #include <linux/net.h> 9 #include <linux/scatterlist.h> 10 #include <linux/highmem.h> 11 #include <net/tcp.h> 12 13 #include <rdma/iw_cm.h> 14 #include <rdma/ib_verbs.h> 15 #include <rdma/ib_user_verbs.h> 16 17 #include "siw.h" 18 #include "siw_verbs.h" 19 #include "siw_mem.h" 20 21 #define MAX_HDR_INLINE \ 22 (((uint32_t)(sizeof(struct siw_rreq_pkt) - \ 23 sizeof(struct iwarp_send))) & 0xF8) 24 25 static struct page *siw_get_pblpage(struct siw_mem *mem, u64 addr, int *idx) 26 { 27 struct siw_pbl *pbl = mem->pbl; 28 u64 offset = addr - mem->va; 29 dma_addr_t paddr = siw_pbl_get_buffer(pbl, offset, NULL, idx); 30 31 if (paddr) 32 return ib_virt_dma_to_page(paddr); 33 34 return NULL; 35 } 36 37 static struct page *siw_get_page(struct siw_mem *mem, struct siw_sge *sge, 38 unsigned long offset, int *pbl_idx) 39 { 40 if (!mem->is_pbl) 41 return siw_get_upage(mem->umem, sge->laddr + offset); 42 else 43 return siw_get_pblpage(mem, sge->laddr + offset, pbl_idx); 44 } 45 46 /* 47 * Copy short payload at provided destination payload address 48 */ 49 static int siw_try_1seg(struct siw_iwarp_tx *c_tx, void *paddr) 50 { 51 struct siw_wqe *wqe = &c_tx->wqe_active; 52 struct siw_sge *sge = &wqe->sqe.sge[0]; 53 u32 bytes = sge->length; 54 55 if (bytes > MAX_HDR_INLINE || wqe->sqe.num_sge != 1) 56 return MAX_HDR_INLINE + 1; 57 58 if (!bytes) 59 return 0; 60 61 if (tx_flags(wqe) & SIW_WQE_INLINE) { 62 memcpy(paddr, &wqe->sqe.sge[1], bytes); 63 } else { 64 struct siw_mem *mem = wqe->mem[0]; 65 66 if (!mem->mem_obj) { 67 /* Kernel client using kva */ 68 memcpy(paddr, ib_virt_dma_to_ptr(sge->laddr), bytes); 69 } else if (c_tx->in_syscall) { 70 if (copy_from_user(paddr, u64_to_user_ptr(sge->laddr), 71 bytes)) 72 return -EFAULT; 73 } else { 74 unsigned int off = sge->laddr & ~PAGE_MASK; 75 struct page *p; 76 char *buffer; 77 int pbl_idx = 0; 78 79 p = siw_get_page(mem, sge, 0, &pbl_idx); 80 if (unlikely(!p)) 81 return -EFAULT; 82 83 buffer = kmap_local_page(p); 84 85 if (likely(PAGE_SIZE - off >= bytes)) { 86 memcpy(paddr, buffer + off, bytes); 87 } else { 88 unsigned long part = bytes - (PAGE_SIZE - off); 89 90 memcpy(paddr, buffer + off, part); 91 kunmap_local(buffer); 92 93 p = siw_get_page(mem, sge, part, &pbl_idx); 94 if (unlikely(!p)) 95 return -EFAULT; 96 97 buffer = kmap_local_page(p); 98 memcpy(paddr + part, buffer, bytes - part); 99 } 100 kunmap_local(buffer); 101 } 102 } 103 return (int)bytes; 104 } 105 106 #define PKT_FRAGMENTED 1 107 #define PKT_COMPLETE 0 108 109 /* 110 * siw_qp_prepare_tx() 111 * 112 * Prepare tx state for sending out one fpdu. Builds complete pkt 113 * if no user data or only immediate data are present. 114 * 115 * returns PKT_COMPLETE if complete pkt built, PKT_FRAGMENTED otherwise. 116 */ 117 static int siw_qp_prepare_tx(struct siw_iwarp_tx *c_tx) 118 { 119 struct siw_wqe *wqe = &c_tx->wqe_active; 120 char *crc = NULL; 121 int data = 0; 122 123 switch (tx_type(wqe)) { 124 case SIW_OP_READ: 125 case SIW_OP_READ_LOCAL_INV: 126 memcpy(&c_tx->pkt.ctrl, 127 &iwarp_pktinfo[RDMAP_RDMA_READ_REQ].ctrl, 128 sizeof(struct iwarp_ctrl)); 129 130 c_tx->pkt.rreq.rsvd = 0; 131 c_tx->pkt.rreq.ddp_qn = htonl(RDMAP_UNTAGGED_QN_RDMA_READ); 132 c_tx->pkt.rreq.ddp_msn = 133 htonl(++c_tx->ddp_msn[RDMAP_UNTAGGED_QN_RDMA_READ]); 134 c_tx->pkt.rreq.ddp_mo = 0; 135 c_tx->pkt.rreq.sink_stag = htonl(wqe->sqe.sge[0].lkey); 136 c_tx->pkt.rreq.sink_to = 137 cpu_to_be64(wqe->sqe.sge[0].laddr); 138 c_tx->pkt.rreq.source_stag = htonl(wqe->sqe.rkey); 139 c_tx->pkt.rreq.source_to = cpu_to_be64(wqe->sqe.raddr); 140 c_tx->pkt.rreq.read_size = htonl(wqe->sqe.sge[0].length); 141 142 c_tx->ctrl_len = sizeof(struct iwarp_rdma_rreq); 143 crc = (char *)&c_tx->pkt.rreq_pkt.crc; 144 break; 145 146 case SIW_OP_SEND: 147 if (tx_flags(wqe) & SIW_WQE_SOLICITED) 148 memcpy(&c_tx->pkt.ctrl, 149 &iwarp_pktinfo[RDMAP_SEND_SE].ctrl, 150 sizeof(struct iwarp_ctrl)); 151 else 152 memcpy(&c_tx->pkt.ctrl, &iwarp_pktinfo[RDMAP_SEND].ctrl, 153 sizeof(struct iwarp_ctrl)); 154 155 c_tx->pkt.send.ddp_qn = RDMAP_UNTAGGED_QN_SEND; 156 c_tx->pkt.send.ddp_msn = 157 htonl(++c_tx->ddp_msn[RDMAP_UNTAGGED_QN_SEND]); 158 c_tx->pkt.send.ddp_mo = 0; 159 160 c_tx->pkt.send_inv.inval_stag = 0; 161 162 c_tx->ctrl_len = sizeof(struct iwarp_send); 163 164 crc = (char *)&c_tx->pkt.send_pkt.crc; 165 data = siw_try_1seg(c_tx, crc); 166 break; 167 168 case SIW_OP_SEND_REMOTE_INV: 169 if (tx_flags(wqe) & SIW_WQE_SOLICITED) 170 memcpy(&c_tx->pkt.ctrl, 171 &iwarp_pktinfo[RDMAP_SEND_SE_INVAL].ctrl, 172 sizeof(struct iwarp_ctrl)); 173 else 174 memcpy(&c_tx->pkt.ctrl, 175 &iwarp_pktinfo[RDMAP_SEND_INVAL].ctrl, 176 sizeof(struct iwarp_ctrl)); 177 178 c_tx->pkt.send.ddp_qn = RDMAP_UNTAGGED_QN_SEND; 179 c_tx->pkt.send.ddp_msn = 180 htonl(++c_tx->ddp_msn[RDMAP_UNTAGGED_QN_SEND]); 181 c_tx->pkt.send.ddp_mo = 0; 182 183 c_tx->pkt.send_inv.inval_stag = cpu_to_be32(wqe->sqe.rkey); 184 185 c_tx->ctrl_len = sizeof(struct iwarp_send_inv); 186 187 crc = (char *)&c_tx->pkt.send_pkt.crc; 188 data = siw_try_1seg(c_tx, crc); 189 break; 190 191 case SIW_OP_WRITE: 192 memcpy(&c_tx->pkt.ctrl, &iwarp_pktinfo[RDMAP_RDMA_WRITE].ctrl, 193 sizeof(struct iwarp_ctrl)); 194 195 c_tx->pkt.rwrite.sink_stag = htonl(wqe->sqe.rkey); 196 c_tx->pkt.rwrite.sink_to = cpu_to_be64(wqe->sqe.raddr); 197 c_tx->ctrl_len = sizeof(struct iwarp_rdma_write); 198 199 crc = (char *)&c_tx->pkt.write_pkt.crc; 200 data = siw_try_1seg(c_tx, crc); 201 break; 202 203 case SIW_OP_READ_RESPONSE: 204 memcpy(&c_tx->pkt.ctrl, 205 &iwarp_pktinfo[RDMAP_RDMA_READ_RESP].ctrl, 206 sizeof(struct iwarp_ctrl)); 207 208 /* NBO */ 209 c_tx->pkt.rresp.sink_stag = cpu_to_be32(wqe->sqe.rkey); 210 c_tx->pkt.rresp.sink_to = cpu_to_be64(wqe->sqe.raddr); 211 212 c_tx->ctrl_len = sizeof(struct iwarp_rdma_rresp); 213 214 crc = (char *)&c_tx->pkt.write_pkt.crc; 215 data = siw_try_1seg(c_tx, crc); 216 break; 217 218 default: 219 siw_dbg_qp(tx_qp(c_tx), "stale wqe type %d\n", tx_type(wqe)); 220 return -EOPNOTSUPP; 221 } 222 if (unlikely(data < 0)) 223 return data; 224 225 c_tx->ctrl_sent = 0; 226 227 if (data <= MAX_HDR_INLINE) { 228 if (data) { 229 wqe->processed = data; 230 231 c_tx->pkt.ctrl.mpa_len = 232 htons(c_tx->ctrl_len + data - MPA_HDR_SIZE); 233 234 /* Add pad, if needed */ 235 data += -(int)data & 0x3; 236 /* advance CRC location after payload */ 237 crc += data; 238 c_tx->ctrl_len += data; 239 240 if (!(c_tx->pkt.ctrl.ddp_rdmap_ctrl & DDP_FLAG_TAGGED)) 241 c_tx->pkt.c_untagged.ddp_mo = 0; 242 else 243 c_tx->pkt.c_tagged.ddp_to = 244 cpu_to_be64(wqe->sqe.raddr); 245 } 246 247 *(u32 *)crc = 0; 248 /* 249 * Do complete CRC if enabled and short packet 250 */ 251 if (c_tx->mpa_crc_enabled) 252 siw_crc_oneshot(&c_tx->pkt, c_tx->ctrl_len, (u8 *)crc); 253 c_tx->ctrl_len += MPA_CRC_SIZE; 254 255 return PKT_COMPLETE; 256 } 257 c_tx->ctrl_len += MPA_CRC_SIZE; 258 c_tx->sge_idx = 0; 259 c_tx->sge_off = 0; 260 c_tx->pbl_idx = 0; 261 262 /* 263 * Allow direct sending out of user buffer if WR is non signalled 264 * and payload is over threshold. 265 * Per RDMA verbs, the application should not change the send buffer 266 * until the work completed. In iWarp, work completion is only 267 * local delivery to TCP. TCP may reuse the buffer for 268 * retransmission. Changing unsent data also breaks the CRC, 269 * if applied. 270 */ 271 if (c_tx->zcopy_tx && wqe->bytes >= SENDPAGE_THRESH && 272 !(tx_flags(wqe) & SIW_WQE_SIGNALLED)) 273 c_tx->use_sendpage = 1; 274 else 275 c_tx->use_sendpage = 0; 276 277 return PKT_FRAGMENTED; 278 } 279 280 /* 281 * Send out one complete control type FPDU, or header of FPDU carrying 282 * data. Used for fixed sized packets like Read.Requests or zero length 283 * SENDs, WRITEs, READ.Responses, or header only. 284 */ 285 static int siw_tx_ctrl(struct siw_iwarp_tx *c_tx, struct socket *s, 286 int flags) 287 { 288 struct msghdr msg = { .msg_flags = flags }; 289 struct kvec iov = { .iov_base = 290 (char *)&c_tx->pkt.ctrl + c_tx->ctrl_sent, 291 .iov_len = c_tx->ctrl_len - c_tx->ctrl_sent }; 292 293 int rv = kernel_sendmsg(s, &msg, &iov, 1, iov.iov_len); 294 295 if (rv >= 0) { 296 c_tx->ctrl_sent += rv; 297 298 if (c_tx->ctrl_sent == c_tx->ctrl_len) 299 rv = 0; 300 else 301 rv = -EAGAIN; 302 } 303 return rv; 304 } 305 306 /* 307 * 0copy TCP transmit interface: Use MSG_SPLICE_PAGES. 308 * 309 * Using sendpage to push page by page appears to be less efficient 310 * than using sendmsg, even if data are copied. 311 * 312 * A general performance limitation might be the extra four bytes 313 * trailer checksum segment to be pushed after user data. 314 */ 315 static int siw_tcp_sendpages(struct socket *s, struct page **page, int offset, 316 size_t size) 317 { 318 struct bio_vec bvec; 319 struct msghdr msg = { 320 .msg_flags = (MSG_MORE | MSG_DONTWAIT | MSG_SPLICE_PAGES), 321 }; 322 struct sock *sk = s->sk; 323 int i = 0, rv = 0, sent = 0; 324 325 while (size) { 326 size_t bytes = min_t(size_t, PAGE_SIZE - offset, size); 327 328 if (size + offset <= PAGE_SIZE) 329 msg.msg_flags &= ~MSG_MORE; 330 331 tcp_rate_check_app_limited(sk); 332 if (!sendpage_ok(page[i])) 333 msg.msg_flags &= ~MSG_SPLICE_PAGES; 334 bvec_set_page(&bvec, page[i], bytes, offset); 335 iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, &bvec, 1, size); 336 337 try_page_again: 338 lock_sock(sk); 339 rv = tcp_sendmsg_locked(sk, &msg, size); 340 release_sock(sk); 341 342 if (rv > 0) { 343 size -= rv; 344 sent += rv; 345 if (rv != bytes) { 346 offset += rv; 347 bytes -= rv; 348 goto try_page_again; 349 } 350 offset = 0; 351 } else { 352 if (rv == -EAGAIN || rv == 0) 353 break; 354 return rv; 355 } 356 i++; 357 } 358 return sent; 359 } 360 361 /* 362 * siw_0copy_tx() 363 * 364 * Pushes list of pages to TCP socket. If pages from multiple 365 * SGE's, all referenced pages of each SGE are pushed in one 366 * shot. 367 */ 368 static int siw_0copy_tx(struct socket *s, struct page **page, 369 struct siw_sge *sge, unsigned int offset, 370 unsigned int size) 371 { 372 int i = 0, sent = 0, rv; 373 int sge_bytes = min(sge->length - offset, size); 374 375 offset = (sge->laddr + offset) & ~PAGE_MASK; 376 377 while (sent != size) { 378 rv = siw_tcp_sendpages(s, &page[i], offset, sge_bytes); 379 if (rv >= 0) { 380 sent += rv; 381 if (size == sent || sge_bytes > rv) 382 break; 383 384 i += PAGE_ALIGN(sge_bytes + offset) >> PAGE_SHIFT; 385 sge++; 386 sge_bytes = min(sge->length, size - sent); 387 offset = sge->laddr & ~PAGE_MASK; 388 } else { 389 sent = rv; 390 break; 391 } 392 } 393 return sent; 394 } 395 396 #define MAX_TRAILER (MPA_CRC_SIZE + 4) 397 398 static void siw_unmap_pages(struct kvec *iov, unsigned long kmap_mask, int len) 399 { 400 int i; 401 402 /* 403 * Work backwards through the array to honor the kmap_local_page() 404 * ordering requirements. 405 */ 406 for (i = (len-1); i >= 0; i--) { 407 if (kmap_mask & BIT(i)) { 408 unsigned long addr = (unsigned long)iov[i].iov_base; 409 410 kunmap_local((void *)(addr & PAGE_MASK)); 411 } 412 } 413 } 414 415 /* 416 * siw_tx_hdt() tries to push a complete packet to TCP where all 417 * packet fragments are referenced by the elements of one iovec. 418 * For the data portion, each involved page must be referenced by 419 * one extra element. All sge's data can be non-aligned to page 420 * boundaries. Two more elements are referencing iWARP header 421 * and trailer: 422 * MAX_ARRAY = 64KB/PAGE_SIZE + 1 + (2 * (SIW_MAX_SGE - 1) + HDR + TRL 423 */ 424 #define MAX_ARRAY ((0xffff / PAGE_SIZE) + 1 + (2 * (SIW_MAX_SGE - 1) + 2)) 425 426 /* 427 * Write out iov referencing hdr, data and trailer of current FPDU. 428 * Update transmit state dependent on write return status 429 */ 430 static int siw_tx_hdt(struct siw_iwarp_tx *c_tx, struct socket *s) 431 { 432 struct siw_wqe *wqe = &c_tx->wqe_active; 433 struct siw_sge *sge = &wqe->sqe.sge[c_tx->sge_idx]; 434 struct kvec iov[MAX_ARRAY]; 435 struct page *page_array[MAX_ARRAY]; 436 struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_EOR }; 437 438 int seg = 0, do_crc = c_tx->do_crc, is_kva = 0, rv; 439 unsigned int data_len = c_tx->bytes_unsent, hdr_len = 0, trl_len = 0, 440 sge_off = c_tx->sge_off, sge_idx = c_tx->sge_idx, 441 pbl_idx = c_tx->pbl_idx; 442 unsigned long kmap_mask = 0L; 443 444 if (c_tx->state == SIW_SEND_HDR) { 445 if (c_tx->use_sendpage) { 446 rv = siw_tx_ctrl(c_tx, s, MSG_DONTWAIT | MSG_MORE); 447 if (rv) 448 goto done; 449 450 c_tx->state = SIW_SEND_DATA; 451 } else { 452 iov[0].iov_base = 453 (char *)&c_tx->pkt.ctrl + c_tx->ctrl_sent; 454 iov[0].iov_len = hdr_len = 455 c_tx->ctrl_len - c_tx->ctrl_sent; 456 seg = 1; 457 } 458 } 459 460 wqe->processed += data_len; 461 462 while (data_len) { /* walk the list of SGE's */ 463 unsigned int sge_len = min(sge->length - sge_off, data_len); 464 unsigned int fp_off = (sge->laddr + sge_off) & ~PAGE_MASK; 465 struct siw_mem *mem; 466 467 if (!(tx_flags(wqe) & SIW_WQE_INLINE)) { 468 mem = wqe->mem[sge_idx]; 469 is_kva = mem->mem_obj == NULL ? 1 : 0; 470 } else { 471 is_kva = 1; 472 } 473 if (is_kva && !c_tx->use_sendpage) { 474 /* 475 * tx from kernel virtual address: either inline data 476 * or memory region with assigned kernel buffer 477 */ 478 iov[seg].iov_base = 479 ib_virt_dma_to_ptr(sge->laddr + sge_off); 480 iov[seg].iov_len = sge_len; 481 482 if (do_crc) 483 siw_crc_update(&c_tx->mpa_crc, 484 iov[seg].iov_base, sge_len); 485 sge_off += sge_len; 486 data_len -= sge_len; 487 seg++; 488 goto sge_done; 489 } 490 491 while (sge_len) { 492 size_t plen = min((int)PAGE_SIZE - fp_off, sge_len); 493 void *kaddr; 494 495 if (!is_kva) { 496 struct page *p; 497 498 p = siw_get_page(mem, sge, sge_off, &pbl_idx); 499 if (unlikely(!p)) { 500 siw_unmap_pages(iov, kmap_mask, seg); 501 wqe->processed -= c_tx->bytes_unsent; 502 rv = -EFAULT; 503 goto done_crc; 504 } 505 page_array[seg] = p; 506 507 if (!c_tx->use_sendpage) { 508 void *kaddr = kmap_local_page(p); 509 510 /* Remember for later kunmap() */ 511 kmap_mask |= BIT(seg); 512 iov[seg].iov_base = kaddr + fp_off; 513 iov[seg].iov_len = plen; 514 515 if (do_crc) 516 siw_crc_update( 517 &c_tx->mpa_crc, 518 iov[seg].iov_base, 519 plen); 520 } else if (do_crc) { 521 kaddr = kmap_local_page(p); 522 siw_crc_update(&c_tx->mpa_crc, 523 kaddr + fp_off, plen); 524 kunmap_local(kaddr); 525 } 526 } else { 527 /* 528 * Cast to an uintptr_t to preserve all 64 bits 529 * in sge->laddr. 530 */ 531 u64 va = sge->laddr + sge_off; 532 533 page_array[seg] = ib_virt_dma_to_page(va); 534 if (do_crc) 535 siw_crc_update(&c_tx->mpa_crc, 536 ib_virt_dma_to_ptr(va), 537 plen); 538 } 539 540 sge_len -= plen; 541 sge_off += plen; 542 data_len -= plen; 543 fp_off = 0; 544 545 if (++seg >= (int)MAX_ARRAY) { 546 siw_dbg_qp(tx_qp(c_tx), "to many fragments\n"); 547 siw_unmap_pages(iov, kmap_mask, seg-1); 548 wqe->processed -= c_tx->bytes_unsent; 549 rv = -EMSGSIZE; 550 goto done_crc; 551 } 552 } 553 sge_done: 554 /* Update SGE variables at end of SGE */ 555 if (sge_off == sge->length && 556 (data_len != 0 || wqe->processed < wqe->bytes)) { 557 sge_idx++; 558 sge++; 559 sge_off = 0; 560 } 561 } 562 /* trailer */ 563 if (likely(c_tx->state != SIW_SEND_TRAILER)) { 564 iov[seg].iov_base = &c_tx->trailer.pad[4 - c_tx->pad]; 565 iov[seg].iov_len = trl_len = MAX_TRAILER - (4 - c_tx->pad); 566 } else { 567 iov[seg].iov_base = &c_tx->trailer.pad[c_tx->ctrl_sent]; 568 iov[seg].iov_len = trl_len = MAX_TRAILER - c_tx->ctrl_sent; 569 } 570 571 if (c_tx->pad) { 572 *(u32 *)c_tx->trailer.pad = 0; 573 if (do_crc) 574 siw_crc_update(&c_tx->mpa_crc, 575 (u8 *)&c_tx->trailer.crc - c_tx->pad, 576 c_tx->pad); 577 } 578 if (!c_tx->mpa_crc_enabled) 579 c_tx->trailer.crc = 0; 580 else if (do_crc) 581 siw_crc_final(&c_tx->mpa_crc, (u8 *)&c_tx->trailer.crc); 582 583 data_len = c_tx->bytes_unsent; 584 585 if (c_tx->use_sendpage) { 586 rv = siw_0copy_tx(s, page_array, &wqe->sqe.sge[c_tx->sge_idx], 587 c_tx->sge_off, data_len); 588 if (rv == data_len) { 589 rv = kernel_sendmsg(s, &msg, &iov[seg], 1, trl_len); 590 if (rv > 0) 591 rv += data_len; 592 else 593 rv = data_len; 594 } 595 } else { 596 rv = kernel_sendmsg(s, &msg, iov, seg + 1, 597 hdr_len + data_len + trl_len); 598 siw_unmap_pages(iov, kmap_mask, seg); 599 } 600 if (rv < (int)hdr_len) { 601 /* Not even complete hdr pushed or negative rv */ 602 wqe->processed -= data_len; 603 if (rv >= 0) { 604 c_tx->ctrl_sent += rv; 605 rv = -EAGAIN; 606 } 607 goto done_crc; 608 } 609 rv -= hdr_len; 610 611 if (rv >= (int)data_len) { 612 /* all user data pushed to TCP or no data to push */ 613 if (data_len > 0 && wqe->processed < wqe->bytes) { 614 /* Save the current state for next tx */ 615 c_tx->sge_idx = sge_idx; 616 c_tx->sge_off = sge_off; 617 c_tx->pbl_idx = pbl_idx; 618 } 619 rv -= data_len; 620 621 if (rv == trl_len) /* all pushed */ 622 rv = 0; 623 else { 624 c_tx->state = SIW_SEND_TRAILER; 625 c_tx->ctrl_len = MAX_TRAILER; 626 c_tx->ctrl_sent = rv + 4 - c_tx->pad; 627 c_tx->bytes_unsent = 0; 628 rv = -EAGAIN; 629 } 630 631 } else if (data_len > 0) { 632 /* Maybe some user data pushed to TCP */ 633 c_tx->state = SIW_SEND_DATA; 634 wqe->processed -= data_len - rv; 635 636 if (rv) { 637 /* 638 * Some bytes out. Recompute tx state based 639 * on old state and bytes pushed 640 */ 641 unsigned int sge_unsent; 642 643 c_tx->bytes_unsent -= rv; 644 sge = &wqe->sqe.sge[c_tx->sge_idx]; 645 sge_unsent = sge->length - c_tx->sge_off; 646 647 while (sge_unsent <= rv) { 648 rv -= sge_unsent; 649 c_tx->sge_idx++; 650 c_tx->sge_off = 0; 651 sge++; 652 sge_unsent = sge->length; 653 } 654 c_tx->sge_off += rv; 655 } 656 rv = -EAGAIN; 657 } 658 done_crc: 659 c_tx->do_crc = 0; 660 done: 661 return rv; 662 } 663 664 static void siw_update_tcpseg(struct siw_iwarp_tx *c_tx, 665 struct socket *s) 666 { 667 struct tcp_sock *tp = tcp_sk(s->sk); 668 669 if (tp->gso_segs) { 670 if (c_tx->gso_seg_limit == 0) 671 c_tx->tcp_seglen = tp->mss_cache * tp->gso_segs; 672 else 673 c_tx->tcp_seglen = 674 tp->mss_cache * 675 min_t(u16, c_tx->gso_seg_limit, tp->gso_segs); 676 } else { 677 c_tx->tcp_seglen = tp->mss_cache; 678 } 679 /* Loopback may give odd numbers */ 680 c_tx->tcp_seglen &= 0xfffffff8; 681 } 682 683 /* 684 * siw_prepare_fpdu() 685 * 686 * Prepares transmit context to send out one FPDU if FPDU will contain 687 * user data and user data are not immediate data. 688 * Computes maximum FPDU length to fill up TCP MSS if possible. 689 * 690 * @qp: QP from which to transmit 691 * @wqe: Current WQE causing transmission 692 * 693 * TODO: Take into account real available sendspace on socket 694 * to avoid header misalignment due to send pausing within 695 * fpdu transmission 696 */ 697 static void siw_prepare_fpdu(struct siw_qp *qp, struct siw_wqe *wqe) 698 { 699 struct siw_iwarp_tx *c_tx = &qp->tx_ctx; 700 int data_len; 701 702 c_tx->ctrl_len = 703 iwarp_pktinfo[__rdmap_get_opcode(&c_tx->pkt.ctrl)].hdr_len; 704 c_tx->ctrl_sent = 0; 705 706 /* 707 * Update target buffer offset if any 708 */ 709 if (!(c_tx->pkt.ctrl.ddp_rdmap_ctrl & DDP_FLAG_TAGGED)) 710 /* Untagged message */ 711 c_tx->pkt.c_untagged.ddp_mo = cpu_to_be32(wqe->processed); 712 else /* Tagged message */ 713 c_tx->pkt.c_tagged.ddp_to = 714 cpu_to_be64(wqe->sqe.raddr + wqe->processed); 715 716 data_len = wqe->bytes - wqe->processed; 717 if (data_len + c_tx->ctrl_len + MPA_CRC_SIZE > c_tx->tcp_seglen) { 718 /* Trim DDP payload to fit into current TCP segment */ 719 data_len = c_tx->tcp_seglen - (c_tx->ctrl_len + MPA_CRC_SIZE); 720 c_tx->pkt.ctrl.ddp_rdmap_ctrl &= ~DDP_FLAG_LAST; 721 c_tx->pad = 0; 722 } else { 723 c_tx->pkt.ctrl.ddp_rdmap_ctrl |= DDP_FLAG_LAST; 724 c_tx->pad = -data_len & 0x3; 725 } 726 c_tx->bytes_unsent = data_len; 727 728 c_tx->pkt.ctrl.mpa_len = 729 htons(c_tx->ctrl_len + data_len - MPA_HDR_SIZE); 730 731 /* 732 * Init MPA CRC computation 733 */ 734 if (c_tx->mpa_crc_enabled) { 735 siw_crc_init(&c_tx->mpa_crc); 736 siw_crc_update(&c_tx->mpa_crc, &c_tx->pkt, c_tx->ctrl_len); 737 c_tx->do_crc = 1; 738 } 739 } 740 741 /* 742 * siw_check_sgl_tx() 743 * 744 * Check permissions for a list of SGE's (SGL). 745 * A successful check will have all memory referenced 746 * for transmission resolved and assigned to the WQE. 747 * 748 * @pd: Protection Domain SGL should belong to 749 * @wqe: WQE to be checked 750 * @perms: requested access permissions 751 * 752 */ 753 754 static int siw_check_sgl_tx(struct ib_pd *pd, struct siw_wqe *wqe, 755 enum ib_access_flags perms) 756 { 757 struct siw_sge *sge = &wqe->sqe.sge[0]; 758 int i, len, num_sge = wqe->sqe.num_sge; 759 760 if (unlikely(num_sge > SIW_MAX_SGE)) 761 return -EINVAL; 762 763 for (i = 0, len = 0; num_sge; num_sge--, i++, sge++) { 764 /* 765 * rdma verbs: do not check stag for a zero length sge 766 */ 767 if (sge->length) { 768 int rv = siw_check_sge(pd, sge, &wqe->mem[i], perms, 0, 769 sge->length); 770 771 if (unlikely(rv != E_ACCESS_OK)) 772 return rv; 773 } 774 len += sge->length; 775 } 776 return len; 777 } 778 779 /* 780 * siw_qp_sq_proc_tx() 781 * 782 * Process one WQE which needs transmission on the wire. 783 */ 784 static int siw_qp_sq_proc_tx(struct siw_qp *qp, struct siw_wqe *wqe) 785 { 786 struct siw_iwarp_tx *c_tx = &qp->tx_ctx; 787 struct socket *s = qp->attrs.sk; 788 int rv = 0, burst_len = qp->tx_ctx.burst; 789 enum rdmap_ecode ecode = RDMAP_ECODE_CATASTROPHIC_STREAM; 790 791 if (unlikely(wqe->wr_status == SIW_WR_IDLE)) 792 return 0; 793 794 if (!burst_len) 795 burst_len = SQ_USER_MAXBURST; 796 797 if (wqe->wr_status == SIW_WR_QUEUED) { 798 if (!(wqe->sqe.flags & SIW_WQE_INLINE)) { 799 if (tx_type(wqe) == SIW_OP_READ_RESPONSE) 800 wqe->sqe.num_sge = 1; 801 802 if (tx_type(wqe) != SIW_OP_READ && 803 tx_type(wqe) != SIW_OP_READ_LOCAL_INV) { 804 /* 805 * Reference memory to be tx'd w/o checking 806 * access for LOCAL_READ permission, since 807 * not defined in RDMA core. 808 */ 809 rv = siw_check_sgl_tx(qp->pd, wqe, 0); 810 if (rv < 0) { 811 if (tx_type(wqe) == 812 SIW_OP_READ_RESPONSE) 813 ecode = siw_rdmap_error(-rv); 814 rv = -EINVAL; 815 goto tx_error; 816 } 817 wqe->bytes = rv; 818 } else { 819 wqe->bytes = 0; 820 } 821 } else { 822 wqe->bytes = wqe->sqe.sge[0].length; 823 if (!rdma_is_kernel_res(&qp->base_qp.res)) { 824 if (wqe->bytes > SIW_MAX_INLINE) { 825 rv = -EINVAL; 826 goto tx_error; 827 } 828 wqe->sqe.sge[0].laddr = 829 (u64)(uintptr_t)&wqe->sqe.sge[1]; 830 } 831 } 832 wqe->wr_status = SIW_WR_INPROGRESS; 833 wqe->processed = 0; 834 835 siw_update_tcpseg(c_tx, s); 836 837 rv = siw_qp_prepare_tx(c_tx); 838 if (rv == PKT_FRAGMENTED) { 839 c_tx->state = SIW_SEND_HDR; 840 siw_prepare_fpdu(qp, wqe); 841 } else if (rv == PKT_COMPLETE) { 842 c_tx->state = SIW_SEND_SHORT_FPDU; 843 } else { 844 goto tx_error; 845 } 846 } 847 848 next_segment: 849 siw_dbg_qp(qp, "wr type %d, state %d, data %u, sent %u, id %llx\n", 850 tx_type(wqe), wqe->wr_status, wqe->bytes, wqe->processed, 851 wqe->sqe.id); 852 853 if (--burst_len == 0) { 854 rv = -EINPROGRESS; 855 goto tx_done; 856 } 857 if (c_tx->state == SIW_SEND_SHORT_FPDU) { 858 enum siw_opcode tx_type = tx_type(wqe); 859 unsigned int msg_flags; 860 861 if (siw_sq_empty(qp) || !siw_tcp_nagle || burst_len == 1) 862 /* 863 * End current TCP segment, if SQ runs empty, 864 * or siw_tcp_nagle is not set, or we bail out 865 * soon due to no burst credit left. 866 */ 867 msg_flags = MSG_DONTWAIT; 868 else 869 msg_flags = MSG_DONTWAIT | MSG_MORE; 870 871 rv = siw_tx_ctrl(c_tx, s, msg_flags); 872 873 if (!rv && tx_type != SIW_OP_READ && 874 tx_type != SIW_OP_READ_LOCAL_INV) 875 wqe->processed = wqe->bytes; 876 877 goto tx_done; 878 879 } else { 880 rv = siw_tx_hdt(c_tx, s); 881 } 882 if (!rv) { 883 /* 884 * One segment sent. Processing completed if last 885 * segment, Do next segment otherwise. 886 */ 887 if (unlikely(c_tx->tx_suspend)) { 888 /* 889 * Verbs, 6.4.: Try stopping sending after a full 890 * DDP segment if the connection goes down 891 * (== peer halfclose) 892 */ 893 rv = -ECONNABORTED; 894 goto tx_done; 895 } 896 if (c_tx->pkt.ctrl.ddp_rdmap_ctrl & DDP_FLAG_LAST) { 897 siw_dbg_qp(qp, "WQE completed\n"); 898 goto tx_done; 899 } 900 c_tx->state = SIW_SEND_HDR; 901 902 siw_update_tcpseg(c_tx, s); 903 904 siw_prepare_fpdu(qp, wqe); 905 goto next_segment; 906 } 907 tx_done: 908 qp->tx_ctx.burst = burst_len; 909 return rv; 910 911 tx_error: 912 if (ecode != RDMAP_ECODE_CATASTROPHIC_STREAM) 913 siw_init_terminate(qp, TERM_ERROR_LAYER_RDMAP, 914 RDMAP_ETYPE_REMOTE_PROTECTION, ecode, 1); 915 else 916 siw_init_terminate(qp, TERM_ERROR_LAYER_RDMAP, 917 RDMAP_ETYPE_CATASTROPHIC, 918 RDMAP_ECODE_UNSPECIFIED, 1); 919 return rv; 920 } 921 922 static int siw_fastreg_mr(struct ib_pd *pd, struct siw_sqe *sqe) 923 { 924 struct ib_mr *base_mr = (struct ib_mr *)(uintptr_t)sqe->base_mr; 925 struct siw_device *sdev = to_siw_dev(pd->device); 926 struct siw_mem *mem; 927 int rv = 0; 928 929 siw_dbg_pd(pd, "STag 0x%08x\n", sqe->rkey); 930 931 if (unlikely(!base_mr)) { 932 pr_warn("siw: fastreg: STag 0x%08x unknown\n", sqe->rkey); 933 return -EINVAL; 934 } 935 936 if (unlikely(base_mr->rkey >> 8 != sqe->rkey >> 8)) { 937 pr_warn("siw: fastreg: STag 0x%08x: bad MR\n", sqe->rkey); 938 return -EINVAL; 939 } 940 941 mem = siw_mem_id2obj(sdev, sqe->rkey >> 8); 942 if (unlikely(!mem)) { 943 pr_warn("siw: fastreg: STag 0x%08x unknown\n", sqe->rkey); 944 return -EINVAL; 945 } 946 947 if (unlikely(mem->pd != pd)) { 948 pr_warn("siw: fastreg: PD mismatch\n"); 949 rv = -EINVAL; 950 goto out; 951 } 952 if (unlikely(mem->stag_valid)) { 953 pr_warn("siw: fastreg: STag 0x%08x already valid\n", sqe->rkey); 954 rv = -EINVAL; 955 goto out; 956 } 957 /* Refresh STag since user may have changed key part */ 958 mem->stag = sqe->rkey; 959 mem->perms = sqe->access; 960 961 siw_dbg_mem(mem, "STag 0x%08x now valid\n", sqe->rkey); 962 mem->va = base_mr->iova; 963 mem->stag_valid = 1; 964 out: 965 siw_mem_put(mem); 966 return rv; 967 } 968 969 static int siw_qp_sq_proc_local(struct siw_qp *qp, struct siw_wqe *wqe) 970 { 971 int rv; 972 973 switch (tx_type(wqe)) { 974 case SIW_OP_REG_MR: 975 rv = siw_fastreg_mr(qp->pd, &wqe->sqe); 976 break; 977 978 case SIW_OP_INVAL_STAG: 979 rv = siw_invalidate_stag(qp->pd, wqe->sqe.rkey); 980 break; 981 982 default: 983 rv = -EINVAL; 984 } 985 return rv; 986 } 987 988 /* 989 * siw_qp_sq_process() 990 * 991 * Core TX path routine for RDMAP/DDP/MPA using a TCP kernel socket. 992 * Sends RDMAP payload for the current SQ WR @wqe of @qp in one or more 993 * MPA FPDUs, each containing a DDP segment. 994 * 995 * SQ processing may occur in user context as a result of posting 996 * new WQE's or from siw_tx_thread context. Processing in 997 * user context is limited to non-kernel verbs users. 998 * 999 * SQ processing may get paused anytime, possibly in the middle of a WR 1000 * or FPDU, if insufficient send space is available. SQ processing 1001 * gets resumed from siw_tx_thread, if send space becomes available again. 1002 * 1003 * Must be called with the QP state read-locked. 1004 * 1005 * Note: 1006 * An outbound RREQ can be satisfied by the corresponding RRESP 1007 * _before_ it gets assigned to the ORQ. This happens regularly 1008 * in RDMA READ via loopback case. Since both outbound RREQ and 1009 * inbound RRESP can be handled by the same CPU, locking the ORQ 1010 * is dead-lock prone and thus not an option. With that, the 1011 * RREQ gets assigned to the ORQ _before_ being sent - see 1012 * siw_activate_tx() - and pulled back in case of send failure. 1013 */ 1014 int siw_qp_sq_process(struct siw_qp *qp) 1015 { 1016 struct siw_wqe *wqe = tx_wqe(qp); 1017 enum siw_opcode tx_type; 1018 unsigned long flags; 1019 int rv = 0; 1020 1021 siw_dbg_qp(qp, "enter for type %d\n", tx_type(wqe)); 1022 1023 next_wqe: 1024 /* 1025 * Stop QP processing if SQ state changed 1026 */ 1027 if (unlikely(qp->tx_ctx.tx_suspend)) { 1028 siw_dbg_qp(qp, "tx suspended\n"); 1029 goto done; 1030 } 1031 tx_type = tx_type(wqe); 1032 1033 if (tx_type <= SIW_OP_READ_RESPONSE) 1034 rv = siw_qp_sq_proc_tx(qp, wqe); 1035 else 1036 rv = siw_qp_sq_proc_local(qp, wqe); 1037 1038 if (!rv) { 1039 /* 1040 * WQE processing done 1041 */ 1042 switch (tx_type) { 1043 case SIW_OP_SEND: 1044 case SIW_OP_SEND_REMOTE_INV: 1045 case SIW_OP_WRITE: 1046 siw_wqe_put_mem(wqe, tx_type); 1047 fallthrough; 1048 1049 case SIW_OP_INVAL_STAG: 1050 case SIW_OP_REG_MR: 1051 if (tx_flags(wqe) & SIW_WQE_SIGNALLED) 1052 siw_sqe_complete(qp, &wqe->sqe, wqe->bytes, 1053 SIW_WC_SUCCESS); 1054 break; 1055 1056 case SIW_OP_READ: 1057 case SIW_OP_READ_LOCAL_INV: 1058 /* 1059 * already enqueued to ORQ queue 1060 */ 1061 break; 1062 1063 case SIW_OP_READ_RESPONSE: 1064 siw_wqe_put_mem(wqe, tx_type); 1065 break; 1066 1067 default: 1068 WARN(1, "undefined WQE type %d\n", tx_type); 1069 rv = -EINVAL; 1070 goto done; 1071 } 1072 1073 spin_lock_irqsave(&qp->sq_lock, flags); 1074 wqe->wr_status = SIW_WR_IDLE; 1075 rv = siw_activate_tx(qp); 1076 spin_unlock_irqrestore(&qp->sq_lock, flags); 1077 1078 if (rv <= 0) 1079 goto done; 1080 1081 goto next_wqe; 1082 1083 } else if (rv == -EAGAIN) { 1084 siw_dbg_qp(qp, "sq paused: hd/tr %d of %d, data %d\n", 1085 qp->tx_ctx.ctrl_sent, qp->tx_ctx.ctrl_len, 1086 qp->tx_ctx.bytes_unsent); 1087 rv = 0; 1088 goto done; 1089 } else if (rv == -EINPROGRESS) { 1090 rv = siw_sq_start(qp); 1091 goto done; 1092 } else { 1093 /* 1094 * WQE processing failed. 1095 * Verbs 8.3.2: 1096 * o It turns any WQE into a signalled WQE. 1097 * o Local catastrophic error must be surfaced 1098 * o QP must be moved into Terminate state: done by code 1099 * doing socket state change processing 1100 * 1101 * o TODO: Termination message must be sent. 1102 * o TODO: Implement more precise work completion errors, 1103 * see enum ib_wc_status in ib_verbs.h 1104 */ 1105 siw_dbg_qp(qp, "wqe type %d processing failed: %d\n", 1106 tx_type(wqe), rv); 1107 1108 spin_lock_irqsave(&qp->sq_lock, flags); 1109 /* 1110 * RREQ may have already been completed by inbound RRESP! 1111 */ 1112 if ((tx_type == SIW_OP_READ || 1113 tx_type == SIW_OP_READ_LOCAL_INV) && qp->attrs.orq_size) { 1114 /* Cleanup pending entry in ORQ */ 1115 qp->orq_put--; 1116 qp->orq[qp->orq_put % qp->attrs.orq_size].flags = 0; 1117 } 1118 spin_unlock_irqrestore(&qp->sq_lock, flags); 1119 /* 1120 * immediately suspends further TX processing 1121 */ 1122 if (!qp->tx_ctx.tx_suspend) 1123 siw_qp_cm_drop(qp, 0); 1124 1125 switch (tx_type) { 1126 case SIW_OP_SEND: 1127 case SIW_OP_SEND_REMOTE_INV: 1128 case SIW_OP_SEND_WITH_IMM: 1129 case SIW_OP_WRITE: 1130 case SIW_OP_READ: 1131 case SIW_OP_READ_LOCAL_INV: 1132 siw_wqe_put_mem(wqe, tx_type); 1133 fallthrough; 1134 1135 case SIW_OP_INVAL_STAG: 1136 case SIW_OP_REG_MR: 1137 siw_sqe_complete(qp, &wqe->sqe, wqe->bytes, 1138 SIW_WC_LOC_QP_OP_ERR); 1139 1140 siw_qp_event(qp, IB_EVENT_QP_FATAL); 1141 1142 break; 1143 1144 case SIW_OP_READ_RESPONSE: 1145 siw_dbg_qp(qp, "proc. read.response failed: %d\n", rv); 1146 1147 siw_qp_event(qp, IB_EVENT_QP_REQ_ERR); 1148 1149 siw_wqe_put_mem(wqe, SIW_OP_READ_RESPONSE); 1150 1151 break; 1152 1153 default: 1154 WARN(1, "undefined WQE type %d\n", tx_type); 1155 rv = -EINVAL; 1156 } 1157 wqe->wr_status = SIW_WR_IDLE; 1158 } 1159 done: 1160 return rv; 1161 } 1162 1163 static void siw_sq_resume(struct siw_qp *qp) 1164 { 1165 if (down_read_trylock(&qp->state_lock)) { 1166 if (likely(qp->attrs.state == SIW_QP_STATE_RTS && 1167 !qp->tx_ctx.tx_suspend)) { 1168 int rv = siw_qp_sq_process(qp); 1169 1170 up_read(&qp->state_lock); 1171 1172 if (unlikely(rv < 0)) { 1173 siw_dbg_qp(qp, "SQ task failed: err %d\n", rv); 1174 1175 if (!qp->tx_ctx.tx_suspend) 1176 siw_qp_cm_drop(qp, 0); 1177 } 1178 } else { 1179 up_read(&qp->state_lock); 1180 } 1181 } else { 1182 siw_dbg_qp(qp, "Resume SQ while QP locked\n"); 1183 } 1184 siw_qp_put(qp); 1185 } 1186 1187 struct tx_task_t { 1188 struct llist_head active; 1189 wait_queue_head_t waiting; 1190 }; 1191 1192 static DEFINE_PER_CPU(struct tx_task_t, siw_tx_task_g); 1193 1194 int siw_create_tx_threads(void) 1195 { 1196 int cpu, assigned = 0; 1197 1198 for_each_online_cpu(cpu) { 1199 struct tx_task_t *tx_task; 1200 1201 /* Skip HT cores */ 1202 if (cpu % cpumask_weight(topology_sibling_cpumask(cpu))) 1203 continue; 1204 1205 tx_task = &per_cpu(siw_tx_task_g, cpu); 1206 init_llist_head(&tx_task->active); 1207 init_waitqueue_head(&tx_task->waiting); 1208 1209 siw_tx_thread[cpu] = 1210 kthread_run_on_cpu(siw_run_sq, 1211 (unsigned long *)(long)cpu, 1212 cpu, "siw_tx/%u"); 1213 if (IS_ERR(siw_tx_thread[cpu])) { 1214 siw_tx_thread[cpu] = NULL; 1215 continue; 1216 } 1217 assigned++; 1218 } 1219 return assigned; 1220 } 1221 1222 void siw_stop_tx_threads(void) 1223 { 1224 int cpu; 1225 1226 for_each_possible_cpu(cpu) { 1227 if (siw_tx_thread[cpu]) { 1228 kthread_stop(siw_tx_thread[cpu]); 1229 wake_up(&per_cpu(siw_tx_task_g, cpu).waiting); 1230 siw_tx_thread[cpu] = NULL; 1231 } 1232 } 1233 } 1234 1235 int siw_run_sq(void *data) 1236 { 1237 const int nr_cpu = (unsigned int)(long)data; 1238 struct llist_node *active; 1239 struct siw_qp *qp; 1240 struct tx_task_t *tx_task = &per_cpu(siw_tx_task_g, nr_cpu); 1241 1242 while (1) { 1243 struct llist_node *fifo_list = NULL; 1244 1245 wait_event_interruptible(tx_task->waiting, 1246 !llist_empty(&tx_task->active) || 1247 kthread_should_stop()); 1248 1249 if (kthread_should_stop()) 1250 break; 1251 1252 active = llist_del_all(&tx_task->active); 1253 /* 1254 * llist_del_all returns a list with newest entry first. 1255 * Re-order list for fairness among QP's. 1256 */ 1257 fifo_list = llist_reverse_order(active); 1258 while (fifo_list) { 1259 qp = container_of(fifo_list, struct siw_qp, tx_list); 1260 fifo_list = llist_next(fifo_list); 1261 qp->tx_list.next = NULL; 1262 1263 siw_sq_resume(qp); 1264 } 1265 } 1266 active = llist_del_all(&tx_task->active); 1267 if (active) { 1268 llist_for_each_entry(qp, active, tx_list) { 1269 qp->tx_list.next = NULL; 1270 siw_sq_resume(qp); 1271 } 1272 } 1273 return 0; 1274 } 1275 1276 int siw_sq_start(struct siw_qp *qp) 1277 { 1278 if (tx_wqe(qp)->wr_status == SIW_WR_IDLE) 1279 return 0; 1280 1281 if (unlikely(!cpu_online(qp->tx_cpu))) { 1282 siw_put_tx_cpu(qp->tx_cpu); 1283 qp->tx_cpu = siw_get_tx_cpu(qp->sdev); 1284 if (qp->tx_cpu < 0) { 1285 pr_warn("siw: no tx cpu available\n"); 1286 1287 return -EIO; 1288 } 1289 } 1290 siw_qp_get(qp); 1291 1292 llist_add(&qp->tx_list, &per_cpu(siw_tx_task_g, qp->tx_cpu).active); 1293 1294 wake_up(&per_cpu(siw_tx_task_g, qp->tx_cpu).waiting); 1295 1296 return 0; 1297 } 1298