1 // SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause 2 3 /* Authors: Bernard Metzler <bmt@zurich.ibm.com> */ 4 /* Copyright (c) 2008-2019, IBM Corporation */ 5 6 #include <linux/errno.h> 7 #include <linux/types.h> 8 #include <linux/net.h> 9 #include <linux/scatterlist.h> 10 #include <linux/highmem.h> 11 #include <net/tcp.h> 12 13 #include <rdma/iw_cm.h> 14 #include <rdma/ib_verbs.h> 15 #include <rdma/ib_user_verbs.h> 16 17 #include "siw.h" 18 #include "siw_verbs.h" 19 #include "siw_mem.h" 20 21 #define MAX_HDR_INLINE \ 22 (((uint32_t)(sizeof(struct siw_rreq_pkt) - \ 23 sizeof(struct iwarp_send))) & 0xF8) 24 25 static struct page *siw_get_pblpage(struct siw_mem *mem, u64 addr, int *idx) 26 { 27 struct siw_pbl *pbl = mem->pbl; 28 u64 offset = addr - mem->va; 29 dma_addr_t paddr = siw_pbl_get_buffer(pbl, offset, NULL, idx); 30 31 if (paddr) 32 return ib_virt_dma_to_page(paddr); 33 34 return NULL; 35 } 36 37 static struct page *siw_get_page(struct siw_mem *mem, struct siw_sge *sge, 38 unsigned long offset, int *pbl_idx) 39 { 40 if (!mem->is_pbl) 41 return siw_get_upage(mem->umem, sge->laddr + offset); 42 else 43 return siw_get_pblpage(mem, sge->laddr + offset, pbl_idx); 44 } 45 46 /* 47 * Copy short payload at provided destination payload address 48 */ 49 static int siw_try_1seg(struct siw_iwarp_tx *c_tx, void *paddr) 50 { 51 struct siw_wqe *wqe = &c_tx->wqe_active; 52 struct siw_sge *sge = &wqe->sqe.sge[0]; 53 u32 bytes = sge->length; 54 55 if (bytes > MAX_HDR_INLINE || wqe->sqe.num_sge != 1) 56 return MAX_HDR_INLINE + 1; 57 58 if (!bytes) 59 return 0; 60 61 if (tx_flags(wqe) & SIW_WQE_INLINE) { 62 memcpy(paddr, &wqe->sqe.sge[1], bytes); 63 } else { 64 struct siw_mem *mem = wqe->mem[0]; 65 66 if (!mem->mem_obj) { 67 /* Kernel client using kva */ 68 memcpy(paddr, ib_virt_dma_to_ptr(sge->laddr), bytes); 69 } else if (c_tx->in_syscall) { 70 if (copy_from_user(paddr, u64_to_user_ptr(sge->laddr), 71 bytes)) 72 return -EFAULT; 73 } else { 74 unsigned int off = sge->laddr & 
~PAGE_MASK; 75 struct page *p; 76 char *buffer; 77 int pbl_idx = 0; 78 79 p = siw_get_page(mem, sge, 0, &pbl_idx); 80 if (unlikely(!p)) 81 return -EFAULT; 82 83 buffer = kmap_local_page(p); 84 85 if (likely(PAGE_SIZE - off >= bytes)) { 86 memcpy(paddr, buffer + off, bytes); 87 } else { 88 unsigned long part = bytes - (PAGE_SIZE - off); 89 90 memcpy(paddr, buffer + off, part); 91 kunmap_local(buffer); 92 93 p = siw_get_page(mem, sge, part, &pbl_idx); 94 if (unlikely(!p)) 95 return -EFAULT; 96 97 buffer = kmap_local_page(p); 98 memcpy(paddr + part, buffer, bytes - part); 99 } 100 kunmap_local(buffer); 101 } 102 } 103 return (int)bytes; 104 } 105 106 #define PKT_FRAGMENTED 1 107 #define PKT_COMPLETE 0 108 109 /* 110 * siw_qp_prepare_tx() 111 * 112 * Prepare tx state for sending out one fpdu. Builds complete pkt 113 * if no user data or only immediate data are present. 114 * 115 * returns PKT_COMPLETE if complete pkt built, PKT_FRAGMENTED otherwise. 116 */ 117 static int siw_qp_prepare_tx(struct siw_iwarp_tx *c_tx) 118 { 119 struct siw_wqe *wqe = &c_tx->wqe_active; 120 char *crc = NULL; 121 int data = 0; 122 123 switch (tx_type(wqe)) { 124 case SIW_OP_READ: 125 case SIW_OP_READ_LOCAL_INV: 126 memcpy(&c_tx->pkt.ctrl, 127 &iwarp_pktinfo[RDMAP_RDMA_READ_REQ].ctrl, 128 sizeof(struct iwarp_ctrl)); 129 130 c_tx->pkt.rreq.rsvd = 0; 131 c_tx->pkt.rreq.ddp_qn = htonl(RDMAP_UNTAGGED_QN_RDMA_READ); 132 c_tx->pkt.rreq.ddp_msn = 133 htonl(++c_tx->ddp_msn[RDMAP_UNTAGGED_QN_RDMA_READ]); 134 c_tx->pkt.rreq.ddp_mo = 0; 135 c_tx->pkt.rreq.sink_stag = htonl(wqe->sqe.sge[0].lkey); 136 c_tx->pkt.rreq.sink_to = 137 cpu_to_be64(wqe->sqe.sge[0].laddr); 138 c_tx->pkt.rreq.source_stag = htonl(wqe->sqe.rkey); 139 c_tx->pkt.rreq.source_to = cpu_to_be64(wqe->sqe.raddr); 140 c_tx->pkt.rreq.read_size = htonl(wqe->sqe.sge[0].length); 141 142 c_tx->ctrl_len = sizeof(struct iwarp_rdma_rreq); 143 crc = (char *)&c_tx->pkt.rreq_pkt.crc; 144 break; 145 146 case SIW_OP_SEND: 147 if (tx_flags(wqe) & 
SIW_WQE_SOLICITED) 148 memcpy(&c_tx->pkt.ctrl, 149 &iwarp_pktinfo[RDMAP_SEND_SE].ctrl, 150 sizeof(struct iwarp_ctrl)); 151 else 152 memcpy(&c_tx->pkt.ctrl, &iwarp_pktinfo[RDMAP_SEND].ctrl, 153 sizeof(struct iwarp_ctrl)); 154 155 c_tx->pkt.send.ddp_qn = RDMAP_UNTAGGED_QN_SEND; 156 c_tx->pkt.send.ddp_msn = 157 htonl(++c_tx->ddp_msn[RDMAP_UNTAGGED_QN_SEND]); 158 c_tx->pkt.send.ddp_mo = 0; 159 160 c_tx->pkt.send_inv.inval_stag = 0; 161 162 c_tx->ctrl_len = sizeof(struct iwarp_send); 163 164 crc = (char *)&c_tx->pkt.send_pkt.crc; 165 data = siw_try_1seg(c_tx, crc); 166 break; 167 168 case SIW_OP_SEND_REMOTE_INV: 169 if (tx_flags(wqe) & SIW_WQE_SOLICITED) 170 memcpy(&c_tx->pkt.ctrl, 171 &iwarp_pktinfo[RDMAP_SEND_SE_INVAL].ctrl, 172 sizeof(struct iwarp_ctrl)); 173 else 174 memcpy(&c_tx->pkt.ctrl, 175 &iwarp_pktinfo[RDMAP_SEND_INVAL].ctrl, 176 sizeof(struct iwarp_ctrl)); 177 178 c_tx->pkt.send.ddp_qn = RDMAP_UNTAGGED_QN_SEND; 179 c_tx->pkt.send.ddp_msn = 180 htonl(++c_tx->ddp_msn[RDMAP_UNTAGGED_QN_SEND]); 181 c_tx->pkt.send.ddp_mo = 0; 182 183 c_tx->pkt.send_inv.inval_stag = cpu_to_be32(wqe->sqe.rkey); 184 185 c_tx->ctrl_len = sizeof(struct iwarp_send_inv); 186 187 crc = (char *)&c_tx->pkt.send_pkt.crc; 188 data = siw_try_1seg(c_tx, crc); 189 break; 190 191 case SIW_OP_WRITE: 192 memcpy(&c_tx->pkt.ctrl, &iwarp_pktinfo[RDMAP_RDMA_WRITE].ctrl, 193 sizeof(struct iwarp_ctrl)); 194 195 c_tx->pkt.rwrite.sink_stag = htonl(wqe->sqe.rkey); 196 c_tx->pkt.rwrite.sink_to = cpu_to_be64(wqe->sqe.raddr); 197 c_tx->ctrl_len = sizeof(struct iwarp_rdma_write); 198 199 crc = (char *)&c_tx->pkt.write_pkt.crc; 200 data = siw_try_1seg(c_tx, crc); 201 break; 202 203 case SIW_OP_READ_RESPONSE: 204 memcpy(&c_tx->pkt.ctrl, 205 &iwarp_pktinfo[RDMAP_RDMA_READ_RESP].ctrl, 206 sizeof(struct iwarp_ctrl)); 207 208 /* NBO */ 209 c_tx->pkt.rresp.sink_stag = cpu_to_be32(wqe->sqe.rkey); 210 c_tx->pkt.rresp.sink_to = cpu_to_be64(wqe->sqe.raddr); 211 212 c_tx->ctrl_len = sizeof(struct iwarp_rdma_rresp); 213 
214 crc = (char *)&c_tx->pkt.write_pkt.crc; 215 data = siw_try_1seg(c_tx, crc); 216 break; 217 218 default: 219 siw_dbg_qp(tx_qp(c_tx), "stale wqe type %d\n", tx_type(wqe)); 220 return -EOPNOTSUPP; 221 } 222 if (unlikely(data < 0)) 223 return data; 224 225 c_tx->ctrl_sent = 0; 226 227 if (data <= MAX_HDR_INLINE) { 228 if (data) { 229 wqe->processed = data; 230 231 c_tx->pkt.ctrl.mpa_len = 232 htons(c_tx->ctrl_len + data - MPA_HDR_SIZE); 233 234 /* Add pad, if needed */ 235 data += -(int)data & 0x3; 236 /* advance CRC location after payload */ 237 crc += data; 238 c_tx->ctrl_len += data; 239 240 if (!(c_tx->pkt.ctrl.ddp_rdmap_ctrl & DDP_FLAG_TAGGED)) 241 c_tx->pkt.c_untagged.ddp_mo = 0; 242 else 243 c_tx->pkt.c_tagged.ddp_to = 244 cpu_to_be64(wqe->sqe.raddr); 245 } 246 247 *(u32 *)crc = 0; 248 /* 249 * Do complete CRC if enabled and short packet 250 */ 251 if (c_tx->mpa_crc_hd && 252 crypto_shash_digest(c_tx->mpa_crc_hd, (u8 *)&c_tx->pkt, 253 c_tx->ctrl_len, (u8 *)crc) != 0) 254 return -EINVAL; 255 c_tx->ctrl_len += MPA_CRC_SIZE; 256 257 return PKT_COMPLETE; 258 } 259 c_tx->ctrl_len += MPA_CRC_SIZE; 260 c_tx->sge_idx = 0; 261 c_tx->sge_off = 0; 262 c_tx->pbl_idx = 0; 263 264 /* 265 * Allow direct sending out of user buffer if WR is non signalled 266 * and payload is over threshold. 267 * Per RDMA verbs, the application should not change the send buffer 268 * until the work completed. In iWarp, work completion is only 269 * local delivery to TCP. TCP may reuse the buffer for 270 * retransmission. Changing unsent data also breaks the CRC, 271 * if applied. 272 */ 273 if (c_tx->zcopy_tx && wqe->bytes >= SENDPAGE_THRESH && 274 !(tx_flags(wqe) & SIW_WQE_SIGNALLED)) 275 c_tx->use_sendpage = 1; 276 else 277 c_tx->use_sendpage = 0; 278 279 return PKT_FRAGMENTED; 280 } 281 282 /* 283 * Send out one complete control type FPDU, or header of FPDU carrying 284 * data. 
Used for fixed sized packets like Read.Requests or zero length 285 * SENDs, WRITEs, READ.Responses, or header only. 286 */ 287 static int siw_tx_ctrl(struct siw_iwarp_tx *c_tx, struct socket *s, 288 int flags) 289 { 290 struct msghdr msg = { .msg_flags = flags }; 291 struct kvec iov = { .iov_base = 292 (char *)&c_tx->pkt.ctrl + c_tx->ctrl_sent, 293 .iov_len = c_tx->ctrl_len - c_tx->ctrl_sent }; 294 295 int rv = kernel_sendmsg(s, &msg, &iov, 1, iov.iov_len); 296 297 if (rv >= 0) { 298 c_tx->ctrl_sent += rv; 299 300 if (c_tx->ctrl_sent == c_tx->ctrl_len) 301 rv = 0; 302 else 303 rv = -EAGAIN; 304 } 305 return rv; 306 } 307 308 /* 309 * 0copy TCP transmit interface: Use MSG_SPLICE_PAGES. 310 * 311 * Using sendpage to push page by page appears to be less efficient 312 * than using sendmsg, even if data are copied. 313 * 314 * A general performance limitation might be the extra four bytes 315 * trailer checksum segment to be pushed after user data. 316 */ 317 static int siw_tcp_sendpages(struct socket *s, struct page **page, int offset, 318 size_t size) 319 { 320 struct bio_vec bvec; 321 struct msghdr msg = { 322 .msg_flags = (MSG_MORE | MSG_DONTWAIT | MSG_SPLICE_PAGES), 323 }; 324 struct sock *sk = s->sk; 325 int i = 0, rv = 0, sent = 0; 326 327 while (size) { 328 size_t bytes = min_t(size_t, PAGE_SIZE - offset, size); 329 330 if (size + offset <= PAGE_SIZE) 331 msg.msg_flags &= ~MSG_MORE; 332 333 tcp_rate_check_app_limited(sk); 334 bvec_set_page(&bvec, page[i], bytes, offset); 335 iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, &bvec, 1, size); 336 337 try_page_again: 338 lock_sock(sk); 339 rv = tcp_sendmsg_locked(sk, &msg, size); 340 release_sock(sk); 341 342 if (rv > 0) { 343 size -= rv; 344 sent += rv; 345 if (rv != bytes) { 346 offset += rv; 347 bytes -= rv; 348 goto try_page_again; 349 } 350 offset = 0; 351 } else { 352 if (rv == -EAGAIN || rv == 0) 353 break; 354 return rv; 355 } 356 i++; 357 } 358 return sent; 359 } 360 361 /* 362 * siw_0copy_tx() 363 * 364 * Pushes 
list of pages to TCP socket. If pages from multiple 365 * SGE's, all referenced pages of each SGE are pushed in one 366 * shot. 367 */ 368 static int siw_0copy_tx(struct socket *s, struct page **page, 369 struct siw_sge *sge, unsigned int offset, 370 unsigned int size) 371 { 372 int i = 0, sent = 0, rv; 373 int sge_bytes = min(sge->length - offset, size); 374 375 offset = (sge->laddr + offset) & ~PAGE_MASK; 376 377 while (sent != size) { 378 rv = siw_tcp_sendpages(s, &page[i], offset, sge_bytes); 379 if (rv >= 0) { 380 sent += rv; 381 if (size == sent || sge_bytes > rv) 382 break; 383 384 i += PAGE_ALIGN(sge_bytes + offset) >> PAGE_SHIFT; 385 sge++; 386 sge_bytes = min(sge->length, size - sent); 387 offset = sge->laddr & ~PAGE_MASK; 388 } else { 389 sent = rv; 390 break; 391 } 392 } 393 return sent; 394 } 395 396 #define MAX_TRAILER (MPA_CRC_SIZE + 4) 397 398 static void siw_unmap_pages(struct kvec *iov, unsigned long kmap_mask, int len) 399 { 400 int i; 401 402 /* 403 * Work backwards through the array to honor the kmap_local_page() 404 * ordering requirements. 405 */ 406 for (i = (len-1); i >= 0; i--) { 407 if (kmap_mask & BIT(i)) { 408 unsigned long addr = (unsigned long)iov[i].iov_base; 409 410 kunmap_local((void *)(addr & PAGE_MASK)); 411 } 412 } 413 } 414 415 /* 416 * siw_tx_hdt() tries to push a complete packet to TCP where all 417 * packet fragments are referenced by the elements of one iovec. 418 * For the data portion, each involved page must be referenced by 419 * one extra element. All sge's data can be non-aligned to page 420 * boundaries. Two more elements are referencing iWARP header 421 * and trailer: 422 * MAX_ARRAY = 64KB/PAGE_SIZE + 1 + (2 * (SIW_MAX_SGE - 1) + HDR + TRL 423 */ 424 #define MAX_ARRAY ((0xffff / PAGE_SIZE) + 1 + (2 * (SIW_MAX_SGE - 1) + 2)) 425 426 /* 427 * Write out iov referencing hdr, data and trailer of current FPDU. 
428 * Update transmit state dependent on write return status 429 */ 430 static int siw_tx_hdt(struct siw_iwarp_tx *c_tx, struct socket *s) 431 { 432 struct siw_wqe *wqe = &c_tx->wqe_active; 433 struct siw_sge *sge = &wqe->sqe.sge[c_tx->sge_idx]; 434 struct kvec iov[MAX_ARRAY]; 435 struct page *page_array[MAX_ARRAY]; 436 struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_EOR }; 437 438 int seg = 0, do_crc = c_tx->do_crc, is_kva = 0, rv; 439 unsigned int data_len = c_tx->bytes_unsent, hdr_len = 0, trl_len = 0, 440 sge_off = c_tx->sge_off, sge_idx = c_tx->sge_idx, 441 pbl_idx = c_tx->pbl_idx; 442 unsigned long kmap_mask = 0L; 443 444 if (c_tx->state == SIW_SEND_HDR) { 445 if (c_tx->use_sendpage) { 446 rv = siw_tx_ctrl(c_tx, s, MSG_DONTWAIT | MSG_MORE); 447 if (rv) 448 goto done; 449 450 c_tx->state = SIW_SEND_DATA; 451 } else { 452 iov[0].iov_base = 453 (char *)&c_tx->pkt.ctrl + c_tx->ctrl_sent; 454 iov[0].iov_len = hdr_len = 455 c_tx->ctrl_len - c_tx->ctrl_sent; 456 seg = 1; 457 } 458 } 459 460 wqe->processed += data_len; 461 462 while (data_len) { /* walk the list of SGE's */ 463 unsigned int sge_len = min(sge->length - sge_off, data_len); 464 unsigned int fp_off = (sge->laddr + sge_off) & ~PAGE_MASK; 465 struct siw_mem *mem; 466 467 if (!(tx_flags(wqe) & SIW_WQE_INLINE)) { 468 mem = wqe->mem[sge_idx]; 469 is_kva = mem->mem_obj == NULL ? 
1 : 0; 470 } else { 471 is_kva = 1; 472 } 473 if (is_kva && !c_tx->use_sendpage) { 474 /* 475 * tx from kernel virtual address: either inline data 476 * or memory region with assigned kernel buffer 477 */ 478 iov[seg].iov_base = 479 ib_virt_dma_to_ptr(sge->laddr + sge_off); 480 iov[seg].iov_len = sge_len; 481 482 if (do_crc) 483 crypto_shash_update(c_tx->mpa_crc_hd, 484 iov[seg].iov_base, 485 sge_len); 486 sge_off += sge_len; 487 data_len -= sge_len; 488 seg++; 489 goto sge_done; 490 } 491 492 while (sge_len) { 493 size_t plen = min((int)PAGE_SIZE - fp_off, sge_len); 494 void *kaddr; 495 496 if (!is_kva) { 497 struct page *p; 498 499 p = siw_get_page(mem, sge, sge_off, &pbl_idx); 500 if (unlikely(!p)) { 501 siw_unmap_pages(iov, kmap_mask, seg); 502 wqe->processed -= c_tx->bytes_unsent; 503 rv = -EFAULT; 504 goto done_crc; 505 } 506 page_array[seg] = p; 507 508 if (!c_tx->use_sendpage) { 509 void *kaddr = kmap_local_page(p); 510 511 /* Remember for later kunmap() */ 512 kmap_mask |= BIT(seg); 513 iov[seg].iov_base = kaddr + fp_off; 514 iov[seg].iov_len = plen; 515 516 if (do_crc) 517 crypto_shash_update( 518 c_tx->mpa_crc_hd, 519 iov[seg].iov_base, 520 plen); 521 } else if (do_crc) { 522 kaddr = kmap_local_page(p); 523 crypto_shash_update(c_tx->mpa_crc_hd, 524 kaddr + fp_off, 525 plen); 526 kunmap_local(kaddr); 527 } 528 } else { 529 /* 530 * Cast to an uintptr_t to preserve all 64 bits 531 * in sge->laddr. 
532 */ 533 u64 va = sge->laddr + sge_off; 534 535 page_array[seg] = ib_virt_dma_to_page(va); 536 if (do_crc) 537 crypto_shash_update( 538 c_tx->mpa_crc_hd, 539 ib_virt_dma_to_ptr(va), 540 plen); 541 } 542 543 sge_len -= plen; 544 sge_off += plen; 545 data_len -= plen; 546 fp_off = 0; 547 548 if (++seg >= (int)MAX_ARRAY) { 549 siw_dbg_qp(tx_qp(c_tx), "to many fragments\n"); 550 siw_unmap_pages(iov, kmap_mask, seg-1); 551 wqe->processed -= c_tx->bytes_unsent; 552 rv = -EMSGSIZE; 553 goto done_crc; 554 } 555 } 556 sge_done: 557 /* Update SGE variables at end of SGE */ 558 if (sge_off == sge->length && 559 (data_len != 0 || wqe->processed < wqe->bytes)) { 560 sge_idx++; 561 sge++; 562 sge_off = 0; 563 } 564 } 565 /* trailer */ 566 if (likely(c_tx->state != SIW_SEND_TRAILER)) { 567 iov[seg].iov_base = &c_tx->trailer.pad[4 - c_tx->pad]; 568 iov[seg].iov_len = trl_len = MAX_TRAILER - (4 - c_tx->pad); 569 } else { 570 iov[seg].iov_base = &c_tx->trailer.pad[c_tx->ctrl_sent]; 571 iov[seg].iov_len = trl_len = MAX_TRAILER - c_tx->ctrl_sent; 572 } 573 574 if (c_tx->pad) { 575 *(u32 *)c_tx->trailer.pad = 0; 576 if (do_crc) 577 crypto_shash_update(c_tx->mpa_crc_hd, 578 (u8 *)&c_tx->trailer.crc - c_tx->pad, 579 c_tx->pad); 580 } 581 if (!c_tx->mpa_crc_hd) 582 c_tx->trailer.crc = 0; 583 else if (do_crc) 584 crypto_shash_final(c_tx->mpa_crc_hd, (u8 *)&c_tx->trailer.crc); 585 586 data_len = c_tx->bytes_unsent; 587 588 if (c_tx->use_sendpage) { 589 rv = siw_0copy_tx(s, page_array, &wqe->sqe.sge[c_tx->sge_idx], 590 c_tx->sge_off, data_len); 591 if (rv == data_len) { 592 rv = kernel_sendmsg(s, &msg, &iov[seg], 1, trl_len); 593 if (rv > 0) 594 rv += data_len; 595 else 596 rv = data_len; 597 } 598 } else { 599 rv = kernel_sendmsg(s, &msg, iov, seg + 1, 600 hdr_len + data_len + trl_len); 601 siw_unmap_pages(iov, kmap_mask, seg); 602 } 603 if (rv < (int)hdr_len) { 604 /* Not even complete hdr pushed or negative rv */ 605 wqe->processed -= data_len; 606 if (rv >= 0) { 607 c_tx->ctrl_sent += 
rv; 608 rv = -EAGAIN; 609 } 610 goto done_crc; 611 } 612 rv -= hdr_len; 613 614 if (rv >= (int)data_len) { 615 /* all user data pushed to TCP or no data to push */ 616 if (data_len > 0 && wqe->processed < wqe->bytes) { 617 /* Save the current state for next tx */ 618 c_tx->sge_idx = sge_idx; 619 c_tx->sge_off = sge_off; 620 c_tx->pbl_idx = pbl_idx; 621 } 622 rv -= data_len; 623 624 if (rv == trl_len) /* all pushed */ 625 rv = 0; 626 else { 627 c_tx->state = SIW_SEND_TRAILER; 628 c_tx->ctrl_len = MAX_TRAILER; 629 c_tx->ctrl_sent = rv + 4 - c_tx->pad; 630 c_tx->bytes_unsent = 0; 631 rv = -EAGAIN; 632 } 633 634 } else if (data_len > 0) { 635 /* Maybe some user data pushed to TCP */ 636 c_tx->state = SIW_SEND_DATA; 637 wqe->processed -= data_len - rv; 638 639 if (rv) { 640 /* 641 * Some bytes out. Recompute tx state based 642 * on old state and bytes pushed 643 */ 644 unsigned int sge_unsent; 645 646 c_tx->bytes_unsent -= rv; 647 sge = &wqe->sqe.sge[c_tx->sge_idx]; 648 sge_unsent = sge->length - c_tx->sge_off; 649 650 while (sge_unsent <= rv) { 651 rv -= sge_unsent; 652 c_tx->sge_idx++; 653 c_tx->sge_off = 0; 654 sge++; 655 sge_unsent = sge->length; 656 } 657 c_tx->sge_off += rv; 658 } 659 rv = -EAGAIN; 660 } 661 done_crc: 662 c_tx->do_crc = 0; 663 done: 664 return rv; 665 } 666 667 static void siw_update_tcpseg(struct siw_iwarp_tx *c_tx, 668 struct socket *s) 669 { 670 struct tcp_sock *tp = tcp_sk(s->sk); 671 672 if (tp->gso_segs) { 673 if (c_tx->gso_seg_limit == 0) 674 c_tx->tcp_seglen = tp->mss_cache * tp->gso_segs; 675 else 676 c_tx->tcp_seglen = 677 tp->mss_cache * 678 min_t(u16, c_tx->gso_seg_limit, tp->gso_segs); 679 } else { 680 c_tx->tcp_seglen = tp->mss_cache; 681 } 682 /* Loopback may give odd numbers */ 683 c_tx->tcp_seglen &= 0xfffffff8; 684 } 685 686 /* 687 * siw_prepare_fpdu() 688 * 689 * Prepares transmit context to send out one FPDU if FPDU will contain 690 * user data and user data are not immediate data. 
691 * Computes maximum FPDU length to fill up TCP MSS if possible. 692 * 693 * @qp: QP from which to transmit 694 * @wqe: Current WQE causing transmission 695 * 696 * TODO: Take into account real available sendspace on socket 697 * to avoid header misalignment due to send pausing within 698 * fpdu transmission 699 */ 700 static void siw_prepare_fpdu(struct siw_qp *qp, struct siw_wqe *wqe) 701 { 702 struct siw_iwarp_tx *c_tx = &qp->tx_ctx; 703 int data_len; 704 705 c_tx->ctrl_len = 706 iwarp_pktinfo[__rdmap_get_opcode(&c_tx->pkt.ctrl)].hdr_len; 707 c_tx->ctrl_sent = 0; 708 709 /* 710 * Update target buffer offset if any 711 */ 712 if (!(c_tx->pkt.ctrl.ddp_rdmap_ctrl & DDP_FLAG_TAGGED)) 713 /* Untagged message */ 714 c_tx->pkt.c_untagged.ddp_mo = cpu_to_be32(wqe->processed); 715 else /* Tagged message */ 716 c_tx->pkt.c_tagged.ddp_to = 717 cpu_to_be64(wqe->sqe.raddr + wqe->processed); 718 719 data_len = wqe->bytes - wqe->processed; 720 if (data_len + c_tx->ctrl_len + MPA_CRC_SIZE > c_tx->tcp_seglen) { 721 /* Trim DDP payload to fit into current TCP segment */ 722 data_len = c_tx->tcp_seglen - (c_tx->ctrl_len + MPA_CRC_SIZE); 723 c_tx->pkt.ctrl.ddp_rdmap_ctrl &= ~DDP_FLAG_LAST; 724 c_tx->pad = 0; 725 } else { 726 c_tx->pkt.ctrl.ddp_rdmap_ctrl |= DDP_FLAG_LAST; 727 c_tx->pad = -data_len & 0x3; 728 } 729 c_tx->bytes_unsent = data_len; 730 731 c_tx->pkt.ctrl.mpa_len = 732 htons(c_tx->ctrl_len + data_len - MPA_HDR_SIZE); 733 734 /* 735 * Init MPA CRC computation 736 */ 737 if (c_tx->mpa_crc_hd) { 738 crypto_shash_init(c_tx->mpa_crc_hd); 739 crypto_shash_update(c_tx->mpa_crc_hd, (u8 *)&c_tx->pkt, 740 c_tx->ctrl_len); 741 c_tx->do_crc = 1; 742 } 743 } 744 745 /* 746 * siw_check_sgl_tx() 747 * 748 * Check permissions for a list of SGE's (SGL). 749 * A successful check will have all memory referenced 750 * for transmission resolved and assigned to the WQE. 
 *
 * @pd:		Protection Domain SGL should belong to
 * @wqe:	WQE to be checked
 * @perms:	requested access permissions
 *
 * Returns the sum of all SGE lengths on success, -EINVAL if the WQE
 * carries more than SIW_MAX_SGE elements, or the result of
 * siw_check_sge() for the first SGE that does not yield E_ACCESS_OK.
 */

static int siw_check_sgl_tx(struct ib_pd *pd, struct siw_wqe *wqe,
			    enum ib_access_flags perms)
{
	struct siw_sge *sge = &wqe->sqe.sge[0];
	int i, len, num_sge = wqe->sqe.num_sge;

	if (unlikely(num_sge > SIW_MAX_SGE))
		return -EINVAL;

	for (i = 0, len = 0; num_sge; num_sge--, i++, sge++) {
		/*
		 * rdma verbs: do not check stag for a zero length sge
		 */
		if (sge->length) {
			int rv = siw_check_sge(pd, sge, &wqe->mem[i], perms, 0,
					       sge->length);

			if (unlikely(rv != E_ACCESS_OK))
				return rv;
		}
		len += sge->length;
	}
	return len;
}

/*
 * siw_qp_sq_proc_tx()
 *
 * Process one WQE which needs transmission on the wire.
 *
 * A WQE in state SIW_WR_QUEUED is first set up (memory resolved,
 * wqe->bytes computed, packet header prepared), then FPDUs are sent
 * until the WQE is done, the socket stops accepting data, or the
 * burst credit is used up.
 *
 * Returns 0 if the WQE is idle or was sent completely, -EINPROGRESS
 * if the burst credit ran out, -ECONNABORTED if tx got suspended
 * after a full segment, or another negative value from the send path
 * (including -EAGAIN). Setup failures additionally trigger
 * siw_init_terminate().
 */
static int siw_qp_sq_proc_tx(struct siw_qp *qp, struct siw_wqe *wqe)
{
	struct siw_iwarp_tx *c_tx = &qp->tx_ctx;
	struct socket *s = qp->attrs.sk;
	int rv = 0, burst_len = qp->tx_ctx.burst;
	enum rdmap_ecode ecode = RDMAP_ECODE_CATASTROPHIC_STREAM;

	if (unlikely(wqe->wr_status == SIW_WR_IDLE))
		return 0;

	/* No credit left over from a previous call: start a new burst */
	if (!burst_len)
		burst_len = SQ_USER_MAXBURST;

	if (wqe->wr_status == SIW_WR_QUEUED) {
		if (!(wqe->sqe.flags & SIW_WQE_INLINE)) {
			if (tx_type(wqe) == SIW_OP_READ_RESPONSE)
				wqe->sqe.num_sge = 1;

			if (tx_type(wqe) != SIW_OP_READ &&
			    tx_type(wqe) != SIW_OP_READ_LOCAL_INV) {
				/*
				 * Reference memory to be tx'd w/o checking
				 * access for LOCAL_READ permission, since
				 * not defined in RDMA core.
				 */
				rv = siw_check_sgl_tx(qp->pd, wqe, 0);
				if (rv < 0) {
					/*
					 * Only a failing Read Response
					 * reports a specific RDMAP error code
					 */
					if (tx_type(wqe) ==
					    SIW_OP_READ_RESPONSE)
						ecode = siw_rdmap_error(-rv);
					rv = -EINVAL;
					goto tx_error;
				}
				wqe->bytes = rv;
			} else {
				/* Read requests carry no payload */
				wqe->bytes = 0;
			}
		} else {
			wqe->bytes = wqe->sqe.sge[0].length;
			if (!rdma_is_kernel_res(&qp->base_qp.res)) {
				if (wqe->bytes > SIW_MAX_INLINE) {
					rv = -EINVAL;
					goto tx_error;
				}
				/*
				 * Point the first SGE at the data stored
				 * from the second SGE slot onwards
				 */
				wqe->sqe.sge[0].laddr =
					(u64)(uintptr_t)&wqe->sqe.sge[1];
			}
		}
		wqe->wr_status = SIW_WR_INPROGRESS;
		wqe->processed = 0;

		siw_update_tcpseg(c_tx, s);

		rv = siw_qp_prepare_tx(c_tx);
		if (rv == PKT_FRAGMENTED) {
			c_tx->state = SIW_SEND_HDR;
			siw_prepare_fpdu(qp, wqe);
		} else if (rv == PKT_COMPLETE) {
			c_tx->state = SIW_SEND_SHORT_FPDU;
		} else {
			goto tx_error;
		}
	}

next_segment:
	siw_dbg_qp(qp, "wr type %d, state %d, data %u, sent %u, id %llx\n",
		   tx_type(wqe), wqe->wr_status, wqe->bytes, wqe->processed,
		   wqe->sqe.id);

	/* Burst credit used up: the caller reschedules on -EINPROGRESS */
	if (--burst_len == 0) {
		rv = -EINPROGRESS;
		goto tx_done;
	}
	if (c_tx->state == SIW_SEND_SHORT_FPDU) {
		enum siw_opcode tx_type = tx_type(wqe);
		unsigned int msg_flags;

		if (siw_sq_empty(qp) || !siw_tcp_nagle || burst_len == 1)
			/*
			 * End current TCP segment, if SQ runs empty,
			 * or siw_tcp_nagle is not set, or we bail out
			 * soon due to no burst credit left.
			 */
			msg_flags = MSG_DONTWAIT;
		else
			msg_flags = MSG_DONTWAIT | MSG_MORE;

		rv = siw_tx_ctrl(c_tx, s, msg_flags);

		if (!rv && tx_type != SIW_OP_READ &&
		    tx_type != SIW_OP_READ_LOCAL_INV)
			wqe->processed = wqe->bytes;

		goto tx_done;

	} else {
		rv = siw_tx_hdt(c_tx, s);
	}
	if (!rv) {
		/*
		 * One segment sent. Processing completed if last
		 * segment, Do next segment otherwise.
		 */
		if (unlikely(c_tx->tx_suspend)) {
			/*
			 * Verbs, 6.4.: Try stopping sending after a full
			 * DDP segment if the connection goes down
			 * (== peer halfclose)
			 */
			rv = -ECONNABORTED;
			goto tx_done;
		}
		if (c_tx->pkt.ctrl.ddp_rdmap_ctrl & DDP_FLAG_LAST) {
			siw_dbg_qp(qp, "WQE completed\n");
			goto tx_done;
		}
		c_tx->state = SIW_SEND_HDR;

		siw_update_tcpseg(c_tx, s);

		siw_prepare_fpdu(qp, wqe);
		goto next_segment;
	}
tx_done:
	/* Keep the remaining credit for the next call */
	qp->tx_ctx.burst = burst_len;
	return rv;

tx_error:
	if (ecode != RDMAP_ECODE_CATASTROPHIC_STREAM)
		siw_init_terminate(qp, TERM_ERROR_LAYER_RDMAP,
				   RDMAP_ETYPE_REMOTE_PROTECTION, ecode, 1);
	else
		siw_init_terminate(qp, TERM_ERROR_LAYER_RDMAP,
				   RDMAP_ETYPE_CATASTROPHIC,
				   RDMAP_ECODE_UNSPECIFIED, 1);
	return rv;
}

/*
 * Process a fast registration work request: validate the STag given
 * in sqe->rkey against the MR referenced by sqe->base_mr and the
 * memory object found for it, then mark that STag valid with the
 * access rights from sqe->access.
 *
 * Returns 0 on success, -EINVAL if the MR is missing, its key does
 * not match (low 8 bits ignored), no memory object is found, the PD
 * differs, or the STag is already valid.
 */
static int siw_fastreg_mr(struct ib_pd *pd, struct siw_sqe *sqe)
{
	struct ib_mr *base_mr = (struct ib_mr *)(uintptr_t)sqe->base_mr;
	struct siw_device *sdev = to_siw_dev(pd->device);
	struct siw_mem *mem;
	int rv = 0;

	siw_dbg_pd(pd, "STag 0x%08x\n", sqe->rkey);

	if (unlikely(!base_mr)) {
		pr_warn("siw: fastreg: STag 0x%08x unknown\n", sqe->rkey);
		return -EINVAL;
	}

	if (unlikely(base_mr->rkey >> 8 != sqe->rkey >> 8)) {
		pr_warn("siw: fastreg: STag 0x%08x: bad MR\n", sqe->rkey);
		return -EINVAL;
	}

	mem = siw_mem_id2obj(sdev, sqe->rkey >> 8);
	if (unlikely(!mem)) {
		pr_warn("siw: fastreg: STag 0x%08x unknown\n", sqe->rkey);
		return -EINVAL;
	}

	if (unlikely(mem->pd != pd)) {
		pr_warn("siw: fastreg: PD mismatch\n");
		rv = -EINVAL;
		goto out;
	}
	if (unlikely(mem->stag_valid)) {
		pr_warn("siw: fastreg: STag 0x%08x already valid\n", sqe->rkey);
		rv = -EINVAL;
		goto out;
	}
	/* Refresh STag since user may have changed key part */
	mem->stag = sqe->rkey;
	mem->perms = sqe->access;

	siw_dbg_mem(mem, "STag 0x%08x now valid\n", sqe->rkey);
	mem->va = base_mr->iova;
	mem->stag_valid = 1;
out:
	/* NOTE(review): presumably pairs with a reference taken by siw_mem_id2obj() - confirm */
	siw_mem_put(mem);
	return rv;
}

/*
 * Process a WQE that needs no transmission: SIW_OP_REG_MR and
 * SIW_OP_INVAL_STAG. Any other type yields -EINVAL.
 */
static int siw_qp_sq_proc_local(struct siw_qp *qp, struct siw_wqe *wqe)
{
	int rv;

	switch (tx_type(wqe)) {
	case SIW_OP_REG_MR:
		rv = siw_fastreg_mr(qp->pd, &wqe->sqe);
		break;

	case SIW_OP_INVAL_STAG:
		rv = siw_invalidate_stag(qp->pd, wqe->sqe.rkey);
		break;

	default:
		rv = -EINVAL;
	}
	return rv;
}

/*
 * siw_qp_sq_process()
 *
 * Core TX path routine for RDMAP/DDP/MPA using a TCP kernel socket.
 * Sends RDMAP payload for the current SQ WR @wqe of @qp in one or more
 * MPA FPDUs, each containing a DDP segment.
 *
 * SQ processing may occur in user context as a result of posting
 * new WQE's or from siw_tx_thread context. Processing in
 * user context is limited to non-kernel verbs users.
 *
 * SQ processing may get paused anytime, possibly in the middle of a WR
 * or FPDU, if insufficient send space is available. SQ processing
 * gets resumed from siw_tx_thread, if send space becomes available again.
 *
 * Must be called with the QP state read-locked.
 *
 * Note:
 * An outbound RREQ can be satisfied by the corresponding RRESP
 * _before_ it gets assigned to the ORQ. This happens regularly
 * in RDMA READ via loopback case. Since both outbound RREQ and
 * inbound RRESP can be handled by the same CPU, locking the ORQ
 * is dead-lock prone and thus not an option. With that, the
 * RREQ gets assigned to the ORQ _before_ being sent - see
 * siw_activate_tx() - and pulled back in case of send failure.
1017 */ 1018 int siw_qp_sq_process(struct siw_qp *qp) 1019 { 1020 struct siw_wqe *wqe = tx_wqe(qp); 1021 enum siw_opcode tx_type; 1022 unsigned long flags; 1023 int rv = 0; 1024 1025 siw_dbg_qp(qp, "enter for type %d\n", tx_type(wqe)); 1026 1027 next_wqe: 1028 /* 1029 * Stop QP processing if SQ state changed 1030 */ 1031 if (unlikely(qp->tx_ctx.tx_suspend)) { 1032 siw_dbg_qp(qp, "tx suspended\n"); 1033 goto done; 1034 } 1035 tx_type = tx_type(wqe); 1036 1037 if (tx_type <= SIW_OP_READ_RESPONSE) 1038 rv = siw_qp_sq_proc_tx(qp, wqe); 1039 else 1040 rv = siw_qp_sq_proc_local(qp, wqe); 1041 1042 if (!rv) { 1043 /* 1044 * WQE processing done 1045 */ 1046 switch (tx_type) { 1047 case SIW_OP_SEND: 1048 case SIW_OP_SEND_REMOTE_INV: 1049 case SIW_OP_WRITE: 1050 siw_wqe_put_mem(wqe, tx_type); 1051 fallthrough; 1052 1053 case SIW_OP_INVAL_STAG: 1054 case SIW_OP_REG_MR: 1055 if (tx_flags(wqe) & SIW_WQE_SIGNALLED) 1056 siw_sqe_complete(qp, &wqe->sqe, wqe->bytes, 1057 SIW_WC_SUCCESS); 1058 break; 1059 1060 case SIW_OP_READ: 1061 case SIW_OP_READ_LOCAL_INV: 1062 /* 1063 * already enqueued to ORQ queue 1064 */ 1065 break; 1066 1067 case SIW_OP_READ_RESPONSE: 1068 siw_wqe_put_mem(wqe, tx_type); 1069 break; 1070 1071 default: 1072 WARN(1, "undefined WQE type %d\n", tx_type); 1073 rv = -EINVAL; 1074 goto done; 1075 } 1076 1077 spin_lock_irqsave(&qp->sq_lock, flags); 1078 wqe->wr_status = SIW_WR_IDLE; 1079 rv = siw_activate_tx(qp); 1080 spin_unlock_irqrestore(&qp->sq_lock, flags); 1081 1082 if (rv <= 0) 1083 goto done; 1084 1085 goto next_wqe; 1086 1087 } else if (rv == -EAGAIN) { 1088 siw_dbg_qp(qp, "sq paused: hd/tr %d of %d, data %d\n", 1089 qp->tx_ctx.ctrl_sent, qp->tx_ctx.ctrl_len, 1090 qp->tx_ctx.bytes_unsent); 1091 rv = 0; 1092 goto done; 1093 } else if (rv == -EINPROGRESS) { 1094 rv = siw_sq_start(qp); 1095 goto done; 1096 } else { 1097 /* 1098 * WQE processing failed. 1099 * Verbs 8.3.2: 1100 * o It turns any WQE into a signalled WQE. 
1101 * o Local catastrophic error must be surfaced 1102 * o QP must be moved into Terminate state: done by code 1103 * doing socket state change processing 1104 * 1105 * o TODO: Termination message must be sent. 1106 * o TODO: Implement more precise work completion errors, 1107 * see enum ib_wc_status in ib_verbs.h 1108 */ 1109 siw_dbg_qp(qp, "wqe type %d processing failed: %d\n", 1110 tx_type(wqe), rv); 1111 1112 spin_lock_irqsave(&qp->sq_lock, flags); 1113 /* 1114 * RREQ may have already been completed by inbound RRESP! 1115 */ 1116 if ((tx_type == SIW_OP_READ || 1117 tx_type == SIW_OP_READ_LOCAL_INV) && qp->attrs.orq_size) { 1118 /* Cleanup pending entry in ORQ */ 1119 qp->orq_put--; 1120 qp->orq[qp->orq_put % qp->attrs.orq_size].flags = 0; 1121 } 1122 spin_unlock_irqrestore(&qp->sq_lock, flags); 1123 /* 1124 * immediately suspends further TX processing 1125 */ 1126 if (!qp->tx_ctx.tx_suspend) 1127 siw_qp_cm_drop(qp, 0); 1128 1129 switch (tx_type) { 1130 case SIW_OP_SEND: 1131 case SIW_OP_SEND_REMOTE_INV: 1132 case SIW_OP_SEND_WITH_IMM: 1133 case SIW_OP_WRITE: 1134 case SIW_OP_READ: 1135 case SIW_OP_READ_LOCAL_INV: 1136 siw_wqe_put_mem(wqe, tx_type); 1137 fallthrough; 1138 1139 case SIW_OP_INVAL_STAG: 1140 case SIW_OP_REG_MR: 1141 siw_sqe_complete(qp, &wqe->sqe, wqe->bytes, 1142 SIW_WC_LOC_QP_OP_ERR); 1143 1144 siw_qp_event(qp, IB_EVENT_QP_FATAL); 1145 1146 break; 1147 1148 case SIW_OP_READ_RESPONSE: 1149 siw_dbg_qp(qp, "proc. 
read.response failed: %d\n", rv); 1150 1151 siw_qp_event(qp, IB_EVENT_QP_REQ_ERR); 1152 1153 siw_wqe_put_mem(wqe, SIW_OP_READ_RESPONSE); 1154 1155 break; 1156 1157 default: 1158 WARN(1, "undefined WQE type %d\n", tx_type); 1159 rv = -EINVAL; 1160 } 1161 wqe->wr_status = SIW_WR_IDLE; 1162 } 1163 done: 1164 return rv; 1165 } 1166 1167 static void siw_sq_resume(struct siw_qp *qp) 1168 { 1169 if (down_read_trylock(&qp->state_lock)) { 1170 if (likely(qp->attrs.state == SIW_QP_STATE_RTS && 1171 !qp->tx_ctx.tx_suspend)) { 1172 int rv = siw_qp_sq_process(qp); 1173 1174 up_read(&qp->state_lock); 1175 1176 if (unlikely(rv < 0)) { 1177 siw_dbg_qp(qp, "SQ task failed: err %d\n", rv); 1178 1179 if (!qp->tx_ctx.tx_suspend) 1180 siw_qp_cm_drop(qp, 0); 1181 } 1182 } else { 1183 up_read(&qp->state_lock); 1184 } 1185 } else { 1186 siw_dbg_qp(qp, "Resume SQ while QP locked\n"); 1187 } 1188 siw_qp_put(qp); 1189 } 1190 1191 struct tx_task_t { 1192 struct llist_head active; 1193 wait_queue_head_t waiting; 1194 }; 1195 1196 static DEFINE_PER_CPU(struct tx_task_t, siw_tx_task_g); 1197 1198 int siw_create_tx_threads(void) 1199 { 1200 int cpu, assigned = 0; 1201 1202 for_each_online_cpu(cpu) { 1203 struct tx_task_t *tx_task; 1204 1205 /* Skip HT cores */ 1206 if (cpu % cpumask_weight(topology_sibling_cpumask(cpu))) 1207 continue; 1208 1209 tx_task = &per_cpu(siw_tx_task_g, cpu); 1210 init_llist_head(&tx_task->active); 1211 init_waitqueue_head(&tx_task->waiting); 1212 1213 siw_tx_thread[cpu] = 1214 kthread_run_on_cpu(siw_run_sq, 1215 (unsigned long *)(long)cpu, 1216 cpu, "siw_tx/%u"); 1217 if (IS_ERR(siw_tx_thread[cpu])) { 1218 siw_tx_thread[cpu] = NULL; 1219 continue; 1220 } 1221 assigned++; 1222 } 1223 return assigned; 1224 } 1225 1226 void siw_stop_tx_threads(void) 1227 { 1228 int cpu; 1229 1230 for_each_possible_cpu(cpu) { 1231 if (siw_tx_thread[cpu]) { 1232 kthread_stop(siw_tx_thread[cpu]); 1233 wake_up(&per_cpu(siw_tx_task_g, cpu).waiting); 1234 siw_tx_thread[cpu] = NULL; 1235 } 1236 } 
1237 } 1238 1239 int siw_run_sq(void *data) 1240 { 1241 const int nr_cpu = (unsigned int)(long)data; 1242 struct llist_node *active; 1243 struct siw_qp *qp; 1244 struct tx_task_t *tx_task = &per_cpu(siw_tx_task_g, nr_cpu); 1245 1246 while (1) { 1247 struct llist_node *fifo_list = NULL; 1248 1249 wait_event_interruptible(tx_task->waiting, 1250 !llist_empty(&tx_task->active) || 1251 kthread_should_stop()); 1252 1253 if (kthread_should_stop()) 1254 break; 1255 1256 active = llist_del_all(&tx_task->active); 1257 /* 1258 * llist_del_all returns a list with newest entry first. 1259 * Re-order list for fairness among QP's. 1260 */ 1261 fifo_list = llist_reverse_order(active); 1262 while (fifo_list) { 1263 qp = container_of(fifo_list, struct siw_qp, tx_list); 1264 fifo_list = llist_next(fifo_list); 1265 qp->tx_list.next = NULL; 1266 1267 siw_sq_resume(qp); 1268 } 1269 } 1270 active = llist_del_all(&tx_task->active); 1271 if (active) { 1272 llist_for_each_entry(qp, active, tx_list) { 1273 qp->tx_list.next = NULL; 1274 siw_sq_resume(qp); 1275 } 1276 } 1277 return 0; 1278 } 1279 1280 int siw_sq_start(struct siw_qp *qp) 1281 { 1282 if (tx_wqe(qp)->wr_status == SIW_WR_IDLE) 1283 return 0; 1284 1285 if (unlikely(!cpu_online(qp->tx_cpu))) { 1286 siw_put_tx_cpu(qp->tx_cpu); 1287 qp->tx_cpu = siw_get_tx_cpu(qp->sdev); 1288 if (qp->tx_cpu < 0) { 1289 pr_warn("siw: no tx cpu available\n"); 1290 1291 return -EIO; 1292 } 1293 } 1294 siw_qp_get(qp); 1295 1296 llist_add(&qp->tx_list, &per_cpu(siw_tx_task_g, qp->tx_cpu).active); 1297 1298 wake_up(&per_cpu(siw_tx_task_g, qp->tx_cpu).waiting); 1299 1300 return 0; 1301 } 1302