// SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
/*
 * Copyright(c) 2020 - 2023 Cornelis Networks, Inc.
 * Copyright(c) 2015 - 2018 Intel Corporation.
 */

#include <linux/mm.h>
#include <linux/types.h>
#include <linux/device.h>
#include <linux/dmapool.h>
#include <linux/slab.h>
#include <linux/list.h>
#include <linux/highmem.h>
#include <linux/io.h>
#include <linux/uio.h>
#include <linux/rbtree.h>
#include <linux/spinlock.h>
#include <linux/delay.h>
#include <linux/kthread.h>
#include <linux/mmu_context.h>
#include <linux/module.h>
#include <linux/vmalloc.h>
#include <linux/string.h>

#include "hfi.h"
#include "sdma.h"
#include "user_sdma.h"
#include "verbs.h"  /* for the headers */
#include "common.h" /* for struct hfi1_tid_info */
#include "trace.h"

static uint hfi1_sdma_comp_ring_size = 128;
module_param_named(sdma_comp_size, hfi1_sdma_comp_ring_size, uint, S_IRUGO);
MODULE_PARM_DESC(sdma_comp_size, "Size of User SDMA completion ring. Default: 128");

static unsigned initial_pkt_count = 8;

static int user_sdma_send_pkts(struct user_sdma_request *req, u16 maxpkts);
static void user_sdma_txreq_cb(struct sdma_txreq *txreq, int status);
static inline void pq_update(struct hfi1_user_sdma_pkt_q *pq);
static void user_sdma_free_request(struct user_sdma_request *req);
static int check_header_template(struct user_sdma_request *req,
				 struct hfi1_pkt_header *hdr, u32 lrhlen,
				 u32 datalen);
static int set_txreq_header(struct user_sdma_request *req,
			    struct user_sdma_txreq *tx, u32 datalen);
static int set_txreq_header_ahg(struct user_sdma_request *req,
				struct user_sdma_txreq *tx, u32 len);
static inline void set_comp_state(struct hfi1_user_sdma_pkt_q *pq,
				  struct hfi1_user_sdma_comp_q *cq,
				  u16 idx, enum hfi1_sdma_comp_state state,
				  int ret);
static inline u32 set_pkt_bth_psn(__be32 bthpsn, u8 expct, u32 frags);
static inline u32 get_lrh_len(struct hfi1_pkt_header, u32 len);

static int defer_packet_queue(
	struct sdma_engine *sde,
	struct iowait_work *wait,
	struct sdma_txreq *txreq,
	uint seq,
	bool pkts_sent);
static void activate_packet_queue(struct iowait *wait, int reason);

static int defer_packet_queue(
	struct sdma_engine *sde,
	struct iowait_work *wait,
	struct sdma_txreq *txreq,
	uint seq,
	bool pkts_sent)
{
	struct hfi1_user_sdma_pkt_q *pq =
		container_of(wait->iow, struct hfi1_user_sdma_pkt_q, busy);

	write_seqlock(&sde->waitlock);
	trace_hfi1_usdma_defer(pq, sde, &pq->busy);
	if (sdma_progress(sde, seq, txreq))
		goto eagain;
	/*
	 * We are assuming that if the list is enqueued somewhere, it
	 * is to the dmawait list since that is the only place where
	 * it is supposed to be enqueued.
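	 * Queueing pq->busy on sde->dmawait (under sde->waitlock) lets the
	 * engine's descriptor-completion path find this packet queue and
	 * reactivate it once descriptor space frees up.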
	 */
	xchg(&pq->state, SDMA_PKT_Q_DEFERRED);
	if (list_empty(&pq->busy.list)) {
		pq->busy.lock = &sde->waitlock;
		iowait_get_priority(&pq->busy);
		iowait_queue(pkts_sent, &pq->busy, &sde->dmawait);
	}
	write_sequnlock(&sde->waitlock);
	return -EBUSY;
eagain:
	write_sequnlock(&sde->waitlock);
	return -EAGAIN;
}

static void activate_packet_queue(struct iowait *wait, int reason)
{
	struct hfi1_user_sdma_pkt_q *pq =
		container_of(wait, struct hfi1_user_sdma_pkt_q, busy);

	trace_hfi1_usdma_activate(pq, wait, reason);
	xchg(&pq->state, SDMA_PKT_Q_ACTIVE);
	wake_up(&wait->wait_dma);
}

int hfi1_user_sdma_alloc_queues(struct hfi1_ctxtdata *uctxt,
				struct hfi1_filedata *fd)
{
	int ret = -ENOMEM;
	char buf[64];
	struct hfi1_devdata *dd;
	struct hfi1_user_sdma_comp_q *cq;
	struct hfi1_user_sdma_pkt_q *pq;

	if (!uctxt || !fd)
		return -EBADF;

	if (!hfi1_sdma_comp_ring_size)
		return -EINVAL;

	dd = uctxt->dd;

	pq = kzalloc(sizeof(*pq), GFP_KERNEL);
	if (!pq)
		return -ENOMEM;
	pq->dd = dd;
	pq->ctxt = uctxt->ctxt;
	pq->subctxt = fd->subctxt;
	pq->n_max_reqs = hfi1_sdma_comp_ring_size;
	atomic_set(&pq->n_reqs, 0);
	init_waitqueue_head(&pq->wait);
	atomic_set(&pq->n_locked, 0);

	iowait_init(&pq->busy, 0, NULL, NULL, defer_packet_queue,
		    activate_packet_queue, NULL, NULL);
	pq->reqidx = 0;

	pq->reqs = kcalloc(hfi1_sdma_comp_ring_size,
			   sizeof(*pq->reqs),
			   GFP_KERNEL);
	if (!pq->reqs)
		goto pq_reqs_nomem;

	pq->req_in_use = bitmap_zalloc(hfi1_sdma_comp_ring_size, GFP_KERNEL);
	if (!pq->req_in_use)
		goto pq_reqs_no_in_use;

	snprintf(buf, 64, "txreq-kmem-cache-%u-%u-%u", dd->unit, uctxt->ctxt,
		 fd->subctxt);
	pq->txreq_cache = kmem_cache_create(buf,
					    sizeof(struct user_sdma_txreq),
					    L1_CACHE_BYTES,
					    SLAB_HWCACHE_ALIGN,
					    NULL);
	if (!pq->txreq_cache) {
		dd_dev_err(dd, "[%u] Failed to allocate TxReq cache\n",
			   uctxt->ctxt);
		goto pq_txreq_nomem;
	}

	cq = kzalloc(sizeof(*cq), GFP_KERNEL);
	if (!cq)
		goto cq_nomem;

	cq->comps = vmalloc_user(PAGE_ALIGN(sizeof(*cq->comps)
				 * hfi1_sdma_comp_ring_size));
	if (!cq->comps)
		goto cq_comps_nomem;

	cq->nentries = hfi1_sdma_comp_ring_size;

	ret = hfi1_init_system_pinning(pq);
	if (ret)
		goto pq_mmu_fail;

	rcu_assign_pointer(fd->pq, pq);
	fd->cq = cq;

	return 0;

pq_mmu_fail:
	vfree(cq->comps);
cq_comps_nomem:
	kfree(cq);
cq_nomem:
	kmem_cache_destroy(pq->txreq_cache);
pq_txreq_nomem:
	bitmap_free(pq->req_in_use);
pq_reqs_no_in_use:
	kfree(pq->reqs);
pq_reqs_nomem:
	kfree(pq);

	return ret;
}

static void flush_pq_iowait(struct hfi1_user_sdma_pkt_q *pq)
{
	unsigned long flags;
	seqlock_t *lock = pq->busy.lock;

	if (!lock)
		return;
	write_seqlock_irqsave(lock, flags);
	if (!list_empty(&pq->busy.list)) {
		list_del_init(&pq->busy.list);
		pq->busy.lock = NULL;
	}
	write_sequnlock_irqrestore(lock, flags);
}

int hfi1_user_sdma_free_queues(struct hfi1_filedata *fd,
			       struct hfi1_ctxtdata *uctxt)
{
	struct hfi1_user_sdma_pkt_q *pq;

	trace_hfi1_sdma_user_free_queues(uctxt->dd, uctxt->ctxt, fd->subctxt);

	spin_lock(&fd->pq_rcu_lock);
	pq = srcu_dereference_check(fd->pq, &fd->pq_srcu,
				    lockdep_is_held(&fd->pq_rcu_lock));
	if (pq) {
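		/*
		 * Unpublish the pq first; after synchronize_srcu() below,
		 * no reader holding fd->pq_srcu can still see it.
		 */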
		rcu_assign_pointer(fd->pq, NULL);
		spin_unlock(&fd->pq_rcu_lock);
		synchronize_srcu(&fd->pq_srcu);
		/* at this point there can be no more new requests */
		iowait_sdma_drain(&pq->busy);
		/* Wait until all requests have been freed. */
		wait_event_interruptible(
			pq->wait,
			!atomic_read(&pq->n_reqs));
		kfree(pq->reqs);
		hfi1_free_system_pinning(pq);
		bitmap_free(pq->req_in_use);
		kmem_cache_destroy(pq->txreq_cache);
		flush_pq_iowait(pq);
		kfree(pq);
	} else {
		spin_unlock(&fd->pq_rcu_lock);
	}
	if (fd->cq) {
		vfree(fd->cq->comps);
		kfree(fd->cq);
		fd->cq = NULL;
	}
	return 0;
}

static u8 dlid_to_selector(u16 dlid)
{
	static u8 mapping[256];
	static int initialized;
	static u8 next;
	int hash;

	if (!initialized) {
		memset(mapping, 0xFF, 256);
		initialized = 1;
	}

	hash = ((dlid >> 8) ^ dlid) & 0xFF;
	if (mapping[hash] == 0xFF) {
		mapping[hash] = next;
		next = (next + 1) & 0x7F;
	}

	return mapping[hash];
}

/**
 * hfi1_user_sdma_process_request() - Process and start a user sdma request
 * @fd: valid file descriptor
 * @iovec: array of io vectors to process
 * @dim: overall iovec array size
 * @count: number of io vector array entries processed
 */
int hfi1_user_sdma_process_request(struct hfi1_filedata *fd,
				   struct iovec *iovec, unsigned long dim,
				   unsigned long *count)
{
	int ret = 0, i;
	struct hfi1_ctxtdata *uctxt = fd->uctxt;
	struct hfi1_user_sdma_pkt_q *pq =
		srcu_dereference(fd->pq, &fd->pq_srcu);
	struct hfi1_user_sdma_comp_q *cq = fd->cq;
	struct hfi1_devdata *dd = pq->dd;
	unsigned long idx = 0;
	u8 pcount = initial_pkt_count;
	struct sdma_req_info info;
	struct user_sdma_request *req;
	u8 opcode, sc, vl;
	u16 pkey;
	u32 slid;
	u16 dlid;
	u32 selector;

	if (iovec[idx].iov_len < sizeof(info) + sizeof(req->hdr)) {
		hfi1_cdbg(
			SDMA,
			"[%u:%u:%u] First vector not big enough for header %lu/%lu",
			dd->unit, uctxt->ctxt, fd->subctxt,
			iovec[idx].iov_len, sizeof(info) + sizeof(req->hdr));
		return -EINVAL;
	}
	ret = copy_from_user(&info, iovec[idx].iov_base, sizeof(info));
	if (ret) {
		hfi1_cdbg(SDMA, "[%u:%u:%u] Failed to copy info QW (%d)",
			  dd->unit, uctxt->ctxt, fd->subctxt, ret);
		return -EFAULT;
	}

	trace_hfi1_sdma_user_reqinfo(dd, uctxt->ctxt, fd->subctxt,
				     (u16 *)&info);
	if (info.comp_idx >= hfi1_sdma_comp_ring_size) {
		hfi1_cdbg(SDMA,
			  "[%u:%u:%u:%u] Invalid comp index",
			  dd->unit, uctxt->ctxt, fd->subctxt, info.comp_idx);
		return -EINVAL;
	}

	/*
	 * Sanity check the header io vector count. Need at least 1 vector
	 * (header) and cannot be larger than the actual io vector count.
	 */
	if (req_iovcnt(info.ctrl) < 1 || req_iovcnt(info.ctrl) > dim) {
		hfi1_cdbg(SDMA,
			  "[%u:%u:%u:%u] Invalid iov count %d, dim %ld",
			  dd->unit, uctxt->ctxt, fd->subctxt, info.comp_idx,
			  req_iovcnt(info.ctrl), dim);
		return -EINVAL;
	}

	if (!info.fragsize) {
		hfi1_cdbg(SDMA,
			  "[%u:%u:%u:%u] Request does not specify fragsize",
			  dd->unit, uctxt->ctxt, fd->subctxt, info.comp_idx);
		return -EINVAL;
	}

	/*
	 * Try to claim the request.
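	 * comp_idx doubles as the slot index into both pq->reqs[] and the
	 * completion ring, so the bitmap also prevents user space from
	 * reusing an entry that is still in flight.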
	 */
	if (test_and_set_bit(info.comp_idx, pq->req_in_use)) {
		hfi1_cdbg(SDMA, "[%u:%u:%u] Entry %u is in use",
			  dd->unit, uctxt->ctxt, fd->subctxt,
			  info.comp_idx);
		return -EBADSLT;
	}
	/*
	 * All safety checks have been done and this request has been claimed.
	 */
	trace_hfi1_sdma_user_process_request(dd, uctxt->ctxt, fd->subctxt,
					     info.comp_idx);
	req = pq->reqs + info.comp_idx;
	req->data_iovs = req_iovcnt(info.ctrl) - 1; /* subtract header vector */
	req->data_len = 0;
	req->pq = pq;
	req->cq = cq;
	req->ahg_idx = -1;
	req->iov_idx = 0;
	req->sent = 0;
	req->seqnum = 0;
	req->seqcomp = 0;
	req->seqsubmitted = 0;
	req->tids = NULL;
	req->has_error = 0;
	INIT_LIST_HEAD(&req->txps);

	memcpy(&req->info, &info, sizeof(info));

	/* The request is initialized, count it */
	atomic_inc(&pq->n_reqs);

	if (req_opcode(info.ctrl) == EXPECTED) {
		/* expected requests need a TID info vector and at least one data vector */
		if (req->data_iovs < 2) {
			SDMA_DBG(req,
				 "Not enough vectors for expected request");
			ret = -EINVAL;
			goto free_req;
		}
		req->data_iovs--;
	}

	if (!info.npkts || req->data_iovs > MAX_VECTORS_PER_REQ) {
		SDMA_DBG(req, "Too many vectors (%u/%u)", req->data_iovs,
			 MAX_VECTORS_PER_REQ);
		ret = -EINVAL;
		goto free_req;
	}

	/* Copy the header from the user buffer */
	ret = copy_from_user(&req->hdr, iovec[idx].iov_base + sizeof(info),
			     sizeof(req->hdr));
	if (ret) {
		SDMA_DBG(req, "Failed to copy header template (%d)", ret);
		ret = -EFAULT;
		goto free_req;
	}

	/* If Static rate control is not enabled, sanitize the header. */
	if (!HFI1_CAP_IS_USET(STATIC_RATE_CTRL))
		req->hdr.pbc[2] = 0;

	/* Validate the opcode. Do not trust packets from user space blindly. */
	opcode = (be32_to_cpu(req->hdr.bth[0]) >> 24) & 0xff;
	if ((opcode & USER_OPCODE_CHECK_MASK) !=
	    USER_OPCODE_CHECK_VAL) {
		SDMA_DBG(req, "Invalid opcode (%d)", opcode);
		ret = -EINVAL;
		goto free_req;
	}
	/*
	 * Validate the vl. Do not trust packets from user space blindly.
	 * VL comes from PBC, SC comes from LRH, and the VL needs to
	 * match the SC look up.
	 */
	vl = (le16_to_cpu(req->hdr.pbc[0]) >> 12) & 0xF;
	sc = (((be16_to_cpu(req->hdr.lrh[0]) >> 12) & 0xF) |
	      (((le16_to_cpu(req->hdr.pbc[1]) >> 14) & 0x1) << 4));
	if (vl >= dd->pport->vls_operational ||
	    vl != sc_to_vlt(dd, sc)) {
		SDMA_DBG(req, "Invalid SC(%u)/VL(%u)", sc, vl);
		ret = -EINVAL;
		goto free_req;
	}

	/* Checking P_KEY for requests from user-space */
	pkey = (u16)be32_to_cpu(req->hdr.bth[0]);
	slid = be16_to_cpu(req->hdr.lrh[3]);
	if (egress_pkey_check(dd->pport, slid, pkey, sc, PKEY_CHECK_INVALID)) {
		ret = -EINVAL;
		goto free_req;
	}

	/*
	 * We should also check BTH.lnh. If it says the next header is a GRH,
	 * then the RXE parsing will be off and will land in the middle of
	 * the KDETH or miss it entirely.
	 */
	if ((be16_to_cpu(req->hdr.lrh[0]) & 0x3) == HFI1_LRH_GRH) {
		SDMA_DBG(req, "User tried to pass in a GRH");
		ret = -EINVAL;
		goto free_req;
	}

	req->koffset = le32_to_cpu(req->hdr.kdeth.swdata[6]);
	/*
	 * Calculate the initial TID offset based on the values of
	 * KDETH.OFFSET and KDETH.OM that are passed in.
	 */
	req->tidoffset = KDETH_GET(req->hdr.kdeth.ver_tid_offset, OFFSET) *
		(KDETH_GET(req->hdr.kdeth.ver_tid_offset, OM) ?
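		 /* OM selects the offset unit: KDETH_OM_LARGE (64B) when set,
		  * KDETH_OM_SMALL (4B) otherwise.
		  */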
		 KDETH_OM_LARGE : KDETH_OM_SMALL);
	trace_hfi1_sdma_user_initial_tidoffset(dd, uctxt->ctxt, fd->subctxt,
					       info.comp_idx, req->tidoffset);
	idx++;

	/* Save all the IO vector structures */
	for (i = 0; i < req->data_iovs; i++) {
		req->iovs[i].offset = 0;
		INIT_LIST_HEAD(&req->iovs[i].list);
		memcpy(&req->iovs[i].iov,
		       iovec + idx++,
		       sizeof(req->iovs[i].iov));
		if (req->iovs[i].iov.iov_len == 0) {
			ret = -EINVAL;
			goto free_req;
		}
		req->data_len += req->iovs[i].iov.iov_len;
	}
	trace_hfi1_sdma_user_data_length(dd, uctxt->ctxt, fd->subctxt,
					 info.comp_idx, req->data_len);
	if (pcount > req->info.npkts)
		pcount = req->info.npkts;
	/*
	 * Copy any TID info.
	 * User space will provide the TID info only when the
	 * request type is EXPECTED. This is true even if there is
	 * only one packet in the request and the header is already
	 * set up. The reason for the singular TID case is that the
	 * driver needs to perform safety checks.
	 */
	if (req_opcode(req->info.ctrl) == EXPECTED) {
		u16 ntids = iovec[idx].iov_len / sizeof(*req->tids);
		u32 *tmp;

		if (!ntids || ntids > MAX_TID_PAIR_ENTRIES) {
			ret = -EINVAL;
			goto free_req;
		}

		/*
		 * We have to copy all of the tids because they may vary
		 * in size and, therefore, the TID count might not be
		 * equal to the pkt count. However, there is no way to
		 * tell at this point.
		 */
		tmp = memdup_array_user(iovec[idx].iov_base,
					ntids, sizeof(*req->tids));
		if (IS_ERR(tmp)) {
			ret = PTR_ERR(tmp);
			SDMA_DBG(req, "Failed to copy %d TIDs (%d)",
				 ntids, ret);
			goto free_req;
		}
		req->tids = tmp;
		req->n_tids = ntids;
		req->tididx = 0;
		idx++;
	}

	dlid = be16_to_cpu(req->hdr.lrh[1]);
	selector = dlid_to_selector(dlid);
	selector += uctxt->ctxt + fd->subctxt;
	req->sde = sdma_select_user_engine(dd, selector, vl);

	if (!req->sde || !sdma_running(req->sde)) {
		ret = -ECOMM;
		goto free_req;
	}

	/* We don't need an AHG entry if the request contains only one packet */
	if (req->info.npkts > 1 && HFI1_CAP_IS_USET(SDMA_AHG))
		req->ahg_idx = sdma_ahg_alloc(req->sde);

	set_comp_state(pq, cq, info.comp_idx, QUEUED, 0);
	pq->state = SDMA_PKT_Q_ACTIVE;

	/*
	 * This is a somewhat blocking send implementation.
	 * The driver will block the caller until all packets of the
	 * request have been submitted to the SDMA engine. However, it
	 * will not wait for send completions.
	 */
	while (req->seqsubmitted != req->info.npkts) {
		ret = user_sdma_send_pkts(req, pcount);
		if (ret < 0) {
			int we_ret;

			if (ret != -EBUSY)
				goto free_req;
			we_ret = wait_event_interruptible_timeout(
				pq->busy.wait_dma,
				pq->state == SDMA_PKT_Q_ACTIVE,
				msecs_to_jiffies(
					SDMA_IOWAIT_TIMEOUT));
			trace_hfi1_usdma_we(pq, we_ret);
			if (we_ret <= 0)
				flush_pq_iowait(pq);
		}
	}
	*count += idx;
	return 0;
free_req:
	/*
	 * If seqsubmitted == npkts, the completion routine controls the
	 * final state. If seqsubmitted < npkts, wait for any outstanding
	 * packets to finish before cleaning up.
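	 * seqcomp holds the 0-based sequence number of the last completed
	 * packet, so "all submitted packets are done" is expressed below as
	 * seqcomp == seqsubmitted - 1.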
	 */
	if (req->seqsubmitted < req->info.npkts) {
		if (req->seqsubmitted)
			wait_event(pq->busy.wait_dma,
				   (req->seqcomp == req->seqsubmitted - 1));
		user_sdma_free_request(req);
		pq_update(pq);
		set_comp_state(pq, cq, info.comp_idx, ERROR, ret);
	}
	return ret;
}

static inline u32 compute_data_length(struct user_sdma_request *req,
				      struct user_sdma_txreq *tx)
{
	/*
	 * Determine the proper size of the packet data.
	 * The size of the data of the first packet is in the header
	 * template. However, it includes the header and ICRC, which need
	 * to be subtracted.
	 * The minimum representable packet data length in a header is 4
	 * bytes; therefore, when the requested data length is less than 4
	 * bytes, there is only one packet, and the packet data length is
	 * equal to the request data length.
	 * The size of the remaining packets is the minimum of the frag
	 * size (MTU) or remaining data in the request.
	 */
	u32 len;

	if (!req->seqnum) {
		if (req->data_len < sizeof(u32))
			len = req->data_len;
		else
			len = ((be16_to_cpu(req->hdr.lrh[2]) << 2) -
			       (sizeof(tx->hdr) - 4));
	} else if (req_opcode(req->info.ctrl) == EXPECTED) {
		u32 tidlen = EXP_TID_GET(req->tids[req->tididx], LEN) *
			     PAGE_SIZE;
		/*
		 * Get the data length based on the remaining space in the
		 * TID pair.
		 */
		len = min(tidlen - req->tidoffset, (u32)req->info.fragsize);
		/* If we've filled up the TID pair, move to the next one. */
		if (unlikely(!len) && ++req->tididx < req->n_tids &&
		    req->tids[req->tididx]) {
			tidlen = EXP_TID_GET(req->tids[req->tididx],
					     LEN) * PAGE_SIZE;
			req->tidoffset = 0;
			len = min_t(u32, tidlen, req->info.fragsize);
		}
		/*
		 * Since the TID pairs map entire pages, make sure that we
		 * are not going to try to send more data than we have
		 * remaining.
		 */
		len = min(len, req->data_len - req->sent);
	} else {
		len = min(req->data_len - req->sent, (u32)req->info.fragsize);
	}
	trace_hfi1_sdma_user_compute_length(req->pq->dd,
					    req->pq->ctxt,
					    req->pq->subctxt,
					    req->info.comp_idx,
					    len);
	return len;
}

static inline u32 pad_len(u32 len)
{
	if (len & (sizeof(u32) - 1))
		len += sizeof(u32) - (len & (sizeof(u32) - 1));
	return len;
}

static inline u32 get_lrh_len(struct hfi1_pkt_header hdr, u32 len)
{
	/* (Size of complete header - size of PBC) + 4B ICRC + data length */
	return ((sizeof(hdr) - sizeof(hdr.pbc)) + 4 + len);
}

static int user_sdma_txadd_ahg(struct user_sdma_request *req,
			       struct user_sdma_txreq *tx,
			       u32 datalen)
{
	int ret;
	u16 pbclen = le16_to_cpu(req->hdr.pbc[0]);
	u32 lrhlen = get_lrh_len(req->hdr, pad_len(datalen));
	struct hfi1_user_sdma_pkt_q *pq = req->pq;

	/*
	 * Copy the request header into the tx header
	 * because the HW needs a cacheline-aligned
	 * address.
	 * This copy could be optimized out if the hdr
	 * member of user_sdma_request were also
	 * cacheline aligned.
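	 * The copy also lets us patch the PBC/LRH lengths below without
	 * touching the user-supplied template in req->hdr.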
	 */
	memcpy(&tx->hdr, &req->hdr, sizeof(tx->hdr));
	if (PBC2LRH(pbclen) != lrhlen) {
		pbclen = (pbclen & 0xf000) | LRH2PBC(lrhlen);
		tx->hdr.pbc[0] = cpu_to_le16(pbclen);
	}
	ret = check_header_template(req, &tx->hdr, lrhlen, datalen);
	if (ret)
		return ret;
	ret = sdma_txinit_ahg(&tx->txreq, SDMA_TXREQ_F_AHG_COPY,
			      sizeof(tx->hdr) + datalen, req->ahg_idx,
			      0, NULL, 0, user_sdma_txreq_cb);
	if (ret)
		return ret;
	ret = sdma_txadd_kvaddr(pq->dd, &tx->txreq, &tx->hdr, sizeof(tx->hdr));
	if (ret)
		sdma_txclean(pq->dd, &tx->txreq);
	return ret;
}

static int user_sdma_send_pkts(struct user_sdma_request *req, u16 maxpkts)
{
	int ret = 0;
	u16 count;
	unsigned npkts = 0;
	struct user_sdma_txreq *tx = NULL;
	struct hfi1_user_sdma_pkt_q *pq = NULL;
	struct user_sdma_iovec *iovec = NULL;

	if (!req->pq)
		return -EINVAL;

	pq = req->pq;

	/* If tx completion has reported an error, we are done. */
	if (READ_ONCE(req->has_error))
		return -EFAULT;

	/*
	 * Check if we might have sent the entire request already
	 */
	if (unlikely(req->seqnum == req->info.npkts)) {
		if (!list_empty(&req->txps))
			goto dosend;
		return ret;
	}

	if (!maxpkts || maxpkts > req->info.npkts - req->seqnum)
		maxpkts = req->info.npkts - req->seqnum;

	while (npkts < maxpkts) {
		u32 datalen = 0;

		/*
		 * Check whether any of the completions have come back
		 * with errors. If so, we are not going to process any
		 * more packets from this request.
		 */
		if (READ_ONCE(req->has_error))
			return -EFAULT;

		tx = kmem_cache_alloc(pq->txreq_cache, GFP_KERNEL);
		if (!tx)
			return -ENOMEM;

		tx->flags = 0;
		tx->req = req;
		INIT_LIST_HEAD(&tx->list);

		/*
		 * For the last packet set the ACK request
		 * and disable header suppression.
		 */
		if (req->seqnum == req->info.npkts - 1)
			tx->flags |= (TXREQ_FLAGS_REQ_ACK |
				      TXREQ_FLAGS_REQ_DISABLE_SH);

		/*
		 * Calculate the payload size - this is the minimum of the
		 * fragment (MTU) size and the remaining bytes in the
		 * request, but only if we have payload data.
		 */
		if (req->data_len) {
			iovec = &req->iovs[req->iov_idx];
			if (READ_ONCE(iovec->offset) == iovec->iov.iov_len) {
				if (++req->iov_idx == req->data_iovs) {
					ret = -EFAULT;
					goto free_tx;
				}
				iovec = &req->iovs[req->iov_idx];
				WARN_ON(iovec->offset);
			}

			datalen = compute_data_length(req, tx);

			/*
			 * Disable header suppression for payloads <= 8 DWs.
			 * If there is an uncorrectable error in the receive
			 * data FIFO when the received payload size is less
			 * than or equal to 8 DWs, then RxDmaDataFifoRdUncErr
			 * is not reported. RHF.EccErr is set instead, but
			 * only if the header is not suppressed.
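			 * That is why payloads of 8 DWs (32 bytes) or less
			 * get TXREQ_FLAGS_REQ_DISABLE_SH below.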
			 */
			if (!datalen) {
				SDMA_DBG(req,
					 "Request has data but pkt len is 0");
				ret = -EFAULT;
				goto free_tx;
			} else if (datalen <= 32) {
				tx->flags |= TXREQ_FLAGS_REQ_DISABLE_SH;
			}
		}

		if (req->ahg_idx >= 0) {
			if (!req->seqnum) {
				ret = user_sdma_txadd_ahg(req, tx, datalen);
				if (ret)
					goto free_tx;
			} else {
				int changes;

				changes = set_txreq_header_ahg(req, tx,
							       datalen);
				if (changes < 0) {
					ret = changes;
					goto free_tx;
				}
			}
		} else {
			ret = sdma_txinit(&tx->txreq, 0, sizeof(req->hdr) +
					  datalen, user_sdma_txreq_cb);
			if (ret)
				goto free_tx;
			/*
			 * Modify the header for this packet. This only needs
			 * to be done if we are not going to use AHG. Otherwise,
			 * the HW will do it based on the changes we gave it
			 * during sdma_txinit_ahg().
			 */
			ret = set_txreq_header(req, tx, datalen);
			if (ret)
				goto free_txreq;
		}

		req->koffset += datalen;
		if (req_opcode(req->info.ctrl) == EXPECTED)
			req->tidoffset += datalen;
		req->sent += datalen;
		while (datalen) {
			ret = hfi1_add_pages_to_sdma_packet(req, tx, iovec,
							    &datalen);
			if (ret)
				goto free_txreq;
			iovec = &req->iovs[req->iov_idx];
		}
		list_add_tail(&tx->txreq.list, &req->txps);
		/*
		 * It is important to increment this here as it is used to
		 * generate the BTH.PSN and, therefore, can't be bulk-updated
		 * outside of the loop.
		 */
		tx->seqnum = req->seqnum++;
		npkts++;
	}
dosend:
	ret = sdma_send_txlist(req->sde,
			       iowait_get_ib_work(&pq->busy),
			       &req->txps, &count);
	req->seqsubmitted += count;
	if (req->seqsubmitted == req->info.npkts) {
		/*
		 * The txreq has already been submitted to the HW queue
		 * so we can free the AHG entry now. Corruption will not
		 * happen due to the sequential manner in which
		 * descriptors are processed.
		 */
		if (req->ahg_idx >= 0)
			sdma_ahg_free(req->sde, req->ahg_idx);
	}
	return ret;

free_txreq:
	sdma_txclean(pq->dd, &tx->txreq);
free_tx:
	kmem_cache_free(pq->txreq_cache, tx);
	return ret;
}

static int check_header_template(struct user_sdma_request *req,
				 struct hfi1_pkt_header *hdr, u32 lrhlen,
				 u32 datalen)
{
	/*
	 * Perform safety checks for any type of packet:
	 * - transfer size is a multiple of 64 bytes
	 * - packet length is a multiple of 4 bytes
	 * - packet length is not larger than MTU size
	 *
	 * These checks are only done for the first packet of the
	 * transfer since the header is "given" to us by user space.
	 * For the remainder of the packets we compute the values.
	 */
	if (req->info.fragsize % PIO_BLOCK_SIZE || lrhlen & 0x3 ||
	    lrhlen > get_lrh_len(*hdr, req->info.fragsize))
		return -EINVAL;

	if (req_opcode(req->info.ctrl) == EXPECTED) {
		/*
		 * The header is checked only on the first packet. Furthermore,
		 * we ensure that at least one TID entry is copied when the
		 * request is submitted. Therefore, we don't have to verify that
		 * tididx points to something sane.
		 */
		u32 tidval = req->tids[req->tididx],
		    tidlen = EXP_TID_GET(tidval, LEN) * PAGE_SIZE,
		    tididx = EXP_TID_GET(tidval, IDX),
		    tidctrl = EXP_TID_GET(tidval, CTRL),
		    tidoff;
		__le32 kval = hdr->kdeth.ver_tid_offset;

		tidoff = KDETH_GET(kval, OFFSET) *
			 (KDETH_GET(req->hdr.kdeth.ver_tid_offset, OM) ?
			  KDETH_OM_LARGE : KDETH_OM_SMALL);
		/*
		 * Expected receive packets have the following
		 * additional checks:
		 * - offset is not larger than the TID size
		 * - TIDCtrl values match between header and TID array
		 * - TID indexes match between header and TID array
		 */
		if ((tidoff + datalen > tidlen) ||
		    KDETH_GET(kval, TIDCTRL) != tidctrl ||
		    KDETH_GET(kval, TID) != tididx)
			return -EINVAL;
	}
	return 0;
}

/*
 * Correctly set the BTH.PSN field based on type of
 * transfer - eager packets can just increment the PSN but
 * expected packets encode generation and sequence in the
 * BTH.PSN field so just incrementing will result in errors.
 */
static inline u32 set_pkt_bth_psn(__be32 bthpsn, u8 expct, u32 frags)
{
	u32 val = be32_to_cpu(bthpsn),
	    mask = (HFI1_CAP_IS_KSET(EXTENDED_PSN) ? 0x7fffffffull :
		    0xffffffull),
	    psn = val & mask;
	if (expct)
		psn = (psn & ~HFI1_KDETH_BTH_SEQ_MASK) |
		      ((psn + frags) & HFI1_KDETH_BTH_SEQ_MASK);
	else
		psn = psn + frags;
	return psn & mask;
}

static int set_txreq_header(struct user_sdma_request *req,
			    struct user_sdma_txreq *tx, u32 datalen)
{
	struct hfi1_user_sdma_pkt_q *pq = req->pq;
	struct hfi1_pkt_header *hdr = &tx->hdr;
	u8 omfactor; /* KDETH.OM */
	u16 pbclen;
	int ret;
	u32 tidval = 0, lrhlen = get_lrh_len(*hdr, pad_len(datalen));

	/* Copy the request's header template into the tx header before modification */
	memcpy(hdr, &req->hdr, sizeof(*hdr));

	/*
	 * Check if the PBC and LRH length are mismatched. If so
	 * adjust both in the header.
	 */
	pbclen = le16_to_cpu(hdr->pbc[0]);
	if (PBC2LRH(pbclen) != lrhlen) {
		pbclen = (pbclen & 0xf000) | LRH2PBC(lrhlen);
		hdr->pbc[0] = cpu_to_le16(pbclen);
		hdr->lrh[2] = cpu_to_be16(lrhlen >> 2);
		/*
		 * Third packet
		 * This is the first packet in the sequence that has
		 * a "static" size that can be used for the rest of
		 * the packets (besides the last one).
		 */
		if (unlikely(req->seqnum == 2)) {
			/*
			 * From this point on the lengths in both the
			 * PBC and LRH are the same until the last
			 * packet.
			 * Adjust the template so we don't have to update
			 * every packet
			 */
			req->hdr.pbc[0] = hdr->pbc[0];
			req->hdr.lrh[2] = hdr->lrh[2];
		}
	}
	/*
	 * We only have to modify the header if this is not the
	 * first packet in the request. Otherwise, we use the
	 * header given to us.
	 */
	if (unlikely(!req->seqnum)) {
		ret = check_header_template(req, hdr, lrhlen, datalen);
		if (ret)
			return ret;
		goto done;
	}

	hdr->bth[2] = cpu_to_be32(
		set_pkt_bth_psn(hdr->bth[2],
				(req_opcode(req->info.ctrl) == EXPECTED),
				req->seqnum));

	/* Set ACK request on last packet */
	if (unlikely(tx->flags & TXREQ_FLAGS_REQ_ACK))
		hdr->bth[2] |= cpu_to_be32(1UL << 31);

	/* Set the new offset */
	hdr->kdeth.swdata[6] = cpu_to_le32(req->koffset);
	/* Expected packets have to fill in the new TID information */
	if (req_opcode(req->info.ctrl) == EXPECTED) {
		tidval = req->tids[req->tididx];
		/*
		 * If the offset puts us at the end of the current TID,
		 * advance everything.
		 */
		if ((req->tidoffset) == (EXP_TID_GET(tidval, LEN) *
					 PAGE_SIZE)) {
			req->tidoffset = 0;
			/*
			 * Since we don't copy all the TIDs all at once,
			 * we have to check again.
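			 * Running past n_tids or hitting a zero entry means
			 * user space supplied fewer TIDs than the request
			 * needs, so fail rather than reuse stale state.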
			 */
			if (++req->tididx > req->n_tids - 1 ||
			    !req->tids[req->tididx]) {
				return -EINVAL;
			}
			tidval = req->tids[req->tididx];
		}
		omfactor = EXP_TID_GET(tidval, LEN) * PAGE_SIZE >=
				KDETH_OM_MAX_SIZE ? KDETH_OM_LARGE_SHIFT :
				KDETH_OM_SMALL_SHIFT;
		/* Set KDETH.TIDCtrl based on value for this TID. */
		KDETH_SET(hdr->kdeth.ver_tid_offset, TIDCTRL,
			  EXP_TID_GET(tidval, CTRL));
		/* Set KDETH.TID based on value for this TID */
		KDETH_SET(hdr->kdeth.ver_tid_offset, TID,
			  EXP_TID_GET(tidval, IDX));
		/* Clear KDETH.SH when DISABLE_SH flag is set */
		if (unlikely(tx->flags & TXREQ_FLAGS_REQ_DISABLE_SH))
			KDETH_SET(hdr->kdeth.ver_tid_offset, SH, 0);
		/*
		 * Set the KDETH.OFFSET and KDETH.OM based on size of
		 * transfer.
		 */
		trace_hfi1_sdma_user_tid_info(
			pq->dd, pq->ctxt, pq->subctxt, req->info.comp_idx,
			req->tidoffset, req->tidoffset >> omfactor,
			omfactor != KDETH_OM_SMALL_SHIFT);
		KDETH_SET(hdr->kdeth.ver_tid_offset, OFFSET,
			  req->tidoffset >> omfactor);
		KDETH_SET(hdr->kdeth.ver_tid_offset, OM,
			  omfactor != KDETH_OM_SMALL_SHIFT);
	}
done:
	trace_hfi1_sdma_user_header(pq->dd, pq->ctxt, pq->subctxt,
				    req->info.comp_idx, hdr, tidval);
	return sdma_txadd_kvaddr(pq->dd, &tx->txreq, hdr, sizeof(*hdr));
}

static int set_txreq_header_ahg(struct user_sdma_request *req,
				struct user_sdma_txreq *tx, u32 datalen)
{
	u32 ahg[AHG_KDETH_ARRAY_SIZE];
	int idx = 0;
	u8 omfactor; /* KDETH.OM */
	struct hfi1_user_sdma_pkt_q *pq = req->pq;
	struct hfi1_pkt_header *hdr = &req->hdr;
	u16 pbclen = le16_to_cpu(hdr->pbc[0]);
	u32 val32, tidval = 0, lrhlen = get_lrh_len(*hdr, pad_len(datalen));
	size_t array_size = ARRAY_SIZE(ahg);

	if (PBC2LRH(pbclen) != lrhlen) {
		/* PBC.PbcLengthDWs */
		idx = ahg_header_set(ahg, idx, array_size, 0, 0, 12,
				     (__force u16)cpu_to_le16(LRH2PBC(lrhlen)));
		if (idx < 0)
			return idx;
		/* LRH.PktLen (we need the full 16 bits due to byte swap) */
		idx = ahg_header_set(ahg, idx, array_size, 3, 0, 16,
				     (__force u16)cpu_to_be16(lrhlen >> 2));
		if (idx < 0)
			return idx;
	}

	/*
	 * Do the common updates
	 */
	/* BTH.PSN and BTH.A */
	val32 = (be32_to_cpu(hdr->bth[2]) + req->seqnum) &
		(HFI1_CAP_IS_KSET(EXTENDED_PSN) ? 0x7fffffff : 0xffffff);
	if (unlikely(tx->flags & TXREQ_FLAGS_REQ_ACK))
		val32 |= 1UL << 31;
	idx = ahg_header_set(ahg, idx, array_size, 6, 0, 16,
			     (__force u16)cpu_to_be16(val32 >> 16));
	if (idx < 0)
		return idx;
	idx = ahg_header_set(ahg, idx, array_size, 6, 16, 16,
			     (__force u16)cpu_to_be16(val32 & 0xffff));
	if (idx < 0)
		return idx;
	/* KDETH.Offset */
	idx = ahg_header_set(ahg, idx, array_size, 15, 0, 16,
			     (__force u16)cpu_to_le16(req->koffset & 0xffff));
	if (idx < 0)
		return idx;
	idx = ahg_header_set(ahg, idx, array_size, 15, 16, 16,
			     (__force u16)cpu_to_le16(req->koffset >> 16));
	if (idx < 0)
		return idx;
	if (req_opcode(req->info.ctrl) == EXPECTED) {
		__le16 val;

		tidval = req->tids[req->tididx];

		/*
		 * If the offset puts us at the end of the current TID,
		 * advance everything.
		 */
		if ((req->tidoffset) == (EXP_TID_GET(tidval, LEN) *
					 PAGE_SIZE)) {
			req->tidoffset = 0;
			/*
			 * Since we don't copy all the TIDs all at once,
			 * we have to check again.
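			 * Same rule as in set_txreq_header(): running out of
			 * TID entries mid-request is a user error.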
			 */
			if (++req->tididx > req->n_tids - 1 ||
			    !req->tids[req->tididx])
				return -EINVAL;
			tidval = req->tids[req->tididx];
		}
		omfactor = ((EXP_TID_GET(tidval, LEN) *
			     PAGE_SIZE) >=
			    KDETH_OM_MAX_SIZE) ? KDETH_OM_LARGE_SHIFT :
			    KDETH_OM_SMALL_SHIFT;
		/* KDETH.OM and KDETH.OFFSET (TID) */
		idx = ahg_header_set(
			ahg, idx, array_size, 7, 0, 16,
			((!!(omfactor - KDETH_OM_SMALL_SHIFT)) << 15 |
			 ((req->tidoffset >> omfactor)
			  & 0x7fff)));
		if (idx < 0)
			return idx;
		/* KDETH.TIDCtrl, KDETH.TID, KDETH.Intr, KDETH.SH */
		val = cpu_to_le16(((EXP_TID_GET(tidval, CTRL) & 0x3) << 10) |
				  (EXP_TID_GET(tidval, IDX) & 0x3ff));

		if (unlikely(tx->flags & TXREQ_FLAGS_REQ_DISABLE_SH)) {
			val |= cpu_to_le16((KDETH_GET(hdr->kdeth.ver_tid_offset,
						      INTR) <<
					    AHG_KDETH_INTR_SHIFT));
		} else {
			val |= KDETH_GET(hdr->kdeth.ver_tid_offset, SH) ?
			       cpu_to_le16(0x1 << AHG_KDETH_SH_SHIFT) :
			       cpu_to_le16((KDETH_GET(hdr->kdeth.ver_tid_offset,
						      INTR) <<
					    AHG_KDETH_INTR_SHIFT));
		}

		idx = ahg_header_set(ahg, idx, array_size,
				     7, 16, 14, (__force u16)val);
		if (idx < 0)
			return idx;
	}

	trace_hfi1_sdma_user_header_ahg(pq->dd, pq->ctxt, pq->subctxt,
					req->info.comp_idx, req->sde->this_idx,
					req->ahg_idx, ahg, idx, tidval);
	sdma_txinit_ahg(&tx->txreq,
			SDMA_TXREQ_F_USE_AHG,
			datalen, req->ahg_idx, idx,
			ahg, sizeof(req->hdr),
			user_sdma_txreq_cb);

	return idx;
}

/**
 * user_sdma_txreq_cb() - SDMA tx request completion callback.
 * @txreq: valid sdma tx request
 * @status: success/failure of request
 *
 * Called when the SDMA progress state machine gets notification that
 * the SDMA descriptors for this tx request have been processed by the
 * DMA engine. Called in interrupt context.
 * Only do work on completed sequences.
 */
static void user_sdma_txreq_cb(struct sdma_txreq *txreq, int status)
{
	struct user_sdma_txreq *tx =
		container_of(txreq, struct user_sdma_txreq, txreq);
	struct user_sdma_request *req;
	struct hfi1_user_sdma_pkt_q *pq;
	struct hfi1_user_sdma_comp_q *cq;
	enum hfi1_sdma_comp_state state = COMPLETE;

	if (!tx->req)
		return;

	req = tx->req;
	pq = req->pq;
	cq = req->cq;

	if (status != SDMA_TXREQ_S_OK) {
		SDMA_DBG(req, "SDMA completion with error %d",
			 status);
		WRITE_ONCE(req->has_error, 1);
		state = ERROR;
	}

	req->seqcomp = tx->seqnum;
	kmem_cache_free(pq->txreq_cache, tx);

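	/*
	 * seqcomp above is read by the submission path's error cleanup
	 * (wait_event() on seqcomp == seqsubmitted - 1), so it is updated
	 * even when this is not the final packet.
	 */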
	/* If the sequence isn't complete yet, we are done here. */
	if (req->seqcomp != req->info.npkts - 1)
		return;

	user_sdma_free_request(req);
	set_comp_state(pq, cq, req->info.comp_idx, state, status);
	pq_update(pq);
}

static inline void pq_update(struct hfi1_user_sdma_pkt_q *pq)
{
	if (atomic_dec_and_test(&pq->n_reqs))
		wake_up(&pq->wait);
}

static void user_sdma_free_request(struct user_sdma_request *req)
{
	if (!list_empty(&req->txps)) {
		struct sdma_txreq *t, *p;

		list_for_each_entry_safe(t, p, &req->txps, list) {
			struct user_sdma_txreq *tx =
				container_of(t, struct user_sdma_txreq, txreq);
			list_del_init(&t->list);
			sdma_txclean(req->pq->dd, t);
			kmem_cache_free(req->pq->txreq_cache, tx);
		}
	}

	kfree(req->tids);
	clear_bit(req->info.comp_idx, req->pq->req_in_use);
}

static inline void set_comp_state(struct hfi1_user_sdma_pkt_q *pq,
				  struct hfi1_user_sdma_comp_q *cq,
				  u16 idx, enum hfi1_sdma_comp_state state,
				  int ret)
{
	if (state == ERROR)
		cq->comps[idx].errcode = -ret;
	smp_wmb(); /* make sure errcode is visible first */
	cq->comps[idx].status = state;
	trace_hfi1_sdma_user_completion(pq->dd, pq->ctxt, pq->subctxt,
					idx, state, ret);
}