// SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
/*
 * Copyright(c) 2020 - 2023 Cornelis Networks, Inc.
 * Copyright(c) 2015 - 2018 Intel Corporation.
 */

#include <linux/mm.h>
#include <linux/types.h>
#include <linux/device.h>
#include <linux/dmapool.h>
#include <linux/slab.h>
#include <linux/list.h>
#include <linux/highmem.h>
#include <linux/io.h>
#include <linux/uio.h>
#include <linux/rbtree.h>
#include <linux/spinlock.h>
#include <linux/delay.h>
#include <linux/kthread.h>
#include <linux/mmu_context.h>
#include <linux/module.h>
#include <linux/vmalloc.h>
#include <linux/string.h>

#include "hfi.h"
#include "sdma.h"
#include "user_sdma.h"
#include "verbs.h"  /* for the headers */
#include "common.h" /* for struct hfi1_tid_info */
#include "trace.h"

static uint hfi1_sdma_comp_ring_size = 128;
module_param_named(sdma_comp_size, hfi1_sdma_comp_ring_size, uint, S_IRUGO);
MODULE_PARM_DESC(sdma_comp_size, "Size of User SDMA completion ring. Default: 128");

static unsigned initial_pkt_count = 8;

static int user_sdma_send_pkts(struct user_sdma_request *req, u16 maxpkts);
static void user_sdma_txreq_cb(struct sdma_txreq *txreq, int status);
static inline void pq_update(struct hfi1_user_sdma_pkt_q *pq);
static void user_sdma_free_request(struct user_sdma_request *req);
static int check_header_template(struct user_sdma_request *req,
                                 struct hfi1_pkt_header *hdr, u32 lrhlen,
                                 u32 datalen);
static int set_txreq_header(struct user_sdma_request *req,
                            struct user_sdma_txreq *tx, u32 datalen);
static int set_txreq_header_ahg(struct user_sdma_request *req,
                                struct user_sdma_txreq *tx, u32 len);
static inline void set_comp_state(struct hfi1_user_sdma_pkt_q *pq,
                                  struct hfi1_user_sdma_comp_q *cq,
                                  u16 idx, enum hfi1_sdma_comp_state state,
                                  int ret);
static inline u32 set_pkt_bth_psn(__be32 bthpsn, u8 expct, u32 frags);
static inline u32 get_lrh_len(struct hfi1_pkt_header, u32 len);

static int defer_packet_queue(
        struct sdma_engine *sde,
        struct iowait_work *wait,
        struct sdma_txreq *txreq,
        uint seq,
        bool pkts_sent);
static void activate_packet_queue(struct iowait *wait, int reason);

static int defer_packet_queue(
        struct sdma_engine *sde,
        struct iowait_work *wait,
        struct sdma_txreq *txreq,
        uint seq,
        bool pkts_sent)
{
        struct hfi1_user_sdma_pkt_q *pq =
                container_of(wait->iow, struct hfi1_user_sdma_pkt_q, busy);

        write_seqlock(&sde->waitlock);
        trace_hfi1_usdma_defer(pq, sde, &pq->busy);
        if (sdma_progress(sde, seq, txreq))
                goto eagain;
        /*
         * We are assuming that if the list is enqueued somewhere, it
         * is to the dmawait list since that is the only place where
         * it is supposed to be enqueued.
         */
        xchg(&pq->state, SDMA_PKT_Q_DEFERRED);
        if (list_empty(&pq->busy.list)) {
                pq->busy.lock = &sde->waitlock;
                iowait_get_priority(&pq->busy);
                iowait_queue(pkts_sent, &pq->busy, &sde->dmawait);
        }
        write_sequnlock(&sde->waitlock);
        return -EBUSY;
eagain:
        write_sequnlock(&sde->waitlock);
        return -EAGAIN;
}
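/*
 * Editor's note on the two return values above: -EAGAIN means the engine
 * made progress while we held the waitlock, so the caller should simply
 * retry the descriptor; -EBUSY means the packet queue has been parked on
 * the engine's dmawait list.  Since activate_packet_queue() below is
 * registered as the iowait wakeup callback in iowait_init(), the iowait
 * machinery is expected to invoke it once descriptors drain, marking the
 * queue active again and waking any thread blocked in
 * hfi1_user_sdma_process_request().
 */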
static void activate_packet_queue(struct iowait *wait, int reason)
{
        struct hfi1_user_sdma_pkt_q *pq =
                container_of(wait, struct hfi1_user_sdma_pkt_q, busy);

        trace_hfi1_usdma_activate(pq, wait, reason);
        xchg(&pq->state, SDMA_PKT_Q_ACTIVE);
        wake_up(&wait->wait_dma);
}

int hfi1_user_sdma_alloc_queues(struct hfi1_ctxtdata *uctxt,
                                struct hfi1_filedata *fd)
{
        int ret = -ENOMEM;
        char buf[64];
        struct hfi1_devdata *dd;
        struct hfi1_user_sdma_comp_q *cq;
        struct hfi1_user_sdma_pkt_q *pq;

        if (!uctxt || !fd)
                return -EBADF;

        if (!hfi1_sdma_comp_ring_size)
                return -EINVAL;

        dd = uctxt->dd;

        pq = kzalloc(sizeof(*pq), GFP_KERNEL);
        if (!pq)
                return -ENOMEM;
        pq->dd = dd;
        pq->ctxt = uctxt->ctxt;
        pq->subctxt = fd->subctxt;
        pq->n_max_reqs = hfi1_sdma_comp_ring_size;
        atomic_set(&pq->n_reqs, 0);
        init_waitqueue_head(&pq->wait);
        atomic_set(&pq->n_locked, 0);

        iowait_init(&pq->busy, 0, NULL, NULL, defer_packet_queue,
                    activate_packet_queue, NULL, NULL);
        pq->reqidx = 0;

        pq->reqs = kcalloc(hfi1_sdma_comp_ring_size, sizeof(*pq->reqs),
                           GFP_KERNEL);
        if (!pq->reqs)
                goto pq_reqs_nomem;

        pq->req_in_use = bitmap_zalloc(hfi1_sdma_comp_ring_size, GFP_KERNEL);
        if (!pq->req_in_use)
                goto pq_reqs_no_in_use;

        snprintf(buf, 64, "txreq-kmem-cache-%u-%u-%u", dd->unit, uctxt->ctxt,
                 fd->subctxt);
        pq->txreq_cache = kmem_cache_create(buf,
                                            sizeof(struct user_sdma_txreq),
                                            L1_CACHE_BYTES,
                                            SLAB_HWCACHE_ALIGN,
                                            NULL);
        if (!pq->txreq_cache) {
                dd_dev_err(dd, "[%u] Failed to allocate TxReq cache\n",
                           uctxt->ctxt);
                goto pq_txreq_nomem;
        }

        cq = kzalloc(sizeof(*cq), GFP_KERNEL);
        if (!cq)
                goto cq_nomem;

        cq->comps = vmalloc_user(PAGE_ALIGN(sizeof(*cq->comps)
                                 * hfi1_sdma_comp_ring_size));
        if (!cq->comps)
                goto cq_comps_nomem;

        cq->nentries = hfi1_sdma_comp_ring_size;

        ret = hfi1_init_system_pinning(pq);
        if (ret)
                goto pq_mmu_fail;

        rcu_assign_pointer(fd->pq, pq);
        fd->cq = cq;

        return 0;

pq_mmu_fail:
        vfree(cq->comps);
cq_comps_nomem:
        kfree(cq);
cq_nomem:
        kmem_cache_destroy(pq->txreq_cache);
pq_txreq_nomem:
        bitmap_free(pq->req_in_use);
pq_reqs_no_in_use:
        kfree(pq->reqs);
pq_reqs_nomem:
        kfree(pq);

        return ret;
}

static void flush_pq_iowait(struct hfi1_user_sdma_pkt_q *pq)
{
        unsigned long flags;
        seqlock_t *lock = pq->busy.lock;

        if (!lock)
                return;
        write_seqlock_irqsave(lock, flags);
        if (!list_empty(&pq->busy.list)) {
                list_del_init(&pq->busy.list);
                pq->busy.lock = NULL;
        }
        write_sequnlock_irqrestore(lock, flags);
}

int hfi1_user_sdma_free_queues(struct hfi1_filedata *fd,
                               struct hfi1_ctxtdata *uctxt)
{
        struct hfi1_user_sdma_pkt_q *pq;

        trace_hfi1_sdma_user_free_queues(uctxt->dd, uctxt->ctxt, fd->subctxt);

        spin_lock(&fd->pq_rcu_lock);
        pq = srcu_dereference_check(fd->pq, &fd->pq_srcu,
                                    lockdep_is_held(&fd->pq_rcu_lock));
        if (pq) {
                rcu_assign_pointer(fd->pq, NULL);
                spin_unlock(&fd->pq_rcu_lock);
                synchronize_srcu(&fd->pq_srcu);
                /* at this point there can be no more new requests */
                iowait_sdma_drain(&pq->busy);
                /* Wait until all requests have been freed. */
                wait_event_interruptible(
                        pq->wait,
                        !atomic_read(&pq->n_reqs));
                kfree(pq->reqs);
                hfi1_free_system_pinning(pq);
                bitmap_free(pq->req_in_use);
                kmem_cache_destroy(pq->txreq_cache);
                flush_pq_iowait(pq);
                kfree(pq);
        } else {
                spin_unlock(&fd->pq_rcu_lock);
        }
        if (fd->cq) {
                vfree(fd->cq->comps);
                kfree(fd->cq);
                fd->cq = NULL;
        }
        return 0;
}
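/*
 * Editor's note on the helper below: it maps a 16-bit DLID to a small
 * selector so that traffic to the same destination tends to be steered
 * to the same SDMA engine.  The table is filled lazily: the first DLID
 * to hit a given 8-bit hash bucket claims the next selector value
 * (wrapping at 128), and later DLIDs with the same hash share it.
 *
 * Illustrative example: dlid 0x1234 hashes to (0x12 ^ 0x34) = 0x26; the
 * first request for that bucket stores the next free selector in
 * mapping[0x26], and every subsequent DLID with the same hash reuses it.
 * The state is static, so the mapping is global, not per-context.
 */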
static u8 dlid_to_selector(u16 dlid)
{
        static u8 mapping[256];
        static int initialized;
        static u8 next;
        int hash;

        if (!initialized) {
                memset(mapping, 0xFF, 256);
                initialized = 1;
        }

        hash = ((dlid >> 8) ^ dlid) & 0xFF;
        if (mapping[hash] == 0xFF) {
                mapping[hash] = next;
                next = (next + 1) & 0x7F;
        }

        return mapping[hash];
}

/**
 * hfi1_user_sdma_process_request() - Process and start a user sdma request
 * @fd: valid file descriptor
 * @iovec: array of io vectors to process
 * @dim: overall iovec array size
 * @count: number of io vector array entries processed
 */
int hfi1_user_sdma_process_request(struct hfi1_filedata *fd,
                                   struct iovec *iovec, unsigned long dim,
                                   unsigned long *count)
{
        int ret = 0, i;
        struct hfi1_ctxtdata *uctxt = fd->uctxt;
        struct hfi1_user_sdma_pkt_q *pq =
                srcu_dereference(fd->pq, &fd->pq_srcu);
        struct hfi1_user_sdma_comp_q *cq = fd->cq;
        struct hfi1_devdata *dd = pq->dd;
        unsigned long idx = 0;
        u8 pcount = initial_pkt_count;
        struct sdma_req_info info;
        struct user_sdma_request *req;
        u8 opcode, sc, vl;
        u16 pkey;
        u32 slid;
        u16 dlid;
        u32 selector;

        if (iovec[idx].iov_len < sizeof(info) + sizeof(req->hdr)) {
                hfi1_cdbg(
                        SDMA,
                        "[%u:%u:%u] First vector not big enough for header %lu/%lu",
                        dd->unit, uctxt->ctxt, fd->subctxt,
                        iovec[idx].iov_len, sizeof(info) + sizeof(req->hdr));
                return -EINVAL;
        }
        ret = copy_from_user(&info, iovec[idx].iov_base, sizeof(info));
        if (ret) {
                hfi1_cdbg(SDMA, "[%u:%u:%u] Failed to copy info QW (%d)",
                          dd->unit, uctxt->ctxt, fd->subctxt, ret);
                return -EFAULT;
        }

        trace_hfi1_sdma_user_reqinfo(dd, uctxt->ctxt, fd->subctxt,
                                     (u16 *)&info);
        if (info.comp_idx >= hfi1_sdma_comp_ring_size) {
                hfi1_cdbg(SDMA,
                          "[%u:%u:%u:%u] Invalid comp index",
                          dd->unit, uctxt->ctxt, fd->subctxt, info.comp_idx);
                return -EINVAL;
        }

        /*
         * Sanity check the header io vector count.  Need at least 1 vector
         * (header) and cannot be larger than the actual io vector count.
         */
        if (req_iovcnt(info.ctrl) < 1 || req_iovcnt(info.ctrl) > dim) {
                hfi1_cdbg(SDMA,
                          "[%u:%u:%u:%u] Invalid iov count %d, dim %ld",
                          dd->unit, uctxt->ctxt, fd->subctxt, info.comp_idx,
                          req_iovcnt(info.ctrl), dim);
                return -EINVAL;
        }

        if (!info.fragsize) {
                hfi1_cdbg(SDMA,
                          "[%u:%u:%u:%u] Request does not specify fragsize",
                          dd->unit, uctxt->ctxt, fd->subctxt, info.comp_idx);
                return -EINVAL;
        }

        /* Try to claim the request. */
        if (test_and_set_bit(info.comp_idx, pq->req_in_use)) {
                hfi1_cdbg(SDMA, "[%u:%u:%u] Entry %u is in use",
                          dd->unit, uctxt->ctxt, fd->subctxt,
                          info.comp_idx);
                return -EBADSLT;
        }
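        /*
         * Editor's note: comp_idx doubles as the request slot.  Claiming
         * bit comp_idx in pq->req_in_use reserves both pq->reqs[comp_idx]
         * below and the matching completion-ring entry that user space
         * polls for this request.
         */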
        /*
         * All safety checks have been done and this request has been claimed.
         */
        trace_hfi1_sdma_user_process_request(dd, uctxt->ctxt, fd->subctxt,
                                             info.comp_idx);
        req = pq->reqs + info.comp_idx;
        req->data_iovs = req_iovcnt(info.ctrl) - 1; /* subtract header vector */
        req->data_len = 0;
        req->pq = pq;
        req->cq = cq;
        req->ahg_idx = -1;
        req->iov_idx = 0;
        req->sent = 0;
        req->seqnum = 0;
        req->seqcomp = 0;
        req->seqsubmitted = 0;
        req->tids = NULL;
        req->has_error = 0;
        INIT_LIST_HEAD(&req->txps);

        memcpy(&req->info, &info, sizeof(info));

        /* The request is initialized, count it */
        atomic_inc(&pq->n_reqs);

        if (req_opcode(info.ctrl) == EXPECTED) {
                /*
                 * An expected request must have TID info and at least one
                 * data vector.
                 */
                if (req->data_iovs < 2) {
                        SDMA_DBG(req,
                                 "Not enough vectors for expected request");
                        ret = -EINVAL;
                        goto free_req;
                }
                req->data_iovs--;
        }

        if (!info.npkts || req->data_iovs > MAX_VECTORS_PER_REQ) {
                SDMA_DBG(req, "Too many vectors (%u/%u)", req->data_iovs,
                         MAX_VECTORS_PER_REQ);
                ret = -EINVAL;
                goto free_req;
        }

        /* Copy the header from the user buffer */
        ret = copy_from_user(&req->hdr, iovec[idx].iov_base + sizeof(info),
                             sizeof(req->hdr));
        if (ret) {
                SDMA_DBG(req, "Failed to copy header template (%d)", ret);
                ret = -EFAULT;
                goto free_req;
        }

        /* If Static rate control is not enabled, sanitize the header. */
        if (!HFI1_CAP_IS_USET(STATIC_RATE_CTRL))
                req->hdr.pbc[2] = 0;

        /* Validate the opcode. Do not trust packets from user space blindly. */
        opcode = (be32_to_cpu(req->hdr.bth[0]) >> 24) & 0xff;
        if ((opcode & USER_OPCODE_CHECK_MASK) !=
            USER_OPCODE_CHECK_VAL) {
                SDMA_DBG(req, "Invalid opcode (%d)", opcode);
                ret = -EINVAL;
                goto free_req;
        }
        /*
         * Validate the vl. Do not trust packets from user space blindly.
         * VL comes from PBC, SC comes from LRH, and the VL needs to
         * match the SC look up.
         */
        vl = (le16_to_cpu(req->hdr.pbc[0]) >> 12) & 0xF;
        sc = (((be16_to_cpu(req->hdr.lrh[0]) >> 12) & 0xF) |
              (((le16_to_cpu(req->hdr.pbc[1]) >> 14) & 0x1) << 4));
        if (vl >= dd->pport->vls_operational ||
            vl != sc_to_vlt(dd, sc)) {
                SDMA_DBG(req, "Invalid SC(%u)/VL(%u)", sc, vl);
                ret = -EINVAL;
                goto free_req;
        }

        /* Checking P_KEY for requests from user-space */
        pkey = (u16)be32_to_cpu(req->hdr.bth[0]);
        slid = be16_to_cpu(req->hdr.lrh[3]);
        if (egress_pkey_check(dd->pport, slid, pkey, sc, PKEY_CHECK_INVALID)) {
                ret = -EINVAL;
                goto free_req;
        }

        /*
         * Also should check the BTH.lnh. If it says the next header is GRH then
         * the RXE parsing will be off and will land in the middle of the KDETH
         * or miss it entirely.
         */
        if ((be16_to_cpu(req->hdr.lrh[0]) & 0x3) == HFI1_LRH_GRH) {
                SDMA_DBG(req, "User tried to pass in a GRH");
                ret = -EINVAL;
                goto free_req;
        }

        req->koffset = le32_to_cpu(req->hdr.kdeth.swdata[6]);
        /*
         * Calculate the initial TID offset based on the values of
         * KDETH.OFFSET and KDETH.OM that are passed in.
         */
        req->tidoffset = KDETH_GET(req->hdr.kdeth.ver_tid_offset, OFFSET) *
                         (KDETH_GET(req->hdr.kdeth.ver_tid_offset, OM) ?
                          KDETH_OM_LARGE : KDETH_OM_SMALL);
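        /*
         * Illustrative example (constants assumed from the hfi1 KDETH
         * definitions, where KDETH_OM_LARGE is 64 bytes and KDETH_OM_SMALL
         * is 4 bytes): with KDETH.OFFSET = 3 and KDETH.OM = 1 the offset
         * unit is 64, so tidoffset = 3 * 64 = 192 bytes; with OM = 0 the
         * same OFFSET would yield 3 * 4 = 12 bytes.
         */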
        trace_hfi1_sdma_user_initial_tidoffset(dd, uctxt->ctxt, fd->subctxt,
                                               info.comp_idx, req->tidoffset);
        idx++;

        /* Save all the IO vector structures */
        for (i = 0; i < req->data_iovs; i++) {
                req->iovs[i].offset = 0;
                INIT_LIST_HEAD(&req->iovs[i].list);
                memcpy(&req->iovs[i].iov,
                       iovec + idx++,
                       sizeof(req->iovs[i].iov));
                if (req->iovs[i].iov.iov_len == 0) {
                        ret = -EINVAL;
                        goto free_req;
                }
                req->data_len += req->iovs[i].iov.iov_len;
        }
        trace_hfi1_sdma_user_data_length(dd, uctxt->ctxt, fd->subctxt,
                                         info.comp_idx, req->data_len);
        if (pcount > req->info.npkts)
                pcount = req->info.npkts;
        /*
         * Copy any TID info
         * User space will provide the TID info only when the
         * request type is EXPECTED. This is true even if there is
         * only one packet in the request and the header is already
         * setup. The reason for the singular TID case is that the
         * driver needs to perform safety checks.
         */
        if (req_opcode(req->info.ctrl) == EXPECTED) {
                u16 ntids = iovec[idx].iov_len / sizeof(*req->tids);
                u32 *tmp;

                if (!ntids || ntids > MAX_TID_PAIR_ENTRIES) {
                        ret = -EINVAL;
                        goto free_req;
                }

                /*
                 * We have to copy all of the tids because they may vary
                 * in size and, therefore, the TID count might not be
                 * equal to the pkt count. However, there is no way to
                 * tell at this point.
                 */
                tmp = memdup_array_user(iovec[idx].iov_base,
                                        ntids, sizeof(*req->tids));
                if (IS_ERR(tmp)) {
                        ret = PTR_ERR(tmp);
                        SDMA_DBG(req, "Failed to copy %d TIDs (%pe)", ntids,
                                 tmp);
                        goto free_req;
                }
                req->tids = tmp;
                req->n_tids = ntids;
                req->tididx = 0;
                idx++;
        }

        dlid = be16_to_cpu(req->hdr.lrh[1]);
        selector = dlid_to_selector(dlid);
        selector += uctxt->ctxt + fd->subctxt;
        req->sde = sdma_select_user_engine(dd, selector, vl);

        if (!req->sde || !sdma_running(req->sde)) {
                ret = -ECOMM;
                goto free_req;
        }

        /* We don't need an AHG entry if the request contains only one packet */
        if (req->info.npkts > 1 && HFI1_CAP_IS_USET(SDMA_AHG))
                req->ahg_idx = sdma_ahg_alloc(req->sde);

        set_comp_state(pq, cq, info.comp_idx, QUEUED, 0);
        pq->state = SDMA_PKT_Q_ACTIVE;

        /*
         * This is a somewhat blocking send implementation.
         * The driver will block the caller until all packets of the
         * request have been submitted to the SDMA engine. However, it
         * will not wait for send completions.
         */
        while (req->seqsubmitted != req->info.npkts) {
                ret = user_sdma_send_pkts(req, pcount);
                if (ret < 0) {
                        int we_ret;

                        if (ret != -EBUSY)
                                goto free_req;
                        we_ret = wait_event_interruptible_timeout(
                                pq->busy.wait_dma,
                                pq->state == SDMA_PKT_Q_ACTIVE,
                                msecs_to_jiffies(
                                        SDMA_IOWAIT_TIMEOUT));
                        trace_hfi1_usdma_we(pq, we_ret);
                        if (we_ret <= 0)
                                flush_pq_iowait(pq);
                }
        }
        *count += idx;
        return 0;
free_req:
        /*
         * If seqsubmitted == npkts, the completion routine controls the
         * final state. If seqsubmitted < npkts, wait for any outstanding
         * packets to finish before cleaning up.
         */
        if (req->seqsubmitted < req->info.npkts) {
                if (req->seqsubmitted)
                        wait_event(pq->busy.wait_dma,
                                   (req->seqcomp == req->seqsubmitted - 1));
                user_sdma_free_request(req);
                pq_update(pq);
                set_comp_state(pq, cq, info.comp_idx, ERROR, ret);
        }
        return ret;
}
static inline u32 compute_data_length(struct user_sdma_request *req,
                                      struct user_sdma_txreq *tx)
{
        /*
         * Determine the proper size of the packet data.
         * The size of the data of the first packet is in the header
         * template. However, it includes the header and ICRC, which need
         * to be subtracted.
         * The minimum representable packet data length in a header is 4
         * bytes; therefore, when the requested data length is less than 4
         * bytes, there is only one packet, and its data length equals the
         * request data length.
         * The size of the remaining packets is the minimum of the frag
         * size (MTU) or remaining data in the request.
         */
        u32 len;

        if (!req->seqnum) {
                if (req->data_len < sizeof(u32))
                        len = req->data_len;
                else
                        len = ((be16_to_cpu(req->hdr.lrh[2]) << 2) -
                               (sizeof(tx->hdr) - 4));
        } else if (req_opcode(req->info.ctrl) == EXPECTED) {
                u32 tidlen = EXP_TID_GET(req->tids[req->tididx], LEN) *
                             PAGE_SIZE;
                /*
                 * Get the data length based on the remaining space in the
                 * TID pair.
                 */
                len = min(tidlen - req->tidoffset, (u32)req->info.fragsize);
                /* If we've filled up the TID pair, move to the next one. */
                if (unlikely(!len) && ++req->tididx < req->n_tids &&
                    req->tids[req->tididx]) {
                        tidlen = EXP_TID_GET(req->tids[req->tididx],
                                             LEN) * PAGE_SIZE;
                        req->tidoffset = 0;
                        len = min_t(u32, tidlen, req->info.fragsize);
                }
                /*
                 * Since the TID pairs map entire pages, make sure that we
                 * are not going to try to send more data than we have
                 * remaining.
                 */
                len = min(len, req->data_len - req->sent);
        } else {
                len = min(req->data_len - req->sent, (u32)req->info.fragsize);
        }
        trace_hfi1_sdma_user_compute_length(req->pq->dd,
                                            req->pq->ctxt,
                                            req->pq->subctxt,
                                            req->info.comp_idx,
                                            len);
        return len;
}

static inline u32 pad_len(u32 len)
{
        if (len & (sizeof(u32) - 1))
                len += sizeof(u32) - (len & (sizeof(u32) - 1));
        return len;
}

static inline u32 get_lrh_len(struct hfi1_pkt_header hdr, u32 len)
{
        /* (Size of complete header - size of PBC) + 4B ICRC + data length */
        return ((sizeof(hdr) - sizeof(hdr.pbc)) + 4 + len);
}
static int user_sdma_txadd_ahg(struct user_sdma_request *req,
                               struct user_sdma_txreq *tx,
                               u32 datalen)
{
        int ret;
        u16 pbclen = le16_to_cpu(req->hdr.pbc[0]);
        u32 lrhlen = get_lrh_len(req->hdr, pad_len(datalen));
        struct hfi1_user_sdma_pkt_q *pq = req->pq;

        /*
         * Copy the request header into the tx header
         * because the HW needs a cacheline-aligned
         * address.
         * This copy can be optimized out if the hdr
         * member of user_sdma_request were also
         * cacheline aligned.
         */
        memcpy(&tx->hdr, &req->hdr, sizeof(tx->hdr));
        if (PBC2LRH(pbclen) != lrhlen) {
                pbclen = (pbclen & 0xf000) | LRH2PBC(lrhlen);
                tx->hdr.pbc[0] = cpu_to_le16(pbclen);
        }
        ret = check_header_template(req, &tx->hdr, lrhlen, datalen);
        if (ret)
                return ret;
        ret = sdma_txinit_ahg(&tx->txreq, SDMA_TXREQ_F_AHG_COPY,
                              sizeof(tx->hdr) + datalen, req->ahg_idx,
                              0, NULL, 0, user_sdma_txreq_cb);
        if (ret)
                return ret;
        ret = sdma_txadd_kvaddr(pq->dd, &tx->txreq, &tx->hdr, sizeof(tx->hdr));
        if (ret)
                sdma_txclean(pq->dd, &tx->txreq);
        return ret;
}

static int user_sdma_send_pkts(struct user_sdma_request *req, u16 maxpkts)
{
        int ret = 0;
        u16 count;
        unsigned npkts = 0;
        struct user_sdma_txreq *tx = NULL;
        struct hfi1_user_sdma_pkt_q *pq = NULL;
        struct user_sdma_iovec *iovec = NULL;

        if (!req->pq)
                return -EINVAL;

        pq = req->pq;

        /* If tx completion has reported an error, we are done. */
        if (READ_ONCE(req->has_error))
                return -EFAULT;

        /*
         * Check if we might have sent the entire request already
         */
        if (unlikely(req->seqnum == req->info.npkts)) {
                if (!list_empty(&req->txps))
                        goto dosend;
                return ret;
        }

        if (!maxpkts || maxpkts > req->info.npkts - req->seqnum)
                maxpkts = req->info.npkts - req->seqnum;

        while (npkts < maxpkts) {
                u32 datalen = 0;

                /*
                 * Check whether any of the completions have come back
                 * with errors. If so, we are not going to process any
                 * more packets from this request.
                 */
                if (READ_ONCE(req->has_error))
                        return -EFAULT;

                tx = kmem_cache_alloc(pq->txreq_cache, GFP_KERNEL);
                if (!tx)
                        return -ENOMEM;

                tx->flags = 0;
                tx->req = req;
                INIT_LIST_HEAD(&tx->list);

                /*
                 * For the last packet set the ACK request
                 * and disable header suppression.
                 */
                if (req->seqnum == req->info.npkts - 1)
                        tx->flags |= (TXREQ_FLAGS_REQ_ACK |
                                      TXREQ_FLAGS_REQ_DISABLE_SH);

                /*
                 * Calculate the payload size - this is min of the fragment
                 * (MTU) size or the remaining bytes in the request but only
                 * if we have payload data.
                 */
                if (req->data_len) {
                        iovec = &req->iovs[req->iov_idx];
                        if (READ_ONCE(iovec->offset) == iovec->iov.iov_len) {
                                if (++req->iov_idx == req->data_iovs) {
                                        ret = -EFAULT;
                                        goto free_tx;
                                }
                                iovec = &req->iovs[req->iov_idx];
                                WARN_ON(iovec->offset);
                        }

                        datalen = compute_data_length(req, tx);

                        /*
                         * Disable header suppression for a payload of 8 DWs
                         * (32 bytes) or less.  If there is an uncorrectable
                         * error in the receive data FIFO when the received
                         * payload size is less than or equal to 8 DWs, then
                         * RxDmaDataFifoRdUncErr is not reported; RHF.EccErr
                         * is set instead if the header is not suppressed.
                         */
                        if (!datalen) {
                                SDMA_DBG(req,
                                         "Request has data but pkt len is 0");
                                ret = -EFAULT;
                                goto free_tx;
                        } else if (datalen <= 32) {
                                tx->flags |= TXREQ_FLAGS_REQ_DISABLE_SH;
                        }
                }
                if (req->ahg_idx >= 0) {
                        if (!req->seqnum) {
                                ret = user_sdma_txadd_ahg(req, tx, datalen);
                                if (ret)
                                        goto free_tx;
                        } else {
                                int changes;

                                changes = set_txreq_header_ahg(req, tx,
                                                               datalen);
                                if (changes < 0) {
                                        ret = changes;
                                        goto free_tx;
                                }
                        }
                } else {
                        ret = sdma_txinit(&tx->txreq, 0, sizeof(req->hdr) +
                                          datalen, user_sdma_txreq_cb);
                        if (ret)
                                goto free_tx;
                        /*
                         * Modify the header for this packet. This only needs
                         * to be done if we are not going to use AHG. Otherwise,
                         * the HW will do it based on the changes we gave it
                         * during sdma_txinit_ahg().
                         */
                        ret = set_txreq_header(req, tx, datalen);
                        if (ret)
                                goto free_txreq;
                }

                req->koffset += datalen;
                if (req_opcode(req->info.ctrl) == EXPECTED)
                        req->tidoffset += datalen;
                req->sent += datalen;
                while (datalen) {
                        ret = hfi1_add_pages_to_sdma_packet(req, tx, iovec,
                                                            &datalen);
                        if (ret)
                                goto free_txreq;
                        iovec = &req->iovs[req->iov_idx];
                }
                list_add_tail(&tx->txreq.list, &req->txps);
                /*
                 * It is important to increment this here as it is used to
                 * generate the BTH.PSN and, therefore, can't be bulk-updated
                 * outside of the loop.
                 */
                tx->seqnum = req->seqnum++;
                npkts++;
        }
dosend:
        ret = sdma_send_txlist(req->sde,
                               iowait_get_ib_work(&pq->busy),
                               &req->txps, &count);
        req->seqsubmitted += count;
        if (req->seqsubmitted == req->info.npkts) {
                /*
                 * The txreq has already been submitted to the HW queue
                 * so we can free the AHG entry now. Corruption will not
                 * happen due to the sequential manner in which
                 * descriptors are processed.
                 */
                if (req->ahg_idx >= 0)
                        sdma_ahg_free(req->sde, req->ahg_idx);
        }
        return ret;

free_txreq:
        sdma_txclean(pq->dd, &tx->txreq);
free_tx:
        kmem_cache_free(pq->txreq_cache, tx);
        return ret;
}

static int check_header_template(struct user_sdma_request *req,
                                 struct hfi1_pkt_header *hdr, u32 lrhlen,
                                 u32 datalen)
{
        /*
         * Perform safety checks for any type of packet:
         * - transfer size is a multiple of 64 bytes
         * - packet length is a multiple of 4 bytes
         * - packet length is not larger than MTU size
         *
         * These checks are only done for the first packet of the
         * transfer since the header is "given" to us by user space.
         * For the remainder of the packets we compute the values.
         */
        if (req->info.fragsize % PIO_BLOCK_SIZE || lrhlen & 0x3 ||
            lrhlen > get_lrh_len(*hdr, req->info.fragsize))
                return -EINVAL;

        if (req_opcode(req->info.ctrl) == EXPECTED) {
                /*
                 * The header is checked only on the first packet. Furthermore,
                 * we ensure that at least one TID entry is copied when the
                 * request is submitted. Therefore, we don't have to verify that
                 * tididx points to something sane.
                 */
                u32 tidval = req->tids[req->tididx],
                    tidlen = EXP_TID_GET(tidval, LEN) * PAGE_SIZE,
                    tididx = EXP_TID_GET(tidval, IDX),
                    tidctrl = EXP_TID_GET(tidval, CTRL),
                    tidoff;
                __le32 kval = hdr->kdeth.ver_tid_offset;

                tidoff = KDETH_GET(kval, OFFSET) *
                         (KDETH_GET(req->hdr.kdeth.ver_tid_offset, OM) ?
                          KDETH_OM_LARGE : KDETH_OM_SMALL);
                /*
                 * Expected receive packets have the following
                 * additional checks:
                 * - offset is not larger than the TID size
                 * - TIDCtrl values match between header and TID array
                 * - TID indexes match between header and TID array
                 */
                if ((tidoff + datalen > tidlen) ||
                    KDETH_GET(kval, TIDCTRL) != tidctrl ||
                    KDETH_GET(kval, TID) != tididx)
                        return -EINVAL;
        }
        return 0;
}
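/*
 * Worked example (illustrative) for the PSN update below: the KDETH
 * sequence occupies the low HFI1_KDETH_BTH_SEQ_MASK bits of the PSN.
 * For an expected-receive packet whose sequence field sits at the mask
 * value, adding 'frags' wraps the sequence back to 0 while the
 * generation bits above it are preserved; an eager PSN simply
 * increments across the full 24-bit (or 31-bit with EXTENDED_PSN)
 * window.
 */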
/*
 * Correctly set the BTH.PSN field based on type of
 * transfer - eager packets can just increment the PSN but
 * expected packets encode generation and sequence in the
 * BTH.PSN field so just incrementing will result in errors.
 */
static inline u32 set_pkt_bth_psn(__be32 bthpsn, u8 expct, u32 frags)
{
        u32 val = be32_to_cpu(bthpsn),
            mask = (HFI1_CAP_IS_KSET(EXTENDED_PSN) ? 0x7fffffffull :
                    0xffffffull),
            psn = val & mask;
        if (expct)
                psn = (psn & ~HFI1_KDETH_BTH_SEQ_MASK) |
                      ((psn + frags) & HFI1_KDETH_BTH_SEQ_MASK);
        else
                psn = psn + frags;
        return psn & mask;
}

static int set_txreq_header(struct user_sdma_request *req,
                            struct user_sdma_txreq *tx, u32 datalen)
{
        struct hfi1_user_sdma_pkt_q *pq = req->pq;
        struct hfi1_pkt_header *hdr = &tx->hdr;
        u8 omfactor; /* KDETH.OM */
        u16 pbclen;
        int ret;
        u32 tidval = 0, lrhlen = get_lrh_len(*hdr, pad_len(datalen));

        /* Copy the header template to the request before modification */
        memcpy(hdr, &req->hdr, sizeof(*hdr));

        /*
         * Check if the PBC and LRH length are mismatched. If so
         * adjust both in the header.
         */
        pbclen = le16_to_cpu(hdr->pbc[0]);
        if (PBC2LRH(pbclen) != lrhlen) {
                pbclen = (pbclen & 0xf000) | LRH2PBC(lrhlen);
                hdr->pbc[0] = cpu_to_le16(pbclen);
                hdr->lrh[2] = cpu_to_be16(lrhlen >> 2);
                /*
                 * Third packet
                 * This is the first packet in the sequence that has
                 * a "static" size that can be used for the rest of
                 * the packets (besides the last one).
                 */
                if (unlikely(req->seqnum == 2)) {
                        /*
                         * From this point on the lengths in both the
                         * PBC and LRH are the same until the last
                         * packet.
                         * Adjust the template so we don't have to update
                         * every packet
                         */
                        req->hdr.pbc[0] = hdr->pbc[0];
                        req->hdr.lrh[2] = hdr->lrh[2];
                }
        }
        /*
         * We only have to modify the header if this is not the
         * first packet in the request. Otherwise, we use the
         * header given to us.
         */
        if (unlikely(!req->seqnum)) {
                ret = check_header_template(req, hdr, lrhlen, datalen);
                if (ret)
                        return ret;
                goto done;
        }

        hdr->bth[2] = cpu_to_be32(
                set_pkt_bth_psn(hdr->bth[2],
                                (req_opcode(req->info.ctrl) == EXPECTED),
                                req->seqnum));

        /* Set ACK request on last packet */
        if (unlikely(tx->flags & TXREQ_FLAGS_REQ_ACK))
                hdr->bth[2] |= cpu_to_be32(1UL << 31);

        /* Set the new offset */
        hdr->kdeth.swdata[6] = cpu_to_le32(req->koffset);
        /* Expected packets have to fill in the new TID information */
        if (req_opcode(req->info.ctrl) == EXPECTED) {
                tidval = req->tids[req->tididx];
                /*
                 * If the offset puts us at the end of the current TID,
                 * advance everything.
                 */
                if ((req->tidoffset) == (EXP_TID_GET(tidval, LEN) *
                                         PAGE_SIZE)) {
                        req->tidoffset = 0;
                        /*
                         * Since we don't copy all the TIDs at once,
                         * we have to check again.
                         */
                        if (++req->tididx > req->n_tids - 1 ||
                            !req->tids[req->tididx]) {
                                return -EINVAL;
                        }
                        tidval = req->tids[req->tididx];
                }
                omfactor = EXP_TID_GET(tidval, LEN) * PAGE_SIZE >=
                           KDETH_OM_MAX_SIZE ? KDETH_OM_LARGE_SHIFT :
                           KDETH_OM_SMALL_SHIFT;
                /* Set KDETH.TIDCtrl based on value for this TID. */
                KDETH_SET(hdr->kdeth.ver_tid_offset, TIDCTRL,
                          EXP_TID_GET(tidval, CTRL));
                /* Set KDETH.TID based on value for this TID */
                KDETH_SET(hdr->kdeth.ver_tid_offset, TID,
                          EXP_TID_GET(tidval, IDX));
                /* Clear KDETH.SH when DISABLE_SH flag is set */
                if (unlikely(tx->flags & TXREQ_FLAGS_REQ_DISABLE_SH))
                        KDETH_SET(hdr->kdeth.ver_tid_offset, SH, 0);
                /*
                 * Set the KDETH.OFFSET and KDETH.OM based on size of
                 * transfer.
                 */
                trace_hfi1_sdma_user_tid_info(
                        pq->dd, pq->ctxt, pq->subctxt, req->info.comp_idx,
                        req->tidoffset, req->tidoffset >> omfactor,
                        omfactor != KDETH_OM_SMALL_SHIFT);
                KDETH_SET(hdr->kdeth.ver_tid_offset, OFFSET,
                          req->tidoffset >> omfactor);
                KDETH_SET(hdr->kdeth.ver_tid_offset, OM,
                          omfactor != KDETH_OM_SMALL_SHIFT);
        }
done:
        trace_hfi1_sdma_user_header(pq->dd, pq->ctxt, pq->subctxt,
                                    req->info.comp_idx, hdr, tidval);
        return sdma_txadd_kvaddr(pq->dd, &tx->txreq, hdr, sizeof(*hdr));
}
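/*
 * Editor's note: the function below is the AHG variant of
 * set_txreq_header().  Instead of DMA-ing a fresh header for each packet,
 * the engine keeps the header captured from the first packet (see
 * SDMA_TXREQ_F_AHG_COPY above) and applies a short list of field updates.
 * Each ahg_header_set() entry names a header dword, a bit offset, a field
 * width, and the new value for that field.
 */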
static int set_txreq_header_ahg(struct user_sdma_request *req,
                                struct user_sdma_txreq *tx, u32 datalen)
{
        u32 ahg[AHG_KDETH_ARRAY_SIZE];
        int idx = 0;
        u8 omfactor; /* KDETH.OM */
        struct hfi1_user_sdma_pkt_q *pq = req->pq;
        struct hfi1_pkt_header *hdr = &req->hdr;
        u16 pbclen = le16_to_cpu(hdr->pbc[0]);
        u32 val32, tidval = 0, lrhlen = get_lrh_len(*hdr, pad_len(datalen));
        size_t array_size = ARRAY_SIZE(ahg);

        if (PBC2LRH(pbclen) != lrhlen) {
                /* PBC.PbcLengthDWs */
                idx = ahg_header_set(ahg, idx, array_size, 0, 0, 12,
                                     (__force u16)cpu_to_le16(LRH2PBC(lrhlen)));
                if (idx < 0)
                        return idx;
                /* LRH.PktLen (we need the full 16 bits due to byte swap) */
                idx = ahg_header_set(ahg, idx, array_size, 3, 0, 16,
                                     (__force u16)cpu_to_be16(lrhlen >> 2));
                if (idx < 0)
                        return idx;
        }

        /*
         * Do the common updates
         */
        /* BTH.PSN and BTH.A */
        val32 = (be32_to_cpu(hdr->bth[2]) + req->seqnum) &
                (HFI1_CAP_IS_KSET(EXTENDED_PSN) ? 0x7fffffff : 0xffffff);
        if (unlikely(tx->flags & TXREQ_FLAGS_REQ_ACK))
                val32 |= 1UL << 31;
        idx = ahg_header_set(ahg, idx, array_size, 6, 0, 16,
                             (__force u16)cpu_to_be16(val32 >> 16));
        if (idx < 0)
                return idx;
        idx = ahg_header_set(ahg, idx, array_size, 6, 16, 16,
                             (__force u16)cpu_to_be16(val32 & 0xffff));
        if (idx < 0)
                return idx;
        /* KDETH.Offset */
        idx = ahg_header_set(ahg, idx, array_size, 15, 0, 16,
                             (__force u16)cpu_to_le16(req->koffset & 0xffff));
        if (idx < 0)
                return idx;
        idx = ahg_header_set(ahg, idx, array_size, 15, 16, 16,
                             (__force u16)cpu_to_le16(req->koffset >> 16));
        if (idx < 0)
                return idx;
        if (req_opcode(req->info.ctrl) == EXPECTED) {
                __le16 val;

                tidval = req->tids[req->tididx];

                /*
                 * If the offset puts us at the end of the current TID,
                 * advance everything.
                 */
                if ((req->tidoffset) == (EXP_TID_GET(tidval, LEN) *
                                         PAGE_SIZE)) {
                        req->tidoffset = 0;
                        /*
                         * Since we don't copy all the TIDs at once,
                         * we have to check again.
                         */
                        if (++req->tididx > req->n_tids - 1 ||
                            !req->tids[req->tididx])
                                return -EINVAL;
                        tidval = req->tids[req->tididx];
                }
                omfactor = ((EXP_TID_GET(tidval, LEN) *
                             PAGE_SIZE) >=
                            KDETH_OM_MAX_SIZE) ? KDETH_OM_LARGE_SHIFT :
                            KDETH_OM_SMALL_SHIFT;
                /* KDETH.OM and KDETH.OFFSET (TID) */
                idx = ahg_header_set(
                        ahg, idx, array_size, 7, 0, 16,
                        ((!!(omfactor - KDETH_OM_SMALL_SHIFT)) << 15 |
                         ((req->tidoffset >> omfactor)
                          & 0x7fff)));
                if (idx < 0)
                        return idx;
                /* KDETH.TIDCtrl, KDETH.TID, KDETH.Intr, KDETH.SH */
                val = cpu_to_le16(((EXP_TID_GET(tidval, CTRL) & 0x3) << 10) |
                                  (EXP_TID_GET(tidval, IDX) & 0x3ff));

                if (unlikely(tx->flags & TXREQ_FLAGS_REQ_DISABLE_SH)) {
                        val |= cpu_to_le16((KDETH_GET(hdr->kdeth.ver_tid_offset,
                                                      INTR) <<
                                            AHG_KDETH_INTR_SHIFT));
                } else {
                        val |= KDETH_GET(hdr->kdeth.ver_tid_offset, SH) ?
                               cpu_to_le16(0x1 << AHG_KDETH_SH_SHIFT) :
                               cpu_to_le16((KDETH_GET(hdr->kdeth.ver_tid_offset,
                                                      INTR) <<
                                            AHG_KDETH_INTR_SHIFT));
                }

                idx = ahg_header_set(ahg, idx, array_size,
                                     7, 16, 14, (__force u16)val);
                if (idx < 0)
                        return idx;
        }

        trace_hfi1_sdma_user_header_ahg(pq->dd, pq->ctxt, pq->subctxt,
                                        req->info.comp_idx, req->sde->this_idx,
                                        req->ahg_idx, ahg, idx, tidval);
        sdma_txinit_ahg(&tx->txreq,
                        SDMA_TXREQ_F_USE_AHG,
                        datalen, req->ahg_idx, idx,
                        ahg, sizeof(req->hdr),
                        user_sdma_txreq_cb);

        return idx;
}
/**
 * user_sdma_txreq_cb() - SDMA tx request completion callback.
 * @txreq: valid sdma tx request
 * @status: success/failure of request
 *
 * Called when the SDMA progress state machine gets notification that
 * the SDMA descriptors for this tx request have been processed by the
 * DMA engine. Called in interrupt context.
 * Only do work on completed sequences.
 */
static void user_sdma_txreq_cb(struct sdma_txreq *txreq, int status)
{
        struct user_sdma_txreq *tx =
                container_of(txreq, struct user_sdma_txreq, txreq);
        struct user_sdma_request *req;
        struct hfi1_user_sdma_pkt_q *pq;
        struct hfi1_user_sdma_comp_q *cq;
        enum hfi1_sdma_comp_state state = COMPLETE;

        if (!tx->req)
                return;

        req = tx->req;
        pq = req->pq;
        cq = req->cq;

        if (status != SDMA_TXREQ_S_OK) {
                SDMA_DBG(req, "SDMA completion with error %d",
                         status);
                WRITE_ONCE(req->has_error, 1);
                state = ERROR;
        }

        req->seqcomp = tx->seqnum;
        kmem_cache_free(pq->txreq_cache, tx);

        /* Not the last packet's completion? Then there is nothing more to do. */
        if (req->seqcomp != req->info.npkts - 1)
                return;

        user_sdma_free_request(req);
        set_comp_state(pq, cq, req->info.comp_idx, state, status);
        pq_update(pq);
}

static inline void pq_update(struct hfi1_user_sdma_pkt_q *pq)
{
        if (atomic_dec_and_test(&pq->n_reqs))
                wake_up(&pq->wait);
}

static void user_sdma_free_request(struct user_sdma_request *req)
{
        if (!list_empty(&req->txps)) {
                struct sdma_txreq *t, *p;

                list_for_each_entry_safe(t, p, &req->txps, list) {
                        struct user_sdma_txreq *tx =
                                container_of(t, struct user_sdma_txreq, txreq);
                        list_del_init(&t->list);
                        sdma_txclean(req->pq->dd, t);
                        kmem_cache_free(req->pq->txreq_cache, tx);
                }
        }

        kfree(req->tids);
        clear_bit(req->info.comp_idx, req->pq->req_in_use);
}

static inline void set_comp_state(struct hfi1_user_sdma_pkt_q *pq,
                                  struct hfi1_user_sdma_comp_q *cq,
                                  u16 idx, enum hfi1_sdma_comp_state state,
                                  int ret)
{
        if (state == ERROR)
                cq->comps[idx].errcode = -ret;
        smp_wmb(); /* make sure errcode is visible first */
        cq->comps[idx].status = state;
        trace_hfi1_sdma_user_completion(pq->dd, pq->ctxt, pq->subctxt,
                                        idx, state, ret);
}