/*
 * Copyright(c) 2015, 2016 Intel Corporation.
 *
 * This file is provided under a dual BSD/GPLv2 license.  When using or
 * redistributing this file, you may do so under either license.
 *
 * GPL LICENSE SUMMARY
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of version 2 of the GNU General Public License as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * BSD LICENSE
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 *  - Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  - Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 *  - Neither the name of Intel Corporation nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 */
#include <linux/mm.h>
#include <linux/types.h>
#include <linux/device.h>
#include <linux/dmapool.h>
#include <linux/slab.h>
#include <linux/list.h>
#include <linux/highmem.h>
#include <linux/io.h>
#include <linux/uio.h>
#include <linux/rbtree.h>
#include <linux/spinlock.h>
#include <linux/delay.h>
#include <linux/kthread.h>
#include <linux/mmu_context.h>
#include <linux/module.h>
#include <linux/vmalloc.h>

#include "hfi.h"
#include "sdma.h"
#include "user_sdma.h"
#include "verbs.h"  /* for the headers */
#include "common.h" /* for struct hfi1_tid_info */
#include "trace.h"
#include "mmu_rb.h"

static uint hfi1_sdma_comp_ring_size = 128;
module_param_named(sdma_comp_size, hfi1_sdma_comp_ring_size, uint, S_IRUGO);
MODULE_PARM_DESC(sdma_comp_size, "Size of User SDMA completion ring. Default: 128");

/* The maximum number of data I/O vectors per message/request */
#define MAX_VECTORS_PER_REQ 8
/*
 * Maximum number of packets to send from each message/request
 * before moving to the next one.
 */
#define MAX_PKTS_PER_QUEUE 16
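
/*
 * Number of pages needed to hold x bytes starting at a page-aligned
 * address; for any x >= 1 this is equivalent to DIV_ROUND_UP(x, PAGE_SIZE).
 * For example, num_pages(PAGE_SIZE + 1) evaluates to 2.
 */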
#define num_pages(x) (1 + ((((x) - 1) & PAGE_MASK) >> PAGE_SHIFT))

#define req_opcode(x) \
	(((x) >> HFI1_SDMA_REQ_OPCODE_SHIFT) & HFI1_SDMA_REQ_OPCODE_MASK)
#define req_version(x) \
	(((x) >> HFI1_SDMA_REQ_VERSION_SHIFT) & HFI1_SDMA_REQ_OPCODE_MASK)
#define req_iovcnt(x) \
	(((x) >> HFI1_SDMA_REQ_IOVCNT_SHIFT) & HFI1_SDMA_REQ_IOVCNT_MASK)

/* Number of BTH.PSN bits used for sequence number in expected rcvs */
#define BTH_SEQ_MASK 0x7ffull

/*
 * Define fields in the KDETH header so we can update the header
 * template.
 */
#define KDETH_OFFSET_SHIFT        0
#define KDETH_OFFSET_MASK         0x7fff
#define KDETH_OM_SHIFT            15
#define KDETH_OM_MASK             0x1
#define KDETH_TID_SHIFT           16
#define KDETH_TID_MASK            0x3ff
#define KDETH_TIDCTRL_SHIFT       26
#define KDETH_TIDCTRL_MASK        0x3
#define KDETH_INTR_SHIFT          28
#define KDETH_INTR_MASK           0x1
#define KDETH_SH_SHIFT            29
#define KDETH_SH_MASK             0x1
#define KDETH_HCRC_UPPER_SHIFT    16
#define KDETH_HCRC_UPPER_MASK     0xff
#define KDETH_HCRC_LOWER_SHIFT    24
#define KDETH_HCRC_LOWER_MASK     0xff

#define AHG_KDETH_INTR_SHIFT 12
#define AHG_KDETH_SH_SHIFT   13

#define PBC2LRH(x) ((((x) & 0xfff) << 2) - 4)
#define LRH2PBC(x) ((((x) >> 2) + 1) & 0xfff)

#define KDETH_GET(val, field)						\
	(((le32_to_cpu((val))) >> KDETH_##field##_SHIFT) & KDETH_##field##_MASK)
#define KDETH_SET(dw, field, val) do {					\
		u32 dwval = le32_to_cpu(dw);				\
		dwval &= ~(KDETH_##field##_MASK << KDETH_##field##_SHIFT); \
		dwval |= (((val) & KDETH_##field##_MASK) <<		\
			  KDETH_##field##_SHIFT);			\
		dw = cpu_to_le32(dwval);				\
	} while (0)

#define AHG_HEADER_SET(arr, idx, dw, bit, width, value)			\
	do {								\
		if ((idx) < ARRAY_SIZE((arr)))				\
			(arr)[(idx++)] = sdma_build_ahg_descriptor(	\
				(__force u16)(value), (dw), (bit),	\
				(width));				\
		else							\
			return -ERANGE;					\
	} while (0)

/* KDETH OM multipliers and switch over point */
#define KDETH_OM_SMALL    4
#define KDETH_OM_LARGE    64
#define KDETH_OM_MAX_SIZE (1 << ((KDETH_OM_LARGE / KDETH_OM_SMALL) + 1))

/* Tx request flag bits */
#define TXREQ_FLAGS_REQ_ACK        BIT(0) /* Set the ACK bit in the header */
#define TXREQ_FLAGS_REQ_DISABLE_SH BIT(1) /* Disable header suppression */

/* SDMA request flag bits */
#define SDMA_REQ_FOR_THREAD 1
#define SDMA_REQ_SEND_DONE  2
#define SDMA_REQ_HAVE_AHG   3
#define SDMA_REQ_HAS_ERROR  4
#define SDMA_REQ_DONE_ERROR 5

#define SDMA_PKT_Q_INACTIVE BIT(0)
#define SDMA_PKT_Q_ACTIVE   BIT(1)
#define SDMA_PKT_Q_DEFERRED BIT(2)

/*
 * Maximum retry attempts to submit a TX request
 * before putting the process to sleep.
 */
#define MAX_DEFER_RETRY_COUNT 1

static unsigned initial_pkt_count = 8;

#define SDMA_IOWAIT_TIMEOUT 1000 /* in milliseconds */

struct sdma_mmu_node;

struct user_sdma_iovec {
	struct list_head list;
	struct iovec iov;
	/* number of pages in this vector */
	unsigned npages;
	/* array of pinned pages for this vector */
	struct page **pages;
	/*
	 * offset into the virtual address space of the vector at
	 * which we last left off.
	 */
	u64 offset;
	struct sdma_mmu_node *node;
};
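
/*
 * One entry in the pinned-page cache: the set of pages pinned for a
 * single user buffer (rb node keyed by virtual address and length),
 * the packet queue that owns it, and a refcount of requests that are
 * currently using it.
 */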
struct sdma_mmu_node {
	struct mmu_rb_node rb;
	struct hfi1_user_sdma_pkt_q *pq;
	atomic_t refcount;
	struct page **pages;
	unsigned npages;
};

/* evict operation argument */
struct evict_data {
	u32 cleared;	/* count evicted so far */
	u32 target;	/* target count to evict */
};

struct user_sdma_request {
	struct sdma_req_info info;
	struct hfi1_user_sdma_pkt_q *pq;
	struct hfi1_user_sdma_comp_q *cq;
	/* This is the original header from user space */
	struct hfi1_pkt_header hdr;
	/*
	 * Pointer to the SDMA engine for this request.
	 * Since different requests could be on different VLs,
	 * each request needs its own engine pointer.
	 */
	struct sdma_engine *sde;
	u8 ahg_idx;
	u32 ahg[9];
	/*
	 * KDETH.Offset (Eager) field
	 * We need to remember the initial value so the headers
	 * can be updated properly.
	 */
	u32 koffset;
	/*
	 * KDETH.OFFSET (TID) field
	 * The offset can cover multiple packets, depending on the
	 * size of the TID entry.
	 */
	u32 tidoffset;
	/*
	 * KDETH.OM
	 * Remember this because the header template always sets it
	 * to 0.
	 */
	u8 omfactor;
	/*
	 * We copy the iovs for this request (based on
	 * info.iovcnt). These are only the data vectors.
	 */
	unsigned data_iovs;
	/* total length of the data in the request */
	u32 data_len;
	/* progress index moving along the iovs array */
	unsigned iov_idx;
	struct user_sdma_iovec iovs[MAX_VECTORS_PER_REQ];
	/* number of elements copied to the tids array */
	u16 n_tids;
	/* TID array values copied from the tid_iov vector */
	u32 *tids;
	u16 tididx;
	u32 sent;
	u64 seqnum;
	u64 seqcomp;
	u64 seqsubmitted;
	struct list_head txps;
	unsigned long flags;
	/* status of the last txreq completed */
	int status;
};

/*
 * A single txreq could span up to 3 physical pages when the MTU
 * is sufficiently large (> 4K). Each of the IOV pointers also
 * needs its own set of flags so the vectors can be handled
 * independently of each other.
 */
struct user_sdma_txreq {
	/* Packet header for the txreq */
	struct hfi1_pkt_header hdr;
	struct sdma_txreq txreq;
	struct list_head list;
	struct user_sdma_request *req;
	u16 flags;
	unsigned busycount;
	u64 seqnum;
};

#define SDMA_DBG(req, fmt, ...)					     \
	hfi1_cdbg(SDMA, "[%u:%u:%u:%u] " fmt, (req)->pq->dd->unit,   \
		  (req)->pq->ctxt, (req)->pq->subctxt, (req)->info.comp_idx, \
		  ##__VA_ARGS__)
#define SDMA_Q_DBG(pq, fmt, ...)				     \
	hfi1_cdbg(SDMA, "[%u:%u:%u] " fmt, (pq)->dd->unit, (pq)->ctxt, \
		  (pq)->subctxt, ##__VA_ARGS__)

static int user_sdma_send_pkts(struct user_sdma_request *, unsigned);
static int num_user_pages(const struct iovec *);
static void user_sdma_txreq_cb(struct sdma_txreq *, int);
static inline void pq_update(struct hfi1_user_sdma_pkt_q *);
static void user_sdma_free_request(struct user_sdma_request *, bool);
static int pin_vector_pages(struct user_sdma_request *,
			    struct user_sdma_iovec *);
static void unpin_vector_pages(struct mm_struct *, struct page **, unsigned,
			       unsigned);
static int check_header_template(struct user_sdma_request *,
				 struct hfi1_pkt_header *, u32, u32);
static int set_txreq_header(struct user_sdma_request *,
			    struct user_sdma_txreq *, u32);
static int set_txreq_header_ahg(struct user_sdma_request *,
				struct user_sdma_txreq *, u32);
static inline void set_comp_state(struct hfi1_user_sdma_pkt_q *,
				  struct hfi1_user_sdma_comp_q *,
				  u16, enum hfi1_sdma_comp_state, int);
static inline u32 set_pkt_bth_psn(__be32, u8, u32);
static inline u32 get_lrh_len(struct hfi1_pkt_header, u32 len);

static int defer_packet_queue(
	struct sdma_engine *,
	struct iowait *,
	struct sdma_txreq *,
	unsigned seq);
static void activate_packet_queue(struct iowait *, int);
static bool sdma_rb_filter(struct mmu_rb_node *, unsigned long, unsigned long);
static int sdma_rb_insert(void *, struct mmu_rb_node *);
static int sdma_rb_evict(void *arg, struct mmu_rb_node *mnode,
			 void *arg2, bool *stop);
static void sdma_rb_remove(void *, struct mmu_rb_node *);
static int sdma_rb_invalidate(void *, struct mmu_rb_node *);

static struct mmu_rb_ops sdma_rb_ops = {
	.filter = sdma_rb_filter,
	.insert = sdma_rb_insert,
	.evict = sdma_rb_evict,
	.remove = sdma_rb_remove,
	.invalidate = sdma_rb_invalidate
};
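
/*
 * iowait callback invoked when an SDMA engine cannot accept a txreq.
 * Returns -EAGAIN to ask the caller to retry while the engine is still
 * making progress; otherwise the packet queue is marked deferred,
 * parked on the engine's dmawait list, and -EBUSY is returned.
 */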
static int defer_packet_queue(
	struct sdma_engine *sde,
	struct iowait *wait,
	struct sdma_txreq *txreq,
	unsigned seq)
{
	struct hfi1_user_sdma_pkt_q *pq =
		container_of(wait, struct hfi1_user_sdma_pkt_q, busy);
	struct hfi1_ibdev *dev = &pq->dd->verbs_dev;
	struct user_sdma_txreq *tx =
		container_of(txreq, struct user_sdma_txreq, txreq);

	if (sdma_progress(sde, seq, txreq)) {
		if (tx->busycount++ < MAX_DEFER_RETRY_COUNT)
			goto eagain;
	}
	/*
	 * We are assuming that if the list is enqueued somewhere, it
	 * is on the dmawait list since that is the only place where
	 * it is supposed to be enqueued.
	 */
	xchg(&pq->state, SDMA_PKT_Q_DEFERRED);
	write_seqlock(&dev->iowait_lock);
	if (list_empty(&pq->busy.list))
		list_add_tail(&pq->busy.list, &sde->dmawait);
	write_sequnlock(&dev->iowait_lock);
	return -EBUSY;
eagain:
	return -EAGAIN;
}

static void activate_packet_queue(struct iowait *wait, int reason)
{
	struct hfi1_user_sdma_pkt_q *pq =
		container_of(wait, struct hfi1_user_sdma_pkt_q, busy);
	xchg(&pq->state, SDMA_PKT_Q_ACTIVE);
	wake_up(&wait->wait_dma);
}

static void sdma_kmem_cache_ctor(void *obj)
{
	struct user_sdma_txreq *tx = obj;

	memset(tx, 0, sizeof(*tx));
}

int hfi1_user_sdma_alloc_queues(struct hfi1_ctxtdata *uctxt, struct file *fp)
{
	struct hfi1_filedata *fd;
	int ret = 0;
	unsigned memsize;
	char buf[64];
	struct hfi1_devdata *dd;
	struct hfi1_user_sdma_comp_q *cq;
	struct hfi1_user_sdma_pkt_q *pq;
	unsigned long flags;

	if (!uctxt || !fp) {
		ret = -EBADF;
		goto done;
	}

	fd = fp->private_data;

	if (!hfi1_sdma_comp_ring_size) {
		ret = -EINVAL;
		goto done;
	}

	dd = uctxt->dd;

	pq = kzalloc(sizeof(*pq), GFP_KERNEL);
	if (!pq)
		goto pq_nomem;

	memsize = sizeof(*pq->reqs) * hfi1_sdma_comp_ring_size;
	pq->reqs = kzalloc(memsize, GFP_KERNEL);
	if (!pq->reqs)
		goto pq_reqs_nomem;

	memsize = BITS_TO_LONGS(hfi1_sdma_comp_ring_size) * sizeof(long);
	pq->req_in_use = kzalloc(memsize, GFP_KERNEL);
	if (!pq->req_in_use)
		goto pq_reqs_no_in_use;

	INIT_LIST_HEAD(&pq->list);
	pq->dd = dd;
	pq->ctxt = uctxt->ctxt;
	pq->subctxt = fd->subctxt;
	pq->n_max_reqs = hfi1_sdma_comp_ring_size;
	pq->state = SDMA_PKT_Q_INACTIVE;
	atomic_set(&pq->n_reqs, 0);
	init_waitqueue_head(&pq->wait);
	atomic_set(&pq->n_locked, 0);
	pq->mm = fd->mm;

	iowait_init(&pq->busy, 0, NULL, defer_packet_queue,
		    activate_packet_queue, NULL);
	pq->reqidx = 0;
	snprintf(buf, 64, "txreq-kmem-cache-%u-%u-%u", dd->unit, uctxt->ctxt,
		 fd->subctxt);
	pq->txreq_cache = kmem_cache_create(buf,
					    sizeof(struct user_sdma_txreq),
					    L1_CACHE_BYTES,
					    SLAB_HWCACHE_ALIGN,
					    sdma_kmem_cache_ctor);
	if (!pq->txreq_cache) {
		dd_dev_err(dd, "[%u] Failed to allocate TxReq cache\n",
			   uctxt->ctxt);
		goto pq_txreq_nomem;
	}
	fd->pq = pq;
	cq = kzalloc(sizeof(*cq), GFP_KERNEL);
	if (!cq)
		goto cq_nomem;

	memsize = PAGE_ALIGN(sizeof(*cq->comps) * hfi1_sdma_comp_ring_size);
	cq->comps = vmalloc_user(memsize);
	if (!cq->comps)
		goto cq_comps_nomem;

	cq->nentries = hfi1_sdma_comp_ring_size;
	fd->cq = cq;

	ret = hfi1_mmu_rb_register(pq, pq->mm, &sdma_rb_ops, dd->pport->hfi1_wq,
				   &pq->handler);
	if (ret) {
		dd_dev_err(dd, "Failed to register with MMU %d", ret);
		goto done;
	}

	spin_lock_irqsave(&uctxt->sdma_qlock, flags);
	list_add(&pq->list, &uctxt->sdma_queues);
	spin_unlock_irqrestore(&uctxt->sdma_qlock, flags);
	goto done;

cq_comps_nomem:
	kfree(cq);
cq_nomem:
	kmem_cache_destroy(pq->txreq_cache);
pq_txreq_nomem:
	kfree(pq->req_in_use);
pq_reqs_no_in_use:
	kfree(pq->reqs);
pq_reqs_nomem:
	kfree(pq);
	fd->pq = NULL;
pq_nomem:
	ret = -ENOMEM;
done:
	return ret;
}
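
/*
 * Tear down the per-context user SDMA state: unregister the MMU
 * handler, drain the iowait queue, wait for all outstanding requests
 * to complete, and free the packet and completion queues.
 */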
int hfi1_user_sdma_free_queues(struct hfi1_filedata *fd)
{
	struct hfi1_ctxtdata *uctxt = fd->uctxt;
	struct hfi1_user_sdma_pkt_q *pq;
	unsigned long flags;

	hfi1_cdbg(SDMA, "[%u:%u:%u] Freeing user SDMA queues", uctxt->dd->unit,
		  uctxt->ctxt, fd->subctxt);
	pq = fd->pq;
	if (pq) {
		if (pq->handler)
			hfi1_mmu_rb_unregister(pq->handler);
		spin_lock_irqsave(&uctxt->sdma_qlock, flags);
		if (!list_empty(&pq->list))
			list_del_init(&pq->list);
		spin_unlock_irqrestore(&uctxt->sdma_qlock, flags);
		iowait_sdma_drain(&pq->busy);
		/* Wait until all requests have been freed. */
		wait_event_interruptible(
			pq->wait,
			(ACCESS_ONCE(pq->state) == SDMA_PKT_Q_INACTIVE));
		kfree(pq->reqs);
		kfree(pq->req_in_use);
		kmem_cache_destroy(pq->txreq_cache);
		kfree(pq);
		fd->pq = NULL;
	}
	if (fd->cq) {
		vfree(fd->cq->comps);
		kfree(fd->cq);
		fd->cq = NULL;
	}
	return 0;
}
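
/*
 * Hash a destination LID into a small, lazily assigned selector value.
 * The mapping table is static, so a given DLID tends to keep using the
 * same selector (and therefore the same SDMA engine) once assigned.
 */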
static u8 dlid_to_selector(u16 dlid)
{
	static u8 mapping[256];
	static int initialized;
	static u8 next;
	int hash;

	if (!initialized) {
		memset(mapping, 0xFF, 256);
		initialized = 1;
	}

	hash = ((dlid >> 8) ^ dlid) & 0xFF;
	if (mapping[hash] == 0xFF) {
		mapping[hash] = next;
		next = (next + 1) & 0x7F;
	}

	return mapping[hash];
}

int hfi1_user_sdma_process_request(struct file *fp, struct iovec *iovec,
				   unsigned long dim, unsigned long *count)
{
	int ret = 0, i;
	struct hfi1_filedata *fd = fp->private_data;
	struct hfi1_ctxtdata *uctxt = fd->uctxt;
	struct hfi1_user_sdma_pkt_q *pq = fd->pq;
	struct hfi1_user_sdma_comp_q *cq = fd->cq;
	struct hfi1_devdata *dd = pq->dd;
	unsigned long idx = 0;
	u8 pcount = initial_pkt_count;
	struct sdma_req_info info;
	struct user_sdma_request *req;
	u8 opcode, sc, vl;
	int req_queued = 0;
	u16 dlid;
	u32 selector;

	if (iovec[idx].iov_len < sizeof(info) + sizeof(req->hdr)) {
		hfi1_cdbg(
		   SDMA,
		   "[%u:%u:%u] First vector not big enough for header %lu/%lu",
		   dd->unit, uctxt->ctxt, fd->subctxt,
		   iovec[idx].iov_len, sizeof(info) + sizeof(req->hdr));
		return -EINVAL;
	}
	ret = copy_from_user(&info, iovec[idx].iov_base, sizeof(info));
	if (ret) {
		hfi1_cdbg(SDMA, "[%u:%u:%u] Failed to copy info QW (%d)",
			  dd->unit, uctxt->ctxt, fd->subctxt, ret);
		return -EFAULT;
	}

	trace_hfi1_sdma_user_reqinfo(dd, uctxt->ctxt, fd->subctxt,
				     (u16 *)&info);

	if (info.comp_idx >= hfi1_sdma_comp_ring_size) {
		hfi1_cdbg(SDMA,
			  "[%u:%u:%u:%u] Invalid comp index",
			  dd->unit, uctxt->ctxt, fd->subctxt, info.comp_idx);
		return -EINVAL;
	}

	/*
	 * Sanity check the header io vector count.  Need at least 1 vector
	 * (header) and cannot be larger than the actual io vector count.
	 */
	if (req_iovcnt(info.ctrl) < 1 || req_iovcnt(info.ctrl) > dim) {
		hfi1_cdbg(SDMA,
			  "[%u:%u:%u:%u] Invalid iov count %d, dim %ld",
			  dd->unit, uctxt->ctxt, fd->subctxt, info.comp_idx,
			  req_iovcnt(info.ctrl), dim);
		return -EINVAL;
	}

	if (!info.fragsize) {
		hfi1_cdbg(SDMA,
			  "[%u:%u:%u:%u] Request does not specify fragsize",
			  dd->unit, uctxt->ctxt, fd->subctxt, info.comp_idx);
		return -EINVAL;
	}

	/* Try to claim the request. */
	if (test_and_set_bit(info.comp_idx, pq->req_in_use)) {
		hfi1_cdbg(SDMA, "[%u:%u:%u] Entry %u is in use",
			  dd->unit, uctxt->ctxt, fd->subctxt,
			  info.comp_idx);
		return -EBADSLT;
	}
	/*
	 * All safety checks have been done and this request has been claimed.
	 */
	hfi1_cdbg(SDMA, "[%u:%u:%u] Using req/comp entry %u\n", dd->unit,
		  uctxt->ctxt, fd->subctxt, info.comp_idx);
	req = pq->reqs + info.comp_idx;
	memset(req, 0, sizeof(*req));
	req->data_iovs = req_iovcnt(info.ctrl) - 1; /* subtract header vector */
	req->pq = pq;
	req->cq = cq;
	req->status = -1;
	INIT_LIST_HEAD(&req->txps);

	memcpy(&req->info, &info, sizeof(info));

	if (req_opcode(info.ctrl) == EXPECTED) {
		/* expected requests must have TID info and at least one data vector */
		if (req->data_iovs < 2) {
			SDMA_DBG(req,
				 "Not enough vectors for expected request");
			ret = -EINVAL;
			goto free_req;
		}
		req->data_iovs--;
	}

	if (!info.npkts || req->data_iovs > MAX_VECTORS_PER_REQ) {
		SDMA_DBG(req, "Too many vectors (%u/%u)", req->data_iovs,
			 MAX_VECTORS_PER_REQ);
		ret = -EINVAL;
		goto free_req;
	}
	/* Copy the header from the user buffer */
	ret = copy_from_user(&req->hdr, iovec[idx].iov_base + sizeof(info),
			     sizeof(req->hdr));
	if (ret) {
		SDMA_DBG(req, "Failed to copy header template (%d)", ret);
		ret = -EFAULT;
		goto free_req;
	}

	/* If Static rate control is not enabled, sanitize the header. */
	if (!HFI1_CAP_IS_USET(STATIC_RATE_CTRL))
		req->hdr.pbc[2] = 0;

	/* Validate the opcode. Do not trust packets from user space blindly. */
	opcode = (be32_to_cpu(req->hdr.bth[0]) >> 24) & 0xff;
	if ((opcode & USER_OPCODE_CHECK_MASK) !=
	    USER_OPCODE_CHECK_VAL) {
		SDMA_DBG(req, "Invalid opcode (%d)", opcode);
		ret = -EINVAL;
		goto free_req;
	}
	/*
	 * Validate the vl. Do not trust packets from user space blindly.
	 * VL comes from PBC, SC comes from LRH, and the VL needs to
	 * match the SC lookup.
	 */
	vl = (le16_to_cpu(req->hdr.pbc[0]) >> 12) & 0xF;
	sc = (((be16_to_cpu(req->hdr.lrh[0]) >> 12) & 0xF) |
	      (((le16_to_cpu(req->hdr.pbc[1]) >> 14) & 0x1) << 4));
	if (vl >= dd->pport->vls_operational ||
	    vl != sc_to_vlt(dd, sc)) {
		SDMA_DBG(req, "Invalid SC(%u)/VL(%u)", sc, vl);
		ret = -EINVAL;
		goto free_req;
	}

	/* Checking P_KEY for requests from user-space */
	if (egress_pkey_check(dd->pport, req->hdr.lrh, req->hdr.bth, sc,
			      PKEY_CHECK_INVALID)) {
		ret = -EINVAL;
		goto free_req;
	}

	/*
	 * We should also check the BTH.lnh. If it says the next header is a
	 * GRH, then the RXE parsing will be off and will land in the middle
	 * of the KDETH or miss it entirely.
	 */
	if ((be16_to_cpu(req->hdr.lrh[0]) & 0x3) == HFI1_LRH_GRH) {
		SDMA_DBG(req, "User tried to pass in a GRH");
		ret = -EINVAL;
		goto free_req;
	}

	req->koffset = le32_to_cpu(req->hdr.kdeth.swdata[6]);
	/*
	 * Calculate the initial TID offset based on the values of
	 * KDETH.OFFSET and KDETH.OM that are passed in.
	 */
	req->tidoffset = KDETH_GET(req->hdr.kdeth.ver_tid_offset, OFFSET) *
		(KDETH_GET(req->hdr.kdeth.ver_tid_offset, OM) ?
		 KDETH_OM_LARGE : KDETH_OM_SMALL);
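	/*
	 * Illustrative example (values assumed): KDETH.OFFSET = 16 with
	 * KDETH.OM = 1 selects the large multiplier, giving an initial
	 * offset of 16 * KDETH_OM_LARGE = 1024 bytes.
	 */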
	SDMA_DBG(req, "Initial TID offset %u", req->tidoffset);
	idx++;

	/* Save all the IO vector structures */
	for (i = 0; i < req->data_iovs; i++) {
		INIT_LIST_HEAD(&req->iovs[i].list);
		memcpy(&req->iovs[i].iov, iovec + idx++, sizeof(struct iovec));
		ret = pin_vector_pages(req, &req->iovs[i]);
		if (ret) {
			req->status = ret;
			goto free_req;
		}
		req->data_len += req->iovs[i].iov.iov_len;
	}
	SDMA_DBG(req, "total data length %u", req->data_len);

	if (pcount > req->info.npkts)
		pcount = req->info.npkts;
	/*
	 * Copy any TID info
	 * User space will provide the TID info only when the
	 * request type is EXPECTED. This is true even if there is
	 * only one packet in the request and the header is already
	 * setup. The reason for the singular TID case is that the
	 * driver needs to perform safety checks.
	 */
	if (req_opcode(req->info.ctrl) == EXPECTED) {
		u16 ntids = iovec[idx].iov_len / sizeof(*req->tids);

		if (!ntids || ntids > MAX_TID_PAIR_ENTRIES) {
			ret = -EINVAL;
			goto free_req;
		}
		req->tids = kcalloc(ntids, sizeof(*req->tids), GFP_KERNEL);
		if (!req->tids) {
			ret = -ENOMEM;
			goto free_req;
		}
		/*
		 * We have to copy all of the tids because they may vary
		 * in size and, therefore, the TID count might not be
		 * equal to the pkt count. However, there is no way to
		 * tell at this point.
		 */
		ret = copy_from_user(req->tids, iovec[idx].iov_base,
				     ntids * sizeof(*req->tids));
		if (ret) {
			SDMA_DBG(req, "Failed to copy %d TIDs (%d)",
				 ntids, ret);
			ret = -EFAULT;
			goto free_req;
		}
		req->n_tids = ntids;
		idx++;
	}

	dlid = be16_to_cpu(req->hdr.lrh[1]);
	selector = dlid_to_selector(dlid);
	selector += uctxt->ctxt + fd->subctxt;
	req->sde = sdma_select_user_engine(dd, selector, vl);

	if (!req->sde || !sdma_running(req->sde)) {
		ret = -ECOMM;
		goto free_req;
	}

	/* We don't need an AHG entry if the request contains only one packet */
	if (req->info.npkts > 1 && HFI1_CAP_IS_USET(SDMA_AHG)) {
		int ahg = sdma_ahg_alloc(req->sde);

		if (likely(ahg >= 0)) {
			req->ahg_idx = (u8)ahg;
			set_bit(SDMA_REQ_HAVE_AHG, &req->flags);
		}
	}

	set_comp_state(pq, cq, info.comp_idx, QUEUED, 0);
	atomic_inc(&pq->n_reqs);
	req_queued = 1;
	/* Send the first N packets in the request to buy us some time */
	ret = user_sdma_send_pkts(req, pcount);
	if (unlikely(ret < 0 && ret != -EBUSY)) {
		req->status = ret;
		goto free_req;
	}

	/*
	 * It is possible that the SDMA engine would have processed all the
	 * submitted packets by the time we get here. Therefore, only set
	 * packet queue state to ACTIVE if there are still uncompleted
	 * requests.
	 */
	if (atomic_read(&pq->n_reqs))
		xchg(&pq->state, SDMA_PKT_Q_ACTIVE);

	/*
	 * This is a somewhat blocking send implementation.
	 * The driver will block the caller until all packets of the
	 * request have been submitted to the SDMA engine. However, it
	 * will not wait for send completions.
	 */
	while (!test_bit(SDMA_REQ_SEND_DONE, &req->flags)) {
		ret = user_sdma_send_pkts(req, pcount);
		if (ret < 0) {
			if (ret != -EBUSY) {
				req->status = ret;
				set_bit(SDMA_REQ_DONE_ERROR, &req->flags);
				if (ACCESS_ONCE(req->seqcomp) ==
				    req->seqsubmitted - 1)
					goto free_req;
				return ret;
			}
			wait_event_interruptible_timeout(
				pq->busy.wait_dma,
				(pq->state == SDMA_PKT_Q_ACTIVE),
				msecs_to_jiffies(
					SDMA_IOWAIT_TIMEOUT));
		}
	}
	*count += idx;
	return 0;
free_req:
	user_sdma_free_request(req, true);
	if (req_queued)
		pq_update(pq);
	set_comp_state(pq, cq, info.comp_idx, ERROR, req->status);
	return ret;
}

static inline u32 compute_data_length(struct user_sdma_request *req,
				      struct user_sdma_txreq *tx)
{
	/*
	 * Determine the proper size of the packet data.
	 * The size of the data of the first packet is in the header
	 * template. However, it includes the header and ICRC, which need
	 * to be subtracted.
	 * The minimum representable packet data length in a header is 4
	 * bytes; therefore, when the requested data length is less than 4
	 * bytes, there is only one packet and its data length equals the
	 * request data length.
	 * The size of the remaining packets is the minimum of the frag
	 * size (MTU) or remaining data in the request.
	 */
	u32 len;

	if (!req->seqnum) {
		if (req->data_len < sizeof(u32))
			len = req->data_len;
		else
			len = ((be16_to_cpu(req->hdr.lrh[2]) << 2) -
			       (sizeof(tx->hdr) - 4));
	} else if (req_opcode(req->info.ctrl) == EXPECTED) {
		u32 tidlen = EXP_TID_GET(req->tids[req->tididx], LEN) *
			PAGE_SIZE;
		/*
		 * Get the data length based on the remaining space in the
		 * TID pair.
		 */
		len = min(tidlen - req->tidoffset, (u32)req->info.fragsize);
		/* If we've filled up the TID pair, move to the next one. */
		if (unlikely(!len) && ++req->tididx < req->n_tids &&
		    req->tids[req->tididx]) {
			tidlen = EXP_TID_GET(req->tids[req->tididx],
					     LEN) * PAGE_SIZE;
			req->tidoffset = 0;
			len = min_t(u32, tidlen, req->info.fragsize);
		}
		/*
		 * Since the TID pairs map entire pages, make sure that we
		 * are not going to try to send more data than we have
		 * remaining.
		 */
		len = min(len, req->data_len - req->sent);
	} else {
		len = min(req->data_len - req->sent, (u32)req->info.fragsize);
	}
	SDMA_DBG(req, "Data Length = %u", len);
	return len;
}

static inline u32 pad_len(u32 len)
{
	if (len & (sizeof(u32) - 1))
		len += sizeof(u32) - (len & (sizeof(u32) - 1));
	return len;
}

static inline u32 get_lrh_len(struct hfi1_pkt_header hdr, u32 len)
{
	/* (Size of complete header - size of PBC) + 4B ICRC + data length */
	return ((sizeof(hdr) - sizeof(hdr.pbc)) + 4 + len);
}
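
/*
 * Build and submit up to maxpkts packets for this request: allocate a
 * txreq, compute the payload length, fill in the header (AHG or full
 * copy), attach the user pages, and hand the accumulated list to the
 * SDMA engine. Returns 0 or a negative errno; -EBUSY means the ring
 * was full and the caller should wait and retry.
 */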
static int user_sdma_send_pkts(struct user_sdma_request *req, unsigned maxpkts)
{
	int ret = 0, count;
	unsigned npkts = 0;
	struct user_sdma_txreq *tx = NULL;
	struct hfi1_user_sdma_pkt_q *pq = NULL;
	struct user_sdma_iovec *iovec = NULL;

	if (!req->pq)
		return -EINVAL;

	pq = req->pq;

	/* If tx completion has reported an error, we are done. */
	if (test_bit(SDMA_REQ_HAS_ERROR, &req->flags)) {
		set_bit(SDMA_REQ_DONE_ERROR, &req->flags);
		return -EFAULT;
	}

	/*
	 * Check if we might have sent the entire request already
	 */
	if (unlikely(req->seqnum == req->info.npkts)) {
		if (!list_empty(&req->txps))
			goto dosend;
		return ret;
	}

	if (!maxpkts || maxpkts > req->info.npkts - req->seqnum)
		maxpkts = req->info.npkts - req->seqnum;

	while (npkts < maxpkts) {
		u32 datalen = 0, queued = 0, data_sent = 0;
		u64 iov_offset = 0;

		/*
		 * Check whether any of the completions have come back
		 * with errors. If so, we are not going to process any
		 * more packets from this request.
		 */
		if (test_bit(SDMA_REQ_HAS_ERROR, &req->flags)) {
			set_bit(SDMA_REQ_DONE_ERROR, &req->flags);
			return -EFAULT;
		}

		tx = kmem_cache_alloc(pq->txreq_cache, GFP_KERNEL);
		if (!tx)
			return -ENOMEM;

		tx->flags = 0;
		tx->req = req;
		tx->busycount = 0;
		INIT_LIST_HEAD(&tx->list);

		/*
		 * For the last packet set the ACK request
		 * and disable header suppression.
		 */
		if (req->seqnum == req->info.npkts - 1)
			tx->flags |= (TXREQ_FLAGS_REQ_ACK |
				      TXREQ_FLAGS_REQ_DISABLE_SH);

		/*
		 * Calculate the payload size - this is min of the fragment
		 * (MTU) size or the remaining bytes in the request but only
		 * if we have payload data.
		 */
		if (req->data_len) {
			iovec = &req->iovs[req->iov_idx];
			if (ACCESS_ONCE(iovec->offset) == iovec->iov.iov_len) {
				if (++req->iov_idx == req->data_iovs) {
					ret = -EFAULT;
					goto free_txreq;
				}
				iovec = &req->iovs[req->iov_idx];
				WARN_ON(iovec->offset);
			}

			datalen = compute_data_length(req, tx);

			/*
			 * Disable header suppression for payloads <= 8 DWS.
			 * If there is an uncorrectable error in the receive
			 * data FIFO when the received payload size is less
			 * than or equal to 8 DWS, the RxDmaDataFifoRdUncErr
			 * is not reported. RHF.EccErr is set, however, if
			 * the header is not suppressed.
			 */
			if (!datalen) {
				SDMA_DBG(req,
					 "Request has data but pkt len is 0");
				ret = -EFAULT;
				goto free_tx;
			} else if (datalen <= 32) {
				tx->flags |= TXREQ_FLAGS_REQ_DISABLE_SH;
			}
		}

		if (test_bit(SDMA_REQ_HAVE_AHG, &req->flags)) {
			if (!req->seqnum) {
				u16 pbclen = le16_to_cpu(req->hdr.pbc[0]);
				u32 lrhlen = get_lrh_len(req->hdr,
							 pad_len(datalen));
				/*
				 * Copy the request header into the tx header
				 * because the HW needs a cacheline-aligned
				 * address.
				 * This copy can be optimized out if the hdr
				 * member of user_sdma_request were also
				 * cacheline aligned.
				 */
				memcpy(&tx->hdr, &req->hdr, sizeof(tx->hdr));
				if (PBC2LRH(pbclen) != lrhlen) {
					pbclen = (pbclen & 0xf000) |
						LRH2PBC(lrhlen);
					tx->hdr.pbc[0] = cpu_to_le16(pbclen);
				}
				ret = check_header_template(req, &tx->hdr,
							    lrhlen, datalen);
				if (ret)
					goto free_tx;
				ret = sdma_txinit_ahg(&tx->txreq,
						      SDMA_TXREQ_F_AHG_COPY,
						      sizeof(tx->hdr) + datalen,
						      req->ahg_idx, 0, NULL, 0,
						      user_sdma_txreq_cb);
				if (ret)
					goto free_tx;
				ret = sdma_txadd_kvaddr(pq->dd, &tx->txreq,
							&tx->hdr,
							sizeof(tx->hdr));
				if (ret)
					goto free_txreq;
			} else {
				int changes;

				changes = set_txreq_header_ahg(req, tx,
							       datalen);
				if (changes < 0)
					goto free_tx;
				sdma_txinit_ahg(&tx->txreq,
						SDMA_TXREQ_F_USE_AHG,
						datalen, req->ahg_idx, changes,
						req->ahg, sizeof(req->hdr),
						user_sdma_txreq_cb);
			}
		} else {
			ret = sdma_txinit(&tx->txreq, 0, sizeof(req->hdr) +
					  datalen, user_sdma_txreq_cb);
			if (ret)
				goto free_tx;
			/*
			 * Modify the header for this packet. This only needs
			 * to be done if we are not going to use AHG. Otherwise,
			 * the HW will do it based on the changes we gave it
			 * during sdma_txinit_ahg().
			 */
			ret = set_txreq_header(req, tx, datalen);
			if (ret)
				goto free_txreq;
		}

		/*
		 * If the request contains any data vectors, add up to
		 * fragsize bytes to the descriptor.
		 */
		while (queued < datalen &&
		       (req->sent + data_sent) < req->data_len) {
			unsigned long base, offset;
			unsigned pageidx, len;

			base = (unsigned long)iovec->iov.iov_base;
			offset = offset_in_page(base + iovec->offset +
						iov_offset);
			pageidx = (((iovec->offset + iov_offset +
				     base) - (base & PAGE_MASK)) >> PAGE_SHIFT);
			len = offset + req->info.fragsize > PAGE_SIZE ?
				PAGE_SIZE - offset : req->info.fragsize;
			len = min((datalen - queued), len);
			ret = sdma_txadd_page(pq->dd, &tx->txreq,
					      iovec->pages[pageidx],
					      offset, len);
			if (ret) {
				SDMA_DBG(req, "SDMA txreq add page failed %d\n",
					 ret);
				goto free_txreq;
			}
			iov_offset += len;
			queued += len;
			data_sent += len;
			if (unlikely(queued < datalen &&
				     pageidx == iovec->npages &&
				     req->iov_idx < req->data_iovs - 1)) {
				iovec->offset += iov_offset;
				iovec = &req->iovs[++req->iov_idx];
				iov_offset = 0;
			}
		}
		/*
		 * The txreq was submitted successfully so we can update
		 * the counters.
		 */
		req->koffset += datalen;
		if (req_opcode(req->info.ctrl) == EXPECTED)
			req->tidoffset += datalen;
		req->sent += data_sent;
		if (req->data_len)
			iovec->offset += iov_offset;
		list_add_tail(&tx->txreq.list, &req->txps);
		/*
		 * It is important to increment this here as it is used to
		 * generate the BTH.PSN and, therefore, can't be bulk-updated
		 * outside of the loop.
		 */
		tx->seqnum = req->seqnum++;
		npkts++;
	}
dosend:
	ret = sdma_send_txlist(req->sde, &pq->busy, &req->txps, &count);
	req->seqsubmitted += count;
	if (req->seqsubmitted == req->info.npkts) {
		set_bit(SDMA_REQ_SEND_DONE, &req->flags);
		/*
		 * The txreq has already been submitted to the HW queue
		 * so we can free the AHG entry now. Corruption will not
		 * happen due to the sequential manner in which
		 * descriptors are processed.
		 */
		if (test_bit(SDMA_REQ_HAVE_AHG, &req->flags))
			sdma_ahg_free(req->sde, req->ahg_idx);
	}
	return ret;

free_txreq:
	sdma_txclean(pq->dd, &tx->txreq);
free_tx:
	kmem_cache_free(pq->txreq_cache, tx);
	return ret;
}

/*
 * How many pages in this iovec element?
 */
static inline int num_user_pages(const struct iovec *iov)
{
	const unsigned long addr  = (unsigned long)iov->iov_base;
	const unsigned long len   = iov->iov_len;
	const unsigned long spage = addr & PAGE_MASK;
	const unsigned long epage = (addr + len - 1) & PAGE_MASK;

	return 1 + ((epage - spage) >> PAGE_SHIFT);
}

static u32 sdma_cache_evict(struct hfi1_user_sdma_pkt_q *pq, u32 npages)
{
	struct evict_data evict_data;

	evict_data.cleared = 0;
	evict_data.target = npages;
	hfi1_mmu_rb_evict(pq->handler, &evict_data);
	return evict_data.cleared;
}

static int pin_vector_pages(struct user_sdma_request *req,
			    struct user_sdma_iovec *iovec)
{
	int ret = 0, pinned, npages, cleared;
	struct page **pages;
	struct hfi1_user_sdma_pkt_q *pq = req->pq;
	struct sdma_mmu_node *node = NULL;
	struct mmu_rb_node *rb_node;

	rb_node = hfi1_mmu_rb_extract(pq->handler,
				      (unsigned long)iovec->iov.iov_base,
				      iovec->iov.iov_len);
	if (rb_node)
		node = container_of(rb_node, struct sdma_mmu_node, rb);
	else
		rb_node = NULL;

	if (!node) {
		node = kzalloc(sizeof(*node), GFP_KERNEL);
		if (!node)
			return -ENOMEM;

		node->rb.addr = (unsigned long)iovec->iov.iov_base;
		node->pq = pq;
		atomic_set(&node->refcount, 0);
	}

	npages = num_user_pages(&iovec->iov);
	if (node->npages < npages) {
		pages = kcalloc(npages, sizeof(*pages), GFP_KERNEL);
		if (!pages) {
			SDMA_DBG(req, "Failed page array alloc");
			ret = -ENOMEM;
			goto bail;
		}
		memcpy(pages, node->pages, node->npages * sizeof(*pages));

		npages -= node->npages;

retry:
		if (!hfi1_can_pin_pages(pq->dd, pq->mm,
					atomic_read(&pq->n_locked), npages)) {
			cleared = sdma_cache_evict(pq, npages);
			if (cleared >= npages)
				goto retry;
		}
		pinned = hfi1_acquire_user_pages(pq->mm,
			((unsigned long)iovec->iov.iov_base +
			 (node->npages * PAGE_SIZE)), npages, 0,
			pages + node->npages);
		if (pinned < 0) {
			kfree(pages);
			ret = pinned;
			goto bail;
		}
		if (pinned != npages) {
			unpin_vector_pages(pq->mm, pages, node->npages,
					   pinned);
			ret = -EFAULT;
			goto bail;
		}
		kfree(node->pages);
		node->rb.len = iovec->iov.iov_len;
		node->pages = pages;
		node->npages += pinned;
		npages = node->npages;
		atomic_add(pinned, &pq->n_locked);
	}
	iovec->pages = node->pages;
	iovec->npages = npages;
	iovec->node = node;

	ret = hfi1_mmu_rb_insert(req->pq->handler, &node->rb);
	if (ret) {
		atomic_sub(node->npages, &pq->n_locked);
		iovec->node = NULL;
		goto bail;
	}
	return 0;
bail:
	if (rb_node)
		unpin_vector_pages(pq->mm, node->pages, 0, node->npages);
	kfree(node);
	return ret;
}

static void unpin_vector_pages(struct mm_struct *mm, struct page **pages,
			       unsigned start, unsigned npages)
{
	hfi1_release_user_pages(mm, pages + start, npages, false);
	kfree(pages);
}

static int check_header_template(struct user_sdma_request *req,
				 struct hfi1_pkt_header *hdr, u32 lrhlen,
				 u32 datalen)
{
	/*
	 * Perform safety checks for any type of packet:
	 * - transfer size is a multiple of 64 bytes
	 * - packet length is a multiple of 4 bytes
	 * - packet length is not larger than MTU size
	 *
	 * These checks are only done for the first packet of the
	 * transfer since the header is "given" to us by user space.
	 * For the remainder of the packets we compute the values.
	 */
	if (req->info.fragsize % PIO_BLOCK_SIZE || lrhlen & 0x3 ||
	    lrhlen > get_lrh_len(*hdr, req->info.fragsize))
		return -EINVAL;

	if (req_opcode(req->info.ctrl) == EXPECTED) {
		/*
		 * The header is checked only on the first packet. Furthermore,
		 * we ensure that at least one TID entry is copied when the
		 * request is submitted. Therefore, we don't have to verify that
		 * tididx points to something sane.
		 */
		u32 tidval = req->tids[req->tididx],
			tidlen = EXP_TID_GET(tidval, LEN) * PAGE_SIZE,
			tididx = EXP_TID_GET(tidval, IDX),
			tidctrl = EXP_TID_GET(tidval, CTRL),
			tidoff;
		__le32 kval = hdr->kdeth.ver_tid_offset;

		tidoff = KDETH_GET(kval, OFFSET) *
			  (KDETH_GET(req->hdr.kdeth.ver_tid_offset, OM) ?
			   KDETH_OM_LARGE : KDETH_OM_SMALL);
		/*
		 * Expected receive packets have the following
		 * additional checks:
		 * - offset is not larger than the TID size
		 * - TIDCtrl values match between header and TID array
		 * - TID indexes match between header and TID array
		 */
		if ((tidoff + datalen > tidlen) ||
		    KDETH_GET(kval, TIDCTRL) != tidctrl ||
		    KDETH_GET(kval, TID) != tididx)
			return -EINVAL;
	}
	return 0;
}

/*
 * Correctly set the BTH.PSN field based on type of
 * transfer - eager packets can just increment the PSN but
 * expected packets encode generation and sequence in the
 * BTH.PSN field so just incrementing will result in errors.
 */
static inline u32 set_pkt_bth_psn(__be32 bthpsn, u8 expct, u32 frags)
{
	u32 val = be32_to_cpu(bthpsn),
		mask = (HFI1_CAP_IS_KSET(EXTENDED_PSN) ? 0x7fffffffull :
			0xffffffull),
		psn = val & mask;
	if (expct)
		psn = (psn & ~BTH_SEQ_MASK) | ((psn + frags) & BTH_SEQ_MASK);
	else
		psn = psn + frags;
	return psn & mask;
}
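
/*
 * Fill in the header for one packet on the non-AHG path: copy the user
 * template, fix up the PBC/LRH lengths if they changed, advance
 * BTH.PSN and the KDETH offsets, and update the TID fields for
 * expected receives before adding the header to the txreq.
 */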
static int set_txreq_header(struct user_sdma_request *req,
			    struct user_sdma_txreq *tx, u32 datalen)
{
	struct hfi1_user_sdma_pkt_q *pq = req->pq;
	struct hfi1_pkt_header *hdr = &tx->hdr;
	u16 pbclen;
	int ret;
	u32 tidval = 0, lrhlen = get_lrh_len(*hdr, pad_len(datalen));

	/* Copy the header template to the request before modification */
	memcpy(hdr, &req->hdr, sizeof(*hdr));

	/*
	 * Check if the PBC and LRH length are mismatched. If so
	 * adjust both in the header.
	 */
	pbclen = le16_to_cpu(hdr->pbc[0]);
	if (PBC2LRH(pbclen) != lrhlen) {
		pbclen = (pbclen & 0xf000) | LRH2PBC(lrhlen);
		hdr->pbc[0] = cpu_to_le16(pbclen);
		hdr->lrh[2] = cpu_to_be16(lrhlen >> 2);
		/*
		 * Third packet
		 * This is the first packet in the sequence that has
		 * a "static" size that can be used for the rest of
		 * the packets (besides the last one).
		 */
		if (unlikely(req->seqnum == 2)) {
			/*
			 * From this point on the lengths in both the
			 * PBC and LRH are the same until the last
			 * packet.
			 * Adjust the template so we don't have to update
			 * every packet.
			 */
			req->hdr.pbc[0] = hdr->pbc[0];
			req->hdr.lrh[2] = hdr->lrh[2];
		}
	}
	/*
	 * We only have to modify the header if this is not the
	 * first packet in the request. Otherwise, we use the
	 * header given to us.
	 */
	if (unlikely(!req->seqnum)) {
		ret = check_header_template(req, hdr, lrhlen, datalen);
		if (ret)
			return ret;
		goto done;
	}

	hdr->bth[2] = cpu_to_be32(
		set_pkt_bth_psn(hdr->bth[2],
				(req_opcode(req->info.ctrl) == EXPECTED),
				req->seqnum));

	/* Set ACK request on last packet */
	if (unlikely(tx->flags & TXREQ_FLAGS_REQ_ACK))
		hdr->bth[2] |= cpu_to_be32(1UL << 31);

	/* Set the new offset */
	hdr->kdeth.swdata[6] = cpu_to_le32(req->koffset);
	/* Expected packets have to fill in the new TID information */
	if (req_opcode(req->info.ctrl) == EXPECTED) {
		tidval = req->tids[req->tididx];
		/*
		 * If the offset puts us at the end of the current TID,
		 * advance everything.
		 */
		if ((req->tidoffset) == (EXP_TID_GET(tidval, LEN) *
					 PAGE_SIZE)) {
			req->tidoffset = 0;
			/*
			 * Since we don't copy all the TIDs at once,
			 * we have to check again.
			 */
			if (++req->tididx > req->n_tids - 1 ||
			    !req->tids[req->tididx]) {
				return -EINVAL;
			}
			tidval = req->tids[req->tididx];
		}
		req->omfactor = EXP_TID_GET(tidval, LEN) * PAGE_SIZE >=
			KDETH_OM_MAX_SIZE ? KDETH_OM_LARGE : KDETH_OM_SMALL;
		/* Set KDETH.TIDCtrl based on value for this TID. */
		KDETH_SET(hdr->kdeth.ver_tid_offset, TIDCTRL,
			  EXP_TID_GET(tidval, CTRL));
		/* Set KDETH.TID based on value for this TID */
		KDETH_SET(hdr->kdeth.ver_tid_offset, TID,
			  EXP_TID_GET(tidval, IDX));
		/* Clear KDETH.SH when DISABLE_SH flag is set */
		if (unlikely(tx->flags & TXREQ_FLAGS_REQ_DISABLE_SH))
			KDETH_SET(hdr->kdeth.ver_tid_offset, SH, 0);
		/*
		 * Set the KDETH.OFFSET and KDETH.OM based on size of
		 * transfer.
		 */
		SDMA_DBG(req, "TID offset %ubytes %uunits om%u",
			 req->tidoffset, req->tidoffset / req->omfactor,
			 req->omfactor != KDETH_OM_SMALL);
		KDETH_SET(hdr->kdeth.ver_tid_offset, OFFSET,
			  req->tidoffset / req->omfactor);
		KDETH_SET(hdr->kdeth.ver_tid_offset, OM,
			  req->omfactor != KDETH_OM_SMALL);
	}
done:
	trace_hfi1_sdma_user_header(pq->dd, pq->ctxt, pq->subctxt,
				    req->info.comp_idx, hdr, tidval);
	return sdma_txadd_kvaddr(pq->dd, &tx->txreq, hdr, sizeof(*hdr));
}
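
/*
 * Build the AHG update list for one packet instead of copying the whole
 * header: only the dwords that change from packet to packet (PBC/LRH
 * lengths, BTH.PSN/A, KDETH offsets and TID fields) are described.
 * Returns the number of AHG entries used, or a negative errno.
 */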
static int set_txreq_header_ahg(struct user_sdma_request *req,
				struct user_sdma_txreq *tx, u32 len)
{
	int diff = 0;
	struct hfi1_user_sdma_pkt_q *pq = req->pq;
	struct hfi1_pkt_header *hdr = &req->hdr;
	u16 pbclen = le16_to_cpu(hdr->pbc[0]);
	u32 val32, tidval = 0, lrhlen = get_lrh_len(*hdr, pad_len(len));

	if (PBC2LRH(pbclen) != lrhlen) {
		/* PBC.PbcLengthDWs */
		AHG_HEADER_SET(req->ahg, diff, 0, 0, 12,
			       cpu_to_le16(LRH2PBC(lrhlen)));
		/* LRH.PktLen (we need the full 16 bits due to byte swap) */
		AHG_HEADER_SET(req->ahg, diff, 3, 0, 16,
			       cpu_to_be16(lrhlen >> 2));
	}

	/*
	 * Do the common updates
	 */
	/* BTH.PSN and BTH.A */
	val32 = (be32_to_cpu(hdr->bth[2]) + req->seqnum) &
		(HFI1_CAP_IS_KSET(EXTENDED_PSN) ? 0x7fffffff : 0xffffff);
	if (unlikely(tx->flags & TXREQ_FLAGS_REQ_ACK))
		val32 |= 1UL << 31;
	AHG_HEADER_SET(req->ahg, diff, 6, 0, 16, cpu_to_be16(val32 >> 16));
	AHG_HEADER_SET(req->ahg, diff, 6, 16, 16, cpu_to_be16(val32 & 0xffff));
	/* KDETH.Offset */
	AHG_HEADER_SET(req->ahg, diff, 15, 0, 16,
		       cpu_to_le16(req->koffset & 0xffff));
	AHG_HEADER_SET(req->ahg, diff, 15, 16, 16,
		       cpu_to_le16(req->koffset >> 16));
	if (req_opcode(req->info.ctrl) == EXPECTED) {
		__le16 val;

		tidval = req->tids[req->tididx];

		/*
		 * If the offset puts us at the end of the current TID,
		 * advance everything.
		 */
		if ((req->tidoffset) == (EXP_TID_GET(tidval, LEN) *
					 PAGE_SIZE)) {
			req->tidoffset = 0;
			/*
			 * Since we don't copy all the TIDs at once,
			 * we have to check again.
			 */
			if (++req->tididx > req->n_tids - 1 ||
			    !req->tids[req->tididx]) {
				return -EINVAL;
			}
			tidval = req->tids[req->tididx];
		}
		req->omfactor = ((EXP_TID_GET(tidval, LEN) *
				  PAGE_SIZE) >=
				 KDETH_OM_MAX_SIZE) ? KDETH_OM_LARGE :
				 KDETH_OM_SMALL;
		/* KDETH.OM and KDETH.OFFSET (TID) */
		AHG_HEADER_SET(req->ahg, diff, 7, 0, 16,
			       ((!!(req->omfactor - KDETH_OM_SMALL)) << 15 |
				((req->tidoffset / req->omfactor) & 0x7fff)));
		/* KDETH.TIDCtrl, KDETH.TID, KDETH.Intr, KDETH.SH */
		val = cpu_to_le16(((EXP_TID_GET(tidval, CTRL) & 0x3) << 10) |
				  (EXP_TID_GET(tidval, IDX) & 0x3ff));

		if (unlikely(tx->flags & TXREQ_FLAGS_REQ_DISABLE_SH)) {
			val |= cpu_to_le16((KDETH_GET(hdr->kdeth.ver_tid_offset,
						      INTR) <<
					    AHG_KDETH_INTR_SHIFT));
		} else {
			val |= KDETH_GET(hdr->kdeth.ver_tid_offset, SH) ?
			       cpu_to_le16(0x1 << AHG_KDETH_SH_SHIFT) :
			       cpu_to_le16((KDETH_GET(hdr->kdeth.ver_tid_offset,
						      INTR) <<
					    AHG_KDETH_INTR_SHIFT));
		}

		AHG_HEADER_SET(req->ahg, diff, 7, 16, 14, val);
	}

	trace_hfi1_sdma_user_header_ahg(pq->dd, pq->ctxt, pq->subctxt,
					req->info.comp_idx, req->sde->this_idx,
					req->ahg_idx, req->ahg, diff, tidval);
	return diff;
}

/*
 * SDMA tx request completion callback. Called when the SDMA progress
 * state machine gets notification that the SDMA descriptors for this
 * tx request have been processed by the DMA engine. Called in
 * interrupt context.
 */
static void user_sdma_txreq_cb(struct sdma_txreq *txreq, int status)
{
	struct user_sdma_txreq *tx =
		container_of(txreq, struct user_sdma_txreq, txreq);
	struct user_sdma_request *req;
	struct hfi1_user_sdma_pkt_q *pq;
	struct hfi1_user_sdma_comp_q *cq;
	u16 idx;

	if (!tx->req)
		return;

	req = tx->req;
	pq = req->pq;
	cq = req->cq;

	if (status != SDMA_TXREQ_S_OK) {
		SDMA_DBG(req, "SDMA completion with error %d",
			 status);
		set_bit(SDMA_REQ_HAS_ERROR, &req->flags);
	}

	req->seqcomp = tx->seqnum;
	kmem_cache_free(pq->txreq_cache, tx);
	tx = NULL;

	idx = req->info.comp_idx;
	if (req->status == -1 && status == SDMA_TXREQ_S_OK) {
		if (req->seqcomp == req->info.npkts - 1) {
			req->status = 0;
			user_sdma_free_request(req, false);
			pq_update(pq);
			set_comp_state(pq, cq, idx, COMPLETE, 0);
		}
	} else {
		if (status != SDMA_TXREQ_S_OK)
			req->status = status;
		if (req->seqcomp == (ACCESS_ONCE(req->seqsubmitted) - 1) &&
		    (test_bit(SDMA_REQ_SEND_DONE, &req->flags) ||
		     test_bit(SDMA_REQ_DONE_ERROR, &req->flags))) {
			user_sdma_free_request(req, false);
			pq_update(pq);
			set_comp_state(pq, cq, idx, ERROR, req->status);
		}
	}
}

static inline void pq_update(struct hfi1_user_sdma_pkt_q *pq)
{
	if (atomic_dec_and_test(&pq->n_reqs)) {
		xchg(&pq->state, SDMA_PKT_Q_INACTIVE);
		wake_up(&pq->wait);
	}
}

static void user_sdma_free_request(struct user_sdma_request *req, bool unpin)
{
	if (!list_empty(&req->txps)) {
		struct sdma_txreq *t, *p;

		list_for_each_entry_safe(t, p, &req->txps, list) {
			struct user_sdma_txreq *tx =
				container_of(t, struct user_sdma_txreq, txreq);
			list_del_init(&t->list);
			sdma_txclean(req->pq->dd, t);
			kmem_cache_free(req->pq->txreq_cache, tx);
		}
	}
	if (req->data_iovs) {
		struct sdma_mmu_node *node;
		int i;

		for (i = 0; i < req->data_iovs; i++) {
			node = req->iovs[i].node;
			if (!node)
				continue;

			if (unpin)
				hfi1_mmu_rb_remove(req->pq->handler,
						   &node->rb);
			else
				atomic_dec(&node->refcount);
		}
	}
	kfree(req->tids);
	clear_bit(req->info.comp_idx, req->pq->req_in_use);
}

static inline void set_comp_state(struct hfi1_user_sdma_pkt_q *pq,
				  struct hfi1_user_sdma_comp_q *cq,
				  u16 idx, enum hfi1_sdma_comp_state state,
				  int ret)
{
	hfi1_cdbg(SDMA, "[%u:%u:%u:%u] Setting completion status %u %d",
		  pq->dd->unit, pq->ctxt, pq->subctxt, idx, state, ret);
	cq->comps[idx].status = state;
	if (state == ERROR)
		cq->comps[idx].errcode = -ret;
	trace_hfi1_sdma_user_completion(pq->dd, pq->ctxt, pq->subctxt,
					idx, state, ret);
}

static bool sdma_rb_filter(struct mmu_rb_node *node, unsigned long addr,
			   unsigned long len)
{
	return (bool)(node->addr == addr);
}

static int sdma_rb_insert(void *arg, struct mmu_rb_node *mnode)
{
	struct sdma_mmu_node *node =
		container_of(mnode, struct sdma_mmu_node, rb);

	atomic_inc(&node->refcount);
	return 0;
}

/*
 * Return 1 to remove the node from the rb tree and call the remove op.
 *
 * Called with the rb tree lock held.
 */
static int sdma_rb_evict(void *arg, struct mmu_rb_node *mnode,
			 void *evict_arg, bool *stop)
{
	struct sdma_mmu_node *node =
		container_of(mnode, struct sdma_mmu_node, rb);
	struct evict_data *evict_data = evict_arg;

	/* is this node still being used? */
	if (atomic_read(&node->refcount))
		return 0; /* keep this node */

	/* this node will be evicted, add its pages to our count */
	evict_data->cleared += node->npages;

	/* have enough pages been cleared? */
	if (evict_data->cleared >= evict_data->target)
		*stop = true;

	return 1; /* remove this node */
}

static void sdma_rb_remove(void *arg, struct mmu_rb_node *mnode)
{
	struct sdma_mmu_node *node =
		container_of(mnode, struct sdma_mmu_node, rb);

	atomic_sub(node->npages, &node->pq->n_locked);

	unpin_vector_pages(node->pq->mm, node->pages, 0, node->npages);

	kfree(node);
}

static int sdma_rb_invalidate(void *arg, struct mmu_rb_node *mnode)
{
	struct sdma_mmu_node *node =
		container_of(mnode, struct sdma_mmu_node, rb);

	if (!atomic_read(&node->refcount))
		return 1;
	return 0;
}