/*-
 * Copyright (c) 2012 Chelsio Communications, Inc.
 * All rights reserved.
 * Written by: Navdeep Parhar <np@FreeBSD.org>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_inet.h"

#include <sys/param.h>
#include <sys/aio.h>
#include <sys/file.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/ktr.h>
#include <sys/module.h>
#include <sys/protosw.h>
#include <sys/proc.h>
#include <sys/domain.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/taskqueue.h>
#include <sys/uio.h>
#include <netinet/in.h>
#include <netinet/in_pcb.h>
#include <netinet/ip.h>
#include <netinet/tcp_var.h>
#define TCPSTATES
#include <netinet/tcp_fsm.h>
#include <netinet/toecore.h>

#include <vm/vm.h>
#include <vm/vm_extern.h>
#include <vm/vm_param.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_page.h>
#include <vm/vm_object.h>

#ifdef TCP_OFFLOAD
#include "common/common.h"
#include "common/t4_msg.h"
#include "common/t4_regs.h"
#include "common/t4_tcb.h"
#include "tom/t4_tom.h"

VNET_DECLARE(int, tcp_do_autorcvbuf);
#define V_tcp_do_autorcvbuf VNET(tcp_do_autorcvbuf)
VNET_DECLARE(int, tcp_autorcvbuf_inc);
#define V_tcp_autorcvbuf_inc VNET(tcp_autorcvbuf_inc)
VNET_DECLARE(int, tcp_autorcvbuf_max);
#define V_tcp_autorcvbuf_max VNET(tcp_autorcvbuf_max)

/*
 * Use the 'backend3' field in AIO jobs to store the amount of data
 * received by the AIO job so far.
 */
#define aio_received backend3

static void aio_ddp_requeue_task(void *context, int pending);
static void ddp_complete_all(struct toepcb *toep, int error);
static void t4_aio_cancel_active(struct kaiocb *job);
static void t4_aio_cancel_queued(struct kaiocb *job);

static TAILQ_HEAD(, pageset) ddp_orphan_pagesets;
static struct mtx ddp_orphan_pagesets_lock;
static struct task ddp_orphan_task;

#define MAX_DDP_BUFFER_SIZE	(M_TCB_RX_DDP_BUF0_LEN)

/*
 * A page set holds information about a buffer used for DDP.  The page
 * set holds resources such as the VM pages backing the buffer (either
 * held or wired) and the page pods associated with the buffer.
 * Recently used page sets are cached to allow for efficient reuse of
 * buffers (avoiding the need to re-fault in pages, hold them, etc.).
 * Note that cached page sets keep the backing pages wired.  The
 * number of wired pages is capped by only allowing for two wired
 * pagesets per connection.  This is not a perfect cap, but is a
 * trade-off for performance.
 *
 * If an application ping-pongs two buffers for a connection via
 * aio_read(2) then those buffers should remain wired and expensive VM
 * fault lookups should be avoided after each buffer has been used
 * once.  If an application uses more than two buffers then this will
 * fall back to doing expensive VM fault lookups for each operation.
 */
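/*
 * Hypothetical userland sketch of the two-buffer ping-pong described
 * above (not part of this file; 'sock', 'buf' and 'consume' are
 * illustrative names only):
 *
 *	struct aiocb iocb[2], *done;
 *	static char buf[2][256 * 1024];
 *	int i;
 *
 *	for (i = 0; i < 2; i++) {
 *		iocb[i].aio_fildes = sock;
 *		iocb[i].aio_buf = buf[i];
 *		iocb[i].aio_nbytes = sizeof(buf[i]);
 *		aio_read(&iocb[i]);
 *	}
 *	for (;;) {
 *		aio_waitcomplete(&done, NULL);
 *		consume(done);
 *		aio_read(done);		(resubmit the same buffer)
 *	}
 *
 * With exactly two outstanding buffers both pagesets stay in the
 * per-connection cache, so steady-state reads reuse the wired pages and
 * previously written page pods instead of repeating that work for every
 * request.
 */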
static void
free_pageset(struct tom_data *td, struct pageset *ps)
{
	vm_page_t p;
	int i;

	if (ps->prsv.prsv_nppods > 0)
		t4_free_page_pods(&ps->prsv);

	if (ps->flags & PS_WIRED) {
		for (i = 0; i < ps->npages; i++) {
			p = ps->pages[i];
			vm_page_lock(p);
			vm_page_unwire(p, PQ_INACTIVE);
			vm_page_unlock(p);
		}
	} else
		vm_page_unhold_pages(ps->pages, ps->npages);
	mtx_lock(&ddp_orphan_pagesets_lock);
	TAILQ_INSERT_TAIL(&ddp_orphan_pagesets, ps, link);
	taskqueue_enqueue(taskqueue_thread, &ddp_orphan_task);
	mtx_unlock(&ddp_orphan_pagesets_lock);
}

static void
ddp_free_orphan_pagesets(void *context, int pending)
{
	struct pageset *ps;

	mtx_lock(&ddp_orphan_pagesets_lock);
	while (!TAILQ_EMPTY(&ddp_orphan_pagesets)) {
		ps = TAILQ_FIRST(&ddp_orphan_pagesets);
		TAILQ_REMOVE(&ddp_orphan_pagesets, ps, link);
		mtx_unlock(&ddp_orphan_pagesets_lock);
		if (ps->vm)
			vmspace_free(ps->vm);
		free(ps, M_CXGBE);
		mtx_lock(&ddp_orphan_pagesets_lock);
	}
	mtx_unlock(&ddp_orphan_pagesets_lock);
}

static void
recycle_pageset(struct toepcb *toep, struct pageset *ps)
{

	DDP_ASSERT_LOCKED(toep);
	if (!(toep->ddp_flags & DDP_DEAD) && ps->flags & PS_WIRED) {
		KASSERT(toep->ddp_cached_count + toep->ddp_active_count <
		    nitems(toep->db), ("too many wired pagesets"));
		TAILQ_INSERT_HEAD(&toep->ddp_cached_pagesets, ps, link);
		toep->ddp_cached_count++;
	} else
		free_pageset(toep->td, ps);
}

static void
ddp_complete_one(struct kaiocb *job, int error)
{
	long copied;

	/*
	 * If this job had copied data out of the socket buffer before
	 * it was cancelled, report it as a short read rather than an
	 * error.
	 */
	copied = job->aio_received;
	if (copied != 0 || error == 0)
		aio_complete(job, copied, 0);
	else
		aio_complete(job, -1, error);
}

static void
free_ddp_buffer(struct tom_data *td, struct ddp_buffer *db)
{

	if (db->job) {
		/*
		 * XXX: If we are un-offloading the socket then we
		 * should requeue these on the socket somehow.  If we
		 * got a FIN from the remote end, then this completes
		 * any remaining requests with an EOF read.
		 */
		if (!aio_clear_cancel_function(db->job))
			ddp_complete_one(db->job, 0);
	}

	if (db->ps)
		free_pageset(td, db->ps);
}

void
ddp_init_toep(struct toepcb *toep)
{

	TAILQ_INIT(&toep->ddp_aiojobq);
	TASK_INIT(&toep->ddp_requeue_task, 0, aio_ddp_requeue_task, toep);
	toep->ddp_active_id = -1;
	mtx_init(&toep->ddp_lock, "t4 ddp", NULL, MTX_DEF);
}

void
ddp_uninit_toep(struct toepcb *toep)
{

	mtx_destroy(&toep->ddp_lock);
}

void
release_ddp_resources(struct toepcb *toep)
{
	struct pageset *ps;
	int i;

	DDP_LOCK(toep);
	toep->ddp_flags |= DDP_DEAD;
	for (i = 0; i < nitems(toep->db); i++) {
		free_ddp_buffer(toep->td, &toep->db[i]);
	}
	while ((ps = TAILQ_FIRST(&toep->ddp_cached_pagesets)) != NULL) {
		TAILQ_REMOVE(&toep->ddp_cached_pagesets, ps, link);
		free_pageset(toep->td, ps);
	}
	ddp_complete_all(toep, 0);
	DDP_UNLOCK(toep);
}

#ifdef INVARIANTS
void
ddp_assert_empty(struct toepcb *toep)
{
	int i;

	MPASS(!(toep->ddp_flags & DDP_TASK_ACTIVE));
	for (i = 0; i < nitems(toep->db); i++) {
		MPASS(toep->db[i].job == NULL);
		MPASS(toep->db[i].ps == NULL);
	}
	MPASS(TAILQ_EMPTY(&toep->ddp_cached_pagesets));
	MPASS(TAILQ_EMPTY(&toep->ddp_aiojobq));
}
#endif
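/*
 * Release a DDP buffer slot after its AIO job has been dealt with: update
 * the active buffer accounting, recycle the buffer's pageset, and clear
 * the buffer's ACTIVE flag in ddp_flags.
 */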
static void
complete_ddp_buffer(struct toepcb *toep, struct ddp_buffer *db,
    unsigned int db_idx)
{
	unsigned int db_flag;

	toep->ddp_active_count--;
	if (toep->ddp_active_id == db_idx) {
		if (toep->ddp_active_count == 0) {
			KASSERT(toep->db[db_idx ^ 1].job == NULL,
			    ("%s: active_count mismatch", __func__));
			toep->ddp_active_id = -1;
		} else
			toep->ddp_active_id ^= 1;
#ifdef VERBOSE_TRACES
		CTR2(KTR_CXGBE, "%s: ddp_active_id = %d", __func__,
		    toep->ddp_active_id);
#endif
	} else {
		KASSERT(toep->ddp_active_count != 0 &&
		    toep->ddp_active_id != -1,
		    ("%s: active count mismatch", __func__));
	}

	db->cancel_pending = 0;
	db->job = NULL;
	recycle_pageset(toep, db->ps);
	db->ps = NULL;

	db_flag = db_idx == 1 ? DDP_BUF1_ACTIVE : DDP_BUF0_ACTIVE;
	KASSERT(toep->ddp_flags & db_flag,
	    ("%s: DDP buffer not active. toep %p, ddp_flags 0x%x",
	    __func__, toep, toep->ddp_flags));
	toep->ddp_flags &= ~db_flag;
}

/* XXX: handle_ddp_data code duplication */
void
insert_ddp_data(struct toepcb *toep, uint32_t n)
{
	struct inpcb *inp = toep->inp;
	struct tcpcb *tp = intotcpcb(inp);
	struct ddp_buffer *db;
	struct kaiocb *job;
	size_t placed;
	long copied;
	unsigned int db_flag, db_idx;

	INP_WLOCK_ASSERT(inp);
	DDP_ASSERT_LOCKED(toep);

	tp->rcv_nxt += n;
#ifndef USE_DDP_RX_FLOW_CONTROL
	KASSERT(tp->rcv_wnd >= n, ("%s: negative window size", __func__));
	tp->rcv_wnd -= n;
#endif
#ifndef USE_DDP_RX_FLOW_CONTROL
	toep->rx_credits += n;
#endif
	CTR2(KTR_CXGBE, "%s: placed %u bytes before falling out of DDP",
	    __func__, n);
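	/*
	 * Example (illustrative numbers): if 6000 bytes were placed,
	 * buffer 0 is the active buffer and its job has room for 4000
	 * more bytes, the first 4000 bytes complete that job below and
	 * the remaining 2000 are attributed to buffer 1 on the next
	 * iteration.  Once the active buffers have been drained, the
	 * entire placed length must be accounted for (the MPASS at the
	 * end of the loop).
	 */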
	while (toep->ddp_active_count > 0) {
		MPASS(toep->ddp_active_id != -1);
		db_idx = toep->ddp_active_id;
		db_flag = db_idx == 1 ? DDP_BUF1_ACTIVE : DDP_BUF0_ACTIVE;
		MPASS((toep->ddp_flags & db_flag) != 0);
		db = &toep->db[db_idx];
		job = db->job;
		copied = job->aio_received;
		placed = n;
		if (placed > job->uaiocb.aio_nbytes - copied)
			placed = job->uaiocb.aio_nbytes - copied;
		if (placed > 0)
			job->msgrcv = 1;
		if (!aio_clear_cancel_function(job)) {
			/*
			 * Update the copied length for when
			 * t4_aio_cancel_active() completes this
			 * request.
			 */
			job->aio_received += placed;
		} else if (copied + placed != 0) {
			CTR4(KTR_CXGBE,
			    "%s: completing %p (copied %ld, placed %lu)",
			    __func__, job, copied, placed);
			/* XXX: This always completes if there is some data. */
			aio_complete(job, copied + placed, 0);
		} else if (aio_set_cancel_function(job, t4_aio_cancel_queued)) {
			TAILQ_INSERT_HEAD(&toep->ddp_aiojobq, job, list);
			toep->ddp_waiting_count++;
		} else
			aio_cancel(job);
		n -= placed;
		complete_ddp_buffer(toep, db, db_idx);
	}

	MPASS(n == 0);
}

/* SET_TCB_FIELD sent as a ULP command looks like this */
#define LEN__SET_TCB_FIELD_ULP (sizeof(struct ulp_txpkt) + \
    sizeof(struct ulptx_idata) + sizeof(struct cpl_set_tcb_field_core))

/* RX_DATA_ACK sent as a ULP command looks like this */
#define LEN__RX_DATA_ACK_ULP (sizeof(struct ulp_txpkt) + \
    sizeof(struct ulptx_idata) + sizeof(struct cpl_rx_data_ack_core))

static inline void *
mk_set_tcb_field_ulp(struct ulp_txpkt *ulpmc, struct toepcb *toep,
    uint64_t word, uint64_t mask, uint64_t val)
{
	struct ulptx_idata *ulpsc;
	struct cpl_set_tcb_field_core *req;

	ulpmc->cmd_dest = htonl(V_ULPTX_CMD(ULP_TX_PKT) | V_ULP_TXPKT_DEST(0));
	ulpmc->len = htobe32(howmany(LEN__SET_TCB_FIELD_ULP, 16));

	ulpsc = (struct ulptx_idata *)(ulpmc + 1);
	ulpsc->cmd_more = htobe32(V_ULPTX_CMD(ULP_TX_SC_IMM));
	ulpsc->len = htobe32(sizeof(*req));

	req = (struct cpl_set_tcb_field_core *)(ulpsc + 1);
	OPCODE_TID(req) = htobe32(MK_OPCODE_TID(CPL_SET_TCB_FIELD, toep->tid));
	req->reply_ctrl = htobe16(V_NO_REPLY(1) |
	    V_QUEUENO(toep->ofld_rxq->iq.abs_id));
	req->word_cookie = htobe16(V_WORD(word) | V_COOKIE(0));
	req->mask = htobe64(mask);
	req->val = htobe64(val);

	ulpsc = (struct ulptx_idata *)(req + 1);
	if (LEN__SET_TCB_FIELD_ULP % 16) {
		ulpsc->cmd_more = htobe32(V_ULPTX_CMD(ULP_TX_SC_NOOP));
		ulpsc->len = htobe32(0);
		return (ulpsc + 1);
	}
	return (ulpsc);
}

static inline void *
mk_rx_data_ack_ulp(struct ulp_txpkt *ulpmc, struct toepcb *toep)
{
	struct ulptx_idata *ulpsc;
	struct cpl_rx_data_ack_core *req;

	ulpmc->cmd_dest = htonl(V_ULPTX_CMD(ULP_TX_PKT) | V_ULP_TXPKT_DEST(0));
	ulpmc->len = htobe32(howmany(LEN__RX_DATA_ACK_ULP, 16));

	ulpsc = (struct ulptx_idata *)(ulpmc + 1);
	ulpsc->cmd_more = htobe32(V_ULPTX_CMD(ULP_TX_SC_IMM));
	ulpsc->len = htobe32(sizeof(*req));

	req = (struct cpl_rx_data_ack_core *)(ulpsc + 1);
	OPCODE_TID(req) = htobe32(MK_OPCODE_TID(CPL_RX_DATA_ACK, toep->tid));
	req->credit_dack = htobe32(F_RX_MODULATE_RX);

	ulpsc = (struct ulptx_idata *)(req + 1);
	if (LEN__RX_DATA_ACK_ULP % 16) {
		ulpsc->cmd_more = htobe32(V_ULPTX_CMD(ULP_TX_SC_NOOP));
		ulpsc->len = htobe32(0);
		return (ulpsc + 1);
	}
	return (ulpsc);
}

static struct wrqe *
mk_update_tcb_for_ddp(struct adapter *sc, struct toepcb *toep, int db_idx,
    struct pageset *ps, int offset, uint64_t ddp_flags, uint64_t ddp_flags_mask)
{
	struct wrqe *wr;
	struct work_request_hdr *wrh;
	struct ulp_txpkt *ulpmc;
	int len;

	KASSERT(db_idx == 0 || db_idx == 1,
	    ("%s: bad DDP buffer index %d", __func__, db_idx));

	/*
	 * We'll send a compound work request that has 3 SET_TCB_FIELDs and an
	 * RX_DATA_ACK (with RX_MODULATE to speed up delivery).
	 *
	 * The work request header is 16B and always ends at a 16B boundary.
	 * The ULPTX master commands that follow must all end at 16B boundaries
	 * too so we round up the size to 16.
	 */
	len = sizeof(*wrh) + 3 * roundup2(LEN__SET_TCB_FIELD_ULP, 16) +
	    roundup2(LEN__RX_DATA_ACK_ULP, 16);

	wr = alloc_wrqe(len, toep->ctrlq);
	if (wr == NULL)
		return (NULL);
	wrh = wrtod(wr);
	INIT_ULPTX_WRH(wrh, len, 1, 0);	/* atomic */
	ulpmc = (struct ulp_txpkt *)(wrh + 1);

	/* Write the buffer's tag */
	ulpmc = mk_set_tcb_field_ulp(ulpmc, toep,
	    W_TCB_RX_DDP_BUF0_TAG + db_idx,
	    V_TCB_RX_DDP_BUF0_TAG(M_TCB_RX_DDP_BUF0_TAG),
	    V_TCB_RX_DDP_BUF0_TAG(ps->prsv.prsv_tag));

	/* Update the current offset in the DDP buffer and its total length */
	if (db_idx == 0)
		ulpmc = mk_set_tcb_field_ulp(ulpmc, toep,
		    W_TCB_RX_DDP_BUF0_OFFSET,
		    V_TCB_RX_DDP_BUF0_OFFSET(M_TCB_RX_DDP_BUF0_OFFSET) |
		    V_TCB_RX_DDP_BUF0_LEN(M_TCB_RX_DDP_BUF0_LEN),
		    V_TCB_RX_DDP_BUF0_OFFSET(offset) |
		    V_TCB_RX_DDP_BUF0_LEN(ps->len));
	else
		ulpmc = mk_set_tcb_field_ulp(ulpmc, toep,
		    W_TCB_RX_DDP_BUF1_OFFSET,
		    V_TCB_RX_DDP_BUF1_OFFSET(M_TCB_RX_DDP_BUF1_OFFSET) |
		    V_TCB_RX_DDP_BUF1_LEN((u64)M_TCB_RX_DDP_BUF1_LEN << 32),
		    V_TCB_RX_DDP_BUF1_OFFSET(offset) |
		    V_TCB_RX_DDP_BUF1_LEN((u64)ps->len << 32));

	/* Update DDP flags */
	ulpmc = mk_set_tcb_field_ulp(ulpmc, toep, W_TCB_RX_DDP_FLAGS,
	    ddp_flags_mask, ddp_flags);

	/* Gratuitous RX_DATA_ACK with RX_MODULATE set to speed up delivery. */
	ulpmc = mk_rx_data_ack_ulp(ulpmc, toep);

	return (wr);
}
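/*
 * Process a DDP completion report from the hardware (CPL_RX_DATA_DDP or
 * CPL_RX_DDP_COMPLETE): account for the newly placed data, complete the
 * AIO job that owns the buffer (unless a cancel is already pending), and
 * release the DDP buffer slot.
 */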
static int
handle_ddp_data(struct toepcb *toep, __be32 ddp_report, __be32 rcv_nxt, int len)
{
	uint32_t report = be32toh(ddp_report);
	unsigned int db_idx;
	struct inpcb *inp = toep->inp;
	struct ddp_buffer *db;
	struct tcpcb *tp;
	struct socket *so;
	struct sockbuf *sb;
	struct kaiocb *job;
	long copied;

	db_idx = report & F_DDP_BUF_IDX ? 1 : 0;

	if (__predict_false(!(report & F_DDP_INV)))
		CXGBE_UNIMPLEMENTED("DDP buffer still valid");

	INP_WLOCK(inp);
	so = inp_inpcbtosocket(inp);
	sb = &so->so_rcv;
	DDP_LOCK(toep);

	KASSERT(toep->ddp_active_id == db_idx,
	    ("completed DDP buffer (%d) != active_id (%d) for tid %d", db_idx,
	    toep->ddp_active_id, toep->tid));
	db = &toep->db[db_idx];
	job = db->job;

	if (__predict_false(inp->inp_flags & (INP_DROPPED | INP_TIMEWAIT))) {
		/*
		 * This can happen due to an administrative tcpdrop(8).
		 * Just fail the request with ECONNRESET.
		 */
		CTR5(KTR_CXGBE, "%s: tid %u, seq 0x%x, len %d, inp_flags 0x%x",
		    __func__, toep->tid, be32toh(rcv_nxt), len, inp->inp_flags);
		if (aio_clear_cancel_function(job))
			ddp_complete_one(job, ECONNRESET);
		goto completed;
	}

	tp = intotcpcb(inp);

	/*
	 * For RX_DDP_COMPLETE, len will be zero and rcv_nxt is the
	 * sequence number of the next byte to receive.  The length of
	 * the data received for this message must be computed by
	 * comparing the new and old values of rcv_nxt.
	 *
	 * For RX_DATA_DDP, len might be non-zero, but it is only the
	 * length of the most recent DMA.  It does not include the
	 * total length of the data received since the previous update
	 * for this DDP buffer.  rcv_nxt is the sequence number of the
	 * first received byte from the most recent DMA.
	 */
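	/*
	 * Worked example (illustrative numbers): if tp->rcv_nxt was 1000,
	 * an RX_DDP_COMPLETE with rcv_nxt 5096 and len 0 accounts for 4096
	 * new bytes, while an RX_DATA_DDP with seq 4072 and len 1024
	 * accounts for (4072 - 1000) + 1024 = 4096 bytes as well.
	 */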
	len += be32toh(rcv_nxt) - tp->rcv_nxt;
	tp->rcv_nxt += len;
	tp->t_rcvtime = ticks;
#ifndef USE_DDP_RX_FLOW_CONTROL
	KASSERT(tp->rcv_wnd >= len, ("%s: negative window size", __func__));
	tp->rcv_wnd -= len;
#endif
#ifdef VERBOSE_TRACES
	CTR4(KTR_CXGBE, "%s: DDP[%d] placed %d bytes (%#x)", __func__, db_idx,
	    len, report);
#endif

	/* receive buffer autosize */
	MPASS(toep->vnet == so->so_vnet);
	CURVNET_SET(toep->vnet);
	SOCKBUF_LOCK(sb);
	if (sb->sb_flags & SB_AUTOSIZE &&
	    V_tcp_do_autorcvbuf &&
	    sb->sb_hiwat < V_tcp_autorcvbuf_max &&
	    len > (sbspace(sb) / 8 * 7)) {
		unsigned int hiwat = sb->sb_hiwat;
		unsigned int newsize = min(hiwat + V_tcp_autorcvbuf_inc,
		    V_tcp_autorcvbuf_max);

		if (!sbreserve_locked(sb, newsize, so, NULL))
			sb->sb_flags &= ~SB_AUTOSIZE;
		else
			toep->rx_credits += newsize - hiwat;
	}
	SOCKBUF_UNLOCK(sb);
	CURVNET_RESTORE();

#ifndef USE_DDP_RX_FLOW_CONTROL
	toep->rx_credits += len;
#endif

	job->msgrcv = 1;
	if (db->cancel_pending) {
		/*
		 * Update the job's length but defer completion to the
		 * TCB_RPL callback.
		 */
		job->aio_received += len;
		goto out;
	} else if (!aio_clear_cancel_function(job)) {
		/*
		 * Update the copied length for when
		 * t4_aio_cancel_active() completes this request.
		 */
		job->aio_received += len;
	} else {
		copied = job->aio_received;
#ifdef VERBOSE_TRACES
		CTR4(KTR_CXGBE, "%s: completing %p (copied %ld, placed %d)",
		    __func__, job, copied, len);
#endif
		aio_complete(job, copied + len, 0);
		t4_rcvd(&toep->td->tod, tp);
	}

completed:
	complete_ddp_buffer(toep, db, db_idx);
	if (toep->ddp_waiting_count > 0)
		ddp_queue_toep(toep);
out:
	DDP_UNLOCK(toep);
	INP_WUNLOCK(inp);

	return (0);
}

void
handle_ddp_indicate(struct toepcb *toep)
{

	DDP_ASSERT_LOCKED(toep);
	MPASS(toep->ddp_active_count == 0);
	MPASS((toep->ddp_flags & (DDP_BUF0_ACTIVE | DDP_BUF1_ACTIVE)) == 0);
	if (toep->ddp_waiting_count == 0) {
		/*
		 * The pending requests that triggered the request for an
		 * indicate were cancelled.  Those cancels should have
		 * already disabled DDP.  Just ignore this as the data is
		 * going into the socket buffer anyway.
		 */
		return;
	}
	CTR3(KTR_CXGBE, "%s: tid %d indicated (%d waiting)", __func__,
	    toep->tid, toep->ddp_waiting_count);
	ddp_queue_toep(toep);
}

enum {
	DDP_BUF0_INVALIDATED = 0x2,
	DDP_BUF1_INVALIDATED
};
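/*
 * Handle the CPL_SET_TCB_RPL that acknowledges invalidation of a DDP
 * buffer (requested when cancelling the active AIO job, see
 * t4_aio_cancel_active()).  The reply's cookie identifies which buffer
 * was invalidated; the job that was pending cancellation is then either
 * cancelled or completed as a short read.
 */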
void
handle_ddp_tcb_rpl(struct toepcb *toep, const struct cpl_set_tcb_rpl *cpl)
{
	unsigned int db_idx;
	struct inpcb *inp = toep->inp;
	struct ddp_buffer *db;
	struct kaiocb *job;
	long copied;

	if (cpl->status != CPL_ERR_NONE)
		panic("XXX: tcp_rpl failed: %d", cpl->status);

	switch (cpl->cookie) {
	case V_WORD(W_TCB_RX_DDP_FLAGS) | V_COOKIE(DDP_BUF0_INVALIDATED):
	case V_WORD(W_TCB_RX_DDP_FLAGS) | V_COOKIE(DDP_BUF1_INVALIDATED):
		/*
		 * XXX: This duplicates a lot of code with handle_ddp_data().
		 */
		db_idx = G_COOKIE(cpl->cookie) - DDP_BUF0_INVALIDATED;
		INP_WLOCK(inp);
		DDP_LOCK(toep);
		db = &toep->db[db_idx];

		/*
		 * handle_ddp_data() should leave the job around until
		 * this callback runs once a cancel is pending.
		 */
		MPASS(db != NULL);
		MPASS(db->job != NULL);
		MPASS(db->cancel_pending);

		/*
		 * XXX: It's not clear what happens if there is data
		 * placed when the buffer is invalidated.  I suspect we
		 * need to read the TCB to see how much data was placed.
		 *
		 * For now this just pretends like nothing was placed.
		 *
		 * XXX: Note that if we did check the PCB we would need to
		 * also take care of updating the tp, etc.
		 */
		job = db->job;
		copied = job->aio_received;
		if (copied == 0) {
			CTR2(KTR_CXGBE, "%s: cancelling %p", __func__, job);
			aio_cancel(job);
		} else {
			CTR3(KTR_CXGBE, "%s: completing %p (copied %ld)",
			    __func__, job, copied);
			aio_complete(job, copied, 0);
			t4_rcvd(&toep->td->tod, intotcpcb(inp));
		}

		complete_ddp_buffer(toep, db, db_idx);
		if (toep->ddp_waiting_count > 0)
			ddp_queue_toep(toep);
		DDP_UNLOCK(toep);
		INP_WUNLOCK(inp);
		break;
	default:
		panic("XXX: unknown tcb_rpl offset %#x, cookie %#x",
		    G_WORD(cpl->cookie), G_COOKIE(cpl->cookie));
	}
}

void
handle_ddp_close(struct toepcb *toep, struct tcpcb *tp, __be32 rcv_nxt)
{
	struct ddp_buffer *db;
	struct kaiocb *job;
	long copied;
	unsigned int db_flag, db_idx;
	int len, placed;

	INP_WLOCK_ASSERT(toep->inp);
	DDP_ASSERT_LOCKED(toep);
	len = be32toh(rcv_nxt) - tp->rcv_nxt;

	tp->rcv_nxt += len;
#ifndef USE_DDP_RX_FLOW_CONTROL
	toep->rx_credits += len;
#endif

	while (toep->ddp_active_count > 0) {
		MPASS(toep->ddp_active_id != -1);
		db_idx = toep->ddp_active_id;
		db_flag = db_idx == 1 ? DDP_BUF1_ACTIVE : DDP_BUF0_ACTIVE;
		MPASS((toep->ddp_flags & db_flag) != 0);
		db = &toep->db[db_idx];
		job = db->job;
		copied = job->aio_received;
		placed = len;
		if (placed > job->uaiocb.aio_nbytes - copied)
			placed = job->uaiocb.aio_nbytes - copied;
		if (placed > 0)
			job->msgrcv = 1;
		if (!aio_clear_cancel_function(job)) {
			/*
			 * Update the copied length for when
			 * t4_aio_cancel_active() completes this
			 * request.
			 */
			job->aio_received += placed;
		} else {
			CTR4(KTR_CXGBE, "%s: tid %d completed buf %d len %d",
			    __func__, toep->tid, db_idx, placed);
			aio_complete(job, copied + placed, 0);
		}
		len -= placed;
		complete_ddp_buffer(toep, db, db_idx);
	}

	MPASS(len == 0);
	ddp_complete_all(toep, 0);
}

#define DDP_ERR (F_DDP_PPOD_MISMATCH | F_DDP_LLIMIT_ERR | F_DDP_ULIMIT_ERR |\
	 F_DDP_PPOD_PARITY_ERR | F_DDP_PADDING_ERR | F_DDP_OFFSET_ERR |\
	 F_DDP_INVALID_TAG | F_DDP_COLOR_ERR | F_DDP_TID_MISMATCH |\
	 F_DDP_INVALID_PPOD | F_DDP_HDRCRC_ERR | F_DDP_DATACRC_ERR)

extern cpl_handler_t t4_cpl_handler[];

static int
do_rx_data_ddp(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
{
	struct adapter *sc = iq->adapter;
	const struct cpl_rx_data_ddp *cpl = (const void *)(rss + 1);
	unsigned int tid = GET_TID(cpl);
	uint32_t vld;
	struct toepcb *toep = lookup_tid(sc, tid);

	KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__));
	KASSERT(toep->tid == tid, ("%s: toep tid/atid mismatch", __func__));
	KASSERT(!(toep->flags & TPF_SYNQE),
	    ("%s: toep %p claims to be a synq entry", __func__, toep));

	vld = be32toh(cpl->ddpvld);
	if (__predict_false(vld & DDP_ERR)) {
		panic("%s: DDP error 0x%x (tid %d, toep %p)",
		    __func__, vld, tid, toep);
	}

	if (toep->ulp_mode == ULP_MODE_ISCSI) {
		t4_cpl_handler[CPL_RX_ISCSI_DDP](iq, rss, m);
		return (0);
	}

	handle_ddp_data(toep, cpl->u.ddp_report, cpl->seq, be16toh(cpl->len));

	return (0);
}

static int
do_rx_ddp_complete(struct sge_iq *iq, const struct rss_header *rss,
    struct mbuf *m)
{
	struct adapter *sc = iq->adapter;
	const struct cpl_rx_ddp_complete *cpl = (const void *)(rss + 1);
	unsigned int tid = GET_TID(cpl);
	struct toepcb *toep = lookup_tid(sc, tid);

	KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__));
	KASSERT(toep->tid == tid, ("%s: toep tid/atid mismatch", __func__));
	KASSERT(!(toep->flags & TPF_SYNQE),
	    ("%s: toep %p claims to be a synq entry", __func__, toep));

	handle_ddp_data(toep, cpl->ddp_report, cpl->rcv_nxt, 0);

	return (0);
}

static void
enable_ddp(struct adapter *sc, struct toepcb *toep)
{

	KASSERT((toep->ddp_flags & (DDP_ON | DDP_OK | DDP_SC_REQ)) == DDP_OK,
	    ("%s: toep %p has bad ddp_flags 0x%x",
	    __func__, toep, toep->ddp_flags));

	CTR3(KTR_CXGBE, "%s: tid %u (time %u)",
	    __func__, toep->tid, time_uptime);

	DDP_ASSERT_LOCKED(toep);
	toep->ddp_flags |= DDP_SC_REQ;
	t4_set_tcb_field(sc, toep->ctrlq, toep->tid, W_TCB_RX_DDP_FLAGS,
	    V_TF_DDP_OFF(1) | V_TF_DDP_INDICATE_OUT(1) |
	    V_TF_DDP_BUF0_INDICATE(1) | V_TF_DDP_BUF1_INDICATE(1) |
	    V_TF_DDP_BUF0_VALID(1) | V_TF_DDP_BUF1_VALID(1),
	    V_TF_DDP_BUF0_INDICATE(1) | V_TF_DDP_BUF1_INDICATE(1), 0, 0,
	    toep->ofld_rxq->iq.abs_id);
	t4_set_tcb_field(sc, toep->ctrlq, toep->tid, W_TCB_T_FLAGS,
	    V_TF_RCV_COALESCE_ENABLE(1), 0, 0, 0, toep->ofld_rxq->iq.abs_id);
}

static int
calculate_hcf(int n1, int n2)
{
	int a, b, t;

	if (n1 <= n2) {
		a = n1;
		b = n2;
	} else {
		a = n2;
		b = n1;
	}

	while (a != 0) {
		t = a;
		a = b % a;
		b = t;
	}

	return (b);
}

static inline int
pages_to_nppods(int npages, int ddp_page_shift)
{

	MPASS(ddp_page_shift >= PAGE_SHIFT);

	return (howmany(npages >> (ddp_page_shift - PAGE_SHIFT), PPOD_PAGES));
}
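/*
 * For example (assuming 4KB VM pages and PPOD_PAGES == 4): a buffer
 * backed by 32 VM pages needs howmany(32, 4) = 8 page pods when the DDP
 * page size is also 4KB, but only howmany(2, 4) = 1 page pod when
 * contiguity allows a 64KB DDP page size.
 */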
static int
alloc_page_pods(struct ppod_region *pr, u_int nppods, u_int pgsz_idx,
    struct ppod_reservation *prsv)
{
	vmem_addr_t addr;	/* relative to start of region */

	if (vmem_alloc(pr->pr_arena, PPOD_SZ(nppods), M_NOWAIT | M_FIRSTFIT,
	    &addr) != 0)
		return (ENOMEM);

	CTR5(KTR_CXGBE, "%-17s arena %p, addr 0x%08x, nppods %d, pgsz %d",
	    __func__, pr->pr_arena, (uint32_t)addr & pr->pr_tag_mask,
	    nppods, 1 << pr->pr_page_shift[pgsz_idx]);

	/*
	 * The hardware tagmask includes an extra invalid bit but the arena was
	 * seeded with valid values only.  An allocation out of this arena will
	 * fit inside the tagmask but won't have the invalid bit set.
	 */
	MPASS((addr & pr->pr_tag_mask) == addr);
	MPASS((addr & pr->pr_invalid_bit) == 0);

	prsv->prsv_pr = pr;
	prsv->prsv_tag = V_PPOD_PGSZ(pgsz_idx) | addr;
	prsv->prsv_nppods = nppods;

	return (0);
}

int
t4_alloc_page_pods_for_ps(struct ppod_region *pr, struct pageset *ps)
{
	int i, hcf, seglen, idx, nppods;
	struct ppod_reservation *prsv = &ps->prsv;

	KASSERT(prsv->prsv_nppods == 0,
	    ("%s: page pods already allocated", __func__));

	/*
	 * The DDP page size is unrelated to the VM page size.  We combine
	 * contiguous physical pages into larger segments to get the best DDP
	 * page size possible.  This is the largest of the four sizes in
	 * A_ULP_RX_TDDP_PSZ that evenly divides the HCF of the segment sizes in
	 * the page list.
	 */
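	/*
	 * Example (hypothetical page shifts of 12, 16, 20 and 24, i.e. 4KB,
	 * 64KB, 1MB and 16MB): if the physically contiguous runs in the
	 * pageset are 192KB, 128KB and 64KB long, the HCF is 64KB and idx 1
	 * (64KB DDP pages) is selected below; if any run is only 4KB long
	 * the HCF collapses below 64KB and the scan short-circuits to idx 0.
	 */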
	hcf = 0;
	for (i = 0; i < ps->npages; i++) {
		seglen = PAGE_SIZE;
		while (i < ps->npages - 1 &&
		    ps->pages[i]->phys_addr + PAGE_SIZE ==
		    ps->pages[i + 1]->phys_addr) {
			seglen += PAGE_SIZE;
			i++;
		}

		hcf = calculate_hcf(hcf, seglen);
		if (hcf < (1 << pr->pr_page_shift[1])) {
			idx = 0;
			goto have_pgsz;	/* give up, short circuit */
		}
	}

#define PR_PAGE_MASK(x) ((1 << pr->pr_page_shift[(x)]) - 1)
	MPASS((hcf & PR_PAGE_MASK(0)) == 0); /* PAGE_SIZE is >= 4K everywhere */
	for (idx = nitems(pr->pr_page_shift) - 1; idx > 0; idx--) {
		if ((hcf & PR_PAGE_MASK(idx)) == 0)
			break;
	}
#undef PR_PAGE_MASK

have_pgsz:
	MPASS(idx <= M_PPOD_PGSZ);

	nppods = pages_to_nppods(ps->npages, pr->pr_page_shift[idx]);
	if (alloc_page_pods(pr, nppods, idx, prsv) != 0)
		return (0);
	MPASS(prsv->prsv_nppods > 0);

	return (1);
}
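/*
 * Like t4_alloc_page_pods_for_ps(), but for a KVA-contiguous buffer
 * described by (buf, len) instead of a pageset.  The physical layout is
 * discovered with pmap_kextract() one page at a time.
 */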
int
t4_alloc_page_pods_for_buf(struct ppod_region *pr, vm_offset_t buf, int len,
    struct ppod_reservation *prsv)
{
	int hcf, seglen, idx, npages, nppods;
	uintptr_t start_pva, end_pva, pva, p1;

	MPASS(buf > 0);
	MPASS(len > 0);

	/*
	 * The DDP page size is unrelated to the VM page size.  We combine
	 * contiguous physical pages into larger segments to get the best DDP
	 * page size possible.  This is the largest of the four sizes in
	 * A_ULP_RX_ISCSI_PSZ that evenly divides the HCF of the segment sizes
	 * in the page list.
	 */
	hcf = 0;
	start_pva = trunc_page(buf);
	end_pva = trunc_page(buf + len - 1);
	pva = start_pva;
	while (pva <= end_pva) {
		seglen = PAGE_SIZE;
		p1 = pmap_kextract(pva);
		pva += PAGE_SIZE;
		while (pva <= end_pva && p1 + seglen == pmap_kextract(pva)) {
			seglen += PAGE_SIZE;
			pva += PAGE_SIZE;
		}

		hcf = calculate_hcf(hcf, seglen);
		if (hcf < (1 << pr->pr_page_shift[1])) {
			idx = 0;
			goto have_pgsz;	/* give up, short circuit */
		}
	}

#define PR_PAGE_MASK(x) ((1 << pr->pr_page_shift[(x)]) - 1)
	MPASS((hcf & PR_PAGE_MASK(0)) == 0); /* PAGE_SIZE is >= 4K everywhere */
	for (idx = nitems(pr->pr_page_shift) - 1; idx > 0; idx--) {
		if ((hcf & PR_PAGE_MASK(idx)) == 0)
			break;
	}
#undef PR_PAGE_MASK

have_pgsz:
	MPASS(idx <= M_PPOD_PGSZ);

	npages = 1;
	npages += (end_pva - start_pva) >> pr->pr_page_shift[idx];
	nppods = howmany(npages, PPOD_PAGES);
	if (alloc_page_pods(pr, nppods, idx, prsv) != 0)
		return (ENOMEM);
	MPASS(prsv->prsv_nppods > 0);

	return (0);
}

void
t4_free_page_pods(struct ppod_reservation *prsv)
{
	struct ppod_region *pr = prsv->prsv_pr;
	vmem_addr_t addr;

	MPASS(prsv != NULL);
	MPASS(prsv->prsv_nppods != 0);

	addr = prsv->prsv_tag & pr->pr_tag_mask;
	MPASS((addr & pr->pr_invalid_bit) == 0);

	CTR4(KTR_CXGBE, "%-17s arena %p, addr 0x%08x, nppods %d", __func__,
	    pr->pr_arena, addr, prsv->prsv_nppods);

	vmem_free(pr->pr_arena, addr, PPOD_SZ(prsv->prsv_nppods));
	prsv->prsv_nppods = 0;
}

#define NUM_ULP_TX_SC_IMM_PPODS (256 / PPOD_SIZE)
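/*
 * Page pods are written to card memory with ULP_TX_MEM_WRITE work
 * requests that carry the pods as immediate data, at most 256 bytes
 * (NUM_ULP_TX_SC_IMM_PPODS pods) per request; with the 64-byte pods used
 * here that is 4 pods per request, so a large reservation is written in
 * several requests.
 */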
int
t4_write_page_pods_for_ps(struct adapter *sc, struct sge_wrq *wrq, int tid,
    struct pageset *ps)
{
	struct wrqe *wr;
	struct ulp_mem_io *ulpmc;
	struct ulptx_idata *ulpsc;
	struct pagepod *ppod;
	int i, j, k, n, chunk, len, ddp_pgsz, idx;
	u_int ppod_addr;
	uint32_t cmd;
	struct ppod_reservation *prsv = &ps->prsv;
	struct ppod_region *pr = prsv->prsv_pr;

	KASSERT(!(ps->flags & PS_PPODS_WRITTEN),
	    ("%s: page pods already written", __func__));
	MPASS(prsv->prsv_nppods > 0);

	cmd = htobe32(V_ULPTX_CMD(ULP_TX_MEM_WRITE));
	if (is_t4(sc))
		cmd |= htobe32(F_ULP_MEMIO_ORDER);
	else
		cmd |= htobe32(F_T5_ULP_MEMIO_IMM);
	ddp_pgsz = 1 << pr->pr_page_shift[G_PPOD_PGSZ(prsv->prsv_tag)];
	ppod_addr = pr->pr_start + (prsv->prsv_tag & pr->pr_tag_mask);
	for (i = 0; i < prsv->prsv_nppods; ppod_addr += chunk) {

		/* How many page pods are we writing in this cycle */
		n = min(prsv->prsv_nppods - i, NUM_ULP_TX_SC_IMM_PPODS);
		chunk = PPOD_SZ(n);
		len = roundup2(sizeof(*ulpmc) + sizeof(*ulpsc) + chunk, 16);

		wr = alloc_wrqe(len, wrq);
		if (wr == NULL)
			return (ENOMEM);	/* ok to just bail out */
		ulpmc = wrtod(wr);

		INIT_ULPTX_WR(ulpmc, len, 0, 0);
		ulpmc->cmd = cmd;
		ulpmc->dlen = htobe32(V_ULP_MEMIO_DATA_LEN(chunk / 32));
		ulpmc->len16 = htobe32(howmany(len - sizeof(ulpmc->wr), 16));
		ulpmc->lock_addr = htobe32(V_ULP_MEMIO_ADDR(ppod_addr >> 5));

		ulpsc = (struct ulptx_idata *)(ulpmc + 1);
		ulpsc->cmd_more = htobe32(V_ULPTX_CMD(ULP_TX_SC_IMM));
		ulpsc->len = htobe32(chunk);
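		/*
		 * Each pod describes PPOD_PAGES DDP pages of the buffer.
		 * A DDP page may span several VM pages, so consecutive
		 * addr[] entries step through ps->pages[] in strides of
		 * ddp_pgsz / PAGE_SIZE, and the trailing addr[] entry of
		 * each pod overlaps with the first page of the next pod.
		 */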
1088e682d02eSNavdeep Parhar 1089e682d02eSNavdeep Parhar } 1090e682d02eSNavdeep Parhar 1091e682d02eSNavdeep Parhar t4_wrq_tx(sc, wr); 1092e682d02eSNavdeep Parhar } 1093dc964385SJohn Baldwin ps->flags |= PS_PPODS_WRITTEN; 1094e682d02eSNavdeep Parhar 1095e682d02eSNavdeep Parhar return (0); 1096e682d02eSNavdeep Parhar } 1097e682d02eSNavdeep Parhar 1098a9feb2cdSNavdeep Parhar int 1099a9feb2cdSNavdeep Parhar t4_write_page_pods_for_buf(struct adapter *sc, struct sge_wrq *wrq, int tid, 1100a9feb2cdSNavdeep Parhar struct ppod_reservation *prsv, vm_offset_t buf, int buflen) 1101a9feb2cdSNavdeep Parhar { 1102a9feb2cdSNavdeep Parhar struct wrqe *wr; 1103a9feb2cdSNavdeep Parhar struct ulp_mem_io *ulpmc; 1104a9feb2cdSNavdeep Parhar struct ulptx_idata *ulpsc; 1105a9feb2cdSNavdeep Parhar struct pagepod *ppod; 1106a9feb2cdSNavdeep Parhar int i, j, k, n, chunk, len, ddp_pgsz; 1107a9feb2cdSNavdeep Parhar u_int ppod_addr, offset; 1108a9feb2cdSNavdeep Parhar uint32_t cmd; 1109a9feb2cdSNavdeep Parhar struct ppod_region *pr = prsv->prsv_pr; 1110a9feb2cdSNavdeep Parhar uintptr_t end_pva, pva, pa; 1111a9feb2cdSNavdeep Parhar 1112a9feb2cdSNavdeep Parhar cmd = htobe32(V_ULPTX_CMD(ULP_TX_MEM_WRITE)); 1113a9feb2cdSNavdeep Parhar if (is_t4(sc)) 1114a9feb2cdSNavdeep Parhar cmd |= htobe32(F_ULP_MEMIO_ORDER); 1115a9feb2cdSNavdeep Parhar else 1116a9feb2cdSNavdeep Parhar cmd |= htobe32(F_T5_ULP_MEMIO_IMM); 1117a9feb2cdSNavdeep Parhar ddp_pgsz = 1 << pr->pr_page_shift[G_PPOD_PGSZ(prsv->prsv_tag)]; 1118a9feb2cdSNavdeep Parhar offset = buf & PAGE_MASK; 1119a9feb2cdSNavdeep Parhar ppod_addr = pr->pr_start + (prsv->prsv_tag & pr->pr_tag_mask); 1120a9feb2cdSNavdeep Parhar pva = trunc_page(buf); 1121a9feb2cdSNavdeep Parhar end_pva = trunc_page(buf + buflen - 1); 1122a9feb2cdSNavdeep Parhar for (i = 0; i < prsv->prsv_nppods; ppod_addr += chunk) { 1123a9feb2cdSNavdeep Parhar 1124a9feb2cdSNavdeep Parhar /* How many page pods are we writing in this cycle */ 1125a9feb2cdSNavdeep Parhar n = min(prsv->prsv_nppods - i, NUM_ULP_TX_SC_IMM_PPODS); 1126a9feb2cdSNavdeep Parhar MPASS(n > 0); 1127a9feb2cdSNavdeep Parhar chunk = PPOD_SZ(n); 1128a9feb2cdSNavdeep Parhar len = roundup2(sizeof(*ulpmc) + sizeof(*ulpsc) + chunk, 16); 1129a9feb2cdSNavdeep Parhar 1130a9feb2cdSNavdeep Parhar wr = alloc_wrqe(len, wrq); 1131a9feb2cdSNavdeep Parhar if (wr == NULL) 1132a9feb2cdSNavdeep Parhar return (ENOMEM); /* ok to just bail out */ 1133a9feb2cdSNavdeep Parhar ulpmc = wrtod(wr); 1134a9feb2cdSNavdeep Parhar 1135a9feb2cdSNavdeep Parhar INIT_ULPTX_WR(ulpmc, len, 0, 0); 1136a9feb2cdSNavdeep Parhar ulpmc->cmd = cmd; 1137a9feb2cdSNavdeep Parhar ulpmc->dlen = htobe32(V_ULP_MEMIO_DATA_LEN(chunk / 32)); 1138a9feb2cdSNavdeep Parhar ulpmc->len16 = htobe32(howmany(len - sizeof(ulpmc->wr), 16)); 1139a9feb2cdSNavdeep Parhar ulpmc->lock_addr = htobe32(V_ULP_MEMIO_ADDR(ppod_addr >> 5)); 1140a9feb2cdSNavdeep Parhar 1141a9feb2cdSNavdeep Parhar ulpsc = (struct ulptx_idata *)(ulpmc + 1); 1142a9feb2cdSNavdeep Parhar ulpsc->cmd_more = htobe32(V_ULPTX_CMD(ULP_TX_SC_IMM)); 1143a9feb2cdSNavdeep Parhar ulpsc->len = htobe32(chunk); 1144a9feb2cdSNavdeep Parhar 1145a9feb2cdSNavdeep Parhar ppod = (struct pagepod *)(ulpsc + 1); 1146a9feb2cdSNavdeep Parhar for (j = 0; j < n; i++, j++, ppod++) { 1147a9feb2cdSNavdeep Parhar ppod->vld_tid_pgsz_tag_color = htobe64(F_PPOD_VALID | 1148a9feb2cdSNavdeep Parhar V_PPOD_TID(tid) | 1149a9feb2cdSNavdeep Parhar (prsv->prsv_tag & ~V_PPOD_PGSZ(M_PPOD_PGSZ))); 1150a9feb2cdSNavdeep Parhar ppod->len_offset = htobe64(V_PPOD_LEN(buflen) | 1151a9feb2cdSNavdeep 
Parhar V_PPOD_OFST(offset)); 1152a9feb2cdSNavdeep Parhar ppod->rsvd = 0; 1153a9feb2cdSNavdeep Parhar 1154a9feb2cdSNavdeep Parhar for (k = 0; k < nitems(ppod->addr); k++) { 1155a9feb2cdSNavdeep Parhar if (pva > end_pva) 1156a9feb2cdSNavdeep Parhar ppod->addr[k] = 0; 1157a9feb2cdSNavdeep Parhar else { 1158a9feb2cdSNavdeep Parhar pa = pmap_kextract(pva); 1159a9feb2cdSNavdeep Parhar ppod->addr[k] = htobe64(pa); 1160a9feb2cdSNavdeep Parhar pva += ddp_pgsz; 1161a9feb2cdSNavdeep Parhar } 1162a9feb2cdSNavdeep Parhar #if 0 1163a9feb2cdSNavdeep Parhar CTR5(KTR_CXGBE, 1164a9feb2cdSNavdeep Parhar "%s: tid %d ppod[%d]->addr[%d] = %p", 1165a9feb2cdSNavdeep Parhar __func__, tid, i, k, 1166a9feb2cdSNavdeep Parhar htobe64(ppod->addr[k])); 1167a9feb2cdSNavdeep Parhar #endif 1168a9feb2cdSNavdeep Parhar } 1169a9feb2cdSNavdeep Parhar 1170a9feb2cdSNavdeep Parhar /* 1171a9feb2cdSNavdeep Parhar * Walk back 1 segment so that the first address in the 1172a9feb2cdSNavdeep Parhar * next pod is the same as the last one in the current 1173a9feb2cdSNavdeep Parhar * pod. 1174a9feb2cdSNavdeep Parhar */ 1175a9feb2cdSNavdeep Parhar pva -= ddp_pgsz; 1176a9feb2cdSNavdeep Parhar } 1177a9feb2cdSNavdeep Parhar 1178a9feb2cdSNavdeep Parhar t4_wrq_tx(sc, wr); 1179a9feb2cdSNavdeep Parhar } 1180a9feb2cdSNavdeep Parhar 1181a9feb2cdSNavdeep Parhar MPASS(pva <= end_pva); 1182a9feb2cdSNavdeep Parhar 1183a9feb2cdSNavdeep Parhar return (0); 1184a9feb2cdSNavdeep Parhar } 1185a9feb2cdSNavdeep Parhar 1186e682d02eSNavdeep Parhar static void 1187dc964385SJohn Baldwin wire_pageset(struct pageset *ps) 1188e682d02eSNavdeep Parhar { 1189e682d02eSNavdeep Parhar vm_page_t p; 1190dc964385SJohn Baldwin int i; 1191e682d02eSNavdeep Parhar 1192dc964385SJohn Baldwin KASSERT(!(ps->flags & PS_WIRED), ("pageset already wired")); 1193dc964385SJohn Baldwin 1194dc964385SJohn Baldwin for (i = 0; i < ps->npages; i++) { 1195dc964385SJohn Baldwin p = ps->pages[i]; 1196e682d02eSNavdeep Parhar vm_page_lock(p); 1197e682d02eSNavdeep Parhar vm_page_wire(p); 1198e682d02eSNavdeep Parhar vm_page_unhold(p); 1199e682d02eSNavdeep Parhar vm_page_unlock(p); 1200e682d02eSNavdeep Parhar } 1201dc964385SJohn Baldwin ps->flags |= PS_WIRED; 1202e682d02eSNavdeep Parhar } 1203e682d02eSNavdeep Parhar 1204dc964385SJohn Baldwin /* 1205dc964385SJohn Baldwin * Prepare a pageset for DDP. This wires the pageset and sets up page 1206dc964385SJohn Baldwin * pods. 
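 * Each step is skipped if it has already been done for this pageset
 * (PS_WIRED, an existing page pod reservation, PS_PPODS_WRITTEN), so a
 * reused cached pageset pays these costs only once.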
1207dc964385SJohn Baldwin */ 1208e682d02eSNavdeep Parhar static int 1209dc964385SJohn Baldwin prep_pageset(struct adapter *sc, struct toepcb *toep, struct pageset *ps) 1210e682d02eSNavdeep Parhar { 1211dc964385SJohn Baldwin struct tom_data *td = sc->tom_softc; 1212e682d02eSNavdeep Parhar 1213dc964385SJohn Baldwin if (!(ps->flags & PS_WIRED)) 1214dc964385SJohn Baldwin wire_pageset(ps); 1215968267fdSNavdeep Parhar if (ps->prsv.prsv_nppods == 0 && 1216968267fdSNavdeep Parhar !t4_alloc_page_pods_for_ps(&td->pr, ps)) { 1217e682d02eSNavdeep Parhar return (0); 1218e682d02eSNavdeep Parhar } 1219dc964385SJohn Baldwin if (!(ps->flags & PS_PPODS_WRITTEN) && 1220968267fdSNavdeep Parhar t4_write_page_pods_for_ps(sc, toep->ctrlq, toep->tid, ps) != 0) { 1221dc964385SJohn Baldwin return (0); 1222dc964385SJohn Baldwin } 1223dc964385SJohn Baldwin 1224dc964385SJohn Baldwin return (1); 1225dc964385SJohn Baldwin } 1226e682d02eSNavdeep Parhar 1227968267fdSNavdeep Parhar int 1228968267fdSNavdeep Parhar t4_init_ppod_region(struct ppod_region *pr, struct t4_range *r, u_int psz, 1229968267fdSNavdeep Parhar const char *name) 1230e682d02eSNavdeep Parhar { 1231515b36c5SNavdeep Parhar int i; 1232515b36c5SNavdeep Parhar 1233968267fdSNavdeep Parhar MPASS(pr != NULL); 1234968267fdSNavdeep Parhar MPASS(r->size > 0); 1235515b36c5SNavdeep Parhar 1236968267fdSNavdeep Parhar pr->pr_start = r->start; 1237968267fdSNavdeep Parhar pr->pr_len = r->size; 1238968267fdSNavdeep Parhar pr->pr_page_shift[0] = 12 + G_HPZ0(psz); 1239968267fdSNavdeep Parhar pr->pr_page_shift[1] = 12 + G_HPZ1(psz); 1240968267fdSNavdeep Parhar pr->pr_page_shift[2] = 12 + G_HPZ2(psz); 1241968267fdSNavdeep Parhar pr->pr_page_shift[3] = 12 + G_HPZ3(psz); 1242968267fdSNavdeep Parhar 1243968267fdSNavdeep Parhar /* The SGL -> page pod algorithm requires the sizes to be in order. 
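 * The shifts must be strictly increasing; a region whose HPZ page shifts
 * are not strictly increasing is rejected with ENXIO below.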
*/ 1244968267fdSNavdeep Parhar for (i = 1; i < nitems(pr->pr_page_shift); i++) { 1245968267fdSNavdeep Parhar if (pr->pr_page_shift[i] <= pr->pr_page_shift[i - 1]) 1246968267fdSNavdeep Parhar return (ENXIO); 1247515b36c5SNavdeep Parhar } 1248e682d02eSNavdeep Parhar 1249968267fdSNavdeep Parhar pr->pr_tag_mask = ((1 << fls(r->size)) - 1) & V_PPOD_TAG(M_PPOD_TAG); 1250968267fdSNavdeep Parhar pr->pr_alias_mask = V_PPOD_TAG(M_PPOD_TAG) & ~pr->pr_tag_mask; 1251968267fdSNavdeep Parhar if (pr->pr_tag_mask == 0 || pr->pr_alias_mask == 0) 1252968267fdSNavdeep Parhar return (ENXIO); 1253968267fdSNavdeep Parhar pr->pr_alias_shift = fls(pr->pr_tag_mask); 1254968267fdSNavdeep Parhar pr->pr_invalid_bit = 1 << (pr->pr_alias_shift - 1); 1255968267fdSNavdeep Parhar 1256968267fdSNavdeep Parhar pr->pr_arena = vmem_create(name, 0, pr->pr_len, PPOD_SIZE, 0, 1257968267fdSNavdeep Parhar M_FIRSTFIT | M_NOWAIT); 1258968267fdSNavdeep Parhar if (pr->pr_arena == NULL) 1259968267fdSNavdeep Parhar return (ENOMEM); 1260968267fdSNavdeep Parhar 1261968267fdSNavdeep Parhar return (0); 1262e682d02eSNavdeep Parhar } 1263e682d02eSNavdeep Parhar 1264e682d02eSNavdeep Parhar void 1265968267fdSNavdeep Parhar t4_free_ppod_region(struct ppod_region *pr) 1266e682d02eSNavdeep Parhar { 1267e682d02eSNavdeep Parhar 1268968267fdSNavdeep Parhar MPASS(pr != NULL); 1269968267fdSNavdeep Parhar 1270968267fdSNavdeep Parhar if (pr->pr_arena) 1271968267fdSNavdeep Parhar vmem_destroy(pr->pr_arena); 1272968267fdSNavdeep Parhar bzero(pr, sizeof(*pr)); 1273e682d02eSNavdeep Parhar } 1274e682d02eSNavdeep Parhar 1275e682d02eSNavdeep Parhar static int 1276dc964385SJohn Baldwin pscmp(struct pageset *ps, struct vmspace *vm, vm_offset_t start, int npages, 1277dc964385SJohn Baldwin int pgoff, int len) 1278e682d02eSNavdeep Parhar { 1279e682d02eSNavdeep Parhar 1280dc964385SJohn Baldwin if (ps->npages != npages || ps->offset != pgoff || ps->len != len) 1281dc964385SJohn Baldwin return (1); 1282dc964385SJohn Baldwin 1283dc964385SJohn Baldwin return (ps->vm != vm || ps->vm_timestamp != vm->vm_map.timestamp); 1284e682d02eSNavdeep Parhar } 1285e682d02eSNavdeep Parhar 1286dc964385SJohn Baldwin static int 1287dc964385SJohn Baldwin hold_aio(struct toepcb *toep, struct kaiocb *job, struct pageset **pps) 1288688dba74SNavdeep Parhar { 1289dc964385SJohn Baldwin struct vmspace *vm; 1290dc964385SJohn Baldwin vm_map_t map; 1291dc964385SJohn Baldwin vm_offset_t start, end, pgoff; 1292dc964385SJohn Baldwin struct pageset *ps; 1293dc964385SJohn Baldwin int n; 1294688dba74SNavdeep Parhar 1295dc964385SJohn Baldwin DDP_ASSERT_LOCKED(toep); 1296688dba74SNavdeep Parhar 1297dc964385SJohn Baldwin /* 1298dc964385SJohn Baldwin * The AIO subsystem will cancel and drain all requests before 1299dc964385SJohn Baldwin * permitting a process to exit or exec, so p_vmspace should 1300dc964385SJohn Baldwin * be stable here. 1301dc964385SJohn Baldwin */ 1302dc964385SJohn Baldwin vm = job->userproc->p_vmspace; 1303dc964385SJohn Baldwin map = &vm->vm_map; 1304dc964385SJohn Baldwin start = (uintptr_t)job->uaiocb.aio_buf; 1305dc964385SJohn Baldwin pgoff = start & PAGE_MASK; 1306dc964385SJohn Baldwin end = round_page(start + job->uaiocb.aio_nbytes); 1307dc964385SJohn Baldwin start = trunc_page(start); 1308dc964385SJohn Baldwin 1309dc964385SJohn Baldwin if (end - start > MAX_DDP_BUFFER_SIZE) { 1310dc964385SJohn Baldwin /* 1311dc964385SJohn Baldwin * Truncate the request to a short read. 
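		 * A single hardware DDP buffer cannot cover more than
		 * MAX_DDP_BUFFER_SIZE bytes, so only the leading portion of
		 * the buffer is handled and the job completes short.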
		 * Alternatively, we could DDP in chunks to the larger
		 * buffer, but that would be quite a bit more work.
		 *
		 * When truncating, round the request down to avoid
		 * crossing a cache line on the final transaction.
		 */
		end = rounddown2(start + MAX_DDP_BUFFER_SIZE, CACHE_LINE_SIZE);
#ifdef VERBOSE_TRACES
		CTR4(KTR_CXGBE, "%s: tid %d, truncating size from %lu to %lu",
		    __func__, toep->tid, (unsigned long)job->uaiocb.aio_nbytes,
		    (unsigned long)(end - (start + pgoff)));
#endif
		job->uaiocb.aio_nbytes = end - (start + pgoff);
		end = round_page(end);
	}

	n = atop(end - start);

	/*
	 * Try to reuse a cached pageset.
	 */
	TAILQ_FOREACH(ps, &toep->ddp_cached_pagesets, link) {
		if (pscmp(ps, vm, start, n, pgoff,
		    job->uaiocb.aio_nbytes) == 0) {
			TAILQ_REMOVE(&toep->ddp_cached_pagesets, ps, link);
			toep->ddp_cached_count--;
			*pps = ps;
			return (0);
		}
	}

	/*
	 * If there are too many wired pagesets to allow creating a new one,
	 * free a cached pageset first.
	 */
	KASSERT(toep->ddp_active_count + toep->ddp_cached_count <=
	    nitems(toep->db), ("%s: too many wired pagesets", __func__));
	if (toep->ddp_active_count + toep->ddp_cached_count ==
	    nitems(toep->db)) {
		KASSERT(toep->ddp_cached_count > 0,
		    ("no cached pageset to free"));
		ps = TAILQ_LAST(&toep->ddp_cached_pagesets, pagesetq);
		TAILQ_REMOVE(&toep->ddp_cached_pagesets, ps, link);
		toep->ddp_cached_count--;
		free_pageset(toep->td, ps);
	}
	DDP_UNLOCK(toep);

	/* Create a new pageset.
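	 * The backing pages are only held (via vm_fault_quick_hold_pages())
	 * at this point; they are wired later, in prep_pageset(), once the
	 * request is actually scheduled for DDP.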
*/ 1361dc964385SJohn Baldwin ps = malloc(sizeof(*ps) + n * sizeof(vm_page_t), M_CXGBE, M_WAITOK | 1362dc964385SJohn Baldwin M_ZERO); 1363dc964385SJohn Baldwin ps->pages = (vm_page_t *)(ps + 1); 1364dc964385SJohn Baldwin ps->vm_timestamp = map->timestamp; 1365dc964385SJohn Baldwin ps->npages = vm_fault_quick_hold_pages(map, start, end - start, 1366dc964385SJohn Baldwin VM_PROT_WRITE, ps->pages, n); 1367e682d02eSNavdeep Parhar 1368dc964385SJohn Baldwin DDP_LOCK(toep); 1369dc964385SJohn Baldwin if (ps->npages < 0) { 1370dc964385SJohn Baldwin free(ps, M_CXGBE); 1371dc964385SJohn Baldwin return (EFAULT); 1372e682d02eSNavdeep Parhar } 1373e682d02eSNavdeep Parhar 1374dc964385SJohn Baldwin KASSERT(ps->npages == n, ("hold_aio: page count mismatch: %d vs %d", 1375dc964385SJohn Baldwin ps->npages, n)); 1376dc964385SJohn Baldwin 1377dc964385SJohn Baldwin ps->offset = pgoff; 1378dc964385SJohn Baldwin ps->len = job->uaiocb.aio_nbytes; 1379dc964385SJohn Baldwin atomic_add_int(&vm->vm_refcnt, 1); 1380dc964385SJohn Baldwin ps->vm = vm; 1381dc964385SJohn Baldwin 1382dc964385SJohn Baldwin CTR5(KTR_CXGBE, "%s: tid %d, new pageset %p for job %p, npages %d", 1383dc964385SJohn Baldwin __func__, toep->tid, ps, job, ps->npages); 1384dc964385SJohn Baldwin *pps = ps; 1385e682d02eSNavdeep Parhar return (0); 1386e682d02eSNavdeep Parhar } 1387e682d02eSNavdeep Parhar 1388dc964385SJohn Baldwin static void 1389dc964385SJohn Baldwin ddp_complete_all(struct toepcb *toep, int error) 1390e682d02eSNavdeep Parhar { 1391dc964385SJohn Baldwin struct kaiocb *job; 1392e682d02eSNavdeep Parhar 1393dc964385SJohn Baldwin DDP_ASSERT_LOCKED(toep); 1394dc964385SJohn Baldwin while (!TAILQ_EMPTY(&toep->ddp_aiojobq)) { 1395dc964385SJohn Baldwin job = TAILQ_FIRST(&toep->ddp_aiojobq); 1396dc964385SJohn Baldwin TAILQ_REMOVE(&toep->ddp_aiojobq, job, list); 1397dc964385SJohn Baldwin toep->ddp_waiting_count--; 1398dc964385SJohn Baldwin if (aio_clear_cancel_function(job)) 1399dc964385SJohn Baldwin ddp_complete_one(job, error); 1400dc964385SJohn Baldwin } 1401dc964385SJohn Baldwin } 1402dc964385SJohn Baldwin 1403dc964385SJohn Baldwin static void 1404dc964385SJohn Baldwin aio_ddp_cancel_one(struct kaiocb *job) 1405dc964385SJohn Baldwin { 1406dc964385SJohn Baldwin long copied; 1407dc964385SJohn Baldwin 1408dc964385SJohn Baldwin /* 1409dc964385SJohn Baldwin * If this job had copied data out of the socket buffer before 1410dc964385SJohn Baldwin * it was cancelled, report it as a short read rather than an 1411dc964385SJohn Baldwin * error. 1412dc964385SJohn Baldwin */ 1413fe0bdd1dSJohn Baldwin copied = job->aio_received; 1414dc964385SJohn Baldwin if (copied != 0) 1415dc964385SJohn Baldwin aio_complete(job, copied, 0); 1416e682d02eSNavdeep Parhar else 1417dc964385SJohn Baldwin aio_cancel(job); 1418e682d02eSNavdeep Parhar } 1419e682d02eSNavdeep Parhar 1420dc964385SJohn Baldwin /* 1421dc964385SJohn Baldwin * Called when the main loop wants to requeue a job to retry it later. 1422dc964385SJohn Baldwin * Deals with the race of the job being cancelled while it was being 1423dc964385SJohn Baldwin * examined. 
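 * If the job's cancel function cannot be reinstalled (the job was cancelled
 * in the meantime, or the connection is dead), the job is completed here
 * instead: short if it had already received data, cancelled otherwise.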
1424dc964385SJohn Baldwin */ 1425dc964385SJohn Baldwin static void 1426dc964385SJohn Baldwin aio_ddp_requeue_one(struct toepcb *toep, struct kaiocb *job) 1427dc964385SJohn Baldwin { 1428dc964385SJohn Baldwin 1429dc964385SJohn Baldwin DDP_ASSERT_LOCKED(toep); 1430dc964385SJohn Baldwin if (!(toep->ddp_flags & DDP_DEAD) && 1431dc964385SJohn Baldwin aio_set_cancel_function(job, t4_aio_cancel_queued)) { 1432dc964385SJohn Baldwin TAILQ_INSERT_HEAD(&toep->ddp_aiojobq, job, list); 1433dc964385SJohn Baldwin toep->ddp_waiting_count++; 1434dc964385SJohn Baldwin } else 1435dc964385SJohn Baldwin aio_ddp_cancel_one(job); 1436e682d02eSNavdeep Parhar } 1437e682d02eSNavdeep Parhar 1438dc964385SJohn Baldwin static void 1439dc964385SJohn Baldwin aio_ddp_requeue(struct toepcb *toep) 1440dc964385SJohn Baldwin { 1441dc964385SJohn Baldwin struct adapter *sc = td_adapter(toep->td); 1442dc964385SJohn Baldwin struct socket *so; 1443dc964385SJohn Baldwin struct sockbuf *sb; 1444dc964385SJohn Baldwin struct inpcb *inp; 1445dc964385SJohn Baldwin struct kaiocb *job; 1446dc964385SJohn Baldwin struct ddp_buffer *db; 1447dc964385SJohn Baldwin size_t copied, offset, resid; 1448dc964385SJohn Baldwin struct pageset *ps; 1449dc964385SJohn Baldwin struct mbuf *m; 1450dc964385SJohn Baldwin uint64_t ddp_flags, ddp_flags_mask; 1451dc964385SJohn Baldwin struct wrqe *wr; 1452dc964385SJohn Baldwin int buf_flag, db_idx, error; 1453dc964385SJohn Baldwin 1454dc964385SJohn Baldwin DDP_ASSERT_LOCKED(toep); 1455dc964385SJohn Baldwin 1456e682d02eSNavdeep Parhar restart: 1457dc964385SJohn Baldwin if (toep->ddp_flags & DDP_DEAD) { 1458dc964385SJohn Baldwin MPASS(toep->ddp_waiting_count == 0); 1459dc964385SJohn Baldwin MPASS(toep->ddp_active_count == 0); 1460dc964385SJohn Baldwin return; 1461e682d02eSNavdeep Parhar } 1462e682d02eSNavdeep Parhar 1463dc964385SJohn Baldwin if (toep->ddp_waiting_count == 0 || 1464dc964385SJohn Baldwin toep->ddp_active_count == nitems(toep->db)) { 1465dc964385SJohn Baldwin return; 1466dc964385SJohn Baldwin } 1467dc964385SJohn Baldwin 1468dc964385SJohn Baldwin job = TAILQ_FIRST(&toep->ddp_aiojobq); 1469dc964385SJohn Baldwin so = job->fd_file->f_data; 1470dc964385SJohn Baldwin sb = &so->so_rcv; 1471dc964385SJohn Baldwin SOCKBUF_LOCK(sb); 1472dc964385SJohn Baldwin 1473dc964385SJohn Baldwin /* We will never get anything unless we are or were connected. */ 1474dc964385SJohn Baldwin if (!(so->so_state & (SS_ISCONNECTED|SS_ISDISCONNECTED))) { 1475dc964385SJohn Baldwin SOCKBUF_UNLOCK(sb); 1476dc964385SJohn Baldwin ddp_complete_all(toep, ENOTCONN); 1477dc964385SJohn Baldwin return; 1478dc964385SJohn Baldwin } 1479dc964385SJohn Baldwin 1480dc964385SJohn Baldwin KASSERT(toep->ddp_active_count == 0 || sbavail(sb) == 0, 1481dc964385SJohn Baldwin ("%s: pending sockbuf data and DDP is active", __func__)); 1482dc964385SJohn Baldwin 1483e682d02eSNavdeep Parhar /* Abort if socket has reported problems. */ 1484dc964385SJohn Baldwin /* XXX: Wait for any queued DDP's to finish and/or flush them? 
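 * For now, a socket error with no pending data completes the job right
 * away: as a short read if it had already received data, with the error
 * otherwise.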
*/ 1485dc964385SJohn Baldwin if (so->so_error && sbavail(sb) == 0) { 1486dc964385SJohn Baldwin toep->ddp_waiting_count--; 1487dc964385SJohn Baldwin TAILQ_REMOVE(&toep->ddp_aiojobq, job, list); 1488dc964385SJohn Baldwin if (!aio_clear_cancel_function(job)) { 1489dc964385SJohn Baldwin SOCKBUF_UNLOCK(sb); 1490dc964385SJohn Baldwin goto restart; 1491dc964385SJohn Baldwin } 1492dc964385SJohn Baldwin 1493dc964385SJohn Baldwin /* 1494dc964385SJohn Baldwin * If this job has previously copied some data, report 1495dc964385SJohn Baldwin * a short read and leave the error to be reported by 1496dc964385SJohn Baldwin * a future request. 1497dc964385SJohn Baldwin */ 1498fe0bdd1dSJohn Baldwin copied = job->aio_received; 1499dc964385SJohn Baldwin if (copied != 0) { 1500dc964385SJohn Baldwin SOCKBUF_UNLOCK(sb); 1501dc964385SJohn Baldwin aio_complete(job, copied, 0); 1502dc964385SJohn Baldwin goto restart; 1503dc964385SJohn Baldwin } 1504e682d02eSNavdeep Parhar error = so->so_error; 1505e682d02eSNavdeep Parhar so->so_error = 0; 1506dc964385SJohn Baldwin SOCKBUF_UNLOCK(sb); 1507dc964385SJohn Baldwin aio_complete(job, -1, error); 1508dc964385SJohn Baldwin goto restart; 1509e682d02eSNavdeep Parhar } 1510e682d02eSNavdeep Parhar 1511e682d02eSNavdeep Parhar /* 1512dc964385SJohn Baldwin * Door is closed. If there is pending data in the socket buffer, 1513dc964385SJohn Baldwin * deliver it. If there are pending DDP requests, wait for those 1514dc964385SJohn Baldwin * to complete. Once they have completed, return EOF reads. 1515e682d02eSNavdeep Parhar */ 1516dc964385SJohn Baldwin if (sb->sb_state & SBS_CANTRCVMORE && sbavail(sb) == 0) { 1517dc964385SJohn Baldwin SOCKBUF_UNLOCK(sb); 1518dc964385SJohn Baldwin if (toep->ddp_active_count != 0) 1519dc964385SJohn Baldwin return; 1520dc964385SJohn Baldwin ddp_complete_all(toep, 0); 1521dc964385SJohn Baldwin return; 1522e682d02eSNavdeep Parhar } 1523dc964385SJohn Baldwin 1524dc964385SJohn Baldwin /* 1525dc964385SJohn Baldwin * If DDP is not enabled and there is no pending socket buffer 1526dc964385SJohn Baldwin * data, try to enable DDP. 1527dc964385SJohn Baldwin */ 1528dc964385SJohn Baldwin if (sbavail(sb) == 0 && (toep->ddp_flags & DDP_ON) == 0) { 1529dc964385SJohn Baldwin SOCKBUF_UNLOCK(sb); 1530dc964385SJohn Baldwin 1531dc964385SJohn Baldwin /* 1532dc964385SJohn Baldwin * Wait for the card to ACK that DDP is enabled before 1533dc964385SJohn Baldwin * queueing any buffers. Currently this waits for an 1534dc964385SJohn Baldwin * indicate to arrive. This could use a TCB_SET_FIELD_RPL 1535dc964385SJohn Baldwin * message to know that DDP was enabled instead of waiting 1536dc964385SJohn Baldwin * for the indicate which would avoid copying the indicate 1537dc964385SJohn Baldwin * if no data is pending. 1538dc964385SJohn Baldwin * 1539dc964385SJohn Baldwin * XXX: Might want to limit the indicate size to the size 1540dc964385SJohn Baldwin * of the first queued request. 1541dc964385SJohn Baldwin */ 1542dc964385SJohn Baldwin if ((toep->ddp_flags & DDP_SC_REQ) == 0) 1543dc964385SJohn Baldwin enable_ddp(sc, toep); 1544dc964385SJohn Baldwin return; 1545e682d02eSNavdeep Parhar } 1546dc964385SJohn Baldwin SOCKBUF_UNLOCK(sb); 1547dc964385SJohn Baldwin 1548dc964385SJohn Baldwin /* 1549dc964385SJohn Baldwin * If another thread is queueing a buffer for DDP, let it 1550dc964385SJohn Baldwin * drain any work and return. 
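 * (ddp_queueing is set while a job is being prepared below; hold_aio()
 * drops the DDP lock during that window, so another thread can observe a
 * non-NULL value here.)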
1551dc964385SJohn Baldwin */ 1552dc964385SJohn Baldwin if (toep->ddp_queueing != NULL) 1553dc964385SJohn Baldwin return; 1554dc964385SJohn Baldwin 1555dc964385SJohn Baldwin /* Take the next job to prep it for DDP. */ 1556dc964385SJohn Baldwin toep->ddp_waiting_count--; 1557dc964385SJohn Baldwin TAILQ_REMOVE(&toep->ddp_aiojobq, job, list); 1558dc964385SJohn Baldwin if (!aio_clear_cancel_function(job)) 1559e682d02eSNavdeep Parhar goto restart; 1560dc964385SJohn Baldwin toep->ddp_queueing = job; 1561e682d02eSNavdeep Parhar 1562dc964385SJohn Baldwin /* NB: This drops DDP_LOCK while it holds the backing VM pages. */ 1563dc964385SJohn Baldwin error = hold_aio(toep, job, &ps); 1564dc964385SJohn Baldwin if (error != 0) { 1565dc964385SJohn Baldwin ddp_complete_one(job, error); 1566dc964385SJohn Baldwin toep->ddp_queueing = NULL; 1567e682d02eSNavdeep Parhar goto restart; 1568dc964385SJohn Baldwin } 1569e682d02eSNavdeep Parhar 1570dc964385SJohn Baldwin SOCKBUF_LOCK(sb); 1571dc964385SJohn Baldwin if (so->so_error && sbavail(sb) == 0) { 1572fe0bdd1dSJohn Baldwin copied = job->aio_received; 1573dc964385SJohn Baldwin if (copied != 0) { 1574dc964385SJohn Baldwin SOCKBUF_UNLOCK(sb); 1575dc964385SJohn Baldwin recycle_pageset(toep, ps); 1576dc964385SJohn Baldwin aio_complete(job, copied, 0); 1577dc964385SJohn Baldwin toep->ddp_queueing = NULL; 1578dc964385SJohn Baldwin goto restart; 1579dc964385SJohn Baldwin } 1580e682d02eSNavdeep Parhar 1581dc964385SJohn Baldwin error = so->so_error; 1582dc964385SJohn Baldwin so->so_error = 0; 1583dc964385SJohn Baldwin SOCKBUF_UNLOCK(sb); 1584dc964385SJohn Baldwin recycle_pageset(toep, ps); 1585dc964385SJohn Baldwin aio_complete(job, -1, error); 1586dc964385SJohn Baldwin toep->ddp_queueing = NULL; 1587dc964385SJohn Baldwin goto restart; 1588e682d02eSNavdeep Parhar } 1589e682d02eSNavdeep Parhar 1590dc964385SJohn Baldwin if (sb->sb_state & SBS_CANTRCVMORE && sbavail(sb) == 0) { 1591dc964385SJohn Baldwin SOCKBUF_UNLOCK(sb); 1592dc964385SJohn Baldwin recycle_pageset(toep, ps); 1593dc964385SJohn Baldwin if (toep->ddp_active_count != 0) { 1594dc964385SJohn Baldwin /* 1595dc964385SJohn Baldwin * The door is closed, but there are still pending 1596dc964385SJohn Baldwin * DDP buffers. Requeue. These jobs will all be 1597dc964385SJohn Baldwin * completed once those buffers drain. 1598dc964385SJohn Baldwin */ 1599dc964385SJohn Baldwin aio_ddp_requeue_one(toep, job); 1600dc964385SJohn Baldwin toep->ddp_queueing = NULL; 1601dc964385SJohn Baldwin return; 1602e682d02eSNavdeep Parhar } 1603dc964385SJohn Baldwin ddp_complete_one(job, 0); 1604dc964385SJohn Baldwin ddp_complete_all(toep, 0); 1605dc964385SJohn Baldwin toep->ddp_queueing = NULL; 1606dc964385SJohn Baldwin return; 1607e682d02eSNavdeep Parhar } 1608dc964385SJohn Baldwin 1609dc964385SJohn Baldwin sbcopy: 1610dc964385SJohn Baldwin /* 1611dc964385SJohn Baldwin * If the toep is dead, there shouldn't be any data in the socket 1612dc964385SJohn Baldwin * buffer, so the above case should have handled this. 1613dc964385SJohn Baldwin */ 1614dc964385SJohn Baldwin MPASS(!(toep->ddp_flags & DDP_DEAD)); 1615dc964385SJohn Baldwin 1616dc964385SJohn Baldwin /* 1617dc964385SJohn Baldwin * If there is pending data in the socket buffer (either 1618dc964385SJohn Baldwin * from before the requests were queued or a DDP indicate), 1619dc964385SJohn Baldwin * copy those mbufs out directly. 
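 * The copy below uses uiomove_fromphys() against the held pages, so the
 * user buffer does not need to be mapped into the kernel.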
1620dc964385SJohn Baldwin */ 1621dc964385SJohn Baldwin copied = 0; 1622fe0bdd1dSJohn Baldwin offset = ps->offset + job->aio_received; 1623fe0bdd1dSJohn Baldwin MPASS(job->aio_received <= job->uaiocb.aio_nbytes); 1624fe0bdd1dSJohn Baldwin resid = job->uaiocb.aio_nbytes - job->aio_received; 1625dc964385SJohn Baldwin m = sb->sb_mb; 1626dc964385SJohn Baldwin KASSERT(m == NULL || toep->ddp_active_count == 0, 1627dc964385SJohn Baldwin ("%s: sockbuf data with active DDP", __func__)); 1628dc964385SJohn Baldwin while (m != NULL && resid > 0) { 1629dc964385SJohn Baldwin struct iovec iov[1]; 1630dc964385SJohn Baldwin struct uio uio; 1631dc964385SJohn Baldwin int error; 1632dc964385SJohn Baldwin 1633dc964385SJohn Baldwin iov[0].iov_base = mtod(m, void *); 1634dc964385SJohn Baldwin iov[0].iov_len = m->m_len; 1635dc964385SJohn Baldwin if (iov[0].iov_len > resid) 1636dc964385SJohn Baldwin iov[0].iov_len = resid; 1637dc964385SJohn Baldwin uio.uio_iov = iov; 1638dc964385SJohn Baldwin uio.uio_iovcnt = 1; 1639dc964385SJohn Baldwin uio.uio_offset = 0; 1640dc964385SJohn Baldwin uio.uio_resid = iov[0].iov_len; 1641dc964385SJohn Baldwin uio.uio_segflg = UIO_SYSSPACE; 1642dc964385SJohn Baldwin uio.uio_rw = UIO_WRITE; 1643dc964385SJohn Baldwin error = uiomove_fromphys(ps->pages, offset + copied, 1644dc964385SJohn Baldwin uio.uio_resid, &uio); 1645dc964385SJohn Baldwin MPASS(error == 0 && uio.uio_resid == 0); 1646dc964385SJohn Baldwin copied += uio.uio_offset; 1647dc964385SJohn Baldwin resid -= uio.uio_offset; 1648dc964385SJohn Baldwin m = m->m_next; 1649dc964385SJohn Baldwin } 1650dc964385SJohn Baldwin if (copied != 0) { 1651dc964385SJohn Baldwin sbdrop_locked(sb, copied); 1652fe0bdd1dSJohn Baldwin job->aio_received += copied; 1653b1012d80SJohn Baldwin job->msgrcv = 1; 1654fe0bdd1dSJohn Baldwin copied = job->aio_received; 1655dc964385SJohn Baldwin inp = sotoinpcb(so); 1656dc964385SJohn Baldwin if (!INP_TRY_WLOCK(inp)) { 1657dc964385SJohn Baldwin /* 1658dc964385SJohn Baldwin * The reference on the socket file descriptor in 1659dc964385SJohn Baldwin * the AIO job should keep 'sb' and 'inp' stable. 1660dc964385SJohn Baldwin * Our caller has a reference on the 'toep' that 1661dc964385SJohn Baldwin * keeps it stable. 1662dc964385SJohn Baldwin */ 1663dc964385SJohn Baldwin SOCKBUF_UNLOCK(sb); 1664dc964385SJohn Baldwin DDP_UNLOCK(toep); 1665dc964385SJohn Baldwin INP_WLOCK(inp); 1666dc964385SJohn Baldwin DDP_LOCK(toep); 1667dc964385SJohn Baldwin SOCKBUF_LOCK(sb); 1668dc964385SJohn Baldwin 1669dc964385SJohn Baldwin /* 1670dc964385SJohn Baldwin * If the socket has been closed, we should detect 1671dc964385SJohn Baldwin * that and complete this request if needed on 1672dc964385SJohn Baldwin * the next trip around the loop. 1673dc964385SJohn Baldwin */ 1674dc964385SJohn Baldwin } 1675dc964385SJohn Baldwin t4_rcvd_locked(&toep->td->tod, intotcpcb(inp)); 1676dc964385SJohn Baldwin INP_WUNLOCK(inp); 1677dc964385SJohn Baldwin if (resid == 0 || toep->ddp_flags & DDP_DEAD) { 1678dc964385SJohn Baldwin /* 1679dc964385SJohn Baldwin * We filled the entire buffer with socket 1680dc964385SJohn Baldwin * data, DDP is not being used, or the socket 1681dc964385SJohn Baldwin * is being shut down, so complete the 1682dc964385SJohn Baldwin * request. 
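 * ('copied' was reloaded from job->aio_received above, so the completion
 * below reports the job's cumulative byte count, not just this pass.)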
1683dc964385SJohn Baldwin */ 1684dc964385SJohn Baldwin SOCKBUF_UNLOCK(sb); 1685dc964385SJohn Baldwin recycle_pageset(toep, ps); 1686dc964385SJohn Baldwin aio_complete(job, copied, 0); 1687dc964385SJohn Baldwin toep->ddp_queueing = NULL; 1688dc964385SJohn Baldwin goto restart; 1689dc964385SJohn Baldwin } 1690dc964385SJohn Baldwin 1691dc964385SJohn Baldwin /* 1692dc964385SJohn Baldwin * If DDP is not enabled, requeue this request and restart. 1693dc964385SJohn Baldwin * This will either enable DDP or wait for more data to 1694dc964385SJohn Baldwin * arrive on the socket buffer. 1695dc964385SJohn Baldwin */ 1696dc964385SJohn Baldwin if ((toep->ddp_flags & (DDP_ON | DDP_SC_REQ)) != DDP_ON) { 1697dc964385SJohn Baldwin SOCKBUF_UNLOCK(sb); 1698dc964385SJohn Baldwin recycle_pageset(toep, ps); 1699dc964385SJohn Baldwin aio_ddp_requeue_one(toep, job); 1700dc964385SJohn Baldwin toep->ddp_queueing = NULL; 1701dc964385SJohn Baldwin goto restart; 1702dc964385SJohn Baldwin } 1703dc964385SJohn Baldwin 1704dc964385SJohn Baldwin /* 1705dc964385SJohn Baldwin * An indicate might have arrived and been added to 1706dc964385SJohn Baldwin * the socket buffer while it was unlocked after the 1707dc964385SJohn Baldwin * copy to lock the INP. If so, restart the copy. 1708dc964385SJohn Baldwin */ 1709dc964385SJohn Baldwin if (sbavail(sb) != 0) 1710dc964385SJohn Baldwin goto sbcopy; 1711dc964385SJohn Baldwin } 1712dc964385SJohn Baldwin SOCKBUF_UNLOCK(sb); 1713dc964385SJohn Baldwin 1714dc964385SJohn Baldwin if (prep_pageset(sc, toep, ps) == 0) { 1715dc964385SJohn Baldwin recycle_pageset(toep, ps); 1716dc964385SJohn Baldwin aio_ddp_requeue_one(toep, job); 1717dc964385SJohn Baldwin toep->ddp_queueing = NULL; 1718dc964385SJohn Baldwin 1719dc964385SJohn Baldwin /* 1720dc964385SJohn Baldwin * XXX: Need to retry this later. Mostly need a trigger 1721dc964385SJohn Baldwin * when page pods are freed up. 1722dc964385SJohn Baldwin */ 1723dc964385SJohn Baldwin printf("%s: prep_pageset failed\n", __func__); 1724dc964385SJohn Baldwin return; 1725dc964385SJohn Baldwin } 1726dc964385SJohn Baldwin 1727dc964385SJohn Baldwin /* Determine which DDP buffer to use. 
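 * Each connection has two hardware DDP buffer slots (db[0] and db[1]);
 * pick whichever one does not have a job attached.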
*/ 1728dc964385SJohn Baldwin if (toep->db[0].job == NULL) { 1729dc964385SJohn Baldwin db_idx = 0; 1730e682d02eSNavdeep Parhar } else { 1731dc964385SJohn Baldwin MPASS(toep->db[1].job == NULL); 1732dc964385SJohn Baldwin db_idx = 1; 1733e682d02eSNavdeep Parhar } 1734e682d02eSNavdeep Parhar 1735dc964385SJohn Baldwin ddp_flags = 0; 1736dc964385SJohn Baldwin ddp_flags_mask = 0; 1737dc964385SJohn Baldwin if (db_idx == 0) { 1738dc964385SJohn Baldwin ddp_flags |= V_TF_DDP_BUF0_VALID(1); 1739dc964385SJohn Baldwin if (so->so_state & SS_NBIO) 1740dc964385SJohn Baldwin ddp_flags |= V_TF_DDP_BUF0_FLUSH(1); 1741dc964385SJohn Baldwin ddp_flags_mask |= V_TF_DDP_PSH_NO_INVALIDATE0(1) | 1742dc964385SJohn Baldwin V_TF_DDP_PUSH_DISABLE_0(1) | V_TF_DDP_PSHF_ENABLE_0(1) | 1743dc964385SJohn Baldwin V_TF_DDP_BUF0_FLUSH(1) | V_TF_DDP_BUF0_VALID(1); 1744dc964385SJohn Baldwin buf_flag = DDP_BUF0_ACTIVE; 1745dc964385SJohn Baldwin } else { 1746dc964385SJohn Baldwin ddp_flags |= V_TF_DDP_BUF1_VALID(1); 1747dc964385SJohn Baldwin if (so->so_state & SS_NBIO) 1748dc964385SJohn Baldwin ddp_flags |= V_TF_DDP_BUF1_FLUSH(1); 1749dc964385SJohn Baldwin ddp_flags_mask |= V_TF_DDP_PSH_NO_INVALIDATE1(1) | 1750dc964385SJohn Baldwin V_TF_DDP_PUSH_DISABLE_1(1) | V_TF_DDP_PSHF_ENABLE_1(1) | 1751dc964385SJohn Baldwin V_TF_DDP_BUF1_FLUSH(1) | V_TF_DDP_BUF1_VALID(1); 1752dc964385SJohn Baldwin buf_flag = DDP_BUF1_ACTIVE; 1753e682d02eSNavdeep Parhar } 1754dc964385SJohn Baldwin MPASS((toep->ddp_flags & buf_flag) == 0); 1755dc964385SJohn Baldwin if ((toep->ddp_flags & (DDP_BUF0_ACTIVE | DDP_BUF1_ACTIVE)) == 0) { 1756dc964385SJohn Baldwin MPASS(db_idx == 0); 1757dc964385SJohn Baldwin MPASS(toep->ddp_active_id == -1); 1758dc964385SJohn Baldwin MPASS(toep->ddp_active_count == 0); 1759dc964385SJohn Baldwin ddp_flags_mask |= V_TF_DDP_ACTIVE_BUF(1); 1760e682d02eSNavdeep Parhar } 1761e682d02eSNavdeep Parhar 1762e682d02eSNavdeep Parhar /* 1763dc964385SJohn Baldwin * The TID for this connection should still be valid. If DDP_DEAD 1764dc964385SJohn Baldwin * is set, SBS_CANTRCVMORE should be set, so we shouldn't be 1765dc964385SJohn Baldwin * this far anyway. Even if the socket is closing on the other 1766dc964385SJohn Baldwin * end, the AIO job holds a reference on this end of the socket 1767dc964385SJohn Baldwin * which will keep it open and keep the TCP PCB attached until 1768dc964385SJohn Baldwin * after the job is completed. 1769e682d02eSNavdeep Parhar */ 1770fe0bdd1dSJohn Baldwin wr = mk_update_tcb_for_ddp(sc, toep, db_idx, ps, job->aio_received, 1771fe0bdd1dSJohn Baldwin ddp_flags, ddp_flags_mask); 1772dc964385SJohn Baldwin if (wr == NULL) { 1773dc964385SJohn Baldwin recycle_pageset(toep, ps); 1774dc964385SJohn Baldwin aio_ddp_requeue_one(toep, job); 1775dc964385SJohn Baldwin toep->ddp_queueing = NULL; 1776dc964385SJohn Baldwin 1777dc964385SJohn Baldwin /* 1778dc964385SJohn Baldwin * XXX: Need a way to kick a retry here. 1779dc964385SJohn Baldwin * 1780dc964385SJohn Baldwin * XXX: We know the fixed size needed and could 1781dc964385SJohn Baldwin * preallocate this using a blocking request at the 1782dc964385SJohn Baldwin * start of the task to avoid having to handle this 1783dc964385SJohn Baldwin * edge case. 
1784dc964385SJohn Baldwin */ 1785dc964385SJohn Baldwin printf("%s: mk_update_tcb_for_ddp failed\n", __func__); 1786dc964385SJohn Baldwin return; 1787dc964385SJohn Baldwin } 1788dc964385SJohn Baldwin 1789dc964385SJohn Baldwin if (!aio_set_cancel_function(job, t4_aio_cancel_active)) { 1790dc964385SJohn Baldwin free_wrqe(wr); 1791dc964385SJohn Baldwin recycle_pageset(toep, ps); 1792dc964385SJohn Baldwin aio_ddp_cancel_one(job); 1793dc964385SJohn Baldwin toep->ddp_queueing = NULL; 1794e682d02eSNavdeep Parhar goto restart; 1795e682d02eSNavdeep Parhar } 1796e682d02eSNavdeep Parhar 1797dc964385SJohn Baldwin #ifdef VERBOSE_TRACES 1798dc964385SJohn Baldwin CTR5(KTR_CXGBE, "%s: scheduling %p for DDP[%d] (flags %#lx/%#lx)", 1799dc964385SJohn Baldwin __func__, job, db_idx, ddp_flags, ddp_flags_mask); 1800dc964385SJohn Baldwin #endif 1801dc964385SJohn Baldwin /* Give the chip the go-ahead. */ 1802dc964385SJohn Baldwin t4_wrq_tx(sc, wr); 1803dc964385SJohn Baldwin db = &toep->db[db_idx]; 1804dc964385SJohn Baldwin db->cancel_pending = 0; 1805dc964385SJohn Baldwin db->job = job; 1806dc964385SJohn Baldwin db->ps = ps; 1807dc964385SJohn Baldwin toep->ddp_queueing = NULL; 1808dc964385SJohn Baldwin toep->ddp_flags |= buf_flag; 1809dc964385SJohn Baldwin toep->ddp_active_count++; 1810dc964385SJohn Baldwin if (toep->ddp_active_count == 1) { 1811dc964385SJohn Baldwin MPASS(toep->ddp_active_id == -1); 1812dc964385SJohn Baldwin toep->ddp_active_id = db_idx; 1813dc964385SJohn Baldwin CTR2(KTR_CXGBE, "%s: ddp_active_id = %d", __func__, 1814dc964385SJohn Baldwin toep->ddp_active_id); 1815dc964385SJohn Baldwin } 1816dc964385SJohn Baldwin goto restart; 1817dc964385SJohn Baldwin } 1818dc964385SJohn Baldwin 1819dc964385SJohn Baldwin void 1820dc964385SJohn Baldwin ddp_queue_toep(struct toepcb *toep) 1821dc964385SJohn Baldwin { 1822dc964385SJohn Baldwin 1823dc964385SJohn Baldwin DDP_ASSERT_LOCKED(toep); 1824dc964385SJohn Baldwin if (toep->ddp_flags & DDP_TASK_ACTIVE) 1825dc964385SJohn Baldwin return; 1826dc964385SJohn Baldwin toep->ddp_flags |= DDP_TASK_ACTIVE; 1827dc964385SJohn Baldwin hold_toepcb(toep); 1828dc964385SJohn Baldwin soaio_enqueue(&toep->ddp_requeue_task); 1829dc964385SJohn Baldwin } 1830dc964385SJohn Baldwin 1831dc964385SJohn Baldwin static void 1832dc964385SJohn Baldwin aio_ddp_requeue_task(void *context, int pending) 1833dc964385SJohn Baldwin { 1834dc964385SJohn Baldwin struct toepcb *toep = context; 1835dc964385SJohn Baldwin 1836dc964385SJohn Baldwin DDP_LOCK(toep); 1837dc964385SJohn Baldwin aio_ddp_requeue(toep); 1838dc964385SJohn Baldwin toep->ddp_flags &= ~DDP_TASK_ACTIVE; 1839dc964385SJohn Baldwin DDP_UNLOCK(toep); 1840dc964385SJohn Baldwin 1841dc964385SJohn Baldwin free_toepcb(toep); 1842dc964385SJohn Baldwin } 1843dc964385SJohn Baldwin 1844dc964385SJohn Baldwin static void 1845dc964385SJohn Baldwin t4_aio_cancel_active(struct kaiocb *job) 1846dc964385SJohn Baldwin { 1847dc964385SJohn Baldwin struct socket *so = job->fd_file->f_data; 1848dc964385SJohn Baldwin struct tcpcb *tp = so_sototcpcb(so); 1849dc964385SJohn Baldwin struct toepcb *toep = tp->t_toe; 1850dc964385SJohn Baldwin struct adapter *sc = td_adapter(toep->td); 1851dc964385SJohn Baldwin uint64_t valid_flag; 1852dc964385SJohn Baldwin int i; 1853dc964385SJohn Baldwin 1854dc964385SJohn Baldwin DDP_LOCK(toep); 1855dc964385SJohn Baldwin if (aio_cancel_cleared(job)) { 1856dc964385SJohn Baldwin DDP_UNLOCK(toep); 1857dc964385SJohn Baldwin aio_ddp_cancel_one(job); 1858dc964385SJohn Baldwin return; 1859dc964385SJohn Baldwin } 1860dc964385SJohn Baldwin 
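	/*
	 * Find the hardware buffer slot this job is active on.  If it is in
	 * neither slot, the buffer has already completed and there is
	 * nothing to invalidate.
	 */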
1861dc964385SJohn Baldwin for (i = 0; i < nitems(toep->db); i++) { 1862dc964385SJohn Baldwin if (toep->db[i].job == job) { 1863dc964385SJohn Baldwin /* Should only ever get one cancel request for a job. */ 1864dc964385SJohn Baldwin MPASS(toep->db[i].cancel_pending == 0); 1865dc964385SJohn Baldwin 1866dc964385SJohn Baldwin /* 1867dc964385SJohn Baldwin * Invalidate this buffer. It will be 1868dc964385SJohn Baldwin * cancelled or partially completed once the 1869dc964385SJohn Baldwin * card ACKs the invalidate. 1870dc964385SJohn Baldwin */ 1871dc964385SJohn Baldwin valid_flag = i == 0 ? V_TF_DDP_BUF0_VALID(1) : 1872dc964385SJohn Baldwin V_TF_DDP_BUF1_VALID(1); 1873671bf2b8SNavdeep Parhar t4_set_tcb_field(sc, toep->ctrlq, toep->tid, 1874671bf2b8SNavdeep Parhar W_TCB_RX_DDP_FLAGS, valid_flag, 0, 1, 1875671bf2b8SNavdeep Parhar i + DDP_BUF0_INVALIDATED, 1876671bf2b8SNavdeep Parhar toep->ofld_rxq->iq.abs_id); 1877dc964385SJohn Baldwin toep->db[i].cancel_pending = 1; 1878dc964385SJohn Baldwin CTR2(KTR_CXGBE, "%s: request %p marked pending", 1879dc964385SJohn Baldwin __func__, job); 1880dc964385SJohn Baldwin break; 1881dc964385SJohn Baldwin } 1882dc964385SJohn Baldwin } 1883dc964385SJohn Baldwin DDP_UNLOCK(toep); 1884dc964385SJohn Baldwin } 1885dc964385SJohn Baldwin 1886dc964385SJohn Baldwin static void 1887dc964385SJohn Baldwin t4_aio_cancel_queued(struct kaiocb *job) 1888dc964385SJohn Baldwin { 1889dc964385SJohn Baldwin struct socket *so = job->fd_file->f_data; 1890dc964385SJohn Baldwin struct tcpcb *tp = so_sototcpcb(so); 1891dc964385SJohn Baldwin struct toepcb *toep = tp->t_toe; 1892dc964385SJohn Baldwin 1893dc964385SJohn Baldwin DDP_LOCK(toep); 1894dc964385SJohn Baldwin if (!aio_cancel_cleared(job)) { 1895dc964385SJohn Baldwin TAILQ_REMOVE(&toep->ddp_aiojobq, job, list); 1896dc964385SJohn Baldwin toep->ddp_waiting_count--; 1897dc964385SJohn Baldwin if (toep->ddp_waiting_count == 0) 1898dc964385SJohn Baldwin ddp_queue_toep(toep); 1899dc964385SJohn Baldwin } 1900dc964385SJohn Baldwin CTR2(KTR_CXGBE, "%s: request %p cancelled", __func__, job); 1901dc964385SJohn Baldwin DDP_UNLOCK(toep); 1902dc964385SJohn Baldwin 1903dc964385SJohn Baldwin aio_ddp_cancel_one(job); 1904dc964385SJohn Baldwin } 1905dc964385SJohn Baldwin 1906dc964385SJohn Baldwin int 1907dc964385SJohn Baldwin t4_aio_queue_ddp(struct socket *so, struct kaiocb *job) 1908dc964385SJohn Baldwin { 1909dc964385SJohn Baldwin struct tcpcb *tp = so_sototcpcb(so); 1910dc964385SJohn Baldwin struct toepcb *toep = tp->t_toe; 1911dc964385SJohn Baldwin 1912dc964385SJohn Baldwin 1913dc964385SJohn Baldwin /* Ignore writes. */ 1914dc964385SJohn Baldwin if (job->uaiocb.aio_lio_opcode != LIO_READ) 1915dc964385SJohn Baldwin return (EOPNOTSUPP); 1916dc964385SJohn Baldwin 1917dc964385SJohn Baldwin DDP_LOCK(toep); 1918dc964385SJohn Baldwin 1919dc964385SJohn Baldwin /* 1920dc964385SJohn Baldwin * XXX: Think about possibly returning errors for ENOTCONN, 1921dc964385SJohn Baldwin * etc. Perhaps the caller would only queue the request 1922dc964385SJohn Baldwin * if it failed with EOPNOTSUPP? 
1923dc964385SJohn Baldwin */ 1924dc964385SJohn Baldwin 1925dc964385SJohn Baldwin #ifdef VERBOSE_TRACES 1926dc964385SJohn Baldwin CTR2(KTR_CXGBE, "%s: queueing %p", __func__, job); 1927dc964385SJohn Baldwin #endif 1928dc964385SJohn Baldwin if (!aio_set_cancel_function(job, t4_aio_cancel_queued)) 1929dc964385SJohn Baldwin panic("new job was cancelled"); 1930dc964385SJohn Baldwin TAILQ_INSERT_TAIL(&toep->ddp_aiojobq, job, list); 1931dc964385SJohn Baldwin toep->ddp_waiting_count++; 1932dc964385SJohn Baldwin toep->ddp_flags |= DDP_OK; 1933dc964385SJohn Baldwin 1934dc964385SJohn Baldwin /* 1935dc964385SJohn Baldwin * Try to handle this request synchronously. If this has 1936dc964385SJohn Baldwin * to block because the task is running, it will just bail 1937dc964385SJohn Baldwin * and let the task handle it instead. 1938dc964385SJohn Baldwin */ 1939dc964385SJohn Baldwin aio_ddp_requeue(toep); 1940dc964385SJohn Baldwin DDP_UNLOCK(toep); 1941dc964385SJohn Baldwin return (0); 1942dc964385SJohn Baldwin } 1943dc964385SJohn Baldwin 1944dc964385SJohn Baldwin int 1945dc964385SJohn Baldwin t4_ddp_mod_load(void) 1946dc964385SJohn Baldwin { 1947dc964385SJohn Baldwin 1948671bf2b8SNavdeep Parhar t4_register_cpl_handler(CPL_RX_DATA_DDP, do_rx_data_ddp); 1949671bf2b8SNavdeep Parhar t4_register_cpl_handler(CPL_RX_DDP_COMPLETE, do_rx_ddp_complete); 1950dc964385SJohn Baldwin TAILQ_INIT(&ddp_orphan_pagesets); 1951dc964385SJohn Baldwin mtx_init(&ddp_orphan_pagesets_lock, "ddp orphans", NULL, MTX_DEF); 1952dc964385SJohn Baldwin TASK_INIT(&ddp_orphan_task, 0, ddp_free_orphan_pagesets, NULL); 1953dc964385SJohn Baldwin return (0); 1954dc964385SJohn Baldwin } 1955dc964385SJohn Baldwin 1956dc964385SJohn Baldwin void 1957dc964385SJohn Baldwin t4_ddp_mod_unload(void) 1958dc964385SJohn Baldwin { 1959dc964385SJohn Baldwin 1960dc964385SJohn Baldwin taskqueue_drain(taskqueue_thread, &ddp_orphan_task); 1961dc964385SJohn Baldwin MPASS(TAILQ_EMPTY(&ddp_orphan_pagesets)); 1962dc964385SJohn Baldwin mtx_destroy(&ddp_orphan_pagesets_lock); 1963671bf2b8SNavdeep Parhar t4_register_cpl_handler(CPL_RX_DATA_DDP, NULL); 1964671bf2b8SNavdeep Parhar t4_register_cpl_handler(CPL_RX_DDP_COMPLETE, NULL); 1965dc964385SJohn Baldwin } 1966e682d02eSNavdeep Parhar #endif 1967