1 /*-
2 * SPDX-License-Identifier: BSD-2-Clause
3 *
4 * Copyright (c) 2023 Chelsio Communications, Inc.
5 * Written by: John Baldwin <jhb@FreeBSD.org>
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 * 1. Redistributions of source code must retain the above copyright
11 * notice, this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright
13 * notice, this list of conditions and the following disclaimer in the
14 * documentation and/or other materials provided with the distribution.
15 *
16 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
20 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26 * SUCH DAMAGE.
27 */
28
29 #include "opt_inet.h"
30
31 #include <sys/param.h>
32 #include <sys/libkern.h>
33 #include <sys/kernel.h>
34 #include <sys/module.h>
35
36 #ifdef TCP_OFFLOAD
37 #include <sys/bitset.h>
38 #include <sys/capsicum.h>
39 #include <sys/file.h>
40 #include <sys/kthread.h>
41 #include <sys/malloc.h>
42 #include <sys/mbuf.h>
43 #include <sys/nv.h>
44 #include <sys/protosw.h>
45 #include <sys/socket.h>
46 #include <sys/socketvar.h>
47 #include <netinet/in.h>
48 #include <netinet/in_pcb.h>
49 #include <netinet/tcp_var.h>
50 #include <netinet/toecore.h>
51
52 #include <dev/nvmf/nvmf.h>
53 #include <dev/nvmf/nvmf_proto.h>
54 #include <dev/nvmf/nvmf_tcp.h>
55 #include <dev/nvmf/nvmf_transport.h>
56 #include <dev/nvmf/nvmf_transport_internal.h>
57
58 #include <vm/pmap.h>
59 #include <vm/vm_page.h>
60
61 #include "common/common.h"
62 #include "common/t4_regs.h"
63 #include "common/t4_tcb.h"
64 #include "tom/t4_tom.h"
65
66 /* Status code values in CPL_NVMT_CMP. */
67 #define CMP_STATUS_ERROR_MASK 0x7f
68 #define CMP_STATUS_NO_ERROR 0
69 #define CMP_STATUS_HEADER_DIGEST 1
70 #define CMP_STATUS_DIRECTION_MISMATCH 2
71 #define CMP_STATUS_DIGEST_FLAG_MISMATCH 3
72 #define CMP_STATUS_SUCCESS_NOT_LAST 4
73 #define CMP_STATUS_BAD_DATA_LENGTH 5
74 #define CMP_STATUS_USER_MODE_UNALLOCATED 6
75 #define CMP_STATUS_RQT_LIMIT 7
76 #define CMP_STATUS_RQT_WRAP 8
77 #define CMP_STATUS_RQT_BOUND 9
78 #define CMP_STATUS_TPT_LIMIT 16
79 #define CMP_STATUS_TPT_INVALID 17
80 #define CMP_STATUS_TPT_COLOUR_MISMATCH 18
81 #define CMP_STATUS_TPT_MISC 19
82 #define CMP_STATUS_TPT_WRAP 20
83 #define CMP_STATUS_TPT_BOUND 21
84 #define CMP_STATUS_TPT_LAST_PDU_UNALIGNED 22
85 #define CMP_STATUS_PBL_LIMIT 24
86 #define CMP_STATUS_DATA_DIGEST 25
87 #define CMP_STATUS_DDP 0x80
88
89 /*
90 * Transfer tags and CIDs with the MSB set are "unallocated" tags that
91 * pass data through to the freelist without using DDP.
92 */
93 #define CHE_FL_TAG_MASK 0x8000
94 #define CHE_MAX_FL_TAG 0x7fff
95 #define CHE_NUM_FL_TAGS (CHE_MAX_FL_TAG + 1)
96
97 #define CHE_TAG_IS_FL(ttag) (((ttag) & CHE_FL_TAG_MASK) == CHE_FL_TAG_MASK)
98 #define CHE_RAW_FL_TAG(ttag) ((ttag) & ~CHE_FL_TAG_MASK)
99 #define CHE_DDP_TAG(stag_idx, color) ((stag_idx) << 4 | (color))
100 #define CHE_STAG_COLOR(stag) ((stag) & 0xf)
101 #define CHE_STAG_IDX(stag) ((stag) >> 4)
102 #define CHE_DDP_MAX_COLOR 0xf
103
104 #define CHE_DDP_NO_TAG 0xffff
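/*
 * Illustrative example (not in the original source): with the macros
 * above, a DDP tag packs the STAG index in the upper bits and a 4-bit
 * color in the low nibble, e.g.
 *
 *	CHE_DDP_TAG(0x123, 0x5) == 0x1235
 *	CHE_STAG_IDX(0x1235)    == 0x123
 *	CHE_STAG_COLOR(0x1235)  == 0x5
 *
 * A tag with the MSB set instead names a freelist slot, e.g.
 * CHE_RAW_FL_TAG(0x8012) == 0x0012.
 */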
105
106 /*
107 * A bitmap of non-DDP CIDs in use on the host. Since there is no
108 * _BIT_FFC (find first clear), the bitset is inverted so that a clear
109 * bit indicates an in-use CID.
110 */
111 BITSET_DEFINE(fl_cid_set, CHE_NUM_FL_TAGS);
112 #define FL_CID_INIT(p) __BIT_FILL(CHE_NUM_FL_TAGS, p)
113 #define FL_CID_BUSY(n, p) __BIT_CLR(CHE_NUM_FL_TAGS, n, p)
114 #define FL_CID_ISACTIVE(n, p) !__BIT_ISSET(CHE_NUM_FL_TAGS, n, p)
115 #define FL_CID_FREE(n, p) __BIT_SET(CHE_NUM_FL_TAGS, n, p)
116 #define FL_CID_FINDFREE_AT(p, start) __BIT_FFS_AT(CHE_NUM_FL_TAGS, p, start)
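/*
 * Allocation sketch (illustrative; assumes __BIT_FFS_AT() follows the
 * usual FreeBSD bitset convention of returning a 1-based bit index and
 * 0 when no set bit is found):
 *
 *	idx = FL_CID_FINDFREE_AT(set, hint);
 *	if (idx == 0)
 *		idx = FL_CID_FINDFREE_AT(set, 0);
 *	if (idx != 0) {
 *		cid = idx - 1;
 *		FL_CID_BUSY(cid, set);
 *	}
 */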
117
118 /*
119 * The TCP sequence numbers of CPL_NVMT_DATA and CPL_NVMT_CMP mbufs
120 * are saved here while the mbufs are in qp->rx_data and qp->rx_pdus.
121 */
122 #define nvmf_tcp_seq PH_loc.thirtytwo[0]
123
124 /*
125 * The CPL status of a CPL_NVMT_CMP mbuf is saved here while the
126 * mbuf is in qp->rx_pdus.
127 */
128 #define nvmf_cpl_status PH_loc.eight[4]
129
130 struct nvmf_che_capsule;
131 struct nvmf_che_qpair;
132
133 struct nvmf_che_adapter {
134 struct adapter *sc;
135
136 u_int ddp_threshold;
137 u_int max_transmit_pdu;
138 u_int max_receive_pdu;
139 bool nvmt_data_iqe;
140
141 struct sysctl_ctx_list ctx; /* from uld_activate to deactivate */
142 };
143
144 struct nvmf_che_command_buffer {
145 struct nvmf_che_qpair *qp;
146
147 struct nvmf_io_request io;
148 size_t data_len;
149 size_t data_xfered;
150 uint32_t data_offset;
151
152 u_int refs;
153 int error;
154
155 bool ddp_ok;
156 uint16_t cid;
157 uint16_t ttag;
158 uint16_t original_cid; /* Host only */
159
160 TAILQ_ENTRY(nvmf_che_command_buffer) link;
161
162 /* Fields used for DDP. */
163 struct fw_ri_tpte tpte;
164 uint64_t *pbl;
165 uint32_t pbl_addr;
166 uint32_t pbl_len;
167
168 /* Controller only */
169 struct nvmf_che_capsule *cc;
170 };
171
172 struct nvmf_che_command_buffer_list {
173 TAILQ_HEAD(, nvmf_che_command_buffer) head;
174 struct mtx lock;
175 };
176
177 struct nvmf_che_qpair {
178 struct nvmf_qpair qp;
179
180 struct socket *so;
181 struct toepcb *toep;
182 struct nvmf_che_adapter *nca;
183
184 volatile u_int refs; /* Every allocated capsule holds a reference */
185 uint8_t txpda;
186 uint8_t rxpda;
187 bool header_digests;
188 bool data_digests;
189 uint32_t maxr2t;
190 uint32_t maxh2cdata; /* Controller only */
191 uint32_t max_rx_data;
192 uint32_t max_tx_data;
193 uint32_t max_icd; /* Host only */
194 uint32_t max_ioccsz; /* Controller only */
195 union {
196 uint16_t next_fl_ttag; /* Controller only */
197 uint16_t next_cid; /* Host only */
198 };
199 uint16_t next_ddp_tag;
200 u_int num_fl_ttags; /* Controller only */
201 u_int active_fl_ttags; /* Controller only */
202 u_int num_ddp_tags;
203 u_int active_ddp_tags;
204 bool send_success; /* Controller only */
205 uint8_t ddp_color;
206 uint32_t tpt_offset;
207
208 /* Receive state. */
209 struct thread *rx_thread;
210 struct cv rx_cv;
211 bool rx_shutdown;
212 int rx_error;
213 struct mbufq rx_data; /* Data received via CPL_NVMT_DATA. */
214 struct mbufq rx_pdus; /* PDU headers received via CPL_NVMT_CMP. */
215
216 /* Transmit state. */
217 struct thread *tx_thread;
218 struct cv tx_cv;
219 bool tx_shutdown;
220 STAILQ_HEAD(, nvmf_che_capsule) tx_capsules;
221
222 struct nvmf_che_command_buffer_list tx_buffers;
223 struct nvmf_che_command_buffer_list rx_buffers;
224
225 /*
226 * For the controller, an RX command buffer can be in one of
227 * three locations, all protected by the rx_buffers.lock. If
228 * a receive request is waiting for either an R2T slot for its
229 * command (due to exceeding MAXR2T) or a transfer tag, it is
230 * placed on the rx_buffers list. When a request is allocated
231 * an active transfer tag, it moves to either the
232 * open_ddp_tags[] or open_fl_ttags[] array (indexed by the
233 * tag) until it completes.
234 *
235 * For the host, an RX command buffer using DDP is in
236 * open_ddp_tags[], otherwise it is in rx_buffers.
237 */
238 struct nvmf_che_command_buffer **open_ddp_tags;
239 struct nvmf_che_command_buffer **open_fl_ttags; /* Controller only */
240
241 /*
242 * For the host, CIDs submitted by nvmf(4) must be rewritten
243 * to either use DDP or not use DDP. The CID in response
244 * capsules must be restored to their original value. For
245 * DDP, the original CID is stored in the command buffer.
246 * These variables manage non-DDP CIDs.
247 */
248 uint16_t *fl_cids; /* Host only */
249 struct fl_cid_set *fl_cid_set; /* Host only */
250 struct mtx fl_cid_lock; /* Host only */
251 };
252
253 struct nvmf_che_rxpdu {
254 struct mbuf *m;
255 const struct nvme_tcp_common_pdu_hdr *hdr;
256 uint32_t data_len;
257 bool data_digest_mismatch;
258 bool ddp;
259 };
260
261 struct nvmf_che_capsule {
262 struct nvmf_capsule nc;
263
264 volatile u_int refs;
265
266 struct nvmf_che_rxpdu rx_pdu;
267
268 uint32_t active_r2ts; /* Controller only */
269 #ifdef INVARIANTS
270 uint32_t tx_data_offset; /* Controller only */
271 u_int pending_r2ts; /* Controller only */
272 #endif
273
274 STAILQ_ENTRY(nvmf_che_capsule) link;
275 };
276
277 #define CCAP(nc) ((struct nvmf_che_capsule *)(nc))
278 #define CQP(qp) ((struct nvmf_che_qpair *)(qp))
279
280 static void che_release_capsule(struct nvmf_che_capsule *cc);
281 static void che_free_qpair(struct nvmf_qpair *nq);
282
283 SYSCTL_NODE(_kern_nvmf, OID_AUTO, che, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
284 "Chelsio TCP offload transport");
285
286 static u_int che_max_transmit_pdu = 32 * 1024;
287 SYSCTL_UINT(_kern_nvmf_che, OID_AUTO, max_transmit_pdu, CTLFLAG_RWTUN,
288 &che_max_transmit_pdu, 0,
289 "Maximum size of a transmitted PDU");
290
291 static u_int che_max_receive_pdu = 32 * 1024;
292 SYSCTL_UINT(_kern_nvmf_che, OID_AUTO, max_receive_pdu, CTLFLAG_RWTUN,
293 &che_max_receive_pdu, 0,
294 "Maximum size of a received PDU");
295
296 static int use_dsgl = 1;
297 SYSCTL_INT(_kern_nvmf_che, OID_AUTO, use_dsgl, CTLFLAG_RWTUN, &use_dsgl, 0,
298 "Use DSGL for PBL/FastReg (default=1)");
299
300 static int inline_threshold = 256;
301 SYSCTL_INT(_kern_nvmf_che, OID_AUTO, inline_threshold, CTLFLAG_RWTUN,
302 &inline_threshold, 0,
303 "inline vs dsgl threshold (default=256)");
304
305 static int ddp_tags_per_qp = 128;
306 SYSCTL_INT(_kern_nvmf_che, OID_AUTO, ddp_tags_per_qp, CTLFLAG_RWTUN,
307 &ddp_tags_per_qp, 0,
308 "Number of DDP tags to reserve for each queue pair");
309
310 static MALLOC_DEFINE(M_NVMF_CHE, "nvmf_che", "Chelsio NVMe-TCP offload");
311
312 /*
313 * PBL regions consist of N full-sized pages. TPT entries support an
314 * initial offset into the first page (FBO) and can handle a partial
315 * length on the last page.
316 */
317 static bool
318 che_ddp_io_check(struct nvmf_che_qpair *qp, const struct nvmf_io_request *io)
319 {
320 const struct memdesc *mem = &io->io_mem;
321 struct bus_dma_segment *ds;
322 int i;
323
324 if (io->io_len < qp->nca->ddp_threshold) {
325 return (false);
326 }
327
328 switch (mem->md_type) {
329 case MEMDESC_VADDR:
330 case MEMDESC_PADDR:
331 case MEMDESC_VMPAGES:
332 return (true);
333 case MEMDESC_VLIST:
334 case MEMDESC_PLIST:
335 /*
336 * Require all but the first segment to start on a
337 * page boundary. Require all but the last segment to
338 * end on a page boundary.
339 */
340 ds = mem->u.md_list;
341 for (i = 0; i < mem->md_nseg; i++, ds++) {
342 if (i != 0 && ds->ds_addr % PAGE_SIZE != 0)
343 return (false);
344 if (i != mem->md_nseg - 1 &&
345 (ds->ds_addr + ds->ds_len) % PAGE_SIZE != 0)
346 return (false);
347 }
348 return (true);
349 default:
350 /*
351 * Other types could be validated with more work, but
352 * they aren't used currently by nvmf(4) or nvmft(4).
353 */
354 return (false);
355 }
356 }
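/*
 * Example (illustrative): a two-segment MEMDESC_VLIST passes the check
 * above only if the first segment ends on a page boundary and the
 * second starts on one; a segment that ends mid-page anywhere but the
 * last position cannot be described by a PBL of whole pages with a
 * single FBO and a trailing partial length.
 */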
357
358 static u_int
359 che_fbo(struct nvmf_che_command_buffer *cb)
360 {
361 struct memdesc *mem = &cb->io.io_mem;
362
363 switch (mem->md_type) {
364 case MEMDESC_VADDR:
365 return ((uintptr_t)mem->u.md_vaddr & PAGE_MASK);
366 case MEMDESC_PADDR:
367 return (mem->u.md_paddr & PAGE_MASK);
368 case MEMDESC_VMPAGES:
369 return (mem->md_offset);
370 case MEMDESC_VLIST:
371 case MEMDESC_PLIST:
372 return (mem->u.md_list[0].ds_addr & PAGE_MASK);
373 default:
374 __assert_unreachable();
375 }
376 }
377
378 static u_int
379 che_npages(struct nvmf_che_command_buffer *cb)
380 {
381 return (howmany(che_fbo(cb) + cb->io.io_len, PAGE_SIZE));
382 }
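/*
 * Worked example (illustrative, assuming PAGE_SIZE == 4096): a buffer
 * starting 0x600 bytes into its first page with io_len == 0x2400 gives
 * che_fbo() == 0x600 and che_npages() == howmany(0x600 + 0x2400, 4096)
 * == 3: a partial first page, one full page, and a partial last page.
 */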
383
384 static struct nvmf_che_command_buffer *
385 che_alloc_command_buffer(struct nvmf_che_qpair *qp,
386 const struct nvmf_io_request *io, uint32_t data_offset, size_t data_len,
387 uint16_t cid)
388 {
389 struct nvmf_che_command_buffer *cb;
390
391 cb = malloc(sizeof(*cb), M_NVMF_CHE, M_WAITOK);
392 cb->qp = qp;
393 cb->io = *io;
394 cb->data_offset = data_offset;
395 cb->data_len = data_len;
396 cb->data_xfered = 0;
397 refcount_init(&cb->refs, 1);
398 cb->error = 0;
399 cb->ddp_ok = che_ddp_io_check(qp, io);
400 cb->cid = cid;
401 cb->ttag = 0;
402 cb->original_cid = 0;
403 cb->cc = NULL;
404 cb->pbl = NULL;
405
406 return (cb);
407 }
408
409 static void
410 che_hold_command_buffer(struct nvmf_che_command_buffer *cb)
411 {
412 refcount_acquire(&cb->refs);
413 }
414
415 static void
416 che_free_command_buffer(struct nvmf_che_command_buffer *cb)
417 {
418 nvmf_complete_io_request(&cb->io, cb->data_xfered, cb->error);
419 if (cb->cc != NULL)
420 che_release_capsule(cb->cc);
421 MPASS(cb->pbl == NULL);
422 free(cb, M_NVMF_CHE);
423 }
424
425 static void
426 che_release_command_buffer(struct nvmf_che_command_buffer *cb)
427 {
428 if (refcount_release(&cb->refs))
429 che_free_command_buffer(cb);
430 }
431
432 static void
433 che_add_command_buffer(struct nvmf_che_command_buffer_list *list,
434 struct nvmf_che_command_buffer *cb)
435 {
436 mtx_assert(&list->lock, MA_OWNED);
437 TAILQ_INSERT_HEAD(&list->head, cb, link);
438 }
439
440 static struct nvmf_che_command_buffer *
441 che_find_command_buffer(struct nvmf_che_command_buffer_list *list,
442 uint16_t cid)
443 {
444 struct nvmf_che_command_buffer *cb;
445
446 mtx_assert(&list->lock, MA_OWNED);
447 TAILQ_FOREACH(cb, &list->head, link) {
448 if (cb->cid == cid)
449 return (cb);
450 }
451 return (NULL);
452 }
453
454 static void
455 che_remove_command_buffer(struct nvmf_che_command_buffer_list *list,
456 struct nvmf_che_command_buffer *cb)
457 {
458 mtx_assert(&list->lock, MA_OWNED);
459 TAILQ_REMOVE(&list->head, cb, link);
460 }
461
462 static void
463 che_purge_command_buffer(struct nvmf_che_command_buffer_list *list,
464 uint16_t cid)
465 {
466 struct nvmf_che_command_buffer *cb;
467
468 mtx_lock(&list->lock);
469 cb = che_find_command_buffer(list, cid);
470 if (cb != NULL) {
471 che_remove_command_buffer(list, cb);
472 mtx_unlock(&list->lock);
473 che_release_command_buffer(cb);
474 } else
475 mtx_unlock(&list->lock);
476 }
477
478 static int
479 che_write_mem_inline(struct adapter *sc, struct toepcb *toep, uint32_t addr,
480 uint32_t len, void *data, struct mbufq *wrq)
481 {
482 struct mbuf *m;
483 char *cp;
484 int copy_len, i, num_wqe, wr_len;
485
486 #ifdef VERBOSE_TRACES
487 CTR(KTR_CXGBE, "%s: addr 0x%x len %u", __func__, addr << 5, len);
488 #endif
489 num_wqe = DIV_ROUND_UP(len, T4_MAX_INLINE_SIZE);
490 cp = data;
491 for (i = 0; i < num_wqe; i++) {
492 copy_len = min(len, T4_MAX_INLINE_SIZE);
493 wr_len = T4_WRITE_MEM_INLINE_LEN(copy_len);
494
495 m = alloc_raw_wr_mbuf(wr_len);
496 if (m == NULL)
497 return (ENOMEM);
498 t4_write_mem_inline_wr(sc, mtod(m, void *), wr_len, toep->tid,
499 addr, copy_len, cp, 0);
500 if (cp != NULL)
501 cp += T4_MAX_INLINE_SIZE;
502 addr += T4_MAX_INLINE_SIZE >> 5;
503 len -= T4_MAX_INLINE_SIZE;
504
505 mbufq_enqueue(wrq, m);
506 }
507 return (0);
508 }
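/*
 * Note (inferred from the address arithmetic above and in the callers):
 * "addr" is an adapter-memory address in 32-byte units, which is why
 * the trace prints addr << 5 and each work request advances it by
 * T4_MAX_INLINE_SIZE >> 5.
 */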
509
510 static int
511 che_write_mem_dma_aligned(struct adapter *sc, struct toepcb *toep,
512 uint32_t addr, uint32_t len, void *data, struct mbufq *wrq)
513 {
514 struct mbuf *m;
515 vm_offset_t va;
516 u_int todo;
517 int wr_len;
518
519 /* First page. */
520 va = (vm_offset_t)data;
521 todo = min(PAGE_SIZE - (va % PAGE_SIZE), len);
522 wr_len = T4_WRITE_MEM_DMA_LEN;
523 m = alloc_raw_wr_mbuf(wr_len);
524 if (m == NULL)
525 return (ENOMEM);
526 t4_write_mem_dma_wr(sc, mtod(m, void *), wr_len, toep->tid, addr,
527 todo, pmap_kextract(va), 0);
528 mbufq_enqueue(wrq, m);
529 len -= todo;
530 addr += todo >> 5;
531 va += todo;
532
533 while (len > 0) {
534 MPASS(va == trunc_page(va));
535 todo = min(PAGE_SIZE, len);
536 m = alloc_raw_wr_mbuf(wr_len);
537 if (m == NULL)
538 return (ENOMEM);
539 t4_write_mem_dma_wr(sc, mtod(m, void *), wr_len, toep->tid,
540 addr, todo, pmap_kextract(va), 0);
541 mbufq_enqueue(wrq, m);
542 len -= todo;
543 addr += todo >> 5;
544 va += todo;
545 }
546 return (0);
547 }
548
549 static int
550 che_write_adapter_mem(struct nvmf_che_qpair *qp, uint32_t addr, uint32_t len,
551 void *data)
552 {
553 struct adapter *sc = qp->nca->sc;
554 struct toepcb *toep = qp->toep;
555 struct socket *so = qp->so;
556 struct inpcb *inp = sotoinpcb(so);
557 struct mbufq mq;
558 int error;
559
560 mbufq_init(&mq, INT_MAX);
561 if (!use_dsgl || len < inline_threshold || data == NULL)
562 error = che_write_mem_inline(sc, toep, addr, len, data, &mq);
563 else
564 error = che_write_mem_dma_aligned(sc, toep, addr, len, data,
565 &mq);
566 if (__predict_false(error != 0))
567 goto error;
568
569 INP_WLOCK(inp);
570 if ((inp->inp_flags & INP_DROPPED) != 0) {
571 INP_WUNLOCK(inp);
572 error = ECONNRESET;
573 goto error;
574 }
575 mbufq_concat(&toep->ulp_pduq, &mq);
576 INP_WUNLOCK(inp);
577 return (0);
578
579 error:
580 mbufq_drain(&mq);
581 return (error);
582 }
583
584 static bool
585 che_alloc_pbl(struct nvmf_che_qpair *qp, struct nvmf_che_command_buffer *cb)
586 {
587 struct adapter *sc = qp->nca->sc;
588 struct memdesc *mem = &cb->io.io_mem;
589 uint64_t *pbl;
590 uint32_t addr, len;
591 u_int i, npages;
592 int error;
593
594 MPASS(cb->pbl == NULL);
595 MPASS(cb->ddp_ok);
596
597 /* Hardware limit? iWARP only enforces this for T5. */
598 if (cb->io.io_len >= (8 * 1024 * 1024 * 1024ULL))
599 return (false);
600
601 npages = che_npages(cb);
602 len = roundup2(npages, 4) * sizeof(*cb->pbl);
603 addr = t4_pblpool_alloc(sc, len);
604 if (addr == 0)
605 return (false);
606
607 pbl = malloc(len, M_NVMF_CHE, M_NOWAIT | M_ZERO);
608 if (pbl == NULL) {
609 t4_pblpool_free(sc, addr, len);
610 return (false);
611 }
612
613 switch (mem->md_type) {
614 case MEMDESC_VADDR:
615 {
616 vm_offset_t va;
617
618 va = trunc_page((uintptr_t)mem->u.md_vaddr);
619 for (i = 0; i < npages; i++)
620 pbl[i] = htobe64(pmap_kextract(va + i * PAGE_SIZE));
621 break;
622 }
623 case MEMDESC_PADDR:
624 {
625 vm_paddr_t pa;
626
627 pa = trunc_page(mem->u.md_paddr);
628 for (i = 0; i < npages; i++)
629 pbl[i] = htobe64(pa + i * PAGE_SIZE);
630 break;
631 }
632 case MEMDESC_VMPAGES:
633 for (i = 0; i < npages; i++)
634 pbl[i] = htobe64(VM_PAGE_TO_PHYS(mem->u.md_ma[i]));
635 break;
636 case MEMDESC_VLIST:
637 {
638 struct bus_dma_segment *ds;
639 vm_offset_t va;
640 vm_size_t len;
641 u_int j, k;
642
643 i = 0;
644 ds = mem->u.md_list;
645 for (j = 0; j < mem->md_nseg; j++, ds++) {
646 va = trunc_page((uintptr_t)ds->ds_addr);
647 len = ds->ds_len;
648 if (ds->ds_addr % PAGE_SIZE != 0)
649 len += ds->ds_addr % PAGE_SIZE;
650 for (k = 0; k < howmany(len, PAGE_SIZE); k++) {
651 pbl[i] = htobe64(pmap_kextract(va +
652 k * PAGE_SIZE));
653 i++;
654 }
655 }
656 MPASS(i == npages);
657 break;
658 }
659 case MEMDESC_PLIST:
660 {
661 struct bus_dma_segment *ds;
662 vm_paddr_t pa;
663 vm_size_t len;
664 u_int j, k;
665
666 i = 0;
667 ds = mem->u.md_list;
668 for (j = 0; j < mem->md_nseg; j++, ds++) {
669 pa = trunc_page((vm_paddr_t)ds->ds_addr);
670 len = ds->ds_len;
671 if (ds->ds_addr % PAGE_SIZE != 0)
672 len += ds->ds_addr % PAGE_SIZE;
673 for (k = 0; k < howmany(len, PAGE_SIZE); k++) {
674 pbl[i] = htobe64(pa + k * PAGE_SIZE);
675 i++;
676 }
677 }
678 MPASS(i == npages);
679 break;
680 }
681 default:
682 __assert_unreachable();
683 }
684
685 error = che_write_adapter_mem(qp, addr >> 5, len, pbl);
686 if (error != 0) {
687 t4_pblpool_free(sc, addr, len);
688 free(pbl, M_NVMF_CHE);
689 return (false);
690 }
691
692 cb->pbl = pbl;
693 cb->pbl_addr = addr;
694 cb->pbl_len = len;
695
696 return (true);
697 }
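/*
 * Sizing example (illustrative): for npages == 5 the PBL is padded to
 * roundup2(5, 4) == 8 entries of 8 bytes each, so 64 bytes are taken
 * from the PBL pool and the unused trailing entries remain zero
 * (M_ZERO above).
 */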
698
699 static void
700 che_free_pbl(struct nvmf_che_command_buffer *cb)
701 {
702 free(cb->pbl, M_NVMF_CHE);
703 t4_pblpool_free(cb->qp->nca->sc, cb->pbl_addr, cb->pbl_len);
704 cb->pbl = NULL;
705 cb->pbl_addr = 0;
706 cb->pbl_len = 0;
707 }
708
709 static bool
710 che_write_tpt_entry(struct nvmf_che_qpair *qp,
711 struct nvmf_che_command_buffer *cb, uint16_t stag)
712 {
713 uint32_t tpt_addr;
714 int error;
715
716 cb->tpte.valid_to_pdid = htobe32(F_FW_RI_TPTE_VALID |
717 V_FW_RI_TPTE_STAGKEY(CHE_STAG_COLOR(stag)) |
718 F_FW_RI_TPTE_STAGSTATE |
719 V_FW_RI_TPTE_STAGTYPE(FW_RI_STAG_NSMR) |
720 V_FW_RI_TPTE_PDID(0));
721 cb->tpte.locread_to_qpid = htobe32(
722 V_FW_RI_TPTE_PERM(FW_RI_MEM_ACCESS_REM_WRITE) |
723 V_FW_RI_TPTE_ADDRTYPE(FW_RI_ZERO_BASED_TO) |
724 V_FW_RI_TPTE_PS(PAGE_SIZE) |
725 V_FW_RI_TPTE_QPID(qp->toep->tid));
726 #define PBL_OFF(qp, a) ((a) - (qp)->nca->sc->vres.pbl.start)
727 cb->tpte.nosnoop_pbladdr =
728 htobe32(V_FW_RI_TPTE_PBLADDR(PBL_OFF(qp, cb->pbl_addr) >> 3));
729 cb->tpte.len_lo = htobe32(cb->data_len);
730 cb->tpte.va_hi = 0;
731 cb->tpte.va_lo_fbo = htobe32(che_fbo(cb));
732 cb->tpte.dca_mwbcnt_pstag = 0;
733 cb->tpte.len_hi = htobe32(cb->data_offset);
734
735 tpt_addr = qp->tpt_offset + CHE_STAG_IDX(stag) +
736 (qp->nca->sc->vres.stag.start >> 5);
737
738 error = che_write_adapter_mem(qp, tpt_addr, sizeof(cb->tpte),
739 &cb->tpte);
740 return (error == 0);
741 }
742
743 static void
744 che_clear_tpt_entry(struct nvmf_che_qpair *qp, uint16_t stag)
745 {
746 uint32_t tpt_addr;
747
748 tpt_addr = qp->tpt_offset + CHE_STAG_IDX(stag) +
749 (qp->nca->sc->vres.stag.start >> 5);
750
751 (void)che_write_adapter_mem(qp, tpt_addr, sizeof(struct fw_ri_tpte),
752 NULL);
753 }
754
755 static uint16_t
756 che_alloc_ddp_stag(struct nvmf_che_qpair *qp,
757 struct nvmf_che_command_buffer *cb)
758 {
759 uint16_t stag_idx;
760
761 mtx_assert(&qp->rx_buffers.lock, MA_OWNED);
762 MPASS(cb->ddp_ok);
763
764 if (qp->active_ddp_tags == qp->num_ddp_tags)
765 return (CHE_DDP_NO_TAG);
766
767 MPASS(qp->num_ddp_tags != 0);
768
769 stag_idx = qp->next_ddp_tag;
770 for (;;) {
771 if (qp->open_ddp_tags[stag_idx] == NULL)
772 break;
773 if (stag_idx == qp->num_ddp_tags - 1) {
774 stag_idx = 0;
775 if (qp->ddp_color == CHE_DDP_MAX_COLOR)
776 qp->ddp_color = 0;
777 else
778 qp->ddp_color++;
779 } else
780 stag_idx++;
781 MPASS(stag_idx != qp->next_ddp_tag);
782 }
783 if (stag_idx == qp->num_ddp_tags - 1)
784 qp->next_ddp_tag = 0;
785 else
786 qp->next_ddp_tag = stag_idx + 1;
787
788 qp->active_ddp_tags++;
789 qp->open_ddp_tags[stag_idx] = cb;
790
791 return (CHE_DDP_TAG(stag_idx, qp->ddp_color));
792 }
793
794 static void
795 che_free_ddp_stag(struct nvmf_che_qpair *qp, struct nvmf_che_command_buffer *cb,
796 uint16_t stag)
797 {
798 MPASS(!CHE_TAG_IS_FL(stag));
799
800 mtx_assert(&qp->rx_buffers.lock, MA_OWNED);
801
802 MPASS(qp->open_ddp_tags[CHE_STAG_IDX(stag)] == cb);
803
804 qp->open_ddp_tags[CHE_STAG_IDX(stag)] = NULL;
805 qp->active_ddp_tags--;
806 }
807
808 static uint16_t
809 che_alloc_ddp_tag(struct nvmf_che_qpair *qp,
810 struct nvmf_che_command_buffer *cb)
811 {
812 uint16_t stag;
813
814 mtx_assert(&qp->rx_buffers.lock, MA_OWNED);
815
816 if (!cb->ddp_ok)
817 return (CHE_DDP_NO_TAG);
818
819 stag = che_alloc_ddp_stag(qp, cb);
820 if (stag == CHE_DDP_NO_TAG) {
821 counter_u64_add(qp->toep->ofld_rxq->rx_nvme_ddp_setup_no_stag,
822 1);
823 return (CHE_DDP_NO_TAG);
824 }
825
826 if (!che_alloc_pbl(qp, cb)) {
827 che_free_ddp_stag(qp, cb, stag);
828 counter_u64_add(qp->toep->ofld_rxq->rx_nvme_ddp_setup_error, 1);
829 return (CHE_DDP_NO_TAG);
830 }
831
832 if (!che_write_tpt_entry(qp, cb, stag)) {
833 che_free_pbl(cb);
834 che_free_ddp_stag(qp, cb, stag);
835 counter_u64_add(qp->toep->ofld_rxq->rx_nvme_ddp_setup_error, 1);
836 return (CHE_DDP_NO_TAG);
837 }
838
839 counter_u64_add(qp->toep->ofld_rxq->rx_nvme_ddp_setup_ok, 1);
840 return (stag);
841 }
842
843 static void
844 che_free_ddp_tag(struct nvmf_che_qpair *qp, struct nvmf_che_command_buffer *cb,
845 uint16_t stag)
846 {
847 MPASS(!CHE_TAG_IS_FL(stag));
848
849 mtx_assert(&qp->rx_buffers.lock, MA_OWNED);
850
851 MPASS(qp->open_ddp_tags[CHE_STAG_IDX(stag)] == cb);
852
853 che_clear_tpt_entry(qp, stag);
854 che_free_pbl(cb);
855 che_free_ddp_stag(qp, cb, stag);
856 }
857
858 static void
859 nvmf_che_write_pdu(struct nvmf_che_qpair *qp, struct mbuf *m)
860 {
861 struct epoch_tracker et;
862 struct socket *so = qp->so;
863 struct inpcb *inp = sotoinpcb(so);
864 struct toepcb *toep = qp->toep;
865
866 CURVNET_SET(so->so_vnet);
867 NET_EPOCH_ENTER(et);
868 INP_WLOCK(inp);
869 if (__predict_false(inp->inp_flags & INP_DROPPED) ||
870 __predict_false((toep->flags & TPF_ATTACHED) == 0)) {
871 m_freem(m);
872 } else {
873 mbufq_enqueue(&toep->ulp_pduq, m);
874 t4_push_pdus(toep->vi->adapter, toep, 0);
875 }
876 INP_WUNLOCK(inp);
877 NET_EPOCH_EXIT(et);
878 CURVNET_RESTORE();
879 }
880
881 static void
882 nvmf_che_report_error(struct nvmf_che_qpair *qp, uint16_t fes, uint32_t fei,
883 struct mbuf *rx_pdu, u_int hlen)
884 {
885 struct nvme_tcp_term_req_hdr *hdr;
886 struct mbuf *m;
887
888 if (hlen != 0) {
889 hlen = min(hlen, NVME_TCP_TERM_REQ_ERROR_DATA_MAX_SIZE);
890 hlen = min(hlen, m_length(rx_pdu, NULL));
891 }
892
893 m = m_get2(sizeof(*hdr) + hlen, M_WAITOK, MT_DATA, M_PKTHDR);
894 m->m_len = sizeof(*hdr) + hlen;
895 m->m_pkthdr.len = m->m_len;
896 hdr = mtod(m, void *);
897 memset(hdr, 0, sizeof(*hdr));
898 hdr->common.pdu_type = qp->qp.nq_controller ?
899 NVME_TCP_PDU_TYPE_C2H_TERM_REQ : NVME_TCP_PDU_TYPE_H2C_TERM_REQ;
900 hdr->common.hlen = sizeof(*hdr);
901 hdr->common.plen = sizeof(*hdr) + hlen;
902 hdr->fes = htole16(fes);
903 le32enc(hdr->fei, fei);
904 if (hlen != 0)
905 m_copydata(rx_pdu, 0, hlen, (caddr_t)(hdr + 1));
906
907 nvmf_che_write_pdu(qp, m);
908 }
909
910 static int
911 nvmf_che_validate_pdu(struct nvmf_che_qpair *qp, struct nvmf_che_rxpdu *pdu)
912 {
913 const struct nvme_tcp_common_pdu_hdr *ch;
914 struct mbuf *m = pdu->m;
915 uint32_t data_len, fei, plen, rx_digest;
916 u_int hlen, cpl_error;
917 int error;
918 uint16_t fes;
919
920 /* Determine how large of a PDU header to return for errors. */
921 ch = pdu->hdr;
922 hlen = ch->hlen;
923 plen = le32toh(ch->plen);
924 if (hlen < sizeof(*ch) || hlen > plen)
925 hlen = sizeof(*ch);
926
927 cpl_error = m->m_pkthdr.nvmf_cpl_status & CMP_STATUS_ERROR_MASK;
928 switch (cpl_error) {
929 case CMP_STATUS_NO_ERROR:
930 break;
931 case CMP_STATUS_HEADER_DIGEST:
932 counter_u64_add(
933 qp->toep->ofld_rxq->rx_nvme_header_digest_errors, 1);
934 printf("NVMe/TCP: Header digest mismatch\n");
935 rx_digest = le32dec(mtodo(m, ch->hlen));
936 nvmf_che_report_error(qp,
937 NVME_TCP_TERM_REQ_FES_HDGST_ERROR, rx_digest, m,
938 hlen);
939 return (EBADMSG);
940 case CMP_STATUS_DIRECTION_MISMATCH:
941 counter_u64_add(qp->toep->ofld_rxq->rx_nvme_invalid_headers, 1);
942 printf("NVMe/TCP: Invalid PDU type %u\n", ch->pdu_type);
943 nvmf_che_report_error(qp,
944 NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD,
945 offsetof(struct nvme_tcp_common_pdu_hdr, pdu_type), m,
946 hlen);
947 return (EBADMSG);
948 case CMP_STATUS_SUCCESS_NOT_LAST:
949 case CMP_STATUS_DIGEST_FLAG_MISMATCH:
950 counter_u64_add(qp->toep->ofld_rxq->rx_nvme_invalid_headers, 1);
951 printf("NVMe/TCP: Invalid PDU header flags %#x\n", ch->flags);
952 nvmf_che_report_error(qp,
953 NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD,
954 offsetof(struct nvme_tcp_common_pdu_hdr, flags), m, hlen);
955 return (EBADMSG);
956 case CMP_STATUS_BAD_DATA_LENGTH:
957 counter_u64_add(qp->toep->ofld_rxq->rx_nvme_invalid_headers, 1);
958 printf("NVMe/TCP: Invalid PDU length %u\n", plen);
959 nvmf_che_report_error(qp,
960 NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD,
961 offsetof(struct nvme_tcp_common_pdu_hdr, plen), m, hlen);
962 return (EBADMSG);
963 case CMP_STATUS_USER_MODE_UNALLOCATED:
964 case CMP_STATUS_RQT_LIMIT:
965 case CMP_STATUS_RQT_WRAP:
966 case CMP_STATUS_RQT_BOUND:
967 device_printf(qp->nca->sc->dev,
968 "received invalid NVMET error %u\n",
969 cpl_error);
970 return (ECONNRESET);
971 case CMP_STATUS_TPT_LIMIT:
972 case CMP_STATUS_TPT_INVALID:
973 case CMP_STATUS_TPT_COLOUR_MISMATCH:
974 case CMP_STATUS_TPT_MISC:
975 case CMP_STATUS_TPT_WRAP:
976 case CMP_STATUS_TPT_BOUND:
977 counter_u64_add(qp->toep->ofld_rxq->rx_nvme_invalid_headers, 1);
978 switch (ch->pdu_type) {
979 case NVME_TCP_PDU_TYPE_H2C_DATA:
980 nvmf_che_report_error(qp,
981 NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD,
982 offsetof(struct nvme_tcp_h2c_data_hdr, ttag),
983 pdu->m, pdu->hdr->hlen);
984 return (EBADMSG);
985 case NVME_TCP_PDU_TYPE_C2H_DATA:
986 nvmf_che_report_error(qp,
987 NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD,
988 offsetof(struct nvme_tcp_c2h_data_hdr, cccid), m,
989 hlen);
990 return (EBADMSG);
991 default:
992 device_printf(qp->nca->sc->dev,
993 "received DDP NVMET error %u for PDU %u\n",
994 cpl_error, ch->pdu_type);
995 return (ECONNRESET);
996 }
997 case CMP_STATUS_TPT_LAST_PDU_UNALIGNED:
998 counter_u64_add(qp->toep->ofld_rxq->rx_nvme_invalid_headers, 1);
999 nvmf_che_report_error(qp,
1000 NVME_TCP_TERM_REQ_FES_PDU_SEQUENCE_ERROR, 0, m, hlen);
1001 return (EBADMSG);
1002 case CMP_STATUS_PBL_LIMIT:
1003 counter_u64_add(qp->toep->ofld_rxq->rx_nvme_invalid_headers, 1);
1004 nvmf_che_report_error(qp,
1005 NVME_TCP_TERM_REQ_FES_DATA_TRANSFER_OUT_OF_RANGE, 0, m,
1006 hlen);
1007 return (EBADMSG);
1008 case CMP_STATUS_DATA_DIGEST:
1009 /* Handled below. */
1010 break;
1011 default:
1012 device_printf(qp->nca->sc->dev,
1013 "received unknown NVMET error %u\n",
1014 cpl_error);
1015 return (ECONNRESET);
1016 }
1017
1018 error = nvmf_tcp_validate_pdu_header(ch, qp->qp.nq_controller,
1019 qp->header_digests, qp->data_digests, qp->rxpda, &data_len, &fes,
1020 &fei);
1021 if (error != 0) {
1022 if (error != ECONNRESET)
1023 nvmf_che_report_error(qp, fes, fei, m, hlen);
1024 return (error);
1025 }
1026
1027 /* Check data digest if present. */
1028 pdu->data_digest_mismatch = false;
1029 if ((ch->flags & NVME_TCP_CH_FLAGS_DDGSTF) != 0) {
1030 if (cpl_error == CMP_STATUS_DATA_DIGEST) {
1031 printf("NVMe/TCP: Data digest mismatch\n");
1032 pdu->data_digest_mismatch = true;
1033 counter_u64_add(
1034 qp->toep->ofld_rxq->rx_nvme_data_digest_errors, 1);
1035 }
1036 }
1037
1038 pdu->data_len = data_len;
1039
1040 return (0);
1041 }
1042
1043 static void
1044 nvmf_che_free_pdu(struct nvmf_che_rxpdu *pdu)
1045 {
1046 m_freem(pdu->m);
1047 pdu->m = NULL;
1048 pdu->hdr = NULL;
1049 }
1050
1051 static int
1052 nvmf_che_handle_term_req(struct nvmf_che_rxpdu *pdu)
1053 {
1054 const struct nvme_tcp_term_req_hdr *hdr;
1055
1056 hdr = (const void *)pdu->hdr;
1057
1058 printf("NVMe/TCP: Received termination request: fes %#x fei %#x\n",
1059 le16toh(hdr->fes), le32dec(hdr->fei));
1060 nvmf_che_free_pdu(pdu);
1061 return (ECONNRESET);
1062 }
1063
1064 static int
1065 nvmf_che_save_command_capsule(struct nvmf_che_qpair *qp,
1066 struct nvmf_che_rxpdu *pdu)
1067 {
1068 const struct nvme_tcp_cmd *cmd;
1069 struct nvmf_capsule *nc;
1070 struct nvmf_che_capsule *cc;
1071
1072 cmd = (const void *)pdu->hdr;
1073
1074 nc = nvmf_allocate_command(&qp->qp, &cmd->ccsqe, M_WAITOK);
1075
1076 cc = CCAP(nc);
1077 cc->rx_pdu = *pdu;
1078
1079 nvmf_capsule_received(&qp->qp, nc);
1080 return (0);
1081 }
1082
1083 static int
1084 nvmf_che_save_response_capsule(struct nvmf_che_qpair *qp,
1085 struct nvmf_che_rxpdu *pdu)
1086 {
1087 const struct nvme_tcp_rsp *rsp;
1088 struct nvme_completion cpl;
1089 struct nvmf_capsule *nc;
1090 struct nvmf_che_capsule *cc;
1091 uint16_t cid;
1092
1093 rsp = (const void *)pdu->hdr;
1094
1095 /*
1096 * Restore the original CID and ensure any command buffers
1097 * associated with this CID have been released. Once the CQE
1098 * has been received, no further transfers to the command
1099 * buffer for the associated CID can occur.
1100 */
1101 cpl = rsp->rccqe;
1102 cid = le16toh(cpl.cid);
1103 if (CHE_TAG_IS_FL(cid)) {
1104 cid = CHE_RAW_FL_TAG(cid);
1105 mtx_lock(&qp->fl_cid_lock);
1106 MPASS(FL_CID_ISACTIVE(cid, qp->fl_cid_set));
1107 cpl.cid = qp->fl_cids[cid];
1108 FL_CID_FREE(cid, qp->fl_cid_set);
1109 mtx_unlock(&qp->fl_cid_lock);
1110
1111 che_purge_command_buffer(&qp->rx_buffers, rsp->rccqe.cid);
1112 che_purge_command_buffer(&qp->tx_buffers, rsp->rccqe.cid);
1113 } else {
1114 struct nvmf_che_command_buffer *cb;
1115
1116 mtx_lock(&qp->rx_buffers.lock);
1117 cb = qp->open_ddp_tags[CHE_STAG_IDX(cid)];
1118 MPASS(cb != NULL);
1119 MPASS(cb->cid == rsp->rccqe.cid);
1120 cpl.cid = cb->original_cid;
1121 che_free_ddp_tag(qp, cb, cid);
1122 mtx_unlock(&qp->rx_buffers.lock);
1123 che_release_command_buffer(cb);
1124 }
1125 #ifdef VERBOSE_TRACES
1126 CTR(KTR_CXGBE, "%s: tid %u freed cid 0x%04x for 0x%04x", __func__,
1127 qp->toep->tid, le16toh(rsp->rccqe.cid), cpl.cid);
1128 #endif
1129
1130 nc = nvmf_allocate_response(&qp->qp, &cpl, M_WAITOK);
1131
1132 nc->nc_sqhd_valid = true;
1133 cc = CCAP(nc);
1134 cc->rx_pdu = *pdu;
1135
1136 nvmf_capsule_received(&qp->qp, nc);
1137 return (0);
1138 }
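/*
 * Summary of the host-side CID handling above (no new behavior): a
 * response CID with the freelist bit set is mapped back through
 * qp->fl_cids[] and its slot returned to fl_cid_set, while a DDP CID
 * indexes open_ddp_tags[] directly and the original CID is recovered
 * from the command buffer before the DDP tag and buffer are released.
 */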
1139
1140 /*
1141 * Construct a PDU that contains an optional data payload. This
1142 * includes dealing with the length fields in the common header. The
1143 * adapter inserts digests and padding when the PDU is transmitted.
1144 */
1145 static struct mbuf *
1146 nvmf_che_construct_pdu(struct nvmf_che_qpair *qp, void *hdr, size_t hlen,
1147 struct mbuf *data, uint32_t data_len)
1148 {
1149 struct nvme_tcp_common_pdu_hdr *ch;
1150 struct mbuf *top;
1151 uint32_t pdo, plen;
1152 uint8_t ulp_submode;
1153
1154 plen = hlen;
1155 if (qp->header_digests)
1156 plen += sizeof(uint32_t);
1157 if (data_len != 0) {
1158 KASSERT(m_length(data, NULL) == data_len, ("length mismatch"));
1159 pdo = roundup(plen, qp->txpda);
1160 plen = pdo + data_len;
1161 if (qp->data_digests)
1162 plen += sizeof(uint32_t);
1163 } else {
1164 KASSERT(data == NULL, ("payload mbuf with zero length"));
1165 pdo = 0;
1166 }
1167
1168 top = m_get2(hlen, M_WAITOK, MT_DATA, M_PKTHDR);
1169 top->m_len = hlen;
1170 top->m_pkthdr.len = hlen;
1171 ch = mtod(top, void *);
1172 memcpy(ch, hdr, hlen);
1173 ch->hlen = hlen;
1174 ulp_submode = 0;
1175 if (qp->header_digests) {
1176 ch->flags |= NVME_TCP_CH_FLAGS_HDGSTF;
1177 ulp_submode |= ULP_CRC_HEADER;
1178 }
1179 if (qp->data_digests && data_len != 0) {
1180 ch->flags |= NVME_TCP_CH_FLAGS_DDGSTF;
1181 ulp_submode |= ULP_CRC_DATA;
1182 }
1183 ch->pdo = pdo;
1184 ch->plen = htole32(plen);
1185 set_mbuf_ulp_submode(top, ulp_submode);
1186
1187 if (data_len != 0) {
1188 top->m_pkthdr.len += data_len;
1189 top->m_next = data;
1190 }
1191
1192 return (top);
1193 }
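/*
 * Worked length example (illustrative): with header and data digests
 * enabled, txpda == 8, hlen == 24, and data_len == 512, the header is
 * built with plen = 24 + 4 = 28, pdo = roundup(28, 8) = 32, and a
 * final plen of 32 + 512 + 4 = 548; the adapter inserts the digests
 * and padding on transmit.
 */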
1194
1195 /* Allocate the next free freelist transfer tag. */
1196 static bool
1197 nvmf_che_allocate_fl_ttag(struct nvmf_che_qpair *qp,
1198 struct nvmf_che_command_buffer *cb)
1199 {
1200 uint16_t ttag;
1201
1202 mtx_assert(&qp->rx_buffers.lock, MA_OWNED);
1203
1204 if (qp->active_fl_ttags == qp->num_fl_ttags)
1205 return (false);
1206
1207 ttag = qp->next_fl_ttag;
1208 for (;;) {
1209 if (qp->open_fl_ttags[ttag] == NULL)
1210 break;
1211 if (ttag == qp->num_fl_ttags - 1)
1212 ttag = 0;
1213 else
1214 ttag++;
1215 MPASS(ttag != qp->next_fl_ttag);
1216 }
1217 if (ttag == qp->num_fl_ttags - 1)
1218 qp->next_fl_ttag = 0;
1219 else
1220 qp->next_fl_ttag = ttag + 1;
1221
1222 qp->active_fl_ttags++;
1223 qp->open_fl_ttags[ttag] = cb;
1224
1225 cb->ttag = ttag | CHE_FL_TAG_MASK;
1226 return (true);
1227 }
1228
1229 /* Attempt to allocate a free transfer tag and assign it to cb. */
1230 static bool
1231 nvmf_che_allocate_ttag(struct nvmf_che_qpair *qp,
1232 struct nvmf_che_command_buffer *cb)
1233 {
1234 uint16_t stag;
1235
1236 mtx_assert(&qp->rx_buffers.lock, MA_OWNED);
1237
1238 stag = che_alloc_ddp_tag(qp, cb);
1239 if (stag == CHE_DDP_NO_TAG) {
1240 if (!nvmf_che_allocate_fl_ttag(qp, cb))
1241 return (false);
1242 } else {
1243 cb->ttag = stag;
1244 }
1245 #ifdef VERBOSE_TRACES
1246 CTR(KTR_CXGBE, "%s: tid %u allocated ttag 0x%04x", __func__,
1247 qp->toep->tid, cb->ttag);
1248 #endif
1249 cb->cc->active_r2ts++;
1250 return (true);
1251 }
1252
1253 /* Find the next command buffer eligible to schedule for R2T. */
1254 static struct nvmf_che_command_buffer *
1255 nvmf_che_next_r2t(struct nvmf_che_qpair *qp)
1256 {
1257 struct nvmf_che_command_buffer *cb;
1258
1259 mtx_assert(&qp->rx_buffers.lock, MA_OWNED);
1260
1261 TAILQ_FOREACH(cb, &qp->rx_buffers.head, link) {
1262 /* NB: maxr2t is 0's based. */
1263 if (cb->cc->active_r2ts > qp->maxr2t)
1264 continue;
1265
1266 if (!nvmf_che_allocate_ttag(qp, cb))
1267 return (NULL);
1268 #ifdef INVARIANTS
1269 cb->cc->pending_r2ts--;
1270 #endif
1271 TAILQ_REMOVE(&qp->rx_buffers.head, cb, link);
1272 return (cb);
1273 }
1274 return (NULL);
1275 }
1276
1277 /* NB: cid is little-endian already. */
1278 static void
1279 che_send_r2t(struct nvmf_che_qpair *qp, uint16_t cid, uint16_t ttag,
1280 uint32_t data_offset, uint32_t data_len)
1281 {
1282 struct nvme_tcp_r2t_hdr r2t;
1283 struct mbuf *m;
1284
1285 memset(&r2t, 0, sizeof(r2t));
1286 r2t.common.pdu_type = NVME_TCP_PDU_TYPE_R2T;
1287 r2t.cccid = cid;
1288 r2t.ttag = htole16(ttag);
1289 r2t.r2to = htole32(data_offset);
1290 r2t.r2tl = htole32(data_len);
1291
1292 m = nvmf_che_construct_pdu(qp, &r2t, sizeof(r2t), NULL, 0);
1293 nvmf_che_write_pdu(qp, m);
1294 }
1295
1296 /*
1297 * Release a transfer tag and schedule another R2T.
1298 *
1299 * NB: This drops the rx_buffers.lock mutex.
1300 */
1301 static void
1302 nvmf_che_send_next_r2t(struct nvmf_che_qpair *qp,
1303 struct nvmf_che_command_buffer *cb)
1304 {
1305 struct nvmf_che_command_buffer *ncb;
1306
1307 mtx_assert(&qp->rx_buffers.lock, MA_OWNED);
1308
1309 #ifdef VERBOSE_TRACES
1310 CTR(KTR_CXGBE, "%s: tid %u freed ttag 0x%04x", __func__, qp->toep->tid,
1311 cb->ttag);
1312 #endif
1313 if (CHE_TAG_IS_FL(cb->ttag)) {
1314 uint16_t ttag;
1315
1316 ttag = CHE_RAW_FL_TAG(cb->ttag);
1317 MPASS(qp->open_fl_ttags[ttag] == cb);
1318
1319 /* Release this transfer tag. */
1320 qp->open_fl_ttags[ttag] = NULL;
1321 qp->active_fl_ttags--;
1322 } else
1323 che_free_ddp_tag(qp, cb, cb->ttag);
1324
1325 cb->cc->active_r2ts--;
1326
1327 /* Schedule another R2T. */
1328 ncb = nvmf_che_next_r2t(qp);
1329 mtx_unlock(&qp->rx_buffers.lock);
1330 if (ncb != NULL)
1331 che_send_r2t(qp, ncb->cid, ncb->ttag, ncb->data_offset,
1332 ncb->data_len);
1333 }
1334
1335 /*
1336 * Copy len bytes starting at offset skip from an mbuf chain into an
1337 * I/O buffer at destination offset io_offset.
1338 */
1339 static void
1340 mbuf_copyto_io(struct mbuf *m, u_int skip, u_int len,
1341 struct nvmf_io_request *io, u_int io_offset)
1342 {
1343 u_int todo;
1344
1345 while (m->m_len <= skip) {
1346 skip -= m->m_len;
1347 m = m->m_next;
1348 }
1349 while (len != 0) {
1350 MPASS((m->m_flags & M_EXTPG) == 0);
1351
1352 todo = min(m->m_len - skip, len);
1353 memdesc_copyback(&io->io_mem, io_offset, todo, mtodo(m, skip));
1354 skip = 0;
1355 io_offset += todo;
1356 len -= todo;
1357 m = m->m_next;
1358 }
1359 }
1360
1361 static int
1362 nvmf_che_handle_h2c_data(struct nvmf_che_qpair *qp, struct nvmf_che_rxpdu *pdu)
1363 {
1364 const struct nvme_tcp_h2c_data_hdr *h2c;
1365 struct nvmf_che_command_buffer *cb;
1366 uint32_t data_len, data_offset;
1367 uint16_t ttag, fl_ttag;
1368
1369 h2c = (const void *)pdu->hdr;
1370 if (le32toh(h2c->datal) > qp->maxh2cdata) {
1371 nvmf_che_report_error(qp,
1372 NVME_TCP_TERM_REQ_FES_DATA_TRANSFER_LIMIT_EXCEEDED, 0,
1373 pdu->m, pdu->hdr->hlen);
1374 nvmf_che_free_pdu(pdu);
1375 return (EBADMSG);
1376 }
1377
1378 ttag = le16toh(h2c->ttag);
1379 if (CHE_TAG_IS_FL(ttag)) {
1380 fl_ttag = CHE_RAW_FL_TAG(ttag);
1381 if (fl_ttag >= qp->num_fl_ttags) {
1382 nvmf_che_report_error(qp,
1383 NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD,
1384 offsetof(struct nvme_tcp_h2c_data_hdr, ttag),
1385 pdu->m, pdu->hdr->hlen);
1386 nvmf_che_free_pdu(pdu);
1387 return (EBADMSG);
1388 }
1389
1390 mtx_lock(&qp->rx_buffers.lock);
1391 cb = qp->open_fl_ttags[fl_ttag];
1392 } else {
1393 if (CHE_STAG_IDX(ttag) >= qp->num_ddp_tags) {
1394 nvmf_che_report_error(qp,
1395 NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD,
1396 offsetof(struct nvme_tcp_h2c_data_hdr, ttag),
1397 pdu->m, pdu->hdr->hlen);
1398 nvmf_che_free_pdu(pdu);
1399 return (EBADMSG);
1400 }
1401
1402 mtx_lock(&qp->rx_buffers.lock);
1403 cb = qp->open_ddp_tags[CHE_STAG_IDX(ttag)];
1404 }
1405
1406 if (cb == NULL) {
1407 mtx_unlock(&qp->rx_buffers.lock);
1408 nvmf_che_report_error(qp,
1409 NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD,
1410 offsetof(struct nvme_tcp_h2c_data_hdr, ttag), pdu->m,
1411 pdu->hdr->hlen);
1412 nvmf_che_free_pdu(pdu);
1413 return (EBADMSG);
1414 }
1415 MPASS(cb->ttag == ttag);
1416
1417 /* For a data digest mismatch, fail the I/O request. */
1418 if (pdu->data_digest_mismatch) {
1419 nvmf_che_send_next_r2t(qp, cb);
1420 cb->error = EINTEGRITY;
1421 che_release_command_buffer(cb);
1422 nvmf_che_free_pdu(pdu);
1423 return (0);
1424 }
1425
1426 data_len = le32toh(h2c->datal);
1427 if (data_len != pdu->data_len) {
1428 mtx_unlock(&qp->rx_buffers.lock);
1429 nvmf_che_report_error(qp,
1430 NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD,
1431 offsetof(struct nvme_tcp_h2c_data_hdr, datal), pdu->m,
1432 pdu->hdr->hlen);
1433 nvmf_che_free_pdu(pdu);
1434 return (EBADMSG);
1435 }
1436
1437 data_offset = le32toh(h2c->datao);
1438 if (data_offset < cb->data_offset ||
1439 data_offset + data_len > cb->data_offset + cb->data_len) {
1440 mtx_unlock(&qp->rx_buffers.lock);
1441 nvmf_che_report_error(qp,
1442 NVME_TCP_TERM_REQ_FES_DATA_TRANSFER_OUT_OF_RANGE, 0, pdu->m,
1443 pdu->hdr->hlen);
1444 nvmf_che_free_pdu(pdu);
1445 return (EBADMSG);
1446 }
1447
1448 if (data_offset != cb->data_offset + cb->data_xfered) {
1449 if (CHE_TAG_IS_FL(ttag)) {
1450 mtx_unlock(&qp->rx_buffers.lock);
1451 nvmf_che_report_error(qp,
1452 NVME_TCP_TERM_REQ_FES_PDU_SEQUENCE_ERROR, 0, pdu->m,
1453 pdu->hdr->hlen);
1454 nvmf_che_free_pdu(pdu);
1455 return (EBADMSG);
1456 } else {
1457 uint32_t ddp_bytes;
1458
1459 /* Account for PDUs silently received via DDP. */
1460 ddp_bytes = data_offset -
1461 (cb->data_offset + cb->data_xfered);
1462 cb->data_xfered += ddp_bytes;
1463 #ifdef VERBOSE_TRACES
1464 CTR(KTR_CXGBE, "%s: tid %u previous ddp_bytes %u",
1465 __func__, qp->toep->tid, ddp_bytes);
1466 #endif
1467 counter_u64_add(qp->toep->ofld_rxq->rx_nvme_ddp_octets,
1468 ddp_bytes);
1469 }
1470 }
1471
1472 if ((cb->data_xfered + data_len == cb->data_len) !=
1473 ((pdu->hdr->flags & NVME_TCP_H2C_DATA_FLAGS_LAST_PDU) != 0)) {
1474 mtx_unlock(&qp->rx_buffers.lock);
1475 nvmf_che_report_error(qp,
1476 NVME_TCP_TERM_REQ_FES_PDU_SEQUENCE_ERROR, 0, pdu->m,
1477 pdu->hdr->hlen);
1478 nvmf_che_free_pdu(pdu);
1479 return (EBADMSG);
1480 }
1481
1482 cb->data_xfered += data_len;
1483 data_offset -= cb->data_offset;
1484 if (cb->data_xfered == cb->data_len) {
1485 nvmf_che_send_next_r2t(qp, cb);
1486 } else {
1487 che_hold_command_buffer(cb);
1488 mtx_unlock(&qp->rx_buffers.lock);
1489 }
1490
1491 if (CHE_TAG_IS_FL(ttag))
1492 mbuf_copyto_io(pdu->m->m_next, 0, data_len, &cb->io,
1493 data_offset);
1494
1495 che_release_command_buffer(cb);
1496 nvmf_che_free_pdu(pdu);
1497 return (0);
1498 }
1499
1500 static int
1501 nvmf_che_handle_c2h_data(struct nvmf_che_qpair *qp, struct nvmf_che_rxpdu *pdu)
1502 {
1503 const struct nvme_tcp_c2h_data_hdr *c2h;
1504 struct nvmf_che_command_buffer *cb;
1505 uint32_t data_len, data_offset;
1506 uint16_t cid, original_cid;
1507
1508 /*
1509 * Unlike freelist command buffers, DDP command buffers are
1510 * not released until the response capsule is received to keep
1511 * the STAG allocated until the command has completed.
1512 */
1513 c2h = (const void *)pdu->hdr;
1514
1515 cid = le16toh(c2h->cccid);
1516 if (CHE_TAG_IS_FL(cid)) {
1517 mtx_lock(&qp->rx_buffers.lock);
1518 cb = che_find_command_buffer(&qp->rx_buffers, c2h->cccid);
1519 } else {
1520 if (CHE_STAG_IDX(cid) >= qp->num_ddp_tags) {
1521 nvmf_che_report_error(qp,
1522 NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD,
1523 offsetof(struct nvme_tcp_c2h_data_hdr, cccid),
1524 pdu->m, pdu->hdr->hlen);
1525 nvmf_che_free_pdu(pdu);
1526 return (EBADMSG);
1527 }
1528
1529 mtx_lock(&qp->rx_buffers.lock);
1530 cb = qp->open_ddp_tags[CHE_STAG_IDX(cid)];
1531 }
1532
1533 if (cb == NULL) {
1534 mtx_unlock(&qp->rx_buffers.lock);
1535 /*
1536 * XXX: Could be PDU sequence error if cccid is for a
1537 * command that doesn't use a command buffer.
1538 */
1539 nvmf_che_report_error(qp,
1540 NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD,
1541 offsetof(struct nvme_tcp_c2h_data_hdr, cccid), pdu->m,
1542 pdu->hdr->hlen);
1543 nvmf_che_free_pdu(pdu);
1544 return (EBADMSG);
1545 }
1546
1547 /* For a data digest mismatch, fail the I/O request. */
1548 if (pdu->data_digest_mismatch) {
1549 cb->error = EINTEGRITY;
1550 if (CHE_TAG_IS_FL(cid)) {
1551 che_remove_command_buffer(&qp->rx_buffers, cb);
1552 mtx_unlock(&qp->rx_buffers.lock);
1553 che_release_command_buffer(cb);
1554 } else
1555 mtx_unlock(&qp->rx_buffers.lock);
1556 nvmf_che_free_pdu(pdu);
1557 return (0);
1558 }
1559
1560 data_len = le32toh(c2h->datal);
1561 if (data_len != pdu->data_len) {
1562 mtx_unlock(&qp->rx_buffers.lock);
1563 nvmf_che_report_error(qp,
1564 NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD,
1565 offsetof(struct nvme_tcp_c2h_data_hdr, datal), pdu->m,
1566 pdu->hdr->hlen);
1567 nvmf_che_free_pdu(pdu);
1568 return (EBADMSG);
1569 }
1570
1571 data_offset = le32toh(c2h->datao);
1572 if (data_offset < cb->data_offset ||
1573 data_offset + data_len > cb->data_offset + cb->data_len) {
1574 mtx_unlock(&qp->rx_buffers.lock);
1575 nvmf_che_report_error(qp,
1576 NVME_TCP_TERM_REQ_FES_DATA_TRANSFER_OUT_OF_RANGE, 0,
1577 pdu->m, pdu->hdr->hlen);
1578 nvmf_che_free_pdu(pdu);
1579 return (EBADMSG);
1580 }
1581
1582 if (data_offset != cb->data_offset + cb->data_xfered) {
1583 if (CHE_TAG_IS_FL(cid)) {
1584 mtx_unlock(&qp->rx_buffers.lock);
1585 nvmf_che_report_error(qp,
1586 NVME_TCP_TERM_REQ_FES_PDU_SEQUENCE_ERROR, 0, pdu->m,
1587 pdu->hdr->hlen);
1588 nvmf_che_free_pdu(pdu);
1589 return (EBADMSG);
1590 } else {
1591 uint32_t ddp_bytes;
1592
1593 /* Account for PDUs silently received via DDP. */
1594 ddp_bytes = data_offset -
1595 (cb->data_offset + cb->data_xfered);
1596 cb->data_xfered += ddp_bytes;
1597 #ifdef VERBOSE_TRACES
1598 CTR(KTR_CXGBE, "%s: tid %u previous ddp_bytes %u",
1599 __func__, qp->toep->tid, ddp_bytes);
1600 #endif
1601 counter_u64_add(qp->toep->ofld_rxq->rx_nvme_ddp_octets,
1602 ddp_bytes);
1603 }
1604 }
1605
1606 if ((cb->data_xfered + data_len == cb->data_len) !=
1607 ((pdu->hdr->flags & NVME_TCP_C2H_DATA_FLAGS_LAST_PDU) != 0)) {
1608 mtx_unlock(&qp->rx_buffers.lock);
1609 nvmf_che_report_error(qp,
1610 NVME_TCP_TERM_REQ_FES_PDU_SEQUENCE_ERROR, 0, pdu->m,
1611 pdu->hdr->hlen);
1612 nvmf_che_free_pdu(pdu);
1613 return (EBADMSG);
1614 }
1615
1616 cb->data_xfered += data_len;
1617 original_cid = cb->original_cid;
1618
1619 if (CHE_TAG_IS_FL(cid)) {
1620 data_offset -= cb->data_offset;
1621 if (cb->data_xfered == cb->data_len)
1622 che_remove_command_buffer(&qp->rx_buffers, cb);
1623 else
1624 che_hold_command_buffer(cb);
1625 mtx_unlock(&qp->rx_buffers.lock);
1626
1627 if ((pdu->hdr->flags & NVME_TCP_C2H_DATA_FLAGS_SUCCESS) != 0) {
1628 /*
1629 * Free the CID as the command has now been
1630 * completed.
1631 */
1632 cid = CHE_RAW_FL_TAG(cid);
1633 mtx_lock(&qp->fl_cid_lock);
1634 MPASS(FL_CID_ISACTIVE(cid, qp->fl_cid_set));
1635 MPASS(original_cid == qp->fl_cids[cid]);
1636 FL_CID_FREE(cid, qp->fl_cid_set);
1637 mtx_unlock(&qp->fl_cid_lock);
1638 }
1639
1640 mbuf_copyto_io(pdu->m->m_next, 0, data_len, &cb->io,
1641 data_offset);
1642
1643 che_release_command_buffer(cb);
1644 } else {
1645 if ((pdu->hdr->flags & NVME_TCP_C2H_DATA_FLAGS_SUCCESS) != 0) {
1646 /*
1647 * Free the command buffer and STAG as the
1648 * command has now been completed.
1649 */
1650 che_free_ddp_tag(qp, cb, cid);
1651 mtx_unlock(&qp->rx_buffers.lock);
1652 che_release_command_buffer(cb);
1653 } else
1654 mtx_unlock(&qp->rx_buffers.lock);
1655 }
1656
1657 if ((pdu->hdr->flags & NVME_TCP_C2H_DATA_FLAGS_SUCCESS) != 0) {
1658 struct nvme_completion cqe;
1659 struct nvmf_capsule *nc;
1660
1661 memset(&cqe, 0, sizeof(cqe));
1662 cqe.cid = original_cid;
1663
1664 nc = nvmf_allocate_response(&qp->qp, &cqe, M_WAITOK);
1665 nc->nc_sqhd_valid = false;
1666
1667 nvmf_capsule_received(&qp->qp, nc);
1668 }
1669
1670 nvmf_che_free_pdu(pdu);
1671 return (0);
1672 }
1673
1674 /* Called when m_free drops refcount to 0. */
1675 static void
1676 nvmf_che_mbuf_done(struct mbuf *m)
1677 {
1678 struct nvmf_che_command_buffer *cb = m->m_ext.ext_arg1;
1679
1680 che_free_command_buffer(cb);
1681 }
1682
1683 static struct mbuf *
1684 nvmf_che_mbuf(void *arg, int how, void *data, size_t len)
1685 {
1686 struct nvmf_che_command_buffer *cb = arg;
1687 struct mbuf *m;
1688
1689 m = m_get(how, MT_DATA);
1690 m->m_flags |= M_RDONLY;
1691 m_extaddref(m, data, len, &cb->refs, nvmf_che_mbuf_done, cb, NULL);
1692 m->m_len = len;
1693 return (m);
1694 }
1695
1696 static void
1697 nvmf_che_free_mext_pg(struct mbuf *m)
1698 {
1699 struct nvmf_che_command_buffer *cb = m->m_ext.ext_arg1;
1700
1701 M_ASSERTEXTPG(m);
1702 che_release_command_buffer(cb);
1703 }
1704
1705 static struct mbuf *
1706 nvmf_che_mext_pg(void *arg, int how)
1707 {
1708 struct nvmf_che_command_buffer *cb = arg;
1709 struct mbuf *m;
1710
1711 m = mb_alloc_ext_pgs(how, nvmf_che_free_mext_pg, M_RDONLY);
1712 m->m_ext.ext_arg1 = cb;
1713 che_hold_command_buffer(cb);
1714 return (m);
1715 }
1716
1717 /*
1718 * Return an mbuf chain for a range of data belonging to a command
1719 * buffer.
1720 *
1721 * The mbuf chain uses M_EXT mbufs which hold references on the
1722 * command buffer so that it remains "alive" until the data has been
1723 * fully transmitted. If can_truncate is true, then the function
1724 * might return a short chain to avoid gratuitously splitting up a
1725 * page.
1726 */
1727 static struct mbuf *
1728 nvmf_che_command_buffer_mbuf(struct nvmf_che_command_buffer *cb,
1729 uint32_t data_offset, uint32_t data_len, uint32_t *actual_len,
1730 bool can_truncate)
1731 {
1732 struct mbuf *m;
1733 size_t len;
1734
1735 m = memdesc_alloc_ext_mbufs(&cb->io.io_mem, nvmf_che_mbuf,
1736 nvmf_che_mext_pg, cb, M_WAITOK, data_offset, data_len, &len,
1737 can_truncate);
1738 if (actual_len != NULL)
1739 *actual_len = len;
1740 return (m);
1741 }
1742
1743 /* NB: cid and ttag are little-endian already. */
1744 static void
1745 che_send_h2c_pdu(struct nvmf_che_qpair *qp, uint16_t cid, uint16_t ttag,
1746 uint32_t data_offset, struct mbuf *m, size_t len, bool last_pdu)
1747 {
1748 struct nvme_tcp_h2c_data_hdr h2c;
1749 struct mbuf *top;
1750
1751 memset(&h2c, 0, sizeof(h2c));
1752 h2c.common.pdu_type = NVME_TCP_PDU_TYPE_H2C_DATA;
1753 if (last_pdu)
1754 h2c.common.flags |= NVME_TCP_H2C_DATA_FLAGS_LAST_PDU;
1755 h2c.cccid = cid;
1756 h2c.ttag = ttag;
1757 h2c.datao = htole32(data_offset);
1758 h2c.datal = htole32(len);
1759
1760 top = nvmf_che_construct_pdu(qp, &h2c, sizeof(h2c), m, len);
1761 nvmf_che_write_pdu(qp, top);
1762 }
1763
1764 static int
1765 nvmf_che_handle_r2t(struct nvmf_che_qpair *qp, struct nvmf_che_rxpdu *pdu)
1766 {
1767 const struct nvme_tcp_r2t_hdr *r2t;
1768 struct nvmf_che_command_buffer *cb;
1769 uint32_t data_len, data_offset;
1770
1771 r2t = (const void *)pdu->hdr;
1772
1773 mtx_lock(&qp->tx_buffers.lock);
1774 cb = che_find_command_buffer(&qp->tx_buffers, r2t->cccid);
1775 if (cb == NULL) {
1776 mtx_unlock(&qp->tx_buffers.lock);
1777 nvmf_che_report_error(qp,
1778 NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD,
1779 offsetof(struct nvme_tcp_r2t_hdr, cccid), pdu->m,
1780 pdu->hdr->hlen);
1781 nvmf_che_free_pdu(pdu);
1782 return (EBADMSG);
1783 }
1784
1785 data_offset = le32toh(r2t->r2to);
1786 if (data_offset != cb->data_xfered) {
1787 mtx_unlock(&qp->tx_buffers.lock);
1788 nvmf_che_report_error(qp,
1789 NVME_TCP_TERM_REQ_FES_PDU_SEQUENCE_ERROR, 0, pdu->m,
1790 pdu->hdr->hlen);
1791 nvmf_che_free_pdu(pdu);
1792 return (EBADMSG);
1793 }
1794
1795 /*
1796 * XXX: The spec does not specify how to handle R2T transfers
1797 * out of range of the original command.
1798 */
1799 data_len = le32toh(r2t->r2tl);
1800 if (data_offset + data_len > cb->data_len) {
1801 mtx_unlock(&qp->tx_buffers.lock);
1802 nvmf_che_report_error(qp,
1803 NVME_TCP_TERM_REQ_FES_DATA_TRANSFER_OUT_OF_RANGE, 0,
1804 pdu->m, pdu->hdr->hlen);
1805 nvmf_che_free_pdu(pdu);
1806 return (EBADMSG);
1807 }
1808
1809 cb->data_xfered += data_len;
1810 if (cb->data_xfered == cb->data_len)
1811 che_remove_command_buffer(&qp->tx_buffers, cb);
1812 else
1813 che_hold_command_buffer(cb);
1814 mtx_unlock(&qp->tx_buffers.lock);
1815
1816 /*
1817 * Queue one or more H2C_DATA PDUs containing the requested
1818 * data.
1819 */
1820 while (data_len > 0) {
1821 struct mbuf *m;
1822 uint32_t sent, todo;
1823
1824 todo = min(data_len, qp->max_tx_data);
1825 m = nvmf_che_command_buffer_mbuf(cb, data_offset, todo, &sent,
1826 todo < data_len);
1827 che_send_h2c_pdu(qp, r2t->cccid, r2t->ttag, data_offset, m,
1828 sent, sent == data_len);
1829
1830 data_offset += sent;
1831 data_len -= sent;
1832 }
1833
1834 che_release_command_buffer(cb);
1835 nvmf_che_free_pdu(pdu);
1836 return (0);
1837 }
1838
1839 static int
1840 nvmf_che_dispatch_pdu(struct nvmf_che_qpair *qp, struct nvmf_che_rxpdu *pdu)
1841 {
1842 /*
1843 * The PDU header should always be contiguous in the mbuf from
1844 * CPL_NVMT_CMP.
1845 */
1846 pdu->hdr = mtod(pdu->m, void *);
1847 KASSERT(pdu->m->m_len == pdu->hdr->hlen +
1848 ((pdu->hdr->flags & NVME_TCP_CH_FLAGS_HDGSTF) != 0 ?
1849 sizeof(uint32_t) : 0),
1850 ("%s: mismatched PDU header mbuf length", __func__));
1851
1852 switch (pdu->hdr->pdu_type) {
1853 default:
1854 __assert_unreachable();
1855 break;
1856 case NVME_TCP_PDU_TYPE_H2C_TERM_REQ:
1857 case NVME_TCP_PDU_TYPE_C2H_TERM_REQ:
1858 return (nvmf_che_handle_term_req(pdu));
1859 case NVME_TCP_PDU_TYPE_CAPSULE_CMD:
1860 return (nvmf_che_save_command_capsule(qp, pdu));
1861 case NVME_TCP_PDU_TYPE_CAPSULE_RESP:
1862 return (nvmf_che_save_response_capsule(qp, pdu));
1863 case NVME_TCP_PDU_TYPE_H2C_DATA:
1864 return (nvmf_che_handle_h2c_data(qp, pdu));
1865 case NVME_TCP_PDU_TYPE_C2H_DATA:
1866 return (nvmf_che_handle_c2h_data(qp, pdu));
1867 case NVME_TCP_PDU_TYPE_R2T:
1868 return (nvmf_che_handle_r2t(qp, pdu));
1869 }
1870 }
1871
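/*
 * Attach the data payload to a received PDU.  For PDUs whose data was
 * placed via DDP, only the statistics counters need updating.
 * Otherwise, dequeue the CPL_NVMT_DATA mbufs matching this PDU's TCP
 * sequence range from rx_data and chain them after the header mbuf.
 */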
1872 static int
1873 nvmf_che_attach_pdu_data(struct nvmf_che_qpair *qp, struct nvmf_che_rxpdu *pdu)
1874 {
1875 struct socket *so = qp->so;
1876 struct mbuf *m, *n;
1877 uint32_t tcp_seq;
1878 size_t len;
1879 int error;
1880
1881 /* Check for DDP data. */
1882 if (pdu->ddp) {
1883 counter_u64_add(qp->toep->ofld_rxq->rx_nvme_ddp_pdus, 1);
1884 counter_u64_add(qp->toep->ofld_rxq->rx_nvme_ddp_octets,
1885 pdu->data_len);
1886 return (0);
1887 }
1888
1889 error = 0;
1890 len = pdu->data_len;
1891 tcp_seq = pdu->m->m_pkthdr.nvmf_tcp_seq;
1892 m = pdu->m;
1893 SOCKBUF_LOCK(&so->so_rcv);
1894 while (len > 0) {
1895 n = mbufq_dequeue(&qp->rx_data);
1896 KASSERT(n != NULL, ("%s: missing %zu data", __func__, len));
1897 if (n == NULL) {
1898 error = ENOBUFS;
1899 break;
1900 }
1901
1902 KASSERT(n->m_pkthdr.nvmf_tcp_seq == tcp_seq,
1903 ("%s: TCP seq mismatch", __func__));
1904 KASSERT(n->m_pkthdr.len <= len,
1905 ("%s: too much data", __func__));
1906 if (n->m_pkthdr.nvmf_tcp_seq != tcp_seq ||
1907 n->m_pkthdr.len > len) {
1908 m_freem(n);
1909 error = ENOBUFS;
1910 break;
1911 }
1912
1913 #ifdef VERBOSE_TRACES
1914 CTR(KTR_CXGBE, "%s: tid %u len %d seq %u", __func__,
1915 qp->toep->tid, n->m_pkthdr.len, n->m_pkthdr.nvmf_tcp_seq);
1916 #endif
1917 pdu->m->m_pkthdr.len += n->m_pkthdr.len;
1918 len -= n->m_pkthdr.len;
1919 tcp_seq += n->m_pkthdr.len;
1920 m_demote_pkthdr(n);
1921 m->m_next = n;
1922 m = m_last(n);
1923 }
1924 SOCKBUF_UNLOCK(&so->so_rcv);
1925
1926 if (error == 0) {
1927 counter_u64_add(qp->toep->ofld_rxq->rx_nvme_fl_pdus, 1);
1928 counter_u64_add(qp->toep->ofld_rxq->rx_nvme_fl_octets,
1929 pdu->data_len);
1930 }
1931 return (error);
1932 }
1933
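/*
 * Receive kthread: pull received PDUs off rx_pdus, validate them,
 * attach any associated payload data, and dispatch them.  On a fatal
 * error, report it to the nvmf layer and wait for the qpair to be
 * shut down.
 */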
1934 static void
1935 nvmf_che_receive(void *arg)
1936 {
1937 struct nvmf_che_qpair *qp = arg;
1938 struct socket *so = qp->so;
1939 struct nvmf_che_rxpdu pdu;
1940 struct mbuf *m;
1941 int error, terror;
1942
1943 SOCKBUF_LOCK(&so->so_rcv);
1944 while (!qp->rx_shutdown) {
1945 /* Wait for a PDU. */
1946 if (so->so_error != 0 || so->so_rerror != 0) {
1947 if (so->so_error != 0)
1948 error = so->so_error;
1949 else
1950 error = so->so_rerror;
1951 SOCKBUF_UNLOCK(&so->so_rcv);
1952 error:
1953 nvmf_qpair_error(&qp->qp, error);
1954 SOCKBUF_LOCK(&so->so_rcv);
1955 while (!qp->rx_shutdown)
1956 cv_wait(&qp->rx_cv, SOCKBUF_MTX(&so->so_rcv));
1957 break;
1958 }
1959
1960 m = mbufq_dequeue(&qp->rx_pdus);
1961 if (m == NULL) {
1962 if ((so->so_rcv.sb_state & SBS_CANTRCVMORE) != 0) {
1963 error = 0;
1964 SOCKBUF_UNLOCK(&so->so_rcv);
1965 goto error;
1966 }
1967 cv_wait(&qp->rx_cv, SOCKBUF_MTX(&so->so_rcv));
1968 continue;
1969 }
1970 SOCKBUF_UNLOCK(&so->so_rcv);
1971
1972 pdu.m = m;
1973 pdu.hdr = mtod(m, const void *);
1974 pdu.ddp = (m->m_pkthdr.nvmf_cpl_status & CMP_STATUS_DDP) != 0;
1975
1976 error = nvmf_che_validate_pdu(qp, &pdu);
1977 if (error == 0 && pdu.data_len != 0)
1978 error = nvmf_che_attach_pdu_data(qp, &pdu);
1979 if (error != 0)
1980 nvmf_che_free_pdu(&pdu);
1981 else
1982 error = nvmf_che_dispatch_pdu(qp, &pdu);
1983 if (error != 0) {
1984 /*
1985 * If we received a termination request, close
1986 * the connection immediately.
1987 */
1988 if (error == ECONNRESET)
1989 goto error;
1990
1991 /*
1992 * Wait for up to 30 seconds for the socket to
1993 * be closed by the other end.
1994 */
1995 SOCKBUF_LOCK(&so->so_rcv);
1996 if ((so->so_rcv.sb_state & SBS_CANTRCVMORE) == 0) {
1997 terror = cv_timedwait(&qp->rx_cv,
1998 SOCKBUF_MTX(&so->so_rcv), 30 * hz);
1999 if (terror == ETIMEDOUT)
2000 printf("NVMe/TCP: Timed out after sending terminate request\n");
2001 }
2002 SOCKBUF_UNLOCK(&so->so_rcv);
2003 goto error;
2004 }
2005
2006 SOCKBUF_LOCK(&so->so_rcv);
2007 }
2008 SOCKBUF_UNLOCK(&so->so_rcv);
2009 kthread_exit();
2010 }
2011
2012 static int
2013 nvmf_che_soupcall_receive(struct socket *so, void *arg, int waitflag)
2014 {
2015 struct nvmf_che_qpair *qp = arg;
2016
2017 cv_signal(&qp->rx_cv);
2018 return (SU_OK);
2019 }
2020
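/*
 * CPL_NVMT_DATA delivers received PDU payload data.  Tag the mbuf
 * with the starting TCP sequence number from the CPL and queue it on
 * the qpair's rx_data queue to be paired with the PDU header from a
 * subsequent CPL_NVMT_CMP.
 */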
2021 static int
2022 do_nvmt_data(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
2023 {
2024 struct adapter *sc = iq->adapter;
2025 struct nvmf_che_adapter *nca = sc->nvme_ulp_softc;
2026 const struct cpl_nvmt_data *cpl;
2027 u_int tid;
2028 struct toepcb *toep;
2029 struct nvmf_che_qpair *qp;
2030 struct socket *so;
2031 struct inpcb *inp;
2032 struct tcpcb *tp;
2033 int len __diagused;
2034
2035 if (nca->nvmt_data_iqe) {
2036 cpl = (const void *)(rss + 1);
2037 } else {
2038 cpl = mtod(m, const void *);
2039
2040 /* strip off CPL header */
2041 m_adj(m, sizeof(*cpl));
2042 }
2043 tid = GET_TID(cpl);
2044 toep = lookup_tid(sc, tid);
2045
2046 KASSERT(toep->tid == tid, ("%s: toep tid/atid mismatch", __func__));
2047
2048 len = m->m_pkthdr.len;
2049
2050 KASSERT(len == be16toh(cpl->length),
2051 ("%s: payload length mismatch", __func__));
2052
2053 inp = toep->inp;
2054 INP_WLOCK(inp);
2055 if (inp->inp_flags & INP_DROPPED) {
2056 CTR(KTR_CXGBE, "%s: tid %u, rx (%d bytes), inp_flags 0x%x",
2057 __func__, tid, len, inp->inp_flags);
2058 INP_WUNLOCK(inp);
2059 m_freem(m);
2060 return (0);
2061 }
2062
2063 /* Save TCP sequence number. */
2064 m->m_pkthdr.nvmf_tcp_seq = be32toh(cpl->seq);
2065
2066 qp = toep->ulpcb;
2067 so = qp->so;
2068 SOCKBUF_LOCK(&so->so_rcv);
2069 mbufq_enqueue(&qp->rx_data, m);
2070 SOCKBUF_UNLOCK(&so->so_rcv);
2071
2072 tp = intotcpcb(inp);
2073 tp->t_rcvtime = ticks;
2074
2075 #ifdef VERBOSE_TRACES
2076 CTR(KTR_CXGBE, "%s: tid %u len %d seq %u", __func__, tid, len,
2077 be32toh(cpl->seq));
2078 #endif
2079
2080 INP_WUNLOCK(inp);
2081 return (0);
2082 }
2083
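/*
 * CPL_NVMT_CMP carries a received PDU header along with a completion
 * status.  Save the TCP sequence number and CPL status in the mbuf
 * packet header, queue the mbuf on rx_pdus, and wake the receive
 * thread if the queue was previously empty.
 */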
2084 static int
2085 do_nvmt_cmp(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
2086 {
2087 struct adapter *sc = iq->adapter;
2088 const struct cpl_nvmt_cmp *cpl = mtod(m, const void *);
2089 u_int tid = GET_TID(cpl);
2090 struct toepcb *toep = lookup_tid(sc, tid);
2091 struct nvmf_che_qpair *qp = toep->ulpcb;
2092 struct socket *so = qp->so;
2093 struct inpcb *inp = toep->inp;
2094 u_int hlen __diagused;
2095 bool empty;
2096
2097 KASSERT(toep->tid == tid, ("%s: toep tid/atid mismatch", __func__));
2098 KASSERT(!(toep->flags & TPF_SYNQE),
2099 ("%s: toep %p claims to be a synq entry", __func__, toep));
2100
2101 /* strip off CPL header */
2102 m_adj(m, sizeof(*cpl));
2103 hlen = m->m_pkthdr.len;
2104
2105 KASSERT(hlen == be16toh(cpl->length),
2106 ("%s: payload length mismatch", __func__));
2107
2108 INP_WLOCK(inp);
2109 if (inp->inp_flags & INP_DROPPED) {
2110 CTR(KTR_CXGBE, "%s: tid %u, rx (hlen %u), inp_flags 0x%x",
2111 __func__, tid, hlen, inp->inp_flags);
2112 INP_WUNLOCK(inp);
2113 m_freem(m);
2114 return (0);
2115 }
2116
2117 #ifdef VERBOSE_TRACES
2118 CTR(KTR_CXGBE, "%s: tid %u hlen %u seq %u status %u", __func__, tid,
2119 hlen, be32toh(cpl->seq), cpl->status);
2120 #endif
2121
2122 /* Save TCP sequence number and CPL status. */
2123 m->m_pkthdr.nvmf_tcp_seq = be32toh(cpl->seq);
2124 m->m_pkthdr.nvmf_cpl_status = cpl->status;
2125
2126 SOCKBUF_LOCK(&so->so_rcv);
2127 empty = mbufq_len(&qp->rx_pdus) == 0;
2128 mbufq_enqueue(&qp->rx_pdus, m);
2129 SOCKBUF_UNLOCK(&so->so_rcv);
2130 INP_WUNLOCK(inp);
2131 if (empty)
2132 cv_signal(&qp->rx_cv);
2133 return (0);
2134 }
2135
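/*
 * Allocate a "freelist" (non-DDP) command id.  The returned value has
 * the MSB set and indexes fl_cids[], which records the original CID
 * supplied by the caller.
 */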
2136 static uint16_t
2137 che_alloc_fl_cid(struct nvmf_che_qpair *qp, uint16_t original_cid)
2138 {
2139 uint16_t new_cid;
2140
2141 mtx_lock(&qp->fl_cid_lock);
2142 new_cid = FL_CID_FINDFREE_AT(qp->fl_cid_set, qp->next_cid);
2143 if (new_cid == 0) {
2144 new_cid = FL_CID_FINDFREE_AT(qp->fl_cid_set, 0);
2145 MPASS(new_cid != 0);
2146 }
2147 new_cid--;
2148 FL_CID_BUSY(new_cid, qp->fl_cid_set);
2149 if (new_cid == CHE_MAX_FL_TAG)
2150 qp->next_cid = 0;
2151 else
2152 qp->next_cid = new_cid + 1;
2153 qp->fl_cids[new_cid] = original_cid;
2154 mtx_unlock(&qp->fl_cid_lock);
2155
2156 return (new_cid | CHE_FL_TAG_MASK);
2157 }
2158
2159 static uint16_t
2160 che_alloc_ddp_cid(struct nvmf_che_qpair *qp, struct nvmf_che_command_buffer *cb)
2161 {
2162 mtx_assert(&qp->rx_buffers.lock, MA_OWNED);
2163
2164 return (che_alloc_ddp_tag(qp, cb));
2165 }
2166
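/*
 * Convert a command capsule into a CAPSULE_CMD PDU.  Data may be sent
 * as in-capsule data when it fits within max_icd, transmitted later
 * in response to R2T using a freelist CID, or received via DDP when a
 * DDP tag is available (falling back to a freelist CID otherwise).
 */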
2167 static struct mbuf *
2168 che_command_pdu(struct nvmf_che_qpair *qp, struct nvmf_che_capsule *cc)
2169 {
2170 struct nvmf_capsule *nc = &cc->nc;
2171 struct nvmf_che_command_buffer *cb;
2172 struct nvme_sgl_descriptor *sgl;
2173 struct nvme_tcp_cmd cmd;
2174 struct mbuf *top, *m;
2175 uint16_t cid;
2176 bool use_icd;
2177
2178 use_icd = false;
2179 cb = NULL;
2180 m = NULL;
2181
2182 if (nc->nc_data.io_len != 0) {
2183 cb = che_alloc_command_buffer(qp, &nc->nc_data, 0,
2184 nc->nc_data.io_len, nc->nc_sqe.cid);
2185 cb->original_cid = nc->nc_sqe.cid;
2186
2187 if (nc->nc_send_data && nc->nc_data.io_len <= qp->max_icd) {
2188 cid = che_alloc_fl_cid(qp, nc->nc_sqe.cid);
2189 use_icd = true;
2190 m = nvmf_che_command_buffer_mbuf(cb, 0,
2191 nc->nc_data.io_len, NULL, false);
2192 cb->data_xfered = nc->nc_data.io_len;
2193 che_release_command_buffer(cb);
2194 } else if (nc->nc_send_data) {
2195 cid = che_alloc_fl_cid(qp, nc->nc_sqe.cid);
2196 cb->cid = htole16(cid);
2197 mtx_lock(&qp->tx_buffers.lock);
2198 che_add_command_buffer(&qp->tx_buffers, cb);
2199 mtx_unlock(&qp->tx_buffers.lock);
2200 } else {
2201 mtx_lock(&qp->rx_buffers.lock);
2202 cid = che_alloc_ddp_cid(qp, cb);
2203 if (cid == CHE_DDP_NO_TAG) {
2204 cid = che_alloc_fl_cid(qp, nc->nc_sqe.cid);
2205 che_add_command_buffer(&qp->rx_buffers, cb);
2206 }
2207 cb->cid = htole16(cid);
2208 mtx_unlock(&qp->rx_buffers.lock);
2209 }
2210 } else
2211 cid = che_alloc_fl_cid(qp, nc->nc_sqe.cid);
2212
2213 #ifdef VERBOSE_TRACES
2214 CTR(KTR_CXGBE, "%s: tid %u allocated cid 0x%04x for 0x%04x", __func__,
2215 qp->toep->tid, cid, nc->nc_sqe.cid);
2216 #endif
2217 memset(&cmd, 0, sizeof(cmd));
2218 cmd.common.pdu_type = NVME_TCP_PDU_TYPE_CAPSULE_CMD;
2219 cmd.ccsqe = nc->nc_sqe;
2220 cmd.ccsqe.cid = htole16(cid);
2221
2222 /* Populate SGL in SQE. */
2223 sgl = &cmd.ccsqe.sgl;
2224 memset(sgl, 0, sizeof(*sgl));
2225 sgl->address = 0;
2226 sgl->length = htole32(nc->nc_data.io_len);
2227 if (use_icd) {
2228 /* Use in-capsule data. */
2229 sgl->type = NVME_SGL_TYPE_ICD;
2230 } else {
2231 /* Use a command buffer. */
2232 sgl->type = NVME_SGL_TYPE_COMMAND_BUFFER;
2233 }
2234
2235 top = nvmf_che_construct_pdu(qp, &cmd, sizeof(cmd), m, m != NULL ?
2236 nc->nc_data.io_len : 0);
2237 return (top);
2238 }
2239
2240 static struct mbuf *
2241 che_response_pdu(struct nvmf_che_qpair *qp, struct nvmf_che_capsule *cc)
2242 {
2243 struct nvmf_capsule *nc = &cc->nc;
2244 struct nvme_tcp_rsp rsp;
2245
2246 memset(&rsp, 0, sizeof(rsp));
2247 rsp.common.pdu_type = NVME_TCP_PDU_TYPE_CAPSULE_RESP;
2248 rsp.rccqe = nc->nc_cqe;
2249
2250 return (nvmf_che_construct_pdu(qp, &rsp, sizeof(rsp), NULL, 0));
2251 }
2252
2253 static struct mbuf *
2254 capsule_to_pdu(struct nvmf_che_qpair *qp, struct nvmf_che_capsule *cc)
2255 {
2256 if (cc->nc.nc_qe_len == sizeof(struct nvme_command))
2257 return (che_command_pdu(qp, cc));
2258 else
2259 return (che_response_pdu(qp, cc));
2260 }
2261
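/*
 * Transmit kthread: convert queued capsules into PDUs and hand them
 * to nvmf_che_write_pdu().
 */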
2262 static void
2263 nvmf_che_send(void *arg)
2264 {
2265 struct nvmf_che_qpair *qp = arg;
2266 struct nvmf_che_capsule *cc;
2267 struct socket *so = qp->so;
2268 struct mbuf *m;
2269 int error;
2270
2271 m = NULL;
2272 SOCKBUF_LOCK(&so->so_snd);
2273 while (!qp->tx_shutdown) {
2274 if (so->so_error != 0) {
2275 error = so->so_error;
2276 SOCKBUF_UNLOCK(&so->so_snd);
2277 m_freem(m);
2278 nvmf_qpair_error(&qp->qp, error);
2279 SOCKBUF_LOCK(&so->so_snd);
2280 while (!qp->tx_shutdown)
2281 cv_wait(&qp->tx_cv, SOCKBUF_MTX(&so->so_snd));
2282 break;
2283 }
2284
2285 if (STAILQ_EMPTY(&qp->tx_capsules)) {
2286 cv_wait(&qp->tx_cv, SOCKBUF_MTX(&so->so_snd));
2287 continue;
2288 }
2289
2290 /* Convert a capsule into a PDU. */
2291 cc = STAILQ_FIRST(&qp->tx_capsules);
2292 STAILQ_REMOVE_HEAD(&qp->tx_capsules, link);
2293 SOCKBUF_UNLOCK(&so->so_snd);
2294
2295 m = capsule_to_pdu(qp, cc);
2296 che_release_capsule(cc);
2297
2298 nvmf_che_write_pdu(qp, m);
2299
2300 SOCKBUF_LOCK(&so->so_snd);
2301 }
2302 SOCKBUF_UNLOCK(&so->so_snd);
2303 kthread_exit();
2304 }
2305
2306 static int
2307 nvmf_che_setsockopt(struct socket *so, u_int sspace, u_int rspace)
2308 {
2309 struct sockopt opt;
2310 int error, one = 1;
2311
2312 /* Don't lower the buffer sizes, just enforce a minimum. */
2313 SOCKBUF_LOCK(&so->so_snd);
2314 if (sspace < so->so_snd.sb_hiwat)
2315 sspace = so->so_snd.sb_hiwat;
2316 SOCKBUF_UNLOCK(&so->so_snd);
2317 SOCKBUF_LOCK(&so->so_rcv);
2318 if (rspace < so->so_rcv.sb_hiwat)
2319 rspace = so->so_rcv.sb_hiwat;
2320 SOCKBUF_UNLOCK(&so->so_rcv);
2321
2322 error = soreserve(so, sspace, rspace);
2323 if (error != 0)
2324 return (error);
2325 SOCKBUF_LOCK(&so->so_snd);
2326 so->so_snd.sb_flags |= SB_AUTOSIZE;
2327 SOCKBUF_UNLOCK(&so->so_snd);
2328 SOCKBUF_LOCK(&so->so_rcv);
2329 so->so_rcv.sb_flags |= SB_AUTOSIZE;
2330 SOCKBUF_UNLOCK(&so->so_rcv);
2331
2332 /*
2333 * Disable Nagle.
2334 */
2335 bzero(&opt, sizeof(opt));
2336 opt.sopt_dir = SOPT_SET;
2337 opt.sopt_level = IPPROTO_TCP;
2338 opt.sopt_name = TCP_NODELAY;
2339 opt.sopt_val = &one;
2340 opt.sopt_valsize = sizeof(one);
2341 error = sosetopt(so, &opt);
2342 if (error != 0)
2343 return (error);
2344
2345 return (0);
2346 }
2347
2348 static void
2349 t4_nvme_set_tcb_field(struct toepcb *toep, uint16_t word, uint64_t mask,
2350 uint64_t val)
2351 {
2352 struct adapter *sc = td_adapter(toep->td);
2353
2354 t4_set_tcb_field(sc, &toep->ofld_txq->wrq, toep, word, mask, val, 0, 0);
2355 }
2356
2357 static void
2358 set_ulp_mode_nvme(struct toepcb *toep, u_int ulp_submode, uint8_t rxpda)
2359 {
2360 uint64_t val;
2361
2362 CTR(KTR_CXGBE, "%s: tid %u, ULP_MODE_NVMET, submode=%#x, rxpda=%u",
2363 __func__, toep->tid, ulp_submode, rxpda);
2364
2365 val = V_TCB_ULP_TYPE(ULP_MODE_NVMET) | V_TCB_ULP_RAW(ulp_submode);
2366 t4_nvme_set_tcb_field(toep, W_TCB_ULP_TYPE,
2367 V_TCB_ULP_TYPE(M_TCB_ULP_TYPE) | V_TCB_ULP_RAW(M_TCB_ULP_RAW), val);
2368
2369 val = V_TF_RX_FLOW_CONTROL_DISABLE(1ULL);
2370 t4_nvme_set_tcb_field(toep, W_TCB_T_FLAGS, val, val);
2371
2372 val = V_TCB_RSVD((rxpda / 4) - 1);
2373 t4_nvme_set_tcb_field(toep, W_TCB_RSVD, V_TCB_RSVD(M_TCB_RSVD), val);
2374
2375 /* 0 disables CPL_NVMT_CMP_IMM which is not useful in this driver. */
2376 val = 0;
2377 t4_nvme_set_tcb_field(toep, W_TCB_CMP_IMM_SZ,
2378 V_TCB_CMP_IMM_SZ(M_TCB_CMP_IMM_SZ), val);
2379 }
2380
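/*
 * Compute the maximum data payload that fits in a PDU of max_pdu_len
 * bytes given the header length, the negotiated digest settings, and
 * the PDU data alignment.
 */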
2381 static u_int
2382 pdu_max_data_len(const nvlist_t *nvl, u_int max_pdu_len, u_int hlen,
2383 uint8_t pda)
2384 {
2385 u_int max_data_len;
2386
2387 if (nvlist_get_bool(nvl, "header_digests"))
2388 hlen += sizeof(uint32_t);
2389 hlen = roundup(hlen, pda);
2390 max_data_len = max_pdu_len - hlen;
2391 if (nvlist_get_bool(nvl, "data_digests"))
2392 max_data_len -= sizeof(uint32_t);
2393 return (max_data_len);
2394 }
2395
2396 static struct nvmf_qpair *
2397 che_allocate_qpair(bool controller, const nvlist_t *nvl)
2398 {
2399 struct nvmf_che_adapter *nca;
2400 struct nvmf_che_qpair *qp;
2401 struct adapter *sc;
2402 struct file *fp;
2403 struct socket *so;
2404 struct inpcb *inp;
2405 struct tcpcb *tp;
2406 struct toepcb *toep;
2407 cap_rights_t rights;
2408 u_int max_tx_pdu_len, num_ddp_tags;
2409 int error, ulp_submode;
2410
2411 if (!nvlist_exists_number(nvl, "fd") ||
2412 !nvlist_exists_number(nvl, "rxpda") ||
2413 !nvlist_exists_number(nvl, "txpda") ||
2414 !nvlist_exists_bool(nvl, "header_digests") ||
2415 !nvlist_exists_bool(nvl, "data_digests") ||
2416 !nvlist_exists_number(nvl, "maxr2t") ||
2417 !nvlist_exists_number(nvl, "maxh2cdata") ||
2418 !nvlist_exists_number(nvl, "max_icd"))
2419 return (NULL);
2420
2421 error = fget(curthread, nvlist_get_number(nvl, "fd"),
2422 cap_rights_init_one(&rights, CAP_SOCK_CLIENT), &fp);
2423 if (error != 0)
2424 return (NULL);
2425 if (fp->f_type != DTYPE_SOCKET) {
2426 fdrop(fp, curthread);
2427 return (NULL);
2428 }
2429 so = fp->f_data;
2430 if (so->so_type != SOCK_STREAM ||
2431 so->so_proto->pr_protocol != IPPROTO_TCP) {
2432 fdrop(fp, curthread);
2433 return (NULL);
2434 }
2435
2436 sc = find_offload_adapter(so);
2437 if (sc == NULL) {
2438 fdrop(fp, curthread);
2439 return (NULL);
2440 }
2441 nca = sc->nvme_ulp_softc;
2442
2443 /*
2444 * Controller: Require advertised MAXH2CDATA to be small
2445 * enough.
2446 */
2447 if (controller) {
2448 u_int max_rx_data;
2449
2450 max_rx_data = pdu_max_data_len(nvl, nca->max_receive_pdu,
2451 sizeof(struct nvme_tcp_h2c_data_hdr),
2452 nvlist_get_number(nvl, "rxpda"));
2453 if (nvlist_get_number(nvl, "maxh2cdata") > max_rx_data) {
2454 fdrop(fp, curthread);
2455 return (NULL);
2456 }
2457 }
2458
2459 /*
2460 * Host: Require the queue size to be small enough that all of
2461 * the command ids allocated by nvmf(4) will fit in the
2462 * unallocated range.
2463 *
2464 * XXX: Alternatively this driver could just queue commands
2465 * when an unallocated ID isn't available.
2466 */
2467 if (!controller) {
2468 u_int num_commands;
2469
2470 num_commands = nvlist_get_number(nvl, "qsize") - 1;
2471 if (nvlist_get_bool(nvl, "admin"))
2472 num_commands += 8; /* Max AER */
2473 if (num_commands > CHE_NUM_FL_TAGS) {
2474 fdrop(fp, curthread);
2475 return (NULL);
2476 }
2477 }
2478
2479 qp = malloc(sizeof(*qp), M_NVMF_CHE, M_WAITOK | M_ZERO);
2480 qp->txpda = nvlist_get_number(nvl, "txpda");
2481 qp->rxpda = nvlist_get_number(nvl, "rxpda");
2482 qp->header_digests = nvlist_get_bool(nvl, "header_digests");
2483 qp->data_digests = nvlist_get_bool(nvl, "data_digests");
2484 qp->maxr2t = nvlist_get_number(nvl, "maxr2t");
2485 if (controller)
2486 qp->maxh2cdata = nvlist_get_number(nvl, "maxh2cdata");
2487
2488 if (controller) {
2489 /* NB: maxr2t is 0's based. */
2490 qp->num_fl_ttags = MIN(CHE_NUM_FL_TAGS,
2491 nvlist_get_number(nvl, "qsize") *
2492 ((uint64_t)qp->maxr2t + 1));
2493 qp->open_fl_ttags = mallocarray(qp->num_fl_ttags,
2494 sizeof(*qp->open_fl_ttags), M_NVMF_CHE, M_WAITOK | M_ZERO);
2495 } else {
2496 qp->fl_cids = mallocarray(CHE_NUM_FL_TAGS,
2497 sizeof(*qp->fl_cids), M_NVMF_CHE, M_WAITOK | M_ZERO);
2498 qp->fl_cid_set = malloc(sizeof(*qp->fl_cid_set), M_NVMF_CHE,
2499 M_WAITOK);
2500 FL_CID_INIT(qp->fl_cid_set);
2501 mtx_init(&qp->fl_cid_lock, "nvmf/che fl cids", NULL, MTX_DEF);
2502 }
2503
2504 inp = sotoinpcb(so);
2505 INP_WLOCK(inp);
2506 tp = intotcpcb(inp);
2507 if (inp->inp_flags & INP_DROPPED) {
2508 INP_WUNLOCK(inp);
2509 free(qp->fl_cid_set, M_NVMF_CHE);
2510 free(qp->fl_cids, M_NVMF_CHE);
2511 free(qp->open_fl_ttags, M_NVMF_CHE);
2512 free(qp, M_NVMF_CHE);
2513 fdrop(fp, curthread);
2514 return (NULL);
2515 }
2516
2517 MPASS(tp->t_flags & TF_TOE);
2518 MPASS(tp->tod != NULL);
2519 MPASS(tp->t_toe != NULL);
2520 toep = tp->t_toe;
2521 MPASS(toep->vi->adapter == sc);
2522
2523 if (ulp_mode(toep) != ULP_MODE_NONE) {
2524 INP_WUNLOCK(inp);
2525 free(qp->fl_cid_set, M_NVMF_CHE);
2526 free(qp->fl_cids, M_NVMF_CHE);
2527 free(qp->open_fl_ttags, M_NVMF_CHE);
2528 free(qp, M_NVMF_CHE);
2529 fdrop(fp, curthread);
2530 return (NULL);
2531 }
2532
2533 /* Claim socket from file descriptor. */
2534 fp->f_ops = &badfileops;
2535 fp->f_data = NULL;
2536
2537 qp->so = so;
2538 qp->toep = toep;
2539 qp->nca = nca;
2540 refcount_init(&qp->refs, 1);
2541
2542 /* NB: C2H and H2C headers are the same size. */
2543 qp->max_rx_data = pdu_max_data_len(nvl, nca->max_receive_pdu,
2544 sizeof(struct nvme_tcp_c2h_data_hdr), qp->rxpda);
2545 qp->max_tx_data = pdu_max_data_len(nvl, nca->max_transmit_pdu,
2546 sizeof(struct nvme_tcp_c2h_data_hdr), qp->txpda);
2547 if (!controller) {
2548 qp->max_tx_data = min(qp->max_tx_data,
2549 nvlist_get_number(nvl, "maxh2cdata"));
2550 qp->max_icd = min(nvlist_get_number(nvl, "max_icd"),
2551 pdu_max_data_len(nvl, nca->max_transmit_pdu,
2552 sizeof(struct nvme_tcp_cmd), qp->txpda));
2553 } else {
2554 /*
2555 * IOCCSZ represents the size of a logical command
2556 * capsule including the 64 byte SQE and the
2557 * in-capsule data. Use pdu_max_data_len to compute
2558 * the maximum supported ICD length.
2559 */
2560 qp->max_ioccsz = rounddown(pdu_max_data_len(nvl,
2561 nca->max_receive_pdu, sizeof(struct nvme_tcp_cmd),
2562 qp->rxpda), 16) + sizeof(struct nvme_command);
2563 }
2564
2565 ulp_submode = 0;
2566 if (qp->header_digests)
2567 ulp_submode |= FW_NVMET_ULPSUBMODE_HCRC;
2568 if (qp->data_digests)
2569 ulp_submode |= FW_NVMET_ULPSUBMODE_DCRC;
2570 if (!controller)
2571 ulp_submode |= FW_NVMET_ULPSUBMODE_ING_DIR;
2572
2573 max_tx_pdu_len = sizeof(struct nvme_tcp_h2c_data_hdr);
2574 if (qp->header_digests)
2575 max_tx_pdu_len += sizeof(uint32_t);
2576 max_tx_pdu_len = roundup(max_tx_pdu_len, qp->txpda);
2577 max_tx_pdu_len += qp->max_tx_data;
2578 if (qp->data_digests)
2579 max_tx_pdu_len += sizeof(uint32_t);
2580
2581 /* TODO: ISO limits */
2582
2583 if (controller) {
2584 /* Use the SUCCESS flag if SQ flow control is disabled. */
2585 qp->send_success = !nvlist_get_bool(nvl, "sq_flow_control");
2586 }
2587
2588 toep->params.ulp_mode = ULP_MODE_NVMET;
2589 toep->ulpcb = qp;
2590
2591 send_txdataplen_max_flowc_wr(sc, toep,
2592 roundup(/* max_iso_pdus * */ max_tx_pdu_len, tp->t_maxseg));
2593 set_ulp_mode_nvme(toep, ulp_submode, qp->rxpda);
2594 INP_WUNLOCK(inp);
2595
2596 fdrop(fp, curthread);
2597
2598 error = nvmf_che_setsockopt(so, max_tx_pdu_len, nca->max_receive_pdu);
2599 if (error != 0) {
2600 free(qp->fl_cid_set, M_NVMF_CHE);
2601 free(qp->fl_cids, M_NVMF_CHE);
2602 free(qp->open_fl_ttags, M_NVMF_CHE);
2603 free(qp, M_NVMF_CHE);
2604 return (NULL);
2605 }
2606
2607 num_ddp_tags = ddp_tags_per_qp;
2608 if (num_ddp_tags > 0) {
2609 qp->tpt_offset = t4_stag_alloc(sc, num_ddp_tags);
2610 if (qp->tpt_offset != T4_STAG_UNSET) {
2611 #ifdef VERBOSE_TRACES
2612 CTR(KTR_CXGBE,
2613 "%s: tid %u using %u tags at offset 0x%x",
2614 __func__, toep->tid, num_ddp_tags, qp->tpt_offset);
2615 #endif
2616 qp->num_ddp_tags = num_ddp_tags;
2617 qp->open_ddp_tags = mallocarray(qp->num_ddp_tags,
2618 sizeof(*qp->open_ddp_tags), M_NVMF_CHE, M_WAITOK |
2619 M_ZERO);
2620
2621 t4_nvme_set_tcb_field(toep, W_TCB_TPT_OFFSET,
2622 M_TCB_TPT_OFFSET, V_TCB_TPT_OFFSET(qp->tpt_offset));
2623 }
2624 }
2625
2626 TAILQ_INIT(&qp->rx_buffers.head);
2627 TAILQ_INIT(&qp->tx_buffers.head);
2628 mtx_init(&qp->rx_buffers.lock, "nvmf/che rx buffers", NULL, MTX_DEF);
2629 mtx_init(&qp->tx_buffers.lock, "nvmf/che tx buffers", NULL, MTX_DEF);
2630
2631 cv_init(&qp->rx_cv, "-");
2632 cv_init(&qp->tx_cv, "-");
2633 mbufq_init(&qp->rx_data, 0);
2634 mbufq_init(&qp->rx_pdus, 0);
2635 STAILQ_INIT(&qp->tx_capsules);
2636
2637 /* Register socket upcall for receive to handle remote FIN. */
2638 SOCKBUF_LOCK(&so->so_rcv);
2639 soupcall_set(so, SO_RCV, nvmf_che_soupcall_receive, qp);
2640 SOCKBUF_UNLOCK(&so->so_rcv);
2641
2642 /* Spin up kthreads. */
2643 error = kthread_add(nvmf_che_receive, qp, NULL, &qp->rx_thread, 0, 0,
2644 "nvmef che rx");
2645 if (error != 0) {
2646 che_free_qpair(&qp->qp);
2647 return (NULL);
2648 }
2649 error = kthread_add(nvmf_che_send, qp, NULL, &qp->tx_thread, 0, 0,
2650 "nvmef che tx");
2651 if (error != 0) {
2652 che_free_qpair(&qp->qp);
2653 return (NULL);
2654 }
2655
2656 return (&qp->qp);
2657 }
2658
2659 static void
2660 che_release_qpair(struct nvmf_che_qpair *qp)
2661 {
2662 if (refcount_release(&qp->refs))
2663 free(qp, M_NVMF_CHE);
2664 }
2665
2666 static void
2667 che_free_qpair(struct nvmf_qpair *nq)
2668 {
2669 struct nvmf_che_qpair *qp = CQP(nq);
2670 struct nvmf_che_command_buffer *ncb, *cb;
2671 struct nvmf_che_capsule *ncc, *cc;
2672 struct socket *so = qp->so;
2673 struct toepcb *toep = qp->toep;
2674 struct inpcb *inp = sotoinpcb(so);
2675
2676 /* Shut down kthreads. */
2677 SOCKBUF_LOCK(&so->so_snd);
2678 qp->tx_shutdown = true;
2679 if (qp->tx_thread != NULL) {
2680 cv_signal(&qp->tx_cv);
2681 mtx_sleep(qp->tx_thread, SOCKBUF_MTX(&so->so_snd), 0,
2682 "nvchetx", 0);
2683 }
2684 SOCKBUF_UNLOCK(&so->so_snd);
2685
2686 SOCKBUF_LOCK(&so->so_rcv);
2687 qp->rx_shutdown = true;
2688 if (qp->rx_thread != NULL) {
2689 cv_signal(&qp->rx_cv);
2690 mtx_sleep(qp->rx_thread, SOCKBUF_MTX(&so->so_rcv), 0,
2691 "nvcherx", 0);
2692 }
2693 soupcall_clear(so, SO_RCV);
2694 SOCKBUF_UNLOCK(&so->so_rcv);
2695 mbufq_drain(&qp->rx_data);
2696 mbufq_drain(&qp->rx_pdus);
2697
2698 STAILQ_FOREACH_SAFE(cc, &qp->tx_capsules, link, ncc) {
2699 nvmf_abort_capsule_data(&cc->nc, ECONNABORTED);
2700 che_release_capsule(cc);
2701 }
2702
2703 cv_destroy(&qp->tx_cv);
2704 cv_destroy(&qp->rx_cv);
2705
2706 if (qp->open_fl_ttags != NULL) {
2707 for (u_int i = 0; i < qp->num_fl_ttags; i++) {
2708 cb = qp->open_fl_ttags[i];
2709 if (cb != NULL) {
2710 cb->cc->active_r2ts--;
2711 cb->error = ECONNABORTED;
2712 che_release_command_buffer(cb);
2713 }
2714 }
2715 free(qp->open_fl_ttags, M_NVMF_CHE);
2716 }
2717 if (qp->num_ddp_tags != 0) {
2718 for (u_int i = 0; i < qp->num_ddp_tags; i++) {
2719 cb = qp->open_ddp_tags[i];
2720 if (cb != NULL) {
2721 if (cb->cc != NULL)
2722 cb->cc->active_r2ts--;
2723 cb->error = ECONNABORTED;
2724 mtx_lock(&qp->rx_buffers.lock);
2725 che_free_ddp_tag(qp, cb, cb->ttag);
2726 mtx_unlock(&qp->rx_buffers.lock);
2727 che_release_command_buffer(cb);
2728 }
2729 }
2730 free(qp->open_ddp_tags, M_NVMF_CHE);
2731 }
2732
2733 mtx_lock(&qp->rx_buffers.lock);
2734 TAILQ_FOREACH_SAFE(cb, &qp->rx_buffers.head, link, ncb) {
2735 che_remove_command_buffer(&qp->rx_buffers, cb);
2736 mtx_unlock(&qp->rx_buffers.lock);
2737 #ifdef INVARIANTS
2738 if (cb->cc != NULL)
2739 cb->cc->pending_r2ts--;
2740 #endif
2741 cb->error = ECONNABORTED;
2742 che_release_command_buffer(cb);
2743 mtx_lock(&qp->rx_buffers.lock);
2744 }
2745 mtx_destroy(&qp->rx_buffers.lock);
2746
2747 mtx_lock(&qp->tx_buffers.lock);
2748 TAILQ_FOREACH_SAFE(cb, &qp->tx_buffers.head, link, ncb) {
2749 che_remove_command_buffer(&qp->tx_buffers, cb);
2750 mtx_unlock(&qp->tx_buffers.lock);
2751 cb->error = ECONNABORTED;
2752 che_release_command_buffer(cb);
2753 mtx_lock(&qp->tx_buffers.lock);
2754 }
2755 mtx_destroy(&qp->tx_buffers.lock);
2756
2757 if (qp->num_ddp_tags != 0)
2758 t4_stag_free(qp->nca->sc, qp->tpt_offset, qp->num_ddp_tags);
2759
2760 if (!qp->qp.nq_controller) {
2761 free(qp->fl_cids, M_NVMF_CHE);
2762 free(qp->fl_cid_set, M_NVMF_CHE);
2763 mtx_destroy(&qp->fl_cid_lock);
2764 }
2765
2766 INP_WLOCK(inp);
2767 toep->ulpcb = NULL;
2768 mbufq_drain(&toep->ulp_pduq);
2769
2770 /*
2771 * Grab a reference to use when waiting for the final CPL to
2772 * be received. If toep->inp is NULL, then
2773 * final_cpl_received() has already been called (e.g. due to
2774 * the peer sending a RST).
2775 */
2776 if (toep->inp != NULL) {
2777 toep = hold_toepcb(toep);
2778 toep->flags |= TPF_WAITING_FOR_FINAL;
2779 } else
2780 toep = NULL;
2781 INP_WUNLOCK(inp);
2782
2783 soclose(so);
2784
2785 /*
2786 * Wait for the socket to fully close. This ensures any
2787 * pending received data has been received (and in particular,
2788 * any data that would be received by DDP has been handled).
2789 */
2790 if (toep != NULL) {
2791 struct mtx *lock = mtx_pool_find(mtxpool_sleep, toep);
2792
2793 mtx_lock(lock);
2794 while ((toep->flags & TPF_WAITING_FOR_FINAL) != 0)
2795 mtx_sleep(toep, lock, PSOCK, "conclo2", 0);
2796 mtx_unlock(lock);
2797 free_toepcb(toep);
2798 }
2799
2800 che_release_qpair(qp);
2801 }
2802
2803 static uint32_t
2804 che_max_ioccsz(struct nvmf_qpair *nq)
2805 {
2806 struct nvmf_che_qpair *qp = CQP(nq);
2807
2808 /*
2809 * Limit the command capsule size so that with maximum ICD it
2810 * fits within the limit of the largest PDU the adapter can
2811 * receive.
2812 */
2813 return (qp->max_ioccsz);
2814 }
2815
2816 static uint64_t
2817 che_max_xfer_size(struct nvmf_qpair *nq)
2818 {
2819 struct nvmf_che_qpair *qp = CQP(nq);
2820
2821 /*
2822 * Limit host transfers to the size of the data payload in the
2823 * largest PDU the adapter can receive.
2824 */
2825 return (qp->max_rx_data);
2826 }
2827
2828 static struct nvmf_capsule *
2829 che_allocate_capsule(struct nvmf_qpair *nq, int how)
2830 {
2831 struct nvmf_che_qpair *qp = CQP(nq);
2832 struct nvmf_che_capsule *cc;
2833
2834 cc = malloc(sizeof(*cc), M_NVMF_CHE, how | M_ZERO);
2835 if (cc == NULL)
2836 return (NULL);
2837 refcount_init(&cc->refs, 1);
2838 refcount_acquire(&qp->refs);
2839 return (&cc->nc);
2840 }
2841
2842 static void
2843 che_release_capsule(struct nvmf_che_capsule *cc)
2844 {
2845 struct nvmf_che_qpair *qp = CQP(cc->nc.nc_qpair);
2846
2847 if (!refcount_release(&cc->refs))
2848 return;
2849
2850 MPASS(cc->active_r2ts == 0);
2851 MPASS(cc->pending_r2ts == 0);
2852
2853 nvmf_che_free_pdu(&cc->rx_pdu);
2854 free(cc, M_NVMF_CHE);
2855 che_release_qpair(qp);
2856 }
2857
2858 static void
2859 che_free_capsule(struct nvmf_capsule *nc)
2860 {
2861 che_release_capsule(CCAP(nc));
2862 }
2863
2864 static int
2865 che_transmit_capsule(struct nvmf_capsule *nc)
2866 {
2867 struct nvmf_che_qpair *qp = CQP(nc->nc_qpair);
2868 struct nvmf_che_capsule *cc = CCAP(nc);
2869 struct socket *so = qp->so;
2870
2871 refcount_acquire(&cc->refs);
2872 SOCKBUF_LOCK(&so->so_snd);
2873 STAILQ_INSERT_TAIL(&qp->tx_capsules, cc, link);
2874 cv_signal(&qp->tx_cv);
2875 SOCKBUF_UNLOCK(&so->so_snd);
2876 return (0);
2877 }
2878
2879 static uint8_t
2880 che_validate_command_capsule(struct nvmf_capsule *nc)
2881 {
2882 struct nvmf_che_capsule *cc = CCAP(nc);
2883 struct nvme_sgl_descriptor *sgl;
2884
2885 KASSERT(cc->rx_pdu.hdr != NULL, ("capsule wasn't received"));
2886
2887 sgl = &nc->nc_sqe.sgl;
2888 switch (sgl->type) {
2889 case NVME_SGL_TYPE_ICD:
2890 if (cc->rx_pdu.data_len != le32toh(sgl->length)) {
2891 printf("NVMe/TCP: Command Capsule with mismatched ICD length\n");
2892 return (NVME_SC_DATA_SGL_LENGTH_INVALID);
2893 }
2894 break;
2895 case NVME_SGL_TYPE_COMMAND_BUFFER:
2896 if (cc->rx_pdu.data_len != 0) {
2897 printf("NVMe/TCP: Command Buffer SGL with ICD\n");
2898 return (NVME_SC_INVALID_FIELD);
2899 }
2900 break;
2901 default:
2902 printf("NVMe/TCP: Invalid SGL type in Command Capsule\n");
2903 return (NVME_SC_SGL_DESCRIPTOR_TYPE_INVALID);
2904 }
2905
2906 if (sgl->address != 0) {
2907 printf("NVMe/TCP: Invalid SGL offset in Command Capsule\n");
2908 return (NVME_SC_SGL_OFFSET_INVALID);
2909 }
2910
2911 return (NVME_SC_SUCCESS);
2912 }
2913
2914 static size_t
2915 che_capsule_data_len(const struct nvmf_capsule *nc)
2916 {
2917 MPASS(nc->nc_qe_len == sizeof(struct nvme_command));
2918 return (le32toh(nc->nc_sqe.sgl.length));
2919 }
2920
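/*
 * Receive command data from the host by issuing an R2T, or queue the
 * request if the command already has too many active R2Ts or no
 * transfer tag is available.
 */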
2921 static void
2922 che_receive_r2t_data(struct nvmf_capsule *nc, uint32_t data_offset,
2923 struct nvmf_io_request *io)
2924 {
2925 struct nvmf_che_qpair *qp = CQP(nc->nc_qpair);
2926 struct nvmf_che_capsule *cc = CCAP(nc);
2927 struct nvmf_che_command_buffer *cb;
2928
2929 cb = che_alloc_command_buffer(qp, io, data_offset, io->io_len,
2930 nc->nc_sqe.cid);
2931
2932 cb->cc = cc;
2933 refcount_acquire(&cc->refs);
2934
2935 /*
2936 * If this command has too many active R2Ts or there are no
2937 * available transfer tags, queue the request for later.
2938 *
2939 * NB: maxr2t is 0's based.
2940 */
2941 mtx_lock(&qp->rx_buffers.lock);
2942 if (cc->active_r2ts > qp->maxr2t ||
2943 !nvmf_che_allocate_ttag(qp, cb)) {
2944 #ifdef INVARIANTS
2945 cc->pending_r2ts++;
2946 #endif
2947 TAILQ_INSERT_TAIL(&qp->rx_buffers.head, cb, link);
2948 mtx_unlock(&qp->rx_buffers.lock);
2949 return;
2950 }
2951 mtx_unlock(&qp->rx_buffers.lock);
2952
2953 che_send_r2t(qp, nc->nc_sqe.cid, cb->ttag, data_offset, io->io_len);
2954 }
2955
2956 static void
2957 che_receive_icd_data(struct nvmf_capsule *nc, uint32_t data_offset,
2958 struct nvmf_io_request *io)
2959 {
2960 struct nvmf_che_capsule *cc = CCAP(nc);
2961
2962 /*
2963 * The header is in rx_pdu.m, the padding is discarded, and
2964 * the data starts at rx_pdu.m->m_next.
2965 */
2966 mbuf_copyto_io(cc->rx_pdu.m->m_next, data_offset, io->io_len, io, 0);
2967 nvmf_complete_io_request(io, io->io_len, 0);
2968 }
2969
2970 static int
2971 che_receive_controller_data(struct nvmf_capsule *nc, uint32_t data_offset,
2972 struct nvmf_io_request *io)
2973 {
2974 struct nvme_sgl_descriptor *sgl;
2975 size_t data_len;
2976
2977 if (nc->nc_qe_len != sizeof(struct nvme_command) ||
2978 !nc->nc_qpair->nq_controller)
2979 return (EINVAL);
2980
2981 sgl = &nc->nc_sqe.sgl;
2982 data_len = le32toh(sgl->length);
2983 if (data_offset + io->io_len > data_len)
2984 return (EFBIG);
2985
2986 if (sgl->type == NVME_SGL_TYPE_ICD)
2987 che_receive_icd_data(nc, data_offset, io);
2988 else
2989 che_receive_r2t_data(nc, data_offset, io);
2990 return (0);
2991 }
2992
2993 /* NB: cid is little-endian already. */
2994 static void
2995 che_send_c2h_pdu(struct nvmf_che_qpair *qp, uint16_t cid, uint32_t data_offset,
2996 struct mbuf *m, size_t len, bool last_pdu, bool success)
2997 {
2998 struct nvme_tcp_c2h_data_hdr c2h;
2999 struct mbuf *top;
3000
3001 memset(&c2h, 0, sizeof(c2h));
3002 c2h.common.pdu_type = NVME_TCP_PDU_TYPE_C2H_DATA;
3003 if (last_pdu)
3004 c2h.common.flags |= NVME_TCP_C2H_DATA_FLAGS_LAST_PDU;
3005 if (success)
3006 c2h.common.flags |= NVME_TCP_C2H_DATA_FLAGS_SUCCESS;
3007 c2h.cccid = cid;
3008 c2h.datao = htole32(data_offset);
3009 c2h.datal = htole32(len);
3010
3011 top = nvmf_che_construct_pdu(qp, &c2h, sizeof(c2h), m, len);
3012 nvmf_che_write_pdu(qp, top);
3013 }
3014
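/*
 * Send command data to the host as one or more C2H_DATA PDUs,
 * splitting the supplied mbuf chain so that no single PDU exceeds
 * max_tx_data.  The SUCCESS flag is set on the final PDU when
 * qp->send_success is enabled (SQ flow control disabled).
 */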
3015 static u_int
3016 che_send_controller_data(struct nvmf_capsule *nc, uint32_t data_offset,
3017 struct mbuf *m, size_t len)
3018 {
3019 struct nvmf_che_qpair *qp = CQP(nc->nc_qpair);
3020 struct nvme_sgl_descriptor *sgl;
3021 uint32_t data_len;
3022 bool last_pdu, last_xfer;
3023
3024 if (nc->nc_qe_len != sizeof(struct nvme_command) ||
3025 !qp->qp.nq_controller) {
3026 m_freem(m);
3027 return (NVME_SC_INVALID_FIELD);
3028 }
3029
3030 sgl = &nc->nc_sqe.sgl;
3031 data_len = le32toh(sgl->length);
3032 if (data_offset + len > data_len) {
3033 m_freem(m);
3034 return (NVME_SC_INVALID_FIELD);
3035 }
3036 last_xfer = (data_offset + len == data_len);
3037
3038 if (sgl->type != NVME_SGL_TYPE_COMMAND_BUFFER) {
3039 m_freem(m);
3040 return (NVME_SC_INVALID_FIELD);
3041 }
3042
3043 KASSERT(data_offset == CCAP(nc)->tx_data_offset,
3044 ("%s: starting data_offset %u doesn't match end of previous xfer %u",
3045 __func__, data_offset, CCAP(nc)->tx_data_offset));
3046
3047 /* Queue one or more C2H_DATA PDUs containing the data from 'm'. */
3048 while (m != NULL) {
3049 struct mbuf *n;
3050 uint32_t todo;
3051
3052 if (m->m_len > qp->max_tx_data) {
3053 n = m_split(m, qp->max_tx_data, M_WAITOK);
3054 todo = m->m_len;
3055 } else {
3056 struct mbuf *p;
3057
3058 todo = m->m_len;
3059 p = m;
3060 n = p->m_next;
3061 while (n != NULL) {
3062 if (todo + n->m_len > qp->max_tx_data) {
3063 p->m_next = NULL;
3064 break;
3065 }
3066 todo += n->m_len;
3067 p = n;
3068 n = p->m_next;
3069 }
3070 MPASS(m_length(m, NULL) == todo);
3071 }
3072
3073 last_pdu = (n == NULL && last_xfer);
3074 che_send_c2h_pdu(qp, nc->nc_sqe.cid, data_offset, m, todo,
3075 last_pdu, last_pdu && qp->send_success);
3076
3077 data_offset += todo;
3078 data_len -= todo;
3079 m = n;
3080 }
3081 MPASS(data_len == 0);
3082
3083 #ifdef INVARIANTS
3084 CCAP(nc)->tx_data_offset = data_offset;
3085 #endif
3086 if (!last_xfer)
3087 return (NVMF_MORE);
3088 else if (qp->send_success)
3089 return (NVMF_SUCCESS_SENT);
3090 else
3091 return (NVME_SC_SUCCESS);
3092 }
3093
3094 struct nvmf_transport_ops che_ops = {
3095 .allocate_qpair = che_allocate_qpair,
3096 .free_qpair = che_free_qpair,
3097 .max_ioccsz = che_max_ioccsz,
3098 .max_xfer_size = che_max_xfer_size,
3099 .allocate_capsule = che_allocate_capsule,
3100 .free_capsule = che_free_capsule,
3101 .transmit_capsule = che_transmit_capsule,
3102 .validate_command_capsule = che_validate_command_capsule,
3103 .capsule_data_len = che_capsule_data_len,
3104 .receive_controller_data = che_receive_controller_data,
3105 .send_controller_data = che_send_controller_data,
3106 .trtype = NVMF_TRTYPE_TCP,
3107 .priority = 10,
3108 };
3109
3110 NVMF_TRANSPORT(che, che_ops);
3111
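/*
 * Determine the largest PDU the hardware can transmit and receive,
 * bounded by the PMM page sizes, the TP_PARA limits, and an absolute
 * cap of 32KB - 256 bytes.
 */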
3112 static void
3113 read_pdu_limits(struct adapter *sc, u_int *max_tx_pdu_len,
3114 uint32_t *max_rx_pdu_len)
3115 {
3116 uint32_t tx_len, rx_len, r, v;
3117
3118 /* Copied from cxgbei, but not sure if this is correct. */
3119 rx_len = t4_read_reg(sc, A_TP_PMM_RX_PAGE_SIZE);
3120 tx_len = t4_read_reg(sc, A_TP_PMM_TX_PAGE_SIZE);
3121
3122 r = t4_read_reg(sc, A_TP_PARA_REG2);
3123 rx_len = min(rx_len, G_MAXRXDATA(r));
3124 tx_len = min(tx_len, G_MAXRXDATA(r));
3125
3126 r = t4_read_reg(sc, A_TP_PARA_REG7);
3127 v = min(G_PMMAXXFERLEN0(r), G_PMMAXXFERLEN1(r));
3128 rx_len = min(rx_len, v);
3129 tx_len = min(tx_len, v);
3130
3131 /* Cannot be larger than 32KB - 256. */
3132 rx_len = min(rx_len, 32512);
3133 tx_len = min(tx_len, 32512);
3134
3135 *max_tx_pdu_len = tx_len;
3136 *max_rx_pdu_len = rx_len;
3137 }
3138
3139 static int
3140 nvmf_che_init(struct adapter *sc, struct nvmf_che_adapter *nca)
3141 {
3142 struct sysctl_oid *oid;
3143 struct sysctl_oid_list *children;
3144 uint32_t val;
3145
3146 read_pdu_limits(sc, &nca->max_transmit_pdu, &nca->max_receive_pdu);
3147 if (nca->max_transmit_pdu > che_max_transmit_pdu)
3148 nca->max_transmit_pdu = che_max_transmit_pdu;
3149 if (nca->max_receive_pdu > che_max_receive_pdu)
3150 nca->max_receive_pdu = che_max_receive_pdu;
3151 val = t4_read_reg(sc, A_SGE_CONTROL2);
3152 nca->nvmt_data_iqe = (val & F_RXCPLMODE_NVMT) != 0;
3153
3154 sysctl_ctx_init(&nca->ctx);
3155 oid = device_get_sysctl_tree(sc->dev); /* dev.che.X */
3156 children = SYSCTL_CHILDREN(oid);
3157
3158 oid = SYSCTL_ADD_NODE(&nca->ctx, children, OID_AUTO, "nvme",
3159 CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "NVMe ULP settings");
3160 children = SYSCTL_CHILDREN(oid);
3161
3162 nca->ddp_threshold = 8192;
3163 SYSCTL_ADD_UINT(&nca->ctx, children, OID_AUTO, "ddp_threshold",
3164 CTLFLAG_RW, &nca->ddp_threshold, 0, "Rx zero copy threshold");
3165
3166 SYSCTL_ADD_UINT(&nca->ctx, children, OID_AUTO, "max_transmit_pdu",
3167 CTLFLAG_RW, &nca->max_transmit_pdu, 0,
3168 "Maximum size of a transmitted PDU");
3169
3170 SYSCTL_ADD_UINT(&nca->ctx, children, OID_AUTO, "max_receive_pdu",
3171 CTLFLAG_RW, &nca->max_receive_pdu, 0,
3172 "Maximum size of a received PDU");
3173
3174 return (0);
3175 }
3176
3177 static void
3178 nvmf_che_destroy(struct nvmf_che_adapter *nca)
3179 {
3180 sysctl_ctx_free(&nca->ctx);
3181 free(nca, M_CXGBE);
3182 }
3183
3184 static int
3185 nvmf_che_activate(struct adapter *sc)
3186 {
3187 struct nvmf_che_adapter *nca;
3188 int rc;
3189
3190 ASSERT_SYNCHRONIZED_OP(sc);
3191
3192 if (uld_active(sc, ULD_NVME)) {
3193 KASSERT(0, ("%s: NVMe offload already enabled on adapter %p",
3194 __func__, sc));
3195 return (0);
3196 }
3197
3198 if ((sc->nvmecaps & FW_CAPS_CONFIG_NVME_TCP) == 0) {
3199 device_printf(sc->dev,
3200 "not NVMe offload capable, or capability disabled\n");
3201 return (ENOSYS);
3202 }
3203
3204 /* per-adapter softc for NVMe */
3205 nca = malloc(sizeof(*nca), M_CXGBE, M_ZERO | M_WAITOK);
3206 nca->sc = sc;
3207
3208 rc = nvmf_che_init(sc, nca);
3209 if (rc != 0) {
3210 free(nca, M_CXGBE);
3211 return (rc);
3212 }
3213
3214 sc->nvme_ulp_softc = nca;
3215
3216 return (0);
3217 }
3218
3219 static int
3220 nvmf_che_deactivate(struct adapter *sc)
3221 {
3222 struct nvmf_che_adapter *nca = sc->nvme_ulp_softc;
3223
3224 ASSERT_SYNCHRONIZED_OP(sc);
3225
3226 if (nca != NULL) {
3227 nvmf_che_destroy(nca);
3228 sc->nvme_ulp_softc = NULL;
3229 }
3230
3231 return (0);
3232 }
3233
3234 static void
3235 nvmf_che_activate_all(struct adapter *sc, void *arg __unused)
3236 {
3237 if (begin_synchronized_op(sc, NULL, SLEEP_OK | INTR_OK, "t7nvact") != 0)
3238 return;
3239
3240 /* Activate NVMe if any port on this adapter has IFCAP_TOE enabled. */
3241 if (sc->offload_map && !uld_active(sc, ULD_NVME))
3242 (void) t4_activate_uld(sc, ULD_NVME);
3243
3244 end_synchronized_op(sc, 0);
3245 }
3246
3247 static void
3248 nvmf_che_deactivate_all(struct adapter *sc, void *arg __unused)
3249 {
3250 if (begin_synchronized_op(sc, NULL, SLEEP_OK | INTR_OK, "t7nvdea") != 0)
3251 return;
3252
3253 if (uld_active(sc, ULD_NVME))
3254 (void) t4_deactivate_uld(sc, ULD_NVME);
3255
3256 end_synchronized_op(sc, 0);
3257 }
3258
3259 static struct uld_info nvmf_che_uld_info = {
3260 .uld_activate = nvmf_che_activate,
3261 .uld_deactivate = nvmf_che_deactivate,
3262 };
3263
3264 static int
3265 nvmf_che_mod_load(void)
3266 {
3267 int rc;
3268
3269 t4_register_cpl_handler(CPL_NVMT_CMP, do_nvmt_cmp);
3270 t4_register_cpl_handler(CPL_NVMT_DATA, do_nvmt_data);
3271
3272 rc = t4_register_uld(&nvmf_che_uld_info, ULD_NVME);
3273 if (rc != 0)
3274 return (rc);
3275
3276 t4_iterate(nvmf_che_activate_all, NULL);
3277
3278 return (rc);
3279 }
3280
3281 static int
3282 nvmf_che_mod_unload(void)
3283 {
3284 t4_iterate(nvmf_che_deactivate_all, NULL);
3285
3286 if (t4_unregister_uld(&nvmf_che_uld_info, ULD_NVME) == EBUSY)
3287 return (EBUSY);
3288
3289 t4_register_cpl_handler(CPL_NVMT_CMP, NULL);
3290 t4_register_cpl_handler(CPL_NVMT_DATA, NULL);
3291
3292 return (0);
3293 }
3294 #endif
3295
3296 static int
3297 nvmf_che_modevent(module_t mod, int cmd, void *arg)
3298 {
3299 int rc;
3300
3301 #ifdef TCP_OFFLOAD
3302 switch (cmd) {
3303 case MOD_LOAD:
3304 rc = nvmf_che_mod_load();
3305 break;
3306 case MOD_UNLOAD:
3307 rc = nvmf_che_mod_unload();
3308 break;
3309 default:
3310 rc = EOPNOTSUPP;
3311 break;
3312 }
3313 #else
3314 printf("nvmf_che: compiled without TCP_OFFLOAD support.\n");
3315 rc = EOPNOTSUPP;
3316 #endif
3317
3318 return (rc);
3319 }
3320
3321 static moduledata_t nvmf_che_mod = {
3322 "nvmf_che",
3323 nvmf_che_modevent,
3324 NULL,
3325 };
3326
3327 MODULE_VERSION(nvmf_che, 1);
3328 DECLARE_MODULE(nvmf_che, nvmf_che_mod, SI_SUB_EXEC, SI_ORDER_ANY);
3329 MODULE_DEPEND(nvmf_che, t4_tom, 1, 1, 1);
3330 MODULE_DEPEND(nvmf_che, cxgbe, 1, 1, 1);
3331