1 /*-
2 * SPDX-License-Identifier: BSD-2-Clause
3 *
4 * Copyright (c) 2023 Chelsio Communications, Inc.
5 * Written by: John Baldwin <jhb@FreeBSD.org>
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 * 1. Redistributions of source code must retain the above copyright
11 * notice, this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright
13 * notice, this list of conditions and the following disclaimer in the
14 * documentation and/or other materials provided with the distribution.
15 *
16 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
20 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26 * SUCH DAMAGE.
27 */
28
29 #include "opt_inet.h"
30
31 #include <sys/param.h>
32 #include <sys/libkern.h>
33 #include <sys/kernel.h>
34 #include <sys/module.h>
35
36 #ifdef TCP_OFFLOAD
37 #include <sys/bitset.h>
38 #include <sys/capsicum.h>
39 #include <sys/file.h>
40 #include <sys/kthread.h>
41 #include <sys/ktr.h>
42 #include <sys/malloc.h>
43 #include <sys/mbuf.h>
44 #include <sys/nv.h>
45 #include <sys/protosw.h>
46 #include <sys/socket.h>
47 #include <sys/socketvar.h>
48 #include <netinet/in.h>
49 #include <netinet/in_pcb.h>
50 #include <netinet/tcp_var.h>
51 #include <netinet/toecore.h>
52
53 #include <dev/nvmf/nvmf.h>
54 #include <dev/nvmf/nvmf_proto.h>
55 #include <dev/nvmf/nvmf_tcp.h>
56 #include <dev/nvmf/nvmf_transport.h>
57 #include <dev/nvmf/nvmf_transport_internal.h>
58
59 #include <vm/pmap.h>
60 #include <vm/vm_page.h>
61
62 #include "common/common.h"
63 #include "common/t4_regs.h"
64 #include "common/t4_tcb.h"
65 #include "tom/t4_tom.h"
66
67 /* Status code values in CPL_NVMT_CMP. */
68 #define CMP_STATUS_ERROR_MASK 0x7f
69 #define CMP_STATUS_NO_ERROR 0
70 #define CMP_STATUS_HEADER_DIGEST 1
71 #define CMP_STATUS_DIRECTION_MISMATCH 2
72 #define CMP_STATUS_DIGEST_FLAG_MISMATCH 3
73 #define CMP_STATUS_SUCCESS_NOT_LAST 4
74 #define CMP_STATUS_BAD_DATA_LENGTH 5
75 #define CMP_STATUS_USER_MODE_UNALLOCATED 6
76 #define CMP_STATUS_RQT_LIMIT 7
77 #define CMP_STATUS_RQT_WRAP 8
78 #define CMP_STATUS_RQT_BOUND 9
79 #define CMP_STATUS_TPT_LIMIT 16
80 #define CMP_STATUS_TPT_INVALID 17
81 #define CMP_STATUS_TPT_COLOUR_MISMATCH 18
82 #define CMP_STATUS_TPT_MISC 19
83 #define CMP_STATUS_TPT_WRAP 20
84 #define CMP_STATUS_TPT_BOUND 21
85 #define CMP_STATUS_TPT_LAST_PDU_UNALIGNED 22
86 #define CMP_STATUS_PBL_LIMIT 24
87 #define CMP_STATUS_DATA_DIGEST 25
88 #define CMP_STATUS_DDP 0x80
89
90 /*
91 * Transfer tags and CIDs with the MSB set are "unallocated" tags that
92 * pass data through to the freelist without using DDP.
93 */
94 #define CHE_FL_TAG_MASK 0x8000
95 #define CHE_MAX_FL_TAG 0x7fff
96 #define CHE_NUM_FL_TAGS (CHE_MAX_FL_TAG + 1)
97
98 #define CHE_TAG_IS_FL(ttag) (((ttag) & CHE_FL_TAG_MASK) == CHE_FL_TAG_MASK)
99 #define CHE_RAW_FL_TAG(ttag) ((ttag) & ~CHE_FL_TAG_MASK)
100 #define CHE_DDP_TAG(stag_idx, color) ((stag_idx) << 4 | (color))
101 #define CHE_STAG_COLOR(stag) ((stag) & 0xf)
102 #define CHE_STAG_IDX(stag) ((stag) >> 4)
103 #define CHE_DDP_MAX_COLOR 0xf
104
105 #define CHE_DDP_NO_TAG 0xffff
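/*
 * Worked example (illustrative only): DDP tag 0x0123 decodes to
 * stag_idx 0x012 and color 0x3, and CHE_DDP_TAG(0x012, 0x3) rebuilds
 * 0x0123.  Tag 0x8007 has the MSB set, so it is a freelist tag whose
 * raw value is 0x0007.  DDP tags are expected to keep the MSB clear so
 * that CHE_TAG_IS_FL() can distinguish the two namespaces.
 */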
106
107 /*
108 * A bitmap of non-DDP CIDs in use on the host. Since there is no
109 * _BIT_FFC (find first clear), the bitset is inverted so that a clear
110 * bit indicates an in-use CID.
111 */
112 BITSET_DEFINE(fl_cid_set, CHE_NUM_FL_TAGS);
113 #define FL_CID_INIT(p) __BIT_FILL(CHE_NUM_FL_TAGS, p)
114 #define FL_CID_BUSY(n, p) __BIT_CLR(CHE_NUM_FL_TAGS, n, p)
115 #define FL_CID_ISACTIVE(n, p) !__BIT_ISSET(CHE_NUM_FL_TAGS, n, p)
116 #define FL_CID_FREE(n, p) __BIT_SET(CHE_NUM_FL_TAGS, n, p)
117 #define FL_CID_FINDFREE_AT(p, start) __BIT_FFS_AT(CHE_NUM_FL_TAGS, p, start)
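/*
 * Illustrative allocation sketch (hypothetical helper, not code from
 * this driver): find a set (i.e. free) bit and mark it in use.  Note
 * that __BIT_FFS_AT() returns a 1-based index, or 0 if no bit is set.
 *
 *	idx = FL_CID_FINDFREE_AT(p, start);
 *	if (idx != 0) {
 *		cid = idx - 1;
 *		FL_CID_BUSY(cid, p);
 *	}
 */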
118
119 /*
120  * The TCP sequence number of a CPL_NVMT_DATA or CPL_NVMT_CMP mbuf is
121  * saved here while the mbuf is in qp->rx_data or qp->rx_pdus, respectively.
122 */
123 #define nvmf_tcp_seq PH_loc.thirtytwo[0]
124
125 /*
126  * The CPL status of a CPL_NVMT_CMP mbuf is saved here while the mbuf
127 * is in qp->rx_pdus.
128 */
129 #define nvmf_cpl_status PH_loc.eight[4]
130
131 struct nvmf_che_capsule;
132 struct nvmf_che_qpair;
133
134 struct nvmf_che_adapter {
135 struct adapter *sc;
136
137 u_int ddp_threshold;
138 u_int max_transmit_pdu;
139 u_int max_receive_pdu;
140 bool nvmt_data_iqe;
141
142 struct sysctl_ctx_list ctx; /* from uld_activate to deactivate */
143 };
144
145 struct nvmf_che_command_buffer {
146 struct nvmf_che_qpair *qp;
147
148 struct nvmf_io_request io;
149 size_t data_len;
150 size_t data_xfered;
151 uint32_t data_offset;
152
153 u_int refs;
154 int error;
155
156 bool ddp_ok;
157 uint16_t cid;
158 uint16_t ttag;
159 uint16_t original_cid; /* Host only */
160
161 TAILQ_ENTRY(nvmf_che_command_buffer) link;
162
163 /* Fields used for DDP. */
164 struct fw_ri_tpte tpte;
165 uint64_t *pbl;
166 uint32_t pbl_addr;
167 uint32_t pbl_len;
168
169 /* Controller only */
170 struct nvmf_che_capsule *cc;
171 };
172
173 struct nvmf_che_command_buffer_list {
174 TAILQ_HEAD(, nvmf_che_command_buffer) head;
175 struct mtx lock;
176 };
177
178 struct nvmf_che_qpair {
179 struct nvmf_qpair qp;
180
181 struct socket *so;
182 struct toepcb *toep;
183 struct nvmf_che_adapter *nca;
184
185 volatile u_int refs; /* Every allocated capsule holds a reference */
186 uint8_t txpda;
187 uint8_t rxpda;
188 bool header_digests;
189 bool data_digests;
190 uint32_t maxr2t;
191 uint32_t maxh2cdata; /* Controller only */
192 uint32_t max_rx_data;
193 uint32_t max_tx_data;
194 uint32_t max_icd; /* Host only */
195 uint32_t max_ioccsz; /* Controller only */
196 union {
197 uint16_t next_fl_ttag; /* Controller only */
198 uint16_t next_cid; /* Host only */
199 };
200 uint16_t next_ddp_tag;
201 u_int num_fl_ttags; /* Controller only */
202 u_int active_fl_ttags; /* Controller only */
203 u_int num_ddp_tags;
204 u_int active_ddp_tags;
205 bool send_success; /* Controller only */
206 uint8_t ddp_color;
207 uint32_t tpt_offset;
208
209 /* Receive state. */
210 struct thread *rx_thread;
211 struct cv rx_cv;
212 bool rx_shutdown;
213 int rx_error;
214 struct mbufq rx_data; /* Data received via CPL_NVMT_DATA. */
215 struct mbufq rx_pdus; /* PDU headers received via CPL_NVMT_CMP. */
216
217 /* Transmit state. */
218 struct thread *tx_thread;
219 struct cv tx_cv;
220 bool tx_shutdown;
221 STAILQ_HEAD(, nvmf_che_capsule) tx_capsules;
222
223 struct nvmf_che_command_buffer_list tx_buffers;
224 struct nvmf_che_command_buffer_list rx_buffers;
225
226 /*
227 * For the controller, an RX command buffer can be in one of
228 * three locations, all protected by the rx_buffers.lock. If
229 * a receive request is waiting for either an R2T slot for its
230 	 * command (due to exceeding MAXR2T) or a transfer tag, it is
231 * placed on the rx_buffers list. When a request is allocated
232 * an active transfer tag, it moves to either the
233 * open_ddp_tags[] or open_fl_ttags[] array (indexed by the
234 * tag) until it completes.
235 *
236 * For the host, an RX command buffer using DDP is in
237 * open_ddp_tags[], otherwise it is in rx_buffers.
238 */
239 struct nvmf_che_command_buffer **open_ddp_tags;
240 struct nvmf_che_command_buffer **open_fl_ttags; /* Controller only */
241
242 /*
243 * For the host, CIDs submitted by nvmf(4) must be rewritten
244 * to either use DDP or not use DDP. The CID in response
245 * capsules must be restored to their original value. For
246 * DDP, the original CID is stored in the command buffer.
247 * These variables manage non-DDP CIDs.
248 */
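	/*
	 * Illustrative example: if nvmf(4) submits CID 0x0005 and no DDP
	 * tag is available, a free freelist slot (say 0x0012) is chosen,
	 * fl_cids[0x0012] is set to 0x0005, and the command goes out on
	 * the wire with CID 0x8012.  The response path reverses the
	 * mapping before the CQE is returned to nvmf(4).
	 */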
249 uint16_t *fl_cids; /* Host only */
250 struct fl_cid_set *fl_cid_set; /* Host only */
251 struct mtx fl_cid_lock; /* Host only */
252 };
253
254 struct nvmf_che_rxpdu {
255 struct mbuf *m;
256 const struct nvme_tcp_common_pdu_hdr *hdr;
257 uint32_t data_len;
258 bool data_digest_mismatch;
259 bool ddp;
260 };
261
262 struct nvmf_che_capsule {
263 struct nvmf_capsule nc;
264
265 volatile u_int refs;
266
267 struct nvmf_che_rxpdu rx_pdu;
268
269 uint32_t active_r2ts; /* Controller only */
270 #ifdef INVARIANTS
271 uint32_t tx_data_offset; /* Controller only */
272 u_int pending_r2ts; /* Controller only */
273 #endif
274
275 STAILQ_ENTRY(nvmf_che_capsule) link;
276 };
277
278 #define CCAP(nc) ((struct nvmf_che_capsule *)(nc))
279 #define CQP(qp) ((struct nvmf_che_qpair *)(qp))
280
281 static void che_release_capsule(struct nvmf_che_capsule *cc);
282 static void che_free_qpair(struct nvmf_qpair *nq);
283
284 SYSCTL_NODE(_kern_nvmf, OID_AUTO, che, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
285 "Chelsio TCP offload transport");
286
287 static u_int che_max_transmit_pdu = 32 * 1024;
288 SYSCTL_UINT(_kern_nvmf_che, OID_AUTO, max_transmit_pdu, CTLFLAG_RWTUN,
289 &che_max_transmit_pdu, 0,
290 "Maximum size of a transmitted PDU");
291
292 static u_int che_max_receive_pdu = 32 * 1024;
293 SYSCTL_UINT(_kern_nvmf_che, OID_AUTO, max_receive_pdu, CTLFLAG_RWTUN,
294 &che_max_receive_pdu, 0,
295 "Maximum size of a received PDU");
296
297 static int use_dsgl = 1;
298 SYSCTL_INT(_kern_nvmf_che, OID_AUTO, use_dsgl, CTLFLAG_RWTUN, &use_dsgl, 0,
299 "Use DSGL for PBL/FastReg (default=1)");
300
301 static int inline_threshold = 256;
302 SYSCTL_INT(_kern_nvmf_che, OID_AUTO, inline_threshold, CTLFLAG_RWTUN,
303 &inline_threshold, 0,
304 "inline vs dsgl threshold (default=256)");
305
306 static int ddp_tags_per_qp = 128;
307 SYSCTL_INT(_kern_nvmf_che, OID_AUTO, ddp_tags_per_qp, CTLFLAG_RWTUN,
308 &ddp_tags_per_qp, 0,
309 "Number of DDP tags to reserve for each queue pair");
310
311 static MALLOC_DEFINE(M_NVMF_CHE, "nvmf_che", "Chelsio NVMe-TCP offload");
312
313 /*
314 * PBL regions consist of N full-sized pages. TPT entries support an
315 * initial offset into the first page (FBO) and can handle a partial
316 * length on the last page.
317 */
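/*
 * Worked example (assuming 4 KiB pages): an I/O of 0x3000 bytes whose
 * buffer starts at byte 0x300 of its first page has FBO 0x300 and
 * spans howmany(0x300 + 0x3000, PAGE_SIZE) = 4 pages, so its PBL
 * needs four page-address entries.
 */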
318 static bool
319 che_ddp_io_check(struct nvmf_che_qpair *qp, const struct nvmf_io_request *io)
320 {
321 const struct memdesc *mem = &io->io_mem;
322 struct bus_dma_segment *ds;
323 int i;
324
325 if (io->io_len < qp->nca->ddp_threshold) {
326 return (false);
327 }
328
329 switch (mem->md_type) {
330 case MEMDESC_VADDR:
331 case MEMDESC_PADDR:
332 case MEMDESC_VMPAGES:
333 return (true);
334 case MEMDESC_VLIST:
335 case MEMDESC_PLIST:
336 /*
337 * Require all but the first segment to start on a
338 * page boundary. Require all but the last segment to
339 * end on a page boundary.
340 */
341 ds = mem->u.md_list;
342 for (i = 0; i < mem->md_nseg; i++, ds++) {
343 if (i != 0 && ds->ds_addr % PAGE_SIZE != 0)
344 return (false);
345 if (i != mem->md_nseg - 1 &&
346 (ds->ds_addr + ds->ds_len) % PAGE_SIZE != 0)
347 return (false);
348 }
349 return (true);
350 default:
351 /*
352 * Other types could be validated with more work, but
353 * they aren't used currently by nvmf(4) or nvmft(4).
354 */
355 return (false);
356 }
357 }
358
359 static u_int
360 che_fbo(struct nvmf_che_command_buffer *cb)
361 {
362 struct memdesc *mem = &cb->io.io_mem;
363
364 switch (mem->md_type) {
365 case MEMDESC_VADDR:
366 return ((uintptr_t)mem->u.md_vaddr & PAGE_MASK);
367 case MEMDESC_PADDR:
368 return (mem->u.md_paddr & PAGE_MASK);
369 case MEMDESC_VMPAGES:
370 return (mem->md_offset);
371 case MEMDESC_VLIST:
372 case MEMDESC_PLIST:
373 return (mem->u.md_list[0].ds_addr & PAGE_MASK);
374 default:
375 __assert_unreachable();
376 }
377 }
378
379 static u_int
380 che_npages(struct nvmf_che_command_buffer *cb)
381 {
382 return (howmany(che_fbo(cb) + cb->io.io_len, PAGE_SIZE));
383 }
384
385 static struct nvmf_che_command_buffer *
386 che_alloc_command_buffer(struct nvmf_che_qpair *qp,
387 const struct nvmf_io_request *io, uint32_t data_offset, size_t data_len,
388 uint16_t cid)
389 {
390 struct nvmf_che_command_buffer *cb;
391
392 cb = malloc(sizeof(*cb), M_NVMF_CHE, M_WAITOK);
393 cb->qp = qp;
394 cb->io = *io;
395 cb->data_offset = data_offset;
396 cb->data_len = data_len;
397 cb->data_xfered = 0;
398 refcount_init(&cb->refs, 1);
399 cb->error = 0;
400 cb->ddp_ok = che_ddp_io_check(qp, io);
401 cb->cid = cid;
402 cb->ttag = 0;
403 cb->original_cid = 0;
404 cb->cc = NULL;
405 cb->pbl = NULL;
406
407 return (cb);
408 }
409
410 static void
411 che_hold_command_buffer(struct nvmf_che_command_buffer *cb)
412 {
413 refcount_acquire(&cb->refs);
414 }
415
416 static void
417 che_free_command_buffer(struct nvmf_che_command_buffer *cb)
418 {
419 nvmf_complete_io_request(&cb->io, cb->data_xfered, cb->error);
420 if (cb->cc != NULL)
421 che_release_capsule(cb->cc);
422 MPASS(cb->pbl == NULL);
423 free(cb, M_NVMF_CHE);
424 }
425
426 static void
427 che_release_command_buffer(struct nvmf_che_command_buffer *cb)
428 {
429 if (refcount_release(&cb->refs))
430 che_free_command_buffer(cb);
431 }
432
433 static void
434 che_add_command_buffer(struct nvmf_che_command_buffer_list *list,
435 struct nvmf_che_command_buffer *cb)
436 {
437 mtx_assert(&list->lock, MA_OWNED);
438 TAILQ_INSERT_HEAD(&list->head, cb, link);
439 }
440
441 static struct nvmf_che_command_buffer *
442 che_find_command_buffer(struct nvmf_che_command_buffer_list *list,
443 uint16_t cid)
444 {
445 struct nvmf_che_command_buffer *cb;
446
447 mtx_assert(&list->lock, MA_OWNED);
448 TAILQ_FOREACH(cb, &list->head, link) {
449 if (cb->cid == cid)
450 return (cb);
451 }
452 return (NULL);
453 }
454
455 static void
456 che_remove_command_buffer(struct nvmf_che_command_buffer_list *list,
457 struct nvmf_che_command_buffer *cb)
458 {
459 mtx_assert(&list->lock, MA_OWNED);
460 TAILQ_REMOVE(&list->head, cb, link);
461 }
462
463 static void
464 che_purge_command_buffer(struct nvmf_che_command_buffer_list *list,
465 uint16_t cid)
466 {
467 struct nvmf_che_command_buffer *cb;
468
469 mtx_lock(&list->lock);
470 cb = che_find_command_buffer(list, cid);
471 if (cb != NULL) {
472 che_remove_command_buffer(list, cb);
473 mtx_unlock(&list->lock);
474 che_release_command_buffer(cb);
475 } else
476 mtx_unlock(&list->lock);
477 }
478
479 static int
480 che_write_mem_inline(struct adapter *sc, struct toepcb *toep, uint32_t addr,
481 uint32_t len, void *data, struct mbufq *wrq)
482 {
483 struct mbuf *m;
484 char *cp;
485 int copy_len, i, num_wqe, wr_len;
486
487 #ifdef VERBOSE_TRACES
488 CTR(KTR_CXGBE, "%s: addr 0x%x len %u", __func__, addr << 5, len);
489 #endif
490 num_wqe = DIV_ROUND_UP(len, T4_MAX_INLINE_SIZE);
491 cp = data;
492 for (i = 0; i < num_wqe; i++) {
493 copy_len = min(len, T4_MAX_INLINE_SIZE);
494 wr_len = T4_WRITE_MEM_INLINE_LEN(copy_len);
495
496 m = alloc_raw_wr_mbuf(wr_len);
497 if (m == NULL)
498 return (ENOMEM);
499 t4_write_mem_inline_wr(sc, mtod(m, void *), wr_len, toep->tid,
500 addr, copy_len, cp, 0);
501 if (cp != NULL)
502 cp += T4_MAX_INLINE_SIZE;
503 addr += T4_MAX_INLINE_SIZE >> 5;
504 len -= T4_MAX_INLINE_SIZE;
505
506 mbufq_enqueue(wrq, m);
507 }
508 return (0);
509 }
510
511 static int
512 che_write_mem_dma_aligned(struct adapter *sc, struct toepcb *toep,
513 uint32_t addr, uint32_t len, void *data, struct mbufq *wrq)
514 {
515 struct mbuf *m;
516 vm_offset_t va;
517 u_int todo;
518 int wr_len;
519
520 /* First page. */
521 va = (vm_offset_t)data;
522 todo = min(PAGE_SIZE - (va % PAGE_SIZE), len);
523 wr_len = T4_WRITE_MEM_DMA_LEN;
524 m = alloc_raw_wr_mbuf(wr_len);
525 if (m == NULL)
526 return (ENOMEM);
527 t4_write_mem_dma_wr(sc, mtod(m, void *), wr_len, toep->tid, addr,
528 todo, pmap_kextract(va), 0);
529 mbufq_enqueue(wrq, m);
530 len -= todo;
531 addr += todo >> 5;
532 va += todo;
533
534 while (len > 0) {
535 MPASS(va == trunc_page(va));
536 todo = min(PAGE_SIZE, len);
537 m = alloc_raw_wr_mbuf(wr_len);
538 if (m == NULL)
539 return (ENOMEM);
540 t4_write_mem_dma_wr(sc, mtod(m, void *), wr_len, toep->tid,
541 addr, todo, pmap_kextract(va), 0);
542 mbufq_enqueue(wrq, m);
543 len -= todo;
544 addr += todo >> 5;
545 va += todo;
546 }
547 return (0);
548 }
549
550 static int
551 che_write_adapter_mem(struct nvmf_che_qpair *qp, uint32_t addr, uint32_t len,
552 void *data)
553 {
554 struct adapter *sc = qp->nca->sc;
555 struct toepcb *toep = qp->toep;
556 struct socket *so = qp->so;
557 struct inpcb *inp = sotoinpcb(so);
558 struct mbufq mq;
559 int error;
560
561 mbufq_init(&mq, INT_MAX);
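	/*
	 * Zeroing writes (data == NULL) and small payloads are sent as
	 * inline work requests; larger payloads use DMA writes when the
	 * use_dsgl tunable is enabled (see inline_threshold above).
	 */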
562 if (!use_dsgl || len < inline_threshold || data == NULL)
563 error = che_write_mem_inline(sc, toep, addr, len, data, &mq);
564 else
565 error = che_write_mem_dma_aligned(sc, toep, addr, len, data,
566 &mq);
567 if (__predict_false(error != 0))
568 goto error;
569
570 INP_WLOCK(inp);
571 if ((inp->inp_flags & INP_DROPPED) != 0) {
572 INP_WUNLOCK(inp);
573 error = ECONNRESET;
574 goto error;
575 }
576 mbufq_concat(&toep->ulp_pduq, &mq);
577 INP_WUNLOCK(inp);
578 return (0);
579
580 error:
581 mbufq_drain(&mq);
582 return (error);
583 }
584
585 static bool
586 che_alloc_pbl(struct nvmf_che_qpair *qp, struct nvmf_che_command_buffer *cb)
587 {
588 struct adapter *sc = qp->nca->sc;
589 struct memdesc *mem = &cb->io.io_mem;
590 uint64_t *pbl;
591 uint32_t addr, len;
592 u_int i, npages;
593 int error;
594
595 MPASS(cb->pbl == NULL);
596 MPASS(cb->ddp_ok);
597
598 /* Hardware limit? iWARP only enforces this for T5. */
599 if (cb->io.io_len >= (8 * 1024 * 1024 * 1024ULL))
600 return (false);
601
602 npages = che_npages(cb);
603 len = roundup2(npages, 4) * sizeof(*cb->pbl);
604 addr = t4_pblpool_alloc(sc, len);
605 if (addr == 0)
606 return (false);
607
608 pbl = malloc(len, M_NVMF_CHE, M_NOWAIT | M_ZERO);
609 if (pbl == NULL) {
610 t4_pblpool_free(sc, addr, len);
611 return (false);
612 }
613
614 switch (mem->md_type) {
615 case MEMDESC_VADDR:
616 {
617 vm_offset_t va;
618
619 va = trunc_page((uintptr_t)mem->u.md_vaddr);
620 for (i = 0; i < npages; i++)
621 pbl[i] = htobe64(pmap_kextract(va + i * PAGE_SIZE));
622 break;
623 }
624 case MEMDESC_PADDR:
625 {
626 vm_paddr_t pa;
627
628 pa = trunc_page(mem->u.md_paddr);
629 for (i = 0; i < npages; i++)
630 pbl[i] = htobe64(pa + i * PAGE_SIZE);
631 break;
632 }
633 case MEMDESC_VMPAGES:
634 for (i = 0; i < npages; i++)
635 pbl[i] = htobe64(VM_PAGE_TO_PHYS(mem->u.md_ma[i]));
636 break;
637 case MEMDESC_VLIST:
638 {
639 struct bus_dma_segment *ds;
640 vm_offset_t va;
641 vm_size_t len;
642 u_int j, k;
643
644 i = 0;
645 ds = mem->u.md_list;
646 for (j = 0; j < mem->md_nseg; j++, ds++) {
647 va = trunc_page((uintptr_t)ds->ds_addr);
648 len = ds->ds_len;
649 if (ds->ds_addr % PAGE_SIZE != 0)
650 len += ds->ds_addr % PAGE_SIZE;
651 for (k = 0; k < howmany(len, PAGE_SIZE); k++) {
652 pbl[i] = htobe64(pmap_kextract(va +
653 k * PAGE_SIZE));
654 i++;
655 }
656 }
657 MPASS(i == npages);
658 break;
659 }
660 case MEMDESC_PLIST:
661 {
662 struct bus_dma_segment *ds;
663 vm_paddr_t pa;
664 vm_size_t len;
665 u_int j, k;
666
667 i = 0;
668 ds = mem->u.md_list;
669 for (j = 0; j < mem->md_nseg; j++, ds++) {
670 pa = trunc_page((vm_paddr_t)ds->ds_addr);
671 len = ds->ds_len;
672 if (ds->ds_addr % PAGE_SIZE != 0)
673 len += ds->ds_addr % PAGE_SIZE;
674 for (k = 0; k < howmany(len, PAGE_SIZE); k++) {
675 pbl[i] = htobe64(pa + k * PAGE_SIZE);
676 i++;
677 }
678 }
679 MPASS(i == npages);
680 break;
681 }
682 default:
683 __assert_unreachable();
684 }
685
686 error = che_write_adapter_mem(qp, addr >> 5, len, pbl);
687 if (error != 0) {
688 t4_pblpool_free(sc, addr, len);
689 free(pbl, M_NVMF_CHE);
690 return (false);
691 }
692
693 cb->pbl = pbl;
694 cb->pbl_addr = addr;
695 cb->pbl_len = len;
696
697 return (true);
698 }
699
700 static void
701 che_free_pbl(struct nvmf_che_command_buffer *cb)
702 {
703 free(cb->pbl, M_NVMF_CHE);
704 t4_pblpool_free(cb->qp->nca->sc, cb->pbl_addr, cb->pbl_len);
705 cb->pbl = NULL;
706 cb->pbl_addr = 0;
707 cb->pbl_len = 0;
708 }
709
710 static bool
711 che_write_tpt_entry(struct nvmf_che_qpair *qp,
712 struct nvmf_che_command_buffer *cb, uint16_t stag)
713 {
714 uint32_t tpt_addr;
715 int error;
716
717 cb->tpte.valid_to_pdid = htobe32(F_FW_RI_TPTE_VALID |
718 V_FW_RI_TPTE_STAGKEY(CHE_STAG_COLOR(stag)) |
719 F_FW_RI_TPTE_STAGSTATE |
720 V_FW_RI_TPTE_STAGTYPE(FW_RI_STAG_NSMR) |
721 V_FW_RI_TPTE_PDID(0));
722 cb->tpte.locread_to_qpid = htobe32(
723 V_FW_RI_TPTE_PERM(FW_RI_MEM_ACCESS_REM_WRITE) |
724 V_FW_RI_TPTE_ADDRTYPE(FW_RI_ZERO_BASED_TO) |
725 V_FW_RI_TPTE_PS(PAGE_SIZE) |
726 V_FW_RI_TPTE_QPID(qp->toep->tid));
727 #define PBL_OFF(qp, a) ((a) - (qp)->nca->sc->vres.pbl.start)
728 cb->tpte.nosnoop_pbladdr =
729 htobe32(V_FW_RI_TPTE_PBLADDR(PBL_OFF(qp, cb->pbl_addr) >> 3));
730 cb->tpte.len_lo = htobe32(cb->data_len);
731 cb->tpte.va_hi = 0;
732 cb->tpte.va_lo_fbo = htobe32(che_fbo(cb));
733 cb->tpte.dca_mwbcnt_pstag = 0;
734 cb->tpte.len_hi = htobe32(cb->data_offset);
735
736 tpt_addr = qp->tpt_offset + CHE_STAG_IDX(stag) +
737 (qp->nca->sc->vres.stag.start >> 5);
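	/*
	 * che_write_adapter_mem() takes its destination address in
	 * 32-byte units (note the >> 5 conversion of vres.stag.start
	 * above); a TPT entry is 32 bytes, so the STAG index is added to
	 * the base without further scaling.
	 */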
738
739 error = che_write_adapter_mem(qp, tpt_addr, sizeof(cb->tpte),
740 &cb->tpte);
741 return (error == 0);
742 }
743
744 static void
745 che_clear_tpt_entry(struct nvmf_che_qpair *qp, uint16_t stag)
746 {
747 uint32_t tpt_addr;
748
749 tpt_addr = qp->tpt_offset + CHE_STAG_IDX(stag) +
750 (qp->nca->sc->vres.stag.start >> 5);
751
752 (void)che_write_adapter_mem(qp, tpt_addr, sizeof(struct fw_ri_tpte),
753 NULL);
754 }
755
756 static uint16_t
757 che_alloc_ddp_stag(struct nvmf_che_qpair *qp,
758 struct nvmf_che_command_buffer *cb)
759 {
760 uint16_t stag_idx;
761
762 mtx_assert(&qp->rx_buffers.lock, MA_OWNED);
763 MPASS(cb->ddp_ok);
764
765 if (qp->active_ddp_tags == qp->num_ddp_tags)
766 return (CHE_DDP_NO_TAG);
767
768 MPASS(qp->num_ddp_tags != 0);
769
770 stag_idx = qp->next_ddp_tag;
771 for (;;) {
772 if (qp->open_ddp_tags[stag_idx] == NULL)
773 break;
774 if (stag_idx == qp->num_ddp_tags - 1) {
775 stag_idx = 0;
776 if (qp->ddp_color == CHE_DDP_MAX_COLOR)
777 qp->ddp_color = 0;
778 else
779 qp->ddp_color++;
780 } else
781 stag_idx++;
782 MPASS(stag_idx != qp->next_ddp_tag);
783 }
784 if (stag_idx == qp->num_ddp_tags - 1)
785 qp->next_ddp_tag = 0;
786 else
787 qp->next_ddp_tag = stag_idx + 1;
788
789 qp->active_ddp_tags++;
790 qp->open_ddp_tags[stag_idx] = cb;
791
792 return (CHE_DDP_TAG(stag_idx, qp->ddp_color));
793 }
794
795 static void
796 che_free_ddp_stag(struct nvmf_che_qpair *qp, struct nvmf_che_command_buffer *cb,
797 uint16_t stag)
798 {
799 MPASS(!CHE_TAG_IS_FL(stag));
800
801 mtx_assert(&qp->rx_buffers.lock, MA_OWNED);
802
803 MPASS(qp->open_ddp_tags[CHE_STAG_IDX(stag)] == cb);
804
805 qp->open_ddp_tags[CHE_STAG_IDX(stag)] = NULL;
806 qp->active_ddp_tags--;
807 }
808
809 static uint16_t
810 che_alloc_ddp_tag(struct nvmf_che_qpair *qp,
811 struct nvmf_che_command_buffer *cb)
812 {
813 uint16_t stag;
814
815 mtx_assert(&qp->rx_buffers.lock, MA_OWNED);
816
817 if (!cb->ddp_ok)
818 return (CHE_DDP_NO_TAG);
819
820 stag = che_alloc_ddp_stag(qp, cb);
821 if (stag == CHE_DDP_NO_TAG) {
822 counter_u64_add(qp->toep->ofld_rxq->rx_nvme_ddp_setup_no_stag,
823 1);
824 return (CHE_DDP_NO_TAG);
825 }
826
827 if (!che_alloc_pbl(qp, cb)) {
828 che_free_ddp_stag(qp, cb, stag);
829 counter_u64_add(qp->toep->ofld_rxq->rx_nvme_ddp_setup_error, 1);
830 return (CHE_DDP_NO_TAG);
831 }
832
833 if (!che_write_tpt_entry(qp, cb, stag)) {
834 che_free_pbl(cb);
835 che_free_ddp_stag(qp, cb, stag);
836 counter_u64_add(qp->toep->ofld_rxq->rx_nvme_ddp_setup_error, 1);
837 return (CHE_DDP_NO_TAG);
838 }
839
840 counter_u64_add(qp->toep->ofld_rxq->rx_nvme_ddp_setup_ok, 1);
841 return (stag);
842 }
843
844 static void
845 che_free_ddp_tag(struct nvmf_che_qpair *qp, struct nvmf_che_command_buffer *cb,
846 uint16_t stag)
847 {
848 MPASS(!CHE_TAG_IS_FL(stag));
849
850 mtx_assert(&qp->rx_buffers.lock, MA_OWNED);
851
852 MPASS(qp->open_ddp_tags[CHE_STAG_IDX(stag)] == cb);
853
854 che_clear_tpt_entry(qp, stag);
855 che_free_pbl(cb);
856 che_free_ddp_stag(qp, cb, stag);
857 }
858
859 static void
860 nvmf_che_write_pdu(struct nvmf_che_qpair *qp, struct mbuf *m)
861 {
862 struct epoch_tracker et;
863 struct socket *so = qp->so;
864 struct inpcb *inp = sotoinpcb(so);
865 struct toepcb *toep = qp->toep;
866
867 CURVNET_SET(so->so_vnet);
868 NET_EPOCH_ENTER(et);
869 INP_WLOCK(inp);
870 if (__predict_false(inp->inp_flags & INP_DROPPED) ||
871 __predict_false((toep->flags & TPF_ATTACHED) == 0)) {
872 m_freem(m);
873 } else {
874 mbufq_enqueue(&toep->ulp_pduq, m);
875 t4_push_pdus(toep->vi->adapter, toep, 0);
876 }
877 INP_WUNLOCK(inp);
878 NET_EPOCH_EXIT(et);
879 CURVNET_RESTORE();
880 }
881
882 static void
883 nvmf_che_report_error(struct nvmf_che_qpair *qp, uint16_t fes, uint32_t fei,
884 struct mbuf *rx_pdu, u_int hlen)
885 {
886 struct nvme_tcp_term_req_hdr *hdr;
887 struct mbuf *m;
888
889 if (hlen != 0) {
890 hlen = min(hlen, NVME_TCP_TERM_REQ_ERROR_DATA_MAX_SIZE);
891 hlen = min(hlen, m_length(rx_pdu, NULL));
892 }
893
894 m = m_get2(sizeof(*hdr) + hlen, M_WAITOK, MT_DATA, M_PKTHDR);
895 m->m_len = sizeof(*hdr) + hlen;
896 m->m_pkthdr.len = m->m_len;
897 hdr = mtod(m, void *);
898 memset(hdr, 0, sizeof(*hdr));
899 hdr->common.pdu_type = qp->qp.nq_controller ?
900 NVME_TCP_PDU_TYPE_C2H_TERM_REQ : NVME_TCP_PDU_TYPE_H2C_TERM_REQ;
901 hdr->common.hlen = sizeof(*hdr);
902 hdr->common.plen = sizeof(*hdr) + hlen;
903 hdr->fes = htole16(fes);
904 le32enc(hdr->fei, fei);
905 if (hlen != 0)
906 m_copydata(rx_pdu, 0, hlen, (caddr_t)(hdr + 1));
907
908 nvmf_che_write_pdu(qp, m);
909 }
910
911 static int
912 nvmf_che_validate_pdu(struct nvmf_che_qpair *qp, struct nvmf_che_rxpdu *pdu)
913 {
914 const struct nvme_tcp_common_pdu_hdr *ch;
915 struct mbuf *m = pdu->m;
916 uint32_t data_len, fei, plen, rx_digest;
917 u_int hlen, cpl_error;
918 int error;
919 uint16_t fes;
920
921 /* Determine how large of a PDU header to return for errors. */
922 ch = pdu->hdr;
923 hlen = ch->hlen;
924 plen = le32toh(ch->plen);
925 if (hlen < sizeof(*ch) || hlen > plen)
926 hlen = sizeof(*ch);
927
928 cpl_error = m->m_pkthdr.nvmf_cpl_status & CMP_STATUS_ERROR_MASK;
929 switch (cpl_error) {
930 case CMP_STATUS_NO_ERROR:
931 break;
932 case CMP_STATUS_HEADER_DIGEST:
933 counter_u64_add(
934 qp->toep->ofld_rxq->rx_nvme_header_digest_errors, 1);
935 printf("NVMe/TCP: Header digest mismatch\n");
936 rx_digest = le32dec(mtodo(m, ch->hlen));
937 nvmf_che_report_error(qp,
938 NVME_TCP_TERM_REQ_FES_HDGST_ERROR, rx_digest, m,
939 hlen);
940 return (EBADMSG);
941 case CMP_STATUS_DIRECTION_MISMATCH:
942 counter_u64_add(qp->toep->ofld_rxq->rx_nvme_invalid_headers, 1);
943 printf("NVMe/TCP: Invalid PDU type %u\n", ch->pdu_type);
944 nvmf_che_report_error(qp,
945 NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD,
946 offsetof(struct nvme_tcp_common_pdu_hdr, pdu_type), m,
947 hlen);
948 return (EBADMSG);
949 case CMP_STATUS_SUCCESS_NOT_LAST:
950 case CMP_STATUS_DIGEST_FLAG_MISMATCH:
951 counter_u64_add(qp->toep->ofld_rxq->rx_nvme_invalid_headers, 1);
952 printf("NVMe/TCP: Invalid PDU header flags %#x\n", ch->flags);
953 nvmf_che_report_error(qp,
954 NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD,
955 offsetof(struct nvme_tcp_common_pdu_hdr, flags), m, hlen);
956 return (EBADMSG);
957 case CMP_STATUS_BAD_DATA_LENGTH:
958 counter_u64_add(qp->toep->ofld_rxq->rx_nvme_invalid_headers, 1);
959 printf("NVMe/TCP: Invalid PDU length %u\n", plen);
960 nvmf_che_report_error(qp,
961 NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD,
962 offsetof(struct nvme_tcp_common_pdu_hdr, plen), m, hlen);
963 return (EBADMSG);
964 case CMP_STATUS_USER_MODE_UNALLOCATED:
965 case CMP_STATUS_RQT_LIMIT:
966 case CMP_STATUS_RQT_WRAP:
967 case CMP_STATUS_RQT_BOUND:
968 device_printf(qp->nca->sc->dev,
969 "received invalid NVMET error %u\n",
970 cpl_error);
971 return (ECONNRESET);
972 case CMP_STATUS_TPT_LIMIT:
973 case CMP_STATUS_TPT_INVALID:
974 case CMP_STATUS_TPT_COLOUR_MISMATCH:
975 case CMP_STATUS_TPT_MISC:
976 case CMP_STATUS_TPT_WRAP:
977 case CMP_STATUS_TPT_BOUND:
978 counter_u64_add(qp->toep->ofld_rxq->rx_nvme_invalid_headers, 1);
979 switch (ch->pdu_type) {
980 case NVME_TCP_PDU_TYPE_H2C_DATA:
981 nvmf_che_report_error(qp,
982 NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD,
983 offsetof(struct nvme_tcp_h2c_data_hdr, ttag),
984 pdu->m, pdu->hdr->hlen);
985 return (EBADMSG);
986 case NVME_TCP_PDU_TYPE_C2H_DATA:
987 nvmf_che_report_error(qp,
988 NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD,
989 offsetof(struct nvme_tcp_c2h_data_hdr, cccid), m,
990 hlen);
991 return (EBADMSG);
992 default:
993 device_printf(qp->nca->sc->dev,
994 "received DDP NVMET error %u for PDU %u\n",
995 cpl_error, ch->pdu_type);
996 return (ECONNRESET);
997 }
998 case CMP_STATUS_TPT_LAST_PDU_UNALIGNED:
999 counter_u64_add(qp->toep->ofld_rxq->rx_nvme_invalid_headers, 1);
1000 nvmf_che_report_error(qp,
1001 NVME_TCP_TERM_REQ_FES_PDU_SEQUENCE_ERROR, 0, m, hlen);
1002 return (EBADMSG);
1003 case CMP_STATUS_PBL_LIMIT:
1004 counter_u64_add(qp->toep->ofld_rxq->rx_nvme_invalid_headers, 1);
1005 nvmf_che_report_error(qp,
1006 NVME_TCP_TERM_REQ_FES_DATA_TRANSFER_OUT_OF_RANGE, 0, m,
1007 hlen);
1008 return (EBADMSG);
1009 case CMP_STATUS_DATA_DIGEST:
1010 /* Handled below. */
1011 break;
1012 default:
1013 device_printf(qp->nca->sc->dev,
1014 "received unknown NVMET error %u\n",
1015 cpl_error);
1016 return (ECONNRESET);
1017 }
1018
1019 error = nvmf_tcp_validate_pdu_header(ch, qp->qp.nq_controller,
1020 qp->header_digests, qp->data_digests, qp->rxpda, &data_len, &fes,
1021 &fei);
1022 if (error != 0) {
1023 if (error != ECONNRESET)
1024 nvmf_che_report_error(qp, fes, fei, m, hlen);
1025 return (error);
1026 }
1027
1028 /* Check data digest if present. */
1029 pdu->data_digest_mismatch = false;
1030 if ((ch->flags & NVME_TCP_CH_FLAGS_DDGSTF) != 0) {
1031 if (cpl_error == CMP_STATUS_DATA_DIGEST) {
1032 printf("NVMe/TCP: Data digest mismatch\n");
1033 pdu->data_digest_mismatch = true;
1034 counter_u64_add(
1035 qp->toep->ofld_rxq->rx_nvme_data_digest_errors, 1);
1036 }
1037 }
1038
1039 pdu->data_len = data_len;
1040
1041 return (0);
1042 }
1043
1044 static void
1045 nvmf_che_free_pdu(struct nvmf_che_rxpdu *pdu)
1046 {
1047 m_freem(pdu->m);
1048 pdu->m = NULL;
1049 pdu->hdr = NULL;
1050 }
1051
1052 static int
1053 nvmf_che_handle_term_req(struct nvmf_che_rxpdu *pdu)
1054 {
1055 const struct nvme_tcp_term_req_hdr *hdr;
1056
1057 hdr = (const void *)pdu->hdr;
1058
1059 printf("NVMe/TCP: Received termination request: fes %#x fei %#x\n",
1060 le16toh(hdr->fes), le32dec(hdr->fei));
1061 nvmf_che_free_pdu(pdu);
1062 return (ECONNRESET);
1063 }
1064
1065 static int
1066 nvmf_che_save_command_capsule(struct nvmf_che_qpair *qp,
1067 struct nvmf_che_rxpdu *pdu)
1068 {
1069 const struct nvme_tcp_cmd *cmd;
1070 struct nvmf_capsule *nc;
1071 struct nvmf_che_capsule *cc;
1072
1073 cmd = (const void *)pdu->hdr;
1074
1075 nc = nvmf_allocate_command(&qp->qp, &cmd->ccsqe, M_WAITOK);
1076
1077 cc = CCAP(nc);
1078 cc->rx_pdu = *pdu;
1079
1080 nvmf_capsule_received(&qp->qp, nc);
1081 return (0);
1082 }
1083
1084 static int
1085 nvmf_che_save_response_capsule(struct nvmf_che_qpair *qp,
1086 struct nvmf_che_rxpdu *pdu)
1087 {
1088 const struct nvme_tcp_rsp *rsp;
1089 struct nvme_completion cpl;
1090 struct nvmf_capsule *nc;
1091 struct nvmf_che_capsule *cc;
1092 uint16_t cid;
1093
1094 rsp = (const void *)pdu->hdr;
1095
1096 /*
1097 * Restore the original CID and ensure any command buffers
1098 * associated with this CID have been released. Once the CQE
1099 * has been received, no further transfers to the command
1100 * buffer for the associated CID can occur.
1101 */
1102 cpl = rsp->rccqe;
1103 cid = le16toh(cpl.cid);
1104 if (CHE_TAG_IS_FL(cid)) {
1105 cid = CHE_RAW_FL_TAG(cid);
1106 mtx_lock(&qp->fl_cid_lock);
1107 MPASS(FL_CID_ISACTIVE(cid, qp->fl_cid_set));
1108 cpl.cid = qp->fl_cids[cid];
1109 FL_CID_FREE(cid, qp->fl_cid_set);
1110 mtx_unlock(&qp->fl_cid_lock);
1111
1112 che_purge_command_buffer(&qp->rx_buffers, rsp->rccqe.cid);
1113 che_purge_command_buffer(&qp->tx_buffers, rsp->rccqe.cid);
1114 } else {
1115 struct nvmf_che_command_buffer *cb;
1116
1117 mtx_lock(&qp->rx_buffers.lock);
1118 cb = qp->open_ddp_tags[CHE_STAG_IDX(cid)];
1119 MPASS(cb != NULL);
1120 MPASS(cb->cid == rsp->rccqe.cid);
1121 cpl.cid = cb->original_cid;
1122 che_free_ddp_tag(qp, cb, cid);
1123 mtx_unlock(&qp->rx_buffers.lock);
1124 che_release_command_buffer(cb);
1125 }
1126 #ifdef VERBOSE_TRACES
1127 CTR(KTR_CXGBE, "%s: tid %u freed cid 0x%04x for 0x%04x", __func__,
1128 qp->toep->tid, le16toh(rsp->rccqe.cid), cpl.cid);
1129 #endif
1130
1131 nc = nvmf_allocate_response(&qp->qp, &cpl, M_WAITOK);
1132
1133 nc->nc_sqhd_valid = true;
1134 cc = CCAP(nc);
1135 cc->rx_pdu = *pdu;
1136
1137 nvmf_capsule_received(&qp->qp, nc);
1138 return (0);
1139 }
1140
1141 /*
1142 * Construct a PDU that contains an optional data payload. This
1143 * includes dealing with the length fields in the common header. The
1144 * adapter inserts digests and padding when the PDU is transmitted.
1145 */
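/*
 * Length math, with illustrative values: for a 24-byte H2C_DATA header
 * with header digests enabled, a TXPDA of 8, and 512 bytes of payload,
 * plen starts at 24 + 4 = 28, pdo = roundup(28, 8) = 32, and the final
 * plen is 32 + 512 = 544 (plus 4 more if data digests are enabled).
 */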
1146 static struct mbuf *
1147 nvmf_che_construct_pdu(struct nvmf_che_qpair *qp, void *hdr, size_t hlen,
1148 struct mbuf *data, uint32_t data_len)
1149 {
1150 struct nvme_tcp_common_pdu_hdr *ch;
1151 struct mbuf *top;
1152 uint32_t pdo, plen;
1153 uint8_t ulp_submode;
1154
1155 plen = hlen;
1156 if (qp->header_digests)
1157 plen += sizeof(uint32_t);
1158 if (data_len != 0) {
1159 KASSERT(m_length(data, NULL) == data_len, ("length mismatch"));
1160 pdo = roundup(plen, qp->txpda);
1161 plen = pdo + data_len;
1162 if (qp->data_digests)
1163 plen += sizeof(uint32_t);
1164 } else {
1165 KASSERT(data == NULL, ("payload mbuf with zero length"));
1166 pdo = 0;
1167 }
1168
1169 top = m_get2(hlen, M_WAITOK, MT_DATA, M_PKTHDR);
1170 top->m_len = hlen;
1171 top->m_pkthdr.len = hlen;
1172 ch = mtod(top, void *);
1173 memcpy(ch, hdr, hlen);
1174 ch->hlen = hlen;
1175 ulp_submode = 0;
1176 if (qp->header_digests) {
1177 ch->flags |= NVME_TCP_CH_FLAGS_HDGSTF;
1178 ulp_submode |= ULP_CRC_HEADER;
1179 }
1180 if (qp->data_digests && data_len != 0) {
1181 ch->flags |= NVME_TCP_CH_FLAGS_DDGSTF;
1182 ulp_submode |= ULP_CRC_DATA;
1183 }
1184 ch->pdo = pdo;
1185 ch->plen = htole32(plen);
1186 set_mbuf_ulp_submode(top, ulp_submode);
1187
1188 if (data_len != 0) {
1189 top->m_pkthdr.len += data_len;
1190 top->m_next = data;
1191 }
1192
1193 return (top);
1194 }
1195
1196 /* Allocate the next free freelist transfer tag. */
1197 static bool
1198 nvmf_che_allocate_fl_ttag(struct nvmf_che_qpair *qp,
1199 struct nvmf_che_command_buffer *cb)
1200 {
1201 uint16_t ttag;
1202
1203 mtx_assert(&qp->rx_buffers.lock, MA_OWNED);
1204
1205 if (qp->active_fl_ttags == qp->num_fl_ttags)
1206 return (false);
1207
1208 ttag = qp->next_fl_ttag;
1209 for (;;) {
1210 if (qp->open_fl_ttags[ttag] == NULL)
1211 break;
1212 if (ttag == qp->num_fl_ttags - 1)
1213 ttag = 0;
1214 else
1215 ttag++;
1216 MPASS(ttag != qp->next_fl_ttag);
1217 }
1218 if (ttag == qp->num_fl_ttags - 1)
1219 qp->next_fl_ttag = 0;
1220 else
1221 qp->next_fl_ttag = ttag + 1;
1222
1223 qp->active_fl_ttags++;
1224 qp->open_fl_ttags[ttag] = cb;
1225
1226 cb->ttag = ttag | CHE_FL_TAG_MASK;
1227 return (true);
1228 }
1229
1230 /* Attempt to allocate a free transfer tag and assign it to cb. */
1231 static bool
1232 nvmf_che_allocate_ttag(struct nvmf_che_qpair *qp,
1233 struct nvmf_che_command_buffer *cb)
1234 {
1235 uint16_t stag;
1236
1237 mtx_assert(&qp->rx_buffers.lock, MA_OWNED);
1238
1239 stag = che_alloc_ddp_tag(qp, cb);
1240 if (stag == CHE_DDP_NO_TAG) {
1241 if (!nvmf_che_allocate_fl_ttag(qp, cb))
1242 return (false);
1243 } else {
1244 cb->ttag = stag;
1245 }
1246 #ifdef VERBOSE_TRACES
1247 CTR(KTR_CXGBE, "%s: tid %u allocated ttag 0x%04x", __func__,
1248 qp->toep->tid, cb->ttag);
1249 #endif
1250 cb->cc->active_r2ts++;
1251 return (true);
1252 }
1253
1254 /* Find the next command buffer eligible to schedule for R2T. */
1255 static struct nvmf_che_command_buffer *
1256 nvmf_che_next_r2t(struct nvmf_che_qpair *qp)
1257 {
1258 struct nvmf_che_command_buffer *cb;
1259
1260 mtx_assert(&qp->rx_buffers.lock, MA_OWNED);
1261
1262 TAILQ_FOREACH(cb, &qp->rx_buffers.head, link) {
1263 		/* NB: maxr2t is 0's based (so 0 permits one active R2T). */
1264 if (cb->cc->active_r2ts > qp->maxr2t)
1265 continue;
1266
1267 if (!nvmf_che_allocate_ttag(qp, cb))
1268 return (NULL);
1269 #ifdef INVARIANTS
1270 cb->cc->pending_r2ts--;
1271 #endif
1272 TAILQ_REMOVE(&qp->rx_buffers.head, cb, link);
1273 return (cb);
1274 }
1275 return (NULL);
1276 }
1277
1278 /* NB: cid is little-endian already. */
1279 static void
1280 che_send_r2t(struct nvmf_che_qpair *qp, uint16_t cid, uint16_t ttag,
1281 uint32_t data_offset, uint32_t data_len)
1282 {
1283 struct nvme_tcp_r2t_hdr r2t;
1284 struct mbuf *m;
1285
1286 memset(&r2t, 0, sizeof(r2t));
1287 r2t.common.pdu_type = NVME_TCP_PDU_TYPE_R2T;
1288 r2t.cccid = cid;
1289 r2t.ttag = htole16(ttag);
1290 r2t.r2to = htole32(data_offset);
1291 r2t.r2tl = htole32(data_len);
1292
1293 m = nvmf_che_construct_pdu(qp, &r2t, sizeof(r2t), NULL, 0);
1294 nvmf_che_write_pdu(qp, m);
1295 }
1296
1297 /*
1298 * Release a transfer tag and schedule another R2T.
1299 *
1300 * NB: This drops the rx_buffers.lock mutex.
1301 */
1302 static void
1303 nvmf_che_send_next_r2t(struct nvmf_che_qpair *qp,
1304 struct nvmf_che_command_buffer *cb)
1305 {
1306 struct nvmf_che_command_buffer *ncb;
1307
1308 mtx_assert(&qp->rx_buffers.lock, MA_OWNED);
1309
1310 #ifdef VERBOSE_TRACES
1311 CTR(KTR_CXGBE, "%s: tid %u freed ttag 0x%04x", __func__, qp->toep->tid,
1312 cb->ttag);
1313 #endif
1314 if (CHE_TAG_IS_FL(cb->ttag)) {
1315 uint16_t ttag;
1316
1317 ttag = CHE_RAW_FL_TAG(cb->ttag);
1318 MPASS(qp->open_fl_ttags[ttag] == cb);
1319
1320 /* Release this transfer tag. */
1321 qp->open_fl_ttags[ttag] = NULL;
1322 qp->active_fl_ttags--;
1323 } else
1324 che_free_ddp_tag(qp, cb, cb->ttag);
1325
1326 cb->cc->active_r2ts--;
1327
1328 /* Schedule another R2T. */
1329 ncb = nvmf_che_next_r2t(qp);
1330 mtx_unlock(&qp->rx_buffers.lock);
1331 if (ncb != NULL)
1332 che_send_r2t(qp, ncb->cid, ncb->ttag, ncb->data_offset,
1333 ncb->data_len);
1334 }
1335
1336 /*
1337 * Copy len bytes starting at offset skip from an mbuf chain into an
1338 * I/O buffer at destination offset io_offset.
1339 */
1340 static void
1341 mbuf_copyto_io(struct mbuf *m, u_int skip, u_int len,
1342 struct nvmf_io_request *io, u_int io_offset)
1343 {
1344 u_int todo;
1345
1346 while (m->m_len <= skip) {
1347 skip -= m->m_len;
1348 m = m->m_next;
1349 }
1350 while (len != 0) {
1351 MPASS((m->m_flags & M_EXTPG) == 0);
1352
1353 todo = min(m->m_len - skip, len);
1354 memdesc_copyback(&io->io_mem, io_offset, todo, mtodo(m, skip));
1355 skip = 0;
1356 io_offset += todo;
1357 len -= todo;
1358 m = m->m_next;
1359 }
1360 }
1361
1362 static int
1363 nvmf_che_handle_h2c_data(struct nvmf_che_qpair *qp, struct nvmf_che_rxpdu *pdu)
1364 {
1365 const struct nvme_tcp_h2c_data_hdr *h2c;
1366 struct nvmf_che_command_buffer *cb;
1367 uint32_t data_len, data_offset;
1368 uint16_t ttag, fl_ttag;
1369
1370 h2c = (const void *)pdu->hdr;
1371 if (le32toh(h2c->datal) > qp->maxh2cdata) {
1372 nvmf_che_report_error(qp,
1373 NVME_TCP_TERM_REQ_FES_DATA_TRANSFER_LIMIT_EXCEEDED, 0,
1374 pdu->m, pdu->hdr->hlen);
1375 nvmf_che_free_pdu(pdu);
1376 return (EBADMSG);
1377 }
1378
1379 ttag = le16toh(h2c->ttag);
1380 if (CHE_TAG_IS_FL(ttag)) {
1381 fl_ttag = CHE_RAW_FL_TAG(ttag);
1382 if (fl_ttag >= qp->num_fl_ttags) {
1383 nvmf_che_report_error(qp,
1384 NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD,
1385 offsetof(struct nvme_tcp_h2c_data_hdr, ttag),
1386 pdu->m, pdu->hdr->hlen);
1387 nvmf_che_free_pdu(pdu);
1388 return (EBADMSG);
1389 }
1390
1391 mtx_lock(&qp->rx_buffers.lock);
1392 cb = qp->open_fl_ttags[fl_ttag];
1393 } else {
1394 if (CHE_STAG_IDX(ttag) >= qp->num_ddp_tags) {
1395 nvmf_che_report_error(qp,
1396 NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD,
1397 offsetof(struct nvme_tcp_h2c_data_hdr, ttag),
1398 pdu->m, pdu->hdr->hlen);
1399 nvmf_che_free_pdu(pdu);
1400 return (EBADMSG);
1401 }
1402
1403 mtx_lock(&qp->rx_buffers.lock);
1404 cb = qp->open_ddp_tags[CHE_STAG_IDX(ttag)];
1405 }
1406
1407 if (cb == NULL) {
1408 mtx_unlock(&qp->rx_buffers.lock);
1409 nvmf_che_report_error(qp,
1410 NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD,
1411 offsetof(struct nvme_tcp_h2c_data_hdr, ttag), pdu->m,
1412 pdu->hdr->hlen);
1413 nvmf_che_free_pdu(pdu);
1414 return (EBADMSG);
1415 }
1416 MPASS(cb->ttag == ttag);
1417
1418 /* For a data digest mismatch, fail the I/O request. */
1419 if (pdu->data_digest_mismatch) {
1420 nvmf_che_send_next_r2t(qp, cb);
1421 cb->error = EINTEGRITY;
1422 che_release_command_buffer(cb);
1423 nvmf_che_free_pdu(pdu);
1424 return (0);
1425 }
1426
1427 data_len = le32toh(h2c->datal);
1428 if (data_len != pdu->data_len) {
1429 mtx_unlock(&qp->rx_buffers.lock);
1430 nvmf_che_report_error(qp,
1431 NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD,
1432 offsetof(struct nvme_tcp_h2c_data_hdr, datal), pdu->m,
1433 pdu->hdr->hlen);
1434 nvmf_che_free_pdu(pdu);
1435 return (EBADMSG);
1436 }
1437
1438 data_offset = le32toh(h2c->datao);
1439 if (data_offset < cb->data_offset ||
1440 data_offset + data_len > cb->data_offset + cb->data_len) {
1441 mtx_unlock(&qp->rx_buffers.lock);
1442 nvmf_che_report_error(qp,
1443 NVME_TCP_TERM_REQ_FES_DATA_TRANSFER_OUT_OF_RANGE, 0, pdu->m,
1444 pdu->hdr->hlen);
1445 nvmf_che_free_pdu(pdu);
1446 return (EBADMSG);
1447 }
1448
1449 if (data_offset != cb->data_offset + cb->data_xfered) {
1450 if (CHE_TAG_IS_FL(ttag)) {
1451 mtx_unlock(&qp->rx_buffers.lock);
1452 nvmf_che_report_error(qp,
1453 NVME_TCP_TERM_REQ_FES_PDU_SEQUENCE_ERROR, 0, pdu->m,
1454 pdu->hdr->hlen);
1455 nvmf_che_free_pdu(pdu);
1456 return (EBADMSG);
1457 } else {
1458 uint32_t ddp_bytes;
1459
1460 /* Account for PDUs silently received via DDP. */
1461 ddp_bytes = data_offset -
1462 (cb->data_offset + cb->data_xfered);
1463 cb->data_xfered += ddp_bytes;
1464 #ifdef VERBOSE_TRACES
1465 CTR(KTR_CXGBE, "%s: tid %u previous ddp_bytes %u",
1466 __func__, qp->toep->tid, ddp_bytes);
1467 #endif
1468 counter_u64_add(qp->toep->ofld_rxq->rx_nvme_ddp_octets,
1469 ddp_bytes);
1470 }
1471 }
1472
1473 if ((cb->data_xfered + data_len == cb->data_len) !=
1474 ((pdu->hdr->flags & NVME_TCP_H2C_DATA_FLAGS_LAST_PDU) != 0)) {
1475 mtx_unlock(&qp->rx_buffers.lock);
1476 nvmf_che_report_error(qp,
1477 NVME_TCP_TERM_REQ_FES_PDU_SEQUENCE_ERROR, 0, pdu->m,
1478 pdu->hdr->hlen);
1479 nvmf_che_free_pdu(pdu);
1480 return (EBADMSG);
1481 }
1482
1483 cb->data_xfered += data_len;
1484 data_offset -= cb->data_offset;
1485 if (cb->data_xfered == cb->data_len) {
1486 nvmf_che_send_next_r2t(qp, cb);
1487 } else {
1488 che_hold_command_buffer(cb);
1489 mtx_unlock(&qp->rx_buffers.lock);
1490 }
1491
1492 if (CHE_TAG_IS_FL(ttag))
1493 mbuf_copyto_io(pdu->m->m_next, 0, data_len, &cb->io,
1494 data_offset);
1495
1496 che_release_command_buffer(cb);
1497 nvmf_che_free_pdu(pdu);
1498 return (0);
1499 }
1500
1501 static int
1502 nvmf_che_handle_c2h_data(struct nvmf_che_qpair *qp, struct nvmf_che_rxpdu *pdu)
1503 {
1504 const struct nvme_tcp_c2h_data_hdr *c2h;
1505 struct nvmf_che_command_buffer *cb;
1506 uint32_t data_len, data_offset;
1507 uint16_t cid, original_cid;
1508
1509 /*
1510 * Unlike freelist command buffers, DDP command buffers are
1511 * not released until the response capsule is received to keep
1512 * the STAG allocated until the command has completed.
1513 */
1514 c2h = (const void *)pdu->hdr;
1515
1516 cid = le16toh(c2h->cccid);
1517 if (CHE_TAG_IS_FL(cid)) {
1518 mtx_lock(&qp->rx_buffers.lock);
1519 cb = che_find_command_buffer(&qp->rx_buffers, c2h->cccid);
1520 } else {
1521 if (CHE_STAG_IDX(cid) >= qp->num_ddp_tags) {
1522 nvmf_che_report_error(qp,
1523 NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD,
1524 offsetof(struct nvme_tcp_c2h_data_hdr, cccid),
1525 pdu->m, pdu->hdr->hlen);
1526 nvmf_che_free_pdu(pdu);
1527 return (EBADMSG);
1528 }
1529
1530 mtx_lock(&qp->rx_buffers.lock);
1531 cb = qp->open_ddp_tags[CHE_STAG_IDX(cid)];
1532 }
1533
1534 if (cb == NULL) {
1535 mtx_unlock(&qp->rx_buffers.lock);
1536 /*
1537 * XXX: Could be PDU sequence error if cccid is for a
1538 * command that doesn't use a command buffer.
1539 */
1540 nvmf_che_report_error(qp,
1541 NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD,
1542 offsetof(struct nvme_tcp_c2h_data_hdr, cccid), pdu->m,
1543 pdu->hdr->hlen);
1544 nvmf_che_free_pdu(pdu);
1545 return (EBADMSG);
1546 }
1547
1548 /* For a data digest mismatch, fail the I/O request. */
1549 if (pdu->data_digest_mismatch) {
1550 cb->error = EINTEGRITY;
1551 if (CHE_TAG_IS_FL(cid)) {
1552 che_remove_command_buffer(&qp->rx_buffers, cb);
1553 mtx_unlock(&qp->rx_buffers.lock);
1554 che_release_command_buffer(cb);
1555 } else
1556 mtx_unlock(&qp->rx_buffers.lock);
1557 nvmf_che_free_pdu(pdu);
1558 return (0);
1559 }
1560
1561 data_len = le32toh(c2h->datal);
1562 if (data_len != pdu->data_len) {
1563 mtx_unlock(&qp->rx_buffers.lock);
1564 nvmf_che_report_error(qp,
1565 NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD,
1566 offsetof(struct nvme_tcp_c2h_data_hdr, datal), pdu->m,
1567 pdu->hdr->hlen);
1568 nvmf_che_free_pdu(pdu);
1569 return (EBADMSG);
1570 }
1571
1572 data_offset = le32toh(c2h->datao);
1573 if (data_offset < cb->data_offset ||
1574 data_offset + data_len > cb->data_offset + cb->data_len) {
1575 mtx_unlock(&qp->rx_buffers.lock);
1576 nvmf_che_report_error(qp,
1577 NVME_TCP_TERM_REQ_FES_DATA_TRANSFER_OUT_OF_RANGE, 0,
1578 pdu->m, pdu->hdr->hlen);
1579 nvmf_che_free_pdu(pdu);
1580 return (EBADMSG);
1581 }
1582
1583 if (data_offset != cb->data_offset + cb->data_xfered) {
1584 if (CHE_TAG_IS_FL(cid)) {
1585 mtx_unlock(&qp->rx_buffers.lock);
1586 nvmf_che_report_error(qp,
1587 NVME_TCP_TERM_REQ_FES_PDU_SEQUENCE_ERROR, 0, pdu->m,
1588 pdu->hdr->hlen);
1589 nvmf_che_free_pdu(pdu);
1590 return (EBADMSG);
1591 } else {
1592 uint32_t ddp_bytes;
1593
1594 /* Account for PDUs silently received via DDP. */
1595 ddp_bytes = data_offset -
1596 (cb->data_offset + cb->data_xfered);
1597 cb->data_xfered += ddp_bytes;
1598 #ifdef VERBOSE_TRACES
1599 CTR(KTR_CXGBE, "%s: tid %u previous ddp_bytes %u",
1600 __func__, qp->toep->tid, ddp_bytes);
1601 #endif
1602 counter_u64_add(qp->toep->ofld_rxq->rx_nvme_ddp_octets,
1603 ddp_bytes);
1604 }
1605 }
1606
1607 if ((cb->data_xfered + data_len == cb->data_len) !=
1608 ((pdu->hdr->flags & NVME_TCP_C2H_DATA_FLAGS_LAST_PDU) != 0)) {
1609 mtx_unlock(&qp->rx_buffers.lock);
1610 nvmf_che_report_error(qp,
1611 NVME_TCP_TERM_REQ_FES_PDU_SEQUENCE_ERROR, 0, pdu->m,
1612 pdu->hdr->hlen);
1613 nvmf_che_free_pdu(pdu);
1614 return (EBADMSG);
1615 }
1616
1617 cb->data_xfered += data_len;
1618 original_cid = cb->original_cid;
1619
1620 if (CHE_TAG_IS_FL(cid)) {
1621 data_offset -= cb->data_offset;
1622 if (cb->data_xfered == cb->data_len)
1623 che_remove_command_buffer(&qp->rx_buffers, cb);
1624 else
1625 che_hold_command_buffer(cb);
1626 mtx_unlock(&qp->rx_buffers.lock);
1627
1628 if ((pdu->hdr->flags & NVME_TCP_C2H_DATA_FLAGS_SUCCESS) != 0) {
1629 /*
1630 * Free the CID as the command has now been
1631 * completed.
1632 */
1633 cid = CHE_RAW_FL_TAG(cid);
1634 mtx_lock(&qp->fl_cid_lock);
1635 MPASS(FL_CID_ISACTIVE(cid, qp->fl_cid_set));
1636 MPASS(original_cid == qp->fl_cids[cid]);
1637 FL_CID_FREE(cid, qp->fl_cid_set);
1638 mtx_unlock(&qp->fl_cid_lock);
1639 }
1640
1641 mbuf_copyto_io(pdu->m->m_next, 0, data_len, &cb->io,
1642 data_offset);
1643
1644 che_release_command_buffer(cb);
1645 } else {
1646 if ((pdu->hdr->flags & NVME_TCP_C2H_DATA_FLAGS_SUCCESS) != 0) {
1647 /*
1648 * Free the command buffer and STAG as the
1649 * command has now been completed.
1650 */
1651 che_free_ddp_tag(qp, cb, cid);
1652 mtx_unlock(&qp->rx_buffers.lock);
1653 che_release_command_buffer(cb);
1654 } else
1655 mtx_unlock(&qp->rx_buffers.lock);
1656 }
1657
1658 if ((pdu->hdr->flags & NVME_TCP_C2H_DATA_FLAGS_SUCCESS) != 0) {
1659 struct nvme_completion cqe;
1660 struct nvmf_capsule *nc;
1661
1662 memset(&cqe, 0, sizeof(cqe));
1663 cqe.cid = original_cid;
1664
1665 nc = nvmf_allocate_response(&qp->qp, &cqe, M_WAITOK);
1666 nc->nc_sqhd_valid = false;
1667
1668 nvmf_capsule_received(&qp->qp, nc);
1669 }
1670
1671 nvmf_che_free_pdu(pdu);
1672 return (0);
1673 }
1674
1675 /* Called when m_free drops refcount to 0. */
1676 static void
1677 nvmf_che_mbuf_done(struct mbuf *m)
1678 {
1679 struct nvmf_che_command_buffer *cb = m->m_ext.ext_arg1;
1680
1681 che_free_command_buffer(cb);
1682 }
1683
1684 static struct mbuf *
1685 nvmf_che_mbuf(void *arg, int how, void *data, size_t len)
1686 {
1687 struct nvmf_che_command_buffer *cb = arg;
1688 struct mbuf *m;
1689
1690 m = m_get(how, MT_DATA);
1691 m->m_flags |= M_RDONLY;
1692 m_extaddref(m, data, len, &cb->refs, nvmf_che_mbuf_done, cb, NULL);
1693 m->m_len = len;
1694 return (m);
1695 }
1696
1697 static void
1698 nvmf_che_free_mext_pg(struct mbuf *m)
1699 {
1700 struct nvmf_che_command_buffer *cb = m->m_ext.ext_arg1;
1701
1702 M_ASSERTEXTPG(m);
1703 che_release_command_buffer(cb);
1704 }
1705
1706 static struct mbuf *
1707 nvmf_che_mext_pg(void *arg, int how)
1708 {
1709 struct nvmf_che_command_buffer *cb = arg;
1710 struct mbuf *m;
1711
1712 m = mb_alloc_ext_pgs(how, nvmf_che_free_mext_pg, M_RDONLY);
1713 m->m_ext.ext_arg1 = cb;
1714 che_hold_command_buffer(cb);
1715 return (m);
1716 }
1717
1718 /*
1719 * Return an mbuf chain for a range of data belonging to a command
1720 * buffer.
1721 *
1722 * The mbuf chain uses M_EXT mbufs which hold references on the
1723 * command buffer so that it remains "alive" until the data has been
1724  * fully transmitted. If can_truncate is true, then the mbuf chain
1725 * might return a short chain to avoid gratuitously splitting up a
1726 * page.
1727 */
1728 static struct mbuf *
1729 nvmf_che_command_buffer_mbuf(struct nvmf_che_command_buffer *cb,
1730 uint32_t data_offset, uint32_t data_len, uint32_t *actual_len,
1731 bool can_truncate)
1732 {
1733 struct mbuf *m;
1734 size_t len;
1735
1736 m = memdesc_alloc_ext_mbufs(&cb->io.io_mem, nvmf_che_mbuf,
1737 nvmf_che_mext_pg, cb, M_WAITOK, data_offset, data_len, &len,
1738 can_truncate);
1739 if (actual_len != NULL)
1740 *actual_len = len;
1741 return (m);
1742 }
1743
1744 /* NB: cid and ttag are little-endian already. */
1745 static void
1746 che_send_h2c_pdu(struct nvmf_che_qpair *qp, uint16_t cid, uint16_t ttag,
1747 uint32_t data_offset, struct mbuf *m, size_t len, bool last_pdu)
1748 {
1749 struct nvme_tcp_h2c_data_hdr h2c;
1750 struct mbuf *top;
1751
1752 memset(&h2c, 0, sizeof(h2c));
1753 h2c.common.pdu_type = NVME_TCP_PDU_TYPE_H2C_DATA;
1754 if (last_pdu)
1755 h2c.common.flags |= NVME_TCP_H2C_DATA_FLAGS_LAST_PDU;
1756 h2c.cccid = cid;
1757 h2c.ttag = ttag;
1758 h2c.datao = htole32(data_offset);
1759 h2c.datal = htole32(len);
1760
1761 top = nvmf_che_construct_pdu(qp, &h2c, sizeof(h2c), m, len);
1762 nvmf_che_write_pdu(qp, top);
1763 }
1764
1765 static int
1766 nvmf_che_handle_r2t(struct nvmf_che_qpair *qp, struct nvmf_che_rxpdu *pdu)
1767 {
1768 const struct nvme_tcp_r2t_hdr *r2t;
1769 struct nvmf_che_command_buffer *cb;
1770 uint32_t data_len, data_offset;
1771
1772 r2t = (const void *)pdu->hdr;
1773
1774 mtx_lock(&qp->tx_buffers.lock);
1775 cb = che_find_command_buffer(&qp->tx_buffers, r2t->cccid);
1776 if (cb == NULL) {
1777 mtx_unlock(&qp->tx_buffers.lock);
1778 nvmf_che_report_error(qp,
1779 NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD,
1780 offsetof(struct nvme_tcp_r2t_hdr, cccid), pdu->m,
1781 pdu->hdr->hlen);
1782 nvmf_che_free_pdu(pdu);
1783 return (EBADMSG);
1784 }
1785
1786 data_offset = le32toh(r2t->r2to);
1787 if (data_offset != cb->data_xfered) {
1788 mtx_unlock(&qp->tx_buffers.lock);
1789 nvmf_che_report_error(qp,
1790 NVME_TCP_TERM_REQ_FES_PDU_SEQUENCE_ERROR, 0, pdu->m,
1791 pdu->hdr->hlen);
1792 nvmf_che_free_pdu(pdu);
1793 return (EBADMSG);
1794 }
1795
1796 /*
1797 	 * XXX: The spec does not specify how to handle R2T transfers
1798 * out of range of the original command.
1799 */
1800 data_len = le32toh(r2t->r2tl);
1801 if (data_offset + data_len > cb->data_len) {
1802 mtx_unlock(&qp->tx_buffers.lock);
1803 nvmf_che_report_error(qp,
1804 NVME_TCP_TERM_REQ_FES_DATA_TRANSFER_OUT_OF_RANGE, 0,
1805 pdu->m, pdu->hdr->hlen);
1806 nvmf_che_free_pdu(pdu);
1807 return (EBADMSG);
1808 }
1809
1810 cb->data_xfered += data_len;
1811 if (cb->data_xfered == cb->data_len)
1812 che_remove_command_buffer(&qp->tx_buffers, cb);
1813 else
1814 che_hold_command_buffer(cb);
1815 mtx_unlock(&qp->tx_buffers.lock);
1816
1817 /*
1818 * Queue one or more H2C_DATA PDUs containing the requested
1819 * data.
1820 */
1821 while (data_len > 0) {
1822 struct mbuf *m;
1823 uint32_t sent, todo;
1824
1825 todo = min(data_len, qp->max_tx_data);
1826 m = nvmf_che_command_buffer_mbuf(cb, data_offset, todo, &sent,
1827 todo < data_len);
1828 che_send_h2c_pdu(qp, r2t->cccid, r2t->ttag, data_offset, m,
1829 sent, sent == data_len);
1830
1831 data_offset += sent;
1832 data_len -= sent;
1833 }
1834
1835 che_release_command_buffer(cb);
1836 nvmf_che_free_pdu(pdu);
1837 return (0);
1838 }
1839
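/*
 * Dispatch a validated PDU to the handler for its type.  PDUs with
 * invalid types are expected to have been rejected during validation.
 */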
1840 static int
1841 nvmf_che_dispatch_pdu(struct nvmf_che_qpair *qp, struct nvmf_che_rxpdu *pdu)
1842 {
1843 /*
1844 * The PDU header should always be contiguous in the mbuf from
1845 * CPL_NVMT_CMP.
1846 */
1847 pdu->hdr = mtod(pdu->m, void *);
1848 KASSERT(pdu->m->m_len == pdu->hdr->hlen +
1849 ((pdu->hdr->flags & NVME_TCP_CH_FLAGS_HDGSTF) != 0 ?
1850 sizeof(uint32_t) : 0),
1851 ("%s: mismatched PDU header mbuf length", __func__));
1852
1853 switch (pdu->hdr->pdu_type) {
1854 default:
1855 __assert_unreachable();
1856 break;
1857 case NVME_TCP_PDU_TYPE_H2C_TERM_REQ:
1858 case NVME_TCP_PDU_TYPE_C2H_TERM_REQ:
1859 return (nvmf_che_handle_term_req(pdu));
1860 case NVME_TCP_PDU_TYPE_CAPSULE_CMD:
1861 return (nvmf_che_save_command_capsule(qp, pdu));
1862 case NVME_TCP_PDU_TYPE_CAPSULE_RESP:
1863 return (nvmf_che_save_response_capsule(qp, pdu));
1864 case NVME_TCP_PDU_TYPE_H2C_DATA:
1865 return (nvmf_che_handle_h2c_data(qp, pdu));
1866 case NVME_TCP_PDU_TYPE_C2H_DATA:
1867 return (nvmf_che_handle_c2h_data(qp, pdu));
1868 case NVME_TCP_PDU_TYPE_R2T:
1869 return (nvmf_che_handle_r2t(qp, pdu));
1870 }
1871 }
1872
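/*
 * Attach the data payload to a received PDU.  For DDP the data has
 * already been placed in the host buffer, so only the counters are
 * updated; otherwise the payload is reassembled from the mbufs queued
 * by CPL_NVMT_DATA and chained onto the PDU header mbuf.
 */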
1873 static int
1874 nvmf_che_attach_pdu_data(struct nvmf_che_qpair *qp, struct nvmf_che_rxpdu *pdu)
1875 {
1876 struct socket *so = qp->so;
1877 struct mbuf *m, *n;
1878 uint32_t tcp_seq;
1879 size_t len;
1880 int error;
1881
1882 /* Check for DDP data. */
1883 if (pdu->ddp) {
1884 counter_u64_add(qp->toep->ofld_rxq->rx_nvme_ddp_pdus, 1);
1885 counter_u64_add(qp->toep->ofld_rxq->rx_nvme_ddp_octets,
1886 pdu->data_len);
1887 return (0);
1888 }
1889
1890 error = 0;
1891 len = pdu->data_len;
1892 tcp_seq = pdu->m->m_pkthdr.nvmf_tcp_seq;
1893 m = pdu->m;
1894 SOCKBUF_LOCK(&so->so_rcv);
1895 while (len > 0) {
1896 n = mbufq_dequeue(&qp->rx_data);
1897 KASSERT(n != NULL, ("%s: missing %zu data", __func__, len));
1898 if (n == NULL) {
1899 error = ENOBUFS;
1900 break;
1901 }
1902
1903 KASSERT(n->m_pkthdr.nvmf_tcp_seq == tcp_seq,
1904 ("%s: TCP seq mismatch", __func__));
1905 KASSERT(n->m_pkthdr.len <= len,
1906 ("%s: too much data", __func__));
1907 if (n->m_pkthdr.nvmf_tcp_seq != tcp_seq ||
1908 n->m_pkthdr.len > len) {
1909 m_freem(n);
1910 error = ENOBUFS;
1911 break;
1912 }
1913
1914 #ifdef VERBOSE_TRACES
1915 CTR(KTR_CXGBE, "%s: tid %u len %d seq %u", __func__,
1916 qp->toep->tid, n->m_pkthdr.len, n->m_pkthdr.nvmf_tcp_seq);
1917 #endif
1918 pdu->m->m_pkthdr.len += n->m_pkthdr.len;
1919 len -= n->m_pkthdr.len;
1920 tcp_seq += n->m_pkthdr.len;
1921 m_demote_pkthdr(n);
1922 m->m_next = n;
1923 m = m_last(n);
1924 }
1925 SOCKBUF_UNLOCK(&so->so_rcv);
1926
1927 if (error == 0) {
1928 counter_u64_add(qp->toep->ofld_rxq->rx_nvme_fl_pdus, 1);
1929 counter_u64_add(qp->toep->ofld_rxq->rx_nvme_fl_octets,
1930 pdu->data_len);
1931 }
1932 return (error);
1933 }
1934
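/*
 * Receive kthread: dequeues PDU headers posted by CPL_NVMT_CMP,
 * validates and dispatches them, and reports socket errors to the
 * nvmf(4) layer.
 */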
1935 static void
1936 nvmf_che_receive(void *arg)
1937 {
1938 struct nvmf_che_qpair *qp = arg;
1939 struct socket *so = qp->so;
1940 struct nvmf_che_rxpdu pdu;
1941 struct mbuf *m;
1942 int error, terror;
1943
1944 SOCKBUF_LOCK(&so->so_rcv);
1945 while (!qp->rx_shutdown) {
1946 /* Wait for a PDU. */
1947 if (so->so_error != 0 || so->so_rerror != 0) {
1948 if (so->so_error != 0)
1949 error = so->so_error;
1950 else
1951 error = so->so_rerror;
1952 SOCKBUF_UNLOCK(&so->so_rcv);
1953 error:
1954 nvmf_qpair_error(&qp->qp, error);
1955 SOCKBUF_LOCK(&so->so_rcv);
1956 while (!qp->rx_shutdown)
1957 cv_wait(&qp->rx_cv, SOCKBUF_MTX(&so->so_rcv));
1958 break;
1959 }
1960
1961 m = mbufq_dequeue(&qp->rx_pdus);
1962 if (m == NULL) {
1963 if ((so->so_rcv.sb_state & SBS_CANTRCVMORE) != 0) {
1964 error = 0;
1965 SOCKBUF_UNLOCK(&so->so_rcv);
1966 goto error;
1967 }
1968 cv_wait(&qp->rx_cv, SOCKBUF_MTX(&so->so_rcv));
1969 continue;
1970 }
1971 SOCKBUF_UNLOCK(&so->so_rcv);
1972
1973 pdu.m = m;
1974 pdu.hdr = mtod(m, const void *);
1975 pdu.ddp = (m->m_pkthdr.nvmf_cpl_status & CMP_STATUS_DDP) != 0;
1976
1977 error = nvmf_che_validate_pdu(qp, &pdu);
1978 if (error == 0 && pdu.data_len != 0)
1979 error = nvmf_che_attach_pdu_data(qp, &pdu);
1980 if (error != 0)
1981 nvmf_che_free_pdu(&pdu);
1982 else
1983 error = nvmf_che_dispatch_pdu(qp, &pdu);
1984 if (error != 0) {
1985 /*
1986 * If we received a termination request, close
1987 * the connection immediately.
1988 */
1989 if (error == ECONNRESET)
1990 goto error;
1991
1992 /*
1993 * Wait for up to 30 seconds for the socket to
1994 * be closed by the other end.
1995 */
1996 SOCKBUF_LOCK(&so->so_rcv);
1997 if ((so->so_rcv.sb_state & SBS_CANTRCVMORE) == 0) {
1998 terror = cv_timedwait(&qp->rx_cv,
1999 SOCKBUF_MTX(&so->so_rcv), 30 * hz);
2000 if (terror == ETIMEDOUT)
2001 printf("NVMe/TCP: Timed out after sending terminate request\n");
2002 }
2003 SOCKBUF_UNLOCK(&so->so_rcv);
2004 goto error;
2005 }
2006
2007 SOCKBUF_LOCK(&so->so_rcv);
2008 }
2009 SOCKBUF_UNLOCK(&so->so_rcv);
2010 kthread_exit();
2011 }
2012
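/*
 * Socket receive upcall, used to notice a remote shutdown (FIN) and
 * wake the receive kthread.
 */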
2013 static int
2014 nvmf_che_soupcall_receive(struct socket *so, void *arg, int waitflag)
2015 {
2016 struct nvmf_che_qpair *qp = arg;
2017
2018 cv_signal(&qp->rx_cv);
2019 return (SU_OK);
2020 }
2021
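/*
 * CPL_NVMT_DATA delivers PDU payload data.  Tag the mbuf with its TCP
 * sequence number and queue it for the receive kthread to pair with
 * the matching PDU header from CPL_NVMT_CMP.
 */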
2022 static int
2023 do_nvmt_data(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
2024 {
2025 struct adapter *sc = iq->adapter;
2026 struct nvmf_che_adapter *nca = sc->nvme_ulp_softc;
2027 const struct cpl_nvmt_data *cpl;
2028 u_int tid;
2029 struct toepcb *toep;
2030 struct nvmf_che_qpair *qp;
2031 struct socket *so;
2032 struct inpcb *inp;
2033 struct tcpcb *tp;
2034 int len __diagused;
2035
2036 if (nca->nvmt_data_iqe) {
2037 cpl = (const void *)(rss + 1);
2038 } else {
2039 cpl = mtod(m, const void *);
2040
2041 /* strip off CPL header */
2042 m_adj(m, sizeof(*cpl));
2043 }
2044 tid = GET_TID(cpl);
2045 toep = lookup_tid(sc, tid);
2046
2047 KASSERT(toep->tid == tid, ("%s: toep tid/atid mismatch", __func__));
2048
2049 len = m->m_pkthdr.len;
2050
2051 KASSERT(len == be16toh(cpl->length),
2052 ("%s: payload length mismatch", __func__));
2053
2054 inp = toep->inp;
2055 INP_WLOCK(inp);
2056 if (inp->inp_flags & INP_DROPPED) {
2057 CTR(KTR_CXGBE, "%s: tid %u, rx (%d bytes), inp_flags 0x%x",
2058 __func__, tid, len, inp->inp_flags);
2059 INP_WUNLOCK(inp);
2060 m_freem(m);
2061 return (0);
2062 }
2063
2064 /* Save TCP sequence number. */
2065 m->m_pkthdr.nvmf_tcp_seq = be32toh(cpl->seq);
2066
2067 qp = toep->ulpcb;
2068 so = qp->so;
2069 SOCKBUF_LOCK(&so->so_rcv);
2070 mbufq_enqueue(&qp->rx_data, m);
2071 SOCKBUF_UNLOCK(&so->so_rcv);
2072
2073 tp = intotcpcb(inp);
2074 tp->t_rcvtime = ticks;
2075
2076 #ifdef VERBOSE_TRACES
2077 CTR(KTR_CXGBE, "%s: tid %u len %d seq %u", __func__, tid, len,
2078 be32toh(cpl->seq));
2079 #endif
2080
2081 INP_WUNLOCK(inp);
2082 return (0);
2083 }
2084
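/*
 * CPL_NVMT_CMP delivers a PDU header along with a completion status
 * that indicates, among other things, whether the payload was placed
 * via DDP.  Queue the header on the PDU queue and wake the receive
 * kthread if the queue was empty.
 */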
2085 static int
2086 do_nvmt_cmp(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
2087 {
2088 struct adapter *sc = iq->adapter;
2089 const struct cpl_nvmt_cmp *cpl = mtod(m, const void *);
2090 u_int tid = GET_TID(cpl);
2091 struct toepcb *toep = lookup_tid(sc, tid);
2092 struct nvmf_che_qpair *qp = toep->ulpcb;
2093 struct socket *so = qp->so;
2094 struct inpcb *inp = toep->inp;
2095 u_int hlen __diagused;
2096 bool empty;
2097
2098 KASSERT(toep->tid == tid, ("%s: toep tid/atid mismatch", __func__));
2099 KASSERT(!(toep->flags & TPF_SYNQE),
2100 ("%s: toep %p claims to be a synq entry", __func__, toep));
2101
2102 /* strip off CPL header */
2103 m_adj(m, sizeof(*cpl));
2104 hlen = m->m_pkthdr.len;
2105
2106 KASSERT(hlen == be16toh(cpl->length),
2107 ("%s: payload length mismatch", __func__));
2108
2109 INP_WLOCK(inp);
2110 if (inp->inp_flags & INP_DROPPED) {
2111 CTR(KTR_CXGBE, "%s: tid %u, rx (hlen %u), inp_flags 0x%x",
2112 __func__, tid, hlen, inp->inp_flags);
2113 INP_WUNLOCK(inp);
2114 m_freem(m);
2115 return (0);
2116 }
2117
2118 #ifdef VERBOSE_TRACES
2119 CTR(KTR_CXGBE, "%s: tid %u hlen %u seq %u status %u", __func__, tid,
2120 hlen, be32toh(cpl->seq), cpl->status);
2121 #endif
2122
2123 /* Save TCP sequence number and CPL status. */
2124 m->m_pkthdr.nvmf_tcp_seq = be32toh(cpl->seq);
2125 m->m_pkthdr.nvmf_cpl_status = cpl->status;
2126
2127 SOCKBUF_LOCK(&so->so_rcv);
2128 empty = mbufq_len(&qp->rx_pdus) == 0;
2129 mbufq_enqueue(&qp->rx_pdus, m);
2130 SOCKBUF_UNLOCK(&so->so_rcv);
2131 INP_WUNLOCK(inp);
2132 if (empty)
2133 cv_signal(&qp->rx_cv);
2134 return (0);
2135 }
2136
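/*
 * Allocate a freelist (non-DDP) CID for a host command and record the
 * original CID assigned by nvmf(4) that it replaces.
 */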
2137 static uint16_t
2138 che_alloc_fl_cid(struct nvmf_che_qpair *qp, uint16_t original_cid)
2139 {
2140 uint16_t new_cid;
2141
2142 mtx_lock(&qp->fl_cid_lock);
2143 new_cid = FL_CID_FINDFREE_AT(qp->fl_cid_set, qp->next_cid);
2144 if (new_cid == 0) {
2145 new_cid = FL_CID_FINDFREE_AT(qp->fl_cid_set, 0);
2146 MPASS(new_cid != 0);
2147 }
2148 new_cid--;
2149 FL_CID_BUSY(new_cid, qp->fl_cid_set);
2150 if (new_cid == CHE_MAX_FL_TAG)
2151 qp->next_cid = 0;
2152 else
2153 qp->next_cid = new_cid + 1;
2154 qp->fl_cids[new_cid] = original_cid;
2155 mtx_unlock(&qp->fl_cid_lock);
2156
2157 return (new_cid | CHE_FL_TAG_MASK);
2158 }
2159
2160 static uint16_t
2161 che_alloc_ddp_cid(struct nvmf_che_qpair *qp, struct nvmf_che_command_buffer *cb)
2162 {
2163 mtx_assert(&qp->rx_buffers.lock, MA_OWNED);
2164
2165 return (che_alloc_ddp_tag(qp, cb));
2166 }
2167
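/*
 * Convert a command capsule into a CAPSULE_CMD PDU.  Data sent by the
 * host is either embedded as in-capsule data (when it fits within the
 * negotiated ICD limit) or registered as a transmit command buffer for
 * later R2Ts.  Data to be received uses a DDP tag when one is
 * available and otherwise falls back to a freelist CID.
 */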
2168 static struct mbuf *
2169 che_command_pdu(struct nvmf_che_qpair *qp, struct nvmf_che_capsule *cc)
2170 {
2171 struct nvmf_capsule *nc = &cc->nc;
2172 struct nvmf_che_command_buffer *cb;
2173 struct nvme_sgl_descriptor *sgl;
2174 struct nvme_tcp_cmd cmd;
2175 struct mbuf *top, *m;
2176 uint16_t cid;
2177 bool use_icd;
2178
2179 use_icd = false;
2180 cb = NULL;
2181 m = NULL;
2182
2183 if (nc->nc_data.io_len != 0) {
2184 cb = che_alloc_command_buffer(qp, &nc->nc_data, 0,
2185 nc->nc_data.io_len, nc->nc_sqe.cid);
2186 cb->original_cid = nc->nc_sqe.cid;
2187
2188 if (nc->nc_send_data && nc->nc_data.io_len <= qp->max_icd) {
2189 cid = che_alloc_fl_cid(qp, nc->nc_sqe.cid);
2190 use_icd = true;
2191 m = nvmf_che_command_buffer_mbuf(cb, 0,
2192 nc->nc_data.io_len, NULL, false);
2193 cb->data_xfered = nc->nc_data.io_len;
2194 che_release_command_buffer(cb);
2195 } else if (nc->nc_send_data) {
2196 cid = che_alloc_fl_cid(qp, nc->nc_sqe.cid);
2197 cb->cid = htole16(cid);
2198 mtx_lock(&qp->tx_buffers.lock);
2199 che_add_command_buffer(&qp->tx_buffers, cb);
2200 mtx_unlock(&qp->tx_buffers.lock);
2201 } else {
2202 mtx_lock(&qp->rx_buffers.lock);
2203 cid = che_alloc_ddp_cid(qp, cb);
2204 if (cid == CHE_DDP_NO_TAG) {
2205 cid = che_alloc_fl_cid(qp, nc->nc_sqe.cid);
2206 che_add_command_buffer(&qp->rx_buffers, cb);
2207 }
2208 cb->cid = htole16(cid);
2209 mtx_unlock(&qp->rx_buffers.lock);
2210 }
2211 } else
2212 cid = che_alloc_fl_cid(qp, nc->nc_sqe.cid);
2213
2214 #ifdef VERBOSE_TRACES
2215 CTR(KTR_CXGBE, "%s: tid %u allocated cid 0x%04x for 0x%04x", __func__,
2216 qp->toep->tid, cid, nc->nc_sqe.cid);
2217 #endif
2218 memset(&cmd, 0, sizeof(cmd));
2219 cmd.common.pdu_type = NVME_TCP_PDU_TYPE_CAPSULE_CMD;
2220 cmd.ccsqe = nc->nc_sqe;
2221 cmd.ccsqe.cid = htole16(cid);
2222
2223 /* Populate SGL in SQE. */
2224 sgl = &cmd.ccsqe.sgl;
2225 memset(sgl, 0, sizeof(*sgl));
2226 sgl->address = 0;
2227 sgl->length = htole32(nc->nc_data.io_len);
2228 if (use_icd) {
2229 /* Use in-capsule data. */
2230 sgl->type = NVME_SGL_TYPE_ICD;
2231 } else {
2232 /* Use a command buffer. */
2233 sgl->type = NVME_SGL_TYPE_COMMAND_BUFFER;
2234 }
2235
2236 top = nvmf_che_construct_pdu(qp, &cmd, sizeof(cmd), m, m != NULL ?
2237 nc->nc_data.io_len : 0);
2238 return (top);
2239 }
2240
2241 static struct mbuf *
2242 che_response_pdu(struct nvmf_che_qpair *qp, struct nvmf_che_capsule *cc)
2243 {
2244 struct nvmf_capsule *nc = &cc->nc;
2245 struct nvme_tcp_rsp rsp;
2246
2247 memset(&rsp, 0, sizeof(rsp));
2248 rsp.common.pdu_type = NVME_TCP_PDU_TYPE_CAPSULE_RESP;
2249 rsp.rccqe = nc->nc_cqe;
2250
2251 return (nvmf_che_construct_pdu(qp, &rsp, sizeof(rsp), NULL, 0));
2252 }
2253
2254 static struct mbuf *
2255 capsule_to_pdu(struct nvmf_che_qpair *qp, struct nvmf_che_capsule *cc)
2256 {
2257 if (cc->nc.nc_qe_len == sizeof(struct nvme_command))
2258 return (che_command_pdu(qp, cc));
2259 else
2260 return (che_response_pdu(qp, cc));
2261 }
2262
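/*
 * Transmit kthread: converts queued capsules into PDUs and writes them
 * to the connection, reporting socket errors to the nvmf(4) layer.
 */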
2263 static void
2264 nvmf_che_send(void *arg)
2265 {
2266 struct nvmf_che_qpair *qp = arg;
2267 struct nvmf_che_capsule *cc;
2268 struct socket *so = qp->so;
2269 struct mbuf *m;
2270 int error;
2271
2272 m = NULL;
2273 SOCKBUF_LOCK(&so->so_snd);
2274 while (!qp->tx_shutdown) {
2275 if (so->so_error != 0) {
2276 error = so->so_error;
2277 SOCKBUF_UNLOCK(&so->so_snd);
2278 m_freem(m);
2279 nvmf_qpair_error(&qp->qp, error);
2280 SOCKBUF_LOCK(&so->so_snd);
2281 while (!qp->tx_shutdown)
2282 cv_wait(&qp->tx_cv, SOCKBUF_MTX(&so->so_snd));
2283 break;
2284 }
2285
2286 if (STAILQ_EMPTY(&qp->tx_capsules)) {
2287 cv_wait(&qp->tx_cv, SOCKBUF_MTX(&so->so_snd));
2288 continue;
2289 }
2290
2291 /* Convert a capsule into a PDU. */
2292 cc = STAILQ_FIRST(&qp->tx_capsules);
2293 STAILQ_REMOVE_HEAD(&qp->tx_capsules, link);
2294 SOCKBUF_UNLOCK(&so->so_snd);
2295
2296 m = capsule_to_pdu(qp, cc);
2297 che_release_capsule(cc);
2298
2299 nvmf_che_write_pdu(qp, m);
2300
2301 SOCKBUF_LOCK(&so->so_snd);
2302 }
2303 SOCKBUF_UNLOCK(&so->so_snd);
2304 kthread_exit();
2305 }
2306
2307 static int
2308 nvmf_che_setsockopt(struct socket *so, u_int sspace, u_int rspace)
2309 {
2310 struct sockopt opt;
2311 int error, one = 1;
2312
2313 /* Don't lower the buffer sizes, just enforce a minimum. */
2314 SOCKBUF_LOCK(&so->so_snd);
2315 if (sspace < so->so_snd.sb_hiwat)
2316 sspace = so->so_snd.sb_hiwat;
2317 SOCKBUF_UNLOCK(&so->so_snd);
2318 SOCKBUF_LOCK(&so->so_rcv);
2319 if (rspace < so->so_rcv.sb_hiwat)
2320 rspace = so->so_rcv.sb_hiwat;
2321 SOCKBUF_UNLOCK(&so->so_rcv);
2322
2323 error = soreserve(so, sspace, rspace);
2324 if (error != 0)
2325 return (error);
2326 SOCKBUF_LOCK(&so->so_snd);
2327 so->so_snd.sb_flags |= SB_AUTOSIZE;
2328 SOCKBUF_UNLOCK(&so->so_snd);
2329 SOCKBUF_LOCK(&so->so_rcv);
2330 so->so_rcv.sb_flags |= SB_AUTOSIZE;
2331 SOCKBUF_UNLOCK(&so->so_rcv);
2332
2333 /*
2334 * Disable Nagle.
2335 */
2336 bzero(&opt, sizeof(opt));
2337 opt.sopt_dir = SOPT_SET;
2338 opt.sopt_level = IPPROTO_TCP;
2339 opt.sopt_name = TCP_NODELAY;
2340 opt.sopt_val = &one;
2341 opt.sopt_valsize = sizeof(one);
2342 error = sosetopt(so, &opt);
2343 if (error != 0)
2344 return (error);
2345
2346 return (0);
2347 }
2348
2349 static void
2350 t4_nvme_set_tcb_field(struct toepcb *toep, uint16_t word, uint64_t mask,
2351 uint64_t val)
2352 {
2353 struct adapter *sc = td_adapter(toep->td);
2354
2355 t4_set_tcb_field(sc, &toep->ofld_txq->wrq, toep, word, mask, val, 0, 0);
2356 }
2357
2358 static void
2359 set_ulp_mode_nvme(struct toepcb *toep, u_int ulp_submode, uint8_t rxpda)
2360 {
2361 uint64_t val;
2362
2363 CTR(KTR_CXGBE, "%s: tid %u, ULP_MODE_NVMET, submode=%#x, rxpda=%u",
2364 __func__, toep->tid, ulp_submode, rxpda);
2365
2366 val = V_TCB_ULP_TYPE(ULP_MODE_NVMET) | V_TCB_ULP_RAW(ulp_submode);
2367 t4_nvme_set_tcb_field(toep, W_TCB_ULP_TYPE,
2368 V_TCB_ULP_TYPE(M_TCB_ULP_TYPE) | V_TCB_ULP_RAW(M_TCB_ULP_RAW), val);
2369
2370 val = V_TF_RX_FLOW_CONTROL_DISABLE(1ULL);
2371 t4_nvme_set_tcb_field(toep, W_TCB_T_FLAGS, val, val);
2372
2373 val = V_TCB_RSVD((rxpda / 4) - 1);
2374 t4_nvme_set_tcb_field(toep, W_TCB_RSVD, V_TCB_RSVD(M_TCB_RSVD), val);
2375
2376 /* 0 disables CPL_NVMT_CMP_IMM, which is not useful in this driver. */
2377 val = 0;
2378 t4_nvme_set_tcb_field(toep, W_TCB_CMP_IMM_SZ,
2379 V_TCB_CMP_IMM_SZ(M_TCB_CMP_IMM_SZ), val);
2380 }
2381
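/*
 * Compute the maximum data payload that fits in a PDU of max_pdu_len
 * bytes given the header size, optional digests, and data alignment.
 */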
2382 static u_int
2383 pdu_max_data_len(const nvlist_t *nvl, u_int max_pdu_len, u_int hlen,
2384 uint8_t pda)
2385 {
2386 u_int max_data_len;
2387
2388 if (nvlist_get_bool(nvl, "header_digests"))
2389 hlen += sizeof(uint32_t);
2390 hlen = roundup(hlen, pda);
2391 max_data_len = max_pdu_len - hlen;
2392 if (nvlist_get_bool(nvl, "data_digests"))
2393 max_data_len -= sizeof(uint32_t);
2394 return (max_data_len);
2395 }
2396
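/*
 * Create a queue pair on top of an offloaded TCP connection passed in
 * as a socket file descriptor, switching the connection to
 * ULP_MODE_NVMET and starting the transmit and receive kthreads.
 */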
2397 static struct nvmf_qpair *
2398 che_allocate_qpair(bool controller, const nvlist_t *nvl)
2399 {
2400 struct nvmf_che_adapter *nca;
2401 struct nvmf_che_qpair *qp;
2402 struct adapter *sc;
2403 struct file *fp;
2404 struct socket *so;
2405 struct inpcb *inp;
2406 struct tcpcb *tp;
2407 struct toepcb *toep;
2408 cap_rights_t rights;
2409 u_int max_tx_pdu_len, num_ddp_tags;
2410 int error, ulp_submode;
2411
2412 if (!nvlist_exists_number(nvl, "fd") ||
2413 !nvlist_exists_number(nvl, "rxpda") ||
2414 !nvlist_exists_number(nvl, "txpda") ||
2415 !nvlist_exists_bool(nvl, "header_digests") ||
2416 !nvlist_exists_bool(nvl, "data_digests") ||
2417 !nvlist_exists_number(nvl, "maxr2t") ||
2418 !nvlist_exists_number(nvl, "maxh2cdata") ||
2419 !nvlist_exists_number(nvl, "max_icd"))
2420 return (NULL);
2421
2422 error = fget(curthread, nvlist_get_number(nvl, "fd"),
2423 cap_rights_init_one(&rights, CAP_SOCK_CLIENT), &fp);
2424 if (error != 0)
2425 return (NULL);
2426 if (fp->f_type != DTYPE_SOCKET) {
2427 fdrop(fp, curthread);
2428 return (NULL);
2429 }
2430 so = fp->f_data;
2431 if (so->so_type != SOCK_STREAM ||
2432 so->so_proto->pr_protocol != IPPROTO_TCP) {
2433 fdrop(fp, curthread);
2434 return (NULL);
2435 }
2436
2437 sc = find_offload_adapter(so);
2438 if (sc == NULL) {
2439 fdrop(fp, curthread);
2440 return (NULL);
2441 }
2442 nca = sc->nvme_ulp_softc;
2443
2444 /*
2445 * Controller: Require advertised MAXH2CDATA to be small
2446 * enough.
2447 */
2448 if (controller) {
2449 u_int max_rx_data;
2450
2451 max_rx_data = pdu_max_data_len(nvl, nca->max_receive_pdu,
2452 sizeof(struct nvme_tcp_h2c_data_hdr),
2453 nvlist_get_number(nvl, "rxpda"));
2454 if (nvlist_get_number(nvl, "maxh2cdata") > max_rx_data) {
2455 fdrop(fp, curthread);
2456 return (NULL);
2457 }
2458 }
2459
2460 /*
2461 * Host: Require the queue size to be small enough that all of
2462 * the command ids allocated by nvmf(4) will fit in the
2463 * unallocated range.
2464 *
2465 * XXX: Alternatively this driver could just queue commands
2466 * when an unallocated ID isn't available.
2467 */
2468 if (!controller) {
2469 u_int num_commands;
2470
2471 num_commands = nvlist_get_number(nvl, "qsize") - 1;
2472 if (nvlist_get_bool(nvl, "admin"))
2473 num_commands += 8; /* Max AER */
2474 if (num_commands > CHE_NUM_FL_TAGS) {
2475 fdrop(fp, curthread);
2476 return (NULL);
2477 }
2478 }
2479
2480 qp = malloc(sizeof(*qp), M_NVMF_CHE, M_WAITOK | M_ZERO);
2481 qp->txpda = nvlist_get_number(nvl, "txpda");
2482 qp->rxpda = nvlist_get_number(nvl, "rxpda");
2483 qp->header_digests = nvlist_get_bool(nvl, "header_digests");
2484 qp->data_digests = nvlist_get_bool(nvl, "data_digests");
2485 qp->maxr2t = nvlist_get_number(nvl, "maxr2t");
2486 if (controller)
2487 qp->maxh2cdata = nvlist_get_number(nvl, "maxh2cdata");
2488
2489 if (controller) {
2490 /* NB: maxr2t is 0's based. */
2491 qp->num_fl_ttags = MIN(CHE_NUM_FL_TAGS,
2492 nvlist_get_number(nvl, "qsize") *
2493 ((uint64_t)qp->maxr2t + 1));
2494 qp->open_fl_ttags = mallocarray(qp->num_fl_ttags,
2495 sizeof(*qp->open_fl_ttags), M_NVMF_CHE, M_WAITOK | M_ZERO);
2496 } else {
2497 qp->fl_cids = mallocarray(CHE_NUM_FL_TAGS,
2498 sizeof(*qp->fl_cids), M_NVMF_CHE, M_WAITOK | M_ZERO);
2499 qp->fl_cid_set = malloc(sizeof(*qp->fl_cid_set), M_NVMF_CHE,
2500 M_WAITOK);
2501 FL_CID_INIT(qp->fl_cid_set);
2502 mtx_init(&qp->fl_cid_lock, "nvmf/che fl cids", NULL, MTX_DEF);
2503 }
2504
2505 inp = sotoinpcb(so);
2506 INP_WLOCK(inp);
2507 tp = intotcpcb(inp);
2508 if (inp->inp_flags & INP_DROPPED) {
2509 INP_WUNLOCK(inp);
2510 free(qp->fl_cid_set, M_NVMF_CHE);
2511 free(qp->fl_cids, M_NVMF_CHE);
2512 free(qp->open_fl_ttags, M_NVMF_CHE);
2513 free(qp, M_NVMF_CHE);
2514 fdrop(fp, curthread);
2515 return (NULL);
2516 }
2517
2518 MPASS(tp->t_flags & TF_TOE);
2519 MPASS(tp->tod != NULL);
2520 MPASS(tp->t_toe != NULL);
2521 toep = tp->t_toe;
2522 MPASS(toep->vi->adapter == sc);
2523
2524 if (ulp_mode(toep) != ULP_MODE_NONE) {
2525 INP_WUNLOCK(inp);
2526 free(qp->fl_cid_set, M_NVMF_CHE);
2527 free(qp->fl_cids, M_NVMF_CHE);
2528 free(qp->open_fl_ttags, M_NVMF_CHE);
2529 free(qp, M_NVMF_CHE);
2530 fdrop(fp, curthread);
2531 return (NULL);
2532 }
2533
2534 /* Claim socket from file descriptor. */
2535 fp->f_ops = &badfileops;
2536 fp->f_data = NULL;
2537
2538 qp->so = so;
2539 qp->toep = toep;
2540 qp->nca = nca;
2541 refcount_init(&qp->refs, 1);
2542
2543 /* NB: C2H and H2C headers are the same size. */
2544 qp->max_rx_data = pdu_max_data_len(nvl, nca->max_receive_pdu,
2545 sizeof(struct nvme_tcp_c2h_data_hdr), qp->rxpda);
2546 qp->max_tx_data = pdu_max_data_len(nvl, nca->max_transmit_pdu,
2547 sizeof(struct nvme_tcp_c2h_data_hdr), qp->txpda);
2548 if (!controller) {
2549 qp->max_tx_data = min(qp->max_tx_data,
2550 nvlist_get_number(nvl, "maxh2cdata"));
2551 qp->max_icd = min(nvlist_get_number(nvl, "max_icd"),
2552 pdu_max_data_len(nvl, nca->max_transmit_pdu,
2553 sizeof(struct nvme_tcp_cmd), qp->txpda));
2554 } else {
2555 /*
2556 * IOCCSZ represents the size of a logical command
2557 * capsule including the 64 byte SQE and the
2558 * in-capsule data. Use pdu_max_data_len to compute
2559 * the maximum supported ICD length.
2560 */
2561 qp->max_ioccsz = rounddown(pdu_max_data_len(nvl,
2562 nca->max_receive_pdu, sizeof(struct nvme_tcp_cmd),
2563 qp->rxpda), 16) + sizeof(struct nvme_command);
2564 }
2565
2566 ulp_submode = 0;
2567 if (qp->header_digests)
2568 ulp_submode |= FW_NVMET_ULPSUBMODE_HCRC;
2569 if (qp->data_digests)
2570 ulp_submode |= FW_NVMET_ULPSUBMODE_DCRC;
2571 if (!controller)
2572 ulp_submode |= FW_NVMET_ULPSUBMODE_ING_DIR;
2573
2574 max_tx_pdu_len = sizeof(struct nvme_tcp_h2c_data_hdr);
2575 if (qp->header_digests)
2576 max_tx_pdu_len += sizeof(uint32_t);
2577 max_tx_pdu_len = roundup(max_tx_pdu_len, qp->txpda);
2578 max_tx_pdu_len += qp->max_tx_data;
2579 if (qp->data_digests)
2580 max_tx_pdu_len += sizeof(uint32_t);
2581
2582 /* TODO: ISO limits */
2583
2584 if (controller) {
2585 /* Use the SUCCESS flag if SQ flow control is disabled. */
2586 qp->send_success = !nvlist_get_bool(nvl, "sq_flow_control");
2587 }
2588
2589 toep->params.ulp_mode = ULP_MODE_NVMET;
2590 toep->ulpcb = qp;
2591
2592 send_txdataplen_max_flowc_wr(sc, toep,
2593 roundup(/* max_iso_pdus * */ max_tx_pdu_len, tp->t_maxseg));
2594 set_ulp_mode_nvme(toep, ulp_submode, qp->rxpda);
2595 INP_WUNLOCK(inp);
2596
2597 fdrop(fp, curthread);
2598
2599 error = nvmf_che_setsockopt(so, max_tx_pdu_len, nca->max_receive_pdu);
2600 if (error != 0) {
2601 free(qp->fl_cid_set, M_NVMF_CHE);
2602 free(qp->fl_cids, M_NVMF_CHE);
2603 free(qp->open_fl_ttags, M_NVMF_CHE);
2604 free(qp, M_NVMF_CHE);
2605 return (NULL);
2606 }
2607
2608 num_ddp_tags = ddp_tags_per_qp;
2609 if (num_ddp_tags > 0) {
2610 qp->tpt_offset = t4_stag_alloc(sc, num_ddp_tags);
2611 if (qp->tpt_offset != T4_STAG_UNSET) {
2612 #ifdef VERBOSE_TRACES
2613 CTR(KTR_CXGBE,
2614 "%s: tid %u using %u tags at offset 0x%x",
2615 __func__, toep->tid, num_ddp_tags, qp->tpt_offset);
2616 #endif
2617 qp->num_ddp_tags = num_ddp_tags;
2618 qp->open_ddp_tags = mallocarray(qp->num_ddp_tags,
2619 sizeof(*qp->open_ddp_tags), M_NVMF_CHE, M_WAITOK |
2620 M_ZERO);
2621
2622 t4_nvme_set_tcb_field(toep, W_TCB_TPT_OFFSET,
2623 M_TCB_TPT_OFFSET, V_TCB_TPT_OFFSET(qp->tpt_offset));
2624 }
2625 }
2626
2627 TAILQ_INIT(&qp->rx_buffers.head);
2628 TAILQ_INIT(&qp->tx_buffers.head);
2629 mtx_init(&qp->rx_buffers.lock, "nvmf/che rx buffers", NULL, MTX_DEF);
2630 mtx_init(&qp->tx_buffers.lock, "nvmf/che tx buffers", NULL, MTX_DEF);
2631
2632 cv_init(&qp->rx_cv, "-");
2633 cv_init(&qp->tx_cv, "-");
2634 mbufq_init(&qp->rx_data, 0);
2635 mbufq_init(&qp->rx_pdus, 0);
2636 STAILQ_INIT(&qp->tx_capsules);
2637
2638 /* Register socket upcall for receive to handle remote FIN. */
2639 SOCKBUF_LOCK(&so->so_rcv);
2640 soupcall_set(so, SO_RCV, nvmf_che_soupcall_receive, qp);
2641 SOCKBUF_UNLOCK(&so->so_rcv);
2642
2643 /* Spin up kthreads. */
2644 error = kthread_add(nvmf_che_receive, qp, NULL, &qp->rx_thread, 0, 0,
2645 "nvmef che rx");
2646 if (error != 0) {
2647 che_free_qpair(&qp->qp);
2648 return (NULL);
2649 }
2650 error = kthread_add(nvmf_che_send, qp, NULL, &qp->tx_thread, 0, 0,
2651 "nvmef che tx");
2652 if (error != 0) {
2653 che_free_qpair(&qp->qp);
2654 return (NULL);
2655 }
2656
2657 return (&qp->qp);
2658 }
2659
2660 static void
2661 che_release_qpair(struct nvmf_che_qpair *qp)
2662 {
2663 if (refcount_release(&qp->refs))
2664 free(qp, M_NVMF_CHE);
2665 }
2666
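/*
 * Tear down a queue pair: stop the kthreads, abort outstanding command
 * buffers and capsules, release DDP tags, and wait for the connection
 * to fully close before dropping the final reference.
 */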
2667 static void
2668 che_free_qpair(struct nvmf_qpair *nq)
2669 {
2670 struct nvmf_che_qpair *qp = CQP(nq);
2671 struct nvmf_che_command_buffer *ncb, *cb;
2672 struct nvmf_che_capsule *ncc, *cc;
2673 struct socket *so = qp->so;
2674 struct toepcb *toep = qp->toep;
2675 struct inpcb *inp = sotoinpcb(so);
2676
2677 /* Shut down kthreads. */
2678 SOCKBUF_LOCK(&so->so_snd);
2679 qp->tx_shutdown = true;
2680 if (qp->tx_thread != NULL) {
2681 cv_signal(&qp->tx_cv);
2682 mtx_sleep(qp->tx_thread, SOCKBUF_MTX(&so->so_snd), 0,
2683 "nvchetx", 0);
2684 }
2685 SOCKBUF_UNLOCK(&so->so_snd);
2686
2687 SOCKBUF_LOCK(&so->so_rcv);
2688 qp->rx_shutdown = true;
2689 if (qp->rx_thread != NULL) {
2690 cv_signal(&qp->rx_cv);
2691 mtx_sleep(qp->rx_thread, SOCKBUF_MTX(&so->so_rcv), 0,
2692 "nvcherx", 0);
2693 }
2694 soupcall_clear(so, SO_RCV);
2695 SOCKBUF_UNLOCK(&so->so_rcv);
2696 mbufq_drain(&qp->rx_data);
2697 mbufq_drain(&qp->rx_pdus);
2698
2699 STAILQ_FOREACH_SAFE(cc, &qp->tx_capsules, link, ncc) {
2700 nvmf_abort_capsule_data(&cc->nc, ECONNABORTED);
2701 che_release_capsule(cc);
2702 }
2703
2704 cv_destroy(&qp->tx_cv);
2705 cv_destroy(&qp->rx_cv);
2706
2707 if (qp->open_fl_ttags != NULL) {
2708 for (u_int i = 0; i < qp->num_fl_ttags; i++) {
2709 cb = qp->open_fl_ttags[i];
2710 if (cb != NULL) {
2711 cb->cc->active_r2ts--;
2712 cb->error = ECONNABORTED;
2713 che_release_command_buffer(cb);
2714 }
2715 }
2716 free(qp->open_fl_ttags, M_NVMF_CHE);
2717 }
2718 if (qp->num_ddp_tags != 0) {
2719 for (u_int i = 0; i < qp->num_ddp_tags; i++) {
2720 cb = qp->open_ddp_tags[i];
2721 if (cb != NULL) {
2722 if (cb->cc != NULL)
2723 cb->cc->active_r2ts--;
2724 cb->error = ECONNABORTED;
2725 mtx_lock(&qp->rx_buffers.lock);
2726 che_free_ddp_tag(qp, cb, cb->ttag);
2727 mtx_unlock(&qp->rx_buffers.lock);
2728 che_release_command_buffer(cb);
2729 }
2730 }
2731 free(qp->open_ddp_tags, M_NVMF_CHE);
2732 }
2733
2734 mtx_lock(&qp->rx_buffers.lock);
2735 TAILQ_FOREACH_SAFE(cb, &qp->rx_buffers.head, link, ncb) {
2736 che_remove_command_buffer(&qp->rx_buffers, cb);
2737 mtx_unlock(&qp->rx_buffers.lock);
2738 #ifdef INVARIANTS
2739 if (cb->cc != NULL)
2740 cb->cc->pending_r2ts--;
2741 #endif
2742 cb->error = ECONNABORTED;
2743 che_release_command_buffer(cb);
2744 mtx_lock(&qp->rx_buffers.lock);
2745 }
2746 mtx_destroy(&qp->rx_buffers.lock);
2747
2748 mtx_lock(&qp->tx_buffers.lock);
2749 TAILQ_FOREACH_SAFE(cb, &qp->tx_buffers.head, link, ncb) {
2750 che_remove_command_buffer(&qp->tx_buffers, cb);
2751 mtx_unlock(&qp->tx_buffers.lock);
2752 cb->error = ECONNABORTED;
2753 che_release_command_buffer(cb);
2754 mtx_lock(&qp->tx_buffers.lock);
2755 }
2756 mtx_destroy(&qp->tx_buffers.lock);
2757
2758 if (qp->num_ddp_tags != 0)
2759 t4_stag_free(qp->nca->sc, qp->tpt_offset, qp->num_ddp_tags);
2760
2761 if (!qp->qp.nq_controller) {
2762 free(qp->fl_cids, M_NVMF_CHE);
2763 free(qp->fl_cid_set, M_NVMF_CHE);
2764 mtx_destroy(&qp->fl_cid_lock);
2765 }
2766
2767 INP_WLOCK(inp);
2768 toep->ulpcb = NULL;
2769 mbufq_drain(&toep->ulp_pduq);
2770
2771 /*
2772 * Grab a reference to use when waiting for the final CPL to
2773 * be received. If toep->inp is NULL, then
2774 * final_cpl_received() has already been called (e.g. due to
2775 * the peer sending a RST).
2776 */
2777 if (toep->inp != NULL) {
2778 toep = hold_toepcb(toep);
2779 toep->flags |= TPF_WAITING_FOR_FINAL;
2780 } else
2781 toep = NULL;
2782 INP_WUNLOCK(inp);
2783
2784 soclose(so);
2785
2786 /*
2787 * Wait for the socket to fully close. This ensures any
2788 * pending received data has been received (and in particular,
2789 * any data that would be received by DDP has been handled).
2790 */
2791 if (toep != NULL) {
2792 struct mtx *lock = mtx_pool_find(mtxpool_sleep, toep);
2793
2794 mtx_lock(lock);
2795 while ((toep->flags & TPF_WAITING_FOR_FINAL) != 0)
2796 mtx_sleep(toep, lock, PSOCK, "conclo2", 0);
2797 mtx_unlock(lock);
2798 free_toepcb(toep);
2799 }
2800
2801 che_release_qpair(qp);
2802 }
2803
2804 static uint32_t
2805 che_max_ioccsz(struct nvmf_qpair *nq)
2806 {
2807 struct nvmf_che_qpair *qp = CQP(nq);
2808
2809 /*
2810 * Limit the command capsule size so that with maximum ICD it
2811 * fits within the limit of the largest PDU the adapter can
2812 * receive.
2813 */
2814 return (qp->max_ioccsz);
2815 }
2816
2817 static uint64_t
2818 che_max_xfer_size(struct nvmf_qpair *nq)
2819 {
2820 struct nvmf_che_qpair *qp = CQP(nq);
2821
2822 /*
2823 * Limit host transfers to the size of the data payload in the
2824 * largest PDU the adapter can receive.
2825 */
2826 return (qp->max_rx_data);
2827 }
2828
2829 static struct nvmf_capsule *
2830 che_allocate_capsule(struct nvmf_qpair *nq, int how)
2831 {
2832 struct nvmf_che_qpair *qp = CQP(nq);
2833 struct nvmf_che_capsule *cc;
2834
2835 cc = malloc(sizeof(*cc), M_NVMF_CHE, how | M_ZERO);
2836 if (cc == NULL)
2837 return (NULL);
2838 refcount_init(&cc->refs, 1);
2839 refcount_acquire(&qp->refs);
2840 return (&cc->nc);
2841 }
2842
2843 static void
2844 che_release_capsule(struct nvmf_che_capsule *cc)
2845 {
2846 struct nvmf_che_qpair *qp = CQP(cc->nc.nc_qpair);
2847
2848 if (!refcount_release(&cc->refs))
2849 return;
2850
2851 MPASS(cc->active_r2ts == 0);
2852 MPASS(cc->pending_r2ts == 0);
2853
2854 nvmf_che_free_pdu(&cc->rx_pdu);
2855 free(cc, M_NVMF_CHE);
2856 che_release_qpair(qp);
2857 }
2858
2859 static void
2860 che_free_capsule(struct nvmf_capsule *nc)
2861 {
2862 che_release_capsule(CCAP(nc));
2863 }
2864
2865 static int
2866 che_transmit_capsule(struct nvmf_capsule *nc)
2867 {
2868 struct nvmf_che_qpair *qp = CQP(nc->nc_qpair);
2869 struct nvmf_che_capsule *cc = CCAP(nc);
2870 struct socket *so = qp->so;
2871
2872 refcount_acquire(&cc->refs);
2873 SOCKBUF_LOCK(&so->so_snd);
2874 STAILQ_INSERT_TAIL(&qp->tx_capsules, cc, link);
2875 cv_signal(&qp->tx_cv);
2876 SOCKBUF_UNLOCK(&so->so_snd);
2877 return (0);
2878 }
2879
2880 static uint8_t
2881 che_validate_command_capsule(struct nvmf_capsule *nc)
2882 {
2883 struct nvmf_che_capsule *cc = CCAP(nc);
2884 struct nvme_sgl_descriptor *sgl;
2885
2886 KASSERT(cc->rx_pdu.hdr != NULL, ("capsule wasn't received"));
2887
2888 sgl = &nc->nc_sqe.sgl;
2889 switch (sgl->type) {
2890 case NVME_SGL_TYPE_ICD:
2891 if (cc->rx_pdu.data_len != le32toh(sgl->length)) {
2892 printf("NVMe/TCP: Command Capsule with mismatched ICD length\n");
2893 return (NVME_SC_DATA_SGL_LENGTH_INVALID);
2894 }
2895 break;
2896 case NVME_SGL_TYPE_COMMAND_BUFFER:
2897 if (cc->rx_pdu.data_len != 0) {
2898 printf("NVMe/TCP: Command Buffer SGL with ICD\n");
2899 return (NVME_SC_INVALID_FIELD);
2900 }
2901 break;
2902 default:
2903 printf("NVMe/TCP: Invalid SGL type in Command Capsule\n");
2904 return (NVME_SC_SGL_DESCRIPTOR_TYPE_INVALID);
2905 }
2906
2907 if (sgl->address != 0) {
2908 printf("NVMe/TCP: Invalid SGL offset in Command Capsule\n");
2909 return (NVME_SC_SGL_OFFSET_INVALID);
2910 }
2911
2912 return (NVME_SC_SUCCESS);
2913 }
2914
2915 static size_t
2916 che_capsule_data_len(const struct nvmf_capsule *nc)
2917 {
2918 MPASS(nc->nc_qe_len == sizeof(struct nvme_command));
2919 return (le32toh(nc->nc_sqe.sgl.length));
2920 }
2921
2922 static void
2923 che_receive_r2t_data(struct nvmf_capsule *nc, uint32_t data_offset,
2924 struct nvmf_io_request *io)
2925 {
2926 struct nvmf_che_qpair *qp = CQP(nc->nc_qpair);
2927 struct nvmf_che_capsule *cc = CCAP(nc);
2928 struct nvmf_che_command_buffer *cb;
2929
2930 cb = che_alloc_command_buffer(qp, io, data_offset, io->io_len,
2931 nc->nc_sqe.cid);
2932
2933 cb->cc = cc;
2934 refcount_acquire(&cc->refs);
2935
2936 /*
2937 * If this command has too many active R2Ts or there are no
2938 * available transfer tags, queue the request for later.
2939 *
2940 * NB: maxr2t is 0's based.
2941 */
2942 mtx_lock(&qp->rx_buffers.lock);
2943 if (cc->active_r2ts > qp->maxr2t ||
2944 !nvmf_che_allocate_ttag(qp, cb)) {
2945 #ifdef INVARIANTS
2946 cc->pending_r2ts++;
2947 #endif
2948 TAILQ_INSERT_TAIL(&qp->rx_buffers.head, cb, link);
2949 mtx_unlock(&qp->rx_buffers.lock);
2950 return;
2951 }
2952 mtx_unlock(&qp->rx_buffers.lock);
2953
2954 che_send_r2t(qp, nc->nc_sqe.cid, cb->ttag, data_offset, io->io_len);
2955 }
2956
2957 static void
2958 che_receive_icd_data(struct nvmf_capsule *nc, uint32_t data_offset,
2959 struct nvmf_io_request *io)
2960 {
2961 struct nvmf_che_capsule *cc = CCAP(nc);
2962
2963 /*
2964 * The header is in rx_pdu.m, the padding is discarded, and
2965 * the data starts at rx_pdu.m->m_next.
2966 */
2967 mbuf_copyto_io(cc->rx_pdu.m->m_next, data_offset, io->io_len, io, 0);
2968 nvmf_complete_io_request(io, io->io_len, 0);
2969 }
2970
2971 static int
2972 che_receive_controller_data(struct nvmf_capsule *nc, uint32_t data_offset,
2973 struct nvmf_io_request *io)
2974 {
2975 struct nvme_sgl_descriptor *sgl;
2976 size_t data_len;
2977
2978 if (nc->nc_qe_len != sizeof(struct nvme_command) ||
2979 !nc->nc_qpair->nq_controller)
2980 return (EINVAL);
2981
2982 sgl = &nc->nc_sqe.sgl;
2983 data_len = le32toh(sgl->length);
2984 if (data_offset + io->io_len > data_len)
2985 return (EFBIG);
2986
2987 if (sgl->type == NVME_SGL_TYPE_ICD)
2988 che_receive_icd_data(nc, data_offset, io);
2989 else
2990 che_receive_r2t_data(nc, data_offset, io);
2991 return (0);
2992 }
2993
2994 /* NB: cid is little-endian already. */
2995 static void
2996 che_send_c2h_pdu(struct nvmf_che_qpair *qp, uint16_t cid, uint32_t data_offset,
2997 struct mbuf *m, size_t len, bool last_pdu, bool success)
2998 {
2999 struct nvme_tcp_c2h_data_hdr c2h;
3000 struct mbuf *top;
3001
3002 memset(&c2h, 0, sizeof(c2h));
3003 c2h.common.pdu_type = NVME_TCP_PDU_TYPE_C2H_DATA;
3004 if (last_pdu)
3005 c2h.common.flags |= NVME_TCP_C2H_DATA_FLAGS_LAST_PDU;
3006 if (success)
3007 c2h.common.flags |= NVME_TCP_C2H_DATA_FLAGS_SUCCESS;
3008 c2h.cccid = cid;
3009 c2h.datao = htole32(data_offset);
3010 c2h.datal = htole32(len);
3011
3012 top = nvmf_che_construct_pdu(qp, &c2h, sizeof(c2h), m, len);
3013 nvmf_che_write_pdu(qp, top);
3014 }
3015
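/*
 * Send controller data back to the host as one or more C2H_DATA PDUs,
 * splitting the mbuf chain so that no single PDU exceeds the transmit
 * data limit.
 */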
3016 static u_int
3017 che_send_controller_data(struct nvmf_capsule *nc, uint32_t data_offset,
3018 struct mbuf *m, size_t len)
3019 {
3020 struct nvmf_che_qpair *qp = CQP(nc->nc_qpair);
3021 struct nvme_sgl_descriptor *sgl;
3022 uint32_t data_len;
3023 bool last_pdu, last_xfer;
3024
3025 if (nc->nc_qe_len != sizeof(struct nvme_command) ||
3026 !qp->qp.nq_controller) {
3027 m_freem(m);
3028 return (NVME_SC_INVALID_FIELD);
3029 }
3030
3031 sgl = &nc->nc_sqe.sgl;
3032 data_len = le32toh(sgl->length);
3033 if (data_offset + len > data_len) {
3034 m_freem(m);
3035 return (NVME_SC_INVALID_FIELD);
3036 }
3037 last_xfer = (data_offset + len == data_len);
3038
3039 if (sgl->type != NVME_SGL_TYPE_COMMAND_BUFFER) {
3040 m_freem(m);
3041 return (NVME_SC_INVALID_FIELD);
3042 }
3043
3044 KASSERT(data_offset == CCAP(nc)->tx_data_offset,
3045 ("%s: starting data_offset %u doesn't match end of previous xfer %u",
3046 __func__, data_offset, CCAP(nc)->tx_data_offset));
3047
3048 /* Queue one or more C2H_DATA PDUs containing the data from 'm'. */
3049 while (m != NULL) {
3050 struct mbuf *n;
3051 uint32_t todo;
3052
3053 if (m->m_len > qp->max_tx_data) {
3054 n = m_split(m, qp->max_tx_data, M_WAITOK);
3055 todo = m->m_len;
3056 } else {
3057 struct mbuf *p;
3058
3059 todo = m->m_len;
3060 p = m;
3061 n = p->m_next;
3062 while (n != NULL) {
3063 if (todo + n->m_len > qp->max_tx_data) {
3064 p->m_next = NULL;
3065 break;
3066 }
3067 todo += n->m_len;
3068 p = n;
3069 n = p->m_next;
3070 }
3071 MPASS(m_length(m, NULL) == todo);
3072 }
3073
3074 last_pdu = (n == NULL && last_xfer);
3075 che_send_c2h_pdu(qp, nc->nc_sqe.cid, data_offset, m, todo,
3076 last_pdu, last_pdu && qp->send_success);
3077
3078 data_offset += todo;
3079 data_len -= todo;
3080 m = n;
3081 }
3082 MPASS(data_len == 0);
3083
3084 #ifdef INVARIANTS
3085 CCAP(nc)->tx_data_offset = data_offset;
3086 #endif
3087 if (!last_xfer)
3088 return (NVMF_MORE);
3089 else if (qp->send_success)
3090 return (NVMF_SUCCESS_SENT);
3091 else
3092 return (NVME_SC_SUCCESS);
3093 }
3094
3095 struct nvmf_transport_ops che_ops = {
3096 .allocate_qpair = che_allocate_qpair,
3097 .free_qpair = che_free_qpair,
3098 .max_ioccsz = che_max_ioccsz,
3099 .max_xfer_size = che_max_xfer_size,
3100 .allocate_capsule = che_allocate_capsule,
3101 .free_capsule = che_free_capsule,
3102 .transmit_capsule = che_transmit_capsule,
3103 .validate_command_capsule = che_validate_command_capsule,
3104 .capsule_data_len = che_capsule_data_len,
3105 .receive_controller_data = che_receive_controller_data,
3106 .send_controller_data = che_send_controller_data,
3107 .trtype = NVMF_TRTYPE_TCP,
3108 .priority = 10,
3109 };
3110
3111 NVMF_TRANSPORT(che, che_ops);
3112
3113 static void
3114 read_pdu_limits(struct adapter *sc, u_int *max_tx_pdu_len,
3115 uint32_t *max_rx_pdu_len)
3116 {
3117 uint32_t tx_len, rx_len, r, v;
3118
3119 /* Copied from cxgbei, but not sure if this is correct. */
3120 rx_len = t4_read_reg(sc, A_TP_PMM_RX_PAGE_SIZE);
3121 tx_len = t4_read_reg(sc, A_TP_PMM_TX_PAGE_SIZE);
3122
3123 r = t4_read_reg(sc, A_TP_PARA_REG2);
3124 rx_len = min(rx_len, G_MAXRXDATA(r));
3125 tx_len = min(tx_len, G_MAXRXDATA(r));
3126
3127 r = t4_read_reg(sc, A_TP_PARA_REG7);
3128 v = min(G_PMMAXXFERLEN0(r), G_PMMAXXFERLEN1(r));
3129 rx_len = min(rx_len, v);
3130 tx_len = min(tx_len, v);
3131
3132 /* Cannot be larger than 32KB - 256. */
3133 rx_len = min(rx_len, 32512);
3134 tx_len = min(tx_len, 32512);
3135
3136 *max_tx_pdu_len = tx_len;
3137 *max_rx_pdu_len = rx_len;
3138 }
3139
3140 static int
3141 nvmf_che_init(struct adapter *sc, struct nvmf_che_adapter *nca)
3142 {
3143 struct sysctl_oid *oid;
3144 struct sysctl_oid_list *children;
3145 uint32_t val;
3146
3147 read_pdu_limits(sc, &nca->max_transmit_pdu, &nca->max_receive_pdu);
3148 if (nca->max_transmit_pdu > che_max_transmit_pdu)
3149 nca->max_transmit_pdu = che_max_transmit_pdu;
3150 if (nca->max_receive_pdu > che_max_receive_pdu)
3151 nca->max_receive_pdu = che_max_receive_pdu;
3152 val = t4_read_reg(sc, A_SGE_CONTROL2);
3153 nca->nvmt_data_iqe = (val & F_RXCPLMODE_NVMT) != 0;
3154
3155 sysctl_ctx_init(&nca->ctx);
3156 oid = device_get_sysctl_tree(sc->dev); /* dev.che.X */
3157 children = SYSCTL_CHILDREN(oid);
3158
3159 oid = SYSCTL_ADD_NODE(&nca->ctx, children, OID_AUTO, "nvme",
3160 CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "NVMe ULP settings");
3161 children = SYSCTL_CHILDREN(oid);
3162
3163 nca->ddp_threshold = 8192;
3164 SYSCTL_ADD_UINT(&nca->ctx, children, OID_AUTO, "ddp_threshold",
3165 CTLFLAG_RW, &nca->ddp_threshold, 0, "Rx zero copy threshold");
3166
3167 SYSCTL_ADD_UINT(&nca->ctx, children, OID_AUTO, "max_transmit_pdu",
3168 CTLFLAG_RW, &nca->max_transmit_pdu, 0,
3169 "Maximum size of a transmitted PDU");
3170
3171 SYSCTL_ADD_UINT(&nca->ctx, children, OID_AUTO, "max_receive_pdu",
3172 CTLFLAG_RW, &nca->max_receive_pdu, 0,
3173 "Maximum size of a received PDU");
3174
3175 return (0);
3176 }
3177
3178 static void
3179 nvmf_che_destroy(struct nvmf_che_adapter *nca)
3180 {
3181 sysctl_ctx_free(&nca->ctx);
3182 free(nca, M_CXGBE);
3183 }
3184
3185 static int
3186 nvmf_che_activate(struct adapter *sc)
3187 {
3188 struct nvmf_che_adapter *nca;
3189 int rc;
3190
3191 ASSERT_SYNCHRONIZED_OP(sc);
3192
3193 if (uld_active(sc, ULD_NVME)) {
3194 KASSERT(0, ("%s: NVMe offload already enabled on adapter %p",
3195 __func__, sc));
3196 return (0);
3197 }
3198
3199 if ((sc->nvmecaps & FW_CAPS_CONFIG_NVME_TCP) == 0) {
3200 device_printf(sc->dev,
3201 "not NVMe offload capable, or capability disabled\n");
3202 return (ENOSYS);
3203 }
3204
3205 /* per-adapter softc for NVMe */
3206 nca = malloc(sizeof(*nca), M_CXGBE, M_ZERO | M_WAITOK);
3207 nca->sc = sc;
3208
3209 rc = nvmf_che_init(sc, nca);
3210 if (rc != 0) {
3211 free(nca, M_CXGBE);
3212 return (rc);
3213 }
3214
3215 sc->nvme_ulp_softc = nca;
3216
3217 return (0);
3218 }
3219
3220 static int
3221 nvmf_che_deactivate(struct adapter *sc)
3222 {
3223 struct nvmf_che_adapter *nca = sc->nvme_ulp_softc;
3224
3225 ASSERT_SYNCHRONIZED_OP(sc);
3226
3227 if (nca != NULL) {
3228 nvmf_che_destroy(nca);
3229 sc->nvme_ulp_softc = NULL;
3230 }
3231
3232 return (0);
3233 }
3234
3235 static void
3236 nvmf_che_activate_all(struct adapter *sc, void *arg __unused)
3237 {
3238 if (begin_synchronized_op(sc, NULL, SLEEP_OK | INTR_OK, "t7nvact") != 0)
3239 return;
3240
3241 /* Activate NVMe if any port on this adapter has IFCAP_TOE enabled. */
3242 if (sc->offload_map && !uld_active(sc, ULD_NVME))
3243 (void) t4_activate_uld(sc, ULD_NVME);
3244
3245 end_synchronized_op(sc, 0);
3246 }
3247
3248 static void
3249 nvmf_che_deactivate_all(struct adapter *sc, void *arg __unused)
3250 {
3251 if (begin_synchronized_op(sc, NULL, SLEEP_OK | INTR_OK, "t7nvdea") != 0)
3252 return;
3253
3254 if (uld_active(sc, ULD_NVME))
3255 (void) t4_deactivate_uld(sc, ULD_NVME);
3256
3257 end_synchronized_op(sc, 0);
3258 }
3259
3260 static struct uld_info nvmf_che_uld_info = {
3261 .uld_activate = nvmf_che_activate,
3262 .uld_deactivate = nvmf_che_deactivate,
3263 };
3264
3265 static int
3266 nvmf_che_mod_load(void)
3267 {
3268 int rc;
3269
3270 t4_register_cpl_handler(CPL_NVMT_CMP, do_nvmt_cmp);
3271 t4_register_cpl_handler(CPL_NVMT_DATA, do_nvmt_data);
3272
3273 rc = t4_register_uld(&nvmf_che_uld_info, ULD_NVME);
3274 if (rc != 0)
3275 return (rc);
3276
3277 t4_iterate(nvmf_che_activate_all, NULL);
3278
3279 return (rc);
3280 }
3281
3282 static int
3283 nvmf_che_mod_unload(void)
3284 {
3285 t4_iterate(nvmf_che_deactivate_all, NULL);
3286
3287 if (t4_unregister_uld(&nvmf_che_uld_info, ULD_NVME) == EBUSY)
3288 return (EBUSY);
3289
3290 t4_register_cpl_handler(CPL_NVMT_CMP, NULL);
3291 t4_register_cpl_handler(CPL_NVMT_DATA, NULL);
3292
3293 return (0);
3294 }
3295 #endif
3296
3297 static int
3298 nvmf_che_modevent(module_t mod, int cmd, void *arg)
3299 {
3300 int rc;
3301
3302 #ifdef TCP_OFFLOAD
3303 switch (cmd) {
3304 case MOD_LOAD:
3305 rc = nvmf_che_mod_load();
3306 break;
3307 case MOD_UNLOAD:
3308 rc = nvmf_che_mod_unload();
3309 break;
3310 default:
3311 rc = EOPNOTSUPP;
3312 break;
3313 }
3314 #else
3315 printf("nvmf_che: compiled without TCP_OFFLOAD support.\n");
3316 rc = EOPNOTSUPP;
3317 #endif
3318
3319 return (rc);
3320 }
3321
3322 static moduledata_t nvmf_che_mod = {
3323 "nvmf_che",
3324 nvmf_che_modevent,
3325 NULL,
3326 };
3327
3328 MODULE_VERSION(nvmf_che, 1);
3329 DECLARE_MODULE(nvmf_che, nvmf_che_mod, SI_SUB_EXEC, SI_ORDER_ANY);
3330 MODULE_DEPEND(nvmf_che, t4_tom, 1, 1, 1);
3331 MODULE_DEPEND(nvmf_che, cxgbe, 1, 1, 1);
3332