1 /*-
2 * SPDX-License-Identifier: BSD-2-Clause
3 *
4 * Copyright (c) 2023 Chelsio Communications, Inc.
5 * Written by: John Baldwin <jhb@FreeBSD.org>
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 * 1. Redistributions of source code must retain the above copyright
11 * notice, this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright
13 * notice, this list of conditions and the following disclaimer in the
14 * documentation and/or other materials provided with the distribution.
15 *
16 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
20 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26 * SUCH DAMAGE.
27 */
28
29 #include "opt_inet.h"
30
31 #include <sys/param.h>
32 #include <sys/libkern.h>
33 #include <sys/kernel.h>
34 #include <sys/module.h>
35
36 #ifdef TCP_OFFLOAD
37 #include <sys/bitset.h>
38 #include <sys/capsicum.h>
39 #include <sys/file.h>
40 #include <sys/kthread.h>
41 #include <sys/ktr.h>
42 #include <sys/malloc.h>
43 #include <sys/mbuf.h>
44 #include <sys/nv.h>
45 #include <sys/protosw.h>
46 #include <sys/socket.h>
47 #include <sys/socketvar.h>
48 #include <netinet/in.h>
49 #include <netinet/in_pcb.h>
50 #include <netinet/tcp_var.h>
51 #include <netinet/toecore.h>
52
53 #include <dev/nvmf/nvmf.h>
54 #include <dev/nvmf/nvmf_proto.h>
55 #include <dev/nvmf/nvmf_tcp.h>
56 #include <dev/nvmf/nvmf_transport.h>
57 #include <dev/nvmf/nvmf_transport_internal.h>
58
59 #include <vm/pmap.h>
60 #include <vm/vm_page.h>
61
62 #include "common/common.h"
63 #include "common/t4_regs.h"
64 #include "common/t4_tcb.h"
65 #include "tom/t4_tom.h"
66
/*
 * Status code values in CPL_NVMT_CMP.
 *
 * nvmf_che_validate_pdu() masks the saved status byte with
 * CMP_STATUS_ERROR_MASK and dispatches on the resulting error code.
 * CMP_STATUS_DDP is the one bit of the byte that lies outside that
 * mask.
 */
#define CMP_STATUS_ERROR_MASK 0x7f
/* Error codes (values of status & CMP_STATUS_ERROR_MASK). */
#define CMP_STATUS_NO_ERROR 0
#define CMP_STATUS_HEADER_DIGEST 1
#define CMP_STATUS_DIRECTION_MISMATCH 2
#define CMP_STATUS_DIGEST_FLAG_MISMATCH 3
#define CMP_STATUS_SUCCESS_NOT_LAST 4
#define CMP_STATUS_BAD_DATA_LENGTH 5
#define CMP_STATUS_USER_MODE_UNALLOCATED 6
#define CMP_STATUS_RQT_LIMIT 7
#define CMP_STATUS_RQT_WRAP 8
#define CMP_STATUS_RQT_BOUND 9
#define CMP_STATUS_TPT_LIMIT 16
#define CMP_STATUS_TPT_INVALID 17
#define CMP_STATUS_TPT_COLOUR_MISMATCH 18
#define CMP_STATUS_TPT_MISC 19
#define CMP_STATUS_TPT_WRAP 20
#define CMP_STATUS_TPT_BOUND 21
#define CMP_STATUS_TPT_LAST_PDU_UNALIGNED 22
#define CMP_STATUS_PBL_LIMIT 24
#define CMP_STATUS_DATA_DIGEST 25
/* Bit outside the error-code mask. */
#define CMP_STATUS_DDP 0x80
89
/*
 * Tag encoding.
 *
 * Transfer tags and CIDs with the MSB (CHE_FL_TAG_MASK) set are
 * "unallocated" tags that pass data through to the freelist without
 * using DDP.  CHE_RAW_FL_TAG() strips that bit again.
 *
 * A DDP tag is built from an STAG index shifted left by four bits and
 * a color held in the low four bits.  CHE_DDP_NO_TAG is the value the
 * DDP tag allocators return when no tag could be set up.
 */
#define	CHE_FL_TAG_MASK		0x8000
#define	CHE_MAX_FL_TAG		0x7fff
#define	CHE_NUM_FL_TAGS		(CHE_MAX_FL_TAG + 1)

#define	CHE_DDP_MAX_COLOR	0xf
#define	CHE_DDP_NO_TAG		0xffff

/* True (1) if the tag has the freelist bit set, otherwise 0. */
#define	CHE_TAG_IS_FL(ttag)	(((ttag) & CHE_FL_TAG_MASK) != 0)
/* The tag with the freelist bit cleared. */
#define	CHE_RAW_FL_TAG(ttag)	((ttag) & ~CHE_FL_TAG_MASK)

/* Combine an STAG index and a color into a DDP tag, and split it again. */
#define	CHE_DDP_TAG(stag_idx, color)	(((stag_idx) << 4) | (color))
#define	CHE_STAG_COLOR(stag)	((stag) & CHE_DDP_MAX_COLOR)
#define	CHE_STAG_IDX(stag)	((stag) >> 4)
106
/*
 * A bitmap of non-DDP CIDs in use on the host.  Since there is no
 * _BIT_FFC (find first clear), the bitset is inverted so that a clear
 * bit indicates an in-use CID and a set bit indicates a free one.
 *
 *   FL_CID_INIT        mark every CID free (fill the set)
 *   FL_CID_BUSY        mark CID n in use (clear its bit)
 *   FL_CID_ISACTIVE    non-zero if CID n is in use (its bit is clear)
 *   FL_CID_FREE        mark CID n free (set its bit)
 *   FL_CID_FINDFREE_AT find-first-set search beginning at 'start'
 *
 * FL_CID_ISACTIVE is wrapped in parentheses so that the negation
 * cannot combine unexpectedly with operators at the point of use.
 */
BITSET_DEFINE(fl_cid_set, CHE_NUM_FL_TAGS);
#define	FL_CID_INIT(p)		__BIT_FILL(CHE_NUM_FL_TAGS, p)
#define	FL_CID_BUSY(n, p)	__BIT_CLR(CHE_NUM_FL_TAGS, n, p)
#define	FL_CID_ISACTIVE(n, p)	(!__BIT_ISSET(CHE_NUM_FL_TAGS, n, p))
#define	FL_CID_FREE(n, p)	__BIT_SET(CHE_NUM_FL_TAGS, n, p)
#define	FL_CID_FINDFREE_AT(p, start)	__BIT_FFS_AT(CHE_NUM_FL_TAGS, p, start)
118
/*
 * The TCP sequence number of a CPL_NVMT_DATA or CPL_NVMT_CMP mbuf is
 * saved in this packet header field while the mbuf is queued on
 * qp->rx_data or qp->rx_pdus.
 */
#define nvmf_tcp_seq PH_loc.thirtytwo[0]
124
/*
 * The CPL status of a CPL_NVMT_CMP mbuf is saved in this packet header
 * field while the mbuf is queued on qp->rx_pdus.  It is read back as
 * m->m_pkthdr.nvmf_cpl_status by nvmf_che_validate_pdu().
 */
#define nvmf_cpl_status PH_loc.eight[4]
130
131 struct nvmf_che_capsule;
132 struct nvmf_che_qpair;
133
/*
 * State kept for each adapter used by this transport; queue pairs
 * reach it through their 'nca' pointer.
 */
struct nvmf_che_adapter {
	struct adapter *sc;

	/* I/O requests shorter than this never use DDP (che_ddp_io_check). */
	u_int ddp_threshold;
	/*
	 * NOTE(review): presumably adapter-specific values derived from
	 * the kern.nvmf.che.max_*_pdu tunables -- confirm where set.
	 */
	u_int max_transmit_pdu;
	u_int max_receive_pdu;
	bool nvmt_data_iqe;

	struct sysctl_ctx_list ctx; /* from uld_activate to deactivate */
};
144
/*
 * Describes one data transfer for a command.  Command buffers are
 * reference counted; when the last reference is dropped the I/O
 * request is completed with data_xfered and error (see
 * che_free_command_buffer).
 */
struct nvmf_che_command_buffer {
	struct nvmf_che_qpair *qp;

	struct nvmf_io_request io;	/* Copy of the caller's request */
	size_t data_len;
	size_t data_xfered;		/* Reported on completion */
	uint32_t data_offset;

	u_int refs;			/* refcount(9) counter, starts at 1 */
	int error;			/* Reported on completion */

	bool ddp_ok;			/* Result of che_ddp_io_check() */
	uint16_t cid;			/* Key used by che_find_command_buffer() */
	uint16_t ttag;
	uint16_t original_cid; /* Host only */

	TAILQ_ENTRY(nvmf_che_command_buffer) link;

	/* Fields used for DDP. */
	struct fw_ri_tpte tpte;		/* Filled in by che_write_tpt_entry() */
	uint64_t *pbl;			/* Host copy of the page list, or NULL */
	uint32_t pbl_addr;		/* Address from t4_pblpool_alloc() */
	uint32_t pbl_len;		/* Size of the PBL in bytes */

	/* Controller only */
	struct nvmf_che_capsule *cc;	/* Released when the buffer is freed */
};
172
/*
 * A list of command buffers.  The add/find/remove helpers assert that
 * 'lock' is owned by the caller.
 */
struct nvmf_che_command_buffer_list {
	TAILQ_HEAD(, nvmf_che_command_buffer) head;
	struct mtx lock;
};
177
/*
 * Transport state for one queue pair.  The embedded struct nvmf_qpair
 * must stay the first member: CQP() converts a pointer to it into a
 * pointer to this structure with a plain cast.
 */
struct nvmf_che_qpair {
	struct nvmf_qpair qp;

	struct socket *so;
	struct toepcb *toep;
	struct nvmf_che_adapter *nca;

	volatile u_int refs; /* Every allocated capsule holds a reference */
	uint8_t txpda;
	uint8_t rxpda;
	bool header_digests;
	bool data_digests;
	uint32_t maxr2t;
	uint32_t maxh2cdata; /* Controller only */
	uint32_t max_rx_data;
	uint32_t max_tx_data;
	uint32_t max_icd; /* Host only */
	uint32_t max_ioccsz; /* Controller only */
	union {
		uint16_t next_fl_ttag; /* Controller only */
		uint16_t next_cid; /* Host only */
	};
	/* Index at which che_alloc_ddp_stag() starts its next search. */
	uint16_t next_ddp_tag;
	u_int num_fl_ttags; /* Controller only */
	u_int active_fl_ttags; /* Controller only */
	/* Number of STAG indices available; the search wraps at this count. */
	u_int num_ddp_tags;
	/* Number of entries of open_ddp_tags[] currently in use. */
	u_int active_ddp_tags;
	bool send_success; /* Controller only */
	/*
	 * Color placed in the low bits of newly allocated DDP tags;
	 * advanced each time the STAG index search wraps around.
	 */
	uint8_t ddp_color;
	/* Added to an STAG index when computing a TPT entry's address. */
	uint32_t tpt_offset;

	/* Receive state. */
	struct thread *rx_thread;
	struct cv rx_cv;
	bool rx_shutdown;
	int rx_error;
	struct mbufq rx_data; /* Data received via CPL_NVMT_DATA. */
	struct mbufq rx_pdus; /* PDU headers received via CPL_NVMT_CMP. */

	/* Transmit state. */
	struct thread *tx_thread;
	struct cv tx_cv;
	bool tx_shutdown;
	STAILQ_HEAD(, nvmf_che_capsule) tx_capsules;

	struct nvmf_che_command_buffer_list tx_buffers;
	struct nvmf_che_command_buffer_list rx_buffers;

	/*
	 * For the controller, an RX command buffer can be in one of
	 * three locations, all protected by the rx_buffers.lock.  If
	 * a receive request is waiting for either an R2T slot for its
	 * command (due to exceeding MAXR2T), or a transfer tag it is
	 * placed on the rx_buffers list.  When a request is allocated
	 * an active transfer tag, it moves to either the
	 * open_ddp_tags[] or open_fl_ttags[] array (indexed by the
	 * tag) until it completes.
	 *
	 * For the host, an RX command buffer using DDP is in
	 * open_ddp_tags[], otherwise it is in rx_buffers.
	 */
	struct nvmf_che_command_buffer **open_ddp_tags;
	struct nvmf_che_command_buffer **open_fl_ttags; /* Controller only */

	/*
	 * For the host, CIDs submitted by nvmf(4) must be rewritten
	 * to either use DDP or not use DDP.  The CID in each response
	 * capsule must be restored to its original value.  For
	 * DDP, the original CID is stored in the command buffer.
	 * These variables manage non-DDP CIDs.
	 */
	uint16_t *fl_cids; /* Host only */
	struct fl_cid_set *fl_cid_set; /* Host only */
	struct mtx fl_cid_lock; /* Host only */
};
253
/*
 * A received PDU.  nvmf_che_validate_pdu() fills in data_len and
 * data_digest_mismatch; nvmf_che_free_pdu() frees 'm' and clears both
 * pointers.
 */
struct nvmf_che_rxpdu {
	struct mbuf *m;
	const struct nvme_tcp_common_pdu_hdr *hdr;
	uint32_t data_len;
	bool data_digest_mismatch;
	bool ddp;	/* NOTE(review): presumably set when data was placed via DDP -- confirm */
};
261
/*
 * Transport wrapper around a capsule.  The embedded struct
 * nvmf_capsule must stay the first member: CCAP() converts a pointer
 * to it into a pointer to this structure with a plain cast.
 */
struct nvmf_che_capsule {
	struct nvmf_capsule nc;

	volatile u_int refs;

	/* Copy of the PDU this capsule was received in, if any. */
	struct nvmf_che_rxpdu rx_pdu;

	uint32_t active_r2ts; /* Controller only */
#ifdef INVARIANTS
	uint32_t tx_data_offset; /* Controller only */
	u_int pending_r2ts; /* Controller only */
#endif

	STAILQ_ENTRY(nvmf_che_capsule) link;
};
277
/*
 * Convert a generic capsule / queue pair pointer to the transport
 * structure that embeds it.  These are unchecked casts.
 */
#define CCAP(nc) ((struct nvmf_che_capsule *)(nc))
#define CQP(qp) ((struct nvmf_che_qpair *)(qp))

/* Forward declarations for functions used before their definitions. */
static void che_release_capsule(struct nvmf_che_capsule *cc);
static void che_free_qpair(struct nvmf_qpair *nq);
283
/* Tunables for this transport, declared under the _kern_nvmf node. */
SYSCTL_NODE(_kern_nvmf, OID_AUTO, che, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
    "Chelsio TCP offload transport");

static u_int che_max_transmit_pdu = 32 * 1024;
SYSCTL_UINT(_kern_nvmf_che, OID_AUTO, max_transmit_pdu, CTLFLAG_RWTUN,
    &che_max_transmit_pdu, 0,
    "Maximum size of a transmitted PDU");

static u_int che_max_receive_pdu = 32 * 1024;
SYSCTL_UINT(_kern_nvmf_che, OID_AUTO, max_receive_pdu, CTLFLAG_RWTUN,
    &che_max_receive_pdu, 0,
    "Maximum size of a received PDU");

/*
 * use_dsgl and inline_threshold are consulted by
 * che_write_adapter_mem(): writes use inline work requests when
 * use_dsgl is zero, when the length is below inline_threshold, or
 * when there is no source buffer; otherwise DMA work requests are
 * used.
 */
static int use_dsgl = 1;
SYSCTL_INT(_kern_nvmf_che, OID_AUTO, use_dsgl, CTLFLAG_RWTUN, &use_dsgl, 0,
    "Use DSGL for PBL/FastReg (default=1)");

static int inline_threshold = 256;
SYSCTL_INT(_kern_nvmf_che, OID_AUTO, inline_threshold, CTLFLAG_RWTUN,
    &inline_threshold, 0,
    "inline vs dsgl threshold (default=256)");

static int ddp_tags_per_qp = 128;
SYSCTL_INT(_kern_nvmf_che, OID_AUTO, ddp_tags_per_qp, CTLFLAG_RWTUN,
    &ddp_tags_per_qp, 0,
    "Number of DDP tags to reserve for each queue pair");

/* malloc(9) type used for the allocations made in this file. */
static MALLOC_DEFINE(M_NVMF_CHE, "nvmf_che", "Chelsio NVMe-TCP offload");
312
313 /*
314 * PBL regions consist of N full-sized pages. TPT entries support an
315 * initial offset into the first page (FBO) and can handle a partial
316 * length on the last page.
317 */
318 static bool
che_ddp_io_check(struct nvmf_che_qpair * qp,const struct nvmf_io_request * io)319 che_ddp_io_check(struct nvmf_che_qpair *qp, const struct nvmf_io_request *io)
320 {
321 const struct memdesc *mem = &io->io_mem;
322 struct bus_dma_segment *ds;
323 int i;
324
325 if (io->io_len < qp->nca->ddp_threshold) {
326 return (false);
327 }
328
329 switch (mem->md_type) {
330 case MEMDESC_VADDR:
331 case MEMDESC_PADDR:
332 case MEMDESC_VMPAGES:
333 return (true);
334 case MEMDESC_VLIST:
335 case MEMDESC_PLIST:
336 /*
337 * Require all but the first segment to start on a
338 * page boundary. Require all but the last segment to
339 * end on a page boundary.
340 */
341 ds = mem->u.md_list;
342 for (i = 0; i < mem->md_nseg; i++, ds++) {
343 if (i != 0 && ds->ds_addr % PAGE_SIZE != 0)
344 return (false);
345 if (i != mem->md_nseg - 1 &&
346 (ds->ds_addr + ds->ds_len) % PAGE_SIZE != 0)
347 return (false);
348 }
349 return (true);
350 default:
351 /*
352 * Other types could be validated with more work, but
353 * they aren't used currently by nvmf(4) or nvmft(4).
354 */
355 return (false);
356 }
357 }
358
359 static u_int
che_fbo(struct nvmf_che_command_buffer * cb)360 che_fbo(struct nvmf_che_command_buffer *cb)
361 {
362 struct memdesc *mem = &cb->io.io_mem;
363
364 switch (mem->md_type) {
365 case MEMDESC_VADDR:
366 return ((uintptr_t)mem->u.md_vaddr & PAGE_MASK);
367 case MEMDESC_PADDR:
368 return (mem->u.md_paddr & PAGE_MASK);
369 case MEMDESC_VMPAGES:
370 return (mem->md_offset);
371 case MEMDESC_VLIST:
372 case MEMDESC_PLIST:
373 return (mem->u.md_list[0].ds_addr & PAGE_MASK);
374 default:
375 __assert_unreachable();
376 }
377 }
378
379 static u_int
che_npages(struct nvmf_che_command_buffer * cb)380 che_npages(struct nvmf_che_command_buffer *cb)
381 {
382 return (howmany(che_fbo(cb) + cb->io.io_len, PAGE_SIZE));
383 }
384
385 static struct nvmf_che_command_buffer *
che_alloc_command_buffer(struct nvmf_che_qpair * qp,const struct nvmf_io_request * io,uint32_t data_offset,size_t data_len,uint16_t cid)386 che_alloc_command_buffer(struct nvmf_che_qpair *qp,
387 const struct nvmf_io_request *io, uint32_t data_offset, size_t data_len,
388 uint16_t cid)
389 {
390 struct nvmf_che_command_buffer *cb;
391
392 cb = malloc(sizeof(*cb), M_NVMF_CHE, M_WAITOK);
393 cb->qp = qp;
394 cb->io = *io;
395 cb->data_offset = data_offset;
396 cb->data_len = data_len;
397 cb->data_xfered = 0;
398 refcount_init(&cb->refs, 1);
399 cb->error = 0;
400 cb->ddp_ok = che_ddp_io_check(qp, io);
401 cb->cid = cid;
402 cb->ttag = 0;
403 cb->original_cid = 0;
404 cb->cc = NULL;
405 cb->pbl = NULL;
406
407 return (cb);
408 }
409
410 static void
che_hold_command_buffer(struct nvmf_che_command_buffer * cb)411 che_hold_command_buffer(struct nvmf_che_command_buffer *cb)
412 {
413 refcount_acquire(&cb->refs);
414 }
415
416 static void
che_free_command_buffer(struct nvmf_che_command_buffer * cb)417 che_free_command_buffer(struct nvmf_che_command_buffer *cb)
418 {
419 nvmf_complete_io_request(&cb->io, cb->data_xfered, cb->error);
420 if (cb->cc != NULL)
421 che_release_capsule(cb->cc);
422 MPASS(cb->pbl == NULL);
423 free(cb, M_NVMF_CHE);
424 }
425
426 static void
che_release_command_buffer(struct nvmf_che_command_buffer * cb)427 che_release_command_buffer(struct nvmf_che_command_buffer *cb)
428 {
429 if (refcount_release(&cb->refs))
430 che_free_command_buffer(cb);
431 }
432
433 static void
che_add_command_buffer(struct nvmf_che_command_buffer_list * list,struct nvmf_che_command_buffer * cb)434 che_add_command_buffer(struct nvmf_che_command_buffer_list *list,
435 struct nvmf_che_command_buffer *cb)
436 {
437 mtx_assert(&list->lock, MA_OWNED);
438 TAILQ_INSERT_HEAD(&list->head, cb, link);
439 }
440
441 static struct nvmf_che_command_buffer *
che_find_command_buffer(struct nvmf_che_command_buffer_list * list,uint16_t cid)442 che_find_command_buffer(struct nvmf_che_command_buffer_list *list,
443 uint16_t cid)
444 {
445 struct nvmf_che_command_buffer *cb;
446
447 mtx_assert(&list->lock, MA_OWNED);
448 TAILQ_FOREACH(cb, &list->head, link) {
449 if (cb->cid == cid)
450 return (cb);
451 }
452 return (NULL);
453 }
454
455 static void
che_remove_command_buffer(struct nvmf_che_command_buffer_list * list,struct nvmf_che_command_buffer * cb)456 che_remove_command_buffer(struct nvmf_che_command_buffer_list *list,
457 struct nvmf_che_command_buffer *cb)
458 {
459 mtx_assert(&list->lock, MA_OWNED);
460 TAILQ_REMOVE(&list->head, cb, link);
461 }
462
463 static void
che_purge_command_buffer(struct nvmf_che_command_buffer_list * list,uint16_t cid)464 che_purge_command_buffer(struct nvmf_che_command_buffer_list *list,
465 uint16_t cid)
466 {
467 struct nvmf_che_command_buffer *cb;
468
469 mtx_lock(&list->lock);
470 cb = che_find_command_buffer(list, cid);
471 if (cb != NULL) {
472 che_remove_command_buffer(list, cb);
473 mtx_unlock(&list->lock);
474 che_release_command_buffer(cb);
475 } else
476 mtx_unlock(&list->lock);
477 }
478
479 static int
che_write_mem_inline(struct adapter * sc,struct toepcb * toep,uint32_t addr,uint32_t len,void * data,struct mbufq * wrq)480 che_write_mem_inline(struct adapter *sc, struct toepcb *toep, uint32_t addr,
481 uint32_t len, void *data, struct mbufq *wrq)
482 {
483 struct mbuf *m;
484 char *cp;
485 int copy_len, i, num_wqe, wr_len;
486
487 #ifdef VERBOSE_TRACES
488 CTR(KTR_CXGBE, "%s: addr 0x%x len %u", __func__, addr << 5, len);
489 #endif
490 num_wqe = DIV_ROUND_UP(len, T4_MAX_INLINE_SIZE);
491 cp = data;
492 for (i = 0; i < num_wqe; i++) {
493 copy_len = min(len, T4_MAX_INLINE_SIZE);
494 wr_len = T4_WRITE_MEM_INLINE_LEN(copy_len);
495
496 m = alloc_raw_wr_mbuf(wr_len);
497 if (m == NULL)
498 return (ENOMEM);
499 t4_write_mem_inline_wr(sc, mtod(m, void *), wr_len, toep->tid,
500 addr, copy_len, cp, 0);
501 if (cp != NULL)
502 cp += T4_MAX_INLINE_SIZE;
503 addr += T4_MAX_INLINE_SIZE >> 5;
504 len -= T4_MAX_INLINE_SIZE;
505
506 mbufq_enqueue(wrq, m);
507 }
508 return (0);
509 }
510
511 static int
che_write_mem_dma_aligned(struct adapter * sc,struct toepcb * toep,uint32_t addr,uint32_t len,void * data,struct mbufq * wrq)512 che_write_mem_dma_aligned(struct adapter *sc, struct toepcb *toep,
513 uint32_t addr, uint32_t len, void *data, struct mbufq *wrq)
514 {
515 struct mbuf *m;
516 vm_offset_t va;
517 u_int todo;
518 int wr_len;
519
520 /* First page. */
521 va = (vm_offset_t)data;
522 todo = min(PAGE_SIZE - (va % PAGE_SIZE), len);
523 wr_len = T4_WRITE_MEM_DMA_LEN;
524 m = alloc_raw_wr_mbuf(wr_len);
525 if (m == NULL)
526 return (ENOMEM);
527 t4_write_mem_dma_wr(sc, mtod(m, void *), wr_len, toep->tid, addr,
528 todo, pmap_kextract(va), 0);
529 mbufq_enqueue(wrq, m);
530 len -= todo;
531 addr += todo >> 5;
532 va += todo;
533
534 while (len > 0) {
535 MPASS(va == trunc_page(va));
536 todo = min(PAGE_SIZE, len);
537 m = alloc_raw_wr_mbuf(wr_len);
538 if (m == NULL)
539 return (ENOMEM);
540 t4_write_mem_dma_wr(sc, mtod(m, void *), wr_len, toep->tid,
541 addr, todo, pmap_kextract(va), 0);
542 mbufq_enqueue(wrq, m);
543 len -= todo;
544 addr += todo >> 5;
545 va += todo;
546 }
547 return (0);
548 }
549
550 static int
che_write_adapter_mem(struct nvmf_che_qpair * qp,uint32_t addr,uint32_t len,void * data)551 che_write_adapter_mem(struct nvmf_che_qpair *qp, uint32_t addr, uint32_t len,
552 void *data)
553 {
554 struct adapter *sc = qp->nca->sc;
555 struct toepcb *toep = qp->toep;
556 struct socket *so = qp->so;
557 struct inpcb *inp = sotoinpcb(so);
558 struct tcpcb *tp = intotcpcb(inp);
559 struct mbufq mq;
560 int error;
561
562 mbufq_init(&mq, INT_MAX);
563 if (!use_dsgl || len < inline_threshold || data == NULL)
564 error = che_write_mem_inline(sc, toep, addr, len, data, &mq);
565 else
566 error = che_write_mem_dma_aligned(sc, toep, addr, len, data,
567 &mq);
568 if (__predict_false(error != 0))
569 goto error;
570
571 INP_WLOCK(inp);
572 if ((tp->t_flags & TF_DISCONNECTED) != 0) {
573 INP_WUNLOCK(inp);
574 error = ECONNRESET;
575 goto error;
576 }
577 mbufq_concat(&toep->ulp_pduq, &mq);
578 INP_WUNLOCK(inp);
579 return (0);
580
581 error:
582 mbufq_drain(&mq);
583 return (error);
584 }
585
586 static bool
che_alloc_pbl(struct nvmf_che_qpair * qp,struct nvmf_che_command_buffer * cb)587 che_alloc_pbl(struct nvmf_che_qpair *qp, struct nvmf_che_command_buffer *cb)
588 {
589 struct adapter *sc = qp->nca->sc;
590 struct memdesc *mem = &cb->io.io_mem;
591 uint64_t *pbl;
592 uint32_t addr, len;
593 u_int i, npages;
594 int error;
595
596 MPASS(cb->pbl == NULL);
597 MPASS(cb->ddp_ok);
598
599 /* Hardware limit? iWARP only enforces this for T5. */
600 if (cb->io.io_len >= (8 * 1024 * 1024 * 1024ULL))
601 return (false);
602
603 npages = che_npages(cb);
604 len = roundup2(npages, 4) * sizeof(*cb->pbl);
605 addr = t4_pblpool_alloc(sc, len);
606 if (addr == 0)
607 return (false);
608
609 pbl = malloc(len, M_NVMF_CHE, M_NOWAIT | M_ZERO);
610 if (pbl == NULL) {
611 t4_pblpool_free(sc, addr, len);
612 return (false);
613 }
614
615 switch (mem->md_type) {
616 case MEMDESC_VADDR:
617 {
618 vm_offset_t va;
619
620 va = trunc_page((uintptr_t)mem->u.md_vaddr);
621 for (i = 0; i < npages; i++)
622 pbl[i] = htobe64(pmap_kextract(va + i * PAGE_SIZE));
623 break;
624 }
625 case MEMDESC_PADDR:
626 {
627 vm_paddr_t pa;
628
629 pa = trunc_page(mem->u.md_paddr);
630 for (i = 0; i < npages; i++)
631 pbl[i] = htobe64(pa + i * PAGE_SIZE);
632 break;
633 }
634 case MEMDESC_VMPAGES:
635 for (i = 0; i < npages; i++)
636 pbl[i] = htobe64(VM_PAGE_TO_PHYS(mem->u.md_ma[i]));
637 break;
638 case MEMDESC_VLIST:
639 {
640 struct bus_dma_segment *ds;
641 vm_offset_t va;
642 vm_size_t len;
643 u_int j, k;
644
645 i = 0;
646 ds = mem->u.md_list;
647 for (j = 0; j < mem->md_nseg; j++, ds++) {
648 va = trunc_page((uintptr_t)ds->ds_addr);
649 len = ds->ds_len;
650 if (ds->ds_addr % PAGE_SIZE != 0)
651 len += ds->ds_addr % PAGE_SIZE;
652 for (k = 0; k < howmany(len, PAGE_SIZE); k++) {
653 pbl[i] = htobe64(pmap_kextract(va +
654 k * PAGE_SIZE));
655 i++;
656 }
657 }
658 MPASS(i == npages);
659 break;
660 }
661 case MEMDESC_PLIST:
662 {
663 struct bus_dma_segment *ds;
664 vm_paddr_t pa;
665 vm_size_t len;
666 u_int j, k;
667
668 i = 0;
669 ds = mem->u.md_list;
670 for (j = 0; j < mem->md_nseg; j++, ds++) {
671 pa = trunc_page((vm_paddr_t)ds->ds_addr);
672 len = ds->ds_len;
673 if (ds->ds_addr % PAGE_SIZE != 0)
674 len += ds->ds_addr % PAGE_SIZE;
675 for (k = 0; k < howmany(len, PAGE_SIZE); k++) {
676 pbl[i] = htobe64(pa + k * PAGE_SIZE);
677 i++;
678 }
679 }
680 MPASS(i == npages);
681 break;
682 }
683 default:
684 __assert_unreachable();
685 }
686
687 error = che_write_adapter_mem(qp, addr >> 5, len, pbl);
688 if (error != 0) {
689 t4_pblpool_free(sc, addr, len);
690 free(pbl, M_NVMF_CHE);
691 return (false);
692 }
693
694 cb->pbl = pbl;
695 cb->pbl_addr = addr;
696 cb->pbl_len = len;
697
698 return (true);
699 }
700
701 static void
che_free_pbl(struct nvmf_che_command_buffer * cb)702 che_free_pbl(struct nvmf_che_command_buffer *cb)
703 {
704 free(cb->pbl, M_NVMF_CHE);
705 t4_pblpool_free(cb->qp->nca->sc, cb->pbl_addr, cb->pbl_len);
706 cb->pbl = NULL;
707 cb->pbl_addr = 0;
708 cb->pbl_len = 0;
709 }
710
711 static bool
che_write_tpt_entry(struct nvmf_che_qpair * qp,struct nvmf_che_command_buffer * cb,uint16_t stag)712 che_write_tpt_entry(struct nvmf_che_qpair *qp,
713 struct nvmf_che_command_buffer *cb, uint16_t stag)
714 {
715 uint32_t tpt_addr;
716 int error;
717
718 cb->tpte.valid_to_pdid = htobe32(F_FW_RI_TPTE_VALID |
719 V_FW_RI_TPTE_STAGKEY(CHE_STAG_COLOR(stag)) |
720 F_FW_RI_TPTE_STAGSTATE |
721 V_FW_RI_TPTE_STAGTYPE(FW_RI_STAG_NSMR) |
722 V_FW_RI_TPTE_PDID(0));
723 cb->tpte.locread_to_qpid = htobe32(
724 V_FW_RI_TPTE_PERM(FW_RI_MEM_ACCESS_REM_WRITE) |
725 V_FW_RI_TPTE_ADDRTYPE(FW_RI_ZERO_BASED_TO) |
726 V_FW_RI_TPTE_PS(PAGE_SIZE) |
727 V_FW_RI_TPTE_QPID(qp->toep->tid));
728 #define PBL_OFF(qp, a) ((a) - (qp)->nca->sc->vres.pbl.start)
729 cb->tpte.nosnoop_pbladdr =
730 htobe32(V_FW_RI_TPTE_PBLADDR(PBL_OFF(qp, cb->pbl_addr) >> 3));
731 cb->tpte.len_lo = htobe32(cb->data_len);
732 cb->tpte.va_hi = 0;
733 cb->tpte.va_lo_fbo = htobe32(che_fbo(cb));
734 cb->tpte.dca_mwbcnt_pstag = 0;
735 cb->tpte.len_hi = htobe32(cb->data_offset);
736
737 tpt_addr = qp->tpt_offset + CHE_STAG_IDX(stag) +
738 (qp->nca->sc->vres.stag.start >> 5);
739
740 error = che_write_adapter_mem(qp, tpt_addr, sizeof(cb->tpte),
741 &cb->tpte);
742 return (error == 0);
743 }
744
745 static void
che_clear_tpt_entry(struct nvmf_che_qpair * qp,uint16_t stag)746 che_clear_tpt_entry(struct nvmf_che_qpair *qp, uint16_t stag)
747 {
748 uint32_t tpt_addr;
749
750 tpt_addr = qp->tpt_offset + CHE_STAG_IDX(stag) +
751 (qp->nca->sc->vres.stag.start >> 5);
752
753 (void)che_write_adapter_mem(qp, tpt_addr, sizeof(struct fw_ri_tpte),
754 NULL);
755 }
756
757 static uint16_t
che_alloc_ddp_stag(struct nvmf_che_qpair * qp,struct nvmf_che_command_buffer * cb)758 che_alloc_ddp_stag(struct nvmf_che_qpair *qp,
759 struct nvmf_che_command_buffer *cb)
760 {
761 uint16_t stag_idx;
762
763 mtx_assert(&qp->rx_buffers.lock, MA_OWNED);
764 MPASS(cb->ddp_ok);
765
766 if (qp->active_ddp_tags == qp->num_ddp_tags)
767 return (CHE_DDP_NO_TAG);
768
769 MPASS(qp->num_ddp_tags != 0);
770
771 stag_idx = qp->next_ddp_tag;
772 for (;;) {
773 if (qp->open_ddp_tags[stag_idx] == NULL)
774 break;
775 if (stag_idx == qp->num_ddp_tags - 1) {
776 stag_idx = 0;
777 if (qp->ddp_color == CHE_DDP_MAX_COLOR)
778 qp->ddp_color = 0;
779 else
780 qp->ddp_color++;
781 } else
782 stag_idx++;
783 MPASS(stag_idx != qp->next_ddp_tag);
784 }
785 if (stag_idx == qp->num_ddp_tags - 1)
786 qp->next_ddp_tag = 0;
787 else
788 qp->next_ddp_tag = stag_idx + 1;
789
790 qp->active_ddp_tags++;
791 qp->open_ddp_tags[stag_idx] = cb;
792
793 return (CHE_DDP_TAG(stag_idx, qp->ddp_color));
794 }
795
796 static void
che_free_ddp_stag(struct nvmf_che_qpair * qp,struct nvmf_che_command_buffer * cb,uint16_t stag)797 che_free_ddp_stag(struct nvmf_che_qpair *qp, struct nvmf_che_command_buffer *cb,
798 uint16_t stag)
799 {
800 MPASS(!CHE_TAG_IS_FL(stag));
801
802 mtx_assert(&qp->rx_buffers.lock, MA_OWNED);
803
804 MPASS(qp->open_ddp_tags[CHE_STAG_IDX(stag)] == cb);
805
806 qp->open_ddp_tags[CHE_STAG_IDX(stag)] = NULL;
807 qp->active_ddp_tags--;
808 }
809
810 static uint16_t
che_alloc_ddp_tag(struct nvmf_che_qpair * qp,struct nvmf_che_command_buffer * cb)811 che_alloc_ddp_tag(struct nvmf_che_qpair *qp,
812 struct nvmf_che_command_buffer *cb)
813 {
814 uint16_t stag;
815
816 mtx_assert(&qp->rx_buffers.lock, MA_OWNED);
817
818 if (!cb->ddp_ok)
819 return (CHE_DDP_NO_TAG);
820
821 stag = che_alloc_ddp_stag(qp, cb);
822 if (stag == CHE_DDP_NO_TAG) {
823 counter_u64_add(qp->toep->ofld_rxq->rx_nvme_ddp_setup_no_stag,
824 1);
825 return (CHE_DDP_NO_TAG);
826 }
827
828 if (!che_alloc_pbl(qp, cb)) {
829 che_free_ddp_stag(qp, cb, stag);
830 counter_u64_add(qp->toep->ofld_rxq->rx_nvme_ddp_setup_error, 1);
831 return (CHE_DDP_NO_TAG);
832 }
833
834 if (!che_write_tpt_entry(qp, cb, stag)) {
835 che_free_pbl(cb);
836 che_free_ddp_stag(qp, cb, stag);
837 counter_u64_add(qp->toep->ofld_rxq->rx_nvme_ddp_setup_error, 1);
838 return (CHE_DDP_NO_TAG);
839 }
840
841 counter_u64_add(qp->toep->ofld_rxq->rx_nvme_ddp_setup_ok, 1);
842 return (stag);
843 }
844
845 static void
che_free_ddp_tag(struct nvmf_che_qpair * qp,struct nvmf_che_command_buffer * cb,uint16_t stag)846 che_free_ddp_tag(struct nvmf_che_qpair *qp, struct nvmf_che_command_buffer *cb,
847 uint16_t stag)
848 {
849 MPASS(!CHE_TAG_IS_FL(stag));
850
851 mtx_assert(&qp->rx_buffers.lock, MA_OWNED);
852
853 MPASS(qp->open_ddp_tags[CHE_STAG_IDX(stag)] == cb);
854
855 che_clear_tpt_entry(qp, stag);
856 che_free_pbl(cb);
857 che_free_ddp_stag(qp, cb, stag);
858 }
859
860 static void
nvmf_che_write_pdu(struct nvmf_che_qpair * qp,struct mbuf * m)861 nvmf_che_write_pdu(struct nvmf_che_qpair *qp, struct mbuf *m)
862 {
863 struct epoch_tracker et;
864 struct socket *so = qp->so;
865 struct inpcb *inp = sotoinpcb(so);
866 struct tcpcb *tp = intotcpcb(inp);
867 struct toepcb *toep = qp->toep;
868
869 CURVNET_SET(so->so_vnet);
870 NET_EPOCH_ENTER(et);
871 INP_WLOCK(inp);
872 if (__predict_false(tp->t_flags & TF_DISCONNECTED) ||
873 __predict_false((toep->flags & TPF_ATTACHED) == 0)) {
874 m_freem(m);
875 } else {
876 mbufq_enqueue(&toep->ulp_pduq, m);
877 t4_push_pdus(toep->vi->adapter, toep, 0);
878 }
879 INP_WUNLOCK(inp);
880 NET_EPOCH_EXIT(et);
881 CURVNET_RESTORE();
882 }
883
884 static void
nvmf_che_report_error(struct nvmf_che_qpair * qp,uint16_t fes,uint32_t fei,struct mbuf * rx_pdu,u_int hlen)885 nvmf_che_report_error(struct nvmf_che_qpair *qp, uint16_t fes, uint32_t fei,
886 struct mbuf *rx_pdu, u_int hlen)
887 {
888 struct nvme_tcp_term_req_hdr *hdr;
889 struct mbuf *m;
890
891 if (hlen != 0) {
892 hlen = min(hlen, NVME_TCP_TERM_REQ_ERROR_DATA_MAX_SIZE);
893 hlen = min(hlen, m_length(rx_pdu, NULL));
894 }
895
896 m = m_get2(sizeof(*hdr) + hlen, M_WAITOK, MT_DATA, M_PKTHDR);
897 m->m_len = sizeof(*hdr) + hlen;
898 m->m_pkthdr.len = m->m_len;
899 hdr = mtod(m, void *);
900 memset(hdr, 0, sizeof(*hdr));
901 hdr->common.pdu_type = qp->qp.nq_controller ?
902 NVME_TCP_PDU_TYPE_C2H_TERM_REQ : NVME_TCP_PDU_TYPE_H2C_TERM_REQ;
903 hdr->common.hlen = sizeof(*hdr);
904 hdr->common.plen = sizeof(*hdr) + hlen;
905 hdr->fes = htole16(fes);
906 le32enc(hdr->fei, fei);
907 if (hlen != 0)
908 m_copydata(rx_pdu, 0, hlen, (caddr_t)(hdr + 1));
909
910 nvmf_che_write_pdu(qp, m);
911 }
912
913 static int
nvmf_che_validate_pdu(struct nvmf_che_qpair * qp,struct nvmf_che_rxpdu * pdu)914 nvmf_che_validate_pdu(struct nvmf_che_qpair *qp, struct nvmf_che_rxpdu *pdu)
915 {
916 const struct nvme_tcp_common_pdu_hdr *ch;
917 struct mbuf *m = pdu->m;
918 uint32_t data_len, fei, plen, rx_digest;
919 u_int hlen, cpl_error;
920 int error;
921 uint16_t fes;
922
923 /* Determine how large of a PDU header to return for errors. */
924 ch = pdu->hdr;
925 hlen = ch->hlen;
926 plen = le32toh(ch->plen);
927 if (hlen < sizeof(*ch) || hlen > plen)
928 hlen = sizeof(*ch);
929
930 cpl_error = m->m_pkthdr.nvmf_cpl_status & CMP_STATUS_ERROR_MASK;
931 switch (cpl_error) {
932 case CMP_STATUS_NO_ERROR:
933 break;
934 case CMP_STATUS_HEADER_DIGEST:
935 counter_u64_add(
936 qp->toep->ofld_rxq->rx_nvme_header_digest_errors, 1);
937 printf("NVMe/TCP: Header digest mismatch\n");
938 rx_digest = le32dec(mtodo(m, ch->hlen));
939 nvmf_che_report_error(qp,
940 NVME_TCP_TERM_REQ_FES_HDGST_ERROR, rx_digest, m,
941 hlen);
942 return (EBADMSG);
943 case CMP_STATUS_DIRECTION_MISMATCH:
944 counter_u64_add(qp->toep->ofld_rxq->rx_nvme_invalid_headers, 1);
945 printf("NVMe/TCP: Invalid PDU type %u\n", ch->pdu_type);
946 nvmf_che_report_error(qp,
947 NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD,
948 offsetof(struct nvme_tcp_common_pdu_hdr, pdu_type), m,
949 hlen);
950 return (EBADMSG);
951 case CMP_STATUS_SUCCESS_NOT_LAST:
952 case CMP_STATUS_DIGEST_FLAG_MISMATCH:
953 counter_u64_add(qp->toep->ofld_rxq->rx_nvme_invalid_headers, 1);
954 printf("NVMe/TCP: Invalid PDU header flags %#x\n", ch->flags);
955 nvmf_che_report_error(qp,
956 NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD,
957 offsetof(struct nvme_tcp_common_pdu_hdr, flags), m, hlen);
958 return (EBADMSG);
959 case CMP_STATUS_BAD_DATA_LENGTH:
960 counter_u64_add(qp->toep->ofld_rxq->rx_nvme_invalid_headers, 1);
961 printf("NVMe/TCP: Invalid PDU length %u\n", plen);
962 nvmf_che_report_error(qp,
963 NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD,
964 offsetof(struct nvme_tcp_common_pdu_hdr, plen), m, hlen);
965 return (EBADMSG);
966 case CMP_STATUS_USER_MODE_UNALLOCATED:
967 case CMP_STATUS_RQT_LIMIT:
968 case CMP_STATUS_RQT_WRAP:
969 case CMP_STATUS_RQT_BOUND:
970 device_printf(qp->nca->sc->dev,
971 "received invalid NVMET error %u\n",
972 cpl_error);
973 return (ECONNRESET);
974 case CMP_STATUS_TPT_LIMIT:
975 case CMP_STATUS_TPT_INVALID:
976 case CMP_STATUS_TPT_COLOUR_MISMATCH:
977 case CMP_STATUS_TPT_MISC:
978 case CMP_STATUS_TPT_WRAP:
979 case CMP_STATUS_TPT_BOUND:
980 counter_u64_add(qp->toep->ofld_rxq->rx_nvme_invalid_headers, 1);
981 switch (ch->pdu_type) {
982 case NVME_TCP_PDU_TYPE_H2C_DATA:
983 nvmf_che_report_error(qp,
984 NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD,
985 offsetof(struct nvme_tcp_h2c_data_hdr, ttag),
986 pdu->m, pdu->hdr->hlen);
987 return (EBADMSG);
988 case NVME_TCP_PDU_TYPE_C2H_DATA:
989 nvmf_che_report_error(qp,
990 NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD,
991 offsetof(struct nvme_tcp_c2h_data_hdr, cccid), m,
992 hlen);
993 return (EBADMSG);
994 default:
995 device_printf(qp->nca->sc->dev,
996 "received DDP NVMET error %u for PDU %u\n",
997 cpl_error, ch->pdu_type);
998 return (ECONNRESET);
999 }
1000 case CMP_STATUS_TPT_LAST_PDU_UNALIGNED:
1001 counter_u64_add(qp->toep->ofld_rxq->rx_nvme_invalid_headers, 1);
1002 nvmf_che_report_error(qp,
1003 NVME_TCP_TERM_REQ_FES_PDU_SEQUENCE_ERROR, 0, m, hlen);
1004 return (EBADMSG);
1005 case CMP_STATUS_PBL_LIMIT:
1006 counter_u64_add(qp->toep->ofld_rxq->rx_nvme_invalid_headers, 1);
1007 nvmf_che_report_error(qp,
1008 NVME_TCP_TERM_REQ_FES_DATA_TRANSFER_OUT_OF_RANGE, 0, m,
1009 hlen);
1010 return (EBADMSG);
1011 case CMP_STATUS_DATA_DIGEST:
1012 /* Handled below. */
1013 break;
1014 default:
1015 device_printf(qp->nca->sc->dev,
1016 "received unknown NVMET error %u\n",
1017 cpl_error);
1018 return (ECONNRESET);
1019 }
1020
1021 error = nvmf_tcp_validate_pdu_header(ch, qp->qp.nq_controller,
1022 qp->header_digests, qp->data_digests, qp->rxpda, &data_len, &fes,
1023 &fei);
1024 if (error != 0) {
1025 if (error != ECONNRESET)
1026 nvmf_che_report_error(qp, fes, fei, m, hlen);
1027 return (error);
1028 }
1029
1030 /* Check data digest if present. */
1031 pdu->data_digest_mismatch = false;
1032 if ((ch->flags & NVME_TCP_CH_FLAGS_DDGSTF) != 0) {
1033 if (cpl_error == CMP_STATUS_DATA_DIGEST) {
1034 printf("NVMe/TCP: Data digest mismatch\n");
1035 pdu->data_digest_mismatch = true;
1036 counter_u64_add(
1037 qp->toep->ofld_rxq->rx_nvme_data_digest_errors, 1);
1038 }
1039 }
1040
1041 pdu->data_len = data_len;
1042
1043 return (0);
1044 }
1045
1046 static void
nvmf_che_free_pdu(struct nvmf_che_rxpdu * pdu)1047 nvmf_che_free_pdu(struct nvmf_che_rxpdu *pdu)
1048 {
1049 m_freem(pdu->m);
1050 pdu->m = NULL;
1051 pdu->hdr = NULL;
1052 }
1053
1054 static int
nvmf_che_handle_term_req(struct nvmf_che_rxpdu * pdu)1055 nvmf_che_handle_term_req(struct nvmf_che_rxpdu *pdu)
1056 {
1057 const struct nvme_tcp_term_req_hdr *hdr;
1058
1059 hdr = (const void *)pdu->hdr;
1060
1061 printf("NVMe/TCP: Received termination request: fes %#x fei %#x\n",
1062 le16toh(hdr->fes), le32dec(hdr->fei));
1063 nvmf_che_free_pdu(pdu);
1064 return (ECONNRESET);
1065 }
1066
1067 static int
nvmf_che_save_command_capsule(struct nvmf_che_qpair * qp,struct nvmf_che_rxpdu * pdu)1068 nvmf_che_save_command_capsule(struct nvmf_che_qpair *qp,
1069 struct nvmf_che_rxpdu *pdu)
1070 {
1071 const struct nvme_tcp_cmd *cmd;
1072 struct nvmf_capsule *nc;
1073 struct nvmf_che_capsule *cc;
1074
1075 cmd = (const void *)pdu->hdr;
1076
1077 nc = nvmf_allocate_command(&qp->qp, &cmd->ccsqe, M_WAITOK);
1078
1079 cc = CCAP(nc);
1080 cc->rx_pdu = *pdu;
1081
1082 nvmf_capsule_received(&qp->qp, nc);
1083 return (0);
1084 }
1085
1086 static int
nvmf_che_save_response_capsule(struct nvmf_che_qpair * qp,struct nvmf_che_rxpdu * pdu)1087 nvmf_che_save_response_capsule(struct nvmf_che_qpair *qp,
1088 struct nvmf_che_rxpdu *pdu)
1089 {
1090 const struct nvme_tcp_rsp *rsp;
1091 struct nvme_completion cpl;
1092 struct nvmf_capsule *nc;
1093 struct nvmf_che_capsule *cc;
1094 uint16_t cid;
1095
1096 rsp = (const void *)pdu->hdr;
1097
1098 /*
1099 * Restore the original CID and ensure any command buffers
1100 * associated with this CID have been released. Once the CQE
1101 * has been received, no further transfers to the command
1102 * buffer for the associated CID can occur.
1103 */
1104 cpl = rsp->rccqe;
1105 cid = le16toh(cpl.cid);
1106 if (CHE_TAG_IS_FL(cid)) {
1107 cid = CHE_RAW_FL_TAG(cid);
1108 mtx_lock(&qp->fl_cid_lock);
1109 MPASS(FL_CID_ISACTIVE(cid, qp->fl_cid_set));
1110 cpl.cid = qp->fl_cids[cid];
1111 FL_CID_FREE(cid, qp->fl_cid_set);
1112 mtx_unlock(&qp->fl_cid_lock);
1113
1114 che_purge_command_buffer(&qp->rx_buffers, rsp->rccqe.cid);
1115 che_purge_command_buffer(&qp->tx_buffers, rsp->rccqe.cid);
1116 } else {
1117 struct nvmf_che_command_buffer *cb;
1118
1119 mtx_lock(&qp->rx_buffers.lock);
1120 cb = qp->open_ddp_tags[CHE_STAG_IDX(cid)];
1121 MPASS(cb != NULL);
1122 MPASS(cb->cid == rsp->rccqe.cid);
1123 cpl.cid = cb->original_cid;
1124 che_free_ddp_tag(qp, cb, cid);
1125 mtx_unlock(&qp->rx_buffers.lock);
1126 che_release_command_buffer(cb);
1127 }
1128 #ifdef VERBOSE_TRACES
1129 CTR(KTR_CXGBE, "%s: tid %u freed cid 0x%04x for 0x%04x", __func__,
1130 qp->toep->tid, le16toh(rsp->rccqe.cid), cpl.cid);
1131 #endif
1132
1133 nc = nvmf_allocate_response(&qp->qp, &cpl, M_WAITOK);
1134
1135 nc->nc_sqhd_valid = true;
1136 cc = CCAP(nc);
1137 cc->rx_pdu = *pdu;
1138
1139 nvmf_capsule_received(&qp->qp, nc);
1140 return (0);
1141 }
1142
1143 /*
1144 * Construct a PDU that contains an optional data payload. This
1145 * includes dealing with the length fields in the common header. The
1146 * adapter inserts digests and padding when the PDU is transmitted.
1147 */
1148 static struct mbuf *
nvmf_che_construct_pdu(struct nvmf_che_qpair * qp,void * hdr,size_t hlen,struct mbuf * data,uint32_t data_len)1149 nvmf_che_construct_pdu(struct nvmf_che_qpair *qp, void *hdr, size_t hlen,
1150 struct mbuf *data, uint32_t data_len)
1151 {
1152 struct nvme_tcp_common_pdu_hdr *ch;
1153 struct mbuf *top;
1154 uint32_t pdo, plen;
1155 uint8_t ulp_submode;
1156
1157 plen = hlen;
1158 if (qp->header_digests)
1159 plen += sizeof(uint32_t);
1160 if (data_len != 0) {
1161 KASSERT(m_length(data, NULL) == data_len, ("length mismatch"));
1162 pdo = roundup(plen, qp->txpda);
1163 plen = pdo + data_len;
1164 if (qp->data_digests)
1165 plen += sizeof(uint32_t);
1166 } else {
1167 KASSERT(data == NULL, ("payload mbuf with zero length"));
1168 pdo = 0;
1169 }
1170
1171 top = m_get2(hlen, M_WAITOK, MT_DATA, M_PKTHDR);
1172 top->m_len = hlen;
1173 top->m_pkthdr.len = hlen;
1174 ch = mtod(top, void *);
1175 memcpy(ch, hdr, hlen);
1176 ch->hlen = hlen;
1177 ulp_submode = 0;
1178 if (qp->header_digests) {
1179 ch->flags |= NVME_TCP_CH_FLAGS_HDGSTF;
1180 ulp_submode |= ULP_CRC_HEADER;
1181 }
1182 if (qp->data_digests && data_len != 0) {
1183 ch->flags |= NVME_TCP_CH_FLAGS_DDGSTF;
1184 ulp_submode |= ULP_CRC_DATA;
1185 }
1186 ch->pdo = pdo;
1187 ch->plen = htole32(plen);
1188 set_mbuf_ulp_submode(top, ulp_submode);
1189
1190 if (data_len != 0) {
1191 top->m_pkthdr.len += data_len;
1192 top->m_next = data;
1193 }
1194
1195 return (top);
1196 }
1197
1198 /* Allocate the next free freelist transfer tag. */
1199 static bool
nvmf_che_allocate_fl_ttag(struct nvmf_che_qpair * qp,struct nvmf_che_command_buffer * cb)1200 nvmf_che_allocate_fl_ttag(struct nvmf_che_qpair *qp,
1201 struct nvmf_che_command_buffer *cb)
1202 {
1203 uint16_t ttag;
1204
1205 mtx_assert(&qp->rx_buffers.lock, MA_OWNED);
1206
1207 if (qp->active_fl_ttags == qp->num_fl_ttags)
1208 return (false);
1209
1210 ttag = qp->next_fl_ttag;
1211 for (;;) {
1212 if (qp->open_fl_ttags[ttag] == NULL)
1213 break;
1214 if (ttag == qp->num_fl_ttags - 1)
1215 ttag = 0;
1216 else
1217 ttag++;
1218 MPASS(ttag != qp->next_fl_ttag);
1219 }
1220 if (ttag == qp->num_fl_ttags - 1)
1221 qp->next_fl_ttag = 0;
1222 else
1223 qp->next_fl_ttag = ttag + 1;
1224
1225 qp->active_fl_ttags++;
1226 qp->open_fl_ttags[ttag] = cb;
1227
1228 cb->ttag = ttag | CHE_FL_TAG_MASK;
1229 return (true);
1230 }
1231
1232 /* Attempt to allocate a free transfer tag and assign it to cb. */
1233 static bool
nvmf_che_allocate_ttag(struct nvmf_che_qpair * qp,struct nvmf_che_command_buffer * cb)1234 nvmf_che_allocate_ttag(struct nvmf_che_qpair *qp,
1235 struct nvmf_che_command_buffer *cb)
1236 {
1237 uint16_t stag;
1238
1239 mtx_assert(&qp->rx_buffers.lock, MA_OWNED);
1240
1241 stag = che_alloc_ddp_tag(qp, cb);
1242 if (stag == CHE_DDP_NO_TAG) {
1243 if (!nvmf_che_allocate_fl_ttag(qp, cb))
1244 return (false);
1245 } else {
1246 cb->ttag = stag;
1247 }
1248 #ifdef VERBOSE_TRACES
1249 CTR(KTR_CXGBE, "%s: tid %u allocated ttag 0x%04x", __func__,
1250 qp->toep->tid, cb->ttag);
1251 #endif
1252 cb->cc->active_r2ts++;
1253 return (true);
1254 }
1255
1256 /* Find the next command buffer eligible to schedule for R2T. */
1257 static struct nvmf_che_command_buffer *
nvmf_che_next_r2t(struct nvmf_che_qpair * qp)1258 nvmf_che_next_r2t(struct nvmf_che_qpair *qp)
1259 {
1260 struct nvmf_che_command_buffer *cb;
1261
1262 mtx_assert(&qp->rx_buffers.lock, MA_OWNED);
1263
1264 TAILQ_FOREACH(cb, &qp->rx_buffers.head, link) {
1265 /* NB: maxr2t is 0's based. */
1266 if (cb->cc->active_r2ts > qp->maxr2t)
1267 continue;
1268
1269 if (!nvmf_che_allocate_ttag(qp, cb))
1270 return (NULL);
1271 #ifdef INVARIANTS
1272 cb->cc->pending_r2ts--;
1273 #endif
1274 TAILQ_REMOVE(&qp->rx_buffers.head, cb, link);
1275 return (cb);
1276 }
1277 return (NULL);
1278 }
1279
1280 /* NB: cid and is little-endian already. */
1281 static void
che_send_r2t(struct nvmf_che_qpair * qp,uint16_t cid,uint16_t ttag,uint32_t data_offset,uint32_t data_len)1282 che_send_r2t(struct nvmf_che_qpair *qp, uint16_t cid, uint16_t ttag,
1283 uint32_t data_offset, uint32_t data_len)
1284 {
1285 struct nvme_tcp_r2t_hdr r2t;
1286 struct mbuf *m;
1287
1288 memset(&r2t, 0, sizeof(r2t));
1289 r2t.common.pdu_type = NVME_TCP_PDU_TYPE_R2T;
1290 r2t.cccid = cid;
1291 r2t.ttag = htole16(ttag);
1292 r2t.r2to = htole32(data_offset);
1293 r2t.r2tl = htole32(data_len);
1294
1295 m = nvmf_che_construct_pdu(qp, &r2t, sizeof(r2t), NULL, 0);
1296 nvmf_che_write_pdu(qp, m);
1297 }
1298
1299 /*
1300 * Release a transfer tag and schedule another R2T.
1301 *
1302 * NB: This drops the rx_buffers.lock mutex.
1303 */
1304 static void
nvmf_che_send_next_r2t(struct nvmf_che_qpair * qp,struct nvmf_che_command_buffer * cb)1305 nvmf_che_send_next_r2t(struct nvmf_che_qpair *qp,
1306 struct nvmf_che_command_buffer *cb)
1307 {
1308 struct nvmf_che_command_buffer *ncb;
1309
1310 mtx_assert(&qp->rx_buffers.lock, MA_OWNED);
1311
1312 #ifdef VERBOSE_TRACES
1313 CTR(KTR_CXGBE, "%s: tid %u freed ttag 0x%04x", __func__, qp->toep->tid,
1314 cb->ttag);
1315 #endif
1316 if (CHE_TAG_IS_FL(cb->ttag)) {
1317 uint16_t ttag;
1318
1319 ttag = CHE_RAW_FL_TAG(cb->ttag);
1320 MPASS(qp->open_fl_ttags[ttag] == cb);
1321
1322 /* Release this transfer tag. */
1323 qp->open_fl_ttags[ttag] = NULL;
1324 qp->active_fl_ttags--;
1325 } else
1326 che_free_ddp_tag(qp, cb, cb->ttag);
1327
1328 cb->cc->active_r2ts--;
1329
1330 /* Schedule another R2T. */
1331 ncb = nvmf_che_next_r2t(qp);
1332 mtx_unlock(&qp->rx_buffers.lock);
1333 if (ncb != NULL)
1334 che_send_r2t(qp, ncb->cid, ncb->ttag, ncb->data_offset,
1335 ncb->data_len);
1336 }
1337
1338 /*
1339 * Copy len bytes starting at offset skip from an mbuf chain into an
1340 * I/O buffer at destination offset io_offset.
1341 */
1342 static void
mbuf_copyto_io(struct mbuf * m,u_int skip,u_int len,struct nvmf_io_request * io,u_int io_offset)1343 mbuf_copyto_io(struct mbuf *m, u_int skip, u_int len,
1344 struct nvmf_io_request *io, u_int io_offset)
1345 {
1346 u_int todo;
1347
1348 while (m->m_len <= skip) {
1349 skip -= m->m_len;
1350 m = m->m_next;
1351 }
1352 while (len != 0) {
1353 MPASS((m->m_flags & M_EXTPG) == 0);
1354
1355 todo = min(m->m_len - skip, len);
1356 memdesc_copyback(&io->io_mem, io_offset, todo, mtodo(m, skip));
1357 skip = 0;
1358 io_offset += todo;
1359 len -= todo;
1360 m = m->m_next;
1361 }
1362 }
1363
1364 static int
nvmf_che_handle_h2c_data(struct nvmf_che_qpair * qp,struct nvmf_che_rxpdu * pdu)1365 nvmf_che_handle_h2c_data(struct nvmf_che_qpair *qp, struct nvmf_che_rxpdu *pdu)
1366 {
1367 const struct nvme_tcp_h2c_data_hdr *h2c;
1368 struct nvmf_che_command_buffer *cb;
1369 uint32_t data_len, data_offset;
1370 uint16_t ttag, fl_ttag;
1371
1372 h2c = (const void *)pdu->hdr;
1373 if (le32toh(h2c->datal) > qp->maxh2cdata) {
1374 nvmf_che_report_error(qp,
1375 NVME_TCP_TERM_REQ_FES_DATA_TRANSFER_LIMIT_EXCEEDED, 0,
1376 pdu->m, pdu->hdr->hlen);
1377 nvmf_che_free_pdu(pdu);
1378 return (EBADMSG);
1379 }
1380
1381 ttag = le16toh(h2c->ttag);
1382 if (CHE_TAG_IS_FL(ttag)) {
1383 fl_ttag = CHE_RAW_FL_TAG(ttag);
1384 if (fl_ttag >= qp->num_fl_ttags) {
1385 nvmf_che_report_error(qp,
1386 NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD,
1387 offsetof(struct nvme_tcp_h2c_data_hdr, ttag),
1388 pdu->m, pdu->hdr->hlen);
1389 nvmf_che_free_pdu(pdu);
1390 return (EBADMSG);
1391 }
1392
1393 mtx_lock(&qp->rx_buffers.lock);
1394 cb = qp->open_fl_ttags[fl_ttag];
1395 } else {
1396 if (CHE_STAG_IDX(ttag) >= qp->num_ddp_tags) {
1397 nvmf_che_report_error(qp,
1398 NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD,
1399 offsetof(struct nvme_tcp_h2c_data_hdr, ttag),
1400 pdu->m, pdu->hdr->hlen);
1401 nvmf_che_free_pdu(pdu);
1402 return (EBADMSG);
1403 }
1404
1405 mtx_lock(&qp->rx_buffers.lock);
1406 cb = qp->open_ddp_tags[CHE_STAG_IDX(ttag)];
1407 }
1408
1409 if (cb == NULL) {
1410 mtx_unlock(&qp->rx_buffers.lock);
1411 nvmf_che_report_error(qp,
1412 NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD,
1413 offsetof(struct nvme_tcp_h2c_data_hdr, ttag), pdu->m,
1414 pdu->hdr->hlen);
1415 nvmf_che_free_pdu(pdu);
1416 return (EBADMSG);
1417 }
1418 MPASS(cb->ttag == ttag);
1419
1420 /* For a data digest mismatch, fail the I/O request. */
1421 if (pdu->data_digest_mismatch) {
1422 nvmf_che_send_next_r2t(qp, cb);
1423 cb->error = EINTEGRITY;
1424 che_release_command_buffer(cb);
1425 nvmf_che_free_pdu(pdu);
1426 return (0);
1427 }
1428
1429 data_len = le32toh(h2c->datal);
1430 if (data_len != pdu->data_len) {
1431 mtx_unlock(&qp->rx_buffers.lock);
1432 nvmf_che_report_error(qp,
1433 NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD,
1434 offsetof(struct nvme_tcp_h2c_data_hdr, datal), pdu->m,
1435 pdu->hdr->hlen);
1436 nvmf_che_free_pdu(pdu);
1437 return (EBADMSG);
1438 }
1439
1440 data_offset = le32toh(h2c->datao);
1441 if (data_offset < cb->data_offset ||
1442 data_offset + data_len > cb->data_offset + cb->data_len) {
1443 mtx_unlock(&qp->rx_buffers.lock);
1444 nvmf_che_report_error(qp,
1445 NVME_TCP_TERM_REQ_FES_DATA_TRANSFER_OUT_OF_RANGE, 0, pdu->m,
1446 pdu->hdr->hlen);
1447 nvmf_che_free_pdu(pdu);
1448 return (EBADMSG);
1449 }
1450
1451 if (data_offset != cb->data_offset + cb->data_xfered) {
1452 if (CHE_TAG_IS_FL(ttag)) {
1453 mtx_unlock(&qp->rx_buffers.lock);
1454 nvmf_che_report_error(qp,
1455 NVME_TCP_TERM_REQ_FES_PDU_SEQUENCE_ERROR, 0, pdu->m,
1456 pdu->hdr->hlen);
1457 nvmf_che_free_pdu(pdu);
1458 return (EBADMSG);
1459 } else {
1460 uint32_t ddp_bytes;
1461
1462 /* Account for PDUs silently received via DDP. */
1463 ddp_bytes = data_offset -
1464 (cb->data_offset + cb->data_xfered);
1465 cb->data_xfered += ddp_bytes;
1466 #ifdef VERBOSE_TRACES
1467 CTR(KTR_CXGBE, "%s: tid %u previous ddp_bytes %u",
1468 __func__, qp->toep->tid, ddp_bytes);
1469 #endif
1470 counter_u64_add(qp->toep->ofld_rxq->rx_nvme_ddp_octets,
1471 ddp_bytes);
1472 }
1473 }
1474
1475 if ((cb->data_xfered + data_len == cb->data_len) !=
1476 ((pdu->hdr->flags & NVME_TCP_H2C_DATA_FLAGS_LAST_PDU) != 0)) {
1477 mtx_unlock(&qp->rx_buffers.lock);
1478 nvmf_che_report_error(qp,
1479 NVME_TCP_TERM_REQ_FES_PDU_SEQUENCE_ERROR, 0, pdu->m,
1480 pdu->hdr->hlen);
1481 nvmf_che_free_pdu(pdu);
1482 return (EBADMSG);
1483 }
1484
1485 cb->data_xfered += data_len;
1486 data_offset -= cb->data_offset;
1487 if (cb->data_xfered == cb->data_len) {
1488 nvmf_che_send_next_r2t(qp, cb);
1489 } else {
1490 che_hold_command_buffer(cb);
1491 mtx_unlock(&qp->rx_buffers.lock);
1492 }
1493
1494 if (CHE_TAG_IS_FL(ttag))
1495 mbuf_copyto_io(pdu->m->m_next, 0, data_len, &cb->io,
1496 data_offset);
1497
1498 che_release_command_buffer(cb);
1499 nvmf_che_free_pdu(pdu);
1500 return (0);
1501 }
1502
1503 static int
nvmf_che_handle_c2h_data(struct nvmf_che_qpair * qp,struct nvmf_che_rxpdu * pdu)1504 nvmf_che_handle_c2h_data(struct nvmf_che_qpair *qp, struct nvmf_che_rxpdu *pdu)
1505 {
1506 const struct nvme_tcp_c2h_data_hdr *c2h;
1507 struct nvmf_che_command_buffer *cb;
1508 uint32_t data_len, data_offset;
1509 uint16_t cid, original_cid;
1510
1511 /*
1512 * Unlike freelist command buffers, DDP command buffers are
1513 * not released until the response capsule is received to keep
1514 * the STAG allocated until the command has completed.
1515 */
1516 c2h = (const void *)pdu->hdr;
1517
1518 cid = le16toh(c2h->cccid);
1519 if (CHE_TAG_IS_FL(cid)) {
1520 mtx_lock(&qp->rx_buffers.lock);
1521 cb = che_find_command_buffer(&qp->rx_buffers, c2h->cccid);
1522 } else {
1523 if (CHE_STAG_IDX(cid) >= qp->num_ddp_tags) {
1524 nvmf_che_report_error(qp,
1525 NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD,
1526 offsetof(struct nvme_tcp_c2h_data_hdr, cccid),
1527 pdu->m, pdu->hdr->hlen);
1528 nvmf_che_free_pdu(pdu);
1529 return (EBADMSG);
1530 }
1531
1532 mtx_lock(&qp->rx_buffers.lock);
1533 cb = qp->open_ddp_tags[CHE_STAG_IDX(cid)];
1534 }
1535
1536 if (cb == NULL) {
1537 mtx_unlock(&qp->rx_buffers.lock);
1538 /*
1539 * XXX: Could be PDU sequence error if cccid is for a
1540 * command that doesn't use a command buffer.
1541 */
1542 nvmf_che_report_error(qp,
1543 NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD,
1544 offsetof(struct nvme_tcp_c2h_data_hdr, cccid), pdu->m,
1545 pdu->hdr->hlen);
1546 nvmf_che_free_pdu(pdu);
1547 return (EBADMSG);
1548 }
1549
1550 /* For a data digest mismatch, fail the I/O request. */
1551 if (pdu->data_digest_mismatch) {
1552 cb->error = EINTEGRITY;
1553 if (CHE_TAG_IS_FL(cid)) {
1554 che_remove_command_buffer(&qp->rx_buffers, cb);
1555 mtx_unlock(&qp->rx_buffers.lock);
1556 che_release_command_buffer(cb);
1557 } else
1558 mtx_unlock(&qp->rx_buffers.lock);
1559 nvmf_che_free_pdu(pdu);
1560 return (0);
1561 }
1562
1563 data_len = le32toh(c2h->datal);
1564 if (data_len != pdu->data_len) {
1565 mtx_unlock(&qp->rx_buffers.lock);
1566 nvmf_che_report_error(qp,
1567 NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD,
1568 offsetof(struct nvme_tcp_c2h_data_hdr, datal), pdu->m,
1569 pdu->hdr->hlen);
1570 nvmf_che_free_pdu(pdu);
1571 return (EBADMSG);
1572 }
1573
1574 data_offset = le32toh(c2h->datao);
1575 if (data_offset < cb->data_offset ||
1576 data_offset + data_len > cb->data_offset + cb->data_len) {
1577 mtx_unlock(&qp->rx_buffers.lock);
1578 nvmf_che_report_error(qp,
1579 NVME_TCP_TERM_REQ_FES_DATA_TRANSFER_OUT_OF_RANGE, 0,
1580 pdu->m, pdu->hdr->hlen);
1581 nvmf_che_free_pdu(pdu);
1582 return (EBADMSG);
1583 }
1584
1585 if (data_offset != cb->data_offset + cb->data_xfered) {
1586 if (CHE_TAG_IS_FL(cid)) {
1587 mtx_unlock(&qp->rx_buffers.lock);
1588 nvmf_che_report_error(qp,
1589 NVME_TCP_TERM_REQ_FES_PDU_SEQUENCE_ERROR, 0, pdu->m,
1590 pdu->hdr->hlen);
1591 nvmf_che_free_pdu(pdu);
1592 return (EBADMSG);
1593 } else {
1594 uint32_t ddp_bytes;
1595
1596 /* Account for PDUs silently received via DDP. */
1597 ddp_bytes = data_offset -
1598 (cb->data_offset + cb->data_xfered);
1599 cb->data_xfered += ddp_bytes;
1600 #ifdef VERBOSE_TRACES
1601 CTR(KTR_CXGBE, "%s: tid %u previous ddp_bytes %u",
1602 __func__, qp->toep->tid, ddp_bytes);
1603 #endif
1604 counter_u64_add(qp->toep->ofld_rxq->rx_nvme_ddp_octets,
1605 ddp_bytes);
1606 }
1607 }
1608
1609 if ((cb->data_xfered + data_len == cb->data_len) !=
1610 ((pdu->hdr->flags & NVME_TCP_C2H_DATA_FLAGS_LAST_PDU) != 0)) {
1611 mtx_unlock(&qp->rx_buffers.lock);
1612 nvmf_che_report_error(qp,
1613 NVME_TCP_TERM_REQ_FES_PDU_SEQUENCE_ERROR, 0, pdu->m,
1614 pdu->hdr->hlen);
1615 nvmf_che_free_pdu(pdu);
1616 return (EBADMSG);
1617 }
1618
1619 cb->data_xfered += data_len;
1620 original_cid = cb->original_cid;
1621
1622 if (CHE_TAG_IS_FL(cid)) {
1623 data_offset -= cb->data_offset;
1624 if (cb->data_xfered == cb->data_len)
1625 che_remove_command_buffer(&qp->rx_buffers, cb);
1626 else
1627 che_hold_command_buffer(cb);
1628 mtx_unlock(&qp->rx_buffers.lock);
1629
1630 if ((pdu->hdr->flags & NVME_TCP_C2H_DATA_FLAGS_SUCCESS) != 0) {
1631 /*
1632 * Free the CID as the command has now been
1633 * completed.
1634 */
1635 cid = CHE_RAW_FL_TAG(cid);
1636 mtx_lock(&qp->fl_cid_lock);
1637 MPASS(FL_CID_ISACTIVE(cid, qp->fl_cid_set));
1638 MPASS(original_cid == qp->fl_cids[cid]);
1639 FL_CID_FREE(cid, qp->fl_cid_set);
1640 mtx_unlock(&qp->fl_cid_lock);
1641 }
1642
1643 mbuf_copyto_io(pdu->m->m_next, 0, data_len, &cb->io,
1644 data_offset);
1645
1646 che_release_command_buffer(cb);
1647 } else {
1648 if ((pdu->hdr->flags & NVME_TCP_C2H_DATA_FLAGS_SUCCESS) != 0) {
1649 /*
1650 * Free the command buffer and STAG as the
1651 * command has now been completed.
1652 */
1653 che_free_ddp_tag(qp, cb, cid);
1654 mtx_unlock(&qp->rx_buffers.lock);
1655 che_release_command_buffer(cb);
1656 } else
1657 mtx_unlock(&qp->rx_buffers.lock);
1658 }
1659
1660 if ((pdu->hdr->flags & NVME_TCP_C2H_DATA_FLAGS_SUCCESS) != 0) {
1661 struct nvme_completion cqe;
1662 struct nvmf_capsule *nc;
1663
1664 memset(&cqe, 0, sizeof(cqe));
1665 cqe.cid = original_cid;
1666
1667 nc = nvmf_allocate_response(&qp->qp, &cqe, M_WAITOK);
1668 nc->nc_sqhd_valid = false;
1669
1670 nvmf_capsule_received(&qp->qp, nc);
1671 }
1672
1673 nvmf_che_free_pdu(pdu);
1674 return (0);
1675 }
1676
1677 /* Called when m_free drops refcount to 0. */
1678 static void
nvmf_che_mbuf_done(struct mbuf * m)1679 nvmf_che_mbuf_done(struct mbuf *m)
1680 {
1681 struct nvmf_che_command_buffer *cb = m->m_ext.ext_arg1;
1682
1683 che_free_command_buffer(cb);
1684 }
1685
1686 static struct mbuf *
nvmf_che_mbuf(void * arg,int how,void * data,size_t len)1687 nvmf_che_mbuf(void *arg, int how, void *data, size_t len)
1688 {
1689 struct nvmf_che_command_buffer *cb = arg;
1690 struct mbuf *m;
1691
1692 m = m_get(how, MT_DATA);
1693 m->m_flags |= M_RDONLY;
1694 m_extaddref(m, data, len, &cb->refs, nvmf_che_mbuf_done, cb, NULL);
1695 m->m_len = len;
1696 return (m);
1697 }
1698
1699 static void
nvmf_che_free_mext_pg(struct mbuf * m)1700 nvmf_che_free_mext_pg(struct mbuf *m)
1701 {
1702 struct nvmf_che_command_buffer *cb = m->m_ext.ext_arg1;
1703
1704 M_ASSERTEXTPG(m);
1705 che_release_command_buffer(cb);
1706 }
1707
1708 static struct mbuf *
nvmf_che_mext_pg(void * arg,int how)1709 nvmf_che_mext_pg(void *arg, int how)
1710 {
1711 struct nvmf_che_command_buffer *cb = arg;
1712 struct mbuf *m;
1713
1714 m = mb_alloc_ext_pgs(how, nvmf_che_free_mext_pg, M_RDONLY);
1715 m->m_ext.ext_arg1 = cb;
1716 che_hold_command_buffer(cb);
1717 return (m);
1718 }
1719
1720 /*
1721 * Return an mbuf chain for a range of data belonging to a command
1722 * buffer.
1723 *
1724 * The mbuf chain uses M_EXT mbufs which hold references on the
1725 * command buffer so that it remains "alive" until the data has been
1726 * fully transmitted. If truncate_ok is true, then the mbuf chain
1727 * might return a short chain to avoid gratuitously splitting up a
1728 * page.
1729 */
1730 static struct mbuf *
nvmf_che_command_buffer_mbuf(struct nvmf_che_command_buffer * cb,uint32_t data_offset,uint32_t data_len,uint32_t * actual_len,bool can_truncate)1731 nvmf_che_command_buffer_mbuf(struct nvmf_che_command_buffer *cb,
1732 uint32_t data_offset, uint32_t data_len, uint32_t *actual_len,
1733 bool can_truncate)
1734 {
1735 struct mbuf *m;
1736 size_t len;
1737
1738 m = memdesc_alloc_ext_mbufs(&cb->io.io_mem, nvmf_che_mbuf,
1739 nvmf_che_mext_pg, cb, M_WAITOK, data_offset, data_len, &len,
1740 can_truncate);
1741 if (actual_len != NULL)
1742 *actual_len = len;
1743 return (m);
1744 }
1745
1746 /* NB: cid and ttag and little-endian already. */
1747 static void
che_send_h2c_pdu(struct nvmf_che_qpair * qp,uint16_t cid,uint16_t ttag,uint32_t data_offset,struct mbuf * m,size_t len,bool last_pdu)1748 che_send_h2c_pdu(struct nvmf_che_qpair *qp, uint16_t cid, uint16_t ttag,
1749 uint32_t data_offset, struct mbuf *m, size_t len, bool last_pdu)
1750 {
1751 struct nvme_tcp_h2c_data_hdr h2c;
1752 struct mbuf *top;
1753
1754 memset(&h2c, 0, sizeof(h2c));
1755 h2c.common.pdu_type = NVME_TCP_PDU_TYPE_H2C_DATA;
1756 if (last_pdu)
1757 h2c.common.flags |= NVME_TCP_H2C_DATA_FLAGS_LAST_PDU;
1758 h2c.cccid = cid;
1759 h2c.ttag = ttag;
1760 h2c.datao = htole32(data_offset);
1761 h2c.datal = htole32(len);
1762
1763 top = nvmf_che_construct_pdu(qp, &h2c, sizeof(h2c), m, len);
1764 nvmf_che_write_pdu(qp, top);
1765 }
1766
1767 static int
nvmf_che_handle_r2t(struct nvmf_che_qpair * qp,struct nvmf_che_rxpdu * pdu)1768 nvmf_che_handle_r2t(struct nvmf_che_qpair *qp, struct nvmf_che_rxpdu *pdu)
1769 {
1770 const struct nvme_tcp_r2t_hdr *r2t;
1771 struct nvmf_che_command_buffer *cb;
1772 uint32_t data_len, data_offset;
1773
1774 r2t = (const void *)pdu->hdr;
1775
1776 mtx_lock(&qp->tx_buffers.lock);
1777 cb = che_find_command_buffer(&qp->tx_buffers, r2t->cccid);
1778 if (cb == NULL) {
1779 mtx_unlock(&qp->tx_buffers.lock);
1780 nvmf_che_report_error(qp,
1781 NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD,
1782 offsetof(struct nvme_tcp_r2t_hdr, cccid), pdu->m,
1783 pdu->hdr->hlen);
1784 nvmf_che_free_pdu(pdu);
1785 return (EBADMSG);
1786 }
1787
1788 data_offset = le32toh(r2t->r2to);
1789 if (data_offset != cb->data_xfered) {
1790 mtx_unlock(&qp->tx_buffers.lock);
1791 nvmf_che_report_error(qp,
1792 NVME_TCP_TERM_REQ_FES_PDU_SEQUENCE_ERROR, 0, pdu->m,
1793 pdu->hdr->hlen);
1794 nvmf_che_free_pdu(pdu);
1795 return (EBADMSG);
1796 }
1797
1798 /*
1799 * XXX: The spec does not specify how to handle R2T tranfers
1800 * out of range of the original command.
1801 */
1802 data_len = le32toh(r2t->r2tl);
1803 if (data_offset + data_len > cb->data_len) {
1804 mtx_unlock(&qp->tx_buffers.lock);
1805 nvmf_che_report_error(qp,
1806 NVME_TCP_TERM_REQ_FES_DATA_TRANSFER_OUT_OF_RANGE, 0,
1807 pdu->m, pdu->hdr->hlen);
1808 nvmf_che_free_pdu(pdu);
1809 return (EBADMSG);
1810 }
1811
1812 cb->data_xfered += data_len;
1813 if (cb->data_xfered == cb->data_len)
1814 che_remove_command_buffer(&qp->tx_buffers, cb);
1815 else
1816 che_hold_command_buffer(cb);
1817 mtx_unlock(&qp->tx_buffers.lock);
1818
1819 /*
1820 * Queue one or more H2C_DATA PDUs containing the requested
1821 * data.
1822 */
1823 while (data_len > 0) {
1824 struct mbuf *m;
1825 uint32_t sent, todo;
1826
1827 todo = min(data_len, qp->max_tx_data);
1828 m = nvmf_che_command_buffer_mbuf(cb, data_offset, todo, &sent,
1829 todo < data_len);
1830 che_send_h2c_pdu(qp, r2t->cccid, r2t->ttag, data_offset, m,
1831 sent, sent == data_len);
1832
1833 data_offset += sent;
1834 data_len -= sent;
1835 }
1836
1837 che_release_command_buffer(cb);
1838 nvmf_che_free_pdu(pdu);
1839 return (0);
1840 }
1841
1842 static int
nvmf_che_dispatch_pdu(struct nvmf_che_qpair * qp,struct nvmf_che_rxpdu * pdu)1843 nvmf_che_dispatch_pdu(struct nvmf_che_qpair *qp, struct nvmf_che_rxpdu *pdu)
1844 {
1845 /*
1846 * The PDU header should always be contiguous in the mbuf from
1847 * CPL_NVMT_CMP.
1848 */
1849 pdu->hdr = mtod(pdu->m, void *);
1850 KASSERT(pdu->m->m_len == pdu->hdr->hlen +
1851 ((pdu->hdr->flags & NVME_TCP_CH_FLAGS_HDGSTF) != 0 ?
1852 sizeof(uint32_t) : 0),
1853 ("%s: mismatched PDU header mbuf length", __func__));
1854
1855 switch (pdu->hdr->pdu_type) {
1856 default:
1857 __assert_unreachable();
1858 break;
1859 case NVME_TCP_PDU_TYPE_H2C_TERM_REQ:
1860 case NVME_TCP_PDU_TYPE_C2H_TERM_REQ:
1861 return (nvmf_che_handle_term_req(pdu));
1862 case NVME_TCP_PDU_TYPE_CAPSULE_CMD:
1863 return (nvmf_che_save_command_capsule(qp, pdu));
1864 case NVME_TCP_PDU_TYPE_CAPSULE_RESP:
1865 return (nvmf_che_save_response_capsule(qp, pdu));
1866 case NVME_TCP_PDU_TYPE_H2C_DATA:
1867 return (nvmf_che_handle_h2c_data(qp, pdu));
1868 case NVME_TCP_PDU_TYPE_C2H_DATA:
1869 return (nvmf_che_handle_c2h_data(qp, pdu));
1870 case NVME_TCP_PDU_TYPE_R2T:
1871 return (nvmf_che_handle_r2t(qp, pdu));
1872 }
1873 }
1874
1875 static int
nvmf_che_attach_pdu_data(struct nvmf_che_qpair * qp,struct nvmf_che_rxpdu * pdu)1876 nvmf_che_attach_pdu_data(struct nvmf_che_qpair *qp, struct nvmf_che_rxpdu *pdu)
1877 {
1878 struct socket *so = qp->so;
1879 struct mbuf *m, *n;
1880 uint32_t tcp_seq;
1881 size_t len;
1882 int error;
1883
1884 /* Check for DDP data. */
1885 if (pdu->ddp) {
1886 counter_u64_add(qp->toep->ofld_rxq->rx_nvme_ddp_pdus, 1);
1887 counter_u64_add(qp->toep->ofld_rxq->rx_nvme_ddp_octets,
1888 pdu->data_len);
1889 return (0);
1890 }
1891
1892 error = 0;
1893 len = pdu->data_len;
1894 tcp_seq = pdu->m->m_pkthdr.nvmf_tcp_seq;
1895 m = pdu->m;
1896 SOCKBUF_LOCK(&so->so_rcv);
1897 while (len > 0) {
1898 n = mbufq_dequeue(&qp->rx_data);
1899 KASSERT(n != NULL, ("%s: missing %zu data", __func__, len));
1900 if (n == NULL) {
1901 error = ENOBUFS;
1902 break;
1903 }
1904
1905 KASSERT(n->m_pkthdr.nvmf_tcp_seq == tcp_seq,
1906 ("%s: TCP seq mismatch", __func__));
1907 KASSERT(n->m_pkthdr.len <= len,
1908 ("%s: too much data", __func__));
1909 if (n->m_pkthdr.nvmf_tcp_seq != tcp_seq ||
1910 n->m_pkthdr.len > len) {
1911 m_freem(n);
1912 error = ENOBUFS;
1913 break;
1914 }
1915
1916 #ifdef VERBOSE_TRACES
1917 CTR(KTR_CXGBE, "%s: tid %u len %d seq %u", __func__,
1918 qp->toep->tid, n->m_pkthdr.len, n->m_pkthdr.nvmf_tcp_seq);
1919 #endif
1920 pdu->m->m_pkthdr.len += n->m_pkthdr.len;
1921 len -= n->m_pkthdr.len;
1922 tcp_seq += n->m_pkthdr.len;
1923 m_demote_pkthdr(n);
1924 m->m_next = n;
1925 m = m_last(n);
1926 }
1927 SOCKBUF_UNLOCK(&so->so_rcv);
1928
1929 if (error == 0) {
1930 counter_u64_add(qp->toep->ofld_rxq->rx_nvme_fl_pdus, 1);
1931 counter_u64_add(qp->toep->ofld_rxq->rx_nvme_fl_octets,
1932 pdu->data_len);
1933 }
1934 return (error);
1935 }
1936
/*
 * Main loop of the receive kernel thread for a queue pair.
 *
 * PDU header mbufs are dequeued from qp->rx_pdus, validated, joined
 * with any payload, and dispatched to the per-type handlers.  The
 * thread sleeps on qp->rx_cv while no PDU is queued.  After a socket
 * error, end of input, or a PDU processing error, the error is passed
 * to nvmf_qpair_error() and the thread then waits for qp->rx_shutdown
 * to be set before exiting via kthread_exit().
 *
 * The socket receive buffer lock is held at the top of each loop
 * iteration and dropped while a PDU is being processed.
 */
static void
nvmf_che_receive(void *arg)
{
	struct nvmf_che_qpair *qp = arg;
	struct socket *so = qp->so;
	struct nvmf_che_rxpdu pdu;
	struct mbuf *m;
	int error, terror;

	SOCKBUF_LOCK(&so->so_rcv);
	while (!qp->rx_shutdown) {
		/* Wait for a PDU. */
		if (so->so_error != 0 || so->so_rerror != 0) {
			if (so->so_error != 0)
				error = so->so_error;
			else
				error = so->so_rerror;
			SOCKBUF_UNLOCK(&so->so_rcv);
			/*
			 * Common failure path, also entered via goto
			 * from below.  The receive buffer lock must
			 * not be held on entry.
			 */
		error:
			nvmf_qpair_error(&qp->qp, error);
			SOCKBUF_LOCK(&so->so_rcv);
			/* Park here until shutdown is requested. */
			while (!qp->rx_shutdown)
				cv_wait(&qp->rx_cv, SOCKBUF_MTX(&so->so_rcv));
			break;
		}

		m = mbufq_dequeue(&qp->rx_pdus);
		if (m == NULL) {
			/*
			 * Nothing queued: if no more data can be
			 * received, take the failure path with an
			 * error of 0.
			 */
			if ((so->so_rcv.sb_state & SBS_CANTRCVMORE) != 0) {
				error = 0;
				SOCKBUF_UNLOCK(&so->so_rcv);
				goto error;
			}
			cv_wait(&qp->rx_cv, SOCKBUF_MTX(&so->so_rcv));
			continue;
		}
		SOCKBUF_UNLOCK(&so->so_rcv);

		/* Set up the PDU state from the header mbuf. */
		pdu.m = m;
		pdu.hdr = mtod(m, const void *);
		pdu.ddp = (m->m_pkthdr.nvmf_cpl_status & CMP_STATUS_DDP) != 0;

		/*
		 * If validation or payload attachment fails, the PDU is
		 * freed here; otherwise it is passed to
		 * nvmf_che_dispatch_pdu().
		 */
		error = nvmf_che_validate_pdu(qp, &pdu);
		if (error == 0 && pdu.data_len != 0)
			error = nvmf_che_attach_pdu_data(qp, &pdu);
		if (error != 0)
			nvmf_che_free_pdu(&pdu);
		else
			error = nvmf_che_dispatch_pdu(qp, &pdu);
		if (error != 0) {
			/*
			 * If we received a termination request, close
			 * the connection immediately.
			 */
			if (error == ECONNRESET)
				goto error;

			/*
			 * Wait for up to 30 seconds for the socket to
			 * be closed by the other end.
			 */
			SOCKBUF_LOCK(&so->so_rcv);
			if ((so->so_rcv.sb_state & SBS_CANTRCVMORE) == 0) {
				terror = cv_timedwait(&qp->rx_cv,
				    SOCKBUF_MTX(&so->so_rcv), 30 * hz);
				if (terror == ETIMEDOUT)
					printf("NVMe/TCP: Timed out after sending terminate request\n");
			}
			SOCKBUF_UNLOCK(&so->so_rcv);
			goto error;
		}

		/* Retake the lock for the next iteration's checks. */
		SOCKBUF_LOCK(&so->so_rcv);
	}
	SOCKBUF_UNLOCK(&so->so_rcv);
	kthread_exit();
}
2014
2015 static int
nvmf_che_soupcall_receive(struct socket * so,void * arg,int waitflag)2016 nvmf_che_soupcall_receive(struct socket *so, void *arg, int waitflag)
2017 {
2018 struct nvmf_che_qpair *qp = arg;
2019
2020 cv_signal(&qp->rx_cv);
2021 return (SU_OK);
2022 }
2023
2024 static int
do_nvmt_data(struct sge_iq * iq,const struct rss_header * rss,struct mbuf * m)2025 do_nvmt_data(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
2026 {
2027 struct adapter *sc = iq->adapter;
2028 struct nvmf_che_adapter *nca = sc->nvme_ulp_softc;
2029 const struct cpl_nvmt_data *cpl;
2030 u_int tid;
2031 struct toepcb *toep;
2032 struct nvmf_che_qpair *qp;
2033 struct socket *so;
2034 struct inpcb *inp;
2035 struct tcpcb *tp;
2036 int len __diagused;
2037
2038 if (nca->nvmt_data_iqe) {
2039 cpl = (const void *)(rss + 1);
2040 } else {
2041 cpl = mtod(m, const void *);
2042
2043 /* strip off CPL header */
2044 m_adj(m, sizeof(*cpl));
2045 }
2046 tid = GET_TID(cpl);
2047 toep = lookup_tid(sc, tid);
2048
2049 KASSERT(toep->tid == tid, ("%s: toep tid/atid mismatch", __func__));
2050
2051 len = m->m_pkthdr.len;
2052
2053 KASSERT(len == be16toh(cpl->length),
2054 ("%s: payload length mismatch", __func__));
2055
2056 inp = toep->inp;
2057 tp = intotcpcb(inp);
2058 INP_WLOCK(inp);
2059 if (tp->t_flags & TF_DISCONNECTED) {
2060 CTR(KTR_CXGBE, "%s: tid %u, rx (%d bytes), t_flags 0x%x",
2061 __func__, tid, len, tp->t_flags);
2062 INP_WUNLOCK(inp);
2063 m_freem(m);
2064 return (0);
2065 }
2066
2067 /* Save TCP sequence number. */
2068 m->m_pkthdr.nvmf_tcp_seq = be32toh(cpl->seq);
2069
2070 qp = toep->ulpcb;
2071 so = qp->so;
2072 SOCKBUF_LOCK(&so->so_rcv);
2073 mbufq_enqueue(&qp->rx_data, m);
2074 SOCKBUF_UNLOCK(&so->so_rcv);
2075
2076 tp->t_rcvtime = ticks;
2077
2078 #ifdef VERBOSE_TRACES
2079 CTR(KTR_CXGBE, "%s: tid %u len %d seq %u", __func__, tid, len,
2080 be32toh(cpl->seq));
2081 #endif
2082
2083 INP_WUNLOCK(inp);
2084 return (0);
2085 }
2086
2087 static int
do_nvmt_cmp(struct sge_iq * iq,const struct rss_header * rss,struct mbuf * m)2088 do_nvmt_cmp(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
2089 {
2090 struct adapter *sc = iq->adapter;
2091 const struct cpl_nvmt_cmp *cpl = mtod(m, const void *);
2092 u_int tid = GET_TID(cpl);
2093 struct toepcb *toep = lookup_tid(sc, tid);
2094 struct nvmf_che_qpair *qp = toep->ulpcb;
2095 struct socket *so = qp->so;
2096 struct inpcb *inp = toep->inp;
2097 struct tcpcb *tp = intotcpcb(inp);
2098 u_int hlen __diagused;
2099 bool empty;
2100
2101 KASSERT(toep->tid == tid, ("%s: toep tid/atid mismatch", __func__));
2102 KASSERT(!(toep->flags & TPF_SYNQE),
2103 ("%s: toep %p claims to be a synq entry", __func__, toep));
2104
2105 /* strip off CPL header */
2106 m_adj(m, sizeof(*cpl));
2107 hlen = m->m_pkthdr.len;
2108
2109 KASSERT(hlen == be16toh(cpl->length),
2110 ("%s: payload length mismatch", __func__));
2111
2112 INP_WLOCK(inp);
2113 if (tp->t_flags & TF_DISCONNECTED) {
2114 CTR(KTR_CXGBE, "%s: tid %u, rx (hlen %u), t_flags 0x%x",
2115 __func__, tid, hlen, tp->t_flags);
2116 INP_WUNLOCK(inp);
2117 m_freem(m);
2118 return (0);
2119 }
2120
2121 #ifdef VERBOSE_TRACES
2122 CTR(KTR_CXGBE, "%s: tid %u hlen %u seq %u status %u", __func__, tid,
2123 hlen, be32toh(cpl->seq), cpl->status);
2124 #endif
2125
2126 /* Save TCP sequence number and CPL status. */
2127 m->m_pkthdr.nvmf_tcp_seq = be32toh(cpl->seq);
2128 m->m_pkthdr.nvmf_cpl_status = cpl->status;
2129
2130 SOCKBUF_LOCK(&so->so_rcv);
2131 empty = mbufq_len(&qp->rx_pdus) == 0;
2132 mbufq_enqueue(&qp->rx_pdus, m);
2133 SOCKBUF_UNLOCK(&so->so_rcv);
2134 INP_WUNLOCK(inp);
2135 if (empty)
2136 cv_signal(&qp->rx_cv);
2137 return (0);
2138 }
2139
2140 static uint16_t
che_alloc_fl_cid(struct nvmf_che_qpair * qp,uint16_t original_cid)2141 che_alloc_fl_cid(struct nvmf_che_qpair *qp, uint16_t original_cid)
2142 {
2143 uint16_t new_cid;
2144
2145 mtx_lock(&qp->fl_cid_lock);
2146 new_cid = FL_CID_FINDFREE_AT(qp->fl_cid_set, qp->next_cid);
2147 if (new_cid == 0) {
2148 new_cid = FL_CID_FINDFREE_AT(qp->fl_cid_set, 0);
2149 MPASS(new_cid != 0);
2150 }
2151 new_cid--;
2152 FL_CID_BUSY(new_cid, qp->fl_cid_set);
2153 if (new_cid == CHE_MAX_FL_TAG)
2154 qp->next_cid = 0;
2155 else
2156 qp->next_cid = new_cid + 1;
2157 qp->fl_cids[new_cid] = original_cid;
2158 mtx_unlock(&qp->fl_cid_lock);
2159
2160 return (new_cid | CHE_FL_TAG_MASK);
2161 }
2162
2163 static uint16_t
che_alloc_ddp_cid(struct nvmf_che_qpair * qp,struct nvmf_che_command_buffer * cb)2164 che_alloc_ddp_cid(struct nvmf_che_qpair *qp, struct nvmf_che_command_buffer *cb)
2165 {
2166 mtx_assert(&qp->rx_buffers.lock, MA_OWNED);
2167
2168 return (che_alloc_ddp_tag(qp, cb));
2169 }
2170
2171 static struct mbuf *
che_command_pdu(struct nvmf_che_qpair * qp,struct nvmf_che_capsule * cc)2172 che_command_pdu(struct nvmf_che_qpair *qp, struct nvmf_che_capsule *cc)
2173 {
2174 struct nvmf_capsule *nc = &cc->nc;
2175 struct nvmf_che_command_buffer *cb;
2176 struct nvme_sgl_descriptor *sgl;
2177 struct nvme_tcp_cmd cmd;
2178 struct mbuf *top, *m;
2179 uint16_t cid;
2180 bool use_icd;
2181
2182 use_icd = false;
2183 cb = NULL;
2184 m = NULL;
2185
2186 if (nc->nc_data.io_len != 0) {
2187 cb = che_alloc_command_buffer(qp, &nc->nc_data, 0,
2188 nc->nc_data.io_len, nc->nc_sqe.cid);
2189 cb->original_cid = nc->nc_sqe.cid;
2190
2191 if (nc->nc_send_data && nc->nc_data.io_len <= qp->max_icd) {
2192 cid = che_alloc_fl_cid(qp, nc->nc_sqe.cid);
2193 use_icd = true;
2194 m = nvmf_che_command_buffer_mbuf(cb, 0,
2195 nc->nc_data.io_len, NULL, false);
2196 cb->data_xfered = nc->nc_data.io_len;
2197 che_release_command_buffer(cb);
2198 } else if (nc->nc_send_data) {
2199 cid = che_alloc_fl_cid(qp, nc->nc_sqe.cid);
2200 cb->cid = htole16(cid);
2201 mtx_lock(&qp->tx_buffers.lock);
2202 che_add_command_buffer(&qp->tx_buffers, cb);
2203 mtx_unlock(&qp->tx_buffers.lock);
2204 } else {
2205 mtx_lock(&qp->rx_buffers.lock);
2206 cid = che_alloc_ddp_cid(qp, cb);
2207 if (cid == CHE_DDP_NO_TAG) {
2208 cid = che_alloc_fl_cid(qp, nc->nc_sqe.cid);
2209 che_add_command_buffer(&qp->rx_buffers, cb);
2210 }
2211 cb->cid = htole16(cid);
2212 mtx_unlock(&qp->rx_buffers.lock);
2213 }
2214 } else
2215 cid = che_alloc_fl_cid(qp, nc->nc_sqe.cid);
2216
2217 #ifdef VERBOSE_TRACES
2218 CTR(KTR_CXGBE, "%s: tid %u allocated cid 0x%04x for 0x%04x", __func__,
2219 qp->toep->tid, cid, nc->nc_sqe.cid);
2220 #endif
2221 memset(&cmd, 0, sizeof(cmd));
2222 cmd.common.pdu_type = NVME_TCP_PDU_TYPE_CAPSULE_CMD;
2223 cmd.ccsqe = nc->nc_sqe;
2224 cmd.ccsqe.cid = htole16(cid);
2225
2226 /* Populate SGL in SQE. */
2227 sgl = &cmd.ccsqe.sgl;
2228 memset(sgl, 0, sizeof(*sgl));
2229 sgl->address = 0;
2230 sgl->length = htole32(nc->nc_data.io_len);
2231 if (use_icd) {
2232 /* Use in-capsule data. */
2233 sgl->type = NVME_SGL_TYPE_ICD;
2234 } else {
2235 /* Use a command buffer. */
2236 sgl->type = NVME_SGL_TYPE_COMMAND_BUFFER;
2237 }
2238
2239 top = nvmf_che_construct_pdu(qp, &cmd, sizeof(cmd), m, m != NULL ?
2240 nc->nc_data.io_len : 0);
2241 return (top);
2242 }
2243
2244 static struct mbuf *
che_response_pdu(struct nvmf_che_qpair * qp,struct nvmf_che_capsule * cc)2245 che_response_pdu(struct nvmf_che_qpair *qp, struct nvmf_che_capsule *cc)
2246 {
2247 struct nvmf_capsule *nc = &cc->nc;
2248 struct nvme_tcp_rsp rsp;
2249
2250 memset(&rsp, 0, sizeof(rsp));
2251 rsp.common.pdu_type = NVME_TCP_PDU_TYPE_CAPSULE_RESP;
2252 rsp.rccqe = nc->nc_cqe;
2253
2254 return (nvmf_che_construct_pdu(qp, &rsp, sizeof(rsp), NULL, 0));
2255 }
2256
2257 static struct mbuf *
capsule_to_pdu(struct nvmf_che_qpair * qp,struct nvmf_che_capsule * cc)2258 capsule_to_pdu(struct nvmf_che_qpair *qp, struct nvmf_che_capsule *cc)
2259 {
2260 if (cc->nc.nc_qe_len == sizeof(struct nvme_command))
2261 return (che_command_pdu(qp, cc));
2262 else
2263 return (che_response_pdu(qp, cc));
2264 }
2265
2266 static void
nvmf_che_send(void * arg)2267 nvmf_che_send(void *arg)
2268 {
2269 struct nvmf_che_qpair *qp = arg;
2270 struct nvmf_che_capsule *cc;
2271 struct socket *so = qp->so;
2272 struct mbuf *m;
2273 int error;
2274
2275 m = NULL;
2276 SOCKBUF_LOCK(&so->so_snd);
2277 while (!qp->tx_shutdown) {
2278 if (so->so_error != 0) {
2279 error = so->so_error;
2280 SOCKBUF_UNLOCK(&so->so_snd);
2281 m_freem(m);
2282 nvmf_qpair_error(&qp->qp, error);
2283 SOCKBUF_LOCK(&so->so_snd);
2284 while (!qp->tx_shutdown)
2285 cv_wait(&qp->tx_cv, SOCKBUF_MTX(&so->so_snd));
2286 break;
2287 }
2288
2289 if (STAILQ_EMPTY(&qp->tx_capsules)) {
2290 cv_wait(&qp->tx_cv, SOCKBUF_MTX(&so->so_snd));
2291 continue;
2292 }
2293
2294 /* Convert a capsule into a PDU. */
2295 cc = STAILQ_FIRST(&qp->tx_capsules);
2296 STAILQ_REMOVE_HEAD(&qp->tx_capsules, link);
2297 SOCKBUF_UNLOCK(&so->so_snd);
2298
2299 m = capsule_to_pdu(qp, cc);
2300 che_release_capsule(cc);
2301
2302 nvmf_che_write_pdu(qp, m);
2303
2304 SOCKBUF_LOCK(&so->so_snd);
2305 }
2306 SOCKBUF_UNLOCK(&so->so_snd);
2307 kthread_exit();
2308 }
2309
2310 static int
nvmf_che_setsockopt(struct socket * so,u_int sspace,u_int rspace)2311 nvmf_che_setsockopt(struct socket *so, u_int sspace, u_int rspace)
2312 {
2313 struct sockopt opt;
2314 int error, one = 1;
2315
2316 /* Don't lower the buffer sizes, just enforce a minimum. */
2317 SOCKBUF_LOCK(&so->so_snd);
2318 if (sspace < so->so_snd.sb_hiwat)
2319 sspace = so->so_snd.sb_hiwat;
2320 SOCKBUF_UNLOCK(&so->so_snd);
2321 SOCKBUF_LOCK(&so->so_rcv);
2322 if (rspace < so->so_rcv.sb_hiwat)
2323 rspace = so->so_rcv.sb_hiwat;
2324 SOCKBUF_UNLOCK(&so->so_rcv);
2325
2326 error = soreserve(so, sspace, rspace);
2327 if (error != 0)
2328 return (error);
2329 SOCKBUF_LOCK(&so->so_snd);
2330 so->so_snd.sb_flags |= SB_AUTOSIZE;
2331 SOCKBUF_UNLOCK(&so->so_snd);
2332 SOCKBUF_LOCK(&so->so_rcv);
2333 so->so_rcv.sb_flags |= SB_AUTOSIZE;
2334 SOCKBUF_UNLOCK(&so->so_rcv);
2335
2336 /*
2337 * Disable Nagle.
2338 */
2339 bzero(&opt, sizeof(opt));
2340 opt.sopt_dir = SOPT_SET;
2341 opt.sopt_level = IPPROTO_TCP;
2342 opt.sopt_name = TCP_NODELAY;
2343 opt.sopt_val = &one;
2344 opt.sopt_valsize = sizeof(one);
2345 error = sosetopt(so, &opt);
2346 if (error != 0)
2347 return (error);
2348
2349 return (0);
2350 }
2351
2352 static void
t4_nvme_set_tcb_field(struct toepcb * toep,uint16_t word,uint64_t mask,uint64_t val)2353 t4_nvme_set_tcb_field(struct toepcb *toep, uint16_t word, uint64_t mask,
2354 uint64_t val)
2355 {
2356 struct adapter *sc = td_adapter(toep->td);
2357
2358 t4_set_tcb_field(sc, &toep->ofld_txq->wrq, toep, word, mask, val, 0, 0);
2359 }
2360
2361 static void
set_ulp_mode_nvme(struct toepcb * toep,u_int ulp_submode,uint8_t rxpda)2362 set_ulp_mode_nvme(struct toepcb *toep, u_int ulp_submode, uint8_t rxpda)
2363 {
2364 uint64_t val;
2365
2366 CTR(KTR_CXGBE, "%s: tid %u, ULP_MODE_NVMET, submode=%#x, rxpda=%u",
2367 __func__, toep->tid, ulp_submode, rxpda);
2368
2369 val = V_TCB_ULP_TYPE(ULP_MODE_NVMET) | V_TCB_ULP_RAW(ulp_submode);
2370 t4_nvme_set_tcb_field(toep, W_TCB_ULP_TYPE,
2371 V_TCB_ULP_TYPE(M_TCB_ULP_TYPE) | V_TCB_ULP_RAW(M_TCB_ULP_RAW), val);
2372
2373 val = V_TF_RX_FLOW_CONTROL_DISABLE(1ULL);
2374 t4_nvme_set_tcb_field(toep, W_TCB_T_FLAGS, val, val);
2375
2376 val = V_TCB_RSVD((rxpda / 4) - 1);
2377 t4_nvme_set_tcb_field(toep, W_TCB_RSVD, V_TCB_RSVD(M_TCB_RSVD), val);
2378
2379 /* 0 disables CPL_NVMT_CMP_IMM which is not useful in this driver. */
2380 val = 0;
2381 t4_nvme_set_tcb_field(toep, W_TCB_CMP_IMM_SZ,
2382 V_TCB_CMP_IMM_SZ(M_TCB_CMP_IMM_SZ), val);
2383 }
2384
2385 static u_int
pdu_max_data_len(const nvlist_t * nvl,u_int max_pdu_len,u_int hlen,uint8_t pda)2386 pdu_max_data_len(const nvlist_t *nvl, u_int max_pdu_len, u_int hlen,
2387 uint8_t pda)
2388 {
2389 u_int max_data_len;
2390
2391 if (nvlist_get_bool(nvl, "header_digests"))
2392 hlen += sizeof(uint32_t);
2393 hlen = roundup(hlen, pda);
2394 max_data_len = max_pdu_len - hlen;
2395 if (nvlist_get_bool(nvl, "data_digests"))
2396 max_data_len -= sizeof(uint32_t);
2397 return (max_data_len);
2398 }
2399
2400 static struct nvmf_qpair *
che_allocate_qpair(bool controller,const nvlist_t * nvl)2401 che_allocate_qpair(bool controller, const nvlist_t *nvl)
2402 {
2403 struct nvmf_che_adapter *nca;
2404 struct nvmf_che_qpair *qp;
2405 struct adapter *sc;
2406 struct file *fp;
2407 struct socket *so;
2408 struct inpcb *inp;
2409 struct tcpcb *tp;
2410 struct toepcb *toep;
2411 cap_rights_t rights;
2412 u_int max_tx_pdu_len, num_ddp_tags;
2413 int error, ulp_submode;
2414
2415 if (!nvlist_exists_number(nvl, "fd") ||
2416 !nvlist_exists_number(nvl, "rxpda") ||
2417 !nvlist_exists_number(nvl, "txpda") ||
2418 !nvlist_exists_bool(nvl, "header_digests") ||
2419 !nvlist_exists_bool(nvl, "data_digests") ||
2420 !nvlist_exists_number(nvl, "maxr2t") ||
2421 !nvlist_exists_number(nvl, "maxh2cdata") ||
2422 !nvlist_exists_number(nvl, "max_icd"))
2423 return (NULL);
2424
2425 error = fget(curthread, nvlist_get_number(nvl, "fd"),
2426 cap_rights_init_one(&rights, CAP_SOCK_CLIENT), &fp);
2427 if (error != 0)
2428 return (NULL);
2429 if (fp->f_type != DTYPE_SOCKET) {
2430 fdrop(fp, curthread);
2431 return (NULL);
2432 }
2433 so = fp->f_data;
2434 if (so->so_type != SOCK_STREAM ||
2435 so->so_proto->pr_protocol != IPPROTO_TCP) {
2436 fdrop(fp, curthread);
2437 return (NULL);
2438 }
2439
2440 sc = find_offload_adapter(so);
2441 if (sc == NULL) {
2442 fdrop(fp, curthread);
2443 return (NULL);
2444 }
2445 nca = sc->nvme_ulp_softc;
2446
2447 /*
2448 * Controller: Require advertised MAXH2CDATA to be small
2449 * enough.
2450 */
2451 if (controller) {
2452 u_int max_rx_data;
2453
2454 max_rx_data = pdu_max_data_len(nvl, nca->max_receive_pdu,
2455 sizeof(struct nvme_tcp_h2c_data_hdr),
2456 nvlist_get_number(nvl, "rxpda"));
2457 if (nvlist_get_number(nvl, "maxh2cdata") > max_rx_data) {
2458 fdrop(fp, curthread);
2459 return (NULL);
2460 }
2461 }
2462
2463 /*
2464 * Host: Require the queue size to be small enough that all of
2465 * the command ids allocated by nvmf(4) will fit in the
2466 * unallocated range.
2467 *
2468 * XXX: Alternatively this driver could just queue commands
2469 * when an unallocated ID isn't available.
2470 */
2471 if (!controller) {
2472 u_int num_commands;
2473
2474 num_commands = nvlist_get_number(nvl, "qsize") - 1;
2475 if (nvlist_get_bool(nvl, "admin"))
2476 num_commands += 8; /* Max AER */
2477 if (num_commands > CHE_NUM_FL_TAGS) {
2478 fdrop(fp, curthread);
2479 return (NULL);
2480 }
2481 }
2482
2483 qp = malloc(sizeof(*qp), M_NVMF_CHE, M_WAITOK | M_ZERO);
2484 qp->txpda = nvlist_get_number(nvl, "txpda");
2485 qp->rxpda = nvlist_get_number(nvl, "rxpda");
2486 qp->header_digests = nvlist_get_bool(nvl, "header_digests");
2487 qp->data_digests = nvlist_get_bool(nvl, "data_digests");
2488 qp->maxr2t = nvlist_get_number(nvl, "maxr2t");
2489 if (controller)
2490 qp->maxh2cdata = nvlist_get_number(nvl, "maxh2cdata");
2491
2492 if (controller) {
2493 /* NB: maxr2t is 0's based. */
2494 qp->num_fl_ttags = MIN(CHE_NUM_FL_TAGS,
2495 nvlist_get_number(nvl, "qsize") *
2496 ((uint64_t)qp->maxr2t + 1));
2497 qp->open_fl_ttags = mallocarray(qp->num_fl_ttags,
2498 sizeof(*qp->open_fl_ttags), M_NVMF_CHE, M_WAITOK | M_ZERO);
2499 } else {
2500 qp->fl_cids = mallocarray(CHE_NUM_FL_TAGS,
2501 sizeof(*qp->fl_cids), M_NVMF_CHE, M_WAITOK | M_ZERO);
2502 qp->fl_cid_set = malloc(sizeof(*qp->fl_cid_set), M_NVMF_CHE,
2503 M_WAITOK);
2504 FL_CID_INIT(qp->fl_cid_set);
2505 mtx_init(&qp->fl_cid_lock, "nvmf/che fl cids", NULL, MTX_DEF);
2506 }
2507
2508 inp = sotoinpcb(so);
2509 INP_WLOCK(inp);
2510 tp = intotcpcb(inp);
2511 if (tp->t_flags & TF_DISCONNECTED) {
2512 INP_WUNLOCK(inp);
2513 free(qp->fl_cid_set, M_NVMF_CHE);
2514 free(qp->fl_cids, M_NVMF_CHE);
2515 free(qp->open_fl_ttags, M_NVMF_CHE);
2516 free(qp, M_NVMF_CHE);
2517 fdrop(fp, curthread);
2518 return (NULL);
2519 }
2520
2521 MPASS(tp->t_flags & TF_TOE);
2522 MPASS(tp->tod != NULL);
2523 MPASS(tp->t_toe != NULL);
2524 toep = tp->t_toe;
2525 MPASS(toep->vi->adapter == sc);
2526
2527 if (ulp_mode(toep) != ULP_MODE_NONE) {
2528 INP_WUNLOCK(inp);
2529 free(qp->fl_cid_set, M_NVMF_CHE);
2530 free(qp->fl_cids, M_NVMF_CHE);
2531 free(qp->open_fl_ttags, M_NVMF_CHE);
2532 free(qp, M_NVMF_CHE);
2533 fdrop(fp, curthread);
2534 return (NULL);
2535 }
2536
2537 /* Claim socket from file descriptor. */
2538 fp->f_ops = &badfileops;
2539 fp->f_data = NULL;
2540
2541 qp->so = so;
2542 qp->toep = toep;
2543 qp->nca = nca;
2544 refcount_init(&qp->refs, 1);
2545
2546 /* NB: C2H and H2C headers are the same size. */
2547 qp->max_rx_data = pdu_max_data_len(nvl, nca->max_receive_pdu,
2548 sizeof(struct nvme_tcp_c2h_data_hdr), qp->rxpda);
2549 qp->max_tx_data = pdu_max_data_len(nvl, nca->max_transmit_pdu,
2550 sizeof(struct nvme_tcp_c2h_data_hdr), qp->txpda);
2551 if (!controller) {
2552 qp->max_tx_data = min(qp->max_tx_data,
2553 nvlist_get_number(nvl, "maxh2cdata"));
2554 qp->max_icd = min(nvlist_get_number(nvl, "max_icd"),
2555 pdu_max_data_len(nvl, nca->max_transmit_pdu,
2556 sizeof(struct nvme_tcp_cmd), qp->txpda));
2557 } else {
2558 /*
2559 * IOCCSZ represents the size of a logical command
2560 * capsule including the 64 byte SQE and the
2561 * in-capsule data. Use pdu_max_data_len to compute
2562 * the maximum supported ICD length.
2563 */
2564 qp->max_ioccsz = rounddown(pdu_max_data_len(nvl,
2565 nca->max_receive_pdu, sizeof(struct nvme_tcp_cmd),
2566 qp->rxpda), 16) + sizeof(struct nvme_command);
2567 }
2568
2569 ulp_submode = 0;
2570 if (qp->header_digests)
2571 ulp_submode |= FW_NVMET_ULPSUBMODE_HCRC;
2572 if (qp->data_digests)
2573 ulp_submode |= FW_NVMET_ULPSUBMODE_DCRC;
2574 if (!controller)
2575 ulp_submode |= FW_NVMET_ULPSUBMODE_ING_DIR;
2576
2577 max_tx_pdu_len = sizeof(struct nvme_tcp_h2c_data_hdr);
2578 if (qp->header_digests)
2579 max_tx_pdu_len += sizeof(uint32_t);
2580 max_tx_pdu_len = roundup(max_tx_pdu_len, qp->txpda);
2581 max_tx_pdu_len += qp->max_tx_data;
2582 if (qp->data_digests)
2583 max_tx_pdu_len += sizeof(uint32_t);
2584
2585 /* TODO: ISO limits */
2586
2587 if (controller) {
2588 /* Use the SUCCESS flag if SQ flow control is disabled. */
2589 qp->send_success = !nvlist_get_bool(nvl, "sq_flow_control");
2590 }
2591
2592 toep->params.ulp_mode = ULP_MODE_NVMET;
2593 toep->ulpcb = qp;
2594
2595 send_txdataplen_max_flowc_wr(sc, toep,
2596 roundup(/* max_iso_pdus * */ max_tx_pdu_len, tp->t_maxseg));
2597 set_ulp_mode_nvme(toep, ulp_submode, qp->rxpda);
2598 INP_WUNLOCK(inp);
2599
2600 fdrop(fp, curthread);
2601
2602 error = nvmf_che_setsockopt(so, max_tx_pdu_len, nca->max_receive_pdu);
2603 if (error != 0) {
2604 free(qp->fl_cid_set, M_NVMF_CHE);
2605 free(qp->fl_cids, M_NVMF_CHE);
2606 free(qp->open_fl_ttags, M_NVMF_CHE);
2607 free(qp, M_NVMF_CHE);
2608 soclose(so);
2609 return (NULL);
2610 }
2611
2612 num_ddp_tags = ddp_tags_per_qp;
2613 if (num_ddp_tags > 0) {
2614 qp->tpt_offset = t4_stag_alloc(sc, num_ddp_tags);
2615 if (qp->tpt_offset != T4_STAG_UNSET) {
2616 #ifdef VERBOSE_TRACES
2617 CTR(KTR_CXGBE,
2618 "%s: tid %u using %u tags at offset 0x%x",
2619 __func__, toep->tid, num_ddp_tags, qp->tpt_offset);
2620 #endif
2621 qp->num_ddp_tags = num_ddp_tags;
2622 qp->open_ddp_tags = mallocarray(qp->num_ddp_tags,
2623 sizeof(*qp->open_ddp_tags), M_NVMF_CHE, M_WAITOK |
2624 M_ZERO);
2625
2626 t4_nvme_set_tcb_field(toep, W_TCB_TPT_OFFSET,
2627 M_TCB_TPT_OFFSET, V_TCB_TPT_OFFSET(qp->tpt_offset));
2628 }
2629 }
2630
2631 TAILQ_INIT(&qp->rx_buffers.head);
2632 TAILQ_INIT(&qp->tx_buffers.head);
2633 mtx_init(&qp->rx_buffers.lock, "nvmf/che rx buffers", NULL, MTX_DEF);
2634 mtx_init(&qp->tx_buffers.lock, "nvmf/che tx buffers", NULL, MTX_DEF);
2635
2636 cv_init(&qp->rx_cv, "-");
2637 cv_init(&qp->tx_cv, "-");
2638 mbufq_init(&qp->rx_data, 0);
2639 mbufq_init(&qp->rx_pdus, 0);
2640 STAILQ_INIT(&qp->tx_capsules);
2641
2642 /* Register socket upcall for receive to handle remote FIN. */
2643 SOCKBUF_LOCK(&so->so_rcv);
2644 soupcall_set(so, SO_RCV, nvmf_che_soupcall_receive, qp);
2645 SOCKBUF_UNLOCK(&so->so_rcv);
2646
2647 /* Spin up kthreads. */
2648 error = kthread_add(nvmf_che_receive, qp, NULL, &qp->rx_thread, 0, 0,
2649 "nvmef che rx");
2650 if (error != 0) {
2651 che_free_qpair(&qp->qp);
2652 return (NULL);
2653 }
2654 error = kthread_add(nvmf_che_send, qp, NULL, &qp->tx_thread, 0, 0,
2655 "nvmef che tx");
2656 if (error != 0) {
2657 che_free_qpair(&qp->qp);
2658 return (NULL);
2659 }
2660
2661 return (&qp->qp);
2662 }
2663
2664 static void
che_release_qpair(struct nvmf_che_qpair * qp)2665 che_release_qpair(struct nvmf_che_qpair *qp)
2666 {
2667 if (refcount_release(&qp->refs))
2668 free(qp, M_NVMF_CHE);
2669 }
2670
2671 static void
che_free_qpair(struct nvmf_qpair * nq)2672 che_free_qpair(struct nvmf_qpair *nq)
2673 {
2674 struct nvmf_che_qpair *qp = CQP(nq);
2675 struct nvmf_che_command_buffer *ncb, *cb;
2676 struct nvmf_che_capsule *ncc, *cc;
2677 struct socket *so = qp->so;
2678 struct toepcb *toep = qp->toep;
2679 struct inpcb *inp = sotoinpcb(so);
2680
2681 /* Shut down kthreads. */
2682 SOCKBUF_LOCK(&so->so_snd);
2683 qp->tx_shutdown = true;
2684 if (qp->tx_thread != NULL) {
2685 cv_signal(&qp->tx_cv);
2686 mtx_sleep(qp->tx_thread, SOCKBUF_MTX(&so->so_snd), 0,
2687 "nvchetx", 0);
2688 }
2689 SOCKBUF_UNLOCK(&so->so_snd);
2690
2691 SOCKBUF_LOCK(&so->so_rcv);
2692 qp->rx_shutdown = true;
2693 if (qp->rx_thread != NULL) {
2694 cv_signal(&qp->rx_cv);
2695 mtx_sleep(qp->rx_thread, SOCKBUF_MTX(&so->so_rcv), 0,
2696 "nvcherx", 0);
2697 }
2698 soupcall_clear(so, SO_RCV);
2699 SOCKBUF_UNLOCK(&so->so_rcv);
2700 mbufq_drain(&qp->rx_data);
2701 mbufq_drain(&qp->rx_pdus);
2702
2703 STAILQ_FOREACH_SAFE(cc, &qp->tx_capsules, link, ncc) {
2704 nvmf_abort_capsule_data(&cc->nc, ECONNABORTED);
2705 che_release_capsule(cc);
2706 }
2707
2708 cv_destroy(&qp->tx_cv);
2709 cv_destroy(&qp->rx_cv);
2710
2711 if (qp->open_fl_ttags != NULL) {
2712 for (u_int i = 0; i < qp->num_fl_ttags; i++) {
2713 cb = qp->open_fl_ttags[i];
2714 if (cb != NULL) {
2715 cb->cc->active_r2ts--;
2716 cb->error = ECONNABORTED;
2717 che_release_command_buffer(cb);
2718 }
2719 }
2720 free(qp->open_fl_ttags, M_NVMF_CHE);
2721 }
2722 if (qp->num_ddp_tags != 0) {
2723 for (u_int i = 0; i < qp->num_ddp_tags; i++) {
2724 cb = qp->open_ddp_tags[i];
2725 if (cb != NULL) {
2726 if (cb->cc != NULL)
2727 cb->cc->active_r2ts--;
2728 cb->error = ECONNABORTED;
2729 mtx_lock(&qp->rx_buffers.lock);
2730 che_free_ddp_tag(qp, cb, cb->ttag);
2731 mtx_unlock(&qp->rx_buffers.lock);
2732 che_release_command_buffer(cb);
2733 }
2734 }
2735 free(qp->open_ddp_tags, M_NVMF_CHE);
2736 }
2737
2738 mtx_lock(&qp->rx_buffers.lock);
2739 TAILQ_FOREACH_SAFE(cb, &qp->rx_buffers.head, link, ncb) {
2740 che_remove_command_buffer(&qp->rx_buffers, cb);
2741 mtx_unlock(&qp->rx_buffers.lock);
2742 #ifdef INVARIANTS
2743 if (cb->cc != NULL)
2744 cb->cc->pending_r2ts--;
2745 #endif
2746 cb->error = ECONNABORTED;
2747 che_release_command_buffer(cb);
2748 mtx_lock(&qp->rx_buffers.lock);
2749 }
2750 mtx_destroy(&qp->rx_buffers.lock);
2751
2752 mtx_lock(&qp->tx_buffers.lock);
2753 TAILQ_FOREACH_SAFE(cb, &qp->tx_buffers.head, link, ncb) {
2754 che_remove_command_buffer(&qp->tx_buffers, cb);
2755 mtx_unlock(&qp->tx_buffers.lock);
2756 cb->error = ECONNABORTED;
2757 che_release_command_buffer(cb);
2758 mtx_lock(&qp->tx_buffers.lock);
2759 }
2760 mtx_destroy(&qp->tx_buffers.lock);
2761
2762 if (qp->num_ddp_tags != 0)
2763 t4_stag_free(qp->nca->sc, qp->tpt_offset, qp->num_ddp_tags);
2764
2765 if (!qp->qp.nq_controller) {
2766 free(qp->fl_cids, M_NVMF_CHE);
2767 free(qp->fl_cid_set, M_NVMF_CHE);
2768 mtx_destroy(&qp->fl_cid_lock);
2769 }
2770
2771 INP_WLOCK(inp);
2772 toep->ulpcb = NULL;
2773 mbufq_drain(&toep->ulp_pduq);
2774
2775 /*
2776 * Grab a reference to use when waiting for the final CPL to
2777 * be received. If toep->inp is NULL, then
2778 * final_cpl_received() has already been called (e.g. due to
2779 * the peer sending a RST).
2780 */
2781 if (toep->inp != NULL) {
2782 toep = hold_toepcb(toep);
2783 toep->flags |= TPF_WAITING_FOR_FINAL;
2784 } else
2785 toep = NULL;
2786 INP_WUNLOCK(inp);
2787
2788 soclose(so);
2789
2790 /*
2791 * Wait for the socket to fully close. This ensures any
2792 * pending received data has been received (and in particular,
2793 * any data that would be received by DDP has been handled).
2794 */
2795 if (toep != NULL) {
2796 struct mtx *lock = mtx_pool_find(mtxpool_sleep, toep);
2797
2798 mtx_lock(lock);
2799 while ((toep->flags & TPF_WAITING_FOR_FINAL) != 0)
2800 mtx_sleep(toep, lock, PSOCK, "conclo2", 0);
2801 mtx_unlock(lock);
2802 free_toepcb(toep);
2803 }
2804
2805 che_release_qpair(qp);
2806 }
2807
2808 static uint32_t
che_max_ioccsz(struct nvmf_qpair * nq)2809 che_max_ioccsz(struct nvmf_qpair *nq)
2810 {
2811 struct nvmf_che_qpair *qp = CQP(nq);
2812
2813 /*
2814 * Limit the command capsule size so that with maximum ICD it
2815 * fits within the limit of the largest PDU the adapter can
2816 * receive.
2817 */
2818 return (qp->max_ioccsz);
2819 }
2820
2821 static uint64_t
che_max_xfer_size(struct nvmf_qpair * nq)2822 che_max_xfer_size(struct nvmf_qpair *nq)
2823 {
2824 struct nvmf_che_qpair *qp = CQP(nq);
2825
2826 /*
2827 * Limit host transfers to the size of the data payload in the
2828 * largest PDU the adapter can receive.
2829 */
2830 return (qp->max_rx_data);
2831 }
2832
2833 static struct nvmf_capsule *
che_allocate_capsule(struct nvmf_qpair * nq,int how)2834 che_allocate_capsule(struct nvmf_qpair *nq, int how)
2835 {
2836 struct nvmf_che_qpair *qp = CQP(nq);
2837 struct nvmf_che_capsule *cc;
2838
2839 cc = malloc(sizeof(*cc), M_NVMF_CHE, how | M_ZERO);
2840 if (cc == NULL)
2841 return (NULL);
2842 refcount_init(&cc->refs, 1);
2843 refcount_acquire(&qp->refs);
2844 return (&cc->nc);
2845 }
2846
2847 static void
che_release_capsule(struct nvmf_che_capsule * cc)2848 che_release_capsule(struct nvmf_che_capsule *cc)
2849 {
2850 struct nvmf_che_qpair *qp = CQP(cc->nc.nc_qpair);
2851
2852 if (!refcount_release(&cc->refs))
2853 return;
2854
2855 MPASS(cc->active_r2ts == 0);
2856 MPASS(cc->pending_r2ts == 0);
2857
2858 nvmf_che_free_pdu(&cc->rx_pdu);
2859 free(cc, M_NVMF_CHE);
2860 che_release_qpair(qp);
2861 }
2862
/*
 * Transport hook: release the caller's reference on a capsule.
 */
static void
che_free_capsule(struct nvmf_capsule *nc)
{
	struct nvmf_che_capsule *cc;

	cc = CCAP(nc);
	che_release_capsule(cc);
}
2868
2869 static int
che_transmit_capsule(struct nvmf_capsule * nc)2870 che_transmit_capsule(struct nvmf_capsule *nc)
2871 {
2872 struct nvmf_che_qpair *qp = CQP(nc->nc_qpair);
2873 struct nvmf_che_capsule *cc = CCAP(nc);
2874 struct socket *so = qp->so;
2875
2876 refcount_acquire(&cc->refs);
2877 SOCKBUF_LOCK(&so->so_snd);
2878 STAILQ_INSERT_TAIL(&qp->tx_capsules, cc, link);
2879 cv_signal(&qp->tx_cv);
2880 SOCKBUF_UNLOCK(&so->so_snd);
2881 return (0);
2882 }
2883
2884 static uint8_t
che_validate_command_capsule(struct nvmf_capsule * nc)2885 che_validate_command_capsule(struct nvmf_capsule *nc)
2886 {
2887 struct nvmf_che_capsule *cc = CCAP(nc);
2888 struct nvme_sgl_descriptor *sgl;
2889
2890 KASSERT(cc->rx_pdu.hdr != NULL, ("capsule wasn't received"));
2891
2892 sgl = &nc->nc_sqe.sgl;
2893 switch (sgl->type) {
2894 case NVME_SGL_TYPE_ICD:
2895 if (cc->rx_pdu.data_len != le32toh(sgl->length)) {
2896 printf("NVMe/TCP: Command Capsule with mismatched ICD length\n");
2897 return (NVME_SC_DATA_SGL_LENGTH_INVALID);
2898 }
2899 break;
2900 case NVME_SGL_TYPE_COMMAND_BUFFER:
2901 if (cc->rx_pdu.data_len != 0) {
2902 printf("NVMe/TCP: Command Buffer SGL with ICD\n");
2903 return (NVME_SC_INVALID_FIELD);
2904 }
2905 break;
2906 default:
2907 printf("NVMe/TCP: Invalid SGL type in Command Capsule\n");
2908 return (NVME_SC_SGL_DESCRIPTOR_TYPE_INVALID);
2909 }
2910
2911 if (sgl->address != 0) {
2912 printf("NVMe/TCP: Invalid SGL offset in Command Capsule\n");
2913 return (NVME_SC_SGL_OFFSET_INVALID);
2914 }
2915
2916 return (NVME_SC_SUCCESS);
2917 }
2918
2919 static size_t
che_capsule_data_len(const struct nvmf_capsule * nc)2920 che_capsule_data_len(const struct nvmf_capsule *nc)
2921 {
2922 MPASS(nc->nc_qe_len == sizeof(struct nvme_command));
2923 return (le32toh(nc->nc_sqe.sgl.length));
2924 }
2925
2926 static void
che_receive_r2t_data(struct nvmf_capsule * nc,uint32_t data_offset,struct nvmf_io_request * io)2927 che_receive_r2t_data(struct nvmf_capsule *nc, uint32_t data_offset,
2928 struct nvmf_io_request *io)
2929 {
2930 struct nvmf_che_qpair *qp = CQP(nc->nc_qpair);
2931 struct nvmf_che_capsule *cc = CCAP(nc);
2932 struct nvmf_che_command_buffer *cb;
2933
2934 cb = che_alloc_command_buffer(qp, io, data_offset, io->io_len,
2935 nc->nc_sqe.cid);
2936
2937 cb->cc = cc;
2938 refcount_acquire(&cc->refs);
2939
2940 /*
2941 * If this command has too many active R2Ts or there are no
2942 * available transfer tags, queue the request for later.
2943 *
2944 * NB: maxr2t is 0's based.
2945 */
2946 mtx_lock(&qp->rx_buffers.lock);
2947 if (cc->active_r2ts > qp->maxr2t ||
2948 !nvmf_che_allocate_ttag(qp, cb)) {
2949 #ifdef INVARIANTS
2950 cc->pending_r2ts++;
2951 #endif
2952 TAILQ_INSERT_TAIL(&qp->rx_buffers.head, cb, link);
2953 mtx_unlock(&qp->rx_buffers.lock);
2954 return;
2955 }
2956 mtx_unlock(&qp->rx_buffers.lock);
2957
2958 che_send_r2t(qp, nc->nc_sqe.cid, cb->ttag, data_offset, io->io_len);
2959 }
2960
2961 static void
che_receive_icd_data(struct nvmf_capsule * nc,uint32_t data_offset,struct nvmf_io_request * io)2962 che_receive_icd_data(struct nvmf_capsule *nc, uint32_t data_offset,
2963 struct nvmf_io_request *io)
2964 {
2965 struct nvmf_che_capsule *cc = CCAP(nc);
2966
2967 /*
2968 * The header is in rx_pdu.m, the padding is discarded, and
2969 * the data starts at rx_pdu.m->m_next.
2970 */
2971 mbuf_copyto_io(cc->rx_pdu.m->m_next, data_offset, io->io_len, io, 0);
2972 nvmf_complete_io_request(io, io->io_len, 0);
2973 }
2974
2975 static int
che_receive_controller_data(struct nvmf_capsule * nc,uint32_t data_offset,struct nvmf_io_request * io)2976 che_receive_controller_data(struct nvmf_capsule *nc, uint32_t data_offset,
2977 struct nvmf_io_request *io)
2978 {
2979 struct nvme_sgl_descriptor *sgl;
2980 size_t data_len;
2981
2982 if (nc->nc_qe_len != sizeof(struct nvme_command) ||
2983 !nc->nc_qpair->nq_controller)
2984 return (EINVAL);
2985
2986 sgl = &nc->nc_sqe.sgl;
2987 data_len = le32toh(sgl->length);
2988 if (data_offset + io->io_len > data_len)
2989 return (EFBIG);
2990
2991 if (sgl->type == NVME_SGL_TYPE_ICD)
2992 che_receive_icd_data(nc, data_offset, io);
2993 else
2994 che_receive_r2t_data(nc, data_offset, io);
2995 return (0);
2996 }
2997
2998 /* NB: cid is little-endian already. */
2999 static void
che_send_c2h_pdu(struct nvmf_che_qpair * qp,uint16_t cid,uint32_t data_offset,struct mbuf * m,size_t len,bool last_pdu,bool success)3000 che_send_c2h_pdu(struct nvmf_che_qpair *qp, uint16_t cid, uint32_t data_offset,
3001 struct mbuf *m, size_t len, bool last_pdu, bool success)
3002 {
3003 struct nvme_tcp_c2h_data_hdr c2h;
3004 struct mbuf *top;
3005
3006 memset(&c2h, 0, sizeof(c2h));
3007 c2h.common.pdu_type = NVME_TCP_PDU_TYPE_C2H_DATA;
3008 if (last_pdu)
3009 c2h.common.flags |= NVME_TCP_C2H_DATA_FLAGS_LAST_PDU;
3010 if (success)
3011 c2h.common.flags |= NVME_TCP_C2H_DATA_FLAGS_SUCCESS;
3012 c2h.cccid = cid;
3013 c2h.datao = htole32(data_offset);
3014 c2h.datal = htole32(len);
3015
3016 top = nvmf_che_construct_pdu(qp, &c2h, sizeof(c2h), m, len);
3017 nvmf_che_write_pdu(qp, top);
3018 }
3019
3020 static u_int
che_send_controller_data(struct nvmf_capsule * nc,uint32_t data_offset,struct mbuf * m,size_t len)3021 che_send_controller_data(struct nvmf_capsule *nc, uint32_t data_offset,
3022 struct mbuf *m, size_t len)
3023 {
3024 struct nvmf_che_qpair *qp = CQP(nc->nc_qpair);
3025 struct nvme_sgl_descriptor *sgl;
3026 uint32_t data_len;
3027 bool last_pdu, last_xfer;
3028
3029 if (nc->nc_qe_len != sizeof(struct nvme_command) ||
3030 !qp->qp.nq_controller) {
3031 m_freem(m);
3032 return (NVME_SC_INVALID_FIELD);
3033 }
3034
3035 sgl = &nc->nc_sqe.sgl;
3036 data_len = le32toh(sgl->length);
3037 if (data_offset + len > data_len) {
3038 m_freem(m);
3039 return (NVME_SC_INVALID_FIELD);
3040 }
3041 last_xfer = (data_offset + len == data_len);
3042
3043 if (sgl->type != NVME_SGL_TYPE_COMMAND_BUFFER) {
3044 m_freem(m);
3045 return (NVME_SC_INVALID_FIELD);
3046 }
3047
3048 KASSERT(data_offset == CCAP(nc)->tx_data_offset,
3049 ("%s: starting data_offset %u doesn't match end of previous xfer %u",
3050 __func__, data_offset, CCAP(nc)->tx_data_offset));
3051
3052 /* Queue one or more C2H_DATA PDUs containing the data from 'm'. */
3053 while (m != NULL) {
3054 struct mbuf *n;
3055 uint32_t todo;
3056
3057 if (m->m_len > qp->max_tx_data) {
3058 n = m_split(m, qp->max_tx_data, M_WAITOK);
3059 todo = m->m_len;
3060 } else {
3061 struct mbuf *p;
3062
3063 todo = m->m_len;
3064 p = m;
3065 n = p->m_next;
3066 while (n != NULL) {
3067 if (todo + n->m_len > qp->max_tx_data) {
3068 p->m_next = NULL;
3069 break;
3070 }
3071 todo += n->m_len;
3072 p = n;
3073 n = p->m_next;
3074 }
3075 MPASS(m_length(m, NULL) == todo);
3076 }
3077
3078 last_pdu = (n == NULL && last_xfer);
3079 che_send_c2h_pdu(qp, nc->nc_sqe.cid, data_offset, m, todo,
3080 last_pdu, last_pdu && qp->send_success);
3081
3082 data_offset += todo;
3083 data_len -= todo;
3084 m = n;
3085 }
3086 MPASS(data_len == 0);
3087
3088 #ifdef INVARIANTS
3089 CCAP(nc)->tx_data_offset = data_offset;
3090 #endif
3091 if (!last_xfer)
3092 return (NVMF_MORE);
3093 else if (qp->send_success)
3094 return (NVMF_SUCCESS_SENT);
3095 else
3096 return (NVME_SC_SUCCESS);
3097 }
3098
3099 struct nvmf_transport_ops che_ops = {
3100 .allocate_qpair = che_allocate_qpair,
3101 .free_qpair = che_free_qpair,
3102 .max_ioccsz = che_max_ioccsz,
3103 .max_xfer_size = che_max_xfer_size,
3104 .allocate_capsule = che_allocate_capsule,
3105 .free_capsule = che_free_capsule,
3106 .transmit_capsule = che_transmit_capsule,
3107 .validate_command_capsule = che_validate_command_capsule,
3108 .capsule_data_len = che_capsule_data_len,
3109 .receive_controller_data = che_receive_controller_data,
3110 .send_controller_data = che_send_controller_data,
3111 .trtype = NVMF_TRTYPE_TCP,
3112 .priority = 10,
3113 };
3114
3115 NVMF_TRANSPORT(che, che_ops);
3116
3117 static void
read_pdu_limits(struct adapter * sc,u_int * max_tx_pdu_len,uint32_t * max_rx_pdu_len)3118 read_pdu_limits(struct adapter *sc, u_int *max_tx_pdu_len,
3119 uint32_t *max_rx_pdu_len)
3120 {
3121 uint32_t tx_len, rx_len, r, v;
3122
3123 /* Copied from cxgbei, but not sure if this is correct. */
3124 rx_len = t4_read_reg(sc, A_TP_PMM_RX_PAGE_SIZE);
3125 tx_len = t4_read_reg(sc, A_TP_PMM_TX_PAGE_SIZE);
3126
3127 r = t4_read_reg(sc, A_TP_PARA_REG2);
3128 rx_len = min(rx_len, G_MAXRXDATA(r));
3129 tx_len = min(tx_len, G_MAXRXDATA(r));
3130
3131 r = t4_read_reg(sc, A_TP_PARA_REG7);
3132 v = min(G_PMMAXXFERLEN0(r), G_PMMAXXFERLEN1(r));
3133 rx_len = min(rx_len, v);
3134 tx_len = min(tx_len, v);
3135
3136 /* Cannot be larger than 32KB - 256. */
3137 rx_len = min(rx_len, 32512);
3138 tx_len = min(tx_len, 32512);
3139
3140 *max_tx_pdu_len = tx_len;
3141 *max_rx_pdu_len = rx_len;
3142 }
3143
3144 static int
nvmf_che_init(struct adapter * sc,struct nvmf_che_adapter * nca)3145 nvmf_che_init(struct adapter *sc, struct nvmf_che_adapter *nca)
3146 {
3147 struct sysctl_oid *oid;
3148 struct sysctl_oid_list *children;
3149 uint32_t val;
3150
3151 read_pdu_limits(sc, &nca->max_transmit_pdu, &nca->max_receive_pdu);
3152 if (nca->max_transmit_pdu > che_max_transmit_pdu)
3153 nca->max_transmit_pdu = che_max_transmit_pdu;
3154 if (nca->max_receive_pdu > che_max_receive_pdu)
3155 nca->max_receive_pdu = che_max_receive_pdu;
3156 val = t4_read_reg(sc, A_SGE_CONTROL2);
3157 nca->nvmt_data_iqe = (val & F_RXCPLMODE_NVMT) != 0;
3158
3159 sysctl_ctx_init(&nca->ctx);
3160 oid = device_get_sysctl_tree(sc->dev); /* dev.che.X */
3161 children = SYSCTL_CHILDREN(oid);
3162
3163 oid = SYSCTL_ADD_NODE(&nca->ctx, children, OID_AUTO, "nvme",
3164 CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "NVMe ULP settings");
3165 children = SYSCTL_CHILDREN(oid);
3166
3167 nca->ddp_threshold = 8192;
3168 SYSCTL_ADD_UINT(&nca->ctx, children, OID_AUTO, "ddp_threshold",
3169 CTLFLAG_RW, &nca->ddp_threshold, 0, "Rx zero copy threshold");
3170
3171 SYSCTL_ADD_UINT(&nca->ctx, children, OID_AUTO, "max_transmit_pdu",
3172 CTLFLAG_RW, &nca->max_transmit_pdu, 0,
3173 "Maximum size of a transmitted PDU");
3174
3175 SYSCTL_ADD_UINT(&nca->ctx, children, OID_AUTO, "max_receive_pdu",
3176 CTLFLAG_RW, &nca->max_receive_pdu, 0,
3177 "Maximum size of a received PDU");
3178
3179 return (0);
3180 }
3181
3182 static void
nvmf_che_destroy(struct nvmf_che_adapter * nca)3183 nvmf_che_destroy(struct nvmf_che_adapter *nca)
3184 {
3185 sysctl_ctx_free(&nca->ctx);
3186 free(nca, M_CXGBE);
3187 }
3188
3189 static int
nvmf_che_activate(struct adapter * sc)3190 nvmf_che_activate(struct adapter *sc)
3191 {
3192 struct nvmf_che_adapter *nca;
3193 int rc;
3194
3195 ASSERT_SYNCHRONIZED_OP(sc);
3196
3197 if (uld_active(sc, ULD_NVME)) {
3198 KASSERT(0, ("%s: NVMe offload already enabled on adapter %p",
3199 __func__, sc));
3200 return (0);
3201 }
3202
3203 if ((sc->nvmecaps & FW_CAPS_CONFIG_NVME_TCP) == 0) {
3204 device_printf(sc->dev,
3205 "not NVMe offload capable, or capability disabled\n");
3206 return (ENOSYS);
3207 }
3208
3209 /* per-adapter softc for NVMe */
3210 nca = malloc(sizeof(*nca), M_CXGBE, M_ZERO | M_WAITOK);
3211 nca->sc = sc;
3212
3213 rc = nvmf_che_init(sc, nca);
3214 if (rc != 0) {
3215 free(nca, M_CXGBE);
3216 return (rc);
3217 }
3218
3219 sc->nvme_ulp_softc = nca;
3220
3221 return (0);
3222 }
3223
3224 static int
nvmf_che_deactivate(struct adapter * sc)3225 nvmf_che_deactivate(struct adapter *sc)
3226 {
3227 struct nvmf_che_adapter *nca = sc->nvme_ulp_softc;
3228
3229 ASSERT_SYNCHRONIZED_OP(sc);
3230
3231 if (nca != NULL) {
3232 nvmf_che_destroy(nca);
3233 sc->nvme_ulp_softc = NULL;
3234 }
3235
3236 return (0);
3237 }
3238
3239 static void
nvmf_che_activate_all(struct adapter * sc,void * arg __unused)3240 nvmf_che_activate_all(struct adapter *sc, void *arg __unused)
3241 {
3242 if (begin_synchronized_op(sc, NULL, SLEEP_OK | INTR_OK, "t7nvact") != 0)
3243 return;
3244
3245 /* Activate NVMe if any port on this adapter has IFCAP_TOE enabled. */
3246 if (sc->offload_map && !uld_active(sc, ULD_NVME))
3247 (void) t4_activate_uld(sc, ULD_NVME);
3248
3249 end_synchronized_op(sc, 0);
3250 }
3251
3252 static void
nvmf_che_deactivate_all(struct adapter * sc,void * arg __unused)3253 nvmf_che_deactivate_all(struct adapter *sc, void *arg __unused)
3254 {
3255 if (begin_synchronized_op(sc, NULL, SLEEP_OK | INTR_OK, "t7nvdea") != 0)
3256 return;
3257
3258 if (uld_active(sc, ULD_NVME))
3259 (void) t4_deactivate_uld(sc, ULD_NVME);
3260
3261 end_synchronized_op(sc, 0);
3262 }
3263
3264 static struct uld_info nvmf_che_uld_info = {
3265 .uld_activate = nvmf_che_activate,
3266 .uld_deactivate = nvmf_che_deactivate,
3267 };
3268
3269 static int
nvmf_che_mod_load(void)3270 nvmf_che_mod_load(void)
3271 {
3272 int rc;
3273
3274 t4_register_cpl_handler(CPL_NVMT_CMP, do_nvmt_cmp);
3275 t4_register_cpl_handler(CPL_NVMT_DATA, do_nvmt_data);
3276
3277 rc = t4_register_uld(&nvmf_che_uld_info, ULD_NVME);
3278 if (rc != 0)
3279 return (rc);
3280
3281 t4_iterate(nvmf_che_activate_all, NULL);
3282
3283 return (rc);
3284 }
3285
3286 static int
nvmf_che_mod_unload(void)3287 nvmf_che_mod_unload(void)
3288 {
3289 t4_iterate(nvmf_che_deactivate_all, NULL);
3290
3291 if (t4_unregister_uld(&nvmf_che_uld_info, ULD_NVME) == EBUSY)
3292 return (EBUSY);
3293
3294 t4_register_cpl_handler(CPL_NVMT_CMP, NULL);
3295 t4_register_cpl_handler(CPL_NVMT_DATA, NULL);
3296
3297 return (0);
3298 }
3299 #endif
3300
3301 static int
nvmf_che_modevent(module_t mod,int cmd,void * arg)3302 nvmf_che_modevent(module_t mod, int cmd, void *arg)
3303 {
3304 int rc;
3305
3306 #ifdef TCP_OFFLOAD
3307 switch (cmd) {
3308 case MOD_LOAD:
3309 rc = nvmf_che_mod_load();
3310 break;
3311 case MOD_UNLOAD:
3312 rc = nvmf_che_mod_unload();
3313 break;
3314 default:
3315 rc = EOPNOTSUPP;
3316 break;
3317 }
3318 #else
3319 printf("nvmf_che: compiled without TCP_OFFLOAD support.\n");
3320 rc = EOPNOTSUPP;
3321 #endif
3322
3323 return (rc);
3324 }
3325
3326 static moduledata_t nvmf_che_mod = {
3327 "nvmf_che",
3328 nvmf_che_modevent,
3329 NULL,
3330 };
3331
3332 MODULE_VERSION(nvmf_che, 1);
3333 DECLARE_MODULE(nvmf_che, nvmf_che_mod, SI_SUB_EXEC, SI_ORDER_ANY);
3334 MODULE_DEPEND(nvmf_che, t4_tom, 1, 1, 1);
3335 MODULE_DEPEND(nvmf_che, cxgbe, 1, 1, 1);
3336