xref: /freebsd/sys/dev/cxgbe/nvmf/nvmf_che.c (revision ec0cd287f55f7ea93ff4ccfa4de0f70eca5fef75)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause
3  *
4  * Copyright (c) 2023 Chelsio Communications, Inc.
5  * Written by: John Baldwin <jhb@FreeBSD.org>
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice, this list of conditions and the following disclaimer.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in the
14  *    documentation and/or other materials provided with the distribution.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
20  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26  * SUCH DAMAGE.
27  */
28 
29 #include "opt_inet.h"
30 
31 #include <sys/param.h>
32 #include <sys/libkern.h>
33 #include <sys/kernel.h>
34 #include <sys/module.h>
35 
36 #ifdef TCP_OFFLOAD
37 #include <sys/bitset.h>
38 #include <sys/capsicum.h>
39 #include <sys/file.h>
40 #include <sys/kthread.h>
41 #include <sys/malloc.h>
42 #include <sys/mbuf.h>
43 #include <sys/nv.h>
44 #include <sys/protosw.h>
45 #include <sys/socket.h>
46 #include <sys/socketvar.h>
47 #include <netinet/in.h>
48 #include <netinet/in_pcb.h>
49 #include <netinet/tcp_var.h>
50 #include <netinet/toecore.h>
51 
52 #include <dev/nvmf/nvmf.h>
53 #include <dev/nvmf/nvmf_proto.h>
54 #include <dev/nvmf/nvmf_tcp.h>
55 #include <dev/nvmf/nvmf_transport.h>
56 #include <dev/nvmf/nvmf_transport_internal.h>
57 
58 #include <vm/pmap.h>
59 #include <vm/vm_page.h>
60 
61 #include "common/common.h"
62 #include "common/t4_regs.h"
63 #include "common/t4_tcb.h"
64 #include "tom/t4_tom.h"
65 
66 /* Status code values in CPL_NVMT_CMP. */
67 #define	CMP_STATUS_ERROR_MASK		0x7f
68 #define	CMP_STATUS_NO_ERROR		0
69 #define	CMP_STATUS_HEADER_DIGEST	1
70 #define	CMP_STATUS_DIRECTION_MISMATCH	2
71 #define	CMP_STATUS_DIGEST_FLAG_MISMATCH	3
72 #define	CMP_STATUS_SUCCESS_NOT_LAST	4
73 #define	CMP_STATUS_BAD_DATA_LENGTH	5
74 #define	CMP_STATUS_USER_MODE_UNALLOCATED	6
75 #define	CMP_STATUS_RQT_LIMIT		7
76 #define	CMP_STATUS_RQT_WRAP		8
77 #define	CMP_STATUS_RQT_BOUND		9
78 #define	CMP_STATUS_TPT_LIMIT		16
79 #define	CMP_STATUS_TPT_INVALID		17
80 #define	CMP_STATUS_TPT_COLOUR_MISMATCH	18
81 #define	CMP_STATUS_TPT_MISC		19
82 #define	CMP_STATUS_TPT_WRAP		20
83 #define	CMP_STATUS_TPT_BOUND		21
84 #define	CMP_STATUS_TPT_LAST_PDU_UNALIGNED	22
85 #define	CMP_STATUS_PBL_LIMIT		24
86 #define	CMP_STATUS_DATA_DIGEST		25
87 #define	CMP_STATUS_DDP			0x80
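/*
 * Note: nvmf_che_validate_pdu() masks the CPL status with
 * CMP_STATUS_ERROR_MASK before switching on the error code, so the
 * low seven bits carry one of the codes above while CMP_STATUS_DDP
 * (0x80) sits outside the mask and acts as a flag for PDUs whose
 * payload was placed directly via DDP.
 */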
88 
89 /*
90  * Transfer tags and CIDs with the MSB set are "unallocated" tags that
91  * pass data through to the freelist without using DDP.
92  */
93 #define	CHE_FL_TAG_MASK		0x8000
94 #define	CHE_MAX_FL_TAG		0x7fff
95 #define	CHE_NUM_FL_TAGS		(CHE_MAX_FL_TAG + 1)
96 
97 #define	CHE_TAG_IS_FL(ttag)	(((ttag) & CHE_FL_TAG_MASK) == CHE_FL_TAG_MASK)
98 #define	CHE_RAW_FL_TAG(ttag)	((ttag) & ~CHE_FL_TAG_MASK)
99 #define	CHE_DDP_TAG(stag_idx, color)	((stag_idx) << 4 | (color))
100 #define	CHE_STAG_COLOR(stag)	((stag) & 0xf)
101 #define	CHE_STAG_IDX(stag)	((stag) >> 4)
102 #define	CHE_DDP_MAX_COLOR	0xf
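/*
 * A DDP tag (MSB clear) packs an STAG index and a 4-bit color, e.g.
 * CHE_DDP_TAG(0x12, 0x3) == 0x123, CHE_STAG_IDX(0x123) == 0x12, and
 * CHE_STAG_COLOR(0x123) == 0x3; CHE_TAG_IS_FL(0x123) is false.
 */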
103 
104 #define	CHE_DDP_NO_TAG		0xffff
105 
106 /*
107  * A bitmap of non-DDP CIDs in use on the host.  Since there is no
108  * _BIT_FFC (find first clear), the bitset is inverted so that a clear
109  * bit indicates an in-use CID.
110  */
111 BITSET_DEFINE(fl_cid_set, CHE_NUM_FL_TAGS);
112 #define	FL_CID_INIT(p)		__BIT_FILL(CHE_NUM_FL_TAGS, p)
113 #define	FL_CID_BUSY(n, p)	__BIT_CLR(CHE_NUM_FL_TAGS, n, p)
114 #define	FL_CID_ISACTIVE(n, p)	!__BIT_ISSET(CHE_NUM_FL_TAGS, n, p)
115 #define	FL_CID_FREE(n, p)	__BIT_SET(CHE_NUM_FL_TAGS, n, p)
116 #define	FL_CID_FINDFREE_AT(p, start)	__BIT_FFS_AT(CHE_NUM_FL_TAGS, p, start)
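/*
 * Minimal allocation sketch (hypothetical caller): because the set is
 * inverted, finding a set bit finds a free CID.  As with __BIT_FFS(),
 * the result is a 1-based index, or 0 when no free CID remains:
 *
 *	n = FL_CID_FINDFREE_AT(set, start);
 *	if (n != 0)
 *		FL_CID_BUSY(n - 1, set);
 */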
117 
118 /*
119  * The TCP sequence number of both CPL_NVMT_DATA and CPL_NVMT_CMP
120  * mbufs are saved here while the mbuf is in qp->rx_data and qp->rx_pdus.
121  */
122 #define	nvmf_tcp_seq	PH_loc.thirtytwo[0]
123 
124 /*
125  * The CPL status of a CPL_NVMT_CMP mbuf is saved here while the mbuf
126  * is in qp->rx_pdus.
127  */
128 #define	nvmf_cpl_status	PH_loc.eight[4]

129 
130 struct nvmf_che_capsule;
131 struct nvmf_che_qpair;
132 
133 struct nvmf_che_adapter {
134 	struct adapter *sc;
135 
136 	u_int ddp_threshold;
137 	u_int max_transmit_pdu;
138 	u_int max_receive_pdu;
139 	bool nvmt_data_iqe;
140 
141 	struct sysctl_ctx_list ctx;	/* from uld_activate to deactivate */
142 };
143 
144 struct nvmf_che_command_buffer {
145 	struct nvmf_che_qpair *qp;
146 
147 	struct nvmf_io_request io;
148 	size_t	data_len;
149 	size_t	data_xfered;
150 	uint32_t data_offset;
151 
152 	u_int	refs;
153 	int	error;
154 
155 	bool	ddp_ok;
156 	uint16_t cid;
157 	uint16_t ttag;
158 	uint16_t original_cid;	/* Host only */
159 
160 	TAILQ_ENTRY(nvmf_che_command_buffer) link;
161 
162 	/* Fields used for DDP. */
163 	struct fw_ri_tpte tpte;
164 	uint64_t *pbl;
165 	uint32_t pbl_addr;
166 	uint32_t pbl_len;
167 
168 	/* Controller only */
169 	struct nvmf_che_capsule *cc;
170 };
171 
172 struct nvmf_che_command_buffer_list {
173 	TAILQ_HEAD(, nvmf_che_command_buffer) head;
174 	struct mtx lock;
175 };
176 
177 struct nvmf_che_qpair {
178 	struct nvmf_qpair qp;
179 
180 	struct socket *so;
181 	struct toepcb *toep;
182 	struct nvmf_che_adapter *nca;
183 
184 	volatile u_int refs;	/* Every allocated capsule holds a reference */
185 	uint8_t	txpda;
186 	uint8_t rxpda;
187 	bool header_digests;
188 	bool data_digests;
189 	uint32_t maxr2t;
190 	uint32_t maxh2cdata;	/* Controller only */
191 	uint32_t max_rx_data;
192 	uint32_t max_tx_data;
193 	uint32_t max_icd;	/* Host only */
194 	uint32_t max_ioccsz;	/* Controller only */
195 	union {
196 		uint16_t next_fl_ttag;	/* Controller only */
197 		uint16_t next_cid;	/* Host only */
198 	};
199 	uint16_t next_ddp_tag;
200 	u_int num_fl_ttags;	/* Controller only */
201 	u_int active_fl_ttags;	/* Controller only */
202 	u_int num_ddp_tags;
203 	u_int active_ddp_tags;
204 	bool send_success;	/* Controller only */
205 	uint8_t ddp_color;
206 	uint32_t tpt_offset;
207 
208 	/* Receive state. */
209 	struct thread *rx_thread;
210 	struct cv rx_cv;
211 	bool	rx_shutdown;
212 	int	rx_error;
213 	struct mbufq rx_data;	/* Data received via CPL_NVMT_DATA. */
214 	struct mbufq rx_pdus;	/* PDU headers received via CPL_NVMT_CMP. */
215 
216 	/* Transmit state. */
217 	struct thread *tx_thread;
218 	struct cv tx_cv;
219 	bool	tx_shutdown;
220 	STAILQ_HEAD(, nvmf_che_capsule) tx_capsules;
221 
222 	struct nvmf_che_command_buffer_list tx_buffers;
223 	struct nvmf_che_command_buffer_list rx_buffers;
224 
225 	/*
226 	 * For the controller, an RX command buffer can be in one of
227 	 * three locations, all protected by the rx_buffers.lock.  If
228 	 * a receive request is waiting for either an R2T slot for its
229  * command (due to exceeding MAXR2T) or a transfer tag, it is
230 	 * placed on the rx_buffers list.  When a request is allocated
231 	 * an active transfer tag, it moves to either the
232 	 * open_ddp_tags[] or open_fl_ttags[] array (indexed by the
233 	 * tag) until it completes.
234 	 *
235 	 * For the host, an RX command buffer using DDP is in
236 	 * open_ddp_tags[], otherwise it is in rx_buffers.
237 	 */
238 	struct nvmf_che_command_buffer **open_ddp_tags;
239 	struct nvmf_che_command_buffer **open_fl_ttags;	/* Controller only */
240 
241 	/*
242 	 * For the host, CIDs submitted by nvmf(4) must be rewritten
243 	 * to either use DDP or not use DDP.  The CID in response
244 	 * capsules must be restored to their original value.  For
245 	 * DDP, the original CID is stored in the command buffer.
246 	 * These variables manage non-DDP CIDs.
247 	 */
248 	uint16_t *fl_cids;		/* Host only */
249 	struct fl_cid_set *fl_cid_set;	/* Host only */
250 	struct mtx fl_cid_lock;		/* Host only */
251 };
252 
253 struct nvmf_che_rxpdu {
254 	struct mbuf *m;
255 	const struct nvme_tcp_common_pdu_hdr *hdr;
256 	uint32_t data_len;
257 	bool data_digest_mismatch;
258 	bool ddp;
259 };
260 
261 struct nvmf_che_capsule {
262 	struct nvmf_capsule nc;
263 
264 	volatile u_int refs;
265 
266 	struct nvmf_che_rxpdu rx_pdu;
267 
268 	uint32_t active_r2ts;		/* Controller only */
269 #ifdef INVARIANTS
270 	uint32_t tx_data_offset;	/* Controller only */
271 	u_int pending_r2ts;		/* Controller only */
272 #endif
273 
274 	STAILQ_ENTRY(nvmf_che_capsule) link;
275 };
276 
277 #define	CCAP(nc)	((struct nvmf_che_capsule *)(nc))
278 #define	CQP(qp)		((struct nvmf_che_qpair *)(qp))
279 
280 static void	che_release_capsule(struct nvmf_che_capsule *cc);
281 static void	che_free_qpair(struct nvmf_qpair *nq);
282 
283 SYSCTL_NODE(_kern_nvmf, OID_AUTO, che, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
284     "Chelsio TCP offload transport");
285 
286 static u_int che_max_transmit_pdu = 32 * 1024;
287 SYSCTL_UINT(_kern_nvmf_che, OID_AUTO, max_transmit_pdu, CTLFLAG_RWTUN,
288     &che_max_transmit_pdu, 0,
289     "Maximum size of a transmitted PDU");
290 
291 static u_int che_max_receive_pdu = 32 * 1024;
292 SYSCTL_UINT(_kern_nvmf_che, OID_AUTO, max_receive_pdu, CTLFLAG_RWTUN,
293     &che_max_receive_pdu, 0,
294     "Maximum size of a received PDU");
295 
296 static int use_dsgl = 1;
297 SYSCTL_INT(_kern_nvmf_che, OID_AUTO, use_dsgl, CTLFLAG_RWTUN, &use_dsgl, 0,
298     "Use DSGL for PBL/FastReg (default=1)");
299 
300 static int inline_threshold = 256;
301 SYSCTL_INT(_kern_nvmf_che, OID_AUTO, inline_threshold, CTLFLAG_RWTUN,
302     &inline_threshold, 0,
303     "inline vs dsgl threshold (default=256)");
304 
305 static int ddp_tags_per_qp = 128;
306 SYSCTL_INT(_kern_nvmf_che, OID_AUTO, ddp_tags_per_qp, CTLFLAG_RWTUN,
307     &ddp_tags_per_qp, 0,
308     "Number of DDP tags to reserve for each queue pair");
309 
310 static MALLOC_DEFINE(M_NVMF_CHE, "nvmf_che", "Chelsio NVMe-TCP offload");
311 
312 /*
313  * PBL regions consist of N full-sized pages.  TPT entries support an
314  * initial offset into the first page (FBO) and can handle a partial
315  * length on the last page.
316  */
317 static bool
318 che_ddp_io_check(struct nvmf_che_qpair *qp, const struct nvmf_io_request *io)
319 {
320 	const struct memdesc *mem = &io->io_mem;
321 	struct bus_dma_segment *ds;
322 	int i;
323 
324 	if (io->io_len < qp->nca->ddp_threshold) {
325 		return (false);
326 	}
327 
328 	switch (mem->md_type) {
329 	case MEMDESC_VADDR:
330 	case MEMDESC_PADDR:
331 	case MEMDESC_VMPAGES:
332 		return (true);
333 	case MEMDESC_VLIST:
334 	case MEMDESC_PLIST:
335 		/*
336 		 * Require all but the first segment to start on a
337 		 * page boundary.  Require all but the last segment to
338 		 * end on a page boundary.
339 		 */
340 		ds = mem->u.md_list;
341 		for (i = 0; i < mem->md_nseg; i++, ds++) {
342 			if (i != 0 && ds->ds_addr % PAGE_SIZE != 0)
343 				return (false);
344 			if (i != mem->md_nseg - 1 &&
345 			    (ds->ds_addr + ds->ds_len) % PAGE_SIZE != 0)
346 				return (false);
347 		}
348 		return (true);
349 	default:
350 		/*
351 		 * Other types could be validated with more work, but
352 		 * they aren't used currently by nvmf(4) or nvmft(4).
353 		 */
354 		return (false);
355 	}
356 }
357 
358 static u_int
359 che_fbo(struct nvmf_che_command_buffer *cb)
360 {
361 	struct memdesc *mem = &cb->io.io_mem;
362 
363 	switch (mem->md_type) {
364 	case MEMDESC_VADDR:
365 		return ((uintptr_t)mem->u.md_vaddr & PAGE_MASK);
366 	case MEMDESC_PADDR:
367 		return (mem->u.md_paddr & PAGE_MASK);
368 	case MEMDESC_VMPAGES:
369 		return (mem->md_offset);
370 	case MEMDESC_VLIST:
371 	case MEMDESC_PLIST:
372 		return (mem->u.md_list[0].ds_addr & PAGE_MASK);
373 	default:
374 		__assert_unreachable();
375 	}
376 }
377 
378 static u_int
379 che_npages(struct nvmf_che_command_buffer *cb)
380 {
381 	return (howmany(che_fbo(cb) + cb->io.io_len, PAGE_SIZE));
382 }
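/*
 * For example (assuming 4 KB pages), an 8 KB I/O whose first byte sits
 * at page offset 0x300 has FBO 0x300 and needs
 * howmany(0x300 + 8192, 4096) == 3 pages of PBL entries.
 */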
383 
384 static struct nvmf_che_command_buffer *
385 che_alloc_command_buffer(struct nvmf_che_qpair *qp,
386     const struct nvmf_io_request *io, uint32_t data_offset, size_t data_len,
387     uint16_t cid)
388 {
389 	struct nvmf_che_command_buffer *cb;
390 
391 	cb = malloc(sizeof(*cb), M_NVMF_CHE, M_WAITOK);
392 	cb->qp = qp;
393 	cb->io = *io;
394 	cb->data_offset = data_offset;
395 	cb->data_len = data_len;
396 	cb->data_xfered = 0;
397 	refcount_init(&cb->refs, 1);
398 	cb->error = 0;
399 	cb->ddp_ok = che_ddp_io_check(qp, io);
400 	cb->cid = cid;
401 	cb->ttag = 0;
402 	cb->original_cid = 0;
403 	cb->cc = NULL;
404 	cb->pbl = NULL;
405 
406 	return (cb);
407 }
408 
409 static void
410 che_hold_command_buffer(struct nvmf_che_command_buffer *cb)
411 {
412 	refcount_acquire(&cb->refs);
413 }
414 
415 static void
416 che_free_command_buffer(struct nvmf_che_command_buffer *cb)
417 {
418 	nvmf_complete_io_request(&cb->io, cb->data_xfered, cb->error);
419 	if (cb->cc != NULL)
420 		che_release_capsule(cb->cc);
421 	MPASS(cb->pbl == NULL);
422 	free(cb, M_NVMF_CHE);
423 }
424 
425 static void
426 che_release_command_buffer(struct nvmf_che_command_buffer *cb)
427 {
428 	if (refcount_release(&cb->refs))
429 		che_free_command_buffer(cb);
430 }
431 
432 static void
433 che_add_command_buffer(struct nvmf_che_command_buffer_list *list,
434     struct nvmf_che_command_buffer *cb)
435 {
436 	mtx_assert(&list->lock, MA_OWNED);
437 	TAILQ_INSERT_HEAD(&list->head, cb, link);
438 }
439 
440 static struct nvmf_che_command_buffer *
441 che_find_command_buffer(struct nvmf_che_command_buffer_list *list,
442     uint16_t cid)
443 {
444 	struct nvmf_che_command_buffer *cb;
445 
446 	mtx_assert(&list->lock, MA_OWNED);
447 	TAILQ_FOREACH(cb, &list->head, link) {
448 		if (cb->cid == cid)
449 			return (cb);
450 	}
451 	return (NULL);
452 }
453 
454 static void
455 che_remove_command_buffer(struct nvmf_che_command_buffer_list *list,
456     struct nvmf_che_command_buffer *cb)
457 {
458 	mtx_assert(&list->lock, MA_OWNED);
459 	TAILQ_REMOVE(&list->head, cb, link);
460 }
461 
462 static void
463 che_purge_command_buffer(struct nvmf_che_command_buffer_list *list,
464     uint16_t cid)
465 {
466 	struct nvmf_che_command_buffer *cb;
467 
468 	mtx_lock(&list->lock);
469 	cb = che_find_command_buffer(list, cid);
470 	if (cb != NULL) {
471 		che_remove_command_buffer(list, cb);
472 		mtx_unlock(&list->lock);
473 		che_release_command_buffer(cb);
474 	} else
475 		mtx_unlock(&list->lock);
476 }
477 
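/*
 * Note: the 'addr' arguments to the helpers below are adapter-memory
 * addresses in 32-byte units; the code shifts by 5 when converting
 * to or from byte counts (see the << 5 in the trace and the '>> 5'
 * address advances).
 */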
478 static int
479 che_write_mem_inline(struct adapter *sc, struct toepcb *toep, uint32_t addr,
480     uint32_t len, void *data, struct mbufq *wrq)
481 {
482 	struct mbuf *m;
483 	char *cp;
484 	int copy_len, i, num_wqe, wr_len;
485 
486 #ifdef VERBOSE_TRACES
487 	CTR(KTR_CXGBE, "%s: addr 0x%x len %u", __func__, addr << 5, len);
488 #endif
489 	num_wqe = DIV_ROUND_UP(len, T4_MAX_INLINE_SIZE);
490 	cp = data;
491 	for (i = 0; i < num_wqe; i++) {
492 		copy_len = min(len, T4_MAX_INLINE_SIZE);
493 		wr_len = T4_WRITE_MEM_INLINE_LEN(copy_len);
494 
495 		m = alloc_raw_wr_mbuf(wr_len);
496 		if (m == NULL)
497 			return (ENOMEM);
498 		t4_write_mem_inline_wr(sc, mtod(m, void *), wr_len, toep->tid,
499 		    addr, copy_len, cp, 0);
500 		if (cp != NULL)
501 			cp += T4_MAX_INLINE_SIZE;
502 		addr += T4_MAX_INLINE_SIZE >> 5;
503 		len -= T4_MAX_INLINE_SIZE;
504 
505 		mbufq_enqueue(wrq, m);
506 	}
507 	return (0);
508 }
509 
510 static int
511 che_write_mem_dma_aligned(struct adapter *sc, struct toepcb *toep,
512     uint32_t addr, uint32_t len, void *data, struct mbufq *wrq)
513 {
514 	struct mbuf *m;
515 	vm_offset_t va;
516 	u_int todo;
517 	int wr_len;
518 
519 	/* First page. */
520 	va = (vm_offset_t)data;
521 	todo = min(PAGE_SIZE - (va % PAGE_SIZE), len);
522 	wr_len = T4_WRITE_MEM_DMA_LEN;
523 	m = alloc_raw_wr_mbuf(wr_len);
524 	if (m == NULL)
525 		return (ENOMEM);
526 	t4_write_mem_dma_wr(sc, mtod(m, void *), wr_len, toep->tid, addr,
527 	    todo, pmap_kextract(va), 0);
528 	mbufq_enqueue(wrq, m);
529 	len -= todo;
530 	addr += todo >> 5;
531 	va += todo;
532 
533 	while (len > 0) {
534 		MPASS(va == trunc_page(va));
535 		todo = min(PAGE_SIZE, len);
536 		m = alloc_raw_wr_mbuf(wr_len);
537 		if (m == NULL)
538 			return (ENOMEM);
539 		t4_write_mem_dma_wr(sc, mtod(m, void *), wr_len, toep->tid,
540 		    addr, todo, pmap_kextract(va), 0);
541 		mbufq_enqueue(wrq, m);
542 		len -= todo;
543 		addr += todo >> 5;
544 		va += todo;
545 	}
546 	return (0);
547 }
548 
549 static int
550 che_write_adapter_mem(struct nvmf_che_qpair *qp, uint32_t addr, uint32_t len,
551     void *data)
552 {
553 	struct adapter *sc = qp->nca->sc;
554 	struct toepcb *toep = qp->toep;
555 	struct socket *so = qp->so;
556 	struct inpcb *inp = sotoinpcb(so);
557 	struct mbufq mq;
558 	int error;
559 
560 	mbufq_init(&mq, INT_MAX);
561 	if (!use_dsgl || len < inline_threshold || data == NULL)
562 		error = che_write_mem_inline(sc, toep, addr, len, data, &mq);
563 	else
564 		error = che_write_mem_dma_aligned(sc, toep, addr, len, data,
565 		    &mq);
566 	if (__predict_false(error != 0))
567 		goto error;
568 
569 	INP_WLOCK(inp);
570 	if ((inp->inp_flags & INP_DROPPED) != 0) {
571 		INP_WUNLOCK(inp);
572 		error = ECONNRESET;
573 		goto error;
574 	}
575 	mbufq_concat(&toep->ulp_pduq, &mq);
576 	INP_WUNLOCK(inp);
577 	return (0);
578 
579 error:
580 	mbufq_drain(&mq);
581 	return (error);
582 }
583 
584 static bool
585 che_alloc_pbl(struct nvmf_che_qpair *qp, struct nvmf_che_command_buffer *cb)
586 {
587 	struct adapter *sc = qp->nca->sc;
588 	struct memdesc *mem = &cb->io.io_mem;
589 	uint64_t *pbl;
590 	uint32_t addr, len;
591 	u_int i, npages;
592 	int error;
593 
594 	MPASS(cb->pbl == NULL);
595 	MPASS(cb->ddp_ok);
596 
597 	/* Hardware limit?  iWARP only enforces this for T5. */
598 	if (cb->io.io_len >= (8 * 1024 * 1024 * 1024ULL))
599 		return (false);
600 
601 	npages = che_npages(cb);
602 	len = roundup2(npages, 4) * sizeof(*cb->pbl);
603 	addr = t4_pblpool_alloc(sc, len);
604 	if (addr == 0)
605 		return (false);
606 
607 	pbl = malloc(len, M_NVMF_CHE, M_NOWAIT | M_ZERO);
608 	if (pbl == NULL) {
609 		t4_pblpool_free(sc, addr, len);
610 		return (false);
611 	}
612 
613 	switch (mem->md_type) {
614 	case MEMDESC_VADDR:
615 	{
616 		vm_offset_t va;
617 
618 		va = trunc_page((uintptr_t)mem->u.md_vaddr);
619 		for (i = 0; i < npages; i++)
620 			pbl[i] = htobe64(pmap_kextract(va + i * PAGE_SIZE));
621 		break;
622 	}
623 	case MEMDESC_PADDR:
624 	{
625 		vm_paddr_t pa;
626 
627 		pa = trunc_page(mem->u.md_paddr);
628 		for (i = 0; i < npages; i++)
629 			pbl[i] = htobe64(pa + i * PAGE_SIZE);
630 		break;
631 	}
632 	case MEMDESC_VMPAGES:
633 		for (i = 0; i < npages; i++)
634 			pbl[i] = htobe64(VM_PAGE_TO_PHYS(mem->u.md_ma[i]));
635 		break;
636 	case MEMDESC_VLIST:
637 	{
638 		struct bus_dma_segment *ds;
639 		vm_offset_t va;
640 		vm_size_t len;
641 		u_int j, k;
642 
643 		i = 0;
644 		ds = mem->u.md_list;
645 		for (j = 0; j < mem->md_nseg; j++, ds++) {
646 			va = trunc_page((uintptr_t)ds->ds_addr);
647 			len = ds->ds_len;
648 			if (ds->ds_addr % PAGE_SIZE != 0)
649 				len += ds->ds_addr % PAGE_SIZE;
650 			for (k = 0; k < howmany(len, PAGE_SIZE); k++) {
651 				pbl[i] = htobe64(pmap_kextract(va +
652 					k * PAGE_SIZE));
653 				i++;
654 			}
655 		}
656 		MPASS(i == npages);
657 		break;
658 	}
659 	case MEMDESC_PLIST:
660 	{
661 		struct bus_dma_segment *ds;
662 		vm_paddr_t pa;
663 		vm_size_t len;
664 		u_int j, k;
665 
666 		i = 0;
667 		ds = mem->u.md_list;
668 		for (j = 0; j < mem->md_nseg; j++, ds++) {
669 			pa = trunc_page((vm_paddr_t)ds->ds_addr);
670 			len = ds->ds_len;
671 			if (ds->ds_addr % PAGE_SIZE != 0)
672 				len += ds->ds_addr % PAGE_SIZE;
673 			for (k = 0; k < howmany(len, PAGE_SIZE); k++) {
674 				pbl[i] = htobe64(pa + k * PAGE_SIZE);
675 				i++;
676 			}
677 		}
678 		MPASS(i == npages);
679 		break;
680 	}
681 	default:
682 		__assert_unreachable();
683 	}
684 
685 	error = che_write_adapter_mem(qp, addr >> 5, len, pbl);
686 	if (error != 0) {
687 		t4_pblpool_free(sc, addr, len);
688 		free(pbl, M_NVMF_CHE);
689 		return (false);
690 	}
691 
692 	cb->pbl = pbl;
693 	cb->pbl_addr = addr;
694 	cb->pbl_len = len;
695 
696 	return (true);
697 }
698 
699 static void
700 che_free_pbl(struct nvmf_che_command_buffer *cb)
701 {
702 	free(cb->pbl, M_NVMF_CHE);
703 	t4_pblpool_free(cb->qp->nca->sc, cb->pbl_addr, cb->pbl_len);
704 	cb->pbl = NULL;
705 	cb->pbl_addr = 0;
706 	cb->pbl_len = 0;
707 }
708 
709 static bool
710 che_write_tpt_entry(struct nvmf_che_qpair *qp,
711     struct nvmf_che_command_buffer *cb, uint16_t stag)
712 {
713 	uint32_t tpt_addr;
714 	int error;
715 
716 	cb->tpte.valid_to_pdid = htobe32(F_FW_RI_TPTE_VALID |
717 	    V_FW_RI_TPTE_STAGKEY(CHE_STAG_COLOR(stag)) |
718 	    F_FW_RI_TPTE_STAGSTATE |
719 	    V_FW_RI_TPTE_STAGTYPE(FW_RI_STAG_NSMR) |
720 	    V_FW_RI_TPTE_PDID(0));
721 	cb->tpte.locread_to_qpid = htobe32(
722 	    V_FW_RI_TPTE_PERM(FW_RI_MEM_ACCESS_REM_WRITE) |
723 	    V_FW_RI_TPTE_ADDRTYPE(FW_RI_ZERO_BASED_TO) |
724 	    V_FW_RI_TPTE_PS(PAGE_SIZE) |
725 	    V_FW_RI_TPTE_QPID(qp->toep->tid));
726 #define PBL_OFF(qp, a)	((a) - (qp)->nca->sc->vres.pbl.start)
727 	cb->tpte.nosnoop_pbladdr =
728 	    htobe32(V_FW_RI_TPTE_PBLADDR(PBL_OFF(qp, cb->pbl_addr) >> 3));
729 	cb->tpte.len_lo = htobe32(cb->data_len);
730 	cb->tpte.va_hi = 0;
731 	cb->tpte.va_lo_fbo = htobe32(che_fbo(cb));
732 	cb->tpte.dca_mwbcnt_pstag = 0;
733 	cb->tpte.len_hi = htobe32(cb->data_offset);
734 
735 	tpt_addr = qp->tpt_offset + CHE_STAG_IDX(stag) +
736 	    (qp->nca->sc->vres.stag.start >> 5);
737 
738 	error = che_write_adapter_mem(qp, tpt_addr, sizeof(cb->tpte),
739 	    &cb->tpte);
740 	return (error == 0);
741 }
742 
743 static void
744 che_clear_tpt_entry(struct nvmf_che_qpair *qp, uint16_t stag)
745 {
746 	uint32_t tpt_addr;
747 
748 	tpt_addr = qp->tpt_offset + CHE_STAG_IDX(stag) +
749 	    (qp->nca->sc->vres.stag.start >> 5);
750 
751 	(void)che_write_adapter_mem(qp, tpt_addr, sizeof(struct fw_ri_tpte),
752 	    NULL);
753 }
754 
755 static uint16_t
756 che_alloc_ddp_stag(struct nvmf_che_qpair *qp,
757     struct nvmf_che_command_buffer *cb)
758 {
759 	uint16_t stag_idx;
760 
761 	mtx_assert(&qp->rx_buffers.lock, MA_OWNED);
762 	MPASS(cb->ddp_ok);
763 
764 	if (qp->active_ddp_tags == qp->num_ddp_tags)
765 		return (CHE_DDP_NO_TAG);
766 
767 	MPASS(qp->num_ddp_tags != 0);
768 
769 	stag_idx = qp->next_ddp_tag;
770 	for (;;) {
771 		if (qp->open_ddp_tags[stag_idx] == NULL)
772 			break;
773 		if (stag_idx == qp->num_ddp_tags - 1) {
774 			stag_idx = 0;
775 			if (qp->ddp_color == CHE_DDP_MAX_COLOR)
776 				qp->ddp_color = 0;
777 			else
778 				qp->ddp_color++;
779 		} else
780 			stag_idx++;
781 		MPASS(stag_idx != qp->next_ddp_tag);
782 	}
783 	if (stag_idx == qp->num_ddp_tags - 1)
784 		qp->next_ddp_tag = 0;
785 	else
786 		qp->next_ddp_tag = stag_idx + 1;
787 
788 	qp->active_ddp_tags++;
789 	qp->open_ddp_tags[stag_idx] = cb;
790 
791 	return (CHE_DDP_TAG(stag_idx, qp->ddp_color));
792 }
793 
794 static void
795 che_free_ddp_stag(struct nvmf_che_qpair *qp, struct nvmf_che_command_buffer *cb,
796     uint16_t stag)
797 {
798 	MPASS(!CHE_TAG_IS_FL(stag));
799 
800 	mtx_assert(&qp->rx_buffers.lock, MA_OWNED);
801 
802 	MPASS(qp->open_ddp_tags[CHE_STAG_IDX(stag)] == cb);
803 
804 	qp->open_ddp_tags[CHE_STAG_IDX(stag)] = NULL;
805 	qp->active_ddp_tags--;
806 }
807 
808 static uint16_t
809 che_alloc_ddp_tag(struct nvmf_che_qpair *qp,
810     struct nvmf_che_command_buffer *cb)
811 {
812 	uint16_t stag;
813 
814 	mtx_assert(&qp->rx_buffers.lock, MA_OWNED);
815 
816 	if (!cb->ddp_ok)
817 		return (CHE_DDP_NO_TAG);
818 
819 	stag = che_alloc_ddp_stag(qp, cb);
820 	if (stag == CHE_DDP_NO_TAG) {
821 		counter_u64_add(qp->toep->ofld_rxq->rx_nvme_ddp_setup_no_stag,
822 		    1);
823 		return (CHE_DDP_NO_TAG);
824 	}
825 
826 	if (!che_alloc_pbl(qp, cb)) {
827 		che_free_ddp_stag(qp, cb, stag);
828 		counter_u64_add(qp->toep->ofld_rxq->rx_nvme_ddp_setup_error, 1);
829 		return (CHE_DDP_NO_TAG);
830 	}
831 
832 	if (!che_write_tpt_entry(qp, cb, stag)) {
833 		che_free_pbl(cb);
834 		che_free_ddp_stag(qp, cb, stag);
835 		counter_u64_add(qp->toep->ofld_rxq->rx_nvme_ddp_setup_error, 1);
836 		return (CHE_DDP_NO_TAG);
837 	}
838 
839 	counter_u64_add(qp->toep->ofld_rxq->rx_nvme_ddp_setup_ok, 1);
840 	return (stag);
841 }
842 
843 static void
844 che_free_ddp_tag(struct nvmf_che_qpair *qp, struct nvmf_che_command_buffer *cb,
845     uint16_t stag)
846 {
847 	MPASS(!CHE_TAG_IS_FL(stag));
848 
849 	mtx_assert(&qp->rx_buffers.lock, MA_OWNED);
850 
851 	MPASS(qp->open_ddp_tags[CHE_STAG_IDX(stag)] == cb);
852 
853 	che_clear_tpt_entry(qp, stag);
854 	che_free_pbl(cb);
855 	che_free_ddp_stag(qp, cb, stag);
856 }
857 
858 static void
859 nvmf_che_write_pdu(struct nvmf_che_qpair *qp, struct mbuf *m)
860 {
861 	struct epoch_tracker et;
862 	struct socket *so = qp->so;
863 	struct inpcb *inp = sotoinpcb(so);
864 	struct toepcb *toep = qp->toep;
865 
866 	CURVNET_SET(so->so_vnet);
867 	NET_EPOCH_ENTER(et);
868 	INP_WLOCK(inp);
869 	if (__predict_false(inp->inp_flags & INP_DROPPED) ||
870 	    __predict_false((toep->flags & TPF_ATTACHED) == 0)) {
871 		m_freem(m);
872 	} else {
873 		mbufq_enqueue(&toep->ulp_pduq, m);
874 		t4_push_pdus(toep->vi->adapter, toep, 0);
875 	}
876 	INP_WUNLOCK(inp);
877 	NET_EPOCH_EXIT(et);
878 	CURVNET_RESTORE();
879 }
880 
881 static void
882 nvmf_che_report_error(struct nvmf_che_qpair *qp, uint16_t fes, uint32_t fei,
883     struct mbuf *rx_pdu, u_int hlen)
884 {
885 	struct nvme_tcp_term_req_hdr *hdr;
886 	struct mbuf *m;
887 
888 	if (hlen != 0) {
889 		hlen = min(hlen, NVME_TCP_TERM_REQ_ERROR_DATA_MAX_SIZE);
890 		hlen = min(hlen, m_length(rx_pdu, NULL));
891 	}
892 
893 	m = m_get2(sizeof(*hdr) + hlen, M_WAITOK, MT_DATA, M_PKTHDR);
894 	m->m_len = sizeof(*hdr) + hlen;
895 	m->m_pkthdr.len = m->m_len;
896 	hdr = mtod(m, void *);
897 	memset(hdr, 0, sizeof(*hdr));
898 	hdr->common.pdu_type = qp->qp.nq_controller ?
899 	    NVME_TCP_PDU_TYPE_C2H_TERM_REQ : NVME_TCP_PDU_TYPE_H2C_TERM_REQ;
900 	hdr->common.hlen = sizeof(*hdr);
901 	hdr->common.plen = sizeof(*hdr) + hlen;
902 	hdr->fes = htole16(fes);
903 	le32enc(hdr->fei, fei);
904 	if (hlen != 0)
905 		m_copydata(rx_pdu, 0, hlen, (caddr_t)(hdr + 1));
906 
907 	nvmf_che_write_pdu(qp, m);
908 }
909 
910 static int
911 nvmf_che_validate_pdu(struct nvmf_che_qpair *qp, struct nvmf_che_rxpdu *pdu)
912 {
913 	const struct nvme_tcp_common_pdu_hdr *ch;
914 	struct mbuf *m = pdu->m;
915 	uint32_t data_len, fei, plen, rx_digest;
916 	u_int hlen, cpl_error;
917 	int error;
918 	uint16_t fes;
919 
920 	/* Determine how large of a PDU header to return for errors. */
921 	ch = pdu->hdr;
922 	hlen = ch->hlen;
923 	plen = le32toh(ch->plen);
924 	if (hlen < sizeof(*ch) || hlen > plen)
925 		hlen = sizeof(*ch);
926 
927 	cpl_error = m->m_pkthdr.nvmf_cpl_status & CMP_STATUS_ERROR_MASK;
928 	switch (cpl_error) {
929 	case CMP_STATUS_NO_ERROR:
930 		break;
931 	case CMP_STATUS_HEADER_DIGEST:
932 		counter_u64_add(
933 		    qp->toep->ofld_rxq->rx_nvme_header_digest_errors, 1);
934 		printf("NVMe/TCP: Header digest mismatch\n");
935 		rx_digest = le32dec(mtodo(m, ch->hlen));
936 		nvmf_che_report_error(qp,
937 		    NVME_TCP_TERM_REQ_FES_HDGST_ERROR, rx_digest, m,
938 		    hlen);
939 		return (EBADMSG);
940 	case CMP_STATUS_DIRECTION_MISMATCH:
941 		counter_u64_add(qp->toep->ofld_rxq->rx_nvme_invalid_headers, 1);
942 		printf("NVMe/TCP: Invalid PDU type %u\n", ch->pdu_type);
943 		nvmf_che_report_error(qp,
944 		    NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD,
945 		    offsetof(struct nvme_tcp_common_pdu_hdr, pdu_type), m,
946 		    hlen);
947 		return (EBADMSG);
948 	case CMP_STATUS_SUCCESS_NOT_LAST:
949 	case CMP_STATUS_DIGEST_FLAG_MISMATCH:
950 		counter_u64_add(qp->toep->ofld_rxq->rx_nvme_invalid_headers, 1);
951 		printf("NVMe/TCP: Invalid PDU header flags %#x\n", ch->flags);
952 		nvmf_che_report_error(qp,
953 		    NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD,
954 		    offsetof(struct nvme_tcp_common_pdu_hdr, flags), m, hlen);
955 		return (EBADMSG);
956 	case CMP_STATUS_BAD_DATA_LENGTH:
957 		counter_u64_add(qp->toep->ofld_rxq->rx_nvme_invalid_headers, 1);
958 		printf("NVMe/TCP: Invalid PDU length %u\n", plen);
959 		nvmf_che_report_error(qp,
960 		    NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD,
961 		    offsetof(struct nvme_tcp_common_pdu_hdr, plen), m, hlen);
962 		return (EBADMSG);
963 	case CMP_STATUS_USER_MODE_UNALLOCATED:
964 	case CMP_STATUS_RQT_LIMIT:
965 	case CMP_STATUS_RQT_WRAP:
966 	case CMP_STATUS_RQT_BOUND:
967 		device_printf(qp->nca->sc->dev,
968 		    "received invalid NVMET error %u\n",
969 		    cpl_error);
970 		return (ECONNRESET);
971 	case CMP_STATUS_TPT_LIMIT:
972 	case CMP_STATUS_TPT_INVALID:
973 	case CMP_STATUS_TPT_COLOUR_MISMATCH:
974 	case CMP_STATUS_TPT_MISC:
975 	case CMP_STATUS_TPT_WRAP:
976 	case CMP_STATUS_TPT_BOUND:
977 		counter_u64_add(qp->toep->ofld_rxq->rx_nvme_invalid_headers, 1);
978 		switch (ch->pdu_type) {
979 		case NVME_TCP_PDU_TYPE_H2C_DATA:
980 			nvmf_che_report_error(qp,
981 			    NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD,
982 			    offsetof(struct nvme_tcp_h2c_data_hdr, ttag),
983 			    pdu->m, pdu->hdr->hlen);
984 			return (EBADMSG);
985 		case NVME_TCP_PDU_TYPE_C2H_DATA:
986 			nvmf_che_report_error(qp,
987 			    NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD,
988 			    offsetof(struct nvme_tcp_c2h_data_hdr, cccid), m,
989 			    hlen);
990 			return (EBADMSG);
991 		default:
992 			device_printf(qp->nca->sc->dev,
993 			    "received DDP NVMET error %u for PDU %u\n",
994 			    cpl_error, ch->pdu_type);
995 			return (ECONNRESET);
996 		}
997 	case CMP_STATUS_TPT_LAST_PDU_UNALIGNED:
998 		counter_u64_add(qp->toep->ofld_rxq->rx_nvme_invalid_headers, 1);
999 		nvmf_che_report_error(qp,
1000 		    NVME_TCP_TERM_REQ_FES_PDU_SEQUENCE_ERROR, 0, m, hlen);
1001 		return (EBADMSG);
1002 	case CMP_STATUS_PBL_LIMIT:
1003 		counter_u64_add(qp->toep->ofld_rxq->rx_nvme_invalid_headers, 1);
1004 		nvmf_che_report_error(qp,
1005 		    NVME_TCP_TERM_REQ_FES_DATA_TRANSFER_OUT_OF_RANGE, 0, m,
1006 		    hlen);
1007 		return (EBADMSG);
1008 	case CMP_STATUS_DATA_DIGEST:
1009 		/* Handled below. */
1010 		break;
1011 	default:
1012 		device_printf(qp->nca->sc->dev,
1013 		    "received unknown NVMET error %u\n",
1014 		    cpl_error);
1015 		return (ECONNRESET);
1016 	}
1017 
1018 	error = nvmf_tcp_validate_pdu_header(ch, qp->qp.nq_controller,
1019 	    qp->header_digests, qp->data_digests, qp->rxpda, &data_len, &fes,
1020 	    &fei);
1021 	if (error != 0) {
1022 		if (error != ECONNRESET)
1023 			nvmf_che_report_error(qp, fes, fei, m, hlen);
1024 		return (error);
1025 	}
1026 
1027 	/* Check data digest if present. */
1028 	pdu->data_digest_mismatch = false;
1029 	if ((ch->flags & NVME_TCP_CH_FLAGS_DDGSTF) != 0) {
1030 		if (cpl_error == CMP_STATUS_DATA_DIGEST) {
1031 			printf("NVMe/TCP: Data digest mismatch\n");
1032 			pdu->data_digest_mismatch = true;
1033 			counter_u64_add(
1034 			    qp->toep->ofld_rxq->rx_nvme_data_digest_errors, 1);
1035 		}
1036 	}
1037 
1038 	pdu->data_len = data_len;
1039 
1040 	return (0);
1041 }
1042 
1043 static void
1044 nvmf_che_free_pdu(struct nvmf_che_rxpdu *pdu)
1045 {
1046 	m_freem(pdu->m);
1047 	pdu->m = NULL;
1048 	pdu->hdr = NULL;
1049 }
1050 
1051 static int
1052 nvmf_che_handle_term_req(struct nvmf_che_rxpdu *pdu)
1053 {
1054 	const struct nvme_tcp_term_req_hdr *hdr;
1055 
1056 	hdr = (const void *)pdu->hdr;
1057 
1058 	printf("NVMe/TCP: Received termination request: fes %#x fei %#x\n",
1059 	    le16toh(hdr->fes), le32dec(hdr->fei));
1060 	nvmf_che_free_pdu(pdu);
1061 	return (ECONNRESET);
1062 }
1063 
1064 static int
1065 nvmf_che_save_command_capsule(struct nvmf_che_qpair *qp,
1066     struct nvmf_che_rxpdu *pdu)
1067 {
1068 	const struct nvme_tcp_cmd *cmd;
1069 	struct nvmf_capsule *nc;
1070 	struct nvmf_che_capsule *cc;
1071 
1072 	cmd = (const void *)pdu->hdr;
1073 
1074 	nc = nvmf_allocate_command(&qp->qp, &cmd->ccsqe, M_WAITOK);
1075 
1076 	cc = CCAP(nc);
1077 	cc->rx_pdu = *pdu;
1078 
1079 	nvmf_capsule_received(&qp->qp, nc);
1080 	return (0);
1081 }
1082 
1083 static int
1084 nvmf_che_save_response_capsule(struct nvmf_che_qpair *qp,
1085     struct nvmf_che_rxpdu *pdu)
1086 {
1087 	const struct nvme_tcp_rsp *rsp;
1088 	struct nvme_completion cpl;
1089 	struct nvmf_capsule *nc;
1090 	struct nvmf_che_capsule *cc;
1091 	uint16_t cid;
1092 
1093 	rsp = (const void *)pdu->hdr;
1094 
1095 	/*
1096 	 * Restore the original CID and ensure any command buffers
1097 	 * associated with this CID have been released.  Once the CQE
1098 	 * has been received, no further transfers to the command
1099 	 * buffer for the associated CID can occur.
1100 	 */
1101 	cpl = rsp->rccqe;
1102 	cid = le16toh(cpl.cid);
1103 	if (CHE_TAG_IS_FL(cid)) {
1104 		cid = CHE_RAW_FL_TAG(cid);
1105 		mtx_lock(&qp->fl_cid_lock);
1106 		MPASS(FL_CID_ISACTIVE(cid, qp->fl_cid_set));
1107 		cpl.cid = qp->fl_cids[cid];
1108 		FL_CID_FREE(cid, qp->fl_cid_set);
1109 		mtx_unlock(&qp->fl_cid_lock);
1110 
1111 		che_purge_command_buffer(&qp->rx_buffers, rsp->rccqe.cid);
1112 		che_purge_command_buffer(&qp->tx_buffers, rsp->rccqe.cid);
1113 	} else {
1114 		struct nvmf_che_command_buffer *cb;
1115 
1116 		mtx_lock(&qp->rx_buffers.lock);
1117 		cb = qp->open_ddp_tags[CHE_STAG_IDX(cid)];
1118 		MPASS(cb != NULL);
1119 		MPASS(cb->cid == rsp->rccqe.cid);
1120 		cpl.cid = cb->original_cid;
1121 		che_free_ddp_tag(qp, cb, cid);
1122 		mtx_unlock(&qp->rx_buffers.lock);
1123 		che_release_command_buffer(cb);
1124 	}
1125 #ifdef VERBOSE_TRACES
1126 	CTR(KTR_CXGBE, "%s: tid %u freed cid 0x%04x for 0x%04x", __func__,
1127 	    qp->toep->tid, le16toh(rsp->rccqe.cid), cpl.cid);
1128 #endif
1129 
1130 	nc = nvmf_allocate_response(&qp->qp, &cpl, M_WAITOK);
1131 
1132 	nc->nc_sqhd_valid = true;
1133 	cc = CCAP(nc);
1134 	cc->rx_pdu = *pdu;
1135 
1136 	nvmf_capsule_received(&qp->qp, nc);
1137 	return (0);
1138 }
1139 
1140 /*
1141  * Construct a PDU that contains an optional data payload.  This
1142  * includes dealing with the length fields in the common header.  The
1143  * adapter inserts digests and padding when the PDU is transmitted.
1144  */
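/*
 * Worked example of the length math below (hypothetical values):
 * hlen 24 with header digests enabled gives plen 28; with txpda 8,
 * pdo = roundup(28, 8) = 32, and 72 bytes of payload yield
 * plen = 32 + 72 = 104 (+4 more if data digests are enabled).
 */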
1145 static struct mbuf *
1146 nvmf_che_construct_pdu(struct nvmf_che_qpair *qp, void *hdr, size_t hlen,
1147     struct mbuf *data, uint32_t data_len)
1148 {
1149 	struct nvme_tcp_common_pdu_hdr *ch;
1150 	struct mbuf *top;
1151 	uint32_t pdo, plen;
1152 	uint8_t ulp_submode;
1153 
1154 	plen = hlen;
1155 	if (qp->header_digests)
1156 		plen += sizeof(uint32_t);
1157 	if (data_len != 0) {
1158 		KASSERT(m_length(data, NULL) == data_len, ("length mismatch"));
1159 		pdo = roundup(plen, qp->txpda);
1160 		plen = pdo + data_len;
1161 		if (qp->data_digests)
1162 			plen += sizeof(uint32_t);
1163 	} else {
1164 		KASSERT(data == NULL, ("payload mbuf with zero length"));
1165 		pdo = 0;
1166 	}
1167 
1168 	top = m_get2(hlen, M_WAITOK, MT_DATA, M_PKTHDR);
1169 	top->m_len = hlen;
1170 	top->m_pkthdr.len = hlen;
1171 	ch = mtod(top, void *);
1172 	memcpy(ch, hdr, hlen);
1173 	ch->hlen = hlen;
1174 	ulp_submode = 0;
1175 	if (qp->header_digests) {
1176 		ch->flags |= NVME_TCP_CH_FLAGS_HDGSTF;
1177 		ulp_submode |= ULP_CRC_HEADER;
1178 	}
1179 	if (qp->data_digests && data_len != 0) {
1180 		ch->flags |= NVME_TCP_CH_FLAGS_DDGSTF;
1181 		ulp_submode |= ULP_CRC_DATA;
1182 	}
1183 	ch->pdo = pdo;
1184 	ch->plen = htole32(plen);
1185 	set_mbuf_ulp_submode(top, ulp_submode);
1186 
1187 	if (data_len != 0) {
1188 		top->m_pkthdr.len += data_len;
1189 		top->m_next = data;
1190 	}
1191 
1192 	return (top);
1193 }
1194 
1195 /* Allocate the next free freelist transfer tag. */
1196 static bool
1197 nvmf_che_allocate_fl_ttag(struct nvmf_che_qpair *qp,
1198     struct nvmf_che_command_buffer *cb)
1199 {
1200 	uint16_t ttag;
1201 
1202 	mtx_assert(&qp->rx_buffers.lock, MA_OWNED);
1203 
1204 	if (qp->active_fl_ttags == qp->num_fl_ttags)
1205 		return (false);
1206 
1207 	ttag = qp->next_fl_ttag;
1208 	for (;;) {
1209 		if (qp->open_fl_ttags[ttag] == NULL)
1210 			break;
1211 		if (ttag == qp->num_fl_ttags - 1)
1212 			ttag = 0;
1213 		else
1214 			ttag++;
1215 		MPASS(ttag != qp->next_fl_ttag);
1216 	}
1217 	if (ttag == qp->num_fl_ttags - 1)
1218 		qp->next_fl_ttag = 0;
1219 	else
1220 		qp->next_fl_ttag = ttag + 1;
1221 
1222 	qp->active_fl_ttags++;
1223 	qp->open_fl_ttags[ttag] = cb;
1224 
1225 	cb->ttag = ttag | CHE_FL_TAG_MASK;
1226 	return (true);
1227 }
1228 
1229 /* Attempt to allocate a free transfer tag and assign it to cb. */
1230 static bool
1231 nvmf_che_allocate_ttag(struct nvmf_che_qpair *qp,
1232     struct nvmf_che_command_buffer *cb)
1233 {
1234 	uint16_t stag;
1235 
1236 	mtx_assert(&qp->rx_buffers.lock, MA_OWNED);
1237 
1238 	stag = che_alloc_ddp_tag(qp, cb);
1239 	if (stag == CHE_DDP_NO_TAG) {
1240 		if (!nvmf_che_allocate_fl_ttag(qp, cb))
1241 			return (false);
1242 	} else {
1243 		cb->ttag = stag;
1244 	}
1245 #ifdef VERBOSE_TRACES
1246 	CTR(KTR_CXGBE, "%s: tid %u allocated ttag 0x%04x", __func__,
1247 	    qp->toep->tid, cb->ttag);
1248 #endif
1249 	cb->cc->active_r2ts++;
1250 	return (true);
1251 }
1252 
1253 /* Find the next command buffer eligible to schedule for R2T. */
1254 static struct nvmf_che_command_buffer *
1255 nvmf_che_next_r2t(struct nvmf_che_qpair *qp)
1256 {
1257 	struct nvmf_che_command_buffer *cb;
1258 
1259 	mtx_assert(&qp->rx_buffers.lock, MA_OWNED);
1260 
1261 	TAILQ_FOREACH(cb, &qp->rx_buffers.head, link) {
1262 		/* NB: maxr2t is 0's based. */
1263 		if (cb->cc->active_r2ts > qp->maxr2t)
1264 			continue;
1265 
1266 		if (!nvmf_che_allocate_ttag(qp, cb))
1267 			return (NULL);
1268 #ifdef INVARIANTS
1269 		cb->cc->pending_r2ts--;
1270 #endif
1271 		TAILQ_REMOVE(&qp->rx_buffers.head, cb, link);
1272 		return (cb);
1273 	}
1274 	return (NULL);
1275 }
1276 
1277 /* NB: cid is little-endian already. */
1278 static void
1279 che_send_r2t(struct nvmf_che_qpair *qp, uint16_t cid, uint16_t ttag,
1280     uint32_t data_offset, uint32_t data_len)
1281 {
1282 	struct nvme_tcp_r2t_hdr r2t;
1283 	struct mbuf *m;
1284 
1285 	memset(&r2t, 0, sizeof(r2t));
1286 	r2t.common.pdu_type = NVME_TCP_PDU_TYPE_R2T;
1287 	r2t.cccid = cid;
1288 	r2t.ttag = htole16(ttag);
1289 	r2t.r2to = htole32(data_offset);
1290 	r2t.r2tl = htole32(data_len);
1291 
1292 	m = nvmf_che_construct_pdu(qp, &r2t, sizeof(r2t), NULL, 0);
1293 	nvmf_che_write_pdu(qp, m);
1294 }
1295 
1296 /*
1297  * Release a transfer tag and schedule another R2T.
1298  *
1299  * NB: This drops the rx_buffers.lock mutex.
1300  */
1301 static void
1302 nvmf_che_send_next_r2t(struct nvmf_che_qpair *qp,
1303     struct nvmf_che_command_buffer *cb)
1304 {
1305 	struct nvmf_che_command_buffer *ncb;
1306 
1307 	mtx_assert(&qp->rx_buffers.lock, MA_OWNED);
1308 
1309 #ifdef VERBOSE_TRACES
1310 	CTR(KTR_CXGBE, "%s: tid %u freed ttag 0x%04x", __func__, qp->toep->tid,
1311 	    cb->ttag);
1312 #endif
1313 	if (CHE_TAG_IS_FL(cb->ttag)) {
1314 		uint16_t ttag;
1315 
1316 		ttag = CHE_RAW_FL_TAG(cb->ttag);
1317 		MPASS(qp->open_fl_ttags[ttag] == cb);
1318 
1319 		/* Release this transfer tag. */
1320 		qp->open_fl_ttags[ttag] = NULL;
1321 		qp->active_fl_ttags--;
1322 	} else
1323 		che_free_ddp_tag(qp, cb, cb->ttag);
1324 
1325 	cb->cc->active_r2ts--;
1326 
1327 	/* Schedule another R2T. */
1328 	ncb = nvmf_che_next_r2t(qp);
1329 	mtx_unlock(&qp->rx_buffers.lock);
1330 	if (ncb != NULL)
1331 		che_send_r2t(qp, ncb->cid, ncb->ttag, ncb->data_offset,
1332 		    ncb->data_len);
1333 }
1334 
1335 /*
1336  * Copy len bytes starting at offset skip from an mbuf chain into an
1337  * I/O buffer at destination offset io_offset.
1338  */
1339 static void
1340 mbuf_copyto_io(struct mbuf *m, u_int skip, u_int len,
1341     struct nvmf_io_request *io, u_int io_offset)
1342 {
1343 	u_int todo;
1344 
1345 	while (m->m_len <= skip) {
1346 		skip -= m->m_len;
1347 		m = m->m_next;
1348 	}
1349 	while (len != 0) {
1350 		MPASS((m->m_flags & M_EXTPG) == 0);
1351 
1352 		todo = min(m->m_len - skip, len);
1353 		memdesc_copyback(&io->io_mem, io_offset, todo, mtodo(m, skip));
1354 		skip = 0;
1355 		io_offset += todo;
1356 		len -= todo;
1357 		m = m->m_next;
1358 	}
1359 }
1360 
1361 static int
1362 nvmf_che_handle_h2c_data(struct nvmf_che_qpair *qp, struct nvmf_che_rxpdu *pdu)
1363 {
1364 	const struct nvme_tcp_h2c_data_hdr *h2c;
1365 	struct nvmf_che_command_buffer *cb;
1366 	uint32_t data_len, data_offset;
1367 	uint16_t ttag, fl_ttag;
1368 
1369 	h2c = (const void *)pdu->hdr;
1370 	if (le32toh(h2c->datal) > qp->maxh2cdata) {
1371 		nvmf_che_report_error(qp,
1372 		    NVME_TCP_TERM_REQ_FES_DATA_TRANSFER_LIMIT_EXCEEDED, 0,
1373 		    pdu->m, pdu->hdr->hlen);
1374 		nvmf_che_free_pdu(pdu);
1375 		return (EBADMSG);
1376 	}
1377 
1378 	ttag = le16toh(h2c->ttag);
1379 	if (CHE_TAG_IS_FL(ttag)) {
1380 		fl_ttag = CHE_RAW_FL_TAG(ttag);
1381 		if (fl_ttag >= qp->num_fl_ttags) {
1382 			nvmf_che_report_error(qp,
1383 			    NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD,
1384 			    offsetof(struct nvme_tcp_h2c_data_hdr, ttag),
1385 			    pdu->m, pdu->hdr->hlen);
1386 			nvmf_che_free_pdu(pdu);
1387 			return (EBADMSG);
1388 		}
1389 
1390 		mtx_lock(&qp->rx_buffers.lock);
1391 		cb = qp->open_fl_ttags[fl_ttag];
1392 	} else {
1393 		if (CHE_STAG_IDX(ttag) >= qp->num_ddp_tags) {
1394 			nvmf_che_report_error(qp,
1395 			    NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD,
1396 			    offsetof(struct nvme_tcp_h2c_data_hdr, ttag),
1397 			    pdu->m, pdu->hdr->hlen);
1398 			nvmf_che_free_pdu(pdu);
1399 			return (EBADMSG);
1400 		}
1401 
1402 		mtx_lock(&qp->rx_buffers.lock);
1403 		cb = qp->open_ddp_tags[CHE_STAG_IDX(ttag)];
1404 	}
1405 
1406 	if (cb == NULL) {
1407 		mtx_unlock(&qp->rx_buffers.lock);
1408 		nvmf_che_report_error(qp,
1409 		    NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD,
1410 		    offsetof(struct nvme_tcp_h2c_data_hdr, ttag), pdu->m,
1411 		    pdu->hdr->hlen);
1412 		nvmf_che_free_pdu(pdu);
1413 		return (EBADMSG);
1414 	}
1415 	MPASS(cb->ttag == ttag);
1416 
1417 	/* For a data digest mismatch, fail the I/O request. */
1418 	if (pdu->data_digest_mismatch) {
1419 		nvmf_che_send_next_r2t(qp, cb);
1420 		cb->error = EINTEGRITY;
1421 		che_release_command_buffer(cb);
1422 		nvmf_che_free_pdu(pdu);
1423 		return (0);
1424 	}
1425 
1426 	data_len = le32toh(h2c->datal);
1427 	if (data_len != pdu->data_len) {
1428 		mtx_unlock(&qp->rx_buffers.lock);
1429 		nvmf_che_report_error(qp,
1430 		    NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD,
1431 		    offsetof(struct nvme_tcp_h2c_data_hdr, datal), pdu->m,
1432 		    pdu->hdr->hlen);
1433 		nvmf_che_free_pdu(pdu);
1434 		return (EBADMSG);
1435 	}
1436 
1437 	data_offset = le32toh(h2c->datao);
1438 	if (data_offset < cb->data_offset ||
1439 	    data_offset + data_len > cb->data_offset + cb->data_len) {
1440 		mtx_unlock(&qp->rx_buffers.lock);
1441 		nvmf_che_report_error(qp,
1442 		    NVME_TCP_TERM_REQ_FES_DATA_TRANSFER_OUT_OF_RANGE, 0, pdu->m,
1443 		    pdu->hdr->hlen);
1444 		nvmf_che_free_pdu(pdu);
1445 		return (EBADMSG);
1446 	}
1447 
1448 	if (data_offset != cb->data_offset + cb->data_xfered) {
1449 		if (CHE_TAG_IS_FL(ttag)) {
1450 			mtx_unlock(&qp->rx_buffers.lock);
1451 			nvmf_che_report_error(qp,
1452 			    NVME_TCP_TERM_REQ_FES_PDU_SEQUENCE_ERROR, 0, pdu->m,
1453 			    pdu->hdr->hlen);
1454 			nvmf_che_free_pdu(pdu);
1455 			return (EBADMSG);
1456 		} else {
1457 			uint32_t ddp_bytes;
1458 
1459 			/* Account for PDUs silently received via DDP. */
1460 			ddp_bytes = data_offset -
1461 			    (cb->data_offset + cb->data_xfered);
1462 			cb->data_xfered += ddp_bytes;
1463 #ifdef VERBOSE_TRACES
1464 			CTR(KTR_CXGBE, "%s: tid %u previous ddp_bytes %u",
1465 			    __func__, qp->toep->tid, ddp_bytes);
1466 #endif
1467 			counter_u64_add(qp->toep->ofld_rxq->rx_nvme_ddp_octets,
1468 			    ddp_bytes);
1469 		}
1470 	}
1471 
1472 	if ((cb->data_xfered + data_len == cb->data_len) !=
1473 	    ((pdu->hdr->flags & NVME_TCP_H2C_DATA_FLAGS_LAST_PDU) != 0)) {
1474 		mtx_unlock(&qp->rx_buffers.lock);
1475 		nvmf_che_report_error(qp,
1476 		    NVME_TCP_TERM_REQ_FES_PDU_SEQUENCE_ERROR, 0, pdu->m,
1477 		    pdu->hdr->hlen);
1478 		nvmf_che_free_pdu(pdu);
1479 		return (EBADMSG);
1480 	}
1481 
1482 	cb->data_xfered += data_len;
1483 	data_offset -= cb->data_offset;
1484 	if (cb->data_xfered == cb->data_len) {
1485 		nvmf_che_send_next_r2t(qp, cb);
1486 	} else {
1487 		che_hold_command_buffer(cb);
1488 		mtx_unlock(&qp->rx_buffers.lock);
1489 	}
1490 
1491 	if (CHE_TAG_IS_FL(ttag))
1492 		mbuf_copyto_io(pdu->m->m_next, 0, data_len, &cb->io,
1493 		    data_offset);
1494 
1495 	che_release_command_buffer(cb);
1496 	nvmf_che_free_pdu(pdu);
1497 	return (0);
1498 }
1499 
1500 static int
1501 nvmf_che_handle_c2h_data(struct nvmf_che_qpair *qp, struct nvmf_che_rxpdu *pdu)
1502 {
1503 	const struct nvme_tcp_c2h_data_hdr *c2h;
1504 	struct nvmf_che_command_buffer *cb;
1505 	uint32_t data_len, data_offset;
1506 	uint16_t cid, original_cid;
1507 
1508 	/*
1509 	 * Unlike freelist command buffers, DDP command buffers are
1510 	 * not released until the response capsule is received to keep
1511 	 * the STAG allocated until the command has completed.
1512 	 */
1513 	c2h = (const void *)pdu->hdr;
1514 
1515 	cid = le16toh(c2h->cccid);
1516 	if (CHE_TAG_IS_FL(cid)) {
1517 		mtx_lock(&qp->rx_buffers.lock);
1518 		cb = che_find_command_buffer(&qp->rx_buffers, c2h->cccid);
1519 	} else {
1520 		if (CHE_STAG_IDX(cid) >= qp->num_ddp_tags) {
1521 			nvmf_che_report_error(qp,
1522 			    NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD,
1523 			    offsetof(struct nvme_tcp_c2h_data_hdr, cccid),
1524 			    pdu->m, pdu->hdr->hlen);
1525 			nvmf_che_free_pdu(pdu);
1526 			return (EBADMSG);
1527 		}
1528 
1529 		mtx_lock(&qp->rx_buffers.lock);
1530 		cb = qp->open_ddp_tags[CHE_STAG_IDX(cid)];
1531 	}
1532 
1533 	if (cb == NULL) {
1534 		mtx_unlock(&qp->rx_buffers.lock);
1535 		/*
1536 		 * XXX: Could be PDU sequence error if cccid is for a
1537 		 * command that doesn't use a command buffer.
1538 		 */
1539 		nvmf_che_report_error(qp,
1540 		    NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD,
1541 		    offsetof(struct nvme_tcp_c2h_data_hdr, cccid), pdu->m,
1542 		    pdu->hdr->hlen);
1543 		nvmf_che_free_pdu(pdu);
1544 		return (EBADMSG);
1545 	}
1546 
1547 	/* For a data digest mismatch, fail the I/O request. */
1548 	if (pdu->data_digest_mismatch) {
1549 		cb->error = EINTEGRITY;
1550 		if (CHE_TAG_IS_FL(cid)) {
1551 			che_remove_command_buffer(&qp->rx_buffers, cb);
1552 			mtx_unlock(&qp->rx_buffers.lock);
1553 			che_release_command_buffer(cb);
1554 		} else
1555 			mtx_unlock(&qp->rx_buffers.lock);
1556 		nvmf_che_free_pdu(pdu);
1557 		return (0);
1558 	}
1559 
1560 	data_len = le32toh(c2h->datal);
1561 	if (data_len != pdu->data_len) {
1562 		mtx_unlock(&qp->rx_buffers.lock);
1563 		nvmf_che_report_error(qp,
1564 		    NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD,
1565 		    offsetof(struct nvme_tcp_c2h_data_hdr, datal), pdu->m,
1566 		    pdu->hdr->hlen);
1567 		nvmf_che_free_pdu(pdu);
1568 		return (EBADMSG);
1569 	}
1570 
1571 	data_offset = le32toh(c2h->datao);
1572 	if (data_offset < cb->data_offset ||
1573 	    data_offset + data_len > cb->data_offset + cb->data_len) {
1574 		mtx_unlock(&qp->rx_buffers.lock);
1575 		nvmf_che_report_error(qp,
1576 		    NVME_TCP_TERM_REQ_FES_DATA_TRANSFER_OUT_OF_RANGE, 0,
1577 		    pdu->m, pdu->hdr->hlen);
1578 		nvmf_che_free_pdu(pdu);
1579 		return (EBADMSG);
1580 	}
1581 
1582 	if (data_offset != cb->data_offset + cb->data_xfered) {
1583 		if (CHE_TAG_IS_FL(cid)) {
1584 			mtx_unlock(&qp->rx_buffers.lock);
1585 			nvmf_che_report_error(qp,
1586 			    NVME_TCP_TERM_REQ_FES_PDU_SEQUENCE_ERROR, 0, pdu->m,
1587 			    pdu->hdr->hlen);
1588 			nvmf_che_free_pdu(pdu);
1589 			return (EBADMSG);
1590 		} else {
1591 			uint32_t ddp_bytes;
1592 
1593 			/* Account for PDUs silently received via DDP. */
1594 			ddp_bytes = data_offset -
1595 			    (cb->data_offset + cb->data_xfered);
1596 			cb->data_xfered += ddp_bytes;
1597 #ifdef VERBOSE_TRACES
1598 			CTR(KTR_CXGBE, "%s: tid %u previous ddp_bytes %u",
1599 			    __func__, qp->toep->tid, ddp_bytes);
1600 #endif
1601 			counter_u64_add(qp->toep->ofld_rxq->rx_nvme_ddp_octets,
1602 			    ddp_bytes);
1603 		}
1604 	}
1605 
1606 	if ((cb->data_xfered + data_len == cb->data_len) !=
1607 	    ((pdu->hdr->flags & NVME_TCP_C2H_DATA_FLAGS_LAST_PDU) != 0)) {
1608 		mtx_unlock(&qp->rx_buffers.lock);
1609 		nvmf_che_report_error(qp,
1610 		    NVME_TCP_TERM_REQ_FES_PDU_SEQUENCE_ERROR, 0, pdu->m,
1611 		    pdu->hdr->hlen);
1612 		nvmf_che_free_pdu(pdu);
1613 		return (EBADMSG);
1614 	}
1615 
1616 	cb->data_xfered += data_len;
1617 	original_cid = cb->original_cid;
1618 
1619 	if (CHE_TAG_IS_FL(cid)) {
1620 		data_offset -= cb->data_offset;
1621 		if (cb->data_xfered == cb->data_len)
1622 			che_remove_command_buffer(&qp->rx_buffers, cb);
1623 		else
1624 			che_hold_command_buffer(cb);
1625 		mtx_unlock(&qp->rx_buffers.lock);
1626 
1627 		if ((pdu->hdr->flags & NVME_TCP_C2H_DATA_FLAGS_SUCCESS) != 0) {
1628 			/*
1629 			 * Free the CID as the command has now been
1630 			 * completed.
1631 			 */
1632 			cid = CHE_RAW_FL_TAG(cid);
1633 			mtx_lock(&qp->fl_cid_lock);
1634 			MPASS(FL_CID_ISACTIVE(cid, qp->fl_cid_set));
1635 			MPASS(original_cid == qp->fl_cids[cid]);
1636 			FL_CID_FREE(cid, qp->fl_cid_set);
1637 			mtx_unlock(&qp->fl_cid_lock);
1638 		}
1639 
1640 		mbuf_copyto_io(pdu->m->m_next, 0, data_len, &cb->io,
1641 		    data_offset);
1642 
1643 		che_release_command_buffer(cb);
1644 	} else {
1645 		if ((pdu->hdr->flags & NVME_TCP_C2H_DATA_FLAGS_SUCCESS) != 0) {
1646 			/*
1647 			 * Free the command buffer and STAG as the
1648 			 * command has now been completed.
1649 			 */
1650 			che_free_ddp_tag(qp, cb, cid);
1651 			mtx_unlock(&qp->rx_buffers.lock);
1652 			che_release_command_buffer(cb);
1653 		} else
1654 			mtx_unlock(&qp->rx_buffers.lock);
1655 	}
1656 
1657 	if ((pdu->hdr->flags & NVME_TCP_C2H_DATA_FLAGS_SUCCESS) != 0) {
1658 		struct nvme_completion cqe;
1659 		struct nvmf_capsule *nc;
1660 
1661 		memset(&cqe, 0, sizeof(cqe));
1662 		cqe.cid = original_cid;
1663 
1664 		nc = nvmf_allocate_response(&qp->qp, &cqe, M_WAITOK);
1665 		nc->nc_sqhd_valid = false;
1666 
1667 		nvmf_capsule_received(&qp->qp, nc);
1668 	}
1669 
1670 	nvmf_che_free_pdu(pdu);
1671 	return (0);
1672 }
1673 
1674 /* Called when m_free drops refcount to 0. */
1675 static void
1676 nvmf_che_mbuf_done(struct mbuf *m)
1677 {
1678 	struct nvmf_che_command_buffer *cb = m->m_ext.ext_arg1;
1679 
1680 	che_free_command_buffer(cb);
1681 }
1682 
1683 static struct mbuf *
1684 nvmf_che_mbuf(void *arg, int how, void *data, size_t len)
1685 {
1686 	struct nvmf_che_command_buffer *cb = arg;
1687 	struct mbuf *m;
1688 
1689 	m = m_get(how, MT_DATA);
1690 	m->m_flags |= M_RDONLY;
1691 	m_extaddref(m, data, len, &cb->refs, nvmf_che_mbuf_done, cb, NULL);
1692 	m->m_len = len;
1693 	return (m);
1694 }
1695 
1696 static void
1697 nvmf_che_free_mext_pg(struct mbuf *m)
1698 {
1699 	struct nvmf_che_command_buffer *cb = m->m_ext.ext_arg1;
1700 
1701 	M_ASSERTEXTPG(m);
1702 	che_release_command_buffer(cb);
1703 }
1704 
1705 static struct mbuf *
1706 nvmf_che_mext_pg(void *arg, int how)
1707 {
1708 	struct nvmf_che_command_buffer *cb = arg;
1709 	struct mbuf *m;
1710 
1711 	m = mb_alloc_ext_pgs(how, nvmf_che_free_mext_pg, M_RDONLY);
1712 	m->m_ext.ext_arg1 = cb;
1713 	che_hold_command_buffer(cb);
1714 	return (m);
1715 }
1716 
1717 /*
1718  * Return an mbuf chain for a range of data belonging to a command
1719  * buffer.
1720  *
1721  * The mbuf chain uses M_EXT mbufs which hold references on the
1722  * command buffer so that it remains "alive" until the data has been
1723  * fully transmitted.  If can_truncate is true, the returned chain
1724  * may be shorter than requested to avoid gratuitously splitting up
1725  * a page.
1726  */
1727 static struct mbuf *
1728 nvmf_che_command_buffer_mbuf(struct nvmf_che_command_buffer *cb,
1729     uint32_t data_offset, uint32_t data_len, uint32_t *actual_len,
1730     bool can_truncate)
1731 {
1732 	struct mbuf *m;
1733 	size_t len;
1734 
1735 	m = memdesc_alloc_ext_mbufs(&cb->io.io_mem, nvmf_che_mbuf,
1736 	    nvmf_che_mext_pg, cb, M_WAITOK, data_offset, data_len, &len,
1737 	    can_truncate);
1738 	if (actual_len != NULL)
1739 		*actual_len = len;
1740 	return (m);
1741 }
1742 
1743 /* NB: cid and ttag are little-endian already. */
1744 static void
1745 che_send_h2c_pdu(struct nvmf_che_qpair *qp, uint16_t cid, uint16_t ttag,
1746     uint32_t data_offset, struct mbuf *m, size_t len, bool last_pdu)
1747 {
1748 	struct nvme_tcp_h2c_data_hdr h2c;
1749 	struct mbuf *top;
1750 
1751 	memset(&h2c, 0, sizeof(h2c));
1752 	h2c.common.pdu_type = NVME_TCP_PDU_TYPE_H2C_DATA;
1753 	if (last_pdu)
1754 		h2c.common.flags |= NVME_TCP_H2C_DATA_FLAGS_LAST_PDU;
1755 	h2c.cccid = cid;
1756 	h2c.ttag = ttag;
1757 	h2c.datao = htole32(data_offset);
1758 	h2c.datal = htole32(len);
1759 
1760 	top = nvmf_che_construct_pdu(qp, &h2c, sizeof(h2c), m, len);
1761 	nvmf_che_write_pdu(qp, top);
1762 }
1763 
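/*
 * Handle an R2T PDU from the controller: validate the requested range
 * against the original command buffer and transmit it as one or more
 * H2C_DATA PDUs.
 */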
1764 static int
1765 nvmf_che_handle_r2t(struct nvmf_che_qpair *qp, struct nvmf_che_rxpdu *pdu)
1766 {
1767 	const struct nvme_tcp_r2t_hdr *r2t;
1768 	struct nvmf_che_command_buffer *cb;
1769 	uint32_t data_len, data_offset;
1770 
1771 	r2t = (const void *)pdu->hdr;
1772 
1773 	mtx_lock(&qp->tx_buffers.lock);
1774 	cb = che_find_command_buffer(&qp->tx_buffers, r2t->cccid);
1775 	if (cb == NULL) {
1776 		mtx_unlock(&qp->tx_buffers.lock);
1777 		nvmf_che_report_error(qp,
1778 		    NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD,
1779 		    offsetof(struct nvme_tcp_r2t_hdr, cccid), pdu->m,
1780 		    pdu->hdr->hlen);
1781 		nvmf_che_free_pdu(pdu);
1782 		return (EBADMSG);
1783 	}
1784 
1785 	data_offset = le32toh(r2t->r2to);
1786 	if (data_offset != cb->data_xfered) {
1787 		mtx_unlock(&qp->tx_buffers.lock);
1788 		nvmf_che_report_error(qp,
1789 		    NVME_TCP_TERM_REQ_FES_PDU_SEQUENCE_ERROR, 0, pdu->m,
1790 		    pdu->hdr->hlen);
1791 		nvmf_che_free_pdu(pdu);
1792 		return (EBADMSG);
1793 	}
1794 
1795 	/*
1796 	 * XXX: The spec does not specify how to handle R2T transfers
1797 	 * out of range of the original command.
1798 	 */
1799 	data_len = le32toh(r2t->r2tl);
1800 	if (data_offset + data_len > cb->data_len) {
1801 		mtx_unlock(&qp->tx_buffers.lock);
1802 		nvmf_che_report_error(qp,
1803 		    NVME_TCP_TERM_REQ_FES_DATA_TRANSFER_OUT_OF_RANGE, 0,
1804 		    pdu->m, pdu->hdr->hlen);
1805 		nvmf_che_free_pdu(pdu);
1806 		return (EBADMSG);
1807 	}
1808 
1809 	cb->data_xfered += data_len;
1810 	if (cb->data_xfered == cb->data_len)
1811 		che_remove_command_buffer(&qp->tx_buffers, cb);
1812 	else
1813 		che_hold_command_buffer(cb);
1814 	mtx_unlock(&qp->tx_buffers.lock);
1815 
1816 	/*
1817 	 * Queue one or more H2C_DATA PDUs containing the requested
1818 	 * data.
1819 	 */
1820 	while (data_len > 0) {
1821 		struct mbuf *m;
1822 		uint32_t sent, todo;
1823 
1824 		todo = min(data_len, qp->max_tx_data);
1825 		m = nvmf_che_command_buffer_mbuf(cb, data_offset, todo, &sent,
1826 		    todo < data_len);
1827 		che_send_h2c_pdu(qp, r2t->cccid, r2t->ttag, data_offset, m,
1828 		    sent, sent == data_len);
1829 
1830 		data_offset += sent;
1831 		data_len -= sent;
1832 	}
1833 
1834 	che_release_command_buffer(cb);
1835 	nvmf_che_free_pdu(pdu);
1836 	return (0);
1837 }
1838 
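/* Hand a validated PDU off to the handler for its PDU type. */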
1839 static int
1840 nvmf_che_dispatch_pdu(struct nvmf_che_qpair *qp, struct nvmf_che_rxpdu *pdu)
1841 {
1842 	/*
1843 	 * The PDU header should always be contiguous in the mbuf from
1844 	 * CPL_NVMT_CMP.
1845 	 */
1846 	pdu->hdr = mtod(pdu->m, void *);
1847 	KASSERT(pdu->m->m_len == pdu->hdr->hlen +
1848 	    ((pdu->hdr->flags & NVME_TCP_CH_FLAGS_HDGSTF) != 0 ?
1849 	    sizeof(uint32_t) : 0),
1850 	    ("%s: mismatched PDU header mbuf length", __func__));
1851 
1852 	switch (pdu->hdr->pdu_type) {
1853 	default:
1854 		__assert_unreachable();
1855 		break;
1856 	case NVME_TCP_PDU_TYPE_H2C_TERM_REQ:
1857 	case NVME_TCP_PDU_TYPE_C2H_TERM_REQ:
1858 		return (nvmf_che_handle_term_req(pdu));
1859 	case NVME_TCP_PDU_TYPE_CAPSULE_CMD:
1860 		return (nvmf_che_save_command_capsule(qp, pdu));
1861 	case NVME_TCP_PDU_TYPE_CAPSULE_RESP:
1862 		return (nvmf_che_save_response_capsule(qp, pdu));
1863 	case NVME_TCP_PDU_TYPE_H2C_DATA:
1864 		return (nvmf_che_handle_h2c_data(qp, pdu));
1865 	case NVME_TCP_PDU_TYPE_C2H_DATA:
1866 		return (nvmf_che_handle_c2h_data(qp, pdu));
1867 	case NVME_TCP_PDU_TYPE_R2T:
1868 		return (nvmf_che_handle_r2t(qp, pdu));
1869 	}
1870 }
1871 
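/*
 * Attach freelist data received via CPL_NVMT_DATA to a PDU's header
 * mbuf, matching the queued data mbufs by TCP sequence number.  PDUs
 * received via DDP only need the relevant counters updated.
 */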
1872 static int
1873 nvmf_che_attach_pdu_data(struct nvmf_che_qpair *qp, struct nvmf_che_rxpdu *pdu)
1874 {
1875 	struct socket *so = qp->so;
1876 	struct mbuf *m, *n;
1877 	uint32_t tcp_seq;
1878 	size_t len;
1879 	int error;
1880 
1881 	/* Check for DDP data. */
1882 	if (pdu->ddp) {
1883 		counter_u64_add(qp->toep->ofld_rxq->rx_nvme_ddp_pdus, 1);
1884 		counter_u64_add(qp->toep->ofld_rxq->rx_nvme_ddp_octets,
1885 		    pdu->data_len);
1886 		return (0);
1887 	}
1888 
1889 	error = 0;
1890 	len = pdu->data_len;
1891 	tcp_seq = pdu->m->m_pkthdr.nvmf_tcp_seq;
1892 	m = pdu->m;
1893 	SOCKBUF_LOCK(&so->so_rcv);
1894 	while (len > 0) {
1895 		n = mbufq_dequeue(&qp->rx_data);
1896 		KASSERT(n != NULL, ("%s: missing %zu data", __func__, len));
1897 		if (n == NULL) {
1898 			error = ENOBUFS;
1899 			break;
1900 		}
1901 
1902 		KASSERT(n->m_pkthdr.nvmf_tcp_seq == tcp_seq,
1903 		    ("%s: TCP seq mismatch", __func__));
1904 		KASSERT(n->m_pkthdr.len <= len,
1905 		    ("%s: too much data", __func__));
1906 		if (n->m_pkthdr.nvmf_tcp_seq != tcp_seq ||
1907 		    n->m_pkthdr.len > len) {
1908 			m_freem(n);
1909 			error = ENOBUFS;
1910 			break;
1911 		}
1912 
1913 #ifdef VERBOSE_TRACES
1914 		CTR(KTR_CXGBE, "%s: tid %u len %d seq %u", __func__,
1915 		    qp->toep->tid, n->m_pkthdr.len, n->m_pkthdr.nvmf_tcp_seq);
1916 #endif
1917 		pdu->m->m_pkthdr.len += n->m_pkthdr.len;
1918 		len -= n->m_pkthdr.len;
1919 		tcp_seq += n->m_pkthdr.len;
1920 		m_demote_pkthdr(n);
1921 		m->m_next = n;
1922 		m = m_last(n);
1923 	}
1924 	SOCKBUF_UNLOCK(&so->so_rcv);
1925 
1926 	if (error == 0) {
1927 		counter_u64_add(qp->toep->ofld_rxq->rx_nvme_fl_pdus, 1);
1928 		counter_u64_add(qp->toep->ofld_rxq->rx_nvme_fl_octets,
1929 		    pdu->data_len);
1930 	}
1931 	return (error);
1932 }
1933 
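/*
 * Receive kthread: dequeue completed PDUs, validate them, attach any
 * freelist data, and dispatch them.  Socket or protocol errors
 * terminate the queue pair.
 */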
1934 static void
1935 nvmf_che_receive(void *arg)
1936 {
1937 	struct nvmf_che_qpair *qp = arg;
1938 	struct socket *so = qp->so;
1939 	struct nvmf_che_rxpdu pdu;
1940 	struct mbuf *m;
1941 	int error, terror;
1942 
1943 	SOCKBUF_LOCK(&so->so_rcv);
1944 	while (!qp->rx_shutdown) {
1945 		/* Wait for a PDU. */
1946 		if (so->so_error != 0 || so->so_rerror != 0) {
1947 			if (so->so_error != 0)
1948 				error = so->so_error;
1949 			else
1950 				error = so->so_rerror;
1951 			SOCKBUF_UNLOCK(&so->so_rcv);
1952 		error:
1953 			nvmf_qpair_error(&qp->qp, error);
1954 			SOCKBUF_LOCK(&so->so_rcv);
1955 			while (!qp->rx_shutdown)
1956 				cv_wait(&qp->rx_cv, SOCKBUF_MTX(&so->so_rcv));
1957 			break;
1958 		}
1959 
1960 		m = mbufq_dequeue(&qp->rx_pdus);
1961 		if (m == NULL) {
1962 			if ((so->so_rcv.sb_state & SBS_CANTRCVMORE) != 0) {
1963 				error = 0;
1964 				SOCKBUF_UNLOCK(&so->so_rcv);
1965 				goto error;
1966 			}
1967 			cv_wait(&qp->rx_cv, SOCKBUF_MTX(&so->so_rcv));
1968 			continue;
1969 		}
1970 		SOCKBUF_UNLOCK(&so->so_rcv);
1971 
1972 		pdu.m = m;
1973 		pdu.hdr = mtod(m, const void *);
1974 		pdu.ddp = (m->m_pkthdr.nvmf_cpl_status & CMP_STATUS_DDP) != 0;
1975 
1976 		error = nvmf_che_validate_pdu(qp, &pdu);
1977 		if (error == 0 && pdu.data_len != 0)
1978 			error = nvmf_che_attach_pdu_data(qp, &pdu);
1979 		if (error != 0)
1980 			nvmf_che_free_pdu(&pdu);
1981 		else
1982 			error = nvmf_che_dispatch_pdu(qp, &pdu);
1983 		if (error != 0) {
1984 			/*
1985 			 * If we received a termination request, close
1986 			 * the connection immediately.
1987 			 */
1988 			if (error == ECONNRESET)
1989 				goto error;
1990 
1991 			/*
1992 			 * Wait for up to 30 seconds for the socket to
1993 			 * be closed by the other end.
1994 			 */
1995 			SOCKBUF_LOCK(&so->so_rcv);
1996 			if ((so->so_rcv.sb_state & SBS_CANTRCVMORE) == 0) {
1997 				terror = cv_timedwait(&qp->rx_cv,
1998 				    SOCKBUF_MTX(&so->so_rcv), 30 * hz);
1999 				if (terror == ETIMEDOUT)
2000 					printf("NVMe/TCP: Timed out after sending terminate request\n");
2001 			}
2002 			SOCKBUF_UNLOCK(&so->so_rcv);
2003 			goto error;
2004 		}
2005 
2006 		SOCKBUF_LOCK(&so->so_rcv);
2007 	}
2008 	SOCKBUF_UNLOCK(&so->so_rcv);
2009 	kthread_exit();
2010 }
2011 
2012 static int
2013 nvmf_che_soupcall_receive(struct socket *so, void *arg, int waitflag)
2014 {
2015 	struct nvmf_che_qpair *qp = arg;
2016 
2017 	cv_signal(&qp->rx_cv);
2018 	return (SU_OK);
2019 }
2020 
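/*
 * Handle a CPL_NVMT_DATA message: queue received PDU payload data,
 * tagged with its TCP sequence number, for the receive thread.
 */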
2021 static int
2022 do_nvmt_data(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
2023 {
2024 	struct adapter *sc = iq->adapter;
2025 	struct nvmf_che_adapter *nca = sc->nvme_ulp_softc;
2026 	const struct cpl_nvmt_data *cpl;
2027 	u_int tid;
2028 	struct toepcb *toep;
2029 	struct nvmf_che_qpair *qp;
2030 	struct socket *so;
2031 	struct inpcb *inp;
2032 	struct tcpcb *tp;
2033 	int len __diagused;
2034 
2035 	if (nca->nvmt_data_iqe) {
2036 		cpl = (const void *)(rss + 1);
2037 	} else {
2038 		cpl = mtod(m, const void *);
2039 
2040 		/* strip off CPL header */
2041 		m_adj(m, sizeof(*cpl));
2042 	}
2043 	tid = GET_TID(cpl);
2044 	toep = lookup_tid(sc, tid);
2045 
2046 	KASSERT(toep->tid == tid, ("%s: toep tid/atid mismatch", __func__));
2047 
2048 	len = m->m_pkthdr.len;
2049 
2050 	KASSERT(len == be16toh(cpl->length),
2051 	    ("%s: payload length mismatch", __func__));
2052 
2053 	inp = toep->inp;
2054 	INP_WLOCK(inp);
2055 	if (inp->inp_flags & INP_DROPPED) {
2056 		CTR(KTR_CXGBE, "%s: tid %u, rx (%d bytes), inp_flags 0x%x",
2057 		    __func__, tid, len, inp->inp_flags);
2058 		INP_WUNLOCK(inp);
2059 		m_freem(m);
2060 		return (0);
2061 	}
2062 
2063 	/* Save TCP sequence number. */
2064 	m->m_pkthdr.nvmf_tcp_seq = be32toh(cpl->seq);
2065 
2066 	qp = toep->ulpcb;
2067 	so = qp->so;
2068 	SOCKBUF_LOCK(&so->so_rcv);
2069 	mbufq_enqueue(&qp->rx_data, m);
2070 	SOCKBUF_UNLOCK(&so->so_rcv);
2071 
2072 	tp = intotcpcb(inp);
2073 	tp->t_rcvtime = ticks;
2074 
2075 #ifdef VERBOSE_TRACES
2076 	CTR(KTR_CXGBE, "%s: tid %u len %d seq %u", __func__, tid, len,
2077 	    be32toh(cpl->seq));
2078 #endif
2079 
2080 	INP_WUNLOCK(inp);
2081 	return (0);
2082 }
2083 
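/*
 * Handle a CPL_NVMT_CMP message: queue the received PDU header along
 * with its TCP sequence number and completion status, then wake up
 * the receive thread.
 */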
2084 static int
2085 do_nvmt_cmp(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
2086 {
2087 	struct adapter *sc = iq->adapter;
2088 	const struct cpl_nvmt_cmp *cpl = mtod(m, const void *);
2089 	u_int tid = GET_TID(cpl);
2090 	struct toepcb *toep = lookup_tid(sc, tid);
2091 	struct nvmf_che_qpair *qp = toep->ulpcb;
2092 	struct socket *so = qp->so;
2093 	struct inpcb *inp = toep->inp;
2094 	u_int hlen __diagused;
2095 	bool empty;
2096 
2097 	KASSERT(toep->tid == tid, ("%s: toep tid/atid mismatch", __func__));
2098 	KASSERT(!(toep->flags & TPF_SYNQE),
2099 	    ("%s: toep %p claims to be a synq entry", __func__, toep));
2100 
2101 	/* strip off CPL header */
2102 	m_adj(m, sizeof(*cpl));
2103 	hlen = m->m_pkthdr.len;
2104 
2105 	KASSERT(hlen == be16toh(cpl->length),
2106 	    ("%s: payload length mismatch", __func__));
2107 
2108 	INP_WLOCK(inp);
2109 	if (inp->inp_flags & INP_DROPPED) {
2110 		CTR(KTR_CXGBE, "%s: tid %u, rx (hlen %u), inp_flags 0x%x",
2111 		    __func__, tid, hlen, inp->inp_flags);
2112 		INP_WUNLOCK(inp);
2113 		m_freem(m);
2114 		return (0);
2115 	}
2116 
2117 #ifdef VERBOSE_TRACES
2118 	CTR(KTR_CXGBE, "%s: tid %u hlen %u seq %u status %u", __func__, tid,
2119 	    hlen, be32toh(cpl->seq), cpl->status);
2120 #endif
2121 
2122 	/* Save TCP sequence number and CPL status. */
2123 	m->m_pkthdr.nvmf_tcp_seq = be32toh(cpl->seq);
2124 	m->m_pkthdr.nvmf_cpl_status = cpl->status;
2125 
2126 	SOCKBUF_LOCK(&so->so_rcv);
2127 	empty = mbufq_len(&qp->rx_pdus) == 0;
2128 	mbufq_enqueue(&qp->rx_pdus, m);
2129 	SOCKBUF_UNLOCK(&so->so_rcv);
2130 	INP_WUNLOCK(inp);
2131 	if (empty)
2132 		cv_signal(&qp->rx_cv);
2133 	return (0);
2134 }
2135 
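/*
 * Allocate a freelist CID for a command, recording the original CID in
 * qp->fl_cids so it can be recovered later.  The returned CID is
 * tagged with CHE_FL_TAG_MASK.
 */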
2136 static uint16_t
2137 che_alloc_fl_cid(struct nvmf_che_qpair *qp, uint16_t original_cid)
2138 {
2139 	uint16_t new_cid;
2140 
2141 	mtx_lock(&qp->fl_cid_lock);
2142 	new_cid = FL_CID_FINDFREE_AT(qp->fl_cid_set, qp->next_cid);
2143 	if (new_cid == 0) {
2144 		new_cid = FL_CID_FINDFREE_AT(qp->fl_cid_set, 0);
2145 		MPASS(new_cid != 0);
2146 	}
2147 	new_cid--;
2148 	FL_CID_BUSY(new_cid, qp->fl_cid_set);
2149 	if (new_cid == CHE_MAX_FL_TAG)
2150 		qp->next_cid = 0;
2151 	else
2152 		qp->next_cid = new_cid + 1;
2153 	qp->fl_cids[new_cid] = original_cid;
2154 	mtx_unlock(&qp->fl_cid_lock);
2155 
2156 	return (new_cid | CHE_FL_TAG_MASK);
2157 }
2158 
2159 static uint16_t
2160 che_alloc_ddp_cid(struct nvmf_che_qpair *qp, struct nvmf_che_command_buffer *cb)
2161 {
2162 	mtx_assert(&qp->rx_buffers.lock, MA_OWNED);
2163 
2164 	return (che_alloc_ddp_tag(qp, cb));
2165 }
2166 
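/*
 * Convert a command capsule into a CAPSULE_CMD PDU, selecting
 * in-capsule data, a freelist CID, or a DDP tag for any data transfer.
 */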
2167 static struct mbuf *
2168 che_command_pdu(struct nvmf_che_qpair *qp, struct nvmf_che_capsule *cc)
2169 {
2170 	struct nvmf_capsule *nc = &cc->nc;
2171 	struct nvmf_che_command_buffer *cb;
2172 	struct nvme_sgl_descriptor *sgl;
2173 	struct nvme_tcp_cmd cmd;
2174 	struct mbuf *top, *m;
2175 	uint16_t cid;
2176 	bool use_icd;
2177 
2178 	use_icd = false;
2179 	cb = NULL;
2180 	m = NULL;
2181 
2182 	if (nc->nc_data.io_len != 0) {
2183 		cb = che_alloc_command_buffer(qp, &nc->nc_data, 0,
2184 		    nc->nc_data.io_len, nc->nc_sqe.cid);
2185 		cb->original_cid = nc->nc_sqe.cid;
2186 
2187 		if (nc->nc_send_data && nc->nc_data.io_len <= qp->max_icd) {
2188 			cid = che_alloc_fl_cid(qp, nc->nc_sqe.cid);
2189 			use_icd = true;
2190 			m = nvmf_che_command_buffer_mbuf(cb, 0,
2191 			    nc->nc_data.io_len, NULL, false);
2192 			cb->data_xfered = nc->nc_data.io_len;
2193 			che_release_command_buffer(cb);
2194 		} else if (nc->nc_send_data) {
2195 			cid = che_alloc_fl_cid(qp, nc->nc_sqe.cid);
2196 			cb->cid = htole16(cid);
2197 			mtx_lock(&qp->tx_buffers.lock);
2198 			che_add_command_buffer(&qp->tx_buffers, cb);
2199 			mtx_unlock(&qp->tx_buffers.lock);
2200 		} else {
2201 			mtx_lock(&qp->rx_buffers.lock);
2202 			cid = che_alloc_ddp_cid(qp, cb);
2203 			if (cid == CHE_DDP_NO_TAG) {
2204 				cid = che_alloc_fl_cid(qp, nc->nc_sqe.cid);
2205 				che_add_command_buffer(&qp->rx_buffers, cb);
2206 			}
2207 			cb->cid = htole16(cid);
2208 			mtx_unlock(&qp->rx_buffers.lock);
2209 		}
2210 	} else
2211 		cid = che_alloc_fl_cid(qp, nc->nc_sqe.cid);
2212 
2213 #ifdef VERBOSE_TRACES
2214 	CTR(KTR_CXGBE, "%s: tid %u allocated cid 0x%04x for 0x%04x", __func__,
2215 	    qp->toep->tid, cid, nc->nc_sqe.cid);
2216 #endif
2217 	memset(&cmd, 0, sizeof(cmd));
2218 	cmd.common.pdu_type = NVME_TCP_PDU_TYPE_CAPSULE_CMD;
2219 	cmd.ccsqe = nc->nc_sqe;
2220 	cmd.ccsqe.cid = htole16(cid);
2221 
2222 	/* Populate SGL in SQE. */
2223 	sgl = &cmd.ccsqe.sgl;
2224 	memset(sgl, 0, sizeof(*sgl));
2225 	sgl->address = 0;
2226 	sgl->length = htole32(nc->nc_data.io_len);
2227 	if (use_icd) {
2228 		/* Use in-capsule data. */
2229 		sgl->type = NVME_SGL_TYPE_ICD;
2230 	} else {
2231 		/* Use a command buffer. */
2232 		sgl->type = NVME_SGL_TYPE_COMMAND_BUFFER;
2233 	}
2234 
2235 	top = nvmf_che_construct_pdu(qp, &cmd, sizeof(cmd), m, m != NULL ?
2236 	    nc->nc_data.io_len : 0);
2237 	return (top);
2238 }
2239 
2240 static struct mbuf *
2241 che_response_pdu(struct nvmf_che_qpair *qp, struct nvmf_che_capsule *cc)
2242 {
2243 	struct nvmf_capsule *nc = &cc->nc;
2244 	struct nvme_tcp_rsp rsp;
2245 
2246 	memset(&rsp, 0, sizeof(rsp));
2247 	rsp.common.pdu_type = NVME_TCP_PDU_TYPE_CAPSULE_RESP;
2248 	rsp.rccqe = nc->nc_cqe;
2249 
2250 	return (nvmf_che_construct_pdu(qp, &rsp, sizeof(rsp), NULL, 0));
2251 }
2252 
2253 static struct mbuf *
2254 capsule_to_pdu(struct nvmf_che_qpair *qp, struct nvmf_che_capsule *cc)
2255 {
2256 	if (cc->nc.nc_qe_len == sizeof(struct nvme_command))
2257 		return (che_command_pdu(qp, cc));
2258 	else
2259 		return (che_response_pdu(qp, cc));
2260 }
2261 
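/*
 * Transmit kthread: convert queued capsules into PDUs and write them
 * out via nvmf_che_write_pdu().
 */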
2262 static void
2263 nvmf_che_send(void *arg)
2264 {
2265 	struct nvmf_che_qpair *qp = arg;
2266 	struct nvmf_che_capsule *cc;
2267 	struct socket *so = qp->so;
2268 	struct mbuf *m;
2269 	int error;
2270 
2271 	m = NULL;
2272 	SOCKBUF_LOCK(&so->so_snd);
2273 	while (!qp->tx_shutdown) {
2274 		if (so->so_error != 0) {
2275 			error = so->so_error;
2276 			SOCKBUF_UNLOCK(&so->so_snd);
2277 			m_freem(m);
2278 			nvmf_qpair_error(&qp->qp, error);
2279 			SOCKBUF_LOCK(&so->so_snd);
2280 			while (!qp->tx_shutdown)
2281 				cv_wait(&qp->tx_cv, SOCKBUF_MTX(&so->so_snd));
2282 			break;
2283 		}
2284 
2285 		if (STAILQ_EMPTY(&qp->tx_capsules)) {
2286 			cv_wait(&qp->tx_cv, SOCKBUF_MTX(&so->so_snd));
2287 			continue;
2288 		}
2289 
2290 		/* Convert a capsule into a PDU. */
2291 		cc = STAILQ_FIRST(&qp->tx_capsules);
2292 		STAILQ_REMOVE_HEAD(&qp->tx_capsules, link);
2293 		SOCKBUF_UNLOCK(&so->so_snd);
2294 
2295 		m = capsule_to_pdu(qp, cc);
2296 		che_release_capsule(cc);
2297 
2298 		nvmf_che_write_pdu(qp, m);
2299 
2300 		SOCKBUF_LOCK(&so->so_snd);
2301 	}
2302 	SOCKBUF_UNLOCK(&so->so_snd);
2303 	kthread_exit();
2304 }
2305 
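/*
 * Ensure the socket buffers are at least the requested sizes and
 * disable Nagle.
 */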
2306 static int
2307 nvmf_che_setsockopt(struct socket *so, u_int sspace, u_int rspace)
2308 {
2309 	struct sockopt opt;
2310 	int error, one = 1;
2311 
2312 	/* Don't lower the buffer sizes, just enforce a minimum. */
2313 	SOCKBUF_LOCK(&so->so_snd);
2314 	if (sspace < so->so_snd.sb_hiwat)
2315 		sspace = so->so_snd.sb_hiwat;
2316 	SOCKBUF_UNLOCK(&so->so_snd);
2317 	SOCKBUF_LOCK(&so->so_rcv);
2318 	if (rspace < so->so_rcv.sb_hiwat)
2319 		rspace = so->so_rcv.sb_hiwat;
2320 	SOCKBUF_UNLOCK(&so->so_rcv);
2321 
2322 	error = soreserve(so, sspace, rspace);
2323 	if (error != 0)
2324 		return (error);
2325 	SOCKBUF_LOCK(&so->so_snd);
2326 	so->so_snd.sb_flags |= SB_AUTOSIZE;
2327 	SOCKBUF_UNLOCK(&so->so_snd);
2328 	SOCKBUF_LOCK(&so->so_rcv);
2329 	so->so_rcv.sb_flags |= SB_AUTOSIZE;
2330 	SOCKBUF_UNLOCK(&so->so_rcv);
2331 
2332 	/*
2333 	 * Disable Nagle.
2334 	 */
2335 	bzero(&opt, sizeof(opt));
2336 	opt.sopt_dir = SOPT_SET;
2337 	opt.sopt_level = IPPROTO_TCP;
2338 	opt.sopt_name = TCP_NODELAY;
2339 	opt.sopt_val = &one;
2340 	opt.sopt_valsize = sizeof(one);
2341 	error = sosetopt(so, &opt);
2342 	if (error != 0)
2343 		return (error);
2344 
2345 	return (0);
2346 }
2347 
2348 static void
2349 t4_nvme_set_tcb_field(struct toepcb *toep, uint16_t word, uint64_t mask,
2350     uint64_t val)
2351 {
2352 	struct adapter *sc = td_adapter(toep->td);
2353 
2354 	t4_set_tcb_field(sc, &toep->ofld_txq->wrq, toep, word, mask, val, 0, 0);
2355 }
2356 
2357 static void
2358 set_ulp_mode_nvme(struct toepcb *toep, u_int ulp_submode, uint8_t rxpda)
2359 {
2360 	uint64_t val;
2361 
2362 	CTR(KTR_CXGBE, "%s: tid %u, ULP_MODE_NVMET, submode=%#x, rxpda=%u",
2363 	    __func__, toep->tid, ulp_submode, rxpda);
2364 
2365 	val = V_TCB_ULP_TYPE(ULP_MODE_NVMET) | V_TCB_ULP_RAW(ulp_submode);
2366 	t4_nvme_set_tcb_field(toep, W_TCB_ULP_TYPE,
2367 	    V_TCB_ULP_TYPE(M_TCB_ULP_TYPE) | V_TCB_ULP_RAW(M_TCB_ULP_RAW), val);
2368 
2369 	val = V_TF_RX_FLOW_CONTROL_DISABLE(1ULL);
2370 	t4_nvme_set_tcb_field(toep, W_TCB_T_FLAGS, val, val);
2371 
2372 	val = V_TCB_RSVD((rxpda / 4) - 1);
2373 	t4_nvme_set_tcb_field(toep, W_TCB_RSVD, V_TCB_RSVD(M_TCB_RSVD), val);
2374 
2375 	/* 0 disables CPL_NVMT_CMP_IMM which is not useful in this driver. */
2376 	val = 0;
2377 	t4_nvme_set_tcb_field(toep, W_TCB_CMP_IMM_SZ,
2378 	    V_TCB_CMP_IMM_SZ(M_TCB_CMP_IMM_SZ), val);
2379 }
2380 
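/*
 * Compute the largest data payload that fits in a PDU of max_pdu_len
 * bytes given the header length, the PDU data alignment, and the
 * negotiated digests.
 */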
2381 static u_int
2382 pdu_max_data_len(const nvlist_t *nvl, u_int max_pdu_len, u_int hlen,
2383     uint8_t pda)
2384 {
2385 	u_int max_data_len;
2386 
2387 	if (nvlist_get_bool(nvl, "header_digests"))
2388 		hlen += sizeof(uint32_t);
2389 	hlen = roundup(hlen, pda);
2390 	max_data_len = max_pdu_len - hlen;
2391 	if (nvlist_get_bool(nvl, "data_digests"))
2392 		max_data_len -= sizeof(uint32_t);
2393 	return (max_data_len);
2394 }
2395 
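/*
 * Create a queue pair on top of an offloaded TCP socket: validate the
 * negotiated parameters, claim the socket, switch the connection to
 * ULP_MODE_NVMET, reserve DDP tags, and start the receive and transmit
 * kthreads.
 */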
2396 static struct nvmf_qpair *
2397 che_allocate_qpair(bool controller, const nvlist_t *nvl)
2398 {
2399 	struct nvmf_che_adapter *nca;
2400 	struct nvmf_che_qpair *qp;
2401 	struct adapter *sc;
2402 	struct file *fp;
2403 	struct socket *so;
2404 	struct inpcb *inp;
2405 	struct tcpcb *tp;
2406 	struct toepcb *toep;
2407 	cap_rights_t rights;
2408 	u_int max_tx_pdu_len, num_ddp_tags;
2409 	int error, ulp_submode;
2410 
2411 	if (!nvlist_exists_number(nvl, "fd") ||
2412 	    !nvlist_exists_number(nvl, "rxpda") ||
2413 	    !nvlist_exists_number(nvl, "txpda") ||
2414 	    !nvlist_exists_bool(nvl, "header_digests") ||
2415 	    !nvlist_exists_bool(nvl, "data_digests") ||
2416 	    !nvlist_exists_number(nvl, "maxr2t") ||
2417 	    !nvlist_exists_number(nvl, "maxh2cdata") ||
2418 	    !nvlist_exists_number(nvl, "max_icd"))
2419 		return (NULL);
2420 
2421 	error = fget(curthread, nvlist_get_number(nvl, "fd"),
2422 	    cap_rights_init_one(&rights, CAP_SOCK_CLIENT), &fp);
2423 	if (error != 0)
2424 		return (NULL);
2425 	if (fp->f_type != DTYPE_SOCKET) {
2426 		fdrop(fp, curthread);
2427 		return (NULL);
2428 	}
2429 	so = fp->f_data;
2430 	if (so->so_type != SOCK_STREAM ||
2431 	    so->so_proto->pr_protocol != IPPROTO_TCP) {
2432 		fdrop(fp, curthread);
2433 		return (NULL);
2434 	}
2435 
2436 	sc = find_offload_adapter(so);
2437 	if (sc == NULL) {
2438 		fdrop(fp, curthread);
2439 		return (NULL);
2440 	}
2441 	nca = sc->nvme_ulp_softc;
2442 
2443 	/*
2444 	 * Controller: Require advertised MAXH2CDATA to be small
2445 	 * enough.
2446 	 */
2447 	if (controller) {
2448 		u_int max_rx_data;
2449 
2450 		max_rx_data = pdu_max_data_len(nvl, nca->max_receive_pdu,
2451 		    sizeof(struct nvme_tcp_h2c_data_hdr),
2452 		    nvlist_get_number(nvl, "rxpda"));
2453 		if (nvlist_get_number(nvl, "maxh2cdata") > max_rx_data) {
2454 			fdrop(fp, curthread);
2455 			return (NULL);
2456 		}
2457 	}
2458 
2459 	/*
2460 	 * Host: Require the queue size to be small enough that all of
2461 	 * the command ids allocated by nvmf(4) will fit in the
2462 	 * unallocated range.
2463 	 *
2464 	 * XXX: Alternatively this driver could just queue commands
2465 	 * when an unallocated ID isn't available.
2466 	 */
2467 	if (!controller) {
2468 		u_int num_commands;
2469 
2470 		num_commands = nvlist_get_number(nvl, "qsize") - 1;
2471 		if (nvlist_get_bool(nvl, "admin"))
2472 			num_commands += 8;	/* Max AER */
2473 		if (num_commands > CHE_NUM_FL_TAGS) {
2474 			fdrop(fp, curthread);
2475 			return (NULL);
2476 		}
2477 	}
2478 
2479 	qp = malloc(sizeof(*qp), M_NVMF_CHE, M_WAITOK | M_ZERO);
2480 	qp->txpda = nvlist_get_number(nvl, "txpda");
2481 	qp->rxpda = nvlist_get_number(nvl, "rxpda");
2482 	qp->header_digests = nvlist_get_bool(nvl, "header_digests");
2483 	qp->data_digests = nvlist_get_bool(nvl, "data_digests");
2484 	qp->maxr2t = nvlist_get_number(nvl, "maxr2t");
2485 	if (controller)
2486 		qp->maxh2cdata = nvlist_get_number(nvl, "maxh2cdata");
2487 
2488 	if (controller) {
2489 		/* NB: maxr2t is 0's based. */
2490 		qp->num_fl_ttags = MIN(CHE_NUM_FL_TAGS,
2491 		    nvlist_get_number(nvl, "qsize") *
2492 		    ((uint64_t)qp->maxr2t + 1));
2493 		qp->open_fl_ttags = mallocarray(qp->num_fl_ttags,
2494 		    sizeof(*qp->open_fl_ttags), M_NVMF_CHE, M_WAITOK | M_ZERO);
2495 	} else {
2496 		qp->fl_cids = mallocarray(CHE_NUM_FL_TAGS,
2497 		    sizeof(*qp->fl_cids), M_NVMF_CHE, M_WAITOK | M_ZERO);
2498 		qp->fl_cid_set = malloc(sizeof(*qp->fl_cid_set), M_NVMF_CHE,
2499 		    M_WAITOK);
2500 		FL_CID_INIT(qp->fl_cid_set);
2501 		mtx_init(&qp->fl_cid_lock,  "nvmf/che fl cids", NULL, MTX_DEF);
2502 	}
2503 
2504 	inp = sotoinpcb(so);
2505 	INP_WLOCK(inp);
2506 	tp = intotcpcb(inp);
2507 	if (inp->inp_flags & INP_DROPPED) {
2508 		INP_WUNLOCK(inp);
2509 		free(qp->fl_cid_set, M_NVMF_CHE);
2510 		free(qp->fl_cids, M_NVMF_CHE);
2511 		free(qp->open_fl_ttags, M_NVMF_CHE);
2512 		free(qp, M_NVMF_CHE);
2513 		fdrop(fp, curthread);
2514 		return (NULL);
2515 	}
2516 
2517 	MPASS(tp->t_flags & TF_TOE);
2518 	MPASS(tp->tod != NULL);
2519 	MPASS(tp->t_toe != NULL);
2520 	toep = tp->t_toe;
2521 	MPASS(toep->vi->adapter == sc);
2522 
2523 	if (ulp_mode(toep) != ULP_MODE_NONE) {
2524 		INP_WUNLOCK(inp);
2525 		free(qp->fl_cid_set, M_NVMF_CHE);
2526 		free(qp->fl_cids, M_NVMF_CHE);
2527 		free(qp->open_fl_ttags, M_NVMF_CHE);
2528 		free(qp, M_NVMF_CHE);
2529 		fdrop(fp, curthread);
2530 		return (NULL);
2531 	}
2532 
2533 	/* Claim socket from file descriptor. */
2534 	fp->f_ops = &badfileops;
2535 	fp->f_data = NULL;
2536 
2537 	qp->so = so;
2538 	qp->toep = toep;
2539 	qp->nca = nca;
2540 	refcount_init(&qp->refs, 1);
2541 
2542 	/* NB: C2H and H2C headers are the same size. */
2543 	qp->max_rx_data = pdu_max_data_len(nvl, nca->max_receive_pdu,
2544 	    sizeof(struct nvme_tcp_c2h_data_hdr), qp->rxpda);
2545 	qp->max_tx_data = pdu_max_data_len(nvl, nca->max_transmit_pdu,
2546 	    sizeof(struct nvme_tcp_c2h_data_hdr), qp->txpda);
2547 	if (!controller) {
2548 		qp->max_tx_data = min(qp->max_tx_data,
2549 		    nvlist_get_number(nvl, "maxh2cdata"));
2550 		qp->max_icd = min(nvlist_get_number(nvl, "max_icd"),
2551 		    pdu_max_data_len(nvl, nca->max_transmit_pdu,
2552 		    sizeof(struct nvme_tcp_cmd), qp->txpda));
2553 	} else {
2554 		/*
2555 		 * IOCCSZ represents the size of a logical command
2556 		 * capsule including the 64 byte SQE and the
2557 		 * in-capsule data.  Use pdu_max_data_len to compute
2558 		 * the maximum supported ICD length.
2559 		 */
2560 		qp->max_ioccsz = rounddown(pdu_max_data_len(nvl,
2561 		    nca->max_receive_pdu, sizeof(struct nvme_tcp_cmd),
2562 		    qp->rxpda), 16) + sizeof(struct nvme_command);
2563 	}
2564 
2565 	ulp_submode = 0;
2566 	if (qp->header_digests)
2567 		ulp_submode |= FW_NVMET_ULPSUBMODE_HCRC;
2568 	if (qp->data_digests)
2569 		ulp_submode |= FW_NVMET_ULPSUBMODE_DCRC;
2570 	if (!controller)
2571 		ulp_submode |= FW_NVMET_ULPSUBMODE_ING_DIR;
2572 
2573 	max_tx_pdu_len = sizeof(struct nvme_tcp_h2c_data_hdr);
2574 	if (qp->header_digests)
2575 		max_tx_pdu_len += sizeof(uint32_t);
2576 	max_tx_pdu_len = roundup(max_tx_pdu_len, qp->txpda);
2577 	max_tx_pdu_len += qp->max_tx_data;
2578 	if (qp->data_digests)
2579 		max_tx_pdu_len += sizeof(uint32_t);
2580 
2581 	/* TODO: ISO limits */
2582 
2583 	if (controller) {
2584 		/* Use the SUCCESS flag if SQ flow control is disabled. */
2585 		qp->send_success = !nvlist_get_bool(nvl, "sq_flow_control");
2586 	}
2587 
2588 	toep->params.ulp_mode = ULP_MODE_NVMET;
2589 	toep->ulpcb = qp;
2590 
2591 	send_txdataplen_max_flowc_wr(sc, toep,
2592 	    roundup(/* max_iso_pdus * */ max_tx_pdu_len, tp->t_maxseg));
2593 	set_ulp_mode_nvme(toep, ulp_submode, qp->rxpda);
2594 	INP_WUNLOCK(inp);
2595 
2596 	fdrop(fp, curthread);
2597 
2598 	error = nvmf_che_setsockopt(so, max_tx_pdu_len, nca->max_receive_pdu);
2599 	if (error != 0) {
2600 		free(qp->fl_cid_set, M_NVMF_CHE);
2601 		free(qp->fl_cids, M_NVMF_CHE);
2602 		free(qp->open_fl_ttags, M_NVMF_CHE);
2603 		free(qp, M_NVMF_CHE);
2604 		return (NULL);
2605 	}
2606 
2607 	num_ddp_tags = ddp_tags_per_qp;
2608 	if (num_ddp_tags > 0) {
2609 		qp->tpt_offset = t4_stag_alloc(sc, num_ddp_tags);
2610 		if (qp->tpt_offset != T4_STAG_UNSET) {
2611 #ifdef VERBOSE_TRACES
2612 			CTR(KTR_CXGBE,
2613 			    "%s: tid %u using %u tags at offset 0x%x",
2614 			    __func__, toep->tid, num_ddp_tags, qp->tpt_offset);
2615 #endif
2616 			qp->num_ddp_tags = num_ddp_tags;
2617 			qp->open_ddp_tags = mallocarray(qp->num_ddp_tags,
2618 			    sizeof(*qp->open_ddp_tags), M_NVMF_CHE, M_WAITOK |
2619 			    M_ZERO);
2620 
2621 			t4_nvme_set_tcb_field(toep, W_TCB_TPT_OFFSET,
2622 			    M_TCB_TPT_OFFSET, V_TCB_TPT_OFFSET(qp->tpt_offset));
2623 		}
2624 	}
2625 
2626 	TAILQ_INIT(&qp->rx_buffers.head);
2627 	TAILQ_INIT(&qp->tx_buffers.head);
2628 	mtx_init(&qp->rx_buffers.lock, "nvmf/che rx buffers", NULL, MTX_DEF);
2629 	mtx_init(&qp->tx_buffers.lock, "nvmf/che tx buffers", NULL, MTX_DEF);
2630 
2631 	cv_init(&qp->rx_cv, "-");
2632 	cv_init(&qp->tx_cv, "-");
2633 	mbufq_init(&qp->rx_data, 0);
2634 	mbufq_init(&qp->rx_pdus, 0);
2635 	STAILQ_INIT(&qp->tx_capsules);
2636 
2637 	/* Register socket upcall for receive to handle remote FIN. */
2638 	SOCKBUF_LOCK(&so->so_rcv);
2639 	soupcall_set(so, SO_RCV, nvmf_che_soupcall_receive, qp);
2640 	SOCKBUF_UNLOCK(&so->so_rcv);
2641 
2642 	/* Spin up kthreads. */
2643 	error = kthread_add(nvmf_che_receive, qp, NULL, &qp->rx_thread, 0, 0,
2644 	    "nvmef che rx");
2645 	if (error != 0) {
2646 		che_free_qpair(&qp->qp);
2647 		return (NULL);
2648 	}
2649 	error = kthread_add(nvmf_che_send, qp, NULL, &qp->tx_thread, 0, 0,
2650 	    "nvmef che tx");
2651 	if (error != 0) {
2652 		che_free_qpair(&qp->qp);
2653 		return (NULL);
2654 	}
2655 
2656 	return (&qp->qp);
2657 }
2658 
2659 static void
2660 che_release_qpair(struct nvmf_che_qpair *qp)
2661 {
2662 	if (refcount_release(&qp->refs))
2663 		free(qp, M_NVMF_CHE);
2664 }
2665 
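/*
 * Tear down a queue pair: stop the kthreads, release any outstanding
 * command buffers and capsules, free DDP resources, and wait for the
 * connection's final CPL before releasing the last reference.
 */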
2666 static void
2667 che_free_qpair(struct nvmf_qpair *nq)
2668 {
2669 	struct nvmf_che_qpair *qp = CQP(nq);
2670 	struct nvmf_che_command_buffer *ncb, *cb;
2671 	struct nvmf_che_capsule *ncc, *cc;
2672 	struct socket *so = qp->so;
2673 	struct toepcb *toep = qp->toep;
2674 	struct inpcb *inp = sotoinpcb(so);
2675 
2676 	/* Shut down kthreads. */
2677 	SOCKBUF_LOCK(&so->so_snd);
2678 	qp->tx_shutdown = true;
2679 	if (qp->tx_thread != NULL) {
2680 		cv_signal(&qp->tx_cv);
2681 		mtx_sleep(qp->tx_thread, SOCKBUF_MTX(&so->so_snd), 0,
2682 		    "nvchetx", 0);
2683 	}
2684 	SOCKBUF_UNLOCK(&so->so_snd);
2685 
2686 	SOCKBUF_LOCK(&so->so_rcv);
2687 	qp->rx_shutdown = true;
2688 	if (qp->rx_thread != NULL) {
2689 		cv_signal(&qp->rx_cv);
2690 		mtx_sleep(qp->rx_thread, SOCKBUF_MTX(&so->so_rcv), 0,
2691 		    "nvcherx", 0);
2692 	}
2693 	soupcall_clear(so, SO_RCV);
2694 	SOCKBUF_UNLOCK(&so->so_rcv);
2695 	mbufq_drain(&qp->rx_data);
2696 	mbufq_drain(&qp->rx_pdus);
2697 
2698 	STAILQ_FOREACH_SAFE(cc, &qp->tx_capsules, link, ncc) {
2699 		nvmf_abort_capsule_data(&cc->nc, ECONNABORTED);
2700 		che_release_capsule(cc);
2701 	}
2702 
2703 	cv_destroy(&qp->tx_cv);
2704 	cv_destroy(&qp->rx_cv);
2705 
2706 	if (qp->open_fl_ttags != NULL) {
2707 		for (u_int i = 0; i < qp->num_fl_ttags; i++) {
2708 			cb = qp->open_fl_ttags[i];
2709 			if (cb != NULL) {
2710 				cb->cc->active_r2ts--;
2711 				cb->error = ECONNABORTED;
2712 				che_release_command_buffer(cb);
2713 			}
2714 		}
2715 		free(qp->open_fl_ttags, M_NVMF_CHE);
2716 	}
2717 	if (qp->num_ddp_tags != 0) {
2718 		for (u_int i = 0; i < qp->num_ddp_tags; i++) {
2719 			cb = qp->open_ddp_tags[i];
2720 			if (cb != NULL) {
2721 				if (cb->cc != NULL)
2722 					cb->cc->active_r2ts--;
2723 				cb->error = ECONNABORTED;
2724 				mtx_lock(&qp->rx_buffers.lock);
2725 				che_free_ddp_tag(qp, cb, cb->ttag);
2726 				mtx_unlock(&qp->rx_buffers.lock);
2727 				che_release_command_buffer(cb);
2728 			}
2729 		}
2730 		free(qp->open_ddp_tags, M_NVMF_CHE);
2731 	}
2732 
2733 	mtx_lock(&qp->rx_buffers.lock);
2734 	TAILQ_FOREACH_SAFE(cb, &qp->rx_buffers.head, link, ncb) {
2735 		che_remove_command_buffer(&qp->rx_buffers, cb);
2736 		mtx_unlock(&qp->rx_buffers.lock);
2737 #ifdef INVARIANTS
2738 		if (cb->cc != NULL)
2739 			cb->cc->pending_r2ts--;
2740 #endif
2741 		cb->error = ECONNABORTED;
2742 		che_release_command_buffer(cb);
2743 		mtx_lock(&qp->rx_buffers.lock);
2744 	}
2745 	mtx_destroy(&qp->rx_buffers.lock);
2746 
2747 	mtx_lock(&qp->tx_buffers.lock);
2748 	TAILQ_FOREACH_SAFE(cb, &qp->tx_buffers.head, link, ncb) {
2749 		che_remove_command_buffer(&qp->tx_buffers, cb);
2750 		mtx_unlock(&qp->tx_buffers.lock);
2751 		cb->error = ECONNABORTED;
2752 		che_release_command_buffer(cb);
2753 		mtx_lock(&qp->tx_buffers.lock);
2754 	}
2755 	mtx_destroy(&qp->tx_buffers.lock);
2756 
2757 	if (qp->num_ddp_tags != 0)
2758 		t4_stag_free(qp->nca->sc, qp->tpt_offset, qp->num_ddp_tags);
2759 
2760 	if (!qp->qp.nq_controller) {
2761 		free(qp->fl_cids, M_NVMF_CHE);
2762 		free(qp->fl_cid_set, M_NVMF_CHE);
2763 		mtx_destroy(&qp->fl_cid_lock);
2764 	}
2765 
2766 	INP_WLOCK(inp);
2767 	toep->ulpcb = NULL;
2768 	mbufq_drain(&toep->ulp_pduq);
2769 
2770 	/*
2771 	 * Grab a reference to use when waiting for the final CPL to
2772 	 * be received.  If toep->inp is NULL, then
2773 	 * final_cpl_received() has already been called (e.g.  due to
2774 	 * the peer sending a RST).
2775 	 */
2776 	if (toep->inp != NULL) {
2777 		toep = hold_toepcb(toep);
2778 		toep->flags |= TPF_WAITING_FOR_FINAL;
2779 	} else
2780 		toep = NULL;
2781 	INP_WUNLOCK(inp);
2782 
2783 	soclose(so);
2784 
2785 	/*
2786 	 * Wait for the socket to fully close.  This ensures any
2787 	 * pending received data has been received (and in particular,
2788 	 * any data that would be received by DDP has been handled).
2789 	 */
2790 	if (toep != NULL) {
2791 		struct mtx *lock = mtx_pool_find(mtxpool_sleep, toep);
2792 
2793 		mtx_lock(lock);
2794 		while ((toep->flags & TPF_WAITING_FOR_FINAL) != 0)
2795 			mtx_sleep(toep, lock, PSOCK, "conclo2", 0);
2796 		mtx_unlock(lock);
2797 		free_toepcb(toep);
2798 	}
2799 
2800 	che_release_qpair(qp);
2801 }
2802 
2803 static uint32_t
2804 che_max_ioccsz(struct nvmf_qpair *nq)
2805 {
2806 	struct nvmf_che_qpair *qp = CQP(nq);
2807 
2808 	/*
2809 	 * Limit the command capsule size so that with maximum ICD it
2810 	 * fits within the limit of the largest PDU the adapter can
2811 	 * receive.
2812 	 */
2813 	return (qp->max_ioccsz);
2814 }
2815 
2816 static uint64_t
2817 che_max_xfer_size(struct nvmf_qpair *nq)
2818 {
2819 	struct nvmf_che_qpair *qp = CQP(nq);
2820 
2821 	/*
2822 	 * Limit host transfers to the size of the data payload in the
2823 	 * largest PDU the adapter can receive.
2824 	 */
2825 	return (qp->max_rx_data);
2826 }
2827 
2828 static struct nvmf_capsule *
2829 che_allocate_capsule(struct nvmf_qpair *nq, int how)
2830 {
2831 	struct nvmf_che_qpair *qp = CQP(nq);
2832 	struct nvmf_che_capsule *cc;
2833 
2834 	cc = malloc(sizeof(*cc), M_NVMF_CHE, how | M_ZERO);
2835 	if (cc == NULL)
2836 		return (NULL);
2837 	refcount_init(&cc->refs, 1);
2838 	refcount_acquire(&qp->refs);
2839 	return (&cc->nc);
2840 }
2841 
2842 static void
2843 che_release_capsule(struct nvmf_che_capsule *cc)
2844 {
2845 	struct nvmf_che_qpair *qp = CQP(cc->nc.nc_qpair);
2846 
2847 	if (!refcount_release(&cc->refs))
2848 		return;
2849 
2850 	MPASS(cc->active_r2ts == 0);
2851 	MPASS(cc->pending_r2ts == 0);
2852 
2853 	nvmf_che_free_pdu(&cc->rx_pdu);
2854 	free(cc, M_NVMF_CHE);
2855 	che_release_qpair(qp);
2856 }
2857 
2858 static void
2859 che_free_capsule(struct nvmf_capsule *nc)
2860 {
2861 	che_release_capsule(CCAP(nc));
2862 }
2863 
2864 static int
2865 che_transmit_capsule(struct nvmf_capsule *nc)
2866 {
2867 	struct nvmf_che_qpair *qp = CQP(nc->nc_qpair);
2868 	struct nvmf_che_capsule *cc = CCAP(nc);
2869 	struct socket *so = qp->so;
2870 
2871 	refcount_acquire(&cc->refs);
2872 	SOCKBUF_LOCK(&so->so_snd);
2873 	STAILQ_INSERT_TAIL(&qp->tx_capsules, cc, link);
2874 	cv_signal(&qp->tx_cv);
2875 	SOCKBUF_UNLOCK(&so->so_snd);
2876 	return (0);
2877 }
2878 
2879 static uint8_t
2880 che_validate_command_capsule(struct nvmf_capsule *nc)
2881 {
2882 	struct nvmf_che_capsule *cc = CCAP(nc);
2883 	struct nvme_sgl_descriptor *sgl;
2884 
2885 	KASSERT(cc->rx_pdu.hdr != NULL, ("capsule wasn't received"));
2886 
2887 	sgl = &nc->nc_sqe.sgl;
2888 	switch (sgl->type) {
2889 	case NVME_SGL_TYPE_ICD:
2890 		if (cc->rx_pdu.data_len != le32toh(sgl->length)) {
2891 			printf("NVMe/TCP: Command Capsule with mismatched ICD length\n");
2892 			return (NVME_SC_DATA_SGL_LENGTH_INVALID);
2893 		}
2894 		break;
2895 	case NVME_SGL_TYPE_COMMAND_BUFFER:
2896 		if (cc->rx_pdu.data_len != 0) {
2897 			printf("NVMe/TCP: Command Buffer SGL with ICD\n");
2898 			return (NVME_SC_INVALID_FIELD);
2899 		}
2900 		break;
2901 	default:
2902 		printf("NVMe/TCP: Invalid SGL type in Command Capsule\n");
2903 		return (NVME_SC_SGL_DESCRIPTOR_TYPE_INVALID);
2904 	}
2905 
2906 	if (sgl->address != 0) {
2907 		printf("NVMe/TCP: Invalid SGL offset in Command Capsule\n");
2908 		return (NVME_SC_SGL_OFFSET_INVALID);
2909 	}
2910 
2911 	return (NVME_SC_SUCCESS);
2912 }
2913 
2914 static size_t
2915 che_capsule_data_len(const struct nvmf_capsule *nc)
2916 {
2917 	MPASS(nc->nc_qe_len == sizeof(struct nvme_command));
2918 	return (le32toh(nc->nc_sqe.sgl.length));
2919 }
2920 
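/*
 * Controller: satisfy a data transfer from the host by allocating a
 * command buffer and sending an R2T for it.
 */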
2921 static void
2922 che_receive_r2t_data(struct nvmf_capsule *nc, uint32_t data_offset,
2923     struct nvmf_io_request *io)
2924 {
2925 	struct nvmf_che_qpair *qp = CQP(nc->nc_qpair);
2926 	struct nvmf_che_capsule *cc = CCAP(nc);
2927 	struct nvmf_che_command_buffer *cb;
2928 
2929 	cb = che_alloc_command_buffer(qp, io, data_offset, io->io_len,
2930 	    nc->nc_sqe.cid);
2931 
2932 	cb->cc = cc;
2933 	refcount_acquire(&cc->refs);
2934 
2935 	/*
2936 	 * If this command has too many active R2Ts or there are no
2937 	 * available transfer tags, queue the request for later.
2938 	 *
2939 	 * NB: maxr2t is 0's based.
2940 	 */
2941 	mtx_lock(&qp->rx_buffers.lock);
2942 	if (cc->active_r2ts > qp->maxr2t ||
2943 	    !nvmf_che_allocate_ttag(qp, cb)) {
2944 #ifdef INVARIANTS
2945 		cc->pending_r2ts++;
2946 #endif
2947 		TAILQ_INSERT_TAIL(&qp->rx_buffers.head, cb, link);
2948 		mtx_unlock(&qp->rx_buffers.lock);
2949 		return;
2950 	}
2951 	mtx_unlock(&qp->rx_buffers.lock);
2952 
2953 	che_send_r2t(qp, nc->nc_sqe.cid, cb->ttag, data_offset, io->io_len);
2954 }
2955 
2956 static void
2957 che_receive_icd_data(struct nvmf_capsule *nc, uint32_t data_offset,
2958     struct nvmf_io_request *io)
2959 {
2960 	struct nvmf_che_capsule *cc = CCAP(nc);
2961 
2962 	/*
2963 	 * The header is in rx_pdu.m, the padding is discarded, and
2964 	 * the data starts at rx_pdu.m->m_next.
2965 	 */
2966 	mbuf_copyto_io(cc->rx_pdu.m->m_next, data_offset, io->io_len, io, 0);
2967 	nvmf_complete_io_request(io, io->io_len, 0);
2968 }
2969 
2970 static int
2971 che_receive_controller_data(struct nvmf_capsule *nc, uint32_t data_offset,
2972     struct nvmf_io_request *io)
2973 {
2974 	struct nvme_sgl_descriptor *sgl;
2975 	size_t data_len;
2976 
2977 	if (nc->nc_qe_len != sizeof(struct nvme_command) ||
2978 	    !nc->nc_qpair->nq_controller)
2979 		return (EINVAL);
2980 
2981 	sgl = &nc->nc_sqe.sgl;
2982 	data_len = le32toh(sgl->length);
2983 	if (data_offset + io->io_len > data_len)
2984 		return (EFBIG);
2985 
2986 	if (sgl->type == NVME_SGL_TYPE_ICD)
2987 		che_receive_icd_data(nc, data_offset, io);
2988 	else
2989 		che_receive_r2t_data(nc, data_offset, io);
2990 	return (0);
2991 }
2992 
2993 /* NB: cid is little-endian already. */
2994 static void
2995 che_send_c2h_pdu(struct nvmf_che_qpair *qp, uint16_t cid, uint32_t data_offset,
2996     struct mbuf *m, size_t len, bool last_pdu, bool success)
2997 {
2998 	struct nvme_tcp_c2h_data_hdr c2h;
2999 	struct mbuf *top;
3000 
3001 	memset(&c2h, 0, sizeof(c2h));
3002 	c2h.common.pdu_type = NVME_TCP_PDU_TYPE_C2H_DATA;
3003 	if (last_pdu)
3004 		c2h.common.flags |= NVME_TCP_C2H_DATA_FLAGS_LAST_PDU;
3005 	if (success)
3006 		c2h.common.flags |= NVME_TCP_C2H_DATA_FLAGS_SUCCESS;
3007 	c2h.cccid = cid;
3008 	c2h.datao = htole32(data_offset);
3009 	c2h.datal = htole32(len);
3010 
3011 	top = nvmf_che_construct_pdu(qp, &c2h, sizeof(c2h), m, len);
3012 	nvmf_che_write_pdu(qp, top);
3013 }
3014 
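/*
 * Controller: send data for a transfer back to the host, splitting the
 * mbuf chain so that no C2H_DATA PDU exceeds max_tx_data.
 */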
3015 static u_int
3016 che_send_controller_data(struct nvmf_capsule *nc, uint32_t data_offset,
3017     struct mbuf *m, size_t len)
3018 {
3019 	struct nvmf_che_qpair *qp = CQP(nc->nc_qpair);
3020 	struct nvme_sgl_descriptor *sgl;
3021 	uint32_t data_len;
3022 	bool last_pdu, last_xfer;
3023 
3024 	if (nc->nc_qe_len != sizeof(struct nvme_command) ||
3025 	    !qp->qp.nq_controller) {
3026 		m_freem(m);
3027 		return (NVME_SC_INVALID_FIELD);
3028 	}
3029 
3030 	sgl = &nc->nc_sqe.sgl;
3031 	data_len = le32toh(sgl->length);
3032 	if (data_offset + len > data_len) {
3033 		m_freem(m);
3034 		return (NVME_SC_INVALID_FIELD);
3035 	}
3036 	last_xfer = (data_offset + len == data_len);
3037 
3038 	if (sgl->type != NVME_SGL_TYPE_COMMAND_BUFFER) {
3039 		m_freem(m);
3040 		return (NVME_SC_INVALID_FIELD);
3041 	}
3042 
3043 	KASSERT(data_offset == CCAP(nc)->tx_data_offset,
3044 	    ("%s: starting data_offset %u doesn't match end of previous xfer %u",
3045 	    __func__, data_offset, CCAP(nc)->tx_data_offset));
3046 
3047 	/* Queue one or more C2H_DATA PDUs containing the data from 'm'. */
3048 	while (m != NULL) {
3049 		struct mbuf *n;
3050 		uint32_t todo;
3051 
3052 		if (m->m_len > qp->max_tx_data) {
3053 			n = m_split(m, qp->max_tx_data, M_WAITOK);
3054 			todo = m->m_len;
3055 		} else {
3056 			struct mbuf *p;
3057 
3058 			todo = m->m_len;
3059 			p = m;
3060 			n = p->m_next;
3061 			while (n != NULL) {
3062 				if (todo + n->m_len > qp->max_tx_data) {
3063 					p->m_next = NULL;
3064 					break;
3065 				}
3066 				todo += n->m_len;
3067 				p = n;
3068 				n = p->m_next;
3069 			}
3070 			MPASS(m_length(m, NULL) == todo);
3071 		}
3072 
3073 		last_pdu = (n == NULL && last_xfer);
3074 		che_send_c2h_pdu(qp, nc->nc_sqe.cid, data_offset, m, todo,
3075 		    last_pdu, last_pdu && qp->send_success);
3076 
3077 		data_offset += todo;
3078 		data_len -= todo;
3079 		m = n;
3080 	}
3081 	MPASS(data_len == 0);
3082 
3083 #ifdef INVARIANTS
3084 	CCAP(nc)->tx_data_offset = data_offset;
3085 #endif
3086 	if (!last_xfer)
3087 		return (NVMF_MORE);
3088 	else if (qp->send_success)
3089 		return (NVMF_SUCCESS_SENT);
3090 	else
3091 		return (NVME_SC_SUCCESS);
3092 }
3093 
3094 struct nvmf_transport_ops che_ops = {
3095 	.allocate_qpair = che_allocate_qpair,
3096 	.free_qpair = che_free_qpair,
3097 	.max_ioccsz = che_max_ioccsz,
3098 	.max_xfer_size = che_max_xfer_size,
3099 	.allocate_capsule = che_allocate_capsule,
3100 	.free_capsule = che_free_capsule,
3101 	.transmit_capsule = che_transmit_capsule,
3102 	.validate_command_capsule = che_validate_command_capsule,
3103 	.capsule_data_len = che_capsule_data_len,
3104 	.receive_controller_data = che_receive_controller_data,
3105 	.send_controller_data = che_send_controller_data,
3106 	.trtype = NVMF_TRTYPE_TCP,
3107 	.priority = 10,
3108 };
3109 
3110 NVMF_TRANSPORT(che, che_ops);
3111 
3112 static void
3113 read_pdu_limits(struct adapter *sc, u_int *max_tx_pdu_len,
3114     uint32_t *max_rx_pdu_len)
3115 {
3116 	uint32_t tx_len, rx_len, r, v;
3117 
3118 	/* Copied from cxgbei, but not sure if this is correct. */
3119 	rx_len = t4_read_reg(sc, A_TP_PMM_RX_PAGE_SIZE);
3120 	tx_len = t4_read_reg(sc, A_TP_PMM_TX_PAGE_SIZE);
3121 
3122 	r = t4_read_reg(sc, A_TP_PARA_REG2);
3123 	rx_len = min(rx_len, G_MAXRXDATA(r));
3124 	tx_len = min(tx_len, G_MAXRXDATA(r));
3125 
3126 	r = t4_read_reg(sc, A_TP_PARA_REG7);
3127 	v = min(G_PMMAXXFERLEN0(r), G_PMMAXXFERLEN1(r));
3128 	rx_len = min(rx_len, v);
3129 	tx_len = min(tx_len, v);
3130 
3131 	/* Cannot be larger than 32KB - 256. */
3132 	rx_len = min(rx_len, 32512);
3133 	tx_len = min(tx_len, 32512);
3134 
3135 	*max_tx_pdu_len = tx_len;
3136 	*max_rx_pdu_len = rx_len;
3137 }
3138 
3139 static int
3140 nvmf_che_init(struct adapter *sc, struct nvmf_che_adapter *nca)
3141 {
3142 	struct sysctl_oid *oid;
3143 	struct sysctl_oid_list *children;
3144 	uint32_t val;
3145 
3146 	read_pdu_limits(sc, &nca->max_transmit_pdu, &nca->max_receive_pdu);
3147 	if (nca->max_transmit_pdu > che_max_transmit_pdu)
3148 		nca->max_transmit_pdu = che_max_transmit_pdu;
3149 	if (nca->max_receive_pdu > che_max_receive_pdu)
3150 		nca->max_receive_pdu = che_max_receive_pdu;
3151 	val = t4_read_reg(sc, A_SGE_CONTROL2);
3152 	nca->nvmt_data_iqe = (val & F_RXCPLMODE_NVMT) != 0;
3153 
3154 	sysctl_ctx_init(&nca->ctx);
3155 	oid = device_get_sysctl_tree(sc->dev);	/* dev.che.X */
3156 	children = SYSCTL_CHILDREN(oid);
3157 
3158 	oid = SYSCTL_ADD_NODE(&nca->ctx, children, OID_AUTO, "nvme",
3159 	    CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "NVMe ULP settings");
3160 	children = SYSCTL_CHILDREN(oid);
3161 
3162 	nca->ddp_threshold = 8192;
3163 	SYSCTL_ADD_UINT(&nca->ctx, children, OID_AUTO, "ddp_threshold",
3164 	    CTLFLAG_RW, &nca->ddp_threshold, 0, "Rx zero copy threshold");
3165 
3166 	SYSCTL_ADD_UINT(&nca->ctx, children, OID_AUTO, "max_transmit_pdu",
3167 	    CTLFLAG_RW, &nca->max_transmit_pdu, 0,
3168 	    "Maximum size of a transmitted PDU");
3169 
3170 	SYSCTL_ADD_UINT(&nca->ctx, children, OID_AUTO, "max_receive_pdu",
3171 	    CTLFLAG_RW, &nca->max_receive_pdu, 0,
3172 	    "Maximum size of a received PDU");
3173 
3174 	return (0);
3175 }
3176 
3177 static void
3178 nvmf_che_destroy(struct nvmf_che_adapter *nca)
3179 {
3180 	sysctl_ctx_free(&nca->ctx);
3181 	free(nca, M_CXGBE);
3182 }
3183 
3184 static int
3185 nvmf_che_activate(struct adapter *sc)
3186 {
3187 	struct nvmf_che_adapter *nca;
3188 	int rc;
3189 
3190 	ASSERT_SYNCHRONIZED_OP(sc);
3191 
3192 	if (uld_active(sc, ULD_NVME)) {
3193 		KASSERT(0, ("%s: NVMe offload already enabled on adapter %p",
3194 		    __func__, sc));
3195 		return (0);
3196 	}
3197 
3198 	if ((sc->nvmecaps & FW_CAPS_CONFIG_NVME_TCP) == 0) {
3199 		device_printf(sc->dev,
3200 		    "not NVMe offload capable, or capability disabled\n");
3201 		return (ENOSYS);
3202 	}
3203 
3204 	/* per-adapter softc for NVMe */
3205 	nca = malloc(sizeof(*nca), M_CXGBE, M_ZERO | M_WAITOK);
3206 	nca->sc = sc;
3207 
3208 	rc = nvmf_che_init(sc, nca);
3209 	if (rc != 0) {
3210 		free(nca, M_CXGBE);
3211 		return (rc);
3212 	}
3213 
3214 	sc->nvme_ulp_softc = nca;
3215 
3216 	return (0);
3217 }
3218 
3219 static int
3220 nvmf_che_deactivate(struct adapter *sc)
3221 {
3222 	struct nvmf_che_adapter *nca = sc->nvme_ulp_softc;
3223 
3224 	ASSERT_SYNCHRONIZED_OP(sc);
3225 
3226 	if (nca != NULL) {
3227 		nvmf_che_destroy(nca);
3228 		sc->nvme_ulp_softc = NULL;
3229 	}
3230 
3231 	return (0);
3232 }
3233 
3234 static void
3235 nvmf_che_activate_all(struct adapter *sc, void *arg __unused)
3236 {
3237 	if (begin_synchronized_op(sc, NULL, SLEEP_OK | INTR_OK, "t7nvact") != 0)
3238 		return;
3239 
3240 	/* Activate NVMe if any port on this adapter has IFCAP_TOE enabled. */
3241 	if (sc->offload_map && !uld_active(sc, ULD_NVME))
3242 		(void) t4_activate_uld(sc, ULD_NVME);
3243 
3244 	end_synchronized_op(sc, 0);
3245 }
3246 
3247 static void
3248 nvmf_che_deactivate_all(struct adapter *sc, void *arg __unused)
3249 {
3250 	if (begin_synchronized_op(sc, NULL, SLEEP_OK | INTR_OK, "t7nvdea") != 0)
3251 		return;
3252 
3253 	if (uld_active(sc, ULD_NVME))
3254 		(void) t4_deactivate_uld(sc, ULD_NVME);
3255 
3256 	end_synchronized_op(sc, 0);
3257 }
3258 
3259 static struct uld_info nvmf_che_uld_info = {
3260 	.uld_activate = nvmf_che_activate,
3261 	.uld_deactivate = nvmf_che_deactivate,
3262 };
3263 
3264 static int
3265 nvmf_che_mod_load(void)
3266 {
3267 	int rc;
3268 
3269 	t4_register_cpl_handler(CPL_NVMT_CMP, do_nvmt_cmp);
3270 	t4_register_cpl_handler(CPL_NVMT_DATA, do_nvmt_data);
3271 
3272 	rc = t4_register_uld(&nvmf_che_uld_info, ULD_NVME);
3273 	if (rc != 0)
3274 		return (rc);
3275 
3276 	t4_iterate(nvmf_che_activate_all, NULL);
3277 
3278 	return (rc);
3279 }
3280 
3281 static int
3282 nvmf_che_mod_unload(void)
3283 {
3284 	t4_iterate(nvmf_che_deactivate_all, NULL);
3285 
3286 	if (t4_unregister_uld(&nvmf_che_uld_info, ULD_NVME) == EBUSY)
3287 		return (EBUSY);
3288 
3289 	t4_register_cpl_handler(CPL_NVMT_CMP, NULL);
3290 	t4_register_cpl_handler(CPL_NVMT_DATA, NULL);
3291 
3292 	return (0);
3293 }
3294 #endif
3295 
3296 static int
3297 nvmf_che_modevent(module_t mod, int cmd, void *arg)
3298 {
3299 	int rc;
3300 
3301 #ifdef TCP_OFFLOAD
3302 	switch (cmd) {
3303 	case MOD_LOAD:
3304 		rc = nvmf_che_mod_load();
3305 		break;
3306 	case MOD_UNLOAD:
3307 		rc = nvmf_che_mod_unload();
3308 		break;
3309 	default:
3310 		rc = EOPNOTSUPP;
3311 		break;
3312 	}
3313 #else
3314 	printf("nvmf_che: compiled without TCP_OFFLOAD support.\n");
3315 	rc = EOPNOTSUPP;
3316 #endif
3317 
3318 	return (rc);
3319 }
3320 
3321 static moduledata_t nvmf_che_mod = {
3322 	"nvmf_che",
3323 	nvmf_che_modevent,
3324 	NULL,
3325 };
3326 
3327 MODULE_VERSION(nvmf_che, 1);
3328 DECLARE_MODULE(nvmf_che, nvmf_che_mod, SI_SUB_EXEC, SI_ORDER_ANY);
3329 MODULE_DEPEND(nvmf_che, t4_tom, 1, 1, 1);
3330 MODULE_DEPEND(nvmf_che, cxgbe, 1, 1, 1);
3331