xref: /freebsd/sys/dev/cxgbe/tom/t4_ddp.c (revision 40dbb06fa73cac37d57563c07e55efd0cabbd488)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause
3  *
4  * Copyright (c) 2012 Chelsio Communications, Inc.
5  * All rights reserved.
6  * Written by: Navdeep Parhar <np@FreeBSD.org>
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
18  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
21  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27  * SUCH DAMAGE.
28  */
29 
30 #include <sys/cdefs.h>
31 #include "opt_inet.h"
32 
33 #include <sys/param.h>
34 #include <sys/aio.h>
35 #include <sys/bio.h>
36 #include <sys/file.h>
37 #include <sys/systm.h>
38 #include <sys/kernel.h>
39 #include <sys/ktr.h>
40 #include <sys/module.h>
41 #include <sys/protosw.h>
42 #include <sys/proc.h>
43 #include <sys/domain.h>
44 #include <sys/socket.h>
45 #include <sys/socketvar.h>
46 #include <sys/taskqueue.h>
47 #include <sys/uio.h>
48 #include <netinet/in.h>
49 #include <netinet/in_pcb.h>
50 #include <netinet/ip.h>
51 #include <netinet/tcp_var.h>
52 #define TCPSTATES
53 #include <netinet/tcp_fsm.h>
54 #include <netinet/toecore.h>
55 
56 #include <vm/vm.h>
57 #include <vm/vm_extern.h>
58 #include <vm/vm_param.h>
59 #include <vm/pmap.h>
60 #include <vm/vm_map.h>
61 #include <vm/vm_page.h>
62 #include <vm/vm_object.h>
63 
64 #include <cam/scsi/scsi_all.h>
65 #include <cam/ctl/ctl_io.h>
66 
67 #ifdef TCP_OFFLOAD
68 #include "common/common.h"
69 #include "common/t4_msg.h"
70 #include "common/t4_regs.h"
71 #include "common/t4_tcb.h"
72 #include "tom/t4_tom.h"
73 
74 /*
75  * Use the 'backend3' field in AIO jobs to store the amount of data
76  * received by the AIO job so far.
77  */
78 #define	aio_received	backend3
79 
80 static void aio_ddp_requeue_task(void *context, int pending);
81 static void ddp_complete_all(struct toepcb *toep, int error);
82 static void t4_aio_cancel_active(struct kaiocb *job);
83 static void t4_aio_cancel_queued(struct kaiocb *job);
84 static int t4_alloc_page_pods_for_rcvbuf(struct ppod_region *pr,
85     struct ddp_rcv_buffer *drb);
86 static int t4_write_page_pods_for_rcvbuf(struct adapter *sc,
87     struct sge_wrq *wrq, int tid, struct ddp_rcv_buffer *drb);
88 
89 static TAILQ_HEAD(, pageset) ddp_orphan_pagesets;
90 static struct mtx ddp_orphan_pagesets_lock;
91 static struct task ddp_orphan_task;
92 
93 #define MAX_DDP_BUFFER_SIZE		(M_TCB_RX_DDP_BUF0_LEN)
94 
95 /*
96  * A page set holds information about a user buffer used for AIO DDP.
97  * The page set holds resources such as the VM pages backing the
98  * buffer (either held or wired) and the page pods associated with the
99  * buffer.  Recently used page sets are cached to allow for efficient
100  * reuse of buffers (avoiding the need to re-fault in pages, hold
101  * them, etc.).  Note that cached page sets keep the backing pages
102  * wired.  The number of wired pages is capped by only allowing for
103  * two wired pagesets per connection.  This is not a perfect cap, but
104  * is a trade-off for performance.
105  *
106  * If an application ping-pongs two buffers for a connection via
107  * aio_read(2) then those buffers should remain wired and expensive VM
108  * fault lookups should be avoided after each buffer has been used
109  * once.  If an application uses more than two buffers then this will
110  * fall back to doing expensive VM fault lookups for each operation.
111  */
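/*
 * Release a pageset: free its page pods, unwire its pages, and hand it
 * to the orphan task for final freeing (which may drop a vmspace).
 */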
112 static void
113 free_pageset(struct tom_data *td, struct pageset *ps)
114 {
115 	vm_page_t p;
116 	int i;
117 
118 	if (ps->prsv.prsv_nppods > 0)
119 		t4_free_page_pods(&ps->prsv);
120 
121 	for (i = 0; i < ps->npages; i++) {
122 		p = ps->pages[i];
123 		vm_page_unwire(p, PQ_INACTIVE);
124 	}
125 	mtx_lock(&ddp_orphan_pagesets_lock);
126 	TAILQ_INSERT_TAIL(&ddp_orphan_pagesets, ps, link);
127 	taskqueue_enqueue(taskqueue_thread, &ddp_orphan_task);
128 	mtx_unlock(&ddp_orphan_pagesets_lock);
129 }
130 
131 static void
132 ddp_free_orphan_pagesets(void *context, int pending)
133 {
134 	struct pageset *ps;
135 
136 	mtx_lock(&ddp_orphan_pagesets_lock);
137 	while (!TAILQ_EMPTY(&ddp_orphan_pagesets)) {
138 		ps = TAILQ_FIRST(&ddp_orphan_pagesets);
139 		TAILQ_REMOVE(&ddp_orphan_pagesets, ps, link);
140 		mtx_unlock(&ddp_orphan_pagesets_lock);
141 		if (ps->vm)
142 			vmspace_free(ps->vm);
143 		free(ps, M_CXGBE);
144 		mtx_lock(&ddp_orphan_pagesets_lock);
145 	}
146 	mtx_unlock(&ddp_orphan_pagesets_lock);
147 }
148 
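/* Cache an idle pageset for reuse, or free it if DDP is being torn down. */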
149 static void
150 recycle_pageset(struct toepcb *toep, struct pageset *ps)
151 {
152 
153 	DDP_ASSERT_LOCKED(toep);
154 	if (!(toep->ddp.flags & DDP_DEAD)) {
155 		KASSERT(toep->ddp.cached_count + toep->ddp.active_count <
156 		    nitems(toep->ddp.db), ("too many wired pagesets"));
157 		TAILQ_INSERT_HEAD(&toep->ddp.cached_pagesets, ps, link);
158 		toep->ddp.cached_count++;
159 	} else
160 		free_pageset(toep->td, ps);
161 }
162 
163 static void
164 ddp_complete_one(struct kaiocb *job, int error)
165 {
166 	long copied;
167 
168 	/*
169 	 * If this job had copied data out of the socket buffer before
170 	 * it was cancelled, report it as a short read rather than an
171 	 * error.
172 	 */
173 	copied = job->aio_received;
174 	if (copied != 0 || error == 0)
175 		aio_complete(job, copied, 0);
176 	else
177 		aio_complete(job, -1, error);
178 }
179 
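/* Free a DDP receive buffer, its page pods, and the toepcb reference it holds. */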
180 static void
181 free_ddp_rcv_buffer(struct toepcb *toep, struct ddp_rcv_buffer *drb)
182 {
183 	t4_free_page_pods(&drb->prsv);
184 	free(drb->buf, M_CXGBE);
185 	free(drb, M_CXGBE);
186 	counter_u64_add(toep->ofld_rxq->ddp_buffer_free, 1);
187 	free_toepcb(toep);
188 }
189 
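/*
 * Return a receive buffer to the per-connection cache, or free it if the
 * cache is full or DDP is being torn down.
 */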
190 static void
191 recycle_ddp_rcv_buffer(struct toepcb *toep, struct ddp_rcv_buffer *drb)
192 {
193 	DDP_CACHE_LOCK(toep);
194 	if (!(toep->ddp.flags & DDP_DEAD) &&
195 	    toep->ddp.cached_count < t4_ddp_rcvbuf_cache) {
196 		TAILQ_INSERT_HEAD(&toep->ddp.cached_buffers, drb, link);
197 		toep->ddp.cached_count++;
198 		DDP_CACHE_UNLOCK(toep);
199 	} else {
200 		DDP_CACHE_UNLOCK(toep);
201 		free_ddp_rcv_buffer(toep, drb);
202 	}
203 }
204 
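/* Grab a receive buffer from the per-connection cache, if one is available. */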
205 static struct ddp_rcv_buffer *
206 alloc_cached_ddp_rcv_buffer(struct toepcb *toep)
207 {
208 	struct ddp_rcv_buffer *drb;
209 
210 	DDP_CACHE_LOCK(toep);
211 	if (!TAILQ_EMPTY(&toep->ddp.cached_buffers)) {
212 		drb = TAILQ_FIRST(&toep->ddp.cached_buffers);
213 		TAILQ_REMOVE(&toep->ddp.cached_buffers, drb, link);
214 		toep->ddp.cached_count--;
215 		counter_u64_add(toep->ofld_rxq->ddp_buffer_reuse, 1);
216 	} else
217 		drb = NULL;
218 	DDP_CACHE_UNLOCK(toep);
219 	return (drb);
220 }
221 
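/*
 * Allocate a new DDP receive buffer backed by contiguous memory and write
 * its page pods to the card.
 */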
222 static struct ddp_rcv_buffer *
223 alloc_ddp_rcv_buffer(struct toepcb *toep, int how)
224 {
225 	struct tom_data *td = toep->td;
226 	struct adapter *sc = td_adapter(td);
227 	struct ddp_rcv_buffer *drb;
228 	int error;
229 
230 	drb = malloc(sizeof(*drb), M_CXGBE, how | M_ZERO);
231 	if (drb == NULL)
232 		return (NULL);
233 
234 	drb->buf = contigmalloc(t4_ddp_rcvbuf_len, M_CXGBE, how, 0, ~0,
235 	    t4_ddp_rcvbuf_len, 0);
236 	if (drb->buf == NULL) {
237 		free(drb, M_CXGBE);
238 		return (NULL);
239 	}
240 	drb->len = t4_ddp_rcvbuf_len;
241 	drb->refs = 1;
242 
243 	error = t4_alloc_page_pods_for_rcvbuf(&td->pr, drb);
244 	if (error != 0) {
245 		free(drb->buf, M_CXGBE);
246 		free(drb, M_CXGBE);
247 		return (NULL);
248 	}
249 
250 	error = t4_write_page_pods_for_rcvbuf(sc, toep->ctrlq, toep->tid, drb);
251 	if (error != 0) {
252 		t4_free_page_pods(&drb->prsv);
253 		free(drb->buf, M_CXGBE);
254 		free(drb, M_CXGBE);
255 		return (NULL);
256 	}
257 
258 	hold_toepcb(toep);
259 	counter_u64_add(toep->ofld_rxq->ddp_buffer_alloc, 1);
260 	return (drb);
261 }
262 
263 static void
264 free_ddp_buffer(struct toepcb *toep, struct ddp_buffer *db)
265 {
266 	if ((toep->ddp.flags & DDP_RCVBUF) != 0) {
267 		if (db->drb != NULL)
268 			free_ddp_rcv_buffer(toep, db->drb);
269 #ifdef INVARIANTS
270 		db->drb = NULL;
271 #endif
272 		return;
273 	}
274 
275 	if (db->job) {
276 		/*
277 		 * XXX: If we are un-offloading the socket then we
278 		 * should requeue these on the socket somehow.  If we
279 		 * got a FIN from the remote end, then this completes
280 		 * any remaining requests with an EOF read.
281 		 */
282 		if (!aio_clear_cancel_function(db->job))
283 			ddp_complete_one(db->job, 0);
284 #ifdef INVARIANTS
285 		db->job = NULL;
286 #endif
287 	}
288 
289 	if (db->ps) {
290 		free_pageset(toep->td, db->ps);
291 #ifdef INVARIANTS
292 		db->ps = NULL;
293 #endif
294 	}
295 }
296 
297 static void
298 ddp_init_toep(struct toepcb *toep)
299 {
300 
301 	toep->ddp.flags = DDP_OK;
302 	toep->ddp.active_id = -1;
303 	mtx_init(&toep->ddp.lock, "t4 ddp", NULL, MTX_DEF);
304 	mtx_init(&toep->ddp.cache_lock, "t4 ddp cache", NULL, MTX_DEF);
305 }
306 
307 void
308 ddp_uninit_toep(struct toepcb *toep)
309 {
310 
311 	mtx_destroy(&toep->ddp.lock);
312 	mtx_destroy(&toep->ddp.cache_lock);
313 }
314 
315 void
316 release_ddp_resources(struct toepcb *toep)
317 {
318 	struct ddp_rcv_buffer *drb;
319 	struct pageset *ps;
320 	int i;
321 
322 	DDP_LOCK(toep);
323 	DDP_CACHE_LOCK(toep);
324 	toep->ddp.flags |= DDP_DEAD;
325 	DDP_CACHE_UNLOCK(toep);
326 	for (i = 0; i < nitems(toep->ddp.db); i++) {
327 		free_ddp_buffer(toep, &toep->ddp.db[i]);
328 	}
329 	if ((toep->ddp.flags & DDP_AIO) != 0) {
330 		while ((ps = TAILQ_FIRST(&toep->ddp.cached_pagesets)) != NULL) {
331 			TAILQ_REMOVE(&toep->ddp.cached_pagesets, ps, link);
332 			free_pageset(toep->td, ps);
333 		}
334 		ddp_complete_all(toep, 0);
335 	}
336 	if ((toep->ddp.flags & DDP_RCVBUF) != 0) {
337 		DDP_CACHE_LOCK(toep);
338 		while ((drb = TAILQ_FIRST(&toep->ddp.cached_buffers)) != NULL) {
339 			TAILQ_REMOVE(&toep->ddp.cached_buffers, drb, link);
340 			free_ddp_rcv_buffer(toep, drb);
341 		}
342 		DDP_CACHE_UNLOCK(toep);
343 	}
344 	DDP_UNLOCK(toep);
345 }
346 
347 #ifdef INVARIANTS
348 void
349 ddp_assert_empty(struct toepcb *toep)
350 {
351 	int i;
352 
353 	MPASS((toep->ddp.flags & (DDP_TASK_ACTIVE | DDP_DEAD)) != DDP_TASK_ACTIVE);
354 	for (i = 0; i < nitems(toep->ddp.db); i++) {
355 		if ((toep->ddp.flags & DDP_AIO) != 0) {
356 			MPASS(toep->ddp.db[i].job == NULL);
357 			MPASS(toep->ddp.db[i].ps == NULL);
358 		} else
359 			MPASS(toep->ddp.db[i].drb == NULL);
360 	}
361 	if ((toep->ddp.flags & DDP_AIO) != 0) {
362 		MPASS(TAILQ_EMPTY(&toep->ddp.cached_pagesets));
363 		MPASS(TAILQ_EMPTY(&toep->ddp.aiojobq));
364 	}
365 	if ((toep->ddp.flags & DDP_RCVBUF) != 0)
366 		MPASS(TAILQ_EMPTY(&toep->ddp.cached_buffers));
367 }
368 #endif
369 
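/*
 * Retire a hardware DDP buffer slot: update the active-buffer accounting,
 * recycle its pageset or receive buffer, and clear the slot's active flag.
 */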
370 static void
371 complete_ddp_buffer(struct toepcb *toep, struct ddp_buffer *db,
372     unsigned int db_idx)
373 {
374 	struct ddp_rcv_buffer *drb;
375 	unsigned int db_flag;
376 
377 	toep->ddp.active_count--;
378 	if (toep->ddp.active_id == db_idx) {
379 		if (toep->ddp.active_count == 0) {
380 			if ((toep->ddp.flags & DDP_AIO) != 0)
381 				KASSERT(toep->ddp.db[db_idx ^ 1].job == NULL,
382 				    ("%s: active_count mismatch", __func__));
383 			else
384 				KASSERT(toep->ddp.db[db_idx ^ 1].drb == NULL,
385 				    ("%s: active_count mismatch", __func__));
386 			toep->ddp.active_id = -1;
387 		} else
388 			toep->ddp.active_id ^= 1;
389 #ifdef VERBOSE_TRACES
390 		CTR3(KTR_CXGBE, "%s: tid %u, ddp_active_id = %d", __func__,
391 		    toep->tid, toep->ddp.active_id);
392 #endif
393 	} else {
394 		KASSERT(toep->ddp.active_count != 0 &&
395 		    toep->ddp.active_id != -1,
396 		    ("%s: active count mismatch", __func__));
397 	}
398 
399 	if ((toep->ddp.flags & DDP_AIO) != 0) {
400 		db->cancel_pending = 0;
401 		db->job = NULL;
402 		recycle_pageset(toep, db->ps);
403 		db->ps = NULL;
404 	} else {
405 		drb = db->drb;
406 		if (atomic_fetchadd_int(&drb->refs, -1) == 1)
407 			recycle_ddp_rcv_buffer(toep, drb);
408 		db->drb = NULL;
409 		db->placed = 0;
410 	}
411 
412 	db_flag = db_idx == 1 ? DDP_BUF1_ACTIVE : DDP_BUF0_ACTIVE;
413 	KASSERT(toep->ddp.flags & db_flag,
414 	    ("%s: DDP buffer not active. toep %p, ddp_flags 0x%x",
415 	    __func__, toep, toep->ddp.flags));
416 	toep->ddp.flags &= ~db_flag;
417 }
418 
419 /* Called when m_free drops the last reference. */
420 static void
421 ddp_rcv_mbuf_done(struct mbuf *m)
422 {
423 	struct toepcb *toep = m->m_ext.ext_arg1;
424 	struct ddp_rcv_buffer *drb = m->m_ext.ext_arg2;
425 
426 	recycle_ddp_rcv_buffer(toep, drb);
427 }
428 
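/*
 * Append len bytes that the chip placed in the slot's receive buffer to
 * the socket buffer as a zero-copy external mbuf.
 */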
429 static void
430 queue_ddp_rcvbuf_mbuf(struct toepcb *toep, u_int db_idx, u_int len)
431 {
432 	struct inpcb *inp = toep->inp;
433 	struct sockbuf *sb;
434 	struct ddp_buffer *db;
435 	struct ddp_rcv_buffer *drb;
436 	struct mbuf *m;
437 
438 	m = m_gethdr(M_NOWAIT, MT_DATA);
439 	if (m == NULL) {
440 		printf("%s: failed to allocate mbuf\n", __func__);
441 		return;
442 	}
443 	m->m_pkthdr.rcvif = toep->vi->ifp;
444 
445 	db = &toep->ddp.db[db_idx];
446 	drb = db->drb;
447 	m_extaddref(m, (char *)drb->buf + db->placed, len, &drb->refs,
448 	    ddp_rcv_mbuf_done, toep, drb);
449 	m->m_pkthdr.len = len;
450 	m->m_len = len;
451 
452 	sb = &inp->inp_socket->so_rcv;
453 	SOCKBUF_LOCK_ASSERT(sb);
454 	sbappendstream_locked(sb, m, 0);
455 
456 	db->placed += len;
457 	toep->ofld_rxq->rx_toe_ddp_octets += len;
458 }
459 
460 /* XXX: handle_ddp_data code duplication */
461 void
462 insert_ddp_data(struct toepcb *toep, uint32_t n)
463 {
464 	struct inpcb *inp = toep->inp;
465 	struct tcpcb *tp = intotcpcb(inp);
466 	struct ddp_buffer *db;
467 	struct kaiocb *job;
468 	size_t placed;
469 	long copied;
470 	unsigned int db_idx;
471 #ifdef INVARIANTS
472 	unsigned int db_flag;
473 #endif
474 	bool ddp_rcvbuf;
475 
476 	INP_WLOCK_ASSERT(inp);
477 	DDP_ASSERT_LOCKED(toep);
478 
479 	ddp_rcvbuf = (toep->ddp.flags & DDP_RCVBUF) != 0;
480 	tp->rcv_nxt += n;
481 #ifndef USE_DDP_RX_FLOW_CONTROL
482 	KASSERT(tp->rcv_wnd >= n, ("%s: negative window size", __func__));
483 	tp->rcv_wnd -= n;
484 #endif
485 	CTR2(KTR_CXGBE, "%s: placed %u bytes before falling out of DDP",
486 	    __func__, n);
487 	while (toep->ddp.active_count > 0) {
488 		MPASS(toep->ddp.active_id != -1);
489 		db_idx = toep->ddp.active_id;
490 #ifdef INVARIANTS
491 		db_flag = db_idx == 1 ? DDP_BUF1_ACTIVE : DDP_BUF0_ACTIVE;
492 #endif
493 		MPASS((toep->ddp.flags & db_flag) != 0);
494 		db = &toep->ddp.db[db_idx];
495 		if (ddp_rcvbuf) {
496 			placed = n;
497 			if (placed > db->drb->len - db->placed)
498 				placed = db->drb->len - db->placed;
499 			if (placed != 0)
500 				queue_ddp_rcvbuf_mbuf(toep, db_idx, placed);
501 			complete_ddp_buffer(toep, db, db_idx);
502 			n -= placed;
503 			continue;
504 		}
505 		job = db->job;
506 		copied = job->aio_received;
507 		placed = n;
508 		if (placed > job->uaiocb.aio_nbytes - copied)
509 			placed = job->uaiocb.aio_nbytes - copied;
510 		if (placed > 0) {
511 			job->msgrcv = 1;
512 			toep->ofld_rxq->rx_aio_ddp_jobs++;
513 		}
514 		toep->ofld_rxq->rx_aio_ddp_octets += placed;
515 		if (!aio_clear_cancel_function(job)) {
516 			/*
517 			 * Update the copied length for when
518 			 * t4_aio_cancel_active() completes this
519 			 * request.
520 			 */
521 			job->aio_received += placed;
522 		} else if (copied + placed != 0) {
523 			CTR4(KTR_CXGBE,
524 			    "%s: completing %p (copied %ld, placed %lu)",
525 			    __func__, job, copied, placed);
526 			/* XXX: This always completes if there is some data. */
527 			aio_complete(job, copied + placed, 0);
528 		} else if (aio_set_cancel_function(job, t4_aio_cancel_queued)) {
529 			TAILQ_INSERT_HEAD(&toep->ddp.aiojobq, job, list);
530 			toep->ddp.waiting_count++;
531 		} else
532 			aio_cancel(job);
533 		n -= placed;
534 		complete_ddp_buffer(toep, db, db_idx);
535 	}
536 
537 	MPASS(n == 0);
538 }
539 
540 /* SET_TCB_FIELD sent as a ULP command looks like this */
541 #define LEN__SET_TCB_FIELD_ULP (sizeof(struct ulp_txpkt) + \
542     sizeof(struct ulptx_idata) + sizeof(struct cpl_set_tcb_field_core))
543 
544 /* RX_DATA_ACK sent as a ULP command looks like this */
545 #define LEN__RX_DATA_ACK_ULP (sizeof(struct ulp_txpkt) + \
546     sizeof(struct ulptx_idata) + sizeof(struct cpl_rx_data_ack_core))
547 
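/*
 * Build an RX_DATA_ACK (with RX_MODULATE) as a ULP_TX_PKT sub-command,
 * padding to a 16B boundary with a NOOP if needed.
 */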
548 static inline void *
549 mk_rx_data_ack_ulp(struct ulp_txpkt *ulpmc, struct toepcb *toep)
550 {
551 	struct ulptx_idata *ulpsc;
552 	struct cpl_rx_data_ack_core *req;
553 
554 	ulpmc->cmd_dest = htonl(V_ULPTX_CMD(ULP_TX_PKT) | V_ULP_TXPKT_DEST(0));
555 	ulpmc->len = htobe32(howmany(LEN__RX_DATA_ACK_ULP, 16));
556 
557 	ulpsc = (struct ulptx_idata *)(ulpmc + 1);
558 	ulpsc->cmd_more = htobe32(V_ULPTX_CMD(ULP_TX_SC_IMM));
559 	ulpsc->len = htobe32(sizeof(*req));
560 
561 	req = (struct cpl_rx_data_ack_core *)(ulpsc + 1);
562 	OPCODE_TID(req) = htobe32(MK_OPCODE_TID(CPL_RX_DATA_ACK, toep->tid));
563 	req->credit_dack = htobe32(F_RX_MODULATE_RX);
564 
565 	ulpsc = (struct ulptx_idata *)(req + 1);
566 	if (LEN__RX_DATA_ACK_ULP % 16) {
567 		ulpsc->cmd_more = htobe32(V_ULPTX_CMD(ULP_TX_SC_NOOP));
568 		ulpsc->len = htobe32(0);
569 		return (ulpsc + 1);
570 	}
571 	return (ulpsc);
572 }
573 
574 static struct wrqe *
575 mk_update_tcb_for_ddp(struct adapter *sc, struct toepcb *toep, int db_idx,
576     struct ppod_reservation *prsv, int offset, uint32_t len,
577     uint64_t ddp_flags, uint64_t ddp_flags_mask)
578 {
579 	struct wrqe *wr;
580 	struct work_request_hdr *wrh;
581 	struct ulp_txpkt *ulpmc;
582 	int wrlen;
583 
584 	KASSERT(db_idx == 0 || db_idx == 1,
585 	    ("%s: bad DDP buffer index %d", __func__, db_idx));
586 
587 	/*
588 	 * We'll send a compound work request that has 3 SET_TCB_FIELDs and an
589 	 * RX_DATA_ACK (with RX_MODULATE to speed up delivery).
590 	 *
591 	 * The work request header is 16B and always ends at a 16B boundary.
592 	 * The ULPTX master commands that follow must all end at 16B boundaries
593 	 * too so we round up the size to 16.
594 	 */
595 	wrlen = sizeof(*wrh) + 3 * roundup2(LEN__SET_TCB_FIELD_ULP, 16) +
596 	    roundup2(LEN__RX_DATA_ACK_ULP, 16);
597 
598 	wr = alloc_wrqe(wrlen, toep->ctrlq);
599 	if (wr == NULL)
600 		return (NULL);
601 	wrh = wrtod(wr);
602 	INIT_ULPTX_WRH(wrh, wrlen, 1, 0);	/* atomic */
603 	ulpmc = (struct ulp_txpkt *)(wrh + 1);
604 
605 	/* Write the buffer's tag */
606 	ulpmc = mk_set_tcb_field_ulp(sc, ulpmc, toep->tid,
607 	    W_TCB_RX_DDP_BUF0_TAG + db_idx,
608 	    V_TCB_RX_DDP_BUF0_TAG(M_TCB_RX_DDP_BUF0_TAG),
609 	    V_TCB_RX_DDP_BUF0_TAG(prsv->prsv_tag));
610 
611 	/* Update the current offset in the DDP buffer and its total length */
612 	if (db_idx == 0)
613 		ulpmc = mk_set_tcb_field_ulp(sc, ulpmc, toep->tid,
614 		    W_TCB_RX_DDP_BUF0_OFFSET,
615 		    V_TCB_RX_DDP_BUF0_OFFSET(M_TCB_RX_DDP_BUF0_OFFSET) |
616 		    V_TCB_RX_DDP_BUF0_LEN(M_TCB_RX_DDP_BUF0_LEN),
617 		    V_TCB_RX_DDP_BUF0_OFFSET(offset) |
618 		    V_TCB_RX_DDP_BUF0_LEN(len));
619 	else
620 		ulpmc = mk_set_tcb_field_ulp(sc, ulpmc, toep->tid,
621 		    W_TCB_RX_DDP_BUF1_OFFSET,
622 		    V_TCB_RX_DDP_BUF1_OFFSET(M_TCB_RX_DDP_BUF1_OFFSET) |
623 		    V_TCB_RX_DDP_BUF1_LEN((u64)M_TCB_RX_DDP_BUF1_LEN << 32),
624 		    V_TCB_RX_DDP_BUF1_OFFSET(offset) |
625 		    V_TCB_RX_DDP_BUF1_LEN((u64)len << 32));
626 
627 	/* Update DDP flags */
628 	ulpmc = mk_set_tcb_field_ulp(sc, ulpmc, toep->tid, W_TCB_RX_DDP_FLAGS,
629 	    ddp_flags_mask, ddp_flags);
630 
631 	/* Gratuitous RX_DATA_ACK with RX_MODULATE set to speed up delivery. */
632 	ulpmc = mk_rx_data_ack_ulp(ulpmc, toep);
633 
634 	return (wr);
635 }
636 
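/*
 * Handle a DDP completion report for an AIO job: account for the data
 * placed by the chip and complete the job unless a cancel is pending.
 */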
637 static int
638 handle_ddp_data_aio(struct toepcb *toep, __be32 ddp_report, __be32 rcv_nxt,
639     int len)
640 {
641 	uint32_t report = be32toh(ddp_report);
642 	unsigned int db_idx;
643 	struct inpcb *inp = toep->inp;
644 	struct tcpcb *tp = intotcpcb(inp);
645 	struct ddp_buffer *db;
646 	struct socket *so;
647 	struct sockbuf *sb;
648 	struct kaiocb *job;
649 	long copied;
650 
651 	db_idx = report & F_DDP_BUF_IDX ? 1 : 0;
652 
653 	if (__predict_false(!(report & F_DDP_INV)))
654 		CXGBE_UNIMPLEMENTED("DDP buffer still valid");
655 
656 	INP_WLOCK(inp);
657 	so = inp_inpcbtosocket(inp);
658 	sb = &so->so_rcv;
659 	DDP_LOCK(toep);
660 
661 	KASSERT(toep->ddp.active_id == db_idx,
662 	    ("completed DDP buffer (%d) != active_id (%d) for tid %d", db_idx,
663 	    toep->ddp.active_id, toep->tid));
664 	db = &toep->ddp.db[db_idx];
665 	job = db->job;
666 
667 	if (__predict_false(tp->t_flags & TF_DISCONNECTED)) {
668 		/*
669 		 * This can happen due to an administrative tcpdrop(8).
670 		 * Just fail the request with ECONNRESET.
671 		 */
672 		CTR5(KTR_CXGBE, "%s: tid %u, seq 0x%x, len %d, t_flags 0x%x",
673 		    __func__, toep->tid, be32toh(rcv_nxt), len, tp->t_flags);
674 		if (aio_clear_cancel_function(job))
675 			ddp_complete_one(job, ECONNRESET);
676 		goto completed;
677 	}
678 
679 	tp = intotcpcb(inp);
680 
681 	/*
682 	 * For RX_DDP_COMPLETE, len will be zero and rcv_nxt is the
683 	 * sequence number of the next byte to receive.  The length of
684 	 * the data received for this message must be computed by
685 	 * comparing the new and old values of rcv_nxt.
686 	 *
687 	 * For RX_DATA_DDP, len might be non-zero, but it is only the
688 	 * length of the most recent DMA.  It does not include the
689 	 * total length of the data received since the previous update
690 	 * for this DDP buffer.  rcv_nxt is the sequence number of the
691 	 * first received byte from the most recent DMA.
692 	 */
693 	len += be32toh(rcv_nxt) - tp->rcv_nxt;
694 	tp->rcv_nxt += len;
695 	tp->t_rcvtime = ticks;
696 #ifndef USE_DDP_RX_FLOW_CONTROL
697 	KASSERT(tp->rcv_wnd >= len, ("%s: negative window size", __func__));
698 	tp->rcv_wnd -= len;
699 #endif
700 #ifdef VERBOSE_TRACES
701 	CTR5(KTR_CXGBE, "%s: tid %u, DDP[%d] placed %d bytes (%#x)", __func__,
702 	    toep->tid, db_idx, len, report);
703 #endif
704 
705 	/* receive buffer autosize */
706 	MPASS(toep->vnet == so->so_vnet);
707 	CURVNET_SET(toep->vnet);
708 	SOCKBUF_LOCK(sb);
709 	if (sb->sb_flags & SB_AUTOSIZE &&
710 	    V_tcp_do_autorcvbuf &&
711 	    sb->sb_hiwat < V_tcp_autorcvbuf_max &&
712 	    len > (sbspace(sb) / 8 * 7)) {
713 		struct adapter *sc = td_adapter(toep->td);
714 		unsigned int hiwat = sb->sb_hiwat;
715 		unsigned int newsize = min(hiwat + sc->tt.autorcvbuf_inc,
716 		    V_tcp_autorcvbuf_max);
717 
718 		if (!sbreserve_locked(so, SO_RCV, newsize, NULL))
719 			sb->sb_flags &= ~SB_AUTOSIZE;
720 	}
721 	SOCKBUF_UNLOCK(sb);
722 	CURVNET_RESTORE();
723 
724 	job->msgrcv = 1;
725 	toep->ofld_rxq->rx_aio_ddp_jobs++;
726 	toep->ofld_rxq->rx_aio_ddp_octets += len;
727 	if (db->cancel_pending) {
728 		/*
729 		 * Update the job's length but defer completion to the
730 		 * TCB_RPL callback.
731 		 */
732 		job->aio_received += len;
733 		goto out;
734 	} else if (!aio_clear_cancel_function(job)) {
735 		/*
736 		 * Update the copied length for when
737 		 * t4_aio_cancel_active() completes this request.
738 		 */
739 		job->aio_received += len;
740 	} else {
741 		copied = job->aio_received;
742 #ifdef VERBOSE_TRACES
743 		CTR5(KTR_CXGBE,
744 		    "%s: tid %u, completing %p (copied %ld, placed %d)",
745 		    __func__, toep->tid, job, copied, len);
746 #endif
747 		aio_complete(job, copied + len, 0);
748 		t4_rcvd(&toep->td->tod, tp);
749 	}
750 
751 completed:
752 	complete_ddp_buffer(toep, db, db_idx);
753 	if (toep->ddp.waiting_count > 0)
754 		ddp_queue_toep(toep);
755 out:
756 	DDP_UNLOCK(toep);
757 	INP_WUNLOCK(inp);
758 
759 	return (0);
760 }
761 
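/*
 * Program a free hardware DDP buffer slot with the given receive buffer.
 * Returns false if the work request could not be allocated.
 */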
762 static bool
763 queue_ddp_rcvbuf(struct toepcb *toep, struct ddp_rcv_buffer *drb)
764 {
765 	struct adapter *sc = td_adapter(toep->td);
766 	struct ddp_buffer *db;
767 	struct wrqe *wr;
768 	uint64_t ddp_flags, ddp_flags_mask;
769 	int buf_flag, db_idx;
770 
771 	DDP_ASSERT_LOCKED(toep);
772 
773 	KASSERT((toep->ddp.flags & DDP_DEAD) == 0, ("%s: DDP_DEAD", __func__));
774 	KASSERT(toep->ddp.active_count < nitems(toep->ddp.db),
775 	    ("%s: no empty DDP buffer slot", __func__));
776 
777 	/* Determine which DDP buffer to use. */
778 	if (toep->ddp.db[0].drb == NULL) {
779 		db_idx = 0;
780 	} else {
781 		MPASS(toep->ddp.db[1].drb == NULL);
782 		db_idx = 1;
783 	}
784 
785 	/*
786 	 * Permit PSH to trigger a partial completion without
787 	 * invalidating the rest of the buffer, but disable the PUSH
788 	 * timer.
789 	 */
790 	ddp_flags = 0;
791 	ddp_flags_mask = 0;
792 	if (db_idx == 0) {
793 		ddp_flags |= V_TF_DDP_PSH_NO_INVALIDATE0(1) |
794 		    V_TF_DDP_PUSH_DISABLE_0(0) | V_TF_DDP_PSHF_ENABLE_0(1) |
795 		    V_TF_DDP_BUF0_VALID(1);
796 		ddp_flags_mask |= V_TF_DDP_PSH_NO_INVALIDATE0(1) |
797 		    V_TF_DDP_PUSH_DISABLE_0(1) | V_TF_DDP_PSHF_ENABLE_0(1) |
798 		    V_TF_DDP_BUF0_FLUSH(1) | V_TF_DDP_BUF0_VALID(1);
799 		buf_flag = DDP_BUF0_ACTIVE;
800 	} else {
801 		ddp_flags |= V_TF_DDP_PSH_NO_INVALIDATE1(1) |
802 		    V_TF_DDP_PUSH_DISABLE_1(0) | V_TF_DDP_PSHF_ENABLE_1(1) |
803 		    V_TF_DDP_BUF1_VALID(1);
804 		ddp_flags_mask |= V_TF_DDP_PSH_NO_INVALIDATE1(1) |
805 		    V_TF_DDP_PUSH_DISABLE_1(1) | V_TF_DDP_PSHF_ENABLE_1(1) |
806 		    V_TF_DDP_BUF1_FLUSH(1) | V_TF_DDP_BUF1_VALID(1);
807 		buf_flag = DDP_BUF1_ACTIVE;
808 	}
809 	MPASS((toep->ddp.flags & buf_flag) == 0);
810 	if ((toep->ddp.flags & (DDP_BUF0_ACTIVE | DDP_BUF1_ACTIVE)) == 0) {
811 		MPASS(db_idx == 0);
812 		MPASS(toep->ddp.active_id == -1);
813 		MPASS(toep->ddp.active_count == 0);
814 		ddp_flags_mask |= V_TF_DDP_ACTIVE_BUF(1);
815 	}
816 
817 	/*
818 	 * The TID for this connection should still be valid.  If
819 	 * DDP_DEAD is set, SBS_CANTRCVMORE should be set, so we
820 	 * shouldn't be this far anyway.
821 	 */
822 	wr = mk_update_tcb_for_ddp(sc, toep, db_idx, &drb->prsv, 0, drb->len,
823 	    ddp_flags, ddp_flags_mask);
824 	if (wr == NULL) {
825 		recycle_ddp_rcv_buffer(toep, drb);
826 		printf("%s: mk_update_tcb_for_ddp failed\n", __func__);
827 		return (false);
828 	}
829 
830 #ifdef VERBOSE_TRACES
831 	CTR(KTR_CXGBE,
832 	    "%s: tid %u, scheduling DDP[%d] (flags %#lx/%#lx)", __func__,
833 	    toep->tid, db_idx, ddp_flags, ddp_flags_mask);
834 #endif
835 	/*
836 	 * Hold a reference on scheduled buffers that is dropped in
837 	 * complete_ddp_buffer.
838 	 */
839 	drb->refs = 1;
840 
841 	/* Give the chip the go-ahead. */
842 	t4_wrq_tx(sc, wr);
843 	db = &toep->ddp.db[db_idx];
844 	db->drb = drb;
845 	toep->ddp.flags |= buf_flag;
846 	toep->ddp.active_count++;
847 	if (toep->ddp.active_count == 1) {
848 		MPASS(toep->ddp.active_id == -1);
849 		toep->ddp.active_id = db_idx;
850 		CTR2(KTR_CXGBE, "%s: ddp_active_id = %d", __func__,
851 		    toep->ddp.active_id);
852 	}
853 	return (true);
854 }
855 
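/*
 * Handle a DDP completion report in receive-buffer mode: append the placed
 * data to the socket buffer and try to queue a replacement buffer.
 */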
856 static int
857 handle_ddp_data_rcvbuf(struct toepcb *toep, __be32 ddp_report, __be32 rcv_nxt,
858     int len)
859 {
860 	uint32_t report = be32toh(ddp_report);
861 	struct inpcb *inp = toep->inp;
862 	struct tcpcb *tp = intotcpcb(inp);
863 	struct socket *so;
864 	struct sockbuf *sb;
865 	struct ddp_buffer *db;
866 	struct ddp_rcv_buffer *drb;
867 	unsigned int db_idx;
868 	bool invalidated;
869 
870 	db_idx = report & F_DDP_BUF_IDX ? 1 : 0;
871 
872 	invalidated = (report & F_DDP_INV) != 0;
873 
874 	INP_WLOCK(inp);
875 	so = inp_inpcbtosocket(inp);
876 	sb = &so->so_rcv;
877 	DDP_LOCK(toep);
878 
879 	KASSERT(toep->ddp.active_id == db_idx,
880 	    ("completed DDP buffer (%d) != active_id (%d) for tid %d", db_idx,
881 	    toep->ddp.active_id, toep->tid));
882 	db = &toep->ddp.db[db_idx];
883 
884 	if (__predict_false(tp->t_flags & TF_DISCONNECTED)) {
885 		/*
886 		 * This can happen due to an administrative tcpdrop(8).
887 		 * Just ignore the received data.
888 		 */
889 		CTR5(KTR_CXGBE, "%s: tid %u, seq 0x%x, len %d, t_flags 0x%x",
890 		    __func__, toep->tid, be32toh(rcv_nxt), len, tp->t_flags);
891 		if (invalidated)
892 			complete_ddp_buffer(toep, db, db_idx);
893 		goto out;
894 	}
895 
896 	/*
897 	 * For RX_DDP_COMPLETE, len will be zero and rcv_nxt is the
898 	 * sequence number of the next byte to receive.  The length of
899 	 * the data received for this message must be computed by
900 	 * comparing the new and old values of rcv_nxt.
901 	 *
902 	 * For RX_DATA_DDP, len might be non-zero, but it is only the
903 	 * length of the most recent DMA.  It does not include the
904 	 * total length of the data received since the previous update
905 	 * for this DDP buffer.  rcv_nxt is the sequence number of the
906 	 * first received byte from the most recent DMA.
907 	 */
908 	len += be32toh(rcv_nxt) - tp->rcv_nxt;
909 	tp->rcv_nxt += len;
910 	tp->t_rcvtime = ticks;
911 #ifndef USE_DDP_RX_FLOW_CONTROL
912 	KASSERT(tp->rcv_wnd >= len, ("%s: negative window size", __func__));
913 	tp->rcv_wnd -= len;
914 #endif
915 #ifdef VERBOSE_TRACES
916 	CTR5(KTR_CXGBE, "%s: tid %u, DDP[%d] placed %d bytes (%#x)", __func__,
917 	    toep->tid, db_idx, len, report);
918 #endif
919 
920 	/* receive buffer autosize */
921 	MPASS(toep->vnet == so->so_vnet);
922 	CURVNET_SET(toep->vnet);
923 	SOCKBUF_LOCK(sb);
924 	if (sb->sb_flags & SB_AUTOSIZE &&
925 	    V_tcp_do_autorcvbuf &&
926 	    sb->sb_hiwat < V_tcp_autorcvbuf_max &&
927 	    len > (sbspace(sb) / 8 * 7)) {
928 		struct adapter *sc = td_adapter(toep->td);
929 		unsigned int hiwat = sb->sb_hiwat;
930 		unsigned int newsize = min(hiwat + sc->tt.autorcvbuf_inc,
931 		    V_tcp_autorcvbuf_max);
932 
933 		if (!sbreserve_locked(so, SO_RCV, newsize, NULL))
934 			sb->sb_flags &= ~SB_AUTOSIZE;
935 	}
936 
937 	if (len > 0) {
938 		queue_ddp_rcvbuf_mbuf(toep, db_idx, len);
939 		t4_rcvd_locked(&toep->td->tod, tp);
940 	}
941 	sorwakeup_locked(so);
942 	SOCKBUF_UNLOCK_ASSERT(sb);
943 	CURVNET_RESTORE();
944 
945 	if (invalidated)
946 		complete_ddp_buffer(toep, db, db_idx);
947 	else
948 		KASSERT(db->placed < db->drb->len,
949 		    ("%s: full DDP buffer not invalidated", __func__));
950 
951 	if (toep->ddp.active_count != nitems(toep->ddp.db)) {
952 		drb = alloc_cached_ddp_rcv_buffer(toep);
953 		if (drb == NULL)
954 			drb = alloc_ddp_rcv_buffer(toep, M_NOWAIT);
955 		if (drb == NULL)
956 			ddp_queue_toep(toep);
957 		else {
958 			if (!queue_ddp_rcvbuf(toep, drb)) {
959 				ddp_queue_toep(toep);
960 			}
961 		}
962 	}
963 out:
964 	DDP_UNLOCK(toep);
965 	INP_WUNLOCK(inp);
966 
967 	return (0);
968 }
969 
970 static int
971 handle_ddp_data(struct toepcb *toep, __be32 ddp_report, __be32 rcv_nxt, int len)
972 {
973 	if ((toep->ddp.flags & DDP_RCVBUF) != 0)
974 		return (handle_ddp_data_rcvbuf(toep, ddp_report, rcv_nxt, len));
975 	else
976 		return (handle_ddp_data_aio(toep, ddp_report, rcv_nxt, len));
977 }
978 
979 void
980 handle_ddp_indicate(struct toepcb *toep)
981 {
982 
983 	DDP_ASSERT_LOCKED(toep);
984 	if ((toep->ddp.flags & DDP_RCVBUF) != 0) {
985 		/*
986 		 * Indicates are not meaningful for RCVBUF since
987 		 * buffers are activated when the socket option is
988 		 * set.
989 		 */
990 		return;
991 	}
992 
993 	MPASS(toep->ddp.active_count == 0);
994 	MPASS((toep->ddp.flags & (DDP_BUF0_ACTIVE | DDP_BUF1_ACTIVE)) == 0);
995 	if (toep->ddp.waiting_count == 0) {
996 		/*
997 		 * The pending requests that triggered the request for an
998 		 * indicate were cancelled.  Those cancels should have
999 		 * already disabled DDP.  Just ignore this as the data is
1000 		 * going into the socket buffer anyway.
1001 		 */
1002 		return;
1003 	}
1004 	CTR3(KTR_CXGBE, "%s: tid %d indicated (%d waiting)", __func__,
1005 	    toep->tid, toep->ddp.waiting_count);
1006 	ddp_queue_toep(toep);
1007 }
1008 
1009 CTASSERT(CPL_COOKIE_DDP0 + 1 == CPL_COOKIE_DDP1);
1010 
1011 static int
1012 do_ddp_tcb_rpl(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
1013 {
1014 	struct adapter *sc = iq->adapter;
1015 	const struct cpl_set_tcb_rpl *cpl = (const void *)(rss + 1);
1016 	unsigned int tid = GET_TID(cpl);
1017 	unsigned int db_idx;
1018 	struct toepcb *toep;
1019 	struct inpcb *inp;
1020 	struct ddp_buffer *db;
1021 	struct kaiocb *job;
1022 	long copied;
1023 
1024 	if (cpl->status != CPL_ERR_NONE)
1025 		panic("XXX: tcp_rpl failed: %d", cpl->status);
1026 
1027 	toep = lookup_tid(sc, tid);
1028 	inp = toep->inp;
1029 	switch (cpl->cookie) {
1030 	case V_WORD(W_TCB_RX_DDP_FLAGS) | V_COOKIE(CPL_COOKIE_DDP0):
1031 	case V_WORD(W_TCB_RX_DDP_FLAGS) | V_COOKIE(CPL_COOKIE_DDP1):
1032 		/*
1033 		 * XXX: This duplicates a lot of code with handle_ddp_data().
1034 		 */
1035 		KASSERT((toep->ddp.flags & DDP_AIO) != 0,
1036 		    ("%s: DDP_RCVBUF", __func__));
1037 		db_idx = G_COOKIE(cpl->cookie) - CPL_COOKIE_DDP0;
1038 		MPASS(db_idx < nitems(toep->ddp.db));
1039 		INP_WLOCK(inp);
1040 		DDP_LOCK(toep);
1041 		db = &toep->ddp.db[db_idx];
1042 
1043 		/*
1044 		 * handle_ddp_data() should leave the job around until
1045 		 * this callback runs once a cancel is pending.
1046 		 */
1047 		MPASS(db != NULL);
1048 		MPASS(db->job != NULL);
1049 		MPASS(db->cancel_pending);
1050 
1051 		/*
1052 		 * XXX: It's not clear what happens if there is data
1053 		 * placed when the buffer is invalidated.  I suspect we
1054 		 * need to read the TCB to see how much data was placed.
1055 		 *
1056 		 * For now this just pretends like nothing was placed.
1057 		 *
1058 		 * XXX: Note that if we did check the PCB we would need to
1059 		 * also take care of updating the tp, etc.
1060 		 */
1061 		job = db->job;
1062 		copied = job->aio_received;
1063 		if (copied == 0) {
1064 			CTR2(KTR_CXGBE, "%s: cancelling %p", __func__, job);
1065 			aio_cancel(job);
1066 		} else {
1067 			CTR3(KTR_CXGBE, "%s: completing %p (copied %ld)",
1068 			    __func__, job, copied);
1069 			aio_complete(job, copied, 0);
1070 			t4_rcvd(&toep->td->tod, intotcpcb(inp));
1071 		}
1072 
1073 		complete_ddp_buffer(toep, db, db_idx);
1074 		if (toep->ddp.waiting_count > 0)
1075 			ddp_queue_toep(toep);
1076 		DDP_UNLOCK(toep);
1077 		INP_WUNLOCK(inp);
1078 		break;
1079 	default:
1080 		panic("XXX: unknown tcb_rpl offset %#x, cookie %#x",
1081 		    G_WORD(cpl->cookie), G_COOKIE(cpl->cookie));
1082 	}
1083 
1084 	return (0);
1085 }
1086 
1087 void
1088 handle_ddp_close(struct toepcb *toep, struct tcpcb *tp, __be32 rcv_nxt)
1089 {
1090 	struct socket *so = toep->inp->inp_socket;
1091 	struct sockbuf *sb = &so->so_rcv;
1092 	struct ddp_buffer *db;
1093 	struct kaiocb *job;
1094 	long copied;
1095 	unsigned int db_idx;
1096 #ifdef INVARIANTS
1097 	unsigned int db_flag;
1098 #endif
1099 	int len, placed;
1100 	bool ddp_rcvbuf;
1101 
1102 	INP_WLOCK_ASSERT(toep->inp);
1103 	DDP_ASSERT_LOCKED(toep);
1104 
1105 	ddp_rcvbuf = (toep->ddp.flags & DDP_RCVBUF) != 0;
1106 
1107 	/* - 1 is to ignore the byte for FIN */
1108 	len = be32toh(rcv_nxt) - tp->rcv_nxt - 1;
1109 	tp->rcv_nxt += len;
1110 
1111 	CTR(KTR_CXGBE, "%s: tid %d placed %u bytes before FIN", __func__,
1112 	    toep->tid, len);
1113 	while (toep->ddp.active_count > 0) {
1114 		MPASS(toep->ddp.active_id != -1);
1115 		db_idx = toep->ddp.active_id;
1116 #ifdef INVARIANTS
1117 		db_flag = db_idx == 1 ? DDP_BUF1_ACTIVE : DDP_BUF0_ACTIVE;
1118 #endif
1119 		MPASS((toep->ddp.flags & db_flag) != 0);
1120 		db = &toep->ddp.db[db_idx];
1121 		if (ddp_rcvbuf) {
1122 			placed = len;
1123 			if (placed > db->drb->len - db->placed)
1124 				placed = db->drb->len - db->placed;
1125 			if (placed != 0) {
1126 				SOCKBUF_LOCK(sb);
1127 				queue_ddp_rcvbuf_mbuf(toep, db_idx, placed);
1128 				sorwakeup_locked(so);
1129 				SOCKBUF_UNLOCK_ASSERT(sb);
1130 			}
1131 			complete_ddp_buffer(toep, db, db_idx);
1132 			len -= placed;
1133 			continue;
1134 		}
1135 		job = db->job;
1136 		copied = job->aio_received;
1137 		placed = len;
1138 		if (placed > job->uaiocb.aio_nbytes - copied)
1139 			placed = job->uaiocb.aio_nbytes - copied;
1140 		if (placed > 0) {
1141 			job->msgrcv = 1;
1142 			toep->ofld_rxq->rx_aio_ddp_jobs++;
1143 		}
1144 		toep->ofld_rxq->rx_aio_ddp_octets += placed;
1145 		if (!aio_clear_cancel_function(job)) {
1146 			/*
1147 			 * Update the copied length for when
1148 			 * t4_aio_cancel_active() completes this
1149 			 * request.
1150 			 */
1151 			job->aio_received += placed;
1152 		} else {
1153 			CTR4(KTR_CXGBE, "%s: tid %d completed buf %d len %d",
1154 			    __func__, toep->tid, db_idx, placed);
1155 			aio_complete(job, copied + placed, 0);
1156 		}
1157 		len -= placed;
1158 		complete_ddp_buffer(toep, db, db_idx);
1159 	}
1160 
1161 	MPASS(len == 0);
1162 	if ((toep->ddp.flags & DDP_AIO) != 0)
1163 		ddp_complete_all(toep, 0);
1164 }
1165 
1166 #define DDP_ERR (F_DDP_PPOD_MISMATCH | F_DDP_LLIMIT_ERR | F_DDP_ULIMIT_ERR |\
1167 	 F_DDP_PPOD_PARITY_ERR | F_DDP_PADDING_ERR | F_DDP_OFFSET_ERR |\
1168 	 F_DDP_INVALID_TAG | F_DDP_COLOR_ERR | F_DDP_TID_MISMATCH |\
1169 	 F_DDP_INVALID_PPOD | F_DDP_HDRCRC_ERR | F_DDP_DATACRC_ERR)
1170 
1171 extern cpl_handler_t t4_cpl_handler[];
1172 
1173 static int
1174 do_rx_data_ddp(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
1175 {
1176 	struct adapter *sc = iq->adapter;
1177 	const struct cpl_rx_data_ddp *cpl = (const void *)(rss + 1);
1178 	unsigned int tid = GET_TID(cpl);
1179 	uint32_t vld;
1180 	struct toepcb *toep = lookup_tid(sc, tid);
1181 
1182 	KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__));
1183 	KASSERT(toep->tid == tid, ("%s: toep tid/atid mismatch", __func__));
1184 	KASSERT(!(toep->flags & TPF_SYNQE),
1185 	    ("%s: toep %p claims to be a synq entry", __func__, toep));
1186 
1187 	vld = be32toh(cpl->ddpvld);
1188 	if (__predict_false(vld & DDP_ERR)) {
1189 		panic("%s: DDP error 0x%x (tid %d, toep %p)",
1190 		    __func__, vld, tid, toep);
1191 	}
1192 
1193 	if (ulp_mode(toep) == ULP_MODE_ISCSI) {
1194 		t4_cpl_handler[CPL_RX_ISCSI_DDP](iq, rss, m);
1195 		return (0);
1196 	}
1197 
1198 	handle_ddp_data(toep, cpl->u.ddp_report, cpl->seq, be16toh(cpl->len));
1199 
1200 	return (0);
1201 }
1202 
1203 static int
1204 do_rx_ddp_complete(struct sge_iq *iq, const struct rss_header *rss,
1205     struct mbuf *m)
1206 {
1207 	struct adapter *sc = iq->adapter;
1208 	const struct cpl_rx_ddp_complete *cpl = (const void *)(rss + 1);
1209 	unsigned int tid = GET_TID(cpl);
1210 	struct toepcb *toep = lookup_tid(sc, tid);
1211 
1212 	KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__));
1213 	KASSERT(toep->tid == tid, ("%s: toep tid/atid mismatch", __func__));
1214 	KASSERT(!(toep->flags & TPF_SYNQE),
1215 	    ("%s: toep %p claims to be a synq entry", __func__, toep));
1216 
1217 	handle_ddp_data(toep, cpl->ddp_report, cpl->rcv_nxt, 0);
1218 
1219 	return (0);
1220 }
1221 
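/*
 * Switch the connection to ULP_MODE_TCPDDP with a single atomic compound
 * work request that also initializes the DDP TCB fields (DDP starts off).
 */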
1222 static bool
1223 set_ddp_ulp_mode(struct toepcb *toep)
1224 {
1225 	struct adapter *sc = toep->vi->adapter;
1226 	struct wrqe *wr;
1227 	struct work_request_hdr *wrh;
1228 	struct ulp_txpkt *ulpmc;
1229 	int fields, len;
1230 
1231 	if (!sc->tt.ddp)
1232 		return (false);
1233 
1234 	fields = 0;
1235 
1236 	/* Overlay region including W_TCB_RX_DDP_FLAGS */
1237 	fields += 3;
1238 
1239 	/* W_TCB_ULP_TYPE */
1240 	fields++;
1241 
1242 #ifdef USE_DDP_RX_FLOW_CONTROL
1243 	/* W_TCB_T_FLAGS */
1244 	fields++;
1245 #endif
1246 
1247 	len = sizeof(*wrh) + fields * roundup2(LEN__SET_TCB_FIELD_ULP, 16);
1248 	KASSERT(len <= SGE_MAX_WR_LEN,
1249 	    ("%s: WR with %d TCB field updates too large", __func__, fields));
1250 
1251 	wr = alloc_wrqe(len, toep->ctrlq);
1252 	if (wr == NULL)
1253 		return (false);
1254 
1255 	CTR(KTR_CXGBE, "%s: tid %u", __func__, toep->tid);
1256 
1257 	wrh = wrtod(wr);
1258 	INIT_ULPTX_WRH(wrh, len, 1, 0);	/* atomic */
1259 	ulpmc = (struct ulp_txpkt *)(wrh + 1);
1260 
1261 	/*
1262 	 * Words 26/27 are zero except for the DDP_OFF flag in
1263 	 * W_TCB_RX_DDP_FLAGS (27).
1264 	 */
1265 	ulpmc = mk_set_tcb_field_ulp(sc, ulpmc, toep->tid, 26,
1266 	    0xffffffffffffffff, (uint64_t)V_TF_DDP_OFF(1) << 32);
1267 
1268 	/* Words 28/29 are zero. */
1269 	ulpmc = mk_set_tcb_field_ulp(sc, ulpmc, toep->tid, 28,
1270 	    0xffffffffffffffff, 0);
1271 
1272 	/* Words 30/31 are zero. */
1273 	ulpmc = mk_set_tcb_field_ulp(sc, ulpmc, toep->tid, 30,
1274 	    0xffffffffffffffff, 0);
1275 
1276 	/* Set the ULP mode to ULP_MODE_TCPDDP. */
1277 	toep->params.ulp_mode = ULP_MODE_TCPDDP;
1278 	ulpmc = mk_set_tcb_field_ulp(sc, ulpmc, toep->tid, W_TCB_ULP_TYPE,
1279 	    V_TCB_ULP_TYPE(M_TCB_ULP_TYPE), V_TCB_ULP_TYPE(ULP_MODE_TCPDDP));
1280 
1281 #ifdef USE_DDP_RX_FLOW_CONTROL
1282 	/* Set TF_RX_FLOW_CONTROL_DDP. */
1283 	ulpmc = mk_set_tcb_field_ulp(sc, ulpmc, toep->tid, W_TCB_T_FLAGS,
1284 	    V_TF_RX_FLOW_CONTROL_DDP(1), V_TF_RX_FLOW_CONTROL_DDP(1));
1285 #endif
1286 
1287 	ddp_init_toep(toep);
1288 
1289 	t4_wrq_tx(sc, wr);
1290 	return (true);
1291 }
1292 
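/*
 * Ask the chip to enable DDP for this connection: clear TF_DDP_OFF (keeping
 * buffer indicates enabled for AIO) and turn off receive coalescing.
 */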
1293 static void
1294 enable_ddp(struct adapter *sc, struct toepcb *toep)
1295 {
1296 	uint64_t ddp_flags;
1297 
1298 	KASSERT((toep->ddp.flags & (DDP_ON | DDP_OK | DDP_SC_REQ)) == DDP_OK,
1299 	    ("%s: toep %p has bad ddp_flags 0x%x",
1300 	    __func__, toep, toep->ddp.flags));
1301 
1302 	CTR3(KTR_CXGBE, "%s: tid %u (time %u)",
1303 	    __func__, toep->tid, time_uptime);
1304 
1305 	ddp_flags = 0;
1306 	if ((toep->ddp.flags & DDP_AIO) != 0)
1307 		ddp_flags |= V_TF_DDP_BUF0_INDICATE(1) |
1308 		    V_TF_DDP_BUF1_INDICATE(1);
1309 	DDP_ASSERT_LOCKED(toep);
1310 	toep->ddp.flags |= DDP_SC_REQ;
1311 	t4_set_tcb_field(sc, toep->ctrlq, toep, W_TCB_RX_DDP_FLAGS,
1312 	    V_TF_DDP_OFF(1) | V_TF_DDP_INDICATE_OUT(1) |
1313 	    V_TF_DDP_BUF0_INDICATE(1) | V_TF_DDP_BUF1_INDICATE(1) |
1314 	    V_TF_DDP_BUF0_VALID(1) | V_TF_DDP_BUF1_VALID(1), ddp_flags, 0, 0);
1315 	t4_set_tcb_field(sc, toep->ctrlq, toep, W_TCB_T_FLAGS,
1316 	    V_TF_RCV_COALESCE_ENABLE(1), 0, 0, 0);
1317 }
1318 
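/* Highest common factor (GCD) of n1 and n2, via Euclid's algorithm. */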
1319 static int
1320 calculate_hcf(int n1, int n2)
1321 {
1322 	int a, b, t;
1323 
1324 	if (n1 <= n2) {
1325 		a = n1;
1326 		b = n2;
1327 	} else {
1328 		a = n2;
1329 		b = n1;
1330 	}
1331 
1332 	while (a != 0) {
1333 		t = a;
1334 		a = b % a;
1335 		b = t;
1336 	}
1337 
1338 	return (b);
1339 }
1340 
1341 static inline int
1342 pages_to_nppods(int npages, int ddp_page_shift)
1343 {
1344 
1345 	MPASS(ddp_page_shift >= PAGE_SHIFT);
1346 
1347 	return (howmany(npages >> (ddp_page_shift - PAGE_SHIFT), PPOD_PAGES));
1348 }
1349 
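/*
 * Reserve nppods page pods from the region's arena and record the resulting
 * tag, which also encodes the DDP page size index.
 */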
1350 static int
1351 alloc_page_pods(struct ppod_region *pr, u_int nppods, u_int pgsz_idx,
1352     struct ppod_reservation *prsv)
1353 {
1354 	vmem_addr_t addr;       /* relative to start of region */
1355 
1356 	if (vmem_alloc(pr->pr_arena, PPOD_SZ(nppods), M_NOWAIT | M_FIRSTFIT,
1357 	    &addr) != 0)
1358 		return (ENOMEM);
1359 
1360 #ifdef VERBOSE_TRACES
1361 	CTR5(KTR_CXGBE, "%-17s arena %p, addr 0x%08x, nppods %d, pgsz %d",
1362 	    __func__, pr->pr_arena, (uint32_t)addr & pr->pr_tag_mask,
1363 	    nppods, 1 << pr->pr_page_shift[pgsz_idx]);
1364 #endif
1365 
1366 	/*
1367 	 * The hardware tagmask includes an extra invalid bit but the arena was
1368 	 * seeded with valid values only.  An allocation out of this arena will
1369 	 * fit inside the tagmask but won't have the invalid bit set.
1370 	 */
1371 	MPASS((addr & pr->pr_tag_mask) == addr);
1372 	MPASS((addr & pr->pr_invalid_bit) == 0);
1373 
1374 	prsv->prsv_pr = pr;
1375 	prsv->prsv_tag = V_PPOD_PGSZ(pgsz_idx) | addr;
1376 	prsv->prsv_nppods = nppods;
1377 
1378 	return (0);
1379 }
1380 
1381 static int
1382 t4_alloc_page_pods_for_vmpages(struct ppod_region *pr, vm_page_t *pages,
1383     int npages, struct ppod_reservation *prsv)
1384 {
1385 	int i, hcf, seglen, idx, nppods;
1386 
1387 	/*
1388 	 * The DDP page size is unrelated to the VM page size.  We combine
1389 	 * contiguous physical pages into larger segments to get the best DDP
1390 	 * page size possible.  This is the largest of the four sizes in
1391 	 * A_ULP_RX_TDDP_PSZ that evenly divides the HCF of the segment sizes in
1392 	 * the page list.
1393 	 */
1394 	hcf = 0;
1395 	for (i = 0; i < npages; i++) {
1396 		seglen = PAGE_SIZE;
1397 		while (i < npages - 1 &&
1398 		    VM_PAGE_TO_PHYS(pages[i]) + PAGE_SIZE ==
1399 		    VM_PAGE_TO_PHYS(pages[i + 1])) {
1400 			seglen += PAGE_SIZE;
1401 			i++;
1402 		}
1403 
1404 		hcf = calculate_hcf(hcf, seglen);
1405 		if (hcf < (1 << pr->pr_page_shift[1])) {
1406 			idx = 0;
1407 			goto have_pgsz;	/* give up, short circuit */
1408 		}
1409 	}
1410 
1411 #define PR_PAGE_MASK(x) ((1 << pr->pr_page_shift[(x)]) - 1)
1412 	MPASS((hcf & PR_PAGE_MASK(0)) == 0); /* PAGE_SIZE is >= 4K everywhere */
1413 	for (idx = nitems(pr->pr_page_shift) - 1; idx > 0; idx--) {
1414 		if ((hcf & PR_PAGE_MASK(idx)) == 0)
1415 			break;
1416 	}
1417 #undef PR_PAGE_MASK
1418 
1419 have_pgsz:
1420 	MPASS(idx <= M_PPOD_PGSZ);
1421 
1422 	nppods = pages_to_nppods(npages, pr->pr_page_shift[idx]);
1423 	if (alloc_page_pods(pr, nppods, idx, prsv) != 0)
1424 		return (ENOMEM);
1425 	MPASS(prsv->prsv_nppods > 0);
1426 
1427 	return (0);
1428 }
1429 
1430 int
1431 t4_alloc_page_pods_for_ps(struct ppod_region *pr, struct pageset *ps)
1432 {
1433 	struct ppod_reservation *prsv = &ps->prsv;
1434 
1435 	KASSERT(prsv->prsv_nppods == 0,
1436 	    ("%s: page pods already allocated", __func__));
1437 
1438 	return (t4_alloc_page_pods_for_vmpages(pr, ps->pages, ps->npages,
1439 	    prsv));
1440 }
1441 
1442 int
1443 t4_alloc_page_pods_for_bio(struct ppod_region *pr, struct bio *bp,
1444     struct ppod_reservation *prsv)
1445 {
1446 
1447 	MPASS(bp->bio_flags & BIO_UNMAPPED);
1448 
1449 	return (t4_alloc_page_pods_for_vmpages(pr, bp->bio_ma, bp->bio_ma_n,
1450 	    prsv));
1451 }
1452 
1453 int
1454 t4_alloc_page_pods_for_buf(struct ppod_region *pr, vm_offset_t buf, int len,
1455     struct ppod_reservation *prsv)
1456 {
1457 	int hcf, seglen, idx, npages, nppods;
1458 	uintptr_t start_pva, end_pva, pva, p1;
1459 
1460 	MPASS(buf > 0);
1461 	MPASS(len > 0);
1462 
1463 	/*
1464 	 * The DDP page size is unrelated to the VM page size.  We combine
1465 	 * contiguous physical pages into larger segments to get the best DDP
1466 	 * page size possible.  This is the largest of the four sizes in
1467 	 * A_ULP_RX_ISCSI_PSZ that evenly divides the HCF of the segment sizes
1468 	 * in the page list.
1469 	 */
1470 	hcf = 0;
1471 	start_pva = trunc_page(buf);
1472 	end_pva = trunc_page(buf + len - 1);
1473 	pva = start_pva;
1474 	while (pva <= end_pva) {
1475 		seglen = PAGE_SIZE;
1476 		p1 = pmap_kextract(pva);
1477 		pva += PAGE_SIZE;
1478 		while (pva <= end_pva && p1 + seglen == pmap_kextract(pva)) {
1479 			seglen += PAGE_SIZE;
1480 			pva += PAGE_SIZE;
1481 		}
1482 
1483 		hcf = calculate_hcf(hcf, seglen);
1484 		if (hcf < (1 << pr->pr_page_shift[1])) {
1485 			idx = 0;
1486 			goto have_pgsz;	/* give up, short circuit */
1487 		}
1488 	}
1489 
1490 #define PR_PAGE_MASK(x) ((1 << pr->pr_page_shift[(x)]) - 1)
1491 	MPASS((hcf & PR_PAGE_MASK(0)) == 0); /* PAGE_SIZE is >= 4K everywhere */
1492 	for (idx = nitems(pr->pr_page_shift) - 1; idx > 0; idx--) {
1493 		if ((hcf & PR_PAGE_MASK(idx)) == 0)
1494 			break;
1495 	}
1496 #undef PR_PAGE_MASK
1497 
1498 have_pgsz:
1499 	MPASS(idx <= M_PPOD_PGSZ);
1500 
1501 	npages = 1;
1502 	npages += (end_pva - start_pva) >> pr->pr_page_shift[idx];
1503 	nppods = howmany(npages, PPOD_PAGES);
1504 	if (alloc_page_pods(pr, nppods, idx, prsv) != 0)
1505 		return (ENOMEM);
1506 	MPASS(prsv->prsv_nppods > 0);
1507 
1508 	return (0);
1509 }
1510 
1511 static int
1512 t4_alloc_page_pods_for_rcvbuf(struct ppod_region *pr,
1513     struct ddp_rcv_buffer *drb)
1514 {
1515 	struct ppod_reservation *prsv = &drb->prsv;
1516 
1517 	KASSERT(prsv->prsv_nppods == 0,
1518 	    ("%s: page pods already allocated", __func__));
1519 
1520 	return (t4_alloc_page_pods_for_buf(pr, (vm_offset_t)drb->buf, drb->len,
1521 	    prsv));
1522 }
1523 
1524 int
1525 t4_alloc_page_pods_for_sgl(struct ppod_region *pr, struct ctl_sg_entry *sgl,
1526     int entries, struct ppod_reservation *prsv)
1527 {
1528 	int hcf, seglen, idx = 0, npages, nppods, i, len;
1529 	uintptr_t start_pva, end_pva, pva, p1;
1530 	vm_offset_t buf;
1531 	struct ctl_sg_entry *sge;
1532 
1533 	MPASS(entries > 0);
1534 	MPASS(sgl);
1535 
1536 	/*
1537 	 * The DDP page size is unrelated to the VM page size.	We combine
1538 	 * contiguous physical pages into larger segments to get the best DDP
1539 	 * page size possible.	This is the largest of the four sizes in
1540 	 * A_ULP_RX_ISCSI_PSZ that evenly divides the HCF of the segment sizes
1541 	 * in the page list.
1542 	 */
1543 	hcf = 0;
1544 	for (i = entries - 1; i >= 0; i--) {
1545 		sge = sgl + i;
1546 		buf = (vm_offset_t)sge->addr;
1547 		len = sge->len;
1548 		start_pva = trunc_page(buf);
1549 		end_pva = trunc_page(buf + len - 1);
1550 		pva = start_pva;
1551 		while (pva <= end_pva) {
1552 			seglen = PAGE_SIZE;
1553 			p1 = pmap_kextract(pva);
1554 			pva += PAGE_SIZE;
1555 			while (pva <= end_pva && p1 + seglen ==
1556 			    pmap_kextract(pva)) {
1557 				seglen += PAGE_SIZE;
1558 				pva += PAGE_SIZE;
1559 			}
1560 
1561 			hcf = calculate_hcf(hcf, seglen);
1562 			if (hcf < (1 << pr->pr_page_shift[1])) {
1563 				idx = 0;
1564 				goto have_pgsz; /* give up, short circuit */
1565 			}
1566 		}
1567 	}
1568 #define PR_PAGE_MASK(x) ((1 << pr->pr_page_shift[(x)]) - 1)
1569 	MPASS((hcf & PR_PAGE_MASK(0)) == 0); /* PAGE_SIZE is >= 4K everywhere */
1570 	for (idx = nitems(pr->pr_page_shift) - 1; idx > 0; idx--) {
1571 		if ((hcf & PR_PAGE_MASK(idx)) == 0)
1572 			break;
1573 	}
1574 #undef PR_PAGE_MASK
1575 
1576 have_pgsz:
1577 	MPASS(idx <= M_PPOD_PGSZ);
1578 
1579 	npages = 0;
1580 	while (entries--) {
1581 		npages++;
1582 		start_pva = trunc_page((vm_offset_t)sgl->addr);
1583 		end_pva = trunc_page((vm_offset_t)sgl->addr + sgl->len - 1);
1584 		npages += (end_pva - start_pva) >> pr->pr_page_shift[idx];
1585 		sgl = sgl + 1;
1586 	}
1587 	nppods = howmany(npages, PPOD_PAGES);
1588 	if (alloc_page_pods(pr, nppods, idx, prsv) != 0)
1589 		return (ENOMEM);
1590 	MPASS(prsv->prsv_nppods > 0);
1591 	return (0);
1592 }
1593 
1594 void
1595 t4_free_page_pods(struct ppod_reservation *prsv)
1596 {
1597 	struct ppod_region *pr = prsv->prsv_pr;
1598 	vmem_addr_t addr;
1599 
1600 	MPASS(prsv != NULL);
1601 	MPASS(prsv->prsv_nppods != 0);
1602 
1603 	addr = prsv->prsv_tag & pr->pr_tag_mask;
1604 	MPASS((addr & pr->pr_invalid_bit) == 0);
1605 
1606 #ifdef VERBOSE_TRACES
1607 	CTR4(KTR_CXGBE, "%-17s arena %p, addr 0x%08x, nppods %d", __func__,
1608 	    pr->pr_arena, addr, prsv->prsv_nppods);
1609 #endif
1610 
1611 	vmem_free(pr->pr_arena, addr, PPOD_SZ(prsv->prsv_nppods));
1612 	prsv->prsv_nppods = 0;
1613 }
1614 
1615 #define NUM_ULP_TX_SC_IMM_PPODS (256 / PPOD_SIZE)
1616 
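/*
 * Write a pageset's page pods to card memory using ULP_TX_MEM_WRITE work
 * requests that carry the pods as immediate data.
 */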
1617 int
1618 t4_write_page_pods_for_ps(struct adapter *sc, struct sge_wrq *wrq, int tid,
1619     struct pageset *ps)
1620 {
1621 	struct wrqe *wr;
1622 	struct ulp_mem_io *ulpmc;
1623 	struct ulptx_idata *ulpsc;
1624 	struct pagepod *ppod;
1625 	int i, j, k, n, chunk, len, ddp_pgsz, idx;
1626 	u_int ppod_addr;
1627 	uint32_t cmd;
1628 	struct ppod_reservation *prsv = &ps->prsv;
1629 	struct ppod_region *pr = prsv->prsv_pr;
1630 	vm_paddr_t pa;
1631 
1632 	KASSERT(!(ps->flags & PS_PPODS_WRITTEN),
1633 	    ("%s: page pods already written", __func__));
1634 	MPASS(prsv->prsv_nppods > 0);
1635 
1636 	cmd = htobe32(V_ULPTX_CMD(ULP_TX_MEM_WRITE));
1637 	if (is_t4(sc))
1638 		cmd |= htobe32(F_ULP_MEMIO_ORDER);
1639 	else
1640 		cmd |= htobe32(F_T5_ULP_MEMIO_IMM);
1641 	ddp_pgsz = 1 << pr->pr_page_shift[G_PPOD_PGSZ(prsv->prsv_tag)];
1642 	ppod_addr = pr->pr_start + (prsv->prsv_tag & pr->pr_tag_mask);
1643 	for (i = 0; i < prsv->prsv_nppods; ppod_addr += chunk) {
1644 		/* How many page pods are we writing in this cycle */
1645 		n = min(prsv->prsv_nppods - i, NUM_ULP_TX_SC_IMM_PPODS);
1646 		chunk = PPOD_SZ(n);
1647 		len = roundup2(sizeof(*ulpmc) + sizeof(*ulpsc) + chunk, 16);
1648 
1649 		wr = alloc_wrqe(len, wrq);
1650 		if (wr == NULL)
1651 			return (ENOMEM);	/* ok to just bail out */
1652 		ulpmc = wrtod(wr);
1653 
1654 		INIT_ULPTX_WR(ulpmc, len, 0, 0);
1655 		ulpmc->cmd = cmd;
1656 		if (chip_id(sc) >= CHELSIO_T7)
1657 			ulpmc->dlen = htobe32(V_T7_ULP_MEMIO_DATA_LEN(chunk >> 5));
1658 		else
1659 			ulpmc->dlen = htobe32(V_ULP_MEMIO_DATA_LEN(chunk >> 5));
1660 		ulpmc->len16 = htobe32(howmany(len - sizeof(ulpmc->wr), 16));
1661 		ulpmc->lock_addr = htobe32(V_ULP_MEMIO_ADDR(ppod_addr >> 5));
1662 
1663 		ulpsc = (struct ulptx_idata *)(ulpmc + 1);
1664 		ulpsc->cmd_more = htobe32(V_ULPTX_CMD(ULP_TX_SC_IMM));
1665 		ulpsc->len = htobe32(chunk);
1666 
1667 		ppod = (struct pagepod *)(ulpsc + 1);
1668 		for (j = 0; j < n; i++, j++, ppod++) {
1669 			ppod->vld_tid_pgsz_tag_color = htobe64(F_PPOD_VALID |
1670 			    V_PPOD_TID(tid) | prsv->prsv_tag);
1671 			ppod->len_offset = htobe64(V_PPOD_LEN(ps->len) |
1672 			    V_PPOD_OFST(ps->offset));
1673 			ppod->rsvd = 0;
1674 			idx = i * PPOD_PAGES * (ddp_pgsz / PAGE_SIZE);
1675 			for (k = 0; k < nitems(ppod->addr); k++) {
1676 				if (idx < ps->npages) {
1677 					pa = VM_PAGE_TO_PHYS(ps->pages[idx]);
1678 					ppod->addr[k] = htobe64(pa);
1679 					idx += ddp_pgsz / PAGE_SIZE;
1680 				} else
1681 					ppod->addr[k] = 0;
1682 #if 0
1683 				CTR5(KTR_CXGBE,
1684 				    "%s: tid %d ppod[%d]->addr[%d] = %p",
1685 				    __func__, tid, i, k,
1686 				    be64toh(ppod->addr[k]));
1687 #endif
1688 			}
1689 
1690 		}
1691 
1692 		t4_wrq_tx(sc, wr);
1693 	}
1694 	ps->flags |= PS_PPODS_WRITTEN;
1695 
1696 	return (0);
1697 }
1698 
1699 static int
1700 t4_write_page_pods_for_rcvbuf(struct adapter *sc, struct sge_wrq *wrq, int tid,
1701     struct ddp_rcv_buffer *drb)
1702 {
1703 	struct wrqe *wr;
1704 	struct ulp_mem_io *ulpmc;
1705 	struct ulptx_idata *ulpsc;
1706 	struct pagepod *ppod;
1707 	int i, j, k, n, chunk, len, ddp_pgsz;
1708 	u_int ppod_addr, offset;
1709 	uint32_t cmd;
1710 	struct ppod_reservation *prsv = &drb->prsv;
1711 	struct ppod_region *pr = prsv->prsv_pr;
1712 	uintptr_t end_pva, pva;
1713 	vm_paddr_t pa;
1714 
1715 	MPASS(prsv->prsv_nppods > 0);
1716 
1717 	cmd = htobe32(V_ULPTX_CMD(ULP_TX_MEM_WRITE));
1718 	if (is_t4(sc))
1719 		cmd |= htobe32(F_ULP_MEMIO_ORDER);
1720 	else
1721 		cmd |= htobe32(F_T5_ULP_MEMIO_IMM);
1722 	ddp_pgsz = 1 << pr->pr_page_shift[G_PPOD_PGSZ(prsv->prsv_tag)];
1723 	offset = (uintptr_t)drb->buf & PAGE_MASK;
1724 	ppod_addr = pr->pr_start + (prsv->prsv_tag & pr->pr_tag_mask);
1725 	pva = trunc_page((uintptr_t)drb->buf);
1726 	end_pva = trunc_page((uintptr_t)drb->buf + drb->len - 1);
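	/*
	 * The receive buffer is a kernel virtual buffer, so pod addresses are
	 * generated by walking its KVA from pva to end_pva (the start of the
	 * buffer's last page) in ddp_pgsz steps and translating each step
	 * with pmap_kextract().
	 */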
1727 	for (i = 0; i < prsv->prsv_nppods; ppod_addr += chunk) {
1728 		/* How many page pods are we writing in this cycle */
1729 		n = min(prsv->prsv_nppods - i, NUM_ULP_TX_SC_IMM_PPODS);
1730 		MPASS(n > 0);
1731 		chunk = PPOD_SZ(n);
1732 		len = roundup2(sizeof(*ulpmc) + sizeof(*ulpsc) + chunk, 16);
1733 
1734 		wr = alloc_wrqe(len, wrq);
1735 		if (wr == NULL)
1736 			return (ENOMEM);	/* ok to just bail out */
1737 		ulpmc = wrtod(wr);
1738 
1739 		INIT_ULPTX_WR(ulpmc, len, 0, 0);
1740 		ulpmc->cmd = cmd;
1741 		ulpmc->dlen = htobe32(V_ULP_MEMIO_DATA_LEN(chunk / 32));
1742 		ulpmc->len16 = htobe32(howmany(len - sizeof(ulpmc->wr), 16));
1743 		ulpmc->lock_addr = htobe32(V_ULP_MEMIO_ADDR(ppod_addr >> 5));
1744 
1745 		ulpsc = (struct ulptx_idata *)(ulpmc + 1);
1746 		ulpsc->cmd_more = htobe32(V_ULPTX_CMD(ULP_TX_SC_IMM));
1747 		ulpsc->len = htobe32(chunk);
1748 
1749 		ppod = (struct pagepod *)(ulpsc + 1);
1750 		for (j = 0; j < n; i++, j++, ppod++) {
1751 			ppod->vld_tid_pgsz_tag_color = htobe64(F_PPOD_VALID |
1752 			    V_PPOD_TID(tid) | prsv->prsv_tag);
1753 			ppod->len_offset = htobe64(V_PPOD_LEN(drb->len) |
1754 			    V_PPOD_OFST(offset));
1755 			ppod->rsvd = 0;
1756 
1757 			for (k = 0; k < nitems(ppod->addr); k++) {
1758 				if (pva > end_pva)
1759 					ppod->addr[k] = 0;
1760 				else {
1761 					pa = pmap_kextract(pva);
1762 					ppod->addr[k] = htobe64(pa);
1763 					pva += ddp_pgsz;
1764 				}
1765 #if 0
1766 				CTR5(KTR_CXGBE,
1767 				    "%s: tid %d ppod[%d]->addr[%d] = %p",
1768 				    __func__, tid, i, k,
1769 				    be64toh(ppod->addr[k]));
1770 #endif
1771 			}
1772 
1773 			/*
1774 			 * Walk back 1 segment so that the first address in the
1775 			 * next pod is the same as the last one in the current
1776 			 * pod.
1777 			 */
1778 			pva -= ddp_pgsz;
1779 		}
1780 
1781 		t4_wrq_tx(sc, wr);
1782 	}
1783 
1784 	MPASS(pva <= end_pva);
1785 
1786 	return (0);
1787 }
1788 
1789 struct mbuf *
1790 alloc_raw_wr_mbuf(int len)
1791 {
1792 	struct mbuf *m;
1793 
1794 	if (len <= MHLEN)
1795 		m = m_gethdr(M_NOWAIT, MT_DATA);
1796 	else if (len <= MCLBYTES)
1797 		m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR);
1798 	else
1799 		m = NULL;
1800 	if (m == NULL)
1801 		return (NULL);
1802 	m->m_pkthdr.len = len;
1803 	m->m_len = len;
1804 	set_mbuf_raw_wr(m, true);
1805 	return (m);
1806 }
1807 
1808 int
1809 t4_write_page_pods_for_bio(struct adapter *sc, struct toepcb *toep,
1810     struct ppod_reservation *prsv, struct bio *bp, struct mbufq *wrq)
1811 {
1812 	struct ulp_mem_io *ulpmc;
1813 	struct ulptx_idata *ulpsc;
1814 	struct pagepod *ppod;
1815 	int i, j, k, n, chunk, len, ddp_pgsz, idx;
1816 	u_int ppod_addr;
1817 	uint32_t cmd;
1818 	struct ppod_region *pr = prsv->prsv_pr;
1819 	vm_paddr_t pa;
1820 	struct mbuf *m;
1821 
1822 	MPASS(bp->bio_flags & BIO_UNMAPPED);
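	/*
	 * Unmapped bios carry their data as an array of vm_page_t, so pod
	 * addresses are taken from bp->bio_ma[] (physical pages) rather than
	 * from a kernel virtual address as in the buf/rcvbuf variants above.
	 */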
1823 
1824 	cmd = htobe32(V_ULPTX_CMD(ULP_TX_MEM_WRITE));
1825 	if (is_t4(sc))
1826 		cmd |= htobe32(F_ULP_MEMIO_ORDER);
1827 	else
1828 		cmd |= htobe32(F_T5_ULP_MEMIO_IMM);
1829 	ddp_pgsz = 1 << pr->pr_page_shift[G_PPOD_PGSZ(prsv->prsv_tag)];
1830 	ppod_addr = pr->pr_start + (prsv->prsv_tag & pr->pr_tag_mask);
1831 	for (i = 0; i < prsv->prsv_nppods; ppod_addr += chunk) {
1832 
1833 		/* How many page pods are we writing in this cycle */
1834 		n = min(prsv->prsv_nppods - i, NUM_ULP_TX_SC_IMM_PPODS);
1835 		MPASS(n > 0);
1836 		chunk = PPOD_SZ(n);
1837 		len = roundup2(sizeof(*ulpmc) + sizeof(*ulpsc) + chunk, 16);
1838 
1839 		m = alloc_raw_wr_mbuf(len);
1840 		if (m == NULL)
1841 			return (ENOMEM);
1842 
1843 		ulpmc = mtod(m, struct ulp_mem_io *);
1844 		INIT_ULPTX_WR(ulpmc, len, 0, toep->tid);
1845 		ulpmc->cmd = cmd;
1846 		if (chip_id(sc) >= CHELSIO_T7)
1847 			ulpmc->dlen = htobe32(V_T7_ULP_MEMIO_DATA_LEN(chunk >> 5));
1848 		else
1849 			ulpmc->dlen = htobe32(V_ULP_MEMIO_DATA_LEN(chunk >> 5));
1850 		ulpmc->len16 = htobe32(howmany(len - sizeof(ulpmc->wr), 16));
1851 		ulpmc->lock_addr = htobe32(V_ULP_MEMIO_ADDR(ppod_addr >> 5));
1852 
1853 		ulpsc = (struct ulptx_idata *)(ulpmc + 1);
1854 		ulpsc->cmd_more = htobe32(V_ULPTX_CMD(ULP_TX_SC_IMM));
1855 		ulpsc->len = htobe32(chunk);
1856 
1857 		ppod = (struct pagepod *)(ulpsc + 1);
1858 		for (j = 0; j < n; i++, j++, ppod++) {
1859 			ppod->vld_tid_pgsz_tag_color = htobe64(F_PPOD_VALID |
1860 			    V_PPOD_TID(toep->tid) |
1861 			    (prsv->prsv_tag & ~V_PPOD_PGSZ(M_PPOD_PGSZ)));
1862 			ppod->len_offset = htobe64(V_PPOD_LEN(bp->bio_bcount) |
1863 			    V_PPOD_OFST(bp->bio_ma_offset));
1864 			ppod->rsvd = 0;
1865 			idx = i * PPOD_PAGES * (ddp_pgsz / PAGE_SIZE);
1866 			for (k = 0; k < nitems(ppod->addr); k++) {
1867 				if (idx < bp->bio_ma_n) {
1868 					pa = VM_PAGE_TO_PHYS(bp->bio_ma[idx]);
1869 					ppod->addr[k] = htobe64(pa);
1870 					idx += ddp_pgsz / PAGE_SIZE;
1871 				} else
1872 					ppod->addr[k] = 0;
1873 #if 0
1874 				CTR5(KTR_CXGBE,
1875 				    "%s: tid %d ppod[%d]->addr[%d] = %p",
1876 				    __func__, toep->tid, i, k,
1877 				    be64toh(ppod->addr[k]));
1878 #endif
1879 			}
1880 		}
1881 
1882 		mbufq_enqueue(wrq, m);
1883 	}
1884 
1885 	return (0);
1886 }
1887 
1888 int
1889 t4_write_page_pods_for_buf(struct adapter *sc, struct toepcb *toep,
1890     struct ppod_reservation *prsv, vm_offset_t buf, int buflen,
1891     struct mbufq *wrq)
1892 {
1893 	struct ulp_mem_io *ulpmc;
1894 	struct ulptx_idata *ulpsc;
1895 	struct pagepod *ppod;
1896 	int i, j, k, n, chunk, len, ddp_pgsz;
1897 	u_int ppod_addr, offset;
1898 	uint32_t cmd;
1899 	struct ppod_region *pr = prsv->prsv_pr;
1900 	uintptr_t end_pva, pva;
1901 	vm_paddr_t pa;
1902 	struct mbuf *m;
1903 
1904 	cmd = htobe32(V_ULPTX_CMD(ULP_TX_MEM_WRITE));
1905 	if (is_t4(sc))
1906 		cmd |= htobe32(F_ULP_MEMIO_ORDER);
1907 	else
1908 		cmd |= htobe32(F_T5_ULP_MEMIO_IMM);
1909 	ddp_pgsz = 1 << pr->pr_page_shift[G_PPOD_PGSZ(prsv->prsv_tag)];
1910 	offset = buf & PAGE_MASK;
1911 	ppod_addr = pr->pr_start + (prsv->prsv_tag & pr->pr_tag_mask);
1912 	pva = trunc_page(buf);
1913 	end_pva = trunc_page(buf + buflen - 1);
1914 	for (i = 0; i < prsv->prsv_nppods; ppod_addr += chunk) {
1915 
1916 		/* How many page pods are we writing in this cycle */
1917 		n = min(prsv->prsv_nppods - i, NUM_ULP_TX_SC_IMM_PPODS);
1918 		MPASS(n > 0);
1919 		chunk = PPOD_SZ(n);
1920 		len = roundup2(sizeof(*ulpmc) + sizeof(*ulpsc) + chunk, 16);
1921 
1922 		m = alloc_raw_wr_mbuf(len);
1923 		if (m == NULL)
1924 			return (ENOMEM);
1925 		ulpmc = mtod(m, struct ulp_mem_io *);
1926 
1927 		INIT_ULPTX_WR(ulpmc, len, 0, toep->tid);
1928 		ulpmc->cmd = cmd;
1929 		if (chip_id(sc) >= CHELSIO_T7)
1930 			ulpmc->dlen = htobe32(V_T7_ULP_MEMIO_DATA_LEN(chunk >> 5));
1931 		else
1932 			ulpmc->dlen = htobe32(V_ULP_MEMIO_DATA_LEN(chunk >> 5));
1933 		ulpmc->len16 = htobe32(howmany(len - sizeof(ulpmc->wr), 16));
1934 		ulpmc->lock_addr = htobe32(V_ULP_MEMIO_ADDR(ppod_addr >> 5));
1935 
1936 		ulpsc = (struct ulptx_idata *)(ulpmc + 1);
1937 		ulpsc->cmd_more = htobe32(V_ULPTX_CMD(ULP_TX_SC_IMM));
1938 		ulpsc->len = htobe32(chunk);
1939 
1940 		ppod = (struct pagepod *)(ulpsc + 1);
1941 		for (j = 0; j < n; i++, j++, ppod++) {
1942 			ppod->vld_tid_pgsz_tag_color = htobe64(F_PPOD_VALID |
1943 			    V_PPOD_TID(toep->tid) |
1944 			    (prsv->prsv_tag & ~V_PPOD_PGSZ(M_PPOD_PGSZ)));
1945 			ppod->len_offset = htobe64(V_PPOD_LEN(buflen) |
1946 			    V_PPOD_OFST(offset));
1947 			ppod->rsvd = 0;
1948 
1949 			for (k = 0; k < nitems(ppod->addr); k++) {
1950 				if (pva > end_pva)
1951 					ppod->addr[k] = 0;
1952 				else {
1953 					pa = pmap_kextract(pva);
1954 					ppod->addr[k] = htobe64(pa);
1955 					pva += ddp_pgsz;
1956 				}
1957 #if 0
1958 				CTR5(KTR_CXGBE,
1959 				    "%s: tid %d ppod[%d]->addr[%d] = %p",
1960 				    __func__, toep->tid, i, k,
1961 				    be64toh(ppod->addr[k]));
1962 #endif
1963 			}
1964 
1965 			/*
1966 			 * Walk back 1 segment so that the first address in the
1967 			 * next pod is the same as the last one in the current
1968 			 * pod.
1969 			 */
1970 			pva -= ddp_pgsz;
1971 		}
1972 
1973 		mbufq_enqueue(wrq, m);
1974 	}
1975 
1976 	MPASS(pva <= end_pva);
1977 
1978 	return (0);
1979 }
1980 
1981 int
1982 t4_write_page_pods_for_sgl(struct adapter *sc, struct toepcb *toep,
1983     struct ppod_reservation *prsv, struct ctl_sg_entry *sgl, int entries,
1984     int xferlen, struct mbufq *wrq)
1985 {
1986 	struct ulp_mem_io *ulpmc;
1987 	struct ulptx_idata *ulpsc;
1988 	struct pagepod *ppod;
1989 	int i, j, k, n, chunk, len, ddp_pgsz;
1990 	u_int ppod_addr, offset, sg_offset = 0;
1991 	uint32_t cmd;
1992 	struct ppod_region *pr = prsv->prsv_pr;
1993 	uintptr_t pva;
1994 	vm_paddr_t pa;
1995 	struct mbuf *m;
1996 
1997 	MPASS(sgl != NULL);
1998 	MPASS(entries > 0);
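	/*
	 * The pods are filled by walking the CTL scatter/gather list: within
	 * the current entry the physical address advances in ddp_pgsz steps
	 * (tracked by sg_offset), and once an entry is consumed the walk
	 * moves on to the next one.  Only the first entry's sub-page offset
	 * is programmed into the pods.
	 */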
1999 	cmd = htobe32(V_ULPTX_CMD(ULP_TX_MEM_WRITE));
2000 	if (is_t4(sc))
2001 		cmd |= htobe32(F_ULP_MEMIO_ORDER);
2002 	else
2003 		cmd |= htobe32(F_T5_ULP_MEMIO_IMM);
2004 	ddp_pgsz = 1 << pr->pr_page_shift[G_PPOD_PGSZ(prsv->prsv_tag)];
2005 	offset = (vm_offset_t)sgl->addr & PAGE_MASK;
2006 	ppod_addr = pr->pr_start + (prsv->prsv_tag & pr->pr_tag_mask);
2007 	pva = trunc_page((vm_offset_t)sgl->addr);
2008 	for (i = 0; i < prsv->prsv_nppods; ppod_addr += chunk) {
2009 
2010 		/* How many page pods are we writing in this cycle */
2011 		n = min(prsv->prsv_nppods - i, NUM_ULP_TX_SC_IMM_PPODS);
2012 		MPASS(n > 0);
2013 		chunk = PPOD_SZ(n);
2014 		len = roundup2(sizeof(*ulpmc) + sizeof(*ulpsc) + chunk, 16);
2015 
2016 		m = alloc_raw_wr_mbuf(len);
2017 		if (m == NULL)
2018 			return (ENOMEM);
2019 		ulpmc = mtod(m, struct ulp_mem_io *);
2020 
2021 		INIT_ULPTX_WR(ulpmc, len, 0, toep->tid);
2022 		ulpmc->cmd = cmd;
2023 		if (chip_id(sc) >= CHELSIO_T7)
2024 			ulpmc->dlen = htobe32(V_T7_ULP_MEMIO_DATA_LEN(chunk >> 5));
2025 		else
2026 			ulpmc->dlen = htobe32(V_ULP_MEMIO_DATA_LEN(chunk >> 5));
2027 		ulpmc->len16 = htobe32(howmany(len - sizeof(ulpmc->wr), 16));
2028 		ulpmc->lock_addr = htobe32(V_ULP_MEMIO_ADDR(ppod_addr >> 5));
2029 
2030 		ulpsc = (struct ulptx_idata *)(ulpmc + 1);
2031 		ulpsc->cmd_more = htobe32(V_ULPTX_CMD(ULP_TX_SC_IMM));
2032 		ulpsc->len = htobe32(chunk);
2033 
2034 		ppod = (struct pagepod *)(ulpsc + 1);
2035 		for (j = 0; j < n; i++, j++, ppod++) {
2036 			ppod->vld_tid_pgsz_tag_color = htobe64(F_PPOD_VALID |
2037 			    V_PPOD_TID(toep->tid) |
2038 			    (prsv->prsv_tag & ~V_PPOD_PGSZ(M_PPOD_PGSZ)));
2039 			ppod->len_offset = htobe64(V_PPOD_LEN(xferlen) |
2040 			    V_PPOD_OFST(offset));
2041 			ppod->rsvd = 0;
2042 
2043 			for (k = 0; k < nitems(ppod->addr); k++) {
2044 				if (entries != 0) {
2045 					pa = pmap_kextract(pva + sg_offset);
2046 					ppod->addr[k] = htobe64(pa);
2047 				} else
2048 					ppod->addr[k] = 0;
2049 
2050 #if 0
2051 				CTR5(KTR_CXGBE,
2052 				    "%s: tid %d ppod[%d]->addr[%d] = %p",
2053 				    __func__, toep->tid, i, k,
2054 				    be64toh(ppod->addr[k]));
2055 #endif
2056 
2057 				/*
2058 				 * If this is the last entry in a pod,
2059 				 * reuse the same entry for first address
2060 				 * in the next pod.
2061 				 */
2062 				if (k + 1 == nitems(ppod->addr))
2063 					break;
2064 
2065 				/*
2066 				 * Don't move to the next DDP page if the
2067 				 * sgl is already finished.
2068 				 */
2069 				if (entries == 0)
2070 					continue;
2071 
2072 				sg_offset += ddp_pgsz;
2073 				if (sg_offset == sgl->len) {
2074 					/*
2075 					 * This sgl entry is done.  Go
2076 					 * to the next.
2077 					 */
2078 					entries--;
2079 					sgl++;
2080 					sg_offset = 0;
2081 					if (entries != 0)
2082 						pva = trunc_page(
2083 						    (vm_offset_t)sgl->addr);
2084 				}
2085 			}
2086 		}
2087 
2088 		mbufq_enqueue(wrq, m);
2089 	}
2090 
2091 	return (0);
2092 }
2093 
2094 /*
2095  * Prepare a pageset for DDP.  This sets up page pods.
2096  */
2097 static int
2098 prep_pageset(struct adapter *sc, struct toepcb *toep, struct pageset *ps)
2099 {
2100 	struct tom_data *td = sc->tom_softc;
2101 
2102 	if (ps->prsv.prsv_nppods == 0 &&
2103 	    t4_alloc_page_pods_for_ps(&td->pr, ps) != 0) {
2104 		return (0);
2105 	}
2106 	if (!(ps->flags & PS_PPODS_WRITTEN) &&
2107 	    t4_write_page_pods_for_ps(sc, toep->ctrlq, toep->tid, ps) != 0) {
2108 		return (0);
2109 	}
2110 
2111 	return (1);
2112 }
2113 
2114 int
2115 t4_init_ppod_region(struct ppod_region *pr, struct t4_range *r, u_int psz,
2116     const char *name)
2117 {
2118 	int i;
2119 
2120 	MPASS(pr != NULL);
2121 	MPASS(r->size > 0);
2122 
2123 	pr->pr_start = r->start;
2124 	pr->pr_len = r->size;
2125 	pr->pr_page_shift[0] = 12 + G_HPZ0(psz);
2126 	pr->pr_page_shift[1] = 12 + G_HPZ1(psz);
2127 	pr->pr_page_shift[2] = 12 + G_HPZ2(psz);
2128 	pr->pr_page_shift[3] = 12 + G_HPZ3(psz);
2129 
2130 	/* The SGL -> page pod algorithm requires the sizes to be in order. */
2131 	for (i = 1; i < nitems(pr->pr_page_shift); i++) {
2132 		if (pr->pr_page_shift[i] <= pr->pr_page_shift[i - 1])
2133 			return (ENXIO);
2134 	}
2135 
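	/*
	 * A DDP tag encodes the reservation's offset within this region in
	 * its low pr_tag_mask bits (t4_free_page_pods() recovers the arena
	 * address from the tag this way); the remaining pr_alias_mask bits
	 * are alias bits layered on top.  pr_invalid_bit is the highest bit
	 * of the offset field and is never set in a valid reservation's tag.
	 */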
2136 	pr->pr_tag_mask = ((1 << fls(r->size)) - 1) & V_PPOD_TAG(M_PPOD_TAG);
2137 	pr->pr_alias_mask = V_PPOD_TAG(M_PPOD_TAG) & ~pr->pr_tag_mask;
2138 	if (pr->pr_tag_mask == 0 || pr->pr_alias_mask == 0)
2139 		return (ENXIO);
2140 	pr->pr_alias_shift = fls(pr->pr_tag_mask);
2141 	pr->pr_invalid_bit = 1 << (pr->pr_alias_shift - 1);
2142 
2143 	pr->pr_arena = vmem_create(name, 0, pr->pr_len, PPOD_SIZE, 0,
2144 	    M_FIRSTFIT | M_NOWAIT);
2145 	if (pr->pr_arena == NULL)
2146 		return (ENOMEM);
2147 
2148 	return (0);
2149 }
2150 
2151 void
2152 t4_free_ppod_region(struct ppod_region *pr)
2153 {
2154 
2155 	MPASS(pr != NULL);
2156 
2157 	if (pr->pr_arena)
2158 		vmem_destroy(pr->pr_arena);
2159 	bzero(pr, sizeof(*pr));
2160 }
2161 
2162 static int
2163 pscmp(struct pageset *ps, struct vmspace *vm, vm_offset_t start, int npages,
2164     int pgoff, int len)
2165 {
2166 
2167 	if (ps->start != start || ps->npages != npages ||
2168 	    ps->offset != pgoff || ps->len != len)
2169 		return (1);
2170 
2171 	return (ps->vm != vm || ps->vm_timestamp != vm->vm_map.timestamp);
2172 }
2173 
2174 static int
2175 hold_aio(struct toepcb *toep, struct kaiocb *job, struct pageset **pps)
2176 {
2177 	struct vmspace *vm;
2178 	vm_map_t map;
2179 	vm_offset_t start, end, pgoff;
2180 	struct pageset *ps;
2181 	int n;
2182 
2183 	DDP_ASSERT_LOCKED(toep);
2184 
2185 	/*
2186 	 * The AIO subsystem will cancel and drain all requests before
2187 	 * permitting a process to exit or exec, so p_vmspace should
2188 	 * be stable here.
2189 	 */
2190 	vm = job->userproc->p_vmspace;
2191 	map = &vm->vm_map;
2192 	start = (uintptr_t)job->uaiocb.aio_buf;
2193 	pgoff = start & PAGE_MASK;
2194 	end = round_page(start + job->uaiocb.aio_nbytes);
2195 	start = trunc_page(start);
2196 
2197 	if (end - start > MAX_DDP_BUFFER_SIZE) {
2198 		/*
2199 		 * Truncate the request to a short read.
2200 		 * Alternatively, we could DDP in chunks to the larger
2201 		 * buffer, but that would be quite a bit more work.
2202 		 *
2203 		 * When truncating, round the request down to avoid
2204 		 * crossing a cache line on the final transaction.
2205 		 */
2206 		end = rounddown2(start + MAX_DDP_BUFFER_SIZE, CACHE_LINE_SIZE);
2207 #ifdef VERBOSE_TRACES
2208 		CTR4(KTR_CXGBE, "%s: tid %d, truncating size from %lu to %lu",
2209 		    __func__, toep->tid, (unsigned long)job->uaiocb.aio_nbytes,
2210 		    (unsigned long)(end - (start + pgoff)));
2211 #endif
2212 		job->uaiocb.aio_nbytes = end - (start + pgoff);
2213 		end = round_page(end);
2214 	}
2215 
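	/*
	 * 'n' is the number of system pages spanning the (possibly truncated)
	 * user buffer; e.g. a 10000-byte read starting 0x100 into a 4KB page
	 * spans 3 pages.
	 */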
2216 	n = atop(end - start);
2217 
2218 	/*
2219 	 * Try to reuse a cached pageset.
2220 	 */
2221 	TAILQ_FOREACH(ps, &toep->ddp.cached_pagesets, link) {
2222 		if (pscmp(ps, vm, start, n, pgoff,
2223 		    job->uaiocb.aio_nbytes) == 0) {
2224 			TAILQ_REMOVE(&toep->ddp.cached_pagesets, ps, link);
2225 			toep->ddp.cached_count--;
2226 			*pps = ps;
2227 			return (0);
2228 		}
2229 	}
2230 
2231 	/*
2232 	 * If there are too many cached pagesets to create a new one,
2233 	 * free a pageset before creating a new one.
2234 	 */
2235 	KASSERT(toep->ddp.active_count + toep->ddp.cached_count <=
2236 	    nitems(toep->ddp.db), ("%s: too many wired pagesets", __func__));
2237 	if (toep->ddp.active_count + toep->ddp.cached_count ==
2238 	    nitems(toep->ddp.db)) {
2239 		KASSERT(toep->ddp.cached_count > 0,
2240 		    ("no cached pageset to free"));
2241 		ps = TAILQ_LAST(&toep->ddp.cached_pagesets, pagesetq);
2242 		TAILQ_REMOVE(&toep->ddp.cached_pagesets, ps, link);
2243 		toep->ddp.cached_count--;
2244 		free_pageset(toep->td, ps);
2245 	}
2246 	DDP_UNLOCK(toep);
2247 
2248 	/* Create a new pageset. */
2249 	ps = malloc(sizeof(*ps) + n * sizeof(vm_page_t), M_CXGBE, M_WAITOK |
2250 	    M_ZERO);
2251 	ps->pages = (vm_page_t *)(ps + 1);
2252 	ps->vm_timestamp = map->timestamp;
2253 	ps->npages = vm_fault_quick_hold_pages(map, start, end - start,
2254 	    VM_PROT_WRITE, ps->pages, n);
2255 
2256 	DDP_LOCK(toep);
2257 	if (ps->npages < 0) {
2258 		free(ps, M_CXGBE);
2259 		return (EFAULT);
2260 	}
2261 
2262 	KASSERT(ps->npages == n, ("hold_aio: page count mismatch: %d vs %d",
2263 	    ps->npages, n));
2264 
2265 	ps->offset = pgoff;
2266 	ps->len = job->uaiocb.aio_nbytes;
2267 	refcount_acquire(&vm->vm_refcnt);
2268 	ps->vm = vm;
2269 	ps->start = start;
2270 
2271 	CTR5(KTR_CXGBE, "%s: tid %d, new pageset %p for job %p, npages %d",
2272 	    __func__, toep->tid, ps, job, ps->npages);
2273 	*pps = ps;
2274 	return (0);
2275 }
2276 
2277 static void
2278 ddp_complete_all(struct toepcb *toep, int error)
2279 {
2280 	struct kaiocb *job;
2281 
2282 	DDP_ASSERT_LOCKED(toep);
2283 	KASSERT((toep->ddp.flags & DDP_AIO) != 0, ("%s: DDP_RCVBUF", __func__));
2284 	while (!TAILQ_EMPTY(&toep->ddp.aiojobq)) {
2285 		job = TAILQ_FIRST(&toep->ddp.aiojobq);
2286 		TAILQ_REMOVE(&toep->ddp.aiojobq, job, list);
2287 		toep->ddp.waiting_count--;
2288 		if (aio_clear_cancel_function(job))
2289 			ddp_complete_one(job, error);
2290 	}
2291 }
2292 
2293 static void
2294 aio_ddp_cancel_one(struct kaiocb *job)
2295 {
2296 	long copied;
2297 
2298 	/*
2299 	 * If this job had copied data out of the socket buffer before
2300 	 * it was cancelled, report it as a short read rather than an
2301 	 * error.
2302 	 */
2303 	copied = job->aio_received;
2304 	if (copied != 0)
2305 		aio_complete(job, copied, 0);
2306 	else
2307 		aio_cancel(job);
2308 }
2309 
2310 /*
2311  * Called when the main loop wants to requeue a job to retry it later.
2312  * Deals with the race of the job being cancelled while it was being
2313  * examined.
2314  */
2315 static void
2316 aio_ddp_requeue_one(struct toepcb *toep, struct kaiocb *job)
2317 {
2318 
2319 	DDP_ASSERT_LOCKED(toep);
2320 	if (!(toep->ddp.flags & DDP_DEAD) &&
2321 	    aio_set_cancel_function(job, t4_aio_cancel_queued)) {
2322 		TAILQ_INSERT_HEAD(&toep->ddp.aiojobq, job, list);
2323 		toep->ddp.waiting_count++;
2324 	} else
2325 		aio_ddp_cancel_one(job);
2326 }
2327 
2328 static void
2329 aio_ddp_requeue(struct toepcb *toep)
2330 {
2331 	struct adapter *sc = td_adapter(toep->td);
2332 	struct socket *so;
2333 	struct sockbuf *sb;
2334 	struct inpcb *inp;
2335 	struct kaiocb *job;
2336 	struct ddp_buffer *db;
2337 	size_t copied, offset, resid;
2338 	struct pageset *ps;
2339 	struct mbuf *m;
2340 	uint64_t ddp_flags, ddp_flags_mask;
2341 	struct wrqe *wr;
2342 	int buf_flag, db_idx, error;
2343 
2344 	DDP_ASSERT_LOCKED(toep);
2345 
2346 restart:
2347 	if (toep->ddp.flags & DDP_DEAD) {
2348 		MPASS(toep->ddp.waiting_count == 0);
2349 		MPASS(toep->ddp.active_count == 0);
2350 		return;
2351 	}
2352 
2353 	if (toep->ddp.waiting_count == 0 ||
2354 	    toep->ddp.active_count == nitems(toep->ddp.db)) {
2355 		return;
2356 	}
2357 
2358 	job = TAILQ_FIRST(&toep->ddp.aiojobq);
2359 	so = job->fd_file->f_data;
2360 	sb = &so->so_rcv;
2361 	SOCKBUF_LOCK(sb);
2362 
2363 	/* We will never get anything unless we are or were connected. */
2364 	if (!(so->so_state & (SS_ISCONNECTED|SS_ISDISCONNECTED))) {
2365 		SOCKBUF_UNLOCK(sb);
2366 		ddp_complete_all(toep, ENOTCONN);
2367 		return;
2368 	}
2369 
2370 	KASSERT(toep->ddp.active_count == 0 || sbavail(sb) == 0,
2371 	    ("%s: pending sockbuf data and DDP is active", __func__));
2372 
2373 	/* Abort if socket has reported problems. */
2374 	/* XXX: Wait for any queued DDP's to finish and/or flush them? */
2375 	if (so->so_error && sbavail(sb) == 0) {
2376 		toep->ddp.waiting_count--;
2377 		TAILQ_REMOVE(&toep->ddp.aiojobq, job, list);
2378 		if (!aio_clear_cancel_function(job)) {
2379 			SOCKBUF_UNLOCK(sb);
2380 			goto restart;
2381 		}
2382 
2383 		/*
2384 		 * If this job has previously copied some data, report
2385 		 * a short read and leave the error to be reported by
2386 		 * a future request.
2387 		 */
2388 		copied = job->aio_received;
2389 		if (copied != 0) {
2390 			SOCKBUF_UNLOCK(sb);
2391 			aio_complete(job, copied, 0);
2392 			goto restart;
2393 		}
2394 		error = so->so_error;
2395 		so->so_error = 0;
2396 		SOCKBUF_UNLOCK(sb);
2397 		aio_complete(job, -1, error);
2398 		goto restart;
2399 	}
2400 
2401 	/*
2402 	 * Door is closed.  If there is pending data in the socket buffer,
2403 	 * deliver it.  If there are pending DDP requests, wait for those
2404 	 * to complete.  Once they have completed, return EOF reads.
2405 	 */
2406 	if (sb->sb_state & SBS_CANTRCVMORE && sbavail(sb) == 0) {
2407 		SOCKBUF_UNLOCK(sb);
2408 		if (toep->ddp.active_count != 0)
2409 			return;
2410 		ddp_complete_all(toep, 0);
2411 		return;
2412 	}
2413 
2414 	/*
2415 	 * If DDP is not enabled and there is no pending socket buffer
2416 	 * data, try to enable DDP.
2417 	 */
2418 	if (sbavail(sb) == 0 && (toep->ddp.flags & DDP_ON) == 0) {
2419 		SOCKBUF_UNLOCK(sb);
2420 
2421 		/*
2422 		 * Wait for the card to ACK that DDP is enabled before
2423 		 * queueing any buffers.  Currently this waits for an
2424 		 * indicate to arrive.  This could use a TCB_SET_FIELD_RPL
2425 		 * message to know that DDP was enabled instead of waiting
2426 		 * for the indicate which would avoid copying the indicate
2427 		 * if no data is pending.
2428 		 *
2429 		 * XXX: Might want to limit the indicate size to the size
2430 		 * of the first queued request.
2431 		 */
2432 		if ((toep->ddp.flags & DDP_SC_REQ) == 0)
2433 			enable_ddp(sc, toep);
2434 		return;
2435 	}
2436 	SOCKBUF_UNLOCK(sb);
2437 
2438 	/*
2439 	 * If another thread is queueing a buffer for DDP, let it
2440 	 * drain any work and return.
2441 	 */
2442 	if (toep->ddp.queueing != NULL)
2443 		return;
2444 
2445 	/* Take the next job to prep it for DDP. */
2446 	toep->ddp.waiting_count--;
2447 	TAILQ_REMOVE(&toep->ddp.aiojobq, job, list);
2448 	if (!aio_clear_cancel_function(job))
2449 		goto restart;
2450 	toep->ddp.queueing = job;
2451 
2452 	/* NB: This drops DDP_LOCK while it holds the backing VM pages. */
2453 	error = hold_aio(toep, job, &ps);
2454 	if (error != 0) {
2455 		ddp_complete_one(job, error);
2456 		toep->ddp.queueing = NULL;
2457 		goto restart;
2458 	}
2459 
2460 	SOCKBUF_LOCK(sb);
2461 	if (so->so_error && sbavail(sb) == 0) {
2462 		copied = job->aio_received;
2463 		if (copied != 0) {
2464 			SOCKBUF_UNLOCK(sb);
2465 			recycle_pageset(toep, ps);
2466 			aio_complete(job, copied, 0);
2467 			toep->ddp.queueing = NULL;
2468 			goto restart;
2469 		}
2470 
2471 		error = so->so_error;
2472 		so->so_error = 0;
2473 		SOCKBUF_UNLOCK(sb);
2474 		recycle_pageset(toep, ps);
2475 		aio_complete(job, -1, error);
2476 		toep->ddp.queueing = NULL;
2477 		goto restart;
2478 	}
2479 
2480 	if (sb->sb_state & SBS_CANTRCVMORE && sbavail(sb) == 0) {
2481 		SOCKBUF_UNLOCK(sb);
2482 		recycle_pageset(toep, ps);
2483 		if (toep->ddp.active_count != 0) {
2484 			/*
2485 			 * The door is closed, but there are still pending
2486 			 * DDP buffers.  Requeue.  These jobs will all be
2487 			 * completed once those buffers drain.
2488 			 */
2489 			aio_ddp_requeue_one(toep, job);
2490 			toep->ddp.queueing = NULL;
2491 			return;
2492 		}
2493 		ddp_complete_one(job, 0);
2494 		ddp_complete_all(toep, 0);
2495 		toep->ddp.queueing = NULL;
2496 		return;
2497 	}
2498 
2499 sbcopy:
2500 	/*
2501 	 * If the toep is dead, there shouldn't be any data in the socket
2502 	 * buffer, so the above case should have handled this.
2503 	 */
2504 	MPASS(!(toep->ddp.flags & DDP_DEAD));
2505 
2506 	/*
2507 	 * If there is pending data in the socket buffer (either
2508 	 * from before the requests were queued or a DDP indicate),
2509 	 * copy those mbufs out directly.
2510 	 */
2511 	copied = 0;
2512 	offset = ps->offset + job->aio_received;
2513 	MPASS(job->aio_received <= job->uaiocb.aio_nbytes);
2514 	resid = job->uaiocb.aio_nbytes - job->aio_received;
2515 	m = sb->sb_mb;
2516 	KASSERT(m == NULL || toep->ddp.active_count == 0,
2517 	    ("%s: sockbuf data with active DDP", __func__));
2518 	while (m != NULL && resid > 0) {
2519 		struct iovec iov[1];
2520 		struct uio uio;
2521 #ifdef INVARIANTS
2522 		int error;
2523 #endif
2524 
2525 		iov[0].iov_base = mtod(m, void *);
2526 		iov[0].iov_len = m->m_len;
2527 		if (iov[0].iov_len > resid)
2528 			iov[0].iov_len = resid;
2529 		uio.uio_iov = iov;
2530 		uio.uio_iovcnt = 1;
2531 		uio.uio_offset = 0;
2532 		uio.uio_resid = iov[0].iov_len;
2533 		uio.uio_segflg = UIO_SYSSPACE;
2534 		uio.uio_rw = UIO_WRITE;
2535 #ifdef INVARIANTS
2536 		error = uiomove_fromphys(ps->pages, offset + copied,
2537 		    uio.uio_resid, &uio);
2538 #else
2539 		uiomove_fromphys(ps->pages, offset + copied, uio.uio_resid, &uio);
2540 #endif
2541 		MPASS(error == 0 && uio.uio_resid == 0);
2542 		copied += uio.uio_offset;
2543 		resid -= uio.uio_offset;
2544 		m = m->m_next;
2545 	}
2546 	if (copied != 0) {
2547 		sbdrop_locked(sb, copied);
2548 		job->aio_received += copied;
2549 		job->msgrcv = 1;
2550 		copied = job->aio_received;
2551 		inp = sotoinpcb(so);
2552 		if (!INP_TRY_WLOCK(inp)) {
2553 			/*
2554 			 * The reference on the socket file descriptor in
2555 			 * the AIO job should keep 'sb' and 'inp' stable.
2556 			 * Our caller has a reference on the 'toep' that
2557 			 * keeps it stable.
2558 			 */
2559 			SOCKBUF_UNLOCK(sb);
2560 			DDP_UNLOCK(toep);
2561 			INP_WLOCK(inp);
2562 			DDP_LOCK(toep);
2563 			SOCKBUF_LOCK(sb);
2564 
2565 			/*
2566 			 * If the socket has been closed, we should detect
2567 			 * that and complete this request if needed on
2568 			 * the next trip around the loop.
2569 			 */
2570 		}
2571 		t4_rcvd_locked(&toep->td->tod, intotcpcb(inp));
2572 		INP_WUNLOCK(inp);
2573 		if (resid == 0 || toep->ddp.flags & DDP_DEAD) {
2574 			/*
2575 			 * We filled the entire buffer with socket
2576 			 * data, DDP is not being used, or the socket
2577 			 * is being shut down, so complete the
2578 			 * request.
2579 			 */
2580 			SOCKBUF_UNLOCK(sb);
2581 			recycle_pageset(toep, ps);
2582 			aio_complete(job, copied, 0);
2583 			toep->ddp.queueing = NULL;
2584 			goto restart;
2585 		}
2586 
2587 		/*
2588 		 * If DDP is not enabled, requeue this request and restart.
2589 		 * This will either enable DDP or wait for more data to
2590 		 * arrive on the socket buffer.
2591 		 */
2592 		if ((toep->ddp.flags & (DDP_ON | DDP_SC_REQ)) != DDP_ON) {
2593 			SOCKBUF_UNLOCK(sb);
2594 			recycle_pageset(toep, ps);
2595 			aio_ddp_requeue_one(toep, job);
2596 			toep->ddp.queueing = NULL;
2597 			goto restart;
2598 		}
2599 
2600 		/*
2601 		 * An indicate might have arrived and been added to
2602 		 * the socket buffer while it was unlocked after the
2603 		 * copy to lock the INP.  If so, restart the copy.
2604 		 */
2605 		if (sbavail(sb) != 0)
2606 			goto sbcopy;
2607 	}
2608 	SOCKBUF_UNLOCK(sb);
2609 
2610 	if (prep_pageset(sc, toep, ps) == 0) {
2611 		recycle_pageset(toep, ps);
2612 		aio_ddp_requeue_one(toep, job);
2613 		toep->ddp.queueing = NULL;
2614 
2615 		/*
2616 		 * XXX: Need to retry this later.  Mostly need a trigger
2617 		 * when page pods are freed up.
2618 		 */
2619 		printf("%s: prep_pageset failed\n", __func__);
2620 		return;
2621 	}
2622 
2623 	/* Determine which DDP buffer to use. */
2624 	if (toep->ddp.db[0].job == NULL) {
2625 		db_idx = 0;
2626 	} else {
2627 		MPASS(toep->ddp.db[1].job == NULL);
2628 		db_idx = 1;
2629 	}
2630 
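	/*
	 * Build the TCB RX_DDP_FLAGS update for the chosen buffer slot:
	 * ddp_flags holds the bits to turn on (mark the buffer valid and, for
	 * non-blocking sockets, request a flush), while ddp_flags_mask
	 * selects which flag bits mk_update_tcb_for_ddp() may modify for
	 * that slot.
	 */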
2631 	ddp_flags = 0;
2632 	ddp_flags_mask = 0;
2633 	if (db_idx == 0) {
2634 		ddp_flags |= V_TF_DDP_BUF0_VALID(1);
2635 		if (so->so_state & SS_NBIO)
2636 			ddp_flags |= V_TF_DDP_BUF0_FLUSH(1);
2637 		ddp_flags_mask |= V_TF_DDP_PSH_NO_INVALIDATE0(1) |
2638 		    V_TF_DDP_PUSH_DISABLE_0(1) | V_TF_DDP_PSHF_ENABLE_0(1) |
2639 		    V_TF_DDP_BUF0_FLUSH(1) | V_TF_DDP_BUF0_VALID(1);
2640 		buf_flag = DDP_BUF0_ACTIVE;
2641 	} else {
2642 		ddp_flags |= V_TF_DDP_BUF1_VALID(1);
2643 		if (so->so_state & SS_NBIO)
2644 			ddp_flags |= V_TF_DDP_BUF1_FLUSH(1);
2645 		ddp_flags_mask |= V_TF_DDP_PSH_NO_INVALIDATE1(1) |
2646 		    V_TF_DDP_PUSH_DISABLE_1(1) | V_TF_DDP_PSHF_ENABLE_1(1) |
2647 		    V_TF_DDP_BUF1_FLUSH(1) | V_TF_DDP_BUF1_VALID(1);
2648 		buf_flag = DDP_BUF1_ACTIVE;
2649 	}
2650 	MPASS((toep->ddp.flags & buf_flag) == 0);
2651 	if ((toep->ddp.flags & (DDP_BUF0_ACTIVE | DDP_BUF1_ACTIVE)) == 0) {
2652 		MPASS(db_idx == 0);
2653 		MPASS(toep->ddp.active_id == -1);
2654 		MPASS(toep->ddp.active_count == 0);
2655 		ddp_flags_mask |= V_TF_DDP_ACTIVE_BUF(1);
2656 	}
2657 
2658 	/*
2659 	 * The TID for this connection should still be valid.  If DDP_DEAD
2660 	 * is set, SBS_CANTRCVMORE should be set, so we shouldn't be
2661 	 * this far anyway.  Even if the socket is closing on the other
2662 	 * end, the AIO job holds a reference on this end of the socket
2663 	 * which will keep it open and keep the TCP PCB attached until
2664 	 * after the job is completed.
2665 	 */
2666 	wr = mk_update_tcb_for_ddp(sc, toep, db_idx, &ps->prsv,
2667 	    job->aio_received, ps->len, ddp_flags, ddp_flags_mask);
2668 	if (wr == NULL) {
2669 		recycle_pageset(toep, ps);
2670 		aio_ddp_requeue_one(toep, job);
2671 		toep->ddp.queueing = NULL;
2672 
2673 		/*
2674 		 * XXX: Need a way to kick a retry here.
2675 		 *
2676 		 * XXX: We know the fixed size needed and could
2677 		 * preallocate this using a blocking request at the
2678 		 * start of the task to avoid having to handle this
2679 		 * edge case.
2680 		 */
2681 		printf("%s: mk_update_tcb_for_ddp failed\n", __func__);
2682 		return;
2683 	}
2684 
2685 	if (!aio_set_cancel_function(job, t4_aio_cancel_active)) {
2686 		free_wrqe(wr);
2687 		recycle_pageset(toep, ps);
2688 		aio_ddp_cancel_one(job);
2689 		toep->ddp.queueing = NULL;
2690 		goto restart;
2691 	}
2692 
2693 #ifdef VERBOSE_TRACES
2694 	CTR6(KTR_CXGBE,
2695 	    "%s: tid %u, scheduling %p for DDP[%d] (flags %#lx/%#lx)", __func__,
2696 	    toep->tid, job, db_idx, ddp_flags, ddp_flags_mask);
2697 #endif
2698 	/* Give the chip the go-ahead. */
2699 	t4_wrq_tx(sc, wr);
2700 	db = &toep->ddp.db[db_idx];
2701 	db->cancel_pending = 0;
2702 	db->job = job;
2703 	db->ps = ps;
2704 	toep->ddp.queueing = NULL;
2705 	toep->ddp.flags |= buf_flag;
2706 	toep->ddp.active_count++;
2707 	if (toep->ddp.active_count == 1) {
2708 		MPASS(toep->ddp.active_id == -1);
2709 		toep->ddp.active_id = db_idx;
2710 		CTR2(KTR_CXGBE, "%s: ddp_active_id = %d", __func__,
2711 		    toep->ddp.active_id);
2712 	}
2713 	goto restart;
2714 }
2715 
2716 void
2717 ddp_queue_toep(struct toepcb *toep)
2718 {
2719 
2720 	DDP_ASSERT_LOCKED(toep);
2721 	if (toep->ddp.flags & DDP_TASK_ACTIVE)
2722 		return;
2723 	toep->ddp.flags |= DDP_TASK_ACTIVE;
2724 	hold_toepcb(toep);
2725 	soaio_enqueue(&toep->ddp.requeue_task);
2726 }
2727 
2728 static void
2729 aio_ddp_requeue_task(void *context, int pending)
2730 {
2731 	struct toepcb *toep = context;
2732 
2733 	DDP_LOCK(toep);
2734 	aio_ddp_requeue(toep);
2735 	toep->ddp.flags &= ~DDP_TASK_ACTIVE;
2736 	DDP_UNLOCK(toep);
2737 
2738 	free_toepcb(toep);
2739 }
2740 
2741 static void
2742 t4_aio_cancel_active(struct kaiocb *job)
2743 {
2744 	struct socket *so = job->fd_file->f_data;
2745 	struct tcpcb *tp = sototcpcb(so);
2746 	struct toepcb *toep = tp->t_toe;
2747 	struct adapter *sc = td_adapter(toep->td);
2748 	uint64_t valid_flag;
2749 	int i;
2750 
2751 	DDP_LOCK(toep);
2752 	if (aio_cancel_cleared(job)) {
2753 		DDP_UNLOCK(toep);
2754 		aio_ddp_cancel_one(job);
2755 		return;
2756 	}
2757 
2758 	for (i = 0; i < nitems(toep->ddp.db); i++) {
2759 		if (toep->ddp.db[i].job == job) {
2760 			/* Should only ever get one cancel request for a job. */
2761 			MPASS(toep->ddp.db[i].cancel_pending == 0);
2762 
2763 			/*
2764 			 * Invalidate this buffer.  It will be
2765 			 * cancelled or partially completed once the
2766 			 * card ACKs the invalidate.
2767 			 */
2768 			valid_flag = i == 0 ? V_TF_DDP_BUF0_VALID(1) :
2769 			    V_TF_DDP_BUF1_VALID(1);
2770 			t4_set_tcb_field(sc, toep->ctrlq, toep,
2771 			    W_TCB_RX_DDP_FLAGS, valid_flag, 0, 1,
2772 			    CPL_COOKIE_DDP0 + i);
2773 			toep->ddp.db[i].cancel_pending = 1;
2774 			CTR2(KTR_CXGBE, "%s: request %p marked pending",
2775 			    __func__, job);
2776 			break;
2777 		}
2778 	}
2779 	DDP_UNLOCK(toep);
2780 }
2781 
2782 static void
2783 t4_aio_cancel_queued(struct kaiocb *job)
2784 {
2785 	struct socket *so = job->fd_file->f_data;
2786 	struct tcpcb *tp = sototcpcb(so);
2787 	struct toepcb *toep = tp->t_toe;
2788 
2789 	DDP_LOCK(toep);
2790 	if (!aio_cancel_cleared(job)) {
2791 		TAILQ_REMOVE(&toep->ddp.aiojobq, job, list);
2792 		toep->ddp.waiting_count--;
2793 		if (toep->ddp.waiting_count == 0)
2794 			ddp_queue_toep(toep);
2795 	}
2796 	CTR2(KTR_CXGBE, "%s: request %p cancelled", __func__, job);
2797 	DDP_UNLOCK(toep);
2798 
2799 	aio_ddp_cancel_one(job);
2800 }
2801 
2802 int
2803 t4_aio_queue_ddp(struct socket *so, struct kaiocb *job)
2804 {
2805 	struct inpcb *inp = sotoinpcb(so);
2806 	struct tcpcb *tp = intotcpcb(inp);
2807 	struct toepcb *toep = tp->t_toe;
2808 
2809 	/* Ignore writes. */
2810 	if (job->uaiocb.aio_lio_opcode != LIO_READ)
2811 		return (EOPNOTSUPP);
2812 
2813 	INP_WLOCK(inp);
2814 	if (__predict_false(ulp_mode(toep) == ULP_MODE_NONE)) {
2815 		if (!set_ddp_ulp_mode(toep)) {
2816 			INP_WUNLOCK(inp);
2817 			return (EOPNOTSUPP);
2818 		}
2819 	}
2820 	INP_WUNLOCK(inp);
2821 
2822 	DDP_LOCK(toep);
2823 
2824 	/*
2825 	 * If DDP is being used for all normal receive, don't use it
2826 	 * for AIO.
2827 	 */
2828 	if ((toep->ddp.flags & DDP_RCVBUF) != 0) {
2829 		DDP_UNLOCK(toep);
2830 		return (EOPNOTSUPP);
2831 	}
2832 
2833 	if ((toep->ddp.flags & DDP_AIO) == 0) {
2834 		toep->ddp.flags |= DDP_AIO;
2835 		TAILQ_INIT(&toep->ddp.cached_pagesets);
2836 		TAILQ_INIT(&toep->ddp.aiojobq);
2837 		TASK_INIT(&toep->ddp.requeue_task, 0, aio_ddp_requeue_task,
2838 		    toep);
2839 	}
2840 
2841 	/*
2842 	 * XXX: Think about possibly returning errors for ENOTCONN,
2843 	 * etc.  Perhaps the caller would only queue the request
2844 	 * if it failed with EOPNOTSUPP?
2845 	 */
2846 
2847 #ifdef VERBOSE_TRACES
2848 	CTR3(KTR_CXGBE, "%s: queueing %p for tid %u", __func__, job, toep->tid);
2849 #endif
2850 	if (!aio_set_cancel_function(job, t4_aio_cancel_queued))
2851 		panic("new job was cancelled");
2852 	TAILQ_INSERT_TAIL(&toep->ddp.aiojobq, job, list);
2853 	toep->ddp.waiting_count++;
2854 
2855 	/*
2856 	 * Try to handle this request synchronously.  If this has
2857 	 * to block because the task is running, it will just bail
2858 	 * and let the task handle it instead.
2859 	 */
2860 	aio_ddp_requeue(toep);
2861 	DDP_UNLOCK(toep);
2862 	return (0);
2863 }
2864 
2865 static void
2866 ddp_rcvbuf_requeue(struct toepcb *toep)
2867 {
2868 	struct socket *so;
2869 	struct sockbuf *sb;
2870 	struct inpcb *inp;
2871 	struct ddp_rcv_buffer *drb;
2872 
2873 	DDP_ASSERT_LOCKED(toep);
2874 restart:
2875 	if ((toep->ddp.flags & DDP_DEAD) != 0) {
2876 		MPASS(toep->ddp.active_count == 0);
2877 		return;
2878 	}
2879 
2880 	/* If both buffers are active, nothing to do. */
2881 	if (toep->ddp.active_count == nitems(toep->ddp.db)) {
2882 		return;
2883 	}
2884 
2885 	inp = toep->inp;
2886 	so = inp->inp_socket;
2887 	sb = &so->so_rcv;
2888 
2889 	drb = alloc_cached_ddp_rcv_buffer(toep);
2890 	DDP_UNLOCK(toep);
2891 
2892 	if (drb == NULL) {
2893 		drb = alloc_ddp_rcv_buffer(toep, M_WAITOK);
2894 		if (drb == NULL) {
2895 			printf("%s: failed to allocate buffer\n", __func__);
2896 			DDP_LOCK(toep);
2897 			return;
2898 		}
2899 	}
2900 
2901 	DDP_LOCK(toep);
2902 	if ((toep->ddp.flags & DDP_DEAD) != 0 ||
2903 	    toep->ddp.active_count == nitems(toep->ddp.db)) {
2904 		recycle_ddp_rcv_buffer(toep, drb);
2905 		return;
2906 	}
2907 
2908 	/* We will never get anything unless we are or were connected. */
2909 	SOCKBUF_LOCK(sb);
2910 	if (!(so->so_state & (SS_ISCONNECTED|SS_ISDISCONNECTED))) {
2911 		SOCKBUF_UNLOCK(sb);
2912 		recycle_ddp_rcv_buffer(toep, drb);
2913 		return;
2914 	}
2915 
2916 	/* Abort if socket has reported problems or is closed. */
2917 	if (so->so_error != 0 || (sb->sb_state & SBS_CANTRCVMORE) != 0) {
2918 		SOCKBUF_UNLOCK(sb);
2919 		recycle_ddp_rcv_buffer(toep, drb);
2920 		return;
2921 	}
2922 	SOCKBUF_UNLOCK(sb);
2923 
2924 	if (!queue_ddp_rcvbuf(toep, drb)) {
2925 		/*
2926 		 * XXX: Need a way to kick a retry here.
2927 		 *
2928 		 * XXX: We know the fixed size needed and could
2929 		 * preallocate the work request using a blocking
2930 		 * request at the start of the task to avoid having to
2931 		 * handle this edge case.
2932 		 */
2933 		return;
2934 	}
2935 	goto restart;
2936 }
2937 
2938 static void
2939 ddp_rcvbuf_requeue_task(void *context, int pending)
2940 {
2941 	struct toepcb *toep = context;
2942 
2943 	DDP_LOCK(toep);
2944 	ddp_rcvbuf_requeue(toep);
2945 	toep->ddp.flags &= ~DDP_TASK_ACTIVE;
2946 	DDP_UNLOCK(toep);
2947 
2948 	free_toepcb(toep);
2949 }
2950 
2951 int
2952 t4_enable_ddp_rcv(struct socket *so, struct toepcb *toep)
2953 {
2954 	struct inpcb *inp = sotoinpcb(so);
2955 	struct adapter *sc = td_adapter(toep->td);
2956 
2957 	INP_WLOCK(inp);
2958 	switch (ulp_mode(toep)) {
2959 	case ULP_MODE_TCPDDP:
2960 		break;
2961 	case ULP_MODE_NONE:
2962 		if (set_ddp_ulp_mode(toep))
2963 			break;
2964 		/* FALLTHROUGH */
2965 	default:
2966 		INP_WUNLOCK(inp);
2967 		return (EOPNOTSUPP);
2968 	}
2969 	INP_WUNLOCK(inp);
2970 
2971 	DDP_LOCK(toep);
2972 
2973 	/*
2974 	 * If DDP is being used for AIO already, don't use it for
2975 	 * normal receive.
2976 	 */
2977 	if ((toep->ddp.flags & DDP_AIO) != 0) {
2978 		DDP_UNLOCK(toep);
2979 		return (EOPNOTSUPP);
2980 	}
2981 
2982 	if ((toep->ddp.flags & DDP_RCVBUF) != 0) {
2983 		DDP_UNLOCK(toep);
2984 		return (EBUSY);
2985 	}
2986 
2987 	toep->ddp.flags |= DDP_RCVBUF;
2988 	TAILQ_INIT(&toep->ddp.cached_buffers);
2989 	enable_ddp(sc, toep);
2990 	TASK_INIT(&toep->ddp.requeue_task, 0, ddp_rcvbuf_requeue_task, toep);
2991 	ddp_queue_toep(toep);
2992 	DDP_UNLOCK(toep);
2993 	return (0);
2994 }
2995 
2996 void
2997 t4_ddp_mod_load(void)
2998 {
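	/*
	 * Clamp the tunable to [PAGE_SIZE, MAX_DDP_BUFFER_SIZE] and, if it is
	 * not already a power of two, round it up to the next one
	 * (e.g. 6000 -> 8192), presumably so a single DDP page size can
	 * describe the whole receive buffer.
	 */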
2999 	if (t4_ddp_rcvbuf_len < PAGE_SIZE)
3000 		t4_ddp_rcvbuf_len = PAGE_SIZE;
3001 	if (t4_ddp_rcvbuf_len > MAX_DDP_BUFFER_SIZE)
3002 		t4_ddp_rcvbuf_len = MAX_DDP_BUFFER_SIZE;
3003 	if (!powerof2(t4_ddp_rcvbuf_len))
3004 		t4_ddp_rcvbuf_len = 1 << fls(t4_ddp_rcvbuf_len);
3005 
3006 	t4_register_shared_cpl_handler(CPL_SET_TCB_RPL, do_ddp_tcb_rpl,
3007 	    CPL_COOKIE_DDP0);
3008 	t4_register_shared_cpl_handler(CPL_SET_TCB_RPL, do_ddp_tcb_rpl,
3009 	    CPL_COOKIE_DDP1);
3010 	t4_register_cpl_handler(CPL_RX_DATA_DDP, do_rx_data_ddp);
3011 	t4_register_cpl_handler(CPL_RX_DDP_COMPLETE, do_rx_ddp_complete);
3012 	TAILQ_INIT(&ddp_orphan_pagesets);
3013 	mtx_init(&ddp_orphan_pagesets_lock, "ddp orphans", NULL, MTX_DEF);
3014 	TASK_INIT(&ddp_orphan_task, 0, ddp_free_orphan_pagesets, NULL);
3015 }
3016 
3017 void
3018 t4_ddp_mod_unload(void)
3019 {
3020 
3021 	taskqueue_drain(taskqueue_thread, &ddp_orphan_task);
3022 	MPASS(TAILQ_EMPTY(&ddp_orphan_pagesets));
3023 	mtx_destroy(&ddp_orphan_pagesets_lock);
3024 	t4_register_shared_cpl_handler(CPL_SET_TCB_RPL, NULL, CPL_COOKIE_DDP0);
3025 	t4_register_shared_cpl_handler(CPL_SET_TCB_RPL, NULL, CPL_COOKIE_DDP1);
3026 	t4_register_cpl_handler(CPL_RX_DATA_DDP, NULL);
3027 	t4_register_cpl_handler(CPL_RX_DDP_COMPLETE, NULL);
3028 }
3029 #endif
3030