xref: /freebsd/sys/dev/cxgbe/tom/t4_ddp.c (revision 56961fd7949de755f95a60fe8ac936f81e953f5b)
1 /*-
2  * Copyright (c) 2012 Chelsio Communications, Inc.
3  * All rights reserved.
4  * Written by: Navdeep Parhar <np@FreeBSD.org>
5  *
6  * Redistribution and use in source and binary forms, with or without
7  * modification, are permitted provided that the following conditions
8  * are met:
9  * 1. Redistributions of source code must retain the above copyright
10  *    notice, this list of conditions and the following disclaimer.
11  * 2. Redistributions in binary form must reproduce the above copyright
12  *    notice, this list of conditions and the following disclaimer in the
13  *    documentation and/or other materials provided with the distribution.
14  *
15  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
16  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
18  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
19  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
20  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
21  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
22  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
23  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
24  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
25  * SUCH DAMAGE.
26  */
27 
28 #include <sys/cdefs.h>
29 __FBSDID("$FreeBSD$");
30 
31 #include "opt_inet.h"
32 
33 #include <sys/param.h>
34 #include <sys/types.h>
35 #include <sys/systm.h>
36 #include <sys/kernel.h>
37 #include <sys/ktr.h>
38 #include <sys/module.h>
39 #include <sys/protosw.h>
40 #include <sys/proc.h>
41 #include <sys/domain.h>
42 #include <sys/socket.h>
43 #include <sys/socketvar.h>
44 #include <sys/uio.h>
45 #include <netinet/in.h>
46 #include <netinet/in_pcb.h>
47 #include <netinet/ip.h>
48 #include <netinet/tcp_var.h>
49 #define TCPSTATES
50 #include <netinet/tcp_fsm.h>
51 #include <netinet/toecore.h>
52 
53 #include <vm/vm.h>
54 #include <vm/vm_extern.h>
55 #include <vm/vm_param.h>
56 #include <vm/pmap.h>
57 #include <vm/vm_map.h>
58 #include <vm/vm_page.h>
59 #include <vm/vm_object.h>
60 
61 #ifdef TCP_OFFLOAD
62 #include "common/common.h"
63 #include "common/t4_msg.h"
64 #include "common/t4_regs.h"
65 #include "common/t4_tcb.h"
66 #include "tom/t4_tom.h"
67 
68 #define PPOD_SZ(n)	((n) * sizeof(struct pagepod))
69 #define PPOD_SIZE	(PPOD_SZ(1))
70 
71 /* XXX: must match A_ULP_RX_TDDP_PSZ */
72 static int t4_ddp_pgsz[] = {4096, 4096 << 2, 4096 << 4, 4096 << 6};
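/* i.e. 4KB, 16KB, 64KB and 256KB DDP page sizes (4096 << {0, 2, 4, 6}). */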
73 
74 #if 0
75 static void
76 t4_dump_tcb(struct adapter *sc, int tid)
77 {
78 	uint32_t tcb_base, off, i, j;
79 
80 	/* Dump TCB for the tid */
81 	tcb_base = t4_read_reg(sc, A_TP_CMM_TCB_BASE);
82 	t4_write_reg(sc, PCIE_MEM_ACCESS_REG(A_PCIE_MEM_ACCESS_OFFSET, 2),
83 	    tcb_base + tid * TCB_SIZE);
84 	t4_read_reg(sc, PCIE_MEM_ACCESS_REG(A_PCIE_MEM_ACCESS_OFFSET, 2));
85 	off = 0;
86 	printf("\n");
87 	for (i = 0; i < 4; i++) {
88 		uint32_t buf[8];
89 		for (j = 0; j < 8; j++, off += 4)
90 			buf[j] = htonl(t4_read_reg(sc, MEMWIN2_BASE + off));
91 
92 		printf("%08x %08x %08x %08x %08x %08x %08x %08x\n",
93 		    buf[0], buf[1], buf[2], buf[3], buf[4], buf[5], buf[6],
94 		    buf[7]);
95 	}
96 }
97 #endif
98 
99 #define MAX_DDP_BUFFER_SIZE		(M_TCB_RX_DDP_BUF0_LEN)
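/*
 * Allocate n contiguous page pods and return the index of the first one, or
 * -1 if no run of n free pods is available.  Allocated regions sit on
 * td->ppods in address order: free pods before the first region are counted
 * in td->nppods_free_head, and free pods after each region in that region's
 * pr->free.
 */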
100 static int
101 alloc_ppods(struct tom_data *td, int n, struct ppod_region *pr)
102 {
103 	int ppod;
104 
105 	KASSERT(n > 0, ("%s: nonsense allocation (%d)", __func__, n));
106 
107 	mtx_lock(&td->ppod_lock);
108 	if (n > td->nppods_free) {
109 		mtx_unlock(&td->ppod_lock);
110 		return (-1);
111 	}
112 
113 	if (td->nppods_free_head >= n) {
114 		td->nppods_free_head -= n;
115 		ppod = td->nppods_free_head;
116 		TAILQ_INSERT_HEAD(&td->ppods, pr, link);
117 	} else {
118 		struct ppod_region *p;
119 
120 		ppod = td->nppods_free_head;
121 		TAILQ_FOREACH(p, &td->ppods, link) {
122 			ppod += p->used + p->free;
123 			if (n <= p->free) {
124 				ppod -= n;
125 				p->free -= n;
126 				TAILQ_INSERT_AFTER(&td->ppods, p, pr, link);
127 				goto allocated;
128 			}
129 		}
130 
131 		if (__predict_false(ppod != td->nppods)) {
132 			panic("%s: ppods TAILQ (%p) corrupt."
133 			    "  At %d instead of %d at the end of the queue.",
134 			    __func__, &td->ppods, ppod, td->nppods);
135 		}
136 
137 		mtx_unlock(&td->ppod_lock);
138 		return (-1);
139 	}
140 
141 allocated:
142 	pr->used = n;
143 	pr->free = 0;
144 	td->nppods_free -= n;
145 	mtx_unlock(&td->ppod_lock);
146 
147 	return (ppod);
148 }
149 
150 static void
151 free_ppods(struct tom_data *td, struct ppod_region *pr)
152 {
153 	struct ppod_region *p;
154 
155 	KASSERT(pr->used > 0, ("%s: nonsense free (%d)", __func__, pr->used));
156 
157 	mtx_lock(&td->ppod_lock);
158 	p = TAILQ_PREV(pr, ppod_head, link);
159 	if (p != NULL)
160 		p->free += pr->used + pr->free;
161 	else
162 		td->nppods_free_head += pr->used + pr->free;
163 	td->nppods_free += pr->used;
164 	KASSERT(td->nppods_free <= td->nppods,
165 	    ("%s: nppods_free (%d) > nppods (%d).  %d freed this time.",
166 	    __func__, td->nppods_free, td->nppods, pr->used));
167 	TAILQ_REMOVE(&td->ppods, pr, link);
168 	mtx_unlock(&td->ppod_lock);
169 }
170 
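/*
 * Page pods needed to cover npages VM pages at a DDP page size of ddp_pgsz.
 * For example, 32 x 4KB pages at a 16KB DDP page size form 8 DDP pages and
 * need howmany(8, PPOD_PAGES) pods (2 if PPOD_PAGES is 4).
 */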
171 static inline int
172 pages_to_nppods(int npages, int ddp_pgsz)
173 {
174 	int nsegs = npages * PAGE_SIZE / ddp_pgsz;
175 
176 	return (howmany(nsegs, PPOD_PAGES));
177 }
178 
179 static void
180 free_ddp_buffer(struct tom_data *td, struct ddp_buffer *db)
181 {
182 
183 	if (db == NULL)
184 		return;
185 
186 	if (db->pages)
187 		free(db->pages, M_CXGBE);
188 
189 	if (db->nppods > 0)
190 		free_ppods(td, &db->ppod_region);
191 
192 	free(db, M_CXGBE);
193 }
194 
195 void
196 release_ddp_resources(struct toepcb *toep)
197 {
198 	int i;
199 
200 	for (i = 0; i < nitems(toep->db); i++) {
201 		if (toep->db[i] != NULL) {
202 			free_ddp_buffer(toep->td, toep->db[i]);
203 			toep->db[i] = NULL;
204 		}
205 	}
206 }
207 
208 /* XXX: handle_ddp_data code duplication */
209 void
210 insert_ddp_data(struct toepcb *toep, uint32_t n)
211 {
212 	struct inpcb *inp = toep->inp;
213 	struct tcpcb *tp = intotcpcb(inp);
214 	struct sockbuf *sb = &inp->inp_socket->so_rcv;
215 	struct mbuf *m;
216 
217 	INP_WLOCK_ASSERT(inp);
218 	SOCKBUF_LOCK_ASSERT(sb);
219 
220 	m = m_get(M_NOWAIT, MT_DATA);
221 	if (m == NULL)
222 		CXGBE_UNIMPLEMENTED("mbuf alloc failure");
223 	m->m_len = n;
224 	m->m_flags |= M_DDP;	/* Data is already where it should be */
225 	m->m_data = "nothing to see here";
226 
227 	tp->rcv_nxt += n;
228 #ifndef USE_DDP_RX_FLOW_CONTROL
229 	KASSERT(tp->rcv_wnd >= n, ("%s: negative window size", __func__));
230 	tp->rcv_wnd -= n;
231 #endif
232 
233 	KASSERT(toep->sb_cc >= sb->sb_cc,
234 	    ("%s: sb %p has more data (%d) than last time (%d).",
235 	    __func__, sb, sb->sb_cc, toep->sb_cc));
236 	toep->rx_credits += toep->sb_cc - sb->sb_cc;
237 #ifdef USE_DDP_RX_FLOW_CONTROL
238 	toep->rx_credits -= n;	/* adjust for F_RX_FC_DDP */
239 #endif
240 	sbappendstream_locked(sb, m);
241 	toep->sb_cc = sb->sb_cc;
242 }
243 
244 /* SET_TCB_FIELD sent as a ULP command looks like this */
245 #define LEN__SET_TCB_FIELD_ULP (sizeof(struct ulp_txpkt) + \
246     sizeof(struct ulptx_idata) + sizeof(struct cpl_set_tcb_field_core))
247 
248 /* RX_DATA_ACK sent as a ULP command looks like this */
249 #define LEN__RX_DATA_ACK_ULP (sizeof(struct ulp_txpkt) + \
250     sizeof(struct ulptx_idata) + sizeof(struct cpl_rx_data_ack_core))
251 
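/*
 * Write a SET_TCB_FIELD (wrapped in a ULP_TX_PKT with immediate data) at ulpmc
 * and return the address just past it, adding a ULP_TX_SC_NOOP pad when needed
 * so that the next command also starts on a 16B boundary.
 */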
252 static inline void *
253 mk_set_tcb_field_ulp(struct ulp_txpkt *ulpmc, struct toepcb *toep,
254     uint64_t word, uint64_t mask, uint64_t val)
255 {
256 	struct ulptx_idata *ulpsc;
257 	struct cpl_set_tcb_field_core *req;
258 
259 	ulpmc->cmd_dest = htonl(V_ULPTX_CMD(ULP_TX_PKT) | V_ULP_TXPKT_DEST(0));
260 	ulpmc->len = htobe32(howmany(LEN__SET_TCB_FIELD_ULP, 16));
261 
262 	ulpsc = (struct ulptx_idata *)(ulpmc + 1);
263 	ulpsc->cmd_more = htobe32(V_ULPTX_CMD(ULP_TX_SC_IMM));
264 	ulpsc->len = htobe32(sizeof(*req));
265 
266 	req = (struct cpl_set_tcb_field_core *)(ulpsc + 1);
267 	OPCODE_TID(req) = htobe32(MK_OPCODE_TID(CPL_SET_TCB_FIELD, toep->tid));
268 	req->reply_ctrl = htobe16(V_NO_REPLY(1) |
269 	    V_QUEUENO(toep->ofld_rxq->iq.abs_id));
270 	req->word_cookie = htobe16(V_WORD(word) | V_COOKIE(0));
271 	req->mask = htobe64(mask);
272 	req->val = htobe64(val);
273 
274 	ulpsc = (struct ulptx_idata *)(req + 1);
275 	if (LEN__SET_TCB_FIELD_ULP % 16) {
276 		ulpsc->cmd_more = htobe32(V_ULPTX_CMD(ULP_TX_SC_NOOP));
277 		ulpsc->len = htobe32(0);
278 		return (ulpsc + 1);
279 	}
280 	return (ulpsc);
281 }
282 
283 static inline void *
284 mk_rx_data_ack_ulp(struct ulp_txpkt *ulpmc, struct toepcb *toep)
285 {
286 	struct ulptx_idata *ulpsc;
287 	struct cpl_rx_data_ack_core *req;
288 
289 	ulpmc->cmd_dest = htonl(V_ULPTX_CMD(ULP_TX_PKT) | V_ULP_TXPKT_DEST(0));
290 	ulpmc->len = htobe32(howmany(LEN__RX_DATA_ACK_ULP, 16));
291 
292 	ulpsc = (struct ulptx_idata *)(ulpmc + 1);
293 	ulpsc->cmd_more = htobe32(V_ULPTX_CMD(ULP_TX_SC_IMM));
294 	ulpsc->len = htobe32(sizeof(*req));
295 
296 	req = (struct cpl_rx_data_ack_core *)(ulpsc + 1);
297 	OPCODE_TID(req) = htobe32(MK_OPCODE_TID(CPL_RX_DATA_ACK, toep->tid));
298 	req->credit_dack = htobe32(F_RX_MODULATE_RX);
299 
300 	ulpsc = (struct ulptx_idata *)(req + 1);
301 	if (LEN__RX_DATA_ACK_ULP % 16) {
302 		ulpsc->cmd_more = htobe32(V_ULPTX_CMD(ULP_TX_SC_NOOP));
303 		ulpsc->len = htobe32(0);
304 		return (ulpsc + 1);
305 	}
306 	return (ulpsc);
307 }
308 
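/*
 * DDP flags for the TCB update, roughly: mark the chosen buffer valid and
 * active; with MSG_WAITALL keep PUSH from completing the buffer early, and for
 * non-blocking reads set the FLUSH bit so whatever has arrived is returned
 * promptly.
 */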
309 static inline uint64_t
310 select_ddp_flags(struct socket *so, int flags, int db_idx)
311 {
312 	uint64_t ddp_flags = V_TF_DDP_INDICATE_OUT(0);
313 	int waitall = flags & MSG_WAITALL;
314 	int nb = so->so_state & SS_NBIO || flags & (MSG_DONTWAIT | MSG_NBIO);
315 
316 	KASSERT(db_idx == 0 || db_idx == 1,
317 	    ("%s: bad DDP buffer index %d", __func__, db_idx));
318 
319 	if (db_idx == 0) {
320 		ddp_flags |= V_TF_DDP_BUF0_VALID(1) | V_TF_DDP_ACTIVE_BUF(0);
321 		if (waitall)
322 			ddp_flags |= V_TF_DDP_PUSH_DISABLE_0(1);
323 		else if (nb)
324 			ddp_flags |= V_TF_DDP_BUF0_FLUSH(1);
325 		else
326 			ddp_flags |= V_TF_DDP_BUF0_FLUSH(0);
327 	} else {
328 		ddp_flags |= V_TF_DDP_BUF1_VALID(1) | V_TF_DDP_ACTIVE_BUF(1);
329 		if (waitall)
330 			ddp_flags |= V_TF_DDP_PUSH_DISABLE_1(1);
331 		else if (nb)
332 			ddp_flags |= V_TF_DDP_BUF1_FLUSH(1);
333 		else
334 			ddp_flags |= V_TF_DDP_BUF1_FLUSH(0);
335 	}
336 
337 	return (ddp_flags);
338 }
339 
340 static struct wrqe *
341 mk_update_tcb_for_ddp(struct adapter *sc, struct toepcb *toep, int db_idx,
342     int offset, uint64_t ddp_flags)
343 {
344 	struct ddp_buffer *db = toep->db[db_idx];
345 	struct wrqe *wr;
346 	struct work_request_hdr *wrh;
347 	struct ulp_txpkt *ulpmc;
348 	int len;
349 
350 	KASSERT(db_idx == 0 || db_idx == 1,
351 	    ("%s: bad DDP buffer index %d", __func__, db_idx));
352 
353 	/*
354 	 * We'll send a compound work request that has 3 SET_TCB_FIELDs and an
355 	 * RX_DATA_ACK (with RX_MODULATE to speed up delivery).
356 	 *
357 	 * The work request header is 16B and always ends at a 16B boundary.
358 	 * The ULPTX master commands that follow must all end at 16B boundaries
359 	 * too so we round up the size to 16.
360 	 */
361 	len = sizeof(*wrh) + 3 * roundup(LEN__SET_TCB_FIELD_ULP, 16) +
362 	    roundup(LEN__RX_DATA_ACK_ULP, 16);
363 
364 	wr = alloc_wrqe(len, toep->ctrlq);
365 	if (wr == NULL)
366 		return (NULL);
367 	wrh = wrtod(wr);
368 	INIT_ULPTX_WRH(wrh, len, 1, 0);	/* atomic */
369 	ulpmc = (struct ulp_txpkt *)(wrh + 1);
370 
371 	/* Write the buffer's tag */
372 	ulpmc = mk_set_tcb_field_ulp(ulpmc, toep,
373 	    W_TCB_RX_DDP_BUF0_TAG + db_idx,
374 	    V_TCB_RX_DDP_BUF0_TAG(M_TCB_RX_DDP_BUF0_TAG),
375 	    V_TCB_RX_DDP_BUF0_TAG(db->tag));
376 
377 	/* Update the current offset in the DDP buffer and its total length */
378 	if (db_idx == 0)
379 		ulpmc = mk_set_tcb_field_ulp(ulpmc, toep,
380 		    W_TCB_RX_DDP_BUF0_OFFSET,
381 		    V_TCB_RX_DDP_BUF0_OFFSET(M_TCB_RX_DDP_BUF0_OFFSET) |
382 		    V_TCB_RX_DDP_BUF0_LEN(M_TCB_RX_DDP_BUF0_LEN),
383 		    V_TCB_RX_DDP_BUF0_OFFSET(offset) |
384 		    V_TCB_RX_DDP_BUF0_LEN(db->len));
385 	else
386 		ulpmc = mk_set_tcb_field_ulp(ulpmc, toep,
387 		    W_TCB_RX_DDP_BUF1_OFFSET,
388 		    V_TCB_RX_DDP_BUF1_OFFSET(M_TCB_RX_DDP_BUF1_OFFSET) |
389 		    V_TCB_RX_DDP_BUF1_LEN((u64)M_TCB_RX_DDP_BUF1_LEN << 32),
390 		    V_TCB_RX_DDP_BUF1_OFFSET(offset) |
391 		    V_TCB_RX_DDP_BUF1_LEN((u64)db->len << 32));
392 
393 	/* Update DDP flags */
394 	ulpmc = mk_set_tcb_field_ulp(ulpmc, toep, W_TCB_RX_DDP_FLAGS,
395 	    V_TF_DDP_BUF0_FLUSH(1) | V_TF_DDP_BUF1_FLUSH(1) |
396 	    V_TF_DDP_PUSH_DISABLE_0(1) | V_TF_DDP_PUSH_DISABLE_1(1) |
397 	    V_TF_DDP_BUF0_VALID(1) | V_TF_DDP_BUF1_VALID(1) |
398 	    V_TF_DDP_ACTIVE_BUF(1) | V_TF_DDP_INDICATE_OUT(1), ddp_flags);
399 
400 	/* Gratuitous RX_DATA_ACK with RX_MODULATE set to speed up delivery. */
401 	ulpmc = mk_rx_data_ack_ulp(ulpmc, toep);
402 
403 	return (wr);
404 }
405 
406 static void
407 discourage_ddp(struct toepcb *toep)
408 {
409 
410 	if (toep->ddp_score && --toep->ddp_score == 0) {
411 		toep->ddp_flags &= ~DDP_OK;
412 		toep->ddp_disabled = time_uptime;
413 		CTR3(KTR_CXGBE, "%s: tid %u !DDP_OK @ %u",
414 		    __func__, toep->tid, time_uptime);
415 	}
416 }
417 
418 static int
419 handle_ddp_data(struct toepcb *toep, __be32 ddp_report, __be32 rcv_nxt, int len)
420 {
421 	uint32_t report = be32toh(ddp_report);
422 	unsigned int db_flag;
423 	struct inpcb *inp = toep->inp;
424 	struct tcpcb *tp;
425 	struct socket *so;
426 	struct sockbuf *sb;
427 	struct mbuf *m;
428 
429 	db_flag = report & F_DDP_BUF_IDX ? DDP_BUF1_ACTIVE : DDP_BUF0_ACTIVE;
430 
431 	if (__predict_false(!(report & F_DDP_INV)))
432 		CXGBE_UNIMPLEMENTED("DDP buffer still valid");
433 
434 	INP_WLOCK(inp);
435 	so = inp_inpcbtosocket(inp);
436 	sb = &so->so_rcv;
437 	if (__predict_false(inp->inp_flags & (INP_DROPPED | INP_TIMEWAIT))) {
438 
439 		/*
440 		 * XXX: think a bit more.
441 		 * tcpcb probably gone, but socket should still be around
442 		 * because we always wait for DDP completion in soreceive no
443 		 * matter what.  Just wake it up and let it clean up.
444 		 */
445 
446 		CTR5(KTR_CXGBE, "%s: tid %u, seq 0x%x, len %d, inp_flags 0x%x",
447 		    __func__, toep->tid, be32toh(rcv_nxt), len, inp->inp_flags);
448 		SOCKBUF_LOCK(sb);
449 		goto wakeup;
450 	}
451 
452 	tp = intotcpcb(inp);
453 	len += be32toh(rcv_nxt) - tp->rcv_nxt;
454 	tp->rcv_nxt += len;
455 	tp->t_rcvtime = ticks;
456 #ifndef USE_DDP_RX_FLOW_CONTROL
457 	KASSERT(tp->rcv_wnd >= len, ("%s: negative window size", __func__));
458 	tp->rcv_wnd -= len;
459 #endif
460 
461 	m = m_get(M_NOWAIT, MT_DATA);
462 	if (m == NULL)
463 		CXGBE_UNIMPLEMENTED("mbuf alloc failure");
464 	m->m_len = len;
465 	m->m_flags |= M_DDP;	/* Data is already where it should be */
466 	m->m_data = "nothing to see here";
467 
468 	SOCKBUF_LOCK(sb);
469 	if (report & F_DDP_BUF_COMPLETE)
470 		toep->ddp_score = DDP_HIGH_SCORE;
471 	else
472 		discourage_ddp(toep);
473 
474 	KASSERT(toep->sb_cc >= sb->sb_cc,
475 	    ("%s: sb %p has more data (%d) than last time (%d).",
476 	    __func__, sb, sb->sb_cc, toep->sb_cc));
477 	toep->rx_credits += toep->sb_cc - sb->sb_cc;
478 #ifdef USE_DDP_RX_FLOW_CONTROL
479 	toep->rx_credits -= len;	/* adjust for F_RX_FC_DDP */
480 #endif
481 	sbappendstream_locked(sb, m);
482 	toep->sb_cc = sb->sb_cc;
483 wakeup:
484 	KASSERT(toep->ddp_flags & db_flag,
485 	    ("%s: DDP buffer not active. toep %p, ddp_flags 0x%x, report 0x%x",
486 	    __func__, toep, toep->ddp_flags, report));
487 	toep->ddp_flags &= ~db_flag;
488 	sorwakeup_locked(so);
489 	SOCKBUF_UNLOCK_ASSERT(sb);
490 
491 	INP_WUNLOCK(inp);
492 	return (0);
493 }
494 
495 #define DDP_ERR (F_DDP_PPOD_MISMATCH | F_DDP_LLIMIT_ERR | F_DDP_ULIMIT_ERR |\
496 	 F_DDP_PPOD_PARITY_ERR | F_DDP_PADDING_ERR | F_DDP_OFFSET_ERR |\
497 	 F_DDP_INVALID_TAG | F_DDP_COLOR_ERR | F_DDP_TID_MISMATCH |\
498 	 F_DDP_INVALID_PPOD | F_DDP_HDRCRC_ERR | F_DDP_DATACRC_ERR)
499 
500 static int
501 do_rx_data_ddp(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
502 {
503 	struct adapter *sc = iq->adapter;
504 	const struct cpl_rx_data_ddp *cpl = (const void *)(rss + 1);
505 	unsigned int tid = GET_TID(cpl);
506 	uint32_t vld;
507 	struct toepcb *toep = lookup_tid(sc, tid);
508 
509 	KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__));
510 	KASSERT(toep->tid == tid, ("%s: toep tid/atid mismatch", __func__));
511 	KASSERT(!(toep->flags & TPF_SYNQE),
512 	    ("%s: toep %p claims to be a synq entry", __func__, toep));
513 
514 	vld = be32toh(cpl->ddpvld);
515 	if (__predict_false(vld & DDP_ERR)) {
516 		panic("%s: DDP error 0x%x (tid %d, toep %p)",
517 		    __func__, vld, tid, toep);
518 	}
519 
520 	handle_ddp_data(toep, cpl->u.ddp_report, cpl->seq, be16toh(cpl->len));
521 
522 	return (0);
523 }
524 
525 static int
526 do_rx_ddp_complete(struct sge_iq *iq, const struct rss_header *rss,
527     struct mbuf *m)
528 {
529 	struct adapter *sc = iq->adapter;
530 	const struct cpl_rx_ddp_complete *cpl = (const void *)(rss + 1);
531 	unsigned int tid = GET_TID(cpl);
532 	struct toepcb *toep = lookup_tid(sc, tid);
533 
534 	KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__));
535 	KASSERT(toep->tid == tid, ("%s: toep tid/atid mismatch", __func__));
536 	KASSERT(!(toep->flags & TPF_SYNQE),
537 	    ("%s: toep %p claims to be a synq entry", __func__, toep));
538 
539 	handle_ddp_data(toep, cpl->ddp_report, cpl->rcv_nxt, 0);
540 
541 	return (0);
542 }
543 
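/*
 * Ask the hardware to switch this connection to DDP: clear TF_DDP_OFF, enable
 * indicate on both buffers, and turn off rx payload coalescing.  DDP_SC_REQ
 * marks the state change as requested; it is presumably cleared (and DDP_ON
 * set) when the TCB update is acknowledged elsewhere in t4_tom.
 */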
544 void
545 enable_ddp(struct adapter *sc, struct toepcb *toep)
546 {
547 
548 	KASSERT((toep->ddp_flags & (DDP_ON | DDP_OK | DDP_SC_REQ)) == DDP_OK,
549 	    ("%s: toep %p has bad ddp_flags 0x%x",
550 	    __func__, toep, toep->ddp_flags));
551 
552 	CTR3(KTR_CXGBE, "%s: tid %u (time %u)",
553 	    __func__, toep->tid, time_uptime);
554 
555 	toep->ddp_flags |= DDP_SC_REQ;
556 	t4_set_tcb_field(sc, toep, W_TCB_RX_DDP_FLAGS,
557 	    V_TF_DDP_OFF(1) | V_TF_DDP_INDICATE_OUT(1) |
558 	    V_TF_DDP_BUF0_INDICATE(1) | V_TF_DDP_BUF1_INDICATE(1) |
559 	    V_TF_DDP_BUF0_VALID(1) | V_TF_DDP_BUF1_VALID(1),
560 	    V_TF_DDP_BUF0_INDICATE(1) | V_TF_DDP_BUF1_INDICATE(1));
561 	t4_set_tcb_field(sc, toep, W_TCB_T_FLAGS,
562 	    V_TF_RCV_COALESCE_ENABLE(1), 0);
563 }
564 
565 static inline void
566 disable_ddp(struct adapter *sc, struct toepcb *toep)
567 {
568 
569 	KASSERT((toep->ddp_flags & (DDP_ON | DDP_SC_REQ)) == DDP_ON,
570 	    ("%s: toep %p has bad ddp_flags 0x%x",
571 	    __func__, toep, toep->ddp_flags));
572 
573 	CTR3(KTR_CXGBE, "%s: tid %u (time %u)",
574 	    __func__, toep->tid, time_uptime);
575 
576 	toep->ddp_flags |= DDP_SC_REQ;
577 	t4_set_tcb_field(sc, toep, W_TCB_T_FLAGS,
578 	    V_TF_RCV_COALESCE_ENABLE(1), V_TF_RCV_COALESCE_ENABLE(1));
579 	t4_set_tcb_field(sc, toep, W_TCB_RX_DDP_FLAGS, V_TF_DDP_OFF(1),
580 	    V_TF_DDP_OFF(1));
581 }
582 
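/*
 * Fault in and hold the pages backing the uio's single iovec.  On success
 * *ppages is a malloc'ed array of *pnpages held pages that the caller owns
 * and must eventually unhold and free.
 */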
583 static int
584 hold_uio(struct uio *uio, vm_page_t **ppages, int *pnpages)
585 {
586 	struct vm_map *map;
587 	struct iovec *iov;
588 	vm_offset_t start, end;
589 	vm_page_t *pp;
590 	int n;
591 
592 	KASSERT(uio->uio_iovcnt == 1,
593 	    ("%s: uio_iovcnt %d", __func__, uio->uio_iovcnt));
594 	KASSERT(uio->uio_td->td_proc == curproc,
595 	    ("%s: uio proc (%p) is not curproc (%p)",
596 	    __func__, uio->uio_td->td_proc, curproc));
597 
598 	map = &curproc->p_vmspace->vm_map;
599 	iov = &uio->uio_iov[0];
600 	start = trunc_page((uintptr_t)iov->iov_base);
601 	end = round_page((vm_offset_t)iov->iov_base + iov->iov_len);
602 	n = howmany(end - start, PAGE_SIZE);
603 
604 	if (end - start > MAX_DDP_BUFFER_SIZE)
605 		return (E2BIG);
606 
607 	pp = malloc(n * sizeof(vm_page_t), M_CXGBE, M_NOWAIT);
608 	if (pp == NULL)
609 		return (ENOMEM);
610 
611 	if (vm_fault_quick_hold_pages(map, (vm_offset_t)iov->iov_base,
612 	    iov->iov_len, VM_PROT_WRITE, pp, n) < 0) {
613 		free(pp, M_CXGBE);
614 		return (EFAULT);
615 	}
616 
617 	*ppages = pp;
618 	*pnpages = n;
619 
620 	return (0);
621 }
622 
623 static int
624 bufcmp(struct ddp_buffer *db, vm_page_t *pages, int npages, int offset, int len)
625 {
626 	int i;
627 
628 	if (db == NULL || db->npages != npages || db->offset != offset ||
629 	    db->len != len)
630 		return (1);
631 
632 	for (i = 0; i < npages; i++) {
633 		if (pages[i]->phys_addr != db->pages[i]->phys_addr)
634 			return (1);
635 	}
636 
637 	return (0);
638 }
639 
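/*
 * Euclid's algorithm; e.g. calculate_hcf(16384, 24576) is 8192.  Note that
 * calculate_hcf(0, n) is n, which lets the caller seed a running HCF with 0.
 */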
640 static int
641 calculate_hcf(int n1, int n2)
642 {
643 	int a, b, t;
644 
645 	if (n1 <= n2) {
646 		a = n1;
647 		b = n2;
648 	} else {
649 		a = n2;
650 		b = n1;
651 	}
652 
653 	while (a != 0) {
654 		t = a;
655 		a = b % a;
656 		b = t;
657 	}
658 
659 	return (b);
660 }
661 
662 static struct ddp_buffer *
663 alloc_ddp_buffer(struct tom_data *td, vm_page_t *pages, int npages, int offset,
664     int len)
665 {
666 	int i, hcf, seglen, idx, ppod, nppods;
667 	struct ddp_buffer *db;
668 
669 	/*
670 	 * The DDP page size is unrelated to the VM page size.  We combine
671 	 * contiguous physical pages into larger segments to get the best DDP
672 	 * page size possible.  This is the largest of the four sizes in
673 	 * A_ULP_RX_TDDP_PSZ that evenly divides the HCF of the segment sizes in
674 	 * the page list.
675 	 */
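	/*
	 * For example, physically contiguous runs of 64KB, 32KB and 48KB have
	 * an HCF of 16KB and select t4_ddp_pgsz[1]; a single 4KB run anywhere
	 * drops the HCF below 16KB and we short circuit to 4KB pods.
	 */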
676 	hcf = 0;
677 	for (i = 0; i < npages; i++) {
678 		seglen = PAGE_SIZE;
679 		while (i < npages - 1 &&
680 		    pages[i]->phys_addr + PAGE_SIZE == pages[i + 1]->phys_addr) {
681 			seglen += PAGE_SIZE;
682 			i++;
683 		}
684 
685 		hcf = calculate_hcf(hcf, seglen);
686 		if (hcf < t4_ddp_pgsz[1]) {
687 			idx = 0;
688 			goto have_pgsz;	/* give up, short circuit */
689 		}
690 	}
691 
692 	if (hcf % t4_ddp_pgsz[0] != 0) {
693 		/* hmmm.  This could only happen when PAGE_SIZE < 4K */
694 		KASSERT(PAGE_SIZE < 4096,
695 		    ("%s: PAGE_SIZE %d, hcf %d", __func__, PAGE_SIZE, hcf));
696 		CTR3(KTR_CXGBE, "%s: PAGE_SIZE %d, hcf %d",
697 		    __func__, PAGE_SIZE, hcf);
698 		return (NULL);
699 	}
700 
701 	for (idx = nitems(t4_ddp_pgsz) - 1; idx > 0; idx--) {
702 		if (hcf % t4_ddp_pgsz[idx] == 0)
703 			break;
704 	}
705 have_pgsz:
706 
707 	db = malloc(sizeof(*db), M_CXGBE, M_NOWAIT);
708 	if (db == NULL) {
709 		CTR1(KTR_CXGBE, "%s: malloc failed.", __func__);
710 		return (NULL);
711 	}
712 
713 	nppods = pages_to_nppods(npages, t4_ddp_pgsz[idx]);
714 	ppod = alloc_ppods(td, nppods, &db->ppod_region);
715 	if (ppod < 0) {
716 		free(db, M_CXGBE);
717 		CTR4(KTR_CXGBE, "%s: no pods, nppods %d, resid %d, pgsz %d",
718 		    __func__, nppods, len, t4_ddp_pgsz[idx]);
719 		return (NULL);
720 	}
721 
722 	KASSERT(idx <= M_PPOD_PGSZ && ppod <= M_PPOD_TAG,
723 	    ("%s: DDP pgsz_idx = %d, ppod = %d", __func__, idx, ppod));
724 
725 	db->tag = V_PPOD_PGSZ(idx) | V_PPOD_TAG(ppod);
726 	db->nppods = nppods;
727 	db->npages = npages;
728 	db->pages = pages;
729 	db->offset = offset;
730 	db->len = len;
731 
732 	CTR6(KTR_CXGBE, "New DDP buffer.  "
733 	    "ddp_pgsz %d, ppod 0x%x, npages %d, nppods %d, offset %d, len %d",
734 	    t4_ddp_pgsz[idx], ppod, db->npages, db->nppods, db->offset,
735 	    db->len);
736 
737 	return (db);
738 }
739 
740 #define NUM_ULP_TX_SC_IMM_PPODS (256 / PPOD_SIZE)
741 
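/*
 * Write the buffer's page pods to adapter memory with ULP_TX_MEM_WRITE work
 * requests, at most NUM_ULP_TX_SC_IMM_PPODS pods of immediate data per request
 * (4 per request if the pagepod is 64B).
 */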
742 static int
743 write_page_pods(struct adapter *sc, struct toepcb *toep, struct ddp_buffer *db)
744 {
745 	struct wrqe *wr;
746 	struct ulp_mem_io *ulpmc;
747 	struct ulptx_idata *ulpsc;
748 	struct pagepod *ppod;
749 	int i, j, k, n, chunk, len, ddp_pgsz, idx, ppod_addr;
750 
751 	ddp_pgsz = t4_ddp_pgsz[G_PPOD_PGSZ(db->tag)];
752 	ppod_addr = sc->vres.ddp.start + G_PPOD_TAG(db->tag) * PPOD_SIZE;
753 	for (i = 0; i < db->nppods; ppod_addr += chunk) {
754 
755 		/* How many page pods are we writing in this cycle */
756 		n = min(db->nppods - i, NUM_ULP_TX_SC_IMM_PPODS);
757 		chunk = PPOD_SZ(n);
758 		len = roundup(sizeof(*ulpmc) + sizeof(*ulpsc) + chunk, 16);
759 
760 		wr = alloc_wrqe(len, toep->ctrlq);
761 		if (wr == NULL)
762 			return (ENOMEM);	/* ok to just bail out */
763 		ulpmc = wrtod(wr);
764 
765 		INIT_ULPTX_WR(ulpmc, len, 0, 0);
766 		ulpmc->cmd = htobe32(V_ULPTX_CMD(ULP_TX_MEM_WRITE) |
767 		    F_ULP_MEMIO_ORDER);
768 		ulpmc->dlen = htobe32(V_ULP_MEMIO_DATA_LEN(chunk / 32));
769 		ulpmc->len16 = htobe32(howmany(len - sizeof(ulpmc->wr), 16));
770 		ulpmc->lock_addr = htobe32(V_ULP_MEMIO_ADDR(ppod_addr >> 5));
771 
772 		ulpsc = (struct ulptx_idata *)(ulpmc + 1);
773 		ulpsc->cmd_more = htobe32(V_ULPTX_CMD(ULP_TX_SC_IMM));
774 		ulpsc->len = htobe32(chunk);
775 
776 		ppod = (struct pagepod *)(ulpsc + 1);
777 		for (j = 0; j < n; i++, j++, ppod++) {
778 			ppod->vld_tid_pgsz_tag_color = htobe64(F_PPOD_VALID |
779 			    V_PPOD_TID(toep->tid) | db->tag);
780 			ppod->len_offset = htobe64(V_PPOD_LEN(db->len) |
781 			    V_PPOD_OFST(db->offset));
782 			ppod->rsvd = 0;
783 			idx = i * PPOD_PAGES * (ddp_pgsz / PAGE_SIZE);
784 			for (k = 0; k < nitems(ppod->addr); k++) {
785 				if (idx < db->npages) {
786 					ppod->addr[k] =
787 					    htobe64(db->pages[idx]->phys_addr);
788 					idx += ddp_pgsz / PAGE_SIZE;
789 				} else
790 					ppod->addr[k] = 0;
791 #if 0
792 				CTR5(KTR_CXGBE,
793 				    "%s: tid %d ppod[%d]->addr[%d] = %p",
794 				    __func__, toep->tid, i, k,
795 				    htobe64(ppod->addr[k]));
796 #endif
797 			}
798 
799 		}
800 
801 		t4_wrq_tx(sc, wr);
802 	}
803 
804 	return (0);
805 }
806 
807 /*
808  * Reuse, or allocate (and program the page pods for) a new DDP buffer.  The
809  * "pages" array is handed over to this function and should not be used in any
810  * way by the caller after that.
811  */
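/*
 * On reuse the new pages array is simply freed (the pages stay held and the
 * matching buffer's own array is used); if both slots are occupied and neither
 * matches, a randomly chosen slot is evicted to make room.
 */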
812 static int
813 select_ddp_buffer(struct adapter *sc, struct toepcb *toep, vm_page_t *pages,
814     int npages, int db_off, int db_len)
815 {
816 	struct ddp_buffer *db;
817 	struct tom_data *td = sc->tom_softc;
818 	int i, empty_slot = -1;
819 
820 	/* Try to reuse */
821 	for (i = 0; i < nitems(toep->db); i++) {
822 		if (bufcmp(toep->db[i], pages, npages, db_off, db_len) == 0) {
823 			free(pages, M_CXGBE);
824 			return (i);	/* pages still held */
825 		} else if (toep->db[i] == NULL && empty_slot < 0)
826 			empty_slot = i;
827 	}
828 
829 	/* Allocate new buffer, write its page pods. */
830 	db = alloc_ddp_buffer(td, pages, npages, db_off, db_len);
831 	if (db == NULL) {
832 		vm_page_unhold_pages(pages, npages);
833 		free(pages, M_CXGBE);
834 		return (-1);
835 	}
836 	if (write_page_pods(sc, toep, db) != 0) {
837 		vm_page_unhold_pages(pages, npages);
838 		free_ddp_buffer(td, db);
839 		return (-1);
840 	}
841 
842 	i = empty_slot;
843 	if (i < 0) {
844 		i = arc4random() % nitems(toep->db);
845 		free_ddp_buffer(td, toep->db[i]);
846 	}
847 	toep->db[i] = db;
848 
849 	CTR5(KTR_CXGBE, "%s: tid %d, DDP buffer[%d] = %p (tag 0x%x)",
850 	    __func__, toep->tid, i, db, db->tag);
851 
852 	return (i);
853 }
854 
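/*
 * Convert the transient holds from vm_fault_quick_hold_pages() into wirings
 * that last for the duration of the DDP transfer; unwire_ddp_buffer() undoes
 * this once the hardware is done with the buffer.
 */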
855 static void
856 wire_ddp_buffer(struct ddp_buffer *db)
857 {
858 	int i;
859 	vm_page_t p;
860 
861 	for (i = 0; i < db->npages; i++) {
862 		p = db->pages[i];
863 		vm_page_lock(p);
864 		vm_page_wire(p);
865 		vm_page_unhold(p);
866 		vm_page_unlock(p);
867 	}
868 }
869 
870 static void
871 unwire_ddp_buffer(struct ddp_buffer *db)
872 {
873 	int i;
874 	vm_page_t p;
875 
876 	for (i = 0; i < db->npages; i++) {
877 		p = db->pages[i];
878 		vm_page_lock(p);
879 		vm_page_unwire(p, 0);
880 		vm_page_unlock(p);
881 	}
882 }
883 
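/*
 * Try to receive directly into the uio's buffer: hold the user pages, pick or
 * program one of the two DDP buffers, post the compound TCB update, and sleep
 * until the hardware reports completion (handle_ddp_data clears the buffer's
 * active flag).  Falls back to the normal copy path, and discourages further
 * DDP, when the uio is not a good candidate or any step fails.
 */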
884 static int
885 handle_ddp(struct socket *so, struct uio *uio, int flags, int error)
886 {
887 	struct sockbuf *sb = &so->so_rcv;
888 	struct tcpcb *tp = so_sototcpcb(so);
889 	struct toepcb *toep = tp->t_toe;
890 	struct adapter *sc = td_adapter(toep->td);
891 	vm_page_t *pages;
892 	int npages, db_idx, rc, buf_flag;
893 	struct ddp_buffer *db;
894 	struct wrqe *wr;
895 	uint64_t ddp_flags;
896 
897 	SOCKBUF_LOCK_ASSERT(sb);
898 
899 #if 0
900 	if (sb->sb_cc + sc->tt.ddp_thres > uio->uio_resid) {
901 		CTR4(KTR_CXGBE, "%s: sb_cc %d, threshold %d, resid %d",
902 		    __func__, sb->sb_cc, sc->tt.ddp_thres, uio->uio_resid);
903 	}
904 #endif
905 
906 	/* XXX: too eager to disable DDP, could handle NBIO better than this. */
907 	if (sb->sb_cc >= uio->uio_resid || uio->uio_resid < sc->tt.ddp_thres ||
908 	    uio->uio_resid > MAX_DDP_BUFFER_SIZE || uio->uio_iovcnt > 1 ||
909 	    so->so_state & SS_NBIO || flags & (MSG_DONTWAIT | MSG_NBIO) ||
910 	    error || so->so_error || sb->sb_state & SBS_CANTRCVMORE)
911 		goto no_ddp;
912 
913 	/*
914 	 * Fault in and then hold the pages of the uio buffers.  We'll wire them
915 	 * a bit later if everything else works out.
916 	 */
917 	SOCKBUF_UNLOCK(sb);
918 	if (hold_uio(uio, &pages, &npages) != 0) {
919 		SOCKBUF_LOCK(sb);
920 		goto no_ddp;
921 	}
922 	SOCKBUF_LOCK(sb);
923 	if (__predict_false(so->so_error || sb->sb_state & SBS_CANTRCVMORE)) {
924 		vm_page_unhold_pages(pages, npages);
925 		free(pages, M_CXGBE);
926 		goto no_ddp;
927 	}
928 
929 	/*
930 	 * Figure out which one of the two DDP buffers to use this time.
931 	 */
932 	db_idx = select_ddp_buffer(sc, toep, pages, npages,
933 	    (uintptr_t)uio->uio_iov->iov_base & PAGE_MASK, uio->uio_resid);
934 	pages = NULL;	/* handed off to select_ddp_buffer */
935 	if (db_idx < 0)
936 		goto no_ddp;
937 	db = toep->db[db_idx];
938 	buf_flag = db_idx == 0 ? DDP_BUF0_ACTIVE : DDP_BUF1_ACTIVE;
939 
940 	/*
941 	 * Build the compound work request that tells the chip where to DMA the
942 	 * payload.
943 	 */
944 	ddp_flags = select_ddp_flags(so, flags, db_idx);
945 	wr = mk_update_tcb_for_ddp(sc, toep, db_idx, sb->sb_cc, ddp_flags);
946 	if (wr == NULL) {
947 		/*
948 		 * Just unhold the pages.  The DDP buffer's software state is
949 		 * left as-is in the toep.  The page pods were written
950 		 * successfully and we may have an opportunity to reuse the
951 		 * buffer in the future.
952 		 */
953 		vm_page_unhold_pages(db->pages, db->npages);
954 		goto no_ddp;
955 	}
956 
957 	/* Wire (and then unhold) the pages, and give the chip the go-ahead. */
958 	wire_ddp_buffer(db);
959 	t4_wrq_tx(sc, wr);
960 	sb->sb_flags &= ~SB_DDP_INDICATE;
961 	toep->ddp_flags |= buf_flag;
962 
963 	/*
964 	 * Wait for the DDP operation to complete and then unwire the pages.
965 	 * The return code from the sbwait will be the final return code of this
966 	 * function.  But we do need to wait for DDP no matter what.
967 	 */
968 	rc = sbwait(sb);
969 	while (toep->ddp_flags & buf_flag) {
970 		sb->sb_flags |= SB_WAIT;
971 		msleep(&sb->sb_cc, &sb->sb_mtx, PSOCK, "sbwait", 0);
972 	}
973 	unwire_ddp_buffer(db);
974 	return (rc);
975 no_ddp:
976 	disable_ddp(sc, toep);
977 	discourage_ddp(toep);
978 	sb->sb_flags &= ~SB_DDP_INDICATE;
979 	return (0);
980 }
981 
982 void
983 t4_init_ddp(struct adapter *sc, struct tom_data *td)
984 {
985 	int nppods = sc->vres.ddp.size / PPOD_SIZE;
986 
987 	td->nppods = nppods;
988 	td->nppods_free = nppods;
989 	td->nppods_free_head = nppods;
990 	TAILQ_INIT(&td->ppods);
991 	mtx_init(&td->ppod_lock, "page pods", NULL, MTX_DEF);
992 
993 	t4_register_cpl_handler(sc, CPL_RX_DATA_DDP, do_rx_data_ddp);
994 	t4_register_cpl_handler(sc, CPL_RX_DDP_COMPLETE, do_rx_ddp_complete);
995 }
996 
997 void
998 t4_uninit_ddp(struct adapter *sc __unused, struct tom_data *td)
999 {
1000 
1001 	KASSERT(td->nppods == td->nppods_free,
1002 	    ("%s: page pods still in use, nppods = %d, free = %d",
1003 	    __func__, td->nppods, td->nppods_free));
1004 
1005 	if (mtx_initialized(&td->ppod_lock))
1006 		mtx_destroy(&td->ppod_lock);
1007 }
1008 
1009 #define	VNET_SO_ASSERT(so)						\
1010 	VNET_ASSERT(curvnet != NULL,					\
1011 	    ("%s:%d curvnet is NULL, so=%p", __func__, __LINE__, (so)));
1012 #define	SBLOCKWAIT(f)	(((f) & MSG_DONTWAIT) ? 0 : SBL_WAIT)
1013 static int
1014 soreceive_rcvoob(struct socket *so, struct uio *uio, int flags)
1015 {
1016 
1017 	CXGBE_UNIMPLEMENTED(__func__);
1018 }
1019 
1020 /*
1021  * Copy an mbuf chain into a uio limited by len if set.
1022  */
1023 static int
1024 m_mbuftouio_ddp(struct uio *uio, struct mbuf *m, int len)
1025 {
1026 	int error, length, total;
1027 	int progress = 0;
1028 
1029 	if (len > 0)
1030 		total = min(uio->uio_resid, len);
1031 	else
1032 		total = uio->uio_resid;
1033 
1034 	/* Fill the uio with data from the mbufs. */
1035 	for (; m != NULL; m = m->m_next) {
1036 		length = min(m->m_len, total - progress);
1037 
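		/*
		 * DDP data is already in the user's buffer; UIO_NOCOPY makes
		 * uiomove() advance the uio offsets and residual without
		 * copying anything.
		 */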
1038 		if (m->m_flags & M_DDP) {
1039 			enum uio_seg segflag = uio->uio_segflg;
1040 
1041 			uio->uio_segflg	= UIO_NOCOPY;
1042 			error = uiomove(mtod(m, void *), length, uio);
1043 			uio->uio_segflg	= segflag;
1044 		} else
1045 			error = uiomove(mtod(m, void *), length, uio);
1046 		if (error)
1047 			return (error);
1048 
1049 		progress += length;
1050 	}
1051 
1052 	return (0);
1053 }
1054 
1055 /*
1056  * Based on soreceive_stream() in uipc_socket.c
1057  */
1058 int
1059 t4_soreceive_ddp(struct socket *so, struct sockaddr **psa, struct uio *uio,
1060     struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
1061 {
1062 	int len = 0, error = 0, flags, oresid, ddp_handled = 0;
1063 	struct sockbuf *sb;
1064 	struct mbuf *m, *n = NULL;
1065 
1066 	/* We only do stream sockets. */
1067 	if (so->so_type != SOCK_STREAM)
1068 		return (EINVAL);
1069 	if (psa != NULL)
1070 		*psa = NULL;
1071 	if (controlp != NULL)
1072 		return (EINVAL);
1073 	if (flagsp != NULL)
1074 		flags = *flagsp &~ MSG_EOR;
1075 	else
1076 		flags = 0;
1077 	if (flags & MSG_OOB)
1078 		return (soreceive_rcvoob(so, uio, flags));
1079 	if (mp0 != NULL)
1080 		*mp0 = NULL;
1081 
1082 	sb = &so->so_rcv;
1083 
1084 	/* Prevent other readers from entering the socket. */
1085 	error = sblock(sb, SBLOCKWAIT(flags));
1086 	if (error)
1087 		goto out;
1088 	SOCKBUF_LOCK(sb);
1089 
1090 	/* Easy one, no space to copyout anything. */
1091 	if (uio->uio_resid == 0) {
1092 		error = EINVAL;
1093 		goto out;
1094 	}
1095 	oresid = uio->uio_resid;
1096 
1097 	/* We will never ever get anything unless we are or were connected. */
1098 	if (!(so->so_state & (SS_ISCONNECTED|SS_ISDISCONNECTED))) {
1099 		error = ENOTCONN;
1100 		goto out;
1101 	}
1102 
1103 restart:
1104 	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1105 
1106 	if (sb->sb_flags & SB_DDP_INDICATE && !ddp_handled) {
1107 
1108 		/* uio should be just as it was at entry */
1109 		KASSERT(oresid == uio->uio_resid,
1110 		    ("%s: oresid = %d, uio_resid = %zd, sb_cc = %d",
1111 		    __func__, oresid, uio->uio_resid, sb->sb_cc));
1112 
1113 		error = handle_ddp(so, uio, flags, 0);
1114 		ddp_handled = 1;
1115 		if (error)
1116 			goto out;
1117 	}
1118 
1119 	/* Abort if socket has reported problems. */
1120 	if (so->so_error) {
1121 		if (sb->sb_cc > 0)
1122 			goto deliver;
1123 		if (oresid > uio->uio_resid)
1124 			goto out;
1125 		error = so->so_error;
1126 		if (!(flags & MSG_PEEK))
1127 			so->so_error = 0;
1128 		goto out;
1129 	}
1130 
1131 	/* Door is closed.  Deliver what is left, if any. */
1132 	if (sb->sb_state & SBS_CANTRCVMORE) {
1133 		if (sb->sb_cc > 0)
1134 			goto deliver;
1135 		else
1136 			goto out;
1137 	}
1138 
1139 	/* Socket buffer is empty and we shall not block. */
1140 	if (sb->sb_cc == 0 &&
1141 	    ((so->so_state & SS_NBIO) || (flags & (MSG_DONTWAIT|MSG_NBIO)))) {
1142 		error = EAGAIN;
1143 		goto out;
1144 	}
1145 
1146 	/* Socket buffer got some data that we shall deliver now. */
1147 	if (sb->sb_cc > 0 && !(flags & MSG_WAITALL) &&
1148 	    ((so->so_state & SS_NBIO) ||
1149 	     (flags & (MSG_DONTWAIT|MSG_NBIO)) ||
1150 	     sb->sb_cc >= sb->sb_lowat ||
1151 	     sb->sb_cc >= uio->uio_resid ||
1152 	     sb->sb_cc >= sb->sb_hiwat)) {
1153 		goto deliver;
1154 	}
1155 
1156 	/* On MSG_WAITALL we must wait until all data or error arrives. */
1157 	if ((flags & MSG_WAITALL) &&
1158 	    (sb->sb_cc >= uio->uio_resid || sb->sb_cc >= sb->sb_lowat))
1159 		goto deliver;
1160 
1161 	/*
1162 	 * Wait and block until (more) data comes in.
1163 	 * NB: Drops the sockbuf lock during wait.
1164 	 */
1165 	error = sbwait(sb);
1166 	if (error) {
1167 		if (sb->sb_flags & SB_DDP_INDICATE && !ddp_handled) {
1168 			(void) handle_ddp(so, uio, flags, 1);
1169 			ddp_handled = 1;
1170 		}
1171 		goto out;
1172 	}
1173 	goto restart;
1174 
1175 deliver:
1176 	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
1177 	KASSERT(sb->sb_cc > 0, ("%s: sockbuf empty", __func__));
1178 	KASSERT(sb->sb_mb != NULL, ("%s: sb_mb == NULL", __func__));
1179 
1180 	if (sb->sb_flags & SB_DDP_INDICATE && !ddp_handled)
1181 		goto restart;
1182 
1183 	/* Statistics. */
1184 	if (uio->uio_td)
1185 		uio->uio_td->td_ru.ru_msgrcv++;
1186 
1187 	/* Fill uio until full or current end of socket buffer is reached. */
1188 	len = min(uio->uio_resid, sb->sb_cc);
1189 	if (mp0 != NULL) {
1190 		/* Dequeue as many mbufs as possible. */
1191 		if (!(flags & MSG_PEEK) && len >= sb->sb_mb->m_len) {
1192 			for (*mp0 = m = sb->sb_mb;
1193 			     m != NULL && m->m_len <= len;
1194 			     m = m->m_next) {
1195 				len -= m->m_len;
1196 				uio->uio_resid -= m->m_len;
1197 				sbfree(sb, m);
1198 				n = m;
1199 			}
1200 			sb->sb_mb = m;
1201 			if (sb->sb_mb == NULL)
1202 				SB_EMPTY_FIXUP(sb);
1203 			n->m_next = NULL;
1204 		}
1205 		/* Copy the remainder. */
1206 		if (len > 0) {
1207 			KASSERT(sb->sb_mb != NULL,
1208 			    ("%s: len > 0 && sb->sb_mb empty", __func__));
1209 
1210 			m = m_copym(sb->sb_mb, 0, len, M_NOWAIT);
1211 			if (m == NULL)
1212 				len = 0;	/* Don't flush data from sockbuf. */
1213 			else
1214 				uio->uio_resid -= m->m_len;
1215 			if (*mp0 != NULL)
1216 				n->m_next = m;
1217 			else
1218 				*mp0 = m;
1219 			if (*mp0 == NULL) {
1220 				error = ENOBUFS;
1221 				goto out;
1222 			}
1223 		}
1224 	} else {
1225 		/* NB: Must unlock socket buffer as uiomove may sleep. */
1226 		SOCKBUF_UNLOCK(sb);
1227 		error = m_mbuftouio_ddp(uio, sb->sb_mb, len);
1228 		SOCKBUF_LOCK(sb);
1229 		if (error)
1230 			goto out;
1231 	}
1232 	SBLASTRECORDCHK(sb);
1233 	SBLASTMBUFCHK(sb);
1234 
1235 	/*
1236 	 * Remove the delivered data from the socket buffer unless we
1237 	 * were only peeking.
1238 	 */
1239 	if (!(flags & MSG_PEEK)) {
1240 		if (len > 0)
1241 			sbdrop_locked(sb, len);
1242 
1243 		/* Notify protocol that we drained some data. */
1244 		if ((so->so_proto->pr_flags & PR_WANTRCVD) &&
1245 		    (((flags & MSG_WAITALL) && uio->uio_resid > 0) ||
1246 		     !(flags & MSG_SOCALLBCK))) {
1247 			SOCKBUF_UNLOCK(sb);
1248 			VNET_SO_ASSERT(so);
1249 			(*so->so_proto->pr_usrreqs->pru_rcvd)(so, flags);
1250 			SOCKBUF_LOCK(sb);
1251 		}
1252 	}
1253 
1254 	/*
1255 	 * For MSG_WAITALL we may have to loop again and wait for
1256 	 * more data to come in.
1257 	 */
1258 	if ((flags & MSG_WAITALL) && uio->uio_resid > 0)
1259 		goto restart;
1260 out:
1261 	SOCKBUF_LOCK_ASSERT(sb);
1262 	SBLASTRECORDCHK(sb);
1263 	SBLASTMBUFCHK(sb);
1264 	SOCKBUF_UNLOCK(sb);
1265 	sbunlock(sb);
1266 	return (error);
1267 }
1268 
1269 #endif
1270