1 /*- 2 * Copyright (c) 2012 Chelsio Communications, Inc. 3 * All rights reserved. 4 * Written by: Navdeep Parhar <np@FreeBSD.org> 5 * 6 * Redistribution and use in source and binary forms, with or without 7 * modification, are permitted provided that the following conditions 8 * are met: 9 * 1. Redistributions of source code must retain the above copyright 10 * notice, this list of conditions and the following disclaimer. 11 * 2. Redistributions in binary form must reproduce the above copyright 12 * notice, this list of conditions and the following disclaimer in the 13 * documentation and/or other materials provided with the distribution. 14 * 15 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 16 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 18 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 19 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 20 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 21 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 22 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 23 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 24 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 25 * SUCH DAMAGE. 26 */ 27 28 #include <sys/cdefs.h> 29 __FBSDID("$FreeBSD$"); 30 31 #include "opt_inet.h" 32 33 #include <sys/param.h> 34 #include <sys/types.h> 35 #include <sys/systm.h> 36 #include <sys/kernel.h> 37 #include <sys/ktr.h> 38 #include <sys/module.h> 39 #include <sys/protosw.h> 40 #include <sys/proc.h> 41 #include <sys/domain.h> 42 #include <sys/socket.h> 43 #include <sys/socketvar.h> 44 #include <sys/uio.h> 45 #include <netinet/in.h> 46 #include <netinet/in_pcb.h> 47 #include <netinet/ip.h> 48 #include <netinet/tcp_var.h> 49 #define TCPSTATES 50 #include <netinet/tcp_fsm.h> 51 #include <netinet/toecore.h> 52 53 #include <vm/vm.h> 54 #include <vm/vm_extern.h> 55 #include <vm/vm_param.h> 56 #include <vm/pmap.h> 57 #include <vm/vm_map.h> 58 #include <vm/vm_page.h> 59 #include <vm/vm_object.h> 60 61 #ifdef TCP_OFFLOAD 62 #include "common/common.h" 63 #include "common/t4_msg.h" 64 #include "common/t4_regs.h" 65 #include "common/t4_tcb.h" 66 #include "tom/t4_tom.h" 67 68 VNET_DECLARE(int, tcp_do_autorcvbuf); 69 #define V_tcp_do_autorcvbuf VNET(tcp_do_autorcvbuf) 70 VNET_DECLARE(int, tcp_autorcvbuf_inc); 71 #define V_tcp_autorcvbuf_inc VNET(tcp_autorcvbuf_inc) 72 VNET_DECLARE(int, tcp_autorcvbuf_max); 73 #define V_tcp_autorcvbuf_max VNET(tcp_autorcvbuf_max) 74 75 #define PPOD_SZ(n) ((n) * sizeof(struct pagepod)) 76 #define PPOD_SIZE (PPOD_SZ(1)) 77 78 /* XXX: must match A_ULP_RX_TDDP_PSZ */ 79 static int t4_ddp_pgsz[] = {4096, 4096 << 2, 4096 << 4, 4096 << 6}; 80 81 #if 0 82 static void 83 t4_dump_tcb(struct adapter *sc, int tid) 84 { 85 uint32_t tcb_base, off, i, j; 86 87 /* Dump TCB for the tid */ 88 tcb_base = t4_read_reg(sc, A_TP_CMM_TCB_BASE); 89 t4_write_reg(sc, PCIE_MEM_ACCESS_REG(A_PCIE_MEM_ACCESS_OFFSET, 2), 90 tcb_base + tid * TCB_SIZE); 91 t4_read_reg(sc, PCIE_MEM_ACCESS_REG(A_PCIE_MEM_ACCESS_OFFSET, 2)); 92 off = 0; 93 printf("\n"); 94 for (i = 0; i < 4; i++) { 95 uint32_t buf[8]; 96 for (j = 0; j < 8; j++, off += 4) 97 buf[j] = htonl(t4_read_reg(sc, MEMWIN2_BASE + off)); 98 99 printf("%08x %08x %08x %08x %08x %08x %08x %08x\n", 100 buf[0], buf[1], buf[2], buf[3], buf[4], buf[5], buf[6], 101 buf[7]); 102 } 103 } 104 #endif 105 106 #define MAX_DDP_BUFFER_SIZE (M_TCB_RX_DDP_BUF0_LEN) 107 static int 108 alloc_ppods(struct tom_data *td, int n, u_int *ppod_addr) 109 { 110 vmem_addr_t v; 111 int rc; 112 113 MPASS(n > 0); 114 115 rc = vmem_alloc(td->ppod_arena, PPOD_SZ(n), M_NOWAIT | M_FIRSTFIT, &v); 116 *ppod_addr = (u_int)v; 117 118 return (rc); 119 } 120 121 static void 122 free_ppods(struct tom_data *td, u_int ppod_addr, int n) 123 { 124 125 MPASS(n > 0); 126 127 vmem_free(td->ppod_arena, (vmem_addr_t)ppod_addr, PPOD_SZ(n)); 128 } 129 130 static inline int 131 pages_to_nppods(int npages, int ddp_pgsz) 132 { 133 int nsegs = npages * PAGE_SIZE / ddp_pgsz; 134 135 return (howmany(nsegs, PPOD_PAGES)); 136 } 137 138 static void 139 free_ddp_buffer(struct tom_data *td, struct ddp_buffer *db) 140 { 141 142 if (db == NULL) 143 return; 144 145 if (db->pages) 146 free(db->pages, M_CXGBE); 147 148 if (db->nppods > 0) 149 free_ppods(td, db->ppod_addr, db->nppods); 150 151 free(db, M_CXGBE); 152 } 153 154 void 155 release_ddp_resources(struct toepcb *toep) 156 { 157 int i; 158 159 for (i = 0; i < nitems(toep->db); i++) { 160 if (toep->db[i] != NULL) { 161 free_ddp_buffer(toep->td, toep->db[i]); 162 toep->db[i] = NULL; 163 } 164 } 165 } 166 167 /* XXX: handle_ddp_data code duplication */ 168 void 169 insert_ddp_data(struct toepcb *toep, uint32_t n) 170 { 171 struct inpcb *inp = toep->inp; 172 struct tcpcb *tp = intotcpcb(inp); 173 struct sockbuf *sb = &inp->inp_socket->so_rcv; 174 struct mbuf *m; 175 176 INP_WLOCK_ASSERT(inp); 177 SOCKBUF_LOCK_ASSERT(sb); 178 179 m = get_ddp_mbuf(n); 180 tp->rcv_nxt += n; 181 #ifndef USE_DDP_RX_FLOW_CONTROL 182 KASSERT(tp->rcv_wnd >= n, ("%s: negative window size", __func__)); 183 tp->rcv_wnd -= n; 184 #endif 185 186 KASSERT(toep->sb_cc >= sbused(sb), 187 ("%s: sb %p has more data (%d) than last time (%d).", 188 __func__, sb, sbused(sb), toep->sb_cc)); 189 toep->rx_credits += toep->sb_cc - sbused(sb); 190 #ifdef USE_DDP_RX_FLOW_CONTROL 191 toep->rx_credits -= n; /* adjust for F_RX_FC_DDP */ 192 #endif 193 sbappendstream_locked(sb, m, 0); 194 toep->sb_cc = sbused(sb); 195 } 196 197 /* SET_TCB_FIELD sent as a ULP command looks like this */ 198 #define LEN__SET_TCB_FIELD_ULP (sizeof(struct ulp_txpkt) + \ 199 sizeof(struct ulptx_idata) + sizeof(struct cpl_set_tcb_field_core)) 200 201 /* RX_DATA_ACK sent as a ULP command looks like this */ 202 #define LEN__RX_DATA_ACK_ULP (sizeof(struct ulp_txpkt) + \ 203 sizeof(struct ulptx_idata) + sizeof(struct cpl_rx_data_ack_core)) 204 205 static inline void * 206 mk_set_tcb_field_ulp(struct ulp_txpkt *ulpmc, struct toepcb *toep, 207 uint64_t word, uint64_t mask, uint64_t val) 208 { 209 struct ulptx_idata *ulpsc; 210 struct cpl_set_tcb_field_core *req; 211 212 ulpmc->cmd_dest = htonl(V_ULPTX_CMD(ULP_TX_PKT) | V_ULP_TXPKT_DEST(0)); 213 ulpmc->len = htobe32(howmany(LEN__SET_TCB_FIELD_ULP, 16)); 214 215 ulpsc = (struct ulptx_idata *)(ulpmc + 1); 216 ulpsc->cmd_more = htobe32(V_ULPTX_CMD(ULP_TX_SC_IMM)); 217 ulpsc->len = htobe32(sizeof(*req)); 218 219 req = (struct cpl_set_tcb_field_core *)(ulpsc + 1); 220 OPCODE_TID(req) = htobe32(MK_OPCODE_TID(CPL_SET_TCB_FIELD, toep->tid)); 221 req->reply_ctrl = htobe16(V_NO_REPLY(1) | 222 V_QUEUENO(toep->ofld_rxq->iq.abs_id)); 223 req->word_cookie = htobe16(V_WORD(word) | V_COOKIE(0)); 224 req->mask = htobe64(mask); 225 req->val = htobe64(val); 226 227 ulpsc = (struct ulptx_idata *)(req + 1); 228 if (LEN__SET_TCB_FIELD_ULP % 16) { 229 ulpsc->cmd_more = htobe32(V_ULPTX_CMD(ULP_TX_SC_NOOP)); 230 ulpsc->len = htobe32(0); 231 return (ulpsc + 1); 232 } 233 return (ulpsc); 234 } 235 236 static inline void * 237 mk_rx_data_ack_ulp(struct ulp_txpkt *ulpmc, struct toepcb *toep) 238 { 239 struct ulptx_idata *ulpsc; 240 struct cpl_rx_data_ack_core *req; 241 242 ulpmc->cmd_dest = htonl(V_ULPTX_CMD(ULP_TX_PKT) | V_ULP_TXPKT_DEST(0)); 243 ulpmc->len = htobe32(howmany(LEN__RX_DATA_ACK_ULP, 16)); 244 245 ulpsc = (struct ulptx_idata *)(ulpmc + 1); 246 ulpsc->cmd_more = htobe32(V_ULPTX_CMD(ULP_TX_SC_IMM)); 247 ulpsc->len = htobe32(sizeof(*req)); 248 249 req = (struct cpl_rx_data_ack_core *)(ulpsc + 1); 250 OPCODE_TID(req) = htobe32(MK_OPCODE_TID(CPL_RX_DATA_ACK, toep->tid)); 251 req->credit_dack = htobe32(F_RX_MODULATE_RX); 252 253 ulpsc = (struct ulptx_idata *)(req + 1); 254 if (LEN__RX_DATA_ACK_ULP % 16) { 255 ulpsc->cmd_more = htobe32(V_ULPTX_CMD(ULP_TX_SC_NOOP)); 256 ulpsc->len = htobe32(0); 257 return (ulpsc + 1); 258 } 259 return (ulpsc); 260 } 261 262 static inline uint64_t 263 select_ddp_flags(struct socket *so, int flags, int db_idx) 264 { 265 uint64_t ddp_flags = V_TF_DDP_INDICATE_OUT(0); 266 int waitall = flags & MSG_WAITALL; 267 int nb = so->so_state & SS_NBIO || flags & (MSG_DONTWAIT | MSG_NBIO); 268 269 KASSERT(db_idx == 0 || db_idx == 1, 270 ("%s: bad DDP buffer index %d", __func__, db_idx)); 271 272 if (db_idx == 0) { 273 ddp_flags |= V_TF_DDP_BUF0_VALID(1) | V_TF_DDP_ACTIVE_BUF(0); 274 if (waitall) 275 ddp_flags |= V_TF_DDP_PUSH_DISABLE_0(1); 276 else if (nb) 277 ddp_flags |= V_TF_DDP_BUF0_FLUSH(1); 278 else 279 ddp_flags |= V_TF_DDP_BUF0_FLUSH(0); 280 } else { 281 ddp_flags |= V_TF_DDP_BUF1_VALID(1) | V_TF_DDP_ACTIVE_BUF(1); 282 if (waitall) 283 ddp_flags |= V_TF_DDP_PUSH_DISABLE_1(1); 284 else if (nb) 285 ddp_flags |= V_TF_DDP_BUF1_FLUSH(1); 286 else 287 ddp_flags |= V_TF_DDP_BUF1_FLUSH(0); 288 } 289 290 return (ddp_flags); 291 } 292 293 static struct wrqe * 294 mk_update_tcb_for_ddp(struct adapter *sc, struct toepcb *toep, int db_idx, 295 int offset, uint64_t ddp_flags) 296 { 297 struct ddp_buffer *db = toep->db[db_idx]; 298 struct wrqe *wr; 299 struct work_request_hdr *wrh; 300 struct ulp_txpkt *ulpmc; 301 int len; 302 303 KASSERT(db_idx == 0 || db_idx == 1, 304 ("%s: bad DDP buffer index %d", __func__, db_idx)); 305 306 /* 307 * We'll send a compound work request that has 3 SET_TCB_FIELDs and an 308 * RX_DATA_ACK (with RX_MODULATE to speed up delivery). 309 * 310 * The work request header is 16B and always ends at a 16B boundary. 311 * The ULPTX master commands that follow must all end at 16B boundaries 312 * too so we round up the size to 16. 313 */ 314 len = sizeof(*wrh) + 3 * roundup2(LEN__SET_TCB_FIELD_ULP, 16) + 315 roundup2(LEN__RX_DATA_ACK_ULP, 16); 316 317 wr = alloc_wrqe(len, toep->ctrlq); 318 if (wr == NULL) 319 return (NULL); 320 wrh = wrtod(wr); 321 INIT_ULPTX_WRH(wrh, len, 1, 0); /* atomic */ 322 ulpmc = (struct ulp_txpkt *)(wrh + 1); 323 324 /* Write the buffer's tag */ 325 ulpmc = mk_set_tcb_field_ulp(ulpmc, toep, 326 W_TCB_RX_DDP_BUF0_TAG + db_idx, 327 V_TCB_RX_DDP_BUF0_TAG(M_TCB_RX_DDP_BUF0_TAG), 328 V_TCB_RX_DDP_BUF0_TAG(db->tag)); 329 330 /* Update the current offset in the DDP buffer and its total length */ 331 if (db_idx == 0) 332 ulpmc = mk_set_tcb_field_ulp(ulpmc, toep, 333 W_TCB_RX_DDP_BUF0_OFFSET, 334 V_TCB_RX_DDP_BUF0_OFFSET(M_TCB_RX_DDP_BUF0_OFFSET) | 335 V_TCB_RX_DDP_BUF0_LEN(M_TCB_RX_DDP_BUF0_LEN), 336 V_TCB_RX_DDP_BUF0_OFFSET(offset) | 337 V_TCB_RX_DDP_BUF0_LEN(db->len)); 338 else 339 ulpmc = mk_set_tcb_field_ulp(ulpmc, toep, 340 W_TCB_RX_DDP_BUF1_OFFSET, 341 V_TCB_RX_DDP_BUF1_OFFSET(M_TCB_RX_DDP_BUF1_OFFSET) | 342 V_TCB_RX_DDP_BUF1_LEN((u64)M_TCB_RX_DDP_BUF1_LEN << 32), 343 V_TCB_RX_DDP_BUF1_OFFSET(offset) | 344 V_TCB_RX_DDP_BUF1_LEN((u64)db->len << 32)); 345 346 /* Update DDP flags */ 347 ulpmc = mk_set_tcb_field_ulp(ulpmc, toep, W_TCB_RX_DDP_FLAGS, 348 V_TF_DDP_BUF0_FLUSH(1) | V_TF_DDP_BUF1_FLUSH(1) | 349 V_TF_DDP_PUSH_DISABLE_0(1) | V_TF_DDP_PUSH_DISABLE_1(1) | 350 V_TF_DDP_BUF0_VALID(1) | V_TF_DDP_BUF1_VALID(1) | 351 V_TF_DDP_ACTIVE_BUF(1) | V_TF_DDP_INDICATE_OUT(1), ddp_flags); 352 353 /* Gratuitous RX_DATA_ACK with RX_MODULATE set to speed up delivery. */ 354 ulpmc = mk_rx_data_ack_ulp(ulpmc, toep); 355 356 return (wr); 357 } 358 359 static void 360 discourage_ddp(struct toepcb *toep) 361 { 362 363 if (toep->ddp_score && --toep->ddp_score == 0) { 364 toep->ddp_flags &= ~DDP_OK; 365 toep->ddp_disabled = time_uptime; 366 CTR3(KTR_CXGBE, "%s: tid %u !DDP_OK @ %u", 367 __func__, toep->tid, time_uptime); 368 } 369 } 370 371 static int 372 handle_ddp_data(struct toepcb *toep, __be32 ddp_report, __be32 rcv_nxt, int len) 373 { 374 uint32_t report = be32toh(ddp_report); 375 unsigned int db_flag; 376 struct inpcb *inp = toep->inp; 377 struct tcpcb *tp; 378 struct socket *so; 379 struct sockbuf *sb; 380 struct mbuf *m; 381 382 db_flag = report & F_DDP_BUF_IDX ? DDP_BUF1_ACTIVE : DDP_BUF0_ACTIVE; 383 384 if (__predict_false(!(report & F_DDP_INV))) 385 CXGBE_UNIMPLEMENTED("DDP buffer still valid"); 386 387 INP_WLOCK(inp); 388 so = inp_inpcbtosocket(inp); 389 sb = &so->so_rcv; 390 if (__predict_false(inp->inp_flags & (INP_DROPPED | INP_TIMEWAIT))) { 391 392 /* 393 * XXX: think a bit more. 394 * tcpcb probably gone, but socket should still be around 395 * because we always wait for DDP completion in soreceive no 396 * matter what. Just wake it up and let it clean up. 397 */ 398 399 CTR5(KTR_CXGBE, "%s: tid %u, seq 0x%x, len %d, inp_flags 0x%x", 400 __func__, toep->tid, be32toh(rcv_nxt), len, inp->inp_flags); 401 SOCKBUF_LOCK(sb); 402 goto wakeup; 403 } 404 405 tp = intotcpcb(inp); 406 len += be32toh(rcv_nxt) - tp->rcv_nxt; 407 tp->rcv_nxt += len; 408 tp->t_rcvtime = ticks; 409 #ifndef USE_DDP_RX_FLOW_CONTROL 410 KASSERT(tp->rcv_wnd >= len, ("%s: negative window size", __func__)); 411 tp->rcv_wnd -= len; 412 #endif 413 m = get_ddp_mbuf(len); 414 415 SOCKBUF_LOCK(sb); 416 if (report & F_DDP_BUF_COMPLETE) 417 toep->ddp_score = DDP_HIGH_SCORE; 418 else 419 discourage_ddp(toep); 420 421 /* receive buffer autosize */ 422 if (sb->sb_flags & SB_AUTOSIZE && 423 V_tcp_do_autorcvbuf && 424 sb->sb_hiwat < V_tcp_autorcvbuf_max && 425 len > (sbspace(sb) / 8 * 7)) { 426 unsigned int hiwat = sb->sb_hiwat; 427 unsigned int newsize = min(hiwat + V_tcp_autorcvbuf_inc, 428 V_tcp_autorcvbuf_max); 429 430 if (!sbreserve_locked(sb, newsize, so, NULL)) 431 sb->sb_flags &= ~SB_AUTOSIZE; 432 else 433 toep->rx_credits += newsize - hiwat; 434 } 435 436 KASSERT(toep->sb_cc >= sbused(sb), 437 ("%s: sb %p has more data (%d) than last time (%d).", 438 __func__, sb, sbused(sb), toep->sb_cc)); 439 toep->rx_credits += toep->sb_cc - sbused(sb); 440 #ifdef USE_DDP_RX_FLOW_CONTROL 441 toep->rx_credits -= len; /* adjust for F_RX_FC_DDP */ 442 #endif 443 sbappendstream_locked(sb, m, 0); 444 toep->sb_cc = sbused(sb); 445 wakeup: 446 KASSERT(toep->ddp_flags & db_flag, 447 ("%s: DDP buffer not active. toep %p, ddp_flags 0x%x, report 0x%x", 448 __func__, toep, toep->ddp_flags, report)); 449 toep->ddp_flags &= ~db_flag; 450 sorwakeup_locked(so); 451 SOCKBUF_UNLOCK_ASSERT(sb); 452 453 INP_WUNLOCK(inp); 454 return (0); 455 } 456 457 #define DDP_ERR (F_DDP_PPOD_MISMATCH | F_DDP_LLIMIT_ERR | F_DDP_ULIMIT_ERR |\ 458 F_DDP_PPOD_PARITY_ERR | F_DDP_PADDING_ERR | F_DDP_OFFSET_ERR |\ 459 F_DDP_INVALID_TAG | F_DDP_COLOR_ERR | F_DDP_TID_MISMATCH |\ 460 F_DDP_INVALID_PPOD | F_DDP_HDRCRC_ERR | F_DDP_DATACRC_ERR) 461 462 static int 463 do_rx_data_ddp(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m) 464 { 465 struct adapter *sc = iq->adapter; 466 const struct cpl_rx_data_ddp *cpl = (const void *)(rss + 1); 467 unsigned int tid = GET_TID(cpl); 468 uint32_t vld; 469 struct toepcb *toep = lookup_tid(sc, tid); 470 struct tom_data *td = toep->td; 471 472 KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__)); 473 KASSERT(toep->tid == tid, ("%s: toep tid/atid mismatch", __func__)); 474 KASSERT(!(toep->flags & TPF_SYNQE), 475 ("%s: toep %p claims to be a synq entry", __func__, toep)); 476 477 vld = be32toh(cpl->ddpvld); 478 if (__predict_false(vld & DDP_ERR)) { 479 panic("%s: DDP error 0x%x (tid %d, toep %p)", 480 __func__, vld, tid, toep); 481 } 482 if (toep->ulp_mode == ULP_MODE_ISCSI) { 483 m = m_get(M_NOWAIT, MT_DATA); 484 if (m == NULL) 485 CXGBE_UNIMPLEMENTED("mbuf alloc failure"); 486 memcpy(mtod(m, unsigned char *), cpl, 487 sizeof(struct cpl_rx_data_ddp)); 488 if (!t4_cpl_iscsi_callback(td, toep, m, CPL_RX_DATA_DDP)) 489 return (0); 490 m_freem(m); 491 } 492 493 handle_ddp_data(toep, cpl->u.ddp_report, cpl->seq, be16toh(cpl->len)); 494 495 return (0); 496 } 497 498 static int 499 do_rx_ddp_complete(struct sge_iq *iq, const struct rss_header *rss, 500 struct mbuf *m) 501 { 502 struct adapter *sc = iq->adapter; 503 const struct cpl_rx_ddp_complete *cpl = (const void *)(rss + 1); 504 unsigned int tid = GET_TID(cpl); 505 struct toepcb *toep = lookup_tid(sc, tid); 506 507 KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__)); 508 KASSERT(toep->tid == tid, ("%s: toep tid/atid mismatch", __func__)); 509 KASSERT(!(toep->flags & TPF_SYNQE), 510 ("%s: toep %p claims to be a synq entry", __func__, toep)); 511 512 handle_ddp_data(toep, cpl->ddp_report, cpl->rcv_nxt, 0); 513 514 return (0); 515 } 516 517 void 518 enable_ddp(struct adapter *sc, struct toepcb *toep) 519 { 520 521 KASSERT((toep->ddp_flags & (DDP_ON | DDP_OK | DDP_SC_REQ)) == DDP_OK, 522 ("%s: toep %p has bad ddp_flags 0x%x", 523 __func__, toep, toep->ddp_flags)); 524 525 CTR3(KTR_CXGBE, "%s: tid %u (time %u)", 526 __func__, toep->tid, time_uptime); 527 528 toep->ddp_flags |= DDP_SC_REQ; 529 t4_set_tcb_field(sc, toep, 1, W_TCB_RX_DDP_FLAGS, 530 V_TF_DDP_OFF(1) | V_TF_DDP_INDICATE_OUT(1) | 531 V_TF_DDP_BUF0_INDICATE(1) | V_TF_DDP_BUF1_INDICATE(1) | 532 V_TF_DDP_BUF0_VALID(1) | V_TF_DDP_BUF1_VALID(1), 533 V_TF_DDP_BUF0_INDICATE(1) | V_TF_DDP_BUF1_INDICATE(1)); 534 t4_set_tcb_field(sc, toep, 1, W_TCB_T_FLAGS, 535 V_TF_RCV_COALESCE_ENABLE(1), 0); 536 } 537 538 static inline void 539 disable_ddp(struct adapter *sc, struct toepcb *toep) 540 { 541 542 KASSERT((toep->ddp_flags & (DDP_ON | DDP_SC_REQ)) == DDP_ON, 543 ("%s: toep %p has bad ddp_flags 0x%x", 544 __func__, toep, toep->ddp_flags)); 545 546 CTR3(KTR_CXGBE, "%s: tid %u (time %u)", 547 __func__, toep->tid, time_uptime); 548 549 toep->ddp_flags |= DDP_SC_REQ; 550 t4_set_tcb_field(sc, toep, 1, W_TCB_T_FLAGS, 551 V_TF_RCV_COALESCE_ENABLE(1), V_TF_RCV_COALESCE_ENABLE(1)); 552 t4_set_tcb_field(sc, toep, 1, W_TCB_RX_DDP_FLAGS, V_TF_DDP_OFF(1), 553 V_TF_DDP_OFF(1)); 554 } 555 556 static int 557 hold_uio(struct uio *uio, vm_page_t **ppages, int *pnpages) 558 { 559 struct vm_map *map; 560 struct iovec *iov; 561 vm_offset_t start, end; 562 vm_page_t *pp; 563 int n; 564 565 KASSERT(uio->uio_iovcnt == 1, 566 ("%s: uio_iovcnt %d", __func__, uio->uio_iovcnt)); 567 KASSERT(uio->uio_td->td_proc == curproc, 568 ("%s: uio proc (%p) is not curproc (%p)", 569 __func__, uio->uio_td->td_proc, curproc)); 570 571 map = &curproc->p_vmspace->vm_map; 572 iov = &uio->uio_iov[0]; 573 start = trunc_page((uintptr_t)iov->iov_base); 574 end = round_page((vm_offset_t)iov->iov_base + iov->iov_len); 575 n = howmany(end - start, PAGE_SIZE); 576 577 if (end - start > MAX_DDP_BUFFER_SIZE) 578 return (E2BIG); 579 580 pp = malloc(n * sizeof(vm_page_t), M_CXGBE, M_NOWAIT); 581 if (pp == NULL) 582 return (ENOMEM); 583 584 if (vm_fault_quick_hold_pages(map, (vm_offset_t)iov->iov_base, 585 iov->iov_len, VM_PROT_WRITE, pp, n) < 0) { 586 free(pp, M_CXGBE); 587 return (EFAULT); 588 } 589 590 *ppages = pp; 591 *pnpages = n; 592 593 return (0); 594 } 595 596 static int 597 bufcmp(struct ddp_buffer *db, vm_page_t *pages, int npages, int offset, int len) 598 { 599 int i; 600 601 if (db == NULL || db->npages != npages || db->offset != offset || 602 db->len != len) 603 return (1); 604 605 for (i = 0; i < npages; i++) { 606 if (pages[i]->phys_addr != db->pages[i]->phys_addr) 607 return (1); 608 } 609 610 return (0); 611 } 612 613 static int 614 calculate_hcf(int n1, int n2) 615 { 616 int a, b, t; 617 618 if (n1 <= n2) { 619 a = n1; 620 b = n2; 621 } else { 622 a = n2; 623 b = n1; 624 } 625 626 while (a != 0) { 627 t = a; 628 a = b % a; 629 b = t; 630 } 631 632 return (b); 633 } 634 635 static struct ddp_buffer * 636 alloc_ddp_buffer(struct tom_data *td, vm_page_t *pages, int npages, int offset, 637 int len) 638 { 639 int i, hcf, seglen, idx, ppod, nppods; 640 struct ddp_buffer *db; 641 642 /* 643 * The DDP page size is unrelated to the VM page size. We combine 644 * contiguous physical pages into larger segments to get the best DDP 645 * page size possible. This is the largest of the four sizes in 646 * A_ULP_RX_TDDP_PSZ that evenly divides the HCF of the segment sizes in 647 * the page list. 648 */ 649 hcf = 0; 650 for (i = 0; i < npages; i++) { 651 seglen = PAGE_SIZE; 652 while (i < npages - 1 && 653 pages[i]->phys_addr + PAGE_SIZE == pages[i + 1]->phys_addr) { 654 seglen += PAGE_SIZE; 655 i++; 656 } 657 658 hcf = calculate_hcf(hcf, seglen); 659 if (hcf < t4_ddp_pgsz[1]) { 660 idx = 0; 661 goto have_pgsz; /* give up, short circuit */ 662 } 663 } 664 665 if (hcf % t4_ddp_pgsz[0] != 0) { 666 /* hmmm. This could only happen when PAGE_SIZE < 4K */ 667 KASSERT(PAGE_SIZE < 4096, 668 ("%s: PAGE_SIZE %d, hcf %d", __func__, PAGE_SIZE, hcf)); 669 CTR3(KTR_CXGBE, "%s: PAGE_SIZE %d, hcf %d", 670 __func__, PAGE_SIZE, hcf); 671 return (NULL); 672 } 673 674 for (idx = nitems(t4_ddp_pgsz) - 1; idx > 0; idx--) { 675 if (hcf % t4_ddp_pgsz[idx] == 0) 676 break; 677 } 678 have_pgsz: 679 MPASS(idx <= M_PPOD_PGSZ); 680 681 db = malloc(sizeof(*db), M_CXGBE, M_NOWAIT); 682 if (db == NULL) { 683 CTR1(KTR_CXGBE, "%s: malloc failed.", __func__); 684 return (NULL); 685 } 686 687 nppods = pages_to_nppods(npages, t4_ddp_pgsz[idx]); 688 if (alloc_ppods(td, nppods, &db->ppod_addr) != 0) { 689 free(db, M_CXGBE); 690 CTR4(KTR_CXGBE, "%s: no pods, nppods %d, resid %d, pgsz %d", 691 __func__, nppods, len, t4_ddp_pgsz[idx]); 692 return (NULL); 693 } 694 ppod = (db->ppod_addr - td->ppod_start) / PPOD_SIZE; 695 696 db->tag = V_PPOD_PGSZ(idx) | V_PPOD_TAG(ppod); 697 db->nppods = nppods; 698 db->npages = npages; 699 db->pages = pages; 700 db->offset = offset; 701 db->len = len; 702 703 CTR6(KTR_CXGBE, "New DDP buffer. " 704 "ddp_pgsz %d, ppod 0x%x, npages %d, nppods %d, offset %d, len %d", 705 t4_ddp_pgsz[idx], ppod, db->npages, db->nppods, db->offset, 706 db->len); 707 708 return (db); 709 } 710 711 #define NUM_ULP_TX_SC_IMM_PPODS (256 / PPOD_SIZE) 712 713 static int 714 write_page_pods(struct adapter *sc, struct toepcb *toep, struct ddp_buffer *db) 715 { 716 struct wrqe *wr; 717 struct ulp_mem_io *ulpmc; 718 struct ulptx_idata *ulpsc; 719 struct pagepod *ppod; 720 int i, j, k, n, chunk, len, ddp_pgsz, idx; 721 u_int ppod_addr; 722 uint32_t cmd; 723 724 cmd = htobe32(V_ULPTX_CMD(ULP_TX_MEM_WRITE)); 725 if (is_t4(sc)) 726 cmd |= htobe32(F_ULP_MEMIO_ORDER); 727 else 728 cmd |= htobe32(F_T5_ULP_MEMIO_IMM); 729 ddp_pgsz = t4_ddp_pgsz[G_PPOD_PGSZ(db->tag)]; 730 ppod_addr = db->ppod_addr; 731 for (i = 0; i < db->nppods; ppod_addr += chunk) { 732 733 /* How many page pods are we writing in this cycle */ 734 n = min(db->nppods - i, NUM_ULP_TX_SC_IMM_PPODS); 735 chunk = PPOD_SZ(n); 736 len = roundup2(sizeof(*ulpmc) + sizeof(*ulpsc) + chunk, 16); 737 738 wr = alloc_wrqe(len, toep->ctrlq); 739 if (wr == NULL) 740 return (ENOMEM); /* ok to just bail out */ 741 ulpmc = wrtod(wr); 742 743 INIT_ULPTX_WR(ulpmc, len, 0, 0); 744 ulpmc->cmd = cmd; 745 ulpmc->dlen = htobe32(V_ULP_MEMIO_DATA_LEN(chunk / 32)); 746 ulpmc->len16 = htobe32(howmany(len - sizeof(ulpmc->wr), 16)); 747 ulpmc->lock_addr = htobe32(V_ULP_MEMIO_ADDR(ppod_addr >> 5)); 748 749 ulpsc = (struct ulptx_idata *)(ulpmc + 1); 750 ulpsc->cmd_more = htobe32(V_ULPTX_CMD(ULP_TX_SC_IMM)); 751 ulpsc->len = htobe32(chunk); 752 753 ppod = (struct pagepod *)(ulpsc + 1); 754 for (j = 0; j < n; i++, j++, ppod++) { 755 ppod->vld_tid_pgsz_tag_color = htobe64(F_PPOD_VALID | 756 V_PPOD_TID(toep->tid) | db->tag); 757 ppod->len_offset = htobe64(V_PPOD_LEN(db->len) | 758 V_PPOD_OFST(db->offset)); 759 ppod->rsvd = 0; 760 idx = i * PPOD_PAGES * (ddp_pgsz / PAGE_SIZE); 761 for (k = 0; k < nitems(ppod->addr); k++) { 762 if (idx < db->npages) { 763 ppod->addr[k] = 764 htobe64(db->pages[idx]->phys_addr); 765 idx += ddp_pgsz / PAGE_SIZE; 766 } else 767 ppod->addr[k] = 0; 768 #if 0 769 CTR5(KTR_CXGBE, 770 "%s: tid %d ppod[%d]->addr[%d] = %p", 771 __func__, toep->tid, i, k, 772 htobe64(ppod->addr[k])); 773 #endif 774 } 775 776 } 777 778 t4_wrq_tx(sc, wr); 779 } 780 781 return (0); 782 } 783 784 /* 785 * Reuse, or allocate (and program the page pods for) a new DDP buffer. The 786 * "pages" array is handed over to this function and should not be used in any 787 * way by the caller after that. 788 */ 789 static int 790 select_ddp_buffer(struct adapter *sc, struct toepcb *toep, vm_page_t *pages, 791 int npages, int db_off, int db_len) 792 { 793 struct ddp_buffer *db; 794 struct tom_data *td = sc->tom_softc; 795 int i, empty_slot = -1; 796 797 /* Try to reuse */ 798 for (i = 0; i < nitems(toep->db); i++) { 799 if (bufcmp(toep->db[i], pages, npages, db_off, db_len) == 0) { 800 free(pages, M_CXGBE); 801 return (i); /* pages still held */ 802 } else if (toep->db[i] == NULL && empty_slot < 0) 803 empty_slot = i; 804 } 805 806 /* Allocate new buffer, write its page pods. */ 807 db = alloc_ddp_buffer(td, pages, npages, db_off, db_len); 808 if (db == NULL) { 809 vm_page_unhold_pages(pages, npages); 810 free(pages, M_CXGBE); 811 return (-1); 812 } 813 if (write_page_pods(sc, toep, db) != 0) { 814 vm_page_unhold_pages(pages, npages); 815 free_ddp_buffer(td, db); 816 return (-1); 817 } 818 819 i = empty_slot; 820 if (i < 0) { 821 i = arc4random() % nitems(toep->db); 822 free_ddp_buffer(td, toep->db[i]); 823 } 824 toep->db[i] = db; 825 826 CTR5(KTR_CXGBE, "%s: tid %d, DDP buffer[%d] = %p (tag 0x%x)", 827 __func__, toep->tid, i, db, db->tag); 828 829 return (i); 830 } 831 832 static void 833 wire_ddp_buffer(struct ddp_buffer *db) 834 { 835 int i; 836 vm_page_t p; 837 838 for (i = 0; i < db->npages; i++) { 839 p = db->pages[i]; 840 vm_page_lock(p); 841 vm_page_wire(p); 842 vm_page_unhold(p); 843 vm_page_unlock(p); 844 } 845 } 846 847 static void 848 unwire_ddp_buffer(struct ddp_buffer *db) 849 { 850 int i; 851 vm_page_t p; 852 853 for (i = 0; i < db->npages; i++) { 854 p = db->pages[i]; 855 vm_page_lock(p); 856 vm_page_unwire(p, PQ_INACTIVE); 857 vm_page_unlock(p); 858 } 859 } 860 861 static int 862 handle_ddp(struct socket *so, struct uio *uio, int flags, int error) 863 { 864 struct sockbuf *sb = &so->so_rcv; 865 struct tcpcb *tp = so_sototcpcb(so); 866 struct toepcb *toep = tp->t_toe; 867 struct adapter *sc = td_adapter(toep->td); 868 vm_page_t *pages; 869 int npages, db_idx, rc, buf_flag; 870 struct ddp_buffer *db; 871 struct wrqe *wr; 872 uint64_t ddp_flags; 873 874 SOCKBUF_LOCK_ASSERT(sb); 875 876 #if 0 877 if (sbused(sb) + sc->tt.ddp_thres > uio->uio_resid) { 878 CTR4(KTR_CXGBE, "%s: sb_cc %d, threshold %d, resid %d", 879 __func__, sbused(sb), sc->tt.ddp_thres, uio->uio_resid); 880 } 881 #endif 882 883 /* XXX: too eager to disable DDP, could handle NBIO better than this. */ 884 if (sbused(sb) >= uio->uio_resid || uio->uio_resid < sc->tt.ddp_thres || 885 uio->uio_resid > MAX_DDP_BUFFER_SIZE || uio->uio_iovcnt > 1 || 886 so->so_state & SS_NBIO || flags & (MSG_DONTWAIT | MSG_NBIO) || 887 error || so->so_error || sb->sb_state & SBS_CANTRCVMORE) 888 goto no_ddp; 889 890 /* 891 * Fault in and then hold the pages of the uio buffers. We'll wire them 892 * a bit later if everything else works out. 893 */ 894 SOCKBUF_UNLOCK(sb); 895 if (hold_uio(uio, &pages, &npages) != 0) { 896 SOCKBUF_LOCK(sb); 897 goto no_ddp; 898 } 899 SOCKBUF_LOCK(sb); 900 if (__predict_false(so->so_error || sb->sb_state & SBS_CANTRCVMORE)) { 901 vm_page_unhold_pages(pages, npages); 902 free(pages, M_CXGBE); 903 goto no_ddp; 904 } 905 906 /* 907 * Figure out which one of the two DDP buffers to use this time. 908 */ 909 db_idx = select_ddp_buffer(sc, toep, pages, npages, 910 (uintptr_t)uio->uio_iov->iov_base & PAGE_MASK, uio->uio_resid); 911 pages = NULL; /* handed off to select_ddp_buffer */ 912 if (db_idx < 0) 913 goto no_ddp; 914 db = toep->db[db_idx]; 915 buf_flag = db_idx == 0 ? DDP_BUF0_ACTIVE : DDP_BUF1_ACTIVE; 916 917 /* 918 * Build the compound work request that tells the chip where to DMA the 919 * payload. 920 */ 921 ddp_flags = select_ddp_flags(so, flags, db_idx); 922 wr = mk_update_tcb_for_ddp(sc, toep, db_idx, sbused(sb), ddp_flags); 923 if (wr == NULL) { 924 /* 925 * Just unhold the pages. The DDP buffer's software state is 926 * left as-is in the toep. The page pods were written 927 * successfully and we may have an opportunity to use it in the 928 * future. 929 */ 930 vm_page_unhold_pages(db->pages, db->npages); 931 goto no_ddp; 932 } 933 934 /* Wire (and then unhold) the pages, and give the chip the go-ahead. */ 935 wire_ddp_buffer(db); 936 t4_wrq_tx(sc, wr); 937 sb->sb_flags &= ~SB_DDP_INDICATE; 938 toep->ddp_flags |= buf_flag; 939 940 /* 941 * Wait for the DDP operation to complete and then unwire the pages. 942 * The return code from the sbwait will be the final return code of this 943 * function. But we do need to wait for DDP no matter what. 944 */ 945 rc = sbwait(sb); 946 while (toep->ddp_flags & buf_flag) { 947 /* XXXGL: shouldn't here be sbwait() call? */ 948 sb->sb_flags |= SB_WAIT; 949 msleep(&sb->sb_acc, &sb->sb_mtx, PSOCK , "sbwait", 0); 950 } 951 unwire_ddp_buffer(db); 952 return (rc); 953 no_ddp: 954 disable_ddp(sc, toep); 955 discourage_ddp(toep); 956 sb->sb_flags &= ~SB_DDP_INDICATE; 957 return (0); 958 } 959 960 void 961 t4_init_ddp(struct adapter *sc, struct tom_data *td) 962 { 963 964 td->ppod_start = sc->vres.ddp.start; 965 td->ppod_arena = vmem_create("DDP page pods", sc->vres.ddp.start, 966 sc->vres.ddp.size, 1, 32, M_FIRSTFIT | M_NOWAIT); 967 968 t4_register_cpl_handler(sc, CPL_RX_DATA_DDP, do_rx_data_ddp); 969 t4_register_cpl_handler(sc, CPL_RX_DDP_COMPLETE, do_rx_ddp_complete); 970 } 971 972 void 973 t4_uninit_ddp(struct adapter *sc __unused, struct tom_data *td) 974 { 975 976 if (td->ppod_arena != NULL) { 977 vmem_destroy(td->ppod_arena); 978 td->ppod_arena = NULL; 979 } 980 } 981 982 #define VNET_SO_ASSERT(so) \ 983 VNET_ASSERT(curvnet != NULL, \ 984 ("%s:%d curvnet is NULL, so=%p", __func__, __LINE__, (so))); 985 #define SBLOCKWAIT(f) (((f) & MSG_DONTWAIT) ? 0 : SBL_WAIT) 986 static int 987 soreceive_rcvoob(struct socket *so, struct uio *uio, int flags) 988 { 989 990 CXGBE_UNIMPLEMENTED(__func__); 991 } 992 993 static char ddp_magic_str[] = "nothing to see here"; 994 995 struct mbuf * 996 get_ddp_mbuf(int len) 997 { 998 struct mbuf *m; 999 1000 m = m_get(M_NOWAIT, MT_DATA); 1001 if (m == NULL) 1002 CXGBE_UNIMPLEMENTED("mbuf alloc failure"); 1003 m->m_len = len; 1004 m->m_data = &ddp_magic_str[0]; 1005 1006 return (m); 1007 } 1008 1009 static inline int 1010 is_ddp_mbuf(struct mbuf *m) 1011 { 1012 1013 return (m->m_data == &ddp_magic_str[0]); 1014 } 1015 1016 /* 1017 * Copy an mbuf chain into a uio limited by len if set. 1018 */ 1019 static int 1020 m_mbuftouio_ddp(struct uio *uio, struct mbuf *m, int len) 1021 { 1022 int error, length, total; 1023 int progress = 0; 1024 1025 if (len > 0) 1026 total = min(uio->uio_resid, len); 1027 else 1028 total = uio->uio_resid; 1029 1030 /* Fill the uio with data from the mbufs. */ 1031 for (; m != NULL; m = m->m_next) { 1032 length = min(m->m_len, total - progress); 1033 1034 if (is_ddp_mbuf(m)) { 1035 enum uio_seg segflag = uio->uio_segflg; 1036 1037 uio->uio_segflg = UIO_NOCOPY; 1038 error = uiomove(mtod(m, void *), length, uio); 1039 uio->uio_segflg = segflag; 1040 } else 1041 error = uiomove(mtod(m, void *), length, uio); 1042 if (error) 1043 return (error); 1044 1045 progress += length; 1046 } 1047 1048 return (0); 1049 } 1050 1051 /* 1052 * Based on soreceive_stream() in uipc_socket.c 1053 */ 1054 int 1055 t4_soreceive_ddp(struct socket *so, struct sockaddr **psa, struct uio *uio, 1056 struct mbuf **mp0, struct mbuf **controlp, int *flagsp) 1057 { 1058 int len = 0, error = 0, flags, oresid, ddp_handled = 0; 1059 struct sockbuf *sb; 1060 struct mbuf *m, *n = NULL; 1061 1062 /* We only do stream sockets. */ 1063 if (so->so_type != SOCK_STREAM) 1064 return (EINVAL); 1065 if (psa != NULL) 1066 *psa = NULL; 1067 if (controlp != NULL) 1068 return (EINVAL); 1069 if (flagsp != NULL) 1070 flags = *flagsp &~ MSG_EOR; 1071 else 1072 flags = 0; 1073 if (flags & MSG_OOB) 1074 return (soreceive_rcvoob(so, uio, flags)); 1075 if (mp0 != NULL) 1076 *mp0 = NULL; 1077 1078 sb = &so->so_rcv; 1079 1080 /* Prevent other readers from entering the socket. */ 1081 error = sblock(sb, SBLOCKWAIT(flags)); 1082 SOCKBUF_LOCK(sb); 1083 if (error) 1084 goto out; 1085 1086 /* Easy one, no space to copyout anything. */ 1087 if (uio->uio_resid == 0) { 1088 error = EINVAL; 1089 goto out; 1090 } 1091 oresid = uio->uio_resid; 1092 1093 /* We will never ever get anything unless we are or were connected. */ 1094 if (!(so->so_state & (SS_ISCONNECTED|SS_ISDISCONNECTED))) { 1095 error = ENOTCONN; 1096 goto out; 1097 } 1098 1099 restart: 1100 SOCKBUF_LOCK_ASSERT(&so->so_rcv); 1101 1102 if (sb->sb_flags & SB_DDP_INDICATE && !ddp_handled) { 1103 1104 /* uio should be just as it was at entry */ 1105 KASSERT(oresid == uio->uio_resid, 1106 ("%s: oresid = %d, uio_resid = %zd, sbavail = %d", 1107 __func__, oresid, uio->uio_resid, sbavail(sb))); 1108 1109 error = handle_ddp(so, uio, flags, 0); 1110 ddp_handled = 1; 1111 if (error) 1112 goto out; 1113 } 1114 1115 /* Abort if socket has reported problems. */ 1116 if (so->so_error) { 1117 if (sbavail(sb)) 1118 goto deliver; 1119 if (oresid > uio->uio_resid) 1120 goto out; 1121 error = so->so_error; 1122 if (!(flags & MSG_PEEK)) 1123 so->so_error = 0; 1124 goto out; 1125 } 1126 1127 /* Door is closed. Deliver what is left, if any. */ 1128 if (sb->sb_state & SBS_CANTRCVMORE) { 1129 if (sbavail(sb)) 1130 goto deliver; 1131 else 1132 goto out; 1133 } 1134 1135 /* Socket buffer is empty and we shall not block. */ 1136 if (sbavail(sb) == 0 && 1137 ((so->so_state & SS_NBIO) || (flags & (MSG_DONTWAIT|MSG_NBIO)))) { 1138 error = EAGAIN; 1139 goto out; 1140 } 1141 1142 /* Socket buffer got some data that we shall deliver now. */ 1143 if (sbavail(sb) > 0 && !(flags & MSG_WAITALL) && 1144 ((so->so_state & SS_NBIO) || 1145 (flags & (MSG_DONTWAIT|MSG_NBIO)) || 1146 sbavail(sb) >= sb->sb_lowat || 1147 sbavail(sb) >= uio->uio_resid || 1148 sbavail(sb) >= sb->sb_hiwat) ) { 1149 goto deliver; 1150 } 1151 1152 /* On MSG_WAITALL we must wait until all data or error arrives. */ 1153 if ((flags & MSG_WAITALL) && 1154 (sbavail(sb) >= uio->uio_resid || sbavail(sb) >= sb->sb_lowat)) 1155 goto deliver; 1156 1157 /* 1158 * Wait and block until (more) data comes in. 1159 * NB: Drops the sockbuf lock during wait. 1160 */ 1161 error = sbwait(sb); 1162 if (error) { 1163 if (sb->sb_flags & SB_DDP_INDICATE && !ddp_handled) { 1164 (void) handle_ddp(so, uio, flags, 1); 1165 ddp_handled = 1; 1166 } 1167 goto out; 1168 } 1169 goto restart; 1170 1171 deliver: 1172 SOCKBUF_LOCK_ASSERT(&so->so_rcv); 1173 KASSERT(sbavail(sb) > 0, ("%s: sockbuf empty", __func__)); 1174 KASSERT(sb->sb_mb != NULL, ("%s: sb_mb == NULL", __func__)); 1175 1176 if (sb->sb_flags & SB_DDP_INDICATE && !ddp_handled) 1177 goto restart; 1178 1179 /* Statistics. */ 1180 if (uio->uio_td) 1181 uio->uio_td->td_ru.ru_msgrcv++; 1182 1183 /* Fill uio until full or current end of socket buffer is reached. */ 1184 len = min(uio->uio_resid, sbavail(sb)); 1185 if (mp0 != NULL) { 1186 /* Dequeue as many mbufs as possible. */ 1187 if (!(flags & MSG_PEEK) && len >= sb->sb_mb->m_len) { 1188 for (*mp0 = m = sb->sb_mb; 1189 m != NULL && m->m_len <= len; 1190 m = m->m_next) { 1191 len -= m->m_len; 1192 uio->uio_resid -= m->m_len; 1193 sbfree(sb, m); 1194 n = m; 1195 } 1196 sb->sb_mb = m; 1197 if (sb->sb_mb == NULL) 1198 SB_EMPTY_FIXUP(sb); 1199 n->m_next = NULL; 1200 } 1201 /* Copy the remainder. */ 1202 if (len > 0) { 1203 KASSERT(sb->sb_mb != NULL, 1204 ("%s: len > 0 && sb->sb_mb empty", __func__)); 1205 1206 m = m_copym(sb->sb_mb, 0, len, M_NOWAIT); 1207 if (m == NULL) 1208 len = 0; /* Don't flush data from sockbuf. */ 1209 else 1210 uio->uio_resid -= m->m_len; 1211 if (*mp0 != NULL) 1212 n->m_next = m; 1213 else 1214 *mp0 = m; 1215 if (*mp0 == NULL) { 1216 error = ENOBUFS; 1217 goto out; 1218 } 1219 } 1220 } else { 1221 /* NB: Must unlock socket buffer as uiomove may sleep. */ 1222 SOCKBUF_UNLOCK(sb); 1223 error = m_mbuftouio_ddp(uio, sb->sb_mb, len); 1224 SOCKBUF_LOCK(sb); 1225 if (error) 1226 goto out; 1227 } 1228 SBLASTRECORDCHK(sb); 1229 SBLASTMBUFCHK(sb); 1230 1231 /* 1232 * Remove the delivered data from the socket buffer unless we 1233 * were only peeking. 1234 */ 1235 if (!(flags & MSG_PEEK)) { 1236 if (len > 0) 1237 sbdrop_locked(sb, len); 1238 1239 /* Notify protocol that we drained some data. */ 1240 if ((so->so_proto->pr_flags & PR_WANTRCVD) && 1241 (((flags & MSG_WAITALL) && uio->uio_resid > 0) || 1242 !(flags & MSG_SOCALLBCK))) { 1243 SOCKBUF_UNLOCK(sb); 1244 VNET_SO_ASSERT(so); 1245 (*so->so_proto->pr_usrreqs->pru_rcvd)(so, flags); 1246 SOCKBUF_LOCK(sb); 1247 } 1248 } 1249 1250 /* 1251 * For MSG_WAITALL we may have to loop again and wait for 1252 * more data to come in. 1253 */ 1254 if ((flags & MSG_WAITALL) && uio->uio_resid > 0) 1255 goto restart; 1256 out: 1257 SOCKBUF_LOCK_ASSERT(sb); 1258 SBLASTRECORDCHK(sb); 1259 SBLASTMBUFCHK(sb); 1260 SOCKBUF_UNLOCK(sb); 1261 sbunlock(sb); 1262 return (error); 1263 } 1264 1265 #endif 1266