1 /*- 2 * Copyright (c) 2012 Chelsio Communications, Inc. 3 * All rights reserved. 4 * Written by: Navdeep Parhar <np@FreeBSD.org> 5 * 6 * Redistribution and use in source and binary forms, with or without 7 * modification, are permitted provided that the following conditions 8 * are met: 9 * 1. Redistributions of source code must retain the above copyright 10 * notice, this list of conditions and the following disclaimer. 11 * 2. Redistributions in binary form must reproduce the above copyright 12 * notice, this list of conditions and the following disclaimer in the 13 * documentation and/or other materials provided with the distribution. 14 * 15 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 16 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 18 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 19 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 20 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 21 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 22 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 23 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 24 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 25 * SUCH DAMAGE. 26 */ 27 28 #include <sys/cdefs.h> 29 __FBSDID("$FreeBSD$"); 30 31 #include "opt_inet.h" 32 33 #include <sys/param.h> 34 #include <sys/types.h> 35 #include <sys/systm.h> 36 #include <sys/kernel.h> 37 #include <sys/ktr.h> 38 #include <sys/module.h> 39 #include <sys/protosw.h> 40 #include <sys/proc.h> 41 #include <sys/domain.h> 42 #include <sys/socket.h> 43 #include <sys/socketvar.h> 44 #include <sys/uio.h> 45 #include <netinet/in.h> 46 #include <netinet/in_pcb.h> 47 #include <netinet/ip.h> 48 #include <netinet/tcp_var.h> 49 #define TCPSTATES 50 #include <netinet/tcp_fsm.h> 51 #include <netinet/toecore.h> 52 53 #include <vm/vm.h> 54 #include <vm/vm_extern.h> 55 #include <vm/vm_param.h> 56 #include <vm/pmap.h> 57 #include <vm/vm_map.h> 58 #include <vm/vm_page.h> 59 #include <vm/vm_object.h> 60 61 #ifdef TCP_OFFLOAD 62 #include "common/common.h" 63 #include "common/t4_msg.h" 64 #include "common/t4_regs.h" 65 #include "common/t4_tcb.h" 66 #include "tom/t4_tom.h" 67 68 VNET_DECLARE(int, tcp_do_autorcvbuf); 69 #define V_tcp_do_autorcvbuf VNET(tcp_do_autorcvbuf) 70 VNET_DECLARE(int, tcp_autorcvbuf_inc); 71 #define V_tcp_autorcvbuf_inc VNET(tcp_autorcvbuf_inc) 72 VNET_DECLARE(int, tcp_autorcvbuf_max); 73 #define V_tcp_autorcvbuf_max VNET(tcp_autorcvbuf_max) 74 75 static struct mbuf *get_ddp_mbuf(int len); 76 77 #define PPOD_SZ(n) ((n) * sizeof(struct pagepod)) 78 #define PPOD_SIZE (PPOD_SZ(1)) 79 80 /* XXX: must match A_ULP_RX_TDDP_PSZ */ 81 static int t4_ddp_pgsz[] = {4096, 4096 << 2, 4096 << 4, 4096 << 6}; 82 83 #if 0 84 static void 85 t4_dump_tcb(struct adapter *sc, int tid) 86 { 87 uint32_t tcb_base, off, i, j; 88 89 /* Dump TCB for the tid */ 90 tcb_base = t4_read_reg(sc, A_TP_CMM_TCB_BASE); 91 t4_write_reg(sc, PCIE_MEM_ACCESS_REG(A_PCIE_MEM_ACCESS_OFFSET, 2), 92 tcb_base + tid * TCB_SIZE); 93 t4_read_reg(sc, PCIE_MEM_ACCESS_REG(A_PCIE_MEM_ACCESS_OFFSET, 2)); 94 off = 0; 95 printf("\n"); 96 for (i = 0; i < 4; i++) { 97 uint32_t buf[8]; 98 for (j = 0; j < 8; j++, off += 4) 99 buf[j] = htonl(t4_read_reg(sc, MEMWIN2_BASE + off)); 100 101 printf("%08x %08x %08x %08x %08x %08x %08x %08x\n", 102 buf[0], buf[1], buf[2], buf[3], buf[4], buf[5], buf[6], 103 buf[7]); 104 } 105 } 106 #endif 107 108 #define MAX_DDP_BUFFER_SIZE (M_TCB_RX_DDP_BUF0_LEN) 109 static int 110 alloc_ppods(struct tom_data *td, int n, u_int *ppod_addr) 111 { 112 vmem_addr_t v; 113 int rc; 114 115 MPASS(n > 0); 116 117 rc = vmem_alloc(td->ppod_arena, PPOD_SZ(n), M_NOWAIT | M_FIRSTFIT, &v); 118 *ppod_addr = (u_int)v; 119 120 return (rc); 121 } 122 123 static void 124 free_ppods(struct tom_data *td, u_int ppod_addr, int n) 125 { 126 127 MPASS(n > 0); 128 129 vmem_free(td->ppod_arena, (vmem_addr_t)ppod_addr, PPOD_SZ(n)); 130 } 131 132 static inline int 133 pages_to_nppods(int npages, int ddp_pgsz) 134 { 135 int nsegs = npages * PAGE_SIZE / ddp_pgsz; 136 137 return (howmany(nsegs, PPOD_PAGES)); 138 } 139 140 static void 141 free_ddp_buffer(struct tom_data *td, struct ddp_buffer *db) 142 { 143 144 if (db == NULL) 145 return; 146 147 if (db->pages) 148 free(db->pages, M_CXGBE); 149 150 if (db->nppods > 0) 151 free_ppods(td, db->ppod_addr, db->nppods); 152 153 free(db, M_CXGBE); 154 } 155 156 void 157 release_ddp_resources(struct toepcb *toep) 158 { 159 int i; 160 161 for (i = 0; i < nitems(toep->db); i++) { 162 if (toep->db[i] != NULL) { 163 free_ddp_buffer(toep->td, toep->db[i]); 164 toep->db[i] = NULL; 165 } 166 } 167 } 168 169 /* XXX: handle_ddp_data code duplication */ 170 void 171 insert_ddp_data(struct toepcb *toep, uint32_t n) 172 { 173 struct inpcb *inp = toep->inp; 174 struct tcpcb *tp = intotcpcb(inp); 175 struct sockbuf *sb = &inp->inp_socket->so_rcv; 176 struct mbuf *m; 177 178 INP_WLOCK_ASSERT(inp); 179 SOCKBUF_LOCK_ASSERT(sb); 180 181 m = get_ddp_mbuf(n); 182 tp->rcv_nxt += n; 183 #ifndef USE_DDP_RX_FLOW_CONTROL 184 KASSERT(tp->rcv_wnd >= n, ("%s: negative window size", __func__)); 185 tp->rcv_wnd -= n; 186 #endif 187 188 KASSERT(toep->sb_cc >= sbused(sb), 189 ("%s: sb %p has more data (%d) than last time (%d).", 190 __func__, sb, sbused(sb), toep->sb_cc)); 191 toep->rx_credits += toep->sb_cc - sbused(sb); 192 #ifdef USE_DDP_RX_FLOW_CONTROL 193 toep->rx_credits -= n; /* adjust for F_RX_FC_DDP */ 194 #endif 195 sbappendstream_locked(sb, m, 0); 196 toep->sb_cc = sbused(sb); 197 } 198 199 /* SET_TCB_FIELD sent as a ULP command looks like this */ 200 #define LEN__SET_TCB_FIELD_ULP (sizeof(struct ulp_txpkt) + \ 201 sizeof(struct ulptx_idata) + sizeof(struct cpl_set_tcb_field_core)) 202 203 /* RX_DATA_ACK sent as a ULP command looks like this */ 204 #define LEN__RX_DATA_ACK_ULP (sizeof(struct ulp_txpkt) + \ 205 sizeof(struct ulptx_idata) + sizeof(struct cpl_rx_data_ack_core)) 206 207 static inline void * 208 mk_set_tcb_field_ulp(struct ulp_txpkt *ulpmc, struct toepcb *toep, 209 uint64_t word, uint64_t mask, uint64_t val) 210 { 211 struct ulptx_idata *ulpsc; 212 struct cpl_set_tcb_field_core *req; 213 214 ulpmc->cmd_dest = htonl(V_ULPTX_CMD(ULP_TX_PKT) | V_ULP_TXPKT_DEST(0)); 215 ulpmc->len = htobe32(howmany(LEN__SET_TCB_FIELD_ULP, 16)); 216 217 ulpsc = (struct ulptx_idata *)(ulpmc + 1); 218 ulpsc->cmd_more = htobe32(V_ULPTX_CMD(ULP_TX_SC_IMM)); 219 ulpsc->len = htobe32(sizeof(*req)); 220 221 req = (struct cpl_set_tcb_field_core *)(ulpsc + 1); 222 OPCODE_TID(req) = htobe32(MK_OPCODE_TID(CPL_SET_TCB_FIELD, toep->tid)); 223 req->reply_ctrl = htobe16(V_NO_REPLY(1) | 224 V_QUEUENO(toep->ofld_rxq->iq.abs_id)); 225 req->word_cookie = htobe16(V_WORD(word) | V_COOKIE(0)); 226 req->mask = htobe64(mask); 227 req->val = htobe64(val); 228 229 ulpsc = (struct ulptx_idata *)(req + 1); 230 if (LEN__SET_TCB_FIELD_ULP % 16) { 231 ulpsc->cmd_more = htobe32(V_ULPTX_CMD(ULP_TX_SC_NOOP)); 232 ulpsc->len = htobe32(0); 233 return (ulpsc + 1); 234 } 235 return (ulpsc); 236 } 237 238 static inline void * 239 mk_rx_data_ack_ulp(struct ulp_txpkt *ulpmc, struct toepcb *toep) 240 { 241 struct ulptx_idata *ulpsc; 242 struct cpl_rx_data_ack_core *req; 243 244 ulpmc->cmd_dest = htonl(V_ULPTX_CMD(ULP_TX_PKT) | V_ULP_TXPKT_DEST(0)); 245 ulpmc->len = htobe32(howmany(LEN__RX_DATA_ACK_ULP, 16)); 246 247 ulpsc = (struct ulptx_idata *)(ulpmc + 1); 248 ulpsc->cmd_more = htobe32(V_ULPTX_CMD(ULP_TX_SC_IMM)); 249 ulpsc->len = htobe32(sizeof(*req)); 250 251 req = (struct cpl_rx_data_ack_core *)(ulpsc + 1); 252 OPCODE_TID(req) = htobe32(MK_OPCODE_TID(CPL_RX_DATA_ACK, toep->tid)); 253 req->credit_dack = htobe32(F_RX_MODULATE_RX); 254 255 ulpsc = (struct ulptx_idata *)(req + 1); 256 if (LEN__RX_DATA_ACK_ULP % 16) { 257 ulpsc->cmd_more = htobe32(V_ULPTX_CMD(ULP_TX_SC_NOOP)); 258 ulpsc->len = htobe32(0); 259 return (ulpsc + 1); 260 } 261 return (ulpsc); 262 } 263 264 static inline uint64_t 265 select_ddp_flags(struct socket *so, int flags, int db_idx) 266 { 267 uint64_t ddp_flags = V_TF_DDP_INDICATE_OUT(0); 268 int waitall = flags & MSG_WAITALL; 269 int nb = so->so_state & SS_NBIO || flags & (MSG_DONTWAIT | MSG_NBIO); 270 271 KASSERT(db_idx == 0 || db_idx == 1, 272 ("%s: bad DDP buffer index %d", __func__, db_idx)); 273 274 if (db_idx == 0) { 275 ddp_flags |= V_TF_DDP_BUF0_VALID(1) | V_TF_DDP_ACTIVE_BUF(0); 276 if (waitall) 277 ddp_flags |= V_TF_DDP_PUSH_DISABLE_0(1); 278 else if (nb) 279 ddp_flags |= V_TF_DDP_BUF0_FLUSH(1); 280 else 281 ddp_flags |= V_TF_DDP_BUF0_FLUSH(0); 282 } else { 283 ddp_flags |= V_TF_DDP_BUF1_VALID(1) | V_TF_DDP_ACTIVE_BUF(1); 284 if (waitall) 285 ddp_flags |= V_TF_DDP_PUSH_DISABLE_1(1); 286 else if (nb) 287 ddp_flags |= V_TF_DDP_BUF1_FLUSH(1); 288 else 289 ddp_flags |= V_TF_DDP_BUF1_FLUSH(0); 290 } 291 292 return (ddp_flags); 293 } 294 295 static struct wrqe * 296 mk_update_tcb_for_ddp(struct adapter *sc, struct toepcb *toep, int db_idx, 297 int offset, uint64_t ddp_flags) 298 { 299 struct ddp_buffer *db = toep->db[db_idx]; 300 struct wrqe *wr; 301 struct work_request_hdr *wrh; 302 struct ulp_txpkt *ulpmc; 303 int len; 304 305 KASSERT(db_idx == 0 || db_idx == 1, 306 ("%s: bad DDP buffer index %d", __func__, db_idx)); 307 308 /* 309 * We'll send a compound work request that has 3 SET_TCB_FIELDs and an 310 * RX_DATA_ACK (with RX_MODULATE to speed up delivery). 311 * 312 * The work request header is 16B and always ends at a 16B boundary. 313 * The ULPTX master commands that follow must all end at 16B boundaries 314 * too so we round up the size to 16. 315 */ 316 len = sizeof(*wrh) + 3 * roundup2(LEN__SET_TCB_FIELD_ULP, 16) + 317 roundup2(LEN__RX_DATA_ACK_ULP, 16); 318 319 wr = alloc_wrqe(len, toep->ctrlq); 320 if (wr == NULL) 321 return (NULL); 322 wrh = wrtod(wr); 323 INIT_ULPTX_WRH(wrh, len, 1, 0); /* atomic */ 324 ulpmc = (struct ulp_txpkt *)(wrh + 1); 325 326 /* Write the buffer's tag */ 327 ulpmc = mk_set_tcb_field_ulp(ulpmc, toep, 328 W_TCB_RX_DDP_BUF0_TAG + db_idx, 329 V_TCB_RX_DDP_BUF0_TAG(M_TCB_RX_DDP_BUF0_TAG), 330 V_TCB_RX_DDP_BUF0_TAG(db->tag)); 331 332 /* Update the current offset in the DDP buffer and its total length */ 333 if (db_idx == 0) 334 ulpmc = mk_set_tcb_field_ulp(ulpmc, toep, 335 W_TCB_RX_DDP_BUF0_OFFSET, 336 V_TCB_RX_DDP_BUF0_OFFSET(M_TCB_RX_DDP_BUF0_OFFSET) | 337 V_TCB_RX_DDP_BUF0_LEN(M_TCB_RX_DDP_BUF0_LEN), 338 V_TCB_RX_DDP_BUF0_OFFSET(offset) | 339 V_TCB_RX_DDP_BUF0_LEN(db->len)); 340 else 341 ulpmc = mk_set_tcb_field_ulp(ulpmc, toep, 342 W_TCB_RX_DDP_BUF1_OFFSET, 343 V_TCB_RX_DDP_BUF1_OFFSET(M_TCB_RX_DDP_BUF1_OFFSET) | 344 V_TCB_RX_DDP_BUF1_LEN((u64)M_TCB_RX_DDP_BUF1_LEN << 32), 345 V_TCB_RX_DDP_BUF1_OFFSET(offset) | 346 V_TCB_RX_DDP_BUF1_LEN((u64)db->len << 32)); 347 348 /* Update DDP flags */ 349 ulpmc = mk_set_tcb_field_ulp(ulpmc, toep, W_TCB_RX_DDP_FLAGS, 350 V_TF_DDP_BUF0_FLUSH(1) | V_TF_DDP_BUF1_FLUSH(1) | 351 V_TF_DDP_PUSH_DISABLE_0(1) | V_TF_DDP_PUSH_DISABLE_1(1) | 352 V_TF_DDP_BUF0_VALID(1) | V_TF_DDP_BUF1_VALID(1) | 353 V_TF_DDP_ACTIVE_BUF(1) | V_TF_DDP_INDICATE_OUT(1), ddp_flags); 354 355 /* Gratuitous RX_DATA_ACK with RX_MODULATE set to speed up delivery. */ 356 ulpmc = mk_rx_data_ack_ulp(ulpmc, toep); 357 358 return (wr); 359 } 360 361 static void 362 discourage_ddp(struct toepcb *toep) 363 { 364 365 if (toep->ddp_score && --toep->ddp_score == 0) { 366 toep->ddp_flags &= ~DDP_OK; 367 toep->ddp_disabled = time_uptime; 368 CTR3(KTR_CXGBE, "%s: tid %u !DDP_OK @ %u", 369 __func__, toep->tid, time_uptime); 370 } 371 } 372 373 static int 374 handle_ddp_data(struct toepcb *toep, __be32 ddp_report, __be32 rcv_nxt, int len) 375 { 376 uint32_t report = be32toh(ddp_report); 377 unsigned int db_flag; 378 struct inpcb *inp = toep->inp; 379 struct tcpcb *tp; 380 struct socket *so; 381 struct sockbuf *sb; 382 struct mbuf *m; 383 384 db_flag = report & F_DDP_BUF_IDX ? DDP_BUF1_ACTIVE : DDP_BUF0_ACTIVE; 385 386 if (__predict_false(!(report & F_DDP_INV))) 387 CXGBE_UNIMPLEMENTED("DDP buffer still valid"); 388 389 INP_WLOCK(inp); 390 so = inp_inpcbtosocket(inp); 391 sb = &so->so_rcv; 392 if (__predict_false(inp->inp_flags & (INP_DROPPED | INP_TIMEWAIT))) { 393 394 /* 395 * XXX: think a bit more. 396 * tcpcb probably gone, but socket should still be around 397 * because we always wait for DDP completion in soreceive no 398 * matter what. Just wake it up and let it clean up. 399 */ 400 401 CTR5(KTR_CXGBE, "%s: tid %u, seq 0x%x, len %d, inp_flags 0x%x", 402 __func__, toep->tid, be32toh(rcv_nxt), len, inp->inp_flags); 403 SOCKBUF_LOCK(sb); 404 goto wakeup; 405 } 406 407 tp = intotcpcb(inp); 408 409 /* 410 * For RX_DDP_COMPLETE, len will be zero and rcv_nxt is the 411 * sequence number of the next byte to receive. The length of 412 * the data received for this message must be computed by 413 * comparing the new and old values of rcv_nxt. 414 * 415 * For RX_DATA_DDP, len might be non-zero, but it is only the 416 * length of the most recent DMA. It does not include the 417 * total length of the data received since the previous update 418 * for this DDP buffer. rcv_nxt is the sequence number of the 419 * first received byte from the most recent DMA. 420 */ 421 len += be32toh(rcv_nxt) - tp->rcv_nxt; 422 tp->rcv_nxt += len; 423 tp->t_rcvtime = ticks; 424 #ifndef USE_DDP_RX_FLOW_CONTROL 425 KASSERT(tp->rcv_wnd >= len, ("%s: negative window size", __func__)); 426 tp->rcv_wnd -= len; 427 #endif 428 m = get_ddp_mbuf(len); 429 430 SOCKBUF_LOCK(sb); 431 if (report & F_DDP_BUF_COMPLETE) 432 toep->ddp_score = DDP_HIGH_SCORE; 433 else 434 discourage_ddp(toep); 435 436 /* receive buffer autosize */ 437 if (sb->sb_flags & SB_AUTOSIZE && 438 V_tcp_do_autorcvbuf && 439 sb->sb_hiwat < V_tcp_autorcvbuf_max && 440 len > (sbspace(sb) / 8 * 7)) { 441 unsigned int hiwat = sb->sb_hiwat; 442 unsigned int newsize = min(hiwat + V_tcp_autorcvbuf_inc, 443 V_tcp_autorcvbuf_max); 444 445 if (!sbreserve_locked(sb, newsize, so, NULL)) 446 sb->sb_flags &= ~SB_AUTOSIZE; 447 else 448 toep->rx_credits += newsize - hiwat; 449 } 450 451 KASSERT(toep->sb_cc >= sbused(sb), 452 ("%s: sb %p has more data (%d) than last time (%d).", 453 __func__, sb, sbused(sb), toep->sb_cc)); 454 toep->rx_credits += toep->sb_cc - sbused(sb); 455 #ifdef USE_DDP_RX_FLOW_CONTROL 456 toep->rx_credits -= len; /* adjust for F_RX_FC_DDP */ 457 #endif 458 sbappendstream_locked(sb, m, 0); 459 toep->sb_cc = sbused(sb); 460 wakeup: 461 KASSERT(toep->ddp_flags & db_flag, 462 ("%s: DDP buffer not active. toep %p, ddp_flags 0x%x, report 0x%x", 463 __func__, toep, toep->ddp_flags, report)); 464 toep->ddp_flags &= ~db_flag; 465 sorwakeup_locked(so); 466 SOCKBUF_UNLOCK_ASSERT(sb); 467 468 INP_WUNLOCK(inp); 469 return (0); 470 } 471 472 void 473 handle_ddp_close(struct toepcb *toep, struct tcpcb *tp, struct sockbuf *sb, 474 __be32 rcv_nxt) 475 { 476 struct mbuf *m; 477 int len; 478 479 SOCKBUF_LOCK_ASSERT(sb); 480 INP_WLOCK_ASSERT(toep->inp); 481 len = be32toh(rcv_nxt) - tp->rcv_nxt; 482 483 /* Signal handle_ddp() to break out of its sleep loop. */ 484 toep->ddp_flags &= ~(DDP_BUF0_ACTIVE | DDP_BUF1_ACTIVE); 485 if (len == 0) 486 return; 487 488 tp->rcv_nxt += len; 489 KASSERT(toep->sb_cc >= sbused(sb), 490 ("%s: sb %p has more data (%d) than last time (%d).", 491 __func__, sb, sbused(sb), toep->sb_cc)); 492 toep->rx_credits += toep->sb_cc - sbused(sb); 493 #ifdef USE_DDP_RX_FLOW_CONTROL 494 toep->rx_credits -= len; /* adjust for F_RX_FC_DDP */ 495 #endif 496 497 m = get_ddp_mbuf(len); 498 499 sbappendstream_locked(sb, m, 0); 500 toep->sb_cc = sbused(sb); 501 } 502 503 #define DDP_ERR (F_DDP_PPOD_MISMATCH | F_DDP_LLIMIT_ERR | F_DDP_ULIMIT_ERR |\ 504 F_DDP_PPOD_PARITY_ERR | F_DDP_PADDING_ERR | F_DDP_OFFSET_ERR |\ 505 F_DDP_INVALID_TAG | F_DDP_COLOR_ERR | F_DDP_TID_MISMATCH |\ 506 F_DDP_INVALID_PPOD | F_DDP_HDRCRC_ERR | F_DDP_DATACRC_ERR) 507 508 static int 509 do_rx_data_ddp(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m) 510 { 511 struct adapter *sc = iq->adapter; 512 const struct cpl_rx_data_ddp *cpl = (const void *)(rss + 1); 513 unsigned int tid = GET_TID(cpl); 514 uint32_t vld; 515 struct toepcb *toep = lookup_tid(sc, tid); 516 struct tom_data *td = toep->td; 517 518 KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__)); 519 KASSERT(toep->tid == tid, ("%s: toep tid/atid mismatch", __func__)); 520 KASSERT(!(toep->flags & TPF_SYNQE), 521 ("%s: toep %p claims to be a synq entry", __func__, toep)); 522 523 vld = be32toh(cpl->ddpvld); 524 if (__predict_false(vld & DDP_ERR)) { 525 panic("%s: DDP error 0x%x (tid %d, toep %p)", 526 __func__, vld, tid, toep); 527 } 528 if (toep->ulp_mode == ULP_MODE_ISCSI) { 529 m = m_get(M_NOWAIT, MT_DATA); 530 if (m == NULL) 531 CXGBE_UNIMPLEMENTED("mbuf alloc failure"); 532 memcpy(mtod(m, unsigned char *), cpl, 533 sizeof(struct cpl_rx_data_ddp)); 534 if (!t4_cpl_iscsi_callback(td, toep, m, CPL_RX_DATA_DDP)) 535 return (0); 536 m_freem(m); 537 } 538 539 handle_ddp_data(toep, cpl->u.ddp_report, cpl->seq, be16toh(cpl->len)); 540 541 return (0); 542 } 543 544 static int 545 do_rx_ddp_complete(struct sge_iq *iq, const struct rss_header *rss, 546 struct mbuf *m) 547 { 548 struct adapter *sc = iq->adapter; 549 const struct cpl_rx_ddp_complete *cpl = (const void *)(rss + 1); 550 unsigned int tid = GET_TID(cpl); 551 struct toepcb *toep = lookup_tid(sc, tid); 552 553 KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__)); 554 KASSERT(toep->tid == tid, ("%s: toep tid/atid mismatch", __func__)); 555 KASSERT(!(toep->flags & TPF_SYNQE), 556 ("%s: toep %p claims to be a synq entry", __func__, toep)); 557 558 handle_ddp_data(toep, cpl->ddp_report, cpl->rcv_nxt, 0); 559 560 return (0); 561 } 562 563 void 564 enable_ddp(struct adapter *sc, struct toepcb *toep) 565 { 566 567 KASSERT((toep->ddp_flags & (DDP_ON | DDP_OK | DDP_SC_REQ)) == DDP_OK, 568 ("%s: toep %p has bad ddp_flags 0x%x", 569 __func__, toep, toep->ddp_flags)); 570 571 CTR3(KTR_CXGBE, "%s: tid %u (time %u)", 572 __func__, toep->tid, time_uptime); 573 574 toep->ddp_flags |= DDP_SC_REQ; 575 t4_set_tcb_field(sc, toep, 1, W_TCB_RX_DDP_FLAGS, 576 V_TF_DDP_OFF(1) | V_TF_DDP_INDICATE_OUT(1) | 577 V_TF_DDP_BUF0_INDICATE(1) | V_TF_DDP_BUF1_INDICATE(1) | 578 V_TF_DDP_BUF0_VALID(1) | V_TF_DDP_BUF1_VALID(1), 579 V_TF_DDP_BUF0_INDICATE(1) | V_TF_DDP_BUF1_INDICATE(1)); 580 t4_set_tcb_field(sc, toep, 1, W_TCB_T_FLAGS, 581 V_TF_RCV_COALESCE_ENABLE(1), 0); 582 } 583 584 static inline void 585 disable_ddp(struct adapter *sc, struct toepcb *toep) 586 { 587 588 KASSERT((toep->ddp_flags & (DDP_ON | DDP_SC_REQ)) == DDP_ON, 589 ("%s: toep %p has bad ddp_flags 0x%x", 590 __func__, toep, toep->ddp_flags)); 591 592 CTR3(KTR_CXGBE, "%s: tid %u (time %u)", 593 __func__, toep->tid, time_uptime); 594 595 toep->ddp_flags |= DDP_SC_REQ; 596 t4_set_tcb_field(sc, toep, 1, W_TCB_T_FLAGS, 597 V_TF_RCV_COALESCE_ENABLE(1), V_TF_RCV_COALESCE_ENABLE(1)); 598 t4_set_tcb_field(sc, toep, 1, W_TCB_RX_DDP_FLAGS, V_TF_DDP_OFF(1), 599 V_TF_DDP_OFF(1)); 600 } 601 602 static int 603 hold_uio(struct uio *uio, vm_page_t **ppages, int *pnpages) 604 { 605 struct vm_map *map; 606 struct iovec *iov; 607 vm_offset_t start, end; 608 vm_page_t *pp; 609 int n; 610 611 KASSERT(uio->uio_iovcnt == 1, 612 ("%s: uio_iovcnt %d", __func__, uio->uio_iovcnt)); 613 KASSERT(uio->uio_td->td_proc == curproc, 614 ("%s: uio proc (%p) is not curproc (%p)", 615 __func__, uio->uio_td->td_proc, curproc)); 616 617 map = &curproc->p_vmspace->vm_map; 618 iov = &uio->uio_iov[0]; 619 start = trunc_page((uintptr_t)iov->iov_base); 620 end = round_page((vm_offset_t)iov->iov_base + iov->iov_len); 621 n = howmany(end - start, PAGE_SIZE); 622 623 if (end - start > MAX_DDP_BUFFER_SIZE) 624 return (E2BIG); 625 626 pp = malloc(n * sizeof(vm_page_t), M_CXGBE, M_NOWAIT); 627 if (pp == NULL) 628 return (ENOMEM); 629 630 if (vm_fault_quick_hold_pages(map, (vm_offset_t)iov->iov_base, 631 iov->iov_len, VM_PROT_WRITE, pp, n) < 0) { 632 free(pp, M_CXGBE); 633 return (EFAULT); 634 } 635 636 *ppages = pp; 637 *pnpages = n; 638 639 return (0); 640 } 641 642 static int 643 bufcmp(struct ddp_buffer *db, vm_page_t *pages, int npages, int offset, int len) 644 { 645 int i; 646 647 if (db == NULL || db->npages != npages || db->offset != offset || 648 db->len != len) 649 return (1); 650 651 for (i = 0; i < npages; i++) { 652 if (pages[i]->phys_addr != db->pages[i]->phys_addr) 653 return (1); 654 } 655 656 return (0); 657 } 658 659 static int 660 calculate_hcf(int n1, int n2) 661 { 662 int a, b, t; 663 664 if (n1 <= n2) { 665 a = n1; 666 b = n2; 667 } else { 668 a = n2; 669 b = n1; 670 } 671 672 while (a != 0) { 673 t = a; 674 a = b % a; 675 b = t; 676 } 677 678 return (b); 679 } 680 681 static struct ddp_buffer * 682 alloc_ddp_buffer(struct tom_data *td, vm_page_t *pages, int npages, int offset, 683 int len) 684 { 685 int i, hcf, seglen, idx, ppod, nppods; 686 struct ddp_buffer *db; 687 688 /* 689 * The DDP page size is unrelated to the VM page size. We combine 690 * contiguous physical pages into larger segments to get the best DDP 691 * page size possible. This is the largest of the four sizes in 692 * A_ULP_RX_TDDP_PSZ that evenly divides the HCF of the segment sizes in 693 * the page list. 694 */ 695 hcf = 0; 696 for (i = 0; i < npages; i++) { 697 seglen = PAGE_SIZE; 698 while (i < npages - 1 && 699 pages[i]->phys_addr + PAGE_SIZE == pages[i + 1]->phys_addr) { 700 seglen += PAGE_SIZE; 701 i++; 702 } 703 704 hcf = calculate_hcf(hcf, seglen); 705 if (hcf < t4_ddp_pgsz[1]) { 706 idx = 0; 707 goto have_pgsz; /* give up, short circuit */ 708 } 709 } 710 711 if (hcf % t4_ddp_pgsz[0] != 0) { 712 /* hmmm. This could only happen when PAGE_SIZE < 4K */ 713 KASSERT(PAGE_SIZE < 4096, 714 ("%s: PAGE_SIZE %d, hcf %d", __func__, PAGE_SIZE, hcf)); 715 CTR3(KTR_CXGBE, "%s: PAGE_SIZE %d, hcf %d", 716 __func__, PAGE_SIZE, hcf); 717 return (NULL); 718 } 719 720 for (idx = nitems(t4_ddp_pgsz) - 1; idx > 0; idx--) { 721 if (hcf % t4_ddp_pgsz[idx] == 0) 722 break; 723 } 724 have_pgsz: 725 MPASS(idx <= M_PPOD_PGSZ); 726 727 db = malloc(sizeof(*db), M_CXGBE, M_NOWAIT); 728 if (db == NULL) { 729 CTR1(KTR_CXGBE, "%s: malloc failed.", __func__); 730 return (NULL); 731 } 732 733 nppods = pages_to_nppods(npages, t4_ddp_pgsz[idx]); 734 if (alloc_ppods(td, nppods, &db->ppod_addr) != 0) { 735 free(db, M_CXGBE); 736 CTR4(KTR_CXGBE, "%s: no pods, nppods %d, resid %d, pgsz %d", 737 __func__, nppods, len, t4_ddp_pgsz[idx]); 738 return (NULL); 739 } 740 ppod = (db->ppod_addr - td->ppod_start) / PPOD_SIZE; 741 742 db->tag = V_PPOD_PGSZ(idx) | V_PPOD_TAG(ppod); 743 db->nppods = nppods; 744 db->npages = npages; 745 db->pages = pages; 746 db->offset = offset; 747 db->len = len; 748 749 CTR6(KTR_CXGBE, "New DDP buffer. " 750 "ddp_pgsz %d, ppod 0x%x, npages %d, nppods %d, offset %d, len %d", 751 t4_ddp_pgsz[idx], ppod, db->npages, db->nppods, db->offset, 752 db->len); 753 754 return (db); 755 } 756 757 #define NUM_ULP_TX_SC_IMM_PPODS (256 / PPOD_SIZE) 758 759 static int 760 write_page_pods(struct adapter *sc, struct toepcb *toep, struct ddp_buffer *db) 761 { 762 struct wrqe *wr; 763 struct ulp_mem_io *ulpmc; 764 struct ulptx_idata *ulpsc; 765 struct pagepod *ppod; 766 int i, j, k, n, chunk, len, ddp_pgsz, idx; 767 u_int ppod_addr; 768 uint32_t cmd; 769 770 cmd = htobe32(V_ULPTX_CMD(ULP_TX_MEM_WRITE)); 771 if (is_t4(sc)) 772 cmd |= htobe32(F_ULP_MEMIO_ORDER); 773 else 774 cmd |= htobe32(F_T5_ULP_MEMIO_IMM); 775 ddp_pgsz = t4_ddp_pgsz[G_PPOD_PGSZ(db->tag)]; 776 ppod_addr = db->ppod_addr; 777 for (i = 0; i < db->nppods; ppod_addr += chunk) { 778 779 /* How many page pods are we writing in this cycle */ 780 n = min(db->nppods - i, NUM_ULP_TX_SC_IMM_PPODS); 781 chunk = PPOD_SZ(n); 782 len = roundup2(sizeof(*ulpmc) + sizeof(*ulpsc) + chunk, 16); 783 784 wr = alloc_wrqe(len, toep->ctrlq); 785 if (wr == NULL) 786 return (ENOMEM); /* ok to just bail out */ 787 ulpmc = wrtod(wr); 788 789 INIT_ULPTX_WR(ulpmc, len, 0, 0); 790 ulpmc->cmd = cmd; 791 ulpmc->dlen = htobe32(V_ULP_MEMIO_DATA_LEN(chunk / 32)); 792 ulpmc->len16 = htobe32(howmany(len - sizeof(ulpmc->wr), 16)); 793 ulpmc->lock_addr = htobe32(V_ULP_MEMIO_ADDR(ppod_addr >> 5)); 794 795 ulpsc = (struct ulptx_idata *)(ulpmc + 1); 796 ulpsc->cmd_more = htobe32(V_ULPTX_CMD(ULP_TX_SC_IMM)); 797 ulpsc->len = htobe32(chunk); 798 799 ppod = (struct pagepod *)(ulpsc + 1); 800 for (j = 0; j < n; i++, j++, ppod++) { 801 ppod->vld_tid_pgsz_tag_color = htobe64(F_PPOD_VALID | 802 V_PPOD_TID(toep->tid) | db->tag); 803 ppod->len_offset = htobe64(V_PPOD_LEN(db->len) | 804 V_PPOD_OFST(db->offset)); 805 ppod->rsvd = 0; 806 idx = i * PPOD_PAGES * (ddp_pgsz / PAGE_SIZE); 807 for (k = 0; k < nitems(ppod->addr); k++) { 808 if (idx < db->npages) { 809 ppod->addr[k] = 810 htobe64(db->pages[idx]->phys_addr); 811 idx += ddp_pgsz / PAGE_SIZE; 812 } else 813 ppod->addr[k] = 0; 814 #if 0 815 CTR5(KTR_CXGBE, 816 "%s: tid %d ppod[%d]->addr[%d] = %p", 817 __func__, toep->tid, i, k, 818 htobe64(ppod->addr[k])); 819 #endif 820 } 821 822 } 823 824 t4_wrq_tx(sc, wr); 825 } 826 827 return (0); 828 } 829 830 /* 831 * Reuse, or allocate (and program the page pods for) a new DDP buffer. The 832 * "pages" array is handed over to this function and should not be used in any 833 * way by the caller after that. 834 */ 835 static int 836 select_ddp_buffer(struct adapter *sc, struct toepcb *toep, vm_page_t *pages, 837 int npages, int db_off, int db_len) 838 { 839 struct ddp_buffer *db; 840 struct tom_data *td = sc->tom_softc; 841 int i, empty_slot = -1; 842 843 /* Try to reuse */ 844 for (i = 0; i < nitems(toep->db); i++) { 845 if (bufcmp(toep->db[i], pages, npages, db_off, db_len) == 0) { 846 free(pages, M_CXGBE); 847 return (i); /* pages still held */ 848 } else if (toep->db[i] == NULL && empty_slot < 0) 849 empty_slot = i; 850 } 851 852 /* Allocate new buffer, write its page pods. */ 853 db = alloc_ddp_buffer(td, pages, npages, db_off, db_len); 854 if (db == NULL) { 855 vm_page_unhold_pages(pages, npages); 856 free(pages, M_CXGBE); 857 return (-1); 858 } 859 if (write_page_pods(sc, toep, db) != 0) { 860 vm_page_unhold_pages(pages, npages); 861 free_ddp_buffer(td, db); 862 return (-1); 863 } 864 865 i = empty_slot; 866 if (i < 0) { 867 i = arc4random() % nitems(toep->db); 868 free_ddp_buffer(td, toep->db[i]); 869 } 870 toep->db[i] = db; 871 872 CTR5(KTR_CXGBE, "%s: tid %d, DDP buffer[%d] = %p (tag 0x%x)", 873 __func__, toep->tid, i, db, db->tag); 874 875 return (i); 876 } 877 878 static void 879 wire_ddp_buffer(struct ddp_buffer *db) 880 { 881 int i; 882 vm_page_t p; 883 884 for (i = 0; i < db->npages; i++) { 885 p = db->pages[i]; 886 vm_page_lock(p); 887 vm_page_wire(p); 888 vm_page_unhold(p); 889 vm_page_unlock(p); 890 } 891 } 892 893 static void 894 unwire_ddp_buffer(struct ddp_buffer *db) 895 { 896 int i; 897 vm_page_t p; 898 899 for (i = 0; i < db->npages; i++) { 900 p = db->pages[i]; 901 vm_page_lock(p); 902 vm_page_unwire(p, PQ_INACTIVE); 903 vm_page_unlock(p); 904 } 905 } 906 907 static int 908 handle_ddp(struct socket *so, struct uio *uio, int flags, int error) 909 { 910 struct sockbuf *sb = &so->so_rcv; 911 struct tcpcb *tp = so_sototcpcb(so); 912 struct toepcb *toep = tp->t_toe; 913 struct adapter *sc = td_adapter(toep->td); 914 vm_page_t *pages; 915 int npages, db_idx, rc, buf_flag; 916 struct ddp_buffer *db; 917 struct wrqe *wr; 918 uint64_t ddp_flags; 919 920 SOCKBUF_LOCK_ASSERT(sb); 921 922 #if 0 923 if (sbused(sb) + sc->tt.ddp_thres > uio->uio_resid) { 924 CTR4(KTR_CXGBE, "%s: sb_cc %d, threshold %d, resid %d", 925 __func__, sbused(sb), sc->tt.ddp_thres, uio->uio_resid); 926 } 927 #endif 928 929 /* XXX: too eager to disable DDP, could handle NBIO better than this. */ 930 if (sbused(sb) >= uio->uio_resid || uio->uio_resid < sc->tt.ddp_thres || 931 uio->uio_resid > MAX_DDP_BUFFER_SIZE || uio->uio_iovcnt > 1 || 932 so->so_state & SS_NBIO || flags & (MSG_DONTWAIT | MSG_NBIO) || 933 error || so->so_error || sb->sb_state & SBS_CANTRCVMORE) 934 goto no_ddp; 935 936 /* 937 * Fault in and then hold the pages of the uio buffers. We'll wire them 938 * a bit later if everything else works out. 939 */ 940 SOCKBUF_UNLOCK(sb); 941 if (hold_uio(uio, &pages, &npages) != 0) { 942 SOCKBUF_LOCK(sb); 943 goto no_ddp; 944 } 945 SOCKBUF_LOCK(sb); 946 if (__predict_false(so->so_error || sb->sb_state & SBS_CANTRCVMORE)) { 947 vm_page_unhold_pages(pages, npages); 948 free(pages, M_CXGBE); 949 goto no_ddp; 950 } 951 952 /* 953 * Figure out which one of the two DDP buffers to use this time. 954 */ 955 db_idx = select_ddp_buffer(sc, toep, pages, npages, 956 (uintptr_t)uio->uio_iov->iov_base & PAGE_MASK, uio->uio_resid); 957 pages = NULL; /* handed off to select_ddp_buffer */ 958 if (db_idx < 0) 959 goto no_ddp; 960 db = toep->db[db_idx]; 961 buf_flag = db_idx == 0 ? DDP_BUF0_ACTIVE : DDP_BUF1_ACTIVE; 962 963 /* 964 * Build the compound work request that tells the chip where to DMA the 965 * payload. 966 */ 967 ddp_flags = select_ddp_flags(so, flags, db_idx); 968 wr = mk_update_tcb_for_ddp(sc, toep, db_idx, sbused(sb), ddp_flags); 969 if (wr == NULL) { 970 /* 971 * Just unhold the pages. The DDP buffer's software state is 972 * left as-is in the toep. The page pods were written 973 * successfully and we may have an opportunity to use it in the 974 * future. 975 */ 976 vm_page_unhold_pages(db->pages, db->npages); 977 goto no_ddp; 978 } 979 980 /* Wire (and then unhold) the pages, and give the chip the go-ahead. */ 981 wire_ddp_buffer(db); 982 t4_wrq_tx(sc, wr); 983 sb->sb_flags &= ~SB_DDP_INDICATE; 984 toep->ddp_flags |= buf_flag; 985 986 /* 987 * Wait for the DDP operation to complete and then unwire the pages. 988 * The return code from the sbwait will be the final return code of this 989 * function. But we do need to wait for DDP no matter what. 990 */ 991 rc = sbwait(sb); 992 while (toep->ddp_flags & buf_flag) { 993 /* XXXGL: shouldn't here be sbwait() call? */ 994 sb->sb_flags |= SB_WAIT; 995 msleep(&sb->sb_acc, &sb->sb_mtx, PSOCK , "sbwait", 0); 996 } 997 unwire_ddp_buffer(db); 998 return (rc); 999 no_ddp: 1000 disable_ddp(sc, toep); 1001 discourage_ddp(toep); 1002 sb->sb_flags &= ~SB_DDP_INDICATE; 1003 return (0); 1004 } 1005 1006 void 1007 t4_init_ddp(struct adapter *sc, struct tom_data *td) 1008 { 1009 1010 td->ppod_start = sc->vres.ddp.start; 1011 td->ppod_arena = vmem_create("DDP page pods", sc->vres.ddp.start, 1012 sc->vres.ddp.size, 1, 32, M_FIRSTFIT | M_NOWAIT); 1013 1014 t4_register_cpl_handler(sc, CPL_RX_DATA_DDP, do_rx_data_ddp); 1015 t4_register_cpl_handler(sc, CPL_RX_DDP_COMPLETE, do_rx_ddp_complete); 1016 } 1017 1018 void 1019 t4_uninit_ddp(struct adapter *sc __unused, struct tom_data *td) 1020 { 1021 1022 if (td->ppod_arena != NULL) { 1023 vmem_destroy(td->ppod_arena); 1024 td->ppod_arena = NULL; 1025 } 1026 } 1027 1028 #define VNET_SO_ASSERT(so) \ 1029 VNET_ASSERT(curvnet != NULL, \ 1030 ("%s:%d curvnet is NULL, so=%p", __func__, __LINE__, (so))); 1031 #define SBLOCKWAIT(f) (((f) & MSG_DONTWAIT) ? 0 : SBL_WAIT) 1032 static int 1033 soreceive_rcvoob(struct socket *so, struct uio *uio, int flags) 1034 { 1035 1036 CXGBE_UNIMPLEMENTED(__func__); 1037 } 1038 1039 static char ddp_magic_str[] = "nothing to see here"; 1040 1041 static struct mbuf * 1042 get_ddp_mbuf(int len) 1043 { 1044 struct mbuf *m; 1045 1046 m = m_get(M_NOWAIT, MT_DATA); 1047 if (m == NULL) 1048 CXGBE_UNIMPLEMENTED("mbuf alloc failure"); 1049 m->m_len = len; 1050 m->m_data = &ddp_magic_str[0]; 1051 1052 return (m); 1053 } 1054 1055 static inline int 1056 is_ddp_mbuf(struct mbuf *m) 1057 { 1058 1059 return (m->m_data == &ddp_magic_str[0]); 1060 } 1061 1062 /* 1063 * Copy an mbuf chain into a uio limited by len if set. 1064 */ 1065 static int 1066 m_mbuftouio_ddp(struct uio *uio, struct mbuf *m, int len) 1067 { 1068 int error, length, total; 1069 int progress = 0; 1070 1071 if (len > 0) 1072 total = min(uio->uio_resid, len); 1073 else 1074 total = uio->uio_resid; 1075 1076 /* Fill the uio with data from the mbufs. */ 1077 for (; m != NULL; m = m->m_next) { 1078 length = min(m->m_len, total - progress); 1079 1080 if (is_ddp_mbuf(m)) { 1081 enum uio_seg segflag = uio->uio_segflg; 1082 1083 uio->uio_segflg = UIO_NOCOPY; 1084 error = uiomove(mtod(m, void *), length, uio); 1085 uio->uio_segflg = segflag; 1086 } else 1087 error = uiomove(mtod(m, void *), length, uio); 1088 if (error) 1089 return (error); 1090 1091 progress += length; 1092 } 1093 1094 return (0); 1095 } 1096 1097 /* 1098 * Based on soreceive_stream() in uipc_socket.c 1099 */ 1100 int 1101 t4_soreceive_ddp(struct socket *so, struct sockaddr **psa, struct uio *uio, 1102 struct mbuf **mp0, struct mbuf **controlp, int *flagsp) 1103 { 1104 int len = 0, error = 0, flags, oresid, ddp_handled = 0; 1105 struct sockbuf *sb; 1106 struct mbuf *m, *n = NULL; 1107 1108 /* We only do stream sockets. */ 1109 if (so->so_type != SOCK_STREAM) 1110 return (EINVAL); 1111 if (psa != NULL) 1112 *psa = NULL; 1113 if (controlp != NULL) 1114 return (EINVAL); 1115 if (flagsp != NULL) 1116 flags = *flagsp &~ MSG_EOR; 1117 else 1118 flags = 0; 1119 if (flags & MSG_OOB) 1120 return (soreceive_rcvoob(so, uio, flags)); 1121 if (mp0 != NULL) 1122 *mp0 = NULL; 1123 1124 sb = &so->so_rcv; 1125 1126 /* Prevent other readers from entering the socket. */ 1127 error = sblock(sb, SBLOCKWAIT(flags)); 1128 SOCKBUF_LOCK(sb); 1129 if (error) 1130 goto out; 1131 1132 /* Easy one, no space to copyout anything. */ 1133 if (uio->uio_resid == 0) { 1134 error = EINVAL; 1135 goto out; 1136 } 1137 oresid = uio->uio_resid; 1138 1139 /* We will never ever get anything unless we are or were connected. */ 1140 if (!(so->so_state & (SS_ISCONNECTED|SS_ISDISCONNECTED))) { 1141 error = ENOTCONN; 1142 goto out; 1143 } 1144 1145 restart: 1146 SOCKBUF_LOCK_ASSERT(&so->so_rcv); 1147 1148 if (sb->sb_flags & SB_DDP_INDICATE && !ddp_handled) { 1149 1150 /* uio should be just as it was at entry */ 1151 KASSERT(oresid == uio->uio_resid, 1152 ("%s: oresid = %d, uio_resid = %zd, sbavail = %d", 1153 __func__, oresid, uio->uio_resid, sbavail(sb))); 1154 1155 error = handle_ddp(so, uio, flags, 0); 1156 ddp_handled = 1; 1157 if (error) 1158 goto out; 1159 } 1160 1161 /* Abort if socket has reported problems. */ 1162 if (so->so_error) { 1163 if (sbavail(sb)) 1164 goto deliver; 1165 if (oresid > uio->uio_resid) 1166 goto out; 1167 error = so->so_error; 1168 if (!(flags & MSG_PEEK)) 1169 so->so_error = 0; 1170 goto out; 1171 } 1172 1173 /* Door is closed. Deliver what is left, if any. */ 1174 if (sb->sb_state & SBS_CANTRCVMORE) { 1175 if (sbavail(sb)) 1176 goto deliver; 1177 else 1178 goto out; 1179 } 1180 1181 /* Socket buffer is empty and we shall not block. */ 1182 if (sbavail(sb) == 0 && 1183 ((so->so_state & SS_NBIO) || (flags & (MSG_DONTWAIT|MSG_NBIO)))) { 1184 error = EAGAIN; 1185 goto out; 1186 } 1187 1188 /* Socket buffer got some data that we shall deliver now. */ 1189 if (sbavail(sb) > 0 && !(flags & MSG_WAITALL) && 1190 ((so->so_state & SS_NBIO) || 1191 (flags & (MSG_DONTWAIT|MSG_NBIO)) || 1192 sbavail(sb) >= sb->sb_lowat || 1193 sbavail(sb) >= uio->uio_resid || 1194 sbavail(sb) >= sb->sb_hiwat) ) { 1195 goto deliver; 1196 } 1197 1198 /* On MSG_WAITALL we must wait until all data or error arrives. */ 1199 if ((flags & MSG_WAITALL) && 1200 (sbavail(sb) >= uio->uio_resid || sbavail(sb) >= sb->sb_lowat)) 1201 goto deliver; 1202 1203 /* 1204 * Wait and block until (more) data comes in. 1205 * NB: Drops the sockbuf lock during wait. 1206 */ 1207 error = sbwait(sb); 1208 if (error) { 1209 if (sb->sb_flags & SB_DDP_INDICATE && !ddp_handled) { 1210 (void) handle_ddp(so, uio, flags, 1); 1211 ddp_handled = 1; 1212 } 1213 goto out; 1214 } 1215 goto restart; 1216 1217 deliver: 1218 SOCKBUF_LOCK_ASSERT(&so->so_rcv); 1219 KASSERT(sbavail(sb) > 0, ("%s: sockbuf empty", __func__)); 1220 KASSERT(sb->sb_mb != NULL, ("%s: sb_mb == NULL", __func__)); 1221 1222 if (sb->sb_flags & SB_DDP_INDICATE && !ddp_handled) 1223 goto restart; 1224 1225 /* Statistics. */ 1226 if (uio->uio_td) 1227 uio->uio_td->td_ru.ru_msgrcv++; 1228 1229 /* Fill uio until full or current end of socket buffer is reached. */ 1230 len = min(uio->uio_resid, sbavail(sb)); 1231 if (mp0 != NULL) { 1232 /* Dequeue as many mbufs as possible. */ 1233 if (!(flags & MSG_PEEK) && len >= sb->sb_mb->m_len) { 1234 for (*mp0 = m = sb->sb_mb; 1235 m != NULL && m->m_len <= len; 1236 m = m->m_next) { 1237 len -= m->m_len; 1238 uio->uio_resid -= m->m_len; 1239 sbfree(sb, m); 1240 n = m; 1241 } 1242 sb->sb_mb = m; 1243 if (sb->sb_mb == NULL) 1244 SB_EMPTY_FIXUP(sb); 1245 n->m_next = NULL; 1246 } 1247 /* Copy the remainder. */ 1248 if (len > 0) { 1249 KASSERT(sb->sb_mb != NULL, 1250 ("%s: len > 0 && sb->sb_mb empty", __func__)); 1251 1252 m = m_copym(sb->sb_mb, 0, len, M_NOWAIT); 1253 if (m == NULL) 1254 len = 0; /* Don't flush data from sockbuf. */ 1255 else 1256 uio->uio_resid -= m->m_len; 1257 if (*mp0 != NULL) 1258 n->m_next = m; 1259 else 1260 *mp0 = m; 1261 if (*mp0 == NULL) { 1262 error = ENOBUFS; 1263 goto out; 1264 } 1265 } 1266 } else { 1267 /* NB: Must unlock socket buffer as uiomove may sleep. */ 1268 SOCKBUF_UNLOCK(sb); 1269 error = m_mbuftouio_ddp(uio, sb->sb_mb, len); 1270 SOCKBUF_LOCK(sb); 1271 if (error) 1272 goto out; 1273 } 1274 SBLASTRECORDCHK(sb); 1275 SBLASTMBUFCHK(sb); 1276 1277 /* 1278 * Remove the delivered data from the socket buffer unless we 1279 * were only peeking. 1280 */ 1281 if (!(flags & MSG_PEEK)) { 1282 if (len > 0) 1283 sbdrop_locked(sb, len); 1284 1285 /* Notify protocol that we drained some data. */ 1286 if ((so->so_proto->pr_flags & PR_WANTRCVD) && 1287 (((flags & MSG_WAITALL) && uio->uio_resid > 0) || 1288 !(flags & MSG_SOCALLBCK))) { 1289 SOCKBUF_UNLOCK(sb); 1290 VNET_SO_ASSERT(so); 1291 (*so->so_proto->pr_usrreqs->pru_rcvd)(so, flags); 1292 SOCKBUF_LOCK(sb); 1293 } 1294 } 1295 1296 /* 1297 * For MSG_WAITALL we may have to loop again and wait for 1298 * more data to come in. 1299 */ 1300 if ((flags & MSG_WAITALL) && uio->uio_resid > 0) 1301 goto restart; 1302 out: 1303 SOCKBUF_LOCK_ASSERT(sb); 1304 SBLASTRECORDCHK(sb); 1305 SBLASTMBUFCHK(sb); 1306 SOCKBUF_UNLOCK(sb); 1307 sbunlock(sb); 1308 return (error); 1309 } 1310 1311 #endif 1312