/*-
 * Copyright (c) 2012 Chelsio Communications, Inc.
 * All rights reserved.
 * Written by: Navdeep Parhar <np@FreeBSD.org>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_inet.h"

#include <sys/param.h>
#include <sys/types.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/ktr.h>
#include <sys/module.h>
#include <sys/protosw.h>
#include <sys/proc.h>
#include <sys/domain.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/uio.h>
#include <netinet/in.h>
#include <netinet/in_pcb.h>
#include <netinet/ip.h>
#include <netinet/tcp_var.h>
#define TCPSTATES
#include <netinet/tcp_fsm.h>
#include <netinet/toecore.h>

#include <vm/vm.h>
#include <vm/vm_extern.h>
#include <vm/vm_param.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_page.h>
#include <vm/vm_object.h>

#ifdef TCP_OFFLOAD
#include "common/common.h"
#include "common/t4_msg.h"
#include "common/t4_regs.h"
#include "common/t4_tcb.h"
#include "tom/t4_tom.h"

#define PPOD_SZ(n)	((n) * sizeof(struct pagepod))
#define PPOD_SIZE	(PPOD_SZ(1))

/* XXX: must match A_ULP_RX_TDDP_PSZ */
static int t4_ddp_pgsz[] = {4096, 4096 << 2, 4096 << 4, 4096 << 6};

#if 0
static void
t4_dump_tcb(struct adapter *sc, int tid)
{
	uint32_t tcb_base, off, i, j;

	/* Dump TCB for the tid */
	tcb_base = t4_read_reg(sc, A_TP_CMM_TCB_BASE);
	t4_write_reg(sc, PCIE_MEM_ACCESS_REG(A_PCIE_MEM_ACCESS_OFFSET, 2),
	    tcb_base + tid * TCB_SIZE);
	t4_read_reg(sc, PCIE_MEM_ACCESS_REG(A_PCIE_MEM_ACCESS_OFFSET, 2));
	off = 0;
	printf("\n");
	for (i = 0; i < 4; i++) {
		uint32_t buf[8];

		for (j = 0; j < 8; j++, off += 4)
			buf[j] = htonl(t4_read_reg(sc, MEMWIN2_BASE + off));

		printf("%08x %08x %08x %08x %08x %08x %08x %08x\n",
		    buf[0], buf[1], buf[2], buf[3], buf[4], buf[5], buf[6],
		    buf[7]);
	}
}
#endif

#define MAX_DDP_BUFFER_SIZE	(M_TCB_RX_DDP_BUF0_LEN)
static int
alloc_ppods(struct tom_data *td, int n, struct ppod_region *pr)
{
	int ppod;

	KASSERT(n > 0, ("%s: nonsense allocation (%d)", __func__, n));

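	/*
	 * First-fit allocation: use the free space at the head of the pod
	 * region if it is big enough, otherwise walk the list of existing
	 * allocations looking for a gap that can hold n page pods.
	 */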
	mtx_lock(&td->ppod_lock);
	if (n > td->nppods_free) {
		mtx_unlock(&td->ppod_lock);
		return (-1);
	}

	if (td->nppods_free_head >= n) {
		td->nppods_free_head -= n;
		ppod = td->nppods_free_head;
		TAILQ_INSERT_HEAD(&td->ppods, pr, link);
	} else {
		struct ppod_region *p;

		ppod = td->nppods_free_head;
		TAILQ_FOREACH(p, &td->ppods, link) {
			ppod += p->used + p->free;
			if (n <= p->free) {
				ppod -= n;
				p->free -= n;
				TAILQ_INSERT_AFTER(&td->ppods, p, pr, link);
				goto allocated;
			}
		}

		if (__predict_false(ppod != td->nppods)) {
			panic("%s: ppods TAILQ (%p) corrupt."
			    " At %d instead of %d at the end of the queue.",
			    __func__, &td->ppods, ppod, td->nppods);
		}

		mtx_unlock(&td->ppod_lock);
		return (-1);
	}

allocated:
	pr->used = n;
	pr->free = 0;
	td->nppods_free -= n;
	mtx_unlock(&td->ppod_lock);

	return (ppod);
}

static void
free_ppods(struct tom_data *td, struct ppod_region *pr)
{
	struct ppod_region *p;

	KASSERT(pr->used > 0, ("%s: nonsense free (%d)", __func__, pr->used));

	mtx_lock(&td->ppod_lock);
	p = TAILQ_PREV(pr, ppod_head, link);
	if (p != NULL)
		p->free += pr->used + pr->free;
	else
		td->nppods_free_head += pr->used + pr->free;
	td->nppods_free += pr->used;
	KASSERT(td->nppods_free <= td->nppods,
	    ("%s: nppods_free (%d) > nppods (%d).  %d freed this time.",
	    __func__, td->nppods_free, td->nppods, pr->used));
	TAILQ_REMOVE(&td->ppods, pr, link);
	mtx_unlock(&td->ppod_lock);
}

static inline int
pages_to_nppods(int npages, int ddp_pgsz)
{
	int nsegs = npages * PAGE_SIZE / ddp_pgsz;

	return (howmany(nsegs, PPOD_PAGES));
}

static void
free_ddp_buffer(struct tom_data *td, struct ddp_buffer *db)
{

	if (db == NULL)
		return;

	if (db->pages)
		free(db->pages, M_CXGBE);

	if (db->nppods > 0)
		free_ppods(td, &db->ppod_region);

	free(db, M_CXGBE);
}

void
release_ddp_resources(struct toepcb *toep)
{
	int i;

	for (i = 0; i < nitems(toep->db); i++) {
		if (toep->db[i] != NULL) {
			free_ddp_buffer(toep->td, toep->db[i]);
			toep->db[i] = NULL;
		}
	}
}

/* XXX: handle_ddp_data code duplication */
void
insert_ddp_data(struct toepcb *toep, uint32_t n)
{
	struct inpcb *inp = toep->inp;
	struct tcpcb *tp = intotcpcb(inp);
	struct sockbuf *sb = &inp->inp_socket->so_rcv;
	struct mbuf *m;

	INP_WLOCK_ASSERT(inp);
	SOCKBUF_LOCK_ASSERT(sb);

	m = get_ddp_mbuf(n);
	tp->rcv_nxt += n;
#ifndef USE_DDP_RX_FLOW_CONTROL
	KASSERT(tp->rcv_wnd >= n, ("%s: negative window size", __func__));
	tp->rcv_wnd -= n;
#endif

	KASSERT(toep->sb_cc >= sbused(sb),
	    ("%s: sb %p has more data (%d) than last time (%d).",
	    __func__, sb, sbused(sb), toep->sb_cc));
	toep->rx_credits += toep->sb_cc - sbused(sb);
#ifdef USE_DDP_RX_FLOW_CONTROL
	toep->rx_credits -= n;	/* adjust for F_RX_FC_DDP */
#endif
	sbappendstream_locked(sb, m);
	toep->sb_cc = sbused(sb);
}

/* SET_TCB_FIELD sent as a ULP command looks like this */
#define LEN__SET_TCB_FIELD_ULP (sizeof(struct ulp_txpkt) + \
    sizeof(struct ulptx_idata) + sizeof(struct cpl_set_tcb_field_core))

/* RX_DATA_ACK sent as a ULP command looks like this */
#define LEN__RX_DATA_ACK_ULP (sizeof(struct ulp_txpkt) + \
    sizeof(struct ulptx_idata) + sizeof(struct cpl_rx_data_ack_core))

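/*
 * Each ULP sub-command must end on a 16B boundary.  The two constructors
 * below pad with a NOOP sub-command whenever the command proper is not
 * already a multiple of 16B in length.
 */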
static inline void *
mk_set_tcb_field_ulp(struct ulp_txpkt *ulpmc, struct toepcb *toep,
    uint64_t word, uint64_t mask, uint64_t val)
{
	struct ulptx_idata *ulpsc;
	struct cpl_set_tcb_field_core *req;

	ulpmc->cmd_dest = htonl(V_ULPTX_CMD(ULP_TX_PKT) | V_ULP_TXPKT_DEST(0));
	ulpmc->len = htobe32(howmany(LEN__SET_TCB_FIELD_ULP, 16));

	ulpsc = (struct ulptx_idata *)(ulpmc + 1);
	ulpsc->cmd_more = htobe32(V_ULPTX_CMD(ULP_TX_SC_IMM));
	ulpsc->len = htobe32(sizeof(*req));

	req = (struct cpl_set_tcb_field_core *)(ulpsc + 1);
	OPCODE_TID(req) = htobe32(MK_OPCODE_TID(CPL_SET_TCB_FIELD, toep->tid));
	req->reply_ctrl = htobe16(V_NO_REPLY(1) |
	    V_QUEUENO(toep->ofld_rxq->iq.abs_id));
	req->word_cookie = htobe16(V_WORD(word) | V_COOKIE(0));
	req->mask = htobe64(mask);
	req->val = htobe64(val);

	ulpsc = (struct ulptx_idata *)(req + 1);
	if (LEN__SET_TCB_FIELD_ULP % 16) {
		ulpsc->cmd_more = htobe32(V_ULPTX_CMD(ULP_TX_SC_NOOP));
		ulpsc->len = htobe32(0);
		return (ulpsc + 1);
	}
	return (ulpsc);
}

static inline void *
mk_rx_data_ack_ulp(struct ulp_txpkt *ulpmc, struct toepcb *toep)
{
	struct ulptx_idata *ulpsc;
	struct cpl_rx_data_ack_core *req;

	ulpmc->cmd_dest = htonl(V_ULPTX_CMD(ULP_TX_PKT) | V_ULP_TXPKT_DEST(0));
	ulpmc->len = htobe32(howmany(LEN__RX_DATA_ACK_ULP, 16));

	ulpsc = (struct ulptx_idata *)(ulpmc + 1);
	ulpsc->cmd_more = htobe32(V_ULPTX_CMD(ULP_TX_SC_IMM));
	ulpsc->len = htobe32(sizeof(*req));

	req = (struct cpl_rx_data_ack_core *)(ulpsc + 1);
	OPCODE_TID(req) = htobe32(MK_OPCODE_TID(CPL_RX_DATA_ACK, toep->tid));
	req->credit_dack = htobe32(F_RX_MODULATE_RX);

	ulpsc = (struct ulptx_idata *)(req + 1);
	if (LEN__RX_DATA_ACK_ULP % 16) {
		ulpsc->cmd_more = htobe32(V_ULPTX_CMD(ULP_TX_SC_NOOP));
		ulpsc->len = htobe32(0);
		return (ulpsc + 1);
	}
	return (ulpsc);
}

static inline uint64_t
select_ddp_flags(struct socket *so, int flags, int db_idx)
{
	uint64_t ddp_flags = V_TF_DDP_INDICATE_OUT(0);
	int waitall = flags & MSG_WAITALL;
	int nb = so->so_state & SS_NBIO || flags & (MSG_DONTWAIT | MSG_NBIO);

	KASSERT(db_idx == 0 || db_idx == 1,
	    ("%s: bad DDP buffer index %d", __func__, db_idx));

	if (db_idx == 0) {
		ddp_flags |= V_TF_DDP_BUF0_VALID(1) | V_TF_DDP_ACTIVE_BUF(0);
		if (waitall)
			ddp_flags |= V_TF_DDP_PUSH_DISABLE_0(1);
		else if (nb)
			ddp_flags |= V_TF_DDP_BUF0_FLUSH(1);
		else
			ddp_flags |= V_TF_DDP_BUF0_FLUSH(0);
	} else {
		ddp_flags |= V_TF_DDP_BUF1_VALID(1) | V_TF_DDP_ACTIVE_BUF(1);
		if (waitall)
			ddp_flags |= V_TF_DDP_PUSH_DISABLE_1(1);
		else if (nb)
			ddp_flags |= V_TF_DDP_BUF1_FLUSH(1);
		else
			ddp_flags |= V_TF_DDP_BUF1_FLUSH(0);
	}

	return (ddp_flags);
}

static struct wrqe *
mk_update_tcb_for_ddp(struct adapter *sc, struct toepcb *toep, int db_idx,
    int offset, uint64_t ddp_flags)
{
	struct ddp_buffer *db = toep->db[db_idx];
	struct wrqe *wr;
	struct work_request_hdr *wrh;
	struct ulp_txpkt *ulpmc;
	int len;

	KASSERT(db_idx == 0 || db_idx == 1,
	    ("%s: bad DDP buffer index %d", __func__, db_idx));

	/*
	 * We'll send a compound work request that has 3 SET_TCB_FIELDs and an
	 * RX_DATA_ACK (with RX_MODULATE to speed up delivery).
	 *
	 * The work request header is 16B and always ends at a 16B boundary.
	 * The ULPTX master commands that follow must all end at 16B boundaries
	 * too so we round up the size to 16.
	 */
	len = sizeof(*wrh) + 3 * roundup2(LEN__SET_TCB_FIELD_ULP, 16) +
	    roundup2(LEN__RX_DATA_ACK_ULP, 16);

	wr = alloc_wrqe(len, toep->ctrlq);
	if (wr == NULL)
		return (NULL);
	wrh = wrtod(wr);
	INIT_ULPTX_WRH(wrh, len, 1, 0);	/* atomic */
	ulpmc = (struct ulp_txpkt *)(wrh + 1);

	/* Write the buffer's tag */
	ulpmc = mk_set_tcb_field_ulp(ulpmc, toep,
	    W_TCB_RX_DDP_BUF0_TAG + db_idx,
	    V_TCB_RX_DDP_BUF0_TAG(M_TCB_RX_DDP_BUF0_TAG),
	    V_TCB_RX_DDP_BUF0_TAG(db->tag));

	/* Update the current offset in the DDP buffer and its total length */
	if (db_idx == 0)
		ulpmc = mk_set_tcb_field_ulp(ulpmc, toep,
		    W_TCB_RX_DDP_BUF0_OFFSET,
		    V_TCB_RX_DDP_BUF0_OFFSET(M_TCB_RX_DDP_BUF0_OFFSET) |
		    V_TCB_RX_DDP_BUF0_LEN(M_TCB_RX_DDP_BUF0_LEN),
		    V_TCB_RX_DDP_BUF0_OFFSET(offset) |
		    V_TCB_RX_DDP_BUF0_LEN(db->len));
	else
		ulpmc = mk_set_tcb_field_ulp(ulpmc, toep,
		    W_TCB_RX_DDP_BUF1_OFFSET,
		    V_TCB_RX_DDP_BUF1_OFFSET(M_TCB_RX_DDP_BUF1_OFFSET) |
		    V_TCB_RX_DDP_BUF1_LEN((u64)M_TCB_RX_DDP_BUF1_LEN << 32),
		    V_TCB_RX_DDP_BUF1_OFFSET(offset) |
		    V_TCB_RX_DDP_BUF1_LEN((u64)db->len << 32));

	/* Update DDP flags */
	ulpmc = mk_set_tcb_field_ulp(ulpmc, toep, W_TCB_RX_DDP_FLAGS,
	    V_TF_DDP_BUF0_FLUSH(1) | V_TF_DDP_BUF1_FLUSH(1) |
	    V_TF_DDP_PUSH_DISABLE_0(1) | V_TF_DDP_PUSH_DISABLE_1(1) |
	    V_TF_DDP_BUF0_VALID(1) | V_TF_DDP_BUF1_VALID(1) |
	    V_TF_DDP_ACTIVE_BUF(1) | V_TF_DDP_INDICATE_OUT(1), ddp_flags);

	/* Gratuitous RX_DATA_ACK with RX_MODULATE set to speed up delivery. */
	ulpmc = mk_rx_data_ack_ulp(ulpmc, toep);

	return (wr);
}

static void
discourage_ddp(struct toepcb *toep)
{

	if (toep->ddp_score && --toep->ddp_score == 0) {
		toep->ddp_flags &= ~DDP_OK;
		toep->ddp_disabled = time_uptime;
		CTR3(KTR_CXGBE, "%s: tid %u !DDP_OK @ %u",
		    __func__, toep->tid, time_uptime);
	}
}

static int
handle_ddp_data(struct toepcb *toep, __be32 ddp_report, __be32 rcv_nxt, int len)
{
	uint32_t report = be32toh(ddp_report);
	unsigned int db_flag;
	struct inpcb *inp = toep->inp;
	struct tcpcb *tp;
	struct socket *so;
	struct sockbuf *sb;
	struct mbuf *m;

	db_flag = report & F_DDP_BUF_IDX ? DDP_BUF1_ACTIVE : DDP_BUF0_ACTIVE;

	if (__predict_false(!(report & F_DDP_INV)))
		CXGBE_UNIMPLEMENTED("DDP buffer still valid");

	INP_WLOCK(inp);
	so = inp_inpcbtosocket(inp);
	sb = &so->so_rcv;
	if (__predict_false(inp->inp_flags & (INP_DROPPED | INP_TIMEWAIT))) {

		/*
		 * XXX: think a bit more.
		 * tcpcb probably gone, but socket should still be around
		 * because we always wait for DDP completion in soreceive no
		 * matter what.  Just wake it up and let it clean up.
		 */

		CTR5(KTR_CXGBE, "%s: tid %u, seq 0x%x, len %d, inp_flags 0x%x",
		    __func__, toep->tid, be32toh(rcv_nxt), len, inp->inp_flags);
		SOCKBUF_LOCK(sb);
		goto wakeup;
	}

	tp = intotcpcb(inp);
	len += be32toh(rcv_nxt) - tp->rcv_nxt;
	tp->rcv_nxt += len;
	tp->t_rcvtime = ticks;
#ifndef USE_DDP_RX_FLOW_CONTROL
	KASSERT(tp->rcv_wnd >= len, ("%s: negative window size", __func__));
	tp->rcv_wnd -= len;
#endif
	m = get_ddp_mbuf(len);

	SOCKBUF_LOCK(sb);
	if (report & F_DDP_BUF_COMPLETE)
		toep->ddp_score = DDP_HIGH_SCORE;
	else
		discourage_ddp(toep);

	KASSERT(toep->sb_cc >= sbused(sb),
	    ("%s: sb %p has more data (%d) than last time (%d).",
	    __func__, sb, sbused(sb), toep->sb_cc));
	toep->rx_credits += toep->sb_cc - sbused(sb);
#ifdef USE_DDP_RX_FLOW_CONTROL
	toep->rx_credits -= len;	/* adjust for F_RX_FC_DDP */
#endif
	sbappendstream_locked(sb, m);
	toep->sb_cc = sbused(sb);
wakeup:
	KASSERT(toep->ddp_flags & db_flag,
	    ("%s: DDP buffer not active. toep %p, ddp_flags 0x%x, report 0x%x",
	    __func__, toep, toep->ddp_flags, report));
	toep->ddp_flags &= ~db_flag;
	sorwakeup_locked(so);
	SOCKBUF_UNLOCK_ASSERT(sb);

	INP_WUNLOCK(inp);
	return (0);
}

#define DDP_ERR (F_DDP_PPOD_MISMATCH | F_DDP_LLIMIT_ERR | F_DDP_ULIMIT_ERR |\
	F_DDP_PPOD_PARITY_ERR | F_DDP_PADDING_ERR | F_DDP_OFFSET_ERR |\
	F_DDP_INVALID_TAG | F_DDP_COLOR_ERR | F_DDP_TID_MISMATCH |\
	F_DDP_INVALID_PPOD | F_DDP_HDRCRC_ERR | F_DDP_DATACRC_ERR)

static int
do_rx_data_ddp(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
{
	struct adapter *sc = iq->adapter;
	const struct cpl_rx_data_ddp *cpl = (const void *)(rss + 1);
	unsigned int tid = GET_TID(cpl);
	uint32_t vld;
	struct toepcb *toep = lookup_tid(sc, tid);
	struct tom_data *td = toep->td;

	KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__));
	KASSERT(toep->tid == tid, ("%s: toep tid/atid mismatch", __func__));
	KASSERT(!(toep->flags & TPF_SYNQE),
	    ("%s: toep %p claims to be a synq entry", __func__, toep));

	vld = be32toh(cpl->ddpvld);
	if (__predict_false(vld & DDP_ERR)) {
		panic("%s: DDP error 0x%x (tid %d, toep %p)",
		    __func__, vld, tid, toep);
	}
	if (toep->ulp_mode == ULP_MODE_ISCSI) {
		m = m_get(M_NOWAIT, MT_DATA);
		if (m == NULL)
			CXGBE_UNIMPLEMENTED("mbuf alloc failure");
		memcpy(mtod(m, unsigned char *), cpl,
		    sizeof(struct cpl_rx_data_ddp));
		if (!t4_cpl_iscsi_callback(td, toep, m, CPL_RX_DATA_DDP))
			return (0);
		m_freem(m);
	}

	handle_ddp_data(toep, cpl->u.ddp_report, cpl->seq, be16toh(cpl->len));

	return (0);
}

static int
do_rx_ddp_complete(struct sge_iq *iq, const struct rss_header *rss,
    struct mbuf *m)
{
	struct adapter *sc = iq->adapter;
	const struct cpl_rx_ddp_complete *cpl = (const void *)(rss + 1);
	unsigned int tid = GET_TID(cpl);
	struct toepcb *toep = lookup_tid(sc, tid);

	KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__));
	KASSERT(toep->tid == tid, ("%s: toep tid/atid mismatch", __func__));
	KASSERT(!(toep->flags & TPF_SYNQE),
	    ("%s: toep %p claims to be a synq entry", __func__, toep));

	handle_ddp_data(toep, cpl->ddp_report, cpl->rcv_nxt, 0);

	return (0);
}

void
enable_ddp(struct adapter *sc, struct toepcb *toep)
{

	KASSERT((toep->ddp_flags & (DDP_ON | DDP_OK | DDP_SC_REQ)) == DDP_OK,
	    ("%s: toep %p has bad ddp_flags 0x%x",
	    __func__, toep, toep->ddp_flags));

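	/*
	 * Ask the chip to turn DDP on: clear TF_DDP_OFF, request an indicate
	 * on either buffer, and disable rx coalescing (it gets in the way of
	 * DDP indicates).
	 */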
	CTR3(KTR_CXGBE, "%s: tid %u (time %u)",
	    __func__, toep->tid, time_uptime);

	toep->ddp_flags |= DDP_SC_REQ;
	t4_set_tcb_field(sc, toep, 1, W_TCB_RX_DDP_FLAGS,
	    V_TF_DDP_OFF(1) | V_TF_DDP_INDICATE_OUT(1) |
	    V_TF_DDP_BUF0_INDICATE(1) | V_TF_DDP_BUF1_INDICATE(1) |
	    V_TF_DDP_BUF0_VALID(1) | V_TF_DDP_BUF1_VALID(1),
	    V_TF_DDP_BUF0_INDICATE(1) | V_TF_DDP_BUF1_INDICATE(1));
	t4_set_tcb_field(sc, toep, 1, W_TCB_T_FLAGS,
	    V_TF_RCV_COALESCE_ENABLE(1), 0);
}

static inline void
disable_ddp(struct adapter *sc, struct toepcb *toep)
{

	KASSERT((toep->ddp_flags & (DDP_ON | DDP_SC_REQ)) == DDP_ON,
	    ("%s: toep %p has bad ddp_flags 0x%x",
	    __func__, toep, toep->ddp_flags));

	CTR3(KTR_CXGBE, "%s: tid %u (time %u)",
	    __func__, toep->tid, time_uptime);

	toep->ddp_flags |= DDP_SC_REQ;
	t4_set_tcb_field(sc, toep, 1, W_TCB_T_FLAGS,
	    V_TF_RCV_COALESCE_ENABLE(1), V_TF_RCV_COALESCE_ENABLE(1));
	t4_set_tcb_field(sc, toep, 1, W_TCB_RX_DDP_FLAGS, V_TF_DDP_OFF(1),
	    V_TF_DDP_OFF(1));
}

static int
hold_uio(struct uio *uio, vm_page_t **ppages, int *pnpages)
{
	struct vm_map *map;
	struct iovec *iov;
	vm_offset_t start, end;
	vm_page_t *pp;
	int n;

	KASSERT(uio->uio_iovcnt == 1,
	    ("%s: uio_iovcnt %d", __func__, uio->uio_iovcnt));
	KASSERT(uio->uio_td->td_proc == curproc,
	    ("%s: uio proc (%p) is not curproc (%p)",
	    __func__, uio->uio_td->td_proc, curproc));

	map = &curproc->p_vmspace->vm_map;
	iov = &uio->uio_iov[0];
	start = trunc_page((uintptr_t)iov->iov_base);
	end = round_page((vm_offset_t)iov->iov_base + iov->iov_len);
	n = howmany(end - start, PAGE_SIZE);

	if (end - start > MAX_DDP_BUFFER_SIZE)
		return (E2BIG);

	pp = malloc(n * sizeof(vm_page_t), M_CXGBE, M_NOWAIT);
	if (pp == NULL)
		return (ENOMEM);

	if (vm_fault_quick_hold_pages(map, (vm_offset_t)iov->iov_base,
	    iov->iov_len, VM_PROT_WRITE, pp, n) < 0) {
		free(pp, M_CXGBE);
		return (EFAULT);
	}

	*ppages = pp;
	*pnpages = n;

	return (0);
}

static int
bufcmp(struct ddp_buffer *db, vm_page_t *pages, int npages, int offset, int len)
{
	int i;

	if (db == NULL || db->npages != npages || db->offset != offset ||
	    db->len != len)
		return (1);

	for (i = 0; i < npages; i++) {
		if (pages[i]->phys_addr != db->pages[i]->phys_addr)
			return (1);
	}

	return (0);
}

static int
calculate_hcf(int n1, int n2)
{
	int a, b, t;

	if (n1 <= n2) {
		a = n1;
		b = n2;
	} else {
		a = n2;
		b = n1;
	}

	while (a != 0) {
		t = a;
		a = b % a;
		b = t;
	}

	return (b);
}

static struct ddp_buffer *
alloc_ddp_buffer(struct tom_data *td, vm_page_t *pages, int npages, int offset,
    int len)
{
	int i, hcf, seglen, idx, ppod, nppods;
	struct ddp_buffer *db;

	/*
	 * The DDP page size is unrelated to the VM page size.  We combine
	 * contiguous physical pages into larger segments to get the best DDP
	 * page size possible.  This is the largest of the four sizes in
	 * A_ULP_RX_TDDP_PSZ that evenly divides the HCF of the segment sizes
	 * in the page list.
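	 *
	 * For example, if every physically contiguous run in the list is a
	 * multiple of 16K then so is the HCF, and a DDP page size of at least
	 * 16K is used; a single isolated 4K page anywhere in the list drops
	 * the selection back to 4K.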
	 */
	hcf = 0;
	for (i = 0; i < npages; i++) {
		seglen = PAGE_SIZE;
		while (i < npages - 1 &&
		    pages[i]->phys_addr + PAGE_SIZE == pages[i + 1]->phys_addr) {
			seglen += PAGE_SIZE;
			i++;
		}

		hcf = calculate_hcf(hcf, seglen);
		if (hcf < t4_ddp_pgsz[1]) {
			idx = 0;
			goto have_pgsz;	/* give up, short circuit */
		}
	}

	if (hcf % t4_ddp_pgsz[0] != 0) {
		/* hmmm.  This could only happen when PAGE_SIZE < 4K */
		KASSERT(PAGE_SIZE < 4096,
		    ("%s: PAGE_SIZE %d, hcf %d", __func__, PAGE_SIZE, hcf));
		CTR3(KTR_CXGBE, "%s: PAGE_SIZE %d, hcf %d",
		    __func__, PAGE_SIZE, hcf);
		return (NULL);
	}

	for (idx = nitems(t4_ddp_pgsz) - 1; idx > 0; idx--) {
		if (hcf % t4_ddp_pgsz[idx] == 0)
			break;
	}
have_pgsz:

	db = malloc(sizeof(*db), M_CXGBE, M_NOWAIT);
	if (db == NULL) {
		CTR1(KTR_CXGBE, "%s: malloc failed.", __func__);
		return (NULL);
	}

	nppods = pages_to_nppods(npages, t4_ddp_pgsz[idx]);
	ppod = alloc_ppods(td, nppods, &db->ppod_region);
	if (ppod < 0) {
		free(db, M_CXGBE);
		CTR4(KTR_CXGBE, "%s: no pods, nppods %d, resid %d, pgsz %d",
		    __func__, nppods, len, t4_ddp_pgsz[idx]);
		return (NULL);
	}

	KASSERT(idx <= M_PPOD_PGSZ && ppod <= M_PPOD_TAG,
	    ("%s: DDP pgsz_idx = %d, ppod = %d", __func__, idx, ppod));

	db->tag = V_PPOD_PGSZ(idx) | V_PPOD_TAG(ppod);
	db->nppods = nppods;
	db->npages = npages;
	db->pages = pages;
	db->offset = offset;
	db->len = len;

	CTR6(KTR_CXGBE, "New DDP buffer. "
	    "ddp_pgsz %d, ppod 0x%x, npages %d, nppods %d, offset %d, len %d",
	    t4_ddp_pgsz[idx], ppod, db->npages, db->nppods, db->offset,
	    db->len);

	return (db);
}

#define NUM_ULP_TX_SC_IMM_PPODS (256 / PPOD_SIZE)

static int
write_page_pods(struct adapter *sc, struct toepcb *toep, struct ddp_buffer *db)
{
	struct wrqe *wr;
	struct ulp_mem_io *ulpmc;
	struct ulptx_idata *ulpsc;
	struct pagepod *ppod;
	int i, j, k, n, chunk, len, ddp_pgsz, idx, ppod_addr;
	uint32_t cmd;

	cmd = htobe32(V_ULPTX_CMD(ULP_TX_MEM_WRITE));
	if (is_t4(sc))
		cmd |= htobe32(F_ULP_MEMIO_ORDER);
	else
		cmd |= htobe32(F_T5_ULP_MEMIO_IMM);
	ddp_pgsz = t4_ddp_pgsz[G_PPOD_PGSZ(db->tag)];
	ppod_addr = sc->vres.ddp.start + G_PPOD_TAG(db->tag) * PPOD_SIZE;
	for (i = 0; i < db->nppods; ppod_addr += chunk) {

		/* How many page pods are we writing in this cycle */
		n = min(db->nppods - i, NUM_ULP_TX_SC_IMM_PPODS);
		chunk = PPOD_SZ(n);
		len = roundup2(sizeof(*ulpmc) + sizeof(*ulpsc) + chunk, 16);

		wr = alloc_wrqe(len, toep->ctrlq);
		if (wr == NULL)
			return (ENOMEM);	/* ok to just bail out */
		ulpmc = wrtod(wr);

		INIT_ULPTX_WR(ulpmc, len, 0, 0);
		ulpmc->cmd = cmd;
		ulpmc->dlen = htobe32(V_ULP_MEMIO_DATA_LEN(chunk / 32));
		ulpmc->len16 = htobe32(howmany(len - sizeof(ulpmc->wr), 16));
		ulpmc->lock_addr = htobe32(V_ULP_MEMIO_ADDR(ppod_addr >> 5));

		ulpsc = (struct ulptx_idata *)(ulpmc + 1);
		ulpsc->cmd_more = htobe32(V_ULPTX_CMD(ULP_TX_SC_IMM));
		ulpsc->len = htobe32(chunk);

		ppod = (struct pagepod *)(ulpsc + 1);
		for (j = 0; j < n; i++, j++, ppod++) {
			ppod->vld_tid_pgsz_tag_color = htobe64(F_PPOD_VALID |
			    V_PPOD_TID(toep->tid) | db->tag);
			ppod->len_offset = htobe64(V_PPOD_LEN(db->len) |
			    V_PPOD_OFST(db->offset));
			ppod->rsvd = 0;
			idx = i * PPOD_PAGES * (ddp_pgsz / PAGE_SIZE);
			for (k = 0; k < nitems(ppod->addr); k++) {
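				/*
				 * Consecutive pod address slots are ddp_pgsz
				 * apart in the buffer; slots past the end of
				 * the page list are written as 0.
				 */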
				if (idx < db->npages) {
					ppod->addr[k] =
					    htobe64(db->pages[idx]->phys_addr);
					idx += ddp_pgsz / PAGE_SIZE;
				} else
					ppod->addr[k] = 0;
#if 0
				CTR5(KTR_CXGBE,
				    "%s: tid %d ppod[%d]->addr[%d] = %p",
				    __func__, toep->tid, i, k,
				    htobe64(ppod->addr[k]));
#endif
			}
		}

		t4_wrq_tx(sc, wr);
	}

	return (0);
}

/*
 * Reuse, or allocate (and program the page pods for) a new DDP buffer.  The
 * "pages" array is handed over to this function and should not be used in any
 * way by the caller after that.
 */
static int
select_ddp_buffer(struct adapter *sc, struct toepcb *toep, vm_page_t *pages,
    int npages, int db_off, int db_len)
{
	struct ddp_buffer *db;
	struct tom_data *td = sc->tom_softc;
	int i, empty_slot = -1;

	/* Try to reuse */
	for (i = 0; i < nitems(toep->db); i++) {
		if (bufcmp(toep->db[i], pages, npages, db_off, db_len) == 0) {
			free(pages, M_CXGBE);
			return (i);	/* pages still held */
		} else if (toep->db[i] == NULL && empty_slot < 0)
			empty_slot = i;
	}

	/* Allocate new buffer, write its page pods. */
	db = alloc_ddp_buffer(td, pages, npages, db_off, db_len);
	if (db == NULL) {
		vm_page_unhold_pages(pages, npages);
		free(pages, M_CXGBE);
		return (-1);
	}
	if (write_page_pods(sc, toep, db) != 0) {
		vm_page_unhold_pages(pages, npages);
		free_ddp_buffer(td, db);
		return (-1);
	}

	i = empty_slot;
	if (i < 0) {
		i = arc4random() % nitems(toep->db);
		free_ddp_buffer(td, toep->db[i]);
	}
	toep->db[i] = db;

	CTR5(KTR_CXGBE, "%s: tid %d, DDP buffer[%d] = %p (tag 0x%x)",
	    __func__, toep->tid, i, db, db->tag);

	return (i);
}

static void
wire_ddp_buffer(struct ddp_buffer *db)
{
	int i;
	vm_page_t p;

	for (i = 0; i < db->npages; i++) {
		p = db->pages[i];
		vm_page_lock(p);
		vm_page_wire(p);
		vm_page_unhold(p);
		vm_page_unlock(p);
	}
}

static void
unwire_ddp_buffer(struct ddp_buffer *db)
{
	int i;
	vm_page_t p;

	for (i = 0; i < db->npages; i++) {
		p = db->pages[i];
		vm_page_lock(p);
		vm_page_unwire(p, PQ_INACTIVE);
		vm_page_unlock(p);
	}
}

static int
handle_ddp(struct socket *so, struct uio *uio, int flags, int error)
{
	struct sockbuf *sb = &so->so_rcv;
	struct tcpcb *tp = so_sototcpcb(so);
	struct toepcb *toep = tp->t_toe;
	struct adapter *sc = td_adapter(toep->td);
	vm_page_t *pages;
	int npages, db_idx, rc, buf_flag;
	struct ddp_buffer *db;
	struct wrqe *wr;
	uint64_t ddp_flags;

	SOCKBUF_LOCK_ASSERT(sb);

#if 0
	if (sb->sb_cc + sc->tt.ddp_thres > uio->uio_resid) {
		CTR4(KTR_CXGBE, "%s: sb_cc %d, threshold %d, resid %d",
		    __func__, sb->sb_cc, sc->tt.ddp_thres, uio->uio_resid);
	}
#endif

	/* XXX: too eager to disable DDP, could handle NBIO better than this. */
	if (sbused(sb) >= uio->uio_resid || uio->uio_resid < sc->tt.ddp_thres ||
	    uio->uio_resid > MAX_DDP_BUFFER_SIZE || uio->uio_iovcnt > 1 ||
	    so->so_state & SS_NBIO || flags & (MSG_DONTWAIT | MSG_NBIO) ||
	    error || so->so_error || sb->sb_state & SBS_CANTRCVMORE)
		goto no_ddp;

	/*
	 * Fault in and then hold the pages of the uio buffers.  We'll wire
	 * them a bit later if everything else works out.
	 */
	SOCKBUF_UNLOCK(sb);
	if (hold_uio(uio, &pages, &npages) != 0) {
		SOCKBUF_LOCK(sb);
		goto no_ddp;
	}
	SOCKBUF_LOCK(sb);
	if (__predict_false(so->so_error || sb->sb_state & SBS_CANTRCVMORE)) {
		vm_page_unhold_pages(pages, npages);
		free(pages, M_CXGBE);
		goto no_ddp;
	}

	/*
	 * Figure out which one of the two DDP buffers to use this time.
	 */
	db_idx = select_ddp_buffer(sc, toep, pages, npages,
	    (uintptr_t)uio->uio_iov->iov_base & PAGE_MASK, uio->uio_resid);
	pages = NULL;	/* handed off to select_ddp_buffer */
	if (db_idx < 0)
		goto no_ddp;
	db = toep->db[db_idx];
	buf_flag = db_idx == 0 ? DDP_BUF0_ACTIVE : DDP_BUF1_ACTIVE;

	/*
	 * Build the compound work request that tells the chip where to DMA the
	 * payload.
	 */
	ddp_flags = select_ddp_flags(so, flags, db_idx);
	wr = mk_update_tcb_for_ddp(sc, toep, db_idx, sbused(sb), ddp_flags);
	if (wr == NULL) {
		/*
		 * Just unhold the pages.  The DDP buffer's software state is
		 * left as-is in the toep.  The page pods were written
		 * successfully and we may have an opportunity to use it in the
		 * future.
		 */
		vm_page_unhold_pages(db->pages, db->npages);
		goto no_ddp;
	}

	/* Wire (and then unhold) the pages, and give the chip the go-ahead. */
	wire_ddp_buffer(db);
	t4_wrq_tx(sc, wr);
	sb->sb_flags &= ~SB_DDP_INDICATE;
	toep->ddp_flags |= buf_flag;

	/*
	 * Wait for the DDP operation to complete and then unwire the pages.
	 * The return code from the sbwait will be the final return code of
	 * this function.  But we do need to wait for DDP no matter what.
	 */
	rc = sbwait(sb);
	while (toep->ddp_flags & buf_flag) {
		sb->sb_flags |= SB_WAIT;
		msleep(&sb->sb_cc, &sb->sb_mtx, PSOCK, "sbwait", 0);
	}
	unwire_ddp_buffer(db);
	return (rc);
no_ddp:
	disable_ddp(sc, toep);
	discourage_ddp(toep);
	sb->sb_flags &= ~SB_DDP_INDICATE;
	return (0);
}

void
t4_init_ddp(struct adapter *sc, struct tom_data *td)
{
	int nppods = sc->vres.ddp.size / PPOD_SIZE;

	td->nppods = nppods;
	td->nppods_free = nppods;
	td->nppods_free_head = nppods;
	TAILQ_INIT(&td->ppods);
	mtx_init(&td->ppod_lock, "page pods", NULL, MTX_DEF);

	t4_register_cpl_handler(sc, CPL_RX_DATA_DDP, do_rx_data_ddp);
	t4_register_cpl_handler(sc, CPL_RX_DDP_COMPLETE, do_rx_ddp_complete);
}

void
t4_uninit_ddp(struct adapter *sc __unused, struct tom_data *td)
{

	KASSERT(td->nppods == td->nppods_free,
	    ("%s: page pods still in use, nppods = %d, free = %d",
	    __func__, td->nppods, td->nppods_free));

	if (mtx_initialized(&td->ppod_lock))
		mtx_destroy(&td->ppod_lock);
}

#define	VNET_SO_ASSERT(so)						\
	VNET_ASSERT(curvnet != NULL,					\
	    ("%s:%d curvnet is NULL, so=%p", __func__, __LINE__, (so)));
#define	SBLOCKWAIT(f)	(((f) & MSG_DONTWAIT) ? 0 : SBL_WAIT)
static int
soreceive_rcvoob(struct socket *so, struct uio *uio, int flags)
{

	CXGBE_UNIMPLEMENTED(__func__);
}

static char ddp_magic_str[] = "nothing to see here";

struct mbuf *
get_ddp_mbuf(int len)
{
	struct mbuf *m;

	m = m_get(M_NOWAIT, MT_DATA);
	if (m == NULL)
		CXGBE_UNIMPLEMENTED("mbuf alloc failure");
	m->m_len = len;
	m->m_data = &ddp_magic_str[0];

	return (m);
}

static inline int
is_ddp_mbuf(struct mbuf *m)
{

	return (m->m_data == &ddp_magic_str[0]);
}

/*
 * Copy an mbuf chain into a uio limited by len if set.
 */
static int
m_mbuftouio_ddp(struct uio *uio, struct mbuf *m, int len)
{
	int error, length, total;
	int progress = 0;

	if (len > 0)
		total = min(uio->uio_resid, len);
	else
		total = uio->uio_resid;

	/* Fill the uio with data from the mbufs. */
	for (; m != NULL; m = m->m_next) {
		length = min(m->m_len, total - progress);

		if (is_ddp_mbuf(m)) {
			enum uio_seg segflag = uio->uio_segflg;

			uio->uio_segflg = UIO_NOCOPY;
			error = uiomove(mtod(m, void *), length, uio);
			uio->uio_segflg = segflag;
		} else
			error = uiomove(mtod(m, void *), length, uio);
		if (error)
			return (error);

		progress += length;
	}

	return (0);
}

/*
 * Based on soreceive_stream() in uipc_socket.c
 */
int
t4_soreceive_ddp(struct socket *so, struct sockaddr **psa, struct uio *uio,
    struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
{
	int len = 0, error = 0, flags, oresid, ddp_handled = 0;
	struct sockbuf *sb;
	struct mbuf *m, *n = NULL;

	/* We only do stream sockets. */
	if (so->so_type != SOCK_STREAM)
		return (EINVAL);
	if (psa != NULL)
		*psa = NULL;
	if (controlp != NULL)
		return (EINVAL);
	if (flagsp != NULL)
		flags = *flagsp &~ MSG_EOR;
	else
		flags = 0;
	if (flags & MSG_OOB)
		return (soreceive_rcvoob(so, uio, flags));
	if (mp0 != NULL)
		*mp0 = NULL;

	sb = &so->so_rcv;

	/* Prevent other readers from entering the socket. */
	error = sblock(sb, SBLOCKWAIT(flags));
	if (error)
		goto out;
	SOCKBUF_LOCK(sb);

	/* Easy one, no space to copyout anything. */
	if (uio->uio_resid == 0) {
		error = EINVAL;
		goto out;
	}
	oresid = uio->uio_resid;

	/* We will never ever get anything unless we are or were connected. */
	if (!(so->so_state & (SS_ISCONNECTED|SS_ISDISCONNECTED))) {
		error = ENOTCONN;
		goto out;
	}

restart:
	SOCKBUF_LOCK_ASSERT(&so->so_rcv);

	if (sb->sb_flags & SB_DDP_INDICATE && !ddp_handled) {

		/* uio should be just as it was at entry */
		KASSERT(oresid == uio->uio_resid,
		    ("%s: oresid = %d, uio_resid = %zd, sbused = %d",
		    __func__, oresid, uio->uio_resid, sbused(sb)));

		error = handle_ddp(so, uio, flags, 0);
		ddp_handled = 1;
		if (error)
			goto out;
	}

	/* Abort if socket has reported problems. */
	if (so->so_error) {
		if (sbused(sb))
			goto deliver;
		if (oresid > uio->uio_resid)
			goto out;
		error = so->so_error;
		if (!(flags & MSG_PEEK))
			so->so_error = 0;
		goto out;
	}

	/* Door is closed.  Deliver what is left, if any. */
	if (sb->sb_state & SBS_CANTRCVMORE) {
		if (sbused(sb))
			goto deliver;
		else
			goto out;
	}

	/* Socket buffer is empty and we shall not block. */
	if (sbused(sb) == 0 &&
	    ((so->so_state & SS_NBIO) || (flags & (MSG_DONTWAIT|MSG_NBIO)))) {
		error = EAGAIN;
		goto out;
	}

	/* Socket buffer got some data that we shall deliver now. */
	if (sbused(sb) && !(flags & MSG_WAITALL) &&
	    ((so->so_state & SS_NBIO) ||
	     (flags & (MSG_DONTWAIT|MSG_NBIO)) ||
	     sbused(sb) >= sb->sb_lowat ||
	     sbused(sb) >= uio->uio_resid ||
	     sbused(sb) >= sb->sb_hiwat)) {
		goto deliver;
	}

	/* On MSG_WAITALL we must wait until all data or error arrives. */
	if ((flags & MSG_WAITALL) &&
	    (sbused(sb) >= uio->uio_resid || sbused(sb) >= sb->sb_lowat))
		goto deliver;

	/*
	 * Wait and block until (more) data comes in.
	 * NB: Drops the sockbuf lock during wait.
	 */
	error = sbwait(sb);
	if (error) {
		if (sb->sb_flags & SB_DDP_INDICATE && !ddp_handled) {
			(void) handle_ddp(so, uio, flags, 1);
			ddp_handled = 1;
		}
		goto out;
	}
	goto restart;

deliver:
	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
	KASSERT(sbused(sb) > 0, ("%s: sockbuf empty", __func__));
	KASSERT(sb->sb_mb != NULL, ("%s: sb_mb == NULL", __func__));

	if (sb->sb_flags & SB_DDP_INDICATE && !ddp_handled)
		goto restart;

	/* Statistics. */
	if (uio->uio_td)
		uio->uio_td->td_ru.ru_msgrcv++;

	/* Fill uio until full or current end of socket buffer is reached. */
	len = min(uio->uio_resid, sbused(sb));
	if (mp0 != NULL) {
		/* Dequeue as many mbufs as possible. */
		if (!(flags & MSG_PEEK) && len >= sb->sb_mb->m_len) {
			for (*mp0 = m = sb->sb_mb;
			     m != NULL && m->m_len <= len;
			     m = m->m_next) {
				len -= m->m_len;
				uio->uio_resid -= m->m_len;
				sbfree(sb, m);
				n = m;
			}
			sb->sb_mb = m;
			if (sb->sb_mb == NULL)
				SB_EMPTY_FIXUP(sb);
			n->m_next = NULL;
		}
		/* Copy the remainder. */
		if (len > 0) {
			KASSERT(sb->sb_mb != NULL,
			    ("%s: len > 0 && sb->sb_mb empty", __func__));

			m = m_copym(sb->sb_mb, 0, len, M_NOWAIT);
			if (m == NULL)
				len = 0;	/* Don't flush data from sockbuf. */
			else
				uio->uio_resid -= m->m_len;
			if (*mp0 != NULL)
				n->m_next = m;
			else
				*mp0 = m;
			if (*mp0 == NULL) {
				error = ENOBUFS;
				goto out;
			}
		}
	} else {
		/* NB: Must unlock socket buffer as uiomove may sleep. */
		SOCKBUF_UNLOCK(sb);
		error = m_mbuftouio_ddp(uio, sb->sb_mb, len);
		SOCKBUF_LOCK(sb);
		if (error)
			goto out;
	}
	SBLASTRECORDCHK(sb);
	SBLASTMBUFCHK(sb);

	/*
	 * Remove the delivered data from the socket buffer unless we
	 * were only peeking.
	 */
	if (!(flags & MSG_PEEK)) {
		if (len > 0)
			sbdrop_locked(sb, len);

		/* Notify protocol that we drained some data. */
		if ((so->so_proto->pr_flags & PR_WANTRCVD) &&
		    (((flags & MSG_WAITALL) && uio->uio_resid > 0) ||
		     !(flags & MSG_SOCALLBCK))) {
			SOCKBUF_UNLOCK(sb);
			VNET_SO_ASSERT(so);
			(*so->so_proto->pr_usrreqs->pru_rcvd)(so, flags);
			SOCKBUF_LOCK(sb);
		}
	}

	/*
	 * For MSG_WAITALL we may have to loop again and wait for
	 * more data to come in.
	 */
	if ((flags & MSG_WAITALL) && uio->uio_resid > 0)
		goto restart;
out:
	SOCKBUF_LOCK_ASSERT(sb);
	SBLASTRECORDCHK(sb);
	SBLASTMBUFCHK(sb);
	SOCKBUF_UNLOCK(sb);
	sbunlock(sb);
	return (error);
}

#endif