/*-
 * Copyright (c) 2012 Chelsio Communications, Inc.
 * All rights reserved.
 * Written by: Navdeep Parhar <np@FreeBSD.org>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_inet.h"

#include <sys/param.h>
#include <sys/types.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/ktr.h>
#include <sys/module.h>
#include <sys/protosw.h>
#include <sys/proc.h>
#include <sys/domain.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/uio.h>
#include <netinet/in.h>
#include <netinet/in_pcb.h>
#include <netinet/ip.h>
#include <netinet/tcp_var.h>
#define TCPSTATES
#include <netinet/tcp_fsm.h>
#include <netinet/toecore.h>

#include <vm/vm.h>
#include <vm/vm_extern.h>
#include <vm/vm_param.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_page.h>
#include <vm/vm_object.h>

#ifdef TCP_OFFLOAD
#include "common/common.h"
#include "common/t4_msg.h"
#include "common/t4_regs.h"
#include "common/t4_tcb.h"
#include "tom/t4_tom.h"

#define PPOD_SZ(n)	((n) * sizeof(struct pagepod))
#define PPOD_SIZE	(PPOD_SZ(1))

/* XXX: must match A_ULP_RX_TDDP_PSZ */
static int t4_ddp_pgsz[] = {4096, 4096 << 2, 4096 << 4, 4096 << 6};

#if 0
static void
t4_dump_tcb(struct adapter *sc, int tid)
{
        uint32_t tcb_base, off, i, j;

        /* Dump TCB for the tid */
        tcb_base = t4_read_reg(sc, A_TP_CMM_TCB_BASE);
        t4_write_reg(sc, PCIE_MEM_ACCESS_REG(A_PCIE_MEM_ACCESS_OFFSET, 2),
            tcb_base + tid * TCB_SIZE);
        t4_read_reg(sc, PCIE_MEM_ACCESS_REG(A_PCIE_MEM_ACCESS_OFFSET, 2));
        off = 0;
        printf("\n");
        for (i = 0; i < 4; i++) {
                uint32_t buf[8];
                for (j = 0; j < 8; j++, off += 4)
                        buf[j] = htonl(t4_read_reg(sc, MEMWIN2_BASE + off));

                printf("%08x %08x %08x %08x %08x %08x %08x %08x\n",
                    buf[0], buf[1], buf[2], buf[3], buf[4], buf[5], buf[6],
                    buf[7]);
        }
}
#endif

#define MAX_DDP_BUFFER_SIZE	(M_TCB_RX_DDP_BUF0_LEN)
static int
alloc_ppods(struct tom_data *td, int n, u_int *ppod_addr)
{
        vmem_addr_t v;
        int rc;

        MPASS(n > 0);

        rc = vmem_alloc(td->ppod_arena, PPOD_SZ(n), M_NOWAIT | M_FIRSTFIT, &v);
        *ppod_addr = (u_int)v;

        return (rc);
}
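/*
 * Added commentary (not in the original file): page pod addresses are byte
 * offsets into the adapter's ULP_RX memory region, so allocating n pods
 * consumes PPOD_SZ(n) bytes of the vmem arena.  A minimal usage sketch,
 * assuming the 64-byte struct pagepod from t4_msg.h and a hypothetical
 * in-scope tom_data pointer "td":
 */
#if 0
        u_int ppod_addr;

        if (alloc_ppods(td, 4, &ppod_addr) == 0) {
                /* 4 pods == PPOD_SZ(4) == 256 bytes of ULP_RX memory. */
                free_ppods(td, ppod_addr, 4);
        }
#endif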
static void
free_ppods(struct tom_data *td, u_int ppod_addr, int n)
{

        MPASS(n > 0);

        vmem_free(td->ppod_arena, (vmem_addr_t)ppod_addr, PPOD_SZ(n));
}

static inline int
pages_to_nppods(int npages, int ddp_pgsz)
{
        int nsegs = npages * PAGE_SIZE / ddp_pgsz;

        return (howmany(nsegs, PPOD_PAGES));
}

static void
free_ddp_buffer(struct tom_data *td, struct ddp_buffer *db)
{

        if (db == NULL)
                return;

        if (db->pages)
                free(db->pages, M_CXGBE);

        if (db->nppods > 0)
                free_ppods(td, db->ppod_addr, db->nppods);

        free(db, M_CXGBE);
}

void
release_ddp_resources(struct toepcb *toep)
{
        int i;

        for (i = 0; i < nitems(toep->db); i++) {
                if (toep->db[i] != NULL) {
                        free_ddp_buffer(toep->td, toep->db[i]);
                        toep->db[i] = NULL;
                }
        }
}

/* XXX: handle_ddp_data code duplication */
void
insert_ddp_data(struct toepcb *toep, uint32_t n)
{
        struct inpcb *inp = toep->inp;
        struct tcpcb *tp = intotcpcb(inp);
        struct sockbuf *sb = &inp->inp_socket->so_rcv;
        struct mbuf *m;

        INP_WLOCK_ASSERT(inp);
        SOCKBUF_LOCK_ASSERT(sb);

        m = get_ddp_mbuf(n);
        tp->rcv_nxt += n;
#ifndef USE_DDP_RX_FLOW_CONTROL
        KASSERT(tp->rcv_wnd >= n, ("%s: negative window size", __func__));
        tp->rcv_wnd -= n;
#endif

        KASSERT(toep->sb_cc >= sbused(sb),
            ("%s: sb %p has more data (%d) than last time (%d).",
            __func__, sb, sbused(sb), toep->sb_cc));
        toep->rx_credits += toep->sb_cc - sbused(sb);
#ifdef USE_DDP_RX_FLOW_CONTROL
        toep->rx_credits -= n;	/* adjust for F_RX_FC_DDP */
#endif
        sbappendstream_locked(sb, m, 0);
        toep->sb_cc = sbused(sb);
}

/* SET_TCB_FIELD sent as a ULP command looks like this */
#define LEN__SET_TCB_FIELD_ULP (sizeof(struct ulp_txpkt) + \
    sizeof(struct ulptx_idata) + sizeof(struct cpl_set_tcb_field_core))

/* RX_DATA_ACK sent as a ULP command looks like this */
#define LEN__RX_DATA_ACK_ULP (sizeof(struct ulp_txpkt) + \
    sizeof(struct ulptx_idata) + sizeof(struct cpl_rx_data_ack_core))

static inline void *
mk_set_tcb_field_ulp(struct ulp_txpkt *ulpmc, struct toepcb *toep,
    uint64_t word, uint64_t mask, uint64_t val)
{
        struct ulptx_idata *ulpsc;
        struct cpl_set_tcb_field_core *req;

        ulpmc->cmd_dest = htonl(V_ULPTX_CMD(ULP_TX_PKT) | V_ULP_TXPKT_DEST(0));
        ulpmc->len = htobe32(howmany(LEN__SET_TCB_FIELD_ULP, 16));

        ulpsc = (struct ulptx_idata *)(ulpmc + 1);
        ulpsc->cmd_more = htobe32(V_ULPTX_CMD(ULP_TX_SC_IMM));
        ulpsc->len = htobe32(sizeof(*req));

        req = (struct cpl_set_tcb_field_core *)(ulpsc + 1);
        OPCODE_TID(req) = htobe32(MK_OPCODE_TID(CPL_SET_TCB_FIELD, toep->tid));
        req->reply_ctrl = htobe16(V_NO_REPLY(1) |
            V_QUEUENO(toep->ofld_rxq->iq.abs_id));
        req->word_cookie = htobe16(V_WORD(word) | V_COOKIE(0));
        req->mask = htobe64(mask);
        req->val = htobe64(val);

        ulpsc = (struct ulptx_idata *)(req + 1);
        if (LEN__SET_TCB_FIELD_ULP % 16) {
                ulpsc->cmd_more = htobe32(V_ULPTX_CMD(ULP_TX_SC_NOOP));
                ulpsc->len = htobe32(0);
                return (ulpsc + 1);
        }
        return (ulpsc);
}
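/*
 * Added commentary (not in the original file): every ULPTX sub-command must
 * end on a 16-byte boundary.  When LEN__SET_TCB_FIELD_ULP or
 * LEN__RX_DATA_ACK_ULP is not a multiple of 16 -- e.g. 8 + 8 + 24 = 40 bytes
 * for SET_TCB_FIELD, assuming the usual t4_msg.h struct layouts -- an 8-byte
 * ULP_TX_SC_NOOP is appended as padding, which is why the builders above and
 * below return a pointer just past the NOOP in that case.
 */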
static inline void *
mk_rx_data_ack_ulp(struct ulp_txpkt *ulpmc, struct toepcb *toep)
{
        struct ulptx_idata *ulpsc;
        struct cpl_rx_data_ack_core *req;

        ulpmc->cmd_dest = htonl(V_ULPTX_CMD(ULP_TX_PKT) | V_ULP_TXPKT_DEST(0));
        ulpmc->len = htobe32(howmany(LEN__RX_DATA_ACK_ULP, 16));

        ulpsc = (struct ulptx_idata *)(ulpmc + 1);
        ulpsc->cmd_more = htobe32(V_ULPTX_CMD(ULP_TX_SC_IMM));
        ulpsc->len = htobe32(sizeof(*req));

        req = (struct cpl_rx_data_ack_core *)(ulpsc + 1);
        OPCODE_TID(req) = htobe32(MK_OPCODE_TID(CPL_RX_DATA_ACK, toep->tid));
        req->credit_dack = htobe32(F_RX_MODULATE_RX);

        ulpsc = (struct ulptx_idata *)(req + 1);
        if (LEN__RX_DATA_ACK_ULP % 16) {
                ulpsc->cmd_more = htobe32(V_ULPTX_CMD(ULP_TX_SC_NOOP));
                ulpsc->len = htobe32(0);
                return (ulpsc + 1);
        }
        return (ulpsc);
}

static inline uint64_t
select_ddp_flags(struct socket *so, int flags, int db_idx)
{
        uint64_t ddp_flags = V_TF_DDP_INDICATE_OUT(0);
        int waitall = flags & MSG_WAITALL;
        int nb = so->so_state & SS_NBIO || flags & (MSG_DONTWAIT | MSG_NBIO);

        KASSERT(db_idx == 0 || db_idx == 1,
            ("%s: bad DDP buffer index %d", __func__, db_idx));

        if (db_idx == 0) {
                ddp_flags |= V_TF_DDP_BUF0_VALID(1) | V_TF_DDP_ACTIVE_BUF(0);
                if (waitall)
                        ddp_flags |= V_TF_DDP_PUSH_DISABLE_0(1);
                else if (nb)
                        ddp_flags |= V_TF_DDP_BUF0_FLUSH(1);
                else
                        ddp_flags |= V_TF_DDP_BUF0_FLUSH(0);
        } else {
                ddp_flags |= V_TF_DDP_BUF1_VALID(1) | V_TF_DDP_ACTIVE_BUF(1);
                if (waitall)
                        ddp_flags |= V_TF_DDP_PUSH_DISABLE_1(1);
                else if (nb)
                        ddp_flags |= V_TF_DDP_BUF1_FLUSH(1);
                else
                        ddp_flags |= V_TF_DDP_BUF1_FLUSH(0);
        }

        return (ddp_flags);
}
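/*
 * Added commentary (not in the original file): the intent of the flag
 * selection above appears to be that MSG_WAITALL disables PSH-triggered
 * completion so the chip keeps filling the buffer, a non-blocking read
 * requests a flush so whatever has arrived is completed promptly, and a
 * plain blocking read leaves both off, letting the buffer complete on a PSH
 * or when it fills.
 */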
static struct wrqe *
mk_update_tcb_for_ddp(struct adapter *sc, struct toepcb *toep, int db_idx,
    int offset, uint64_t ddp_flags)
{
        struct ddp_buffer *db = toep->db[db_idx];
        struct wrqe *wr;
        struct work_request_hdr *wrh;
        struct ulp_txpkt *ulpmc;
        int len;

        KASSERT(db_idx == 0 || db_idx == 1,
            ("%s: bad DDP buffer index %d", __func__, db_idx));

        /*
         * We'll send a compound work request that has 3 SET_TCB_FIELDs and an
         * RX_DATA_ACK (with RX_MODULATE to speed up delivery).
         *
         * The work request header is 16B and always ends at a 16B boundary.
         * The ULPTX master commands that follow must all end at 16B boundaries
         * too, so each command's size is rounded up to a multiple of 16.
         */
        len = sizeof(*wrh) + 3 * roundup2(LEN__SET_TCB_FIELD_ULP, 16) +
            roundup2(LEN__RX_DATA_ACK_ULP, 16);

        wr = alloc_wrqe(len, toep->ctrlq);
        if (wr == NULL)
                return (NULL);
        wrh = wrtod(wr);
        INIT_ULPTX_WRH(wrh, len, 1, 0);	/* atomic */
        ulpmc = (struct ulp_txpkt *)(wrh + 1);

        /* Write the buffer's tag */
        ulpmc = mk_set_tcb_field_ulp(ulpmc, toep,
            W_TCB_RX_DDP_BUF0_TAG + db_idx,
            V_TCB_RX_DDP_BUF0_TAG(M_TCB_RX_DDP_BUF0_TAG),
            V_TCB_RX_DDP_BUF0_TAG(db->tag));

        /* Update the current offset in the DDP buffer and its total length */
        if (db_idx == 0)
                ulpmc = mk_set_tcb_field_ulp(ulpmc, toep,
                    W_TCB_RX_DDP_BUF0_OFFSET,
                    V_TCB_RX_DDP_BUF0_OFFSET(M_TCB_RX_DDP_BUF0_OFFSET) |
                    V_TCB_RX_DDP_BUF0_LEN(M_TCB_RX_DDP_BUF0_LEN),
                    V_TCB_RX_DDP_BUF0_OFFSET(offset) |
                    V_TCB_RX_DDP_BUF0_LEN(db->len));
        else
                ulpmc = mk_set_tcb_field_ulp(ulpmc, toep,
                    W_TCB_RX_DDP_BUF1_OFFSET,
                    V_TCB_RX_DDP_BUF1_OFFSET(M_TCB_RX_DDP_BUF1_OFFSET) |
                    V_TCB_RX_DDP_BUF1_LEN((u64)M_TCB_RX_DDP_BUF1_LEN << 32),
                    V_TCB_RX_DDP_BUF1_OFFSET(offset) |
                    V_TCB_RX_DDP_BUF1_LEN((u64)db->len << 32));

        /* Update DDP flags */
        ulpmc = mk_set_tcb_field_ulp(ulpmc, toep, W_TCB_RX_DDP_FLAGS,
            V_TF_DDP_BUF0_FLUSH(1) | V_TF_DDP_BUF1_FLUSH(1) |
            V_TF_DDP_PUSH_DISABLE_0(1) | V_TF_DDP_PUSH_DISABLE_1(1) |
            V_TF_DDP_BUF0_VALID(1) | V_TF_DDP_BUF1_VALID(1) |
            V_TF_DDP_ACTIVE_BUF(1) | V_TF_DDP_INDICATE_OUT(1), ddp_flags);

        /* Gratuitous RX_DATA_ACK with RX_MODULATE set to speed up delivery. */
        ulpmc = mk_rx_data_ack_ulp(ulpmc, toep);

        return (wr);
}

static void
discourage_ddp(struct toepcb *toep)
{

        if (toep->ddp_score && --toep->ddp_score == 0) {
                toep->ddp_flags &= ~DDP_OK;
                toep->ddp_disabled = time_uptime;
                CTR3(KTR_CXGBE, "%s: tid %u !DDP_OK @ %u",
                    __func__, toep->tid, time_uptime);
        }
}
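/*
 * Added commentary (not in the original file): ddp_score acts as a simple
 * hysteresis counter.  Every pass that fails to use DDP decrements it, while
 * a completed DDP buffer (F_DDP_BUF_COMPLETE in handle_ddp_data below)
 * resets it to DDP_HIGH_SCORE.  Only when it reaches zero is DDP_OK cleared
 * and the connection marked ineligible, with the time recorded in
 * ddp_disabled.
 */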
static int
handle_ddp_data(struct toepcb *toep, __be32 ddp_report, __be32 rcv_nxt, int len)
{
        uint32_t report = be32toh(ddp_report);
        unsigned int db_flag;
        struct inpcb *inp = toep->inp;
        struct tcpcb *tp;
        struct socket *so;
        struct sockbuf *sb;
        struct mbuf *m;

        db_flag = report & F_DDP_BUF_IDX ? DDP_BUF1_ACTIVE : DDP_BUF0_ACTIVE;

        if (__predict_false(!(report & F_DDP_INV)))
                CXGBE_UNIMPLEMENTED("DDP buffer still valid");

        INP_WLOCK(inp);
        so = inp_inpcbtosocket(inp);
        sb = &so->so_rcv;
        if (__predict_false(inp->inp_flags & (INP_DROPPED | INP_TIMEWAIT))) {

                /*
                 * XXX: think a bit more.
                 * tcpcb probably gone, but socket should still be around
                 * because we always wait for DDP completion in soreceive no
                 * matter what.  Just wake it up and let it clean up.
                 */

                CTR5(KTR_CXGBE, "%s: tid %u, seq 0x%x, len %d, inp_flags 0x%x",
                    __func__, toep->tid, be32toh(rcv_nxt), len, inp->inp_flags);
                SOCKBUF_LOCK(sb);
                goto wakeup;
        }

        tp = intotcpcb(inp);
        len += be32toh(rcv_nxt) - tp->rcv_nxt;
        tp->rcv_nxt += len;
        tp->t_rcvtime = ticks;
#ifndef USE_DDP_RX_FLOW_CONTROL
        KASSERT(tp->rcv_wnd >= len, ("%s: negative window size", __func__));
        tp->rcv_wnd -= len;
#endif
        m = get_ddp_mbuf(len);

        SOCKBUF_LOCK(sb);
        if (report & F_DDP_BUF_COMPLETE)
                toep->ddp_score = DDP_HIGH_SCORE;
        else
                discourage_ddp(toep);

        KASSERT(toep->sb_cc >= sbused(sb),
            ("%s: sb %p has more data (%d) than last time (%d).",
            __func__, sb, sbused(sb), toep->sb_cc));
        toep->rx_credits += toep->sb_cc - sbused(sb);
#ifdef USE_DDP_RX_FLOW_CONTROL
        toep->rx_credits -= len;	/* adjust for F_RX_FC_DDP */
#endif
        sbappendstream_locked(sb, m, 0);
        toep->sb_cc = sbused(sb);
wakeup:
        KASSERT(toep->ddp_flags & db_flag,
            ("%s: DDP buffer not active. toep %p, ddp_flags 0x%x, report 0x%x",
            __func__, toep, toep->ddp_flags, report));
        toep->ddp_flags &= ~db_flag;
        sorwakeup_locked(so);
        SOCKBUF_UNLOCK_ASSERT(sb);

        INP_WUNLOCK(inp);
        return (0);
}

#define DDP_ERR (F_DDP_PPOD_MISMATCH | F_DDP_LLIMIT_ERR | F_DDP_ULIMIT_ERR |\
    F_DDP_PPOD_PARITY_ERR | F_DDP_PADDING_ERR | F_DDP_OFFSET_ERR |\
    F_DDP_INVALID_TAG | F_DDP_COLOR_ERR | F_DDP_TID_MISMATCH |\
    F_DDP_INVALID_PPOD | F_DDP_HDRCRC_ERR | F_DDP_DATACRC_ERR)

static int
do_rx_data_ddp(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
{
        struct adapter *sc = iq->adapter;
        const struct cpl_rx_data_ddp *cpl = (const void *)(rss + 1);
        unsigned int tid = GET_TID(cpl);
        uint32_t vld;
        struct toepcb *toep = lookup_tid(sc, tid);
        struct tom_data *td = toep->td;

        KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__));
        KASSERT(toep->tid == tid, ("%s: toep tid/atid mismatch", __func__));
        KASSERT(!(toep->flags & TPF_SYNQE),
            ("%s: toep %p claims to be a synq entry", __func__, toep));

        vld = be32toh(cpl->ddpvld);
        if (__predict_false(vld & DDP_ERR)) {
                panic("%s: DDP error 0x%x (tid %d, toep %p)",
                    __func__, vld, tid, toep);
        }
        if (toep->ulp_mode == ULP_MODE_ISCSI) {
                m = m_get(M_NOWAIT, MT_DATA);
                if (m == NULL)
                        CXGBE_UNIMPLEMENTED("mbuf alloc failure");
                memcpy(mtod(m, unsigned char *), cpl,
                    sizeof(struct cpl_rx_data_ddp));
                if (!t4_cpl_iscsi_callback(td, toep, m, CPL_RX_DATA_DDP))
                        return (0);
                m_freem(m);
        }

        handle_ddp_data(toep, cpl->u.ddp_report, cpl->seq, be16toh(cpl->len));

        return (0);
}

static int
do_rx_ddp_complete(struct sge_iq *iq, const struct rss_header *rss,
    struct mbuf *m)
{
        struct adapter *sc = iq->adapter;
        const struct cpl_rx_ddp_complete *cpl = (const void *)(rss + 1);
        unsigned int tid = GET_TID(cpl);
        struct toepcb *toep = lookup_tid(sc, tid);

        KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__));
        KASSERT(toep->tid == tid, ("%s: toep tid/atid mismatch", __func__));
        KASSERT(!(toep->flags & TPF_SYNQE),
            ("%s: toep %p claims to be a synq entry", __func__, toep));

        handle_ddp_data(toep, cpl->ddp_report, cpl->rcv_nxt, 0);

        return (0);
}
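/*
 * Added commentary (not in the original file): the per-connection DDP state
 * moves roughly as DDP_OK -> (DDP_OK | DDP_SC_REQ) -> DDP_ON.  enable_ddp()
 * and disable_ddp() below both set DDP_SC_REQ to mark a state-change request
 * in flight before updating the TCB; the KASSERTs at their entry points
 * encode the legal starting states.
 */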
void
enable_ddp(struct adapter *sc, struct toepcb *toep)
{

        KASSERT((toep->ddp_flags & (DDP_ON | DDP_OK | DDP_SC_REQ)) == DDP_OK,
            ("%s: toep %p has bad ddp_flags 0x%x",
            __func__, toep, toep->ddp_flags));

        CTR3(KTR_CXGBE, "%s: tid %u (time %u)",
            __func__, toep->tid, time_uptime);

        toep->ddp_flags |= DDP_SC_REQ;
        t4_set_tcb_field(sc, toep, 1, W_TCB_RX_DDP_FLAGS,
            V_TF_DDP_OFF(1) | V_TF_DDP_INDICATE_OUT(1) |
            V_TF_DDP_BUF0_INDICATE(1) | V_TF_DDP_BUF1_INDICATE(1) |
            V_TF_DDP_BUF0_VALID(1) | V_TF_DDP_BUF1_VALID(1),
            V_TF_DDP_BUF0_INDICATE(1) | V_TF_DDP_BUF1_INDICATE(1));
        t4_set_tcb_field(sc, toep, 1, W_TCB_T_FLAGS,
            V_TF_RCV_COALESCE_ENABLE(1), 0);
}

static inline void
disable_ddp(struct adapter *sc, struct toepcb *toep)
{

        KASSERT((toep->ddp_flags & (DDP_ON | DDP_SC_REQ)) == DDP_ON,
            ("%s: toep %p has bad ddp_flags 0x%x",
            __func__, toep, toep->ddp_flags));

        CTR3(KTR_CXGBE, "%s: tid %u (time %u)",
            __func__, toep->tid, time_uptime);

        toep->ddp_flags |= DDP_SC_REQ;
        t4_set_tcb_field(sc, toep, 1, W_TCB_T_FLAGS,
            V_TF_RCV_COALESCE_ENABLE(1), V_TF_RCV_COALESCE_ENABLE(1));
        t4_set_tcb_field(sc, toep, 1, W_TCB_RX_DDP_FLAGS, V_TF_DDP_OFF(1),
            V_TF_DDP_OFF(1));
}

static int
hold_uio(struct uio *uio, vm_page_t **ppages, int *pnpages)
{
        struct vm_map *map;
        struct iovec *iov;
        vm_offset_t start, end;
        vm_page_t *pp;
        int n;

        KASSERT(uio->uio_iovcnt == 1,
            ("%s: uio_iovcnt %d", __func__, uio->uio_iovcnt));
        KASSERT(uio->uio_td->td_proc == curproc,
            ("%s: uio proc (%p) is not curproc (%p)",
            __func__, uio->uio_td->td_proc, curproc));

        map = &curproc->p_vmspace->vm_map;
        iov = &uio->uio_iov[0];
        start = trunc_page((uintptr_t)iov->iov_base);
        end = round_page((vm_offset_t)iov->iov_base + iov->iov_len);
        n = howmany(end - start, PAGE_SIZE);

        if (end - start > MAX_DDP_BUFFER_SIZE)
                return (E2BIG);

        pp = malloc(n * sizeof(vm_page_t), M_CXGBE, M_NOWAIT);
        if (pp == NULL)
                return (ENOMEM);

        if (vm_fault_quick_hold_pages(map, (vm_offset_t)iov->iov_base,
            iov->iov_len, VM_PROT_WRITE, pp, n) < 0) {
                free(pp, M_CXGBE);
                return (EFAULT);
        }

        *ppages = pp;
        *pnpages = n;

        return (0);
}

static int
bufcmp(struct ddp_buffer *db, vm_page_t *pages, int npages, int offset, int len)
{
        int i;

        if (db == NULL || db->npages != npages || db->offset != offset ||
            db->len != len)
                return (1);

        for (i = 0; i < npages; i++) {
                if (pages[i]->phys_addr != db->pages[i]->phys_addr)
                        return (1);
        }

        return (0);
}

static int
calculate_hcf(int n1, int n2)
{
        int a, b, t;

        if (n1 <= n2) {
                a = n1;
                b = n2;
        } else {
                a = n2;
                b = n1;
        }

        while (a != 0) {
                t = a;
                a = b % a;
                b = t;
        }

        return (b);
}
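/*
 * Added worked example (not in the original file): calculate_hcf() is
 * Euclid's algorithm, seeded with 0 so the first segment length passes
 * through unchanged.  With 4KB VM pages, physically contiguous runs of 16KB
 * and 24KB give an HCF of 8KB; since 8KB < t4_ddp_pgsz[1] (16KB), the loop
 * in alloc_ddp_buffer() below short-circuits to idx 0 (4KB DDP pages).  A
 * single 256KB contiguous run would instead select idx 3 (256KB DDP pages).
 */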
static struct ddp_buffer *
alloc_ddp_buffer(struct tom_data *td, vm_page_t *pages, int npages, int offset,
    int len)
{
        int i, hcf, seglen, idx, ppod, nppods;
        struct ddp_buffer *db;

        /*
         * The DDP page size is unrelated to the VM page size.  We combine
         * contiguous physical pages into larger segments to get the best DDP
         * page size possible.  This is the largest of the four sizes in
         * A_ULP_RX_TDDP_PSZ that evenly divides the HCF of the segment sizes
         * in the page list.
         */
        hcf = 0;
        for (i = 0; i < npages; i++) {
                seglen = PAGE_SIZE;
                while (i < npages - 1 &&
                    pages[i]->phys_addr + PAGE_SIZE == pages[i + 1]->phys_addr) {
                        seglen += PAGE_SIZE;
                        i++;
                }

                hcf = calculate_hcf(hcf, seglen);
                if (hcf < t4_ddp_pgsz[1]) {
                        idx = 0;
                        goto have_pgsz;	/* give up, short circuit */
                }
        }

        if (hcf % t4_ddp_pgsz[0] != 0) {
                /* hmmm.  This could only happen when PAGE_SIZE < 4K */
                KASSERT(PAGE_SIZE < 4096,
                    ("%s: PAGE_SIZE %d, hcf %d", __func__, PAGE_SIZE, hcf));
                CTR3(KTR_CXGBE, "%s: PAGE_SIZE %d, hcf %d",
                    __func__, PAGE_SIZE, hcf);
                return (NULL);
        }

        for (idx = nitems(t4_ddp_pgsz) - 1; idx > 0; idx--) {
                if (hcf % t4_ddp_pgsz[idx] == 0)
                        break;
        }
have_pgsz:
        MPASS(idx <= M_PPOD_PGSZ);

        db = malloc(sizeof(*db), M_CXGBE, M_NOWAIT);
        if (db == NULL) {
                CTR1(KTR_CXGBE, "%s: malloc failed.", __func__);
                return (NULL);
        }

        nppods = pages_to_nppods(npages, t4_ddp_pgsz[idx]);
        if (alloc_ppods(td, nppods, &db->ppod_addr) != 0) {
                free(db, M_CXGBE);
                CTR4(KTR_CXGBE, "%s: no pods, nppods %d, resid %d, pgsz %d",
                    __func__, nppods, len, t4_ddp_pgsz[idx]);
                return (NULL);
        }
        ppod = (db->ppod_addr - td->ppod_start) / PPOD_SIZE;

        db->tag = V_PPOD_PGSZ(idx) | V_PPOD_TAG(ppod);
        db->nppods = nppods;
        db->npages = npages;
        db->pages = pages;
        db->offset = offset;
        db->len = len;

        CTR6(KTR_CXGBE, "New DDP buffer.  "
            "ddp_pgsz %d, ppod 0x%x, npages %d, nppods %d, offset %d, len %d",
            t4_ddp_pgsz[idx], ppod, db->npages, db->nppods, db->offset,
            db->len);

        return (db);
}

#define NUM_ULP_TX_SC_IMM_PPODS (256 / PPOD_SIZE)
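/*
 * Added commentary (not in the original file): the pods are sent as
 * ULP_TX_SC_IMM immediate data capped at 256 bytes per work request.
 * Assuming the 64-byte struct pagepod from t4_msg.h, NUM_ULP_TX_SC_IMM_PPODS
 * works out to 4, so write_page_pods() below issues one work request per
 * chunk of up to 4 pods.
 */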
static int
write_page_pods(struct adapter *sc, struct toepcb *toep, struct ddp_buffer *db)
{
        struct wrqe *wr;
        struct ulp_mem_io *ulpmc;
        struct ulptx_idata *ulpsc;
        struct pagepod *ppod;
        int i, j, k, n, chunk, len, ddp_pgsz, idx;
        u_int ppod_addr;
        uint32_t cmd;

        cmd = htobe32(V_ULPTX_CMD(ULP_TX_MEM_WRITE));
        if (is_t4(sc))
                cmd |= htobe32(F_ULP_MEMIO_ORDER);
        else
                cmd |= htobe32(F_T5_ULP_MEMIO_IMM);
        ddp_pgsz = t4_ddp_pgsz[G_PPOD_PGSZ(db->tag)];
        ppod_addr = db->ppod_addr;
        for (i = 0; i < db->nppods; ppod_addr += chunk) {

                /* How many page pods are we writing in this cycle */
                n = min(db->nppods - i, NUM_ULP_TX_SC_IMM_PPODS);
                chunk = PPOD_SZ(n);
                len = roundup2(sizeof(*ulpmc) + sizeof(*ulpsc) + chunk, 16);

                wr = alloc_wrqe(len, toep->ctrlq);
                if (wr == NULL)
                        return (ENOMEM);	/* ok to just bail out */
                ulpmc = wrtod(wr);

                INIT_ULPTX_WR(ulpmc, len, 0, 0);
                ulpmc->cmd = cmd;
                ulpmc->dlen = htobe32(V_ULP_MEMIO_DATA_LEN(chunk / 32));
                ulpmc->len16 = htobe32(howmany(len - sizeof(ulpmc->wr), 16));
                ulpmc->lock_addr = htobe32(V_ULP_MEMIO_ADDR(ppod_addr >> 5));

                ulpsc = (struct ulptx_idata *)(ulpmc + 1);
                ulpsc->cmd_more = htobe32(V_ULPTX_CMD(ULP_TX_SC_IMM));
                ulpsc->len = htobe32(chunk);

                ppod = (struct pagepod *)(ulpsc + 1);
                for (j = 0; j < n; i++, j++, ppod++) {
                        ppod->vld_tid_pgsz_tag_color = htobe64(F_PPOD_VALID |
                            V_PPOD_TID(toep->tid) | db->tag);
                        ppod->len_offset = htobe64(V_PPOD_LEN(db->len) |
                            V_PPOD_OFST(db->offset));
                        ppod->rsvd = 0;
                        idx = i * PPOD_PAGES * (ddp_pgsz / PAGE_SIZE);
                        for (k = 0; k < nitems(ppod->addr); k++) {
                                if (idx < db->npages) {
                                        ppod->addr[k] =
                                            htobe64(db->pages[idx]->phys_addr);
                                        idx += ddp_pgsz / PAGE_SIZE;
                                } else
                                        ppod->addr[k] = 0;
#if 0
                                CTR5(KTR_CXGBE,
                                    "%s: tid %d ppod[%d]->addr[%d] = %p",
                                    __func__, toep->tid, i, k,
                                    htobe64(ppod->addr[k]));
#endif
                        }

                }

                t4_wrq_tx(sc, wr);
        }

        return (0);
}

/*
 * Reuse, or allocate (and program the page pods for) a new DDP buffer.  The
 * "pages" array is handed over to this function and should not be used in any
 * way by the caller after that.
 */
static int
select_ddp_buffer(struct adapter *sc, struct toepcb *toep, vm_page_t *pages,
    int npages, int db_off, int db_len)
{
        struct ddp_buffer *db;
        struct tom_data *td = sc->tom_softc;
        int i, empty_slot = -1;

        /* Try to reuse */
        for (i = 0; i < nitems(toep->db); i++) {
                if (bufcmp(toep->db[i], pages, npages, db_off, db_len) == 0) {
                        free(pages, M_CXGBE);
                        return (i);	/* pages still held */
                } else if (toep->db[i] == NULL && empty_slot < 0)
                        empty_slot = i;
        }

        /* Allocate new buffer, write its page pods. */
        db = alloc_ddp_buffer(td, pages, npages, db_off, db_len);
        if (db == NULL) {
                vm_page_unhold_pages(pages, npages);
                free(pages, M_CXGBE);
                return (-1);
        }
        if (write_page_pods(sc, toep, db) != 0) {
                vm_page_unhold_pages(pages, npages);
                free_ddp_buffer(td, db);
                return (-1);
        }

        i = empty_slot;
        if (i < 0) {
                i = arc4random() % nitems(toep->db);
                free_ddp_buffer(td, toep->db[i]);
        }
        toep->db[i] = db;

        CTR5(KTR_CXGBE, "%s: tid %d, DDP buffer[%d] = %p (tag 0x%x)",
            __func__, toep->tid, i, db, db->tag);

        return (i);
}

static void
wire_ddp_buffer(struct ddp_buffer *db)
{
        int i;
        vm_page_t p;

        for (i = 0; i < db->npages; i++) {
                p = db->pages[i];
                vm_page_lock(p);
                vm_page_wire(p);
                vm_page_unhold(p);
                vm_page_unlock(p);
        }
}

static void
unwire_ddp_buffer(struct ddp_buffer *db)
{
        int i;
        vm_page_t p;

        for (i = 0; i < db->npages; i++) {
                p = db->pages[i];
                vm_page_lock(p);
                vm_page_unwire(p, PQ_INACTIVE);
                vm_page_unlock(p);
        }
}
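/*
 * Added commentary (not in the original file) on the page life cycle in
 * handle_ddp() below: hold_uio() faults in and transiently holds the user
 * pages (vm_fault_quick_hold_pages), the page pods are written while the
 * pages are merely held, and only once the chip is given the go-ahead are
 * the pages wired and the holds dropped (wire_ddp_buffer).
 * unwire_ddp_buffer() undoes the wiring after DDP completes.
 */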
static int
handle_ddp(struct socket *so, struct uio *uio, int flags, int error)
{
        struct sockbuf *sb = &so->so_rcv;
        struct tcpcb *tp = so_sototcpcb(so);
        struct toepcb *toep = tp->t_toe;
        struct adapter *sc = td_adapter(toep->td);
        vm_page_t *pages;
        int npages, db_idx, rc, buf_flag;
        struct ddp_buffer *db;
        struct wrqe *wr;
        uint64_t ddp_flags;

        SOCKBUF_LOCK_ASSERT(sb);

#if 0
        if (sbused(sb) + sc->tt.ddp_thres > uio->uio_resid) {
                CTR4(KTR_CXGBE, "%s: sb_cc %d, threshold %d, resid %d",
                    __func__, sbused(sb), sc->tt.ddp_thres, uio->uio_resid);
        }
#endif

        /* XXX: too eager to disable DDP, could handle NBIO better than this. */
        if (sbused(sb) >= uio->uio_resid || uio->uio_resid < sc->tt.ddp_thres ||
            uio->uio_resid > MAX_DDP_BUFFER_SIZE || uio->uio_iovcnt > 1 ||
            so->so_state & SS_NBIO || flags & (MSG_DONTWAIT | MSG_NBIO) ||
            error || so->so_error || sb->sb_state & SBS_CANTRCVMORE)
                goto no_ddp;

        /*
         * Fault in and then hold the pages of the uio buffers.  We'll wire
         * them a bit later if everything else works out.
         */
        SOCKBUF_UNLOCK(sb);
        if (hold_uio(uio, &pages, &npages) != 0) {
                SOCKBUF_LOCK(sb);
                goto no_ddp;
        }
        SOCKBUF_LOCK(sb);
        if (__predict_false(so->so_error || sb->sb_state & SBS_CANTRCVMORE)) {
                vm_page_unhold_pages(pages, npages);
                free(pages, M_CXGBE);
                goto no_ddp;
        }

        /*
         * Figure out which one of the two DDP buffers to use this time.
         */
        db_idx = select_ddp_buffer(sc, toep, pages, npages,
            (uintptr_t)uio->uio_iov->iov_base & PAGE_MASK, uio->uio_resid);
        pages = NULL;	/* handed off to select_ddp_buffer */
        if (db_idx < 0)
                goto no_ddp;
        db = toep->db[db_idx];
        buf_flag = db_idx == 0 ? DDP_BUF0_ACTIVE : DDP_BUF1_ACTIVE;

        /*
         * Build the compound work request that tells the chip where to DMA the
         * payload.
         */
        ddp_flags = select_ddp_flags(so, flags, db_idx);
        wr = mk_update_tcb_for_ddp(sc, toep, db_idx, sbused(sb), ddp_flags);
        if (wr == NULL) {
                /*
                 * Just unhold the pages.  The DDP buffer's software state is
                 * left as-is in the toep.  The page pods were written
                 * successfully and we may have an opportunity to use the
                 * buffer in the future.
                 */
                vm_page_unhold_pages(db->pages, db->npages);
                goto no_ddp;
        }

        /* Wire (and then unhold) the pages, and give the chip the go-ahead. */
        wire_ddp_buffer(db);
        t4_wrq_tx(sc, wr);
        sb->sb_flags &= ~SB_DDP_INDICATE;
        toep->ddp_flags |= buf_flag;

        /*
         * Wait for the DDP operation to complete and then unwire the pages.
         * The return code from sbwait is the final return code of this
         * function, but we must wait for DDP completion no matter what.
         */
        rc = sbwait(sb);
        while (toep->ddp_flags & buf_flag) {
                /* XXXGL: shouldn't there be an sbwait() call here? */
                sb->sb_flags |= SB_WAIT;
                msleep(&sb->sb_acc, &sb->sb_mtx, PSOCK, "sbwait", 0);
        }
        unwire_ddp_buffer(db);
        return (rc);
no_ddp:
        disable_ddp(sc, toep);
        discourage_ddp(toep);
        sb->sb_flags &= ~SB_DDP_INDICATE;
        return (0);
}

void
t4_init_ddp(struct adapter *sc, struct tom_data *td)
{

        td->ppod_start = sc->vres.ddp.start;
        td->ppod_arena = vmem_create("DDP page pods", sc->vres.ddp.start,
            sc->vres.ddp.size, 1, 32, M_FIRSTFIT | M_NOWAIT);

        t4_register_cpl_handler(sc, CPL_RX_DATA_DDP, do_rx_data_ddp);
        t4_register_cpl_handler(sc, CPL_RX_DDP_COMPLETE, do_rx_ddp_complete);
}

void
t4_uninit_ddp(struct adapter *sc __unused, struct tom_data *td)
{

        if (td->ppod_arena != NULL) {
                vmem_destroy(td->ppod_arena);
                td->ppod_arena = NULL;
        }
}

#define VNET_SO_ASSERT(so)						\
        VNET_ASSERT(curvnet != NULL,					\
            ("%s:%d curvnet is NULL, so=%p", __func__, __LINE__, (so)));
#define SBLOCKWAIT(f)	(((f) & MSG_DONTWAIT) ? 0 : SBL_WAIT)
static int
soreceive_rcvoob(struct socket *so, struct uio *uio, int flags)
{

        CXGBE_UNIMPLEMENTED(__func__);
}

static char ddp_magic_str[] = "nothing to see here";

struct mbuf *
get_ddp_mbuf(int len)
{
        struct mbuf *m;

        m = m_get(M_NOWAIT, MT_DATA);
        if (m == NULL)
                CXGBE_UNIMPLEMENTED("mbuf alloc failure");
        m->m_len = len;
        m->m_data = &ddp_magic_str[0];

        return (m);
}

static inline int
is_ddp_mbuf(struct mbuf *m)
{

        return (m->m_data == &ddp_magic_str[0]);
}
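/*
 * Added commentary (not in the original file): a DDP mbuf carries no payload
 * -- the chip has already placed the data in the user's buffer.  It only
 * accounts for len bytes in the socket buffer, with m_data pointing at
 * ddp_magic_str as a marker.  m_mbuftouio_ddp() below therefore flips the
 * uio to UIO_NOCOPY for such mbufs, so uiomove() advances the uio offsets
 * without copying anything.
 */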
/*
 * Copy an mbuf chain into a uio limited by len if set.
 */
static int
m_mbuftouio_ddp(struct uio *uio, struct mbuf *m, int len)
{
        int error, length, total;
        int progress = 0;

        if (len > 0)
                total = min(uio->uio_resid, len);
        else
                total = uio->uio_resid;

        /* Fill the uio with data from the mbufs. */
        for (; m != NULL; m = m->m_next) {
                length = min(m->m_len, total - progress);

                if (is_ddp_mbuf(m)) {
                        enum uio_seg segflag = uio->uio_segflg;

                        uio->uio_segflg = UIO_NOCOPY;
                        error = uiomove(mtod(m, void *), length, uio);
                        uio->uio_segflg = segflag;
                } else
                        error = uiomove(mtod(m, void *), length, uio);
                if (error)
                        return (error);

                progress += length;
        }

        return (0);
}

/*
 * Based on soreceive_stream() in uipc_socket.c
 */
int
t4_soreceive_ddp(struct socket *so, struct sockaddr **psa, struct uio *uio,
    struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
{
        int len = 0, error = 0, flags, oresid, ddp_handled = 0;
        struct sockbuf *sb;
        struct mbuf *m, *n = NULL;

        /* We only do stream sockets. */
        if (so->so_type != SOCK_STREAM)
                return (EINVAL);
        if (psa != NULL)
                *psa = NULL;
        if (controlp != NULL)
                return (EINVAL);
        if (flagsp != NULL)
                flags = *flagsp &~ MSG_EOR;
        else
                flags = 0;
        if (flags & MSG_OOB)
                return (soreceive_rcvoob(so, uio, flags));
        if (mp0 != NULL)
                *mp0 = NULL;

        sb = &so->so_rcv;

        /* Prevent other readers from entering the socket. */
        error = sblock(sb, SBLOCKWAIT(flags));
        SOCKBUF_LOCK(sb);
        if (error)
                goto out;

        /* Easy one, no space to copyout anything. */
        if (uio->uio_resid == 0) {
                error = EINVAL;
                goto out;
        }
        oresid = uio->uio_resid;

        /* We will never ever get anything unless we are or were connected. */
        if (!(so->so_state & (SS_ISCONNECTED|SS_ISDISCONNECTED))) {
                error = ENOTCONN;
                goto out;
        }

restart:
        SOCKBUF_LOCK_ASSERT(&so->so_rcv);

        if (sb->sb_flags & SB_DDP_INDICATE && !ddp_handled) {

                /* uio should be just as it was at entry */
                KASSERT(oresid == uio->uio_resid,
                    ("%s: oresid = %d, uio_resid = %zd, sbavail = %d",
                    __func__, oresid, uio->uio_resid, sbavail(sb)));

                error = handle_ddp(so, uio, flags, 0);
                ddp_handled = 1;
                if (error)
                        goto out;
        }

        /* Abort if socket has reported problems. */
        if (so->so_error) {
                if (sbavail(sb))
                        goto deliver;
                if (oresid > uio->uio_resid)
                        goto out;
                error = so->so_error;
                if (!(flags & MSG_PEEK))
                        so->so_error = 0;
                goto out;
        }

        /* Door is closed.  Deliver what is left, if any. */
        if (sb->sb_state & SBS_CANTRCVMORE) {
                if (sbavail(sb))
                        goto deliver;
                else
                        goto out;
        }

        /* Socket buffer is empty and we shall not block. */
        if (sbavail(sb) == 0 &&
            ((so->so_state & SS_NBIO) || (flags & (MSG_DONTWAIT|MSG_NBIO)))) {
                error = EAGAIN;
                goto out;
        }

        /* Socket buffer got some data that we shall deliver now. */
        if (sbavail(sb) > 0 && !(flags & MSG_WAITALL) &&
            ((so->so_state & SS_NBIO) ||
             (flags & (MSG_DONTWAIT|MSG_NBIO)) ||
             sbavail(sb) >= sb->sb_lowat ||
             sbavail(sb) >= uio->uio_resid ||
             sbavail(sb) >= sb->sb_hiwat)) {
                goto deliver;
        }

        /* On MSG_WAITALL we must wait until all data or error arrives. */
        if ((flags & MSG_WAITALL) &&
            (sbavail(sb) >= uio->uio_resid || sbavail(sb) >= sb->sb_lowat))
                goto deliver;

        /*
         * Wait and block until (more) data comes in.
         * NB: Drops the sockbuf lock during wait.
         */
        error = sbwait(sb);
        if (error) {
                if (sb->sb_flags & SB_DDP_INDICATE && !ddp_handled) {
                        (void) handle_ddp(so, uio, flags, 1);
                        ddp_handled = 1;
                }
                goto out;
        }
        goto restart;

deliver:
        SOCKBUF_LOCK_ASSERT(&so->so_rcv);
        KASSERT(sbavail(sb) > 0, ("%s: sockbuf empty", __func__));
        KASSERT(sb->sb_mb != NULL, ("%s: sb_mb == NULL", __func__));

        if (sb->sb_flags & SB_DDP_INDICATE && !ddp_handled)
                goto restart;

        /* Statistics. */
        if (uio->uio_td)
                uio->uio_td->td_ru.ru_msgrcv++;

        /* Fill uio until full or current end of socket buffer is reached. */
        len = min(uio->uio_resid, sbavail(sb));
        if (mp0 != NULL) {
                /* Dequeue as many mbufs as possible. */
                if (!(flags & MSG_PEEK) && len >= sb->sb_mb->m_len) {
                        for (*mp0 = m = sb->sb_mb;
                             m != NULL && m->m_len <= len;
                             m = m->m_next) {
                                len -= m->m_len;
                                uio->uio_resid -= m->m_len;
                                sbfree(sb, m);
                                n = m;
                        }
                        sb->sb_mb = m;
                        if (sb->sb_mb == NULL)
                                SB_EMPTY_FIXUP(sb);
                        n->m_next = NULL;
                }
                /* Copy the remainder. */
                if (len > 0) {
                        KASSERT(sb->sb_mb != NULL,
                            ("%s: len > 0 && sb->sb_mb empty", __func__));

                        m = m_copym(sb->sb_mb, 0, len, M_NOWAIT);
                        if (m == NULL)
                                len = 0;	/* Don't flush data from sockbuf. */
                        else
                                uio->uio_resid -= m->m_len;
                        if (*mp0 != NULL)
                                n->m_next = m;
                        else
                                *mp0 = m;
                        if (*mp0 == NULL) {
                                error = ENOBUFS;
                                goto out;
                        }
                }
        } else {
                /* NB: Must unlock socket buffer as uiomove may sleep. */
                SOCKBUF_UNLOCK(sb);
                error = m_mbuftouio_ddp(uio, sb->sb_mb, len);
                SOCKBUF_LOCK(sb);
                if (error)
                        goto out;
        }
        SBLASTRECORDCHK(sb);
        SBLASTMBUFCHK(sb);

        /*
         * Remove the delivered data from the socket buffer unless we
         * were only peeking.
         */
        if (!(flags & MSG_PEEK)) {
                if (len > 0)
                        sbdrop_locked(sb, len);

                /* Notify protocol that we drained some data. */
                if ((so->so_proto->pr_flags & PR_WANTRCVD) &&
                    (((flags & MSG_WAITALL) && uio->uio_resid > 0) ||
                     !(flags & MSG_SOCALLBCK))) {
                        SOCKBUF_UNLOCK(sb);
                        VNET_SO_ASSERT(so);
                        (*so->so_proto->pr_usrreqs->pru_rcvd)(so, flags);
                        SOCKBUF_LOCK(sb);
                }
        }

        /*
         * For MSG_WAITALL we may have to loop again and wait for
         * more data to come in.
         */
        if ((flags & MSG_WAITALL) && uio->uio_resid > 0)
                goto restart;
out:
        SOCKBUF_LOCK_ASSERT(sb);
        SBLASTRECORDCHK(sb);
        SBLASTMBUFCHK(sb);
        SOCKBUF_UNLOCK(sb);
        sbunlock(sb);
        return (error);
}

#endif