/*-
 * Copyright (c) 2012 The FreeBSD Foundation
 * Copyright (c) 2015 Chelsio Communications, Inc.
 * All rights reserved.
 *
 * This software was developed by Edward Tomasz Napierala under sponsorship
 * from the FreeBSD Foundation.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 */

/*
 * cxgbei implementation of iSCSI Common Layer kobj(9) interface.
 */

#include <sys/cdefs.h>
#include "opt_inet.h"
#include "opt_inet6.h"

#ifdef TCP_OFFLOAD
#include <sys/param.h>
#include <sys/bio.h>
#include <sys/capsicum.h>
#include <sys/condvar.h>
#include <sys/conf.h>
#include <sys/file.h>
#include <sys/kernel.h>
#include <sys/kthread.h>
#include <sys/ktr.h>
#include <sys/lock.h>
#include <sys/mbuf.h>
#include <sys/mutex.h>
#include <sys/module.h>
#include <sys/protosw.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/sysctl.h>
#include <sys/systm.h>
#include <sys/sx.h>
#include <sys/uio.h>
#include <machine/bus.h>
#include <vm/vm.h>
#include <vm/vm_page.h>
#include <vm/pmap.h>
#include <netinet/in.h>
#include <netinet/in_pcb.h>
#include <netinet/tcp.h>
#include <netinet/tcp_var.h>
#include <netinet/toecore.h>

#include <dev/iscsi/icl.h>
#include <dev/iscsi/iscsi_proto.h>
#include <icl_conn_if.h>

#include <cam/scsi/scsi_all.h>
#include <cam/scsi/scsi_da.h>
#include <cam/ctl/ctl_io.h>
#include <cam/ctl/ctl.h>
#include <cam/ctl/ctl_backend.h>
#include <cam/ctl/ctl_error.h>
#include <cam/ctl/ctl_frontend.h>
#include <cam/ctl/ctl_debug.h>
#include <cam/ctl/ctl_ha.h>
#include <cam/ctl/ctl_ioctl.h>

#include <cam/cam.h>
#include <cam/cam_ccb.h>
#include <cam/cam_xpt.h>
#include <cam/cam_debug.h>
#include <cam/cam_sim.h>
#include <cam/cam_xpt_sim.h>
#include <cam/cam_xpt_periph.h>
#include <cam/cam_periph.h>
#include <cam/cam_compat.h>
#include <cam/scsi/scsi_message.h>

#include "common/common.h"
#include "common/t4_regs.h"
#include "common/t4_tcb.h"
#include "tom/t4_tom.h"
#include "cxgbei.h"

/*
 * Use the page pod tag for the TT hash.
 */
#define	TT_HASH(icc, tt)	(G_PPOD_TAG(tt) & (icc)->cmp_hash_mask)

struct cxgbei_ddp_state {
	struct ppod_reservation prsv;
	struct cxgbei_cmp cmp;
};

static MALLOC_DEFINE(M_CXGBEI, "cxgbei", "cxgbei(4)");

SYSCTL_NODE(_kern_icl, OID_AUTO, cxgbei, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
    "Chelsio iSCSI offload");
static int first_burst_length = 8192;
SYSCTL_INT(_kern_icl_cxgbei, OID_AUTO, first_burst_length, CTLFLAG_RWTUN,
    &first_burst_length, 0, "First burst length");
static int max_burst_length = 2 * 1024 * 1024;
SYSCTL_INT(_kern_icl_cxgbei, OID_AUTO, max_burst_length, CTLFLAG_RWTUN,
    &max_burst_length, 0, "Maximum burst length");
static int sendspace = 1048576;
SYSCTL_INT(_kern_icl_cxgbei, OID_AUTO, sendspace, CTLFLAG_RWTUN,
    &sendspace, 0, "Default send socket buffer size");
static int recvspace = 1048576;
SYSCTL_INT(_kern_icl_cxgbei, OID_AUTO, recvspace, CTLFLAG_RWTUN,
    &recvspace, 0, "Default receive socket buffer size");

static volatile u_int icl_cxgbei_ncons;

static icl_conn_new_pdu_t icl_cxgbei_conn_new_pdu;
static icl_conn_pdu_data_segment_length_t
    icl_cxgbei_conn_pdu_data_segment_length;
static icl_conn_pdu_append_bio_t icl_cxgbei_conn_pdu_append_bio;
static icl_conn_pdu_append_data_t icl_cxgbei_conn_pdu_append_data;
static icl_conn_pdu_get_bio_t icl_cxgbei_conn_pdu_get_bio;
static icl_conn_pdu_get_data_t icl_cxgbei_conn_pdu_get_data;
static icl_conn_pdu_queue_t icl_cxgbei_conn_pdu_queue;
static icl_conn_pdu_queue_cb_t icl_cxgbei_conn_pdu_queue_cb;
static icl_conn_handoff_t icl_cxgbei_conn_handoff;
static icl_conn_free_t icl_cxgbei_conn_free;
static icl_conn_close_t icl_cxgbei_conn_close;
static icl_conn_task_setup_t icl_cxgbei_conn_task_setup;
static icl_conn_task_done_t icl_cxgbei_conn_task_done;
static icl_conn_transfer_setup_t icl_cxgbei_conn_transfer_setup;
static icl_conn_transfer_done_t icl_cxgbei_conn_transfer_done;

static kobj_method_t icl_cxgbei_methods[] = {
	KOBJMETHOD(icl_conn_new_pdu, icl_cxgbei_conn_new_pdu),
	KOBJMETHOD(icl_conn_pdu_free, icl_cxgbei_conn_pdu_free),
	KOBJMETHOD(icl_conn_pdu_data_segment_length,
	    icl_cxgbei_conn_pdu_data_segment_length),
	KOBJMETHOD(icl_conn_pdu_append_bio, icl_cxgbei_conn_pdu_append_bio),
	KOBJMETHOD(icl_conn_pdu_append_data, icl_cxgbei_conn_pdu_append_data),
	KOBJMETHOD(icl_conn_pdu_get_bio, icl_cxgbei_conn_pdu_get_bio),
	KOBJMETHOD(icl_conn_pdu_get_data, icl_cxgbei_conn_pdu_get_data),
	KOBJMETHOD(icl_conn_pdu_queue, icl_cxgbei_conn_pdu_queue),
	KOBJMETHOD(icl_conn_pdu_queue_cb, icl_cxgbei_conn_pdu_queue_cb),
	KOBJMETHOD(icl_conn_handoff, icl_cxgbei_conn_handoff),
	KOBJMETHOD(icl_conn_free, icl_cxgbei_conn_free),
	KOBJMETHOD(icl_conn_close, icl_cxgbei_conn_close),
	KOBJMETHOD(icl_conn_task_setup, icl_cxgbei_conn_task_setup),
	KOBJMETHOD(icl_conn_task_done, icl_cxgbei_conn_task_done),
	KOBJMETHOD(icl_conn_transfer_setup, icl_cxgbei_conn_transfer_setup),
	KOBJMETHOD(icl_conn_transfer_done, icl_cxgbei_conn_transfer_done),
	{ 0, 0 }
};

DEFINE_CLASS(icl_cxgbei, icl_cxgbei_methods, sizeof(struct icl_cxgbei_conn));

void
icl_cxgbei_conn_pdu_free(struct icl_conn *ic, struct icl_pdu *ip)
{
	struct icl_cxgbei_pdu *icp = ip_to_icp(ip);

	KASSERT(icp->ref_cnt != 0, ("freeing deleted PDU"));
	MPASS(icp->icp_signature == CXGBEI_PDU_SIGNATURE);
	MPASS(ic == ip->ip_conn);

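	/* Release any mbufs still attached to the PDU. */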
	m_freem(ip->ip_ahs_mbuf);
	m_freem(ip->ip_data_mbuf);
	m_freem(ip->ip_bhs_mbuf);

	KASSERT(ic != NULL || icp->ref_cnt == 1,
	    ("orphaned PDU has outstanding references"));

	if (atomic_fetchadd_int(&icp->ref_cnt, -1) != 1)
		return;

	free(icp, M_CXGBEI);
#ifdef DIAGNOSTIC
	if (__predict_true(ic != NULL))
		refcount_release(&ic->ic_outstanding_pdus);
#endif
}

static void
icl_cxgbei_pdu_call_cb(struct icl_pdu *ip)
{
	struct icl_cxgbei_pdu *icp = ip_to_icp(ip);

	MPASS(icp->icp_signature == CXGBEI_PDU_SIGNATURE);

	if (icp->cb != NULL)
		icp->cb(ip, icp->error);
#ifdef DIAGNOSTIC
	if (__predict_true(ip->ip_conn != NULL))
		refcount_release(&ip->ip_conn->ic_outstanding_pdus);
#endif
	free(icp, M_CXGBEI);
}

static void
icl_cxgbei_pdu_done(struct icl_pdu *ip, int error)
{
	struct icl_cxgbei_pdu *icp = ip_to_icp(ip);

	if (error != 0)
		icp->error = error;

	m_freem(ip->ip_ahs_mbuf);
	ip->ip_ahs_mbuf = NULL;
	m_freem(ip->ip_data_mbuf);
	ip->ip_data_mbuf = NULL;
	m_freem(ip->ip_bhs_mbuf);
	ip->ip_bhs_mbuf = NULL;

	/*
	 * All other references to this PDU should have been dropped
	 * by the m_freem() of ip_data_mbuf.
	 */
	if (atomic_fetchadd_int(&icp->ref_cnt, -1) == 1)
		icl_cxgbei_pdu_call_cb(ip);
	else
		__assert_unreachable();
}

static void
icl_cxgbei_mbuf_done(struct mbuf *mb)
{

	struct icl_cxgbei_pdu *icp = (struct icl_cxgbei_pdu *)mb->m_ext.ext_arg1;

	/*
	 * NB: mb_free_mext() might leave ref_cnt as 1 without
	 * decrementing it if it hits the fast path in the ref_cnt
	 * check.
	 */
	icl_cxgbei_pdu_call_cb(&icp->ip);
}

struct icl_pdu *
icl_cxgbei_new_pdu(int flags)
{
	struct icl_cxgbei_pdu *icp;
	struct icl_pdu *ip;
	struct mbuf *m;

	icp = malloc(sizeof(*icp), M_CXGBEI, flags | M_ZERO);
	if (__predict_false(icp == NULL))
		return (NULL);

	icp->icp_signature = CXGBEI_PDU_SIGNATURE;
	icp->ref_cnt = 1;
	ip = &icp->ip;

	m = m_gethdr(flags, MT_DATA);
	if (__predict_false(m == NULL)) {
		free(icp, M_CXGBEI);
		return (NULL);
	}

	ip->ip_bhs_mbuf = m;
	ip->ip_bhs = mtod(m, struct iscsi_bhs *);
	memset(ip->ip_bhs, 0, sizeof(*ip->ip_bhs));
	m->m_len = sizeof(struct iscsi_bhs);
	m->m_pkthdr.len = m->m_len;

	return (ip);
}

void
icl_cxgbei_new_pdu_set_conn(struct icl_pdu *ip, struct icl_conn *ic)
{

	ip->ip_conn = ic;
#ifdef DIAGNOSTIC
	refcount_acquire(&ic->ic_outstanding_pdus);
#endif
}

/*
 * Allocate icl_pdu with empty BHS to fill up by the caller.
 */
static struct icl_pdu *
icl_cxgbei_conn_new_pdu(struct icl_conn *ic, int flags)
{
	struct icl_pdu *ip;

	ip = icl_cxgbei_new_pdu(flags);
	if (__predict_false(ip == NULL))
		return (NULL);
	icl_cxgbei_new_pdu_set_conn(ip, ic);

	return (ip);
}

static size_t
icl_pdu_data_segment_length(const struct icl_pdu *request)
{
	uint32_t len = 0;

	len += request->ip_bhs->bhs_data_segment_len[0];
	len <<= 8;
	len += request->ip_bhs->bhs_data_segment_len[1];
	len <<= 8;
	len += request->ip_bhs->bhs_data_segment_len[2];

	return (len);
}

size_t
icl_cxgbei_conn_pdu_data_segment_length(struct icl_conn *ic,
    const struct icl_pdu *request)
{

	return (icl_pdu_data_segment_length(request));
}

static struct mbuf *
finalize_pdu(struct icl_cxgbei_conn *icc, struct icl_cxgbei_pdu *icp)
{
	struct icl_pdu *ip = &icp->ip;
	uint8_t ulp_submode, padding;
	struct mbuf *m, *last;
	struct iscsi_bhs *bhs;
	int data_len;

	/*
	 * Fix up the data segment mbuf first.
	 */
	m = ip->ip_data_mbuf;
	ulp_submode = icc->ulp_submode;
	if (m != NULL) {
		last = m_last(m);

		/*
		 * Round up the data segment to a 4B boundary. Pad with 0 if
		 * necessary. There will definitely be room in the mbuf.
		 */
		padding = roundup2(ip->ip_data_len, 4) - ip->ip_data_len;
		if (padding != 0) {
			MPASS(padding <= M_TRAILINGSPACE(last));
			bzero(mtod(last, uint8_t *) + last->m_len, padding);
			last->m_len += padding;
		}
	} else {
		MPASS(ip->ip_data_len == 0);
		ulp_submode &= ~ULP_CRC_DATA;
		padding = 0;
	}

	/*
	 * Now the header mbuf that has the BHS.
	 */
	m = ip->ip_bhs_mbuf;
	MPASS(m->m_pkthdr.len == sizeof(struct iscsi_bhs));
	MPASS(m->m_len == sizeof(struct iscsi_bhs));

	bhs = ip->ip_bhs;
	data_len = ip->ip_data_len;
	if (data_len > icc->ic.ic_max_send_data_segment_length) {
		struct iscsi_bhs_data_in *bhsdi;
		int flags;

		KASSERT(padding == 0, ("%s: ISO with padding %d for icp %p",
		    __func__, padding, icp));
		switch (bhs->bhs_opcode) {
		case ISCSI_BHS_OPCODE_SCSI_DATA_OUT:
			flags = 1;
			break;
		case ISCSI_BHS_OPCODE_SCSI_DATA_IN:
			flags = 2;
			break;
		default:
			panic("invalid opcode %#x for ISO", bhs->bhs_opcode);
		}
		data_len = icc->ic.ic_max_send_data_segment_length;
		bhsdi = (struct iscsi_bhs_data_in *)bhs;
		if (bhsdi->bhsdi_flags & BHSDI_FLAGS_F) {
			/*
			 * Firmware will set F on the final PDU in the
			 * burst.
			 */
			flags |= CXGBE_ISO_F;
			bhsdi->bhsdi_flags &= ~BHSDI_FLAGS_F;
		}
		set_mbuf_iscsi_iso(m, true);
		set_mbuf_iscsi_iso_flags(m, flags);
		set_mbuf_iscsi_iso_mss(m, data_len);
	}

	bhs->bhs_data_segment_len[2] = data_len;
	bhs->bhs_data_segment_len[1] = data_len >> 8;
	bhs->bhs_data_segment_len[0] = data_len >> 16;

	/*
	 * Extract mbuf chain from PDU.
	 */
	m->m_pkthdr.len += ip->ip_data_len + padding;
	m->m_next = ip->ip_data_mbuf;
	set_mbuf_ulp_submode(m, ulp_submode);
	ip->ip_bhs_mbuf = NULL;
	ip->ip_data_mbuf = NULL;
	ip->ip_bhs = NULL;

	/*
	 * Drop PDU reference on icp. Additional references might
	 * still be held by zero-copy PDU buffers (ICL_NOCOPY).
	 */
	if (atomic_fetchadd_int(&icp->ref_cnt, -1) == 1)
		icl_cxgbei_pdu_call_cb(ip);

	return (m);
}

static void
icl_cxgbei_tx_main(void *arg)
{
	struct epoch_tracker et;
	struct icl_cxgbei_conn *icc = arg;
	struct icl_conn *ic = &icc->ic;
	struct toepcb *toep = icc->toep;
	struct socket *so = ic->ic_socket;
	struct inpcb *inp = sotoinpcb(so);
	struct icl_pdu *ip;
	struct mbuf *m;
	struct mbufq mq;
	STAILQ_HEAD(, icl_pdu) tx_pdus = STAILQ_HEAD_INITIALIZER(tx_pdus);

	mbufq_init(&mq, INT_MAX);

	ICL_CONN_LOCK(ic);
	while (__predict_true(!ic->ic_disconnecting)) {
		while (STAILQ_EMPTY(&icc->sent_pdus)) {
			icc->tx_active = false;
			mtx_sleep(&icc->tx_active, ic->ic_lock, 0, "-", 0);
			if (__predict_false(ic->ic_disconnecting))
				goto out;
			MPASS(icc->tx_active);
		}

		STAILQ_SWAP(&icc->sent_pdus, &tx_pdus, icl_pdu);
		ICL_CONN_UNLOCK(ic);

		while ((ip = STAILQ_FIRST(&tx_pdus)) != NULL) {
			STAILQ_REMOVE_HEAD(&tx_pdus, ip_next);

			m = finalize_pdu(icc, ip_to_icp(ip));
			M_ASSERTPKTHDR(m);
			MPASS((m->m_pkthdr.len & 3) == 0);

			mbufq_enqueue(&mq, m);
		}

		ICL_CONN_LOCK(ic);
		if (__predict_false(ic->ic_disconnecting) ||
		    __predict_false(ic->ic_socket == NULL)) {
			mbufq_drain(&mq);
			break;
		}

		CURVNET_SET(toep->vnet);
		NET_EPOCH_ENTER(et);
		INP_WLOCK(inp);

		ICL_CONN_UNLOCK(ic);
		if (__predict_false(inp->inp_flags & INP_DROPPED) ||
		    __predict_false((toep->flags & TPF_ATTACHED) == 0)) {
			mbufq_drain(&mq);
		} else {
			mbufq_concat(&toep->ulp_pduq, &mq);
			t4_push_pdus(icc->sc, toep, 0);
		}
		INP_WUNLOCK(inp);
		NET_EPOCH_EXIT(et);
		CURVNET_RESTORE();

		ICL_CONN_LOCK(ic);
	}
out:
	ICL_CONN_UNLOCK(ic);

	kthread_exit();
}

static void
icl_cxgbei_rx_main(void *arg)
{
	struct icl_cxgbei_conn *icc = arg;
	struct icl_conn *ic = &icc->ic;
	struct icl_pdu *ip;
	struct sockbuf *sb;
	STAILQ_HEAD(, icl_pdu) rx_pdus = STAILQ_HEAD_INITIALIZER(rx_pdus);
	bool cantrcvmore;

	sb = &ic->ic_socket->so_rcv;
	SOCKBUF_LOCK(sb);
	while (__predict_true(!ic->ic_disconnecting)) {
		while (STAILQ_EMPTY(&icc->rcvd_pdus)) {
			icc->rx_active = false;
			mtx_sleep(&icc->rx_active, SOCKBUF_MTX(sb), 0, "-", 0);
			if (__predict_false(ic->ic_disconnecting))
				goto out;
			MPASS(icc->rx_active);
		}

		if (__predict_false(sbused(sb) != 0)) {
			/*
			 * PDUs were received before the tid
			 * transitioned to ULP mode. Convert
			 * them to icl_cxgbei_pdus and insert
			 * them into the head of rcvd_pdus.
			 */
			parse_pdus(icc, sb);
		}
		cantrcvmore = (sb->sb_state & SBS_CANTRCVMORE) != 0;
		MPASS(STAILQ_EMPTY(&rx_pdus));
		STAILQ_SWAP(&icc->rcvd_pdus, &rx_pdus, icl_pdu);
		SOCKBUF_UNLOCK(sb);

		/* Hand over PDUs to ICL. */
		while ((ip = STAILQ_FIRST(&rx_pdus)) != NULL) {
			STAILQ_REMOVE_HEAD(&rx_pdus, ip_next);
			if (cantrcvmore)
				icl_cxgbei_pdu_done(ip, ENOTCONN);
			else
				ic->ic_receive(ip);
		}

		SOCKBUF_LOCK(sb);
	}
out:
	/*
	 * Since ic_disconnecting is set before the SOCKBUF_MTX is
	 * locked in icl_cxgbei_conn_close, the loop above can exit
	 * before icl_cxgbei_conn_close can lock SOCKBUF_MTX and block
	 * waiting for the thread exit.
	 */
	while (!icc->rx_exiting)
		mtx_sleep(&icc->rx_active, SOCKBUF_MTX(sb), 0, "-", 0);
	SOCKBUF_UNLOCK(sb);

	kthread_exit();
}

static void
cxgbei_free_mext_pg(struct mbuf *m)
{
	struct icl_cxgbei_pdu *icp;

	M_ASSERTEXTPG(m);

	/*
	 * Nothing to do for the pages; they are owned by the PDU /
	 * I/O request.
	 */

	/* Drop reference on the PDU. */
	icp = m->m_ext.ext_arg1;
	if (atomic_fetchadd_int(&icp->ref_cnt, -1) == 1)
		icl_cxgbei_pdu_call_cb(&icp->ip);
}

static struct mbuf *
cxgbei_getm(size_t len, int flags)
{
	struct mbuf *m, *m0, *m_tail;

	m_tail = m0 = NULL;

	/* Allocate as jumbo mbufs of size MJUM16BYTES. */
	while (len >= MJUM16BYTES) {
		m = m_getjcl(M_NOWAIT, MT_DATA, 0, MJUM16BYTES);
		if (__predict_false(m == NULL)) {
			if ((flags & M_WAITOK) != 0) {
				/* Fall back to non-jumbo mbufs. */
				break;
			}
			return (NULL);
		}
		if (m0 == NULL) {
			m0 = m_tail = m;
		} else {
			m_tail->m_next = m;
			m_tail = m;
		}
		len -= MJUM16BYTES;
	}

	/* Allocate mbuf chain for the remaining data. */
	if (len != 0) {
		m = m_getm2(NULL, len, flags, MT_DATA, 0);
		if (__predict_false(m == NULL)) {
			m_freem(m0);
			return (NULL);
		}
		if (m0 == NULL)
			m0 = m;
		else
			m_tail->m_next = m;
	}

	return (m0);
}

int
icl_cxgbei_conn_pdu_append_bio(struct icl_conn *ic, struct icl_pdu *ip,
    struct bio *bp, size_t offset, size_t len, int flags)
{
	struct icl_cxgbei_pdu *icp = ip_to_icp(ip);
	struct mbuf *m, *m_tail;
	vm_offset_t vaddr;
	size_t page_offset, todo, mtodo;
	bool mapped;
	int i;

	MPASS(icp->icp_signature == CXGBEI_PDU_SIGNATURE);
	MPASS(ic == ip->ip_conn);
	KASSERT(len > 0, ("%s: len is %jd", __func__, (intmax_t)len));

	m_tail = ip->ip_data_mbuf;
	if (m_tail != NULL)
		for (; m_tail->m_next != NULL; m_tail = m_tail->m_next)
			;

	MPASS(bp->bio_flags & BIO_UNMAPPED);
	if (offset < PAGE_SIZE - bp->bio_ma_offset) {
		page_offset = bp->bio_ma_offset + offset;
		i = 0;
	} else {
		offset -= PAGE_SIZE - bp->bio_ma_offset;
		for (i = 1; offset >= PAGE_SIZE; i++)
			offset -= PAGE_SIZE;
		page_offset = offset;
	}

	if (flags & ICL_NOCOPY) {
		m = NULL;
		while (len > 0) {
			if (m == NULL) {
				m = mb_alloc_ext_pgs(flags & ~ICL_NOCOPY,
				    cxgbei_free_mext_pg, 0);
				if (__predict_false(m == NULL))
					return (ENOMEM);
				atomic_add_int(&icp->ref_cnt, 1);
				m->m_ext.ext_arg1 = icp;
				m->m_epg_1st_off = page_offset;
			}

			todo = MIN(len, PAGE_SIZE - page_offset);

			m->m_epg_pa[m->m_epg_npgs] =
			    VM_PAGE_TO_PHYS(bp->bio_ma[i]);
			m->m_epg_npgs++;
			m->m_epg_last_len = todo;
			m->m_len += todo;
			m->m_ext.ext_size += PAGE_SIZE;
			MBUF_EXT_PGS_ASSERT_SANITY(m);

			if (m->m_epg_npgs == MBUF_PEXT_MAX_PGS) {
				if (m_tail != NULL)
					m_tail->m_next = m;
				else
					ip->ip_data_mbuf = m;
				m_tail = m;
				ip->ip_data_len += m->m_len;
				m = NULL;
			}

			page_offset = 0;
			len -= todo;
			i++;
		}

		if (m != NULL) {
			if (m_tail != NULL)
				m_tail->m_next = m;
			else
				ip->ip_data_mbuf = m;
			ip->ip_data_len += m->m_len;
		}
		return (0);
	}

	m = cxgbei_getm(len, flags);
	if (__predict_false(m == NULL))
		return (ENOMEM);

	if (ip->ip_data_mbuf == NULL) {
		ip->ip_data_mbuf = m;
		ip->ip_data_len = len;
	} else {
		m_tail->m_next = m;
		ip->ip_data_len += len;
	}

	while (len > 0) {
		todo = MIN(len, PAGE_SIZE - page_offset);

		mapped = pmap_map_io_transient(bp->bio_ma + i, &vaddr, 1,
		    false);

		/* Consume this page's worth before the copy drains todo. */
		len -= todo;
		do {
			mtodo = min(todo, M_SIZE(m) - m->m_len);
			memcpy(mtod(m, char *) + m->m_len, (char *)vaddr +
			    page_offset, mtodo);
			m->m_len += mtodo;
			if (m->m_len == M_SIZE(m))
				m = m->m_next;
			page_offset += mtodo;
			todo -= mtodo;
		} while (todo > 0);

		if (__predict_false(mapped))
			pmap_unmap_io_transient(bp->bio_ma + i, &vaddr, 1,
			    false);

		page_offset = 0;
		i++;
	}

	MPASS(ip->ip_data_len <= max(ic->ic_max_send_data_segment_length,
	    ic->ic_hw_isomax));

	return (0);
}

int
icl_cxgbei_conn_pdu_append_data(struct icl_conn *ic, struct icl_pdu *ip,
    const void *addr, size_t len, int flags)
{
	struct icl_cxgbei_pdu *icp = ip_to_icp(ip);
	struct mbuf *m, *m_tail;
	const char *src;

	MPASS(icp->icp_signature == CXGBEI_PDU_SIGNATURE);
	MPASS(ic == ip->ip_conn);
	KASSERT(len > 0, ("%s: len is %jd", __func__, (intmax_t)len));

	m_tail = ip->ip_data_mbuf;
	if (m_tail != NULL)
		for (; m_tail->m_next != NULL; m_tail = m_tail->m_next)
			;

	if (flags & ICL_NOCOPY) {
		m = m_get(flags & ~ICL_NOCOPY, MT_DATA);
		if (m == NULL) {
			ICL_WARN("failed to allocate mbuf");
			return (ENOMEM);
		}

		m->m_flags |= M_RDONLY;
		m_extaddref(m, __DECONST(char *, addr), len, &icp->ref_cnt,
		    icl_cxgbei_mbuf_done, icp, NULL);
		m->m_len = len;
		if (ip->ip_data_mbuf == NULL) {
			ip->ip_data_mbuf = m;
			ip->ip_data_len = len;
		} else {
			m_tail->m_next = m;
			m_tail = m_tail->m_next;
			ip->ip_data_len += len;
		}

		return (0);
	}

	m = cxgbei_getm(len, flags);
	if (__predict_false(m == NULL))
		return (ENOMEM);

	if (ip->ip_data_mbuf == NULL) {
		ip->ip_data_mbuf = m;
		ip->ip_data_len = len;
	} else {
		m_tail->m_next = m;
		ip->ip_data_len += len;
	}
	src = (const char *)addr;
	for (; m != NULL; m = m->m_next) {
		m->m_len = min(len, M_SIZE(m));
		memcpy(mtod(m, void *), src, m->m_len);
		src += m->m_len;
		len -= m->m_len;
	}
	MPASS(len == 0);

	MPASS(ip->ip_data_len <= max(ic->ic_max_send_data_segment_length,
	    ic->ic_hw_isomax));

	return (0);
}

void
icl_cxgbei_conn_pdu_get_bio(struct icl_conn *ic, struct icl_pdu *ip,
    size_t pdu_off, struct bio *bp, size_t bio_off, size_t len)
{
	struct icl_cxgbei_pdu *icp = ip_to_icp(ip);
	vm_offset_t vaddr;
	size_t page_offset, todo;
	bool mapped;
	int i;

	if (icp->icp_flags & ICPF_RX_DDP)
		return;	/* data is DDP'ed, no need to copy */

	MPASS(bp->bio_flags & BIO_UNMAPPED);
	if (bio_off < PAGE_SIZE - bp->bio_ma_offset) {
		page_offset = bp->bio_ma_offset + bio_off;
		i = 0;
	} else {
		bio_off -= PAGE_SIZE - bp->bio_ma_offset;
		for (i = 1; bio_off >= PAGE_SIZE; i++)
			bio_off -= PAGE_SIZE;
		page_offset = bio_off;
	}

	while (len > 0) {
		todo = MIN(len, PAGE_SIZE - page_offset);

		mapped = pmap_map_io_transient(bp->bio_ma + i, &vaddr, 1,
		    false);
		m_copydata(ip->ip_data_mbuf, pdu_off, todo, (char *)vaddr +
		    page_offset);
		if (__predict_false(mapped))
			pmap_unmap_io_transient(bp->bio_ma + i, &vaddr, 1,
			    false);

		page_offset = 0;
		pdu_off += todo;
		len -= todo;
		i++;
	}
}

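/*
 * Copy received PDU data into a caller-supplied buffer. This is a
 * no-op when the payload was already placed directly by DDP.
 */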
void
icl_cxgbei_conn_pdu_get_data(struct icl_conn *ic, struct icl_pdu *ip,
    size_t off, void *addr, size_t len)
{
	struct icl_cxgbei_pdu *icp = ip_to_icp(ip);

	if (icp->icp_flags & ICPF_RX_DDP)
		return;	/* data is DDP'ed, no need to copy */
	m_copydata(ip->ip_data_mbuf, off, len, addr);
}

void
icl_cxgbei_conn_pdu_queue(struct icl_conn *ic, struct icl_pdu *ip)
{
	icl_cxgbei_conn_pdu_queue_cb(ic, ip, NULL);
}

void
icl_cxgbei_conn_pdu_queue_cb(struct icl_conn *ic, struct icl_pdu *ip,
    icl_pdu_cb cb)
{
	struct icl_cxgbei_conn *icc = ic_to_icc(ic);
	struct icl_cxgbei_pdu *icp = ip_to_icp(ip);
	struct socket *so = ic->ic_socket;

	MPASS(ic == ip->ip_conn);
	MPASS(ip->ip_bhs_mbuf != NULL);
	/* The kernel doesn't generate PDUs with AHS. */
	MPASS(ip->ip_ahs_mbuf == NULL && ip->ip_ahs_len == 0);

	ICL_CONN_LOCK_ASSERT(ic);

	icp->cb = cb;

	/* NOTE: sowriteable without so_snd lock is a mostly harmless race. */
	if (ic->ic_disconnecting || so == NULL || !sowriteable(so)) {
		icl_cxgbei_pdu_done(ip, ENOTCONN);
		return;
	}

	STAILQ_INSERT_TAIL(&icc->sent_pdus, ip, ip_next);
	if (!icc->tx_active) {
		icc->tx_active = true;
		wakeup(&icc->tx_active);
	}
}

static struct icl_conn *
icl_cxgbei_new_conn(const char *name, struct mtx *lock)
{
	struct icl_cxgbei_conn *icc;
	struct icl_conn *ic;

	refcount_acquire(&icl_cxgbei_ncons);

	icc = (struct icl_cxgbei_conn *)kobj_create(&icl_cxgbei_class, M_CXGBE,
	    M_WAITOK | M_ZERO);
	icc->icc_signature = CXGBEI_CONN_SIGNATURE;
	STAILQ_INIT(&icc->rcvd_pdus);
	STAILQ_INIT(&icc->sent_pdus);

	icc->cmp_table = hashinit(64, M_CXGBEI, &icc->cmp_hash_mask);
	mtx_init(&icc->cmp_lock, "cxgbei_cmp", NULL, MTX_DEF);

	ic = &icc->ic;
	ic->ic_lock = lock;

#ifdef DIAGNOSTIC
	refcount_init(&ic->ic_outstanding_pdus, 0);
#endif
	ic->ic_name = name;
	ic->ic_offload = "cxgbei";
	ic->ic_unmapped = true;

	CTR2(KTR_CXGBE, "%s: icc %p", __func__, icc);

	return (ic);
}

void
icl_cxgbei_conn_free(struct icl_conn *ic)
{
	struct icl_cxgbei_conn *icc = ic_to_icc(ic);

	MPASS(icc->icc_signature == CXGBEI_CONN_SIGNATURE);

	CTR2(KTR_CXGBE, "%s: icc %p", __func__, icc);

	mtx_destroy(&icc->cmp_lock);
	hashdestroy(icc->cmp_table, M_CXGBEI, icc->cmp_hash_mask);
	kobj_delete((struct kobj *)icc, M_CXGBE);
	refcount_release(&icl_cxgbei_ncons);
}

static int
icl_cxgbei_setsockopt(struct icl_conn *ic, struct socket *so, int sspace,
    int rspace)
{
	struct sockopt opt;
	int error, one = 1, ss, rs;

	ss = max(sendspace, sspace);
	rs = max(recvspace, rspace);

	error = soreserve(so, ss, rs);
	if (error != 0)
		return (error);
	SOCKBUF_LOCK(&so->so_snd);
	so->so_snd.sb_flags |= SB_AUTOSIZE;
	SOCKBUF_UNLOCK(&so->so_snd);
	SOCKBUF_LOCK(&so->so_rcv);
	so->so_rcv.sb_flags |= SB_AUTOSIZE;
	SOCKBUF_UNLOCK(&so->so_rcv);

	/*
	 * Disable Nagle.
	 */
	bzero(&opt, sizeof(opt));
	opt.sopt_dir = SOPT_SET;
	opt.sopt_level = IPPROTO_TCP;
	opt.sopt_name = TCP_NODELAY;
	opt.sopt_val = &one;
	opt.sopt_valsize = sizeof(one);
	error = sosetopt(so, &opt);
	if (error != 0)
		return (error);

	return (0);
}

static bool
is_memfree(struct adapter *sc)
{
	uint32_t em;

	em = t4_read_reg(sc, A_MA_TARGET_MEM_ENABLE);
	if ((em & F_EXT_MEM_ENABLE) != 0)
		return (false);
	if (is_t5(sc) && (em & F_EXT_MEM1_ENABLE) != 0)
		return (false);
	return (true);
}

static void
set_ulp_mode_iscsi(struct adapter *sc, struct toepcb *toep, u_int ulp_submode)
{
	uint64_t val;

	CTR3(KTR_CXGBE, "%s: tid %u, ULP_MODE_ISCSI, submode=%#x",
	    __func__, toep->tid, ulp_submode);

	val = V_TCB_ULP_TYPE(ULP_MODE_ISCSI) | V_TCB_ULP_RAW(ulp_submode);
	t4_set_tcb_field(sc, toep->ctrlq, toep, W_TCB_ULP_TYPE,
	    V_TCB_ULP_TYPE(M_TCB_ULP_TYPE) | V_TCB_ULP_RAW(M_TCB_ULP_RAW), val,
	    0, 0);

	val = V_TF_RX_FLOW_CONTROL_DISABLE(1ULL);
	t4_set_tcb_field(sc, toep->ctrlq, toep, W_TCB_T_FLAGS, val, val, 0, 0);
}

/*
 * XXXNP: Who is responsible for cleaning up the socket if this returns with an
 * error? Review all error paths.
 *
 * XXXNP: What happens to the socket's fd reference if the operation is
 * successful, and how does that affect the socket's life cycle?
 */
int
icl_cxgbei_conn_handoff(struct icl_conn *ic, int fd)
{
	struct icl_cxgbei_conn *icc = ic_to_icc(ic);
	struct file *fp;
	struct socket *so;
	struct inpcb *inp;
	struct tcpcb *tp;
	struct toepcb *toep;
	cap_rights_t rights;
	u_int max_iso_payload, max_rx_pdu_len, max_tx_pdu_len;
	int error, max_iso_pdus;

	MPASS(icc->icc_signature == CXGBEI_CONN_SIGNATURE);
	ICL_CONN_LOCK_ASSERT_NOT(ic);

	/*
	 * Steal the socket from userland.
	 */
	error = fget(curthread, fd,
	    cap_rights_init_one(&rights, CAP_SOCK_CLIENT), &fp);
	if (error != 0)
		return (error);
	if (fp->f_type != DTYPE_SOCKET) {
		fdrop(fp, curthread);
		return (EINVAL);
	}
	so = fp->f_data;
	if (so->so_type != SOCK_STREAM ||
	    so->so_proto->pr_protocol != IPPROTO_TCP) {
		fdrop(fp, curthread);
		return (EINVAL);
	}

	ICL_CONN_LOCK(ic);
	if (ic->ic_socket != NULL) {
		ICL_CONN_UNLOCK(ic);
		fdrop(fp, curthread);
		return (EBUSY);
	}
	ic->ic_disconnecting = false;
	ic->ic_socket = so;
	fp->f_ops = &badfileops;
	fp->f_data = NULL;
	fdrop(fp, curthread);
	ICL_CONN_UNLOCK(ic);

	icc->sc = find_offload_adapter(so);
	if (icc->sc == NULL) {
		error = EINVAL;
		goto out;
	}

	max_rx_pdu_len = ISCSI_BHS_SIZE + ic->ic_max_recv_data_segment_length;
	max_tx_pdu_len = ISCSI_BHS_SIZE + ic->ic_max_send_data_segment_length;
	if (ic->ic_header_crc32c) {
		max_rx_pdu_len += ISCSI_HEADER_DIGEST_SIZE;
		max_tx_pdu_len += ISCSI_HEADER_DIGEST_SIZE;
	}
	if (ic->ic_data_crc32c) {
		max_rx_pdu_len += ISCSI_DATA_DIGEST_SIZE;
		max_tx_pdu_len += ISCSI_DATA_DIGEST_SIZE;
	}

	inp = sotoinpcb(so);
	INP_WLOCK(inp);
	tp = intotcpcb(inp);
	if (inp->inp_flags & INP_DROPPED) {
		INP_WUNLOCK(inp);
		error = ENOTCONN;
		goto out;
	}

	/*
	 * socket could not have been "unoffloaded" if here.
	 */
	MPASS(tp->t_flags & TF_TOE);
	MPASS(tp->tod != NULL);
	MPASS(tp->t_toe != NULL);
	toep = tp->t_toe;
	MPASS(toep->vi->adapter == icc->sc);

	if (ulp_mode(toep) != ULP_MODE_NONE) {
		INP_WUNLOCK(inp);
		error = EINVAL;
		goto out;
	}

	icc->toep = toep;

	icc->ulp_submode = 0;
	if (ic->ic_header_crc32c)
		icc->ulp_submode |= ULP_CRC_HEADER;
	if (ic->ic_data_crc32c)
		icc->ulp_submode |= ULP_CRC_DATA;

	if (icc->sc->tt.iso && chip_id(icc->sc) >= CHELSIO_T5 &&
	    !is_memfree(icc->sc)) {
		max_iso_payload = rounddown(CXGBEI_MAX_ISO_PAYLOAD,
		    tp->t_maxseg);
		max_iso_pdus = max_iso_payload / max_tx_pdu_len;
		ic->ic_hw_isomax = max_iso_pdus *
		    ic->ic_max_send_data_segment_length;
	} else
		max_iso_pdus = 1;

	toep->params.ulp_mode = ULP_MODE_ISCSI;
	toep->ulpcb = icc;

	send_txdataplen_max_flowc_wr(icc->sc, toep,
	    roundup(max_iso_pdus * max_tx_pdu_len, tp->t_maxseg));
	set_ulp_mode_iscsi(icc->sc, toep, icc->ulp_submode);
	INP_WUNLOCK(inp);

	error = kthread_add(icl_cxgbei_tx_main, icc, NULL, &icc->tx_thread, 0,
	    0, "%stx (cxgbei)", ic->ic_name);
	if (error != 0)
		goto out;

	error = kthread_add(icl_cxgbei_rx_main, icc, NULL, &icc->rx_thread, 0,
	    0, "%srx (cxgbei)", ic->ic_name);
	if (error != 0)
		goto out;

	error = icl_cxgbei_setsockopt(ic, so, max_tx_pdu_len, max_rx_pdu_len);
out:
	if (error != 0)
		icl_cxgbei_conn_close(ic);
	return (error);
}

void
icl_cxgbei_conn_close(struct icl_conn *ic)
{
	struct icl_cxgbei_conn *icc = ic_to_icc(ic);
	struct icl_pdu *ip;
	struct socket *so;
	struct sockbuf *sb;
	struct inpcb *inp;
	struct toepcb *toep = icc->toep;

	MPASS(icc->icc_signature == CXGBEI_CONN_SIGNATURE);
	ICL_CONN_LOCK_ASSERT_NOT(ic);

	ICL_CONN_LOCK(ic);
	so = ic->ic_socket;
	if (ic->ic_disconnecting || so == NULL) {
		CTR4(KTR_CXGBE, "%s: icc %p (disconnecting = %d), so %p",
		    __func__, icc, ic->ic_disconnecting, so);
		ICL_CONN_UNLOCK(ic);
		return;
	}
	ic->ic_disconnecting = true;

#ifdef DIAGNOSTIC
	KASSERT(ic->ic_outstanding_pdus == 0,
	    ("destroying session with %d outstanding PDUs",
	    ic->ic_outstanding_pdus));
#endif

	CTR3(KTR_CXGBE, "%s: tid %d, icc %p", __func__, toep ? toep->tid : -1,
	    icc);

	/*
	 * Wait for the transmit thread to stop processing
	 * this connection.
	 */
	if (icc->tx_thread != NULL) {
		wakeup(&icc->tx_active);
		mtx_sleep(icc->tx_thread, ic->ic_lock, 0, "conclo", 0);
	}

	/* Discard PDUs queued for TX. */
	while (!STAILQ_EMPTY(&icc->sent_pdus)) {
		ip = STAILQ_FIRST(&icc->sent_pdus);
		STAILQ_REMOVE_HEAD(&icc->sent_pdus, ip_next);
		icl_cxgbei_pdu_done(ip, ENOTCONN);
	}
	ICL_CONN_UNLOCK(ic);

	inp = sotoinpcb(so);
	sb = &so->so_rcv;

	/*
	 * Wait for the receive thread to stop processing this
	 * connection.
	 */
	SOCKBUF_LOCK(sb);
	if (icc->rx_thread != NULL) {
		icc->rx_exiting = true;
		wakeup(&icc->rx_active);
		mtx_sleep(icc->rx_thread, SOCKBUF_MTX(sb), 0, "conclo", 0);
	}

	/*
	 * Discard received PDUs not passed to the iSCSI layer.
	 */
	while (!STAILQ_EMPTY(&icc->rcvd_pdus)) {
		ip = STAILQ_FIRST(&icc->rcvd_pdus);
		STAILQ_REMOVE_HEAD(&icc->rcvd_pdus, ip_next);
		icl_cxgbei_pdu_done(ip, ENOTCONN);
	}
	SOCKBUF_UNLOCK(sb);

	INP_WLOCK(inp);
	if (toep != NULL) {	/* NULL if connection was never offloaded. */
		toep->ulpcb = NULL;

		/* Discard mbufs queued for TX. */
		mbufq_drain(&toep->ulp_pduq);

		/*
		 * Grab a reference to use when waiting for the final
		 * CPL to be received. If toep->inp is NULL, then
		 * final_cpl_received() has already been called (e.g.
		 * due to the peer sending a RST).
		 */
		if (toep->inp != NULL) {
			toep = hold_toepcb(toep);
			toep->flags |= TPF_WAITING_FOR_FINAL;
		} else
			toep = NULL;
	}
	INP_WUNLOCK(inp);

	ICL_CONN_LOCK(ic);
	ic->ic_socket = NULL;
	ICL_CONN_UNLOCK(ic);

	/*
	 * XXXNP: we should send RST instead of FIN when PDUs held in various
	 * queues were purged instead of delivered reliably but soabort isn't
	 * really general purpose and wouldn't do the right thing here.
	 */
	soclose(so);

	/*
	 * Wait for the socket to fully close. This ensures any
	 * pending received data has been received (and in particular,
	 * any data that would be received by DDP has been handled).
	 * Callers assume that it is safe to free buffers for tasks
	 * and transfers after this function returns.
	 */
	if (toep != NULL) {
		struct mtx *lock = mtx_pool_find(mtxpool_sleep, toep);

		mtx_lock(lock);
		while ((toep->flags & TPF_WAITING_FOR_FINAL) != 0)
			mtx_sleep(toep, lock, PSOCK, "conclo2", 0);
		mtx_unlock(lock);
		free_toepcb(toep);
	}
}

static void
cxgbei_insert_cmp(struct icl_cxgbei_conn *icc, struct cxgbei_cmp *cmp,
    uint32_t tt)
{
#ifdef INVARIANTS
	struct cxgbei_cmp *cmp2;
#endif

	cmp->tt = tt;

	mtx_lock(&icc->cmp_lock);
#ifdef INVARIANTS
	LIST_FOREACH(cmp2, &icc->cmp_table[TT_HASH(icc, tt)], link) {
		KASSERT(cmp2->tt != tt, ("%s: duplicate cmp", __func__));
	}
#endif
	LIST_INSERT_HEAD(&icc->cmp_table[TT_HASH(icc, tt)], cmp, link);
	mtx_unlock(&icc->cmp_lock);
}

struct cxgbei_cmp *
cxgbei_find_cmp(struct icl_cxgbei_conn *icc, uint32_t tt)
{
	struct cxgbei_cmp *cmp;

	mtx_lock(&icc->cmp_lock);
	LIST_FOREACH(cmp, &icc->cmp_table[TT_HASH(icc, tt)], link) {
		if (cmp->tt == tt)
			break;
	}
	mtx_unlock(&icc->cmp_lock);
	return (cmp);
}

static void
cxgbei_rm_cmp(struct icl_cxgbei_conn *icc, struct cxgbei_cmp *cmp)
{
#ifdef INVARIANTS
	struct cxgbei_cmp *cmp2;
#endif

	mtx_lock(&icc->cmp_lock);

#ifdef INVARIANTS
	LIST_FOREACH(cmp2, &icc->cmp_table[TT_HASH(icc, cmp->tt)], link) {
		if (cmp2 == cmp)
			goto found;
	}
	panic("%s: could not find cmp", __func__);
found:
#endif
	LIST_REMOVE(cmp, link);
	mtx_unlock(&icc->cmp_lock);
}

int
icl_cxgbei_conn_task_setup(struct icl_conn *ic, struct icl_pdu *ip,
    struct ccb_scsiio *csio, uint32_t *ittp, void **arg)
{
	struct icl_cxgbei_conn *icc = ic_to_icc(ic);
	struct toepcb *toep = icc->toep;
	struct adapter *sc = icc->sc;
	struct cxgbei_data *ci = sc->iscsi_ulp_softc;
	struct ppod_region *pr = &ci->pr;
	struct cxgbei_ddp_state *ddp;
	struct ppod_reservation *prsv;
	struct inpcb *inp;
	struct mbufq mq;
	uint32_t itt;
	int rc = 0;

	ICL_CONN_LOCK_ASSERT(ic);

	/* This is for the offload driver's state. Must not be set already. */
	MPASS(arg != NULL);
	MPASS(*arg == NULL);

	if ((csio->ccb_h.flags & CAM_DIR_MASK) != CAM_DIR_IN ||
	    csio->dxfer_len < ci->ddp_threshold || ic->ic_disconnecting ||
	    ic->ic_socket == NULL) {
no_ddp:
		/*
		 * No DDP for this I/O. Allocate an ITT (based on the one
		 * passed in) that cannot be a valid hardware DDP tag in the
		 * iSCSI region.
		 */
		itt = *ittp & M_PPOD_TAG;
		itt = V_PPOD_TAG(itt) | pr->pr_invalid_bit;
		*ittp = htobe32(itt);
		MPASS(*arg == NULL);	/* State is maintained for DDP only. */
		if (rc != 0)
			counter_u64_add(
			    toep->ofld_rxq->rx_iscsi_ddp_setup_error, 1);
		return (0);
	}

	/*
	 * Reserve resources for DDP, update the itt that should be used in the
	 * PDU, and save DDP specific state for this I/O in *arg.
	 */
	ddp = malloc(sizeof(*ddp), M_CXGBEI, M_NOWAIT | M_ZERO);
	if (ddp == NULL) {
		rc = ENOMEM;
		goto no_ddp;
	}
	prsv = &ddp->prsv;

	mbufq_init(&mq, INT_MAX);
	switch (csio->ccb_h.flags & CAM_DATA_MASK) {
	case CAM_DATA_BIO:
		rc = t4_alloc_page_pods_for_bio(pr,
		    (struct bio *)csio->data_ptr, prsv);
		if (rc != 0) {
			free(ddp, M_CXGBEI);
			goto no_ddp;
		}

		rc = t4_write_page_pods_for_bio(sc, toep, prsv,
		    (struct bio *)csio->data_ptr, &mq);
		if (__predict_false(rc != 0)) {
			mbufq_drain(&mq);
			t4_free_page_pods(prsv);
			free(ddp, M_CXGBEI);
			goto no_ddp;
		}
		break;
	case CAM_DATA_VADDR:
		rc = t4_alloc_page_pods_for_buf(pr, (vm_offset_t)csio->data_ptr,
		    csio->dxfer_len, prsv);
		if (rc != 0) {
			free(ddp, M_CXGBEI);
			goto no_ddp;
		}

		rc = t4_write_page_pods_for_buf(sc, toep, prsv,
		    (vm_offset_t)csio->data_ptr, csio->dxfer_len, &mq);
		if (__predict_false(rc != 0)) {
			mbufq_drain(&mq);
			t4_free_page_pods(prsv);
			free(ddp, M_CXGBEI);
			goto no_ddp;
		}
		break;
	default:
		free(ddp, M_CXGBEI);
		rc = EINVAL;
		goto no_ddp;
	}

	/*
	 * Do not get inp from toep->inp as the toepcb might have
	 * detached already.
	 */
	inp = sotoinpcb(ic->ic_socket);
	INP_WLOCK(inp);
	if ((inp->inp_flags & INP_DROPPED) != 0) {
		INP_WUNLOCK(inp);
		mbufq_drain(&mq);
		t4_free_page_pods(prsv);
		free(ddp, M_CXGBEI);
		goto no_ddp;
	}
	mbufq_concat(&toep->ulp_pduq, &mq);
	INP_WUNLOCK(inp);

	ddp->cmp.last_datasn = -1;
	cxgbei_insert_cmp(icc, &ddp->cmp, prsv->prsv_tag);
	*ittp = htobe32(prsv->prsv_tag);
	*arg = prsv;
	counter_u64_add(toep->ofld_rxq->rx_iscsi_ddp_setup_ok, 1);
	return (0);
}

void
icl_cxgbei_conn_task_done(struct icl_conn *ic, void *arg)
{

	if (arg != NULL) {
		struct cxgbei_ddp_state *ddp = arg;

		cxgbei_rm_cmp(ic_to_icc(ic), &ddp->cmp);
		t4_free_page_pods(&ddp->prsv);
		free(ddp, M_CXGBEI);
	}
}

static inline bool
ddp_sgl_check(struct ctl_sg_entry *sg, int entries, int xferlen)
{
#ifdef INVARIANTS
	int total_len = 0;
#endif

	MPASS(entries > 0);
	if (((vm_offset_t)sg[--entries].addr & 3U) != 0)
		return (false);

#ifdef INVARIANTS
	total_len += sg[entries].len;
#endif

	while (--entries >= 0) {
		if (((vm_offset_t)sg[entries].addr & PAGE_MASK) != 0 ||
		    (sg[entries].len % PAGE_SIZE) != 0)
			return (false);
#ifdef INVARIANTS
		total_len += sg[entries].len;
#endif
	}

	MPASS(total_len == xferlen);
	return (true);
}

#define	io_to_ddp_state(io)	((io)->io_hdr.ctl_private[CTL_PRIV_FRONTEND2].ptr)

int
icl_cxgbei_conn_transfer_setup(struct icl_conn *ic, struct icl_pdu *ip,
    union ctl_io *io, uint32_t *tttp, void **arg)
{
	struct icl_cxgbei_conn *icc = ic_to_icc(ic);
	struct toepcb *toep = icc->toep;
	struct ctl_scsiio *ctsio = &io->scsiio;
	struct adapter *sc = icc->sc;
	struct cxgbei_data *ci = sc->iscsi_ulp_softc;
	struct ppod_region *pr = &ci->pr;
	struct cxgbei_ddp_state *ddp;
	struct ppod_reservation *prsv;
	struct ctl_sg_entry *sgl, sg_entry;
	struct inpcb *inp;
	struct mbufq mq;
	int sg_entries = ctsio->kern_sg_entries;
	uint32_t ttt;
	int xferlen, rc = 0, alias;

	/* This is for the offload driver's state. Must not be set already. */
	MPASS(arg != NULL);
	MPASS(*arg == NULL);

	if (ctsio->ext_data_filled == 0) {
		int first_burst;
#ifdef INVARIANTS
		struct icl_cxgbei_pdu *icp = ip_to_icp(ip);

		MPASS(icp->icp_signature == CXGBEI_PDU_SIGNATURE);
		MPASS(ic == ip->ip_conn);
		MPASS(ip->ip_bhs_mbuf != NULL);
#endif
		first_burst = icl_pdu_data_segment_length(ip);

		/*
		 * Note that ICL calls conn_transfer_setup even if the first
		 * burst had everything and there's nothing left to transfer.
		 *
		 * NB: The CTL frontend might have provided a buffer
		 * whose length (kern_data_len) is smaller than the
		 * FirstBurstLength of unsolicited data. Treat those
		 * as an empty transfer.
		 */
		xferlen = ctsio->kern_data_len;
		if (xferlen < first_burst ||
		    xferlen - first_burst < ci->ddp_threshold) {
no_ddp:
			/*
			 * No DDP for this transfer. Allocate a TTT (based on
			 * the one passed in) that cannot be a valid hardware
			 * DDP tag in the iSCSI region.
			 */
			ttt = *tttp & M_PPOD_TAG;
			ttt = V_PPOD_TAG(ttt) | pr->pr_invalid_bit;
			*tttp = htobe32(ttt);
			MPASS(io_to_ddp_state(io) == NULL);
			if (rc != 0)
				counter_u64_add(
				    toep->ofld_rxq->rx_iscsi_ddp_setup_error, 1);
			return (0);
		}

		if (sg_entries == 0) {
			sgl = &sg_entry;
			sgl->len = xferlen;
			sgl->addr = (void *)ctsio->kern_data_ptr;
			sg_entries = 1;
		} else
			sgl = (void *)ctsio->kern_data_ptr;

		if (!ddp_sgl_check(sgl, sg_entries, xferlen))
			goto no_ddp;

		/*
		 * Reserve resources for DDP, update the ttt that should be used
		 * in the PDU, and save DDP specific state for this I/O.
		 */
		MPASS(io_to_ddp_state(io) == NULL);
		ddp = malloc(sizeof(*ddp), M_CXGBEI, M_NOWAIT | M_ZERO);
		if (ddp == NULL) {
			rc = ENOMEM;
			goto no_ddp;
		}
		prsv = &ddp->prsv;

		rc = t4_alloc_page_pods_for_sgl(pr, sgl, sg_entries, prsv);
		if (rc != 0) {
			free(ddp, M_CXGBEI);
			goto no_ddp;
		}

		mbufq_init(&mq, INT_MAX);
		rc = t4_write_page_pods_for_sgl(sc, toep, prsv, sgl, sg_entries,
		    xferlen, &mq);
		if (__predict_false(rc != 0)) {
			mbufq_drain(&mq);
			t4_free_page_pods(prsv);
			free(ddp, M_CXGBEI);
			goto no_ddp;
		}

		/*
		 * Do not get inp from toep->inp as the toepcb might
		 * have detached already.
		 */
		ICL_CONN_LOCK(ic);
		if (ic->ic_disconnecting || ic->ic_socket == NULL) {
			ICL_CONN_UNLOCK(ic);
			mbufq_drain(&mq);
			t4_free_page_pods(prsv);
			free(ddp, M_CXGBEI);
			return (ECONNRESET);
		}
		inp = sotoinpcb(ic->ic_socket);
		INP_WLOCK(inp);
		ICL_CONN_UNLOCK(ic);
		if ((inp->inp_flags & INP_DROPPED) != 0) {
			INP_WUNLOCK(inp);
			mbufq_drain(&mq);
			t4_free_page_pods(prsv);
			free(ddp, M_CXGBEI);
			return (ECONNRESET);
		}
		mbufq_concat(&toep->ulp_pduq, &mq);
		INP_WUNLOCK(inp);

		ddp->cmp.next_buffer_offset = ctsio->kern_rel_offset +
		    first_burst;
		ddp->cmp.last_datasn = -1;
		cxgbei_insert_cmp(icc, &ddp->cmp, prsv->prsv_tag);
		*tttp = htobe32(prsv->prsv_tag);
		io_to_ddp_state(io) = ddp;
		*arg = ctsio;
		counter_u64_add(toep->ofld_rxq->rx_iscsi_ddp_setup_ok, 1);
		return (0);
	}

	/*
	 * In the middle of an I/O. A non-NULL page pod reservation indicates
	 * that a DDP buffer is being used for the I/O.
	 */
	ddp = io_to_ddp_state(ctsio);
	if (ddp == NULL)
		goto no_ddp;
	prsv = &ddp->prsv;

	alias = (prsv->prsv_tag & pr->pr_alias_mask) >> pr->pr_alias_shift;
	alias++;
	prsv->prsv_tag &= ~pr->pr_alias_mask;
	prsv->prsv_tag |= alias << pr->pr_alias_shift & pr->pr_alias_mask;

	ddp->cmp.last_datasn = -1;
	cxgbei_insert_cmp(icc, &ddp->cmp, prsv->prsv_tag);
	*tttp = htobe32(prsv->prsv_tag);
	*arg = ctsio;

	return (0);
}

void
icl_cxgbei_conn_transfer_done(struct icl_conn *ic, void *arg)
{
	struct ctl_scsiio *ctsio = arg;

	if (ctsio != NULL) {
		struct cxgbei_ddp_state *ddp;

		ddp = io_to_ddp_state(ctsio);
		MPASS(ddp != NULL);

		cxgbei_rm_cmp(ic_to_icc(ic), &ddp->cmp);
		if (ctsio->kern_data_len == ctsio->ext_data_filled ||
		    ic->ic_disconnecting) {
			t4_free_page_pods(&ddp->prsv);
			free(ddp, M_CXGBEI);
			io_to_ddp_state(ctsio) = NULL;
		}
	}
}

#ifdef COMPAT_FREEBSD13
static void
cxgbei_limits(struct adapter *sc, void *arg)
{
	struct icl_drv_limits *idl = arg;
	struct cxgbei_data *ci;
	int max_dsl;

	if (begin_synchronized_op(sc, NULL, HOLD_LOCK, "t4lims") != 0)
		return;

	if (uld_active(sc, ULD_ISCSI)) {
		ci = sc->iscsi_ulp_softc;
		MPASS(ci != NULL);

		max_dsl = ci->max_rx_data_len;
		if (idl->idl_max_recv_data_segment_length > max_dsl)
			idl->idl_max_recv_data_segment_length = max_dsl;

		max_dsl = ci->max_tx_data_len;
		if (idl->idl_max_send_data_segment_length > max_dsl)
			idl->idl_max_send_data_segment_length = max_dsl;
	}

	end_synchronized_op(sc, LOCK_HELD);
}
#endif

static int
cxgbei_limits_fd(struct icl_drv_limits *idl, int fd)
{
	struct file *fp;
	struct socket *so;
	struct adapter *sc;
	struct cxgbei_data *ci;
	cap_rights_t rights;
	int error;

	error = fget(curthread, fd,
	    cap_rights_init_one(&rights, CAP_SOCK_CLIENT), &fp);
	if (error != 0)
		return (error);
	if (fp->f_type != DTYPE_SOCKET) {
		fdrop(fp, curthread);
		return (EINVAL);
	}
	so = fp->f_data;
	if (so->so_type != SOCK_STREAM ||
	    so->so_proto->pr_protocol != IPPROTO_TCP) {
		fdrop(fp, curthread);
		return (EINVAL);
	}

	sc = find_offload_adapter(so);
	if (sc == NULL) {
		fdrop(fp, curthread);
		return (ENXIO);
	}
	fdrop(fp, curthread);

	error = begin_synchronized_op(sc, NULL, HOLD_LOCK, "t4lims");
	if (error != 0)
		return (error);

	if (uld_active(sc, ULD_ISCSI)) {
		ci = sc->iscsi_ulp_softc;
		MPASS(ci != NULL);

		idl->idl_max_recv_data_segment_length = ci->max_rx_data_len;
		idl->idl_max_send_data_segment_length = ci->max_tx_data_len;
	} else
		error = ENXIO;

	end_synchronized_op(sc, LOCK_HELD);

	return (error);
}

static int
icl_cxgbei_limits(struct icl_drv_limits *idl, int socket)
{

	/* Maximum allowed by the RFC. cxgbei_limits will clip them. */
	idl->idl_max_recv_data_segment_length = (1 << 24) - 1;
	idl->idl_max_send_data_segment_length = (1 << 24) - 1;

	/* These are somewhat arbitrary. */
	idl->idl_max_burst_length = max_burst_length;
	idl->idl_first_burst_length = first_burst_length;

#ifdef COMPAT_FREEBSD13
	if (socket == 0) {
		t4_iterate(cxgbei_limits, idl);
		return (0);
	}
#endif

	return (cxgbei_limits_fd(idl, socket));
}

int
icl_cxgbei_mod_load(void)
{
	int rc;

	refcount_init(&icl_cxgbei_ncons, 0);

	rc = icl_register("cxgbei", false, -100, icl_cxgbei_limits,
	    icl_cxgbei_new_conn);

	return (rc);
}

int
icl_cxgbei_mod_unload(void)
{

	if (icl_cxgbei_ncons != 0)
		return (EBUSY);

	icl_unregister("cxgbei", false);

	return (0);
}
#endif