/*-
 * Copyright (c) 2012 The FreeBSD Foundation
 * Copyright (c) 2015 Chelsio Communications, Inc.
 * All rights reserved.
 *
 * This software was developed by Edward Tomasz Napierala under sponsorship
 * from the FreeBSD Foundation.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 */

/*
 * cxgbei implementation of iSCSI Common Layer kobj(9) interface.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_inet.h"
#include "opt_inet6.h"

#ifdef TCP_OFFLOAD
#include <sys/param.h>
#include <sys/capsicum.h>
#include <sys/condvar.h>
#include <sys/conf.h>
#include <sys/file.h>
#include <sys/kernel.h>
#include <sys/kthread.h>
#include <sys/ktr.h>
#include <sys/lock.h>
#include <sys/mbuf.h>
#include <sys/mutex.h>
#include <sys/module.h>
#include <sys/protosw.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/sysctl.h>
#include <sys/systm.h>
#include <sys/sx.h>
#include <sys/uio.h>
#include <machine/bus.h>
#include <vm/vm.h>
#include <vm/pmap.h>
#include <netinet/in.h>
#include <netinet/in_pcb.h>
#include <netinet/tcp.h>
#include <netinet/tcp_var.h>
#include <netinet/toecore.h>

#include <dev/iscsi/icl.h>
#include <dev/iscsi/iscsi_proto.h>
#include <icl_conn_if.h>

#include <cam/scsi/scsi_all.h>
#include <cam/scsi/scsi_da.h>
#include <cam/ctl/ctl_io.h>
#include <cam/ctl/ctl.h>
#include <cam/ctl/ctl_backend.h>
#include <cam/ctl/ctl_error.h>
#include <cam/ctl/ctl_frontend.h>
#include <cam/ctl/ctl_debug.h>
#include <cam/ctl/ctl_ha.h>
#include <cam/ctl/ctl_ioctl.h>

#include <cam/cam.h>
#include <cam/cam_ccb.h>
#include <cam/cam_xpt.h>
#include <cam/cam_debug.h>
#include <cam/cam_sim.h>
#include <cam/cam_xpt_sim.h>
#include <cam/cam_xpt_periph.h>
#include <cam/cam_periph.h>
#include <cam/cam_compat.h>
#include <cam/scsi/scsi_message.h>

#include "common/common.h"
#include "common/t4_regs.h"
#include "common/t4_tcb.h"
#include "tom/t4_tom.h"
#include "cxgbei.h"

/*
 * Use the page pod tag for the TT hash.
 */
#define	TT_HASH(icc, tt)	(G_PPOD_TAG(tt) & (icc)->cmp_hash_mask)

struct cxgbei_ddp_state {
	struct ppod_reservation prsv;
	struct cxgbei_cmp cmp;
};

static MALLOC_DEFINE(M_CXGBEI, "cxgbei", "cxgbei(4)");

SYSCTL_NODE(_kern_icl, OID_AUTO, cxgbei, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
    "Chelsio iSCSI offload");
static int first_burst_length = 8192;
SYSCTL_INT(_kern_icl_cxgbei, OID_AUTO, first_burst_length, CTLFLAG_RWTUN,
    &first_burst_length, 0, "First burst length");
static int max_burst_length = 2 * 1024 * 1024;
SYSCTL_INT(_kern_icl_cxgbei, OID_AUTO, max_burst_length, CTLFLAG_RWTUN,
    &max_burst_length, 0, "Maximum burst length");
static int sendspace = 1048576;
SYSCTL_INT(_kern_icl_cxgbei, OID_AUTO, sendspace, CTLFLAG_RWTUN,
    &sendspace, 0, "Default send socket buffer size");
static int recvspace = 1048576;
SYSCTL_INT(_kern_icl_cxgbei, OID_AUTO, recvspace, CTLFLAG_RWTUN,
    &recvspace, 0, "Default receive socket buffer size");

static volatile u_int icl_cxgbei_ncons;

static icl_conn_new_pdu_t icl_cxgbei_conn_new_pdu;
static icl_conn_pdu_data_segment_length_t
    icl_cxgbei_conn_pdu_data_segment_length;
static icl_conn_pdu_append_data_t icl_cxgbei_conn_pdu_append_data;
static icl_conn_pdu_get_data_t icl_cxgbei_conn_pdu_get_data;
static icl_conn_pdu_queue_t icl_cxgbei_conn_pdu_queue;
static icl_conn_pdu_queue_cb_t icl_cxgbei_conn_pdu_queue_cb;
static icl_conn_handoff_t icl_cxgbei_conn_handoff;
static icl_conn_free_t icl_cxgbei_conn_free;
static icl_conn_close_t icl_cxgbei_conn_close;
static icl_conn_task_setup_t icl_cxgbei_conn_task_setup;
static icl_conn_task_done_t icl_cxgbei_conn_task_done;
static icl_conn_transfer_setup_t icl_cxgbei_conn_transfer_setup;
static icl_conn_transfer_done_t icl_cxgbei_conn_transfer_done;

static kobj_method_t icl_cxgbei_methods[] = {
	KOBJMETHOD(icl_conn_new_pdu, icl_cxgbei_conn_new_pdu),
	KOBJMETHOD(icl_conn_pdu_free, icl_cxgbei_conn_pdu_free),
	KOBJMETHOD(icl_conn_pdu_data_segment_length,
	    icl_cxgbei_conn_pdu_data_segment_length),
	KOBJMETHOD(icl_conn_pdu_append_data, icl_cxgbei_conn_pdu_append_data),
	KOBJMETHOD(icl_conn_pdu_get_data, icl_cxgbei_conn_pdu_get_data),
	KOBJMETHOD(icl_conn_pdu_queue, icl_cxgbei_conn_pdu_queue),
	KOBJMETHOD(icl_conn_pdu_queue_cb, icl_cxgbei_conn_pdu_queue_cb),
	KOBJMETHOD(icl_conn_handoff, icl_cxgbei_conn_handoff),
	KOBJMETHOD(icl_conn_free, icl_cxgbei_conn_free),
	KOBJMETHOD(icl_conn_close, icl_cxgbei_conn_close),
	KOBJMETHOD(icl_conn_task_setup, icl_cxgbei_conn_task_setup),
	KOBJMETHOD(icl_conn_task_done, icl_cxgbei_conn_task_done),
	KOBJMETHOD(icl_conn_transfer_setup, icl_cxgbei_conn_transfer_setup),
	KOBJMETHOD(icl_conn_transfer_done, icl_cxgbei_conn_transfer_done),
	{ 0, 0 }
};

DEFINE_CLASS(icl_cxgbei, icl_cxgbei_methods, sizeof(struct icl_cxgbei_conn));

void
icl_cxgbei_conn_pdu_free(struct icl_conn *ic, struct icl_pdu *ip)
{
	struct icl_cxgbei_pdu *icp = ip_to_icp(ip);

	KASSERT(icp->ref_cnt != 0, ("freeing deleted PDU"));
	MPASS(icp->icp_signature == CXGBEI_PDU_SIGNATURE);
	MPASS(ic == ip->ip_conn);

	m_freem(ip->ip_ahs_mbuf);
	m_freem(ip->ip_data_mbuf);
	m_freem(ip->ip_bhs_mbuf);

	KASSERT(ic != NULL || icp->ref_cnt == 1,
	    ("orphaned PDU has outstanding references"));

	if (atomic_fetchadd_int(&icp->ref_cnt, -1) != 1)
		return;

	free(icp, M_CXGBEI);
#ifdef DIAGNOSTIC
	if (__predict_true(ic != NULL))
		refcount_release(&ic->ic_outstanding_pdus);
#endif
}

static void
icl_cxgbei_pdu_call_cb(struct icl_pdu *ip)
{
	struct icl_cxgbei_pdu *icp = ip_to_icp(ip);

	MPASS(icp->icp_signature == CXGBEI_PDU_SIGNATURE);

	if (icp->cb != NULL)
		icp->cb(ip, icp->error);
#ifdef DIAGNOSTIC
	if (__predict_true(ip->ip_conn != NULL))
		refcount_release(&ip->ip_conn->ic_outstanding_pdus);
#endif
	free(icp, M_CXGBEI);
}

static void
icl_cxgbei_pdu_done(struct icl_pdu *ip, int error)
{
	struct icl_cxgbei_pdu *icp = ip_to_icp(ip);

	if (error != 0)
		icp->error = error;

	m_freem(ip->ip_ahs_mbuf);
	ip->ip_ahs_mbuf = NULL;
	m_freem(ip->ip_data_mbuf);
	ip->ip_data_mbuf = NULL;
	m_freem(ip->ip_bhs_mbuf);
	ip->ip_bhs_mbuf = NULL;

	/*
	 * All other references to this PDU should have been dropped
	 * by the m_freem() of ip_data_mbuf.
	 */
	if (atomic_fetchadd_int(&icp->ref_cnt, -1) == 1)
		icl_cxgbei_pdu_call_cb(ip);
	else
		__assert_unreachable();
}

static void
icl_cxgbei_mbuf_done(struct mbuf *mb)
{

	struct icl_cxgbei_pdu *icp = (struct icl_cxgbei_pdu *)mb->m_ext.ext_arg1;

	/*
	 * NB: mb_free_mext() might leave ref_cnt as 1 without
	 * decrementing it if it hits the fast path in the ref_cnt
	 * check.
	 */
	icl_cxgbei_pdu_call_cb(&icp->ip);
}

struct icl_pdu *
icl_cxgbei_new_pdu(int flags)
{
	struct icl_cxgbei_pdu *icp;
	struct icl_pdu *ip;
	struct mbuf *m;

	icp = malloc(sizeof(*icp), M_CXGBEI, flags | M_ZERO);
	if (__predict_false(icp == NULL))
		return (NULL);

	icp->icp_signature = CXGBEI_PDU_SIGNATURE;
	icp->ref_cnt = 1;
	ip = &icp->ip;

	m = m_gethdr(flags, MT_DATA);
	if (__predict_false(m == NULL)) {
		free(icp, M_CXGBEI);
		return (NULL);
	}

	ip->ip_bhs_mbuf = m;
	ip->ip_bhs = mtod(m, struct iscsi_bhs *);
	memset(ip->ip_bhs, 0, sizeof(*ip->ip_bhs));
	m->m_len = sizeof(struct iscsi_bhs);
	m->m_pkthdr.len = m->m_len;

	return (ip);
}

void
icl_cxgbei_new_pdu_set_conn(struct icl_pdu *ip, struct icl_conn *ic)
{

	ip->ip_conn = ic;
#ifdef DIAGNOSTIC
	refcount_acquire(&ic->ic_outstanding_pdus);
#endif
}

/*
 * Allocate an icl_pdu with an empty BHS to be filled in by the caller.
 */
static struct icl_pdu *
icl_cxgbei_conn_new_pdu(struct icl_conn *ic, int flags)
{
	struct icl_pdu *ip;

	ip = icl_cxgbei_new_pdu(flags);
	if (__predict_false(ip == NULL))
		return (NULL);
	icl_cxgbei_new_pdu_set_conn(ip, ic);

	return (ip);
}

static size_t
icl_pdu_data_segment_length(const struct icl_pdu *request)
{
	uint32_t len = 0;

	len += request->ip_bhs->bhs_data_segment_len[0];
	len <<= 8;
	len += request->ip_bhs->bhs_data_segment_len[1];
	len <<= 8;
	len += request->ip_bhs->bhs_data_segment_len[2];

	return (len);
}

size_t
icl_cxgbei_conn_pdu_data_segment_length(struct icl_conn *ic,
    const struct icl_pdu *request)
{

	return (icl_pdu_data_segment_length(request));
}

static struct mbuf *
finalize_pdu(struct icl_cxgbei_conn *icc, struct icl_cxgbei_pdu *icp)
{
	struct icl_pdu *ip = &icp->ip;
	uint8_t ulp_submode, padding;
	struct mbuf *m, *last;
	struct iscsi_bhs *bhs;
	int data_len;

	/*
	 * Fix up the data segment mbuf first.
	 */
	m = ip->ip_data_mbuf;
	ulp_submode = icc->ulp_submode;
	if (m != NULL) {
		last = m_last(m);

		/*
		 * Round up the data segment to a 4B boundary.  Pad with 0 if
		 * necessary.  There will definitely be room in the mbuf.
		 */
		padding = roundup2(ip->ip_data_len, 4) - ip->ip_data_len;
		if (padding != 0) {
			MPASS(padding <= M_TRAILINGSPACE(last));
			bzero(mtod(last, uint8_t *) + last->m_len, padding);
			last->m_len += padding;
		}
	} else {
		MPASS(ip->ip_data_len == 0);
		ulp_submode &= ~ULP_CRC_DATA;
		padding = 0;
	}

	/*
	 * Now the header mbuf that has the BHS.
	 */
	m = ip->ip_bhs_mbuf;
	MPASS(m->m_pkthdr.len == sizeof(struct iscsi_bhs));
	MPASS(m->m_len == sizeof(struct iscsi_bhs));

	bhs = ip->ip_bhs;
	data_len = ip->ip_data_len;
	if (data_len > icc->ic.ic_max_send_data_segment_length) {
		struct iscsi_bhs_data_in *bhsdi;
		int flags;

		KASSERT(padding == 0, ("%s: ISO with padding %d for icp %p",
		    __func__, padding, icp));
		switch (bhs->bhs_opcode) {
		case ISCSI_BHS_OPCODE_SCSI_DATA_OUT:
			flags = 1;
			break;
		case ISCSI_BHS_OPCODE_SCSI_DATA_IN:
			flags = 2;
			break;
		default:
			panic("invalid opcode %#x for ISO", bhs->bhs_opcode);
		}
		data_len = icc->ic.ic_max_send_data_segment_length;
		bhsdi = (struct iscsi_bhs_data_in *)bhs;
		if (bhsdi->bhsdi_flags & BHSDI_FLAGS_F) {
			/*
			 * Firmware will set F on the final PDU in the
			 * burst.
			 */
			flags |= CXGBE_ISO_F;
			bhsdi->bhsdi_flags &= ~BHSDI_FLAGS_F;
		}
		set_mbuf_iscsi_iso(m, true);
		set_mbuf_iscsi_iso_flags(m, flags);
		set_mbuf_iscsi_iso_mss(m, data_len);
	}

	bhs->bhs_data_segment_len[2] = data_len;
	bhs->bhs_data_segment_len[1] = data_len >> 8;
	bhs->bhs_data_segment_len[0] = data_len >> 16;

	/*
	 * Extract mbuf chain from PDU.
	 */
	m->m_pkthdr.len += ip->ip_data_len + padding;
	m->m_next = ip->ip_data_mbuf;
	set_mbuf_ulp_submode(m, ulp_submode);
	ip->ip_bhs_mbuf = NULL;
	ip->ip_data_mbuf = NULL;
	ip->ip_bhs = NULL;

	/*
	 * Drop PDU reference on icp.  Additional references might
	 * still be held by zero-copy PDU buffers (ICL_NOCOPY).
	 */
	if (atomic_fetchadd_int(&icp->ref_cnt, -1) == 1)
		icl_cxgbei_pdu_call_cb(ip);

	return (m);
}

static void
icl_cxgbei_tx_main(void *arg)
{
	struct epoch_tracker et;
	struct icl_cxgbei_conn *icc = arg;
	struct icl_conn *ic = &icc->ic;
	struct toepcb *toep = icc->toep;
	struct socket *so = ic->ic_socket;
	struct inpcb *inp = sotoinpcb(so);
	struct icl_pdu *ip;
	struct mbuf *m;
	struct mbufq mq;
	STAILQ_HEAD(, icl_pdu) tx_pdus = STAILQ_HEAD_INITIALIZER(tx_pdus);

	mbufq_init(&mq, INT_MAX);

	ICL_CONN_LOCK(ic);
	while (__predict_true(!ic->ic_disconnecting)) {
		while (STAILQ_EMPTY(&icc->sent_pdus)) {
			icc->tx_active = false;
			mtx_sleep(&icc->tx_active, ic->ic_lock, 0, "-", 0);
			if (__predict_false(ic->ic_disconnecting))
				goto out;
			MPASS(icc->tx_active);
		}

		STAILQ_SWAP(&icc->sent_pdus, &tx_pdus, icl_pdu);
		ICL_CONN_UNLOCK(ic);

		while ((ip = STAILQ_FIRST(&tx_pdus)) != NULL) {
			STAILQ_REMOVE_HEAD(&tx_pdus, ip_next);

			m = finalize_pdu(icc, ip_to_icp(ip));
			M_ASSERTPKTHDR(m);
			MPASS((m->m_pkthdr.len & 3) == 0);

			mbufq_enqueue(&mq, m);
		}

		ICL_CONN_LOCK(ic);
		if (__predict_false(ic->ic_disconnecting) ||
		    __predict_false(ic->ic_socket == NULL)) {
			mbufq_drain(&mq);
			break;
		}

		CURVNET_SET(toep->vnet);
		NET_EPOCH_ENTER(et);
		INP_WLOCK(inp);

		ICL_CONN_UNLOCK(ic);
		if (__predict_false(inp->inp_flags & (INP_DROPPED |
		    INP_TIMEWAIT)) ||
		    __predict_false((toep->flags & TPF_ATTACHED) == 0)) {
			mbufq_drain(&mq);
		} else {
			mbufq_concat(&toep->ulp_pduq, &mq);
			t4_push_pdus(icc->sc, toep, 0);
		}
		INP_WUNLOCK(inp);
		NET_EPOCH_EXIT(et);
		CURVNET_RESTORE();

		ICL_CONN_LOCK(ic);
	}
out:
	ICL_CONN_UNLOCK(ic);

	kthread_exit();
}

static void
icl_cxgbei_rx_main(void *arg)
{
	struct icl_cxgbei_conn *icc = arg;
	struct icl_conn *ic = &icc->ic;
	struct icl_pdu *ip;
	struct sockbuf *sb;
	STAILQ_HEAD(, icl_pdu) rx_pdus = STAILQ_HEAD_INITIALIZER(rx_pdus);
	bool cantrcvmore;

	sb = &ic->ic_socket->so_rcv;
	SOCKBUF_LOCK(sb);
	while (__predict_true(!ic->ic_disconnecting)) {
		while (STAILQ_EMPTY(&icc->rcvd_pdus)) {
			icc->rx_active = false;
			mtx_sleep(&icc->rx_active, SOCKBUF_MTX(sb), 0, "-", 0);
			if (__predict_false(ic->ic_disconnecting))
				goto out;
			MPASS(icc->rx_active);
		}

		if (__predict_false(sbused(sb) != 0)) {
			/*
			 * PDUs were received before the tid
			 * transitioned to ULP mode.  Convert
			 * them to icl_cxgbei_pdus and insert
			 * them into the head of rcvd_pdus.
			 */
			parse_pdus(icc, sb);
		}
		cantrcvmore = (sb->sb_state & SBS_CANTRCVMORE) != 0;
		MPASS(STAILQ_EMPTY(&rx_pdus));
		STAILQ_SWAP(&icc->rcvd_pdus, &rx_pdus, icl_pdu);
		SOCKBUF_UNLOCK(sb);

		/* Hand over PDUs to ICL. */
		while ((ip = STAILQ_FIRST(&rx_pdus)) != NULL) {
			STAILQ_REMOVE_HEAD(&rx_pdus, ip_next);
			if (cantrcvmore)
				icl_cxgbei_pdu_done(ip, ENOTCONN);
			else
				ic->ic_receive(ip);
		}

		SOCKBUF_LOCK(sb);
	}
out:
	/*
	 * Since ic_disconnecting is set before the SOCKBUF_MTX is
	 * locked in icl_cxgbei_conn_close, the loop above can exit
	 * before icl_cxgbei_conn_close can lock SOCKBUF_MTX and block
	 * waiting for the thread exit.
	 */
	while (!icc->rx_exiting)
		mtx_sleep(&icc->rx_active, SOCKBUF_MTX(sb), 0, "-", 0);
	SOCKBUF_UNLOCK(sb);

	kthread_exit();
}

int
icl_cxgbei_conn_pdu_append_data(struct icl_conn *ic, struct icl_pdu *ip,
    const void *addr, size_t len, int flags)
{
	struct icl_cxgbei_pdu *icp = ip_to_icp(ip);
	struct mbuf *m, *m_tail;
	const char *src;

	MPASS(icp->icp_signature == CXGBEI_PDU_SIGNATURE);
	MPASS(ic == ip->ip_conn);
	KASSERT(len > 0, ("%s: len is %jd", __func__, (intmax_t)len));

	m_tail = ip->ip_data_mbuf;
	if (m_tail != NULL)
		for (; m_tail->m_next != NULL; m_tail = m_tail->m_next)
			;

	if (flags & ICL_NOCOPY) {
		m = m_get(flags & ~ICL_NOCOPY, MT_DATA);
		if (m == NULL) {
			ICL_WARN("failed to allocate mbuf");
			return (ENOMEM);
		}

		m->m_flags |= M_RDONLY;
		m_extaddref(m, __DECONST(char *, addr), len, &icp->ref_cnt,
		    icl_cxgbei_mbuf_done, icp, NULL);
		m->m_len = len;
		if (ip->ip_data_mbuf == NULL) {
			ip->ip_data_mbuf = m;
			ip->ip_data_len = len;
		} else {
			m_tail->m_next = m;
			m_tail = m_tail->m_next;
			ip->ip_data_len += len;
		}

		return (0);
	}

	src = (const char *)addr;

	/* Allocate as jumbo mbufs of size MJUM16BYTES. */
	while (len >= MJUM16BYTES) {
		m = m_getjcl(M_NOWAIT, MT_DATA, 0, MJUM16BYTES);
		if (__predict_false(m == NULL)) {
			if ((flags & M_WAITOK) != 0) {
				/* Fall back to non-jumbo mbufs. */
				break;
			}
			return (ENOMEM);
		}
		memcpy(mtod(m, void *), src, MJUM16BYTES);
		m->m_len = MJUM16BYTES;
		if (ip->ip_data_mbuf == NULL) {
			ip->ip_data_mbuf = m_tail = m;
			ip->ip_data_len = MJUM16BYTES;
		} else {
			m_tail->m_next = m;
			m_tail = m_tail->m_next;
			ip->ip_data_len += MJUM16BYTES;
		}
		src += MJUM16BYTES;
		len -= MJUM16BYTES;
	}

	/* Allocate mbuf chain for the remaining data. */
	if (len != 0) {
		m = m_getm2(NULL, len, flags, MT_DATA, 0);
		if (__predict_false(m == NULL))
			return (ENOMEM);
		if (ip->ip_data_mbuf == NULL) {
			ip->ip_data_mbuf = m;
			ip->ip_data_len = len;
		} else {
			m_tail->m_next = m;
			ip->ip_data_len += len;
		}
		for (; m != NULL; m = m->m_next) {
			m->m_len = min(len, M_SIZE(m));
			memcpy(mtod(m, void *), src, m->m_len);
			src += m->m_len;
			len -= m->m_len;
		}
		MPASS(len == 0);
	}
	MPASS(ip->ip_data_len <= max(ic->ic_max_send_data_segment_length,
	    ic->ic_hw_isomax));

	return (0);
}

void
icl_cxgbei_conn_pdu_get_data(struct icl_conn *ic, struct icl_pdu *ip,
    size_t off, void *addr, size_t len)
{
	struct icl_cxgbei_pdu *icp = ip_to_icp(ip);

	if (icp->icp_flags & ICPF_RX_DDP)
		return;		/* data is DDP'ed, no need to copy */
	m_copydata(ip->ip_data_mbuf, off, len, addr);
}

void
icl_cxgbei_conn_pdu_queue(struct icl_conn *ic, struct icl_pdu *ip)
{
	icl_cxgbei_conn_pdu_queue_cb(ic, ip, NULL);
}

void
icl_cxgbei_conn_pdu_queue_cb(struct icl_conn *ic, struct icl_pdu *ip,
    icl_pdu_cb cb)
{
	struct icl_cxgbei_conn *icc = ic_to_icc(ic);
	struct icl_cxgbei_pdu *icp = ip_to_icp(ip);
	struct socket *so = ic->ic_socket;

	MPASS(ic == ip->ip_conn);
	MPASS(ip->ip_bhs_mbuf != NULL);
	/* The kernel doesn't generate PDUs with AHS. */
	MPASS(ip->ip_ahs_mbuf == NULL && ip->ip_ahs_len == 0);

	ICL_CONN_LOCK_ASSERT(ic);

	icp->cb = cb;

	/* NOTE: sowriteable without so_snd lock is a mostly harmless race. */
	if (ic->ic_disconnecting || so == NULL || !sowriteable(so)) {
		icl_cxgbei_pdu_done(ip, ENOTCONN);
		return;
	}

	STAILQ_INSERT_TAIL(&icc->sent_pdus, ip, ip_next);
	if (!icc->tx_active) {
		icc->tx_active = true;
		wakeup(&icc->tx_active);
	}
}

static struct icl_conn *
icl_cxgbei_new_conn(const char *name, struct mtx *lock)
{
	struct icl_cxgbei_conn *icc;
	struct icl_conn *ic;

	refcount_acquire(&icl_cxgbei_ncons);

	icc = (struct icl_cxgbei_conn *)kobj_create(&icl_cxgbei_class, M_CXGBE,
	    M_WAITOK | M_ZERO);
	icc->icc_signature = CXGBEI_CONN_SIGNATURE;
	STAILQ_INIT(&icc->rcvd_pdus);
	STAILQ_INIT(&icc->sent_pdus);

	icc->cmp_table = hashinit(64, M_CXGBEI, &icc->cmp_hash_mask);
	mtx_init(&icc->cmp_lock, "cxgbei_cmp", NULL, MTX_DEF);

	ic = &icc->ic;
	ic->ic_lock = lock;

#ifdef DIAGNOSTIC
	refcount_init(&ic->ic_outstanding_pdus, 0);
#endif
	ic->ic_name = name;
	ic->ic_offload = "cxgbei";
	ic->ic_unmapped = false;

	CTR2(KTR_CXGBE, "%s: icc %p", __func__, icc);

	return (ic);
}

void
icl_cxgbei_conn_free(struct icl_conn *ic)
{
	struct icl_cxgbei_conn *icc = ic_to_icc(ic);

	MPASS(icc->icc_signature == CXGBEI_CONN_SIGNATURE);

	CTR2(KTR_CXGBE, "%s: icc %p", __func__, icc);

	mtx_destroy(&icc->cmp_lock);
	hashdestroy(icc->cmp_table, M_CXGBEI, icc->cmp_hash_mask);
	kobj_delete((struct kobj *)icc, M_CXGBE);
	refcount_release(&icl_cxgbei_ncons);
}

static int
icl_cxgbei_setsockopt(struct icl_conn *ic, struct socket *so, int sspace,
    int rspace)
{
	struct sockopt opt;
	int error, one = 1, ss, rs;

	ss = max(sendspace, sspace);
	rs = max(recvspace, rspace);

	error = soreserve(so, ss, rs);
	if (error != 0)
		return (error);
	SOCKBUF_LOCK(&so->so_snd);
	so->so_snd.sb_flags |= SB_AUTOSIZE;
	SOCKBUF_UNLOCK(&so->so_snd);
	SOCKBUF_LOCK(&so->so_rcv);
	so->so_rcv.sb_flags |= SB_AUTOSIZE;
	SOCKBUF_UNLOCK(&so->so_rcv);

	/*
	 * Disable Nagle.
	 */
	bzero(&opt, sizeof(opt));
	opt.sopt_dir = SOPT_SET;
	opt.sopt_level = IPPROTO_TCP;
	opt.sopt_name = TCP_NODELAY;
	opt.sopt_val = &one;
	opt.sopt_valsize = sizeof(one);
	error = sosetopt(so, &opt);
	if (error != 0)
		return (error);

	return (0);
}

/*
 * Request/response structure used to find out the adapter offloading a socket.
 */
struct find_ofld_adapter_rr {
	struct socket *so;
	struct adapter *sc;	/* result */
};

static void
find_offload_adapter(struct adapter *sc, void *arg)
{
	struct find_ofld_adapter_rr *fa = arg;
	struct socket *so = fa->so;
	struct tom_data *td = sc->tom_softc;
	struct tcpcb *tp;
	struct inpcb *inp;

	/* Non-TCP sockets were filtered out earlier. */
	MPASS(so->so_proto->pr_protocol == IPPROTO_TCP);

	if (fa->sc != NULL)
		return;	/* Found already. */

	if (td == NULL)
		return;	/* TOE not enabled on this adapter. */

	inp = sotoinpcb(so);
	INP_WLOCK(inp);
	if ((inp->inp_flags & (INP_DROPPED | INP_TIMEWAIT)) == 0) {
		tp = intotcpcb(inp);
		if (tp->t_flags & TF_TOE && tp->tod == &td->tod)
			fa->sc = sc;	/* Found. */
	}
	INP_WUNLOCK(inp);
}

static bool
is_memfree(struct adapter *sc)
{
	uint32_t em;

	em = t4_read_reg(sc, A_MA_TARGET_MEM_ENABLE);
	if ((em & F_EXT_MEM_ENABLE) != 0)
		return (false);
	if (is_t5(sc) && (em & F_EXT_MEM1_ENABLE) != 0)
		return (false);
	return (true);
}

/* XXXNP: move this to t4_tom. */
static void
send_iscsi_flowc_wr(struct adapter *sc, struct toepcb *toep, int maxlen)
{
	struct wrqe *wr;
	struct fw_flowc_wr *flowc;
	const u_int nparams = 1;
	u_int flowclen;
	struct ofld_tx_sdesc *txsd = &toep->txsd[toep->txsd_pidx];

	flowclen = sizeof(*flowc) + nparams * sizeof(struct fw_flowc_mnemval);

	wr = alloc_wrqe(roundup2(flowclen, 16), &toep->ofld_txq->wrq);
	if (wr == NULL) {
		/* XXX */
		panic("%s: allocation failure.", __func__);
	}
	flowc = wrtod(wr);
	memset(flowc, 0, wr->wr_len);

	flowc->op_to_nparams = htobe32(V_FW_WR_OP(FW_FLOWC_WR) |
	    V_FW_FLOWC_WR_NPARAMS(nparams));
	flowc->flowid_len16 = htonl(V_FW_WR_LEN16(howmany(flowclen, 16)) |
	    V_FW_WR_FLOWID(toep->tid));

	flowc->mnemval[0].mnemonic = FW_FLOWC_MNEM_TXDATAPLEN_MAX;
	flowc->mnemval[0].val = htobe32(maxlen);

	txsd->tx_credits = howmany(flowclen, 16);
	txsd->plen = 0;
	KASSERT(toep->tx_credits >= txsd->tx_credits && toep->txsd_avail > 0,
	    ("%s: not enough credits (%d)", __func__, toep->tx_credits));
	toep->tx_credits -= txsd->tx_credits;
	if (__predict_false(++toep->txsd_pidx == toep->txsd_total))
		toep->txsd_pidx = 0;
	toep->txsd_avail--;

	t4_wrq_tx(sc, wr);
}

static void
set_ulp_mode_iscsi(struct adapter *sc, struct toepcb *toep, u_int ulp_submode)
{
	uint64_t val;

	CTR3(KTR_CXGBE, "%s: tid %u, ULP_MODE_ISCSI, submode=%#x",
	    __func__, toep->tid, ulp_submode);

	val = V_TCB_ULP_TYPE(ULP_MODE_ISCSI) | V_TCB_ULP_RAW(ulp_submode);
	t4_set_tcb_field(sc, toep->ctrlq, toep, W_TCB_ULP_TYPE,
	    V_TCB_ULP_TYPE(M_TCB_ULP_TYPE) | V_TCB_ULP_RAW(M_TCB_ULP_RAW), val,
	    0, 0);

	val = V_TF_RX_FLOW_CONTROL_DISABLE(1ULL);
	t4_set_tcb_field(sc, toep->ctrlq, toep, W_TCB_T_FLAGS, val, val, 0, 0);
}

/*
 * XXXNP: Who is responsible for cleaning up the socket if this returns with an
 * error?  Review all error paths.
 *
 * XXXNP: What happens to the socket's fd reference if the operation is
 * successful, and how does that affect the socket's life cycle?
 */
int
icl_cxgbei_conn_handoff(struct icl_conn *ic, int fd)
{
	struct icl_cxgbei_conn *icc = ic_to_icc(ic);
	struct find_ofld_adapter_rr fa;
	struct file *fp;
	struct socket *so;
	struct inpcb *inp;
	struct tcpcb *tp;
	struct toepcb *toep;
	cap_rights_t rights;
	u_int max_rx_pdu_len, max_tx_pdu_len;
	int error, max_iso_pdus;

	MPASS(icc->icc_signature == CXGBEI_CONN_SIGNATURE);
	ICL_CONN_LOCK_ASSERT_NOT(ic);

	/*
	 * Steal the socket from userland.
	 */
	error = fget(curthread, fd,
	    cap_rights_init_one(&rights, CAP_SOCK_CLIENT), &fp);
	if (error != 0)
		return (error);
	if (fp->f_type != DTYPE_SOCKET) {
		fdrop(fp, curthread);
		return (EINVAL);
	}
	so = fp->f_data;
	if (so->so_type != SOCK_STREAM ||
	    so->so_proto->pr_protocol != IPPROTO_TCP) {
		fdrop(fp, curthread);
		return (EINVAL);
	}

	ICL_CONN_LOCK(ic);
	if (ic->ic_socket != NULL) {
		ICL_CONN_UNLOCK(ic);
		fdrop(fp, curthread);
		return (EBUSY);
	}
	ic->ic_disconnecting = false;
	ic->ic_socket = so;
	fp->f_ops = &badfileops;
	fp->f_data = NULL;
	fdrop(fp, curthread);
	ICL_CONN_UNLOCK(ic);

	/* Find the adapter offloading this socket. */
	fa.sc = NULL;
	fa.so = so;
	t4_iterate(find_offload_adapter, &fa);
	if (fa.sc == NULL) {
		error = EINVAL;
		goto out;
	}
	icc->sc = fa.sc;

	max_rx_pdu_len = ISCSI_BHS_SIZE + ic->ic_max_recv_data_segment_length;
	max_tx_pdu_len = ISCSI_BHS_SIZE + ic->ic_max_send_data_segment_length;
	if (ic->ic_header_crc32c) {
		max_rx_pdu_len += ISCSI_HEADER_DIGEST_SIZE;
		max_tx_pdu_len += ISCSI_HEADER_DIGEST_SIZE;
	}
	if (ic->ic_data_crc32c) {
		max_rx_pdu_len += ISCSI_DATA_DIGEST_SIZE;
		max_tx_pdu_len += ISCSI_DATA_DIGEST_SIZE;
	}

	inp = sotoinpcb(so);
	INP_WLOCK(inp);
	tp = intotcpcb(inp);
	if (inp->inp_flags & (INP_DROPPED | INP_TIMEWAIT)) {
		INP_WUNLOCK(inp);
		error = ENOTCONN;
		goto out;
	}

	/*
	 * The socket could not have been "unoffloaded" if we got this far.
	 */
	MPASS(tp->t_flags & TF_TOE);
	MPASS(tp->tod != NULL);
	MPASS(tp->t_toe != NULL);
	toep = tp->t_toe;
	MPASS(toep->vi->adapter == icc->sc);

	if (ulp_mode(toep) != ULP_MODE_NONE) {
		INP_WUNLOCK(inp);
		error = EINVAL;
		goto out;
	}

	icc->toep = toep;

	icc->ulp_submode = 0;
	if (ic->ic_header_crc32c)
		icc->ulp_submode |= ULP_CRC_HEADER;
	if (ic->ic_data_crc32c)
		icc->ulp_submode |= ULP_CRC_DATA;

	if (icc->sc->tt.iso && chip_id(icc->sc) >= CHELSIO_T5 &&
	    !is_memfree(icc->sc)) {
		max_iso_pdus = CXGBEI_MAX_ISO_PAYLOAD / max_tx_pdu_len;
		ic->ic_hw_isomax = max_iso_pdus *
		    ic->ic_max_send_data_segment_length;
	} else
		max_iso_pdus = 1;

	toep->params.ulp_mode = ULP_MODE_ISCSI;
	toep->ulpcb = icc;

	send_iscsi_flowc_wr(icc->sc, toep,
	    roundup(max_iso_pdus * max_tx_pdu_len, tp->t_maxseg));
	set_ulp_mode_iscsi(icc->sc, toep, icc->ulp_submode);
	INP_WUNLOCK(inp);

	error = kthread_add(icl_cxgbei_tx_main, icc, NULL, &icc->tx_thread, 0,
	    0, "%stx (cxgbei)", ic->ic_name);
	if (error != 0)
		goto out;

	error = kthread_add(icl_cxgbei_rx_main, icc, NULL, &icc->rx_thread, 0,
	    0, "%srx (cxgbei)", ic->ic_name);
	if (error != 0)
		goto out;

	error = icl_cxgbei_setsockopt(ic, so, max_tx_pdu_len, max_rx_pdu_len);
out:
	if (error != 0)
		icl_cxgbei_conn_close(ic);
	return (error);
}

void
icl_cxgbei_conn_close(struct icl_conn *ic)
{
	struct icl_cxgbei_conn *icc = ic_to_icc(ic);
	struct icl_pdu *ip;
	struct socket *so;
	struct sockbuf *sb;
	struct inpcb *inp;
	struct toepcb *toep = icc->toep;

	MPASS(icc->icc_signature == CXGBEI_CONN_SIGNATURE);
	ICL_CONN_LOCK_ASSERT_NOT(ic);

	ICL_CONN_LOCK(ic);
	so = ic->ic_socket;
	if (ic->ic_disconnecting || so == NULL) {
		CTR4(KTR_CXGBE, "%s: icc %p (disconnecting = %d), so %p",
		    __func__, icc, ic->ic_disconnecting, so);
		ICL_CONN_UNLOCK(ic);
		return;
	}
	ic->ic_disconnecting = true;

#ifdef DIAGNOSTIC
	KASSERT(ic->ic_outstanding_pdus == 0,
	    ("destroying session with %d outstanding PDUs",
	     ic->ic_outstanding_pdus));
#endif

	CTR3(KTR_CXGBE, "%s: tid %d, icc %p", __func__, toep ? toep->tid : -1,
	    icc);

	/*
	 * Wait for the transmit thread to stop processing
	 * this connection.
	 */
	if (icc->tx_thread != NULL) {
		wakeup(&icc->tx_active);
		mtx_sleep(icc->tx_thread, ic->ic_lock, 0, "conclo", 0);
	}

	/* Discard PDUs queued for TX. */
	while (!STAILQ_EMPTY(&icc->sent_pdus)) {
		ip = STAILQ_FIRST(&icc->sent_pdus);
		STAILQ_REMOVE_HEAD(&icc->sent_pdus, ip_next);
		icl_cxgbei_pdu_done(ip, ENOTCONN);
	}
	ICL_CONN_UNLOCK(ic);

	inp = sotoinpcb(so);
	sb = &so->so_rcv;

	/*
	 * Wait for the receive thread to stop processing this
	 * connection.
	 */
	SOCKBUF_LOCK(sb);
	if (icc->rx_thread != NULL) {
		icc->rx_exiting = true;
		wakeup(&icc->rx_active);
		mtx_sleep(icc->rx_thread, SOCKBUF_MTX(sb), 0, "conclo", 0);
	}

	/*
	 * Discard received PDUs not passed to the iSCSI layer.
	 */
	while (!STAILQ_EMPTY(&icc->rcvd_pdus)) {
		ip = STAILQ_FIRST(&icc->rcvd_pdus);
		STAILQ_REMOVE_HEAD(&icc->rcvd_pdus, ip_next);
		icl_cxgbei_pdu_done(ip, ENOTCONN);
	}
	SOCKBUF_UNLOCK(sb);

	INP_WLOCK(inp);
	if (toep != NULL) {	/* NULL if connection was never offloaded. */
		toep->ulpcb = NULL;

		/* Discard mbufs queued for TX. */
		mbufq_drain(&toep->ulp_pduq);

		/*
		 * Grab a reference to use when waiting for the final
		 * CPL to be received.  If toep->inp is NULL, then
		 * final_cpl_received() has already been called (e.g.
		 * due to the peer sending a RST).
		 */
		if (toep->inp != NULL) {
			toep = hold_toepcb(toep);
			toep->flags |= TPF_WAITING_FOR_FINAL;
		} else
			toep = NULL;
	}
	INP_WUNLOCK(inp);

	ICL_CONN_LOCK(ic);
	ic->ic_socket = NULL;
	ICL_CONN_UNLOCK(ic);

	/*
	 * XXXNP: we should send RST instead of FIN when PDUs held in various
	 * queues were purged instead of delivered reliably but soabort isn't
	 * really general purpose and wouldn't do the right thing here.
	 */
	soclose(so);

	/*
	 * Wait for the socket to fully close.  This ensures any
	 * pending received data has been received (and in particular,
	 * any data that would be received by DDP has been handled).
	 * Callers assume that it is safe to free buffers for tasks
	 * and transfers after this function returns.
	 */
	if (toep != NULL) {
		struct mtx *lock = mtx_pool_find(mtxpool_sleep, toep);

		mtx_lock(lock);
		while ((toep->flags & TPF_WAITING_FOR_FINAL) != 0)
			mtx_sleep(toep, lock, PSOCK, "conclo2", 0);
		mtx_unlock(lock);
		free_toepcb(toep);
	}
}

static void
cxgbei_insert_cmp(struct icl_cxgbei_conn *icc, struct cxgbei_cmp *cmp,
    uint32_t tt)
{
#ifdef INVARIANTS
	struct cxgbei_cmp *cmp2;
#endif

	cmp->tt = tt;

	mtx_lock(&icc->cmp_lock);
#ifdef INVARIANTS
	LIST_FOREACH(cmp2, &icc->cmp_table[TT_HASH(icc, tt)], link) {
		KASSERT(cmp2->tt != tt, ("%s: duplicate cmp", __func__));
	}
#endif
	LIST_INSERT_HEAD(&icc->cmp_table[TT_HASH(icc, tt)], cmp, link);
	mtx_unlock(&icc->cmp_lock);
}

struct cxgbei_cmp *
cxgbei_find_cmp(struct icl_cxgbei_conn *icc, uint32_t tt)
{
	struct cxgbei_cmp *cmp;

	mtx_lock(&icc->cmp_lock);
	LIST_FOREACH(cmp, &icc->cmp_table[TT_HASH(icc, tt)], link) {
		if (cmp->tt == tt)
			break;
	}
	mtx_unlock(&icc->cmp_lock);
	return (cmp);
}

static void
cxgbei_rm_cmp(struct icl_cxgbei_conn *icc, struct cxgbei_cmp *cmp)
{
#ifdef INVARIANTS
	struct cxgbei_cmp *cmp2;
#endif

	mtx_lock(&icc->cmp_lock);

#ifdef INVARIANTS
	LIST_FOREACH(cmp2, &icc->cmp_table[TT_HASH(icc, cmp->tt)], link) {
		if (cmp2 == cmp)
			goto found;
	}
	panic("%s: could not find cmp", __func__);
found:
#endif
	LIST_REMOVE(cmp, link);
	mtx_unlock(&icc->cmp_lock);
}

int
icl_cxgbei_conn_task_setup(struct icl_conn *ic, struct icl_pdu *ip,
    struct ccb_scsiio *csio, uint32_t *ittp, void **arg)
{
	struct icl_cxgbei_conn *icc = ic_to_icc(ic);
	struct toepcb *toep = icc->toep;
	struct adapter *sc = icc->sc;
	struct cxgbei_data *ci = sc->iscsi_ulp_softc;
	struct ppod_region *pr = &ci->pr;
	struct cxgbei_ddp_state *ddp;
	struct ppod_reservation *prsv;
	struct inpcb *inp;
	struct mbufq mq;
	uint32_t itt;
	int rc = 0;

	ICL_CONN_LOCK_ASSERT(ic);

	/* This is for the offload driver's state.  Must not be set already. */
	MPASS(arg != NULL);
	MPASS(*arg == NULL);

	if ((csio->ccb_h.flags & CAM_DIR_MASK) != CAM_DIR_IN ||
	    csio->dxfer_len < ci->ddp_threshold || ic->ic_disconnecting ||
	    ic->ic_socket == NULL) {
no_ddp:
		/*
		 * No DDP for this I/O.  Allocate an ITT (based on the one
		 * passed in) that cannot be a valid hardware DDP tag in the
		 * iSCSI region.
		 */
		itt = *ittp & M_PPOD_TAG;
		itt = V_PPOD_TAG(itt) | pr->pr_invalid_bit;
		*ittp = htobe32(itt);
		MPASS(*arg == NULL);	/* State is maintained for DDP only. */
		if (rc != 0)
			counter_u64_add(
			    toep->ofld_rxq->rx_iscsi_ddp_setup_error, 1);
		return (0);
	}

	/*
	 * Reserve resources for DDP, update the itt that should be used in the
	 * PDU, and save DDP specific state for this I/O in *arg.
	 */
	ddp = malloc(sizeof(*ddp), M_CXGBEI, M_NOWAIT | M_ZERO);
	if (ddp == NULL) {
		rc = ENOMEM;
		goto no_ddp;
	}
	prsv = &ddp->prsv;

	/* XXX add support for all CAM_DATA_ types */
	MPASS((csio->ccb_h.flags & CAM_DATA_MASK) == CAM_DATA_VADDR);
	rc = t4_alloc_page_pods_for_buf(pr, (vm_offset_t)csio->data_ptr,
	    csio->dxfer_len, prsv);
	if (rc != 0) {
		free(ddp, M_CXGBEI);
		goto no_ddp;
	}

	mbufq_init(&mq, INT_MAX);
	rc = t4_write_page_pods_for_buf(sc, toep, prsv,
	    (vm_offset_t)csio->data_ptr, csio->dxfer_len, &mq);
	if (__predict_false(rc != 0)) {
		mbufq_drain(&mq);
		t4_free_page_pods(prsv);
		free(ddp, M_CXGBEI);
		goto no_ddp;
	}

	/*
	 * Do not get inp from toep->inp as the toepcb might have
	 * detached already.
	 */
	inp = sotoinpcb(ic->ic_socket);
	INP_WLOCK(inp);
	if ((inp->inp_flags & (INP_DROPPED | INP_TIMEWAIT)) != 0) {
		INP_WUNLOCK(inp);
		mbufq_drain(&mq);
		t4_free_page_pods(prsv);
		free(ddp, M_CXGBEI);
		goto no_ddp;
	}
	mbufq_concat(&toep->ulp_pduq, &mq);
	INP_WUNLOCK(inp);

	ddp->cmp.last_datasn = -1;
	cxgbei_insert_cmp(icc, &ddp->cmp, prsv->prsv_tag);
	*ittp = htobe32(prsv->prsv_tag);
	*arg = prsv;
	counter_u64_add(toep->ofld_rxq->rx_iscsi_ddp_setup_ok, 1);
	return (0);
}

void
icl_cxgbei_conn_task_done(struct icl_conn *ic, void *arg)
{

	if (arg != NULL) {
		struct cxgbei_ddp_state *ddp = arg;

		cxgbei_rm_cmp(ic_to_icc(ic), &ddp->cmp);
		t4_free_page_pods(&ddp->prsv);
		free(ddp, M_CXGBEI);
	}
}

static inline bool
ddp_sgl_check(struct ctl_sg_entry *sg, int entries, int xferlen)
{
#ifdef INVARIANTS
	int total_len = 0;
#endif

	MPASS(entries > 0);
	if (((vm_offset_t)sg[--entries].addr & 3U) != 0)
		return (false);

#ifdef INVARIANTS
	total_len += sg[entries].len;
#endif

	while (--entries >= 0) {
		if (((vm_offset_t)sg[entries].addr & PAGE_MASK) != 0 ||
		    (sg[entries].len % PAGE_SIZE) != 0)
			return (false);
#ifdef INVARIANTS
		total_len += sg[entries].len;
#endif
	}

	MPASS(total_len == xferlen);
	return (true);
}

#define	io_to_ddp_state(io)	((io)->io_hdr.ctl_private[CTL_PRIV_FRONTEND2].ptr)

int
icl_cxgbei_conn_transfer_setup(struct icl_conn *ic, struct icl_pdu *ip,
    union ctl_io *io, uint32_t *tttp, void **arg)
{
	struct icl_cxgbei_conn *icc = ic_to_icc(ic);
	struct toepcb *toep = icc->toep;
	struct ctl_scsiio *ctsio = &io->scsiio;
	struct adapter *sc = icc->sc;
	struct cxgbei_data *ci = sc->iscsi_ulp_softc;
	struct ppod_region *pr = &ci->pr;
	struct cxgbei_ddp_state *ddp;
	struct ppod_reservation *prsv;
	struct ctl_sg_entry *sgl, sg_entry;
	struct inpcb *inp;
	struct mbufq mq;
	int sg_entries = ctsio->kern_sg_entries;
	uint32_t ttt;
	int xferlen, rc = 0, alias;

	/* This is for the offload driver's state.  Must not be set already. */
	MPASS(arg != NULL);
	MPASS(*arg == NULL);

	if (ctsio->ext_data_filled == 0) {
		int first_burst;
#ifdef INVARIANTS
		struct icl_cxgbei_pdu *icp = ip_to_icp(ip);

		MPASS(icp->icp_signature == CXGBEI_PDU_SIGNATURE);
		MPASS(ic == ip->ip_conn);
		MPASS(ip->ip_bhs_mbuf != NULL);
#endif
		first_burst = icl_pdu_data_segment_length(ip);

		/*
		 * Note that ICL calls conn_transfer_setup even if the first
		 * burst had everything and there's nothing left to transfer.
		 *
		 * NB: The CTL frontend might have provided a buffer
		 * whose length (kern_data_len) is smaller than the
		 * FirstBurstLength of unsolicited data.  Treat those
		 * as an empty transfer.
		 */
		xferlen = ctsio->kern_data_len;
		if (xferlen < first_burst ||
		    xferlen - first_burst < ci->ddp_threshold) {
no_ddp:
			/*
			 * No DDP for this transfer.  Allocate a TTT (based on
			 * the one passed in) that cannot be a valid hardware
			 * DDP tag in the iSCSI region.
			 */
			ttt = *tttp & M_PPOD_TAG;
			ttt = V_PPOD_TAG(ttt) | pr->pr_invalid_bit;
			*tttp = htobe32(ttt);
			MPASS(io_to_ddp_state(io) == NULL);
			if (rc != 0)
				counter_u64_add(
				    toep->ofld_rxq->rx_iscsi_ddp_setup_error, 1);
			return (0);
		}

		if (sg_entries == 0) {
			sgl = &sg_entry;
			sgl->len = xferlen;
			sgl->addr = (void *)ctsio->kern_data_ptr;
			sg_entries = 1;
		} else
			sgl = (void *)ctsio->kern_data_ptr;

		if (!ddp_sgl_check(sgl, sg_entries, xferlen))
			goto no_ddp;

		/*
		 * Reserve resources for DDP, update the ttt that should be used
		 * in the PDU, and save DDP specific state for this I/O.
		 */
		MPASS(io_to_ddp_state(io) == NULL);
		ddp = malloc(sizeof(*ddp), M_CXGBEI, M_NOWAIT | M_ZERO);
		if (ddp == NULL) {
			rc = ENOMEM;
			goto no_ddp;
		}
		prsv = &ddp->prsv;

		rc = t4_alloc_page_pods_for_sgl(pr, sgl, sg_entries, prsv);
		if (rc != 0) {
			free(ddp, M_CXGBEI);
			goto no_ddp;
		}

		mbufq_init(&mq, INT_MAX);
		rc = t4_write_page_pods_for_sgl(sc, toep, prsv, sgl, sg_entries,
		    xferlen, &mq);
		if (__predict_false(rc != 0)) {
			mbufq_drain(&mq);
			t4_free_page_pods(prsv);
			free(ddp, M_CXGBEI);
			goto no_ddp;
		}

		/*
		 * Do not get inp from toep->inp as the toepcb might
		 * have detached already.
		 */
		ICL_CONN_LOCK(ic);
		if (ic->ic_disconnecting || ic->ic_socket == NULL) {
			ICL_CONN_UNLOCK(ic);
			mbufq_drain(&mq);
			t4_free_page_pods(prsv);
			free(ddp, M_CXGBEI);
			return (ECONNRESET);
		}
		inp = sotoinpcb(ic->ic_socket);
		INP_WLOCK(inp);
		ICL_CONN_UNLOCK(ic);
		if ((inp->inp_flags & (INP_DROPPED | INP_TIMEWAIT)) != 0) {
			INP_WUNLOCK(inp);
			mbufq_drain(&mq);
			t4_free_page_pods(prsv);
			free(ddp, M_CXGBEI);
			return (ECONNRESET);
		}
		mbufq_concat(&toep->ulp_pduq, &mq);
		INP_WUNLOCK(inp);

		ddp->cmp.next_buffer_offset = ctsio->kern_rel_offset +
		    first_burst;
		ddp->cmp.last_datasn = -1;
		cxgbei_insert_cmp(icc, &ddp->cmp, prsv->prsv_tag);
		*tttp = htobe32(prsv->prsv_tag);
		io_to_ddp_state(io) = ddp;
		*arg = ctsio;
		counter_u64_add(toep->ofld_rxq->rx_iscsi_ddp_setup_ok, 1);
		return (0);
	}

	/*
	 * In the middle of an I/O.  A non-NULL page pod reservation indicates
	 * that a DDP buffer is being used for the I/O.
	 */
	ddp = io_to_ddp_state(ctsio);
	if (ddp == NULL)
		goto no_ddp;
	prsv = &ddp->prsv;

	alias = (prsv->prsv_tag & pr->pr_alias_mask) >> pr->pr_alias_shift;
	alias++;
	prsv->prsv_tag &= ~pr->pr_alias_mask;
	prsv->prsv_tag |= alias << pr->pr_alias_shift & pr->pr_alias_mask;

	ddp->cmp.last_datasn = -1;
	cxgbei_insert_cmp(icc, &ddp->cmp, prsv->prsv_tag);
	*tttp = htobe32(prsv->prsv_tag);
	*arg = ctsio;

	return (0);
}

void
icl_cxgbei_conn_transfer_done(struct icl_conn *ic, void *arg)
{
	struct ctl_scsiio *ctsio = arg;

	if (ctsio != NULL) {
		struct cxgbei_ddp_state *ddp;

		ddp = io_to_ddp_state(ctsio);
		MPASS(ddp != NULL);

		cxgbei_rm_cmp(ic_to_icc(ic), &ddp->cmp);
		if (ctsio->kern_data_len == ctsio->ext_data_filled ||
		    ic->ic_disconnecting) {
			t4_free_page_pods(&ddp->prsv);
			free(ddp, M_CXGBEI);
			io_to_ddp_state(ctsio) = NULL;
		}
	}
}

static void
cxgbei_limits(struct adapter *sc, void *arg)
{
	struct icl_drv_limits *idl = arg;
	struct cxgbei_data *ci;
	int max_dsl;

	if (begin_synchronized_op(sc, NULL, HOLD_LOCK, "t4lims") != 0)
		return;

	if (uld_active(sc, ULD_ISCSI)) {
		ci = sc->iscsi_ulp_softc;
		MPASS(ci != NULL);

		max_dsl = ci->max_rx_data_len;
		if (idl->idl_max_recv_data_segment_length > max_dsl)
			idl->idl_max_recv_data_segment_length = max_dsl;

		max_dsl = ci->max_tx_data_len;
		if (idl->idl_max_send_data_segment_length > max_dsl)
			idl->idl_max_send_data_segment_length = max_dsl;
	}

	end_synchronized_op(sc, LOCK_HELD);
}

static int
icl_cxgbei_limits(struct icl_drv_limits *idl)
{

	/* Maximum allowed by the RFC.  cxgbei_limits will clip them. */
	idl->idl_max_recv_data_segment_length = (1 << 24) - 1;
	idl->idl_max_send_data_segment_length = (1 << 24) - 1;

	/* These are somewhat arbitrary. */
	idl->idl_max_burst_length = max_burst_length;
	idl->idl_first_burst_length = first_burst_length;

	t4_iterate(cxgbei_limits, idl);

	return (0);
}

int
icl_cxgbei_mod_load(void)
{
	int rc;

	refcount_init(&icl_cxgbei_ncons, 0);

	rc = icl_register("cxgbei", false, -100, icl_cxgbei_limits,
	    icl_cxgbei_new_conn);

	return (rc);
}

int
icl_cxgbei_mod_unload(void)
{

	if (icl_cxgbei_ncons != 0)
		return (EBUSY);

	icl_unregister("cxgbei", false);

	return (0);
}
#endif