1 /*- 2 * Copyright (c) 2012 The FreeBSD Foundation 3 * Copyright (c) 2015 Chelsio Communications, Inc. 4 * All rights reserved. 5 * 6 * This software was developed by Edward Tomasz Napierala under sponsorship 7 * from the FreeBSD Foundation. 8 * 9 * Redistribution and use in source and binary forms, with or without 10 * modification, are permitted provided that the following conditions 11 * are met: 12 * 1. Redistributions of source code must retain the above copyright 13 * notice, this list of conditions and the following disclaimer. 14 * 2. Redistributions in binary form must reproduce the above copyright 15 * notice, this list of conditions and the following disclaimer in the 16 * documentation and/or other materials provided with the distribution. 17 * 18 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 19 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 20 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 21 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 22 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 23 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 24 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 25 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 26 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 27 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 28 * SUCH DAMAGE. 29 * 30 */ 31 32 /* 33 * cxgbei implementation of iSCSI Common Layer kobj(9) interface. 34 */ 35 36 #include <sys/cdefs.h> 37 __FBSDID("$FreeBSD$"); 38 39 #include "opt_inet.h" 40 #include "opt_inet6.h" 41 42 #ifdef TCP_OFFLOAD 43 #include <sys/param.h> 44 #include <sys/capsicum.h> 45 #include <sys/condvar.h> 46 #include <sys/conf.h> 47 #include <sys/file.h> 48 #include <sys/kernel.h> 49 #include <sys/kthread.h> 50 #include <sys/ktr.h> 51 #include <sys/lock.h> 52 #include <sys/mbuf.h> 53 #include <sys/mutex.h> 54 #include <sys/module.h> 55 #include <sys/protosw.h> 56 #include <sys/socket.h> 57 #include <sys/socketvar.h> 58 #include <sys/sysctl.h> 59 #include <sys/systm.h> 60 #include <sys/sx.h> 61 #include <sys/uio.h> 62 #include <machine/bus.h> 63 #include <vm/uma.h> 64 #include <vm/vm.h> 65 #include <vm/pmap.h> 66 #include <netinet/in.h> 67 #include <netinet/in_pcb.h> 68 #include <netinet/tcp.h> 69 #include <netinet/tcp_var.h> 70 #include <netinet/toecore.h> 71 72 #include <dev/iscsi/icl.h> 73 #include <dev/iscsi/iscsi_proto.h> 74 #include <icl_conn_if.h> 75 76 #include <cam/scsi/scsi_all.h> 77 #include <cam/scsi/scsi_da.h> 78 #include <cam/ctl/ctl_io.h> 79 #include <cam/ctl/ctl.h> 80 #include <cam/ctl/ctl_backend.h> 81 #include <cam/ctl/ctl_error.h> 82 #include <cam/ctl/ctl_frontend.h> 83 #include <cam/ctl/ctl_debug.h> 84 #include <cam/ctl/ctl_ha.h> 85 #include <cam/ctl/ctl_ioctl.h> 86 87 #include <cam/cam.h> 88 #include <cam/cam_ccb.h> 89 #include <cam/cam_xpt.h> 90 #include <cam/cam_debug.h> 91 #include <cam/cam_sim.h> 92 #include <cam/cam_xpt_sim.h> 93 #include <cam/cam_xpt_periph.h> 94 #include <cam/cam_periph.h> 95 #include <cam/cam_compat.h> 96 #include <cam/scsi/scsi_message.h> 97 98 #include "common/common.h" 99 #include "common/t4_tcb.h" 100 #include "tom/t4_tom.h" 101 #include "cxgbei.h" 102 103 SYSCTL_NODE(_kern_icl, OID_AUTO, cxgbei, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, 104 "Chelsio iSCSI offload"); 105 static int coalesce = 1; 106 SYSCTL_INT(_kern_icl_cxgbei, OID_AUTO, coalesce, CTLFLAG_RWTUN, 107 &coalesce, 0, "Try to coalesce PDUs before sending"); 108 static int partial_receive_len = 128 * 1024; 109 SYSCTL_INT(_kern_icl_cxgbei, OID_AUTO, partial_receive_len, CTLFLAG_RWTUN, 110 &partial_receive_len, 0, "Minimum read size for partially received " 111 "data segment"); 112 static int sendspace = 1048576; 113 SYSCTL_INT(_kern_icl_cxgbei, OID_AUTO, sendspace, CTLFLAG_RWTUN, 114 &sendspace, 0, "Default send socket buffer size"); 115 static int recvspace = 1048576; 116 SYSCTL_INT(_kern_icl_cxgbei, OID_AUTO, recvspace, CTLFLAG_RWTUN, 117 &recvspace, 0, "Default receive socket buffer size"); 118 119 static uma_zone_t prsv_zone; 120 static volatile u_int icl_cxgbei_ncons; 121 122 #define ICL_CONN_LOCK(X) mtx_lock(X->ic_lock) 123 #define ICL_CONN_UNLOCK(X) mtx_unlock(X->ic_lock) 124 #define ICL_CONN_LOCK_ASSERT(X) mtx_assert(X->ic_lock, MA_OWNED) 125 #define ICL_CONN_LOCK_ASSERT_NOT(X) mtx_assert(X->ic_lock, MA_NOTOWNED) 126 127 struct icl_pdu *icl_cxgbei_new_pdu(int); 128 void icl_cxgbei_new_pdu_set_conn(struct icl_pdu *, struct icl_conn *); 129 130 static icl_conn_new_pdu_t icl_cxgbei_conn_new_pdu; 131 icl_conn_pdu_free_t icl_cxgbei_conn_pdu_free; 132 static icl_conn_pdu_data_segment_length_t 133 icl_cxgbei_conn_pdu_data_segment_length; 134 static icl_conn_pdu_append_data_t icl_cxgbei_conn_pdu_append_data; 135 static icl_conn_pdu_get_data_t icl_cxgbei_conn_pdu_get_data; 136 static icl_conn_pdu_queue_t icl_cxgbei_conn_pdu_queue; 137 static icl_conn_handoff_t icl_cxgbei_conn_handoff; 138 static icl_conn_free_t icl_cxgbei_conn_free; 139 static icl_conn_close_t icl_cxgbei_conn_close; 140 static icl_conn_task_setup_t icl_cxgbei_conn_task_setup; 141 static icl_conn_task_done_t icl_cxgbei_conn_task_done; 142 static icl_conn_transfer_setup_t icl_cxgbei_conn_transfer_setup; 143 static icl_conn_transfer_done_t icl_cxgbei_conn_transfer_done; 144 145 static kobj_method_t icl_cxgbei_methods[] = { 146 KOBJMETHOD(icl_conn_new_pdu, icl_cxgbei_conn_new_pdu), 147 KOBJMETHOD(icl_conn_pdu_free, icl_cxgbei_conn_pdu_free), 148 KOBJMETHOD(icl_conn_pdu_data_segment_length, 149 icl_cxgbei_conn_pdu_data_segment_length), 150 KOBJMETHOD(icl_conn_pdu_append_data, icl_cxgbei_conn_pdu_append_data), 151 KOBJMETHOD(icl_conn_pdu_get_data, icl_cxgbei_conn_pdu_get_data), 152 KOBJMETHOD(icl_conn_pdu_queue, icl_cxgbei_conn_pdu_queue), 153 KOBJMETHOD(icl_conn_handoff, icl_cxgbei_conn_handoff), 154 KOBJMETHOD(icl_conn_free, icl_cxgbei_conn_free), 155 KOBJMETHOD(icl_conn_close, icl_cxgbei_conn_close), 156 KOBJMETHOD(icl_conn_task_setup, icl_cxgbei_conn_task_setup), 157 KOBJMETHOD(icl_conn_task_done, icl_cxgbei_conn_task_done), 158 KOBJMETHOD(icl_conn_transfer_setup, icl_cxgbei_conn_transfer_setup), 159 KOBJMETHOD(icl_conn_transfer_done, icl_cxgbei_conn_transfer_done), 160 { 0, 0 } 161 }; 162 163 DEFINE_CLASS(icl_cxgbei, icl_cxgbei_methods, sizeof(struct icl_cxgbei_conn)); 164 165 void 166 icl_cxgbei_conn_pdu_free(struct icl_conn *ic, struct icl_pdu *ip) 167 { 168 #ifdef INVARIANTS 169 struct icl_cxgbei_pdu *icp = ip_to_icp(ip); 170 #endif 171 172 MPASS(icp->icp_signature == CXGBEI_PDU_SIGNATURE); 173 MPASS(ic == ip->ip_conn); 174 MPASS(ip->ip_bhs_mbuf != NULL); 175 176 m_freem(ip->ip_ahs_mbuf); 177 m_freem(ip->ip_data_mbuf); 178 m_freem(ip->ip_bhs_mbuf); /* storage for icl_cxgbei_pdu itself */ 179 180 #ifdef DIAGNOSTIC 181 if (__predict_true(ic != NULL)) 182 refcount_release(&ic->ic_outstanding_pdus); 183 #endif 184 } 185 186 struct icl_pdu * 187 icl_cxgbei_new_pdu(int flags) 188 { 189 struct icl_cxgbei_pdu *icp; 190 struct icl_pdu *ip; 191 struct mbuf *m; 192 uintptr_t a; 193 194 m = m_gethdr(flags, MT_DATA); 195 if (__predict_false(m == NULL)) 196 return (NULL); 197 198 a = roundup2(mtod(m, uintptr_t), _Alignof(struct icl_cxgbei_pdu)); 199 icp = (struct icl_cxgbei_pdu *)a; 200 bzero(icp, sizeof(*icp)); 201 202 icp->icp_signature = CXGBEI_PDU_SIGNATURE; 203 ip = &icp->ip; 204 ip->ip_bhs_mbuf = m; 205 206 a = roundup2((uintptr_t)(icp + 1), _Alignof(struct iscsi_bhs *)); 207 ip->ip_bhs = (struct iscsi_bhs *)a; 208 #ifdef INVARIANTS 209 /* Everything must fit entirely in the mbuf. */ 210 a = (uintptr_t)(ip->ip_bhs + 1); 211 MPASS(a <= (uintptr_t)m + MSIZE); 212 #endif 213 bzero(ip->ip_bhs, sizeof(*ip->ip_bhs)); 214 215 m->m_data = (void *)ip->ip_bhs; 216 m->m_len = sizeof(struct iscsi_bhs); 217 m->m_pkthdr.len = m->m_len; 218 219 return (ip); 220 } 221 222 void 223 icl_cxgbei_new_pdu_set_conn(struct icl_pdu *ip, struct icl_conn *ic) 224 { 225 226 ip->ip_conn = ic; 227 #ifdef DIAGNOSTIC 228 refcount_acquire(&ic->ic_outstanding_pdus); 229 #endif 230 } 231 232 /* 233 * Allocate icl_pdu with empty BHS to fill up by the caller. 234 */ 235 static struct icl_pdu * 236 icl_cxgbei_conn_new_pdu(struct icl_conn *ic, int flags) 237 { 238 struct icl_pdu *ip; 239 240 ip = icl_cxgbei_new_pdu(flags); 241 if (__predict_false(ip == NULL)) 242 return (NULL); 243 icl_cxgbei_new_pdu_set_conn(ip, ic); 244 245 return (ip); 246 } 247 248 static size_t 249 icl_pdu_data_segment_length(const struct icl_pdu *request) 250 { 251 uint32_t len = 0; 252 253 len += request->ip_bhs->bhs_data_segment_len[0]; 254 len <<= 8; 255 len += request->ip_bhs->bhs_data_segment_len[1]; 256 len <<= 8; 257 len += request->ip_bhs->bhs_data_segment_len[2]; 258 259 return (len); 260 } 261 262 size_t 263 icl_cxgbei_conn_pdu_data_segment_length(struct icl_conn *ic, 264 const struct icl_pdu *request) 265 { 266 267 return (icl_pdu_data_segment_length(request)); 268 } 269 270 static struct mbuf * 271 finalize_pdu(struct icl_cxgbei_conn *icc, struct icl_cxgbei_pdu *icp) 272 { 273 struct icl_pdu *ip = &icp->ip; 274 uint8_t ulp_submode, padding; 275 struct mbuf *m, *last; 276 struct iscsi_bhs *bhs; 277 278 /* 279 * Fix up the data segment mbuf first. 280 */ 281 m = ip->ip_data_mbuf; 282 ulp_submode = icc->ulp_submode; 283 if (m) { 284 last = m_last(m); 285 286 /* 287 * Round up the data segment to a 4B boundary. Pad with 0 if 288 * necessary. There will definitely be room in the mbuf. 289 */ 290 padding = roundup2(ip->ip_data_len, 4) - ip->ip_data_len; 291 if (padding) { 292 bzero(mtod(last, uint8_t *) + last->m_len, padding); 293 last->m_len += padding; 294 } 295 } else { 296 MPASS(ip->ip_data_len == 0); 297 ulp_submode &= ~ULP_CRC_DATA; 298 padding = 0; 299 } 300 301 /* 302 * Now the header mbuf that has the BHS. 303 */ 304 m = ip->ip_bhs_mbuf; 305 MPASS(m->m_pkthdr.len == sizeof(struct iscsi_bhs)); 306 MPASS(m->m_len == sizeof(struct iscsi_bhs)); 307 308 bhs = ip->ip_bhs; 309 bhs->bhs_data_segment_len[2] = ip->ip_data_len; 310 bhs->bhs_data_segment_len[1] = ip->ip_data_len >> 8; 311 bhs->bhs_data_segment_len[0] = ip->ip_data_len >> 16; 312 313 /* "Convert" PDU to mbuf chain. Do not use icp/ip after this. */ 314 m->m_pkthdr.len = sizeof(struct iscsi_bhs) + ip->ip_data_len + padding; 315 m->m_next = ip->ip_data_mbuf; 316 set_mbuf_ulp_submode(m, ulp_submode); 317 #ifdef INVARIANTS 318 bzero(icp, sizeof(*icp)); 319 #endif 320 #ifdef DIAGNOSTIC 321 refcount_release(&icc->ic.ic_outstanding_pdus); 322 #endif 323 324 return (m); 325 } 326 327 int 328 icl_cxgbei_conn_pdu_append_data(struct icl_conn *ic, struct icl_pdu *ip, 329 const void *addr, size_t len, int flags) 330 { 331 struct mbuf *m; 332 #ifdef INVARIANTS 333 struct icl_cxgbei_pdu *icp = ip_to_icp(ip); 334 #endif 335 336 MPASS(icp->icp_signature == CXGBEI_PDU_SIGNATURE); 337 MPASS(ic == ip->ip_conn); 338 KASSERT(len > 0, ("%s: len is %jd", __func__, (intmax_t)len)); 339 340 m = ip->ip_data_mbuf; 341 if (m == NULL) { 342 m = m_getjcl(M_NOWAIT, MT_DATA, 0, MJUM16BYTES); 343 if (__predict_false(m == NULL)) 344 return (ENOMEM); 345 346 ip->ip_data_mbuf = m; 347 } 348 349 if (__predict_true(m_append(m, len, addr) != 0)) { 350 ip->ip_data_len += len; 351 MPASS(ip->ip_data_len <= ic->ic_max_data_segment_length); 352 return (0); 353 } else { 354 if (flags & M_WAITOK) { 355 CXGBE_UNIMPLEMENTED("fail safe append"); 356 } 357 ip->ip_data_len = m_length(m, NULL); 358 return (1); 359 } 360 } 361 362 void 363 icl_cxgbei_conn_pdu_get_data(struct icl_conn *ic, struct icl_pdu *ip, 364 size_t off, void *addr, size_t len) 365 { 366 struct icl_cxgbei_pdu *icp = ip_to_icp(ip); 367 368 if (icp->icp_flags & ICPF_RX_DDP) 369 return; /* data is DDP'ed, no need to copy */ 370 m_copydata(ip->ip_data_mbuf, off, len, addr); 371 } 372 373 void 374 icl_cxgbei_conn_pdu_queue(struct icl_conn *ic, struct icl_pdu *ip) 375 { 376 struct icl_cxgbei_conn *icc = ic_to_icc(ic); 377 struct icl_cxgbei_pdu *icp = ip_to_icp(ip); 378 struct socket *so = ic->ic_socket; 379 struct toepcb *toep = icc->toep; 380 struct inpcb *inp; 381 struct mbuf *m; 382 383 MPASS(ic == ip->ip_conn); 384 MPASS(ip->ip_bhs_mbuf != NULL); 385 /* The kernel doesn't generate PDUs with AHS. */ 386 MPASS(ip->ip_ahs_mbuf == NULL && ip->ip_ahs_len == 0); 387 388 ICL_CONN_LOCK_ASSERT(ic); 389 /* NOTE: sowriteable without so_snd lock is a mostly harmless race. */ 390 if (ic->ic_disconnecting || so == NULL || !sowriteable(so)) { 391 icl_cxgbei_conn_pdu_free(ic, ip); 392 return; 393 } 394 395 m = finalize_pdu(icc, icp); 396 M_ASSERTPKTHDR(m); 397 MPASS((m->m_pkthdr.len & 3) == 0); 398 399 /* 400 * Do not get inp from toep->inp as the toepcb might have detached 401 * already. 402 */ 403 inp = sotoinpcb(so); 404 INP_WLOCK(inp); 405 if (__predict_false(inp->inp_flags & (INP_DROPPED | INP_TIMEWAIT)) || 406 __predict_false((toep->flags & TPF_ATTACHED) == 0)) 407 m_freem(m); 408 else { 409 mbufq_enqueue(&toep->ulp_pduq, m); 410 t4_push_pdus(icc->sc, toep, 0); 411 } 412 INP_WUNLOCK(inp); 413 } 414 415 static struct icl_conn * 416 icl_cxgbei_new_conn(const char *name, struct mtx *lock) 417 { 418 struct icl_cxgbei_conn *icc; 419 struct icl_conn *ic; 420 421 refcount_acquire(&icl_cxgbei_ncons); 422 423 icc = (struct icl_cxgbei_conn *)kobj_create(&icl_cxgbei_class, M_CXGBE, 424 M_WAITOK | M_ZERO); 425 icc->icc_signature = CXGBEI_CONN_SIGNATURE; 426 STAILQ_INIT(&icc->rcvd_pdus); 427 428 ic = &icc->ic; 429 ic->ic_lock = lock; 430 431 /* XXXNP: review. Most of these icl_conn fields aren't really used */ 432 STAILQ_INIT(&ic->ic_to_send); 433 cv_init(&ic->ic_send_cv, "icl_cxgbei_tx"); 434 cv_init(&ic->ic_receive_cv, "icl_cxgbei_rx"); 435 #ifdef DIAGNOSTIC 436 refcount_init(&ic->ic_outstanding_pdus, 0); 437 #endif 438 /* This is a stop-gap value that will be corrected during handoff. */ 439 ic->ic_max_data_segment_length = 16384; 440 ic->ic_name = name; 441 ic->ic_offload = "cxgbei"; 442 ic->ic_unmapped = false; 443 444 CTR2(KTR_CXGBE, "%s: icc %p", __func__, icc); 445 446 return (ic); 447 } 448 449 void 450 icl_cxgbei_conn_free(struct icl_conn *ic) 451 { 452 struct icl_cxgbei_conn *icc = ic_to_icc(ic); 453 454 MPASS(icc->icc_signature == CXGBEI_CONN_SIGNATURE); 455 456 CTR2(KTR_CXGBE, "%s: icc %p", __func__, icc); 457 458 cv_destroy(&ic->ic_send_cv); 459 cv_destroy(&ic->ic_receive_cv); 460 461 kobj_delete((struct kobj *)icc, M_CXGBE); 462 refcount_release(&icl_cxgbei_ncons); 463 } 464 465 static int 466 icl_cxgbei_setsockopt(struct icl_conn *ic, struct socket *so, int sspace, 467 int rspace) 468 { 469 struct sockopt opt; 470 int error, one = 1, ss, rs; 471 472 ss = max(sendspace, sspace); 473 rs = max(recvspace, rspace); 474 475 error = soreserve(so, ss, rs); 476 if (error != 0) { 477 icl_cxgbei_conn_close(ic); 478 return (error); 479 } 480 SOCKBUF_LOCK(&so->so_snd); 481 so->so_snd.sb_flags |= SB_AUTOSIZE; 482 SOCKBUF_UNLOCK(&so->so_snd); 483 SOCKBUF_LOCK(&so->so_rcv); 484 so->so_rcv.sb_flags |= SB_AUTOSIZE; 485 SOCKBUF_UNLOCK(&so->so_rcv); 486 487 /* 488 * Disable Nagle. 489 */ 490 bzero(&opt, sizeof(opt)); 491 opt.sopt_dir = SOPT_SET; 492 opt.sopt_level = IPPROTO_TCP; 493 opt.sopt_name = TCP_NODELAY; 494 opt.sopt_val = &one; 495 opt.sopt_valsize = sizeof(one); 496 error = sosetopt(so, &opt); 497 if (error != 0) { 498 icl_cxgbei_conn_close(ic); 499 return (error); 500 } 501 502 return (0); 503 } 504 505 /* 506 * Request/response structure used to find out the adapter offloading a socket. 507 */ 508 struct find_ofld_adapter_rr { 509 struct socket *so; 510 struct adapter *sc; /* result */ 511 }; 512 513 static void 514 find_offload_adapter(struct adapter *sc, void *arg) 515 { 516 struct find_ofld_adapter_rr *fa = arg; 517 struct socket *so = fa->so; 518 struct tom_data *td = sc->tom_softc; 519 struct tcpcb *tp; 520 struct inpcb *inp; 521 522 /* Non-TCP were filtered out earlier. */ 523 MPASS(so->so_proto->pr_protocol == IPPROTO_TCP); 524 525 if (fa->sc != NULL) 526 return; /* Found already. */ 527 528 if (td == NULL) 529 return; /* TOE not enabled on this adapter. */ 530 531 inp = sotoinpcb(so); 532 INP_WLOCK(inp); 533 if ((inp->inp_flags & (INP_DROPPED | INP_TIMEWAIT)) == 0) { 534 tp = intotcpcb(inp); 535 if (tp->t_flags & TF_TOE && tp->tod == &td->tod) 536 fa->sc = sc; /* Found. */ 537 } 538 INP_WUNLOCK(inp); 539 } 540 541 /* XXXNP: move this to t4_tom. */ 542 static void 543 send_iscsi_flowc_wr(struct adapter *sc, struct toepcb *toep, int maxlen) 544 { 545 struct wrqe *wr; 546 struct fw_flowc_wr *flowc; 547 const u_int nparams = 1; 548 u_int flowclen; 549 struct ofld_tx_sdesc *txsd = &toep->txsd[toep->txsd_pidx]; 550 551 flowclen = sizeof(*flowc) + nparams * sizeof(struct fw_flowc_mnemval); 552 553 wr = alloc_wrqe(roundup2(flowclen, 16), toep->ofld_txq); 554 if (wr == NULL) { 555 /* XXX */ 556 panic("%s: allocation failure.", __func__); 557 } 558 flowc = wrtod(wr); 559 memset(flowc, 0, wr->wr_len); 560 561 flowc->op_to_nparams = htobe32(V_FW_WR_OP(FW_FLOWC_WR) | 562 V_FW_FLOWC_WR_NPARAMS(nparams)); 563 flowc->flowid_len16 = htonl(V_FW_WR_LEN16(howmany(flowclen, 16)) | 564 V_FW_WR_FLOWID(toep->tid)); 565 566 flowc->mnemval[0].mnemonic = FW_FLOWC_MNEM_TXDATAPLEN_MAX; 567 flowc->mnemval[0].val = htobe32(maxlen); 568 569 txsd->tx_credits = howmany(flowclen, 16); 570 txsd->plen = 0; 571 KASSERT(toep->tx_credits >= txsd->tx_credits && toep->txsd_avail > 0, 572 ("%s: not enough credits (%d)", __func__, toep->tx_credits)); 573 toep->tx_credits -= txsd->tx_credits; 574 if (__predict_false(++toep->txsd_pidx == toep->txsd_total)) 575 toep->txsd_pidx = 0; 576 toep->txsd_avail--; 577 578 t4_wrq_tx(sc, wr); 579 } 580 581 static void 582 set_ulp_mode_iscsi(struct adapter *sc, struct toepcb *toep, int hcrc, int dcrc) 583 { 584 uint64_t val = ULP_MODE_ISCSI; 585 586 if (hcrc) 587 val |= ULP_CRC_HEADER << 4; 588 if (dcrc) 589 val |= ULP_CRC_DATA << 4; 590 591 CTR4(KTR_CXGBE, "%s: tid %u, ULP_MODE_ISCSI, CRC hdr=%d data=%d", 592 __func__, toep->tid, hcrc, dcrc); 593 594 t4_set_tcb_field(sc, toep->ctrlq, toep, W_TCB_ULP_TYPE, 595 V_TCB_ULP_TYPE(M_TCB_ULP_TYPE) | V_TCB_ULP_RAW(M_TCB_ULP_RAW), val, 596 0, 0); 597 } 598 599 /* 600 * XXXNP: Who is responsible for cleaning up the socket if this returns with an 601 * error? Review all error paths. 602 * 603 * XXXNP: What happens to the socket's fd reference if the operation is 604 * successful, and how does that affect the socket's life cycle? 605 */ 606 int 607 icl_cxgbei_conn_handoff(struct icl_conn *ic, int fd) 608 { 609 struct icl_cxgbei_conn *icc = ic_to_icc(ic); 610 struct cxgbei_data *ci; 611 struct find_ofld_adapter_rr fa; 612 struct file *fp; 613 struct socket *so; 614 struct inpcb *inp; 615 struct tcpcb *tp; 616 struct toepcb *toep; 617 cap_rights_t rights; 618 int error; 619 620 MPASS(icc->icc_signature == CXGBEI_CONN_SIGNATURE); 621 ICL_CONN_LOCK_ASSERT_NOT(ic); 622 623 /* 624 * Steal the socket from userland. 625 */ 626 error = fget(curthread, fd, 627 cap_rights_init(&rights, CAP_SOCK_CLIENT), &fp); 628 if (error != 0) 629 return (error); 630 if (fp->f_type != DTYPE_SOCKET) { 631 fdrop(fp, curthread); 632 return (EINVAL); 633 } 634 so = fp->f_data; 635 if (so->so_type != SOCK_STREAM || 636 so->so_proto->pr_protocol != IPPROTO_TCP) { 637 fdrop(fp, curthread); 638 return (EINVAL); 639 } 640 641 ICL_CONN_LOCK(ic); 642 if (ic->ic_socket != NULL) { 643 ICL_CONN_UNLOCK(ic); 644 fdrop(fp, curthread); 645 return (EBUSY); 646 } 647 ic->ic_disconnecting = false; 648 ic->ic_socket = so; 649 fp->f_ops = &badfileops; 650 fp->f_data = NULL; 651 fdrop(fp, curthread); 652 ICL_CONN_UNLOCK(ic); 653 654 /* Find the adapter offloading this socket. */ 655 fa.sc = NULL; 656 fa.so = so; 657 t4_iterate(find_offload_adapter, &fa); 658 if (fa.sc == NULL) 659 return (EINVAL); 660 icc->sc = fa.sc; 661 ci = icc->sc->iscsi_ulp_softc; 662 663 inp = sotoinpcb(so); 664 INP_WLOCK(inp); 665 tp = intotcpcb(inp); 666 if (inp->inp_flags & (INP_DROPPED | INP_TIMEWAIT)) 667 error = EBUSY; 668 else { 669 /* 670 * socket could not have been "unoffloaded" if here. 671 */ 672 MPASS(tp->t_flags & TF_TOE); 673 MPASS(tp->tod != NULL); 674 MPASS(tp->t_toe != NULL); 675 toep = tp->t_toe; 676 MPASS(toep->vi->adapter == icc->sc); 677 icc->toep = toep; 678 icc->cwt = cxgbei_select_worker_thread(icc); 679 680 /* 681 * We maintain the _send_ DSL in this field just to have a 682 * convenient way to assert that the kernel never sends 683 * oversized PDUs. This field is otherwise unused in the driver 684 * or the kernel. 685 */ 686 ic->ic_max_data_segment_length = ci->max_tx_pdu_len - 687 ISCSI_BHS_SIZE; 688 689 icc->ulp_submode = 0; 690 if (ic->ic_header_crc32c) { 691 icc->ulp_submode |= ULP_CRC_HEADER; 692 ic->ic_max_data_segment_length -= 693 ISCSI_HEADER_DIGEST_SIZE; 694 } 695 if (ic->ic_data_crc32c) { 696 icc->ulp_submode |= ULP_CRC_DATA; 697 ic->ic_max_data_segment_length -= 698 ISCSI_DATA_DIGEST_SIZE; 699 } 700 so->so_options |= SO_NO_DDP; 701 toep->params.ulp_mode = ULP_MODE_ISCSI; 702 toep->ulpcb = icc; 703 704 send_iscsi_flowc_wr(icc->sc, toep, ci->max_tx_pdu_len); 705 set_ulp_mode_iscsi(icc->sc, toep, ic->ic_header_crc32c, 706 ic->ic_data_crc32c); 707 error = 0; 708 } 709 INP_WUNLOCK(inp); 710 711 if (error == 0) { 712 error = icl_cxgbei_setsockopt(ic, so, ci->max_tx_pdu_len, 713 ci->max_rx_pdu_len); 714 } 715 716 return (error); 717 } 718 719 void 720 icl_cxgbei_conn_close(struct icl_conn *ic) 721 { 722 struct icl_cxgbei_conn *icc = ic_to_icc(ic); 723 struct icl_pdu *ip; 724 struct socket *so; 725 struct sockbuf *sb; 726 struct inpcb *inp; 727 struct toepcb *toep = icc->toep; 728 729 MPASS(icc->icc_signature == CXGBEI_CONN_SIGNATURE); 730 ICL_CONN_LOCK_ASSERT_NOT(ic); 731 732 ICL_CONN_LOCK(ic); 733 so = ic->ic_socket; 734 if (ic->ic_disconnecting || so == NULL) { 735 CTR4(KTR_CXGBE, "%s: icc %p (disconnecting = %d), so %p", 736 __func__, icc, ic->ic_disconnecting, so); 737 ICL_CONN_UNLOCK(ic); 738 return; 739 } 740 ic->ic_disconnecting = true; 741 742 /* These are unused in this driver right now. */ 743 MPASS(STAILQ_EMPTY(&ic->ic_to_send)); 744 MPASS(ic->ic_receive_pdu == NULL); 745 746 #ifdef DIAGNOSTIC 747 KASSERT(ic->ic_outstanding_pdus == 0, 748 ("destroying session with %d outstanding PDUs", 749 ic->ic_outstanding_pdus)); 750 #endif 751 ICL_CONN_UNLOCK(ic); 752 753 CTR3(KTR_CXGBE, "%s: tid %d, icc %p", __func__, toep ? toep->tid : -1, 754 icc); 755 inp = sotoinpcb(so); 756 sb = &so->so_rcv; 757 INP_WLOCK(inp); 758 if (toep != NULL) { /* NULL if connection was never offloaded. */ 759 toep->ulpcb = NULL; 760 mbufq_drain(&toep->ulp_pduq); 761 SOCKBUF_LOCK(sb); 762 if (icc->rx_flags & RXF_ACTIVE) { 763 volatile u_int *p = &icc->rx_flags; 764 765 SOCKBUF_UNLOCK(sb); 766 INP_WUNLOCK(inp); 767 768 while (*p & RXF_ACTIVE) 769 pause("conclo", 1); 770 771 INP_WLOCK(inp); 772 SOCKBUF_LOCK(sb); 773 } 774 775 while (!STAILQ_EMPTY(&icc->rcvd_pdus)) { 776 ip = STAILQ_FIRST(&icc->rcvd_pdus); 777 STAILQ_REMOVE_HEAD(&icc->rcvd_pdus, ip_next); 778 icl_cxgbei_conn_pdu_free(ic, ip); 779 } 780 SOCKBUF_UNLOCK(sb); 781 } 782 INP_WUNLOCK(inp); 783 784 ICL_CONN_LOCK(ic); 785 ic->ic_socket = NULL; 786 ICL_CONN_UNLOCK(ic); 787 788 /* 789 * XXXNP: we should send RST instead of FIN when PDUs held in various 790 * queues were purged instead of delivered reliably but soabort isn't 791 * really general purpose and wouldn't do the right thing here. 792 */ 793 soclose(so); 794 } 795 796 int 797 icl_cxgbei_conn_task_setup(struct icl_conn *ic, struct icl_pdu *ip, 798 struct ccb_scsiio *csio, uint32_t *ittp, void **arg) 799 { 800 struct icl_cxgbei_conn *icc = ic_to_icc(ic); 801 struct toepcb *toep = icc->toep; 802 struct adapter *sc = icc->sc; 803 struct cxgbei_data *ci = sc->iscsi_ulp_softc; 804 struct ppod_region *pr = &ci->pr; 805 struct ppod_reservation *prsv; 806 uint32_t itt; 807 int rc = 0; 808 809 /* This is for the offload driver's state. Must not be set already. */ 810 MPASS(arg != NULL); 811 MPASS(*arg == NULL); 812 813 if ((csio->ccb_h.flags & CAM_DIR_MASK) != CAM_DIR_IN || 814 csio->dxfer_len < ci->ddp_threshold) { 815 no_ddp: 816 /* 817 * No DDP for this I/O. Allocate an ITT (based on the one 818 * passed in) that cannot be a valid hardware DDP tag in the 819 * iSCSI region. 820 */ 821 itt = *ittp & M_PPOD_TAG; 822 itt = V_PPOD_TAG(itt) | pr->pr_invalid_bit; 823 *ittp = htobe32(itt); 824 MPASS(*arg == NULL); /* State is maintained for DDP only. */ 825 if (rc != 0) 826 counter_u64_add(ci->ddp_setup_error, 1); 827 return (0); 828 } 829 830 /* 831 * Reserve resources for DDP, update the itt that should be used in the 832 * PDU, and save DDP specific state for this I/O in *arg. 833 */ 834 835 prsv = uma_zalloc(prsv_zone, M_NOWAIT); 836 if (prsv == NULL) { 837 rc = ENOMEM; 838 goto no_ddp; 839 } 840 841 /* XXX add support for all CAM_DATA_ types */ 842 MPASS((csio->ccb_h.flags & CAM_DATA_MASK) == CAM_DATA_VADDR); 843 rc = t4_alloc_page_pods_for_buf(pr, (vm_offset_t)csio->data_ptr, 844 csio->dxfer_len, prsv); 845 if (rc != 0) { 846 uma_zfree(prsv_zone, prsv); 847 goto no_ddp; 848 } 849 850 rc = t4_write_page_pods_for_buf(sc, toep->ofld_txq, toep->tid, prsv, 851 (vm_offset_t)csio->data_ptr, csio->dxfer_len); 852 if (rc != 0) { 853 t4_free_page_pods(prsv); 854 uma_zfree(prsv_zone, prsv); 855 goto no_ddp; 856 } 857 858 *ittp = htobe32(prsv->prsv_tag); 859 *arg = prsv; 860 counter_u64_add(ci->ddp_setup_ok, 1); 861 return (0); 862 } 863 864 void 865 icl_cxgbei_conn_task_done(struct icl_conn *ic, void *arg) 866 { 867 868 if (arg != NULL) { 869 struct ppod_reservation *prsv = arg; 870 871 t4_free_page_pods(prsv); 872 uma_zfree(prsv_zone, prsv); 873 } 874 } 875 876 /* XXXNP: PDU should be passed in as parameter, like on the initiator. */ 877 #define io_to_request_pdu(io) ((io)->io_hdr.ctl_private[CTL_PRIV_FRONTEND].ptr) 878 #define io_to_ppod_reservation(io) ((io)->io_hdr.ctl_private[CTL_PRIV_FRONTEND2].ptr) 879 880 int 881 icl_cxgbei_conn_transfer_setup(struct icl_conn *ic, union ctl_io *io, 882 uint32_t *tttp, void **arg) 883 { 884 struct icl_cxgbei_conn *icc = ic_to_icc(ic); 885 struct toepcb *toep = icc->toep; 886 struct ctl_scsiio *ctsio = &io->scsiio; 887 struct adapter *sc = icc->sc; 888 struct cxgbei_data *ci = sc->iscsi_ulp_softc; 889 struct ppod_region *pr = &ci->pr; 890 struct ppod_reservation *prsv; 891 uint32_t ttt; 892 int xferlen, rc = 0, alias; 893 894 /* This is for the offload driver's state. Must not be set already. */ 895 MPASS(arg != NULL); 896 MPASS(*arg == NULL); 897 898 if (ctsio->ext_data_filled == 0) { 899 int first_burst; 900 struct icl_pdu *ip = io_to_request_pdu(io); 901 vm_offset_t buf; 902 #ifdef INVARIANTS 903 struct icl_cxgbei_pdu *icp = ip_to_icp(ip); 904 905 MPASS(icp->icp_signature == CXGBEI_PDU_SIGNATURE); 906 MPASS(ic == ip->ip_conn); 907 MPASS(ip->ip_bhs_mbuf != NULL); 908 #endif 909 first_burst = icl_pdu_data_segment_length(ip); 910 911 /* 912 * Note that ICL calls conn_transfer_setup even if the first 913 * burst had everything and there's nothing left to transfer. 914 */ 915 MPASS(ctsio->kern_data_len >= first_burst); 916 xferlen = ctsio->kern_data_len; 917 if (xferlen - first_burst < ci->ddp_threshold) { 918 no_ddp: 919 /* 920 * No DDP for this transfer. Allocate a TTT (based on 921 * the one passed in) that cannot be a valid hardware 922 * DDP tag in the iSCSI region. 923 */ 924 ttt = *tttp & M_PPOD_TAG; 925 ttt = V_PPOD_TAG(ttt) | pr->pr_invalid_bit; 926 *tttp = htobe32(ttt); 927 MPASS(io_to_ppod_reservation(io) == NULL); 928 if (rc != 0) 929 counter_u64_add(ci->ddp_setup_error, 1); 930 return (0); 931 } 932 933 if (ctsio->kern_sg_entries == 0) 934 buf = (vm_offset_t)ctsio->kern_data_ptr; 935 else if (ctsio->kern_sg_entries == 1) { 936 struct ctl_sg_entry *sgl = (void *)ctsio->kern_data_ptr; 937 938 MPASS(sgl->len == xferlen); 939 buf = (vm_offset_t)sgl->addr; 940 } else { 941 rc = EAGAIN; /* XXX implement */ 942 goto no_ddp; 943 } 944 945 946 /* 947 * Reserve resources for DDP, update the ttt that should be used 948 * in the PDU, and save DDP specific state for this I/O. 949 */ 950 951 MPASS(io_to_ppod_reservation(io) == NULL); 952 prsv = uma_zalloc(prsv_zone, M_NOWAIT); 953 if (prsv == NULL) { 954 rc = ENOMEM; 955 goto no_ddp; 956 } 957 958 rc = t4_alloc_page_pods_for_buf(pr, buf, xferlen, prsv); 959 if (rc != 0) { 960 uma_zfree(prsv_zone, prsv); 961 goto no_ddp; 962 } 963 964 rc = t4_write_page_pods_for_buf(sc, toep->ofld_txq, toep->tid, 965 prsv, buf, xferlen); 966 if (rc != 0) { 967 t4_free_page_pods(prsv); 968 uma_zfree(prsv_zone, prsv); 969 goto no_ddp; 970 } 971 972 *tttp = htobe32(prsv->prsv_tag); 973 io_to_ppod_reservation(io) = prsv; 974 *arg = ctsio; 975 counter_u64_add(ci->ddp_setup_ok, 1); 976 return (0); 977 } 978 979 /* 980 * In the middle of an I/O. A non-NULL page pod reservation indicates 981 * that a DDP buffer is being used for the I/O. 982 */ 983 984 prsv = io_to_ppod_reservation(ctsio); 985 if (prsv == NULL) 986 goto no_ddp; 987 988 alias = (prsv->prsv_tag & pr->pr_alias_mask) >> pr->pr_alias_shift; 989 alias++; 990 prsv->prsv_tag &= ~pr->pr_alias_mask; 991 prsv->prsv_tag |= alias << pr->pr_alias_shift & pr->pr_alias_mask; 992 993 *tttp = htobe32(prsv->prsv_tag); 994 *arg = ctsio; 995 996 return (0); 997 } 998 999 void 1000 icl_cxgbei_conn_transfer_done(struct icl_conn *ic, void *arg) 1001 { 1002 struct ctl_scsiio *ctsio = arg; 1003 1004 if (ctsio != NULL && ctsio->kern_data_len == ctsio->ext_data_filled) { 1005 struct ppod_reservation *prsv; 1006 1007 prsv = io_to_ppod_reservation(ctsio); 1008 MPASS(prsv != NULL); 1009 1010 t4_free_page_pods(prsv); 1011 uma_zfree(prsv_zone, prsv); 1012 } 1013 } 1014 1015 static void 1016 cxgbei_limits(struct adapter *sc, void *arg) 1017 { 1018 struct icl_drv_limits *idl = arg; 1019 struct cxgbei_data *ci; 1020 int max_dsl; 1021 1022 if (begin_synchronized_op(sc, NULL, HOLD_LOCK, "t4lims") != 0) 1023 return; 1024 1025 if (uld_active(sc, ULD_ISCSI)) { 1026 ci = sc->iscsi_ulp_softc; 1027 MPASS(ci != NULL); 1028 1029 /* 1030 * AHS is not supported by the kernel so we'll not account for 1031 * it either in our PDU len -> data segment len conversions. 1032 */ 1033 1034 max_dsl = ci->max_rx_pdu_len - ISCSI_BHS_SIZE - 1035 ISCSI_HEADER_DIGEST_SIZE - ISCSI_DATA_DIGEST_SIZE; 1036 if (idl->idl_max_recv_data_segment_length > max_dsl) 1037 idl->idl_max_recv_data_segment_length = max_dsl; 1038 1039 max_dsl = ci->max_tx_pdu_len - ISCSI_BHS_SIZE - 1040 ISCSI_HEADER_DIGEST_SIZE - ISCSI_DATA_DIGEST_SIZE; 1041 if (idl->idl_max_send_data_segment_length > max_dsl) 1042 idl->idl_max_send_data_segment_length = max_dsl; 1043 } 1044 1045 end_synchronized_op(sc, LOCK_HELD); 1046 } 1047 1048 static int 1049 icl_cxgbei_limits(struct icl_drv_limits *idl) 1050 { 1051 1052 /* Maximum allowed by the RFC. cxgbei_limits will clip them. */ 1053 idl->idl_max_recv_data_segment_length = (1 << 24) - 1; 1054 idl->idl_max_send_data_segment_length = (1 << 24) - 1; 1055 1056 /* These are somewhat arbitrary. */ 1057 idl->idl_max_burst_length = 2 * 1024 * 1024; 1058 idl->idl_first_burst_length = 8192; 1059 1060 t4_iterate(cxgbei_limits, idl); 1061 1062 return (0); 1063 } 1064 1065 int 1066 icl_cxgbei_mod_load(void) 1067 { 1068 int rc; 1069 1070 /* 1071 * Space to track pagepod reservations. 1072 */ 1073 prsv_zone = uma_zcreate("Pagepod reservations", 1074 sizeof(struct ppod_reservation), NULL, NULL, NULL, NULL, 1075 UMA_ALIGN_CACHE, 0); 1076 1077 refcount_init(&icl_cxgbei_ncons, 0); 1078 1079 rc = icl_register("cxgbei", false, -100, icl_cxgbei_limits, 1080 icl_cxgbei_new_conn); 1081 1082 return (rc); 1083 } 1084 1085 int 1086 icl_cxgbei_mod_unload(void) 1087 { 1088 1089 if (icl_cxgbei_ncons != 0) 1090 return (EBUSY); 1091 1092 icl_unregister("cxgbei", false); 1093 1094 uma_zdestroy(prsv_zone); 1095 1096 return (0); 1097 } 1098 #endif 1099