/*-
 * Copyright (c) 2012 The FreeBSD Foundation
 * Copyright (c) 2015 Chelsio Communications, Inc.
 * All rights reserved.
 *
 * This software was developed by Edward Tomasz Napierala under sponsorship
 * from the FreeBSD Foundation.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * cxgbei implementation of iSCSI Common Layer kobj(9) interface.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_inet.h"
#include "opt_inet6.h"

#ifdef TCP_OFFLOAD
#include <sys/param.h>
#include <sys/capsicum.h>
#include <sys/condvar.h>
#include <sys/conf.h>
#include <sys/file.h>
#include <sys/kernel.h>
#include <sys/kthread.h>
#include <sys/lock.h>
#include <sys/mbuf.h>
#include <sys/mutex.h>
#include <sys/module.h>
#include <sys/protosw.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/sysctl.h>
#include <sys/systm.h>
#include <sys/sx.h>
#include <sys/uio.h>
#include <machine/bus.h>
#include <vm/uma.h>
#include <netinet/in.h>
#include <netinet/in_pcb.h>
#include <netinet/tcp.h>
#include <netinet/tcp_var.h>
#include <netinet/toecore.h>

#include <dev/iscsi/icl.h>
#include <dev/iscsi/iscsi_proto.h>
#include <icl_conn_if.h>

#include "common/common.h"
#include "common/t4_tcb.h"
#include "tom/t4_tom.h"
#include "cxgbei.h"

SYSCTL_NODE(_kern_icl, OID_AUTO, cxgbei, CTLFLAG_RD, 0, "Chelsio iSCSI offload");
static int coalesce = 1;
SYSCTL_INT(_kern_icl_cxgbei, OID_AUTO, coalesce, CTLFLAG_RWTUN,
    &coalesce, 0, "Try to coalesce PDUs before sending");
static int partial_receive_len = 128 * 1024;
SYSCTL_INT(_kern_icl_cxgbei, OID_AUTO, partial_receive_len, CTLFLAG_RWTUN,
    &partial_receive_len, 0, "Minimum read size for partially received "
    "data segment");
static int sendspace = 1048576;
SYSCTL_INT(_kern_icl_cxgbei, OID_AUTO, sendspace, CTLFLAG_RWTUN,
    &sendspace, 0, "Default send socket buffer size");
static int recvspace = 1048576;
SYSCTL_INT(_kern_icl_cxgbei, OID_AUTO, recvspace, CTLFLAG_RWTUN,
    &recvspace, 0, "Default receive socket buffer size");

static uma_zone_t icl_transfer_zone;

static volatile u_int icl_cxgbei_ncons;
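/*
 * The connection lock (ic_lock) is not ours: the ICL consumer supplies it
 * via the "lock" argument to icl_cxgbei_new_conn().  These wrappers exist
 * mostly so that the functions below can assert the lock state they expect.
 */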
#define ICL_CONN_LOCK(X)		mtx_lock(X->ic_lock)
#define ICL_CONN_UNLOCK(X)		mtx_unlock(X->ic_lock)
#define ICL_CONN_LOCK_ASSERT(X)		mtx_assert(X->ic_lock, MA_OWNED)
#define ICL_CONN_LOCK_ASSERT_NOT(X)	mtx_assert(X->ic_lock, MA_NOTOWNED)

struct icl_pdu *icl_cxgbei_new_pdu(int);
void icl_cxgbei_new_pdu_set_conn(struct icl_pdu *, struct icl_conn *);

static icl_conn_new_pdu_t	icl_cxgbei_conn_new_pdu;
icl_conn_pdu_free_t	icl_cxgbei_conn_pdu_free;
static icl_conn_pdu_data_segment_length_t
    icl_cxgbei_conn_pdu_data_segment_length;
static icl_conn_pdu_append_data_t	icl_cxgbei_conn_pdu_append_data;
static icl_conn_pdu_get_data_t	icl_cxgbei_conn_pdu_get_data;
static icl_conn_pdu_queue_t	icl_cxgbei_conn_pdu_queue;
static icl_conn_handoff_t	icl_cxgbei_conn_handoff;
static icl_conn_free_t	icl_cxgbei_conn_free;
static icl_conn_close_t	icl_cxgbei_conn_close;
static icl_conn_task_setup_t	icl_cxgbei_conn_task_setup;
static icl_conn_task_done_t	icl_cxgbei_conn_task_done;
static icl_conn_transfer_setup_t	icl_cxgbei_conn_transfer_setup;
static icl_conn_transfer_done_t	icl_cxgbei_conn_transfer_done;

static kobj_method_t icl_cxgbei_methods[] = {
	KOBJMETHOD(icl_conn_new_pdu, icl_cxgbei_conn_new_pdu),
	KOBJMETHOD(icl_conn_pdu_free, icl_cxgbei_conn_pdu_free),
	KOBJMETHOD(icl_conn_pdu_data_segment_length,
	    icl_cxgbei_conn_pdu_data_segment_length),
	KOBJMETHOD(icl_conn_pdu_append_data, icl_cxgbei_conn_pdu_append_data),
	KOBJMETHOD(icl_conn_pdu_get_data, icl_cxgbei_conn_pdu_get_data),
	KOBJMETHOD(icl_conn_pdu_queue, icl_cxgbei_conn_pdu_queue),
	KOBJMETHOD(icl_conn_handoff, icl_cxgbei_conn_handoff),
	KOBJMETHOD(icl_conn_free, icl_cxgbei_conn_free),
	KOBJMETHOD(icl_conn_close, icl_cxgbei_conn_close),
	KOBJMETHOD(icl_conn_task_setup, icl_cxgbei_conn_task_setup),
	KOBJMETHOD(icl_conn_task_done, icl_cxgbei_conn_task_done),
	KOBJMETHOD(icl_conn_transfer_setup, icl_cxgbei_conn_transfer_setup),
	KOBJMETHOD(icl_conn_transfer_done, icl_cxgbei_conn_transfer_done),
	{ 0, 0 }
};

DEFINE_CLASS(icl_cxgbei, icl_cxgbei_methods, sizeof(struct icl_cxgbei_conn));

void
icl_cxgbei_conn_pdu_free(struct icl_conn *ic, struct icl_pdu *ip)
{
#ifdef INVARIANTS
	struct icl_cxgbei_pdu *icp = ip_to_icp(ip);
#endif

	MPASS(icp->icp_signature == CXGBEI_PDU_SIGNATURE);
	MPASS(ic == ip->ip_conn);
	MPASS(ip->ip_bhs_mbuf != NULL);

	m_freem(ip->ip_ahs_mbuf);
	m_freem(ip->ip_data_mbuf);
	m_freem(ip->ip_bhs_mbuf);	/* storage for icl_cxgbei_pdu itself */

#ifdef DIAGNOSTIC
	if (__predict_true(ic != NULL))
		refcount_release(&ic->ic_outstanding_pdus);
#endif
}
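/*
 * The PDU bookkeeping is carved out of a single mbuf: the mbuf header is
 * followed, at the proper alignments, by the icl_cxgbei_pdu and then by
 * the 48B BHS itself, roughly:
 *
 *	+-------------+-----------------+------------------+
 *	| struct mbuf | icl_cxgbei_pdu  | struct iscsi_bhs |
 *	+-------------+-----------------+------------------+
 *	|<--------------------- MSIZE -------------------->|
 *
 * Freeing ip_bhs_mbuf therefore frees the PDU too (see pdu_free above),
 * and the INVARIANTS block below verifies that everything fits in MSIZE.
 */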
struct icl_pdu *
icl_cxgbei_new_pdu(int flags)
{
	struct icl_cxgbei_pdu *icp;
	struct icl_pdu *ip;
	struct mbuf *m;
	uintptr_t a;

	m = m_gethdr(flags, MT_DATA);
	if (__predict_false(m == NULL))
		return (NULL);

	a = roundup2(mtod(m, uintptr_t), _Alignof(struct icl_cxgbei_pdu));
	icp = (struct icl_cxgbei_pdu *)a;
	bzero(icp, sizeof(*icp));

	icp->icp_signature = CXGBEI_PDU_SIGNATURE;
	ip = &icp->ip;
	ip->ip_bhs_mbuf = m;

	a = roundup2((uintptr_t)(icp + 1), _Alignof(struct iscsi_bhs *));
	ip->ip_bhs = (struct iscsi_bhs *)a;
#ifdef INVARIANTS
	/* Everything must fit entirely in the mbuf. */
	a = (uintptr_t)(ip->ip_bhs + 1);
	MPASS(a <= (uintptr_t)m + MSIZE);
#endif
	bzero(ip->ip_bhs, sizeof(*ip->ip_bhs));

	m->m_data = (void *)ip->ip_bhs;
	m->m_len = sizeof(struct iscsi_bhs);
	m->m_pkthdr.len = m->m_len;

	return (ip);
}

void
icl_cxgbei_new_pdu_set_conn(struct icl_pdu *ip, struct icl_conn *ic)
{

	ip->ip_conn = ic;
#ifdef DIAGNOSTIC
	refcount_acquire(&ic->ic_outstanding_pdus);
#endif
}

/*
 * Allocate icl_pdu with an empty BHS to be filled in by the caller.
 */
static struct icl_pdu *
icl_cxgbei_conn_new_pdu(struct icl_conn *ic, int flags)
{
	struct icl_pdu *ip;

	ip = icl_cxgbei_new_pdu(flags);
	if (__predict_false(ip == NULL))
		return (NULL);
	icl_cxgbei_new_pdu_set_conn(ip, ic);

	return (ip);
}

static size_t
icl_pdu_data_segment_length(const struct icl_pdu *request)
{
	uint32_t len = 0;

	len += request->ip_bhs->bhs_data_segment_len[0];
	len <<= 8;
	len += request->ip_bhs->bhs_data_segment_len[1];
	len <<= 8;
	len += request->ip_bhs->bhs_data_segment_len[2];

	return (len);
}

size_t
icl_cxgbei_conn_pdu_data_segment_length(struct icl_conn *ic,
    const struct icl_pdu *request)
{

	return (icl_pdu_data_segment_length(request));
}

static uint32_t
icl_conn_build_tasktag(struct icl_conn *ic, uint32_t tag)
{

	return (tag);
}
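/*
 * finalize_pdu() turns an icl_pdu into a plain mbuf chain that the TOE can
 * transmit: the data segment is zero-padded to a 4B boundary, the 24-bit
 * DataSegmentLength is written back into the BHS in big-endian byte order,
 * the header and data mbufs are linked together, and the ULP submode (which
 * digests the hardware must insert) is recorded in the packet header.  For
 * example, ip_data_len = 0x012345 is stored as bhs_data_segment_len[] =
 * { 0x01, 0x23, 0x45 }, which is exactly what icl_pdu_data_segment_length()
 * above decodes.
 */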
static struct mbuf *
finalize_pdu(struct icl_cxgbei_conn *icc, struct icl_cxgbei_pdu *icp)
{
	struct icl_pdu *ip = &icp->ip;
	uint8_t ulp_submode, padding;
	struct mbuf *m, *last;
	struct iscsi_bhs *bhs;

	/*
	 * Fix up the data segment mbuf first.
	 */
	m = ip->ip_data_mbuf;
	ulp_submode = icc->ulp_submode;
	if (m) {
		last = m_last(m);

		/*
		 * Round up the data segment to a 4B boundary.  Pad with 0 if
		 * necessary.  There will definitely be room in the mbuf.
		 */
		padding = roundup2(ip->ip_data_len, 4) - ip->ip_data_len;
		if (padding) {
			bzero(mtod(last, uint8_t *) + last->m_len, padding);
			last->m_len += padding;
		}
	} else {
		MPASS(ip->ip_data_len == 0);
		ulp_submode &= ~ULP_CRC_DATA;
		padding = 0;
	}

	/*
	 * Now the header mbuf that has the BHS.
	 */
	m = ip->ip_bhs_mbuf;
	MPASS(m->m_pkthdr.len == sizeof(struct iscsi_bhs));
	MPASS(m->m_len == sizeof(struct iscsi_bhs));

	bhs = ip->ip_bhs;
	bhs->bhs_data_segment_len[2] = ip->ip_data_len;
	bhs->bhs_data_segment_len[1] = ip->ip_data_len >> 8;
	bhs->bhs_data_segment_len[0] = ip->ip_data_len >> 16;

	/* "Convert" PDU to mbuf chain.  Do not use icp/ip after this. */
	m->m_pkthdr.len = sizeof(struct iscsi_bhs) + ip->ip_data_len + padding;
	m->m_next = ip->ip_data_mbuf;
	set_mbuf_ulp_submode(m, ulp_submode);
#ifdef INVARIANTS
	bzero(icp, sizeof(*icp));
#endif
#ifdef DIAGNOSTIC
	refcount_release(&icc->ic.ic_outstanding_pdus);
#endif

	return (m);
}

int
icl_cxgbei_conn_pdu_append_data(struct icl_conn *ic, struct icl_pdu *ip,
    const void *addr, size_t len, int flags)
{
	struct mbuf *m;
#ifdef INVARIANTS
	struct icl_cxgbei_pdu *icp = ip_to_icp(ip);
#endif

	MPASS(icp->icp_signature == CXGBEI_PDU_SIGNATURE);
	MPASS(ic == ip->ip_conn);
	KASSERT(len > 0, ("%s: len is %jd", __func__, (intmax_t)len));

	m = ip->ip_data_mbuf;
	if (m == NULL) {
		m = m_getjcl(M_NOWAIT, MT_DATA, 0, MJUM16BYTES);
		if (__predict_false(m == NULL))
			return (ENOMEM);

		ip->ip_data_mbuf = m;
	}

	if (__predict_true(m_append(m, len, addr) != 0)) {
		ip->ip_data_len += len;
		MPASS(ip->ip_data_len <= ic->ic_max_data_segment_length);
		return (0);
	} else {
		if (flags & M_WAITOK) {
			CXGBE_UNIMPLEMENTED("fail safe append");
		}
		ip->ip_data_len = m_length(m, NULL);
		return (1);
	}
}

void
icl_cxgbei_conn_pdu_get_data(struct icl_conn *ic, struct icl_pdu *ip,
    size_t off, void *addr, size_t len)
{
	struct icl_cxgbei_pdu *icp = ip_to_icp(ip);

	if (icp->icp_flags & ICPF_RX_DDP)
		return;		/* data is DDP'ed, no need to copy */
	m_copydata(ip->ip_data_mbuf, off, len, addr);
}
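/*
 * Transmit path.  Note that finalized PDUs are never written to the socket
 * buffer: they are enqueued directly on the toepcb's ULP PDU queue and
 * t4_push_pdus() hands them to the TOE hardware, which inserts whatever
 * header/data digests the ULP submode calls for.
 */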
void
icl_cxgbei_conn_pdu_queue(struct icl_conn *ic, struct icl_pdu *ip)
{
	struct icl_cxgbei_conn *icc = ic_to_icc(ic);
	struct icl_cxgbei_pdu *icp = ip_to_icp(ip);
	struct socket *so = ic->ic_socket;
	struct toepcb *toep = icc->toep;
	struct inpcb *inp;
	struct mbuf *m;

	MPASS(ic == ip->ip_conn);
	MPASS(ip->ip_bhs_mbuf != NULL);
	/* The kernel doesn't generate PDUs with AHS. */
	MPASS(ip->ip_ahs_mbuf == NULL && ip->ip_ahs_len == 0);

	ICL_CONN_LOCK_ASSERT(ic);
	/* NOTE: sowriteable without so_snd lock is a mostly harmless race. */
	if (ic->ic_disconnecting || so == NULL || !sowriteable(so)) {
		icl_cxgbei_conn_pdu_free(ic, ip);
		return;
	}

	m = finalize_pdu(icc, icp);
	M_ASSERTPKTHDR(m);
	MPASS((m->m_pkthdr.len & 3) == 0);

	/*
	 * Do not get inp from toep->inp as the toepcb might have detached
	 * already.
	 */
	inp = sotoinpcb(so);
	INP_WLOCK(inp);
	if (__predict_false(inp->inp_flags & (INP_DROPPED | INP_TIMEWAIT)) ||
	    __predict_false((toep->flags & TPF_ATTACHED) == 0))
		m_freem(m);
	else {
		mbufq_enqueue(&toep->ulp_pduq, m);
		t4_push_pdus(icc->sc, toep, 0);
	}
	INP_WUNLOCK(inp);
}

static struct icl_conn *
icl_cxgbei_new_conn(const char *name, struct mtx *lock)
{
	struct icl_cxgbei_conn *icc;
	struct icl_conn *ic;

	refcount_acquire(&icl_cxgbei_ncons);

	icc = (struct icl_cxgbei_conn *)kobj_create(&icl_cxgbei_class, M_CXGBE,
	    M_WAITOK | M_ZERO);
	icc->icc_signature = CXGBEI_CONN_SIGNATURE;
	STAILQ_INIT(&icc->rcvd_pdus);

	ic = &icc->ic;
	ic->ic_lock = lock;

	/* XXXNP: review.  Most of these icl_conn fields aren't really used. */
	STAILQ_INIT(&ic->ic_to_send);
	cv_init(&ic->ic_send_cv, "icl_cxgbei_tx");
	cv_init(&ic->ic_receive_cv, "icl_cxgbei_rx");
#ifdef DIAGNOSTIC
	refcount_init(&ic->ic_outstanding_pdus, 0);
#endif
	/* This is a stop-gap value that will be corrected during handoff. */
	ic->ic_max_data_segment_length = 16384;
	ic->ic_name = name;
	ic->ic_offload = "cxgbei";
	ic->ic_unmapped = false;

	CTR2(KTR_CXGBE, "%s: icc %p", __func__, icc);

	return (ic);
}

void
icl_cxgbei_conn_free(struct icl_conn *ic)
{
	struct icl_cxgbei_conn *icc = ic_to_icc(ic);

	MPASS(icc->icc_signature == CXGBEI_CONN_SIGNATURE);

	CTR2(KTR_CXGBE, "%s: icc %p", __func__, icc);

	cv_destroy(&ic->ic_send_cv);
	cv_destroy(&ic->ic_receive_cv);

	kobj_delete((struct kobj *)icc, M_CXGBE);
	refcount_release(&icl_cxgbei_ncons);
}

static int
icl_cxgbei_setsockopt(struct icl_conn *ic, struct socket *so, int sspace,
    int rspace)
{
	struct sockopt opt;
	int error, one = 1, ss, rs;

	ss = max(sendspace, sspace);
	rs = max(recvspace, rspace);

	error = soreserve(so, ss, rs);
	if (error != 0) {
		icl_cxgbei_conn_close(ic);
		return (error);
	}
	SOCKBUF_LOCK(&so->so_snd);
	so->so_snd.sb_flags |= SB_AUTOSIZE;
	SOCKBUF_UNLOCK(&so->so_snd);
	SOCKBUF_LOCK(&so->so_rcv);
	so->so_rcv.sb_flags |= SB_AUTOSIZE;
	SOCKBUF_UNLOCK(&so->so_rcv);

	/*
	 * Disable Nagle.
	 */
	bzero(&opt, sizeof(opt));
	opt.sopt_dir = SOPT_SET;
	opt.sopt_level = IPPROTO_TCP;
	opt.sopt_name = TCP_NODELAY;
	opt.sopt_val = &one;
	opt.sopt_valsize = sizeof(one);
	error = sosetopt(so, &opt);
	if (error != 0) {
		icl_cxgbei_conn_close(ic);
		return (error);
	}

	return (0);
}

/*
 * Request/response structure used to find out the adapter offloading a socket.
 */
struct find_ofld_adapter_rr {
	struct socket *so;
	struct adapter *sc;	/* result */
};

static void
find_offload_adapter(struct adapter *sc, void *arg)
{
	struct find_ofld_adapter_rr *fa = arg;
	struct socket *so = fa->so;
	struct tom_data *td = sc->tom_softc;
	struct tcpcb *tp;
	struct inpcb *inp;

	/* Non-TCP were filtered out earlier. */
	MPASS(so->so_proto->pr_protocol == IPPROTO_TCP);

	if (fa->sc != NULL)
		return;	/* Found already. */

	if (td == NULL)
		return;	/* TOE not enabled on this adapter. */

	inp = sotoinpcb(so);
	INP_WLOCK(inp);
	if ((inp->inp_flags & (INP_DROPPED | INP_TIMEWAIT)) == 0) {
		tp = intotcpcb(inp);
		if (tp->t_flags & TF_TOE && tp->tod == &td->tod)
			fa->sc = sc;	/* Found. */
	}
	INP_WUNLOCK(inp);
}
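/*
 * find_offload_adapter() is meant to be used with t4_iterate(), which
 * invokes the callback once for every cxgbe(4) adapter in the system.
 * A minimal usage sketch:
 *
 *	struct find_ofld_adapter_rr fa = { .so = so, .sc = NULL };
 *
 *	t4_iterate(find_offload_adapter, &fa);
 *	if (fa.sc == NULL)
 *		return (EINVAL);	// not offloaded by any adapter
 *
 * This is exactly how icl_cxgbei_conn_handoff() uses it below.
 */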
/* XXXNP: move this to t4_tom. */
static void
send_iscsi_flowc_wr(struct adapter *sc, struct toepcb *toep, int maxlen)
{
	struct wrqe *wr;
	struct fw_flowc_wr *flowc;
	const u_int nparams = 1;
	u_int flowclen;
	struct ofld_tx_sdesc *txsd = &toep->txsd[toep->txsd_pidx];

	flowclen = sizeof(*flowc) + nparams * sizeof(struct fw_flowc_mnemval);

	wr = alloc_wrqe(roundup2(flowclen, 16), toep->ofld_txq);
	if (wr == NULL) {
		/* XXX */
		panic("%s: allocation failure.", __func__);
	}
	flowc = wrtod(wr);
	memset(flowc, 0, wr->wr_len);

	flowc->op_to_nparams = htobe32(V_FW_WR_OP(FW_FLOWC_WR) |
	    V_FW_FLOWC_WR_NPARAMS(nparams));
	flowc->flowid_len16 = htonl(V_FW_WR_LEN16(howmany(flowclen, 16)) |
	    V_FW_WR_FLOWID(toep->tid));

	flowc->mnemval[0].mnemonic = FW_FLOWC_MNEM_TXDATAPLEN_MAX;
	flowc->mnemval[0].val = htobe32(maxlen);

	txsd->tx_credits = howmany(flowclen, 16);
	txsd->plen = 0;
	KASSERT(toep->tx_credits >= txsd->tx_credits && toep->txsd_avail > 0,
	    ("%s: not enough credits (%d)", __func__, toep->tx_credits));
	toep->tx_credits -= txsd->tx_credits;
	if (__predict_false(++toep->txsd_pidx == toep->txsd_total))
		toep->txsd_pidx = 0;
	toep->txsd_avail--;

	t4_wrq_tx(sc, wr);
}

static void
set_ulp_mode_iscsi(struct adapter *sc, struct toepcb *toep, int hcrc, int dcrc)
{
	uint64_t val = ULP_MODE_ISCSI;

	if (hcrc)
		val |= ULP_CRC_HEADER << 4;
	if (dcrc)
		val |= ULP_CRC_DATA << 4;

	CTR4(KTR_CXGBE, "%s: tid %u, ULP_MODE_ISCSI, CRC hdr=%d data=%d",
	    __func__, toep->tid, hcrc, dcrc);

	t4_set_tcb_field(sc, toep->ctrlq, toep->tid, W_TCB_ULP_TYPE,
	    V_TCB_ULP_TYPE(M_TCB_ULP_TYPE) | V_TCB_ULP_RAW(M_TCB_ULP_RAW), val,
	    0, 0, toep->ofld_rxq->iq.abs_id);
}

/*
 * XXXNP: Who is responsible for cleaning up the socket if this returns with an
 * error?  Review all error paths.
 *
 * XXXNP: What happens to the socket's fd reference if the operation is
 * successful, and how does that affect the socket's life cycle?
 */
int
icl_cxgbei_conn_handoff(struct icl_conn *ic, int fd)
{
	struct icl_cxgbei_conn *icc = ic_to_icc(ic);
	struct cxgbei_data *ci;
	struct find_ofld_adapter_rr fa;
	struct file *fp;
	struct socket *so;
	struct inpcb *inp;
	struct tcpcb *tp;
	struct toepcb *toep;
	cap_rights_t rights;
	int error;

	MPASS(icc->icc_signature == CXGBEI_CONN_SIGNATURE);
	ICL_CONN_LOCK_ASSERT_NOT(ic);

	/*
	 * Steal the socket from userland.
	 */
	error = fget(curthread, fd,
	    cap_rights_init(&rights, CAP_SOCK_CLIENT), &fp);
	if (error != 0)
		return (error);
	if (fp->f_type != DTYPE_SOCKET) {
		fdrop(fp, curthread);
		return (EINVAL);
	}
	so = fp->f_data;
	if (so->so_type != SOCK_STREAM ||
	    so->so_proto->pr_protocol != IPPROTO_TCP) {
		fdrop(fp, curthread);
		return (EINVAL);
	}

	ICL_CONN_LOCK(ic);
	if (ic->ic_socket != NULL) {
		ICL_CONN_UNLOCK(ic);
		fdrop(fp, curthread);
		return (EBUSY);
	}
	ic->ic_disconnecting = false;
	ic->ic_socket = so;
	fp->f_ops = &badfileops;
	fp->f_data = NULL;
	fdrop(fp, curthread);
	ICL_CONN_UNLOCK(ic);
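	/*
	 * The descriptor still exists in the process's fd table, but it now
	 * points at a defunct file: f_ops was replaced with badfileops, so
	 * any further userland I/O on the fd fails.  The socket itself is
	 * owned by the connection from here on and is released in
	 * icl_cxgbei_conn_close().
	 */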
	/* Find the adapter offloading this socket. */
	fa.sc = NULL;
	fa.so = so;
	t4_iterate(find_offload_adapter, &fa);
	if (fa.sc == NULL)
		return (EINVAL);
	icc->sc = fa.sc;
	ci = icc->sc->iscsi_ulp_softc;

	inp = sotoinpcb(so);
	INP_WLOCK(inp);
	tp = intotcpcb(inp);
	if (inp->inp_flags & (INP_DROPPED | INP_TIMEWAIT))
		error = EBUSY;
	else {
		/*
		 * The socket could not have been "unoffloaded" if we got
		 * this far.
		 */
		MPASS(tp->t_flags & TF_TOE);
		MPASS(tp->tod != NULL);
		MPASS(tp->t_toe != NULL);
		toep = tp->t_toe;
		MPASS(toep->vi->pi->adapter == icc->sc);
		icc->toep = toep;
		icc->cwt = cxgbei_select_worker_thread(icc);

		/*
		 * We maintain the _send_ DSL in this field just to have a
		 * convenient way to assert that the kernel never sends
		 * oversized PDUs.  This field is otherwise unused in the
		 * driver or the kernel.
		 */
		ic->ic_max_data_segment_length = ci->max_tx_pdu_len -
		    ISCSI_BHS_SIZE;

		icc->ulp_submode = 0;
		if (ic->ic_header_crc32c) {
			icc->ulp_submode |= ULP_CRC_HEADER;
			ic->ic_max_data_segment_length -=
			    ISCSI_HEADER_DIGEST_SIZE;
		}
		if (ic->ic_data_crc32c) {
			icc->ulp_submode |= ULP_CRC_DATA;
			ic->ic_max_data_segment_length -=
			    ISCSI_DATA_DIGEST_SIZE;
		}
		so->so_options |= SO_NO_DDP;
		toep->ulp_mode = ULP_MODE_ISCSI;
		toep->ulpcb = icc;

		send_iscsi_flowc_wr(icc->sc, toep, ci->max_tx_pdu_len);
		set_ulp_mode_iscsi(icc->sc, toep, ic->ic_header_crc32c,
		    ic->ic_data_crc32c);
		error = 0;
	}
	INP_WUNLOCK(inp);

	if (error == 0) {
		error = icl_cxgbei_setsockopt(ic, so, ci->max_tx_pdu_len,
		    ci->max_rx_pdu_len);
	}

	return (error);
}
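/*
 * Teardown happens in a specific order: queued-but-unsent PDUs are drained
 * from the toepcb first, then the receive worker is waited out (RXF_ACTIVE),
 * then any PDUs that were received but never claimed by the upper layers are
 * freed.  Only after the socket has been detached from the connection is it
 * finally closed.
 */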
void
icl_cxgbei_conn_close(struct icl_conn *ic)
{
	struct icl_cxgbei_conn *icc = ic_to_icc(ic);
	struct icl_pdu *ip;
	struct socket *so;
	struct sockbuf *sb;
	struct inpcb *inp;
	struct toepcb *toep = icc->toep;

	MPASS(icc->icc_signature == CXGBEI_CONN_SIGNATURE);
	ICL_CONN_LOCK_ASSERT_NOT(ic);

	ICL_CONN_LOCK(ic);
	so = ic->ic_socket;
	if (ic->ic_disconnecting || so == NULL) {
		CTR4(KTR_CXGBE, "%s: icc %p (disconnecting = %d), so %p",
		    __func__, icc, ic->ic_disconnecting, so);
		ICL_CONN_UNLOCK(ic);
		return;
	}
	ic->ic_disconnecting = true;

	/* These are unused in this driver right now. */
	MPASS(STAILQ_EMPTY(&ic->ic_to_send));
	MPASS(ic->ic_receive_pdu == NULL);

#ifdef DIAGNOSTIC
	KASSERT(ic->ic_outstanding_pdus == 0,
	    ("destroying session with %d outstanding PDUs",
	    ic->ic_outstanding_pdus));
#endif
	ICL_CONN_UNLOCK(ic);

	CTR3(KTR_CXGBE, "%s: tid %d, icc %p", __func__, toep ? toep->tid : -1,
	    icc);
	inp = sotoinpcb(so);
	sb = &so->so_rcv;
	INP_WLOCK(inp);
	if (toep != NULL) {	/* NULL if connection was never offloaded. */
		toep->ulpcb = NULL;
		mbufq_drain(&toep->ulp_pduq);
		SOCKBUF_LOCK(sb);
		if (icc->rx_flags & RXF_ACTIVE) {
			volatile u_int *p = &icc->rx_flags;

			SOCKBUF_UNLOCK(sb);
			INP_WUNLOCK(inp);

			while (*p & RXF_ACTIVE)
				pause("conclo", 1);

			INP_WLOCK(inp);
			SOCKBUF_LOCK(sb);
		}

		while (!STAILQ_EMPTY(&icc->rcvd_pdus)) {
			ip = STAILQ_FIRST(&icc->rcvd_pdus);
			STAILQ_REMOVE_HEAD(&icc->rcvd_pdus, ip_next);
			icl_cxgbei_conn_pdu_free(ic, ip);
		}
		SOCKBUF_UNLOCK(sb);
	}
	INP_WUNLOCK(inp);

	ICL_CONN_LOCK(ic);
	ic->ic_socket = NULL;
	ICL_CONN_UNLOCK(ic);

	/*
	 * XXXNP: we should send RST instead of FIN when PDUs held in various
	 * queues were purged instead of delivered reliably but soabort isn't
	 * really general purpose and wouldn't do the right thing here.
	 */
	soclose(so);
}

int
icl_cxgbei_conn_task_setup(struct icl_conn *ic, struct icl_pdu *ip,
    struct ccb_scsiio *csio, uint32_t *task_tagp, void **prvp)
{
	void *prv;

	*task_tagp = icl_conn_build_tasktag(ic, *task_tagp);

	prv = uma_zalloc(icl_transfer_zone, M_NOWAIT | M_ZERO);
	if (prv == NULL)
		return (ENOMEM);

	*prvp = prv;

	cxgbei_conn_task_reserve_itt(ic, prvp, csio, task_tagp);

	return (0);
}

void
icl_cxgbei_conn_task_done(struct icl_conn *ic, void *prv)
{

	cxgbei_cleanup_task(ic, prv);
	uma_zfree(icl_transfer_zone, prv);
}

int
icl_cxgbei_conn_transfer_setup(struct icl_conn *ic, union ctl_io *io,
    uint32_t *transfer_tag, void **prvp)
{
	void *prv;

	*transfer_tag = icl_conn_build_tasktag(ic, *transfer_tag);

	prv = uma_zalloc(icl_transfer_zone, M_NOWAIT | M_ZERO);
	if (prv == NULL)
		return (ENOMEM);

	*prvp = prv;

	cxgbei_conn_transfer_reserve_ttt(ic, prvp, io, transfer_tag);

	return (0);
}

void
icl_cxgbei_conn_transfer_done(struct icl_conn *ic, void *prv)
{

	cxgbei_cleanup_task(ic, prv);
	uma_zfree(icl_transfer_zone, prv);
}

static void
cxgbei_limits(struct adapter *sc, void *arg)
{
	struct icl_drv_limits *idl = arg;
	struct cxgbei_data *ci;
	int max_dsl;

	if (begin_synchronized_op(sc, NULL, HOLD_LOCK, "t4lims") != 0)
		return;

	if (uld_active(sc, ULD_ISCSI)) {
		ci = sc->iscsi_ulp_softc;
		MPASS(ci != NULL);

		/*
		 * AHS is not supported by the kernel so we'll not account for
		 * it either in our PDU len -> data segment len conversions.
		 */

		max_dsl = ci->max_rx_pdu_len - ISCSI_BHS_SIZE -
		    ISCSI_HEADER_DIGEST_SIZE - ISCSI_DATA_DIGEST_SIZE;
		if (idl->idl_max_recv_data_segment_length > max_dsl)
			idl->idl_max_recv_data_segment_length = max_dsl;

		max_dsl = ci->max_tx_pdu_len - ISCSI_BHS_SIZE -
		    ISCSI_HEADER_DIGEST_SIZE - ISCSI_DATA_DIGEST_SIZE;
		if (idl->idl_max_send_data_segment_length > max_dsl)
			idl->idl_max_send_data_segment_length = max_dsl;
	}

	end_synchronized_op(sc, LOCK_HELD);
}
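/*
 * Worked example (illustrative numbers only): with a hardware limit of
 * max_rx_pdu_len = 16384, the largest advertisable receive data segment is
 * 16384 - 48 (BHS) - 4 (header digest) - 4 (data digest) = 16328 bytes.
 * The RFC maximum below, (1 << 24) - 1, is simply the largest value the
 * 24-bit DataSegmentLength field in the BHS can carry.
 */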
static int
icl_cxgbei_limits(struct icl_drv_limits *idl)
{

	/* Maximum allowed by the RFC.  cxgbei_limits will clip them. */
	idl->idl_max_recv_data_segment_length = (1 << 24) - 1;
	idl->idl_max_send_data_segment_length = (1 << 24) - 1;

	/* These are somewhat arbitrary. */
	idl->idl_max_burst_length = 2 * 1024 * 1024;
	idl->idl_first_burst_length = 8192;

	t4_iterate(cxgbei_limits, idl);

	return (0);
}

int
icl_cxgbei_mod_load(void)
{
	int rc;

	icl_transfer_zone = uma_zcreate("icl_transfer",
	    16 * 1024, NULL, NULL, NULL, NULL,
	    UMA_ALIGN_PTR, 0);

	refcount_init(&icl_cxgbei_ncons, 0);

	rc = icl_register("cxgbei", false, -100, icl_cxgbei_limits,
	    icl_cxgbei_new_conn);

	return (rc);
}

int
icl_cxgbei_mod_unload(void)
{

	if (icl_cxgbei_ncons != 0)
		return (EBUSY);

	icl_unregister("cxgbei", false);

	uma_zdestroy(icl_transfer_zone);

	return (0);
}
#endif