/*-
 * Copyright (c) 2012 The FreeBSD Foundation
 * Copyright (c) 2015 Chelsio Communications, Inc.
 * All rights reserved.
 *
 * This software was developed by Edward Tomasz Napierala under sponsorship
 * from the FreeBSD Foundation.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 */

/*
 * cxgbei implementation of iSCSI Common Layer kobj(9) interface.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_inet.h"
#include "opt_inet6.h"

#ifdef TCP_OFFLOAD
#include <sys/param.h>
#include <sys/capsicum.h>
#include <sys/condvar.h>
#include <sys/conf.h>
#include <sys/file.h>
#include <sys/kernel.h>
#include <sys/kthread.h>
#include <sys/lock.h>
#include <sys/mbuf.h>
#include <sys/mutex.h>
#include <sys/module.h>
#include <sys/protosw.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/sysctl.h>
#include <sys/systm.h>
#include <sys/sx.h>
#include <sys/uio.h>
#include <machine/bus.h>
#include <vm/uma.h>
#include <netinet/in.h>
#include <netinet/in_pcb.h>
#include <netinet/tcp.h>
#include <netinet/tcp_var.h>
#include <netinet/toecore.h>

#include <dev/iscsi/icl.h>
#include <dev/iscsi/iscsi_proto.h>
#include <icl_conn_if.h>

#include "common/common.h"
#include "tom/t4_tom.h"
#include "cxgbei.h"

SYSCTL_NODE(_kern_icl, OID_AUTO, cxgbei, CTLFLAG_RD, 0, "Chelsio iSCSI offload");
static int coalesce = 1;
SYSCTL_INT(_kern_icl_cxgbei, OID_AUTO, coalesce, CTLFLAG_RWTUN,
    &coalesce, 0, "Try to coalesce PDUs before sending");
static int partial_receive_len = 128 * 1024;
SYSCTL_INT(_kern_icl_cxgbei, OID_AUTO, partial_receive_len, CTLFLAG_RWTUN,
    &partial_receive_len, 0, "Minimum read size for partially received "
    "data segment");
static int sendspace = 1048576;
SYSCTL_INT(_kern_icl_cxgbei, OID_AUTO, sendspace, CTLFLAG_RWTUN,
    &sendspace, 0, "Default send socket buffer size");
static int recvspace = 1048576;
SYSCTL_INT(_kern_icl_cxgbei, OID_AUTO, recvspace, CTLFLAG_RWTUN,
    &recvspace, 0, "Default receive socket buffer size");

static uma_zone_t icl_transfer_zone;
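/*
 * Number of connections currently using this offload driver; module unload
 * is refused with EBUSY while it is non-zero (see icl_cxgbei_unload()).
 */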
static volatile u_int icl_cxgbei_ncons;

#define ICL_CONN_LOCK(X)		mtx_lock(X->ic_lock)
#define ICL_CONN_UNLOCK(X)		mtx_unlock(X->ic_lock)
#define ICL_CONN_LOCK_ASSERT(X)		mtx_assert(X->ic_lock, MA_OWNED)
#define ICL_CONN_LOCK_ASSERT_NOT(X)	mtx_assert(X->ic_lock, MA_NOTOWNED)

struct icl_pdu *icl_cxgbei_new_pdu(int);
void icl_cxgbei_new_pdu_set_conn(struct icl_pdu *, struct icl_conn *);

static icl_conn_new_pdu_t	icl_cxgbei_conn_new_pdu;
icl_conn_pdu_free_t	icl_cxgbei_conn_pdu_free;
static icl_conn_pdu_data_segment_length_t
				icl_cxgbei_conn_pdu_data_segment_length;
static icl_conn_pdu_append_data_t	icl_cxgbei_conn_pdu_append_data;
static icl_conn_pdu_get_data_t	icl_cxgbei_conn_pdu_get_data;
static icl_conn_pdu_queue_t	icl_cxgbei_conn_pdu_queue;
static icl_conn_handoff_t	icl_cxgbei_conn_handoff;
static icl_conn_free_t		icl_cxgbei_conn_free;
static icl_conn_close_t		icl_cxgbei_conn_close;
static icl_conn_task_setup_t	icl_cxgbei_conn_task_setup;
static icl_conn_task_done_t	icl_cxgbei_conn_task_done;
static icl_conn_transfer_setup_t	icl_cxgbei_conn_transfer_setup;
static icl_conn_transfer_done_t	icl_cxgbei_conn_transfer_done;

static kobj_method_t icl_cxgbei_methods[] = {
	KOBJMETHOD(icl_conn_new_pdu, icl_cxgbei_conn_new_pdu),
	KOBJMETHOD(icl_conn_pdu_free, icl_cxgbei_conn_pdu_free),
	KOBJMETHOD(icl_conn_pdu_data_segment_length,
	    icl_cxgbei_conn_pdu_data_segment_length),
	KOBJMETHOD(icl_conn_pdu_append_data, icl_cxgbei_conn_pdu_append_data),
	KOBJMETHOD(icl_conn_pdu_get_data, icl_cxgbei_conn_pdu_get_data),
	KOBJMETHOD(icl_conn_pdu_queue, icl_cxgbei_conn_pdu_queue),
	KOBJMETHOD(icl_conn_handoff, icl_cxgbei_conn_handoff),
	KOBJMETHOD(icl_conn_free, icl_cxgbei_conn_free),
	KOBJMETHOD(icl_conn_close, icl_cxgbei_conn_close),
	KOBJMETHOD(icl_conn_task_setup, icl_cxgbei_conn_task_setup),
	KOBJMETHOD(icl_conn_task_done, icl_cxgbei_conn_task_done),
	KOBJMETHOD(icl_conn_transfer_setup, icl_cxgbei_conn_transfer_setup),
	KOBJMETHOD(icl_conn_transfer_done, icl_cxgbei_conn_transfer_done),
	{ 0, 0 }
};

DEFINE_CLASS(icl_cxgbei, icl_cxgbei_methods, sizeof(struct icl_cxgbei_conn));

#if 0
/*
 * Subtract another 256 for AHS from MAX_DSL if AHS could be used.
 */
#define CXGBEI_MAX_PDU 16224
#define CXGBEI_MAX_DSL (CXGBEI_MAX_PDU - sizeof(struct iscsi_bhs) - 8)
#endif
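/*
 * Active limits: an 8K data segment plus the 48-byte BHS and up to 8 bytes
 * of header and data digests (4 bytes each) gives a maximum PDU of
 * 8192 + 48 + 8 = 8248 bytes.  Padding does not need to be accounted for
 * here because the maximum data segment length is already a multiple of 4.
 */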
#define CXGBEI_MAX_DSL 8192
#define CXGBEI_MAX_PDU (CXGBEI_MAX_DSL + sizeof(struct iscsi_bhs) + 8)

void
icl_cxgbei_conn_pdu_free(struct icl_conn *ic, struct icl_pdu *ip)
{
#ifdef INVARIANTS
	struct icl_cxgbei_pdu *icp = ip_to_icp(ip);
#endif

	MPASS(icp->icp_signature == CXGBEI_PDU_SIGNATURE);
	MPASS(ic == ip->ip_conn);
	MPASS(ip->ip_bhs_mbuf != NULL);

	m_freem(ip->ip_ahs_mbuf);
	m_freem(ip->ip_data_mbuf);
	m_freem(ip->ip_bhs_mbuf);	/* storage for icl_cxgbei_pdu itself */

#ifdef DIAGNOSTIC
	if (__predict_true(ic != NULL))
		refcount_release(&ic->ic_outstanding_pdus);
#endif
}

struct icl_pdu *
icl_cxgbei_new_pdu(int flags)
{
	struct icl_cxgbei_pdu *icp;
	struct icl_pdu *ip;
	struct mbuf *m;
	uintptr_t a;

	m = m_gethdr(flags, MT_DATA);
	if (__predict_false(m == NULL))
		return (NULL);

	/*
	 * Both the icl_cxgbei_pdu and the BHS it points to are carved out
	 * of the mbuf's own data area; see the INVARIANTS check below.
	 */
	a = roundup2(mtod(m, uintptr_t), _Alignof(struct icl_cxgbei_pdu));
	icp = (struct icl_cxgbei_pdu *)a;
	bzero(icp, sizeof(*icp));

	icp->icp_signature = CXGBEI_PDU_SIGNATURE;
	ip = &icp->ip;
	ip->ip_bhs_mbuf = m;

	a = roundup2((uintptr_t)(icp + 1), _Alignof(struct iscsi_bhs *));
	ip->ip_bhs = (struct iscsi_bhs *)a;
#ifdef INVARIANTS
	/* Everything must fit entirely in the mbuf. */
	a = (uintptr_t)(ip->ip_bhs + 1);
	MPASS(a <= (uintptr_t)m + MSIZE);
#endif
	bzero(ip->ip_bhs, sizeof(*ip->ip_bhs));

	m->m_data = (void *)ip->ip_bhs;
	m->m_len = sizeof(struct iscsi_bhs);
	m->m_pkthdr.len = m->m_len;

	return (ip);
}

void
icl_cxgbei_new_pdu_set_conn(struct icl_pdu *ip, struct icl_conn *ic)
{

	ip->ip_conn = ic;
#ifdef DIAGNOSTIC
	refcount_acquire(&ic->ic_outstanding_pdus);
#endif
}

/*
 * Allocate an icl_pdu with an empty BHS, to be filled in by the caller.
 */
static struct icl_pdu *
icl_cxgbei_conn_new_pdu(struct icl_conn *ic, int flags)
{
	struct icl_pdu *ip;

	ip = icl_cxgbei_new_pdu(flags);
	if (__predict_false(ip == NULL))
		return (NULL);
	icl_cxgbei_new_pdu_set_conn(ip, ic);

	return (ip);
}

static size_t
icl_pdu_data_segment_length(const struct icl_pdu *request)
{
	uint32_t len = 0;

	len += request->ip_bhs->bhs_data_segment_len[0];
	len <<= 8;
	len += request->ip_bhs->bhs_data_segment_len[1];
	len <<= 8;
	len += request->ip_bhs->bhs_data_segment_len[2];

	return (len);
}

size_t
icl_cxgbei_conn_pdu_data_segment_length(struct icl_conn *ic,
    const struct icl_pdu *request)
{

	return (icl_pdu_data_segment_length(request));
}

static uint32_t
icl_conn_build_tasktag(struct icl_conn *ic, uint32_t tag)
{
	return (tag);
}
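/*
 * Turn a PDU into an mbuf chain ready for transmission: zero-pad the data
 * segment to a 4-byte boundary, store the data segment length in the BHS
 * (a 24-bit big-endian field), link the data mbuf(s) after the header mbuf,
 * and record the ULP submode (CRC flags) in the leading mbuf.
 */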
static struct mbuf *
finalize_pdu(struct icl_cxgbei_conn *icc, struct icl_cxgbei_pdu *icp)
{
	struct icl_pdu *ip = &icp->ip;
	uint8_t ulp_submode, padding;
	struct mbuf *m, *last;
	struct iscsi_bhs *bhs;

	/*
	 * Fix up the data segment mbuf first.
	 */
	m = ip->ip_data_mbuf;
	ulp_submode = icc->ulp_submode;
	if (m) {
		last = m_last(m);

		/*
		 * Round up the data segment to a 4B boundary.  Pad with 0 if
		 * necessary.  There will definitely be room in the mbuf.
		 */
		padding = roundup2(ip->ip_data_len, 4) - ip->ip_data_len;
		if (padding) {
			bzero(mtod(last, uint8_t *) + last->m_len, padding);
			last->m_len += padding;
		}
	} else {
		MPASS(ip->ip_data_len == 0);
		ulp_submode &= ~ULP_CRC_DATA;
		padding = 0;
	}

	/*
	 * Now the header mbuf that has the BHS.
	 */
	m = ip->ip_bhs_mbuf;
	MPASS(m->m_pkthdr.len == sizeof(struct iscsi_bhs));
	MPASS(m->m_len == sizeof(struct iscsi_bhs));

	bhs = ip->ip_bhs;
	bhs->bhs_data_segment_len[2] = ip->ip_data_len;
	bhs->bhs_data_segment_len[1] = ip->ip_data_len >> 8;
	bhs->bhs_data_segment_len[0] = ip->ip_data_len >> 16;

	/* "Convert" PDU to mbuf chain.  Do not use icp/ip after this. */
	m->m_pkthdr.len = sizeof(struct iscsi_bhs) + ip->ip_data_len + padding;
	m->m_next = ip->ip_data_mbuf;
	set_mbuf_ulp_submode(m, ulp_submode);
#ifdef INVARIANTS
	bzero(icp, sizeof(*icp));
#endif
#ifdef DIAGNOSTIC
	refcount_release(&icc->ic.ic_outstanding_pdus);
#endif

	return (m);
}

int
icl_cxgbei_conn_pdu_append_data(struct icl_conn *ic, struct icl_pdu *ip,
    const void *addr, size_t len, int flags)
{
	struct mbuf *m;
#ifdef INVARIANTS
	struct icl_cxgbei_pdu *icp = ip_to_icp(ip);
#endif

	MPASS(icp->icp_signature == CXGBEI_PDU_SIGNATURE);
	MPASS(ic == ip->ip_conn);
	KASSERT(len > 0, ("%s: len is %jd", __func__, (intmax_t)len));

	m = ip->ip_data_mbuf;
	if (m == NULL) {
		m = m_getjcl(M_NOWAIT, MT_DATA, 0, MJUM16BYTES);
		if (__predict_false(m == NULL))
			return (ENOMEM);

		ip->ip_data_mbuf = m;
	}

	if (__predict_true(m_append(m, len, addr) != 0)) {
		ip->ip_data_len += len;
		MPASS(ip->ip_data_len <= CXGBEI_MAX_DSL);
		return (0);
	} else {
		if (flags & M_WAITOK) {
			CXGBE_UNIMPLEMENTED("fail safe append");
		}
		ip->ip_data_len = m_length(m, NULL);
		return (1);
	}
}

void
icl_cxgbei_conn_pdu_get_data(struct icl_conn *ic, struct icl_pdu *ip,
    size_t off, void *addr, size_t len)
{
	struct icl_cxgbei_pdu *icp = ip_to_icp(ip);

	if (icp->pdu_flags & SBUF_ULP_FLAG_DATA_DDPED)
		return;		/* data is DDP'ed, no need to copy */
	m_copydata(ip->ip_data_mbuf, off, len, addr);
}
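/*
 * Queue a finalized PDU for transmission.  The mbuf chain is placed on the
 * toepcb's ULP PDU queue and the TOE is kicked via t4_push_pdus(); the
 * hardware, not the host TCP stack, puts the PDU on the wire.
 */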
void
icl_cxgbei_conn_pdu_queue(struct icl_conn *ic, struct icl_pdu *ip)
{
	struct icl_cxgbei_conn *icc = ic_to_icc(ic);
	struct icl_cxgbei_pdu *icp = ip_to_icp(ip);
	struct socket *so = ic->ic_socket;
	struct toepcb *toep = icc->toep;
	struct inpcb *inp;
	struct mbuf *m;

	MPASS(ic == ip->ip_conn);
	MPASS(ip->ip_bhs_mbuf != NULL);
	/* The kernel doesn't generate PDUs with AHS. */
	MPASS(ip->ip_ahs_mbuf == NULL && ip->ip_ahs_len == 0);

	ICL_CONN_LOCK_ASSERT(ic);
	/* NOTE: sowriteable without so_snd lock is a mostly harmless race. */
	if (ic->ic_disconnecting || so == NULL || !sowriteable(so)) {
		icl_cxgbei_conn_pdu_free(ic, ip);
		return;
	}

	m = finalize_pdu(icc, icp);
	M_ASSERTPKTHDR(m);
	MPASS((m->m_pkthdr.len & 3) == 0);
	MPASS(m->m_pkthdr.len + 8 <= CXGBEI_MAX_PDU);

	/*
	 * Do not get inp from toep->inp as the toepcb might have detached
	 * already.
	 */
	inp = sotoinpcb(so);
	INP_WLOCK(inp);
	if (__predict_false(inp->inp_flags & (INP_DROPPED | INP_TIMEWAIT)) ||
	    __predict_false((toep->flags & TPF_ATTACHED) == 0))
		m_freem(m);
	else {
		mbufq_enqueue(&toep->ulp_pduq, m);
		t4_push_pdus(icc->sc, toep, 0);
	}
	INP_WUNLOCK(inp);
}

static struct icl_conn *
icl_cxgbei_new_conn(const char *name, struct mtx *lock)
{
	struct icl_cxgbei_conn *icc;
	struct icl_conn *ic;

	refcount_acquire(&icl_cxgbei_ncons);

	icc = (struct icl_cxgbei_conn *)kobj_create(&icl_cxgbei_class, M_CXGBE,
	    M_WAITOK | M_ZERO);
	icc->icc_signature = CXGBEI_CONN_SIGNATURE;
	STAILQ_INIT(&icc->rcvd_pdus);

	ic = &icc->ic;
	ic->ic_lock = lock;

	/* XXXNP: review.  Most of these icl_conn fields aren't really used. */
	STAILQ_INIT(&ic->ic_to_send);
	cv_init(&ic->ic_send_cv, "icl_cxgbei_tx");
	cv_init(&ic->ic_receive_cv, "icl_cxgbei_rx");
#ifdef DIAGNOSTIC
	refcount_init(&ic->ic_outstanding_pdus, 0);
#endif
	ic->ic_max_data_segment_length = CXGBEI_MAX_DSL;
	ic->ic_name = name;
	ic->ic_offload = "cxgbei";
	ic->ic_unmapped = false;

	CTR2(KTR_CXGBE, "%s: icc %p", __func__, icc);

	return (ic);
}

void
icl_cxgbei_conn_free(struct icl_conn *ic)
{
	struct icl_cxgbei_conn *icc = ic_to_icc(ic);

	MPASS(icc->icc_signature == CXGBEI_CONN_SIGNATURE);

	CTR2(KTR_CXGBE, "%s: icc %p", __func__, icc);

	cv_destroy(&ic->ic_send_cv);
	cv_destroy(&ic->ic_receive_cv);

	kobj_delete((struct kobj *)icc, M_CXGBE);
	refcount_release(&icl_cxgbei_ncons);
}

static int
icl_cxgbei_setsockopt(struct icl_conn *ic, struct socket *so)
{
	size_t minspace;
	struct sockopt opt;
	int error, one = 1;

	/*
	 * For sendspace, this is required because the current code cannot
	 * send a PDU in pieces; thus, the minimum buffer size is equal
	 * to the maximum PDU size.  "+4" is to account for possible padding.
	 *
	 * What we should actually do here is to use autoscaling, but set
	 * some minimal buffer size to "minspace".  I don't know a way to do
	 * that, though.
	 */
	minspace = sizeof(struct iscsi_bhs) + ic->ic_max_data_segment_length +
	    ISCSI_HEADER_DIGEST_SIZE + ISCSI_DATA_DIGEST_SIZE + 4;
	if (sendspace < minspace)
		sendspace = minspace;
	if (recvspace < minspace)
		recvspace = minspace;

	error = soreserve(so, sendspace, recvspace);
	if (error != 0) {
		icl_cxgbei_conn_close(ic);
		return (error);
	}
	SOCKBUF_LOCK(&so->so_snd);
	so->so_snd.sb_flags |= SB_AUTOSIZE;
	SOCKBUF_UNLOCK(&so->so_snd);
	SOCKBUF_LOCK(&so->so_rcv);
	so->so_rcv.sb_flags |= SB_AUTOSIZE;
	SOCKBUF_UNLOCK(&so->so_rcv);

	/*
	 * Disable Nagle.
	 */
	bzero(&opt, sizeof(opt));
	opt.sopt_dir = SOPT_SET;
	opt.sopt_level = IPPROTO_TCP;
	opt.sopt_name = TCP_NODELAY;
	opt.sopt_val = &one;
	opt.sopt_valsize = sizeof(one);
	error = sosetopt(so, &opt);
	if (error != 0) {
		icl_cxgbei_conn_close(ic);
		return (error);
	}

	return (0);
}

/*
 * Request/response structure used to find out the adapter offloading a socket.
 */
struct find_ofld_adapter_rr {
	struct socket *so;
	struct adapter *sc;	/* result */
};
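/*
 * t4_iterate() callback, invoked once for each adapter.  Claims the first
 * adapter whose TOE is handling this connection (TF_TOE set and the tcpcb's
 * TOE pointer matching this adapter's) by storing it in fa->sc.
 */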
static void
find_offload_adapter(struct adapter *sc, void *arg)
{
	struct find_ofld_adapter_rr *fa = arg;
	struct socket *so = fa->so;
	struct tom_data *td = sc->tom_softc;
	struct tcpcb *tp;
	struct inpcb *inp;

	/* Non-TCP sockets were filtered out earlier. */
	MPASS(so->so_proto->pr_protocol == IPPROTO_TCP);

	if (fa->sc != NULL)
		return;	/* Found already. */

	if (td == NULL)
		return;	/* TOE not enabled on this adapter. */

	inp = sotoinpcb(so);
	INP_WLOCK(inp);
	if ((inp->inp_flags & (INP_DROPPED | INP_TIMEWAIT)) == 0) {
		tp = intotcpcb(inp);
		if (tp->t_flags & TF_TOE && tp->tod == &td->tod)
			fa->sc = sc;	/* Found. */
	}
	INP_WUNLOCK(inp);
}

/* XXXNP: move this to t4_tom. */
static void
send_iscsi_flowc_wr(struct adapter *sc, struct toepcb *toep, int maxlen)
{
	struct wrqe *wr;
	struct fw_flowc_wr *flowc;
	const u_int nparams = 1;
	u_int flowclen;
	struct ofld_tx_sdesc *txsd = &toep->txsd[toep->txsd_pidx];

	flowclen = sizeof(*flowc) + nparams * sizeof(struct fw_flowc_mnemval);

	wr = alloc_wrqe(roundup2(flowclen, 16), toep->ofld_txq);
	if (wr == NULL) {
		/* XXX */
		panic("%s: allocation failure.", __func__);
	}
	flowc = wrtod(wr);
	memset(flowc, 0, wr->wr_len);

	flowc->op_to_nparams = htobe32(V_FW_WR_OP(FW_FLOWC_WR) |
	    V_FW_FLOWC_WR_NPARAMS(nparams));
	flowc->flowid_len16 = htonl(V_FW_WR_LEN16(howmany(flowclen, 16)) |
	    V_FW_WR_FLOWID(toep->tid));

	flowc->mnemval[0].mnemonic = FW_FLOWC_MNEM_TXDATAPLEN_MAX;
	flowc->mnemval[0].val = htobe32(maxlen);

	txsd->tx_credits = howmany(flowclen, 16);
	txsd->plen = 0;
	KASSERT(toep->tx_credits >= txsd->tx_credits && toep->txsd_avail > 0,
	    ("%s: not enough credits (%d)", __func__, toep->tx_credits));
	toep->tx_credits -= txsd->tx_credits;
	if (__predict_false(++toep->txsd_pidx == toep->txsd_total))
		toep->txsd_pidx = 0;
	toep->txsd_avail--;

	t4_wrq_tx(sc, wr);
}

static void
set_ulp_mode_iscsi(struct adapter *sc, struct toepcb *toep, int hcrc, int dcrc)
{
	uint64_t val = 0;

	if (hcrc)
		val |= ULP_CRC_HEADER;
	if (dcrc)
		val |= ULP_CRC_DATA;
	val <<= 4;
	val |= ULP_MODE_ISCSI;

	CTR4(KTR_CXGBE, "%s: tid %u, ULP_MODE_ISCSI, CRC hdr=%d data=%d",
	    __func__, toep->tid, hcrc, dcrc);

	t4_set_tcb_field(sc, toep, 1, 0, 0xfff, val);
}
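/*
 * Take over a TOE-offloaded TCP connection from userland: claim the socket
 * behind the file descriptor, locate the adapter offloading it, size the
 * socket buffers, and switch the toepcb to iSCSI ULP mode so the hardware
 * starts framing PDUs (and checking CRCs, if negotiated).
 */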
/*
 * XXXNP: Who is responsible for cleaning up the socket if this returns with an
 * error?  Review all error paths.
 *
 * XXXNP: What happens to the socket's fd reference if the operation is
 * successful, and how does that affect the socket's life cycle?
 */
int
icl_cxgbei_conn_handoff(struct icl_conn *ic, int fd)
{
	struct icl_cxgbei_conn *icc = ic_to_icc(ic);
	struct find_ofld_adapter_rr fa;
	struct file *fp;
	struct socket *so;
	struct inpcb *inp;
	struct tcpcb *tp;
	struct toepcb *toep;
	cap_rights_t rights;
	int error;

	MPASS(icc->icc_signature == CXGBEI_CONN_SIGNATURE);
	ICL_CONN_LOCK_ASSERT_NOT(ic);

	/*
	 * Steal the socket from userland.
	 */
	error = fget(curthread, fd,
	    cap_rights_init(&rights, CAP_SOCK_CLIENT), &fp);
	if (error != 0)
		return (error);
	if (fp->f_type != DTYPE_SOCKET) {
		fdrop(fp, curthread);
		return (EINVAL);
	}
	so = fp->f_data;
	if (so->so_type != SOCK_STREAM ||
	    so->so_proto->pr_protocol != IPPROTO_TCP) {
		fdrop(fp, curthread);
		return (EINVAL);
	}

	ICL_CONN_LOCK(ic);
	if (ic->ic_socket != NULL) {
		ICL_CONN_UNLOCK(ic);
		fdrop(fp, curthread);
		return (EBUSY);
	}
	ic->ic_disconnecting = false;
	ic->ic_socket = so;
	fp->f_ops = &badfileops;
	fp->f_data = NULL;
	fdrop(fp, curthread);
	ICL_CONN_UNLOCK(ic);

	/* Find the adapter offloading this socket. */
	fa.sc = NULL;
	fa.so = so;
	t4_iterate(find_offload_adapter, &fa);
	if (fa.sc == NULL)
		return (EINVAL);
	icc->sc = fa.sc;

	error = icl_cxgbei_setsockopt(ic, so);
	if (error)
		return (error);

	inp = sotoinpcb(so);
	INP_WLOCK(inp);
	tp = intotcpcb(inp);
	if (inp->inp_flags & (INP_DROPPED | INP_TIMEWAIT))
		error = EBUSY;
	else {
		/*
		 * The socket cannot have been "unoffloaded" if we got here.
		 */
		MPASS(tp->t_flags & TF_TOE);
		MPASS(tp->tod != NULL);
		MPASS(tp->t_toe != NULL);
		toep = tp->t_toe;
		MPASS(toep->vi->pi->adapter == icc->sc);
		icc->toep = toep;
		icc->cwt = cxgbei_select_worker_thread(icc);
		icc->ulp_submode = 0;
		if (ic->ic_header_crc32c)
			icc->ulp_submode |= ULP_CRC_HEADER;
		if (ic->ic_data_crc32c)
			icc->ulp_submode |= ULP_CRC_DATA;
		so->so_options |= SO_NO_DDP;
		toep->ulp_mode = ULP_MODE_ISCSI;
		toep->ulpcb = icc;

		send_iscsi_flowc_wr(icc->sc, toep, CXGBEI_MAX_PDU);
		set_ulp_mode_iscsi(icc->sc, toep, ic->ic_header_crc32c,
		    ic->ic_data_crc32c);
		error = 0;
	}
	INP_WUNLOCK(inp);

	return (error);
}
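/*
 * Tear down a connection: mark it as disconnecting under the connection
 * lock, discard any PDUs still queued for transmission, wait for the rx
 * worker thread to let go of the connection (RXF_ACTIVE), free received
 * but undelivered PDUs, and only then close the socket.
 */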
void
icl_cxgbei_conn_close(struct icl_conn *ic)
{
	struct icl_cxgbei_conn *icc = ic_to_icc(ic);
	struct icl_pdu *ip;
	struct socket *so;
	struct sockbuf *sb;
	struct inpcb *inp;
	struct toepcb *toep = icc->toep;

	MPASS(icc->icc_signature == CXGBEI_CONN_SIGNATURE);
	ICL_CONN_LOCK_ASSERT_NOT(ic);

	ICL_CONN_LOCK(ic);
	so = ic->ic_socket;
	if (ic->ic_disconnecting || so == NULL) {
		CTR4(KTR_CXGBE, "%s: icc %p (disconnecting = %d), so %p",
		    __func__, icc, ic->ic_disconnecting, so);
		ICL_CONN_UNLOCK(ic);
		return;
	}
	ic->ic_disconnecting = true;

	/* These are unused in this driver right now. */
	MPASS(STAILQ_EMPTY(&ic->ic_to_send));
	MPASS(ic->ic_receive_pdu == NULL);

#ifdef DIAGNOSTIC
	KASSERT(ic->ic_outstanding_pdus == 0,
	    ("destroying session with %d outstanding PDUs",
	    ic->ic_outstanding_pdus));
#endif
	ICL_CONN_UNLOCK(ic);

	CTR3(KTR_CXGBE, "%s: tid %d, icc %p", __func__, toep ? toep->tid : -1,
	    icc);
	inp = sotoinpcb(so);
	sb = &so->so_rcv;
	INP_WLOCK(inp);
	if (toep != NULL) {	/* NULL if connection was never offloaded. */
		toep->ulpcb = NULL;
		mbufq_drain(&toep->ulp_pduq);
		SOCKBUF_LOCK(sb);
		if (icc->rx_flags & RXF_ACTIVE) {
			volatile u_int *p = &icc->rx_flags;

			SOCKBUF_UNLOCK(sb);
			INP_WUNLOCK(inp);

			while (*p & RXF_ACTIVE)
				pause("conclo", 1);

			INP_WLOCK(inp);
			SOCKBUF_LOCK(sb);
		}

		while (!STAILQ_EMPTY(&icc->rcvd_pdus)) {
			ip = STAILQ_FIRST(&icc->rcvd_pdus);
			STAILQ_REMOVE_HEAD(&icc->rcvd_pdus, ip_next);
			icl_cxgbei_conn_pdu_free(ic, ip);
		}
		SOCKBUF_UNLOCK(sb);
	}
	INP_WUNLOCK(inp);

	ICL_CONN_LOCK(ic);
	ic->ic_socket = NULL;
	ICL_CONN_UNLOCK(ic);

	/*
	 * XXXNP: we should send RST instead of FIN when PDUs held in various
	 * queues were purged instead of delivered reliably but soabort isn't
	 * really general purpose and wouldn't do the right thing here.
	 */
	soclose(so);
}

int
icl_cxgbei_conn_task_setup(struct icl_conn *ic, struct icl_pdu *ip,
    struct ccb_scsiio *csio, uint32_t *task_tagp, void **prvp)
{
	void *prv;

	*task_tagp = icl_conn_build_tasktag(ic, *task_tagp);

	prv = uma_zalloc(icl_transfer_zone, M_NOWAIT | M_ZERO);
	if (prv == NULL)
		return (ENOMEM);

	*prvp = prv;

	cxgbei_conn_task_reserve_itt(ic, prvp, csio, task_tagp);

	return (0);
}

void
icl_cxgbei_conn_task_done(struct icl_conn *ic, void *prv)
{

	cxgbei_cleanup_task(ic, prv);
	uma_zfree(icl_transfer_zone, prv);
}

int
icl_cxgbei_conn_transfer_setup(struct icl_conn *ic, union ctl_io *io,
    uint32_t *transfer_tag, void **prvp)
{
	void *prv;

	*transfer_tag = icl_conn_build_tasktag(ic, *transfer_tag);

	prv = uma_zalloc(icl_transfer_zone, M_NOWAIT | M_ZERO);
	if (prv == NULL)
		return (ENOMEM);

	*prvp = prv;

	cxgbei_conn_transfer_reserve_ttt(ic, prvp, io, transfer_tag);

	return (0);
}

void
icl_cxgbei_conn_transfer_done(struct icl_conn *ic, void *prv)
{
	cxgbei_cleanup_task(ic, prv);
	uma_zfree(icl_transfer_zone, prv);
}

static int
icl_cxgbei_limits(size_t *limitp)
{

	*limitp = CXGBEI_MAX_DSL;

	return (0);
}

static int
icl_cxgbei_load(void)
{
	int error;

	icl_transfer_zone = uma_zcreate("icl_transfer",
	    16 * 1024, NULL, NULL, NULL, NULL,
	    UMA_ALIGN_PTR, 0);

	refcount_init(&icl_cxgbei_ncons, 0);

	error = icl_register("cxgbei", false, -100, icl_cxgbei_limits,
	    icl_cxgbei_new_conn);
	KASSERT(error == 0, ("failed to register"));

	return (error);
}

static int
icl_cxgbei_unload(void)
{

	if (icl_cxgbei_ncons != 0)
		return (EBUSY);

	icl_unregister("cxgbei", false);

	uma_zdestroy(icl_transfer_zone);

	return (0);
}

static int
icl_cxgbei_modevent(module_t mod, int what, void *arg)
{

	switch (what) {
	case MOD_LOAD:
		return (icl_cxgbei_load());
	case MOD_UNLOAD:
		return (icl_cxgbei_unload());
	default:
		return (EINVAL);
	}
}

moduledata_t icl_cxgbei_data = {
	"icl_cxgbei",
	icl_cxgbei_modevent,
	0
};

DECLARE_MODULE(icl_cxgbei, icl_cxgbei_data, SI_SUB_DRIVERS, SI_ORDER_MIDDLE);
MODULE_DEPEND(icl_cxgbei, icl, 1, 1, 1);
MODULE_VERSION(icl_cxgbei, 1);
#endif