xref: /freebsd/sys/dev/iscsi/icl_soft.c (revision ce6a89e27cd190313be39bb479880aeda4778436)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3  *
4  * Copyright (c) 2012 The FreeBSD Foundation
5  * All rights reserved.
6  *
7  * This software was developed by Edward Tomasz Napierala under sponsorship
8  * from the FreeBSD Foundation.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  *
19  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
20  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
23  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
25  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
26  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
28  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
29  * SUCH DAMAGE.
30  *
31  */
32 
33 /*
34  * Software implementation of iSCSI Common Layer kobj(9) interface.
35  */
36 
37 #include <sys/cdefs.h>
38 __FBSDID("$FreeBSD$");
39 
40 #include <sys/param.h>
41 #include <sys/capsicum.h>
42 #include <sys/condvar.h>
43 #include <sys/conf.h>
44 #include <sys/gsb_crc32.h>
45 #include <sys/file.h>
46 #include <sys/kernel.h>
47 #include <sys/kthread.h>
48 #include <sys/lock.h>
49 #include <sys/mbuf.h>
50 #include <sys/mutex.h>
51 #include <sys/module.h>
52 #include <sys/protosw.h>
53 #include <sys/socket.h>
54 #include <sys/socketvar.h>
55 #include <sys/sysctl.h>
56 #include <sys/systm.h>
57 #include <sys/sx.h>
58 #include <sys/uio.h>
59 #include <vm/uma.h>
60 #include <netinet/in.h>
61 #include <netinet/tcp.h>
62 
63 #include <dev/iscsi/icl.h>
64 #include <dev/iscsi/iscsi_proto.h>
65 #include <icl_conn_if.h>
66 
/*
 * Tunables exported under the kern.icl sysctl node.
 */

/* Non-zero lets icl_conn_send_pdus() merge several queued PDUs per sosend(). */
static int coalesce = 1;
SYSCTL_INT(_kern_icl, OID_AUTO, coalesce, CTLFLAG_RWTUN,
    &coalesce, 0, "Try to coalesce PDUs before sending");
/*
 * Cap on the number of data segment bytes the receive path waits for
 * before reading; see icl_pdu_data_segment_receive_len().
 */
static int partial_receive_len = 128 * 1024;
SYSCTL_INT(_kern_icl, OID_AUTO, partial_receive_len, CTLFLAG_RWTUN,
    &partial_receive_len, 0, "Minimum read size for partially received "
    "data segment");
/*
 * NOTE(review): sendspace/recvspace are not referenced in this part of the
 * file; presumably applied when the socket is configured -- confirm.
 */
static int sendspace = 1048576;
SYSCTL_INT(_kern_icl, OID_AUTO, sendspace, CTLFLAG_RWTUN,
    &sendspace, 0, "Default send socket buffer size");
static int recvspace = 1048576;
SYSCTL_INT(_kern_icl, OID_AUTO, recvspace, CTLFLAG_RWTUN,
    &recvspace, 0, "Default receive socket buffer size");

static MALLOC_DEFINE(M_ICL_SOFT, "icl_soft", "iSCSI software backend");
/* UMA zone that struct icl_pdu objects are allocated from and freed to. */
static uma_zone_t icl_pdu_zone;

/* Bumped in icl_soft_new_conn() and dropped in icl_soft_conn_free(). */
static volatile u_int	icl_ncons;

/* Wrappers around the mutex that ic_lock points to. */
#define ICL_CONN_LOCK(X)		mtx_lock(X->ic_lock)
#define ICL_CONN_UNLOCK(X)		mtx_unlock(X->ic_lock)
#define ICL_CONN_LOCK_ASSERT(X)		mtx_assert(X->ic_lock, MA_OWNED)
#define ICL_CONN_LOCK_ASSERT_NOT(X)	mtx_assert(X->ic_lock, MA_NOTOWNED)

/* Head type for the singly-linked tail queues of PDUs awaiting transmit. */
STAILQ_HEAD(icl_pdu_stailq, icl_pdu);
92 
/*
 * Forward declarations of this backend's implementations of the icl_conn
 * methods; each is declared through the corresponding icl_conn_*_t type.
 */
static icl_conn_new_pdu_t	icl_soft_conn_new_pdu;
static icl_conn_pdu_free_t	icl_soft_conn_pdu_free;
static icl_conn_pdu_data_segment_length_t
				    icl_soft_conn_pdu_data_segment_length;
static icl_conn_pdu_append_data_t	icl_soft_conn_pdu_append_data;
static icl_conn_pdu_get_data_t	icl_soft_conn_pdu_get_data;
static icl_conn_pdu_queue_t	icl_soft_conn_pdu_queue;
static icl_conn_handoff_t	icl_soft_conn_handoff;
static icl_conn_free_t		icl_soft_conn_free;
static icl_conn_close_t		icl_soft_conn_close;
static icl_conn_task_setup_t	icl_soft_conn_task_setup;
static icl_conn_task_done_t	icl_soft_conn_task_done;
static icl_conn_transfer_setup_t	icl_soft_conn_transfer_setup;
static icl_conn_transfer_done_t	icl_soft_conn_transfer_done;
#ifdef ICL_KERNEL_PROXY
static icl_conn_connect_t	icl_soft_conn_connect;
#endif

/* Method table mapping each icl_conn method onto the functions above. */
static kobj_method_t icl_soft_methods[] = {
	KOBJMETHOD(icl_conn_new_pdu, icl_soft_conn_new_pdu),
	KOBJMETHOD(icl_conn_pdu_free, icl_soft_conn_pdu_free),
	KOBJMETHOD(icl_conn_pdu_data_segment_length,
	    icl_soft_conn_pdu_data_segment_length),
	KOBJMETHOD(icl_conn_pdu_append_data, icl_soft_conn_pdu_append_data),
	KOBJMETHOD(icl_conn_pdu_get_data, icl_soft_conn_pdu_get_data),
	KOBJMETHOD(icl_conn_pdu_queue, icl_soft_conn_pdu_queue),
	KOBJMETHOD(icl_conn_handoff, icl_soft_conn_handoff),
	KOBJMETHOD(icl_conn_free, icl_soft_conn_free),
	KOBJMETHOD(icl_conn_close, icl_soft_conn_close),
	KOBJMETHOD(icl_conn_task_setup, icl_soft_conn_task_setup),
	KOBJMETHOD(icl_conn_task_done, icl_soft_conn_task_done),
	KOBJMETHOD(icl_conn_transfer_setup, icl_soft_conn_transfer_setup),
	KOBJMETHOD(icl_conn_transfer_done, icl_soft_conn_transfer_done),
#ifdef ICL_KERNEL_PROXY
	KOBJMETHOD(icl_conn_connect, icl_soft_conn_connect),
#endif
	/* All-zero entry ends the table. */
	{ 0, 0 }
};

/* The icl_soft class; objects of it are sized as struct icl_conn. */
DEFINE_CLASS(icl_soft, icl_soft_methods, sizeof(struct icl_conn));
133 
134 static void
135 icl_conn_fail(struct icl_conn *ic)
136 {
137 	if (ic->ic_socket == NULL)
138 		return;
139 
140 	/*
141 	 * XXX
142 	 */
143 	ic->ic_socket->so_error = EDOOFUS;
144 	(ic->ic_error)(ic);
145 }
146 
147 static struct mbuf *
148 icl_conn_receive(struct icl_conn *ic, size_t len)
149 {
150 	struct uio uio;
151 	struct socket *so;
152 	struct mbuf *m;
153 	int error, flags;
154 
155 	so = ic->ic_socket;
156 
157 	memset(&uio, 0, sizeof(uio));
158 	uio.uio_resid = len;
159 
160 	flags = MSG_DONTWAIT;
161 	error = soreceive(so, NULL, &uio, &m, NULL, &flags);
162 	if (error != 0) {
163 		ICL_DEBUG("soreceive error %d", error);
164 		return (NULL);
165 	}
166 	if (uio.uio_resid != 0) {
167 		m_freem(m);
168 		ICL_DEBUG("short read");
169 		return (NULL);
170 	}
171 
172 	return (m);
173 }
174 
175 static int
176 icl_conn_receive_buf(struct icl_conn *ic, void *buf, size_t len)
177 {
178 	struct iovec iov[1];
179 	struct uio uio;
180 	struct socket *so;
181 	int error, flags;
182 
183 	so = ic->ic_socket;
184 
185 	memset(&uio, 0, sizeof(uio));
186 	iov[0].iov_base = buf;
187 	iov[0].iov_len = len;
188 	uio.uio_iov = iov;
189 	uio.uio_iovcnt = 1;
190 	uio.uio_offset = 0;
191 	uio.uio_resid = len;
192 	uio.uio_segflg = UIO_SYSSPACE;
193 	uio.uio_rw = UIO_READ;
194 
195 	flags = MSG_DONTWAIT;
196 	error = soreceive(so, NULL, &uio, NULL, NULL, &flags);
197 	if (error != 0) {
198 		ICL_DEBUG("soreceive error %d", error);
199 		return (-1);
200 	}
201 	if (uio.uio_resid != 0) {
202 		ICL_DEBUG("short read");
203 		return (-1);
204 	}
205 
206 	return (0);
207 }
208 
209 static void
210 icl_soft_conn_pdu_free(struct icl_conn *ic, struct icl_pdu *ip)
211 {
212 
213 	m_freem(ip->ip_bhs_mbuf);
214 	m_freem(ip->ip_ahs_mbuf);
215 	m_freem(ip->ip_data_mbuf);
216 	uma_zfree(icl_pdu_zone, ip);
217 #ifdef DIAGNOSTIC
218 	refcount_release(&ic->ic_outstanding_pdus);
219 #endif
220 }
221 
222 /*
223  * Allocate icl_pdu with empty BHS to fill up by the caller.
224  */
225 struct icl_pdu *
226 icl_soft_conn_new_pdu(struct icl_conn *ic, int flags)
227 {
228 	struct icl_pdu *ip;
229 
230 #ifdef DIAGNOSTIC
231 	refcount_acquire(&ic->ic_outstanding_pdus);
232 #endif
233 	ip = uma_zalloc(icl_pdu_zone, flags | M_ZERO);
234 	if (ip == NULL) {
235 		ICL_WARN("failed to allocate %zd bytes", sizeof(*ip));
236 #ifdef DIAGNOSTIC
237 		refcount_release(&ic->ic_outstanding_pdus);
238 #endif
239 		return (NULL);
240 	}
241 	ip->ip_conn = ic;
242 
243 	CTASSERT(sizeof(struct iscsi_bhs) <= MHLEN);
244 	ip->ip_bhs_mbuf = m_gethdr(flags, MT_DATA);
245 	if (ip->ip_bhs_mbuf == NULL) {
246 		ICL_WARN("failed to allocate BHS mbuf");
247 		icl_soft_conn_pdu_free(ic, ip);
248 		return (NULL);
249 	}
250 	ip->ip_bhs = mtod(ip->ip_bhs_mbuf, struct iscsi_bhs *);
251 	memset(ip->ip_bhs, 0, sizeof(struct iscsi_bhs));
252 	ip->ip_bhs_mbuf->m_len = sizeof(struct iscsi_bhs);
253 
254 	return (ip);
255 }
256 
257 static int
258 icl_pdu_ahs_length(const struct icl_pdu *request)
259 {
260 
261 	return (request->ip_bhs->bhs_total_ahs_len * 4);
262 }
263 
264 static size_t
265 icl_pdu_data_segment_length(const struct icl_pdu *request)
266 {
267 	uint32_t len = 0;
268 
269 	len += request->ip_bhs->bhs_data_segment_len[0];
270 	len <<= 8;
271 	len += request->ip_bhs->bhs_data_segment_len[1];
272 	len <<= 8;
273 	len += request->ip_bhs->bhs_data_segment_len[2];
274 
275 	return (len);
276 }
277 
278 size_t
279 icl_soft_conn_pdu_data_segment_length(struct icl_conn *ic,
280     const struct icl_pdu *request)
281 {
282 
283 	return (icl_pdu_data_segment_length(request));
284 }
285 
286 static void
287 icl_pdu_set_data_segment_length(struct icl_pdu *response, uint32_t len)
288 {
289 
290 	response->ip_bhs->bhs_data_segment_len[2] = len;
291 	response->ip_bhs->bhs_data_segment_len[1] = len >> 8;
292 	response->ip_bhs->bhs_data_segment_len[0] = len >> 16;
293 }
294 
295 static size_t
296 icl_pdu_padding(const struct icl_pdu *ip)
297 {
298 
299 	if ((ip->ip_data_len % 4) != 0)
300 		return (4 - (ip->ip_data_len % 4));
301 
302 	return (0);
303 }
304 
305 static size_t
306 icl_pdu_size(const struct icl_pdu *response)
307 {
308 	size_t len;
309 
310 	KASSERT(response->ip_ahs_len == 0, ("responding with AHS"));
311 
312 	len = sizeof(struct iscsi_bhs) + response->ip_data_len +
313 	    icl_pdu_padding(response);
314 	if (response->ip_conn->ic_header_crc32c)
315 		len += ISCSI_HEADER_DIGEST_SIZE;
316 	if (response->ip_data_len != 0 && response->ip_conn->ic_data_crc32c)
317 		len += ISCSI_DATA_DIGEST_SIZE;
318 
319 	return (len);
320 }
321 
322 static int
323 icl_pdu_receive_bhs(struct icl_pdu *request, size_t *availablep)
324 {
325 
326 	if (icl_conn_receive_buf(request->ip_conn,
327 	    request->ip_bhs, sizeof(struct iscsi_bhs))) {
328 		ICL_DEBUG("failed to receive BHS");
329 		return (-1);
330 	}
331 
332 	*availablep -= sizeof(struct iscsi_bhs);
333 	return (0);
334 }
335 
336 static int
337 icl_pdu_receive_ahs(struct icl_pdu *request, size_t *availablep)
338 {
339 
340 	request->ip_ahs_len = icl_pdu_ahs_length(request);
341 	if (request->ip_ahs_len == 0)
342 		return (0);
343 
344 	request->ip_ahs_mbuf = icl_conn_receive(request->ip_conn,
345 	    request->ip_ahs_len);
346 	if (request->ip_ahs_mbuf == NULL) {
347 		ICL_DEBUG("failed to receive AHS");
348 		return (-1);
349 	}
350 
351 	*availablep -= request->ip_ahs_len;
352 	return (0);
353 }
354 
355 static uint32_t
356 icl_mbuf_to_crc32c(const struct mbuf *m0)
357 {
358 	uint32_t digest = 0xffffffff;
359 	const struct mbuf *m;
360 
361 	for (m = m0; m != NULL; m = m->m_next)
362 		digest = calculate_crc32c(digest,
363 		    mtod(m, const void *), m->m_len);
364 
365 	digest = digest ^ 0xffffffff;
366 
367 	return (digest);
368 }
369 
370 static int
371 icl_pdu_check_header_digest(struct icl_pdu *request, size_t *availablep)
372 {
373 	uint32_t received_digest, valid_digest;
374 
375 	if (request->ip_conn->ic_header_crc32c == false)
376 		return (0);
377 
378 	CTASSERT(sizeof(received_digest) == ISCSI_HEADER_DIGEST_SIZE);
379 	if (icl_conn_receive_buf(request->ip_conn,
380 	    &received_digest, ISCSI_HEADER_DIGEST_SIZE)) {
381 		ICL_DEBUG("failed to receive header digest");
382 		return (-1);
383 	}
384 	*availablep -= ISCSI_HEADER_DIGEST_SIZE;
385 
386 	/* Temporary attach AHS to BHS to calculate header digest. */
387 	request->ip_bhs_mbuf->m_next = request->ip_ahs_mbuf;
388 	valid_digest = icl_mbuf_to_crc32c(request->ip_bhs_mbuf);
389 	request->ip_bhs_mbuf->m_next = NULL;
390 	if (received_digest != valid_digest) {
391 		ICL_WARN("header digest check failed; got 0x%x, "
392 		    "should be 0x%x", received_digest, valid_digest);
393 		return (-1);
394 	}
395 
396 	return (0);
397 }
398 
399 /*
400  * Return the number of bytes that should be waiting in the receive socket
401  * before icl_pdu_receive_data_segment() gets called.
402  */
403 static size_t
404 icl_pdu_data_segment_receive_len(const struct icl_pdu *request)
405 {
406 	size_t len;
407 
408 	len = icl_pdu_data_segment_length(request);
409 	if (len == 0)
410 		return (0);
411 
412 	/*
413 	 * Account for the parts of data segment already read from
414 	 * the socket buffer.
415 	 */
416 	KASSERT(len > request->ip_data_len, ("len <= request->ip_data_len"));
417 	len -= request->ip_data_len;
418 
419 	/*
420 	 * Don't always wait for the full data segment to be delivered
421 	 * to the socket; this might badly affect performance due to
422 	 * TCP window scaling.
423 	 */
424 	if (len > partial_receive_len) {
425 #if 0
426 		ICL_DEBUG("need %zd bytes of data, limiting to %zd",
427 		    len, partial_receive_len));
428 #endif
429 		len = partial_receive_len;
430 
431 		return (len);
432 	}
433 
434 	/*
435 	 * Account for padding.  Note that due to the way code is written,
436 	 * the icl_pdu_receive_data_segment() must always receive padding
437 	 * along with the last part of data segment, because it would be
438 	 * impossible to tell whether we've already received the full data
439 	 * segment including padding, or without it.
440 	 */
441 	if ((len % 4) != 0)
442 		len += 4 - (len % 4);
443 
444 #if 0
445 	ICL_DEBUG("need %zd bytes of data", len));
446 #endif
447 
448 	return (len);
449 }
450 
451 static int
452 icl_pdu_receive_data_segment(struct icl_pdu *request,
453     size_t *availablep, bool *more_neededp)
454 {
455 	struct icl_conn *ic;
456 	size_t len, padding = 0;
457 	struct mbuf *m;
458 
459 	ic = request->ip_conn;
460 
461 	*more_neededp = false;
462 	ic->ic_receive_len = 0;
463 
464 	len = icl_pdu_data_segment_length(request);
465 	if (len == 0)
466 		return (0);
467 
468 	if ((len % 4) != 0)
469 		padding = 4 - (len % 4);
470 
471 	/*
472 	 * Account for already received parts of data segment.
473 	 */
474 	KASSERT(len > request->ip_data_len, ("len <= request->ip_data_len"));
475 	len -= request->ip_data_len;
476 
477 	if (len + padding > *availablep) {
478 		/*
479 		 * Not enough data in the socket buffer.  Receive as much
480 		 * as we can.  Don't receive padding, since, obviously, it's
481 		 * not the end of data segment yet.
482 		 */
483 #if 0
484 		ICL_DEBUG("limited from %zd to %zd",
485 		    len + padding, *availablep - padding));
486 #endif
487 		len = *availablep - padding;
488 		*more_neededp = true;
489 		padding = 0;
490 	}
491 
492 	/*
493 	 * Must not try to receive padding without at least one byte
494 	 * of actual data segment.
495 	 */
496 	if (len > 0) {
497 		m = icl_conn_receive(request->ip_conn, len + padding);
498 		if (m == NULL) {
499 			ICL_DEBUG("failed to receive data segment");
500 			return (-1);
501 		}
502 
503 		if (request->ip_data_mbuf == NULL)
504 			request->ip_data_mbuf = m;
505 		else
506 			m_cat(request->ip_data_mbuf, m);
507 
508 		request->ip_data_len += len;
509 		*availablep -= len + padding;
510 	} else
511 		ICL_DEBUG("len 0");
512 
513 	if (*more_neededp)
514 		ic->ic_receive_len =
515 		    icl_pdu_data_segment_receive_len(request);
516 
517 	return (0);
518 }
519 
520 static int
521 icl_pdu_check_data_digest(struct icl_pdu *request, size_t *availablep)
522 {
523 	uint32_t received_digest, valid_digest;
524 
525 	if (request->ip_conn->ic_data_crc32c == false)
526 		return (0);
527 
528 	if (request->ip_data_len == 0)
529 		return (0);
530 
531 	CTASSERT(sizeof(received_digest) == ISCSI_DATA_DIGEST_SIZE);
532 	if (icl_conn_receive_buf(request->ip_conn,
533 	    &received_digest, ISCSI_DATA_DIGEST_SIZE)) {
534 		ICL_DEBUG("failed to receive data digest");
535 		return (-1);
536 	}
537 	*availablep -= ISCSI_DATA_DIGEST_SIZE;
538 
539 	/*
540 	 * Note that ip_data_mbuf also contains padding; since digest
541 	 * calculation is supposed to include that, we iterate over
542 	 * the entire ip_data_mbuf chain, not just ip_data_len bytes of it.
543 	 */
544 	valid_digest = icl_mbuf_to_crc32c(request->ip_data_mbuf);
545 	if (received_digest != valid_digest) {
546 		ICL_WARN("data digest check failed; got 0x%x, "
547 		    "should be 0x%x", received_digest, valid_digest);
548 		return (-1);
549 	}
550 
551 	return (0);
552 }
553 
554 /*
555  * Somewhat contrary to the name, this attempts to receive only one
556  * "part" of PDU at a time; call it repeatedly until it returns non-NULL.
557  */
558 static struct icl_pdu *
559 icl_conn_receive_pdu(struct icl_conn *ic, size_t *availablep)
560 {
561 	struct icl_pdu *request;
562 	struct socket *so;
563 	size_t len;
564 	int error;
565 	bool more_needed;
566 
567 	so = ic->ic_socket;
568 
569 	if (ic->ic_receive_state == ICL_CONN_STATE_BHS) {
570 		KASSERT(ic->ic_receive_pdu == NULL,
571 		    ("ic->ic_receive_pdu != NULL"));
572 		request = icl_soft_conn_new_pdu(ic, M_NOWAIT);
573 		if (request == NULL) {
574 			ICL_DEBUG("failed to allocate PDU; "
575 			    "dropping connection");
576 			icl_conn_fail(ic);
577 			return (NULL);
578 		}
579 		ic->ic_receive_pdu = request;
580 	} else {
581 		KASSERT(ic->ic_receive_pdu != NULL,
582 		    ("ic->ic_receive_pdu == NULL"));
583 		request = ic->ic_receive_pdu;
584 	}
585 
586 	if (*availablep < ic->ic_receive_len) {
587 #if 0
588 		ICL_DEBUG("not enough data; need %zd, "
589 		    "have %zd", ic->ic_receive_len, *availablep);
590 #endif
591 		return (NULL);
592 	}
593 
594 	switch (ic->ic_receive_state) {
595 	case ICL_CONN_STATE_BHS:
596 		//ICL_DEBUG("receiving BHS");
597 		error = icl_pdu_receive_bhs(request, availablep);
598 		if (error != 0) {
599 			ICL_DEBUG("failed to receive BHS; "
600 			    "dropping connection");
601 			break;
602 		}
603 
604 		/*
605 		 * We don't enforce any limit for AHS length;
606 		 * its length is stored in 8 bit field.
607 		 */
608 
609 		len = icl_pdu_data_segment_length(request);
610 		if (len > ic->ic_max_data_segment_length) {
611 			ICL_WARN("received data segment "
612 			    "length %zd is larger than negotiated "
613 			    "MaxDataSegmentLength %zd; "
614 			    "dropping connection",
615 			    len, ic->ic_max_data_segment_length);
616 			error = EINVAL;
617 			break;
618 		}
619 
620 		ic->ic_receive_state = ICL_CONN_STATE_AHS;
621 		ic->ic_receive_len = icl_pdu_ahs_length(request);
622 		break;
623 
624 	case ICL_CONN_STATE_AHS:
625 		//ICL_DEBUG("receiving AHS");
626 		error = icl_pdu_receive_ahs(request, availablep);
627 		if (error != 0) {
628 			ICL_DEBUG("failed to receive AHS; "
629 			    "dropping connection");
630 			break;
631 		}
632 		ic->ic_receive_state = ICL_CONN_STATE_HEADER_DIGEST;
633 		if (ic->ic_header_crc32c == false)
634 			ic->ic_receive_len = 0;
635 		else
636 			ic->ic_receive_len = ISCSI_HEADER_DIGEST_SIZE;
637 		break;
638 
639 	case ICL_CONN_STATE_HEADER_DIGEST:
640 		//ICL_DEBUG("receiving header digest");
641 		error = icl_pdu_check_header_digest(request, availablep);
642 		if (error != 0) {
643 			ICL_DEBUG("header digest failed; "
644 			    "dropping connection");
645 			break;
646 		}
647 
648 		ic->ic_receive_state = ICL_CONN_STATE_DATA;
649 		ic->ic_receive_len =
650 		    icl_pdu_data_segment_receive_len(request);
651 		break;
652 
653 	case ICL_CONN_STATE_DATA:
654 		//ICL_DEBUG("receiving data segment");
655 		error = icl_pdu_receive_data_segment(request, availablep,
656 		    &more_needed);
657 		if (error != 0) {
658 			ICL_DEBUG("failed to receive data segment;"
659 			    "dropping connection");
660 			break;
661 		}
662 
663 		if (more_needed)
664 			break;
665 
666 		ic->ic_receive_state = ICL_CONN_STATE_DATA_DIGEST;
667 		if (request->ip_data_len == 0 || ic->ic_data_crc32c == false)
668 			ic->ic_receive_len = 0;
669 		else
670 			ic->ic_receive_len = ISCSI_DATA_DIGEST_SIZE;
671 		break;
672 
673 	case ICL_CONN_STATE_DATA_DIGEST:
674 		//ICL_DEBUG("receiving data digest");
675 		error = icl_pdu_check_data_digest(request, availablep);
676 		if (error != 0) {
677 			ICL_DEBUG("data digest failed; "
678 			    "dropping connection");
679 			break;
680 		}
681 
682 		/*
683 		 * We've received complete PDU; reset the receive state machine
684 		 * and return the PDU.
685 		 */
686 		ic->ic_receive_state = ICL_CONN_STATE_BHS;
687 		ic->ic_receive_len = sizeof(struct iscsi_bhs);
688 		ic->ic_receive_pdu = NULL;
689 		return (request);
690 
691 	default:
692 		panic("invalid ic_receive_state %d\n", ic->ic_receive_state);
693 	}
694 
695 	if (error != 0) {
696 		/*
697 		 * Don't free the PDU; it's pointed to by ic->ic_receive_pdu
698 		 * and will get freed in icl_soft_conn_close().
699 		 */
700 		icl_conn_fail(ic);
701 	}
702 
703 	return (NULL);
704 }
705 
706 static void
707 icl_conn_receive_pdus(struct icl_conn *ic, size_t available)
708 {
709 	struct icl_pdu *response;
710 	struct socket *so;
711 
712 	so = ic->ic_socket;
713 
714 	/*
715 	 * This can never happen; we're careful to only mess with ic->ic_socket
716 	 * pointer when the send/receive threads are not running.
717 	 */
718 	KASSERT(so != NULL, ("NULL socket"));
719 
720 	for (;;) {
721 		if (ic->ic_disconnecting)
722 			return;
723 
724 		if (so->so_error != 0) {
725 			ICL_DEBUG("connection error %d; "
726 			    "dropping connection", so->so_error);
727 			icl_conn_fail(ic);
728 			return;
729 		}
730 
731 		/*
732 		 * Loop until we have a complete PDU or there is not enough
733 		 * data in the socket buffer.
734 		 */
735 		if (available < ic->ic_receive_len) {
736 #if 0
737 			ICL_DEBUG("not enough data; have %zd, "
738 			    "need %zd", available,
739 			    ic->ic_receive_len);
740 #endif
741 			return;
742 		}
743 
744 		response = icl_conn_receive_pdu(ic, &available);
745 		if (response == NULL)
746 			continue;
747 
748 		if (response->ip_ahs_len > 0) {
749 			ICL_WARN("received PDU with unsupported "
750 			    "AHS; opcode 0x%x; dropping connection",
751 			    response->ip_bhs->bhs_opcode);
752 			icl_soft_conn_pdu_free(ic, response);
753 			icl_conn_fail(ic);
754 			return;
755 		}
756 
757 		(ic->ic_receive)(response);
758 	}
759 }
760 
761 static void
762 icl_receive_thread(void *arg)
763 {
764 	struct icl_conn *ic;
765 	size_t available;
766 	struct socket *so;
767 
768 	ic = arg;
769 	so = ic->ic_socket;
770 
771 	for (;;) {
772 		if (ic->ic_disconnecting) {
773 			//ICL_DEBUG("terminating");
774 			break;
775 		}
776 
777 		/*
778 		 * Set the low watermark, to be checked by
779 		 * soreadable() in icl_soupcall_receive()
780 		 * to avoid unnecessary wakeups until there
781 		 * is enough data received to read the PDU.
782 		 */
783 		SOCKBUF_LOCK(&so->so_rcv);
784 		available = sbavail(&so->so_rcv);
785 		if (available < ic->ic_receive_len) {
786 			so->so_rcv.sb_lowat = ic->ic_receive_len;
787 			cv_wait(&ic->ic_receive_cv, &so->so_rcv.sb_mtx);
788 		} else
789 			so->so_rcv.sb_lowat = so->so_rcv.sb_hiwat + 1;
790 		SOCKBUF_UNLOCK(&so->so_rcv);
791 
792 		icl_conn_receive_pdus(ic, available);
793 	}
794 
795 	ICL_CONN_LOCK(ic);
796 	ic->ic_receive_running = false;
797 	cv_signal(&ic->ic_send_cv);
798 	ICL_CONN_UNLOCK(ic);
799 	kthread_exit();
800 }
801 
802 static int
803 icl_soupcall_receive(struct socket *so, void *arg, int waitflag)
804 {
805 	struct icl_conn *ic;
806 
807 	if (!soreadable(so))
808 		return (SU_OK);
809 
810 	ic = arg;
811 	cv_signal(&ic->ic_receive_cv);
812 	return (SU_OK);
813 }
814 
815 static int
816 icl_pdu_finalize(struct icl_pdu *request)
817 {
818 	size_t padding, pdu_len;
819 	uint32_t digest, zero = 0;
820 	int ok;
821 	struct icl_conn *ic;
822 
823 	ic = request->ip_conn;
824 
825 	icl_pdu_set_data_segment_length(request, request->ip_data_len);
826 
827 	pdu_len = icl_pdu_size(request);
828 
829 	if (ic->ic_header_crc32c) {
830 		digest = icl_mbuf_to_crc32c(request->ip_bhs_mbuf);
831 		ok = m_append(request->ip_bhs_mbuf, sizeof(digest),
832 		    (void *)&digest);
833 		if (ok != 1) {
834 			ICL_WARN("failed to append header digest");
835 			return (1);
836 		}
837 	}
838 
839 	if (request->ip_data_len != 0) {
840 		padding = icl_pdu_padding(request);
841 		if (padding > 0) {
842 			ok = m_append(request->ip_data_mbuf, padding,
843 			    (void *)&zero);
844 			if (ok != 1) {
845 				ICL_WARN("failed to append padding");
846 				return (1);
847 			}
848 		}
849 
850 		if (ic->ic_data_crc32c) {
851 			digest = icl_mbuf_to_crc32c(request->ip_data_mbuf);
852 
853 			ok = m_append(request->ip_data_mbuf, sizeof(digest),
854 			    (void *)&digest);
855 			if (ok != 1) {
856 				ICL_WARN("failed to append data digest");
857 				return (1);
858 			}
859 		}
860 
861 		m_cat(request->ip_bhs_mbuf, request->ip_data_mbuf);
862 		request->ip_data_mbuf = NULL;
863 	}
864 
865 	request->ip_bhs_mbuf->m_pkthdr.len = pdu_len;
866 
867 	return (0);
868 }
869 
870 static void
871 icl_conn_send_pdus(struct icl_conn *ic, struct icl_pdu_stailq *queue)
872 {
873 	struct icl_pdu *request, *request2;
874 	struct socket *so;
875 	long available, size, size2;
876 	int coalesced, error;
877 
878 	ICL_CONN_LOCK_ASSERT_NOT(ic);
879 
880 	so = ic->ic_socket;
881 
882 	SOCKBUF_LOCK(&so->so_snd);
883 	/*
884 	 * Check how much space do we have for transmit.  We can't just
885 	 * call sosend() and retry when we get EWOULDBLOCK or EMSGSIZE,
886 	 * as it always frees the mbuf chain passed to it, even in case
887 	 * of error.
888 	 */
889 	available = sbspace(&so->so_snd);
890 
891 	/*
892 	 * Notify the socket upcall that we don't need wakeups
893 	 * for the time being.
894 	 */
895 	so->so_snd.sb_lowat = so->so_snd.sb_hiwat + 1;
896 	SOCKBUF_UNLOCK(&so->so_snd);
897 
898 	while (!STAILQ_EMPTY(queue)) {
899 		request = STAILQ_FIRST(queue);
900 		size = icl_pdu_size(request);
901 		if (available < size) {
902 
903 			/*
904 			 * Set the low watermark, to be checked by
905 			 * sowriteable() in icl_soupcall_send()
906 			 * to avoid unnecessary wakeups until there
907 			 * is enough space for the PDU to fit.
908 			 */
909 			SOCKBUF_LOCK(&so->so_snd);
910 			available = sbspace(&so->so_snd);
911 			if (available < size) {
912 #if 1
913 				ICL_DEBUG("no space to send; "
914 				    "have %ld, need %ld",
915 				    available, size);
916 #endif
917 				so->so_snd.sb_lowat = size;
918 				SOCKBUF_UNLOCK(&so->so_snd);
919 				return;
920 			}
921 			SOCKBUF_UNLOCK(&so->so_snd);
922 		}
923 		STAILQ_REMOVE_HEAD(queue, ip_next);
924 		error = icl_pdu_finalize(request);
925 		if (error != 0) {
926 			ICL_DEBUG("failed to finalize PDU; "
927 			    "dropping connection");
928 			icl_soft_conn_pdu_free(ic, request);
929 			icl_conn_fail(ic);
930 			return;
931 		}
932 		if (coalesce) {
933 			coalesced = 1;
934 			for (;;) {
935 				request2 = STAILQ_FIRST(queue);
936 				if (request2 == NULL)
937 					break;
938 				size2 = icl_pdu_size(request2);
939 				if (available < size + size2)
940 					break;
941 				STAILQ_REMOVE_HEAD(queue, ip_next);
942 				error = icl_pdu_finalize(request2);
943 				if (error != 0) {
944 					ICL_DEBUG("failed to finalize PDU; "
945 					    "dropping connection");
946 					icl_soft_conn_pdu_free(ic, request);
947 					icl_soft_conn_pdu_free(ic, request2);
948 					icl_conn_fail(ic);
949 					return;
950 				}
951 				m_cat(request->ip_bhs_mbuf, request2->ip_bhs_mbuf);
952 				request2->ip_bhs_mbuf = NULL;
953 				request->ip_bhs_mbuf->m_pkthdr.len += size2;
954 				size += size2;
955 				STAILQ_REMOVE_AFTER(queue, request, ip_next);
956 				icl_soft_conn_pdu_free(ic, request2);
957 				coalesced++;
958 			}
959 #if 0
960 			if (coalesced > 1) {
961 				ICL_DEBUG("coalesced %d PDUs into %ld bytes",
962 				    coalesced, size);
963 			}
964 #endif
965 		}
966 		available -= size;
967 		error = sosend(so, NULL, NULL, request->ip_bhs_mbuf,
968 		    NULL, MSG_DONTWAIT, curthread);
969 		request->ip_bhs_mbuf = NULL; /* Sosend consumes the mbuf. */
970 		if (error != 0) {
971 			ICL_DEBUG("failed to send PDU, error %d; "
972 			    "dropping connection", error);
973 			icl_soft_conn_pdu_free(ic, request);
974 			icl_conn_fail(ic);
975 			return;
976 		}
977 		icl_soft_conn_pdu_free(ic, request);
978 	}
979 }
980 
981 static void
982 icl_send_thread(void *arg)
983 {
984 	struct icl_conn *ic;
985 	struct icl_pdu_stailq queue;
986 
987 	ic = arg;
988 
989 	STAILQ_INIT(&queue);
990 
991 	ICL_CONN_LOCK(ic);
992 	for (;;) {
993 		for (;;) {
994 			/*
995 			 * If the local queue is empty, populate it from
996 			 * the main one.  This way the icl_conn_send_pdus()
997 			 * can go through all the queued PDUs without holding
998 			 * any locks.
999 			 */
1000 			if (STAILQ_EMPTY(&queue))
1001 				STAILQ_SWAP(&ic->ic_to_send, &queue, icl_pdu);
1002 
1003 			ic->ic_check_send_space = false;
1004 			ICL_CONN_UNLOCK(ic);
1005 			icl_conn_send_pdus(ic, &queue);
1006 			ICL_CONN_LOCK(ic);
1007 
1008 			/*
1009 			 * The icl_soupcall_send() was called since the last
1010 			 * call to sbspace(); go around;
1011 			 */
1012 			if (ic->ic_check_send_space)
1013 				continue;
1014 
1015 			/*
1016 			 * Local queue is empty, but we still have PDUs
1017 			 * in the main one; go around.
1018 			 */
1019 			if (STAILQ_EMPTY(&queue) &&
1020 			    !STAILQ_EMPTY(&ic->ic_to_send))
1021 				continue;
1022 
1023 			/*
1024 			 * There might be some stuff in the local queue,
1025 			 * which didn't get sent due to not having enough send
1026 			 * space.  Wait for socket upcall.
1027 			 */
1028 			break;
1029 		}
1030 
1031 		if (ic->ic_disconnecting) {
1032 			//ICL_DEBUG("terminating");
1033 			break;
1034 		}
1035 
1036 		cv_wait(&ic->ic_send_cv, ic->ic_lock);
1037 	}
1038 
1039 	/*
1040 	 * We're exiting; move PDUs back to the main queue, so they can
1041 	 * get freed properly.  At this point ordering doesn't matter.
1042 	 */
1043 	STAILQ_CONCAT(&ic->ic_to_send, &queue);
1044 
1045 	ic->ic_send_running = false;
1046 	cv_signal(&ic->ic_send_cv);
1047 	ICL_CONN_UNLOCK(ic);
1048 	kthread_exit();
1049 }
1050 
1051 static int
1052 icl_soupcall_send(struct socket *so, void *arg, int waitflag)
1053 {
1054 	struct icl_conn *ic;
1055 
1056 	if (!sowriteable(so))
1057 		return (SU_OK);
1058 
1059 	ic = arg;
1060 
1061 	ICL_CONN_LOCK(ic);
1062 	ic->ic_check_send_space = true;
1063 	ICL_CONN_UNLOCK(ic);
1064 
1065 	cv_signal(&ic->ic_send_cv);
1066 
1067 	return (SU_OK);
1068 }
1069 
1070 static int
1071 icl_soft_conn_pdu_append_data(struct icl_conn *ic, struct icl_pdu *request,
1072     const void *addr, size_t len, int flags)
1073 {
1074 	struct mbuf *mb, *newmb;
1075 	size_t copylen, off = 0;
1076 
1077 	KASSERT(len > 0, ("len == 0"));
1078 
1079 	newmb = m_getm2(NULL, len, flags, MT_DATA, 0);
1080 	if (newmb == NULL) {
1081 		ICL_WARN("failed to allocate mbuf for %zd bytes", len);
1082 		return (ENOMEM);
1083 	}
1084 
1085 	for (mb = newmb; mb != NULL; mb = mb->m_next) {
1086 		copylen = min(M_TRAILINGSPACE(mb), len - off);
1087 		memcpy(mtod(mb, char *), (const char *)addr + off, copylen);
1088 		mb->m_len = copylen;
1089 		off += copylen;
1090 	}
1091 	KASSERT(off == len, ("%s: off != len", __func__));
1092 
1093 	if (request->ip_data_mbuf == NULL) {
1094 		request->ip_data_mbuf = newmb;
1095 		request->ip_data_len = len;
1096 	} else {
1097 		m_cat(request->ip_data_mbuf, newmb);
1098 		request->ip_data_len += len;
1099 	}
1100 
1101 	return (0);
1102 }
1103 
1104 void
1105 icl_soft_conn_pdu_get_data(struct icl_conn *ic, struct icl_pdu *ip,
1106     size_t off, void *addr, size_t len)
1107 {
1108 
1109 	m_copydata(ip->ip_data_mbuf, off, len, addr);
1110 }
1111 
1112 static void
1113 icl_pdu_queue(struct icl_pdu *ip)
1114 {
1115 	struct icl_conn *ic;
1116 
1117 	ic = ip->ip_conn;
1118 
1119 	ICL_CONN_LOCK_ASSERT(ic);
1120 
1121 	if (ic->ic_disconnecting || ic->ic_socket == NULL) {
1122 		ICL_DEBUG("icl_pdu_queue on closed connection");
1123 		icl_soft_conn_pdu_free(ic, ip);
1124 		return;
1125 	}
1126 
1127 	if (!STAILQ_EMPTY(&ic->ic_to_send)) {
1128 		STAILQ_INSERT_TAIL(&ic->ic_to_send, ip, ip_next);
1129 		/*
1130 		 * If the queue is not empty, someone else had already
1131 		 * signaled the send thread; no need to do that again,
1132 		 * just return.
1133 		 */
1134 		return;
1135 	}
1136 
1137 	STAILQ_INSERT_TAIL(&ic->ic_to_send, ip, ip_next);
1138 	cv_signal(&ic->ic_send_cv);
1139 }
1140 
/*
 * Public entry point for queueing a PDU for transmission; forwards to
 * icl_pdu_queue(), which asserts that the connection lock is held.
 */
void
icl_soft_conn_pdu_queue(struct icl_conn *ic, struct icl_pdu *ip)
{

	/* "ic" is not needed: the helper takes the connection from ip. */
	icl_pdu_queue(ip);
}
1147 
1148 static struct icl_conn *
1149 icl_soft_new_conn(const char *name, struct mtx *lock)
1150 {
1151 	struct icl_conn *ic;
1152 
1153 	refcount_acquire(&icl_ncons);
1154 
1155 	ic = (struct icl_conn *)kobj_create(&icl_soft_class, M_ICL_SOFT, M_WAITOK | M_ZERO);
1156 
1157 	STAILQ_INIT(&ic->ic_to_send);
1158 	ic->ic_lock = lock;
1159 	cv_init(&ic->ic_send_cv, "icl_tx");
1160 	cv_init(&ic->ic_receive_cv, "icl_rx");
1161 #ifdef DIAGNOSTIC
1162 	refcount_init(&ic->ic_outstanding_pdus, 0);
1163 #endif
1164 	ic->ic_max_data_segment_length = ICL_MAX_DATA_SEGMENT_LENGTH;
1165 	ic->ic_name = name;
1166 	ic->ic_offload = "None";
1167 	ic->ic_unmapped = false;
1168 
1169 	return (ic);
1170 }
1171 
1172 void
1173 icl_soft_conn_free(struct icl_conn *ic)
1174 {
1175 
1176 #ifdef DIAGNOSTIC
1177 	KASSERT(ic->ic_outstanding_pdus == 0,
1178 	    ("destroying session with %d outstanding PDUs",
1179 	     ic->ic_outstanding_pdus));
1180 #endif
1181 	cv_destroy(&ic->ic_send_cv);
1182 	cv_destroy(&ic->ic_receive_cv);
1183 	kobj_delete((struct kobj *)ic, M_ICL_SOFT);
1184 	refcount_release(&icl_ncons);
1185 }
1186 
1187 static int
1188 icl_conn_start(struct icl_conn *ic)
1189 {
1190 	size_t minspace;
1191 	struct sockopt opt;
1192 	int error, one = 1;
1193 
1194 	ICL_CONN_LOCK(ic);
1195 
1196 	/*
1197 	 * XXX: Ugly hack.
1198 	 */
1199 	if (ic->ic_socket == NULL) {
1200 		ICL_CONN_UNLOCK(ic);
1201 		return (EINVAL);
1202 	}
1203 
1204 	ic->ic_receive_state = ICL_CONN_STATE_BHS;
1205 	ic->ic_receive_len = sizeof(struct iscsi_bhs);
1206 	ic->ic_disconnecting = false;
1207 
1208 	ICL_CONN_UNLOCK(ic);
1209 
1210 	/*
1211 	 * For sendspace, this is required because the current code cannot
1212 	 * send a PDU in pieces; thus, the minimum buffer size is equal
1213 	 * to the maximum PDU size.  "+4" is to account for possible padding.
1214 	 *
1215 	 * What we should actually do here is to use autoscaling, but set
1216 	 * some minimal buffer size to "minspace".  I don't know a way to do
1217 	 * that, though.
1218 	 */
1219 	minspace = sizeof(struct iscsi_bhs) + ic->ic_max_data_segment_length +
1220 	    ISCSI_HEADER_DIGEST_SIZE + ISCSI_DATA_DIGEST_SIZE + 4;
1221 	if (sendspace < minspace) {
1222 		ICL_WARN("kern.icl.sendspace too low; must be at least %zd",
1223 		    minspace);
1224 		sendspace = minspace;
1225 	}
1226 	if (recvspace < minspace) {
1227 		ICL_WARN("kern.icl.recvspace too low; must be at least %zd",
1228 		    minspace);
1229 		recvspace = minspace;
1230 	}
1231 
1232 	error = soreserve(ic->ic_socket, sendspace, recvspace);
1233 	if (error != 0) {
1234 		ICL_WARN("soreserve failed with error %d", error);
1235 		icl_soft_conn_close(ic);
1236 		return (error);
1237 	}
1238 	ic->ic_socket->so_snd.sb_flags |= SB_AUTOSIZE;
1239 	ic->ic_socket->so_rcv.sb_flags |= SB_AUTOSIZE;
1240 
1241 	/*
1242 	 * Disable Nagle.
1243 	 */
1244 	bzero(&opt, sizeof(opt));
1245 	opt.sopt_dir = SOPT_SET;
1246 	opt.sopt_level = IPPROTO_TCP;
1247 	opt.sopt_name = TCP_NODELAY;
1248 	opt.sopt_val = &one;
1249 	opt.sopt_valsize = sizeof(one);
1250 	error = sosetopt(ic->ic_socket, &opt);
1251 	if (error != 0) {
1252 		ICL_WARN("disabling TCP_NODELAY failed with error %d", error);
1253 		icl_soft_conn_close(ic);
1254 		return (error);
1255 	}
1256 
1257 	/*
1258 	 * Register socket upcall, to get notified about incoming PDUs
1259 	 * and free space to send outgoing ones.
1260 	 */
1261 	SOCKBUF_LOCK(&ic->ic_socket->so_snd);
1262 	soupcall_set(ic->ic_socket, SO_SND, icl_soupcall_send, ic);
1263 	SOCKBUF_UNLOCK(&ic->ic_socket->so_snd);
1264 	SOCKBUF_LOCK(&ic->ic_socket->so_rcv);
1265 	soupcall_set(ic->ic_socket, SO_RCV, icl_soupcall_receive, ic);
1266 	SOCKBUF_UNLOCK(&ic->ic_socket->so_rcv);
1267 
1268 	/*
1269 	 * Start threads.
1270 	 */
1271 	ICL_CONN_LOCK(ic);
1272 	ic->ic_send_running = ic->ic_receive_running = true;
1273 	ICL_CONN_UNLOCK(ic);
1274 	error = kthread_add(icl_send_thread, ic, NULL, NULL, 0, 0, "%stx",
1275 	    ic->ic_name);
1276 	if (error != 0) {
1277 		ICL_WARN("kthread_add(9) failed with error %d", error);
1278 		ICL_CONN_LOCK(ic);
1279 		ic->ic_send_running = ic->ic_receive_running = false;
1280 		cv_signal(&ic->ic_send_cv);
1281 		ICL_CONN_UNLOCK(ic);
1282 		icl_soft_conn_close(ic);
1283 		return (error);
1284 	}
1285 	error = kthread_add(icl_receive_thread, ic, NULL, NULL, 0, 0, "%srx",
1286 	    ic->ic_name);
1287 	if (error != 0) {
1288 		ICL_WARN("kthread_add(9) failed with error %d", error);
1289 		ICL_CONN_LOCK(ic);
1290 		ic->ic_receive_running = false;
1291 		cv_signal(&ic->ic_send_cv);
1292 		ICL_CONN_UNLOCK(ic);
1293 		icl_soft_conn_close(ic);
1294 		return (error);
1295 	}
1296 
1297 	return (0);
1298 }
1299 
1300 int
1301 icl_soft_conn_handoff(struct icl_conn *ic, int fd)
1302 {
1303 	struct file *fp;
1304 	struct socket *so;
1305 	cap_rights_t rights;
1306 	int error;
1307 
1308 	ICL_CONN_LOCK_ASSERT_NOT(ic);
1309 
1310 #ifdef ICL_KERNEL_PROXY
1311 	/*
1312 	 * We're transitioning to Full Feature phase, and we don't
1313 	 * really care.
1314 	 */
1315 	if (fd == 0) {
1316 		ICL_CONN_LOCK(ic);
1317 		if (ic->ic_socket == NULL) {
1318 			ICL_CONN_UNLOCK(ic);
1319 			ICL_WARN("proxy handoff without connect");
1320 			return (EINVAL);
1321 		}
1322 		ICL_CONN_UNLOCK(ic);
1323 		return (0);
1324 	}
1325 #endif
1326 
1327 	/*
1328 	 * Steal the socket from userland.
1329 	 */
1330 	error = fget(curthread, fd,
1331 	    cap_rights_init(&rights, CAP_SOCK_CLIENT), &fp);
1332 	if (error != 0)
1333 		return (error);
1334 	if (fp->f_type != DTYPE_SOCKET) {
1335 		fdrop(fp, curthread);
1336 		return (EINVAL);
1337 	}
1338 	so = fp->f_data;
1339 	if (so->so_type != SOCK_STREAM) {
1340 		fdrop(fp, curthread);
1341 		return (EINVAL);
1342 	}
1343 
1344 	ICL_CONN_LOCK(ic);
1345 
1346 	if (ic->ic_socket != NULL) {
1347 		ICL_CONN_UNLOCK(ic);
1348 		fdrop(fp, curthread);
1349 		return (EBUSY);
1350 	}
1351 
1352 	ic->ic_socket = fp->f_data;
1353 	fp->f_ops = &badfileops;
1354 	fp->f_data = NULL;
1355 	fdrop(fp, curthread);
1356 	ICL_CONN_UNLOCK(ic);
1357 
1358 	error = icl_conn_start(ic);
1359 
1360 	return (error);
1361 }
1362 
/*
 * Shut a connection down: stop the send and receive threads, detach and
 * close the socket, and free the partially received PDU as well as
 * everything still sitting in the send queue.
 *
 * The function takes the connection lock itself, drops it around the
 * socket teardown and releases it before returning.  If the connection
 * has no socket once the threads have stopped, it returns right after
 * that without touching the PDUs.
 */
void
icl_soft_conn_close(struct icl_conn *ic)
{
	struct icl_pdu *pdu;
	struct socket *so;

	ICL_CONN_LOCK(ic);

	/*
	 * Wake up the threads, so they can properly terminate.
	 *
	 * Both condition variables are signalled on every iteration; the
	 * wait itself is on ic_send_cv only, and the loop ends once both
	 * "running" flags have been cleared.
	 */
	ic->ic_disconnecting = true;
	while (ic->ic_receive_running || ic->ic_send_running) {
		cv_signal(&ic->ic_receive_cv);
		cv_signal(&ic->ic_send_cv);
		cv_wait(&ic->ic_send_cv, ic->ic_lock);
	}

	/* Some other thread could close the connection same time. */
	so = ic->ic_socket;
	if (so == NULL) {
		ICL_CONN_UNLOCK(ic);
		return;
	}
	/* Claim the socket while still holding the lock. */
	ic->ic_socket = NULL;

	/*
	 * Deregister socket upcalls.
	 *
	 * The connection lock is released for the whole socket teardown
	 * and taken again afterwards.
	 */
	ICL_CONN_UNLOCK(ic);
	SOCKBUF_LOCK(&so->so_snd);
	if (so->so_snd.sb_upcall != NULL)
		soupcall_clear(so, SO_SND);
	SOCKBUF_UNLOCK(&so->so_snd);
	SOCKBUF_LOCK(&so->so_rcv);
	if (so->so_rcv.sb_upcall != NULL)
		soupcall_clear(so, SO_RCV);
	SOCKBUF_UNLOCK(&so->so_rcv);
	soclose(so);
	ICL_CONN_LOCK(ic);

	if (ic->ic_receive_pdu != NULL) {
		/* Free the partially received PDU. */
		icl_soft_conn_pdu_free(ic, ic->ic_receive_pdu);
		ic->ic_receive_pdu = NULL;
	}

	/*
	 * Remove any outstanding PDUs from the send queue.
	 */
	while (!STAILQ_EMPTY(&ic->ic_to_send)) {
		pdu = STAILQ_FIRST(&ic->ic_to_send);
		STAILQ_REMOVE_HEAD(&ic->ic_to_send, ip_next);
		icl_soft_conn_pdu_free(ic, pdu);
	}

	KASSERT(STAILQ_EMPTY(&ic->ic_to_send),
	    ("destroying session with non-empty send queue"));
	ICL_CONN_UNLOCK(ic);
}
1423 
/*
 * Per-task setup hook.  The software backend has nothing to prepare: no
 * argument is read or written, and the call always returns 0.
 */
int
icl_soft_conn_task_setup(struct icl_conn *ic, struct icl_pdu *ip,
    struct ccb_scsiio *csio, uint32_t *task_tagp, void **prvp)
{
	(void)ic;
	(void)ip;
	(void)csio;
	(void)task_tagp;
	(void)prvp;

	return (0);
}
1431 
/*
 * Per-task completion hook.  Intentionally a no-op in the software
 * backend; neither argument is used.
 */
void
icl_soft_conn_task_done(struct icl_conn *ic, void *prv)
{
	(void)ic;
	(void)prv;
}
1436 
/*
 * Per-transfer setup hook.  The software backend has nothing to prepare:
 * no argument is read or written, and the call always returns 0.
 */
int
icl_soft_conn_transfer_setup(struct icl_conn *ic, union ctl_io *io,
    uint32_t *transfer_tag, void **prvp)
{
	(void)ic;
	(void)io;
	(void)transfer_tag;
	(void)prvp;

	return (0);
}
1444 
/*
 * Per-transfer completion hook.  Intentionally a no-op in the software
 * backend; neither argument is used.
 */
void
icl_soft_conn_transfer_done(struct icl_conn *ic, void *prv)
{
	(void)ic;
	(void)prv;
}
1449 
1450 static int
1451 icl_soft_limits(struct icl_drv_limits *idl)
1452 {
1453 
1454 	idl->idl_max_recv_data_segment_length = 128 * 1024;
1455 	idl->idl_max_send_data_segment_length = 128 * 1024;
1456 	idl->idl_max_burst_length = 262144;
1457 	idl->idl_first_burst_length = 65536;
1458 
1459 	return (0);
1460 }
1461 
1462 #ifdef ICL_KERNEL_PROXY
/*
 * Kernel-proxy connect entry point; passes every argument through to
 * icl_soft_proxy_connect() and returns its result unchanged.
 */
int
icl_soft_conn_connect(struct icl_conn *ic, int domain, int socktype,
    int protocol, struct sockaddr *from_sa, struct sockaddr *to_sa)
{
	int error;

	error = icl_soft_proxy_connect(ic, domain, socktype, protocol,
	    from_sa, to_sa);

	return (error);
}
1471 
1472 int
1473 icl_soft_handoff_sock(struct icl_conn *ic, struct socket *so)
1474 {
1475 	int error;
1476 
1477 	ICL_CONN_LOCK_ASSERT_NOT(ic);
1478 
1479 	if (so->so_type != SOCK_STREAM)
1480 		return (EINVAL);
1481 
1482 	ICL_CONN_LOCK(ic);
1483 	if (ic->ic_socket != NULL) {
1484 		ICL_CONN_UNLOCK(ic);
1485 		return (EBUSY);
1486 	}
1487 	ic->ic_socket = so;
1488 	ICL_CONN_UNLOCK(ic);
1489 
1490 	error = icl_conn_start(ic);
1491 
1492 	return (error);
1493 }
1494 #endif /* ICL_KERNEL_PROXY */
1495 
1496 static int
1497 icl_soft_load(void)
1498 {
1499 	int error;
1500 
1501 	icl_pdu_zone = uma_zcreate("icl_pdu",
1502 	    sizeof(struct icl_pdu), NULL, NULL, NULL, NULL,
1503 	    UMA_ALIGN_PTR, 0);
1504 	refcount_init(&icl_ncons, 0);
1505 
1506 	/*
1507 	 * The reason we call this "none" is that to the user,
1508 	 * it's known as "offload driver"; "offload driver: soft"
1509 	 * doesn't make much sense.
1510 	 */
1511 	error = icl_register("none", false, 0,
1512 	    icl_soft_limits, icl_soft_new_conn);
1513 	KASSERT(error == 0, ("failed to register"));
1514 
1515 #if defined(ICL_KERNEL_PROXY) && 0
1516 	/*
1517 	 * Debugging aid for kernel proxy functionality.
1518 	 */
1519 	error = icl_register("proxytest", true, 0,
1520 	    icl_soft_limits, icl_soft_new_conn);
1521 	KASSERT(error == 0, ("failed to register"));
1522 #endif
1523 
1524 	return (error);
1525 }
1526 
1527 static int
1528 icl_soft_unload(void)
1529 {
1530 
1531 	if (icl_ncons != 0)
1532 		return (EBUSY);
1533 
1534 	icl_unregister("none", false);
1535 #if defined(ICL_KERNEL_PROXY) && 0
1536 	icl_unregister("proxytest", true);
1537 #endif
1538 
1539 	uma_zdestroy(icl_pdu_zone);
1540 
1541 	return (0);
1542 }
1543 
1544 static int
1545 icl_soft_modevent(module_t mod, int what, void *arg)
1546 {
1547 
1548 	switch (what) {
1549 	case MOD_LOAD:
1550 		return (icl_soft_load());
1551 	case MOD_UNLOAD:
1552 		return (icl_soft_unload());
1553 	default:
1554 		return (EINVAL);
1555 	}
1556 }
1557 
/*
 * Module description: the module name, its event handler and a third,
 * unused field that is left as 0.
 */
moduledata_t icl_soft_data = {
	"icl_soft",
	icl_soft_modevent,
	0
};

/*
 * Declare the module (at SI_SUB_DRIVERS / SI_ORDER_MIDDLE), state its
 * dependency on the "icl" module and declare this module's own version
 * as 1.
 * NOTE(review): the three numbers passed to MODULE_DEPEND are presumably
 * the minimum, preferred and maximum acceptable "icl" versions - confirm
 * against the macro's documentation.
 */
DECLARE_MODULE(icl_soft, icl_soft_data, SI_SUB_DRIVERS, SI_ORDER_MIDDLE);
MODULE_DEPEND(icl_soft, icl, 1, 1, 1);
MODULE_VERSION(icl_soft, 1);
1567