xref: /freebsd/sys/dev/iscsi/icl_soft.c (revision 64de80195bba295c961a4cdf96dbe0e4979bdf2a)
1 /*-
2  * Copyright (c) 2012 The FreeBSD Foundation
3  * All rights reserved.
4  *
5  * This software was developed by Edward Tomasz Napierala under sponsorship
6  * from the FreeBSD Foundation.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
18  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
21  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27  * SUCH DAMAGE.
28  *
29  */
30 
31 /*
32  * iSCSI Common Layer.  It's used by both the initiator and target to send
33  * and receive iSCSI PDUs.
34  */
35 
36 #include <sys/cdefs.h>
37 __FBSDID("$FreeBSD$");
38 
39 #include <sys/param.h>
40 #include <sys/capsicum.h>
41 #include <sys/condvar.h>
42 #include <sys/conf.h>
43 #include <sys/file.h>
44 #include <sys/kernel.h>
45 #include <sys/kthread.h>
46 #include <sys/lock.h>
47 #include <sys/mbuf.h>
48 #include <sys/mutex.h>
49 #include <sys/module.h>
50 #include <sys/protosw.h>
51 #include <sys/socket.h>
52 #include <sys/socketvar.h>
53 #include <sys/sysctl.h>
54 #include <sys/systm.h>
55 #include <sys/sx.h>
56 #include <sys/uio.h>
57 #include <vm/uma.h>
58 #include <netinet/in.h>
59 #include <netinet/tcp.h>
60 
61 #include <dev/iscsi/icl.h>
62 #include <dev/iscsi/iscsi_proto.h>
63 #include <icl_conn_if.h>
64 
/* Tunables; settable via loader.conf at boot or sysctl at runtime. */
static int coalesce = 1;
SYSCTL_INT(_kern_icl, OID_AUTO, coalesce, CTLFLAG_RWTUN,
    &coalesce, 0, "Try to coalesce PDUs before sending");
static int partial_receive_len = 128 * 1024;
SYSCTL_INT(_kern_icl, OID_AUTO, partial_receive_len, CTLFLAG_RWTUN,
    &partial_receive_len, 0, "Minimum read size for partially received "
    "data segment");
static int sendspace = 1048576;
SYSCTL_INT(_kern_icl, OID_AUTO, sendspace, CTLFLAG_RWTUN,
    &sendspace, 0, "Default send socket buffer size");
static int recvspace = 1048576;
SYSCTL_INT(_kern_icl, OID_AUTO, recvspace, CTLFLAG_RWTUN,
    &recvspace, 0, "Default receive socket buffer size");

static MALLOC_DEFINE(M_ICL_SOFT, "icl_soft", "iSCSI software backend");
/* UMA zone from which all struct icl_pdu are allocated. */
static uma_zone_t icl_pdu_zone;

/* Count of existing connections; used for accounting. */
static volatile u_int	icl_ncons;

/* Convenience wrappers around the caller-supplied connection lock. */
#define ICL_CONN_LOCK(X)		mtx_lock(X->ic_lock)
#define ICL_CONN_UNLOCK(X)		mtx_unlock(X->ic_lock)
#define ICL_CONN_LOCK_ASSERT(X)		mtx_assert(X->ic_lock, MA_OWNED)
#define ICL_CONN_LOCK_ASSERT_NOT(X)	mtx_assert(X->ic_lock, MA_NOTOWNED)

STAILQ_HEAD(icl_pdu_stailq, icl_pdu);

/* kobj method implementations making up the "icl_soft" backend class. */
static icl_conn_new_pdu_t	icl_soft_conn_new_pdu;
static icl_conn_pdu_free_t	icl_soft_conn_pdu_free;
static icl_conn_pdu_data_segment_length_t
				    icl_soft_conn_pdu_data_segment_length;
static icl_conn_pdu_append_data_t	icl_soft_conn_pdu_append_data;
static icl_conn_pdu_get_data_t	icl_soft_conn_pdu_get_data;
static icl_conn_pdu_queue_t	icl_soft_conn_pdu_queue;
static icl_conn_handoff_t	icl_soft_conn_handoff;
static icl_conn_free_t		icl_soft_conn_free;
static icl_conn_close_t		icl_soft_conn_close;
static icl_conn_connected_t	icl_soft_conn_connected;

static kobj_method_t icl_soft_methods[] = {
	KOBJMETHOD(icl_conn_new_pdu, icl_soft_conn_new_pdu),
	KOBJMETHOD(icl_conn_pdu_free, icl_soft_conn_pdu_free),
	KOBJMETHOD(icl_conn_pdu_data_segment_length,
	    icl_soft_conn_pdu_data_segment_length),
	KOBJMETHOD(icl_conn_pdu_append_data, icl_soft_conn_pdu_append_data),
	KOBJMETHOD(icl_conn_pdu_get_data, icl_soft_conn_pdu_get_data),
	KOBJMETHOD(icl_conn_pdu_queue, icl_soft_conn_pdu_queue),
	KOBJMETHOD(icl_conn_handoff, icl_soft_conn_handoff),
	KOBJMETHOD(icl_conn_free, icl_soft_conn_free),
	KOBJMETHOD(icl_conn_close, icl_soft_conn_close),
	KOBJMETHOD(icl_conn_connected, icl_soft_conn_connected),
	{ 0, 0 }
};

DEFINE_CLASS(icl_soft, icl_soft_methods, sizeof(struct icl_conn));

static void	icl_conn_close(struct icl_conn *ic);
121 
122 static void
123 icl_conn_fail(struct icl_conn *ic)
124 {
125 	if (ic->ic_socket == NULL)
126 		return;
127 
128 	/*
129 	 * XXX
130 	 */
131 	ic->ic_socket->so_error = EDOOFUS;
132 	(ic->ic_error)(ic);
133 }
134 
135 static struct mbuf *
136 icl_conn_receive(struct icl_conn *ic, size_t len)
137 {
138 	struct uio uio;
139 	struct socket *so;
140 	struct mbuf *m;
141 	int error, flags;
142 
143 	so = ic->ic_socket;
144 
145 	memset(&uio, 0, sizeof(uio));
146 	uio.uio_resid = len;
147 
148 	flags = MSG_DONTWAIT;
149 	error = soreceive(so, NULL, &uio, &m, NULL, &flags);
150 	if (error != 0) {
151 		ICL_DEBUG("soreceive error %d", error);
152 		return (NULL);
153 	}
154 	if (uio.uio_resid != 0) {
155 		m_freem(m);
156 		ICL_DEBUG("short read");
157 		return (NULL);
158 	}
159 
160 	return (m);
161 }
162 
163 static struct icl_pdu *
164 icl_pdu_new_empty(struct icl_conn *ic, int flags)
165 {
166 	struct icl_pdu *ip;
167 
168 #ifdef DIAGNOSTIC
169 	refcount_acquire(&ic->ic_outstanding_pdus);
170 #endif
171 	ip = uma_zalloc(icl_pdu_zone, flags | M_ZERO);
172 	if (ip == NULL) {
173 		ICL_WARN("failed to allocate %zd bytes", sizeof(*ip));
174 #ifdef DIAGNOSTIC
175 		refcount_release(&ic->ic_outstanding_pdus);
176 #endif
177 		return (NULL);
178 	}
179 
180 	ip->ip_conn = ic;
181 
182 	return (ip);
183 }
184 
185 static void
186 icl_pdu_free(struct icl_pdu *ip)
187 {
188 	struct icl_conn *ic;
189 
190 	ic = ip->ip_conn;
191 
192 	m_freem(ip->ip_bhs_mbuf);
193 	m_freem(ip->ip_ahs_mbuf);
194 	m_freem(ip->ip_data_mbuf);
195 	uma_zfree(icl_pdu_zone, ip);
196 #ifdef DIAGNOSTIC
197 	refcount_release(&ic->ic_outstanding_pdus);
198 #endif
199 }
200 
/*
 * icl_conn method: free a PDU allocated on this connection.
 */
void
icl_soft_conn_pdu_free(struct icl_conn *ic, struct icl_pdu *ip)
{

	icl_pdu_free(ip);
}
206 
207 /*
208  * Allocate icl_pdu with empty BHS to fill up by the caller.
209  */
210 struct icl_pdu *
211 icl_soft_conn_new_pdu(struct icl_conn *ic, int flags)
212 {
213 	struct icl_pdu *ip;
214 
215 	ip = icl_pdu_new_empty(ic, flags);
216 	if (ip == NULL)
217 		return (NULL);
218 
219 	ip->ip_bhs_mbuf = m_getm2(NULL, sizeof(struct iscsi_bhs),
220 	    flags, MT_DATA, M_PKTHDR);
221 	if (ip->ip_bhs_mbuf == NULL) {
222 		ICL_WARN("failed to allocate %zd bytes", sizeof(*ip));
223 		icl_pdu_free(ip);
224 		return (NULL);
225 	}
226 	ip->ip_bhs = mtod(ip->ip_bhs_mbuf, struct iscsi_bhs *);
227 	memset(ip->ip_bhs, 0, sizeof(struct iscsi_bhs));
228 	ip->ip_bhs_mbuf->m_len = sizeof(struct iscsi_bhs);
229 
230 	return (ip);
231 }
232 
233 static int
234 icl_pdu_ahs_length(const struct icl_pdu *request)
235 {
236 
237 	return (request->ip_bhs->bhs_total_ahs_len * 4);
238 }
239 
240 static size_t
241 icl_pdu_data_segment_length(const struct icl_pdu *request)
242 {
243 	uint32_t len = 0;
244 
245 	len += request->ip_bhs->bhs_data_segment_len[0];
246 	len <<= 8;
247 	len += request->ip_bhs->bhs_data_segment_len[1];
248 	len <<= 8;
249 	len += request->ip_bhs->bhs_data_segment_len[2];
250 
251 	return (len);
252 }
253 
254 size_t
255 icl_soft_conn_pdu_data_segment_length(struct icl_conn *ic,
256     const struct icl_pdu *request)
257 {
258 
259 	return (icl_pdu_data_segment_length(request));
260 }
261 
262 static void
263 icl_pdu_set_data_segment_length(struct icl_pdu *response, uint32_t len)
264 {
265 
266 	response->ip_bhs->bhs_data_segment_len[2] = len;
267 	response->ip_bhs->bhs_data_segment_len[1] = len >> 8;
268 	response->ip_bhs->bhs_data_segment_len[0] = len >> 16;
269 }
270 
271 static size_t
272 icl_pdu_padding(const struct icl_pdu *ip)
273 {
274 
275 	if ((ip->ip_data_len % 4) != 0)
276 		return (4 - (ip->ip_data_len % 4));
277 
278 	return (0);
279 }
280 
281 static size_t
282 icl_pdu_size(const struct icl_pdu *response)
283 {
284 	size_t len;
285 
286 	KASSERT(response->ip_ahs_len == 0, ("responding with AHS"));
287 
288 	len = sizeof(struct iscsi_bhs) + response->ip_data_len +
289 	    icl_pdu_padding(response);
290 	if (response->ip_conn->ic_header_crc32c)
291 		len += ISCSI_HEADER_DIGEST_SIZE;
292 	if (response->ip_data_len != 0 && response->ip_conn->ic_data_crc32c)
293 		len += ISCSI_DATA_DIGEST_SIZE;
294 
295 	return (len);
296 }
297 
298 static int
299 icl_pdu_receive_bhs(struct icl_pdu *request, size_t *availablep)
300 {
301 	struct mbuf *m;
302 
303 	m = icl_conn_receive(request->ip_conn, sizeof(struct iscsi_bhs));
304 	if (m == NULL) {
305 		ICL_DEBUG("failed to receive BHS");
306 		return (-1);
307 	}
308 
309 	request->ip_bhs_mbuf = m_pullup(m, sizeof(struct iscsi_bhs));
310 	if (request->ip_bhs_mbuf == NULL) {
311 		ICL_WARN("m_pullup failed");
312 		return (-1);
313 	}
314 	request->ip_bhs = mtod(request->ip_bhs_mbuf, struct iscsi_bhs *);
315 
316 	/*
317 	 * XXX: For architectures with strict alignment requirements
318 	 * 	we may need to allocate ip_bhs and copy the data into it.
319 	 * 	For some reason, though, not doing this doesn't seem
320 	 * 	to cause problems; tested on sparc64.
321 	 */
322 
323 	*availablep -= sizeof(struct iscsi_bhs);
324 	return (0);
325 }
326 
327 static int
328 icl_pdu_receive_ahs(struct icl_pdu *request, size_t *availablep)
329 {
330 
331 	request->ip_ahs_len = icl_pdu_ahs_length(request);
332 	if (request->ip_ahs_len == 0)
333 		return (0);
334 
335 	request->ip_ahs_mbuf = icl_conn_receive(request->ip_conn,
336 	    request->ip_ahs_len);
337 	if (request->ip_ahs_mbuf == NULL) {
338 		ICL_DEBUG("failed to receive AHS");
339 		return (-1);
340 	}
341 
342 	*availablep -= request->ip_ahs_len;
343 	return (0);
344 }
345 
346 static uint32_t
347 icl_mbuf_to_crc32c(const struct mbuf *m0)
348 {
349 	uint32_t digest = 0xffffffff;
350 	const struct mbuf *m;
351 
352 	for (m = m0; m != NULL; m = m->m_next)
353 		digest = calculate_crc32c(digest,
354 		    mtod(m, const void *), m->m_len);
355 
356 	digest = digest ^ 0xffffffff;
357 
358 	return (digest);
359 }
360 
361 static int
362 icl_pdu_check_header_digest(struct icl_pdu *request, size_t *availablep)
363 {
364 	struct mbuf *m;
365 	uint32_t received_digest, valid_digest;
366 
367 	if (request->ip_conn->ic_header_crc32c == false)
368 		return (0);
369 
370 	m = icl_conn_receive(request->ip_conn, ISCSI_HEADER_DIGEST_SIZE);
371 	if (m == NULL) {
372 		ICL_DEBUG("failed to receive header digest");
373 		return (-1);
374 	}
375 
376 	CTASSERT(sizeof(received_digest) == ISCSI_HEADER_DIGEST_SIZE);
377 	m_copydata(m, 0, ISCSI_HEADER_DIGEST_SIZE, (void *)&received_digest);
378 	m_freem(m);
379 
380 	*availablep -= ISCSI_HEADER_DIGEST_SIZE;
381 
382 	/*
383 	 * XXX: Handle AHS.
384 	 */
385 	valid_digest = icl_mbuf_to_crc32c(request->ip_bhs_mbuf);
386 	if (received_digest != valid_digest) {
387 		ICL_WARN("header digest check failed; got 0x%x, "
388 		    "should be 0x%x", received_digest, valid_digest);
389 		return (-1);
390 	}
391 
392 	return (0);
393 }
394 
395 /*
396  * Return the number of bytes that should be waiting in the receive socket
397  * before icl_pdu_receive_data_segment() gets called.
398  */
399 static size_t
400 icl_pdu_data_segment_receive_len(const struct icl_pdu *request)
401 {
402 	size_t len;
403 
404 	len = icl_pdu_data_segment_length(request);
405 	if (len == 0)
406 		return (0);
407 
408 	/*
409 	 * Account for the parts of data segment already read from
410 	 * the socket buffer.
411 	 */
412 	KASSERT(len > request->ip_data_len, ("len <= request->ip_data_len"));
413 	len -= request->ip_data_len;
414 
415 	/*
416 	 * Don't always wait for the full data segment to be delivered
417 	 * to the socket; this might badly affect performance due to
418 	 * TCP window scaling.
419 	 */
420 	if (len > partial_receive_len) {
421 #if 0
422 		ICL_DEBUG("need %zd bytes of data, limiting to %zd",
423 		    len, partial_receive_len));
424 #endif
425 		len = partial_receive_len;
426 
427 		return (len);
428 	}
429 
430 	/*
431 	 * Account for padding.  Note that due to the way code is written,
432 	 * the icl_pdu_receive_data_segment() must always receive padding
433 	 * along with the last part of data segment, because it would be
434 	 * impossible to tell whether we've already received the full data
435 	 * segment including padding, or without it.
436 	 */
437 	if ((len % 4) != 0)
438 		len += 4 - (len % 4);
439 
440 #if 0
441 	ICL_DEBUG("need %zd bytes of data", len));
442 #endif
443 
444 	return (len);
445 }
446 
447 static int
448 icl_pdu_receive_data_segment(struct icl_pdu *request,
449     size_t *availablep, bool *more_neededp)
450 {
451 	struct icl_conn *ic;
452 	size_t len, padding = 0;
453 	struct mbuf *m;
454 
455 	ic = request->ip_conn;
456 
457 	*more_neededp = false;
458 	ic->ic_receive_len = 0;
459 
460 	len = icl_pdu_data_segment_length(request);
461 	if (len == 0)
462 		return (0);
463 
464 	if ((len % 4) != 0)
465 		padding = 4 - (len % 4);
466 
467 	/*
468 	 * Account for already received parts of data segment.
469 	 */
470 	KASSERT(len > request->ip_data_len, ("len <= request->ip_data_len"));
471 	len -= request->ip_data_len;
472 
473 	if (len + padding > *availablep) {
474 		/*
475 		 * Not enough data in the socket buffer.  Receive as much
476 		 * as we can.  Don't receive padding, since, obviously, it's
477 		 * not the end of data segment yet.
478 		 */
479 #if 0
480 		ICL_DEBUG("limited from %zd to %zd",
481 		    len + padding, *availablep - padding));
482 #endif
483 		len = *availablep - padding;
484 		*more_neededp = true;
485 		padding = 0;
486 	}
487 
488 	/*
489 	 * Must not try to receive padding without at least one byte
490 	 * of actual data segment.
491 	 */
492 	if (len > 0) {
493 		m = icl_conn_receive(request->ip_conn, len + padding);
494 		if (m == NULL) {
495 			ICL_DEBUG("failed to receive data segment");
496 			return (-1);
497 		}
498 
499 		if (request->ip_data_mbuf == NULL)
500 			request->ip_data_mbuf = m;
501 		else
502 			m_cat(request->ip_data_mbuf, m);
503 
504 		request->ip_data_len += len;
505 		*availablep -= len + padding;
506 	} else
507 		ICL_DEBUG("len 0");
508 
509 	if (*more_neededp)
510 		ic->ic_receive_len =
511 		    icl_pdu_data_segment_receive_len(request);
512 
513 	return (0);
514 }
515 
516 static int
517 icl_pdu_check_data_digest(struct icl_pdu *request, size_t *availablep)
518 {
519 	struct mbuf *m;
520 	uint32_t received_digest, valid_digest;
521 
522 	if (request->ip_conn->ic_data_crc32c == false)
523 		return (0);
524 
525 	if (request->ip_data_len == 0)
526 		return (0);
527 
528 	m = icl_conn_receive(request->ip_conn, ISCSI_DATA_DIGEST_SIZE);
529 	if (m == NULL) {
530 		ICL_DEBUG("failed to receive data digest");
531 		return (-1);
532 	}
533 
534 	CTASSERT(sizeof(received_digest) == ISCSI_DATA_DIGEST_SIZE);
535 	m_copydata(m, 0, ISCSI_DATA_DIGEST_SIZE, (void *)&received_digest);
536 	m_freem(m);
537 
538 	*availablep -= ISCSI_DATA_DIGEST_SIZE;
539 
540 	/*
541 	 * Note that ip_data_mbuf also contains padding; since digest
542 	 * calculation is supposed to include that, we iterate over
543 	 * the entire ip_data_mbuf chain, not just ip_data_len bytes of it.
544 	 */
545 	valid_digest = icl_mbuf_to_crc32c(request->ip_data_mbuf);
546 	if (received_digest != valid_digest) {
547 		ICL_WARN("data digest check failed; got 0x%x, "
548 		    "should be 0x%x", received_digest, valid_digest);
549 		return (-1);
550 	}
551 
552 	return (0);
553 }
554 
/*
 * Somewhat contrary to the name, this attempts to receive only one
 * "part" of PDU at a time; call it repeatedly until it returns non-NULL.
 *
 * Implements the per-connection receive state machine:
 * BHS -> AHS -> header digest -> data segment -> data digest.
 * Each step consumes bytes from *availablep and sets ic_receive_len to
 * the size of the next part, so the caller knows how much socket data
 * to wait for.  Returns the completed PDU, or NULL when more data is
 * needed or the connection is being dropped.
 */
static struct icl_pdu *
icl_conn_receive_pdu(struct icl_conn *ic, size_t *availablep)
{
	struct icl_pdu *request;
	struct socket *so;
	size_t len;
	int error;
	bool more_needed;

	/* NOTE(review): "so" is assigned but otherwise unused here. */
	so = ic->ic_socket;

	if (ic->ic_receive_state == ICL_CONN_STATE_BHS) {
		KASSERT(ic->ic_receive_pdu == NULL,
		    ("ic->ic_receive_pdu != NULL"));
		/* Starting a new PDU; allocate one to fill in. */
		request = icl_pdu_new_empty(ic, M_NOWAIT);
		if (request == NULL) {
			ICL_DEBUG("failed to allocate PDU; "
			    "dropping connection");
			icl_conn_fail(ic);
			return (NULL);
		}
		ic->ic_receive_pdu = request;
	} else {
		KASSERT(ic->ic_receive_pdu != NULL,
		    ("ic->ic_receive_pdu == NULL"));
		/* Continue the partially received PDU. */
		request = ic->ic_receive_pdu;
	}

	/* The next part hasn't fully arrived yet; try again later. */
	if (*availablep < ic->ic_receive_len) {
#if 0
		ICL_DEBUG("not enough data; need %zd, "
		    "have %zd", ic->ic_receive_len, *availablep);
#endif
		return (NULL);
	}

	switch (ic->ic_receive_state) {
	case ICL_CONN_STATE_BHS:
		//ICL_DEBUG("receiving BHS");
		error = icl_pdu_receive_bhs(request, availablep);
		if (error != 0) {
			ICL_DEBUG("failed to receive BHS; "
			    "dropping connection");
			break;
		}

		/*
		 * We don't enforce any limit for AHS length;
		 * its length is stored in 8 bit field.
		 */

		/* Sanity-check the advertised data segment length. */
		len = icl_pdu_data_segment_length(request);
		if (len > ic->ic_max_data_segment_length) {
			ICL_WARN("received data segment "
			    "length %zd is larger than negotiated "
			    "MaxDataSegmentLength %zd; "
			    "dropping connection",
			    len, ic->ic_max_data_segment_length);
			error = EINVAL;
			break;
		}

		ic->ic_receive_state = ICL_CONN_STATE_AHS;
		ic->ic_receive_len = icl_pdu_ahs_length(request);
		break;

	case ICL_CONN_STATE_AHS:
		//ICL_DEBUG("receiving AHS");
		error = icl_pdu_receive_ahs(request, availablep);
		if (error != 0) {
			ICL_DEBUG("failed to receive AHS; "
			    "dropping connection");
			break;
		}
		ic->ic_receive_state = ICL_CONN_STATE_HEADER_DIGEST;
		/* Header digest is present only when negotiated. */
		if (ic->ic_header_crc32c == false)
			ic->ic_receive_len = 0;
		else
			ic->ic_receive_len = ISCSI_HEADER_DIGEST_SIZE;
		break;

	case ICL_CONN_STATE_HEADER_DIGEST:
		//ICL_DEBUG("receiving header digest");
		error = icl_pdu_check_header_digest(request, availablep);
		if (error != 0) {
			ICL_DEBUG("header digest failed; "
			    "dropping connection");
			break;
		}

		ic->ic_receive_state = ICL_CONN_STATE_DATA;
		ic->ic_receive_len =
		    icl_pdu_data_segment_receive_len(request);
		break;

	case ICL_CONN_STATE_DATA:
		//ICL_DEBUG("receiving data segment");
		error = icl_pdu_receive_data_segment(request, availablep,
		    &more_needed);
		if (error != 0) {
			ICL_DEBUG("failed to receive data segment;"
			    "dropping connection");
			break;
		}

		/* Segment incomplete; stay in DATA state and wait. */
		if (more_needed)
			break;

		ic->ic_receive_state = ICL_CONN_STATE_DATA_DIGEST;
		if (request->ip_data_len == 0 || ic->ic_data_crc32c == false)
			ic->ic_receive_len = 0;
		else
			ic->ic_receive_len = ISCSI_DATA_DIGEST_SIZE;
		break;

	case ICL_CONN_STATE_DATA_DIGEST:
		//ICL_DEBUG("receiving data digest");
		error = icl_pdu_check_data_digest(request, availablep);
		if (error != 0) {
			ICL_DEBUG("data digest failed; "
			    "dropping connection");
			break;
		}

		/*
		 * We've received complete PDU; reset the receive state machine
		 * and return the PDU.
		 */
		ic->ic_receive_state = ICL_CONN_STATE_BHS;
		ic->ic_receive_len = sizeof(struct iscsi_bhs);
		ic->ic_receive_pdu = NULL;
		return (request);

	default:
		panic("invalid ic_receive_state %d\n", ic->ic_receive_state);
	}

	if (error != 0) {
		/*
		 * Don't free the PDU; it's pointed to by ic->ic_receive_pdu
		 * and will get freed in icl_conn_close().
		 */
		icl_conn_fail(ic);
	}

	return (NULL);
}
706 
/*
 * Drain the socket buffer, assembling and dispatching as many complete
 * PDUs as "available" bytes allow.  Each complete PDU is handed to the
 * connection owner via ic->ic_receive().  Stops when the connection is
 * disconnecting, errors out, or there is not enough buffered data for
 * the next PDU part.
 */
static void
icl_conn_receive_pdus(struct icl_conn *ic, size_t available)
{
	struct icl_pdu *response;
	struct socket *so;

	so = ic->ic_socket;

	/*
	 * This can never happen; we're careful to only mess with ic->ic_socket
	 * pointer when the send/receive threads are not running.
	 */
	KASSERT(so != NULL, ("NULL socket"));

	for (;;) {
		if (ic->ic_disconnecting)
			return;

		if (so->so_error != 0) {
			ICL_DEBUG("connection error %d; "
			    "dropping connection", so->so_error);
			icl_conn_fail(ic);
			return;
		}

		/*
		 * Loop until we have a complete PDU or there is not enough
		 * data in the socket buffer.
		 */
		if (available < ic->ic_receive_len) {
#if 0
			ICL_DEBUG("not enough data; have %zd, "
			    "need %zd", available,
			    ic->ic_receive_len);
#endif
			return;
		}

		/* NULL means "partial progress"; loop to try the next part. */
		response = icl_conn_receive_pdu(ic, &available);
		if (response == NULL)
			continue;

		/* AHS is not implemented by this backend; drop the link. */
		if (response->ip_ahs_len > 0) {
			ICL_WARN("received PDU with unsupported "
			    "AHS; opcode 0x%x; dropping connection",
			    response->ip_bhs->bhs_opcode);
			icl_pdu_free(response);
			icl_conn_fail(ic);
			return;
		}

		(ic->ic_receive)(response);
	}
}
761 
/*
 * Per-connection receive kthread.  Sleeps on ic_receive_cv (signalled
 * by the socket upcall) until enough data for the next PDU part has
 * arrived, then processes it; exits when the connection starts
 * disconnecting, signalling ic_send_cv so the closer can observe
 * ic_receive_running going false.
 */
static void
icl_receive_thread(void *arg)
{
	struct icl_conn *ic;
	size_t available;
	struct socket *so;

	ic = arg;
	so = ic->ic_socket;

	ICL_CONN_LOCK(ic);
	ic->ic_receive_running = true;
	ICL_CONN_UNLOCK(ic);

	for (;;) {
		if (ic->ic_disconnecting) {
			//ICL_DEBUG("terminating");
			break;
		}

		/*
		 * Set the low watermark, to be checked by
		 * soreadable() in icl_soupcall_receive()
		 * to avoid unnecessary wakeups until there
		 * is enough data received to read the PDU.
		 */
		SOCKBUF_LOCK(&so->so_rcv);
		available = sbavail(&so->so_rcv);
		if (available < ic->ic_receive_len) {
			/* Sleep holding the sockbuf lock; upcall wakes us. */
			so->so_rcv.sb_lowat = ic->ic_receive_len;
			cv_wait(&ic->ic_receive_cv, &so->so_rcv.sb_mtx);
		} else
			so->so_rcv.sb_lowat = so->so_rcv.sb_hiwat + 1;
		SOCKBUF_UNLOCK(&so->so_rcv);

		icl_conn_receive_pdus(ic, available);
	}

	ICL_CONN_LOCK(ic);
	ic->ic_receive_running = false;
	cv_signal(&ic->ic_send_cv);
	ICL_CONN_UNLOCK(ic);
	kthread_exit();
}
806 
807 static int
808 icl_soupcall_receive(struct socket *so, void *arg, int waitflag)
809 {
810 	struct icl_conn *ic;
811 
812 	if (!soreadable(so))
813 		return (SU_OK);
814 
815 	ic = arg;
816 	cv_signal(&ic->ic_receive_cv);
817 	return (SU_OK);
818 }
819 
/*
 * Prepare a PDU for transmission: fill in DataSegmentLength, append
 * header digest, padding and data digest as negotiated, and concatenate
 * the data chain onto the BHS chain so a single mbuf chain can be
 * handed to sosend().  Returns 0 on success, 1 if an mbuf append failed.
 */
static int
icl_pdu_finalize(struct icl_pdu *request)
{
	size_t padding, pdu_len;
	uint32_t digest, zero = 0;
	int ok;
	struct icl_conn *ic;

	ic = request->ip_conn;

	icl_pdu_set_data_segment_length(request, request->ip_data_len);

	/* Compute total length before chains are merged below. */
	pdu_len = icl_pdu_size(request);

	if (ic->ic_header_crc32c) {
		/* Digest covers the BHS only (no AHS support). */
		digest = icl_mbuf_to_crc32c(request->ip_bhs_mbuf);
		ok = m_append(request->ip_bhs_mbuf, sizeof(digest),
		    (void *)&digest);
		if (ok != 1) {
			ICL_WARN("failed to append header digest");
			return (1);
		}
	}

	if (request->ip_data_len != 0) {
		/*
		 * Padding must be appended before the data digest is
		 * computed, since the digest covers padding too.
		 */
		padding = icl_pdu_padding(request);
		if (padding > 0) {
			ok = m_append(request->ip_data_mbuf, padding,
			    (void *)&zero);
			if (ok != 1) {
				ICL_WARN("failed to append padding");
				return (1);
			}
		}

		if (ic->ic_data_crc32c) {
			digest = icl_mbuf_to_crc32c(request->ip_data_mbuf);

			ok = m_append(request->ip_data_mbuf, sizeof(digest),
			    (void *)&digest);
			if (ok != 1) {
				ICL_WARN("failed to append data digest");
				return (1);
			}
		}

		/* Merge data into the BHS chain; ownership moves over. */
		m_cat(request->ip_bhs_mbuf, request->ip_data_mbuf);
		request->ip_data_mbuf = NULL;
	}

	request->ip_bhs_mbuf->m_pkthdr.len = pdu_len;

	return (0);
}
874 
875 static void
876 icl_conn_send_pdus(struct icl_conn *ic, struct icl_pdu_stailq *queue)
877 {
878 	struct icl_pdu *request, *request2;
879 	struct socket *so;
880 	size_t available, size, size2;
881 	int coalesced, error;
882 
883 	ICL_CONN_LOCK_ASSERT_NOT(ic);
884 
885 	so = ic->ic_socket;
886 
887 	SOCKBUF_LOCK(&so->so_snd);
888 	/*
889 	 * Check how much space do we have for transmit.  We can't just
890 	 * call sosend() and retry when we get EWOULDBLOCK or EMSGSIZE,
891 	 * as it always frees the mbuf chain passed to it, even in case
892 	 * of error.
893 	 */
894 	available = sbspace(&so->so_snd);
895 
896 	/*
897 	 * Notify the socket upcall that we don't need wakeups
898 	 * for the time being.
899 	 */
900 	so->so_snd.sb_lowat = so->so_snd.sb_hiwat + 1;
901 	SOCKBUF_UNLOCK(&so->so_snd);
902 
903 	while (!STAILQ_EMPTY(queue)) {
904 		request = STAILQ_FIRST(queue);
905 		size = icl_pdu_size(request);
906 		if (available < size) {
907 
908 			/*
909 			 * Set the low watermark, to be checked by
910 			 * sowriteable() in icl_soupcall_send()
911 			 * to avoid unneccessary wakeups until there
912 			 * is enough space for the PDU to fit.
913 			 */
914 			SOCKBUF_LOCK(&so->so_snd);
915 			available = sbspace(&so->so_snd);
916 			if (available < size) {
917 #if 1
918 				ICL_DEBUG("no space to send; "
919 				    "have %zd, need %zd",
920 				    available, size);
921 #endif
922 				so->so_snd.sb_lowat = size;
923 				SOCKBUF_UNLOCK(&so->so_snd);
924 				return;
925 			}
926 			SOCKBUF_UNLOCK(&so->so_snd);
927 		}
928 		STAILQ_REMOVE_HEAD(queue, ip_next);
929 		error = icl_pdu_finalize(request);
930 		if (error != 0) {
931 			ICL_DEBUG("failed to finalize PDU; "
932 			    "dropping connection");
933 			icl_conn_fail(ic);
934 			icl_pdu_free(request);
935 			return;
936 		}
937 		if (coalesce) {
938 			coalesced = 1;
939 			for (;;) {
940 				request2 = STAILQ_FIRST(queue);
941 				if (request2 == NULL)
942 					break;
943 				size2 = icl_pdu_size(request2);
944 				if (available < size + size2)
945 					break;
946 				STAILQ_REMOVE_HEAD(queue, ip_next);
947 				error = icl_pdu_finalize(request2);
948 				if (error != 0) {
949 					ICL_DEBUG("failed to finalize PDU; "
950 					    "dropping connection");
951 					icl_conn_fail(ic);
952 					icl_pdu_free(request);
953 					icl_pdu_free(request2);
954 					return;
955 				}
956 				m_cat(request->ip_bhs_mbuf, request2->ip_bhs_mbuf);
957 				request2->ip_bhs_mbuf = NULL;
958 				request->ip_bhs_mbuf->m_pkthdr.len += size2;
959 				size += size2;
960 				STAILQ_REMOVE_AFTER(queue, request, ip_next);
961 				icl_pdu_free(request2);
962 				coalesced++;
963 			}
964 #if 0
965 			if (coalesced > 1) {
966 				ICL_DEBUG("coalesced %d PDUs into %zd bytes",
967 				    coalesced, size);
968 			}
969 #endif
970 		}
971 		available -= size;
972 		error = sosend(so, NULL, NULL, request->ip_bhs_mbuf,
973 		    NULL, MSG_DONTWAIT, curthread);
974 		request->ip_bhs_mbuf = NULL; /* Sosend consumes the mbuf. */
975 		if (error != 0) {
976 			ICL_DEBUG("failed to send PDU, error %d; "
977 			    "dropping connection", error);
978 			icl_conn_fail(ic);
979 			icl_pdu_free(request);
980 			return;
981 		}
982 		icl_pdu_free(request);
983 	}
984 }
985 
/*
 * Per-connection send kthread.  Moves PDUs from the connection's main
 * ic_to_send queue onto a local queue (so icl_conn_send_pdus() can run
 * unlocked), sends what fits into the socket buffer, and sleeps on
 * ic_send_cv until woken by icl_pdu_queue() or the socket send upcall.
 */
static void
icl_send_thread(void *arg)
{
	struct icl_conn *ic;
	struct icl_pdu_stailq queue;

	ic = arg;

	STAILQ_INIT(&queue);

	ICL_CONN_LOCK(ic);
	ic->ic_send_running = true;

	for (;;) {
		for (;;) {
			/*
			 * If the local queue is empty, populate it from
			 * the main one.  This way the icl_conn_send_pdus()
			 * can go through all the queued PDUs without holding
			 * any locks.
			 */
			if (STAILQ_EMPTY(&queue))
				STAILQ_SWAP(&ic->ic_to_send, &queue, icl_pdu);

			ic->ic_check_send_space = false;
			ICL_CONN_UNLOCK(ic);
			icl_conn_send_pdus(ic, &queue);
			ICL_CONN_LOCK(ic);

			/*
			 * The icl_soupcall_send() was called since the last
			 * call to sbspace(); go around;
			 */
			if (ic->ic_check_send_space)
				continue;

			/*
			 * Local queue is empty, but we still have PDUs
			 * in the main one; go around.
			 */
			if (STAILQ_EMPTY(&queue) &&
			    !STAILQ_EMPTY(&ic->ic_to_send))
				continue;

			/*
			 * There might be some stuff in the local queue,
			 * which didn't get sent due to not having enough send
			 * space.  Wait for socket upcall.
			 */
			break;
		}

		if (ic->ic_disconnecting) {
			//ICL_DEBUG("terminating");
			break;
		}

		/* Sleep until new PDUs are queued or send space opens up. */
		cv_wait(&ic->ic_send_cv, ic->ic_lock);
	}

	/*
	 * We're exiting; move PDUs back to the main queue, so they can
	 * get freed properly.  At this point ordering doesn't matter.
	 */
	STAILQ_CONCAT(&ic->ic_to_send, &queue);

	ic->ic_send_running = false;
	cv_signal(&ic->ic_send_cv);
	ICL_CONN_UNLOCK(ic);
	kthread_exit();
}
1057 
1058 static int
1059 icl_soupcall_send(struct socket *so, void *arg, int waitflag)
1060 {
1061 	struct icl_conn *ic;
1062 
1063 	if (!sowriteable(so))
1064 		return (SU_OK);
1065 
1066 	ic = arg;
1067 
1068 	ICL_CONN_LOCK(ic);
1069 	ic->ic_check_send_space = true;
1070 	ICL_CONN_UNLOCK(ic);
1071 
1072 	cv_signal(&ic->ic_send_cv);
1073 
1074 	return (SU_OK);
1075 }
1076 
1077 static int
1078 icl_pdu_append_data(struct icl_pdu *request, const void *addr, size_t len,
1079     int flags)
1080 {
1081 	struct mbuf *mb, *newmb;
1082 	size_t copylen, off = 0;
1083 
1084 	KASSERT(len > 0, ("len == 0"));
1085 
1086 	newmb = m_getm2(NULL, len, flags, MT_DATA, M_PKTHDR);
1087 	if (newmb == NULL) {
1088 		ICL_WARN("failed to allocate mbuf for %zd bytes", len);
1089 		return (ENOMEM);
1090 	}
1091 
1092 	for (mb = newmb; mb != NULL; mb = mb->m_next) {
1093 		copylen = min(M_TRAILINGSPACE(mb), len - off);
1094 		memcpy(mtod(mb, char *), (const char *)addr + off, copylen);
1095 		mb->m_len = copylen;
1096 		off += copylen;
1097 	}
1098 	KASSERT(off == len, ("%s: off != len", __func__));
1099 
1100 	if (request->ip_data_mbuf == NULL) {
1101 		request->ip_data_mbuf = newmb;
1102 		request->ip_data_len = len;
1103 	} else {
1104 		m_cat(request->ip_data_mbuf, newmb);
1105 		request->ip_data_len += len;
1106 	}
1107 
1108 	return (0);
1109 }
1110 
1111 int
1112 icl_soft_conn_pdu_append_data(struct icl_conn *ic, struct icl_pdu *request,
1113     const void *addr, size_t len, int flags)
1114 {
1115 
1116 	return (icl_pdu_append_data(request, addr, len, flags));
1117 }
1118 
1119 static void
1120 icl_pdu_get_data(struct icl_pdu *ip, size_t off, void *addr, size_t len)
1121 {
1122 
1123 	m_copydata(ip->ip_data_mbuf, off, len, addr);
1124 }
1125 
1126 void
1127 icl_soft_conn_pdu_get_data(struct icl_conn *ic, struct icl_pdu *ip,
1128     size_t off, void *addr, size_t len)
1129 {
1130 
1131 	return (icl_pdu_get_data(ip, off, addr, len));
1132 }
1133 
/*
 * Queue a PDU for transmission on its connection and wake the send
 * thread if needed.  Must be called with the connection lock held.
 * Frees the PDU immediately when the connection is already going away.
 */
static void
icl_pdu_queue(struct icl_pdu *ip)
{
	struct icl_conn *ic;

	ic = ip->ip_conn;

	ICL_CONN_LOCK_ASSERT(ic);

	if (ic->ic_disconnecting || ic->ic_socket == NULL) {
		ICL_DEBUG("icl_pdu_queue on closed connection");
		icl_pdu_free(ip);
		return;
	}

	if (!STAILQ_EMPTY(&ic->ic_to_send)) {
		STAILQ_INSERT_TAIL(&ic->ic_to_send, ip, ip_next);
		/*
		 * If the queue is not empty, someone else had already
		 * signaled the send thread; no need to do that again,
		 * just return.
		 */
		return;
	}

	STAILQ_INSERT_TAIL(&ic->ic_to_send, ip, ip_next);
	cv_signal(&ic->ic_send_cv);
}
1162 
/*
 * icl_conn method wrapper around icl_pdu_queue(); the connection
 * lock must be held by the caller.
 */
void
icl_soft_conn_pdu_queue(struct icl_conn *ic, struct icl_pdu *ip)
{

	icl_pdu_queue(ip);
}
1169 
/*
 * Allocate and initialize a new soft-backend connection object.
 * "name" is used as the prefix for the kthread names; "lock" is
 * supplied (and owned) by the caller.  Never fails: allocation is
 * M_WAITOK.
 */
static struct icl_conn *
icl_soft_new_conn(const char *name, struct mtx *lock)
{
	struct icl_conn *ic;

	/* Module-wide connection count; blocks icl_soft_unload(). */
	refcount_acquire(&icl_ncons);

	ic = (struct icl_conn *)kobj_create(&icl_soft_class, M_ICL_SOFT, M_WAITOK | M_ZERO);

	STAILQ_INIT(&ic->ic_to_send);
	ic->ic_lock = lock;
	cv_init(&ic->ic_send_cv, "icl_tx");
	cv_init(&ic->ic_receive_cv, "icl_rx");
#ifdef DIAGNOSTIC
	/* PDU leak accounting, checked at close time. */
	refcount_init(&ic->ic_outstanding_pdus, 0);
#endif
	ic->ic_max_data_segment_length = ICL_MAX_DATA_SEGMENT_LENGTH;
	ic->ic_name = name;
	ic->ic_offload = "none";

	return (ic);
}
1192 
/*
 * Counterpart to icl_soft_new_conn(): destroy the condition variables,
 * free the kobj, and drop the module-wide connection count.  Caller
 * must have closed the connection first.
 */
void
icl_soft_conn_free(struct icl_conn *ic)
{

	cv_destroy(&ic->ic_send_cv);
	cv_destroy(&ic->ic_receive_cv);
	kobj_delete((struct kobj *)ic, M_ICL_SOFT);
	refcount_release(&icl_ncons);
}
1202 
/*
 * Finish bringing up a connection whose socket has just been handed
 * off: reset receive state, size the socket buffers, disable Nagle,
 * start the send/receive kthreads, and register socket upcalls.
 * On any failure the connection is closed via icl_conn_close() and
 * an errno is returned.
 */
static int
icl_conn_start(struct icl_conn *ic)
{
	size_t minspace;
	struct sockopt opt;
	int error, one = 1;

	ICL_CONN_LOCK(ic);

	/*
	 * XXX: Ugly hack.
	 */
	if (ic->ic_socket == NULL) {
		ICL_CONN_UNLOCK(ic);
		return (EINVAL);
	}

	/* The receive state machine starts by expecting a bare BHS. */
	ic->ic_receive_state = ICL_CONN_STATE_BHS;
	ic->ic_receive_len = sizeof(struct iscsi_bhs);
	ic->ic_disconnecting = false;

	ICL_CONN_UNLOCK(ic);

	/*
	 * For sendspace, this is required because the current code cannot
	 * send a PDU in pieces; thus, the minimum buffer size is equal
	 * to the maximum PDU size.  "+4" is to account for possible padding.
	 *
	 * What we should actually do here is to use autoscaling, but set
	 * some minimal buffer size to "minspace".  I don't know a way to do
	 * that, though.
	 */
	minspace = sizeof(struct iscsi_bhs) + ic->ic_max_data_segment_length +
	    ISCSI_HEADER_DIGEST_SIZE + ISCSI_DATA_DIGEST_SIZE + 4;
	/* NOTE: writes the global tunables back; last connection wins. */
	if (sendspace < minspace) {
		ICL_WARN("kern.icl.sendspace too low; must be at least %zd",
		    minspace);
		sendspace = minspace;
	}
	if (recvspace < minspace) {
		ICL_WARN("kern.icl.recvspace too low; must be at least %zd",
		    minspace);
		recvspace = minspace;
	}

	error = soreserve(ic->ic_socket, sendspace, recvspace);
	if (error != 0) {
		ICL_WARN("soreserve failed with error %d", error);
		icl_conn_close(ic);
		return (error);
	}
	/* Let the buffers grow beyond the reservation if needed. */
	ic->ic_socket->so_snd.sb_flags |= SB_AUTOSIZE;
	ic->ic_socket->so_rcv.sb_flags |= SB_AUTOSIZE;

	/*
	 * Disable Nagle.
	 */
	bzero(&opt, sizeof(opt));
	opt.sopt_dir = SOPT_SET;
	opt.sopt_level = IPPROTO_TCP;
	opt.sopt_name = TCP_NODELAY;
	opt.sopt_val = &one;
	opt.sopt_valsize = sizeof(one);
	error = sosetopt(ic->ic_socket, &opt);
	if (error != 0) {
		ICL_WARN("disabling TCP_NODELAY failed with error %d", error);
		icl_conn_close(ic);
		return (error);
	}

	/*
	 * Start threads.
	 */
	error = kthread_add(icl_send_thread, ic, NULL, NULL, 0, 0, "%stx",
	    ic->ic_name);
	if (error != 0) {
		ICL_WARN("kthread_add(9) failed with error %d", error);
		icl_conn_close(ic);
		return (error);
	}

	/*
	 * NOTE(review): if this second kthread_add() fails, the send
	 * thread already runs; icl_conn_close() below waits for it.
	 */
	error = kthread_add(icl_receive_thread, ic, NULL, NULL, 0, 0, "%srx",
	    ic->ic_name);
	if (error != 0) {
		ICL_WARN("kthread_add(9) failed with error %d", error);
		icl_conn_close(ic);
		return (error);
	}

	/*
	 * Register socket upcall, to get notified about incoming PDUs
	 * and free space to send outgoing ones.
	 */
	SOCKBUF_LOCK(&ic->ic_socket->so_snd);
	soupcall_set(ic->ic_socket, SO_SND, icl_soupcall_send, ic);
	SOCKBUF_UNLOCK(&ic->ic_socket->so_snd);
	SOCKBUF_LOCK(&ic->ic_socket->so_rcv);
	soupcall_set(ic->ic_socket, SO_RCV, icl_soupcall_receive, ic);
	SOCKBUF_UNLOCK(&ic->ic_socket->so_rcv);

	return (0);
}
1305 
1306 int
1307 icl_soft_conn_handoff(struct icl_conn *ic, int fd)
1308 {
1309 	struct file *fp;
1310 	struct socket *so;
1311 	cap_rights_t rights;
1312 	int error;
1313 
1314 	ICL_CONN_LOCK_ASSERT_NOT(ic);
1315 
1316 	/*
1317 	 * Steal the socket from userland.
1318 	 */
1319 	error = fget(curthread, fd,
1320 	    cap_rights_init(&rights, CAP_SOCK_CLIENT), &fp);
1321 	if (error != 0)
1322 		return (error);
1323 	if (fp->f_type != DTYPE_SOCKET) {
1324 		fdrop(fp, curthread);
1325 		return (EINVAL);
1326 	}
1327 	so = fp->f_data;
1328 	if (so->so_type != SOCK_STREAM) {
1329 		fdrop(fp, curthread);
1330 		return (EINVAL);
1331 	}
1332 
1333 	ICL_CONN_LOCK(ic);
1334 
1335 	if (ic->ic_socket != NULL) {
1336 		ICL_CONN_UNLOCK(ic);
1337 		fdrop(fp, curthread);
1338 		return (EBUSY);
1339 	}
1340 
1341 	ic->ic_socket = fp->f_data;
1342 	fp->f_ops = &badfileops;
1343 	fp->f_data = NULL;
1344 	fdrop(fp, curthread);
1345 	ICL_CONN_UNLOCK(ic);
1346 
1347 	error = icl_conn_start(ic);
1348 
1349 	return (error);
1350 }
1351 
/*
 * Tear down the connection: deregister socket upcalls, wait for the
 * send and receive threads to exit, close the socket, and free any
 * partially received or still-queued PDUs.  Idempotent: returns
 * immediately if the socket is already gone.
 */
void
icl_conn_close(struct icl_conn *ic)
{
	struct icl_pdu *pdu;

	ICL_CONN_LOCK_ASSERT_NOT(ic);

	ICL_CONN_LOCK(ic);
	if (ic->ic_socket == NULL) {
		ICL_CONN_UNLOCK(ic);
		return;
	}

	/*
	 * Deregister socket upcalls.  The connection lock is dropped
	 * while taking the sockbuf locks, so upcalls may still fire
	 * until both have been cleared.
	 */
	ICL_CONN_UNLOCK(ic);
	SOCKBUF_LOCK(&ic->ic_socket->so_snd);
	if (ic->ic_socket->so_snd.sb_upcall != NULL)
		soupcall_clear(ic->ic_socket, SO_SND);
	SOCKBUF_UNLOCK(&ic->ic_socket->so_snd);
	SOCKBUF_LOCK(&ic->ic_socket->so_rcv);
	if (ic->ic_socket->so_rcv.sb_upcall != NULL)
		soupcall_clear(ic->ic_socket, SO_RCV);
	SOCKBUF_UNLOCK(&ic->ic_socket->so_rcv);
	ICL_CONN_LOCK(ic);

	ic->ic_disconnecting = true;

	/*
	 * Wake up the threads, so they can properly terminate.
	 * NOTE(review): presumably each exiting thread signals
	 * ic_send_cv to wake this loop — confirm against the
	 * thread functions.
	 */
	while (ic->ic_receive_running || ic->ic_send_running) {
		//ICL_DEBUG("waiting for send/receive threads to terminate");
		cv_signal(&ic->ic_receive_cv);
		cv_signal(&ic->ic_send_cv);
		cv_wait(&ic->ic_send_cv, ic->ic_lock);
	}
	//ICL_DEBUG("send/receive threads terminated");

	/* soclose() may sleep; drop the lock around it. */
	ICL_CONN_UNLOCK(ic);
	soclose(ic->ic_socket);
	ICL_CONN_LOCK(ic);
	ic->ic_socket = NULL;

	if (ic->ic_receive_pdu != NULL) {
		//ICL_DEBUG("freeing partially received PDU");
		icl_pdu_free(ic->ic_receive_pdu);
		ic->ic_receive_pdu = NULL;
	}

	/*
	 * Remove any outstanding PDUs from the send queue.
	 */
	while (!STAILQ_EMPTY(&ic->ic_to_send)) {
		pdu = STAILQ_FIRST(&ic->ic_to_send);
		STAILQ_REMOVE_HEAD(&ic->ic_to_send, ip_next);
		icl_pdu_free(pdu);
	}

	KASSERT(STAILQ_EMPTY(&ic->ic_to_send),
	    ("destroying session with non-empty send queue"));
#ifdef DIAGNOSTIC
	KASSERT(ic->ic_outstanding_pdus == 0,
	    ("destroying session with %d outstanding PDUs",
	     ic->ic_outstanding_pdus));
#endif
	ICL_CONN_UNLOCK(ic);
}
1421 
/*
 * icl_conn method wrapper around icl_conn_close().
 */
void
icl_soft_conn_close(struct icl_conn *ic)
{

	icl_conn_close(ic);
}
1428 
1429 bool
1430 icl_soft_conn_connected(struct icl_conn *ic)
1431 {
1432 	ICL_CONN_LOCK_ASSERT_NOT(ic);
1433 
1434 	ICL_CONN_LOCK(ic);
1435 	if (ic->ic_socket == NULL) {
1436 		ICL_CONN_UNLOCK(ic);
1437 		return (false);
1438 	}
1439 	if (ic->ic_socket->so_error != 0) {
1440 		ICL_CONN_UNLOCK(ic);
1441 		return (false);
1442 	}
1443 	ICL_CONN_UNLOCK(ic);
1444 	return (true);
1445 }
1446 
/*
 * Report the backend's maximum supported data segment length via
 * "limitp".  Always succeeds (returns 0).
 */
static int
icl_soft_limits(size_t *limitp)
{
	/* Fixed 128 kB cap for the software backend. */
	const size_t soft_limit = 128 * 1024;

	*limitp = soft_limit;

	return (0);
}
1455 
#ifdef ICL_KERNEL_PROXY
/*
 * Kernel-proxy variant of the handoff: accept an in-kernel socket
 * directly instead of stealing a descriptor from userland, then
 * start the connection on it.  Returns 0 or an errno; EBUSY if the
 * connection already has a socket.
 */
int
icl_conn_handoff_sock(struct icl_conn *ic, struct socket *so)
{
	int error;

	ICL_CONN_LOCK_ASSERT_NOT(ic);

	/* Only stream sockets can carry iSCSI PDUs. */
	if (so->so_type != SOCK_STREAM)
		return (EINVAL);

	ICL_CONN_LOCK(ic);
	if (ic->ic_socket != NULL) {
		ICL_CONN_UNLOCK(ic);
		return (EBUSY);
	}
	ic->ic_socket = so;
	ICL_CONN_UNLOCK(ic);

	error = icl_conn_start(ic);

	return (error);
}
#endif /* ICL_KERNEL_PROXY */
1480 
/*
 * Module load handler: create the PDU UMA zone, reset the connection
 * count, and register this backend with the ICL core.
 */
static int
icl_soft_load(void)
{
	int error;

	icl_pdu_zone = uma_zcreate("icl_pdu",
	    sizeof(struct icl_pdu), NULL, NULL, NULL, NULL,
	    UMA_ALIGN_PTR, 0);
	refcount_init(&icl_ncons, 0);

	/*
	 * The reason we call this "none" is that to the user,
	 * it's known as "offload driver"; "offload driver: soft"
	 * doesn't make much sense.
	 */
	error = icl_register("none", 0, icl_soft_limits, icl_soft_new_conn);
	/* Registration is not expected to fail; error is still returned. */
	KASSERT(error == 0, ("failed to register"));

	return (error);
}
1501 
/*
 * Module unload handler: refuse while any connection exists, then
 * deregister from the ICL core and destroy the PDU zone.
 */
static int
icl_soft_unload(void)
{

	if (icl_ncons != 0)
		return (EBUSY);

	icl_unregister("none");

	uma_zdestroy(icl_pdu_zone);

	return (0);
}
1515 
1516 static int
1517 icl_soft_modevent(module_t mod, int what, void *arg)
1518 {
1519 
1520 	switch (what) {
1521 	case MOD_LOAD:
1522 		return (icl_soft_load());
1523 	case MOD_UNLOAD:
1524 		return (icl_soft_unload());
1525 	default:
1526 		return (EINVAL);
1527 	}
1528 }
1529 
1530 moduledata_t icl_soft_data = {
1531 	"icl_soft",
1532 	icl_soft_modevent,
1533 	0
1534 };
1535 
1536 DECLARE_MODULE(icl_soft, icl_soft_data, SI_SUB_DRIVERS, SI_ORDER_MIDDLE);
1537 MODULE_DEPEND(icl_soft, icl, 1, 1, 1);
1538 MODULE_VERSION(icl_soft, 1);
1539