xref: /freebsd/sys/dev/iscsi/icl.c (revision 8d20be1e22095c27faf8fe8b2f0d089739cc742e)
1 /*-
2  * Copyright (c) 2012 The FreeBSD Foundation
3  * All rights reserved.
4  *
5  * This software was developed by Edward Tomasz Napierala under sponsorship
6  * from the FreeBSD Foundation.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
18  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
21  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27  * SUCH DAMAGE.
28  *
29  * $FreeBSD$
30  */
31 
32 /*
33  * iSCSI Common Layer.  It's used by both the initiator and target to send
34  * and receive iSCSI PDUs.
35  */
36 
37 #include <sys/param.h>
38 #include <sys/capability.h>
39 #include <sys/condvar.h>
40 #include <sys/conf.h>
41 #include <sys/file.h>
42 #include <sys/kernel.h>
43 #include <sys/kthread.h>
44 #include <sys/lock.h>
45 #include <sys/mbuf.h>
46 #include <sys/mutex.h>
47 #include <sys/module.h>
48 #include <sys/socket.h>
49 #include <sys/socketvar.h>
50 #include <sys/sysctl.h>
51 #include <sys/systm.h>
52 #include <sys/sx.h>
53 #include <sys/uio.h>
54 #include <vm/uma.h>
55 #include <netinet/in.h>
56 #include <netinet/tcp.h>
57 
58 #include "icl.h"
59 #include "iscsi_proto.h"
60 
61 SYSCTL_NODE(_kern, OID_AUTO, icl, CTLFLAG_RD, 0, "iSCSI Common Layer");
62 static int debug = 1;
63 TUNABLE_INT("kern.icl.debug", &debug);
64 SYSCTL_INT(_kern_icl, OID_AUTO, debug, CTLFLAG_RW,
65     &debug, 1, "Enable debug messages");
66 static int partial_receive_len = 1 * 1024; /* XXX: More? */
67 TUNABLE_INT("kern.icl.partial_receive_len", &partial_receive_len);
68 SYSCTL_INT(_kern_icl, OID_AUTO, partial_receive_len, CTLFLAG_RW,
69     &partial_receive_len, 1 * 1024, "Minimum read size for partially received "
70     "data segment");
71 
72 static uma_zone_t icl_conn_zone;
73 static uma_zone_t icl_pdu_zone;
74 
75 static volatile u_int	icl_ncons;
76 
/*
 * Logging helpers gated on the "debug" knob: ICL_DEBUG() prints when
 * debug > 1, ICL_WARN() when debug > 0.  Both prefix the message with the
 * calling function's name and append a newline.
 *
 * The bodies are wrapped in do { } while (0) so that each invocation
 * expands to exactly one statement; without the leading "do" the macro
 * expanded to an if-statement followed by a stray "while (0);", which
 * breaks when used as the body of an unbraced if/else.
 */
#define	ICL_DEBUG(X, ...)					\
	do {							\
		if (debug > 1) {				\
			printf("%s: " X "\n",			\
			    __func__, ## __VA_ARGS__);		\
		}						\
	} while (0)

#define	ICL_WARN(X, ...)					\
	do {							\
		if (debug > 0) {				\
			printf("WARNING: %s: " X "\n",		\
			    __func__, ## __VA_ARGS__);		\
		}						\
	} while (0)
87 
/*
 * Connection mutex helpers.  X is a pointer to struct icl_conn; it is
 * parenthesized so that non-trivial expressions expand correctly.
 */
#define ICL_CONN_LOCK(X)		mtx_lock(&(X)->ic_lock)
#define ICL_CONN_UNLOCK(X)		mtx_unlock(&(X)->ic_lock)
#define ICL_CONN_LOCK_ASSERT(X)		mtx_assert(&(X)->ic_lock, MA_OWNED)
91 
92 static void
93 icl_conn_fail(struct icl_conn *ic)
94 {
95 	if (ic->ic_socket == NULL)
96 		return;
97 
98 	/*
99 	 * XXX
100 	 */
101 	ic->ic_socket->so_error = EDOOFUS;
102 	(ic->ic_error)(ic);
103 }
104 
105 static struct mbuf *
106 icl_conn_receive(struct icl_conn *ic, size_t len)
107 {
108 	struct uio uio;
109 	struct socket *so;
110 	struct mbuf *m;
111 	int error, flags;
112 
113 	so = ic->ic_socket;
114 
115 	memset(&uio, 0, sizeof(uio));
116 	uio.uio_resid = len;
117 
118 	flags = MSG_DONTWAIT;
119 	error = soreceive(so, NULL, &uio, &m, NULL, &flags);
120 	if (error != 0) {
121 		ICL_DEBUG("soreceive error %d", error);
122 		return (NULL);
123 	}
124 	if (uio.uio_resid != 0) {
125 		m_freem(m);
126 		ICL_DEBUG("short read");
127 		return (NULL);
128 	}
129 
130 	return (m);
131 }
132 
133 static struct icl_pdu *
134 icl_pdu_new(struct icl_conn *ic, int flags)
135 {
136 	struct icl_pdu *ip;
137 
138 	refcount_acquire(&ic->ic_outstanding_pdus);
139 	ip = uma_zalloc(icl_pdu_zone, flags | M_ZERO);
140 	if (ip == NULL) {
141 		ICL_WARN("failed to allocate %zd bytes", sizeof(*ip));
142 		refcount_release(&ic->ic_outstanding_pdus);
143 		return (NULL);
144 	}
145 
146 	ip->ip_conn = ic;
147 
148 	return (ip);
149 }
150 
151 void
152 icl_pdu_free(struct icl_pdu *ip)
153 {
154 	struct icl_conn *ic;
155 
156 	ic = ip->ip_conn;
157 
158 	m_freem(ip->ip_bhs_mbuf);
159 	m_freem(ip->ip_ahs_mbuf);
160 	m_freem(ip->ip_data_mbuf);
161 	uma_zfree(icl_pdu_zone, ip);
162 	refcount_release(&ic->ic_outstanding_pdus);
163 }
164 
165 /*
166  * Allocate icl_pdu with empty BHS to fill up by the caller.
167  */
168 struct icl_pdu *
169 icl_pdu_new_bhs(struct icl_conn *ic, int flags)
170 {
171 	struct icl_pdu *ip;
172 
173 	ip = icl_pdu_new(ic, flags);
174 	if (ip == NULL)
175 		return (NULL);
176 
177 	ip->ip_bhs_mbuf = m_getm2(NULL, sizeof(struct iscsi_bhs),
178 	    flags, MT_DATA, M_PKTHDR);
179 	if (ip->ip_bhs_mbuf == NULL) {
180 		ICL_WARN("failed to allocate %zd bytes", sizeof(*ip));
181 		icl_pdu_free(ip);
182 		return (NULL);
183 	}
184 	ip->ip_bhs = mtod(ip->ip_bhs_mbuf, struct iscsi_bhs *);
185 	memset(ip->ip_bhs, 0, sizeof(struct iscsi_bhs));
186 	ip->ip_bhs_mbuf->m_len = sizeof(struct iscsi_bhs);
187 
188 	return (ip);
189 }
190 
191 static int
192 icl_pdu_ahs_length(const struct icl_pdu *request)
193 {
194 
195 	return (request->ip_bhs->bhs_total_ahs_len * 4);
196 }
197 
198 size_t
199 icl_pdu_data_segment_length(const struct icl_pdu *request)
200 {
201 	uint32_t len = 0;
202 
203 	len += request->ip_bhs->bhs_data_segment_len[0];
204 	len <<= 8;
205 	len += request->ip_bhs->bhs_data_segment_len[1];
206 	len <<= 8;
207 	len += request->ip_bhs->bhs_data_segment_len[2];
208 
209 	return (len);
210 }
211 
212 static void
213 icl_pdu_set_data_segment_length(struct icl_pdu *response, uint32_t len)
214 {
215 
216 	response->ip_bhs->bhs_data_segment_len[2] = len;
217 	response->ip_bhs->bhs_data_segment_len[1] = len >> 8;
218 	response->ip_bhs->bhs_data_segment_len[0] = len >> 16;
219 }
220 
221 static size_t
222 icl_pdu_padding(const struct icl_pdu *ip)
223 {
224 
225 	if ((ip->ip_data_len % 4) != 0)
226 		return (4 - (ip->ip_data_len % 4));
227 
228 	return (0);
229 }
230 
231 static size_t
232 icl_pdu_size(const struct icl_pdu *response)
233 {
234 	size_t len;
235 
236 	KASSERT(response->ip_ahs_len == 0, ("responding with AHS"));
237 
238 	len = sizeof(struct iscsi_bhs) + response->ip_data_len +
239 	    icl_pdu_padding(response);
240 	if (response->ip_conn->ic_header_crc32c)
241 		len += ISCSI_HEADER_DIGEST_SIZE;
242 	if (response->ip_conn->ic_data_crc32c)
243 		len += ISCSI_DATA_DIGEST_SIZE;
244 
245 	return (len);
246 }
247 
248 static int
249 icl_pdu_receive_bhs(struct icl_pdu *request, size_t *availablep)
250 {
251 	struct mbuf *m;
252 
253 	m = icl_conn_receive(request->ip_conn, sizeof(struct iscsi_bhs));
254 	if (m == NULL) {
255 		ICL_DEBUG("failed to receive BHS");
256 		return (-1);
257 	}
258 
259 	request->ip_bhs_mbuf = m_pullup(m, sizeof(struct iscsi_bhs));
260 	if (request->ip_bhs_mbuf == NULL) {
261 		ICL_WARN("m_pullup failed");
262 		return (-1);
263 	}
264 	request->ip_bhs = mtod(request->ip_bhs_mbuf, struct iscsi_bhs *);
265 
266 	/*
267 	 * XXX: For architectures with strict alignment requirements
268 	 * 	we may need to allocate ip_bhs and copy the data into it.
269 	 * 	For some reason, though, not doing this doesn't seem
270 	 * 	to cause problems; tested on sparc64.
271 	 */
272 
273 	*availablep -= sizeof(struct iscsi_bhs);
274 	return (0);
275 }
276 
277 static int
278 icl_pdu_receive_ahs(struct icl_pdu *request, size_t *availablep)
279 {
280 
281 	request->ip_ahs_len = icl_pdu_ahs_length(request);
282 	if (request->ip_ahs_len == 0)
283 		return (0);
284 
285 	request->ip_ahs_mbuf = icl_conn_receive(request->ip_conn,
286 	    request->ip_ahs_len);
287 	if (request->ip_ahs_mbuf == NULL) {
288 		ICL_DEBUG("failed to receive AHS");
289 		return (-1);
290 	}
291 
292 	*availablep -= request->ip_ahs_len;
293 	return (0);
294 }
295 
296 static uint32_t
297 icl_mbuf_to_crc32c(const struct mbuf *m0)
298 {
299 	uint32_t digest = 0xffffffff;
300 	const struct mbuf *m;
301 
302 	for (m = m0; m != NULL; m = m->m_next)
303 		digest = calculate_crc32c(digest,
304 		    mtod(m, const void *), m->m_len);
305 
306 	digest = digest ^ 0xffffffff;
307 
308 	return (digest);
309 }
310 
311 static int
312 icl_pdu_check_header_digest(struct icl_pdu *request, size_t *availablep)
313 {
314 	struct mbuf *m;
315 	uint32_t received_digest, valid_digest;
316 
317 	if (request->ip_conn->ic_header_crc32c == false)
318 		return (0);
319 
320 	m = icl_conn_receive(request->ip_conn, ISCSI_HEADER_DIGEST_SIZE);
321 	if (m == NULL) {
322 		ICL_DEBUG("failed to receive header digest");
323 		return (-1);
324 	}
325 
326 	CTASSERT(sizeof(received_digest) == ISCSI_HEADER_DIGEST_SIZE);
327 	memcpy(&received_digest, mtod(m, void *), ISCSI_HEADER_DIGEST_SIZE);
328 	m_freem(m);
329 
330 	*availablep -= ISCSI_HEADER_DIGEST_SIZE;
331 
332 	/*
333 	 * XXX: Handle AHS.
334 	 */
335 	valid_digest = icl_mbuf_to_crc32c(request->ip_bhs_mbuf);
336 	if (received_digest != valid_digest) {
337 		ICL_WARN("header digest check failed; got 0x%x, "
338 		    "should be 0x%x", received_digest, valid_digest);
339 		return (-1);
340 	}
341 
342 	return (0);
343 }
344 
345 /*
346  * Return the number of bytes that should be waiting in the receive socket
347  * before icl_pdu_receive_data_segment() gets called.
348  */
349 static size_t
350 icl_pdu_data_segment_receive_len(const struct icl_pdu *request)
351 {
352 	size_t len;
353 
354 	len = icl_pdu_data_segment_length(request);
355 	if (len == 0)
356 		return (0);
357 
358 	/*
359 	 * Account for the parts of data segment already read from
360 	 * the socket buffer.
361 	 */
362 	KASSERT(len > request->ip_data_len, ("len <= request->ip_data_len"));
363 	len -= request->ip_data_len;
364 
365 	/*
366 	 * Don't always wait for the full data segment to be delivered
367 	 * to the socket; this might badly affect performance due to
368 	 * TCP window scaling.
369 	 */
370 	if (len > partial_receive_len) {
371 #if 0
372 		ICL_DEBUG("need %zd bytes of data, limiting to %zd",
373 		    len, partial_receive_len));
374 #endif
375 		len = partial_receive_len;
376 
377 		return (len);
378 	}
379 
380 	/*
381 	 * Account for padding.  Note that due to the way code is written,
382 	 * the icl_pdu_receive_data_segment() must always receive padding
383 	 * along with the last part of data segment, because it would be
384 	 * impossible to tell whether we've already received the full data
385 	 * segment including padding, or without it.
386 	 */
387 	if ((len % 4) != 0)
388 		len += 4 - (len % 4);
389 
390 #if 0
391 	ICL_DEBUG("need %zd bytes of data", len));
392 #endif
393 
394 	return (len);
395 }
396 
397 static int
398 icl_pdu_receive_data_segment(struct icl_pdu *request,
399     size_t *availablep, bool *more_neededp)
400 {
401 	struct icl_conn *ic;
402 	size_t len, padding = 0;
403 	struct mbuf *m;
404 
405 	ic = request->ip_conn;
406 
407 	*more_neededp = false;
408 	ic->ic_receive_len = 0;
409 
410 	len = icl_pdu_data_segment_length(request);
411 	if (len == 0)
412 		return (0);
413 
414 	if ((len % 4) != 0)
415 		padding = 4 - (len % 4);
416 
417 	/*
418 	 * Account for already received parts of data segment.
419 	 */
420 	KASSERT(len > request->ip_data_len, ("len <= request->ip_data_len"));
421 	len -= request->ip_data_len;
422 
423 	if (len + padding > *availablep) {
424 		/*
425 		 * Not enough data in the socket buffer.  Receive as much
426 		 * as we can.  Don't receive padding, since, obviously, it's
427 		 * not the end of data segment yet.
428 		 */
429 #if 0
430 		ICL_DEBUG("limited from %zd to %zd",
431 		    len + padding, *availablep - padding));
432 #endif
433 		len = *availablep - padding;
434 		*more_neededp = true;
435 		padding = 0;
436 	}
437 
438 	/*
439 	 * Must not try to receive padding without at least one byte
440 	 * of actual data segment.
441 	 */
442 	if (len > 0) {
443 		m = icl_conn_receive(request->ip_conn, len + padding);
444 		if (m == NULL) {
445 			ICL_DEBUG("failed to receive data segment");
446 			return (-1);
447 		}
448 
449 		if (request->ip_data_mbuf == NULL)
450 			request->ip_data_mbuf = m;
451 		else
452 			m_cat(request->ip_data_mbuf, m);
453 
454 		request->ip_data_len += len;
455 		*availablep -= len + padding;
456 	} else
457 		ICL_DEBUG("len 0");
458 
459 	if (*more_neededp)
460 		ic->ic_receive_len =
461 		    icl_pdu_data_segment_receive_len(request);
462 
463 	return (0);
464 }
465 
466 static int
467 icl_pdu_check_data_digest(struct icl_pdu *request, size_t *availablep)
468 {
469 	struct mbuf *m;
470 	uint32_t received_digest, valid_digest;
471 
472 	if (request->ip_conn->ic_data_crc32c == false)
473 		return (0);
474 
475 	if (request->ip_data_len == 0)
476 		return (0);
477 
478 	m = icl_conn_receive(request->ip_conn, ISCSI_DATA_DIGEST_SIZE);
479 	if (m == NULL) {
480 		ICL_DEBUG("failed to receive data digest");
481 		return (-1);
482 	}
483 
484 	CTASSERT(sizeof(received_digest) == ISCSI_DATA_DIGEST_SIZE);
485 	memcpy(&received_digest, mtod(m, void *), ISCSI_DATA_DIGEST_SIZE);
486 	m_freem(m);
487 
488 	*availablep -= ISCSI_DATA_DIGEST_SIZE;
489 
490 	/*
491 	 * Note that ip_data_mbuf also contains padding; since digest
492 	 * calculation is supposed to include that, we iterate over
493 	 * the entire ip_data_mbuf chain, not just ip_data_len bytes of it.
494 	 */
495 	valid_digest = icl_mbuf_to_crc32c(request->ip_data_mbuf);
496 	if (received_digest != valid_digest) {
497 		ICL_WARN("data digest check failed; got 0x%x, "
498 		    "should be 0x%x", received_digest, valid_digest);
499 		return (-1);
500 	}
501 
502 	return (0);
503 }
504 
505 /*
506  * Somewhat contrary to the name, this attempts to receive only one
507  * "part" of PDU at a time; call it repeatedly until it returns non-NULL.
508  */
509 static struct icl_pdu *
510 icl_conn_receive_pdu(struct icl_conn *ic, size_t *availablep)
511 {
512 	struct icl_pdu *request;
513 	struct socket *so;
514 	size_t len;
515 	int error;
516 	bool more_needed;
517 
518 	so = ic->ic_socket;
519 
520 	if (ic->ic_receive_state == ICL_CONN_STATE_BHS) {
521 		KASSERT(ic->ic_receive_pdu == NULL,
522 		    ("ic->ic_receive_pdu != NULL"));
523 		request = icl_pdu_new(ic, M_NOWAIT);
524 		if (request == NULL) {
525 			ICL_DEBUG("failed to allocate PDU; "
526 			    "dropping connection");
527 			icl_conn_fail(ic);
528 			return (NULL);
529 		}
530 		ic->ic_receive_pdu = request;
531 	} else {
532 		KASSERT(ic->ic_receive_pdu != NULL,
533 		    ("ic->ic_receive_pdu == NULL"));
534 		request = ic->ic_receive_pdu;
535 	}
536 
537 	if (*availablep < ic->ic_receive_len) {
538 #if 0
539 		ICL_DEBUG("not enough data; need %zd, "
540 		    "have %zd", ic->ic_receive_len, *availablep);
541 #endif
542 		return (NULL);
543 	}
544 
545 	switch (ic->ic_receive_state) {
546 	case ICL_CONN_STATE_BHS:
547 		//ICL_DEBUG("receiving BHS");
548 		error = icl_pdu_receive_bhs(request, availablep);
549 		if (error != 0) {
550 			ICL_DEBUG("failed to receive BHS; "
551 			    "dropping connection");
552 			break;
553 		}
554 
555 		/*
556 		 * We don't enforce any limit for AHS length;
557 		 * its length is stored in 8 bit field.
558 		 */
559 
560 		len = icl_pdu_data_segment_length(request);
561 		if (len > ic->ic_max_data_segment_length) {
562 			ICL_WARN("received data segment "
563 			    "length %zd is larger than negotiated "
564 			    "MaxDataSegmentLength %zd; "
565 			    "dropping connection",
566 			    len, ic->ic_max_data_segment_length);
567 			error = EINVAL;
568 			break;
569 		}
570 
571 		ic->ic_receive_state = ICL_CONN_STATE_AHS;
572 		ic->ic_receive_len = icl_pdu_ahs_length(request);
573 		break;
574 
575 	case ICL_CONN_STATE_AHS:
576 		//ICL_DEBUG("receiving AHS");
577 		error = icl_pdu_receive_ahs(request, availablep);
578 		if (error != 0) {
579 			ICL_DEBUG("failed to receive AHS; "
580 			    "dropping connection");
581 			break;
582 		}
583 		ic->ic_receive_state = ICL_CONN_STATE_HEADER_DIGEST;
584 		if (ic->ic_header_crc32c == false)
585 			ic->ic_receive_len = 0;
586 		else
587 			ic->ic_receive_len = ISCSI_HEADER_DIGEST_SIZE;
588 		break;
589 
590 	case ICL_CONN_STATE_HEADER_DIGEST:
591 		//ICL_DEBUG("receiving header digest");
592 		error = icl_pdu_check_header_digest(request, availablep);
593 		if (error != 0) {
594 			ICL_DEBUG("header digest failed; "
595 			    "dropping connection");
596 			break;
597 		}
598 
599 		ic->ic_receive_state = ICL_CONN_STATE_DATA;
600 		ic->ic_receive_len =
601 		    icl_pdu_data_segment_receive_len(request);
602 		break;
603 
604 	case ICL_CONN_STATE_DATA:
605 		//ICL_DEBUG("receiving data segment");
606 		error = icl_pdu_receive_data_segment(request, availablep,
607 		    &more_needed);
608 		if (error != 0) {
609 			ICL_DEBUG("failed to receive data segment;"
610 			    "dropping connection");
611 			break;
612 		}
613 
614 		if (more_needed)
615 			break;
616 
617 		ic->ic_receive_state = ICL_CONN_STATE_DATA_DIGEST;
618 		if (ic->ic_data_crc32c == false)
619 			ic->ic_receive_len = 0;
620 		else
621 			ic->ic_receive_len = ISCSI_DATA_DIGEST_SIZE;
622 		break;
623 
624 	case ICL_CONN_STATE_DATA_DIGEST:
625 		//ICL_DEBUG("receiving data digest");
626 		error = icl_pdu_check_data_digest(request, availablep);
627 		if (error != 0) {
628 			ICL_DEBUG("data digest failed; "
629 			    "dropping connection");
630 			break;
631 		}
632 
633 		/*
634 		 * We've received complete PDU; reset the receive state machine
635 		 * and return the PDU.
636 		 */
637 		ic->ic_receive_state = ICL_CONN_STATE_BHS;
638 		ic->ic_receive_len = sizeof(struct iscsi_bhs);
639 		ic->ic_receive_pdu = NULL;
640 		return (request);
641 
642 	default:
643 		panic("invalid ic_receive_state %d\n", ic->ic_receive_state);
644 	}
645 
646 	if (error != 0) {
647 		icl_pdu_free(request);
648 		icl_conn_fail(ic);
649 	}
650 
651 	return (NULL);
652 }
653 
654 static void
655 icl_conn_receive_pdus(struct icl_conn *ic, size_t available)
656 {
657 	struct icl_pdu *response;
658 	struct socket *so;
659 
660 	so = ic->ic_socket;
661 
662 	/*
663 	 * This can never happen; we're careful to only mess with ic->ic_socket
664 	 * pointer when the send/receive threads are not running.
665 	 */
666 	KASSERT(so != NULL, ("NULL socket"));
667 
668 	for (;;) {
669 		if (ic->ic_disconnecting)
670 			return;
671 
672 		if (so->so_error != 0) {
673 			ICL_DEBUG("connection error %d; "
674 			    "dropping connection", so->so_error);
675 			icl_conn_fail(ic);
676 			return;
677 		}
678 
679 		/*
680 		 * Loop until we have a complete PDU or there is not enough
681 		 * data in the socket buffer.
682 		 */
683 		if (available < ic->ic_receive_len) {
684 #if 0
685 			ICL_DEBUG("not enough data; have %zd, "
686 			    "need %zd", available,
687 			    ic->ic_receive_len);
688 #endif
689 			return;
690 		}
691 
692 		response = icl_conn_receive_pdu(ic, &available);
693 		if (response == NULL)
694 			continue;
695 
696 		if (response->ip_ahs_len > 0) {
697 			ICL_WARN("received PDU with unsupported "
698 			    "AHS; opcode 0x%x; dropping connection",
699 			    response->ip_bhs->bhs_opcode);
700 			icl_pdu_free(response);
701 			icl_conn_fail(ic);
702 			return;
703 		}
704 
705 		(ic->ic_receive)(response);
706 	}
707 }
708 
709 static void
710 icl_receive_thread(void *arg)
711 {
712 	struct icl_conn *ic;
713 	size_t available;
714 	struct socket *so;
715 
716 	ic = arg;
717 	so = ic->ic_socket;
718 
719 	ICL_CONN_LOCK(ic);
720 	ic->ic_receive_running = true;
721 	ICL_CONN_UNLOCK(ic);
722 
723 	for (;;) {
724 		if (ic->ic_disconnecting) {
725 			//ICL_DEBUG("terminating");
726 			break;
727 		}
728 
729 		SOCKBUF_LOCK(&so->so_rcv);
730 		available = so->so_rcv.sb_cc;
731 		if (available < ic->ic_receive_len) {
732 			so->so_rcv.sb_lowat = ic->ic_receive_len;
733 			cv_wait(&ic->ic_receive_cv, &so->so_rcv.sb_mtx);
734 		}
735 		SOCKBUF_UNLOCK(&so->so_rcv);
736 
737 		icl_conn_receive_pdus(ic, available);
738 	}
739 
740 	ICL_CONN_LOCK(ic);
741 	ic->ic_receive_running = false;
742 	ICL_CONN_UNLOCK(ic);
743 	kthread_exit();
744 }
745 
746 static int
747 icl_soupcall_receive(struct socket *so, void *arg, int waitflag)
748 {
749 	struct icl_conn *ic;
750 
751 	ic = arg;
752 	cv_signal(&ic->ic_receive_cv);
753 	return (SU_OK);
754 }
755 
756 static int
757 icl_pdu_send(struct icl_pdu *request)
758 {
759 	size_t padding, pdu_len;
760 	uint32_t digest, zero = 0;
761 	int error, ok;
762 	struct socket *so;
763 	struct icl_conn *ic;
764 
765 	ic = request->ip_conn;
766 	so = request->ip_conn->ic_socket;
767 
768 	ICL_CONN_LOCK_ASSERT(ic);
769 
770 	icl_pdu_set_data_segment_length(request, request->ip_data_len);
771 
772 	pdu_len = icl_pdu_size(request);
773 
774 	if (ic->ic_header_crc32c) {
775 		digest = icl_mbuf_to_crc32c(request->ip_bhs_mbuf);
776 		ok = m_append(request->ip_bhs_mbuf, sizeof(digest),
777 		    (void *)&digest);
778 		if (ok != 1) {
779 			ICL_WARN("failed to append header digest");
780 			return (1);
781 		}
782 	}
783 
784 	if (request->ip_data_len != 0) {
785 		padding = icl_pdu_padding(request);
786 		if (padding > 0) {
787 			ok = m_append(request->ip_data_mbuf, padding,
788 			    (void *)&zero);
789 			if (ok != 1) {
790 				ICL_WARN("failed to append padding");
791 				return (1);
792 			}
793 		}
794 
795 		if (ic->ic_data_crc32c) {
796 			digest = icl_mbuf_to_crc32c(request->ip_data_mbuf);
797 
798 			ok = m_append(request->ip_data_mbuf, sizeof(digest),
799 			    (void *)&digest);
800 			if (ok != 1) {
801 				ICL_WARN("failed to append header digest");
802 				return (1);
803 			}
804 		}
805 
806 		m_cat(request->ip_bhs_mbuf, request->ip_data_mbuf);
807 		request->ip_data_mbuf = NULL;
808 	}
809 
810 	request->ip_bhs_mbuf->m_pkthdr.len = pdu_len;
811 
812 	error = sosend(so, NULL, NULL, request->ip_bhs_mbuf,
813 	    NULL, MSG_DONTWAIT, curthread);
814 	request->ip_bhs_mbuf = NULL; /* Sosend consumes the mbuf. */
815 	if (error != 0) {
816 		ICL_DEBUG("sosend error %d", error);
817 		return (error);
818 	}
819 
820 	return (0);
821 }
822 
823 static void
824 icl_conn_send_pdus(struct icl_conn *ic)
825 {
826 	struct icl_pdu *request;
827 	struct socket *so;
828 	size_t available, size;
829 	int error;
830 
831 	ICL_CONN_LOCK_ASSERT(ic);
832 
833 	so = ic->ic_socket;
834 
835 	SOCKBUF_LOCK(&so->so_snd);
836 	available = sbspace(&so->so_snd);
837 	SOCKBUF_UNLOCK(&so->so_snd);
838 
839 	while (!TAILQ_EMPTY(&ic->ic_to_send)) {
840 		if (ic->ic_disconnecting)
841 			return;
842 
843 		request = TAILQ_FIRST(&ic->ic_to_send);
844 		size = icl_pdu_size(request);
845 		if (available < size) {
846 			/*
847 			 * Set the low watermark on the socket,
848 			 * to avoid waking up until there is enough
849 			 * space.
850 			 */
851 			SOCKBUF_LOCK(&so->so_snd);
852 			so->so_snd.sb_lowat = size;
853 			SOCKBUF_UNLOCK(&so->so_snd);
854 #if 1
855 			ICL_DEBUG("no space to send; "
856 			    "have %zd, need %zd",
857 			    available, size);
858 #endif
859 			return;
860 		}
861 		available -= size;
862 		TAILQ_REMOVE(&ic->ic_to_send, request, ip_next);
863 		error = icl_pdu_send(request);
864 		if (error != 0) {
865 			ICL_DEBUG("failed to send PDU; "
866 			    "dropping connection");
867 			icl_conn_fail(ic);
868 			return;
869 		}
870 		icl_pdu_free(request);
871 	}
872 }
873 
874 static void
875 icl_send_thread(void *arg)
876 {
877 	struct icl_conn *ic;
878 
879 	ic = arg;
880 
881 	ICL_CONN_LOCK(ic);
882 	ic->ic_send_running = true;
883 
884 	for (;;) {
885 		if (ic->ic_disconnecting) {
886 			//ICL_DEBUG("terminating");
887 			break;
888 		}
889 		icl_conn_send_pdus(ic);
890 		cv_wait(&ic->ic_send_cv, &ic->ic_lock);
891 	}
892 
893 	ic->ic_send_running = false;
894 	ICL_CONN_UNLOCK(ic);
895 	kthread_exit();
896 }
897 
898 static int
899 icl_soupcall_send(struct socket *so, void *arg, int waitflag)
900 {
901 	struct icl_conn *ic;
902 
903 	ic = arg;
904 	cv_signal(&ic->ic_send_cv);
905 	return (SU_OK);
906 }
907 
908 int
909 icl_pdu_append_data(struct icl_pdu *request, const void *addr, size_t len, int flags)
910 {
911 	struct mbuf *mb, *newmb;
912 	size_t copylen, off = 0;
913 
914 	KASSERT(len > 0, ("len == 0"));
915 
916 	newmb = m_getm2(NULL, len, flags, MT_DATA, M_PKTHDR);
917 	if (newmb == NULL) {
918 		ICL_WARN("failed to allocate mbuf for %zd bytes", len);
919 		return (ENOMEM);
920 	}
921 
922 	for (mb = newmb; mb != NULL; mb = mb->m_next) {
923 		copylen = min(M_TRAILINGSPACE(mb), len - off);
924 		memcpy(mtod(mb, char *), (const char *)addr + off, copylen);
925 		mb->m_len = copylen;
926 		off += copylen;
927 	}
928 	KASSERT(off == len, ("%s: off != len", __func__));
929 
930 	if (request->ip_data_mbuf == NULL) {
931 		request->ip_data_mbuf = newmb;
932 		request->ip_data_len = len;
933 	} else {
934 		m_cat(request->ip_data_mbuf, newmb);
935 		request->ip_data_len += len;
936 	}
937 
938 	return (0);
939 }
940 
941 void
942 icl_pdu_get_data(struct icl_pdu *ip, size_t off, void *addr, size_t len)
943 {
944 
945 	m_copydata(ip->ip_data_mbuf, off, len, addr);
946 }
947 
948 void
949 icl_pdu_queue(struct icl_pdu *ip)
950 {
951 	struct icl_conn *ic;
952 
953 	ic = ip->ip_conn;
954 
955 	ICL_CONN_LOCK(ic);
956 	if (ic->ic_disconnecting || ic->ic_socket == NULL) {
957 		ICL_DEBUG("icl_pdu_queue on closed connection");
958 		ICL_CONN_UNLOCK(ic);
959 		icl_pdu_free(ip);
960 		return;
961 	}
962 	TAILQ_INSERT_TAIL(&ic->ic_to_send, ip, ip_next);
963 	ICL_CONN_UNLOCK(ic);
964 	cv_signal(&ic->ic_send_cv);
965 }
966 
967 struct icl_conn *
968 icl_conn_new(void)
969 {
970 	struct icl_conn *ic;
971 
972 	refcount_acquire(&icl_ncons);
973 
974 	ic = uma_zalloc(icl_conn_zone, M_WAITOK | M_ZERO);
975 
976 	TAILQ_INIT(&ic->ic_to_send);
977 	mtx_init(&ic->ic_lock, "icl_lock", NULL, MTX_DEF);
978 	cv_init(&ic->ic_send_cv, "icl_tx");
979 	cv_init(&ic->ic_receive_cv, "icl_rx");
980 	refcount_init(&ic->ic_outstanding_pdus, 0);
981 	ic->ic_max_data_segment_length = ICL_MAX_DATA_SEGMENT_LENGTH;
982 
983 	return (ic);
984 }
985 
986 void
987 icl_conn_free(struct icl_conn *ic)
988 {
989 
990 	mtx_destroy(&ic->ic_lock);
991 	cv_destroy(&ic->ic_send_cv);
992 	cv_destroy(&ic->ic_receive_cv);
993 	uma_zfree(icl_conn_zone, ic);
994 	refcount_release(&icl_ncons);
995 }
996 
997 static int
998 icl_conn_start(struct icl_conn *ic)
999 {
1000 	size_t bufsize;
1001 	struct sockopt opt;
1002 	int error, one = 1;
1003 
1004 	ICL_CONN_LOCK(ic);
1005 
1006 	/*
1007 	 * XXX: Ugly hack.
1008 	 */
1009 	if (ic->ic_socket == NULL) {
1010 		ICL_CONN_UNLOCK(ic);
1011 		return (EINVAL);
1012 	}
1013 
1014 	ic->ic_receive_state = ICL_CONN_STATE_BHS;
1015 	ic->ic_receive_len = sizeof(struct iscsi_bhs);
1016 	ic->ic_disconnecting = false;
1017 
1018 	ICL_CONN_UNLOCK(ic);
1019 
1020 	/*
1021 	 * Use max available sockbuf size for sending.  Do it manually
1022 	 * instead of sbreserve(9) to work around resource limits.
1023 	 *
1024 	 * XXX: This kind of sucks.  On one hand, we don't currently support
1025 	 *	sending a part of data segment; we always do it in one piece,
1026 	 *	so we have to make sure it can fit in the socket buffer.
1027 	 *	Once I've implemented partial send, we'll get rid of this
1028 	 *	and use autoscaling.
1029 	 */
1030         bufsize = (sizeof(struct iscsi_bhs) +
1031             ic->ic_max_data_segment_length) * 8;
1032 	error = soreserve(ic->ic_socket, bufsize, bufsize);
1033 	if (error != 0) {
1034 		ICL_WARN("soreserve failed with error %d", error);
1035 		icl_conn_close(ic);
1036 		return (error);
1037 	}
1038 
1039 	/*
1040 	 * Disable Nagle.
1041 	 */
1042 	bzero(&opt, sizeof(opt));
1043 	opt.sopt_dir = SOPT_SET;
1044 	opt.sopt_level = IPPROTO_TCP;
1045 	opt.sopt_name = TCP_NODELAY;
1046 	opt.sopt_val = &one;
1047 	opt.sopt_valsize = sizeof(one);
1048 	error = sosetopt(ic->ic_socket, &opt);
1049 	if (error != 0) {
1050 		ICL_WARN("disabling TCP_NODELAY failed with error %d", error);
1051 		icl_conn_close(ic);
1052 		return (error);
1053 	}
1054 
1055 	/*
1056 	 * Start threads.
1057 	 */
1058 	error = kthread_add(icl_send_thread, ic, NULL, NULL, 0, 0, "icltx");
1059 	if (error != 0) {
1060 		ICL_WARN("kthread_add(9) failed with error %d", error);
1061 		icl_conn_close(ic);
1062 		return (error);
1063 	}
1064 
1065 	error = kthread_add(icl_receive_thread, ic, NULL, NULL, 0, 0, "iclrx");
1066 	if (error != 0) {
1067 		ICL_WARN("kthread_add(9) failed with error %d", error);
1068 		icl_conn_close(ic);
1069 		return (error);
1070 	}
1071 
1072 	/*
1073 	 * Register socket upcall, to get notified about incoming PDUs
1074 	 * and free space to send outgoing ones.
1075 	 */
1076 	SOCKBUF_LOCK(&ic->ic_socket->so_snd);
1077 	soupcall_set(ic->ic_socket, SO_SND, icl_soupcall_send, ic);
1078 	SOCKBUF_UNLOCK(&ic->ic_socket->so_snd);
1079 	SOCKBUF_LOCK(&ic->ic_socket->so_rcv);
1080 	soupcall_set(ic->ic_socket, SO_RCV, icl_soupcall_receive, ic);
1081 	SOCKBUF_UNLOCK(&ic->ic_socket->so_rcv);
1082 
1083 	return (0);
1084 }
1085 
1086 int
1087 icl_conn_handoff(struct icl_conn *ic, int fd)
1088 {
1089 	struct file *fp;
1090 	struct socket *so;
1091 	cap_rights_t rights;
1092 	int error;
1093 
1094 	/*
1095 	 * Steal the socket from userland.
1096 	 */
1097 	error = fget(curthread, fd,
1098 	    cap_rights_init(&rights, CAP_SOCK_CLIENT), &fp);
1099 	if (error != 0)
1100 		return (error);
1101 	if (fp->f_type != DTYPE_SOCKET) {
1102 		fdrop(fp, curthread);
1103 		return (EINVAL);
1104 	}
1105 	so = fp->f_data;
1106 	if (so->so_type != SOCK_STREAM) {
1107 		fdrop(fp, curthread);
1108 		return (EINVAL);
1109 	}
1110 
1111 	ICL_CONN_LOCK(ic);
1112 
1113 	if (ic->ic_socket != NULL) {
1114 		ICL_CONN_UNLOCK(ic);
1115 		fdrop(fp, curthread);
1116 		return (EBUSY);
1117 	}
1118 
1119 	ic->ic_socket = fp->f_data;
1120 	fp->f_ops = &badfileops;
1121 	fp->f_data = NULL;
1122 	fdrop(fp, curthread);
1123 	ICL_CONN_UNLOCK(ic);
1124 
1125 	error = icl_conn_start(ic);
1126 
1127 	return (error);
1128 }
1129 
1130 void
1131 icl_conn_shutdown(struct icl_conn *ic)
1132 {
1133 
1134 	ICL_CONN_LOCK(ic);
1135 	if (ic->ic_socket == NULL) {
1136 		ICL_CONN_UNLOCK(ic);
1137 		return;
1138 	}
1139 	ICL_CONN_UNLOCK(ic);
1140 
1141 	soshutdown(ic->ic_socket, SHUT_RDWR);
1142 }
1143 
1144 void
1145 icl_conn_close(struct icl_conn *ic)
1146 {
1147 	struct icl_pdu *pdu;
1148 
1149 	ICL_CONN_LOCK(ic);
1150 	if (ic->ic_socket == NULL) {
1151 		ICL_CONN_UNLOCK(ic);
1152 		return;
1153 	}
1154 
1155 	ic->ic_disconnecting = true;
1156 
1157 	/*
1158 	 * Wake up the threads, so they can properly terminate.
1159 	 */
1160 	cv_signal(&ic->ic_receive_cv);
1161 	cv_signal(&ic->ic_send_cv);
1162 	while (ic->ic_receive_running || ic->ic_send_running) {
1163 		//ICL_DEBUG("waiting for send/receive threads to terminate");
1164 		ICL_CONN_UNLOCK(ic);
1165 		cv_signal(&ic->ic_receive_cv);
1166 		cv_signal(&ic->ic_send_cv);
1167 		pause("icl_close", 1 * hz);
1168 		ICL_CONN_LOCK(ic);
1169 	}
1170 	//ICL_DEBUG("send/receive threads terminated");
1171 
1172 	soclose(ic->ic_socket);
1173 	ic->ic_socket = NULL;
1174 
1175 	if (ic->ic_receive_pdu != NULL) {
1176 		//ICL_DEBUG("freeing partially received PDU");
1177 		icl_pdu_free(ic->ic_receive_pdu);
1178 		ic->ic_receive_pdu = NULL;
1179 	}
1180 
1181 	/*
1182 	 * Remove any outstanding PDUs from the send queue.
1183 	 */
1184 	while (!TAILQ_EMPTY(&ic->ic_to_send)) {
1185 		pdu = TAILQ_FIRST(&ic->ic_to_send);
1186 		TAILQ_REMOVE(&ic->ic_to_send, pdu, ip_next);
1187 		icl_pdu_free(pdu);
1188 	}
1189 
1190 	KASSERT(TAILQ_EMPTY(&ic->ic_to_send),
1191 	    ("destroying session with non-empty send queue"));
1192 	/*
1193 	 * XXX
1194 	 */
1195 #if 0
1196 	KASSERT(ic->ic_outstanding_pdus == 0,
1197 	    ("destroying session with %d outstanding PDUs",
1198 	     ic->ic_outstanding_pdus));
1199 #endif
1200 	ICL_CONN_UNLOCK(ic);
1201 }
1202 
1203 bool
1204 icl_conn_connected(struct icl_conn *ic)
1205 {
1206 
1207 	ICL_CONN_LOCK(ic);
1208 	if (ic->ic_socket == NULL) {
1209 		ICL_CONN_UNLOCK(ic);
1210 		return (false);
1211 	}
1212 	if (ic->ic_socket->so_error != 0) {
1213 		ICL_CONN_UNLOCK(ic);
1214 		return (false);
1215 	}
1216 	ICL_CONN_UNLOCK(ic);
1217 	return (true);
1218 }
1219 
#ifdef ICL_KERNEL_PROXY
/*
 * Variant of icl_conn_handoff() that takes a kernel socket directly
 * instead of a file descriptor.  Returns EINVAL if "so" is not a
 * SOCK_STREAM socket, EBUSY if the connection already has a socket, or
 * the result of icl_conn_start().
 */
int
icl_conn_handoff_sock(struct icl_conn *ic, struct socket *so)
{
	bool busy;

	if (so->so_type != SOCK_STREAM)
		return (EINVAL);

	ICL_CONN_LOCK(ic);
	busy = (ic->ic_socket != NULL);
	if (!busy)
		ic->ic_socket = so;
	ICL_CONN_UNLOCK(ic);

	if (busy)
		return (EBUSY);

	return (icl_conn_start(ic));
}
#endif /* ICL_KERNEL_PROXY */
1242 
1243 static int
1244 icl_unload(void)
1245 {
1246 
1247 	if (icl_ncons != 0)
1248 		return (EBUSY);
1249 
1250 	uma_zdestroy(icl_conn_zone);
1251 	uma_zdestroy(icl_pdu_zone);
1252 
1253 	return (0);
1254 }
1255 
1256 static void
1257 icl_load(void)
1258 {
1259 
1260 	icl_conn_zone = uma_zcreate("icl_conn",
1261 	    sizeof(struct icl_conn), NULL, NULL, NULL, NULL,
1262 	    UMA_ALIGN_PTR, 0);
1263 	icl_pdu_zone = uma_zcreate("icl_pdu",
1264 	    sizeof(struct icl_pdu), NULL, NULL, NULL, NULL,
1265 	    UMA_ALIGN_PTR, 0);
1266 
1267 	refcount_init(&icl_ncons, 0);
1268 }
1269 
1270 static int
1271 icl_modevent(module_t mod, int what, void *arg)
1272 {
1273 
1274 	switch (what) {
1275 	case MOD_LOAD:
1276 		icl_load();
1277 		return (0);
1278 	case MOD_UNLOAD:
1279 		return (icl_unload());
1280 	default:
1281 		return (EINVAL);
1282 	}
1283 }
1284 
1285 moduledata_t icl_data = {
1286 	"icl",
1287 	icl_modevent,
1288 	0
1289 };
1290 
1291 DECLARE_MODULE(icl, icl_data, SI_SUB_DRIVERS, SI_ORDER_FIRST);
1292 MODULE_VERSION(icl, 1);
1293