xref: /illumos-gate/usr/src/uts/common/io/idm/idm_so.c (revision 8a2b682e57a046b828f37bcde1776f131ef4629f)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 /*
26  * Copyright (c) 2013 by Delphix. All rights reserved.
27  * Copyright 2015 Nexenta Systems, Inc.  All rights reserved.
28  * Copyright (c) 2017, Joyent, Inc.  All rights reserved.
29  */
30 
31 #include <sys/conf.h>
32 #include <sys/stat.h>
33 #include <sys/file.h>
34 #include <sys/ddi.h>
35 #include <sys/sunddi.h>
36 #include <sys/modctl.h>
37 #include <sys/priv.h>
38 #include <sys/cpuvar.h>
39 #include <sys/socket.h>
40 #include <sys/strsubr.h>
41 #include <sys/sysmacros.h>
42 #include <sys/sdt.h>
43 #include <netinet/tcp.h>
44 #include <inet/tcp.h>
45 #include <sys/socketvar.h>
46 #include <sys/pathname.h>
47 #include <sys/fs/snode.h>
48 #include <sys/fs/dv_node.h>
49 #include <sys/vnode.h>
50 #include <netinet/in.h>
51 #include <net/if.h>
52 #include <sys/sockio.h>
53 #include <sys/ksocket.h>
54 #include <sys/filio.h>		/* FIONBIO */
55 #include <sys/iscsi_protocol.h>
56 #include <sys/idm/idm.h>
57 #include <sys/idm/idm_so.h>
58 #include <sys/idm/idm_text.h>
59 
/*
 * Number of seconds idm_so_ini_conn_connect() sleeps between attempts
 * while a non-blocking connect reports EINPROGRESS or EALREADY.
 */
#define	IN_PROGRESS_DELAY	1

/*
 * in6addr_any is currently all zeroes, but use the macro in case this
 * ever changes.
 */
static const struct in6_addr in6addr_any = IN6ADDR_ANY_INIT;
67 
/*
 * PDU callbacks.  idm_sorx_addl_pdu_cb is installed by idm_sorecvhdr()
 * when it has to allocate a larger header for a received PDU.
 */
static void idm_sorx_cache_pdu_cb(idm_pdu_t *pdu, idm_status_t status);
static void idm_sorx_addl_pdu_cb(idm_pdu_t *pdu, idm_status_t status);
static void idm_sotx_cache_pdu_cb(idm_pdu_t *pdu, idm_status_t status);

/* Connection helpers shared by the initiator and target entry points */
static idm_status_t idm_so_conn_create_common(idm_conn_t *ic, ksocket_t new_so);
static void idm_so_conn_destroy_common(idm_conn_t *ic);
static void idm_so_conn_connect_common(idm_conn_t *ic);

/* Socket option setup and PDU transmit */
static void idm_set_ini_preconnect_options(idm_so_conn_t *sc,
    boolean_t boot_conn);
static void idm_set_postconnect_options(ksocket_t so);
static idm_status_t idm_i_so_tx(idm_pdu_t *pdu);

/* Data receive / R2T data transmit helpers */
static idm_status_t idm_sorecvdata(idm_conn_t *ic, idm_pdu_t *pdu);
static void idm_so_send_rtt_data(idm_conn_t *ic, idm_task_t *idt,
    idm_buf_t *idb, uint32_t offset, uint32_t length);
static void idm_so_send_rtt_data_done(idm_task_t *idt, idm_buf_t *idb);
static idm_status_t idm_so_send_buf_region(idm_task_t *idt,
    idm_buf_t *idb, uint32_t buf_region_offset, uint32_t buf_region_length);

static uint32_t idm_fill_iov(idm_pdu_t *pdu, idm_buf_t *idb,
    uint32_t ro, uint32_t dlength);

static idm_status_t idm_so_handle_digest(idm_conn_t *it,
    nvpair_t *digest_choice, const idm_kv_xlate_t *ikvx);

/* Switch a socket between non-blocking and blocking mode */
static void idm_so_socket_set_nonblock(struct sonode *node);
static void idm_so_socket_set_block(struct sonode *node);

/*
 * Transport ops prototypes
 */
static void idm_so_tx(idm_conn_t *ic, idm_pdu_t *pdu);
static idm_status_t idm_so_buf_tx_to_ini(idm_task_t *idt, idm_buf_t *idb);
static idm_status_t idm_so_buf_rx_from_ini(idm_task_t *idt, idm_buf_t *idb);
static void idm_so_rx_datain(idm_conn_t *ic, idm_pdu_t *pdu);
static void idm_so_rx_rtt(idm_conn_t *ic, idm_pdu_t *pdu);
static void idm_so_rx_dataout(idm_conn_t *ic, idm_pdu_t *pdu);
static idm_status_t idm_so_free_task_rsrc(idm_task_t *idt);
static kv_status_t idm_so_negotiate_key_values(idm_conn_t *it,
    nvlist_t *request_nvl, nvlist_t *response_nvl, nvlist_t *negotiated_nvl);
static void idm_so_notice_key_values(idm_conn_t *it,
    nvlist_t *negotiated_nvl);
static kv_status_t idm_so_declare_key_values(idm_conn_t *it,
    nvlist_t *config_nvl, nvlist_t *outgoing_nvl);
static boolean_t idm_so_conn_is_capable(idm_conn_req_t *ic,
    idm_transport_caps_t *caps);
static idm_status_t idm_so_buf_alloc(idm_buf_t *idb, uint64_t buflen);
static void idm_so_buf_free(idm_buf_t *idb);
static idm_status_t idm_so_buf_setup(idm_buf_t *idb);
static void idm_so_buf_teardown(idm_buf_t *idb);
static idm_status_t idm_so_tgt_svc_create(idm_svc_req_t *sr, idm_svc_t *is);
static void idm_so_tgt_svc_destroy(idm_svc_t *is);
static idm_status_t idm_so_tgt_svc_online(idm_svc_t *is);
static void idm_so_tgt_svc_offline(idm_svc_t *is);
static void idm_so_tgt_conn_destroy(idm_conn_t *ic);
static idm_status_t idm_so_tgt_conn_connect(idm_conn_t *ic);
static void idm_so_conn_disconnect(idm_conn_t *ic);
static idm_status_t idm_so_ini_conn_create(idm_conn_req_t *cr, idm_conn_t *ic);
static void idm_so_ini_conn_destroy(idm_conn_t *ic);
static idm_status_t idm_so_ini_conn_connect(idm_conn_t *ic);
129 
/*
 * IDM Native Sockets transport operations
 *
 * idm_so_init() publishes this table through it->it_ops.  The initializer
 * is positional, so the order of the entries must match the member order
 * of idm_transport_ops_t; the trailing comment on each line names the slot
 * being filled.  Slots initialized to NULL are operations this transport
 * does not supply.  The target and initiator disconnect slots both use
 * idm_so_conn_disconnect.
 */
static
idm_transport_ops_t idm_so_transport_ops = {
	idm_so_tx,			/* it_tx_pdu */
	idm_so_buf_tx_to_ini,		/* it_buf_tx_to_ini */
	idm_so_buf_rx_from_ini,		/* it_buf_rx_from_ini */
	idm_so_rx_datain,		/* it_rx_datain */
	idm_so_rx_rtt,			/* it_rx_rtt */
	idm_so_rx_dataout,		/* it_rx_dataout */
	NULL,				/* it_alloc_conn_rsrc */
	NULL,				/* it_free_conn_rsrc */
	NULL,				/* it_tgt_enable_datamover */
	NULL,				/* it_ini_enable_datamover */
	NULL,				/* it_conn_terminate */
	idm_so_free_task_rsrc,		/* it_free_task_rsrc */
	idm_so_negotiate_key_values,	/* it_negotiate_key_values */
	idm_so_notice_key_values,	/* it_notice_key_values */
	idm_so_conn_is_capable,		/* it_conn_is_capable */
	idm_so_buf_alloc,		/* it_buf_alloc */
	idm_so_buf_free,		/* it_buf_free */
	idm_so_buf_setup,		/* it_buf_setup */
	idm_so_buf_teardown,		/* it_buf_teardown */
	idm_so_tgt_svc_create,		/* it_tgt_svc_create */
	idm_so_tgt_svc_destroy,		/* it_tgt_svc_destroy */
	idm_so_tgt_svc_online,		/* it_tgt_svc_online */
	idm_so_tgt_svc_offline,		/* it_tgt_svc_offline */
	idm_so_tgt_conn_destroy,	/* it_tgt_conn_destroy */
	idm_so_tgt_conn_connect,	/* it_tgt_conn_connect */
	idm_so_conn_disconnect,		/* it_tgt_conn_disconnect */
	idm_so_ini_conn_create,		/* it_ini_conn_create */
	idm_so_ini_conn_destroy,	/* it_ini_conn_destroy */
	idm_so_ini_conn_connect,	/* it_ini_conn_connect */
	idm_so_conn_disconnect,		/* it_ini_conn_disconnect */
	idm_so_declare_key_values	/* it_declare_key_values */
};
167 
/* Initialized in idm_so_init() and destroyed in idm_so_fini() */
kmutex_t	idm_so_timed_socket_mutex;

/*
 * Values applied as SO_SNDBUF and SO_RCVBUF by
 * idm_set_postconnect_options().
 */
int32_t idm_so_sndbuf = IDM_SNDBUF_SIZE;
int32_t idm_so_rcvbuf = IDM_RCVBUF_SIZE;
172 
173 /*
174  * idm_so_init()
175  * Sockets transport initialization
176  */
177 void
178 idm_so_init(idm_transport_t *it)
179 {
180 	/* Cache for IDM Data and R2T Transmit PDU's */
181 	idm.idm_sotx_pdu_cache = kmem_cache_create("idm_tx_pdu_cache",
182 	    sizeof (idm_pdu_t) + sizeof (iscsi_hdr_t), 8,
183 	    &idm_sotx_pdu_constructor, NULL, NULL, NULL, NULL, KM_SLEEP);
184 
185 	/* Cache for IDM Receive PDU's */
186 	idm.idm_sorx_pdu_cache = kmem_cache_create("idm_rx_pdu_cache",
187 	    sizeof (idm_pdu_t) + IDM_SORX_CACHE_HDRLEN, 8,
188 	    &idm_sorx_pdu_constructor, NULL, NULL, NULL, NULL, KM_SLEEP);
189 
190 	/* 128k buffer cache */
191 	idm.idm_so_128k_buf_cache = kmem_cache_create("idm_128k_buf_cache",
192 	    IDM_SO_BUF_CACHE_UB, 8, NULL, NULL, NULL, NULL, NULL, KM_SLEEP);
193 
194 	/* Set the sockets transport ops */
195 	it->it_ops = &idm_so_transport_ops;
196 
197 	mutex_init(&idm_so_timed_socket_mutex, NULL, MUTEX_DEFAULT, NULL);
198 
199 }
200 
201 /*
202  * idm_so_fini()
203  * Sockets transport teardown
204  */
205 void
206 idm_so_fini(void)
207 {
208 	kmem_cache_destroy(idm.idm_so_128k_buf_cache);
209 	kmem_cache_destroy(idm.idm_sotx_pdu_cache);
210 	kmem_cache_destroy(idm.idm_sorx_pdu_cache);
211 	mutex_destroy(&idm_so_timed_socket_mutex);
212 }
213 
214 ksocket_t
215 idm_socreate(int domain, int type, int protocol)
216 {
217 	ksocket_t ks;
218 
219 	if (!ksocket_socket(&ks, domain, type, protocol, KSOCKET_NOSLEEP,
220 	    CRED())) {
221 		return (ks);
222 	} else {
223 		return (NULL);
224 	}
225 }
226 
/*
 * idm_soshutdown will disconnect the socket and prevent subsequent PDU
 * reception and transmission.  The sonode still exists but its state
 * gets modified to indicate it is no longer connected.  Calls to
 * idm_sorecv/idm_iov_sorecv will return, so idm_soshutdown can be used
 * to regain control of a thread stuck in idm_sorecv.
 *
 * Both directions are shut down (SHUT_RDWR); the result of
 * ksocket_shutdown() is deliberately ignored.
 */
void
idm_soshutdown(ksocket_t so)
{
	(void) ksocket_shutdown(so, SHUT_RDWR, CRED());
}
239 
/*
 * idm_sodestroy releases all resources associated with a socket previously
 * created with idm_socreate.  The socket must be shutdown using
 * idm_soshutdown before the socket is destroyed with idm_sodestroy,
 * otherwise undefined behavior will result.
 *
 * The result of ksocket_close() is deliberately ignored.
 */
void
idm_sodestroy(ksocket_t ks)
{
	(void) ksocket_close(ks, CRED());
}
251 
252 /*
253  * Function to compare two addresses in sockaddr_storage format
254  */
255 
256 int
257 idm_ss_compare(const struct sockaddr_storage *cmp_ss1,
258     const struct sockaddr_storage *cmp_ss2,
259     boolean_t v4_mapped_as_v4,
260     boolean_t compare_ports)
261 {
262 	struct sockaddr_storage			mapped_v4_ss1, mapped_v4_ss2;
263 	const struct sockaddr_storage		*ss1, *ss2;
264 	struct in_addr				*in1, *in2;
265 	struct in6_addr				*in61, *in62;
266 	int i;
267 
268 	/*
269 	 * Normalize V4-mapped IPv6 addresses into V4 format if
270 	 * v4_mapped_as_v4 is B_TRUE.
271 	 */
272 	ss1 = cmp_ss1;
273 	ss2 = cmp_ss2;
274 	if (v4_mapped_as_v4 && (ss1->ss_family == AF_INET6)) {
275 		in61 = &((struct sockaddr_in6 *)ss1)->sin6_addr;
276 		if (IN6_IS_ADDR_V4MAPPED(in61)) {
277 			bzero(&mapped_v4_ss1, sizeof (mapped_v4_ss1));
278 			mapped_v4_ss1.ss_family = AF_INET;
279 			((struct sockaddr_in *)&mapped_v4_ss1)->sin_port =
280 			    ((struct sockaddr_in *)ss1)->sin_port;
281 			IN6_V4MAPPED_TO_INADDR(in61,
282 			    &((struct sockaddr_in *)&mapped_v4_ss1)->sin_addr);
283 			ss1 = &mapped_v4_ss1;
284 		}
285 	}
286 	ss2 = cmp_ss2;
287 	if (v4_mapped_as_v4 && (ss2->ss_family == AF_INET6)) {
288 		in62 = &((struct sockaddr_in6 *)ss2)->sin6_addr;
289 		if (IN6_IS_ADDR_V4MAPPED(in62)) {
290 			bzero(&mapped_v4_ss2, sizeof (mapped_v4_ss2));
291 			mapped_v4_ss2.ss_family = AF_INET;
292 			((struct sockaddr_in *)&mapped_v4_ss2)->sin_port =
293 			    ((struct sockaddr_in *)ss2)->sin_port;
294 			IN6_V4MAPPED_TO_INADDR(in62,
295 			    &((struct sockaddr_in *)&mapped_v4_ss2)->sin_addr);
296 			ss2 = &mapped_v4_ss2;
297 		}
298 	}
299 
300 	/*
301 	 * Compare ports, then address family, then ip address
302 	 */
303 	if (compare_ports &&
304 	    (((struct sockaddr_in *)ss1)->sin_port !=
305 	    ((struct sockaddr_in *)ss2)->sin_port)) {
306 		if (((struct sockaddr_in *)ss1)->sin_port >
307 		    ((struct sockaddr_in *)ss2)->sin_port)
308 			return (1);
309 		else
310 			return (-1);
311 	}
312 
313 	/*
314 	 * ports are the same
315 	 */
316 	if (ss1->ss_family != ss2->ss_family) {
317 		if (ss1->ss_family == AF_INET)
318 			return (1);
319 		else
320 			return (-1);
321 	}
322 
323 	/*
324 	 * address families are the same
325 	 */
326 	if (ss1->ss_family == AF_INET) {
327 		in1 = &((struct sockaddr_in *)ss1)->sin_addr;
328 		in2 = &((struct sockaddr_in *)ss2)->sin_addr;
329 
330 		if (in1->s_addr > in2->s_addr)
331 			return (1);
332 		else if (in1->s_addr < in2->s_addr)
333 			return (-1);
334 		else
335 			return (0);
336 	} else if (ss1->ss_family == AF_INET6) {
337 		in61 = &((struct sockaddr_in6 *)ss1)->sin6_addr;
338 		in62 = &((struct sockaddr_in6 *)ss2)->sin6_addr;
339 
340 		for (i = 0; i < 4; i++) {
341 			if (in61->s6_addr32[i] > in62->s6_addr32[i])
342 				return (1);
343 			else if (in61->s6_addr32[i] < in62->s6_addr32[i])
344 				return (-1);
345 		}
346 		return (0);
347 	}
348 
349 	return (1);
350 }
351 
352 /*
353  * IP address filter functions to flag addresses that should not
354  * go out to initiators through discovery.
355  */
356 static boolean_t
357 idm_v4_addr_okay(struct in_addr *in_addr)
358 {
359 	in_addr_t addr = ntohl(in_addr->s_addr);
360 
361 	if ((INADDR_NONE == addr) ||
362 	    (IN_MULTICAST(addr)) ||
363 	    ((addr >> IN_CLASSA_NSHIFT) == 0) ||
364 	    ((addr >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET)) {
365 		return (B_FALSE);
366 	}
367 	return (B_TRUE);
368 }
369 
370 static boolean_t
371 idm_v6_addr_okay(struct in6_addr *addr6)
372 {
373 
374 	if ((IN6_IS_ADDR_UNSPECIFIED(addr6)) ||
375 	    (IN6_IS_ADDR_LOOPBACK(addr6)) ||
376 	    (IN6_IS_ADDR_MULTICAST(addr6)) ||
377 	    (IN6_IS_ADDR_V4MAPPED(addr6)) ||
378 	    (IN6_IS_ADDR_V4COMPAT(addr6)) ||
379 	    (IN6_IS_ADDR_LINKLOCAL(addr6))) {
380 		return (B_FALSE);
381 	}
382 	return (B_TRUE);
383 }
384 
/*
 * idm_get_ipaddr will retrieve a list of IP Addresses which the host is
 * configured with by sending down a sequence of kernel ioctl to IP STREAMS.
 *
 * On return *ipaddr_p points to a kmem_zalloc()ed idm_addr_list_t holding
 * the addresses that passed the filters (al_out_cnt entries in al_addrs),
 * or is NULL when no address was found or an error occurred.
 *
 * The return value is the size in bytes of the allocation behind
 * *ipaddr_p, or 0 when *ipaddr_p is NULL.  The allocation is sized for
 * every interface reported by SIOCGLIFCONF, not just the ones kept, so the
 * caller presumably has to pass this size to kmem_free() -- confirm
 * against the callers.
 */
int
idm_get_ipaddr(idm_addr_list_t **ipaddr_p)
{
	ksocket_t		so4, so6;
	struct lifnum		lifn;
	struct lifconf		lifc;
	struct lifreq		*lp;
	int			rval;
	int			numifs;
	int			bufsize;
	void			*buf;
	int			i, j, n, rc;
	struct sockaddr_storage	ss;
	struct sockaddr_in	*sin;
	struct sockaddr_in6	*sin6;
	idm_addr_t		*ip;
	idm_addr_list_t		*ipaddr = NULL;
	int			size_ipaddr;

	*ipaddr_p = NULL;
	size_ipaddr = 0;
	buf = NULL;

	/* create an ipv4 and ipv6 UDP socket */
	if ((so6 = idm_socreate(PF_INET6, SOCK_DGRAM, 0)) == NULL)
		return (0);
	if ((so4 = idm_socreate(PF_INET, SOCK_DGRAM, 0)) == NULL) {
		idm_sodestroy(so6);
		return (0);
	}


retry_count:
	/* snapshot the current number of interfaces */
	lifn.lifn_family = PF_UNSPEC;
	lifn.lifn_flags = LIFC_NOXMIT | LIFC_TEMPORARY | LIFC_ALLZONES;
	lifn.lifn_count = 0;
	/* use so6 for ioctls with unspecified families by default */
	if (ksocket_ioctl(so6, SIOCGLIFNUM, (intptr_t)&lifn, &rval, CRED())
	    != 0) {
		goto cleanup;
	}

	numifs = lifn.lifn_count;
	if (numifs <= 0) {
		goto cleanup;
	}

	/* allocate extra room in case more interfaces appear */
	numifs += 10;

	/* get the interface names and ip addresses */
	bufsize = numifs * sizeof (struct lifreq);
	buf = kmem_alloc(bufsize, KM_SLEEP);

	lifc.lifc_family = AF_UNSPEC;
	lifc.lifc_flags = LIFC_NOXMIT | LIFC_TEMPORARY | LIFC_ALLZONES;
	lifc.lifc_len = bufsize;
	lifc.lifc_buf = buf;
	rc = ksocket_ioctl(so6, SIOCGLIFCONF, (intptr_t)&lifc, &rval, CRED());
	if (rc != 0) {
		goto cleanup;
	}
	/*
	 * if our extra room is used up, try again: a completely full
	 * buffer means the list may have been truncated
	 */
	if (bufsize <= lifc.lifc_len) {
		kmem_free(buf, bufsize);
		buf = NULL;
		goto retry_count;
	}
	/* calc actual number of ifconfs */
	n = lifc.lifc_len / sizeof (struct lifreq);

	/*
	 * get ip address; the list header is sized as if it already
	 * contains one idm_addr_t, hence the (n - 1)
	 */
	if (n > 0) {
		size_ipaddr = sizeof (idm_addr_list_t) +
		    (n - 1) * sizeof (idm_addr_t);
		ipaddr = kmem_zalloc(size_ipaddr, KM_SLEEP);
	} else {
		goto cleanup;
	}

	/*
	 * Examine the array of interfaces and filter uninteresting ones.
	 * i walks the reported interfaces, j counts the entries kept.
	 */
	for (i = 0, j = 0, lp = lifc.lifc_req; i < n; i++, lp++) {

		/*
		 * Copy the address as the SIOCGLIFFLAGS ioctl is destructive
		 */
		ss = lp->lifr_addr;
		/*
		 * fetch the flags using the socket of the correct family
		 */
		switch (ss.ss_family) {
		case AF_INET:
			rc = ksocket_ioctl(so4, SIOCGLIFFLAGS, (intptr_t)lp,
			    &rval, CRED());
			break;
		case AF_INET6:
			rc = ksocket_ioctl(so6, SIOCGLIFFLAGS, (intptr_t)lp,
			    &rval, CRED());
			break;
		default:
			continue;
		}
		if (rc == 0) {
			/*
			 * If we got the flags, skip uninteresting
			 * interfaces based on flags.  If the ioctl failed
			 * the interface is still considered below.
			 */
			if ((lp->lifr_flags & IFF_UP) != IFF_UP)
				continue;
			if (lp->lifr_flags &
			    (IFF_ANYCAST|IFF_NOLOCAL|IFF_DEPRECATED))
				continue;
		}

		/*
		 * save ip address; slot j is only consumed (j++) when the
		 * address passes the per-family filter
		 */
		ip = &ipaddr->al_addrs[j];
		switch (ss.ss_family) {
		case AF_INET:
			sin = (struct sockaddr_in *)&ss;
			if (!idm_v4_addr_okay(&sin->sin_addr))
				continue;
			ip->a_addr.i_addr.in4 = sin->sin_addr;
			ip->a_addr.i_insize = sizeof (struct in_addr);
			break;
		case AF_INET6:
			sin6 = (struct sockaddr_in6 *)&ss;
			if (!idm_v6_addr_okay(&sin6->sin6_addr))
				continue;
			ip->a_addr.i_addr.in6 = sin6->sin6_addr;
			ip->a_addr.i_insize = sizeof (struct in6_addr);
			break;
		default:
			continue;
		}
		j++;
	}

	if (j == 0) {
		/* no valid ifaddr */
		kmem_free(ipaddr, size_ipaddr);
		size_ipaddr = 0;
		ipaddr = NULL;
	} else {
		ipaddr->al_out_cnt = j;
	}


cleanup:
	/* Common exit: release both sockets and the SIOCGLIFCONF buffer */
	idm_sodestroy(so6);
	idm_sodestroy(so4);

	if (buf != NULL)
		kmem_free(buf, bufsize);

	*ipaddr_p = ipaddr;
	return (size_ipaddr);
}
549 
550 int
551 idm_sorecv(ksocket_t so, void *msg, size_t len)
552 {
553 	iovec_t iov;
554 
555 	ASSERT(so != NULL);
556 	ASSERT(len != 0);
557 
558 	/*
559 	 * Fill in iovec and receive data
560 	 */
561 	iov.iov_base = msg;
562 	iov.iov_len = len;
563 
564 	return (idm_iov_sorecv(so, &iov, 1, len));
565 }
566 
567 /*
568  * idm_sosendto - Sends a buffered data on a non-connected socket.
569  *
570  * This function puts the data provided on the wire by calling sosendmsg.
571  * It will return only when all the data has been sent or if an error
572  * occurs.
573  *
574  * Returns 0 for success, the socket errno value if sosendmsg fails, and
575  * -1 if sosendmsg returns success but uio_resid != 0
576  */
577 int
578 idm_sosendto(ksocket_t so, void *buff, size_t len,
579     struct sockaddr *name, socklen_t namelen)
580 {
581 	struct msghdr		msg;
582 	struct iovec		iov[1];
583 	int			error;
584 	size_t			sent = 0;
585 
586 	iov[0].iov_base	= buff;
587 	iov[0].iov_len	= len;
588 
589 	/* Initialization of the message header. */
590 	bzero(&msg, sizeof (msg));
591 	msg.msg_iov	= iov;
592 	msg.msg_iovlen	= 1;
593 	msg.msg_name	= name;
594 	msg.msg_namelen	= namelen;
595 
596 	if ((error = ksocket_sendmsg(so, &msg, 0, &sent, CRED())) == 0) {
597 		/* Data sent */
598 		if (sent == len) {
599 			/* All data sent.  Success. */
600 			return (0);
601 		} else {
602 			/* Not all data was sent.  Failure */
603 			return (-1);
604 		}
605 	}
606 
607 	/* Send failed */
608 	return (error);
609 }
610 
611 /*
612  * idm_iov_sosend - Sends an iovec on a connection.
613  *
614  * This function puts the data provided on the wire by calling sosendmsg.
615  * It will return only when all the data has been sent or if an error
616  * occurs.
617  *
618  * Returns 0 for success, the socket errno value if sosendmsg fails, and
619  * -1 if sosendmsg returns success but uio_resid != 0
620  */
621 int
622 idm_iov_sosend(ksocket_t so, iovec_t *iop, int iovlen, size_t total_len)
623 {
624 	struct msghdr		msg;
625 	int			error;
626 	size_t			sent = 0;
627 
628 	ASSERT(iop != NULL);
629 
630 	/* Initialization of the message header. */
631 	bzero(&msg, sizeof (msg));
632 	msg.msg_iov	= iop;
633 	msg.msg_iovlen	= iovlen;
634 
635 	if ((error = ksocket_sendmsg(so, &msg, 0, &sent, CRED()))
636 	    == 0) {
637 		/* Data sent */
638 		if (sent == total_len) {
639 			/* All data sent.  Success. */
640 			return (0);
641 		} else {
642 			/* Not all data was sent.  Failure */
643 			return (-1);
644 		}
645 	}
646 
647 	/* Send failed */
648 	return (error);
649 }
650 
651 /*
652  * idm_iov_sorecv - Receives an iovec from a connection
653  *
654  * This function gets the data asked for from the socket.  It will return
655  * only when all the requested data has been retrieved or if an error
656  * occurs.
657  *
658  * Returns 0 for success, the socket errno value if sorecvmsg fails, and
659  * -1 if sorecvmsg returns success but uio_resid != 0
660  */
661 int
662 idm_iov_sorecv(ksocket_t so, iovec_t *iop, int iovlen, size_t total_len)
663 {
664 	struct msghdr		msg;
665 	int			error;
666 	size_t			recv;
667 	int			flags;
668 
669 	ASSERT(iop != NULL);
670 
671 	/* Initialization of the message header. */
672 	bzero(&msg, sizeof (msg));
673 	msg.msg_iov	= iop;
674 	msg.msg_iovlen	= iovlen;
675 	flags		= MSG_WAITALL;
676 
677 	if ((error = ksocket_recvmsg(so, &msg, flags, &recv, CRED()))
678 	    == 0) {
679 		/* Received data */
680 		if (recv == total_len) {
681 			/* All requested data received.  Success */
682 			return (0);
683 		} else {
684 			/*
685 			 * Not all data was received.  The connection has
686 			 * probably failed.
687 			 */
688 			return (-1);
689 		}
690 	}
691 
692 	/* Receive failed */
693 	return (error);
694 }
695 
696 static void
697 idm_set_ini_preconnect_options(idm_so_conn_t *sc, boolean_t boot_conn)
698 {
699 	int	conn_abort = 10000;
700 	int	conn_notify = 2000;
701 	int	abort = 30000;
702 
703 	/* Pre-connect socket options */
704 	(void) ksocket_setsockopt(sc->ic_so, IPPROTO_TCP,
705 	    TCP_CONN_NOTIFY_THRESHOLD, (char *)&conn_notify, sizeof (int),
706 	    CRED());
707 	if (boot_conn == B_FALSE) {
708 		(void) ksocket_setsockopt(sc->ic_so, IPPROTO_TCP,
709 		    TCP_CONN_ABORT_THRESHOLD, (char *)&conn_abort, sizeof (int),
710 		    CRED());
711 		(void) ksocket_setsockopt(sc->ic_so, IPPROTO_TCP,
712 		    TCP_ABORT_THRESHOLD,
713 		    (char *)&abort, sizeof (int), CRED());
714 	}
715 }
716 
717 static void
718 idm_set_postconnect_options(ksocket_t ks)
719 {
720 	const int	on = 1;
721 
722 	/* Set connect options */
723 	(void) ksocket_setsockopt(ks, SOL_SOCKET, SO_RCVBUF,
724 	    (char *)&idm_so_rcvbuf, sizeof (idm_so_rcvbuf), CRED());
725 	(void) ksocket_setsockopt(ks, SOL_SOCKET, SO_SNDBUF,
726 	    (char *)&idm_so_sndbuf, sizeof (idm_so_sndbuf), CRED());
727 	(void) ksocket_setsockopt(ks, IPPROTO_TCP, TCP_NODELAY,
728 	    (char *)&on, sizeof (on), CRED());
729 }
730 
731 static uint32_t
732 n2h24(const uchar_t *ptr)
733 {
734 	return ((ptr[0] << 16) | (ptr[1] << 8) | ptr[2]);
735 }
736 
737 static boolean_t
738 idm_dataseglenokay(idm_conn_t *ic, idm_pdu_t *pdu)
739 {
740 	iscsi_hdr_t	*bhs;
741 
742 	if (ic->ic_conn_type == CONN_TYPE_TGT &&
743 	    pdu->isp_datalen > ic->ic_conn_params.max_recv_dataseglen) {
744 		IDM_CONN_LOG(CE_WARN,
745 		    "idm_dataseglenokay: exceeded the max data segment length");
746 		return (B_FALSE);
747 	}
748 
749 	bhs = pdu->isp_hdr;
750 	/*
751 	 * Filter out any RFC3720 data-size violations.
752 	 */
753 	switch (IDM_PDU_OPCODE(pdu)) {
754 	case ISCSI_OP_SCSI_TASK_MGT_MSG:
755 	case ISCSI_OP_SCSI_TASK_MGT_RSP:
756 	case ISCSI_OP_RTT_RSP:
757 	case ISCSI_OP_LOGOUT_CMD:
758 		/*
759 		 * Data-segment not allowed and additional headers not allowed.
760 		 * (both must be zero according to the RFC3720.)
761 		 */
762 		if (bhs->hlength != 0 || pdu->isp_datalen != 0)
763 			return (B_FALSE);
764 		break;
765 	case ISCSI_OP_NOOP_OUT:
766 	case ISCSI_OP_LOGIN_CMD:
767 	case ISCSI_OP_TEXT_CMD:
768 	case ISCSI_OP_SNACK_CMD:
769 	case ISCSI_OP_NOOP_IN:
770 	case ISCSI_OP_SCSI_RSP:
771 	case ISCSI_OP_LOGIN_RSP:
772 	case ISCSI_OP_TEXT_RSP:
773 	case ISCSI_OP_SCSI_DATA_RSP:
774 	case ISCSI_OP_LOGOUT_RSP:
775 	case ISCSI_OP_ASYNC_EVENT:
776 	case ISCSI_OP_REJECT_MSG:
777 		/*
778 		 * Additional headers not allowed.
779 		 * (must be zero according to RFC3720.)
780 		 */
781 		if (bhs->hlength)
782 			return (B_FALSE);
783 		break;
784 	case ISCSI_OP_SCSI_CMD:
785 		/*
786 		 * See RFC3720, section 10.3
787 		 *
788 		 * For pure read cmds, data-segment-length must be zero.
789 		 * For non-final transfers, data-size must be even number of
790 		 * 4-byte words.
791 		 * For any transfer, an expected byte count must be provided.
792 		 * For bidirectional transfers, an additional-header must be
793 		 * provided (for the read byte-count.)
794 		 */
795 		if (pdu->isp_datalen) {
796 			if ((bhs->flags & (ISCSI_FLAG_CMD_READ |
797 			    ISCSI_FLAG_CMD_WRITE)) == ISCSI_FLAG_CMD_READ)
798 				return (B_FALSE);
799 			if ((bhs->flags & ISCSI_FLAG_FINAL) == 0 &&
800 			    ((pdu->isp_datalen & 0x3) != 0))
801 				return (B_FALSE);
802 		}
803 		if (bhs->flags & (ISCSI_FLAG_CMD_READ |
804 		    ISCSI_FLAG_CMD_WRITE)) {
805 			iscsi_scsi_cmd_hdr_t *cmdhdr =
806 			    (iscsi_scsi_cmd_hdr_t *)bhs;
807 			/*
808 			 * we're transfering some data, we must have a
809 			 * byte count
810 			 */
811 			if (cmdhdr->data_length == 0)
812 				return (B_FALSE);
813 		}
814 		break;
815 	case ISCSI_OP_SCSI_DATA:
816 		/*
817 		 * See RFC3720, section 10.7
818 		 *
819 		 * Additional headers aren't allowed, and the data-size must
820 		 * be an even number of 4-byte words (unless the final bit
821 		 * is set.)
822 		 */
823 		if (bhs->hlength)
824 			return (B_FALSE);
825 		if ((bhs->flags & ISCSI_FLAG_FINAL) == 0 &&
826 		    ((pdu->isp_datalen & 0x3) != 0))
827 			return (B_FALSE);
828 		break;
829 	default:
830 		break;
831 	}
832 	return (B_TRUE);
833 }
834 
835 static idm_status_t
836 idm_sorecvhdr(idm_conn_t *ic, idm_pdu_t *pdu)
837 {
838 	iscsi_hdr_t	*bhs;
839 	uint32_t	hdr_digest_crc;
840 	uint32_t	crc_calculated;
841 	void		*new_hdr;
842 	int		ahslen = 0;
843 	int		total_len = 0;
844 	int		iovlen = 0;
845 	struct iovec	iov[2];
846 	idm_so_conn_t	*so_conn;
847 	int		rc;
848 
849 	so_conn = ic->ic_transport_private;
850 
851 	/*
852 	 * Read BHS
853 	 */
854 	bhs = pdu->isp_hdr;
855 	rc = idm_sorecv(so_conn->ic_so, pdu->isp_hdr, sizeof (iscsi_hdr_t));
856 	if (rc != IDM_STATUS_SUCCESS) {
857 		return (IDM_STATUS_FAIL);
858 	}
859 
860 	/*
861 	 * Check actual AHS length against the amount available in the buffer
862 	 */
863 	if ((IDM_PDU_OPCODE(pdu) != ISCSI_OP_SCSI_CMD) &&
864 	    (bhs->hlength != 0)) {
865 		/* ---- hlength is only only valid for SCSI Request ---- */
866 		return (IDM_STATUS_FAIL);
867 	}
868 	pdu->isp_hdrlen = sizeof (iscsi_hdr_t) +
869 	    (bhs->hlength * sizeof (uint32_t));
870 	pdu->isp_datalen = n2h24(bhs->dlength);
871 
872 	if (!idm_dataseglenokay(ic, pdu)) {
873 		IDM_CONN_LOG(CE_WARN,
874 		    "idm_sorecvhdr: invalid data segment length");
875 		return (IDM_STATUS_FAIL);
876 	}
877 	if (bhs->hlength > IDM_SORX_WIRE_AHSLEN) {
878 		/* Allocate a new header segment and change the callback */
879 		new_hdr = kmem_alloc(pdu->isp_hdrlen, KM_SLEEP);
880 		bcopy(pdu->isp_hdr, new_hdr, sizeof (iscsi_hdr_t));
881 		pdu->isp_hdr = new_hdr;
882 		pdu->isp_flags |= IDM_PDU_ADDL_HDR;
883 
884 		/*
885 		 * This callback will restore the expected values after
886 		 * the RX PDU has been processed.
887 		 */
888 		pdu->isp_callback = idm_sorx_addl_pdu_cb;
889 	}
890 
891 	/*
892 	 * Setup receipt of additional header and header digest (if enabled).
893 	 */
894 	if (bhs->hlength > 0) {
895 		iov[iovlen].iov_base = (caddr_t)(pdu->isp_hdr + 1);
896 		ahslen = pdu->isp_hdrlen - sizeof (iscsi_hdr_t);
897 		iov[iovlen].iov_len = ahslen;
898 		total_len += iov[iovlen].iov_len;
899 		iovlen++;
900 	}
901 
902 	if (ic->ic_conn_flags & IDM_CONN_HEADER_DIGEST) {
903 		iov[iovlen].iov_base = (caddr_t)&hdr_digest_crc;
904 		iov[iovlen].iov_len = sizeof (hdr_digest_crc);
905 		total_len += iov[iovlen].iov_len;
906 		iovlen++;
907 	}
908 
909 	if ((iovlen != 0) &&
910 	    (idm_iov_sorecv(so_conn->ic_so, &iov[0], iovlen,
911 	    total_len) != 0)) {
912 		return (IDM_STATUS_FAIL);
913 	}
914 
915 	/*
916 	 * Validate header digest if enabled
917 	 */
918 	if (ic->ic_conn_flags & IDM_CONN_HEADER_DIGEST) {
919 		crc_calculated = idm_crc32c(pdu->isp_hdr,
920 		    sizeof (iscsi_hdr_t) + ahslen);
921 		if (crc_calculated != hdr_digest_crc) {
922 			/* Invalid Header Digest */
923 			return (IDM_STATUS_HEADER_DIGEST);
924 		}
925 	}
926 
927 	return (0);
928 }
929 
930 /*
931  * idm_so_ini_conn_create()
932  * Allocate the sockets transport connection resources.
933  */
934 static idm_status_t
935 idm_so_ini_conn_create(idm_conn_req_t *cr, idm_conn_t *ic)
936 {
937 	ksocket_t	so;
938 	idm_so_conn_t	*so_conn;
939 	idm_status_t	idmrc;
940 
941 	so = idm_socreate(cr->cr_domain, cr->cr_type,
942 	    cr->cr_protocol);
943 	if (so == NULL) {
944 		return (IDM_STATUS_FAIL);
945 	}
946 
947 	/* Bind the socket if configured to do so */
948 	if (cr->cr_bound) {
949 		if (ksocket_bind(so, &cr->cr_bound_addr.sin,
950 		    SIZEOF_SOCKADDR(&cr->cr_bound_addr.sin), CRED()) != 0) {
951 			idm_sodestroy(so);
952 			return (IDM_STATUS_FAIL);
953 		}
954 	}
955 
956 	idmrc = idm_so_conn_create_common(ic, so);
957 	if (idmrc != IDM_STATUS_SUCCESS) {
958 		idm_soshutdown(so);
959 		idm_sodestroy(so);
960 		return (IDM_STATUS_FAIL);
961 	}
962 
963 	so_conn = ic->ic_transport_private;
964 	/* Set up socket options */
965 	idm_set_ini_preconnect_options(so_conn, cr->cr_boot_conn);
966 
967 	return (IDM_STATUS_SUCCESS);
968 }
969 
/*
 * idm_so_ini_conn_destroy()
 * Tear down the sockets transport connection resources.
 *
 * Initiator-side entry point; all of the work is done by
 * idm_so_conn_destroy_common().
 */
static void
idm_so_ini_conn_destroy(idm_conn_t *ic)
{
	idm_so_conn_destroy_common(ic);
}
979 
/*
 * idm_so_ini_conn_connect()
 * Establish the connection referred to by the handle previously allocated via
 * idm_so_ini_conn_create().
 *
 * When ic_conn_params.nonblock_socket is set, the socket is switched to
 * non-blocking mode and ksocket_connect() is retried until it succeeds,
 * fails with ETIMEDOUT/ECONNREFUSED/ECONNRESET, or the current lbolt
 * passes ic_conn_params.conn_login_max (which is compared directly
 * against ddi_get_lbolt(), so it is presumably an absolute tick value --
 * confirm against the code that sets it).  Otherwise a single blocking
 * ksocket_connect() is issued.
 *
 * Returns IDM_STATUS_SUCCESS once connected and the post-connect setup is
 * done; on failure the socket is shut down and IDM_STATUS_FAIL returned.
 */
static idm_status_t
idm_so_ini_conn_connect(idm_conn_t *ic)
{
	idm_so_conn_t	*so_conn;
	struct sonode	*node = NULL;
	int		rc;
	clock_t		lbolt, conn_login_max, conn_login_interval;
	boolean_t	nonblock;

	so_conn = ic->ic_transport_private;
	nonblock = ic->ic_conn_params.nonblock_socket;
	conn_login_max = ic->ic_conn_params.conn_login_max;
	/* lbolt value at which the current retry interval ends */
	conn_login_interval = ddi_get_lbolt() +
	    SEC_TO_TICK(ic->ic_conn_params.conn_login_interval);

	if (nonblock == B_TRUE) {
		node = ((struct sonode *)(so_conn->ic_so));
		/* Switch the socket to non-blocking mode */
		idm_so_socket_set_nonblock(node);
		do {
			rc = ksocket_connect(so_conn->ic_so,
			    &ic->ic_ini_dst_addr.sin,
			    (SIZEOF_SOCKADDR(&ic->ic_ini_dst_addr.sin)),
			    CRED());
			if (rc == 0 || rc == EISCONN) {
				/* connected now, or already connected */
				rc = IDM_STATUS_SUCCESS;
				break;
			}
			if ((rc == ETIMEDOUT) || (rc == ECONNREFUSED) ||
			    (rc == ECONNRESET)) {
				/* connection timed out, refused or reset */
				break;
			}
			lbolt = ddi_get_lbolt();
			if (lbolt > conn_login_max) {
				/*
				 * Connection retry timeout,
				 * failed connect to target.
				 */
				break;
			}
			if (lbolt < conn_login_interval) {
				if ((rc == EINPROGRESS) || (rc == EALREADY)) {
					/* TCP connect still in progress */
					delay(SEC_TO_TICK(IN_PROGRESS_DELAY));
					continue;
				} else {
					/* sleep out the rest of the interval */
					delay(conn_login_interval - lbolt);
				}
			}
			/* start a new retry interval */
			conn_login_interval = ddi_get_lbolt() +
			    SEC_TO_TICK(ic->ic_conn_params.conn_login_interval);
		} while (rc != 0);
		/* connected: put the socket back into blocking mode */
		if (rc == IDM_STATUS_SUCCESS) {
			idm_so_socket_set_block(node);
		}
	} else {
		rc = ksocket_connect(so_conn->ic_so, &ic->ic_ini_dst_addr.sin,
		    (SIZEOF_SOCKADDR(&ic->ic_ini_dst_addr.sin)), CRED());
	}

	/* rc is 0 on success, otherwise the last ksocket_connect() error */
	if (rc != 0) {
		idm_soshutdown(so_conn->ic_so);
		return (IDM_STATUS_FAIL);
	}

	idm_so_conn_connect_common(ic);

	idm_set_postconnect_options(so_conn->ic_so);

	return (IDM_STATUS_SUCCESS);
}
1059 
1060 idm_status_t
1061 idm_so_tgt_conn_create(idm_conn_t *ic, ksocket_t new_so)
1062 {
1063 	idm_status_t	idmrc;
1064 
1065 	idm_set_postconnect_options(new_so);
1066 	idmrc = idm_so_conn_create_common(ic, new_so);
1067 
1068 	return (idmrc);
1069 }
1070 
1071 static void
1072 idm_so_tgt_conn_destroy(idm_conn_t *ic)
1073 {
1074 	idm_so_conn_destroy_common(ic);
1075 }
1076 
1077 /*
1078  * idm_so_tgt_conn_connect()
1079  * Establish the connection in ic, passed from idm_tgt_conn_finish(), which
1080  * is invoked from the SM as a result of an inbound connection request.
1081  */
1082 static idm_status_t
1083 idm_so_tgt_conn_connect(idm_conn_t *ic)
1084 {
1085 	idm_so_conn_connect_common(ic);
1086 
1087 	return (IDM_STATUS_SUCCESS);
1088 }
1089 
1090 static idm_status_t
1091 idm_so_conn_create_common(idm_conn_t *ic, ksocket_t new_so)
1092 {
1093 	idm_so_conn_t	*so_conn;
1094 
1095 	so_conn = kmem_zalloc(sizeof (idm_so_conn_t), KM_SLEEP);
1096 	so_conn->ic_so = new_so;
1097 
1098 	ic->ic_transport_private = so_conn;
1099 	ic->ic_transport_hdrlen = 0;
1100 
1101 	/* Set the scoreboarding flag on this connection */
1102 	ic->ic_conn_flags |= IDM_CONN_USE_SCOREBOARD;
1103 	ic->ic_conn_params.max_recv_dataseglen =
1104 	    ISCSI_DEFAULT_MAX_RECV_SEG_LEN;
1105 	ic->ic_conn_params.max_xmit_dataseglen =
1106 	    ISCSI_DEFAULT_MAX_XMIT_SEG_LEN;
1107 
1108 	/*
1109 	 * Initialize tx thread mutex and list
1110 	 */
1111 	mutex_init(&so_conn->ic_tx_mutex, NULL, MUTEX_DEFAULT, NULL);
1112 	cv_init(&so_conn->ic_tx_cv, NULL, CV_DEFAULT, NULL);
1113 	list_create(&so_conn->ic_tx_list, sizeof (idm_pdu_t),
1114 	    offsetof(idm_pdu_t, idm_tx_link));
1115 
1116 	return (IDM_STATUS_SUCCESS);
1117 }
1118 
1119 static void
1120 idm_so_conn_destroy_common(idm_conn_t *ic)
1121 {
1122 	idm_so_conn_t	*so_conn = ic->ic_transport_private;
1123 
1124 	ic->ic_transport_private = NULL;
1125 	idm_sodestroy(so_conn->ic_so);
1126 	list_destroy(&so_conn->ic_tx_list);
1127 	mutex_destroy(&so_conn->ic_tx_mutex);
1128 	cv_destroy(&so_conn->ic_tx_cv);
1129 
1130 	kmem_free(so_conn, sizeof (idm_so_conn_t));
1131 }
1132 
1133 static void
1134 idm_so_conn_connect_common(idm_conn_t *ic)
1135 {
1136 	idm_so_conn_t	*so_conn;
1137 	struct sockaddr_in6	t_addr;
1138 	socklen_t	t_addrlen = 0;
1139 
1140 	so_conn = ic->ic_transport_private;
1141 	bzero(&t_addr, sizeof (struct sockaddr_in6));
1142 	t_addrlen = sizeof (struct sockaddr_in6);
1143 
1144 	/* Set the local and remote addresses in the idm conn handle */
1145 	(void) ksocket_getsockname(so_conn->ic_so, (struct sockaddr *)&t_addr,
1146 	    &t_addrlen, CRED());
1147 	bcopy(&t_addr, &ic->ic_laddr, t_addrlen);
1148 	(void) ksocket_getpeername(so_conn->ic_so, (struct sockaddr *)&t_addr,
1149 	    &t_addrlen, CRED());
1150 	bcopy(&t_addr, &ic->ic_raddr, t_addrlen);
1151 
1152 	mutex_enter(&ic->ic_mutex);
1153 	so_conn->ic_tx_thread = thread_create(NULL, 0, idm_sotx_thread, ic, 0,
1154 	    &p0, TS_RUN, minclsyspri);
1155 	so_conn->ic_rx_thread = thread_create(NULL, 0, idm_sorx_thread, ic, 0,
1156 	    &p0, TS_RUN, minclsyspri);
1157 
1158 	while (so_conn->ic_rx_thread_did == 0 ||
1159 	    so_conn->ic_tx_thread_did == 0)
1160 		cv_wait(&ic->ic_cv, &ic->ic_mutex);
1161 	mutex_exit(&ic->ic_mutex);
1162 }
1163 
1164 /*
1165  * idm_so_conn_disconnect()
1166  * Shutdown the socket connection and stop the thread
1167  */
1168 static void
1169 idm_so_conn_disconnect(idm_conn_t *ic)
1170 {
1171 	idm_so_conn_t	*so_conn;
1172 
1173 	so_conn = ic->ic_transport_private;
1174 
1175 	mutex_enter(&ic->ic_mutex);
1176 	so_conn->ic_rx_thread_running = B_FALSE;
1177 	so_conn->ic_tx_thread_running = B_FALSE;
1178 	/* We need to wakeup the TX thread */
1179 	mutex_enter(&so_conn->ic_tx_mutex);
1180 	cv_signal(&so_conn->ic_tx_cv);
1181 	mutex_exit(&so_conn->ic_tx_mutex);
1182 	mutex_exit(&ic->ic_mutex);
1183 
1184 	/* This should wakeup the RX thread if it is sleeping */
1185 	idm_soshutdown(so_conn->ic_so);
1186 
1187 	thread_join(so_conn->ic_tx_thread_did);
1188 	thread_join(so_conn->ic_rx_thread_did);
1189 }
1190 
1191 /*
1192  * idm_so_tgt_svc_create()
1193  * Establish a service on an IP address and port.  idm_svc_req_t contains
1194  * the service parameters.
1195  */
1196 /*ARGSUSED*/
1197 static idm_status_t
1198 idm_so_tgt_svc_create(idm_svc_req_t *sr, idm_svc_t *is)
1199 {
1200 	idm_so_svc_t		*so_svc;
1201 
1202 	so_svc = kmem_zalloc(sizeof (idm_so_svc_t), KM_SLEEP);
1203 
1204 	/* Set the new sockets service in svc handle */
1205 	is->is_so_svc = (void *)so_svc;
1206 
1207 	return (IDM_STATUS_SUCCESS);
1208 }
1209 
1210 /*
1211  * idm_so_tgt_svc_destroy()
1212  * Teardown sockets resources allocated in idm_so_tgt_svc_create()
1213  */
1214 static void
1215 idm_so_tgt_svc_destroy(idm_svc_t *is)
1216 {
1217 	/* the socket will have been torn down; free the service */
1218 	kmem_free(is->is_so_svc, sizeof (idm_so_svc_t));
1219 }
1220 
1221 /*
1222  * idm_so_tgt_svc_online()
1223  * Launch a watch thread on the svc allocated in idm_so_tgt_svc_create()
1224  */
1225 
1226 static idm_status_t
1227 idm_so_tgt_svc_online(idm_svc_t *is)
1228 {
1229 	idm_so_svc_t		*so_svc;
1230 	idm_svc_req_t		*sr = &is->is_svc_req;
1231 	struct sockaddr_in6	sin6_ip;
1232 	const uint32_t		on = 1;
1233 	const uint32_t		off = 0;
1234 
1235 	mutex_enter(&is->is_mutex);
1236 	so_svc = (idm_so_svc_t *)is->is_so_svc;
1237 
1238 	/*
1239 	 * Try creating an IPv6 socket first
1240 	 */
1241 	if ((so_svc->is_so = idm_socreate(PF_INET6, SOCK_STREAM, 0)) == NULL) {
1242 		mutex_exit(&is->is_mutex);
1243 		return (IDM_STATUS_FAIL);
1244 	} else {
1245 		bzero(&sin6_ip, sizeof (sin6_ip));
1246 		sin6_ip.sin6_family = AF_INET6;
1247 		sin6_ip.sin6_port = htons(sr->sr_port);
1248 		sin6_ip.sin6_addr = in6addr_any;
1249 
1250 		(void) ksocket_setsockopt(so_svc->is_so, SOL_SOCKET,
1251 		    SO_REUSEADDR, (char *)&on, sizeof (on), CRED());
1252 		/*
1253 		 * Turn off SO_MAC_EXEMPT so future sobinds succeed
1254 		 */
1255 		(void) ksocket_setsockopt(so_svc->is_so, SOL_SOCKET,
1256 		    SO_MAC_EXEMPT, (char *)&off, sizeof (off), CRED());
1257 
1258 		if (ksocket_bind(so_svc->is_so, (struct sockaddr *)&sin6_ip,
1259 		    sizeof (sin6_ip), CRED()) != 0) {
1260 			mutex_exit(&is->is_mutex);
1261 			idm_sodestroy(so_svc->is_so);
1262 			return (IDM_STATUS_FAIL);
1263 		}
1264 	}
1265 
1266 	idm_set_postconnect_options(so_svc->is_so);
1267 
1268 	if (ksocket_listen(so_svc->is_so, 5, CRED()) != 0) {
1269 		mutex_exit(&is->is_mutex);
1270 		idm_soshutdown(so_svc->is_so);
1271 		idm_sodestroy(so_svc->is_so);
1272 		return (IDM_STATUS_FAIL);
1273 	}
1274 
1275 	/* Launch a watch thread */
1276 	so_svc->is_thread = thread_create(NULL, 0, idm_so_svc_port_watcher,
1277 	    is, 0, &p0, TS_RUN, minclsyspri);
1278 
1279 	if (so_svc->is_thread == NULL) {
1280 		/* Failure to launch; teardown the socket */
1281 		mutex_exit(&is->is_mutex);
1282 		idm_soshutdown(so_svc->is_so);
1283 		idm_sodestroy(so_svc->is_so);
1284 		return (IDM_STATUS_FAIL);
1285 	}
1286 	ksocket_hold(so_svc->is_so);
1287 	/* Wait for the port watcher thread to start */
1288 	while (!so_svc->is_thread_running)
1289 		cv_wait(&is->is_cv, &is->is_mutex);
1290 	mutex_exit(&is->is_mutex);
1291 
1292 	return (IDM_STATUS_SUCCESS);
1293 }
1294 
1295 /*
1296  * idm_so_tgt_svc_offline
1297  *
1298  * Stop listening on the IP address and port identified by idm_svc_t.
1299  */
1300 static void
1301 idm_so_tgt_svc_offline(idm_svc_t *is)
1302 {
1303 	idm_so_svc_t		*so_svc;
1304 	mutex_enter(&is->is_mutex);
1305 	so_svc = (idm_so_svc_t *)is->is_so_svc;
1306 	so_svc->is_thread_running = B_FALSE;
1307 	mutex_exit(&is->is_mutex);
1308 
1309 	/*
1310 	 * Teardown socket
1311 	 */
1312 	idm_sodestroy(so_svc->is_so);
1313 
1314 	/*
1315 	 * Now we expect the port watcher thread to terminate
1316 	 */
1317 	thread_join(so_svc->is_thread_did);
1318 }
1319 
1320 /*
1321  * Watch thread for target service connection establishment.
1322  */
1323 void
1324 idm_so_svc_port_watcher(void *arg)
1325 {
1326 	idm_svc_t		*svc = arg;
1327 	ksocket_t		new_so;
1328 	idm_conn_t		*ic;
1329 	idm_status_t		idmrc;
1330 	idm_so_svc_t		*so_svc;
1331 	int			rc;
1332 	const uint32_t		off = 0;
1333 	struct sockaddr_in6	t_addr;
1334 	socklen_t		t_addrlen;
1335 
1336 	bzero(&t_addr, sizeof (struct sockaddr_in6));
1337 	t_addrlen = sizeof (struct sockaddr_in6);
1338 	mutex_enter(&svc->is_mutex);
1339 
1340 	so_svc = svc->is_so_svc;
1341 	so_svc->is_thread_running = B_TRUE;
1342 	so_svc->is_thread_did = so_svc->is_thread->t_did;
1343 
1344 	cv_signal(&svc->is_cv);
1345 
1346 	IDM_SVC_LOG(CE_NOTE, "iSCSI service (%p/%d) online", (void *)svc,
1347 	    svc->is_svc_req.sr_port);
1348 
1349 	while (so_svc->is_thread_running) {
1350 		mutex_exit(&svc->is_mutex);
1351 
1352 		if ((rc = ksocket_accept(so_svc->is_so,
1353 		    (struct sockaddr *)&t_addr, &t_addrlen,
1354 		    &new_so, CRED())) != 0) {
1355 			mutex_enter(&svc->is_mutex);
1356 			if (rc != ECONNABORTED && rc != EINTR) {
1357 				IDM_SVC_LOG(CE_NOTE, "idm_so_svc_port_watcher:"
1358 				    " ksocket_accept failed %d", rc);
1359 			}
1360 			/*
1361 			 * Unclean shutdown of this thread is not handled
1362 			 * wait for !is_thread_running.
1363 			 */
1364 			continue;
1365 		}
1366 		/*
1367 		 * Turn off SO_MAC_EXEMPT so future sobinds succeed
1368 		 */
1369 		(void) ksocket_setsockopt(new_so, SOL_SOCKET, SO_MAC_EXEMPT,
1370 		    (char *)&off, sizeof (off), CRED());
1371 
1372 		idmrc = idm_svc_conn_create(svc, IDM_TRANSPORT_TYPE_SOCKETS,
1373 		    &ic);
1374 		if (idmrc != IDM_STATUS_SUCCESS) {
1375 			/* Drop connection */
1376 			idm_soshutdown(new_so);
1377 			idm_sodestroy(new_so);
1378 			mutex_enter(&svc->is_mutex);
1379 			continue;
1380 		}
1381 
1382 		idmrc = idm_so_tgt_conn_create(ic, new_so);
1383 		if (idmrc != IDM_STATUS_SUCCESS) {
1384 			idm_svc_conn_destroy(ic);
1385 			idm_soshutdown(new_so);
1386 			idm_sodestroy(new_so);
1387 			mutex_enter(&svc->is_mutex);
1388 			continue;
1389 		}
1390 
1391 		/*
1392 		 * Kick the state machine.  At CS_S3_XPT_UP the state machine
1393 		 * will notify the client (target) about the new connection.
1394 		 */
1395 		idm_conn_event(ic, CE_CONNECT_ACCEPT, (uintptr_t)NULL);
1396 
1397 		mutex_enter(&svc->is_mutex);
1398 	}
1399 	ksocket_rele(so_svc->is_so);
1400 	so_svc->is_thread_running = B_FALSE;
1401 	mutex_exit(&svc->is_mutex);
1402 
1403 	IDM_SVC_LOG(CE_NOTE, "iSCSI service (%p/%d) offline", (void *)svc,
1404 	    svc->is_svc_req.sr_port);
1405 
1406 	thread_exit();
1407 }
1408 
1409 /*
1410  * idm_so_free_task_rsrc() stops any ongoing processing of the task and
1411  * frees resources associated with the task.
1412  *
1413  * It's not clear that this should return idm_status_t.  What do we do
1414  * if it fails?
1415  */
1416 static idm_status_t
1417 idm_so_free_task_rsrc(idm_task_t *idt)
1418 {
1419 	idm_buf_t	*idb, *next_idb;
1420 
1421 	/*
1422 	 * There is nothing to cleanup on initiator connections
1423 	 */
1424 	if (IDM_CONN_ISINI(idt->idt_ic))
1425 		return (IDM_STATUS_SUCCESS);
1426 
1427 	/*
1428 	 * If this is a target connection, call idm_buf_rx_from_ini_done for
1429 	 * any buffer on the "outbufv" list with idb->idb_in_transport==B_TRUE.
1430 	 *
1431 	 * In addition, remove any buffers associated with this task from
1432 	 * the ic_tx_list.  We'll do this by walking the idt_inbufv list, but
1433 	 * items don't actually get removed from that list (and completion
1434 	 * routines called) until idm_task_cleanup.
1435 	 */
1436 	mutex_enter(&idt->idt_mutex);
1437 
1438 	for (idb = list_head(&idt->idt_outbufv); idb != NULL; idb = next_idb) {
1439 		next_idb = list_next(&idt->idt_outbufv, idb);
1440 		if (idb->idb_in_transport) {
1441 			/*
1442 			 * idm_buf_rx_from_ini_done releases idt->idt_mutex
1443 			 */
1444 			DTRACE_ISCSI_8(xfer__done, idm_conn_t *, idt->idt_ic,
1445 			    uintptr_t, idb->idb_buf,
1446 			    uint32_t, idb->idb_bufoffset,
1447 			    uint64_t, 0, uint32_t, 0, uint32_t, 0,
1448 			    uint32_t, idb->idb_xfer_len,
1449 			    int, XFER_BUF_RX_FROM_INI);
1450 			idm_buf_rx_from_ini_done(idt, idb, IDM_STATUS_ABORTED);
1451 			mutex_enter(&idt->idt_mutex);
1452 		}
1453 	}
1454 
1455 	for (idb = list_head(&idt->idt_inbufv); idb != NULL; idb = next_idb) {
1456 		next_idb = list_next(&idt->idt_inbufv, idb);
1457 		/*
1458 		 * We want to remove these items from the tx_list as well,
1459 		 * but knowing it's in the idt_inbufv list is not a guarantee
1460 		 * that it's in the tx_list.  If it's on the tx list then
1461 		 * let idm_sotx_thread() clean it up.
1462 		 */
1463 		if (idb->idb_in_transport && !idb->idb_tx_thread) {
1464 			/*
1465 			 * idm_buf_tx_to_ini_done releases idt->idt_mutex
1466 			 */
1467 			DTRACE_ISCSI_8(xfer__done, idm_conn_t *, idt->idt_ic,
1468 			    uintptr_t, idb->idb_buf,
1469 			    uint32_t, idb->idb_bufoffset,
1470 			    uint64_t, 0, uint32_t, 0, uint32_t, 0,
1471 			    uint32_t, idb->idb_xfer_len,
1472 			    int, XFER_BUF_TX_TO_INI);
1473 			idm_buf_tx_to_ini_done(idt, idb, IDM_STATUS_ABORTED);
1474 			mutex_enter(&idt->idt_mutex);
1475 		}
1476 	}
1477 
1478 	mutex_exit(&idt->idt_mutex);
1479 
1480 	return (IDM_STATUS_SUCCESS);
1481 }
1482 
1483 /*
1484  * idm_so_negotiate_key_values() validates the key values for this connection
1485  */
1486 /* ARGSUSED */
1487 static kv_status_t
1488 idm_so_negotiate_key_values(idm_conn_t *it, nvlist_t *request_nvl,
1489     nvlist_t *response_nvl, nvlist_t *negotiated_nvl)
1490 {
1491 	/* All parameters are negotiated at the iscsit level */
1492 	return (KV_HANDLED);
1493 }
1494 
1495 /*
1496  * idm_so_notice_key_values() activates the negotiated key values for
1497  * this connection.
1498  */
1499 static void
1500 idm_so_notice_key_values(idm_conn_t *it, nvlist_t *negotiated_nvl)
1501 {
1502 	char			*nvp_name;
1503 	nvpair_t		*nvp;
1504 	nvpair_t		*next_nvp;
1505 	int			nvrc;
1506 	idm_status_t		idm_status;
1507 	const idm_kv_xlate_t	*ikvx;
1508 	uint64_t		num_val;
1509 
1510 	for (nvp = nvlist_next_nvpair(negotiated_nvl, NULL);
1511 	    nvp != NULL; nvp = next_nvp) {
1512 		next_nvp = nvlist_next_nvpair(negotiated_nvl, nvp);
1513 		nvp_name = nvpair_name(nvp);
1514 
1515 		ikvx = idm_lookup_kv_xlate(nvp_name, strlen(nvp_name));
1516 		switch (ikvx->ik_key_id) {
1517 		case KI_HEADER_DIGEST:
1518 		case KI_DATA_DIGEST:
1519 			idm_status = idm_so_handle_digest(it, nvp, ikvx);
1520 			ASSERT(idm_status == 0);
1521 
1522 			/* Remove processed item from negotiated_nvl list */
1523 			nvrc = nvlist_remove_all(
1524 			    negotiated_nvl, ikvx->ik_key_name);
1525 			ASSERT(nvrc == 0);
1526 			break;
1527 		case KI_MAX_RECV_DATA_SEGMENT_LENGTH:
1528 			/*
1529 			 * Just pass the value down to idm layer.
1530 			 * No need to remove it from negotiated_nvl list here.
1531 			 */
1532 			nvrc = nvpair_value_uint64(nvp, &num_val);
1533 			ASSERT(nvrc == 0);
1534 			it->ic_conn_params.max_xmit_dataseglen =
1535 			    (uint32_t)num_val;
1536 			break;
1537 		default:
1538 			break;
1539 		}
1540 	}
1541 }
1542 
1543 /*
1544  * idm_so_declare_key_values() declares the key values for this connection
1545  */
1546 /* ARGSUSED */
1547 static kv_status_t
1548 idm_so_declare_key_values(idm_conn_t *it, nvlist_t *config_nvl,
1549     nvlist_t *outgoing_nvl)
1550 {
1551 	char			*nvp_name;
1552 	nvpair_t		*nvp;
1553 	nvpair_t		*next_nvp;
1554 	kv_status_t		kvrc;
1555 	int			nvrc = 0;
1556 	const idm_kv_xlate_t	*ikvx;
1557 	uint64_t		num_val;
1558 
1559 	for (nvp = nvlist_next_nvpair(config_nvl, NULL);
1560 	    nvp != NULL && nvrc == 0; nvp = next_nvp) {
1561 		next_nvp = nvlist_next_nvpair(config_nvl, nvp);
1562 		nvp_name = nvpair_name(nvp);
1563 
1564 		ikvx = idm_lookup_kv_xlate(nvp_name, strlen(nvp_name));
1565 		switch (ikvx->ik_key_id) {
1566 		case KI_MAX_RECV_DATA_SEGMENT_LENGTH:
1567 			if ((nvrc = nvpair_value_uint64(nvp, &num_val)) != 0) {
1568 				break;
1569 			}
1570 			if (outgoing_nvl &&
1571 			    (nvrc = nvlist_add_uint64(outgoing_nvl,
1572 			    nvp_name, num_val)) != 0) {
1573 				break;
1574 			}
1575 			it->ic_conn_params.max_recv_dataseglen =
1576 			    (uint32_t)num_val;
1577 			break;
1578 		default:
1579 			break;
1580 		}
1581 	}
1582 	kvrc = idm_nvstat_to_kvstat(nvrc);
1583 	return (kvrc);
1584 }
1585 
1586 static idm_status_t
1587 idm_so_handle_digest(idm_conn_t *it, nvpair_t *digest_choice,
1588     const idm_kv_xlate_t *ikvx)
1589 {
1590 	int			nvrc;
1591 	char			*digest_choice_string;
1592 
1593 	nvrc = nvpair_value_string(digest_choice,
1594 	    &digest_choice_string);
1595 	ASSERT(nvrc == 0);
1596 	if (strcasecmp(digest_choice_string, "crc32c") == 0) {
1597 		switch (ikvx->ik_key_id) {
1598 		case KI_HEADER_DIGEST:
1599 			it->ic_conn_flags |= IDM_CONN_HEADER_DIGEST;
1600 			break;
1601 		case KI_DATA_DIGEST:
1602 			it->ic_conn_flags |= IDM_CONN_DATA_DIGEST;
1603 			break;
1604 		default:
1605 			ASSERT(0);
1606 			break;
1607 		}
1608 	} else if (strcasecmp(digest_choice_string, "none") == 0) {
1609 		switch (ikvx->ik_key_id) {
1610 		case KI_HEADER_DIGEST:
1611 			it->ic_conn_flags &= ~IDM_CONN_HEADER_DIGEST;
1612 			break;
1613 		case KI_DATA_DIGEST:
1614 			it->ic_conn_flags &= ~IDM_CONN_DATA_DIGEST;
1615 			break;
1616 		default:
1617 			ASSERT(0);
1618 			break;
1619 		}
1620 	} else {
1621 		ASSERT(0);
1622 	}
1623 
1624 	return (IDM_STATUS_SUCCESS);
1625 }
1626 
1627 
1628 /*
1629  * idm_so_conn_is_capable() verifies that the passed connection is provided
1630  * for by the sockets interface.
1631  */
1632 /* ARGSUSED */
1633 static boolean_t
1634 idm_so_conn_is_capable(idm_conn_req_t *ic, idm_transport_caps_t *caps)
1635 {
1636 	return (B_TRUE);
1637 }
1638 
1639 /*
1640  * idm_so_rx_datain() validates the Data Sequence number of the PDU. The
1641  * idm_sorecv_scsidata() function invoked earlier actually reads the data
1642  * off the socket into the appropriate buffers.
1643  */
1644 static void
1645 idm_so_rx_datain(idm_conn_t *ic, idm_pdu_t *pdu)
1646 {
1647 	iscsi_data_hdr_t	*bhs;
1648 	idm_task_t		*idt;
1649 	idm_buf_t		*idb;
1650 	uint32_t		datasn;
1651 	size_t			offset;
1652 	iscsi_hdr_t		*ihp = (iscsi_hdr_t *)pdu->isp_hdr;
1653 	iscsi_data_rsp_hdr_t    *idrhp = (iscsi_data_rsp_hdr_t *)ihp;
1654 
1655 	ASSERT(ic != NULL);
1656 	ASSERT(pdu != NULL);
1657 	ASSERT(IDM_PDU_OPCODE(pdu) == ISCSI_OP_SCSI_DATA_RSP);
1658 
1659 	bhs	= (iscsi_data_hdr_t *)pdu->isp_hdr;
1660 	datasn	= ntohl(bhs->datasn);
1661 	offset	= ntohl(bhs->offset);
1662 
1663 	/*
1664 	 * Look up the task corresponding to the initiator task tag
1665 	 * to get the buffers affiliated with the task.
1666 	 */
1667 	idt = idm_task_find(ic, bhs->itt, bhs->ttt);
1668 	if (idt == NULL) {
1669 		IDM_CONN_LOG(CE_WARN, "idm_so_rx_datain: failed to find task");
1670 		idm_pdu_rx_protocol_error(ic, pdu);
1671 		return;
1672 	}
1673 
1674 	idb = pdu->isp_sorx_buf;
1675 	if (idb == NULL) {
1676 		IDM_CONN_LOG(CE_WARN,
1677 		    "idm_so_rx_datain: failed to find buffer");
1678 		idm_task_rele(idt);
1679 		idm_pdu_rx_protocol_error(ic, pdu);
1680 		return;
1681 	}
1682 
1683 	/*
1684 	 * DataSN values should be sequential and should not have any gaps or
1685 	 * repetitions. Check the DataSN with the one stored in the task.
1686 	 */
1687 	if (datasn == idt->idt_exp_datasn) {
1688 		idt->idt_exp_datasn++; /* keep track of DataSN received */
1689 	} else {
1690 		IDM_CONN_LOG(CE_WARN, "idm_so_rx_datain: datasn out of order");
1691 		idm_task_rele(idt);
1692 		idm_pdu_rx_protocol_error(ic, pdu);
1693 		return;
1694 	}
1695 
1696 	/*
1697 	 * PDUs in a sequence should be in continuously increasing
1698 	 * address offset
1699 	 */
1700 	if (offset != idb->idb_exp_offset) {
1701 		IDM_CONN_LOG(CE_WARN, "idm_so_rx_datain: unexpected offset");
1702 		idm_task_rele(idt);
1703 		idm_pdu_rx_protocol_error(ic, pdu);
1704 		return;
1705 	}
1706 	/* Expected next relative buffer offset */
1707 	idb->idb_exp_offset += n2h24(bhs->dlength);
1708 	idt->idt_rx_bytes += n2h24(bhs->dlength);
1709 
1710 	idm_task_rele(idt);
1711 
1712 	/*
1713 	 * For now call scsi_rsp which will process the data rsp
1714 	 * Revisit, need to provide an explicit client entry point for
1715 	 * phase collapse completions.
1716 	 */
1717 	if ((IDM_PDU_OPCODE(pdu) == ISCSI_OP_SCSI_DATA_RSP) &&
1718 	    (idrhp->flags & ISCSI_FLAG_DATA_STATUS)) {
1719 		(*ic->ic_conn_ops.icb_rx_scsi_rsp)(ic, pdu);
1720 	}
1721 
1722 	idm_pdu_complete(pdu, IDM_STATUS_SUCCESS);
1723 }
1724 
1725 /*
1726  * The idm_so_rx_dataout() function is used by the iSCSI target to read
1727  * data from the Data-Out PDU sent by the iSCSI initiator.
1728  *
1729  * This function gets the Initiator Task Tag from the PDU BHS and looks up the
1730  * task to get the buffers associated with the PDU. A PDU might span buffers.
1731  * The data is then read into the respective buffer.
1732  */
1733 static void
1734 idm_so_rx_dataout(idm_conn_t *ic, idm_pdu_t *pdu)
1735 {
1736 
1737 	iscsi_data_hdr_t	*bhs;
1738 	idm_task_t		*idt;
1739 	idm_buf_t		*idb;
1740 	size_t			offset;
1741 
1742 	ASSERT(ic != NULL);
1743 	ASSERT(pdu != NULL);
1744 	ASSERT(IDM_PDU_OPCODE(pdu) == ISCSI_OP_SCSI_DATA);
1745 
1746 	bhs = (iscsi_data_hdr_t *)pdu->isp_hdr;
1747 	offset = ntohl(bhs->offset);
1748 
1749 	/*
1750 	 * Look up the task corresponding to the initiator task tag
1751 	 * to get the buffers affiliated with the task.
1752 	 */
1753 	idt = idm_task_find(ic, bhs->itt, bhs->ttt);
1754 	if (idt == NULL) {
1755 		IDM_CONN_LOG(CE_WARN,
1756 		    "idm_so_rx_dataout: failed to find task");
1757 		idm_pdu_rx_protocol_error(ic, pdu);
1758 		return;
1759 	}
1760 
1761 	idb = pdu->isp_sorx_buf;
1762 	if (idb == NULL) {
1763 		IDM_CONN_LOG(CE_WARN,
1764 		    "idm_so_rx_dataout: failed to find buffer");
1765 		idm_task_rele(idt);
1766 		idm_pdu_rx_protocol_error(ic, pdu);
1767 		return;
1768 	}
1769 
1770 	/* Keep track of data transferred - check data offsets */
1771 	if (offset != idb->idb_exp_offset) {
1772 		IDM_CONN_LOG(CE_NOTE, "idm_so_rx_dataout: offset out of seq: "
1773 		    "%ld, %d", offset, idb->idb_exp_offset);
1774 		idm_task_rele(idt);
1775 		idm_pdu_rx_protocol_error(ic, pdu);
1776 		return;
1777 	}
1778 	/* Expected next relative offset */
1779 	idb->idb_exp_offset += ntoh24(bhs->dlength);
1780 	idt->idt_rx_bytes += n2h24(bhs->dlength);
1781 
1782 	/*
1783 	 * Call the buffer callback when the transfer is complete
1784 	 *
1785 	 * The connection state machine should only abort tasks after
1786 	 * shutting down the connection so we are assured that there
1787 	 * won't be a simultaneous attempt to abort this task at the
1788 	 * same time as we are processing this PDU (due to a connection
1789 	 * state change).
1790 	 */
1791 	if (bhs->flags & ISCSI_FLAG_FINAL) {
1792 		/*
1793 		 * We have gotten the last data-message for the current
1794 		 * transfer.  idb_xfer_len represents the data that the
1795 		 * command intended to transfer, it does not represent the
1796 		 * actual number of bytes transferred. If we have not
1797 		 * transferred the expected number of bytes something is
1798 		 * wrong.
1799 		 *
1800 		 * We have two options, when there is a mismatch, we can
1801 		 * regard the transfer as invalid -- or we can modify our
1802 		 * notion of "xfer_len." In order to be as stringent as
1803 		 * possible, here we regard this transfer as in error; and
1804 		 * bail out.
1805 		 */
1806 		if (idb->idb_buflen == idb->idb_xfer_len &&
1807 		    idb->idb_buflen !=
1808 		    (idb->idb_exp_offset - idb->idb_bufoffset)) {
1809 			printf("idm_so_rx_dataout: incomplete transfer, "
1810 			    "protocol err");
1811 			IDM_CONN_LOG(CE_NOTE,
1812 			    "idm_so_rx_dataout: incomplete transfer: %ld, %d",
1813 			    offset, (int)(idb->idb_exp_offset - offset));
1814 			idm_task_rele(idt);
1815 			idm_pdu_rx_protocol_error(ic, pdu);
1816 			return;
1817 		}
1818 		/*
1819 		 * We only want to call idm_buf_rx_from_ini_done once
1820 		 * per transfer.  It's possible that this task has
1821 		 * already been aborted in which case
1822 		 * idm_so_free_task_rsrc will call idm_buf_rx_from_ini_done
1823 		 * for each buffer with idb_in_transport==B_TRUE.  To
1824 		 * close this window and ensure that this doesn't happen,
1825 		 * we'll clear idb->idb_in_transport now while holding
1826 		 * the task mutex.   This is only really an issue for
1827 		 * SCSI task abort -- if tasks were being aborted because
1828 		 * of a connection state change the state machine would
1829 		 * have already stopped the receive thread.
1830 		 */
1831 		mutex_enter(&idt->idt_mutex);
1832 
1833 		/*
1834 		 * Release the task hold here (obtained in idm_task_find)
1835 		 * because the task may complete synchronously during
1836 		 * idm_buf_rx_from_ini_done.  Since we still have an active
1837 		 * buffer we know there is at least one additional hold on idt.
1838 		 */
1839 		idm_task_rele(idt);
1840 
1841 		/*
1842 		 * idm_buf_rx_from_ini_done releases idt->idt_mutex
1843 		 */
1844 		DTRACE_ISCSI_8(xfer__done, idm_conn_t *, idt->idt_ic,
1845 		    uintptr_t, idb->idb_buf, uint32_t, idb->idb_bufoffset,
1846 		    uint64_t, 0, uint32_t, 0, uint32_t, 0,
1847 		    uint32_t, idb->idb_xfer_len,
1848 		    int, XFER_BUF_RX_FROM_INI);
1849 		idm_buf_rx_from_ini_done(idt, idb, IDM_STATUS_SUCCESS);
1850 		idm_pdu_complete(pdu, IDM_STATUS_SUCCESS);
1851 		return;
1852 	}
1853 
1854 	idm_task_rele(idt);
1855 	idm_pdu_complete(pdu, IDM_STATUS_SUCCESS);
1856 }
1857 
1858 /*
1859  * The idm_so_rx_rtt() function is used by the iSCSI initiator to handle
1860  * the R2T PDU sent by the iSCSI target indicating that it is ready to
1861  * accept data. This gets the Initiator Task Tag (itt) from the PDU BHS
1862  * and looks up the task in the task tree using the itt to get the output
1863  * buffers associated the task. The R2T PDU contains the offset of the
1864  * requested data and the data length. This function then constructs a
1865  * sequence of iSCSI PDUs and outputs the requested data. Each Data-Out
1866  * PDU is associated with the R2T by the Target Transfer Tag  (ttt).
1867  */
1868 
1869 static void
1870 idm_so_rx_rtt(idm_conn_t *ic, idm_pdu_t *pdu)
1871 {
1872 	idm_task_t		*idt;
1873 	idm_buf_t		*idb;
1874 	iscsi_rtt_hdr_t		*rtt_hdr;
1875 	uint32_t		data_offset;
1876 	uint32_t		data_length;
1877 
1878 	ASSERT(ic != NULL);
1879 	ASSERT(pdu != NULL);
1880 
1881 	rtt_hdr	= (iscsi_rtt_hdr_t *)pdu->isp_hdr;
1882 	data_offset = ntohl(rtt_hdr->data_offset);
1883 	data_length = ntohl(rtt_hdr->data_length);
1884 	idt	= idm_task_find(ic, rtt_hdr->itt, rtt_hdr->ttt);
1885 
1886 	if (idt == NULL) {
1887 		IDM_CONN_LOG(CE_WARN, "idm_so_rx_rtt: could not find task");
1888 		idm_pdu_rx_protocol_error(ic, pdu);
1889 		return;
1890 	}
1891 
1892 	/* Find the buffer bound to the task by the iSCSI initiator */
1893 	mutex_enter(&idt->idt_mutex);
1894 	idb = idm_buf_find(&idt->idt_outbufv, data_offset);
1895 	if (idb == NULL) {
1896 		mutex_exit(&idt->idt_mutex);
1897 		idm_task_rele(idt);
1898 		IDM_CONN_LOG(CE_WARN, "idm_so_rx_rtt: could not find buffer");
1899 		idm_pdu_rx_protocol_error(ic, pdu);
1900 		return;
1901 	}
1902 
1903 	/* return buffer contains this data */
1904 	if (data_offset + data_length > idb->idb_buflen) {
1905 		/* Overflow */
1906 		mutex_exit(&idt->idt_mutex);
1907 		idm_task_rele(idt);
1908 		IDM_CONN_LOG(CE_WARN, "idm_so_rx_rtt: read from outside "
1909 		    "buffer");
1910 		idm_pdu_rx_protocol_error(ic, pdu);
1911 		return;
1912 	}
1913 
1914 	idt->idt_r2t_ttt = rtt_hdr->ttt;
1915 	idt->idt_exp_datasn = 0;
1916 
1917 	idm_so_send_rtt_data(ic, idt, idb, data_offset,
1918 	    ntohl(rtt_hdr->data_length));
1919 	/*
1920 	 * the idt_mutex is released in idm_so_send_rtt_data
1921 	 */
1922 
1923 	idm_pdu_complete(pdu, IDM_STATUS_SUCCESS);
1924 	idm_task_rele(idt);
1925 
1926 }
1927 
1928 idm_status_t
1929 idm_sorecvdata(idm_conn_t *ic, idm_pdu_t *pdu)
1930 {
1931 	uint8_t		pad[ISCSI_PAD_WORD_LEN];
1932 	int		pad_len;
1933 	uint32_t	data_digest_crc;
1934 	uint32_t	crc_calculated;
1935 	int		total_len;
1936 	idm_so_conn_t	*so_conn;
1937 
1938 	so_conn = ic->ic_transport_private;
1939 
1940 	pad_len = ((ISCSI_PAD_WORD_LEN -
1941 	    (pdu->isp_datalen & (ISCSI_PAD_WORD_LEN - 1))) &
1942 	    (ISCSI_PAD_WORD_LEN - 1));
1943 
1944 	ASSERT(pdu->isp_iovlen < (PDU_MAX_IOVLEN - 2)); /* pad + data digest */
1945 
1946 	total_len = pdu->isp_datalen;
1947 
1948 	if (pad_len) {
1949 		pdu->isp_iov[pdu->isp_iovlen].iov_base	= (char *)&pad;
1950 		pdu->isp_iov[pdu->isp_iovlen].iov_len	= pad_len;
1951 		total_len		+= pad_len;
1952 		pdu->isp_iovlen++;
1953 	}
1954 
1955 	/* setup data digest */
1956 	if ((ic->ic_conn_flags & IDM_CONN_DATA_DIGEST) != 0) {
1957 		pdu->isp_iov[pdu->isp_iovlen].iov_base =
1958 		    (char *)&data_digest_crc;
1959 		pdu->isp_iov[pdu->isp_iovlen].iov_len =
1960 		    sizeof (data_digest_crc);
1961 		total_len		+= sizeof (data_digest_crc);
1962 		pdu->isp_iovlen++;
1963 	}
1964 
1965 	pdu->isp_data = (uint8_t *)(uintptr_t)pdu->isp_iov[0].iov_base;
1966 
1967 	if (idm_iov_sorecv(so_conn->ic_so, &pdu->isp_iov[0],
1968 	    pdu->isp_iovlen, total_len) != 0) {
1969 		return (IDM_STATUS_IO);
1970 	}
1971 
1972 	if ((ic->ic_conn_flags & IDM_CONN_DATA_DIGEST) != 0) {
1973 		crc_calculated = idm_crc32c(pdu->isp_data,
1974 		    pdu->isp_datalen);
1975 		if (pad_len) {
1976 			crc_calculated = idm_crc32c_continued((char *)&pad,
1977 			    pad_len, crc_calculated);
1978 		}
1979 		if (crc_calculated != data_digest_crc) {
1980 			IDM_CONN_LOG(CE_WARN,
1981 			    "idm_sorecvdata: "
1982 			    "CRC error: actual 0x%x, calc 0x%x",
1983 			    data_digest_crc, crc_calculated);
1984 
1985 			/* Invalid Data Digest */
1986 			return (IDM_STATUS_DATA_DIGEST);
1987 		}
1988 	}
1989 
1990 	return (IDM_STATUS_SUCCESS);
1991 }
1992 
1993 /*
1994  * idm_sorecv_scsidata() is used to receive scsi data from the socket. The
1995  * Data-type PDU header must be read into the idm_pdu_t structure prior to
1996  * calling this function.
1997  */
1998 idm_status_t
1999 idm_sorecv_scsidata(idm_conn_t *ic, idm_pdu_t *pdu)
2000 {
2001 	iscsi_data_hdr_t	*bhs;
2002 	idm_task_t		*task;
2003 	uint32_t		offset;
2004 	uint8_t			opcode;
2005 	uint32_t		dlength;
2006 	list_t			*buflst;
2007 	uint32_t		xfer_bytes;
2008 	idm_status_t		status;
2009 
2010 	ASSERT(ic != NULL);
2011 	ASSERT(pdu != NULL);
2012 
2013 	bhs	= (iscsi_data_hdr_t *)pdu->isp_hdr;
2014 
2015 	offset	= ntohl(bhs->offset);
2016 	opcode	= IDM_PDU_OPCODE(pdu);
2017 	dlength = n2h24(bhs->dlength);
2018 
2019 	ASSERT((opcode == ISCSI_OP_SCSI_DATA_RSP) ||
2020 	    (opcode == ISCSI_OP_SCSI_DATA));
2021 
2022 	/*
2023 	 * Successful lookup implicitly gets a "hold" on the task.  This
2024 	 * hold must be released before leaving this function.  At one
2025 	 * point we were caching this task context and retaining the hold
2026 	 * but it turned out to be very difficult to release the hold properly.
2027 	 * The task can be aborted and the connection shutdown between this
2028 	 * call and the subsequent expected call to idm_so_rx_datain/
2029 	 * idm_so_rx_dataout (in which case those functions are not called).
2030 	 * Releasing the hold in the PDU callback doesn't work well either
2031 	 * because the whole task may be completed by then at which point
2032 	 * it is too late to release the hold -- for better or worse this
2033 	 * code doesn't wait on the refcnts during normal operation.
2034 	 * idm_task_find() is very fast and it is not a huge burden if we
2035 	 * have to do it twice.
2036 	 */
2037 	task = idm_task_find(ic, bhs->itt, bhs->ttt);
2038 	if (task == NULL) {
2039 		IDM_CONN_LOG(CE_WARN,
2040 		    "idm_sorecv_scsidata: could not find task");
2041 		return (IDM_STATUS_FAIL);
2042 	}
2043 
2044 	mutex_enter(&task->idt_mutex);
2045 	buflst	= (opcode == ISCSI_OP_SCSI_DATA_RSP) ?
2046 	    &task->idt_inbufv : &task->idt_outbufv;
2047 	pdu->isp_sorx_buf = idm_buf_find(buflst, offset);
2048 	mutex_exit(&task->idt_mutex);
2049 
2050 	if (pdu->isp_sorx_buf == NULL) {
2051 		idm_task_rele(task);
2052 		IDM_CONN_LOG(CE_WARN, "idm_sorecv_scsidata: could not find "
2053 		    "buffer for offset %x opcode=%x",
2054 		    offset, opcode);
2055 		return (IDM_STATUS_FAIL);
2056 	}
2057 
2058 	xfer_bytes = idm_fill_iov(pdu, pdu->isp_sorx_buf, offset, dlength);
2059 	ASSERT(xfer_bytes != 0);
2060 	if (xfer_bytes != dlength) {
2061 		idm_task_rele(task);
2062 		/*
2063 		 * Buffer overflow, connection error.  The PDU data is still
2064 		 * sitting in the socket so we can't use the connection
2065 		 * again until that data is drained.
2066 		 */
2067 		return (IDM_STATUS_FAIL);
2068 	}
2069 
2070 	status = idm_sorecvdata(ic, pdu);
2071 
2072 	idm_task_rele(task);
2073 
2074 	return (status);
2075 }
2076 
2077 static uint32_t
2078 idm_fill_iov(idm_pdu_t *pdu, idm_buf_t *idb, uint32_t ro, uint32_t dlength)
2079 {
2080 	uint32_t	buf_ro = ro - idb->idb_bufoffset;
2081 	uint32_t	xfer_len = min(dlength, idb->idb_buflen - buf_ro);
2082 
2083 	ASSERT(ro >= idb->idb_bufoffset);
2084 
2085 	pdu->isp_iov[pdu->isp_iovlen].iov_base	=
2086 	    (caddr_t)idb->idb_buf + buf_ro;
2087 	pdu->isp_iov[pdu->isp_iovlen].iov_len	= xfer_len;
2088 	pdu->isp_iovlen++;
2089 
2090 	return (xfer_len);
2091 }
2092 
2093 int
2094 idm_sorecv_nonscsidata(idm_conn_t *ic, idm_pdu_t *pdu)
2095 {
2096 	pdu->isp_data = kmem_alloc(pdu->isp_datalen, KM_SLEEP);
2097 	ASSERT(pdu->isp_data != NULL);
2098 
2099 	pdu->isp_databuflen = pdu->isp_datalen;
2100 	pdu->isp_iov[0].iov_base = (caddr_t)pdu->isp_data;
2101 	pdu->isp_iov[0].iov_len = pdu->isp_datalen;
2102 	pdu->isp_iovlen = 1;
2103 	/*
2104 	 * Since we are associating a new data buffer with this received
2105 	 * PDU we need to set a specific callback to free the data
2106 	 * after the PDU is processed.
2107 	 */
2108 	pdu->isp_flags |= IDM_PDU_ADDL_DATA;
2109 	pdu->isp_callback = idm_sorx_addl_pdu_cb;
2110 
2111 	return (idm_sorecvdata(ic, pdu));
2112 }
2113 
2114 void
2115 idm_sorx_thread(void *arg)
2116 {
2117 	boolean_t	conn_failure = B_FALSE;
2118 	idm_conn_t	*ic = (idm_conn_t *)arg;
2119 	idm_so_conn_t	*so_conn;
2120 	idm_pdu_t	*pdu;
2121 	idm_status_t	rc;
2122 
2123 	idm_conn_hold(ic);
2124 
2125 	mutex_enter(&ic->ic_mutex);
2126 
2127 	so_conn = ic->ic_transport_private;
2128 	so_conn->ic_rx_thread_running = B_TRUE;
2129 	so_conn->ic_rx_thread_did = so_conn->ic_rx_thread->t_did;
2130 	cv_signal(&ic->ic_cv);
2131 
2132 	while (so_conn->ic_rx_thread_running) {
2133 		mutex_exit(&ic->ic_mutex);
2134 
2135 		/*
2136 		 * Get PDU with default header size (large enough for
2137 		 * BHS plus any anticipated AHS).  PDU from
2138 		 * the cache will have all values set correctly
2139 		 * for sockets RX including callback.
2140 		 */
2141 		pdu = kmem_cache_alloc(idm.idm_sorx_pdu_cache, KM_SLEEP);
2142 		pdu->isp_ic = ic;
2143 		pdu->isp_flags = 0;
2144 		pdu->isp_transport_hdrlen = 0;
2145 
2146 		if ((rc = idm_sorecvhdr(ic, pdu)) != 0) {
2147 			/*
2148 			 * Call idm_pdu_complete so that we call the callback
2149 			 * and ensure any memory allocated in idm_sorecvhdr
2150 			 * gets freed up.
2151 			 */
2152 			idm_pdu_complete(pdu, IDM_STATUS_FAIL);
2153 
2154 			/*
2155 			 * If ic_rx_thread_running is still set then
2156 			 * this is some kind of connection problem
2157 			 * on the socket.  In this case we want to
2158 			 * generate an event.  Otherwise some other
2159 			 * thread closed the socket due to another
2160 			 * issue in which case we don't need to
2161 			 * generate an event.
2162 			 */
2163 			mutex_enter(&ic->ic_mutex);
2164 			if (so_conn->ic_rx_thread_running) {
2165 				conn_failure = B_TRUE;
2166 				so_conn->ic_rx_thread_running = B_FALSE;
2167 			}
2168 
2169 			continue;
2170 		}
2171 
2172 		/*
2173 		 * Header has been read and validated.  Now we need
2174 		 * to read the PDU data payload (if present).  SCSI data
2175 		 * need to be transferred from the socket directly into
2176 		 * the associated transfer buffer for the SCSI task.
2177 		 */
2178 		if (pdu->isp_datalen != 0) {
2179 			if ((IDM_PDU_OPCODE(pdu) == ISCSI_OP_SCSI_DATA) ||
2180 			    (IDM_PDU_OPCODE(pdu) == ISCSI_OP_SCSI_DATA_RSP)) {
2181 				rc = idm_sorecv_scsidata(ic, pdu);
2182 				/*
2183 				 * All SCSI errors are fatal to the
2184 				 * connection right now since we have no
2185 				 * place to put the data.  What we need
2186 				 * is some kind of sink to dispose of unwanted
2187 				 * SCSI data.  For example an invalid task tag
2188 				 * should not kill the connection (although
2189 				 * we may want to drop the connection).
2190 				 */
2191 			} else {
2192 				/*
2193 				 * Not data PDUs so allocate a buffer for the
2194 				 * data segment and read the remaining data.
2195 				 */
2196 				rc = idm_sorecv_nonscsidata(ic, pdu);
2197 			}
2198 			if (rc != 0) {
2199 				/*
2200 				 * Call idm_pdu_complete so that we call the
2201 				 * callback and ensure any memory allocated
2202 				 * in idm_sorecvhdr gets freed up.
2203 				 */
2204 				idm_pdu_complete(pdu, IDM_STATUS_FAIL);
2205 
2206 				/*
2207 				 * If ic_rx_thread_running is still set then
2208 				 * this is some kind of connection problem
2209 				 * on the socket.  In this case we want to
2210 				 * generate an event.  Otherwise some other
2211 				 * thread closed the socket due to another
2212 				 * issue in which case we don't need to
2213 				 * generate an event.
2214 				 */
2215 				mutex_enter(&ic->ic_mutex);
2216 				if (so_conn->ic_rx_thread_running) {
2217 					conn_failure = B_TRUE;
2218 					so_conn->ic_rx_thread_running = B_FALSE;
2219 				}
2220 				continue;
2221 			}
2222 		}
2223 
2224 		/*
2225 		 * Process RX PDU
2226 		 */
2227 		idm_pdu_rx(ic, pdu);
2228 
2229 		mutex_enter(&ic->ic_mutex);
2230 	}
2231 
2232 	mutex_exit(&ic->ic_mutex);
2233 
2234 	/*
2235 	 * If we dropped out of the RX processing loop because of
2236 	 * a socket problem or other connection failure (including
2237 	 * digest errors) then we need to generate a state machine
2238 	 * event to shut the connection down.
2239 	 * If the state machine is already in, for example, INIT_ERROR, this
2240 	 * event will get dropped, and the TX thread will never be notified
2241 	 * to shut down.  To be safe, we'll just notify it here.
2242 	 */
2243 	if (conn_failure) {
2244 		if (so_conn->ic_tx_thread_running) {
2245 			so_conn->ic_tx_thread_running = B_FALSE;
2246 			mutex_enter(&so_conn->ic_tx_mutex);
2247 			cv_signal(&so_conn->ic_tx_cv);
2248 			mutex_exit(&so_conn->ic_tx_mutex);
2249 		}
2250 
2251 		idm_conn_event(ic, CE_TRANSPORT_FAIL, rc);
2252 	}
2253 
2254 	idm_conn_rele(ic);
2255 
2256 	thread_exit();
2257 }
2258 
2259 /*
2260  * idm_so_tx
2261  *
2262  * This is the implementation of idm_transport_ops_t's it_tx_pdu entry
2263  * point.  By definition, it is supposed to be fast.  So, simply queue
2264  * the entry and return.  The real work is done by idm_i_so_tx() via
2265  * idm_sotx_thread().
2266  */
2267 
2268 static void
2269 idm_so_tx(idm_conn_t *ic, idm_pdu_t *pdu)
2270 {
2271 	idm_so_conn_t *so_conn = ic->ic_transport_private;
2272 
2273 	ASSERT(pdu->isp_ic == ic);
2274 	mutex_enter(&so_conn->ic_tx_mutex);
2275 
2276 	if (!so_conn->ic_tx_thread_running) {
2277 		mutex_exit(&so_conn->ic_tx_mutex);
2278 		idm_pdu_complete(pdu, IDM_STATUS_ABORTED);
2279 		return;
2280 	}
2281 
2282 	list_insert_tail(&so_conn->ic_tx_list, (void *)pdu);
2283 	cv_signal(&so_conn->ic_tx_cv);
2284 	mutex_exit(&so_conn->ic_tx_mutex);
2285 }
2286 
2287 static idm_status_t
2288 idm_i_so_tx(idm_pdu_t *pdu)
2289 {
2290 	idm_conn_t	*ic = pdu->isp_ic;
2291 	idm_status_t	status = IDM_STATUS_SUCCESS;
2292 	uint8_t		pad[ISCSI_PAD_WORD_LEN];
2293 	int		pad_len;
2294 	uint32_t	hdr_digest_crc;
2295 	uint32_t	data_digest_crc = 0;
2296 	int		total_len = 0;
2297 	int		iovlen = 0;
2298 	struct iovec	iov[6];
2299 	idm_so_conn_t	*so_conn;
2300 
2301 	so_conn = ic->ic_transport_private;
2302 
2303 	/* Setup BHS */
2304 	iov[iovlen].iov_base	= (caddr_t)pdu->isp_hdr;
2305 	iov[iovlen].iov_len	= pdu->isp_hdrlen;
2306 	total_len		+= iov[iovlen].iov_len;
2307 	iovlen++;
2308 
2309 	/* Setup header digest */
2310 	if (((pdu->isp_flags & IDM_PDU_LOGIN_TX) == 0) &&
2311 	    (ic->ic_conn_flags & IDM_CONN_HEADER_DIGEST)) {
2312 		hdr_digest_crc = idm_crc32c(pdu->isp_hdr, pdu->isp_hdrlen);
2313 
2314 		iov[iovlen].iov_base	= (caddr_t)&hdr_digest_crc;
2315 		iov[iovlen].iov_len	= sizeof (hdr_digest_crc);
2316 		total_len		+= iov[iovlen].iov_len;
2317 		iovlen++;
2318 	}
2319 
2320 	/* Setup the data */
2321 	if (pdu->isp_datalen) {
2322 		idm_task_t		*idt;
2323 		idm_buf_t		*idb;
2324 		iscsi_data_hdr_t	*ihp;
2325 		ihp = (iscsi_data_hdr_t *)pdu->isp_hdr;
2326 		/* Write of immediate data */
2327 		if (ic->ic_ffp &&
2328 		    (IDM_PDU_OPCODE(pdu) == ISCSI_OP_SCSI_CMD ||
2329 		    IDM_PDU_OPCODE(pdu) == ISCSI_OP_SCSI_DATA)) {
2330 			idt = idm_task_find(ic, ihp->itt, ihp->ttt);
2331 			if (idt) {
2332 				mutex_enter(&idt->idt_mutex);
2333 				idb = idm_buf_find(&idt->idt_outbufv, 0);
2334 				mutex_exit(&idt->idt_mutex);
2335 				/*
2336 				 * If the initiator call to idm_buf_alloc
2337 				 * failed then we can get to this point
2338 				 * without a bound buffer.  The associated
2339 				 * connection failure will clean things up
2340 				 * later.  It would be nice to come up with
2341 				 * a cleaner way to handle this.  In
2342 				 * particular it seems absurd to look up
2343 				 * the task and the buffer just to update
2344 				 * this counter.
2345 				 */
2346 				if (idb)
2347 					idb->idb_xfer_len += pdu->isp_datalen;
2348 				idm_task_rele(idt);
2349 			}
2350 		}
2351 
2352 		iov[iovlen].iov_base = (caddr_t)pdu->isp_data;
2353 		iov[iovlen].iov_len  = pdu->isp_datalen;
2354 		total_len += iov[iovlen].iov_len;
2355 		iovlen++;
2356 	}
2357 
2358 	/* Setup the data pad if necessary */
2359 	pad_len = ((ISCSI_PAD_WORD_LEN -
2360 	    (pdu->isp_datalen & (ISCSI_PAD_WORD_LEN - 1))) &
2361 	    (ISCSI_PAD_WORD_LEN - 1));
2362 
2363 	if (pad_len) {
2364 		bzero(pad, sizeof (pad));
2365 		iov[iovlen].iov_base = (void *)&pad;
2366 		iov[iovlen].iov_len  = pad_len;
2367 		total_len		+= iov[iovlen].iov_len;
2368 		iovlen++;
2369 	}
2370 
2371 	/*
2372 	 * Setup the data digest if enabled.  Data-digest is not sent
2373 	 * for login-phase PDUs.
2374 	 */
2375 	if ((ic->ic_conn_flags & IDM_CONN_DATA_DIGEST) &&
2376 	    ((pdu->isp_flags & IDM_PDU_LOGIN_TX) == 0) &&
2377 	    (pdu->isp_datalen || pad_len)) {
2378 		/*
2379 		 * RFC3720/10.2.3: A zero-length Data Segment also
2380 		 * implies a zero-length data digest.
2381 		 */
2382 		if (pdu->isp_datalen) {
2383 			data_digest_crc = idm_crc32c(pdu->isp_data,
2384 			    pdu->isp_datalen);
2385 		}
2386 		if (pad_len) {
2387 			data_digest_crc = idm_crc32c_continued(&pad,
2388 			    pad_len, data_digest_crc);
2389 		}
2390 
2391 		iov[iovlen].iov_base	= (caddr_t)&data_digest_crc;
2392 		iov[iovlen].iov_len	= sizeof (data_digest_crc);
2393 		total_len		+= iov[iovlen].iov_len;
2394 		iovlen++;
2395 	}
2396 
2397 	/* Transmit the PDU */
2398 	if (idm_iov_sosend(so_conn->ic_so, &iov[0], iovlen,
2399 	    total_len) != 0) {
2400 		/* Set error status */
2401 		IDM_CONN_LOG(CE_WARN,
2402 		    "idm_so_tx: failed to transmit the PDU, so: %p ic: %p "
2403 		    "data: %p", (void *) so_conn->ic_so, (void *) ic,
2404 		    (void *) pdu->isp_data);
2405 		status = IDM_STATUS_IO;
2406 	}
2407 
2408 	/*
2409 	 * Success does not mean that the PDU actually reached the
2410 	 * remote node since it could get dropped along the way.
2411 	 */
2412 	idm_pdu_complete(pdu, status);
2413 
2414 	return (status);
2415 }
2416 
2417 /*
2418  * The idm_so_buf_tx_to_ini() is used by the target iSCSI layer to transmit the
2419  * Data-In PDUs using sockets. Based on the negotiated MaxRecvDataSegmentLength,
2420  * the buffer is segmented into a sequence of Data-In PDUs, ordered by DataSN.
2421  * A target can invoke this function multiple times for a single read command
2422  * (identified by the same ITT) to split the input into several sequences.
2423  *
2424  * DataSN starts with 0 for the first data PDU of an input command and advances
2425  * by 1 for each subsequent data PDU. Each sequence will have its own F bit,
2426  * which is set to 1 for the last data PDU of a sequence.
2427  * If the initiator supports phase collapse, the status bit must be set along
2428  * with the F bit to indicate that the status is shipped together with the last
2429  * Data-In PDU.
2430  *
2431  * The data PDUs within a sequence will be sent in order with the buffer offset
2432  * in increasing order. i.e. initiator and target must have negotiated the
2433  * "DataPDUInOrder" to "Yes". The order between sequences is not enforced.
2434  *
2435  * Caller holds idt->idt_mutex
2436  */
2437 static idm_status_t
2438 idm_so_buf_tx_to_ini(idm_task_t *idt, idm_buf_t *idb)
2439 {
2440 	idm_so_conn_t	*so_conn = idb->idb_ic->ic_transport_private;
2441 	idm_pdu_t	tmppdu;
2442 
2443 	ASSERT(mutex_owned(&idt->idt_mutex));
2444 
2445 	/*
2446 	 * Put the idm_buf_t on the tx queue.  It will be transmitted by
2447 	 * idm_sotx_thread.
2448 	 */
2449 	mutex_enter(&so_conn->ic_tx_mutex);
2450 
2451 	DTRACE_ISCSI_8(xfer__start, idm_conn_t *, idt->idt_ic,
2452 	    uintptr_t, idb->idb_buf, uint32_t, idb->idb_bufoffset,
2453 	    uint64_t, 0, uint32_t, 0, uint32_t, 0,
2454 	    uint32_t, idb->idb_xfer_len, int, XFER_BUF_TX_TO_INI);
2455 
2456 	if (!so_conn->ic_tx_thread_running) {
2457 		mutex_exit(&so_conn->ic_tx_mutex);
2458 		/*
2459 		 * Don't release idt->idt_mutex since we're supposed to hold
2460 		 * in when calling idm_buf_tx_to_ini_done
2461 		 */
2462 		DTRACE_ISCSI_8(xfer__done, idm_conn_t *, idt->idt_ic,
2463 		    uintptr_t, idb->idb_buf, uint32_t, idb->idb_bufoffset,
2464 		    uint64_t, 0, uint32_t, 0, uint32_t, 0,
2465 		    uint32_t, idb->idb_xfer_len,
2466 		    int, XFER_BUF_TX_TO_INI);
2467 		idm_buf_tx_to_ini_done(idt, idb, IDM_STATUS_ABORTED);
2468 		return (IDM_STATUS_FAIL);
2469 	}
2470 
2471 	/*
2472 	 * Build a template for the data PDU headers we will use so that
2473 	 * the SN values will stay consistent with other PDU's we are
2474 	 * transmitting like R2T and SCSI status.
2475 	 */
2476 	bzero(&idb->idb_data_hdr_tmpl, sizeof (iscsi_hdr_t));
2477 	tmppdu.isp_hdr = &idb->idb_data_hdr_tmpl;
2478 	(*idt->idt_ic->ic_conn_ops.icb_build_hdr)(idt, &tmppdu,
2479 	    ISCSI_OP_SCSI_DATA_RSP);
2480 	idb->idb_tx_thread = B_TRUE;
2481 	list_insert_tail(&so_conn->ic_tx_list, (void *)idb);
2482 	cv_signal(&so_conn->ic_tx_cv);
2483 	mutex_exit(&so_conn->ic_tx_mutex);
2484 	mutex_exit(&idt->idt_mutex);
2485 
2486 	/*
2487 	 * Returning success here indicates the transfer was successfully
2488 	 * dispatched -- it does not mean that the transfer completed
2489 	 * successfully.
2490 	 */
2491 	return (IDM_STATUS_SUCCESS);
2492 }
2493 
2494 /*
2495  * The idm_so_buf_rx_from_ini() is used by the target iSCSI layer to specify the
2496  * data blocks it is ready to receive from the initiator in response to a WRITE
2497  * SCSI command. The target iSCSI layer passes the information about the desired
2498  * data blocks to the initiator in one R2T PDU. The receiving buffer, the buffer
2499  * offset and datalen are passed via the 'idb' argument.
2500  *
2501  * Scope for Prototype build:
2502  * R2Ts are required for any Data-Out PDU, i.e. initiator and target must have
2503  * negotiated the "InitialR2T" to "Yes".
2504  *
2505  * Caller holds idt->idt_mutex
2506  */
2507 static idm_status_t
2508 idm_so_buf_rx_from_ini(idm_task_t *idt, idm_buf_t *idb)
2509 {
2510 	idm_pdu_t		*pdu;
2511 	iscsi_rtt_hdr_t		*rtt;
2512 
2513 	ASSERT(mutex_owned(&idt->idt_mutex));
2514 
2515 	DTRACE_ISCSI_8(xfer__start, idm_conn_t *, idt->idt_ic,
2516 	    uintptr_t, idb->idb_buf, uint32_t, idb->idb_bufoffset,
2517 	    uint64_t, 0, uint32_t, 0, uint32_t, 0,
2518 	    uint32_t, idb->idb_xfer_len, int, XFER_BUF_RX_FROM_INI);
2519 
2520 	pdu = kmem_cache_alloc(idm.idm_sotx_pdu_cache, KM_SLEEP);
2521 	pdu->isp_ic = idt->idt_ic;
2522 	pdu->isp_flags = IDM_PDU_SET_STATSN;
2523 	bzero(pdu->isp_hdr, sizeof (iscsi_rtt_hdr_t));
2524 
2525 	/* iSCSI layer fills the TTT, ITT, ExpCmdSN, MaxCmdSN */
2526 	(*idt->idt_ic->ic_conn_ops.icb_build_hdr)(idt, pdu, ISCSI_OP_RTT_RSP);
2527 
2528 	/* set the rttsn, rtt.flags, rtt.data_offset and rtt.data_length */
2529 	rtt = (iscsi_rtt_hdr_t *)(pdu->isp_hdr);
2530 
2531 	rtt->opcode		= ISCSI_OP_RTT_RSP;
2532 	rtt->flags		= ISCSI_FLAG_FINAL;
2533 	rtt->data_offset	= htonl(idb->idb_bufoffset);
2534 	rtt->data_length	= htonl(idb->idb_xfer_len);
2535 	rtt->rttsn		= htonl(idt->idt_exp_rttsn++);
2536 
2537 	/* Keep track of buffer offsets */
2538 	idb->idb_exp_offset	= idb->idb_bufoffset;
2539 	mutex_exit(&idt->idt_mutex);
2540 
2541 	/*
2542 	 * Transmit the PDU.
2543 	 */
2544 	idm_pdu_tx(pdu);
2545 
2546 	return (IDM_STATUS_SUCCESS);
2547 }
2548 
2549 static idm_status_t
2550 idm_so_buf_alloc(idm_buf_t *idb, uint64_t buflen)
2551 {
2552 	if ((buflen > IDM_SO_BUF_CACHE_LB) && (buflen <= IDM_SO_BUF_CACHE_UB)) {
2553 		idb->idb_buf = kmem_cache_alloc(idm.idm_so_128k_buf_cache,
2554 		    KM_NOSLEEP);
2555 		idb->idb_buf_private = idm.idm_so_128k_buf_cache;
2556 	} else {
2557 		idb->idb_buf = kmem_alloc(buflen, KM_NOSLEEP);
2558 		idb->idb_buf_private = NULL;
2559 	}
2560 
2561 	if (idb->idb_buf == NULL) {
2562 		IDM_CONN_LOG(CE_NOTE,
2563 		    "idm_so_buf_alloc: failed buffer allocation");
2564 		return (IDM_STATUS_FAIL);
2565 	}
2566 
2567 	return (IDM_STATUS_SUCCESS);
2568 }
2569 
2570 /* ARGSUSED */
2571 static idm_status_t
2572 idm_so_buf_setup(idm_buf_t *idb)
2573 {
2574 	/* Ensure bufalloc'd flag is unset */
2575 	idb->idb_bufalloc = B_FALSE;
2576 
2577 	return (IDM_STATUS_SUCCESS);
2578 }
2579 
2580 /* ARGSUSED */
2581 static void
2582 idm_so_buf_teardown(idm_buf_t *idb)
2583 {
2584 	/* nothing to do here */
2585 }
2586 
2587 static void
2588 idm_so_buf_free(idm_buf_t *idb)
2589 {
2590 	if (idb->idb_buf_private == NULL) {
2591 		kmem_free(idb->idb_buf, idb->idb_buflen);
2592 	} else {
2593 		kmem_cache_free(idb->idb_buf_private, idb->idb_buf);
2594 	}
2595 }
2596 
2597 static void
2598 idm_so_send_rtt_data(idm_conn_t *ic, idm_task_t *idt, idm_buf_t *idb,
2599     uint32_t offset, uint32_t length)
2600 {
2601 	idm_so_conn_t	*so_conn = ic->ic_transport_private;
2602 	idm_pdu_t	tmppdu;
2603 	idm_buf_t	*rtt_buf;
2604 
2605 	ASSERT(mutex_owned(&idt->idt_mutex));
2606 
2607 	/*
2608 	 * Allocate a buffer to represent the RTT transfer.  We could further
2609 	 * optimize this by allocating the buffers internally from an rtt
2610 	 * specific buffer cache since this is socket-specific code but for
2611 	 * now we will keep it simple.
2612 	 */
2613 	rtt_buf = idm_buf_alloc(ic, (uint8_t *)idb->idb_buf + offset, length);
2614 	if (rtt_buf == NULL) {
2615 		/*
2616 		 * If we're in FFP then the failure was likely a resource
2617 		 * allocation issue and we should close the connection by
2618 		 * sending a CE_TRANSPORT_FAIL event.
2619 		 *
2620 		 * If we're not in FFP then idm_buf_alloc will always
2621 		 * fail and the state is transitioning to "complete" anyway
2622 		 * so we won't bother to send an event.
2623 		 */
2624 		mutex_enter(&ic->ic_state_mutex);
2625 		if (ic->ic_ffp)
2626 			idm_conn_event_locked(ic, CE_TRANSPORT_FAIL,
2627 			    (uintptr_t)NULL, CT_NONE);
2628 		mutex_exit(&ic->ic_state_mutex);
2629 		mutex_exit(&idt->idt_mutex);
2630 		return;
2631 	}
2632 
2633 	rtt_buf->idb_buf_cb = NULL;
2634 	rtt_buf->idb_cb_arg = NULL;
2635 	rtt_buf->idb_bufoffset = offset;
2636 	rtt_buf->idb_xfer_len = length;
2637 	rtt_buf->idb_ic = idt->idt_ic;
2638 	rtt_buf->idb_task_binding = idt;
2639 
2640 	/*
2641 	 * The new buffer (if any) represents an additional
2642 	 * reference on the task
2643 	 */
2644 	idm_task_hold(idt);
2645 	mutex_exit(&idt->idt_mutex);
2646 
2647 	/*
2648 	 * Put the idm_buf_t on the tx queue.  It will be transmitted by
2649 	 * idm_sotx_thread.
2650 	 */
2651 	mutex_enter(&so_conn->ic_tx_mutex);
2652 
2653 	if (!so_conn->ic_tx_thread_running) {
2654 		idm_buf_free(rtt_buf);
2655 		mutex_exit(&so_conn->ic_tx_mutex);
2656 		idm_task_rele(idt);
2657 		return;
2658 	}
2659 
2660 	/*
2661 	 * Build a template for the data PDU headers we will use so that
2662 	 * the SN values will stay consistent with other PDU's we are
2663 	 * transmitting like R2T and SCSI status.
2664 	 */
2665 	bzero(&rtt_buf->idb_data_hdr_tmpl, sizeof (iscsi_hdr_t));
2666 	tmppdu.isp_hdr = &rtt_buf->idb_data_hdr_tmpl;
2667 	(*idt->idt_ic->ic_conn_ops.icb_build_hdr)(idt, &tmppdu,
2668 	    ISCSI_OP_SCSI_DATA);
2669 	rtt_buf->idb_tx_thread = B_TRUE;
2670 	rtt_buf->idb_in_transport = B_TRUE;
2671 	list_insert_tail(&so_conn->ic_tx_list, (void *)rtt_buf);
2672 	cv_signal(&so_conn->ic_tx_cv);
2673 	mutex_exit(&so_conn->ic_tx_mutex);
2674 }
2675 
2676 static void
2677 idm_so_send_rtt_data_done(idm_task_t *idt, idm_buf_t *idb)
2678 {
2679 	/*
2680 	 * Don't worry about status -- we assume any error handling
2681 	 * is performed by the caller (idm_sotx_thread).
2682 	 */
2683 	idb->idb_in_transport = B_FALSE;
2684 	idm_task_rele(idt);
2685 	idm_buf_free(idb);
2686 }
2687 
2688 static idm_status_t
2689 idm_so_send_buf_region(idm_task_t *idt, idm_buf_t *idb,
2690     uint32_t buf_region_offset, uint32_t buf_region_length)
2691 {
2692 	idm_conn_t		*ic;
2693 	uint32_t		max_dataseglen;
2694 	size_t			remainder, chunk;
2695 	uint32_t		data_offset = buf_region_offset;
2696 	iscsi_data_hdr_t	*bhs;
2697 	idm_pdu_t		*pdu;
2698 	idm_status_t		tx_status;
2699 
2700 	ASSERT(mutex_owned(&idt->idt_mutex));
2701 
2702 	ic = idt->idt_ic;
2703 
2704 	max_dataseglen = ic->ic_conn_params.max_xmit_dataseglen;
2705 	remainder = buf_region_length;
2706 
2707 	while (remainder) {
2708 		if (idt->idt_state != TASK_ACTIVE) {
2709 			ASSERT((idt->idt_state != TASK_IDLE) &&
2710 			    (idt->idt_state != TASK_COMPLETE));
2711 			return (IDM_STATUS_ABORTED);
2712 		}
2713 
2714 		/* check to see if we need to chunk the data */
2715 		if (remainder > max_dataseglen) {
2716 			chunk = max_dataseglen;
2717 		} else {
2718 			chunk = remainder;
2719 		}
2720 
2721 		/* Data PDU headers will always be sizeof (iscsi_hdr_t) */
2722 		pdu = kmem_cache_alloc(idm.idm_sotx_pdu_cache, KM_SLEEP);
2723 		pdu->isp_ic = ic;
2724 		pdu->isp_flags = 0;	/* initialize isp_flags */
2725 
2726 		/*
2727 		 * We've already built a build a header template
2728 		 * to use during the transfer.  Use this template so that
2729 		 * the SN values stay consistent with any unrelated PDU's
2730 		 * being transmitted.
2731 		 */
2732 		bcopy(&idb->idb_data_hdr_tmpl, pdu->isp_hdr,
2733 		    sizeof (iscsi_hdr_t));
2734 
2735 		/*
2736 		 * Set DataSN, data offset, and flags in BHS
2737 		 * For the prototype build, A = 0, S = 0, U = 0
2738 		 */
2739 		bhs = (iscsi_data_hdr_t *)(pdu->isp_hdr);
2740 
2741 		bhs->datasn		= htonl(idt->idt_exp_datasn++);
2742 
2743 		hton24(bhs->dlength, chunk);
2744 		bhs->offset = htonl(idb->idb_bufoffset + data_offset);
2745 
2746 		/* setup data */
2747 		pdu->isp_data	=  (uint8_t *)idb->idb_buf + data_offset;
2748 		pdu->isp_datalen = (uint_t)chunk;
2749 
2750 		if (chunk == remainder) {
2751 			bhs->flags = ISCSI_FLAG_FINAL; /* F bit set to 1 */
2752 			/* Piggyback the status with the last data PDU */
2753 			if (idt->idt_flags & IDM_TASK_PHASECOLLAPSE_REQ) {
2754 				pdu->isp_flags |= IDM_PDU_SET_STATSN |
2755 				    IDM_PDU_ADVANCE_STATSN;
2756 				(*idt->idt_ic->ic_conn_ops.icb_update_statsn)
2757 				    (idt, pdu);
2758 				idt->idt_flags |=
2759 				    IDM_TASK_PHASECOLLAPSE_SUCCESS;
2760 
2761 			}
2762 		}
2763 
2764 		remainder	-= chunk;
2765 		data_offset	+= chunk;
2766 
2767 		/* Instrument the data-send DTrace probe. */
2768 		if (IDM_PDU_OPCODE(pdu) == ISCSI_OP_SCSI_DATA_RSP) {
2769 			DTRACE_ISCSI_2(data__send,
2770 			    idm_conn_t *, idt->idt_ic,
2771 			    iscsi_data_rsp_hdr_t *,
2772 			    (iscsi_data_rsp_hdr_t *)pdu->isp_hdr);
2773 		}
2774 
2775 		/*
2776 		 * Now that we're done working with idt_exp_datasn,
2777 		 * idt->idt_state and idb->idb_bufoffset we can release
2778 		 * the task lock -- don't want to hold it across the
2779 		 * call to idm_i_so_tx since we could block.
2780 		 */
2781 		mutex_exit(&idt->idt_mutex);
2782 
2783 		/*
2784 		 * Transmit the PDU.  Call the internal routine directly
2785 		 * as there is already implicit ordering.
2786 		 */
2787 		if ((tx_status = idm_i_so_tx(pdu)) != IDM_STATUS_SUCCESS) {
2788 			mutex_enter(&idt->idt_mutex);
2789 			return (tx_status);
2790 		}
2791 
2792 		mutex_enter(&idt->idt_mutex);
2793 		idt->idt_tx_bytes += chunk;
2794 	}
2795 
2796 	return (IDM_STATUS_SUCCESS);
2797 }
2798 
2799 /*
2800  * TX PDU cache
2801  */
2802 /* ARGSUSED */
2803 int
2804 idm_sotx_pdu_constructor(void *hdl, void *arg, int flags)
2805 {
2806 	idm_pdu_t	*pdu = hdl;
2807 
2808 	bzero(pdu, sizeof (idm_pdu_t));
2809 	pdu->isp_hdr = (iscsi_hdr_t *)(pdu + 1); /* Ptr arithmetic */
2810 	pdu->isp_hdrlen = sizeof (iscsi_hdr_t);
2811 	pdu->isp_callback = idm_sotx_cache_pdu_cb;
2812 	pdu->isp_magic = IDM_PDU_MAGIC;
2813 	bzero(pdu->isp_hdr, sizeof (iscsi_hdr_t));
2814 
2815 	return (0);
2816 }
2817 
2818 /* ARGSUSED */
2819 void
2820 idm_sotx_cache_pdu_cb(idm_pdu_t *pdu, idm_status_t status)
2821 {
2822 	/* reset values between use */
2823 	pdu->isp_datalen = 0;
2824 
2825 	kmem_cache_free(idm.idm_sotx_pdu_cache, pdu);
2826 }
2827 
2828 /*
2829  * RX PDU cache
2830  */
2831 /* ARGSUSED */
2832 int
2833 idm_sorx_pdu_constructor(void *hdl, void *arg, int flags)
2834 {
2835 	idm_pdu_t	*pdu = hdl;
2836 
2837 	bzero(pdu, sizeof (idm_pdu_t));
2838 	pdu->isp_magic = IDM_PDU_MAGIC;
2839 	pdu->isp_hdr = (iscsi_hdr_t *)(pdu + 1); /* Ptr arithmetic */
2840 	pdu->isp_callback = idm_sorx_cache_pdu_cb;
2841 
2842 	return (0);
2843 }
2844 
2845 /* ARGSUSED */
2846 static void
2847 idm_sorx_cache_pdu_cb(idm_pdu_t *pdu, idm_status_t status)
2848 {
2849 	pdu->isp_iovlen = 0;
2850 	pdu->isp_sorx_buf = 0;
2851 	kmem_cache_free(idm.idm_sorx_pdu_cache, pdu);
2852 }
2853 
2854 static void
2855 idm_sorx_addl_pdu_cb(idm_pdu_t *pdu, idm_status_t status)
2856 {
2857 	/*
2858 	 * We had to modify our cached RX PDU with a longer header buffer
2859 	 * and/or a longer data buffer.  Release the new buffers and fix
2860 	 * the fields back to what we would expect for a cached RX PDU.
2861 	 */
2862 	if (pdu->isp_flags & IDM_PDU_ADDL_HDR) {
2863 		kmem_free(pdu->isp_hdr, pdu->isp_hdrlen);
2864 	}
2865 	if (pdu->isp_flags & IDM_PDU_ADDL_DATA) {
2866 		kmem_free(pdu->isp_data, pdu->isp_datalen);
2867 	}
2868 	pdu->isp_hdr = (iscsi_hdr_t *)(pdu + 1);
2869 	pdu->isp_hdrlen = sizeof (iscsi_hdr_t);
2870 	pdu->isp_data = NULL;
2871 	pdu->isp_datalen = 0;
2872 	pdu->isp_sorx_buf = 0;
2873 	pdu->isp_callback = idm_sorx_cache_pdu_cb;
2874 	idm_sorx_cache_pdu_cb(pdu, status);
2875 }
2876 
2877 /*
2878  * This thread is only active when I/O is queued for transmit
2879  * because the socket is busy.
2880  */
2881 void
2882 idm_sotx_thread(void *arg)
2883 {
2884 	idm_conn_t	*ic = arg;
2885 	idm_tx_obj_t	*object, *next;
2886 	idm_so_conn_t	*so_conn;
2887 	idm_status_t	status = IDM_STATUS_SUCCESS;
2888 
2889 	idm_conn_hold(ic);
2890 
2891 	mutex_enter(&ic->ic_mutex);
2892 	so_conn = ic->ic_transport_private;
2893 	so_conn->ic_tx_thread_running = B_TRUE;
2894 	so_conn->ic_tx_thread_did = so_conn->ic_tx_thread->t_did;
2895 	cv_signal(&ic->ic_cv);
2896 	mutex_exit(&ic->ic_mutex);
2897 
2898 	mutex_enter(&so_conn->ic_tx_mutex);
2899 
2900 	while (so_conn->ic_tx_thread_running) {
2901 		while (list_is_empty(&so_conn->ic_tx_list)) {
2902 			DTRACE_PROBE1(soconn__tx__sleep, idm_conn_t *, ic);
2903 			cv_wait(&so_conn->ic_tx_cv, &so_conn->ic_tx_mutex);
2904 			DTRACE_PROBE1(soconn__tx__wakeup, idm_conn_t *, ic);
2905 
2906 			if (!so_conn->ic_tx_thread_running) {
2907 				goto tx_bail;
2908 			}
2909 		}
2910 
2911 		object = (idm_tx_obj_t *)list_head(&so_conn->ic_tx_list);
2912 		list_remove(&so_conn->ic_tx_list, object);
2913 		mutex_exit(&so_conn->ic_tx_mutex);
2914 
2915 		switch (object->idm_tx_obj_magic) {
2916 		case IDM_PDU_MAGIC: {
2917 			idm_pdu_t *pdu = (idm_pdu_t *)object;
2918 			DTRACE_PROBE2(soconn__tx__pdu, idm_conn_t *, ic,
2919 			    idm_pdu_t *, (idm_pdu_t *)object);
2920 
2921 			if (pdu->isp_flags & IDM_PDU_SET_STATSN) {
2922 				/* No IDM task */
2923 				(ic->ic_conn_ops.icb_update_statsn)(NULL, pdu);
2924 			}
2925 			status = idm_i_so_tx((idm_pdu_t *)object);
2926 			break;
2927 		}
2928 		case IDM_BUF_MAGIC: {
2929 			idm_buf_t *idb = (idm_buf_t *)object;
2930 			idm_task_t *idt = idb->idb_task_binding;
2931 
2932 			DTRACE_PROBE2(soconn__tx__buf, idm_conn_t *, ic,
2933 			    idm_buf_t *, idb);
2934 
2935 			mutex_enter(&idt->idt_mutex);
2936 			status = idm_so_send_buf_region(idt,
2937 			    idb, 0, idb->idb_xfer_len);
2938 
2939 			/*
2940 			 * TX thread owns the buffer so we expect it to
2941 			 * be "in transport"
2942 			 */
2943 			ASSERT(idb->idb_in_transport);
2944 			if (IDM_CONN_ISTGT(ic)) {
2945 				/*
2946 				 * idm_buf_tx_to_ini_done releases
2947 				 * idt->idt_mutex
2948 				 */
2949 				DTRACE_ISCSI_8(xfer__done,
2950 				    idm_conn_t *, idt->idt_ic,
2951 				    uintptr_t, idb->idb_buf,
2952 				    uint32_t, idb->idb_bufoffset,
2953 				    uint64_t, 0, uint32_t, 0, uint32_t, 0,
2954 				    uint32_t, idb->idb_xfer_len,
2955 				    int, XFER_BUF_TX_TO_INI);
2956 				idm_buf_tx_to_ini_done(idt, idb, status);
2957 			} else {
2958 				idm_so_send_rtt_data_done(idt, idb);
2959 				mutex_exit(&idt->idt_mutex);
2960 			}
2961 			break;
2962 		}
2963 
2964 		default:
2965 			IDM_CONN_LOG(CE_WARN, "idm_sotx_thread: Unknown magic "
2966 			    "(0x%08x)", object->idm_tx_obj_magic);
2967 			status = IDM_STATUS_FAIL;
2968 		}
2969 
2970 		mutex_enter(&so_conn->ic_tx_mutex);
2971 
2972 		if (status != IDM_STATUS_SUCCESS) {
2973 			so_conn->ic_tx_thread_running = B_FALSE;
2974 			idm_conn_event(ic, CE_TRANSPORT_FAIL, status);
2975 		}
2976 	}
2977 
2978 	/*
2979 	 * Before we leave, we need to abort every item remaining in the
2980 	 * TX list.
2981 	 */
2982 
2983 tx_bail:
2984 	object = (idm_tx_obj_t *)list_head(&so_conn->ic_tx_list);
2985 
2986 	while (object != NULL) {
2987 		next = list_next(&so_conn->ic_tx_list, object);
2988 
2989 		list_remove(&so_conn->ic_tx_list, object);
2990 		switch (object->idm_tx_obj_magic) {
2991 		case IDM_PDU_MAGIC:
2992 			idm_pdu_complete((idm_pdu_t *)object,
2993 			    IDM_STATUS_ABORTED);
2994 			break;
2995 
2996 		case IDM_BUF_MAGIC: {
2997 			idm_buf_t *idb = (idm_buf_t *)object;
2998 			idm_task_t *idt = idb->idb_task_binding;
2999 			mutex_exit(&so_conn->ic_tx_mutex);
3000 			mutex_enter(&idt->idt_mutex);
3001 			/*
3002 			 * TX thread owns the buffer so we expect it to
3003 			 * be "in transport"
3004 			 */
3005 			ASSERT(idb->idb_in_transport);
3006 			if (IDM_CONN_ISTGT(ic)) {
3007 				/*
3008 				 * idm_buf_tx_to_ini_done releases
3009 				 * idt->idt_mutex
3010 				 */
3011 				DTRACE_ISCSI_8(xfer__done,
3012 				    idm_conn_t *, idt->idt_ic,
3013 				    uintptr_t, idb->idb_buf,
3014 				    uint32_t, idb->idb_bufoffset,
3015 				    uint64_t, 0, uint32_t, 0, uint32_t, 0,
3016 				    uint32_t, idb->idb_xfer_len,
3017 				    int, XFER_BUF_TX_TO_INI);
3018 				idm_buf_tx_to_ini_done(idt, idb,
3019 				    IDM_STATUS_ABORTED);
3020 			} else {
3021 				idm_so_send_rtt_data_done(idt, idb);
3022 				mutex_exit(&idt->idt_mutex);
3023 			}
3024 			mutex_enter(&so_conn->ic_tx_mutex);
3025 			break;
3026 		}
3027 		default:
3028 			IDM_CONN_LOG(CE_WARN,
3029 			    "idm_sotx_thread: Unexpected magic "
3030 			    "(0x%08x)", object->idm_tx_obj_magic);
3031 		}
3032 
3033 		object = next;
3034 	}
3035 
3036 	mutex_exit(&so_conn->ic_tx_mutex);
3037 	idm_conn_rele(ic);
3038 	thread_exit();
3039 	/*NOTREACHED*/
3040 }
3041 
3042 static void
3043 idm_so_socket_set_nonblock(struct sonode *node)
3044 {
3045 	(void) VOP_SETFL(node->so_vnode, node->so_flag,
3046 	    (node->so_state | FNONBLOCK), CRED(), NULL);
3047 }
3048 
3049 static void
3050 idm_so_socket_set_block(struct sonode *node)
3051 {
3052 	(void) VOP_SETFL(node->so_vnode, node->so_flag,
3053 	    (node->so_state & (~FNONBLOCK)), CRED(), NULL);
3054 }
3055 
3056 
3057 /*
3058  * Called by kernel sockets when the connection has been accepted or
3059  * rejected. In early volo, a "disconnect" callback was sent instead of
3060  * "connectfailed", so we check for both.
3061  */
3062 /* ARGSUSED */
3063 void
3064 idm_so_timed_socket_connect_cb(ksocket_t ks,
3065     ksocket_callback_event_t ev, void *arg, uintptr_t info)
3066 {
3067 	idm_so_timed_socket_t	*itp = arg;
3068 	ASSERT(itp != NULL);
3069 	ASSERT(ev == KSOCKET_EV_CONNECTED ||
3070 	    ev == KSOCKET_EV_CONNECTFAILED ||
3071 	    ev == KSOCKET_EV_DISCONNECTED);
3072 
3073 	mutex_enter(&idm_so_timed_socket_mutex);
3074 	itp->it_callback_called = B_TRUE;
3075 	if (ev == KSOCKET_EV_CONNECTED) {
3076 		itp->it_socket_error_code = 0;
3077 	} else {
3078 		/* Make sure the error code is non-zero on error */
3079 		if (info == 0)
3080 			info = ECONNRESET;
3081 		itp->it_socket_error_code = (int)info;
3082 	}
3083 	cv_signal(&itp->it_cv);
3084 	mutex_exit(&idm_so_timed_socket_mutex);
3085 }
3086 
3087 int
3088 idm_so_timed_socket_connect(ksocket_t ks,
3089     struct sockaddr_storage *sa, int sa_sz, int login_max_usec)
3090 {
3091 	clock_t			conn_login_max;
3092 	int			rc, nonblocking, rval;
3093 	idm_so_timed_socket_t	it;
3094 	ksocket_callbacks_t	ks_cb;
3095 
3096 	conn_login_max = ddi_get_lbolt() + drv_usectohz(login_max_usec);
3097 
3098 	/*
3099 	 * Set to non-block socket mode, with callback on connect
3100 	 * Early volo used "disconnected" instead of "connectfailed",
3101 	 * so set callback to look for both.
3102 	 */
3103 	bzero(&it, sizeof (it));
3104 	ks_cb.ksock_cb_flags = KSOCKET_CB_CONNECTED |
3105 	    KSOCKET_CB_CONNECTFAILED | KSOCKET_CB_DISCONNECTED;
3106 	ks_cb.ksock_cb_connected = idm_so_timed_socket_connect_cb;
3107 	ks_cb.ksock_cb_connectfailed = idm_so_timed_socket_connect_cb;
3108 	ks_cb.ksock_cb_disconnected = idm_so_timed_socket_connect_cb;
3109 	cv_init(&it.it_cv, NULL, CV_DEFAULT, NULL);
3110 	rc = ksocket_setcallbacks(ks, &ks_cb, &it, CRED());
3111 	if (rc != 0)
3112 		return (rc);
3113 
3114 	/* Set to non-blocking mode */
3115 	nonblocking = 1;
3116 	rc = ksocket_ioctl(ks, FIONBIO, (intptr_t)&nonblocking, &rval,
3117 	    CRED());
3118 	if (rc != 0)
3119 		goto cleanup;
3120 
3121 	bzero(&it, sizeof (it));
3122 	for (;;) {
3123 		/*
3124 		 * Warning -- in a loopback scenario, the call to
3125 		 * the connect_cb can occur inside the call to
3126 		 * ksocket_connect. Do not hold the mutex around the
3127 		 * call to ksocket_connect.
3128 		 */
3129 		rc = ksocket_connect(ks, (struct sockaddr *)sa, sa_sz, CRED());
3130 		if (rc == 0 || rc == EISCONN) {
3131 			/* socket success or already success */
3132 			rc = 0;
3133 			break;
3134 		}
3135 		if ((rc != EINPROGRESS) && (rc != EALREADY)) {
3136 			break;
3137 		}
3138 
3139 		/* TCP connect still in progress. See if out of time. */
3140 		if (ddi_get_lbolt() > conn_login_max) {
3141 			/*
3142 			 * Connection retry timeout,
3143 			 * failed connect to target.
3144 			 */
3145 			rc = ETIMEDOUT;
3146 			break;
3147 		}
3148 
3149 		/*
3150 		 * TCP connect still in progress.  Sleep until callback.
3151 		 * Do NOT go to sleep if the callback already occurred!
3152 		 */
3153 		mutex_enter(&idm_so_timed_socket_mutex);
3154 		if (!it.it_callback_called) {
3155 			(void) cv_timedwait(&it.it_cv,
3156 			    &idm_so_timed_socket_mutex, conn_login_max);
3157 		}
3158 		if (it.it_callback_called) {
3159 			rc = it.it_socket_error_code;
3160 			mutex_exit(&idm_so_timed_socket_mutex);
3161 			break;
3162 		}
3163 		/* If timer expires, go call ksocket_connect one last time. */
3164 		mutex_exit(&idm_so_timed_socket_mutex);
3165 	}
3166 
3167 	/* resume blocking mode */
3168 	nonblocking = 0;
3169 	(void) ksocket_ioctl(ks, FIONBIO, (intptr_t)&nonblocking, &rval,
3170 	    CRED());
3171 cleanup:
3172 	(void) ksocket_setcallbacks(ks, NULL, NULL, CRED());
3173 	cv_destroy(&it.it_cv);
3174 	if (rc != 0) {
3175 		idm_soshutdown(ks);
3176 	}
3177 	return (rc);
3178 }
3179 
3180 
3181 void
3182 idm_addr_to_sa(idm_addr_t *dportal, struct sockaddr_storage *sa)
3183 {
3184 	int			dp_addr_size;
3185 	struct sockaddr_in	*sin;
3186 	struct sockaddr_in6	*sin6;
3187 
3188 	/* Build sockaddr_storage for this portal (idm_addr_t) */
3189 	bzero(sa, sizeof (*sa));
3190 	dp_addr_size = dportal->a_addr.i_insize;
3191 	if (dp_addr_size == sizeof (struct in_addr)) {
3192 		/* IPv4 */
3193 		sa->ss_family = AF_INET;
3194 		sin = (struct sockaddr_in *)sa;
3195 		sin->sin_port = htons(dportal->a_port);
3196 		bcopy(&dportal->a_addr.i_addr.in4,
3197 		    &sin->sin_addr, sizeof (struct in_addr));
3198 	} else if (dp_addr_size == sizeof (struct in6_addr)) {
3199 		/* IPv6 */
3200 		sa->ss_family = AF_INET6;
3201 		sin6 = (struct sockaddr_in6 *)sa;
3202 		sin6->sin6_port = htons(dportal->a_port);
3203 		bcopy(&dportal->a_addr.i_addr.in6,
3204 		    &sin6->sin6_addr, sizeof (struct in6_addr));
3205 	} else {
3206 		ASSERT(0);
3207 	}
3208 }
3209 
3210 
/*
 * Return a human-readable form of a sockaddr_storage, in the form
 * [ip-address].port (the port is separated by a dot, not a colon).
 * This is used in calls to logging functions.  If several calls to
 * idm_sa_ntop are made within the same invocation of a logging
 * function, then each one needs its own buf.
 *
 * The text is written to 'buf' (of 'size' bytes) and 'buf' is returned.
 * If the address family is neither AF_INET nor AF_INET6, the address
 * cannot be converted, or 'buf' is too small to hold the address with
 * the largest possible port, the placeholder "[0].-1" is written
 * instead.
 */
const char *
idm_sa_ntop(const struct sockaddr_storage *sa,
    char *buf, size_t size)
{
	static const char bogus_ip[] = "[0].-1";
	char tmp[INET6_ADDRSTRLEN];
	const void *addr;
	in_port_t port;

	/* Locate the address and (network order) port for this family */
	switch (sa->ss_family) {
	case AF_INET6: {
		const struct sockaddr_in6 *in6 =
		    (const struct sockaddr_in6 *)sa;

		addr = &in6->sin6_addr;
		port = in6->sin6_port;
		break;
	}
	case AF_INET: {
		const struct sockaddr_in *in = (const struct sockaddr_in *)sa;

		addr = &in->sin_addr;
		port = in->sin_port;
		break;
	}
	default:
		goto err;
	}

	/* Do not look at tmp unless the conversion actually filled it in */
	if (inet_ntop(sa->ss_family, addr, tmp, sizeof (tmp)) == NULL)
		goto err;

	/*
	 * sizeof ("[].65535") covers the brackets, the dot, a maximum
	 * width port and the terminating NUL.
	 */
	if (strlen(tmp) + sizeof ("[].65535") > size)
		goto err;

	(void) snprintf(buf, size, "[%s].%u", tmp, ntohs(port));
	return (buf);

err:
	(void) snprintf(buf, size, "%s", bogus_ip);
	return (buf);
}
3256