xref: /illumos-gate/usr/src/uts/common/io/idm/idm_so.c (revision 148434217c040ea38dc844384f6ba68d9b325906)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #include <sys/conf.h>
27 #include <sys/stat.h>
28 #include <sys/file.h>
29 #include <sys/ddi.h>
30 #include <sys/sunddi.h>
31 #include <sys/modctl.h>
32 #include <sys/priv.h>
33 #include <sys/cpuvar.h>
34 #include <sys/socket.h>
35 #include <sys/strsubr.h>
36 #include <sys/sysmacros.h>
37 #include <sys/sdt.h>
38 #include <netinet/tcp.h>
39 #include <inet/tcp.h>
40 #include <sys/socketvar.h>
41 #include <sys/pathname.h>
42 #include <sys/fs/snode.h>
43 #include <sys/fs/dv_node.h>
44 #include <sys/vnode.h>
45 #include <netinet/in.h>
46 #include <net/if.h>
47 #include <sys/sockio.h>
48 #include <sys/ksocket.h>
49 #include <sys/idm/idm.h>
50 #include <sys/idm/idm_so.h>
51 #include <sys/idm/idm_text.h>
52 
53 /*
54  * in6addr_any is currently all zeroes, but use the macro in case this
55  * ever changes.
56  */
57 static const struct in6_addr in6addr_any = IN6ADDR_ANY_INIT;
58 
59 static void idm_sorx_cache_pdu_cb(idm_pdu_t *pdu, idm_status_t status);
60 static void idm_sorx_addl_pdu_cb(idm_pdu_t *pdu, idm_status_t status);
61 static void idm_sotx_cache_pdu_cb(idm_pdu_t *pdu, idm_status_t status);
62 
63 static idm_status_t idm_so_conn_create_common(idm_conn_t *ic, ksocket_t new_so);
64 static void idm_so_conn_destroy_common(idm_conn_t *ic);
65 static void idm_so_conn_connect_common(idm_conn_t *ic);
66 
67 static void idm_set_ini_preconnect_options(idm_so_conn_t *sc);
68 static void idm_set_ini_postconnect_options(idm_so_conn_t *sc);
69 static void idm_set_tgt_connect_options(ksocket_t so);
70 static idm_status_t idm_i_so_tx(idm_pdu_t *pdu);
71 
72 static idm_status_t idm_sorecvdata(idm_conn_t *ic, idm_pdu_t *pdu);
73 static void idm_so_send_rtt_data(idm_conn_t *ic, idm_task_t *idt,
74     idm_buf_t *idb, uint32_t offset, uint32_t length);
75 static void idm_so_send_rtt_data_done(idm_task_t *idt, idm_buf_t *idb);
76 static idm_status_t idm_so_send_buf_region(idm_task_t *idt,
77     idm_buf_t *idb, uint32_t buf_region_offset, uint32_t buf_region_length);
78 
79 static uint32_t idm_fill_iov(idm_pdu_t *pdu, idm_buf_t *idb,
80     uint32_t ro, uint32_t dlength);
81 
82 static idm_status_t idm_so_handle_digest(idm_conn_t *it,
83     nvpair_t *digest_choice, const idm_kv_xlate_t *ikvx);
84 
85 /*
86  * Transport ops prototypes
87  */
88 static void idm_so_tx(idm_conn_t *ic, idm_pdu_t *pdu);
89 static idm_status_t idm_so_buf_tx_to_ini(idm_task_t *idt, idm_buf_t *idb);
90 static idm_status_t idm_so_buf_rx_from_ini(idm_task_t *idt, idm_buf_t *idb);
91 static void idm_so_rx_datain(idm_conn_t *ic, idm_pdu_t *pdu);
92 static void idm_so_rx_rtt(idm_conn_t *ic, idm_pdu_t *pdu);
93 static void idm_so_rx_dataout(idm_conn_t *ic, idm_pdu_t *pdu);
94 static idm_status_t idm_so_free_task_rsrc(idm_task_t *idt);
95 static kv_status_t idm_so_negotiate_key_values(idm_conn_t *it,
96     nvlist_t *request_nvl, nvlist_t *response_nvl, nvlist_t *negotiated_nvl);
97 static void idm_so_notice_key_values(idm_conn_t *it,
98     nvlist_t *negotiated_nvl);
99 static boolean_t idm_so_conn_is_capable(idm_conn_req_t *ic,
100     idm_transport_caps_t *caps);
101 static idm_status_t idm_so_buf_alloc(idm_buf_t *idb, uint64_t buflen);
102 static void idm_so_buf_free(idm_buf_t *idb);
103 static idm_status_t idm_so_buf_setup(idm_buf_t *idb);
104 static void idm_so_buf_teardown(idm_buf_t *idb);
105 static idm_status_t idm_so_tgt_svc_create(idm_svc_req_t *sr, idm_svc_t *is);
106 static void idm_so_tgt_svc_destroy(idm_svc_t *is);
107 static idm_status_t idm_so_tgt_svc_online(idm_svc_t *is);
108 static void idm_so_tgt_svc_offline(idm_svc_t *is);
109 static void idm_so_tgt_conn_destroy(idm_conn_t *ic);
110 static idm_status_t idm_so_tgt_conn_connect(idm_conn_t *ic);
111 static void idm_so_conn_disconnect(idm_conn_t *ic);
112 static idm_status_t idm_so_ini_conn_create(idm_conn_req_t *cr, idm_conn_t *ic);
113 static void idm_so_ini_conn_destroy(idm_conn_t *ic);
114 static idm_status_t idm_so_ini_conn_connect(idm_conn_t *ic);
115 
/*
 * IDM Native Sockets transport operations
 *
 * Positional initializer for the sockets transport's idm_transport_ops_t;
 * the trailing comment on each entry names the slot it fills.  Slots left
 * NULL are operations this transport does not supply.  The target and
 * initiator disconnect slots both use idm_so_conn_disconnect().
 * idm_so_init() publishes this table through it->it_ops.
 */
static
idm_transport_ops_t idm_so_transport_ops = {
	idm_so_tx,			/* it_tx_pdu */
	idm_so_buf_tx_to_ini,		/* it_buf_tx_to_ini */
	idm_so_buf_rx_from_ini,		/* it_buf_rx_from_ini */
	idm_so_rx_datain,		/* it_rx_datain */
	idm_so_rx_rtt,			/* it_rx_rtt */
	idm_so_rx_dataout,		/* it_rx_dataout */
	NULL,				/* it_alloc_conn_rsrc */
	NULL,				/* it_free_conn_rsrc */
	NULL,				/* it_tgt_enable_datamover */
	NULL,				/* it_ini_enable_datamover */
	NULL,				/* it_conn_terminate */
	idm_so_free_task_rsrc,		/* it_free_task_rsrc */
	idm_so_negotiate_key_values,	/* it_negotiate_key_values */
	idm_so_notice_key_values,	/* it_notice_key_values */
	idm_so_conn_is_capable,		/* it_conn_is_capable */
	idm_so_buf_alloc,		/* it_buf_alloc */
	idm_so_buf_free,		/* it_buf_free */
	idm_so_buf_setup,		/* it_buf_setup */
	idm_so_buf_teardown,		/* it_buf_teardown */
	idm_so_tgt_svc_create,		/* it_tgt_svc_create */
	idm_so_tgt_svc_destroy,		/* it_tgt_svc_destroy */
	idm_so_tgt_svc_online,		/* it_tgt_svc_online */
	idm_so_tgt_svc_offline,		/* it_tgt_svc_offline */
	idm_so_tgt_conn_destroy,	/* it_tgt_conn_destroy */
	idm_so_tgt_conn_connect,	/* it_tgt_conn_connect */
	idm_so_conn_disconnect,		/* it_tgt_conn_disconnect */
	idm_so_ini_conn_create,		/* it_ini_conn_create */
	idm_so_ini_conn_destroy,	/* it_ini_conn_destroy */
	idm_so_ini_conn_connect,	/* it_ini_conn_connect */
	idm_so_conn_disconnect		/* it_ini_conn_disconnect */
};
152 
153 /*
154  * idm_so_init()
155  * Sockets transport initialization
156  */
157 void
158 idm_so_init(idm_transport_t *it)
159 {
160 	/* Cache for IDM Data and R2T Transmit PDU's */
161 	idm.idm_sotx_pdu_cache = kmem_cache_create("idm_tx_pdu_cache",
162 	    sizeof (idm_pdu_t) + sizeof (iscsi_hdr_t), 8,
163 	    &idm_sotx_pdu_constructor, NULL, NULL, NULL, NULL, KM_SLEEP);
164 
165 	/* Cache for IDM Receive PDU's */
166 	idm.idm_sorx_pdu_cache = kmem_cache_create("idm_rx_pdu_cache",
167 	    sizeof (idm_pdu_t) + IDM_SORX_CACHE_HDRLEN, 8,
168 	    &idm_sorx_pdu_constructor, NULL, NULL, NULL, NULL, KM_SLEEP);
169 
170 	/* 128k buffer cache */
171 	idm.idm_so_128k_buf_cache = kmem_cache_create("idm_128k_buf_cache",
172 	    IDM_SO_BUF_CACHE_UB, 8, NULL, NULL, NULL, NULL, NULL, KM_SLEEP);
173 
174 	/* Set the sockets transport ops */
175 	it->it_ops = &idm_so_transport_ops;
176 }
177 
178 /*
179  * idm_so_fini()
180  * Sockets transport teardown
181  */
182 void
183 idm_so_fini(void)
184 {
185 	kmem_cache_destroy(idm.idm_so_128k_buf_cache);
186 	kmem_cache_destroy(idm.idm_sotx_pdu_cache);
187 	kmem_cache_destroy(idm.idm_sorx_pdu_cache);
188 }
189 
190 ksocket_t
191 idm_socreate(int domain, int type, int protocol)
192 {
193 	ksocket_t ks;
194 
195 	if (!ksocket_socket(&ks, domain, type, protocol, KSOCKET_NOSLEEP,
196 	    CRED())) {
197 		return (ks);
198 	} else {
199 		return (NULL);
200 	}
201 }
202 
203 /*
204  * idm_soshutdown will disconnect the socket and prevent subsequent PDU
205  * reception and transmission.  The sonode still exists but its state
206  * gets modified to indicate it is no longer connected.  Calls to
207  * idm_sorecv/idm_iov_sorecv will return so idm_soshutdown can be used
208  * regain control of a thread stuck in idm_sorecv.
209  */
210 void
211 idm_soshutdown(ksocket_t so)
212 {
213 	(void) ksocket_shutdown(so, SHUT_RDWR, CRED());
214 }
215 
216 /*
217  * idm_sodestroy releases all resources associated with a socket previously
218  * created with idm_socreate.  The socket must be shutdown using
219  * idm_soshutdown before the socket is destroyed with idm_sodestroy,
220  * otherwise undefined behavior will result.
221  */
222 void
223 idm_sodestroy(ksocket_t ks)
224 {
225 	(void) ksocket_close(ks, CRED());
226 }
227 
228 /*
229  * Function to compare two addresses in sockaddr_storage format
230  */
231 
232 int
233 idm_ss_compare(const struct sockaddr_storage *cmp_ss1,
234     const struct sockaddr_storage *cmp_ss2,
235     boolean_t v4_mapped_as_v4)
236 {
237 	struct sockaddr_storage			mapped_v4_ss1, mapped_v4_ss2;
238 	const struct sockaddr_storage		*ss1, *ss2;
239 	struct in_addr				*in1, *in2;
240 	struct in6_addr				*in61, *in62;
241 	int i;
242 
243 	/*
244 	 * Normalize V4-mapped IPv6 addresses into V4 format if
245 	 * v4_mapped_as_v4 is B_TRUE.
246 	 */
247 	ss1 = cmp_ss1;
248 	ss2 = cmp_ss2;
249 	if (v4_mapped_as_v4 && (ss1->ss_family == AF_INET6)) {
250 		in61 = &((struct sockaddr_in6 *)ss1)->sin6_addr;
251 		if (IN6_IS_ADDR_V4MAPPED(in61)) {
252 			bzero(&mapped_v4_ss1, sizeof (mapped_v4_ss1));
253 			mapped_v4_ss1.ss_family = AF_INET;
254 			((struct sockaddr_in *)&mapped_v4_ss1)->sin_port =
255 			    ((struct sockaddr_in *)ss1)->sin_port;
256 			IN6_V4MAPPED_TO_INADDR(in61,
257 			    &((struct sockaddr_in *)&mapped_v4_ss1)->sin_addr);
258 			ss1 = &mapped_v4_ss1;
259 		}
260 	}
261 	ss2 = cmp_ss2;
262 	if (v4_mapped_as_v4 && (ss2->ss_family == AF_INET6)) {
263 		in62 = &((struct sockaddr_in6 *)ss2)->sin6_addr;
264 		if (IN6_IS_ADDR_V4MAPPED(in62)) {
265 			bzero(&mapped_v4_ss2, sizeof (mapped_v4_ss2));
266 			mapped_v4_ss2.ss_family = AF_INET;
267 			((struct sockaddr_in *)&mapped_v4_ss2)->sin_port =
268 			    ((struct sockaddr_in *)ss2)->sin_port;
269 			IN6_V4MAPPED_TO_INADDR(in62,
270 			    &((struct sockaddr_in *)&mapped_v4_ss2)->sin_addr);
271 			ss2 = &mapped_v4_ss2;
272 		}
273 	}
274 
275 	/*
276 	 * Compare ports, then address family, then ip address
277 	 */
278 	if (((struct sockaddr_in *)ss1)->sin_port !=
279 	    ((struct sockaddr_in *)ss2)->sin_port) {
280 		if (((struct sockaddr_in *)ss1)->sin_port >
281 		    ((struct sockaddr_in *)ss2)->sin_port)
282 			return (1);
283 		else
284 			return (-1);
285 	}
286 
287 	/*
288 	 * ports are the same
289 	 */
290 	if (ss1->ss_family != ss2->ss_family) {
291 		if (ss1->ss_family == AF_INET)
292 			return (1);
293 		else
294 			return (-1);
295 	}
296 
297 	/*
298 	 * address families are the same
299 	 */
300 	if (ss1->ss_family == AF_INET) {
301 		in1 = &((struct sockaddr_in *)ss1)->sin_addr;
302 		in2 = &((struct sockaddr_in *)ss2)->sin_addr;
303 
304 		if (in1->s_addr > in2->s_addr)
305 			return (1);
306 		else if (in1->s_addr < in2->s_addr)
307 			return (-1);
308 		else
309 			return (0);
310 	} else if (ss1->ss_family == AF_INET6) {
311 		in61 = &((struct sockaddr_in6 *)ss1)->sin6_addr;
312 		in62 = &((struct sockaddr_in6 *)ss2)->sin6_addr;
313 
314 		for (i = 0; i < 4; i++) {
315 			if (in61->s6_addr32[i] > in62->s6_addr32[i])
316 				return (1);
317 			else if (in61->s6_addr32[i] < in62->s6_addr32[i])
318 				return (-1);
319 		}
320 		return (0);
321 	}
322 
323 	return (1);
324 }
325 
326 /*
327  * IP address filter functions to flag addresses that should not
328  * go out to initiators through discovery.
329  */
330 static boolean_t
331 idm_v4_addr_okay(struct in_addr *in_addr)
332 {
333 	in_addr_t addr = ntohl(in_addr->s_addr);
334 
335 	if ((INADDR_NONE == addr) ||
336 	    (IN_MULTICAST(addr)) ||
337 	    ((addr >> IN_CLASSA_NSHIFT) == 0) ||
338 	    ((addr >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET)) {
339 		return (B_FALSE);
340 	}
341 	return (B_TRUE);
342 }
343 
344 static boolean_t
345 idm_v6_addr_okay(struct in6_addr *addr6)
346 {
347 
348 	if ((IN6_IS_ADDR_UNSPECIFIED(addr6)) ||
349 	    (IN6_IS_ADDR_LOOPBACK(addr6)) ||
350 	    (IN6_IS_ADDR_MULTICAST(addr6)) ||
351 	    (IN6_IS_ADDR_V4MAPPED(addr6)) ||
352 	    (IN6_IS_ADDR_V4COMPAT(addr6)) ||
353 	    (IN6_IS_ADDR_LINKLOCAL(addr6))) {
354 		return (B_FALSE);
355 	}
356 	return (B_TRUE);
357 }
358 
359 /*
360  * idm_get_ipaddr will retrieve a list of IP Addresses which the host is
361  * configured with by sending down a sequence of kernel ioctl to IP STREAMS.
362  */
363 int
364 idm_get_ipaddr(idm_addr_list_t **ipaddr_p)
365 {
366 	ksocket_t 		so4, so6;
367 	struct lifnum		lifn;
368 	struct lifconf		lifc;
369 	struct lifreq		*lp;
370 	int			rval;
371 	int			numifs;
372 	int			bufsize;
373 	void			*buf;
374 	int			i, j, n, rc;
375 	struct sockaddr_storage	ss;
376 	struct sockaddr_in	*sin;
377 	struct sockaddr_in6	*sin6;
378 	idm_addr_t		*ip;
379 	idm_addr_list_t		*ipaddr;
380 	int			size_ipaddr;
381 
382 	*ipaddr_p = NULL;
383 	size_ipaddr = 0;
384 	buf = NULL;
385 
386 	/* create an ipv4 and ipv6 UDP socket */
387 	if ((so6 = idm_socreate(PF_INET6, SOCK_DGRAM, 0)) == NULL)
388 		return (0);
389 	if ((so4 = idm_socreate(PF_INET, SOCK_DGRAM, 0)) == NULL) {
390 		idm_sodestroy(so6);
391 		return (0);
392 	}
393 
394 
395 retry_count:
396 	/* snapshot the current number of interfaces */
397 	lifn.lifn_family = PF_UNSPEC;
398 	lifn.lifn_flags = LIFC_NOXMIT | LIFC_TEMPORARY | LIFC_ALLZONES;
399 	lifn.lifn_count = 0;
400 	/* use vp6 for ioctls with unspecified families by default */
401 	if (ksocket_ioctl(so6, SIOCGLIFNUM, (intptr_t)&lifn, &rval, CRED())
402 	    != 0) {
403 		goto cleanup;
404 	}
405 
406 	numifs = lifn.lifn_count;
407 	if (numifs <= 0) {
408 		goto cleanup;
409 	}
410 
411 	/* allocate extra room in case more interfaces appear */
412 	numifs += 10;
413 
414 	/* get the interface names and ip addresses */
415 	bufsize = numifs * sizeof (struct lifreq);
416 	buf = kmem_alloc(bufsize, KM_SLEEP);
417 
418 	lifc.lifc_family = AF_UNSPEC;
419 	lifc.lifc_flags = LIFC_NOXMIT | LIFC_TEMPORARY | LIFC_ALLZONES;
420 	lifc.lifc_len = bufsize;
421 	lifc.lifc_buf = buf;
422 	rc = ksocket_ioctl(so6, SIOCGLIFCONF, (intptr_t)&lifc, &rval, CRED());
423 	if (rc != 0) {
424 		goto cleanup;
425 	}
426 	/* if our extra room is used up, try again */
427 	if (bufsize <= lifc.lifc_len) {
428 		kmem_free(buf, bufsize);
429 		buf = NULL;
430 		goto retry_count;
431 	}
432 	/* calc actual number of ifconfs */
433 	n = lifc.lifc_len / sizeof (struct lifreq);
434 
435 	/* get ip address */
436 	if (n > 0) {
437 		size_ipaddr = sizeof (idm_addr_list_t) +
438 		    (n - 1) * sizeof (idm_addr_t);
439 		ipaddr = kmem_zalloc(size_ipaddr, KM_SLEEP);
440 	} else {
441 		goto cleanup;
442 	}
443 
444 	/*
445 	 * Examine the array of interfaces and filter uninteresting ones
446 	 */
447 	for (i = 0, j = 0, lp = lifc.lifc_req; i < n; i++, lp++) {
448 
449 		/*
450 		 * Copy the address as the SIOCGLIFFLAGS ioctl is destructive
451 		 */
452 		ss = lp->lifr_addr;
453 		/*
454 		 * fetch the flags using the socket of the correct family
455 		 */
456 		switch (ss.ss_family) {
457 		case AF_INET:
458 			rc = ksocket_ioctl(so4, SIOCGLIFFLAGS, (intptr_t)lp,
459 			    &rval, CRED());
460 			break;
461 		case AF_INET6:
462 			rc = ksocket_ioctl(so6, SIOCGLIFFLAGS, (intptr_t)lp,
463 			    &rval, CRED());
464 			break;
465 		default:
466 			continue;
467 		}
468 		if (rc == 0) {
469 			/*
470 			 * If we got the flags, skip uninteresting
471 			 * interfaces based on flags
472 			 */
473 			if ((lp->lifr_flags & IFF_UP) != IFF_UP)
474 				continue;
475 			if (lp->lifr_flags &
476 			    (IFF_ANYCAST|IFF_NOLOCAL|IFF_DEPRECATED))
477 				continue;
478 		}
479 
480 		/* save ip address */
481 		ip = &ipaddr->al_addrs[j];
482 		switch (ss.ss_family) {
483 		case AF_INET:
484 			sin = (struct sockaddr_in *)&ss;
485 			if (!idm_v4_addr_okay(&sin->sin_addr))
486 				continue;
487 			ip->a_addr.i_addr.in4 = sin->sin_addr;
488 			ip->a_addr.i_insize = sizeof (struct in_addr);
489 			break;
490 		case AF_INET6:
491 			sin6 = (struct sockaddr_in6 *)&ss;
492 			if (!idm_v6_addr_okay(&sin6->sin6_addr))
493 				continue;
494 			ip->a_addr.i_addr.in6 = sin6->sin6_addr;
495 			ip->a_addr.i_insize = sizeof (struct in6_addr);
496 			break;
497 		default:
498 			continue;
499 		}
500 		j++;
501 	}
502 
503 	if (j == 0) {
504 		/* no valid ifaddr */
505 		kmem_free(ipaddr, size_ipaddr);
506 		size_ipaddr = 0;
507 		ipaddr = NULL;
508 	} else {
509 		ipaddr->al_out_cnt = j;
510 	}
511 
512 
513 cleanup:
514 	idm_sodestroy(so6);
515 	idm_sodestroy(so4);
516 
517 	if (buf != NULL)
518 		kmem_free(buf, bufsize);
519 
520 	*ipaddr_p = ipaddr;
521 	return (size_ipaddr);
522 }
523 
524 int
525 idm_sorecv(ksocket_t so, void *msg, size_t len)
526 {
527 	iovec_t iov;
528 
529 	ASSERT(so != NULL);
530 	ASSERT(len != 0);
531 
532 	/*
533 	 * Fill in iovec and receive data
534 	 */
535 	iov.iov_base = msg;
536 	iov.iov_len = len;
537 
538 	return (idm_iov_sorecv(so, &iov, 1, len));
539 }
540 
541 /*
542  * idm_sosendto - Sends a buffered data on a non-connected socket.
543  *
544  * This function puts the data provided on the wire by calling sosendmsg.
545  * It will return only when all the data has been sent or if an error
546  * occurs.
547  *
548  * Returns 0 for success, the socket errno value if sosendmsg fails, and
549  * -1 if sosendmsg returns success but uio_resid != 0
550  */
551 int
552 idm_sosendto(ksocket_t so, void *buff, size_t len,
553     struct sockaddr *name, socklen_t namelen)
554 {
555 	struct msghdr		msg;
556 	struct iovec		iov[1];
557 	int			error;
558 	size_t			sent = 0;
559 
560 	iov[0].iov_base	= buff;
561 	iov[0].iov_len	= len;
562 
563 	/* Initialization of the message header. */
564 	bzero(&msg, sizeof (msg));
565 	msg.msg_iov	= iov;
566 	msg.msg_iovlen	= 1;
567 	msg.msg_name	= name;
568 	msg.msg_namelen	= namelen;
569 
570 	if ((error = ksocket_sendmsg(so, &msg, 0, &sent, CRED())) == 0) {
571 		/* Data sent */
572 		if (sent == len) {
573 			/* All data sent.  Success. */
574 			return (0);
575 		} else {
576 			/* Not all data was sent.  Failure */
577 			return (-1);
578 		}
579 	}
580 
581 	/* Send failed */
582 	return (error);
583 }
584 
585 /*
586  * idm_iov_sosend - Sends an iovec on a connection.
587  *
588  * This function puts the data provided on the wire by calling sosendmsg.
589  * It will return only when all the data has been sent or if an error
590  * occurs.
591  *
592  * Returns 0 for success, the socket errno value if sosendmsg fails, and
593  * -1 if sosendmsg returns success but uio_resid != 0
594  */
595 int
596 idm_iov_sosend(ksocket_t so, iovec_t *iop, int iovlen, size_t total_len)
597 {
598 	struct msghdr		msg;
599 	int			error;
600 	size_t 			sent = 0;
601 
602 	ASSERT(iop != NULL);
603 
604 	/* Initialization of the message header. */
605 	bzero(&msg, sizeof (msg));
606 	msg.msg_iov	= iop;
607 	msg.msg_iovlen	= iovlen;
608 
609 	if ((error = ksocket_sendmsg(so, &msg, 0, &sent, CRED()))
610 	    == 0) {
611 		/* Data sent */
612 		if (sent == total_len) {
613 			/* All data sent.  Success. */
614 			return (0);
615 		} else {
616 			/* Not all data was sent.  Failure */
617 			return (-1);
618 		}
619 	}
620 
621 	/* Send failed */
622 	return (error);
623 }
624 
625 /*
626  * idm_iov_sorecv - Receives an iovec from a connection
627  *
628  * This function gets the data asked for from the socket.  It will return
629  * only when all the requested data has been retrieved or if an error
630  * occurs.
631  *
632  * Returns 0 for success, the socket errno value if sorecvmsg fails, and
633  * -1 if sorecvmsg returns success but uio_resid != 0
634  */
635 int
636 idm_iov_sorecv(ksocket_t so, iovec_t *iop, int iovlen, size_t total_len)
637 {
638 	struct msghdr		msg;
639 	int			error;
640 	size_t			recv;
641 	int 			flags;
642 
643 	ASSERT(iop != NULL);
644 
645 	/* Initialization of the message header. */
646 	bzero(&msg, sizeof (msg));
647 	msg.msg_iov	= iop;
648 	msg.msg_iovlen	= iovlen;
649 	flags		= MSG_WAITALL;
650 
651 	if ((error = ksocket_recvmsg(so, &msg, flags, &recv, CRED()))
652 	    == 0) {
653 		/* Received data */
654 		if (recv == total_len) {
655 			/* All requested data received.  Success */
656 			return (0);
657 		} else {
658 			/*
659 			 * Not all data was received.  The connection has
660 			 * probably failed.
661 			 */
662 			return (-1);
663 		}
664 	}
665 
666 	/* Receive failed */
667 	return (error);
668 }
669 
670 static void
671 idm_set_ini_preconnect_options(idm_so_conn_t *sc)
672 {
673 	int	conn_abort = 10000;
674 	int	conn_notify = 2000;
675 	int	abort = 30000;
676 
677 	/* Pre-connect socket options */
678 	(void) ksocket_setsockopt(sc->ic_so, IPPROTO_TCP,
679 	    TCP_CONN_NOTIFY_THRESHOLD, (char *)&conn_notify, sizeof (int),
680 	    CRED());
681 	(void) ksocket_setsockopt(sc->ic_so, IPPROTO_TCP,
682 	    TCP_CONN_ABORT_THRESHOLD, (char *)&conn_abort, sizeof (int),
683 	    CRED());
684 	(void) ksocket_setsockopt(sc->ic_so, IPPROTO_TCP, TCP_ABORT_THRESHOLD,
685 	    (char *)&abort, sizeof (int), CRED());
686 }
687 
688 static void
689 idm_set_ini_postconnect_options(idm_so_conn_t *sc)
690 {
691 	int32_t		rcvbuf = IDM_RCVBUF_SIZE;
692 	int32_t		sndbuf = IDM_SNDBUF_SIZE;
693 	const int	on = 1;
694 
695 	/* Set postconnect options */
696 	(void) ksocket_setsockopt(sc->ic_so, IPPROTO_TCP, TCP_NODELAY,
697 	    (char *)&on, sizeof (int), CRED());
698 	(void) ksocket_setsockopt(sc->ic_so, SOL_SOCKET, SO_RCVBUF,
699 	    (char *)&rcvbuf, sizeof (int), CRED());
700 	(void) ksocket_setsockopt(sc->ic_so, SOL_SOCKET, SO_SNDBUF,
701 	    (char *)&sndbuf, sizeof (int), CRED());
702 }
703 
704 static void
705 idm_set_tgt_connect_options(ksocket_t ks)
706 {
707 	int32_t		rcvbuf = IDM_RCVBUF_SIZE;
708 	int32_t		sndbuf = IDM_SNDBUF_SIZE;
709 	const int	on = 1;
710 
711 	/* Set connect options */
712 	(void) ksocket_setsockopt(ks, SOL_SOCKET, SO_RCVBUF,
713 	    (char *)&rcvbuf, sizeof (int), CRED());
714 	(void) ksocket_setsockopt(ks, SOL_SOCKET, SO_SNDBUF,
715 	    (char *)&sndbuf, sizeof (int), CRED());
716 	(void) ksocket_setsockopt(ks, IPPROTO_TCP, TCP_NODELAY,
717 	    (char *)&on, sizeof (on), CRED());
718 }
719 
720 static uint32_t
721 n2h24(const uchar_t *ptr)
722 {
723 	return ((ptr[0] << 16) | (ptr[1] << 8) | ptr[2]);
724 }
725 
726 
727 static idm_status_t
728 idm_sorecvhdr(idm_conn_t *ic, idm_pdu_t *pdu)
729 {
730 	iscsi_hdr_t	*bhs;
731 	uint32_t	hdr_digest_crc;
732 	uint32_t	crc_calculated;
733 	void		*new_hdr;
734 	int		ahslen = 0;
735 	int		total_len = 0;
736 	int		iovlen = 0;
737 	struct iovec	iov[2];
738 	idm_so_conn_t	*so_conn;
739 	int		rc;
740 
741 	so_conn = ic->ic_transport_private;
742 
743 	/*
744 	 * Read BHS
745 	 */
746 	bhs = pdu->isp_hdr;
747 	rc = idm_sorecv(so_conn->ic_so, pdu->isp_hdr, sizeof (iscsi_hdr_t));
748 	if (rc != IDM_STATUS_SUCCESS) {
749 		return (IDM_STATUS_FAIL);
750 	}
751 
752 	/*
753 	 * Check actual AHS length against the amount available in the buffer
754 	 */
755 	pdu->isp_hdrlen = sizeof (iscsi_hdr_t) +
756 	    (bhs->hlength * sizeof (uint32_t));
757 	pdu->isp_datalen = n2h24(bhs->dlength);
758 	if (bhs->hlength > IDM_SORX_CACHE_AHSLEN) {
759 		/* Allocate a new header segment and change the callback */
760 		new_hdr = kmem_alloc(pdu->isp_hdrlen, KM_SLEEP);
761 		bcopy(pdu->isp_hdr, new_hdr, sizeof (iscsi_hdr_t));
762 		pdu->isp_hdr = new_hdr;
763 		pdu->isp_flags |= IDM_PDU_ADDL_HDR;
764 
765 		/*
766 		 * This callback will restore the expected values after
767 		 * the RX PDU has been processed.
768 		 */
769 		pdu->isp_callback = idm_sorx_addl_pdu_cb;
770 	}
771 
772 	/*
773 	 * Setup receipt of additional header and header digest (if enabled).
774 	 */
775 	if (bhs->hlength > 0) {
776 		iov[iovlen].iov_base = (caddr_t)(pdu->isp_hdr + 1);
777 		ahslen = pdu->isp_hdrlen - sizeof (iscsi_hdr_t);
778 		iov[iovlen].iov_len = ahslen;
779 		total_len += iov[iovlen].iov_len;
780 		iovlen++;
781 	}
782 
783 	if (ic->ic_conn_flags & IDM_CONN_HEADER_DIGEST) {
784 		iov[iovlen].iov_base = (caddr_t)&hdr_digest_crc;
785 		iov[iovlen].iov_len = sizeof (hdr_digest_crc);
786 		total_len += iov[iovlen].iov_len;
787 		iovlen++;
788 	}
789 
790 	if ((iovlen != 0) &&
791 	    (idm_iov_sorecv(so_conn->ic_so, &iov[0], iovlen,
792 	    total_len) != 0)) {
793 		return (IDM_STATUS_FAIL);
794 	}
795 
796 	/*
797 	 * Validate header digest if enabled
798 	 */
799 	if (ic->ic_conn_flags & IDM_CONN_HEADER_DIGEST) {
800 		crc_calculated = idm_crc32c(pdu->isp_hdr,
801 		    sizeof (iscsi_hdr_t) + ahslen);
802 		if (crc_calculated != hdr_digest_crc) {
803 			/* Invalid Header Digest */
804 			return (IDM_STATUS_HEADER_DIGEST);
805 		}
806 	}
807 
808 	return (0);
809 }
810 
811 /*
812  * idm_so_ini_conn_create()
813  * Allocate the sockets transport connection resources.
814  */
815 static idm_status_t
816 idm_so_ini_conn_create(idm_conn_req_t *cr, idm_conn_t *ic)
817 {
818 	ksocket_t	so;
819 	idm_so_conn_t	*so_conn;
820 	idm_status_t	idmrc;
821 
822 	so = idm_socreate(cr->cr_domain, cr->cr_type,
823 	    cr->cr_protocol);
824 	if (so == NULL) {
825 		return (IDM_STATUS_FAIL);
826 	}
827 
828 	/* Bind the socket if configured to do so */
829 	if (cr->cr_bound) {
830 		if (ksocket_bind(so, &cr->cr_bound_addr.sin,
831 		    SIZEOF_SOCKADDR(&cr->cr_bound_addr.sin), CRED()) != 0) {
832 			idm_sodestroy(so);
833 			return (IDM_STATUS_FAIL);
834 		}
835 	}
836 
837 	idmrc = idm_so_conn_create_common(ic, so);
838 	if (idmrc != IDM_STATUS_SUCCESS) {
839 		idm_soshutdown(so);
840 		idm_sodestroy(so);
841 		return (IDM_STATUS_FAIL);
842 	}
843 
844 	so_conn = ic->ic_transport_private;
845 	/* Set up socket options */
846 	idm_set_ini_preconnect_options(so_conn);
847 
848 	return (IDM_STATUS_SUCCESS);
849 }
850 
851 /*
852  * idm_so_ini_conn_destroy()
853  * Tear down the sockets transport connection resources.
854  */
855 static void
856 idm_so_ini_conn_destroy(idm_conn_t *ic)
857 {
858 	idm_so_conn_destroy_common(ic);
859 }
860 
861 /*
862  * idm_so_ini_conn_connect()
863  * Establish the connection referred to by the handle previously allocated via
864  * idm_so_ini_conn_create().
865  */
866 static idm_status_t
867 idm_so_ini_conn_connect(idm_conn_t *ic)
868 {
869 	idm_so_conn_t	*so_conn;
870 
871 	so_conn = ic->ic_transport_private;
872 
873 	if (ksocket_connect(so_conn->ic_so, &ic->ic_ini_dst_addr.sin,
874 	    (SIZEOF_SOCKADDR(&ic->ic_ini_dst_addr.sin)), CRED()) != 0) {
875 		idm_soshutdown(so_conn->ic_so);
876 		return (IDM_STATUS_FAIL);
877 	}
878 
879 	idm_so_conn_connect_common(ic);
880 
881 	idm_set_ini_postconnect_options(so_conn);
882 
883 	return (IDM_STATUS_SUCCESS);
884 }
885 
886 idm_status_t
887 idm_so_tgt_conn_create(idm_conn_t *ic, ksocket_t new_so)
888 {
889 	idm_status_t	idmrc;
890 
891 	idmrc = idm_so_conn_create_common(ic, new_so);
892 
893 	return (idmrc);
894 }
895 
896 static void
897 idm_so_tgt_conn_destroy(idm_conn_t *ic)
898 {
899 	idm_so_conn_destroy_common(ic);
900 }
901 
902 /*
903  * idm_so_tgt_conn_connect()
904  * Establish the connection in ic, passed from idm_tgt_conn_finish(), which
905  * is invoked from the SM as a result of an inbound connection request.
906  */
907 static idm_status_t
908 idm_so_tgt_conn_connect(idm_conn_t *ic)
909 {
910 	idm_so_conn_connect_common(ic);
911 
912 	return (IDM_STATUS_SUCCESS);
913 }
914 
915 static idm_status_t
916 idm_so_conn_create_common(idm_conn_t *ic, ksocket_t new_so)
917 {
918 	idm_so_conn_t	*so_conn;
919 
920 	so_conn = kmem_zalloc(sizeof (idm_so_conn_t), KM_SLEEP);
921 	so_conn->ic_so = new_so;
922 
923 	ic->ic_transport_private = so_conn;
924 	ic->ic_transport_hdrlen = 0;
925 
926 	/* Set the scoreboarding flag on this connection */
927 	ic->ic_conn_flags |= IDM_CONN_USE_SCOREBOARD;
928 
929 	/*
930 	 * Initialize tx thread mutex and list
931 	 */
932 	mutex_init(&so_conn->ic_tx_mutex, NULL, MUTEX_DEFAULT, NULL);
933 	cv_init(&so_conn->ic_tx_cv, NULL, CV_DEFAULT, NULL);
934 	list_create(&so_conn->ic_tx_list, sizeof (idm_pdu_t),
935 	    offsetof(idm_pdu_t, idm_tx_link));
936 
937 	return (IDM_STATUS_SUCCESS);
938 }
939 
940 static void
941 idm_so_conn_destroy_common(idm_conn_t *ic)
942 {
943 	idm_so_conn_t	*so_conn = ic->ic_transport_private;
944 
945 	ic->ic_transport_private = NULL;
946 	idm_sodestroy(so_conn->ic_so);
947 	list_destroy(&so_conn->ic_tx_list);
948 	mutex_destroy(&so_conn->ic_tx_mutex);
949 	cv_destroy(&so_conn->ic_tx_cv);
950 
951 	kmem_free(so_conn, sizeof (idm_so_conn_t));
952 }
953 
954 static void
955 idm_so_conn_connect_common(idm_conn_t *ic)
956 {
957 	idm_so_conn_t	*so_conn;
958 	struct sockaddr_in6	t_addr;
959 	socklen_t	t_addrlen = 0;
960 
961 	so_conn = ic->ic_transport_private;
962 	bzero(&t_addr, sizeof (struct sockaddr_in6));
963 	t_addrlen = sizeof (struct sockaddr_in6);
964 
965 	/* Set the local and remote addresses in the idm conn handle */
966 	ksocket_getsockname(so_conn->ic_so, (struct sockaddr *)&t_addr,
967 	    &t_addrlen, CRED());
968 	bcopy(&t_addr, &ic->ic_laddr, t_addrlen);
969 	ksocket_getpeername(so_conn->ic_so, (struct sockaddr *)&t_addr,
970 	    &t_addrlen, CRED());
971 	bcopy(&t_addr, &ic->ic_raddr, t_addrlen);
972 
973 	mutex_enter(&ic->ic_mutex);
974 	so_conn->ic_tx_thread = thread_create(NULL, 0, idm_sotx_thread, ic, 0,
975 	    &p0, TS_RUN, minclsyspri);
976 	so_conn->ic_rx_thread = thread_create(NULL, 0, idm_sorx_thread, ic, 0,
977 	    &p0, TS_RUN, minclsyspri);
978 
979 	while (!so_conn->ic_rx_thread_running || !so_conn->ic_tx_thread_running)
980 		cv_wait(&ic->ic_cv, &ic->ic_mutex);
981 	mutex_exit(&ic->ic_mutex);
982 }
983 
984 /*
985  * idm_so_conn_disconnect()
986  * Shutdown the socket connection and stop the thread
987  */
988 static void
989 idm_so_conn_disconnect(idm_conn_t *ic)
990 {
991 	idm_so_conn_t	*so_conn;
992 
993 	so_conn = ic->ic_transport_private;
994 
995 	mutex_enter(&ic->ic_mutex);
996 	so_conn->ic_rx_thread_running = B_FALSE;
997 	so_conn->ic_tx_thread_running = B_FALSE;
998 	/* We need to wakeup the TX thread */
999 	mutex_enter(&so_conn->ic_tx_mutex);
1000 	cv_signal(&so_conn->ic_tx_cv);
1001 	mutex_exit(&so_conn->ic_tx_mutex);
1002 	mutex_exit(&ic->ic_mutex);
1003 
1004 	/* This should wakeup the RX thread if it is sleeping */
1005 	idm_soshutdown(so_conn->ic_so);
1006 
1007 	thread_join(so_conn->ic_tx_thread_did);
1008 	thread_join(so_conn->ic_rx_thread_did);
1009 }
1010 
1011 /*
1012  * idm_so_tgt_svc_create()
1013  * Establish a service on an IP address and port.  idm_svc_req_t contains
1014  * the service parameters.
1015  */
1016 /*ARGSUSED*/
1017 static idm_status_t
1018 idm_so_tgt_svc_create(idm_svc_req_t *sr, idm_svc_t *is)
1019 {
1020 	idm_so_svc_t		*so_svc;
1021 
1022 	so_svc = kmem_zalloc(sizeof (idm_so_svc_t), KM_SLEEP);
1023 
1024 	/* Set the new sockets service in svc handle */
1025 	is->is_so_svc = (void *)so_svc;
1026 
1027 	return (IDM_STATUS_SUCCESS);
1028 }
1029 
1030 /*
1031  * idm_so_tgt_svc_destroy()
1032  * Teardown sockets resources allocated in idm_so_tgt_svc_create()
1033  */
1034 static void
1035 idm_so_tgt_svc_destroy(idm_svc_t *is)
1036 {
1037 	/* the socket will have been torn down; free the service */
1038 	kmem_free(is->is_so_svc, sizeof (idm_so_svc_t));
1039 }
1040 
1041 /*
1042  * idm_so_tgt_svc_online()
1043  * Launch a watch thread on the svc allocated in idm_so_tgt_svc_create()
1044  */
1045 
1046 static idm_status_t
1047 idm_so_tgt_svc_online(idm_svc_t *is)
1048 {
1049 	idm_so_svc_t		*so_svc;
1050 	idm_svc_req_t		*sr = &is->is_svc_req;
1051 	struct sockaddr_in6	sin6_ip;
1052 	const uint32_t		on = 1;
1053 	const uint32_t		off = 0;
1054 
1055 	mutex_enter(&is->is_mutex);
1056 	so_svc = (idm_so_svc_t *)is->is_so_svc;
1057 
1058 	/*
1059 	 * Try creating an IPv6 socket first
1060 	 */
1061 	if ((so_svc->is_so = idm_socreate(PF_INET6, SOCK_STREAM, 0)) == NULL) {
1062 		mutex_exit(&is->is_mutex);
1063 		return (IDM_STATUS_FAIL);
1064 	} else {
1065 		bzero(&sin6_ip, sizeof (sin6_ip));
1066 		sin6_ip.sin6_family = AF_INET6;
1067 		sin6_ip.sin6_port = htons(sr->sr_port);
1068 		sin6_ip.sin6_addr = in6addr_any;
1069 
1070 		(void) ksocket_setsockopt(so_svc->is_so, SOL_SOCKET,
1071 		    SO_REUSEADDR, (char *)&on, sizeof (on), CRED());
1072 		/*
1073 		 * Turn off SO_MAC_EXEMPT so future sobinds succeed
1074 		 */
1075 		(void) ksocket_setsockopt(so_svc->is_so, SOL_SOCKET,
1076 		    SO_MAC_EXEMPT, (char *)&off, sizeof (off), CRED());
1077 
1078 		if (ksocket_bind(so_svc->is_so, (struct sockaddr *)&sin6_ip,
1079 		    sizeof (sin6_ip), CRED()) != 0) {
1080 			mutex_exit(&is->is_mutex);
1081 			idm_sodestroy(so_svc->is_so);
1082 			return (IDM_STATUS_FAIL);
1083 		}
1084 	}
1085 
1086 	idm_set_tgt_connect_options(so_svc->is_so);
1087 
1088 	if (ksocket_listen(so_svc->is_so, 5, CRED()) != 0) {
1089 		mutex_exit(&is->is_mutex);
1090 		idm_soshutdown(so_svc->is_so);
1091 		idm_sodestroy(so_svc->is_so);
1092 		return (IDM_STATUS_FAIL);
1093 	}
1094 
1095 	/* Launch a watch thread */
1096 	so_svc->is_thread = thread_create(NULL, 0, idm_so_svc_port_watcher,
1097 	    is, 0, &p0, TS_RUN, minclsyspri);
1098 
1099 	if (so_svc->is_thread == NULL) {
1100 		/* Failure to launch; teardown the socket */
1101 		mutex_exit(&is->is_mutex);
1102 		idm_soshutdown(so_svc->is_so);
1103 		idm_sodestroy(so_svc->is_so);
1104 		return (IDM_STATUS_FAIL);
1105 	}
1106 	ksocket_hold(so_svc->is_so);
1107 	/* Wait for the port watcher thread to start */
1108 	while (!so_svc->is_thread_running)
1109 		cv_wait(&is->is_cv, &is->is_mutex);
1110 	mutex_exit(&is->is_mutex);
1111 
1112 	return (IDM_STATUS_SUCCESS);
1113 }
1114 
1115 /*
1116  * idm_so_tgt_svc_offline
1117  *
1118  * Stop listening on the IP address and port identified by idm_svc_t.
1119  */
1120 static void
1121 idm_so_tgt_svc_offline(idm_svc_t *is)
1122 {
1123 	idm_so_svc_t		*so_svc;
1124 	mutex_enter(&is->is_mutex);
1125 	so_svc = (idm_so_svc_t *)is->is_so_svc;
1126 	so_svc->is_thread_running = B_FALSE;
1127 	mutex_exit(&is->is_mutex);
1128 
1129 	/*
1130 	 * Teardown socket
1131 	 */
1132 	idm_sodestroy(so_svc->is_so);
1133 
1134 	/*
1135 	 * Now we expect the port watcher thread to terminate
1136 	 */
1137 	thread_join(so_svc->is_thread_did);
1138 }
1139 
1140 /*
1141  * Watch thread for target service connection establishment.
1142  */
1143 void
1144 idm_so_svc_port_watcher(void *arg)
1145 {
1146 	idm_svc_t		*svc = arg;
1147 	ksocket_t		new_so;
1148 	idm_conn_t		*ic;
1149 	idm_status_t		idmrc;
1150 	idm_so_svc_t		*so_svc;
1151 	int			rc;
1152 	const uint32_t		off = 0;
1153 	struct sockaddr_in6 	t_addr;
1154 	socklen_t		t_addrlen;
1155 
1156 	bzero(&t_addr, sizeof (struct sockaddr_in6));
1157 	t_addrlen = sizeof (struct sockaddr_in6);
1158 	mutex_enter(&svc->is_mutex);
1159 
1160 	so_svc = svc->is_so_svc;
1161 	so_svc->is_thread_running = B_TRUE;
1162 	so_svc->is_thread_did = so_svc->is_thread->t_did;
1163 
1164 	cv_signal(&svc->is_cv);
1165 
1166 	IDM_SVC_LOG(CE_NOTE, "iSCSI service (%p/%d) online", (void *)svc,
1167 	    svc->is_svc_req.sr_port);
1168 
1169 	while (so_svc->is_thread_running) {
1170 		mutex_exit(&svc->is_mutex);
1171 
1172 		if ((rc = ksocket_accept(so_svc->is_so,
1173 		    (struct sockaddr *)&t_addr, &t_addrlen,
1174 		    &new_so, CRED())) != 0) {
1175 			mutex_enter(&svc->is_mutex);
1176 			if (rc == ECONNABORTED)
1177 				continue;
1178 			/* Connection problem */
1179 			break;
1180 		}
1181 		/*
1182 		 * Turn off SO_MAC_EXEMPT so future sobinds succeed
1183 		 */
1184 		(void) ksocket_setsockopt(new_so, SOL_SOCKET, SO_MAC_EXEMPT,
1185 		    (char *)&off, sizeof (off), CRED());
1186 
1187 		idmrc = idm_svc_conn_create(svc, IDM_TRANSPORT_TYPE_SOCKETS,
1188 		    &ic);
1189 		if (idmrc != IDM_STATUS_SUCCESS) {
1190 			/* Drop connection */
1191 			idm_soshutdown(new_so);
1192 			idm_sodestroy(new_so);
1193 			mutex_enter(&svc->is_mutex);
1194 			continue;
1195 		}
1196 
1197 		idmrc = idm_so_tgt_conn_create(ic, new_so);
1198 		if (idmrc != IDM_STATUS_SUCCESS) {
1199 			idm_svc_conn_destroy(ic);
1200 			idm_soshutdown(new_so);
1201 			idm_sodestroy(new_so);
1202 			mutex_enter(&svc->is_mutex);
1203 			continue;
1204 		}
1205 
1206 		/*
1207 		 * Kick the state machine.  At CS_S3_XPT_UP the state machine
1208 		 * will notify the client (target) about the new connection.
1209 		 */
1210 		idm_conn_event(ic, CE_CONNECT_ACCEPT, NULL);
1211 
1212 		mutex_enter(&svc->is_mutex);
1213 	}
1214 	ksocket_rele(so_svc->is_so);
1215 	so_svc->is_thread_running = B_FALSE;
1216 	mutex_exit(&svc->is_mutex);
1217 
1218 	IDM_SVC_LOG(CE_NOTE, "iSCSI service (%p/%d) offline", (void *)svc,
1219 	    svc->is_svc_req.sr_port);
1220 
1221 	thread_exit();
1222 }
1223 
1224 /*
1225  * idm_so_free_task_rsrc() stops any ongoing processing of the task and
1226  * frees resources associated with the task.
1227  *
1228  * It's not clear that this should return idm_status_t.  What do we do
1229  * if it fails?
1230  */
1231 static idm_status_t
1232 idm_so_free_task_rsrc(idm_task_t *idt)
1233 {
1234 	idm_buf_t	*idb;
1235 
1236 	/*
1237 	 * There is nothing to cleanup on initiator connections
1238 	 */
1239 	if (IDM_CONN_ISINI(idt->idt_ic))
1240 		return (IDM_STATUS_SUCCESS);
1241 
1242 	/*
1243 	 * If this is a target connection, call idm_buf_rx_from_ini_done for
1244 	 * any buffer on the "outbufv" list with idb->idb_in_transport==B_TRUE.
1245 	 *
1246 	 * In addition, remove any buffers associated with this task from
1247 	 * the ic_tx_list.  We'll do this by walking the idt_inbufv list, but
1248 	 * items don't actually get removed from that list (and completion
1249 	 * routines called) until idm_task_cleanup.
1250 	 */
1251 	mutex_enter(&idt->idt_mutex);
1252 
1253 	for (idb = list_head(&idt->idt_outbufv); idb != NULL;
1254 	    idb = list_next(&idt->idt_outbufv, idb)) {
1255 		if (idb->idb_in_transport) {
1256 			/*
1257 			 * idm_buf_rx_from_ini_done releases idt->idt_mutex
1258 			 */
1259 			idm_buf_rx_from_ini_done(idt, idb, IDM_STATUS_ABORTED);
1260 			mutex_enter(&idt->idt_mutex);
1261 		}
1262 	}
1263 
1264 	for (idb = list_head(&idt->idt_inbufv); idb != NULL;
1265 	    idb = list_next(&idt->idt_inbufv, idb)) {
1266 		/*
1267 		 * We want to remove these items from the tx_list as well,
1268 		 * but knowing it's in the idt_inbufv list is not a guarantee
1269 		 * that it's in the tx_list.  If it's on the tx list then
1270 		 * let idm_sotx_thread() clean it up.
1271 		 */
1272 		if (idb->idb_in_transport && !idb->idb_tx_thread) {
1273 			/*
1274 			 * idm_buf_tx_to_ini_done releases idt->idt_mutex
1275 			 */
1276 			idm_buf_tx_to_ini_done(idt, idb, IDM_STATUS_ABORTED);
1277 			mutex_enter(&idt->idt_mutex);
1278 		}
1279 	}
1280 
1281 	mutex_exit(&idt->idt_mutex);
1282 
1283 	return (IDM_STATUS_SUCCESS);
1284 }
1285 
1286 /*
1287  * idm_so_negotiate_key_values() validates the key values for this connection
1288  */
1289 /* ARGSUSED */
1290 static kv_status_t
1291 idm_so_negotiate_key_values(idm_conn_t *it, nvlist_t *request_nvl,
1292     nvlist_t *response_nvl, nvlist_t *negotiated_nvl)
1293 {
1294 	/* All parameters are negotiated at the iscsit level */
1295 	return (KV_HANDLED);
1296 }
1297 
1298 /*
1299  * idm_so_notice_key_values() activates the negotiated key values for
1300  * this connection.
1301  */
1302 static void
1303 idm_so_notice_key_values(idm_conn_t *it, nvlist_t *negotiated_nvl)
1304 {
1305 	char			*nvp_name;
1306 	nvpair_t		*nvp;
1307 	nvpair_t		*next_nvp;
1308 	int			nvrc;
1309 	idm_status_t		idm_status;
1310 	const idm_kv_xlate_t	*ikvx;
1311 
1312 	for (nvp = nvlist_next_nvpair(negotiated_nvl, NULL);
1313 	    nvp != NULL; nvp = next_nvp) {
1314 		next_nvp = nvlist_next_nvpair(negotiated_nvl, nvp);
1315 		nvp_name = nvpair_name(nvp);
1316 
1317 		ikvx = idm_lookup_kv_xlate(nvp_name, strlen(nvp_name));
1318 		switch (ikvx->ik_key_id) {
1319 		case KI_HEADER_DIGEST:
1320 		case KI_DATA_DIGEST:
1321 			idm_status = idm_so_handle_digest(it, nvp, ikvx);
1322 			ASSERT(idm_status == 0);
1323 
1324 			/* Remove processed item from negotiated_nvl list */
1325 			nvrc = nvlist_remove_all(
1326 			    negotiated_nvl, ikvx->ik_key_name);
1327 			ASSERT(nvrc == 0);
1328 			break;
1329 		default:
1330 			break;
1331 		}
1332 	}
1333 }
1334 
1335 
1336 static idm_status_t
1337 idm_so_handle_digest(idm_conn_t *it, nvpair_t *digest_choice,
1338     const idm_kv_xlate_t *ikvx)
1339 {
1340 	int			nvrc;
1341 	char			*digest_choice_string;
1342 
1343 	nvrc = nvpair_value_string(digest_choice,
1344 	    &digest_choice_string);
1345 	ASSERT(nvrc == 0);
1346 	if (strcasecmp(digest_choice_string, "crc32c") == 0) {
1347 		switch (ikvx->ik_key_id) {
1348 		case KI_HEADER_DIGEST:
1349 			it->ic_conn_flags |= IDM_CONN_HEADER_DIGEST;
1350 			break;
1351 		case KI_DATA_DIGEST:
1352 			it->ic_conn_flags |= IDM_CONN_DATA_DIGEST;
1353 			break;
1354 		default:
1355 			ASSERT(0);
1356 			break;
1357 		}
1358 	} else if (strcasecmp(digest_choice_string, "none") == 0) {
1359 		switch (ikvx->ik_key_id) {
1360 		case KI_HEADER_DIGEST:
1361 			it->ic_conn_flags &= ~IDM_CONN_HEADER_DIGEST;
1362 			break;
1363 		case KI_DATA_DIGEST:
1364 			it->ic_conn_flags &= ~IDM_CONN_DATA_DIGEST;
1365 			break;
1366 		default:
1367 			ASSERT(0);
1368 			break;
1369 		}
1370 	} else {
1371 		ASSERT(0);
1372 	}
1373 
1374 	return (IDM_STATUS_SUCCESS);
1375 }
1376 
1377 
1378 /*
1379  * idm_so_conn_is_capable() verifies that the passed connection is provided
1380  * for by the sockets interface.
1381  */
1382 /* ARGSUSED */
1383 static boolean_t
1384 idm_so_conn_is_capable(idm_conn_req_t *ic, idm_transport_caps_t *caps)
1385 {
1386 	return (B_TRUE);
1387 }
1388 
1389 /*
1390  * idm_so_rx_datain() validates the Data Sequence number of the PDU. The
1391  * idm_sorecv_scsidata() function invoked earlier actually reads the data
1392  * off the socket into the appropriate buffers.
1393  */
1394 static void
1395 idm_so_rx_datain(idm_conn_t *ic, idm_pdu_t *pdu)
1396 {
1397 	iscsi_data_hdr_t	*bhs;
1398 	idm_task_t		*idt;
1399 	idm_buf_t		*idb;
1400 	uint32_t		datasn;
1401 	size_t			offset;
1402 	iscsi_hdr_t		*ihp = (iscsi_hdr_t *)pdu->isp_hdr;
1403 	iscsi_data_rsp_hdr_t    *idrhp = (iscsi_data_rsp_hdr_t *)ihp;
1404 
1405 	ASSERT(ic != NULL);
1406 	ASSERT(pdu != NULL);
1407 
1408 	bhs	= (iscsi_data_hdr_t *)pdu->isp_hdr;
1409 	datasn	= ntohl(bhs->datasn);
1410 	offset	= ntohl(bhs->offset);
1411 
1412 	ASSERT(bhs->opcode == ISCSI_OP_SCSI_DATA_RSP);
1413 
1414 	/*
1415 	 * Look up the task corresponding to the initiator task tag
1416 	 * to get the buffers affiliated with the task.
1417 	 */
1418 	idt = idm_task_find(ic, bhs->itt, bhs->ttt);
1419 	if (idt == NULL) {
1420 		IDM_CONN_LOG(CE_WARN, "idm_so_rx_datain: failed to find task");
1421 		idm_pdu_rx_protocol_error(ic, pdu);
1422 		return;
1423 	}
1424 
1425 	idb = pdu->isp_sorx_buf;
1426 	if (idb == NULL) {
1427 		IDM_CONN_LOG(CE_WARN,
1428 		    "idm_so_rx_datain: failed to find buffer");
1429 		idm_task_rele(idt);
1430 		idm_pdu_rx_protocol_error(ic, pdu);
1431 		return;
1432 	}
1433 
1434 	/*
1435 	 * DataSN values should be sequential and should not have any gaps or
1436 	 * repetitions. Check the DataSN with the one stored in the task.
1437 	 */
1438 	if (datasn == idt->idt_exp_datasn) {
1439 		idt->idt_exp_datasn++; /* keep track of DataSN received */
1440 	} else {
1441 		IDM_CONN_LOG(CE_WARN, "idm_so_rx_datain: datasn out of order");
1442 		idm_task_rele(idt);
1443 		idm_pdu_rx_protocol_error(ic, pdu);
1444 		return;
1445 	}
1446 
1447 	/*
1448 	 * PDUs in a sequence should be in continuously increasing
1449 	 * address offset
1450 	 */
1451 	if (offset != idb->idb_exp_offset) {
1452 		IDM_CONN_LOG(CE_WARN, "idm_so_rx_datain: unexpected offset");
1453 		idm_task_rele(idt);
1454 		idm_pdu_rx_protocol_error(ic, pdu);
1455 		return;
1456 	}
1457 	/* Expected next relative buffer offset */
1458 	idb->idb_exp_offset += n2h24(bhs->dlength);
1459 	idt->idt_rx_bytes += n2h24(bhs->dlength);
1460 
1461 	idm_task_rele(idt);
1462 
1463 	/*
1464 	 * For now call scsi_rsp which will process the data rsp
1465 	 * Revisit, need to provide an explicit client entry point for
1466 	 * phase collapse completions.
1467 	 */
1468 	if (((ihp->opcode & ISCSI_OPCODE_MASK) == ISCSI_OP_SCSI_DATA_RSP) &&
1469 	    (idrhp->flags & ISCSI_FLAG_DATA_STATUS)) {
1470 		(*ic->ic_conn_ops.icb_rx_scsi_rsp)(ic, pdu);
1471 	}
1472 
1473 	idm_pdu_complete(pdu, IDM_STATUS_SUCCESS);
1474 }
1475 
1476 /*
1477  * The idm_so_rx_dataout() function is used by the iSCSI target to read
1478  * data from the Data-Out PDU sent by the iSCSI initiator.
1479  *
1480  * This function gets the Initiator Task Tag from the PDU BHS and looks up the
1481  * task to get the buffers associated with the PDU. A PDU might span buffers.
1482  * The data is then read into the respective buffer.
1483  */
1484 static void
1485 idm_so_rx_dataout(idm_conn_t *ic, idm_pdu_t *pdu)
1486 {
1487 
1488 	iscsi_data_hdr_t	*bhs;
1489 	idm_task_t		*idt;
1490 	idm_buf_t		*idb;
1491 	size_t			offset;
1492 
1493 	ASSERT(ic != NULL);
1494 	ASSERT(pdu != NULL);
1495 
1496 	bhs = (iscsi_data_hdr_t *)pdu->isp_hdr;
1497 	offset = ntohl(bhs->offset);
1498 	ASSERT(bhs->opcode == ISCSI_OP_SCSI_DATA);
1499 
1500 	/*
1501 	 * Look up the task corresponding to the initiator task tag
1502 	 * to get the buffers affiliated with the task.
1503 	 */
1504 	idt = idm_task_find(ic, bhs->itt, bhs->ttt);
1505 	if (idt == NULL) {
1506 		IDM_CONN_LOG(CE_WARN,
1507 		    "idm_so_rx_dataout: failed to find task");
1508 		idm_pdu_rx_protocol_error(ic, pdu);
1509 		return;
1510 	}
1511 
1512 	idb = pdu->isp_sorx_buf;
1513 	if (idb == NULL) {
1514 		IDM_CONN_LOG(CE_WARN,
1515 		    "idm_so_rx_dataout: failed to find buffer");
1516 		idm_task_rele(idt);
1517 		idm_pdu_rx_protocol_error(ic, pdu);
1518 		return;
1519 	}
1520 
1521 	/* Keep track of data transferred - check data offsets */
1522 	if (offset != idb->idb_exp_offset) {
1523 		IDM_CONN_LOG(CE_NOTE, "idm_so_rx_dataout: offset out of seq: "
1524 		    "%ld, %d", offset, idb->idb_exp_offset);
1525 		idm_task_rele(idt);
1526 		idm_pdu_rx_protocol_error(ic, pdu);
1527 		return;
1528 	}
1529 	/* Expected next relative offset */
1530 	idb->idb_exp_offset += ntoh24(bhs->dlength);
1531 	idt->idt_rx_bytes += n2h24(bhs->dlength);
1532 
1533 	/*
1534 	 * Call the buffer callback when the transfer is complete
1535 	 *
1536 	 * The connection state machine should only abort tasks after
1537 	 * shutting down the connection so we are assured that there
1538 	 * won't be a simultaneous attempt to abort this task at the
1539 	 * same time as we are processing this PDU (due to a connection
1540 	 * state change).
1541 	 */
1542 	if (bhs->flags & ISCSI_FLAG_FINAL) {
1543 		/*
1544 		 * We only want to call idm_buf_rx_from_ini_done once
1545 		 * per transfer.  It's possible that this task has
1546 		 * already been aborted in which case
1547 		 * idm_so_free_task_rsrc will call idm_buf_rx_from_ini_done
1548 		 * for each buffer with idb_in_transport==B_TRUE.  To
1549 		 * close this window and ensure that this doesn't happen,
1550 		 * we'll clear idb->idb_in_transport now while holding
1551 		 * the task mutex.   This is only really an issue for
1552 		 * SCSI task abort -- if tasks were being aborted because
1553 		 * of a connection state change the state machine would
1554 		 * have already stopped the receive thread.
1555 		 */
1556 		mutex_enter(&idt->idt_mutex);
1557 
1558 		/*
1559 		 * Release the task hold here (obtained in idm_task_find)
1560 		 * because the task may complete synchronously during
1561 		 * idm_buf_rx_from_ini_done.  Since we still have an active
1562 		 * buffer we know there is at least one additional hold on idt.
1563 		 */
1564 		idm_task_rele(idt);
1565 
1566 		/*
1567 		 * idm_buf_rx_from_ini_done releases idt->idt_mutex
1568 		 */
1569 		idm_buf_rx_from_ini_done(idt, idb, IDM_STATUS_SUCCESS);
1570 		idm_pdu_complete(pdu, IDM_STATUS_SUCCESS);
1571 		return;
1572 	}
1573 
1574 	idm_task_rele(idt);
1575 	idm_pdu_complete(pdu, IDM_STATUS_SUCCESS);
1576 }
1577 
1578 /*
1579  * The idm_so_rx_rtt() function is used by the iSCSI initiator to handle
1580  * the R2T PDU sent by the iSCSI target indicating that it is ready to
1581  * accept data. This gets the Initiator Task Tag (itt) from the PDU BHS
1582  * and looks up the task in the task tree using the itt to get the output
1583  * buffers associated the task. The R2T PDU contains the offset of the
1584  * requested data and the data length. This function then constructs a
1585  * sequence of iSCSI PDUs and outputs the requested data. Each Data-Out
1586  * PDU is associated with the R2T by the Target Transfer Tag  (ttt).
1587  */
1588 
1589 static void
1590 idm_so_rx_rtt(idm_conn_t *ic, idm_pdu_t *pdu)
1591 {
1592 	idm_task_t		*idt;
1593 	idm_buf_t		*idb;
1594 	iscsi_rtt_hdr_t		*rtt_hdr;
1595 	uint32_t		data_offset;
1596 	uint32_t		data_length;
1597 
1598 	ASSERT(ic != NULL);
1599 	ASSERT(pdu != NULL);
1600 
1601 	rtt_hdr	= (iscsi_rtt_hdr_t *)pdu->isp_hdr;
1602 	data_offset = ntohl(rtt_hdr->data_offset);
1603 	data_length = ntohl(rtt_hdr->data_length);
1604 	idt	= idm_task_find(ic, rtt_hdr->itt, rtt_hdr->ttt);
1605 
1606 	if (idt == NULL) {
1607 		IDM_CONN_LOG(CE_WARN, "idm_so_rx_rtt: could not find task");
1608 		idm_pdu_rx_protocol_error(ic, pdu);
1609 		return;
1610 	}
1611 
1612 	/* Find the buffer bound to the task by the iSCSI initiator */
1613 	mutex_enter(&idt->idt_mutex);
1614 	idb = idm_buf_find(&idt->idt_outbufv, data_offset);
1615 	if (idb == NULL) {
1616 		mutex_exit(&idt->idt_mutex);
1617 		idm_task_rele(idt);
1618 		IDM_CONN_LOG(CE_WARN, "idm_so_rx_rtt: could not find buffer");
1619 		idm_pdu_rx_protocol_error(ic, pdu);
1620 		return;
1621 	}
1622 
1623 	/* return buffer contains this data */
1624 	if (data_offset + data_length > idb->idb_buflen) {
1625 		/* Overflow */
1626 		mutex_exit(&idt->idt_mutex);
1627 		idm_task_rele(idt);
1628 		IDM_CONN_LOG(CE_WARN, "idm_so_rx_rtt: read from outside "
1629 		    "buffer");
1630 		idm_pdu_rx_protocol_error(ic, pdu);
1631 		return;
1632 	}
1633 
1634 	idt->idt_r2t_ttt = rtt_hdr->ttt;
1635 	idt->idt_exp_datasn = 0;
1636 
1637 	idm_so_send_rtt_data(ic, idt, idb, data_offset,
1638 	    ntohl(rtt_hdr->data_length));
1639 	mutex_exit(&idt->idt_mutex);
1640 
1641 	idm_pdu_complete(pdu, IDM_STATUS_SUCCESS);
1642 	idm_task_rele(idt);
1643 
1644 }
1645 
1646 idm_status_t
1647 idm_sorecvdata(idm_conn_t *ic, idm_pdu_t *pdu)
1648 {
1649 	uint8_t		pad[ISCSI_PAD_WORD_LEN];
1650 	int		pad_len;
1651 	uint32_t	data_digest_crc;
1652 	uint32_t	crc_calculated;
1653 	int		total_len;
1654 	idm_so_conn_t	*so_conn;
1655 
1656 	so_conn = ic->ic_transport_private;
1657 
1658 	pad_len = ((ISCSI_PAD_WORD_LEN -
1659 	    (pdu->isp_datalen & (ISCSI_PAD_WORD_LEN - 1))) &
1660 	    (ISCSI_PAD_WORD_LEN - 1));
1661 
1662 	ASSERT(pdu->isp_iovlen < (PDU_MAX_IOVLEN - 2)); /* pad + data digest */
1663 
1664 	total_len = pdu->isp_datalen;
1665 
1666 	if (pad_len) {
1667 		pdu->isp_iov[pdu->isp_iovlen].iov_base	= (char *)&pad;
1668 		pdu->isp_iov[pdu->isp_iovlen].iov_len	= pad_len;
1669 		total_len		+= pad_len;
1670 		pdu->isp_iovlen++;
1671 	}
1672 
1673 	/* setup data digest */
1674 	if ((ic->ic_conn_flags & IDM_CONN_DATA_DIGEST) != 0) {
1675 		pdu->isp_iov[pdu->isp_iovlen].iov_base =
1676 		    (char *)&data_digest_crc;
1677 		pdu->isp_iov[pdu->isp_iovlen].iov_len =
1678 		    sizeof (data_digest_crc);
1679 		total_len		+= sizeof (data_digest_crc);
1680 		pdu->isp_iovlen++;
1681 	}
1682 
1683 	pdu->isp_data = (uint8_t *)(uintptr_t)pdu->isp_iov[0].iov_base;
1684 
1685 	if (idm_iov_sorecv(so_conn->ic_so, &pdu->isp_iov[0],
1686 	    pdu->isp_iovlen, total_len) != 0) {
1687 		return (IDM_STATUS_IO);
1688 	}
1689 
1690 	if ((ic->ic_conn_flags & IDM_CONN_DATA_DIGEST) != 0) {
1691 		crc_calculated = idm_crc32c(pdu->isp_data,
1692 		    pdu->isp_datalen);
1693 		if (pad_len) {
1694 			crc_calculated = idm_crc32c_continued((char *)&pad,
1695 			    pad_len, crc_calculated);
1696 		}
1697 		if (crc_calculated != data_digest_crc) {
1698 			IDM_CONN_LOG(CE_WARN,
1699 			    "idm_sorecvdata: "
1700 			    "CRC error: actual 0x%x, calc 0x%x",
1701 			    data_digest_crc, crc_calculated);
1702 
1703 			/* Invalid Data Digest */
1704 			return (IDM_STATUS_DATA_DIGEST);
1705 		}
1706 	}
1707 
1708 	return (IDM_STATUS_SUCCESS);
1709 }
1710 
1711 /*
1712  * idm_sorecv_scsidata() is used to receive scsi data from the socket. The
1713  * Data-type PDU header must be read into the idm_pdu_t structure prior to
1714  * calling this function.
1715  */
1716 idm_status_t
1717 idm_sorecv_scsidata(idm_conn_t *ic, idm_pdu_t *pdu)
1718 {
1719 	iscsi_data_hdr_t	*bhs;
1720 	idm_task_t		*task;
1721 	uint32_t		offset;
1722 	uint8_t			opcode;
1723 	uint32_t		dlength;
1724 	list_t			*buflst;
1725 	uint32_t		xfer_bytes;
1726 	idm_status_t		status;
1727 
1728 	ASSERT(ic != NULL);
1729 	ASSERT(pdu != NULL);
1730 
1731 	bhs	= (iscsi_data_hdr_t *)pdu->isp_hdr;
1732 
1733 	offset	= ntohl(bhs->offset);
1734 	opcode	= bhs->opcode;
1735 	dlength = n2h24(bhs->dlength);
1736 
1737 	ASSERT((opcode == ISCSI_OP_SCSI_DATA_RSP) ||
1738 	    (opcode == ISCSI_OP_SCSI_DATA));
1739 
1740 	/*
1741 	 * Successful lookup implicitly gets a "hold" on the task.  This
1742 	 * hold must be released before leaving this function.  At one
1743 	 * point we were caching this task context and retaining the hold
1744 	 * but it turned out to be very difficult to release the hold properly.
1745 	 * The task can be aborted and the connection shutdown between this
1746 	 * call and the subsequent expected call to idm_so_rx_datain/
1747 	 * idm_so_rx_dataout (in which case those functions are not called).
1748 	 * Releasing the hold in the PDU callback doesn't work well either
1749 	 * because the whole task may be completed by then at which point
1750 	 * it is too late to release the hold -- for better or worse this
1751 	 * code doesn't wait on the refcnts during normal operation.
1752 	 * idm_task_find() is very fast and it is not a huge burden if we
1753 	 * have to do it twice.
1754 	 */
1755 	task = idm_task_find(ic, bhs->itt, bhs->ttt);
1756 	if (task == NULL) {
1757 		IDM_CONN_LOG(CE_WARN,
1758 		    "idm_sorecv_scsidata: could not find task");
1759 		return (IDM_STATUS_FAIL);
1760 	}
1761 
1762 	mutex_enter(&task->idt_mutex);
1763 	buflst	= (opcode == ISCSI_OP_SCSI_DATA_RSP) ?
1764 	    &task->idt_inbufv : &task->idt_outbufv;
1765 	pdu->isp_sorx_buf = idm_buf_find(buflst, offset);
1766 	mutex_exit(&task->idt_mutex);
1767 
1768 	if (pdu->isp_sorx_buf == NULL) {
1769 		idm_task_rele(task);
1770 		IDM_CONN_LOG(CE_WARN, "idm_sorecv_scsidata: could not find "
1771 		    "buffer for offset %x opcode=%x",
1772 		    offset, opcode);
1773 		return (IDM_STATUS_FAIL);
1774 	}
1775 
1776 	xfer_bytes = idm_fill_iov(pdu, pdu->isp_sorx_buf, offset, dlength);
1777 	ASSERT(xfer_bytes != 0);
1778 	if (xfer_bytes != dlength) {
1779 		idm_task_rele(task);
1780 		/*
1781 		 * Buffer overflow, connection error.  The PDU data is still
1782 		 * sitting in the socket so we can't use the connection
1783 		 * again until that data is drained.
1784 		 */
1785 		return (IDM_STATUS_FAIL);
1786 	}
1787 
1788 	status = idm_sorecvdata(ic, pdu);
1789 
1790 	idm_task_rele(task);
1791 
1792 	return (status);
1793 }
1794 
1795 static uint32_t
1796 idm_fill_iov(idm_pdu_t *pdu, idm_buf_t *idb, uint32_t ro, uint32_t dlength)
1797 {
1798 	uint32_t	buf_ro = ro - idb->idb_bufoffset;
1799 	uint32_t	xfer_len = min(dlength, idb->idb_buflen - buf_ro);
1800 
1801 	ASSERT(ro >= idb->idb_bufoffset);
1802 
1803 	pdu->isp_iov[pdu->isp_iovlen].iov_base	=
1804 	    (caddr_t)idb->idb_buf + buf_ro;
1805 	pdu->isp_iov[pdu->isp_iovlen].iov_len	= xfer_len;
1806 	pdu->isp_iovlen++;
1807 
1808 	return (xfer_len);
1809 }
1810 
1811 int
1812 idm_sorecv_nonscsidata(idm_conn_t *ic, idm_pdu_t *pdu)
1813 {
1814 	pdu->isp_data = kmem_alloc(pdu->isp_datalen, KM_SLEEP);
1815 	ASSERT(pdu->isp_data != NULL);
1816 
1817 	pdu->isp_databuflen = pdu->isp_datalen;
1818 	pdu->isp_iov[0].iov_base = (caddr_t)pdu->isp_data;
1819 	pdu->isp_iov[0].iov_len = pdu->isp_datalen;
1820 	pdu->isp_iovlen = 1;
1821 	/*
1822 	 * Since we are associating a new data buffer with this received
1823 	 * PDU we need to set a specific callback to free the data
1824 	 * after the PDU is processed.
1825 	 */
1826 	pdu->isp_flags |= IDM_PDU_ADDL_DATA;
1827 	pdu->isp_callback = idm_sorx_addl_pdu_cb;
1828 
1829 	return (idm_sorecvdata(ic, pdu));
1830 }
1831 
1832 void
1833 idm_sorx_thread(void *arg)
1834 {
1835 	boolean_t	conn_failure = B_FALSE;
1836 	idm_conn_t	*ic = (idm_conn_t *)arg;
1837 	idm_so_conn_t	*so_conn;
1838 	idm_pdu_t	*pdu;
1839 	idm_status_t	rc;
1840 
1841 	idm_conn_hold(ic);
1842 
1843 	mutex_enter(&ic->ic_mutex);
1844 
1845 	so_conn = ic->ic_transport_private;
1846 	so_conn->ic_rx_thread_running = B_TRUE;
1847 	so_conn->ic_rx_thread_did = so_conn->ic_rx_thread->t_did;
1848 	cv_signal(&ic->ic_cv);
1849 
1850 	while (so_conn->ic_rx_thread_running) {
1851 		mutex_exit(&ic->ic_mutex);
1852 
1853 		/*
1854 		 * Get PDU with default header size (large enough for
1855 		 * BHS plus any anticipated AHS).  PDU from
1856 		 * the cache will have all values set correctly
1857 		 * for sockets RX including callback.
1858 		 */
1859 		pdu = kmem_cache_alloc(idm.idm_sorx_pdu_cache, KM_SLEEP);
1860 		pdu->isp_ic = ic;
1861 		pdu->isp_flags = 0;
1862 		pdu->isp_transport_hdrlen = 0;
1863 
1864 		if ((rc = idm_sorecvhdr(ic, pdu)) != 0) {
1865 			/*
1866 			 * Call idm_pdu_complete so that we call the callback
1867 			 * and ensure any memory allocated in idm_sorecvhdr
1868 			 * gets freed up.
1869 			 */
1870 			idm_pdu_complete(pdu, IDM_STATUS_FAIL);
1871 
1872 			/*
1873 			 * If ic_rx_thread_running is still set then
1874 			 * this is some kind of connection problem
1875 			 * on the socket.  In this case we want to
1876 			 * generate an event.  Otherwise some other
1877 			 * thread closed the socket due to another
1878 			 * issue in which case we don't need to
1879 			 * generate an event.
1880 			 */
1881 			mutex_enter(&ic->ic_mutex);
1882 			if (so_conn->ic_rx_thread_running) {
1883 				conn_failure = B_TRUE;
1884 				so_conn->ic_rx_thread_running = B_FALSE;
1885 			}
1886 
1887 			continue;
1888 		}
1889 
1890 		/*
1891 		 * Header has been read and validated.  Now we need
1892 		 * to read the PDU data payload (if present).  SCSI data
1893 		 * need to be transferred from the socket directly into
1894 		 * the associated transfer buffer for the SCSI task.
1895 		 */
1896 		if (pdu->isp_datalen != 0) {
1897 			if ((IDM_PDU_OPCODE(pdu) == ISCSI_OP_SCSI_DATA) ||
1898 			    (IDM_PDU_OPCODE(pdu) == ISCSI_OP_SCSI_DATA_RSP)) {
1899 				rc = idm_sorecv_scsidata(ic, pdu);
1900 				/*
1901 				 * All SCSI errors are fatal to the
1902 				 * connection right now since we have no
1903 				 * place to put the data.  What we need
1904 				 * is some kind of sink to dispose of unwanted
1905 				 * SCSI data.  For example an invalid task tag
1906 				 * should not kill the connection (although
1907 				 * we may want to drop the connection).
1908 				 */
1909 			} else {
1910 				/*
1911 				 * Not data PDUs so allocate a buffer for the
1912 				 * data segment and read the remaining data.
1913 				 */
1914 				rc = idm_sorecv_nonscsidata(ic, pdu);
1915 			}
1916 			if (rc != 0) {
1917 				/*
1918 				 * Call idm_pdu_complete so that we call the
1919 				 * callback and ensure any memory allocated
1920 				 * in idm_sorecvhdr gets freed up.
1921 				 */
1922 				idm_pdu_complete(pdu, IDM_STATUS_FAIL);
1923 
1924 				/*
1925 				 * If ic_rx_thread_running is still set then
1926 				 * this is some kind of connection problem
1927 				 * on the socket.  In this case we want to
1928 				 * generate an event.  Otherwise some other
1929 				 * thread closed the socket due to another
1930 				 * issue in which case we don't need to
1931 				 * generate an event.
1932 				 */
1933 				mutex_enter(&ic->ic_mutex);
1934 				if (so_conn->ic_rx_thread_running) {
1935 					conn_failure = B_TRUE;
1936 					so_conn->ic_rx_thread_running = B_FALSE;
1937 				}
1938 				continue;
1939 			}
1940 		}
1941 
1942 		/*
1943 		 * Process RX PDU
1944 		 */
1945 		idm_pdu_rx(ic, pdu);
1946 
1947 		mutex_enter(&ic->ic_mutex);
1948 	}
1949 
1950 	mutex_exit(&ic->ic_mutex);
1951 
1952 	/*
1953 	 * If we dropped out of the RX processing loop because of
1954 	 * a socket problem or other connection failure (including
1955 	 * digest errors) then we need to generate a state machine
1956 	 * event to shut the connection down.
1957 	 * If the state machine is already in, for example, INIT_ERROR, this
1958 	 * event will get dropped, and the TX thread will never be notified
1959 	 * to shut down.  To be safe, we'll just notify it here.
1960 	 */
1961 	if (conn_failure) {
1962 		if (so_conn->ic_tx_thread_running) {
1963 			so_conn->ic_tx_thread_running = B_FALSE;
1964 			mutex_enter(&so_conn->ic_tx_mutex);
1965 			cv_signal(&so_conn->ic_tx_cv);
1966 			mutex_exit(&so_conn->ic_tx_mutex);
1967 		}
1968 
1969 		idm_conn_event(ic, CE_TRANSPORT_FAIL, rc);
1970 	}
1971 
1972 	idm_conn_rele(ic);
1973 
1974 	thread_exit();
1975 }
1976 
1977 /*
1978  * idm_so_tx
1979  *
1980  * This is the implementation of idm_transport_ops_t's it_tx_pdu entry
1981  * point.  By definition, it is supposed to be fast.  So, simply queue
1982  * the entry and return.  The real work is done by idm_i_so_tx() via
1983  * idm_sotx_thread().
1984  */
1985 
1986 static void
1987 idm_so_tx(idm_conn_t *ic, idm_pdu_t *pdu)
1988 {
1989 	idm_so_conn_t *so_conn = ic->ic_transport_private;
1990 
1991 	ASSERT(pdu->isp_ic == ic);
1992 	mutex_enter(&so_conn->ic_tx_mutex);
1993 
1994 	if (!so_conn->ic_tx_thread_running) {
1995 		mutex_exit(&so_conn->ic_tx_mutex);
1996 		idm_pdu_complete(pdu, IDM_STATUS_ABORTED);
1997 		return;
1998 	}
1999 
2000 	list_insert_tail(&so_conn->ic_tx_list, (void *)pdu);
2001 	cv_signal(&so_conn->ic_tx_cv);
2002 	mutex_exit(&so_conn->ic_tx_mutex);
2003 }
2004 
2005 static idm_status_t
2006 idm_i_so_tx(idm_pdu_t *pdu)
2007 {
2008 	idm_conn_t	*ic = pdu->isp_ic;
2009 	idm_status_t	status = IDM_STATUS_SUCCESS;
2010 	uint8_t		pad[ISCSI_PAD_WORD_LEN];
2011 	int		pad_len;
2012 	uint32_t	hdr_digest_crc;
2013 	uint32_t	data_digest_crc = 0;
2014 	int		total_len = 0;
2015 	int		iovlen = 0;
2016 	struct iovec	iov[6];
2017 	idm_so_conn_t	*so_conn;
2018 
2019 	so_conn = ic->ic_transport_private;
2020 
2021 	/* Setup BHS */
2022 	iov[iovlen].iov_base	= (caddr_t)pdu->isp_hdr;
2023 	iov[iovlen].iov_len	= pdu->isp_hdrlen;
2024 	total_len		+= iov[iovlen].iov_len;
2025 	iovlen++;
2026 
2027 	/* Setup header digest */
2028 	if (((pdu->isp_flags & IDM_PDU_LOGIN_TX) == 0) &&
2029 	    (ic->ic_conn_flags & IDM_CONN_HEADER_DIGEST)) {
2030 		hdr_digest_crc = idm_crc32c(pdu->isp_hdr, pdu->isp_hdrlen);
2031 
2032 		iov[iovlen].iov_base	= (caddr_t)&hdr_digest_crc;
2033 		iov[iovlen].iov_len	= sizeof (hdr_digest_crc);
2034 		total_len		+= iov[iovlen].iov_len;
2035 		iovlen++;
2036 	}
2037 
2038 	/* Setup the data */
2039 	if (pdu->isp_datalen) {
2040 		idm_task_t		*idt;
2041 		idm_buf_t		*idb;
2042 		iscsi_data_hdr_t	*ihp;
2043 		ihp = (iscsi_data_hdr_t *)pdu->isp_hdr;
2044 		/* Write of immediate data */
2045 		if (ic->ic_ffp &&
2046 		    (ihp->opcode == ISCSI_OP_SCSI_CMD ||
2047 		    ihp->opcode == ISCSI_OP_SCSI_DATA)) {
2048 			idt = idm_task_find(ic, ihp->itt, ihp->ttt);
2049 			if (idt) {
2050 				mutex_enter(&idt->idt_mutex);
2051 				idb = idm_buf_find(&idt->idt_outbufv, 0);
2052 				mutex_exit(&idt->idt_mutex);
2053 				/*
2054 				 * If the initiator call to idm_buf_alloc
2055 				 * failed then we can get to this point
2056 				 * without a bound buffer.  The associated
2057 				 * connection failure will clean things up
2058 				 * later.  It would be nice to come up with
2059 				 * a cleaner way to handle this.  In
2060 				 * particular it seems absurd to look up
2061 				 * the task and the buffer just to update
2062 				 * this counter.
2063 				 */
2064 				if (idb)
2065 					idb->idb_xfer_len += pdu->isp_datalen;
2066 				idm_task_rele(idt);
2067 			}
2068 		}
2069 
2070 		iov[iovlen].iov_base = (caddr_t)pdu->isp_data;
2071 		iov[iovlen].iov_len  = pdu->isp_datalen;
2072 		total_len += iov[iovlen].iov_len;
2073 		iovlen++;
2074 	}
2075 
2076 	/* Setup the data pad if necessary */
2077 	pad_len = ((ISCSI_PAD_WORD_LEN -
2078 	    (pdu->isp_datalen & (ISCSI_PAD_WORD_LEN - 1))) &
2079 	    (ISCSI_PAD_WORD_LEN - 1));
2080 
2081 	if (pad_len) {
2082 		bzero(pad, sizeof (pad));
2083 		iov[iovlen].iov_base = (void *)&pad;
2084 		iov[iovlen].iov_len  = pad_len;
2085 		total_len		+= iov[iovlen].iov_len;
2086 		iovlen++;
2087 	}
2088 
2089 	/*
2090 	 * Setup the data digest if enabled.  Data-digest is not sent
2091 	 * for login-phase PDUs.
2092 	 */
2093 	if ((ic->ic_conn_flags & IDM_CONN_DATA_DIGEST) &&
2094 	    ((pdu->isp_flags & IDM_PDU_LOGIN_TX) == 0) &&
2095 	    (pdu->isp_datalen || pad_len)) {
2096 		/*
2097 		 * RFC3720/10.2.3: A zero-length Data Segment also
2098 		 * implies a zero-length data digest.
2099 		 */
2100 		if (pdu->isp_datalen) {
2101 			data_digest_crc = idm_crc32c(pdu->isp_data,
2102 			    pdu->isp_datalen);
2103 		}
2104 		if (pad_len) {
2105 			data_digest_crc = idm_crc32c_continued(&pad,
2106 			    pad_len, data_digest_crc);
2107 		}
2108 
2109 		iov[iovlen].iov_base	= (caddr_t)&data_digest_crc;
2110 		iov[iovlen].iov_len	= sizeof (data_digest_crc);
2111 		total_len		+= iov[iovlen].iov_len;
2112 		iovlen++;
2113 	}
2114 
2115 	/* Transmit the PDU */
2116 	if (idm_iov_sosend(so_conn->ic_so, &iov[0], iovlen,
2117 	    total_len) != 0) {
2118 		/* Set error status */
2119 		IDM_CONN_LOG(CE_WARN,
2120 		    "idm_so_tx: failed to transmit the PDU, so: %p ic: %p "
2121 		    "data: %p", (void *) so_conn->ic_so, (void *) ic,
2122 		    (void *) pdu->isp_data);
2123 		status = IDM_STATUS_IO;
2124 	}
2125 
2126 	/*
2127 	 * Success does not mean that the PDU actually reached the
2128 	 * remote node since it could get dropped along the way.
2129 	 */
2130 	idm_pdu_complete(pdu, status);
2131 
2132 	return (status);
2133 }
2134 
2135 /*
2136  * The idm_so_buf_tx_to_ini() is used by the target iSCSI layer to transmit the
2137  * Data-In PDUs using sockets. Based on the negotiated MaxRecvDataSegmentLength,
2138  * the buffer is segmented into a sequence of Data-In PDUs, ordered by DataSN.
2139  * A target can invoke this function multiple times for a single read command
2140  * (identified by the same ITT) to split the input into several sequences.
2141  *
2142  * DataSN starts with 0 for the first data PDU of an input command and advances
2143  * by 1 for each subsequent data PDU. Each sequence will have its own F bit,
2144  * which is set to 1 for the last data PDU of a sequence.
2145  *
2146  * Scope for Prototype build:
2147  * The data PDUs within a sequence will be sent in order with the buffer offset
2148  * in increasing order. i.e. initiator and target must have negotiated the
2149  * "DataPDUInOrder" to "Yes". The order between sequences is not enforced.
2150  *
2151  * Caller holds idt->idt_mutex
2152  */
2153 static idm_status_t
2154 idm_so_buf_tx_to_ini(idm_task_t *idt, idm_buf_t *idb)
2155 {
2156 	idm_so_conn_t	*so_conn = idb->idb_ic->ic_transport_private;
2157 	idm_pdu_t	tmppdu;
2158 
2159 	ASSERT(mutex_owned(&idt->idt_mutex));
2160 
2161 	/*
2162 	 * Put the idm_buf_t on the tx queue.  It will be transmitted by
2163 	 * idm_sotx_thread.
2164 	 */
2165 	mutex_enter(&so_conn->ic_tx_mutex);
2166 
2167 	if (!so_conn->ic_tx_thread_running) {
2168 		mutex_exit(&so_conn->ic_tx_mutex);
2169 		/*
2170 		 * Don't release idt->idt_mutex since we're supposed to hold
2171 		 * in when calling idm_buf_tx_to_ini_done
2172 		 */
2173 		idm_buf_tx_to_ini_done(idt, idb, IDM_STATUS_ABORTED);
2174 		return (IDM_STATUS_FAIL);
2175 	}
2176 
2177 	/*
2178 	 * Build a template for the data PDU headers we will use so that
2179 	 * the SN values will stay consistent with other PDU's we are
2180 	 * transmitting like R2T and SCSI status.
2181 	 */
2182 	bzero(&idb->idb_data_hdr_tmpl, sizeof (iscsi_hdr_t));
2183 	tmppdu.isp_hdr = &idb->idb_data_hdr_tmpl;
2184 	(*idt->idt_ic->ic_conn_ops.icb_build_hdr)(idt, &tmppdu,
2185 	    ISCSI_OP_SCSI_DATA_RSP);
2186 	idb->idb_tx_thread = B_TRUE;
2187 	list_insert_tail(&so_conn->ic_tx_list, (void *)idb);
2188 	cv_signal(&so_conn->ic_tx_cv);
2189 	mutex_exit(&so_conn->ic_tx_mutex);
2190 	mutex_exit(&idt->idt_mutex);
2191 
2192 	/*
2193 	 * Returning success here indicates the transfer was successfully
2194 	 * dispatched -- it does not mean that the transfer completed
2195 	 * successfully.
2196 	 */
2197 	return (IDM_STATUS_SUCCESS);
2198 }
2199 
2200 /*
2201  * The idm_so_buf_rx_from_ini() is used by the target iSCSI layer to specify the
2202  * data blocks it is ready to receive from the initiator in response to a WRITE
2203  * SCSI command. The target iSCSI layer passes the information about the desired
2204  * data blocks to the initiator in one R2T PDU. The receiving buffer, the buffer
2205  * offset and datalen are passed via the 'idb' argument.
2206  *
2207  * Scope for Prototype build:
2208  * R2Ts are required for any Data-Out PDU, i.e. initiator and target must have
2209  * negotiated the "InitialR2T" to "Yes".
2210  *
2211  * Caller holds idt->idt_mutex
2212  */
2213 static idm_status_t
2214 idm_so_buf_rx_from_ini(idm_task_t *idt, idm_buf_t *idb)
2215 {
2216 	idm_pdu_t		*pdu;
2217 	iscsi_rtt_hdr_t		*rtt;
2218 
2219 	ASSERT(mutex_owned(&idt->idt_mutex));
2220 
2221 	pdu = kmem_cache_alloc(idm.idm_sotx_pdu_cache, KM_SLEEP);
2222 	pdu->isp_ic = idt->idt_ic;
2223 	bzero(pdu->isp_hdr, sizeof (iscsi_rtt_hdr_t));
2224 
2225 	/* iSCSI layer fills the TTT, ITT, StatSN, ExpCmdSN, MaxCmdSN */
2226 	(*idt->idt_ic->ic_conn_ops.icb_build_hdr)(idt, pdu, ISCSI_OP_RTT_RSP);
2227 
2228 	/* set the rttsn, rtt.flags, rtt.data_offset and rtt.data_length */
2229 	rtt = (iscsi_rtt_hdr_t *)(pdu->isp_hdr);
2230 
2231 	rtt->opcode		= ISCSI_OP_RTT_RSP;
2232 	rtt->flags		= ISCSI_FLAG_FINAL;
2233 	rtt->data_offset	= htonl(idb->idb_bufoffset);
2234 	rtt->data_length	= htonl(idb->idb_xfer_len);
2235 	rtt->rttsn		= htonl(idt->idt_exp_rttsn++);
2236 
2237 	/* Keep track of buffer offsets */
2238 	idb->idb_exp_offset	= idb->idb_bufoffset;
2239 	mutex_exit(&idt->idt_mutex);
2240 
2241 	/*
2242 	 * Transmit the PDU.
2243 	 */
2244 	idm_pdu_tx(pdu);
2245 
2246 	return (IDM_STATUS_SUCCESS);
2247 }
2248 
2249 static idm_status_t
2250 idm_so_buf_alloc(idm_buf_t *idb, uint64_t buflen)
2251 {
2252 	if ((buflen > IDM_SO_BUF_CACHE_LB) && (buflen <= IDM_SO_BUF_CACHE_UB)) {
2253 		idb->idb_buf = kmem_cache_alloc(idm.idm_so_128k_buf_cache,
2254 		    KM_NOSLEEP);
2255 		idb->idb_buf_private = idm.idm_so_128k_buf_cache;
2256 	} else {
2257 		idb->idb_buf = kmem_alloc(buflen, KM_NOSLEEP);
2258 		idb->idb_buf_private = NULL;
2259 	}
2260 
2261 	if (idb->idb_buf == NULL) {
2262 		IDM_CONN_LOG(CE_NOTE,
2263 		    "idm_so_buf_alloc: failed buffer allocation");
2264 		return (IDM_STATUS_FAIL);
2265 	}
2266 
2267 	return (IDM_STATUS_SUCCESS);
2268 }
2269 
2270 /* ARGSUSED */
2271 static idm_status_t
2272 idm_so_buf_setup(idm_buf_t *idb)
2273 {
2274 	/* Ensure bufalloc'd flag is unset */
2275 	idb->idb_bufalloc = B_FALSE;
2276 
2277 	return (IDM_STATUS_SUCCESS);
2278 }
2279 
2280 /* ARGSUSED */
2281 static void
2282 idm_so_buf_teardown(idm_buf_t *idb)
2283 {
2284 	/* nothing to do here */
2285 }
2286 
2287 static void
2288 idm_so_buf_free(idm_buf_t *idb)
2289 {
2290 	if (idb->idb_buf_private == NULL) {
2291 		kmem_free(idb->idb_buf, idb->idb_buflen);
2292 	} else {
2293 		kmem_cache_free(idb->idb_buf_private, idb->idb_buf);
2294 	}
2295 }
2296 
2297 static void
2298 idm_so_send_rtt_data(idm_conn_t *ic, idm_task_t *idt, idm_buf_t *idb,
2299     uint32_t offset, uint32_t length)
2300 {
2301 	idm_so_conn_t	*so_conn = ic->ic_transport_private;
2302 	idm_pdu_t	tmppdu;
2303 	idm_buf_t	*rtt_buf;
2304 
2305 	ASSERT(mutex_owned(&idt->idt_mutex));
2306 
2307 	/*
2308 	 * Allocate a buffer to represent the RTT transfer.  We could further
2309 	 * optimize this by allocating the buffers internally from an rtt
2310 	 * specific buffer cache since this is socket-specific code but for
2311 	 * now we will keep it simple.
2312 	 */
2313 	rtt_buf = idm_buf_alloc(ic, (uint8_t *)idb->idb_buf + offset, length);
2314 	if (rtt_buf == NULL) {
2315 		/*
2316 		 * If we're in FFP then the failure was likely a resource
2317 		 * allocation issue and we should close the connection by
2318 		 * sending a CE_TRANSPORT_FAIL event.
2319 		 *
2320 		 * If we're not in FFP then idm_buf_alloc will always
2321 		 * fail and the state is transitioning to "complete" anyway
2322 		 * so we won't bother to send an event.
2323 		 */
2324 		mutex_enter(&ic->ic_state_mutex);
2325 		if (ic->ic_ffp)
2326 			idm_conn_event_locked(ic, CE_TRANSPORT_FAIL,
2327 			    NULL, CT_NONE);
2328 		mutex_exit(&ic->ic_state_mutex);
2329 		return;
2330 	}
2331 
2332 	rtt_buf->idb_buf_cb = NULL;
2333 	rtt_buf->idb_cb_arg = NULL;
2334 	rtt_buf->idb_bufoffset = offset;
2335 	rtt_buf->idb_xfer_len = length;
2336 	rtt_buf->idb_ic = idt->idt_ic;
2337 	rtt_buf->idb_task_binding = idt;
2338 
2339 	/*
2340 	 * Put the idm_buf_t on the tx queue.  It will be transmitted by
2341 	 * idm_sotx_thread.
2342 	 */
2343 	mutex_enter(&so_conn->ic_tx_mutex);
2344 
2345 	if (!so_conn->ic_tx_thread_running) {
2346 		idm_buf_free(rtt_buf);
2347 		mutex_exit(&so_conn->ic_tx_mutex);
2348 		return;
2349 	}
2350 
2351 	/*
2352 	 * This new buffer represents an additional reference on the task
2353 	 */
2354 	idm_task_hold(idt);
2355 
2356 	/*
2357 	 * Build a template for the data PDU headers we will use so that
2358 	 * the SN values will stay consistent with other PDU's we are
2359 	 * transmitting like R2T and SCSI status.
2360 	 */
2361 	bzero(&rtt_buf->idb_data_hdr_tmpl, sizeof (iscsi_hdr_t));
2362 	tmppdu.isp_hdr = &rtt_buf->idb_data_hdr_tmpl;
2363 	(*idt->idt_ic->ic_conn_ops.icb_build_hdr)(idt, &tmppdu,
2364 	    ISCSI_OP_SCSI_DATA);
2365 	rtt_buf->idb_tx_thread = B_TRUE;
2366 	rtt_buf->idb_in_transport = B_TRUE;
2367 	list_insert_tail(&so_conn->ic_tx_list, (void *)rtt_buf);
2368 	cv_signal(&so_conn->ic_tx_cv);
2369 	mutex_exit(&so_conn->ic_tx_mutex);
2370 }
2371 
2372 static void
2373 idm_so_send_rtt_data_done(idm_task_t *idt, idm_buf_t *idb)
2374 {
2375 	/*
2376 	 * Don't worry about status -- we assume any error handling
2377 	 * is performed by the caller (idm_sotx_thread).
2378 	 */
2379 	idb->idb_in_transport = B_FALSE;
2380 	idm_task_rele(idt);
2381 	idm_buf_free(idb);
2382 }
2383 
2384 static idm_status_t
2385 idm_so_send_buf_region(idm_task_t *idt, idm_buf_t *idb,
2386     uint32_t buf_region_offset, uint32_t buf_region_length)
2387 {
2388 	idm_conn_t		*ic;
2389 	uint32_t		max_dataseglen;
2390 	size_t			remainder, chunk;
2391 	uint32_t		data_offset = buf_region_offset;
2392 	iscsi_data_hdr_t	*bhs;
2393 	idm_pdu_t		*pdu;
2394 	idm_status_t		tx_status;
2395 
2396 	ASSERT(mutex_owned(&idt->idt_mutex));
2397 
2398 	ic = idt->idt_ic;
2399 
2400 	max_dataseglen = 8192; /* Need value from login negotiation */
2401 	remainder = buf_region_length;
2402 
2403 	while (remainder) {
2404 		if (idt->idt_state != TASK_ACTIVE) {
2405 			ASSERT((idt->idt_state != TASK_IDLE) &&
2406 			    (idt->idt_state != TASK_COMPLETE));
2407 			return (IDM_STATUS_ABORTED);
2408 		}
2409 
2410 		/* check to see if we need to chunk the data */
2411 		if (remainder > max_dataseglen) {
2412 			chunk = max_dataseglen;
2413 		} else {
2414 			chunk = remainder;
2415 		}
2416 
2417 		/* Data PDU headers will always be sizeof (iscsi_hdr_t) */
2418 		pdu = kmem_cache_alloc(idm.idm_sotx_pdu_cache, KM_SLEEP);
2419 		pdu->isp_ic = ic;
2420 
2421 		/*
2422 		 * We've already built a build a header template
2423 		 * to use during the transfer.  Use this template so that
2424 		 * the SN values stay consistent with any unrelated PDU's
2425 		 * being transmitted.
2426 		 */
2427 		bcopy(&idb->idb_data_hdr_tmpl, pdu->isp_hdr,
2428 		    sizeof (iscsi_hdr_t));
2429 
2430 		/*
2431 		 * Set DataSN, data offset, and flags in BHS
2432 		 * For the prototype build, A = 0, S = 0, U = 0
2433 		 */
2434 		bhs = (iscsi_data_hdr_t *)(pdu->isp_hdr);
2435 
2436 		bhs->datasn		= htonl(idt->idt_exp_datasn++);
2437 
2438 		hton24(bhs->dlength, chunk);
2439 		bhs->offset = htonl(idb->idb_bufoffset + data_offset);
2440 
2441 		if (chunk == remainder) {
2442 			bhs->flags = ISCSI_FLAG_FINAL; /* F bit set to 1 */
2443 		}
2444 
2445 		/* setup data */
2446 		pdu->isp_data	=  (uint8_t *)idb->idb_buf + data_offset;
2447 		pdu->isp_datalen = (uint_t)chunk;
2448 		remainder	-= chunk;
2449 		data_offset	+= chunk;
2450 
2451 		/*
2452 		 * Now that we're done working with idt_exp_datasn,
2453 		 * idt->idt_state and idb->idb_bufoffset we can release
2454 		 * the task lock -- don't want to hold it across the
2455 		 * call to idm_i_so_tx since we could block.
2456 		 */
2457 		mutex_exit(&idt->idt_mutex);
2458 
2459 		/*
2460 		 * Transmit the PDU.  Call the internal routine directly
2461 		 * as there is already implicit ordering.
2462 		 */
2463 		if ((tx_status = idm_i_so_tx(pdu)) != IDM_STATUS_SUCCESS) {
2464 			mutex_enter(&idt->idt_mutex);
2465 			return (tx_status);
2466 		}
2467 
2468 		mutex_enter(&idt->idt_mutex);
2469 		idt->idt_tx_bytes += chunk;
2470 	}
2471 
2472 	return (IDM_STATUS_SUCCESS);
2473 }
2474 
2475 /*
2476  * TX PDU cache
2477  */
2478 /* ARGSUSED */
2479 int
2480 idm_sotx_pdu_constructor(void *hdl, void *arg, int flags)
2481 {
2482 	idm_pdu_t	*pdu = hdl;
2483 
2484 	bzero(pdu, sizeof (idm_pdu_t));
2485 	pdu->isp_hdr = (iscsi_hdr_t *)(pdu + 1); /* Ptr arithmetic */
2486 	pdu->isp_hdrlen = sizeof (iscsi_hdr_t);
2487 	pdu->isp_callback = idm_sotx_cache_pdu_cb;
2488 	pdu->isp_magic = IDM_PDU_MAGIC;
2489 	bzero(pdu->isp_hdr, sizeof (iscsi_hdr_t));
2490 
2491 	return (0);
2492 }
2493 
2494 /* ARGSUSED */
2495 void
2496 idm_sotx_cache_pdu_cb(idm_pdu_t *pdu, idm_status_t status)
2497 {
2498 	/* reset values between use */
2499 	pdu->isp_datalen = 0;
2500 
2501 	kmem_cache_free(idm.idm_sotx_pdu_cache, pdu);
2502 }
2503 
2504 /*
2505  * RX PDU cache
2506  */
2507 /* ARGSUSED */
2508 int
2509 idm_sorx_pdu_constructor(void *hdl, void *arg, int flags)
2510 {
2511 	idm_pdu_t	*pdu = hdl;
2512 
2513 	bzero(pdu, sizeof (idm_pdu_t));
2514 	pdu->isp_magic = IDM_PDU_MAGIC;
2515 	pdu->isp_hdr = (iscsi_hdr_t *)(pdu + 1); /* Ptr arithmetic */
2516 	pdu->isp_callback = idm_sorx_cache_pdu_cb;
2517 
2518 	return (0);
2519 }
2520 
2521 /* ARGSUSED */
2522 static void
2523 idm_sorx_cache_pdu_cb(idm_pdu_t *pdu, idm_status_t status)
2524 {
2525 	pdu->isp_iovlen = 0;
2526 	pdu->isp_sorx_buf = 0;
2527 	kmem_cache_free(idm.idm_sorx_pdu_cache, pdu);
2528 }
2529 
2530 static void
2531 idm_sorx_addl_pdu_cb(idm_pdu_t *pdu, idm_status_t status)
2532 {
2533 	/*
2534 	 * We had to modify our cached RX PDU with a longer header buffer
2535 	 * and/or a longer data buffer.  Release the new buffers and fix
2536 	 * the fields back to what we would expect for a cached RX PDU.
2537 	 */
2538 	if (pdu->isp_flags & IDM_PDU_ADDL_HDR) {
2539 		kmem_free(pdu->isp_hdr, pdu->isp_hdrlen);
2540 	}
2541 	if (pdu->isp_flags & IDM_PDU_ADDL_DATA) {
2542 		kmem_free(pdu->isp_data, pdu->isp_datalen);
2543 	}
2544 	pdu->isp_hdr = (iscsi_hdr_t *)(pdu + 1);
2545 	pdu->isp_hdrlen = sizeof (iscsi_hdr_t);
2546 	pdu->isp_data = NULL;
2547 	pdu->isp_datalen = 0;
2548 	pdu->isp_sorx_buf = 0;
2549 	pdu->isp_callback = idm_sorx_cache_pdu_cb;
2550 	idm_sorx_cache_pdu_cb(pdu, status);
2551 }
2552 
2553 /*
2554  * This thread is only active when I/O is queued for transmit
2555  * because the socket is busy.
2556  */
2557 void
2558 idm_sotx_thread(void *arg)
2559 {
2560 	idm_conn_t	*ic = arg;
2561 	idm_tx_obj_t	*object, *next;
2562 	idm_so_conn_t	*so_conn;
2563 	idm_status_t	status = IDM_STATUS_SUCCESS;
2564 
2565 	idm_conn_hold(ic);
2566 
2567 	mutex_enter(&ic->ic_mutex);
2568 	so_conn = ic->ic_transport_private;
2569 	so_conn->ic_tx_thread_running = B_TRUE;
2570 	so_conn->ic_tx_thread_did = so_conn->ic_tx_thread->t_did;
2571 	cv_signal(&ic->ic_cv);
2572 	mutex_exit(&ic->ic_mutex);
2573 
2574 	mutex_enter(&so_conn->ic_tx_mutex);
2575 
2576 	while (so_conn->ic_tx_thread_running) {
2577 		while (list_is_empty(&so_conn->ic_tx_list)) {
2578 			DTRACE_PROBE1(soconn__tx__sleep, idm_conn_t *, ic);
2579 			cv_wait(&so_conn->ic_tx_cv, &so_conn->ic_tx_mutex);
2580 			DTRACE_PROBE1(soconn__tx__wakeup, idm_conn_t *, ic);
2581 
2582 			if (!so_conn->ic_tx_thread_running) {
2583 				goto tx_bail;
2584 			}
2585 		}
2586 
2587 		object = (idm_tx_obj_t *)list_head(&so_conn->ic_tx_list);
2588 		list_remove(&so_conn->ic_tx_list, object);
2589 		mutex_exit(&so_conn->ic_tx_mutex);
2590 
2591 		switch (object->idm_tx_obj_magic) {
2592 		case IDM_PDU_MAGIC:
2593 			DTRACE_PROBE2(soconn__tx__pdu, idm_conn_t *, ic,
2594 			    idm_pdu_t *, (idm_pdu_t *)object);
2595 
2596 			status = idm_i_so_tx((idm_pdu_t *)object);
2597 			break;
2598 
2599 		case IDM_BUF_MAGIC: {
2600 			idm_buf_t *idb = (idm_buf_t *)object;
2601 			idm_task_t *idt = idb->idb_task_binding;
2602 
2603 			DTRACE_PROBE2(soconn__tx__buf, idm_conn_t *, ic,
2604 			    idm_buf_t *, idb);
2605 
2606 			mutex_enter(&idt->idt_mutex);
2607 			status = idm_so_send_buf_region(idt,
2608 			    idb, 0, idb->idb_xfer_len);
2609 
2610 			/*
2611 			 * TX thread owns the buffer so we expect it to
2612 			 * be "in transport"
2613 			 */
2614 			ASSERT(idb->idb_in_transport);
2615 			if (IDM_CONN_ISTGT(ic)) {
2616 				/*
2617 				 * idm_buf_tx_to_ini_done releases
2618 				 * idt->idt_mutex
2619 				 */
2620 				idm_buf_tx_to_ini_done(idt, idb, status);
2621 			} else {
2622 				idm_so_send_rtt_data_done(idt, idb);
2623 				mutex_exit(&idt->idt_mutex);
2624 			}
2625 			break;
2626 		}
2627 
2628 		default:
2629 			IDM_CONN_LOG(CE_WARN, "idm_sotx_thread: Unknown magic "
2630 			    "(0x%08x)", object->idm_tx_obj_magic);
2631 			status = IDM_STATUS_FAIL;
2632 		}
2633 
2634 		mutex_enter(&so_conn->ic_tx_mutex);
2635 
2636 		if (status != IDM_STATUS_SUCCESS) {
2637 			so_conn->ic_tx_thread_running = B_FALSE;
2638 			idm_conn_event(ic, CE_TRANSPORT_FAIL, status);
2639 		}
2640 	}
2641 
2642 	/*
2643 	 * Before we leave, we need to abort every item remaining in the
2644 	 * TX list.
2645 	 */
2646 
2647 tx_bail:
2648 	object = (idm_tx_obj_t *)list_head(&so_conn->ic_tx_list);
2649 
2650 	while (object != NULL) {
2651 		next = list_next(&so_conn->ic_tx_list, object);
2652 
2653 		list_remove(&so_conn->ic_tx_list, object);
2654 		switch (object->idm_tx_obj_magic) {
2655 		case IDM_PDU_MAGIC:
2656 			idm_pdu_complete((idm_pdu_t *)object,
2657 			    IDM_STATUS_ABORTED);
2658 			break;
2659 
2660 		case IDM_BUF_MAGIC: {
2661 			idm_buf_t *idb = (idm_buf_t *)object;
2662 			idm_task_t *idt = idb->idb_task_binding;
2663 			mutex_exit(&so_conn->ic_tx_mutex);
2664 			mutex_enter(&idt->idt_mutex);
2665 			/*
2666 			 * TX thread owns the buffer so we expect it to
2667 			 * be "in transport"
2668 			 */
2669 			ASSERT(idb->idb_in_transport);
2670 			if (IDM_CONN_ISTGT(ic)) {
2671 				/*
2672 				 * idm_buf_tx_to_ini_done releases
2673 				 * idt->idt_mutex
2674 				 */
2675 				idm_buf_tx_to_ini_done(idt, idb,
2676 				    IDM_STATUS_ABORTED);
2677 			} else {
2678 				idm_so_send_rtt_data_done(idt, idb);
2679 				mutex_exit(&idt->idt_mutex);
2680 			}
2681 			mutex_enter(&so_conn->ic_tx_mutex);
2682 			break;
2683 		}
2684 		default:
2685 			IDM_CONN_LOG(CE_WARN,
2686 			    "idm_sotx_thread: Unexpected magic "
2687 			    "(0x%08x)", object->idm_tx_obj_magic);
2688 		}
2689 
2690 		object = next;
2691 	}
2692 
2693 	mutex_exit(&so_conn->ic_tx_mutex);
2694 	idm_conn_rele(ic);
2695 	thread_exit();
2696 	/*NOTREACHED*/
2697 }
2698