xref: /illumos-gate/usr/src/uts/common/io/idm/idm_so.c (revision 6e6c7d67bf5ba2efa13619acd59395d0f278ee75)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 /*
26  * Copyright (c) 2013 by Delphix. All rights reserved.
27  * Copyright (c) 2017, Joyent, Inc.  All rights reserved.
28  */
29 
30 #include <sys/conf.h>
31 #include <sys/stat.h>
32 #include <sys/file.h>
33 #include <sys/ddi.h>
34 #include <sys/sunddi.h>
35 #include <sys/modctl.h>
36 #include <sys/priv.h>
37 #include <sys/cpuvar.h>
38 #include <sys/socket.h>
39 #include <sys/strsubr.h>
40 #include <sys/sysmacros.h>
41 #include <sys/sdt.h>
42 #include <netinet/tcp.h>
43 #include <inet/tcp.h>
44 #include <sys/socketvar.h>
45 #include <sys/pathname.h>
46 #include <sys/fs/snode.h>
47 #include <sys/fs/dv_node.h>
48 #include <sys/vnode.h>
49 #include <netinet/in.h>
50 #include <net/if.h>
51 #include <sys/sockio.h>
52 #include <sys/ksocket.h>
53 #include <sys/filio.h>		/* FIONBIO */
54 #include <sys/iscsi_protocol.h>
55 #include <sys/idm/idm.h>
56 #include <sys/idm/idm_so.h>
57 #include <sys/idm/idm_text.h>
58 
/*
 * Seconds to sleep between connect attempts while a non-blocking TCP
 * connect is still reported as in progress (EINPROGRESS/EALREADY); used
 * by idm_so_ini_conn_connect().
 */
#define	IN_PROGRESS_DELAY	1

/*
 * in6addr_any is currently all zeroes, but use the macro in case this
 * ever changes.
 */
static const struct in6_addr in6addr_any = IN6ADDR_ANY_INIT;
66 
/*
 * Forward declarations for file-local helpers: PDU completion callbacks,
 * connection setup/teardown shared by initiator and target, socket option
 * helpers, and the data transfer routines.
 */
static void idm_sorx_cache_pdu_cb(idm_pdu_t *pdu, idm_status_t status);
static void idm_sorx_addl_pdu_cb(idm_pdu_t *pdu, idm_status_t status);
static void idm_sotx_cache_pdu_cb(idm_pdu_t *pdu, idm_status_t status);

static idm_status_t idm_so_conn_create_common(idm_conn_t *ic, ksocket_t new_so);
static void idm_so_conn_destroy_common(idm_conn_t *ic);
static void idm_so_conn_connect_common(idm_conn_t *ic);

static void idm_set_ini_preconnect_options(idm_so_conn_t *sc,
    boolean_t boot_conn);
static void idm_set_postconnect_options(ksocket_t so);
static idm_status_t idm_i_so_tx(idm_pdu_t *pdu);

static idm_status_t idm_sorecvdata(idm_conn_t *ic, idm_pdu_t *pdu);
static void idm_so_send_rtt_data(idm_conn_t *ic, idm_task_t *idt,
    idm_buf_t *idb, uint32_t offset, uint32_t length);
static void idm_so_send_rtt_data_done(idm_task_t *idt, idm_buf_t *idb);
static idm_status_t idm_so_send_buf_region(idm_task_t *idt,
    idm_buf_t *idb, uint32_t buf_region_offset, uint32_t buf_region_length);

static uint32_t idm_fill_iov(idm_pdu_t *pdu, idm_buf_t *idb,
    uint32_t ro, uint32_t dlength);

static idm_status_t idm_so_handle_digest(idm_conn_t *it,
    nvpair_t *digest_choice, const idm_kv_xlate_t *ikvx);

static void idm_so_socket_set_nonblock(struct sonode *node);
static void idm_so_socket_set_block(struct sonode *node);

/*
 * Transport ops prototypes.  These are the handlers installed in the
 * idm_so_transport_ops table below.
 */
static void idm_so_tx(idm_conn_t *ic, idm_pdu_t *pdu);
static idm_status_t idm_so_buf_tx_to_ini(idm_task_t *idt, idm_buf_t *idb);
static idm_status_t idm_so_buf_rx_from_ini(idm_task_t *idt, idm_buf_t *idb);
static void idm_so_rx_datain(idm_conn_t *ic, idm_pdu_t *pdu);
static void idm_so_rx_rtt(idm_conn_t *ic, idm_pdu_t *pdu);
static void idm_so_rx_dataout(idm_conn_t *ic, idm_pdu_t *pdu);
static idm_status_t idm_so_free_task_rsrc(idm_task_t *idt);
static kv_status_t idm_so_negotiate_key_values(idm_conn_t *it,
    nvlist_t *request_nvl, nvlist_t *response_nvl, nvlist_t *negotiated_nvl);
static void idm_so_notice_key_values(idm_conn_t *it,
    nvlist_t *negotiated_nvl);
static kv_status_t idm_so_declare_key_values(idm_conn_t *it,
    nvlist_t *config_nvl, nvlist_t *outgoing_nvl);
static boolean_t idm_so_conn_is_capable(idm_conn_req_t *ic,
    idm_transport_caps_t *caps);
static idm_status_t idm_so_buf_alloc(idm_buf_t *idb, uint64_t buflen);
static void idm_so_buf_free(idm_buf_t *idb);
static idm_status_t idm_so_buf_setup(idm_buf_t *idb);
static void idm_so_buf_teardown(idm_buf_t *idb);
static idm_status_t idm_so_tgt_svc_create(idm_svc_req_t *sr, idm_svc_t *is);
static void idm_so_tgt_svc_destroy(idm_svc_t *is);
static idm_status_t idm_so_tgt_svc_online(idm_svc_t *is);
static void idm_so_tgt_svc_offline(idm_svc_t *is);
static void idm_so_tgt_conn_destroy(idm_conn_t *ic);
static idm_status_t idm_so_tgt_conn_connect(idm_conn_t *ic);
static void idm_so_conn_disconnect(idm_conn_t *ic);
static idm_status_t idm_so_ini_conn_create(idm_conn_req_t *cr, idm_conn_t *ic);
static void idm_so_ini_conn_destroy(idm_conn_t *ic);
static idm_status_t idm_so_ini_conn_connect(idm_conn_t *ic);
128 
/*
 * IDM Native Sockets transport operations
 *
 * idm_so_init() installs this table as the transport's it_ops.  The
 * initializer is positional: the trailing comment on each line names the
 * idm_transport_ops_t member that entry is intended to fill, so entries
 * must not be reordered.  NULL entries mean this file supplies no handler
 * for that slot.  idm_so_conn_disconnect is installed for both the target
 * and the initiator disconnect slots.
 */
static
idm_transport_ops_t idm_so_transport_ops = {
	idm_so_tx,			/* it_tx_pdu */
	idm_so_buf_tx_to_ini,		/* it_buf_tx_to_ini */
	idm_so_buf_rx_from_ini,		/* it_buf_rx_from_ini */
	idm_so_rx_datain,		/* it_rx_datain */
	idm_so_rx_rtt,			/* it_rx_rtt */
	idm_so_rx_dataout,		/* it_rx_dataout */
	NULL,				/* it_alloc_conn_rsrc */
	NULL,				/* it_free_conn_rsrc */
	NULL,				/* it_tgt_enable_datamover */
	NULL,				/* it_ini_enable_datamover */
	NULL,				/* it_conn_terminate */
	idm_so_free_task_rsrc,		/* it_free_task_rsrc */
	idm_so_negotiate_key_values,	/* it_negotiate_key_values */
	idm_so_notice_key_values,	/* it_notice_key_values */
	idm_so_conn_is_capable,		/* it_conn_is_capable */
	idm_so_buf_alloc,		/* it_buf_alloc */
	idm_so_buf_free,		/* it_buf_free */
	idm_so_buf_setup,		/* it_buf_setup */
	idm_so_buf_teardown,		/* it_buf_teardown */
	idm_so_tgt_svc_create,		/* it_tgt_svc_create */
	idm_so_tgt_svc_destroy,		/* it_tgt_svc_destroy */
	idm_so_tgt_svc_online,		/* it_tgt_svc_online */
	idm_so_tgt_svc_offline,		/* it_tgt_svc_offline */
	idm_so_tgt_conn_destroy,	/* it_tgt_conn_destroy */
	idm_so_tgt_conn_connect,	/* it_tgt_conn_connect */
	idm_so_conn_disconnect,		/* it_tgt_conn_disconnect */
	idm_so_ini_conn_create,		/* it_ini_conn_create */
	idm_so_ini_conn_destroy,	/* it_ini_conn_destroy */
	idm_so_ini_conn_connect,	/* it_ini_conn_connect */
	idm_so_conn_disconnect,		/* it_ini_conn_disconnect */
	idm_so_declare_key_values	/* it_declare_key_values */
};
166 
/* Initialized in idm_so_init() and destroyed in idm_so_fini(). */
kmutex_t	idm_so_timed_socket_mutex;

/*
 * Values applied as SO_SNDBUF / SO_RCVBUF by idm_set_postconnect_options().
 */
int32_t idm_so_sndbuf = IDM_SNDBUF_SIZE;
int32_t idm_so_rcvbuf = IDM_RCVBUF_SIZE;
171 
172 /*
173  * idm_so_init()
174  * Sockets transport initialization
175  */
176 void
177 idm_so_init(idm_transport_t *it)
178 {
179 	/* Cache for IDM Data and R2T Transmit PDU's */
180 	idm.idm_sotx_pdu_cache = kmem_cache_create("idm_tx_pdu_cache",
181 	    sizeof (idm_pdu_t) + sizeof (iscsi_hdr_t), 8,
182 	    &idm_sotx_pdu_constructor, NULL, NULL, NULL, NULL, KM_SLEEP);
183 
184 	/* Cache for IDM Receive PDU's */
185 	idm.idm_sorx_pdu_cache = kmem_cache_create("idm_rx_pdu_cache",
186 	    sizeof (idm_pdu_t) + IDM_SORX_CACHE_HDRLEN, 8,
187 	    &idm_sorx_pdu_constructor, NULL, NULL, NULL, NULL, KM_SLEEP);
188 
189 	/* 128k buffer cache */
190 	idm.idm_so_128k_buf_cache = kmem_cache_create("idm_128k_buf_cache",
191 	    IDM_SO_BUF_CACHE_UB, 8, NULL, NULL, NULL, NULL, NULL, KM_SLEEP);
192 
193 	/* Set the sockets transport ops */
194 	it->it_ops = &idm_so_transport_ops;
195 
196 	mutex_init(&idm_so_timed_socket_mutex, NULL, MUTEX_DEFAULT, NULL);
197 
198 }
199 
200 /*
201  * idm_so_fini()
202  * Sockets transport teardown
203  */
204 void
205 idm_so_fini(void)
206 {
207 	kmem_cache_destroy(idm.idm_so_128k_buf_cache);
208 	kmem_cache_destroy(idm.idm_sotx_pdu_cache);
209 	kmem_cache_destroy(idm.idm_sorx_pdu_cache);
210 	mutex_destroy(&idm_so_timed_socket_mutex);
211 }
212 
213 ksocket_t
214 idm_socreate(int domain, int type, int protocol)
215 {
216 	ksocket_t ks;
217 
218 	if (!ksocket_socket(&ks, domain, type, protocol, KSOCKET_NOSLEEP,
219 	    CRED())) {
220 		return (ks);
221 	} else {
222 		return (NULL);
223 	}
224 }
225 
226 /*
227  * idm_soshutdown will disconnect the socket and prevent subsequent PDU
228  * reception and transmission.  The sonode still exists but its state
229  * gets modified to indicate it is no longer connected.  Calls to
230  * idm_sorecv/idm_iov_sorecv will return so idm_soshutdown can be used
231  * regain control of a thread stuck in idm_sorecv.
232  */
233 void
234 idm_soshutdown(ksocket_t so)
235 {
236 	(void) ksocket_shutdown(so, SHUT_RDWR, CRED());
237 }
238 
239 /*
240  * idm_sodestroy releases all resources associated with a socket previously
241  * created with idm_socreate.  The socket must be shutdown using
242  * idm_soshutdown before the socket is destroyed with idm_sodestroy,
243  * otherwise undefined behavior will result.
244  */
245 void
246 idm_sodestroy(ksocket_t ks)
247 {
248 	(void) ksocket_close(ks, CRED());
249 }
250 
251 /*
252  * Function to compare two addresses in sockaddr_storage format
253  */
254 
255 int
256 idm_ss_compare(const struct sockaddr_storage *cmp_ss1,
257     const struct sockaddr_storage *cmp_ss2,
258     boolean_t v4_mapped_as_v4,
259     boolean_t compare_ports)
260 {
261 	struct sockaddr_storage			mapped_v4_ss1, mapped_v4_ss2;
262 	const struct sockaddr_storage		*ss1, *ss2;
263 	struct in_addr				*in1, *in2;
264 	struct in6_addr				*in61, *in62;
265 	int i;
266 
267 	/*
268 	 * Normalize V4-mapped IPv6 addresses into V4 format if
269 	 * v4_mapped_as_v4 is B_TRUE.
270 	 */
271 	ss1 = cmp_ss1;
272 	ss2 = cmp_ss2;
273 	if (v4_mapped_as_v4 && (ss1->ss_family == AF_INET6)) {
274 		in61 = &((struct sockaddr_in6 *)ss1)->sin6_addr;
275 		if (IN6_IS_ADDR_V4MAPPED(in61)) {
276 			bzero(&mapped_v4_ss1, sizeof (mapped_v4_ss1));
277 			mapped_v4_ss1.ss_family = AF_INET;
278 			((struct sockaddr_in *)&mapped_v4_ss1)->sin_port =
279 			    ((struct sockaddr_in *)ss1)->sin_port;
280 			IN6_V4MAPPED_TO_INADDR(in61,
281 			    &((struct sockaddr_in *)&mapped_v4_ss1)->sin_addr);
282 			ss1 = &mapped_v4_ss1;
283 		}
284 	}
285 	ss2 = cmp_ss2;
286 	if (v4_mapped_as_v4 && (ss2->ss_family == AF_INET6)) {
287 		in62 = &((struct sockaddr_in6 *)ss2)->sin6_addr;
288 		if (IN6_IS_ADDR_V4MAPPED(in62)) {
289 			bzero(&mapped_v4_ss2, sizeof (mapped_v4_ss2));
290 			mapped_v4_ss2.ss_family = AF_INET;
291 			((struct sockaddr_in *)&mapped_v4_ss2)->sin_port =
292 			    ((struct sockaddr_in *)ss2)->sin_port;
293 			IN6_V4MAPPED_TO_INADDR(in62,
294 			    &((struct sockaddr_in *)&mapped_v4_ss2)->sin_addr);
295 			ss2 = &mapped_v4_ss2;
296 		}
297 	}
298 
299 	/*
300 	 * Compare ports, then address family, then ip address
301 	 */
302 	if (compare_ports &&
303 	    (((struct sockaddr_in *)ss1)->sin_port !=
304 	    ((struct sockaddr_in *)ss2)->sin_port)) {
305 		if (((struct sockaddr_in *)ss1)->sin_port >
306 		    ((struct sockaddr_in *)ss2)->sin_port)
307 			return (1);
308 		else
309 			return (-1);
310 	}
311 
312 	/*
313 	 * ports are the same
314 	 */
315 	if (ss1->ss_family != ss2->ss_family) {
316 		if (ss1->ss_family == AF_INET)
317 			return (1);
318 		else
319 			return (-1);
320 	}
321 
322 	/*
323 	 * address families are the same
324 	 */
325 	if (ss1->ss_family == AF_INET) {
326 		in1 = &((struct sockaddr_in *)ss1)->sin_addr;
327 		in2 = &((struct sockaddr_in *)ss2)->sin_addr;
328 
329 		if (in1->s_addr > in2->s_addr)
330 			return (1);
331 		else if (in1->s_addr < in2->s_addr)
332 			return (-1);
333 		else
334 			return (0);
335 	} else if (ss1->ss_family == AF_INET6) {
336 		in61 = &((struct sockaddr_in6 *)ss1)->sin6_addr;
337 		in62 = &((struct sockaddr_in6 *)ss2)->sin6_addr;
338 
339 		for (i = 0; i < 4; i++) {
340 			if (in61->s6_addr32[i] > in62->s6_addr32[i])
341 				return (1);
342 			else if (in61->s6_addr32[i] < in62->s6_addr32[i])
343 				return (-1);
344 		}
345 		return (0);
346 	}
347 
348 	return (1);
349 }
350 
351 /*
352  * IP address filter functions to flag addresses that should not
353  * go out to initiators through discovery.
354  */
355 static boolean_t
356 idm_v4_addr_okay(struct in_addr *in_addr)
357 {
358 	in_addr_t addr = ntohl(in_addr->s_addr);
359 
360 	if ((INADDR_NONE == addr) ||
361 	    (IN_MULTICAST(addr)) ||
362 	    ((addr >> IN_CLASSA_NSHIFT) == 0) ||
363 	    ((addr >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET)) {
364 		return (B_FALSE);
365 	}
366 	return (B_TRUE);
367 }
368 
369 static boolean_t
370 idm_v6_addr_okay(struct in6_addr *addr6)
371 {
372 
373 	if ((IN6_IS_ADDR_UNSPECIFIED(addr6)) ||
374 	    (IN6_IS_ADDR_LOOPBACK(addr6)) ||
375 	    (IN6_IS_ADDR_MULTICAST(addr6)) ||
376 	    (IN6_IS_ADDR_V4MAPPED(addr6)) ||
377 	    (IN6_IS_ADDR_V4COMPAT(addr6)) ||
378 	    (IN6_IS_ADDR_LINKLOCAL(addr6))) {
379 		return (B_FALSE);
380 	}
381 	return (B_TRUE);
382 }
383 
384 /*
385  * idm_get_ipaddr will retrieve a list of IP Addresses which the host is
386  * configured with by sending down a sequence of kernel ioctl to IP STREAMS.
387  */
388 int
389 idm_get_ipaddr(idm_addr_list_t **ipaddr_p)
390 {
391 	ksocket_t 		so4, so6;
392 	struct lifnum		lifn;
393 	struct lifconf		lifc;
394 	struct lifreq		*lp;
395 	int			rval;
396 	int			numifs;
397 	int			bufsize;
398 	void			*buf;
399 	int			i, j, n, rc;
400 	struct sockaddr_storage	ss;
401 	struct sockaddr_in	*sin;
402 	struct sockaddr_in6	*sin6;
403 	idm_addr_t		*ip;
404 	idm_addr_list_t		*ipaddr = NULL;
405 	int			size_ipaddr;
406 
407 	*ipaddr_p = NULL;
408 	size_ipaddr = 0;
409 	buf = NULL;
410 
411 	/* create an ipv4 and ipv6 UDP socket */
412 	if ((so6 = idm_socreate(PF_INET6, SOCK_DGRAM, 0)) == NULL)
413 		return (0);
414 	if ((so4 = idm_socreate(PF_INET, SOCK_DGRAM, 0)) == NULL) {
415 		idm_sodestroy(so6);
416 		return (0);
417 	}
418 
419 
420 retry_count:
421 	/* snapshot the current number of interfaces */
422 	lifn.lifn_family = PF_UNSPEC;
423 	lifn.lifn_flags = LIFC_NOXMIT | LIFC_TEMPORARY | LIFC_ALLZONES;
424 	lifn.lifn_count = 0;
425 	/* use vp6 for ioctls with unspecified families by default */
426 	if (ksocket_ioctl(so6, SIOCGLIFNUM, (intptr_t)&lifn, &rval, CRED())
427 	    != 0) {
428 		goto cleanup;
429 	}
430 
431 	numifs = lifn.lifn_count;
432 	if (numifs <= 0) {
433 		goto cleanup;
434 	}
435 
436 	/* allocate extra room in case more interfaces appear */
437 	numifs += 10;
438 
439 	/* get the interface names and ip addresses */
440 	bufsize = numifs * sizeof (struct lifreq);
441 	buf = kmem_alloc(bufsize, KM_SLEEP);
442 
443 	lifc.lifc_family = AF_UNSPEC;
444 	lifc.lifc_flags = LIFC_NOXMIT | LIFC_TEMPORARY | LIFC_ALLZONES;
445 	lifc.lifc_len = bufsize;
446 	lifc.lifc_buf = buf;
447 	rc = ksocket_ioctl(so6, SIOCGLIFCONF, (intptr_t)&lifc, &rval, CRED());
448 	if (rc != 0) {
449 		goto cleanup;
450 	}
451 	/* if our extra room is used up, try again */
452 	if (bufsize <= lifc.lifc_len) {
453 		kmem_free(buf, bufsize);
454 		buf = NULL;
455 		goto retry_count;
456 	}
457 	/* calc actual number of ifconfs */
458 	n = lifc.lifc_len / sizeof (struct lifreq);
459 
460 	/* get ip address */
461 	if (n > 0) {
462 		size_ipaddr = sizeof (idm_addr_list_t) +
463 		    (n - 1) * sizeof (idm_addr_t);
464 		ipaddr = kmem_zalloc(size_ipaddr, KM_SLEEP);
465 	} else {
466 		goto cleanup;
467 	}
468 
469 	/*
470 	 * Examine the array of interfaces and filter uninteresting ones
471 	 */
472 	for (i = 0, j = 0, lp = lifc.lifc_req; i < n; i++, lp++) {
473 
474 		/*
475 		 * Copy the address as the SIOCGLIFFLAGS ioctl is destructive
476 		 */
477 		ss = lp->lifr_addr;
478 		/*
479 		 * fetch the flags using the socket of the correct family
480 		 */
481 		switch (ss.ss_family) {
482 		case AF_INET:
483 			rc = ksocket_ioctl(so4, SIOCGLIFFLAGS, (intptr_t)lp,
484 			    &rval, CRED());
485 			break;
486 		case AF_INET6:
487 			rc = ksocket_ioctl(so6, SIOCGLIFFLAGS, (intptr_t)lp,
488 			    &rval, CRED());
489 			break;
490 		default:
491 			continue;
492 		}
493 		if (rc == 0) {
494 			/*
495 			 * If we got the flags, skip uninteresting
496 			 * interfaces based on flags
497 			 */
498 			if ((lp->lifr_flags & IFF_UP) != IFF_UP)
499 				continue;
500 			if (lp->lifr_flags &
501 			    (IFF_ANYCAST|IFF_NOLOCAL|IFF_DEPRECATED))
502 				continue;
503 		}
504 
505 		/* save ip address */
506 		ip = &ipaddr->al_addrs[j];
507 		switch (ss.ss_family) {
508 		case AF_INET:
509 			sin = (struct sockaddr_in *)&ss;
510 			if (!idm_v4_addr_okay(&sin->sin_addr))
511 				continue;
512 			ip->a_addr.i_addr.in4 = sin->sin_addr;
513 			ip->a_addr.i_insize = sizeof (struct in_addr);
514 			break;
515 		case AF_INET6:
516 			sin6 = (struct sockaddr_in6 *)&ss;
517 			if (!idm_v6_addr_okay(&sin6->sin6_addr))
518 				continue;
519 			ip->a_addr.i_addr.in6 = sin6->sin6_addr;
520 			ip->a_addr.i_insize = sizeof (struct in6_addr);
521 			break;
522 		default:
523 			continue;
524 		}
525 		j++;
526 	}
527 
528 	if (j == 0) {
529 		/* no valid ifaddr */
530 		kmem_free(ipaddr, size_ipaddr);
531 		size_ipaddr = 0;
532 		ipaddr = NULL;
533 	} else {
534 		ipaddr->al_out_cnt = j;
535 	}
536 
537 
538 cleanup:
539 	idm_sodestroy(so6);
540 	idm_sodestroy(so4);
541 
542 	if (buf != NULL)
543 		kmem_free(buf, bufsize);
544 
545 	*ipaddr_p = ipaddr;
546 	return (size_ipaddr);
547 }
548 
549 int
550 idm_sorecv(ksocket_t so, void *msg, size_t len)
551 {
552 	iovec_t iov;
553 
554 	ASSERT(so != NULL);
555 	ASSERT(len != 0);
556 
557 	/*
558 	 * Fill in iovec and receive data
559 	 */
560 	iov.iov_base = msg;
561 	iov.iov_len = len;
562 
563 	return (idm_iov_sorecv(so, &iov, 1, len));
564 }
565 
566 /*
567  * idm_sosendto - Sends a buffered data on a non-connected socket.
568  *
569  * This function puts the data provided on the wire by calling sosendmsg.
570  * It will return only when all the data has been sent or if an error
571  * occurs.
572  *
573  * Returns 0 for success, the socket errno value if sosendmsg fails, and
574  * -1 if sosendmsg returns success but uio_resid != 0
575  */
576 int
577 idm_sosendto(ksocket_t so, void *buff, size_t len,
578     struct sockaddr *name, socklen_t namelen)
579 {
580 	struct msghdr		msg;
581 	struct iovec		iov[1];
582 	int			error;
583 	size_t			sent = 0;
584 
585 	iov[0].iov_base	= buff;
586 	iov[0].iov_len	= len;
587 
588 	/* Initialization of the message header. */
589 	bzero(&msg, sizeof (msg));
590 	msg.msg_iov	= iov;
591 	msg.msg_iovlen	= 1;
592 	msg.msg_name	= name;
593 	msg.msg_namelen	= namelen;
594 
595 	if ((error = ksocket_sendmsg(so, &msg, 0, &sent, CRED())) == 0) {
596 		/* Data sent */
597 		if (sent == len) {
598 			/* All data sent.  Success. */
599 			return (0);
600 		} else {
601 			/* Not all data was sent.  Failure */
602 			return (-1);
603 		}
604 	}
605 
606 	/* Send failed */
607 	return (error);
608 }
609 
610 /*
611  * idm_iov_sosend - Sends an iovec on a connection.
612  *
613  * This function puts the data provided on the wire by calling sosendmsg.
614  * It will return only when all the data has been sent or if an error
615  * occurs.
616  *
617  * Returns 0 for success, the socket errno value if sosendmsg fails, and
618  * -1 if sosendmsg returns success but uio_resid != 0
619  */
620 int
621 idm_iov_sosend(ksocket_t so, iovec_t *iop, int iovlen, size_t total_len)
622 {
623 	struct msghdr		msg;
624 	int			error;
625 	size_t 			sent = 0;
626 
627 	ASSERT(iop != NULL);
628 
629 	/* Initialization of the message header. */
630 	bzero(&msg, sizeof (msg));
631 	msg.msg_iov	= iop;
632 	msg.msg_iovlen	= iovlen;
633 
634 	if ((error = ksocket_sendmsg(so, &msg, 0, &sent, CRED()))
635 	    == 0) {
636 		/* Data sent */
637 		if (sent == total_len) {
638 			/* All data sent.  Success. */
639 			return (0);
640 		} else {
641 			/* Not all data was sent.  Failure */
642 			return (-1);
643 		}
644 	}
645 
646 	/* Send failed */
647 	return (error);
648 }
649 
650 /*
651  * idm_iov_sorecv - Receives an iovec from a connection
652  *
653  * This function gets the data asked for from the socket.  It will return
654  * only when all the requested data has been retrieved or if an error
655  * occurs.
656  *
657  * Returns 0 for success, the socket errno value if sorecvmsg fails, and
658  * -1 if sorecvmsg returns success but uio_resid != 0
659  */
660 int
661 idm_iov_sorecv(ksocket_t so, iovec_t *iop, int iovlen, size_t total_len)
662 {
663 	struct msghdr		msg;
664 	int			error;
665 	size_t			recv;
666 	int 			flags;
667 
668 	ASSERT(iop != NULL);
669 
670 	/* Initialization of the message header. */
671 	bzero(&msg, sizeof (msg));
672 	msg.msg_iov	= iop;
673 	msg.msg_iovlen	= iovlen;
674 	flags		= MSG_WAITALL;
675 
676 	if ((error = ksocket_recvmsg(so, &msg, flags, &recv, CRED()))
677 	    == 0) {
678 		/* Received data */
679 		if (recv == total_len) {
680 			/* All requested data received.  Success */
681 			return (0);
682 		} else {
683 			/*
684 			 * Not all data was received.  The connection has
685 			 * probably failed.
686 			 */
687 			return (-1);
688 		}
689 	}
690 
691 	/* Receive failed */
692 	return (error);
693 }
694 
695 static void
696 idm_set_ini_preconnect_options(idm_so_conn_t *sc, boolean_t boot_conn)
697 {
698 	int	conn_abort = 10000;
699 	int	conn_notify = 2000;
700 	int	abort = 30000;
701 
702 	/* Pre-connect socket options */
703 	(void) ksocket_setsockopt(sc->ic_so, IPPROTO_TCP,
704 	    TCP_CONN_NOTIFY_THRESHOLD, (char *)&conn_notify, sizeof (int),
705 	    CRED());
706 	if (boot_conn == B_FALSE) {
707 		(void) ksocket_setsockopt(sc->ic_so, IPPROTO_TCP,
708 		    TCP_CONN_ABORT_THRESHOLD, (char *)&conn_abort, sizeof (int),
709 		    CRED());
710 		(void) ksocket_setsockopt(sc->ic_so, IPPROTO_TCP,
711 		    TCP_ABORT_THRESHOLD,
712 		    (char *)&abort, sizeof (int), CRED());
713 	}
714 }
715 
716 static void
717 idm_set_postconnect_options(ksocket_t ks)
718 {
719 	const int	on = 1;
720 
721 	/* Set connect options */
722 	(void) ksocket_setsockopt(ks, SOL_SOCKET, SO_RCVBUF,
723 	    (char *)&idm_so_rcvbuf, sizeof (idm_so_rcvbuf), CRED());
724 	(void) ksocket_setsockopt(ks, SOL_SOCKET, SO_SNDBUF,
725 	    (char *)&idm_so_sndbuf, sizeof (idm_so_sndbuf), CRED());
726 	(void) ksocket_setsockopt(ks, IPPROTO_TCP, TCP_NODELAY,
727 	    (char *)&on, sizeof (on), CRED());
728 }
729 
/*
 * n2h24()
 *
 * Assemble the three bytes at 'ptr' into a host-order integer, treating
 * ptr[0] as the most significant byte.  idm_sorecvhdr() uses it to decode
 * the data segment length field of a received header.
 */
static uint32_t
n2h24(const uchar_t *ptr)
{
	uint32_t	val;

	val = (uint32_t)ptr[0] << 16;
	val |= (uint32_t)ptr[1] << 8;
	val |= (uint32_t)ptr[2];

	return (val);
}
735 
736 static boolean_t
737 idm_dataseglenokay(idm_conn_t *ic, idm_pdu_t *pdu)
738 {
739 	iscsi_hdr_t	*bhs;
740 
741 	if (ic->ic_conn_type == CONN_TYPE_TGT &&
742 	    pdu->isp_datalen > ic->ic_conn_params.max_recv_dataseglen) {
743 		IDM_CONN_LOG(CE_WARN,
744 		    "idm_dataseglenokay: exceeded the max data segment length");
745 		return (B_FALSE);
746 	}
747 
748 	bhs = pdu->isp_hdr;
749 	/*
750 	 * Filter out any RFC3720 data-size violations.
751 	 */
752 	switch (IDM_PDU_OPCODE(pdu)) {
753 	case ISCSI_OP_SCSI_TASK_MGT_MSG:
754 	case ISCSI_OP_SCSI_TASK_MGT_RSP:
755 	case ISCSI_OP_RTT_RSP:
756 	case ISCSI_OP_LOGOUT_CMD:
757 		/*
758 		 * Data-segment not allowed and additional headers not allowed.
759 		 * (both must be zero according to the RFC3720.)
760 		 */
761 		if (bhs->hlength != 0 || pdu->isp_datalen != 0)
762 			return (B_FALSE);
763 		break;
764 	case ISCSI_OP_NOOP_OUT:
765 	case ISCSI_OP_LOGIN_CMD:
766 	case ISCSI_OP_TEXT_CMD:
767 	case ISCSI_OP_SNACK_CMD:
768 	case ISCSI_OP_NOOP_IN:
769 	case ISCSI_OP_SCSI_RSP:
770 	case ISCSI_OP_LOGIN_RSP:
771 	case ISCSI_OP_TEXT_RSP:
772 	case ISCSI_OP_SCSI_DATA_RSP:
773 	case ISCSI_OP_LOGOUT_RSP:
774 	case ISCSI_OP_ASYNC_EVENT:
775 	case ISCSI_OP_REJECT_MSG:
776 		/*
777 		 * Additional headers not allowed.
778 		 * (must be zero according to RFC3720.)
779 		 */
780 		if (bhs->hlength)
781 			return (B_FALSE);
782 		break;
783 	case ISCSI_OP_SCSI_CMD:
784 		/*
785 		 * See RFC3720, section 10.3
786 		 *
787 		 * For pure read cmds, data-segment-length must be zero.
788 		 * For non-final transfers, data-size must be even number of
789 		 * 4-byte words.
790 		 * For any transfer, an expected byte count must be provided.
791 		 * For bidirectional transfers, an additional-header must be
792 		 * provided (for the read byte-count.)
793 		 */
794 		if (pdu->isp_datalen) {
795 			if ((bhs->flags & (ISCSI_FLAG_CMD_READ |
796 			    ISCSI_FLAG_CMD_WRITE)) == ISCSI_FLAG_CMD_READ)
797 				return (B_FALSE);
798 			if ((bhs->flags & ISCSI_FLAG_FINAL) == 0 &&
799 			    ((pdu->isp_datalen & 0x3) != 0))
800 				return (B_FALSE);
801 		}
802 		if (bhs->flags & (ISCSI_FLAG_CMD_READ |
803 		    ISCSI_FLAG_CMD_WRITE)) {
804 			iscsi_scsi_cmd_hdr_t *cmdhdr =
805 			    (iscsi_scsi_cmd_hdr_t *)bhs;
806 			/*
807 			 * we're transfering some data, we must have a
808 			 * byte count
809 			 */
810 			if (cmdhdr->data_length == 0)
811 				return (B_FALSE);
812 		}
813 		break;
814 	case ISCSI_OP_SCSI_DATA:
815 		/*
816 		 * See RFC3720, section 10.7
817 		 *
818 		 * Additional headers aren't allowed, and the data-size must
819 		 * be an even number of 4-byte words (unless the final bit
820 		 * is set.)
821 		 */
822 		if (bhs->hlength)
823 			return (B_FALSE);
824 		if ((bhs->flags & ISCSI_FLAG_FINAL) == 0 &&
825 		    ((pdu->isp_datalen & 0x3) != 0))
826 			return (B_FALSE);
827 		break;
828 	default:
829 		break;
830 	}
831 	return (B_TRUE);
832 }
833 
834 static idm_status_t
835 idm_sorecvhdr(idm_conn_t *ic, idm_pdu_t *pdu)
836 {
837 	iscsi_hdr_t	*bhs;
838 	uint32_t	hdr_digest_crc;
839 	uint32_t	crc_calculated;
840 	void		*new_hdr;
841 	int		ahslen = 0;
842 	int		total_len = 0;
843 	int		iovlen = 0;
844 	struct iovec	iov[2];
845 	idm_so_conn_t	*so_conn;
846 	int		rc;
847 
848 	so_conn = ic->ic_transport_private;
849 
850 	/*
851 	 * Read BHS
852 	 */
853 	bhs = pdu->isp_hdr;
854 	rc = idm_sorecv(so_conn->ic_so, pdu->isp_hdr, sizeof (iscsi_hdr_t));
855 	if (rc != IDM_STATUS_SUCCESS) {
856 		return (IDM_STATUS_FAIL);
857 	}
858 
859 	/*
860 	 * Check actual AHS length against the amount available in the buffer
861 	 */
862 	pdu->isp_hdrlen = sizeof (iscsi_hdr_t) +
863 	    (bhs->hlength * sizeof (uint32_t));
864 	pdu->isp_datalen = n2h24(bhs->dlength);
865 
866 	if (!idm_dataseglenokay(ic, pdu)) {
867 		IDM_CONN_LOG(CE_WARN,
868 		    "idm_sorecvhdr: invalid data segment length");
869 		return (IDM_STATUS_FAIL);
870 	}
871 	if (bhs->hlength > IDM_SORX_CACHE_AHSLEN) {
872 		/* Allocate a new header segment and change the callback */
873 		new_hdr = kmem_alloc(pdu->isp_hdrlen, KM_SLEEP);
874 		bcopy(pdu->isp_hdr, new_hdr, sizeof (iscsi_hdr_t));
875 		pdu->isp_hdr = new_hdr;
876 		pdu->isp_flags |= IDM_PDU_ADDL_HDR;
877 
878 		/*
879 		 * This callback will restore the expected values after
880 		 * the RX PDU has been processed.
881 		 */
882 		pdu->isp_callback = idm_sorx_addl_pdu_cb;
883 	}
884 
885 	/*
886 	 * Setup receipt of additional header and header digest (if enabled).
887 	 */
888 	if (bhs->hlength > 0) {
889 		iov[iovlen].iov_base = (caddr_t)(pdu->isp_hdr + 1);
890 		ahslen = pdu->isp_hdrlen - sizeof (iscsi_hdr_t);
891 		iov[iovlen].iov_len = ahslen;
892 		total_len += iov[iovlen].iov_len;
893 		iovlen++;
894 	}
895 
896 	if (ic->ic_conn_flags & IDM_CONN_HEADER_DIGEST) {
897 		iov[iovlen].iov_base = (caddr_t)&hdr_digest_crc;
898 		iov[iovlen].iov_len = sizeof (hdr_digest_crc);
899 		total_len += iov[iovlen].iov_len;
900 		iovlen++;
901 	}
902 
903 	if ((iovlen != 0) &&
904 	    (idm_iov_sorecv(so_conn->ic_so, &iov[0], iovlen,
905 	    total_len) != 0)) {
906 		return (IDM_STATUS_FAIL);
907 	}
908 
909 	/*
910 	 * Validate header digest if enabled
911 	 */
912 	if (ic->ic_conn_flags & IDM_CONN_HEADER_DIGEST) {
913 		crc_calculated = idm_crc32c(pdu->isp_hdr,
914 		    sizeof (iscsi_hdr_t) + ahslen);
915 		if (crc_calculated != hdr_digest_crc) {
916 			/* Invalid Header Digest */
917 			return (IDM_STATUS_HEADER_DIGEST);
918 		}
919 	}
920 
921 	return (0);
922 }
923 
924 /*
925  * idm_so_ini_conn_create()
926  * Allocate the sockets transport connection resources.
927  */
928 static idm_status_t
929 idm_so_ini_conn_create(idm_conn_req_t *cr, idm_conn_t *ic)
930 {
931 	ksocket_t	so;
932 	idm_so_conn_t	*so_conn;
933 	idm_status_t	idmrc;
934 
935 	so = idm_socreate(cr->cr_domain, cr->cr_type,
936 	    cr->cr_protocol);
937 	if (so == NULL) {
938 		return (IDM_STATUS_FAIL);
939 	}
940 
941 	/* Bind the socket if configured to do so */
942 	if (cr->cr_bound) {
943 		if (ksocket_bind(so, &cr->cr_bound_addr.sin,
944 		    SIZEOF_SOCKADDR(&cr->cr_bound_addr.sin), CRED()) != 0) {
945 			idm_sodestroy(so);
946 			return (IDM_STATUS_FAIL);
947 		}
948 	}
949 
950 	idmrc = idm_so_conn_create_common(ic, so);
951 	if (idmrc != IDM_STATUS_SUCCESS) {
952 		idm_soshutdown(so);
953 		idm_sodestroy(so);
954 		return (IDM_STATUS_FAIL);
955 	}
956 
957 	so_conn = ic->ic_transport_private;
958 	/* Set up socket options */
959 	idm_set_ini_preconnect_options(so_conn, cr->cr_boot_conn);
960 
961 	return (IDM_STATUS_SUCCESS);
962 }
963 
964 /*
965  * idm_so_ini_conn_destroy()
966  * Tear down the sockets transport connection resources.
967  */
968 static void
969 idm_so_ini_conn_destroy(idm_conn_t *ic)
970 {
971 	idm_so_conn_destroy_common(ic);
972 }
973 
974 /*
975  * idm_so_ini_conn_connect()
976  * Establish the connection referred to by the handle previously allocated via
977  * idm_so_ini_conn_create().
978  */
979 static idm_status_t
980 idm_so_ini_conn_connect(idm_conn_t *ic)
981 {
982 	idm_so_conn_t	*so_conn;
983 	struct sonode	*node = NULL;
984 	int 		rc;
985 	clock_t		lbolt, conn_login_max, conn_login_interval;
986 	boolean_t	nonblock;
987 
988 	so_conn = ic->ic_transport_private;
989 	nonblock = ic->ic_conn_params.nonblock_socket;
990 	conn_login_max = ic->ic_conn_params.conn_login_max;
991 	conn_login_interval = ddi_get_lbolt() +
992 	    SEC_TO_TICK(ic->ic_conn_params.conn_login_interval);
993 
994 	if (nonblock == B_TRUE) {
995 		node = ((struct sonode *)(so_conn->ic_so));
996 		/* Set to none block socket mode */
997 		idm_so_socket_set_nonblock(node);
998 		do {
999 			rc = ksocket_connect(so_conn->ic_so,
1000 			    &ic->ic_ini_dst_addr.sin,
1001 			    (SIZEOF_SOCKADDR(&ic->ic_ini_dst_addr.sin)),
1002 			    CRED());
1003 			if (rc == 0 || rc == EISCONN) {
1004 				/* socket success or already success */
1005 				rc = IDM_STATUS_SUCCESS;
1006 				break;
1007 			}
1008 			if ((rc == ETIMEDOUT) || (rc == ECONNREFUSED) ||
1009 			    (rc == ECONNRESET)) {
1010 				/* socket connection timeout or refuse */
1011 				break;
1012 			}
1013 			lbolt = ddi_get_lbolt();
1014 			if (lbolt > conn_login_max) {
1015 				/*
1016 				 * Connection retry timeout,
1017 				 * failed connect to target.
1018 				 */
1019 				break;
1020 			}
1021 			if (lbolt < conn_login_interval) {
1022 				if ((rc == EINPROGRESS) || (rc == EALREADY)) {
1023 					/* TCP connect still in progress */
1024 					delay(SEC_TO_TICK(IN_PROGRESS_DELAY));
1025 					continue;
1026 				} else {
1027 					delay(conn_login_interval - lbolt);
1028 				}
1029 			}
1030 			conn_login_interval = ddi_get_lbolt() +
1031 			    SEC_TO_TICK(ic->ic_conn_params.conn_login_interval);
1032 		} while (rc != 0);
1033 		/* resume to nonblock mode */
1034 		if (rc == IDM_STATUS_SUCCESS) {
1035 			idm_so_socket_set_block(node);
1036 		}
1037 	} else {
1038 		rc = ksocket_connect(so_conn->ic_so, &ic->ic_ini_dst_addr.sin,
1039 		    (SIZEOF_SOCKADDR(&ic->ic_ini_dst_addr.sin)), CRED());
1040 	}
1041 
1042 	if (rc != 0) {
1043 		idm_soshutdown(so_conn->ic_so);
1044 		return (IDM_STATUS_FAIL);
1045 	}
1046 
1047 	idm_so_conn_connect_common(ic);
1048 
1049 	idm_set_postconnect_options(so_conn->ic_so);
1050 
1051 	return (IDM_STATUS_SUCCESS);
1052 }
1053 
1054 idm_status_t
1055 idm_so_tgt_conn_create(idm_conn_t *ic, ksocket_t new_so)
1056 {
1057 	idm_status_t	idmrc;
1058 
1059 	idm_set_postconnect_options(new_so);
1060 	idmrc = idm_so_conn_create_common(ic, new_so);
1061 
1062 	return (idmrc);
1063 }
1064 
1065 static void
1066 idm_so_tgt_conn_destroy(idm_conn_t *ic)
1067 {
1068 	idm_so_conn_destroy_common(ic);
1069 }
1070 
1071 /*
1072  * idm_so_tgt_conn_connect()
1073  * Establish the connection in ic, passed from idm_tgt_conn_finish(), which
1074  * is invoked from the SM as a result of an inbound connection request.
1075  */
1076 static idm_status_t
1077 idm_so_tgt_conn_connect(idm_conn_t *ic)
1078 {
1079 	idm_so_conn_connect_common(ic);
1080 
1081 	return (IDM_STATUS_SUCCESS);
1082 }
1083 
1084 static idm_status_t
1085 idm_so_conn_create_common(idm_conn_t *ic, ksocket_t new_so)
1086 {
1087 	idm_so_conn_t	*so_conn;
1088 
1089 	so_conn = kmem_zalloc(sizeof (idm_so_conn_t), KM_SLEEP);
1090 	so_conn->ic_so = new_so;
1091 
1092 	ic->ic_transport_private = so_conn;
1093 	ic->ic_transport_hdrlen = 0;
1094 
1095 	/* Set the scoreboarding flag on this connection */
1096 	ic->ic_conn_flags |= IDM_CONN_USE_SCOREBOARD;
1097 	ic->ic_conn_params.max_recv_dataseglen =
1098 	    ISCSI_DEFAULT_MAX_RECV_SEG_LEN;
1099 	ic->ic_conn_params.max_xmit_dataseglen =
1100 	    ISCSI_DEFAULT_MAX_XMIT_SEG_LEN;
1101 
1102 	/*
1103 	 * Initialize tx thread mutex and list
1104 	 */
1105 	mutex_init(&so_conn->ic_tx_mutex, NULL, MUTEX_DEFAULT, NULL);
1106 	cv_init(&so_conn->ic_tx_cv, NULL, CV_DEFAULT, NULL);
1107 	list_create(&so_conn->ic_tx_list, sizeof (idm_pdu_t),
1108 	    offsetof(idm_pdu_t, idm_tx_link));
1109 
1110 	return (IDM_STATUS_SUCCESS);
1111 }
1112 
1113 static void
1114 idm_so_conn_destroy_common(idm_conn_t *ic)
1115 {
1116 	idm_so_conn_t	*so_conn = ic->ic_transport_private;
1117 
1118 	ic->ic_transport_private = NULL;
1119 	idm_sodestroy(so_conn->ic_so);
1120 	list_destroy(&so_conn->ic_tx_list);
1121 	mutex_destroy(&so_conn->ic_tx_mutex);
1122 	cv_destroy(&so_conn->ic_tx_cv);
1123 
1124 	kmem_free(so_conn, sizeof (idm_so_conn_t));
1125 }
1126 
1127 static void
1128 idm_so_conn_connect_common(idm_conn_t *ic)
1129 {
1130 	idm_so_conn_t	*so_conn;
1131 	struct sockaddr_in6	t_addr;
1132 	socklen_t	t_addrlen = 0;
1133 
1134 	so_conn = ic->ic_transport_private;
1135 	bzero(&t_addr, sizeof (struct sockaddr_in6));
1136 	t_addrlen = sizeof (struct sockaddr_in6);
1137 
1138 	/* Set the local and remote addresses in the idm conn handle */
1139 	(void) ksocket_getsockname(so_conn->ic_so, (struct sockaddr *)&t_addr,
1140 	    &t_addrlen, CRED());
1141 	bcopy(&t_addr, &ic->ic_laddr, t_addrlen);
1142 	(void) ksocket_getpeername(so_conn->ic_so, (struct sockaddr *)&t_addr,
1143 	    &t_addrlen, CRED());
1144 	bcopy(&t_addr, &ic->ic_raddr, t_addrlen);
1145 
1146 	mutex_enter(&ic->ic_mutex);
1147 	so_conn->ic_tx_thread = thread_create(NULL, 0, idm_sotx_thread, ic, 0,
1148 	    &p0, TS_RUN, minclsyspri);
1149 	so_conn->ic_rx_thread = thread_create(NULL, 0, idm_sorx_thread, ic, 0,
1150 	    &p0, TS_RUN, minclsyspri);
1151 
1152 	while (so_conn->ic_rx_thread_did == 0 ||
1153 	    so_conn->ic_tx_thread_did == 0)
1154 		cv_wait(&ic->ic_cv, &ic->ic_mutex);
1155 	mutex_exit(&ic->ic_mutex);
1156 }
1157 
1158 /*
1159  * idm_so_conn_disconnect()
1160  * Shutdown the socket connection and stop the thread
1161  */
1162 static void
1163 idm_so_conn_disconnect(idm_conn_t *ic)
1164 {
1165 	idm_so_conn_t	*so_conn;
1166 
1167 	so_conn = ic->ic_transport_private;
1168 
1169 	mutex_enter(&ic->ic_mutex);
1170 	so_conn->ic_rx_thread_running = B_FALSE;
1171 	so_conn->ic_tx_thread_running = B_FALSE;
1172 	/* We need to wakeup the TX thread */
1173 	mutex_enter(&so_conn->ic_tx_mutex);
1174 	cv_signal(&so_conn->ic_tx_cv);
1175 	mutex_exit(&so_conn->ic_tx_mutex);
1176 	mutex_exit(&ic->ic_mutex);
1177 
1178 	/* This should wakeup the RX thread if it is sleeping */
1179 	idm_soshutdown(so_conn->ic_so);
1180 
1181 	thread_join(so_conn->ic_tx_thread_did);
1182 	thread_join(so_conn->ic_rx_thread_did);
1183 }
1184 
1185 /*
1186  * idm_so_tgt_svc_create()
1187  * Establish a service on an IP address and port.  idm_svc_req_t contains
1188  * the service parameters.
1189  */
1190 /*ARGSUSED*/
1191 static idm_status_t
1192 idm_so_tgt_svc_create(idm_svc_req_t *sr, idm_svc_t *is)
1193 {
1194 	idm_so_svc_t		*so_svc;
1195 
1196 	so_svc = kmem_zalloc(sizeof (idm_so_svc_t), KM_SLEEP);
1197 
1198 	/* Set the new sockets service in svc handle */
1199 	is->is_so_svc = (void *)so_svc;
1200 
1201 	return (IDM_STATUS_SUCCESS);
1202 }
1203 
1204 /*
1205  * idm_so_tgt_svc_destroy()
1206  * Teardown sockets resources allocated in idm_so_tgt_svc_create()
1207  */
1208 static void
1209 idm_so_tgt_svc_destroy(idm_svc_t *is)
1210 {
1211 	/* the socket will have been torn down; free the service */
1212 	kmem_free(is->is_so_svc, sizeof (idm_so_svc_t));
1213 }
1214 
1215 /*
1216  * idm_so_tgt_svc_online()
1217  * Launch a watch thread on the svc allocated in idm_so_tgt_svc_create()
1218  */
1219 
1220 static idm_status_t
1221 idm_so_tgt_svc_online(idm_svc_t *is)
1222 {
1223 	idm_so_svc_t		*so_svc;
1224 	idm_svc_req_t		*sr = &is->is_svc_req;
1225 	struct sockaddr_in6	sin6_ip;
1226 	const uint32_t		on = 1;
1227 	const uint32_t		off = 0;
1228 
1229 	mutex_enter(&is->is_mutex);
1230 	so_svc = (idm_so_svc_t *)is->is_so_svc;
1231 
1232 	/*
1233 	 * Try creating an IPv6 socket first
1234 	 */
1235 	if ((so_svc->is_so = idm_socreate(PF_INET6, SOCK_STREAM, 0)) == NULL) {
1236 		mutex_exit(&is->is_mutex);
1237 		return (IDM_STATUS_FAIL);
1238 	} else {
1239 		bzero(&sin6_ip, sizeof (sin6_ip));
1240 		sin6_ip.sin6_family = AF_INET6;
1241 		sin6_ip.sin6_port = htons(sr->sr_port);
1242 		sin6_ip.sin6_addr = in6addr_any;
1243 
1244 		(void) ksocket_setsockopt(so_svc->is_so, SOL_SOCKET,
1245 		    SO_REUSEADDR, (char *)&on, sizeof (on), CRED());
1246 		/*
1247 		 * Turn off SO_MAC_EXEMPT so future sobinds succeed
1248 		 */
1249 		(void) ksocket_setsockopt(so_svc->is_so, SOL_SOCKET,
1250 		    SO_MAC_EXEMPT, (char *)&off, sizeof (off), CRED());
1251 
1252 		if (ksocket_bind(so_svc->is_so, (struct sockaddr *)&sin6_ip,
1253 		    sizeof (sin6_ip), CRED()) != 0) {
1254 			mutex_exit(&is->is_mutex);
1255 			idm_sodestroy(so_svc->is_so);
1256 			return (IDM_STATUS_FAIL);
1257 		}
1258 	}
1259 
1260 	idm_set_postconnect_options(so_svc->is_so);
1261 
1262 	if (ksocket_listen(so_svc->is_so, 5, CRED()) != 0) {
1263 		mutex_exit(&is->is_mutex);
1264 		idm_soshutdown(so_svc->is_so);
1265 		idm_sodestroy(so_svc->is_so);
1266 		return (IDM_STATUS_FAIL);
1267 	}
1268 
1269 	/* Launch a watch thread */
1270 	so_svc->is_thread = thread_create(NULL, 0, idm_so_svc_port_watcher,
1271 	    is, 0, &p0, TS_RUN, minclsyspri);
1272 
1273 	if (so_svc->is_thread == NULL) {
1274 		/* Failure to launch; teardown the socket */
1275 		mutex_exit(&is->is_mutex);
1276 		idm_soshutdown(so_svc->is_so);
1277 		idm_sodestroy(so_svc->is_so);
1278 		return (IDM_STATUS_FAIL);
1279 	}
1280 	ksocket_hold(so_svc->is_so);
1281 	/* Wait for the port watcher thread to start */
1282 	while (!so_svc->is_thread_running)
1283 		cv_wait(&is->is_cv, &is->is_mutex);
1284 	mutex_exit(&is->is_mutex);
1285 
1286 	return (IDM_STATUS_SUCCESS);
1287 }
1288 
1289 /*
1290  * idm_so_tgt_svc_offline
1291  *
1292  * Stop listening on the IP address and port identified by idm_svc_t.
1293  */
1294 static void
1295 idm_so_tgt_svc_offline(idm_svc_t *is)
1296 {
1297 	idm_so_svc_t		*so_svc;
1298 	mutex_enter(&is->is_mutex);
1299 	so_svc = (idm_so_svc_t *)is->is_so_svc;
1300 	so_svc->is_thread_running = B_FALSE;
1301 	mutex_exit(&is->is_mutex);
1302 
1303 	/*
1304 	 * Teardown socket
1305 	 */
1306 	idm_sodestroy(so_svc->is_so);
1307 
1308 	/*
1309 	 * Now we expect the port watcher thread to terminate
1310 	 */
1311 	thread_join(so_svc->is_thread_did);
1312 }
1313 
1314 /*
1315  * Watch thread for target service connection establishment.
1316  */
1317 void
1318 idm_so_svc_port_watcher(void *arg)
1319 {
1320 	idm_svc_t		*svc = arg;
1321 	ksocket_t		new_so;
1322 	idm_conn_t		*ic;
1323 	idm_status_t		idmrc;
1324 	idm_so_svc_t		*so_svc;
1325 	int			rc;
1326 	const uint32_t		off = 0;
1327 	struct sockaddr_in6 	t_addr;
1328 	socklen_t		t_addrlen;
1329 
1330 	bzero(&t_addr, sizeof (struct sockaddr_in6));
1331 	t_addrlen = sizeof (struct sockaddr_in6);
1332 	mutex_enter(&svc->is_mutex);
1333 
1334 	so_svc = svc->is_so_svc;
1335 	so_svc->is_thread_running = B_TRUE;
1336 	so_svc->is_thread_did = so_svc->is_thread->t_did;
1337 
1338 	cv_signal(&svc->is_cv);
1339 
1340 	IDM_SVC_LOG(CE_NOTE, "iSCSI service (%p/%d) online", (void *)svc,
1341 	    svc->is_svc_req.sr_port);
1342 
1343 	while (so_svc->is_thread_running) {
1344 		mutex_exit(&svc->is_mutex);
1345 
1346 		if ((rc = ksocket_accept(so_svc->is_so,
1347 		    (struct sockaddr *)&t_addr, &t_addrlen,
1348 		    &new_so, CRED())) != 0) {
1349 			mutex_enter(&svc->is_mutex);
1350 			if (rc != ECONNABORTED && rc != EINTR) {
1351 				IDM_SVC_LOG(CE_NOTE, "idm_so_svc_port_watcher:"
1352 				    " ksocket_accept failed %d", rc);
1353 			}
1354 			/*
1355 			 * Unclean shutdown of this thread is not handled
1356 			 * wait for !is_thread_running.
1357 			 */
1358 			continue;
1359 		}
1360 		/*
1361 		 * Turn off SO_MAC_EXEMPT so future sobinds succeed
1362 		 */
1363 		(void) ksocket_setsockopt(new_so, SOL_SOCKET, SO_MAC_EXEMPT,
1364 		    (char *)&off, sizeof (off), CRED());
1365 
1366 		idmrc = idm_svc_conn_create(svc, IDM_TRANSPORT_TYPE_SOCKETS,
1367 		    &ic);
1368 		if (idmrc != IDM_STATUS_SUCCESS) {
1369 			/* Drop connection */
1370 			idm_soshutdown(new_so);
1371 			idm_sodestroy(new_so);
1372 			mutex_enter(&svc->is_mutex);
1373 			continue;
1374 		}
1375 
1376 		idmrc = idm_so_tgt_conn_create(ic, new_so);
1377 		if (idmrc != IDM_STATUS_SUCCESS) {
1378 			idm_svc_conn_destroy(ic);
1379 			idm_soshutdown(new_so);
1380 			idm_sodestroy(new_so);
1381 			mutex_enter(&svc->is_mutex);
1382 			continue;
1383 		}
1384 
1385 		/*
1386 		 * Kick the state machine.  At CS_S3_XPT_UP the state machine
1387 		 * will notify the client (target) about the new connection.
1388 		 */
1389 		idm_conn_event(ic, CE_CONNECT_ACCEPT, NULL);
1390 
1391 		mutex_enter(&svc->is_mutex);
1392 	}
1393 	ksocket_rele(so_svc->is_so);
1394 	so_svc->is_thread_running = B_FALSE;
1395 	mutex_exit(&svc->is_mutex);
1396 
1397 	IDM_SVC_LOG(CE_NOTE, "iSCSI service (%p/%d) offline", (void *)svc,
1398 	    svc->is_svc_req.sr_port);
1399 
1400 	thread_exit();
1401 }
1402 
1403 /*
1404  * idm_so_free_task_rsrc() stops any ongoing processing of the task and
1405  * frees resources associated with the task.
1406  *
1407  * It's not clear that this should return idm_status_t.  What do we do
1408  * if it fails?
1409  */
1410 static idm_status_t
1411 idm_so_free_task_rsrc(idm_task_t *idt)
1412 {
1413 	idm_buf_t	*idb, *next_idb;
1414 
1415 	/*
1416 	 * There is nothing to cleanup on initiator connections
1417 	 */
1418 	if (IDM_CONN_ISINI(idt->idt_ic))
1419 		return (IDM_STATUS_SUCCESS);
1420 
1421 	/*
1422 	 * If this is a target connection, call idm_buf_rx_from_ini_done for
1423 	 * any buffer on the "outbufv" list with idb->idb_in_transport==B_TRUE.
1424 	 *
1425 	 * In addition, remove any buffers associated with this task from
1426 	 * the ic_tx_list.  We'll do this by walking the idt_inbufv list, but
1427 	 * items don't actually get removed from that list (and completion
1428 	 * routines called) until idm_task_cleanup.
1429 	 */
1430 	mutex_enter(&idt->idt_mutex);
1431 
1432 	for (idb = list_head(&idt->idt_outbufv); idb != NULL; idb = next_idb) {
1433 		next_idb = list_next(&idt->idt_outbufv, idb);
1434 		if (idb->idb_in_transport) {
1435 			/*
1436 			 * idm_buf_rx_from_ini_done releases idt->idt_mutex
1437 			 */
1438 			DTRACE_ISCSI_8(xfer__done, idm_conn_t *, idt->idt_ic,
1439 			    uintptr_t, idb->idb_buf,
1440 			    uint32_t, idb->idb_bufoffset,
1441 			    uint64_t, 0, uint32_t, 0, uint32_t, 0,
1442 			    uint32_t, idb->idb_xfer_len,
1443 			    int, XFER_BUF_RX_FROM_INI);
1444 			idm_buf_rx_from_ini_done(idt, idb, IDM_STATUS_ABORTED);
1445 			mutex_enter(&idt->idt_mutex);
1446 		}
1447 	}
1448 
1449 	for (idb = list_head(&idt->idt_inbufv); idb != NULL; idb = next_idb) {
1450 		next_idb = list_next(&idt->idt_inbufv, idb);
1451 		/*
1452 		 * We want to remove these items from the tx_list as well,
1453 		 * but knowing it's in the idt_inbufv list is not a guarantee
1454 		 * that it's in the tx_list.  If it's on the tx list then
1455 		 * let idm_sotx_thread() clean it up.
1456 		 */
1457 		if (idb->idb_in_transport && !idb->idb_tx_thread) {
1458 			/*
1459 			 * idm_buf_tx_to_ini_done releases idt->idt_mutex
1460 			 */
1461 			DTRACE_ISCSI_8(xfer__done, idm_conn_t *, idt->idt_ic,
1462 			    uintptr_t, idb->idb_buf,
1463 			    uint32_t, idb->idb_bufoffset,
1464 			    uint64_t, 0, uint32_t, 0, uint32_t, 0,
1465 			    uint32_t, idb->idb_xfer_len,
1466 			    int, XFER_BUF_TX_TO_INI);
1467 			idm_buf_tx_to_ini_done(idt, idb, IDM_STATUS_ABORTED);
1468 			mutex_enter(&idt->idt_mutex);
1469 		}
1470 	}
1471 
1472 	mutex_exit(&idt->idt_mutex);
1473 
1474 	return (IDM_STATUS_SUCCESS);
1475 }
1476 
1477 /*
1478  * idm_so_negotiate_key_values() validates the key values for this connection
1479  */
1480 /* ARGSUSED */
1481 static kv_status_t
1482 idm_so_negotiate_key_values(idm_conn_t *it, nvlist_t *request_nvl,
1483     nvlist_t *response_nvl, nvlist_t *negotiated_nvl)
1484 {
1485 	/* All parameters are negotiated at the iscsit level */
1486 	return (KV_HANDLED);
1487 }
1488 
1489 /*
1490  * idm_so_notice_key_values() activates the negotiated key values for
1491  * this connection.
1492  */
1493 static void
1494 idm_so_notice_key_values(idm_conn_t *it, nvlist_t *negotiated_nvl)
1495 {
1496 	char			*nvp_name;
1497 	nvpair_t		*nvp;
1498 	nvpair_t		*next_nvp;
1499 	int			nvrc;
1500 	idm_status_t		idm_status;
1501 	const idm_kv_xlate_t	*ikvx;
1502 	uint64_t		num_val;
1503 
1504 	for (nvp = nvlist_next_nvpair(negotiated_nvl, NULL);
1505 	    nvp != NULL; nvp = next_nvp) {
1506 		next_nvp = nvlist_next_nvpair(negotiated_nvl, nvp);
1507 		nvp_name = nvpair_name(nvp);
1508 
1509 		ikvx = idm_lookup_kv_xlate(nvp_name, strlen(nvp_name));
1510 		switch (ikvx->ik_key_id) {
1511 		case KI_HEADER_DIGEST:
1512 		case KI_DATA_DIGEST:
1513 			idm_status = idm_so_handle_digest(it, nvp, ikvx);
1514 			ASSERT(idm_status == 0);
1515 
1516 			/* Remove processed item from negotiated_nvl list */
1517 			nvrc = nvlist_remove_all(
1518 			    negotiated_nvl, ikvx->ik_key_name);
1519 			ASSERT(nvrc == 0);
1520 			break;
1521 		case KI_MAX_RECV_DATA_SEGMENT_LENGTH:
1522 			/*
1523 			 * Just pass the value down to idm layer.
1524 			 * No need to remove it from negotiated_nvl list here.
1525 			 */
1526 			nvrc = nvpair_value_uint64(nvp, &num_val);
1527 			ASSERT(nvrc == 0);
1528 			it->ic_conn_params.max_xmit_dataseglen =
1529 			    (uint32_t)num_val;
1530 			break;
1531 		default:
1532 			break;
1533 		}
1534 	}
1535 }
1536 
1537 /*
1538  * idm_so_declare_key_values() declares the key values for this connection
1539  */
1540 /* ARGSUSED */
1541 static kv_status_t
1542 idm_so_declare_key_values(idm_conn_t *it, nvlist_t *config_nvl,
1543     nvlist_t *outgoing_nvl)
1544 {
1545 	char			*nvp_name;
1546 	nvpair_t		*nvp;
1547 	nvpair_t		*next_nvp;
1548 	kv_status_t		kvrc;
1549 	int			nvrc = 0;
1550 	const idm_kv_xlate_t	*ikvx;
1551 	uint64_t		num_val;
1552 
1553 	for (nvp = nvlist_next_nvpair(config_nvl, NULL);
1554 	    nvp != NULL && nvrc == 0; nvp = next_nvp) {
1555 		next_nvp = nvlist_next_nvpair(config_nvl, nvp);
1556 		nvp_name = nvpair_name(nvp);
1557 
1558 		ikvx = idm_lookup_kv_xlate(nvp_name, strlen(nvp_name));
1559 		switch (ikvx->ik_key_id) {
1560 		case KI_MAX_RECV_DATA_SEGMENT_LENGTH:
1561 			if ((nvrc = nvpair_value_uint64(nvp, &num_val)) != 0) {
1562 				break;
1563 			}
1564 			if (outgoing_nvl &&
1565 			    (nvrc = nvlist_add_uint64(outgoing_nvl,
1566 			    nvp_name, num_val)) != 0) {
1567 				break;
1568 			}
1569 			it->ic_conn_params.max_recv_dataseglen =
1570 			    (uint32_t)num_val;
1571 			break;
1572 		default:
1573 			break;
1574 		}
1575 	}
1576 	kvrc = idm_nvstat_to_kvstat(nvrc);
1577 	return (kvrc);
1578 }
1579 
1580 static idm_status_t
1581 idm_so_handle_digest(idm_conn_t *it, nvpair_t *digest_choice,
1582     const idm_kv_xlate_t *ikvx)
1583 {
1584 	int			nvrc;
1585 	char			*digest_choice_string;
1586 
1587 	nvrc = nvpair_value_string(digest_choice,
1588 	    &digest_choice_string);
1589 	ASSERT(nvrc == 0);
1590 	if (strcasecmp(digest_choice_string, "crc32c") == 0) {
1591 		switch (ikvx->ik_key_id) {
1592 		case KI_HEADER_DIGEST:
1593 			it->ic_conn_flags |= IDM_CONN_HEADER_DIGEST;
1594 			break;
1595 		case KI_DATA_DIGEST:
1596 			it->ic_conn_flags |= IDM_CONN_DATA_DIGEST;
1597 			break;
1598 		default:
1599 			ASSERT(0);
1600 			break;
1601 		}
1602 	} else if (strcasecmp(digest_choice_string, "none") == 0) {
1603 		switch (ikvx->ik_key_id) {
1604 		case KI_HEADER_DIGEST:
1605 			it->ic_conn_flags &= ~IDM_CONN_HEADER_DIGEST;
1606 			break;
1607 		case KI_DATA_DIGEST:
1608 			it->ic_conn_flags &= ~IDM_CONN_DATA_DIGEST;
1609 			break;
1610 		default:
1611 			ASSERT(0);
1612 			break;
1613 		}
1614 	} else {
1615 		ASSERT(0);
1616 	}
1617 
1618 	return (IDM_STATUS_SUCCESS);
1619 }
1620 
1621 
1622 /*
1623  * idm_so_conn_is_capable() verifies that the passed connection is provided
1624  * for by the sockets interface.
1625  */
1626 /* ARGSUSED */
1627 static boolean_t
1628 idm_so_conn_is_capable(idm_conn_req_t *ic, idm_transport_caps_t *caps)
1629 {
1630 	return (B_TRUE);
1631 }
1632 
1633 /*
1634  * idm_so_rx_datain() validates the Data Sequence number of the PDU. The
1635  * idm_sorecv_scsidata() function invoked earlier actually reads the data
1636  * off the socket into the appropriate buffers.
1637  */
1638 static void
1639 idm_so_rx_datain(idm_conn_t *ic, idm_pdu_t *pdu)
1640 {
1641 	iscsi_data_hdr_t	*bhs;
1642 	idm_task_t		*idt;
1643 	idm_buf_t		*idb;
1644 	uint32_t		datasn;
1645 	size_t			offset;
1646 	iscsi_hdr_t		*ihp = (iscsi_hdr_t *)pdu->isp_hdr;
1647 	iscsi_data_rsp_hdr_t    *idrhp = (iscsi_data_rsp_hdr_t *)ihp;
1648 
1649 	ASSERT(ic != NULL);
1650 	ASSERT(pdu != NULL);
1651 	ASSERT(IDM_PDU_OPCODE(pdu) == ISCSI_OP_SCSI_DATA_RSP);
1652 
1653 	bhs	= (iscsi_data_hdr_t *)pdu->isp_hdr;
1654 	datasn	= ntohl(bhs->datasn);
1655 	offset	= ntohl(bhs->offset);
1656 
1657 	/*
1658 	 * Look up the task corresponding to the initiator task tag
1659 	 * to get the buffers affiliated with the task.
1660 	 */
1661 	idt = idm_task_find(ic, bhs->itt, bhs->ttt);
1662 	if (idt == NULL) {
1663 		IDM_CONN_LOG(CE_WARN, "idm_so_rx_datain: failed to find task");
1664 		idm_pdu_rx_protocol_error(ic, pdu);
1665 		return;
1666 	}
1667 
1668 	idb = pdu->isp_sorx_buf;
1669 	if (idb == NULL) {
1670 		IDM_CONN_LOG(CE_WARN,
1671 		    "idm_so_rx_datain: failed to find buffer");
1672 		idm_task_rele(idt);
1673 		idm_pdu_rx_protocol_error(ic, pdu);
1674 		return;
1675 	}
1676 
1677 	/*
1678 	 * DataSN values should be sequential and should not have any gaps or
1679 	 * repetitions. Check the DataSN with the one stored in the task.
1680 	 */
1681 	if (datasn == idt->idt_exp_datasn) {
1682 		idt->idt_exp_datasn++; /* keep track of DataSN received */
1683 	} else {
1684 		IDM_CONN_LOG(CE_WARN, "idm_so_rx_datain: datasn out of order");
1685 		idm_task_rele(idt);
1686 		idm_pdu_rx_protocol_error(ic, pdu);
1687 		return;
1688 	}
1689 
1690 	/*
1691 	 * PDUs in a sequence should be in continuously increasing
1692 	 * address offset
1693 	 */
1694 	if (offset != idb->idb_exp_offset) {
1695 		IDM_CONN_LOG(CE_WARN, "idm_so_rx_datain: unexpected offset");
1696 		idm_task_rele(idt);
1697 		idm_pdu_rx_protocol_error(ic, pdu);
1698 		return;
1699 	}
1700 	/* Expected next relative buffer offset */
1701 	idb->idb_exp_offset += n2h24(bhs->dlength);
1702 	idt->idt_rx_bytes += n2h24(bhs->dlength);
1703 
1704 	idm_task_rele(idt);
1705 
1706 	/*
1707 	 * For now call scsi_rsp which will process the data rsp
1708 	 * Revisit, need to provide an explicit client entry point for
1709 	 * phase collapse completions.
1710 	 */
1711 	if ((IDM_PDU_OPCODE(pdu) == ISCSI_OP_SCSI_DATA_RSP) &&
1712 	    (idrhp->flags & ISCSI_FLAG_DATA_STATUS)) {
1713 		(*ic->ic_conn_ops.icb_rx_scsi_rsp)(ic, pdu);
1714 	}
1715 
1716 	idm_pdu_complete(pdu, IDM_STATUS_SUCCESS);
1717 }
1718 
1719 /*
1720  * The idm_so_rx_dataout() function is used by the iSCSI target to read
1721  * data from the Data-Out PDU sent by the iSCSI initiator.
1722  *
1723  * This function gets the Initiator Task Tag from the PDU BHS and looks up the
1724  * task to get the buffers associated with the PDU. A PDU might span buffers.
1725  * The data is then read into the respective buffer.
1726  */
1727 static void
1728 idm_so_rx_dataout(idm_conn_t *ic, idm_pdu_t *pdu)
1729 {
1730 
1731 	iscsi_data_hdr_t	*bhs;
1732 	idm_task_t		*idt;
1733 	idm_buf_t		*idb;
1734 	size_t			offset;
1735 
1736 	ASSERT(ic != NULL);
1737 	ASSERT(pdu != NULL);
1738 	ASSERT(IDM_PDU_OPCODE(pdu) == ISCSI_OP_SCSI_DATA);
1739 
1740 	bhs = (iscsi_data_hdr_t *)pdu->isp_hdr;
1741 	offset = ntohl(bhs->offset);
1742 
1743 	/*
1744 	 * Look up the task corresponding to the initiator task tag
1745 	 * to get the buffers affiliated with the task.
1746 	 */
1747 	idt = idm_task_find(ic, bhs->itt, bhs->ttt);
1748 	if (idt == NULL) {
1749 		IDM_CONN_LOG(CE_WARN,
1750 		    "idm_so_rx_dataout: failed to find task");
1751 		idm_pdu_rx_protocol_error(ic, pdu);
1752 		return;
1753 	}
1754 
1755 	idb = pdu->isp_sorx_buf;
1756 	if (idb == NULL) {
1757 		IDM_CONN_LOG(CE_WARN,
1758 		    "idm_so_rx_dataout: failed to find buffer");
1759 		idm_task_rele(idt);
1760 		idm_pdu_rx_protocol_error(ic, pdu);
1761 		return;
1762 	}
1763 
1764 	/* Keep track of data transferred - check data offsets */
1765 	if (offset != idb->idb_exp_offset) {
1766 		IDM_CONN_LOG(CE_NOTE, "idm_so_rx_dataout: offset out of seq: "
1767 		    "%ld, %d", offset, idb->idb_exp_offset);
1768 		idm_task_rele(idt);
1769 		idm_pdu_rx_protocol_error(ic, pdu);
1770 		return;
1771 	}
1772 	/* Expected next relative offset */
1773 	idb->idb_exp_offset += ntoh24(bhs->dlength);
1774 	idt->idt_rx_bytes += n2h24(bhs->dlength);
1775 
1776 	/*
1777 	 * Call the buffer callback when the transfer is complete
1778 	 *
1779 	 * The connection state machine should only abort tasks after
1780 	 * shutting down the connection so we are assured that there
1781 	 * won't be a simultaneous attempt to abort this task at the
1782 	 * same time as we are processing this PDU (due to a connection
1783 	 * state change).
1784 	 */
1785 	if (bhs->flags & ISCSI_FLAG_FINAL) {
1786 		/*
1787 		 * We have gotten the last data-message for the current
1788 		 * transfer.  idb_xfer_len represents the data that the
1789 		 * command intended to transfer, it does not represent the
1790 		 * actual number of bytes transferred. If we have not
1791 		 * transferred the expected number of bytes something is
1792 		 * wrong.
1793 		 *
1794 		 * We have two options, when there is a mismatch, we can
1795 		 * regard the transfer as invalid -- or we can modify our
1796 		 * notion of "xfer_len." In order to be as stringent as
1797 		 * possible, here we regard this transfer as in error; and
1798 		 * bail out.
1799 		 */
1800 		if (idb->idb_buflen == idb->idb_xfer_len &&
1801 		    idb->idb_buflen !=
1802 		    (idb->idb_exp_offset - idb->idb_bufoffset)) {
1803 			printf("idm_so_rx_dataout: incomplete transfer, "
1804 			    "protocol err");
1805 			IDM_CONN_LOG(CE_NOTE,
1806 			    "idm_so_rx_dataout: incomplete transfer: %ld, %d",
1807 			    offset, (int)(idb->idb_exp_offset - offset));
1808 			idm_task_rele(idt);
1809 			idm_pdu_rx_protocol_error(ic, pdu);
1810 			return;
1811 		}
1812 		/*
1813 		 * We only want to call idm_buf_rx_from_ini_done once
1814 		 * per transfer.  It's possible that this task has
1815 		 * already been aborted in which case
1816 		 * idm_so_free_task_rsrc will call idm_buf_rx_from_ini_done
1817 		 * for each buffer with idb_in_transport==B_TRUE.  To
1818 		 * close this window and ensure that this doesn't happen,
1819 		 * we'll clear idb->idb_in_transport now while holding
1820 		 * the task mutex.   This is only really an issue for
1821 		 * SCSI task abort -- if tasks were being aborted because
1822 		 * of a connection state change the state machine would
1823 		 * have already stopped the receive thread.
1824 		 */
1825 		mutex_enter(&idt->idt_mutex);
1826 
1827 		/*
1828 		 * Release the task hold here (obtained in idm_task_find)
1829 		 * because the task may complete synchronously during
1830 		 * idm_buf_rx_from_ini_done.  Since we still have an active
1831 		 * buffer we know there is at least one additional hold on idt.
1832 		 */
1833 		idm_task_rele(idt);
1834 
1835 		/*
1836 		 * idm_buf_rx_from_ini_done releases idt->idt_mutex
1837 		 */
1838 		DTRACE_ISCSI_8(xfer__done, idm_conn_t *, idt->idt_ic,
1839 		    uintptr_t, idb->idb_buf, uint32_t, idb->idb_bufoffset,
1840 		    uint64_t, 0, uint32_t, 0, uint32_t, 0,
1841 		    uint32_t, idb->idb_xfer_len,
1842 		    int, XFER_BUF_RX_FROM_INI);
1843 		idm_buf_rx_from_ini_done(idt, idb, IDM_STATUS_SUCCESS);
1844 		idm_pdu_complete(pdu, IDM_STATUS_SUCCESS);
1845 		return;
1846 	}
1847 
1848 	idm_task_rele(idt);
1849 	idm_pdu_complete(pdu, IDM_STATUS_SUCCESS);
1850 }
1851 
1852 /*
1853  * The idm_so_rx_rtt() function is used by the iSCSI initiator to handle
1854  * the R2T PDU sent by the iSCSI target indicating that it is ready to
1855  * accept data. This gets the Initiator Task Tag (itt) from the PDU BHS
1856  * and looks up the task in the task tree using the itt to get the output
1857  * buffers associated the task. The R2T PDU contains the offset of the
1858  * requested data and the data length. This function then constructs a
1859  * sequence of iSCSI PDUs and outputs the requested data. Each Data-Out
1860  * PDU is associated with the R2T by the Target Transfer Tag  (ttt).
1861  */
1862 
1863 static void
1864 idm_so_rx_rtt(idm_conn_t *ic, idm_pdu_t *pdu)
1865 {
1866 	idm_task_t		*idt;
1867 	idm_buf_t		*idb;
1868 	iscsi_rtt_hdr_t		*rtt_hdr;
1869 	uint32_t		data_offset;
1870 	uint32_t		data_length;
1871 
1872 	ASSERT(ic != NULL);
1873 	ASSERT(pdu != NULL);
1874 
1875 	rtt_hdr	= (iscsi_rtt_hdr_t *)pdu->isp_hdr;
1876 	data_offset = ntohl(rtt_hdr->data_offset);
1877 	data_length = ntohl(rtt_hdr->data_length);
1878 	idt	= idm_task_find(ic, rtt_hdr->itt, rtt_hdr->ttt);
1879 
1880 	if (idt == NULL) {
1881 		IDM_CONN_LOG(CE_WARN, "idm_so_rx_rtt: could not find task");
1882 		idm_pdu_rx_protocol_error(ic, pdu);
1883 		return;
1884 	}
1885 
1886 	/* Find the buffer bound to the task by the iSCSI initiator */
1887 	mutex_enter(&idt->idt_mutex);
1888 	idb = idm_buf_find(&idt->idt_outbufv, data_offset);
1889 	if (idb == NULL) {
1890 		mutex_exit(&idt->idt_mutex);
1891 		idm_task_rele(idt);
1892 		IDM_CONN_LOG(CE_WARN, "idm_so_rx_rtt: could not find buffer");
1893 		idm_pdu_rx_protocol_error(ic, pdu);
1894 		return;
1895 	}
1896 
1897 	/* return buffer contains this data */
1898 	if (data_offset + data_length > idb->idb_buflen) {
1899 		/* Overflow */
1900 		mutex_exit(&idt->idt_mutex);
1901 		idm_task_rele(idt);
1902 		IDM_CONN_LOG(CE_WARN, "idm_so_rx_rtt: read from outside "
1903 		    "buffer");
1904 		idm_pdu_rx_protocol_error(ic, pdu);
1905 		return;
1906 	}
1907 
1908 	idt->idt_r2t_ttt = rtt_hdr->ttt;
1909 	idt->idt_exp_datasn = 0;
1910 
1911 	idm_so_send_rtt_data(ic, idt, idb, data_offset,
1912 	    ntohl(rtt_hdr->data_length));
1913 	/*
1914 	 * the idt_mutex is released in idm_so_send_rtt_data
1915 	 */
1916 
1917 	idm_pdu_complete(pdu, IDM_STATUS_SUCCESS);
1918 	idm_task_rele(idt);
1919 
1920 }
1921 
1922 idm_status_t
1923 idm_sorecvdata(idm_conn_t *ic, idm_pdu_t *pdu)
1924 {
1925 	uint8_t		pad[ISCSI_PAD_WORD_LEN];
1926 	int		pad_len;
1927 	uint32_t	data_digest_crc;
1928 	uint32_t	crc_calculated;
1929 	int		total_len;
1930 	idm_so_conn_t	*so_conn;
1931 
1932 	so_conn = ic->ic_transport_private;
1933 
1934 	pad_len = ((ISCSI_PAD_WORD_LEN -
1935 	    (pdu->isp_datalen & (ISCSI_PAD_WORD_LEN - 1))) &
1936 	    (ISCSI_PAD_WORD_LEN - 1));
1937 
1938 	ASSERT(pdu->isp_iovlen < (PDU_MAX_IOVLEN - 2)); /* pad + data digest */
1939 
1940 	total_len = pdu->isp_datalen;
1941 
1942 	if (pad_len) {
1943 		pdu->isp_iov[pdu->isp_iovlen].iov_base	= (char *)&pad;
1944 		pdu->isp_iov[pdu->isp_iovlen].iov_len	= pad_len;
1945 		total_len		+= pad_len;
1946 		pdu->isp_iovlen++;
1947 	}
1948 
1949 	/* setup data digest */
1950 	if ((ic->ic_conn_flags & IDM_CONN_DATA_DIGEST) != 0) {
1951 		pdu->isp_iov[pdu->isp_iovlen].iov_base =
1952 		    (char *)&data_digest_crc;
1953 		pdu->isp_iov[pdu->isp_iovlen].iov_len =
1954 		    sizeof (data_digest_crc);
1955 		total_len		+= sizeof (data_digest_crc);
1956 		pdu->isp_iovlen++;
1957 	}
1958 
1959 	pdu->isp_data = (uint8_t *)(uintptr_t)pdu->isp_iov[0].iov_base;
1960 
1961 	if (idm_iov_sorecv(so_conn->ic_so, &pdu->isp_iov[0],
1962 	    pdu->isp_iovlen, total_len) != 0) {
1963 		return (IDM_STATUS_IO);
1964 	}
1965 
1966 	if ((ic->ic_conn_flags & IDM_CONN_DATA_DIGEST) != 0) {
1967 		crc_calculated = idm_crc32c(pdu->isp_data,
1968 		    pdu->isp_datalen);
1969 		if (pad_len) {
1970 			crc_calculated = idm_crc32c_continued((char *)&pad,
1971 			    pad_len, crc_calculated);
1972 		}
1973 		if (crc_calculated != data_digest_crc) {
1974 			IDM_CONN_LOG(CE_WARN,
1975 			    "idm_sorecvdata: "
1976 			    "CRC error: actual 0x%x, calc 0x%x",
1977 			    data_digest_crc, crc_calculated);
1978 
1979 			/* Invalid Data Digest */
1980 			return (IDM_STATUS_DATA_DIGEST);
1981 		}
1982 	}
1983 
1984 	return (IDM_STATUS_SUCCESS);
1985 }
1986 
1987 /*
1988  * idm_sorecv_scsidata() is used to receive scsi data from the socket. The
1989  * Data-type PDU header must be read into the idm_pdu_t structure prior to
1990  * calling this function.
1991  */
1992 idm_status_t
1993 idm_sorecv_scsidata(idm_conn_t *ic, idm_pdu_t *pdu)
1994 {
1995 	iscsi_data_hdr_t	*bhs;
1996 	idm_task_t		*task;
1997 	uint32_t		offset;
1998 	uint8_t			opcode;
1999 	uint32_t		dlength;
2000 	list_t			*buflst;
2001 	uint32_t		xfer_bytes;
2002 	idm_status_t		status;
2003 
2004 	ASSERT(ic != NULL);
2005 	ASSERT(pdu != NULL);
2006 
2007 	bhs	= (iscsi_data_hdr_t *)pdu->isp_hdr;
2008 
2009 	offset	= ntohl(bhs->offset);
2010 	opcode	= IDM_PDU_OPCODE(pdu);
2011 	dlength = n2h24(bhs->dlength);
2012 
2013 	ASSERT((opcode == ISCSI_OP_SCSI_DATA_RSP) ||
2014 	    (opcode == ISCSI_OP_SCSI_DATA));
2015 
2016 	/*
2017 	 * Successful lookup implicitly gets a "hold" on the task.  This
2018 	 * hold must be released before leaving this function.  At one
2019 	 * point we were caching this task context and retaining the hold
2020 	 * but it turned out to be very difficult to release the hold properly.
2021 	 * The task can be aborted and the connection shutdown between this
2022 	 * call and the subsequent expected call to idm_so_rx_datain/
2023 	 * idm_so_rx_dataout (in which case those functions are not called).
2024 	 * Releasing the hold in the PDU callback doesn't work well either
2025 	 * because the whole task may be completed by then at which point
2026 	 * it is too late to release the hold -- for better or worse this
2027 	 * code doesn't wait on the refcnts during normal operation.
2028 	 * idm_task_find() is very fast and it is not a huge burden if we
2029 	 * have to do it twice.
2030 	 */
2031 	task = idm_task_find(ic, bhs->itt, bhs->ttt);
2032 	if (task == NULL) {
2033 		IDM_CONN_LOG(CE_WARN,
2034 		    "idm_sorecv_scsidata: could not find task");
2035 		return (IDM_STATUS_FAIL);
2036 	}
2037 
2038 	mutex_enter(&task->idt_mutex);
2039 	buflst	= (opcode == ISCSI_OP_SCSI_DATA_RSP) ?
2040 	    &task->idt_inbufv : &task->idt_outbufv;
2041 	pdu->isp_sorx_buf = idm_buf_find(buflst, offset);
2042 	mutex_exit(&task->idt_mutex);
2043 
2044 	if (pdu->isp_sorx_buf == NULL) {
2045 		idm_task_rele(task);
2046 		IDM_CONN_LOG(CE_WARN, "idm_sorecv_scsidata: could not find "
2047 		    "buffer for offset %x opcode=%x",
2048 		    offset, opcode);
2049 		return (IDM_STATUS_FAIL);
2050 	}
2051 
2052 	xfer_bytes = idm_fill_iov(pdu, pdu->isp_sorx_buf, offset, dlength);
2053 	ASSERT(xfer_bytes != 0);
2054 	if (xfer_bytes != dlength) {
2055 		idm_task_rele(task);
2056 		/*
2057 		 * Buffer overflow, connection error.  The PDU data is still
2058 		 * sitting in the socket so we can't use the connection
2059 		 * again until that data is drained.
2060 		 */
2061 		return (IDM_STATUS_FAIL);
2062 	}
2063 
2064 	status = idm_sorecvdata(ic, pdu);
2065 
2066 	idm_task_rele(task);
2067 
2068 	return (status);
2069 }
2070 
2071 static uint32_t
2072 idm_fill_iov(idm_pdu_t *pdu, idm_buf_t *idb, uint32_t ro, uint32_t dlength)
2073 {
2074 	uint32_t	buf_ro = ro - idb->idb_bufoffset;
2075 	uint32_t	xfer_len = min(dlength, idb->idb_buflen - buf_ro);
2076 
2077 	ASSERT(ro >= idb->idb_bufoffset);
2078 
2079 	pdu->isp_iov[pdu->isp_iovlen].iov_base	=
2080 	    (caddr_t)idb->idb_buf + buf_ro;
2081 	pdu->isp_iov[pdu->isp_iovlen].iov_len	= xfer_len;
2082 	pdu->isp_iovlen++;
2083 
2084 	return (xfer_len);
2085 }
2086 
2087 int
2088 idm_sorecv_nonscsidata(idm_conn_t *ic, idm_pdu_t *pdu)
2089 {
2090 	pdu->isp_data = kmem_alloc(pdu->isp_datalen, KM_SLEEP);
2091 	ASSERT(pdu->isp_data != NULL);
2092 
2093 	pdu->isp_databuflen = pdu->isp_datalen;
2094 	pdu->isp_iov[0].iov_base = (caddr_t)pdu->isp_data;
2095 	pdu->isp_iov[0].iov_len = pdu->isp_datalen;
2096 	pdu->isp_iovlen = 1;
2097 	/*
2098 	 * Since we are associating a new data buffer with this received
2099 	 * PDU we need to set a specific callback to free the data
2100 	 * after the PDU is processed.
2101 	 */
2102 	pdu->isp_flags |= IDM_PDU_ADDL_DATA;
2103 	pdu->isp_callback = idm_sorx_addl_pdu_cb;
2104 
2105 	return (idm_sorecvdata(ic, pdu));
2106 }
2107 
2108 void
2109 idm_sorx_thread(void *arg)
2110 {
2111 	boolean_t	conn_failure = B_FALSE;
2112 	idm_conn_t	*ic = (idm_conn_t *)arg;
2113 	idm_so_conn_t	*so_conn;
2114 	idm_pdu_t	*pdu;
2115 	idm_status_t	rc;
2116 
2117 	idm_conn_hold(ic);
2118 
2119 	mutex_enter(&ic->ic_mutex);
2120 
2121 	so_conn = ic->ic_transport_private;
2122 	so_conn->ic_rx_thread_running = B_TRUE;
2123 	so_conn->ic_rx_thread_did = so_conn->ic_rx_thread->t_did;
2124 	cv_signal(&ic->ic_cv);
2125 
2126 	while (so_conn->ic_rx_thread_running) {
2127 		mutex_exit(&ic->ic_mutex);
2128 
2129 		/*
2130 		 * Get PDU with default header size (large enough for
2131 		 * BHS plus any anticipated AHS).  PDU from
2132 		 * the cache will have all values set correctly
2133 		 * for sockets RX including callback.
2134 		 */
2135 		pdu = kmem_cache_alloc(idm.idm_sorx_pdu_cache, KM_SLEEP);
2136 		pdu->isp_ic = ic;
2137 		pdu->isp_flags = 0;
2138 		pdu->isp_transport_hdrlen = 0;
2139 
2140 		if ((rc = idm_sorecvhdr(ic, pdu)) != 0) {
2141 			/*
2142 			 * Call idm_pdu_complete so that we call the callback
2143 			 * and ensure any memory allocated in idm_sorecvhdr
2144 			 * gets freed up.
2145 			 */
2146 			idm_pdu_complete(pdu, IDM_STATUS_FAIL);
2147 
2148 			/*
2149 			 * If ic_rx_thread_running is still set then
2150 			 * this is some kind of connection problem
2151 			 * on the socket.  In this case we want to
2152 			 * generate an event.  Otherwise some other
2153 			 * thread closed the socket due to another
2154 			 * issue in which case we don't need to
2155 			 * generate an event.
2156 			 */
2157 			mutex_enter(&ic->ic_mutex);
2158 			if (so_conn->ic_rx_thread_running) {
2159 				conn_failure = B_TRUE;
2160 				so_conn->ic_rx_thread_running = B_FALSE;
2161 			}
2162 
2163 			continue;
2164 		}
2165 
2166 		/*
2167 		 * Header has been read and validated.  Now we need
2168 		 * to read the PDU data payload (if present).  SCSI data
2169 		 * need to be transferred from the socket directly into
2170 		 * the associated transfer buffer for the SCSI task.
2171 		 */
2172 		if (pdu->isp_datalen != 0) {
2173 			if ((IDM_PDU_OPCODE(pdu) == ISCSI_OP_SCSI_DATA) ||
2174 			    (IDM_PDU_OPCODE(pdu) == ISCSI_OP_SCSI_DATA_RSP)) {
2175 				rc = idm_sorecv_scsidata(ic, pdu);
2176 				/*
2177 				 * All SCSI errors are fatal to the
2178 				 * connection right now since we have no
2179 				 * place to put the data.  What we need
2180 				 * is some kind of sink to dispose of unwanted
2181 				 * SCSI data.  For example an invalid task tag
2182 				 * should not kill the connection (although
2183 				 * we may want to drop the connection).
2184 				 */
2185 			} else {
2186 				/*
2187 				 * Not data PDUs so allocate a buffer for the
2188 				 * data segment and read the remaining data.
2189 				 */
2190 				rc = idm_sorecv_nonscsidata(ic, pdu);
2191 			}
2192 			if (rc != 0) {
2193 				/*
2194 				 * Call idm_pdu_complete so that we call the
2195 				 * callback and ensure any memory allocated
2196 				 * in idm_sorecvhdr gets freed up.
2197 				 */
2198 				idm_pdu_complete(pdu, IDM_STATUS_FAIL);
2199 
2200 				/*
2201 				 * If ic_rx_thread_running is still set then
2202 				 * this is some kind of connection problem
2203 				 * on the socket.  In this case we want to
2204 				 * generate an event.  Otherwise some other
2205 				 * thread closed the socket due to another
2206 				 * issue in which case we don't need to
2207 				 * generate an event.
2208 				 */
2209 				mutex_enter(&ic->ic_mutex);
2210 				if (so_conn->ic_rx_thread_running) {
2211 					conn_failure = B_TRUE;
2212 					so_conn->ic_rx_thread_running = B_FALSE;
2213 				}
2214 				continue;
2215 			}
2216 		}
2217 
2218 		/*
2219 		 * Process RX PDU
2220 		 */
2221 		idm_pdu_rx(ic, pdu);
2222 
2223 		mutex_enter(&ic->ic_mutex);
2224 	}
2225 
2226 	mutex_exit(&ic->ic_mutex);
2227 
2228 	/*
2229 	 * If we dropped out of the RX processing loop because of
2230 	 * a socket problem or other connection failure (including
2231 	 * digest errors) then we need to generate a state machine
2232 	 * event to shut the connection down.
2233 	 * If the state machine is already in, for example, INIT_ERROR, this
2234 	 * event will get dropped, and the TX thread will never be notified
2235 	 * to shut down.  To be safe, we'll just notify it here.
2236 	 */
2237 	if (conn_failure) {
2238 		if (so_conn->ic_tx_thread_running) {
2239 			so_conn->ic_tx_thread_running = B_FALSE;
2240 			mutex_enter(&so_conn->ic_tx_mutex);
2241 			cv_signal(&so_conn->ic_tx_cv);
2242 			mutex_exit(&so_conn->ic_tx_mutex);
2243 		}
2244 
2245 		idm_conn_event(ic, CE_TRANSPORT_FAIL, rc);
2246 	}
2247 
2248 	idm_conn_rele(ic);
2249 
2250 	thread_exit();
2251 }
2252 
2253 /*
2254  * idm_so_tx
2255  *
2256  * This is the implementation of idm_transport_ops_t's it_tx_pdu entry
2257  * point.  By definition, it is supposed to be fast.  So, simply queue
2258  * the entry and return.  The real work is done by idm_i_so_tx() via
2259  * idm_sotx_thread().
2260  */
2261 
2262 static void
2263 idm_so_tx(idm_conn_t *ic, idm_pdu_t *pdu)
2264 {
2265 	idm_so_conn_t *so_conn = ic->ic_transport_private;
2266 
2267 	ASSERT(pdu->isp_ic == ic);
2268 	mutex_enter(&so_conn->ic_tx_mutex);
2269 
2270 	if (!so_conn->ic_tx_thread_running) {
2271 		mutex_exit(&so_conn->ic_tx_mutex);
2272 		idm_pdu_complete(pdu, IDM_STATUS_ABORTED);
2273 		return;
2274 	}
2275 
2276 	list_insert_tail(&so_conn->ic_tx_list, (void *)pdu);
2277 	cv_signal(&so_conn->ic_tx_cv);
2278 	mutex_exit(&so_conn->ic_tx_mutex);
2279 }
2280 
2281 static idm_status_t
2282 idm_i_so_tx(idm_pdu_t *pdu)
2283 {
2284 	idm_conn_t	*ic = pdu->isp_ic;
2285 	idm_status_t	status = IDM_STATUS_SUCCESS;
2286 	uint8_t		pad[ISCSI_PAD_WORD_LEN];
2287 	int		pad_len;
2288 	uint32_t	hdr_digest_crc;
2289 	uint32_t	data_digest_crc = 0;
2290 	int		total_len = 0;
2291 	int		iovlen = 0;
2292 	struct iovec	iov[6];
2293 	idm_so_conn_t	*so_conn;
2294 
2295 	so_conn = ic->ic_transport_private;
2296 
2297 	/* Setup BHS */
2298 	iov[iovlen].iov_base	= (caddr_t)pdu->isp_hdr;
2299 	iov[iovlen].iov_len	= pdu->isp_hdrlen;
2300 	total_len		+= iov[iovlen].iov_len;
2301 	iovlen++;
2302 
2303 	/* Setup header digest */
2304 	if (((pdu->isp_flags & IDM_PDU_LOGIN_TX) == 0) &&
2305 	    (ic->ic_conn_flags & IDM_CONN_HEADER_DIGEST)) {
2306 		hdr_digest_crc = idm_crc32c(pdu->isp_hdr, pdu->isp_hdrlen);
2307 
2308 		iov[iovlen].iov_base	= (caddr_t)&hdr_digest_crc;
2309 		iov[iovlen].iov_len	= sizeof (hdr_digest_crc);
2310 		total_len		+= iov[iovlen].iov_len;
2311 		iovlen++;
2312 	}
2313 
2314 	/* Setup the data */
2315 	if (pdu->isp_datalen) {
2316 		idm_task_t		*idt;
2317 		idm_buf_t		*idb;
2318 		iscsi_data_hdr_t	*ihp;
2319 		ihp = (iscsi_data_hdr_t *)pdu->isp_hdr;
2320 		/* Write of immediate data */
2321 		if (ic->ic_ffp &&
2322 		    (IDM_PDU_OPCODE(pdu) == ISCSI_OP_SCSI_CMD ||
2323 		    IDM_PDU_OPCODE(pdu) == ISCSI_OP_SCSI_DATA)) {
2324 			idt = idm_task_find(ic, ihp->itt, ihp->ttt);
2325 			if (idt) {
2326 				mutex_enter(&idt->idt_mutex);
2327 				idb = idm_buf_find(&idt->idt_outbufv, 0);
2328 				mutex_exit(&idt->idt_mutex);
2329 				/*
2330 				 * If the initiator call to idm_buf_alloc
2331 				 * failed then we can get to this point
2332 				 * without a bound buffer.  The associated
2333 				 * connection failure will clean things up
2334 				 * later.  It would be nice to come up with
2335 				 * a cleaner way to handle this.  In
2336 				 * particular it seems absurd to look up
2337 				 * the task and the buffer just to update
2338 				 * this counter.
2339 				 */
2340 				if (idb)
2341 					idb->idb_xfer_len += pdu->isp_datalen;
2342 				idm_task_rele(idt);
2343 			}
2344 		}
2345 
2346 		iov[iovlen].iov_base = (caddr_t)pdu->isp_data;
2347 		iov[iovlen].iov_len  = pdu->isp_datalen;
2348 		total_len += iov[iovlen].iov_len;
2349 		iovlen++;
2350 	}
2351 
2352 	/* Setup the data pad if necessary */
2353 	pad_len = ((ISCSI_PAD_WORD_LEN -
2354 	    (pdu->isp_datalen & (ISCSI_PAD_WORD_LEN - 1))) &
2355 	    (ISCSI_PAD_WORD_LEN - 1));
2356 
2357 	if (pad_len) {
2358 		bzero(pad, sizeof (pad));
2359 		iov[iovlen].iov_base = (void *)&pad;
2360 		iov[iovlen].iov_len  = pad_len;
2361 		total_len		+= iov[iovlen].iov_len;
2362 		iovlen++;
2363 	}
2364 
2365 	/*
2366 	 * Setup the data digest if enabled.  Data-digest is not sent
2367 	 * for login-phase PDUs.
2368 	 */
2369 	if ((ic->ic_conn_flags & IDM_CONN_DATA_DIGEST) &&
2370 	    ((pdu->isp_flags & IDM_PDU_LOGIN_TX) == 0) &&
2371 	    (pdu->isp_datalen || pad_len)) {
2372 		/*
2373 		 * RFC3720/10.2.3: A zero-length Data Segment also
2374 		 * implies a zero-length data digest.
2375 		 */
2376 		if (pdu->isp_datalen) {
2377 			data_digest_crc = idm_crc32c(pdu->isp_data,
2378 			    pdu->isp_datalen);
2379 		}
2380 		if (pad_len) {
2381 			data_digest_crc = idm_crc32c_continued(&pad,
2382 			    pad_len, data_digest_crc);
2383 		}
2384 
2385 		iov[iovlen].iov_base	= (caddr_t)&data_digest_crc;
2386 		iov[iovlen].iov_len	= sizeof (data_digest_crc);
2387 		total_len		+= iov[iovlen].iov_len;
2388 		iovlen++;
2389 	}
2390 
2391 	/* Transmit the PDU */
2392 	if (idm_iov_sosend(so_conn->ic_so, &iov[0], iovlen,
2393 	    total_len) != 0) {
2394 		/* Set error status */
2395 		IDM_CONN_LOG(CE_WARN,
2396 		    "idm_so_tx: failed to transmit the PDU, so: %p ic: %p "
2397 		    "data: %p", (void *) so_conn->ic_so, (void *) ic,
2398 		    (void *) pdu->isp_data);
2399 		status = IDM_STATUS_IO;
2400 	}
2401 
2402 	/*
2403 	 * Success does not mean that the PDU actually reached the
2404 	 * remote node since it could get dropped along the way.
2405 	 */
2406 	idm_pdu_complete(pdu, status);
2407 
2408 	return (status);
2409 }
2410 
2411 /*
2412  * The idm_so_buf_tx_to_ini() is used by the target iSCSI layer to transmit the
2413  * Data-In PDUs using sockets. Based on the negotiated MaxRecvDataSegmentLength,
2414  * the buffer is segmented into a sequence of Data-In PDUs, ordered by DataSN.
2415  * A target can invoke this function multiple times for a single read command
2416  * (identified by the same ITT) to split the input into several sequences.
2417  *
2418  * DataSN starts with 0 for the first data PDU of an input command and advances
2419  * by 1 for each subsequent data PDU. Each sequence will have its own F bit,
2420  * which is set to 1 for the last data PDU of a sequence.
2421  * If the initiator supports phase collapse, the status bit must be set along
2422  * with the F bit to indicate that the status is shipped together with the last
2423  * Data-In PDU.
2424  *
2425  * The data PDUs within a sequence will be sent in order with the buffer offset
2426  * in increasing order. i.e. initiator and target must have negotiated the
2427  * "DataPDUInOrder" to "Yes". The order between sequences is not enforced.
2428  *
2429  * Caller holds idt->idt_mutex
2430  */
2431 static idm_status_t
2432 idm_so_buf_tx_to_ini(idm_task_t *idt, idm_buf_t *idb)
2433 {
2434 	idm_so_conn_t	*so_conn = idb->idb_ic->ic_transport_private;
2435 	idm_pdu_t	tmppdu;
2436 
2437 	ASSERT(mutex_owned(&idt->idt_mutex));
2438 
2439 	/*
2440 	 * Put the idm_buf_t on the tx queue.  It will be transmitted by
2441 	 * idm_sotx_thread.
2442 	 */
2443 	mutex_enter(&so_conn->ic_tx_mutex);
2444 
2445 	DTRACE_ISCSI_8(xfer__start, idm_conn_t *, idt->idt_ic,
2446 	    uintptr_t, idb->idb_buf, uint32_t, idb->idb_bufoffset,
2447 	    uint64_t, 0, uint32_t, 0, uint32_t, 0,
2448 	    uint32_t, idb->idb_xfer_len, int, XFER_BUF_TX_TO_INI);
2449 
2450 	if (!so_conn->ic_tx_thread_running) {
2451 		mutex_exit(&so_conn->ic_tx_mutex);
2452 		/*
2453 		 * Don't release idt->idt_mutex since we're supposed to hold
2454 		 * in when calling idm_buf_tx_to_ini_done
2455 		 */
2456 		DTRACE_ISCSI_8(xfer__done, idm_conn_t *, idt->idt_ic,
2457 		    uintptr_t, idb->idb_buf, uint32_t, idb->idb_bufoffset,
2458 		    uint64_t, 0, uint32_t, 0, uint32_t, 0,
2459 		    uint32_t, idb->idb_xfer_len,
2460 		    int, XFER_BUF_TX_TO_INI);
2461 		idm_buf_tx_to_ini_done(idt, idb, IDM_STATUS_ABORTED);
2462 		return (IDM_STATUS_FAIL);
2463 	}
2464 
2465 	/*
2466 	 * Build a template for the data PDU headers we will use so that
2467 	 * the SN values will stay consistent with other PDU's we are
2468 	 * transmitting like R2T and SCSI status.
2469 	 */
2470 	bzero(&idb->idb_data_hdr_tmpl, sizeof (iscsi_hdr_t));
2471 	tmppdu.isp_hdr = &idb->idb_data_hdr_tmpl;
2472 	(*idt->idt_ic->ic_conn_ops.icb_build_hdr)(idt, &tmppdu,
2473 	    ISCSI_OP_SCSI_DATA_RSP);
2474 	idb->idb_tx_thread = B_TRUE;
2475 	list_insert_tail(&so_conn->ic_tx_list, (void *)idb);
2476 	cv_signal(&so_conn->ic_tx_cv);
2477 	mutex_exit(&so_conn->ic_tx_mutex);
2478 	mutex_exit(&idt->idt_mutex);
2479 
2480 	/*
2481 	 * Returning success here indicates the transfer was successfully
2482 	 * dispatched -- it does not mean that the transfer completed
2483 	 * successfully.
2484 	 */
2485 	return (IDM_STATUS_SUCCESS);
2486 }
2487 
2488 /*
2489  * The idm_so_buf_rx_from_ini() is used by the target iSCSI layer to specify the
2490  * data blocks it is ready to receive from the initiator in response to a WRITE
2491  * SCSI command. The target iSCSI layer passes the information about the desired
2492  * data blocks to the initiator in one R2T PDU. The receiving buffer, the buffer
2493  * offset and datalen are passed via the 'idb' argument.
2494  *
2495  * Scope for Prototype build:
2496  * R2Ts are required for any Data-Out PDU, i.e. initiator and target must have
2497  * negotiated the "InitialR2T" to "Yes".
2498  *
2499  * Caller holds idt->idt_mutex
2500  */
2501 static idm_status_t
2502 idm_so_buf_rx_from_ini(idm_task_t *idt, idm_buf_t *idb)
2503 {
2504 	idm_pdu_t		*pdu;
2505 	iscsi_rtt_hdr_t		*rtt;
2506 
2507 	ASSERT(mutex_owned(&idt->idt_mutex));
2508 
2509 	DTRACE_ISCSI_8(xfer__start, idm_conn_t *, idt->idt_ic,
2510 	    uintptr_t, idb->idb_buf, uint32_t, idb->idb_bufoffset,
2511 	    uint64_t, 0, uint32_t, 0, uint32_t, 0,
2512 	    uint32_t, idb->idb_xfer_len, int, XFER_BUF_RX_FROM_INI);
2513 
2514 	pdu = kmem_cache_alloc(idm.idm_sotx_pdu_cache, KM_SLEEP);
2515 	pdu->isp_ic = idt->idt_ic;
2516 	pdu->isp_flags = IDM_PDU_SET_STATSN;
2517 	bzero(pdu->isp_hdr, sizeof (iscsi_rtt_hdr_t));
2518 
2519 	/* iSCSI layer fills the TTT, ITT, ExpCmdSN, MaxCmdSN */
2520 	(*idt->idt_ic->ic_conn_ops.icb_build_hdr)(idt, pdu, ISCSI_OP_RTT_RSP);
2521 
2522 	/* set the rttsn, rtt.flags, rtt.data_offset and rtt.data_length */
2523 	rtt = (iscsi_rtt_hdr_t *)(pdu->isp_hdr);
2524 
2525 	rtt->opcode		= ISCSI_OP_RTT_RSP;
2526 	rtt->flags		= ISCSI_FLAG_FINAL;
2527 	rtt->data_offset	= htonl(idb->idb_bufoffset);
2528 	rtt->data_length	= htonl(idb->idb_xfer_len);
2529 	rtt->rttsn		= htonl(idt->idt_exp_rttsn++);
2530 
2531 	/* Keep track of buffer offsets */
2532 	idb->idb_exp_offset	= idb->idb_bufoffset;
2533 	mutex_exit(&idt->idt_mutex);
2534 
2535 	/*
2536 	 * Transmit the PDU.
2537 	 */
2538 	idm_pdu_tx(pdu);
2539 
2540 	return (IDM_STATUS_SUCCESS);
2541 }
2542 
2543 static idm_status_t
2544 idm_so_buf_alloc(idm_buf_t *idb, uint64_t buflen)
2545 {
2546 	if ((buflen > IDM_SO_BUF_CACHE_LB) && (buflen <= IDM_SO_BUF_CACHE_UB)) {
2547 		idb->idb_buf = kmem_cache_alloc(idm.idm_so_128k_buf_cache,
2548 		    KM_NOSLEEP);
2549 		idb->idb_buf_private = idm.idm_so_128k_buf_cache;
2550 	} else {
2551 		idb->idb_buf = kmem_alloc(buflen, KM_NOSLEEP);
2552 		idb->idb_buf_private = NULL;
2553 	}
2554 
2555 	if (idb->idb_buf == NULL) {
2556 		IDM_CONN_LOG(CE_NOTE,
2557 		    "idm_so_buf_alloc: failed buffer allocation");
2558 		return (IDM_STATUS_FAIL);
2559 	}
2560 
2561 	return (IDM_STATUS_SUCCESS);
2562 }
2563 
2564 /* ARGSUSED */
2565 static idm_status_t
2566 idm_so_buf_setup(idm_buf_t *idb)
2567 {
2568 	/* Ensure bufalloc'd flag is unset */
2569 	idb->idb_bufalloc = B_FALSE;
2570 
2571 	return (IDM_STATUS_SUCCESS);
2572 }
2573 
2574 /* ARGSUSED */
2575 static void
2576 idm_so_buf_teardown(idm_buf_t *idb)
2577 {
2578 	/* nothing to do here */
2579 }
2580 
2581 static void
2582 idm_so_buf_free(idm_buf_t *idb)
2583 {
2584 	if (idb->idb_buf_private == NULL) {
2585 		kmem_free(idb->idb_buf, idb->idb_buflen);
2586 	} else {
2587 		kmem_cache_free(idb->idb_buf_private, idb->idb_buf);
2588 	}
2589 }
2590 
2591 static void
2592 idm_so_send_rtt_data(idm_conn_t *ic, idm_task_t *idt, idm_buf_t *idb,
2593     uint32_t offset, uint32_t length)
2594 {
2595 	idm_so_conn_t	*so_conn = ic->ic_transport_private;
2596 	idm_pdu_t	tmppdu;
2597 	idm_buf_t	*rtt_buf;
2598 
2599 	ASSERT(mutex_owned(&idt->idt_mutex));
2600 
2601 	/*
2602 	 * Allocate a buffer to represent the RTT transfer.  We could further
2603 	 * optimize this by allocating the buffers internally from an rtt
2604 	 * specific buffer cache since this is socket-specific code but for
2605 	 * now we will keep it simple.
2606 	 */
2607 	rtt_buf = idm_buf_alloc(ic, (uint8_t *)idb->idb_buf + offset, length);
2608 	if (rtt_buf == NULL) {
2609 		/*
2610 		 * If we're in FFP then the failure was likely a resource
2611 		 * allocation issue and we should close the connection by
2612 		 * sending a CE_TRANSPORT_FAIL event.
2613 		 *
2614 		 * If we're not in FFP then idm_buf_alloc will always
2615 		 * fail and the state is transitioning to "complete" anyway
2616 		 * so we won't bother to send an event.
2617 		 */
2618 		mutex_enter(&ic->ic_state_mutex);
2619 		if (ic->ic_ffp)
2620 			idm_conn_event_locked(ic, CE_TRANSPORT_FAIL,
2621 			    NULL, CT_NONE);
2622 		mutex_exit(&ic->ic_state_mutex);
2623 		mutex_exit(&idt->idt_mutex);
2624 		return;
2625 	}
2626 
2627 	rtt_buf->idb_buf_cb = NULL;
2628 	rtt_buf->idb_cb_arg = NULL;
2629 	rtt_buf->idb_bufoffset = offset;
2630 	rtt_buf->idb_xfer_len = length;
2631 	rtt_buf->idb_ic = idt->idt_ic;
2632 	rtt_buf->idb_task_binding = idt;
2633 
2634 	/*
2635 	 * The new buffer (if any) represents an additional
2636 	 * reference on the task
2637 	 */
2638 	idm_task_hold(idt);
2639 	mutex_exit(&idt->idt_mutex);
2640 
2641 	/*
2642 	 * Put the idm_buf_t on the tx queue.  It will be transmitted by
2643 	 * idm_sotx_thread.
2644 	 */
2645 	mutex_enter(&so_conn->ic_tx_mutex);
2646 
2647 	if (!so_conn->ic_tx_thread_running) {
2648 		idm_buf_free(rtt_buf);
2649 		mutex_exit(&so_conn->ic_tx_mutex);
2650 		idm_task_rele(idt);
2651 		return;
2652 	}
2653 
2654 	/*
2655 	 * Build a template for the data PDU headers we will use so that
2656 	 * the SN values will stay consistent with other PDU's we are
2657 	 * transmitting like R2T and SCSI status.
2658 	 */
2659 	bzero(&rtt_buf->idb_data_hdr_tmpl, sizeof (iscsi_hdr_t));
2660 	tmppdu.isp_hdr = &rtt_buf->idb_data_hdr_tmpl;
2661 	(*idt->idt_ic->ic_conn_ops.icb_build_hdr)(idt, &tmppdu,
2662 	    ISCSI_OP_SCSI_DATA);
2663 	rtt_buf->idb_tx_thread = B_TRUE;
2664 	rtt_buf->idb_in_transport = B_TRUE;
2665 	list_insert_tail(&so_conn->ic_tx_list, (void *)rtt_buf);
2666 	cv_signal(&so_conn->ic_tx_cv);
2667 	mutex_exit(&so_conn->ic_tx_mutex);
2668 }
2669 
2670 static void
2671 idm_so_send_rtt_data_done(idm_task_t *idt, idm_buf_t *idb)
2672 {
2673 	/*
2674 	 * Don't worry about status -- we assume any error handling
2675 	 * is performed by the caller (idm_sotx_thread).
2676 	 */
2677 	idb->idb_in_transport = B_FALSE;
2678 	idm_task_rele(idt);
2679 	idm_buf_free(idb);
2680 }
2681 
2682 static idm_status_t
2683 idm_so_send_buf_region(idm_task_t *idt, idm_buf_t *idb,
2684     uint32_t buf_region_offset, uint32_t buf_region_length)
2685 {
2686 	idm_conn_t		*ic;
2687 	uint32_t		max_dataseglen;
2688 	size_t			remainder, chunk;
2689 	uint32_t		data_offset = buf_region_offset;
2690 	iscsi_data_hdr_t	*bhs;
2691 	idm_pdu_t		*pdu;
2692 	idm_status_t		tx_status;
2693 
2694 	ASSERT(mutex_owned(&idt->idt_mutex));
2695 
2696 	ic = idt->idt_ic;
2697 
2698 	max_dataseglen = ic->ic_conn_params.max_xmit_dataseglen;
2699 	remainder = buf_region_length;
2700 
2701 	while (remainder) {
2702 		if (idt->idt_state != TASK_ACTIVE) {
2703 			ASSERT((idt->idt_state != TASK_IDLE) &&
2704 			    (idt->idt_state != TASK_COMPLETE));
2705 			return (IDM_STATUS_ABORTED);
2706 		}
2707 
2708 		/* check to see if we need to chunk the data */
2709 		if (remainder > max_dataseglen) {
2710 			chunk = max_dataseglen;
2711 		} else {
2712 			chunk = remainder;
2713 		}
2714 
2715 		/* Data PDU headers will always be sizeof (iscsi_hdr_t) */
2716 		pdu = kmem_cache_alloc(idm.idm_sotx_pdu_cache, KM_SLEEP);
2717 		pdu->isp_ic = ic;
2718 		pdu->isp_flags = 0;	/* initialize isp_flags */
2719 
2720 		/*
2721 		 * We've already built a build a header template
2722 		 * to use during the transfer.  Use this template so that
2723 		 * the SN values stay consistent with any unrelated PDU's
2724 		 * being transmitted.
2725 		 */
2726 		bcopy(&idb->idb_data_hdr_tmpl, pdu->isp_hdr,
2727 		    sizeof (iscsi_hdr_t));
2728 
2729 		/*
2730 		 * Set DataSN, data offset, and flags in BHS
2731 		 * For the prototype build, A = 0, S = 0, U = 0
2732 		 */
2733 		bhs = (iscsi_data_hdr_t *)(pdu->isp_hdr);
2734 
2735 		bhs->datasn		= htonl(idt->idt_exp_datasn++);
2736 
2737 		hton24(bhs->dlength, chunk);
2738 		bhs->offset = htonl(idb->idb_bufoffset + data_offset);
2739 
2740 		/* setup data */
2741 		pdu->isp_data	=  (uint8_t *)idb->idb_buf + data_offset;
2742 		pdu->isp_datalen = (uint_t)chunk;
2743 
2744 		if (chunk == remainder) {
2745 			bhs->flags = ISCSI_FLAG_FINAL; /* F bit set to 1 */
2746 			/* Piggyback the status with the last data PDU */
2747 			if (idt->idt_flags & IDM_TASK_PHASECOLLAPSE_REQ) {
2748 				pdu->isp_flags |= IDM_PDU_SET_STATSN |
2749 				    IDM_PDU_ADVANCE_STATSN;
2750 				(*idt->idt_ic->ic_conn_ops.icb_update_statsn)
2751 				    (idt, pdu);
2752 				idt->idt_flags |=
2753 				    IDM_TASK_PHASECOLLAPSE_SUCCESS;
2754 
2755 			}
2756 		}
2757 
2758 		remainder	-= chunk;
2759 		data_offset	+= chunk;
2760 
2761 		/* Instrument the data-send DTrace probe. */
2762 		if (IDM_PDU_OPCODE(pdu) == ISCSI_OP_SCSI_DATA_RSP) {
2763 			DTRACE_ISCSI_2(data__send,
2764 			    idm_conn_t *, idt->idt_ic,
2765 			    iscsi_data_rsp_hdr_t *,
2766 			    (iscsi_data_rsp_hdr_t *)pdu->isp_hdr);
2767 		}
2768 
2769 		/*
2770 		 * Now that we're done working with idt_exp_datasn,
2771 		 * idt->idt_state and idb->idb_bufoffset we can release
2772 		 * the task lock -- don't want to hold it across the
2773 		 * call to idm_i_so_tx since we could block.
2774 		 */
2775 		mutex_exit(&idt->idt_mutex);
2776 
2777 		/*
2778 		 * Transmit the PDU.  Call the internal routine directly
2779 		 * as there is already implicit ordering.
2780 		 */
2781 		if ((tx_status = idm_i_so_tx(pdu)) != IDM_STATUS_SUCCESS) {
2782 			mutex_enter(&idt->idt_mutex);
2783 			return (tx_status);
2784 		}
2785 
2786 		mutex_enter(&idt->idt_mutex);
2787 		idt->idt_tx_bytes += chunk;
2788 	}
2789 
2790 	return (IDM_STATUS_SUCCESS);
2791 }
2792 
2793 /*
2794  * TX PDU cache
2795  */
2796 /* ARGSUSED */
2797 int
2798 idm_sotx_pdu_constructor(void *hdl, void *arg, int flags)
2799 {
2800 	idm_pdu_t	*pdu = hdl;
2801 
2802 	bzero(pdu, sizeof (idm_pdu_t));
2803 	pdu->isp_hdr = (iscsi_hdr_t *)(pdu + 1); /* Ptr arithmetic */
2804 	pdu->isp_hdrlen = sizeof (iscsi_hdr_t);
2805 	pdu->isp_callback = idm_sotx_cache_pdu_cb;
2806 	pdu->isp_magic = IDM_PDU_MAGIC;
2807 	bzero(pdu->isp_hdr, sizeof (iscsi_hdr_t));
2808 
2809 	return (0);
2810 }
2811 
2812 /* ARGSUSED */
2813 void
2814 idm_sotx_cache_pdu_cb(idm_pdu_t *pdu, idm_status_t status)
2815 {
2816 	/* reset values between use */
2817 	pdu->isp_datalen = 0;
2818 
2819 	kmem_cache_free(idm.idm_sotx_pdu_cache, pdu);
2820 }
2821 
2822 /*
2823  * RX PDU cache
2824  */
2825 /* ARGSUSED */
2826 int
2827 idm_sorx_pdu_constructor(void *hdl, void *arg, int flags)
2828 {
2829 	idm_pdu_t	*pdu = hdl;
2830 
2831 	bzero(pdu, sizeof (idm_pdu_t));
2832 	pdu->isp_magic = IDM_PDU_MAGIC;
2833 	pdu->isp_hdr = (iscsi_hdr_t *)(pdu + 1); /* Ptr arithmetic */
2834 	pdu->isp_callback = idm_sorx_cache_pdu_cb;
2835 
2836 	return (0);
2837 }
2838 
2839 /* ARGSUSED */
2840 static void
2841 idm_sorx_cache_pdu_cb(idm_pdu_t *pdu, idm_status_t status)
2842 {
2843 	pdu->isp_iovlen = 0;
2844 	pdu->isp_sorx_buf = 0;
2845 	kmem_cache_free(idm.idm_sorx_pdu_cache, pdu);
2846 }
2847 
2848 static void
2849 idm_sorx_addl_pdu_cb(idm_pdu_t *pdu, idm_status_t status)
2850 {
2851 	/*
2852 	 * We had to modify our cached RX PDU with a longer header buffer
2853 	 * and/or a longer data buffer.  Release the new buffers and fix
2854 	 * the fields back to what we would expect for a cached RX PDU.
2855 	 */
2856 	if (pdu->isp_flags & IDM_PDU_ADDL_HDR) {
2857 		kmem_free(pdu->isp_hdr, pdu->isp_hdrlen);
2858 	}
2859 	if (pdu->isp_flags & IDM_PDU_ADDL_DATA) {
2860 		kmem_free(pdu->isp_data, pdu->isp_datalen);
2861 	}
2862 	pdu->isp_hdr = (iscsi_hdr_t *)(pdu + 1);
2863 	pdu->isp_hdrlen = sizeof (iscsi_hdr_t);
2864 	pdu->isp_data = NULL;
2865 	pdu->isp_datalen = 0;
2866 	pdu->isp_sorx_buf = 0;
2867 	pdu->isp_callback = idm_sorx_cache_pdu_cb;
2868 	idm_sorx_cache_pdu_cb(pdu, status);
2869 }
2870 
2871 /*
2872  * This thread is only active when I/O is queued for transmit
2873  * because the socket is busy.
2874  */
2875 void
2876 idm_sotx_thread(void *arg)
2877 {
2878 	idm_conn_t	*ic = arg;
2879 	idm_tx_obj_t	*object, *next;
2880 	idm_so_conn_t	*so_conn;
2881 	idm_status_t	status = IDM_STATUS_SUCCESS;
2882 
2883 	idm_conn_hold(ic);
2884 
2885 	mutex_enter(&ic->ic_mutex);
2886 	so_conn = ic->ic_transport_private;
2887 	so_conn->ic_tx_thread_running = B_TRUE;
2888 	so_conn->ic_tx_thread_did = so_conn->ic_tx_thread->t_did;
2889 	cv_signal(&ic->ic_cv);
2890 	mutex_exit(&ic->ic_mutex);
2891 
2892 	mutex_enter(&so_conn->ic_tx_mutex);
2893 
2894 	while (so_conn->ic_tx_thread_running) {
2895 		while (list_is_empty(&so_conn->ic_tx_list)) {
2896 			DTRACE_PROBE1(soconn__tx__sleep, idm_conn_t *, ic);
2897 			cv_wait(&so_conn->ic_tx_cv, &so_conn->ic_tx_mutex);
2898 			DTRACE_PROBE1(soconn__tx__wakeup, idm_conn_t *, ic);
2899 
2900 			if (!so_conn->ic_tx_thread_running) {
2901 				goto tx_bail;
2902 			}
2903 		}
2904 
2905 		object = (idm_tx_obj_t *)list_head(&so_conn->ic_tx_list);
2906 		list_remove(&so_conn->ic_tx_list, object);
2907 		mutex_exit(&so_conn->ic_tx_mutex);
2908 
2909 		switch (object->idm_tx_obj_magic) {
2910 		case IDM_PDU_MAGIC: {
2911 			idm_pdu_t *pdu = (idm_pdu_t *)object;
2912 			DTRACE_PROBE2(soconn__tx__pdu, idm_conn_t *, ic,
2913 			    idm_pdu_t *, (idm_pdu_t *)object);
2914 
2915 			if (pdu->isp_flags & IDM_PDU_SET_STATSN) {
2916 				/* No IDM task */
2917 				(ic->ic_conn_ops.icb_update_statsn)(NULL, pdu);
2918 			}
2919 			status = idm_i_so_tx((idm_pdu_t *)object);
2920 			break;
2921 		}
2922 		case IDM_BUF_MAGIC: {
2923 			idm_buf_t *idb = (idm_buf_t *)object;
2924 			idm_task_t *idt = idb->idb_task_binding;
2925 
2926 			DTRACE_PROBE2(soconn__tx__buf, idm_conn_t *, ic,
2927 			    idm_buf_t *, idb);
2928 
2929 			mutex_enter(&idt->idt_mutex);
2930 			status = idm_so_send_buf_region(idt,
2931 			    idb, 0, idb->idb_xfer_len);
2932 
2933 			/*
2934 			 * TX thread owns the buffer so we expect it to
2935 			 * be "in transport"
2936 			 */
2937 			ASSERT(idb->idb_in_transport);
2938 			if (IDM_CONN_ISTGT(ic)) {
2939 				/*
2940 				 * idm_buf_tx_to_ini_done releases
2941 				 * idt->idt_mutex
2942 				 */
2943 				DTRACE_ISCSI_8(xfer__done,
2944 				    idm_conn_t *, idt->idt_ic,
2945 				    uintptr_t, idb->idb_buf,
2946 				    uint32_t, idb->idb_bufoffset,
2947 				    uint64_t, 0, uint32_t, 0, uint32_t, 0,
2948 				    uint32_t, idb->idb_xfer_len,
2949 				    int, XFER_BUF_TX_TO_INI);
2950 				idm_buf_tx_to_ini_done(idt, idb, status);
2951 			} else {
2952 				idm_so_send_rtt_data_done(idt, idb);
2953 				mutex_exit(&idt->idt_mutex);
2954 			}
2955 			break;
2956 		}
2957 
2958 		default:
2959 			IDM_CONN_LOG(CE_WARN, "idm_sotx_thread: Unknown magic "
2960 			    "(0x%08x)", object->idm_tx_obj_magic);
2961 			status = IDM_STATUS_FAIL;
2962 		}
2963 
2964 		mutex_enter(&so_conn->ic_tx_mutex);
2965 
2966 		if (status != IDM_STATUS_SUCCESS) {
2967 			so_conn->ic_tx_thread_running = B_FALSE;
2968 			idm_conn_event(ic, CE_TRANSPORT_FAIL, status);
2969 		}
2970 	}
2971 
2972 	/*
2973 	 * Before we leave, we need to abort every item remaining in the
2974 	 * TX list.
2975 	 */
2976 
2977 tx_bail:
2978 	object = (idm_tx_obj_t *)list_head(&so_conn->ic_tx_list);
2979 
2980 	while (object != NULL) {
2981 		next = list_next(&so_conn->ic_tx_list, object);
2982 
2983 		list_remove(&so_conn->ic_tx_list, object);
2984 		switch (object->idm_tx_obj_magic) {
2985 		case IDM_PDU_MAGIC:
2986 			idm_pdu_complete((idm_pdu_t *)object,
2987 			    IDM_STATUS_ABORTED);
2988 			break;
2989 
2990 		case IDM_BUF_MAGIC: {
2991 			idm_buf_t *idb = (idm_buf_t *)object;
2992 			idm_task_t *idt = idb->idb_task_binding;
2993 			mutex_exit(&so_conn->ic_tx_mutex);
2994 			mutex_enter(&idt->idt_mutex);
2995 			/*
2996 			 * TX thread owns the buffer so we expect it to
2997 			 * be "in transport"
2998 			 */
2999 			ASSERT(idb->idb_in_transport);
3000 			if (IDM_CONN_ISTGT(ic)) {
3001 				/*
3002 				 * idm_buf_tx_to_ini_done releases
3003 				 * idt->idt_mutex
3004 				 */
3005 				DTRACE_ISCSI_8(xfer__done,
3006 				    idm_conn_t *, idt->idt_ic,
3007 				    uintptr_t, idb->idb_buf,
3008 				    uint32_t, idb->idb_bufoffset,
3009 				    uint64_t, 0, uint32_t, 0, uint32_t, 0,
3010 				    uint32_t, idb->idb_xfer_len,
3011 				    int, XFER_BUF_TX_TO_INI);
3012 				idm_buf_tx_to_ini_done(idt, idb,
3013 				    IDM_STATUS_ABORTED);
3014 			} else {
3015 				idm_so_send_rtt_data_done(idt, idb);
3016 				mutex_exit(&idt->idt_mutex);
3017 			}
3018 			mutex_enter(&so_conn->ic_tx_mutex);
3019 			break;
3020 		}
3021 		default:
3022 			IDM_CONN_LOG(CE_WARN,
3023 			    "idm_sotx_thread: Unexpected magic "
3024 			    "(0x%08x)", object->idm_tx_obj_magic);
3025 		}
3026 
3027 		object = next;
3028 	}
3029 
3030 	mutex_exit(&so_conn->ic_tx_mutex);
3031 	idm_conn_rele(ic);
3032 	thread_exit();
3033 	/*NOTREACHED*/
3034 }
3035 
3036 static void
3037 idm_so_socket_set_nonblock(struct sonode *node)
3038 {
3039 	(void) VOP_SETFL(node->so_vnode, node->so_flag,
3040 	    (node->so_state | FNONBLOCK), CRED(), NULL);
3041 }
3042 
3043 static void
3044 idm_so_socket_set_block(struct sonode *node)
3045 {
3046 	(void) VOP_SETFL(node->so_vnode, node->so_flag,
3047 	    (node->so_state & (~FNONBLOCK)), CRED(), NULL);
3048 }
3049 
3050 
3051 /*
3052  * Called by kernel sockets when the connection has been accepted or
3053  * rejected. In early volo, a "disconnect" callback was sent instead of
3054  * "connectfailed", so we check for both.
3055  */
3056 /* ARGSUSED */
3057 void
3058 idm_so_timed_socket_connect_cb(ksocket_t ks,
3059     ksocket_callback_event_t ev, void *arg, uintptr_t info)
3060 {
3061 	idm_so_timed_socket_t	*itp = arg;
3062 	ASSERT(itp != NULL);
3063 	ASSERT(ev == KSOCKET_EV_CONNECTED ||
3064 	    ev == KSOCKET_EV_CONNECTFAILED ||
3065 	    ev == KSOCKET_EV_DISCONNECTED);
3066 
3067 	mutex_enter(&idm_so_timed_socket_mutex);
3068 	itp->it_callback_called = B_TRUE;
3069 	if (ev == KSOCKET_EV_CONNECTED) {
3070 		itp->it_socket_error_code = 0;
3071 	} else {
3072 		/* Make sure the error code is non-zero on error */
3073 		if (info == 0)
3074 			info = ECONNRESET;
3075 		itp->it_socket_error_code = (int)info;
3076 	}
3077 	cv_signal(&itp->it_cv);
3078 	mutex_exit(&idm_so_timed_socket_mutex);
3079 }
3080 
3081 int
3082 idm_so_timed_socket_connect(ksocket_t ks,
3083     struct sockaddr_storage *sa, int sa_sz, int login_max_usec)
3084 {
3085 	clock_t			conn_login_max;
3086 	int			rc, nonblocking, rval;
3087 	idm_so_timed_socket_t	it;
3088 	ksocket_callbacks_t	ks_cb;
3089 
3090 	conn_login_max = ddi_get_lbolt() + drv_usectohz(login_max_usec);
3091 
3092 	/*
3093 	 * Set to non-block socket mode, with callback on connect
3094 	 * Early volo used "disconnected" instead of "connectfailed",
3095 	 * so set callback to look for both.
3096 	 */
3097 	bzero(&it, sizeof (it));
3098 	ks_cb.ksock_cb_flags = KSOCKET_CB_CONNECTED |
3099 	    KSOCKET_CB_CONNECTFAILED | KSOCKET_CB_DISCONNECTED;
3100 	ks_cb.ksock_cb_connected = idm_so_timed_socket_connect_cb;
3101 	ks_cb.ksock_cb_connectfailed = idm_so_timed_socket_connect_cb;
3102 	ks_cb.ksock_cb_disconnected = idm_so_timed_socket_connect_cb;
3103 	cv_init(&it.it_cv, NULL, CV_DEFAULT, NULL);
3104 	rc = ksocket_setcallbacks(ks, &ks_cb, &it, CRED());
3105 	if (rc != 0)
3106 		return (rc);
3107 
3108 	/* Set to non-blocking mode */
3109 	nonblocking = 1;
3110 	rc = ksocket_ioctl(ks, FIONBIO, (intptr_t)&nonblocking, &rval,
3111 	    CRED());
3112 	if (rc != 0)
3113 		goto cleanup;
3114 
3115 	bzero(&it, sizeof (it));
3116 	for (;;) {
3117 		/*
3118 		 * Warning -- in a loopback scenario, the call to
3119 		 * the connect_cb can occur inside the call to
3120 		 * ksocket_connect. Do not hold the mutex around the
3121 		 * call to ksocket_connect.
3122 		 */
3123 		rc = ksocket_connect(ks, (struct sockaddr *)sa, sa_sz, CRED());
3124 		if (rc == 0 || rc == EISCONN) {
3125 			/* socket success or already success */
3126 			rc = 0;
3127 			break;
3128 		}
3129 		if ((rc != EINPROGRESS) && (rc != EALREADY)) {
3130 			break;
3131 		}
3132 
3133 		/* TCP connect still in progress. See if out of time. */
3134 		if (ddi_get_lbolt() > conn_login_max) {
3135 			/*
3136 			 * Connection retry timeout,
3137 			 * failed connect to target.
3138 			 */
3139 			rc = ETIMEDOUT;
3140 			break;
3141 		}
3142 
3143 		/*
3144 		 * TCP connect still in progress.  Sleep until callback.
3145 		 * Do NOT go to sleep if the callback already occurred!
3146 		 */
3147 		mutex_enter(&idm_so_timed_socket_mutex);
3148 		if (!it.it_callback_called) {
3149 			(void) cv_timedwait(&it.it_cv,
3150 			    &idm_so_timed_socket_mutex, conn_login_max);
3151 		}
3152 		if (it.it_callback_called) {
3153 			rc = it.it_socket_error_code;
3154 			mutex_exit(&idm_so_timed_socket_mutex);
3155 			break;
3156 		}
3157 		/* If timer expires, go call ksocket_connect one last time. */
3158 		mutex_exit(&idm_so_timed_socket_mutex);
3159 	}
3160 
3161 	/* resume blocking mode */
3162 	nonblocking = 0;
3163 	(void) ksocket_ioctl(ks, FIONBIO, (intptr_t)&nonblocking, &rval,
3164 	    CRED());
3165 cleanup:
3166 	(void) ksocket_setcallbacks(ks, NULL, NULL, CRED());
3167 	cv_destroy(&it.it_cv);
3168 	if (rc != 0) {
3169 		idm_soshutdown(ks);
3170 	}
3171 	return (rc);
3172 }
3173 
3174 
3175 void
3176 idm_addr_to_sa(idm_addr_t *dportal, struct sockaddr_storage *sa)
3177 {
3178 	int			dp_addr_size;
3179 	struct sockaddr_in	*sin;
3180 	struct sockaddr_in6	*sin6;
3181 
3182 	/* Build sockaddr_storage for this portal (idm_addr_t) */
3183 	bzero(sa, sizeof (*sa));
3184 	dp_addr_size = dportal->a_addr.i_insize;
3185 	if (dp_addr_size == sizeof (struct in_addr)) {
3186 		/* IPv4 */
3187 		sa->ss_family = AF_INET;
3188 		sin = (struct sockaddr_in *)sa;
3189 		sin->sin_port = htons(dportal->a_port);
3190 		bcopy(&dportal->a_addr.i_addr.in4,
3191 		    &sin->sin_addr, sizeof (struct in_addr));
3192 	} else if (dp_addr_size == sizeof (struct in6_addr)) {
3193 		/* IPv6 */
3194 		sa->ss_family = AF_INET6;
3195 		sin6 = (struct sockaddr_in6 *)sa;
3196 		sin6->sin6_port = htons(dportal->a_port);
3197 		bcopy(&dportal->a_addr.i_addr.in6,
3198 		    &sin6->sin6_addr, sizeof (struct in6_addr));
3199 	} else {
3200 		ASSERT(0);
3201 	}
3202 }
3203 
3204 
/*
 * Return a human-readable form of a sockaddr_storage, in the form
 * [ip-address].port (for example "[10.0.0.1].3260").  This is used in
 * calls to logging functions.  If several calls to idm_sa_ntop are made
 * within the same invocation of a logging function, then each one needs
 * its own buf.
 *
 * The text is written into buf (at most size bytes, terminator included)
 * and buf is returned.  The placeholder "[0].-1" is written instead when
 * the address family is neither AF_INET nor AF_INET6, when the address
 * cannot be converted to text, or when size is too small to hold the
 * address together with the longest possible port.
 */
const char *
idm_sa_ntop(const struct sockaddr_storage *sa,
    char *buf, size_t size)
{
	static const char bogus_ip[] = "[0].-1";
	char tmp[INET6_ADDRSTRLEN];
	const void *addr;
	in_port_t port;

	/* Pick the address and port out of the family-specific layout. */
	switch (sa->ss_family) {
	case AF_INET6: {
		const struct sockaddr_in6 *in6 =
		    (const struct sockaddr_in6 *)sa;

		addr = &in6->sin6_addr;
		port = ntohs(in6->sin6_port);
		break;
	}
	case AF_INET: {
		const struct sockaddr_in *in = (const struct sockaddr_in *)sa;

		addr = &in->sin_addr;
		port = ntohs(in->sin_port);
		break;
	}
	default:
		goto err;
	}

	/*
	 * tmp only holds a string if the conversion succeeded, so it must
	 * not be measured or printed otherwise.
	 */
	if (inet_ntop(sa->ss_family, addr, tmp, sizeof (tmp)) == NULL)
		goto err;

	/* Need room for the brackets, the longest port, and the NUL. */
	if (strlen(tmp) + sizeof ("[].65535") > size)
		goto err;

	(void) snprintf(buf, size, "[%s].%u", tmp, (unsigned int)port);
	return (buf);

err:
	(void) snprintf(buf, size, "%s", bogus_ip);
	return (buf);
}
3250