xref: /titanic_44/usr/src/uts/common/io/idm/idm_so.c (revision 682cb1044237d21ad6810702564bec833b8c410c)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #include <sys/conf.h>
27 #include <sys/stat.h>
28 #include <sys/file.h>
29 #include <sys/ddi.h>
30 #include <sys/sunddi.h>
31 #include <sys/modctl.h>
32 #include <sys/priv.h>
33 #include <sys/cpuvar.h>
34 #include <sys/socket.h>
35 #include <sys/strsubr.h>
36 #include <sys/sysmacros.h>
37 #include <sys/sdt.h>
38 #include <netinet/tcp.h>
39 #include <inet/tcp.h>
40 #include <sys/socketvar.h>
41 #include <sys/pathname.h>
42 #include <sys/fs/snode.h>
43 #include <sys/fs/dv_node.h>
44 #include <sys/vnode.h>
45 #include <netinet/in.h>
46 #include <net/if.h>
47 #include <sys/sockio.h>
48 #include <sys/ksocket.h>
49 #include <sys/filio.h>		/* FIONBIO */
50 #include <sys/iscsi_protocol.h>
51 #include <sys/idm/idm.h>
52 #include <sys/idm/idm_so.h>
53 #include <sys/idm/idm_text.h>
54 
55 #define	IN_PROGRESS_DELAY	1
56 
57 /*
58  * in6addr_any is currently all zeroes, but use the macro in case this
59  * ever changes.
60  */
61 static const struct in6_addr in6addr_any = IN6ADDR_ANY_INIT;
62 
63 static void idm_sorx_cache_pdu_cb(idm_pdu_t *pdu, idm_status_t status);
64 static void idm_sorx_addl_pdu_cb(idm_pdu_t *pdu, idm_status_t status);
65 static void idm_sotx_cache_pdu_cb(idm_pdu_t *pdu, idm_status_t status);
66 
67 static idm_status_t idm_so_conn_create_common(idm_conn_t *ic, ksocket_t new_so);
68 static void idm_so_conn_destroy_common(idm_conn_t *ic);
69 static void idm_so_conn_connect_common(idm_conn_t *ic);
70 
71 static void idm_set_ini_preconnect_options(idm_so_conn_t *sc);
72 static void idm_set_ini_postconnect_options(idm_so_conn_t *sc);
73 static void idm_set_tgt_connect_options(ksocket_t so);
74 static idm_status_t idm_i_so_tx(idm_pdu_t *pdu);
75 
76 static idm_status_t idm_sorecvdata(idm_conn_t *ic, idm_pdu_t *pdu);
77 static void idm_so_send_rtt_data(idm_conn_t *ic, idm_task_t *idt,
78     idm_buf_t *idb, uint32_t offset, uint32_t length);
79 static void idm_so_send_rtt_data_done(idm_task_t *idt, idm_buf_t *idb);
80 static idm_status_t idm_so_send_buf_region(idm_task_t *idt,
81     idm_buf_t *idb, uint32_t buf_region_offset, uint32_t buf_region_length);
82 
83 static uint32_t idm_fill_iov(idm_pdu_t *pdu, idm_buf_t *idb,
84     uint32_t ro, uint32_t dlength);
85 
86 static idm_status_t idm_so_handle_digest(idm_conn_t *it,
87     nvpair_t *digest_choice, const idm_kv_xlate_t *ikvx);
88 
89 static void idm_so_socket_set_nonblock(struct sonode *node);
90 static void idm_so_socket_set_block(struct sonode *node);
91 
92 /*
93  * Transport ops prototypes
94  */
95 static void idm_so_tx(idm_conn_t *ic, idm_pdu_t *pdu);
96 static idm_status_t idm_so_buf_tx_to_ini(idm_task_t *idt, idm_buf_t *idb);
97 static idm_status_t idm_so_buf_rx_from_ini(idm_task_t *idt, idm_buf_t *idb);
98 static void idm_so_rx_datain(idm_conn_t *ic, idm_pdu_t *pdu);
99 static void idm_so_rx_rtt(idm_conn_t *ic, idm_pdu_t *pdu);
100 static void idm_so_rx_dataout(idm_conn_t *ic, idm_pdu_t *pdu);
101 static idm_status_t idm_so_free_task_rsrc(idm_task_t *idt);
102 static kv_status_t idm_so_negotiate_key_values(idm_conn_t *it,
103     nvlist_t *request_nvl, nvlist_t *response_nvl, nvlist_t *negotiated_nvl);
104 static void idm_so_notice_key_values(idm_conn_t *it,
105     nvlist_t *negotiated_nvl);
106 static kv_status_t idm_so_declare_key_values(idm_conn_t *it,
107     nvlist_t *config_nvl, nvlist_t *outgoing_nvl);
108 static boolean_t idm_so_conn_is_capable(idm_conn_req_t *ic,
109     idm_transport_caps_t *caps);
110 static idm_status_t idm_so_buf_alloc(idm_buf_t *idb, uint64_t buflen);
111 static void idm_so_buf_free(idm_buf_t *idb);
112 static idm_status_t idm_so_buf_setup(idm_buf_t *idb);
113 static void idm_so_buf_teardown(idm_buf_t *idb);
114 static idm_status_t idm_so_tgt_svc_create(idm_svc_req_t *sr, idm_svc_t *is);
115 static void idm_so_tgt_svc_destroy(idm_svc_t *is);
116 static idm_status_t idm_so_tgt_svc_online(idm_svc_t *is);
117 static void idm_so_tgt_svc_offline(idm_svc_t *is);
118 static void idm_so_tgt_conn_destroy(idm_conn_t *ic);
119 static idm_status_t idm_so_tgt_conn_connect(idm_conn_t *ic);
120 static void idm_so_conn_disconnect(idm_conn_t *ic);
121 static idm_status_t idm_so_ini_conn_create(idm_conn_req_t *cr, idm_conn_t *ic);
122 static void idm_so_ini_conn_destroy(idm_conn_t *ic);
123 static idm_status_t idm_so_ini_conn_connect(idm_conn_t *ic);
124 
125 /*
126  * IDM Native Sockets transport operations
127  */
128 static
129 idm_transport_ops_t idm_so_transport_ops = {
130 	idm_so_tx,			/* it_tx_pdu */
131 	idm_so_buf_tx_to_ini,		/* it_buf_tx_to_ini */
132 	idm_so_buf_rx_from_ini,		/* it_buf_rx_from_ini */
133 	idm_so_rx_datain,		/* it_rx_datain */
134 	idm_so_rx_rtt,			/* it_rx_rtt */
135 	idm_so_rx_dataout,		/* it_rx_dataout */
136 	NULL,				/* it_alloc_conn_rsrc */
137 	NULL,				/* it_free_conn_rsrc */
138 	NULL,				/* it_tgt_enable_datamover */
139 	NULL,				/* it_ini_enable_datamover */
140 	NULL,				/* it_conn_terminate */
141 	idm_so_free_task_rsrc,		/* it_free_task_rsrc */
142 	idm_so_negotiate_key_values,	/* it_negotiate_key_values */
143 	idm_so_notice_key_values,	/* it_notice_key_values */
144 	idm_so_conn_is_capable,		/* it_conn_is_capable */
145 	idm_so_buf_alloc,		/* it_buf_alloc */
146 	idm_so_buf_free,		/* it_buf_free */
147 	idm_so_buf_setup,		/* it_buf_setup */
148 	idm_so_buf_teardown,		/* it_buf_teardown */
149 	idm_so_tgt_svc_create,		/* it_tgt_svc_create */
150 	idm_so_tgt_svc_destroy,		/* it_tgt_svc_destroy */
151 	idm_so_tgt_svc_online,		/* it_tgt_svc_online */
152 	idm_so_tgt_svc_offline,		/* it_tgt_svc_offline */
153 	idm_so_tgt_conn_destroy,	/* it_tgt_conn_destroy */
154 	idm_so_tgt_conn_connect,	/* it_tgt_conn_connect */
155 	idm_so_conn_disconnect,		/* it_tgt_conn_disconnect */
156 	idm_so_ini_conn_create,		/* it_ini_conn_create */
157 	idm_so_ini_conn_destroy,	/* it_ini_conn_destroy */
158 	idm_so_ini_conn_connect,	/* it_ini_conn_connect */
159 	idm_so_conn_disconnect,		/* it_ini_conn_disconnect */
160 	idm_so_declare_key_values	/* it_declare_key_values */
161 };
162 
163 kmutex_t	idm_so_timed_socket_mutex;
164 /*
165  * idm_so_init()
166  * Sockets transport initialization
167  */
168 void
169 idm_so_init(idm_transport_t *it)
170 {
171 	/* Cache for IDM Data and R2T Transmit PDU's */
172 	idm.idm_sotx_pdu_cache = kmem_cache_create("idm_tx_pdu_cache",
173 	    sizeof (idm_pdu_t) + sizeof (iscsi_hdr_t), 8,
174 	    &idm_sotx_pdu_constructor, NULL, NULL, NULL, NULL, KM_SLEEP);
175 
176 	/* Cache for IDM Receive PDU's */
177 	idm.idm_sorx_pdu_cache = kmem_cache_create("idm_rx_pdu_cache",
178 	    sizeof (idm_pdu_t) + IDM_SORX_CACHE_HDRLEN, 8,
179 	    &idm_sorx_pdu_constructor, NULL, NULL, NULL, NULL, KM_SLEEP);
180 
181 	/* 128k buffer cache */
182 	idm.idm_so_128k_buf_cache = kmem_cache_create("idm_128k_buf_cache",
183 	    IDM_SO_BUF_CACHE_UB, 8, NULL, NULL, NULL, NULL, NULL, KM_SLEEP);
184 
185 	/* Set the sockets transport ops */
186 	it->it_ops = &idm_so_transport_ops;
187 
188 	mutex_init(&idm_so_timed_socket_mutex, NULL, MUTEX_DEFAULT, NULL);
189 
190 }
191 
192 /*
193  * idm_so_fini()
194  * Sockets transport teardown
195  */
196 void
197 idm_so_fini(void)
198 {
199 	kmem_cache_destroy(idm.idm_so_128k_buf_cache);
200 	kmem_cache_destroy(idm.idm_sotx_pdu_cache);
201 	kmem_cache_destroy(idm.idm_sorx_pdu_cache);
202 	mutex_destroy(&idm_so_timed_socket_mutex);
203 }
204 
205 ksocket_t
206 idm_socreate(int domain, int type, int protocol)
207 {
208 	ksocket_t ks;
209 
210 	if (!ksocket_socket(&ks, domain, type, protocol, KSOCKET_NOSLEEP,
211 	    CRED())) {
212 		return (ks);
213 	} else {
214 		return (NULL);
215 	}
216 }
217 
218 /*
219  * idm_soshutdown will disconnect the socket and prevent subsequent PDU
220  * reception and transmission.  The sonode still exists but its state
221  * gets modified to indicate it is no longer connected.  Calls to
222  * idm_sorecv/idm_iov_sorecv will return so idm_soshutdown can be used
223  * regain control of a thread stuck in idm_sorecv.
224  */
225 void
226 idm_soshutdown(ksocket_t so)
227 {
228 	(void) ksocket_shutdown(so, SHUT_RDWR, CRED());
229 }
230 
231 /*
232  * idm_sodestroy releases all resources associated with a socket previously
233  * created with idm_socreate.  The socket must be shutdown using
234  * idm_soshutdown before the socket is destroyed with idm_sodestroy,
235  * otherwise undefined behavior will result.
236  */
237 void
238 idm_sodestroy(ksocket_t ks)
239 {
240 	(void) ksocket_close(ks, CRED());
241 }
242 
243 /*
244  * Function to compare two addresses in sockaddr_storage format
245  */
246 
247 int
248 idm_ss_compare(const struct sockaddr_storage *cmp_ss1,
249     const struct sockaddr_storage *cmp_ss2,
250     boolean_t v4_mapped_as_v4,
251     boolean_t compare_ports)
252 {
253 	struct sockaddr_storage			mapped_v4_ss1, mapped_v4_ss2;
254 	const struct sockaddr_storage		*ss1, *ss2;
255 	struct in_addr				*in1, *in2;
256 	struct in6_addr				*in61, *in62;
257 	int i;
258 
259 	/*
260 	 * Normalize V4-mapped IPv6 addresses into V4 format if
261 	 * v4_mapped_as_v4 is B_TRUE.
262 	 */
263 	ss1 = cmp_ss1;
264 	ss2 = cmp_ss2;
265 	if (v4_mapped_as_v4 && (ss1->ss_family == AF_INET6)) {
266 		in61 = &((struct sockaddr_in6 *)ss1)->sin6_addr;
267 		if (IN6_IS_ADDR_V4MAPPED(in61)) {
268 			bzero(&mapped_v4_ss1, sizeof (mapped_v4_ss1));
269 			mapped_v4_ss1.ss_family = AF_INET;
270 			((struct sockaddr_in *)&mapped_v4_ss1)->sin_port =
271 			    ((struct sockaddr_in *)ss1)->sin_port;
272 			IN6_V4MAPPED_TO_INADDR(in61,
273 			    &((struct sockaddr_in *)&mapped_v4_ss1)->sin_addr);
274 			ss1 = &mapped_v4_ss1;
275 		}
276 	}
277 	ss2 = cmp_ss2;
278 	if (v4_mapped_as_v4 && (ss2->ss_family == AF_INET6)) {
279 		in62 = &((struct sockaddr_in6 *)ss2)->sin6_addr;
280 		if (IN6_IS_ADDR_V4MAPPED(in62)) {
281 			bzero(&mapped_v4_ss2, sizeof (mapped_v4_ss2));
282 			mapped_v4_ss2.ss_family = AF_INET;
283 			((struct sockaddr_in *)&mapped_v4_ss2)->sin_port =
284 			    ((struct sockaddr_in *)ss2)->sin_port;
285 			IN6_V4MAPPED_TO_INADDR(in62,
286 			    &((struct sockaddr_in *)&mapped_v4_ss2)->sin_addr);
287 			ss2 = &mapped_v4_ss2;
288 		}
289 	}
290 
291 	/*
292 	 * Compare ports, then address family, then ip address
293 	 */
294 	if (compare_ports &&
295 	    (((struct sockaddr_in *)ss1)->sin_port !=
296 	    ((struct sockaddr_in *)ss2)->sin_port)) {
297 		if (((struct sockaddr_in *)ss1)->sin_port >
298 		    ((struct sockaddr_in *)ss2)->sin_port)
299 			return (1);
300 		else
301 			return (-1);
302 	}
303 
304 	/*
305 	 * ports are the same
306 	 */
307 	if (ss1->ss_family != ss2->ss_family) {
308 		if (ss1->ss_family == AF_INET)
309 			return (1);
310 		else
311 			return (-1);
312 	}
313 
314 	/*
315 	 * address families are the same
316 	 */
317 	if (ss1->ss_family == AF_INET) {
318 		in1 = &((struct sockaddr_in *)ss1)->sin_addr;
319 		in2 = &((struct sockaddr_in *)ss2)->sin_addr;
320 
321 		if (in1->s_addr > in2->s_addr)
322 			return (1);
323 		else if (in1->s_addr < in2->s_addr)
324 			return (-1);
325 		else
326 			return (0);
327 	} else if (ss1->ss_family == AF_INET6) {
328 		in61 = &((struct sockaddr_in6 *)ss1)->sin6_addr;
329 		in62 = &((struct sockaddr_in6 *)ss2)->sin6_addr;
330 
331 		for (i = 0; i < 4; i++) {
332 			if (in61->s6_addr32[i] > in62->s6_addr32[i])
333 				return (1);
334 			else if (in61->s6_addr32[i] < in62->s6_addr32[i])
335 				return (-1);
336 		}
337 		return (0);
338 	}
339 
340 	return (1);
341 }
342 
343 /*
344  * IP address filter functions to flag addresses that should not
345  * go out to initiators through discovery.
346  */
347 static boolean_t
348 idm_v4_addr_okay(struct in_addr *in_addr)
349 {
350 	in_addr_t addr = ntohl(in_addr->s_addr);
351 
352 	if ((INADDR_NONE == addr) ||
353 	    (IN_MULTICAST(addr)) ||
354 	    ((addr >> IN_CLASSA_NSHIFT) == 0) ||
355 	    ((addr >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET)) {
356 		return (B_FALSE);
357 	}
358 	return (B_TRUE);
359 }
360 
361 static boolean_t
362 idm_v6_addr_okay(struct in6_addr *addr6)
363 {
364 
365 	if ((IN6_IS_ADDR_UNSPECIFIED(addr6)) ||
366 	    (IN6_IS_ADDR_LOOPBACK(addr6)) ||
367 	    (IN6_IS_ADDR_MULTICAST(addr6)) ||
368 	    (IN6_IS_ADDR_V4MAPPED(addr6)) ||
369 	    (IN6_IS_ADDR_V4COMPAT(addr6)) ||
370 	    (IN6_IS_ADDR_LINKLOCAL(addr6))) {
371 		return (B_FALSE);
372 	}
373 	return (B_TRUE);
374 }
375 
376 /*
377  * idm_get_ipaddr will retrieve a list of IP Addresses which the host is
378  * configured with by sending down a sequence of kernel ioctl to IP STREAMS.
379  */
380 int
381 idm_get_ipaddr(idm_addr_list_t **ipaddr_p)
382 {
383 	ksocket_t 		so4, so6;
384 	struct lifnum		lifn;
385 	struct lifconf		lifc;
386 	struct lifreq		*lp;
387 	int			rval;
388 	int			numifs;
389 	int			bufsize;
390 	void			*buf;
391 	int			i, j, n, rc;
392 	struct sockaddr_storage	ss;
393 	struct sockaddr_in	*sin;
394 	struct sockaddr_in6	*sin6;
395 	idm_addr_t		*ip;
396 	idm_addr_list_t		*ipaddr;
397 	int			size_ipaddr;
398 
399 	*ipaddr_p = NULL;
400 	size_ipaddr = 0;
401 	buf = NULL;
402 
403 	/* create an ipv4 and ipv6 UDP socket */
404 	if ((so6 = idm_socreate(PF_INET6, SOCK_DGRAM, 0)) == NULL)
405 		return (0);
406 	if ((so4 = idm_socreate(PF_INET, SOCK_DGRAM, 0)) == NULL) {
407 		idm_sodestroy(so6);
408 		return (0);
409 	}
410 
411 
412 retry_count:
413 	/* snapshot the current number of interfaces */
414 	lifn.lifn_family = PF_UNSPEC;
415 	lifn.lifn_flags = LIFC_NOXMIT | LIFC_TEMPORARY | LIFC_ALLZONES;
416 	lifn.lifn_count = 0;
417 	/* use vp6 for ioctls with unspecified families by default */
418 	if (ksocket_ioctl(so6, SIOCGLIFNUM, (intptr_t)&lifn, &rval, CRED())
419 	    != 0) {
420 		goto cleanup;
421 	}
422 
423 	numifs = lifn.lifn_count;
424 	if (numifs <= 0) {
425 		goto cleanup;
426 	}
427 
428 	/* allocate extra room in case more interfaces appear */
429 	numifs += 10;
430 
431 	/* get the interface names and ip addresses */
432 	bufsize = numifs * sizeof (struct lifreq);
433 	buf = kmem_alloc(bufsize, KM_SLEEP);
434 
435 	lifc.lifc_family = AF_UNSPEC;
436 	lifc.lifc_flags = LIFC_NOXMIT | LIFC_TEMPORARY | LIFC_ALLZONES;
437 	lifc.lifc_len = bufsize;
438 	lifc.lifc_buf = buf;
439 	rc = ksocket_ioctl(so6, SIOCGLIFCONF, (intptr_t)&lifc, &rval, CRED());
440 	if (rc != 0) {
441 		goto cleanup;
442 	}
443 	/* if our extra room is used up, try again */
444 	if (bufsize <= lifc.lifc_len) {
445 		kmem_free(buf, bufsize);
446 		buf = NULL;
447 		goto retry_count;
448 	}
449 	/* calc actual number of ifconfs */
450 	n = lifc.lifc_len / sizeof (struct lifreq);
451 
452 	/* get ip address */
453 	if (n > 0) {
454 		size_ipaddr = sizeof (idm_addr_list_t) +
455 		    (n - 1) * sizeof (idm_addr_t);
456 		ipaddr = kmem_zalloc(size_ipaddr, KM_SLEEP);
457 	} else {
458 		goto cleanup;
459 	}
460 
461 	/*
462 	 * Examine the array of interfaces and filter uninteresting ones
463 	 */
464 	for (i = 0, j = 0, lp = lifc.lifc_req; i < n; i++, lp++) {
465 
466 		/*
467 		 * Copy the address as the SIOCGLIFFLAGS ioctl is destructive
468 		 */
469 		ss = lp->lifr_addr;
470 		/*
471 		 * fetch the flags using the socket of the correct family
472 		 */
473 		switch (ss.ss_family) {
474 		case AF_INET:
475 			rc = ksocket_ioctl(so4, SIOCGLIFFLAGS, (intptr_t)lp,
476 			    &rval, CRED());
477 			break;
478 		case AF_INET6:
479 			rc = ksocket_ioctl(so6, SIOCGLIFFLAGS, (intptr_t)lp,
480 			    &rval, CRED());
481 			break;
482 		default:
483 			continue;
484 		}
485 		if (rc == 0) {
486 			/*
487 			 * If we got the flags, skip uninteresting
488 			 * interfaces based on flags
489 			 */
490 			if ((lp->lifr_flags & IFF_UP) != IFF_UP)
491 				continue;
492 			if (lp->lifr_flags &
493 			    (IFF_ANYCAST|IFF_NOLOCAL|IFF_DEPRECATED))
494 				continue;
495 		}
496 
497 		/* save ip address */
498 		ip = &ipaddr->al_addrs[j];
499 		switch (ss.ss_family) {
500 		case AF_INET:
501 			sin = (struct sockaddr_in *)&ss;
502 			if (!idm_v4_addr_okay(&sin->sin_addr))
503 				continue;
504 			ip->a_addr.i_addr.in4 = sin->sin_addr;
505 			ip->a_addr.i_insize = sizeof (struct in_addr);
506 			break;
507 		case AF_INET6:
508 			sin6 = (struct sockaddr_in6 *)&ss;
509 			if (!idm_v6_addr_okay(&sin6->sin6_addr))
510 				continue;
511 			ip->a_addr.i_addr.in6 = sin6->sin6_addr;
512 			ip->a_addr.i_insize = sizeof (struct in6_addr);
513 			break;
514 		default:
515 			continue;
516 		}
517 		j++;
518 	}
519 
520 	if (j == 0) {
521 		/* no valid ifaddr */
522 		kmem_free(ipaddr, size_ipaddr);
523 		size_ipaddr = 0;
524 		ipaddr = NULL;
525 	} else {
526 		ipaddr->al_out_cnt = j;
527 	}
528 
529 
530 cleanup:
531 	idm_sodestroy(so6);
532 	idm_sodestroy(so4);
533 
534 	if (buf != NULL)
535 		kmem_free(buf, bufsize);
536 
537 	*ipaddr_p = ipaddr;
538 	return (size_ipaddr);
539 }
540 
541 int
542 idm_sorecv(ksocket_t so, void *msg, size_t len)
543 {
544 	iovec_t iov;
545 
546 	ASSERT(so != NULL);
547 	ASSERT(len != 0);
548 
549 	/*
550 	 * Fill in iovec and receive data
551 	 */
552 	iov.iov_base = msg;
553 	iov.iov_len = len;
554 
555 	return (idm_iov_sorecv(so, &iov, 1, len));
556 }
557 
558 /*
559  * idm_sosendto - Sends a buffered data on a non-connected socket.
560  *
561  * This function puts the data provided on the wire by calling sosendmsg.
562  * It will return only when all the data has been sent or if an error
563  * occurs.
564  *
565  * Returns 0 for success, the socket errno value if sosendmsg fails, and
566  * -1 if sosendmsg returns success but uio_resid != 0
567  */
568 int
569 idm_sosendto(ksocket_t so, void *buff, size_t len,
570     struct sockaddr *name, socklen_t namelen)
571 {
572 	struct msghdr		msg;
573 	struct iovec		iov[1];
574 	int			error;
575 	size_t			sent = 0;
576 
577 	iov[0].iov_base	= buff;
578 	iov[0].iov_len	= len;
579 
580 	/* Initialization of the message header. */
581 	bzero(&msg, sizeof (msg));
582 	msg.msg_iov	= iov;
583 	msg.msg_iovlen	= 1;
584 	msg.msg_name	= name;
585 	msg.msg_namelen	= namelen;
586 
587 	if ((error = ksocket_sendmsg(so, &msg, 0, &sent, CRED())) == 0) {
588 		/* Data sent */
589 		if (sent == len) {
590 			/* All data sent.  Success. */
591 			return (0);
592 		} else {
593 			/* Not all data was sent.  Failure */
594 			return (-1);
595 		}
596 	}
597 
598 	/* Send failed */
599 	return (error);
600 }
601 
602 /*
603  * idm_iov_sosend - Sends an iovec on a connection.
604  *
605  * This function puts the data provided on the wire by calling sosendmsg.
606  * It will return only when all the data has been sent or if an error
607  * occurs.
608  *
609  * Returns 0 for success, the socket errno value if sosendmsg fails, and
610  * -1 if sosendmsg returns success but uio_resid != 0
611  */
612 int
613 idm_iov_sosend(ksocket_t so, iovec_t *iop, int iovlen, size_t total_len)
614 {
615 	struct msghdr		msg;
616 	int			error;
617 	size_t 			sent = 0;
618 
619 	ASSERT(iop != NULL);
620 
621 	/* Initialization of the message header. */
622 	bzero(&msg, sizeof (msg));
623 	msg.msg_iov	= iop;
624 	msg.msg_iovlen	= iovlen;
625 
626 	if ((error = ksocket_sendmsg(so, &msg, 0, &sent, CRED()))
627 	    == 0) {
628 		/* Data sent */
629 		if (sent == total_len) {
630 			/* All data sent.  Success. */
631 			return (0);
632 		} else {
633 			/* Not all data was sent.  Failure */
634 			return (-1);
635 		}
636 	}
637 
638 	/* Send failed */
639 	return (error);
640 }
641 
642 /*
643  * idm_iov_sorecv - Receives an iovec from a connection
644  *
645  * This function gets the data asked for from the socket.  It will return
646  * only when all the requested data has been retrieved or if an error
647  * occurs.
648  *
649  * Returns 0 for success, the socket errno value if sorecvmsg fails, and
650  * -1 if sorecvmsg returns success but uio_resid != 0
651  */
652 int
653 idm_iov_sorecv(ksocket_t so, iovec_t *iop, int iovlen, size_t total_len)
654 {
655 	struct msghdr		msg;
656 	int			error;
657 	size_t			recv;
658 	int 			flags;
659 
660 	ASSERT(iop != NULL);
661 
662 	/* Initialization of the message header. */
663 	bzero(&msg, sizeof (msg));
664 	msg.msg_iov	= iop;
665 	msg.msg_iovlen	= iovlen;
666 	flags		= MSG_WAITALL;
667 
668 	if ((error = ksocket_recvmsg(so, &msg, flags, &recv, CRED()))
669 	    == 0) {
670 		/* Received data */
671 		if (recv == total_len) {
672 			/* All requested data received.  Success */
673 			return (0);
674 		} else {
675 			/*
676 			 * Not all data was received.  The connection has
677 			 * probably failed.
678 			 */
679 			return (-1);
680 		}
681 	}
682 
683 	/* Receive failed */
684 	return (error);
685 }
686 
687 static void
688 idm_set_ini_preconnect_options(idm_so_conn_t *sc)
689 {
690 	int	conn_abort = 10000;
691 	int	conn_notify = 2000;
692 	int	abort = 30000;
693 
694 	/* Pre-connect socket options */
695 	(void) ksocket_setsockopt(sc->ic_so, IPPROTO_TCP,
696 	    TCP_CONN_NOTIFY_THRESHOLD, (char *)&conn_notify, sizeof (int),
697 	    CRED());
698 	(void) ksocket_setsockopt(sc->ic_so, IPPROTO_TCP,
699 	    TCP_CONN_ABORT_THRESHOLD, (char *)&conn_abort, sizeof (int),
700 	    CRED());
701 	(void) ksocket_setsockopt(sc->ic_so, IPPROTO_TCP, TCP_ABORT_THRESHOLD,
702 	    (char *)&abort, sizeof (int), CRED());
703 }
704 
705 static void
706 idm_set_ini_postconnect_options(idm_so_conn_t *sc)
707 {
708 	int32_t		rcvbuf = IDM_RCVBUF_SIZE;
709 	int32_t		sndbuf = IDM_SNDBUF_SIZE;
710 	const int	on = 1;
711 
712 	/* Set postconnect options */
713 	(void) ksocket_setsockopt(sc->ic_so, IPPROTO_TCP, TCP_NODELAY,
714 	    (char *)&on, sizeof (int), CRED());
715 	(void) ksocket_setsockopt(sc->ic_so, SOL_SOCKET, SO_RCVBUF,
716 	    (char *)&rcvbuf, sizeof (int), CRED());
717 	(void) ksocket_setsockopt(sc->ic_so, SOL_SOCKET, SO_SNDBUF,
718 	    (char *)&sndbuf, sizeof (int), CRED());
719 }
720 
721 static void
722 idm_set_tgt_connect_options(ksocket_t ks)
723 {
724 	int32_t		rcvbuf = IDM_RCVBUF_SIZE;
725 	int32_t		sndbuf = IDM_SNDBUF_SIZE;
726 	const int	on = 1;
727 
728 	/* Set connect options */
729 	(void) ksocket_setsockopt(ks, SOL_SOCKET, SO_RCVBUF,
730 	    (char *)&rcvbuf, sizeof (int), CRED());
731 	(void) ksocket_setsockopt(ks, SOL_SOCKET, SO_SNDBUF,
732 	    (char *)&sndbuf, sizeof (int), CRED());
733 	(void) ksocket_setsockopt(ks, IPPROTO_TCP, TCP_NODELAY,
734 	    (char *)&on, sizeof (on), CRED());
735 }
736 
737 static uint32_t
738 n2h24(const uchar_t *ptr)
739 {
740 	return ((ptr[0] << 16) | (ptr[1] << 8) | ptr[2]);
741 }
742 
743 
744 static idm_status_t
745 idm_sorecvhdr(idm_conn_t *ic, idm_pdu_t *pdu)
746 {
747 	iscsi_hdr_t	*bhs;
748 	uint32_t	hdr_digest_crc;
749 	uint32_t	crc_calculated;
750 	void		*new_hdr;
751 	int		ahslen = 0;
752 	int		total_len = 0;
753 	int		iovlen = 0;
754 	struct iovec	iov[2];
755 	idm_so_conn_t	*so_conn;
756 	int		rc;
757 
758 	so_conn = ic->ic_transport_private;
759 
760 	/*
761 	 * Read BHS
762 	 */
763 	bhs = pdu->isp_hdr;
764 	rc = idm_sorecv(so_conn->ic_so, pdu->isp_hdr, sizeof (iscsi_hdr_t));
765 	if (rc != IDM_STATUS_SUCCESS) {
766 		return (IDM_STATUS_FAIL);
767 	}
768 
769 	/*
770 	 * Check actual AHS length against the amount available in the buffer
771 	 */
772 	pdu->isp_hdrlen = sizeof (iscsi_hdr_t) +
773 	    (bhs->hlength * sizeof (uint32_t));
774 	pdu->isp_datalen = n2h24(bhs->dlength);
775 	if (ic->ic_conn_type == CONN_TYPE_TGT &&
776 	    pdu->isp_datalen > ic->ic_conn_params.max_recv_dataseglen) {
777 		IDM_CONN_LOG(CE_WARN,
778 		    "idm_sorecvhdr: exceeded the max data segment length");
779 		return (IDM_STATUS_FAIL);
780 	}
781 	if (bhs->hlength > IDM_SORX_CACHE_AHSLEN) {
782 		/* Allocate a new header segment and change the callback */
783 		new_hdr = kmem_alloc(pdu->isp_hdrlen, KM_SLEEP);
784 		bcopy(pdu->isp_hdr, new_hdr, sizeof (iscsi_hdr_t));
785 		pdu->isp_hdr = new_hdr;
786 		pdu->isp_flags |= IDM_PDU_ADDL_HDR;
787 
788 		/*
789 		 * This callback will restore the expected values after
790 		 * the RX PDU has been processed.
791 		 */
792 		pdu->isp_callback = idm_sorx_addl_pdu_cb;
793 	}
794 
795 	/*
796 	 * Setup receipt of additional header and header digest (if enabled).
797 	 */
798 	if (bhs->hlength > 0) {
799 		iov[iovlen].iov_base = (caddr_t)(pdu->isp_hdr + 1);
800 		ahslen = pdu->isp_hdrlen - sizeof (iscsi_hdr_t);
801 		iov[iovlen].iov_len = ahslen;
802 		total_len += iov[iovlen].iov_len;
803 		iovlen++;
804 	}
805 
806 	if (ic->ic_conn_flags & IDM_CONN_HEADER_DIGEST) {
807 		iov[iovlen].iov_base = (caddr_t)&hdr_digest_crc;
808 		iov[iovlen].iov_len = sizeof (hdr_digest_crc);
809 		total_len += iov[iovlen].iov_len;
810 		iovlen++;
811 	}
812 
813 	if ((iovlen != 0) &&
814 	    (idm_iov_sorecv(so_conn->ic_so, &iov[0], iovlen,
815 	    total_len) != 0)) {
816 		return (IDM_STATUS_FAIL);
817 	}
818 
819 	/*
820 	 * Validate header digest if enabled
821 	 */
822 	if (ic->ic_conn_flags & IDM_CONN_HEADER_DIGEST) {
823 		crc_calculated = idm_crc32c(pdu->isp_hdr,
824 		    sizeof (iscsi_hdr_t) + ahslen);
825 		if (crc_calculated != hdr_digest_crc) {
826 			/* Invalid Header Digest */
827 			return (IDM_STATUS_HEADER_DIGEST);
828 		}
829 	}
830 
831 	return (0);
832 }
833 
834 /*
835  * idm_so_ini_conn_create()
836  * Allocate the sockets transport connection resources.
837  */
838 static idm_status_t
839 idm_so_ini_conn_create(idm_conn_req_t *cr, idm_conn_t *ic)
840 {
841 	ksocket_t	so;
842 	idm_so_conn_t	*so_conn;
843 	idm_status_t	idmrc;
844 
845 	so = idm_socreate(cr->cr_domain, cr->cr_type,
846 	    cr->cr_protocol);
847 	if (so == NULL) {
848 		return (IDM_STATUS_FAIL);
849 	}
850 
851 	/* Bind the socket if configured to do so */
852 	if (cr->cr_bound) {
853 		if (ksocket_bind(so, &cr->cr_bound_addr.sin,
854 		    SIZEOF_SOCKADDR(&cr->cr_bound_addr.sin), CRED()) != 0) {
855 			idm_sodestroy(so);
856 			return (IDM_STATUS_FAIL);
857 		}
858 	}
859 
860 	idmrc = idm_so_conn_create_common(ic, so);
861 	if (idmrc != IDM_STATUS_SUCCESS) {
862 		idm_soshutdown(so);
863 		idm_sodestroy(so);
864 		return (IDM_STATUS_FAIL);
865 	}
866 
867 	so_conn = ic->ic_transport_private;
868 	/* Set up socket options */
869 	idm_set_ini_preconnect_options(so_conn);
870 
871 	return (IDM_STATUS_SUCCESS);
872 }
873 
874 /*
875  * idm_so_ini_conn_destroy()
876  * Tear down the sockets transport connection resources.
877  */
878 static void
879 idm_so_ini_conn_destroy(idm_conn_t *ic)
880 {
881 	idm_so_conn_destroy_common(ic);
882 }
883 
884 /*
885  * idm_so_ini_conn_connect()
886  * Establish the connection referred to by the handle previously allocated via
887  * idm_so_ini_conn_create().
888  */
889 static idm_status_t
890 idm_so_ini_conn_connect(idm_conn_t *ic)
891 {
892 	idm_so_conn_t	*so_conn;
893 	struct sonode	*node = NULL;
894 	int 		rc;
895 	clock_t		lbolt, conn_login_max, conn_login_interval;
896 	boolean_t	nonblock;
897 
898 	so_conn = ic->ic_transport_private;
899 	nonblock = ic->ic_conn_params.nonblock_socket;
900 	conn_login_max = ic->ic_conn_params.conn_login_max;
901 	conn_login_interval = ddi_get_lbolt() +
902 	    SEC_TO_TICK(ic->ic_conn_params.conn_login_interval);
903 
904 	if (nonblock == B_TRUE) {
905 		node = ((struct sonode *)(so_conn->ic_so));
906 		/* Set to none block socket mode */
907 		idm_so_socket_set_nonblock(node);
908 		do {
909 			rc = ksocket_connect(so_conn->ic_so,
910 			    &ic->ic_ini_dst_addr.sin,
911 			    (SIZEOF_SOCKADDR(&ic->ic_ini_dst_addr.sin)),
912 			    CRED());
913 			if (rc == 0 || rc == EISCONN) {
914 				/* socket success or already success */
915 				rc = IDM_STATUS_SUCCESS;
916 				break;
917 			}
918 			if ((rc == ETIMEDOUT) || (rc == ECONNREFUSED) ||
919 			    (rc == ECONNRESET)) {
920 				/* socket connection timeout or refuse */
921 				break;
922 			}
923 			lbolt = ddi_get_lbolt();
924 			if (lbolt > conn_login_max) {
925 				/*
926 				 * Connection retry timeout,
927 				 * failed connect to target.
928 				 */
929 				break;
930 			}
931 			if (lbolt < conn_login_interval) {
932 				if ((rc == EINPROGRESS) || (rc == EALREADY)) {
933 					/* TCP connect still in progress */
934 					delay(SEC_TO_TICK(IN_PROGRESS_DELAY));
935 					continue;
936 				} else {
937 					delay(conn_login_interval - lbolt);
938 				}
939 			}
940 			conn_login_interval = ddi_get_lbolt() +
941 			    SEC_TO_TICK(ic->ic_conn_params.conn_login_interval);
942 		} while (rc != 0);
943 		/* resume to nonblock mode */
944 		if (rc == IDM_STATUS_SUCCESS) {
945 			idm_so_socket_set_block(node);
946 		}
947 	} else {
948 		rc = ksocket_connect(so_conn->ic_so, &ic->ic_ini_dst_addr.sin,
949 		    (SIZEOF_SOCKADDR(&ic->ic_ini_dst_addr.sin)), CRED());
950 	}
951 
952 	if (rc != 0) {
953 		idm_soshutdown(so_conn->ic_so);
954 		return (IDM_STATUS_FAIL);
955 	}
956 
957 	idm_so_conn_connect_common(ic);
958 
959 	idm_set_ini_postconnect_options(so_conn);
960 
961 	return (IDM_STATUS_SUCCESS);
962 }
963 
964 idm_status_t
965 idm_so_tgt_conn_create(idm_conn_t *ic, ksocket_t new_so)
966 {
967 	idm_status_t	idmrc;
968 
969 	idmrc = idm_so_conn_create_common(ic, new_so);
970 
971 	return (idmrc);
972 }
973 
974 static void
975 idm_so_tgt_conn_destroy(idm_conn_t *ic)
976 {
977 	idm_so_conn_destroy_common(ic);
978 }
979 
980 /*
981  * idm_so_tgt_conn_connect()
982  * Establish the connection in ic, passed from idm_tgt_conn_finish(), which
983  * is invoked from the SM as a result of an inbound connection request.
984  */
985 static idm_status_t
986 idm_so_tgt_conn_connect(idm_conn_t *ic)
987 {
988 	idm_so_conn_connect_common(ic);
989 
990 	return (IDM_STATUS_SUCCESS);
991 }
992 
993 static idm_status_t
994 idm_so_conn_create_common(idm_conn_t *ic, ksocket_t new_so)
995 {
996 	idm_so_conn_t	*so_conn;
997 
998 	so_conn = kmem_zalloc(sizeof (idm_so_conn_t), KM_SLEEP);
999 	so_conn->ic_so = new_so;
1000 
1001 	ic->ic_transport_private = so_conn;
1002 	ic->ic_transport_hdrlen = 0;
1003 
1004 	/* Set the scoreboarding flag on this connection */
1005 	ic->ic_conn_flags |= IDM_CONN_USE_SCOREBOARD;
1006 	ic->ic_conn_params.max_recv_dataseglen =
1007 	    ISCSI_DEFAULT_MAX_RECV_SEG_LEN;
1008 	ic->ic_conn_params.max_xmit_dataseglen =
1009 	    ISCSI_DEFAULT_MAX_XMIT_SEG_LEN;
1010 
1011 	/*
1012 	 * Initialize tx thread mutex and list
1013 	 */
1014 	mutex_init(&so_conn->ic_tx_mutex, NULL, MUTEX_DEFAULT, NULL);
1015 	cv_init(&so_conn->ic_tx_cv, NULL, CV_DEFAULT, NULL);
1016 	list_create(&so_conn->ic_tx_list, sizeof (idm_pdu_t),
1017 	    offsetof(idm_pdu_t, idm_tx_link));
1018 
1019 	return (IDM_STATUS_SUCCESS);
1020 }
1021 
1022 static void
1023 idm_so_conn_destroy_common(idm_conn_t *ic)
1024 {
1025 	idm_so_conn_t	*so_conn = ic->ic_transport_private;
1026 
1027 	ic->ic_transport_private = NULL;
1028 	idm_sodestroy(so_conn->ic_so);
1029 	list_destroy(&so_conn->ic_tx_list);
1030 	mutex_destroy(&so_conn->ic_tx_mutex);
1031 	cv_destroy(&so_conn->ic_tx_cv);
1032 
1033 	kmem_free(so_conn, sizeof (idm_so_conn_t));
1034 }
1035 
1036 static void
1037 idm_so_conn_connect_common(idm_conn_t *ic)
1038 {
1039 	idm_so_conn_t	*so_conn;
1040 	struct sockaddr_in6	t_addr;
1041 	socklen_t	t_addrlen = 0;
1042 
1043 	so_conn = ic->ic_transport_private;
1044 	bzero(&t_addr, sizeof (struct sockaddr_in6));
1045 	t_addrlen = sizeof (struct sockaddr_in6);
1046 
1047 	/* Set the local and remote addresses in the idm conn handle */
1048 	ksocket_getsockname(so_conn->ic_so, (struct sockaddr *)&t_addr,
1049 	    &t_addrlen, CRED());
1050 	bcopy(&t_addr, &ic->ic_laddr, t_addrlen);
1051 	ksocket_getpeername(so_conn->ic_so, (struct sockaddr *)&t_addr,
1052 	    &t_addrlen, CRED());
1053 	bcopy(&t_addr, &ic->ic_raddr, t_addrlen);
1054 
1055 	mutex_enter(&ic->ic_mutex);
1056 	so_conn->ic_tx_thread = thread_create(NULL, 0, idm_sotx_thread, ic, 0,
1057 	    &p0, TS_RUN, minclsyspri);
1058 	so_conn->ic_rx_thread = thread_create(NULL, 0, idm_sorx_thread, ic, 0,
1059 	    &p0, TS_RUN, minclsyspri);
1060 
1061 	while (!so_conn->ic_rx_thread_running || !so_conn->ic_tx_thread_running)
1062 		cv_wait(&ic->ic_cv, &ic->ic_mutex);
1063 	mutex_exit(&ic->ic_mutex);
1064 }
1065 
1066 /*
1067  * idm_so_conn_disconnect()
1068  * Shutdown the socket connection and stop the thread
1069  */
1070 static void
1071 idm_so_conn_disconnect(idm_conn_t *ic)
1072 {
1073 	idm_so_conn_t	*so_conn;
1074 
1075 	so_conn = ic->ic_transport_private;
1076 
1077 	mutex_enter(&ic->ic_mutex);
1078 	so_conn->ic_rx_thread_running = B_FALSE;
1079 	so_conn->ic_tx_thread_running = B_FALSE;
1080 	/* We need to wakeup the TX thread */
1081 	mutex_enter(&so_conn->ic_tx_mutex);
1082 	cv_signal(&so_conn->ic_tx_cv);
1083 	mutex_exit(&so_conn->ic_tx_mutex);
1084 	mutex_exit(&ic->ic_mutex);
1085 
1086 	/* This should wakeup the RX thread if it is sleeping */
1087 	idm_soshutdown(so_conn->ic_so);
1088 
1089 	thread_join(so_conn->ic_tx_thread_did);
1090 	thread_join(so_conn->ic_rx_thread_did);
1091 }
1092 
1093 /*
1094  * idm_so_tgt_svc_create()
1095  * Establish a service on an IP address and port.  idm_svc_req_t contains
1096  * the service parameters.
1097  */
1098 /*ARGSUSED*/
1099 static idm_status_t
1100 idm_so_tgt_svc_create(idm_svc_req_t *sr, idm_svc_t *is)
1101 {
1102 	idm_so_svc_t		*so_svc;
1103 
1104 	so_svc = kmem_zalloc(sizeof (idm_so_svc_t), KM_SLEEP);
1105 
1106 	/* Set the new sockets service in svc handle */
1107 	is->is_so_svc = (void *)so_svc;
1108 
1109 	return (IDM_STATUS_SUCCESS);
1110 }
1111 
1112 /*
1113  * idm_so_tgt_svc_destroy()
1114  * Teardown sockets resources allocated in idm_so_tgt_svc_create()
1115  */
1116 static void
1117 idm_so_tgt_svc_destroy(idm_svc_t *is)
1118 {
1119 	/* the socket will have been torn down; free the service */
1120 	kmem_free(is->is_so_svc, sizeof (idm_so_svc_t));
1121 }
1122 
1123 /*
1124  * idm_so_tgt_svc_online()
1125  * Launch a watch thread on the svc allocated in idm_so_tgt_svc_create()
1126  */
1127 
1128 static idm_status_t
1129 idm_so_tgt_svc_online(idm_svc_t *is)
1130 {
1131 	idm_so_svc_t		*so_svc;
1132 	idm_svc_req_t		*sr = &is->is_svc_req;
1133 	struct sockaddr_in6	sin6_ip;
1134 	const uint32_t		on = 1;
1135 	const uint32_t		off = 0;
1136 
1137 	mutex_enter(&is->is_mutex);
1138 	so_svc = (idm_so_svc_t *)is->is_so_svc;
1139 
1140 	/*
1141 	 * Try creating an IPv6 socket first
1142 	 */
1143 	if ((so_svc->is_so = idm_socreate(PF_INET6, SOCK_STREAM, 0)) == NULL) {
1144 		mutex_exit(&is->is_mutex);
1145 		return (IDM_STATUS_FAIL);
1146 	} else {
1147 		bzero(&sin6_ip, sizeof (sin6_ip));
1148 		sin6_ip.sin6_family = AF_INET6;
1149 		sin6_ip.sin6_port = htons(sr->sr_port);
1150 		sin6_ip.sin6_addr = in6addr_any;
1151 
1152 		(void) ksocket_setsockopt(so_svc->is_so, SOL_SOCKET,
1153 		    SO_REUSEADDR, (char *)&on, sizeof (on), CRED());
1154 		/*
1155 		 * Turn off SO_MAC_EXEMPT so future sobinds succeed
1156 		 */
1157 		(void) ksocket_setsockopt(so_svc->is_so, SOL_SOCKET,
1158 		    SO_MAC_EXEMPT, (char *)&off, sizeof (off), CRED());
1159 
1160 		if (ksocket_bind(so_svc->is_so, (struct sockaddr *)&sin6_ip,
1161 		    sizeof (sin6_ip), CRED()) != 0) {
1162 			mutex_exit(&is->is_mutex);
1163 			idm_sodestroy(so_svc->is_so);
1164 			return (IDM_STATUS_FAIL);
1165 		}
1166 	}
1167 
1168 	idm_set_tgt_connect_options(so_svc->is_so);
1169 
1170 	if (ksocket_listen(so_svc->is_so, 5, CRED()) != 0) {
1171 		mutex_exit(&is->is_mutex);
1172 		idm_soshutdown(so_svc->is_so);
1173 		idm_sodestroy(so_svc->is_so);
1174 		return (IDM_STATUS_FAIL);
1175 	}
1176 
1177 	/* Launch a watch thread */
1178 	so_svc->is_thread = thread_create(NULL, 0, idm_so_svc_port_watcher,
1179 	    is, 0, &p0, TS_RUN, minclsyspri);
1180 
1181 	if (so_svc->is_thread == NULL) {
1182 		/* Failure to launch; teardown the socket */
1183 		mutex_exit(&is->is_mutex);
1184 		idm_soshutdown(so_svc->is_so);
1185 		idm_sodestroy(so_svc->is_so);
1186 		return (IDM_STATUS_FAIL);
1187 	}
1188 	ksocket_hold(so_svc->is_so);
1189 	/* Wait for the port watcher thread to start */
1190 	while (!so_svc->is_thread_running)
1191 		cv_wait(&is->is_cv, &is->is_mutex);
1192 	mutex_exit(&is->is_mutex);
1193 
1194 	return (IDM_STATUS_SUCCESS);
1195 }
1196 
1197 /*
1198  * idm_so_tgt_svc_offline
1199  *
1200  * Stop listening on the IP address and port identified by idm_svc_t.
1201  */
1202 static void
1203 idm_so_tgt_svc_offline(idm_svc_t *is)
1204 {
1205 	idm_so_svc_t		*so_svc;
1206 	mutex_enter(&is->is_mutex);
1207 	so_svc = (idm_so_svc_t *)is->is_so_svc;
1208 	so_svc->is_thread_running = B_FALSE;
1209 	mutex_exit(&is->is_mutex);
1210 
1211 	/*
1212 	 * Teardown socket
1213 	 */
1214 	idm_sodestroy(so_svc->is_so);
1215 
1216 	/*
1217 	 * Now we expect the port watcher thread to terminate
1218 	 */
1219 	thread_join(so_svc->is_thread_did);
1220 }
1221 
1222 /*
1223  * Watch thread for target service connection establishment.
1224  */
1225 void
1226 idm_so_svc_port_watcher(void *arg)
1227 {
1228 	idm_svc_t		*svc = arg;
1229 	ksocket_t		new_so;
1230 	idm_conn_t		*ic;
1231 	idm_status_t		idmrc;
1232 	idm_so_svc_t		*so_svc;
1233 	int			rc;
1234 	const uint32_t		off = 0;
1235 	struct sockaddr_in6 	t_addr;
1236 	socklen_t		t_addrlen;
1237 
1238 	bzero(&t_addr, sizeof (struct sockaddr_in6));
1239 	t_addrlen = sizeof (struct sockaddr_in6);
1240 	mutex_enter(&svc->is_mutex);
1241 
1242 	so_svc = svc->is_so_svc;
1243 	so_svc->is_thread_running = B_TRUE;
1244 	so_svc->is_thread_did = so_svc->is_thread->t_did;
1245 
1246 	cv_signal(&svc->is_cv);
1247 
1248 	IDM_SVC_LOG(CE_NOTE, "iSCSI service (%p/%d) online", (void *)svc,
1249 	    svc->is_svc_req.sr_port);
1250 
1251 	while (so_svc->is_thread_running) {
1252 		mutex_exit(&svc->is_mutex);
1253 
1254 		if ((rc = ksocket_accept(so_svc->is_so,
1255 		    (struct sockaddr *)&t_addr, &t_addrlen,
1256 		    &new_so, CRED())) != 0) {
1257 			mutex_enter(&svc->is_mutex);
1258 			if (rc == ECONNABORTED)
1259 				continue;
1260 			/* Connection problem */
1261 			break;
1262 		}
1263 		/*
1264 		 * Turn off SO_MAC_EXEMPT so future sobinds succeed
1265 		 */
1266 		(void) ksocket_setsockopt(new_so, SOL_SOCKET, SO_MAC_EXEMPT,
1267 		    (char *)&off, sizeof (off), CRED());
1268 
1269 		idmrc = idm_svc_conn_create(svc, IDM_TRANSPORT_TYPE_SOCKETS,
1270 		    &ic);
1271 		if (idmrc != IDM_STATUS_SUCCESS) {
1272 			/* Drop connection */
1273 			idm_soshutdown(new_so);
1274 			idm_sodestroy(new_so);
1275 			mutex_enter(&svc->is_mutex);
1276 			continue;
1277 		}
1278 
1279 		idmrc = idm_so_tgt_conn_create(ic, new_so);
1280 		if (idmrc != IDM_STATUS_SUCCESS) {
1281 			idm_svc_conn_destroy(ic);
1282 			idm_soshutdown(new_so);
1283 			idm_sodestroy(new_so);
1284 			mutex_enter(&svc->is_mutex);
1285 			continue;
1286 		}
1287 
1288 		/*
1289 		 * Kick the state machine.  At CS_S3_XPT_UP the state machine
1290 		 * will notify the client (target) about the new connection.
1291 		 */
1292 		idm_conn_event(ic, CE_CONNECT_ACCEPT, NULL);
1293 
1294 		mutex_enter(&svc->is_mutex);
1295 	}
1296 	ksocket_rele(so_svc->is_so);
1297 	so_svc->is_thread_running = B_FALSE;
1298 	mutex_exit(&svc->is_mutex);
1299 
1300 	IDM_SVC_LOG(CE_NOTE, "iSCSI service (%p/%d) offline", (void *)svc,
1301 	    svc->is_svc_req.sr_port);
1302 
1303 	thread_exit();
1304 }
1305 
1306 /*
1307  * idm_so_free_task_rsrc() stops any ongoing processing of the task and
1308  * frees resources associated with the task.
1309  *
1310  * It's not clear that this should return idm_status_t.  What do we do
1311  * if it fails?
1312  */
1313 static idm_status_t
1314 idm_so_free_task_rsrc(idm_task_t *idt)
1315 {
1316 	idm_buf_t	*idb;
1317 
1318 	/*
1319 	 * There is nothing to cleanup on initiator connections
1320 	 */
1321 	if (IDM_CONN_ISINI(idt->idt_ic))
1322 		return (IDM_STATUS_SUCCESS);
1323 
1324 	/*
1325 	 * If this is a target connection, call idm_buf_rx_from_ini_done for
1326 	 * any buffer on the "outbufv" list with idb->idb_in_transport==B_TRUE.
1327 	 *
1328 	 * In addition, remove any buffers associated with this task from
1329 	 * the ic_tx_list.  We'll do this by walking the idt_inbufv list, but
1330 	 * items don't actually get removed from that list (and completion
1331 	 * routines called) until idm_task_cleanup.
1332 	 */
1333 	mutex_enter(&idt->idt_mutex);
1334 
1335 	for (idb = list_head(&idt->idt_outbufv); idb != NULL;
1336 	    idb = list_next(&idt->idt_outbufv, idb)) {
1337 		if (idb->idb_in_transport) {
1338 			/*
1339 			 * idm_buf_rx_from_ini_done releases idt->idt_mutex
1340 			 */
1341 			DTRACE_ISCSI_8(xfer__done, idm_conn_t *, idt->idt_ic,
1342 			    uintptr_t, idb->idb_buf,
1343 			    uint32_t, idb->idb_bufoffset,
1344 			    uint64_t, 0, uint32_t, 0, uint32_t, 0,
1345 			    uint32_t, idb->idb_xfer_len,
1346 			    int, XFER_BUF_RX_FROM_INI);
1347 			idm_buf_rx_from_ini_done(idt, idb, IDM_STATUS_ABORTED);
1348 			mutex_enter(&idt->idt_mutex);
1349 		}
1350 	}
1351 
1352 	for (idb = list_head(&idt->idt_inbufv); idb != NULL;
1353 	    idb = list_next(&idt->idt_inbufv, idb)) {
1354 		/*
1355 		 * We want to remove these items from the tx_list as well,
1356 		 * but knowing it's in the idt_inbufv list is not a guarantee
1357 		 * that it's in the tx_list.  If it's on the tx list then
1358 		 * let idm_sotx_thread() clean it up.
1359 		 */
1360 		if (idb->idb_in_transport && !idb->idb_tx_thread) {
1361 			/*
1362 			 * idm_buf_tx_to_ini_done releases idt->idt_mutex
1363 			 */
1364 			DTRACE_ISCSI_8(xfer__done, idm_conn_t *, idt->idt_ic,
1365 			    uintptr_t, idb->idb_buf,
1366 			    uint32_t, idb->idb_bufoffset,
1367 			    uint64_t, 0, uint32_t, 0, uint32_t, 0,
1368 			    uint32_t, idb->idb_xfer_len,
1369 			    int, XFER_BUF_TX_TO_INI);
1370 			idm_buf_tx_to_ini_done(idt, idb, IDM_STATUS_ABORTED);
1371 			mutex_enter(&idt->idt_mutex);
1372 		}
1373 	}
1374 
1375 	mutex_exit(&idt->idt_mutex);
1376 
1377 	return (IDM_STATUS_SUCCESS);
1378 }
1379 
1380 /*
1381  * idm_so_negotiate_key_values() validates the key values for this connection
1382  */
1383 /* ARGSUSED */
1384 static kv_status_t
1385 idm_so_negotiate_key_values(idm_conn_t *it, nvlist_t *request_nvl,
1386     nvlist_t *response_nvl, nvlist_t *negotiated_nvl)
1387 {
1388 	/* All parameters are negotiated at the iscsit level */
1389 	return (KV_HANDLED);
1390 }
1391 
1392 /*
1393  * idm_so_notice_key_values() activates the negotiated key values for
1394  * this connection.
1395  */
1396 static void
1397 idm_so_notice_key_values(idm_conn_t *it, nvlist_t *negotiated_nvl)
1398 {
1399 	char			*nvp_name;
1400 	nvpair_t		*nvp;
1401 	nvpair_t		*next_nvp;
1402 	int			nvrc;
1403 	idm_status_t		idm_status;
1404 	const idm_kv_xlate_t	*ikvx;
1405 	uint64_t		num_val;
1406 
1407 	for (nvp = nvlist_next_nvpair(negotiated_nvl, NULL);
1408 	    nvp != NULL; nvp = next_nvp) {
1409 		next_nvp = nvlist_next_nvpair(negotiated_nvl, nvp);
1410 		nvp_name = nvpair_name(nvp);
1411 
1412 		ikvx = idm_lookup_kv_xlate(nvp_name, strlen(nvp_name));
1413 		switch (ikvx->ik_key_id) {
1414 		case KI_HEADER_DIGEST:
1415 		case KI_DATA_DIGEST:
1416 			idm_status = idm_so_handle_digest(it, nvp, ikvx);
1417 			ASSERT(idm_status == 0);
1418 
1419 			/* Remove processed item from negotiated_nvl list */
1420 			nvrc = nvlist_remove_all(
1421 			    negotiated_nvl, ikvx->ik_key_name);
1422 			ASSERT(nvrc == 0);
1423 			break;
1424 		case KI_MAX_RECV_DATA_SEGMENT_LENGTH:
1425 			/*
1426 			 * Just pass the value down to idm layer.
1427 			 * No need to remove it from negotiated_nvl list here.
1428 			 */
1429 			nvrc = nvpair_value_uint64(nvp, &num_val);
1430 			ASSERT(nvrc == 0);
1431 			it->ic_conn_params.max_xmit_dataseglen =
1432 			    (uint32_t)num_val;
1433 			break;
1434 		default:
1435 			break;
1436 		}
1437 	}
1438 }
1439 
1440 /*
1441  * idm_so_declare_key_values() declares the key values for this connection
1442  */
1443 /* ARGSUSED */
1444 static kv_status_t
1445 idm_so_declare_key_values(idm_conn_t *it, nvlist_t *config_nvl,
1446     nvlist_t *outgoing_nvl)
1447 {
1448 	char			*nvp_name;
1449 	nvpair_t		*nvp;
1450 	nvpair_t		*next_nvp;
1451 	kv_status_t		kvrc;
1452 	int			nvrc = 0;
1453 	const idm_kv_xlate_t	*ikvx;
1454 	uint64_t		num_val;
1455 
1456 	for (nvp = nvlist_next_nvpair(config_nvl, NULL);
1457 	    nvp != NULL && nvrc == 0; nvp = next_nvp) {
1458 		next_nvp = nvlist_next_nvpair(config_nvl, nvp);
1459 		nvp_name = nvpair_name(nvp);
1460 
1461 		ikvx = idm_lookup_kv_xlate(nvp_name, strlen(nvp_name));
1462 		switch (ikvx->ik_key_id) {
1463 		case KI_MAX_RECV_DATA_SEGMENT_LENGTH:
1464 			if ((nvrc = nvpair_value_uint64(nvp, &num_val)) != 0) {
1465 				break;
1466 			}
1467 			if (outgoing_nvl &&
1468 			    (nvrc = nvlist_add_uint64(outgoing_nvl,
1469 			    nvp_name, num_val)) != 0) {
1470 				break;
1471 			}
1472 			it->ic_conn_params.max_recv_dataseglen =
1473 			    (uint32_t)num_val;
1474 			break;
1475 		default:
1476 			break;
1477 		}
1478 	}
1479 	kvrc = idm_nvstat_to_kvstat(nvrc);
1480 	return (kvrc);
1481 }
1482 
1483 static idm_status_t
1484 idm_so_handle_digest(idm_conn_t *it, nvpair_t *digest_choice,
1485     const idm_kv_xlate_t *ikvx)
1486 {
1487 	int			nvrc;
1488 	char			*digest_choice_string;
1489 
1490 	nvrc = nvpair_value_string(digest_choice,
1491 	    &digest_choice_string);
1492 	ASSERT(nvrc == 0);
1493 	if (strcasecmp(digest_choice_string, "crc32c") == 0) {
1494 		switch (ikvx->ik_key_id) {
1495 		case KI_HEADER_DIGEST:
1496 			it->ic_conn_flags |= IDM_CONN_HEADER_DIGEST;
1497 			break;
1498 		case KI_DATA_DIGEST:
1499 			it->ic_conn_flags |= IDM_CONN_DATA_DIGEST;
1500 			break;
1501 		default:
1502 			ASSERT(0);
1503 			break;
1504 		}
1505 	} else if (strcasecmp(digest_choice_string, "none") == 0) {
1506 		switch (ikvx->ik_key_id) {
1507 		case KI_HEADER_DIGEST:
1508 			it->ic_conn_flags &= ~IDM_CONN_HEADER_DIGEST;
1509 			break;
1510 		case KI_DATA_DIGEST:
1511 			it->ic_conn_flags &= ~IDM_CONN_DATA_DIGEST;
1512 			break;
1513 		default:
1514 			ASSERT(0);
1515 			break;
1516 		}
1517 	} else {
1518 		ASSERT(0);
1519 	}
1520 
1521 	return (IDM_STATUS_SUCCESS);
1522 }
1523 
1524 
1525 /*
1526  * idm_so_conn_is_capable() verifies that the passed connection is provided
1527  * for by the sockets interface.
1528  */
1529 /* ARGSUSED */
1530 static boolean_t
1531 idm_so_conn_is_capable(idm_conn_req_t *ic, idm_transport_caps_t *caps)
1532 {
1533 	return (B_TRUE);
1534 }
1535 
1536 /*
1537  * idm_so_rx_datain() validates the Data Sequence number of the PDU. The
1538  * idm_sorecv_scsidata() function invoked earlier actually reads the data
1539  * off the socket into the appropriate buffers.
1540  */
1541 static void
1542 idm_so_rx_datain(idm_conn_t *ic, idm_pdu_t *pdu)
1543 {
1544 	iscsi_data_hdr_t	*bhs;
1545 	idm_task_t		*idt;
1546 	idm_buf_t		*idb;
1547 	uint32_t		datasn;
1548 	size_t			offset;
1549 	iscsi_hdr_t		*ihp = (iscsi_hdr_t *)pdu->isp_hdr;
1550 	iscsi_data_rsp_hdr_t    *idrhp = (iscsi_data_rsp_hdr_t *)ihp;
1551 
1552 	ASSERT(ic != NULL);
1553 	ASSERT(pdu != NULL);
1554 
1555 	bhs	= (iscsi_data_hdr_t *)pdu->isp_hdr;
1556 	datasn	= ntohl(bhs->datasn);
1557 	offset	= ntohl(bhs->offset);
1558 
1559 	ASSERT(bhs->opcode == ISCSI_OP_SCSI_DATA_RSP);
1560 
1561 	/*
1562 	 * Look up the task corresponding to the initiator task tag
1563 	 * to get the buffers affiliated with the task.
1564 	 */
1565 	idt = idm_task_find(ic, bhs->itt, bhs->ttt);
1566 	if (idt == NULL) {
1567 		IDM_CONN_LOG(CE_WARN, "idm_so_rx_datain: failed to find task");
1568 		idm_pdu_rx_protocol_error(ic, pdu);
1569 		return;
1570 	}
1571 
1572 	idb = pdu->isp_sorx_buf;
1573 	if (idb == NULL) {
1574 		IDM_CONN_LOG(CE_WARN,
1575 		    "idm_so_rx_datain: failed to find buffer");
1576 		idm_task_rele(idt);
1577 		idm_pdu_rx_protocol_error(ic, pdu);
1578 		return;
1579 	}
1580 
1581 	/*
1582 	 * DataSN values should be sequential and should not have any gaps or
1583 	 * repetitions. Check the DataSN with the one stored in the task.
1584 	 */
1585 	if (datasn == idt->idt_exp_datasn) {
1586 		idt->idt_exp_datasn++; /* keep track of DataSN received */
1587 	} else {
1588 		IDM_CONN_LOG(CE_WARN, "idm_so_rx_datain: datasn out of order");
1589 		idm_task_rele(idt);
1590 		idm_pdu_rx_protocol_error(ic, pdu);
1591 		return;
1592 	}
1593 
1594 	/*
1595 	 * PDUs in a sequence should be in continuously increasing
1596 	 * address offset
1597 	 */
1598 	if (offset != idb->idb_exp_offset) {
1599 		IDM_CONN_LOG(CE_WARN, "idm_so_rx_datain: unexpected offset");
1600 		idm_task_rele(idt);
1601 		idm_pdu_rx_protocol_error(ic, pdu);
1602 		return;
1603 	}
1604 	/* Expected next relative buffer offset */
1605 	idb->idb_exp_offset += n2h24(bhs->dlength);
1606 	idt->idt_rx_bytes += n2h24(bhs->dlength);
1607 
1608 	idm_task_rele(idt);
1609 
1610 	/*
1611 	 * For now call scsi_rsp which will process the data rsp
1612 	 * Revisit, need to provide an explicit client entry point for
1613 	 * phase collapse completions.
1614 	 */
1615 	if (((ihp->opcode & ISCSI_OPCODE_MASK) == ISCSI_OP_SCSI_DATA_RSP) &&
1616 	    (idrhp->flags & ISCSI_FLAG_DATA_STATUS)) {
1617 		(*ic->ic_conn_ops.icb_rx_scsi_rsp)(ic, pdu);
1618 	}
1619 
1620 	idm_pdu_complete(pdu, IDM_STATUS_SUCCESS);
1621 }
1622 
1623 /*
1624  * The idm_so_rx_dataout() function is used by the iSCSI target to read
1625  * data from the Data-Out PDU sent by the iSCSI initiator.
1626  *
1627  * This function gets the Initiator Task Tag from the PDU BHS and looks up the
1628  * task to get the buffers associated with the PDU. A PDU might span buffers.
1629  * The data is then read into the respective buffer.
1630  */
1631 static void
1632 idm_so_rx_dataout(idm_conn_t *ic, idm_pdu_t *pdu)
1633 {
1634 
1635 	iscsi_data_hdr_t	*bhs;
1636 	idm_task_t		*idt;
1637 	idm_buf_t		*idb;
1638 	size_t			offset;
1639 
1640 	ASSERT(ic != NULL);
1641 	ASSERT(pdu != NULL);
1642 
1643 	bhs = (iscsi_data_hdr_t *)pdu->isp_hdr;
1644 	offset = ntohl(bhs->offset);
1645 	ASSERT(bhs->opcode == ISCSI_OP_SCSI_DATA);
1646 
1647 	/*
1648 	 * Look up the task corresponding to the initiator task tag
1649 	 * to get the buffers affiliated with the task.
1650 	 */
1651 	idt = idm_task_find(ic, bhs->itt, bhs->ttt);
1652 	if (idt == NULL) {
1653 		IDM_CONN_LOG(CE_WARN,
1654 		    "idm_so_rx_dataout: failed to find task");
1655 		idm_pdu_rx_protocol_error(ic, pdu);
1656 		return;
1657 	}
1658 
1659 	idb = pdu->isp_sorx_buf;
1660 	if (idb == NULL) {
1661 		IDM_CONN_LOG(CE_WARN,
1662 		    "idm_so_rx_dataout: failed to find buffer");
1663 		idm_task_rele(idt);
1664 		idm_pdu_rx_protocol_error(ic, pdu);
1665 		return;
1666 	}
1667 
1668 	/* Keep track of data transferred - check data offsets */
1669 	if (offset != idb->idb_exp_offset) {
1670 		IDM_CONN_LOG(CE_NOTE, "idm_so_rx_dataout: offset out of seq: "
1671 		    "%ld, %d", offset, idb->idb_exp_offset);
1672 		idm_task_rele(idt);
1673 		idm_pdu_rx_protocol_error(ic, pdu);
1674 		return;
1675 	}
1676 	/* Expected next relative offset */
1677 	idb->idb_exp_offset += ntoh24(bhs->dlength);
1678 	idt->idt_rx_bytes += n2h24(bhs->dlength);
1679 
1680 	/*
1681 	 * Call the buffer callback when the transfer is complete
1682 	 *
1683 	 * The connection state machine should only abort tasks after
1684 	 * shutting down the connection so we are assured that there
1685 	 * won't be a simultaneous attempt to abort this task at the
1686 	 * same time as we are processing this PDU (due to a connection
1687 	 * state change).
1688 	 */
1689 	if (bhs->flags & ISCSI_FLAG_FINAL) {
1690 		/*
1691 		 * We only want to call idm_buf_rx_from_ini_done once
1692 		 * per transfer.  It's possible that this task has
1693 		 * already been aborted in which case
1694 		 * idm_so_free_task_rsrc will call idm_buf_rx_from_ini_done
1695 		 * for each buffer with idb_in_transport==B_TRUE.  To
1696 		 * close this window and ensure that this doesn't happen,
1697 		 * we'll clear idb->idb_in_transport now while holding
1698 		 * the task mutex.   This is only really an issue for
1699 		 * SCSI task abort -- if tasks were being aborted because
1700 		 * of a connection state change the state machine would
1701 		 * have already stopped the receive thread.
1702 		 */
1703 		mutex_enter(&idt->idt_mutex);
1704 
1705 		/*
1706 		 * Release the task hold here (obtained in idm_task_find)
1707 		 * because the task may complete synchronously during
1708 		 * idm_buf_rx_from_ini_done.  Since we still have an active
1709 		 * buffer we know there is at least one additional hold on idt.
1710 		 */
1711 		idm_task_rele(idt);
1712 
1713 		/*
1714 		 * idm_buf_rx_from_ini_done releases idt->idt_mutex
1715 		 */
1716 		DTRACE_ISCSI_8(xfer__done, idm_conn_t *, idt->idt_ic,
1717 		    uintptr_t, idb->idb_buf, uint32_t, idb->idb_bufoffset,
1718 		    uint64_t, 0, uint32_t, 0, uint32_t, 0,
1719 		    uint32_t, idb->idb_xfer_len,
1720 		    int, XFER_BUF_RX_FROM_INI);
1721 		idm_buf_rx_from_ini_done(idt, idb, IDM_STATUS_SUCCESS);
1722 		idm_pdu_complete(pdu, IDM_STATUS_SUCCESS);
1723 		return;
1724 	}
1725 
1726 	idm_task_rele(idt);
1727 	idm_pdu_complete(pdu, IDM_STATUS_SUCCESS);
1728 }
1729 
1730 /*
1731  * The idm_so_rx_rtt() function is used by the iSCSI initiator to handle
1732  * the R2T PDU sent by the iSCSI target indicating that it is ready to
1733  * accept data. This gets the Initiator Task Tag (itt) from the PDU BHS
1734  * and looks up the task in the task tree using the itt to get the output
1735  * buffers associated the task. The R2T PDU contains the offset of the
1736  * requested data and the data length. This function then constructs a
1737  * sequence of iSCSI PDUs and outputs the requested data. Each Data-Out
1738  * PDU is associated with the R2T by the Target Transfer Tag  (ttt).
1739  */
1740 
1741 static void
1742 idm_so_rx_rtt(idm_conn_t *ic, idm_pdu_t *pdu)
1743 {
1744 	idm_task_t		*idt;
1745 	idm_buf_t		*idb;
1746 	iscsi_rtt_hdr_t		*rtt_hdr;
1747 	uint32_t		data_offset;
1748 	uint32_t		data_length;
1749 
1750 	ASSERT(ic != NULL);
1751 	ASSERT(pdu != NULL);
1752 
1753 	rtt_hdr	= (iscsi_rtt_hdr_t *)pdu->isp_hdr;
1754 	data_offset = ntohl(rtt_hdr->data_offset);
1755 	data_length = ntohl(rtt_hdr->data_length);
1756 	idt	= idm_task_find(ic, rtt_hdr->itt, rtt_hdr->ttt);
1757 
1758 	if (idt == NULL) {
1759 		IDM_CONN_LOG(CE_WARN, "idm_so_rx_rtt: could not find task");
1760 		idm_pdu_rx_protocol_error(ic, pdu);
1761 		return;
1762 	}
1763 
1764 	/* Find the buffer bound to the task by the iSCSI initiator */
1765 	mutex_enter(&idt->idt_mutex);
1766 	idb = idm_buf_find(&idt->idt_outbufv, data_offset);
1767 	if (idb == NULL) {
1768 		mutex_exit(&idt->idt_mutex);
1769 		idm_task_rele(idt);
1770 		IDM_CONN_LOG(CE_WARN, "idm_so_rx_rtt: could not find buffer");
1771 		idm_pdu_rx_protocol_error(ic, pdu);
1772 		return;
1773 	}
1774 
1775 	/* return buffer contains this data */
1776 	if (data_offset + data_length > idb->idb_buflen) {
1777 		/* Overflow */
1778 		mutex_exit(&idt->idt_mutex);
1779 		idm_task_rele(idt);
1780 		IDM_CONN_LOG(CE_WARN, "idm_so_rx_rtt: read from outside "
1781 		    "buffer");
1782 		idm_pdu_rx_protocol_error(ic, pdu);
1783 		return;
1784 	}
1785 
1786 	idt->idt_r2t_ttt = rtt_hdr->ttt;
1787 	idt->idt_exp_datasn = 0;
1788 
1789 	idm_so_send_rtt_data(ic, idt, idb, data_offset,
1790 	    ntohl(rtt_hdr->data_length));
1791 	mutex_exit(&idt->idt_mutex);
1792 
1793 	idm_pdu_complete(pdu, IDM_STATUS_SUCCESS);
1794 	idm_task_rele(idt);
1795 
1796 }
1797 
1798 idm_status_t
1799 idm_sorecvdata(idm_conn_t *ic, idm_pdu_t *pdu)
1800 {
1801 	uint8_t		pad[ISCSI_PAD_WORD_LEN];
1802 	int		pad_len;
1803 	uint32_t	data_digest_crc;
1804 	uint32_t	crc_calculated;
1805 	int		total_len;
1806 	idm_so_conn_t	*so_conn;
1807 
1808 	so_conn = ic->ic_transport_private;
1809 
1810 	pad_len = ((ISCSI_PAD_WORD_LEN -
1811 	    (pdu->isp_datalen & (ISCSI_PAD_WORD_LEN - 1))) &
1812 	    (ISCSI_PAD_WORD_LEN - 1));
1813 
1814 	ASSERT(pdu->isp_iovlen < (PDU_MAX_IOVLEN - 2)); /* pad + data digest */
1815 
1816 	total_len = pdu->isp_datalen;
1817 
1818 	if (pad_len) {
1819 		pdu->isp_iov[pdu->isp_iovlen].iov_base	= (char *)&pad;
1820 		pdu->isp_iov[pdu->isp_iovlen].iov_len	= pad_len;
1821 		total_len		+= pad_len;
1822 		pdu->isp_iovlen++;
1823 	}
1824 
1825 	/* setup data digest */
1826 	if ((ic->ic_conn_flags & IDM_CONN_DATA_DIGEST) != 0) {
1827 		pdu->isp_iov[pdu->isp_iovlen].iov_base =
1828 		    (char *)&data_digest_crc;
1829 		pdu->isp_iov[pdu->isp_iovlen].iov_len =
1830 		    sizeof (data_digest_crc);
1831 		total_len		+= sizeof (data_digest_crc);
1832 		pdu->isp_iovlen++;
1833 	}
1834 
1835 	pdu->isp_data = (uint8_t *)(uintptr_t)pdu->isp_iov[0].iov_base;
1836 
1837 	if (idm_iov_sorecv(so_conn->ic_so, &pdu->isp_iov[0],
1838 	    pdu->isp_iovlen, total_len) != 0) {
1839 		return (IDM_STATUS_IO);
1840 	}
1841 
1842 	if ((ic->ic_conn_flags & IDM_CONN_DATA_DIGEST) != 0) {
1843 		crc_calculated = idm_crc32c(pdu->isp_data,
1844 		    pdu->isp_datalen);
1845 		if (pad_len) {
1846 			crc_calculated = idm_crc32c_continued((char *)&pad,
1847 			    pad_len, crc_calculated);
1848 		}
1849 		if (crc_calculated != data_digest_crc) {
1850 			IDM_CONN_LOG(CE_WARN,
1851 			    "idm_sorecvdata: "
1852 			    "CRC error: actual 0x%x, calc 0x%x",
1853 			    data_digest_crc, crc_calculated);
1854 
1855 			/* Invalid Data Digest */
1856 			return (IDM_STATUS_DATA_DIGEST);
1857 		}
1858 	}
1859 
1860 	return (IDM_STATUS_SUCCESS);
1861 }
1862 
1863 /*
1864  * idm_sorecv_scsidata() is used to receive scsi data from the socket. The
1865  * Data-type PDU header must be read into the idm_pdu_t structure prior to
1866  * calling this function.
1867  */
1868 idm_status_t
1869 idm_sorecv_scsidata(idm_conn_t *ic, idm_pdu_t *pdu)
1870 {
1871 	iscsi_data_hdr_t	*bhs;
1872 	idm_task_t		*task;
1873 	uint32_t		offset;
1874 	uint8_t			opcode;
1875 	uint32_t		dlength;
1876 	list_t			*buflst;
1877 	uint32_t		xfer_bytes;
1878 	idm_status_t		status;
1879 
1880 	ASSERT(ic != NULL);
1881 	ASSERT(pdu != NULL);
1882 
1883 	bhs	= (iscsi_data_hdr_t *)pdu->isp_hdr;
1884 
1885 	offset	= ntohl(bhs->offset);
1886 	opcode	= bhs->opcode;
1887 	dlength = n2h24(bhs->dlength);
1888 
1889 	ASSERT((opcode == ISCSI_OP_SCSI_DATA_RSP) ||
1890 	    (opcode == ISCSI_OP_SCSI_DATA));
1891 
1892 	/*
1893 	 * Successful lookup implicitly gets a "hold" on the task.  This
1894 	 * hold must be released before leaving this function.  At one
1895 	 * point we were caching this task context and retaining the hold
1896 	 * but it turned out to be very difficult to release the hold properly.
1897 	 * The task can be aborted and the connection shutdown between this
1898 	 * call and the subsequent expected call to idm_so_rx_datain/
1899 	 * idm_so_rx_dataout (in which case those functions are not called).
1900 	 * Releasing the hold in the PDU callback doesn't work well either
1901 	 * because the whole task may be completed by then at which point
1902 	 * it is too late to release the hold -- for better or worse this
1903 	 * code doesn't wait on the refcnts during normal operation.
1904 	 * idm_task_find() is very fast and it is not a huge burden if we
1905 	 * have to do it twice.
1906 	 */
1907 	task = idm_task_find(ic, bhs->itt, bhs->ttt);
1908 	if (task == NULL) {
1909 		IDM_CONN_LOG(CE_WARN,
1910 		    "idm_sorecv_scsidata: could not find task");
1911 		return (IDM_STATUS_FAIL);
1912 	}
1913 
1914 	mutex_enter(&task->idt_mutex);
1915 	buflst	= (opcode == ISCSI_OP_SCSI_DATA_RSP) ?
1916 	    &task->idt_inbufv : &task->idt_outbufv;
1917 	pdu->isp_sorx_buf = idm_buf_find(buflst, offset);
1918 	mutex_exit(&task->idt_mutex);
1919 
1920 	if (pdu->isp_sorx_buf == NULL) {
1921 		idm_task_rele(task);
1922 		IDM_CONN_LOG(CE_WARN, "idm_sorecv_scsidata: could not find "
1923 		    "buffer for offset %x opcode=%x",
1924 		    offset, opcode);
1925 		return (IDM_STATUS_FAIL);
1926 	}
1927 
1928 	xfer_bytes = idm_fill_iov(pdu, pdu->isp_sorx_buf, offset, dlength);
1929 	ASSERT(xfer_bytes != 0);
1930 	if (xfer_bytes != dlength) {
1931 		idm_task_rele(task);
1932 		/*
1933 		 * Buffer overflow, connection error.  The PDU data is still
1934 		 * sitting in the socket so we can't use the connection
1935 		 * again until that data is drained.
1936 		 */
1937 		return (IDM_STATUS_FAIL);
1938 	}
1939 
1940 	status = idm_sorecvdata(ic, pdu);
1941 
1942 	idm_task_rele(task);
1943 
1944 	return (status);
1945 }
1946 
1947 static uint32_t
1948 idm_fill_iov(idm_pdu_t *pdu, idm_buf_t *idb, uint32_t ro, uint32_t dlength)
1949 {
1950 	uint32_t	buf_ro = ro - idb->idb_bufoffset;
1951 	uint32_t	xfer_len = min(dlength, idb->idb_buflen - buf_ro);
1952 
1953 	ASSERT(ro >= idb->idb_bufoffset);
1954 
1955 	pdu->isp_iov[pdu->isp_iovlen].iov_base	=
1956 	    (caddr_t)idb->idb_buf + buf_ro;
1957 	pdu->isp_iov[pdu->isp_iovlen].iov_len	= xfer_len;
1958 	pdu->isp_iovlen++;
1959 
1960 	return (xfer_len);
1961 }
1962 
1963 int
1964 idm_sorecv_nonscsidata(idm_conn_t *ic, idm_pdu_t *pdu)
1965 {
1966 	pdu->isp_data = kmem_alloc(pdu->isp_datalen, KM_SLEEP);
1967 	ASSERT(pdu->isp_data != NULL);
1968 
1969 	pdu->isp_databuflen = pdu->isp_datalen;
1970 	pdu->isp_iov[0].iov_base = (caddr_t)pdu->isp_data;
1971 	pdu->isp_iov[0].iov_len = pdu->isp_datalen;
1972 	pdu->isp_iovlen = 1;
1973 	/*
1974 	 * Since we are associating a new data buffer with this received
1975 	 * PDU we need to set a specific callback to free the data
1976 	 * after the PDU is processed.
1977 	 */
1978 	pdu->isp_flags |= IDM_PDU_ADDL_DATA;
1979 	pdu->isp_callback = idm_sorx_addl_pdu_cb;
1980 
1981 	return (idm_sorecvdata(ic, pdu));
1982 }
1983 
/*
 * idm_sorx_thread
 *
 * Receive thread for a sockets-transport connection.  'arg' is the
 * idm_conn_t serviced by this thread.
 *
 * The thread takes a hold on the connection (released just before
 * thread_exit()), records its running state and thread ID in the
 * idm_so_conn_t and signals ic_cv.  It then loops reading one PDU at a
 * time from the socket: the header first and then, if isp_datalen is
 * non-zero, the data segment.  Each PDU that is received successfully is
 * handed to idm_pdu_rx().
 *
 * The loop terminates when ic_rx_thread_running is found clear.  If this
 * thread cleared the flag itself because a receive failed, it also stops
 * the TX thread and generates a CE_TRANSPORT_FAIL connection event,
 * passing the failing status as the event data.
 */
void
idm_sorx_thread(void *arg)
{
	boolean_t	conn_failure = B_FALSE;
	idm_conn_t	*ic = (idm_conn_t *)arg;
	idm_so_conn_t	*so_conn;
	idm_pdu_t	*pdu;
	idm_status_t	rc;

	idm_conn_hold(ic);

	mutex_enter(&ic->ic_mutex);

	/* Publish our state and thread ID, then wake anyone waiting on ic_cv */
	so_conn = ic->ic_transport_private;
	so_conn->ic_rx_thread_running = B_TRUE;
	so_conn->ic_rx_thread_did = so_conn->ic_rx_thread->t_did;
	cv_signal(&ic->ic_cv);

	/*
	 * ic_mutex is held whenever the loop condition is evaluated and is
	 * dropped for the duration of the socket reads.
	 */
	while (so_conn->ic_rx_thread_running) {
		mutex_exit(&ic->ic_mutex);

		/*
		 * Get PDU with default header size (large enough for
		 * BHS plus any anticipated AHS).  PDU from
		 * the cache will have all values set correctly
		 * for sockets RX including callback.
		 */
		pdu = kmem_cache_alloc(idm.idm_sorx_pdu_cache, KM_SLEEP);
		pdu->isp_ic = ic;
		pdu->isp_flags = 0;
		pdu->isp_transport_hdrlen = 0;

		if ((rc = idm_sorecvhdr(ic, pdu)) != 0) {
			/*
			 * Call idm_pdu_complete so that we call the callback
			 * and ensure any memory allocated in idm_sorecvhdr
			 * gets freed up.
			 */
			idm_pdu_complete(pdu, IDM_STATUS_FAIL);

			/*
			 * If ic_rx_thread_running is still set then
			 * this is some kind of connection problem
			 * on the socket.  In this case we want to
			 * generate an event.  Otherwise some other
			 * thread closed the socket due to another
			 * issue in which case we don't need to
			 * generate an event.
			 */
			mutex_enter(&ic->ic_mutex);
			if (so_conn->ic_rx_thread_running) {
				conn_failure = B_TRUE;
				so_conn->ic_rx_thread_running = B_FALSE;
			}

			/* ic_mutex stays held for the loop test */
			continue;
		}

		/*
		 * Header has been read and validated.  Now we need
		 * to read the PDU data payload (if present).  SCSI data
		 * need to be transferred from the socket directly into
		 * the associated transfer buffer for the SCSI task.
		 */
		if (pdu->isp_datalen != 0) {
			if ((IDM_PDU_OPCODE(pdu) == ISCSI_OP_SCSI_DATA) ||
			    (IDM_PDU_OPCODE(pdu) == ISCSI_OP_SCSI_DATA_RSP)) {
				rc = idm_sorecv_scsidata(ic, pdu);
				/*
				 * All SCSI errors are fatal to the
				 * connection right now since we have no
				 * place to put the data.  What we need
				 * is some kind of sink to dispose of unwanted
				 * SCSI data.  For example an invalid task tag
				 * should not kill the connection (although
				 * we may want to drop the connection).
				 */
			} else {
				/*
				 * Not data PDUs so allocate a buffer for the
				 * data segment and read the remaining data.
				 */
				rc = idm_sorecv_nonscsidata(ic, pdu);
			}
			if (rc != 0) {
				/*
				 * Call idm_pdu_complete so that we call the
				 * callback and ensure any memory allocated
				 * in idm_sorecvhdr gets freed up.
				 */
				idm_pdu_complete(pdu, IDM_STATUS_FAIL);

				/*
				 * If ic_rx_thread_running is still set then
				 * this is some kind of connection problem
				 * on the socket.  In this case we want to
				 * generate an event.  Otherwise some other
				 * thread closed the socket due to another
				 * issue in which case we don't need to
				 * generate an event.
				 */
				mutex_enter(&ic->ic_mutex);
				if (so_conn->ic_rx_thread_running) {
					conn_failure = B_TRUE;
					so_conn->ic_rx_thread_running = B_FALSE;
				}
				/* ic_mutex stays held for the loop test */
				continue;
			}
		}

		/*
		 * Process RX PDU
		 */
		idm_pdu_rx(ic, pdu);

		mutex_enter(&ic->ic_mutex);
	}

	mutex_exit(&ic->ic_mutex);

	/*
	 * If we dropped out of the RX processing loop because of
	 * a socket problem or other connection failure (including
	 * digest errors) then we need to generate a state machine
	 * event to shut the connection down.
	 * If the state machine is already in, for example, INIT_ERROR, this
	 * event will get dropped, and the TX thread will never be notified
	 * to shut down.  To be safe, we'll just notify it here.
	 */
	if (conn_failure) {
		/*
		 * NOTE(review): ic_tx_thread_running is tested and cleared
		 * here before ic_tx_mutex is taken; only the cv_signal is
		 * issued under the mutex.
		 */
		if (so_conn->ic_tx_thread_running) {
			so_conn->ic_tx_thread_running = B_FALSE;
			mutex_enter(&so_conn->ic_tx_mutex);
			cv_signal(&so_conn->ic_tx_cv);
			mutex_exit(&so_conn->ic_tx_mutex);
		}

		/* rc holds the status of the receive that failed */
		idm_conn_event(ic, CE_TRANSPORT_FAIL, rc);
	}

	idm_conn_rele(ic);

	thread_exit();
}
2128 
2129 /*
2130  * idm_so_tx
2131  *
2132  * This is the implementation of idm_transport_ops_t's it_tx_pdu entry
2133  * point.  By definition, it is supposed to be fast.  So, simply queue
2134  * the entry and return.  The real work is done by idm_i_so_tx() via
2135  * idm_sotx_thread().
2136  */
2137 
2138 static void
2139 idm_so_tx(idm_conn_t *ic, idm_pdu_t *pdu)
2140 {
2141 	idm_so_conn_t *so_conn = ic->ic_transport_private;
2142 
2143 	ASSERT(pdu->isp_ic == ic);
2144 	mutex_enter(&so_conn->ic_tx_mutex);
2145 
2146 	if (!so_conn->ic_tx_thread_running) {
2147 		mutex_exit(&so_conn->ic_tx_mutex);
2148 		idm_pdu_complete(pdu, IDM_STATUS_ABORTED);
2149 		return;
2150 	}
2151 
2152 	list_insert_tail(&so_conn->ic_tx_list, (void *)pdu);
2153 	cv_signal(&so_conn->ic_tx_cv);
2154 	mutex_exit(&so_conn->ic_tx_mutex);
2155 }
2156 
2157 static idm_status_t
2158 idm_i_so_tx(idm_pdu_t *pdu)
2159 {
2160 	idm_conn_t	*ic = pdu->isp_ic;
2161 	idm_status_t	status = IDM_STATUS_SUCCESS;
2162 	uint8_t		pad[ISCSI_PAD_WORD_LEN];
2163 	int		pad_len;
2164 	uint32_t	hdr_digest_crc;
2165 	uint32_t	data_digest_crc = 0;
2166 	int		total_len = 0;
2167 	int		iovlen = 0;
2168 	struct iovec	iov[6];
2169 	idm_so_conn_t	*so_conn;
2170 
2171 	so_conn = ic->ic_transport_private;
2172 
2173 	/* Setup BHS */
2174 	iov[iovlen].iov_base	= (caddr_t)pdu->isp_hdr;
2175 	iov[iovlen].iov_len	= pdu->isp_hdrlen;
2176 	total_len		+= iov[iovlen].iov_len;
2177 	iovlen++;
2178 
2179 	/* Setup header digest */
2180 	if (((pdu->isp_flags & IDM_PDU_LOGIN_TX) == 0) &&
2181 	    (ic->ic_conn_flags & IDM_CONN_HEADER_DIGEST)) {
2182 		hdr_digest_crc = idm_crc32c(pdu->isp_hdr, pdu->isp_hdrlen);
2183 
2184 		iov[iovlen].iov_base	= (caddr_t)&hdr_digest_crc;
2185 		iov[iovlen].iov_len	= sizeof (hdr_digest_crc);
2186 		total_len		+= iov[iovlen].iov_len;
2187 		iovlen++;
2188 	}
2189 
2190 	/* Setup the data */
2191 	if (pdu->isp_datalen) {
2192 		idm_task_t		*idt;
2193 		idm_buf_t		*idb;
2194 		iscsi_data_hdr_t	*ihp;
2195 		ihp = (iscsi_data_hdr_t *)pdu->isp_hdr;
2196 		/* Write of immediate data */
2197 		if (ic->ic_ffp &&
2198 		    (ihp->opcode == ISCSI_OP_SCSI_CMD ||
2199 		    ihp->opcode == ISCSI_OP_SCSI_DATA)) {
2200 			idt = idm_task_find(ic, ihp->itt, ihp->ttt);
2201 			if (idt) {
2202 				mutex_enter(&idt->idt_mutex);
2203 				idb = idm_buf_find(&idt->idt_outbufv, 0);
2204 				mutex_exit(&idt->idt_mutex);
2205 				/*
2206 				 * If the initiator call to idm_buf_alloc
2207 				 * failed then we can get to this point
2208 				 * without a bound buffer.  The associated
2209 				 * connection failure will clean things up
2210 				 * later.  It would be nice to come up with
2211 				 * a cleaner way to handle this.  In
2212 				 * particular it seems absurd to look up
2213 				 * the task and the buffer just to update
2214 				 * this counter.
2215 				 */
2216 				if (idb)
2217 					idb->idb_xfer_len += pdu->isp_datalen;
2218 				idm_task_rele(idt);
2219 			}
2220 		}
2221 
2222 		iov[iovlen].iov_base = (caddr_t)pdu->isp_data;
2223 		iov[iovlen].iov_len  = pdu->isp_datalen;
2224 		total_len += iov[iovlen].iov_len;
2225 		iovlen++;
2226 	}
2227 
2228 	/* Setup the data pad if necessary */
2229 	pad_len = ((ISCSI_PAD_WORD_LEN -
2230 	    (pdu->isp_datalen & (ISCSI_PAD_WORD_LEN - 1))) &
2231 	    (ISCSI_PAD_WORD_LEN - 1));
2232 
2233 	if (pad_len) {
2234 		bzero(pad, sizeof (pad));
2235 		iov[iovlen].iov_base = (void *)&pad;
2236 		iov[iovlen].iov_len  = pad_len;
2237 		total_len		+= iov[iovlen].iov_len;
2238 		iovlen++;
2239 	}
2240 
2241 	/*
2242 	 * Setup the data digest if enabled.  Data-digest is not sent
2243 	 * for login-phase PDUs.
2244 	 */
2245 	if ((ic->ic_conn_flags & IDM_CONN_DATA_DIGEST) &&
2246 	    ((pdu->isp_flags & IDM_PDU_LOGIN_TX) == 0) &&
2247 	    (pdu->isp_datalen || pad_len)) {
2248 		/*
2249 		 * RFC3720/10.2.3: A zero-length Data Segment also
2250 		 * implies a zero-length data digest.
2251 		 */
2252 		if (pdu->isp_datalen) {
2253 			data_digest_crc = idm_crc32c(pdu->isp_data,
2254 			    pdu->isp_datalen);
2255 		}
2256 		if (pad_len) {
2257 			data_digest_crc = idm_crc32c_continued(&pad,
2258 			    pad_len, data_digest_crc);
2259 		}
2260 
2261 		iov[iovlen].iov_base	= (caddr_t)&data_digest_crc;
2262 		iov[iovlen].iov_len	= sizeof (data_digest_crc);
2263 		total_len		+= iov[iovlen].iov_len;
2264 		iovlen++;
2265 	}
2266 
2267 	/* Transmit the PDU */
2268 	if (idm_iov_sosend(so_conn->ic_so, &iov[0], iovlen,
2269 	    total_len) != 0) {
2270 		/* Set error status */
2271 		IDM_CONN_LOG(CE_WARN,
2272 		    "idm_so_tx: failed to transmit the PDU, so: %p ic: %p "
2273 		    "data: %p", (void *) so_conn->ic_so, (void *) ic,
2274 		    (void *) pdu->isp_data);
2275 		status = IDM_STATUS_IO;
2276 	}
2277 
2278 	/*
2279 	 * Success does not mean that the PDU actually reached the
2280 	 * remote node since it could get dropped along the way.
2281 	 */
2282 	idm_pdu_complete(pdu, status);
2283 
2284 	return (status);
2285 }
2286 
2287 /*
2288  * The idm_so_buf_tx_to_ini() is used by the target iSCSI layer to transmit the
2289  * Data-In PDUs using sockets. Based on the negotiated MaxRecvDataSegmentLength,
2290  * the buffer is segmented into a sequence of Data-In PDUs, ordered by DataSN.
2291  * A target can invoke this function multiple times for a single read command
2292  * (identified by the same ITT) to split the input into several sequences.
2293  *
2294  * DataSN starts with 0 for the first data PDU of an input command and advances
2295  * by 1 for each subsequent data PDU. Each sequence will have its own F bit,
2296  * which is set to 1 for the last data PDU of a sequence.
2297  *
2298  * Scope for Prototype build:
2299  * The data PDUs within a sequence will be sent in order with the buffer offset
2300  * in increasing order. i.e. initiator and target must have negotiated the
2301  * "DataPDUInOrder" to "Yes". The order between sequences is not enforced.
2302  *
2303  * Caller holds idt->idt_mutex
2304  */
2305 static idm_status_t
2306 idm_so_buf_tx_to_ini(idm_task_t *idt, idm_buf_t *idb)
2307 {
2308 	idm_so_conn_t	*so_conn = idb->idb_ic->ic_transport_private;
2309 	idm_pdu_t	tmppdu;
2310 
2311 	ASSERT(mutex_owned(&idt->idt_mutex));
2312 
2313 	/*
2314 	 * Put the idm_buf_t on the tx queue.  It will be transmitted by
2315 	 * idm_sotx_thread.
2316 	 */
2317 	mutex_enter(&so_conn->ic_tx_mutex);
2318 
2319 	DTRACE_ISCSI_8(xfer__start, idm_conn_t *, idt->idt_ic,
2320 	    uintptr_t, idb->idb_buf, uint32_t, idb->idb_bufoffset,
2321 	    uint64_t, 0, uint32_t, 0, uint32_t, 0,
2322 	    uint32_t, idb->idb_xfer_len, int, XFER_BUF_TX_TO_INI);
2323 
2324 	if (!so_conn->ic_tx_thread_running) {
2325 		mutex_exit(&so_conn->ic_tx_mutex);
2326 		/*
2327 		 * Don't release idt->idt_mutex since we're supposed to hold
2328 		 * in when calling idm_buf_tx_to_ini_done
2329 		 */
2330 		DTRACE_ISCSI_8(xfer__done, idm_conn_t *, idt->idt_ic,
2331 		    uintptr_t, idb->idb_buf, uint32_t, idb->idb_bufoffset,
2332 		    uint64_t, 0, uint32_t, 0, uint32_t, 0,
2333 		    uint32_t, idb->idb_xfer_len,
2334 		    int, XFER_BUF_TX_TO_INI);
2335 		idm_buf_tx_to_ini_done(idt, idb, IDM_STATUS_ABORTED);
2336 		return (IDM_STATUS_FAIL);
2337 	}
2338 
2339 	/*
2340 	 * Build a template for the data PDU headers we will use so that
2341 	 * the SN values will stay consistent with other PDU's we are
2342 	 * transmitting like R2T and SCSI status.
2343 	 */
2344 	bzero(&idb->idb_data_hdr_tmpl, sizeof (iscsi_hdr_t));
2345 	tmppdu.isp_hdr = &idb->idb_data_hdr_tmpl;
2346 	(*idt->idt_ic->ic_conn_ops.icb_build_hdr)(idt, &tmppdu,
2347 	    ISCSI_OP_SCSI_DATA_RSP);
2348 	idb->idb_tx_thread = B_TRUE;
2349 	list_insert_tail(&so_conn->ic_tx_list, (void *)idb);
2350 	cv_signal(&so_conn->ic_tx_cv);
2351 	mutex_exit(&so_conn->ic_tx_mutex);
2352 	mutex_exit(&idt->idt_mutex);
2353 
2354 	/*
2355 	 * Returning success here indicates the transfer was successfully
2356 	 * dispatched -- it does not mean that the transfer completed
2357 	 * successfully.
2358 	 */
2359 	return (IDM_STATUS_SUCCESS);
2360 }
2361 
2362 /*
2363  * The idm_so_buf_rx_from_ini() is used by the target iSCSI layer to specify the
2364  * data blocks it is ready to receive from the initiator in response to a WRITE
2365  * SCSI command. The target iSCSI layer passes the information about the desired
2366  * data blocks to the initiator in one R2T PDU. The receiving buffer, the buffer
2367  * offset and datalen are passed via the 'idb' argument.
2368  *
2369  * Scope for Prototype build:
2370  * R2Ts are required for any Data-Out PDU, i.e. initiator and target must have
2371  * negotiated the "InitialR2T" to "Yes".
2372  *
2373  * Caller holds idt->idt_mutex
2374  */
2375 static idm_status_t
2376 idm_so_buf_rx_from_ini(idm_task_t *idt, idm_buf_t *idb)
2377 {
2378 	idm_pdu_t		*pdu;
2379 	iscsi_rtt_hdr_t		*rtt;
2380 
2381 	ASSERT(mutex_owned(&idt->idt_mutex));
2382 
2383 	DTRACE_ISCSI_8(xfer__start, idm_conn_t *, idt->idt_ic,
2384 	    uintptr_t, idb->idb_buf, uint32_t, idb->idb_bufoffset,
2385 	    uint64_t, 0, uint32_t, 0, uint32_t, 0,
2386 	    uint32_t, idb->idb_xfer_len, int, XFER_BUF_RX_FROM_INI);
2387 
2388 	pdu = kmem_cache_alloc(idm.idm_sotx_pdu_cache, KM_SLEEP);
2389 	pdu->isp_ic = idt->idt_ic;
2390 	bzero(pdu->isp_hdr, sizeof (iscsi_rtt_hdr_t));
2391 
2392 	/* iSCSI layer fills the TTT, ITT, StatSN, ExpCmdSN, MaxCmdSN */
2393 	(*idt->idt_ic->ic_conn_ops.icb_build_hdr)(idt, pdu, ISCSI_OP_RTT_RSP);
2394 
2395 	/* set the rttsn, rtt.flags, rtt.data_offset and rtt.data_length */
2396 	rtt = (iscsi_rtt_hdr_t *)(pdu->isp_hdr);
2397 
2398 	rtt->opcode		= ISCSI_OP_RTT_RSP;
2399 	rtt->flags		= ISCSI_FLAG_FINAL;
2400 	rtt->data_offset	= htonl(idb->idb_bufoffset);
2401 	rtt->data_length	= htonl(idb->idb_xfer_len);
2402 	rtt->rttsn		= htonl(idt->idt_exp_rttsn++);
2403 
2404 	/* Keep track of buffer offsets */
2405 	idb->idb_exp_offset	= idb->idb_bufoffset;
2406 	mutex_exit(&idt->idt_mutex);
2407 
2408 	/*
2409 	 * Transmit the PDU.
2410 	 */
2411 	idm_pdu_tx(pdu);
2412 
2413 	return (IDM_STATUS_SUCCESS);
2414 }
2415 
2416 static idm_status_t
2417 idm_so_buf_alloc(idm_buf_t *idb, uint64_t buflen)
2418 {
2419 	if ((buflen > IDM_SO_BUF_CACHE_LB) && (buflen <= IDM_SO_BUF_CACHE_UB)) {
2420 		idb->idb_buf = kmem_cache_alloc(idm.idm_so_128k_buf_cache,
2421 		    KM_NOSLEEP);
2422 		idb->idb_buf_private = idm.idm_so_128k_buf_cache;
2423 	} else {
2424 		idb->idb_buf = kmem_alloc(buflen, KM_NOSLEEP);
2425 		idb->idb_buf_private = NULL;
2426 	}
2427 
2428 	if (idb->idb_buf == NULL) {
2429 		IDM_CONN_LOG(CE_NOTE,
2430 		    "idm_so_buf_alloc: failed buffer allocation");
2431 		return (IDM_STATUS_FAIL);
2432 	}
2433 
2434 	return (IDM_STATUS_SUCCESS);
2435 }
2436 
/*
 * idm_so_buf_setup
 *
 * Buffer setup hook for the sockets transport.  The only work done is to
 * clear idb_bufalloc on the buffer.  Always returns IDM_STATUS_SUCCESS.
 */
/* ARGSUSED */
static idm_status_t
idm_so_buf_setup(idm_buf_t *idb)
{
	/* Ensure bufalloc'd flag is unset */
	idb->idb_bufalloc = B_FALSE;

	return (IDM_STATUS_SUCCESS);
}
2446 
/*
 * idm_so_buf_teardown
 *
 * Buffer teardown hook for the sockets transport.  Intentionally empty:
 * idm_so_buf_setup() only clears a flag, so there is nothing to undo.
 */
/* ARGSUSED */
static void
idm_so_buf_teardown(idm_buf_t *idb)
{
	/* nothing to do here */
}
2453 
2454 static void
2455 idm_so_buf_free(idm_buf_t *idb)
2456 {
2457 	if (idb->idb_buf_private == NULL) {
2458 		kmem_free(idb->idb_buf, idb->idb_buflen);
2459 	} else {
2460 		kmem_cache_free(idb->idb_buf_private, idb->idb_buf);
2461 	}
2462 }
2463 
2464 static void
2465 idm_so_send_rtt_data(idm_conn_t *ic, idm_task_t *idt, idm_buf_t *idb,
2466     uint32_t offset, uint32_t length)
2467 {
2468 	idm_so_conn_t	*so_conn = ic->ic_transport_private;
2469 	idm_pdu_t	tmppdu;
2470 	idm_buf_t	*rtt_buf;
2471 
2472 	ASSERT(mutex_owned(&idt->idt_mutex));
2473 
2474 	/*
2475 	 * Allocate a buffer to represent the RTT transfer.  We could further
2476 	 * optimize this by allocating the buffers internally from an rtt
2477 	 * specific buffer cache since this is socket-specific code but for
2478 	 * now we will keep it simple.
2479 	 */
2480 	rtt_buf = idm_buf_alloc(ic, (uint8_t *)idb->idb_buf + offset, length);
2481 	if (rtt_buf == NULL) {
2482 		/*
2483 		 * If we're in FFP then the failure was likely a resource
2484 		 * allocation issue and we should close the connection by
2485 		 * sending a CE_TRANSPORT_FAIL event.
2486 		 *
2487 		 * If we're not in FFP then idm_buf_alloc will always
2488 		 * fail and the state is transitioning to "complete" anyway
2489 		 * so we won't bother to send an event.
2490 		 */
2491 		mutex_enter(&ic->ic_state_mutex);
2492 		if (ic->ic_ffp)
2493 			idm_conn_event_locked(ic, CE_TRANSPORT_FAIL,
2494 			    NULL, CT_NONE);
2495 		mutex_exit(&ic->ic_state_mutex);
2496 		return;
2497 	}
2498 
2499 	rtt_buf->idb_buf_cb = NULL;
2500 	rtt_buf->idb_cb_arg = NULL;
2501 	rtt_buf->idb_bufoffset = offset;
2502 	rtt_buf->idb_xfer_len = length;
2503 	rtt_buf->idb_ic = idt->idt_ic;
2504 	rtt_buf->idb_task_binding = idt;
2505 
2506 	/*
2507 	 * Put the idm_buf_t on the tx queue.  It will be transmitted by
2508 	 * idm_sotx_thread.
2509 	 */
2510 	mutex_enter(&so_conn->ic_tx_mutex);
2511 
2512 	if (!so_conn->ic_tx_thread_running) {
2513 		idm_buf_free(rtt_buf);
2514 		mutex_exit(&so_conn->ic_tx_mutex);
2515 		return;
2516 	}
2517 
2518 	/*
2519 	 * This new buffer represents an additional reference on the task
2520 	 */
2521 	idm_task_hold(idt);
2522 
2523 	/*
2524 	 * Build a template for the data PDU headers we will use so that
2525 	 * the SN values will stay consistent with other PDU's we are
2526 	 * transmitting like R2T and SCSI status.
2527 	 */
2528 	bzero(&rtt_buf->idb_data_hdr_tmpl, sizeof (iscsi_hdr_t));
2529 	tmppdu.isp_hdr = &rtt_buf->idb_data_hdr_tmpl;
2530 	(*idt->idt_ic->ic_conn_ops.icb_build_hdr)(idt, &tmppdu,
2531 	    ISCSI_OP_SCSI_DATA);
2532 	rtt_buf->idb_tx_thread = B_TRUE;
2533 	rtt_buf->idb_in_transport = B_TRUE;
2534 	list_insert_tail(&so_conn->ic_tx_list, (void *)rtt_buf);
2535 	cv_signal(&so_conn->ic_tx_cv);
2536 	mutex_exit(&so_conn->ic_tx_mutex);
2537 }
2538 
/*
 * idm_so_send_rtt_data_done
 *
 * Completion handling for a buffer queued by idm_so_send_rtt_data():
 * mark the buffer as no longer in transport, drop the task reference
 * taken when it was queued, and free the buffer.  idm_sotx_thread calls
 * this with idt->idt_mutex held.
 */
static void
idm_so_send_rtt_data_done(idm_task_t *idt, idm_buf_t *idb)
{
	/*
	 * Don't worry about status -- we assume any error handling
	 * is performed by the caller (idm_sotx_thread).
	 */
	idb->idb_in_transport = B_FALSE;
	idm_task_rele(idt);
	idm_buf_free(idb);
}
2550 
2551 static idm_status_t
2552 idm_so_send_buf_region(idm_task_t *idt, idm_buf_t *idb,
2553     uint32_t buf_region_offset, uint32_t buf_region_length)
2554 {
2555 	idm_conn_t		*ic;
2556 	uint32_t		max_dataseglen;
2557 	size_t			remainder, chunk;
2558 	uint32_t		data_offset = buf_region_offset;
2559 	iscsi_data_hdr_t	*bhs;
2560 	idm_pdu_t		*pdu;
2561 	idm_status_t		tx_status;
2562 
2563 	ASSERT(mutex_owned(&idt->idt_mutex));
2564 
2565 	ic = idt->idt_ic;
2566 
2567 	max_dataseglen = ic->ic_conn_params.max_xmit_dataseglen;
2568 	remainder = buf_region_length;
2569 
2570 	while (remainder) {
2571 		if (idt->idt_state != TASK_ACTIVE) {
2572 			ASSERT((idt->idt_state != TASK_IDLE) &&
2573 			    (idt->idt_state != TASK_COMPLETE));
2574 			return (IDM_STATUS_ABORTED);
2575 		}
2576 
2577 		/* check to see if we need to chunk the data */
2578 		if (remainder > max_dataseglen) {
2579 			chunk = max_dataseglen;
2580 		} else {
2581 			chunk = remainder;
2582 		}
2583 
2584 		/* Data PDU headers will always be sizeof (iscsi_hdr_t) */
2585 		pdu = kmem_cache_alloc(idm.idm_sotx_pdu_cache, KM_SLEEP);
2586 		pdu->isp_ic = ic;
2587 
2588 		/*
2589 		 * We've already built a build a header template
2590 		 * to use during the transfer.  Use this template so that
2591 		 * the SN values stay consistent with any unrelated PDU's
2592 		 * being transmitted.
2593 		 */
2594 		bcopy(&idb->idb_data_hdr_tmpl, pdu->isp_hdr,
2595 		    sizeof (iscsi_hdr_t));
2596 
2597 		/*
2598 		 * Set DataSN, data offset, and flags in BHS
2599 		 * For the prototype build, A = 0, S = 0, U = 0
2600 		 */
2601 		bhs = (iscsi_data_hdr_t *)(pdu->isp_hdr);
2602 
2603 		bhs->datasn		= htonl(idt->idt_exp_datasn++);
2604 
2605 		hton24(bhs->dlength, chunk);
2606 		bhs->offset = htonl(idb->idb_bufoffset + data_offset);
2607 
2608 		if (chunk == remainder) {
2609 			bhs->flags = ISCSI_FLAG_FINAL; /* F bit set to 1 */
2610 		}
2611 
2612 		/* Instrument the data-send DTrace probe. */
2613 		if (IDM_PDU_OPCODE(pdu) == ISCSI_OP_SCSI_DATA_RSP) {
2614 			DTRACE_ISCSI_2(data__send,
2615 			    idm_conn_t *, idt->idt_ic,
2616 			    iscsi_data_rsp_hdr_t *,
2617 			    (iscsi_data_rsp_hdr_t *)pdu->isp_hdr);
2618 		}
2619 		/* setup data */
2620 		pdu->isp_data	=  (uint8_t *)idb->idb_buf + data_offset;
2621 		pdu->isp_datalen = (uint_t)chunk;
2622 		remainder	-= chunk;
2623 		data_offset	+= chunk;
2624 
2625 		/*
2626 		 * Now that we're done working with idt_exp_datasn,
2627 		 * idt->idt_state and idb->idb_bufoffset we can release
2628 		 * the task lock -- don't want to hold it across the
2629 		 * call to idm_i_so_tx since we could block.
2630 		 */
2631 		mutex_exit(&idt->idt_mutex);
2632 
2633 		/*
2634 		 * Transmit the PDU.  Call the internal routine directly
2635 		 * as there is already implicit ordering.
2636 		 */
2637 		if ((tx_status = idm_i_so_tx(pdu)) != IDM_STATUS_SUCCESS) {
2638 			mutex_enter(&idt->idt_mutex);
2639 			return (tx_status);
2640 		}
2641 
2642 		mutex_enter(&idt->idt_mutex);
2643 		idt->idt_tx_bytes += chunk;
2644 	}
2645 
2646 	return (IDM_STATUS_SUCCESS);
2647 }
2648 
2649 /*
2650  * TX PDU cache
2651  */
2652 /* ARGSUSED */
2653 int
2654 idm_sotx_pdu_constructor(void *hdl, void *arg, int flags)
2655 {
2656 	idm_pdu_t	*pdu = hdl;
2657 
2658 	bzero(pdu, sizeof (idm_pdu_t));
2659 	pdu->isp_hdr = (iscsi_hdr_t *)(pdu + 1); /* Ptr arithmetic */
2660 	pdu->isp_hdrlen = sizeof (iscsi_hdr_t);
2661 	pdu->isp_callback = idm_sotx_cache_pdu_cb;
2662 	pdu->isp_magic = IDM_PDU_MAGIC;
2663 	bzero(pdu->isp_hdr, sizeof (iscsi_hdr_t));
2664 
2665 	return (0);
2666 }
2667 
/*
 * idm_sotx_cache_pdu_cb
 *
 * Completion callback for PDUs taken from idm_sotx_pdu_cache (installed
 * by idm_sotx_pdu_constructor).  Clears isp_datalen and returns the PDU
 * to the cache.  'status' is not used.
 */
/* ARGSUSED */
void
idm_sotx_cache_pdu_cb(idm_pdu_t *pdu, idm_status_t status)
{
	/* reset values between use */
	pdu->isp_datalen = 0;

	kmem_cache_free(idm.idm_sotx_pdu_cache, pdu);
}
2677 
2678 /*
2679  * RX PDU cache
2680  */
2681 /* ARGSUSED */
2682 int
2683 idm_sorx_pdu_constructor(void *hdl, void *arg, int flags)
2684 {
2685 	idm_pdu_t	*pdu = hdl;
2686 
2687 	bzero(pdu, sizeof (idm_pdu_t));
2688 	pdu->isp_magic = IDM_PDU_MAGIC;
2689 	pdu->isp_hdr = (iscsi_hdr_t *)(pdu + 1); /* Ptr arithmetic */
2690 	pdu->isp_callback = idm_sorx_cache_pdu_cb;
2691 
2692 	return (0);
2693 }
2694 
/*
 * idm_sorx_cache_pdu_cb
 *
 * Completion callback for PDUs taken from idm_sorx_pdu_cache (installed
 * by idm_sorx_pdu_constructor).  Clears isp_iovlen and isp_sorx_buf and
 * returns the PDU to the cache.  'status' is not used.
 */
/* ARGSUSED */
static void
idm_sorx_cache_pdu_cb(idm_pdu_t *pdu, idm_status_t status)
{
	/* Reset the per-receive fields before the PDU goes back to the cache */
	pdu->isp_iovlen = 0;
	pdu->isp_sorx_buf = 0;
	kmem_cache_free(idm.idm_sorx_pdu_cache, pdu);
}
2703 
2704 static void
2705 idm_sorx_addl_pdu_cb(idm_pdu_t *pdu, idm_status_t status)
2706 {
2707 	/*
2708 	 * We had to modify our cached RX PDU with a longer header buffer
2709 	 * and/or a longer data buffer.  Release the new buffers and fix
2710 	 * the fields back to what we would expect for a cached RX PDU.
2711 	 */
2712 	if (pdu->isp_flags & IDM_PDU_ADDL_HDR) {
2713 		kmem_free(pdu->isp_hdr, pdu->isp_hdrlen);
2714 	}
2715 	if (pdu->isp_flags & IDM_PDU_ADDL_DATA) {
2716 		kmem_free(pdu->isp_data, pdu->isp_datalen);
2717 	}
2718 	pdu->isp_hdr = (iscsi_hdr_t *)(pdu + 1);
2719 	pdu->isp_hdrlen = sizeof (iscsi_hdr_t);
2720 	pdu->isp_data = NULL;
2721 	pdu->isp_datalen = 0;
2722 	pdu->isp_sorx_buf = 0;
2723 	pdu->isp_callback = idm_sorx_cache_pdu_cb;
2724 	idm_sorx_cache_pdu_cb(pdu, status);
2725 }
2726 
/*
 * This thread is only active when I/O is queued for transmit
 * because the socket is busy.
 *
 * idm_sotx_thread
 *
 * Transmit thread for a sockets-transport connection.  'arg' is the
 * idm_conn_t serviced by this thread.
 *
 * The thread takes a hold on the connection (released just before
 * thread_exit()), records its running state and thread ID in the
 * idm_so_conn_t and signals ic_cv.  It then services ic_tx_list one
 * object at a time, sleeping on ic_tx_cv while the list is empty.  Queued
 * objects are either PDUs (IDM_PDU_MAGIC), sent with idm_i_so_tx(), or
 * buffers (IDM_BUF_MAGIC), sent with idm_so_send_buf_region().
 *
 * The loop ends when ic_tx_thread_running is found clear; a transmit
 * failure clears it here and generates CE_TRANSPORT_FAIL.  Before
 * exiting, everything still on the TX list is completed with
 * IDM_STATUS_ABORTED (PDUs, target buffers) or released (initiator
 * buffers).
 */
void
idm_sotx_thread(void *arg)
{
	idm_conn_t	*ic = arg;
	idm_tx_obj_t	*object, *next;
	idm_so_conn_t	*so_conn;
	idm_status_t	status = IDM_STATUS_SUCCESS;

	idm_conn_hold(ic);

	/* Publish our state and thread ID, then wake anyone waiting on ic_cv */
	mutex_enter(&ic->ic_mutex);
	so_conn = ic->ic_transport_private;
	so_conn->ic_tx_thread_running = B_TRUE;
	so_conn->ic_tx_thread_did = so_conn->ic_tx_thread->t_did;
	cv_signal(&ic->ic_cv);
	mutex_exit(&ic->ic_mutex);

	/*
	 * ic_tx_mutex is held whenever the loop condition is evaluated and
	 * is dropped while an object is being transmitted.
	 */
	mutex_enter(&so_conn->ic_tx_mutex);

	while (so_conn->ic_tx_thread_running) {
		while (list_is_empty(&so_conn->ic_tx_list)) {
			DTRACE_PROBE1(soconn__tx__sleep, idm_conn_t *, ic);
			cv_wait(&so_conn->ic_tx_cv, &so_conn->ic_tx_mutex);
			DTRACE_PROBE1(soconn__tx__wakeup, idm_conn_t *, ic);

			/* Woken up in order to shut down, not to transmit */
			if (!so_conn->ic_tx_thread_running) {
				goto tx_bail;
			}
		}

		object = (idm_tx_obj_t *)list_head(&so_conn->ic_tx_list);
		list_remove(&so_conn->ic_tx_list, object);
		mutex_exit(&so_conn->ic_tx_mutex);

		/* The magic value says what kind of object was queued */
		switch (object->idm_tx_obj_magic) {
		case IDM_PDU_MAGIC:
			DTRACE_PROBE2(soconn__tx__pdu, idm_conn_t *, ic,
			    idm_pdu_t *, (idm_pdu_t *)object);

			status = idm_i_so_tx((idm_pdu_t *)object);
			break;

		case IDM_BUF_MAGIC: {
			idm_buf_t *idb = (idm_buf_t *)object;
			idm_task_t *idt = idb->idb_task_binding;

			DTRACE_PROBE2(soconn__tx__buf, idm_conn_t *, ic,
			    idm_buf_t *, idb);

			/* idm_so_send_buf_region requires idt_mutex held */
			mutex_enter(&idt->idt_mutex);
			status = idm_so_send_buf_region(idt,
			    idb, 0, idb->idb_xfer_len);

			/*
			 * TX thread owns the buffer so we expect it to
			 * be "in transport"
			 */
			ASSERT(idb->idb_in_transport);
			if (IDM_CONN_ISTGT(ic)) {
				/*
				 * idm_buf_tx_to_ini_done releases
				 * idt->idt_mutex
				 */
				DTRACE_ISCSI_8(xfer__done,
				    idm_conn_t *, idt->idt_ic,
				    uintptr_t, idb->idb_buf,
				    uint32_t, idb->idb_bufoffset,
				    uint64_t, 0, uint32_t, 0, uint32_t, 0,
				    uint32_t, idb->idb_xfer_len,
				    int, XFER_BUF_TX_TO_INI);
				idm_buf_tx_to_ini_done(idt, idb, status);
			} else {
				/* Initiator side: release the RTT data buffer */
				idm_so_send_rtt_data_done(idt, idb);
				mutex_exit(&idt->idt_mutex);
			}
			break;
		}

		default:
			IDM_CONN_LOG(CE_WARN, "idm_sotx_thread: Unknown magic "
			    "(0x%08x)", object->idm_tx_obj_magic);
			status = IDM_STATUS_FAIL;
		}

		mutex_enter(&so_conn->ic_tx_mutex);

		/* Any transmit failure stops this thread and fails the conn */
		if (status != IDM_STATUS_SUCCESS) {
			so_conn->ic_tx_thread_running = B_FALSE;
			idm_conn_event(ic, CE_TRANSPORT_FAIL, status);
		}
	}

	/*
	 * Before we leave, we need to abort every item remaining in the
	 * TX list.
	 */

	/* ic_tx_mutex is held on both paths that reach tx_bail */
tx_bail:
	object = (idm_tx_obj_t *)list_head(&so_conn->ic_tx_list);

	while (object != NULL) {
		/* Fetch the successor before this object leaves the list */
		next = list_next(&so_conn->ic_tx_list, object);

		list_remove(&so_conn->ic_tx_list, object);
		switch (object->idm_tx_obj_magic) {
		case IDM_PDU_MAGIC:
			idm_pdu_complete((idm_pdu_t *)object,
			    IDM_STATUS_ABORTED);
			break;

		case IDM_BUF_MAGIC: {
			idm_buf_t *idb = (idm_buf_t *)object;
			idm_task_t *idt = idb->idb_task_binding;
			/*
			 * ic_tx_mutex is dropped while the buffer is
			 * completed under idt_mutex and is re-taken
			 * afterwards.
			 */
			mutex_exit(&so_conn->ic_tx_mutex);
			mutex_enter(&idt->idt_mutex);
			/*
			 * TX thread owns the buffer so we expect it to
			 * be "in transport"
			 */
			ASSERT(idb->idb_in_transport);
			if (IDM_CONN_ISTGT(ic)) {
				/*
				 * idm_buf_tx_to_ini_done releases
				 * idt->idt_mutex
				 */
				DTRACE_ISCSI_8(xfer__done,
				    idm_conn_t *, idt->idt_ic,
				    uintptr_t, idb->idb_buf,
				    uint32_t, idb->idb_bufoffset,
				    uint64_t, 0, uint32_t, 0, uint32_t, 0,
				    uint32_t, idb->idb_xfer_len,
				    int, XFER_BUF_TX_TO_INI);
				idm_buf_tx_to_ini_done(idt, idb,
				    IDM_STATUS_ABORTED);
			} else {
				idm_so_send_rtt_data_done(idt, idb);
				mutex_exit(&idt->idt_mutex);
			}
			mutex_enter(&so_conn->ic_tx_mutex);
			break;
		}
		default:
			IDM_CONN_LOG(CE_WARN,
			    "idm_sotx_thread: Unexpected magic "
			    "(0x%08x)", object->idm_tx_obj_magic);
		}

		object = next;
	}

	mutex_exit(&so_conn->ic_tx_mutex);
	idm_conn_rele(ic);
	thread_exit();
	/*NOTREACHED*/
}
2886 
2887 static void
2888 idm_so_socket_set_nonblock(struct sonode *node)
2889 {
2890 	(void) VOP_SETFL(node->so_vnode, node->so_flag,
2891 	    (node->so_state | FNONBLOCK), CRED(), NULL);
2892 }
2893 
2894 static void
2895 idm_so_socket_set_block(struct sonode *node)
2896 {
2897 	(void) VOP_SETFL(node->so_vnode, node->so_flag,
2898 	    (node->so_state & (~FNONBLOCK)), CRED(), NULL);
2899 }
2900 
2901 
2902 /*
2903  * Called by kernel sockets when the connection has been accepted or
2904  * rejected. In early volo, a "disconnect" callback was sent instead of
2905  * "connectfailed", so we check for both.
2906  */
2907 /* ARGSUSED */
2908 void
2909 idm_so_timed_socket_connect_cb(ksocket_t ks,
2910     ksocket_callback_event_t ev, void *arg, uintptr_t info)
2911 {
2912 	idm_so_timed_socket_t	*itp = arg;
2913 	ASSERT(itp != NULL);
2914 	ASSERT(ev == KSOCKET_EV_CONNECTED ||
2915 	    ev == KSOCKET_EV_CONNECTFAILED ||
2916 	    ev == KSOCKET_EV_DISCONNECTED);
2917 
2918 	mutex_enter(&idm_so_timed_socket_mutex);
2919 	itp->it_callback_called = B_TRUE;
2920 	if (ev == KSOCKET_EV_CONNECTED) {
2921 		itp->it_socket_error_code = 0;
2922 	} else {
2923 		/* Make sure the error code is non-zero on error */
2924 		if (info == 0)
2925 			info = ECONNRESET;
2926 		itp->it_socket_error_code = (int)info;
2927 	}
2928 	cv_signal(&itp->it_cv);
2929 	mutex_exit(&idm_so_timed_socket_mutex);
2930 }
2931 
2932 int
2933 idm_so_timed_socket_connect(ksocket_t ks,
2934     struct sockaddr_storage *sa, int sa_sz, int login_max_usec)
2935 {
2936 	clock_t			conn_login_max;
2937 	int			rc, nonblocking, rval;
2938 	idm_so_timed_socket_t	it;
2939 	ksocket_callbacks_t	ks_cb;
2940 
2941 	conn_login_max = ddi_get_lbolt() + drv_usectohz(login_max_usec);
2942 
2943 	/*
2944 	 * Set to non-block socket mode, with callback on connect
2945 	 * Early volo used "disconnected" instead of "connectfailed",
2946 	 * so set callback to look for both.
2947 	 */
2948 	bzero(&it, sizeof (it));
2949 	ks_cb.ksock_cb_flags = KSOCKET_CB_CONNECTED |
2950 	    KSOCKET_CB_CONNECTFAILED | KSOCKET_CB_DISCONNECTED;
2951 	ks_cb.ksock_cb_connected = idm_so_timed_socket_connect_cb;
2952 	ks_cb.ksock_cb_connectfailed = idm_so_timed_socket_connect_cb;
2953 	ks_cb.ksock_cb_disconnected = idm_so_timed_socket_connect_cb;
2954 	cv_init(&it.it_cv, NULL, CV_DEFAULT, NULL);
2955 	rc = ksocket_setcallbacks(ks, &ks_cb, &it, CRED());
2956 	if (rc != 0)
2957 		return (rc);
2958 
2959 	/* Set to non-blocking mode */
2960 	nonblocking = 1;
2961 	rc = ksocket_ioctl(ks, FIONBIO, (intptr_t)&nonblocking, &rval,
2962 	    CRED());
2963 	if (rc != 0)
2964 		goto cleanup;
2965 
2966 	bzero(&it, sizeof (it));
2967 	for (;;) {
2968 		/*
2969 		 * Warning -- in a loopback scenario, the call to
2970 		 * the connect_cb can occur inside the call to
2971 		 * ksocket_connect. Do not hold the mutex around the
2972 		 * call to ksocket_connect.
2973 		 */
2974 		rc = ksocket_connect(ks, (struct sockaddr *)sa, sa_sz, CRED());
2975 		if (rc == 0 || rc == EISCONN) {
2976 			/* socket success or already success */
2977 			rc = 0;
2978 			break;
2979 		}
2980 		if ((rc != EINPROGRESS) && (rc != EALREADY)) {
2981 			break;
2982 		}
2983 
2984 		/* TCP connect still in progress. See if out of time. */
2985 		if (ddi_get_lbolt() > conn_login_max) {
2986 			/*
2987 			 * Connection retry timeout,
2988 			 * failed connect to target.
2989 			 */
2990 			rc = ETIMEDOUT;
2991 			break;
2992 		}
2993 
2994 		/*
2995 		 * TCP connect still in progress.  Sleep until callback.
2996 		 * Do NOT go to sleep if the callback already occurred!
2997 		 */
2998 		mutex_enter(&idm_so_timed_socket_mutex);
2999 		if (!it.it_callback_called) {
3000 			(void) cv_timedwait(&it.it_cv,
3001 			    &idm_so_timed_socket_mutex, conn_login_max);
3002 		}
3003 		if (it.it_callback_called) {
3004 			rc = it.it_socket_error_code;
3005 			mutex_exit(&idm_so_timed_socket_mutex);
3006 			break;
3007 		}
3008 		/* If timer expires, go call ksocket_connect one last time. */
3009 		mutex_exit(&idm_so_timed_socket_mutex);
3010 	}
3011 
3012 	/* resume blocking mode */
3013 	nonblocking = 0;
3014 	(void)  ksocket_ioctl(ks, FIONBIO, (intptr_t)&nonblocking, &rval,
3015 	    CRED());
3016 cleanup:
3017 	ksocket_setcallbacks(ks, NULL, NULL, CRED());
3018 	cv_destroy(&it.it_cv);
3019 	if (rc != 0) {
3020 		idm_soshutdown(ks);
3021 	}
3022 	return (rc);
3023 }
3024 
3025 
3026 void
3027 idm_addr_to_sa(idm_addr_t *dportal, struct sockaddr_storage *sa)
3028 {
3029 	int			dp_addr_size;
3030 	struct sockaddr_in	*sin;
3031 	struct sockaddr_in6	*sin6;
3032 
3033 	/* Build sockaddr_storage for this portal (idm_addr_t) */
3034 	bzero(sa, sizeof (*sa));
3035 	dp_addr_size = dportal->a_addr.i_insize;
3036 	if (dp_addr_size == sizeof (struct in_addr)) {
3037 		/* IPv4 */
3038 		sa->ss_family = AF_INET;
3039 		sin = (struct sockaddr_in *)sa;
3040 		sin->sin_port = htons(dportal->a_port);
3041 		bcopy(&dportal->a_addr.i_addr.in4,
3042 		    &sin->sin_addr, sizeof (struct in_addr));
3043 	} else if (dp_addr_size == sizeof (struct in6_addr)) {
3044 		/* IPv6 */
3045 		sa->ss_family = AF_INET6;
3046 		sin6 = (struct sockaddr_in6 *)sa;
3047 		sin6->sin6_port = htons(dportal->a_port);
3048 		bcopy(&dportal->a_addr.i_addr.in6,
3049 		    &sin6->sin6_addr, sizeof (struct in6_addr));
3050 	} else {
3051 		ASSERT(0);
3052 	}
3053 }
3054 
3055 
/*
 * Render a sockaddr_storage as text for use in log messages.
 *
 * AF_INET and AF_INET6 addresses are written to buf in the form
 * "[ip-address].port" (port in host byte order, decimal).  If the
 * address family is anything else, if inet_ntop() fails, or if buf
 * (of 'size' bytes) is too small to hold the longest possible port
 * suffix, the placeholder "[0].-1" is written instead.
 *
 * buf is always the return value.  Because the result lives in the
 * caller's buffer, each idm_sa_ntop() call appearing in the same
 * logging invocation must be given its own buf.
 */
const char *
idm_sa_ntop(const struct sockaddr_storage *sa,
    char *buf, size_t size)
{
	static const char bogus_ip[] = "[0].-1";
	char addrstr[INET6_ADDRSTRLEN];
	const void *rawaddr = NULL;
	unsigned int port = 0;
	int af = 0;

	/* Pick out the family-specific address and port fields. */
	if (sa->ss_family == AF_INET6) {
		const struct sockaddr_in6 *v6 =
		    (const struct sockaddr_in6 *)sa;

		af = v6->sin6_family;
		rawaddr = &v6->sin6_addr;
		port = ntohs(v6->sin6_port);
	} else if (sa->ss_family == AF_INET) {
		const struct sockaddr_in *v4 =
		    (const struct sockaddr_in *)sa;

		af = v4->sin_family;
		rawaddr = &v4->sin_addr;
		port = ntohs(v4->sin_port);
	}

	/*
	 * Format only when the family was recognized, the address
	 * converted, and the buffer has room for the address plus the
	 * worst-case "[].65535" decoration (sizeof counts the NUL).
	 */
	if (rawaddr != NULL &&
	    inet_ntop(af, rawaddr, addrstr, sizeof (addrstr)) != NULL &&
	    strlen(addrstr) + sizeof ("[].65535") <= size) {
		(void) snprintf(buf, size, "[%s].%u", addrstr, port);
		return (buf);
	}

	(void) snprintf(buf, size, "%s", bogus_ip);
	return (buf);
}
3110