xref: /titanic_44/usr/src/uts/common/io/idm/idm_so.c (revision 8956713aded83a741173fcd4f9ef1c83521fbea9)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #include <sys/conf.h>
27 #include <sys/stat.h>
28 #include <sys/file.h>
29 #include <sys/ddi.h>
30 #include <sys/sunddi.h>
31 #include <sys/modctl.h>
32 #include <sys/priv.h>
33 #include <sys/cpuvar.h>
34 #include <sys/socket.h>
35 #include <sys/strsubr.h>
36 #include <sys/sysmacros.h>
37 #include <sys/sdt.h>
38 #include <netinet/tcp.h>
39 #include <inet/tcp.h>
40 #include <sys/socketvar.h>
41 #include <sys/pathname.h>
42 #include <sys/fs/snode.h>
43 #include <sys/fs/dv_node.h>
44 #include <sys/vnode.h>
45 #include <netinet/in.h>
46 #include <net/if.h>
47 #include <sys/sockio.h>
48 #include <sys/ksocket.h>
49 #include <sys/idm/idm.h>
50 #include <sys/idm/idm_so.h>
51 #include <sys/idm/idm_text.h>
52 
53 /*
54  * in6addr_any is currently all zeroes, but use the macro in case this
55  * ever changes.
56  */
57 static const struct in6_addr in6addr_any = IN6ADDR_ANY_INIT;
58 
59 static void idm_sorx_cache_pdu_cb(idm_pdu_t *pdu, idm_status_t status);
60 static void idm_sorx_addl_pdu_cb(idm_pdu_t *pdu, idm_status_t status);
61 static void idm_sotx_cache_pdu_cb(idm_pdu_t *pdu, idm_status_t status);
62 
63 static idm_status_t idm_so_conn_create_common(idm_conn_t *ic, ksocket_t new_so);
64 static void idm_so_conn_destroy_common(idm_conn_t *ic);
65 static void idm_so_conn_connect_common(idm_conn_t *ic);
66 
67 static void idm_set_ini_preconnect_options(idm_so_conn_t *sc);
68 static void idm_set_ini_postconnect_options(idm_so_conn_t *sc);
69 static void idm_set_tgt_connect_options(ksocket_t so);
70 static idm_status_t idm_i_so_tx(idm_pdu_t *pdu);
71 
72 static idm_status_t idm_sorecvdata(idm_conn_t *ic, idm_pdu_t *pdu);
73 static void idm_so_send_rtt_data(idm_conn_t *ic, idm_task_t *idt,
74     idm_buf_t *idb, uint32_t offset, uint32_t length);
75 static void idm_so_send_rtt_data_done(idm_task_t *idt, idm_buf_t *idb);
76 static idm_status_t idm_so_send_buf_region(idm_task_t *idt,
77     idm_buf_t *idb, uint32_t buf_region_offset, uint32_t buf_region_length);
78 
79 static uint32_t idm_fill_iov(idm_pdu_t *pdu, idm_buf_t *idb,
80     uint32_t ro, uint32_t dlength);
81 
82 static idm_status_t idm_so_handle_digest(idm_conn_t *it,
83     nvpair_t *digest_choice, const idm_kv_xlate_t *ikvx);
84 
85 /*
86  * Transport ops prototypes
87  */
88 static void idm_so_tx(idm_conn_t *ic, idm_pdu_t *pdu);
89 static idm_status_t idm_so_buf_tx_to_ini(idm_task_t *idt, idm_buf_t *idb);
90 static idm_status_t idm_so_buf_rx_from_ini(idm_task_t *idt, idm_buf_t *idb);
91 static void idm_so_rx_datain(idm_conn_t *ic, idm_pdu_t *pdu);
92 static void idm_so_rx_rtt(idm_conn_t *ic, idm_pdu_t *pdu);
93 static void idm_so_rx_dataout(idm_conn_t *ic, idm_pdu_t *pdu);
94 static idm_status_t idm_so_free_task_rsrc(idm_task_t *idt);
95 static kv_status_t idm_so_negotiate_key_values(idm_conn_t *it,
96     nvlist_t *request_nvl, nvlist_t *response_nvl, nvlist_t *negotiated_nvl);
97 static void idm_so_notice_key_values(idm_conn_t *it,
98     nvlist_t *negotiated_nvl);
99 static boolean_t idm_so_conn_is_capable(idm_conn_req_t *ic,
100     idm_transport_caps_t *caps);
101 static idm_status_t idm_so_buf_alloc(idm_buf_t *idb, uint64_t buflen);
102 static void idm_so_buf_free(idm_buf_t *idb);
103 static idm_status_t idm_so_buf_setup(idm_buf_t *idb);
104 static void idm_so_buf_teardown(idm_buf_t *idb);
105 static idm_status_t idm_so_tgt_svc_create(idm_svc_req_t *sr, idm_svc_t *is);
106 static void idm_so_tgt_svc_destroy(idm_svc_t *is);
107 static idm_status_t idm_so_tgt_svc_online(idm_svc_t *is);
108 static void idm_so_tgt_svc_offline(idm_svc_t *is);
109 static void idm_so_tgt_conn_destroy(idm_conn_t *ic);
110 static idm_status_t idm_so_tgt_conn_connect(idm_conn_t *ic);
111 static void idm_so_conn_disconnect(idm_conn_t *ic);
112 static idm_status_t idm_so_ini_conn_create(idm_conn_req_t *cr, idm_conn_t *ic);
113 static void idm_so_ini_conn_destroy(idm_conn_t *ic);
114 static idm_status_t idm_so_ini_conn_connect(idm_conn_t *ic);
115 
116 /*
117  * IDM Native Sockets transport operations
118  */
119 static
120 idm_transport_ops_t idm_so_transport_ops = {
121 	idm_so_tx,			/* it_tx_pdu */
122 	idm_so_buf_tx_to_ini,		/* it_buf_tx_to_ini */
123 	idm_so_buf_rx_from_ini,		/* it_buf_rx_from_ini */
124 	idm_so_rx_datain,		/* it_rx_datain */
125 	idm_so_rx_rtt,			/* it_rx_rtt */
126 	idm_so_rx_dataout,		/* it_rx_dataout */
127 	NULL,				/* it_alloc_conn_rsrc */
128 	NULL,				/* it_free_conn_rsrc */
129 	NULL,				/* it_tgt_enable_datamover */
130 	NULL,				/* it_ini_enable_datamover */
131 	NULL,				/* it_conn_terminate */
132 	idm_so_free_task_rsrc,		/* it_free_task_rsrc */
133 	idm_so_negotiate_key_values,	/* it_negotiate_key_values */
134 	idm_so_notice_key_values,	/* it_notice_key_values */
135 	idm_so_conn_is_capable,		/* it_conn_is_capable */
136 	idm_so_buf_alloc,		/* it_buf_alloc */
137 	idm_so_buf_free,		/* it_buf_free */
138 	idm_so_buf_setup,		/* it_buf_setup */
139 	idm_so_buf_teardown,		/* it_buf_teardown */
140 	idm_so_tgt_svc_create,		/* it_tgt_svc_create */
141 	idm_so_tgt_svc_destroy,		/* it_tgt_svc_destroy */
142 	idm_so_tgt_svc_online,		/* it_tgt_svc_online */
143 	idm_so_tgt_svc_offline,		/* it_tgt_svc_offline */
144 	idm_so_tgt_conn_destroy,	/* it_tgt_conn_destroy */
145 	idm_so_tgt_conn_connect,	/* it_tgt_conn_connect */
146 	idm_so_conn_disconnect,		/* it_tgt_conn_disconnect */
147 	idm_so_ini_conn_create,		/* it_ini_conn_create */
148 	idm_so_ini_conn_destroy,	/* it_ini_conn_destroy */
149 	idm_so_ini_conn_connect,	/* it_ini_conn_connect */
150 	idm_so_conn_disconnect		/* it_ini_conn_disconnect */
151 };
152 
153 /*
154  * idm_so_init()
155  * Sockets transport initialization
156  */
157 void
158 idm_so_init(idm_transport_t *it)
159 {
160 	/* Cache for IDM Data and R2T Transmit PDU's */
161 	idm.idm_sotx_pdu_cache = kmem_cache_create("idm_tx_pdu_cache",
162 	    sizeof (idm_pdu_t) + sizeof (iscsi_hdr_t), 8,
163 	    &idm_sotx_pdu_constructor, NULL, NULL, NULL, NULL, KM_SLEEP);
164 
165 	/* Cache for IDM Receive PDU's */
166 	idm.idm_sorx_pdu_cache = kmem_cache_create("idm_rx_pdu_cache",
167 	    sizeof (idm_pdu_t) + IDM_SORX_CACHE_HDRLEN, 8,
168 	    &idm_sorx_pdu_constructor, NULL, NULL, NULL, NULL, KM_SLEEP);
169 
170 	/* 128k buffer cache */
171 	idm.idm_so_128k_buf_cache = kmem_cache_create("idm_128k_buf_cache",
172 	    IDM_SO_BUF_CACHE_UB, 8, NULL, NULL, NULL, NULL, NULL, KM_SLEEP);
173 
174 	/* Set the sockets transport ops */
175 	it->it_ops = &idm_so_transport_ops;
176 }
177 
178 /*
179  * idm_so_fini()
180  * Sockets transport teardown
181  */
182 void
183 idm_so_fini(void)
184 {
185 	kmem_cache_destroy(idm.idm_so_128k_buf_cache);
186 	kmem_cache_destroy(idm.idm_sotx_pdu_cache);
187 	kmem_cache_destroy(idm.idm_sorx_pdu_cache);
188 }
189 
190 ksocket_t
191 idm_socreate(int domain, int type, int protocol)
192 {
193 	ksocket_t ks;
194 
195 	if (!ksocket_socket(&ks, domain, type, protocol, KSOCKET_NOSLEEP,
196 	    CRED())) {
197 		return (ks);
198 	} else {
199 		return (NULL);
200 	}
201 }
202 
203 /*
204  * idm_soshutdown will disconnect the socket and prevent subsequent PDU
205  * reception and transmission.  The sonode still exists but its state
206  * gets modified to indicate it is no longer connected.  Calls to
207  * idm_sorecv/idm_iov_sorecv will return so idm_soshutdown can be used
208  * regain control of a thread stuck in idm_sorecv.
209  */
210 void
211 idm_soshutdown(ksocket_t so)
212 {
213 	(void) ksocket_shutdown(so, SHUT_RDWR, CRED());
214 }
215 
216 /*
217  * idm_sodestroy releases all resources associated with a socket previously
218  * created with idm_socreate.  The socket must be shutdown using
219  * idm_soshutdown before the socket is destroyed with idm_sodestroy,
220  * otherwise undefined behavior will result.
221  */
222 void
223 idm_sodestroy(ksocket_t ks)
224 {
225 	(void) ksocket_close(ks, CRED());
226 }
227 
228 /*
229  * Function to compare two addresses in sockaddr_storage format
230  */
231 
232 int
233 idm_ss_compare(const struct sockaddr_storage *cmp_ss1,
234     const struct sockaddr_storage *cmp_ss2,
235     boolean_t v4_mapped_as_v4)
236 {
237 	struct sockaddr_storage			mapped_v4_ss1, mapped_v4_ss2;
238 	const struct sockaddr_storage		*ss1, *ss2;
239 	struct in_addr				*in1, *in2;
240 	struct in6_addr				*in61, *in62;
241 	int i;
242 
243 	/*
244 	 * Normalize V4-mapped IPv6 addresses into V4 format if
245 	 * v4_mapped_as_v4 is B_TRUE.
246 	 */
247 	ss1 = cmp_ss1;
248 	ss2 = cmp_ss2;
249 	if (v4_mapped_as_v4 && (ss1->ss_family == AF_INET6)) {
250 		in61 = &((struct sockaddr_in6 *)ss1)->sin6_addr;
251 		if (IN6_IS_ADDR_V4MAPPED(in61)) {
252 			bzero(&mapped_v4_ss1, sizeof (mapped_v4_ss1));
253 			mapped_v4_ss1.ss_family = AF_INET;
254 			((struct sockaddr_in *)&mapped_v4_ss1)->sin_port =
255 			    ((struct sockaddr_in *)ss1)->sin_port;
256 			IN6_V4MAPPED_TO_INADDR(in61,
257 			    &((struct sockaddr_in *)&mapped_v4_ss1)->sin_addr);
258 			ss1 = &mapped_v4_ss1;
259 		}
260 	}
261 	ss2 = cmp_ss2;
262 	if (v4_mapped_as_v4 && (ss2->ss_family == AF_INET6)) {
263 		in62 = &((struct sockaddr_in6 *)ss2)->sin6_addr;
264 		if (IN6_IS_ADDR_V4MAPPED(in62)) {
265 			bzero(&mapped_v4_ss2, sizeof (mapped_v4_ss2));
266 			mapped_v4_ss2.ss_family = AF_INET;
267 			((struct sockaddr_in *)&mapped_v4_ss2)->sin_port =
268 			    ((struct sockaddr_in *)ss2)->sin_port;
269 			IN6_V4MAPPED_TO_INADDR(in62,
270 			    &((struct sockaddr_in *)&mapped_v4_ss2)->sin_addr);
271 			ss2 = &mapped_v4_ss2;
272 		}
273 	}
274 
275 	/*
276 	 * Compare ports, then address family, then ip address
277 	 */
278 	if (((struct sockaddr_in *)ss1)->sin_port !=
279 	    ((struct sockaddr_in *)ss2)->sin_port) {
280 		if (((struct sockaddr_in *)ss1)->sin_port >
281 		    ((struct sockaddr_in *)ss2)->sin_port)
282 			return (1);
283 		else
284 			return (-1);
285 	}
286 
287 	/*
288 	 * ports are the same
289 	 */
290 	if (ss1->ss_family != ss2->ss_family) {
291 		if (ss1->ss_family == AF_INET)
292 			return (1);
293 		else
294 			return (-1);
295 	}
296 
297 	/*
298 	 * address families are the same
299 	 */
300 	if (ss1->ss_family == AF_INET) {
301 		in1 = &((struct sockaddr_in *)ss1)->sin_addr;
302 		in2 = &((struct sockaddr_in *)ss2)->sin_addr;
303 
304 		if (in1->s_addr > in2->s_addr)
305 			return (1);
306 		else if (in1->s_addr < in2->s_addr)
307 			return (-1);
308 		else
309 			return (0);
310 	} else if (ss1->ss_family == AF_INET6) {
311 		in61 = &((struct sockaddr_in6 *)ss1)->sin6_addr;
312 		in62 = &((struct sockaddr_in6 *)ss2)->sin6_addr;
313 
314 		for (i = 0; i < 4; i++) {
315 			if (in61->s6_addr32[i] > in62->s6_addr32[i])
316 				return (1);
317 			else if (in61->s6_addr32[i] < in62->s6_addr32[i])
318 				return (-1);
319 		}
320 		return (0);
321 	}
322 
323 	return (1);
324 }
325 
326 /*
327  * IP address filter functions to flag addresses that should not
328  * go out to initiators through discovery.
329  */
330 static boolean_t
331 idm_v4_addr_okay(struct in_addr *in_addr)
332 {
333 	in_addr_t addr = ntohl(in_addr->s_addr);
334 
335 	if ((INADDR_NONE == addr) ||
336 	    (IN_MULTICAST(addr)) ||
337 	    ((addr >> IN_CLASSA_NSHIFT) == 0) ||
338 	    ((addr >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET)) {
339 		return (B_FALSE);
340 	}
341 	return (B_TRUE);
342 }
343 
344 static boolean_t
345 idm_v6_addr_okay(struct in6_addr *addr6)
346 {
347 
348 	if ((IN6_IS_ADDR_UNSPECIFIED(addr6)) ||
349 	    (IN6_IS_ADDR_LOOPBACK(addr6)) ||
350 	    (IN6_IS_ADDR_MULTICAST(addr6)) ||
351 	    (IN6_IS_ADDR_V4MAPPED(addr6)) ||
352 	    (IN6_IS_ADDR_V4COMPAT(addr6)) ||
353 	    (IN6_IS_ADDR_LINKLOCAL(addr6))) {
354 		return (B_FALSE);
355 	}
356 	return (B_TRUE);
357 }
358 
359 /*
360  * idm_get_ipaddr will retrieve a list of IP Addresses which the host is
361  * configured with by sending down a sequence of kernel ioctl to IP STREAMS.
362  */
363 int
364 idm_get_ipaddr(idm_addr_list_t **ipaddr_p)
365 {
366 	ksocket_t 		so4, so6;
367 	struct lifnum		lifn;
368 	struct lifconf		lifc;
369 	struct lifreq		*lp;
370 	int			rval;
371 	int			numifs;
372 	int			bufsize;
373 	void			*buf;
374 	int			i, j, n, rc;
375 	struct sockaddr_storage	ss;
376 	struct sockaddr_in	*sin;
377 	struct sockaddr_in6	*sin6;
378 	idm_addr_t		*ip;
379 	idm_addr_list_t		*ipaddr;
380 	int			size_ipaddr;
381 
382 	*ipaddr_p = NULL;
383 	size_ipaddr = 0;
384 	buf = NULL;
385 
386 	/* create an ipv4 and ipv6 UDP socket */
387 	if ((so6 = idm_socreate(PF_INET6, SOCK_DGRAM, 0)) == NULL)
388 		return (0);
389 	if ((so4 = idm_socreate(PF_INET, SOCK_DGRAM, 0)) == NULL) {
390 		idm_sodestroy(so6);
391 		return (0);
392 	}
393 
394 
395 retry_count:
396 	/* snapshot the current number of interfaces */
397 	lifn.lifn_family = PF_UNSPEC;
398 	lifn.lifn_flags = LIFC_NOXMIT | LIFC_TEMPORARY | LIFC_ALLZONES;
399 	lifn.lifn_count = 0;
400 	/* use vp6 for ioctls with unspecified families by default */
401 	if (ksocket_ioctl(so6, SIOCGLIFNUM, (intptr_t)&lifn, &rval, CRED())
402 	    != 0) {
403 		goto cleanup;
404 	}
405 
406 	numifs = lifn.lifn_count;
407 	if (numifs <= 0) {
408 		goto cleanup;
409 	}
410 
411 	/* allocate extra room in case more interfaces appear */
412 	numifs += 10;
413 
414 	/* get the interface names and ip addresses */
415 	bufsize = numifs * sizeof (struct lifreq);
416 	buf = kmem_alloc(bufsize, KM_SLEEP);
417 
418 	lifc.lifc_family = AF_UNSPEC;
419 	lifc.lifc_flags = LIFC_NOXMIT | LIFC_TEMPORARY | LIFC_ALLZONES;
420 	lifc.lifc_len = bufsize;
421 	lifc.lifc_buf = buf;
422 	rc = ksocket_ioctl(so6, SIOCGLIFCONF, (intptr_t)&lifc, &rval, CRED());
423 	if (rc != 0) {
424 		goto cleanup;
425 	}
426 	/* if our extra room is used up, try again */
427 	if (bufsize <= lifc.lifc_len) {
428 		kmem_free(buf, bufsize);
429 		buf = NULL;
430 		goto retry_count;
431 	}
432 	/* calc actual number of ifconfs */
433 	n = lifc.lifc_len / sizeof (struct lifreq);
434 
435 	/* get ip address */
436 	if (n > 0) {
437 		size_ipaddr = sizeof (idm_addr_list_t) +
438 		    (n - 1) * sizeof (idm_addr_t);
439 		ipaddr = kmem_zalloc(size_ipaddr, KM_SLEEP);
440 	} else {
441 		goto cleanup;
442 	}
443 
444 	/*
445 	 * Examine the array of interfaces and filter uninteresting ones
446 	 */
447 	for (i = 0, j = 0, lp = lifc.lifc_req; i < n; i++, lp++) {
448 
449 		/*
450 		 * Copy the address as the SIOCGLIFFLAGS ioctl is destructive
451 		 */
452 		ss = lp->lifr_addr;
453 		/*
454 		 * fetch the flags using the socket of the correct family
455 		 */
456 		switch (ss.ss_family) {
457 		case AF_INET:
458 			rc = ksocket_ioctl(so4, SIOCGLIFFLAGS, (intptr_t)lp,
459 			    &rval, CRED());
460 			break;
461 		case AF_INET6:
462 			rc = ksocket_ioctl(so6, SIOCGLIFFLAGS, (intptr_t)lp,
463 			    &rval, CRED());
464 			break;
465 		default:
466 			continue;
467 		}
468 		if (rc == 0) {
469 			/*
470 			 * If we got the flags, skip uninteresting
471 			 * interfaces based on flags
472 			 */
473 			if ((lp->lifr_flags & IFF_UP) != IFF_UP)
474 				continue;
475 			if (lp->lifr_flags &
476 			    (IFF_ANYCAST|IFF_NOLOCAL|IFF_DEPRECATED))
477 				continue;
478 		}
479 
480 		/* save ip address */
481 		ip = &ipaddr->al_addrs[j];
482 		switch (ss.ss_family) {
483 		case AF_INET:
484 			sin = (struct sockaddr_in *)&ss;
485 			if (!idm_v4_addr_okay(&sin->sin_addr))
486 				continue;
487 			ip->a_addr.i_addr.in4 = sin->sin_addr;
488 			ip->a_addr.i_insize = sizeof (struct in_addr);
489 			break;
490 		case AF_INET6:
491 			sin6 = (struct sockaddr_in6 *)&ss;
492 			if (!idm_v6_addr_okay(&sin6->sin6_addr))
493 				continue;
494 			ip->a_addr.i_addr.in6 = sin6->sin6_addr;
495 			ip->a_addr.i_insize = sizeof (struct in6_addr);
496 			break;
497 		default:
498 			continue;
499 		}
500 		j++;
501 	}
502 
503 	if (j == 0) {
504 		/* no valid ifaddr */
505 		kmem_free(ipaddr, size_ipaddr);
506 		size_ipaddr = 0;
507 		ipaddr = NULL;
508 	} else {
509 		ipaddr->al_out_cnt = j;
510 	}
511 
512 
513 cleanup:
514 	idm_sodestroy(so6);
515 	idm_sodestroy(so4);
516 
517 	if (buf != NULL)
518 		kmem_free(buf, bufsize);
519 
520 	*ipaddr_p = ipaddr;
521 	return (size_ipaddr);
522 }
523 
524 int
525 idm_sorecv(ksocket_t so, void *msg, size_t len)
526 {
527 	iovec_t iov;
528 
529 	ASSERT(so != NULL);
530 	ASSERT(len != 0);
531 
532 	/*
533 	 * Fill in iovec and receive data
534 	 */
535 	iov.iov_base = msg;
536 	iov.iov_len = len;
537 
538 	return (idm_iov_sorecv(so, &iov, 1, len));
539 }
540 
541 /*
542  * idm_sosendto - Sends a buffered data on a non-connected socket.
543  *
544  * This function puts the data provided on the wire by calling sosendmsg.
545  * It will return only when all the data has been sent or if an error
546  * occurs.
547  *
548  * Returns 0 for success, the socket errno value if sosendmsg fails, and
549  * -1 if sosendmsg returns success but uio_resid != 0
550  */
551 int
552 idm_sosendto(ksocket_t so, void *buff, size_t len,
553     struct sockaddr *name, socklen_t namelen)
554 {
555 	struct msghdr		msg;
556 	struct iovec		iov[1];
557 	int			error;
558 	size_t			sent = 0;
559 
560 	iov[0].iov_base	= buff;
561 	iov[0].iov_len	= len;
562 
563 	/* Initialization of the message header. */
564 	bzero(&msg, sizeof (msg));
565 	msg.msg_iov	= iov;
566 	msg.msg_iovlen	= 1;
567 	msg.msg_name	= name;
568 	msg.msg_namelen	= namelen;
569 
570 	if ((error = ksocket_sendmsg(so, &msg, 0, &sent, CRED())) == 0) {
571 		/* Data sent */
572 		if (sent == len) {
573 			/* All data sent.  Success. */
574 			return (0);
575 		} else {
576 			/* Not all data was sent.  Failure */
577 			return (-1);
578 		}
579 	}
580 
581 	/* Send failed */
582 	return (error);
583 }
584 
585 /*
586  * idm_iov_sosend - Sends an iovec on a connection.
587  *
588  * This function puts the data provided on the wire by calling sosendmsg.
589  * It will return only when all the data has been sent or if an error
590  * occurs.
591  *
592  * Returns 0 for success, the socket errno value if sosendmsg fails, and
593  * -1 if sosendmsg returns success but uio_resid != 0
594  */
595 int
596 idm_iov_sosend(ksocket_t so, iovec_t *iop, int iovlen, size_t total_len)
597 {
598 	struct msghdr		msg;
599 	int			error;
600 	size_t 			sent = 0;
601 
602 	ASSERT(iop != NULL);
603 
604 	/* Initialization of the message header. */
605 	bzero(&msg, sizeof (msg));
606 	msg.msg_iov	= iop;
607 	msg.msg_iovlen	= iovlen;
608 
609 	if ((error = ksocket_sendmsg(so, &msg, 0, &sent, CRED()))
610 	    == 0) {
611 		/* Data sent */
612 		if (sent == total_len) {
613 			/* All data sent.  Success. */
614 			return (0);
615 		} else {
616 			/* Not all data was sent.  Failure */
617 			return (-1);
618 		}
619 	}
620 
621 	/* Send failed */
622 	return (error);
623 }
624 
625 /*
626  * idm_iov_sorecv - Receives an iovec from a connection
627  *
628  * This function gets the data asked for from the socket.  It will return
629  * only when all the requested data has been retrieved or if an error
630  * occurs.
631  *
632  * Returns 0 for success, the socket errno value if sorecvmsg fails, and
633  * -1 if sorecvmsg returns success but uio_resid != 0
634  */
635 int
636 idm_iov_sorecv(ksocket_t so, iovec_t *iop, int iovlen, size_t total_len)
637 {
638 	struct msghdr		msg;
639 	int			error;
640 	size_t			recv;
641 	int 			flags;
642 
643 	ASSERT(iop != NULL);
644 
645 	/* Initialization of the message header. */
646 	bzero(&msg, sizeof (msg));
647 	msg.msg_iov	= iop;
648 	msg.msg_iovlen	= iovlen;
649 	flags		= MSG_WAITALL;
650 
651 	if ((error = ksocket_recvmsg(so, &msg, flags, &recv, CRED()))
652 	    == 0) {
653 		/* Received data */
654 		if (recv == total_len) {
655 			/* All requested data received.  Success */
656 			return (0);
657 		} else {
658 			/*
659 			 * Not all data was received.  The connection has
660 			 * probably failed.
661 			 */
662 			return (-1);
663 		}
664 	}
665 
666 	/* Receive failed */
667 	return (error);
668 }
669 
670 static void
671 idm_set_ini_preconnect_options(idm_so_conn_t *sc)
672 {
673 	int	conn_abort = 10000;
674 	int	conn_notify = 2000;
675 	int	abort = 30000;
676 
677 	/* Pre-connect socket options */
678 	(void) ksocket_setsockopt(sc->ic_so, IPPROTO_TCP,
679 	    TCP_CONN_NOTIFY_THRESHOLD, (char *)&conn_notify, sizeof (int),
680 	    CRED());
681 	(void) ksocket_setsockopt(sc->ic_so, IPPROTO_TCP,
682 	    TCP_CONN_ABORT_THRESHOLD, (char *)&conn_abort, sizeof (int),
683 	    CRED());
684 	(void) ksocket_setsockopt(sc->ic_so, IPPROTO_TCP, TCP_ABORT_THRESHOLD,
685 	    (char *)&abort, sizeof (int), CRED());
686 }
687 
688 static void
689 idm_set_ini_postconnect_options(idm_so_conn_t *sc)
690 {
691 	int32_t		rcvbuf = IDM_RCVBUF_SIZE;
692 	int32_t		sndbuf = IDM_SNDBUF_SIZE;
693 	const int	on = 1;
694 
695 	/* Set postconnect options */
696 	(void) ksocket_setsockopt(sc->ic_so, IPPROTO_TCP, TCP_NODELAY,
697 	    (char *)&on, sizeof (int), CRED());
698 	(void) ksocket_setsockopt(sc->ic_so, SOL_SOCKET, SO_RCVBUF,
699 	    (char *)&rcvbuf, sizeof (int), CRED());
700 	(void) ksocket_setsockopt(sc->ic_so, SOL_SOCKET, SO_SNDBUF,
701 	    (char *)&sndbuf, sizeof (int), CRED());
702 }
703 
704 static void
705 idm_set_tgt_connect_options(ksocket_t ks)
706 {
707 	int32_t		rcvbuf = IDM_RCVBUF_SIZE;
708 	int32_t		sndbuf = IDM_SNDBUF_SIZE;
709 	const int	on = 1;
710 
711 	/* Set connect options */
712 	(void) ksocket_setsockopt(ks, SOL_SOCKET, SO_RCVBUF,
713 	    (char *)&rcvbuf, sizeof (int), CRED());
714 	(void) ksocket_setsockopt(ks, SOL_SOCKET, SO_SNDBUF,
715 	    (char *)&sndbuf, sizeof (int), CRED());
716 	(void) ksocket_setsockopt(ks, IPPROTO_TCP, TCP_NODELAY,
717 	    (char *)&on, sizeof (on), CRED());
718 }
719 
/*
 * n2h24()
 *
 * Assemble a 24-bit value from three consecutive bytes, most
 * significant byte first, and return it in the low 24 bits of a
 * uint32_t.
 */
static uint32_t
n2h24(const uchar_t *ptr)
{
	uint32_t	val;

	val = ptr[0];
	val = (val << 8) | ptr[1];
	val = (val << 8) | ptr[2];

	return (val);
}
725 
726 
727 static idm_status_t
728 idm_sorecvhdr(idm_conn_t *ic, idm_pdu_t *pdu)
729 {
730 	iscsi_hdr_t	*bhs;
731 	uint32_t	hdr_digest_crc;
732 	uint32_t	crc_calculated;
733 	void		*new_hdr;
734 	int		ahslen = 0;
735 	int		total_len = 0;
736 	int		iovlen = 0;
737 	struct iovec	iov[2];
738 	idm_so_conn_t	*so_conn;
739 	int		rc;
740 
741 	so_conn = ic->ic_transport_private;
742 
743 	/*
744 	 * Read BHS
745 	 */
746 	bhs = pdu->isp_hdr;
747 	rc = idm_sorecv(so_conn->ic_so, pdu->isp_hdr, sizeof (iscsi_hdr_t));
748 	if (rc != IDM_STATUS_SUCCESS) {
749 		return (IDM_STATUS_FAIL);
750 	}
751 
752 	/*
753 	 * Check actual AHS length against the amount available in the buffer
754 	 */
755 	pdu->isp_hdrlen = sizeof (iscsi_hdr_t) +
756 	    (bhs->hlength * sizeof (uint32_t));
757 	pdu->isp_datalen = n2h24(bhs->dlength);
758 	if (bhs->hlength > IDM_SORX_CACHE_AHSLEN) {
759 		/* Allocate a new header segment and change the callback */
760 		new_hdr = kmem_alloc(pdu->isp_hdrlen, KM_SLEEP);
761 		bcopy(pdu->isp_hdr, new_hdr, sizeof (iscsi_hdr_t));
762 		pdu->isp_hdr = new_hdr;
763 		pdu->isp_flags |= IDM_PDU_ADDL_HDR;
764 
765 		/*
766 		 * This callback will restore the expected values after
767 		 * the RX PDU has been processed.
768 		 */
769 		pdu->isp_callback = idm_sorx_addl_pdu_cb;
770 	}
771 
772 	/*
773 	 * Setup receipt of additional header and header digest (if enabled).
774 	 */
775 	if (bhs->hlength > 0) {
776 		iov[iovlen].iov_base = (caddr_t)(pdu->isp_hdr + 1);
777 		ahslen = pdu->isp_hdrlen - sizeof (iscsi_hdr_t);
778 		iov[iovlen].iov_len = ahslen;
779 		total_len += iov[iovlen].iov_len;
780 		iovlen++;
781 	}
782 
783 	if (ic->ic_conn_flags & IDM_CONN_HEADER_DIGEST) {
784 		iov[iovlen].iov_base = (caddr_t)&hdr_digest_crc;
785 		iov[iovlen].iov_len = sizeof (hdr_digest_crc);
786 		total_len += iov[iovlen].iov_len;
787 		iovlen++;
788 	}
789 
790 	if ((iovlen != 0) &&
791 	    (idm_iov_sorecv(so_conn->ic_so, &iov[0], iovlen,
792 	    total_len) != 0)) {
793 		return (IDM_STATUS_FAIL);
794 	}
795 
796 	/*
797 	 * Validate header digest if enabled
798 	 */
799 	if (ic->ic_conn_flags & IDM_CONN_HEADER_DIGEST) {
800 		crc_calculated = idm_crc32c(pdu->isp_hdr,
801 		    sizeof (iscsi_hdr_t) + ahslen);
802 		if (crc_calculated != hdr_digest_crc) {
803 			/* Invalid Header Digest */
804 			return (IDM_STATUS_HEADER_DIGEST);
805 		}
806 	}
807 
808 	return (0);
809 }
810 
811 /*
812  * idm_so_ini_conn_create()
813  * Allocate the sockets transport connection resources.
814  */
815 static idm_status_t
816 idm_so_ini_conn_create(idm_conn_req_t *cr, idm_conn_t *ic)
817 {
818 	ksocket_t	so;
819 	idm_so_conn_t	*so_conn;
820 	idm_status_t	idmrc;
821 
822 	so = idm_socreate(cr->cr_domain, cr->cr_type,
823 	    cr->cr_protocol);
824 	if (so == NULL) {
825 		return (IDM_STATUS_FAIL);
826 	}
827 
828 	/* Bind the socket if configured to do so */
829 	if (cr->cr_bound) {
830 		if (ksocket_bind(so, &cr->cr_bound_addr.sin,
831 		    SIZEOF_SOCKADDR(&cr->cr_bound_addr.sin), CRED()) != 0) {
832 			idm_sodestroy(so);
833 			return (IDM_STATUS_FAIL);
834 		}
835 	}
836 
837 	idmrc = idm_so_conn_create_common(ic, so);
838 	if (idmrc != IDM_STATUS_SUCCESS) {
839 		idm_soshutdown(so);
840 		idm_sodestroy(so);
841 		return (IDM_STATUS_FAIL);
842 	}
843 
844 	so_conn = ic->ic_transport_private;
845 	/* Set up socket options */
846 	idm_set_ini_preconnect_options(so_conn);
847 
848 	return (IDM_STATUS_SUCCESS);
849 }
850 
851 /*
852  * idm_so_ini_conn_destroy()
853  * Tear down the sockets transport connection resources.
854  */
855 static void
856 idm_so_ini_conn_destroy(idm_conn_t *ic)
857 {
858 	idm_so_conn_destroy_common(ic);
859 }
860 
861 /*
862  * idm_so_ini_conn_connect()
863  * Establish the connection referred to by the handle previously allocated via
864  * idm_so_ini_conn_create().
865  */
866 static idm_status_t
867 idm_so_ini_conn_connect(idm_conn_t *ic)
868 {
869 	idm_so_conn_t	*so_conn;
870 
871 	so_conn = ic->ic_transport_private;
872 
873 	if (ksocket_connect(so_conn->ic_so, &ic->ic_ini_dst_addr.sin,
874 	    (SIZEOF_SOCKADDR(&ic->ic_ini_dst_addr.sin)), CRED()) != 0) {
875 		idm_soshutdown(so_conn->ic_so);
876 		return (IDM_STATUS_FAIL);
877 	}
878 
879 	idm_so_conn_connect_common(ic);
880 
881 	idm_set_ini_postconnect_options(so_conn);
882 
883 	return (IDM_STATUS_SUCCESS);
884 }
885 
886 idm_status_t
887 idm_so_tgt_conn_create(idm_conn_t *ic, ksocket_t new_so)
888 {
889 	idm_status_t	idmrc;
890 
891 	idmrc = idm_so_conn_create_common(ic, new_so);
892 
893 	return (idmrc);
894 }
895 
896 static void
897 idm_so_tgt_conn_destroy(idm_conn_t *ic)
898 {
899 	idm_so_conn_destroy_common(ic);
900 }
901 
902 /*
903  * idm_so_tgt_conn_connect()
904  * Establish the connection in ic, passed from idm_tgt_conn_finish(), which
905  * is invoked from the SM as a result of an inbound connection request.
906  */
907 static idm_status_t
908 idm_so_tgt_conn_connect(idm_conn_t *ic)
909 {
910 	idm_so_conn_connect_common(ic);
911 
912 	return (IDM_STATUS_SUCCESS);
913 }
914 
915 static idm_status_t
916 idm_so_conn_create_common(idm_conn_t *ic, ksocket_t new_so)
917 {
918 	idm_so_conn_t	*so_conn;
919 
920 	so_conn = kmem_zalloc(sizeof (idm_so_conn_t), KM_SLEEP);
921 	so_conn->ic_so = new_so;
922 
923 	ic->ic_transport_private = so_conn;
924 	ic->ic_transport_hdrlen = 0;
925 
926 	/* Set the scoreboarding flag on this connection */
927 	ic->ic_conn_flags |= IDM_CONN_USE_SCOREBOARD;
928 
929 	/*
930 	 * Initialize tx thread mutex and list
931 	 */
932 	mutex_init(&so_conn->ic_tx_mutex, NULL, MUTEX_DEFAULT, NULL);
933 	cv_init(&so_conn->ic_tx_cv, NULL, CV_DEFAULT, NULL);
934 	list_create(&so_conn->ic_tx_list, sizeof (idm_pdu_t),
935 	    offsetof(idm_pdu_t, idm_tx_link));
936 
937 	return (IDM_STATUS_SUCCESS);
938 }
939 
940 static void
941 idm_so_conn_destroy_common(idm_conn_t *ic)
942 {
943 	idm_so_conn_t	*so_conn = ic->ic_transport_private;
944 
945 	ic->ic_transport_private = NULL;
946 	idm_sodestroy(so_conn->ic_so);
947 	list_destroy(&so_conn->ic_tx_list);
948 	mutex_destroy(&so_conn->ic_tx_mutex);
949 	cv_destroy(&so_conn->ic_tx_cv);
950 
951 	kmem_free(so_conn, sizeof (idm_so_conn_t));
952 }
953 
954 static void
955 idm_so_conn_connect_common(idm_conn_t *ic)
956 {
957 	idm_so_conn_t	*so_conn;
958 	struct sockaddr_in6	t_addr;
959 	socklen_t	t_addrlen = 0;
960 
961 	so_conn = ic->ic_transport_private;
962 	bzero(&t_addr, sizeof (struct sockaddr_in6));
963 	t_addrlen = sizeof (struct sockaddr_in6);
964 
965 	/* Set the local and remote addresses in the idm conn handle */
966 	ksocket_getsockname(so_conn->ic_so, (struct sockaddr *)&t_addr,
967 	    &t_addrlen, CRED());
968 	bcopy(&t_addr, &ic->ic_laddr, t_addrlen);
969 	ksocket_getpeername(so_conn->ic_so, (struct sockaddr *)&t_addr,
970 	    &t_addrlen, CRED());
971 	bcopy(&t_addr, &ic->ic_raddr, t_addrlen);
972 
973 	mutex_enter(&ic->ic_mutex);
974 	so_conn->ic_tx_thread = thread_create(NULL, 0, idm_sotx_thread, ic, 0,
975 	    &p0, TS_RUN, minclsyspri);
976 	so_conn->ic_rx_thread = thread_create(NULL, 0, idm_sorx_thread, ic, 0,
977 	    &p0, TS_RUN, minclsyspri);
978 
979 	while (!so_conn->ic_rx_thread_running || !so_conn->ic_tx_thread_running)
980 		cv_wait(&ic->ic_cv, &ic->ic_mutex);
981 	mutex_exit(&ic->ic_mutex);
982 }
983 
984 /*
985  * idm_so_conn_disconnect()
986  * Shutdown the socket connection and stop the thread
987  */
988 static void
989 idm_so_conn_disconnect(idm_conn_t *ic)
990 {
991 	idm_so_conn_t	*so_conn;
992 
993 	so_conn = ic->ic_transport_private;
994 
995 	mutex_enter(&ic->ic_mutex);
996 	so_conn->ic_rx_thread_running = B_FALSE;
997 	so_conn->ic_tx_thread_running = B_FALSE;
998 	/* We need to wakeup the TX thread */
999 	mutex_enter(&so_conn->ic_tx_mutex);
1000 	cv_signal(&so_conn->ic_tx_cv);
1001 	mutex_exit(&so_conn->ic_tx_mutex);
1002 	mutex_exit(&ic->ic_mutex);
1003 
1004 	/* This should wakeup the RX thread if it is sleeping */
1005 	idm_soshutdown(so_conn->ic_so);
1006 
1007 	thread_join(so_conn->ic_tx_thread_did);
1008 	thread_join(so_conn->ic_rx_thread_did);
1009 }
1010 
1011 /*
1012  * idm_so_tgt_svc_create()
1013  * Establish a service on an IP address and port.  idm_svc_req_t contains
1014  * the service parameters.
1015  */
1016 /*ARGSUSED*/
1017 static idm_status_t
1018 idm_so_tgt_svc_create(idm_svc_req_t *sr, idm_svc_t *is)
1019 {
1020 	idm_so_svc_t		*so_svc;
1021 
1022 	so_svc = kmem_zalloc(sizeof (idm_so_svc_t), KM_SLEEP);
1023 
1024 	/* Set the new sockets service in svc handle */
1025 	is->is_so_svc = (void *)so_svc;
1026 
1027 	return (IDM_STATUS_SUCCESS);
1028 }
1029 
1030 /*
1031  * idm_so_tgt_svc_destroy()
1032  * Teardown sockets resources allocated in idm_so_tgt_svc_create()
1033  */
1034 static void
1035 idm_so_tgt_svc_destroy(idm_svc_t *is)
1036 {
1037 	/* the socket will have been torn down; free the service */
1038 	kmem_free(is->is_so_svc, sizeof (idm_so_svc_t));
1039 }
1040 
1041 /*
1042  * idm_so_tgt_svc_online()
1043  * Launch a watch thread on the svc allocated in idm_so_tgt_svc_create()
1044  */
1045 
1046 static idm_status_t
1047 idm_so_tgt_svc_online(idm_svc_t *is)
1048 {
1049 	idm_so_svc_t		*so_svc;
1050 	idm_svc_req_t		*sr = &is->is_svc_req;
1051 	struct sockaddr_in6	sin6_ip;
1052 	const uint32_t		on = 1;
1053 	const uint32_t		off = 0;
1054 
1055 	mutex_enter(&is->is_mutex);
1056 	so_svc = (idm_so_svc_t *)is->is_so_svc;
1057 
1058 	/*
1059 	 * Try creating an IPv6 socket first
1060 	 */
1061 	if ((so_svc->is_so = idm_socreate(PF_INET6, SOCK_STREAM, 0)) == NULL) {
1062 		mutex_exit(&is->is_mutex);
1063 		return (IDM_STATUS_FAIL);
1064 	} else {
1065 		bzero(&sin6_ip, sizeof (sin6_ip));
1066 		sin6_ip.sin6_family = AF_INET6;
1067 		sin6_ip.sin6_port = htons(sr->sr_port);
1068 		sin6_ip.sin6_addr = in6addr_any;
1069 
1070 		(void) ksocket_setsockopt(so_svc->is_so, SOL_SOCKET,
1071 		    SO_REUSEADDR, (char *)&on, sizeof (on), CRED());
1072 		/*
1073 		 * Turn off SO_MAC_EXEMPT so future sobinds succeed
1074 		 */
1075 		(void) ksocket_setsockopt(so_svc->is_so, SOL_SOCKET,
1076 		    SO_MAC_EXEMPT, (char *)&off, sizeof (off), CRED());
1077 
1078 		if (ksocket_bind(so_svc->is_so, (struct sockaddr *)&sin6_ip,
1079 		    sizeof (sin6_ip), CRED()) != 0) {
1080 			mutex_exit(&is->is_mutex);
1081 			idm_sodestroy(so_svc->is_so);
1082 			return (IDM_STATUS_FAIL);
1083 		}
1084 	}
1085 
1086 	idm_set_tgt_connect_options(so_svc->is_so);
1087 
1088 	if (ksocket_listen(so_svc->is_so, 5, CRED()) != 0) {
1089 		mutex_exit(&is->is_mutex);
1090 		idm_soshutdown(so_svc->is_so);
1091 		idm_sodestroy(so_svc->is_so);
1092 		return (IDM_STATUS_FAIL);
1093 	}
1094 
1095 	/* Launch a watch thread */
1096 	so_svc->is_thread = thread_create(NULL, 0, idm_so_svc_port_watcher,
1097 	    is, 0, &p0, TS_RUN, minclsyspri);
1098 
1099 	if (so_svc->is_thread == NULL) {
1100 		/* Failure to launch; teardown the socket */
1101 		mutex_exit(&is->is_mutex);
1102 		idm_soshutdown(so_svc->is_so);
1103 		idm_sodestroy(so_svc->is_so);
1104 		return (IDM_STATUS_FAIL);
1105 	}
1106 	ksocket_hold(so_svc->is_so);
1107 	/* Wait for the port watcher thread to start */
1108 	while (!so_svc->is_thread_running)
1109 		cv_wait(&is->is_cv, &is->is_mutex);
1110 	mutex_exit(&is->is_mutex);
1111 
1112 	return (IDM_STATUS_SUCCESS);
1113 }
1114 
1115 /*
1116  * idm_so_tgt_svc_offline
1117  *
1118  * Stop listening on the IP address and port identified by idm_svc_t.
1119  */
1120 static void
1121 idm_so_tgt_svc_offline(idm_svc_t *is)
1122 {
1123 	idm_so_svc_t		*so_svc;
1124 	mutex_enter(&is->is_mutex);
1125 	so_svc = (idm_so_svc_t *)is->is_so_svc;
1126 	so_svc->is_thread_running = B_FALSE;
1127 	mutex_exit(&is->is_mutex);
1128 
1129 	/*
1130 	 * Teardown socket
1131 	 */
1132 	idm_sodestroy(so_svc->is_so);
1133 
1134 	/*
1135 	 * Now we expect the port watcher thread to terminate
1136 	 */
1137 	thread_join(so_svc->is_thread_did);
1138 }
1139 
1140 /*
1141  * Watch thread for target service connection establishment.
1142  */
1143 void
1144 idm_so_svc_port_watcher(void *arg)
1145 {
1146 	idm_svc_t		*svc = arg;
1147 	ksocket_t		new_so;
1148 	idm_conn_t		*ic;
1149 	idm_status_t		idmrc;
1150 	idm_so_svc_t		*so_svc;
1151 	int			rc;
1152 	const uint32_t		off = 0;
1153 	struct sockaddr_in6 	t_addr;
1154 	socklen_t		t_addrlen;
1155 
1156 	bzero(&t_addr, sizeof (struct sockaddr_in6));
1157 	t_addrlen = sizeof (struct sockaddr_in6);
1158 	mutex_enter(&svc->is_mutex);
1159 
1160 	so_svc = svc->is_so_svc;
1161 	so_svc->is_thread_running = B_TRUE;
1162 	so_svc->is_thread_did = so_svc->is_thread->t_did;
1163 
1164 	cv_signal(&svc->is_cv);
1165 
1166 	IDM_SVC_LOG(CE_NOTE, "iSCSI service (%p/%d) online", (void *)svc,
1167 	    svc->is_svc_req.sr_port);
1168 
1169 	while (so_svc->is_thread_running) {
1170 		mutex_exit(&svc->is_mutex);
1171 
1172 		if ((rc = ksocket_accept(so_svc->is_so,
1173 		    (struct sockaddr *)&t_addr, &t_addrlen,
1174 		    &new_so, CRED())) != 0) {
1175 			mutex_enter(&svc->is_mutex);
1176 			if (rc == ECONNABORTED)
1177 				continue;
1178 			/* Connection problem */
1179 			break;
1180 		}
1181 		/*
1182 		 * Turn off SO_MAC_EXEMPT so future sobinds succeed
1183 		 */
1184 		(void) ksocket_setsockopt(new_so, SOL_SOCKET, SO_MAC_EXEMPT,
1185 		    (char *)&off, sizeof (off), CRED());
1186 
1187 		idmrc = idm_svc_conn_create(svc, IDM_TRANSPORT_TYPE_SOCKETS,
1188 		    &ic);
1189 		if (idmrc != IDM_STATUS_SUCCESS) {
1190 			/* Drop connection */
1191 			idm_soshutdown(new_so);
1192 			idm_sodestroy(new_so);
1193 			mutex_enter(&svc->is_mutex);
1194 			continue;
1195 		}
1196 
1197 		idmrc = idm_so_tgt_conn_create(ic, new_so);
1198 		if (idmrc != IDM_STATUS_SUCCESS) {
1199 			idm_svc_conn_destroy(ic);
1200 			idm_soshutdown(new_so);
1201 			idm_sodestroy(new_so);
1202 			mutex_enter(&svc->is_mutex);
1203 			continue;
1204 		}
1205 
1206 		/*
1207 		 * Kick the state machine.  At CS_S3_XPT_UP the state machine
1208 		 * will notify the client (target) about the new connection.
1209 		 */
1210 		idm_conn_event(ic, CE_CONNECT_ACCEPT, NULL);
1211 
1212 		mutex_enter(&svc->is_mutex);
1213 	}
1214 	ksocket_rele(so_svc->is_so);
1215 	so_svc->is_thread_running = B_FALSE;
1216 	mutex_exit(&svc->is_mutex);
1217 
1218 	IDM_SVC_LOG(CE_NOTE, "iSCSI service (%p/%d) offline", (void *)svc,
1219 	    svc->is_svc_req.sr_port);
1220 
1221 	thread_exit();
1222 }
1223 
1224 /*
1225  * idm_so_free_task_rsrc() stops any ongoing processing of the task and
1226  * frees resources associated with the task.
1227  *
1228  * It's not clear that this should return idm_status_t.  What do we do
1229  * if it fails?
1230  */
1231 static idm_status_t
1232 idm_so_free_task_rsrc(idm_task_t *idt)
1233 {
1234 	idm_buf_t	*idb;
1235 
1236 	/*
1237 	 * There is nothing to cleanup on initiator connections
1238 	 */
1239 	if (IDM_CONN_ISINI(idt->idt_ic))
1240 		return (IDM_STATUS_SUCCESS);
1241 
1242 	/*
1243 	 * If this is a target connection, call idm_buf_rx_from_ini_done for
1244 	 * any buffer on the "outbufv" list with idb->idb_in_transport==B_TRUE.
1245 	 *
1246 	 * In addition, remove any buffers associated with this task from
1247 	 * the ic_tx_list.  We'll do this by walking the idt_inbufv list, but
1248 	 * items don't actually get removed from that list (and completion
1249 	 * routines called) until idm_task_cleanup.
1250 	 */
1251 	mutex_enter(&idt->idt_mutex);
1252 
1253 	for (idb = list_head(&idt->idt_outbufv); idb != NULL;
1254 	    idb = list_next(&idt->idt_outbufv, idb)) {
1255 		if (idb->idb_in_transport) {
1256 			/*
1257 			 * idm_buf_rx_from_ini_done releases idt->idt_mutex
1258 			 */
1259 			DTRACE_ISCSI_8(xfer__done, idm_conn_t *, idt->idt_ic,
1260 			    uintptr_t, idb->idb_buf,
1261 			    uint32_t, idb->idb_bufoffset,
1262 			    uint64_t, 0, uint32_t, 0, uint32_t, 0,
1263 			    uint32_t, idb->idb_xfer_len,
1264 			    int, XFER_BUF_RX_FROM_INI);
1265 			idm_buf_rx_from_ini_done(idt, idb, IDM_STATUS_ABORTED);
1266 			mutex_enter(&idt->idt_mutex);
1267 		}
1268 	}
1269 
1270 	for (idb = list_head(&idt->idt_inbufv); idb != NULL;
1271 	    idb = list_next(&idt->idt_inbufv, idb)) {
1272 		/*
1273 		 * We want to remove these items from the tx_list as well,
1274 		 * but knowing it's in the idt_inbufv list is not a guarantee
1275 		 * that it's in the tx_list.  If it's on the tx list then
1276 		 * let idm_sotx_thread() clean it up.
1277 		 */
1278 		if (idb->idb_in_transport && !idb->idb_tx_thread) {
1279 			/*
1280 			 * idm_buf_tx_to_ini_done releases idt->idt_mutex
1281 			 */
1282 			DTRACE_ISCSI_8(xfer__done, idm_conn_t *, idt->idt_ic,
1283 			    uintptr_t, idb->idb_buf,
1284 			    uint32_t, idb->idb_bufoffset,
1285 			    uint64_t, 0, uint32_t, 0, uint32_t, 0,
1286 			    uint32_t, idb->idb_xfer_len,
1287 			    int, XFER_BUF_TX_TO_INI);
1288 			idm_buf_tx_to_ini_done(idt, idb, IDM_STATUS_ABORTED);
1289 			mutex_enter(&idt->idt_mutex);
1290 		}
1291 	}
1292 
1293 	mutex_exit(&idt->idt_mutex);
1294 
1295 	return (IDM_STATUS_SUCCESS);
1296 }
1297 
1298 /*
1299  * idm_so_negotiate_key_values() validates the key values for this connection
1300  */
1301 /* ARGSUSED */
1302 static kv_status_t
1303 idm_so_negotiate_key_values(idm_conn_t *it, nvlist_t *request_nvl,
1304     nvlist_t *response_nvl, nvlist_t *negotiated_nvl)
1305 {
1306 	/* All parameters are negotiated at the iscsit level */
1307 	return (KV_HANDLED);
1308 }
1309 
1310 /*
1311  * idm_so_notice_key_values() activates the negotiated key values for
1312  * this connection.
1313  */
1314 static void
1315 idm_so_notice_key_values(idm_conn_t *it, nvlist_t *negotiated_nvl)
1316 {
1317 	char			*nvp_name;
1318 	nvpair_t		*nvp;
1319 	nvpair_t		*next_nvp;
1320 	int			nvrc;
1321 	idm_status_t		idm_status;
1322 	const idm_kv_xlate_t	*ikvx;
1323 
1324 	for (nvp = nvlist_next_nvpair(negotiated_nvl, NULL);
1325 	    nvp != NULL; nvp = next_nvp) {
1326 		next_nvp = nvlist_next_nvpair(negotiated_nvl, nvp);
1327 		nvp_name = nvpair_name(nvp);
1328 
1329 		ikvx = idm_lookup_kv_xlate(nvp_name, strlen(nvp_name));
1330 		switch (ikvx->ik_key_id) {
1331 		case KI_HEADER_DIGEST:
1332 		case KI_DATA_DIGEST:
1333 			idm_status = idm_so_handle_digest(it, nvp, ikvx);
1334 			ASSERT(idm_status == 0);
1335 
1336 			/* Remove processed item from negotiated_nvl list */
1337 			nvrc = nvlist_remove_all(
1338 			    negotiated_nvl, ikvx->ik_key_name);
1339 			ASSERT(nvrc == 0);
1340 			break;
1341 		default:
1342 			break;
1343 		}
1344 	}
1345 }
1346 
1347 
1348 static idm_status_t
1349 idm_so_handle_digest(idm_conn_t *it, nvpair_t *digest_choice,
1350     const idm_kv_xlate_t *ikvx)
1351 {
1352 	int			nvrc;
1353 	char			*digest_choice_string;
1354 
1355 	nvrc = nvpair_value_string(digest_choice,
1356 	    &digest_choice_string);
1357 	ASSERT(nvrc == 0);
1358 	if (strcasecmp(digest_choice_string, "crc32c") == 0) {
1359 		switch (ikvx->ik_key_id) {
1360 		case KI_HEADER_DIGEST:
1361 			it->ic_conn_flags |= IDM_CONN_HEADER_DIGEST;
1362 			break;
1363 		case KI_DATA_DIGEST:
1364 			it->ic_conn_flags |= IDM_CONN_DATA_DIGEST;
1365 			break;
1366 		default:
1367 			ASSERT(0);
1368 			break;
1369 		}
1370 	} else if (strcasecmp(digest_choice_string, "none") == 0) {
1371 		switch (ikvx->ik_key_id) {
1372 		case KI_HEADER_DIGEST:
1373 			it->ic_conn_flags &= ~IDM_CONN_HEADER_DIGEST;
1374 			break;
1375 		case KI_DATA_DIGEST:
1376 			it->ic_conn_flags &= ~IDM_CONN_DATA_DIGEST;
1377 			break;
1378 		default:
1379 			ASSERT(0);
1380 			break;
1381 		}
1382 	} else {
1383 		ASSERT(0);
1384 	}
1385 
1386 	return (IDM_STATUS_SUCCESS);
1387 }
1388 
1389 
1390 /*
1391  * idm_so_conn_is_capable() verifies that the passed connection is provided
1392  * for by the sockets interface.
1393  */
1394 /* ARGSUSED */
1395 static boolean_t
1396 idm_so_conn_is_capable(idm_conn_req_t *ic, idm_transport_caps_t *caps)
1397 {
1398 	return (B_TRUE);
1399 }
1400 
1401 /*
1402  * idm_so_rx_datain() validates the Data Sequence number of the PDU. The
1403  * idm_sorecv_scsidata() function invoked earlier actually reads the data
1404  * off the socket into the appropriate buffers.
1405  */
1406 static void
1407 idm_so_rx_datain(idm_conn_t *ic, idm_pdu_t *pdu)
1408 {
1409 	iscsi_data_hdr_t	*bhs;
1410 	idm_task_t		*idt;
1411 	idm_buf_t		*idb;
1412 	uint32_t		datasn;
1413 	size_t			offset;
1414 	iscsi_hdr_t		*ihp = (iscsi_hdr_t *)pdu->isp_hdr;
1415 	iscsi_data_rsp_hdr_t    *idrhp = (iscsi_data_rsp_hdr_t *)ihp;
1416 
1417 	ASSERT(ic != NULL);
1418 	ASSERT(pdu != NULL);
1419 
1420 	bhs	= (iscsi_data_hdr_t *)pdu->isp_hdr;
1421 	datasn	= ntohl(bhs->datasn);
1422 	offset	= ntohl(bhs->offset);
1423 
1424 	ASSERT(bhs->opcode == ISCSI_OP_SCSI_DATA_RSP);
1425 
1426 	/*
1427 	 * Look up the task corresponding to the initiator task tag
1428 	 * to get the buffers affiliated with the task.
1429 	 */
1430 	idt = idm_task_find(ic, bhs->itt, bhs->ttt);
1431 	if (idt == NULL) {
1432 		IDM_CONN_LOG(CE_WARN, "idm_so_rx_datain: failed to find task");
1433 		idm_pdu_rx_protocol_error(ic, pdu);
1434 		return;
1435 	}
1436 
1437 	idb = pdu->isp_sorx_buf;
1438 	if (idb == NULL) {
1439 		IDM_CONN_LOG(CE_WARN,
1440 		    "idm_so_rx_datain: failed to find buffer");
1441 		idm_task_rele(idt);
1442 		idm_pdu_rx_protocol_error(ic, pdu);
1443 		return;
1444 	}
1445 
1446 	/*
1447 	 * DataSN values should be sequential and should not have any gaps or
1448 	 * repetitions. Check the DataSN with the one stored in the task.
1449 	 */
1450 	if (datasn == idt->idt_exp_datasn) {
1451 		idt->idt_exp_datasn++; /* keep track of DataSN received */
1452 	} else {
1453 		IDM_CONN_LOG(CE_WARN, "idm_so_rx_datain: datasn out of order");
1454 		idm_task_rele(idt);
1455 		idm_pdu_rx_protocol_error(ic, pdu);
1456 		return;
1457 	}
1458 
1459 	/*
1460 	 * PDUs in a sequence should be in continuously increasing
1461 	 * address offset
1462 	 */
1463 	if (offset != idb->idb_exp_offset) {
1464 		IDM_CONN_LOG(CE_WARN, "idm_so_rx_datain: unexpected offset");
1465 		idm_task_rele(idt);
1466 		idm_pdu_rx_protocol_error(ic, pdu);
1467 		return;
1468 	}
1469 	/* Expected next relative buffer offset */
1470 	idb->idb_exp_offset += n2h24(bhs->dlength);
1471 	idt->idt_rx_bytes += n2h24(bhs->dlength);
1472 
1473 	idm_task_rele(idt);
1474 
1475 	/*
1476 	 * For now call scsi_rsp which will process the data rsp
1477 	 * Revisit, need to provide an explicit client entry point for
1478 	 * phase collapse completions.
1479 	 */
1480 	if (((ihp->opcode & ISCSI_OPCODE_MASK) == ISCSI_OP_SCSI_DATA_RSP) &&
1481 	    (idrhp->flags & ISCSI_FLAG_DATA_STATUS)) {
1482 		(*ic->ic_conn_ops.icb_rx_scsi_rsp)(ic, pdu);
1483 	}
1484 
1485 	idm_pdu_complete(pdu, IDM_STATUS_SUCCESS);
1486 }
1487 
1488 /*
1489  * The idm_so_rx_dataout() function is used by the iSCSI target to read
1490  * data from the Data-Out PDU sent by the iSCSI initiator.
1491  *
1492  * This function gets the Initiator Task Tag from the PDU BHS and looks up the
1493  * task to get the buffers associated with the PDU. A PDU might span buffers.
1494  * The data is then read into the respective buffer.
1495  */
1496 static void
1497 idm_so_rx_dataout(idm_conn_t *ic, idm_pdu_t *pdu)
1498 {
1499 
1500 	iscsi_data_hdr_t	*bhs;
1501 	idm_task_t		*idt;
1502 	idm_buf_t		*idb;
1503 	size_t			offset;
1504 
1505 	ASSERT(ic != NULL);
1506 	ASSERT(pdu != NULL);
1507 
1508 	bhs = (iscsi_data_hdr_t *)pdu->isp_hdr;
1509 	offset = ntohl(bhs->offset);
1510 	ASSERT(bhs->opcode == ISCSI_OP_SCSI_DATA);
1511 
1512 	/*
1513 	 * Look up the task corresponding to the initiator task tag
1514 	 * to get the buffers affiliated with the task.
1515 	 */
1516 	idt = idm_task_find(ic, bhs->itt, bhs->ttt);
1517 	if (idt == NULL) {
1518 		IDM_CONN_LOG(CE_WARN,
1519 		    "idm_so_rx_dataout: failed to find task");
1520 		idm_pdu_rx_protocol_error(ic, pdu);
1521 		return;
1522 	}
1523 
1524 	idb = pdu->isp_sorx_buf;
1525 	if (idb == NULL) {
1526 		IDM_CONN_LOG(CE_WARN,
1527 		    "idm_so_rx_dataout: failed to find buffer");
1528 		idm_task_rele(idt);
1529 		idm_pdu_rx_protocol_error(ic, pdu);
1530 		return;
1531 	}
1532 
1533 	/* Keep track of data transferred - check data offsets */
1534 	if (offset != idb->idb_exp_offset) {
1535 		IDM_CONN_LOG(CE_NOTE, "idm_so_rx_dataout: offset out of seq: "
1536 		    "%ld, %d", offset, idb->idb_exp_offset);
1537 		idm_task_rele(idt);
1538 		idm_pdu_rx_protocol_error(ic, pdu);
1539 		return;
1540 	}
1541 	/* Expected next relative offset */
1542 	idb->idb_exp_offset += ntoh24(bhs->dlength);
1543 	idt->idt_rx_bytes += n2h24(bhs->dlength);
1544 
1545 	/*
1546 	 * Call the buffer callback when the transfer is complete
1547 	 *
1548 	 * The connection state machine should only abort tasks after
1549 	 * shutting down the connection so we are assured that there
1550 	 * won't be a simultaneous attempt to abort this task at the
1551 	 * same time as we are processing this PDU (due to a connection
1552 	 * state change).
1553 	 */
1554 	if (bhs->flags & ISCSI_FLAG_FINAL) {
1555 		/*
1556 		 * We only want to call idm_buf_rx_from_ini_done once
1557 		 * per transfer.  It's possible that this task has
1558 		 * already been aborted in which case
1559 		 * idm_so_free_task_rsrc will call idm_buf_rx_from_ini_done
1560 		 * for each buffer with idb_in_transport==B_TRUE.  To
1561 		 * close this window and ensure that this doesn't happen,
1562 		 * we'll clear idb->idb_in_transport now while holding
1563 		 * the task mutex.   This is only really an issue for
1564 		 * SCSI task abort -- if tasks were being aborted because
1565 		 * of a connection state change the state machine would
1566 		 * have already stopped the receive thread.
1567 		 */
1568 		mutex_enter(&idt->idt_mutex);
1569 
1570 		/*
1571 		 * Release the task hold here (obtained in idm_task_find)
1572 		 * because the task may complete synchronously during
1573 		 * idm_buf_rx_from_ini_done.  Since we still have an active
1574 		 * buffer we know there is at least one additional hold on idt.
1575 		 */
1576 		idm_task_rele(idt);
1577 
1578 		/*
1579 		 * idm_buf_rx_from_ini_done releases idt->idt_mutex
1580 		 */
1581 		DTRACE_ISCSI_8(xfer__done, idm_conn_t *, idt->idt_ic,
1582 		    uintptr_t, idb->idb_buf, uint32_t, idb->idb_bufoffset,
1583 		    uint64_t, 0, uint32_t, 0, uint32_t, 0,
1584 		    uint32_t, idb->idb_xfer_len,
1585 		    int, XFER_BUF_RX_FROM_INI);
1586 		idm_buf_rx_from_ini_done(idt, idb, IDM_STATUS_SUCCESS);
1587 		idm_pdu_complete(pdu, IDM_STATUS_SUCCESS);
1588 		return;
1589 	}
1590 
1591 	idm_task_rele(idt);
1592 	idm_pdu_complete(pdu, IDM_STATUS_SUCCESS);
1593 }
1594 
1595 /*
1596  * The idm_so_rx_rtt() function is used by the iSCSI initiator to handle
1597  * the R2T PDU sent by the iSCSI target indicating that it is ready to
1598  * accept data. This gets the Initiator Task Tag (itt) from the PDU BHS
1599  * and looks up the task in the task tree using the itt to get the output
1600  * buffers associated the task. The R2T PDU contains the offset of the
1601  * requested data and the data length. This function then constructs a
1602  * sequence of iSCSI PDUs and outputs the requested data. Each Data-Out
1603  * PDU is associated with the R2T by the Target Transfer Tag  (ttt).
1604  */
1605 
1606 static void
1607 idm_so_rx_rtt(idm_conn_t *ic, idm_pdu_t *pdu)
1608 {
1609 	idm_task_t		*idt;
1610 	idm_buf_t		*idb;
1611 	iscsi_rtt_hdr_t		*rtt_hdr;
1612 	uint32_t		data_offset;
1613 	uint32_t		data_length;
1614 
1615 	ASSERT(ic != NULL);
1616 	ASSERT(pdu != NULL);
1617 
1618 	rtt_hdr	= (iscsi_rtt_hdr_t *)pdu->isp_hdr;
1619 	data_offset = ntohl(rtt_hdr->data_offset);
1620 	data_length = ntohl(rtt_hdr->data_length);
1621 	idt	= idm_task_find(ic, rtt_hdr->itt, rtt_hdr->ttt);
1622 
1623 	if (idt == NULL) {
1624 		IDM_CONN_LOG(CE_WARN, "idm_so_rx_rtt: could not find task");
1625 		idm_pdu_rx_protocol_error(ic, pdu);
1626 		return;
1627 	}
1628 
1629 	/* Find the buffer bound to the task by the iSCSI initiator */
1630 	mutex_enter(&idt->idt_mutex);
1631 	idb = idm_buf_find(&idt->idt_outbufv, data_offset);
1632 	if (idb == NULL) {
1633 		mutex_exit(&idt->idt_mutex);
1634 		idm_task_rele(idt);
1635 		IDM_CONN_LOG(CE_WARN, "idm_so_rx_rtt: could not find buffer");
1636 		idm_pdu_rx_protocol_error(ic, pdu);
1637 		return;
1638 	}
1639 
1640 	/* return buffer contains this data */
1641 	if (data_offset + data_length > idb->idb_buflen) {
1642 		/* Overflow */
1643 		mutex_exit(&idt->idt_mutex);
1644 		idm_task_rele(idt);
1645 		IDM_CONN_LOG(CE_WARN, "idm_so_rx_rtt: read from outside "
1646 		    "buffer");
1647 		idm_pdu_rx_protocol_error(ic, pdu);
1648 		return;
1649 	}
1650 
1651 	idt->idt_r2t_ttt = rtt_hdr->ttt;
1652 	idt->idt_exp_datasn = 0;
1653 
1654 	idm_so_send_rtt_data(ic, idt, idb, data_offset,
1655 	    ntohl(rtt_hdr->data_length));
1656 	mutex_exit(&idt->idt_mutex);
1657 
1658 	idm_pdu_complete(pdu, IDM_STATUS_SUCCESS);
1659 	idm_task_rele(idt);
1660 
1661 }
1662 
1663 idm_status_t
1664 idm_sorecvdata(idm_conn_t *ic, idm_pdu_t *pdu)
1665 {
1666 	uint8_t		pad[ISCSI_PAD_WORD_LEN];
1667 	int		pad_len;
1668 	uint32_t	data_digest_crc;
1669 	uint32_t	crc_calculated;
1670 	int		total_len;
1671 	idm_so_conn_t	*so_conn;
1672 
1673 	so_conn = ic->ic_transport_private;
1674 
1675 	pad_len = ((ISCSI_PAD_WORD_LEN -
1676 	    (pdu->isp_datalen & (ISCSI_PAD_WORD_LEN - 1))) &
1677 	    (ISCSI_PAD_WORD_LEN - 1));
1678 
1679 	ASSERT(pdu->isp_iovlen < (PDU_MAX_IOVLEN - 2)); /* pad + data digest */
1680 
1681 	total_len = pdu->isp_datalen;
1682 
1683 	if (pad_len) {
1684 		pdu->isp_iov[pdu->isp_iovlen].iov_base	= (char *)&pad;
1685 		pdu->isp_iov[pdu->isp_iovlen].iov_len	= pad_len;
1686 		total_len		+= pad_len;
1687 		pdu->isp_iovlen++;
1688 	}
1689 
1690 	/* setup data digest */
1691 	if ((ic->ic_conn_flags & IDM_CONN_DATA_DIGEST) != 0) {
1692 		pdu->isp_iov[pdu->isp_iovlen].iov_base =
1693 		    (char *)&data_digest_crc;
1694 		pdu->isp_iov[pdu->isp_iovlen].iov_len =
1695 		    sizeof (data_digest_crc);
1696 		total_len		+= sizeof (data_digest_crc);
1697 		pdu->isp_iovlen++;
1698 	}
1699 
1700 	pdu->isp_data = (uint8_t *)(uintptr_t)pdu->isp_iov[0].iov_base;
1701 
1702 	if (idm_iov_sorecv(so_conn->ic_so, &pdu->isp_iov[0],
1703 	    pdu->isp_iovlen, total_len) != 0) {
1704 		return (IDM_STATUS_IO);
1705 	}
1706 
1707 	if ((ic->ic_conn_flags & IDM_CONN_DATA_DIGEST) != 0) {
1708 		crc_calculated = idm_crc32c(pdu->isp_data,
1709 		    pdu->isp_datalen);
1710 		if (pad_len) {
1711 			crc_calculated = idm_crc32c_continued((char *)&pad,
1712 			    pad_len, crc_calculated);
1713 		}
1714 		if (crc_calculated != data_digest_crc) {
1715 			IDM_CONN_LOG(CE_WARN,
1716 			    "idm_sorecvdata: "
1717 			    "CRC error: actual 0x%x, calc 0x%x",
1718 			    data_digest_crc, crc_calculated);
1719 
1720 			/* Invalid Data Digest */
1721 			return (IDM_STATUS_DATA_DIGEST);
1722 		}
1723 	}
1724 
1725 	return (IDM_STATUS_SUCCESS);
1726 }
1727 
1728 /*
1729  * idm_sorecv_scsidata() is used to receive scsi data from the socket. The
1730  * Data-type PDU header must be read into the idm_pdu_t structure prior to
1731  * calling this function.
1732  */
1733 idm_status_t
1734 idm_sorecv_scsidata(idm_conn_t *ic, idm_pdu_t *pdu)
1735 {
1736 	iscsi_data_hdr_t	*bhs;
1737 	idm_task_t		*task;
1738 	uint32_t		offset;
1739 	uint8_t			opcode;
1740 	uint32_t		dlength;
1741 	list_t			*buflst;
1742 	uint32_t		xfer_bytes;
1743 	idm_status_t		status;
1744 
1745 	ASSERT(ic != NULL);
1746 	ASSERT(pdu != NULL);
1747 
1748 	bhs	= (iscsi_data_hdr_t *)pdu->isp_hdr;
1749 
1750 	offset	= ntohl(bhs->offset);
1751 	opcode	= bhs->opcode;
1752 	dlength = n2h24(bhs->dlength);
1753 
1754 	ASSERT((opcode == ISCSI_OP_SCSI_DATA_RSP) ||
1755 	    (opcode == ISCSI_OP_SCSI_DATA));
1756 
1757 	/*
1758 	 * Successful lookup implicitly gets a "hold" on the task.  This
1759 	 * hold must be released before leaving this function.  At one
1760 	 * point we were caching this task context and retaining the hold
1761 	 * but it turned out to be very difficult to release the hold properly.
1762 	 * The task can be aborted and the connection shutdown between this
1763 	 * call and the subsequent expected call to idm_so_rx_datain/
1764 	 * idm_so_rx_dataout (in which case those functions are not called).
1765 	 * Releasing the hold in the PDU callback doesn't work well either
1766 	 * because the whole task may be completed by then at which point
1767 	 * it is too late to release the hold -- for better or worse this
1768 	 * code doesn't wait on the refcnts during normal operation.
1769 	 * idm_task_find() is very fast and it is not a huge burden if we
1770 	 * have to do it twice.
1771 	 */
1772 	task = idm_task_find(ic, bhs->itt, bhs->ttt);
1773 	if (task == NULL) {
1774 		IDM_CONN_LOG(CE_WARN,
1775 		    "idm_sorecv_scsidata: could not find task");
1776 		return (IDM_STATUS_FAIL);
1777 	}
1778 
1779 	mutex_enter(&task->idt_mutex);
1780 	buflst	= (opcode == ISCSI_OP_SCSI_DATA_RSP) ?
1781 	    &task->idt_inbufv : &task->idt_outbufv;
1782 	pdu->isp_sorx_buf = idm_buf_find(buflst, offset);
1783 	mutex_exit(&task->idt_mutex);
1784 
1785 	if (pdu->isp_sorx_buf == NULL) {
1786 		idm_task_rele(task);
1787 		IDM_CONN_LOG(CE_WARN, "idm_sorecv_scsidata: could not find "
1788 		    "buffer for offset %x opcode=%x",
1789 		    offset, opcode);
1790 		return (IDM_STATUS_FAIL);
1791 	}
1792 
1793 	xfer_bytes = idm_fill_iov(pdu, pdu->isp_sorx_buf, offset, dlength);
1794 	ASSERT(xfer_bytes != 0);
1795 	if (xfer_bytes != dlength) {
1796 		idm_task_rele(task);
1797 		/*
1798 		 * Buffer overflow, connection error.  The PDU data is still
1799 		 * sitting in the socket so we can't use the connection
1800 		 * again until that data is drained.
1801 		 */
1802 		return (IDM_STATUS_FAIL);
1803 	}
1804 
1805 	status = idm_sorecvdata(ic, pdu);
1806 
1807 	idm_task_rele(task);
1808 
1809 	return (status);
1810 }
1811 
1812 static uint32_t
1813 idm_fill_iov(idm_pdu_t *pdu, idm_buf_t *idb, uint32_t ro, uint32_t dlength)
1814 {
1815 	uint32_t	buf_ro = ro - idb->idb_bufoffset;
1816 	uint32_t	xfer_len = min(dlength, idb->idb_buflen - buf_ro);
1817 
1818 	ASSERT(ro >= idb->idb_bufoffset);
1819 
1820 	pdu->isp_iov[pdu->isp_iovlen].iov_base	=
1821 	    (caddr_t)idb->idb_buf + buf_ro;
1822 	pdu->isp_iov[pdu->isp_iovlen].iov_len	= xfer_len;
1823 	pdu->isp_iovlen++;
1824 
1825 	return (xfer_len);
1826 }
1827 
1828 int
1829 idm_sorecv_nonscsidata(idm_conn_t *ic, idm_pdu_t *pdu)
1830 {
1831 	pdu->isp_data = kmem_alloc(pdu->isp_datalen, KM_SLEEP);
1832 	ASSERT(pdu->isp_data != NULL);
1833 
1834 	pdu->isp_databuflen = pdu->isp_datalen;
1835 	pdu->isp_iov[0].iov_base = (caddr_t)pdu->isp_data;
1836 	pdu->isp_iov[0].iov_len = pdu->isp_datalen;
1837 	pdu->isp_iovlen = 1;
1838 	/*
1839 	 * Since we are associating a new data buffer with this received
1840 	 * PDU we need to set a specific callback to free the data
1841 	 * after the PDU is processed.
1842 	 */
1843 	pdu->isp_flags |= IDM_PDU_ADDL_DATA;
1844 	pdu->isp_callback = idm_sorx_addl_pdu_cb;
1845 
1846 	return (idm_sorecvdata(ic, pdu));
1847 }
1848 
1849 void
1850 idm_sorx_thread(void *arg)
1851 {
1852 	boolean_t	conn_failure = B_FALSE;
1853 	idm_conn_t	*ic = (idm_conn_t *)arg;
1854 	idm_so_conn_t	*so_conn;
1855 	idm_pdu_t	*pdu;
1856 	idm_status_t	rc;
1857 
1858 	idm_conn_hold(ic);
1859 
1860 	mutex_enter(&ic->ic_mutex);
1861 
1862 	so_conn = ic->ic_transport_private;
1863 	so_conn->ic_rx_thread_running = B_TRUE;
1864 	so_conn->ic_rx_thread_did = so_conn->ic_rx_thread->t_did;
1865 	cv_signal(&ic->ic_cv);
1866 
1867 	while (so_conn->ic_rx_thread_running) {
1868 		mutex_exit(&ic->ic_mutex);
1869 
1870 		/*
1871 		 * Get PDU with default header size (large enough for
1872 		 * BHS plus any anticipated AHS).  PDU from
1873 		 * the cache will have all values set correctly
1874 		 * for sockets RX including callback.
1875 		 */
1876 		pdu = kmem_cache_alloc(idm.idm_sorx_pdu_cache, KM_SLEEP);
1877 		pdu->isp_ic = ic;
1878 		pdu->isp_flags = 0;
1879 		pdu->isp_transport_hdrlen = 0;
1880 
1881 		if ((rc = idm_sorecvhdr(ic, pdu)) != 0) {
1882 			/*
1883 			 * Call idm_pdu_complete so that we call the callback
1884 			 * and ensure any memory allocated in idm_sorecvhdr
1885 			 * gets freed up.
1886 			 */
1887 			idm_pdu_complete(pdu, IDM_STATUS_FAIL);
1888 
1889 			/*
1890 			 * If ic_rx_thread_running is still set then
1891 			 * this is some kind of connection problem
1892 			 * on the socket.  In this case we want to
1893 			 * generate an event.  Otherwise some other
1894 			 * thread closed the socket due to another
1895 			 * issue in which case we don't need to
1896 			 * generate an event.
1897 			 */
1898 			mutex_enter(&ic->ic_mutex);
1899 			if (so_conn->ic_rx_thread_running) {
1900 				conn_failure = B_TRUE;
1901 				so_conn->ic_rx_thread_running = B_FALSE;
1902 			}
1903 
1904 			continue;
1905 		}
1906 
1907 		/*
1908 		 * Header has been read and validated.  Now we need
1909 		 * to read the PDU data payload (if present).  SCSI data
1910 		 * need to be transferred from the socket directly into
1911 		 * the associated transfer buffer for the SCSI task.
1912 		 */
1913 		if (pdu->isp_datalen != 0) {
1914 			if ((IDM_PDU_OPCODE(pdu) == ISCSI_OP_SCSI_DATA) ||
1915 			    (IDM_PDU_OPCODE(pdu) == ISCSI_OP_SCSI_DATA_RSP)) {
1916 				rc = idm_sorecv_scsidata(ic, pdu);
1917 				/*
1918 				 * All SCSI errors are fatal to the
1919 				 * connection right now since we have no
1920 				 * place to put the data.  What we need
1921 				 * is some kind of sink to dispose of unwanted
1922 				 * SCSI data.  For example an invalid task tag
1923 				 * should not kill the connection (although
1924 				 * we may want to drop the connection).
1925 				 */
1926 			} else {
1927 				/*
1928 				 * Not data PDUs so allocate a buffer for the
1929 				 * data segment and read the remaining data.
1930 				 */
1931 				rc = idm_sorecv_nonscsidata(ic, pdu);
1932 			}
1933 			if (rc != 0) {
1934 				/*
1935 				 * Call idm_pdu_complete so that we call the
1936 				 * callback and ensure any memory allocated
1937 				 * in idm_sorecvhdr gets freed up.
1938 				 */
1939 				idm_pdu_complete(pdu, IDM_STATUS_FAIL);
1940 
1941 				/*
1942 				 * If ic_rx_thread_running is still set then
1943 				 * this is some kind of connection problem
1944 				 * on the socket.  In this case we want to
1945 				 * generate an event.  Otherwise some other
1946 				 * thread closed the socket due to another
1947 				 * issue in which case we don't need to
1948 				 * generate an event.
1949 				 */
1950 				mutex_enter(&ic->ic_mutex);
1951 				if (so_conn->ic_rx_thread_running) {
1952 					conn_failure = B_TRUE;
1953 					so_conn->ic_rx_thread_running = B_FALSE;
1954 				}
1955 				continue;
1956 			}
1957 		}
1958 
1959 		/*
1960 		 * Process RX PDU
1961 		 */
1962 		idm_pdu_rx(ic, pdu);
1963 
1964 		mutex_enter(&ic->ic_mutex);
1965 	}
1966 
1967 	mutex_exit(&ic->ic_mutex);
1968 
1969 	/*
1970 	 * If we dropped out of the RX processing loop because of
1971 	 * a socket problem or other connection failure (including
1972 	 * digest errors) then we need to generate a state machine
1973 	 * event to shut the connection down.
1974 	 * If the state machine is already in, for example, INIT_ERROR, this
1975 	 * event will get dropped, and the TX thread will never be notified
1976 	 * to shut down.  To be safe, we'll just notify it here.
1977 	 */
1978 	if (conn_failure) {
1979 		if (so_conn->ic_tx_thread_running) {
1980 			so_conn->ic_tx_thread_running = B_FALSE;
1981 			mutex_enter(&so_conn->ic_tx_mutex);
1982 			cv_signal(&so_conn->ic_tx_cv);
1983 			mutex_exit(&so_conn->ic_tx_mutex);
1984 		}
1985 
1986 		idm_conn_event(ic, CE_TRANSPORT_FAIL, rc);
1987 	}
1988 
1989 	idm_conn_rele(ic);
1990 
1991 	thread_exit();
1992 }
1993 
1994 /*
1995  * idm_so_tx
1996  *
1997  * This is the implementation of idm_transport_ops_t's it_tx_pdu entry
1998  * point.  By definition, it is supposed to be fast.  So, simply queue
1999  * the entry and return.  The real work is done by idm_i_so_tx() via
2000  * idm_sotx_thread().
2001  */
2002 
2003 static void
2004 idm_so_tx(idm_conn_t *ic, idm_pdu_t *pdu)
2005 {
2006 	idm_so_conn_t *so_conn = ic->ic_transport_private;
2007 
2008 	ASSERT(pdu->isp_ic == ic);
2009 	mutex_enter(&so_conn->ic_tx_mutex);
2010 
2011 	if (!so_conn->ic_tx_thread_running) {
2012 		mutex_exit(&so_conn->ic_tx_mutex);
2013 		idm_pdu_complete(pdu, IDM_STATUS_ABORTED);
2014 		return;
2015 	}
2016 
2017 	list_insert_tail(&so_conn->ic_tx_list, (void *)pdu);
2018 	cv_signal(&so_conn->ic_tx_cv);
2019 	mutex_exit(&so_conn->ic_tx_mutex);
2020 }
2021 
2022 static idm_status_t
2023 idm_i_so_tx(idm_pdu_t *pdu)
2024 {
2025 	idm_conn_t	*ic = pdu->isp_ic;
2026 	idm_status_t	status = IDM_STATUS_SUCCESS;
2027 	uint8_t		pad[ISCSI_PAD_WORD_LEN];
2028 	int		pad_len;
2029 	uint32_t	hdr_digest_crc;
2030 	uint32_t	data_digest_crc = 0;
2031 	int		total_len = 0;
2032 	int		iovlen = 0;
2033 	struct iovec	iov[6];
2034 	idm_so_conn_t	*so_conn;
2035 
2036 	so_conn = ic->ic_transport_private;
2037 
2038 	/* Setup BHS */
2039 	iov[iovlen].iov_base	= (caddr_t)pdu->isp_hdr;
2040 	iov[iovlen].iov_len	= pdu->isp_hdrlen;
2041 	total_len		+= iov[iovlen].iov_len;
2042 	iovlen++;
2043 
2044 	/* Setup header digest */
2045 	if (((pdu->isp_flags & IDM_PDU_LOGIN_TX) == 0) &&
2046 	    (ic->ic_conn_flags & IDM_CONN_HEADER_DIGEST)) {
2047 		hdr_digest_crc = idm_crc32c(pdu->isp_hdr, pdu->isp_hdrlen);
2048 
2049 		iov[iovlen].iov_base	= (caddr_t)&hdr_digest_crc;
2050 		iov[iovlen].iov_len	= sizeof (hdr_digest_crc);
2051 		total_len		+= iov[iovlen].iov_len;
2052 		iovlen++;
2053 	}
2054 
2055 	/* Setup the data */
2056 	if (pdu->isp_datalen) {
2057 		idm_task_t		*idt;
2058 		idm_buf_t		*idb;
2059 		iscsi_data_hdr_t	*ihp;
2060 		ihp = (iscsi_data_hdr_t *)pdu->isp_hdr;
2061 		/* Write of immediate data */
2062 		if (ic->ic_ffp &&
2063 		    (ihp->opcode == ISCSI_OP_SCSI_CMD ||
2064 		    ihp->opcode == ISCSI_OP_SCSI_DATA)) {
2065 			idt = idm_task_find(ic, ihp->itt, ihp->ttt);
2066 			if (idt) {
2067 				mutex_enter(&idt->idt_mutex);
2068 				idb = idm_buf_find(&idt->idt_outbufv, 0);
2069 				mutex_exit(&idt->idt_mutex);
2070 				/*
2071 				 * If the initiator call to idm_buf_alloc
2072 				 * failed then we can get to this point
2073 				 * without a bound buffer.  The associated
2074 				 * connection failure will clean things up
2075 				 * later.  It would be nice to come up with
2076 				 * a cleaner way to handle this.  In
2077 				 * particular it seems absurd to look up
2078 				 * the task and the buffer just to update
2079 				 * this counter.
2080 				 */
2081 				if (idb)
2082 					idb->idb_xfer_len += pdu->isp_datalen;
2083 				idm_task_rele(idt);
2084 			}
2085 		}
2086 
2087 		iov[iovlen].iov_base = (caddr_t)pdu->isp_data;
2088 		iov[iovlen].iov_len  = pdu->isp_datalen;
2089 		total_len += iov[iovlen].iov_len;
2090 		iovlen++;
2091 	}
2092 
2093 	/* Setup the data pad if necessary */
2094 	pad_len = ((ISCSI_PAD_WORD_LEN -
2095 	    (pdu->isp_datalen & (ISCSI_PAD_WORD_LEN - 1))) &
2096 	    (ISCSI_PAD_WORD_LEN - 1));
2097 
2098 	if (pad_len) {
2099 		bzero(pad, sizeof (pad));
2100 		iov[iovlen].iov_base = (void *)&pad;
2101 		iov[iovlen].iov_len  = pad_len;
2102 		total_len		+= iov[iovlen].iov_len;
2103 		iovlen++;
2104 	}
2105 
2106 	/*
2107 	 * Setup the data digest if enabled.  Data-digest is not sent
2108 	 * for login-phase PDUs.
2109 	 */
2110 	if ((ic->ic_conn_flags & IDM_CONN_DATA_DIGEST) &&
2111 	    ((pdu->isp_flags & IDM_PDU_LOGIN_TX) == 0) &&
2112 	    (pdu->isp_datalen || pad_len)) {
2113 		/*
2114 		 * RFC3720/10.2.3: A zero-length Data Segment also
2115 		 * implies a zero-length data digest.
2116 		 */
2117 		if (pdu->isp_datalen) {
2118 			data_digest_crc = idm_crc32c(pdu->isp_data,
2119 			    pdu->isp_datalen);
2120 		}
2121 		if (pad_len) {
2122 			data_digest_crc = idm_crc32c_continued(&pad,
2123 			    pad_len, data_digest_crc);
2124 		}
2125 
2126 		iov[iovlen].iov_base	= (caddr_t)&data_digest_crc;
2127 		iov[iovlen].iov_len	= sizeof (data_digest_crc);
2128 		total_len		+= iov[iovlen].iov_len;
2129 		iovlen++;
2130 	}
2131 
2132 	/* Transmit the PDU */
2133 	if (idm_iov_sosend(so_conn->ic_so, &iov[0], iovlen,
2134 	    total_len) != 0) {
2135 		/* Set error status */
2136 		IDM_CONN_LOG(CE_WARN,
2137 		    "idm_so_tx: failed to transmit the PDU, so: %p ic: %p "
2138 		    "data: %p", (void *) so_conn->ic_so, (void *) ic,
2139 		    (void *) pdu->isp_data);
2140 		status = IDM_STATUS_IO;
2141 	}
2142 
2143 	/*
2144 	 * Success does not mean that the PDU actually reached the
2145 	 * remote node since it could get dropped along the way.
2146 	 */
2147 	idm_pdu_complete(pdu, status);
2148 
2149 	return (status);
2150 }
2151 
2152 /*
2153  * The idm_so_buf_tx_to_ini() is used by the target iSCSI layer to transmit the
2154  * Data-In PDUs using sockets. Based on the negotiated MaxRecvDataSegmentLength,
2155  * the buffer is segmented into a sequence of Data-In PDUs, ordered by DataSN.
2156  * A target can invoke this function multiple times for a single read command
2157  * (identified by the same ITT) to split the input into several sequences.
2158  *
2159  * DataSN starts with 0 for the first data PDU of an input command and advances
2160  * by 1 for each subsequent data PDU. Each sequence will have its own F bit,
2161  * which is set to 1 for the last data PDU of a sequence.
2162  *
2163  * Scope for Prototype build:
2164  * The data PDUs within a sequence will be sent in order with the buffer offset
2165  * in increasing order. i.e. initiator and target must have negotiated the
2166  * "DataPDUInOrder" to "Yes". The order between sequences is not enforced.
2167  *
2168  * Caller holds idt->idt_mutex
2169  */
2170 static idm_status_t
2171 idm_so_buf_tx_to_ini(idm_task_t *idt, idm_buf_t *idb)
2172 {
2173 	idm_so_conn_t	*so_conn = idb->idb_ic->ic_transport_private;
2174 	idm_pdu_t	tmppdu;
2175 
2176 	ASSERT(mutex_owned(&idt->idt_mutex));
2177 
2178 	/*
2179 	 * Put the idm_buf_t on the tx queue.  It will be transmitted by
2180 	 * idm_sotx_thread.
2181 	 */
2182 	mutex_enter(&so_conn->ic_tx_mutex);
2183 
2184 	DTRACE_ISCSI_8(xfer__start, idm_conn_t *, idt->idt_ic,
2185 	    uintptr_t, idb->idb_buf, uint32_t, idb->idb_bufoffset,
2186 	    uint64_t, 0, uint32_t, 0, uint32_t, 0,
2187 	    uint32_t, idb->idb_xfer_len, int, XFER_BUF_TX_TO_INI);
2188 
2189 	if (!so_conn->ic_tx_thread_running) {
2190 		mutex_exit(&so_conn->ic_tx_mutex);
2191 		/*
2192 		 * Don't release idt->idt_mutex since we're supposed to hold
2193 		 * in when calling idm_buf_tx_to_ini_done
2194 		 */
2195 		DTRACE_ISCSI_8(xfer__done, idm_conn_t *, idt->idt_ic,
2196 		    uintptr_t, idb->idb_buf, uint32_t, idb->idb_bufoffset,
2197 		    uint64_t, 0, uint32_t, 0, uint32_t, 0,
2198 		    uint32_t, idb->idb_xfer_len,
2199 		    int, XFER_BUF_TX_TO_INI);
2200 		idm_buf_tx_to_ini_done(idt, idb, IDM_STATUS_ABORTED);
2201 		return (IDM_STATUS_FAIL);
2202 	}
2203 
2204 	/*
2205 	 * Build a template for the data PDU headers we will use so that
2206 	 * the SN values will stay consistent with other PDU's we are
2207 	 * transmitting like R2T and SCSI status.
2208 	 */
2209 	bzero(&idb->idb_data_hdr_tmpl, sizeof (iscsi_hdr_t));
2210 	tmppdu.isp_hdr = &idb->idb_data_hdr_tmpl;
2211 	(*idt->idt_ic->ic_conn_ops.icb_build_hdr)(idt, &tmppdu,
2212 	    ISCSI_OP_SCSI_DATA_RSP);
2213 	idb->idb_tx_thread = B_TRUE;
2214 	list_insert_tail(&so_conn->ic_tx_list, (void *)idb);
2215 	cv_signal(&so_conn->ic_tx_cv);
2216 	mutex_exit(&so_conn->ic_tx_mutex);
2217 	mutex_exit(&idt->idt_mutex);
2218 
2219 	/*
2220 	 * Returning success here indicates the transfer was successfully
2221 	 * dispatched -- it does not mean that the transfer completed
2222 	 * successfully.
2223 	 */
2224 	return (IDM_STATUS_SUCCESS);
2225 }
2226 
2227 /*
2228  * The idm_so_buf_rx_from_ini() is used by the target iSCSI layer to specify the
2229  * data blocks it is ready to receive from the initiator in response to a WRITE
2230  * SCSI command. The target iSCSI layer passes the information about the desired
2231  * data blocks to the initiator in one R2T PDU. The receiving buffer, the buffer
2232  * offset and datalen are passed via the 'idb' argument.
2233  *
2234  * Scope for Prototype build:
2235  * R2Ts are required for any Data-Out PDU, i.e. initiator and target must have
2236  * negotiated the "InitialR2T" to "Yes".
2237  *
2238  * Caller holds idt->idt_mutex
2239  */
2240 static idm_status_t
2241 idm_so_buf_rx_from_ini(idm_task_t *idt, idm_buf_t *idb)
2242 {
2243 	idm_pdu_t		*pdu;
2244 	iscsi_rtt_hdr_t		*rtt;
2245 
2246 	ASSERT(mutex_owned(&idt->idt_mutex));
2247 
2248 	DTRACE_ISCSI_8(xfer__start, idm_conn_t *, idt->idt_ic,
2249 	    uintptr_t, idb->idb_buf, uint32_t, idb->idb_bufoffset,
2250 	    uint64_t, 0, uint32_t, 0, uint32_t, 0,
2251 	    uint32_t, idb->idb_xfer_len, int, XFER_BUF_RX_FROM_INI);
2252 
2253 	pdu = kmem_cache_alloc(idm.idm_sotx_pdu_cache, KM_SLEEP);
2254 	pdu->isp_ic = idt->idt_ic;
2255 	bzero(pdu->isp_hdr, sizeof (iscsi_rtt_hdr_t));
2256 
2257 	/* iSCSI layer fills the TTT, ITT, StatSN, ExpCmdSN, MaxCmdSN */
2258 	(*idt->idt_ic->ic_conn_ops.icb_build_hdr)(idt, pdu, ISCSI_OP_RTT_RSP);
2259 
2260 	/* set the rttsn, rtt.flags, rtt.data_offset and rtt.data_length */
2261 	rtt = (iscsi_rtt_hdr_t *)(pdu->isp_hdr);
2262 
2263 	rtt->opcode		= ISCSI_OP_RTT_RSP;
2264 	rtt->flags		= ISCSI_FLAG_FINAL;
2265 	rtt->data_offset	= htonl(idb->idb_bufoffset);
2266 	rtt->data_length	= htonl(idb->idb_xfer_len);
2267 	rtt->rttsn		= htonl(idt->idt_exp_rttsn++);
2268 
2269 	/* Keep track of buffer offsets */
2270 	idb->idb_exp_offset	= idb->idb_bufoffset;
2271 	mutex_exit(&idt->idt_mutex);
2272 
2273 	/*
2274 	 * Transmit the PDU.
2275 	 */
2276 	idm_pdu_tx(pdu);
2277 
2278 	return (IDM_STATUS_SUCCESS);
2279 }
2280 
2281 static idm_status_t
2282 idm_so_buf_alloc(idm_buf_t *idb, uint64_t buflen)
2283 {
2284 	if ((buflen > IDM_SO_BUF_CACHE_LB) && (buflen <= IDM_SO_BUF_CACHE_UB)) {
2285 		idb->idb_buf = kmem_cache_alloc(idm.idm_so_128k_buf_cache,
2286 		    KM_NOSLEEP);
2287 		idb->idb_buf_private = idm.idm_so_128k_buf_cache;
2288 	} else {
2289 		idb->idb_buf = kmem_alloc(buflen, KM_NOSLEEP);
2290 		idb->idb_buf_private = NULL;
2291 	}
2292 
2293 	if (idb->idb_buf == NULL) {
2294 		IDM_CONN_LOG(CE_NOTE,
2295 		    "idm_so_buf_alloc: failed buffer allocation");
2296 		return (IDM_STATUS_FAIL);
2297 	}
2298 
2299 	return (IDM_STATUS_SUCCESS);
2300 }
2301 
2302 /* ARGSUSED */
2303 static idm_status_t
2304 idm_so_buf_setup(idm_buf_t *idb)
2305 {
2306 	/* Ensure bufalloc'd flag is unset */
2307 	idb->idb_bufalloc = B_FALSE;
2308 
2309 	return (IDM_STATUS_SUCCESS);
2310 }
2311 
2312 /* ARGSUSED */
2313 static void
2314 idm_so_buf_teardown(idm_buf_t *idb)
2315 {
2316 	/* nothing to do here */
2317 }
2318 
2319 static void
2320 idm_so_buf_free(idm_buf_t *idb)
2321 {
2322 	if (idb->idb_buf_private == NULL) {
2323 		kmem_free(idb->idb_buf, idb->idb_buflen);
2324 	} else {
2325 		kmem_cache_free(idb->idb_buf_private, idb->idb_buf);
2326 	}
2327 }
2328 
2329 static void
2330 idm_so_send_rtt_data(idm_conn_t *ic, idm_task_t *idt, idm_buf_t *idb,
2331     uint32_t offset, uint32_t length)
2332 {
2333 	idm_so_conn_t	*so_conn = ic->ic_transport_private;
2334 	idm_pdu_t	tmppdu;
2335 	idm_buf_t	*rtt_buf;
2336 
2337 	ASSERT(mutex_owned(&idt->idt_mutex));
2338 
2339 	/*
2340 	 * Allocate a buffer to represent the RTT transfer.  We could further
2341 	 * optimize this by allocating the buffers internally from an rtt
2342 	 * specific buffer cache since this is socket-specific code but for
2343 	 * now we will keep it simple.
2344 	 */
2345 	rtt_buf = idm_buf_alloc(ic, (uint8_t *)idb->idb_buf + offset, length);
2346 	if (rtt_buf == NULL) {
2347 		/*
2348 		 * If we're in FFP then the failure was likely a resource
2349 		 * allocation issue and we should close the connection by
2350 		 * sending a CE_TRANSPORT_FAIL event.
2351 		 *
2352 		 * If we're not in FFP then idm_buf_alloc will always
2353 		 * fail and the state is transitioning to "complete" anyway
2354 		 * so we won't bother to send an event.
2355 		 */
2356 		mutex_enter(&ic->ic_state_mutex);
2357 		if (ic->ic_ffp)
2358 			idm_conn_event_locked(ic, CE_TRANSPORT_FAIL,
2359 			    NULL, CT_NONE);
2360 		mutex_exit(&ic->ic_state_mutex);
2361 		return;
2362 	}
2363 
2364 	rtt_buf->idb_buf_cb = NULL;
2365 	rtt_buf->idb_cb_arg = NULL;
2366 	rtt_buf->idb_bufoffset = offset;
2367 	rtt_buf->idb_xfer_len = length;
2368 	rtt_buf->idb_ic = idt->idt_ic;
2369 	rtt_buf->idb_task_binding = idt;
2370 
2371 	/*
2372 	 * Put the idm_buf_t on the tx queue.  It will be transmitted by
2373 	 * idm_sotx_thread.
2374 	 */
2375 	mutex_enter(&so_conn->ic_tx_mutex);
2376 
2377 	if (!so_conn->ic_tx_thread_running) {
2378 		idm_buf_free(rtt_buf);
2379 		mutex_exit(&so_conn->ic_tx_mutex);
2380 		return;
2381 	}
2382 
2383 	/*
2384 	 * This new buffer represents an additional reference on the task
2385 	 */
2386 	idm_task_hold(idt);
2387 
2388 	/*
2389 	 * Build a template for the data PDU headers we will use so that
2390 	 * the SN values will stay consistent with other PDU's we are
2391 	 * transmitting like R2T and SCSI status.
2392 	 */
2393 	bzero(&rtt_buf->idb_data_hdr_tmpl, sizeof (iscsi_hdr_t));
2394 	tmppdu.isp_hdr = &rtt_buf->idb_data_hdr_tmpl;
2395 	(*idt->idt_ic->ic_conn_ops.icb_build_hdr)(idt, &tmppdu,
2396 	    ISCSI_OP_SCSI_DATA);
2397 	rtt_buf->idb_tx_thread = B_TRUE;
2398 	rtt_buf->idb_in_transport = B_TRUE;
2399 	list_insert_tail(&so_conn->ic_tx_list, (void *)rtt_buf);
2400 	cv_signal(&so_conn->ic_tx_cv);
2401 	mutex_exit(&so_conn->ic_tx_mutex);
2402 }
2403 
2404 static void
2405 idm_so_send_rtt_data_done(idm_task_t *idt, idm_buf_t *idb)
2406 {
2407 	/*
2408 	 * Don't worry about status -- we assume any error handling
2409 	 * is performed by the caller (idm_sotx_thread).
2410 	 */
2411 	idb->idb_in_transport = B_FALSE;
2412 	idm_task_rele(idt);
2413 	idm_buf_free(idb);
2414 }
2415 
2416 static idm_status_t
2417 idm_so_send_buf_region(idm_task_t *idt, idm_buf_t *idb,
2418     uint32_t buf_region_offset, uint32_t buf_region_length)
2419 {
2420 	idm_conn_t		*ic;
2421 	uint32_t		max_dataseglen;
2422 	size_t			remainder, chunk;
2423 	uint32_t		data_offset = buf_region_offset;
2424 	iscsi_data_hdr_t	*bhs;
2425 	idm_pdu_t		*pdu;
2426 	idm_status_t		tx_status;
2427 
2428 	ASSERT(mutex_owned(&idt->idt_mutex));
2429 
2430 	ic = idt->idt_ic;
2431 
2432 	max_dataseglen = 8192; /* Need value from login negotiation */
2433 	remainder = buf_region_length;
2434 
2435 	while (remainder) {
2436 		if (idt->idt_state != TASK_ACTIVE) {
2437 			ASSERT((idt->idt_state != TASK_IDLE) &&
2438 			    (idt->idt_state != TASK_COMPLETE));
2439 			return (IDM_STATUS_ABORTED);
2440 		}
2441 
2442 		/* check to see if we need to chunk the data */
2443 		if (remainder > max_dataseglen) {
2444 			chunk = max_dataseglen;
2445 		} else {
2446 			chunk = remainder;
2447 		}
2448 
2449 		/* Data PDU headers will always be sizeof (iscsi_hdr_t) */
2450 		pdu = kmem_cache_alloc(idm.idm_sotx_pdu_cache, KM_SLEEP);
2451 		pdu->isp_ic = ic;
2452 
2453 		/*
2454 		 * We've already built a build a header template
2455 		 * to use during the transfer.  Use this template so that
2456 		 * the SN values stay consistent with any unrelated PDU's
2457 		 * being transmitted.
2458 		 */
2459 		bcopy(&idb->idb_data_hdr_tmpl, pdu->isp_hdr,
2460 		    sizeof (iscsi_hdr_t));
2461 
2462 		/*
2463 		 * Set DataSN, data offset, and flags in BHS
2464 		 * For the prototype build, A = 0, S = 0, U = 0
2465 		 */
2466 		bhs = (iscsi_data_hdr_t *)(pdu->isp_hdr);
2467 
2468 		bhs->datasn		= htonl(idt->idt_exp_datasn++);
2469 
2470 		hton24(bhs->dlength, chunk);
2471 		bhs->offset = htonl(idb->idb_bufoffset + data_offset);
2472 
2473 		if (chunk == remainder) {
2474 			bhs->flags = ISCSI_FLAG_FINAL; /* F bit set to 1 */
2475 		}
2476 
2477 		/* Instrument the data-send DTrace probe. */
2478 		if (IDM_PDU_OPCODE(pdu) == ISCSI_OP_SCSI_DATA_RSP) {
2479 			DTRACE_ISCSI_2(data__send,
2480 			    idm_conn_t *, idt->idt_ic,
2481 			    iscsi_data_rsp_hdr_t *,
2482 			    (iscsi_data_rsp_hdr_t *)pdu->isp_hdr);
2483 		}
2484 		/* setup data */
2485 		pdu->isp_data	=  (uint8_t *)idb->idb_buf + data_offset;
2486 		pdu->isp_datalen = (uint_t)chunk;
2487 		remainder	-= chunk;
2488 		data_offset	+= chunk;
2489 
2490 		/*
2491 		 * Now that we're done working with idt_exp_datasn,
2492 		 * idt->idt_state and idb->idb_bufoffset we can release
2493 		 * the task lock -- don't want to hold it across the
2494 		 * call to idm_i_so_tx since we could block.
2495 		 */
2496 		mutex_exit(&idt->idt_mutex);
2497 
2498 		/*
2499 		 * Transmit the PDU.  Call the internal routine directly
2500 		 * as there is already implicit ordering.
2501 		 */
2502 		if ((tx_status = idm_i_so_tx(pdu)) != IDM_STATUS_SUCCESS) {
2503 			mutex_enter(&idt->idt_mutex);
2504 			return (tx_status);
2505 		}
2506 
2507 		mutex_enter(&idt->idt_mutex);
2508 		idt->idt_tx_bytes += chunk;
2509 	}
2510 
2511 	return (IDM_STATUS_SUCCESS);
2512 }
2513 
2514 /*
2515  * TX PDU cache
2516  */
2517 /* ARGSUSED */
2518 int
2519 idm_sotx_pdu_constructor(void *hdl, void *arg, int flags)
2520 {
2521 	idm_pdu_t	*pdu = hdl;
2522 
2523 	bzero(pdu, sizeof (idm_pdu_t));
2524 	pdu->isp_hdr = (iscsi_hdr_t *)(pdu + 1); /* Ptr arithmetic */
2525 	pdu->isp_hdrlen = sizeof (iscsi_hdr_t);
2526 	pdu->isp_callback = idm_sotx_cache_pdu_cb;
2527 	pdu->isp_magic = IDM_PDU_MAGIC;
2528 	bzero(pdu->isp_hdr, sizeof (iscsi_hdr_t));
2529 
2530 	return (0);
2531 }
2532 
2533 /* ARGSUSED */
2534 void
2535 idm_sotx_cache_pdu_cb(idm_pdu_t *pdu, idm_status_t status)
2536 {
2537 	/* reset values between use */
2538 	pdu->isp_datalen = 0;
2539 
2540 	kmem_cache_free(idm.idm_sotx_pdu_cache, pdu);
2541 }
2542 
2543 /*
2544  * RX PDU cache
2545  */
2546 /* ARGSUSED */
2547 int
2548 idm_sorx_pdu_constructor(void *hdl, void *arg, int flags)
2549 {
2550 	idm_pdu_t	*pdu = hdl;
2551 
2552 	bzero(pdu, sizeof (idm_pdu_t));
2553 	pdu->isp_magic = IDM_PDU_MAGIC;
2554 	pdu->isp_hdr = (iscsi_hdr_t *)(pdu + 1); /* Ptr arithmetic */
2555 	pdu->isp_callback = idm_sorx_cache_pdu_cb;
2556 
2557 	return (0);
2558 }
2559 
2560 /* ARGSUSED */
2561 static void
2562 idm_sorx_cache_pdu_cb(idm_pdu_t *pdu, idm_status_t status)
2563 {
2564 	pdu->isp_iovlen = 0;
2565 	pdu->isp_sorx_buf = 0;
2566 	kmem_cache_free(idm.idm_sorx_pdu_cache, pdu);
2567 }
2568 
2569 static void
2570 idm_sorx_addl_pdu_cb(idm_pdu_t *pdu, idm_status_t status)
2571 {
2572 	/*
2573 	 * We had to modify our cached RX PDU with a longer header buffer
2574 	 * and/or a longer data buffer.  Release the new buffers and fix
2575 	 * the fields back to what we would expect for a cached RX PDU.
2576 	 */
2577 	if (pdu->isp_flags & IDM_PDU_ADDL_HDR) {
2578 		kmem_free(pdu->isp_hdr, pdu->isp_hdrlen);
2579 	}
2580 	if (pdu->isp_flags & IDM_PDU_ADDL_DATA) {
2581 		kmem_free(pdu->isp_data, pdu->isp_datalen);
2582 	}
2583 	pdu->isp_hdr = (iscsi_hdr_t *)(pdu + 1);
2584 	pdu->isp_hdrlen = sizeof (iscsi_hdr_t);
2585 	pdu->isp_data = NULL;
2586 	pdu->isp_datalen = 0;
2587 	pdu->isp_sorx_buf = 0;
2588 	pdu->isp_callback = idm_sorx_cache_pdu_cb;
2589 	idm_sorx_cache_pdu_cb(pdu, status);
2590 }
2591 
2592 /*
2593  * This thread is only active when I/O is queued for transmit
2594  * because the socket is busy.
2595  */
2596 void
2597 idm_sotx_thread(void *arg)
2598 {
2599 	idm_conn_t	*ic = arg;
2600 	idm_tx_obj_t	*object, *next;
2601 	idm_so_conn_t	*so_conn;
2602 	idm_status_t	status = IDM_STATUS_SUCCESS;
2603 
2604 	idm_conn_hold(ic);
2605 
2606 	mutex_enter(&ic->ic_mutex);
2607 	so_conn = ic->ic_transport_private;
2608 	so_conn->ic_tx_thread_running = B_TRUE;
2609 	so_conn->ic_tx_thread_did = so_conn->ic_tx_thread->t_did;
2610 	cv_signal(&ic->ic_cv);
2611 	mutex_exit(&ic->ic_mutex);
2612 
2613 	mutex_enter(&so_conn->ic_tx_mutex);
2614 
2615 	while (so_conn->ic_tx_thread_running) {
2616 		while (list_is_empty(&so_conn->ic_tx_list)) {
2617 			DTRACE_PROBE1(soconn__tx__sleep, idm_conn_t *, ic);
2618 			cv_wait(&so_conn->ic_tx_cv, &so_conn->ic_tx_mutex);
2619 			DTRACE_PROBE1(soconn__tx__wakeup, idm_conn_t *, ic);
2620 
2621 			if (!so_conn->ic_tx_thread_running) {
2622 				goto tx_bail;
2623 			}
2624 		}
2625 
2626 		object = (idm_tx_obj_t *)list_head(&so_conn->ic_tx_list);
2627 		list_remove(&so_conn->ic_tx_list, object);
2628 		mutex_exit(&so_conn->ic_tx_mutex);
2629 
2630 		switch (object->idm_tx_obj_magic) {
2631 		case IDM_PDU_MAGIC:
2632 			DTRACE_PROBE2(soconn__tx__pdu, idm_conn_t *, ic,
2633 			    idm_pdu_t *, (idm_pdu_t *)object);
2634 
2635 			status = idm_i_so_tx((idm_pdu_t *)object);
2636 			break;
2637 
2638 		case IDM_BUF_MAGIC: {
2639 			idm_buf_t *idb = (idm_buf_t *)object;
2640 			idm_task_t *idt = idb->idb_task_binding;
2641 
2642 			DTRACE_PROBE2(soconn__tx__buf, idm_conn_t *, ic,
2643 			    idm_buf_t *, idb);
2644 
2645 			mutex_enter(&idt->idt_mutex);
2646 			status = idm_so_send_buf_region(idt,
2647 			    idb, 0, idb->idb_xfer_len);
2648 
2649 			/*
2650 			 * TX thread owns the buffer so we expect it to
2651 			 * be "in transport"
2652 			 */
2653 			ASSERT(idb->idb_in_transport);
2654 			if (IDM_CONN_ISTGT(ic)) {
2655 				/*
2656 				 * idm_buf_tx_to_ini_done releases
2657 				 * idt->idt_mutex
2658 				 */
2659 				DTRACE_ISCSI_8(xfer__done,
2660 				    idm_conn_t *, idt->idt_ic,
2661 				    uintptr_t, idb->idb_buf,
2662 				    uint32_t, idb->idb_bufoffset,
2663 				    uint64_t, 0, uint32_t, 0, uint32_t, 0,
2664 				    uint32_t, idb->idb_xfer_len,
2665 				    int, XFER_BUF_TX_TO_INI);
2666 				idm_buf_tx_to_ini_done(idt, idb, status);
2667 			} else {
2668 				idm_so_send_rtt_data_done(idt, idb);
2669 				mutex_exit(&idt->idt_mutex);
2670 			}
2671 			break;
2672 		}
2673 
2674 		default:
2675 			IDM_CONN_LOG(CE_WARN, "idm_sotx_thread: Unknown magic "
2676 			    "(0x%08x)", object->idm_tx_obj_magic);
2677 			status = IDM_STATUS_FAIL;
2678 		}
2679 
2680 		mutex_enter(&so_conn->ic_tx_mutex);
2681 
2682 		if (status != IDM_STATUS_SUCCESS) {
2683 			so_conn->ic_tx_thread_running = B_FALSE;
2684 			idm_conn_event(ic, CE_TRANSPORT_FAIL, status);
2685 		}
2686 	}
2687 
2688 	/*
2689 	 * Before we leave, we need to abort every item remaining in the
2690 	 * TX list.
2691 	 */
2692 
2693 tx_bail:
2694 	object = (idm_tx_obj_t *)list_head(&so_conn->ic_tx_list);
2695 
2696 	while (object != NULL) {
2697 		next = list_next(&so_conn->ic_tx_list, object);
2698 
2699 		list_remove(&so_conn->ic_tx_list, object);
2700 		switch (object->idm_tx_obj_magic) {
2701 		case IDM_PDU_MAGIC:
2702 			idm_pdu_complete((idm_pdu_t *)object,
2703 			    IDM_STATUS_ABORTED);
2704 			break;
2705 
2706 		case IDM_BUF_MAGIC: {
2707 			idm_buf_t *idb = (idm_buf_t *)object;
2708 			idm_task_t *idt = idb->idb_task_binding;
2709 			mutex_exit(&so_conn->ic_tx_mutex);
2710 			mutex_enter(&idt->idt_mutex);
2711 			/*
2712 			 * TX thread owns the buffer so we expect it to
2713 			 * be "in transport"
2714 			 */
2715 			ASSERT(idb->idb_in_transport);
2716 			if (IDM_CONN_ISTGT(ic)) {
2717 				/*
2718 				 * idm_buf_tx_to_ini_done releases
2719 				 * idt->idt_mutex
2720 				 */
2721 				DTRACE_ISCSI_8(xfer__done,
2722 				    idm_conn_t *, idt->idt_ic,
2723 				    uintptr_t, idb->idb_buf,
2724 				    uint32_t, idb->idb_bufoffset,
2725 				    uint64_t, 0, uint32_t, 0, uint32_t, 0,
2726 				    uint32_t, idb->idb_xfer_len,
2727 				    int, XFER_BUF_TX_TO_INI);
2728 				idm_buf_tx_to_ini_done(idt, idb,
2729 				    IDM_STATUS_ABORTED);
2730 			} else {
2731 				idm_so_send_rtt_data_done(idt, idb);
2732 				mutex_exit(&idt->idt_mutex);
2733 			}
2734 			mutex_enter(&so_conn->ic_tx_mutex);
2735 			break;
2736 		}
2737 		default:
2738 			IDM_CONN_LOG(CE_WARN,
2739 			    "idm_sotx_thread: Unexpected magic "
2740 			    "(0x%08x)", object->idm_tx_obj_magic);
2741 		}
2742 
2743 		object = next;
2744 	}
2745 
2746 	mutex_exit(&so_conn->ic_tx_mutex);
2747 	idm_conn_rele(ic);
2748 	thread_exit();
2749 	/*NOTREACHED*/
2750 }
2751