xref: /titanic_51/usr/src/uts/common/io/idm/idm_so.c (revision 356a8421a012386337e4aa0d431112a0d9395744)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #include <sys/conf.h>
27 #include <sys/stat.h>
28 #include <sys/file.h>
29 #include <sys/ddi.h>
30 #include <sys/sunddi.h>
31 #include <sys/modctl.h>
32 #include <sys/priv.h>
33 #include <sys/cpuvar.h>
34 #include <sys/socket.h>
35 #include <sys/strsubr.h>
36 #include <sys/sysmacros.h>
37 #include <sys/sdt.h>
38 #include <netinet/tcp.h>
39 #include <inet/tcp.h>
40 #include <sys/socketvar.h>
41 #include <sys/pathname.h>
42 #include <sys/fs/snode.h>
43 #include <sys/fs/dv_node.h>
44 #include <sys/vnode.h>
45 #include <netinet/in.h>
46 #include <net/if.h>
47 #include <sys/sockio.h>
48 #include <sys/ksocket.h>
49 #include <sys/idm/idm.h>
50 #include <sys/idm/idm_so.h>
51 #include <sys/idm/idm_text.h>
52 
/*
 * Definition of the IPv6 wildcard address; idm_so_tgt_svc_online() binds
 * the target listener to it.  in6addr_any is currently all zeroes, but use
 * the macro in case this ever changes.
 */
const struct in6_addr in6addr_any = IN6ADDR_ANY_INIT;
58 
/*
 * PDU callbacks.  idm_sorx_addl_pdu_cb is installed as pdu->isp_callback by
 * idm_sorecvhdr() when an oversized header buffer had to be allocated.
 */
static void idm_sorx_cache_pdu_cb(idm_pdu_t *pdu, idm_status_t status);
static void idm_sorx_addl_pdu_cb(idm_pdu_t *pdu, idm_status_t status);
static void idm_sotx_cache_pdu_cb(idm_pdu_t *pdu, idm_status_t status);

/*
 * Connection setup/teardown code shared by the initiator and target paths
 */
static idm_status_t idm_so_conn_create_common(idm_conn_t *ic, ksocket_t new_so);
static void idm_so_conn_destroy_common(idm_conn_t *ic);
static void idm_so_conn_connect_common(idm_conn_t *ic);

/*
 * Socket option helpers and the internal PDU transmit routine
 */
static void idm_set_ini_preconnect_options(idm_so_conn_t *sc);
static void idm_set_ini_postconnect_options(idm_so_conn_t *sc);
static void idm_set_tgt_connect_options(ksocket_t so);
static idm_status_t idm_i_so_tx(idm_pdu_t *pdu);

/*
 * Data receive/transmit helpers
 */
static idm_status_t idm_sorecvdata(idm_conn_t *ic, idm_pdu_t *pdu);
static void idm_so_send_rtt_data(idm_conn_t *ic, idm_task_t *idt,
    idm_buf_t *idb, uint32_t offset, uint32_t length);
static void idm_so_send_rtt_data_done(idm_task_t *idt, idm_buf_t *idb);
static idm_status_t idm_so_send_buf_region(idm_task_t *idt,
    idm_buf_t *idb, uint32_t buf_region_offset, uint32_t buf_region_length);

static uint32_t idm_fill_iov(idm_pdu_t *pdu, idm_buf_t *idb,
    uint32_t ro, uint32_t dlength);

static idm_status_t idm_so_handle_digest(idm_conn_t *it,
    nvpair_t *digest_choice, const idm_kv_xlate_t *ikvx);

/*
 * Transport ops prototypes.  These are the functions referenced by the
 * idm_so_transport_ops table.
 */
static void idm_so_tx(idm_conn_t *ic, idm_pdu_t *pdu);
static idm_status_t idm_so_buf_tx_to_ini(idm_task_t *idt, idm_buf_t *idb);
static idm_status_t idm_so_buf_rx_from_ini(idm_task_t *idt, idm_buf_t *idb);
static void idm_so_rx_datain(idm_conn_t *ic, idm_pdu_t *pdu);
static void idm_so_rx_rtt(idm_conn_t *ic, idm_pdu_t *pdu);
static void idm_so_rx_dataout(idm_conn_t *ic, idm_pdu_t *pdu);
static idm_status_t idm_so_free_task_rsrc(idm_task_t *idt);
static kv_status_t idm_so_negotiate_key_values(idm_conn_t *it,
    nvlist_t *request_nvl, nvlist_t *response_nvl, nvlist_t *negotiated_nvl);
static void idm_so_notice_key_values(idm_conn_t *it,
    nvlist_t *negotiated_nvl);
static boolean_t idm_so_conn_is_capable(idm_conn_req_t *ic,
    idm_transport_caps_t *caps);
static idm_status_t idm_so_buf_alloc(idm_buf_t *idb, uint64_t buflen);
static void idm_so_buf_free(idm_buf_t *idb);
static idm_status_t idm_so_buf_setup(idm_buf_t *idb);
static void idm_so_buf_teardown(idm_buf_t *idb);
static idm_status_t idm_so_tgt_svc_create(idm_svc_req_t *sr, idm_svc_t *is);
static void idm_so_tgt_svc_destroy(idm_svc_t *is);
static idm_status_t idm_so_tgt_svc_online(idm_svc_t *is);
static void idm_so_tgt_svc_offline(idm_svc_t *is);
static void idm_so_tgt_conn_destroy(idm_conn_t *ic);
static idm_status_t idm_so_tgt_conn_connect(idm_conn_t *ic);
static void idm_so_conn_disconnect(idm_conn_t *ic);
static idm_status_t idm_so_ini_conn_create(idm_conn_req_t *cr, idm_conn_t *ic);
static void idm_so_ini_conn_destroy(idm_conn_t *ic);
static idm_status_t idm_so_ini_conn_connect(idm_conn_t *ic);
115 
/*
 * IDM Native Sockets transport operations
 *
 * This is a positional initializer: every entry is annotated with the name
 * of the idm_transport_ops_t member it fills, so the order of the entries
 * must follow the member order of that structure.  NULL entries are
 * operations for which this transport supplies no function.
 * idm_so_conn_disconnect() is used for both the target and the initiator
 * disconnect operation.  idm_so_init() publishes this table via it->it_ops.
 */
static
idm_transport_ops_t idm_so_transport_ops = {
	idm_so_tx,			/* it_tx_pdu */
	idm_so_buf_tx_to_ini,		/* it_buf_tx_to_ini */
	idm_so_buf_rx_from_ini,		/* it_buf_rx_from_ini */
	idm_so_rx_datain,		/* it_rx_datain */
	idm_so_rx_rtt,			/* it_rx_rtt */
	idm_so_rx_dataout,		/* it_rx_dataout */
	NULL,				/* it_alloc_conn_rsrc */
	NULL,				/* it_free_conn_rsrc */
	NULL,				/* it_tgt_enable_datamover */
	NULL,				/* it_ini_enable_datamover */
	NULL,				/* it_conn_terminate */
	idm_so_free_task_rsrc,		/* it_free_task_rsrc */
	idm_so_negotiate_key_values,	/* it_negotiate_key_values */
	idm_so_notice_key_values,	/* it_notice_key_values */
	idm_so_conn_is_capable,		/* it_conn_is_capable */
	idm_so_buf_alloc,		/* it_buf_alloc */
	idm_so_buf_free,		/* it_buf_free */
	idm_so_buf_setup,		/* it_buf_setup */
	idm_so_buf_teardown,		/* it_buf_teardown */
	idm_so_tgt_svc_create,		/* it_tgt_svc_create */
	idm_so_tgt_svc_destroy,		/* it_tgt_svc_destroy */
	idm_so_tgt_svc_online,		/* it_tgt_svc_online */
	idm_so_tgt_svc_offline,		/* it_tgt_svc_offline */
	idm_so_tgt_conn_destroy,	/* it_tgt_conn_destroy */
	idm_so_tgt_conn_connect,	/* it_tgt_conn_connect */
	idm_so_conn_disconnect,		/* it_tgt_conn_disconnect */
	idm_so_ini_conn_create,		/* it_ini_conn_create */
	idm_so_ini_conn_destroy,	/* it_ini_conn_destroy */
	idm_so_ini_conn_connect,	/* it_ini_conn_connect */
	idm_so_conn_disconnect		/* it_ini_conn_disconnect */
};
152 
153 /*
154  * idm_so_init()
155  * Sockets transport initialization
156  */
157 void
158 idm_so_init(idm_transport_t *it)
159 {
160 	/* Cache for IDM Data and R2T Transmit PDU's */
161 	idm.idm_sotx_pdu_cache = kmem_cache_create("idm_tx_pdu_cache",
162 	    sizeof (idm_pdu_t) + sizeof (iscsi_hdr_t), 8,
163 	    &idm_sotx_pdu_constructor, NULL, NULL, NULL, NULL, KM_SLEEP);
164 
165 	/* Cache for IDM Receive PDU's */
166 	idm.idm_sorx_pdu_cache = kmem_cache_create("idm_rx_pdu_cache",
167 	    sizeof (idm_pdu_t) + IDM_SORX_CACHE_HDRLEN, 8,
168 	    &idm_sorx_pdu_constructor, NULL, NULL, NULL, NULL, KM_SLEEP);
169 
170 	/* Set the sockets transport ops */
171 	it->it_ops = &idm_so_transport_ops;
172 }
173 
174 /*
175  * idm_so_fini()
176  * Sockets transport teardown
177  */
178 void
179 idm_so_fini(void)
180 {
181 	kmem_cache_destroy(idm.idm_sotx_pdu_cache);
182 	kmem_cache_destroy(idm.idm_sorx_pdu_cache);
183 }
184 
185 ksocket_t
186 idm_socreate(int domain, int type, int protocol)
187 {
188 	ksocket_t ks;
189 
190 	if (!ksocket_socket(&ks, domain, type, protocol, KSOCKET_NOSLEEP,
191 	    CRED())) {
192 		return (ks);
193 	} else {
194 		return (NULL);
195 	}
196 }
197 
/*
 * idm_soshutdown will disconnect the socket and prevent subsequent PDU
 * reception and transmission (both directions are shut down: SHUT_RDWR).
 * The sonode still exists but its state gets modified to indicate it is no
 * longer connected.  Calls to idm_sorecv/idm_iov_sorecv will return, so
 * idm_soshutdown can be used to regain control of a thread stuck in
 * idm_sorecv.  Any error from ksocket_shutdown() is deliberately ignored.
 */
void
idm_soshutdown(ksocket_t so)
{
	(void) ksocket_shutdown(so, SHUT_RDWR, CRED());
}
210 
/*
 * idm_sodestroy releases all resources associated with a socket previously
 * created with idm_socreate.  The socket must be shutdown using
 * idm_soshutdown before the socket is destroyed with idm_sodestroy,
 * otherwise undefined behavior will result.  Any error from ksocket_close()
 * is ignored.
 *
 * NOTE(review): several callers in this file destroy a socket without
 * shutting it down first (e.g. the bind-failure paths and
 * idm_so_tgt_svc_offline()); confirm whether the shutdown requirement
 * applies only to connected sockets.
 */
void
idm_sodestroy(ksocket_t ks)
{
	(void) ksocket_close(ks, CRED());
}
222 
223 /*
224  * IP address filter functions to flag addresses that should not
225  * go out to initiators through discovery.
226  */
227 static boolean_t
228 idm_v4_addr_okay(struct in_addr *in_addr)
229 {
230 	in_addr_t addr = ntohl(in_addr->s_addr);
231 
232 	if ((INADDR_NONE == addr) ||
233 	    (IN_MULTICAST(addr)) ||
234 	    ((addr >> IN_CLASSA_NSHIFT) == 0) ||
235 	    ((addr >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET)) {
236 		return (B_FALSE);
237 	}
238 	return (B_TRUE);
239 }
240 
241 static boolean_t
242 idm_v6_addr_okay(struct in6_addr *addr6)
243 {
244 
245 	if ((IN6_IS_ADDR_UNSPECIFIED(addr6)) ||
246 	    (IN6_IS_ADDR_LOOPBACK(addr6)) ||
247 	    (IN6_IS_ADDR_MULTICAST(addr6)) ||
248 	    (IN6_IS_ADDR_V4MAPPED(addr6)) ||
249 	    (IN6_IS_ADDR_V4COMPAT(addr6)) ||
250 	    (IN6_IS_ADDR_LINKLOCAL(addr6))) {
251 		return (B_FALSE);
252 	}
253 	return (B_TRUE);
254 }
255 
256 /*
257  * idm_get_ipaddr will retrieve a list of IP Addresses which the host is
258  * configured with by sending down a sequence of kernel ioctl to IP STREAMS.
259  */
260 int
261 idm_get_ipaddr(idm_addr_list_t **ipaddr_p)
262 {
263 	ksocket_t 		so4, so6;
264 	struct lifnum		lifn;
265 	struct lifconf		lifc;
266 	struct lifreq		*lp;
267 	int			rval;
268 	int			numifs;
269 	int			bufsize;
270 	void			*buf;
271 	int			i, j, n, rc;
272 	struct sockaddr_storage	ss;
273 	struct sockaddr_in	*sin;
274 	struct sockaddr_in6	*sin6;
275 	idm_addr_t		*ip;
276 	idm_addr_list_t		*ipaddr;
277 	int			size_ipaddr;
278 
279 	*ipaddr_p = NULL;
280 	size_ipaddr = 0;
281 	buf = NULL;
282 
283 	/* create an ipv4 and ipv6 UDP socket */
284 	if ((so6 = idm_socreate(PF_INET6, SOCK_DGRAM, 0)) == NULL)
285 		return (0);
286 	if ((so4 = idm_socreate(PF_INET, SOCK_DGRAM, 0)) == NULL) {
287 		idm_sodestroy(so6);
288 		return (0);
289 	}
290 
291 
292 retry_count:
293 	/* snapshot the current number of interfaces */
294 	lifn.lifn_family = PF_UNSPEC;
295 	lifn.lifn_flags = LIFC_NOXMIT | LIFC_TEMPORARY | LIFC_ALLZONES;
296 	lifn.lifn_count = 0;
297 	/* use vp6 for ioctls with unspecified families by default */
298 	if (ksocket_ioctl(so6, SIOCGLIFNUM, (intptr_t)&lifn, &rval, CRED())
299 	    != 0) {
300 		goto cleanup;
301 	}
302 
303 	numifs = lifn.lifn_count;
304 	if (numifs <= 0) {
305 		goto cleanup;
306 	}
307 
308 	/* allocate extra room in case more interfaces appear */
309 	numifs += 10;
310 
311 	/* get the interface names and ip addresses */
312 	bufsize = numifs * sizeof (struct lifreq);
313 	buf = kmem_alloc(bufsize, KM_SLEEP);
314 
315 	lifc.lifc_family = AF_UNSPEC;
316 	lifc.lifc_flags = LIFC_NOXMIT | LIFC_TEMPORARY | LIFC_ALLZONES;
317 	lifc.lifc_len = bufsize;
318 	lifc.lifc_buf = buf;
319 	rc = ksocket_ioctl(so6, SIOCGLIFCONF, (intptr_t)&lifc, &rval, CRED());
320 	if (rc != 0) {
321 		goto cleanup;
322 	}
323 	/* if our extra room is used up, try again */
324 	if (bufsize <= lifc.lifc_len) {
325 		kmem_free(buf, bufsize);
326 		buf = NULL;
327 		goto retry_count;
328 	}
329 	/* calc actual number of ifconfs */
330 	n = lifc.lifc_len / sizeof (struct lifreq);
331 
332 	/* get ip address */
333 	if (n > 0) {
334 		size_ipaddr = sizeof (idm_addr_list_t) +
335 		    (n - 1) * sizeof (idm_addr_t);
336 		ipaddr = kmem_zalloc(size_ipaddr, KM_SLEEP);
337 	} else {
338 		goto cleanup;
339 	}
340 
341 	/*
342 	 * Examine the array of interfaces and filter uninteresting ones
343 	 */
344 	for (i = 0, j = 0, lp = lifc.lifc_req; i < n; i++, lp++) {
345 
346 		/*
347 		 * Copy the address as the SIOCGLIFFLAGS ioctl is destructive
348 		 */
349 		ss = lp->lifr_addr;
350 		/*
351 		 * fetch the flags using the socket of the correct family
352 		 */
353 		switch (ss.ss_family) {
354 		case AF_INET:
355 			rc = ksocket_ioctl(so4, SIOCGLIFFLAGS, (intptr_t)lp,
356 			    &rval, CRED());
357 			break;
358 		case AF_INET6:
359 			rc = ksocket_ioctl(so6, SIOCGLIFFLAGS, (intptr_t)lp,
360 			    &rval, CRED());
361 			break;
362 		default:
363 			continue;
364 		}
365 		if (rc == 0) {
366 			/*
367 			 * If we got the flags, skip uninteresting
368 			 * interfaces based on flags
369 			 */
370 			if ((lp->lifr_flags & IFF_UP) != IFF_UP)
371 				continue;
372 			if (lp->lifr_flags &
373 			    (IFF_ANYCAST|IFF_NOLOCAL|IFF_DEPRECATED))
374 				continue;
375 		}
376 
377 		/* save ip address */
378 		ip = &ipaddr->al_addrs[j];
379 		switch (ss.ss_family) {
380 		case AF_INET:
381 			sin = (struct sockaddr_in *)&ss;
382 			if (!idm_v4_addr_okay(&sin->sin_addr))
383 				continue;
384 			ip->a_addr.i_addr.in4 = sin->sin_addr;
385 			ip->a_addr.i_insize = sizeof (struct in_addr);
386 			break;
387 		case AF_INET6:
388 			sin6 = (struct sockaddr_in6 *)&ss;
389 			if (!idm_v6_addr_okay(&sin6->sin6_addr))
390 				continue;
391 			ip->a_addr.i_addr.in6 = sin6->sin6_addr;
392 			ip->a_addr.i_insize = sizeof (struct in6_addr);
393 			break;
394 		default:
395 			continue;
396 		}
397 		j++;
398 	}
399 
400 	if (j == 0) {
401 		/* no valid ifaddr */
402 		kmem_free(ipaddr, size_ipaddr);
403 		size_ipaddr = 0;
404 		ipaddr = NULL;
405 	} else {
406 		ipaddr->al_out_cnt = j;
407 	}
408 
409 
410 cleanup:
411 	idm_sodestroy(so6);
412 	idm_sodestroy(so4);
413 
414 	if (buf != NULL)
415 		kmem_free(buf, bufsize);
416 
417 	*ipaddr_p = ipaddr;
418 	return (size_ipaddr);
419 }
420 
421 int
422 idm_sorecv(ksocket_t so, void *msg, size_t len)
423 {
424 	iovec_t iov;
425 
426 	ASSERT(so != NULL);
427 	ASSERT(len != 0);
428 
429 	/*
430 	 * Fill in iovec and receive data
431 	 */
432 	iov.iov_base = msg;
433 	iov.iov_len = len;
434 
435 	return (idm_iov_sorecv(so, &iov, 1, len));
436 }
437 
438 /*
439  * idm_sosendto - Sends a buffered data on a non-connected socket.
440  *
441  * This function puts the data provided on the wire by calling sosendmsg.
442  * It will return only when all the data has been sent or if an error
443  * occurs.
444  *
445  * Returns 0 for success, the socket errno value if sosendmsg fails, and
446  * -1 if sosendmsg returns success but uio_resid != 0
447  */
448 int
449 idm_sosendto(ksocket_t so, void *buff, size_t len,
450     struct sockaddr *name, socklen_t namelen)
451 {
452 	struct msghdr		msg;
453 	struct iovec		iov[1];
454 	int			error;
455 	size_t			sent = 0;
456 
457 	iov[0].iov_base	= buff;
458 	iov[0].iov_len	= len;
459 
460 	/* Initialization of the message header. */
461 	bzero(&msg, sizeof (msg));
462 	msg.msg_iov	= iov;
463 	msg.msg_iovlen	= 1;
464 	msg.msg_name	= name;
465 	msg.msg_namelen	= namelen;
466 
467 	if ((error = ksocket_sendmsg(so, &msg, 0, &sent, CRED())) == 0) {
468 		/* Data sent */
469 		if (sent == len) {
470 			/* All data sent.  Success. */
471 			return (0);
472 		} else {
473 			/* Not all data was sent.  Failure */
474 			return (-1);
475 		}
476 	}
477 
478 	/* Send failed */
479 	return (error);
480 }
481 
482 /*
483  * idm_iov_sosend - Sends an iovec on a connection.
484  *
485  * This function puts the data provided on the wire by calling sosendmsg.
486  * It will return only when all the data has been sent or if an error
487  * occurs.
488  *
489  * Returns 0 for success, the socket errno value if sosendmsg fails, and
490  * -1 if sosendmsg returns success but uio_resid != 0
491  */
492 int
493 idm_iov_sosend(ksocket_t so, iovec_t *iop, int iovlen, size_t total_len)
494 {
495 	struct msghdr		msg;
496 	int			error;
497 	size_t 			sent = 0;
498 
499 	ASSERT(iop != NULL);
500 
501 	/* Initialization of the message header. */
502 	bzero(&msg, sizeof (msg));
503 	msg.msg_iov	= iop;
504 	msg.msg_iovlen	= iovlen;
505 
506 	if ((error = ksocket_sendmsg(so, &msg, 0, &sent, CRED()))
507 	    == 0) {
508 		/* Data sent */
509 		if (sent == total_len) {
510 			/* All data sent.  Success. */
511 			return (0);
512 		} else {
513 			/* Not all data was sent.  Failure */
514 			return (-1);
515 		}
516 	}
517 
518 	/* Send failed */
519 	return (error);
520 }
521 
522 /*
523  * idm_iov_sorecv - Receives an iovec from a connection
524  *
525  * This function gets the data asked for from the socket.  It will return
526  * only when all the requested data has been retrieved or if an error
527  * occurs.
528  *
529  * Returns 0 for success, the socket errno value if sorecvmsg fails, and
530  * -1 if sorecvmsg returns success but uio_resid != 0
531  */
532 int
533 idm_iov_sorecv(ksocket_t so, iovec_t *iop, int iovlen, size_t total_len)
534 {
535 	struct msghdr		msg;
536 	int			error;
537 	size_t			recv;
538 	int 			flags;
539 
540 	ASSERT(iop != NULL);
541 
542 	/* Initialization of the message header. */
543 	bzero(&msg, sizeof (msg));
544 	msg.msg_iov	= iop;
545 	msg.msg_iovlen	= iovlen;
546 	flags		= MSG_WAITALL;
547 
548 	if ((error = ksocket_recvmsg(so, &msg, flags, &recv, CRED()))
549 	    == 0) {
550 		/* Received data */
551 		if (recv == total_len) {
552 			/* All requested data received.  Success */
553 			return (0);
554 		} else {
555 			/*
556 			 * Not all data was received.  The connection has
557 			 * probably failed.
558 			 */
559 			return (-1);
560 		}
561 	}
562 
563 	/* Receive failed */
564 	return (error);
565 }
566 
567 static void
568 idm_set_ini_preconnect_options(idm_so_conn_t *sc)
569 {
570 	int	conn_abort = 10000;
571 	int	conn_notify = 2000;
572 	int	abort = 30000;
573 
574 	/* Pre-connect socket options */
575 	(void) ksocket_setsockopt(sc->ic_so, IPPROTO_TCP,
576 	    TCP_CONN_NOTIFY_THRESHOLD, (char *)&conn_notify, sizeof (int),
577 	    CRED());
578 	(void) ksocket_setsockopt(sc->ic_so, IPPROTO_TCP,
579 	    TCP_CONN_ABORT_THRESHOLD, (char *)&conn_abort, sizeof (int),
580 	    CRED());
581 	(void) ksocket_setsockopt(sc->ic_so, IPPROTO_TCP, TCP_ABORT_THRESHOLD,
582 	    (char *)&abort, sizeof (int), CRED());
583 }
584 
585 static void
586 idm_set_ini_postconnect_options(idm_so_conn_t *sc)
587 {
588 	int32_t		rcvbuf = IDM_RCVBUF_SIZE;
589 	int32_t		sndbuf = IDM_SNDBUF_SIZE;
590 	const int	on = 1;
591 
592 	/* Set postconnect options */
593 	(void) ksocket_setsockopt(sc->ic_so, IPPROTO_TCP, TCP_NODELAY,
594 	    (char *)&on, sizeof (int), CRED());
595 	(void) ksocket_setsockopt(sc->ic_so, SOL_SOCKET, SO_RCVBUF,
596 	    (char *)&rcvbuf, sizeof (int), CRED());
597 	(void) ksocket_setsockopt(sc->ic_so, SOL_SOCKET, SO_SNDBUF,
598 	    (char *)&sndbuf, sizeof (int), CRED());
599 }
600 
601 static void
602 idm_set_tgt_connect_options(ksocket_t ks)
603 {
604 	int32_t		rcvbuf = IDM_RCVBUF_SIZE;
605 	int32_t		sndbuf = IDM_SNDBUF_SIZE;
606 	const int	on = 1;
607 
608 	/* Set connect options */
609 	(void) ksocket_setsockopt(ks, SOL_SOCKET, SO_RCVBUF,
610 	    (char *)&rcvbuf, sizeof (int), CRED());
611 	(void) ksocket_setsockopt(ks, SOL_SOCKET, SO_SNDBUF,
612 	    (char *)&sndbuf, sizeof (int), CRED());
613 	(void) ksocket_setsockopt(ks, IPPROTO_TCP, TCP_NODELAY,
614 	    (char *)&on, sizeof (on), CRED());
615 }
616 
/*
 * n2h24()
 *
 * Convert a 24-bit big-endian (network order) quantity, stored in the
 * three bytes at ptr, to a host-order integer.
 */
static uint32_t
n2h24(const uchar_t *ptr)
{
	uint32_t	val;

	/* Accumulate the bytes most-significant first */
	val = ptr[0];
	val = (val << 8) | ptr[1];
	val = (val << 8) | ptr[2];

	return (val);
}
622 
623 
624 static idm_status_t
625 idm_sorecvhdr(idm_conn_t *ic, idm_pdu_t *pdu)
626 {
627 	iscsi_hdr_t	*bhs;
628 	uint32_t	hdr_digest_crc;
629 	uint32_t	crc_calculated;
630 	void		*new_hdr;
631 	int		ahslen = 0;
632 	int		total_len = 0;
633 	int		iovlen = 0;
634 	struct iovec	iov[2];
635 	idm_so_conn_t	*so_conn;
636 	int		rc;
637 
638 	so_conn = ic->ic_transport_private;
639 
640 	/*
641 	 * Read BHS
642 	 */
643 	bhs = pdu->isp_hdr;
644 	rc = idm_sorecv(so_conn->ic_so, pdu->isp_hdr, sizeof (iscsi_hdr_t));
645 	if (rc != IDM_STATUS_SUCCESS) {
646 		return (IDM_STATUS_FAIL);
647 	}
648 
649 	/*
650 	 * Check actual AHS length against the amount available in the buffer
651 	 */
652 	pdu->isp_hdrlen = sizeof (iscsi_hdr_t) +
653 	    (bhs->hlength * sizeof (uint32_t));
654 	pdu->isp_datalen = n2h24(bhs->dlength);
655 	if (bhs->hlength > IDM_SORX_CACHE_AHSLEN) {
656 		/* Allocate a new header segment and change the callback */
657 		new_hdr = kmem_alloc(pdu->isp_hdrlen, KM_SLEEP);
658 		bcopy(pdu->isp_hdr, new_hdr, sizeof (iscsi_hdr_t));
659 		pdu->isp_hdr = new_hdr;
660 		pdu->isp_flags |= IDM_PDU_ADDL_HDR;
661 
662 		/*
663 		 * This callback will restore the expected values after
664 		 * the RX PDU has been processed.
665 		 */
666 		pdu->isp_callback = idm_sorx_addl_pdu_cb;
667 	}
668 
669 	/*
670 	 * Setup receipt of additional header and header digest (if enabled).
671 	 */
672 	if (bhs->hlength > 0) {
673 		iov[iovlen].iov_base = (caddr_t)(pdu->isp_hdr + 1);
674 		ahslen = pdu->isp_hdrlen - sizeof (iscsi_hdr_t);
675 		iov[iovlen].iov_len = ahslen;
676 		total_len += iov[iovlen].iov_len;
677 		iovlen++;
678 	}
679 
680 	if (ic->ic_conn_flags & IDM_CONN_HEADER_DIGEST) {
681 		iov[iovlen].iov_base = (caddr_t)&hdr_digest_crc;
682 		iov[iovlen].iov_len = sizeof (hdr_digest_crc);
683 		total_len += iov[iovlen].iov_len;
684 		iovlen++;
685 	}
686 
687 	if ((iovlen != 0) &&
688 	    (idm_iov_sorecv(so_conn->ic_so, &iov[0], iovlen,
689 	    total_len) != 0)) {
690 		return (IDM_STATUS_FAIL);
691 	}
692 
693 	/*
694 	 * Validate header digest if enabled
695 	 */
696 	if (ic->ic_conn_flags & IDM_CONN_HEADER_DIGEST) {
697 		crc_calculated = idm_crc32c(pdu->isp_hdr,
698 		    sizeof (iscsi_hdr_t) + ahslen);
699 		if (crc_calculated != hdr_digest_crc) {
700 			/* Invalid Header Digest */
701 			return (IDM_STATUS_HEADER_DIGEST);
702 		}
703 	}
704 
705 	return (0);
706 }
707 
708 /*
709  * idm_so_ini_conn_create()
710  * Allocate the sockets transport connection resources.
711  */
712 static idm_status_t
713 idm_so_ini_conn_create(idm_conn_req_t *cr, idm_conn_t *ic)
714 {
715 	ksocket_t	so;
716 	idm_so_conn_t	*so_conn;
717 	idm_status_t	idmrc;
718 
719 	so = idm_socreate(cr->cr_domain, cr->cr_type,
720 	    cr->cr_protocol);
721 	if (so == NULL) {
722 		return (IDM_STATUS_FAIL);
723 	}
724 
725 	/* Bind the socket if configured to do so */
726 	if (cr->cr_bound) {
727 		if (ksocket_bind(so, &cr->cr_bound_addr.sin,
728 		    SIZEOF_SOCKADDR(&cr->cr_bound_addr.sin), CRED()) != 0) {
729 			idm_sodestroy(so);
730 			return (IDM_STATUS_FAIL);
731 		}
732 	}
733 
734 	idmrc = idm_so_conn_create_common(ic, so);
735 	if (idmrc != IDM_STATUS_SUCCESS) {
736 		idm_soshutdown(so);
737 		idm_sodestroy(so);
738 		return (IDM_STATUS_FAIL);
739 	}
740 
741 	so_conn = ic->ic_transport_private;
742 	/* Set up socket options */
743 	idm_set_ini_preconnect_options(so_conn);
744 
745 	return (IDM_STATUS_SUCCESS);
746 }
747 
/*
 * idm_so_ini_conn_destroy()
 * Tear down the sockets transport connection resources of an initiator
 * connection.  All of the work is done by idm_so_conn_destroy_common(),
 * which is shared with the target side.
 */
static void
idm_so_ini_conn_destroy(idm_conn_t *ic)
{
	idm_so_conn_destroy_common(ic);
}
757 
758 /*
759  * idm_so_ini_conn_connect()
760  * Establish the connection referred to by the handle previously allocated via
761  * idm_so_ini_conn_create().
762  */
763 static idm_status_t
764 idm_so_ini_conn_connect(idm_conn_t *ic)
765 {
766 	idm_so_conn_t	*so_conn;
767 
768 	so_conn = ic->ic_transport_private;
769 
770 	if (ksocket_connect(so_conn->ic_so, &ic->ic_ini_dst_addr.sin,
771 	    (SIZEOF_SOCKADDR(&ic->ic_ini_dst_addr.sin)), CRED()) != 0) {
772 		idm_soshutdown(so_conn->ic_so);
773 		return (IDM_STATUS_FAIL);
774 	}
775 
776 	idm_so_conn_connect_common(ic);
777 
778 	idm_set_ini_postconnect_options(so_conn);
779 
780 	return (IDM_STATUS_SUCCESS);
781 }
782 
783 idm_status_t
784 idm_so_tgt_conn_create(idm_conn_t *ic, ksocket_t new_so)
785 {
786 	idm_status_t	idmrc;
787 
788 	idmrc = idm_so_conn_create_common(ic, new_so);
789 
790 	return (idmrc);
791 }
792 
/*
 * idm_so_tgt_conn_destroy()
 * Tear down the sockets transport connection resources of a target
 * connection.  All of the work is done by idm_so_conn_destroy_common(),
 * which is shared with the initiator side.
 */
static void
idm_so_tgt_conn_destroy(idm_conn_t *ic)
{
	idm_so_conn_destroy_common(ic);
}
798 
/*
 * idm_so_tgt_conn_connect()
 * Establish the connection in ic, passed from idm_tgt_conn_finish(), which
 * is invoked from the SM as a result of an inbound connection request.
 * Only the common connect processing in idm_so_conn_connect_common() is
 * needed here.  Always returns IDM_STATUS_SUCCESS.
 */
static idm_status_t
idm_so_tgt_conn_connect(idm_conn_t *ic)
{
	idm_so_conn_connect_common(ic);

	return (IDM_STATUS_SUCCESS);
}
811 
812 static idm_status_t
813 idm_so_conn_create_common(idm_conn_t *ic, ksocket_t new_so)
814 {
815 	idm_so_conn_t	*so_conn;
816 
817 	so_conn = kmem_zalloc(sizeof (idm_so_conn_t), KM_SLEEP);
818 	so_conn->ic_so = new_so;
819 
820 	ic->ic_transport_private = so_conn;
821 	ic->ic_transport_hdrlen = 0;
822 
823 	/* Set the scoreboarding flag on this connection */
824 	ic->ic_conn_flags |= IDM_CONN_USE_SCOREBOARD;
825 
826 	/*
827 	 * Initialize tx thread mutex and list
828 	 */
829 	mutex_init(&so_conn->ic_tx_mutex, NULL, MUTEX_DEFAULT, NULL);
830 	cv_init(&so_conn->ic_tx_cv, NULL, CV_DEFAULT, NULL);
831 	list_create(&so_conn->ic_tx_list, sizeof (idm_pdu_t),
832 	    offsetof(idm_pdu_t, idm_tx_link));
833 
834 	return (IDM_STATUS_SUCCESS);
835 }
836 
837 static void
838 idm_so_conn_destroy_common(idm_conn_t *ic)
839 {
840 	idm_so_conn_t	*so_conn = ic->ic_transport_private;
841 
842 	ic->ic_transport_private = NULL;
843 	idm_sodestroy(so_conn->ic_so);
844 	list_destroy(&so_conn->ic_tx_list);
845 	mutex_destroy(&so_conn->ic_tx_mutex);
846 	cv_destroy(&so_conn->ic_tx_cv);
847 
848 	kmem_free(so_conn, sizeof (idm_so_conn_t));
849 }
850 
851 static void
852 idm_so_conn_connect_common(idm_conn_t *ic)
853 {
854 	idm_so_conn_t	*so_conn;
855 	struct sockaddr_in6	t_addr;
856 	socklen_t	t_addrlen = 0;
857 
858 	so_conn = ic->ic_transport_private;
859 	bzero(&t_addr, sizeof (struct sockaddr_in6));
860 	t_addrlen = sizeof (struct sockaddr_in6);
861 
862 	/* Set the local and remote addresses in the idm conn handle */
863 	ksocket_getsockname(so_conn->ic_so, (struct sockaddr *)&t_addr,
864 	    &t_addrlen, CRED());
865 	bcopy(&t_addr, &ic->ic_laddr, t_addrlen);
866 	ksocket_getpeername(so_conn->ic_so, (struct sockaddr *)&t_addr,
867 	    &t_addrlen, CRED());
868 	bcopy(&t_addr, &ic->ic_raddr, t_addrlen);
869 
870 	mutex_enter(&ic->ic_mutex);
871 	so_conn->ic_tx_thread = thread_create(NULL, 0, idm_sotx_thread, ic, 0,
872 	    &p0, TS_RUN, minclsyspri);
873 	so_conn->ic_rx_thread = thread_create(NULL, 0, idm_sorx_thread, ic, 0,
874 	    &p0, TS_RUN, minclsyspri);
875 
876 	while (!so_conn->ic_rx_thread_running || !so_conn->ic_tx_thread_running)
877 		cv_wait(&ic->ic_cv, &ic->ic_mutex);
878 	mutex_exit(&ic->ic_mutex);
879 }
880 
881 /*
882  * idm_so_conn_disconnect()
883  * Shutdown the socket connection and stop the thread
884  */
885 static void
886 idm_so_conn_disconnect(idm_conn_t *ic)
887 {
888 	idm_so_conn_t	*so_conn;
889 
890 	so_conn = ic->ic_transport_private;
891 
892 	mutex_enter(&ic->ic_mutex);
893 	so_conn->ic_rx_thread_running = B_FALSE;
894 	so_conn->ic_tx_thread_running = B_FALSE;
895 	/* We need to wakeup the TX thread */
896 	mutex_enter(&so_conn->ic_tx_mutex);
897 	cv_signal(&so_conn->ic_tx_cv);
898 	mutex_exit(&so_conn->ic_tx_mutex);
899 	mutex_exit(&ic->ic_mutex);
900 
901 	/* This should wakeup the RX thread if it is sleeping */
902 	idm_soshutdown(so_conn->ic_so);
903 
904 	thread_join(so_conn->ic_tx_thread_did);
905 	thread_join(so_conn->ic_rx_thread_did);
906 }
907 
908 /*
909  * idm_so_tgt_svc_create()
910  * Establish a service on an IP address and port.  idm_svc_req_t contains
911  * the service parameters.
912  */
913 /*ARGSUSED*/
914 static idm_status_t
915 idm_so_tgt_svc_create(idm_svc_req_t *sr, idm_svc_t *is)
916 {
917 	idm_so_svc_t		*so_svc;
918 
919 	so_svc = kmem_zalloc(sizeof (idm_so_svc_t), KM_SLEEP);
920 
921 	/* Set the new sockets service in svc handle */
922 	is->is_so_svc = (void *)so_svc;
923 
924 	return (IDM_STATUS_SUCCESS);
925 }
926 
/*
 * idm_so_tgt_svc_destroy()
 * Teardown sockets resources allocated in idm_so_tgt_svc_create().  Only
 * the idm_so_svc_t itself is freed here; the listening socket is closed
 * separately by idm_so_tgt_svc_offline().
 */
static void
idm_so_tgt_svc_destroy(idm_svc_t *is)
{
	/* the socket will have been torn down; free the service */
	kmem_free(is->is_so_svc, sizeof (idm_so_svc_t));
}
937 
938 /*
939  * idm_so_tgt_svc_online()
940  * Launch a watch thread on the svc allocated in idm_so_tgt_svc_create()
941  */
942 
943 static idm_status_t
944 idm_so_tgt_svc_online(idm_svc_t *is)
945 {
946 	idm_so_svc_t		*so_svc;
947 	idm_svc_req_t		*sr = &is->is_svc_req;
948 	struct sockaddr_in6	sin6_ip;
949 	const uint32_t		on = 1;
950 	const uint32_t		off = 0;
951 
952 	mutex_enter(&is->is_mutex);
953 	so_svc = (idm_so_svc_t *)is->is_so_svc;
954 
955 	/*
956 	 * Try creating an IPv6 socket first
957 	 */
958 	if ((so_svc->is_so = idm_socreate(PF_INET6, SOCK_STREAM, 0)) == NULL) {
959 		mutex_exit(&is->is_mutex);
960 		return (IDM_STATUS_FAIL);
961 	} else {
962 		bzero(&sin6_ip, sizeof (sin6_ip));
963 		sin6_ip.sin6_family = AF_INET6;
964 		sin6_ip.sin6_port = htons(sr->sr_port);
965 		sin6_ip.sin6_addr = in6addr_any;
966 
967 		(void) ksocket_setsockopt(so_svc->is_so, SOL_SOCKET,
968 		    SO_REUSEADDR, (char *)&on, sizeof (on), CRED());
969 		/*
970 		 * Turn off SO_MAC_EXEMPT so future sobinds succeed
971 		 */
972 		(void) ksocket_setsockopt(so_svc->is_so, SOL_SOCKET,
973 		    SO_MAC_EXEMPT, (char *)&off, sizeof (off), CRED());
974 
975 		if (ksocket_bind(so_svc->is_so, (struct sockaddr *)&sin6_ip,
976 		    sizeof (sin6_ip), CRED()) != 0) {
977 			mutex_exit(&is->is_mutex);
978 			idm_sodestroy(so_svc->is_so);
979 			return (IDM_STATUS_FAIL);
980 		}
981 	}
982 
983 	idm_set_tgt_connect_options(so_svc->is_so);
984 
985 	if (ksocket_listen(so_svc->is_so, 5, CRED()) != 0) {
986 		mutex_exit(&is->is_mutex);
987 		idm_soshutdown(so_svc->is_so);
988 		idm_sodestroy(so_svc->is_so);
989 		return (IDM_STATUS_FAIL);
990 	}
991 
992 	/* Launch a watch thread */
993 	so_svc->is_thread = thread_create(NULL, 0, idm_so_svc_port_watcher,
994 	    is, 0, &p0, TS_RUN, minclsyspri);
995 
996 	if (so_svc->is_thread == NULL) {
997 		/* Failure to launch; teardown the socket */
998 		mutex_exit(&is->is_mutex);
999 		idm_soshutdown(so_svc->is_so);
1000 		idm_sodestroy(so_svc->is_so);
1001 		return (IDM_STATUS_FAIL);
1002 	}
1003 	ksocket_hold(so_svc->is_so);
1004 	/* Wait for the port watcher thread to start */
1005 	while (!so_svc->is_thread_running)
1006 		cv_wait(&is->is_cv, &is->is_mutex);
1007 	mutex_exit(&is->is_mutex);
1008 
1009 	return (IDM_STATUS_SUCCESS);
1010 }
1011 
1012 /*
1013  * idm_so_tgt_svc_offline
1014  *
1015  * Stop listening on the IP address and port identified by idm_svc_t.
1016  */
1017 static void
1018 idm_so_tgt_svc_offline(idm_svc_t *is)
1019 {
1020 	idm_so_svc_t		*so_svc;
1021 	mutex_enter(&is->is_mutex);
1022 	so_svc = (idm_so_svc_t *)is->is_so_svc;
1023 	so_svc->is_thread_running = B_FALSE;
1024 	mutex_exit(&is->is_mutex);
1025 
1026 	/*
1027 	 * Teardown socket
1028 	 */
1029 	idm_sodestroy(so_svc->is_so);
1030 
1031 	/*
1032 	 * Now we expect the port watcher thread to terminate
1033 	 */
1034 	thread_join(so_svc->is_thread_did);
1035 }
1036 
1037 /*
1038  * Watch thread for target service connection establishment.
1039  */
1040 void
1041 idm_so_svc_port_watcher(void *arg)
1042 {
1043 	idm_svc_t		*svc = arg;
1044 	ksocket_t		new_so;
1045 	idm_conn_t		*ic;
1046 	idm_status_t		idmrc;
1047 	idm_so_svc_t		*so_svc;
1048 	int			rc;
1049 	const uint32_t		off = 0;
1050 	struct sockaddr_in6 	t_addr;
1051 	socklen_t		t_addrlen;
1052 
1053 	bzero(&t_addr, sizeof (struct sockaddr_in6));
1054 	t_addrlen = sizeof (struct sockaddr_in6);
1055 	mutex_enter(&svc->is_mutex);
1056 
1057 	so_svc = svc->is_so_svc;
1058 	so_svc->is_thread_running = B_TRUE;
1059 	so_svc->is_thread_did = so_svc->is_thread->t_did;
1060 
1061 	cv_signal(&svc->is_cv);
1062 
1063 	IDM_SVC_LOG(CE_NOTE, "iSCSI service (%p/%d) online", (void *)svc,
1064 	    svc->is_svc_req.sr_port);
1065 
1066 	while (so_svc->is_thread_running) {
1067 		mutex_exit(&svc->is_mutex);
1068 
1069 		if ((rc = ksocket_accept(so_svc->is_so,
1070 		    (struct sockaddr *)&t_addr, &t_addrlen,
1071 		    &new_so, CRED())) != 0) {
1072 			mutex_enter(&svc->is_mutex);
1073 			if (rc == ECONNABORTED)
1074 				continue;
1075 			/* Connection problem */
1076 			break;
1077 		}
1078 		/*
1079 		 * Turn off SO_MAC_EXEMPT so future sobinds succeed
1080 		 */
1081 		(void) ksocket_setsockopt(new_so, SOL_SOCKET, SO_MAC_EXEMPT,
1082 		    (char *)&off, sizeof (off), CRED());
1083 
1084 		idmrc = idm_svc_conn_create(svc, IDM_TRANSPORT_TYPE_SOCKETS,
1085 		    &ic);
1086 		if (idmrc != IDM_STATUS_SUCCESS) {
1087 			/* Drop connection */
1088 			idm_soshutdown(new_so);
1089 			idm_sodestroy(new_so);
1090 			mutex_enter(&svc->is_mutex);
1091 			continue;
1092 		}
1093 
1094 		idmrc = idm_so_tgt_conn_create(ic, new_so);
1095 		if (idmrc != IDM_STATUS_SUCCESS) {
1096 			idm_svc_conn_destroy(ic);
1097 			idm_soshutdown(new_so);
1098 			idm_sodestroy(new_so);
1099 			mutex_enter(&svc->is_mutex);
1100 			continue;
1101 		}
1102 
1103 		/*
1104 		 * Kick the state machine.  At CS_S3_XPT_UP the state machine
1105 		 * will notify the client (target) about the new connection.
1106 		 */
1107 		idm_conn_event(ic, CE_CONNECT_ACCEPT, NULL);
1108 
1109 		mutex_enter(&svc->is_mutex);
1110 	}
1111 	ksocket_rele(so_svc->is_so);
1112 	so_svc->is_thread_running = B_FALSE;
1113 	mutex_exit(&svc->is_mutex);
1114 
1115 	IDM_SVC_LOG(CE_NOTE, "iSCSI service (%p/%d) offline", (void *)svc,
1116 	    svc->is_svc_req.sr_port);
1117 
1118 	thread_exit();
1119 }
1120 
1121 /*
1122  * idm_so_free_task_rsrc() stops any ongoing processing of the task and
1123  * frees resources associated with the task.
1124  *
1125  * It's not clear that this should return idm_status_t.  What do we do
1126  * if it fails?
1127  */
1128 static idm_status_t
1129 idm_so_free_task_rsrc(idm_task_t *idt)
1130 {
1131 	idm_buf_t	*idb;
1132 
1133 	/*
1134 	 * There is nothing to cleanup on initiator connections
1135 	 */
1136 	if (IDM_CONN_ISINI(idt->idt_ic))
1137 		return (IDM_STATUS_SUCCESS);
1138 
1139 	/*
1140 	 * If this is a target connection, call idm_buf_rx_from_ini_done for
1141 	 * any buffer on the "outbufv" list with idb->idb_in_transport==B_TRUE.
1142 	 *
1143 	 * In addition, remove any buffers associated with this task from
1144 	 * the ic_tx_list.  We'll do this by walking the idt_inbufv list, but
1145 	 * items don't actually get removed from that list (and completion
1146 	 * routines called) until idm_task_cleanup.
1147 	 */
1148 	mutex_enter(&idt->idt_mutex);
1149 
1150 	for (idb = list_head(&idt->idt_outbufv); idb != NULL;
1151 	    idb = list_next(&idt->idt_outbufv, idb)) {
1152 		if (idb->idb_in_transport) {
1153 			/*
1154 			 * idm_buf_rx_from_ini_done releases idt->idt_mutex
1155 			 */
1156 			idm_buf_rx_from_ini_done(idt, idb, IDM_STATUS_ABORTED);
1157 			mutex_enter(&idt->idt_mutex);
1158 		}
1159 	}
1160 
1161 	for (idb = list_head(&idt->idt_inbufv); idb != NULL;
1162 	    idb = list_next(&idt->idt_inbufv, idb)) {
1163 		/*
1164 		 * We want to remove these items from the tx_list as well,
1165 		 * but knowing it's in the idt_inbufv list is not a guarantee
1166 		 * that it's in the tx_list.  If it's on the tx list then
1167 		 * let idm_sotx_thread() clean it up.
1168 		 */
1169 		if (idb->idb_in_transport && !idb->idb_tx_thread) {
1170 			/*
1171 			 * idm_buf_tx_to_ini_done releases idt->idt_mutex
1172 			 */
1173 			idm_buf_tx_to_ini_done(idt, idb, IDM_STATUS_ABORTED);
1174 			mutex_enter(&idt->idt_mutex);
1175 		}
1176 	}
1177 
1178 	mutex_exit(&idt->idt_mutex);
1179 
1180 	return (IDM_STATUS_SUCCESS);
1181 }
1182 
1183 /*
1184  * idm_so_negotiate_key_values() validates the key values for this connection
1185  */
1186 /* ARGSUSED */
1187 static kv_status_t
1188 idm_so_negotiate_key_values(idm_conn_t *it, nvlist_t *request_nvl,
1189     nvlist_t *response_nvl, nvlist_t *negotiated_nvl)
1190 {
1191 	/* All parameters are negotiated at the iscsit level */
1192 	return (KV_HANDLED);
1193 }
1194 
1195 /*
1196  * idm_so_notice_key_values() activates the negotiated key values for
1197  * this connection.
1198  */
1199 static void
1200 idm_so_notice_key_values(idm_conn_t *it, nvlist_t *negotiated_nvl)
1201 {
1202 	char			*nvp_name;
1203 	nvpair_t		*nvp;
1204 	nvpair_t		*next_nvp;
1205 	int			nvrc;
1206 	idm_status_t		idm_status;
1207 	const idm_kv_xlate_t	*ikvx;
1208 
1209 	for (nvp = nvlist_next_nvpair(negotiated_nvl, NULL);
1210 	    nvp != NULL; nvp = next_nvp) {
1211 		next_nvp = nvlist_next_nvpair(negotiated_nvl, nvp);
1212 		nvp_name = nvpair_name(nvp);
1213 
1214 		ikvx = idm_lookup_kv_xlate(nvp_name, strlen(nvp_name));
1215 		switch (ikvx->ik_key_id) {
1216 		case KI_HEADER_DIGEST:
1217 		case KI_DATA_DIGEST:
1218 			idm_status = idm_so_handle_digest(it, nvp, ikvx);
1219 			ASSERT(idm_status == 0);
1220 
1221 			/* Remove processed item from negotiated_nvl list */
1222 			nvrc = nvlist_remove_all(
1223 			    negotiated_nvl, ikvx->ik_key_name);
1224 			ASSERT(nvrc == 0);
1225 			break;
1226 		default:
1227 			break;
1228 		}
1229 	}
1230 }
1231 
1232 
1233 static idm_status_t
1234 idm_so_handle_digest(idm_conn_t *it, nvpair_t *digest_choice,
1235     const idm_kv_xlate_t *ikvx)
1236 {
1237 	int			nvrc;
1238 	char			*digest_choice_string;
1239 
1240 	nvrc = nvpair_value_string(digest_choice,
1241 	    &digest_choice_string);
1242 	ASSERT(nvrc == 0);
1243 	if (strcasecmp(digest_choice_string, "crc32c") == 0) {
1244 		switch (ikvx->ik_key_id) {
1245 		case KI_HEADER_DIGEST:
1246 			it->ic_conn_flags |= IDM_CONN_HEADER_DIGEST;
1247 			break;
1248 		case KI_DATA_DIGEST:
1249 			it->ic_conn_flags |= IDM_CONN_DATA_DIGEST;
1250 			break;
1251 		default:
1252 			ASSERT(0);
1253 			break;
1254 		}
1255 	} else if (strcasecmp(digest_choice_string, "none") == 0) {
1256 		switch (ikvx->ik_key_id) {
1257 		case KI_HEADER_DIGEST:
1258 			it->ic_conn_flags &= ~IDM_CONN_HEADER_DIGEST;
1259 			break;
1260 		case KI_DATA_DIGEST:
1261 			it->ic_conn_flags &= ~IDM_CONN_DATA_DIGEST;
1262 			break;
1263 		default:
1264 			ASSERT(0);
1265 			break;
1266 		}
1267 	} else {
1268 		ASSERT(0);
1269 	}
1270 
1271 	return (IDM_STATUS_SUCCESS);
1272 }
1273 
1274 
1275 /*
1276  * idm_so_conn_is_capable() verifies that the passed connection is provided
1277  * for by the sockets interface.
1278  */
1279 /* ARGSUSED */
1280 static boolean_t
1281 idm_so_conn_is_capable(idm_conn_req_t *ic, idm_transport_caps_t *caps)
1282 {
1283 	return (B_TRUE);
1284 }
1285 
1286 /*
1287  * idm_so_rx_datain() validates the Data Sequence number of the PDU. The
1288  * idm_sorecv_scsidata() function invoked earlier actually reads the data
1289  * off the socket into the appropriate buffers.
1290  */
1291 static void
1292 idm_so_rx_datain(idm_conn_t *ic, idm_pdu_t *pdu)
1293 {
1294 	iscsi_data_hdr_t	*bhs;
1295 	idm_task_t		*idt;
1296 	idm_buf_t		*idb;
1297 	uint32_t		datasn;
1298 	size_t			offset;
1299 	iscsi_hdr_t		*ihp = (iscsi_hdr_t *)pdu->isp_hdr;
1300 	iscsi_data_rsp_hdr_t    *idrhp = (iscsi_data_rsp_hdr_t *)ihp;
1301 
1302 	ASSERT(ic != NULL);
1303 	ASSERT(pdu != NULL);
1304 
1305 	bhs	= (iscsi_data_hdr_t *)pdu->isp_hdr;
1306 	datasn	= ntohl(bhs->datasn);
1307 	offset	= ntohl(bhs->offset);
1308 
1309 	ASSERT(bhs->opcode == ISCSI_OP_SCSI_DATA_RSP);
1310 
1311 	/*
1312 	 * Look up the task corresponding to the initiator task tag
1313 	 * to get the buffers affiliated with the task.
1314 	 */
1315 	idt = idm_task_find(ic, bhs->itt, bhs->ttt);
1316 	if (idt == NULL) {
1317 		IDM_CONN_LOG(CE_WARN, "idm_so_rx_datain: failed to find task");
1318 		idm_pdu_rx_protocol_error(ic, pdu);
1319 		return;
1320 	}
1321 
1322 	idb = pdu->isp_sorx_buf;
1323 	if (idb == NULL) {
1324 		IDM_CONN_LOG(CE_WARN,
1325 		    "idm_so_rx_datain: failed to find buffer");
1326 		idm_task_rele(idt);
1327 		idm_pdu_rx_protocol_error(ic, pdu);
1328 		return;
1329 	}
1330 
1331 	/*
1332 	 * DataSN values should be sequential and should not have any gaps or
1333 	 * repetitions. Check the DataSN with the one stored in the task.
1334 	 */
1335 	if (datasn == idt->idt_exp_datasn) {
1336 		idt->idt_exp_datasn++; /* keep track of DataSN received */
1337 	} else {
1338 		IDM_CONN_LOG(CE_WARN, "idm_so_rx_datain: datasn out of order");
1339 		idm_task_rele(idt);
1340 		idm_pdu_rx_protocol_error(ic, pdu);
1341 		return;
1342 	}
1343 
1344 	/*
1345 	 * PDUs in a sequence should be in continuously increasing
1346 	 * address offset
1347 	 */
1348 	if (offset != idb->idb_exp_offset) {
1349 		IDM_CONN_LOG(CE_WARN, "idm_so_rx_datain: unexpected offset");
1350 		idm_task_rele(idt);
1351 		idm_pdu_rx_protocol_error(ic, pdu);
1352 		return;
1353 	}
1354 	/* Expected next relative buffer offset */
1355 	idb->idb_exp_offset += n2h24(bhs->dlength);
1356 	idt->idt_rx_bytes += n2h24(bhs->dlength);
1357 
1358 	idm_task_rele(idt);
1359 
1360 	/*
1361 	 * For now call scsi_rsp which will process the data rsp
1362 	 * Revisit, need to provide an explicit client entry point for
1363 	 * phase collapse completions.
1364 	 */
1365 	if (((ihp->opcode & ISCSI_OPCODE_MASK) == ISCSI_OP_SCSI_DATA_RSP) &&
1366 	    (idrhp->flags & ISCSI_FLAG_DATA_STATUS)) {
1367 		(*ic->ic_conn_ops.icb_rx_scsi_rsp)(ic, pdu);
1368 	}
1369 
1370 	idm_pdu_complete(pdu, IDM_STATUS_SUCCESS);
1371 }
1372 
1373 /*
1374  * The idm_so_rx_dataout() function is used by the iSCSI target to read
1375  * data from the Data-Out PDU sent by the iSCSI initiator.
1376  *
1377  * This function gets the Initiator Task Tag from the PDU BHS and looks up the
1378  * task to get the buffers associated with the PDU. A PDU might span buffers.
1379  * The data is then read into the respective buffer.
1380  */
1381 static void
1382 idm_so_rx_dataout(idm_conn_t *ic, idm_pdu_t *pdu)
1383 {
1384 
1385 	iscsi_data_hdr_t	*bhs;
1386 	idm_task_t		*idt;
1387 	idm_buf_t		*idb;
1388 	size_t			offset;
1389 
1390 	ASSERT(ic != NULL);
1391 	ASSERT(pdu != NULL);
1392 
1393 	bhs = (iscsi_data_hdr_t *)pdu->isp_hdr;
1394 	offset = ntohl(bhs->offset);
1395 	ASSERT(bhs->opcode == ISCSI_OP_SCSI_DATA);
1396 
1397 	/*
1398 	 * Look up the task corresponding to the initiator task tag
1399 	 * to get the buffers affiliated with the task.
1400 	 */
1401 	idt = idm_task_find(ic, bhs->itt, bhs->ttt);
1402 	if (idt == NULL) {
1403 		IDM_CONN_LOG(CE_WARN,
1404 		    "idm_so_rx_dataout: failed to find task");
1405 		idm_pdu_rx_protocol_error(ic, pdu);
1406 		return;
1407 	}
1408 
1409 	idb = pdu->isp_sorx_buf;
1410 	if (idb == NULL) {
1411 		IDM_CONN_LOG(CE_WARN,
1412 		    "idm_so_rx_dataout: failed to find buffer");
1413 		idm_task_rele(idt);
1414 		idm_pdu_rx_protocol_error(ic, pdu);
1415 		return;
1416 	}
1417 
1418 	/* Keep track of data transferred - check data offsets */
1419 	if (offset != idb->idb_exp_offset) {
1420 		IDM_CONN_LOG(CE_NOTE, "idm_so_rx_dataout: offset out of seq: "
1421 		    "%ld, %d", offset, idb->idb_exp_offset);
1422 		idm_task_rele(idt);
1423 		idm_pdu_rx_protocol_error(ic, pdu);
1424 		return;
1425 	}
1426 	/* Expected next relative offset */
1427 	idb->idb_exp_offset += ntoh24(bhs->dlength);
1428 	idt->idt_rx_bytes += n2h24(bhs->dlength);
1429 
1430 	/*
1431 	 * Call the buffer callback when the transfer is complete
1432 	 *
1433 	 * The connection state machine should only abort tasks after
1434 	 * shutting down the connection so we are assured that there
1435 	 * won't be a simultaneous attempt to abort this task at the
1436 	 * same time as we are processing this PDU (due to a connection
1437 	 * state change).
1438 	 */
1439 	if (bhs->flags & ISCSI_FLAG_FINAL) {
1440 		/*
1441 		 * We only want to call idm_buf_rx_from_ini_done once
1442 		 * per transfer.  It's possible that this task has
1443 		 * already been aborted in which case
1444 		 * idm_so_free_task_rsrc will call idm_buf_rx_from_ini_done
1445 		 * for each buffer with idb_in_transport==B_TRUE.  To
1446 		 * close this window and ensure that this doesn't happen,
1447 		 * we'll clear idb->idb_in_transport now while holding
1448 		 * the task mutex.   This is only really an issue for
1449 		 * SCSI task abort -- if tasks were being aborted because
1450 		 * of a connection state change the state machine would
1451 		 * have already stopped the receive thread.
1452 		 */
1453 		mutex_enter(&idt->idt_mutex);
1454 
1455 		/*
1456 		 * Release the task hold here (obtained in idm_task_find)
1457 		 * because the task may complete synchronously during
1458 		 * idm_buf_rx_from_ini_done.  Since we still have an active
1459 		 * buffer we know there is at least one additional hold on idt.
1460 		 */
1461 		idm_task_rele(idt);
1462 
1463 		/*
1464 		 * idm_buf_rx_from_ini_done releases idt->idt_mutex
1465 		 */
1466 		idm_buf_rx_from_ini_done(idt, idb, IDM_STATUS_SUCCESS);
1467 		idm_pdu_complete(pdu, IDM_STATUS_SUCCESS);
1468 		return;
1469 	}
1470 
1471 	idm_task_rele(idt);
1472 	idm_pdu_complete(pdu, IDM_STATUS_SUCCESS);
1473 }
1474 
1475 /*
1476  * The idm_so_rx_rtt() function is used by the iSCSI initiator to handle
1477  * the R2T PDU sent by the iSCSI target indicating that it is ready to
1478  * accept data. This gets the Initiator Task Tag (itt) from the PDU BHS
1479  * and looks up the task in the task tree using the itt to get the output
1480  * buffers associated the task. The R2T PDU contains the offset of the
1481  * requested data and the data length. This function then constructs a
1482  * sequence of iSCSI PDUs and outputs the requested data. Each Data-Out
1483  * PDU is associated with the R2T by the Target Transfer Tag  (ttt).
1484  */
1485 
1486 static void
1487 idm_so_rx_rtt(idm_conn_t *ic, idm_pdu_t *pdu)
1488 {
1489 	idm_task_t		*idt;
1490 	idm_buf_t		*idb;
1491 	iscsi_rtt_hdr_t		*rtt_hdr;
1492 	uint32_t		data_offset;
1493 	uint32_t		data_length;
1494 
1495 	ASSERT(ic != NULL);
1496 	ASSERT(pdu != NULL);
1497 
1498 	rtt_hdr	= (iscsi_rtt_hdr_t *)pdu->isp_hdr;
1499 	data_offset = ntohl(rtt_hdr->data_offset);
1500 	data_length = ntohl(rtt_hdr->data_length);
1501 	idt	= idm_task_find(ic, rtt_hdr->itt, rtt_hdr->ttt);
1502 
1503 	if (idt == NULL) {
1504 		IDM_CONN_LOG(CE_WARN, "idm_so_rx_rtt: could not find task");
1505 		idm_pdu_rx_protocol_error(ic, pdu);
1506 		return;
1507 	}
1508 
1509 	/* Find the buffer bound to the task by the iSCSI initiator */
1510 	mutex_enter(&idt->idt_mutex);
1511 	idb = idm_buf_find(&idt->idt_outbufv, data_offset);
1512 	if (idb == NULL) {
1513 		mutex_exit(&idt->idt_mutex);
1514 		idm_task_rele(idt);
1515 		IDM_CONN_LOG(CE_WARN, "idm_so_rx_rtt: could not find buffer");
1516 		idm_pdu_rx_protocol_error(ic, pdu);
1517 		return;
1518 	}
1519 
1520 	/* return buffer contains this data */
1521 	if (data_offset + data_length > idb->idb_buflen) {
1522 		/* Overflow */
1523 		mutex_exit(&idt->idt_mutex);
1524 		idm_task_rele(idt);
1525 		IDM_CONN_LOG(CE_WARN, "idm_so_rx_rtt: read from outside "
1526 		    "buffer");
1527 		idm_pdu_rx_protocol_error(ic, pdu);
1528 		return;
1529 	}
1530 
1531 	idt->idt_r2t_ttt = rtt_hdr->ttt;
1532 	idt->idt_exp_datasn = 0;
1533 
1534 	idm_so_send_rtt_data(ic, idt, idb, data_offset,
1535 	    ntohl(rtt_hdr->data_length));
1536 	mutex_exit(&idt->idt_mutex);
1537 
1538 	idm_pdu_complete(pdu, IDM_STATUS_SUCCESS);
1539 	idm_task_rele(idt);
1540 
1541 }
1542 
1543 idm_status_t
1544 idm_sorecvdata(idm_conn_t *ic, idm_pdu_t *pdu)
1545 {
1546 	uint8_t		pad[ISCSI_PAD_WORD_LEN];
1547 	int		pad_len;
1548 	uint32_t	data_digest_crc;
1549 	uint32_t	crc_calculated;
1550 	int		total_len;
1551 	idm_so_conn_t	*so_conn;
1552 
1553 	so_conn = ic->ic_transport_private;
1554 
1555 	pad_len = ((ISCSI_PAD_WORD_LEN -
1556 	    (pdu->isp_datalen & (ISCSI_PAD_WORD_LEN - 1))) &
1557 	    (ISCSI_PAD_WORD_LEN - 1));
1558 
1559 	ASSERT(pdu->isp_iovlen < (PDU_MAX_IOVLEN - 2)); /* pad + data digest */
1560 
1561 	total_len = pdu->isp_datalen;
1562 
1563 	if (pad_len) {
1564 		pdu->isp_iov[pdu->isp_iovlen].iov_base	= (char *)&pad;
1565 		pdu->isp_iov[pdu->isp_iovlen].iov_len	= pad_len;
1566 		total_len		+= pad_len;
1567 		pdu->isp_iovlen++;
1568 	}
1569 
1570 	/* setup data digest */
1571 	if ((ic->ic_conn_flags & IDM_CONN_DATA_DIGEST) != 0) {
1572 		pdu->isp_iov[pdu->isp_iovlen].iov_base =
1573 		    (char *)&data_digest_crc;
1574 		pdu->isp_iov[pdu->isp_iovlen].iov_len =
1575 		    sizeof (data_digest_crc);
1576 		total_len		+= sizeof (data_digest_crc);
1577 		pdu->isp_iovlen++;
1578 	}
1579 
1580 	pdu->isp_data = (uint8_t *)(uintptr_t)pdu->isp_iov[0].iov_base;
1581 
1582 	if (idm_iov_sorecv(so_conn->ic_so, &pdu->isp_iov[0],
1583 	    pdu->isp_iovlen, total_len) != 0) {
1584 		return (IDM_STATUS_IO);
1585 	}
1586 
1587 	if ((ic->ic_conn_flags & IDM_CONN_DATA_DIGEST) != 0) {
1588 		crc_calculated = idm_crc32c(pdu->isp_data,
1589 		    pdu->isp_datalen);
1590 		if (pad_len) {
1591 			crc_calculated = idm_crc32c_continued((char *)&pad,
1592 			    pad_len, crc_calculated);
1593 		}
1594 		if (crc_calculated != data_digest_crc) {
1595 			IDM_CONN_LOG(CE_WARN,
1596 			    "idm_sorecvdata: "
1597 			    "CRC error: actual 0x%x, calc 0x%x",
1598 			    data_digest_crc, crc_calculated);
1599 
1600 			/* Invalid Data Digest */
1601 			return (IDM_STATUS_DATA_DIGEST);
1602 		}
1603 	}
1604 
1605 	return (IDM_STATUS_SUCCESS);
1606 }
1607 
1608 /*
1609  * idm_sorecv_scsidata() is used to receive scsi data from the socket. The
1610  * Data-type PDU header must be read into the idm_pdu_t structure prior to
1611  * calling this function.
1612  */
1613 idm_status_t
1614 idm_sorecv_scsidata(idm_conn_t *ic, idm_pdu_t *pdu)
1615 {
1616 	iscsi_data_hdr_t	*bhs;
1617 	idm_task_t		*task;
1618 	uint32_t		offset;
1619 	uint8_t			opcode;
1620 	uint32_t		dlength;
1621 	list_t			*buflst;
1622 	uint32_t		xfer_bytes;
1623 	idm_status_t		status;
1624 
1625 	ASSERT(ic != NULL);
1626 	ASSERT(pdu != NULL);
1627 
1628 	bhs	= (iscsi_data_hdr_t *)pdu->isp_hdr;
1629 
1630 	offset	= ntohl(bhs->offset);
1631 	opcode	= bhs->opcode;
1632 	dlength = n2h24(bhs->dlength);
1633 
1634 	ASSERT((opcode == ISCSI_OP_SCSI_DATA_RSP) ||
1635 	    (opcode == ISCSI_OP_SCSI_DATA));
1636 
1637 	/*
1638 	 * Successful lookup implicitly gets a "hold" on the task.  This
1639 	 * hold must be released before leaving this function.  At one
1640 	 * point we were caching this task context and retaining the hold
1641 	 * but it turned out to be very difficult to release the hold properly.
1642 	 * The task can be aborted and the connection shutdown between this
1643 	 * call and the subsequent expected call to idm_so_rx_datain/
1644 	 * idm_so_rx_dataout (in which case those functions are not called).
1645 	 * Releasing the hold in the PDU callback doesn't work well either
1646 	 * because the whole task may be completed by then at which point
1647 	 * it is too late to release the hold -- for better or worse this
1648 	 * code doesn't wait on the refcnts during normal operation.
1649 	 * idm_task_find() is very fast and it is not a huge burden if we
1650 	 * have to do it twice.
1651 	 */
1652 	task = idm_task_find(ic, bhs->itt, bhs->ttt);
1653 	if (task == NULL) {
1654 		IDM_CONN_LOG(CE_WARN,
1655 		    "idm_sorecv_scsidata: could not find task");
1656 		return (IDM_STATUS_FAIL);
1657 	}
1658 
1659 	mutex_enter(&task->idt_mutex);
1660 	buflst	= (opcode == ISCSI_OP_SCSI_DATA_RSP) ?
1661 	    &task->idt_inbufv : &task->idt_outbufv;
1662 	pdu->isp_sorx_buf = idm_buf_find(buflst, offset);
1663 	mutex_exit(&task->idt_mutex);
1664 
1665 	if (pdu->isp_sorx_buf == NULL) {
1666 		idm_task_rele(task);
1667 		IDM_CONN_LOG(CE_WARN, "idm_sorecv_scsidata: could not find "
1668 		    "buffer for offset %x opcode=%x",
1669 		    offset, opcode);
1670 		return (IDM_STATUS_FAIL);
1671 	}
1672 
1673 	xfer_bytes = idm_fill_iov(pdu, pdu->isp_sorx_buf, offset, dlength);
1674 	ASSERT(xfer_bytes != 0);
1675 	if (xfer_bytes != dlength) {
1676 		idm_task_rele(task);
1677 		/*
1678 		 * Buffer overflow, connection error.  The PDU data is still
1679 		 * sitting in the socket so we can't use the connection
1680 		 * again until that data is drained.
1681 		 */
1682 		return (IDM_STATUS_FAIL);
1683 	}
1684 
1685 	status = idm_sorecvdata(ic, pdu);
1686 
1687 	idm_task_rele(task);
1688 
1689 	return (status);
1690 }
1691 
1692 static uint32_t
1693 idm_fill_iov(idm_pdu_t *pdu, idm_buf_t *idb, uint32_t ro, uint32_t dlength)
1694 {
1695 	uint32_t	buf_ro = ro - idb->idb_bufoffset;
1696 	uint32_t	xfer_len = min(dlength, idb->idb_buflen - buf_ro);
1697 
1698 	ASSERT(ro >= idb->idb_bufoffset);
1699 
1700 	pdu->isp_iov[pdu->isp_iovlen].iov_base	=
1701 	    (caddr_t)idb->idb_buf + buf_ro;
1702 	pdu->isp_iov[pdu->isp_iovlen].iov_len	= xfer_len;
1703 	pdu->isp_iovlen++;
1704 
1705 	return (xfer_len);
1706 }
1707 
1708 int
1709 idm_sorecv_nonscsidata(idm_conn_t *ic, idm_pdu_t *pdu)
1710 {
1711 	pdu->isp_data = kmem_alloc(pdu->isp_datalen, KM_SLEEP);
1712 	ASSERT(pdu->isp_data != NULL);
1713 
1714 	pdu->isp_databuflen = pdu->isp_datalen;
1715 	pdu->isp_iov[0].iov_base = (caddr_t)pdu->isp_data;
1716 	pdu->isp_iov[0].iov_len = pdu->isp_datalen;
1717 	pdu->isp_iovlen = 1;
1718 	/*
1719 	 * Since we are associating a new data buffer with this received
1720 	 * PDU we need to set a specific callback to free the data
1721 	 * after the PDU is processed.
1722 	 */
1723 	pdu->isp_flags |= IDM_PDU_ADDL_DATA;
1724 	pdu->isp_callback = idm_sorx_addl_pdu_cb;
1725 
1726 	return (idm_sorecvdata(ic, pdu));
1727 }
1728 
1729 void
1730 idm_sorx_thread(void *arg)
1731 {
1732 	boolean_t	conn_failure = B_FALSE;
1733 	idm_conn_t	*ic = (idm_conn_t *)arg;
1734 	idm_so_conn_t	*so_conn;
1735 	idm_pdu_t	*pdu;
1736 	idm_status_t	rc;
1737 
1738 	idm_conn_hold(ic);
1739 
1740 	mutex_enter(&ic->ic_mutex);
1741 
1742 	so_conn = ic->ic_transport_private;
1743 	so_conn->ic_rx_thread_running = B_TRUE;
1744 	so_conn->ic_rx_thread_did = so_conn->ic_rx_thread->t_did;
1745 	cv_signal(&ic->ic_cv);
1746 
1747 	while (so_conn->ic_rx_thread_running) {
1748 		mutex_exit(&ic->ic_mutex);
1749 
1750 		/*
1751 		 * Get PDU with default header size (large enough for
1752 		 * BHS plus any anticipated AHS).  PDU from
1753 		 * the cache will have all values set correctly
1754 		 * for sockets RX including callback.
1755 		 */
1756 		pdu = kmem_cache_alloc(idm.idm_sorx_pdu_cache, KM_SLEEP);
1757 		pdu->isp_ic = ic;
1758 		pdu->isp_flags = 0;
1759 		pdu->isp_transport_hdrlen = 0;
1760 
1761 		if ((rc = idm_sorecvhdr(ic, pdu)) != 0) {
1762 			/*
1763 			 * Call idm_pdu_complete so that we call the callback
1764 			 * and ensure any memory allocated in idm_sorecvhdr
1765 			 * gets freed up.
1766 			 */
1767 			idm_pdu_complete(pdu, IDM_STATUS_FAIL);
1768 
1769 			/*
1770 			 * If ic_rx_thread_running is still set then
1771 			 * this is some kind of connection problem
1772 			 * on the socket.  In this case we want to
1773 			 * generate an event.  Otherwise some other
1774 			 * thread closed the socket due to another
1775 			 * issue in which case we don't need to
1776 			 * generate an event.
1777 			 */
1778 			mutex_enter(&ic->ic_mutex);
1779 			if (so_conn->ic_rx_thread_running) {
1780 				conn_failure = B_TRUE;
1781 				so_conn->ic_rx_thread_running = B_FALSE;
1782 			}
1783 
1784 			continue;
1785 		}
1786 
1787 		/*
1788 		 * Header has been read and validated.  Now we need
1789 		 * to read the PDU data payload (if present).  SCSI data
1790 		 * need to be transferred from the socket directly into
1791 		 * the associated transfer buffer for the SCSI task.
1792 		 */
1793 		if (pdu->isp_datalen != 0) {
1794 			if ((IDM_PDU_OPCODE(pdu) == ISCSI_OP_SCSI_DATA) ||
1795 			    (IDM_PDU_OPCODE(pdu) == ISCSI_OP_SCSI_DATA_RSP)) {
1796 				rc = idm_sorecv_scsidata(ic, pdu);
1797 				/*
1798 				 * All SCSI errors are fatal to the
1799 				 * connection right now since we have no
1800 				 * place to put the data.  What we need
1801 				 * is some kind of sink to dispose of unwanted
1802 				 * SCSI data.  For example an invalid task tag
1803 				 * should not kill the connection (although
1804 				 * we may want to drop the connection).
1805 				 */
1806 			} else {
1807 				/*
1808 				 * Not data PDUs so allocate a buffer for the
1809 				 * data segment and read the remaining data.
1810 				 */
1811 				rc = idm_sorecv_nonscsidata(ic, pdu);
1812 			}
1813 			if (rc != 0) {
1814 				/*
1815 				 * Call idm_pdu_complete so that we call the
1816 				 * callback and ensure any memory allocated
1817 				 * in idm_sorecvhdr gets freed up.
1818 				 */
1819 				idm_pdu_complete(pdu, IDM_STATUS_FAIL);
1820 
1821 				/*
1822 				 * If ic_rx_thread_running is still set then
1823 				 * this is some kind of connection problem
1824 				 * on the socket.  In this case we want to
1825 				 * generate an event.  Otherwise some other
1826 				 * thread closed the socket due to another
1827 				 * issue in which case we don't need to
1828 				 * generate an event.
1829 				 */
1830 				mutex_enter(&ic->ic_mutex);
1831 				if (so_conn->ic_rx_thread_running) {
1832 					conn_failure = B_TRUE;
1833 					so_conn->ic_rx_thread_running = B_FALSE;
1834 				}
1835 				continue;
1836 			}
1837 		}
1838 
1839 		/*
1840 		 * Process RX PDU
1841 		 */
1842 		idm_pdu_rx(ic, pdu);
1843 
1844 		mutex_enter(&ic->ic_mutex);
1845 	}
1846 
1847 	mutex_exit(&ic->ic_mutex);
1848 
1849 	/*
1850 	 * If we dropped out of the RX processing loop because of
1851 	 * a socket problem or other connection failure (including
1852 	 * digest errors) then we need to generate a state machine
1853 	 * event to shut the connection down.
1854 	 * If the state machine is already in, for example, INIT_ERROR, this
1855 	 * event will get dropped, and the TX thread will never be notified
1856 	 * to shut down.  To be safe, we'll just notify it here.
1857 	 */
1858 	if (conn_failure) {
1859 		if (so_conn->ic_tx_thread_running) {
1860 			so_conn->ic_tx_thread_running = B_FALSE;
1861 			mutex_enter(&so_conn->ic_tx_mutex);
1862 			cv_signal(&so_conn->ic_tx_cv);
1863 			mutex_exit(&so_conn->ic_tx_mutex);
1864 		}
1865 
1866 		idm_conn_event(ic, CE_TRANSPORT_FAIL, rc);
1867 	}
1868 
1869 	idm_conn_rele(ic);
1870 
1871 	thread_exit();
1872 }
1873 
1874 /*
1875  * idm_so_tx
1876  *
1877  * This is the implementation of idm_transport_ops_t's it_tx_pdu entry
1878  * point.  By definition, it is supposed to be fast.  So, simply queue
1879  * the entry and return.  The real work is done by idm_i_so_tx() via
1880  * idm_sotx_thread().
1881  */
1882 
1883 static void
1884 idm_so_tx(idm_conn_t *ic, idm_pdu_t *pdu)
1885 {
1886 	idm_so_conn_t *so_conn = ic->ic_transport_private;
1887 
1888 	ASSERT(pdu->isp_ic == ic);
1889 	mutex_enter(&so_conn->ic_tx_mutex);
1890 
1891 	if (!so_conn->ic_tx_thread_running) {
1892 		mutex_exit(&so_conn->ic_tx_mutex);
1893 		idm_pdu_complete(pdu, IDM_STATUS_ABORTED);
1894 		return;
1895 	}
1896 
1897 	list_insert_tail(&so_conn->ic_tx_list, (void *)pdu);
1898 	cv_signal(&so_conn->ic_tx_cv);
1899 	mutex_exit(&so_conn->ic_tx_mutex);
1900 }
1901 
1902 static idm_status_t
1903 idm_i_so_tx(idm_pdu_t *pdu)
1904 {
1905 	idm_conn_t	*ic = pdu->isp_ic;
1906 	idm_status_t	status = IDM_STATUS_SUCCESS;
1907 	uint8_t		pad[ISCSI_PAD_WORD_LEN];
1908 	int		pad_len;
1909 	uint32_t	hdr_digest_crc;
1910 	uint32_t	data_digest_crc = 0;
1911 	int		total_len = 0;
1912 	int		iovlen = 0;
1913 	struct iovec	iov[6];
1914 	idm_so_conn_t	*so_conn;
1915 
1916 	so_conn = ic->ic_transport_private;
1917 
1918 	/* Setup BHS */
1919 	iov[iovlen].iov_base	= (caddr_t)pdu->isp_hdr;
1920 	iov[iovlen].iov_len	= pdu->isp_hdrlen;
1921 	total_len		+= iov[iovlen].iov_len;
1922 	iovlen++;
1923 
1924 	/* Setup header digest */
1925 	if (((pdu->isp_flags & IDM_PDU_LOGIN_TX) == 0) &&
1926 	    (ic->ic_conn_flags & IDM_CONN_HEADER_DIGEST)) {
1927 		hdr_digest_crc = idm_crc32c(pdu->isp_hdr, pdu->isp_hdrlen);
1928 
1929 		iov[iovlen].iov_base	= (caddr_t)&hdr_digest_crc;
1930 		iov[iovlen].iov_len	= sizeof (hdr_digest_crc);
1931 		total_len		+= iov[iovlen].iov_len;
1932 		iovlen++;
1933 	}
1934 
1935 	/* Setup the data */
1936 	if (pdu->isp_datalen) {
1937 		idm_task_t		*idt;
1938 		idm_buf_t		*idb;
1939 		iscsi_data_hdr_t	*ihp;
1940 		ihp = (iscsi_data_hdr_t *)pdu->isp_hdr;
1941 		/* Write of immediate data */
1942 		if (ic->ic_ffp &&
1943 		    (ihp->opcode == ISCSI_OP_SCSI_CMD ||
1944 		    ihp->opcode == ISCSI_OP_SCSI_DATA)) {
1945 			idt = idm_task_find(ic, ihp->itt, ihp->ttt);
1946 			if (idt) {
1947 				mutex_enter(&idt->idt_mutex);
1948 				idb = idm_buf_find(&idt->idt_outbufv, 0);
1949 				mutex_exit(&idt->idt_mutex);
1950 				/*
1951 				 * If the initiator call to idm_buf_alloc
1952 				 * failed then we can get to this point
1953 				 * without a bound buffer.  The associated
1954 				 * connection failure will clean things up
1955 				 * later.  It would be nice to come up with
1956 				 * a cleaner way to handle this.  In
1957 				 * particular it seems absurd to look up
1958 				 * the task and the buffer just to update
1959 				 * this counter.
1960 				 */
1961 				if (idb)
1962 					idb->idb_xfer_len += pdu->isp_datalen;
1963 				idm_task_rele(idt);
1964 			}
1965 		}
1966 
1967 		iov[iovlen].iov_base = (caddr_t)pdu->isp_data;
1968 		iov[iovlen].iov_len  = pdu->isp_datalen;
1969 		total_len += iov[iovlen].iov_len;
1970 		iovlen++;
1971 	}
1972 
1973 	/* Setup the data pad if necessary */
1974 	pad_len = ((ISCSI_PAD_WORD_LEN -
1975 	    (pdu->isp_datalen & (ISCSI_PAD_WORD_LEN - 1))) &
1976 	    (ISCSI_PAD_WORD_LEN - 1));
1977 
1978 	if (pad_len) {
1979 		bzero(pad, sizeof (pad));
1980 		iov[iovlen].iov_base = (void *)&pad;
1981 		iov[iovlen].iov_len  = pad_len;
1982 		total_len		+= iov[iovlen].iov_len;
1983 		iovlen++;
1984 	}
1985 
1986 	/*
1987 	 * Setup the data digest if enabled.  Data-digest is not sent
1988 	 * for login-phase PDUs.
1989 	 */
1990 	if ((ic->ic_conn_flags & IDM_CONN_DATA_DIGEST) &&
1991 	    ((pdu->isp_flags & IDM_PDU_LOGIN_TX) == 0) &&
1992 	    (pdu->isp_datalen || pad_len)) {
1993 		/*
1994 		 * RFC3720/10.2.3: A zero-length Data Segment also
1995 		 * implies a zero-length data digest.
1996 		 */
1997 		if (pdu->isp_datalen) {
1998 			data_digest_crc = idm_crc32c(pdu->isp_data,
1999 			    pdu->isp_datalen);
2000 		}
2001 		if (pad_len) {
2002 			data_digest_crc = idm_crc32c_continued(&pad,
2003 			    pad_len, data_digest_crc);
2004 		}
2005 
2006 		iov[iovlen].iov_base	= (caddr_t)&data_digest_crc;
2007 		iov[iovlen].iov_len	= sizeof (data_digest_crc);
2008 		total_len		+= iov[iovlen].iov_len;
2009 		iovlen++;
2010 	}
2011 
2012 	/* Transmit the PDU */
2013 	if (idm_iov_sosend(so_conn->ic_so, &iov[0], iovlen,
2014 	    total_len) != 0) {
2015 		/* Set error status */
2016 		IDM_CONN_LOG(CE_WARN,
2017 		    "idm_so_tx: failed to transmit the PDU, so: %p ic: %p "
2018 		    "data: %p", (void *) so_conn->ic_so, (void *) ic,
2019 		    (void *) pdu->isp_data);
2020 		status = IDM_STATUS_IO;
2021 	}
2022 
2023 	/*
2024 	 * Success does not mean that the PDU actually reached the
2025 	 * remote node since it could get dropped along the way.
2026 	 */
2027 	idm_pdu_complete(pdu, status);
2028 
2029 	return (status);
2030 }
2031 
2032 /*
2033  * The idm_so_buf_tx_to_ini() is used by the target iSCSI layer to transmit the
2034  * Data-In PDUs using sockets. Based on the negotiated MaxRecvDataSegmentLength,
2035  * the buffer is segmented into a sequence of Data-In PDUs, ordered by DataSN.
2036  * A target can invoke this function multiple times for a single read command
2037  * (identified by the same ITT) to split the input into several sequences.
2038  *
2039  * DataSN starts with 0 for the first data PDU of an input command and advances
2040  * by 1 for each subsequent data PDU. Each sequence will have its own F bit,
2041  * which is set to 1 for the last data PDU of a sequence.
2042  *
2043  * Scope for Prototype build:
2044  * The data PDUs within a sequence will be sent in order with the buffer offset
2045  * in increasing order. i.e. initiator and target must have negotiated the
2046  * "DataPDUInOrder" to "Yes". The order between sequences is not enforced.
2047  *
2048  * Caller holds idt->idt_mutex
2049  */
2050 static idm_status_t
2051 idm_so_buf_tx_to_ini(idm_task_t *idt, idm_buf_t *idb)
2052 {
2053 	idm_so_conn_t	*so_conn = idb->idb_ic->ic_transport_private;
2054 	idm_pdu_t	tmppdu;
2055 
2056 	ASSERT(mutex_owned(&idt->idt_mutex));
2057 
2058 	/*
2059 	 * Put the idm_buf_t on the tx queue.  It will be transmitted by
2060 	 * idm_sotx_thread.
2061 	 */
2062 	mutex_enter(&so_conn->ic_tx_mutex);
2063 
2064 	if (!so_conn->ic_tx_thread_running) {
2065 		mutex_exit(&so_conn->ic_tx_mutex);
2066 		/*
2067 		 * Don't release idt->idt_mutex since we're supposed to hold
2068 		 * in when calling idm_buf_tx_to_ini_done
2069 		 */
2070 		idm_buf_tx_to_ini_done(idt, idb, IDM_STATUS_ABORTED);
2071 		return (IDM_STATUS_FAIL);
2072 	}
2073 
2074 	/*
2075 	 * Build a template for the data PDU headers we will use so that
2076 	 * the SN values will stay consistent with other PDU's we are
2077 	 * transmitting like R2T and SCSI status.
2078 	 */
2079 	bzero(&idb->idb_data_hdr_tmpl, sizeof (iscsi_hdr_t));
2080 	tmppdu.isp_hdr = &idb->idb_data_hdr_tmpl;
2081 	(*idt->idt_ic->ic_conn_ops.icb_build_hdr)(idt, &tmppdu,
2082 	    ISCSI_OP_SCSI_DATA_RSP);
2083 	idb->idb_tx_thread = B_TRUE;
2084 	list_insert_tail(&so_conn->ic_tx_list, (void *)idb);
2085 	cv_signal(&so_conn->ic_tx_cv);
2086 	mutex_exit(&so_conn->ic_tx_mutex);
2087 	mutex_exit(&idt->idt_mutex);
2088 
2089 	/*
2090 	 * Returning success here indicates the transfer was successfully
2091 	 * dispatched -- it does not mean that the transfer completed
2092 	 * successfully.
2093 	 */
2094 	return (IDM_STATUS_SUCCESS);
2095 }
2096 
2097 /*
2098  * The idm_so_buf_rx_from_ini() is used by the target iSCSI layer to specify the
2099  * data blocks it is ready to receive from the initiator in response to a WRITE
2100  * SCSI command. The target iSCSI layer passes the information about the desired
2101  * data blocks to the initiator in one R2T PDU. The receiving buffer, the buffer
2102  * offset and datalen are passed via the 'idb' argument.
2103  *
2104  * Scope for Prototype build:
2105  * R2Ts are required for any Data-Out PDU, i.e. initiator and target must have
2106  * negotiated the "InitialR2T" to "Yes".
2107  *
2108  * Caller holds idt->idt_mutex
2109  */
2110 static idm_status_t
2111 idm_so_buf_rx_from_ini(idm_task_t *idt, idm_buf_t *idb)
2112 {
2113 	idm_pdu_t		*pdu;
2114 	iscsi_rtt_hdr_t		*rtt;
2115 
2116 	ASSERT(mutex_owned(&idt->idt_mutex));
2117 
2118 	pdu = kmem_cache_alloc(idm.idm_sotx_pdu_cache, KM_SLEEP);
2119 	pdu->isp_ic = idt->idt_ic;
2120 	bzero(pdu->isp_hdr, sizeof (iscsi_rtt_hdr_t));
2121 
2122 	/* iSCSI layer fills the TTT, ITT, StatSN, ExpCmdSN, MaxCmdSN */
2123 	(*idt->idt_ic->ic_conn_ops.icb_build_hdr)(idt, pdu, ISCSI_OP_RTT_RSP);
2124 
2125 	/* set the rttsn, rtt.flags, rtt.data_offset and rtt.data_length */
2126 	rtt = (iscsi_rtt_hdr_t *)(pdu->isp_hdr);
2127 
2128 	rtt->opcode		= ISCSI_OP_RTT_RSP;
2129 	rtt->flags		= ISCSI_FLAG_FINAL;
2130 	rtt->data_offset	= htonl(idb->idb_bufoffset);
2131 	rtt->data_length	= htonl(idb->idb_xfer_len);
2132 	rtt->rttsn		= htonl(idt->idt_exp_rttsn++);
2133 
2134 	/* Keep track of buffer offsets */
2135 	idb->idb_exp_offset	= idb->idb_bufoffset;
2136 	mutex_exit(&idt->idt_mutex);
2137 
2138 	/*
2139 	 * Transmit the PDU.
2140 	 */
2141 	idm_pdu_tx(pdu);
2142 
2143 	return (IDM_STATUS_SUCCESS);
2144 }
2145 
2146 static idm_status_t
2147 idm_so_buf_alloc(idm_buf_t *idb, uint64_t buflen)
2148 {
2149 	idb->idb_buf = kmem_alloc(buflen, KM_NOSLEEP);
2150 	if (idb->idb_buf == NULL) {
2151 		IDM_CONN_LOG(CE_NOTE,
2152 		    "idm_so_buf_alloc: failed buffer allocation");
2153 		return (IDM_STATUS_FAIL);
2154 	}
2155 	return (IDM_STATUS_SUCCESS);
2156 }
2157 
2158 /* ARGSUSED */
2159 static idm_status_t
2160 idm_so_buf_setup(idm_buf_t *idb)
2161 {
2162 	/* Ensure bufalloc'd flag is unset */
2163 	idb->idb_bufalloc = B_FALSE;
2164 
2165 	return (IDM_STATUS_SUCCESS);
2166 }
2167 
2168 /* ARGSUSED */
2169 static void
2170 idm_so_buf_teardown(idm_buf_t *idb)
2171 {
2172 	/* nothing to do here */
2173 }
2174 
2175 static void
2176 idm_so_buf_free(idm_buf_t *idb)
2177 {
2178 	kmem_free(idb->idb_buf, idb->idb_buflen);
2179 }
2180 
2181 static void
2182 idm_so_send_rtt_data(idm_conn_t *ic, idm_task_t *idt, idm_buf_t *idb,
2183     uint32_t offset, uint32_t length)
2184 {
2185 	idm_so_conn_t	*so_conn = ic->ic_transport_private;
2186 	idm_pdu_t	tmppdu;
2187 	idm_buf_t	*rtt_buf;
2188 
2189 	ASSERT(mutex_owned(&idt->idt_mutex));
2190 
2191 	/*
2192 	 * Allocate a buffer to represent the RTT transfer.  We could further
2193 	 * optimize this by allocating the buffers internally from an rtt
2194 	 * specific buffer cache since this is socket-specific code but for
2195 	 * now we will keep it simple.
2196 	 */
2197 	rtt_buf = idm_buf_alloc(ic, (uint8_t *)idb->idb_buf + offset, length);
2198 	if (rtt_buf == NULL) {
2199 		/*
2200 		 * If we're in FFP then the failure was likely a resource
2201 		 * allocation issue and we should close the connection by
2202 		 * sending a CE_TRANSPORT_FAIL event.
2203 		 *
2204 		 * If we're not in FFP then idm_buf_alloc will always
2205 		 * fail and the state is transitioning to "complete" anyway
2206 		 * so we won't bother to send an event.
2207 		 */
2208 		mutex_enter(&ic->ic_state_mutex);
2209 		if (ic->ic_ffp)
2210 			idm_conn_event_locked(ic, CE_TRANSPORT_FAIL,
2211 			    NULL, CT_NONE);
2212 		mutex_exit(&ic->ic_state_mutex);
2213 		return;
2214 	}
2215 
2216 	rtt_buf->idb_buf_cb = NULL;
2217 	rtt_buf->idb_cb_arg = NULL;
2218 	rtt_buf->idb_bufoffset = offset;
2219 	rtt_buf->idb_xfer_len = length;
2220 	rtt_buf->idb_ic = idt->idt_ic;
2221 	rtt_buf->idb_task_binding = idt;
2222 
2223 	/*
2224 	 * Put the idm_buf_t on the tx queue.  It will be transmitted by
2225 	 * idm_sotx_thread.
2226 	 */
2227 	mutex_enter(&so_conn->ic_tx_mutex);
2228 
2229 	if (!so_conn->ic_tx_thread_running) {
2230 		idm_buf_free(rtt_buf);
2231 		mutex_exit(&so_conn->ic_tx_mutex);
2232 		return;
2233 	}
2234 
2235 	/*
2236 	 * This new buffer represents an additional reference on the task
2237 	 */
2238 	idm_task_hold(idt);
2239 
2240 	/*
2241 	 * Build a template for the data PDU headers we will use so that
2242 	 * the SN values will stay consistent with other PDU's we are
2243 	 * transmitting like R2T and SCSI status.
2244 	 */
2245 	bzero(&rtt_buf->idb_data_hdr_tmpl, sizeof (iscsi_hdr_t));
2246 	tmppdu.isp_hdr = &rtt_buf->idb_data_hdr_tmpl;
2247 	(*idt->idt_ic->ic_conn_ops.icb_build_hdr)(idt, &tmppdu,
2248 	    ISCSI_OP_SCSI_DATA);
2249 	rtt_buf->idb_tx_thread = B_TRUE;
2250 	rtt_buf->idb_in_transport = B_TRUE;
2251 	list_insert_tail(&so_conn->ic_tx_list, (void *)rtt_buf);
2252 	cv_signal(&so_conn->ic_tx_cv);
2253 	mutex_exit(&so_conn->ic_tx_mutex);
2254 }
2255 
2256 static void
2257 idm_so_send_rtt_data_done(idm_task_t *idt, idm_buf_t *idb)
2258 {
2259 	/*
2260 	 * Don't worry about status -- we assume any error handling
2261 	 * is performed by the caller (idm_sotx_thread).
2262 	 */
2263 	idb->idb_in_transport = B_FALSE;
2264 	idm_task_rele(idt);
2265 	idm_buf_free(idb);
2266 }
2267 
2268 static idm_status_t
2269 idm_so_send_buf_region(idm_task_t *idt, idm_buf_t *idb,
2270     uint32_t buf_region_offset, uint32_t buf_region_length)
2271 {
2272 	idm_conn_t		*ic;
2273 	uint32_t		max_dataseglen;
2274 	size_t			remainder, chunk;
2275 	uint32_t		data_offset = buf_region_offset;
2276 	iscsi_data_hdr_t	*bhs;
2277 	idm_pdu_t		*pdu;
2278 	idm_status_t		tx_status;
2279 
2280 	ASSERT(mutex_owned(&idt->idt_mutex));
2281 
2282 	ic = idt->idt_ic;
2283 
2284 	max_dataseglen = 8192; /* Need value from login negotiation */
2285 	remainder = buf_region_length;
2286 
2287 	while (remainder) {
2288 		if (idt->idt_state != TASK_ACTIVE) {
2289 			ASSERT((idt->idt_state != TASK_IDLE) &&
2290 			    (idt->idt_state != TASK_COMPLETE));
2291 			return (IDM_STATUS_ABORTED);
2292 		}
2293 
2294 		/* check to see if we need to chunk the data */
2295 		if (remainder > max_dataseglen) {
2296 			chunk = max_dataseglen;
2297 		} else {
2298 			chunk = remainder;
2299 		}
2300 
2301 		/* Data PDU headers will always be sizeof (iscsi_hdr_t) */
2302 		pdu = kmem_cache_alloc(idm.idm_sotx_pdu_cache, KM_SLEEP);
2303 		pdu->isp_ic = ic;
2304 
2305 		/*
2306 		 * We've already built a build a header template
2307 		 * to use during the transfer.  Use this template so that
2308 		 * the SN values stay consistent with any unrelated PDU's
2309 		 * being transmitted.
2310 		 */
2311 		bcopy(&idb->idb_data_hdr_tmpl, pdu->isp_hdr,
2312 		    sizeof (iscsi_hdr_t));
2313 
2314 		/*
2315 		 * Set DataSN, data offset, and flags in BHS
2316 		 * For the prototype build, A = 0, S = 0, U = 0
2317 		 */
2318 		bhs = (iscsi_data_hdr_t *)(pdu->isp_hdr);
2319 
2320 		bhs->datasn		= htonl(idt->idt_exp_datasn++);
2321 
2322 		hton24(bhs->dlength, chunk);
2323 		bhs->offset = htonl(idb->idb_bufoffset + data_offset);
2324 
2325 		if (chunk == remainder) {
2326 			bhs->flags = ISCSI_FLAG_FINAL; /* F bit set to 1 */
2327 		}
2328 
2329 		/* setup data */
2330 		pdu->isp_data	=  (uint8_t *)idb->idb_buf + data_offset;
2331 		pdu->isp_datalen = (uint_t)chunk;
2332 		remainder	-= chunk;
2333 		data_offset	+= chunk;
2334 
2335 		/*
2336 		 * Now that we're done working with idt_exp_datasn,
2337 		 * idt->idt_state and idb->idb_bufoffset we can release
2338 		 * the task lock -- don't want to hold it across the
2339 		 * call to idm_i_so_tx since we could block.
2340 		 */
2341 		mutex_exit(&idt->idt_mutex);
2342 
2343 		/*
2344 		 * Transmit the PDU.  Call the internal routine directly
2345 		 * as there is already implicit ordering.
2346 		 */
2347 		if ((tx_status = idm_i_so_tx(pdu)) != IDM_STATUS_SUCCESS) {
2348 			mutex_enter(&idt->idt_mutex);
2349 			return (tx_status);
2350 		}
2351 
2352 		mutex_enter(&idt->idt_mutex);
2353 		idt->idt_tx_bytes += chunk;
2354 	}
2355 
2356 	return (IDM_STATUS_SUCCESS);
2357 }
2358 
2359 /*
2360  * TX PDU cache
2361  */
2362 /* ARGSUSED */
2363 int
2364 idm_sotx_pdu_constructor(void *hdl, void *arg, int flags)
2365 {
2366 	idm_pdu_t	*pdu = hdl;
2367 
2368 	bzero(pdu, sizeof (idm_pdu_t));
2369 	pdu->isp_hdr = (iscsi_hdr_t *)(pdu + 1); /* Ptr arithmetic */
2370 	pdu->isp_hdrlen = sizeof (iscsi_hdr_t);
2371 	pdu->isp_callback = idm_sotx_cache_pdu_cb;
2372 	pdu->isp_magic = IDM_PDU_MAGIC;
2373 	bzero(pdu->isp_hdr, sizeof (iscsi_hdr_t));
2374 
2375 	return (0);
2376 }
2377 
2378 /* ARGSUSED */
2379 void
2380 idm_sotx_cache_pdu_cb(idm_pdu_t *pdu, idm_status_t status)
2381 {
2382 	/* reset values between use */
2383 	pdu->isp_datalen = 0;
2384 
2385 	kmem_cache_free(idm.idm_sotx_pdu_cache, pdu);
2386 }
2387 
2388 /*
2389  * RX PDU cache
2390  */
2391 /* ARGSUSED */
2392 int
2393 idm_sorx_pdu_constructor(void *hdl, void *arg, int flags)
2394 {
2395 	idm_pdu_t	*pdu = hdl;
2396 
2397 	bzero(pdu, sizeof (idm_pdu_t));
2398 	pdu->isp_magic = IDM_PDU_MAGIC;
2399 	pdu->isp_hdr = (iscsi_hdr_t *)(pdu + 1); /* Ptr arithmetic */
2400 	pdu->isp_callback = idm_sorx_cache_pdu_cb;
2401 
2402 	return (0);
2403 }
2404 
2405 /* ARGSUSED */
2406 static void
2407 idm_sorx_cache_pdu_cb(idm_pdu_t *pdu, idm_status_t status)
2408 {
2409 	pdu->isp_iovlen = 0;
2410 	pdu->isp_sorx_buf = 0;
2411 	kmem_cache_free(idm.idm_sorx_pdu_cache, pdu);
2412 }
2413 
2414 static void
2415 idm_sorx_addl_pdu_cb(idm_pdu_t *pdu, idm_status_t status)
2416 {
2417 	/*
2418 	 * We had to modify our cached RX PDU with a longer header buffer
2419 	 * and/or a longer data buffer.  Release the new buffers and fix
2420 	 * the fields back to what we would expect for a cached RX PDU.
2421 	 */
2422 	if (pdu->isp_flags & IDM_PDU_ADDL_HDR) {
2423 		kmem_free(pdu->isp_hdr, pdu->isp_hdrlen);
2424 	}
2425 	if (pdu->isp_flags & IDM_PDU_ADDL_DATA) {
2426 		kmem_free(pdu->isp_data, pdu->isp_datalen);
2427 	}
2428 	pdu->isp_hdr = (iscsi_hdr_t *)(pdu + 1);
2429 	pdu->isp_hdrlen = sizeof (iscsi_hdr_t);
2430 	pdu->isp_data = NULL;
2431 	pdu->isp_datalen = 0;
2432 	pdu->isp_sorx_buf = 0;
2433 	pdu->isp_callback = idm_sorx_cache_pdu_cb;
2434 	idm_sorx_cache_pdu_cb(pdu, status);
2435 }
2436 
2437 /*
2438  * This thread is only active when I/O is queued for transmit
2439  * because the socket is busy.
2440  */
2441 void
2442 idm_sotx_thread(void *arg)
2443 {
2444 	idm_conn_t	*ic = arg;
2445 	idm_tx_obj_t	*object, *next;
2446 	idm_so_conn_t	*so_conn;
2447 	idm_status_t	status = IDM_STATUS_SUCCESS;
2448 
2449 	idm_conn_hold(ic);
2450 
2451 	mutex_enter(&ic->ic_mutex);
2452 	so_conn = ic->ic_transport_private;
2453 	so_conn->ic_tx_thread_running = B_TRUE;
2454 	so_conn->ic_tx_thread_did = so_conn->ic_tx_thread->t_did;
2455 	cv_signal(&ic->ic_cv);
2456 	mutex_exit(&ic->ic_mutex);
2457 
2458 	mutex_enter(&so_conn->ic_tx_mutex);
2459 
2460 	while (so_conn->ic_tx_thread_running) {
2461 		while (list_is_empty(&so_conn->ic_tx_list)) {
2462 			DTRACE_PROBE1(soconn__tx__sleep, idm_conn_t *, ic);
2463 			cv_wait(&so_conn->ic_tx_cv, &so_conn->ic_tx_mutex);
2464 			DTRACE_PROBE1(soconn__tx__wakeup, idm_conn_t *, ic);
2465 
2466 			if (!so_conn->ic_tx_thread_running) {
2467 				goto tx_bail;
2468 			}
2469 		}
2470 
2471 		object = (idm_tx_obj_t *)list_head(&so_conn->ic_tx_list);
2472 		list_remove(&so_conn->ic_tx_list, object);
2473 		mutex_exit(&so_conn->ic_tx_mutex);
2474 
2475 		switch (object->idm_tx_obj_magic) {
2476 		case IDM_PDU_MAGIC:
2477 			DTRACE_PROBE2(soconn__tx__pdu, idm_conn_t *, ic,
2478 			    idm_pdu_t *, (idm_pdu_t *)object);
2479 
2480 			status = idm_i_so_tx((idm_pdu_t *)object);
2481 			break;
2482 
2483 		case IDM_BUF_MAGIC: {
2484 			idm_buf_t *idb = (idm_buf_t *)object;
2485 			idm_task_t *idt = idb->idb_task_binding;
2486 
2487 			DTRACE_PROBE2(soconn__tx__buf, idm_conn_t *, ic,
2488 			    idm_buf_t *, idb);
2489 
2490 			mutex_enter(&idt->idt_mutex);
2491 			status = idm_so_send_buf_region(idt,
2492 			    idb, 0, idb->idb_xfer_len);
2493 
2494 			/*
2495 			 * TX thread owns the buffer so we expect it to
2496 			 * be "in transport"
2497 			 */
2498 			ASSERT(idb->idb_in_transport);
2499 			if (IDM_CONN_ISTGT(ic)) {
2500 				/*
2501 				 * idm_buf_tx_to_ini_done releases
2502 				 * idt->idt_mutex
2503 				 */
2504 				idm_buf_tx_to_ini_done(idt, idb, status);
2505 			} else {
2506 				idm_so_send_rtt_data_done(idt, idb);
2507 				mutex_exit(&idt->idt_mutex);
2508 			}
2509 			break;
2510 		}
2511 
2512 		default:
2513 			IDM_CONN_LOG(CE_WARN, "idm_sotx_thread: Unknown magic "
2514 			    "(0x%08x)", object->idm_tx_obj_magic);
2515 			status = IDM_STATUS_FAIL;
2516 		}
2517 
2518 		mutex_enter(&so_conn->ic_tx_mutex);
2519 
2520 		if (status != IDM_STATUS_SUCCESS) {
2521 			so_conn->ic_tx_thread_running = B_FALSE;
2522 			idm_conn_event(ic, CE_TRANSPORT_FAIL, status);
2523 		}
2524 	}
2525 
2526 	/*
2527 	 * Before we leave, we need to abort every item remaining in the
2528 	 * TX list.
2529 	 */
2530 
2531 tx_bail:
2532 	object = (idm_tx_obj_t *)list_head(&so_conn->ic_tx_list);
2533 
2534 	while (object != NULL) {
2535 		next = list_next(&so_conn->ic_tx_list, object);
2536 
2537 		list_remove(&so_conn->ic_tx_list, object);
2538 		switch (object->idm_tx_obj_magic) {
2539 		case IDM_PDU_MAGIC:
2540 			idm_pdu_complete((idm_pdu_t *)object,
2541 			    IDM_STATUS_ABORTED);
2542 			break;
2543 
2544 		case IDM_BUF_MAGIC: {
2545 			idm_buf_t *idb = (idm_buf_t *)object;
2546 			idm_task_t *idt = idb->idb_task_binding;
2547 			mutex_exit(&so_conn->ic_tx_mutex);
2548 			mutex_enter(&idt->idt_mutex);
2549 			/*
2550 			 * TX thread owns the buffer so we expect it to
2551 			 * be "in transport"
2552 			 */
2553 			ASSERT(idb->idb_in_transport);
2554 			if (IDM_CONN_ISTGT(ic)) {
2555 				/*
2556 				 * idm_buf_tx_to_ini_done releases
2557 				 * idt->idt_mutex
2558 				 */
2559 				idm_buf_tx_to_ini_done(idt, idb,
2560 				    IDM_STATUS_ABORTED);
2561 			} else {
2562 				idm_so_send_rtt_data_done(idt, idb);
2563 				mutex_exit(&idt->idt_mutex);
2564 			}
2565 			mutex_enter(&so_conn->ic_tx_mutex);
2566 			break;
2567 		}
2568 		default:
2569 			IDM_CONN_LOG(CE_WARN,
2570 			    "idm_sotx_thread: Unexpected magic "
2571 			    "(0x%08x)", object->idm_tx_obj_magic);
2572 		}
2573 
2574 		object = next;
2575 	}
2576 
2577 	mutex_exit(&so_conn->ic_tx_mutex);
2578 	idm_conn_rele(ic);
2579 	thread_exit();
2580 	/*NOTREACHED*/
2581 }
2582