xref: /titanic_52/usr/src/uts/common/io/idm/idm_so.c (revision 8780f632c8794e526157dc18c87834b2cc4f6592)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #include <sys/conf.h>
27 #include <sys/stat.h>
28 #include <sys/file.h>
29 #include <sys/ddi.h>
30 #include <sys/sunddi.h>
31 #include <sys/modctl.h>
32 #include <sys/priv.h>
33 #include <sys/cpuvar.h>
34 #include <sys/socket.h>
35 #include <sys/strsubr.h>
36 #include <sys/sysmacros.h>
37 #include <sys/sdt.h>
38 #include <netinet/tcp.h>
39 #include <inet/tcp.h>
40 #include <sys/socketvar.h>
41 #include <sys/pathname.h>
42 #include <sys/fs/snode.h>
43 #include <sys/fs/dv_node.h>
44 #include <sys/vnode.h>
45 #include <netinet/in.h>
46 #include <net/if.h>
47 #include <sys/sockio.h>
48 #include <sys/ksocket.h>
49 #include <sys/idm/idm.h>
50 #include <sys/idm/idm_so.h>
51 #include <sys/idm/idm_text.h>
52 
53 #define	IN_PROGRESS_DELAY	1
54 
55 /*
56  * in6addr_any is currently all zeroes, but use the macro in case this
57  * ever changes.
58  */
59 static const struct in6_addr in6addr_any = IN6ADDR_ANY_INIT;
60 
61 static void idm_sorx_cache_pdu_cb(idm_pdu_t *pdu, idm_status_t status);
62 static void idm_sorx_addl_pdu_cb(idm_pdu_t *pdu, idm_status_t status);
63 static void idm_sotx_cache_pdu_cb(idm_pdu_t *pdu, idm_status_t status);
64 
65 static idm_status_t idm_so_conn_create_common(idm_conn_t *ic, ksocket_t new_so);
66 static void idm_so_conn_destroy_common(idm_conn_t *ic);
67 static void idm_so_conn_connect_common(idm_conn_t *ic);
68 
69 static void idm_set_ini_preconnect_options(idm_so_conn_t *sc);
70 static void idm_set_ini_postconnect_options(idm_so_conn_t *sc);
71 static void idm_set_tgt_connect_options(ksocket_t so);
72 static idm_status_t idm_i_so_tx(idm_pdu_t *pdu);
73 
74 static idm_status_t idm_sorecvdata(idm_conn_t *ic, idm_pdu_t *pdu);
75 static void idm_so_send_rtt_data(idm_conn_t *ic, idm_task_t *idt,
76     idm_buf_t *idb, uint32_t offset, uint32_t length);
77 static void idm_so_send_rtt_data_done(idm_task_t *idt, idm_buf_t *idb);
78 static idm_status_t idm_so_send_buf_region(idm_task_t *idt,
79     idm_buf_t *idb, uint32_t buf_region_offset, uint32_t buf_region_length);
80 
81 static uint32_t idm_fill_iov(idm_pdu_t *pdu, idm_buf_t *idb,
82     uint32_t ro, uint32_t dlength);
83 
84 static idm_status_t idm_so_handle_digest(idm_conn_t *it,
85     nvpair_t *digest_choice, const idm_kv_xlate_t *ikvx);
86 
87 static void idm_so_socket_set_nonblock(struct sonode *node);
88 static void idm_so_socket_set_block(struct sonode *node);
89 
90 /*
91  * Transport ops prototypes
92  */
93 static void idm_so_tx(idm_conn_t *ic, idm_pdu_t *pdu);
94 static idm_status_t idm_so_buf_tx_to_ini(idm_task_t *idt, idm_buf_t *idb);
95 static idm_status_t idm_so_buf_rx_from_ini(idm_task_t *idt, idm_buf_t *idb);
96 static void idm_so_rx_datain(idm_conn_t *ic, idm_pdu_t *pdu);
97 static void idm_so_rx_rtt(idm_conn_t *ic, idm_pdu_t *pdu);
98 static void idm_so_rx_dataout(idm_conn_t *ic, idm_pdu_t *pdu);
99 static idm_status_t idm_so_free_task_rsrc(idm_task_t *idt);
100 static kv_status_t idm_so_negotiate_key_values(idm_conn_t *it,
101     nvlist_t *request_nvl, nvlist_t *response_nvl, nvlist_t *negotiated_nvl);
102 static void idm_so_notice_key_values(idm_conn_t *it,
103     nvlist_t *negotiated_nvl);
104 static boolean_t idm_so_conn_is_capable(idm_conn_req_t *ic,
105     idm_transport_caps_t *caps);
106 static idm_status_t idm_so_buf_alloc(idm_buf_t *idb, uint64_t buflen);
107 static void idm_so_buf_free(idm_buf_t *idb);
108 static idm_status_t idm_so_buf_setup(idm_buf_t *idb);
109 static void idm_so_buf_teardown(idm_buf_t *idb);
110 static idm_status_t idm_so_tgt_svc_create(idm_svc_req_t *sr, idm_svc_t *is);
111 static void idm_so_tgt_svc_destroy(idm_svc_t *is);
112 static idm_status_t idm_so_tgt_svc_online(idm_svc_t *is);
113 static void idm_so_tgt_svc_offline(idm_svc_t *is);
114 static void idm_so_tgt_conn_destroy(idm_conn_t *ic);
115 static idm_status_t idm_so_tgt_conn_connect(idm_conn_t *ic);
116 static void idm_so_conn_disconnect(idm_conn_t *ic);
117 static idm_status_t idm_so_ini_conn_create(idm_conn_req_t *cr, idm_conn_t *ic);
118 static void idm_so_ini_conn_destroy(idm_conn_t *ic);
119 static idm_status_t idm_so_ini_conn_connect(idm_conn_t *ic);
120 
121 /*
122  * IDM Native Sockets transport operations
123  */
124 static
125 idm_transport_ops_t idm_so_transport_ops = {
126 	idm_so_tx,			/* it_tx_pdu */
127 	idm_so_buf_tx_to_ini,		/* it_buf_tx_to_ini */
128 	idm_so_buf_rx_from_ini,		/* it_buf_rx_from_ini */
129 	idm_so_rx_datain,		/* it_rx_datain */
130 	idm_so_rx_rtt,			/* it_rx_rtt */
131 	idm_so_rx_dataout,		/* it_rx_dataout */
132 	NULL,				/* it_alloc_conn_rsrc */
133 	NULL,				/* it_free_conn_rsrc */
134 	NULL,				/* it_tgt_enable_datamover */
135 	NULL,				/* it_ini_enable_datamover */
136 	NULL,				/* it_conn_terminate */
137 	idm_so_free_task_rsrc,		/* it_free_task_rsrc */
138 	idm_so_negotiate_key_values,	/* it_negotiate_key_values */
139 	idm_so_notice_key_values,	/* it_notice_key_values */
140 	idm_so_conn_is_capable,		/* it_conn_is_capable */
141 	idm_so_buf_alloc,		/* it_buf_alloc */
142 	idm_so_buf_free,		/* it_buf_free */
143 	idm_so_buf_setup,		/* it_buf_setup */
144 	idm_so_buf_teardown,		/* it_buf_teardown */
145 	idm_so_tgt_svc_create,		/* it_tgt_svc_create */
146 	idm_so_tgt_svc_destroy,		/* it_tgt_svc_destroy */
147 	idm_so_tgt_svc_online,		/* it_tgt_svc_online */
148 	idm_so_tgt_svc_offline,		/* it_tgt_svc_offline */
149 	idm_so_tgt_conn_destroy,	/* it_tgt_conn_destroy */
150 	idm_so_tgt_conn_connect,	/* it_tgt_conn_connect */
151 	idm_so_conn_disconnect,		/* it_tgt_conn_disconnect */
152 	idm_so_ini_conn_create,		/* it_ini_conn_create */
153 	idm_so_ini_conn_destroy,	/* it_ini_conn_destroy */
154 	idm_so_ini_conn_connect,	/* it_ini_conn_connect */
155 	idm_so_conn_disconnect		/* it_ini_conn_disconnect */
156 };
157 
158 /*
159  * idm_so_init()
160  * Sockets transport initialization
161  */
162 void
163 idm_so_init(idm_transport_t *it)
164 {
165 	/* Cache for IDM Data and R2T Transmit PDU's */
166 	idm.idm_sotx_pdu_cache = kmem_cache_create("idm_tx_pdu_cache",
167 	    sizeof (idm_pdu_t) + sizeof (iscsi_hdr_t), 8,
168 	    &idm_sotx_pdu_constructor, NULL, NULL, NULL, NULL, KM_SLEEP);
169 
170 	/* Cache for IDM Receive PDU's */
171 	idm.idm_sorx_pdu_cache = kmem_cache_create("idm_rx_pdu_cache",
172 	    sizeof (idm_pdu_t) + IDM_SORX_CACHE_HDRLEN, 8,
173 	    &idm_sorx_pdu_constructor, NULL, NULL, NULL, NULL, KM_SLEEP);
174 
175 	/* 128k buffer cache */
176 	idm.idm_so_128k_buf_cache = kmem_cache_create("idm_128k_buf_cache",
177 	    IDM_SO_BUF_CACHE_UB, 8, NULL, NULL, NULL, NULL, NULL, KM_SLEEP);
178 
179 	/* Set the sockets transport ops */
180 	it->it_ops = &idm_so_transport_ops;
181 }
182 
183 /*
184  * idm_so_fini()
185  * Sockets transport teardown
186  */
187 void
188 idm_so_fini(void)
189 {
190 	kmem_cache_destroy(idm.idm_so_128k_buf_cache);
191 	kmem_cache_destroy(idm.idm_sotx_pdu_cache);
192 	kmem_cache_destroy(idm.idm_sorx_pdu_cache);
193 }
194 
195 ksocket_t
196 idm_socreate(int domain, int type, int protocol)
197 {
198 	ksocket_t ks;
199 
200 	if (!ksocket_socket(&ks, domain, type, protocol, KSOCKET_NOSLEEP,
201 	    CRED())) {
202 		return (ks);
203 	} else {
204 		return (NULL);
205 	}
206 }
207 
208 /*
209  * idm_soshutdown will disconnect the socket and prevent subsequent PDU
210  * reception and transmission.  The sonode still exists but its state
211  * gets modified to indicate it is no longer connected.  Calls to
212  * idm_sorecv/idm_iov_sorecv will return so idm_soshutdown can be used
213  * regain control of a thread stuck in idm_sorecv.
214  */
215 void
216 idm_soshutdown(ksocket_t so)
217 {
218 	(void) ksocket_shutdown(so, SHUT_RDWR, CRED());
219 }
220 
221 /*
222  * idm_sodestroy releases all resources associated with a socket previously
223  * created with idm_socreate.  The socket must be shutdown using
224  * idm_soshutdown before the socket is destroyed with idm_sodestroy,
225  * otherwise undefined behavior will result.
226  */
227 void
228 idm_sodestroy(ksocket_t ks)
229 {
230 	(void) ksocket_close(ks, CRED());
231 }
232 
233 /*
234  * Function to compare two addresses in sockaddr_storage format
235  */
236 
237 int
238 idm_ss_compare(const struct sockaddr_storage *cmp_ss1,
239     const struct sockaddr_storage *cmp_ss2,
240     boolean_t v4_mapped_as_v4)
241 {
242 	struct sockaddr_storage			mapped_v4_ss1, mapped_v4_ss2;
243 	const struct sockaddr_storage		*ss1, *ss2;
244 	struct in_addr				*in1, *in2;
245 	struct in6_addr				*in61, *in62;
246 	int i;
247 
248 	/*
249 	 * Normalize V4-mapped IPv6 addresses into V4 format if
250 	 * v4_mapped_as_v4 is B_TRUE.
251 	 */
252 	ss1 = cmp_ss1;
253 	ss2 = cmp_ss2;
254 	if (v4_mapped_as_v4 && (ss1->ss_family == AF_INET6)) {
255 		in61 = &((struct sockaddr_in6 *)ss1)->sin6_addr;
256 		if (IN6_IS_ADDR_V4MAPPED(in61)) {
257 			bzero(&mapped_v4_ss1, sizeof (mapped_v4_ss1));
258 			mapped_v4_ss1.ss_family = AF_INET;
259 			((struct sockaddr_in *)&mapped_v4_ss1)->sin_port =
260 			    ((struct sockaddr_in *)ss1)->sin_port;
261 			IN6_V4MAPPED_TO_INADDR(in61,
262 			    &((struct sockaddr_in *)&mapped_v4_ss1)->sin_addr);
263 			ss1 = &mapped_v4_ss1;
264 		}
265 	}
266 	ss2 = cmp_ss2;
267 	if (v4_mapped_as_v4 && (ss2->ss_family == AF_INET6)) {
268 		in62 = &((struct sockaddr_in6 *)ss2)->sin6_addr;
269 		if (IN6_IS_ADDR_V4MAPPED(in62)) {
270 			bzero(&mapped_v4_ss2, sizeof (mapped_v4_ss2));
271 			mapped_v4_ss2.ss_family = AF_INET;
272 			((struct sockaddr_in *)&mapped_v4_ss2)->sin_port =
273 			    ((struct sockaddr_in *)ss2)->sin_port;
274 			IN6_V4MAPPED_TO_INADDR(in62,
275 			    &((struct sockaddr_in *)&mapped_v4_ss2)->sin_addr);
276 			ss2 = &mapped_v4_ss2;
277 		}
278 	}
279 
280 	/*
281 	 * Compare ports, then address family, then ip address
282 	 */
283 	if (((struct sockaddr_in *)ss1)->sin_port !=
284 	    ((struct sockaddr_in *)ss2)->sin_port) {
285 		if (((struct sockaddr_in *)ss1)->sin_port >
286 		    ((struct sockaddr_in *)ss2)->sin_port)
287 			return (1);
288 		else
289 			return (-1);
290 	}
291 
292 	/*
293 	 * ports are the same
294 	 */
295 	if (ss1->ss_family != ss2->ss_family) {
296 		if (ss1->ss_family == AF_INET)
297 			return (1);
298 		else
299 			return (-1);
300 	}
301 
302 	/*
303 	 * address families are the same
304 	 */
305 	if (ss1->ss_family == AF_INET) {
306 		in1 = &((struct sockaddr_in *)ss1)->sin_addr;
307 		in2 = &((struct sockaddr_in *)ss2)->sin_addr;
308 
309 		if (in1->s_addr > in2->s_addr)
310 			return (1);
311 		else if (in1->s_addr < in2->s_addr)
312 			return (-1);
313 		else
314 			return (0);
315 	} else if (ss1->ss_family == AF_INET6) {
316 		in61 = &((struct sockaddr_in6 *)ss1)->sin6_addr;
317 		in62 = &((struct sockaddr_in6 *)ss2)->sin6_addr;
318 
319 		for (i = 0; i < 4; i++) {
320 			if (in61->s6_addr32[i] > in62->s6_addr32[i])
321 				return (1);
322 			else if (in61->s6_addr32[i] < in62->s6_addr32[i])
323 				return (-1);
324 		}
325 		return (0);
326 	}
327 
328 	return (1);
329 }
330 
331 /*
332  * IP address filter functions to flag addresses that should not
333  * go out to initiators through discovery.
334  */
335 static boolean_t
336 idm_v4_addr_okay(struct in_addr *in_addr)
337 {
338 	in_addr_t addr = ntohl(in_addr->s_addr);
339 
340 	if ((INADDR_NONE == addr) ||
341 	    (IN_MULTICAST(addr)) ||
342 	    ((addr >> IN_CLASSA_NSHIFT) == 0) ||
343 	    ((addr >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET)) {
344 		return (B_FALSE);
345 	}
346 	return (B_TRUE);
347 }
348 
349 static boolean_t
350 idm_v6_addr_okay(struct in6_addr *addr6)
351 {
352 
353 	if ((IN6_IS_ADDR_UNSPECIFIED(addr6)) ||
354 	    (IN6_IS_ADDR_LOOPBACK(addr6)) ||
355 	    (IN6_IS_ADDR_MULTICAST(addr6)) ||
356 	    (IN6_IS_ADDR_V4MAPPED(addr6)) ||
357 	    (IN6_IS_ADDR_V4COMPAT(addr6)) ||
358 	    (IN6_IS_ADDR_LINKLOCAL(addr6))) {
359 		return (B_FALSE);
360 	}
361 	return (B_TRUE);
362 }
363 
364 /*
365  * idm_get_ipaddr will retrieve a list of IP Addresses which the host is
366  * configured with by sending down a sequence of kernel ioctl to IP STREAMS.
367  */
368 int
369 idm_get_ipaddr(idm_addr_list_t **ipaddr_p)
370 {
371 	ksocket_t 		so4, so6;
372 	struct lifnum		lifn;
373 	struct lifconf		lifc;
374 	struct lifreq		*lp;
375 	int			rval;
376 	int			numifs;
377 	int			bufsize;
378 	void			*buf;
379 	int			i, j, n, rc;
380 	struct sockaddr_storage	ss;
381 	struct sockaddr_in	*sin;
382 	struct sockaddr_in6	*sin6;
383 	idm_addr_t		*ip;
384 	idm_addr_list_t		*ipaddr;
385 	int			size_ipaddr;
386 
387 	*ipaddr_p = NULL;
388 	size_ipaddr = 0;
389 	buf = NULL;
390 
391 	/* create an ipv4 and ipv6 UDP socket */
392 	if ((so6 = idm_socreate(PF_INET6, SOCK_DGRAM, 0)) == NULL)
393 		return (0);
394 	if ((so4 = idm_socreate(PF_INET, SOCK_DGRAM, 0)) == NULL) {
395 		idm_sodestroy(so6);
396 		return (0);
397 	}
398 
399 
400 retry_count:
401 	/* snapshot the current number of interfaces */
402 	lifn.lifn_family = PF_UNSPEC;
403 	lifn.lifn_flags = LIFC_NOXMIT | LIFC_TEMPORARY | LIFC_ALLZONES;
404 	lifn.lifn_count = 0;
405 	/* use vp6 for ioctls with unspecified families by default */
406 	if (ksocket_ioctl(so6, SIOCGLIFNUM, (intptr_t)&lifn, &rval, CRED())
407 	    != 0) {
408 		goto cleanup;
409 	}
410 
411 	numifs = lifn.lifn_count;
412 	if (numifs <= 0) {
413 		goto cleanup;
414 	}
415 
416 	/* allocate extra room in case more interfaces appear */
417 	numifs += 10;
418 
419 	/* get the interface names and ip addresses */
420 	bufsize = numifs * sizeof (struct lifreq);
421 	buf = kmem_alloc(bufsize, KM_SLEEP);
422 
423 	lifc.lifc_family = AF_UNSPEC;
424 	lifc.lifc_flags = LIFC_NOXMIT | LIFC_TEMPORARY | LIFC_ALLZONES;
425 	lifc.lifc_len = bufsize;
426 	lifc.lifc_buf = buf;
427 	rc = ksocket_ioctl(so6, SIOCGLIFCONF, (intptr_t)&lifc, &rval, CRED());
428 	if (rc != 0) {
429 		goto cleanup;
430 	}
431 	/* if our extra room is used up, try again */
432 	if (bufsize <= lifc.lifc_len) {
433 		kmem_free(buf, bufsize);
434 		buf = NULL;
435 		goto retry_count;
436 	}
437 	/* calc actual number of ifconfs */
438 	n = lifc.lifc_len / sizeof (struct lifreq);
439 
440 	/* get ip address */
441 	if (n > 0) {
442 		size_ipaddr = sizeof (idm_addr_list_t) +
443 		    (n - 1) * sizeof (idm_addr_t);
444 		ipaddr = kmem_zalloc(size_ipaddr, KM_SLEEP);
445 	} else {
446 		goto cleanup;
447 	}
448 
449 	/*
450 	 * Examine the array of interfaces and filter uninteresting ones
451 	 */
452 	for (i = 0, j = 0, lp = lifc.lifc_req; i < n; i++, lp++) {
453 
454 		/*
455 		 * Copy the address as the SIOCGLIFFLAGS ioctl is destructive
456 		 */
457 		ss = lp->lifr_addr;
458 		/*
459 		 * fetch the flags using the socket of the correct family
460 		 */
461 		switch (ss.ss_family) {
462 		case AF_INET:
463 			rc = ksocket_ioctl(so4, SIOCGLIFFLAGS, (intptr_t)lp,
464 			    &rval, CRED());
465 			break;
466 		case AF_INET6:
467 			rc = ksocket_ioctl(so6, SIOCGLIFFLAGS, (intptr_t)lp,
468 			    &rval, CRED());
469 			break;
470 		default:
471 			continue;
472 		}
473 		if (rc == 0) {
474 			/*
475 			 * If we got the flags, skip uninteresting
476 			 * interfaces based on flags
477 			 */
478 			if ((lp->lifr_flags & IFF_UP) != IFF_UP)
479 				continue;
480 			if (lp->lifr_flags &
481 			    (IFF_ANYCAST|IFF_NOLOCAL|IFF_DEPRECATED))
482 				continue;
483 		}
484 
485 		/* save ip address */
486 		ip = &ipaddr->al_addrs[j];
487 		switch (ss.ss_family) {
488 		case AF_INET:
489 			sin = (struct sockaddr_in *)&ss;
490 			if (!idm_v4_addr_okay(&sin->sin_addr))
491 				continue;
492 			ip->a_addr.i_addr.in4 = sin->sin_addr;
493 			ip->a_addr.i_insize = sizeof (struct in_addr);
494 			break;
495 		case AF_INET6:
496 			sin6 = (struct sockaddr_in6 *)&ss;
497 			if (!idm_v6_addr_okay(&sin6->sin6_addr))
498 				continue;
499 			ip->a_addr.i_addr.in6 = sin6->sin6_addr;
500 			ip->a_addr.i_insize = sizeof (struct in6_addr);
501 			break;
502 		default:
503 			continue;
504 		}
505 		j++;
506 	}
507 
508 	if (j == 0) {
509 		/* no valid ifaddr */
510 		kmem_free(ipaddr, size_ipaddr);
511 		size_ipaddr = 0;
512 		ipaddr = NULL;
513 	} else {
514 		ipaddr->al_out_cnt = j;
515 	}
516 
517 
518 cleanup:
519 	idm_sodestroy(so6);
520 	idm_sodestroy(so4);
521 
522 	if (buf != NULL)
523 		kmem_free(buf, bufsize);
524 
525 	*ipaddr_p = ipaddr;
526 	return (size_ipaddr);
527 }
528 
529 int
530 idm_sorecv(ksocket_t so, void *msg, size_t len)
531 {
532 	iovec_t iov;
533 
534 	ASSERT(so != NULL);
535 	ASSERT(len != 0);
536 
537 	/*
538 	 * Fill in iovec and receive data
539 	 */
540 	iov.iov_base = msg;
541 	iov.iov_len = len;
542 
543 	return (idm_iov_sorecv(so, &iov, 1, len));
544 }
545 
546 /*
547  * idm_sosendto - Sends a buffered data on a non-connected socket.
548  *
549  * This function puts the data provided on the wire by calling sosendmsg.
550  * It will return only when all the data has been sent or if an error
551  * occurs.
552  *
553  * Returns 0 for success, the socket errno value if sosendmsg fails, and
554  * -1 if sosendmsg returns success but uio_resid != 0
555  */
556 int
557 idm_sosendto(ksocket_t so, void *buff, size_t len,
558     struct sockaddr *name, socklen_t namelen)
559 {
560 	struct msghdr		msg;
561 	struct iovec		iov[1];
562 	int			error;
563 	size_t			sent = 0;
564 
565 	iov[0].iov_base	= buff;
566 	iov[0].iov_len	= len;
567 
568 	/* Initialization of the message header. */
569 	bzero(&msg, sizeof (msg));
570 	msg.msg_iov	= iov;
571 	msg.msg_iovlen	= 1;
572 	msg.msg_name	= name;
573 	msg.msg_namelen	= namelen;
574 
575 	if ((error = ksocket_sendmsg(so, &msg, 0, &sent, CRED())) == 0) {
576 		/* Data sent */
577 		if (sent == len) {
578 			/* All data sent.  Success. */
579 			return (0);
580 		} else {
581 			/* Not all data was sent.  Failure */
582 			return (-1);
583 		}
584 	}
585 
586 	/* Send failed */
587 	return (error);
588 }
589 
590 /*
591  * idm_iov_sosend - Sends an iovec on a connection.
592  *
593  * This function puts the data provided on the wire by calling sosendmsg.
594  * It will return only when all the data has been sent or if an error
595  * occurs.
596  *
597  * Returns 0 for success, the socket errno value if sosendmsg fails, and
598  * -1 if sosendmsg returns success but uio_resid != 0
599  */
600 int
601 idm_iov_sosend(ksocket_t so, iovec_t *iop, int iovlen, size_t total_len)
602 {
603 	struct msghdr		msg;
604 	int			error;
605 	size_t 			sent = 0;
606 
607 	ASSERT(iop != NULL);
608 
609 	/* Initialization of the message header. */
610 	bzero(&msg, sizeof (msg));
611 	msg.msg_iov	= iop;
612 	msg.msg_iovlen	= iovlen;
613 
614 	if ((error = ksocket_sendmsg(so, &msg, 0, &sent, CRED()))
615 	    == 0) {
616 		/* Data sent */
617 		if (sent == total_len) {
618 			/* All data sent.  Success. */
619 			return (0);
620 		} else {
621 			/* Not all data was sent.  Failure */
622 			return (-1);
623 		}
624 	}
625 
626 	/* Send failed */
627 	return (error);
628 }
629 
630 /*
631  * idm_iov_sorecv - Receives an iovec from a connection
632  *
633  * This function gets the data asked for from the socket.  It will return
634  * only when all the requested data has been retrieved or if an error
635  * occurs.
636  *
637  * Returns 0 for success, the socket errno value if sorecvmsg fails, and
638  * -1 if sorecvmsg returns success but uio_resid != 0
639  */
640 int
641 idm_iov_sorecv(ksocket_t so, iovec_t *iop, int iovlen, size_t total_len)
642 {
643 	struct msghdr		msg;
644 	int			error;
645 	size_t			recv;
646 	int 			flags;
647 
648 	ASSERT(iop != NULL);
649 
650 	/* Initialization of the message header. */
651 	bzero(&msg, sizeof (msg));
652 	msg.msg_iov	= iop;
653 	msg.msg_iovlen	= iovlen;
654 	flags		= MSG_WAITALL;
655 
656 	if ((error = ksocket_recvmsg(so, &msg, flags, &recv, CRED()))
657 	    == 0) {
658 		/* Received data */
659 		if (recv == total_len) {
660 			/* All requested data received.  Success */
661 			return (0);
662 		} else {
663 			/*
664 			 * Not all data was received.  The connection has
665 			 * probably failed.
666 			 */
667 			return (-1);
668 		}
669 	}
670 
671 	/* Receive failed */
672 	return (error);
673 }
674 
675 static void
676 idm_set_ini_preconnect_options(idm_so_conn_t *sc)
677 {
678 	int	conn_abort = 10000;
679 	int	conn_notify = 2000;
680 	int	abort = 30000;
681 
682 	/* Pre-connect socket options */
683 	(void) ksocket_setsockopt(sc->ic_so, IPPROTO_TCP,
684 	    TCP_CONN_NOTIFY_THRESHOLD, (char *)&conn_notify, sizeof (int),
685 	    CRED());
686 	(void) ksocket_setsockopt(sc->ic_so, IPPROTO_TCP,
687 	    TCP_CONN_ABORT_THRESHOLD, (char *)&conn_abort, sizeof (int),
688 	    CRED());
689 	(void) ksocket_setsockopt(sc->ic_so, IPPROTO_TCP, TCP_ABORT_THRESHOLD,
690 	    (char *)&abort, sizeof (int), CRED());
691 }
692 
693 static void
694 idm_set_ini_postconnect_options(idm_so_conn_t *sc)
695 {
696 	int32_t		rcvbuf = IDM_RCVBUF_SIZE;
697 	int32_t		sndbuf = IDM_SNDBUF_SIZE;
698 	const int	on = 1;
699 
700 	/* Set postconnect options */
701 	(void) ksocket_setsockopt(sc->ic_so, IPPROTO_TCP, TCP_NODELAY,
702 	    (char *)&on, sizeof (int), CRED());
703 	(void) ksocket_setsockopt(sc->ic_so, SOL_SOCKET, SO_RCVBUF,
704 	    (char *)&rcvbuf, sizeof (int), CRED());
705 	(void) ksocket_setsockopt(sc->ic_so, SOL_SOCKET, SO_SNDBUF,
706 	    (char *)&sndbuf, sizeof (int), CRED());
707 }
708 
709 static void
710 idm_set_tgt_connect_options(ksocket_t ks)
711 {
712 	int32_t		rcvbuf = IDM_RCVBUF_SIZE;
713 	int32_t		sndbuf = IDM_SNDBUF_SIZE;
714 	const int	on = 1;
715 
716 	/* Set connect options */
717 	(void) ksocket_setsockopt(ks, SOL_SOCKET, SO_RCVBUF,
718 	    (char *)&rcvbuf, sizeof (int), CRED());
719 	(void) ksocket_setsockopt(ks, SOL_SOCKET, SO_SNDBUF,
720 	    (char *)&sndbuf, sizeof (int), CRED());
721 	(void) ksocket_setsockopt(ks, IPPROTO_TCP, TCP_NODELAY,
722 	    (char *)&on, sizeof (on), CRED());
723 }
724 
725 static uint32_t
726 n2h24(const uchar_t *ptr)
727 {
728 	return ((ptr[0] << 16) | (ptr[1] << 8) | ptr[2]);
729 }
730 
731 
732 static idm_status_t
733 idm_sorecvhdr(idm_conn_t *ic, idm_pdu_t *pdu)
734 {
735 	iscsi_hdr_t	*bhs;
736 	uint32_t	hdr_digest_crc;
737 	uint32_t	crc_calculated;
738 	void		*new_hdr;
739 	int		ahslen = 0;
740 	int		total_len = 0;
741 	int		iovlen = 0;
742 	struct iovec	iov[2];
743 	idm_so_conn_t	*so_conn;
744 	int		rc;
745 
746 	so_conn = ic->ic_transport_private;
747 
748 	/*
749 	 * Read BHS
750 	 */
751 	bhs = pdu->isp_hdr;
752 	rc = idm_sorecv(so_conn->ic_so, pdu->isp_hdr, sizeof (iscsi_hdr_t));
753 	if (rc != IDM_STATUS_SUCCESS) {
754 		return (IDM_STATUS_FAIL);
755 	}
756 
757 	/*
758 	 * Check actual AHS length against the amount available in the buffer
759 	 */
760 	pdu->isp_hdrlen = sizeof (iscsi_hdr_t) +
761 	    (bhs->hlength * sizeof (uint32_t));
762 	pdu->isp_datalen = n2h24(bhs->dlength);
763 	if (bhs->hlength > IDM_SORX_CACHE_AHSLEN) {
764 		/* Allocate a new header segment and change the callback */
765 		new_hdr = kmem_alloc(pdu->isp_hdrlen, KM_SLEEP);
766 		bcopy(pdu->isp_hdr, new_hdr, sizeof (iscsi_hdr_t));
767 		pdu->isp_hdr = new_hdr;
768 		pdu->isp_flags |= IDM_PDU_ADDL_HDR;
769 
770 		/*
771 		 * This callback will restore the expected values after
772 		 * the RX PDU has been processed.
773 		 */
774 		pdu->isp_callback = idm_sorx_addl_pdu_cb;
775 	}
776 
777 	/*
778 	 * Setup receipt of additional header and header digest (if enabled).
779 	 */
780 	if (bhs->hlength > 0) {
781 		iov[iovlen].iov_base = (caddr_t)(pdu->isp_hdr + 1);
782 		ahslen = pdu->isp_hdrlen - sizeof (iscsi_hdr_t);
783 		iov[iovlen].iov_len = ahslen;
784 		total_len += iov[iovlen].iov_len;
785 		iovlen++;
786 	}
787 
788 	if (ic->ic_conn_flags & IDM_CONN_HEADER_DIGEST) {
789 		iov[iovlen].iov_base = (caddr_t)&hdr_digest_crc;
790 		iov[iovlen].iov_len = sizeof (hdr_digest_crc);
791 		total_len += iov[iovlen].iov_len;
792 		iovlen++;
793 	}
794 
795 	if ((iovlen != 0) &&
796 	    (idm_iov_sorecv(so_conn->ic_so, &iov[0], iovlen,
797 	    total_len) != 0)) {
798 		return (IDM_STATUS_FAIL);
799 	}
800 
801 	/*
802 	 * Validate header digest if enabled
803 	 */
804 	if (ic->ic_conn_flags & IDM_CONN_HEADER_DIGEST) {
805 		crc_calculated = idm_crc32c(pdu->isp_hdr,
806 		    sizeof (iscsi_hdr_t) + ahslen);
807 		if (crc_calculated != hdr_digest_crc) {
808 			/* Invalid Header Digest */
809 			return (IDM_STATUS_HEADER_DIGEST);
810 		}
811 	}
812 
813 	return (0);
814 }
815 
816 /*
817  * idm_so_ini_conn_create()
818  * Allocate the sockets transport connection resources.
819  */
820 static idm_status_t
821 idm_so_ini_conn_create(idm_conn_req_t *cr, idm_conn_t *ic)
822 {
823 	ksocket_t	so;
824 	idm_so_conn_t	*so_conn;
825 	idm_status_t	idmrc;
826 
827 	so = idm_socreate(cr->cr_domain, cr->cr_type,
828 	    cr->cr_protocol);
829 	if (so == NULL) {
830 		return (IDM_STATUS_FAIL);
831 	}
832 
833 	/* Bind the socket if configured to do so */
834 	if (cr->cr_bound) {
835 		if (ksocket_bind(so, &cr->cr_bound_addr.sin,
836 		    SIZEOF_SOCKADDR(&cr->cr_bound_addr.sin), CRED()) != 0) {
837 			idm_sodestroy(so);
838 			return (IDM_STATUS_FAIL);
839 		}
840 	}
841 
842 	idmrc = idm_so_conn_create_common(ic, so);
843 	if (idmrc != IDM_STATUS_SUCCESS) {
844 		idm_soshutdown(so);
845 		idm_sodestroy(so);
846 		return (IDM_STATUS_FAIL);
847 	}
848 
849 	so_conn = ic->ic_transport_private;
850 	/* Set up socket options */
851 	idm_set_ini_preconnect_options(so_conn);
852 
853 	return (IDM_STATUS_SUCCESS);
854 }
855 
856 /*
857  * idm_so_ini_conn_destroy()
858  * Tear down the sockets transport connection resources.
859  */
860 static void
861 idm_so_ini_conn_destroy(idm_conn_t *ic)
862 {
863 	idm_so_conn_destroy_common(ic);
864 }
865 
866 /*
867  * idm_so_ini_conn_connect()
868  * Establish the connection referred to by the handle previously allocated via
869  * idm_so_ini_conn_create().
870  */
871 static idm_status_t
872 idm_so_ini_conn_connect(idm_conn_t *ic)
873 {
874 	idm_so_conn_t	*so_conn;
875 	struct sonode	*node = NULL;
876 	int 		rc;
877 	clock_t		lbolt, conn_login_max, conn_login_interval;
878 	boolean_t	nonblock;
879 
880 	so_conn = ic->ic_transport_private;
881 	nonblock = ic->ic_conn_params.nonblock_socket;
882 	conn_login_max = ic->ic_conn_params.conn_login_max;
883 	conn_login_interval = ddi_get_lbolt() +
884 	    SEC_TO_TICK(ic->ic_conn_params.conn_login_interval);
885 
886 	if (nonblock == B_TRUE) {
887 		node = ((struct sonode *)(so_conn->ic_so));
888 		/* Set to none block socket mode */
889 		idm_so_socket_set_nonblock(node);
890 		do {
891 			rc = ksocket_connect(so_conn->ic_so,
892 			    &ic->ic_ini_dst_addr.sin,
893 			    (SIZEOF_SOCKADDR(&ic->ic_ini_dst_addr.sin)),
894 			    CRED());
895 			if (rc == 0 || rc == EISCONN) {
896 				/* socket success or already success */
897 				rc = IDM_STATUS_SUCCESS;
898 				break;
899 			}
900 			if ((rc == ETIMEDOUT) || (rc == ECONNREFUSED) ||
901 			    (rc == ECONNRESET)) {
902 				/* socket connection timeout or refuse */
903 				break;
904 			}
905 			lbolt = ddi_get_lbolt();
906 			if (lbolt > conn_login_max) {
907 				/*
908 				 * Connection retry timeout,
909 				 * failed connect to target.
910 				 */
911 				break;
912 			}
913 			if (lbolt < conn_login_interval) {
914 				if ((rc == EINPROGRESS) || (rc == EALREADY)) {
915 					/* TCP connect still in progress */
916 					delay(SEC_TO_TICK(IN_PROGRESS_DELAY));
917 					continue;
918 				} else {
919 					delay(conn_login_interval - lbolt);
920 				}
921 			}
922 			conn_login_interval = ddi_get_lbolt() +
923 			    SEC_TO_TICK(ic->ic_conn_params.conn_login_interval);
924 		} while (rc != 0);
925 		/* resume to nonblock mode */
926 		if (rc == IDM_STATUS_SUCCESS) {
927 			idm_so_socket_set_block(node);
928 		}
929 	} else {
930 		rc = ksocket_connect(so_conn->ic_so, &ic->ic_ini_dst_addr.sin,
931 		    (SIZEOF_SOCKADDR(&ic->ic_ini_dst_addr.sin)), CRED());
932 	}
933 
934 	if (rc != 0) {
935 		idm_soshutdown(so_conn->ic_so);
936 		return (IDM_STATUS_FAIL);
937 	}
938 
939 	idm_so_conn_connect_common(ic);
940 
941 	idm_set_ini_postconnect_options(so_conn);
942 
943 	return (IDM_STATUS_SUCCESS);
944 }
945 
946 idm_status_t
947 idm_so_tgt_conn_create(idm_conn_t *ic, ksocket_t new_so)
948 {
949 	idm_status_t	idmrc;
950 
951 	idmrc = idm_so_conn_create_common(ic, new_so);
952 
953 	return (idmrc);
954 }
955 
956 static void
957 idm_so_tgt_conn_destroy(idm_conn_t *ic)
958 {
959 	idm_so_conn_destroy_common(ic);
960 }
961 
962 /*
963  * idm_so_tgt_conn_connect()
964  * Establish the connection in ic, passed from idm_tgt_conn_finish(), which
965  * is invoked from the SM as a result of an inbound connection request.
966  */
967 static idm_status_t
968 idm_so_tgt_conn_connect(idm_conn_t *ic)
969 {
970 	idm_so_conn_connect_common(ic);
971 
972 	return (IDM_STATUS_SUCCESS);
973 }
974 
975 static idm_status_t
976 idm_so_conn_create_common(idm_conn_t *ic, ksocket_t new_so)
977 {
978 	idm_so_conn_t	*so_conn;
979 
980 	so_conn = kmem_zalloc(sizeof (idm_so_conn_t), KM_SLEEP);
981 	so_conn->ic_so = new_so;
982 
983 	ic->ic_transport_private = so_conn;
984 	ic->ic_transport_hdrlen = 0;
985 
986 	/* Set the scoreboarding flag on this connection */
987 	ic->ic_conn_flags |= IDM_CONN_USE_SCOREBOARD;
988 
989 	/*
990 	 * Initialize tx thread mutex and list
991 	 */
992 	mutex_init(&so_conn->ic_tx_mutex, NULL, MUTEX_DEFAULT, NULL);
993 	cv_init(&so_conn->ic_tx_cv, NULL, CV_DEFAULT, NULL);
994 	list_create(&so_conn->ic_tx_list, sizeof (idm_pdu_t),
995 	    offsetof(idm_pdu_t, idm_tx_link));
996 
997 	return (IDM_STATUS_SUCCESS);
998 }
999 
1000 static void
1001 idm_so_conn_destroy_common(idm_conn_t *ic)
1002 {
1003 	idm_so_conn_t	*so_conn = ic->ic_transport_private;
1004 
1005 	ic->ic_transport_private = NULL;
1006 	idm_sodestroy(so_conn->ic_so);
1007 	list_destroy(&so_conn->ic_tx_list);
1008 	mutex_destroy(&so_conn->ic_tx_mutex);
1009 	cv_destroy(&so_conn->ic_tx_cv);
1010 
1011 	kmem_free(so_conn, sizeof (idm_so_conn_t));
1012 }
1013 
1014 static void
1015 idm_so_conn_connect_common(idm_conn_t *ic)
1016 {
1017 	idm_so_conn_t	*so_conn;
1018 	struct sockaddr_in6	t_addr;
1019 	socklen_t	t_addrlen = 0;
1020 
1021 	so_conn = ic->ic_transport_private;
1022 	bzero(&t_addr, sizeof (struct sockaddr_in6));
1023 	t_addrlen = sizeof (struct sockaddr_in6);
1024 
1025 	/* Set the local and remote addresses in the idm conn handle */
1026 	ksocket_getsockname(so_conn->ic_so, (struct sockaddr *)&t_addr,
1027 	    &t_addrlen, CRED());
1028 	bcopy(&t_addr, &ic->ic_laddr, t_addrlen);
1029 	ksocket_getpeername(so_conn->ic_so, (struct sockaddr *)&t_addr,
1030 	    &t_addrlen, CRED());
1031 	bcopy(&t_addr, &ic->ic_raddr, t_addrlen);
1032 
1033 	mutex_enter(&ic->ic_mutex);
1034 	so_conn->ic_tx_thread = thread_create(NULL, 0, idm_sotx_thread, ic, 0,
1035 	    &p0, TS_RUN, minclsyspri);
1036 	so_conn->ic_rx_thread = thread_create(NULL, 0, idm_sorx_thread, ic, 0,
1037 	    &p0, TS_RUN, minclsyspri);
1038 
1039 	while (!so_conn->ic_rx_thread_running || !so_conn->ic_tx_thread_running)
1040 		cv_wait(&ic->ic_cv, &ic->ic_mutex);
1041 	mutex_exit(&ic->ic_mutex);
1042 }
1043 
1044 /*
1045  * idm_so_conn_disconnect()
1046  * Shutdown the socket connection and stop the thread
1047  */
1048 static void
1049 idm_so_conn_disconnect(idm_conn_t *ic)
1050 {
1051 	idm_so_conn_t	*so_conn;
1052 
1053 	so_conn = ic->ic_transport_private;
1054 
1055 	mutex_enter(&ic->ic_mutex);
1056 	so_conn->ic_rx_thread_running = B_FALSE;
1057 	so_conn->ic_tx_thread_running = B_FALSE;
1058 	/* We need to wakeup the TX thread */
1059 	mutex_enter(&so_conn->ic_tx_mutex);
1060 	cv_signal(&so_conn->ic_tx_cv);
1061 	mutex_exit(&so_conn->ic_tx_mutex);
1062 	mutex_exit(&ic->ic_mutex);
1063 
1064 	/* This should wakeup the RX thread if it is sleeping */
1065 	idm_soshutdown(so_conn->ic_so);
1066 
1067 	thread_join(so_conn->ic_tx_thread_did);
1068 	thread_join(so_conn->ic_rx_thread_did);
1069 }
1070 
1071 /*
1072  * idm_so_tgt_svc_create()
1073  * Establish a service on an IP address and port.  idm_svc_req_t contains
1074  * the service parameters.
1075  */
1076 /*ARGSUSED*/
1077 static idm_status_t
1078 idm_so_tgt_svc_create(idm_svc_req_t *sr, idm_svc_t *is)
1079 {
1080 	idm_so_svc_t		*so_svc;
1081 
1082 	so_svc = kmem_zalloc(sizeof (idm_so_svc_t), KM_SLEEP);
1083 
1084 	/* Set the new sockets service in svc handle */
1085 	is->is_so_svc = (void *)so_svc;
1086 
1087 	return (IDM_STATUS_SUCCESS);
1088 }
1089 
1090 /*
1091  * idm_so_tgt_svc_destroy()
1092  * Teardown sockets resources allocated in idm_so_tgt_svc_create()
1093  */
1094 static void
1095 idm_so_tgt_svc_destroy(idm_svc_t *is)
1096 {
1097 	/* the socket will have been torn down; free the service */
1098 	kmem_free(is->is_so_svc, sizeof (idm_so_svc_t));
1099 }
1100 
1101 /*
1102  * idm_so_tgt_svc_online()
1103  * Launch a watch thread on the svc allocated in idm_so_tgt_svc_create()
1104  */
1105 
1106 static idm_status_t
1107 idm_so_tgt_svc_online(idm_svc_t *is)
1108 {
1109 	idm_so_svc_t		*so_svc;
1110 	idm_svc_req_t		*sr = &is->is_svc_req;
1111 	struct sockaddr_in6	sin6_ip;
1112 	const uint32_t		on = 1;
1113 	const uint32_t		off = 0;
1114 
1115 	mutex_enter(&is->is_mutex);
1116 	so_svc = (idm_so_svc_t *)is->is_so_svc;
1117 
1118 	/*
1119 	 * Try creating an IPv6 socket first
1120 	 */
1121 	if ((so_svc->is_so = idm_socreate(PF_INET6, SOCK_STREAM, 0)) == NULL) {
1122 		mutex_exit(&is->is_mutex);
1123 		return (IDM_STATUS_FAIL);
1124 	} else {
1125 		bzero(&sin6_ip, sizeof (sin6_ip));
1126 		sin6_ip.sin6_family = AF_INET6;
1127 		sin6_ip.sin6_port = htons(sr->sr_port);
1128 		sin6_ip.sin6_addr = in6addr_any;
1129 
1130 		(void) ksocket_setsockopt(so_svc->is_so, SOL_SOCKET,
1131 		    SO_REUSEADDR, (char *)&on, sizeof (on), CRED());
1132 		/*
1133 		 * Turn off SO_MAC_EXEMPT so future sobinds succeed
1134 		 */
1135 		(void) ksocket_setsockopt(so_svc->is_so, SOL_SOCKET,
1136 		    SO_MAC_EXEMPT, (char *)&off, sizeof (off), CRED());
1137 
1138 		if (ksocket_bind(so_svc->is_so, (struct sockaddr *)&sin6_ip,
1139 		    sizeof (sin6_ip), CRED()) != 0) {
1140 			mutex_exit(&is->is_mutex);
1141 			idm_sodestroy(so_svc->is_so);
1142 			return (IDM_STATUS_FAIL);
1143 		}
1144 	}
1145 
1146 	idm_set_tgt_connect_options(so_svc->is_so);
1147 
1148 	if (ksocket_listen(so_svc->is_so, 5, CRED()) != 0) {
1149 		mutex_exit(&is->is_mutex);
1150 		idm_soshutdown(so_svc->is_so);
1151 		idm_sodestroy(so_svc->is_so);
1152 		return (IDM_STATUS_FAIL);
1153 	}
1154 
1155 	/* Launch a watch thread */
1156 	so_svc->is_thread = thread_create(NULL, 0, idm_so_svc_port_watcher,
1157 	    is, 0, &p0, TS_RUN, minclsyspri);
1158 
1159 	if (so_svc->is_thread == NULL) {
1160 		/* Failure to launch; teardown the socket */
1161 		mutex_exit(&is->is_mutex);
1162 		idm_soshutdown(so_svc->is_so);
1163 		idm_sodestroy(so_svc->is_so);
1164 		return (IDM_STATUS_FAIL);
1165 	}
1166 	ksocket_hold(so_svc->is_so);
1167 	/* Wait for the port watcher thread to start */
1168 	while (!so_svc->is_thread_running)
1169 		cv_wait(&is->is_cv, &is->is_mutex);
1170 	mutex_exit(&is->is_mutex);
1171 
1172 	return (IDM_STATUS_SUCCESS);
1173 }
1174 
1175 /*
1176  * idm_so_tgt_svc_offline
1177  *
1178  * Stop listening on the IP address and port identified by idm_svc_t.
1179  */
1180 static void
1181 idm_so_tgt_svc_offline(idm_svc_t *is)
1182 {
1183 	idm_so_svc_t		*so_svc;
1184 	mutex_enter(&is->is_mutex);
1185 	so_svc = (idm_so_svc_t *)is->is_so_svc;
1186 	so_svc->is_thread_running = B_FALSE;
1187 	mutex_exit(&is->is_mutex);
1188 
1189 	/*
1190 	 * Teardown socket
1191 	 */
1192 	idm_sodestroy(so_svc->is_so);
1193 
1194 	/*
1195 	 * Now we expect the port watcher thread to terminate
1196 	 */
1197 	thread_join(so_svc->is_thread_did);
1198 }
1199 
1200 /*
1201  * Watch thread for target service connection establishment.
1202  */
1203 void
1204 idm_so_svc_port_watcher(void *arg)
1205 {
1206 	idm_svc_t		*svc = arg;
1207 	ksocket_t		new_so;
1208 	idm_conn_t		*ic;
1209 	idm_status_t		idmrc;
1210 	idm_so_svc_t		*so_svc;
1211 	int			rc;
1212 	const uint32_t		off = 0;
1213 	struct sockaddr_in6 	t_addr;
1214 	socklen_t		t_addrlen;
1215 
1216 	bzero(&t_addr, sizeof (struct sockaddr_in6));
1217 	t_addrlen = sizeof (struct sockaddr_in6);
1218 	mutex_enter(&svc->is_mutex);
1219 
1220 	so_svc = svc->is_so_svc;
1221 	so_svc->is_thread_running = B_TRUE;
1222 	so_svc->is_thread_did = so_svc->is_thread->t_did;
1223 
1224 	cv_signal(&svc->is_cv);
1225 
1226 	IDM_SVC_LOG(CE_NOTE, "iSCSI service (%p/%d) online", (void *)svc,
1227 	    svc->is_svc_req.sr_port);
1228 
1229 	while (so_svc->is_thread_running) {
1230 		mutex_exit(&svc->is_mutex);
1231 
1232 		if ((rc = ksocket_accept(so_svc->is_so,
1233 		    (struct sockaddr *)&t_addr, &t_addrlen,
1234 		    &new_so, CRED())) != 0) {
1235 			mutex_enter(&svc->is_mutex);
1236 			if (rc == ECONNABORTED)
1237 				continue;
1238 			/* Connection problem */
1239 			break;
1240 		}
1241 		/*
1242 		 * Turn off SO_MAC_EXEMPT so future sobinds succeed
1243 		 */
1244 		(void) ksocket_setsockopt(new_so, SOL_SOCKET, SO_MAC_EXEMPT,
1245 		    (char *)&off, sizeof (off), CRED());
1246 
1247 		idmrc = idm_svc_conn_create(svc, IDM_TRANSPORT_TYPE_SOCKETS,
1248 		    &ic);
1249 		if (idmrc != IDM_STATUS_SUCCESS) {
1250 			/* Drop connection */
1251 			idm_soshutdown(new_so);
1252 			idm_sodestroy(new_so);
1253 			mutex_enter(&svc->is_mutex);
1254 			continue;
1255 		}
1256 
1257 		idmrc = idm_so_tgt_conn_create(ic, new_so);
1258 		if (idmrc != IDM_STATUS_SUCCESS) {
1259 			idm_svc_conn_destroy(ic);
1260 			idm_soshutdown(new_so);
1261 			idm_sodestroy(new_so);
1262 			mutex_enter(&svc->is_mutex);
1263 			continue;
1264 		}
1265 
1266 		/*
1267 		 * Kick the state machine.  At CS_S3_XPT_UP the state machine
1268 		 * will notify the client (target) about the new connection.
1269 		 */
1270 		idm_conn_event(ic, CE_CONNECT_ACCEPT, NULL);
1271 
1272 		mutex_enter(&svc->is_mutex);
1273 	}
1274 	ksocket_rele(so_svc->is_so);
1275 	so_svc->is_thread_running = B_FALSE;
1276 	mutex_exit(&svc->is_mutex);
1277 
1278 	IDM_SVC_LOG(CE_NOTE, "iSCSI service (%p/%d) offline", (void *)svc,
1279 	    svc->is_svc_req.sr_port);
1280 
1281 	thread_exit();
1282 }
1283 
1284 /*
1285  * idm_so_free_task_rsrc() stops any ongoing processing of the task and
1286  * frees resources associated with the task.
1287  *
1288  * It's not clear that this should return idm_status_t.  What do we do
1289  * if it fails?
1290  */
1291 static idm_status_t
1292 idm_so_free_task_rsrc(idm_task_t *idt)
1293 {
1294 	idm_buf_t	*idb;
1295 
1296 	/*
1297 	 * There is nothing to cleanup on initiator connections
1298 	 */
1299 	if (IDM_CONN_ISINI(idt->idt_ic))
1300 		return (IDM_STATUS_SUCCESS);
1301 
1302 	/*
1303 	 * If this is a target connection, call idm_buf_rx_from_ini_done for
1304 	 * any buffer on the "outbufv" list with idb->idb_in_transport==B_TRUE.
1305 	 *
1306 	 * In addition, remove any buffers associated with this task from
1307 	 * the ic_tx_list.  We'll do this by walking the idt_inbufv list, but
1308 	 * items don't actually get removed from that list (and completion
1309 	 * routines called) until idm_task_cleanup.
1310 	 */
1311 	mutex_enter(&idt->idt_mutex);
1312 
1313 	for (idb = list_head(&idt->idt_outbufv); idb != NULL;
1314 	    idb = list_next(&idt->idt_outbufv, idb)) {
1315 		if (idb->idb_in_transport) {
1316 			/*
1317 			 * idm_buf_rx_from_ini_done releases idt->idt_mutex
1318 			 */
1319 			DTRACE_ISCSI_8(xfer__done, idm_conn_t *, idt->idt_ic,
1320 			    uintptr_t, idb->idb_buf,
1321 			    uint32_t, idb->idb_bufoffset,
1322 			    uint64_t, 0, uint32_t, 0, uint32_t, 0,
1323 			    uint32_t, idb->idb_xfer_len,
1324 			    int, XFER_BUF_RX_FROM_INI);
1325 			idm_buf_rx_from_ini_done(idt, idb, IDM_STATUS_ABORTED);
1326 			mutex_enter(&idt->idt_mutex);
1327 		}
1328 	}
1329 
1330 	for (idb = list_head(&idt->idt_inbufv); idb != NULL;
1331 	    idb = list_next(&idt->idt_inbufv, idb)) {
1332 		/*
1333 		 * We want to remove these items from the tx_list as well,
1334 		 * but knowing it's in the idt_inbufv list is not a guarantee
1335 		 * that it's in the tx_list.  If it's on the tx list then
1336 		 * let idm_sotx_thread() clean it up.
1337 		 */
1338 		if (idb->idb_in_transport && !idb->idb_tx_thread) {
1339 			/*
1340 			 * idm_buf_tx_to_ini_done releases idt->idt_mutex
1341 			 */
1342 			DTRACE_ISCSI_8(xfer__done, idm_conn_t *, idt->idt_ic,
1343 			    uintptr_t, idb->idb_buf,
1344 			    uint32_t, idb->idb_bufoffset,
1345 			    uint64_t, 0, uint32_t, 0, uint32_t, 0,
1346 			    uint32_t, idb->idb_xfer_len,
1347 			    int, XFER_BUF_TX_TO_INI);
1348 			idm_buf_tx_to_ini_done(idt, idb, IDM_STATUS_ABORTED);
1349 			mutex_enter(&idt->idt_mutex);
1350 		}
1351 	}
1352 
1353 	mutex_exit(&idt->idt_mutex);
1354 
1355 	return (IDM_STATUS_SUCCESS);
1356 }
1357 
1358 /*
1359  * idm_so_negotiate_key_values() validates the key values for this connection
1360  */
1361 /* ARGSUSED */
1362 static kv_status_t
1363 idm_so_negotiate_key_values(idm_conn_t *it, nvlist_t *request_nvl,
1364     nvlist_t *response_nvl, nvlist_t *negotiated_nvl)
1365 {
1366 	/* All parameters are negotiated at the iscsit level */
1367 	return (KV_HANDLED);
1368 }
1369 
1370 /*
1371  * idm_so_notice_key_values() activates the negotiated key values for
1372  * this connection.
1373  */
1374 static void
1375 idm_so_notice_key_values(idm_conn_t *it, nvlist_t *negotiated_nvl)
1376 {
1377 	char			*nvp_name;
1378 	nvpair_t		*nvp;
1379 	nvpair_t		*next_nvp;
1380 	int			nvrc;
1381 	idm_status_t		idm_status;
1382 	const idm_kv_xlate_t	*ikvx;
1383 
1384 	for (nvp = nvlist_next_nvpair(negotiated_nvl, NULL);
1385 	    nvp != NULL; nvp = next_nvp) {
1386 		next_nvp = nvlist_next_nvpair(negotiated_nvl, nvp);
1387 		nvp_name = nvpair_name(nvp);
1388 
1389 		ikvx = idm_lookup_kv_xlate(nvp_name, strlen(nvp_name));
1390 		switch (ikvx->ik_key_id) {
1391 		case KI_HEADER_DIGEST:
1392 		case KI_DATA_DIGEST:
1393 			idm_status = idm_so_handle_digest(it, nvp, ikvx);
1394 			ASSERT(idm_status == 0);
1395 
1396 			/* Remove processed item from negotiated_nvl list */
1397 			nvrc = nvlist_remove_all(
1398 			    negotiated_nvl, ikvx->ik_key_name);
1399 			ASSERT(nvrc == 0);
1400 			break;
1401 		default:
1402 			break;
1403 		}
1404 	}
1405 }
1406 
1407 
1408 static idm_status_t
1409 idm_so_handle_digest(idm_conn_t *it, nvpair_t *digest_choice,
1410     const idm_kv_xlate_t *ikvx)
1411 {
1412 	int			nvrc;
1413 	char			*digest_choice_string;
1414 
1415 	nvrc = nvpair_value_string(digest_choice,
1416 	    &digest_choice_string);
1417 	ASSERT(nvrc == 0);
1418 	if (strcasecmp(digest_choice_string, "crc32c") == 0) {
1419 		switch (ikvx->ik_key_id) {
1420 		case KI_HEADER_DIGEST:
1421 			it->ic_conn_flags |= IDM_CONN_HEADER_DIGEST;
1422 			break;
1423 		case KI_DATA_DIGEST:
1424 			it->ic_conn_flags |= IDM_CONN_DATA_DIGEST;
1425 			break;
1426 		default:
1427 			ASSERT(0);
1428 			break;
1429 		}
1430 	} else if (strcasecmp(digest_choice_string, "none") == 0) {
1431 		switch (ikvx->ik_key_id) {
1432 		case KI_HEADER_DIGEST:
1433 			it->ic_conn_flags &= ~IDM_CONN_HEADER_DIGEST;
1434 			break;
1435 		case KI_DATA_DIGEST:
1436 			it->ic_conn_flags &= ~IDM_CONN_DATA_DIGEST;
1437 			break;
1438 		default:
1439 			ASSERT(0);
1440 			break;
1441 		}
1442 	} else {
1443 		ASSERT(0);
1444 	}
1445 
1446 	return (IDM_STATUS_SUCCESS);
1447 }
1448 
1449 
1450 /*
1451  * idm_so_conn_is_capable() verifies that the passed connection is provided
1452  * for by the sockets interface.
1453  */
1454 /* ARGSUSED */
1455 static boolean_t
1456 idm_so_conn_is_capable(idm_conn_req_t *ic, idm_transport_caps_t *caps)
1457 {
1458 	return (B_TRUE);
1459 }
1460 
1461 /*
1462  * idm_so_rx_datain() validates the Data Sequence number of the PDU. The
1463  * idm_sorecv_scsidata() function invoked earlier actually reads the data
1464  * off the socket into the appropriate buffers.
1465  */
1466 static void
1467 idm_so_rx_datain(idm_conn_t *ic, idm_pdu_t *pdu)
1468 {
1469 	iscsi_data_hdr_t	*bhs;
1470 	idm_task_t		*idt;
1471 	idm_buf_t		*idb;
1472 	uint32_t		datasn;
1473 	size_t			offset;
1474 	iscsi_hdr_t		*ihp = (iscsi_hdr_t *)pdu->isp_hdr;
1475 	iscsi_data_rsp_hdr_t    *idrhp = (iscsi_data_rsp_hdr_t *)ihp;
1476 
1477 	ASSERT(ic != NULL);
1478 	ASSERT(pdu != NULL);
1479 
1480 	bhs	= (iscsi_data_hdr_t *)pdu->isp_hdr;
1481 	datasn	= ntohl(bhs->datasn);
1482 	offset	= ntohl(bhs->offset);
1483 
1484 	ASSERT(bhs->opcode == ISCSI_OP_SCSI_DATA_RSP);
1485 
1486 	/*
1487 	 * Look up the task corresponding to the initiator task tag
1488 	 * to get the buffers affiliated with the task.
1489 	 */
1490 	idt = idm_task_find(ic, bhs->itt, bhs->ttt);
1491 	if (idt == NULL) {
1492 		IDM_CONN_LOG(CE_WARN, "idm_so_rx_datain: failed to find task");
1493 		idm_pdu_rx_protocol_error(ic, pdu);
1494 		return;
1495 	}
1496 
1497 	idb = pdu->isp_sorx_buf;
1498 	if (idb == NULL) {
1499 		IDM_CONN_LOG(CE_WARN,
1500 		    "idm_so_rx_datain: failed to find buffer");
1501 		idm_task_rele(idt);
1502 		idm_pdu_rx_protocol_error(ic, pdu);
1503 		return;
1504 	}
1505 
1506 	/*
1507 	 * DataSN values should be sequential and should not have any gaps or
1508 	 * repetitions. Check the DataSN with the one stored in the task.
1509 	 */
1510 	if (datasn == idt->idt_exp_datasn) {
1511 		idt->idt_exp_datasn++; /* keep track of DataSN received */
1512 	} else {
1513 		IDM_CONN_LOG(CE_WARN, "idm_so_rx_datain: datasn out of order");
1514 		idm_task_rele(idt);
1515 		idm_pdu_rx_protocol_error(ic, pdu);
1516 		return;
1517 	}
1518 
1519 	/*
1520 	 * PDUs in a sequence should be in continuously increasing
1521 	 * address offset
1522 	 */
1523 	if (offset != idb->idb_exp_offset) {
1524 		IDM_CONN_LOG(CE_WARN, "idm_so_rx_datain: unexpected offset");
1525 		idm_task_rele(idt);
1526 		idm_pdu_rx_protocol_error(ic, pdu);
1527 		return;
1528 	}
1529 	/* Expected next relative buffer offset */
1530 	idb->idb_exp_offset += n2h24(bhs->dlength);
1531 	idt->idt_rx_bytes += n2h24(bhs->dlength);
1532 
1533 	idm_task_rele(idt);
1534 
1535 	/*
1536 	 * For now call scsi_rsp which will process the data rsp
1537 	 * Revisit, need to provide an explicit client entry point for
1538 	 * phase collapse completions.
1539 	 */
1540 	if (((ihp->opcode & ISCSI_OPCODE_MASK) == ISCSI_OP_SCSI_DATA_RSP) &&
1541 	    (idrhp->flags & ISCSI_FLAG_DATA_STATUS)) {
1542 		(*ic->ic_conn_ops.icb_rx_scsi_rsp)(ic, pdu);
1543 	}
1544 
1545 	idm_pdu_complete(pdu, IDM_STATUS_SUCCESS);
1546 }
1547 
1548 /*
1549  * The idm_so_rx_dataout() function is used by the iSCSI target to read
1550  * data from the Data-Out PDU sent by the iSCSI initiator.
1551  *
1552  * This function gets the Initiator Task Tag from the PDU BHS and looks up the
1553  * task to get the buffers associated with the PDU. A PDU might span buffers.
1554  * The data is then read into the respective buffer.
1555  */
1556 static void
1557 idm_so_rx_dataout(idm_conn_t *ic, idm_pdu_t *pdu)
1558 {
1559 
1560 	iscsi_data_hdr_t	*bhs;
1561 	idm_task_t		*idt;
1562 	idm_buf_t		*idb;
1563 	size_t			offset;
1564 
1565 	ASSERT(ic != NULL);
1566 	ASSERT(pdu != NULL);
1567 
1568 	bhs = (iscsi_data_hdr_t *)pdu->isp_hdr;
1569 	offset = ntohl(bhs->offset);
1570 	ASSERT(bhs->opcode == ISCSI_OP_SCSI_DATA);
1571 
1572 	/*
1573 	 * Look up the task corresponding to the initiator task tag
1574 	 * to get the buffers affiliated with the task.
1575 	 */
1576 	idt = idm_task_find(ic, bhs->itt, bhs->ttt);
1577 	if (idt == NULL) {
1578 		IDM_CONN_LOG(CE_WARN,
1579 		    "idm_so_rx_dataout: failed to find task");
1580 		idm_pdu_rx_protocol_error(ic, pdu);
1581 		return;
1582 	}
1583 
1584 	idb = pdu->isp_sorx_buf;
1585 	if (idb == NULL) {
1586 		IDM_CONN_LOG(CE_WARN,
1587 		    "idm_so_rx_dataout: failed to find buffer");
1588 		idm_task_rele(idt);
1589 		idm_pdu_rx_protocol_error(ic, pdu);
1590 		return;
1591 	}
1592 
1593 	/* Keep track of data transferred - check data offsets */
1594 	if (offset != idb->idb_exp_offset) {
1595 		IDM_CONN_LOG(CE_NOTE, "idm_so_rx_dataout: offset out of seq: "
1596 		    "%ld, %d", offset, idb->idb_exp_offset);
1597 		idm_task_rele(idt);
1598 		idm_pdu_rx_protocol_error(ic, pdu);
1599 		return;
1600 	}
1601 	/* Expected next relative offset */
1602 	idb->idb_exp_offset += ntoh24(bhs->dlength);
1603 	idt->idt_rx_bytes += n2h24(bhs->dlength);
1604 
1605 	/*
1606 	 * Call the buffer callback when the transfer is complete
1607 	 *
1608 	 * The connection state machine should only abort tasks after
1609 	 * shutting down the connection so we are assured that there
1610 	 * won't be a simultaneous attempt to abort this task at the
1611 	 * same time as we are processing this PDU (due to a connection
1612 	 * state change).
1613 	 */
1614 	if (bhs->flags & ISCSI_FLAG_FINAL) {
1615 		/*
1616 		 * We only want to call idm_buf_rx_from_ini_done once
1617 		 * per transfer.  It's possible that this task has
1618 		 * already been aborted in which case
1619 		 * idm_so_free_task_rsrc will call idm_buf_rx_from_ini_done
1620 		 * for each buffer with idb_in_transport==B_TRUE.  To
1621 		 * close this window and ensure that this doesn't happen,
1622 		 * we'll clear idb->idb_in_transport now while holding
1623 		 * the task mutex.   This is only really an issue for
1624 		 * SCSI task abort -- if tasks were being aborted because
1625 		 * of a connection state change the state machine would
1626 		 * have already stopped the receive thread.
1627 		 */
1628 		mutex_enter(&idt->idt_mutex);
1629 
1630 		/*
1631 		 * Release the task hold here (obtained in idm_task_find)
1632 		 * because the task may complete synchronously during
1633 		 * idm_buf_rx_from_ini_done.  Since we still have an active
1634 		 * buffer we know there is at least one additional hold on idt.
1635 		 */
1636 		idm_task_rele(idt);
1637 
1638 		/*
1639 		 * idm_buf_rx_from_ini_done releases idt->idt_mutex
1640 		 */
1641 		DTRACE_ISCSI_8(xfer__done, idm_conn_t *, idt->idt_ic,
1642 		    uintptr_t, idb->idb_buf, uint32_t, idb->idb_bufoffset,
1643 		    uint64_t, 0, uint32_t, 0, uint32_t, 0,
1644 		    uint32_t, idb->idb_xfer_len,
1645 		    int, XFER_BUF_RX_FROM_INI);
1646 		idm_buf_rx_from_ini_done(idt, idb, IDM_STATUS_SUCCESS);
1647 		idm_pdu_complete(pdu, IDM_STATUS_SUCCESS);
1648 		return;
1649 	}
1650 
1651 	idm_task_rele(idt);
1652 	idm_pdu_complete(pdu, IDM_STATUS_SUCCESS);
1653 }
1654 
1655 /*
1656  * The idm_so_rx_rtt() function is used by the iSCSI initiator to handle
1657  * the R2T PDU sent by the iSCSI target indicating that it is ready to
1658  * accept data. This gets the Initiator Task Tag (itt) from the PDU BHS
1659  * and looks up the task in the task tree using the itt to get the output
1660  * buffers associated the task. The R2T PDU contains the offset of the
1661  * requested data and the data length. This function then constructs a
1662  * sequence of iSCSI PDUs and outputs the requested data. Each Data-Out
1663  * PDU is associated with the R2T by the Target Transfer Tag  (ttt).
1664  */
1665 
1666 static void
1667 idm_so_rx_rtt(idm_conn_t *ic, idm_pdu_t *pdu)
1668 {
1669 	idm_task_t		*idt;
1670 	idm_buf_t		*idb;
1671 	iscsi_rtt_hdr_t		*rtt_hdr;
1672 	uint32_t		data_offset;
1673 	uint32_t		data_length;
1674 
1675 	ASSERT(ic != NULL);
1676 	ASSERT(pdu != NULL);
1677 
1678 	rtt_hdr	= (iscsi_rtt_hdr_t *)pdu->isp_hdr;
1679 	data_offset = ntohl(rtt_hdr->data_offset);
1680 	data_length = ntohl(rtt_hdr->data_length);
1681 	idt	= idm_task_find(ic, rtt_hdr->itt, rtt_hdr->ttt);
1682 
1683 	if (idt == NULL) {
1684 		IDM_CONN_LOG(CE_WARN, "idm_so_rx_rtt: could not find task");
1685 		idm_pdu_rx_protocol_error(ic, pdu);
1686 		return;
1687 	}
1688 
1689 	/* Find the buffer bound to the task by the iSCSI initiator */
1690 	mutex_enter(&idt->idt_mutex);
1691 	idb = idm_buf_find(&idt->idt_outbufv, data_offset);
1692 	if (idb == NULL) {
1693 		mutex_exit(&idt->idt_mutex);
1694 		idm_task_rele(idt);
1695 		IDM_CONN_LOG(CE_WARN, "idm_so_rx_rtt: could not find buffer");
1696 		idm_pdu_rx_protocol_error(ic, pdu);
1697 		return;
1698 	}
1699 
1700 	/* return buffer contains this data */
1701 	if (data_offset + data_length > idb->idb_buflen) {
1702 		/* Overflow */
1703 		mutex_exit(&idt->idt_mutex);
1704 		idm_task_rele(idt);
1705 		IDM_CONN_LOG(CE_WARN, "idm_so_rx_rtt: read from outside "
1706 		    "buffer");
1707 		idm_pdu_rx_protocol_error(ic, pdu);
1708 		return;
1709 	}
1710 
1711 	idt->idt_r2t_ttt = rtt_hdr->ttt;
1712 	idt->idt_exp_datasn = 0;
1713 
1714 	idm_so_send_rtt_data(ic, idt, idb, data_offset,
1715 	    ntohl(rtt_hdr->data_length));
1716 	mutex_exit(&idt->idt_mutex);
1717 
1718 	idm_pdu_complete(pdu, IDM_STATUS_SUCCESS);
1719 	idm_task_rele(idt);
1720 
1721 }
1722 
1723 idm_status_t
1724 idm_sorecvdata(idm_conn_t *ic, idm_pdu_t *pdu)
1725 {
1726 	uint8_t		pad[ISCSI_PAD_WORD_LEN];
1727 	int		pad_len;
1728 	uint32_t	data_digest_crc;
1729 	uint32_t	crc_calculated;
1730 	int		total_len;
1731 	idm_so_conn_t	*so_conn;
1732 
1733 	so_conn = ic->ic_transport_private;
1734 
1735 	pad_len = ((ISCSI_PAD_WORD_LEN -
1736 	    (pdu->isp_datalen & (ISCSI_PAD_WORD_LEN - 1))) &
1737 	    (ISCSI_PAD_WORD_LEN - 1));
1738 
1739 	ASSERT(pdu->isp_iovlen < (PDU_MAX_IOVLEN - 2)); /* pad + data digest */
1740 
1741 	total_len = pdu->isp_datalen;
1742 
1743 	if (pad_len) {
1744 		pdu->isp_iov[pdu->isp_iovlen].iov_base	= (char *)&pad;
1745 		pdu->isp_iov[pdu->isp_iovlen].iov_len	= pad_len;
1746 		total_len		+= pad_len;
1747 		pdu->isp_iovlen++;
1748 	}
1749 
1750 	/* setup data digest */
1751 	if ((ic->ic_conn_flags & IDM_CONN_DATA_DIGEST) != 0) {
1752 		pdu->isp_iov[pdu->isp_iovlen].iov_base =
1753 		    (char *)&data_digest_crc;
1754 		pdu->isp_iov[pdu->isp_iovlen].iov_len =
1755 		    sizeof (data_digest_crc);
1756 		total_len		+= sizeof (data_digest_crc);
1757 		pdu->isp_iovlen++;
1758 	}
1759 
1760 	pdu->isp_data = (uint8_t *)(uintptr_t)pdu->isp_iov[0].iov_base;
1761 
1762 	if (idm_iov_sorecv(so_conn->ic_so, &pdu->isp_iov[0],
1763 	    pdu->isp_iovlen, total_len) != 0) {
1764 		return (IDM_STATUS_IO);
1765 	}
1766 
1767 	if ((ic->ic_conn_flags & IDM_CONN_DATA_DIGEST) != 0) {
1768 		crc_calculated = idm_crc32c(pdu->isp_data,
1769 		    pdu->isp_datalen);
1770 		if (pad_len) {
1771 			crc_calculated = idm_crc32c_continued((char *)&pad,
1772 			    pad_len, crc_calculated);
1773 		}
1774 		if (crc_calculated != data_digest_crc) {
1775 			IDM_CONN_LOG(CE_WARN,
1776 			    "idm_sorecvdata: "
1777 			    "CRC error: actual 0x%x, calc 0x%x",
1778 			    data_digest_crc, crc_calculated);
1779 
1780 			/* Invalid Data Digest */
1781 			return (IDM_STATUS_DATA_DIGEST);
1782 		}
1783 	}
1784 
1785 	return (IDM_STATUS_SUCCESS);
1786 }
1787 
1788 /*
1789  * idm_sorecv_scsidata() is used to receive scsi data from the socket. The
1790  * Data-type PDU header must be read into the idm_pdu_t structure prior to
1791  * calling this function.
1792  */
1793 idm_status_t
1794 idm_sorecv_scsidata(idm_conn_t *ic, idm_pdu_t *pdu)
1795 {
1796 	iscsi_data_hdr_t	*bhs;
1797 	idm_task_t		*task;
1798 	uint32_t		offset;
1799 	uint8_t			opcode;
1800 	uint32_t		dlength;
1801 	list_t			*buflst;
1802 	uint32_t		xfer_bytes;
1803 	idm_status_t		status;
1804 
1805 	ASSERT(ic != NULL);
1806 	ASSERT(pdu != NULL);
1807 
1808 	bhs	= (iscsi_data_hdr_t *)pdu->isp_hdr;
1809 
1810 	offset	= ntohl(bhs->offset);
1811 	opcode	= bhs->opcode;
1812 	dlength = n2h24(bhs->dlength);
1813 
1814 	ASSERT((opcode == ISCSI_OP_SCSI_DATA_RSP) ||
1815 	    (opcode == ISCSI_OP_SCSI_DATA));
1816 
1817 	/*
1818 	 * Successful lookup implicitly gets a "hold" on the task.  This
1819 	 * hold must be released before leaving this function.  At one
1820 	 * point we were caching this task context and retaining the hold
1821 	 * but it turned out to be very difficult to release the hold properly.
1822 	 * The task can be aborted and the connection shutdown between this
1823 	 * call and the subsequent expected call to idm_so_rx_datain/
1824 	 * idm_so_rx_dataout (in which case those functions are not called).
1825 	 * Releasing the hold in the PDU callback doesn't work well either
1826 	 * because the whole task may be completed by then at which point
1827 	 * it is too late to release the hold -- for better or worse this
1828 	 * code doesn't wait on the refcnts during normal operation.
1829 	 * idm_task_find() is very fast and it is not a huge burden if we
1830 	 * have to do it twice.
1831 	 */
1832 	task = idm_task_find(ic, bhs->itt, bhs->ttt);
1833 	if (task == NULL) {
1834 		IDM_CONN_LOG(CE_WARN,
1835 		    "idm_sorecv_scsidata: could not find task");
1836 		return (IDM_STATUS_FAIL);
1837 	}
1838 
1839 	mutex_enter(&task->idt_mutex);
1840 	buflst	= (opcode == ISCSI_OP_SCSI_DATA_RSP) ?
1841 	    &task->idt_inbufv : &task->idt_outbufv;
1842 	pdu->isp_sorx_buf = idm_buf_find(buflst, offset);
1843 	mutex_exit(&task->idt_mutex);
1844 
1845 	if (pdu->isp_sorx_buf == NULL) {
1846 		idm_task_rele(task);
1847 		IDM_CONN_LOG(CE_WARN, "idm_sorecv_scsidata: could not find "
1848 		    "buffer for offset %x opcode=%x",
1849 		    offset, opcode);
1850 		return (IDM_STATUS_FAIL);
1851 	}
1852 
1853 	xfer_bytes = idm_fill_iov(pdu, pdu->isp_sorx_buf, offset, dlength);
1854 	ASSERT(xfer_bytes != 0);
1855 	if (xfer_bytes != dlength) {
1856 		idm_task_rele(task);
1857 		/*
1858 		 * Buffer overflow, connection error.  The PDU data is still
1859 		 * sitting in the socket so we can't use the connection
1860 		 * again until that data is drained.
1861 		 */
1862 		return (IDM_STATUS_FAIL);
1863 	}
1864 
1865 	status = idm_sorecvdata(ic, pdu);
1866 
1867 	idm_task_rele(task);
1868 
1869 	return (status);
1870 }
1871 
1872 static uint32_t
1873 idm_fill_iov(idm_pdu_t *pdu, idm_buf_t *idb, uint32_t ro, uint32_t dlength)
1874 {
1875 	uint32_t	buf_ro = ro - idb->idb_bufoffset;
1876 	uint32_t	xfer_len = min(dlength, idb->idb_buflen - buf_ro);
1877 
1878 	ASSERT(ro >= idb->idb_bufoffset);
1879 
1880 	pdu->isp_iov[pdu->isp_iovlen].iov_base	=
1881 	    (caddr_t)idb->idb_buf + buf_ro;
1882 	pdu->isp_iov[pdu->isp_iovlen].iov_len	= xfer_len;
1883 	pdu->isp_iovlen++;
1884 
1885 	return (xfer_len);
1886 }
1887 
1888 int
1889 idm_sorecv_nonscsidata(idm_conn_t *ic, idm_pdu_t *pdu)
1890 {
1891 	pdu->isp_data = kmem_alloc(pdu->isp_datalen, KM_SLEEP);
1892 	ASSERT(pdu->isp_data != NULL);
1893 
1894 	pdu->isp_databuflen = pdu->isp_datalen;
1895 	pdu->isp_iov[0].iov_base = (caddr_t)pdu->isp_data;
1896 	pdu->isp_iov[0].iov_len = pdu->isp_datalen;
1897 	pdu->isp_iovlen = 1;
1898 	/*
1899 	 * Since we are associating a new data buffer with this received
1900 	 * PDU we need to set a specific callback to free the data
1901 	 * after the PDU is processed.
1902 	 */
1903 	pdu->isp_flags |= IDM_PDU_ADDL_DATA;
1904 	pdu->isp_callback = idm_sorx_addl_pdu_cb;
1905 
1906 	return (idm_sorecvdata(ic, pdu));
1907 }
1908 
1909 void
1910 idm_sorx_thread(void *arg)
1911 {
1912 	boolean_t	conn_failure = B_FALSE;
1913 	idm_conn_t	*ic = (idm_conn_t *)arg;
1914 	idm_so_conn_t	*so_conn;
1915 	idm_pdu_t	*pdu;
1916 	idm_status_t	rc;
1917 
1918 	idm_conn_hold(ic);
1919 
1920 	mutex_enter(&ic->ic_mutex);
1921 
1922 	so_conn = ic->ic_transport_private;
1923 	so_conn->ic_rx_thread_running = B_TRUE;
1924 	so_conn->ic_rx_thread_did = so_conn->ic_rx_thread->t_did;
1925 	cv_signal(&ic->ic_cv);
1926 
1927 	while (so_conn->ic_rx_thread_running) {
1928 		mutex_exit(&ic->ic_mutex);
1929 
1930 		/*
1931 		 * Get PDU with default header size (large enough for
1932 		 * BHS plus any anticipated AHS).  PDU from
1933 		 * the cache will have all values set correctly
1934 		 * for sockets RX including callback.
1935 		 */
1936 		pdu = kmem_cache_alloc(idm.idm_sorx_pdu_cache, KM_SLEEP);
1937 		pdu->isp_ic = ic;
1938 		pdu->isp_flags = 0;
1939 		pdu->isp_transport_hdrlen = 0;
1940 
1941 		if ((rc = idm_sorecvhdr(ic, pdu)) != 0) {
1942 			/*
1943 			 * Call idm_pdu_complete so that we call the callback
1944 			 * and ensure any memory allocated in idm_sorecvhdr
1945 			 * gets freed up.
1946 			 */
1947 			idm_pdu_complete(pdu, IDM_STATUS_FAIL);
1948 
1949 			/*
1950 			 * If ic_rx_thread_running is still set then
1951 			 * this is some kind of connection problem
1952 			 * on the socket.  In this case we want to
1953 			 * generate an event.  Otherwise some other
1954 			 * thread closed the socket due to another
1955 			 * issue in which case we don't need to
1956 			 * generate an event.
1957 			 */
1958 			mutex_enter(&ic->ic_mutex);
1959 			if (so_conn->ic_rx_thread_running) {
1960 				conn_failure = B_TRUE;
1961 				so_conn->ic_rx_thread_running = B_FALSE;
1962 			}
1963 
1964 			continue;
1965 		}
1966 
1967 		/*
1968 		 * Header has been read and validated.  Now we need
1969 		 * to read the PDU data payload (if present).  SCSI data
1970 		 * need to be transferred from the socket directly into
1971 		 * the associated transfer buffer for the SCSI task.
1972 		 */
1973 		if (pdu->isp_datalen != 0) {
1974 			if ((IDM_PDU_OPCODE(pdu) == ISCSI_OP_SCSI_DATA) ||
1975 			    (IDM_PDU_OPCODE(pdu) == ISCSI_OP_SCSI_DATA_RSP)) {
1976 				rc = idm_sorecv_scsidata(ic, pdu);
1977 				/*
1978 				 * All SCSI errors are fatal to the
1979 				 * connection right now since we have no
1980 				 * place to put the data.  What we need
1981 				 * is some kind of sink to dispose of unwanted
1982 				 * SCSI data.  For example an invalid task tag
1983 				 * should not kill the connection (although
1984 				 * we may want to drop the connection).
1985 				 */
1986 			} else {
1987 				/*
1988 				 * Not data PDUs so allocate a buffer for the
1989 				 * data segment and read the remaining data.
1990 				 */
1991 				rc = idm_sorecv_nonscsidata(ic, pdu);
1992 			}
1993 			if (rc != 0) {
1994 				/*
1995 				 * Call idm_pdu_complete so that we call the
1996 				 * callback and ensure any memory allocated
1997 				 * in idm_sorecvhdr gets freed up.
1998 				 */
1999 				idm_pdu_complete(pdu, IDM_STATUS_FAIL);
2000 
2001 				/*
2002 				 * If ic_rx_thread_running is still set then
2003 				 * this is some kind of connection problem
2004 				 * on the socket.  In this case we want to
2005 				 * generate an event.  Otherwise some other
2006 				 * thread closed the socket due to another
2007 				 * issue in which case we don't need to
2008 				 * generate an event.
2009 				 */
2010 				mutex_enter(&ic->ic_mutex);
2011 				if (so_conn->ic_rx_thread_running) {
2012 					conn_failure = B_TRUE;
2013 					so_conn->ic_rx_thread_running = B_FALSE;
2014 				}
2015 				continue;
2016 			}
2017 		}
2018 
2019 		/*
2020 		 * Process RX PDU
2021 		 */
2022 		idm_pdu_rx(ic, pdu);
2023 
2024 		mutex_enter(&ic->ic_mutex);
2025 	}
2026 
2027 	mutex_exit(&ic->ic_mutex);
2028 
2029 	/*
2030 	 * If we dropped out of the RX processing loop because of
2031 	 * a socket problem or other connection failure (including
2032 	 * digest errors) then we need to generate a state machine
2033 	 * event to shut the connection down.
2034 	 * If the state machine is already in, for example, INIT_ERROR, this
2035 	 * event will get dropped, and the TX thread will never be notified
2036 	 * to shut down.  To be safe, we'll just notify it here.
2037 	 */
2038 	if (conn_failure) {
2039 		if (so_conn->ic_tx_thread_running) {
2040 			so_conn->ic_tx_thread_running = B_FALSE;
2041 			mutex_enter(&so_conn->ic_tx_mutex);
2042 			cv_signal(&so_conn->ic_tx_cv);
2043 			mutex_exit(&so_conn->ic_tx_mutex);
2044 		}
2045 
2046 		idm_conn_event(ic, CE_TRANSPORT_FAIL, rc);
2047 	}
2048 
2049 	idm_conn_rele(ic);
2050 
2051 	thread_exit();
2052 }
2053 
2054 /*
2055  * idm_so_tx
2056  *
2057  * This is the implementation of idm_transport_ops_t's it_tx_pdu entry
2058  * point.  By definition, it is supposed to be fast.  So, simply queue
2059  * the entry and return.  The real work is done by idm_i_so_tx() via
2060  * idm_sotx_thread().
2061  */
2062 
2063 static void
2064 idm_so_tx(idm_conn_t *ic, idm_pdu_t *pdu)
2065 {
2066 	idm_so_conn_t *so_conn = ic->ic_transport_private;
2067 
2068 	ASSERT(pdu->isp_ic == ic);
2069 	mutex_enter(&so_conn->ic_tx_mutex);
2070 
2071 	if (!so_conn->ic_tx_thread_running) {
2072 		mutex_exit(&so_conn->ic_tx_mutex);
2073 		idm_pdu_complete(pdu, IDM_STATUS_ABORTED);
2074 		return;
2075 	}
2076 
2077 	list_insert_tail(&so_conn->ic_tx_list, (void *)pdu);
2078 	cv_signal(&so_conn->ic_tx_cv);
2079 	mutex_exit(&so_conn->ic_tx_mutex);
2080 }
2081 
2082 static idm_status_t
2083 idm_i_so_tx(idm_pdu_t *pdu)
2084 {
2085 	idm_conn_t	*ic = pdu->isp_ic;
2086 	idm_status_t	status = IDM_STATUS_SUCCESS;
2087 	uint8_t		pad[ISCSI_PAD_WORD_LEN];
2088 	int		pad_len;
2089 	uint32_t	hdr_digest_crc;
2090 	uint32_t	data_digest_crc = 0;
2091 	int		total_len = 0;
2092 	int		iovlen = 0;
2093 	struct iovec	iov[6];
2094 	idm_so_conn_t	*so_conn;
2095 
2096 	so_conn = ic->ic_transport_private;
2097 
2098 	/* Setup BHS */
2099 	iov[iovlen].iov_base	= (caddr_t)pdu->isp_hdr;
2100 	iov[iovlen].iov_len	= pdu->isp_hdrlen;
2101 	total_len		+= iov[iovlen].iov_len;
2102 	iovlen++;
2103 
2104 	/* Setup header digest */
2105 	if (((pdu->isp_flags & IDM_PDU_LOGIN_TX) == 0) &&
2106 	    (ic->ic_conn_flags & IDM_CONN_HEADER_DIGEST)) {
2107 		hdr_digest_crc = idm_crc32c(pdu->isp_hdr, pdu->isp_hdrlen);
2108 
2109 		iov[iovlen].iov_base	= (caddr_t)&hdr_digest_crc;
2110 		iov[iovlen].iov_len	= sizeof (hdr_digest_crc);
2111 		total_len		+= iov[iovlen].iov_len;
2112 		iovlen++;
2113 	}
2114 
2115 	/* Setup the data */
2116 	if (pdu->isp_datalen) {
2117 		idm_task_t		*idt;
2118 		idm_buf_t		*idb;
2119 		iscsi_data_hdr_t	*ihp;
2120 		ihp = (iscsi_data_hdr_t *)pdu->isp_hdr;
2121 		/* Write of immediate data */
2122 		if (ic->ic_ffp &&
2123 		    (ihp->opcode == ISCSI_OP_SCSI_CMD ||
2124 		    ihp->opcode == ISCSI_OP_SCSI_DATA)) {
2125 			idt = idm_task_find(ic, ihp->itt, ihp->ttt);
2126 			if (idt) {
2127 				mutex_enter(&idt->idt_mutex);
2128 				idb = idm_buf_find(&idt->idt_outbufv, 0);
2129 				mutex_exit(&idt->idt_mutex);
2130 				/*
2131 				 * If the initiator call to idm_buf_alloc
2132 				 * failed then we can get to this point
2133 				 * without a bound buffer.  The associated
2134 				 * connection failure will clean things up
2135 				 * later.  It would be nice to come up with
2136 				 * a cleaner way to handle this.  In
2137 				 * particular it seems absurd to look up
2138 				 * the task and the buffer just to update
2139 				 * this counter.
2140 				 */
2141 				if (idb)
2142 					idb->idb_xfer_len += pdu->isp_datalen;
2143 				idm_task_rele(idt);
2144 			}
2145 		}
2146 
2147 		iov[iovlen].iov_base = (caddr_t)pdu->isp_data;
2148 		iov[iovlen].iov_len  = pdu->isp_datalen;
2149 		total_len += iov[iovlen].iov_len;
2150 		iovlen++;
2151 	}
2152 
2153 	/* Setup the data pad if necessary */
2154 	pad_len = ((ISCSI_PAD_WORD_LEN -
2155 	    (pdu->isp_datalen & (ISCSI_PAD_WORD_LEN - 1))) &
2156 	    (ISCSI_PAD_WORD_LEN - 1));
2157 
2158 	if (pad_len) {
2159 		bzero(pad, sizeof (pad));
2160 		iov[iovlen].iov_base = (void *)&pad;
2161 		iov[iovlen].iov_len  = pad_len;
2162 		total_len		+= iov[iovlen].iov_len;
2163 		iovlen++;
2164 	}
2165 
2166 	/*
2167 	 * Setup the data digest if enabled.  Data-digest is not sent
2168 	 * for login-phase PDUs.
2169 	 */
2170 	if ((ic->ic_conn_flags & IDM_CONN_DATA_DIGEST) &&
2171 	    ((pdu->isp_flags & IDM_PDU_LOGIN_TX) == 0) &&
2172 	    (pdu->isp_datalen || pad_len)) {
2173 		/*
2174 		 * RFC3720/10.2.3: A zero-length Data Segment also
2175 		 * implies a zero-length data digest.
2176 		 */
2177 		if (pdu->isp_datalen) {
2178 			data_digest_crc = idm_crc32c(pdu->isp_data,
2179 			    pdu->isp_datalen);
2180 		}
2181 		if (pad_len) {
2182 			data_digest_crc = idm_crc32c_continued(&pad,
2183 			    pad_len, data_digest_crc);
2184 		}
2185 
2186 		iov[iovlen].iov_base	= (caddr_t)&data_digest_crc;
2187 		iov[iovlen].iov_len	= sizeof (data_digest_crc);
2188 		total_len		+= iov[iovlen].iov_len;
2189 		iovlen++;
2190 	}
2191 
2192 	/* Transmit the PDU */
2193 	if (idm_iov_sosend(so_conn->ic_so, &iov[0], iovlen,
2194 	    total_len) != 0) {
2195 		/* Set error status */
2196 		IDM_CONN_LOG(CE_WARN,
2197 		    "idm_so_tx: failed to transmit the PDU, so: %p ic: %p "
2198 		    "data: %p", (void *) so_conn->ic_so, (void *) ic,
2199 		    (void *) pdu->isp_data);
2200 		status = IDM_STATUS_IO;
2201 	}
2202 
2203 	/*
2204 	 * Success does not mean that the PDU actually reached the
2205 	 * remote node since it could get dropped along the way.
2206 	 */
2207 	idm_pdu_complete(pdu, status);
2208 
2209 	return (status);
2210 }
2211 
2212 /*
2213  * The idm_so_buf_tx_to_ini() is used by the target iSCSI layer to transmit the
2214  * Data-In PDUs using sockets. Based on the negotiated MaxRecvDataSegmentLength,
2215  * the buffer is segmented into a sequence of Data-In PDUs, ordered by DataSN.
2216  * A target can invoke this function multiple times for a single read command
2217  * (identified by the same ITT) to split the input into several sequences.
2218  *
2219  * DataSN starts with 0 for the first data PDU of an input command and advances
2220  * by 1 for each subsequent data PDU. Each sequence will have its own F bit,
2221  * which is set to 1 for the last data PDU of a sequence.
2222  *
2223  * Scope for Prototype build:
2224  * The data PDUs within a sequence will be sent in order with the buffer offset
2225  * in increasing order. i.e. initiator and target must have negotiated the
2226  * "DataPDUInOrder" to "Yes". The order between sequences is not enforced.
2227  *
2228  * Caller holds idt->idt_mutex
2229  */
2230 static idm_status_t
2231 idm_so_buf_tx_to_ini(idm_task_t *idt, idm_buf_t *idb)
2232 {
2233 	idm_so_conn_t	*so_conn = idb->idb_ic->ic_transport_private;
2234 	idm_pdu_t	tmppdu;
2235 
2236 	ASSERT(mutex_owned(&idt->idt_mutex));
2237 
2238 	/*
2239 	 * Put the idm_buf_t on the tx queue.  It will be transmitted by
2240 	 * idm_sotx_thread.
2241 	 */
2242 	mutex_enter(&so_conn->ic_tx_mutex);
2243 
2244 	DTRACE_ISCSI_8(xfer__start, idm_conn_t *, idt->idt_ic,
2245 	    uintptr_t, idb->idb_buf, uint32_t, idb->idb_bufoffset,
2246 	    uint64_t, 0, uint32_t, 0, uint32_t, 0,
2247 	    uint32_t, idb->idb_xfer_len, int, XFER_BUF_TX_TO_INI);
2248 
2249 	if (!so_conn->ic_tx_thread_running) {
2250 		mutex_exit(&so_conn->ic_tx_mutex);
2251 		/*
2252 		 * Don't release idt->idt_mutex since we're supposed to hold
2253 		 * in when calling idm_buf_tx_to_ini_done
2254 		 */
2255 		DTRACE_ISCSI_8(xfer__done, idm_conn_t *, idt->idt_ic,
2256 		    uintptr_t, idb->idb_buf, uint32_t, idb->idb_bufoffset,
2257 		    uint64_t, 0, uint32_t, 0, uint32_t, 0,
2258 		    uint32_t, idb->idb_xfer_len,
2259 		    int, XFER_BUF_TX_TO_INI);
2260 		idm_buf_tx_to_ini_done(idt, idb, IDM_STATUS_ABORTED);
2261 		return (IDM_STATUS_FAIL);
2262 	}
2263 
2264 	/*
2265 	 * Build a template for the data PDU headers we will use so that
2266 	 * the SN values will stay consistent with other PDU's we are
2267 	 * transmitting like R2T and SCSI status.
2268 	 */
2269 	bzero(&idb->idb_data_hdr_tmpl, sizeof (iscsi_hdr_t));
2270 	tmppdu.isp_hdr = &idb->idb_data_hdr_tmpl;
2271 	(*idt->idt_ic->ic_conn_ops.icb_build_hdr)(idt, &tmppdu,
2272 	    ISCSI_OP_SCSI_DATA_RSP);
2273 	idb->idb_tx_thread = B_TRUE;
2274 	list_insert_tail(&so_conn->ic_tx_list, (void *)idb);
2275 	cv_signal(&so_conn->ic_tx_cv);
2276 	mutex_exit(&so_conn->ic_tx_mutex);
2277 	mutex_exit(&idt->idt_mutex);
2278 
2279 	/*
2280 	 * Returning success here indicates the transfer was successfully
2281 	 * dispatched -- it does not mean that the transfer completed
2282 	 * successfully.
2283 	 */
2284 	return (IDM_STATUS_SUCCESS);
2285 }
2286 
2287 /*
2288  * The idm_so_buf_rx_from_ini() is used by the target iSCSI layer to specify the
2289  * data blocks it is ready to receive from the initiator in response to a WRITE
2290  * SCSI command. The target iSCSI layer passes the information about the desired
2291  * data blocks to the initiator in one R2T PDU. The receiving buffer, the buffer
2292  * offset and datalen are passed via the 'idb' argument.
2293  *
2294  * Scope for Prototype build:
2295  * R2Ts are required for any Data-Out PDU, i.e. initiator and target must have
2296  * negotiated the "InitialR2T" to "Yes".
2297  *
2298  * Caller holds idt->idt_mutex
2299  */
2300 static idm_status_t
2301 idm_so_buf_rx_from_ini(idm_task_t *idt, idm_buf_t *idb)
2302 {
2303 	idm_pdu_t		*pdu;
2304 	iscsi_rtt_hdr_t		*rtt;
2305 
2306 	ASSERT(mutex_owned(&idt->idt_mutex));
2307 
2308 	DTRACE_ISCSI_8(xfer__start, idm_conn_t *, idt->idt_ic,
2309 	    uintptr_t, idb->idb_buf, uint32_t, idb->idb_bufoffset,
2310 	    uint64_t, 0, uint32_t, 0, uint32_t, 0,
2311 	    uint32_t, idb->idb_xfer_len, int, XFER_BUF_RX_FROM_INI);
2312 
2313 	pdu = kmem_cache_alloc(idm.idm_sotx_pdu_cache, KM_SLEEP);
2314 	pdu->isp_ic = idt->idt_ic;
2315 	bzero(pdu->isp_hdr, sizeof (iscsi_rtt_hdr_t));
2316 
2317 	/* iSCSI layer fills the TTT, ITT, StatSN, ExpCmdSN, MaxCmdSN */
2318 	(*idt->idt_ic->ic_conn_ops.icb_build_hdr)(idt, pdu, ISCSI_OP_RTT_RSP);
2319 
2320 	/* set the rttsn, rtt.flags, rtt.data_offset and rtt.data_length */
2321 	rtt = (iscsi_rtt_hdr_t *)(pdu->isp_hdr);
2322 
2323 	rtt->opcode		= ISCSI_OP_RTT_RSP;
2324 	rtt->flags		= ISCSI_FLAG_FINAL;
2325 	rtt->data_offset	= htonl(idb->idb_bufoffset);
2326 	rtt->data_length	= htonl(idb->idb_xfer_len);
2327 	rtt->rttsn		= htonl(idt->idt_exp_rttsn++);
2328 
2329 	/* Keep track of buffer offsets */
2330 	idb->idb_exp_offset	= idb->idb_bufoffset;
2331 	mutex_exit(&idt->idt_mutex);
2332 
2333 	/*
2334 	 * Transmit the PDU.
2335 	 */
2336 	idm_pdu_tx(pdu);
2337 
2338 	return (IDM_STATUS_SUCCESS);
2339 }
2340 
2341 static idm_status_t
2342 idm_so_buf_alloc(idm_buf_t *idb, uint64_t buflen)
2343 {
2344 	if ((buflen > IDM_SO_BUF_CACHE_LB) && (buflen <= IDM_SO_BUF_CACHE_UB)) {
2345 		idb->idb_buf = kmem_cache_alloc(idm.idm_so_128k_buf_cache,
2346 		    KM_NOSLEEP);
2347 		idb->idb_buf_private = idm.idm_so_128k_buf_cache;
2348 	} else {
2349 		idb->idb_buf = kmem_alloc(buflen, KM_NOSLEEP);
2350 		idb->idb_buf_private = NULL;
2351 	}
2352 
2353 	if (idb->idb_buf == NULL) {
2354 		IDM_CONN_LOG(CE_NOTE,
2355 		    "idm_so_buf_alloc: failed buffer allocation");
2356 		return (IDM_STATUS_FAIL);
2357 	}
2358 
2359 	return (IDM_STATUS_SUCCESS);
2360 }
2361 
2362 /* ARGSUSED */
2363 static idm_status_t
2364 idm_so_buf_setup(idm_buf_t *idb)
2365 {
2366 	/* Ensure bufalloc'd flag is unset */
2367 	idb->idb_bufalloc = B_FALSE;
2368 
2369 	return (IDM_STATUS_SUCCESS);
2370 }
2371 
2372 /* ARGSUSED */
2373 static void
2374 idm_so_buf_teardown(idm_buf_t *idb)
2375 {
2376 	/* nothing to do here */
2377 }
2378 
2379 static void
2380 idm_so_buf_free(idm_buf_t *idb)
2381 {
2382 	if (idb->idb_buf_private == NULL) {
2383 		kmem_free(idb->idb_buf, idb->idb_buflen);
2384 	} else {
2385 		kmem_cache_free(idb->idb_buf_private, idb->idb_buf);
2386 	}
2387 }
2388 
2389 static void
2390 idm_so_send_rtt_data(idm_conn_t *ic, idm_task_t *idt, idm_buf_t *idb,
2391     uint32_t offset, uint32_t length)
2392 {
2393 	idm_so_conn_t	*so_conn = ic->ic_transport_private;
2394 	idm_pdu_t	tmppdu;
2395 	idm_buf_t	*rtt_buf;
2396 
2397 	ASSERT(mutex_owned(&idt->idt_mutex));
2398 
2399 	/*
2400 	 * Allocate a buffer to represent the RTT transfer.  We could further
2401 	 * optimize this by allocating the buffers internally from an rtt
2402 	 * specific buffer cache since this is socket-specific code but for
2403 	 * now we will keep it simple.
2404 	 */
2405 	rtt_buf = idm_buf_alloc(ic, (uint8_t *)idb->idb_buf + offset, length);
2406 	if (rtt_buf == NULL) {
2407 		/*
2408 		 * If we're in FFP then the failure was likely a resource
2409 		 * allocation issue and we should close the connection by
2410 		 * sending a CE_TRANSPORT_FAIL event.
2411 		 *
2412 		 * If we're not in FFP then idm_buf_alloc will always
2413 		 * fail and the state is transitioning to "complete" anyway
2414 		 * so we won't bother to send an event.
2415 		 */
2416 		mutex_enter(&ic->ic_state_mutex);
2417 		if (ic->ic_ffp)
2418 			idm_conn_event_locked(ic, CE_TRANSPORT_FAIL,
2419 			    NULL, CT_NONE);
2420 		mutex_exit(&ic->ic_state_mutex);
2421 		return;
2422 	}
2423 
2424 	rtt_buf->idb_buf_cb = NULL;
2425 	rtt_buf->idb_cb_arg = NULL;
2426 	rtt_buf->idb_bufoffset = offset;
2427 	rtt_buf->idb_xfer_len = length;
2428 	rtt_buf->idb_ic = idt->idt_ic;
2429 	rtt_buf->idb_task_binding = idt;
2430 
2431 	/*
2432 	 * Put the idm_buf_t on the tx queue.  It will be transmitted by
2433 	 * idm_sotx_thread.
2434 	 */
2435 	mutex_enter(&so_conn->ic_tx_mutex);
2436 
2437 	if (!so_conn->ic_tx_thread_running) {
2438 		idm_buf_free(rtt_buf);
2439 		mutex_exit(&so_conn->ic_tx_mutex);
2440 		return;
2441 	}
2442 
2443 	/*
2444 	 * This new buffer represents an additional reference on the task
2445 	 */
2446 	idm_task_hold(idt);
2447 
2448 	/*
2449 	 * Build a template for the data PDU headers we will use so that
2450 	 * the SN values will stay consistent with other PDU's we are
2451 	 * transmitting like R2T and SCSI status.
2452 	 */
2453 	bzero(&rtt_buf->idb_data_hdr_tmpl, sizeof (iscsi_hdr_t));
2454 	tmppdu.isp_hdr = &rtt_buf->idb_data_hdr_tmpl;
2455 	(*idt->idt_ic->ic_conn_ops.icb_build_hdr)(idt, &tmppdu,
2456 	    ISCSI_OP_SCSI_DATA);
2457 	rtt_buf->idb_tx_thread = B_TRUE;
2458 	rtt_buf->idb_in_transport = B_TRUE;
2459 	list_insert_tail(&so_conn->ic_tx_list, (void *)rtt_buf);
2460 	cv_signal(&so_conn->ic_tx_cv);
2461 	mutex_exit(&so_conn->ic_tx_mutex);
2462 }
2463 
2464 static void
2465 idm_so_send_rtt_data_done(idm_task_t *idt, idm_buf_t *idb)
2466 {
2467 	/*
2468 	 * Don't worry about status -- we assume any error handling
2469 	 * is performed by the caller (idm_sotx_thread).
2470 	 */
2471 	idb->idb_in_transport = B_FALSE;
2472 	idm_task_rele(idt);
2473 	idm_buf_free(idb);
2474 }
2475 
2476 static idm_status_t
2477 idm_so_send_buf_region(idm_task_t *idt, idm_buf_t *idb,
2478     uint32_t buf_region_offset, uint32_t buf_region_length)
2479 {
2480 	idm_conn_t		*ic;
2481 	uint32_t		max_dataseglen;
2482 	size_t			remainder, chunk;
2483 	uint32_t		data_offset = buf_region_offset;
2484 	iscsi_data_hdr_t	*bhs;
2485 	idm_pdu_t		*pdu;
2486 	idm_status_t		tx_status;
2487 
2488 	ASSERT(mutex_owned(&idt->idt_mutex));
2489 
2490 	ic = idt->idt_ic;
2491 
2492 	max_dataseglen = 8192; /* Need value from login negotiation */
2493 	remainder = buf_region_length;
2494 
2495 	while (remainder) {
2496 		if (idt->idt_state != TASK_ACTIVE) {
2497 			ASSERT((idt->idt_state != TASK_IDLE) &&
2498 			    (idt->idt_state != TASK_COMPLETE));
2499 			return (IDM_STATUS_ABORTED);
2500 		}
2501 
2502 		/* check to see if we need to chunk the data */
2503 		if (remainder > max_dataseglen) {
2504 			chunk = max_dataseglen;
2505 		} else {
2506 			chunk = remainder;
2507 		}
2508 
2509 		/* Data PDU headers will always be sizeof (iscsi_hdr_t) */
2510 		pdu = kmem_cache_alloc(idm.idm_sotx_pdu_cache, KM_SLEEP);
2511 		pdu->isp_ic = ic;
2512 
2513 		/*
2514 		 * We've already built a build a header template
2515 		 * to use during the transfer.  Use this template so that
2516 		 * the SN values stay consistent with any unrelated PDU's
2517 		 * being transmitted.
2518 		 */
2519 		bcopy(&idb->idb_data_hdr_tmpl, pdu->isp_hdr,
2520 		    sizeof (iscsi_hdr_t));
2521 
2522 		/*
2523 		 * Set DataSN, data offset, and flags in BHS
2524 		 * For the prototype build, A = 0, S = 0, U = 0
2525 		 */
2526 		bhs = (iscsi_data_hdr_t *)(pdu->isp_hdr);
2527 
2528 		bhs->datasn		= htonl(idt->idt_exp_datasn++);
2529 
2530 		hton24(bhs->dlength, chunk);
2531 		bhs->offset = htonl(idb->idb_bufoffset + data_offset);
2532 
2533 		if (chunk == remainder) {
2534 			bhs->flags = ISCSI_FLAG_FINAL; /* F bit set to 1 */
2535 		}
2536 
2537 		/* Instrument the data-send DTrace probe. */
2538 		if (IDM_PDU_OPCODE(pdu) == ISCSI_OP_SCSI_DATA_RSP) {
2539 			DTRACE_ISCSI_2(data__send,
2540 			    idm_conn_t *, idt->idt_ic,
2541 			    iscsi_data_rsp_hdr_t *,
2542 			    (iscsi_data_rsp_hdr_t *)pdu->isp_hdr);
2543 		}
2544 		/* setup data */
2545 		pdu->isp_data	=  (uint8_t *)idb->idb_buf + data_offset;
2546 		pdu->isp_datalen = (uint_t)chunk;
2547 		remainder	-= chunk;
2548 		data_offset	+= chunk;
2549 
2550 		/*
2551 		 * Now that we're done working with idt_exp_datasn,
2552 		 * idt->idt_state and idb->idb_bufoffset we can release
2553 		 * the task lock -- don't want to hold it across the
2554 		 * call to idm_i_so_tx since we could block.
2555 		 */
2556 		mutex_exit(&idt->idt_mutex);
2557 
2558 		/*
2559 		 * Transmit the PDU.  Call the internal routine directly
2560 		 * as there is already implicit ordering.
2561 		 */
2562 		if ((tx_status = idm_i_so_tx(pdu)) != IDM_STATUS_SUCCESS) {
2563 			mutex_enter(&idt->idt_mutex);
2564 			return (tx_status);
2565 		}
2566 
2567 		mutex_enter(&idt->idt_mutex);
2568 		idt->idt_tx_bytes += chunk;
2569 	}
2570 
2571 	return (IDM_STATUS_SUCCESS);
2572 }
2573 
2574 /*
2575  * TX PDU cache
2576  */
2577 /* ARGSUSED */
2578 int
2579 idm_sotx_pdu_constructor(void *hdl, void *arg, int flags)
2580 {
2581 	idm_pdu_t	*pdu = hdl;
2582 
2583 	bzero(pdu, sizeof (idm_pdu_t));
2584 	pdu->isp_hdr = (iscsi_hdr_t *)(pdu + 1); /* Ptr arithmetic */
2585 	pdu->isp_hdrlen = sizeof (iscsi_hdr_t);
2586 	pdu->isp_callback = idm_sotx_cache_pdu_cb;
2587 	pdu->isp_magic = IDM_PDU_MAGIC;
2588 	bzero(pdu->isp_hdr, sizeof (iscsi_hdr_t));
2589 
2590 	return (0);
2591 }
2592 
2593 /* ARGSUSED */
2594 void
2595 idm_sotx_cache_pdu_cb(idm_pdu_t *pdu, idm_status_t status)
2596 {
2597 	/* reset values between use */
2598 	pdu->isp_datalen = 0;
2599 
2600 	kmem_cache_free(idm.idm_sotx_pdu_cache, pdu);
2601 }
2602 
2603 /*
2604  * RX PDU cache
2605  */
2606 /* ARGSUSED */
2607 int
2608 idm_sorx_pdu_constructor(void *hdl, void *arg, int flags)
2609 {
2610 	idm_pdu_t	*pdu = hdl;
2611 
2612 	bzero(pdu, sizeof (idm_pdu_t));
2613 	pdu->isp_magic = IDM_PDU_MAGIC;
2614 	pdu->isp_hdr = (iscsi_hdr_t *)(pdu + 1); /* Ptr arithmetic */
2615 	pdu->isp_callback = idm_sorx_cache_pdu_cb;
2616 
2617 	return (0);
2618 }
2619 
2620 /* ARGSUSED */
2621 static void
2622 idm_sorx_cache_pdu_cb(idm_pdu_t *pdu, idm_status_t status)
2623 {
2624 	pdu->isp_iovlen = 0;
2625 	pdu->isp_sorx_buf = 0;
2626 	kmem_cache_free(idm.idm_sorx_pdu_cache, pdu);
2627 }
2628 
2629 static void
2630 idm_sorx_addl_pdu_cb(idm_pdu_t *pdu, idm_status_t status)
2631 {
2632 	/*
2633 	 * We had to modify our cached RX PDU with a longer header buffer
2634 	 * and/or a longer data buffer.  Release the new buffers and fix
2635 	 * the fields back to what we would expect for a cached RX PDU.
2636 	 */
2637 	if (pdu->isp_flags & IDM_PDU_ADDL_HDR) {
2638 		kmem_free(pdu->isp_hdr, pdu->isp_hdrlen);
2639 	}
2640 	if (pdu->isp_flags & IDM_PDU_ADDL_DATA) {
2641 		kmem_free(pdu->isp_data, pdu->isp_datalen);
2642 	}
2643 	pdu->isp_hdr = (iscsi_hdr_t *)(pdu + 1);
2644 	pdu->isp_hdrlen = sizeof (iscsi_hdr_t);
2645 	pdu->isp_data = NULL;
2646 	pdu->isp_datalen = 0;
2647 	pdu->isp_sorx_buf = 0;
2648 	pdu->isp_callback = idm_sorx_cache_pdu_cb;
2649 	idm_sorx_cache_pdu_cb(pdu, status);
2650 }
2651 
2652 /*
2653  * This thread is only active when I/O is queued for transmit
2654  * because the socket is busy.
2655  */
2656 void
2657 idm_sotx_thread(void *arg)
2658 {
2659 	idm_conn_t	*ic = arg;
2660 	idm_tx_obj_t	*object, *next;
2661 	idm_so_conn_t	*so_conn;
2662 	idm_status_t	status = IDM_STATUS_SUCCESS;
2663 
2664 	idm_conn_hold(ic);
2665 
2666 	mutex_enter(&ic->ic_mutex);
2667 	so_conn = ic->ic_transport_private;
2668 	so_conn->ic_tx_thread_running = B_TRUE;
2669 	so_conn->ic_tx_thread_did = so_conn->ic_tx_thread->t_did;
2670 	cv_signal(&ic->ic_cv);
2671 	mutex_exit(&ic->ic_mutex);
2672 
2673 	mutex_enter(&so_conn->ic_tx_mutex);
2674 
2675 	while (so_conn->ic_tx_thread_running) {
2676 		while (list_is_empty(&so_conn->ic_tx_list)) {
2677 			DTRACE_PROBE1(soconn__tx__sleep, idm_conn_t *, ic);
2678 			cv_wait(&so_conn->ic_tx_cv, &so_conn->ic_tx_mutex);
2679 			DTRACE_PROBE1(soconn__tx__wakeup, idm_conn_t *, ic);
2680 
2681 			if (!so_conn->ic_tx_thread_running) {
2682 				goto tx_bail;
2683 			}
2684 		}
2685 
2686 		object = (idm_tx_obj_t *)list_head(&so_conn->ic_tx_list);
2687 		list_remove(&so_conn->ic_tx_list, object);
2688 		mutex_exit(&so_conn->ic_tx_mutex);
2689 
2690 		switch (object->idm_tx_obj_magic) {
2691 		case IDM_PDU_MAGIC:
2692 			DTRACE_PROBE2(soconn__tx__pdu, idm_conn_t *, ic,
2693 			    idm_pdu_t *, (idm_pdu_t *)object);
2694 
2695 			status = idm_i_so_tx((idm_pdu_t *)object);
2696 			break;
2697 
2698 		case IDM_BUF_MAGIC: {
2699 			idm_buf_t *idb = (idm_buf_t *)object;
2700 			idm_task_t *idt = idb->idb_task_binding;
2701 
2702 			DTRACE_PROBE2(soconn__tx__buf, idm_conn_t *, ic,
2703 			    idm_buf_t *, idb);
2704 
2705 			mutex_enter(&idt->idt_mutex);
2706 			status = idm_so_send_buf_region(idt,
2707 			    idb, 0, idb->idb_xfer_len);
2708 
2709 			/*
2710 			 * TX thread owns the buffer so we expect it to
2711 			 * be "in transport"
2712 			 */
2713 			ASSERT(idb->idb_in_transport);
2714 			if (IDM_CONN_ISTGT(ic)) {
2715 				/*
2716 				 * idm_buf_tx_to_ini_done releases
2717 				 * idt->idt_mutex
2718 				 */
2719 				DTRACE_ISCSI_8(xfer__done,
2720 				    idm_conn_t *, idt->idt_ic,
2721 				    uintptr_t, idb->idb_buf,
2722 				    uint32_t, idb->idb_bufoffset,
2723 				    uint64_t, 0, uint32_t, 0, uint32_t, 0,
2724 				    uint32_t, idb->idb_xfer_len,
2725 				    int, XFER_BUF_TX_TO_INI);
2726 				idm_buf_tx_to_ini_done(idt, idb, status);
2727 			} else {
2728 				idm_so_send_rtt_data_done(idt, idb);
2729 				mutex_exit(&idt->idt_mutex);
2730 			}
2731 			break;
2732 		}
2733 
2734 		default:
2735 			IDM_CONN_LOG(CE_WARN, "idm_sotx_thread: Unknown magic "
2736 			    "(0x%08x)", object->idm_tx_obj_magic);
2737 			status = IDM_STATUS_FAIL;
2738 		}
2739 
2740 		mutex_enter(&so_conn->ic_tx_mutex);
2741 
2742 		if (status != IDM_STATUS_SUCCESS) {
2743 			so_conn->ic_tx_thread_running = B_FALSE;
2744 			idm_conn_event(ic, CE_TRANSPORT_FAIL, status);
2745 		}
2746 	}
2747 
2748 	/*
2749 	 * Before we leave, we need to abort every item remaining in the
2750 	 * TX list.
2751 	 */
2752 
2753 tx_bail:
2754 	object = (idm_tx_obj_t *)list_head(&so_conn->ic_tx_list);
2755 
2756 	while (object != NULL) {
2757 		next = list_next(&so_conn->ic_tx_list, object);
2758 
2759 		list_remove(&so_conn->ic_tx_list, object);
2760 		switch (object->idm_tx_obj_magic) {
2761 		case IDM_PDU_MAGIC:
2762 			idm_pdu_complete((idm_pdu_t *)object,
2763 			    IDM_STATUS_ABORTED);
2764 			break;
2765 
2766 		case IDM_BUF_MAGIC: {
2767 			idm_buf_t *idb = (idm_buf_t *)object;
2768 			idm_task_t *idt = idb->idb_task_binding;
2769 			mutex_exit(&so_conn->ic_tx_mutex);
2770 			mutex_enter(&idt->idt_mutex);
2771 			/*
2772 			 * TX thread owns the buffer so we expect it to
2773 			 * be "in transport"
2774 			 */
2775 			ASSERT(idb->idb_in_transport);
2776 			if (IDM_CONN_ISTGT(ic)) {
2777 				/*
2778 				 * idm_buf_tx_to_ini_done releases
2779 				 * idt->idt_mutex
2780 				 */
2781 				DTRACE_ISCSI_8(xfer__done,
2782 				    idm_conn_t *, idt->idt_ic,
2783 				    uintptr_t, idb->idb_buf,
2784 				    uint32_t, idb->idb_bufoffset,
2785 				    uint64_t, 0, uint32_t, 0, uint32_t, 0,
2786 				    uint32_t, idb->idb_xfer_len,
2787 				    int, XFER_BUF_TX_TO_INI);
2788 				idm_buf_tx_to_ini_done(idt, idb,
2789 				    IDM_STATUS_ABORTED);
2790 			} else {
2791 				idm_so_send_rtt_data_done(idt, idb);
2792 				mutex_exit(&idt->idt_mutex);
2793 			}
2794 			mutex_enter(&so_conn->ic_tx_mutex);
2795 			break;
2796 		}
2797 		default:
2798 			IDM_CONN_LOG(CE_WARN,
2799 			    "idm_sotx_thread: Unexpected magic "
2800 			    "(0x%08x)", object->idm_tx_obj_magic);
2801 		}
2802 
2803 		object = next;
2804 	}
2805 
2806 	mutex_exit(&so_conn->ic_tx_mutex);
2807 	idm_conn_rele(ic);
2808 	thread_exit();
2809 	/*NOTREACHED*/
2810 }
2811 
2812 static void
2813 idm_so_socket_set_nonblock(struct sonode *node)
2814 {
2815 	(void) VOP_SETFL(node->so_vnode, node->so_flag,
2816 	    (node->so_state | FNONBLOCK), CRED(), NULL);
2817 }
2818 
2819 static void
2820 idm_so_socket_set_block(struct sonode *node)
2821 {
2822 	(void) VOP_SETFL(node->so_vnode, node->so_flag,
2823 	    (node->so_state & (~FNONBLOCK)), CRED(), NULL);
2824 }
2825