xref: /illumos-gate/usr/src/uts/common/io/scsi/adapters/iscsi/iscsi_net.c (revision b4128092752f04132443f3dd6bc22b84cf15cf33)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  *
25  * iSCSI Software Initiator
26  */
27 
28 #include <sys/socket.h>		/* networking stuff */
29 #include <sys/strsubr.h>	/* networking stuff */
30 #include <netinet/tcp.h>	/* TCP_NODELAY */
31 #include <sys/socketvar.h>	/* _ALLOC_SLEEP */
32 #include <sys/pathname.h>	/* declares:	lookupname */
33 #include <sys/fs/snode.h>	/* defines:	VTOS */
34 #include <sys/fs/dv_node.h>	/* declares:	devfs_lookupname */
35 #include <sys/bootconf.h>
36 #include <sys/bootprops.h>
37 
38 #include "iscsi.h"
39 
40 /*
41  * This is a high level description of the default
42  * iscsi_net transport interfaces.  These are used
43  * to create, send, recv, and close standard TCP/IP
44  * messages.  In addition there are extensions to send
45  * and recv iSCSI PDU data.
46  *
47  * NOTE: It would be very easy for an iSCSI HBA vendor
48  * to register their own functions over the top of
49  * the default interfaces.  This would allow an iSCSI
50  * HBA to use the same iscsiadm management interfaces
51  * and the Solaris iSCSI session / connection management.
52  * The current problem with this approach is we only
53  * allow one registered transport table.  This
54  * would be pretty easy to correct although will require
55  * additional CLI changes to manage multiple interfaces.
56  * If a vendor can present compelling performance data,
57  * then Sun will be willing to enhance this support for
58  * multiple interface tables and better CLI management.
59  *
60  * The following listing describes the iscsi_net
61  * entry points:
62  *
63  *   socket            - Creates TCP/IP socket connection.  In the
64  *                       default implementation creates a sonode
65  *                       via the sockfs kernel layer.
66  *   bind              - Performs standard TCP/IP BSD operation.  In
67  *                       the default implementation this only acts
68  *                       as a soft binding based on the IP and routing
69  *	                 tables.  It would be preferred if this was
70  *	                 a hard binding but that is currently not
71  *	                 possible with Solaris's networking stack.
72  *   connect           - Performs standard TCP/IP BSD operation.  This
73  *                       establishes the TCP SYN to the peer IP address.
74  *   listen            - Performs standard TCP/IP BSD operation.  This
75  *                       listens for incoming peer connections.
76  *   accept            - Performs standard TCP/IP BSD operation.  This
77  *                       accepts incoming peer connections.
78  *   shutdown          - This disconnects the TCP/IP connection while
79  *                       maintaining the resources.
80  *   close             - This disconnects the TCP/IP connection and
81  *                       releases the resources.
82  *
83  *   getsockopt        - Gets socket option for specified socket.
84  *   setsockopt        - Sets socket option for specified socket.
85  *
86  *      The current socket options that are used by the initiator
87  *      are listed below.
88  *
89  *        TCP_CONN_NOTIFY_THRESHOLD
90  *        TCP_CONN_ABORT_THRESHOLD
91  *        TCP_ABORT_THRESHOLD
92  *        TCP_NODELAY
93  *        SO_RCVBUF
94  *        SO_SNDBUF
95  *
96  *   iscsi_net_poll    - Poll socket interface for a specified amount
97  *                       of data.  If data not received in timeout
98  *                       period fail request.
99  *   iscsi_net_sendmsg - Send message on socket connection
100  *   iscsi_net_recvmsg - Receive message on socket connection
101  *
102  *   iscsi_net_sendpdu - Send iSCSI PDU on socket connection
103  *   iscsi_net_recvhdr - Receive iSCSI header on socket connection
104  *   iscsi_net_recvdata - Receive iSCSI data on socket connection
105  *
106  *     The iSCSI interfaces have the below optional flags.
107  *
108  *       ISCSI_NET_HEADER_DIGEST - The interface should either
109  *				generate or validate the iSCSI
110  *				header digest CRC.
111  *       ISCSI_NET_DATA_DIGEST   - The interface should either
112  *                              generate or validate the iSCSI
113  *                              data digest CRC.
114  */
115 
116 
117 /* global */
118 iscsi_network_t *iscsi_net;
119 
120 /* consts */
121 
/*
 * This table is used for quick validation of incoming
 * iSCSI PDU opcodes.  It is indexed by the opcode byte of a
 * received header.  A value of '1' in the table below
 * indicates that the opcode is invalid for an iSCSI
 * initiator to receive; a value of '0' marks an opcode
 * that is accepted.
 *
 * The only accepted ('0') entries are 0x20-0x26, 0x31, 0x32,
 * 0x3F, 0x60, 0x62, 0x66, 0x71 and 0x7F.
 */
const int   is_incoming_opcode_invalid[256] = {
	/*		0  1  2  3  4  5  6  7  8  9  A  B  C  D  E  F */
	/* 0x0X */	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0x1X */	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0x2X */	0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0x3X */	1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,
	/* 0x4X */	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0x5X */	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0x6X */	0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0x7X */	1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,
	/* 0x8X */	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0x9X */	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0xAX */	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0xBX */	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0xCX */	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0xDX */	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0xEX */	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0xFX */	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
};
/*
 * Define macros to manipulate snode, vnode, and open device flags
 */
/* True when vnode type 'i' is a character (VCHR) or block (VBLK) device. */
#define	VTYP_VALID(i)	(((i) == VCHR) || ((i) == VBLK))
/* True when spec type 'i' is S_IFCHR or S_IFBLK. */
#define	STYP_VALID(i)	(((i) == S_IFCHR) || ((i) == S_IFBLK))
/* Map a spec type to a vnode type; anything other than S_IFCHR yields VBLK. */
#define	STYP_TO_VTYP(i)	(((i) == S_IFCHR) ? VCHR : VBLK)

/* Address widths in bits, used as the upper bound for a prefix length. */
#define	IP_4_BITS	32
#define	IP_6_BITS	128
156 
/*
 * Defined outside this file.  iscsi_net_socket() tests modrootloaded to
 * choose how the transport device vnode is looked up, and tests
 * iscsiboot_prop before configuring the boot interface.
 */
extern int modrootloaded;
extern ib_boot_prop_t	*iscsiboot_prop;

/* prototypes */

/* for iSCSI boot */
/* Set to 1 by iscsi_net_socket() once iscsi_net_interface() has succeeded. */
static int net_up = 0;
static iscsi_status_t iscsi_net_interface();
static int iscsi_ldi_vp_from_name(char *path, vnode_t **vpp);
/* boot prototypes end */

/* Default socket entry points installed by iscsi_net_init(). */
static void * iscsi_net_socket(int domain, int type, int protocol);
static int iscsi_net_bind(void *socket, struct sockaddr *
    name, int name_len, int backlog, int flags);
static int iscsi_net_connect(void *socket, struct sockaddr *
    name, int name_len, int fflag, int flags);
static int iscsi_net_listen(void *socket, int backlog);
static void * iscsi_net_accept(void *socket, struct sockaddr *addr,
    int *addr_len);
static int iscsi_net_getsockname(void *socket);
static int iscsi_net_getsockopt(void *socket, int level,
    int option_name, void *option_val, int *option_len, int flags);
static int iscsi_net_setsockopt(void *socket, int level,
    int option_name, void *option_val, int option_len);
static int iscsi_net_shutdown(void *socket, int how);
static void iscsi_net_close(void *socket);

/* Raw message entry points; sendmsg/recvmsg return a byte count. */
static size_t iscsi_net_poll(void *socket, clock_t timeout);
static size_t iscsi_net_sendmsg(void *socket, struct msghdr *msg);
static size_t iscsi_net_recvmsg(void *socket,
    struct msghdr *msg, int timeout);

/* iSCSI PDU entry points; these honor the ISCSI_NET_*_DIGEST flags. */
static iscsi_status_t iscsi_net_sendpdu(void *socket, iscsi_hdr_t *ihp,
    char *data, int flags);
static iscsi_status_t iscsi_net_recvdata(void *socket, iscsi_hdr_t *ihp,
    char *data, int max_data_length, int timeout, int flags);
static iscsi_status_t iscsi_net_recvhdr(void *socket, iscsi_hdr_t *ihp,
    int header_length, int timeout, int flags);

/* Socket option helpers called around soconnect() in iscsi_net_connect(). */
static void iscsi_net_set_preconnect_options(void *socket);
static void iscsi_net_set_postconnect_options(void *socket);
198 
199 /*
200  * +--------------------------------------------------------------------+
201  * | network interface registration functions                           |
202  * +--------------------------------------------------------------------+
203  */
204 
205 /*
206  * iscsi_net_init - initialize network interface
207  */
208 void
209 iscsi_net_init()
210 {
211 	iscsi_net = kmem_zalloc(sizeof (*iscsi_net), KM_SLEEP);
212 
213 	iscsi_net->socket	= iscsi_net_socket;
214 
215 	iscsi_net->bind		= iscsi_net_bind;
216 	iscsi_net->connect	= iscsi_net_connect;
217 	iscsi_net->listen	= iscsi_net_listen;
218 	iscsi_net->accept	= iscsi_net_accept;
219 	iscsi_net->shutdown	= iscsi_net_shutdown;
220 	iscsi_net->close	= iscsi_net_close;
221 
222 	iscsi_net->getsockname	= iscsi_net_getsockname;
223 	iscsi_net->getsockopt	= iscsi_net_getsockopt;
224 	iscsi_net->setsockopt	= iscsi_net_setsockopt;
225 
226 	iscsi_net->poll		= iscsi_net_poll;
227 	iscsi_net->sendmsg	= iscsi_net_sendmsg;
228 	iscsi_net->recvmsg	= iscsi_net_recvmsg;
229 
230 	iscsi_net->sendpdu	= iscsi_net_sendpdu;
231 	iscsi_net->recvhdr	= iscsi_net_recvhdr;
232 	iscsi_net->recvdata	= iscsi_net_recvdata;
233 }
234 
235 /*
236  * iscsi_net_fini - release network interface
237  */
238 void
239 iscsi_net_fini()
240 {
241 	kmem_free(iscsi_net, sizeof (*iscsi_net));
242 	iscsi_net = NULL;
243 }
244 
245 
246 /*
247  * iscsi_net_set_preconnect_options -
248  */
249 static void
250 iscsi_net_set_preconnect_options(void *socket)
251 {
252 	int ret = 0;
253 	ret += iscsi_net->setsockopt(socket, IPPROTO_TCP,
254 	    TCP_CONN_NOTIFY_THRESHOLD, (char *)&iscsi_net->tweaks.
255 	    conn_notify_threshold, sizeof (int));
256 	ret += iscsi_net->setsockopt(socket, IPPROTO_TCP,
257 	    TCP_CONN_ABORT_THRESHOLD, (char *)&iscsi_net->tweaks.
258 	    conn_abort_threshold, sizeof (int));
259 	ret += iscsi_net->setsockopt(socket, IPPROTO_TCP, TCP_ABORT_THRESHOLD,
260 	    (char *)&iscsi_net->tweaks.abort_threshold, sizeof (int));
261 	if (ret != 0) {
262 		cmn_err(CE_NOTE, "iscsi connection failed to set socket option"
263 		    "TCP_CONN_NOTIFY_THRESHOLD, TCP_CONN_ABORT_THRESHOLD or "
264 		    "TCP_ABORT_THRESHOLD");
265 	}
266 }
267 
268 /*
269  * iscsi_net_set_postconnect_options -
270  */
271 static void
272 iscsi_net_set_postconnect_options(void *socket)
273 {
274 	int ret = 0;
275 	ret += iscsi_net->setsockopt(socket, IPPROTO_TCP, TCP_NODELAY,
276 	    (char *)&iscsi_net->tweaks.nodelay, sizeof (int));
277 	ret += iscsi_net->setsockopt(socket, SOL_SOCKET, SO_RCVBUF,
278 	    (char *)&iscsi_net->tweaks.rcvbuf, sizeof (int));
279 	ret += iscsi_net->setsockopt(socket, SOL_SOCKET, SO_SNDBUF,
280 	    (char *)&iscsi_net->tweaks.sndbuf, sizeof (int));
281 	if (ret != 0) {
282 		cmn_err(CE_NOTE, "iscsi connection failed to set socket option"
283 		    "TCP_NODELAY, SO_RCVBUF or SO_SNDBUF");
284 	}
285 }
286 
287 
288 /*
289  * +--------------------------------------------------------------------+
290  * | register network interfaces                                        |
291  * +--------------------------------------------------------------------+
292  */
293 
294 /*
295  * iscsi_net_socket - create socket
296  */
297 static void *
298 iscsi_net_socket(int domain, int type, int protocol)
299 {
300 	vnode_t		*dvp		= NULL,
301 	    *vp		= NULL;
302 	struct snode	*csp		= NULL;
303 	int		err		= 0;
304 	major_t		maj;
305 
306 	if (!modrootloaded && !net_up && iscsiboot_prop) {
307 		if (iscsi_net_interface() == ISCSI_STATUS_SUCCESS)
308 			net_up = 1;
309 	}
310 
311 	/* ---- solookup: start ---- */
312 	if ((vp = solookup(domain, type, protocol, NULL, &err)) == NULL) {
313 
314 		/*
315 		 * solookup calls sogetvp if the vp is not found in
316 		 * the cache.  Since the call to sogetvp is hardwired
317 		 * to use USERSPACE and declared static we'll do the
318 		 * work here instead.
319 		 */
320 		if (!modrootloaded) {
321 			err = iscsi_ldi_vp_from_name("/devices/pseudo/tcp@0:"
322 			    "tcp", &vp);
323 		} else {
324 			err = lookupname(type == SOCK_STREAM ? "/dev/tcp" :
325 			    "/dev/udp", UIO_SYSSPACE, FOLLOW, NULLVPP, &vp);
326 		}
327 		if (err) {
328 			return (NULL);
329 		}
330 
331 		/* ---- check that it is the correct vnode ---- */
332 		if (vp->v_type != VCHR) {
333 			VN_RELE(vp);
334 			return (NULL);
335 		}
336 
337 		csp = VTOS(VTOS(vp)->s_commonvp);
338 		if (!(csp->s_flag & SDIPSET)) {
339 			char    *pathname = kmem_alloc(MAXPATHLEN, KM_SLEEP);
340 			err = ddi_dev_pathname(vp->v_rdev, S_IFCHR,
341 			    pathname);
342 			if (err == 0) {
343 				err = devfs_lookupname(pathname, NULLVPP,
344 				    &dvp);
345 			}
346 			VN_RELE(vp);
347 			kmem_free(pathname, MAXPATHLEN);
348 			if (err != 0) {
349 				return (NULL);
350 			}
351 			vp = dvp;
352 		}
353 
354 		maj = getmajor(vp->v_rdev);
355 		if (!STREAMSTAB(maj)) {
356 			VN_RELE(vp);
357 			return (NULL);
358 		}
359 	}
360 	/* ---- solookup: end ---- */
361 	return (socreate(vp, domain, type, protocol, SOV_DEFAULT, NULL, &err));
362 }
363 
/*
 * iscsi_net_bind - bind socket to a specific sockaddr
 *
 * Thin wrapper that hands all arguments to sobind() and returns its
 * result.
 */
static int
iscsi_net_bind(void *socket, struct sockaddr *name, int name_len,
	int backlog, int flags)
{
	struct sonode	*so = (struct sonode *)socket;

	return (sobind(so, name, name_len, backlog, flags));
}
374 
/*
 * iscsi_net_connect - connect socket to peer sockaddr
 *
 * The pre-connect socket options are applied before soconnect() and
 * the post-connect options after it, regardless of the connect
 * result.  The soconnect() return value is passed back to the caller.
 */
static int
iscsi_net_connect(void *socket, struct sockaddr *name, int name_len,
	int fflag, int flags)
{
	struct sonode	*so = (struct sonode *)socket;
	int		error;

	iscsi_net_set_preconnect_options(socket);
	error = soconnect(so, name, name_len, fflag, flags);
	iscsi_net_set_postconnect_options(socket);

	return (error);
}
391 
/*
 * iscsi_net_listen - listen to socket for peer connections
 *
 * Thin wrapper around solisten(); its result is returned unchanged.
 */
static int
iscsi_net_listen(void *socket, int backlog)
{
	struct sonode	*so = (struct sonode *)socket;

	return (solisten(so, backlog));
}
400 
401 /*
402  * iscsi_net_accept - accept peer socket connections
403  */
404 static void *
405 iscsi_net_accept(void *socket, struct sockaddr *addr, int *addr_len)
406 {
407 	struct sonode *listening_socket;
408 
409 	(void) soaccept((struct sonode *)socket,
410 	    ((struct sonode *)socket)->so_flag,
411 	    &listening_socket);
412 	if (listening_socket != NULL) {
413 		bcopy(listening_socket->so_faddr_sa, addr,
414 		    (socklen_t)listening_socket->so_faddr_len);
415 		*addr_len = listening_socket->so_faddr_len;
416 	} else {
417 		*addr_len = 0;
418 	}
419 
420 	return ((void *)listening_socket);
421 }
422 
/*
 * iscsi_net_getsockname - thin wrapper around sogetsockname() for
 * the given socket; its result is returned unchanged.
 */
static int
iscsi_net_getsockname(void *socket)
{
	struct sonode	*so = (struct sonode *)socket;

	return (sogetsockname(so));
}
431 
432 /*
433  * iscsi_net_getsockopt - get value of option on socket
434  */
435 static int
436 iscsi_net_getsockopt(void *socket, int level, int option_name,
437 	void *option_val, int *option_len, int flags)
438 {
439 	return (sogetsockopt((struct sonode *)socket, level,
440 	    option_name, option_val, (socklen_t *)option_len,
441 	    flags));
442 }
443 
/*
 * iscsi_net_setsockopt - set value for option on socket
 *
 * Thin wrapper around sosetsockopt(); its result is returned
 * unchanged.
 */
static int
iscsi_net_setsockopt(void *socket, int level, int option_name,
	void *option_val, int option_len)
{
	struct sonode	*so = (struct sonode *)socket;

	return (sosetsockopt(so, level, option_name, option_val,
	    option_len));
}
454 
/*
 * iscsi_net_shutdown - shutdown socket connection
 *
 * Thin wrapper around soshutdown(); 'how' is passed through as is.
 */
static int
iscsi_net_shutdown(void *socket, int how)
{
	struct sonode	*so = (struct sonode *)socket;

	return (soshutdown(so, how));
}
463 
464 /*
465  * iscsi_net_close - shutdown socket connection and release resources
466  */
467 static void
468 iscsi_net_close(void *socket)
469 {
470 	vnode_t *vp = SOTOV((struct sonode *)socket);
471 	(void) soshutdown((struct sonode *)socket, 2);
472 	(void) VOP_CLOSE(vp, 0, 1, 0, kcred, NULL);
473 	VN_RELE(vp);
474 }
475 
476 /*
477  * iscsi_net_poll - poll socket for data
478  */
479 static size_t
480 iscsi_net_poll(void *socket, clock_t timeout)
481 {
482 	int pflag;
483 	uchar_t pri;
484 	rval_t rval;
485 
486 	pri = 0;
487 	pflag = MSG_ANY;
488 	return (kstrgetmsg(SOTOV((struct sonode *)socket), NULL, NULL,
489 	    &pri, &pflag, timeout, &rval));
490 }
491 
492 /*
493  * iscsi_net_sendmsg - send message on socket
494  */
495 /* ARGSUSED */
496 static size_t
497 iscsi_net_sendmsg(void *socket, struct msghdr *msg)
498 {
499 	int i = 0;
500 	int total_len = 0;
501 	struct uio uio;
502 
503 	/* Initialization of the uio structure. */
504 	bzero(&uio, sizeof (uio));
505 	uio.uio_iov = msg->msg_iov;
506 	uio.uio_iovcnt = msg->msg_iovlen;
507 	uio.uio_segflg  = UIO_SYSSPACE;
508 
509 	for (i = 0; i < msg->msg_iovlen; i++) {
510 		total_len += (msg->msg_iov)[i].iov_len;
511 	}
512 	uio.uio_resid = total_len;
513 
514 	(void) sosendmsg((struct sonode *)socket, msg, &uio);
515 	DTRACE_PROBE2(sosendmsg, size_t, total_len, size_t, uio.uio_resid);
516 	return (total_len - uio.uio_resid);
517 }
518 
519 /*
520  * iscsi_net_recvmsg - receive message on socket
521  */
522 /* ARGSUSED */
523 static size_t
524 iscsi_net_recvmsg(void *socket, struct msghdr *msg, int timeout)
525 {
526 	int		idx;
527 	int		total_len   = 0;
528 	struct uio	uio;
529 	uchar_t		pri	    = 0;
530 	int		prflag	    = MSG_ANY;
531 	rval_t		rval;
532 	struct sonode	*sonode	    = (struct sonode *)socket;
533 
534 	/* Initialization of the uio structure. */
535 	bzero(&uio, sizeof (uio));
536 	uio.uio_iov	    = msg->msg_iov;
537 	uio.uio_iovcnt	    = msg->msg_iovlen;
538 	uio.uio_segflg	    = UIO_SYSSPACE;
539 
540 	for (idx = 0; idx < msg->msg_iovlen; idx++) {
541 		total_len += (msg->msg_iov)[idx].iov_len;
542 	}
543 	uio.uio_resid = total_len;
544 
545 	/* If timeout requested on receive */
546 	if (timeout > 0) {
547 		boolean_t   loopback = B_FALSE;
548 
549 		/* And this isn't a loopback connection */
550 		if (sonode->so_laddr.soa_sa->sa_family == AF_INET) {
551 			struct sockaddr_in *lin =
552 			    (struct sockaddr_in *)sonode->so_laddr.soa_sa;
553 			struct sockaddr_in *fin =
554 			    (struct sockaddr_in *)sonode->so_faddr.soa_sa;
555 
556 			if ((lin->sin_family == fin->sin_family) &&
557 			    (bcmp(&lin->sin_addr, &fin->sin_addr,
558 			    sizeof (struct in_addr)) == 0)) {
559 				loopback = B_TRUE;
560 			}
561 		} else {
562 			struct sockaddr_in6 *lin6 =
563 			    (struct sockaddr_in6 *)sonode->so_laddr.soa_sa;
564 			struct sockaddr_in6 *fin6 =
565 			    (struct sockaddr_in6 *)sonode->so_faddr.soa_sa;
566 
567 			if ((lin6->sin6_family == fin6->sin6_family) &&
568 			    (bcmp(&lin6->sin6_addr, &fin6->sin6_addr,
569 			    sizeof (struct in6_addr)) == 0)) {
570 				loopback = B_TRUE;
571 			}
572 		}
573 
574 		if (loopback == B_FALSE) {
575 			/*
576 			 * Then poll device for up to the timeout
577 			 * period or the requested data is received.
578 			 */
579 			if (kstrgetmsg(SOTOV(sonode),
580 			    NULL, NULL, &pri, &prflag, timeout * 1000,
581 			    &rval) == ETIME) {
582 				return (0);
583 			}
584 		}
585 	}
586 
587 	/*
588 	 * Receive the requested data.  Block until all
589 	 * data is received.
590 	 *
591 	 * resid occurs only when the connection is
592 	 * disconnected.  In that case it will return
593 	 * the amount of data that was not received.
594 	 * In general this is the total amount we
595 	 * requested.
596 	 */
597 	(void) sorecvmsg((struct sonode *)socket, msg, &uio);
598 	DTRACE_PROBE2(sorecvmsg, size_t, total_len, size_t, uio.uio_resid);
599 	return (total_len - uio.uio_resid);
600 }
601 
602 /*
603  * iscsi_net_sendpdu - send iscsi pdu on socket
604  */
605 static iscsi_status_t
606 iscsi_net_sendpdu(void *socket, iscsi_hdr_t *ihp, char *data, int flags)
607 {
608 	uint32_t	pad;
609 	uint32_t	crc_hdr;
610 	uint32_t	crc_data;
611 	uint32_t	pad_len;
612 	uint32_t	data_len;
613 	iovec_t		iovec[ISCSI_MAX_IOVEC];
614 	int		iovlen = 0;
615 	size_t		total_len = 0;
616 	size_t		send_len;
617 	struct msghdr	msg;
618 
619 	ASSERT(socket != NULL);
620 	ASSERT(ihp != NULL);
621 
622 	/*
623 	 * Let's send the header first.  'hlength' is in 32-bit
624 	 * quantities, so we need to multiply by four to get bytes
625 	 */
626 	ASSERT(iovlen < ISCSI_MAX_IOVEC);
627 	iovec[iovlen].iov_base = (void *)ihp;
628 	iovec[iovlen].iov_len  = sizeof (*ihp) + ihp->hlength * 4;
629 	total_len += sizeof (*ihp) + ihp->hlength * 4;
630 	iovlen++;
631 
632 	/* Let's transmit the header digest if we have to. */
633 	if ((flags & ISCSI_NET_HEADER_DIGEST) != 0) {
634 		ASSERT(iovlen < ISCSI_MAX_IOVEC);
635 		/*
636 		 * Converting the calculated CRC via htonl is not
637 		 * necessary because iscsi_crc32c calculates
638 		 * the value as it expects to be written
639 		 */
640 		crc_hdr = iscsi_crc32c((char *)ihp,
641 		    sizeof (iscsi_hdr_t) + ihp->hlength * 4);
642 
643 		iovec[iovlen].iov_base = (void *)&crc_hdr;
644 		iovec[iovlen].iov_len  = sizeof (crc_hdr);
645 		total_len += sizeof (crc_hdr);
646 		iovlen++;
647 	}
648 
649 	/* Let's transmit the data if any. */
650 	data_len = ntoh24(ihp->dlength);
651 
652 	if (data_len) {
653 
654 		ASSERT(iovlen < ISCSI_MAX_IOVEC);
655 		iovec[iovlen].iov_base = (void *)data;
656 		iovec[iovlen].iov_len  = data_len;
657 		total_len += data_len;
658 		iovlen++;
659 
660 		pad_len = ((ISCSI_PAD_WORD_LEN -
661 		    (data_len & (ISCSI_PAD_WORD_LEN - 1))) &
662 		    (ISCSI_PAD_WORD_LEN - 1));
663 
664 		/* Let's transmit the data pad if any. */
665 		if (pad_len) {
666 
667 			ASSERT(iovlen < ISCSI_MAX_IOVEC);
668 			pad = 0;
669 			iovec[iovlen].iov_base = (void *)&pad;
670 			iovec[iovlen].iov_len  = pad_len;
671 			total_len += pad_len;
672 			iovlen++;
673 		}
674 
675 		/* Let's transmit the data digest if we have to. */
676 		if ((flags & ISCSI_NET_DATA_DIGEST) != 0) {
677 
678 			ASSERT(iovlen < ISCSI_MAX_IOVEC);
679 			/*
680 			 * Converting the calculated CRC via htonl is not
681 			 * necessary because iscsi_crc32c calculates the
682 			 * value as it expects to be written
683 			 */
684 			crc_data = iscsi_crc32c(data, data_len);
685 			crc_data = iscsi_crc32c_continued(
686 			    (char *)&pad, pad_len, crc_data);
687 
688 			iovec[iovlen].iov_base = (void *)&crc_data;
689 			iovec[iovlen].iov_len  = sizeof (crc_data);
690 			total_len += sizeof (crc_data);
691 			iovlen++;
692 		}
693 	}
694 
695 	DTRACE_PROBE4(tx, void *, socket, iovec_t *, &iovec[0],
696 	    int, iovlen, int, total_len);
697 
698 	/* Initialization of the message header. */
699 	bzero(&msg, sizeof (msg));
700 	msg.msg_iov	= &iovec[0];
701 	msg.msg_flags	= MSG_WAITALL;
702 	msg.msg_iovlen	= iovlen;
703 
704 	send_len = iscsi_net->sendmsg((struct sonode *)socket, &msg);
705 	DTRACE_PROBE2(sendmsg, size_t, total_len, size_t, send_len);
706 	if (total_len != send_len) {
707 		return (ISCSI_STATUS_TCP_TX_ERROR);
708 	}
709 	return (ISCSI_STATUS_SUCCESS);
710 }
711 
712 /*
713  * iscsi_net_recvhdr - receive iscsi hdr on socket
714  */
715 static iscsi_status_t
716 iscsi_net_recvhdr(void *socket, iscsi_hdr_t *ihp, int header_length,
717     int timeout, int flags)
718 {
719 	iovec_t		    iov[ISCSI_MAX_IOVEC];
720 	int		    iovlen		= 1;
721 	int		    total_len		= 0;
722 	uint32_t	    crc_actual		= 0;
723 	uint32_t	    crc_calculated	= 0;
724 	char		    *adhdr		= NULL;
725 	int		    adhdr_length	= 0;
726 	struct msghdr	    msg;
727 	size_t		    recv_len;
728 
729 	ASSERT(socket != NULL);
730 	ASSERT(ihp != NULL);
731 
732 	if (header_length < sizeof (iscsi_hdr_t)) {
733 		ASSERT(FALSE);
734 		return (ISCSI_STATUS_INTERNAL_ERROR);
735 	}
736 
737 	/*
738 	 * Receive primary header
739 	 */
740 	iov[0].iov_base = (char *)ihp;
741 	iov[0].iov_len = sizeof (iscsi_hdr_t);
742 
743 	bzero(&msg, sizeof (msg));
744 	msg.msg_iov	= iov;
745 	msg.msg_flags	= MSG_WAITALL;
746 	msg.msg_iovlen	= iovlen;
747 
748 	recv_len = iscsi_net->recvmsg(socket, &msg, timeout);
749 	if (recv_len != sizeof (iscsi_hdr_t)) {
750 		return (ISCSI_STATUS_TCP_RX_ERROR);
751 	}
752 
753 	DTRACE_PROBE2(rx_hdr, void *, socket, iovec_t *iop, &iov[0]);
754 
755 	/* verify incoming opcode is a valid operation */
756 	if (is_incoming_opcode_invalid[ihp->opcode]) {
757 		cmn_err(CE_WARN, "iscsi connection(%p) protocol error - "
758 		    "received an unsupported opcode:0x%02x",
759 		    socket, ihp->opcode);
760 		return (ISCSI_STATUS_PROTOCOL_ERROR);
761 	}
762 
763 	/*
764 	 * Setup receipt of additional header
765 	 */
766 	if (ihp->hlength > 0) {
767 		adhdr = ((char *)ihp) + sizeof (iscsi_hdr_t);
768 		adhdr_length = header_length - sizeof (iscsi_hdr_t);
769 		/* make sure enough space is available for adhdr */
770 		if (ihp->hlength > adhdr_length) {
771 			ASSERT(FALSE);
772 			return (ISCSI_STATUS_INTERNAL_ERROR);
773 		}
774 
775 		ASSERT(iovlen < ISCSI_MAX_IOVEC);
776 		iov[iovlen].iov_base = adhdr;
777 		iov[iovlen].iov_len = adhdr_length;
778 		total_len += adhdr_length;
779 		iovlen++;
780 	}
781 
782 	/*
783 	 * Setup receipt of header digest if enabled and connection
784 	 * is in full feature mode.
785 	 */
786 	if ((flags & ISCSI_NET_HEADER_DIGEST) != 0) {
787 		ASSERT(iovlen < ISCSI_MAX_IOVEC);
788 		iov[iovlen].iov_base = (char *)&crc_actual;
789 		iov[iovlen].iov_len = sizeof (uint32_t);
790 		total_len += sizeof (uint32_t);
791 		iovlen++;
792 	}
793 
794 	/*
795 	 * Read additional header and/or header digest if pieces
796 	 * are available
797 	 */
798 	if (iovlen > 1) {
799 
800 		bzero(&msg, sizeof (msg));
801 		msg.msg_iov	= iov;
802 		msg.msg_flags	= MSG_WAITALL;
803 		msg.msg_iovlen	= iovlen;
804 
805 		recv_len = iscsi_net->recvmsg(socket, &msg, timeout);
806 		if (recv_len != total_len) {
807 			return (ISCSI_STATUS_TCP_RX_ERROR);
808 		}
809 
810 		DTRACE_PROBE4(rx_adhdr_digest, void *, socket,
811 		    iovec_t *iop, &iov[0], int, iovlen, int, total_len);
812 
813 		/*
814 		 * Verify header digest if enabled and connection
815 		 * is in full feature mode
816 		 */
817 		if ((flags & ISCSI_NET_HEADER_DIGEST) != 0) {
818 			crc_calculated = iscsi_crc32c((uchar_t *)ihp,
819 			    sizeof (iscsi_hdr_t) + ihp->hlength * 4);
820 
821 			/*
822 			 * Converting actual CRC read via ntohl is not
823 			 * necessary because iscsi_crc32c calculates the
824 			 * value as it expect to be read
825 			 */
826 			if (crc_calculated != crc_actual) {
827 				/* Invalid Header Digest */
828 				cmn_err(CE_WARN, "iscsi connection(%p) "
829 				    "protocol error - encountered a header "
830 				    "digest error expected:0x%08x "
831 				    "received:0x%08x", socket,
832 				    crc_calculated, crc_actual);
833 				return (ISCSI_STATUS_HEADER_DIGEST_ERROR);
834 			}
835 		}
836 	}
837 	return (ISCSI_STATUS_SUCCESS);
838 }
839 
840 
841 /*
842  * iscsi_net_recvdata - receive iscsi data payload from socket
843  */
844 static iscsi_status_t
845 iscsi_net_recvdata(void *socket, iscsi_hdr_t *ihp, char *data,
846     int max_data_length, int timeout, int flags)
847 {
848 	struct iovec	iov[3];
849 	int		iovlen			= 1;
850 	int		total_len		= 0;
851 	int		dlength			= 0;
852 	int		pad_len			= 0;
853 	uint8_t		pad[ISCSI_PAD_WORD_LEN];
854 	uint32_t	crc_calculated		= 0;
855 	uint32_t	crc_actual		= 0;
856 	struct msghdr	msg;
857 	size_t		recv_len;
858 
859 	ASSERT(socket != NULL);
860 	ASSERT(ihp != NULL);
861 	ASSERT(data != NULL);
862 
863 	/* short hand dlength */
864 	dlength = ntoh24(ihp->dlength);
865 
866 	/* verify dlength is valid */
867 	if (dlength > max_data_length) {
868 		cmn_err(CE_WARN, "iscsi connection(%p) protocol error - "
869 		    "invalid data lengths itt:0x%x received:0x%x "
870 		    "max expected:0x%x", socket, ihp->itt,
871 		    dlength, max_data_length);
872 		return (ISCSI_STATUS_PROTOCOL_ERROR);
873 	}
874 
875 	if (dlength) {
876 
877 		/* calculate pad */
878 		pad_len = ((ISCSI_PAD_WORD_LEN -
879 		    (dlength & (ISCSI_PAD_WORD_LEN - 1))) &
880 		    (ISCSI_PAD_WORD_LEN - 1));
881 
882 		/* setup data iovec */
883 		iov[0].iov_base	= (char *)data;
884 		iov[0].iov_len	= dlength;
885 		total_len	= dlength;
886 
887 		/* if pad setup pad iovec */
888 		if (pad_len) {
889 			iov[iovlen].iov_base	= (char *)&pad;
890 			iov[iovlen].iov_len	= pad_len;
891 			total_len		+= pad_len;
892 			iovlen++;
893 		}
894 
895 		/* setup data digest */
896 		if ((flags & ISCSI_NET_DATA_DIGEST) != 0) {
897 			iov[iovlen].iov_base	= (char *)&crc_actual;
898 			iov[iovlen].iov_len	= sizeof (crc_actual);
899 			total_len		+= sizeof (crc_actual);
900 			iovlen++;
901 		}
902 
903 		bzero(&msg, sizeof (msg));
904 		msg.msg_iov	= iov;
905 		msg.msg_flags	= MSG_WAITALL;
906 		msg.msg_iovlen	= iovlen;
907 
908 		recv_len = iscsi_net->recvmsg(socket, &msg, timeout);
909 		if (recv_len != total_len) {
910 			return (ISCSI_STATUS_TCP_RX_ERROR);
911 		}
912 
913 		DTRACE_PROBE4(rx_data, void *, socket, iovec_t *iop,
914 		    &iov[0], int, iovlen, int, total_len);
915 
916 		/* verify data digest is present */
917 		if ((flags & ISCSI_NET_DATA_DIGEST) != 0) {
918 
919 			crc_calculated = iscsi_crc32c(data, dlength);
920 			crc_calculated = iscsi_crc32c_continued(
921 			    (char *)&pad, pad_len, crc_calculated);
922 
923 			/*
924 			 * Converting actual CRC read via ntohl is not
925 			 * necessary because iscsi_crc32c calculates the
926 			 * value as it expects to be read
927 			 */
928 			if (crc_calculated != crc_actual) {
929 				cmn_err(CE_WARN, "iscsi connection(%p) "
930 				    "protocol error - encountered a data "
931 				    "digest error itt:0x%x expected:0x%08x "
932 				    "received:0x%08x", socket,
933 				    ihp->itt, crc_calculated, crc_actual);
934 				return (ISCSI_STATUS_DATA_DIGEST_ERROR);
935 			}
936 		}
937 	}
938 	return (ISCSI_STATUS_SUCCESS);
939 }
940 
941 /*
942  * Convert a prefix length to a mask.
943  */
944 static iscsi_status_t
945 iscsi_prefixlentomask(int prefixlen, int maxlen, uchar_t *mask)
946 {
947 	if (prefixlen < 0 || prefixlen > maxlen || mask == NULL) {
948 		return (ISCSI_STATUS_INTERNAL_ERROR);
949 	}
950 
951 	while (prefixlen > 0) {
952 		if (prefixlen >= 8) {
953 			*mask = 0xff;
954 			mask++;
955 			prefixlen = prefixlen - 8;
956 			continue;
957 		}
958 		*mask = *mask | (1 << (8 - prefixlen));
959 		prefixlen--;
960 	}
961 	return (ISCSI_STATUS_SUCCESS);
962 }
963 
964 static iscsi_status_t
965 iscsi_net_interface()
966 {
967 	struct in_addr	braddr;
968 	struct in_addr	subnet;
969 	struct in_addr	myaddr;
970 	struct in_addr	defgateway;
971 	struct in6_addr myaddr6;
972 	struct in6_addr subnet6;
973 	uchar_t		mask_prefix = 0;
974 	int		mask_bits   = 1;
975 	TIUSER		*tiptr;
976 	TIUSER		*tiptr6;
977 	char		ifname[16]	= {0};
978 	iscsi_status_t	status;
979 
980 	struct knetconfig dl_udp_netconf = {
981 	    NC_TPI_CLTS,
982 	    NC_INET,
983 	    NC_UDP,
984 	    0, };
985 	struct knetconfig dl_udp6_netconf = {
986 	    NC_TPI_CLTS,
987 	    NC_INET6,
988 	    NC_UDP,
989 	    0, };
990 
991 	(void) strlcpy(ifname, rootfs.bo_ifname, sizeof (ifname));
992 
993 	if (iscsiboot_prop->boot_nic.sin_family == AF_INET) {
994 		/*
995 		 * Assumes only one linkage array element.
996 		 */
997 		dl_udp_netconf.knc_rdev =
998 		    makedevice(clone_major, ddi_name_to_major("udp"));
999 
1000 		myaddr.s_addr =
1001 		    iscsiboot_prop->boot_nic.nic_ip_u.u_in4.s_addr;
1002 
1003 		mask_prefix = iscsiboot_prop->boot_nic.sub_mask_prefix;
1004 		(void) memset(&subnet.s_addr, 0, sizeof (subnet));
1005 		status = iscsi_prefixlentomask(mask_prefix, IP_4_BITS,
1006 		    (uchar_t *)&subnet.s_addr);
1007 		if (status != ISCSI_STATUS_SUCCESS) {
1008 			return (status);
1009 		}
1010 
1011 		mask_bits = mask_bits << (IP_4_BITS - mask_prefix);
1012 		mask_bits = mask_bits - 1;
1013 		/*
1014 		 * Set the last mask bits of the ip address with 1, then
1015 		 * we can get the broadcast address.
1016 		 */
1017 		braddr.s_addr = myaddr.s_addr | mask_bits;
1018 
1019 		defgateway.s_addr =
1020 		    iscsiboot_prop->boot_nic.nic_gw_u.u_in4.s_addr;
1021 
1022 		/* initialize interface */
1023 		if (t_kopen((file_t *)NULL, dl_udp_netconf.knc_rdev,
1024 		    FREAD|FWRITE, &tiptr, CRED()) == 0) {
1025 			if (kdlifconfig(tiptr, AF_INET, &myaddr, &subnet,
1026 			    &braddr, &defgateway, ifname)) {
1027 				cmn_err(CE_WARN, "Failed to configure"
1028 				    " iSCSI boot nic");
1029 				(void) t_kclose(tiptr, 0);
1030 				return (ISCSI_STATUS_INTERNAL_ERROR);
1031 			}
1032 		} else {
1033 			cmn_err(CE_WARN, "Failed to configure"
1034 			    " iSCSI boot nic");
1035 			return (ISCSI_STATUS_INTERNAL_ERROR);
1036 		}
1037 		return (ISCSI_STATUS_SUCCESS);
1038 	} else {
1039 		dl_udp6_netconf.knc_rdev =
1040 		    makedevice(clone_major, ddi_name_to_major("udp6"));
1041 
1042 		bcopy(&iscsiboot_prop->boot_nic.nic_ip_u.u_in6.s6_addr,
1043 		    &myaddr6.s6_addr, 16);
1044 
1045 		(void) memset(&subnet6, 0, sizeof (subnet6));
1046 		mask_prefix = iscsiboot_prop->boot_nic.sub_mask_prefix;
1047 		status = iscsi_prefixlentomask(mask_prefix, IP_6_BITS,
1048 		    (uchar_t *)&subnet6.s6_addr);
1049 		if (status != ISCSI_STATUS_SUCCESS) {
1050 			return (status);
1051 		}
1052 
1053 		if (t_kopen((file_t *)NULL, dl_udp6_netconf.knc_rdev,
1054 		    FREAD|FWRITE, &tiptr6, CRED()) == 0) {
1055 			if (kdlifconfig(tiptr6, AF_INET6, &myaddr6,
1056 			    &subnet6, NULL, NULL, ifname)) {
1057 				cmn_err(CE_WARN, "Failed to configure"
1058 				    " iSCSI boot nic");
1059 				(void) t_kclose(tiptr, 0);
1060 				return (ISCSI_STATUS_INTERNAL_ERROR);
1061 			}
1062 		} else {
1063 			cmn_err(CE_WARN, "Failed to configure"
1064 			    " iSCSI boot nic");
1065 			return (ISCSI_STATUS_INTERNAL_ERROR);
1066 		}
1067 		return (ISCSI_STATUS_SUCCESS);
1068 	}
1069 }
1070 
1071 /*
1072  * vp is needed to create the socket for the time being.
1073  */
1074 static int
1075 iscsi_ldi_vp_from_name(char *path, vnode_t **vpp)
1076 {
1077 	vnode_t		*vp = NULL;
1078 	int		ret;
1079 
1080 	/* sanity check required input parameters */
1081 	if ((path == NULL) || (vpp == NULL))
1082 		return (EINVAL);
1083 
1084 	if (modrootloaded) {
1085 		cred_t *saved_cred = curthread->t_cred;
1086 
1087 		/* we don't want lookupname to fail because of credentials */
1088 		curthread->t_cred = kcred;
1089 
1090 		/*
1091 		 * all lookups should be done in the global zone.  but
1092 		 * lookupnameat() won't actually do this if an absolute
1093 		 * path is passed in.  since the ldi interfaces require an
1094 		 * absolute path we pass lookupnameat() a pointer to
1095 		 * the character after the leading '/' and tell it to
1096 		 * start searching at the current system root directory.
1097 		 */
1098 		ASSERT(*path == '/');
1099 		ret = lookupnameat(path + 1, UIO_SYSSPACE, FOLLOW, NULLVPP,
1100 		    &vp, rootdir);
1101 
1102 		/* restore this threads credentials */
1103 		curthread->t_cred = saved_cred;
1104 
1105 		if (ret == 0) {
1106 			if (!vn_matchops(vp, spec_getvnodeops()) ||
1107 			    !VTYP_VALID(vp->v_type)) {
1108 				VN_RELE(vp);
1109 				return (ENXIO);
1110 			}
1111 		}
1112 	}
1113 
1114 	if (vp == NULL) {
1115 		dev_info_t	*dip;
1116 		dev_t		dev;
1117 		int		spec_type;
1118 
1119 		/*
1120 		 * Root is not mounted, the minor node is not specified,
1121 		 * or an OBP path has been specified.
1122 		 */
1123 
1124 		/*
1125 		 * Determine if path can be pruned to produce an
1126 		 * OBP or devfs path for resolve_pathname.
1127 		 */
1128 		if (strncmp(path, "/devices/", 9) == 0)
1129 			path += strlen("/devices");
1130 
1131 		/*
1132 		 * if no minor node was specified the DEFAULT minor node
1133 		 * will be returned.  if there is no DEFAULT minor node
1134 		 * one will be fabricated of type S_IFCHR with the minor
1135 		 * number equal to the instance number.
1136 		 */
1137 		ret = resolve_pathname(path, &dip, &dev, &spec_type);
1138 		if (ret != 0)
1139 			return (ENODEV);
1140 
1141 		ASSERT(STYP_VALID(spec_type));
1142 		vp = makespecvp(dev, STYP_TO_VTYP(spec_type));
1143 		spec_assoc_vp_with_devi(vp, dip);
1144 		ddi_release_devi(dip);
1145 	}
1146 
1147 	*vpp = vp;
1148 	return (0);
1149 }
1150