xref: /titanic_51/usr/src/stand/lib/sock/socket.c (revision 50981ffc7e4c5048d14890df805afee6ec113991)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  *
25  * socket.c, Code implementing a simple socket interface.
26  */
27 
28 #pragma ident	"%Z%%M%	%I%	%E% SMI"
29 
30 #include <sys/types.h>
31 #include "socket_impl.h"
32 #include <sys/isa_defs.h>
33 #include <sys/sysmacros.h>
34 #include <sys/bootconf.h>
35 #include <sys/socket.h>
36 #include <netinet/in.h>
37 #include <netinet/ip.h>
38 #include <netinet/tcp.h>
39 #include <sys/uio.h>
40 #include <sys/salib.h>
41 #include "socket_inet.h"
42 #include "ipv4.h"
43 #include "ipv4_impl.h"
44 #include "udp_inet.h"
45 #include "tcp_inet.h"
46 #include "mac.h"
47 #include "mac_impl.h"
48 #include <sys/promif.h>
49 
/*
 * Table of every socket managed by this file.  A slot whose type is
 * INETBOOT_UNUSED is free; so_check_fd() maps a descriptor to an index
 * into this table.
 */
struct inetboot_socket	sockets[MAXSOCKET] = { 0 };
51 
/*
 * Default send and receive socket buffer sizes.  The expansions are
 * parenthesized so the macros behave as single values inside larger
 * expressions (e.g. `x / SO_DEF_RCVBUF').
 */
#define	SO_DEF_SNDBUF	(48 * 1024)
#define	SO_DEF_RCVBUF	(48 * 1024)

/* Largest value that setsockopt() will store for SO_SNDBUF/SO_RCVBUF. */
#define	SO_MAX_BUF	(4 * 1024 * 1024)
58 
/* Forward declarations for the file-local helpers defined further down. */
static ssize_t dgram_sendto(int i, const void *msg, size_t len, int flags,
    const struct sockaddr *to, int tolen);
static ssize_t stream_sendto(int i, const void *msg, size_t len, int flags);
static int bind_check(int sock_id, const struct sockaddr *addr);
static int quickbind(int sock_id);
64 
65 /* Check the validity of a fd and return the socket index of that fd. */
66 int
67 so_check_fd(int fd, int *errno)
68 {
69 	int i;
70 
71 	i = FD_TO_SOCKET(fd);
72 	if (i < 0 || i >= MAXSOCKET) {
73 		*errno = ENOTSOCK;
74 		return (-1);
75 	}
76 	if (sockets[i].type == INETBOOT_UNUSED) {
77 		*errno = ENOTSOCK;
78 		return (-1);
79 	}
80 	return (i);
81 }
82 
83 /*
84  * Create an endpoint for network communication. Returns a descriptor.
85  *
86  * Notes:
87  *	Only PF_INET communication domains are supported. Within
88  * 	this domain, only SOCK_RAW, SOCK_DGRAM and SOCK_STREAM types are
89  *	supported.
90  */
91 int
92 socket(int domain, int type, int protocol)
93 {
94 	static int sock_initialized;
95 	int i;
96 
97 	errno = 0;
98 
99 	if (!sock_initialized) {
100 		for (i = 0; i < MAXSOCKET; i++)
101 			sockets[i].type = INETBOOT_UNUSED;
102 		sock_initialized = B_TRUE;
103 	}
104 	if (domain != AF_INET) {
105 		errno = EPROTONOSUPPORT;
106 		return (-1);
107 	}
108 
109 	/* Find available socket */
110 	for (i = 0; i < MAXSOCKET; i++) {
111 		if (sockets[i].type == INETBOOT_UNUSED)
112 			break;
113 	}
114 	if (i >= MAXSOCKET) {
115 		errno = EMFILE;	/* No slots left. */
116 		return (-1);
117 	}
118 
119 	/* Some socket initialization... */
120 	sockets[i].so_rcvbuf = SO_DEF_RCVBUF;
121 	sockets[i].so_sndbuf = SO_DEF_SNDBUF;
122 
123 	/*
124 	 * Note that we ignore the protocol field for SOCK_DGRAM and
125 	 * SOCK_STREAM.  When we support different protocols in future,
126 	 * this needs to be changed.
127 	 */
128 	switch (type) {
129 	case SOCK_RAW:
130 		ipv4_raw_socket(&sockets[i], (uint8_t)protocol);
131 		break;
132 	case SOCK_DGRAM:
133 		udp_socket_init(&sockets[i]);
134 		break;
135 	case SOCK_STREAM:
136 		tcp_socket_init(&sockets[i]);
137 		break;
138 	default:
139 		errno = EPROTOTYPE;
140 		break;
141 	}
142 
143 	if (errno != 0)
144 		return (-1);
145 
146 	/* IPv4 generic initialization. */
147 	ipv4_socket_init(&sockets[i]);
148 
149 	/* MAC generic initialization. */
150 	mac_socket_init(&sockets[i]);
151 
152 	return (i + SOCKETTYPE);
153 }
154 
155 int
156 getsockname(int s, struct sockaddr *name,  socklen_t *namelen)
157 {
158 	int i;
159 
160 	errno = 0;
161 	if ((i = so_check_fd(s, &errno)) == -1)
162 		return (-1);
163 
164 	if (*namelen < sizeof (struct sockaddr_in)) {
165 		errno = ENOMEM;
166 		return (-1);
167 	}
168 
169 	/* Structure assignment... */
170 	*((struct sockaddr_in *)name) = sockets[i].bind;
171 	*namelen = sizeof (struct sockaddr_in);
172 	return (0);
173 }
174 
175 /*
176  * The socket options we support are:
177  * SO_RCVTIMEO	-	Value is in msecs, and is of uint32_t.
178  * SO_DONTROUTE	-	Value is an int, and is a boolean (nonzero if set).
179  * SO_REUSEADDR -	Value is an int boolean.
180  * SO_RCVBUF -		Value is an int.
181  * SO_SNDBUF -		Value is an int.
182  */
183 int
184 getsockopt(int s, int level, int option, void *optval, socklen_t *optlen)
185 {
186 	int i;
187 
188 	errno = 0;
189 	if ((i = so_check_fd(s, &errno)) == -1)
190 		return (-1);
191 
192 	switch (level) {
193 	case SOL_SOCKET: {
194 		switch (option) {
195 		case SO_RCVTIMEO:
196 			if (*optlen == sizeof (uint32_t)) {
197 				*(uint32_t *)optval = sockets[i].in_timeout;
198 			} else {
199 				*optlen = 0;
200 				errno = EINVAL;
201 			}
202 			break;
203 		case SO_DONTROUTE:
204 			if (*optlen == sizeof (int)) {
205 				*(int *)optval =
206 				    (sockets[i].out_flags & SO_DONTROUTE);
207 			} else {
208 				*optlen = 0;
209 				errno = EINVAL;
210 			}
211 			break;
212 		case SO_REUSEADDR:
213 			if (*optlen == sizeof (int)) {
214 				*(int *)optval =
215 				    (sockets[i].so_opt & SO_REUSEADDR);
216 			} else {
217 				*optlen = 0;
218 				errno = EINVAL;
219 			}
220 			break;
221 		case SO_RCVBUF:
222 			if (*optlen == sizeof (int)) {
223 				*(int *)optval = sockets[i].so_rcvbuf;
224 			} else {
225 				*optlen = 0;
226 				errno = EINVAL;
227 			}
228 			break;
229 		case SO_SNDBUF:
230 			if (*optlen == sizeof (int)) {
231 				*(int *)optval = sockets[i].so_sndbuf;
232 			} else {
233 				*optlen = 0;
234 				errno = EINVAL;
235 			}
236 			break;
237 		case SO_LINGER:
238 			if (*optlen == sizeof (struct linger)) {
239 				/* struct copy */
240 				*(struct linger *)optval = sockets[i].so_linger;
241 			} else {
242 				*optlen = 0;
243 				errno = EINVAL;
244 			}
245 		default:
246 			errno = ENOPROTOOPT;
247 			break;
248 		}
249 		break;
250 	} /* case SOL_SOCKET */
251 	case IPPROTO_TCP:
252 	case IPPROTO_IP: {
253 		switch (option) {
254 		default:
255 			*optlen = 0;
256 			errno = ENOPROTOOPT;
257 			break;
258 		}
259 		break;
260 	} /* case IPPROTO_IP or IPPROTO_TCP */
261 	default:
262 		errno = ENOPROTOOPT;
263 		break;
264 	} /* switch (level) */
265 
266 	if (errno != 0)
267 		return (-1);
268 	else
269 		return (0);
270 }
271 
272 /*
273  * Generate a network-order source port from the privileged range if
274  * "reserved" is true, dynamic/private range otherwise. We consider the
275  * range of 512-1023 privileged ports as ports we can use. This mirrors
276  * historical rpc client practice for privileged port selection.
277  */
278 in_port_t
279 get_source_port(boolean_t reserved)
280 {
281 	static in_port_t	dynamic = IPPORT_DYNAMIC_START - 1,
282 	    rsvdport = (IPPORT_RESERVED / 2) - 1;
283 	in_port_t		p;
284 
285 	if (reserved) {
286 		if (++rsvdport >= IPPORT_RESERVED)
287 			p = rsvdport = IPPORT_RESERVED / 2;
288 		else
289 			p = rsvdport;
290 	} else
291 		p = ++dynamic;
292 
293 	return (htons(p));
294 }
295 
296 /*
297  * The socket options we support are:
298  * SO_RECVTIMEO	-	Value is uint32_t msecs.
299  * SO_DONTROUTE	-	Value is int boolean (nonzero == TRUE, zero == FALSE).
300  * SO_REUSEADDR -	value is int boolean.
301  * SO_RCVBUF -		Value is int.
302  * SO_SNDBUF -		Value is int.
303  */
304 int
305 setsockopt(int s, int level, int option, const void *optval, socklen_t optlen)
306 {
307 	int i;
308 
309 	errno = 0;
310 	if ((i = so_check_fd(s, &errno)) == -1)
311 		return (-1);
312 
313 	switch (level) {
314 	case SOL_SOCKET: {
315 		switch (option) {
316 		case SO_RCVTIMEO:
317 			if (optlen == sizeof (uint32_t))
318 				sockets[i].in_timeout = *(uint32_t *)optval;
319 			else {
320 				errno = EINVAL;
321 			}
322 			break;
323 		case SO_DONTROUTE:
324 			if (optlen == sizeof (int)) {
325 				if (*(int *)optval)
326 					sockets[i].out_flags |= SO_DONTROUTE;
327 				else
328 					sockets[i].out_flags &= ~SO_DONTROUTE;
329 			} else {
330 				errno = EINVAL;
331 			}
332 			break;
333 		case SO_REUSEADDR:
334 			if (optlen == sizeof (int)) {
335 				if (*(int *)optval)
336 					sockets[i].so_opt |= SO_REUSEADDR;
337 				else
338 					sockets[i].so_opt &= ~SO_REUSEADDR;
339 			} else {
340 				errno = EINVAL;
341 			}
342 			break;
343 		case SO_RCVBUF:
344 			if (optlen == sizeof (int)) {
345 				sockets[i].so_rcvbuf = *(int *)optval;
346 				if (sockets[i].so_rcvbuf > SO_MAX_BUF)
347 					sockets[i].so_rcvbuf = SO_MAX_BUF;
348 				(void) tcp_opt_set(sockets[i].pcb,
349 				    level, option, optval, optlen);
350 			} else {
351 				errno = EINVAL;
352 			}
353 			break;
354 		case SO_SNDBUF:
355 			if (optlen == sizeof (int)) {
356 				sockets[i].so_sndbuf = *(int *)optval;
357 				if (sockets[i].so_sndbuf > SO_MAX_BUF)
358 					sockets[i].so_sndbuf = SO_MAX_BUF;
359 				(void) tcp_opt_set(sockets[i].pcb,
360 				    level, option, optval, optlen);
361 			} else {
362 				errno = EINVAL;
363 			}
364 			break;
365 		case SO_LINGER:
366 			if (optlen == sizeof (struct linger)) {
367 				/* struct copy */
368 				sockets[i].so_linger = *(struct linger *)optval;
369 				(void) tcp_opt_set(sockets[i].pcb,
370 				    level, option, optval, optlen);
371 			} else {
372 				errno = EINVAL;
373 			}
374 			break;
375 		default:
376 			errno = ENOPROTOOPT;
377 			break;
378 		}
379 		break;
380 	} /* case SOL_SOCKET */
381 	case IPPROTO_TCP:
382 	case IPPROTO_IP: {
383 		switch (option) {
384 		default:
385 			errno = ENOPROTOOPT;
386 			break;
387 		}
388 		break;
389 	} /* case IPPROTO_IP  or IPPROTO_TCP */
390 	default:
391 		errno = ENOPROTOOPT;
392 		break;
393 	} /* switch (level) */
394 
395 	if (errno != 0)
396 		return (-1);
397 	else
398 		return (0);
399 }
400 
401 /*
402  * Shut down part of a full-duplex connection.
403  *
404  * Only supported for TCP sockets
405  */
406 int
407 shutdown(int s, int how)
408 {
409 	int sock_id;
410 	int i;
411 
412 	errno = 0;
413 	if ((sock_id = so_check_fd(s, &errno)) == -1)
414 		return (-1);
415 
416 	/* shutdown only supported for TCP sockets */
417 	if (sockets[sock_id].type != INETBOOT_STREAM) {
418 		errno = EOPNOTSUPP;
419 		return (-1);
420 	}
421 
422 	if (!(sockets[sock_id].so_state & SS_ISCONNECTED)) {
423 		errno = ENOTCONN;
424 		return (-1);
425 	}
426 
427 	switch (how) {
428 	case 0:
429 		sockets[sock_id].so_state |= SS_CANTRCVMORE;
430 		break;
431 	case 1:
432 		sockets[sock_id].so_state |= SS_CANTSENDMORE;
433 		break;
434 	case 2:
435 		sockets[sock_id].so_state |= (SS_CANTRCVMORE | SS_CANTSENDMORE);
436 		break;
437 	default:
438 		errno = EINVAL;
439 		return (-1);
440 	}
441 
442 	switch (sockets[sock_id].so_state &
443 	    (SS_CANTRCVMORE | SS_CANTSENDMORE)) {
444 	case (SS_CANTRCVMORE | SS_CANTSENDMORE):
445 		/* Call lower level protocol close routine. */
446 		for (i = TRANSPORT_LVL; i >= MEDIA_LVL; i--) {
447 			if (sockets[sock_id].close[i] != NULL) {
448 				(void) sockets[sock_id].close[i](sock_id);
449 			}
450 		}
451 		nuke_grams(&sockets[sock_id].inq);
452 		break;
453 	case SS_CANTRCVMORE:
454 		nuke_grams(&sockets[sock_id].inq);
455 		break;
456 	case SS_CANTSENDMORE:
457 		/* Call lower level protocol close routine. */
458 		if (tcp_shutdown(sock_id) < 0)
459 			return (-1);
460 		break;
461 	default:
462 		errno = EINVAL;
463 		return (-1);
464 	}
465 
466 	return (0);
467 }
468 
469 /*
470  * "close" a socket.
471  */
472 int
473 socket_close(int s)
474 {
475 	int sock_id, i;
476 
477 	errno = 0;
478 	if ((sock_id = so_check_fd(s, &errno)) == -1)
479 		return (-1);
480 
481 	/* Call lower level protocol close routine. */
482 	for (i = TRANSPORT_LVL; i >= MEDIA_LVL; i--) {
483 		if (sockets[sock_id].close[i] != NULL) {
484 			/*
485 			 * Note that the close() routine of other
486 			 * layers can return an error.  But right
487 			 * now, the only mechanism to report that
488 			 * back is for the close() routine to set
489 			 * the errno and socket_close() will return
490 			 * an error.  But the close operation will
491 			 * not be stopped.
492 			 */
493 			(void) sockets[sock_id].close[i](sock_id);
494 		}
495 	}
496 
497 	/*
498 	 * Clear the input queue.  This has to be done
499 	 * after the lower level protocol close routines have been
500 	 * called as they may want to do something about the queue.
501 	 */
502 	nuke_grams(&sockets[sock_id].inq);
503 
504 	bzero((caddr_t)&sockets[sock_id], sizeof (struct inetboot_socket));
505 	sockets[sock_id].type = INETBOOT_UNUSED;
506 
507 	return (0);
508 }
509 
510 /*
511  * Read up to `nbyte' of data from socket `s' into `buf'; if non-zero,
512  * then give up after `read_timeout' seconds.  Returns the number of
513  * bytes read, or -1 on failure.
514  */
515 int
516 socket_read(int s, void *buf, size_t nbyte, int read_timeout)
517 {
518 	ssize_t	n;
519 	uint_t	start, diff;
520 
521 	/*
522 	 * keep calling non-blocking recvfrom until something received
523 	 * or an error occurs
524 	 */
525 	start = prom_gettime();
526 	for (;;) {
527 		n = recvfrom(s, buf, nbyte, MSG_DONTWAIT, NULL, NULL);
528 		if (n == -1 && errno == EWOULDBLOCK) {
529 			diff = (uint_t)((prom_gettime() - start) + 500) / 1000;
530 			if (read_timeout != 0 && diff > read_timeout) {
531 				errno = EINTR;
532 				return (-1);
533 			}
534 		} else {
535 			return (n);
536 		}
537 	}
538 }
539 
/*
 * Send `nbyte' bytes from `buf' on socket `s' to the address `addr'
 * points at.  This is sendto() with no flags and an address length of
 * sizeof (struct sockaddr_in); its return value is passed straight back.
 */
int
socket_write(int s, const void *buf, size_t nbyte, struct sockaddr_in *addr)
{
	struct sockaddr *dest = (struct sockaddr *)addr;

	return (sendto(s, buf, nbyte, 0, dest, sizeof (struct sockaddr_in)));
}
551 
552 static int
553 bind_check(int sock_id, const struct sockaddr *addr)
554 {
555 	int k;
556 	struct sockaddr_in *in_addr = (struct sockaddr_in *)addr;
557 
558 	/* Do not check for duplicate bind() if SO_REUSEADDR option is set. */
559 	if (! (sockets[sock_id].so_opt & SO_REUSEADDR)) {
560 		for (k = 0; k < MAXSOCKET; k++) {
561 			if (sockets[k].type != INETBOOT_UNUSED &&
562 			    sockets[k].proto == sockets[sock_id].proto &&
563 			    sockets[k].bound) {
564 				if ((sockets[k].bind.sin_addr.s_addr ==
565 				    in_addr->sin_addr.s_addr) &&
566 				    (sockets[k].bind.sin_port ==
567 				    in_addr->sin_port)) {
568 					errno = EADDRINUSE;
569 					return (-1);
570 				}
571 			}
572 		}
573 	}
574 	return (0);
575 }
576 
577 /* Assign a name to an unnamed socket. */
578 int
579 bind(int s, const struct sockaddr *name, socklen_t namelen)
580 {
581 	int i;
582 
583 	errno = 0;
584 
585 	if ((i = so_check_fd(s, &errno)) == -1)
586 		return (-1);
587 
588 	if (name == NULL) {
589 		/* unbind */
590 		if (sockets[i].bound) {
591 			bzero((caddr_t)&sockets[i].bind,
592 			    sizeof (struct sockaddr_in));
593 			sockets[i].bound = B_FALSE;
594 		}
595 		return (0);
596 	}
597 	if (namelen != sizeof (struct sockaddr_in) || name == NULL) {
598 		errno = EINVAL;
599 		return (-1);
600 	}
601 	if (name->sa_family != AF_INET) {
602 		errno = EAFNOSUPPORT;
603 		return (-1);
604 	}
605 	if (sockets[i].bound) {
606 		if (bcmp((caddr_t)&sockets[i].bind, (caddr_t)name,
607 		    namelen) == 0) {
608 			/* attempt to bind to same address ok... */
609 			return (0);
610 		}
611 		errno = EINVAL;	/* already bound */
612 		return (-1);
613 	}
614 
615 	if (errno != 0) {
616 		return (-1);
617 	}
618 
619 	/* Check for duplicate bind(). */
620 	if (bind_check(i, name) < 0)
621 		return (-1);
622 
623 	bcopy((caddr_t)name, (caddr_t)&sockets[i].bind, namelen);
624 	if (sockets[i].type == INETBOOT_STREAM) {
625 		if (tcp_bind(i) < 0) {
626 			return (-1);
627 		}
628 	}
629 	sockets[i].bound = B_TRUE;
630 
631 	return (0);
632 }
633 
634 static int
635 quickbind(int sock_id)
636 {
637 	int i;
638 	struct sockaddr_in addr;
639 
640 	/*
641 	 * XXX This needs more work.  Right now, if ipv4_setipaddr()
642 	 * have not been called, this will be wrong.  But we need
643 	 * something better.  Need to be revisited.
644 	 */
645 	ipv4_getipaddr(&addr.sin_addr);
646 	addr.sin_family = AF_INET;
647 
648 	for (i = SMALLEST_ANON_PORT; i <= LARGEST_ANON_PORT; i++) {
649 		addr.sin_port = htons(i);
650 		if (bind_check(sock_id, (struct sockaddr *)&addr) == 0)
651 			break;
652 	}
653 	/* Need to clear errno as it is probably set by bind_check(). */
654 	errno = 0;
655 
656 	if (i <= LARGEST_ANON_PORT) {
657 		bcopy((caddr_t)&addr, (caddr_t)&sockets[sock_id].bind,
658 		    sizeof (struct sockaddr_in));
659 		sockets[sock_id].bound = B_TRUE;
660 #ifdef DEBUG
661 		printf("quick bind done addr %s port %d\n",
662 		    inet_ntoa(sockets[sock_id].bind.sin_addr),
663 		    ntohs(sockets[sock_id].bind.sin_port));
664 #endif
665 		return (0);
666 	} else {
667 		return (-1);
668 	}
669 }
670 
671 int
672 listen(int fd, int backlog)
673 {
674 	int sock_id;
675 
676 	errno = 0;
677 	if ((sock_id = so_check_fd(fd, &errno)) == -1)
678 		return (-1);
679 
680 	if (sockets[sock_id].type != INETBOOT_STREAM) {
681 		errno = EOPNOTSUPP;
682 		return (-1);
683 	}
684 	if (sockets[sock_id].so_error != 0) {
685 		errno = sockets[sock_id].so_error;
686 		return (-1);
687 	}
688 	return (tcp_listen(sock_id, backlog));
689 }
690 
691 int
692 accept(int fd, struct sockaddr *addr,  socklen_t *addr_len)
693 {
694 	int sock_id;
695 	int new_sd;
696 
697 	errno = 0;
698 	if ((sock_id = so_check_fd(fd, &errno)) == -1)
699 		return (-1);
700 
701 	if (sockets[sock_id].type != INETBOOT_STREAM) {
702 		errno = EOPNOTSUPP;
703 		return (-1);
704 	}
705 	if (sockets[sock_id].so_error != 0) {
706 		errno = sockets[sock_id].so_error;
707 		return (-1);
708 	}
709 	if ((new_sd = tcp_accept(sock_id, addr, addr_len)) == -1)
710 		return (-1);
711 	sock_id = so_check_fd(new_sd, &errno);
712 	sockets[sock_id].so_state |= SS_ISCONNECTED;
713 	return (new_sd);
714 }
715 
716 int
717 connect(int fd, const  struct sockaddr *addr, socklen_t addr_len)
718 {
719 	int sock_id;
720 	int so_type;
721 
722 	errno = 0;
723 	if ((sock_id = so_check_fd(fd, &errno)) == -1)
724 		return (-1);
725 
726 	so_type = sockets[sock_id].type;
727 
728 	if (addr == NULL || addr_len == 0) {
729 		errno = EINVAL;
730 		return (-1);
731 	}
732 	/* Don't allow connect for raw socket. */
733 	if (so_type == INETBOOT_RAW) {
734 		errno = EPROTONOSUPPORT;
735 		return (-1);
736 	}
737 
738 	if (sockets[sock_id].so_state & SS_ISCONNECTED) {
739 		errno = EINVAL;
740 		return (-1);
741 	}
742 
743 	if (sockets[sock_id].so_error != 0) {
744 		errno = sockets[sock_id].so_error;
745 		return (-1);
746 	}
747 
748 	/* If the socket is not bound, we need to do a quick bind. */
749 	if (!sockets[sock_id].bound) {
750 		/* For TCP socket, just call tcp_bind(). */
751 		if (so_type == INETBOOT_STREAM) {
752 			if (tcp_bind(sock_id) < 0)
753 				return (-1);
754 		} else {
755 			if (quickbind(sock_id) < 0) {
756 				errno = EADDRNOTAVAIL;
757 				return (-1);
758 			}
759 		}
760 	}
761 	/* Should do some sanity check for addr .... */
762 	bcopy((caddr_t)addr, &sockets[sock_id].remote,
763 	    sizeof (struct sockaddr_in));
764 
765 	if (sockets[sock_id].type == INETBOOT_STREAM) {
766 		/* Call TCP connect routine. */
767 		if (tcp_connect(sock_id) == 0)
768 			sockets[sock_id].so_state |= SS_ISCONNECTED;
769 		else {
770 			if (sockets[sock_id].so_error != 0)
771 				errno = sockets[sock_id].so_error;
772 			return (-1);
773 		}
774 	} else {
775 		sockets[sock_id].so_state |= SS_ISCONNECTED;
776 	}
777 	return (0);
778 }
779 
780 /* Just a wrapper around recvfrom(). */
781 ssize_t
782 recv(int s, void *buf, size_t len, int flags)
783 {
784 	return (recvfrom(s, buf, len, flags, NULL, NULL));
785 }
786 
787 /*
788  * Receive messages from a connectionless socket. Legal flags are 0 and
789  * MSG_DONTWAIT. MSG_WAITALL is not currently supported.
790  *
791  * Returns length of message for success, -1 if error occurred.
792  */
793 ssize_t
794 recvfrom(int s, void *buf, size_t len, int flags, struct sockaddr *from,
795     socklen_t *fromlen)
796 {
797 	int			sock_id, i;
798 	ssize_t			datalen, bytes = 0;
799 	struct inetgram		*icp;
800 	enum SockType		so_type;
801 	char			*tmp_buf;
802 	mblk_t			*mp;
803 
804 	errno = 0;
805 
806 	if ((sock_id = so_check_fd(s, &errno)) == -1) {
807 		errno = EINVAL;
808 		return (-1);
809 	}
810 
811 	if (sockets[sock_id].type == INETBOOT_STREAM &&
812 	    !(sockets[sock_id].so_state & SS_ISCONNECTED)) {
813 		errno = ENOTCONN;
814 		return (-1);
815 	}
816 
817 	if (buf == NULL || len == 0) {
818 		errno = EINVAL;
819 		return (-1);
820 	}
821 	/* Yup - MSG_WAITALL not implemented */
822 	if ((flags & ~MSG_DONTWAIT) != 0) {
823 		errno = EINVAL;
824 		return (-1);
825 	}
826 
827 retry:
828 	if (sockets[sock_id].inq == NULL) {
829 		/* Go out and check the wire */
830 		for (i = MEDIA_LVL; i < APP_LVL; i++) {
831 			if (sockets[sock_id].input[i] != NULL) {
832 				if (sockets[sock_id].input[i](sock_id) < 0) {
833 					if (sockets[sock_id].so_error != 0) {
834 						errno =
835 						    sockets[sock_id].so_error;
836 					}
837 					return (-1);
838 				}
839 			}
840 		}
841 	}
842 
843 	so_type = sockets[sock_id].type;
844 
845 	/* Remove unknown inetgrams from the head of inq.  Can this happen? */
846 	while ((icp = sockets[sock_id].inq) != NULL) {
847 		if ((so_type == INETBOOT_DGRAM ||
848 		    so_type == INETBOOT_STREAM) &&
849 		    icp->igm_level != APP_LVL) {
850 #ifdef	DEBUG
851 			printf("recvfrom: unexpected level %d frame found\n",
852 			    icp->igm_level);
853 #endif	/* DEBUG */
854 			del_gram(&sockets[sock_id].inq, icp, B_TRUE);
855 			continue;
856 		} else {
857 			break;
858 		}
859 	}
860 
861 
862 	if (icp == NULL) {
863 		/*
864 		 * Checking for error should be done everytime a lower layer
865 		 * input routing is called.  For example, if TCP gets a RST,
866 		 * this should be reported asap.
867 		 */
868 		if (sockets[sock_id].so_state & SS_CANTRCVMORE) {
869 			if (sockets[sock_id].so_error != 0) {
870 				errno = sockets[sock_id].so_error;
871 				return (-1);
872 			} else {
873 				return (0);
874 			}
875 		}
876 
877 		if ((flags & MSG_DONTWAIT) == 0)
878 			goto retry;	/* wait forever */
879 
880 		/* no data */
881 		errno = EWOULDBLOCK;
882 		return (-1);
883 	}
884 
885 	if (from != NULL && fromlen != NULL) {
886 		switch (so_type) {
887 		case INETBOOT_STREAM:
888 			/* Need to copy from the socket's remote address. */
889 			bcopy(&(sockets[sock_id].remote), from, MIN(*fromlen,
890 			    sizeof (struct sockaddr_in)));
891 			break;
892 		case INETBOOT_RAW:
893 		case INETBOOT_DGRAM:
894 		default:
895 			if (*fromlen > sizeof (icp->igm_saddr))
896 				*fromlen = sizeof (icp->igm_saddr);
897 			bcopy((caddr_t)&(icp->igm_saddr), (caddr_t)from,
898 			    MIN(*fromlen, sizeof (struct sockaddr_in)));
899 			break;
900 		}
901 	}
902 
903 	mp = icp->igm_mp;
904 	switch (so_type) {
905 	case INETBOOT_STREAM:
906 		/*
907 		 * If the message has igm_id == TCP_CALLB_MAGIC_ID, we need
908 		 * to drain the data held by tcp and try again.
909 		 */
910 		if (icp->igm_id == TCP_CALLB_MAGIC_ID) {
911 			del_gram(&sockets[sock_id].inq, icp, B_TRUE);
912 			tcp_rcv_drain_sock(sock_id);
913 			goto retry;
914 		}
915 
916 		/* TCP should put only user data in the inetgram. */
917 		tmp_buf = (char *)buf;
918 		while (len > 0 && icp != NULL) {
919 			datalen = mp->b_wptr - mp->b_rptr;
920 			if (len < datalen) {
921 				bcopy(mp->b_rptr, tmp_buf, len);
922 				bytes += len;
923 				mp->b_rptr += len;
924 				break;
925 			} else {
926 				bcopy(mp->b_rptr, tmp_buf, datalen);
927 				len -= datalen;
928 				bytes += datalen;
929 				tmp_buf += datalen;
930 				del_gram(&sockets[sock_id].inq, icp, B_TRUE);
931 
932 				/*
933 				 * If we have any embedded magic messages just
934 				 * drop them.
935 				 */
936 				while ((icp = sockets[sock_id].inq) != NULL) {
937 					if (icp->igm_id != TCP_CALLB_MAGIC_ID)
938 						break;
939 					del_gram(&sockets[sock_id].inq, icp,
940 					    B_TRUE);
941 				}
942 
943 				if (icp == NULL)
944 					break;
945 				mp = icp->igm_mp;
946 			}
947 		}
948 		sockets[sock_id].so_rcvbuf += (int32_t)bytes;
949 		break;
950 	case INETBOOT_DGRAM:
951 		datalen = mp->b_wptr - mp->b_rptr;
952 		if (len < datalen)
953 			bytes = len;
954 		else
955 			bytes = datalen;
956 		bcopy(mp->b_rptr, buf, bytes);
957 		del_gram(&sockets[sock_id].inq, icp, B_TRUE);
958 		break;
959 	case INETBOOT_RAW:
960 	default:
961 		datalen = mp->b_wptr - mp->b_rptr;
962 		if (len < datalen)
963 			bytes = len;
964 		else
965 			bytes = datalen;
966 		bcopy(mp->b_rptr, buf, bytes);
967 		del_gram(&sockets[sock_id].inq, icp, B_TRUE);
968 		break;
969 	}
970 
971 #ifdef	DEBUG
972 	printf("recvfrom(%d): data: (0x%x,%d)\n", sock_id,
973 	    (icp != NULL) ? icp->igm_mp : 0, bytes);
974 #endif	/* DEBUG */
975 	return (bytes);
976 }
977 
978 
979 /* Just a wrapper around sendto(). */
980 ssize_t
981 send(int s, const void *msg, size_t len, int flags)
982 {
983 	return (sendto(s, msg, len, flags, NULL, 0));
984 }
985 
986 /*
987  * Transmit a message through a socket.
988  *
989  * Supported flags: MSG_DONTROUTE or 0.
990  */
991 ssize_t
992 sendto(int s, const void *msg, size_t len, int flags, const struct sockaddr *to,
993     socklen_t tolen)
994 {
995 	enum SockType so_type;
996 	int sock_id;
997 	ssize_t bytes;
998 
999 	errno = 0;
1000 
1001 	if ((sock_id = so_check_fd(s, &errno)) == -1) {
1002 		return (-1);
1003 	}
1004 	if (msg == NULL) {
1005 		errno = EINVAL;
1006 		return (-1);
1007 	}
1008 	so_type = sockets[sock_id].type;
1009 	if ((flags & ~MSG_DONTROUTE) != 0) {
1010 		errno = EINVAL;
1011 		return (-1);
1012 	}
1013 	if (sockets[sock_id].so_error != 0) {
1014 		errno = sockets[sock_id].so_error;
1015 		return (-1);
1016 	}
1017 	if (to != NULL && to->sa_family != AF_INET) {
1018 		errno = EAFNOSUPPORT;
1019 		return (-1);
1020 	}
1021 
1022 	switch (so_type) {
1023 	case INETBOOT_RAW:
1024 	case INETBOOT_DGRAM:
1025 		if (!(sockets[sock_id].so_state & SS_ISCONNECTED) &&
1026 		    (to == NULL || tolen != sizeof (struct sockaddr_in))) {
1027 			errno = EINVAL;
1028 			return (-1);
1029 		}
1030 		bytes = dgram_sendto(sock_id, msg, len, flags, to, tolen);
1031 		break;
1032 	case INETBOOT_STREAM:
1033 		if (!((sockets[sock_id].so_state & SS_ISCONNECTED) ||
1034 		    (sockets[sock_id].so_state & SS_ISCONNECTING))) {
1035 			errno = EINVAL;
1036 			return (-1);
1037 		}
1038 		if (sockets[sock_id].so_state & SS_CANTSENDMORE) {
1039 			errno = EPIPE;
1040 			return (-1);
1041 		}
1042 		bytes = stream_sendto(sock_id, msg, len, flags);
1043 		break;
1044 	default:
1045 		/* Should not happen... */
1046 		errno = EPROTOTYPE;
1047 		return (-1);
1048 	}
1049 	return (bytes);
1050 }
1051 
1052 static ssize_t
1053 dgram_sendto(int i, const void *msg, size_t len, int flags,
1054     const struct sockaddr *to, int tolen)
1055 {
1056 	struct inetgram		oc;
1057 	int			l, offset;
1058 	size_t			tlen;
1059 	mblk_t			*mp;
1060 
1061 #ifdef	DEBUG
1062 	{
1063 	struct sockaddr_in *sin = (struct sockaddr_in *)to;
1064 	printf("sendto(%d): msg of length: %d sent to port %d and host: %s\n",
1065 	    i, len, ntohs(sin->sin_port), inet_ntoa(sin->sin_addr));
1066 	}
1067 #endif	/* DEBUG */
1068 
1069 	nuke_grams(&sockets[i].inq); /* flush the input queue */
1070 
1071 	/* calculate offset for data */
1072 	offset = sockets[i].headerlen[MEDIA_LVL](NULL) +
1073 	    (sockets[i].headerlen[NETWORK_LVL])(NULL);
1074 
1075 	bzero((caddr_t)&oc, sizeof (oc));
1076 	if (sockets[i].type != INETBOOT_RAW) {
1077 		offset += (sockets[i].headerlen[TRANSPORT_LVL])(NULL);
1078 		oc.igm_level = TRANSPORT_LVL;
1079 	} else
1080 		oc.igm_level = NETWORK_LVL;
1081 	oc.igm_oflags = flags;
1082 
1083 	if (to != NULL) {
1084 		bcopy((caddr_t)to, (caddr_t)&oc.igm_saddr, tolen);
1085 	} else {
1086 		bcopy((caddr_t)&sockets[i].remote, (caddr_t)&oc.igm_saddr,
1087 		    sizeof (struct sockaddr_in));
1088 	}
1089 
1090 	/* Get a legal source port if the socket isn't bound. */
1091 	if (sockets[i].bound == B_FALSE &&
1092 	    ntohs(oc.igm_saddr.sin_port == 0)) {
1093 		((struct sockaddr_in *)&oc.igm_saddr)->sin_port =
1094 		    get_source_port(B_FALSE);
1095 	}
1096 
1097 	/* Round up to 16bit value for checksum purposes */
1098 	if (sockets[i].type == INETBOOT_DGRAM) {
1099 		tlen = ((len + sizeof (uint16_t) - 1) &
1100 		    ~(sizeof (uint16_t) - 1));
1101 	} else
1102 		tlen = len;
1103 
1104 	if ((oc.igm_mp = allocb(tlen + offset, 0)) == NULL) {
1105 		errno = ENOMEM;
1106 		return (-1);
1107 	}
1108 	mp = oc.igm_mp;
1109 	mp->b_rptr = mp->b_wptr += offset;
1110 	bcopy((caddr_t)msg, mp->b_wptr, len);
1111 	mp->b_wptr += len;
1112 	for (l = TRANSPORT_LVL; l >= MEDIA_LVL; l--) {
1113 		if (sockets[i].output[l] != NULL) {
1114 			if (sockets[i].output[l](i, &oc) < 0) {
1115 				freeb(mp);
1116 				if (errno == 0)
1117 					errno = EIO;
1118 				return (-1);
1119 			}
1120 		}
1121 	}
1122 	freeb(mp);
1123 	return (len);
1124 }
1125 
1126 /* ARGSUSED */
1127 static ssize_t
1128 stream_sendto(int i, const void *msg, size_t len, int flags)
1129 {
1130 	int cnt;
1131 
1132 	assert(sockets[i].pcb != NULL);
1133 
1134 	/*
1135 	 * Call directly TCP's send routine.  We do this because TCP
1136 	 * needs to decide whether to send out the data.
1137 	 *
1138 	 * Note also that currently, TCP ignores all flags passed in for
1139 	 * TCP socket.
1140 	 */
1141 	if ((cnt = tcp_send(i, sockets[i].pcb, msg, len)) < 0) {
1142 		if (sockets[i].so_error != 0)
1143 			errno = sockets[i].so_error;
1144 		return (-1);
1145 	} else {
1146 		return (cnt);
1147 	}
1148 }
1149 
1150 /*
1151  * Returns ptr to the last inetgram in the list, or null if list is null
1152  */
1153 struct inetgram *
1154 last_gram(struct inetgram *igp)
1155 {
1156 	struct inetgram	*wp;
1157 	for (wp = igp; wp != NULL; wp = wp->igm_next) {
1158 		if (wp->igm_next == NULL)
1159 			return (wp);
1160 	}
1161 	return (NULL);
1162 }
1163 
1164 /*
1165  * Adds an inetgram or list of inetgrams to the end of the list.
1166  */
1167 void
1168 add_grams(struct inetgram **igpp, struct inetgram *newgp)
1169 {
1170 	struct inetgram	 *wp;
1171 
1172 	if (newgp == NULL)
1173 		return;
1174 
1175 	if (*igpp == NULL)
1176 		*igpp = newgp;
1177 	else {
1178 		wp = last_gram(*igpp);
1179 		wp->igm_next = newgp;
1180 	}
1181 }
1182 
1183 /*
1184  * Nuke a whole list of grams.
1185  */
1186 void
1187 nuke_grams(struct inetgram **lgpp)
1188 {
1189 	while (*lgpp != NULL)
1190 		del_gram(lgpp, *lgpp, B_TRUE);
1191 }
1192 
1193 /*
1194  * Remove the referenced inetgram. List is altered accordingly. Destroy the
1195  * referenced inetgram if freeit is B_TRUE.
1196  */
1197 void
1198 del_gram(struct inetgram **lgpp, struct inetgram *igp, int freeit)
1199 {
1200 	struct inetgram	*wp, *pp = NULL;
1201 
1202 	if (lgpp == NULL || igp == NULL)
1203 		return;
1204 
1205 	wp = *lgpp;
1206 	while (wp != NULL) {
1207 		if (wp == igp) {
1208 			/* detach wp from the list */
1209 			if (*lgpp == wp)
1210 				*lgpp = (*lgpp)->igm_next;
1211 			else
1212 				pp->igm_next = wp->igm_next;
1213 			igp->igm_next = NULL;
1214 
1215 			if (freeit) {
1216 				if (igp->igm_mp != NULL)
1217 					freeb(igp->igm_mp);
1218 				bkmem_free((caddr_t)igp,
1219 				    sizeof (struct inetgram));
1220 			}
1221 			break;
1222 		}
1223 		pp = wp;
1224 		wp = wp->igm_next;
1225 	}
1226 }
1227 
1228 struct nct_t nct[] = {
1229 	"bootp",	NCT_BOOTP_DHCP,
1230 	"dhcp",		NCT_BOOTP_DHCP,
1231 	"rarp",		NCT_RARP_BOOTPARAMS,
1232 	"manual",	NCT_MANUAL
1233 };
1234 int	nct_entries = sizeof (nct) / sizeof (nct[0]);
1235 
1236 /*
1237  * Figure out from the bootpath what kind of network configuration strategy
1238  * we should use. Returns the network config strategy.
1239  */
1240 int
1241 get_netconfig_strategy(void)
1242 {
1243 	int	i;
1244 #define	ISSPACE(c) (c == ' ' || c == '\t' || c == '\n' || c == '\0')
1245 	char	lbootpath[OBP_MAXPATHLEN];
1246 	char	net_options[NCT_BUFSIZE];
1247 	char	*op, *nop, *sp;
1248 	pnode_t	cn;
1249 	int	proplen;
1250 
1251 	/* If the PROM DHCP cache exists, we're done */
1252 	if (prom_cached_reply(B_TRUE))
1253 		return (NCT_BOOTP_DHCP);
1254 
1255 	/*
1256 	 *	Newer (version 4) PROMs will put the name in the
1257 	 *	"net-config-strategy" property.
1258 	 */
1259 	cn = prom_finddevice("/chosen");
1260 	if ((proplen = prom_getproplen(cn, "net-config-strategy")) <
1261 	    sizeof (net_options)) {
1262 		(void) prom_getprop(cn, "net-config-strategy", net_options);
1263 		net_options[proplen] = '\0';
1264 	} else {
1265 
1266 		/*
1267 		 * We're reduced to sacanning bootpath for the prototol to use.
1268 		 * Since there was no "net-config-strategy" property, this is
1269 		 * an old PROM, so we need to excise any extraneous key/value
1270 		 * initializations from bootpath[].
1271 		 */
1272 		for (op = prom_bootpath(), sp = lbootpath; op != NULL &&
1273 		    !ISSPACE(*op); sp++, op++)
1274 			*sp = *op;
1275 		*sp = '\0';
1276 		/* find the last '/' (in the device path) */
1277 		if ((op = strrchr(lbootpath, '/')) == NULL)	/* last '/' */
1278 			op = lbootpath;
1279 		else
1280 			op++;
1281 		/* then look for the ':' separating it from the protocol */
1282 		while (*op != ':' && *op != '\0')
1283 			op++;
1284 
1285 		if (*op == ':') {
1286 			for (nop = net_options, op++;
1287 			    *op != '\0' && *op != '/' && !ISSPACE(*op) &&
1288 			    nop < &net_options[NCT_BUFSIZE]; nop++, op++)
1289 				*nop = *op;
1290 			*nop = '\0';
1291 		} else
1292 			net_options[0] = '\0';
1293 	}
1294 
1295 #undef	ISSPACE
1296 
1297 	for (i = 0; i < nct_entries; i++)
1298 		if (strcmp(net_options, nct[i].p_name) == 0)
1299 			return (nct[i].p_id);
1300 
1301 	return (NCT_DEFAULT);
1302 }
1303 
1304 /* Modified STREAM routines for ease of porting core TCP code. */
1305 
1306 /*ARGSUSED*/
1307 mblk_t *
1308 allocb(size_t size, uint_t pri)
1309 {
1310 	unsigned char *base;
1311 	mblk_t *mp;
1312 
1313 	if ((mp = (mblk_t *)bkmem_zalloc(sizeof (mblk_t))) == NULL)
1314 		return (NULL);
1315 	if ((base = (unsigned char *)bkmem_zalloc(size)) == NULL)
1316 		return (NULL);
1317 
1318 	mp->b_next = mp->b_prev = mp->b_cont = NULL;
1319 	mp->b_rptr = mp->b_wptr = mp->b_datap = (unsigned char *)base;
1320 	mp->b_size = size;
1321 
1322 	return (mp);
1323 }
1324 
1325 void
1326 freeb(mblk_t *mp)
1327 {
1328 #ifdef DEBUG
1329 	printf("freeb datap %x\n", mp->b_datap);
1330 #endif
1331 	bkmem_free((caddr_t)(mp->b_datap), mp->b_size);
1332 #ifdef DEBUG
1333 	printf("freeb mp %x\n", mp);
1334 #endif
1335 	bkmem_free((caddr_t)mp, sizeof (mblk_t));
1336 }
1337 
1338 void
1339 freemsg(mblk_t *mp)
1340 {
1341 	while (mp) {
1342 		mblk_t *mp_cont = mp->b_cont;
1343 
1344 		freeb(mp);
1345 		mp = mp_cont;
1346 	}
1347 }
1348 
1349 mblk_t *
1350 copyb(mblk_t *bp)
1351 {
1352 	mblk_t *nbp;
1353 	unsigned char *ndp;
1354 
1355 	assert((uintptr_t)(bp->b_wptr - bp->b_rptr) >= 0);
1356 
1357 	if (!(nbp = allocb(bp->b_size, 0)))
1358 		return (NULL);
1359 	nbp->b_cont = NULL;
1360 	ndp = nbp->b_datap;
1361 
1362 	nbp->b_rptr = ndp + (bp->b_rptr - bp->b_datap);
1363 	nbp->b_wptr = nbp->b_rptr + (bp->b_wptr - bp->b_rptr);
1364 	bcopy(bp->b_datap, nbp->b_datap, bp->b_size);
1365 	return (nbp);
1366 }
1367 
1368 /* To simplify things, dupb() is implemented as copyb(). */
1369 mblk_t *
1370 dupb(mblk_t *mp)
1371 {
1372 	return (copyb(mp));
1373 }
1374 
1375 /*
1376  * get number of data bytes in message
1377  */
1378 size_t
1379 msgdsize(mblk_t *bp)
1380 {
1381 	size_t count = 0;
1382 
1383 	for (; bp != NULL; bp = bp->b_cont) {
1384 		assert(bp->b_wptr >= bp->b_rptr);
1385 		count += bp->b_wptr - bp->b_rptr;
1386 	}
1387 	return (count);
1388 }
1389