xref: /illumos-gate/usr/src/stand/lib/sock/socket.c (revision a92282e44f968185a6bba094d1e5fece2da819cf)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  *
25  * socket.c, Code implementing a simple socket interface.
26  */
27 
28 #include <sys/types.h>
29 #include "socket_impl.h"
30 #include <sys/isa_defs.h>
31 #include <sys/sysmacros.h>
32 #include <sys/bootconf.h>
33 #include <sys/socket.h>
34 #include <netinet/in.h>
35 #include <netinet/ip.h>
36 #include <netinet/tcp.h>
37 #include <sys/uio.h>
38 #include <sys/salib.h>
39 #include "socket_inet.h"
40 #include "ipv4.h"
41 #include "ipv4_impl.h"
42 #include "udp_inet.h"
43 #include "tcp_inet.h"
44 #include "mac.h"
45 #include "mac_impl.h"
46 #include <sys/promif.h>
47 
/*
 * Table of socket slots, indexed by socket id.  socket() marks every entry
 * INETBOOT_UNUSED the first time it is called.
 */
struct inetboot_socket	sockets[MAXSOCKET] = { 0 };
49 
/*
 * Default send and receive socket buffer size.  The replacement lists are
 * parenthesized so that each macro behaves as a single value when it is
 * used next to operators such as '/' or '%'.
 */
#define	SO_DEF_SNDBUF	(48 * 1024)
#define	SO_DEF_RCVBUF	(48 * 1024)

/* Default max socket buffer size */
#define	SO_MAX_BUF	(4 * 1024 * 1024)
56 
/* Forward declarations of the helpers that are private to this file. */
static ssize_t dgram_sendto(int, const void *, size_t, int,
    const struct sockaddr *, int);
static ssize_t stream_sendto(int, const void *, size_t, int);
static int bind_check(int, const struct sockaddr *);
static int quickbind(int);
62 
63 /* Check the validity of a fd and return the socket index of that fd. */
64 int
65 so_check_fd(int fd, int *errno)
66 {
67 	int i;
68 
69 	i = FD_TO_SOCKET(fd);
70 	if (i < 0 || i >= MAXSOCKET) {
71 		*errno = ENOTSOCK;
72 		return (-1);
73 	}
74 	if (sockets[i].type == INETBOOT_UNUSED) {
75 		*errno = ENOTSOCK;
76 		return (-1);
77 	}
78 	return (i);
79 }
80 
81 /*
82  * Create an endpoint for network communication. Returns a descriptor.
83  *
84  * Notes:
85  *	Only PF_INET communication domains are supported. Within
86  *	this domain, only SOCK_RAW, SOCK_DGRAM and SOCK_STREAM types are
87  *	supported.
88  */
89 int
90 socket(int domain, int type, int protocol)
91 {
92 	static int sock_initialized;
93 	int i;
94 
95 	errno = 0;
96 
97 	if (!sock_initialized) {
98 		for (i = 0; i < MAXSOCKET; i++)
99 			sockets[i].type = INETBOOT_UNUSED;
100 		sock_initialized = B_TRUE;
101 	}
102 	if (domain != AF_INET) {
103 		errno = EPROTONOSUPPORT;
104 		return (-1);
105 	}
106 
107 	/* Find available socket */
108 	for (i = 0; i < MAXSOCKET; i++) {
109 		if (sockets[i].type == INETBOOT_UNUSED)
110 			break;
111 	}
112 	if (i >= MAXSOCKET) {
113 		errno = EMFILE;	/* No slots left. */
114 		return (-1);
115 	}
116 
117 	/* Some socket initialization... */
118 	sockets[i].so_rcvbuf = SO_DEF_RCVBUF;
119 	sockets[i].so_sndbuf = SO_DEF_SNDBUF;
120 
121 	/*
122 	 * Note that we ignore the protocol field for SOCK_DGRAM and
123 	 * SOCK_STREAM.  When we support different protocols in future,
124 	 * this needs to be changed.
125 	 */
126 	switch (type) {
127 	case SOCK_RAW:
128 		ipv4_raw_socket(&sockets[i], (uint8_t)protocol);
129 		break;
130 	case SOCK_DGRAM:
131 		udp_socket_init(&sockets[i]);
132 		break;
133 	case SOCK_STREAM:
134 		tcp_socket_init(&sockets[i]);
135 		break;
136 	default:
137 		errno = EPROTOTYPE;
138 		break;
139 	}
140 
141 	if (errno != 0)
142 		return (-1);
143 
144 	/* IPv4 generic initialization. */
145 	ipv4_socket_init(&sockets[i]);
146 
147 	/* MAC generic initialization. */
148 	mac_socket_init(&sockets[i]);
149 
150 	return (i + SOCKETTYPE);
151 }
152 
153 int
154 getsockname(int s, struct sockaddr *name,  socklen_t *namelen)
155 {
156 	int i;
157 
158 	errno = 0;
159 	if ((i = so_check_fd(s, &errno)) == -1)
160 		return (-1);
161 
162 	if (*namelen < sizeof (struct sockaddr_in)) {
163 		errno = ENOMEM;
164 		return (-1);
165 	}
166 
167 	/* Structure assignment... */
168 	*((struct sockaddr_in *)name) = sockets[i].bind;
169 	*namelen = sizeof (struct sockaddr_in);
170 	return (0);
171 }
172 
173 /*
174  * The socket options we support are:
175  * SO_RCVTIMEO	-	Value is in msecs, and is of uint32_t.
176  * SO_DONTROUTE	-	Value is an int, and is a boolean (nonzero if set).
177  * SO_REUSEADDR -	Value is an int boolean.
178  * SO_RCVBUF -		Value is an int.
179  * SO_SNDBUF -		Value is an int.
180  */
181 int
182 getsockopt(int s, int level, int option, void *optval, socklen_t *optlen)
183 {
184 	int i;
185 
186 	errno = 0;
187 	if ((i = so_check_fd(s, &errno)) == -1)
188 		return (-1);
189 
190 	switch (level) {
191 	case SOL_SOCKET: {
192 		switch (option) {
193 		case SO_RCVTIMEO:
194 			if (*optlen == sizeof (uint32_t)) {
195 				*(uint32_t *)optval = sockets[i].in_timeout;
196 			} else {
197 				*optlen = 0;
198 				errno = EINVAL;
199 			}
200 			break;
201 		case SO_DONTROUTE:
202 			if (*optlen == sizeof (int)) {
203 				*(int *)optval =
204 				    (sockets[i].out_flags & SO_DONTROUTE);
205 			} else {
206 				*optlen = 0;
207 				errno = EINVAL;
208 			}
209 			break;
210 		case SO_REUSEADDR:
211 			if (*optlen == sizeof (int)) {
212 				*(int *)optval =
213 				    (sockets[i].so_opt & SO_REUSEADDR);
214 			} else {
215 				*optlen = 0;
216 				errno = EINVAL;
217 			}
218 			break;
219 		case SO_RCVBUF:
220 			if (*optlen == sizeof (int)) {
221 				*(int *)optval = sockets[i].so_rcvbuf;
222 			} else {
223 				*optlen = 0;
224 				errno = EINVAL;
225 			}
226 			break;
227 		case SO_SNDBUF:
228 			if (*optlen == sizeof (int)) {
229 				*(int *)optval = sockets[i].so_sndbuf;
230 			} else {
231 				*optlen = 0;
232 				errno = EINVAL;
233 			}
234 			break;
235 		case SO_LINGER:
236 			if (*optlen == sizeof (struct linger)) {
237 				/* struct copy */
238 				*(struct linger *)optval = sockets[i].so_linger;
239 			} else {
240 				*optlen = 0;
241 				errno = EINVAL;
242 			}
243 			break;
244 		default:
245 			errno = ENOPROTOOPT;
246 			break;
247 		}
248 		break;
249 	} /* case SOL_SOCKET */
250 	case IPPROTO_TCP:
251 	case IPPROTO_IP: {
252 		switch (option) {
253 		default:
254 			*optlen = 0;
255 			errno = ENOPROTOOPT;
256 			break;
257 		}
258 		break;
259 	} /* case IPPROTO_IP or IPPROTO_TCP */
260 	default:
261 		errno = ENOPROTOOPT;
262 		break;
263 	} /* switch (level) */
264 
265 	if (errno != 0)
266 		return (-1);
267 	else
268 		return (0);
269 }
270 
271 /*
272  * Generate a network-order source port from the privileged range if
273  * "reserved" is true, dynamic/private range otherwise. We consider the
274  * range of 512-1023 privileged ports as ports we can use. This mirrors
275  * historical rpc client practice for privileged port selection.
276  */
277 in_port_t
278 get_source_port(boolean_t reserved)
279 {
280 	static in_port_t	dynamic = IPPORT_DYNAMIC_START - 1,
281 	    rsvdport = (IPPORT_RESERVED / 2) - 1;
282 	in_port_t		p;
283 
284 	if (reserved) {
285 		if (++rsvdport >= IPPORT_RESERVED)
286 			p = rsvdport = IPPORT_RESERVED / 2;
287 		else
288 			p = rsvdport;
289 	} else
290 		p = ++dynamic;
291 
292 	return (htons(p));
293 }
294 
295 /*
296  * The socket options we support are:
297  * SO_RECVTIMEO	-	Value is uint32_t msecs.
298  * SO_DONTROUTE	-	Value is int boolean (nonzero == TRUE, zero == FALSE).
299  * SO_REUSEADDR -	value is int boolean.
300  * SO_RCVBUF -		Value is int.
301  * SO_SNDBUF -		Value is int.
302  */
303 int
304 setsockopt(int s, int level, int option, const void *optval, socklen_t optlen)
305 {
306 	int i;
307 
308 	errno = 0;
309 	if ((i = so_check_fd(s, &errno)) == -1)
310 		return (-1);
311 
312 	switch (level) {
313 	case SOL_SOCKET: {
314 		switch (option) {
315 		case SO_RCVTIMEO:
316 			if (optlen == sizeof (uint32_t))
317 				sockets[i].in_timeout = *(uint32_t *)optval;
318 			else {
319 				errno = EINVAL;
320 			}
321 			break;
322 		case SO_DONTROUTE:
323 			if (optlen == sizeof (int)) {
324 				if (*(int *)optval)
325 					sockets[i].out_flags |= SO_DONTROUTE;
326 				else
327 					sockets[i].out_flags &= ~SO_DONTROUTE;
328 			} else {
329 				errno = EINVAL;
330 			}
331 			break;
332 		case SO_REUSEADDR:
333 			if (optlen == sizeof (int)) {
334 				if (*(int *)optval)
335 					sockets[i].so_opt |= SO_REUSEADDR;
336 				else
337 					sockets[i].so_opt &= ~SO_REUSEADDR;
338 			} else {
339 				errno = EINVAL;
340 			}
341 			break;
342 		case SO_RCVBUF:
343 			if (optlen == sizeof (int)) {
344 				sockets[i].so_rcvbuf = *(int *)optval;
345 				if (sockets[i].so_rcvbuf > SO_MAX_BUF)
346 					sockets[i].so_rcvbuf = SO_MAX_BUF;
347 				(void) tcp_opt_set(sockets[i].pcb,
348 				    level, option, optval, optlen);
349 			} else {
350 				errno = EINVAL;
351 			}
352 			break;
353 		case SO_SNDBUF:
354 			if (optlen == sizeof (int)) {
355 				sockets[i].so_sndbuf = *(int *)optval;
356 				if (sockets[i].so_sndbuf > SO_MAX_BUF)
357 					sockets[i].so_sndbuf = SO_MAX_BUF;
358 				(void) tcp_opt_set(sockets[i].pcb,
359 				    level, option, optval, optlen);
360 			} else {
361 				errno = EINVAL;
362 			}
363 			break;
364 		case SO_LINGER:
365 			if (optlen == sizeof (struct linger)) {
366 				/* struct copy */
367 				sockets[i].so_linger = *(struct linger *)optval;
368 				(void) tcp_opt_set(sockets[i].pcb,
369 				    level, option, optval, optlen);
370 			} else {
371 				errno = EINVAL;
372 			}
373 			break;
374 		default:
375 			errno = ENOPROTOOPT;
376 			break;
377 		}
378 		break;
379 	} /* case SOL_SOCKET */
380 	case IPPROTO_TCP:
381 	case IPPROTO_IP: {
382 		switch (option) {
383 		default:
384 			errno = ENOPROTOOPT;
385 			break;
386 		}
387 		break;
388 	} /* case IPPROTO_IP  or IPPROTO_TCP */
389 	default:
390 		errno = ENOPROTOOPT;
391 		break;
392 	} /* switch (level) */
393 
394 	if (errno != 0)
395 		return (-1);
396 	else
397 		return (0);
398 }
399 
400 /*
401  * Shut down part of a full-duplex connection.
402  *
403  * Only supported for TCP sockets
404  */
405 int
406 shutdown(int s, int how)
407 {
408 	int sock_id;
409 	int i;
410 
411 	errno = 0;
412 	if ((sock_id = so_check_fd(s, &errno)) == -1)
413 		return (-1);
414 
415 	/* shutdown only supported for TCP sockets */
416 	if (sockets[sock_id].type != INETBOOT_STREAM) {
417 		errno = EOPNOTSUPP;
418 		return (-1);
419 	}
420 
421 	if (!(sockets[sock_id].so_state & SS_ISCONNECTED)) {
422 		errno = ENOTCONN;
423 		return (-1);
424 	}
425 
426 	switch (how) {
427 	case 0:
428 		sockets[sock_id].so_state |= SS_CANTRCVMORE;
429 		break;
430 	case 1:
431 		sockets[sock_id].so_state |= SS_CANTSENDMORE;
432 		break;
433 	case 2:
434 		sockets[sock_id].so_state |= (SS_CANTRCVMORE | SS_CANTSENDMORE);
435 		break;
436 	default:
437 		errno = EINVAL;
438 		return (-1);
439 	}
440 
441 	switch (sockets[sock_id].so_state &
442 	    (SS_CANTRCVMORE | SS_CANTSENDMORE)) {
443 	case (SS_CANTRCVMORE | SS_CANTSENDMORE):
444 		/* Call lower level protocol close routine. */
445 		for (i = TRANSPORT_LVL; i >= MEDIA_LVL; i--) {
446 			if (sockets[sock_id].close[i] != NULL) {
447 				(void) sockets[sock_id].close[i](sock_id);
448 			}
449 		}
450 		nuke_grams(&sockets[sock_id].inq);
451 		break;
452 	case SS_CANTRCVMORE:
453 		nuke_grams(&sockets[sock_id].inq);
454 		break;
455 	case SS_CANTSENDMORE:
456 		/* Call lower level protocol close routine. */
457 		if (tcp_shutdown(sock_id) < 0)
458 			return (-1);
459 		break;
460 	default:
461 		errno = EINVAL;
462 		return (-1);
463 	}
464 
465 	return (0);
466 }
467 
468 /*
469  * "close" a socket.
470  */
471 int
472 socket_close(int s)
473 {
474 	int sock_id, i;
475 
476 	errno = 0;
477 	if ((sock_id = so_check_fd(s, &errno)) == -1)
478 		return (-1);
479 
480 	/* Call lower level protocol close routine. */
481 	for (i = TRANSPORT_LVL; i >= MEDIA_LVL; i--) {
482 		if (sockets[sock_id].close[i] != NULL) {
483 			/*
484 			 * Note that the close() routine of other
485 			 * layers can return an error.  But right
486 			 * now, the only mechanism to report that
487 			 * back is for the close() routine to set
488 			 * the errno and socket_close() will return
489 			 * an error.  But the close operation will
490 			 * not be stopped.
491 			 */
492 			(void) sockets[sock_id].close[i](sock_id);
493 		}
494 	}
495 
496 	/*
497 	 * Clear the input queue.  This has to be done
498 	 * after the lower level protocol close routines have been
499 	 * called as they may want to do something about the queue.
500 	 */
501 	nuke_grams(&sockets[sock_id].inq);
502 
503 	bzero((caddr_t)&sockets[sock_id], sizeof (struct inetboot_socket));
504 	sockets[sock_id].type = INETBOOT_UNUSED;
505 
506 	return (0);
507 }
508 
509 /*
510  * Read up to `nbyte' of data from socket `s' into `buf'; if non-zero,
511  * then give up after `read_timeout' seconds.  Returns the number of
512  * bytes read, or -1 on failure.
513  */
514 int
515 socket_read(int s, void *buf, size_t nbyte, int read_timeout)
516 {
517 	ssize_t	n;
518 	uint_t	start, diff;
519 
520 	/*
521 	 * keep calling non-blocking recvfrom until something received
522 	 * or an error occurs
523 	 */
524 	start = prom_gettime();
525 	for (;;) {
526 		n = recvfrom(s, buf, nbyte, MSG_DONTWAIT, NULL, NULL);
527 		if (n == -1 && errno == EWOULDBLOCK) {
528 			diff = (uint_t)((prom_gettime() - start) + 500) / 1000;
529 			if (read_timeout != 0 && diff > read_timeout) {
530 				errno = EINTR;
531 				return (-1);
532 			}
533 		} else {
534 			return (n);
535 		}
536 	}
537 }
538 
/*
 * Write up to `nbyte' bytes of data from `buf' to the address pointed to
 * by `addr' using socket `s'.  This is sendto() with no flags.  Returns the
 * number of bytes written on success, or -1 on failure.
 */
int
socket_write(int s, const void *buf, size_t nbyte, struct sockaddr_in *addr)
{
	struct sockaddr *to = (struct sockaddr *)addr;

	return (sendto(s, buf, nbyte, 0, to, sizeof (*addr)));
}
550 
551 static int
552 bind_check(int sock_id, const struct sockaddr *addr)
553 {
554 	int k;
555 	struct sockaddr_in *in_addr = (struct sockaddr_in *)addr;
556 
557 	/* Do not check for duplicate bind() if SO_REUSEADDR option is set. */
558 	if (! (sockets[sock_id].so_opt & SO_REUSEADDR)) {
559 		for (k = 0; k < MAXSOCKET; k++) {
560 			if (sockets[k].type != INETBOOT_UNUSED &&
561 			    sockets[k].proto == sockets[sock_id].proto &&
562 			    sockets[k].bound) {
563 				if ((sockets[k].bind.sin_addr.s_addr ==
564 				    in_addr->sin_addr.s_addr) &&
565 				    (sockets[k].bind.sin_port ==
566 				    in_addr->sin_port)) {
567 					errno = EADDRINUSE;
568 					return (-1);
569 				}
570 			}
571 		}
572 	}
573 	return (0);
574 }
575 
576 /* Assign a name to an unnamed socket. */
577 int
578 bind(int s, const struct sockaddr *name, socklen_t namelen)
579 {
580 	int i;
581 
582 	errno = 0;
583 
584 	if ((i = so_check_fd(s, &errno)) == -1)
585 		return (-1);
586 
587 	if (name == NULL) {
588 		/* unbind */
589 		if (sockets[i].bound) {
590 			bzero((caddr_t)&sockets[i].bind,
591 			    sizeof (struct sockaddr_in));
592 			sockets[i].bound = B_FALSE;
593 		}
594 		return (0);
595 	}
596 	if (namelen != sizeof (struct sockaddr_in) || name == NULL) {
597 		errno = EINVAL;
598 		return (-1);
599 	}
600 	if (name->sa_family != AF_INET) {
601 		errno = EAFNOSUPPORT;
602 		return (-1);
603 	}
604 	if (sockets[i].bound) {
605 		if (bcmp((caddr_t)&sockets[i].bind, (caddr_t)name,
606 		    namelen) == 0) {
607 			/* attempt to bind to same address ok... */
608 			return (0);
609 		}
610 		errno = EINVAL;	/* already bound */
611 		return (-1);
612 	}
613 
614 	if (errno != 0) {
615 		return (-1);
616 	}
617 
618 	/* Check for duplicate bind(). */
619 	if (bind_check(i, name) < 0)
620 		return (-1);
621 
622 	bcopy((caddr_t)name, (caddr_t)&sockets[i].bind, namelen);
623 	if (sockets[i].type == INETBOOT_STREAM) {
624 		if (tcp_bind(i) < 0) {
625 			return (-1);
626 		}
627 	}
628 	sockets[i].bound = B_TRUE;
629 
630 	return (0);
631 }
632 
633 static int
634 quickbind(int sock_id)
635 {
636 	int i;
637 	struct sockaddr_in addr;
638 
639 	/*
640 	 * XXX This needs more work.  Right now, if ipv4_setipaddr()
641 	 * have not been called, this will be wrong.  But we need
642 	 * something better.  Need to be revisited.
643 	 */
644 	ipv4_getipaddr(&addr.sin_addr);
645 	addr.sin_family = AF_INET;
646 
647 	for (i = SMALLEST_ANON_PORT; i <= LARGEST_ANON_PORT; i++) {
648 		addr.sin_port = htons(i);
649 		if (bind_check(sock_id, (struct sockaddr *)&addr) == 0)
650 			break;
651 	}
652 	/* Need to clear errno as it is probably set by bind_check(). */
653 	errno = 0;
654 
655 	if (i <= LARGEST_ANON_PORT) {
656 		bcopy((caddr_t)&addr, (caddr_t)&sockets[sock_id].bind,
657 		    sizeof (struct sockaddr_in));
658 		sockets[sock_id].bound = B_TRUE;
659 #ifdef DEBUG
660 		printf("quick bind done addr %s port %d\n",
661 		    inet_ntoa(sockets[sock_id].bind.sin_addr),
662 		    ntohs(sockets[sock_id].bind.sin_port));
663 #endif
664 		return (0);
665 	} else {
666 		return (-1);
667 	}
668 }
669 
670 int
671 listen(int fd, int backlog)
672 {
673 	int sock_id;
674 
675 	errno = 0;
676 	if ((sock_id = so_check_fd(fd, &errno)) == -1)
677 		return (-1);
678 
679 	if (sockets[sock_id].type != INETBOOT_STREAM) {
680 		errno = EOPNOTSUPP;
681 		return (-1);
682 	}
683 	if (sockets[sock_id].so_error != 0) {
684 		errno = sockets[sock_id].so_error;
685 		return (-1);
686 	}
687 	return (tcp_listen(sock_id, backlog));
688 }
689 
690 int
691 accept(int fd, struct sockaddr *addr,  socklen_t *addr_len)
692 {
693 	int sock_id;
694 	int new_sd;
695 
696 	errno = 0;
697 	if ((sock_id = so_check_fd(fd, &errno)) == -1)
698 		return (-1);
699 
700 	if (sockets[sock_id].type != INETBOOT_STREAM) {
701 		errno = EOPNOTSUPP;
702 		return (-1);
703 	}
704 	if (sockets[sock_id].so_error != 0) {
705 		errno = sockets[sock_id].so_error;
706 		return (-1);
707 	}
708 	if ((new_sd = tcp_accept(sock_id, addr, addr_len)) == -1)
709 		return (-1);
710 	sock_id = so_check_fd(new_sd, &errno);
711 	sockets[sock_id].so_state |= SS_ISCONNECTED;
712 	return (new_sd);
713 }
714 
715 int
716 connect(int fd, const  struct sockaddr *addr, socklen_t addr_len)
717 {
718 	int sock_id;
719 	int so_type;
720 
721 	errno = 0;
722 	if ((sock_id = so_check_fd(fd, &errno)) == -1)
723 		return (-1);
724 
725 	so_type = sockets[sock_id].type;
726 
727 	if (addr == NULL || addr_len == 0) {
728 		errno = EINVAL;
729 		return (-1);
730 	}
731 	/* Don't allow connect for raw socket. */
732 	if (so_type == INETBOOT_RAW) {
733 		errno = EPROTONOSUPPORT;
734 		return (-1);
735 	}
736 
737 	if (sockets[sock_id].so_state & SS_ISCONNECTED) {
738 		errno = EINVAL;
739 		return (-1);
740 	}
741 
742 	if (sockets[sock_id].so_error != 0) {
743 		errno = sockets[sock_id].so_error;
744 		return (-1);
745 	}
746 
747 	/* If the socket is not bound, we need to do a quick bind. */
748 	if (!sockets[sock_id].bound) {
749 		/* For TCP socket, just call tcp_bind(). */
750 		if (so_type == INETBOOT_STREAM) {
751 			if (tcp_bind(sock_id) < 0)
752 				return (-1);
753 		} else {
754 			if (quickbind(sock_id) < 0) {
755 				errno = EADDRNOTAVAIL;
756 				return (-1);
757 			}
758 		}
759 	}
760 	/* Should do some sanity check for addr .... */
761 	bcopy((caddr_t)addr, &sockets[sock_id].remote,
762 	    sizeof (struct sockaddr_in));
763 
764 	if (sockets[sock_id].type == INETBOOT_STREAM) {
765 		/* Call TCP connect routine. */
766 		if (tcp_connect(sock_id) == 0)
767 			sockets[sock_id].so_state |= SS_ISCONNECTED;
768 		else {
769 			if (sockets[sock_id].so_error != 0)
770 				errno = sockets[sock_id].so_error;
771 			return (-1);
772 		}
773 	} else {
774 		sockets[sock_id].so_state |= SS_ISCONNECTED;
775 	}
776 	return (0);
777 }
778 
779 /* Just a wrapper around recvfrom(). */
780 ssize_t
781 recv(int s, void *buf, size_t len, int flags)
782 {
783 	return (recvfrom(s, buf, len, flags, NULL, NULL));
784 }
785 
/*
 * Receive data queued on socket `s' into `buf' (at most `len' bytes).
 * Legal flags are 0 and MSG_DONTWAIT. MSG_WAITALL is not currently
 * supported; any flag bit other than MSG_DONTWAIT fails with EINVAL.
 *
 * If both `from' and `fromlen' are non-NULL the peer address is copied out:
 * a stream socket reports its stored remote address, every other type
 * reports the source address recorded in the inetgram and clamps *fromlen
 * to the size of that address.
 *
 * Without MSG_DONTWAIT the call keeps polling until data arrives; with it,
 * an empty queue fails with EWOULDBLOCK.
 *
 * Returns the number of bytes copied, 0 when the socket is marked
 * SS_CANTRCVMORE and has no pending error, or -1 with errno set.  Note that
 * an invalid descriptor is reported as EINVAL here rather than ENOTSOCK.
 */
ssize_t
recvfrom(int s, void *buf, size_t len, int flags, struct sockaddr *from,
    socklen_t *fromlen)
{
	int			sock_id, i;
	ssize_t			datalen, bytes = 0;
	struct inetgram		*icp;
	enum SockType		so_type;
	char			*tmp_buf;
	mblk_t			*mp;

	errno = 0;

	/* The ENOTSOCK stored by so_check_fd() is overwritten with EINVAL. */
	if ((sock_id = so_check_fd(s, &errno)) == -1) {
		errno = EINVAL;
		return (-1);
	}

	/* A TCP socket must be connected before it can be read. */
	if (sockets[sock_id].type == INETBOOT_STREAM &&
	    !(sockets[sock_id].so_state & SS_ISCONNECTED)) {
		errno = ENOTCONN;
		return (-1);
	}

	if (buf == NULL || len == 0) {
		errno = EINVAL;
		return (-1);
	}
	/* Yup - MSG_WAITALL not implemented */
	if ((flags & ~MSG_DONTWAIT) != 0) {
		errno = EINVAL;
		return (-1);
	}

retry:
	/*
	 * Only poll the lower layers when nothing is queued.  The input
	 * routines are run bottom-up, from MEDIA_LVL up to (but excluding)
	 * APP_LVL.
	 */
	if (sockets[sock_id].inq == NULL) {
		/* Go out and check the wire */
		for (i = MEDIA_LVL; i < APP_LVL; i++) {
			if (sockets[sock_id].input[i] != NULL) {
				if (sockets[sock_id].input[i](sock_id) < 0) {
					if (sockets[sock_id].so_error != 0) {
						errno =
						    sockets[sock_id].so_error;
					}
					return (-1);
				}
			}
		}
	}

	so_type = sockets[sock_id].type;

	/* Remove unknown inetgrams from the head of inq.  Can this happen? */
	while ((icp = sockets[sock_id].inq) != NULL) {
		if ((so_type == INETBOOT_DGRAM ||
		    so_type == INETBOOT_STREAM) &&
		    icp->igm_level != APP_LVL) {
#ifdef	DEBUG
			printf("recvfrom: unexpected level %d frame found\n",
			    icp->igm_level);
#endif	/* DEBUG */
			del_gram(&sockets[sock_id].inq, icp, B_TRUE);
			continue;
		} else {
			break;
		}
	}


	/* icp is now the head of the queue, or NULL if the queue is empty. */
	if (icp == NULL) {
		/*
		 * Checking for error should be done everytime a lower layer
		 * input routing is called.  For example, if TCP gets a RST,
		 * this should be reported asap.
		 */
		if (sockets[sock_id].so_state & SS_CANTRCVMORE) {
			if (sockets[sock_id].so_error != 0) {
				errno = sockets[sock_id].so_error;
				return (-1);
			} else {
				/* Receive side closed, no error: report 0. */
				return (0);
			}
		}

		if ((flags & MSG_DONTWAIT) == 0)
			goto retry;	/* wait forever */

		/* no data */
		errno = EWOULDBLOCK;
		return (-1);
	}

	if (from != NULL && fromlen != NULL) {
		switch (so_type) {
		case INETBOOT_STREAM:
			/*
			 * Need to copy from the socket's remote address.
			 * NOTE(review): *fromlen is not updated on this
			 * path, unlike the datagram/raw path below.
			 */
			bcopy(&(sockets[sock_id].remote), from, MIN(*fromlen,
			    sizeof (struct sockaddr_in)));
			break;
		case INETBOOT_RAW:
		case INETBOOT_DGRAM:
		default:
			/* Report at most the size of the stored address. */
			if (*fromlen > sizeof (icp->igm_saddr))
				*fromlen = sizeof (icp->igm_saddr);
			bcopy((caddr_t)&(icp->igm_saddr), (caddr_t)from,
			    MIN(*fromlen, sizeof (struct sockaddr_in)));
			break;
		}
	}

	mp = icp->igm_mp;
	switch (so_type) {
	case INETBOOT_STREAM:
		/*
		 * If the message has igm_id == TCP_CALLB_MAGIC_ID, we need
		 * to drain the data held by tcp and try again.
		 */
		if (icp->igm_id == TCP_CALLB_MAGIC_ID) {
			del_gram(&sockets[sock_id].inq, icp, B_TRUE);
			tcp_rcv_drain_sock(sock_id);
			goto retry;
		}

		/*
		 * TCP should put only user data in the inetgram.
		 *
		 * Copy from consecutive inetgrams until the caller's buffer
		 * is full or the queue runs dry.
		 */
		tmp_buf = (char *)buf;
		while (len > 0 && icp != NULL) {
			datalen = mp->b_wptr - mp->b_rptr;
			if (len < datalen) {
				/*
				 * Partial read: advance b_rptr and leave
				 * the rest of this inetgram queued.
				 */
				bcopy(mp->b_rptr, tmp_buf, len);
				bytes += len;
				mp->b_rptr += len;
				break;
			} else {
				/* Whole inetgram consumed; free it. */
				bcopy(mp->b_rptr, tmp_buf, datalen);
				len -= datalen;
				bytes += datalen;
				tmp_buf += datalen;
				del_gram(&sockets[sock_id].inq, icp, B_TRUE);

				/*
				 * If we have any embedded magic messages just
				 * drop them.
				 */
				while ((icp = sockets[sock_id].inq) != NULL) {
					if (icp->igm_id != TCP_CALLB_MAGIC_ID)
						break;
					del_gram(&sockets[sock_id].inq, icp,
					    B_TRUE);
				}

				if (icp == NULL)
					break;
				mp = icp->igm_mp;
			}
		}
		/* Add the number of bytes delivered back to so_rcvbuf. */
		sockets[sock_id].so_rcvbuf += (int32_t)bytes;
		break;
	case INETBOOT_DGRAM:
		/*
		 * One datagram per call.  The inetgram is freed even when
		 * `len' is smaller than its payload, so the excess is lost.
		 */
		datalen = mp->b_wptr - mp->b_rptr;
		if (len < datalen)
			bytes = len;
		else
			bytes = datalen;
		bcopy(mp->b_rptr, buf, bytes);
		del_gram(&sockets[sock_id].inq, icp, B_TRUE);
		break;
	case INETBOOT_RAW:
	default:
		/* Same handling as the datagram case above. */
		datalen = mp->b_wptr - mp->b_rptr;
		if (len < datalen)
			bytes = len;
		else
			bytes = datalen;
		bcopy(mp->b_rptr, buf, bytes);
		del_gram(&sockets[sock_id].inq, icp, B_TRUE);
		break;
	}

#ifdef	DEBUG
	/*
	 * NOTE(review): on the datagram and raw paths icp has already been
	 * freed by del_gram() above, so this reads released memory in DEBUG
	 * builds.
	 */
	printf("recvfrom(%d): data: (0x%x,%d)\n", sock_id,
	    (icp != NULL) ? icp->igm_mp : 0, bytes);
#endif	/* DEBUG */
	return (bytes);
}
976 
977 
978 /* Just a wrapper around sendto(). */
979 ssize_t
980 send(int s, const void *msg, size_t len, int flags)
981 {
982 	return (sendto(s, msg, len, flags, NULL, 0));
983 }
984 
985 /*
986  * Transmit a message through a socket.
987  *
988  * Supported flags: MSG_DONTROUTE or 0.
989  */
990 ssize_t
991 sendto(int s, const void *msg, size_t len, int flags, const struct sockaddr *to,
992     socklen_t tolen)
993 {
994 	enum SockType so_type;
995 	int sock_id;
996 	ssize_t bytes;
997 
998 	errno = 0;
999 
1000 	if ((sock_id = so_check_fd(s, &errno)) == -1) {
1001 		return (-1);
1002 	}
1003 	if (msg == NULL) {
1004 		errno = EINVAL;
1005 		return (-1);
1006 	}
1007 	so_type = sockets[sock_id].type;
1008 	if ((flags & ~MSG_DONTROUTE) != 0) {
1009 		errno = EINVAL;
1010 		return (-1);
1011 	}
1012 	if (sockets[sock_id].so_error != 0) {
1013 		errno = sockets[sock_id].so_error;
1014 		return (-1);
1015 	}
1016 	if (to != NULL && to->sa_family != AF_INET) {
1017 		errno = EAFNOSUPPORT;
1018 		return (-1);
1019 	}
1020 
1021 	switch (so_type) {
1022 	case INETBOOT_RAW:
1023 	case INETBOOT_DGRAM:
1024 		if (!(sockets[sock_id].so_state & SS_ISCONNECTED) &&
1025 		    (to == NULL || tolen != sizeof (struct sockaddr_in))) {
1026 			errno = EINVAL;
1027 			return (-1);
1028 		}
1029 		bytes = dgram_sendto(sock_id, msg, len, flags, to, tolen);
1030 		break;
1031 	case INETBOOT_STREAM:
1032 		if (!((sockets[sock_id].so_state & SS_ISCONNECTED) ||
1033 		    (sockets[sock_id].so_state & SS_ISCONNECTING))) {
1034 			errno = EINVAL;
1035 			return (-1);
1036 		}
1037 		if (sockets[sock_id].so_state & SS_CANTSENDMORE) {
1038 			errno = EPIPE;
1039 			return (-1);
1040 		}
1041 		bytes = stream_sendto(sock_id, msg, len, flags);
1042 		break;
1043 	default:
1044 		/* Should not happen... */
1045 		errno = EPROTOTYPE;
1046 		return (-1);
1047 	}
1048 	return (bytes);
1049 }
1050 
1051 static ssize_t
1052 dgram_sendto(int i, const void *msg, size_t len, int flags,
1053     const struct sockaddr *to, int tolen)
1054 {
1055 	struct inetgram		oc;
1056 	int			l, offset;
1057 	size_t			tlen;
1058 	mblk_t			*mp;
1059 
1060 #ifdef	DEBUG
1061 	{
1062 	struct sockaddr_in *sin = (struct sockaddr_in *)to;
1063 	printf("sendto(%d): msg of length: %d sent to port %d and host: %s\n",
1064 	    i, len, ntohs(sin->sin_port), inet_ntoa(sin->sin_addr));
1065 	}
1066 #endif	/* DEBUG */
1067 
1068 	nuke_grams(&sockets[i].inq); /* flush the input queue */
1069 
1070 	/* calculate offset for data */
1071 	offset = sockets[i].headerlen[MEDIA_LVL](NULL) +
1072 	    (sockets[i].headerlen[NETWORK_LVL])(NULL);
1073 
1074 	bzero((caddr_t)&oc, sizeof (oc));
1075 	if (sockets[i].type != INETBOOT_RAW) {
1076 		offset += (sockets[i].headerlen[TRANSPORT_LVL])(NULL);
1077 		oc.igm_level = TRANSPORT_LVL;
1078 	} else
1079 		oc.igm_level = NETWORK_LVL;
1080 	oc.igm_oflags = flags;
1081 
1082 	if (to != NULL) {
1083 		bcopy((caddr_t)to, (caddr_t)&oc.igm_saddr, tolen);
1084 	} else {
1085 		bcopy((caddr_t)&sockets[i].remote, (caddr_t)&oc.igm_saddr,
1086 		    sizeof (struct sockaddr_in));
1087 	}
1088 
1089 	/* Get a legal source port if the socket isn't bound. */
1090 	if (sockets[i].bound == B_FALSE &&
1091 	    ntohs(oc.igm_saddr.sin_port == 0)) {
1092 		((struct sockaddr_in *)&oc.igm_saddr)->sin_port =
1093 		    get_source_port(B_FALSE);
1094 	}
1095 
1096 	/* Round up to 16bit value for checksum purposes */
1097 	if (sockets[i].type == INETBOOT_DGRAM) {
1098 		tlen = ((len + sizeof (uint16_t) - 1) &
1099 		    ~(sizeof (uint16_t) - 1));
1100 	} else
1101 		tlen = len;
1102 
1103 	if ((oc.igm_mp = allocb(tlen + offset, 0)) == NULL) {
1104 		errno = ENOMEM;
1105 		return (-1);
1106 	}
1107 	mp = oc.igm_mp;
1108 	mp->b_rptr = mp->b_wptr += offset;
1109 	bcopy((caddr_t)msg, mp->b_wptr, len);
1110 	mp->b_wptr += len;
1111 	for (l = TRANSPORT_LVL; l >= MEDIA_LVL; l--) {
1112 		if (sockets[i].output[l] != NULL) {
1113 			if (sockets[i].output[l](i, &oc) < 0) {
1114 				freeb(mp);
1115 				if (errno == 0)
1116 					errno = EIO;
1117 				return (-1);
1118 			}
1119 		}
1120 	}
1121 	freeb(mp);
1122 	return (len);
1123 }
1124 
1125 /* ARGSUSED */
1126 static ssize_t
1127 stream_sendto(int i, const void *msg, size_t len, int flags)
1128 {
1129 	int cnt;
1130 
1131 	assert(sockets[i].pcb != NULL);
1132 
1133 	/*
1134 	 * Call directly TCP's send routine.  We do this because TCP
1135 	 * needs to decide whether to send out the data.
1136 	 *
1137 	 * Note also that currently, TCP ignores all flags passed in for
1138 	 * TCP socket.
1139 	 */
1140 	if ((cnt = tcp_send(i, sockets[i].pcb, msg, len)) < 0) {
1141 		if (sockets[i].so_error != 0)
1142 			errno = sockets[i].so_error;
1143 		return (-1);
1144 	} else {
1145 		return (cnt);
1146 	}
1147 }
1148 
1149 /*
1150  * Returns ptr to the last inetgram in the list, or null if list is null
1151  */
1152 struct inetgram *
1153 last_gram(struct inetgram *igp)
1154 {
1155 	struct inetgram	*wp;
1156 	for (wp = igp; wp != NULL; wp = wp->igm_next) {
1157 		if (wp->igm_next == NULL)
1158 			return (wp);
1159 	}
1160 	return (NULL);
1161 }
1162 
1163 /*
1164  * Adds an inetgram or list of inetgrams to the end of the list.
1165  */
1166 void
1167 add_grams(struct inetgram **igpp, struct inetgram *newgp)
1168 {
1169 	struct inetgram	 *wp;
1170 
1171 	if (newgp == NULL)
1172 		return;
1173 
1174 	if (*igpp == NULL)
1175 		*igpp = newgp;
1176 	else {
1177 		wp = last_gram(*igpp);
1178 		wp->igm_next = newgp;
1179 	}
1180 }
1181 
1182 /*
1183  * Nuke a whole list of grams.
1184  */
1185 void
1186 nuke_grams(struct inetgram **lgpp)
1187 {
1188 	while (*lgpp != NULL)
1189 		del_gram(lgpp, *lgpp, B_TRUE);
1190 }
1191 
1192 /*
1193  * Remove the referenced inetgram. List is altered accordingly. Destroy the
1194  * referenced inetgram if freeit is B_TRUE.
1195  */
1196 void
1197 del_gram(struct inetgram **lgpp, struct inetgram *igp, int freeit)
1198 {
1199 	struct inetgram	*wp, *pp = NULL;
1200 
1201 	if (lgpp == NULL || igp == NULL)
1202 		return;
1203 
1204 	wp = *lgpp;
1205 	while (wp != NULL) {
1206 		if (wp == igp) {
1207 			/* detach wp from the list */
1208 			if (*lgpp == wp)
1209 				*lgpp = (*lgpp)->igm_next;
1210 			else
1211 				pp->igm_next = wp->igm_next;
1212 			igp->igm_next = NULL;
1213 
1214 			if (freeit) {
1215 				if (igp->igm_mp != NULL)
1216 					freeb(igp->igm_mp);
1217 				bkmem_free((caddr_t)igp,
1218 				    sizeof (struct inetgram));
1219 			}
1220 			break;
1221 		}
1222 		pp = wp;
1223 		wp = wp->igm_next;
1224 	}
1225 }
1226 
1227 struct nct_t nct[] = {
1228 	"bootp",	NCT_BOOTP_DHCP,
1229 	"dhcp",		NCT_BOOTP_DHCP,
1230 	"rarp",		NCT_RARP_BOOTPARAMS,
1231 	"manual",	NCT_MANUAL
1232 };
1233 int	nct_entries = sizeof (nct) / sizeof (nct[0]);
1234 
1235 /*
1236  * Figure out from the bootpath what kind of network configuration strategy
1237  * we should use. Returns the network config strategy.
1238  */
1239 int
1240 get_netconfig_strategy(void)
1241 {
1242 	int	i;
1243 #define	ISSPACE(c) (c == ' ' || c == '\t' || c == '\n' || c == '\0')
1244 	char	lbootpath[OBP_MAXPATHLEN];
1245 	char	net_options[NCT_BUFSIZE];
1246 	char	*op, *nop, *sp;
1247 	pnode_t	cn;
1248 	int	proplen;
1249 
1250 	/* If the PROM DHCP cache exists, we're done */
1251 	if (prom_cached_reply(B_TRUE))
1252 		return (NCT_BOOTP_DHCP);
1253 
1254 	/*
1255 	 *	Newer (version 4) PROMs will put the name in the
1256 	 *	"net-config-strategy" property.
1257 	 */
1258 	cn = prom_finddevice("/chosen");
1259 	if ((proplen = prom_getproplen(cn, "net-config-strategy")) <
1260 	    sizeof (net_options)) {
1261 		(void) prom_getprop(cn, "net-config-strategy", net_options);
1262 		net_options[proplen] = '\0';
1263 	} else {
1264 
1265 		/*
1266 		 * We're reduced to sacanning bootpath for the prototol to use.
1267 		 * Since there was no "net-config-strategy" property, this is
1268 		 * an old PROM, so we need to excise any extraneous key/value
1269 		 * initializations from bootpath[].
1270 		 */
1271 		for (op = prom_bootpath(), sp = lbootpath; op != NULL &&
1272 		    !ISSPACE(*op); sp++, op++)
1273 			*sp = *op;
1274 		*sp = '\0';
1275 		/* find the last '/' (in the device path) */
1276 		if ((op = strrchr(lbootpath, '/')) == NULL)	/* last '/' */
1277 			op = lbootpath;
1278 		else
1279 			op++;
1280 		/* then look for the ':' separating it from the protocol */
1281 		while (*op != ':' && *op != '\0')
1282 			op++;
1283 
1284 		if (*op == ':') {
1285 			for (nop = net_options, op++;
1286 			    *op != '\0' && *op != '/' && !ISSPACE(*op) &&
1287 			    nop < &net_options[NCT_BUFSIZE]; nop++, op++)
1288 				*nop = *op;
1289 			*nop = '\0';
1290 		} else
1291 			net_options[0] = '\0';
1292 	}
1293 
1294 #undef	ISSPACE
1295 
1296 	for (i = 0; i < nct_entries; i++)
1297 		if (strcmp(net_options, nct[i].p_name) == 0)
1298 			return (nct[i].p_id);
1299 
1300 	return (NCT_DEFAULT);
1301 }
1302 
1303 /* Modified STREAM routines for ease of porting core TCP code. */
1304 
1305 /*ARGSUSED*/
1306 mblk_t *
1307 allocb(size_t size, uint_t pri)
1308 {
1309 	unsigned char *base;
1310 	mblk_t *mp;
1311 
1312 	if ((mp = (mblk_t *)bkmem_zalloc(sizeof (mblk_t))) == NULL)
1313 		return (NULL);
1314 	if ((base = (unsigned char *)bkmem_zalloc(size)) == NULL)
1315 		return (NULL);
1316 
1317 	mp->b_next = mp->b_prev = mp->b_cont = NULL;
1318 	mp->b_rptr = mp->b_wptr = mp->b_datap = (unsigned char *)base;
1319 	mp->b_size = size;
1320 
1321 	return (mp);
1322 }
1323 
1324 void
1325 freeb(mblk_t *mp)
1326 {
1327 #ifdef DEBUG
1328 	printf("freeb datap %x\n", mp->b_datap);
1329 #endif
1330 	bkmem_free((caddr_t)(mp->b_datap), mp->b_size);
1331 #ifdef DEBUG
1332 	printf("freeb mp %x\n", mp);
1333 #endif
1334 	bkmem_free((caddr_t)mp, sizeof (mblk_t));
1335 }
1336 
1337 void
1338 freemsg(mblk_t *mp)
1339 {
1340 	while (mp) {
1341 		mblk_t *mp_cont = mp->b_cont;
1342 
1343 		freeb(mp);
1344 		mp = mp_cont;
1345 	}
1346 }
1347 
1348 mblk_t *
1349 copyb(mblk_t *bp)
1350 {
1351 	mblk_t *nbp;
1352 	unsigned char *ndp;
1353 
1354 	assert((uintptr_t)(bp->b_wptr - bp->b_rptr) >= 0);
1355 
1356 	if (!(nbp = allocb(bp->b_size, 0)))
1357 		return (NULL);
1358 	nbp->b_cont = NULL;
1359 	ndp = nbp->b_datap;
1360 
1361 	nbp->b_rptr = ndp + (bp->b_rptr - bp->b_datap);
1362 	nbp->b_wptr = nbp->b_rptr + (bp->b_wptr - bp->b_rptr);
1363 	bcopy(bp->b_datap, nbp->b_datap, bp->b_size);
1364 	return (nbp);
1365 }
1366 
1367 /* To simplify things, dupb() is implemented as copyb(). */
1368 mblk_t *
1369 dupb(mblk_t *mp)
1370 {
1371 	return (copyb(mp));
1372 }
1373 
1374 /*
1375  * get number of data bytes in message
1376  */
1377 size_t
1378 msgdsize(mblk_t *bp)
1379 {
1380 	size_t count = 0;
1381 
1382 	for (; bp != NULL; bp = bp->b_cont) {
1383 		assert(bp->b_wptr >= bp->b_rptr);
1384 		count += bp->b_wptr - bp->b_rptr;
1385 	}
1386 	return (count);
1387 }
1388