xref: /illumos-gate/usr/src/cmd/fm/modules/common/ip-transport/ip.c (revision e6f8def1ace27f327240a0b4b090911007f71137)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
24  */
25 
26 #include <sys/types.h>
27 #include <sys/socket.h>
28 #include <sys/sysmacros.h>
29 #include <sys/fm/protocol.h>
30 
31 #include <netinet/in.h>
32 #include <arpa/inet.h>
33 
34 #include <strings.h>
35 #include <unistd.h>
36 #include <pthread.h>
37 #include <alloca.h>
38 #include <fcntl.h>
39 #include <errno.h>
40 #include <netdb.h>
41 #include <poll.h>
42 #include <stdarg.h>
43 
44 #include <fm/fmd_api.h>
45 
/*
 * Wire-protocol magic and the verbosity levels understood by ip_debug().
 * The level in effect is read from the "ip_debug_level" property.
 */
#define	IP_MAGIC	"\177FMA" /* magic string identifying a packet header */
#define	IP_MAGLEN	4	/* length of magic string */
#define	IP_DEBUG_OFF	0	/* No informational debugging printed */
#define	IP_DEBUG_FINE	1	/* Basic debug information printed (default) */
#define	IP_DEBUG_FINER	2	/* More debug information printed. */
#define	IP_DEBUG_FINEST	3	/* All debug information printed */
52 
/*
 * Header placed in front of every packed event.  ip_fmdo_send() fills it in
 * and ip_xprt_recv_event() checks the magic and reads the size back.
 */
typedef struct ip_hdr {
	char iph_magic[IP_MAGLEN]; /* magic string (IP_MAGIC) */
	uint32_t iph_size;	/* packed nvlist size, network byte order */
} ip_hdr_t;
57 
/*
 * A growable scratch buffer.  The send and receive paths free and reallocate
 * ipb_buf (rounded up to a multiple of 16) whenever a larger size is needed.
 */
typedef struct ip_buf {
	void *ipb_buf;		/* data buffer */
	size_t ipb_size;	/* size of buffer */
} ip_buf_t;
62 
/*
 * Per-endpoint connection information.  One instance exists for the listen
 * address and one for each of the two configurable servers.
 */
typedef struct ip_cinfo {	/* Connection specific information */
	struct addrinfo *addr;	/* Connection address(es) from getaddrinfo() */
	char *name;		/* The name of the server or interface */
	int retry;		/* Retries left; decremented by ip_xprt_setup() */
} ip_cinfo_t;
68 
/*
 * State for one socket managed by this module: a listener, an accepted
 * connection, or an outbound client connection.  Instances are linked on the
 * global ip_xps list, which is protected by ip_lock.
 */
typedef struct ip_xprt {
	fmd_xprt_t *ipx_xprt;	/* transport handle (NULL until opened) */
	int ipx_flags;		/* transport flags */
	int ipx_fd;		/* socket file descriptor */
	int ipx_done;		/* flag indicating connection closed */
	pthread_t ipx_tid;	/* recv-side auxiliary thread */
	ip_buf_t ipx_sndbuf;	/* buffer for sending events */
	ip_buf_t ipx_rcvbuf;	/* buffer for receiving events */
	ip_cinfo_t *ipx_cinfo;	/* info for reconnect */
	char *ipx_addr;		/* address:port of remote connection */
	struct ip_xprt *ipx_next;	/* next ip_xprt in global list */
} ip_xprt_t;

/* Identifier used in log messages for a transport; may be NULL early on. */
#define	IPX_ID(a) ((a)->ipx_addr)
83 
/*
 * Module statistics.  The layout must stay an array of fmd_stat_t because
 * _fmd_init() registers it with fmd_stat_create() using
 * sizeof (ip_stat) / sizeof (fmd_stat_t) as the element count.
 */
typedef struct ip_stat {
	fmd_stat_t ips_accfail;	/* failed accepts */
	fmd_stat_t ips_badmagic; /* invalid packet headers */
	fmd_stat_t ips_packfail; /* failed packs */
	fmd_stat_t ips_unpackfail; /* failed unpacks */
} ip_stat_t;

/* Forward declarations: these are referenced before their definitions. */
static void ip_xprt_create(fmd_xprt_t *, int, int, ip_cinfo_t *, char *);
static void ip_xprt_destroy(ip_xprt_t *);

/* Statistic names, types and descriptions, in ip_stat_t member order. */
static ip_stat_t ip_stat = {
	{ "accfail", FMD_TYPE_UINT64, "failed accepts" },
	{ "badmagic", FMD_TYPE_UINT64, "invalid packet headers" },
	{ "packfail", FMD_TYPE_UINT64, "failed packs" },
	{ "unpackfail", FMD_TYPE_UINT64, "failed unpacks" },
};
100 
/*
 * Module-wide state.  Apart from ip_hdl, ip_lock, ip_xps, ip_auth and
 * ip_quit, each variable below is loaded from the like-named configuration
 * property in _fmd_init() (ip_size comes from "ip_bufsize", and the
 * ip_listen/ip_server/ip_server2 names come from "ip_bind_addr",
 * "ip_server" and "ip_server2").
 */
static fmd_hdl_t *ip_hdl;	/* module handle */
static pthread_mutex_t ip_lock;	/* lock for ip_xps list */
static ip_xprt_t *ip_xps;	/* list of active transports */
static nvlist_t *ip_auth;	/* authority to use for transport(s) */
static size_t ip_size;		/* default buffer size */
static volatile int ip_quit;	/* signal to quit */
static int ip_qlen;		/* queue length for listen(3SOCKET) */
static int ip_mtbf;		/* mtbf for simulating packet drop */
static int ip_external;		/* set transport to be "external" */
static int ip_no_remote_repair;	/* disallow remote repair */
static int ip_hconly;		/* only cache faults that are hc-scheme */
static int ip_rdonly;		/* force transport to be rdonly */
static int ip_hc_present_only;	/* only cache faults if hc-scheme and present */
static char *ip_domain_name;	/* set domain name for received list.suspects */
static hrtime_t ip_burp;	/* make mtbf slower by adding this much delay */
static int ip_translate;	/* call fmd_xprt_translate() before sending */
static char *ip_port;		/* port to connect to (or bind to if server) */
static int ip_retry;		/* retry count for ip_xprt_setup() -1=forever */
static hrtime_t ip_sleep;	/* sleep delay for ip_xprt_setup() */
static ip_cinfo_t ip_listen;	/* Transport service conn info for server */
static ip_cinfo_t ip_server;    /* Remote server connection info for client */
static ip_cinfo_t ip_server2;	/* Second remote server conn info for client */
static int ip_debug_level;	/* level for printing debug messages */
124 
125 /*
126  * Prints a debug message to the fmd debug framework if the debug level is set
127  * to at least the given level.
128  */
129 static void
130 ip_debug(int level, char *fmt, ...)
131 {
132 	if (ip_debug_level >= level) {
133 		va_list args;
134 		va_start(args, fmt);
135 		fmd_hdl_vdebug(ip_hdl, fmt, args);
136 		va_end(args);
137 	}
138 }
139 
140 /*
141  * Allocate space in ipx_sndbuf for a header and a packed XDR encoding of
142  * the specified nvlist, and then send the buffer to our remote peer.
143  */
144 static int
145 ip_fmdo_send(fmd_hdl_t *hdl, fmd_xprt_t *xp, fmd_event_t *ep, nvlist_t *nvl)
146 {
147 	ip_xprt_t *ipx;
148 	size_t size, nvsize;
149 	char *buf, *nvbuf;
150 	ip_hdr_t *iph;
151 	ssize_t r, n;
152 	int err;
153 
154 	if (xp == NULL) {
155 		ip_debug(IP_DEBUG_FINE, "ip_fmdo_send failed: xp=NULL\n");
156 		return (FMD_SEND_FAILED);
157 	}
158 	ipx = fmd_xprt_getspecific(hdl, xp);
159 
160 	/*
161 	 * For testing purposes, if ip_mtbf is non-zero, use this to pseudo-
162 	 * randomly simulate the need for retries.  If ip_burp is also set,
163 	 * then we also suspend the transport for a bit and wake it up again.
164 	 */
165 	if (ip_mtbf != 0 && gethrtime() % ip_mtbf == 0) {
166 		if (ip_burp != 0) {
167 			ip_debug(IP_DEBUG_FINE, "burping ipx %s", IPX_ID(ipx));
168 			ipx->ipx_flags |= FMD_XPRT_SUSPENDED;
169 			(void) fmd_timer_install(ip_hdl, ipx, NULL, ip_burp);
170 			fmd_xprt_suspend(ip_hdl, xp);
171 		}
172 		return (FMD_SEND_RETRY);
173 	}
174 
175 	if (ip_translate && (nvl = fmd_xprt_translate(hdl, xp, ep)) == NULL) {
176 		fmd_hdl_error(hdl, "failed to translate event %p", (void *)ep);
177 		return (FMD_SEND_FAILED);
178 	}
179 
180 	(void) nvlist_size(nvl, &nvsize, NV_ENCODE_XDR);
181 	size = r = sizeof (ip_hdr_t) + nvsize;
182 
183 	if (ipx->ipx_sndbuf.ipb_size < size) {
184 		fmd_hdl_free(hdl, ipx->ipx_sndbuf.ipb_buf,
185 		    ipx->ipx_sndbuf.ipb_size);
186 		ipx->ipx_sndbuf.ipb_size = P2ROUNDUP(size, 16);
187 		ipx->ipx_sndbuf.ipb_buf = fmd_hdl_alloc(hdl,
188 		    ipx->ipx_sndbuf.ipb_size, FMD_SLEEP);
189 	}
190 
191 	buf = ipx->ipx_sndbuf.ipb_buf;
192 	iph = (ip_hdr_t *)(uintptr_t)buf;
193 	nvbuf = buf + sizeof (ip_hdr_t);
194 
195 	bcopy(IP_MAGIC, iph->iph_magic, IP_MAGLEN);
196 	iph->iph_size = htonl(nvsize);
197 	err = nvlist_pack(nvl, &nvbuf, &nvsize, NV_ENCODE_XDR, 0);
198 
199 	if (ip_translate)
200 		nvlist_free(nvl);
201 
202 	if (err != 0) {
203 		fmd_hdl_error(ip_hdl, "failed to pack event for "
204 		    "transport %p: %s\n", (void *)ipx->ipx_xprt, strerror(err));
205 		ip_stat.ips_packfail.fmds_value.ui64++;
206 		return (FMD_SEND_FAILED);
207 	}
208 
209 	while (!ip_quit && r != 0) {
210 		if ((n = send(ipx->ipx_fd, buf, r, 0)) < 0) {
211 			if (errno != EINTR && errno != EWOULDBLOCK) {
212 				ip_debug(IP_DEBUG_FINE,
213 				    "failed to send to %s", IPX_ID(ipx));
214 				return (FMD_SEND_FAILED);
215 			}
216 			continue;
217 		}
218 		buf += n;
219 		r -= n;
220 	}
221 
222 	ip_debug(IP_DEBUG_FINEST, "Sent event %d bytes to %s",
223 	    size, IPX_ID(ipx));
224 	return (FMD_SEND_SUCCESS);
225 }
226 
227 /*
228  * Sends events over transports that are configured read only.  When the module
229  * is in read only mode it will receive all events and only send events that
230  * have a subscription set.
231  *
232  * The configuration file will have to set prop ip_rdonly true and also
233  * subscribe for events that are desired to be sent over the transport in order
234  * for this function to be used.
235  */
236 /* ARGSUSED */
237 static void
238 ip_fmdo_recv(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl, const char *class)
239 {
240 	int err;
241 	ip_xprt_t *ipx;
242 
243 	if (ip_rdonly) {
244 		(void) pthread_mutex_lock(&ip_lock);
245 
246 		for (ipx = ip_xps; ipx != NULL; ipx = ipx->ipx_next) {
247 			err = ip_fmdo_send(hdl, ipx->ipx_xprt, ep, nvl);
248 			while (FMD_SEND_RETRY == err) {
249 				err = ip_fmdo_send(hdl, ipx->ipx_xprt, ep, nvl);
250 			}
251 		}
252 		(void) pthread_mutex_unlock(&ip_lock);
253 	}
254 }
255 
256 /*
257  * Receive a chunk of data of the specified size from our remote peer.  The
258  * data is received into ipx_rcvbuf, and then a pointer to the buffer is
259  * returned.  NOTE: The data is only valid until the next call to ip_xprt_recv.
260  * If the connection breaks or ip_quit is set during receive, NULL is returned.
261  */
262 static void *
263 ip_xprt_recv(ip_xprt_t *ipx, size_t size)
264 {
265 	char *buf = ipx->ipx_rcvbuf.ipb_buf;
266 	ssize_t n, r = size;
267 
268 	if (ipx->ipx_rcvbuf.ipb_size < size) {
269 		fmd_hdl_free(ip_hdl, ipx->ipx_rcvbuf.ipb_buf,
270 		    ipx->ipx_rcvbuf.ipb_size);
271 		ipx->ipx_rcvbuf.ipb_size = P2ROUNDUP(size, 16);
272 		ipx->ipx_rcvbuf.ipb_buf = buf = fmd_hdl_alloc(ip_hdl,
273 		    ipx->ipx_rcvbuf.ipb_size, FMD_SLEEP);
274 	}
275 
276 	while (!ip_quit && r != 0) {
277 		if ((n = recv(ipx->ipx_fd, buf, r, MSG_WAITALL)) == 0) {
278 			ipx->ipx_done++;
279 			return (NULL);
280 		}
281 
282 		if (n < 0) {
283 			if (errno != EINTR && errno != EWOULDBLOCK) {
284 				ip_debug(IP_DEBUG_FINE,
285 				    "failed to recv on ipx %s", IPX_ID(ipx));
286 			}
287 			continue;
288 		}
289 		/* Reset retry counter after a successful connection */
290 		if (ipx->ipx_cinfo) {
291 			ipx->ipx_cinfo->retry = ip_retry;
292 		}
293 
294 		buf += n;
295 		r -= n;
296 	}
297 
298 	return (r ? NULL: ipx->ipx_rcvbuf.ipb_buf);
299 }
300 
301 /*
302  * Sets the address/port of the remote connection in the connection info struct
303  * This is called after a TCP session has been set up with a known remote
304  * address (sap)
305  */
306 static void
307 ip_xprt_set_addr(ip_xprt_t *ipx, const struct sockaddr *sap)
308 {
309 	const struct sockaddr_in6 *sin6 = (const void *)sap;
310 	const struct sockaddr_in *sin = (const void *)sap;
311 
312 	char buf[INET6_ADDRSTRLEN + 16];
313 	struct in_addr v4addr;
314 	in_port_t port;
315 	int n;
316 
317 	ip_debug(IP_DEBUG_FINER, "Enter ip_xprt_set_addr");
318 
319 	if (sap->sa_family == AF_INET6 &&
320 	    IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) {
321 		IN6_V4MAPPED_TO_INADDR(&sin6->sin6_addr, &v4addr);
322 		(void) inet_ntop(AF_INET, &v4addr, buf, sizeof (buf));
323 		port = ntohs(sin6->sin6_port);
324 	} else if (sap->sa_family == AF_INET6) {
325 		(void) inet_ntop(AF_INET6, &sin6->sin6_addr, buf, sizeof (buf));
326 		port = ntohs(sin6->sin6_port);
327 	} else {
328 		(void) inet_ntop(AF_INET, &sin->sin_addr, buf, sizeof (buf));
329 		port = ntohs(sin->sin_port);
330 	}
331 
332 	n = strlen(buf);
333 	(void) snprintf(buf + n, sizeof (buf) - n, ":%u", port);
334 
335 	if (ipx->ipx_addr)
336 		fmd_hdl_strfree(ip_hdl, ipx->ipx_addr);
337 	ipx->ipx_addr = fmd_hdl_strdup(ip_hdl, buf, FMD_SLEEP);
338 	ip_debug(IP_DEBUG_FINE, "connection addr is %s on %p",
339 	    ipx->ipx_addr, (void *)ipx);
340 }
341 
342 static nvlist_t *
343 ip_xprt_auth(ip_xprt_t *ipx)
344 {
345 	nvlist_t *nvl;
346 	int err;
347 
348 	ip_debug(IP_DEBUG_FINER, "Enter ip_xprt_auth");
349 
350 	if (ip_auth != NULL)
351 		err = nvlist_dup(ip_auth, &nvl, 0);
352 	else
353 		err = nvlist_alloc(&nvl, 0, 0);
354 
355 	if (err != 0) {
356 		fmd_hdl_abort(ip_hdl, "failed to create nvlist for "
357 		    "authority: %s\n", strerror(err));
358 	}
359 
360 	if (ip_auth != NULL)
361 		return (nvl);
362 
363 	ip_debug(IP_DEBUG_FINE, "ip_authority %s=%s\n",
364 	    FM_FMRI_AUTH_SERVER, ipx->ipx_addr);
365 
366 	(void) nvlist_add_uint8(nvl, FM_VERSION, FM_FMRI_AUTH_VERSION);
367 	(void) nvlist_add_string(nvl, FM_FMRI_AUTH_SERVER, ipx->ipx_addr);
368 
369 	return (nvl);
370 }
371 
372 static void
373 ip_xprt_accept(ip_xprt_t *ipx)
374 {
375 	struct sockaddr_storage sa;
376 	socklen_t salen = sizeof (sa);
377 	fmd_xprt_t *xp;
378 	int fd;
379 
380 	ip_debug(IP_DEBUG_FINER, "Enter ip_xprt_accept");
381 
382 	if ((fd = accept(ipx->ipx_fd, (struct sockaddr *)&sa, &salen)) == -1) {
383 		fmd_hdl_error(ip_hdl, "failed to accept connection");
384 		ip_stat.ips_accfail.fmds_value.ui64++;
385 		return;
386 	}
387 	ip_debug(IP_DEBUG_FINE, "Accepted socket on fd %d", fd);
388 
389 	ip_xprt_set_addr(ipx, (struct sockaddr *)&sa);
390 	xp = fmd_xprt_open(ip_hdl, ipx->ipx_flags,
391 	    ip_xprt_auth(ipx), NULL);
392 	ip_xprt_create(xp, fd, ipx->ipx_flags, &ip_listen, ipx->ipx_addr);
393 }
394 
395 static void
396 ip_xprt_recv_event(ip_xprt_t *ipx)
397 {
398 	ip_hdr_t *iph;
399 	nvlist_t *nvl;
400 	size_t size;
401 	void *buf;
402 	int err;
403 
404 	if ((iph = ip_xprt_recv(ipx, sizeof (ip_hdr_t))) == NULL)
405 		return; /* connection broken */
406 
407 	if (bcmp(iph->iph_magic, IP_MAGIC, IP_MAGLEN) != 0) {
408 		fmd_hdl_error(ip_hdl,
409 		    "invalid hdr magic %x.%x.%x.%x from transport %s\n",
410 		    iph->iph_magic[0], iph->iph_magic[1], iph->iph_magic[2],
411 		    iph->iph_magic[3], IPX_ID(ipx));
412 		ip_stat.ips_badmagic.fmds_value.ui64++;
413 		return;
414 	}
415 
416 	size = ntohl(iph->iph_size);
417 
418 	if ((buf = ip_xprt_recv(ipx, size)) == NULL)
419 		return; /* connection broken */
420 
421 	if ((err = nvlist_unpack(buf, size, &nvl, 0)) != 0) {
422 		fmd_hdl_error(ip_hdl, "failed to unpack event from "
423 		    "transport %s: %s\n",
424 		    IPX_ID(ipx), strerror(err));
425 		ip_stat.ips_unpackfail.fmds_value.ui64++;
426 	} else {
427 		if (ip_domain_name)
428 			fmd_xprt_add_domain(ip_hdl, nvl, ip_domain_name);
429 		fmd_xprt_post(ip_hdl, ipx->ipx_xprt, nvl, 0);
430 	}
431 
432 	if (fmd_xprt_error(ip_hdl, ipx->ipx_xprt)) {
433 		fmd_hdl_error(ip_hdl, "protocol error on transport %p",
434 		    (void *)ipx->ipx_xprt);
435 		ipx->ipx_done++;
436 	}
437 	ip_debug(IP_DEBUG_FINEST, "Recv event %d bytes from %s",
438 	    size, IPX_ID(ipx));
439 }
440 
441 static void
442 ip_xprt_thread(void *arg)
443 {
444 	ip_xprt_t *ipx = arg;
445 	struct sockaddr_storage sa;
446 	socklen_t salen = sizeof (sa);
447 	struct pollfd pfd;
448 	id_t id;
449 
450 	ip_debug(IP_DEBUG_FINER, "Enter ip_xprt_thread");
451 
452 	while (!ip_quit && !ipx->ipx_done) {
453 		if (ipx->ipx_xprt != NULL || (ipx->ipx_flags & FMD_XPRT_ACCEPT))
454 			pfd.events = POLLIN;
455 		else
456 			pfd.events = POLLOUT;
457 
458 		pfd.fd = ipx->ipx_fd;
459 		pfd.revents = 0;
460 
461 		if (poll(&pfd, 1, -1) <= 0)
462 			continue; /* loop around and check ip_quit */
463 
464 		if (pfd.revents & (POLLHUP | POLLERR)) {
465 			ip_debug(IP_DEBUG_FINE, "hangup fd %d\n", ipx->ipx_fd);
466 			break;
467 		}
468 
469 		if (pfd.revents & POLLOUT) {
470 			/*
471 			 * Once we're connected, there's no reason to have our
472 			 * calls to recv() and send() be non-blocking since we
473 			 * we have separate threads for each: clear O_NONBLOCK.
474 			 */
475 			(void) fcntl(ipx->ipx_fd, F_SETFL,
476 			    fcntl(ipx->ipx_fd, F_GETFL, 0) & ~O_NONBLOCK);
477 
478 			if (getpeername(ipx->ipx_fd, (struct sockaddr *)&sa,
479 			    &salen) != 0) {
480 				fmd_hdl_error(ip_hdl, "failed to get peer name "
481 				    "for fd %d", ipx->ipx_fd);
482 				bzero(&sa, sizeof (sa));
483 				break;
484 			}
485 			ip_xprt_set_addr(ipx, (struct sockaddr *)&sa);
486 			ipx->ipx_xprt = fmd_xprt_open(ip_hdl, ipx->ipx_flags,
487 			    ip_xprt_auth(ipx), ipx);
488 
489 			ip_debug(IP_DEBUG_FINE, "connect fd %d ipx %p",
490 			    ipx->ipx_fd, (void *)ipx);
491 			continue;
492 		}
493 
494 		if (pfd.revents & POLLIN) {
495 			if (ipx->ipx_xprt == NULL)
496 				ip_xprt_accept(ipx);
497 			else
498 				ip_xprt_recv_event(ipx);
499 		}
500 	}
501 
502 	id = fmd_timer_install(ip_hdl, ipx, NULL, 0);
503 	ip_debug(IP_DEBUG_FINE, "close fd %d (timer %d)", ipx->ipx_fd, (int)id);
504 }
505 
506 static void
507 ip_xprt_create(fmd_xprt_t *xp, int fd, int flags, ip_cinfo_t *cinfo, char *addr)
508 {
509 	ip_xprt_t *ipx = fmd_hdl_zalloc(ip_hdl, sizeof (ip_xprt_t), FMD_SLEEP);
510 
511 	ip_debug(IP_DEBUG_FINER, "Enter ip_xprt_create %p", (void *)ipx);
512 
513 	ipx->ipx_xprt = xp;
514 	ipx->ipx_flags = flags;
515 	ipx->ipx_fd = fd;
516 	ipx->ipx_tid = fmd_thr_create(ip_hdl, ip_xprt_thread, ipx);
517 	ipx->ipx_cinfo = cinfo;
518 	ipx->ipx_addr = fmd_hdl_strdup(ip_hdl, addr, FMD_SLEEP);
519 
520 	if (ipx->ipx_xprt != NULL)
521 		fmd_xprt_setspecific(ip_hdl, ipx->ipx_xprt, ipx);
522 
523 	(void) pthread_mutex_lock(&ip_lock);
524 
525 	ipx->ipx_next = ip_xps;
526 	ip_xps = ipx;
527 
528 	(void) pthread_mutex_unlock(&ip_lock);
529 }
530 
531 static void
532 ip_xprt_destroy(ip_xprt_t *ipx)
533 {
534 	ip_xprt_t *ipp, **ppx = &ip_xps;
535 
536 	ip_debug(IP_DEBUG_FINER, "Enter ip_xprt_destory %s %p",
537 	    IPX_ID(ipx), (void *)ipx);
538 
539 	(void) pthread_mutex_lock(&ip_lock);
540 
541 	for (ipp = *ppx; ipp != NULL; ipp = ipp->ipx_next) {
542 		if (ipp != ipx)
543 			ppx = &ipp->ipx_next;
544 		else
545 			break;
546 	}
547 
548 	if (ipp != ipx) {
549 		(void) pthread_mutex_unlock(&ip_lock);
550 		fmd_hdl_abort(ip_hdl, "ipx %p not on xps list\n", (void *)ipx);
551 	}
552 
553 	*ppx = ipx->ipx_next;
554 	ipx->ipx_next = NULL;
555 
556 	(void) pthread_mutex_unlock(&ip_lock);
557 
558 	fmd_thr_signal(ip_hdl, ipx->ipx_tid);
559 	fmd_thr_destroy(ip_hdl, ipx->ipx_tid);
560 
561 	if (ipx->ipx_xprt != NULL)
562 		fmd_xprt_close(ip_hdl, ipx->ipx_xprt);
563 
564 	fmd_hdl_free(ip_hdl, ipx->ipx_sndbuf.ipb_buf, ipx->ipx_sndbuf.ipb_size);
565 	fmd_hdl_free(ip_hdl, ipx->ipx_rcvbuf.ipb_buf, ipx->ipx_rcvbuf.ipb_size);
566 
567 	(void) close(ipx->ipx_fd);
568 	if (ipx->ipx_addr) {
569 		fmd_hdl_strfree(ip_hdl, ipx->ipx_addr);
570 		ipx->ipx_addr = NULL;
571 	}
572 	fmd_hdl_free(ip_hdl, ipx, sizeof (ip_xprt_t));
573 }
574 
575 /*
576  * Loop through the addresses in the connection info structure that were
577  * created by getaddrinfo() in ip_setup_addr during initialization (_fmd_init)
578  * and for each one attempt to create a socket and initialize it.  If we are
579  * successful, return zero.  If we fail, we check ip_retry: if it is non-zero
580  * we return the last errno and let our caller retry ip_xprt_setup() later.  If
581  * ip_retry reaches zero, we call fmd_hdl_abort() with an appropriate message.
582  */
583 static int
584 ip_xprt_setup(fmd_hdl_t *hdl, ip_cinfo_t *cinfo)
585 {
586 	int err, fd, oflags, xflags, optval = 1;
587 	struct addrinfo *aip;
588 	const char *s1, *s2;
589 	struct addrinfo *ail = cinfo->addr;
590 
591 	ip_debug(IP_DEBUG_FINER, "Enter ip_xprt_setup");
592 
593 	/*
594 	 * Set up flags as specified in the .conf file. Note that these are
595 	 * mostly only used for testing purposes, allowing the transport to
596 	 * be set up in various modes.
597 	 */
598 	if (ail != ip_listen.addr)
599 		xflags = (ip_rdonly == FMD_B_TRUE) ? FMD_XPRT_RDONLY :
600 		    FMD_XPRT_RDWR;
601 	else
602 		xflags = ((ip_rdonly == FMD_B_TRUE) ? FMD_XPRT_RDONLY :
603 		    FMD_XPRT_RDWR) | FMD_XPRT_ACCEPT;
604 
605 	if (ip_external == FMD_B_TRUE)
606 		xflags |= FMD_XPRT_EXTERNAL;
607 	if (ip_no_remote_repair == FMD_B_TRUE)
608 		xflags |= FMD_XPRT_NO_REMOTE_REPAIR;
609 	if (ip_hconly == FMD_B_TRUE)
610 		xflags |= FMD_XPRT_HCONLY;
611 	if (ip_hc_present_only == FMD_B_TRUE)
612 		xflags |= FMD_XPRT_HC_PRESENT_ONLY;
613 
614 	for (aip = ail; aip != NULL; aip = aip->ai_next) {
615 		if (aip->ai_family != AF_INET && aip->ai_family != AF_INET6)
616 			continue; /* ignore anything that isn't IPv4 or IPv6 */
617 
618 		if ((fd = socket(aip->ai_family,
619 		    aip->ai_socktype, aip->ai_protocol)) == -1) {
620 			err = errno;
621 			continue;
622 		}
623 
624 		oflags = fcntl(fd, F_GETFL, 0);
625 		(void) fcntl(fd, F_SETFL, oflags | O_NONBLOCK);
626 
627 		if (xflags & FMD_XPRT_ACCEPT) {
628 			err = setsockopt(fd, SOL_SOCKET,
629 			    SO_REUSEADDR, &optval, sizeof (optval)) != 0 ||
630 			    bind(fd, aip->ai_addr, aip->ai_addrlen) != 0 ||
631 			    listen(fd, ip_qlen) != 0;
632 		} else {
633 			err = connect(fd, aip->ai_addr, aip->ai_addrlen);
634 			if (err)
635 				err = errno;
636 			if (err == EINPROGRESS)
637 				err = 0;
638 		}
639 
640 		if (err == 0) {
641 			ip_xprt_create(NULL, fd, xflags, cinfo, NULL);
642 			ip_debug(IP_DEBUG_FINER, "Exit ip_xprt_setup");
643 			return (0);
644 		}
645 
646 		ip_debug(IP_DEBUG_FINE, "Error=%d errno=%d", err, errno);
647 
648 		err = errno;
649 		(void) close(fd);
650 	}
651 
652 	if (cinfo->name != NULL) {
653 		s1 = "failed to connect to";
654 		s2 = cinfo->name;
655 	} else {
656 		s1 = "failed to listen on";
657 		s2 = ip_port;
658 	}
659 
660 	if (err == EACCES || cinfo->retry-- == 0)
661 		fmd_hdl_abort(hdl, "%s %s: %s\n", s1, s2, strerror(err));
662 
663 	ip_debug(IP_DEBUG_FINE, "%s %s: %s (will retry)\n",
664 	    s1, s2, strerror(err));
665 	ip_debug(IP_DEBUG_FINER, "Exit ip_xprt_setup");
666 	return (err);
667 }
668 
669 /*
670  * Free address based resources
671  */
672 static void
673 ip_addr_cleanup()
674 {
675 	if (ip_listen.addr != NULL) {
676 		freeaddrinfo(ip_listen.addr);
677 		ip_listen.addr = NULL;
678 	}
679 	if (ip_server.addr != NULL) {
680 		freeaddrinfo(ip_server.addr);
681 		ip_server.addr = NULL;
682 	}
683 	if (ip_server2.addr != NULL) {
684 		freeaddrinfo(ip_server2.addr);
685 		ip_server2.addr = NULL;
686 	}
687 	fmd_prop_free_string(ip_hdl, ip_server.name);
688 	fmd_prop_free_string(ip_hdl, ip_server2.name);
689 	fmd_prop_free_string(ip_hdl, ip_port);
690 }
691 
692 /*
693  * Setup a single address for ip connection.
694  */
695 static int
696 ip_setup_addr(ip_cinfo_t *cinfo)
697 {
698 	struct addrinfo aih;
699 	char *server = cinfo->name;
700 	int err;
701 
702 	bzero(&aih, sizeof (aih));
703 	aih.ai_flags = AI_ADDRCONFIG;
704 	aih.ai_family = AF_UNSPEC;
705 	aih.ai_socktype = SOCK_STREAM;
706 	if (server != NULL) {
707 		ip_debug(IP_DEBUG_FINE, "resolving %s:%s\n", server, ip_port);
708 	} else {
709 		aih.ai_flags |= AI_PASSIVE;
710 		cinfo->name = "localhost";
711 	}
712 
713 	err = getaddrinfo(server, ip_port, &aih, &cinfo->addr);
714 	if (err != 0) {
715 		fmd_hdl_error(ip_hdl, "failed to resolve host %s port %s: %s\n",
716 		    cinfo->name, ip_port, gai_strerror(err));
717 		cinfo->addr = NULL;
718 	}
719 	return (err);
720 }
721 
722 /*
723  * Setup all IP addresses for network configuration.
724  * The listen address for for a service that will bind to clients.
725  * A client can connect up to two servers using ip_server and ip_server2
726  * properties.
727  */
728 static int
729 ip_setup_addrs()
730 {
731 	int err = 0;
732 	ip_listen.addr = NULL;
733 	ip_server.addr = NULL;
734 	ip_server.retry = ip_retry;
735 	ip_server2.addr = NULL;
736 
737 	if ((ip_server.name == NULL && ip_server2.name == NULL) ||
738 	    ip_listen.name) {
739 		err = ip_setup_addr(&ip_listen);
740 	}
741 	if (ip_server.name != NULL && err == 0) {
742 		err = ip_setup_addr(&ip_server);
743 	}
744 	if (ip_server2.name != NULL && err == 0) {
745 		err = ip_setup_addr(&ip_server2);
746 	}
747 	if (err != 0) {
748 		ip_addr_cleanup();
749 	}
750 	return (err);
751 }
752 
753 /*
754  * Timeout handler for the transport module.  We use these types of timeouts:
755  *
756  * (a) arg is ip_cinfo_t: attempt ip_xprt_setup(), re-install timeout to retry
757  * (b) arg is ip_xprt_t, FMD_XPRT_SUSPENDED: call fmd_xprt_resume() on arg
758  * (c) arg is ip_xprt_t, !FMD_XPRT_SUSPENDED: call ip_xprt_destroy() on arg
759  * (d) arg is NULL, ignore as this shouldn't happen
760  *
761  * Case (c) is required as we need to cause the module's main thread, which
762  * runs this timeout handler, to join with the transport's auxiliary thread.
763  * If the connection is a client then a timer will be installed to retry
764  * connecting to the server.
765  */
766 static void
767 ip_timeout(fmd_hdl_t *hdl, id_t id, void *arg) {
768 	int install_timer;
769 	ip_cinfo_t *cinfo;
770 	ip_xprt_t *ipx;
771 
772 	if (arg == NULL) {
773 		fmd_hdl_error(hdl, "ip_timeout failed because hg arg is NULL");
774 	} else if (arg == &ip_server || arg == &ip_server2 ||
775 	    arg == &ip_listen) {
776 		ip_debug(IP_DEBUG_FINER,
777 			"Enter ip_timeout (a) install new timer");
778 		if (ip_xprt_setup(hdl, arg) != 0)
779 			(void) fmd_timer_install(hdl, arg, NULL, ip_sleep);
780 	} else {
781 		ipx = arg;
782 		if (ipx->ipx_flags & FMD_XPRT_SUSPENDED) {
783 			ip_debug(IP_DEBUG_FINE, "timer %d waking ipx %p",
784 				(int)id, arg);
785 			ipx->ipx_flags &= ~FMD_XPRT_SUSPENDED;
786 			fmd_xprt_resume(hdl, ipx->ipx_xprt);
787 		} else {
788 			ip_debug(IP_DEBUG_FINE, "timer %d closing ipx %p",
789 				(int)id, arg);
790 			cinfo = ipx->ipx_cinfo;
791 			install_timer = (ipx->ipx_flags & FMD_XPRT_ACCEPT) !=
792 				FMD_XPRT_ACCEPT;
793 			ip_xprt_destroy(ipx);
794 			if (install_timer)
795 				(void) fmd_timer_install(
796 					hdl, cinfo, NULL, ip_sleep);
797 		}
798 	}
799 }
800 
/*
 * Configuration properties of the module with their types and default values.
 * _fmd_init() reads most of them into the ip_* globals; the table is
 * terminated by an all-NULL entry.
 */
static const fmd_prop_t fmd_props[] = {
	{ "ip_authority", FMD_TYPE_STRING, NULL },
	{ "ip_bufsize", FMD_TYPE_SIZE, "4k" },
	{ "ip_burp", FMD_TYPE_TIME, "0" },
	{ "ip_enable", FMD_TYPE_BOOL, "false" },
	{ "ip_mtbf", FMD_TYPE_INT32, "0" },
	{ "ip_external", FMD_TYPE_BOOL, "true" },
	{ "ip_no_remote_repair", FMD_TYPE_BOOL, "true" },
	{ "ip_hconly", FMD_TYPE_BOOL, "false" },
	{ "ip_rdonly", FMD_TYPE_BOOL, "false" },
	{ "ip_hc_present_only", FMD_TYPE_BOOL, "false" },
	{ "ip_domain_name", FMD_TYPE_STRING, NULL },
	{ "ip_port", FMD_TYPE_STRING, "664" },
	{ "ip_qlen", FMD_TYPE_INT32, "32" },
	{ "ip_retry", FMD_TYPE_INT32, "-1" },	    /* -1=forever */
	{ "ip_server", FMD_TYPE_STRING, NULL },	    /* server name */
	{ "ip_server2", FMD_TYPE_STRING, NULL },    /* secondary server name */
	{ "ip_sleep", FMD_TYPE_TIME, "10s" },
	{ "ip_translate", FMD_TYPE_BOOL, "false" },
	{ "ip_bind_addr", FMD_TYPE_STRING, NULL },  /* network interface addr */
	{ "ip_debug_level", FMD_TYPE_INT32, "1" },  /* debug levels 0-3 */
	{ NULL, 0, NULL }
};
824 
/*
 * Entry points handed to fmd.  Only the receive, timeout and send hooks are
 * provided by this module; the remaining slots are left NULL.
 */
static const fmd_hdl_ops_t fmd_ops = {
	ip_fmdo_recv,		/* fmdo_recv */
	ip_timeout,		/* fmdo_timeout */
	NULL,			/* fmdo_close */
	NULL,			/* fmdo_stats */
	NULL,			/* fmdo_gc */
	ip_fmdo_send,		/* fmdo_send */
};
833 
/*
 * Module description passed to fmd_hdl_register() in _fmd_init(): name,
 * version string, entry points and property table.
 */
static const fmd_hdl_info_t fmd_info = {
	"IP Transport Agent", "1.0", &fmd_ops, fmd_props
};
837 
838 /*
839  * Initialize the ip-transport module as either a server or a client.  Note
840  * that the ip-transport module is not enabled by default under Solaris:
841  * at present we require a developer or tool to "setprop ip_enable true".
842  * If ip-transport is needed in the future out-of-the-box on one or more Sun
843  * platforms, the code to check 'ip_enable' should be replaced with:
844  *
845  * (a) configuring ip-transport to operate in client mode by default,
846  * (b) a platform-specific configuration mechanism, or
847  * (c) a means to assure security and prevent denial-of-service attacks.
848  *
849  * Note that (c) is only an issue when the transport module operates
850  * in server mode (i.e. with the ip_server property set to NULL) on a
851  * generic Solaris system which may be exposed directly to the Internet.
852  * The property ip_bind_addr can be used to define a private network interface
853  * to use so that the service is not exposed to the Internet.
854  */
855 void
856 _fmd_init(fmd_hdl_t *hdl)
857 {
858 	char *auth, *p, *q, *r, *s;
859 
860 	if (fmd_hdl_register(hdl, FMD_API_VERSION, &fmd_info) != 0)
861 		return; /* failed to register handle */
862 
863 	if (fmd_prop_get_int32(hdl, "ip_enable") == FMD_B_FALSE) {
864 		fmd_hdl_unregister(hdl);
865 		return;
866 	}
867 
868 	(void) fmd_stat_create(hdl, FMD_STAT_NOALLOC,
869 	    sizeof (ip_stat) / sizeof (fmd_stat_t), (fmd_stat_t *)&ip_stat);
870 
871 	ip_hdl = hdl;
872 	(void) pthread_mutex_init(&ip_lock, NULL);
873 
874 	ip_burp = fmd_prop_get_int64(hdl, "ip_burp");
875 	ip_mtbf = fmd_prop_get_int32(hdl, "ip_mtbf");
876 	ip_external = fmd_prop_get_int32(hdl, "ip_external");
877 	ip_no_remote_repair = fmd_prop_get_int32(hdl, "ip_no_remote_repair");
878 	ip_hconly = fmd_prop_get_int32(hdl, "ip_hconly");
879 	ip_rdonly = fmd_prop_get_int32(hdl, "ip_rdonly");
880 	ip_hc_present_only = fmd_prop_get_int32(hdl, "ip_hc_present_only");
881 	ip_domain_name = fmd_prop_get_string(hdl, "ip_domain_name");
882 	ip_qlen = fmd_prop_get_int32(hdl, "ip_qlen");
883 	ip_retry = fmd_prop_get_int32(hdl, "ip_retry");
884 	ip_sleep = fmd_prop_get_int64(hdl, "ip_sleep");
885 	ip_translate = fmd_prop_get_int32(hdl, "ip_translate");
886 
887 	ip_size = (size_t)fmd_prop_get_int64(hdl, "ip_bufsize");
888 	ip_size = MAX(ip_size, sizeof (ip_hdr_t));
889 
890 	ip_listen.name = fmd_prop_get_string(hdl, "ip_bind_addr");
891 	ip_server.name = fmd_prop_get_string(hdl, "ip_server");
892 	ip_server2.name = fmd_prop_get_string(hdl, "ip_server2");
893 	ip_port = fmd_prop_get_string(hdl, "ip_port");
894 	ip_debug_level = fmd_prop_get_int32(hdl, "ip_debug_level");
895 
896 	if (ip_setup_addrs()) {
897 		fmd_hdl_abort(hdl, "Unable to setup IP addresses.");
898 		return;
899 	}
900 
901 
902 	/*
903 	 * If ip_authority is set, tokenize this string and turn it into an
904 	 * FMA authority represented as a name-value pair list.  We will use
905 	 * this authority for all transports created by this module.  If
906 	 * ip_authority isn't set, we'll compute authorities on the fly.
907 	 */
908 	if ((auth = fmd_prop_get_string(hdl, "ip_authority")) != NULL) {
909 		(void) nvlist_alloc(&ip_auth, 0, 0);
910 		(void) nvlist_add_uint8(ip_auth,
911 		    FM_VERSION, FM_FMRI_AUTH_VERSION);
912 
913 		s = alloca(strlen(auth) + 1);
914 		(void) strcpy(s, auth);
915 		fmd_prop_free_string(hdl, auth);
916 
917 		for (p = strtok_r(s, ",", &q); p != NULL;
918 		    p = strtok_r(NULL, ",", &q)) {
919 
920 			if ((r = strchr(p, '=')) == NULL) {
921 				ip_addr_cleanup();
922 				fmd_hdl_abort(hdl, "ip_authority element <%s> "
923 				    "must be in <name>=<value> form\n", p);
924 			}
925 
926 			*r = '\0';
927 			(void) nvlist_add_string(ip_auth, p, r + 1);
928 			*r = '=';
929 		}
930 	}
931 
932 	/*
933 	 * Start ip transport server to listen for clients
934 	 */
935 	if (ip_listen.addr != NULL) {
936 		if (ip_xprt_setup(hdl, &ip_listen) != 0) {
937 			(void) fmd_timer_install(hdl, &ip_listen, NULL,
938 			    ip_sleep);
939 		}
940 	}
941 
942 	/*
943 	 * Call ip_xprt_setup() to connect to server(s).  If it fails and
944 	 * ip_retry is non-zero, install a timer to try again after
945 	 * 'ip_sleep' nsecs.
946 	 */
947 	if (ip_server.addr != NULL) {
948 		if (ip_xprt_setup(hdl, &ip_server) != 0)
949 			(void) fmd_timer_install(hdl, &ip_server, NULL,
950 			    ip_sleep);
951 	}
952 	if (ip_server2.addr != NULL) {
953 		if (ip_xprt_setup(hdl, &ip_server2) != 0)
954 			(void) fmd_timer_install(hdl, &ip_server2, NULL,
955 			    ip_sleep);
956 	}
957 }
958 
959 void
960 _fmd_fini(fmd_hdl_t *hdl)
961 {
962 	ip_quit++; /* set quit flag before signalling auxiliary threads */
963 
964 	while (ip_xps != NULL)
965 		ip_xprt_destroy(ip_xps);
966 
967 	if (ip_auth != NULL)
968 		nvlist_free(ip_auth);
969 	ip_addr_cleanup();
970 
971 	fmd_hdl_unregister(hdl);
972 }
973