xref: /titanic_50/usr/src/cmd/fs.d/nfs/nfsd/nfsd.c (revision 8a88157cd7245729dea5d91a5181bb05a80164a8)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 /*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T		*/
27 /*	  All Rights Reserved  	*/
28 
29 /*
30  * University Copyright- Copyright (c) 1982, 1986, 1988
31  * The Regents of the University of California
32  * All Rights Reserved
33  *
34  * University Acknowledgment- Portions of this document are derived from
35  * software developed by the University of California, Berkeley, and its
36  * contributors.
37  */
38 
39 /* LINTLIBRARY */
40 /* PROTOLIB1 */
41 
42 #pragma ident	"%Z%%M%	%I%	%E% SMI"
43 
44 /* NFS server */
45 
46 #include <sys/param.h>
47 #include <sys/types.h>
48 #include <sys/stat.h>
49 #include <syslog.h>
50 #include <tiuser.h>
51 #include <rpc/rpc.h>
52 #include <errno.h>
53 #include <thread.h>
54 #include <sys/resource.h>
55 #include <sys/time.h>
56 #include <sys/file.h>
57 #include <nfs/nfs.h>
58 #include <nfs/nfs_acl.h>
59 #include <nfs/nfssys.h>
60 #include <stdio.h>
61 #include <stdio_ext.h>
62 #include <stdlib.h>
63 #include <signal.h>
64 #include <netconfig.h>
65 #include <netdir.h>
66 #include <string.h>
67 #include <unistd.h>
68 #include <stropts.h>
69 #include <sys/tihdr.h>
70 #include <poll.h>
71 #include <priv_utils.h>
72 #include <sys/tiuser.h>
73 #include <netinet/tcp.h>
74 #include <deflt.h>
75 #include <rpcsvc/daemon_utils.h>
76 #include <rpcsvc/nfs4_prot.h>
77 #include <libnvpair.h>
78 #include "nfs_tbind.h"
79 #include "thrpool.h"
80 
81 /* quiesce requests will be ignored if nfs_server_vers_max < QUIESCE_VERSMIN */
82 #define	QUIESCE_VERSMIN	4
83 /* DSS: distributed stable storage */
84 #define	DSS_VERSMIN	4
85 
86 static	int	nfssvc(int, struct netbuf, struct netconfig *);
87 static	int	nfssvcpool(int maxservers);
88 static	int	dss_init(uint_t npaths, char **pathnames);
89 static	void	dss_mkleafdirs(uint_t npaths, char **pathnames);
90 static	void	dss_mkleafdir(char *dir, char *leaf, char *path);
91 static	void	usage(void);
92 int		qstrcmp(const void *s1, const void *s2);
93 
94 extern	int	_nfssys(int, void *);
95 
96 /* signal handlers */
97 static void sigflush(int);
98 static void quiesce(int);
99 
100 static	char	*MyName;
101 static	NETSELDECL(defaultproviders)[] = { "/dev/tcp6", "/dev/tcp", "/dev/udp",
102 					    "/dev/udp6", NULL };
103 /* static	NETSELDECL(defaultprotos)[] =	{ NC_UDP, NC_TCP, NULL }; */
104 /*
105  * The following are all globals used by routines in nfs_tbind.c.
106  */
107 size_t	end_listen_fds;		/* used by conn_close_oldest() */
108 size_t	num_fds = 0;		/* used by multiple routines */
109 int	listen_backlog = 32;	/* used by bind_to_{provider,proto}() */
110 int	num_servers;		/* used by cots_listen_event() */
111 int	(*Mysvc)(int, struct netbuf, struct netconfig *) = nfssvc;
112 				/* used by cots_listen_event() */
113 int	max_conns_allowed = -1;	/* used by cots_listen_event() */
114 
115 /*
116  * Keep track of min/max versions of NFS protocol to be started.
117  * Start with the defaults (min == 2, max == 3).  We have the
118  * capability of starting vers=4 but only if the user requests it.
119  */
120 int	nfs_server_vers_min = NFS_VERSMIN_DEFAULT;
121 int	nfs_server_vers_max = NFS_VERSMAX_DEFAULT;
122 
123 /*
124  * Set the default for server delegation enablement and set per
125  * /etc/default/nfs configuration (if present).
126  */
127 int	nfs_server_delegation = NFS_SERVER_DELEGATION_DEFAULT;
128 
129 int
130 main(int ac, char *av[])
131 {
132 	char *dir = "/";
133 	int allflag = 0;
134 	int df_allflag = 0;
135 	int opt_cnt = 0;
136 	int maxservers = 1;	/* zero allows inifinte number of threads */
137 	int maxservers_set = 0;
138 	int logmaxservers = 0;
139 	int pid;
140 	int i;
141 	char *provider = (char *)NULL;
142 	char *df_provider = (char *)NULL;
143 	struct protob *protobp0, *protobp;
144 	NETSELDECL(proto) = NULL;
145 	NETSELDECL(df_proto) = NULL;
146 	NETSELPDECL(providerp);
147 	char *defval;
148 	boolean_t can_do_mlp;
149 	uint_t dss_npaths = 0;
150 	char **dss_pathnames = NULL;
151 	sigset_t sgset;
152 
153 	MyName = *av;
154 
155 	/*
156 	 * Initializations that require more privileges than we need to run.
157 	 */
158 	(void) _create_daemon_lock(NFSD, DAEMON_UID, DAEMON_GID);
159 	svcsetprio();
160 
161 	can_do_mlp = priv_ineffect(PRIV_NET_BINDMLP);
162 	if (__init_daemon_priv(PU_RESETGROUPS|PU_CLEARLIMITSET,
163 	    DAEMON_UID, DAEMON_GID, PRIV_SYS_NFS,
164 	    can_do_mlp ? PRIV_NET_BINDMLP : NULL, NULL) == -1) {
165 		(void) fprintf(stderr, "%s should be run with"
166 		    " sufficient privileges\n", av[0]);
167 		exit(1);
168 	}
169 
170 	(void) enable_extended_FILE_stdio(-1, -1);
171 
172 	/*
173 	 * Read in the values from config file first before we check
174 	 * commandline options so the options override the file.
175 	 */
176 	if ((defopen(NFSADMIN)) == 0) {
177 		if ((defval = defread("NFSD_MAX_CONNECTIONS=")) != NULL) {
178 			errno = 0;
179 			max_conns_allowed = strtol(defval, (char **)NULL, 10);
180 			if (errno != 0) {
181 				max_conns_allowed = -1;
182 			}
183 		}
184 		if ((defval = defread("NFSD_LISTEN_BACKLOG=")) != NULL) {
185 			errno = 0;
186 			listen_backlog = strtol(defval, (char **)NULL, 10);
187 			if (errno != 0) {
188 				listen_backlog = 32;
189 			}
190 		}
191 		if ((defval = defread("NFSD_PROTOCOL=")) != NULL) {
192 			df_proto = strdup(defval);
193 			opt_cnt++;
194 			if (strncasecmp("ALL", defval, 3) == 0) {
195 				free(df_proto);
196 				df_proto = NULL;
197 				df_allflag = 1;
198 			}
199 		}
200 		if ((defval = defread("NFSD_DEVICE=")) != NULL) {
201 			df_provider = strdup(defval);
202 			opt_cnt++;
203 		}
204 		if ((defval = defread("NFSD_SERVERS=")) != NULL) {
205 			errno = 0;
206 			maxservers = strtol(defval, (char **)NULL, 10);
207 			if (errno != 0) {
208 				maxservers = 1;
209 			} else {
210 				maxservers_set = 1;
211 			}
212 		}
213 		if ((defval = defread("NFS_SERVER_VERSMIN=")) != NULL) {
214 			errno = 0;
215 			nfs_server_vers_min =
216 			    strtol(defval, (char **)NULL, 10);
217 			if (errno != 0) {
218 				nfs_server_vers_min = NFS_VERSMIN_DEFAULT;
219 			}
220 		}
221 		if ((defval = defread("NFS_SERVER_VERSMAX=")) != NULL) {
222 			errno = 0;
223 			nfs_server_vers_max =
224 			    strtol(defval, (char **)NULL, 10);
225 			if (errno != 0) {
226 				nfs_server_vers_max = NFS_VERSMAX_DEFAULT;
227 			}
228 		}
229 		if ((defval = defread("NFS_SERVER_DELEGATION=")) != NULL) {
230 			if (strcmp(defval, "off") == 0) {
231 				nfs_server_delegation = FALSE;
232 			}
233 		}
234 
235 		/* close defaults file */
236 		defopen(NULL);
237 	}
238 
239 	/*
240 	 * Conflict options error messages.
241 	 */
242 	if (opt_cnt > 1) {
243 		(void) fprintf(stderr, "\nConflicting options, only one of "
244 		    "the following options can be specified\n"
245 		    "in " NFSADMIN ":\n"
246 		    "\tNFSD_PROTOCOL=ALL\n"
247 		    "\tNFSD_PROTOCOL=protocol\n"
248 		    "\tNFSD_DEVICE=device\n\n");
249 		usage();
250 	}
251 	opt_cnt = 0;
252 
253 	while ((i = getopt(ac, av, "ac:p:s:t:l:")) != EOF) {
254 		switch (i) {
255 		case 'a':
256 			free(df_proto);
257 			df_proto = NULL;
258 			free(df_provider);
259 			df_provider = NULL;
260 
261 			allflag = 1;
262 			opt_cnt++;
263 			break;
264 
265 		case 'c':
266 			max_conns_allowed = atoi(optarg);
267 			break;
268 
269 		case 'p':
270 			proto = optarg;
271 			df_allflag = 0;
272 			opt_cnt++;
273 			break;
274 
275 		/*
276 		 * DSS: NFSv4 distributed stable storage.
277 		 *
278 		 * This is a Contracted Project Private interface, for
279 		 * the sole use of Sun Cluster HA-NFS. See PSARC/2006/313.
280 		 */
281 		case 's':
282 			if (strlen(optarg) < MAXPATHLEN) {
283 				/* first "-s" option encountered? */
284 				if (dss_pathnames == NULL) {
285 					/*
286 					 * Allocate maximum possible space
287 					 * required given cmdline arg count;
288 					 * "-s <path>" consumes two args.
289 					 */
290 					size_t sz = (ac / 2) * sizeof (char *);
291 					dss_pathnames = (char **)malloc(sz);
292 					if (dss_pathnames == NULL) {
293 						(void) fprintf(stderr, "%s: "
294 						    "dss paths malloc failed\n",
295 						    av[0]);
296 						exit(1);
297 					}
298 					(void) memset(dss_pathnames, 0, sz);
299 				}
300 				dss_pathnames[dss_npaths] = optarg;
301 				dss_npaths++;
302 			} else {
303 				(void) fprintf(stderr,
304 				    "%s: -s pathname too long.\n", av[0]);
305 			}
306 			break;
307 
308 		case 't':
309 			provider = optarg;
310 			df_allflag = 0;
311 			opt_cnt++;
312 			break;
313 
314 		case 'l':
315 			listen_backlog = atoi(optarg);
316 			break;
317 
318 		case '?':
319 			usage();
320 			/* NOTREACHED */
321 		}
322 	}
323 
324 	allflag = df_allflag;
325 	if (proto == NULL)
326 		proto = df_proto;
327 	if (provider == NULL)
328 		provider = df_provider;
329 
330 	/*
331 	 * Conflict options error messages.
332 	 */
333 	if (opt_cnt > 1) {
334 		(void) fprintf(stderr, "\nConflicting options, only one of "
335 		    "the following options can be specified\n"
336 		    "on the command line:\n"
337 		    "\t-a\n"
338 		    "\t-p protocol\n"
339 		    "\t-t transport\n\n");
340 		usage();
341 	}
342 
343 	if (proto != NULL &&
344 	    strncasecmp(proto, NC_UDP, strlen(NC_UDP)) == 0) {
345 		if (nfs_server_vers_max == NFS_V4) {
346 			if (nfs_server_vers_min == NFS_V4) {
347 				syslog(LOG_ERR,
348 				    "NFS version 4 is not supported "
349 				    "with the UDP protocol.  Exiting\n");
350 				fprintf(stderr,
351 				    "NFS version 4 is not supported "
352 				    "with the UDP protocol.  Exiting\n");
353 				exit(3);
354 			} else {
355 				fprintf(stderr,
356 				    "NFS version 4 is not supported "
357 				    "with the UDP protocol.\n");
358 			}
359 		}
360 	}
361 
362 	/*
363 	 * If there is exactly one more argument, it is the number of
364 	 * servers.
365 	 */
366 	if (optind == ac - 1) {
367 		maxservers = atoi(av[optind]);
368 		maxservers_set = 1;
369 	}
370 	/*
371 	 * If there are two or more arguments, then this is a usage error.
372 	 */
373 	else if (optind < ac - 1)
374 		usage();
375 	/*
376 	 * Check the ranges for min/max version specified
377 	 */
378 	else if ((nfs_server_vers_min > nfs_server_vers_max) ||
379 	    (nfs_server_vers_min < NFS_VERSMIN) ||
380 	    (nfs_server_vers_max > NFS_VERSMAX))
381 		usage();
382 	/*
383 	 * There are no additional arguments, and we haven't set maxservers
384 	 * explicitly via the config file, we use a default number of
385 	 * servers.  We will log this.
386 	 */
387 	else if (maxservers_set == 0)
388 		logmaxservers = 1;
389 
390 	/*
391 	 * Basic Sanity checks on options
392 	 *
393 	 * max_conns_allowed must be positive, except for the special
394 	 * value of -1 which is used internally to mean unlimited, -1 isn't
395 	 * documented but we allow it anyway.
396 	 *
397 	 * maxservers must be positive
398 	 * listen_backlog must be positive or zero
399 	 */
400 	if (((max_conns_allowed != -1) && (max_conns_allowed <= 0)) ||
401 	    (listen_backlog < 0) || (maxservers <= 0)) {
402 		usage();
403 	}
404 
405 	/*
406 	 * Set current dir to server root
407 	 */
408 	if (chdir(dir) < 0) {
409 		(void) fprintf(stderr, "%s:  ", MyName);
410 		perror(dir);
411 		exit(1);
412 	}
413 
414 #ifndef DEBUG
415 	/*
416 	 * Background
417 	 */
418 	pid = fork();
419 	if (pid < 0) {
420 		perror("nfsd: fork");
421 		exit(1);
422 	}
423 	if (pid != 0)
424 		exit(0);
425 
426 	/*
427 	 * Close existing file descriptors, open "/dev/null" as
428 	 * standard input, output, and error, and detach from
429 	 * controlling terminal.
430 	 */
431 	closefrom(0);
432 	(void) open("/dev/null", O_RDONLY);
433 	(void) open("/dev/null", O_WRONLY);
434 	(void) dup(1);
435 	(void) setsid();
436 #endif
437 	openlog(MyName, LOG_PID | LOG_NDELAY, LOG_DAEMON);
438 
439 	/*
440 	 * establish our lock on the lock file and write our pid to it.
441 	 * exit if some other process holds the lock, or if there's any
442 	 * error in writing/locking the file.
443 	 */
444 	pid = _enter_daemon_lock(NFSD);
445 	switch (pid) {
446 	case 0:
447 		break;
448 	case -1:
449 		syslog(LOG_ERR, "error locking for %s: %s", NFSD,
450 		    strerror(errno));
451 		exit(2);
452 	default:
453 		/* daemon was already running */
454 		exit(0);
455 	}
456 
457 	/*
458 	 * If we've been given a list of paths to be used for distributed
459 	 * stable storage, and provided we're going to run a version
460 	 * that supports it, setup the DSS paths.
461 	 */
462 	if (dss_pathnames != NULL && nfs_server_vers_max >= DSS_VERSMIN) {
463 		if (dss_init(dss_npaths, dss_pathnames) != 0) {
464 			syslog(LOG_ERR, "dss_init failed. Exiting.");
465 			exit(1);
466 		}
467 	}
468 
469 	/*
470 	 * Block all signals till we spawn other
471 	 * threads.
472 	 */
473 	(void) sigfillset(&sgset);
474 	(void) thr_sigsetmask(SIG_BLOCK, &sgset, NULL);
475 
476 	if (logmaxservers) {
477 		(void) syslog(LOG_INFO,
478 		    "Number of servers not specified. Using default of %d.",
479 		    maxservers);
480 	}
481 
482 	/*
483 	 * Make sure to unregister any previous versions in case the
484 	 * user is reconfiguring the server in interesting ways.
485 	 */
486 	svc_unreg(NFS_PROGRAM, NFS_VERSION);
487 	svc_unreg(NFS_PROGRAM, NFS_V3);
488 	svc_unreg(NFS_PROGRAM, NFS_V4);
489 	svc_unreg(NFS_ACL_PROGRAM, NFS_ACL_V2);
490 	svc_unreg(NFS_ACL_PROGRAM, NFS_ACL_V3);
491 
492 	/*
493 	 * Set up kernel RPC thread pool for the NFS server.
494 	 */
495 	if (nfssvcpool(maxservers)) {
496 		(void) syslog(LOG_ERR,
497 		    "Can't set up kernel NFS service: %m. Exiting");
498 		exit(1);
499 	}
500 
501 
502 	/*
503 	 * Set up blocked thread to do LWP creation on behalf of the kernel.
504 	 */
505 	if (svcwait(NFS_SVCPOOL_ID)) {
506 		(void) syslog(LOG_ERR,
507 		    "Can't set up NFS pool creator: %m, Exiting");
508 		exit(1);
509 	}
510 
511 	/*
512 	 * RDMA start and stop thread.
513 	 * Per pool RDMA listener creation and
514 	 * destructor thread.
515 	 *
516 	 * start rdma services and block in the kernel.
517 	 */
518 	if (svcrdma(NFS_SVCPOOL_ID, nfs_server_vers_min, nfs_server_vers_max,
519 	    nfs_server_delegation)) {
520 		(void) syslog(LOG_ERR,
521 		    "Can't set up RDMA creator thread : %m.");
522 	}
523 
524 	/*
525 	 * Now open up for signal delivery
526 	 */
527 
528 	(void) thr_sigsetmask(SIG_UNBLOCK, &sgset, NULL);
529 	sigset(SIGTERM, sigflush);
530 	sigset(SIGUSR1, quiesce);
531 
532 	/*
533 	 * Build a protocol block list for registration.
534 	 */
535 	protobp0 = protobp = (struct protob *)malloc(sizeof (struct protob));
536 	protobp->serv = "NFS";
537 	protobp->versmin = nfs_server_vers_min;
538 	protobp->versmax = nfs_server_vers_max;
539 	protobp->program = NFS_PROGRAM;
540 
541 	protobp->next = (struct protob *)malloc(sizeof (struct protob));
542 	protobp = protobp->next;
543 	protobp->serv = "NFS_ACL";		/* not used */
544 	protobp->versmin = nfs_server_vers_min;
545 	/* XXX - this needs work to get the version just right */
546 	protobp->versmax = (nfs_server_vers_max > NFS_ACL_V3) ?
547 	    NFS_ACL_V3 : nfs_server_vers_max;
548 	protobp->program = NFS_ACL_PROGRAM;
549 	protobp->next = (struct protob *)NULL;
550 
551 	if (allflag) {
552 		if (do_all(protobp0, nfssvc, 0) == -1)
553 			exit(1);
554 	} else if (proto) {
555 		/* there's more than one match for the same protocol */
556 		struct netconfig *nconf;
557 		NCONF_HANDLE *nc;
558 		bool_t	protoFound = FALSE;
559 		if ((nc = setnetconfig()) == (NCONF_HANDLE *) NULL) {
560 			syslog(LOG_ERR, "setnetconfig failed: %m");
561 			goto done;
562 		}
563 		while (nconf = getnetconfig(nc)) {
564 			if (strcmp(nconf->nc_proto, proto) == 0) {
565 				protoFound = TRUE;
566 				do_one(nconf->nc_device, NULL,
567 				    protobp0, nfssvc, 0);
568 			}
569 		}
570 		(void) endnetconfig(nc);
571 		if (protoFound == FALSE)
572 			syslog(LOG_ERR, "couldn't find netconfig entry \
573 			    for protocol %s", proto);
574 
575 	} else if (provider)
576 		do_one(provider, proto, protobp0, nfssvc, 0);
577 	else {
578 		for (providerp = defaultproviders;
579 		    *providerp != NULL; providerp++) {
580 			provider = *providerp;
581 			do_one(provider, NULL, protobp0, nfssvc, 0);
582 		}
583 	}
584 done:
585 
586 	free(protobp);
587 	free(protobp0);
588 
589 
590 	if (num_fds == 0) {
591 		(void) syslog(LOG_ERR,
592 		"Could not start NFS service for any protocol. Exiting");
593 		exit(1);
594 	}
595 
596 	end_listen_fds = num_fds;
597 
598 	/*
599 	 * Get rid of unneeded privileges.
600 	 */
601 	__fini_daemon_priv(PRIV_PROC_FORK, PRIV_PROC_EXEC, PRIV_PROC_SESSION,
602 	    PRIV_FILE_LINK_ANY, PRIV_PROC_INFO, (char *)NULL);
603 
604 	/*
605 	 * Poll for non-data control events on the transport descriptors.
606 	 */
607 	poll_for_action();
608 
609 	/*
610 	 * If we get here, something failed in poll_for_action().
611 	 */
612 	return (1);
613 }
614 
615 static int
616 nfssvcpool(int maxservers)
617 {
618 	struct svcpool_args npa;
619 
620 	npa.id = NFS_SVCPOOL_ID;
621 	npa.maxthreads = maxservers;
622 	npa.redline = 0;
623 	npa.qsize = 0;
624 	npa.timeout = 0;
625 	npa.stksize = 0;
626 	npa.max_same_xprt = 0;
627 	return (_nfssys(SVCPOOL_CREATE, &npa));
628 }
629 
630 /*
631  * Establish NFS service thread.
632  */
633 static int
634 nfssvc(int fd, struct netbuf addrmask, struct netconfig *nconf)
635 {
636 	struct nfs_svc_args nsa;
637 
638 	nsa.fd = fd;
639 	nsa.netid = nconf->nc_netid;
640 	nsa.addrmask = addrmask;
641 	if (strncasecmp(nconf->nc_proto, NC_UDP, strlen(NC_UDP)) == 0) {
642 		nsa.versmax = (nfs_server_vers_max > NFS_V3) ?
643 		    NFS_V3 : nfs_server_vers_max;
644 		nsa.versmin = nfs_server_vers_min;
645 		/*
646 		 * If no version left, silently do nothing, previous
647 		 * checks will have assured at least TCP is available.
648 		 */
649 		if (nsa.versmin > nsa.versmax)
650 			return (0);
651 	} else {
652 		nsa.versmax = nfs_server_vers_max;
653 		nsa.versmin = nfs_server_vers_min;
654 	}
655 	nsa.delegation = nfs_server_delegation;
656 	return (_nfssys(NFS_SVC, &nsa));
657 }
658 
659 static void
660 usage(void)
661 {
662 	(void) fprintf(stderr,
663 "usage: %s [ -a ] [ -c max_conns ] [ -p protocol ] [ -t transport ] ", MyName);
664 	(void) fprintf(stderr, "\n[ -l listen_backlog ] [ nservers ]\n");
665 	(void) fprintf(stderr,
666 "\twhere -a causes <nservers> to be started on each appropriate transport,\n");
667 	(void) fprintf(stderr,
668 "\tmax_conns is the maximum number of concurrent connections allowed,\n");
669 	(void) fprintf(stderr, "\t\tand max_conns must be a decimal number");
670 	(void) fprintf(stderr, "> zero,\n");
671 	(void) fprintf(stderr, "\tprotocol is a protocol identifier,\n");
672 	(void) fprintf(stderr,
673 	    "\ttransport is a transport provider name (i.e. device),\n");
674 	(void) fprintf(stderr,
675 	    "\tlisten_backlog is the TCP listen backlog,\n");
676 	(void) fprintf(stderr,
677 	    "\tand <nservers> must be a decimal number > zero.\n");
678 	exit(1);
679 }
680 
681 /*
682  * Issue nfssys system call to flush all logging buffers asynchronously.
683  *
684  * NOTICE: It is extremely important to flush NFS logging buffers when
685  *	   nfsd exits. When the system is halted or rebooted nfslogd
686  *	   may not have an opportunity to flush the buffers.
687  */
688 static void
689 nfsl_flush()
690 {
691 	struct nfsl_flush_args nfa;
692 
693 	memset((void *)&nfa, 0, sizeof (nfa));
694 	nfa.version = NFSL_FLUSH_ARGS_VERS;
695 	nfa.directive = NFSL_ALL;	/* flush all asynchronously */
696 
697 	if (_nfssys(LOG_FLUSH, &nfa) < 0)
698 		syslog(LOG_ERR, "_nfssys(LOG_FLUSH) failed: %s\n",
699 		    strerror(errno));
700 }
701 
/*
 * SIGTERM handler: flush the NFS logging buffers, then terminate the
 * process immediately with status 0.  The signal number is not examined.
 */
/* ARGSUSED */
static void
sigflush(int sig)
{
	nfsl_flush();
	_exit(0);
}
712 
713 /*
714  * SIGUSR1 handler.
715  *
716  * Request that server quiesce, then (nfsd) exit. For subsequent warm start.
717  *
718  * This is a Contracted Project Private interface, for the sole use
719  * of Sun Cluster HA-NFS. See PSARC/2004/497.
720  *
721  * Equivalent to SIGTERM handler if nfs_server_vers_max < QUIESCE_VERSMIN.
722  */
723 static void
724 quiesce(int sig)
725 {
726 	int error;
727 	int id = NFS_SVCPOOL_ID;
728 
729 	if (nfs_server_vers_max >= QUIESCE_VERSMIN) {
730 		/* Request server quiesce at next shutdown */
731 		error = _nfssys(NFS4_SVC_REQUEST_QUIESCE, &id);
732 
733 		/*
734 		 * ENOENT is returned if there is no matching SVC pool
735 		 * for the id. Possibly because the pool is not yet setup.
736 		 * In this case, just exit as if no error. For all other errors,
737 		 * just return and allow caller to retry.
738 		 */
739 		if (error && errno != ENOENT) {
740 			syslog(LOG_ERR,
741 			    "_nfssys(NFS4_SVC_REQUEST_QUIESCE) failed: %s",
742 			    strerror(errno));
743 			return;
744 		}
745 	}
746 
747 	/* Flush logging buffers */
748 	nfsl_flush();
749 
750 	_exit(0);
751 }
752 
753 /*
754  * DSS: distributed stable storage.
755  * Create leaf directories as required, keeping an eye on path
756  * lengths. Calls exit(1) on failure.
757  * The pathnames passed in must already exist, and must be writeable by nfsd.
758  * Note: the leaf directories under NFS4_VAR_DIR are not created here;
759  * they're created at pkg install.
760  */
761 static void
762 dss_mkleafdirs(uint_t npaths, char **pathnames)
763 {
764 	int i;
765 	char *tmppath = NULL;
766 
767 	/*
768 	 * Create the temporary storage used by dss_mkleafdir() here,
769 	 * rather than in that function, so that it only needs to be
770 	 * done once, rather than once for each call. Too big to put
771 	 * on the function's stack.
772 	 */
773 	tmppath = (char *)malloc(MAXPATHLEN);
774 	if (tmppath == NULL) {
775 		syslog(LOG_ERR, "tmppath malloc failed. Exiting");
776 		exit(1);
777 	}
778 
779 	for (i = 0; i < npaths; i++) {
780 		char *p = pathnames[i];
781 
782 		dss_mkleafdir(p, NFS4_DSS_STATE_LEAF, tmppath);
783 		dss_mkleafdir(p, NFS4_DSS_OLDSTATE_LEAF, tmppath);
784 	}
785 
786 	free(tmppath);
787 }
788 
789 /*
790  * Create "leaf" in "dir" (which must already exist).
791  * leaf: should start with a '/'
792  */
793 static void
794 dss_mkleafdir(char *dir, char *leaf, char *tmppath)
795 {
796 	/* MAXPATHLEN includes the terminating NUL */
797 	if (strlen(dir) + strlen(leaf) > MAXPATHLEN - 1) {
798 		syslog(LOG_ERR, "stable storage path too long: %s%s. Exiting",
799 		    dir, leaf);
800 		exit(1);
801 	}
802 
803 	(void) snprintf(tmppath, MAXPATHLEN, "%s/%s", dir, leaf);
804 
805 	/* the directory may already exist: that's OK */
806 	if (mkdir(tmppath, NFS4_DSS_DIR_MODE) == -1 && errno != EEXIST) {
807 		syslog(LOG_ERR, "error creating stable storage directory: "
808 		    "%s: %s. Exiting", strerror(errno), tmppath);
809 		exit(1);
810 	}
811 }
812 
813 /*
814  * Create the storage dirs, and pass the path list to the kernel.
815  * This requires the nfssrv module to be loaded; the _nfssys() syscall
816  * will fail ENOTSUP if it is not.
817  * Use libnvpair(3LIB) to pass the data to the kernel.
818  */
819 static int
820 dss_init(uint_t npaths, char **pathnames)
821 {
822 	int i, j, nskipped, error;
823 	char *bufp;
824 	uint32_t bufsize;
825 	size_t buflen;
826 	nvlist_t *nvl;
827 
828 	if (npaths > 1) {
829 		/*
830 		 * We need to remove duplicate paths; this might be user error
831 		 * in the general case, but HA-NFSv4 can also cause this.
832 		 * Sort the pathnames array, and NULL out duplicates,
833 		 * then write the non-NULL entries to a new array.
834 		 * Sorting will also allow the kernel to optimise its searches.
835 		 */
836 
837 		qsort(pathnames, npaths, sizeof (char *), qstrcmp);
838 
839 		/* now NULL out any duplicates */
840 		i = 0; j = 1; nskipped = 0;
841 		while (j < npaths) {
842 			if (strcmp(pathnames[i], pathnames[j]) == NULL) {
843 				pathnames[j] = NULL;
844 				j++;
845 				nskipped++;
846 				continue;
847 			}
848 
849 			/* skip i over any of its NULLed duplicates */
850 			i = j++;
851 		}
852 
853 		/* finally, write the non-NULL entries to a new array */
854 		if (nskipped > 0) {
855 			int nreal;
856 			size_t sz;
857 			char **tmp_pathnames;
858 
859 			nreal = npaths - nskipped;
860 
861 			sz = nreal * sizeof (char *);
862 			tmp_pathnames = (char **)malloc(sz);
863 			if (tmp_pathnames == NULL) {
864 				syslog(LOG_ERR, "tmp_pathnames malloc failed");
865 				exit(1);
866 			}
867 
868 			for (i = 0, j = 0; i < npaths; i++)
869 				if (pathnames[i] != NULL)
870 					tmp_pathnames[j++] = pathnames[i];
871 			free(pathnames);
872 			pathnames = tmp_pathnames;
873 			npaths = nreal;
874 		}
875 
876 	}
877 
878 	/* Create directories to store the distributed state files */
879 	dss_mkleafdirs(npaths, pathnames);
880 
881 	/* Create the name-value pair list */
882 	error = nvlist_alloc(&nvl, NV_UNIQUE_NAME, 0);
883 	if (error) {
884 		syslog(LOG_ERR, "nvlist_alloc failed: %s.", strerror(errno));
885 		return (1);
886 	}
887 
888 	/* Add the pathnames array as a single name-value pair */
889 	error = nvlist_add_string_array(nvl, NFS4_DSS_NVPAIR_NAME,
890 	    pathnames, npaths);
891 	if (error) {
892 		syslog(LOG_ERR, "nvlist_add_string_array failed: %s.",
893 		    strerror(errno));
894 		nvlist_free(nvl);
895 		return (1);
896 	}
897 
898 	/*
899 	 * Pack list into contiguous memory, for passing to kernel.
900 	 * nvlist_pack() will allocate the memory for the buffer,
901 	 * which we should free() when no longer needed.
902 	 * NV_ENCODE_XDR for safety across ILP32/LP64 kernel boundary.
903 	 */
904 	bufp = NULL;
905 	error = nvlist_pack(nvl, &bufp, &buflen, NV_ENCODE_XDR, 0);
906 	if (error) {
907 		syslog(LOG_ERR, "nvlist_pack failed: %s.", strerror(errno));
908 		nvlist_free(nvl);
909 		return (1);
910 	}
911 
912 	/* Now we have the packed buffer, we no longer need the list */
913 	nvlist_free(nvl);
914 
915 	/*
916 	 * Let the kernel know in advance how big the buffer is.
917 	 * NOTE: we cannot just pass buflen, since size_t is a long, and
918 	 * thus a different size between ILP32 userland and LP64 kernel.
919 	 * Use an int for the transfer, since that should be big enough;
920 	 * this is a no-op at the moment, here, since nfsd is 32-bit, but
921 	 * that could change.
922 	 */
923 	bufsize = (uint32_t)buflen;
924 	error = _nfssys(NFS4_DSS_SETPATHS_SIZE, &bufsize);
925 	if (error) {
926 		syslog(LOG_ERR,
927 		    "_nfssys(NFS4_DSS_SETPATHS_SIZE) failed: %s. ",
928 		    strerror(errno));
929 		free(bufp);
930 		return (1);
931 	}
932 
933 	/* Pass the packed buffer to the kernel */
934 	error = _nfssys(NFS4_DSS_SETPATHS, bufp);
935 	if (error) {
936 		syslog(LOG_ERR,
937 		    "_nfssys(NFS4_DSS_SETPATHS) failed: %s. ", strerror(errno));
938 		free(bufp);
939 		return (1);
940 	}
941 
942 	/*
943 	 * The kernel has now unpacked the buffer and extracted the
944 	 * pathnames array, we no longer need the buffer.
945 	 */
946 	free(bufp);
947 
948 	return (0);
949 }
950 
/*
 * String comparison callback for qsort() over an array of char *.
 *
 * p1, p2: pointers to two array elements, i.e. pointers to the string
 *         pointers being compared.
 *
 * Returns the strcmp() ordering of the two strings.
 */
int
qstrcmp(const void *p1, const void *p2)
{
	const char *const *lhs = p1;
	const char *const *rhs = p2;

	return (strcmp(*lhs, *rhs));
}
963