xref: /illumos-gate/usr/src/cmd/fs.d/nfs/nfsd/nfsd.c (revision bd97c7ce2344fa3252d8785c35895490916bc79b)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright (c) 1989, 2010, Oracle and/or its affiliates. All rights reserved.
23  */
24 
25 /*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T		*/
26 /*	  All Rights Reserved	*/
27 
28 /*
29  * University Copyright- Copyright (c) 1982, 1986, 1988
30  * The Regents of the University of California
31  * All Rights Reserved
32  *
33  * University Acknowledgment- Portions of this document are derived from
34  * software developed by the University of California, Berkeley, and its
35  * contributors.
36  */
37 
38 /* LINTLIBRARY */
39 /* PROTOLIB1 */
40 
41 /* NFS server */
42 
43 #include <sys/param.h>
44 #include <sys/types.h>
45 #include <sys/stat.h>
46 #include <syslog.h>
47 #include <tiuser.h>
48 #include <rpc/rpc.h>
49 #include <errno.h>
50 #include <thread.h>
51 #include <sys/resource.h>
52 #include <sys/time.h>
53 #include <sys/file.h>
54 #include <nfs/nfs.h>
55 #include <nfs/nfs_acl.h>
56 #include <nfs/nfssys.h>
57 #include <stdio.h>
58 #include <stdio_ext.h>
59 #include <stdlib.h>
60 #include <signal.h>
61 #include <netconfig.h>
62 #include <netdir.h>
63 #include <string.h>
64 #include <unistd.h>
65 #include <limits.h>
66 #include <stropts.h>
67 #include <sys/tihdr.h>
68 #include <sys/wait.h>
69 #include <poll.h>
70 #include <priv_utils.h>
71 #include <sys/tiuser.h>
72 #include <netinet/tcp.h>
73 #include <deflt.h>
74 #include <rpcsvc/daemon_utils.h>
75 #include <rpcsvc/nfs4_prot.h>
76 #include <libnvpair.h>
77 #include <libscf.h>
78 #include <libshare.h>
79 #include "nfs_tbind.h"
80 #include "thrpool.h"
81 #include "smfcfg.h"
82 
83 /* quiesce requests will be ignored if nfs_server_vers_max < QUIESCE_VERSMIN */
84 #define	QUIESCE_VERSMIN	4
85 /* DSS: distributed stable storage */
86 #define	DSS_VERSMIN	4
87 
88 static	int	nfssvc(int, struct netbuf, struct netconfig *);
89 static	int	nfssvcpool(int maxservers);
90 static	int	dss_init(uint_t npaths, char **pathnames);
91 static	void	dss_mkleafdirs(uint_t npaths, char **pathnames);
92 static	void	dss_mkleafdir(char *dir, char *leaf, char *path);
93 static	void	usage(void);
94 int		qstrcmp(const void *s1, const void *s2);
95 
96 extern	int	_nfssys(int, void *);
97 
98 extern int	daemonize_init(void);
99 extern void	daemonize_fini(int fd);
100 
101 /* signal handlers */
102 static void sigflush(int);
103 static void quiesce(int);
104 
105 static	char	*MyName;
106 static	NETSELDECL(defaultproviders)[] = { "/dev/tcp6", "/dev/tcp", "/dev/udp",
107 					    "/dev/udp6", NULL };
108 /* static	NETSELDECL(defaultprotos)[] =	{ NC_UDP, NC_TCP, NULL }; */
109 /*
110  * The following are all globals used by routines in nfs_tbind.c.
111  */
112 size_t	end_listen_fds;		/* used by conn_close_oldest() */
113 size_t	num_fds = 0;		/* used by multiple routines */
114 int	listen_backlog = 32;	/* used by bind_to_{provider,proto}() */
115 int	num_servers;		/* used by cots_listen_event() */
116 int	(*Mysvc)(int, struct netbuf, struct netconfig *) = nfssvc;
117 				/* used by cots_listen_event() */
118 int	max_conns_allowed = -1;	/* used by cots_listen_event() */
119 
120 /*
121  * Keep track of min/max versions of NFS protocol to be started.
122  * Start with the defaults (min == 2, max == 3).  We have the
123  * capability of starting vers=4 but only if the user requests it.
124  */
125 int	nfs_server_vers_min = NFS_VERSMIN_DEFAULT;
126 int	nfs_server_vers_max = NFS_VERSMAX_DEFAULT;
127 
128 /*
129  * Set the default for server delegation enablement and set per
130  * /etc/default/nfs configuration (if present).
131  */
132 int	nfs_server_delegation = NFS_SERVER_DELEGATION_DEFAULT;
133 
134 int
135 main(int ac, char *av[])
136 {
137 	char *dir = "/";
138 	int allflag = 0;
139 	int df_allflag = 0;
140 	int opt_cnt = 0;
141 	int maxservers = 1024;	/* zero allows inifinte number of threads */
142 	int maxservers_set = 0;
143 	int logmaxservers = 0;
144 	int pid;
145 	int i;
146 	char *provider = NULL;
147 	char *df_provider = NULL;
148 	struct protob *protobp0, *protobp;
149 	NETSELDECL(proto) = NULL;
150 	NETSELDECL(df_proto) = NULL;
151 	NETSELPDECL(providerp);
152 	char *defval;
153 	boolean_t can_do_mlp;
154 	uint_t dss_npaths = 0;
155 	char **dss_pathnames = NULL;
156 	sigset_t sgset;
157 	char name[PATH_MAX], value[PATH_MAX];
158 	int ret, bufsz;
159 	int pipe_fd = -1;
160 	const char *errstr;
161 
162 	MyName = *av;
163 
164 	/*
165 	 * Initializations that require more privileges than we need to run.
166 	 */
167 	(void) _create_daemon_lock(NFSD, DAEMON_UID, DAEMON_GID);
168 	svcsetprio();
169 
170 	can_do_mlp = priv_ineffect(PRIV_NET_BINDMLP);
171 	if (__init_daemon_priv(PU_RESETGROUPS|PU_CLEARLIMITSET,
172 	    DAEMON_UID, DAEMON_GID, PRIV_SYS_NFS,
173 	    can_do_mlp ? PRIV_NET_BINDMLP : NULL, NULL) == -1) {
174 		(void) fprintf(stderr, "%s should be run with"
175 		    " sufficient privileges\n", av[0]);
176 		exit(1);
177 	}
178 
179 	(void) enable_extended_FILE_stdio(-1, -1);
180 
181 	/* Upgrade SMF settings, if necessary. */
182 	nfs_config_upgrade(NFSD);
183 
184 	/*
185 	 * Read in the values from SMF first before we check
186 	 * command line options so the options override SMF values.
187 	 */
188 	bufsz = PATH_MAX;
189 	ret = nfs_smf_get_prop("max_connections", value, DEFAULT_INSTANCE,
190 	    SCF_TYPE_INTEGER, NFSD, &bufsz);
191 	if (ret == SA_OK) {
192 		errno = 0;
193 		max_conns_allowed = strtol(value, (char **)NULL, 10);
194 		if (errno != 0)
195 			max_conns_allowed = -1;
196 	}
197 
198 	bufsz = PATH_MAX;
199 	ret = nfs_smf_get_prop("listen_backlog", value, DEFAULT_INSTANCE,
200 	    SCF_TYPE_INTEGER, NFSD, &bufsz);
201 	if (ret == SA_OK) {
202 		errno = 0;
203 		listen_backlog = strtol(value, (char **)NULL, 10);
204 		if (errno != 0) {
205 			listen_backlog = 32;
206 		}
207 	}
208 
209 	bufsz = PATH_MAX;
210 	ret = nfs_smf_get_prop("protocol", value, DEFAULT_INSTANCE,
211 	    SCF_TYPE_ASTRING, NFSD, &bufsz);
212 	if ((ret == SA_OK) && strlen(value) > 0) {
213 		df_proto = strdup(value);
214 		opt_cnt++;
215 		if (strncasecmp("ALL", value, 3) == 0) {
216 			free(df_proto);
217 			df_proto = NULL;
218 			df_allflag = 1;
219 		}
220 	}
221 
222 	bufsz = PATH_MAX;
223 	ret = nfs_smf_get_prop("device", value, DEFAULT_INSTANCE,
224 	    SCF_TYPE_ASTRING, NFSD, &bufsz);
225 	if ((ret == SA_OK) && strlen(value) > 0) {
226 		df_provider = strdup(value);
227 		opt_cnt++;
228 	}
229 
230 	bufsz = PATH_MAX;
231 	ret = nfs_smf_get_prop("servers", value, DEFAULT_INSTANCE,
232 	    SCF_TYPE_INTEGER, NFSD, &bufsz);
233 	if (ret == SA_OK) {
234 		errno = 0;
235 		maxservers = strtol(value, (char **)NULL, 10);
236 		if (errno != 0)
237 			maxservers = 1024;
238 		else
239 			maxservers_set = 1;
240 	}
241 
242 	bufsz = 4;
243 	ret = nfs_smf_get_prop("server_versmin", value, DEFAULT_INSTANCE,
244 	    SCF_TYPE_ASTRING, NFSD, &bufsz);
245 	if (ret == SA_OK) {
246 		ret = strtonum(value, NFS_VERSMIN, NFS_VERSMAX, &errstr);
247 		if (errstr != NULL) {
248 			(void) fprintf(stderr, "invalid server_versmin: %s\n",
249 			    errstr);
250 		} else {
251 			nfs_server_vers_min = ret;
252 		}
253 	}
254 
255 	bufsz = 4;
256 	ret = nfs_smf_get_prop("server_versmax", value, DEFAULT_INSTANCE,
257 	    SCF_TYPE_ASTRING, NFSD, &bufsz);
258 	if (ret == SA_OK) {
259 		ret = strtonum(value, NFS_VERSMIN, NFS_VERSMAX, &errstr);
260 		if (errstr != NULL) {
261 			(void) fprintf(stderr, "invalid server_versmax: %s\n",
262 			    errstr);
263 		} else {
264 			nfs_server_vers_max = ret;
265 		}
266 	}
267 
268 	bufsz = PATH_MAX;
269 	ret = nfs_smf_get_prop("server_delegation", value, DEFAULT_INSTANCE,
270 	    SCF_TYPE_ASTRING, NFSD, &bufsz);
271 	if (ret == SA_OK)
272 		if (strncasecmp(value, "off", 3) == 0)
273 			nfs_server_delegation = FALSE;
274 
275 	/*
276 	 * Conflict options error messages.
277 	 */
278 	if (opt_cnt > 1) {
279 		(void) fprintf(stderr, "\nConflicting options, only one of "
280 		    "the following options can be specified\n"
281 		    "in SMF:\n"
282 		    "\tprotocol=ALL\n"
283 		    "\tprotocol=protocol\n"
284 		    "\tdevice=devicename\n\n");
285 		usage();
286 	}
287 	opt_cnt = 0;
288 
289 	while ((i = getopt(ac, av, "ac:p:s:t:l:")) != EOF) {
290 		switch (i) {
291 		case 'a':
292 			free(df_proto);
293 			df_proto = NULL;
294 			free(df_provider);
295 			df_provider = NULL;
296 
297 			allflag = 1;
298 			opt_cnt++;
299 			break;
300 
301 		case 'c':
302 			max_conns_allowed = atoi(optarg);
303 			break;
304 
305 		case 'p':
306 			proto = optarg;
307 			df_allflag = 0;
308 			opt_cnt++;
309 			break;
310 
311 		/*
312 		 * DSS: NFSv4 distributed stable storage.
313 		 *
314 		 * This is a Contracted Project Private interface, for
315 		 * the sole use of Sun Cluster HA-NFS. See PSARC/2006/313.
316 		 */
317 		case 's':
318 			if (strlen(optarg) < MAXPATHLEN) {
319 				/* first "-s" option encountered? */
320 				if (dss_pathnames == NULL) {
321 					/*
322 					 * Allocate maximum possible space
323 					 * required given cmdline arg count;
324 					 * "-s <path>" consumes two args.
325 					 */
326 					size_t sz = (ac / 2) * sizeof (char *);
327 					dss_pathnames = (char **)malloc(sz);
328 					if (dss_pathnames == NULL) {
329 						(void) fprintf(stderr, "%s: "
330 						    "dss paths malloc failed\n",
331 						    av[0]);
332 						exit(1);
333 					}
334 					(void) memset(dss_pathnames, 0, sz);
335 				}
336 				dss_pathnames[dss_npaths] = optarg;
337 				dss_npaths++;
338 			} else {
339 				(void) fprintf(stderr,
340 				    "%s: -s pathname too long.\n", av[0]);
341 			}
342 			break;
343 
344 		case 't':
345 			provider = optarg;
346 			df_allflag = 0;
347 			opt_cnt++;
348 			break;
349 
350 		case 'l':
351 			listen_backlog = atoi(optarg);
352 			break;
353 
354 		case '?':
355 			usage();
356 			/* NOTREACHED */
357 		}
358 	}
359 
360 	allflag = df_allflag;
361 	if (proto == NULL)
362 		proto = df_proto;
363 	if (provider == NULL)
364 		provider = df_provider;
365 
366 	/*
367 	 * Conflict options error messages.
368 	 */
369 	if (opt_cnt > 1) {
370 		(void) fprintf(stderr, "\nConflicting options, only one of "
371 		    "the following options can be specified\n"
372 		    "on the command line:\n"
373 		    "\t-a\n"
374 		    "\t-p protocol\n"
375 		    "\t-t transport\n\n");
376 		usage();
377 	}
378 
379 	if (proto != NULL &&
380 	    strncasecmp(proto, NC_UDP, strlen(NC_UDP)) == 0) {
381 		if (nfs_server_vers_max == NFS_V4) {
382 			if (nfs_server_vers_min == NFS_V4) {
383 				fprintf(stderr,
384 				    "NFS version 4 is not supported "
385 				    "with the UDP protocol.  Exiting\n");
386 				exit(3);
387 			} else {
388 				fprintf(stderr,
389 				    "NFS version 4 is not supported "
390 				    "with the UDP protocol.\n");
391 			}
392 		}
393 	}
394 
395 	/*
396 	 * If there is exactly one more argument, it is the number of
397 	 * servers.
398 	 */
399 	if (optind == ac - 1) {
400 		maxservers = atoi(av[optind]);
401 		maxservers_set = 1;
402 	}
403 	/*
404 	 * If there are two or more arguments, then this is a usage error.
405 	 */
406 	else if (optind < ac - 1)
407 		usage();
408 	/*
409 	 * Check the ranges for min/max version specified
410 	 */
411 	else if ((nfs_server_vers_min > nfs_server_vers_max) ||
412 	    (nfs_server_vers_min < NFS_VERSMIN) ||
413 	    (nfs_server_vers_max > NFS_VERSMAX))
414 		usage();
415 	/*
416 	 * There are no additional arguments, and we haven't set maxservers
417 	 * explicitly via the config file, we use a default number of
418 	 * servers.  We will log this.
419 	 */
420 	else if (maxservers_set == 0)
421 		logmaxservers = 1;
422 
423 	/*
424 	 * Basic Sanity checks on options
425 	 *
426 	 * max_conns_allowed must be positive, except for the special
427 	 * value of -1 which is used internally to mean unlimited, -1 isn't
428 	 * documented but we allow it anyway.
429 	 *
430 	 * maxservers must be positive
431 	 * listen_backlog must be positive or zero
432 	 */
433 	if (((max_conns_allowed != -1) && (max_conns_allowed <= 0)) ||
434 	    (listen_backlog < 0) || (maxservers <= 0)) {
435 		usage();
436 	}
437 
438 	/*
439 	 * Set current dir to server root
440 	 */
441 	if (chdir(dir) < 0) {
442 		(void) fprintf(stderr, "%s:  ", MyName);
443 		perror(dir);
444 		exit(1);
445 	}
446 
447 #ifndef DEBUG
448 	pipe_fd = daemonize_init();
449 #endif
450 
451 	openlog(MyName, LOG_PID | LOG_NDELAY, LOG_DAEMON);
452 
453 	/*
454 	 * establish our lock on the lock file and write our pid to it.
455 	 * exit if some other process holds the lock, or if there's any
456 	 * error in writing/locking the file.
457 	 */
458 	pid = _enter_daemon_lock(NFSD);
459 	switch (pid) {
460 	case 0:
461 		break;
462 	case -1:
463 		fprintf(stderr, "error locking for %s: %s\n", NFSD,
464 		    strerror(errno));
465 		exit(2);
466 	default:
467 		/* daemon was already running */
468 		exit(0);
469 	}
470 
471 	/*
472 	 * If we've been given a list of paths to be used for distributed
473 	 * stable storage, and provided we're going to run a version
474 	 * that supports it, setup the DSS paths.
475 	 */
476 	if (dss_pathnames != NULL && nfs_server_vers_max >= DSS_VERSMIN) {
477 		if (dss_init(dss_npaths, dss_pathnames) != 0) {
478 			fprintf(stderr, "%s", "dss_init failed. Exiting.\n");
479 			exit(1);
480 		}
481 	}
482 
483 	/*
484 	 * Block all signals till we spawn other
485 	 * threads.
486 	 */
487 	(void) sigfillset(&sgset);
488 	(void) thr_sigsetmask(SIG_BLOCK, &sgset, NULL);
489 
490 	if (logmaxservers) {
491 		fprintf(stderr,
492 		    "Number of servers not specified. Using default of %d.\n",
493 		    maxservers);
494 	}
495 
496 	/*
497 	 * Make sure to unregister any previous versions in case the
498 	 * user is reconfiguring the server in interesting ways.
499 	 */
500 	svc_unreg(NFS_PROGRAM, NFS_VERSION);
501 	svc_unreg(NFS_PROGRAM, NFS_V3);
502 	svc_unreg(NFS_PROGRAM, NFS_V4);
503 	svc_unreg(NFS_ACL_PROGRAM, NFS_ACL_V2);
504 	svc_unreg(NFS_ACL_PROGRAM, NFS_ACL_V3);
505 
506 	/*
507 	 * Set up kernel RPC thread pool for the NFS server.
508 	 */
509 	if (nfssvcpool(maxservers)) {
510 		fprintf(stderr, "Can't set up kernel NFS service: %s. "
511 		    "Exiting.\n", strerror(errno));
512 		exit(1);
513 	}
514 
515 	/*
516 	 * Set up blocked thread to do LWP creation on behalf of the kernel.
517 	 */
518 	if (svcwait(NFS_SVCPOOL_ID)) {
519 		fprintf(stderr, "Can't set up NFS pool creator: %s. Exiting.\n",
520 		    strerror(errno));
521 		exit(1);
522 	}
523 
524 	/*
525 	 * RDMA start and stop thread.
526 	 * Per pool RDMA listener creation and
527 	 * destructor thread.
528 	 *
529 	 * start rdma services and block in the kernel.
530 	 * (only if proto or provider is not set to TCP or UDP)
531 	 */
532 	if ((proto == NULL) && (provider == NULL)) {
533 		if (svcrdma(NFS_SVCPOOL_ID, nfs_server_vers_min,
534 		    nfs_server_vers_max, nfs_server_delegation)) {
535 			fprintf(stderr,
536 			    "Can't set up RDMA creator thread : %s\n",
537 			    strerror(errno));
538 		}
539 	}
540 
541 	/*
542 	 * Now open up for signal delivery
543 	 */
544 
545 	(void) thr_sigsetmask(SIG_UNBLOCK, &sgset, NULL);
546 	sigset(SIGTERM, sigflush);
547 	sigset(SIGUSR1, quiesce);
548 
549 	/*
550 	 * Build a protocol block list for registration.
551 	 */
552 	protobp0 = protobp = (struct protob *)malloc(sizeof (struct protob));
553 	protobp->serv = "NFS";
554 	protobp->versmin = nfs_server_vers_min;
555 	protobp->versmax = nfs_server_vers_max;
556 	protobp->program = NFS_PROGRAM;
557 
558 	protobp->next = (struct protob *)malloc(sizeof (struct protob));
559 	protobp = protobp->next;
560 	protobp->serv = "NFS_ACL";		/* not used */
561 	protobp->versmin = nfs_server_vers_min;
562 	/* XXX - this needs work to get the version just right */
563 	protobp->versmax = (nfs_server_vers_max > NFS_ACL_V3) ?
564 	    NFS_ACL_V3 : nfs_server_vers_max;
565 	protobp->program = NFS_ACL_PROGRAM;
566 	protobp->next = (struct protob *)NULL;
567 
568 	if (allflag) {
569 		if (do_all(protobp0, nfssvc) == -1) {
570 			fprintf(stderr, "setnetconfig failed : %s\n",
571 			    strerror(errno));
572 			exit(1);
573 		}
574 	} else if (proto) {
575 		/* there's more than one match for the same protocol */
576 		struct netconfig *nconf;
577 		NCONF_HANDLE *nc;
578 		bool_t	protoFound = FALSE;
579 		if ((nc = setnetconfig()) == (NCONF_HANDLE *) NULL) {
580 			fprintf(stderr, "setnetconfig failed : %s\n",
581 			    strerror(errno));
582 			goto done;
583 		}
584 		while (nconf = getnetconfig(nc)) {
585 			if (strcmp(nconf->nc_proto, proto) == 0) {
586 				protoFound = TRUE;
587 				do_one(nconf->nc_device, NULL,
588 				    protobp0, nfssvc);
589 			}
590 		}
591 		(void) endnetconfig(nc);
592 		if (protoFound == FALSE) {
593 			fprintf(stderr,
594 			    "couldn't find netconfig entry for protocol %s\n",
595 			    proto);
596 		}
597 	} else if (provider)
598 		do_one(provider, proto, protobp0, nfssvc);
599 	else {
600 		for (providerp = defaultproviders;
601 		    *providerp != NULL; providerp++) {
602 			provider = *providerp;
603 			do_one(provider, NULL, protobp0, nfssvc);
604 		}
605 	}
606 done:
607 
608 	free(protobp);
609 	free(protobp0);
610 
611 	if (num_fds == 0) {
612 		fprintf(stderr, "Could not start NFS service for any protocol."
613 		    " Exiting.\n");
614 		exit(1);
615 	}
616 
617 	end_listen_fds = num_fds;
618 
619 	/*
620 	 * nfsd is up and running as far as we are concerned.
621 	 */
622 	daemonize_fini(pipe_fd);
623 
624 	/*
625 	 * Get rid of unneeded privileges.
626 	 */
627 	__fini_daemon_priv(PRIV_PROC_FORK, PRIV_PROC_EXEC, PRIV_PROC_SESSION,
628 	    PRIV_FILE_LINK_ANY, PRIV_PROC_INFO, (char *)NULL);
629 
630 	/*
631 	 * Poll for non-data control events on the transport descriptors.
632 	 */
633 	poll_for_action();
634 
635 	/*
636 	 * If we get here, something failed in poll_for_action().
637 	 */
638 	return (1);
639 }
640 
641 static int
642 nfssvcpool(int maxservers)
643 {
644 	struct svcpool_args npa;
645 
646 	npa.id = NFS_SVCPOOL_ID;
647 	npa.maxthreads = maxservers;
648 	npa.redline = 0;
649 	npa.qsize = 0;
650 	npa.timeout = 0;
651 	npa.stksize = 0;
652 	npa.max_same_xprt = 0;
653 	return (_nfssys(SVCPOOL_CREATE, &npa));
654 }
655 
656 /*
657  * Establish NFS service thread.
658  */
659 static int
660 nfssvc(int fd, struct netbuf addrmask, struct netconfig *nconf)
661 {
662 	struct nfs_svc_args nsa;
663 
664 	nsa.fd = fd;
665 	nsa.netid = nconf->nc_netid;
666 	nsa.addrmask = addrmask;
667 	if (strncasecmp(nconf->nc_proto, NC_UDP, strlen(NC_UDP)) == 0) {
668 		nsa.versmax = (nfs_server_vers_max > NFS_V3) ?
669 		    NFS_V3 : nfs_server_vers_max;
670 		nsa.versmin = nfs_server_vers_min;
671 		/*
672 		 * If no version left, silently do nothing, previous
673 		 * checks will have assured at least TCP is available.
674 		 */
675 		if (nsa.versmin > nsa.versmax)
676 			return (0);
677 	} else {
678 		nsa.versmax = nfs_server_vers_max;
679 		nsa.versmin = nfs_server_vers_min;
680 	}
681 	nsa.delegation = nfs_server_delegation;
682 	return (_nfssys(NFS_SVC, &nsa));
683 }
684 
685 static void
686 usage(void)
687 {
688 	(void) fprintf(stderr,
689 "usage: %s [ -a ] [ -c max_conns ] [ -p protocol ] [ -t transport ] ", MyName);
690 	(void) fprintf(stderr, "\n[ -l listen_backlog ] [ nservers ]\n");
691 	(void) fprintf(stderr,
692 "\twhere -a causes <nservers> to be started on each appropriate transport,\n");
693 	(void) fprintf(stderr,
694 "\tmax_conns is the maximum number of concurrent connections allowed,\n");
695 	(void) fprintf(stderr, "\t\tand max_conns must be a decimal number");
696 	(void) fprintf(stderr, "> zero,\n");
697 	(void) fprintf(stderr, "\tprotocol is a protocol identifier,\n");
698 	(void) fprintf(stderr,
699 	    "\ttransport is a transport provider name (i.e. device),\n");
700 	(void) fprintf(stderr,
701 	    "\tlisten_backlog is the TCP listen backlog,\n");
702 	(void) fprintf(stderr,
703 	    "\tand <nservers> must be a decimal number > zero.\n");
704 	exit(1);
705 }
706 
707 /*
708  * Issue nfssys system call to flush all logging buffers asynchronously.
709  *
710  * NOTICE: It is extremely important to flush NFS logging buffers when
711  *	   nfsd exits. When the system is halted or rebooted nfslogd
712  *	   may not have an opportunity to flush the buffers.
713  */
714 static void
715 nfsl_flush()
716 {
717 	struct nfsl_flush_args nfa;
718 
719 	memset((void *)&nfa, 0, sizeof (nfa));
720 	nfa.version = NFSL_FLUSH_ARGS_VERS;
721 	nfa.directive = NFSL_ALL;	/* flush all asynchronously */
722 
723 	if (_nfssys(LOG_FLUSH, &nfa) < 0)
724 		syslog(LOG_ERR, "_nfssys(LOG_FLUSH) failed: %s\n",
725 		    strerror(errno));
726 }
727 
/*
 * Handler installed for SIGTERM.
 *
 * Asks the kernel to flush the NFS logging buffers, then terminates the
 * process with status 0.  The signal number itself is not examined.
 */
/* ARGSUSED */
static void
sigflush(int sig)
{
	nfsl_flush();

	/* leave immediately via _exit() */
	_exit(0);
}
738 
739 /*
740  * SIGUSR1 handler.
741  *
742  * Request that server quiesce, then (nfsd) exit. For subsequent warm start.
743  *
744  * This is a Contracted Project Private interface, for the sole use
745  * of Sun Cluster HA-NFS. See PSARC/2004/497.
746  *
747  * Equivalent to SIGTERM handler if nfs_server_vers_max < QUIESCE_VERSMIN.
748  */
749 static void
750 quiesce(int sig)
751 {
752 	int error;
753 	int id = NFS_SVCPOOL_ID;
754 
755 	if (nfs_server_vers_max >= QUIESCE_VERSMIN) {
756 		/* Request server quiesce at next shutdown */
757 		error = _nfssys(NFS4_SVC_REQUEST_QUIESCE, &id);
758 
759 		/*
760 		 * ENOENT is returned if there is no matching SVC pool
761 		 * for the id. Possibly because the pool is not yet setup.
762 		 * In this case, just exit as if no error. For all other errors,
763 		 * just return and allow caller to retry.
764 		 */
765 		if (error && errno != ENOENT) {
766 			syslog(LOG_ERR,
767 			    "_nfssys(NFS4_SVC_REQUEST_QUIESCE) failed: %s",
768 			    strerror(errno));
769 			return;
770 		}
771 	}
772 
773 	/* Flush logging buffers */
774 	nfsl_flush();
775 
776 	_exit(0);
777 }
778 
779 /*
780  * DSS: distributed stable storage.
781  * Create leaf directories as required, keeping an eye on path
782  * lengths. Calls exit(1) on failure.
783  * The pathnames passed in must already exist, and must be writeable by nfsd.
784  * Note: the leaf directories under NFS4_VAR_DIR are not created here;
785  * they're created at pkg install.
786  */
787 static void
788 dss_mkleafdirs(uint_t npaths, char **pathnames)
789 {
790 	int i;
791 	char *tmppath = NULL;
792 
793 	/*
794 	 * Create the temporary storage used by dss_mkleafdir() here,
795 	 * rather than in that function, so that it only needs to be
796 	 * done once, rather than once for each call. Too big to put
797 	 * on the function's stack.
798 	 */
799 	tmppath = (char *)malloc(MAXPATHLEN);
800 	if (tmppath == NULL) {
801 		syslog(LOG_ERR, "tmppath malloc failed. Exiting");
802 		exit(1);
803 	}
804 
805 	for (i = 0; i < npaths; i++) {
806 		char *p = pathnames[i];
807 
808 		dss_mkleafdir(p, NFS4_DSS_STATE_LEAF, tmppath);
809 		dss_mkleafdir(p, NFS4_DSS_OLDSTATE_LEAF, tmppath);
810 	}
811 
812 	free(tmppath);
813 }
814 
815 /*
816  * Create "leaf" in "dir" (which must already exist).
817  * leaf: should start with a '/'
818  */
819 static void
820 dss_mkleafdir(char *dir, char *leaf, char *tmppath)
821 {
822 	/* MAXPATHLEN includes the terminating NUL */
823 	if (strlen(dir) + strlen(leaf) > MAXPATHLEN - 1) {
824 		fprintf(stderr, "stable storage path too long: %s%s. "
825 		    "Exiting.\n", dir, leaf);
826 		exit(1);
827 	}
828 
829 	(void) snprintf(tmppath, MAXPATHLEN, "%s/%s", dir, leaf);
830 
831 	/* the directory may already exist: that's OK */
832 	if (mkdir(tmppath, NFS4_DSS_DIR_MODE) == -1 && errno != EEXIST) {
833 		fprintf(stderr, "error creating stable storage directory: "
834 		    "%s: %s. Exiting.\n", strerror(errno), tmppath);
835 		exit(1);
836 	}
837 }
838 
839 /*
840  * Create the storage dirs, and pass the path list to the kernel.
841  * This requires the nfssrv module to be loaded; the _nfssys() syscall
842  * will fail ENOTSUP if it is not.
843  * Use libnvpair(3LIB) to pass the data to the kernel.
844  */
845 static int
846 dss_init(uint_t npaths, char **pathnames)
847 {
848 	int i, j, nskipped, error;
849 	char *bufp;
850 	uint32_t bufsize;
851 	size_t buflen;
852 	nvlist_t *nvl;
853 
854 	if (npaths > 1) {
855 		/*
856 		 * We need to remove duplicate paths; this might be user error
857 		 * in the general case, but HA-NFSv4 can also cause this.
858 		 * Sort the pathnames array, and NULL out duplicates,
859 		 * then write the non-NULL entries to a new array.
860 		 * Sorting will also allow the kernel to optimise its searches.
861 		 */
862 
863 		qsort(pathnames, npaths, sizeof (char *), qstrcmp);
864 
865 		/* now NULL out any duplicates */
866 		i = 0; j = 1; nskipped = 0;
867 		while (j < npaths) {
868 			if (strcmp(pathnames[i], pathnames[j]) == 0) {
869 				pathnames[j] = NULL;
870 				j++;
871 				nskipped++;
872 				continue;
873 			}
874 
875 			/* skip i over any of its NULLed duplicates */
876 			i = j++;
877 		}
878 
879 		/* finally, write the non-NULL entries to a new array */
880 		if (nskipped > 0) {
881 			int nreal;
882 			size_t sz;
883 			char **tmp_pathnames;
884 
885 			nreal = npaths - nskipped;
886 
887 			sz = nreal * sizeof (char *);
888 			tmp_pathnames = (char **)malloc(sz);
889 			if (tmp_pathnames == NULL) {
890 				fprintf(stderr, "tmp_pathnames malloc "
891 				    "failed\n");
892 				exit(1);
893 			}
894 
895 			for (i = 0, j = 0; i < npaths; i++)
896 				if (pathnames[i] != NULL)
897 					tmp_pathnames[j++] = pathnames[i];
898 			free(pathnames);
899 			pathnames = tmp_pathnames;
900 			npaths = nreal;
901 		}
902 
903 	}
904 
905 	/* Create directories to store the distributed state files */
906 	dss_mkleafdirs(npaths, pathnames);
907 
908 	/* Create the name-value pair list */
909 	error = nvlist_alloc(&nvl, NV_UNIQUE_NAME, 0);
910 	if (error) {
911 		fprintf(stderr, "nvlist_alloc failed: %s\n", strerror(errno));
912 		return (1);
913 	}
914 
915 	/* Add the pathnames array as a single name-value pair */
916 	error = nvlist_add_string_array(nvl, NFS4_DSS_NVPAIR_NAME,
917 	    pathnames, npaths);
918 	if (error) {
919 		fprintf(stderr, "nvlist_add_string_array failed: %s\n",
920 		    strerror(errno));
921 		nvlist_free(nvl);
922 		return (1);
923 	}
924 
925 	/*
926 	 * Pack list into contiguous memory, for passing to kernel.
927 	 * nvlist_pack() will allocate the memory for the buffer,
928 	 * which we should free() when no longer needed.
929 	 * NV_ENCODE_XDR for safety across ILP32/LP64 kernel boundary.
930 	 */
931 	bufp = NULL;
932 	error = nvlist_pack(nvl, &bufp, &buflen, NV_ENCODE_XDR, 0);
933 	if (error) {
934 		fprintf(stderr, "nvlist_pack failed: %s\n", strerror(errno));
935 		nvlist_free(nvl);
936 		return (1);
937 	}
938 
939 	/* Now we have the packed buffer, we no longer need the list */
940 	nvlist_free(nvl);
941 
942 	/*
943 	 * Let the kernel know in advance how big the buffer is.
944 	 * NOTE: we cannot just pass buflen, since size_t is a long, and
945 	 * thus a different size between ILP32 userland and LP64 kernel.
946 	 * Use an int for the transfer, since that should be big enough;
947 	 * this is a no-op at the moment, here, since nfsd is 32-bit, but
948 	 * that could change.
949 	 */
950 	bufsize = (uint32_t)buflen;
951 	error = _nfssys(NFS4_DSS_SETPATHS_SIZE, &bufsize);
952 	if (error) {
953 		fprintf(stderr,
954 		    "_nfssys(NFS4_DSS_SETPATHS_SIZE) failed: %s\n",
955 		    strerror(errno));
956 		free(bufp);
957 		return (1);
958 	}
959 
960 	/* Pass the packed buffer to the kernel */
961 	error = _nfssys(NFS4_DSS_SETPATHS, bufp);
962 	if (error) {
963 		fprintf(stderr,
964 		    "_nfssys(NFS4_DSS_SETPATHS) failed: %s\n", strerror(errno));
965 		free(bufp);
966 		return (1);
967 	}
968 
969 	/*
970 	 * The kernel has now unpacked the buffer and extracted the
971 	 * pathnames array, we no longer need the buffer.
972 	 */
973 	free(bufp);
974 
975 	return (0);
976 }
977 
/*
 * String comparison routine with the signature qsort() expects.
 *
 * Each argument points at an array element, i.e. at a pointer to a
 * NUL-terminated string; the two strings are compared with strcmp()
 * and its result is returned unchanged.
 */
int
qstrcmp(const void *p1, const void *p2)
{
	const char *const *lhs = p1;
	const char *const *rhs = p2;

	return (strcmp(*lhs, *rhs));
}
990