xref: /titanic_41/usr/src/cmd/fs.d/nfs/nfsd/nfsd.c (revision 3db3f65c6274eb042354801a308c8e9bc4994553)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 /*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T		*/
27 /*	  All Rights Reserved  	*/
28 
29 /*
30  * University Copyright- Copyright (c) 1982, 1986, 1988
31  * The Regents of the University of California
32  * All Rights Reserved
33  *
34  * University Acknowledgment- Portions of this document are derived from
35  * software developed by the University of California, Berkeley, and its
36  * contributors.
37  */
38 
39 /* LINTLIBRARY */
40 /* PROTOLIB1 */
41 
42 #pragma ident	"%Z%%M%	%I%	%E% SMI"
43 
44 /* NFS server */
45 
46 #include <sys/param.h>
47 #include <sys/types.h>
48 #include <sys/stat.h>
49 #include <syslog.h>
50 #include <tiuser.h>
51 #include <rpc/rpc.h>
52 #include <errno.h>
53 #include <thread.h>
54 #include <sys/resource.h>
55 #include <sys/time.h>
56 #include <sys/file.h>
57 #include <nfs/nfs.h>
58 #include <nfs/nfs_acl.h>
59 #include <nfs/nfssys.h>
60 #include <stdio.h>
61 #include <stdio_ext.h>
62 #include <stdlib.h>
63 #include <signal.h>
64 #include <netconfig.h>
65 #include <netdir.h>
66 #include <string.h>
67 #include <unistd.h>
68 #include <stropts.h>
69 #include <sys/tihdr.h>
70 #include <sys/wait.h>
71 #include <poll.h>
72 #include <priv_utils.h>
73 #include <sys/tiuser.h>
74 #include <netinet/tcp.h>
75 #include <deflt.h>
76 #include <rpcsvc/daemon_utils.h>
77 #include <rpcsvc/nfs4_prot.h>
78 #include <libnvpair.h>
79 #include "nfs_tbind.h"
80 #include "thrpool.h"
81 
82 /* quiesce requests will be ignored if nfs_server_vers_max < QUIESCE_VERSMIN */
83 #define	QUIESCE_VERSMIN	4
84 /* DSS: distributed stable storage */
85 #define	DSS_VERSMIN	4
86 
87 static	int	nfssvc(int, struct netbuf, struct netconfig *);
88 static	int	nfssvcpool(int maxservers);
89 static	int	dss_init(uint_t npaths, char **pathnames);
90 static	void	dss_mkleafdirs(uint_t npaths, char **pathnames);
91 static	void	dss_mkleafdir(char *dir, char *leaf, char *path);
92 static	void	usage(void);
93 int		qstrcmp(const void *s1, const void *s2);
94 
95 extern	int	_nfssys(int, void *);
96 
97 extern int	daemonize_init(void);
98 extern void	daemonize_fini(int fd);
99 
100 /* signal handlers */
101 static void sigflush(int);
102 static void quiesce(int);
103 
104 static	char	*MyName;
105 static	NETSELDECL(defaultproviders)[] = { "/dev/tcp6", "/dev/tcp", "/dev/udp",
106 					    "/dev/udp6", NULL };
107 /* static	NETSELDECL(defaultprotos)[] =	{ NC_UDP, NC_TCP, NULL }; */
108 /*
109  * The following are all globals used by routines in nfs_tbind.c.
110  */
111 size_t	end_listen_fds;		/* used by conn_close_oldest() */
112 size_t	num_fds = 0;		/* used by multiple routines */
113 int	listen_backlog = 32;	/* used by bind_to_{provider,proto}() */
114 int	num_servers;		/* used by cots_listen_event() */
115 int	(*Mysvc)(int, struct netbuf, struct netconfig *) = nfssvc;
116 				/* used by cots_listen_event() */
117 int	max_conns_allowed = -1;	/* used by cots_listen_event() */
118 
119 /*
120  * Keep track of min/max versions of NFS protocol to be started.
121  * Start with the defaults (min == 2, max == 3).  We have the
122  * capability of starting vers=4 but only if the user requests it.
123  */
124 int	nfs_server_vers_min = NFS_VERSMIN_DEFAULT;
125 int	nfs_server_vers_max = NFS_VERSMAX_DEFAULT;
126 
127 /*
128  * Set the default for server delegation enablement and set per
129  * /etc/default/nfs configuration (if present).
130  */
131 int	nfs_server_delegation = NFS_SERVER_DELEGATION_DEFAULT;
132 
133 int
134 main(int ac, char *av[])
135 {
136 	char *dir = "/";
137 	int allflag = 0;
138 	int df_allflag = 0;
139 	int opt_cnt = 0;
140 	int maxservers = 1;	/* zero allows inifinte number of threads */
141 	int maxservers_set = 0;
142 	int logmaxservers = 0;
143 	int pid;
144 	int i;
145 	char *provider = (char *)NULL;
146 	char *df_provider = (char *)NULL;
147 	struct protob *protobp0, *protobp;
148 	NETSELDECL(proto) = NULL;
149 	NETSELDECL(df_proto) = NULL;
150 	NETSELPDECL(providerp);
151 	char *defval;
152 	boolean_t can_do_mlp;
153 	uint_t dss_npaths = 0;
154 	char **dss_pathnames = NULL;
155 	sigset_t sgset;
156 
157 	int pipe_fd = -1;
158 
159 	MyName = *av;
160 
161 	/*
162 	 * Initializations that require more privileges than we need to run.
163 	 */
164 	(void) _create_daemon_lock(NFSD, DAEMON_UID, DAEMON_GID);
165 	svcsetprio();
166 
167 	can_do_mlp = priv_ineffect(PRIV_NET_BINDMLP);
168 	if (__init_daemon_priv(PU_RESETGROUPS|PU_CLEARLIMITSET,
169 	    DAEMON_UID, DAEMON_GID, PRIV_SYS_NFS,
170 	    can_do_mlp ? PRIV_NET_BINDMLP : NULL, NULL) == -1) {
171 		(void) fprintf(stderr, "%s should be run with"
172 		    " sufficient privileges\n", av[0]);
173 		exit(1);
174 	}
175 
176 	(void) enable_extended_FILE_stdio(-1, -1);
177 
178 	/*
179 	 * Read in the values from config file first before we check
180 	 * command line options so the options override the file.
181 	 */
182 	if ((defopen(NFSADMIN)) == 0) {
183 		if ((defval = defread("NFSD_MAX_CONNECTIONS=")) != NULL) {
184 			errno = 0;
185 			max_conns_allowed = strtol(defval, (char **)NULL, 10);
186 			if (errno != 0) {
187 				max_conns_allowed = -1;
188 			}
189 		}
190 		if ((defval = defread("NFSD_LISTEN_BACKLOG=")) != NULL) {
191 			errno = 0;
192 			listen_backlog = strtol(defval, (char **)NULL, 10);
193 			if (errno != 0) {
194 				listen_backlog = 32;
195 			}
196 		}
197 		if ((defval = defread("NFSD_PROTOCOL=")) != NULL) {
198 			df_proto = strdup(defval);
199 			opt_cnt++;
200 			if (strncasecmp("ALL", defval, 3) == 0) {
201 				free(df_proto);
202 				df_proto = NULL;
203 				df_allflag = 1;
204 			}
205 		}
206 		if ((defval = defread("NFSD_DEVICE=")) != NULL) {
207 			df_provider = strdup(defval);
208 			opt_cnt++;
209 		}
210 		if ((defval = defread("NFSD_SERVERS=")) != NULL) {
211 			errno = 0;
212 			maxservers = strtol(defval, (char **)NULL, 10);
213 			if (errno != 0) {
214 				maxservers = 1;
215 			} else {
216 				maxservers_set = 1;
217 			}
218 		}
219 		if ((defval = defread("NFS_SERVER_VERSMIN=")) != NULL) {
220 			errno = 0;
221 			nfs_server_vers_min =
222 			    strtol(defval, (char **)NULL, 10);
223 			if (errno != 0) {
224 				nfs_server_vers_min = NFS_VERSMIN_DEFAULT;
225 			}
226 		}
227 		if ((defval = defread("NFS_SERVER_VERSMAX=")) != NULL) {
228 			errno = 0;
229 			nfs_server_vers_max =
230 			    strtol(defval, (char **)NULL, 10);
231 			if (errno != 0) {
232 				nfs_server_vers_max = NFS_VERSMAX_DEFAULT;
233 			}
234 		}
235 		if ((defval = defread("NFS_SERVER_DELEGATION=")) != NULL) {
236 			if (strcmp(defval, "off") == 0) {
237 				nfs_server_delegation = FALSE;
238 			}
239 		}
240 
241 		/* close defaults file */
242 		defopen(NULL);
243 	}
244 
245 	/*
246 	 * Conflict options error messages.
247 	 */
248 	if (opt_cnt > 1) {
249 		(void) fprintf(stderr, "\nConflicting options, only one of "
250 		    "the following options can be specified\n"
251 		    "in " NFSADMIN ":\n"
252 		    "\tNFSD_PROTOCOL=ALL\n"
253 		    "\tNFSD_PROTOCOL=protocol\n"
254 		    "\tNFSD_DEVICE=device\n\n");
255 		usage();
256 	}
257 	opt_cnt = 0;
258 
259 	while ((i = getopt(ac, av, "ac:p:s:t:l:")) != EOF) {
260 		switch (i) {
261 		case 'a':
262 			free(df_proto);
263 			df_proto = NULL;
264 			free(df_provider);
265 			df_provider = NULL;
266 
267 			allflag = 1;
268 			opt_cnt++;
269 			break;
270 
271 		case 'c':
272 			max_conns_allowed = atoi(optarg);
273 			break;
274 
275 		case 'p':
276 			proto = optarg;
277 			df_allflag = 0;
278 			opt_cnt++;
279 			break;
280 
281 		/*
282 		 * DSS: NFSv4 distributed stable storage.
283 		 *
284 		 * This is a Contracted Project Private interface, for
285 		 * the sole use of Sun Cluster HA-NFS. See PSARC/2006/313.
286 		 */
287 		case 's':
288 			if (strlen(optarg) < MAXPATHLEN) {
289 				/* first "-s" option encountered? */
290 				if (dss_pathnames == NULL) {
291 					/*
292 					 * Allocate maximum possible space
293 					 * required given cmdline arg count;
294 					 * "-s <path>" consumes two args.
295 					 */
296 					size_t sz = (ac / 2) * sizeof (char *);
297 					dss_pathnames = (char **)malloc(sz);
298 					if (dss_pathnames == NULL) {
299 						(void) fprintf(stderr, "%s: "
300 						    "dss paths malloc failed\n",
301 						    av[0]);
302 						exit(1);
303 					}
304 					(void) memset(dss_pathnames, 0, sz);
305 				}
306 				dss_pathnames[dss_npaths] = optarg;
307 				dss_npaths++;
308 			} else {
309 				(void) fprintf(stderr,
310 				    "%s: -s pathname too long.\n", av[0]);
311 			}
312 			break;
313 
314 		case 't':
315 			provider = optarg;
316 			df_allflag = 0;
317 			opt_cnt++;
318 			break;
319 
320 		case 'l':
321 			listen_backlog = atoi(optarg);
322 			break;
323 
324 		case '?':
325 			usage();
326 			/* NOTREACHED */
327 		}
328 	}
329 
330 	allflag = df_allflag;
331 	if (proto == NULL)
332 		proto = df_proto;
333 	if (provider == NULL)
334 		provider = df_provider;
335 
336 	/*
337 	 * Conflict options error messages.
338 	 */
339 	if (opt_cnt > 1) {
340 		(void) fprintf(stderr, "\nConflicting options, only one of "
341 		    "the following options can be specified\n"
342 		    "on the command line:\n"
343 		    "\t-a\n"
344 		    "\t-p protocol\n"
345 		    "\t-t transport\n\n");
346 		usage();
347 	}
348 
349 	if (proto != NULL &&
350 	    strncasecmp(proto, NC_UDP, strlen(NC_UDP)) == 0) {
351 		if (nfs_server_vers_max == NFS_V4) {
352 			if (nfs_server_vers_min == NFS_V4) {
353 				fprintf(stderr,
354 				    "NFS version 4 is not supported "
355 				    "with the UDP protocol.  Exiting\n");
356 				exit(3);
357 			} else {
358 				fprintf(stderr,
359 				    "NFS version 4 is not supported "
360 				    "with the UDP protocol.\n");
361 			}
362 		}
363 	}
364 
365 	/*
366 	 * If there is exactly one more argument, it is the number of
367 	 * servers.
368 	 */
369 	if (optind == ac - 1) {
370 		maxservers = atoi(av[optind]);
371 		maxservers_set = 1;
372 	}
373 	/*
374 	 * If there are two or more arguments, then this is a usage error.
375 	 */
376 	else if (optind < ac - 1)
377 		usage();
378 	/*
379 	 * Check the ranges for min/max version specified
380 	 */
381 	else if ((nfs_server_vers_min > nfs_server_vers_max) ||
382 	    (nfs_server_vers_min < NFS_VERSMIN) ||
383 	    (nfs_server_vers_max > NFS_VERSMAX))
384 		usage();
385 	/*
386 	 * There are no additional arguments, and we haven't set maxservers
387 	 * explicitly via the config file, we use a default number of
388 	 * servers.  We will log this.
389 	 */
390 	else if (maxservers_set == 0)
391 		logmaxservers = 1;
392 
393 	/*
394 	 * Basic Sanity checks on options
395 	 *
396 	 * max_conns_allowed must be positive, except for the special
397 	 * value of -1 which is used internally to mean unlimited, -1 isn't
398 	 * documented but we allow it anyway.
399 	 *
400 	 * maxservers must be positive
401 	 * listen_backlog must be positive or zero
402 	 */
403 	if (((max_conns_allowed != -1) && (max_conns_allowed <= 0)) ||
404 	    (listen_backlog < 0) || (maxservers <= 0)) {
405 		usage();
406 	}
407 
408 	/*
409 	 * Set current dir to server root
410 	 */
411 	if (chdir(dir) < 0) {
412 		(void) fprintf(stderr, "%s:  ", MyName);
413 		perror(dir);
414 		exit(1);
415 	}
416 
417 #ifndef DEBUG
418 	pipe_fd = daemonize_init();
419 #endif
420 
421 	openlog(MyName, LOG_PID | LOG_NDELAY, LOG_DAEMON);
422 
423 	/*
424 	 * establish our lock on the lock file and write our pid to it.
425 	 * exit if some other process holds the lock, or if there's any
426 	 * error in writing/locking the file.
427 	 */
428 	pid = _enter_daemon_lock(NFSD);
429 	switch (pid) {
430 	case 0:
431 		break;
432 	case -1:
433 		fprintf(stderr, "error locking for %s: %s", NFSD,
434 		    strerror(errno));
435 		exit(2);
436 	default:
437 		/* daemon was already running */
438 		exit(0);
439 	}
440 
441 	/*
442 	 * If we've been given a list of paths to be used for distributed
443 	 * stable storage, and provided we're going to run a version
444 	 * that supports it, setup the DSS paths.
445 	 */
446 	if (dss_pathnames != NULL && nfs_server_vers_max >= DSS_VERSMIN) {
447 		if (dss_init(dss_npaths, dss_pathnames) != 0) {
448 			fprintf(stderr, "%s", "dss_init failed. Exiting.");
449 			exit(1);
450 		}
451 	}
452 
453 	/*
454 	 * Block all signals till we spawn other
455 	 * threads.
456 	 */
457 	(void) sigfillset(&sgset);
458 	(void) thr_sigsetmask(SIG_BLOCK, &sgset, NULL);
459 
460 	if (logmaxservers) {
461 		fprintf(stderr,
462 		    "Number of servers not specified. Using default of %d.",
463 		    maxservers);
464 	}
465 
466 	/*
467 	 * Make sure to unregister any previous versions in case the
468 	 * user is reconfiguring the server in interesting ways.
469 	 */
470 	svc_unreg(NFS_PROGRAM, NFS_VERSION);
471 	svc_unreg(NFS_PROGRAM, NFS_V3);
472 	svc_unreg(NFS_PROGRAM, NFS_V4);
473 	svc_unreg(NFS_ACL_PROGRAM, NFS_ACL_V2);
474 	svc_unreg(NFS_ACL_PROGRAM, NFS_ACL_V3);
475 
476 	/*
477 	 * Set up kernel RPC thread pool for the NFS server.
478 	 */
479 	if (nfssvcpool(maxservers)) {
480 		fprintf(stderr, "Can't set up kernel NFS service: %s. Exiting",
481 		    strerror(errno));
482 		exit(1);
483 	}
484 
485 	/*
486 	 * Set up blocked thread to do LWP creation on behalf of the kernel.
487 	 */
488 	if (svcwait(NFS_SVCPOOL_ID)) {
489 		fprintf(stderr, "Can't set up NFS pool creator: %s. Exiting",
490 		    strerror(errno));
491 		exit(1);
492 	}
493 
494 	/*
495 	 * RDMA start and stop thread.
496 	 * Per pool RDMA listener creation and
497 	 * destructor thread.
498 	 *
499 	 * start rdma services and block in the kernel.
500 	 */
501 	if (svcrdma(NFS_SVCPOOL_ID, nfs_server_vers_min, nfs_server_vers_max,
502 	    nfs_server_delegation)) {
503 		fprintf(stderr, "Can't set up RDMA creator thread : %s",
504 		    strerror(errno));
505 	}
506 
507 	/*
508 	 * Now open up for signal delivery
509 	 */
510 
511 	(void) thr_sigsetmask(SIG_UNBLOCK, &sgset, NULL);
512 	sigset(SIGTERM, sigflush);
513 	sigset(SIGUSR1, quiesce);
514 
515 	/*
516 	 * Build a protocol block list for registration.
517 	 */
518 	protobp0 = protobp = (struct protob *)malloc(sizeof (struct protob));
519 	protobp->serv = "NFS";
520 	protobp->versmin = nfs_server_vers_min;
521 	protobp->versmax = nfs_server_vers_max;
522 	protobp->program = NFS_PROGRAM;
523 
524 	protobp->next = (struct protob *)malloc(sizeof (struct protob));
525 	protobp = protobp->next;
526 	protobp->serv = "NFS_ACL";		/* not used */
527 	protobp->versmin = nfs_server_vers_min;
528 	/* XXX - this needs work to get the version just right */
529 	protobp->versmax = (nfs_server_vers_max > NFS_ACL_V3) ?
530 	    NFS_ACL_V3 : nfs_server_vers_max;
531 	protobp->program = NFS_ACL_PROGRAM;
532 	protobp->next = (struct protob *)NULL;
533 
534 	if (allflag) {
535 		if (do_all(protobp0, nfssvc, 0) == -1) {
536 			fprintf(stderr, "setnetconfig failed : %s",
537 			    strerror(errno));
538 			exit(1);
539 		}
540 	} else if (proto) {
541 		/* there's more than one match for the same protocol */
542 		struct netconfig *nconf;
543 		NCONF_HANDLE *nc;
544 		bool_t	protoFound = FALSE;
545 		if ((nc = setnetconfig()) == (NCONF_HANDLE *) NULL) {
546 			fprintf(stderr, "setnetconfig failed : %s",
547 			    strerror(errno));
548 			goto done;
549 		}
550 		while (nconf = getnetconfig(nc)) {
551 			if (strcmp(nconf->nc_proto, proto) == 0) {
552 				protoFound = TRUE;
553 				do_one(nconf->nc_device, NULL,
554 				    protobp0, nfssvc, 0);
555 			}
556 		}
557 		(void) endnetconfig(nc);
558 		if (protoFound == FALSE) {
559 			fprintf(stderr,
560 			    "couldn't find netconfig entry for protocol %s",
561 			    proto);
562 		}
563 	} else if (provider)
564 		do_one(provider, proto, protobp0, nfssvc, 0);
565 	else {
566 		for (providerp = defaultproviders;
567 		    *providerp != NULL; providerp++) {
568 			provider = *providerp;
569 			do_one(provider, NULL, protobp0, nfssvc, 0);
570 		}
571 	}
572 done:
573 
574 	free(protobp);
575 	free(protobp0);
576 
577 	if (num_fds == 0) {
578 		fprintf(stderr, "Could not start NFS service for any protocol."
579 		    " Exiting");
580 		exit(1);
581 	}
582 
583 	end_listen_fds = num_fds;
584 
585 	/*
586 	 * nfsd is up and running as far as we are concerned.
587 	 */
588 	daemonize_fini(pipe_fd);
589 
590 	/*
591 	 * Get rid of unneeded privileges.
592 	 */
593 	__fini_daemon_priv(PRIV_PROC_FORK, PRIV_PROC_EXEC, PRIV_PROC_SESSION,
594 	    PRIV_FILE_LINK_ANY, PRIV_PROC_INFO, (char *)NULL);
595 
596 	/*
597 	 * Poll for non-data control events on the transport descriptors.
598 	 */
599 	poll_for_action();
600 
601 	/*
602 	 * If we get here, something failed in poll_for_action().
603 	 */
604 	return (1);
605 }
606 
607 static int
608 nfssvcpool(int maxservers)
609 {
610 	struct svcpool_args npa;
611 
612 	npa.id = NFS_SVCPOOL_ID;
613 	npa.maxthreads = maxservers;
614 	npa.redline = 0;
615 	npa.qsize = 0;
616 	npa.timeout = 0;
617 	npa.stksize = 0;
618 	npa.max_same_xprt = 0;
619 	return (_nfssys(SVCPOOL_CREATE, &npa));
620 }
621 
622 /*
623  * Establish NFS service thread.
624  */
625 static int
626 nfssvc(int fd, struct netbuf addrmask, struct netconfig *nconf)
627 {
628 	struct nfs_svc_args nsa;
629 
630 	nsa.fd = fd;
631 	nsa.netid = nconf->nc_netid;
632 	nsa.addrmask = addrmask;
633 	if (strncasecmp(nconf->nc_proto, NC_UDP, strlen(NC_UDP)) == 0) {
634 		nsa.versmax = (nfs_server_vers_max > NFS_V3) ?
635 		    NFS_V3 : nfs_server_vers_max;
636 		nsa.versmin = nfs_server_vers_min;
637 		/*
638 		 * If no version left, silently do nothing, previous
639 		 * checks will have assured at least TCP is available.
640 		 */
641 		if (nsa.versmin > nsa.versmax)
642 			return (0);
643 	} else {
644 		nsa.versmax = nfs_server_vers_max;
645 		nsa.versmin = nfs_server_vers_min;
646 	}
647 	nsa.delegation = nfs_server_delegation;
648 	return (_nfssys(NFS_SVC, &nsa));
649 }
650 
651 static void
652 usage(void)
653 {
654 	(void) fprintf(stderr,
655 "usage: %s [ -a ] [ -c max_conns ] [ -p protocol ] [ -t transport ] ", MyName);
656 	(void) fprintf(stderr, "\n[ -l listen_backlog ] [ nservers ]\n");
657 	(void) fprintf(stderr,
658 "\twhere -a causes <nservers> to be started on each appropriate transport,\n");
659 	(void) fprintf(stderr,
660 "\tmax_conns is the maximum number of concurrent connections allowed,\n");
661 	(void) fprintf(stderr, "\t\tand max_conns must be a decimal number");
662 	(void) fprintf(stderr, "> zero,\n");
663 	(void) fprintf(stderr, "\tprotocol is a protocol identifier,\n");
664 	(void) fprintf(stderr,
665 	    "\ttransport is a transport provider name (i.e. device),\n");
666 	(void) fprintf(stderr,
667 	    "\tlisten_backlog is the TCP listen backlog,\n");
668 	(void) fprintf(stderr,
669 	    "\tand <nservers> must be a decimal number > zero.\n");
670 	exit(1);
671 }
672 
673 /*
674  * Issue nfssys system call to flush all logging buffers asynchronously.
675  *
676  * NOTICE: It is extremely important to flush NFS logging buffers when
677  *	   nfsd exits. When the system is halted or rebooted nfslogd
678  *	   may not have an opportunity to flush the buffers.
679  */
680 static void
681 nfsl_flush()
682 {
683 	struct nfsl_flush_args nfa;
684 
685 	memset((void *)&nfa, 0, sizeof (nfa));
686 	nfa.version = NFSL_FLUSH_ARGS_VERS;
687 	nfa.directive = NFSL_ALL;	/* flush all asynchronously */
688 
689 	if (_nfssys(LOG_FLUSH, &nfa) < 0)
690 		syslog(LOG_ERR, "_nfssys(LOG_FLUSH) failed: %s\n",
691 		    strerror(errno));
692 }
693 
/*
 * SIGTERM handler.
 *
 * Flushes the NFS logging buffers via nfsl_flush() and then terminates
 * the process immediately with status 0.  The signal number is unused.
 */
/* ARGSUSED */
static void
sigflush(int sig)
{
	nfsl_flush();
	_exit(0);
}
704 
705 /*
706  * SIGUSR1 handler.
707  *
708  * Request that server quiesce, then (nfsd) exit. For subsequent warm start.
709  *
710  * This is a Contracted Project Private interface, for the sole use
711  * of Sun Cluster HA-NFS. See PSARC/2004/497.
712  *
713  * Equivalent to SIGTERM handler if nfs_server_vers_max < QUIESCE_VERSMIN.
714  */
715 static void
716 quiesce(int sig)
717 {
718 	int error;
719 	int id = NFS_SVCPOOL_ID;
720 
721 	if (nfs_server_vers_max >= QUIESCE_VERSMIN) {
722 		/* Request server quiesce at next shutdown */
723 		error = _nfssys(NFS4_SVC_REQUEST_QUIESCE, &id);
724 
725 		/*
726 		 * ENOENT is returned if there is no matching SVC pool
727 		 * for the id. Possibly because the pool is not yet setup.
728 		 * In this case, just exit as if no error. For all other errors,
729 		 * just return and allow caller to retry.
730 		 */
731 		if (error && errno != ENOENT) {
732 			syslog(LOG_ERR,
733 			    "_nfssys(NFS4_SVC_REQUEST_QUIESCE) failed: %s",
734 			    strerror(errno));
735 			return;
736 		}
737 	}
738 
739 	/* Flush logging buffers */
740 	nfsl_flush();
741 
742 	_exit(0);
743 }
744 
745 /*
746  * DSS: distributed stable storage.
747  * Create leaf directories as required, keeping an eye on path
748  * lengths. Calls exit(1) on failure.
749  * The pathnames passed in must already exist, and must be writeable by nfsd.
750  * Note: the leaf directories under NFS4_VAR_DIR are not created here;
751  * they're created at pkg install.
752  */
753 static void
754 dss_mkleafdirs(uint_t npaths, char **pathnames)
755 {
756 	int i;
757 	char *tmppath = NULL;
758 
759 	/*
760 	 * Create the temporary storage used by dss_mkleafdir() here,
761 	 * rather than in that function, so that it only needs to be
762 	 * done once, rather than once for each call. Too big to put
763 	 * on the function's stack.
764 	 */
765 	tmppath = (char *)malloc(MAXPATHLEN);
766 	if (tmppath == NULL) {
767 		syslog(LOG_ERR, "tmppath malloc failed. Exiting");
768 		exit(1);
769 	}
770 
771 	for (i = 0; i < npaths; i++) {
772 		char *p = pathnames[i];
773 
774 		dss_mkleafdir(p, NFS4_DSS_STATE_LEAF, tmppath);
775 		dss_mkleafdir(p, NFS4_DSS_OLDSTATE_LEAF, tmppath);
776 	}
777 
778 	free(tmppath);
779 }
780 
781 /*
782  * Create "leaf" in "dir" (which must already exist).
783  * leaf: should start with a '/'
784  */
785 static void
786 dss_mkleafdir(char *dir, char *leaf, char *tmppath)
787 {
788 	/* MAXPATHLEN includes the terminating NUL */
789 	if (strlen(dir) + strlen(leaf) > MAXPATHLEN - 1) {
790 		fprintf(stderr, "stable storage path too long: %s%s. Exiting",
791 		    dir, leaf);
792 		exit(1);
793 	}
794 
795 	(void) snprintf(tmppath, MAXPATHLEN, "%s/%s", dir, leaf);
796 
797 	/* the directory may already exist: that's OK */
798 	if (mkdir(tmppath, NFS4_DSS_DIR_MODE) == -1 && errno != EEXIST) {
799 		fprintf(stderr, "error creating stable storage directory: "
800 		    "%s: %s. Exiting", strerror(errno), tmppath);
801 		exit(1);
802 	}
803 }
804 
805 /*
806  * Create the storage dirs, and pass the path list to the kernel.
807  * This requires the nfssrv module to be loaded; the _nfssys() syscall
808  * will fail ENOTSUP if it is not.
809  * Use libnvpair(3LIB) to pass the data to the kernel.
810  */
811 static int
812 dss_init(uint_t npaths, char **pathnames)
813 {
814 	int i, j, nskipped, error;
815 	char *bufp;
816 	uint32_t bufsize;
817 	size_t buflen;
818 	nvlist_t *nvl;
819 
820 	if (npaths > 1) {
821 		/*
822 		 * We need to remove duplicate paths; this might be user error
823 		 * in the general case, but HA-NFSv4 can also cause this.
824 		 * Sort the pathnames array, and NULL out duplicates,
825 		 * then write the non-NULL entries to a new array.
826 		 * Sorting will also allow the kernel to optimise its searches.
827 		 */
828 
829 		qsort(pathnames, npaths, sizeof (char *), qstrcmp);
830 
831 		/* now NULL out any duplicates */
832 		i = 0; j = 1; nskipped = 0;
833 		while (j < npaths) {
834 			if (strcmp(pathnames[i], pathnames[j]) == NULL) {
835 				pathnames[j] = NULL;
836 				j++;
837 				nskipped++;
838 				continue;
839 			}
840 
841 			/* skip i over any of its NULLed duplicates */
842 			i = j++;
843 		}
844 
845 		/* finally, write the non-NULL entries to a new array */
846 		if (nskipped > 0) {
847 			int nreal;
848 			size_t sz;
849 			char **tmp_pathnames;
850 
851 			nreal = npaths - nskipped;
852 
853 			sz = nreal * sizeof (char *);
854 			tmp_pathnames = (char **)malloc(sz);
855 			if (tmp_pathnames == NULL) {
856 				fprintf(stderr, "tmp_pathnames malloc failed");
857 				exit(1);
858 			}
859 
860 			for (i = 0, j = 0; i < npaths; i++)
861 				if (pathnames[i] != NULL)
862 					tmp_pathnames[j++] = pathnames[i];
863 			free(pathnames);
864 			pathnames = tmp_pathnames;
865 			npaths = nreal;
866 		}
867 
868 	}
869 
870 	/* Create directories to store the distributed state files */
871 	dss_mkleafdirs(npaths, pathnames);
872 
873 	/* Create the name-value pair list */
874 	error = nvlist_alloc(&nvl, NV_UNIQUE_NAME, 0);
875 	if (error) {
876 		fprintf(stderr, "nvlist_alloc failed: %s.", strerror(errno));
877 		return (1);
878 	}
879 
880 	/* Add the pathnames array as a single name-value pair */
881 	error = nvlist_add_string_array(nvl, NFS4_DSS_NVPAIR_NAME,
882 	    pathnames, npaths);
883 	if (error) {
884 		fprintf(stderr, "nvlist_add_string_array failed: %s.",
885 		    strerror(errno));
886 		nvlist_free(nvl);
887 		return (1);
888 	}
889 
890 	/*
891 	 * Pack list into contiguous memory, for passing to kernel.
892 	 * nvlist_pack() will allocate the memory for the buffer,
893 	 * which we should free() when no longer needed.
894 	 * NV_ENCODE_XDR for safety across ILP32/LP64 kernel boundary.
895 	 */
896 	bufp = NULL;
897 	error = nvlist_pack(nvl, &bufp, &buflen, NV_ENCODE_XDR, 0);
898 	if (error) {
899 		fprintf(stderr, "nvlist_pack failed: %s.", strerror(errno));
900 		nvlist_free(nvl);
901 		return (1);
902 	}
903 
904 	/* Now we have the packed buffer, we no longer need the list */
905 	nvlist_free(nvl);
906 
907 	/*
908 	 * Let the kernel know in advance how big the buffer is.
909 	 * NOTE: we cannot just pass buflen, since size_t is a long, and
910 	 * thus a different size between ILP32 userland and LP64 kernel.
911 	 * Use an int for the transfer, since that should be big enough;
912 	 * this is a no-op at the moment, here, since nfsd is 32-bit, but
913 	 * that could change.
914 	 */
915 	bufsize = (uint32_t)buflen;
916 	error = _nfssys(NFS4_DSS_SETPATHS_SIZE, &bufsize);
917 	if (error) {
918 		fprintf(stderr,
919 		    "_nfssys(NFS4_DSS_SETPATHS_SIZE) failed: %s. ",
920 		    strerror(errno));
921 		free(bufp);
922 		return (1);
923 	}
924 
925 	/* Pass the packed buffer to the kernel */
926 	error = _nfssys(NFS4_DSS_SETPATHS, bufp);
927 	if (error) {
928 		fprintf(stderr,
929 		    "_nfssys(NFS4_DSS_SETPATHS) failed: %s. ", strerror(errno));
930 		free(bufp);
931 		return (1);
932 	}
933 
934 	/*
935 	 * The kernel has now unpacked the buffer and extracted the
936 	 * pathnames array, we no longer need the buffer.
937 	 */
938 	free(bufp);
939 
940 	return (0);
941 }
942 
/*
 * Quick sort string compare routine, for qsort.
 * Needed to make arg types correct.
 *
 * p1, p2: pointers to two elements of an array of C strings, i.e. each
 *         points at a "char *".
 *
 * Returns the strcmp() ordering of the two strings.  The elements are
 * only read, so the const qualifier of the arguments is preserved.
 */
int
qstrcmp(const void *p1, const void *p2)
{
	const char *const *lhs = p1;
	const char *const *rhs = p2;

	return (strcmp(*lhs, *rhs));
}
955