xref: /titanic_52/usr/src/cmd/fs.d/nfs/nfsd/nfsd.c (revision ff19e029e81c950f4e0f40f1f1ee1f7d8f8d8041)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright (c) 1989, 2010, Oracle and/or its affiliates. All rights reserved.
23  */
24 
25 /*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T		*/
26 /*	  All Rights Reserved  	*/
27 
28 /*
29  * University Copyright- Copyright (c) 1982, 1986, 1988
30  * The Regents of the University of California
31  * All Rights Reserved
32  *
33  * University Acknowledgment- Portions of this document are derived from
34  * software developed by the University of California, Berkeley, and its
35  * contributors.
36  */
37 
38 /* LINTLIBRARY */
39 /* PROTOLIB1 */
40 
41 /* NFS server */
42 
43 #include <sys/param.h>
44 #include <sys/types.h>
45 #include <sys/stat.h>
46 #include <syslog.h>
47 #include <tiuser.h>
48 #include <rpc/rpc.h>
49 #include <errno.h>
50 #include <thread.h>
51 #include <sys/resource.h>
52 #include <sys/time.h>
53 #include <sys/file.h>
54 #include <nfs/nfs.h>
55 #include <nfs/nfs_acl.h>
56 #include <nfs/nfssys.h>
57 #include <stdio.h>
58 #include <stdio_ext.h>
59 #include <stdlib.h>
60 #include <signal.h>
61 #include <netconfig.h>
62 #include <netdir.h>
63 #include <string.h>
64 #include <unistd.h>
65 #include <stropts.h>
66 #include <sys/tihdr.h>
67 #include <sys/wait.h>
68 #include <poll.h>
69 #include <priv_utils.h>
70 #include <sys/tiuser.h>
71 #include <netinet/tcp.h>
72 #include <deflt.h>
73 #include <rpcsvc/daemon_utils.h>
74 #include <rpcsvc/nfs4_prot.h>
75 #include <libnvpair.h>
76 #include "nfs_tbind.h"
77 #include "thrpool.h"
78 
/* quiesce requests will be ignored if nfs_server_vers_max < QUIESCE_VERSMIN */
#define	QUIESCE_VERSMIN	4
/*
 * DSS: distributed stable storage.
 * The "-s" paths are only passed to the kernel when
 * nfs_server_vers_max >= DSS_VERSMIN.
 */
#define	DSS_VERSMIN	4

/* Forward declarations for routines defined later in this file. */
static	int	nfssvc(int, struct netbuf, struct netconfig *);
static	int	nfssvcpool(int maxservers);
static	int	dss_init(uint_t npaths, char **pathnames);
static	void	dss_mkleafdirs(uint_t npaths, char **pathnames);
static	void	dss_mkleafdir(char *dir, char *leaf, char *path);
static	void	usage(void);
int		qstrcmp(const void *s1, const void *s2);

/* Entry point for the nfssys system call; not declared by the headers above. */
extern	int	_nfssys(int, void *);

/*
 * daemonize_init() is called before the service is set up and returns a
 * descriptor that is handed to daemonize_fini() once nfsd is up and running.
 */
extern int	daemonize_init(void);
extern void	daemonize_fini(int fd);

/* signal handlers */
static void sigflush(int);	/* installed for SIGTERM */
static void quiesce(int);	/* installed for SIGUSR1 */

/* Program name (av[0]); used in diagnostics and as the syslog ident. */
static	char	*MyName;
/*
 * Transport providers tried, in this order, when no protocol, no provider
 * and no "all" option was selected.
 */
static	NETSELDECL(defaultproviders)[] = { "/dev/tcp6", "/dev/tcp", "/dev/udp",
					    "/dev/udp6", NULL };
/* static	NETSELDECL(defaultprotos)[] =	{ NC_UDP, NC_TCP, NULL }; */
/*
 * The following are all globals used by routines in nfs_tbind.c.
 */
size_t	end_listen_fds;		/* used by conn_close_oldest() */
size_t	num_fds = 0;		/* used by multiple routines */
int	listen_backlog = 32;	/* used by bind_to_{provider,proto}() */
int	num_servers;		/* used by cots_listen_event() */
int	(*Mysvc)(int, struct netbuf, struct netconfig *) = nfssvc;
				/* used by cots_listen_event() */
int	max_conns_allowed = -1;	/* used by cots_listen_event() */

/*
 * Keep track of min/max versions of NFS protocol to be started.
 * Start with the defaults (min == 2, max == 3).  We have the
 * capability of starting vers=4 but only if the user requests it.
 * Both values may be overridden by NFS_SERVER_VERSMIN/NFS_SERVER_VERSMAX
 * in the defaults file read by main().
 */
int	nfs_server_vers_min = NFS_VERSMIN_DEFAULT;
int	nfs_server_vers_max = NFS_VERSMAX_DEFAULT;

/*
 * Server delegation setting.  Starts at the compiled-in default and is
 * switched off by main() when the defaults file contains
 * NFS_SERVER_DELEGATION=off.
 */
int	nfs_server_delegation = NFS_SERVER_DELEGATION_DEFAULT;
129 
130 int
131 main(int ac, char *av[])
132 {
133 	char *dir = "/";
134 	int allflag = 0;
135 	int df_allflag = 0;
136 	int opt_cnt = 0;
137 	int maxservers = 1;	/* zero allows inifinte number of threads */
138 	int maxservers_set = 0;
139 	int logmaxservers = 0;
140 	int pid;
141 	int i;
142 	char *provider = (char *)NULL;
143 	char *df_provider = (char *)NULL;
144 	struct protob *protobp0, *protobp;
145 	NETSELDECL(proto) = NULL;
146 	NETSELDECL(df_proto) = NULL;
147 	NETSELPDECL(providerp);
148 	char *defval;
149 	boolean_t can_do_mlp;
150 	uint_t dss_npaths = 0;
151 	char **dss_pathnames = NULL;
152 	sigset_t sgset;
153 
154 	int pipe_fd = -1;
155 
156 	MyName = *av;
157 
158 	/*
159 	 * Initializations that require more privileges than we need to run.
160 	 */
161 	(void) _create_daemon_lock(NFSD, DAEMON_UID, DAEMON_GID);
162 	svcsetprio();
163 
164 	can_do_mlp = priv_ineffect(PRIV_NET_BINDMLP);
165 	if (__init_daemon_priv(PU_RESETGROUPS|PU_CLEARLIMITSET,
166 	    DAEMON_UID, DAEMON_GID, PRIV_SYS_NFS,
167 	    can_do_mlp ? PRIV_NET_BINDMLP : NULL, NULL) == -1) {
168 		(void) fprintf(stderr, "%s should be run with"
169 		    " sufficient privileges\n", av[0]);
170 		exit(1);
171 	}
172 
173 	(void) enable_extended_FILE_stdio(-1, -1);
174 
175 	/*
176 	 * Read in the values from config file first before we check
177 	 * command line options so the options override the file.
178 	 */
179 	if ((defopen(NFSADMIN)) == 0) {
180 		if ((defval = defread("NFSD_MAX_CONNECTIONS=")) != NULL) {
181 			errno = 0;
182 			max_conns_allowed = strtol(defval, (char **)NULL, 10);
183 			if (errno != 0) {
184 				max_conns_allowed = -1;
185 			}
186 		}
187 		if ((defval = defread("NFSD_LISTEN_BACKLOG=")) != NULL) {
188 			errno = 0;
189 			listen_backlog = strtol(defval, (char **)NULL, 10);
190 			if (errno != 0) {
191 				listen_backlog = 32;
192 			}
193 		}
194 		if ((defval = defread("NFSD_PROTOCOL=")) != NULL) {
195 			df_proto = strdup(defval);
196 			opt_cnt++;
197 			if (strncasecmp("ALL", defval, 3) == 0) {
198 				free(df_proto);
199 				df_proto = NULL;
200 				df_allflag = 1;
201 			}
202 		}
203 		if ((defval = defread("NFSD_DEVICE=")) != NULL) {
204 			df_provider = strdup(defval);
205 			opt_cnt++;
206 		}
207 		if ((defval = defread("NFSD_SERVERS=")) != NULL) {
208 			errno = 0;
209 			maxservers = strtol(defval, (char **)NULL, 10);
210 			if (errno != 0) {
211 				maxservers = 1;
212 			} else {
213 				maxservers_set = 1;
214 			}
215 		}
216 		if ((defval = defread("NFS_SERVER_VERSMIN=")) != NULL) {
217 			errno = 0;
218 			nfs_server_vers_min =
219 			    strtol(defval, (char **)NULL, 10);
220 			if (errno != 0) {
221 				nfs_server_vers_min = NFS_VERSMIN_DEFAULT;
222 			}
223 		}
224 		if ((defval = defread("NFS_SERVER_VERSMAX=")) != NULL) {
225 			errno = 0;
226 			nfs_server_vers_max =
227 			    strtol(defval, (char **)NULL, 10);
228 			if (errno != 0) {
229 				nfs_server_vers_max = NFS_VERSMAX_DEFAULT;
230 			}
231 		}
232 		if ((defval = defread("NFS_SERVER_DELEGATION=")) != NULL) {
233 			if (strcmp(defval, "off") == 0) {
234 				nfs_server_delegation = FALSE;
235 			}
236 		}
237 
238 		/* close defaults file */
239 		defopen(NULL);
240 	}
241 
242 	/*
243 	 * Conflict options error messages.
244 	 */
245 	if (opt_cnt > 1) {
246 		(void) fprintf(stderr, "\nConflicting options, only one of "
247 		    "the following options can be specified\n"
248 		    "in " NFSADMIN ":\n"
249 		    "\tNFSD_PROTOCOL=ALL\n"
250 		    "\tNFSD_PROTOCOL=protocol\n"
251 		    "\tNFSD_DEVICE=device\n\n");
252 		usage();
253 	}
254 	opt_cnt = 0;
255 
256 	while ((i = getopt(ac, av, "ac:p:s:t:l:")) != EOF) {
257 		switch (i) {
258 		case 'a':
259 			free(df_proto);
260 			df_proto = NULL;
261 			free(df_provider);
262 			df_provider = NULL;
263 
264 			allflag = 1;
265 			opt_cnt++;
266 			break;
267 
268 		case 'c':
269 			max_conns_allowed = atoi(optarg);
270 			break;
271 
272 		case 'p':
273 			proto = optarg;
274 			df_allflag = 0;
275 			opt_cnt++;
276 			break;
277 
278 		/*
279 		 * DSS: NFSv4 distributed stable storage.
280 		 *
281 		 * This is a Contracted Project Private interface, for
282 		 * the sole use of Sun Cluster HA-NFS. See PSARC/2006/313.
283 		 */
284 		case 's':
285 			if (strlen(optarg) < MAXPATHLEN) {
286 				/* first "-s" option encountered? */
287 				if (dss_pathnames == NULL) {
288 					/*
289 					 * Allocate maximum possible space
290 					 * required given cmdline arg count;
291 					 * "-s <path>" consumes two args.
292 					 */
293 					size_t sz = (ac / 2) * sizeof (char *);
294 					dss_pathnames = (char **)malloc(sz);
295 					if (dss_pathnames == NULL) {
296 						(void) fprintf(stderr, "%s: "
297 						    "dss paths malloc failed\n",
298 						    av[0]);
299 						exit(1);
300 					}
301 					(void) memset(dss_pathnames, 0, sz);
302 				}
303 				dss_pathnames[dss_npaths] = optarg;
304 				dss_npaths++;
305 			} else {
306 				(void) fprintf(stderr,
307 				    "%s: -s pathname too long.\n", av[0]);
308 			}
309 			break;
310 
311 		case 't':
312 			provider = optarg;
313 			df_allflag = 0;
314 			opt_cnt++;
315 			break;
316 
317 		case 'l':
318 			listen_backlog = atoi(optarg);
319 			break;
320 
321 		case '?':
322 			usage();
323 			/* NOTREACHED */
324 		}
325 	}
326 
327 	allflag = df_allflag;
328 	if (proto == NULL)
329 		proto = df_proto;
330 	if (provider == NULL)
331 		provider = df_provider;
332 
333 	/*
334 	 * Conflict options error messages.
335 	 */
336 	if (opt_cnt > 1) {
337 		(void) fprintf(stderr, "\nConflicting options, only one of "
338 		    "the following options can be specified\n"
339 		    "on the command line:\n"
340 		    "\t-a\n"
341 		    "\t-p protocol\n"
342 		    "\t-t transport\n\n");
343 		usage();
344 	}
345 
346 	if (proto != NULL &&
347 	    strncasecmp(proto, NC_UDP, strlen(NC_UDP)) == 0) {
348 		if (nfs_server_vers_max == NFS_V4) {
349 			if (nfs_server_vers_min == NFS_V4) {
350 				fprintf(stderr,
351 				    "NFS version 4 is not supported "
352 				    "with the UDP protocol.  Exiting\n");
353 				exit(3);
354 			} else {
355 				fprintf(stderr,
356 				    "NFS version 4 is not supported "
357 				    "with the UDP protocol.\n");
358 			}
359 		}
360 	}
361 
362 	/*
363 	 * If there is exactly one more argument, it is the number of
364 	 * servers.
365 	 */
366 	if (optind == ac - 1) {
367 		maxservers = atoi(av[optind]);
368 		maxservers_set = 1;
369 	}
370 	/*
371 	 * If there are two or more arguments, then this is a usage error.
372 	 */
373 	else if (optind < ac - 1)
374 		usage();
375 	/*
376 	 * Check the ranges for min/max version specified
377 	 */
378 	else if ((nfs_server_vers_min > nfs_server_vers_max) ||
379 	    (nfs_server_vers_min < NFS_VERSMIN) ||
380 	    (nfs_server_vers_max > NFS_VERSMAX))
381 		usage();
382 	/*
383 	 * There are no additional arguments, and we haven't set maxservers
384 	 * explicitly via the config file, we use a default number of
385 	 * servers.  We will log this.
386 	 */
387 	else if (maxservers_set == 0)
388 		logmaxservers = 1;
389 
390 	/*
391 	 * Basic Sanity checks on options
392 	 *
393 	 * max_conns_allowed must be positive, except for the special
394 	 * value of -1 which is used internally to mean unlimited, -1 isn't
395 	 * documented but we allow it anyway.
396 	 *
397 	 * maxservers must be positive
398 	 * listen_backlog must be positive or zero
399 	 */
400 	if (((max_conns_allowed != -1) && (max_conns_allowed <= 0)) ||
401 	    (listen_backlog < 0) || (maxservers <= 0)) {
402 		usage();
403 	}
404 
405 	/*
406 	 * Set current dir to server root
407 	 */
408 	if (chdir(dir) < 0) {
409 		(void) fprintf(stderr, "%s:  ", MyName);
410 		perror(dir);
411 		exit(1);
412 	}
413 
414 #ifndef DEBUG
415 	pipe_fd = daemonize_init();
416 #endif
417 
418 	openlog(MyName, LOG_PID | LOG_NDELAY, LOG_DAEMON);
419 
420 	/*
421 	 * establish our lock on the lock file and write our pid to it.
422 	 * exit if some other process holds the lock, or if there's any
423 	 * error in writing/locking the file.
424 	 */
425 	pid = _enter_daemon_lock(NFSD);
426 	switch (pid) {
427 	case 0:
428 		break;
429 	case -1:
430 		fprintf(stderr, "error locking for %s: %s", NFSD,
431 		    strerror(errno));
432 		exit(2);
433 	default:
434 		/* daemon was already running */
435 		exit(0);
436 	}
437 
438 	/*
439 	 * If we've been given a list of paths to be used for distributed
440 	 * stable storage, and provided we're going to run a version
441 	 * that supports it, setup the DSS paths.
442 	 */
443 	if (dss_pathnames != NULL && nfs_server_vers_max >= DSS_VERSMIN) {
444 		if (dss_init(dss_npaths, dss_pathnames) != 0) {
445 			fprintf(stderr, "%s", "dss_init failed. Exiting.");
446 			exit(1);
447 		}
448 	}
449 
450 	/*
451 	 * Block all signals till we spawn other
452 	 * threads.
453 	 */
454 	(void) sigfillset(&sgset);
455 	(void) thr_sigsetmask(SIG_BLOCK, &sgset, NULL);
456 
457 	if (logmaxservers) {
458 		fprintf(stderr,
459 		    "Number of servers not specified. Using default of %d.",
460 		    maxservers);
461 	}
462 
463 	/*
464 	 * Make sure to unregister any previous versions in case the
465 	 * user is reconfiguring the server in interesting ways.
466 	 */
467 	svc_unreg(NFS_PROGRAM, NFS_VERSION);
468 	svc_unreg(NFS_PROGRAM, NFS_V3);
469 	svc_unreg(NFS_PROGRAM, NFS_V4);
470 	svc_unreg(NFS_ACL_PROGRAM, NFS_ACL_V2);
471 	svc_unreg(NFS_ACL_PROGRAM, NFS_ACL_V3);
472 
473 	/*
474 	 * Set up kernel RPC thread pool for the NFS server.
475 	 */
476 	if (nfssvcpool(maxservers)) {
477 		fprintf(stderr, "Can't set up kernel NFS service: %s. Exiting",
478 		    strerror(errno));
479 		exit(1);
480 	}
481 
482 	/*
483 	 * Set up blocked thread to do LWP creation on behalf of the kernel.
484 	 */
485 	if (svcwait(NFS_SVCPOOL_ID)) {
486 		fprintf(stderr, "Can't set up NFS pool creator: %s. Exiting",
487 		    strerror(errno));
488 		exit(1);
489 	}
490 
491 	/*
492 	 * RDMA start and stop thread.
493 	 * Per pool RDMA listener creation and
494 	 * destructor thread.
495 	 *
496 	 * start rdma services and block in the kernel.
497 	 * (only if proto or provider is not set to TCP or UDP)
498 	 */
499 	if ((proto == NULL) && (provider == NULL)) {
500 		if (svcrdma(NFS_SVCPOOL_ID, nfs_server_vers_min,
501 		    nfs_server_vers_max, nfs_server_delegation)) {
502 			fprintf(stderr,
503 			    "Can't set up RDMA creator thread : %s",
504 			    strerror(errno));
505 		}
506 	}
507 
508 	/*
509 	 * Now open up for signal delivery
510 	 */
511 
512 	(void) thr_sigsetmask(SIG_UNBLOCK, &sgset, NULL);
513 	sigset(SIGTERM, sigflush);
514 	sigset(SIGUSR1, quiesce);
515 
516 	/*
517 	 * Build a protocol block list for registration.
518 	 */
519 	protobp0 = protobp = (struct protob *)malloc(sizeof (struct protob));
520 	protobp->serv = "NFS";
521 	protobp->versmin = nfs_server_vers_min;
522 	protobp->versmax = nfs_server_vers_max;
523 	protobp->program = NFS_PROGRAM;
524 
525 	protobp->next = (struct protob *)malloc(sizeof (struct protob));
526 	protobp = protobp->next;
527 	protobp->serv = "NFS_ACL";		/* not used */
528 	protobp->versmin = nfs_server_vers_min;
529 	/* XXX - this needs work to get the version just right */
530 	protobp->versmax = (nfs_server_vers_max > NFS_ACL_V3) ?
531 	    NFS_ACL_V3 : nfs_server_vers_max;
532 	protobp->program = NFS_ACL_PROGRAM;
533 	protobp->next = (struct protob *)NULL;
534 
535 	if (allflag) {
536 		if (do_all(protobp0, nfssvc) == -1) {
537 			fprintf(stderr, "setnetconfig failed : %s",
538 			    strerror(errno));
539 			exit(1);
540 		}
541 	} else if (proto) {
542 		/* there's more than one match for the same protocol */
543 		struct netconfig *nconf;
544 		NCONF_HANDLE *nc;
545 		bool_t	protoFound = FALSE;
546 		if ((nc = setnetconfig()) == (NCONF_HANDLE *) NULL) {
547 			fprintf(stderr, "setnetconfig failed : %s",
548 			    strerror(errno));
549 			goto done;
550 		}
551 		while (nconf = getnetconfig(nc)) {
552 			if (strcmp(nconf->nc_proto, proto) == 0) {
553 				protoFound = TRUE;
554 				do_one(nconf->nc_device, NULL,
555 				    protobp0, nfssvc);
556 			}
557 		}
558 		(void) endnetconfig(nc);
559 		if (protoFound == FALSE) {
560 			fprintf(stderr,
561 			    "couldn't find netconfig entry for protocol %s",
562 			    proto);
563 		}
564 	} else if (provider)
565 		do_one(provider, proto, protobp0, nfssvc);
566 	else {
567 		for (providerp = defaultproviders;
568 		    *providerp != NULL; providerp++) {
569 			provider = *providerp;
570 			do_one(provider, NULL, protobp0, nfssvc);
571 		}
572 	}
573 done:
574 
575 	free(protobp);
576 	free(protobp0);
577 
578 	if (num_fds == 0) {
579 		fprintf(stderr, "Could not start NFS service for any protocol."
580 		    " Exiting");
581 		exit(1);
582 	}
583 
584 	end_listen_fds = num_fds;
585 
586 	/*
587 	 * nfsd is up and running as far as we are concerned.
588 	 */
589 	daemonize_fini(pipe_fd);
590 
591 	/*
592 	 * Get rid of unneeded privileges.
593 	 */
594 	__fini_daemon_priv(PRIV_PROC_FORK, PRIV_PROC_EXEC, PRIV_PROC_SESSION,
595 	    PRIV_FILE_LINK_ANY, PRIV_PROC_INFO, (char *)NULL);
596 
597 	/*
598 	 * Poll for non-data control events on the transport descriptors.
599 	 */
600 	poll_for_action();
601 
602 	/*
603 	 * If we get here, something failed in poll_for_action().
604 	 */
605 	return (1);
606 }
607 
608 static int
609 nfssvcpool(int maxservers)
610 {
611 	struct svcpool_args npa;
612 
613 	npa.id = NFS_SVCPOOL_ID;
614 	npa.maxthreads = maxservers;
615 	npa.redline = 0;
616 	npa.qsize = 0;
617 	npa.timeout = 0;
618 	npa.stksize = 0;
619 	npa.max_same_xprt = 0;
620 	return (_nfssys(SVCPOOL_CREATE, &npa));
621 }
622 
623 /*
624  * Establish NFS service thread.
625  */
626 static int
627 nfssvc(int fd, struct netbuf addrmask, struct netconfig *nconf)
628 {
629 	struct nfs_svc_args nsa;
630 
631 	nsa.fd = fd;
632 	nsa.netid = nconf->nc_netid;
633 	nsa.addrmask = addrmask;
634 	if (strncasecmp(nconf->nc_proto, NC_UDP, strlen(NC_UDP)) == 0) {
635 		nsa.versmax = (nfs_server_vers_max > NFS_V3) ?
636 		    NFS_V3 : nfs_server_vers_max;
637 		nsa.versmin = nfs_server_vers_min;
638 		/*
639 		 * If no version left, silently do nothing, previous
640 		 * checks will have assured at least TCP is available.
641 		 */
642 		if (nsa.versmin > nsa.versmax)
643 			return (0);
644 	} else {
645 		nsa.versmax = nfs_server_vers_max;
646 		nsa.versmin = nfs_server_vers_min;
647 	}
648 	nsa.delegation = nfs_server_delegation;
649 	return (_nfssys(NFS_SVC, &nsa));
650 }
651 
652 static void
653 usage(void)
654 {
655 	(void) fprintf(stderr,
656 "usage: %s [ -a ] [ -c max_conns ] [ -p protocol ] [ -t transport ] ", MyName);
657 	(void) fprintf(stderr, "\n[ -l listen_backlog ] [ nservers ]\n");
658 	(void) fprintf(stderr,
659 "\twhere -a causes <nservers> to be started on each appropriate transport,\n");
660 	(void) fprintf(stderr,
661 "\tmax_conns is the maximum number of concurrent connections allowed,\n");
662 	(void) fprintf(stderr, "\t\tand max_conns must be a decimal number");
663 	(void) fprintf(stderr, "> zero,\n");
664 	(void) fprintf(stderr, "\tprotocol is a protocol identifier,\n");
665 	(void) fprintf(stderr,
666 	    "\ttransport is a transport provider name (i.e. device),\n");
667 	(void) fprintf(stderr,
668 	    "\tlisten_backlog is the TCP listen backlog,\n");
669 	(void) fprintf(stderr,
670 	    "\tand <nservers> must be a decimal number > zero.\n");
671 	exit(1);
672 }
673 
674 /*
675  * Issue nfssys system call to flush all logging buffers asynchronously.
676  *
677  * NOTICE: It is extremely important to flush NFS logging buffers when
678  *	   nfsd exits. When the system is halted or rebooted nfslogd
679  *	   may not have an opportunity to flush the buffers.
680  */
681 static void
682 nfsl_flush()
683 {
684 	struct nfsl_flush_args nfa;
685 
686 	memset((void *)&nfa, 0, sizeof (nfa));
687 	nfa.version = NFSL_FLUSH_ARGS_VERS;
688 	nfa.directive = NFSL_ALL;	/* flush all asynchronously */
689 
690 	if (_nfssys(LOG_FLUSH, &nfa) < 0)
691 		syslog(LOG_ERR, "_nfssys(LOG_FLUSH) failed: %s\n",
692 		    strerror(errno));
693 }
694 
/*
 * Handler installed for SIGTERM.
 * Flushes the NFS logging buffers, then terminates the process
 * immediately with status 0.  The signal number is not examined.
 */
/* ARGSUSED */
static void
sigflush(int sig)
{
	nfsl_flush();

	_exit(0);
}
705 
706 /*
707  * SIGUSR1 handler.
708  *
709  * Request that server quiesce, then (nfsd) exit. For subsequent warm start.
710  *
711  * This is a Contracted Project Private interface, for the sole use
712  * of Sun Cluster HA-NFS. See PSARC/2004/497.
713  *
714  * Equivalent to SIGTERM handler if nfs_server_vers_max < QUIESCE_VERSMIN.
715  */
716 static void
717 quiesce(int sig)
718 {
719 	int error;
720 	int id = NFS_SVCPOOL_ID;
721 
722 	if (nfs_server_vers_max >= QUIESCE_VERSMIN) {
723 		/* Request server quiesce at next shutdown */
724 		error = _nfssys(NFS4_SVC_REQUEST_QUIESCE, &id);
725 
726 		/*
727 		 * ENOENT is returned if there is no matching SVC pool
728 		 * for the id. Possibly because the pool is not yet setup.
729 		 * In this case, just exit as if no error. For all other errors,
730 		 * just return and allow caller to retry.
731 		 */
732 		if (error && errno != ENOENT) {
733 			syslog(LOG_ERR,
734 			    "_nfssys(NFS4_SVC_REQUEST_QUIESCE) failed: %s",
735 			    strerror(errno));
736 			return;
737 		}
738 	}
739 
740 	/* Flush logging buffers */
741 	nfsl_flush();
742 
743 	_exit(0);
744 }
745 
746 /*
747  * DSS: distributed stable storage.
748  * Create leaf directories as required, keeping an eye on path
749  * lengths. Calls exit(1) on failure.
750  * The pathnames passed in must already exist, and must be writeable by nfsd.
751  * Note: the leaf directories under NFS4_VAR_DIR are not created here;
752  * they're created at pkg install.
753  */
754 static void
755 dss_mkleafdirs(uint_t npaths, char **pathnames)
756 {
757 	int i;
758 	char *tmppath = NULL;
759 
760 	/*
761 	 * Create the temporary storage used by dss_mkleafdir() here,
762 	 * rather than in that function, so that it only needs to be
763 	 * done once, rather than once for each call. Too big to put
764 	 * on the function's stack.
765 	 */
766 	tmppath = (char *)malloc(MAXPATHLEN);
767 	if (tmppath == NULL) {
768 		syslog(LOG_ERR, "tmppath malloc failed. Exiting");
769 		exit(1);
770 	}
771 
772 	for (i = 0; i < npaths; i++) {
773 		char *p = pathnames[i];
774 
775 		dss_mkleafdir(p, NFS4_DSS_STATE_LEAF, tmppath);
776 		dss_mkleafdir(p, NFS4_DSS_OLDSTATE_LEAF, tmppath);
777 	}
778 
779 	free(tmppath);
780 }
781 
782 /*
783  * Create "leaf" in "dir" (which must already exist).
784  * leaf: should start with a '/'
785  */
786 static void
787 dss_mkleafdir(char *dir, char *leaf, char *tmppath)
788 {
789 	/* MAXPATHLEN includes the terminating NUL */
790 	if (strlen(dir) + strlen(leaf) > MAXPATHLEN - 1) {
791 		fprintf(stderr, "stable storage path too long: %s%s. Exiting",
792 		    dir, leaf);
793 		exit(1);
794 	}
795 
796 	(void) snprintf(tmppath, MAXPATHLEN, "%s/%s", dir, leaf);
797 
798 	/* the directory may already exist: that's OK */
799 	if (mkdir(tmppath, NFS4_DSS_DIR_MODE) == -1 && errno != EEXIST) {
800 		fprintf(stderr, "error creating stable storage directory: "
801 		    "%s: %s. Exiting", strerror(errno), tmppath);
802 		exit(1);
803 	}
804 }
805 
806 /*
807  * Create the storage dirs, and pass the path list to the kernel.
808  * This requires the nfssrv module to be loaded; the _nfssys() syscall
809  * will fail ENOTSUP if it is not.
810  * Use libnvpair(3LIB) to pass the data to the kernel.
811  */
812 static int
813 dss_init(uint_t npaths, char **pathnames)
814 {
815 	int i, j, nskipped, error;
816 	char *bufp;
817 	uint32_t bufsize;
818 	size_t buflen;
819 	nvlist_t *nvl;
820 
821 	if (npaths > 1) {
822 		/*
823 		 * We need to remove duplicate paths; this might be user error
824 		 * in the general case, but HA-NFSv4 can also cause this.
825 		 * Sort the pathnames array, and NULL out duplicates,
826 		 * then write the non-NULL entries to a new array.
827 		 * Sorting will also allow the kernel to optimise its searches.
828 		 */
829 
830 		qsort(pathnames, npaths, sizeof (char *), qstrcmp);
831 
832 		/* now NULL out any duplicates */
833 		i = 0; j = 1; nskipped = 0;
834 		while (j < npaths) {
835 			if (strcmp(pathnames[i], pathnames[j]) == NULL) {
836 				pathnames[j] = NULL;
837 				j++;
838 				nskipped++;
839 				continue;
840 			}
841 
842 			/* skip i over any of its NULLed duplicates */
843 			i = j++;
844 		}
845 
846 		/* finally, write the non-NULL entries to a new array */
847 		if (nskipped > 0) {
848 			int nreal;
849 			size_t sz;
850 			char **tmp_pathnames;
851 
852 			nreal = npaths - nskipped;
853 
854 			sz = nreal * sizeof (char *);
855 			tmp_pathnames = (char **)malloc(sz);
856 			if (tmp_pathnames == NULL) {
857 				fprintf(stderr, "tmp_pathnames malloc failed");
858 				exit(1);
859 			}
860 
861 			for (i = 0, j = 0; i < npaths; i++)
862 				if (pathnames[i] != NULL)
863 					tmp_pathnames[j++] = pathnames[i];
864 			free(pathnames);
865 			pathnames = tmp_pathnames;
866 			npaths = nreal;
867 		}
868 
869 	}
870 
871 	/* Create directories to store the distributed state files */
872 	dss_mkleafdirs(npaths, pathnames);
873 
874 	/* Create the name-value pair list */
875 	error = nvlist_alloc(&nvl, NV_UNIQUE_NAME, 0);
876 	if (error) {
877 		fprintf(stderr, "nvlist_alloc failed: %s.", strerror(errno));
878 		return (1);
879 	}
880 
881 	/* Add the pathnames array as a single name-value pair */
882 	error = nvlist_add_string_array(nvl, NFS4_DSS_NVPAIR_NAME,
883 	    pathnames, npaths);
884 	if (error) {
885 		fprintf(stderr, "nvlist_add_string_array failed: %s.",
886 		    strerror(errno));
887 		nvlist_free(nvl);
888 		return (1);
889 	}
890 
891 	/*
892 	 * Pack list into contiguous memory, for passing to kernel.
893 	 * nvlist_pack() will allocate the memory for the buffer,
894 	 * which we should free() when no longer needed.
895 	 * NV_ENCODE_XDR for safety across ILP32/LP64 kernel boundary.
896 	 */
897 	bufp = NULL;
898 	error = nvlist_pack(nvl, &bufp, &buflen, NV_ENCODE_XDR, 0);
899 	if (error) {
900 		fprintf(stderr, "nvlist_pack failed: %s.", strerror(errno));
901 		nvlist_free(nvl);
902 		return (1);
903 	}
904 
905 	/* Now we have the packed buffer, we no longer need the list */
906 	nvlist_free(nvl);
907 
908 	/*
909 	 * Let the kernel know in advance how big the buffer is.
910 	 * NOTE: we cannot just pass buflen, since size_t is a long, and
911 	 * thus a different size between ILP32 userland and LP64 kernel.
912 	 * Use an int for the transfer, since that should be big enough;
913 	 * this is a no-op at the moment, here, since nfsd is 32-bit, but
914 	 * that could change.
915 	 */
916 	bufsize = (uint32_t)buflen;
917 	error = _nfssys(NFS4_DSS_SETPATHS_SIZE, &bufsize);
918 	if (error) {
919 		fprintf(stderr,
920 		    "_nfssys(NFS4_DSS_SETPATHS_SIZE) failed: %s. ",
921 		    strerror(errno));
922 		free(bufp);
923 		return (1);
924 	}
925 
926 	/* Pass the packed buffer to the kernel */
927 	error = _nfssys(NFS4_DSS_SETPATHS, bufp);
928 	if (error) {
929 		fprintf(stderr,
930 		    "_nfssys(NFS4_DSS_SETPATHS) failed: %s. ", strerror(errno));
931 		free(bufp);
932 		return (1);
933 	}
934 
935 	/*
936 	 * The kernel has now unpacked the buffer and extracted the
937 	 * pathnames array, we no longer need the buffer.
938 	 */
939 	free(bufp);
940 
941 	return (0);
942 }
943 
/*
 * qsort(3C) comparison callback for an array of C strings.
 * Each argument points at an array element, i.e. at a "char *"; the two
 * strings are ordered with strcmp().
 */
int
qstrcmp(const void *p1, const void *p2)
{
	const char *const *lhs = p1;
	const char *const *rhs = p2;

	return (strcmp(*lhs, *rhs));
}
956