xref: /titanic_51/usr/src/cmd/fs.d/nfs/nfsd/nfsd.c (revision e9a20b613b3184ca60a413a1de74b9e6bde67705)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 /*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T		*/
27 /*	  All Rights Reserved  	*/
28 
29 /*
30  * University Copyright- Copyright (c) 1982, 1986, 1988
31  * The Regents of the University of California
32  * All Rights Reserved
33  *
34  * University Acknowledgment- Portions of this document are derived from
35  * software developed by the University of California, Berkeley, and its
36  * contributors.
37  */
38 
39 /* LINTLIBRARY */
40 /* PROTOLIB1 */
41 
42 /* NFS server */
43 
44 #include <sys/param.h>
45 #include <sys/types.h>
46 #include <sys/stat.h>
47 #include <syslog.h>
48 #include <tiuser.h>
49 #include <rpc/rpc.h>
50 #include <errno.h>
51 #include <thread.h>
52 #include <sys/resource.h>
53 #include <sys/time.h>
54 #include <sys/file.h>
55 #include <nfs/nfs.h>
56 #include <nfs/nfs_acl.h>
57 #include <nfs/nfssys.h>
58 #include <stdio.h>
59 #include <stdio_ext.h>
60 #include <stdlib.h>
61 #include <signal.h>
62 #include <netconfig.h>
63 #include <netdir.h>
64 #include <string.h>
65 #include <unistd.h>
66 #include <stropts.h>
67 #include <sys/tihdr.h>
68 #include <sys/wait.h>
69 #include <poll.h>
70 #include <priv_utils.h>
71 #include <sys/tiuser.h>
72 #include <netinet/tcp.h>
73 #include <deflt.h>
74 #include <rpcsvc/daemon_utils.h>
75 #include <rpcsvc/nfs4_prot.h>
76 #include <libnvpair.h>
77 #include "nfs_tbind.h"
78 #include "thrpool.h"
79 
/* quiesce requests will be ignored if nfs_server_vers_max < QUIESCE_VERSMIN */
#define	QUIESCE_VERSMIN	4
/*
 * DSS: distributed stable storage.
 * The "-s" paths are only handed to the kernel when
 * nfs_server_vers_max >= DSS_VERSMIN.
 */
#define	DSS_VERSMIN	4

/* Routines defined later in this file. */
static	int	nfssvc(int, struct netbuf, struct netconfig *);
static	int	nfssvcpool(int maxservers);
static	int	dss_init(uint_t npaths, char **pathnames);
static	void	dss_mkleafdirs(uint_t npaths, char **pathnames);
static	void	dss_mkleafdir(char *dir, char *leaf, char *path);
static	void	usage(void);
/* qsort(3C) comparison routine; each argument points at a "char *". */
int		qstrcmp(const void *s1, const void *s2);

/* NFS system call entry point; defined outside this file. */
extern	int	_nfssys(int, void *);

/*
 * Daemonizing helpers, defined outside this file.  main() passes the
 * value returned by daemonize_init() on to daemonize_fini().
 */
extern int	daemonize_init(void);
extern void	daemonize_fini(int fd);

/* signal handlers */
static void sigflush(int);	/* installed for SIGTERM */
static void quiesce(int);	/* installed for SIGUSR1 */

/* av[0]; used as the syslog ident and in diagnostic messages. */
static	char	*MyName;
/*
 * Transport providers tried, in this order, when neither a protocol nor
 * a provider has been selected by the defaults file or the command line.
 */
static	NETSELDECL(defaultproviders)[] = { "/dev/tcp6", "/dev/tcp", "/dev/udp",
					    "/dev/udp6", NULL };
/* static	NETSELDECL(defaultprotos)[] =	{ NC_UDP, NC_TCP, NULL }; */
/*
 * The following are all globals used by routines in nfs_tbind.c.
 */
size_t	end_listen_fds;		/* used by conn_close_oldest() */
size_t	num_fds = 0;		/* used by multiple routines */
int	listen_backlog = 32;	/* used by bind_to_{provider,proto}() */
int	num_servers;		/* used by cots_listen_event() */
int	(*Mysvc)(int, struct netbuf, struct netconfig *) = nfssvc;
				/* used by cots_listen_event() */
int	max_conns_allowed = -1;	/* used by cots_listen_event() */

/*
 * Keep track of min/max versions of NFS protocol to be started.
 * Start with the defaults (min == 2, max == 3).  We have the
 * capability of starting vers=4 but only if the user requests it.
 * Both values may be overridden by NFS_SERVER_VERSMIN/NFS_SERVER_VERSMAX
 * in the defaults file read by main().
 */
int	nfs_server_vers_min = NFS_VERSMIN_DEFAULT;
int	nfs_server_vers_max = NFS_VERSMAX_DEFAULT;

/*
 * Set the default for server delegation enablement and set per
 * /etc/default/nfs configuration (if present).
 * main() clears this when NFS_SERVER_DELEGATION is "off".
 */
int	nfs_server_delegation = NFS_SERVER_DELEGATION_DEFAULT;
130 
131 int
132 main(int ac, char *av[])
133 {
134 	char *dir = "/";
135 	int allflag = 0;
136 	int df_allflag = 0;
137 	int opt_cnt = 0;
138 	int maxservers = 1;	/* zero allows inifinte number of threads */
139 	int maxservers_set = 0;
140 	int logmaxservers = 0;
141 	int pid;
142 	int i;
143 	char *provider = (char *)NULL;
144 	char *df_provider = (char *)NULL;
145 	struct protob *protobp0, *protobp;
146 	NETSELDECL(proto) = NULL;
147 	NETSELDECL(df_proto) = NULL;
148 	NETSELPDECL(providerp);
149 	char *defval;
150 	boolean_t can_do_mlp;
151 	uint_t dss_npaths = 0;
152 	char **dss_pathnames = NULL;
153 	sigset_t sgset;
154 
155 	int pipe_fd = -1;
156 
157 	MyName = *av;
158 
159 	/*
160 	 * Initializations that require more privileges than we need to run.
161 	 */
162 	(void) _create_daemon_lock(NFSD, DAEMON_UID, DAEMON_GID);
163 	svcsetprio();
164 
165 	can_do_mlp = priv_ineffect(PRIV_NET_BINDMLP);
166 	if (__init_daemon_priv(PU_RESETGROUPS|PU_CLEARLIMITSET,
167 	    DAEMON_UID, DAEMON_GID, PRIV_SYS_NFS,
168 	    can_do_mlp ? PRIV_NET_BINDMLP : NULL, NULL) == -1) {
169 		(void) fprintf(stderr, "%s should be run with"
170 		    " sufficient privileges\n", av[0]);
171 		exit(1);
172 	}
173 
174 	(void) enable_extended_FILE_stdio(-1, -1);
175 
176 	/*
177 	 * Read in the values from config file first before we check
178 	 * command line options so the options override the file.
179 	 */
180 	if ((defopen(NFSADMIN)) == 0) {
181 		if ((defval = defread("NFSD_MAX_CONNECTIONS=")) != NULL) {
182 			errno = 0;
183 			max_conns_allowed = strtol(defval, (char **)NULL, 10);
184 			if (errno != 0) {
185 				max_conns_allowed = -1;
186 			}
187 		}
188 		if ((defval = defread("NFSD_LISTEN_BACKLOG=")) != NULL) {
189 			errno = 0;
190 			listen_backlog = strtol(defval, (char **)NULL, 10);
191 			if (errno != 0) {
192 				listen_backlog = 32;
193 			}
194 		}
195 		if ((defval = defread("NFSD_PROTOCOL=")) != NULL) {
196 			df_proto = strdup(defval);
197 			opt_cnt++;
198 			if (strncasecmp("ALL", defval, 3) == 0) {
199 				free(df_proto);
200 				df_proto = NULL;
201 				df_allflag = 1;
202 			}
203 		}
204 		if ((defval = defread("NFSD_DEVICE=")) != NULL) {
205 			df_provider = strdup(defval);
206 			opt_cnt++;
207 		}
208 		if ((defval = defread("NFSD_SERVERS=")) != NULL) {
209 			errno = 0;
210 			maxservers = strtol(defval, (char **)NULL, 10);
211 			if (errno != 0) {
212 				maxservers = 1;
213 			} else {
214 				maxservers_set = 1;
215 			}
216 		}
217 		if ((defval = defread("NFS_SERVER_VERSMIN=")) != NULL) {
218 			errno = 0;
219 			nfs_server_vers_min =
220 			    strtol(defval, (char **)NULL, 10);
221 			if (errno != 0) {
222 				nfs_server_vers_min = NFS_VERSMIN_DEFAULT;
223 			}
224 		}
225 		if ((defval = defread("NFS_SERVER_VERSMAX=")) != NULL) {
226 			errno = 0;
227 			nfs_server_vers_max =
228 			    strtol(defval, (char **)NULL, 10);
229 			if (errno != 0) {
230 				nfs_server_vers_max = NFS_VERSMAX_DEFAULT;
231 			}
232 		}
233 		if ((defval = defread("NFS_SERVER_DELEGATION=")) != NULL) {
234 			if (strcmp(defval, "off") == 0) {
235 				nfs_server_delegation = FALSE;
236 			}
237 		}
238 
239 		/* close defaults file */
240 		defopen(NULL);
241 	}
242 
243 	/*
244 	 * Conflict options error messages.
245 	 */
246 	if (opt_cnt > 1) {
247 		(void) fprintf(stderr, "\nConflicting options, only one of "
248 		    "the following options can be specified\n"
249 		    "in " NFSADMIN ":\n"
250 		    "\tNFSD_PROTOCOL=ALL\n"
251 		    "\tNFSD_PROTOCOL=protocol\n"
252 		    "\tNFSD_DEVICE=device\n\n");
253 		usage();
254 	}
255 	opt_cnt = 0;
256 
257 	while ((i = getopt(ac, av, "ac:p:s:t:l:")) != EOF) {
258 		switch (i) {
259 		case 'a':
260 			free(df_proto);
261 			df_proto = NULL;
262 			free(df_provider);
263 			df_provider = NULL;
264 
265 			allflag = 1;
266 			opt_cnt++;
267 			break;
268 
269 		case 'c':
270 			max_conns_allowed = atoi(optarg);
271 			break;
272 
273 		case 'p':
274 			proto = optarg;
275 			df_allflag = 0;
276 			opt_cnt++;
277 			break;
278 
279 		/*
280 		 * DSS: NFSv4 distributed stable storage.
281 		 *
282 		 * This is a Contracted Project Private interface, for
283 		 * the sole use of Sun Cluster HA-NFS. See PSARC/2006/313.
284 		 */
285 		case 's':
286 			if (strlen(optarg) < MAXPATHLEN) {
287 				/* first "-s" option encountered? */
288 				if (dss_pathnames == NULL) {
289 					/*
290 					 * Allocate maximum possible space
291 					 * required given cmdline arg count;
292 					 * "-s <path>" consumes two args.
293 					 */
294 					size_t sz = (ac / 2) * sizeof (char *);
295 					dss_pathnames = (char **)malloc(sz);
296 					if (dss_pathnames == NULL) {
297 						(void) fprintf(stderr, "%s: "
298 						    "dss paths malloc failed\n",
299 						    av[0]);
300 						exit(1);
301 					}
302 					(void) memset(dss_pathnames, 0, sz);
303 				}
304 				dss_pathnames[dss_npaths] = optarg;
305 				dss_npaths++;
306 			} else {
307 				(void) fprintf(stderr,
308 				    "%s: -s pathname too long.\n", av[0]);
309 			}
310 			break;
311 
312 		case 't':
313 			provider = optarg;
314 			df_allflag = 0;
315 			opt_cnt++;
316 			break;
317 
318 		case 'l':
319 			listen_backlog = atoi(optarg);
320 			break;
321 
322 		case '?':
323 			usage();
324 			/* NOTREACHED */
325 		}
326 	}
327 
328 	allflag = df_allflag;
329 	if (proto == NULL)
330 		proto = df_proto;
331 	if (provider == NULL)
332 		provider = df_provider;
333 
334 	/*
335 	 * Conflict options error messages.
336 	 */
337 	if (opt_cnt > 1) {
338 		(void) fprintf(stderr, "\nConflicting options, only one of "
339 		    "the following options can be specified\n"
340 		    "on the command line:\n"
341 		    "\t-a\n"
342 		    "\t-p protocol\n"
343 		    "\t-t transport\n\n");
344 		usage();
345 	}
346 
347 	if (proto != NULL &&
348 	    strncasecmp(proto, NC_UDP, strlen(NC_UDP)) == 0) {
349 		if (nfs_server_vers_max == NFS_V4) {
350 			if (nfs_server_vers_min == NFS_V4) {
351 				fprintf(stderr,
352 				    "NFS version 4 is not supported "
353 				    "with the UDP protocol.  Exiting\n");
354 				exit(3);
355 			} else {
356 				fprintf(stderr,
357 				    "NFS version 4 is not supported "
358 				    "with the UDP protocol.\n");
359 			}
360 		}
361 	}
362 
363 	/*
364 	 * If there is exactly one more argument, it is the number of
365 	 * servers.
366 	 */
367 	if (optind == ac - 1) {
368 		maxservers = atoi(av[optind]);
369 		maxservers_set = 1;
370 	}
371 	/*
372 	 * If there are two or more arguments, then this is a usage error.
373 	 */
374 	else if (optind < ac - 1)
375 		usage();
376 	/*
377 	 * Check the ranges for min/max version specified
378 	 */
379 	else if ((nfs_server_vers_min > nfs_server_vers_max) ||
380 	    (nfs_server_vers_min < NFS_VERSMIN) ||
381 	    (nfs_server_vers_max > NFS_VERSMAX))
382 		usage();
383 	/*
384 	 * There are no additional arguments, and we haven't set maxservers
385 	 * explicitly via the config file, we use a default number of
386 	 * servers.  We will log this.
387 	 */
388 	else if (maxservers_set == 0)
389 		logmaxservers = 1;
390 
391 	/*
392 	 * Basic Sanity checks on options
393 	 *
394 	 * max_conns_allowed must be positive, except for the special
395 	 * value of -1 which is used internally to mean unlimited, -1 isn't
396 	 * documented but we allow it anyway.
397 	 *
398 	 * maxservers must be positive
399 	 * listen_backlog must be positive or zero
400 	 */
401 	if (((max_conns_allowed != -1) && (max_conns_allowed <= 0)) ||
402 	    (listen_backlog < 0) || (maxservers <= 0)) {
403 		usage();
404 	}
405 
406 	/*
407 	 * Set current dir to server root
408 	 */
409 	if (chdir(dir) < 0) {
410 		(void) fprintf(stderr, "%s:  ", MyName);
411 		perror(dir);
412 		exit(1);
413 	}
414 
415 #ifndef DEBUG
416 	pipe_fd = daemonize_init();
417 #endif
418 
419 	openlog(MyName, LOG_PID | LOG_NDELAY, LOG_DAEMON);
420 
421 	/*
422 	 * establish our lock on the lock file and write our pid to it.
423 	 * exit if some other process holds the lock, or if there's any
424 	 * error in writing/locking the file.
425 	 */
426 	pid = _enter_daemon_lock(NFSD);
427 	switch (pid) {
428 	case 0:
429 		break;
430 	case -1:
431 		fprintf(stderr, "error locking for %s: %s", NFSD,
432 		    strerror(errno));
433 		exit(2);
434 	default:
435 		/* daemon was already running */
436 		exit(0);
437 	}
438 
439 	/*
440 	 * If we've been given a list of paths to be used for distributed
441 	 * stable storage, and provided we're going to run a version
442 	 * that supports it, setup the DSS paths.
443 	 */
444 	if (dss_pathnames != NULL && nfs_server_vers_max >= DSS_VERSMIN) {
445 		if (dss_init(dss_npaths, dss_pathnames) != 0) {
446 			fprintf(stderr, "%s", "dss_init failed. Exiting.");
447 			exit(1);
448 		}
449 	}
450 
451 	/*
452 	 * Block all signals till we spawn other
453 	 * threads.
454 	 */
455 	(void) sigfillset(&sgset);
456 	(void) thr_sigsetmask(SIG_BLOCK, &sgset, NULL);
457 
458 	if (logmaxservers) {
459 		fprintf(stderr,
460 		    "Number of servers not specified. Using default of %d.",
461 		    maxservers);
462 	}
463 
464 	/*
465 	 * Make sure to unregister any previous versions in case the
466 	 * user is reconfiguring the server in interesting ways.
467 	 */
468 	svc_unreg(NFS_PROGRAM, NFS_VERSION);
469 	svc_unreg(NFS_PROGRAM, NFS_V3);
470 	svc_unreg(NFS_PROGRAM, NFS_V4);
471 	svc_unreg(NFS_ACL_PROGRAM, NFS_ACL_V2);
472 	svc_unreg(NFS_ACL_PROGRAM, NFS_ACL_V3);
473 
474 	/*
475 	 * Set up kernel RPC thread pool for the NFS server.
476 	 */
477 	if (nfssvcpool(maxservers)) {
478 		fprintf(stderr, "Can't set up kernel NFS service: %s. Exiting",
479 		    strerror(errno));
480 		exit(1);
481 	}
482 
483 	/*
484 	 * Set up blocked thread to do LWP creation on behalf of the kernel.
485 	 */
486 	if (svcwait(NFS_SVCPOOL_ID)) {
487 		fprintf(stderr, "Can't set up NFS pool creator: %s. Exiting",
488 		    strerror(errno));
489 		exit(1);
490 	}
491 
492 	/*
493 	 * RDMA start and stop thread.
494 	 * Per pool RDMA listener creation and
495 	 * destructor thread.
496 	 *
497 	 * start rdma services and block in the kernel.
498 	 * (only if proto or provider is not set to TCP or UDP)
499 	 */
500 	if ((proto == NULL) && (provider == NULL)) {
501 		if (svcrdma(NFS_SVCPOOL_ID, nfs_server_vers_min,
502 		    nfs_server_vers_max, nfs_server_delegation)) {
503 			fprintf(stderr,
504 			    "Can't set up RDMA creator thread : %s",
505 			    strerror(errno));
506 		}
507 	}
508 
509 	/*
510 	 * Now open up for signal delivery
511 	 */
512 
513 	(void) thr_sigsetmask(SIG_UNBLOCK, &sgset, NULL);
514 	sigset(SIGTERM, sigflush);
515 	sigset(SIGUSR1, quiesce);
516 
517 	/*
518 	 * Build a protocol block list for registration.
519 	 */
520 	protobp0 = protobp = (struct protob *)malloc(sizeof (struct protob));
521 	protobp->serv = "NFS";
522 	protobp->versmin = nfs_server_vers_min;
523 	protobp->versmax = nfs_server_vers_max;
524 	protobp->program = NFS_PROGRAM;
525 
526 	protobp->next = (struct protob *)malloc(sizeof (struct protob));
527 	protobp = protobp->next;
528 	protobp->serv = "NFS_ACL";		/* not used */
529 	protobp->versmin = nfs_server_vers_min;
530 	/* XXX - this needs work to get the version just right */
531 	protobp->versmax = (nfs_server_vers_max > NFS_ACL_V3) ?
532 	    NFS_ACL_V3 : nfs_server_vers_max;
533 	protobp->program = NFS_ACL_PROGRAM;
534 	protobp->next = (struct protob *)NULL;
535 
536 	if (allflag) {
537 		if (do_all(protobp0, nfssvc, 0) == -1) {
538 			fprintf(stderr, "setnetconfig failed : %s",
539 			    strerror(errno));
540 			exit(1);
541 		}
542 	} else if (proto) {
543 		/* there's more than one match for the same protocol */
544 		struct netconfig *nconf;
545 		NCONF_HANDLE *nc;
546 		bool_t	protoFound = FALSE;
547 		if ((nc = setnetconfig()) == (NCONF_HANDLE *) NULL) {
548 			fprintf(stderr, "setnetconfig failed : %s",
549 			    strerror(errno));
550 			goto done;
551 		}
552 		while (nconf = getnetconfig(nc)) {
553 			if (strcmp(nconf->nc_proto, proto) == 0) {
554 				protoFound = TRUE;
555 				do_one(nconf->nc_device, NULL,
556 				    protobp0, nfssvc, 0);
557 			}
558 		}
559 		(void) endnetconfig(nc);
560 		if (protoFound == FALSE) {
561 			fprintf(stderr,
562 			    "couldn't find netconfig entry for protocol %s",
563 			    proto);
564 		}
565 	} else if (provider)
566 		do_one(provider, proto, protobp0, nfssvc, 0);
567 	else {
568 		for (providerp = defaultproviders;
569 		    *providerp != NULL; providerp++) {
570 			provider = *providerp;
571 			do_one(provider, NULL, protobp0, nfssvc, 0);
572 		}
573 	}
574 done:
575 
576 	free(protobp);
577 	free(protobp0);
578 
579 	if (num_fds == 0) {
580 		fprintf(stderr, "Could not start NFS service for any protocol."
581 		    " Exiting");
582 		exit(1);
583 	}
584 
585 	end_listen_fds = num_fds;
586 
587 	/*
588 	 * nfsd is up and running as far as we are concerned.
589 	 */
590 	daemonize_fini(pipe_fd);
591 
592 	/*
593 	 * Get rid of unneeded privileges.
594 	 */
595 	__fini_daemon_priv(PRIV_PROC_FORK, PRIV_PROC_EXEC, PRIV_PROC_SESSION,
596 	    PRIV_FILE_LINK_ANY, PRIV_PROC_INFO, (char *)NULL);
597 
598 	/*
599 	 * Poll for non-data control events on the transport descriptors.
600 	 */
601 	poll_for_action();
602 
603 	/*
604 	 * If we get here, something failed in poll_for_action().
605 	 */
606 	return (1);
607 }
608 
609 static int
610 nfssvcpool(int maxservers)
611 {
612 	struct svcpool_args npa;
613 
614 	npa.id = NFS_SVCPOOL_ID;
615 	npa.maxthreads = maxservers;
616 	npa.redline = 0;
617 	npa.qsize = 0;
618 	npa.timeout = 0;
619 	npa.stksize = 0;
620 	npa.max_same_xprt = 0;
621 	return (_nfssys(SVCPOOL_CREATE, &npa));
622 }
623 
624 /*
625  * Establish NFS service thread.
626  */
627 static int
628 nfssvc(int fd, struct netbuf addrmask, struct netconfig *nconf)
629 {
630 	struct nfs_svc_args nsa;
631 
632 	nsa.fd = fd;
633 	nsa.netid = nconf->nc_netid;
634 	nsa.addrmask = addrmask;
635 	if (strncasecmp(nconf->nc_proto, NC_UDP, strlen(NC_UDP)) == 0) {
636 		nsa.versmax = (nfs_server_vers_max > NFS_V3) ?
637 		    NFS_V3 : nfs_server_vers_max;
638 		nsa.versmin = nfs_server_vers_min;
639 		/*
640 		 * If no version left, silently do nothing, previous
641 		 * checks will have assured at least TCP is available.
642 		 */
643 		if (nsa.versmin > nsa.versmax)
644 			return (0);
645 	} else {
646 		nsa.versmax = nfs_server_vers_max;
647 		nsa.versmin = nfs_server_vers_min;
648 	}
649 	nsa.delegation = nfs_server_delegation;
650 	return (_nfssys(NFS_SVC, &nsa));
651 }
652 
653 static void
654 usage(void)
655 {
656 	(void) fprintf(stderr,
657 "usage: %s [ -a ] [ -c max_conns ] [ -p protocol ] [ -t transport ] ", MyName);
658 	(void) fprintf(stderr, "\n[ -l listen_backlog ] [ nservers ]\n");
659 	(void) fprintf(stderr,
660 "\twhere -a causes <nservers> to be started on each appropriate transport,\n");
661 	(void) fprintf(stderr,
662 "\tmax_conns is the maximum number of concurrent connections allowed,\n");
663 	(void) fprintf(stderr, "\t\tand max_conns must be a decimal number");
664 	(void) fprintf(stderr, "> zero,\n");
665 	(void) fprintf(stderr, "\tprotocol is a protocol identifier,\n");
666 	(void) fprintf(stderr,
667 	    "\ttransport is a transport provider name (i.e. device),\n");
668 	(void) fprintf(stderr,
669 	    "\tlisten_backlog is the TCP listen backlog,\n");
670 	(void) fprintf(stderr,
671 	    "\tand <nservers> must be a decimal number > zero.\n");
672 	exit(1);
673 }
674 
675 /*
676  * Issue nfssys system call to flush all logging buffers asynchronously.
677  *
678  * NOTICE: It is extremely important to flush NFS logging buffers when
679  *	   nfsd exits. When the system is halted or rebooted nfslogd
680  *	   may not have an opportunity to flush the buffers.
681  */
682 static void
683 nfsl_flush()
684 {
685 	struct nfsl_flush_args nfa;
686 
687 	memset((void *)&nfa, 0, sizeof (nfa));
688 	nfa.version = NFSL_FLUSH_ARGS_VERS;
689 	nfa.directive = NFSL_ALL;	/* flush all asynchronously */
690 
691 	if (_nfssys(LOG_FLUSH, &nfa) < 0)
692 		syslog(LOG_ERR, "_nfssys(LOG_FLUSH) failed: %s\n",
693 		    strerror(errno));
694 }
695 
/*
 * SIGTERM handler: flush the NFS logging buffers, then terminate the
 * process immediately with status 0.  The signal number is not used.
 */
static void
sigflush(int sig)
{
	(void) sig;

	nfsl_flush();

	_exit(0);
}
706 
707 /*
708  * SIGUSR1 handler.
709  *
710  * Request that server quiesce, then (nfsd) exit. For subsequent warm start.
711  *
712  * This is a Contracted Project Private interface, for the sole use
713  * of Sun Cluster HA-NFS. See PSARC/2004/497.
714  *
715  * Equivalent to SIGTERM handler if nfs_server_vers_max < QUIESCE_VERSMIN.
716  */
717 static void
718 quiesce(int sig)
719 {
720 	int error;
721 	int id = NFS_SVCPOOL_ID;
722 
723 	if (nfs_server_vers_max >= QUIESCE_VERSMIN) {
724 		/* Request server quiesce at next shutdown */
725 		error = _nfssys(NFS4_SVC_REQUEST_QUIESCE, &id);
726 
727 		/*
728 		 * ENOENT is returned if there is no matching SVC pool
729 		 * for the id. Possibly because the pool is not yet setup.
730 		 * In this case, just exit as if no error. For all other errors,
731 		 * just return and allow caller to retry.
732 		 */
733 		if (error && errno != ENOENT) {
734 			syslog(LOG_ERR,
735 			    "_nfssys(NFS4_SVC_REQUEST_QUIESCE) failed: %s",
736 			    strerror(errno));
737 			return;
738 		}
739 	}
740 
741 	/* Flush logging buffers */
742 	nfsl_flush();
743 
744 	_exit(0);
745 }
746 
747 /*
748  * DSS: distributed stable storage.
749  * Create leaf directories as required, keeping an eye on path
750  * lengths. Calls exit(1) on failure.
751  * The pathnames passed in must already exist, and must be writeable by nfsd.
752  * Note: the leaf directories under NFS4_VAR_DIR are not created here;
753  * they're created at pkg install.
754  */
755 static void
756 dss_mkleafdirs(uint_t npaths, char **pathnames)
757 {
758 	int i;
759 	char *tmppath = NULL;
760 
761 	/*
762 	 * Create the temporary storage used by dss_mkleafdir() here,
763 	 * rather than in that function, so that it only needs to be
764 	 * done once, rather than once for each call. Too big to put
765 	 * on the function's stack.
766 	 */
767 	tmppath = (char *)malloc(MAXPATHLEN);
768 	if (tmppath == NULL) {
769 		syslog(LOG_ERR, "tmppath malloc failed. Exiting");
770 		exit(1);
771 	}
772 
773 	for (i = 0; i < npaths; i++) {
774 		char *p = pathnames[i];
775 
776 		dss_mkleafdir(p, NFS4_DSS_STATE_LEAF, tmppath);
777 		dss_mkleafdir(p, NFS4_DSS_OLDSTATE_LEAF, tmppath);
778 	}
779 
780 	free(tmppath);
781 }
782 
783 /*
784  * Create "leaf" in "dir" (which must already exist).
785  * leaf: should start with a '/'
786  */
787 static void
788 dss_mkleafdir(char *dir, char *leaf, char *tmppath)
789 {
790 	/* MAXPATHLEN includes the terminating NUL */
791 	if (strlen(dir) + strlen(leaf) > MAXPATHLEN - 1) {
792 		fprintf(stderr, "stable storage path too long: %s%s. Exiting",
793 		    dir, leaf);
794 		exit(1);
795 	}
796 
797 	(void) snprintf(tmppath, MAXPATHLEN, "%s/%s", dir, leaf);
798 
799 	/* the directory may already exist: that's OK */
800 	if (mkdir(tmppath, NFS4_DSS_DIR_MODE) == -1 && errno != EEXIST) {
801 		fprintf(stderr, "error creating stable storage directory: "
802 		    "%s: %s. Exiting", strerror(errno), tmppath);
803 		exit(1);
804 	}
805 }
806 
807 /*
808  * Create the storage dirs, and pass the path list to the kernel.
809  * This requires the nfssrv module to be loaded; the _nfssys() syscall
810  * will fail ENOTSUP if it is not.
811  * Use libnvpair(3LIB) to pass the data to the kernel.
812  */
813 static int
814 dss_init(uint_t npaths, char **pathnames)
815 {
816 	int i, j, nskipped, error;
817 	char *bufp;
818 	uint32_t bufsize;
819 	size_t buflen;
820 	nvlist_t *nvl;
821 
822 	if (npaths > 1) {
823 		/*
824 		 * We need to remove duplicate paths; this might be user error
825 		 * in the general case, but HA-NFSv4 can also cause this.
826 		 * Sort the pathnames array, and NULL out duplicates,
827 		 * then write the non-NULL entries to a new array.
828 		 * Sorting will also allow the kernel to optimise its searches.
829 		 */
830 
831 		qsort(pathnames, npaths, sizeof (char *), qstrcmp);
832 
833 		/* now NULL out any duplicates */
834 		i = 0; j = 1; nskipped = 0;
835 		while (j < npaths) {
836 			if (strcmp(pathnames[i], pathnames[j]) == NULL) {
837 				pathnames[j] = NULL;
838 				j++;
839 				nskipped++;
840 				continue;
841 			}
842 
843 			/* skip i over any of its NULLed duplicates */
844 			i = j++;
845 		}
846 
847 		/* finally, write the non-NULL entries to a new array */
848 		if (nskipped > 0) {
849 			int nreal;
850 			size_t sz;
851 			char **tmp_pathnames;
852 
853 			nreal = npaths - nskipped;
854 
855 			sz = nreal * sizeof (char *);
856 			tmp_pathnames = (char **)malloc(sz);
857 			if (tmp_pathnames == NULL) {
858 				fprintf(stderr, "tmp_pathnames malloc failed");
859 				exit(1);
860 			}
861 
862 			for (i = 0, j = 0; i < npaths; i++)
863 				if (pathnames[i] != NULL)
864 					tmp_pathnames[j++] = pathnames[i];
865 			free(pathnames);
866 			pathnames = tmp_pathnames;
867 			npaths = nreal;
868 		}
869 
870 	}
871 
872 	/* Create directories to store the distributed state files */
873 	dss_mkleafdirs(npaths, pathnames);
874 
875 	/* Create the name-value pair list */
876 	error = nvlist_alloc(&nvl, NV_UNIQUE_NAME, 0);
877 	if (error) {
878 		fprintf(stderr, "nvlist_alloc failed: %s.", strerror(errno));
879 		return (1);
880 	}
881 
882 	/* Add the pathnames array as a single name-value pair */
883 	error = nvlist_add_string_array(nvl, NFS4_DSS_NVPAIR_NAME,
884 	    pathnames, npaths);
885 	if (error) {
886 		fprintf(stderr, "nvlist_add_string_array failed: %s.",
887 		    strerror(errno));
888 		nvlist_free(nvl);
889 		return (1);
890 	}
891 
892 	/*
893 	 * Pack list into contiguous memory, for passing to kernel.
894 	 * nvlist_pack() will allocate the memory for the buffer,
895 	 * which we should free() when no longer needed.
896 	 * NV_ENCODE_XDR for safety across ILP32/LP64 kernel boundary.
897 	 */
898 	bufp = NULL;
899 	error = nvlist_pack(nvl, &bufp, &buflen, NV_ENCODE_XDR, 0);
900 	if (error) {
901 		fprintf(stderr, "nvlist_pack failed: %s.", strerror(errno));
902 		nvlist_free(nvl);
903 		return (1);
904 	}
905 
906 	/* Now we have the packed buffer, we no longer need the list */
907 	nvlist_free(nvl);
908 
909 	/*
910 	 * Let the kernel know in advance how big the buffer is.
911 	 * NOTE: we cannot just pass buflen, since size_t is a long, and
912 	 * thus a different size between ILP32 userland and LP64 kernel.
913 	 * Use an int for the transfer, since that should be big enough;
914 	 * this is a no-op at the moment, here, since nfsd is 32-bit, but
915 	 * that could change.
916 	 */
917 	bufsize = (uint32_t)buflen;
918 	error = _nfssys(NFS4_DSS_SETPATHS_SIZE, &bufsize);
919 	if (error) {
920 		fprintf(stderr,
921 		    "_nfssys(NFS4_DSS_SETPATHS_SIZE) failed: %s. ",
922 		    strerror(errno));
923 		free(bufp);
924 		return (1);
925 	}
926 
927 	/* Pass the packed buffer to the kernel */
928 	error = _nfssys(NFS4_DSS_SETPATHS, bufp);
929 	if (error) {
930 		fprintf(stderr,
931 		    "_nfssys(NFS4_DSS_SETPATHS) failed: %s. ", strerror(errno));
932 		free(bufp);
933 		return (1);
934 	}
935 
936 	/*
937 	 * The kernel has now unpacked the buffer and extracted the
938 	 * pathnames array, we no longer need the buffer.
939 	 */
940 	free(bufp);
941 
942 	return (0);
943 }
944 
/*
 * String comparison routine with the signature qsort() expects.
 * p1 and p2 each point at an array element, i.e. at a "char *";
 * the strings they refer to are compared with strcmp().
 */
int
qstrcmp(const void *p1, const void *p2)
{
	const char *const *lhs = p1;
	const char *const *rhs = p2;

	return (strcmp(*lhs, *rhs));
}
957