xref: /freebsd/sys/nlm/nlm_prot_impl.c (revision d0b2dbfa0ecf2bbc9709efc5e20baf8e4b44bbbf)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause
3  *
4  * Copyright (c) 2008 Isilon Inc http://www.isilon.com/
5  * Authors: Doug Rabson <dfr@rabson.org>
6  * Developed with Red Inc: Alfred Perlstein <alfred@freebsd.org>
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
18  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
21  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27  * SUCH DAMAGE.
28  */
29 
30 #include "opt_inet6.h"
31 
32 #include <sys/cdefs.h>
33 #include <sys/param.h>
34 #include <sys/fail.h>
35 #include <sys/fcntl.h>
36 #include <sys/kernel.h>
37 #include <sys/kthread.h>
38 #include <sys/lockf.h>
39 #include <sys/malloc.h>
40 #include <sys/mount.h>
41 #include <sys/priv.h>
42 #include <sys/proc.h>
43 #include <sys/socket.h>
44 #include <sys/socketvar.h>
45 #include <sys/syscall.h>
46 #include <sys/sysctl.h>
47 #include <sys/sysent.h>
48 #include <sys/syslog.h>
49 #include <sys/sysproto.h>
50 #include <sys/systm.h>
51 #include <sys/taskqueue.h>
52 #include <sys/unistd.h>
53 #include <sys/vnode.h>
54 
55 #include <nfs/nfsproto.h>
56 #include <nfs/nfs_lock.h>
57 
58 #include <nlm/nlm_prot.h>
59 #include <nlm/sm_inter.h>
60 #include <nlm/nlm.h>
61 #include <rpc/rpc_com.h>
62 #include <rpc/rpcb_prot.h>
63 
/*
 * Malloc type used for the allocations made by the NLM code in this
 * file (hosts, async lock requests).
 */
MALLOC_DEFINE(M_NLM, "NLM", "Network Lock Manager");

/*
 * If a host is inactive (and holds no locks) for this many seconds,
 * we consider it idle and stop monitoring it.
 */
#define NLM_IDLE_TIMEOUT	30

/*
 * Minimum number of seconds between two scans of the host list for
 * idle hosts.
 */
#define NLM_IDLE_PERIOD		5

/*
 * Number of seconds a granted async lock stays on a host's
 * nh_granted list, i.e. how long we keep looking for the matching
 * GRANTED_RES message.
 */
#define NLM_EXPIRE_TIMEOUT	10
81 
/*
 * Support for sysctl vfs.nlm.sysid. Each host created by
 * nlm_create_host() adds a child node, named after its sysid, below
 * vfs.nlm.sysid.
 */
static SYSCTL_NODE(_vfs, OID_AUTO, nlm, CTLFLAG_RW | CTLFLAG_MPSAFE, NULL,
    "Network Lock Manager");
static SYSCTL_NODE(_vfs_nlm, OID_AUTO, sysid,
    CTLFLAG_RW | CTLFLAG_MPSAFE, NULL,
    "");

/*
 * Syscall hooks. The table is registered by nlm_init() and
 * unregistered by nlm_uninit().
 */
static struct syscall_helper_data nlm_syscalls[] = {
	SYSCALL_INIT_HELPER(nlm_syscall),
	SYSCALL_INIT_LAST
};
98 
/*
 * Debug level passed in from userland. We also support a sysctl hook
 * (debug.nlm_debug) so that it can be changed on a live system.
 */
static int nlm_debug_level;
SYSCTL_INT(_debug, OID_AUTO, nlm_debug, CTLFLAG_RW, &nlm_debug_level, 0, "");

/*
 * Log a printf-style message at LOG_DEBUG priority, but only when the
 * current debug level is at least '_level'. The message arguments are
 * not evaluated when the level is too low.
 */
#define NLM_DEBUG(_level, args...)			\
	do {						\
		if (nlm_debug_level >= (_level))	\
			log(LOG_DEBUG, args);		\
	} while(0)
/*
 * Unconditionally log a printf-style message at LOG_ERR priority.
 */
#define NLM_ERR(args...)			\
	do {					\
		log(LOG_ERR, args);		\
	} while(0)
115 
/*
 * Grace period handling. The value of nlm_grace_threshold is the
 * value of time_uptime after which we are serving requests normally.
 */
static time_t nlm_grace_threshold;

/*
 * We check for idle hosts if time_uptime is greater than
 * nlm_next_idle_check. nlm_check_idle() advances it by
 * NLM_IDLE_PERIOD each time it performs a scan.
 */
static time_t nlm_next_idle_check;

/*
 * A flag to indicate the server is already running.
 */
static int nlm_is_running;

/*
 * A socket to use for RPC - shared by all IPv4 RPC clients.
 */
static struct socket *nlm_socket;

#ifdef INET6

/*
 * A socket to use for RPC - shared by all IPv6 RPC clients.
 */
static struct socket *nlm_socket6;

#endif

/*
 * An RPC client handle that can be used to communicate with the local
 * NSM.
 */
static CLIENT *nlm_nsm;

/*
 * An AUTH handle for the server's creds. It is attached to every
 * client handle returned by nlm_get_rpc().
 */
static AUTH *nlm_auth;

/*
 * A zero timeval for sending async RPC messages.
 */
struct timeval nlm_zero_tv = { 0, 0 };

/*
 * The local NSM state number
 */
int nlm_nsm_state;

/*
 * A lock to protect the host list and waiting lock list. It also
 * guards nlm_next_sysid.
 */
static struct mtx nlm_global_lock;
172 
173 /*
174  * Locks:
175  * (l)		locked by nh_lock
176  * (s)		only accessed via server RPC which is single threaded
177  * (g)		locked by nlm_global_lock
178  * (c)		const until freeing
179  * (a)		modified using atomic ops
180  */
181 
/*
 * A pending client-side lock request, stored on the nlm_waiting_locks
 * list. The letter in each field comment refers to the locking key
 * above.
 */
struct nlm_waiting_lock {
	TAILQ_ENTRY(nlm_waiting_lock) nw_link; /* (g) */
	bool_t		nw_waiting;	       /* (g) */
	nlm4_lock	nw_lock;	       /* (c) */
	union nfsfh	nw_fh;		       /* (c) */
	struct vnode	*nw_vp;		       /* (c) */
};
TAILQ_HEAD(nlm_waiting_lock_list, nlm_waiting_lock);

/* Global list of pending client-side lock requests. */
struct nlm_waiting_lock_list nlm_waiting_locks; /* (g) */
196 
/*
 * A pending server-side asynchronous lock request. It starts out on
 * the nh_pending list of the NLM host, is moved to nh_granted by
 * nlm_lock_callback() once the lock has been granted, and is freed
 * with nlm_free_async_lock().
 */
struct nlm_async_lock {
	TAILQ_ENTRY(nlm_async_lock) af_link; /* (l) host's list of locks */
	struct task	af_task;	/* (c) async callback details */
	void		*af_cookie;	/* (l) lock manager cancel token */
	struct vnode	*af_vp;		/* (l) vnode to lock */
	struct flock	af_fl;		/* (c) lock details */
	struct nlm_host *af_host;	/* (c) host which is locking */
	CLIENT		*af_rpc;	/* (c) rpc client to send message */
	nlm4_testargs	af_granted;	/* (c) notification details */
	time_t		af_expiretime;	/* (c) notification time */
};
TAILQ_HEAD(nlm_async_lock_list, nlm_async_lock);
213 
/*
 * NLM host.
 *
 * nlm_host_state records how the host stands with respect to the local
 * NSM: hosts start out NLM_UNMONITORED, and NLM_RECOVERING is set
 * while a client lock recovery thread is running for the host.
 */
enum nlm_host_state {
	NLM_UNMONITORED,
	NLM_MONITORED,
	NLM_MONITOR_FAILED,
	NLM_RECOVERING
};

/*
 * An RPC client handle together with the time it was created.
 */
struct nlm_rpc {
	CLIENT		*nr_client;    /* (l) RPC client handle */
	time_t		nr_create_time; /* (l) when client was created */
};
228 
/*
 * Per-host state. Hosts are reference counted (nh_refs) and linked on
 * the global nlm_hosts list; the letter in each field comment refers
 * to the locking key above.
 */
struct nlm_host {
	struct mtx	nh_lock;
	volatile u_int	nh_refs;       /* (a) reference count */
	TAILQ_ENTRY(nlm_host) nh_link; /* (g) global list of hosts */
	char		nh_caller_name[MAXNAMELEN]; /* (c) printable name of host */
	uint32_t	nh_sysid;	 /* (c) our allocated system ID */
	char		nh_sysid_string[10]; /* (c) string rep. of sysid */
	struct sockaddr_storage	nh_addr; /* (s) remote address of host */
	struct nlm_rpc	nh_srvrpc;	 /* (l) RPC for server replies */
	struct nlm_rpc	nh_clntrpc;	 /* (l) RPC for client requests */
	rpcvers_t	nh_vers;	 /* (s) NLM version of host */
	int		nh_state;	 /* (s) last seen NSM state of host */
	enum nlm_host_state nh_monstate; /* (l) local NSM monitoring state */
	time_t		nh_idle_timeout; /* (s) Time at which host is idle */
	struct sysctl_ctx_list nh_sysctl; /* (c) vfs.nlm.sysid nodes */
	uint32_t	nh_grantcookie;  /* (l) grant cookie counter */
	struct nlm_async_lock_list nh_pending; /* (l) pending async locks */
	struct nlm_async_lock_list nh_granted; /* (l) granted locks */
	struct nlm_async_lock_list nh_finished; /* (l) finished async locks */
};
TAILQ_HEAD(nlm_host_list, nlm_host);

/* All known hosts, and the next system ID to hand out. */
static struct nlm_host_list nlm_hosts; /* (g) */
static uint32_t nlm_next_sysid = 1;    /* (g) */

/* Defined further down; needed by nlm_check_idle(). */
static void	nlm_host_unmonitor(struct nlm_host *);
255 
/*
 * Layout of the bytes carried in the cookie of a granted message: the
 * sysid of the host followed by a per-host cookie value. The fields
 * are read back with ng_sysid() and ng_cookie().
 */
struct nlm_grantcookie {
	uint32_t	ng_sysid;
	uint32_t	ng_cookie;
};
260 
261 static inline uint32_t
262 ng_sysid(struct netobj *src)
263 {
264 
265 	return ((struct nlm_grantcookie *)src->n_bytes)->ng_sysid;
266 }
267 
268 static inline uint32_t
269 ng_cookie(struct netobj *src)
270 {
271 
272 	return ((struct nlm_grantcookie *)src->n_bytes)->ng_cookie;
273 }
274 
275 /**********************************************************************/
276 
277 /*
278  * Initialise NLM globals.
279  */
280 static int
281 nlm_init(void)
282 {
283 	int error;
284 
285 	mtx_init(&nlm_global_lock, "nlm_global_lock", NULL, MTX_DEF);
286 	TAILQ_INIT(&nlm_waiting_locks);
287 	TAILQ_INIT(&nlm_hosts);
288 
289 	error = syscall_helper_register(nlm_syscalls, SY_THR_STATIC_KLD);
290 	if (error != 0)
291 		NLM_ERR("Can't register NLM syscall\n");
292 	return (error);
293 }
294 
/*
 * Undo nlm_init(): unregister the NLM syscall.
 */
static void
nlm_uninit(void)
{

	syscall_helper_unregister(nlm_syscalls);
}
301 
302 /*
303  * Create a netobj from an arbitrary source.
304  */
305 void
306 nlm_make_netobj(struct netobj *dst, caddr_t src, size_t srcsize,
307     struct malloc_type *type)
308 {
309 
310 	dst->n_len = srcsize;
311 	dst->n_bytes = malloc(srcsize, type, M_WAITOK);
312 	memcpy(dst->n_bytes, src, srcsize);
313 }
314 
/*
 * Copy a struct netobj. 'dst' receives a freshly allocated copy of
 * the bytes of 'src', allocated from malloc type 'type' by
 * nlm_make_netobj().
 */
void
nlm_copy_netobj(struct netobj *dst, struct netobj *src,
    struct malloc_type *type)
{

	nlm_make_netobj(dst, src->n_bytes, src->n_len, type);
}
325 
326 /*
327  * Create an RPC client handle for the given (address,prog,vers)
328  * triple using UDP.
329  */
330 static CLIENT *
331 nlm_get_rpc(struct sockaddr *sa, rpcprog_t prog, rpcvers_t vers)
332 {
333 	char *wchan = "nlmrcv";
334 	struct sockaddr_storage ss;
335 	struct socket *so;
336 	CLIENT *rpcb;
337 	struct timeval timo;
338 	RPCB parms;
339 	char *uaddr;
340 	enum clnt_stat stat = RPC_SUCCESS;
341 	int rpcvers = RPCBVERS4;
342 	bool_t do_tcp = FALSE;
343 	bool_t tryagain = FALSE;
344 	struct portmap mapping;
345 	u_short port = 0;
346 
347 	/*
348 	 * First we need to contact the remote RPCBIND service to find
349 	 * the right port.
350 	 */
351 	memcpy(&ss, sa, sa->sa_len);
352 	switch (ss.ss_family) {
353 	case AF_INET:
354 		((struct sockaddr_in *)&ss)->sin_port = htons(111);
355 		so = nlm_socket;
356 		break;
357 #ifdef INET6
358 	case AF_INET6:
359 		((struct sockaddr_in6 *)&ss)->sin6_port = htons(111);
360 		so = nlm_socket6;
361 		break;
362 #endif
363 
364 	default:
365 		/*
366 		 * Unsupported address family - fail.
367 		 */
368 		return (NULL);
369 	}
370 
371 	rpcb = clnt_dg_create(so, (struct sockaddr *)&ss,
372 	    RPCBPROG, rpcvers, 0, 0);
373 	if (!rpcb)
374 		return (NULL);
375 
376 try_tcp:
377 	parms.r_prog = prog;
378 	parms.r_vers = vers;
379 	if (do_tcp)
380 		parms.r_netid = "tcp";
381 	else
382 		parms.r_netid = "udp";
383 	parms.r_addr = "";
384 	parms.r_owner = "";
385 
386 	/*
387 	 * Use the default timeout.
388 	 */
389 	timo.tv_sec = 25;
390 	timo.tv_usec = 0;
391 again:
392 	switch (rpcvers) {
393 	case RPCBVERS4:
394 	case RPCBVERS:
395 		/*
396 		 * Try RPCBIND 4 then 3.
397 		 */
398 		uaddr = NULL;
399 		stat = CLNT_CALL(rpcb, (rpcprog_t) RPCBPROC_GETADDR,
400 		    (xdrproc_t) xdr_rpcb, &parms,
401 		    (xdrproc_t) xdr_wrapstring, &uaddr, timo);
402 		if (stat == RPC_SUCCESS) {
403 			/*
404 			 * We have a reply from the remote RPCBIND - turn it
405 			 * into an appropriate address and make a new client
406 			 * that can talk to the remote NLM.
407 			 *
408 			 * XXX fixup IPv6 scope ID.
409 			 */
410 			struct netbuf *a;
411 			a = __rpc_uaddr2taddr_af(ss.ss_family, uaddr);
412 			if (!a) {
413 				tryagain = TRUE;
414 			} else {
415 				tryagain = FALSE;
416 				memcpy(&ss, a->buf, a->len);
417 				free(a->buf, M_RPC);
418 				free(a, M_RPC);
419 				xdr_free((xdrproc_t) xdr_wrapstring, &uaddr);
420 			}
421 		}
422 		if (tryagain || stat == RPC_PROGVERSMISMATCH) {
423 			if (rpcvers == RPCBVERS4)
424 				rpcvers = RPCBVERS;
425 			else if (rpcvers == RPCBVERS)
426 				rpcvers = PMAPVERS;
427 			CLNT_CONTROL(rpcb, CLSET_VERS, &rpcvers);
428 			goto again;
429 		}
430 		break;
431 	case PMAPVERS:
432 		/*
433 		 * Try portmap.
434 		 */
435 		mapping.pm_prog = parms.r_prog;
436 		mapping.pm_vers = parms.r_vers;
437 		mapping.pm_prot = do_tcp ? IPPROTO_TCP : IPPROTO_UDP;
438 		mapping.pm_port = 0;
439 
440 		stat = CLNT_CALL(rpcb, (rpcprog_t) PMAPPROC_GETPORT,
441 		    (xdrproc_t) xdr_portmap, &mapping,
442 		    (xdrproc_t) xdr_u_short, &port, timo);
443 
444 		if (stat == RPC_SUCCESS) {
445 			switch (ss.ss_family) {
446 			case AF_INET:
447 				((struct sockaddr_in *)&ss)->sin_port =
448 					htons(port);
449 				break;
450 
451 #ifdef INET6
452 			case AF_INET6:
453 				((struct sockaddr_in6 *)&ss)->sin6_port =
454 					htons(port);
455 				break;
456 #endif
457 			}
458 		}
459 		break;
460 	default:
461 		panic("invalid rpcvers %d", rpcvers);
462 	}
463 	/*
464 	 * We may have a positive response from the portmapper, but the NLM
465 	 * service was not found. Make sure we received a valid port.
466 	 */
467 	switch (ss.ss_family) {
468 	case AF_INET:
469 		port = ((struct sockaddr_in *)&ss)->sin_port;
470 		break;
471 #ifdef INET6
472 	case AF_INET6:
473 		port = ((struct sockaddr_in6 *)&ss)->sin6_port;
474 		break;
475 #endif
476 	}
477 	if (stat != RPC_SUCCESS || !port) {
478 		/*
479 		 * If we were able to talk to rpcbind or portmap, but the udp
480 		 * variant wasn't available, ask about tcp.
481 		 *
482 		 * XXX - We could also check for a TCP portmapper, but
483 		 * if the host is running a portmapper at all, we should be able
484 		 * to hail it over UDP.
485 		 */
486 		if (stat == RPC_SUCCESS && !do_tcp) {
487 			do_tcp = TRUE;
488 			goto try_tcp;
489 		}
490 
491 		/* Otherwise, bad news. */
492 		NLM_ERR("NLM: failed to contact remote rpcbind, "
493 		    "stat = %d, port = %d\n", (int) stat, port);
494 		CLNT_DESTROY(rpcb);
495 		return (NULL);
496 	}
497 
498 	if (do_tcp) {
499 		/*
500 		 * Destroy the UDP client we used to speak to rpcbind and
501 		 * recreate as a TCP client.
502 		 */
503 		struct netconfig *nconf = NULL;
504 
505 		CLNT_DESTROY(rpcb);
506 
507 		switch (ss.ss_family) {
508 		case AF_INET:
509 			nconf = getnetconfigent("tcp");
510 			break;
511 #ifdef INET6
512 		case AF_INET6:
513 			nconf = getnetconfigent("tcp6");
514 			break;
515 #endif
516 		}
517 
518 		rpcb = clnt_reconnect_create(nconf, (struct sockaddr *)&ss,
519 		    prog, vers, 0, 0);
520 		CLNT_CONTROL(rpcb, CLSET_WAITCHAN, wchan);
521 		rpcb->cl_auth = nlm_auth;
522 
523 	} else {
524 		/*
525 		 * Re-use the client we used to speak to rpcbind.
526 		 */
527 		CLNT_CONTROL(rpcb, CLSET_SVC_ADDR, &ss);
528 		CLNT_CONTROL(rpcb, CLSET_PROG, &prog);
529 		CLNT_CONTROL(rpcb, CLSET_VERS, &vers);
530 		CLNT_CONTROL(rpcb, CLSET_WAITCHAN, wchan);
531 		rpcb->cl_auth = nlm_auth;
532 	}
533 
534 	return (rpcb);
535 }
536 
537 /*
538  * This async callback after when an async lock request has been
539  * granted. We notify the host which initiated the request.
540  */
541 static void
542 nlm_lock_callback(void *arg, int pending)
543 {
544 	struct nlm_async_lock *af = (struct nlm_async_lock *) arg;
545 	struct rpc_callextra ext;
546 
547 	NLM_DEBUG(2, "NLM: async lock %p for %s (sysid %d) granted,"
548 	    " cookie %d:%d\n", af, af->af_host->nh_caller_name,
549 	    af->af_host->nh_sysid, ng_sysid(&af->af_granted.cookie),
550 	    ng_cookie(&af->af_granted.cookie));
551 
552 	/*
553 	 * Send the results back to the host.
554 	 *
555 	 * Note: there is a possible race here with nlm_host_notify
556 	 * destroying the RPC client. To avoid problems, the first
557 	 * thing nlm_host_notify does is to cancel pending async lock
558 	 * requests.
559 	 */
560 	memset(&ext, 0, sizeof(ext));
561 	ext.rc_auth = nlm_auth;
562 	if (af->af_host->nh_vers == NLM_VERS4) {
563 		nlm4_granted_msg_4(&af->af_granted,
564 		    NULL, af->af_rpc, &ext, nlm_zero_tv);
565 	} else {
566 		/*
567 		 * Back-convert to legacy protocol
568 		 */
569 		nlm_testargs granted;
570 		granted.cookie = af->af_granted.cookie;
571 		granted.exclusive = af->af_granted.exclusive;
572 		granted.alock.caller_name =
573 			af->af_granted.alock.caller_name;
574 		granted.alock.fh = af->af_granted.alock.fh;
575 		granted.alock.oh = af->af_granted.alock.oh;
576 		granted.alock.svid = af->af_granted.alock.svid;
577 		granted.alock.l_offset =
578 			af->af_granted.alock.l_offset;
579 		granted.alock.l_len =
580 			af->af_granted.alock.l_len;
581 
582 		nlm_granted_msg_1(&granted,
583 		    NULL, af->af_rpc, &ext, nlm_zero_tv);
584 	}
585 
586 	/*
587 	 * Move this entry to the nh_granted list.
588 	 */
589 	af->af_expiretime = time_uptime + NLM_EXPIRE_TIMEOUT;
590 	mtx_lock(&af->af_host->nh_lock);
591 	TAILQ_REMOVE(&af->af_host->nh_pending, af, af_link);
592 	TAILQ_INSERT_TAIL(&af->af_host->nh_granted, af, af_link);
593 	mtx_unlock(&af->af_host->nh_lock);
594 }
595 
596 /*
597  * Free an async lock request. The request must have been removed from
598  * any list.
599  */
600 static void
601 nlm_free_async_lock(struct nlm_async_lock *af)
602 {
603 	/*
604 	 * Free an async lock.
605 	 */
606 	if (af->af_rpc)
607 		CLNT_RELEASE(af->af_rpc);
608 	xdr_free((xdrproc_t) xdr_nlm4_testargs, &af->af_granted);
609 	if (af->af_vp)
610 		vrele(af->af_vp);
611 	free(af, M_NLM);
612 }
613 
614 /*
615  * Cancel our async request - this must be called with
616  * af->nh_host->nh_lock held. This is slightly complicated by a
617  * potential race with our own callback. If we fail to cancel the
618  * lock, it must already have been granted - we make sure our async
619  * task has completed by calling taskqueue_drain in this case.
620  */
621 static int
622 nlm_cancel_async_lock(struct nlm_async_lock *af)
623 {
624 	struct nlm_host *host = af->af_host;
625 	int error;
626 
627 	mtx_assert(&host->nh_lock, MA_OWNED);
628 
629 	mtx_unlock(&host->nh_lock);
630 
631 	error = VOP_ADVLOCKASYNC(af->af_vp, NULL, F_CANCEL, &af->af_fl,
632 	    F_REMOTE, NULL, &af->af_cookie);
633 
634 	if (error) {
635 		/*
636 		 * We failed to cancel - make sure our callback has
637 		 * completed before we continue.
638 		 */
639 		taskqueue_drain(taskqueue_thread, &af->af_task);
640 	}
641 
642 	mtx_lock(&host->nh_lock);
643 
644 	if (!error) {
645 		NLM_DEBUG(2, "NLM: async lock %p for %s (sysid %d) "
646 		    "cancelled\n", af, host->nh_caller_name, host->nh_sysid);
647 
648 		/*
649 		 * Remove from the nh_pending list and free now that
650 		 * we are safe from the callback.
651 		 */
652 		TAILQ_REMOVE(&host->nh_pending, af, af_link);
653 		mtx_unlock(&host->nh_lock);
654 		nlm_free_async_lock(af);
655 		mtx_lock(&host->nh_lock);
656 	}
657 
658 	return (error);
659 }
660 
661 static void
662 nlm_check_expired_locks(struct nlm_host *host)
663 {
664 	struct nlm_async_lock *af;
665 	time_t uptime = time_uptime;
666 
667 	mtx_lock(&host->nh_lock);
668 	while ((af = TAILQ_FIRST(&host->nh_granted)) != NULL
669 	    && uptime >= af->af_expiretime) {
670 		NLM_DEBUG(2, "NLM: async lock %p for %s (sysid %d) expired,"
671 		    " cookie %d:%d\n", af, af->af_host->nh_caller_name,
672 		    af->af_host->nh_sysid, ng_sysid(&af->af_granted.cookie),
673 		    ng_cookie(&af->af_granted.cookie));
674 		TAILQ_REMOVE(&host->nh_granted, af, af_link);
675 		mtx_unlock(&host->nh_lock);
676 		nlm_free_async_lock(af);
677 		mtx_lock(&host->nh_lock);
678 	}
679 	while ((af = TAILQ_FIRST(&host->nh_finished)) != NULL) {
680 		TAILQ_REMOVE(&host->nh_finished, af, af_link);
681 		mtx_unlock(&host->nh_lock);
682 		nlm_free_async_lock(af);
683 		mtx_lock(&host->nh_lock);
684 	}
685 	mtx_unlock(&host->nh_lock);
686 }
687 
688 /*
689  * Free resources used by a host. This is called after the reference
690  * count has reached zero so it doesn't need to worry about locks.
691  */
692 static void
693 nlm_host_destroy(struct nlm_host *host)
694 {
695 
696 	mtx_lock(&nlm_global_lock);
697 	TAILQ_REMOVE(&nlm_hosts, host, nh_link);
698 	mtx_unlock(&nlm_global_lock);
699 
700 	if (host->nh_srvrpc.nr_client)
701 		CLNT_RELEASE(host->nh_srvrpc.nr_client);
702 	if (host->nh_clntrpc.nr_client)
703 		CLNT_RELEASE(host->nh_clntrpc.nr_client);
704 	mtx_destroy(&host->nh_lock);
705 	sysctl_ctx_free(&host->nh_sysctl);
706 	free(host, M_NLM);
707 }
708 
709 /*
710  * Thread start callback for client lock recovery
711  */
712 static void
713 nlm_client_recovery_start(void *arg)
714 {
715 	struct nlm_host *host = (struct nlm_host *) arg;
716 
717 	NLM_DEBUG(1, "NLM: client lock recovery for %s started\n",
718 	    host->nh_caller_name);
719 
720 	nlm_client_recovery(host);
721 
722 	NLM_DEBUG(1, "NLM: client lock recovery for %s completed\n",
723 	    host->nh_caller_name);
724 
725 	host->nh_monstate = NLM_MONITORED;
726 	nlm_host_release(host);
727 
728 	kthread_exit();
729 }
730 
731 /*
732  * This is called when we receive a host state change notification. We
733  * unlock any active locks owned by the host. When rpc.lockd is
734  * shutting down, this function is called with newstate set to zero
735  * which allows us to cancel any pending async locks and clear the
736  * locking state.
737  */
738 static void
739 nlm_host_notify(struct nlm_host *host, int newstate)
740 {
741 	struct nlm_async_lock *af;
742 
743 	if (newstate) {
744 		NLM_DEBUG(1, "NLM: host %s (sysid %d) rebooted, new "
745 		    "state is %d\n", host->nh_caller_name,
746 		    host->nh_sysid, newstate);
747 	}
748 
749 	/*
750 	 * Cancel any pending async locks for this host.
751 	 */
752 	mtx_lock(&host->nh_lock);
753 	while ((af = TAILQ_FIRST(&host->nh_pending)) != NULL) {
754 		/*
755 		 * nlm_cancel_async_lock will remove the entry from
756 		 * nh_pending and free it.
757 		 */
758 		nlm_cancel_async_lock(af);
759 	}
760 	mtx_unlock(&host->nh_lock);
761 	nlm_check_expired_locks(host);
762 
763 	/*
764 	 * The host just rebooted - trash its locks.
765 	 */
766 	lf_clearremotesys(host->nh_sysid);
767 	host->nh_state = newstate;
768 
769 	/*
770 	 * If we have any remote locks for this host (i.e. it
771 	 * represents a remote NFS server that our local NFS client
772 	 * has locks for), start a recovery thread.
773 	 */
774 	if (newstate != 0
775 	    && host->nh_monstate != NLM_RECOVERING
776 	    && lf_countlocks(NLM_SYSID_CLIENT | host->nh_sysid) > 0) {
777 		struct thread *td;
778 		host->nh_monstate = NLM_RECOVERING;
779 		refcount_acquire(&host->nh_refs);
780 		kthread_add(nlm_client_recovery_start, host, curproc, &td, 0, 0,
781 		    "NFS lock recovery for %s", host->nh_caller_name);
782 	}
783 }
784 
785 /*
786  * Sysctl handler to count the number of locks for a sysid.
787  */
788 static int
789 nlm_host_lock_count_sysctl(SYSCTL_HANDLER_ARGS)
790 {
791 	struct nlm_host *host;
792 	int count;
793 
794 	host = oidp->oid_arg1;
795 	count = lf_countlocks(host->nh_sysid);
796 	return sysctl_handle_int(oidp, &count, 0, req);
797 }
798 
799 /*
800  * Sysctl handler to count the number of client locks for a sysid.
801  */
802 static int
803 nlm_host_client_lock_count_sysctl(SYSCTL_HANDLER_ARGS)
804 {
805 	struct nlm_host *host;
806 	int count;
807 
808 	host = oidp->oid_arg1;
809 	count = lf_countlocks(NLM_SYSID_CLIENT | host->nh_sysid);
810 	return sysctl_handle_int(oidp, &count, 0, req);
811 }
812 
813 /*
814  * Create a new NLM host.
815  */
816 static struct nlm_host *
817 nlm_create_host(const char* caller_name)
818 {
819 	struct nlm_host *host;
820 	struct sysctl_oid *oid;
821 
822 	mtx_assert(&nlm_global_lock, MA_OWNED);
823 
824 	NLM_DEBUG(1, "NLM: new host %s (sysid %d)\n",
825 	    caller_name, nlm_next_sysid);
826 	host = malloc(sizeof(struct nlm_host), M_NLM, M_NOWAIT|M_ZERO);
827 	if (!host)
828 		return (NULL);
829 	mtx_init(&host->nh_lock, "nh_lock", NULL, MTX_DEF);
830 	refcount_init(&host->nh_refs, 1);
831 	strlcpy(host->nh_caller_name, caller_name, MAXNAMELEN);
832 	host->nh_sysid = nlm_next_sysid++;
833 	snprintf(host->nh_sysid_string, sizeof(host->nh_sysid_string),
834 		"%d", host->nh_sysid);
835 	host->nh_vers = 0;
836 	host->nh_state = 0;
837 	host->nh_monstate = NLM_UNMONITORED;
838 	host->nh_grantcookie = 1;
839 	TAILQ_INIT(&host->nh_pending);
840 	TAILQ_INIT(&host->nh_granted);
841 	TAILQ_INIT(&host->nh_finished);
842 	TAILQ_INSERT_TAIL(&nlm_hosts, host, nh_link);
843 
844 	mtx_unlock(&nlm_global_lock);
845 
846 	sysctl_ctx_init(&host->nh_sysctl);
847 	oid = SYSCTL_ADD_NODE(&host->nh_sysctl,
848 	    SYSCTL_STATIC_CHILDREN(_vfs_nlm_sysid),
849 	    OID_AUTO, host->nh_sysid_string, CTLFLAG_RD | CTLFLAG_MPSAFE,
850 	    NULL, "");
851 	SYSCTL_ADD_STRING(&host->nh_sysctl, SYSCTL_CHILDREN(oid), OID_AUTO,
852 	    "hostname", CTLFLAG_RD, host->nh_caller_name, 0, "");
853 	SYSCTL_ADD_UINT(&host->nh_sysctl, SYSCTL_CHILDREN(oid), OID_AUTO,
854 	    "version", CTLFLAG_RD, &host->nh_vers, 0, "");
855 	SYSCTL_ADD_UINT(&host->nh_sysctl, SYSCTL_CHILDREN(oid), OID_AUTO,
856 	    "monitored", CTLFLAG_RD, &host->nh_monstate, 0, "");
857 	SYSCTL_ADD_PROC(&host->nh_sysctl, SYSCTL_CHILDREN(oid), OID_AUTO,
858 	    "lock_count", CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, host,
859 	    0, nlm_host_lock_count_sysctl, "I", "");
860 	SYSCTL_ADD_PROC(&host->nh_sysctl, SYSCTL_CHILDREN(oid), OID_AUTO,
861 	    "client_lock_count", CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE,
862 	    host, 0, nlm_host_client_lock_count_sysctl, "I", "");
863 
864 	mtx_lock(&nlm_global_lock);
865 
866 	return (host);
867 }
868 
869 /*
870  * Acquire the next sysid for remote locks not handled by the NLM.
871  */
872 uint32_t
873 nlm_acquire_next_sysid(void)
874 {
875 	uint32_t next_sysid;
876 
877 	mtx_lock(&nlm_global_lock);
878 	next_sysid = nlm_next_sysid++;
879 	mtx_unlock(&nlm_global_lock);
880 	return (next_sysid);
881 }
882 
/*
 * Return non-zero if the address parts of the two sockaddrs are the
 * same. Only the address family and the network address are compared;
 * ports are ignored. Addresses of a family other than AF_INET (and
 * AF_INET6 when INET6 is configured) never compare equal.
 */
static int
nlm_compare_addr(const struct sockaddr *a, const struct sockaddr *b)
{

	if (a->sa_family != b->sa_family)
		return (FALSE);

	if (a->sa_family == AF_INET) {
		const struct sockaddr_in *lhs = (const struct sockaddr_in *) a;
		const struct sockaddr_in *rhs = (const struct sockaddr_in *) b;

		return (memcmp(&lhs->sin_addr, &rhs->sin_addr,
		    sizeof(lhs->sin_addr)) == 0);
	}
#ifdef INET6
	if (a->sa_family == AF_INET6) {
		const struct sockaddr_in6 *lhs =
		    (const struct sockaddr_in6 *) a;
		const struct sockaddr_in6 *rhs =
		    (const struct sockaddr_in6 *) b;

		return (memcmp(&lhs->sin6_addr, &rhs->sin6_addr,
		    sizeof(lhs->sin6_addr)) == 0);
	}
#endif

	return (0);
}
915 
916 /*
917  * Check for idle hosts and stop monitoring them. We could also free
918  * the host structure here, possibly after a larger timeout but that
919  * would require some care to avoid races with
920  * e.g. nlm_host_lock_count_sysctl.
921  */
922 static void
923 nlm_check_idle(void)
924 {
925 	struct nlm_host *host;
926 
927 	mtx_assert(&nlm_global_lock, MA_OWNED);
928 
929 	if (time_uptime <= nlm_next_idle_check)
930 		return;
931 
932 	nlm_next_idle_check = time_uptime + NLM_IDLE_PERIOD;
933 
934 	TAILQ_FOREACH(host, &nlm_hosts, nh_link) {
935 		if (host->nh_monstate == NLM_MONITORED
936 		    && time_uptime > host->nh_idle_timeout) {
937 			mtx_unlock(&nlm_global_lock);
938 			if (lf_countlocks(host->nh_sysid) > 0
939 			    || lf_countlocks(NLM_SYSID_CLIENT
940 				+ host->nh_sysid)) {
941 				host->nh_idle_timeout =
942 					time_uptime + NLM_IDLE_TIMEOUT;
943 				mtx_lock(&nlm_global_lock);
944 				continue;
945 			}
946 			nlm_host_unmonitor(host);
947 			mtx_lock(&nlm_global_lock);
948 		}
949 	}
950 }
951 
952 /*
953  * Search for an existing NLM host that matches the given name
954  * (typically the caller_name element of an nlm4_lock).  If none is
955  * found, create a new host. If 'addr' is non-NULL, record the remote
956  * address of the host so that we can call it back for async
957  * responses. If 'vers' is greater than zero then record the NLM
958  * program version to use to communicate with this client.
959  */
960 struct nlm_host *
961 nlm_find_host_by_name(const char *name, const struct sockaddr *addr,
962     rpcvers_t vers)
963 {
964 	struct nlm_host *host;
965 
966 	mtx_lock(&nlm_global_lock);
967 
968 	/*
969 	 * The remote host is determined by caller_name.
970 	 */
971 	TAILQ_FOREACH(host, &nlm_hosts, nh_link) {
972 		if (!strcmp(host->nh_caller_name, name))
973 			break;
974 	}
975 
976 	if (!host) {
977 		host = nlm_create_host(name);
978 		if (!host) {
979 			mtx_unlock(&nlm_global_lock);
980 			return (NULL);
981 		}
982 	}
983 	refcount_acquire(&host->nh_refs);
984 
985 	host->nh_idle_timeout = time_uptime + NLM_IDLE_TIMEOUT;
986 
987 	/*
988 	 * If we have an address for the host, record it so that we
989 	 * can send async replies etc.
990 	 */
991 	if (addr) {
992 
993 		KASSERT(addr->sa_len < sizeof(struct sockaddr_storage),
994 		    ("Strange remote transport address length"));
995 
996 		/*
997 		 * If we have seen an address before and we currently
998 		 * have an RPC client handle, make sure the address is
999 		 * the same, otherwise discard the client handle.
1000 		 */
1001 		if (host->nh_addr.ss_len && host->nh_srvrpc.nr_client) {
1002 			if (!nlm_compare_addr(
1003 				    (struct sockaddr *) &host->nh_addr,
1004 				    addr)
1005 			    || host->nh_vers != vers) {
1006 				CLIENT *client;
1007 				mtx_lock(&host->nh_lock);
1008 				client = host->nh_srvrpc.nr_client;
1009 				host->nh_srvrpc.nr_client = NULL;
1010 				mtx_unlock(&host->nh_lock);
1011 				if (client) {
1012 					CLNT_RELEASE(client);
1013 				}
1014 			}
1015 		}
1016 		memcpy(&host->nh_addr, addr, addr->sa_len);
1017 		host->nh_vers = vers;
1018 	}
1019 
1020 	nlm_check_idle();
1021 
1022 	mtx_unlock(&nlm_global_lock);
1023 
1024 	return (host);
1025 }
1026 
1027 /*
1028  * Search for an existing NLM host that matches the given remote
1029  * address. If none is found, create a new host with the requested
1030  * address and remember 'vers' as the NLM protocol version to use for
1031  * that host.
1032  */
1033 struct nlm_host *
1034 nlm_find_host_by_addr(const struct sockaddr *addr, int vers)
1035 {
1036 	/*
1037 	 * Fake up a name using inet_ntop. This buffer is
1038 	 * large enough for an IPv6 address.
1039 	 */
1040 	char tmp[sizeof "ffff:ffff:ffff:ffff:ffff:ffff:255.255.255.255"];
1041 	struct nlm_host *host;
1042 
1043 	switch (addr->sa_family) {
1044 	case AF_INET:
1045 		inet_ntop(AF_INET,
1046 		    &((const struct sockaddr_in *) addr)->sin_addr,
1047 		    tmp, sizeof tmp);
1048 		break;
1049 #ifdef INET6
1050 	case AF_INET6:
1051 		inet_ntop(AF_INET6,
1052 		    &((const struct sockaddr_in6 *) addr)->sin6_addr,
1053 		    tmp, sizeof tmp);
1054 		break;
1055 #endif
1056 	default:
1057 		strlcpy(tmp, "<unknown>", sizeof(tmp));
1058 	}
1059 
1060 	mtx_lock(&nlm_global_lock);
1061 
1062 	/*
1063 	 * The remote host is determined by caller_name.
1064 	 */
1065 	TAILQ_FOREACH(host, &nlm_hosts, nh_link) {
1066 		if (nlm_compare_addr(addr,
1067 			(const struct sockaddr *) &host->nh_addr))
1068 			break;
1069 	}
1070 
1071 	if (!host) {
1072 		host = nlm_create_host(tmp);
1073 		if (!host) {
1074 			mtx_unlock(&nlm_global_lock);
1075 			return (NULL);
1076 		}
1077 		memcpy(&host->nh_addr, addr, addr->sa_len);
1078 		host->nh_vers = vers;
1079 	}
1080 	refcount_acquire(&host->nh_refs);
1081 
1082 	host->nh_idle_timeout = time_uptime + NLM_IDLE_TIMEOUT;
1083 
1084 	nlm_check_idle();
1085 
1086 	mtx_unlock(&nlm_global_lock);
1087 
1088 	return (host);
1089 }
1090 
1091 /*
1092  * Find the NLM host that matches the value of 'sysid'. If none
1093  * exists, return NULL.
1094  */
1095 static struct nlm_host *
1096 nlm_find_host_by_sysid(int sysid)
1097 {
1098 	struct nlm_host *host;
1099 
1100 	TAILQ_FOREACH(host, &nlm_hosts, nh_link) {
1101 		if (host->nh_sysid == sysid) {
1102 			refcount_acquire(&host->nh_refs);
1103 			return (host);
1104 		}
1105 	}
1106 
1107 	return (NULL);
1108 }
1109 
1110 void nlm_host_release(struct nlm_host *host)
1111 {
1112 	if (refcount_release(&host->nh_refs)) {
1113 		/*
1114 		 * Free the host
1115 		 */
1116 		nlm_host_destroy(host);
1117 	}
1118 }
1119 
1120 /*
1121  * Unregister this NLM host with the local NSM due to idleness.
1122  */
1123 static void
1124 nlm_host_unmonitor(struct nlm_host *host)
1125 {
1126 	mon_id smmonid;
1127 	sm_stat_res smstat;
1128 	struct timeval timo;
1129 	enum clnt_stat stat;
1130 
1131 	NLM_DEBUG(1, "NLM: unmonitoring %s (sysid %d)\n",
1132 	    host->nh_caller_name, host->nh_sysid);
1133 
1134 	/*
1135 	 * We put our assigned system ID value in the priv field to
1136 	 * make it simpler to find the host if we are notified of a
1137 	 * host restart.
1138 	 */
1139 	smmonid.mon_name = host->nh_caller_name;
1140 	smmonid.my_id.my_name = "localhost";
1141 	smmonid.my_id.my_prog = NLM_PROG;
1142 	smmonid.my_id.my_vers = NLM_SM;
1143 	smmonid.my_id.my_proc = NLM_SM_NOTIFY;
1144 
1145 	timo.tv_sec = 25;
1146 	timo.tv_usec = 0;
1147 	stat = CLNT_CALL(nlm_nsm, SM_UNMON,
1148 	    (xdrproc_t) xdr_mon, &smmonid,
1149 	    (xdrproc_t) xdr_sm_stat, &smstat, timo);
1150 
1151 	if (stat != RPC_SUCCESS) {
1152 		NLM_ERR("Failed to contact local NSM - rpc error %d\n", stat);
1153 		return;
1154 	}
1155 	if (smstat.res_stat == stat_fail) {
1156 		NLM_ERR("Local NSM refuses to unmonitor %s\n",
1157 		    host->nh_caller_name);
1158 		return;
1159 	}
1160 
1161 	host->nh_monstate = NLM_UNMONITORED;
1162 }
1163 
1164 /*
1165  * Register this NLM host with the local NSM so that we can be
1166  * notified if it reboots.
1167  */
1168 void
1169 nlm_host_monitor(struct nlm_host *host, int state)
1170 {
1171 	mon smmon;
1172 	sm_stat_res smstat;
1173 	struct timeval timo;
1174 	enum clnt_stat stat;
1175 
1176 	if (state && !host->nh_state) {
1177 		/*
1178 		 * This is the first time we have seen an NSM state
1179 		 * value for this host. We record it here to help
1180 		 * detect host reboots.
1181 		 */
1182 		host->nh_state = state;
1183 		NLM_DEBUG(1, "NLM: host %s (sysid %d) has NSM state %d\n",
1184 		    host->nh_caller_name, host->nh_sysid, state);
1185 	}
1186 
1187 	mtx_lock(&host->nh_lock);
1188 	if (host->nh_monstate != NLM_UNMONITORED) {
1189 		mtx_unlock(&host->nh_lock);
1190 		return;
1191 	}
1192 	host->nh_monstate = NLM_MONITORED;
1193 	mtx_unlock(&host->nh_lock);
1194 
1195 	NLM_DEBUG(1, "NLM: monitoring %s (sysid %d)\n",
1196 	    host->nh_caller_name, host->nh_sysid);
1197 
1198 	/*
1199 	 * We put our assigned system ID value in the priv field to
1200 	 * make it simpler to find the host if we are notified of a
1201 	 * host restart.
1202 	 */
1203 	smmon.mon_id.mon_name = host->nh_caller_name;
1204 	smmon.mon_id.my_id.my_name = "localhost";
1205 	smmon.mon_id.my_id.my_prog = NLM_PROG;
1206 	smmon.mon_id.my_id.my_vers = NLM_SM;
1207 	smmon.mon_id.my_id.my_proc = NLM_SM_NOTIFY;
1208 	memcpy(smmon.priv, &host->nh_sysid, sizeof(host->nh_sysid));
1209 
1210 	timo.tv_sec = 25;
1211 	timo.tv_usec = 0;
1212 	stat = CLNT_CALL(nlm_nsm, SM_MON,
1213 	    (xdrproc_t) xdr_mon, &smmon,
1214 	    (xdrproc_t) xdr_sm_stat, &smstat, timo);
1215 
1216 	if (stat != RPC_SUCCESS) {
1217 		NLM_ERR("Failed to contact local NSM - rpc error %d\n", stat);
1218 		return;
1219 	}
1220 	if (smstat.res_stat == stat_fail) {
1221 		NLM_ERR("Local NSM refuses to monitor %s\n",
1222 		    host->nh_caller_name);
1223 		mtx_lock(&host->nh_lock);
1224 		host->nh_monstate = NLM_MONITOR_FAILED;
1225 		mtx_unlock(&host->nh_lock);
1226 		return;
1227 	}
1228 
1229 	host->nh_monstate = NLM_MONITORED;
1230 }
1231 
1232 /*
1233  * Return an RPC client handle that can be used to talk to the NLM
1234  * running on the given host.
1235  */
1236 CLIENT *
1237 nlm_host_get_rpc(struct nlm_host *host, bool_t isserver)
1238 {
1239 	struct nlm_rpc *rpc;
1240 	CLIENT *client;
1241 
1242 	mtx_lock(&host->nh_lock);
1243 
1244 	if (isserver)
1245 		rpc = &host->nh_srvrpc;
1246 	else
1247 		rpc = &host->nh_clntrpc;
1248 
1249 	/*
1250 	 * We can't hold onto RPC handles for too long - the async
1251 	 * call/reply protocol used by some NLM clients makes it hard
1252 	 * to tell when they change port numbers (e.g. after a
1253 	 * reboot). Note that if a client reboots while it isn't
1254 	 * holding any locks, it won't bother to notify us. We
1255 	 * expire the RPC handles after two minutes.
1256 	 */
1257 	if (rpc->nr_client && time_uptime > rpc->nr_create_time + 2*60) {
1258 		client = rpc->nr_client;
1259 		rpc->nr_client = NULL;
1260 		mtx_unlock(&host->nh_lock);
1261 		CLNT_RELEASE(client);
1262 		mtx_lock(&host->nh_lock);
1263 	}
1264 
1265 	if (!rpc->nr_client) {
1266 		mtx_unlock(&host->nh_lock);
1267 		client = nlm_get_rpc((struct sockaddr *)&host->nh_addr,
1268 		    NLM_PROG, host->nh_vers);
1269 		mtx_lock(&host->nh_lock);
1270 
1271 		if (client) {
1272 			if (rpc->nr_client) {
1273 				mtx_unlock(&host->nh_lock);
1274 				CLNT_DESTROY(client);
1275 				mtx_lock(&host->nh_lock);
1276 			} else {
1277 				rpc->nr_client = client;
1278 				rpc->nr_create_time = time_uptime;
1279 			}
1280 		}
1281 	}
1282 
1283 	client = rpc->nr_client;
1284 	if (client)
1285 		CLNT_ACQUIRE(client);
1286 	mtx_unlock(&host->nh_lock);
1287 
1288 	return (client);
1289 
1290 }
1291 
1292 int nlm_host_get_sysid(struct nlm_host *host)
1293 {
1294 
1295 	return (host->nh_sysid);
1296 }
1297 
1298 int
1299 nlm_host_get_state(struct nlm_host *host)
1300 {
1301 
1302 	return (host->nh_state);
1303 }
1304 
1305 void *
1306 nlm_register_wait_lock(struct nlm4_lock *lock, struct vnode *vp)
1307 {
1308 	struct nlm_waiting_lock *nw;
1309 
1310 	nw = malloc(sizeof(struct nlm_waiting_lock), M_NLM, M_WAITOK);
1311 	nw->nw_lock = *lock;
1312 	memcpy(&nw->nw_fh.fh_bytes, nw->nw_lock.fh.n_bytes,
1313 	    nw->nw_lock.fh.n_len);
1314 	nw->nw_lock.fh.n_bytes = nw->nw_fh.fh_bytes;
1315 	nw->nw_waiting = TRUE;
1316 	nw->nw_vp = vp;
1317 	mtx_lock(&nlm_global_lock);
1318 	TAILQ_INSERT_TAIL(&nlm_waiting_locks, nw, nw_link);
1319 	mtx_unlock(&nlm_global_lock);
1320 
1321 	return nw;
1322 }
1323 
1324 void
1325 nlm_deregister_wait_lock(void *handle)
1326 {
1327 	struct nlm_waiting_lock *nw = handle;
1328 
1329 	mtx_lock(&nlm_global_lock);
1330 	TAILQ_REMOVE(&nlm_waiting_locks, nw, nw_link);
1331 	mtx_unlock(&nlm_global_lock);
1332 
1333 	free(nw, M_NLM);
1334 }
1335 
1336 int
1337 nlm_wait_lock(void *handle, int timo)
1338 {
1339 	struct nlm_waiting_lock *nw = handle;
1340 	int error, stops_deferred;
1341 
1342 	/*
1343 	 * If the granted message arrived before we got here,
1344 	 * nw->nw_waiting will be FALSE - in that case, don't sleep.
1345 	 */
1346 	mtx_lock(&nlm_global_lock);
1347 	error = 0;
1348 	if (nw->nw_waiting) {
1349 		stops_deferred = sigdeferstop(SIGDEFERSTOP_ERESTART);
1350 		error = msleep(nw, &nlm_global_lock, PCATCH, "nlmlock", timo);
1351 		sigallowstop(stops_deferred);
1352 	}
1353 	TAILQ_REMOVE(&nlm_waiting_locks, nw, nw_link);
1354 	if (error) {
1355 		/*
1356 		 * The granted message may arrive after the
1357 		 * interrupt/timeout but before we manage to lock the
1358 		 * mutex. Detect this by examining nw_lock.
1359 		 */
1360 		if (!nw->nw_waiting)
1361 			error = 0;
1362 	} else {
1363 		/*
1364 		 * If nlm_cancel_wait is called, then error will be
1365 		 * zero but nw_waiting will still be TRUE. We
1366 		 * translate this into EINTR.
1367 		 */
1368 		if (nw->nw_waiting)
1369 			error = EINTR;
1370 	}
1371 	mtx_unlock(&nlm_global_lock);
1372 
1373 	free(nw, M_NLM);
1374 
1375 	return (error);
1376 }
1377 
1378 void
1379 nlm_cancel_wait(struct vnode *vp)
1380 {
1381 	struct nlm_waiting_lock *nw;
1382 
1383 	mtx_lock(&nlm_global_lock);
1384 	TAILQ_FOREACH(nw, &nlm_waiting_locks, nw_link) {
1385 		if (nw->nw_vp == vp) {
1386 			wakeup(nw);
1387 		}
1388 	}
1389 	mtx_unlock(&nlm_global_lock);
1390 }
1391 
1392 /**********************************************************************/
1393 
1394 /*
1395  * Syscall interface with userland.
1396  */
1397 
1398 extern void nlm_prog_0(struct svc_req *rqstp, SVCXPRT *transp);
1399 extern void nlm_prog_1(struct svc_req *rqstp, SVCXPRT *transp);
1400 extern void nlm_prog_3(struct svc_req *rqstp, SVCXPRT *transp);
1401 extern void nlm_prog_4(struct svc_req *rqstp, SVCXPRT *transp);
1402 
1403 static int
1404 nlm_register_services(SVCPOOL *pool, int addr_count, char **addrs)
1405 {
1406 	static rpcvers_t versions[] = {
1407 		NLM_SM, NLM_VERS, NLM_VERSX, NLM_VERS4
1408 	};
1409 	static void (*dispatchers[])(struct svc_req *, SVCXPRT *) = {
1410 		nlm_prog_0, nlm_prog_1, nlm_prog_3, nlm_prog_4
1411 	};
1412 
1413 	SVCXPRT **xprts;
1414 	char netid[16];
1415 	char uaddr[128];
1416 	struct netconfig *nconf;
1417 	int i, j, error;
1418 
1419 	if (!addr_count) {
1420 		NLM_ERR("NLM: no service addresses given - can't start server");
1421 		return (EINVAL);
1422 	}
1423 
1424 	if (addr_count < 0 || addr_count > 256 ) {
1425 		NLM_ERR("NLM:  too many service addresses (%d) given, "
1426 		    "max 256 - can't start server\n", addr_count);
1427 		return (EINVAL);
1428 	}
1429 
1430 	xprts = malloc(addr_count * sizeof(SVCXPRT *), M_NLM, M_WAITOK|M_ZERO);
1431 	for (i = 0; i < nitems(versions); i++) {
1432 		for (j = 0; j < addr_count; j++) {
1433 			/*
1434 			 * Create transports for the first version and
1435 			 * then just register everything else to the
1436 			 * same transports.
1437 			 */
1438 			if (i == 0) {
1439 				char *up;
1440 
1441 				error = copyin(&addrs[2*j], &up,
1442 				    sizeof(char*));
1443 				if (error)
1444 					goto out;
1445 				error = copyinstr(up, netid, sizeof(netid),
1446 				    NULL);
1447 				if (error)
1448 					goto out;
1449 				error = copyin(&addrs[2*j+1], &up,
1450 				    sizeof(char*));
1451 				if (error)
1452 					goto out;
1453 				error = copyinstr(up, uaddr, sizeof(uaddr),
1454 				    NULL);
1455 				if (error)
1456 					goto out;
1457 				nconf = getnetconfigent(netid);
1458 				if (!nconf) {
1459 					NLM_ERR("Can't lookup netid %s\n",
1460 					    netid);
1461 					error = EINVAL;
1462 					goto out;
1463 				}
1464 				xprts[j] = svc_tp_create(pool, dispatchers[i],
1465 				    NLM_PROG, versions[i], uaddr, nconf);
1466 				if (!xprts[j]) {
1467 					NLM_ERR("NLM: unable to create "
1468 					    "(NLM_PROG, %d).\n", versions[i]);
1469 					error = EINVAL;
1470 					goto out;
1471 				}
1472 				freenetconfigent(nconf);
1473 			} else {
1474 				nconf = getnetconfigent(xprts[j]->xp_netid);
1475 				rpcb_unset(NLM_PROG, versions[i], nconf);
1476 				if (!svc_reg(xprts[j], NLM_PROG, versions[i],
1477 					dispatchers[i], nconf)) {
1478 					NLM_ERR("NLM: can't register "
1479 					    "(NLM_PROG, %d)\n", versions[i]);
1480 					error = EINVAL;
1481 					goto out;
1482 				}
1483 			}
1484 		}
1485 	}
1486 	error = 0;
1487 out:
1488 	for (j = 0; j < addr_count; j++) {
1489 		if (xprts[j])
1490 			SVC_RELEASE(xprts[j]);
1491 	}
1492 	free(xprts, M_NLM);
1493 	return (error);
1494 }
1495 
1496 /*
1497  * Main server entry point. Contacts the local NSM to get its current
1498  * state and send SM_UNMON_ALL. Registers the NLM services and then
1499  * services requests. Does not return until the server is interrupted
1500  * by a signal.
1501  */
1502 static int
1503 nlm_server_main(int addr_count, char **addrs)
1504 {
1505 	struct thread *td = curthread;
1506 	int error;
1507 	SVCPOOL *pool = NULL;
1508 	struct sockopt opt;
1509 	int portlow;
1510 #ifdef INET6
1511 	struct sockaddr_in6 sin6;
1512 #endif
1513 	struct sockaddr_in sin;
1514 	my_id id;
1515 	sm_stat smstat;
1516 	struct timeval timo;
1517 	enum clnt_stat stat;
1518 	struct nlm_host *host, *nhost;
1519 	struct nlm_waiting_lock *nw;
1520 	vop_advlock_t *old_nfs_advlock;
1521 	vop_reclaim_t *old_nfs_reclaim;
1522 
1523 	if (nlm_is_running != 0) {
1524 		NLM_ERR("NLM: can't start server - "
1525 		    "it appears to be running already\n");
1526 		return (EPERM);
1527 	}
1528 
1529 	if (nlm_socket == NULL) {
1530 		memset(&opt, 0, sizeof(opt));
1531 
1532 		error = socreate(AF_INET, &nlm_socket, SOCK_DGRAM, 0,
1533 		    td->td_ucred, td);
1534 		if (error) {
1535 			NLM_ERR("NLM: can't create IPv4 socket - error %d\n",
1536 			    error);
1537 			return (error);
1538 		}
1539 		opt.sopt_dir = SOPT_SET;
1540 		opt.sopt_level = IPPROTO_IP;
1541 		opt.sopt_name = IP_PORTRANGE;
1542 		portlow = IP_PORTRANGE_LOW;
1543 		opt.sopt_val = &portlow;
1544 		opt.sopt_valsize = sizeof(portlow);
1545 		sosetopt(nlm_socket, &opt);
1546 
1547 #ifdef INET6
1548 		nlm_socket6 = NULL;
1549 		error = socreate(AF_INET6, &nlm_socket6, SOCK_DGRAM, 0,
1550 		    td->td_ucred, td);
1551 		if (error) {
1552 			NLM_ERR("NLM: can't create IPv6 socket - error %d\n",
1553 			    error);
1554 			soclose(nlm_socket);
1555 			nlm_socket = NULL;
1556 			return (error);
1557 		}
1558 		opt.sopt_dir = SOPT_SET;
1559 		opt.sopt_level = IPPROTO_IPV6;
1560 		opt.sopt_name = IPV6_PORTRANGE;
1561 		portlow = IPV6_PORTRANGE_LOW;
1562 		opt.sopt_val = &portlow;
1563 		opt.sopt_valsize = sizeof(portlow);
1564 		sosetopt(nlm_socket6, &opt);
1565 #endif
1566 	}
1567 
1568 	nlm_auth = authunix_create(curthread->td_ucred);
1569 
1570 #ifdef INET6
1571 	memset(&sin6, 0, sizeof(sin6));
1572 	sin6.sin6_len = sizeof(sin6);
1573 	sin6.sin6_family = AF_INET6;
1574 	sin6.sin6_addr = in6addr_loopback;
1575 	nlm_nsm = nlm_get_rpc((struct sockaddr *) &sin6, SM_PROG, SM_VERS);
1576 	if (!nlm_nsm) {
1577 #endif
1578 		memset(&sin, 0, sizeof(sin));
1579 		sin.sin_len = sizeof(sin);
1580 		sin.sin_family = AF_INET;
1581 		sin.sin_addr.s_addr = htonl(INADDR_LOOPBACK);
1582 		nlm_nsm = nlm_get_rpc((struct sockaddr *) &sin, SM_PROG,
1583 		    SM_VERS);
1584 #ifdef INET6
1585 	}
1586 #endif
1587 
1588 	if (!nlm_nsm) {
1589 		NLM_ERR("Can't start NLM - unable to contact NSM\n");
1590 		error = EINVAL;
1591 		goto out;
1592 	}
1593 
1594 	pool = svcpool_create("NLM", NULL);
1595 
1596 	error = nlm_register_services(pool, addr_count, addrs);
1597 	if (error)
1598 		goto out;
1599 
1600 	memset(&id, 0, sizeof(id));
1601 	id.my_name = "NFS NLM";
1602 
1603 	timo.tv_sec = 25;
1604 	timo.tv_usec = 0;
1605 	stat = CLNT_CALL(nlm_nsm, SM_UNMON_ALL,
1606 	    (xdrproc_t) xdr_my_id, &id,
1607 	    (xdrproc_t) xdr_sm_stat, &smstat, timo);
1608 
1609 	if (stat != RPC_SUCCESS) {
1610 		struct rpc_err err;
1611 
1612 		CLNT_GETERR(nlm_nsm, &err);
1613 		NLM_ERR("NLM: unexpected error contacting NSM, "
1614 		    "stat=%d, errno=%d\n", stat, err.re_errno);
1615 		error = EINVAL;
1616 		goto out;
1617 	}
1618 	nlm_is_running = 1;
1619 
1620 	NLM_DEBUG(1, "NLM: local NSM state is %d\n", smstat.state);
1621 	nlm_nsm_state = smstat.state;
1622 
1623 	old_nfs_advlock = nfs_advlock_p;
1624 	nfs_advlock_p = nlm_advlock;
1625 	old_nfs_reclaim = nfs_reclaim_p;
1626 	nfs_reclaim_p = nlm_reclaim;
1627 
1628 	svc_run(pool);
1629 	error = 0;
1630 
1631 	nfs_advlock_p = old_nfs_advlock;
1632 	nfs_reclaim_p = old_nfs_reclaim;
1633 
1634 out:
1635 	nlm_is_running = 0;
1636 	if (pool)
1637 		svcpool_destroy(pool);
1638 
1639 	/*
1640 	 * We are finished communicating with the NSM.
1641 	 */
1642 	if (nlm_nsm) {
1643 		CLNT_RELEASE(nlm_nsm);
1644 		nlm_nsm = NULL;
1645 	}
1646 
1647 	/*
1648 	 * Trash all the existing state so that if the server
1649 	 * restarts, it gets a clean slate. This is complicated by the
1650 	 * possibility that there may be other threads trying to make
1651 	 * client locking requests.
1652 	 *
1653 	 * First we fake a client reboot notification which will
1654 	 * cancel any pending async locks and purge remote lock state
1655 	 * from the local lock manager. We release the reference from
1656 	 * nlm_hosts to the host (which may remove it from the list
1657 	 * and free it). After this phase, the only entries in the
1658 	 * nlm_host list should be from other threads performing
1659 	 * client lock requests.
1660 	 */
1661 	mtx_lock(&nlm_global_lock);
1662 	TAILQ_FOREACH(nw, &nlm_waiting_locks, nw_link) {
1663 		wakeup(nw);
1664 	}
1665 	TAILQ_FOREACH_SAFE(host, &nlm_hosts, nh_link, nhost) {
1666 		mtx_unlock(&nlm_global_lock);
1667 		nlm_host_notify(host, 0);
1668 		nlm_host_release(host);
1669 		mtx_lock(&nlm_global_lock);
1670 	}
1671 	mtx_unlock(&nlm_global_lock);
1672 
1673 	AUTH_DESTROY(nlm_auth);
1674 
1675 	return (error);
1676 }
1677 
1678 int
1679 sys_nlm_syscall(struct thread *td, struct nlm_syscall_args *uap)
1680 {
1681 	int error;
1682 
1683 	error = priv_check(td, PRIV_NFS_LOCKD);
1684 	if (error)
1685 		return (error);
1686 
1687 	nlm_debug_level = uap->debug_level;
1688 	nlm_grace_threshold = time_uptime + uap->grace_period;
1689 	nlm_next_idle_check = time_uptime + NLM_IDLE_PERIOD;
1690 
1691 	return nlm_server_main(uap->addr_count, uap->addrs);
1692 }
1693 
1694 /**********************************************************************/
1695 
1696 /*
1697  * NLM implementation details, called from the RPC stubs.
1698  */
1699 
1700 void
1701 nlm_sm_notify(struct nlm_sm_status *argp)
1702 {
1703 	uint32_t sysid;
1704 	struct nlm_host *host;
1705 
1706 	NLM_DEBUG(3, "nlm_sm_notify(): mon_name = %s\n", argp->mon_name);
1707 	memcpy(&sysid, &argp->priv, sizeof(sysid));
1708 	host = nlm_find_host_by_sysid(sysid);
1709 	if (host) {
1710 		nlm_host_notify(host, argp->state);
1711 		nlm_host_release(host);
1712 	}
1713 }
1714 
1715 static void
1716 nlm_convert_to_fhandle_t(fhandle_t *fhp, struct netobj *p)
1717 {
1718 	memcpy(fhp, p->n_bytes, sizeof(fhandle_t));
1719 }
1720 
1721 struct vfs_state {
1722 	struct mount	*vs_mp;
1723 	struct vnode	*vs_vp;
1724 	int		vs_vnlocked;
1725 };
1726 
1727 static int
1728 nlm_get_vfs_state(struct nlm_host *host, struct svc_req *rqstp,
1729     fhandle_t *fhp, struct vfs_state *vs, accmode_t accmode)
1730 {
1731 	int error;
1732 	uint64_t exflags;
1733 	struct ucred *cred = NULL, *credanon = NULL;
1734 
1735 	memset(vs, 0, sizeof(*vs));
1736 
1737 	vs->vs_mp = vfs_getvfs(&fhp->fh_fsid);
1738 	if (!vs->vs_mp) {
1739 		return (ESTALE);
1740 	}
1741 
1742 	/* accmode == 0 means don't check, since it is an unlock. */
1743 	if (accmode != 0) {
1744 		error = VFS_CHECKEXP(vs->vs_mp,
1745 		    (struct sockaddr *)&host->nh_addr, &exflags, &credanon,
1746 		    NULL, NULL);
1747 		if (error)
1748 			goto out;
1749 
1750 		if (exflags & MNT_EXRDONLY ||
1751 		    (vs->vs_mp->mnt_flag & MNT_RDONLY)) {
1752 			error = EROFS;
1753 			goto out;
1754 		}
1755 	}
1756 
1757 	error = VFS_FHTOVP(vs->vs_mp, &fhp->fh_fid, LK_EXCLUSIVE, &vs->vs_vp);
1758 	if (error)
1759 		goto out;
1760 	vs->vs_vnlocked = TRUE;
1761 
1762 	if (accmode != 0) {
1763 		if (!svc_getcred(rqstp, &cred, NULL)) {
1764 			error = EINVAL;
1765 			goto out;
1766 		}
1767 		if (cred->cr_uid == 0 || (exflags & MNT_EXPORTANON)) {
1768 			crfree(cred);
1769 			cred = credanon;
1770 			credanon = NULL;
1771 		}
1772 
1773 		/*
1774 		 * Check cred.
1775 		 */
1776 		error = VOP_ACCESS(vs->vs_vp, accmode, cred, curthread);
1777 		/*
1778 		 * If this failed and accmode != VWRITE, try again with
1779 		 * VWRITE to maintain backwards compatibility with the
1780 		 * old code that always used VWRITE.
1781 		 */
1782 		if (error != 0 && accmode != VWRITE)
1783 			error = VOP_ACCESS(vs->vs_vp, VWRITE, cred, curthread);
1784 		if (error)
1785 			goto out;
1786 	}
1787 
1788 	VOP_UNLOCK(vs->vs_vp);
1789 	vs->vs_vnlocked = FALSE;
1790 
1791 out:
1792 	if (cred)
1793 		crfree(cred);
1794 	if (credanon)
1795 		crfree(credanon);
1796 
1797 	return (error);
1798 }
1799 
1800 static void
1801 nlm_release_vfs_state(struct vfs_state *vs)
1802 {
1803 
1804 	if (vs->vs_vp) {
1805 		if (vs->vs_vnlocked)
1806 			vput(vs->vs_vp);
1807 		else
1808 			vrele(vs->vs_vp);
1809 	}
1810 	if (vs->vs_mp)
1811 		vfs_rel(vs->vs_mp);
1812 }
1813 
1814 static nlm4_stats
1815 nlm_convert_error(int error)
1816 {
1817 
1818 	if (error == ESTALE)
1819 		return nlm4_stale_fh;
1820 	else if (error == EROFS)
1821 		return nlm4_rofs;
1822 	else
1823 		return nlm4_failed;
1824 }
1825 
1826 int
1827 nlm_do_test(nlm4_testargs *argp, nlm4_testres *result, struct svc_req *rqstp,
1828 	CLIENT **rpcp)
1829 {
1830 	fhandle_t fh;
1831 	struct vfs_state vs;
1832 	struct nlm_host *host, *bhost;
1833 	int error, sysid;
1834 	struct flock fl;
1835 	accmode_t accmode;
1836 
1837 	memset(result, 0, sizeof(*result));
1838 	memset(&vs, 0, sizeof(vs));
1839 
1840 	host = nlm_find_host_by_name(argp->alock.caller_name,
1841 	    svc_getrpccaller(rqstp), rqstp->rq_vers);
1842 	if (!host) {
1843 		result->stat.stat = nlm4_denied_nolocks;
1844 		return (ENOMEM);
1845 	}
1846 
1847 	NLM_DEBUG(3, "nlm_do_test(): caller_name = %s (sysid = %d)\n",
1848 	    host->nh_caller_name, host->nh_sysid);
1849 
1850 	nlm_check_expired_locks(host);
1851 	sysid = host->nh_sysid;
1852 
1853 	nlm_convert_to_fhandle_t(&fh, &argp->alock.fh);
1854 	nlm_copy_netobj(&result->cookie, &argp->cookie, M_RPC);
1855 
1856 	if (time_uptime < nlm_grace_threshold) {
1857 		result->stat.stat = nlm4_denied_grace_period;
1858 		goto out;
1859 	}
1860 
1861 	accmode = argp->exclusive ? VWRITE : VREAD;
1862 	error = nlm_get_vfs_state(host, rqstp, &fh, &vs, accmode);
1863 	if (error) {
1864 		result->stat.stat = nlm_convert_error(error);
1865 		goto out;
1866 	}
1867 
1868 	fl.l_start = argp->alock.l_offset;
1869 	fl.l_len = argp->alock.l_len;
1870 	fl.l_pid = argp->alock.svid;
1871 	fl.l_sysid = sysid;
1872 	fl.l_whence = SEEK_SET;
1873 	if (argp->exclusive)
1874 		fl.l_type = F_WRLCK;
1875 	else
1876 		fl.l_type = F_RDLCK;
1877 	error = VOP_ADVLOCK(vs.vs_vp, NULL, F_GETLK, &fl, F_REMOTE);
1878 	if (error) {
1879 		result->stat.stat = nlm4_failed;
1880 		goto out;
1881 	}
1882 
1883 	if (fl.l_type == F_UNLCK) {
1884 		result->stat.stat = nlm4_granted;
1885 	} else {
1886 		result->stat.stat = nlm4_denied;
1887 		result->stat.nlm4_testrply_u.holder.exclusive =
1888 			(fl.l_type == F_WRLCK);
1889 		result->stat.nlm4_testrply_u.holder.svid = fl.l_pid;
1890 		bhost = nlm_find_host_by_sysid(fl.l_sysid);
1891 		if (bhost) {
1892 			/*
1893 			 * We don't have any useful way of recording
1894 			 * the value of oh used in the original lock
1895 			 * request. Ideally, the test reply would have
1896 			 * a space for the owning host's name allowing
1897 			 * our caller's NLM to keep track.
1898 			 *
1899 			 * As far as I can see, Solaris uses an eight
1900 			 * byte structure for oh which contains a four
1901 			 * byte pid encoded in local byte order and
1902 			 * the first four bytes of the host
1903 			 * name. Linux uses a variable length string
1904 			 * 'pid@hostname' in ascii but doesn't even
1905 			 * return that in test replies.
1906 			 *
1907 			 * For the moment, return nothing in oh
1908 			 * (already zero'ed above).
1909 			 */
1910 			nlm_host_release(bhost);
1911 		}
1912 		result->stat.nlm4_testrply_u.holder.l_offset = fl.l_start;
1913 		result->stat.nlm4_testrply_u.holder.l_len = fl.l_len;
1914 	}
1915 
1916 out:
1917 	nlm_release_vfs_state(&vs);
1918 	if (rpcp)
1919 		*rpcp = nlm_host_get_rpc(host, TRUE);
1920 	nlm_host_release(host);
1921 	return (0);
1922 }
1923 
1924 int
1925 nlm_do_lock(nlm4_lockargs *argp, nlm4_res *result, struct svc_req *rqstp,
1926     bool_t monitor, CLIENT **rpcp)
1927 {
1928 	fhandle_t fh;
1929 	struct vfs_state vs;
1930 	struct nlm_host *host;
1931 	int error, sysid;
1932 	struct flock fl;
1933 	accmode_t accmode;
1934 
1935 	memset(result, 0, sizeof(*result));
1936 	memset(&vs, 0, sizeof(vs));
1937 
1938 	host = nlm_find_host_by_name(argp->alock.caller_name,
1939 	    svc_getrpccaller(rqstp), rqstp->rq_vers);
1940 	if (!host) {
1941 		result->stat.stat = nlm4_denied_nolocks;
1942 		return (ENOMEM);
1943 	}
1944 
1945 	NLM_DEBUG(3, "nlm_do_lock(): caller_name = %s (sysid = %d)\n",
1946 	    host->nh_caller_name, host->nh_sysid);
1947 
1948 	if (monitor && host->nh_state && argp->state
1949 	    && host->nh_state != argp->state) {
1950 		/*
1951 		 * The host rebooted without telling us. Trash its
1952 		 * locks.
1953 		 */
1954 		nlm_host_notify(host, argp->state);
1955 	}
1956 
1957 	nlm_check_expired_locks(host);
1958 	sysid = host->nh_sysid;
1959 
1960 	nlm_convert_to_fhandle_t(&fh, &argp->alock.fh);
1961 	nlm_copy_netobj(&result->cookie, &argp->cookie, M_RPC);
1962 
1963 	if (time_uptime < nlm_grace_threshold && !argp->reclaim) {
1964 		result->stat.stat = nlm4_denied_grace_period;
1965 		goto out;
1966 	}
1967 
1968 	accmode = argp->exclusive ? VWRITE : VREAD;
1969 	error = nlm_get_vfs_state(host, rqstp, &fh, &vs, accmode);
1970 	if (error) {
1971 		result->stat.stat = nlm_convert_error(error);
1972 		goto out;
1973 	}
1974 
1975 	fl.l_start = argp->alock.l_offset;
1976 	fl.l_len = argp->alock.l_len;
1977 	fl.l_pid = argp->alock.svid;
1978 	fl.l_sysid = sysid;
1979 	fl.l_whence = SEEK_SET;
1980 	if (argp->exclusive)
1981 		fl.l_type = F_WRLCK;
1982 	else
1983 		fl.l_type = F_RDLCK;
1984 	if (argp->block) {
1985 		struct nlm_async_lock *af;
1986 		CLIENT *client;
1987 		struct nlm_grantcookie cookie;
1988 
1989 		/*
1990 		 * First, make sure we can contact the host's NLM.
1991 		 */
1992 		client = nlm_host_get_rpc(host, TRUE);
1993 		if (!client) {
1994 			result->stat.stat = nlm4_failed;
1995 			goto out;
1996 		}
1997 
1998 		/*
1999 		 * First we need to check and see if there is an
2000 		 * existing blocked lock that matches. This could be a
2001 		 * badly behaved client or an RPC re-send. If we find
2002 		 * one, just return nlm4_blocked.
2003 		 */
2004 		mtx_lock(&host->nh_lock);
2005 		TAILQ_FOREACH(af, &host->nh_pending, af_link) {
2006 			if (af->af_fl.l_start == fl.l_start
2007 			    && af->af_fl.l_len == fl.l_len
2008 			    && af->af_fl.l_pid == fl.l_pid
2009 			    && af->af_fl.l_type == fl.l_type) {
2010 				break;
2011 			}
2012 		}
2013 		if (!af) {
2014 			cookie.ng_sysid = host->nh_sysid;
2015 			cookie.ng_cookie = host->nh_grantcookie++;
2016 		}
2017 		mtx_unlock(&host->nh_lock);
2018 		if (af) {
2019 			CLNT_RELEASE(client);
2020 			result->stat.stat = nlm4_blocked;
2021 			goto out;
2022 		}
2023 
2024 		af = malloc(sizeof(struct nlm_async_lock), M_NLM,
2025 		    M_WAITOK|M_ZERO);
2026 		TASK_INIT(&af->af_task, 0, nlm_lock_callback, af);
2027 		af->af_vp = vs.vs_vp;
2028 		af->af_fl = fl;
2029 		af->af_host = host;
2030 		af->af_rpc = client;
2031 		/*
2032 		 * We use M_RPC here so that we can xdr_free the thing
2033 		 * later.
2034 		 */
2035 		nlm_make_netobj(&af->af_granted.cookie,
2036 		    (caddr_t)&cookie, sizeof(cookie), M_RPC);
2037 		af->af_granted.exclusive = argp->exclusive;
2038 		af->af_granted.alock.caller_name =
2039 			strdup(argp->alock.caller_name, M_RPC);
2040 		nlm_copy_netobj(&af->af_granted.alock.fh,
2041 		    &argp->alock.fh, M_RPC);
2042 		nlm_copy_netobj(&af->af_granted.alock.oh,
2043 		    &argp->alock.oh, M_RPC);
2044 		af->af_granted.alock.svid = argp->alock.svid;
2045 		af->af_granted.alock.l_offset = argp->alock.l_offset;
2046 		af->af_granted.alock.l_len = argp->alock.l_len;
2047 
2048 		/*
2049 		 * Put the entry on the pending list before calling
2050 		 * VOP_ADVLOCKASYNC. We do this in case the lock
2051 		 * request was blocked (returning EINPROGRESS) but
2052 		 * then granted before we manage to run again. The
2053 		 * client may receive the granted message before we
2054 		 * send our blocked reply but thats their problem.
2055 		 */
2056 		mtx_lock(&host->nh_lock);
2057 		TAILQ_INSERT_TAIL(&host->nh_pending, af, af_link);
2058 		mtx_unlock(&host->nh_lock);
2059 
2060 		error = VOP_ADVLOCKASYNC(vs.vs_vp, NULL, F_SETLK, &fl, F_REMOTE,
2061 		    &af->af_task, &af->af_cookie);
2062 
2063 		/*
2064 		 * If the lock completed synchronously, just free the
2065 		 * tracking structure now.
2066 		 */
2067 		if (error != EINPROGRESS) {
2068 			CLNT_RELEASE(af->af_rpc);
2069 			mtx_lock(&host->nh_lock);
2070 			TAILQ_REMOVE(&host->nh_pending, af, af_link);
2071 			mtx_unlock(&host->nh_lock);
2072 			xdr_free((xdrproc_t) xdr_nlm4_testargs,
2073 			    &af->af_granted);
2074 			free(af, M_NLM);
2075 		} else {
2076 			NLM_DEBUG(2, "NLM: pending async lock %p for %s "
2077 			    "(sysid %d)\n", af, host->nh_caller_name, sysid);
2078 			/*
2079 			 * Don't vrele the vnode just yet - this must
2080 			 * wait until either the async callback
2081 			 * happens or the lock is cancelled.
2082 			 */
2083 			vs.vs_vp = NULL;
2084 		}
2085 	} else {
2086 		error = VOP_ADVLOCK(vs.vs_vp, NULL, F_SETLK, &fl, F_REMOTE);
2087 	}
2088 
2089 	if (error) {
2090 		if (error == EINPROGRESS) {
2091 			result->stat.stat = nlm4_blocked;
2092 		} else if (error == EDEADLK) {
2093 			result->stat.stat = nlm4_deadlck;
2094 		} else if (error == EAGAIN) {
2095 			result->stat.stat = nlm4_denied;
2096 		} else {
2097 			result->stat.stat = nlm4_failed;
2098 		}
2099 	} else {
2100 		if (monitor)
2101 			nlm_host_monitor(host, argp->state);
2102 		result->stat.stat = nlm4_granted;
2103 	}
2104 
2105 out:
2106 	nlm_release_vfs_state(&vs);
2107 	if (rpcp)
2108 		*rpcp = nlm_host_get_rpc(host, TRUE);
2109 	nlm_host_release(host);
2110 	return (0);
2111 }
2112 
2113 int
2114 nlm_do_cancel(nlm4_cancargs *argp, nlm4_res *result, struct svc_req *rqstp,
2115     CLIENT **rpcp)
2116 {
2117 	fhandle_t fh;
2118 	struct vfs_state vs;
2119 	struct nlm_host *host;
2120 	int error, sysid;
2121 	struct flock fl;
2122 	struct nlm_async_lock *af;
2123 
2124 	memset(result, 0, sizeof(*result));
2125 	memset(&vs, 0, sizeof(vs));
2126 
2127 	host = nlm_find_host_by_name(argp->alock.caller_name,
2128 	    svc_getrpccaller(rqstp), rqstp->rq_vers);
2129 	if (!host) {
2130 		result->stat.stat = nlm4_denied_nolocks;
2131 		return (ENOMEM);
2132 	}
2133 
2134 	NLM_DEBUG(3, "nlm_do_cancel(): caller_name = %s (sysid = %d)\n",
2135 	    host->nh_caller_name, host->nh_sysid);
2136 
2137 	nlm_check_expired_locks(host);
2138 	sysid = host->nh_sysid;
2139 
2140 	nlm_convert_to_fhandle_t(&fh, &argp->alock.fh);
2141 	nlm_copy_netobj(&result->cookie, &argp->cookie, M_RPC);
2142 
2143 	if (time_uptime < nlm_grace_threshold) {
2144 		result->stat.stat = nlm4_denied_grace_period;
2145 		goto out;
2146 	}
2147 
2148 	error = nlm_get_vfs_state(host, rqstp, &fh, &vs, (accmode_t)0);
2149 	if (error) {
2150 		result->stat.stat = nlm_convert_error(error);
2151 		goto out;
2152 	}
2153 
2154 	fl.l_start = argp->alock.l_offset;
2155 	fl.l_len = argp->alock.l_len;
2156 	fl.l_pid = argp->alock.svid;
2157 	fl.l_sysid = sysid;
2158 	fl.l_whence = SEEK_SET;
2159 	if (argp->exclusive)
2160 		fl.l_type = F_WRLCK;
2161 	else
2162 		fl.l_type = F_RDLCK;
2163 
2164 	/*
2165 	 * First we need to try and find the async lock request - if
2166 	 * there isn't one, we give up and return nlm4_denied.
2167 	 */
2168 	mtx_lock(&host->nh_lock);
2169 
2170 	TAILQ_FOREACH(af, &host->nh_pending, af_link) {
2171 		if (af->af_fl.l_start == fl.l_start
2172 		    && af->af_fl.l_len == fl.l_len
2173 		    && af->af_fl.l_pid == fl.l_pid
2174 		    && af->af_fl.l_type == fl.l_type) {
2175 			break;
2176 		}
2177 	}
2178 
2179 	if (!af) {
2180 		mtx_unlock(&host->nh_lock);
2181 		result->stat.stat = nlm4_denied;
2182 		goto out;
2183 	}
2184 
2185 	error = nlm_cancel_async_lock(af);
2186 
2187 	if (error) {
2188 		result->stat.stat = nlm4_denied;
2189 	} else {
2190 		result->stat.stat = nlm4_granted;
2191 	}
2192 
2193 	mtx_unlock(&host->nh_lock);
2194 
2195 out:
2196 	nlm_release_vfs_state(&vs);
2197 	if (rpcp)
2198 		*rpcp = nlm_host_get_rpc(host, TRUE);
2199 	nlm_host_release(host);
2200 	return (0);
2201 }
2202 
2203 int
2204 nlm_do_unlock(nlm4_unlockargs *argp, nlm4_res *result, struct svc_req *rqstp,
2205     CLIENT **rpcp)
2206 {
2207 	fhandle_t fh;
2208 	struct vfs_state vs;
2209 	struct nlm_host *host;
2210 	int error, sysid;
2211 	struct flock fl;
2212 
2213 	memset(result, 0, sizeof(*result));
2214 	memset(&vs, 0, sizeof(vs));
2215 
2216 	host = nlm_find_host_by_name(argp->alock.caller_name,
2217 	    svc_getrpccaller(rqstp), rqstp->rq_vers);
2218 	if (!host) {
2219 		result->stat.stat = nlm4_denied_nolocks;
2220 		return (ENOMEM);
2221 	}
2222 
2223 	NLM_DEBUG(3, "nlm_do_unlock(): caller_name = %s (sysid = %d)\n",
2224 	    host->nh_caller_name, host->nh_sysid);
2225 
2226 	nlm_check_expired_locks(host);
2227 	sysid = host->nh_sysid;
2228 
2229 	nlm_convert_to_fhandle_t(&fh, &argp->alock.fh);
2230 	nlm_copy_netobj(&result->cookie, &argp->cookie, M_RPC);
2231 
2232 	if (time_uptime < nlm_grace_threshold) {
2233 		result->stat.stat = nlm4_denied_grace_period;
2234 		goto out;
2235 	}
2236 
2237 	error = nlm_get_vfs_state(host, rqstp, &fh, &vs, (accmode_t)0);
2238 	if (error) {
2239 		result->stat.stat = nlm_convert_error(error);
2240 		goto out;
2241 	}
2242 
2243 	fl.l_start = argp->alock.l_offset;
2244 	fl.l_len = argp->alock.l_len;
2245 	fl.l_pid = argp->alock.svid;
2246 	fl.l_sysid = sysid;
2247 	fl.l_whence = SEEK_SET;
2248 	fl.l_type = F_UNLCK;
2249 	error = VOP_ADVLOCK(vs.vs_vp, NULL, F_UNLCK, &fl, F_REMOTE);
2250 
2251 	/*
2252 	 * Ignore the error - there is no result code for failure,
2253 	 * only for grace period.
2254 	 */
2255 	result->stat.stat = nlm4_granted;
2256 
2257 out:
2258 	nlm_release_vfs_state(&vs);
2259 	if (rpcp)
2260 		*rpcp = nlm_host_get_rpc(host, TRUE);
2261 	nlm_host_release(host);
2262 	return (0);
2263 }
2264 
2265 int
2266 nlm_do_granted(nlm4_testargs *argp, nlm4_res *result, struct svc_req *rqstp,
2267 
2268     CLIENT **rpcp)
2269 {
2270 	struct nlm_host *host;
2271 	struct nlm_waiting_lock *nw;
2272 
2273 	memset(result, 0, sizeof(*result));
2274 
2275 	host = nlm_find_host_by_addr(svc_getrpccaller(rqstp), rqstp->rq_vers);
2276 	if (!host) {
2277 		result->stat.stat = nlm4_denied_nolocks;
2278 		return (ENOMEM);
2279 	}
2280 
2281 	nlm_copy_netobj(&result->cookie, &argp->cookie, M_RPC);
2282 	result->stat.stat = nlm4_denied;
2283 	KFAIL_POINT_CODE(DEBUG_FP, nlm_deny_grant, goto out);
2284 
2285 	mtx_lock(&nlm_global_lock);
2286 	TAILQ_FOREACH(nw, &nlm_waiting_locks, nw_link) {
2287 		if (!nw->nw_waiting)
2288 			continue;
2289 		if (argp->alock.svid == nw->nw_lock.svid
2290 		    && argp->alock.l_offset == nw->nw_lock.l_offset
2291 		    && argp->alock.l_len == nw->nw_lock.l_len
2292 		    && argp->alock.fh.n_len == nw->nw_lock.fh.n_len
2293 		    && !memcmp(argp->alock.fh.n_bytes, nw->nw_lock.fh.n_bytes,
2294 			nw->nw_lock.fh.n_len)) {
2295 			nw->nw_waiting = FALSE;
2296 			wakeup(nw);
2297 			result->stat.stat = nlm4_granted;
2298 			break;
2299 		}
2300 	}
2301 	mtx_unlock(&nlm_global_lock);
2302 
2303 out:
2304 	if (rpcp)
2305 		*rpcp = nlm_host_get_rpc(host, TRUE);
2306 	nlm_host_release(host);
2307 	return (0);
2308 }
2309 
2310 void
2311 nlm_do_granted_res(nlm4_res *argp, struct svc_req *rqstp)
2312 {
2313 	struct nlm_host *host = NULL;
2314 	struct nlm_async_lock *af = NULL;
2315 	int error;
2316 
2317 	if (argp->cookie.n_len != sizeof(struct nlm_grantcookie)) {
2318 		NLM_DEBUG(1, "NLM: bogus grant cookie");
2319 		goto out;
2320 	}
2321 
2322 	host = nlm_find_host_by_sysid(ng_sysid(&argp->cookie));
2323 	if (!host) {
2324 		NLM_DEBUG(1, "NLM: Unknown host rejected our grant");
2325 		goto out;
2326 	}
2327 
2328 	mtx_lock(&host->nh_lock);
2329 	TAILQ_FOREACH(af, &host->nh_granted, af_link)
2330 	    if (ng_cookie(&argp->cookie) ==
2331 		ng_cookie(&af->af_granted.cookie))
2332 		    break;
2333 	if (af)
2334 		TAILQ_REMOVE(&host->nh_granted, af, af_link);
2335 	mtx_unlock(&host->nh_lock);
2336 
2337 	if (!af) {
2338 		NLM_DEBUG(1, "NLM: host %s (sysid %d) replied to our grant "
2339 		    "with unrecognized cookie %d:%d", host->nh_caller_name,
2340 		    host->nh_sysid, ng_sysid(&argp->cookie),
2341 		    ng_cookie(&argp->cookie));
2342 		goto out;
2343 	}
2344 
2345 	if (argp->stat.stat != nlm4_granted) {
2346 		af->af_fl.l_type = F_UNLCK;
2347 		error = VOP_ADVLOCK(af->af_vp, NULL, F_UNLCK, &af->af_fl, F_REMOTE);
2348 		if (error) {
2349 			NLM_DEBUG(1, "NLM: host %s (sysid %d) rejected our grant "
2350 			    "and we failed to unlock (%d)", host->nh_caller_name,
2351 			    host->nh_sysid, error);
2352 			goto out;
2353 		}
2354 
2355 		NLM_DEBUG(5, "NLM: async lock %p rejected by host %s (sysid %d)",
2356 		    af, host->nh_caller_name, host->nh_sysid);
2357 	} else {
2358 		NLM_DEBUG(5, "NLM: async lock %p accepted by host %s (sysid %d)",
2359 		    af, host->nh_caller_name, host->nh_sysid);
2360 	}
2361 
2362  out:
2363 	if (af)
2364 		nlm_free_async_lock(af);
2365 	if (host)
2366 		nlm_host_release(host);
2367 }
2368 
2369 void
2370 nlm_do_free_all(nlm4_notify *argp)
2371 {
2372 	struct nlm_host *host, *thost;
2373 
2374 	TAILQ_FOREACH_SAFE(host, &nlm_hosts, nh_link, thost) {
2375 		if (!strcmp(host->nh_caller_name, argp->name))
2376 			nlm_host_notify(host, argp->state);
2377 	}
2378 }
2379 
2380 /*
2381  * Kernel module glue
2382  */
2383 static int
2384 nfslockd_modevent(module_t mod, int type, void *data)
2385 {
2386 
2387 	switch (type) {
2388 	case MOD_LOAD:
2389 		return (nlm_init());
2390 
2391 	case MOD_UNLOAD:
2392 		nlm_uninit();
2393 		/* The NLM module cannot be safely unloaded. */
2394 		/* FALLTHROUGH */
2395 	default:
2396 		return (EOPNOTSUPP);
2397 	}
2398 }
2399 static moduledata_t nfslockd_mod = {
2400 	"nfslockd",
2401 	nfslockd_modevent,
2402 	NULL,
2403 };
2404 DECLARE_MODULE(nfslockd, nfslockd_mod, SI_SUB_VFS, SI_ORDER_ANY);
2405 
2406 /* So that loader and kldload(2) can find us, wherever we are.. */
2407 MODULE_DEPEND(nfslockd, xdr, 1, 1, 1);
2408 MODULE_DEPEND(nfslockd, krpc, 1, 1, 1);
2409 MODULE_DEPEND(nfslockd, nfscommon, 1, 1, 1);
2410 MODULE_VERSION(nfslockd, 1);
2411