xref: /illumos-gate/usr/src/uts/common/os/flock.c (revision 49b7860084dbba18bc00b29413d6182197f9fe93)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 /*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
28 /*	All Rights Reserved */
29 
30 /*
31  * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
32  * Copyright 2015 Joyent, Inc.
33  */
34 
35 #include <sys/flock_impl.h>
36 #include <sys/vfs.h>
37 #include <sys/t_lock.h>		/* for <sys/callb.h> */
38 #include <sys/callb.h>
39 #include <sys/clconf.h>
40 #include <sys/cladm.h>
41 #include <sys/nbmlock.h>
42 #include <sys/cred.h>
43 #include <sys/policy.h>
44 
/*
 * Allocation/free counters, kept purely for statistics.  They are updated
 * without holding any lock, so they may not be exact, but they should stay
 * close to the actual values.
 */

/* lock descriptors */
int	flk_lock_allocs;
int	flk_lock_frees;

/* lock graph edges */
int	edge_allocs;
int	edge_frees;

/* process graph vertices */
int	flk_proc_vertex_allocs;
int	flk_proc_vertex_frees;

/* process graph edges */
int	flk_proc_edge_allocs;
int	flk_proc_edge_frees;
59 
/*
 * Protects the per-zone flk_lockmgr_status value; flk_get_lockmgr_status()
 * asserts that it is held.
 */
static kmutex_t flock_lock;
61 
#ifdef DEBUG
/*
 * Debug-only consistency checks.  The list/owner checks run only when
 * check_debug is set non-zero; the state transition check always runs on
 * DEBUG kernels and panics on an illegal transition.
 *
 * NOTE: the three "if (check_debug)" macros expand to a bare if statement
 * that already ends in a semicolon, so they must only be used as complete
 * statements (never as the body of an unbraced if/else).
 */
int check_debug = 0;
#define	CHECK_ACTIVE_LOCKS(gp)	if (check_debug) \
					check_active_locks(gp);
#define	CHECK_SLEEPING_LOCKS(gp)	if (check_debug) \
						check_sleeping_locks(gp);
#define	CHECK_OWNER_LOCKS(gp, pid, sysid, vp) 	\
		if (check_debug)	\
			check_owner_locks(gp, pid, sysid, vp);
/*
 * The message is built from adjacent string literals.  Continuing a single
 * literal across lines with a backslash would embed the next line's
 * indentation in the panic message.
 */
#define	CHECK_LOCK_TRANSITION(old_state, new_state) \
	{ \
		if (check_lock_transition(old_state, new_state)) { \
			cmn_err(CE_PANIC, "Illegal lock transition " \
			    "from %d to %d", old_state, new_state); \
		} \
	}
#else

/* Non-DEBUG kernels: all checks compile away to nothing. */
#define	CHECK_ACTIVE_LOCKS(gp)
#define	CHECK_SLEEPING_LOCKS(gp)
#define	CHECK_OWNER_LOCKS(gp, pid, sysid, vp)
#define	CHECK_LOCK_TRANSITION(old_state, new_state)

#endif /* DEBUG */
86 
/* kmem cache for struct edge allocations; created in flk_init() */
struct kmem_cache	*flk_edge_cache;

/*
 * Table of lock graphs with HASH_SIZE slots.
 * NOTE(review): presumably indexed by a hash of the vnode via
 * flk_get_lock_graph() -- confirm against that function.
 */
graph_t		*lock_graph[HASH_SIZE];
/* NOTE(review): presumably the process graph used for deadlock detection */
proc_graph_t	pgraph;
91 
92 /*
93  * Clustering.
94  *
95  * NLM REGISTRY TYPE IMPLEMENTATION
96  *
97  * Assumptions:
98  *  1.  Nodes in a cluster are numbered starting at 1; always non-negative
99  *	integers; maximum node id is returned by clconf_maximum_nodeid().
100  *  2.  We use this node id to identify the node an NLM server runs on.
101  */
102 
103 /*
104  * NLM registry object keeps track of NLM servers via their
105  * nlmids (which are the node ids of the node in the cluster they run on)
106  * that have requested locks at this LLM with which this registry is
107  * associated.
108  *
109  * Representation of abstraction:
110  *    rep = record[	states: array[nlm_state],
111  *			lock: mutex]
112  *
113  *    Representation invariants:
114  *	1. index i of rep.states is between 0 and n - 1 where n is number
115  *	   of elements in the array, which happen to be the maximum number
116  *	   of nodes in the cluster configuration + 1.
117  *	2. map nlmid to index i of rep.states
118  *		0   -> 0
119  *		1   -> 1
120  *		2   -> 2
121  *		n-1 -> clconf_maximum_nodeid()+1
122  *	3.  This 1-1 mapping is quite convenient and it avoids errors resulting
123  *	    from forgetting to subtract 1 from the index.
124  *	4.  The reason we keep the 0th index is the following.  A legitimate
125  *	    cluster configuration includes making a UFS file system NFS
126  *	    exportable.  The code is structured so that if you're in a cluster
127  *	    you do one thing; otherwise, you do something else.  The problem
128  *	    is what to do if you think you're in a cluster with PXFS loaded,
129  *	    but you're using UFS not PXFS?  The upper two bytes of the sysid
130  *	    encode the node id of the node where NLM server runs; these bytes
131  *	    are zero for UFS.  Since the nodeid is used to index into the
132  *	    registry, we can record the NLM server state information at index
133  *	    0 using the same mechanism used for PXFS file locks!
134  */
static flk_nlm_status_t *nlm_reg_status = NULL;	/* state array 0..N-1 */
static kmutex_t nlm_reg_lock;			/* lock to protect array */
static uint_t nlm_status_size;			/* size of state array */
138 
139 /*
140  * Although we need a global lock dependency graph (and associated data
141  * structures), we also need a per-zone notion of whether the lock manager is
142  * running, and so whether to allow lock manager requests or not.
143  *
144  * Thus, on a per-zone basis we maintain a ``global'' variable
145  * (flk_lockmgr_status), protected by flock_lock, and set when the lock
146  * manager is determined to be changing state (starting or stopping).
147  *
148  * Each graph/zone pair also has a copy of this variable, which is protected by
149  * the graph's mutex.
150  *
151  * The per-graph copies are used to synchronize lock requests with shutdown
152  * requests.  The global copy is used to initialize the per-graph field when a
153  * new graph is created.
154  */
struct flock_globals {
	/* zone-wide lock manager status; protected by flock_lock */
	flk_lockmgr_status_t flk_lockmgr_status;
	/* one copy per lock graph slot; each protected by that graph's mutex */
	flk_lockmgr_status_t lockmgr_status[HASH_SIZE];
};

/* zone key under which each zone's struct flock_globals is stored */
zone_key_t flock_zone_key;
161 
/* Forward declarations for the file-local lock graph routines. */
static void create_flock(lock_descriptor_t *, flock64_t *);
static lock_descriptor_t	*flk_get_lock(void);
static void	flk_free_lock(lock_descriptor_t	*lock);
static void	flk_get_first_blocking_lock(lock_descriptor_t *request);
static int flk_process_request(lock_descriptor_t *);
static int flk_add_edge(lock_descriptor_t *, lock_descriptor_t *, int, int);
static edge_t *flk_get_edge(void);
static int flk_wait_execute_request(lock_descriptor_t *);
static int flk_relation(lock_descriptor_t *, lock_descriptor_t *);
static void flk_insert_active_lock(lock_descriptor_t *);
static void flk_delete_active_lock(lock_descriptor_t *, int);
static void flk_insert_sleeping_lock(lock_descriptor_t *);
static void flk_graph_uncolor(graph_t *);
static void flk_wakeup(lock_descriptor_t *, int);
static void flk_free_edge(edge_t *);
static void flk_recompute_dependencies(lock_descriptor_t *,
			lock_descriptor_t **,  int, int);
static int flk_find_barriers(lock_descriptor_t *);
static void flk_update_barriers(lock_descriptor_t *);
static int flk_color_reachables(lock_descriptor_t *);
static int flk_canceled(lock_descriptor_t *);
static void flk_delete_locks_by_sysid(lock_descriptor_t *);
static void report_blocker(lock_descriptor_t *, lock_descriptor_t *);
static void wait_for_lock(lock_descriptor_t *);
static void unlock_lockmgr_granted(struct flock_globals *);
static void wakeup_sleeping_lockmgr_locks(struct flock_globals *);

/* Clustering hooks */
static void cl_flk_change_nlm_state_all_locks(int, flk_nlm_status_t);
static void cl_flk_wakeup_sleeping_nlm_locks(int);
static void cl_flk_unlock_nlm_granted(int);

/* Consistency checkers, built only into DEBUG kernels. */
#ifdef DEBUG
static int check_lock_transition(int, int);
static void check_sleeping_locks(graph_t *);
static void check_active_locks(graph_t *);
static int no_path(lock_descriptor_t *, lock_descriptor_t *);
static void path(lock_descriptor_t *, lock_descriptor_t *);
static void check_owner_locks(graph_t *, pid_t, int, vnode_t *);
static int level_one_path(lock_descriptor_t *, lock_descriptor_t *);
static int level_two_path(lock_descriptor_t *, lock_descriptor_t *, int);
#endif

/*	proc_graph function definitions */
static int flk_check_deadlock(lock_descriptor_t *);
static void flk_proc_graph_uncolor(void);
static proc_vertex_t *flk_get_proc_vertex(lock_descriptor_t *);
static proc_edge_t *flk_get_proc_edge(void);
static void flk_proc_release(proc_vertex_t *);
static void flk_free_proc_edge(proc_edge_t *);
static void flk_update_proc_graph(edge_t *, int);

/* Non-blocking mandatory locking */
static int lock_blocks_io(nbl_op_t, u_offset_t, ssize_t, int, u_offset_t,
			u_offset_t);
217 
218 static struct flock_globals *
219 flk_get_globals(void)
220 {
221 	/*
222 	 * The KLM module had better be loaded if we're attempting to handle
223 	 * lockmgr requests.
224 	 */
225 	ASSERT(flock_zone_key != ZONE_KEY_UNINITIALIZED);
226 	return (zone_getspecific(flock_zone_key, curproc->p_zone));
227 }
228 
229 static flk_lockmgr_status_t
230 flk_get_lockmgr_status(void)
231 {
232 	struct flock_globals *fg;
233 
234 	ASSERT(MUTEX_HELD(&flock_lock));
235 
236 	if (flock_zone_key == ZONE_KEY_UNINITIALIZED) {
237 		/*
238 		 * KLM module not loaded; lock manager definitely not running.
239 		 */
240 		return (FLK_LOCKMGR_DOWN);
241 	}
242 	fg = flk_get_globals();
243 	return (fg->flk_lockmgr_status);
244 }
245 
246 /*
247  * This implements Open File Description (not descriptor) style record locking.
248  * These locks can also be thought of as pid-less since they are not tied to a
249  * specific process, thus they're preserved across fork.
250  *
251  * Called directly from fcntl.
252  *
253  * See reclock() for the implementation of the traditional POSIX style record
254  * locking scheme (pid-ful). This function is derived from reclock() but
255  * simplified and modified to work for OFD style locking.
256  *
257  * The two primary advantages of OFD style of locking are:
258  * 1) It is per-file description, so closing a file descriptor that refers to a
259  *    different file description for the same file will not drop the lock (i.e.
260  *    two open's of the same file get different descriptions but a dup or fork
261  *    will refer to the same description).
262  * 2) Locks are preserved across fork(2).
263  *
264  * Because these locks are per-description a lock ptr lives at the f_filocks
265  * member of the file_t and the lock_descriptor includes a file_t pointer
266  * to enable unique lock identification and management.
267  *
268  * Since these locks are pid-less we cannot do deadlock detection with the
269  * current process-oriented implementation. This is consistent with OFD locking
270  * behavior on other operating systems such as Linux. Since we don't do
271  * deadlock detection we never interact with the process graph that is
272  * maintained for deadlock detection on the traditional POSIX-style locks.
273  *
274  * Future Work:
275  *
276  * The current implementation does not support record locks. That is,
277  * currently the single lock must cover the entire file. This is validated in
278  * fcntl. To support record locks the f_filock pointer in the file_t needs to
279  * be changed to a list of pointers to the locks. That list needs to be
280  * managed independently of the lock list on the vnode itself and it needs to
281  * be maintained as record locks are created, split, coalesced and deleted.
282  *
283  * The current implementation does not support remote file systems (e.g.
284  * NFS or CIFS). This is handled in fs_frlock(). The design of how OFD locks
285  * interact with the NLM is not clear since the NLM protocol/implementation
286  * appears to be oriented around locks associated with a process. A further
287  * problem is that a design is needed for what nlm_send_siglost() should do and
288  * where it will send SIGLOST. More recent versions of Linux apparently try to
289  * emulate OFD locks on NFS by converting them to traditional POSIX style locks
290  * that work with the NLM. It is not clear that this provides the correct
291  * semantics in all cases.
292  */
293 int
294 ofdlock(file_t *fp, int fcmd, flock64_t *lckdat, int flag, u_offset_t offset)
295 {
296 	int cmd = 0;
297 	vnode_t *vp;
298 	lock_descriptor_t	stack_lock_request;
299 	lock_descriptor_t	*lock_request;
300 	int error = 0;
301 	graph_t	*gp;
302 	int serialize = 0;
303 
304 	if (fcmd != F_OFD_GETLK)
305 		cmd = SETFLCK;
306 
307 	if (fcmd == F_OFD_SETLKW || fcmd == F_FLOCKW)
308 		cmd |= SLPFLCK;
309 
310 	/* see block comment */
311 	VERIFY(lckdat->l_whence == 0);
312 	VERIFY(lckdat->l_start == 0);
313 	VERIFY(lckdat->l_len == 0);
314 
315 	vp = fp->f_vnode;
316 
317 	/*
318 	 * For reclock fs_frlock() would normally have set these in a few
319 	 * places but for us it's cleaner to centralize it here. Note that
320 	 * IGN_PID is -1. We use 0 for our pid-less locks.
321 	 */
322 	lckdat->l_pid = 0;
323 	lckdat->l_sysid = 0;
324 
325 	/*
326 	 * Check access permissions
327 	 */
328 	if ((fcmd == F_OFD_SETLK || fcmd == F_OFD_SETLKW) &&
329 	    ((lckdat->l_type == F_RDLCK && (flag & FREAD) == 0) ||
330 	    (lckdat->l_type == F_WRLCK && (flag & FWRITE) == 0)))
331 		return (EBADF);
332 
333 	/*
334 	 * for query and unlock we use the stack_lock_request
335 	 */
336 	if (lckdat->l_type == F_UNLCK || !(cmd & SETFLCK)) {
337 		lock_request = &stack_lock_request;
338 		(void) bzero((caddr_t)lock_request,
339 		    sizeof (lock_descriptor_t));
340 
341 		/*
342 		 * following is added to make the assertions in
343 		 * flk_execute_request() pass
344 		 */
345 		lock_request->l_edge.edge_in_next = &lock_request->l_edge;
346 		lock_request->l_edge.edge_in_prev = &lock_request->l_edge;
347 		lock_request->l_edge.edge_adj_next = &lock_request->l_edge;
348 		lock_request->l_edge.edge_adj_prev = &lock_request->l_edge;
349 		lock_request->l_status = FLK_INITIAL_STATE;
350 	} else {
351 		lock_request = flk_get_lock();
352 		fp->f_filock = (struct filock *)lock_request;
353 	}
354 	lock_request->l_state = 0;
355 	lock_request->l_vnode = vp;
356 	lock_request->l_zoneid = getzoneid();
357 	lock_request->l_ofd = fp;
358 
359 	/*
360 	 * Convert the request range into the canonical start and end
361 	 * values then check the validity of the lock range.
362 	 */
363 	error = flk_convert_lock_data(vp, lckdat, &lock_request->l_start,
364 	    &lock_request->l_end, offset);
365 	if (error)
366 		goto done;
367 
368 	error = flk_check_lock_data(lock_request->l_start, lock_request->l_end,
369 	    MAXEND);
370 	if (error)
371 		goto done;
372 
373 	ASSERT(lock_request->l_end >= lock_request->l_start);
374 
375 	lock_request->l_type = lckdat->l_type;
376 	if (cmd & SLPFLCK)
377 		lock_request->l_state |= WILLING_TO_SLEEP_LOCK;
378 
379 	if (!(cmd & SETFLCK)) {
380 		if (lock_request->l_type == F_RDLCK ||
381 		    lock_request->l_type == F_WRLCK)
382 			lock_request->l_state |= QUERY_LOCK;
383 	}
384 	lock_request->l_flock = (*lckdat);
385 
386 	/*
387 	 * We are ready for processing the request
388 	 */
389 
390 	if (fcmd != F_OFD_GETLK && lock_request->l_type != F_UNLCK &&
391 	    nbl_need_check(vp)) {
392 		nbl_start_crit(vp, RW_WRITER);
393 		serialize = 1;
394 	}
395 
396 	/* Get the lock graph for a particular vnode */
397 	gp = flk_get_lock_graph(vp, FLK_INIT_GRAPH);
398 
399 	mutex_enter(&gp->gp_mutex);
400 
401 	lock_request->l_state |= REFERENCED_LOCK;
402 	lock_request->l_graph = gp;
403 
404 	switch (lock_request->l_type) {
405 	case F_RDLCK:
406 	case F_WRLCK:
407 		if (IS_QUERY_LOCK(lock_request)) {
408 			flk_get_first_blocking_lock(lock_request);
409 			if (lock_request->l_ofd != NULL)
410 				lock_request->l_flock.l_pid = -1;
411 			(*lckdat) = lock_request->l_flock;
412 		} else {
413 			/* process the request now */
414 			error = flk_process_request(lock_request);
415 		}
416 		break;
417 
418 	case F_UNLCK:
419 		/* unlock request will not block so execute it immediately */
420 		error = flk_execute_request(lock_request);
421 		break;
422 
423 	default:
424 		error = EINVAL;
425 		break;
426 	}
427 
428 	if (lock_request == &stack_lock_request) {
429 		flk_set_state(lock_request, FLK_DEAD_STATE);
430 	} else {
431 		lock_request->l_state &= ~REFERENCED_LOCK;
432 		if ((error != 0) || IS_DELETED(lock_request)) {
433 			flk_set_state(lock_request, FLK_DEAD_STATE);
434 			flk_free_lock(lock_request);
435 		}
436 	}
437 
438 	mutex_exit(&gp->gp_mutex);
439 	if (serialize)
440 		nbl_end_crit(vp);
441 
442 	return (error);
443 
444 done:
445 	flk_set_state(lock_request, FLK_DEAD_STATE);
446 	if (lock_request != &stack_lock_request)
447 		flk_free_lock(lock_request);
448 	return (error);
449 }
450 
451 /*
452  * Remove any lock on the vnode belonging to the given file_t.
453  * Called from closef on last close, file_t is locked.
454  *
455  * This is modeled on the cleanlocks() function but only removes the single
456  * lock associated with fp.
457  */
458 void
459 ofdcleanlock(file_t *fp)
460 {
461 	lock_descriptor_t *fplock, *lock, *nlock;
462 	vnode_t *vp;
463 	graph_t	*gp;
464 
465 	ASSERT(MUTEX_HELD(&fp->f_tlock));
466 
467 	if ((fplock = (lock_descriptor_t *)fp->f_filock) == NULL)
468 		return;
469 
470 	fp->f_filock = NULL;
471 	vp = fp->f_vnode;
472 
473 	gp = flk_get_lock_graph(vp, FLK_USE_GRAPH);
474 
475 	if (gp == NULL)
476 		return;
477 	mutex_enter(&gp->gp_mutex);
478 
479 	CHECK_SLEEPING_LOCKS(gp);
480 	CHECK_ACTIVE_LOCKS(gp);
481 
482 	SET_LOCK_TO_FIRST_SLEEP_VP(gp, lock, vp);
483 
484 	if (lock) {
485 		do {
486 			nlock = lock->l_next;
487 			if (fplock == lock) {
488 				CANCEL_WAKEUP(lock);
489 				break;
490 			}
491 			lock = nlock;
492 		} while (lock->l_vnode == vp);
493 	}
494 
495 	SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
496 
497 	if (lock) {
498 		do {
499 			nlock = lock->l_next;
500 			if (fplock == lock) {
501 				flk_delete_active_lock(lock, 0);
502 				flk_wakeup(lock, 1);
503 				flk_free_lock(lock);
504 				break;
505 			}
506 			lock = nlock;
507 		} while (lock->l_vnode == vp);
508 	}
509 
510 	CHECK_SLEEPING_LOCKS(gp);
511 	CHECK_ACTIVE_LOCKS(gp);
512 	mutex_exit(&gp->gp_mutex);
513 }
514 
515 /*
516  * Routine called from fs_frlock in fs/fs_subr.c
517  *
518  * This implements traditional POSIX style record locking. The two primary
519  * drawbacks to this style of locking are:
520  * 1) It is per-process, so any close of a file descriptor that refers to the
521  *    file will drop the lock (e.g. lock /etc/passwd, call a library function
522  *    which opens /etc/passwd to read the file, when the library closes it's
523  *    file descriptor the application loses its lock and does not know).
524  * 2) Locks are not preserved across fork(2).
525  *
526  * Because these locks are only assoiciated with a pid they are per-process.
527  * This is why any close will drop the lock and is also why once the process
528  * forks then the lock is no longer related to the new process. These locks can
529  * be considered as pid-ful.
530  *
531  * See ofdlock() for the implementation of a similar but improved locking
532  * scheme.
533  */
534 int
535 reclock(vnode_t		*vp,
536 	flock64_t	*lckdat,
537 	int		cmd,
538 	int		flag,
539 	u_offset_t	offset,
540 	flk_callback_t	*flk_cbp)
541 {
542 	lock_descriptor_t	stack_lock_request;
543 	lock_descriptor_t	*lock_request;
544 	int error = 0;
545 	graph_t	*gp;
546 	int			nlmid;
547 
548 	/*
549 	 * Check access permissions
550 	 */
551 	if ((cmd & SETFLCK) &&
552 	    ((lckdat->l_type == F_RDLCK && (flag & FREAD) == 0) ||
553 	    (lckdat->l_type == F_WRLCK && (flag & FWRITE) == 0)))
554 			return (EBADF);
555 
556 	/*
557 	 * for query and unlock we use the stack_lock_request
558 	 */
559 
560 	if ((lckdat->l_type == F_UNLCK) ||
561 	    !((cmd & INOFLCK) || (cmd & SETFLCK))) {
562 		lock_request = &stack_lock_request;
563 		(void) bzero((caddr_t)lock_request,
564 		    sizeof (lock_descriptor_t));
565 
566 		/*
567 		 * following is added to make the assertions in
568 		 * flk_execute_request() to pass through
569 		 */
570 
571 		lock_request->l_edge.edge_in_next = &lock_request->l_edge;
572 		lock_request->l_edge.edge_in_prev = &lock_request->l_edge;
573 		lock_request->l_edge.edge_adj_next = &lock_request->l_edge;
574 		lock_request->l_edge.edge_adj_prev = &lock_request->l_edge;
575 		lock_request->l_status = FLK_INITIAL_STATE;
576 	} else {
577 		lock_request = flk_get_lock();
578 	}
579 	lock_request->l_state = 0;
580 	lock_request->l_vnode = vp;
581 	lock_request->l_zoneid = getzoneid();
582 
583 	/*
584 	 * Convert the request range into the canonical start and end
585 	 * values.  The NLM protocol supports locking over the entire
586 	 * 32-bit range, so there's no range checking for remote requests,
587 	 * but we still need to verify that local requests obey the rules.
588 	 */
589 	/* Clustering */
590 	if ((cmd & (RCMDLCK | PCMDLCK)) != 0) {
591 		ASSERT(lckdat->l_whence == 0);
592 		lock_request->l_start = lckdat->l_start;
593 		lock_request->l_end = (lckdat->l_len == 0) ? MAX_U_OFFSET_T :
594 		    lckdat->l_start + (lckdat->l_len - 1);
595 	} else {
596 		/* check the validity of the lock range */
597 		error = flk_convert_lock_data(vp, lckdat,
598 		    &lock_request->l_start, &lock_request->l_end,
599 		    offset);
600 		if (error) {
601 			goto done;
602 		}
603 		error = flk_check_lock_data(lock_request->l_start,
604 		    lock_request->l_end, MAXEND);
605 		if (error) {
606 			goto done;
607 		}
608 	}
609 
610 	ASSERT(lock_request->l_end >= lock_request->l_start);
611 
612 	lock_request->l_type = lckdat->l_type;
613 	if (cmd & INOFLCK)
614 		lock_request->l_state |= IO_LOCK;
615 	if (cmd & SLPFLCK)
616 		lock_request->l_state |= WILLING_TO_SLEEP_LOCK;
617 	if (cmd & RCMDLCK)
618 		lock_request->l_state |= LOCKMGR_LOCK;
619 	if (cmd & NBMLCK)
620 		lock_request->l_state |= NBMAND_LOCK;
621 	/*
622 	 * Clustering: set flag for PXFS locks
623 	 * We do not _only_ check for the PCMDLCK flag because PXFS locks could
624 	 * also be of type 'RCMDLCK'.
625 	 * We do not _only_ check the GETPXFSID() macro because local PXFS
626 	 * clients use a pxfsid of zero to permit deadlock detection in the LLM.
627 	 */
628 
629 	if ((cmd & PCMDLCK) || (GETPXFSID(lckdat->l_sysid) != 0)) {
630 		lock_request->l_state |= PXFS_LOCK;
631 	}
632 	if (!((cmd & SETFLCK) || (cmd & INOFLCK))) {
633 		if (lock_request->l_type == F_RDLCK ||
634 		    lock_request->l_type == F_WRLCK)
635 			lock_request->l_state |= QUERY_LOCK;
636 	}
637 	lock_request->l_flock = (*lckdat);
638 	lock_request->l_callbacks = flk_cbp;
639 
640 	/*
641 	 * We are ready for processing the request
642 	 */
643 	if (IS_LOCKMGR(lock_request)) {
644 		/*
645 		 * If the lock request is an NLM server request ....
646 		 */
647 		if (nlm_status_size == 0) { /* not booted as cluster */
648 			mutex_enter(&flock_lock);
649 			/*
650 			 * Bail out if this is a lock manager request and the
651 			 * lock manager is not supposed to be running.
652 			 */
653 			if (flk_get_lockmgr_status() != FLK_LOCKMGR_UP) {
654 				mutex_exit(&flock_lock);
655 				error = ENOLCK;
656 				goto done;
657 			}
658 			mutex_exit(&flock_lock);
659 		} else {			/* booted as a cluster */
660 			nlmid = GETNLMID(lock_request->l_flock.l_sysid);
661 			ASSERT(nlmid <= nlm_status_size && nlmid >= 0);
662 
663 			mutex_enter(&nlm_reg_lock);
664 			/*
665 			 * If the NLM registry does not know about this
666 			 * NLM server making the request, add its nlmid
667 			 * to the registry.
668 			 */
669 			if (FLK_REGISTRY_IS_NLM_UNKNOWN(nlm_reg_status,
670 			    nlmid)) {
671 				FLK_REGISTRY_ADD_NLMID(nlm_reg_status, nlmid);
672 			} else if (!FLK_REGISTRY_IS_NLM_UP(nlm_reg_status,
673 			    nlmid)) {
674 				/*
675 				 * If the NLM server is already known (has made
676 				 * previous lock requests) and its state is
677 				 * not NLM_UP (means that NLM server is
678 				 * shutting down), then bail out with an
679 				 * error to deny the lock request.
680 				 */
681 				mutex_exit(&nlm_reg_lock);
682 				error = ENOLCK;
683 				goto done;
684 			}
685 			mutex_exit(&nlm_reg_lock);
686 		}
687 	}
688 
689 	/* Now get the lock graph for a particular vnode */
690 	gp = flk_get_lock_graph(vp, FLK_INIT_GRAPH);
691 
692 	/*
693 	 * We drop rwlock here otherwise this might end up causing a
694 	 * deadlock if this IOLOCK sleeps. (bugid # 1183392).
695 	 */
696 
697 	if (IS_IO_LOCK(lock_request)) {
698 		VOP_RWUNLOCK(vp,
699 		    (lock_request->l_type == F_RDLCK) ?
700 		    V_WRITELOCK_FALSE : V_WRITELOCK_TRUE, NULL);
701 	}
702 	mutex_enter(&gp->gp_mutex);
703 
704 	lock_request->l_state |= REFERENCED_LOCK;
705 	lock_request->l_graph = gp;
706 
707 	switch (lock_request->l_type) {
708 	case F_RDLCK:
709 	case F_WRLCK:
710 		if (IS_QUERY_LOCK(lock_request)) {
711 			flk_get_first_blocking_lock(lock_request);
712 			if (lock_request->l_ofd != NULL)
713 				lock_request->l_flock.l_pid = -1;
714 			(*lckdat) = lock_request->l_flock;
715 			break;
716 		}
717 
718 		/* process the request now */
719 
720 		error = flk_process_request(lock_request);
721 		break;
722 
723 	case F_UNLCK:
724 		/* unlock request will not block so execute it immediately */
725 
726 		if (IS_LOCKMGR(lock_request) &&
727 		    flk_canceled(lock_request)) {
728 			error = 0;
729 		} else {
730 			error = flk_execute_request(lock_request);
731 		}
732 		break;
733 
734 	case F_UNLKSYS:
735 		/*
736 		 * Recovery mechanism to release lock manager locks when
737 		 * NFS client crashes and restart. NFS server will clear
738 		 * old locks and grant new locks.
739 		 */
740 
741 		if (lock_request->l_flock.l_sysid == 0) {
742 			mutex_exit(&gp->gp_mutex);
743 			return (EINVAL);
744 		}
745 		if (secpolicy_nfs(CRED()) != 0) {
746 			mutex_exit(&gp->gp_mutex);
747 			return (EPERM);
748 		}
749 		flk_delete_locks_by_sysid(lock_request);
750 		lock_request->l_state &= ~REFERENCED_LOCK;
751 		flk_set_state(lock_request, FLK_DEAD_STATE);
752 		flk_free_lock(lock_request);
753 		mutex_exit(&gp->gp_mutex);
754 		return (0);
755 
756 	default:
757 		error = EINVAL;
758 		break;
759 	}
760 
761 	/* Clustering: For blocked PXFS locks, return */
762 	if (error == PXFS_LOCK_BLOCKED) {
763 		lock_request->l_state &= ~REFERENCED_LOCK;
764 		mutex_exit(&gp->gp_mutex);
765 		return (error);
766 	}
767 
768 	/*
769 	 * Now that we have seen the status of locks in the system for
770 	 * this vnode we acquire the rwlock if it is an IO_LOCK.
771 	 */
772 
773 	if (IS_IO_LOCK(lock_request)) {
774 		(void) VOP_RWLOCK(vp,
775 		    (lock_request->l_type == F_RDLCK) ?
776 		    V_WRITELOCK_FALSE : V_WRITELOCK_TRUE, NULL);
777 		if (!error) {
778 			lckdat->l_type = F_UNLCK;
779 
780 			/*
781 			 * This wake up is needed otherwise
782 			 * if IO_LOCK has slept the dependents on this
783 			 * will not be woken up at all. (bugid # 1185482).
784 			 */
785 
786 			flk_wakeup(lock_request, 1);
787 			flk_set_state(lock_request, FLK_DEAD_STATE);
788 			flk_free_lock(lock_request);
789 		}
790 		/*
791 		 * else if error had occurred either flk_process_request()
792 		 * has returned EDEADLK in which case there will be no
793 		 * dependents for this lock or EINTR from flk_wait_execute_
794 		 * request() in which case flk_cancel_sleeping_lock()
795 		 * would have been done. same is true with EBADF.
796 		 */
797 	}
798 
799 	if (lock_request == &stack_lock_request) {
800 		flk_set_state(lock_request, FLK_DEAD_STATE);
801 	} else {
802 		lock_request->l_state &= ~REFERENCED_LOCK;
803 		if ((error != 0) || IS_DELETED(lock_request)) {
804 			flk_set_state(lock_request, FLK_DEAD_STATE);
805 			flk_free_lock(lock_request);
806 		}
807 	}
808 
809 	mutex_exit(&gp->gp_mutex);
810 	return (error);
811 
812 done:
813 	flk_set_state(lock_request, FLK_DEAD_STATE);
814 	if (lock_request != &stack_lock_request)
815 		flk_free_lock(lock_request);
816 	return (error);
817 }
818 
819 /*
820  * Invoke the callbacks in the given list.  If before sleeping, invoke in
821  * list order.  If after sleeping, invoke in reverse order.
822  *
823  * CPR (suspend/resume) support: if one of the callbacks returns a
824  * callb_cpr_t, return it.   This will be used to make the thread CPR-safe
825  * while it is sleeping.  There should be at most one callb_cpr_t for the
826  * thread.
827  * XXX This is unnecessarily complicated.  The CPR information should just
828  * get passed in directly through VOP_FRLOCK and reclock, rather than
829  * sneaking it in via a callback.
830  */
831 
832 callb_cpr_t *
833 flk_invoke_callbacks(flk_callback_t *cblist, flk_cb_when_t when)
834 {
835 	callb_cpr_t *cpr_callbackp = NULL;
836 	callb_cpr_t *one_result;
837 	flk_callback_t *cb;
838 
839 	if (cblist == NULL)
840 		return (NULL);
841 
842 	if (when == FLK_BEFORE_SLEEP) {
843 		cb = cblist;
844 		do {
845 			one_result = (*cb->cb_callback)(when, cb->cb_data);
846 			if (one_result != NULL) {
847 				ASSERT(cpr_callbackp == NULL);
848 				cpr_callbackp = one_result;
849 			}
850 			cb = cb->cb_next;
851 		} while (cb != cblist);
852 	} else {
853 		cb = cblist->cb_prev;
854 		do {
855 			one_result = (*cb->cb_callback)(when, cb->cb_data);
856 			if (one_result != NULL) {
857 				cpr_callbackp = one_result;
858 			}
859 			cb = cb->cb_prev;
860 		} while (cb != cblist->cb_prev);
861 	}
862 
863 	return (cpr_callbackp);
864 }
865 
866 /*
867  * Initialize a flk_callback_t to hold the given callback.
868  */
869 
870 void
871 flk_init_callback(flk_callback_t *flk_cb,
872 	callb_cpr_t *(*cb_fcn)(flk_cb_when_t, void *), void *cbdata)
873 {
874 	flk_cb->cb_next = flk_cb;
875 	flk_cb->cb_prev = flk_cb;
876 	flk_cb->cb_callback = cb_fcn;
877 	flk_cb->cb_data = cbdata;
878 }
879 
880 /*
881  * Initialize an flk_callback_t and then link it into the head of an
882  * existing list (which may be NULL).
883  */
884 
885 void
886 flk_add_callback(flk_callback_t *newcb,
887 		callb_cpr_t *(*cb_fcn)(flk_cb_when_t, void *),
888 		void *cbdata, flk_callback_t *cblist)
889 {
890 	flk_init_callback(newcb, cb_fcn, cbdata);
891 
892 	if (cblist == NULL)
893 		return;
894 
895 	newcb->cb_prev = cblist->cb_prev;
896 	newcb->cb_next = cblist;
897 	cblist->cb_prev->cb_next = newcb;
898 	cblist->cb_prev = newcb;
899 }
900 
901 /*
902  * Initialize the flk_edge_cache data structure and create the
903  * nlm_reg_status array.
904  */
905 
906 void
907 flk_init(void)
908 {
909 	uint_t	i;
910 
911 	flk_edge_cache = kmem_cache_create("flk_edges",
912 	    sizeof (struct edge), 0, NULL, NULL, NULL, NULL, NULL, 0);
913 	if (flk_edge_cache == NULL) {
914 		cmn_err(CE_PANIC, "Couldn't create flk_edge_cache\n");
915 	}
916 	/*
917 	 * Create the NLM registry object.
918 	 */
919 
920 	if (cluster_bootflags & CLUSTER_BOOTED) {
921 		/*
922 		 * This routine tells you the maximum node id that will be used
923 		 * in the cluster.  This number will be the size of the nlm
924 		 * registry status array.  We add 1 because we will be using
925 		 * all entries indexed from 0 to maxnodeid; e.g., from 0
926 		 * to 64, for a total of 65 entries.
927 		 */
928 		nlm_status_size = clconf_maximum_nodeid() + 1;
929 	} else {
930 		nlm_status_size = 0;
931 	}
932 
933 	if (nlm_status_size != 0) {	/* booted as a cluster */
934 		nlm_reg_status = (flk_nlm_status_t *)
935 		    kmem_alloc(sizeof (flk_nlm_status_t) * nlm_status_size,
936 		    KM_SLEEP);
937 
938 		/* initialize all NLM states in array to NLM_UNKNOWN */
939 		for (i = 0; i < nlm_status_size; i++) {
940 			nlm_reg_status[i] = FLK_NLM_UNKNOWN;
941 		}
942 	}
943 }
944 
945 /*
946  * Zone constructor/destructor callbacks to be executed when a zone is
947  * created/destroyed.
948  */
949 /* ARGSUSED */
950 void *
951 flk_zone_init(zoneid_t zoneid)
952 {
953 	struct flock_globals *fg;
954 	uint_t i;
955 
956 	fg = kmem_alloc(sizeof (*fg), KM_SLEEP);
957 	fg->flk_lockmgr_status = FLK_LOCKMGR_UP;
958 	for (i = 0; i < HASH_SIZE; i++)
959 		fg->lockmgr_status[i] = FLK_LOCKMGR_UP;
960 	return (fg);
961 }
962 
963 /* ARGSUSED */
964 void
965 flk_zone_fini(zoneid_t zoneid, void *data)
966 {
967 	struct flock_globals *fg = data;
968 
969 	kmem_free(fg, sizeof (*fg));
970 }
971 
972 /*
973  * Get a lock_descriptor structure with initialization of edge lists.
974  */
975 
976 static lock_descriptor_t *
977 flk_get_lock(void)
978 {
979 	lock_descriptor_t	*l;
980 
981 	l = kmem_zalloc(sizeof (lock_descriptor_t), KM_SLEEP);
982 
983 	cv_init(&l->l_cv, NULL, CV_DRIVER, NULL);
984 	l->l_edge.edge_in_next = &l->l_edge;
985 	l->l_edge.edge_in_prev = &l->l_edge;
986 	l->l_edge.edge_adj_next = &l->l_edge;
987 	l->l_edge.edge_adj_prev = &l->l_edge;
988 	l->pvertex = -1;
989 	l->l_status = FLK_INITIAL_STATE;
990 	flk_lock_allocs++;
991 	return (l);
992 }
993 
994 /*
995  * Free a lock_descriptor structure. Just sets the DELETED_LOCK flag
996  * when some thread has a reference to it as in reclock().
997  */
998 
999 void
1000 flk_free_lock(lock_descriptor_t	*lock)
1001 {
1002 	file_t *fp;
1003 
1004 	ASSERT(IS_DEAD(lock));
1005 
1006 	if ((fp = lock->l_ofd) != NULL)
1007 		fp->f_filock = NULL;
1008 
1009 	if (IS_REFERENCED(lock)) {
1010 		lock->l_state |= DELETED_LOCK;
1011 		return;
1012 	}
1013 	flk_lock_frees++;
1014 	kmem_free((void *)lock, sizeof (lock_descriptor_t));
1015 }
1016 
1017 void
1018 flk_set_state(lock_descriptor_t *lock, int new_state)
1019 {
1020 	/*
1021 	 * Locks in the sleeping list may be woken up in a number of ways,
1022 	 * and more than once.  If a sleeping lock is signaled awake more
1023 	 * than once, then it may or may not change state depending on its
1024 	 * current state.
1025 	 * Also note that NLM locks that are sleeping could be moved to an
1026 	 * interrupted state more than once if the unlock request is
1027 	 * retransmitted by the NLM client - the second time around, this is
1028 	 * just a nop.
1029 	 * The ordering of being signaled awake is:
1030 	 * INTERRUPTED_STATE > CANCELLED_STATE > GRANTED_STATE.
1031 	 * The checks below implement this ordering.
1032 	 */
1033 	if (IS_INTERRUPTED(lock)) {
1034 		if ((new_state == FLK_CANCELLED_STATE) ||
1035 		    (new_state == FLK_GRANTED_STATE) ||
1036 		    (new_state == FLK_INTERRUPTED_STATE)) {
1037 			return;
1038 		}
1039 	}
1040 	if (IS_CANCELLED(lock)) {
1041 		if ((new_state == FLK_GRANTED_STATE) ||
1042 		    (new_state == FLK_CANCELLED_STATE)) {
1043 			return;
1044 		}
1045 	}
1046 	CHECK_LOCK_TRANSITION(lock->l_status, new_state);
1047 	if (IS_PXFS(lock)) {
1048 		cl_flk_state_transition_notify(lock, lock->l_status, new_state);
1049 	}
1050 	lock->l_status = new_state;
1051 }
1052 
1053 /*
1054  * Routine that checks whether there are any blocking locks in the system.
1055  *
1056  * The policy followed is if a write lock is sleeping we don't allow read
1057  * locks before this write lock even though there may not be any active
1058  * locks corresponding to the read locks' region.
1059  *
1060  * flk_add_edge() function adds an edge between l1 and l2 iff there
1061  * is no path between l1 and l2. This is done to have a "minimum
1062  * storage representation" of the dependency graph.
1063  *
1064  * Another property of the graph is since only the new request throws
1065  * edges to the existing locks in the graph, the graph is always topologically
1066  * ordered.
      *
      * The caller must hold the mutex of the request's graph
      * (request->l_graph->gp_mutex).
      *
      * Returns EAGAIN when a conflicting lock exists and the request is not
      * willing to sleep, EDEADLK when flk_check_deadlock() reports a deadlock,
      * any error returned by flk_add_edge(), and otherwise the result of
      * flk_execute_request() or flk_wait_execute_request().
1067  */
1068 
1069 static int
1070 flk_process_request(lock_descriptor_t *request)
1071 {
1072 	graph_t	*gp = request->l_graph;
1073 	lock_descriptor_t *lock;
1074 	int request_blocked_by_active = 0;
1075 	int request_blocked_by_granted = 0;
1076 	int request_blocked_by_sleeping = 0;
1077 	vnode_t	*vp = request->l_vnode;
1078 	int	error = 0;
1079 	int request_will_wait = 0;
1080 	int found_covering_lock = 0;
1081 	lock_descriptor_t *covered_by = NULL;
1082 
1083 	ASSERT(MUTEX_HELD(&gp->gp_mutex));
1084 	request_will_wait = IS_WILLING_TO_SLEEP(request);
1085 
1086 	/*
1087 	 * check active locks
1088 	 */
1089 
1090 	SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
1091 
1092 
1093 	if (lock) {
1094 		do {
1095 			if (BLOCKS(lock, request)) {
1096 				if (!request_will_wait)
1097 					return (EAGAIN);
1098 				request_blocked_by_active = 1;
1099 				break;
1100 			}
1101 			/*
1102 			 * Grant lock if it is for the same owner holding active
1103 			 * lock that covers the request.
1104 			 */
1105 
1106 			if (SAME_OWNER(lock, request) &&
1107 			    COVERS(lock, request) &&
1108 			    (request->l_type == F_RDLCK))
1109 				return (flk_execute_request(request));
1110 			lock = lock->l_next;
1111 		} while (lock->l_vnode == vp);
1112 	}
1113 
1114 	if (!request_blocked_by_active) {
1115 			lock_descriptor_t *lk[1];
1116 			lock_descriptor_t *first_glock = NULL;
1117 		/*
1118 		 * Shall we grant this?! NO!!
1119 		 * What about those locks that were just granted and still
1120 		 * in sleep queue. Those threads are woken up and so locks
1121 		 * are almost active.
1122 		 */
1123 		SET_LOCK_TO_FIRST_SLEEP_VP(gp, lock, vp);
1124 		if (lock) {
1125 			do {
1126 				if (BLOCKS(lock, request)) {
1127 					if (IS_GRANTED(lock)) {
1128 						request_blocked_by_granted = 1;
1129 					} else {
1130 						request_blocked_by_sleeping = 1;
1131 					}
1132 				}
1133 
1134 				lock = lock->l_next;
1135 			} while ((lock->l_vnode == vp));
			/*
			 * The loop stopped on the first entry that is not for
			 * vp, so its predecessor is the last sleeping lock
			 * for vp.  It is the starting point of the backward
			 * walk over granted locks below.
			 */
1136 			first_glock = lock->l_prev;
1137 			ASSERT(first_glock->l_vnode == vp);
1138 		}
1139 
1140 		if (request_blocked_by_granted)
1141 			goto block;
1142 
1143 		if (!request_blocked_by_sleeping) {
1144 			/*
1145 			 * If the request isn't going to be blocked by a
1146 			 * sleeping request, we know that it isn't going to
1147 			 * be blocked; we can just execute the request --
1148 			 * without performing costly deadlock detection.
1149 			 */
1150 			ASSERT(!request_blocked_by_active);
1151 			return (flk_execute_request(request));
1152 		} else if (request->l_type == F_RDLCK) {
1153 			/*
1154 			 * If we have a sleeping writer in the requested
1155 			 * lock's range, block.
1156 			 */
1157 			goto block;
1158 		}
1159 
		/*
		 * A non-read request that conflicts only with sleeping locks
		 * that have not been granted.  Recompute dependencies of the
		 * active locks and of the granted sleeping locks against the
		 * request (flagged RECOMPUTE_LOCK for the duration), then
		 * execute it unless that created a deadlock.
		 */
1160 		lk[0] = request;
1161 		request->l_state |= RECOMPUTE_LOCK;
1162 		SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
1163 		if (lock) {
1164 			do {
1165 				flk_recompute_dependencies(lock, lk, 1, 0);
1166 				lock = lock->l_next;
1167 			} while (lock->l_vnode == vp);
1168 		}
1169 		lock = first_glock;
1170 		if (lock) {
1171 			do {
1172 				if (IS_GRANTED(lock)) {
1173 				flk_recompute_dependencies(lock, lk, 1, 0);
1174 				}
1175 				lock = lock->l_prev;
1176 			} while ((lock->l_vnode == vp));
1177 		}
1178 		request->l_state &= ~RECOMPUTE_LOCK;
1179 		if (!NO_DEPENDENTS(request) && flk_check_deadlock(request))
1180 			return (EDEADLK);
1181 		return (flk_execute_request(request));
1182 	}
1183 
1184 block:
	/*
	 * Start with a freshly uncolored graph; flk_add_edge() relies on the
	 * coloring staying consistent across the calls made below.
	 */
1185 	if (request_will_wait)
1186 		flk_graph_uncolor(gp);
1187 
1188 	/* check sleeping locks */
1189 
1190 	SET_LOCK_TO_FIRST_SLEEP_VP(gp, lock, vp);
1191 
1192 	/*
1193 	 * If we find a sleeping write lock that is a superset of the
1194 	 * region wanted by request we can be assured that by adding an
1195 	 * edge to this write lock we have paths to all locks in the
1196 	 * graph that blocks the request except in one case and that is why
1197 	 * another check for SAME_OWNER in the loop below. The exception
1198 	 * case is when this process that owns the sleeping write lock 'l1'
1199 	 * has other locks l2, l3, l4 that are in the system and arrived
1200 	 * before l1. l1 does not have path to these locks as they are from
1201 	 * same process. We break when we find a second covering sleeping
1202 	 * lock l5 owned by a process different from that owning l1, because
1203 	 * there cannot be any of l2, l3, l4, etc., arrived before l5, and if
1204 	 * it has l1 would have produced a deadlock already.
1205 	 */
1206 
1207 	if (lock) {
1208 		do {
1209 			if (BLOCKS(lock, request)) {
1210 				if (!request_will_wait)
1211 					return (EAGAIN);
1212 				if (COVERS(lock, request) &&
1213 				    lock->l_type == F_WRLCK) {
1214 					if (found_covering_lock &&
1215 					    !SAME_OWNER(lock, covered_by)) {
1216 						found_covering_lock++;
1217 						break;
1218 					}
1219 					found_covering_lock = 1;
1220 					covered_by = lock;
1221 				}
1222 				if (found_covering_lock &&
1223 				    !SAME_OWNER(lock, covered_by)) {
1224 					lock = lock->l_next;
1225 					continue;
1226 				}
				/*
				 * The cycle check is requested only while no
				 * covering lock has been found.
				 */
1227 				if ((error = flk_add_edge(request, lock,
1228 				    !found_covering_lock, 0)))
1229 					return (error);
1230 			}
1231 			lock = lock->l_next;
1232 		} while (lock->l_vnode == vp);
1233 	}
1234 
1235 /*
1236  * found_covering_lock == 2 iff at this point 'request' has paths
1237  * to all locks that blocks 'request'. found_covering_lock == 1 iff at this
1238  * point 'request' has paths to all locks that blocks 'request' whose owners
1239  * are not same as the one that covers 'request' (covered_by above) and
1240  * we can have locks whose owner is same as covered_by in the active list.
1241  */
1242 
1243 	if (request_blocked_by_active && found_covering_lock != 2) {
1244 		SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
1245 		ASSERT(lock != NULL);
1246 		do {
1247 			if (BLOCKS(lock, request)) {
1248 				if (found_covering_lock &&
1249 				    !SAME_OWNER(lock, covered_by)) {
1250 					lock = lock->l_next;
1251 					continue;
1252 				}
1253 				if ((error = flk_add_edge(request, lock,
1254 				    CHECK_CYCLE, 0)))
1255 					return (error);
1256 			}
1257 			lock = lock->l_next;
1258 		} while (lock->l_vnode == vp);
1259 	}
1260 
1261 	if (NOT_BLOCKED(request)) {
1262 		/*
1263 		 * request not dependent on any other locks
1264 		 * so execute this request
1265 		 */
1266 		return (flk_execute_request(request));
1267 	} else {
1268 		/*
1269 		 * check for deadlock
1270 		 */
1271 		if (flk_check_deadlock(request))
1272 			return (EDEADLK);
1273 		/*
1274 		 * this thread has to sleep
1275 		 */
1276 		return (flk_wait_execute_request(request));
1277 	}
1278 }
1279 
1280 /*
1281  * The actual execution of the request in the simple case is only to
1282  * insert the 'request' in the list of active locks if it is not an
1283  * UNLOCK.
1284  * We have to consider the existing active locks' relation to
1285  * this 'request' if they are owned by same process. flk_relation() does
1286  * this job and sees to that the dependency graph information is maintained
1287  * properly.
1288  */
1289 
1290 int
1291 flk_execute_request(lock_descriptor_t *request)
1292 {
1293 	graph_t	*gp = request->l_graph;
1294 	vnode_t	*vp = request->l_vnode;
1295 	lock_descriptor_t	*lock, *lock1;
1296 	int done_searching = 0;
1297 
1298 	CHECK_SLEEPING_LOCKS(gp);
1299 	CHECK_ACTIVE_LOCKS(gp);
1300 
1301 	ASSERT(MUTEX_HELD(&gp->gp_mutex));
1302 
1303 	flk_set_state(request, FLK_START_STATE);
1304 
1305 	ASSERT(NOT_BLOCKED(request));
1306 
1307 	/* IO_LOCK requests are only to check status */
1308 
1309 	if (IS_IO_LOCK(request))
1310 		return (0);
1311 
1312 	SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
1313 
1314 	if (lock == NULL && request->l_type == F_UNLCK)
1315 		return (0);
1316 	if (lock == NULL) {
1317 		flk_insert_active_lock(request);
1318 		return (0);
1319 	}
1320 
1321 	do {
1322 		lock1 = lock->l_next;
1323 		if (SAME_OWNER(request, lock)) {
1324 			done_searching = flk_relation(lock, request);
1325 		}
1326 		lock = lock1;
1327 	} while (lock->l_vnode == vp && !done_searching);
1328 
1329 	/*
1330 	 * insert in active queue
1331 	 */
1332 
1333 	if (request->l_type != F_UNLCK)
1334 		flk_insert_active_lock(request);
1335 
1336 	return (0);
1337 }
1338 
1339 /*
1340  * 'request' is blocked by someone; therefore we put it into the sleep queue.
      *
      * The caller must hold the graph mutex and the request must be willing
      * to sleep.  The mutex is dropped around the callbacks (if any) and is
      * held again on return.
      *
      * Returns ENOLCK when the request is a lock manager request and the
      * lock manager (or, on a cluster, the request's NLM state) is not up,
      * PXFS_LOCK_BLOCKED for PXFS requests (which do not sleep here), EINTR
      * when the request ended up interrupted, EBADF when it was cancelled,
      * and otherwise the result of flk_execute_request().  On the ENOLCK,
      * EINTR and EBADF paths the request has been passed to
      * flk_cancel_sleeping_lock().
1341  */
1342 static int
1343 flk_wait_execute_request(lock_descriptor_t *request)
1344 {
1345 	graph_t	*gp = request->l_graph;
1346 	callb_cpr_t 	*cprp;		/* CPR info from callback */
1347 	struct flock_globals *fg;
1348 	int index;
1349 
1350 	ASSERT(MUTEX_HELD(&gp->gp_mutex));
1351 	ASSERT(IS_WILLING_TO_SLEEP(request));
1352 
1353 	flk_insert_sleeping_lock(request);
1354 
	/* fg and index are assigned only for lock manager requests. */
1355 	if (IS_LOCKMGR(request)) {
1356 		index = HASH_INDEX(request->l_vnode);
1357 		fg = flk_get_globals();
1358 
1359 		if (nlm_status_size == 0) {	/* not booted as a cluster */
1360 			if (fg->lockmgr_status[index] != FLK_LOCKMGR_UP) {
1361 				flk_cancel_sleeping_lock(request, 1);
1362 				return (ENOLCK);
1363 			}
1364 		} else {			/* booted as a cluster */
1365 			/*
1366 			 * If the request is an NLM server lock request,
1367 			 * and the NLM state of the lock request is not
1368 			 * NLM_UP (because the NLM server is shutting
1369 			 * down), then cancel the sleeping lock and
1370 			 * return error ENOLCK that will encourage the
1371 			 * client to retransmit.
1372 			 */
1373 			if (!IS_NLM_UP(request)) {
1374 				flk_cancel_sleeping_lock(request, 1);
1375 				return (ENOLCK);
1376 			}
1377 		}
1378 	}
1379 
1380 	/* Clustering: For blocking PXFS locks, return */
1381 	if (IS_PXFS(request)) {
1382 		/*
1383 		 * PXFS locks sleep on the client side.
1384 		 * The callback argument is used to wake up the sleeper
1385 		 * when the lock is granted.
1386 		 * We return -1 (rather than an errno value) to indicate
1387 		 * the client side should sleep
1388 		 */
1389 		return (PXFS_LOCK_BLOCKED);
1390 	}
1391 
1392 	if (request->l_callbacks != NULL) {
1393 		/*
1394 		 * To make sure the shutdown code works correctly, either
1395 		 * the callback must happen after putting the lock on the
1396 		 * sleep list, or we must check the shutdown status after
1397 		 * returning from the callback (and before sleeping).  At
1398 		 * least for now, we'll use the first option.  If a
1399 		 * shutdown or signal or whatever happened while the graph
1400 		 * mutex was dropped, that will be detected by
1401 		 * wait_for_lock().
1402 		 */
1403 		mutex_exit(&gp->gp_mutex);
1404 
1405 		cprp = flk_invoke_callbacks(request->l_callbacks,
1406 		    FLK_BEFORE_SLEEP);
1407 
1408 		mutex_enter(&gp->gp_mutex);
1409 
1410 		if (cprp == NULL) {
1411 			wait_for_lock(request);
1412 		} else {
			/*
			 * The callbacks supplied CPR info: bracket the wait
			 * with CALLB_CPR_SAFE_BEGIN/END, each done under the
			 * CPR lock.
			 */
1413 			mutex_enter(cprp->cc_lockp);
1414 			CALLB_CPR_SAFE_BEGIN(cprp);
1415 			mutex_exit(cprp->cc_lockp);
1416 			wait_for_lock(request);
1417 			mutex_enter(cprp->cc_lockp);
1418 			CALLB_CPR_SAFE_END(cprp, cprp->cc_lockp);
1419 			mutex_exit(cprp->cc_lockp);
1420 		}
1421 
		/* The after-sleep callbacks also run without the graph mutex. */
1422 		mutex_exit(&gp->gp_mutex);
1423 		(void) flk_invoke_callbacks(request->l_callbacks,
1424 		    FLK_AFTER_SLEEP);
1425 		mutex_enter(&gp->gp_mutex);
1426 	} else {
1427 		wait_for_lock(request);
1428 	}
1429 
	/*
	 * fg and index are valid here: they were set above under the same
	 * IS_LOCKMGR() test.
	 */
1430 	if (IS_LOCKMGR(request)) {
1431 		/*
1432 		 * If the lock manager is shutting down, return an
1433 		 * error that will encourage the client to retransmit.
1434 		 */
1435 		if (fg->lockmgr_status[index] != FLK_LOCKMGR_UP &&
1436 		    !IS_GRANTED(request)) {
1437 			flk_cancel_sleeping_lock(request, 1);
1438 			return (ENOLCK);
1439 		}
1440 	}
1441 
1442 	if (IS_INTERRUPTED(request)) {
1443 		/* we got a signal, or act like we did */
1444 		flk_cancel_sleeping_lock(request, 1);
1445 		return (EINTR);
1446 	}
1447 
1448 	/* Cancelled if some other thread has closed the file */
1449 
1450 	if (IS_CANCELLED(request)) {
1451 		flk_cancel_sleeping_lock(request, 1);
1452 		return (EBADF);
1453 	}
1454 
	/*
	 * Neither interrupted nor cancelled: clear the granted flag, take
	 * the request off the sleep queue and execute it.
	 */
1455 	request->l_state &= ~GRANTED_LOCK;
1456 	REMOVE_SLEEP_QUEUE(request);
1457 	return (flk_execute_request(request));
1458 }
1459 
1460 /*
1461  * This routine adds an edge between from and to because from depends
1462  * on to. If asked to check for deadlock it checks whether there are any
1463  * reachable locks from "from_lock" that is owned by the same process
1464  * as "from_lock".
1465  * NOTE: It is the caller's responsibility to make sure that the color
1466  * of the graph is consistent between the calls to flk_add_edge as done
1467  * in flk_process_request. This routine does not color and check for
1468  * deadlock explicitly.
      *
      * No edge is added when to_lock is already colored.  When update_graph
      * is nonzero the new edge is passed to flk_update_proc_graph() and the
      * cycle check is skipped regardless of check_cycle.
      *
      * Returns 0 on success.  Returns EDEADLK when check_cycle is set and a
      * lock with the same owner as from_lock is reachable from it; in that
      * case every outgoing edge of from_lock has been removed and freed.
1469  */
1470 
1471 static int
1472 flk_add_edge(lock_descriptor_t *from_lock, lock_descriptor_t *to_lock,
1473 			int check_cycle, int update_graph)
1474 {
1475 	edge_t	*edge;
1476 	edge_t	*ep;
1477 	lock_descriptor_t	*vertex;
1478 	lock_descriptor_t *vertex_stack;
1479 
1480 	STACK_INIT(vertex_stack);
1481 
1482 	/*
1483 	 * if to vertex already has mark_color just return
1484 	 * don't add an edge as it is reachable from from vertex
1485 	 * before itself.
1486 	 */
1487 
1488 	if (COLORED(to_lock))
1489 		return (0);
1490 
1491 	edge = flk_get_edge();
1492 
1493 	/*
1494 	 * set the from and to vertex
1495 	 */
1496 
1497 	edge->from_vertex = from_lock;
1498 	edge->to_vertex = to_lock;
1499 
1500 	/*
1501 	 * put in adjacency list of from vertex
1502 	 */
1503 
1504 	from_lock->l_edge.edge_adj_next->edge_adj_prev = edge;
1505 	edge->edge_adj_next = from_lock->l_edge.edge_adj_next;
1506 	edge->edge_adj_prev = &from_lock->l_edge;
1507 	from_lock->l_edge.edge_adj_next = edge;
1508 
1509 	/*
1510 	 * put in list of to vertex
1511 	 */
1512 
1513 	to_lock->l_edge.edge_in_next->edge_in_prev = edge;
1514 	edge->edge_in_next = to_lock->l_edge.edge_in_next;
1515 	to_lock->l_edge.edge_in_next = edge;
1516 	edge->edge_in_prev = &to_lock->l_edge;
1517 
1518 
1519 	if (update_graph) {
1520 		flk_update_proc_graph(edge, 0);
1521 		return (0);
1522 	}
1523 	if (!check_cycle) {
1524 		return (0);
1525 	}
1526 
	/*
	 * Walk everything reachable from from_lock using an explicit stack
	 * threaded through l_stack.  Each vertex is colored when first seen,
	 * so it is visited at most once.
	 */
1527 	STACK_PUSH(vertex_stack, from_lock, l_stack);
1528 
1529 	while ((vertex = STACK_TOP(vertex_stack)) != NULL) {
1530 
1531 		STACK_POP(vertex_stack, l_stack);
1532 
1533 		for (ep = FIRST_ADJ(vertex);
1534 		    ep != HEAD(vertex);
1535 		    ep = NEXT_ADJ(ep)) {
1536 			if (COLORED(ep->to_vertex))
1537 				continue;
1538 			COLOR(ep->to_vertex);
1539 			if (SAME_OWNER(ep->to_vertex, from_lock))
1540 				goto dead_lock;
1541 			STACK_PUSH(vertex_stack, ep->to_vertex, l_stack);
1542 		}
1543 	}
1544 	return (0);
1545 
1546 dead_lock:
1547 
1548 	/*
1549 	 * remove all edges
1550 	 */
1551 
1552 	ep = FIRST_ADJ(from_lock);
1553 
1554 	while (ep != HEAD(from_lock)) {
1555 		IN_LIST_REMOVE(ep);
		/* save the successor in l_sedge before ep is unlinked and freed */
1556 		from_lock->l_sedge = NEXT_ADJ(ep);
1557 		ADJ_LIST_REMOVE(ep);
1558 		flk_free_edge(ep);
1559 		ep = from_lock->l_sedge;
1560 	}
1561 	return (EDEADLK);
1562 }
1563 
1564 /*
1565  * Get an edge structure for representing the dependency between two locks.
1566  */
1567 
1568 static edge_t *
1569 flk_get_edge()
1570 {
1571 	edge_t	*ep;
1572 
1573 	ASSERT(flk_edge_cache != NULL);
1574 
1575 	ep = kmem_cache_alloc(flk_edge_cache, KM_SLEEP);
1576 	edge_allocs++;
1577 	return (ep);
1578 }
1579 
1580 /*
1581  * Free the edge structure.
1582  */
1583 
1584 static void
1585 flk_free_edge(edge_t *ep)
1586 {
1587 	edge_frees++;
1588 	kmem_cache_free(flk_edge_cache, (void *)ep);
1589 }
1590 
1591 /*
1592  * Check the relationship of request with lock and perform the
1593  * recomputation of dependencies, break lock if required, and return
1594  * 1 if request cannot have any more relationship with the next
1595  * active locks.
1596  * The 'lock' and 'request' are compared and in case of overlap we
1597  * delete the 'lock' and form new locks to represent the non-overlapped
1598  * portion of original 'lock'. This function has side effects such as
1599  * 'lock' will be freed, new locks will be added to the active list.
      *
      * The caller must hold the mutex of lock's graph.  'lock' must be an
      * active lock with no outgoing edges.  The range of 'request' may be
      * widened here when it is merged with an adjacent or overlapping lock
      * of the same kind.
      *
      * Returns 1 when 'lock' lies wholly beyond the end of 'request' without
      * being merged into it, or when an unlock matches the range of 'lock'
      * exactly; returns 0 in every other case.
1600  */
1601 
1602 static int
1603 flk_relation(lock_descriptor_t *lock, lock_descriptor_t *request)
1604 {
1605 	int lock_effect;
1606 	lock_descriptor_t *lock1, *lock2;
1607 	lock_descriptor_t *topology[3];
1608 	int nvertex = 0;
1609 	int i;
1610 	edge_t	*ep;
1611 	graph_t	*gp = (lock->l_graph);
1612 
1613 
1614 	CHECK_SLEEPING_LOCKS(gp);
1615 	CHECK_ACTIVE_LOCKS(gp);
1616 
1617 	ASSERT(MUTEX_HELD(&gp->gp_mutex));
1618 
1619 	topology[0] = topology[1] = topology[2] = NULL;
1620 
	/* Classify what the request does to the type of the existing lock. */
1621 	if (request->l_type == F_UNLCK)
1622 		lock_effect = FLK_UNLOCK;
1623 	else if (request->l_type == F_RDLCK &&
1624 	    lock->l_type == F_WRLCK)
1625 		lock_effect = FLK_DOWNGRADE;
1626 	else if (request->l_type == F_WRLCK &&
1627 	    lock->l_type == F_RDLCK)
1628 		lock_effect = FLK_UPGRADE;
1629 	else
1630 		lock_effect = FLK_STAY_SAME;
1631 
	/*
	 * 'lock' ends before 'request' starts.  If the two are directly
	 * adjacent and of the same type, extend the request backwards over
	 * the lock; otherwise there is nothing to do for this lock.
	 */
1632 	if (lock->l_end < request->l_start) {
1633 		if (lock->l_end == request->l_start - 1 &&
1634 		    lock_effect == FLK_STAY_SAME) {
1635 			topology[0] = request;
1636 			request->l_start = lock->l_start;
1637 			nvertex = 1;
1638 			goto recompute;
1639 		} else {
1640 			return (0);
1641 		}
1642 	}
1643 
	/*
	 * 'lock' starts after 'request' ends.  Merge when directly adjacent
	 * and of the same type; otherwise tell the caller to stop searching.
	 */
1644 	if (lock->l_start > request->l_end) {
1645 		if (request->l_end == lock->l_start - 1 &&
1646 		    lock_effect == FLK_STAY_SAME) {
1647 			topology[0] = request;
1648 			request->l_end = lock->l_end;
1649 			nvertex = 1;
1650 			goto recompute;
1651 		} else {
1652 			return (1);
1653 		}
1654 	}
1655 
	/*
	 * The ranges overlap.  The cases below are split first on how the
	 * two end offsets compare and then on how the start offsets compare.
	 * With FLK_STAY_SAME the request is grown to cover the union;
	 * otherwise new locks (copies of 'lock') are created for the parts
	 * of 'lock' that the request does not cover.  'request' is always
	 * the last entry placed in topology[].
	 */
1656 	if (request->l_end < lock->l_end) {
1657 		if (request->l_start > lock->l_start) {
1658 			if (lock_effect == FLK_STAY_SAME) {
1659 				request->l_start = lock->l_start;
1660 				request->l_end = lock->l_end;
1661 				topology[0] = request;
1662 				nvertex = 1;
1663 			} else {
				/*
				 * request lies strictly inside lock: keep a
				 * piece of lock on either side of it.
				 */
1664 				lock1 = flk_get_lock();
1665 				lock2 = flk_get_lock();
1666 				COPY(lock1, lock);
1667 				COPY(lock2, lock);
1668 				lock1->l_start = lock->l_start;
1669 				lock1->l_end = request->l_start - 1;
1670 				lock2->l_start = request->l_end + 1;
1671 				lock2->l_end = lock->l_end;
1672 				topology[0] = lock1;
1673 				topology[1] = lock2;
1674 				topology[2] = request;
1675 				nvertex = 3;
1676 			}
1677 		} else if (request->l_start < lock->l_start) {
1678 			if (lock_effect == FLK_STAY_SAME) {
1679 				request->l_end = lock->l_end;
1680 				topology[0] = request;
1681 				nvertex = 1;
1682 			} else {
1683 				lock1 = flk_get_lock();
1684 				COPY(lock1, lock);
1685 				lock1->l_start = request->l_end + 1;
1686 				topology[0] = lock1;
1687 				topology[1] = request;
1688 				nvertex = 2;
1689 			}
1690 		} else  {
1691 			if (lock_effect == FLK_STAY_SAME) {
1692 				request->l_start = lock->l_start;
1693 				request->l_end = lock->l_end;
1694 				topology[0] = request;
1695 				nvertex = 1;
1696 			} else {
1697 				lock1 = flk_get_lock();
1698 				COPY(lock1, lock);
1699 				lock1->l_start = request->l_end + 1;
1700 				topology[0] = lock1;
1701 				topology[1] = request;
1702 				nvertex = 2;
1703 			}
1704 		}
1705 	} else if (request->l_end > lock->l_end) {
1706 		if (request->l_start > lock->l_start)  {
1707 			if (lock_effect == FLK_STAY_SAME) {
1708 				request->l_start = lock->l_start;
1709 				topology[0] = request;
1710 				nvertex = 1;
1711 			} else {
1712 				lock1 = flk_get_lock();
1713 				COPY(lock1, lock);
1714 				lock1->l_end = request->l_start - 1;
1715 				topology[0] = lock1;
1716 				topology[1] = request;
1717 				nvertex = 2;
1718 			}
1719 		} else if (request->l_start < lock->l_start)  {
1720 			topology[0] = request;
1721 			nvertex = 1;
1722 		} else {
1723 			topology[0] = request;
1724 			nvertex = 1;
1725 		}
1726 	} else {
1727 		if (request->l_start > lock->l_start) {
1728 			if (lock_effect == FLK_STAY_SAME) {
1729 				request->l_start = lock->l_start;
1730 				topology[0] = request;
1731 				nvertex = 1;
1732 			} else {
1733 				lock1 = flk_get_lock();
1734 				COPY(lock1, lock);
1735 				lock1->l_end = request->l_start - 1;
1736 				topology[0] = lock1;
1737 				topology[1] = request;
1738 				nvertex = 2;
1739 			}
1740 		} else if (request->l_start < lock->l_start) {
1741 			topology[0] = request;
1742 			nvertex = 1;
1743 		} else {
1744 			if (lock_effect !=  FLK_UNLOCK) {
1745 				topology[0] = request;
1746 				nvertex = 1;
1747 			} else {
				/*
				 * An unlock of exactly this range: drop the
				 * lock, wake up its dependents and finish.
				 */
1748 				flk_delete_active_lock(lock, 0);
1749 				flk_wakeup(lock, 1);
1750 				flk_free_lock(lock);
1751 				CHECK_SLEEPING_LOCKS(gp);
1752 				CHECK_ACTIVE_LOCKS(gp);
1753 				return (1);
1754 			}
1755 		}
1756 	}
1757 
1758 recompute:
1759 
1760 	/*
1761 	 * For unlock we don't send the 'request' for recomputing
1762 	 * dependencies because no lock will add an edge to this.
1763 	 */
1764 
	/* 'request' is the last entry in topology[], so drop that one. */
1765 	if (lock_effect == FLK_UNLOCK) {
1766 		topology[nvertex-1] = NULL;
1767 		nvertex--;
1768 	}
1769 	for (i = 0; i < nvertex; i++) {
1770 		topology[i]->l_state |= RECOMPUTE_LOCK;
1771 		topology[i]->l_color = NO_COLOR;
1772 	}
1773 
1774 	ASSERT(FIRST_ADJ(lock) == HEAD(lock));
1775 
1776 	/*
1777 	 * we remove the adjacent edges for all vertices' to this vertex
1778 	 * 'lock'.
1779 	 */
1780 
1781 	ep = FIRST_IN(lock);
1782 	while (ep != HEAD(lock)) {
1783 		ADJ_LIST_REMOVE(ep);
1784 		ep = NEXT_IN(ep);
1785 	}
1786 
1787 	flk_delete_active_lock(lock, 0);
1788 
1789 	/* We are ready for recomputing the dependencies now */
1790 
1791 	flk_recompute_dependencies(lock, topology, nvertex, 1);
1792 
1793 	for (i = 0; i < nvertex; i++) {
1794 		topology[i]->l_state &= ~RECOMPUTE_LOCK;
1795 		topology[i]->l_color = NO_COLOR;
1796 	}
1797 
1798 
	/*
	 * Insert everything except the final slot, which is where 'request'
	 * sits (it is not inserted here).  For an unlock that slot was
	 * removed above, so the count is bumped back up to keep the same
	 * loop bound covering all of the new locks.
	 */
1799 	if (lock_effect == FLK_UNLOCK) {
1800 		nvertex++;
1801 	}
1802 	for (i = 0; i < nvertex - 1; i++) {
1803 		flk_insert_active_lock(topology[i]);
1804 	}
1805 
1806 
	/*
	 * A downgrade or unlock wakes up the locks that depended on 'lock';
	 * otherwise the remaining incoming edges are simply discarded.
	 */
1807 	if (lock_effect == FLK_DOWNGRADE || lock_effect == FLK_UNLOCK) {
1808 		flk_wakeup(lock, 0);
1809 	} else {
1810 		ep = FIRST_IN(lock);
1811 		while (ep != HEAD(lock)) {
1812 			lock->l_sedge = NEXT_IN(ep);
1813 			IN_LIST_REMOVE(ep);
1814 			flk_update_proc_graph(ep, 1);
1815 			flk_free_edge(ep);
1816 			ep = lock->l_sedge;
1817 		}
1818 	}
1819 	flk_free_lock(lock);
1820 
1821 	CHECK_SLEEPING_LOCKS(gp);
1822 	CHECK_ACTIVE_LOCKS(gp);
1823 	return (0);
1824 }
1825 
1826 /*
1827  * Insert a lock into the active queue.
1828  */
1829 
1830 static void
1831 flk_insert_active_lock(lock_descriptor_t *new_lock)
1832 {
1833 	graph_t	*gp = new_lock->l_graph;
1834 	vnode_t	*vp = new_lock->l_vnode;
1835 	lock_descriptor_t *first_lock, *lock;
1836 
1837 	ASSERT(MUTEX_HELD(&gp->gp_mutex));
1838 
1839 	SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
1840 	first_lock = lock;
1841 
1842 	if (first_lock != NULL) {
1843 		for (; (lock->l_vnode == vp &&
1844 		    lock->l_start < new_lock->l_start); lock = lock->l_next)
1845 			;
1846 	} else {
1847 		lock = ACTIVE_HEAD(gp);
1848 	}
1849 
1850 	lock->l_prev->l_next = new_lock;
1851 	new_lock->l_next = lock;
1852 	new_lock->l_prev = lock->l_prev;
1853 	lock->l_prev = new_lock;
1854 
1855 	if (first_lock == NULL || (new_lock->l_start <= first_lock->l_start)) {
1856 		vp->v_filocks = (struct filock *)new_lock;
1857 	}
1858 	flk_set_state(new_lock, FLK_ACTIVE_STATE);
1859 	new_lock->l_state |= ACTIVE_LOCK;
1860 
1861 	CHECK_ACTIVE_LOCKS(gp);
1862 	CHECK_SLEEPING_LOCKS(gp);
1863 }
1864 
1865 /*
1866  * Delete the active lock : Performs two functions depending on the
1867  * value of second parameter. One is to remove from the active lists
1868  * only and other is to both remove and free the lock.
1869  */
1870 
1871 static void
1872 flk_delete_active_lock(lock_descriptor_t *lock, int free_lock)
1873 {
1874 	vnode_t *vp = lock->l_vnode;
1875 	graph_t	*gp = lock->l_graph;
1876 
1877 	ASSERT(MUTEX_HELD(&gp->gp_mutex));
1878 	if (free_lock)
1879 		ASSERT(NO_DEPENDENTS(lock));
1880 	ASSERT(NOT_BLOCKED(lock));
1881 	ASSERT(IS_ACTIVE(lock));
1882 
1883 	ASSERT((vp->v_filocks != NULL));
1884 
1885 	if (vp->v_filocks == (struct filock *)lock) {
1886 		vp->v_filocks = (struct filock *)
1887 		    ((lock->l_next->l_vnode == vp) ? lock->l_next :
1888 		    NULL);
1889 	}
1890 	lock->l_next->l_prev = lock->l_prev;
1891 	lock->l_prev->l_next = lock->l_next;
1892 	lock->l_next = lock->l_prev = NULL;
1893 	flk_set_state(lock, FLK_DEAD_STATE);
1894 	lock->l_state &= ~ACTIVE_LOCK;
1895 
1896 	if (free_lock)
1897 		flk_free_lock(lock);
1898 	CHECK_ACTIVE_LOCKS(gp);
1899 	CHECK_SLEEPING_LOCKS(gp);
1900 }
1901 
1902 /*
1903  * Insert into the sleep queue.
1904  */
1905 
1906 static void
1907 flk_insert_sleeping_lock(lock_descriptor_t *request)
1908 {
1909 	graph_t *gp = request->l_graph;
1910 	vnode_t	*vp = request->l_vnode;
1911 	lock_descriptor_t	*lock;
1912 
1913 	ASSERT(MUTEX_HELD(&gp->gp_mutex));
1914 	ASSERT(IS_INITIAL(request));
1915 
1916 	for (lock = gp->sleeping_locks.l_next; (lock != &gp->sleeping_locks &&
1917 	    lock->l_vnode < vp); lock = lock->l_next)
1918 		;
1919 
1920 	lock->l_prev->l_next = request;
1921 	request->l_prev = lock->l_prev;
1922 	lock->l_prev = request;
1923 	request->l_next = lock;
1924 	flk_set_state(request, FLK_SLEEPING_STATE);
1925 	request->l_state |= SLEEPING_LOCK;
1926 }
1927 
1928 /*
1929  * Cancelling a sleeping lock implies removing a vertex from the
1930  * dependency graph and therefore we should recompute the dependencies
1931  * of all vertices that have a path  to this vertex, w.r.t. all
1932  * vertices reachable from this vertex.
      *
      * The caller must hold the graph mutex.  When remove_from_queue is
      * nonzero the request is also taken off the sleep queue.  On return
      * the request is in the dead state and has been passed to
      * flk_free_lock(), so the caller must not use it afterwards.
1933  */
1934 
1935 void
1936 flk_cancel_sleeping_lock(lock_descriptor_t *request, int remove_from_queue)
1937 {
1938 	graph_t	*gp = request->l_graph;
1939 	vnode_t *vp = request->l_vnode;
1940 	lock_descriptor_t **topology = NULL;
1941 	edge_t	*ep;
1942 	lock_descriptor_t *vertex, *lock;
1943 	int nvertex = 0;
1944 	int i;
1945 	lock_descriptor_t *vertex_stack;
1946 
1947 	STACK_INIT(vertex_stack);
1948 
1949 	ASSERT(MUTEX_HELD(&gp->gp_mutex));
1950 	/*
1951 	 * count number of vertex pointers that has to be allocated
1952 	 * All vertices that are reachable from request.
1953 	 */
1954 
1955 	STACK_PUSH(vertex_stack, request, l_stack);
1956 
	/*
	 * Each reachable vertex is flagged RECOMPUTE_LOCK the first time it
	 * is seen, so it is counted and pushed only once.
	 */
1957 	while ((vertex = STACK_TOP(vertex_stack)) != NULL) {
1958 		STACK_POP(vertex_stack, l_stack);
1959 		for (ep = FIRST_ADJ(vertex); ep != HEAD(vertex);
1960 		    ep = NEXT_ADJ(ep)) {
1961 			if (IS_RECOMPUTE(ep->to_vertex))
1962 				continue;
1963 			ep->to_vertex->l_state |= RECOMPUTE_LOCK;
1964 			STACK_PUSH(vertex_stack, ep->to_vertex, l_stack);
1965 			nvertex++;
1966 		}
1967 	}
1968 
1969 	/*
1970 	 * allocate memory for holding the vertex pointers
1971 	 */
1972 
1973 	if (nvertex) {
1974 		topology = kmem_zalloc(nvertex * sizeof (lock_descriptor_t *),
1975 		    KM_SLEEP);
1976 	}
1977 
1978 	/*
1979 	 * one more pass to actually store the vertices in the
1980 	 * allocated array.
1981 	 * We first check sleeping locks and then active locks
1982 	 * so that topology array will be in a topological
1983 	 * order.
1984 	 */
1985 
1986 	nvertex = 0;
1987 	SET_LOCK_TO_FIRST_SLEEP_VP(gp, lock, vp);
1988 
1989 	if (lock) {
1990 		do {
1991 			if (IS_RECOMPUTE(lock)) {
1992 				lock->l_index = nvertex;
1993 				topology[nvertex++] = lock;
1994 			}
1995 			lock->l_color = NO_COLOR;
1996 			lock = lock->l_next;
1997 		} while (lock->l_vnode == vp);
1998 	}
1999 
2000 	SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
2001 
2002 	if (lock) {
2003 		do {
2004 			if (IS_RECOMPUTE(lock)) {
2005 				lock->l_index = nvertex;
2006 				topology[nvertex++] = lock;
2007 			}
2008 			lock->l_color = NO_COLOR;
2009 			lock = lock->l_next;
2010 		} while (lock->l_vnode == vp);
2011 	}
2012 
2013 	/*
2014 	 * remove in and out edges of request
2015 	 * They are freed after updating proc_graph below.
2016 	 */
2017 
2018 	for (ep = FIRST_IN(request); ep != HEAD(request); ep = NEXT_IN(ep)) {
2019 		ADJ_LIST_REMOVE(ep);
2020 	}
2021 
2022 
2023 	if (remove_from_queue)
2024 		REMOVE_SLEEP_QUEUE(request);
2025 
2026 	/* we are ready to recompute */
2027 
2028 	flk_recompute_dependencies(request, topology, nvertex, 1);
2029 
	/*
	 * Unlink and free every outgoing edge of request.  The successor is
	 * saved in l_sedge before the current edge is freed.
	 */
2030 	ep = FIRST_ADJ(request);
2031 	while (ep != HEAD(request)) {
2032 		IN_LIST_REMOVE(ep);
2033 		request->l_sedge = NEXT_ADJ(ep);
2034 		ADJ_LIST_REMOVE(ep);
2035 		flk_update_proc_graph(ep, 1);
2036 		flk_free_edge(ep);
2037 		ep = request->l_sedge;
2038 	}
2039 
2040 
2041 	/*
2042 	 * unset the RECOMPUTE flag in those vertices
2043 	 */
2044 
2045 	for (i = 0; i < nvertex; i++) {
2046 		topology[i]->l_state &= ~RECOMPUTE_LOCK;
2047 	}
2048 
2049 	/*
2050 	 * free the topology
2051 	 */
2052 	if (nvertex)
2053 		kmem_free((void *)topology,
2054 		    (nvertex * sizeof (lock_descriptor_t *)));
2055 	/*
2056 	 * Possibility of some locks unblocked now
2057 	 */
2058 
2059 	flk_wakeup(request, 0);
2060 
2061 	/*
2062 	 * we expect to have a correctly recomputed graph  now.
2063 	 */
2064 	flk_set_state(request, FLK_DEAD_STATE);
2065 	flk_free_lock(request);
2066 	CHECK_SLEEPING_LOCKS(gp);
2067 	CHECK_ACTIVE_LOCKS(gp);
2068 
2069 }
2070 
2071 /*
2072  * Uncoloring the graph is simply to increment the mark value of the graph
2073  * And only when wrap round takes place will we color all vertices in
2074  * the graph explicitly.
2075  */
2076 
2077 static void
2078 flk_graph_uncolor(graph_t *gp)
2079 {
2080 	lock_descriptor_t *lock;
2081 
2082 	if (gp->mark == UINT_MAX) {
2083 		gp->mark = 1;
2084 	for (lock = ACTIVE_HEAD(gp)->l_next; lock != ACTIVE_HEAD(gp);
2085 	    lock = lock->l_next)
2086 			lock->l_color  = 0;
2087 
2088 	for (lock = SLEEPING_HEAD(gp)->l_next; lock != SLEEPING_HEAD(gp);
2089 	    lock = lock->l_next)
2090 			lock->l_color  = 0;
2091 	} else {
2092 		gp->mark++;
2093 	}
2094 }
2095 
2096 /*
2097  * Wake up locks that are blocked on the given lock.
2098  */
2099 
2100 static void
2101 flk_wakeup(lock_descriptor_t *lock, int adj_list_remove)
2102 {
2103 	edge_t	*ep;
2104 	graph_t	*gp = lock->l_graph;
2105 	lock_descriptor_t	*lck;
2106 
2107 	ASSERT(MUTEX_HELD(&gp->gp_mutex));
2108 	if (NO_DEPENDENTS(lock))
2109 		return;
2110 	ep = FIRST_IN(lock);
2111 	do {
2112 		/*
2113 		 * delete the edge from the adjacency list
2114 		 * of from vertex. if no more adjacent edges
2115 		 * for this vertex wake this process.
2116 		 */
2117 		lck = ep->from_vertex;
2118 		if (adj_list_remove)
2119 			ADJ_LIST_REMOVE(ep);
2120 		flk_update_proc_graph(ep, 1);
2121 		if (NOT_BLOCKED(lck)) {
2122 			GRANT_WAKEUP(lck);
2123 		}
2124 		lock->l_sedge = NEXT_IN(ep);
2125 		IN_LIST_REMOVE(ep);
2126 		flk_free_edge(ep);
2127 		ep = lock->l_sedge;
2128 	} while (ep != HEAD(lock));
2129 	ASSERT(NO_DEPENDENTS(lock));
2130 }
2131 
2132 /*
2133  * Each dependent of request is checked for its dependency against the
2134  * locks in topology (called topology because the array is, and should be, in
2135  * topological order for this algorithm; if it is not in topological order the
2136  * inner loop below might add more edges than necessary. Topological ordering
2137  * of vertices satisfies the property that all edges will be from left to
2138  * right, i.e., topology[i] can have an edge to topology[j] iff i < j).
2139  * If lock l1 in the dependent set of request is dependent on (blocked by)
2140  * lock l2 in topology but does not have a path to it, we add an edge
2141  * in the inner loop below.
2142  *
2143  * We don't want to add an edge between l1 and l2 if there already exists
2144  * a path from l1 to l2, so care has to be taken for those vertices
2145  * that have two paths to 'request'. These vertices are referred to here
2146  * as barrier locks.
2147  *
2148  * The barriers have to be found (those vertices that originally had two
2149  * paths to request) because otherwise we may end up adding edges
2150  * unnecessarily to vertices in topology, and thus a barrier vertex could
2151  * have an edge to a vertex in topology as well as a path to it.
2152  */
/*
 * Arguments:
 *	request		vertex whose dependents (vertices reached by following
 *			in-edges) are re-examined.
 *	topology	array of candidate blocking locks.
 *	nvertex		number of entries in topology; nothing is done if 0.
 *	update_graph	passed through unchanged to flk_add_edge().
 *
 * The caller must hold the graph mutex.
 */
2153 
2154 static void
2155 flk_recompute_dependencies(lock_descriptor_t *request,
2156 		lock_descriptor_t **topology,
2157 			int nvertex, int update_graph)
2158 {
2159 	lock_descriptor_t *vertex, *lock;
2160 	graph_t	*gp = request->l_graph;
2161 	int i, count;
2162 	int barrier_found = 0;
2163 	edge_t	*ep;
2164 	lock_descriptor_t *vertex_stack;
2165 
2166 	STACK_INIT(vertex_stack);
2167 
2168 	ASSERT(MUTEX_HELD(&gp->gp_mutex));
	/* With an empty topology there is nothing to add edges to. */
2169 	if (nvertex == 0)
2170 		return;
	/*
	 * flk_find_barriers() uses the vertex coloring to notice vertices it
	 * reaches more than once, so it must start from an uncolored graph.
	 */
2171 	flk_graph_uncolor(request->l_graph);
2172 	barrier_found = flk_find_barriers(request);
	/*
	 * Mark request itself as done so the loop below never computes
	 * edges for it and only walks its in-edges.
	 */
2173 	request->l_state |= RECOMPUTE_DONE;
2174 
	/*
	 * Depth-first walk along in-edges using an explicit stack linked
	 * through l_stack.  Each vertex's l_sedge is the cursor to the next
	 * in-edge of that vertex still to be followed.
	 */
2175 	STACK_PUSH(vertex_stack, request, l_stack);
2176 	request->l_sedge = FIRST_IN(request);
2177 
2178 
2179 	while ((vertex = STACK_TOP(vertex_stack)) != NULL) {
		/*
		 * A vertex seen on top of the stack with RECOMPUTE_DONE
		 * already set has been handled on an earlier iteration (or is
		 * request): skip the edge computation and just continue with
		 * its remaining in-edges.  count is zeroed so that the
		 * "count == nvertex" test below cannot fire for it.
		 */
2180 		if (vertex->l_state & RECOMPUTE_DONE) {
2181 			count = 0;
2182 			goto next_in_edge;
2183 		}
2184 		if (IS_BARRIER(vertex)) {
2185 			/* decrement the barrier count */
2186 			if (vertex->l_index) {
2187 				vertex->l_index--;
2188 				/* this guy will be pushed again anyway ? */
2189 				STACK_POP(vertex_stack, l_stack);
2190 				if (vertex->l_index == 0)  {
2191 				/*
2192 				 * barrier is over we can recompute
2193 				 * dependencies for this lock in the
2194 				 * next stack pop
2195 				 */
2196 					vertex->l_state &= ~BARRIER_LOCK;
2197 				}
2198 				continue;
2199 			}
2200 		}
2201 		vertex->l_state |= RECOMPUTE_DONE;
		/*
		 * Color everything already reachable from vertex; count is the
		 * number of newly colored vertices with the RECOMPUTE flag.
		 */
2202 		flk_graph_uncolor(gp);
2203 		count = flk_color_reachables(vertex);
		/*
		 * Add an edge to every topology lock that blocks vertex and
		 * is not yet reachable (colored), then color that lock and
		 * everything reachable from it so no redundant edge is added.
		 */
2204 		for (i = 0; i < nvertex; i++) {
2205 			lock = topology[i];
2206 			if (COLORED(lock))
2207 				continue;
2208 			if (BLOCKS(lock, vertex)) {
2209 				(void) flk_add_edge(vertex, lock,
2210 				    NO_CHECK_CYCLE, update_graph);
2211 				COLOR(lock);
2212 				count++;
2213 				count += flk_color_reachables(lock);
2214 			}
2215 
2216 		}
2217 
2218 next_in_edge:
		/*
		 * Stop descending from this vertex either when the count
		 * equals nvertex or when it has no in-edge left to follow.
		 */
2219 		if (count == nvertex ||
2220 		    vertex->l_sedge == HEAD(vertex)) {
2221 			/* prune the tree below this */
2222 			STACK_POP(vertex_stack, l_stack);
2223 			vertex->l_state &= ~RECOMPUTE_DONE;
2224 			/* update the barrier locks below this! */
			/*
			 * Only needed when in-edges are being skipped and
			 * flk_find_barriers() reported at least one barrier.
			 */
2225 			if (vertex->l_sedge != HEAD(vertex) && barrier_found) {
2226 				flk_graph_uncolor(gp);
2227 				flk_update_barriers(vertex);
2228 			}
2229 			continue;
2230 		}
2231 
		/*
		 * Descend: push the vertex at the tail of the next in-edge
		 * and advance this vertex's cursor past that edge.
		 */
2232 		ep = vertex->l_sedge;
2233 		lock = ep->from_vertex;
2234 		STACK_PUSH(vertex_stack, lock, l_stack);
2235 		lock->l_sedge = FIRST_IN(lock);
2236 		vertex->l_sedge = NEXT_IN(ep);
2237 	}
2238 
2239 }
2240 
2241 /*
2242  * Color all reachable vertices from vertex that belongs to topology (here
2243  * those that have RECOMPUTE_LOCK set in their state) and yet uncolored.
2244  *
2245  * Note: we need to use a different stack_link l_stack1 because this is
2246  * called from flk_recompute_dependencies() that already uses a stack with
2247  * l_stack as stack_link.
2248  */
2249 
2250 static int
2251 flk_color_reachables(lock_descriptor_t *vertex)
2252 {
2253 	lock_descriptor_t *ver, *lock;
2254 	int count;
2255 	edge_t	*ep;
2256 	lock_descriptor_t *vertex_stack;
2257 
2258 	STACK_INIT(vertex_stack);
2259 
2260 	STACK_PUSH(vertex_stack, vertex, l_stack1);
2261 	count = 0;
2262 	while ((ver = STACK_TOP(vertex_stack)) != NULL) {
2263 
2264 		STACK_POP(vertex_stack, l_stack1);
2265 		for (ep = FIRST_ADJ(ver); ep != HEAD(ver);
2266 		    ep = NEXT_ADJ(ep)) {
2267 			lock = ep->to_vertex;
2268 			if (COLORED(lock))
2269 				continue;
2270 			COLOR(lock);
2271 			if (IS_RECOMPUTE(lock))
2272 				count++;
2273 			STACK_PUSH(vertex_stack, lock, l_stack1);
2274 		}
2275 
2276 	}
2277 	return (count);
2278 }
2279 
2280 /*
2281  * Called from flk_recompute_dependencies() this routine decrements
2282  * the barrier count of barrier vertices that are reachable from lock.
2283  */
2284 
2285 static void
2286 flk_update_barriers(lock_descriptor_t *lock)
2287 {
2288 	lock_descriptor_t *vertex, *lck;
2289 	edge_t	*ep;
2290 	lock_descriptor_t *vertex_stack;
2291 
2292 	STACK_INIT(vertex_stack);
2293 
2294 	STACK_PUSH(vertex_stack, lock, l_stack1);
2295 
2296 	while ((vertex = STACK_TOP(vertex_stack)) != NULL) {
2297 		STACK_POP(vertex_stack, l_stack1);
2298 		for (ep = FIRST_IN(vertex); ep != HEAD(vertex);
2299 		    ep = NEXT_IN(ep)) {
2300 			lck = ep->from_vertex;
2301 			if (COLORED(lck)) {
2302 				if (IS_BARRIER(lck)) {
2303 					ASSERT(lck->l_index > 0);
2304 					lck->l_index--;
2305 					if (lck->l_index == 0)
2306 						lck->l_state &= ~BARRIER_LOCK;
2307 				}
2308 				continue;
2309 			}
2310 			COLOR(lck);
2311 			if (IS_BARRIER(lck)) {
2312 				ASSERT(lck->l_index > 0);
2313 				lck->l_index--;
2314 				if (lck->l_index == 0)
2315 					lck->l_state &= ~BARRIER_LOCK;
2316 			}
2317 			STACK_PUSH(vertex_stack, lck, l_stack1);
2318 		}
2319 	}
2320 }
2321 
2322 /*
2323  * Finds all vertices that are reachable from 'lock' more than once and
2324  * mark them as barrier vertices and increment their barrier count.
2325  * The barrier count is one minus the total number of paths from lock
2326  * to that vertex.
2327  */
2328 
2329 static int
2330 flk_find_barriers(lock_descriptor_t *lock)
2331 {
2332 	lock_descriptor_t *vertex, *lck;
2333 	int found = 0;
2334 	edge_t	*ep;
2335 	lock_descriptor_t *vertex_stack;
2336 
2337 	STACK_INIT(vertex_stack);
2338 
2339 	STACK_PUSH(vertex_stack, lock, l_stack1);
2340 
2341 	while ((vertex = STACK_TOP(vertex_stack)) != NULL) {
2342 		STACK_POP(vertex_stack, l_stack1);
2343 		for (ep = FIRST_IN(vertex); ep != HEAD(vertex);
2344 		    ep = NEXT_IN(ep)) {
2345 			lck = ep->from_vertex;
2346 			if (COLORED(lck)) {
2347 				/* this is a barrier */
2348 				lck->l_state |= BARRIER_LOCK;
2349 				/* index will have barrier count */
2350 				lck->l_index++;
2351 				if (!found)
2352 					found = 1;
2353 				continue;
2354 			}
2355 			COLOR(lck);
2356 			lck->l_index = 0;
2357 			STACK_PUSH(vertex_stack, lck, l_stack1);
2358 		}
2359 	}
2360 	return (found);
2361 }
2362 
2363 /*
2364  * Finds the first lock that is mainly responsible for blocking this
2365  * request.  If there is no such lock, request->l_flock.l_type is set to
2366  * F_UNLCK.  Otherwise, request->l_flock is filled in with the particulars
2367  * of the blocking lock.
2368  *
2369  * Note: It is possible a request is blocked by a sleeping lock because
2370  * of the fairness policy used in flk_process_request() to construct the
2371  * dependencies. (see comments before flk_process_request()).
2372  */
2373 
2374 static void
2375 flk_get_first_blocking_lock(lock_descriptor_t *request)
2376 {
2377 	graph_t	*gp = request->l_graph;
2378 	vnode_t *vp = request->l_vnode;
2379 	lock_descriptor_t *lock, *blocker;
2380 
2381 	ASSERT(MUTEX_HELD(&gp->gp_mutex));
2382 	blocker = NULL;
2383 	SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
2384 
2385 	if (lock) {
2386 		do {
2387 			if (BLOCKS(lock, request)) {
2388 				blocker = lock;
2389 				break;
2390 			}
2391 			lock = lock->l_next;
2392 		} while (lock->l_vnode == vp);
2393 	}
2394 
2395 	if (blocker == NULL && request->l_flock.l_type == F_RDLCK) {
2396 		/*
2397 		 * No active lock is blocking this request, but if a read
2398 		 * lock is requested, it may also get blocked by a waiting
2399 		 * writer. So search all sleeping locks and see if there is
2400 		 * a writer waiting.
2401 		 */
2402 		SET_LOCK_TO_FIRST_SLEEP_VP(gp, lock, vp);
2403 		if (lock) {
2404 			do {
2405 				if (BLOCKS(lock, request)) {
2406 					blocker = lock;
2407 					break;
2408 				}
2409 				lock = lock->l_next;
2410 			} while (lock->l_vnode == vp);
2411 		}
2412 	}
2413 
2414 	if (blocker) {
2415 		report_blocker(blocker, request);
2416 	} else
2417 		request->l_flock.l_type = F_UNLCK;
2418 }
2419 
2420 /*
2421  * Get the graph_t structure associated with a vnode.
2422  * If 'initialize' is non-zero, and the graph_t structure for this vnode has
2423  * not yet been initialized, then a new element is allocated and returned.
2424  */
2425 graph_t *
2426 flk_get_lock_graph(vnode_t *vp, int initialize)
2427 {
2428 	graph_t *gp;
2429 	graph_t *gp_alloc = NULL;
2430 	int index = HASH_INDEX(vp);
2431 
2432 	if (initialize == FLK_USE_GRAPH) {
2433 		mutex_enter(&flock_lock);
2434 		gp = lock_graph[index];
2435 		mutex_exit(&flock_lock);
2436 		return (gp);
2437 	}
2438 
2439 	ASSERT(initialize == FLK_INIT_GRAPH);
2440 
2441 	if (lock_graph[index] == NULL) {
2442 
2443 		gp_alloc = kmem_zalloc(sizeof (graph_t), KM_SLEEP);
2444 
2445 		/* Initialize the graph */
2446 
2447 		gp_alloc->active_locks.l_next =
2448 		    gp_alloc->active_locks.l_prev =
2449 		    (lock_descriptor_t *)ACTIVE_HEAD(gp_alloc);
2450 		gp_alloc->sleeping_locks.l_next =
2451 		    gp_alloc->sleeping_locks.l_prev =
2452 		    (lock_descriptor_t *)SLEEPING_HEAD(gp_alloc);
2453 		gp_alloc->index = index;
2454 		mutex_init(&gp_alloc->gp_mutex, NULL, MUTEX_DEFAULT, NULL);
2455 	}
2456 
2457 	mutex_enter(&flock_lock);
2458 
2459 	gp = lock_graph[index];
2460 
2461 	/* Recheck the value within flock_lock */
2462 	if (gp == NULL) {
2463 		struct flock_globals *fg;
2464 
2465 		/* We must have previously allocated the graph_t structure */
2466 		ASSERT(gp_alloc != NULL);
2467 		lock_graph[index] = gp = gp_alloc;
2468 		/*
2469 		 * The lockmgr status is only needed if KLM is loaded.
2470 		 */
2471 		if (flock_zone_key != ZONE_KEY_UNINITIALIZED) {
2472 			fg = flk_get_globals();
2473 			fg->lockmgr_status[index] = fg->flk_lockmgr_status;
2474 		}
2475 	}
2476 
2477 	mutex_exit(&flock_lock);
2478 
2479 	if ((gp_alloc != NULL) && (gp != gp_alloc)) {
2480 		/* There was a race to allocate the graph_t and we lost */
2481 		mutex_destroy(&gp_alloc->gp_mutex);
2482 		kmem_free(gp_alloc, sizeof (graph_t));
2483 	}
2484 
2485 	return (gp);
2486 }
2487 
2488 /*
2489  * PSARC case 1997/292
2490  */
2491 int
2492 cl_flk_has_remote_locks_for_nlmid(vnode_t *vp, int nlmid)
2493 {
2494 	lock_descriptor_t *lock;
2495 	int result = 0;
2496 	graph_t *gp;
2497 	int			lock_nlmid;
2498 
2499 	/*
2500 	 * Check to see if node is booted as a cluster. If not, return.
2501 	 */
2502 	if ((cluster_bootflags & CLUSTER_BOOTED) == 0) {
2503 		return (0);
2504 	}
2505 
2506 	gp = flk_get_lock_graph(vp, FLK_USE_GRAPH);
2507 	if (gp == NULL) {
2508 		return (0);
2509 	}
2510 
2511 	mutex_enter(&gp->gp_mutex);
2512 
2513 	SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
2514 
2515 	if (lock) {
2516 		while (lock->l_vnode == vp) {
2517 			/* get NLM id from sysid */
2518 			lock_nlmid = GETNLMID(lock->l_flock.l_sysid);
2519 
2520 			/*
2521 			 * If NLM server request _and_ nlmid of lock matches
2522 			 * nlmid of argument, then we've found a remote lock.
2523 			 */
2524 			if (IS_LOCKMGR(lock) && nlmid == lock_nlmid) {
2525 				result = 1;
2526 				goto done;
2527 			}
2528 			lock = lock->l_next;
2529 		}
2530 	}
2531 
2532 	SET_LOCK_TO_FIRST_SLEEP_VP(gp, lock, vp);
2533 
2534 	if (lock) {
2535 		while (lock->l_vnode == vp) {
2536 			/* get NLM id from sysid */
2537 			lock_nlmid = GETNLMID(lock->l_flock.l_sysid);
2538 
2539 			/*
2540 			 * If NLM server request _and_ nlmid of lock matches
2541 			 * nlmid of argument, then we've found a remote lock.
2542 			 */
2543 			if (IS_LOCKMGR(lock) && nlmid == lock_nlmid) {
2544 				result = 1;
2545 				goto done;
2546 			}
2547 			lock = lock->l_next;
2548 		}
2549 	}
2550 
2551 done:
2552 	mutex_exit(&gp->gp_mutex);
2553 	return (result);
2554 }
2555 
2556 /*
2557  * Determine whether there are any locks for the given vnode with a remote
2558  * sysid.  Returns zero if not, non-zero if there are.
2559  *
2560  * Note that the return value from this function is potentially invalid
2561  * once it has been returned.  The caller is responsible for providing its
2562  * own synchronization mechanism to ensure that the return value is useful
2563  * (e.g., see nfs_lockcompletion()).
2564  */
2565 int
2566 flk_has_remote_locks(vnode_t *vp)
2567 {
2568 	lock_descriptor_t *lock;
2569 	int result = 0;
2570 	graph_t *gp;
2571 
2572 	gp = flk_get_lock_graph(vp, FLK_USE_GRAPH);
2573 	if (gp == NULL) {
2574 		return (0);
2575 	}
2576 
2577 	mutex_enter(&gp->gp_mutex);
2578 
2579 	SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
2580 
2581 	if (lock) {
2582 		while (lock->l_vnode == vp) {
2583 			if (IS_REMOTE(lock)) {
2584 				result = 1;
2585 				goto done;
2586 			}
2587 			lock = lock->l_next;
2588 		}
2589 	}
2590 
2591 	SET_LOCK_TO_FIRST_SLEEP_VP(gp, lock, vp);
2592 
2593 	if (lock) {
2594 		while (lock->l_vnode == vp) {
2595 			if (IS_REMOTE(lock)) {
2596 				result = 1;
2597 				goto done;
2598 			}
2599 			lock = lock->l_next;
2600 		}
2601 	}
2602 
2603 done:
2604 	mutex_exit(&gp->gp_mutex);
2605 	return (result);
2606 }
2607 
2608 /*
2609  * Determine whether there are any locks for the given vnode with a remote
2610  * sysid matching given sysid.
2611  * Used by the new (open source) NFS Lock Manager (NLM)
2612  */
2613 int
2614 flk_has_remote_locks_for_sysid(vnode_t *vp, int sysid)
2615 {
2616 	lock_descriptor_t *lock;
2617 	int result = 0;
2618 	graph_t *gp;
2619 
2620 	if (sysid == 0)
2621 		return (0);
2622 
2623 	gp = flk_get_lock_graph(vp, FLK_USE_GRAPH);
2624 	if (gp == NULL) {
2625 		return (0);
2626 	}
2627 
2628 	mutex_enter(&gp->gp_mutex);
2629 
2630 	SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
2631 
2632 	if (lock) {
2633 		while (lock->l_vnode == vp) {
2634 			if (lock->l_flock.l_sysid == sysid) {
2635 				result = 1;
2636 				goto done;
2637 			}
2638 			lock = lock->l_next;
2639 		}
2640 	}
2641 
2642 	SET_LOCK_TO_FIRST_SLEEP_VP(gp, lock, vp);
2643 
2644 	if (lock) {
2645 		while (lock->l_vnode == vp) {
2646 			if (lock->l_flock.l_sysid == sysid) {
2647 				result = 1;
2648 				goto done;
2649 			}
2650 			lock = lock->l_next;
2651 		}
2652 	}
2653 
2654 done:
2655 	mutex_exit(&gp->gp_mutex);
2656 	return (result);
2657 }
2658 
2659 /*
2660  * Determine if there are any locks owned by the given sysid.
2661  * Returns zero if not, non-zero if there are.  Note that this return code
2662  * could be derived from flk_get_{sleeping,active}_locks, but this routine
2663  * avoids all the memory allocations of those routines.
2664  *
2665  * This routine has the same synchronization issues as
2666  * flk_has_remote_locks.
2667  */
2668 
2669 int
2670 flk_sysid_has_locks(int sysid, int lck_type)
2671 {
2672 	int		has_locks = 0;
2673 	lock_descriptor_t	*lock;
2674 	graph_t 	*gp;
2675 	int		i;
2676 
2677 	for (i = 0; i < HASH_SIZE && !has_locks; i++) {
2678 		mutex_enter(&flock_lock);
2679 		gp = lock_graph[i];
2680 		mutex_exit(&flock_lock);
2681 		if (gp == NULL) {
2682 			continue;
2683 		}
2684 
2685 		mutex_enter(&gp->gp_mutex);
2686 
2687 		if (lck_type & FLK_QUERY_ACTIVE) {
2688 			for (lock = ACTIVE_HEAD(gp)->l_next;
2689 			    lock != ACTIVE_HEAD(gp) && !has_locks;
2690 			    lock = lock->l_next) {
2691 				if (lock->l_flock.l_sysid == sysid)
2692 					has_locks = 1;
2693 			}
2694 		}
2695 
2696 		if (lck_type & FLK_QUERY_SLEEPING) {
2697 			for (lock = SLEEPING_HEAD(gp)->l_next;
2698 			    lock != SLEEPING_HEAD(gp) && !has_locks;
2699 			    lock = lock->l_next) {
2700 				if (lock->l_flock.l_sysid == sysid)
2701 					has_locks = 1;
2702 			}
2703 		}
2704 		mutex_exit(&gp->gp_mutex);
2705 	}
2706 
2707 	return (has_locks);
2708 }
2709 
2710 
2711 /*
2712  * PSARC case 1997/292
2713  *
2714  * Requires: "sysid" is a pair [nlmid, sysid].  The lower half is 16-bit
2715  *  quantity, the real sysid generated by the NLM server; the upper half
2716  *  identifies the node of the cluster where the NLM server ran.
2717  *  This routine is only called by an NLM server running in a cluster.
2718  * Effects: Remove all locks held on behalf of the client identified
2719  *  by "sysid."
2720  */
2721 void
2722 cl_flk_remove_locks_by_sysid(int sysid)
2723 {
2724 	graph_t	*gp;
2725 	int i;
2726 	lock_descriptor_t *lock, *nlock;
2727 
2728 	/*
2729 	 * Check to see if node is booted as a cluster. If not, return.
2730 	 */
2731 	if ((cluster_bootflags & CLUSTER_BOOTED) == 0) {
2732 		return;
2733 	}
2734 
2735 	ASSERT(sysid != 0);
2736 	for (i = 0; i < HASH_SIZE; i++) {
2737 		mutex_enter(&flock_lock);
2738 		gp = lock_graph[i];
2739 		mutex_exit(&flock_lock);
2740 
2741 		if (gp == NULL)
2742 			continue;
2743 
2744 		mutex_enter(&gp->gp_mutex);	/*  get mutex on lock graph */
2745 
2746 		/* signal sleeping requests so that they bail out */
2747 		lock = SLEEPING_HEAD(gp)->l_next;
2748 		while (lock != SLEEPING_HEAD(gp)) {
2749 			nlock = lock->l_next;
2750 			if (lock->l_flock.l_sysid == sysid) {
2751 				INTERRUPT_WAKEUP(lock);
2752 			}
2753 			lock = nlock;
2754 		}
2755 
2756 		/* delete active locks */
2757 		lock = ACTIVE_HEAD(gp)->l_next;
2758 		while (lock != ACTIVE_HEAD(gp)) {
2759 			nlock = lock->l_next;
2760 			if (lock->l_flock.l_sysid == sysid) {
2761 				flk_delete_active_lock(lock, 0);
2762 				flk_wakeup(lock, 1);
2763 				flk_free_lock(lock);
2764 			}
2765 			lock = nlock;
2766 		}
2767 		mutex_exit(&gp->gp_mutex);    /* release mutex on lock graph */
2768 	}
2769 }
2770 
2771 /*
2772  * Delete all locks in the system that belongs to the sysid of the request.
2773  */
2774 
2775 static void
2776 flk_delete_locks_by_sysid(lock_descriptor_t *request)
2777 {
2778 	int	sysid  = request->l_flock.l_sysid;
2779 	lock_descriptor_t *lock, *nlock;
2780 	graph_t	*gp;
2781 	int i;
2782 
2783 	ASSERT(MUTEX_HELD(&request->l_graph->gp_mutex));
2784 	ASSERT(sysid != 0);
2785 
2786 	mutex_exit(&request->l_graph->gp_mutex);
2787 
2788 	for (i = 0; i < HASH_SIZE; i++) {
2789 		mutex_enter(&flock_lock);
2790 		gp = lock_graph[i];
2791 		mutex_exit(&flock_lock);
2792 
2793 		if (gp == NULL)
2794 			continue;
2795 
2796 		mutex_enter(&gp->gp_mutex);
2797 
2798 		/* signal sleeping requests so that they bail out */
2799 		lock = SLEEPING_HEAD(gp)->l_next;
2800 		while (lock != SLEEPING_HEAD(gp)) {
2801 			nlock = lock->l_next;
2802 			if (lock->l_flock.l_sysid == sysid) {
2803 				INTERRUPT_WAKEUP(lock);
2804 			}
2805 			lock = nlock;
2806 		}
2807 
2808 		/* delete active locks */
2809 		lock = ACTIVE_HEAD(gp)->l_next;
2810 		while (lock != ACTIVE_HEAD(gp)) {
2811 			nlock = lock->l_next;
2812 			if (lock->l_flock.l_sysid == sysid) {
2813 				flk_delete_active_lock(lock, 0);
2814 				flk_wakeup(lock, 1);
2815 				flk_free_lock(lock);
2816 			}
2817 			lock = nlock;
2818 		}
2819 		mutex_exit(&gp->gp_mutex);
2820 	}
2821 
2822 	mutex_enter(&request->l_graph->gp_mutex);
2823 }
2824 
2825 /*
2826  * Clustering: Deletes PXFS locks
2827  * Effects: Delete all locks on files in the given file system and with the
2828  *  given PXFS id.
2829  */
2830 void
2831 cl_flk_delete_pxfs_locks(struct vfs *vfsp, int pxfsid)
2832 {
2833 	lock_descriptor_t *lock, *nlock;
2834 	graph_t	*gp;
2835 	int i;
2836 
2837 	for (i = 0; i < HASH_SIZE; i++) {
2838 		mutex_enter(&flock_lock);
2839 		gp = lock_graph[i];
2840 		mutex_exit(&flock_lock);
2841 
2842 		if (gp == NULL)
2843 			continue;
2844 
2845 		mutex_enter(&gp->gp_mutex);
2846 
2847 		/* signal sleeping requests so that they bail out */
2848 		lock = SLEEPING_HEAD(gp)->l_next;
2849 		while (lock != SLEEPING_HEAD(gp)) {
2850 			nlock = lock->l_next;
2851 			if (lock->l_vnode->v_vfsp == vfsp) {
2852 				ASSERT(IS_PXFS(lock));
2853 				if (GETPXFSID(lock->l_flock.l_sysid) ==
2854 				    pxfsid) {
2855 					flk_set_state(lock,
2856 					    FLK_CANCELLED_STATE);
2857 					flk_cancel_sleeping_lock(lock, 1);
2858 				}
2859 			}
2860 			lock = nlock;
2861 		}
2862 
2863 		/* delete active locks */
2864 		lock = ACTIVE_HEAD(gp)->l_next;
2865 		while (lock != ACTIVE_HEAD(gp)) {
2866 			nlock = lock->l_next;
2867 			if (lock->l_vnode->v_vfsp == vfsp) {
2868 				ASSERT(IS_PXFS(lock));
2869 				if (GETPXFSID(lock->l_flock.l_sysid) ==
2870 				    pxfsid) {
2871 					flk_delete_active_lock(lock, 0);
2872 					flk_wakeup(lock, 1);
2873 					flk_free_lock(lock);
2874 				}
2875 			}
2876 			lock = nlock;
2877 		}
2878 		mutex_exit(&gp->gp_mutex);
2879 	}
2880 }
2881 
2882 /*
2883  * Search for a sleeping lock manager lock which matches exactly this lock
2884  * request; if one is found, fake a signal to cancel it.
2885  *
2886  * Return 1 if a matching lock was found, 0 otherwise.
2887  */
2888 
2889 static int
2890 flk_canceled(lock_descriptor_t *request)
2891 {
2892 	lock_descriptor_t *lock, *nlock;
2893 	graph_t *gp = request->l_graph;
2894 	vnode_t *vp = request->l_vnode;
2895 
2896 	ASSERT(MUTEX_HELD(&gp->gp_mutex));
2897 	ASSERT(IS_LOCKMGR(request));
2898 	SET_LOCK_TO_FIRST_SLEEP_VP(gp, lock, vp);
2899 
2900 	if (lock) {
2901 		while (lock->l_vnode == vp) {
2902 			nlock = lock->l_next;
2903 			if (SAME_OWNER(lock, request) &&
2904 			    lock->l_start == request->l_start &&
2905 			    lock->l_end == request->l_end) {
2906 				INTERRUPT_WAKEUP(lock);
2907 				return (1);
2908 			}
2909 			lock = nlock;
2910 		}
2911 	}
2912 	return (0);
2913 }
2914 
2915 /*
2916  * Remove all non-OFD locks for the vnode belonging to the given pid and sysid.
2917  * That is, since OFD locks are pid-less we'll never match on the incoming
2918  * pid. OFD locks are removed earlier in the close() path via closef() and
2919  * ofdcleanlock().
2920  */
2921 void
2922 cleanlocks(vnode_t *vp, pid_t pid, int sysid)
2923 {
2924 	graph_t	*gp;
2925 	lock_descriptor_t *lock, *nlock;
2926 	lock_descriptor_t *link_stack;
2927 
2928 	STACK_INIT(link_stack);
2929 
2930 	gp = flk_get_lock_graph(vp, FLK_USE_GRAPH);
2931 
2932 	if (gp == NULL)
2933 		return;
2934 	mutex_enter(&gp->gp_mutex);
2935 
2936 	CHECK_SLEEPING_LOCKS(gp);
2937 	CHECK_ACTIVE_LOCKS(gp);
2938 
2939 	SET_LOCK_TO_FIRST_SLEEP_VP(gp, lock, vp);
2940 
2941 	if (lock) {
2942 		do {
2943 			nlock = lock->l_next;
2944 			if ((lock->l_flock.l_pid == pid ||
2945 			    pid == IGN_PID) &&
2946 			    lock->l_flock.l_sysid == sysid) {
2947 				CANCEL_WAKEUP(lock);
2948 			}
2949 			lock = nlock;
2950 		} while (lock->l_vnode == vp);
2951 	}
2952 
2953 	SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
2954 
2955 	if (lock) {
2956 		do {
2957 			nlock = lock->l_next;
2958 			if ((lock->l_flock.l_pid == pid ||
2959 			    pid == IGN_PID) &&
2960 			    lock->l_flock.l_sysid == sysid) {
2961 				flk_delete_active_lock(lock, 0);
2962 				STACK_PUSH(link_stack, lock, l_stack);
2963 			}
2964 			lock = nlock;
2965 		} while (lock->l_vnode == vp);
2966 	}
2967 
2968 	while ((lock = STACK_TOP(link_stack)) != NULL) {
2969 		STACK_POP(link_stack, l_stack);
2970 		flk_wakeup(lock, 1);
2971 		flk_free_lock(lock);
2972 	}
2973 
2974 	CHECK_SLEEPING_LOCKS(gp);
2975 	CHECK_ACTIVE_LOCKS(gp);
2976 	CHECK_OWNER_LOCKS(gp, pid, sysid, vp);
2977 	mutex_exit(&gp->gp_mutex);
2978 }
2979 
2980 
2981 /*
2982  * Called from 'fs' read and write routines for files that have mandatory
2983  * locking enabled.
2984  */
2985 
2986 int
2987 chklock(
2988 	struct vnode	*vp,
2989 	int 		iomode,
2990 	u_offset_t	offset,
2991 	ssize_t		len,
2992 	int 		fmode,
2993 	caller_context_t *ct)
2994 {
2995 	register int	i;
2996 	struct flock64 	bf;
2997 	int 		error = 0;
2998 
2999 	bf.l_type = (iomode & FWRITE) ? F_WRLCK : F_RDLCK;
3000 	bf.l_whence = 0;
3001 	bf.l_start = offset;
3002 	bf.l_len = len;
3003 	if (ct == NULL) {
3004 		bf.l_pid = curproc->p_pid;
3005 		bf.l_sysid = 0;
3006 	} else {
3007 		bf.l_pid = ct->cc_pid;
3008 		bf.l_sysid = ct->cc_sysid;
3009 	}
3010 	i = (fmode & (FNDELAY|FNONBLOCK)) ? INOFLCK : INOFLCK|SLPFLCK;
3011 	if ((i = reclock(vp, &bf, i, 0, offset, NULL)) != 0 ||
3012 	    bf.l_type != F_UNLCK)
3013 		error = i ? i : EAGAIN;
3014 	return (error);
3015 }
3016 
3017 /*
3018  * convoff - converts the given data (start, whence) to the
3019  * given whence.
3020  */
3021 int
3022 convoff(vp, lckdat, whence, offset)
3023 	struct vnode 	*vp;
3024 	struct flock64 	*lckdat;
3025 	int 		whence;
3026 	offset_t	offset;
3027 {
3028 	int 		error;
3029 	struct vattr 	vattr;
3030 
3031 	if ((lckdat->l_whence == 2) || (whence == 2)) {
3032 		vattr.va_mask = AT_SIZE;
3033 		if (error = VOP_GETATTR(vp, &vattr, 0, CRED(), NULL))
3034 			return (error);
3035 	}
3036 
3037 	switch (lckdat->l_whence) {
3038 	case 1:
3039 		lckdat->l_start += offset;
3040 		break;
3041 	case 2:
3042 		lckdat->l_start += vattr.va_size;
3043 		/* FALLTHRU */
3044 	case 0:
3045 		break;
3046 	default:
3047 		return (EINVAL);
3048 	}
3049 
3050 	if (lckdat->l_start < 0)
3051 		return (EINVAL);
3052 
3053 	switch (whence) {
3054 	case 1:
3055 		lckdat->l_start -= offset;
3056 		break;
3057 	case 2:
3058 		lckdat->l_start -= vattr.va_size;
3059 		/* FALLTHRU */
3060 	case 0:
3061 		break;
3062 	default:
3063 		return (EINVAL);
3064 	}
3065 
3066 	lckdat->l_whence = (short)whence;
3067 	return (0);
3068 }
3069 
3070 
3071 /* 	proc_graph function definitions */
3072 
3073 /*
3074  * Checks for deadlock due to the new 'lock'.  If a deadlock is found, the
3075  * edges of this lock are freed and 1 is returned; otherwise 0 is returned.
3076  */
3077 
3078 static int
3079 flk_check_deadlock(lock_descriptor_t *lock)
3080 {
3081 	proc_vertex_t	*start_vertex, *pvertex;
3082 	proc_vertex_t *dvertex;
3083 	proc_edge_t *pep, *ppep;
3084 	edge_t	*ep, *nep;
3085 	proc_vertex_t *process_stack;
3086 
3087 	/*
3088 	 * OFD style locks are not associated with any process so there is
3089 	 * no proc graph for these. Thus we cannot, and do not, do deadlock
3090 	 * detection.
3091 	 */
3092 	if (lock->l_ofd != NULL)
3093 		return (0);
3094 
3095 	STACK_INIT(process_stack);
3096 
	/* flock_lock is held for the whole proc graph manipulation below. */
3097 	mutex_enter(&flock_lock);
3098 	start_vertex = flk_get_proc_vertex(lock);
3099 	ASSERT(start_vertex != NULL);
3100 
3101 	/* construct the edges from this process to other processes */
3102 
	/*
	 * For each adjacency edge of 'lock', find the proc edge from
	 * start_vertex to the owner of the edge's to_vertex.  An existing
	 * proc edge just gains a reference; otherwise a new one is created
	 * with refcount 1, linked at the head of start_vertex's edge list,
	 * and the target's incount is incremented.
	 */
3103 	ep = FIRST_ADJ(lock);
3104 	while (ep != HEAD(lock)) {
3105 		proc_vertex_t *adj_proc;
3106 
3107 		adj_proc = flk_get_proc_vertex(ep->to_vertex);
3108 		for (pep = start_vertex->edge; pep != NULL; pep = pep->next) {
3109 			if (pep->to_proc == adj_proc) {
3110 				ASSERT(pep->refcount);
3111 				pep->refcount++;
3112 				break;
3113 			}
3114 		}
3115 		if (pep == NULL) {
3116 			pep = flk_get_proc_edge();
3117 			pep->to_proc = adj_proc;
3118 			pep->refcount = 1;
3119 			adj_proc->incount++;
3120 			pep->next = start_vertex->edge;
3121 			start_vertex->edge = pep;
3122 		}
3123 		ep = NEXT_ADJ(ep);
3124 	}
3125 
	/*
	 * Same accounting in the other direction: for each in-edge of
	 * 'lock', a proc edge from the owner of the edge's from_vertex to
	 * start_vertex.
	 */
3126 	ep = FIRST_IN(lock);
3127 
3128 	while (ep != HEAD(lock)) {
3129 		proc_vertex_t *in_proc;
3130 
3131 		in_proc = flk_get_proc_vertex(ep->from_vertex);
3132 
3133 		for (pep = in_proc->edge; pep != NULL; pep = pep->next) {
3134 			if (pep->to_proc == start_vertex) {
3135 				ASSERT(pep->refcount);
3136 				pep->refcount++;
3137 				break;
3138 			}
3139 		}
3140 		if (pep == NULL) {
3141 			pep = flk_get_proc_edge();
3142 			pep->to_proc = start_vertex;
3143 			pep->refcount = 1;
3144 			start_vertex->incount++;
3145 			pep->next = in_proc->edge;
3146 			in_proc->edge = pep;
3147 		}
3148 		ep = NEXT_IN(ep);
3149 	}
3150 
	/*
	 * With no proc edge coming into start_vertex it cannot lie on a
	 * cycle, so there is no deadlock and no search is needed.
	 */
3151 	if (start_vertex->incount == 0) {
3152 		mutex_exit(&flock_lock);
3153 		return (0);
3154 	}
3155 
3156 	flk_proc_graph_uncolor();
3157 
3158 	start_vertex->p_sedge = start_vertex->edge;
3159 
	/*
	 * Iterative depth-first search from start_vertex over proc edges,
	 * using a stack linked through p_stack.  Each vertex's p_sedge is
	 * the cursor to the next of its edges still to be examined.  A
	 * vertex is marked arrived when the search descends from it and
	 * departed when it is popped; meeting a vertex that has arrived but
	 * not yet departed is reported as a deadlock.
	 */
3160 	STACK_PUSH(process_stack, start_vertex, p_stack);
3161 
3162 	while ((pvertex = STACK_TOP(process_stack)) != NULL) {
3163 		for (pep = pvertex->p_sedge; pep != NULL; pep = pep->next) {
3164 			dvertex = pep->to_proc;
3165 			if (!PROC_ARRIVED(dvertex)) {
3166 				STACK_PUSH(process_stack, dvertex, p_stack);
3167 				dvertex->p_sedge = dvertex->edge;
3168 				PROC_ARRIVE(pvertex);
3169 				pvertex->p_sedge = pep->next;
3170 				break;
3171 			}
3172 			if (!PROC_DEPARTED(dvertex))
3173 				goto deadlock;
3174 		}
		/* All edges of pvertex examined: it is finished. */
3175 		if (pep == NULL) {
3176 			PROC_DEPART(pvertex);
3177 			STACK_POP(process_stack, p_stack);
3178 		}
3179 	}
3180 	mutex_exit(&flock_lock);
3181 	return (0);
3182 
3183 deadlock:
3184 
3185 	/* we remove all lock edges and proc edges */
3186 
	/*
	 * Undo the first loop above: free every adjacency edge of 'lock'
	 * and drop one reference from the matching proc edge, unlinking and
	 * freeing that proc edge when its refcount reaches zero.  ppep
	 * trails pep by one element, so pep == ppep only for the list head.
	 */
3187 	ep = FIRST_ADJ(lock);
3188 	while (ep != HEAD(lock)) {
3189 		proc_vertex_t *adj_proc;
3190 		adj_proc = flk_get_proc_vertex(ep->to_vertex);
3191 		nep = NEXT_ADJ(ep);
3192 		IN_LIST_REMOVE(ep);
3193 		ADJ_LIST_REMOVE(ep);
3194 		flk_free_edge(ep);
3195 		ppep = start_vertex->edge;
3196 		for (pep = start_vertex->edge; pep != NULL; ppep = pep,
3197 		    pep = ppep->next) {
3198 			if (pep->to_proc == adj_proc) {
3199 				pep->refcount--;
3200 				if (pep->refcount == 0) {
3201 					if (pep == ppep) {
3202 						start_vertex->edge = pep->next;
3203 					} else {
3204 						ppep->next = pep->next;
3205 					}
3206 					adj_proc->incount--;
3207 					flk_proc_release(adj_proc);
3208 					flk_free_proc_edge(pep);
3209 				}
3210 				break;
3211 			}
3212 		}
3213 		ep = nep;
3214 	}
	/* Undo the second loop above in the same way, for the in-edges. */
3215 	ep = FIRST_IN(lock);
3216 	while (ep != HEAD(lock)) {
3217 		proc_vertex_t *in_proc;
3218 		in_proc = flk_get_proc_vertex(ep->from_vertex);
3219 		nep = NEXT_IN(ep);
3220 		IN_LIST_REMOVE(ep);
3221 		ADJ_LIST_REMOVE(ep);
3222 		flk_free_edge(ep);
3223 		ppep = in_proc->edge;
3224 		for (pep = in_proc->edge; pep != NULL; ppep = pep,
3225 		    pep = ppep->next) {
3226 			if (pep->to_proc == start_vertex) {
3227 				pep->refcount--;
3228 				if (pep->refcount == 0) {
3229 					if (pep == ppep) {
3230 						in_proc->edge = pep->next;
3231 					} else {
3232 						ppep->next = pep->next;
3233 					}
3234 					start_vertex->incount--;
3235 					flk_proc_release(in_proc);
3236 					flk_free_proc_edge(pep);
3237 				}
3238 				break;
3239 			}
3240 		}
3241 		ep = nep;
3242 	}
3243 	flk_proc_release(start_vertex);
3244 	mutex_exit(&flock_lock);
3245 	return (1);
3246 }
3247 
3248 /*
3249  * Get a proc vertex. If lock's pvertex value gets a correct proc vertex
3250  * from the list we return that, otherwise we allocate one. If necessary,
3251  * we grow the list of vertices also.
3252  */
3253 
3254 static proc_vertex_t *
3255 flk_get_proc_vertex(lock_descriptor_t *lock)
3256 {
3257 	int i;
3258 	proc_vertex_t	*pv;
3259 	proc_vertex_t	**palloc;
3260 
3261 	ASSERT(MUTEX_HELD(&flock_lock));
3262 	if (lock->pvertex != -1) {
3263 		ASSERT(lock->pvertex >= 0);
3264 		pv = pgraph.proc[lock->pvertex];
3265 		if (pv != NULL && PROC_SAME_OWNER(lock, pv)) {
3266 			return (pv);
3267 		}
3268 	}
3269 	for (i = 0; i < pgraph.gcount; i++) {
3270 		pv = pgraph.proc[i];
3271 		if (pv != NULL && PROC_SAME_OWNER(lock, pv)) {
3272 			lock->pvertex = pv->index = i;
3273 			return (pv);
3274 		}
3275 	}
3276 	pv = kmem_zalloc(sizeof (struct proc_vertex), KM_SLEEP);
3277 	pv->pid = lock->l_flock.l_pid;
3278 	pv->sysid = lock->l_flock.l_sysid;
3279 	flk_proc_vertex_allocs++;
3280 	if (pgraph.free != 0) {
3281 		for (i = 0; i < pgraph.gcount; i++) {
3282 			if (pgraph.proc[i] == NULL) {
3283 				pgraph.proc[i] = pv;
3284 				lock->pvertex = pv->index = i;
3285 				pgraph.free--;
3286 				return (pv);
3287 			}
3288 		}
3289 	}
3290 	palloc = kmem_zalloc((pgraph.gcount + PROC_CHUNK) *
3291 	    sizeof (proc_vertex_t *), KM_SLEEP);
3292 
3293 	if (pgraph.proc) {
3294 		bcopy(pgraph.proc, palloc,
3295 		    pgraph.gcount * sizeof (proc_vertex_t *));
3296 
3297 		kmem_free(pgraph.proc,
3298 		    pgraph.gcount * sizeof (proc_vertex_t *));
3299 	}
3300 	pgraph.proc = palloc;
3301 	pgraph.free += (PROC_CHUNK - 1);
3302 	pv->index = lock->pvertex = pgraph.gcount;
3303 	pgraph.gcount += PROC_CHUNK;
3304 	pgraph.proc[pv->index] = pv;
3305 	return (pv);
3306 }
3307 
3308 /*
3309  * Allocate a proc edge.
3310  */
3311 
3312 static proc_edge_t *
3313 flk_get_proc_edge()
3314 {
3315 	proc_edge_t *pep;
3316 
3317 	pep = kmem_zalloc(sizeof (proc_edge_t), KM_SLEEP);
3318 	flk_proc_edge_allocs++;
3319 	return (pep);
3320 }
3321 
3322 /*
3323  * Free the proc edge. Called whenever its reference count goes to zero.
3324  */
3325 
3326 static void
3327 flk_free_proc_edge(proc_edge_t *pep)
3328 {
3329 	ASSERT(pep->refcount == 0);
3330 	kmem_free((void *)pep, sizeof (proc_edge_t));
3331 	flk_proc_edge_frees++;
3332 }
3333 
3334 /*
3335  * Color the graph explicitly done only when the mark value hits max value.
3336  */
3337 
3338 static void
3339 flk_proc_graph_uncolor()
3340 {
3341 	int i;
3342 
3343 	if (pgraph.mark == UINT_MAX) {
3344 		for (i = 0; i < pgraph.gcount; i++)
3345 			if (pgraph.proc[i] != NULL) {
3346 				pgraph.proc[i]->atime = 0;
3347 				pgraph.proc[i]->dtime = 0;
3348 			}
3349 		pgraph.mark = 1;
3350 	} else {
3351 		pgraph.mark++;
3352 	}
3353 }
3354 
3355 /*
3356  * Release the proc vertex iff both there are no in edges and out edges
3357  */
3358 
3359 static void
3360 flk_proc_release(proc_vertex_t *proc)
3361 {
3362 	ASSERT(MUTEX_HELD(&flock_lock));
3363 	if (proc->edge == NULL && proc->incount == 0) {
3364 		pgraph.proc[proc->index] = NULL;
3365 		pgraph.free++;
3366 		kmem_free(proc, sizeof (proc_vertex_t));
3367 		flk_proc_vertex_frees++;
3368 	}
3369 }
3370 
3371 /*
3372  * Updates process graph to reflect change in a lock_graph.
3373  * Note: We should call this function only after we have a correctly
3374  * recomputed lock graph. Otherwise we might miss a deadlock detection.
3375  * eg: in function flk_relation() we call this function after flk_recompute_
3376  * dependencies() otherwise if a process tries to lock a vnode hashed
3377  * into another graph it might sleep for ever.
3378  */
3379 
3380 static void
3381 flk_update_proc_graph(edge_t *ep, int delete)
3382 {
3383 	proc_vertex_t *toproc, *fromproc;
3384 	proc_edge_t *pep, *prevpep;
3385 
3386 	mutex_enter(&flock_lock);
3387 
3388 	/*
3389 	 * OFD style locks are not associated with any process so there is
3390 	 * no proc graph for these.
3391 	 */
3392 	if (ep->from_vertex->l_ofd != NULL) {
3393 		mutex_exit(&flock_lock);
3394 		return;
3395 	}
3396 
3397 	toproc = flk_get_proc_vertex(ep->to_vertex);
3398 	fromproc = flk_get_proc_vertex(ep->from_vertex);
3399 
3400 	if (!delete)
3401 		goto add;
3402 	pep = prevpep = fromproc->edge;
3403 
3404 	ASSERT(pep != NULL);
3405 	while (pep != NULL) {
3406 		if (pep->to_proc == toproc) {
3407 			ASSERT(pep->refcount > 0);
3408 			pep->refcount--;
3409 			if (pep->refcount == 0) {
3410 				if (pep == prevpep) {
3411 					fromproc->edge = pep->next;
3412 				} else {
3413 					prevpep->next = pep->next;
3414 				}
3415 				toproc->incount--;
3416 				flk_proc_release(toproc);
3417 				flk_free_proc_edge(pep);
3418 			}
3419 			break;
3420 		}
3421 		prevpep = pep;
3422 		pep = pep->next;
3423 	}
3424 	flk_proc_release(fromproc);
3425 	mutex_exit(&flock_lock);
3426 	return;
3427 add:
3428 
3429 	pep = fromproc->edge;
3430 
3431 	while (pep != NULL) {
3432 		if (pep->to_proc == toproc) {
3433 			ASSERT(pep->refcount > 0);
3434 			pep->refcount++;
3435 			break;
3436 		}
3437 		pep = pep->next;
3438 	}
3439 	if (pep == NULL) {
3440 		pep = flk_get_proc_edge();
3441 		pep->to_proc = toproc;
3442 		pep->refcount = 1;
3443 		toproc->incount++;
3444 		pep->next = fromproc->edge;
3445 		fromproc->edge = pep;
3446 	}
3447 	mutex_exit(&flock_lock);
3448 }
3449 
3450 /*
3451  * Set the control status for lock manager requests.
3452  *
3453  */
3454 
3455 /*
3456  * PSARC case 1997/292
3457  *
3458  * Requires: "nlmid" must be >= 1 and <= clconf_maximum_nodeid().
3459  * Effects: Set the state of the NLM server identified by "nlmid"
3460  *   in the NLM registry to state "nlm_state."
3461  *   Raises exception no_such_nlm if "nlmid" doesn't identify a known
3462  *   NLM server to this LLM.
3463  *   Note that when this routine is called with NLM_SHUTTING_DOWN there
3464  *   may be locks requests that have gotten started but not finished.  In
3465  *   particular, there may be blocking requests that are in the callback code
3466  *   before sleeping (so they're not holding the lock for the graph).  If
3467  *   such a thread reacquires the graph's lock (to go to sleep) after
3468  *   NLM state in the NLM registry  is set to a non-up value,
3469  *   it will notice the status and bail out.  If the request gets
3470  *   granted before the thread can check the NLM registry, let it
3471  *   continue normally.  It will get flushed when we are called with NLM_DOWN.
3472  *
3473  * Modifies: nlm_reg_obj (global)
3474  * Arguments:
3475  *    nlmid	(IN):    id uniquely identifying an NLM server
3476  *    nlm_state (IN):    NLM server state to change "nlmid" to
3477  */
3478 void
3479 cl_flk_set_nlm_status(int nlmid, flk_nlm_status_t nlm_state)
3480 {
3481 	/*
3482 	 * Check to see if node is booted as a cluster. If not, return.
3483 	 */
3484 	if ((cluster_bootflags & CLUSTER_BOOTED) == 0) {
3485 		return;
3486 	}
3487 
3488 	/*
3489 	 * Check for development/debugging.  It is possible to boot a node
3490 	 * in non-cluster mode, and then run a special script, currently
3491 	 * available only to developers, to bring up the node as part of a
3492 	 * cluster.  The problem is that running such a script does not
3493 	 * result in the routine flk_init() being called and hence global array
3494 	 * nlm_reg_status is NULL.  The NLM thinks it's in cluster mode,
3495 	 * but the LLM needs to do an additional check to see if the global
3496 	 * array has been created or not. If nlm_reg_status is NULL, then
3497 	 * return, else continue.
3498 	 */
3499 	if (nlm_reg_status == NULL) {
3500 		return;
3501 	}
3502 
3503 	ASSERT(nlmid <= nlm_status_size && nlmid >= 0);
3504 	mutex_enter(&nlm_reg_lock);
3505 
3506 	if (FLK_REGISTRY_IS_NLM_UNKNOWN(nlm_reg_status, nlmid)) {
3507 		/*
3508 		 * If the NLM server "nlmid" is unknown in the NLM registry,
3509 		 * add it to the registry in the nlm shutting down state.
3510 		 */
3511 		FLK_REGISTRY_CHANGE_NLM_STATE(nlm_reg_status, nlmid,
3512 		    FLK_NLM_SHUTTING_DOWN);
3513 	} else {
3514 		/*
3515 		 * Change the state of the NLM server identified by "nlmid"
3516 		 * in the NLM registry to the argument "nlm_state."
3517 		 */
3518 		FLK_REGISTRY_CHANGE_NLM_STATE(nlm_reg_status, nlmid,
3519 		    nlm_state);
3520 	}
3521 
3522 	/*
3523 	 *  The reason we must register the NLM server that is shutting down
3524 	 *  with an LLM that doesn't already know about it (never sent a lock
3525 	 *  request) is to handle correctly a race between shutdown and a new
3526 	 *  lock request.  Suppose that a shutdown request from the NLM server
3527 	 *  invokes this routine at the LLM, and a thread is spawned to
3528 	 *  service the request. Now suppose a new lock request is in
3529 	 *  progress and has already passed the first line of defense in
3530 	 *  reclock(), which denies new locks requests from NLM servers
3531 	 *  that are not in the NLM_UP state.  After the current routine
3532 	 *  is invoked for both phases of shutdown, the routine will return,
3533 	 *  having done nothing, and the lock request will proceed and
3534 	 *  probably be granted.  The problem is that the shutdown was ignored
3535 	 *  by the lock request because there was no record of that NLM server
3536 	 *  shutting down.   We will be in the peculiar position of thinking
3537 	 *  that we've shutdown the NLM server and all locks at all LLMs have
3538 	 *  been discarded, but in fact there's still one lock held.
3539 	 *  The solution is to record the existence of NLM server and change
3540 	 *  its state immediately to NLM_SHUTTING_DOWN.  The lock request in
3541 	 *  progress may proceed because the next phase NLM_DOWN will catch
3542 	 *  this lock and discard it.
3543 	 */
3544 	mutex_exit(&nlm_reg_lock);
3545 
3546 	switch (nlm_state) {
3547 	case FLK_NLM_UP:
3548 		/*
3549 		 * Change the NLM state of all locks still held on behalf of
3550 		 * the NLM server identified by "nlmid" to NLM_UP.
3551 		 */
3552 		cl_flk_change_nlm_state_all_locks(nlmid, FLK_NLM_UP);
3553 		break;
3554 
3555 	case FLK_NLM_SHUTTING_DOWN:
3556 		/*
3557 		 * Wake up all sleeping locks for the NLM server identified
3558 		 * by "nlmid." Note that eventually all woken threads will
3559 		 * have their lock requests cancelled and descriptors
3560 		 * removed from the sleeping lock list.  Note that the NLM
3561 		 * server state associated with each lock descriptor is
3562 		 * changed to FLK_NLM_SHUTTING_DOWN.
3563 		 */
3564 		cl_flk_wakeup_sleeping_nlm_locks(nlmid);
3565 		break;
3566 
3567 	case FLK_NLM_DOWN:
3568 		/*
3569 		 * Discard all active, granted locks for this NLM server
3570 		 * identified by "nlmid."
3571 		 */
3572 		cl_flk_unlock_nlm_granted(nlmid);
3573 		break;
3574 
3575 	default:
3576 		panic("cl_set_nlm_status: bad status (%d)", nlm_state);
3577 	}
3578 }
3579 
3580 /*
3581  * Set the control status for lock manager requests.
3582  *
3583  * Note that when this routine is called with FLK_WAKEUP_SLEEPERS, there
3584  * may be locks requests that have gotten started but not finished.  In
3585  * particular, there may be blocking requests that are in the callback code
3586  * before sleeping (so they're not holding the lock for the graph).  If
3587  * such a thread reacquires the graph's lock (to go to sleep) after
3588  * flk_lockmgr_status is set to a non-up value, it will notice the status
3589  * and bail out.  If the request gets granted before the thread can check
3590  * flk_lockmgr_status, let it continue normally.  It will get flushed when
3591  * we are called with FLK_LOCKMGR_DOWN.
3592  */
3593 
3594 void
3595 flk_set_lockmgr_status(flk_lockmgr_status_t status)
3596 {
3597 	int i;
3598 	graph_t *gp;
3599 	struct flock_globals *fg;
3600 
3601 	fg = flk_get_globals();
3602 	ASSERT(fg != NULL);
3603 
3604 	mutex_enter(&flock_lock);
3605 	fg->flk_lockmgr_status = status;
3606 	mutex_exit(&flock_lock);
3607 
3608 	/*
3609 	 * If the lock manager is coming back up, all that's needed is to
3610 	 * propagate this information to the graphs.  If the lock manager
3611 	 * is going down, additional action is required, and each graph's
3612 	 * copy of the state is updated atomically with this other action.
3613 	 */
3614 	switch (status) {
3615 	case FLK_LOCKMGR_UP:
3616 		for (i = 0; i < HASH_SIZE; i++) {
3617 			mutex_enter(&flock_lock);
3618 			gp = lock_graph[i];
3619 			mutex_exit(&flock_lock);
3620 			if (gp == NULL)
3621 				continue;
3622 			mutex_enter(&gp->gp_mutex);
3623 			fg->lockmgr_status[i] = status;
3624 			mutex_exit(&gp->gp_mutex);
3625 		}
3626 		break;
3627 	case FLK_WAKEUP_SLEEPERS:
3628 		wakeup_sleeping_lockmgr_locks(fg);
3629 		break;
3630 	case FLK_LOCKMGR_DOWN:
3631 		unlock_lockmgr_granted(fg);
3632 		break;
3633 	default:
3634 		panic("flk_set_lockmgr_status: bad status (%d)", status);
3635 		break;
3636 	}
3637 }
3638 
3639 /*
3640  * This routine returns all the locks that are active or sleeping and are
3641  * associated with a particular set of identifiers.  If lock_state != 0, then
3642  * only locks that match the lock_state are returned. If lock_state == 0, then
3643  * all locks are returned. If pid == NOPID, the pid is ignored.  If
3644  * use_sysid is FALSE, then the sysid is ignored.  If vp is NULL, then the
3645  * vnode pointer is ignored.
3646  *
3647  * A list containing the vnode pointer and an flock structure
3648  * describing the lock is returned.  Each element in the list is
3649  * dynamically allocated and must be freed by the caller.  The
3650  * last item in the list is denoted by a NULL value in the ll_next
3651  * field.
3652  *
3653  * The vnode pointers returned are held.  The caller is responsible
3654  * for releasing these.  Note that the returned list is only a snapshot of
3655  * the current lock information, and that it is a snapshot of a moving
3656  * target (only one graph is locked at a time).
3657  */
3658 
3659 locklist_t *
3660 get_lock_list(int list_type, int lock_state, int sysid, boolean_t use_sysid,
3661 		pid_t pid, const vnode_t *vp, zoneid_t zoneid)
3662 {
3663 	lock_descriptor_t	*lock;
3664 	lock_descriptor_t	*graph_head;
3665 	locklist_t		listhead;
3666 	locklist_t		*llheadp;
3667 	locklist_t		*llp;
3668 	locklist_t		*lltp;
3669 	graph_t			*gp;
3670 	int			i;
3671 	int			first_index; /* graph index */
3672 	int			num_indexes; /* graph index */
3673 
3674 	ASSERT((list_type == FLK_ACTIVE_STATE) ||
3675 	    (list_type == FLK_SLEEPING_STATE));
3676 
3677 	/*
3678 	 * Get a pointer to something to use as a list head while building
3679 	 * the rest of the list.
3680 	 */
3681 	llheadp = &listhead;
3682 	lltp = llheadp;
3683 	llheadp->ll_next = (locklist_t *)NULL;
3684 
3685 	/* Figure out which graphs we want to look at. */
3686 	if (vp == NULL) {
3687 		first_index = 0;
3688 		num_indexes = HASH_SIZE;
3689 	} else {
3690 		first_index = HASH_INDEX(vp);
3691 		num_indexes = 1;
3692 	}
3693 
3694 	for (i = first_index; i < first_index + num_indexes; i++) {
3695 		mutex_enter(&flock_lock);
3696 		gp = lock_graph[i];
3697 		mutex_exit(&flock_lock);
3698 		if (gp == NULL) {
3699 			continue;
3700 		}
3701 
3702 		mutex_enter(&gp->gp_mutex);
3703 		graph_head = (list_type == FLK_ACTIVE_STATE) ?
3704 		    ACTIVE_HEAD(gp) : SLEEPING_HEAD(gp);
3705 		for (lock = graph_head->l_next;
3706 		    lock != graph_head;
3707 		    lock = lock->l_next) {
3708 			if (use_sysid && lock->l_flock.l_sysid != sysid)
3709 				continue;
3710 			if (pid != NOPID && lock->l_flock.l_pid != pid)
3711 				continue;
3712 			if (vp != NULL && lock->l_vnode != vp)
3713 				continue;
3714 			if (lock_state && !(lock_state & lock->l_state))
3715 				continue;
3716 			if (zoneid != lock->l_zoneid && zoneid != ALL_ZONES)
3717 				continue;
3718 			/*
3719 			 * A matching lock was found.  Allocate
3720 			 * space for a new locklist entry and fill
3721 			 * it in.
3722 			 */
3723 			llp = kmem_alloc(sizeof (locklist_t), KM_SLEEP);
3724 			lltp->ll_next = llp;
3725 			VN_HOLD(lock->l_vnode);
3726 			llp->ll_vp = lock->l_vnode;
3727 			create_flock(lock, &(llp->ll_flock));
3728 			llp->ll_next = (locklist_t *)NULL;
3729 			lltp = llp;
3730 		}
3731 		mutex_exit(&gp->gp_mutex);
3732 	}
3733 
3734 	llp = llheadp->ll_next;
3735 	return (llp);
3736 }
3737 
3738 /*
3739  * These two functions are simply interfaces to get_lock_list.  They return
3740  * a list of sleeping or active locks for the given sysid and pid.  See
3741  * get_lock_list for details.
3742  *
3743  * In either case we don't particularly care to specify the zone of interest;
3744  * the sysid-space is global across zones, so the sysid will map to exactly one
3745  * zone, and we'll return information for that zone.
3746  */
3747 
3748 locklist_t *
3749 flk_get_sleeping_locks(int sysid, pid_t pid)
3750 {
3751 	return (get_lock_list(FLK_SLEEPING_STATE, 0, sysid, B_TRUE, pid, NULL,
3752 	    ALL_ZONES));
3753 }
3754 
3755 locklist_t *
3756 flk_get_active_locks(int sysid, pid_t pid)
3757 {
3758 	return (get_lock_list(FLK_ACTIVE_STATE, 0, sysid, B_TRUE, pid, NULL,
3759 	    ALL_ZONES));
3760 }
3761 
3762 /*
3763  * Another interface to get_lock_list.  This one returns all the active
3764  * locks for a given vnode.  Again, see get_lock_list for details.
3765  *
3766  * We don't need to specify which zone's locks we're interested in.  The matter
3767  * would only be interesting if the vnode belonged to NFS, and NFS vnodes can't
3768  * be used by multiple zones, so the list of locks will all be from the right
3769  * zone.
3770  */
3771 
3772 locklist_t *
3773 flk_active_locks_for_vp(const vnode_t *vp)
3774 {
3775 	return (get_lock_list(FLK_ACTIVE_STATE, 0, 0, B_FALSE, NOPID, vp,
3776 	    ALL_ZONES));
3777 }
3778 
3779 /*
3780  * Another interface to get_lock_list.  This one returns all the active
3781  * nbmand locks for a given vnode.  Again, see get_lock_list for details.
3782  *
3783  * See the comment for flk_active_locks_for_vp() for why we don't care to
3784  * specify the particular zone of interest.
3785  */
3786 locklist_t *
3787 flk_active_nbmand_locks_for_vp(const vnode_t *vp)
3788 {
3789 	return (get_lock_list(FLK_ACTIVE_STATE, NBMAND_LOCK, 0, B_FALSE,
3790 	    NOPID, vp, ALL_ZONES));
3791 }
3792 
3793 /*
3794  * Another interface to get_lock_list.  This one returns all the active
3795  * nbmand locks for a given pid.  Again, see get_lock_list for details.
3796  *
3797  * The zone doesn't need to be specified here; the locks held by a
3798  * particular process will either be local (ie, non-NFS) or from the zone
3799  * the process is executing in.  This is because other parts of the system
3800  * ensure that an NFS vnode can't be used in a zone other than that in
3801  * which it was opened.
3802  */
3803 locklist_t *
3804 flk_active_nbmand_locks(pid_t pid)
3805 {
3806 	return (get_lock_list(FLK_ACTIVE_STATE, NBMAND_LOCK, 0, B_FALSE,
3807 	    pid, NULL, ALL_ZONES));
3808 }
3809 
3810 /*
3811  * Free up all entries in the locklist.
3812  */
3813 void
3814 flk_free_locklist(locklist_t *llp)
3815 {
3816 	locklist_t *next_llp;
3817 
3818 	while (llp) {
3819 		next_llp = llp->ll_next;
3820 		VN_RELE(llp->ll_vp);
3821 		kmem_free(llp, sizeof (*llp));
3822 		llp = next_llp;
3823 	}
3824 }
3825 
3826 static void
3827 cl_flk_change_nlm_state_all_locks(int nlmid, flk_nlm_status_t nlm_state)
3828 {
3829 	/*
3830 	 * For each graph "lg" in the hash table lock_graph do
3831 	 * a.  Get the list of sleeping locks
3832 	 * b.  For each lock descriptor in the list do
3833 	 *	i.   If the requested lock is an NLM server request AND
3834 	 *		the nlmid is the same as the routine argument then
3835 	 *		change the lock descriptor's state field to
3836 	 *		"nlm_state."
3837 	 * c.  Get the list of active locks
3838 	 * d.  For each lock descriptor in the list do
3839 	 *	i.   If the requested lock is an NLM server request AND
3840 	 *		the nlmid is the same as the routine argument then
3841 	 *		change the lock descriptor's state field to
3842 	 *		"nlm_state."
3843 	 */
3844 
3845 	int			i;
3846 	graph_t			*gp;			/* lock graph */
3847 	lock_descriptor_t	*lock;			/* lock */
3848 	lock_descriptor_t	*nlock = NULL;		/* next lock */
3849 	int			lock_nlmid;
3850 
3851 	for (i = 0; i < HASH_SIZE; i++) {
3852 		mutex_enter(&flock_lock);
3853 		gp = lock_graph[i];
3854 		mutex_exit(&flock_lock);
3855 		if (gp == NULL) {
3856 			continue;
3857 		}
3858 
3859 		/* Get list of sleeping locks in current lock graph. */
3860 		mutex_enter(&gp->gp_mutex);
3861 		for (lock = SLEEPING_HEAD(gp)->l_next;
3862 		    lock != SLEEPING_HEAD(gp);
3863 		    lock = nlock) {
3864 			nlock = lock->l_next;
3865 			/* get NLM id */
3866 			lock_nlmid = GETNLMID(lock->l_flock.l_sysid);
3867 
3868 			/*
3869 			 * If NLM server request AND nlmid of lock matches
3870 			 * nlmid of argument, then set the NLM state of the
3871 			 * lock to "nlm_state."
3872 			 */
3873 			if (IS_LOCKMGR(lock) && nlmid == lock_nlmid) {
3874 				SET_NLM_STATE(lock, nlm_state);
3875 			}
3876 		}
3877 
3878 		/* Get list of active locks in current lock graph. */
3879 		for (lock = ACTIVE_HEAD(gp)->l_next;
3880 		    lock != ACTIVE_HEAD(gp);
3881 		    lock = nlock) {
3882 			nlock = lock->l_next;
3883 			/* get NLM id */
3884 			lock_nlmid = GETNLMID(lock->l_flock.l_sysid);
3885 
3886 			/*
3887 			 * If NLM server request AND nlmid of lock matches
3888 			 * nlmid of argument, then set the NLM state of the
3889 			 * lock to "nlm_state."
3890 			 */
3891 			if (IS_LOCKMGR(lock) && nlmid == lock_nlmid) {
3892 				ASSERT(IS_ACTIVE(lock));
3893 				SET_NLM_STATE(lock, nlm_state);
3894 			}
3895 		}
3896 		mutex_exit(&gp->gp_mutex);
3897 	}
3898 }
3899 
3900 /*
3901  * Requires: "nlmid" >= 1 and <= clconf_maximum_nodeid().
3902  * Effects: Find all sleeping lock manager requests _only_ for the NLM server
3903  *   identified by "nlmid." Poke those lock requests.
3904  */
3905 static void
3906 cl_flk_wakeup_sleeping_nlm_locks(int nlmid)
3907 {
3908 	lock_descriptor_t *lock;
3909 	lock_descriptor_t *nlock = NULL; /* next lock */
3910 	int i;
3911 	graph_t *gp;
3912 	int	lock_nlmid;
3913 
3914 	for (i = 0; i < HASH_SIZE; i++) {
3915 		mutex_enter(&flock_lock);
3916 		gp = lock_graph[i];
3917 		mutex_exit(&flock_lock);
3918 		if (gp == NULL) {
3919 			continue;
3920 		}
3921 
3922 		mutex_enter(&gp->gp_mutex);
3923 		for (lock = SLEEPING_HEAD(gp)->l_next;
3924 		    lock != SLEEPING_HEAD(gp);
3925 		    lock = nlock) {
3926 			nlock = lock->l_next;
3927 			/*
3928 			 * If NLM server request _and_ nlmid of lock matches
3929 			 * nlmid of argument, then set the NLM state of the
3930 			 * lock to NLM_SHUTTING_DOWN, and wake up sleeping
3931 			 * request.
3932 			 */
3933 			if (IS_LOCKMGR(lock)) {
3934 				/* get NLM id */
3935 				lock_nlmid =
3936 				    GETNLMID(lock->l_flock.l_sysid);
3937 				if (nlmid == lock_nlmid) {
3938 					SET_NLM_STATE(lock,
3939 					    FLK_NLM_SHUTTING_DOWN);
3940 					INTERRUPT_WAKEUP(lock);
3941 				}
3942 			}
3943 		}
3944 		mutex_exit(&gp->gp_mutex);
3945 	}
3946 }
3947 
3948 /*
3949  * Requires: "nlmid" >= 1 and <= clconf_maximum_nodeid()
3950  * Effects:  Find all active (granted) lock manager locks _only_ for the
3951  *   NLM server identified by "nlmid" and release them.
3952  */
3953 static void
3954 cl_flk_unlock_nlm_granted(int nlmid)
3955 {
3956 	lock_descriptor_t *lock;
3957 	lock_descriptor_t *nlock = NULL; /* next lock */
3958 	int i;
3959 	graph_t *gp;
3960 	int	lock_nlmid;
3961 
3962 	for (i = 0; i < HASH_SIZE; i++) {
3963 		mutex_enter(&flock_lock);
3964 		gp = lock_graph[i];
3965 		mutex_exit(&flock_lock);
3966 		if (gp == NULL) {
3967 			continue;
3968 		}
3969 
3970 		mutex_enter(&gp->gp_mutex);
3971 		for (lock = ACTIVE_HEAD(gp)->l_next;
3972 		    lock != ACTIVE_HEAD(gp);
3973 		    lock = nlock) {
3974 			nlock = lock->l_next;
3975 			ASSERT(IS_ACTIVE(lock));
3976 
3977 			/*
3978 			 * If it's an  NLM server request _and_ nlmid of
3979 			 * the lock matches nlmid of argument, then
3980 			 * remove the active lock the list, wakup blocked
3981 			 * threads, and free the storage for the lock.
3982 			 * Note that there's no need to mark the NLM state
3983 			 * of this lock to NLM_DOWN because the lock will
3984 			 * be deleted anyway and its storage freed.
3985 			 */
3986 			if (IS_LOCKMGR(lock)) {
3987 				/* get NLM id */
3988 				lock_nlmid = GETNLMID(lock->l_flock.l_sysid);
3989 				if (nlmid == lock_nlmid) {
3990 					flk_delete_active_lock(lock, 0);
3991 					flk_wakeup(lock, 1);
3992 					flk_free_lock(lock);
3993 				}
3994 			}
3995 		}
3996 		mutex_exit(&gp->gp_mutex);
3997 	}
3998 }
3999 
4000 /*
4001  * Find all sleeping lock manager requests and poke them.
4002  */
4003 static void
4004 wakeup_sleeping_lockmgr_locks(struct flock_globals *fg)
4005 {
4006 	lock_descriptor_t *lock;
4007 	lock_descriptor_t *nlock = NULL; /* next lock */
4008 	int i;
4009 	graph_t *gp;
4010 	zoneid_t zoneid = getzoneid();
4011 
4012 	for (i = 0; i < HASH_SIZE; i++) {
4013 		mutex_enter(&flock_lock);
4014 		gp = lock_graph[i];
4015 		mutex_exit(&flock_lock);
4016 		if (gp == NULL) {
4017 			continue;
4018 		}
4019 
4020 		mutex_enter(&gp->gp_mutex);
4021 		fg->lockmgr_status[i] = FLK_WAKEUP_SLEEPERS;
4022 		for (lock = SLEEPING_HEAD(gp)->l_next;
4023 		    lock != SLEEPING_HEAD(gp);
4024 		    lock = nlock) {
4025 			nlock = lock->l_next;
4026 			if (IS_LOCKMGR(lock) && lock->l_zoneid == zoneid) {
4027 				INTERRUPT_WAKEUP(lock);
4028 			}
4029 		}
4030 		mutex_exit(&gp->gp_mutex);
4031 	}
4032 }
4033 
4034 
4035 /*
4036  * Find all active (granted) lock manager locks and release them.
4037  */
4038 static void
4039 unlock_lockmgr_granted(struct flock_globals *fg)
4040 {
4041 	lock_descriptor_t *lock;
4042 	lock_descriptor_t *nlock = NULL; /* next lock */
4043 	int i;
4044 	graph_t *gp;
4045 	zoneid_t zoneid = getzoneid();
4046 
4047 	for (i = 0; i < HASH_SIZE; i++) {
4048 		mutex_enter(&flock_lock);
4049 		gp = lock_graph[i];
4050 		mutex_exit(&flock_lock);
4051 		if (gp == NULL) {
4052 			continue;
4053 		}
4054 
4055 		mutex_enter(&gp->gp_mutex);
4056 		fg->lockmgr_status[i] = FLK_LOCKMGR_DOWN;
4057 		for (lock = ACTIVE_HEAD(gp)->l_next;
4058 		    lock != ACTIVE_HEAD(gp);
4059 		    lock = nlock) {
4060 			nlock = lock->l_next;
4061 			if (IS_LOCKMGR(lock) && lock->l_zoneid == zoneid) {
4062 				ASSERT(IS_ACTIVE(lock));
4063 				flk_delete_active_lock(lock, 0);
4064 				flk_wakeup(lock, 1);
4065 				flk_free_lock(lock);
4066 			}
4067 		}
4068 		mutex_exit(&gp->gp_mutex);
4069 	}
4070 }
4071 
4072 
4073 /*
4074  * Wait until a lock is granted, cancelled, or interrupted.
4075  */
4076 
4077 static void
4078 wait_for_lock(lock_descriptor_t *request)
4079 {
4080 	graph_t *gp = request->l_graph;
4081 
4082 	ASSERT(MUTEX_HELD(&gp->gp_mutex));
4083 
4084 	while (!(IS_GRANTED(request)) && !(IS_CANCELLED(request)) &&
4085 	    !(IS_INTERRUPTED(request))) {
4086 		if (!cv_wait_sig(&request->l_cv, &gp->gp_mutex)) {
4087 			flk_set_state(request, FLK_INTERRUPTED_STATE);
4088 			request->l_state |= INTERRUPTED_LOCK;
4089 		}
4090 	}
4091 }
4092 
4093 /*
4094  * Create an flock structure from the existing lock information
4095  *
4096  * This routine is used to create flock structures for the lock manager
4097  * to use in a reclaim request.  Since the lock was originated on this
4098  * host, it must be conforming to UNIX semantics, so no checking is
4099  * done to make sure it falls within the lower half of the 32-bit range.
4100  */
4101 
4102 static void
4103 create_flock(lock_descriptor_t *lp, flock64_t *flp)
4104 {
4105 	ASSERT(lp->l_end == MAX_U_OFFSET_T || lp->l_end <= MAXEND);
4106 	ASSERT(lp->l_end >= lp->l_start);
4107 
4108 	flp->l_type = lp->l_type;
4109 	flp->l_whence = 0;
4110 	flp->l_start = lp->l_start;
4111 	flp->l_len = (lp->l_end == MAX_U_OFFSET_T) ? 0 :
4112 	    (lp->l_end - lp->l_start + 1);
4113 	flp->l_sysid = lp->l_flock.l_sysid;
4114 	flp->l_pid = lp->l_flock.l_pid;
4115 }
4116 
4117 /*
4118  * Convert flock_t data describing a lock range into unsigned long starting
4119  * and ending points, which are put into lock_request.  Returns 0 or an
4120  * errno value.
4121  * Large Files: max is passed by the caller and we return EOVERFLOW
4122  * as defined by LFS API.
4123  */
4124 
4125 int
4126 flk_convert_lock_data(vnode_t *vp, flock64_t *flp,
4127     u_offset_t *start, u_offset_t *end, offset_t offset)
4128 {
4129 	struct vattr	vattr;
4130 	int	error;
4131 
4132 	/*
4133 	 * Determine the starting point of the request
4134 	 */
4135 	switch (flp->l_whence) {
4136 	case 0:		/* SEEK_SET */
4137 		*start = (u_offset_t)flp->l_start;
4138 		break;
4139 	case 1:		/* SEEK_CUR */
4140 		*start = (u_offset_t)(flp->l_start + offset);
4141 		break;
4142 	case 2:		/* SEEK_END */
4143 		vattr.va_mask = AT_SIZE;
4144 		if (error = VOP_GETATTR(vp, &vattr, 0, CRED(), NULL))
4145 			return (error);
4146 		*start = (u_offset_t)(flp->l_start + vattr.va_size);
4147 		break;
4148 	default:
4149 		return (EINVAL);
4150 	}
4151 
4152 	/*
4153 	 * Determine the range covered by the request.
4154 	 */
4155 	if (flp->l_len == 0)
4156 		*end = MAX_U_OFFSET_T;
4157 	else if ((offset_t)flp->l_len > 0) {
4158 		*end = (u_offset_t)(*start + (flp->l_len - 1));
4159 	} else {
4160 		/*
4161 		 * Negative length; why do we even allow this ?
4162 		 * Because this allows easy specification of
4163 		 * the last n bytes of the file.
4164 		 */
4165 		*end = *start;
4166 		*start += (u_offset_t)flp->l_len;
4167 		(*start)++;
4168 	}
4169 	return (0);
4170 }
4171 
4172 /*
4173  * Check the validity of lock data.  This can used by the NFS
4174  * frlock routines to check data before contacting the server.  The
4175  * server must support semantics that aren't as restrictive as
4176  * the UNIX API, so the NFS client is required to check.
4177  * The maximum is now passed in by the caller.
4178  */
4179 
4180 int
4181 flk_check_lock_data(u_offset_t start, u_offset_t end, offset_t max)
4182 {
4183 	/*
4184 	 * The end (length) for local locking should never be greater
4185 	 * than MAXEND. However, the representation for
4186 	 * the entire file is MAX_U_OFFSET_T.
4187 	 */
4188 	if ((start > max) ||
4189 	    ((end > max) && (end != MAX_U_OFFSET_T))) {
4190 		return (EINVAL);
4191 	}
4192 	if (start > end) {
4193 		return (EINVAL);
4194 	}
4195 	return (0);
4196 }
4197 
4198 /*
4199  * Fill in request->l_flock with information about the lock blocking the
4200  * request.  The complexity here is that lock manager requests are allowed
4201  * to see into the upper part of the 32-bit address range, whereas local
4202  * requests are only allowed to see signed values.
4203  *
4204  * What should be done when "blocker" is a lock manager lock that uses the
4205  * upper portion of the 32-bit range, but "request" is local?  Since the
4206  * request has already been determined to have been blocked by the blocker,
4207  * at least some portion of "blocker" must be in the range of the request,
4208  * or the request extends to the end of file.  For the first case, the
4209  * portion in the lower range is returned with the indication that it goes
4210  * "to EOF."  For the second case, the last byte of the lower range is
4211  * returned with the indication that it goes "to EOF."
4212  */
4213 
4214 static void
4215 report_blocker(lock_descriptor_t *blocker, lock_descriptor_t *request)
4216 {
4217 	flock64_t *flrp;			/* l_flock portion of request */
4218 
4219 	ASSERT(blocker != NULL);
4220 
4221 	flrp = &request->l_flock;
4222 	flrp->l_whence = 0;
4223 	flrp->l_type = blocker->l_type;
4224 	flrp->l_pid = blocker->l_flock.l_pid;
4225 	flrp->l_sysid = blocker->l_flock.l_sysid;
4226 	request->l_ofd = blocker->l_ofd;
4227 
4228 	if (IS_LOCKMGR(request)) {
4229 		flrp->l_start = blocker->l_start;
4230 		if (blocker->l_end == MAX_U_OFFSET_T)
4231 			flrp->l_len = 0;
4232 		else
4233 			flrp->l_len = blocker->l_end - blocker->l_start + 1;
4234 	} else {
4235 		if (blocker->l_start > MAXEND) {
4236 			flrp->l_start = MAXEND;
4237 			flrp->l_len = 0;
4238 		} else {
4239 			flrp->l_start = blocker->l_start;
4240 			if (blocker->l_end == MAX_U_OFFSET_T)
4241 				flrp->l_len = 0;
4242 			else
4243 				flrp->l_len = blocker->l_end -
4244 				    blocker->l_start + 1;
4245 		}
4246 	}
4247 }
4248 
4249 /*
4250  * PSARC case 1997/292
4251  */
4252 /*
4253  * This is the public routine exported by flock.h.
4254  */
4255 void
4256 cl_flk_change_nlm_state_to_unknown(int nlmid)
4257 {
4258 	/*
4259 	 * Check to see if node is booted as a cluster. If not, return.
4260 	 */
4261 	if ((cluster_bootflags & CLUSTER_BOOTED) == 0) {
4262 		return;
4263 	}
4264 
4265 	/*
4266 	 * See comment in cl_flk_set_nlm_status().
4267 	 */
4268 	if (nlm_reg_status == NULL) {
4269 		return;
4270 	}
4271 
4272 	/*
4273 	 * protect NLM registry state with a mutex.
4274 	 */
4275 	ASSERT(nlmid <= nlm_status_size && nlmid >= 0);
4276 	mutex_enter(&nlm_reg_lock);
4277 	FLK_REGISTRY_CHANGE_NLM_STATE(nlm_reg_status, nlmid, FLK_NLM_UNKNOWN);
4278 	mutex_exit(&nlm_reg_lock);
4279 }
4280 
4281 /*
4282  * Return non-zero if the given I/O request conflicts with an active NBMAND
4283  * lock.
4284  * If svmand is non-zero, it means look at all active locks, not just NBMAND
4285  * locks.
4286  */
4287 
4288 int
4289 nbl_lock_conflict(vnode_t *vp, nbl_op_t op, u_offset_t offset,
4290 		ssize_t length, int svmand, caller_context_t *ct)
4291 {
4292 	int conflict = 0;
4293 	graph_t			*gp;
4294 	lock_descriptor_t	*lock;
4295 	pid_t pid;
4296 	int sysid;
4297 
4298 	if (ct == NULL) {
4299 		pid = curproc->p_pid;
4300 		sysid = 0;
4301 	} else {
4302 		pid = ct->cc_pid;
4303 		sysid = ct->cc_sysid;
4304 	}
4305 
4306 	mutex_enter(&flock_lock);
4307 	gp = lock_graph[HASH_INDEX(vp)];
4308 	mutex_exit(&flock_lock);
4309 	if (gp == NULL)
4310 		return (0);
4311 
4312 	mutex_enter(&gp->gp_mutex);
4313 	SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
4314 
4315 	for (; lock && lock->l_vnode == vp; lock = lock->l_next) {
4316 		if ((svmand || (lock->l_state & NBMAND_LOCK)) &&
4317 		    (lock->l_flock.l_sysid != sysid ||
4318 		    lock->l_flock.l_pid != pid) &&
4319 		    lock_blocks_io(op, offset, length,
4320 		    lock->l_type, lock->l_start, lock->l_end)) {
4321 			conflict = 1;
4322 			break;
4323 		}
4324 	}
4325 	mutex_exit(&gp->gp_mutex);
4326 
4327 	return (conflict);
4328 }
4329 
4330 /*
4331  * Return non-zero if the given I/O request conflicts with the given lock.
4332  */
4333 
4334 static int
4335 lock_blocks_io(nbl_op_t op, u_offset_t offset, ssize_t length,
4336 	    int lock_type, u_offset_t lock_start, u_offset_t lock_end)
4337 {
4338 	ASSERT(op == NBL_READ || op == NBL_WRITE || op == NBL_READWRITE);
4339 	ASSERT(lock_type == F_RDLCK || lock_type == F_WRLCK);
4340 
4341 	if (op == NBL_READ && lock_type == F_RDLCK)
4342 		return (0);
4343 
4344 	if (offset <= lock_start && lock_start < offset + length)
4345 		return (1);
4346 	if (lock_start <= offset && offset <= lock_end)
4347 		return (1);
4348 
4349 	return (0);
4350 }
4351 
4352 #ifdef DEBUG
4353 static void
4354 check_active_locks(graph_t *gp)
4355 {
4356 	lock_descriptor_t *lock, *lock1;
4357 	edge_t	*ep;
4358 
4359 	for (lock = ACTIVE_HEAD(gp)->l_next; lock != ACTIVE_HEAD(gp);
4360 	    lock = lock->l_next) {
4361 		ASSERT(IS_ACTIVE(lock));
4362 		ASSERT(NOT_BLOCKED(lock));
4363 		ASSERT(!IS_BARRIER(lock));
4364 
4365 		ep = FIRST_IN(lock);
4366 
4367 		while (ep != HEAD(lock)) {
4368 			ASSERT(IS_SLEEPING(ep->from_vertex));
4369 			ASSERT(!NOT_BLOCKED(ep->from_vertex));
4370 			ep = NEXT_IN(ep);
4371 		}
4372 
4373 		for (lock1 = lock->l_next; lock1 != ACTIVE_HEAD(gp);
4374 		    lock1 = lock1->l_next) {
4375 			if (lock1->l_vnode == lock->l_vnode) {
4376 			if (BLOCKS(lock1, lock)) {
4377 				cmn_err(CE_PANIC,
4378 				    "active lock %p blocks %p",
4379 				    (void *)lock1, (void *)lock);
4380 			} else if (BLOCKS(lock, lock1)) {
4381 				cmn_err(CE_PANIC,
4382 				    "active lock %p blocks %p",
4383 				    (void *)lock, (void *)lock1);
4384 			}
4385 			}
4386 		}
4387 	}
4388 }
4389 
4390 /*
4391  * Effect: This functions checks to see if the transition from 'old_state' to
4392  *	'new_state' is a valid one.  It returns 0 if the transition is valid
4393  *	and 1 if it is not.
4394  *	For a map of valid transitions, see sys/flock_impl.h
4395  */
4396 static int
4397 check_lock_transition(int old_state, int new_state)
4398 {
4399 	switch (old_state) {
4400 	case FLK_INITIAL_STATE:
4401 		if ((new_state == FLK_START_STATE) ||
4402 		    (new_state == FLK_SLEEPING_STATE) ||
4403 		    (new_state == FLK_ACTIVE_STATE) ||
4404 		    (new_state == FLK_DEAD_STATE)) {
4405 			return (0);
4406 		} else {
4407 			return (1);
4408 		}
4409 	case FLK_START_STATE:
4410 		if ((new_state == FLK_ACTIVE_STATE) ||
4411 		    (new_state == FLK_DEAD_STATE)) {
4412 			return (0);
4413 		} else {
4414 			return (1);
4415 		}
4416 	case FLK_ACTIVE_STATE:
4417 		if (new_state == FLK_DEAD_STATE) {
4418 			return (0);
4419 		} else {
4420 			return (1);
4421 		}
4422 	case FLK_SLEEPING_STATE:
4423 		if ((new_state == FLK_GRANTED_STATE) ||
4424 		    (new_state == FLK_INTERRUPTED_STATE) ||
4425 		    (new_state == FLK_CANCELLED_STATE)) {
4426 			return (0);
4427 		} else {
4428 			return (1);
4429 		}
4430 	case FLK_GRANTED_STATE:
4431 		if ((new_state == FLK_START_STATE) ||
4432 		    (new_state == FLK_INTERRUPTED_STATE) ||
4433 		    (new_state == FLK_CANCELLED_STATE)) {
4434 			return (0);
4435 		} else {
4436 			return (1);
4437 		}
4438 	case FLK_CANCELLED_STATE:
4439 		if ((new_state == FLK_INTERRUPTED_STATE) ||
4440 		    (new_state == FLK_DEAD_STATE)) {
4441 			return (0);
4442 		} else {
4443 			return (1);
4444 		}
4445 	case FLK_INTERRUPTED_STATE:
4446 		if (new_state == FLK_DEAD_STATE) {
4447 			return (0);
4448 		} else {
4449 			return (1);
4450 		}
4451 	case FLK_DEAD_STATE:
4452 		/* May be set more than once */
4453 		if (new_state == FLK_DEAD_STATE) {
4454 			return (0);
4455 		} else {
4456 			return (1);
4457 		}
4458 	default:
4459 		return (1);
4460 	}
4461 }
4462 
4463 static void
4464 check_sleeping_locks(graph_t *gp)
4465 {
4466 	lock_descriptor_t *lock1, *lock2;
4467 	edge_t *ep;
4468 	for (lock1 = SLEEPING_HEAD(gp)->l_next; lock1 != SLEEPING_HEAD(gp);
4469 	    lock1 = lock1->l_next) {
4470 				ASSERT(!IS_BARRIER(lock1));
4471 	for (lock2 = lock1->l_next; lock2 != SLEEPING_HEAD(gp);
4472 	    lock2 = lock2->l_next) {
4473 		if (lock1->l_vnode == lock2->l_vnode) {
4474 			if (BLOCKS(lock2, lock1)) {
4475 				ASSERT(!IS_GRANTED(lock1));
4476 				ASSERT(!NOT_BLOCKED(lock1));
4477 				path(lock1, lock2);
4478 			}
4479 		}
4480 	}
4481 
4482 	for (lock2 = ACTIVE_HEAD(gp)->l_next; lock2 != ACTIVE_HEAD(gp);
4483 	    lock2 = lock2->l_next) {
4484 				ASSERT(!IS_BARRIER(lock1));
4485 		if (lock1->l_vnode == lock2->l_vnode) {
4486 			if (BLOCKS(lock2, lock1)) {
4487 				ASSERT(!IS_GRANTED(lock1));
4488 				ASSERT(!NOT_BLOCKED(lock1));
4489 				path(lock1, lock2);
4490 			}
4491 		}
4492 	}
4493 	ep = FIRST_ADJ(lock1);
4494 	while (ep != HEAD(lock1)) {
4495 		ASSERT(BLOCKS(ep->to_vertex, lock1));
4496 		ep = NEXT_ADJ(ep);
4497 	}
4498 	}
4499 }
4500 
4501 static int
4502 level_two_path(lock_descriptor_t *lock1, lock_descriptor_t *lock2, int no_path)
4503 {
4504 	edge_t	*ep;
4505 	lock_descriptor_t	*vertex;
4506 	lock_descriptor_t *vertex_stack;
4507 
4508 	STACK_INIT(vertex_stack);
4509 
4510 	flk_graph_uncolor(lock1->l_graph);
4511 	ep = FIRST_ADJ(lock1);
4512 	ASSERT(ep != HEAD(lock1));
4513 	while (ep != HEAD(lock1)) {
4514 		if (no_path)
4515 			ASSERT(ep->to_vertex != lock2);
4516 		STACK_PUSH(vertex_stack, ep->to_vertex, l_dstack);
4517 		COLOR(ep->to_vertex);
4518 		ep = NEXT_ADJ(ep);
4519 	}
4520 
4521 	while ((vertex = STACK_TOP(vertex_stack)) != NULL) {
4522 		STACK_POP(vertex_stack, l_dstack);
4523 		for (ep = FIRST_ADJ(vertex); ep != HEAD(vertex);
4524 		    ep = NEXT_ADJ(ep)) {
4525 			if (COLORED(ep->to_vertex))
4526 				continue;
4527 			COLOR(ep->to_vertex);
4528 			if (ep->to_vertex == lock2)
4529 				return (1);
4530 
4531 			STACK_PUSH(vertex_stack, ep->to_vertex, l_dstack);
4532 		}
4533 	}
4534 	return (0);
4535 }
4536 
4537 static void
4538 check_owner_locks(graph_t *gp, pid_t pid, int sysid, vnode_t *vp)
4539 {
4540 	lock_descriptor_t *lock;
4541 
4542 	/* Ignore OFD style locks since they're not process-wide. */
4543 	if (pid == 0)
4544 		return;
4545 
4546 	SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
4547 
4548 	if (lock) {
4549 		while (lock != ACTIVE_HEAD(gp) && (lock->l_vnode == vp)) {
4550 			if (lock->l_flock.l_pid == pid &&
4551 			    lock->l_flock.l_sysid == sysid)
4552 				cmn_err(CE_PANIC,
4553 				    "owner pid %d's lock %p in active queue",
4554 				    pid, (void *)lock);
4555 			lock = lock->l_next;
4556 		}
4557 	}
4558 	SET_LOCK_TO_FIRST_SLEEP_VP(gp, lock, vp);
4559 
4560 	if (lock) {
4561 		while (lock != SLEEPING_HEAD(gp) && (lock->l_vnode == vp)) {
4562 			if (lock->l_flock.l_pid == pid &&
4563 			    lock->l_flock.l_sysid == sysid)
4564 				cmn_err(CE_PANIC,
4565 				    "owner pid %d's lock %p in sleep queue",
4566 				    pid, (void *)lock);
4567 			lock = lock->l_next;
4568 		}
4569 	}
4570 }
4571 
4572 static int
4573 level_one_path(lock_descriptor_t *lock1, lock_descriptor_t *lock2)
4574 {
4575 	edge_t *ep = FIRST_ADJ(lock1);
4576 
4577 	while (ep != HEAD(lock1)) {
4578 		if (ep->to_vertex == lock2)
4579 			return (1);
4580 		else
4581 			ep = NEXT_ADJ(ep);
4582 	}
4583 	return (0);
4584 }
4585 
4586 static int
4587 no_path(lock_descriptor_t *lock1, lock_descriptor_t *lock2)
4588 {
4589 	return (!level_two_path(lock1, lock2, 1));
4590 }
4591 
4592 static void
4593 path(lock_descriptor_t *lock1, lock_descriptor_t *lock2)
4594 {
4595 	if (level_one_path(lock1, lock2)) {
4596 		if (level_two_path(lock1, lock2, 0) != 0) {
4597 			cmn_err(CE_WARN,
4598 			    "one edge one path from lock1 %p lock2 %p",
4599 			    (void *)lock1, (void *)lock2);
4600 		}
4601 	} else if (no_path(lock1, lock2)) {
4602 		cmn_err(CE_PANIC,
4603 		    "No path from  lock1 %p to lock2 %p",
4604 		    (void *)lock1, (void *)lock2);
4605 	}
4606 }
4607 #endif /* DEBUG */
4608