xref: /titanic_50/usr/src/uts/common/os/flock.c (revision d47ced1f1801c1c1ca309ff7c5997bf0c8cb4092)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 /*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
28 /*	All Rights Reserved */
29 
30 /*
31  * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
32  */
33 
34 #include <sys/flock_impl.h>
35 #include <sys/vfs.h>
36 #include <sys/t_lock.h>		/* for <sys/callb.h> */
37 #include <sys/callb.h>
38 #include <sys/clconf.h>
39 #include <sys/cladm.h>
40 #include <sys/nbmlock.h>
41 #include <sys/cred.h>
42 #include <sys/policy.h>
43 
/*
 * Allocation and free counters, kept for statistics purposes only.
 * None of them is protected by a lock, so a value may be slightly off,
 * but it will at least be close to the actual count.
 */

/* Lock descriptors: bumped in flk_get_lock() / flk_free_lock(). */
int	flk_lock_allocs;
int	flk_lock_frees;

/* Dependency-graph edges. */
int	edge_allocs;
int	edge_frees;

/* Process-graph vertices and edges. */
int	flk_proc_vertex_allocs;
int	flk_proc_edge_allocs;
int	flk_proc_vertex_frees;
int	flk_proc_edge_frees;
58 
59 static kmutex_t flock_lock;
60 
61 #ifdef DEBUG
62 int check_debug = 0;
63 #define	CHECK_ACTIVE_LOCKS(gp)	if (check_debug) \
64 					check_active_locks(gp);
65 #define	CHECK_SLEEPING_LOCKS(gp)	if (check_debug) \
66 						check_sleeping_locks(gp);
67 #define	CHECK_OWNER_LOCKS(gp, pid, sysid, vp) 	\
68 		if (check_debug)	\
69 			check_owner_locks(gp, pid, sysid, vp);
70 #define	CHECK_LOCK_TRANSITION(old_state, new_state) \
71 	{ \
72 		if (check_lock_transition(old_state, new_state)) { \
73 			cmn_err(CE_PANIC, "Illegal lock transition \
74 			    from %d to %d", old_state, new_state); \
75 		} \
76 	}
77 #else
78 
79 #define	CHECK_ACTIVE_LOCKS(gp)
80 #define	CHECK_SLEEPING_LOCKS(gp)
81 #define	CHECK_OWNER_LOCKS(gp, pid, sysid, vp)
82 #define	CHECK_LOCK_TRANSITION(old_state, new_state)
83 
84 #endif /* DEBUG */
85 
86 struct kmem_cache	*flk_edge_cache;
87 
88 graph_t		*lock_graph[HASH_SIZE];
89 proc_graph_t	pgraph;
90 
91 /*
92  * Clustering.
93  *
94  * NLM REGISTRY TYPE IMPLEMENTATION
95  *
96  * Assumptions:
97  *  1.  Nodes in a cluster are numbered starting at 1; always non-negative
98  *	integers; maximum node id is returned by clconf_maximum_nodeid().
99  *  2.  We use this node id to identify the node an NLM server runs on.
100  */
101 
102 /*
103  * NLM registry object keeps track of NLM servers via their
104  * nlmids (which are the node ids of the node in the cluster they run on)
105  * that have requested locks at this LLM with which this registry is
106  * associated.
107  *
108  * Representation of abstraction:
109  *    rep = record[	states: array[nlm_state],
110  *			lock: mutex]
111  *
112  *    Representation invariants:
113  *	1. index i of rep.states is between 0 and n - 1 where n is number
114  *	   of elements in the array, which happen to be the maximum number
115  *	   of nodes in the cluster configuration + 1.
116  *	2. map nlmid to index i of rep.states
117  *		0   -> 0
118  *		1   -> 1
119  *		2   -> 2
 *		clconf_maximum_nodeid() -> n-1
121  *	3.  This 1-1 mapping is quite convenient and it avoids errors resulting
122  *	    from forgetting to subtract 1 from the index.
123  *	4.  The reason we keep the 0th index is the following.  A legitimate
124  *	    cluster configuration includes making a UFS file system NFS
125  *	    exportable.  The code is structured so that if you're in a cluster
126  *	    you do one thing; otherwise, you do something else.  The problem
127  *	    is what to do if you think you're in a cluster with PXFS loaded,
128  *	    but you're using UFS not PXFS?  The upper two bytes of the sysid
129  *	    encode the node id of the node where NLM server runs; these bytes
130  *	    are zero for UFS.  Since the nodeid is used to index into the
131  *	    registry, we can record the NLM server state information at index
132  *	    0 using the same mechanism used for PXFS file locks!
133  */
134 static flk_nlm_status_t *nlm_reg_status = NULL;	/* state array 0..N-1 */
135 static kmutex_t nlm_reg_lock;			/* lock to protect arrary */
136 static uint_t nlm_status_size;			/* size of state array */
137 
138 /*
139  * Although we need a global lock dependency graph (and associated data
140  * structures), we also need a per-zone notion of whether the lock manager is
141  * running, and so whether to allow lock manager requests or not.
142  *
143  * Thus, on a per-zone basis we maintain a ``global'' variable
144  * (flk_lockmgr_status), protected by flock_lock, and set when the lock
145  * manager is determined to be changing state (starting or stopping).
146  *
147  * Each graph/zone pair also has a copy of this variable, which is protected by
148  * the graph's mutex.
149  *
150  * The per-graph copies are used to synchronize lock requests with shutdown
151  * requests.  The global copy is used to initialize the per-graph field when a
152  * new graph is created.
153  */
154 struct flock_globals {
155 	flk_lockmgr_status_t flk_lockmgr_status;
156 	flk_lockmgr_status_t lockmgr_status[HASH_SIZE];
157 };
158 
159 zone_key_t flock_zone_key;
160 
161 static void create_flock(lock_descriptor_t *, flock64_t *);
162 static lock_descriptor_t	*flk_get_lock(void);
163 static void	flk_free_lock(lock_descriptor_t	*lock);
164 static void	flk_get_first_blocking_lock(lock_descriptor_t *request);
165 static int flk_process_request(lock_descriptor_t *);
166 static int flk_add_edge(lock_descriptor_t *, lock_descriptor_t *, int, int);
167 static edge_t *flk_get_edge(void);
168 static int flk_wait_execute_request(lock_descriptor_t *);
169 static int flk_relation(lock_descriptor_t *, lock_descriptor_t *);
170 static void flk_insert_active_lock(lock_descriptor_t *);
171 static void flk_delete_active_lock(lock_descriptor_t *, int);
172 static void flk_insert_sleeping_lock(lock_descriptor_t *);
173 static void flk_graph_uncolor(graph_t *);
174 static void flk_wakeup(lock_descriptor_t *, int);
175 static void flk_free_edge(edge_t *);
176 static void flk_recompute_dependencies(lock_descriptor_t *,
177 			lock_descriptor_t **,  int, int);
178 static int flk_find_barriers(lock_descriptor_t *);
179 static void flk_update_barriers(lock_descriptor_t *);
180 static int flk_color_reachables(lock_descriptor_t *);
181 static int flk_canceled(lock_descriptor_t *);
182 static void flk_delete_locks_by_sysid(lock_descriptor_t *);
183 static void report_blocker(lock_descriptor_t *, lock_descriptor_t *);
184 static void wait_for_lock(lock_descriptor_t *);
185 static void unlock_lockmgr_granted(struct flock_globals *);
186 static void wakeup_sleeping_lockmgr_locks(struct flock_globals *);
187 
188 /* Clustering hooks */
189 static void cl_flk_change_nlm_state_all_locks(int, flk_nlm_status_t);
190 static void cl_flk_wakeup_sleeping_nlm_locks(int);
191 static void cl_flk_unlock_nlm_granted(int);
192 
193 #ifdef DEBUG
194 static int check_lock_transition(int, int);
195 static void check_sleeping_locks(graph_t *);
196 static void check_active_locks(graph_t *);
197 static int no_path(lock_descriptor_t *, lock_descriptor_t *);
198 static void path(lock_descriptor_t *, lock_descriptor_t *);
199 static void check_owner_locks(graph_t *, pid_t, int, vnode_t *);
200 static int level_one_path(lock_descriptor_t *, lock_descriptor_t *);
201 static int level_two_path(lock_descriptor_t *, lock_descriptor_t *, int);
202 #endif
203 
204 /*	proc_graph function definitions */
205 static int flk_check_deadlock(lock_descriptor_t *);
206 static void flk_proc_graph_uncolor(void);
207 static proc_vertex_t *flk_get_proc_vertex(lock_descriptor_t *);
208 static proc_edge_t *flk_get_proc_edge(void);
209 static void flk_proc_release(proc_vertex_t *);
210 static void flk_free_proc_edge(proc_edge_t *);
211 static void flk_update_proc_graph(edge_t *, int);
212 
213 /* Non-blocking mandatory locking */
214 static int lock_blocks_io(nbl_op_t, u_offset_t, ssize_t, int, u_offset_t,
215 			u_offset_t);
216 
217 static struct flock_globals *
218 flk_get_globals(void)
219 {
220 	/*
221 	 * The KLM module had better be loaded if we're attempting to handle
222 	 * lockmgr requests.
223 	 */
224 	ASSERT(flock_zone_key != ZONE_KEY_UNINITIALIZED);
225 	return (zone_getspecific(flock_zone_key, curproc->p_zone));
226 }
227 
228 static flk_lockmgr_status_t
229 flk_get_lockmgr_status(void)
230 {
231 	struct flock_globals *fg;
232 
233 	ASSERT(MUTEX_HELD(&flock_lock));
234 
235 	if (flock_zone_key == ZONE_KEY_UNINITIALIZED) {
236 		/*
237 		 * KLM module not loaded; lock manager definitely not running.
238 		 */
239 		return (FLK_LOCKMGR_DOWN);
240 	}
241 	fg = flk_get_globals();
242 	return (fg->flk_lockmgr_status);
243 }
244 
245 /*
246  * Routine called from fs_frlock in fs/fs_subr.c
247  */
248 
249 int
250 reclock(vnode_t		*vp,
251 	flock64_t	*lckdat,
252 	int		cmd,
253 	int		flag,
254 	u_offset_t	offset,
255 	flk_callback_t	*flk_cbp)
256 {
257 	lock_descriptor_t	stack_lock_request;
258 	lock_descriptor_t	*lock_request;
259 	int error = 0;
260 	graph_t	*gp;
261 	int			nlmid;
262 
263 	/*
264 	 * Check access permissions
265 	 */
266 	if ((cmd & SETFLCK) &&
267 	    ((lckdat->l_type == F_RDLCK && (flag & FREAD) == 0) ||
268 	    (lckdat->l_type == F_WRLCK && (flag & FWRITE) == 0)))
269 			return (EBADF);
270 
271 	/*
272 	 * for query and unlock we use the stack_lock_request
273 	 */
274 
275 	if ((lckdat->l_type == F_UNLCK) ||
276 	    !((cmd & INOFLCK) || (cmd & SETFLCK))) {
277 		lock_request = &stack_lock_request;
278 		(void) bzero((caddr_t)lock_request,
279 		    sizeof (lock_descriptor_t));
280 
281 		/*
282 		 * following is added to make the assertions in
283 		 * flk_execute_request() to pass through
284 		 */
285 
286 		lock_request->l_edge.edge_in_next = &lock_request->l_edge;
287 		lock_request->l_edge.edge_in_prev = &lock_request->l_edge;
288 		lock_request->l_edge.edge_adj_next = &lock_request->l_edge;
289 		lock_request->l_edge.edge_adj_prev = &lock_request->l_edge;
290 		lock_request->l_status = FLK_INITIAL_STATE;
291 	} else {
292 		lock_request = flk_get_lock();
293 	}
294 	lock_request->l_state = 0;
295 	lock_request->l_vnode = vp;
296 	lock_request->l_zoneid = getzoneid();
297 
298 	/*
299 	 * Convert the request range into the canonical start and end
300 	 * values.  The NLM protocol supports locking over the entire
301 	 * 32-bit range, so there's no range checking for remote requests,
302 	 * but we still need to verify that local requests obey the rules.
303 	 */
304 	/* Clustering */
305 	if ((cmd & (RCMDLCK | PCMDLCK)) != 0) {
306 		ASSERT(lckdat->l_whence == 0);
307 		lock_request->l_start = lckdat->l_start;
308 		lock_request->l_end = (lckdat->l_len == 0) ? MAX_U_OFFSET_T :
309 		    lckdat->l_start + (lckdat->l_len - 1);
310 	} else {
311 		/* check the validity of the lock range */
312 		error = flk_convert_lock_data(vp, lckdat,
313 		    &lock_request->l_start, &lock_request->l_end,
314 		    offset);
315 		if (error) {
316 			goto done;
317 		}
318 		error = flk_check_lock_data(lock_request->l_start,
319 		    lock_request->l_end, MAXEND);
320 		if (error) {
321 			goto done;
322 		}
323 	}
324 
325 	ASSERT(lock_request->l_end >= lock_request->l_start);
326 
327 	lock_request->l_type = lckdat->l_type;
328 	if (cmd & INOFLCK)
329 		lock_request->l_state |= IO_LOCK;
330 	if (cmd & SLPFLCK)
331 		lock_request->l_state |= WILLING_TO_SLEEP_LOCK;
332 	if (cmd & RCMDLCK)
333 		lock_request->l_state |= LOCKMGR_LOCK;
334 	if (cmd & NBMLCK)
335 		lock_request->l_state |= NBMAND_LOCK;
336 	/*
337 	 * Clustering: set flag for PXFS locks
338 	 * We do not _only_ check for the PCMDLCK flag because PXFS locks could
339 	 * also be of type 'RCMDLCK'.
340 	 * We do not _only_ check the GETPXFSID() macro because local PXFS
341 	 * clients use a pxfsid of zero to permit deadlock detection in the LLM.
342 	 */
343 
344 	if ((cmd & PCMDLCK) || (GETPXFSID(lckdat->l_sysid) != 0)) {
345 		lock_request->l_state |= PXFS_LOCK;
346 	}
347 	if (!((cmd & SETFLCK) || (cmd & INOFLCK))) {
348 		if (lock_request->l_type == F_RDLCK ||
349 		    lock_request->l_type == F_WRLCK)
350 			lock_request->l_state |= QUERY_LOCK;
351 	}
352 	lock_request->l_flock = (*lckdat);
353 	lock_request->l_callbacks = flk_cbp;
354 
355 	/*
356 	 * We are ready for processing the request
357 	 */
358 	if (IS_LOCKMGR(lock_request)) {
359 		/*
360 		 * If the lock request is an NLM server request ....
361 		 */
362 		if (nlm_status_size == 0) { /* not booted as cluster */
363 			mutex_enter(&flock_lock);
364 			/*
365 			 * Bail out if this is a lock manager request and the
366 			 * lock manager is not supposed to be running.
367 			 */
368 			if (flk_get_lockmgr_status() != FLK_LOCKMGR_UP) {
369 				mutex_exit(&flock_lock);
370 				error = ENOLCK;
371 				goto done;
372 			}
373 			mutex_exit(&flock_lock);
374 		} else {			/* booted as a cluster */
375 			nlmid = GETNLMID(lock_request->l_flock.l_sysid);
376 			ASSERT(nlmid <= nlm_status_size && nlmid >= 0);
377 
378 			mutex_enter(&nlm_reg_lock);
379 			/*
380 			 * If the NLM registry does not know about this
381 			 * NLM server making the request, add its nlmid
382 			 * to the registry.
383 			 */
384 			if (FLK_REGISTRY_IS_NLM_UNKNOWN(nlm_reg_status,
385 			    nlmid)) {
386 				FLK_REGISTRY_ADD_NLMID(nlm_reg_status, nlmid);
387 			} else if (!FLK_REGISTRY_IS_NLM_UP(nlm_reg_status,
388 			    nlmid)) {
389 				/*
390 				 * If the NLM server is already known (has made
391 				 * previous lock requests) and its state is
392 				 * not NLM_UP (means that NLM server is
393 				 * shutting down), then bail out with an
394 				 * error to deny the lock request.
395 				 */
396 				mutex_exit(&nlm_reg_lock);
397 				error = ENOLCK;
398 				goto done;
399 			}
400 			mutex_exit(&nlm_reg_lock);
401 		}
402 	}
403 
404 	/* Now get the lock graph for a particular vnode */
405 	gp = flk_get_lock_graph(vp, FLK_INIT_GRAPH);
406 
407 	/*
408 	 * We drop rwlock here otherwise this might end up causing a
409 	 * deadlock if this IOLOCK sleeps. (bugid # 1183392).
410 	 */
411 
412 	if (IS_IO_LOCK(lock_request)) {
413 		VOP_RWUNLOCK(vp,
414 		    (lock_request->l_type == F_RDLCK) ?
415 		    V_WRITELOCK_FALSE : V_WRITELOCK_TRUE, NULL);
416 	}
417 	mutex_enter(&gp->gp_mutex);
418 
419 	lock_request->l_state |= REFERENCED_LOCK;
420 	lock_request->l_graph = gp;
421 
422 	switch (lock_request->l_type) {
423 	case F_RDLCK:
424 	case F_WRLCK:
425 		if (IS_QUERY_LOCK(lock_request)) {
426 			flk_get_first_blocking_lock(lock_request);
427 			(*lckdat) = lock_request->l_flock;
428 			break;
429 		}
430 
431 		/* process the request now */
432 
433 		error = flk_process_request(lock_request);
434 		break;
435 
436 	case F_UNLCK:
437 		/* unlock request will not block so execute it immediately */
438 
439 		if (IS_LOCKMGR(lock_request) &&
440 		    flk_canceled(lock_request)) {
441 			error = 0;
442 		} else {
443 			error = flk_execute_request(lock_request);
444 		}
445 		break;
446 
447 	case F_UNLKSYS:
448 		/*
449 		 * Recovery mechanism to release lock manager locks when
450 		 * NFS client crashes and restart. NFS server will clear
451 		 * old locks and grant new locks.
452 		 */
453 
454 		if (lock_request->l_flock.l_sysid == 0) {
455 			mutex_exit(&gp->gp_mutex);
456 			return (EINVAL);
457 		}
458 		if (secpolicy_nfs(CRED()) != 0) {
459 			mutex_exit(&gp->gp_mutex);
460 			return (EPERM);
461 		}
462 		flk_delete_locks_by_sysid(lock_request);
463 		lock_request->l_state &= ~REFERENCED_LOCK;
464 		flk_set_state(lock_request, FLK_DEAD_STATE);
465 		flk_free_lock(lock_request);
466 		mutex_exit(&gp->gp_mutex);
467 		return (0);
468 
469 	default:
470 		error = EINVAL;
471 		break;
472 	}
473 
474 	/* Clustering: For blocked PXFS locks, return */
475 	if (error == PXFS_LOCK_BLOCKED) {
476 		lock_request->l_state &= ~REFERENCED_LOCK;
477 		mutex_exit(&gp->gp_mutex);
478 		return (error);
479 	}
480 
481 	/*
482 	 * Now that we have seen the status of locks in the system for
483 	 * this vnode we acquire the rwlock if it is an IO_LOCK.
484 	 */
485 
486 	if (IS_IO_LOCK(lock_request)) {
487 		(void) VOP_RWLOCK(vp,
488 		    (lock_request->l_type == F_RDLCK) ?
489 		    V_WRITELOCK_FALSE : V_WRITELOCK_TRUE, NULL);
490 		if (!error) {
491 			lckdat->l_type = F_UNLCK;
492 
493 			/*
494 			 * This wake up is needed otherwise
495 			 * if IO_LOCK has slept the dependents on this
496 			 * will not be woken up at all. (bugid # 1185482).
497 			 */
498 
499 			flk_wakeup(lock_request, 1);
500 			flk_set_state(lock_request, FLK_DEAD_STATE);
501 			flk_free_lock(lock_request);
502 		}
503 		/*
504 		 * else if error had occurred either flk_process_request()
505 		 * has returned EDEADLK in which case there will be no
506 		 * dependents for this lock or EINTR from flk_wait_execute_
507 		 * request() in which case flk_cancel_sleeping_lock()
508 		 * would have been done. same is true with EBADF.
509 		 */
510 	}
511 
512 	if (lock_request == &stack_lock_request) {
513 		flk_set_state(lock_request, FLK_DEAD_STATE);
514 	} else {
515 		lock_request->l_state &= ~REFERENCED_LOCK;
516 		if ((error != 0) || IS_DELETED(lock_request)) {
517 			flk_set_state(lock_request, FLK_DEAD_STATE);
518 			flk_free_lock(lock_request);
519 		}
520 	}
521 
522 	mutex_exit(&gp->gp_mutex);
523 	return (error);
524 
525 done:
526 	flk_set_state(lock_request, FLK_DEAD_STATE);
527 	if (lock_request != &stack_lock_request)
528 		flk_free_lock(lock_request);
529 	return (error);
530 }
531 
532 /*
533  * Invoke the callbacks in the given list.  If before sleeping, invoke in
534  * list order.  If after sleeping, invoke in reverse order.
535  *
536  * CPR (suspend/resume) support: if one of the callbacks returns a
537  * callb_cpr_t, return it.   This will be used to make the thread CPR-safe
538  * while it is sleeping.  There should be at most one callb_cpr_t for the
539  * thread.
540  * XXX This is unnecessarily complicated.  The CPR information should just
541  * get passed in directly through VOP_FRLOCK and reclock, rather than
542  * sneaking it in via a callback.
543  */
544 
545 callb_cpr_t *
546 flk_invoke_callbacks(flk_callback_t *cblist, flk_cb_when_t when)
547 {
548 	callb_cpr_t *cpr_callbackp = NULL;
549 	callb_cpr_t *one_result;
550 	flk_callback_t *cb;
551 
552 	if (cblist == NULL)
553 		return (NULL);
554 
555 	if (when == FLK_BEFORE_SLEEP) {
556 		cb = cblist;
557 		do {
558 			one_result = (*cb->cb_callback)(when, cb->cb_data);
559 			if (one_result != NULL) {
560 				ASSERT(cpr_callbackp == NULL);
561 				cpr_callbackp = one_result;
562 			}
563 			cb = cb->cb_next;
564 		} while (cb != cblist);
565 	} else {
566 		cb = cblist->cb_prev;
567 		do {
568 			one_result = (*cb->cb_callback)(when, cb->cb_data);
569 			if (one_result != NULL) {
570 				cpr_callbackp = one_result;
571 			}
572 			cb = cb->cb_prev;
573 		} while (cb != cblist->cb_prev);
574 	}
575 
576 	return (cpr_callbackp);
577 }
578 
579 /*
580  * Initialize a flk_callback_t to hold the given callback.
581  */
582 
583 void
584 flk_init_callback(flk_callback_t *flk_cb,
585 	callb_cpr_t *(*cb_fcn)(flk_cb_when_t, void *), void *cbdata)
586 {
587 	flk_cb->cb_next = flk_cb;
588 	flk_cb->cb_prev = flk_cb;
589 	flk_cb->cb_callback = cb_fcn;
590 	flk_cb->cb_data = cbdata;
591 }
592 
593 /*
594  * Initialize an flk_callback_t and then link it into the head of an
595  * existing list (which may be NULL).
596  */
597 
598 void
599 flk_add_callback(flk_callback_t *newcb,
600 		callb_cpr_t *(*cb_fcn)(flk_cb_when_t, void *),
601 		void *cbdata, flk_callback_t *cblist)
602 {
603 	flk_init_callback(newcb, cb_fcn, cbdata);
604 
605 	if (cblist == NULL)
606 		return;
607 
608 	newcb->cb_prev = cblist->cb_prev;
609 	newcb->cb_next = cblist;
610 	cblist->cb_prev->cb_next = newcb;
611 	cblist->cb_prev = newcb;
612 }
613 
614 /*
615  * Initialize the flk_edge_cache data structure and create the
616  * nlm_reg_status array.
617  */
618 
619 void
620 flk_init(void)
621 {
622 	uint_t	i;
623 
624 	flk_edge_cache = kmem_cache_create("flk_edges",
625 	    sizeof (struct edge), 0, NULL, NULL, NULL, NULL, NULL, 0);
626 	if (flk_edge_cache == NULL) {
627 		cmn_err(CE_PANIC, "Couldn't create flk_edge_cache\n");
628 	}
629 	/*
630 	 * Create the NLM registry object.
631 	 */
632 
633 	if (cluster_bootflags & CLUSTER_BOOTED) {
634 		/*
635 		 * This routine tells you the maximum node id that will be used
636 		 * in the cluster.  This number will be the size of the nlm
637 		 * registry status array.  We add 1 because we will be using
638 		 * all entries indexed from 0 to maxnodeid; e.g., from 0
639 		 * to 64, for a total of 65 entries.
640 		 */
641 		nlm_status_size = clconf_maximum_nodeid() + 1;
642 	} else {
643 		nlm_status_size = 0;
644 	}
645 
646 	if (nlm_status_size != 0) {	/* booted as a cluster */
647 		nlm_reg_status = (flk_nlm_status_t *)
648 		    kmem_alloc(sizeof (flk_nlm_status_t) * nlm_status_size,
649 		    KM_SLEEP);
650 
651 		/* initialize all NLM states in array to NLM_UNKNOWN */
652 		for (i = 0; i < nlm_status_size; i++) {
653 			nlm_reg_status[i] = FLK_NLM_UNKNOWN;
654 		}
655 	}
656 }
657 
658 /*
659  * Zone constructor/destructor callbacks to be executed when a zone is
660  * created/destroyed.
661  */
662 /* ARGSUSED */
663 void *
664 flk_zone_init(zoneid_t zoneid)
665 {
666 	struct flock_globals *fg;
667 	uint_t i;
668 
669 	fg = kmem_alloc(sizeof (*fg), KM_SLEEP);
670 	fg->flk_lockmgr_status = FLK_LOCKMGR_UP;
671 	for (i = 0; i < HASH_SIZE; i++)
672 		fg->lockmgr_status[i] = FLK_LOCKMGR_UP;
673 	return (fg);
674 }
675 
676 /* ARGSUSED */
677 void
678 flk_zone_fini(zoneid_t zoneid, void *data)
679 {
680 	struct flock_globals *fg = data;
681 
682 	kmem_free(fg, sizeof (*fg));
683 }
684 
685 /*
686  * Get a lock_descriptor structure with initialization of edge lists.
687  */
688 
689 static lock_descriptor_t *
690 flk_get_lock(void)
691 {
692 	lock_descriptor_t	*l;
693 
694 	l = kmem_zalloc(sizeof (lock_descriptor_t), KM_SLEEP);
695 
696 	cv_init(&l->l_cv, NULL, CV_DRIVER, NULL);
697 	l->l_edge.edge_in_next = &l->l_edge;
698 	l->l_edge.edge_in_prev = &l->l_edge;
699 	l->l_edge.edge_adj_next = &l->l_edge;
700 	l->l_edge.edge_adj_prev = &l->l_edge;
701 	l->pvertex = -1;
702 	l->l_status = FLK_INITIAL_STATE;
703 	flk_lock_allocs++;
704 	return (l);
705 }
706 
707 /*
708  * Free a lock_descriptor structure. Just sets the DELETED_LOCK flag
709  * when some thread has a reference to it as in reclock().
710  */
711 
712 void
713 flk_free_lock(lock_descriptor_t	*lock)
714 {
715 	ASSERT(IS_DEAD(lock));
716 	if (IS_REFERENCED(lock)) {
717 		lock->l_state |= DELETED_LOCK;
718 		return;
719 	}
720 	flk_lock_frees++;
721 	kmem_free((void *)lock, sizeof (lock_descriptor_t));
722 }
723 
724 void
725 flk_set_state(lock_descriptor_t *lock, int new_state)
726 {
727 	/*
728 	 * Locks in the sleeping list may be woken up in a number of ways,
729 	 * and more than once.  If a sleeping lock is signaled awake more
730 	 * than once, then it may or may not change state depending on its
731 	 * current state.
732 	 * Also note that NLM locks that are sleeping could be moved to an
733 	 * interrupted state more than once if the unlock request is
734 	 * retransmitted by the NLM client - the second time around, this is
735 	 * just a nop.
736 	 * The ordering of being signaled awake is:
737 	 * INTERRUPTED_STATE > CANCELLED_STATE > GRANTED_STATE.
738 	 * The checks below implement this ordering.
739 	 */
740 	if (IS_INTERRUPTED(lock)) {
741 		if ((new_state == FLK_CANCELLED_STATE) ||
742 		    (new_state == FLK_GRANTED_STATE) ||
743 		    (new_state == FLK_INTERRUPTED_STATE)) {
744 			return;
745 		}
746 	}
747 	if (IS_CANCELLED(lock)) {
748 		if ((new_state == FLK_GRANTED_STATE) ||
749 		    (new_state == FLK_CANCELLED_STATE)) {
750 			return;
751 		}
752 	}
753 	CHECK_LOCK_TRANSITION(lock->l_status, new_state);
754 	if (IS_PXFS(lock)) {
755 		cl_flk_state_transition_notify(lock, lock->l_status, new_state);
756 	}
757 	lock->l_status = new_state;
758 }
759 
760 /*
761  * Routine that checks whether there are any blocking locks in the system.
762  *
763  * The policy followed is if a write lock is sleeping we don't allow read
764  * locks before this write lock even though there may not be any active
765  * locks corresponding to the read locks' region.
766  *
767  * flk_add_edge() function adds an edge between l1 and l2 iff there
768  * is no path between l1 and l2. This is done to have a "minimum
769  * storage representation" of the dependency graph.
770  *
771  * Another property of the graph is since only the new request throws
772  * edges to the existing locks in the graph, the graph is always topologically
773  * ordered.
774  */
775 
776 static int
777 flk_process_request(lock_descriptor_t *request)
778 {
779 	graph_t	*gp = request->l_graph;
780 	lock_descriptor_t *lock;
781 	int request_blocked_by_active = 0;
782 	int request_blocked_by_granted = 0;
783 	int request_blocked_by_sleeping = 0;
784 	vnode_t	*vp = request->l_vnode;
785 	int	error = 0;
786 	int request_will_wait = 0;
787 	int found_covering_lock = 0;
788 	lock_descriptor_t *covered_by = NULL;
789 
790 	ASSERT(MUTEX_HELD(&gp->gp_mutex));
791 	request_will_wait = IS_WILLING_TO_SLEEP(request);
792 
793 	/*
794 	 * check active locks
795 	 */
796 
797 	SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
798 
799 
800 	if (lock) {
801 		do {
802 			if (BLOCKS(lock, request)) {
803 				if (!request_will_wait)
804 					return (EAGAIN);
805 				request_blocked_by_active = 1;
806 				break;
807 			}
808 			/*
809 			 * Grant lock if it is for the same owner holding active
810 			 * lock that covers the request.
811 			 */
812 
813 			if (SAME_OWNER(lock, request) &&
814 			    COVERS(lock, request) &&
815 			    (request->l_type == F_RDLCK))
816 				return (flk_execute_request(request));
817 			lock = lock->l_next;
818 		} while (lock->l_vnode == vp);
819 	}
820 
821 	if (!request_blocked_by_active) {
822 			lock_descriptor_t *lk[1];
823 			lock_descriptor_t *first_glock = NULL;
824 		/*
825 		 * Shall we grant this?! NO!!
826 		 * What about those locks that were just granted and still
827 		 * in sleep queue. Those threads are woken up and so locks
828 		 * are almost active.
829 		 */
830 		SET_LOCK_TO_FIRST_SLEEP_VP(gp, lock, vp);
831 		if (lock) {
832 			do {
833 				if (BLOCKS(lock, request)) {
834 					if (IS_GRANTED(lock)) {
835 						request_blocked_by_granted = 1;
836 					} else {
837 						request_blocked_by_sleeping = 1;
838 					}
839 				}
840 
841 				lock = lock->l_next;
842 			} while ((lock->l_vnode == vp));
843 			first_glock = lock->l_prev;
844 			ASSERT(first_glock->l_vnode == vp);
845 		}
846 
847 		if (request_blocked_by_granted)
848 			goto block;
849 
850 		if (!request_blocked_by_sleeping) {
851 			/*
852 			 * If the request isn't going to be blocked by a
853 			 * sleeping request, we know that it isn't going to
854 			 * be blocked; we can just execute the request --
855 			 * without performing costly deadlock detection.
856 			 */
857 			ASSERT(!request_blocked_by_active);
858 			return (flk_execute_request(request));
859 		} else if (request->l_type == F_RDLCK) {
860 			/*
861 			 * If we have a sleeping writer in the requested
862 			 * lock's range, block.
863 			 */
864 			goto block;
865 		}
866 
867 		lk[0] = request;
868 		request->l_state |= RECOMPUTE_LOCK;
869 		SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
870 		if (lock) {
871 			do {
872 				flk_recompute_dependencies(lock, lk, 1, 0);
873 				lock = lock->l_next;
874 			} while (lock->l_vnode == vp);
875 		}
876 		lock = first_glock;
877 		if (lock) {
878 			do {
879 				if (IS_GRANTED(lock)) {
880 				flk_recompute_dependencies(lock, lk, 1, 0);
881 				}
882 				lock = lock->l_prev;
883 			} while ((lock->l_vnode == vp));
884 		}
885 		request->l_state &= ~RECOMPUTE_LOCK;
886 		if (!NO_DEPENDENTS(request) && flk_check_deadlock(request))
887 			return (EDEADLK);
888 		return (flk_execute_request(request));
889 	}
890 
891 block:
892 	if (request_will_wait)
893 		flk_graph_uncolor(gp);
894 
895 	/* check sleeping locks */
896 
897 	SET_LOCK_TO_FIRST_SLEEP_VP(gp, lock, vp);
898 
899 	/*
900 	 * If we find a sleeping write lock that is a superset of the
901 	 * region wanted by request we can be assured that by adding an
902 	 * edge to this write lock we have paths to all locks in the
903 	 * graph that blocks the request except in one case and that is why
904 	 * another check for SAME_OWNER in the loop below. The exception
905 	 * case is when this process that owns the sleeping write lock 'l1'
906 	 * has other locks l2, l3, l4 that are in the system and arrived
907 	 * before l1. l1 does not have path to these locks as they are from
908 	 * same process. We break when we find a second covering sleeping
909 	 * lock l5 owned by a process different from that owning l1, because
910 	 * there cannot be any of l2, l3, l4, etc., arrived before l5, and if
911 	 * it has l1 would have produced a deadlock already.
912 	 */
913 
914 	if (lock) {
915 		do {
916 			if (BLOCKS(lock, request)) {
917 				if (!request_will_wait)
918 					return (EAGAIN);
919 				if (COVERS(lock, request) &&
920 				    lock->l_type == F_WRLCK) {
921 					if (found_covering_lock &&
922 					    !SAME_OWNER(lock, covered_by)) {
923 						found_covering_lock++;
924 						break;
925 					}
926 					found_covering_lock = 1;
927 					covered_by = lock;
928 				}
929 				if (found_covering_lock &&
930 				    !SAME_OWNER(lock, covered_by)) {
931 					lock = lock->l_next;
932 					continue;
933 				}
934 				if ((error = flk_add_edge(request, lock,
935 				    !found_covering_lock, 0)))
936 					return (error);
937 			}
938 			lock = lock->l_next;
939 		} while (lock->l_vnode == vp);
940 	}
941 
942 /*
943  * found_covering_lock == 2 iff at this point 'request' has paths
944  * to all locks that blocks 'request'. found_covering_lock == 1 iff at this
945  * point 'request' has paths to all locks that blocks 'request' whose owners
946  * are not same as the one that covers 'request' (covered_by above) and
947  * we can have locks whose owner is same as covered_by in the active list.
948  */
949 
950 	if (request_blocked_by_active && found_covering_lock != 2) {
951 		SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
952 		ASSERT(lock != NULL);
953 		do {
954 			if (BLOCKS(lock, request)) {
955 				if (found_covering_lock &&
956 				    !SAME_OWNER(lock, covered_by)) {
957 					lock = lock->l_next;
958 					continue;
959 				}
960 				if ((error = flk_add_edge(request, lock,
961 				    CHECK_CYCLE, 0)))
962 					return (error);
963 			}
964 			lock = lock->l_next;
965 		} while (lock->l_vnode == vp);
966 	}
967 
968 	if (NOT_BLOCKED(request)) {
969 		/*
970 		 * request not dependent on any other locks
971 		 * so execute this request
972 		 */
973 		return (flk_execute_request(request));
974 	} else {
975 		/*
976 		 * check for deadlock
977 		 */
978 		if (flk_check_deadlock(request))
979 			return (EDEADLK);
980 		/*
981 		 * this thread has to sleep
982 		 */
983 		return (flk_wait_execute_request(request));
984 	}
985 }
986 
987 /*
988  * The actual execution of the request in the simple case is only to
989  * insert the 'request' in the list of active locks if it is not an
990  * UNLOCK.
991  * We have to consider the existing active locks' relation to
992  * this 'request' if they are owned by same process. flk_relation() does
993  * this job and sees to that the dependency graph information is maintained
994  * properly.
995  */
996 
997 int
998 flk_execute_request(lock_descriptor_t *request)
999 {
1000 	graph_t	*gp = request->l_graph;
1001 	vnode_t	*vp = request->l_vnode;
1002 	lock_descriptor_t	*lock, *lock1;
1003 	int done_searching = 0;
1004 
1005 	CHECK_SLEEPING_LOCKS(gp);
1006 	CHECK_ACTIVE_LOCKS(gp);
1007 
1008 	ASSERT(MUTEX_HELD(&gp->gp_mutex));
1009 
1010 	flk_set_state(request, FLK_START_STATE);
1011 
1012 	ASSERT(NOT_BLOCKED(request));
1013 
1014 	/* IO_LOCK requests are only to check status */
1015 
1016 	if (IS_IO_LOCK(request))
1017 		return (0);
1018 
1019 	SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
1020 
1021 	if (lock == NULL && request->l_type == F_UNLCK)
1022 		return (0);
1023 	if (lock == NULL) {
1024 		flk_insert_active_lock(request);
1025 		return (0);
1026 	}
1027 
1028 	do {
1029 		lock1 = lock->l_next;
1030 		if (SAME_OWNER(request, lock)) {
1031 			done_searching = flk_relation(lock, request);
1032 		}
1033 		lock = lock1;
1034 	} while (lock->l_vnode == vp && !done_searching);
1035 
1036 	/*
1037 	 * insert in active queue
1038 	 */
1039 
1040 	if (request->l_type != F_UNLCK)
1041 		flk_insert_active_lock(request);
1042 
1043 	return (0);
1044 }
1045 
/*
 * 'request' is blocked by some one therefore we put it into sleep queue.
 *
 * The request is inserted on the sleep queue and the thread then waits in
 * wait_for_lock().  When the request carries callbacks they are invoked
 * with FLK_BEFORE_SLEEP before the wait and FLK_AFTER_SLEEP after it.
 * The graph mutex must be held on entry; it is dropped and re-taken
 * around each callback invocation.
 *
 * Returns:
 *	ENOLCK			lock manager request, and the lock manager
 *				status (or, on a cluster, the NLM state of
 *				the request) is not up
 *	PXFS_LOCK_BLOCKED	PXFS request; returned without waiting here
 *	EINTR			the request was marked interrupted
 *	EBADF			the request was marked cancelled
 *	otherwise the result of flk_execute_request().
 *
 * On the ENOLCK, EINTR and EBADF paths the request is handed to
 * flk_cancel_sleeping_lock(), which frees it.
 */
static int
flk_wait_execute_request(lock_descriptor_t *request)
{
	graph_t	*gp = request->l_graph;
	callb_cpr_t 	*cprp;		/* CPR info from callback */
	struct flock_globals *fg;	/* set only for lock manager requests */
	int index;			/* set only for lock manager requests */

	ASSERT(MUTEX_HELD(&gp->gp_mutex));
	ASSERT(IS_WILLING_TO_SLEEP(request));

	flk_insert_sleeping_lock(request);

	if (IS_LOCKMGR(request)) {
		index = HASH_INDEX(request->l_vnode);
		fg = flk_get_globals();

		if (nlm_status_size == 0) {	/* not booted as a cluster */
			if (fg->lockmgr_status[index] != FLK_LOCKMGR_UP) {
				flk_cancel_sleeping_lock(request, 1);
				return (ENOLCK);
			}
		} else {			/* booted as a cluster */
			/*
			 * If the request is an NLM server lock request,
			 * and the NLM state of the lock request is not
			 * NLM_UP (because the NLM server is shutting
			 * down), then cancel the sleeping lock and
			 * return error ENOLCK that will encourage the
			 * client to retransmit.
			 */
			if (!IS_NLM_UP(request)) {
				flk_cancel_sleeping_lock(request, 1);
				return (ENOLCK);
			}
		}
	}

	/* Clustering: For blocking PXFS locks, return */
	if (IS_PXFS(request)) {
		/*
		 * PXFS locks sleep on the client side.
		 * The callback argument is used to wake up the sleeper
		 * when the lock is granted.
		 * We return PXFS_LOCK_BLOCKED (rather than an errno value)
		 * to indicate the client side should sleep
		 */
		return (PXFS_LOCK_BLOCKED);
	}

	if (request->l_callbacks != NULL) {
		/*
		 * To make sure the shutdown code works correctly, either
		 * the callback must happen after putting the lock on the
		 * sleep list, or we must check the shutdown status after
		 * returning from the callback (and before sleeping).  At
		 * least for now, we'll use the first option.  If a
		 * shutdown or signal or whatever happened while the graph
		 * mutex was dropped, that will be detected by
		 * wait_for_lock().
		 */
		mutex_exit(&gp->gp_mutex);

		cprp = flk_invoke_callbacks(request->l_callbacks,
		    FLK_BEFORE_SLEEP);

		mutex_enter(&gp->gp_mutex);

		if (cprp == NULL) {
			wait_for_lock(request);
		} else {
			/*
			 * The callbacks supplied CPR info: bracket the wait
			 * with CALLB_CPR_SAFE_BEGIN/END, each issued while
			 * holding the CPR lock.
			 */
			mutex_enter(cprp->cc_lockp);
			CALLB_CPR_SAFE_BEGIN(cprp);
			mutex_exit(cprp->cc_lockp);
			wait_for_lock(request);
			mutex_enter(cprp->cc_lockp);
			CALLB_CPR_SAFE_END(cprp, cprp->cc_lockp);
			mutex_exit(cprp->cc_lockp);
		}

		/* the after-sleep callbacks run without the graph mutex */
		mutex_exit(&gp->gp_mutex);
		(void) flk_invoke_callbacks(request->l_callbacks,
		    FLK_AFTER_SLEEP);
		mutex_enter(&gp->gp_mutex);
	} else {
		wait_for_lock(request);
	}

	/*
	 * NOTE(review): fg and index were assigned only inside the
	 * IS_LOCKMGR() branch above, so this relies on IS_LOCKMGR(request)
	 * giving the same answer here as it did before the wait.
	 */
	if (IS_LOCKMGR(request)) {
		/*
		 * If the lock manager is shutting down, return an
		 * error that will encourage the client to retransmit.
		 */
		if (fg->lockmgr_status[index] != FLK_LOCKMGR_UP &&
		    !IS_GRANTED(request)) {
			flk_cancel_sleeping_lock(request, 1);
			return (ENOLCK);
		}
	}

	if (IS_INTERRUPTED(request)) {
		/* we got a signal, or act like we did */
		flk_cancel_sleeping_lock(request, 1);
		return (EINTR);
	}

	/* Cancelled if some other thread has closed the file */

	if (IS_CANCELLED(request)) {
		flk_cancel_sleeping_lock(request, 1);
		return (EBADF);
	}

	/* no error: leave the sleep queue and execute the request */
	request->l_state &= ~GRANTED_LOCK;
	REMOVE_SLEEP_QUEUE(request);
	return (flk_execute_request(request));
}
1166 
1167 /*
1168  * This routine adds an edge between from and to because from depends
1169  * to. If asked to check for deadlock it checks whether there are any
1170  * reachable locks from "from_lock" that is owned by the same process
1171  * as "from_lock".
1172  * NOTE: It is the caller's responsibility to make sure that the color
1173  * of the graph is consistent between the calls to flk_add_edge as done
1174  * in flk_process_request. This routine does not color and check for
1175  * deadlock explicitly.
1176  */
1177 
1178 static int
1179 flk_add_edge(lock_descriptor_t *from_lock, lock_descriptor_t *to_lock,
1180 			int check_cycle, int update_graph)
1181 {
1182 	edge_t	*edge;
1183 	edge_t	*ep;
1184 	lock_descriptor_t	*vertex;
1185 	lock_descriptor_t *vertex_stack;
1186 
1187 	STACK_INIT(vertex_stack);
1188 
1189 	/*
1190 	 * if to vertex already has mark_color just return
1191 	 * don't add an edge as it is reachable from from vertex
1192 	 * before itself.
1193 	 */
1194 
1195 	if (COLORED(to_lock))
1196 		return (0);
1197 
1198 	edge = flk_get_edge();
1199 
1200 	/*
1201 	 * set the from and to vertex
1202 	 */
1203 
1204 	edge->from_vertex = from_lock;
1205 	edge->to_vertex = to_lock;
1206 
1207 	/*
1208 	 * put in adjacency list of from vertex
1209 	 */
1210 
1211 	from_lock->l_edge.edge_adj_next->edge_adj_prev = edge;
1212 	edge->edge_adj_next = from_lock->l_edge.edge_adj_next;
1213 	edge->edge_adj_prev = &from_lock->l_edge;
1214 	from_lock->l_edge.edge_adj_next = edge;
1215 
1216 	/*
1217 	 * put in in list of to vertex
1218 	 */
1219 
1220 	to_lock->l_edge.edge_in_next->edge_in_prev = edge;
1221 	edge->edge_in_next = to_lock->l_edge.edge_in_next;
1222 	to_lock->l_edge.edge_in_next = edge;
1223 	edge->edge_in_prev = &to_lock->l_edge;
1224 
1225 
1226 	if (update_graph) {
1227 		flk_update_proc_graph(edge, 0);
1228 		return (0);
1229 	}
1230 	if (!check_cycle) {
1231 		return (0);
1232 	}
1233 
1234 	STACK_PUSH(vertex_stack, from_lock, l_stack);
1235 
1236 	while ((vertex = STACK_TOP(vertex_stack)) != NULL) {
1237 
1238 		STACK_POP(vertex_stack, l_stack);
1239 
1240 		for (ep = FIRST_ADJ(vertex);
1241 		    ep != HEAD(vertex);
1242 		    ep = NEXT_ADJ(ep)) {
1243 			if (COLORED(ep->to_vertex))
1244 				continue;
1245 			COLOR(ep->to_vertex);
1246 			if (SAME_OWNER(ep->to_vertex, from_lock))
1247 				goto dead_lock;
1248 			STACK_PUSH(vertex_stack, ep->to_vertex, l_stack);
1249 		}
1250 	}
1251 	return (0);
1252 
1253 dead_lock:
1254 
1255 	/*
1256 	 * remove all edges
1257 	 */
1258 
1259 	ep = FIRST_ADJ(from_lock);
1260 
1261 	while (ep != HEAD(from_lock)) {
1262 		IN_LIST_REMOVE(ep);
1263 		from_lock->l_sedge = NEXT_ADJ(ep);
1264 		ADJ_LIST_REMOVE(ep);
1265 		flk_free_edge(ep);
1266 		ep = from_lock->l_sedge;
1267 	}
1268 	return (EDEADLK);
1269 }
1270 
1271 /*
1272  * Get an edge structure for representing the dependency between two locks.
1273  */
1274 
1275 static edge_t *
1276 flk_get_edge()
1277 {
1278 	edge_t	*ep;
1279 
1280 	ASSERT(flk_edge_cache != NULL);
1281 
1282 	ep = kmem_cache_alloc(flk_edge_cache, KM_SLEEP);
1283 	edge_allocs++;
1284 	return (ep);
1285 }
1286 
1287 /*
1288  * Free the edge structure.
1289  */
1290 
1291 static void
1292 flk_free_edge(edge_t *ep)
1293 {
1294 	edge_frees++;
1295 	kmem_cache_free(flk_edge_cache, (void *)ep);
1296 }
1297 
/*
 * Check the relationship of request with lock and perform the
 * recomputation of dependencies, break lock if required, and return
 * 1 if request cannot have any more relationship with the next
 * active locks.
 * The 'lock' and 'request' are compared and in case of overlap we
 * delete the 'lock' and form new locks to represent the non-overlapped
 * portion of original 'lock'. This function has side effects such as
 * 'lock' will be freed, new locks will be added to the active list.
 *
 * Where the two ranges are merged (same lock type, overlapping or
 * directly adjacent), the l_start/l_end of 'request' are widened in place
 * to cover 'lock' as well.
 *
 * 'request' itself is never inserted into the active list here; the
 * caller does that.
 *
 * Returns 1 when 'lock' starts beyond the end of 'request' and is not
 * merged, or when an unlock covers exactly the range of 'lock'; returns 0
 * in every other case.  The graph mutex must be held.
 */

static int
flk_relation(lock_descriptor_t *lock, lock_descriptor_t *request)
{
	int lock_effect;
	lock_descriptor_t *lock1, *lock2;
	/*
	 * Vertices handed to flk_recompute_dependencies(): the leftover
	 * pieces of 'lock' (at most two) followed by 'request' last.
	 */
	lock_descriptor_t *topology[3];
	int nvertex = 0;
	int i;
	edge_t	*ep;
	graph_t	*gp = (lock->l_graph);


	CHECK_SLEEPING_LOCKS(gp);
	CHECK_ACTIVE_LOCKS(gp);

	ASSERT(MUTEX_HELD(&gp->gp_mutex));

	topology[0] = topology[1] = topology[2] = NULL;

	/* classify what 'request' does to the type of 'lock' */
	if (request->l_type == F_UNLCK)
		lock_effect = FLK_UNLOCK;
	else if (request->l_type == F_RDLCK &&
	    lock->l_type == F_WRLCK)
		lock_effect = FLK_DOWNGRADE;
	else if (request->l_type == F_WRLCK &&
	    lock->l_type == F_RDLCK)
		lock_effect = FLK_UPGRADE;
	else
		lock_effect = FLK_STAY_SAME;

	/* 'lock' lies entirely before 'request' */
	if (lock->l_end < request->l_start) {
		if (lock->l_end == request->l_start - 1 &&
		    lock_effect == FLK_STAY_SAME) {
			/* directly adjacent and same type: merge */
			topology[0] = request;
			request->l_start = lock->l_start;
			nvertex = 1;
			goto recompute;
		} else {
			return (0);
		}
	}

	/* 'lock' lies entirely after 'request' */
	if (lock->l_start > request->l_end) {
		if (request->l_end == lock->l_start - 1 &&
		    lock_effect == FLK_STAY_SAME) {
			/* directly adjacent and same type: merge */
			topology[0] = request;
			request->l_end = lock->l_end;
			nvertex = 1;
			goto recompute;
		} else {
			return (1);
		}
	}

	/* from here on the two ranges overlap */

	if (request->l_end < lock->l_end) {
		/* 'lock' extends past the end of 'request' */
		if (request->l_start > lock->l_start) {
			/* 'request' is strictly inside 'lock' */
			if (lock_effect == FLK_STAY_SAME) {
				request->l_start = lock->l_start;
				request->l_end = lock->l_end;
				topology[0] = request;
				nvertex = 1;
			} else {
				/* split 'lock' into a left and a right piece */
				lock1 = flk_get_lock();
				lock2 = flk_get_lock();
				COPY(lock1, lock);
				COPY(lock2, lock);
				lock1->l_start = lock->l_start;
				lock1->l_end = request->l_start - 1;
				lock2->l_start = request->l_end + 1;
				lock2->l_end = lock->l_end;
				topology[0] = lock1;
				topology[1] = lock2;
				topology[2] = request;
				nvertex = 3;
			}
		} else if (request->l_start < lock->l_start) {
			/* 'request' covers the head of 'lock' */
			if (lock_effect == FLK_STAY_SAME) {
				request->l_end = lock->l_end;
				topology[0] = request;
				nvertex = 1;
			} else {
				/* keep only the tail of 'lock' */
				lock1 = flk_get_lock();
				COPY(lock1, lock);
				lock1->l_start = request->l_end + 1;
				topology[0] = lock1;
				topology[1] = request;
				nvertex = 2;
			}
		} else  {
			/* same start; 'request' ends first */
			if (lock_effect == FLK_STAY_SAME) {
				request->l_start = lock->l_start;
				request->l_end = lock->l_end;
				topology[0] = request;
				nvertex = 1;
			} else {
				/* keep only the tail of 'lock' */
				lock1 = flk_get_lock();
				COPY(lock1, lock);
				lock1->l_start = request->l_end + 1;
				topology[0] = lock1;
				topology[1] = request;
				nvertex = 2;
			}
		}
	} else if (request->l_end > lock->l_end) {
		/* 'request' extends past the end of 'lock' */
		if (request->l_start > lock->l_start)  {
			/* 'request' covers the tail of 'lock' */
			if (lock_effect == FLK_STAY_SAME) {
				request->l_start = lock->l_start;
				topology[0] = request;
				nvertex = 1;
			} else {
				/* keep only the head of 'lock' */
				lock1 = flk_get_lock();
				COPY(lock1, lock);
				lock1->l_end = request->l_start - 1;
				topology[0] = lock1;
				topology[1] = request;
				nvertex = 2;
			}
		} else if (request->l_start < lock->l_start)  {
			/* 'request' covers all of 'lock': nothing is left */
			topology[0] = request;
			nvertex = 1;
		} else {
			/* same start, 'request' longer: nothing is left */
			topology[0] = request;
			nvertex = 1;
		}
	} else {
		/* both ranges end at the same offset */
		if (request->l_start > lock->l_start) {
			if (lock_effect == FLK_STAY_SAME) {
				request->l_start = lock->l_start;
				topology[0] = request;
				nvertex = 1;
			} else {
				/* keep only the head of 'lock' */
				lock1 = flk_get_lock();
				COPY(lock1, lock);
				lock1->l_end = request->l_start - 1;
				topology[0] = lock1;
				topology[1] = request;
				nvertex = 2;
			}
		} else if (request->l_start < lock->l_start) {
			/* 'request' covers all of 'lock': nothing is left */
			topology[0] = request;
			nvertex = 1;
		} else {
			/* identical ranges */
			if (lock_effect !=  FLK_UNLOCK) {
				topology[0] = request;
				nvertex = 1;
			} else {
				/*
				 * An unlock of exactly this range: remove
				 * 'lock', wake up whatever depended on it and
				 * free it; no recomputation is done.
				 */
				flk_delete_active_lock(lock, 0);
				flk_wakeup(lock, 1);
				flk_free_lock(lock);
				CHECK_SLEEPING_LOCKS(gp);
				CHECK_ACTIVE_LOCKS(gp);
				return (1);
			}
		}
	}

recompute:

	/*
	 * For unlock we don't send the 'request' for recomputing
	 * dependencies because no lock will add an edge to this.
	 * 'request' is always the last topology entry, so drop that one.
	 */

	if (lock_effect == FLK_UNLOCK) {
		topology[nvertex-1] = NULL;
		nvertex--;
	}
	for (i = 0; i < nvertex; i++) {
		topology[i]->l_state |= RECOMPUTE_LOCK;
		topology[i]->l_color = NO_COLOR;
	}

	/* 'lock' is expected to have no outgoing edges at this point */
	ASSERT(FIRST_ADJ(lock) == HEAD(lock));

	/*
	 * we remove the adjacent edges for all vertices' to this vertex
	 * 'lock'.  The edges stay on the in list of 'lock' for now.
	 */

	ep = FIRST_IN(lock);
	while (ep != HEAD(lock)) {
		ADJ_LIST_REMOVE(ep);
		ep = NEXT_IN(ep);
	}

	flk_delete_active_lock(lock, 0);

	/* We are ready for recomputing the dependencies now */

	flk_recompute_dependencies(lock, topology, nvertex, 1);

	for (i = 0; i < nvertex; i++) {
		topology[i]->l_state &= ~RECOMPUTE_LOCK;
		topology[i]->l_color = NO_COLOR;
	}


	/*
	 * Insert the leftover pieces of 'lock' but never 'request'.  In the
	 * non-unlock case 'request' is the last entry, hence nvertex - 1.
	 * For unlock that entry was already dropped above, so nvertex is
	 * bumped back first so that the same bound covers every piece.
	 */
	if (lock_effect == FLK_UNLOCK) {
		nvertex++;
	}
	for (i = 0; i < nvertex - 1; i++) {
		flk_insert_active_lock(topology[i]);
	}


	if (lock_effect == FLK_DOWNGRADE || lock_effect == FLK_UNLOCK) {
		/*
		 * Dependents of 'lock' get a chance to run.  The edges were
		 * already taken off the adjacency lists above, hence 0.
		 */
		flk_wakeup(lock, 0);
	} else {
		/*
		 * Nobody is woken up: just discard the in edges of 'lock'.
		 * l_sedge remembers the successor because the current edge
		 * is freed inside the loop.
		 */
		ep = FIRST_IN(lock);
		while (ep != HEAD(lock)) {
			lock->l_sedge = NEXT_IN(ep);
			IN_LIST_REMOVE(ep);
			flk_update_proc_graph(ep, 1);
			flk_free_edge(ep);
			ep = lock->l_sedge;
		}
	}
	flk_free_lock(lock);

	CHECK_SLEEPING_LOCKS(gp);
	CHECK_ACTIVE_LOCKS(gp);
	return (0);
}
1532 
1533 /*
1534  * Insert a lock into the active queue.
1535  */
1536 
1537 static void
1538 flk_insert_active_lock(lock_descriptor_t *new_lock)
1539 {
1540 	graph_t	*gp = new_lock->l_graph;
1541 	vnode_t	*vp = new_lock->l_vnode;
1542 	lock_descriptor_t *first_lock, *lock;
1543 
1544 	ASSERT(MUTEX_HELD(&gp->gp_mutex));
1545 
1546 	SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
1547 	first_lock = lock;
1548 
1549 	if (first_lock != NULL) {
1550 		for (; (lock->l_vnode == vp &&
1551 		    lock->l_start < new_lock->l_start); lock = lock->l_next)
1552 			;
1553 	} else {
1554 		lock = ACTIVE_HEAD(gp);
1555 	}
1556 
1557 	lock->l_prev->l_next = new_lock;
1558 	new_lock->l_next = lock;
1559 	new_lock->l_prev = lock->l_prev;
1560 	lock->l_prev = new_lock;
1561 
1562 	if (first_lock == NULL || (new_lock->l_start <= first_lock->l_start)) {
1563 		vp->v_filocks = (struct filock *)new_lock;
1564 	}
1565 	flk_set_state(new_lock, FLK_ACTIVE_STATE);
1566 	new_lock->l_state |= ACTIVE_LOCK;
1567 
1568 	CHECK_ACTIVE_LOCKS(gp);
1569 	CHECK_SLEEPING_LOCKS(gp);
1570 }
1571 
1572 /*
1573  * Delete the active lock : Performs two functions depending on the
1574  * value of second parameter. One is to remove from the active lists
1575  * only and other is to both remove and free the lock.
1576  */
1577 
1578 static void
1579 flk_delete_active_lock(lock_descriptor_t *lock, int free_lock)
1580 {
1581 	vnode_t *vp = lock->l_vnode;
1582 	graph_t	*gp = lock->l_graph;
1583 
1584 	ASSERT(MUTEX_HELD(&gp->gp_mutex));
1585 	if (free_lock)
1586 		ASSERT(NO_DEPENDENTS(lock));
1587 	ASSERT(NOT_BLOCKED(lock));
1588 	ASSERT(IS_ACTIVE(lock));
1589 
1590 	ASSERT((vp->v_filocks != NULL));
1591 
1592 	if (vp->v_filocks == (struct filock *)lock) {
1593 		vp->v_filocks = (struct filock *)
1594 		    ((lock->l_next->l_vnode == vp) ? lock->l_next :
1595 		    NULL);
1596 	}
1597 	lock->l_next->l_prev = lock->l_prev;
1598 	lock->l_prev->l_next = lock->l_next;
1599 	lock->l_next = lock->l_prev = NULL;
1600 	flk_set_state(lock, FLK_DEAD_STATE);
1601 	lock->l_state &= ~ACTIVE_LOCK;
1602 
1603 	if (free_lock)
1604 		flk_free_lock(lock);
1605 	CHECK_ACTIVE_LOCKS(gp);
1606 	CHECK_SLEEPING_LOCKS(gp);
1607 }
1608 
1609 /*
1610  * Insert into the sleep queue.
1611  */
1612 
1613 static void
1614 flk_insert_sleeping_lock(lock_descriptor_t *request)
1615 {
1616 	graph_t *gp = request->l_graph;
1617 	vnode_t	*vp = request->l_vnode;
1618 	lock_descriptor_t	*lock;
1619 
1620 	ASSERT(MUTEX_HELD(&gp->gp_mutex));
1621 	ASSERT(IS_INITIAL(request));
1622 
1623 	for (lock = gp->sleeping_locks.l_next; (lock != &gp->sleeping_locks &&
1624 	    lock->l_vnode < vp); lock = lock->l_next)
1625 		;
1626 
1627 	lock->l_prev->l_next = request;
1628 	request->l_prev = lock->l_prev;
1629 	lock->l_prev = request;
1630 	request->l_next = lock;
1631 	flk_set_state(request, FLK_SLEEPING_STATE);
1632 	request->l_state |= SLEEPING_LOCK;
1633 }
1634 
/*
 * Cancelling a sleeping lock implies removing a vertex from the
 * dependency graph and therefore we should recompute the dependencies
 * of all vertices that have a path  to this vertex, w.r.t. all
 * vertices reachable from this vertex.
 *
 * 'request' is the sleeping lock to cancel.  When 'remove_from_queue' is
 * non-zero it is also taken off the sleep queue here.  On return
 * 'request' has been freed and must not be used by the caller.  The
 * graph mutex must be held.
 */

void
flk_cancel_sleeping_lock(lock_descriptor_t *request, int remove_from_queue)
{
	graph_t	*gp = request->l_graph;
	vnode_t *vp = request->l_vnode;
	lock_descriptor_t **topology = NULL;
	edge_t	*ep;
	lock_descriptor_t *vertex, *lock;
	int nvertex = 0;
	int i;
	lock_descriptor_t *vertex_stack;

	STACK_INIT(vertex_stack);

	ASSERT(MUTEX_HELD(&gp->gp_mutex));
	/*
	 * count number of vertex pointers that has to be allocated
	 * All vertices that are reachable from request.
	 * Each one is flagged RECOMPUTE_LOCK so it is counted only once.
	 */

	STACK_PUSH(vertex_stack, request, l_stack);

	while ((vertex = STACK_TOP(vertex_stack)) != NULL) {
		STACK_POP(vertex_stack, l_stack);
		for (ep = FIRST_ADJ(vertex); ep != HEAD(vertex);
		    ep = NEXT_ADJ(ep)) {
			if (IS_RECOMPUTE(ep->to_vertex))
				continue;
			ep->to_vertex->l_state |= RECOMPUTE_LOCK;
			STACK_PUSH(vertex_stack, ep->to_vertex, l_stack);
			nvertex++;
		}
	}

	/*
	 * allocate memory for holding the vertex pointers
	 */

	if (nvertex) {
		topology = kmem_zalloc(nvertex * sizeof (lock_descriptor_t *),
		    KM_SLEEP);
	}

	/*
	 * one more pass to actually store the vertices in the
	 * allocated array.
	 * We first check sleeping locks and then active locks
	 * so that topology array will be in a topological
	 * order.
	 * Every lock of this vnode that is visited also has its color
	 * reset to NO_COLOR.
	 */

	nvertex = 0;
	SET_LOCK_TO_FIRST_SLEEP_VP(gp, lock, vp);

	if (lock) {
		do {
			if (IS_RECOMPUTE(lock)) {
				lock->l_index = nvertex;
				topology[nvertex++] = lock;
			}
			lock->l_color = NO_COLOR;
			lock = lock->l_next;
		} while (lock->l_vnode == vp);
	}

	SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);

	if (lock) {
		do {
			if (IS_RECOMPUTE(lock)) {
				lock->l_index = nvertex;
				topology[nvertex++] = lock;
			}
			lock->l_color = NO_COLOR;
			lock = lock->l_next;
		} while (lock->l_vnode == vp);
	}

	/*
	 * Detach every edge coming into request from the adjacency list
	 * of its from vertex.  The edges stay on the in list of request;
	 * they are freed by flk_wakeup() at the end of this function.
	 */

	for (ep = FIRST_IN(request); ep != HEAD(request); ep = NEXT_IN(ep)) {
		ADJ_LIST_REMOVE(ep);
	}


	if (remove_from_queue)
		REMOVE_SLEEP_QUEUE(request);

	/* we are ready to recompute */

	flk_recompute_dependencies(request, topology, nvertex, 1);

	/*
	 * Remove and free the edges going out of request.  l_sedge
	 * remembers the successor because the current edge is freed inside
	 * the loop.
	 */
	ep = FIRST_ADJ(request);
	while (ep != HEAD(request)) {
		IN_LIST_REMOVE(ep);
		request->l_sedge = NEXT_ADJ(ep);
		ADJ_LIST_REMOVE(ep);
		flk_update_proc_graph(ep, 1);
		flk_free_edge(ep);
		ep = request->l_sedge;
	}


	/*
	 * unset the RECOMPUTE flag in those vertices
	 */

	for (i = 0; i < nvertex; i++) {
		topology[i]->l_state &= ~RECOMPUTE_LOCK;
	}

	/*
	 * free the topology
	 */
	if (nvertex)
		kmem_free((void *)topology,
		    (nvertex * sizeof (lock_descriptor_t *)));
	/*
	 * Possibility of some locks unblocked now
	 */

	flk_wakeup(request, 0);

	/*
	 * we expect to have a correctly recomputed graph  now.
	 */
	flk_set_state(request, FLK_DEAD_STATE);
	flk_free_lock(request);
	CHECK_SLEEPING_LOCKS(gp);
	CHECK_ACTIVE_LOCKS(gp);

}
1777 
1778 /*
1779  * Uncoloring the graph is simply to increment the mark value of the graph
1780  * And only when wrap round takes place will we color all vertices in
1781  * the graph explicitly.
1782  */
1783 
1784 static void
1785 flk_graph_uncolor(graph_t *gp)
1786 {
1787 	lock_descriptor_t *lock;
1788 
1789 	if (gp->mark == UINT_MAX) {
1790 		gp->mark = 1;
1791 	for (lock = ACTIVE_HEAD(gp)->l_next; lock != ACTIVE_HEAD(gp);
1792 	    lock = lock->l_next)
1793 			lock->l_color  = 0;
1794 
1795 	for (lock = SLEEPING_HEAD(gp)->l_next; lock != SLEEPING_HEAD(gp);
1796 	    lock = lock->l_next)
1797 			lock->l_color  = 0;
1798 	} else {
1799 		gp->mark++;
1800 	}
1801 }
1802 
1803 /*
1804  * Wake up locks that are blocked on the given lock.
1805  */
1806 
1807 static void
1808 flk_wakeup(lock_descriptor_t *lock, int adj_list_remove)
1809 {
1810 	edge_t	*ep;
1811 	graph_t	*gp = lock->l_graph;
1812 	lock_descriptor_t	*lck;
1813 
1814 	ASSERT(MUTEX_HELD(&gp->gp_mutex));
1815 	if (NO_DEPENDENTS(lock))
1816 		return;
1817 	ep = FIRST_IN(lock);
1818 	do {
1819 		/*
1820 		 * delete the edge from the adjacency list
1821 		 * of from vertex. if no more adjacent edges
1822 		 * for this vertex wake this process.
1823 		 */
1824 		lck = ep->from_vertex;
1825 		if (adj_list_remove)
1826 			ADJ_LIST_REMOVE(ep);
1827 		flk_update_proc_graph(ep, 1);
1828 		if (NOT_BLOCKED(lck)) {
1829 			GRANT_WAKEUP(lck);
1830 		}
1831 		lock->l_sedge = NEXT_IN(ep);
1832 		IN_LIST_REMOVE(ep);
1833 		flk_free_edge(ep);
1834 		ep = lock->l_sedge;
1835 	} while (ep != HEAD(lock));
1836 	ASSERT(NO_DEPENDENTS(lock));
1837 }
1838 
/*
 * The dependents of request, is checked for its dependency against the
 * locks in topology (called topology because the array is and should be in
 * topological order for this algorithm, if not in topological order the
 * inner loop below might add more edges than necessary. Topological ordering
 * of vertices satisfies the property that all edges will be from left to
 * right i.e., topology[i] can have an edge to  topology[j], iff i<j)
 * If lock l1 in the dependent set of request is dependent (blocked by)
 * on lock l2 in topology but does not have a path to it, we add an edge
 * in the inner loop below.
 *
 * We don't want to add an edge between l1 and l2 if there exists
 * already a path from l1 to l2, so care has to be taken for those vertices
 * that  have two paths to 'request'. These vertices are referred to here
 * as barrier locks.
 *
 * The barriers has to be found (those vertex that originally had two paths
 * to request) because otherwise we may end up adding edges unnecessarily
 * to vertices in topology, and thus barrier vertices can have an edge
 * to a vertex in topology as well as a path to it.
 *
 * Arguments:
 *	request		vertex whose dependents are re-examined; its in
 *			list is walked backwards from here
 *	topology	array of candidate blocking locks
 *	nvertex		number of entries in topology; nothing is done
 *			when it is 0
 *	update_graph	passed through to flk_add_edge() for every edge
 *			that is added
 *
 * The graph mutex must be held.
 */

static void
flk_recompute_dependencies(lock_descriptor_t *request,
		lock_descriptor_t **topology,
			int nvertex, int update_graph)
{
	lock_descriptor_t *vertex, *lock;
	graph_t	*gp = request->l_graph;
	int i, count;
	int barrier_found = 0;
	edge_t	*ep;
	lock_descriptor_t *vertex_stack;

	STACK_INIT(vertex_stack);

	ASSERT(MUTEX_HELD(&gp->gp_mutex));
	if (nvertex == 0)
		return;
	flk_graph_uncolor(request->l_graph);
	barrier_found = flk_find_barriers(request);
	/*
	 * 'request' itself is not recomputed: flagging it RECOMPUTE_DONE
	 * makes the loop below go straight to walking its in edges.
	 */
	request->l_state |= RECOMPUTE_DONE;

	STACK_PUSH(vertex_stack, request, l_stack);
	/* l_sedge is the next in edge of this vertex still to be followed */
	request->l_sedge = FIRST_IN(request);


	while ((vertex = STACK_TOP(vertex_stack)) != NULL) {
		if (vertex->l_state & RECOMPUTE_DONE) {
			/*
			 * Already processed; we are back at this vertex
			 * only to follow its next in edge.
			 */
			count = 0;
			goto next_in_edge;
		}
		if (IS_BARRIER(vertex)) {
			/* decrement the barrier count */
			if (vertex->l_index) {
				vertex->l_index--;
				/* this guy will be pushed again anyway ? */
				STACK_POP(vertex_stack, l_stack);
				if (vertex->l_index == 0)  {
				/*
				 * barrier is over we can recompute
				 * dependencies for this lock in the
				 * next stack pop
				 */
					vertex->l_state &= ~BARRIER_LOCK;
				}
				continue;
			}
		}
		vertex->l_state |= RECOMPUTE_DONE;
		flk_graph_uncolor(gp);
		/* count: topology vertices already reachable from vertex */
		count = flk_color_reachables(vertex);
		for (i = 0; i < nvertex; i++) {
			lock = topology[i];
			/* colored means a path to it exists already */
			if (COLORED(lock))
				continue;
			if (BLOCKS(lock, vertex)) {
				(void) flk_add_edge(vertex, lock,
				    NO_CHECK_CYCLE, update_graph);
				COLOR(lock);
				count++;
				count += flk_color_reachables(lock);
			}

		}

next_in_edge:
		/*
		 * Done with this vertex when it reaches every vertex in
		 * topology, or when it has no in edge left to follow.
		 */
		if (count == nvertex ||
		    vertex->l_sedge == HEAD(vertex)) {
			/* prune the tree below this */
			STACK_POP(vertex_stack, l_stack);
			vertex->l_state &= ~RECOMPUTE_DONE;
			/* update the barrier locks below this! */
			if (vertex->l_sedge != HEAD(vertex) && barrier_found) {
				flk_graph_uncolor(gp);
				flk_update_barriers(vertex);
			}
			continue;
		}

		/* descend into the from vertex of the next in edge */
		ep = vertex->l_sedge;
		lock = ep->from_vertex;
		STACK_PUSH(vertex_stack, lock, l_stack);
		lock->l_sedge = FIRST_IN(lock);
		vertex->l_sedge = NEXT_IN(ep);
	}

}
1947 
1948 /*
1949  * Color all reachable vertices from vertex that belongs to topology (here
1950  * those that have RECOMPUTE_LOCK set in their state) and yet uncolored.
1951  *
1952  * Note: we need to use a different stack_link l_stack1 because this is
1953  * called from flk_recompute_dependencies() that already uses a stack with
1954  * l_stack as stack_link.
1955  */
1956 
1957 static int
1958 flk_color_reachables(lock_descriptor_t *vertex)
1959 {
1960 	lock_descriptor_t *ver, *lock;
1961 	int count;
1962 	edge_t	*ep;
1963 	lock_descriptor_t *vertex_stack;
1964 
1965 	STACK_INIT(vertex_stack);
1966 
1967 	STACK_PUSH(vertex_stack, vertex, l_stack1);
1968 	count = 0;
1969 	while ((ver = STACK_TOP(vertex_stack)) != NULL) {
1970 
1971 		STACK_POP(vertex_stack, l_stack1);
1972 		for (ep = FIRST_ADJ(ver); ep != HEAD(ver);
1973 		    ep = NEXT_ADJ(ep)) {
1974 			lock = ep->to_vertex;
1975 			if (COLORED(lock))
1976 				continue;
1977 			COLOR(lock);
1978 			if (IS_RECOMPUTE(lock))
1979 				count++;
1980 			STACK_PUSH(vertex_stack, lock, l_stack1);
1981 		}
1982 
1983 	}
1984 	return (count);
1985 }
1986 
1987 /*
1988  * Called from flk_recompute_dependencies() this routine decrements
1989  * the barrier count of barrier vertices that are reachable from lock.
1990  */
1991 
1992 static void
1993 flk_update_barriers(lock_descriptor_t *lock)
1994 {
1995 	lock_descriptor_t *vertex, *lck;
1996 	edge_t	*ep;
1997 	lock_descriptor_t *vertex_stack;
1998 
1999 	STACK_INIT(vertex_stack);
2000 
2001 	STACK_PUSH(vertex_stack, lock, l_stack1);
2002 
2003 	while ((vertex = STACK_TOP(vertex_stack)) != NULL) {
2004 		STACK_POP(vertex_stack, l_stack1);
2005 		for (ep = FIRST_IN(vertex); ep != HEAD(vertex);
2006 		    ep = NEXT_IN(ep)) {
2007 			lck = ep->from_vertex;
2008 			if (COLORED(lck)) {
2009 				if (IS_BARRIER(lck)) {
2010 					ASSERT(lck->l_index > 0);
2011 					lck->l_index--;
2012 					if (lck->l_index == 0)
2013 						lck->l_state &= ~BARRIER_LOCK;
2014 				}
2015 				continue;
2016 			}
2017 			COLOR(lck);
2018 			if (IS_BARRIER(lck)) {
2019 				ASSERT(lck->l_index > 0);
2020 				lck->l_index--;
2021 				if (lck->l_index == 0)
2022 					lck->l_state &= ~BARRIER_LOCK;
2023 			}
2024 			STACK_PUSH(vertex_stack, lck, l_stack1);
2025 		}
2026 	}
2027 }
2028 
2029 /*
2030  * Finds all vertices that are reachable from 'lock' more than once and
2031  * mark them as barrier vertices and increment their barrier count.
2032  * The barrier count is one minus the total number of paths from lock
2033  * to that vertex.
2034  */
2035 
2036 static int
2037 flk_find_barriers(lock_descriptor_t *lock)
2038 {
2039 	lock_descriptor_t *vertex, *lck;
2040 	int found = 0;
2041 	edge_t	*ep;
2042 	lock_descriptor_t *vertex_stack;
2043 
2044 	STACK_INIT(vertex_stack);
2045 
2046 	STACK_PUSH(vertex_stack, lock, l_stack1);
2047 
2048 	while ((vertex = STACK_TOP(vertex_stack)) != NULL) {
2049 		STACK_POP(vertex_stack, l_stack1);
2050 		for (ep = FIRST_IN(vertex); ep != HEAD(vertex);
2051 		    ep = NEXT_IN(ep)) {
2052 			lck = ep->from_vertex;
2053 			if (COLORED(lck)) {
2054 				/* this is a barrier */
2055 				lck->l_state |= BARRIER_LOCK;
2056 				/* index will have barrier count */
2057 				lck->l_index++;
2058 				if (!found)
2059 					found = 1;
2060 				continue;
2061 			}
2062 			COLOR(lck);
2063 			lck->l_index = 0;
2064 			STACK_PUSH(vertex_stack, lck, l_stack1);
2065 		}
2066 	}
2067 	return (found);
2068 }
2069 
2070 /*
2071  * Finds the first lock that is mainly responsible for blocking this
2072  * request.  If there is no such lock, request->l_flock.l_type is set to
2073  * F_UNLCK.  Otherwise, request->l_flock is filled in with the particulars
2074  * of the blocking lock.
2075  *
2076  * Note: It is possible a request is blocked by a sleeping lock because
2077  * of the fairness policy used in flk_process_request() to construct the
2078  * dependencies. (see comments before flk_process_request()).
2079  */
2080 
2081 static void
2082 flk_get_first_blocking_lock(lock_descriptor_t *request)
2083 {
2084 	graph_t	*gp = request->l_graph;
2085 	vnode_t *vp = request->l_vnode;
2086 	lock_descriptor_t *lock, *blocker;
2087 
2088 	ASSERT(MUTEX_HELD(&gp->gp_mutex));
2089 	blocker = NULL;
2090 	SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
2091 
2092 	if (lock) {
2093 		do {
2094 			if (BLOCKS(lock, request)) {
2095 				blocker = lock;
2096 				break;
2097 			}
2098 			lock = lock->l_next;
2099 		} while (lock->l_vnode == vp);
2100 	}
2101 
2102 	if (blocker == NULL && request->l_flock.l_type == F_RDLCK) {
2103 		/*
2104 		 * No active lock is blocking this request, but if a read
2105 		 * lock is requested, it may also get blocked by a waiting
2106 		 * writer. So search all sleeping locks and see if there is
2107 		 * a writer waiting.
2108 		 */
2109 		SET_LOCK_TO_FIRST_SLEEP_VP(gp, lock, vp);
2110 		if (lock) {
2111 			do {
2112 				if (BLOCKS(lock, request)) {
2113 					blocker = lock;
2114 					break;
2115 				}
2116 				lock = lock->l_next;
2117 			} while (lock->l_vnode == vp);
2118 		}
2119 	}
2120 
2121 	if (blocker) {
2122 		report_blocker(blocker, request);
2123 	} else
2124 		request->l_flock.l_type = F_UNLCK;
2125 }
2126 
2127 /*
2128  * Get the graph_t structure associated with a vnode.
2129  * If 'initialize' is non-zero, and the graph_t structure for this vnode has
2130  * not yet been initialized, then a new element is allocated and returned.
2131  */
2132 graph_t *
2133 flk_get_lock_graph(vnode_t *vp, int initialize)
2134 {
2135 	graph_t *gp;
2136 	graph_t *gp_alloc = NULL;
2137 	int index = HASH_INDEX(vp);
2138 
2139 	if (initialize == FLK_USE_GRAPH) {
2140 		mutex_enter(&flock_lock);
2141 		gp = lock_graph[index];
2142 		mutex_exit(&flock_lock);
2143 		return (gp);
2144 	}
2145 
2146 	ASSERT(initialize == FLK_INIT_GRAPH);
2147 
2148 	if (lock_graph[index] == NULL) {
2149 
2150 		gp_alloc = kmem_zalloc(sizeof (graph_t), KM_SLEEP);
2151 
2152 		/* Initialize the graph */
2153 
2154 		gp_alloc->active_locks.l_next =
2155 		    gp_alloc->active_locks.l_prev =
2156 		    (lock_descriptor_t *)ACTIVE_HEAD(gp_alloc);
2157 		gp_alloc->sleeping_locks.l_next =
2158 		    gp_alloc->sleeping_locks.l_prev =
2159 		    (lock_descriptor_t *)SLEEPING_HEAD(gp_alloc);
2160 		gp_alloc->index = index;
2161 		mutex_init(&gp_alloc->gp_mutex, NULL, MUTEX_DEFAULT, NULL);
2162 	}
2163 
2164 	mutex_enter(&flock_lock);
2165 
2166 	gp = lock_graph[index];
2167 
2168 	/* Recheck the value within flock_lock */
2169 	if (gp == NULL) {
2170 		struct flock_globals *fg;
2171 
2172 		/* We must have previously allocated the graph_t structure */
2173 		ASSERT(gp_alloc != NULL);
2174 		lock_graph[index] = gp = gp_alloc;
2175 		/*
2176 		 * The lockmgr status is only needed if KLM is loaded.
2177 		 */
2178 		if (flock_zone_key != ZONE_KEY_UNINITIALIZED) {
2179 			fg = flk_get_globals();
2180 			fg->lockmgr_status[index] = fg->flk_lockmgr_status;
2181 		}
2182 	}
2183 
2184 	mutex_exit(&flock_lock);
2185 
2186 	if ((gp_alloc != NULL) && (gp != gp_alloc)) {
2187 		/* There was a race to allocate the graph_t and we lost */
2188 		mutex_destroy(&gp_alloc->gp_mutex);
2189 		kmem_free(gp_alloc, sizeof (graph_t));
2190 	}
2191 
2192 	return (gp);
2193 }
2194 
2195 /*
2196  * PSARC case 1997/292
2197  */
2198 int
2199 cl_flk_has_remote_locks_for_nlmid(vnode_t *vp, int nlmid)
2200 {
2201 	lock_descriptor_t *lock;
2202 	int result = 0;
2203 	graph_t *gp;
2204 	int			lock_nlmid;
2205 
2206 	/*
2207 	 * Check to see if node is booted as a cluster. If not, return.
2208 	 */
2209 	if ((cluster_bootflags & CLUSTER_BOOTED) == 0) {
2210 		return (0);
2211 	}
2212 
2213 	gp = flk_get_lock_graph(vp, FLK_USE_GRAPH);
2214 	if (gp == NULL) {
2215 		return (0);
2216 	}
2217 
2218 	mutex_enter(&gp->gp_mutex);
2219 
2220 	SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
2221 
2222 	if (lock) {
2223 		while (lock->l_vnode == vp) {
2224 			/* get NLM id from sysid */
2225 			lock_nlmid = GETNLMID(lock->l_flock.l_sysid);
2226 
2227 			/*
2228 			 * If NLM server request _and_ nlmid of lock matches
2229 			 * nlmid of argument, then we've found a remote lock.
2230 			 */
2231 			if (IS_LOCKMGR(lock) && nlmid == lock_nlmid) {
2232 				result = 1;
2233 				goto done;
2234 			}
2235 			lock = lock->l_next;
2236 		}
2237 	}
2238 
2239 	SET_LOCK_TO_FIRST_SLEEP_VP(gp, lock, vp);
2240 
2241 	if (lock) {
2242 		while (lock->l_vnode == vp) {
2243 			/* get NLM id from sysid */
2244 			lock_nlmid = GETNLMID(lock->l_flock.l_sysid);
2245 
2246 			/*
2247 			 * If NLM server request _and_ nlmid of lock matches
2248 			 * nlmid of argument, then we've found a remote lock.
2249 			 */
2250 			if (IS_LOCKMGR(lock) && nlmid == lock_nlmid) {
2251 				result = 1;
2252 				goto done;
2253 			}
2254 			lock = lock->l_next;
2255 		}
2256 	}
2257 
2258 done:
2259 	mutex_exit(&gp->gp_mutex);
2260 	return (result);
2261 }
2262 
2263 /*
2264  * Determine whether there are any locks for the given vnode with a remote
2265  * sysid.  Returns zero if not, non-zero if there are.
2266  *
2267  * Note that the return value from this function is potentially invalid
2268  * once it has been returned.  The caller is responsible for providing its
2269  * own synchronization mechanism to ensure that the return value is useful
2270  * (e.g., see nfs_lockcompletion()).
2271  */
2272 int
2273 flk_has_remote_locks(vnode_t *vp)
2274 {
2275 	lock_descriptor_t *lock;
2276 	int result = 0;
2277 	graph_t *gp;
2278 
2279 	gp = flk_get_lock_graph(vp, FLK_USE_GRAPH);
2280 	if (gp == NULL) {
2281 		return (0);
2282 	}
2283 
2284 	mutex_enter(&gp->gp_mutex);
2285 
2286 	SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
2287 
2288 	if (lock) {
2289 		while (lock->l_vnode == vp) {
2290 			if (IS_REMOTE(lock)) {
2291 				result = 1;
2292 				goto done;
2293 			}
2294 			lock = lock->l_next;
2295 		}
2296 	}
2297 
2298 	SET_LOCK_TO_FIRST_SLEEP_VP(gp, lock, vp);
2299 
2300 	if (lock) {
2301 		while (lock->l_vnode == vp) {
2302 			if (IS_REMOTE(lock)) {
2303 				result = 1;
2304 				goto done;
2305 			}
2306 			lock = lock->l_next;
2307 		}
2308 	}
2309 
2310 done:
2311 	mutex_exit(&gp->gp_mutex);
2312 	return (result);
2313 }
2314 
2315 /*
2316  * Determine whether there are any locks for the given vnode with a remote
2317  * sysid matching given sysid.
2318  * Used by the new (open source) NFS Lock Manager (NLM)
2319  */
2320 int
2321 flk_has_remote_locks_for_sysid(vnode_t *vp, int sysid)
2322 {
2323 	lock_descriptor_t *lock;
2324 	int result = 0;
2325 	graph_t *gp;
2326 
2327 	if (sysid == 0)
2328 		return (0);
2329 
2330 	gp = flk_get_lock_graph(vp, FLK_USE_GRAPH);
2331 	if (gp == NULL) {
2332 		return (0);
2333 	}
2334 
2335 	mutex_enter(&gp->gp_mutex);
2336 
2337 	SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
2338 
2339 	if (lock) {
2340 		while (lock->l_vnode == vp) {
2341 			if (lock->l_flock.l_sysid == sysid) {
2342 				result = 1;
2343 				goto done;
2344 			}
2345 			lock = lock->l_next;
2346 		}
2347 	}
2348 
2349 	SET_LOCK_TO_FIRST_SLEEP_VP(gp, lock, vp);
2350 
2351 	if (lock) {
2352 		while (lock->l_vnode == vp) {
2353 			if (lock->l_flock.l_sysid == sysid) {
2354 				result = 1;
2355 				goto done;
2356 			}
2357 			lock = lock->l_next;
2358 		}
2359 	}
2360 
2361 done:
2362 	mutex_exit(&gp->gp_mutex);
2363 	return (result);
2364 }
2365 
2366 /*
2367  * Determine if there are any locks owned by the given sysid.
2368  * Returns zero if not, non-zero if there are.  Note that this return code
2369  * could be derived from flk_get_{sleeping,active}_locks, but this routine
2370  * avoids all the memory allocations of those routines.
2371  *
2372  * This routine has the same synchronization issues as
2373  * flk_has_remote_locks.
2374  */
2375 
2376 int
2377 flk_sysid_has_locks(int sysid, int lck_type)
2378 {
2379 	int		has_locks = 0;
2380 	lock_descriptor_t	*lock;
2381 	graph_t 	*gp;
2382 	int		i;
2383 
2384 	for (i = 0; i < HASH_SIZE && !has_locks; i++) {
2385 		mutex_enter(&flock_lock);
2386 		gp = lock_graph[i];
2387 		mutex_exit(&flock_lock);
2388 		if (gp == NULL) {
2389 			continue;
2390 		}
2391 
2392 		mutex_enter(&gp->gp_mutex);
2393 
2394 		if (lck_type & FLK_QUERY_ACTIVE) {
2395 			for (lock = ACTIVE_HEAD(gp)->l_next;
2396 			    lock != ACTIVE_HEAD(gp) && !has_locks;
2397 			    lock = lock->l_next) {
2398 				if (lock->l_flock.l_sysid == sysid)
2399 					has_locks = 1;
2400 			}
2401 		}
2402 
2403 		if (lck_type & FLK_QUERY_SLEEPING) {
2404 			for (lock = SLEEPING_HEAD(gp)->l_next;
2405 			    lock != SLEEPING_HEAD(gp) && !has_locks;
2406 			    lock = lock->l_next) {
2407 				if (lock->l_flock.l_sysid == sysid)
2408 					has_locks = 1;
2409 			}
2410 		}
2411 		mutex_exit(&gp->gp_mutex);
2412 	}
2413 
2414 	return (has_locks);
2415 }
2416 
2417 
2418 /*
2419  * PSARC case 1997/292
2420  *
2421  * Requires: "sysid" is a pair [nlmid, sysid].  The lower half is 16-bit
2422  *  quantity, the real sysid generated by the NLM server; the upper half
2423  *  identifies the node of the cluster where the NLM server ran.
2424  *  This routine is only called by an NLM server running in a cluster.
2425  * Effects: Remove all locks held on behalf of the client identified
2426  *  by "sysid."
2427  */
2428 void
2429 cl_flk_remove_locks_by_sysid(int sysid)
2430 {
2431 	graph_t	*gp;
2432 	int i;
2433 	lock_descriptor_t *lock, *nlock;
2434 
2435 	/*
2436 	 * Check to see if node is booted as a cluster. If not, return.
2437 	 */
2438 	if ((cluster_bootflags & CLUSTER_BOOTED) == 0) {
2439 		return;
2440 	}
2441 
2442 	ASSERT(sysid != 0);
2443 	for (i = 0; i < HASH_SIZE; i++) {
2444 		mutex_enter(&flock_lock);
2445 		gp = lock_graph[i];
2446 		mutex_exit(&flock_lock);
2447 
2448 		if (gp == NULL)
2449 			continue;
2450 
2451 		mutex_enter(&gp->gp_mutex);	/*  get mutex on lock graph */
2452 
2453 		/* signal sleeping requests so that they bail out */
2454 		lock = SLEEPING_HEAD(gp)->l_next;
2455 		while (lock != SLEEPING_HEAD(gp)) {
2456 			nlock = lock->l_next;
2457 			if (lock->l_flock.l_sysid == sysid) {
2458 				INTERRUPT_WAKEUP(lock);
2459 			}
2460 			lock = nlock;
2461 		}
2462 
2463 		/* delete active locks */
2464 		lock = ACTIVE_HEAD(gp)->l_next;
2465 		while (lock != ACTIVE_HEAD(gp)) {
2466 			nlock = lock->l_next;
2467 			if (lock->l_flock.l_sysid == sysid) {
2468 				flk_delete_active_lock(lock, 0);
2469 				flk_wakeup(lock, 1);
2470 				flk_free_lock(lock);
2471 			}
2472 			lock = nlock;
2473 		}
2474 		mutex_exit(&gp->gp_mutex);    /* release mutex on lock graph */
2475 	}
2476 }
2477 
2478 /*
2479  * Delete all locks in the system that belongs to the sysid of the request.
2480  */
2481 
2482 static void
2483 flk_delete_locks_by_sysid(lock_descriptor_t *request)
2484 {
2485 	int	sysid  = request->l_flock.l_sysid;
2486 	lock_descriptor_t *lock, *nlock;
2487 	graph_t	*gp;
2488 	int i;
2489 
2490 	ASSERT(MUTEX_HELD(&request->l_graph->gp_mutex));
2491 	ASSERT(sysid != 0);
2492 
2493 	mutex_exit(&request->l_graph->gp_mutex);
2494 
2495 	for (i = 0; i < HASH_SIZE; i++) {
2496 		mutex_enter(&flock_lock);
2497 		gp = lock_graph[i];
2498 		mutex_exit(&flock_lock);
2499 
2500 		if (gp == NULL)
2501 			continue;
2502 
2503 		mutex_enter(&gp->gp_mutex);
2504 
2505 		/* signal sleeping requests so that they bail out */
2506 		lock = SLEEPING_HEAD(gp)->l_next;
2507 		while (lock != SLEEPING_HEAD(gp)) {
2508 			nlock = lock->l_next;
2509 			if (lock->l_flock.l_sysid == sysid) {
2510 				INTERRUPT_WAKEUP(lock);
2511 			}
2512 			lock = nlock;
2513 		}
2514 
2515 		/* delete active locks */
2516 		lock = ACTIVE_HEAD(gp)->l_next;
2517 		while (lock != ACTIVE_HEAD(gp)) {
2518 			nlock = lock->l_next;
2519 			if (lock->l_flock.l_sysid == sysid) {
2520 				flk_delete_active_lock(lock, 0);
2521 				flk_wakeup(lock, 1);
2522 				flk_free_lock(lock);
2523 			}
2524 			lock = nlock;
2525 		}
2526 		mutex_exit(&gp->gp_mutex);
2527 	}
2528 
2529 	mutex_enter(&request->l_graph->gp_mutex);
2530 }
2531 
2532 /*
2533  * Clustering: Deletes PXFS locks
2534  * Effects: Delete all locks on files in the given file system and with the
2535  *  given PXFS id.
2536  */
2537 void
2538 cl_flk_delete_pxfs_locks(struct vfs *vfsp, int pxfsid)
2539 {
2540 	lock_descriptor_t *lock, *nlock;
2541 	graph_t	*gp;
2542 	int i;
2543 
2544 	for (i = 0; i < HASH_SIZE; i++) {
2545 		mutex_enter(&flock_lock);
2546 		gp = lock_graph[i];
2547 		mutex_exit(&flock_lock);
2548 
2549 		if (gp == NULL)
2550 			continue;
2551 
2552 		mutex_enter(&gp->gp_mutex);
2553 
2554 		/* signal sleeping requests so that they bail out */
2555 		lock = SLEEPING_HEAD(gp)->l_next;
2556 		while (lock != SLEEPING_HEAD(gp)) {
2557 			nlock = lock->l_next;
2558 			if (lock->l_vnode->v_vfsp == vfsp) {
2559 				ASSERT(IS_PXFS(lock));
2560 				if (GETPXFSID(lock->l_flock.l_sysid) ==
2561 				    pxfsid) {
2562 					flk_set_state(lock,
2563 					    FLK_CANCELLED_STATE);
2564 					flk_cancel_sleeping_lock(lock, 1);
2565 				}
2566 			}
2567 			lock = nlock;
2568 		}
2569 
2570 		/* delete active locks */
2571 		lock = ACTIVE_HEAD(gp)->l_next;
2572 		while (lock != ACTIVE_HEAD(gp)) {
2573 			nlock = lock->l_next;
2574 			if (lock->l_vnode->v_vfsp == vfsp) {
2575 				ASSERT(IS_PXFS(lock));
2576 				if (GETPXFSID(lock->l_flock.l_sysid) ==
2577 				    pxfsid) {
2578 					flk_delete_active_lock(lock, 0);
2579 					flk_wakeup(lock, 1);
2580 					flk_free_lock(lock);
2581 				}
2582 			}
2583 			lock = nlock;
2584 		}
2585 		mutex_exit(&gp->gp_mutex);
2586 	}
2587 }
2588 
2589 /*
2590  * Search for a sleeping lock manager lock which matches exactly this lock
2591  * request; if one is found, fake a signal to cancel it.
2592  *
2593  * Return 1 if a matching lock was found, 0 otherwise.
2594  */
2595 
2596 static int
2597 flk_canceled(lock_descriptor_t *request)
2598 {
2599 	lock_descriptor_t *lock, *nlock;
2600 	graph_t *gp = request->l_graph;
2601 	vnode_t *vp = request->l_vnode;
2602 
2603 	ASSERT(MUTEX_HELD(&gp->gp_mutex));
2604 	ASSERT(IS_LOCKMGR(request));
2605 	SET_LOCK_TO_FIRST_SLEEP_VP(gp, lock, vp);
2606 
2607 	if (lock) {
2608 		while (lock->l_vnode == vp) {
2609 			nlock = lock->l_next;
2610 			if (SAME_OWNER(lock, request) &&
2611 			    lock->l_start == request->l_start &&
2612 			    lock->l_end == request->l_end) {
2613 				INTERRUPT_WAKEUP(lock);
2614 				return (1);
2615 			}
2616 			lock = nlock;
2617 		}
2618 	}
2619 	return (0);
2620 }
2621 
2622 /*
2623  * Remove all the locks for the vnode belonging to the given pid and sysid.
2624  */
2625 
2626 void
2627 cleanlocks(vnode_t *vp, pid_t pid, int sysid)
2628 {
2629 	graph_t	*gp;
2630 	lock_descriptor_t *lock, *nlock;
2631 	lock_descriptor_t *link_stack;
2632 
2633 	STACK_INIT(link_stack);
2634 
2635 	gp = flk_get_lock_graph(vp, FLK_USE_GRAPH);
2636 
2637 	if (gp == NULL)
2638 		return;
2639 	mutex_enter(&gp->gp_mutex);
2640 
2641 	CHECK_SLEEPING_LOCKS(gp);
2642 	CHECK_ACTIVE_LOCKS(gp);
2643 
2644 	SET_LOCK_TO_FIRST_SLEEP_VP(gp, lock, vp);
2645 
2646 	if (lock) {
2647 		do {
2648 			nlock = lock->l_next;
2649 			if ((lock->l_flock.l_pid == pid ||
2650 			    pid == IGN_PID) &&
2651 			    lock->l_flock.l_sysid == sysid) {
2652 				CANCEL_WAKEUP(lock);
2653 			}
2654 			lock = nlock;
2655 		} while (lock->l_vnode == vp);
2656 	}
2657 
2658 	SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
2659 
2660 	if (lock) {
2661 		do {
2662 			nlock = lock->l_next;
2663 			if ((lock->l_flock.l_pid == pid ||
2664 			    pid == IGN_PID) &&
2665 			    lock->l_flock.l_sysid == sysid) {
2666 				flk_delete_active_lock(lock, 0);
2667 				STACK_PUSH(link_stack, lock, l_stack);
2668 			}
2669 			lock = nlock;
2670 		} while (lock->l_vnode == vp);
2671 	}
2672 
2673 	while ((lock = STACK_TOP(link_stack)) != NULL) {
2674 		STACK_POP(link_stack, l_stack);
2675 		flk_wakeup(lock, 1);
2676 		flk_free_lock(lock);
2677 	}
2678 
2679 	CHECK_SLEEPING_LOCKS(gp);
2680 	CHECK_ACTIVE_LOCKS(gp);
2681 	CHECK_OWNER_LOCKS(gp, pid, sysid, vp);
2682 	mutex_exit(&gp->gp_mutex);
2683 }
2684 
2685 
2686 /*
2687  * Called from 'fs' read and write routines for files that have mandatory
2688  * locking enabled.
2689  */
2690 
2691 int
2692 chklock(
2693 	struct vnode	*vp,
2694 	int 		iomode,
2695 	u_offset_t	offset,
2696 	ssize_t		len,
2697 	int 		fmode,
2698 	caller_context_t *ct)
2699 {
2700 	register int	i;
2701 	struct flock64 	bf;
2702 	int 		error = 0;
2703 
2704 	bf.l_type = (iomode & FWRITE) ? F_WRLCK : F_RDLCK;
2705 	bf.l_whence = 0;
2706 	bf.l_start = offset;
2707 	bf.l_len = len;
2708 	if (ct == NULL) {
2709 		bf.l_pid = curproc->p_pid;
2710 		bf.l_sysid = 0;
2711 	} else {
2712 		bf.l_pid = ct->cc_pid;
2713 		bf.l_sysid = ct->cc_sysid;
2714 	}
2715 	i = (fmode & (FNDELAY|FNONBLOCK)) ? INOFLCK : INOFLCK|SLPFLCK;
2716 	if ((i = reclock(vp, &bf, i, 0, offset, NULL)) != 0 ||
2717 	    bf.l_type != F_UNLCK)
2718 		error = i ? i : EAGAIN;
2719 	return (error);
2720 }
2721 
2722 /*
2723  * convoff - converts the given data (start, whence) to the
2724  * given whence.
2725  */
2726 int
2727 convoff(vp, lckdat, whence, offset)
2728 	struct vnode 	*vp;
2729 	struct flock64 	*lckdat;
2730 	int 		whence;
2731 	offset_t	offset;
2732 {
2733 	int 		error;
2734 	struct vattr 	vattr;
2735 
2736 	if ((lckdat->l_whence == 2) || (whence == 2)) {
2737 		vattr.va_mask = AT_SIZE;
2738 		if (error = VOP_GETATTR(vp, &vattr, 0, CRED(), NULL))
2739 			return (error);
2740 	}
2741 
2742 	switch (lckdat->l_whence) {
2743 	case 1:
2744 		lckdat->l_start += offset;
2745 		break;
2746 	case 2:
2747 		lckdat->l_start += vattr.va_size;
2748 		/* FALLTHRU */
2749 	case 0:
2750 		break;
2751 	default:
2752 		return (EINVAL);
2753 	}
2754 
2755 	if (lckdat->l_start < 0)
2756 		return (EINVAL);
2757 
2758 	switch (whence) {
2759 	case 1:
2760 		lckdat->l_start -= offset;
2761 		break;
2762 	case 2:
2763 		lckdat->l_start -= vattr.va_size;
2764 		/* FALLTHRU */
2765 	case 0:
2766 		break;
2767 	default:
2768 		return (EINVAL);
2769 	}
2770 
2771 	lckdat->l_whence = (short)whence;
2772 	return (0);
2773 }
2774 
2775 
2776 /* 	proc_graph function definitions */
2777 
2778 /*
2779  * Function checks for deadlock due to the new 'lock'. If deadlock found
2780  * edges of this lock are freed and returned.
2781  */
2782 
2783 static int
2784 flk_check_deadlock(lock_descriptor_t *lock)
2785 {
2786 	proc_vertex_t	*start_vertex, *pvertex;
2787 	proc_vertex_t *dvertex;
2788 	proc_edge_t *pep, *ppep;
2789 	edge_t	*ep, *nep;
2790 	proc_vertex_t *process_stack;
2791 
2792 	STACK_INIT(process_stack);
2793 
2794 	mutex_enter(&flock_lock);
2795 	start_vertex = flk_get_proc_vertex(lock);
2796 	ASSERT(start_vertex != NULL);
2797 
2798 	/* construct the edges from this process to other processes */
2799 
2800 	ep = FIRST_ADJ(lock);
2801 	while (ep != HEAD(lock)) {
2802 		proc_vertex_t *adj_proc;
2803 
2804 		adj_proc = flk_get_proc_vertex(ep->to_vertex);
2805 		for (pep = start_vertex->edge; pep != NULL; pep = pep->next) {
2806 			if (pep->to_proc == adj_proc) {
2807 				ASSERT(pep->refcount);
2808 				pep->refcount++;
2809 				break;
2810 			}
2811 		}
2812 		if (pep == NULL) {
2813 			pep = flk_get_proc_edge();
2814 			pep->to_proc = adj_proc;
2815 			pep->refcount = 1;
2816 			adj_proc->incount++;
2817 			pep->next = start_vertex->edge;
2818 			start_vertex->edge = pep;
2819 		}
2820 		ep = NEXT_ADJ(ep);
2821 	}
2822 
2823 	ep = FIRST_IN(lock);
2824 
2825 	while (ep != HEAD(lock)) {
2826 		proc_vertex_t *in_proc;
2827 
2828 		in_proc = flk_get_proc_vertex(ep->from_vertex);
2829 
2830 		for (pep = in_proc->edge; pep != NULL; pep = pep->next) {
2831 			if (pep->to_proc == start_vertex) {
2832 				ASSERT(pep->refcount);
2833 				pep->refcount++;
2834 				break;
2835 			}
2836 		}
2837 		if (pep == NULL) {
2838 			pep = flk_get_proc_edge();
2839 			pep->to_proc = start_vertex;
2840 			pep->refcount = 1;
2841 			start_vertex->incount++;
2842 			pep->next = in_proc->edge;
2843 			in_proc->edge = pep;
2844 		}
2845 		ep = NEXT_IN(ep);
2846 	}
2847 
2848 	if (start_vertex->incount == 0) {
2849 		mutex_exit(&flock_lock);
2850 		return (0);
2851 	}
2852 
2853 	flk_proc_graph_uncolor();
2854 
2855 	start_vertex->p_sedge = start_vertex->edge;
2856 
2857 	STACK_PUSH(process_stack, start_vertex, p_stack);
2858 
2859 	while ((pvertex = STACK_TOP(process_stack)) != NULL) {
2860 		for (pep = pvertex->p_sedge; pep != NULL; pep = pep->next) {
2861 			dvertex = pep->to_proc;
2862 			if (!PROC_ARRIVED(dvertex)) {
2863 				STACK_PUSH(process_stack, dvertex, p_stack);
2864 				dvertex->p_sedge = dvertex->edge;
2865 				PROC_ARRIVE(pvertex);
2866 				pvertex->p_sedge = pep->next;
2867 				break;
2868 			}
2869 			if (!PROC_DEPARTED(dvertex))
2870 				goto deadlock;
2871 		}
2872 		if (pep == NULL) {
2873 			PROC_DEPART(pvertex);
2874 			STACK_POP(process_stack, p_stack);
2875 		}
2876 	}
2877 	mutex_exit(&flock_lock);
2878 	return (0);
2879 
2880 deadlock:
2881 
2882 	/* we remove all lock edges and proc edges */
2883 
2884 	ep = FIRST_ADJ(lock);
2885 	while (ep != HEAD(lock)) {
2886 		proc_vertex_t *adj_proc;
2887 		adj_proc = flk_get_proc_vertex(ep->to_vertex);
2888 		nep = NEXT_ADJ(ep);
2889 		IN_LIST_REMOVE(ep);
2890 		ADJ_LIST_REMOVE(ep);
2891 		flk_free_edge(ep);
2892 		ppep = start_vertex->edge;
2893 		for (pep = start_vertex->edge; pep != NULL; ppep = pep,
2894 		    pep = ppep->next) {
2895 			if (pep->to_proc == adj_proc) {
2896 				pep->refcount--;
2897 				if (pep->refcount == 0) {
2898 					if (pep == ppep) {
2899 						start_vertex->edge = pep->next;
2900 					} else {
2901 						ppep->next = pep->next;
2902 					}
2903 					adj_proc->incount--;
2904 					flk_proc_release(adj_proc);
2905 					flk_free_proc_edge(pep);
2906 				}
2907 				break;
2908 			}
2909 		}
2910 		ep = nep;
2911 	}
2912 	ep = FIRST_IN(lock);
2913 	while (ep != HEAD(lock)) {
2914 		proc_vertex_t *in_proc;
2915 		in_proc = flk_get_proc_vertex(ep->from_vertex);
2916 		nep = NEXT_IN(ep);
2917 		IN_LIST_REMOVE(ep);
2918 		ADJ_LIST_REMOVE(ep);
2919 		flk_free_edge(ep);
2920 		ppep = in_proc->edge;
2921 		for (pep = in_proc->edge; pep != NULL; ppep = pep,
2922 		    pep = ppep->next) {
2923 			if (pep->to_proc == start_vertex) {
2924 				pep->refcount--;
2925 				if (pep->refcount == 0) {
2926 					if (pep == ppep) {
2927 						in_proc->edge = pep->next;
2928 					} else {
2929 						ppep->next = pep->next;
2930 					}
2931 					start_vertex->incount--;
2932 					flk_proc_release(in_proc);
2933 					flk_free_proc_edge(pep);
2934 				}
2935 				break;
2936 			}
2937 		}
2938 		ep = nep;
2939 	}
2940 	flk_proc_release(start_vertex);
2941 	mutex_exit(&flock_lock);
2942 	return (1);
2943 }
2944 
2945 /*
2946  * Get a proc vertex. If lock's pvertex value gets a correct proc vertex
2947  * from the list we return that, otherwise we allocate one. If necessary,
2948  * we grow the list of vertices also.
2949  */
2950 
2951 static proc_vertex_t *
2952 flk_get_proc_vertex(lock_descriptor_t *lock)
2953 {
2954 	int i;
2955 	proc_vertex_t	*pv;
2956 	proc_vertex_t	**palloc;
2957 
2958 	ASSERT(MUTEX_HELD(&flock_lock));
2959 	if (lock->pvertex != -1) {
2960 		ASSERT(lock->pvertex >= 0);
2961 		pv = pgraph.proc[lock->pvertex];
2962 		if (pv != NULL && PROC_SAME_OWNER(lock, pv)) {
2963 			return (pv);
2964 		}
2965 	}
2966 	for (i = 0; i < pgraph.gcount; i++) {
2967 		pv = pgraph.proc[i];
2968 		if (pv != NULL && PROC_SAME_OWNER(lock, pv)) {
2969 			lock->pvertex = pv->index = i;
2970 			return (pv);
2971 		}
2972 	}
2973 	pv = kmem_zalloc(sizeof (struct proc_vertex), KM_SLEEP);
2974 	pv->pid = lock->l_flock.l_pid;
2975 	pv->sysid = lock->l_flock.l_sysid;
2976 	flk_proc_vertex_allocs++;
2977 	if (pgraph.free != 0) {
2978 		for (i = 0; i < pgraph.gcount; i++) {
2979 			if (pgraph.proc[i] == NULL) {
2980 				pgraph.proc[i] = pv;
2981 				lock->pvertex = pv->index = i;
2982 				pgraph.free--;
2983 				return (pv);
2984 			}
2985 		}
2986 	}
2987 	palloc = kmem_zalloc((pgraph.gcount + PROC_CHUNK) *
2988 	    sizeof (proc_vertex_t *), KM_SLEEP);
2989 
2990 	if (pgraph.proc) {
2991 		bcopy(pgraph.proc, palloc,
2992 		    pgraph.gcount * sizeof (proc_vertex_t *));
2993 
2994 		kmem_free(pgraph.proc,
2995 		    pgraph.gcount * sizeof (proc_vertex_t *));
2996 	}
2997 	pgraph.proc = palloc;
2998 	pgraph.free += (PROC_CHUNK - 1);
2999 	pv->index = lock->pvertex = pgraph.gcount;
3000 	pgraph.gcount += PROC_CHUNK;
3001 	pgraph.proc[pv->index] = pv;
3002 	return (pv);
3003 }
3004 
3005 /*
3006  * Allocate a proc edge.
3007  */
3008 
3009 static proc_edge_t *
3010 flk_get_proc_edge()
3011 {
3012 	proc_edge_t *pep;
3013 
3014 	pep = kmem_zalloc(sizeof (proc_edge_t), KM_SLEEP);
3015 	flk_proc_edge_allocs++;
3016 	return (pep);
3017 }
3018 
3019 /*
3020  * Free the proc edge. Called whenever its reference count goes to zero.
3021  */
3022 
3023 static void
3024 flk_free_proc_edge(proc_edge_t *pep)
3025 {
3026 	ASSERT(pep->refcount == 0);
3027 	kmem_free((void *)pep, sizeof (proc_edge_t));
3028 	flk_proc_edge_frees++;
3029 }
3030 
3031 /*
3032  * Color the graph explicitly done only when the mark value hits max value.
3033  */
3034 
3035 static void
3036 flk_proc_graph_uncolor()
3037 {
3038 	int i;
3039 
3040 	if (pgraph.mark == UINT_MAX) {
3041 		for (i = 0; i < pgraph.gcount; i++)
3042 			if (pgraph.proc[i] != NULL) {
3043 				pgraph.proc[i]->atime = 0;
3044 				pgraph.proc[i]->dtime = 0;
3045 			}
3046 		pgraph.mark = 1;
3047 	} else {
3048 		pgraph.mark++;
3049 	}
3050 }
3051 
3052 /*
3053  * Release the proc vertex iff both there are no in edges and out edges
3054  */
3055 
3056 static void
3057 flk_proc_release(proc_vertex_t *proc)
3058 {
3059 	ASSERT(MUTEX_HELD(&flock_lock));
3060 	if (proc->edge == NULL && proc->incount == 0) {
3061 		pgraph.proc[proc->index] = NULL;
3062 		pgraph.free++;
3063 		kmem_free(proc, sizeof (proc_vertex_t));
3064 		flk_proc_vertex_frees++;
3065 	}
3066 }
3067 
3068 /*
3069  * Updates process graph to reflect change in a lock_graph.
3070  * Note: We should call this function only after we have a correctly
3071  * recomputed lock graph. Otherwise we might miss a deadlock detection.
3072  * eg: in function flk_relation() we call this function after flk_recompute_
3073  * dependencies() otherwise if a process tries to lock a vnode hashed
3074  * into another graph it might sleep for ever.
3075  */
3076 
3077 static void
3078 flk_update_proc_graph(edge_t *ep, int delete)
3079 {
3080 	proc_vertex_t *toproc, *fromproc;
3081 	proc_edge_t *pep, *prevpep;
3082 
3083 	mutex_enter(&flock_lock);
3084 	toproc = flk_get_proc_vertex(ep->to_vertex);
3085 	fromproc = flk_get_proc_vertex(ep->from_vertex);
3086 
3087 	if (!delete)
3088 		goto add;
3089 	pep = prevpep = fromproc->edge;
3090 
3091 	ASSERT(pep != NULL);
3092 	while (pep != NULL) {
3093 		if (pep->to_proc == toproc) {
3094 			ASSERT(pep->refcount > 0);
3095 			pep->refcount--;
3096 			if (pep->refcount == 0) {
3097 				if (pep == prevpep) {
3098 					fromproc->edge = pep->next;
3099 				} else {
3100 					prevpep->next = pep->next;
3101 				}
3102 				toproc->incount--;
3103 				flk_proc_release(toproc);
3104 				flk_free_proc_edge(pep);
3105 			}
3106 			break;
3107 		}
3108 		prevpep = pep;
3109 		pep = pep->next;
3110 	}
3111 	flk_proc_release(fromproc);
3112 	mutex_exit(&flock_lock);
3113 	return;
3114 add:
3115 
3116 	pep = fromproc->edge;
3117 
3118 	while (pep != NULL) {
3119 		if (pep->to_proc == toproc) {
3120 			ASSERT(pep->refcount > 0);
3121 			pep->refcount++;
3122 			break;
3123 		}
3124 		pep = pep->next;
3125 	}
3126 	if (pep == NULL) {
3127 		pep = flk_get_proc_edge();
3128 		pep->to_proc = toproc;
3129 		pep->refcount = 1;
3130 		toproc->incount++;
3131 		pep->next = fromproc->edge;
3132 		fromproc->edge = pep;
3133 	}
3134 	mutex_exit(&flock_lock);
3135 }
3136 
3137 /*
3138  * Set the control status for lock manager requests.
3139  *
3140  */
3141 
3142 /*
3143  * PSARC case 1997/292
3144  *
3145  * Requires: "nlmid" must be >= 1 and <= clconf_maximum_nodeid().
3146  * Effects: Set the state of the NLM server identified by "nlmid"
3147  *   in the NLM registry to state "nlm_state."
3148  *   Raises exception no_such_nlm if "nlmid" doesn't identify a known
3149  *   NLM server to this LLM.
3150  *   Note that when this routine is called with NLM_SHUTTING_DOWN there
3151  *   may be locks requests that have gotten started but not finished.  In
3152  *   particular, there may be blocking requests that are in the callback code
3153  *   before sleeping (so they're not holding the lock for the graph).  If
3154  *   such a thread reacquires the graph's lock (to go to sleep) after
3155  *   NLM state in the NLM registry  is set to a non-up value,
3156  *   it will notice the status and bail out.  If the request gets
3157  *   granted before the thread can check the NLM registry, let it
3158  *   continue normally.  It will get flushed when we are called with NLM_DOWN.
3159  *
3160  * Modifies: nlm_reg_obj (global)
3161  * Arguments:
3162  *    nlmid	(IN):    id uniquely identifying an NLM server
3163  *    nlm_state (IN):    NLM server state to change "nlmid" to
3164  */
3165 void
3166 cl_flk_set_nlm_status(int nlmid, flk_nlm_status_t nlm_state)
3167 {
3168 	/*
3169 	 * Check to see if node is booted as a cluster. If not, return.
3170 	 */
3171 	if ((cluster_bootflags & CLUSTER_BOOTED) == 0) {
3172 		return;
3173 	}
3174 
3175 	/*
3176 	 * Check for development/debugging.  It is possible to boot a node
3177 	 * in non-cluster mode, and then run a special script, currently
3178 	 * available only to developers, to bring up the node as part of a
3179 	 * cluster.  The problem is that running such a script does not
3180 	 * result in the routine flk_init() being called and hence global array
3181 	 * nlm_reg_status is NULL.  The NLM thinks it's in cluster mode,
3182 	 * but the LLM needs to do an additional check to see if the global
3183 	 * array has been created or not. If nlm_reg_status is NULL, then
3184 	 * return, else continue.
3185 	 */
3186 	if (nlm_reg_status == NULL) {
3187 		return;
3188 	}
3189 
3190 	ASSERT(nlmid <= nlm_status_size && nlmid >= 0);
3191 	mutex_enter(&nlm_reg_lock);
3192 
3193 	if (FLK_REGISTRY_IS_NLM_UNKNOWN(nlm_reg_status, nlmid)) {
3194 		/*
3195 		 * If the NLM server "nlmid" is unknown in the NLM registry,
3196 		 * add it to the registry in the nlm shutting down state.
3197 		 */
3198 		FLK_REGISTRY_CHANGE_NLM_STATE(nlm_reg_status, nlmid,
3199 		    FLK_NLM_SHUTTING_DOWN);
3200 	} else {
3201 		/*
3202 		 * Change the state of the NLM server identified by "nlmid"
3203 		 * in the NLM registry to the argument "nlm_state."
3204 		 */
3205 		FLK_REGISTRY_CHANGE_NLM_STATE(nlm_reg_status, nlmid,
3206 		    nlm_state);
3207 	}
3208 
3209 	/*
3210 	 *  The reason we must register the NLM server that is shutting down
3211 	 *  with an LLM that doesn't already know about it (never sent a lock
3212 	 *  request) is to handle correctly a race between shutdown and a new
3213 	 *  lock request.  Suppose that a shutdown request from the NLM server
3214 	 *  invokes this routine at the LLM, and a thread is spawned to
3215 	 *  service the request. Now suppose a new lock request is in
3216 	 *  progress and has already passed the first line of defense in
3217 	 *  reclock(), which denies new locks requests from NLM servers
3218 	 *  that are not in the NLM_UP state.  After the current routine
3219 	 *  is invoked for both phases of shutdown, the routine will return,
3220 	 *  having done nothing, and the lock request will proceed and
3221 	 *  probably be granted.  The problem is that the shutdown was ignored
3222 	 *  by the lock request because there was no record of that NLM server
3223 	 *  shutting down.   We will be in the peculiar position of thinking
3224 	 *  that we've shutdown the NLM server and all locks at all LLMs have
3225 	 *  been discarded, but in fact there's still one lock held.
3226 	 *  The solution is to record the existence of NLM server and change
3227 	 *  its state immediately to NLM_SHUTTING_DOWN.  The lock request in
3228 	 *  progress may proceed because the next phase NLM_DOWN will catch
3229 	 *  this lock and discard it.
3230 	 */
3231 	mutex_exit(&nlm_reg_lock);
3232 
3233 	switch (nlm_state) {
3234 	case FLK_NLM_UP:
3235 		/*
3236 		 * Change the NLM state of all locks still held on behalf of
3237 		 * the NLM server identified by "nlmid" to NLM_UP.
3238 		 */
3239 		cl_flk_change_nlm_state_all_locks(nlmid, FLK_NLM_UP);
3240 		break;
3241 
3242 	case FLK_NLM_SHUTTING_DOWN:
3243 		/*
3244 		 * Wake up all sleeping locks for the NLM server identified
3245 		 * by "nlmid." Note that eventually all woken threads will
3246 		 * have their lock requests cancelled and descriptors
3247 		 * removed from the sleeping lock list.  Note that the NLM
3248 		 * server state associated with each lock descriptor is
3249 		 * changed to FLK_NLM_SHUTTING_DOWN.
3250 		 */
3251 		cl_flk_wakeup_sleeping_nlm_locks(nlmid);
3252 		break;
3253 
3254 	case FLK_NLM_DOWN:
3255 		/*
3256 		 * Discard all active, granted locks for this NLM server
3257 		 * identified by "nlmid."
3258 		 */
3259 		cl_flk_unlock_nlm_granted(nlmid);
3260 		break;
3261 
3262 	default:
3263 		panic("cl_set_nlm_status: bad status (%d)", nlm_state);
3264 	}
3265 }
3266 
3267 /*
3268  * Set the control status for lock manager requests.
3269  *
3270  * Note that when this routine is called with FLK_WAKEUP_SLEEPERS, there
3271  * may be locks requests that have gotten started but not finished.  In
3272  * particular, there may be blocking requests that are in the callback code
3273  * before sleeping (so they're not holding the lock for the graph).  If
3274  * such a thread reacquires the graph's lock (to go to sleep) after
3275  * flk_lockmgr_status is set to a non-up value, it will notice the status
3276  * and bail out.  If the request gets granted before the thread can check
3277  * flk_lockmgr_status, let it continue normally.  It will get flushed when
3278  * we are called with FLK_LOCKMGR_DOWN.
3279  */
3280 
3281 void
3282 flk_set_lockmgr_status(flk_lockmgr_status_t status)
3283 {
3284 	int i;
3285 	graph_t *gp;
3286 	struct flock_globals *fg;
3287 
3288 	fg = flk_get_globals();
3289 	ASSERT(fg != NULL);
3290 
3291 	mutex_enter(&flock_lock);
3292 	fg->flk_lockmgr_status = status;
3293 	mutex_exit(&flock_lock);
3294 
3295 	/*
3296 	 * If the lock manager is coming back up, all that's needed is to
3297 	 * propagate this information to the graphs.  If the lock manager
3298 	 * is going down, additional action is required, and each graph's
3299 	 * copy of the state is updated atomically with this other action.
3300 	 */
3301 	switch (status) {
3302 	case FLK_LOCKMGR_UP:
3303 		for (i = 0; i < HASH_SIZE; i++) {
3304 			mutex_enter(&flock_lock);
3305 			gp = lock_graph[i];
3306 			mutex_exit(&flock_lock);
3307 			if (gp == NULL)
3308 				continue;
3309 			mutex_enter(&gp->gp_mutex);
3310 			fg->lockmgr_status[i] = status;
3311 			mutex_exit(&gp->gp_mutex);
3312 		}
3313 		break;
3314 	case FLK_WAKEUP_SLEEPERS:
3315 		wakeup_sleeping_lockmgr_locks(fg);
3316 		break;
3317 	case FLK_LOCKMGR_DOWN:
3318 		unlock_lockmgr_granted(fg);
3319 		break;
3320 	default:
3321 		panic("flk_set_lockmgr_status: bad status (%d)", status);
3322 		break;
3323 	}
3324 }
3325 
3326 /*
3327  * This routine returns all the locks that are active or sleeping and are
3328  * associated with a particular set of identifiers.  If lock_state != 0, then
3329  * only locks that match the lock_state are returned. If lock_state == 0, then
3330  * all locks are returned. If pid == NOPID, the pid is ignored.  If
3331  * use_sysid is FALSE, then the sysid is ignored.  If vp is NULL, then the
3332  * vnode pointer is ignored.
3333  *
3334  * A list containing the vnode pointer and an flock structure
3335  * describing the lock is returned.  Each element in the list is
3336  * dynamically allocated and must be freed by the caller.  The
3337  * last item in the list is denoted by a NULL value in the ll_next
3338  * field.
3339  *
3340  * The vnode pointers returned are held.  The caller is responsible
3341  * for releasing these.  Note that the returned list is only a snapshot of
3342  * the current lock information, and that it is a snapshot of a moving
3343  * target (only one graph is locked at a time).
3344  */
3345 
3346 locklist_t *
3347 get_lock_list(int list_type, int lock_state, int sysid, boolean_t use_sysid,
3348 		pid_t pid, const vnode_t *vp, zoneid_t zoneid)
3349 {
3350 	lock_descriptor_t	*lock;
3351 	lock_descriptor_t	*graph_head;
3352 	locklist_t		listhead;
3353 	locklist_t		*llheadp;
3354 	locklist_t		*llp;
3355 	locklist_t		*lltp;
3356 	graph_t			*gp;
3357 	int			i;
3358 	int			first_index; /* graph index */
3359 	int			num_indexes; /* graph index */
3360 
3361 	ASSERT((list_type == FLK_ACTIVE_STATE) ||
3362 	    (list_type == FLK_SLEEPING_STATE));
3363 
3364 	/*
3365 	 * Get a pointer to something to use as a list head while building
3366 	 * the rest of the list.
3367 	 */
3368 	llheadp = &listhead;
3369 	lltp = llheadp;
3370 	llheadp->ll_next = (locklist_t *)NULL;
3371 
3372 	/* Figure out which graphs we want to look at. */
3373 	if (vp == NULL) {
3374 		first_index = 0;
3375 		num_indexes = HASH_SIZE;
3376 	} else {
3377 		first_index = HASH_INDEX(vp);
3378 		num_indexes = 1;
3379 	}
3380 
3381 	for (i = first_index; i < first_index + num_indexes; i++) {
3382 		mutex_enter(&flock_lock);
3383 		gp = lock_graph[i];
3384 		mutex_exit(&flock_lock);
3385 		if (gp == NULL) {
3386 			continue;
3387 		}
3388 
3389 		mutex_enter(&gp->gp_mutex);
3390 		graph_head = (list_type == FLK_ACTIVE_STATE) ?
3391 		    ACTIVE_HEAD(gp) : SLEEPING_HEAD(gp);
3392 		for (lock = graph_head->l_next;
3393 		    lock != graph_head;
3394 		    lock = lock->l_next) {
3395 			if (use_sysid && lock->l_flock.l_sysid != sysid)
3396 				continue;
3397 			if (pid != NOPID && lock->l_flock.l_pid != pid)
3398 				continue;
3399 			if (vp != NULL && lock->l_vnode != vp)
3400 				continue;
3401 			if (lock_state && !(lock_state & lock->l_state))
3402 				continue;
3403 			if (zoneid != lock->l_zoneid && zoneid != ALL_ZONES)
3404 				continue;
3405 			/*
3406 			 * A matching lock was found.  Allocate
3407 			 * space for a new locklist entry and fill
3408 			 * it in.
3409 			 */
3410 			llp = kmem_alloc(sizeof (locklist_t), KM_SLEEP);
3411 			lltp->ll_next = llp;
3412 			VN_HOLD(lock->l_vnode);
3413 			llp->ll_vp = lock->l_vnode;
3414 			create_flock(lock, &(llp->ll_flock));
3415 			llp->ll_next = (locklist_t *)NULL;
3416 			lltp = llp;
3417 		}
3418 		mutex_exit(&gp->gp_mutex);
3419 	}
3420 
3421 	llp = llheadp->ll_next;
3422 	return (llp);
3423 }
3424 
3425 /*
3426  * These two functions are simply interfaces to get_lock_list.  They return
3427  * a list of sleeping or active locks for the given sysid and pid.  See
3428  * get_lock_list for details.
3429  *
3430  * In either case we don't particularly care to specify the zone of interest;
3431  * the sysid-space is global across zones, so the sysid will map to exactly one
3432  * zone, and we'll return information for that zone.
3433  */
3434 
3435 locklist_t *
3436 flk_get_sleeping_locks(int sysid, pid_t pid)
3437 {
3438 	return (get_lock_list(FLK_SLEEPING_STATE, 0, sysid, B_TRUE, pid, NULL,
3439 	    ALL_ZONES));
3440 }
3441 
3442 locklist_t *
3443 flk_get_active_locks(int sysid, pid_t pid)
3444 {
3445 	return (get_lock_list(FLK_ACTIVE_STATE, 0, sysid, B_TRUE, pid, NULL,
3446 	    ALL_ZONES));
3447 }
3448 
3449 /*
3450  * Another interface to get_lock_list.  This one returns all the active
3451  * locks for a given vnode.  Again, see get_lock_list for details.
3452  *
3453  * We don't need to specify which zone's locks we're interested in.  The matter
3454  * would only be interesting if the vnode belonged to NFS, and NFS vnodes can't
3455  * be used by multiple zones, so the list of locks will all be from the right
3456  * zone.
3457  */
3458 
3459 locklist_t *
3460 flk_active_locks_for_vp(const vnode_t *vp)
3461 {
3462 	return (get_lock_list(FLK_ACTIVE_STATE, 0, 0, B_FALSE, NOPID, vp,
3463 	    ALL_ZONES));
3464 }
3465 
3466 /*
3467  * Another interface to get_lock_list.  This one returns all the active
3468  * nbmand locks for a given vnode.  Again, see get_lock_list for details.
3469  *
3470  * See the comment for flk_active_locks_for_vp() for why we don't care to
3471  * specify the particular zone of interest.
3472  */
3473 locklist_t *
3474 flk_active_nbmand_locks_for_vp(const vnode_t *vp)
3475 {
3476 	return (get_lock_list(FLK_ACTIVE_STATE, NBMAND_LOCK, 0, B_FALSE,
3477 	    NOPID, vp, ALL_ZONES));
3478 }
3479 
3480 /*
3481  * Another interface to get_lock_list.  This one returns all the active
3482  * nbmand locks for a given pid.  Again, see get_lock_list for details.
3483  *
3484  * The zone doesn't need to be specified here; the locks held by a
3485  * particular process will either be local (ie, non-NFS) or from the zone
3486  * the process is executing in.  This is because other parts of the system
3487  * ensure that an NFS vnode can't be used in a zone other than that in
3488  * which it was opened.
3489  */
3490 locklist_t *
3491 flk_active_nbmand_locks(pid_t pid)
3492 {
3493 	return (get_lock_list(FLK_ACTIVE_STATE, NBMAND_LOCK, 0, B_FALSE,
3494 	    pid, NULL, ALL_ZONES));
3495 }
3496 
3497 /*
3498  * Free up all entries in the locklist.
3499  */
3500 void
3501 flk_free_locklist(locklist_t *llp)
3502 {
3503 	locklist_t *next_llp;
3504 
3505 	while (llp) {
3506 		next_llp = llp->ll_next;
3507 		VN_RELE(llp->ll_vp);
3508 		kmem_free(llp, sizeof (*llp));
3509 		llp = next_llp;
3510 	}
3511 }
3512 
3513 static void
3514 cl_flk_change_nlm_state_all_locks(int nlmid, flk_nlm_status_t nlm_state)
3515 {
3516 	/*
3517 	 * For each graph "lg" in the hash table lock_graph do
3518 	 * a.  Get the list of sleeping locks
3519 	 * b.  For each lock descriptor in the list do
3520 	 *	i.   If the requested lock is an NLM server request AND
3521 	 *		the nlmid is the same as the routine argument then
3522 	 *		change the lock descriptor's state field to
3523 	 *		"nlm_state."
3524 	 * c.  Get the list of active locks
3525 	 * d.  For each lock descriptor in the list do
3526 	 *	i.   If the requested lock is an NLM server request AND
3527 	 *		the nlmid is the same as the routine argument then
3528 	 *		change the lock descriptor's state field to
3529 	 *		"nlm_state."
3530 	 */
3531 
3532 	int			i;
3533 	graph_t			*gp;			/* lock graph */
3534 	lock_descriptor_t	*lock;			/* lock */
3535 	lock_descriptor_t	*nlock = NULL;		/* next lock */
3536 	int			lock_nlmid;
3537 
3538 	for (i = 0; i < HASH_SIZE; i++) {
3539 		mutex_enter(&flock_lock);
3540 		gp = lock_graph[i];
3541 		mutex_exit(&flock_lock);
3542 		if (gp == NULL) {
3543 			continue;
3544 		}
3545 
3546 		/* Get list of sleeping locks in current lock graph. */
3547 		mutex_enter(&gp->gp_mutex);
3548 		for (lock = SLEEPING_HEAD(gp)->l_next;
3549 		    lock != SLEEPING_HEAD(gp);
3550 		    lock = nlock) {
3551 			nlock = lock->l_next;
3552 			/* get NLM id */
3553 			lock_nlmid = GETNLMID(lock->l_flock.l_sysid);
3554 
3555 			/*
3556 			 * If NLM server request AND nlmid of lock matches
3557 			 * nlmid of argument, then set the NLM state of the
3558 			 * lock to "nlm_state."
3559 			 */
3560 			if (IS_LOCKMGR(lock) && nlmid == lock_nlmid) {
3561 				SET_NLM_STATE(lock, nlm_state);
3562 			}
3563 		}
3564 
3565 		/* Get list of active locks in current lock graph. */
3566 		for (lock = ACTIVE_HEAD(gp)->l_next;
3567 		    lock != ACTIVE_HEAD(gp);
3568 		    lock = nlock) {
3569 			nlock = lock->l_next;
3570 			/* get NLM id */
3571 			lock_nlmid = GETNLMID(lock->l_flock.l_sysid);
3572 
3573 			/*
3574 			 * If NLM server request AND nlmid of lock matches
3575 			 * nlmid of argument, then set the NLM state of the
3576 			 * lock to "nlm_state."
3577 			 */
3578 			if (IS_LOCKMGR(lock) && nlmid == lock_nlmid) {
3579 				ASSERT(IS_ACTIVE(lock));
3580 				SET_NLM_STATE(lock, nlm_state);
3581 			}
3582 		}
3583 		mutex_exit(&gp->gp_mutex);
3584 	}
3585 }
3586 
3587 /*
3588  * Requires: "nlmid" >= 1 and <= clconf_maximum_nodeid().
3589  * Effects: Find all sleeping lock manager requests _only_ for the NLM server
3590  *   identified by "nlmid." Poke those lock requests.
3591  */
3592 static void
3593 cl_flk_wakeup_sleeping_nlm_locks(int nlmid)
3594 {
3595 	lock_descriptor_t *lock;
3596 	lock_descriptor_t *nlock = NULL; /* next lock */
3597 	int i;
3598 	graph_t *gp;
3599 	int	lock_nlmid;
3600 
3601 	for (i = 0; i < HASH_SIZE; i++) {
3602 		mutex_enter(&flock_lock);
3603 		gp = lock_graph[i];
3604 		mutex_exit(&flock_lock);
3605 		if (gp == NULL) {
3606 			continue;
3607 		}
3608 
3609 		mutex_enter(&gp->gp_mutex);
3610 		for (lock = SLEEPING_HEAD(gp)->l_next;
3611 		    lock != SLEEPING_HEAD(gp);
3612 		    lock = nlock) {
3613 			nlock = lock->l_next;
3614 			/*
3615 			 * If NLM server request _and_ nlmid of lock matches
3616 			 * nlmid of argument, then set the NLM state of the
3617 			 * lock to NLM_SHUTTING_DOWN, and wake up sleeping
3618 			 * request.
3619 			 */
3620 			if (IS_LOCKMGR(lock)) {
3621 				/* get NLM id */
3622 				lock_nlmid =
3623 				    GETNLMID(lock->l_flock.l_sysid);
3624 				if (nlmid == lock_nlmid) {
3625 					SET_NLM_STATE(lock,
3626 					    FLK_NLM_SHUTTING_DOWN);
3627 					INTERRUPT_WAKEUP(lock);
3628 				}
3629 			}
3630 		}
3631 		mutex_exit(&gp->gp_mutex);
3632 	}
3633 }
3634 
3635 /*
3636  * Requires: "nlmid" >= 1 and <= clconf_maximum_nodeid()
3637  * Effects:  Find all active (granted) lock manager locks _only_ for the
3638  *   NLM server identified by "nlmid" and release them.
3639  */
3640 static void
3641 cl_flk_unlock_nlm_granted(int nlmid)
3642 {
3643 	lock_descriptor_t *lock;
3644 	lock_descriptor_t *nlock = NULL; /* next lock */
3645 	int i;
3646 	graph_t *gp;
3647 	int	lock_nlmid;
3648 
3649 	for (i = 0; i < HASH_SIZE; i++) {
3650 		mutex_enter(&flock_lock);
3651 		gp = lock_graph[i];
3652 		mutex_exit(&flock_lock);
3653 		if (gp == NULL) {
3654 			continue;
3655 		}
3656 
3657 		mutex_enter(&gp->gp_mutex);
3658 		for (lock = ACTIVE_HEAD(gp)->l_next;
3659 		    lock != ACTIVE_HEAD(gp);
3660 		    lock = nlock) {
3661 			nlock = lock->l_next;
3662 			ASSERT(IS_ACTIVE(lock));
3663 
3664 			/*
3665 			 * If it's an  NLM server request _and_ nlmid of
3666 			 * the lock matches nlmid of argument, then
3667 			 * remove the active lock the list, wakup blocked
3668 			 * threads, and free the storage for the lock.
3669 			 * Note that there's no need to mark the NLM state
3670 			 * of this lock to NLM_DOWN because the lock will
3671 			 * be deleted anyway and its storage freed.
3672 			 */
3673 			if (IS_LOCKMGR(lock)) {
3674 				/* get NLM id */
3675 				lock_nlmid = GETNLMID(lock->l_flock.l_sysid);
3676 				if (nlmid == lock_nlmid) {
3677 					flk_delete_active_lock(lock, 0);
3678 					flk_wakeup(lock, 1);
3679 					flk_free_lock(lock);
3680 				}
3681 			}
3682 		}
3683 		mutex_exit(&gp->gp_mutex);
3684 	}
3685 }
3686 
3687 /*
3688  * Find all sleeping lock manager requests and poke them.
3689  */
3690 static void
3691 wakeup_sleeping_lockmgr_locks(struct flock_globals *fg)
3692 {
3693 	lock_descriptor_t *lock;
3694 	lock_descriptor_t *nlock = NULL; /* next lock */
3695 	int i;
3696 	graph_t *gp;
3697 	zoneid_t zoneid = getzoneid();
3698 
3699 	for (i = 0; i < HASH_SIZE; i++) {
3700 		mutex_enter(&flock_lock);
3701 		gp = lock_graph[i];
3702 		mutex_exit(&flock_lock);
3703 		if (gp == NULL) {
3704 			continue;
3705 		}
3706 
3707 		mutex_enter(&gp->gp_mutex);
3708 		fg->lockmgr_status[i] = FLK_WAKEUP_SLEEPERS;
3709 		for (lock = SLEEPING_HEAD(gp)->l_next;
3710 		    lock != SLEEPING_HEAD(gp);
3711 		    lock = nlock) {
3712 			nlock = lock->l_next;
3713 			if (IS_LOCKMGR(lock) && lock->l_zoneid == zoneid) {
3714 				INTERRUPT_WAKEUP(lock);
3715 			}
3716 		}
3717 		mutex_exit(&gp->gp_mutex);
3718 	}
3719 }
3720 
3721 
3722 /*
3723  * Find all active (granted) lock manager locks and release them.
3724  */
3725 static void
3726 unlock_lockmgr_granted(struct flock_globals *fg)
3727 {
3728 	lock_descriptor_t *lock;
3729 	lock_descriptor_t *nlock = NULL; /* next lock */
3730 	int i;
3731 	graph_t *gp;
3732 	zoneid_t zoneid = getzoneid();
3733 
3734 	for (i = 0; i < HASH_SIZE; i++) {
3735 		mutex_enter(&flock_lock);
3736 		gp = lock_graph[i];
3737 		mutex_exit(&flock_lock);
3738 		if (gp == NULL) {
3739 			continue;
3740 		}
3741 
3742 		mutex_enter(&gp->gp_mutex);
3743 		fg->lockmgr_status[i] = FLK_LOCKMGR_DOWN;
3744 		for (lock = ACTIVE_HEAD(gp)->l_next;
3745 		    lock != ACTIVE_HEAD(gp);
3746 		    lock = nlock) {
3747 			nlock = lock->l_next;
3748 			if (IS_LOCKMGR(lock) && lock->l_zoneid == zoneid) {
3749 				ASSERT(IS_ACTIVE(lock));
3750 				flk_delete_active_lock(lock, 0);
3751 				flk_wakeup(lock, 1);
3752 				flk_free_lock(lock);
3753 			}
3754 		}
3755 		mutex_exit(&gp->gp_mutex);
3756 	}
3757 }
3758 
3759 
3760 /*
3761  * Wait until a lock is granted, cancelled, or interrupted.
3762  */
3763 
3764 static void
3765 wait_for_lock(lock_descriptor_t *request)
3766 {
3767 	graph_t *gp = request->l_graph;
3768 
3769 	ASSERT(MUTEX_HELD(&gp->gp_mutex));
3770 
3771 	while (!(IS_GRANTED(request)) && !(IS_CANCELLED(request)) &&
3772 	    !(IS_INTERRUPTED(request))) {
3773 		if (!cv_wait_sig(&request->l_cv, &gp->gp_mutex)) {
3774 			flk_set_state(request, FLK_INTERRUPTED_STATE);
3775 			request->l_state |= INTERRUPTED_LOCK;
3776 		}
3777 	}
3778 }
3779 
3780 /*
3781  * Create an flock structure from the existing lock information
3782  *
3783  * This routine is used to create flock structures for the lock manager
3784  * to use in a reclaim request.  Since the lock was originated on this
3785  * host, it must be conforming to UNIX semantics, so no checking is
3786  * done to make sure it falls within the lower half of the 32-bit range.
3787  */
3788 
3789 static void
3790 create_flock(lock_descriptor_t *lp, flock64_t *flp)
3791 {
3792 	ASSERT(lp->l_end == MAX_U_OFFSET_T || lp->l_end <= MAXEND);
3793 	ASSERT(lp->l_end >= lp->l_start);
3794 
3795 	flp->l_type = lp->l_type;
3796 	flp->l_whence = 0;
3797 	flp->l_start = lp->l_start;
3798 	flp->l_len = (lp->l_end == MAX_U_OFFSET_T) ? 0 :
3799 	    (lp->l_end - lp->l_start + 1);
3800 	flp->l_sysid = lp->l_flock.l_sysid;
3801 	flp->l_pid = lp->l_flock.l_pid;
3802 }
3803 
3804 /*
3805  * Convert flock_t data describing a lock range into unsigned long starting
3806  * and ending points, which are put into lock_request.  Returns 0 or an
3807  * errno value.
3808  * Large Files: max is passed by the caller and we return EOVERFLOW
3809  * as defined by LFS API.
3810  */
3811 
3812 int
3813 flk_convert_lock_data(vnode_t *vp, flock64_t *flp,
3814     u_offset_t *start, u_offset_t *end, offset_t offset)
3815 {
3816 	struct vattr	vattr;
3817 	int	error;
3818 
3819 	/*
3820 	 * Determine the starting point of the request
3821 	 */
3822 	switch (flp->l_whence) {
3823 	case 0:		/* SEEK_SET */
3824 		*start = (u_offset_t)flp->l_start;
3825 		break;
3826 	case 1:		/* SEEK_CUR */
3827 		*start = (u_offset_t)(flp->l_start + offset);
3828 		break;
3829 	case 2:		/* SEEK_END */
3830 		vattr.va_mask = AT_SIZE;
3831 		if (error = VOP_GETATTR(vp, &vattr, 0, CRED(), NULL))
3832 			return (error);
3833 		*start = (u_offset_t)(flp->l_start + vattr.va_size);
3834 		break;
3835 	default:
3836 		return (EINVAL);
3837 	}
3838 
3839 	/*
3840 	 * Determine the range covered by the request.
3841 	 */
3842 	if (flp->l_len == 0)
3843 		*end = MAX_U_OFFSET_T;
3844 	else if ((offset_t)flp->l_len > 0) {
3845 		*end = (u_offset_t)(*start + (flp->l_len - 1));
3846 	} else {
3847 		/*
3848 		 * Negative length; why do we even allow this ?
3849 		 * Because this allows easy specification of
3850 		 * the last n bytes of the file.
3851 		 */
3852 		*end = *start;
3853 		*start += (u_offset_t)flp->l_len;
3854 		(*start)++;
3855 	}
3856 	return (0);
3857 }
3858 
3859 /*
3860  * Check the validity of lock data.  This can used by the NFS
3861  * frlock routines to check data before contacting the server.  The
3862  * server must support semantics that aren't as restrictive as
3863  * the UNIX API, so the NFS client is required to check.
3864  * The maximum is now passed in by the caller.
3865  */
3866 
3867 int
3868 flk_check_lock_data(u_offset_t start, u_offset_t end, offset_t max)
3869 {
3870 	/*
3871 	 * The end (length) for local locking should never be greater
3872 	 * than MAXEND. However, the representation for
3873 	 * the entire file is MAX_U_OFFSET_T.
3874 	 */
3875 	if ((start > max) ||
3876 	    ((end > max) && (end != MAX_U_OFFSET_T))) {
3877 		return (EINVAL);
3878 	}
3879 	if (start > end) {
3880 		return (EINVAL);
3881 	}
3882 	return (0);
3883 }
3884 
3885 /*
3886  * Fill in request->l_flock with information about the lock blocking the
3887  * request.  The complexity here is that lock manager requests are allowed
3888  * to see into the upper part of the 32-bit address range, whereas local
3889  * requests are only allowed to see signed values.
3890  *
3891  * What should be done when "blocker" is a lock manager lock that uses the
3892  * upper portion of the 32-bit range, but "request" is local?  Since the
3893  * request has already been determined to have been blocked by the blocker,
3894  * at least some portion of "blocker" must be in the range of the request,
3895  * or the request extends to the end of file.  For the first case, the
3896  * portion in the lower range is returned with the indication that it goes
3897  * "to EOF."  For the second case, the last byte of the lower range is
3898  * returned with the indication that it goes "to EOF."
3899  */
3900 
3901 static void
3902 report_blocker(lock_descriptor_t *blocker, lock_descriptor_t *request)
3903 {
3904 	flock64_t *flrp;			/* l_flock portion of request */
3905 
3906 	ASSERT(blocker != NULL);
3907 
3908 	flrp = &request->l_flock;
3909 	flrp->l_whence = 0;
3910 	flrp->l_type = blocker->l_type;
3911 	flrp->l_pid = blocker->l_flock.l_pid;
3912 	flrp->l_sysid = blocker->l_flock.l_sysid;
3913 
3914 	if (IS_LOCKMGR(request)) {
3915 		flrp->l_start = blocker->l_start;
3916 		if (blocker->l_end == MAX_U_OFFSET_T)
3917 			flrp->l_len = 0;
3918 		else
3919 			flrp->l_len = blocker->l_end - blocker->l_start + 1;
3920 	} else {
3921 		if (blocker->l_start > MAXEND) {
3922 			flrp->l_start = MAXEND;
3923 			flrp->l_len = 0;
3924 		} else {
3925 			flrp->l_start = blocker->l_start;
3926 			if (blocker->l_end == MAX_U_OFFSET_T)
3927 				flrp->l_len = 0;
3928 			else
3929 				flrp->l_len = blocker->l_end -
3930 				    blocker->l_start + 1;
3931 		}
3932 	}
3933 }
3934 
3935 /*
3936  * PSARC case 1997/292
3937  */
3938 /*
3939  * This is the public routine exported by flock.h.
3940  */
3941 void
3942 cl_flk_change_nlm_state_to_unknown(int nlmid)
3943 {
3944 	/*
3945 	 * Check to see if node is booted as a cluster. If not, return.
3946 	 */
3947 	if ((cluster_bootflags & CLUSTER_BOOTED) == 0) {
3948 		return;
3949 	}
3950 
3951 	/*
3952 	 * See comment in cl_flk_set_nlm_status().
3953 	 */
3954 	if (nlm_reg_status == NULL) {
3955 		return;
3956 	}
3957 
3958 	/*
3959 	 * protect NLM registry state with a mutex.
3960 	 */
3961 	ASSERT(nlmid <= nlm_status_size && nlmid >= 0);
3962 	mutex_enter(&nlm_reg_lock);
3963 	FLK_REGISTRY_CHANGE_NLM_STATE(nlm_reg_status, nlmid, FLK_NLM_UNKNOWN);
3964 	mutex_exit(&nlm_reg_lock);
3965 }
3966 
3967 /*
3968  * Return non-zero if the given I/O request conflicts with an active NBMAND
3969  * lock.
3970  * If svmand is non-zero, it means look at all active locks, not just NBMAND
3971  * locks.
3972  */
3973 
3974 int
3975 nbl_lock_conflict(vnode_t *vp, nbl_op_t op, u_offset_t offset,
3976 		ssize_t length, int svmand, caller_context_t *ct)
3977 {
3978 	int conflict = 0;
3979 	graph_t			*gp;
3980 	lock_descriptor_t	*lock;
3981 	pid_t pid;
3982 	int sysid;
3983 
3984 	if (ct == NULL) {
3985 		pid = curproc->p_pid;
3986 		sysid = 0;
3987 	} else {
3988 		pid = ct->cc_pid;
3989 		sysid = ct->cc_sysid;
3990 	}
3991 
3992 	mutex_enter(&flock_lock);
3993 	gp = lock_graph[HASH_INDEX(vp)];
3994 	mutex_exit(&flock_lock);
3995 	if (gp == NULL)
3996 		return (0);
3997 
3998 	mutex_enter(&gp->gp_mutex);
3999 	SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
4000 
4001 	for (; lock && lock->l_vnode == vp; lock = lock->l_next) {
4002 		if ((svmand || (lock->l_state & NBMAND_LOCK)) &&
4003 		    (lock->l_flock.l_sysid != sysid ||
4004 		    lock->l_flock.l_pid != pid) &&
4005 		    lock_blocks_io(op, offset, length,
4006 		    lock->l_type, lock->l_start, lock->l_end)) {
4007 			conflict = 1;
4008 			break;
4009 		}
4010 	}
4011 	mutex_exit(&gp->gp_mutex);
4012 
4013 	return (conflict);
4014 }
4015 
4016 /*
4017  * Return non-zero if the given I/O request conflicts with the given lock.
4018  */
4019 
4020 static int
4021 lock_blocks_io(nbl_op_t op, u_offset_t offset, ssize_t length,
4022 	    int lock_type, u_offset_t lock_start, u_offset_t lock_end)
4023 {
4024 	ASSERT(op == NBL_READ || op == NBL_WRITE || op == NBL_READWRITE);
4025 	ASSERT(lock_type == F_RDLCK || lock_type == F_WRLCK);
4026 
4027 	if (op == NBL_READ && lock_type == F_RDLCK)
4028 		return (0);
4029 
4030 	if (offset <= lock_start && lock_start < offset + length)
4031 		return (1);
4032 	if (lock_start <= offset && offset <= lock_end)
4033 		return (1);
4034 
4035 	return (0);
4036 }
4037 
4038 #ifdef DEBUG
4039 static void
4040 check_active_locks(graph_t *gp)
4041 {
4042 	lock_descriptor_t *lock, *lock1;
4043 	edge_t	*ep;
4044 
4045 	for (lock = ACTIVE_HEAD(gp)->l_next; lock != ACTIVE_HEAD(gp);
4046 	    lock = lock->l_next) {
4047 		ASSERT(IS_ACTIVE(lock));
4048 		ASSERT(NOT_BLOCKED(lock));
4049 		ASSERT(!IS_BARRIER(lock));
4050 
4051 		ep = FIRST_IN(lock);
4052 
4053 		while (ep != HEAD(lock)) {
4054 			ASSERT(IS_SLEEPING(ep->from_vertex));
4055 			ASSERT(!NOT_BLOCKED(ep->from_vertex));
4056 			ep = NEXT_IN(ep);
4057 		}
4058 
4059 		for (lock1 = lock->l_next; lock1 != ACTIVE_HEAD(gp);
4060 		    lock1 = lock1->l_next) {
4061 			if (lock1->l_vnode == lock->l_vnode) {
4062 			if (BLOCKS(lock1, lock)) {
4063 				cmn_err(CE_PANIC,
4064 				    "active lock %p blocks %p",
4065 				    (void *)lock1, (void *)lock);
4066 			} else if (BLOCKS(lock, lock1)) {
4067 				cmn_err(CE_PANIC,
4068 				    "active lock %p blocks %p",
4069 				    (void *)lock, (void *)lock1);
4070 			}
4071 			}
4072 		}
4073 	}
4074 }
4075 
4076 /*
4077  * Effect: This functions checks to see if the transition from 'old_state' to
4078  *	'new_state' is a valid one.  It returns 0 if the transition is valid
4079  *	and 1 if it is not.
4080  *	For a map of valid transitions, see sys/flock_impl.h
4081  */
4082 static int
4083 check_lock_transition(int old_state, int new_state)
4084 {
4085 	switch (old_state) {
4086 	case FLK_INITIAL_STATE:
4087 		if ((new_state == FLK_START_STATE) ||
4088 		    (new_state == FLK_SLEEPING_STATE) ||
4089 		    (new_state == FLK_ACTIVE_STATE) ||
4090 		    (new_state == FLK_DEAD_STATE)) {
4091 			return (0);
4092 		} else {
4093 			return (1);
4094 		}
4095 	case FLK_START_STATE:
4096 		if ((new_state == FLK_ACTIVE_STATE) ||
4097 		    (new_state == FLK_DEAD_STATE)) {
4098 			return (0);
4099 		} else {
4100 			return (1);
4101 		}
4102 	case FLK_ACTIVE_STATE:
4103 		if (new_state == FLK_DEAD_STATE) {
4104 			return (0);
4105 		} else {
4106 			return (1);
4107 		}
4108 	case FLK_SLEEPING_STATE:
4109 		if ((new_state == FLK_GRANTED_STATE) ||
4110 		    (new_state == FLK_INTERRUPTED_STATE) ||
4111 		    (new_state == FLK_CANCELLED_STATE)) {
4112 			return (0);
4113 		} else {
4114 			return (1);
4115 		}
4116 	case FLK_GRANTED_STATE:
4117 		if ((new_state == FLK_START_STATE) ||
4118 		    (new_state == FLK_INTERRUPTED_STATE) ||
4119 		    (new_state == FLK_CANCELLED_STATE)) {
4120 			return (0);
4121 		} else {
4122 			return (1);
4123 		}
4124 	case FLK_CANCELLED_STATE:
4125 		if ((new_state == FLK_INTERRUPTED_STATE) ||
4126 		    (new_state == FLK_DEAD_STATE)) {
4127 			return (0);
4128 		} else {
4129 			return (1);
4130 		}
4131 	case FLK_INTERRUPTED_STATE:
4132 		if (new_state == FLK_DEAD_STATE) {
4133 			return (0);
4134 		} else {
4135 			return (1);
4136 		}
4137 	case FLK_DEAD_STATE:
4138 		/* May be set more than once */
4139 		if (new_state == FLK_DEAD_STATE) {
4140 			return (0);
4141 		} else {
4142 			return (1);
4143 		}
4144 	default:
4145 		return (1);
4146 	}
4147 }
4148 
4149 static void
4150 check_sleeping_locks(graph_t *gp)
4151 {
4152 	lock_descriptor_t *lock1, *lock2;
4153 	edge_t *ep;
4154 	for (lock1 = SLEEPING_HEAD(gp)->l_next; lock1 != SLEEPING_HEAD(gp);
4155 	    lock1 = lock1->l_next) {
4156 				ASSERT(!IS_BARRIER(lock1));
4157 	for (lock2 = lock1->l_next; lock2 != SLEEPING_HEAD(gp);
4158 	    lock2 = lock2->l_next) {
4159 		if (lock1->l_vnode == lock2->l_vnode) {
4160 			if (BLOCKS(lock2, lock1)) {
4161 				ASSERT(!IS_GRANTED(lock1));
4162 				ASSERT(!NOT_BLOCKED(lock1));
4163 				path(lock1, lock2);
4164 			}
4165 		}
4166 	}
4167 
4168 	for (lock2 = ACTIVE_HEAD(gp)->l_next; lock2 != ACTIVE_HEAD(gp);
4169 	    lock2 = lock2->l_next) {
4170 				ASSERT(!IS_BARRIER(lock1));
4171 		if (lock1->l_vnode == lock2->l_vnode) {
4172 			if (BLOCKS(lock2, lock1)) {
4173 				ASSERT(!IS_GRANTED(lock1));
4174 				ASSERT(!NOT_BLOCKED(lock1));
4175 				path(lock1, lock2);
4176 			}
4177 		}
4178 	}
4179 	ep = FIRST_ADJ(lock1);
4180 	while (ep != HEAD(lock1)) {
4181 		ASSERT(BLOCKS(ep->to_vertex, lock1));
4182 		ep = NEXT_ADJ(ep);
4183 	}
4184 	}
4185 }
4186 
4187 static int
4188 level_two_path(lock_descriptor_t *lock1, lock_descriptor_t *lock2, int no_path)
4189 {
4190 	edge_t	*ep;
4191 	lock_descriptor_t	*vertex;
4192 	lock_descriptor_t *vertex_stack;
4193 
4194 	STACK_INIT(vertex_stack);
4195 
4196 	flk_graph_uncolor(lock1->l_graph);
4197 	ep = FIRST_ADJ(lock1);
4198 	ASSERT(ep != HEAD(lock1));
4199 	while (ep != HEAD(lock1)) {
4200 		if (no_path)
4201 			ASSERT(ep->to_vertex != lock2);
4202 		STACK_PUSH(vertex_stack, ep->to_vertex, l_dstack);
4203 		COLOR(ep->to_vertex);
4204 		ep = NEXT_ADJ(ep);
4205 	}
4206 
4207 	while ((vertex = STACK_TOP(vertex_stack)) != NULL) {
4208 		STACK_POP(vertex_stack, l_dstack);
4209 		for (ep = FIRST_ADJ(vertex); ep != HEAD(vertex);
4210 		    ep = NEXT_ADJ(ep)) {
4211 			if (COLORED(ep->to_vertex))
4212 				continue;
4213 			COLOR(ep->to_vertex);
4214 			if (ep->to_vertex == lock2)
4215 				return (1);
4216 
4217 			STACK_PUSH(vertex_stack, ep->to_vertex, l_dstack);
4218 		}
4219 	}
4220 	return (0);
4221 }
4222 
4223 static void
4224 check_owner_locks(graph_t *gp, pid_t pid, int sysid, vnode_t *vp)
4225 {
4226 	lock_descriptor_t *lock;
4227 
4228 	SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
4229 
4230 	if (lock) {
4231 		while (lock != ACTIVE_HEAD(gp) && (lock->l_vnode == vp)) {
4232 			if (lock->l_flock.l_pid == pid &&
4233 			    lock->l_flock.l_sysid == sysid)
4234 				cmn_err(CE_PANIC,
4235 				    "owner pid %d's lock %p in active queue",
4236 				    pid, (void *)lock);
4237 			lock = lock->l_next;
4238 		}
4239 	}
4240 	SET_LOCK_TO_FIRST_SLEEP_VP(gp, lock, vp);
4241 
4242 	if (lock) {
4243 		while (lock != SLEEPING_HEAD(gp) && (lock->l_vnode == vp)) {
4244 			if (lock->l_flock.l_pid == pid &&
4245 			    lock->l_flock.l_sysid == sysid)
4246 				cmn_err(CE_PANIC,
4247 				    "owner pid %d's lock %p in sleep queue",
4248 				    pid, (void *)lock);
4249 			lock = lock->l_next;
4250 		}
4251 	}
4252 }
4253 
4254 static int
4255 level_one_path(lock_descriptor_t *lock1, lock_descriptor_t *lock2)
4256 {
4257 	edge_t *ep = FIRST_ADJ(lock1);
4258 
4259 	while (ep != HEAD(lock1)) {
4260 		if (ep->to_vertex == lock2)
4261 			return (1);
4262 		else
4263 			ep = NEXT_ADJ(ep);
4264 	}
4265 	return (0);
4266 }
4267 
4268 static int
4269 no_path(lock_descriptor_t *lock1, lock_descriptor_t *lock2)
4270 {
4271 	return (!level_two_path(lock1, lock2, 1));
4272 }
4273 
4274 static void
4275 path(lock_descriptor_t *lock1, lock_descriptor_t *lock2)
4276 {
4277 	if (level_one_path(lock1, lock2)) {
4278 		if (level_two_path(lock1, lock2, 0) != 0) {
4279 			cmn_err(CE_WARN,
4280 			    "one edge one path from lock1 %p lock2 %p",
4281 			    (void *)lock1, (void *)lock2);
4282 		}
4283 	} else if (no_path(lock1, lock2)) {
4284 		cmn_err(CE_PANIC,
4285 		    "No path from  lock1 %p to lock2 %p",
4286 		    (void *)lock1, (void *)lock2);
4287 	}
4288 }
4289 #endif /* DEBUG */
4290