xref: /illumos-gate/usr/src/uts/common/os/flock.c (revision a6bde1a23b60f140c7ed78df979c2e22b1ed9b2c)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 /*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
28 /*	All Rights Reserved */
29 
30 /*
31  * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
32  */
33 
34 #include <sys/flock_impl.h>
35 #include <sys/vfs.h>
36 #include <sys/t_lock.h>		/* for <sys/callb.h> */
37 #include <sys/callb.h>
38 #include <sys/clconf.h>
39 #include <sys/cladm.h>
40 #include <sys/nbmlock.h>
41 #include <sys/cred.h>
42 #include <sys/policy.h>
43 
44 /*
45  * The following four variables are for statistics purposes and they are
46  * not protected by locks. They may not be accurate but will at least be
47  * close to the actual value.
48  */
49 
50 int	flk_lock_allocs;
51 int	flk_lock_frees;
52 int 	edge_allocs;
53 int	edge_frees;
54 int 	flk_proc_vertex_allocs;
55 int 	flk_proc_edge_allocs;
56 int	flk_proc_vertex_frees;
57 int	flk_proc_edge_frees;
58 
59 static kmutex_t flock_lock;
60 
61 #ifdef DEBUG
62 int check_debug = 0;
63 #define	CHECK_ACTIVE_LOCKS(gp)	if (check_debug) \
64 					check_active_locks(gp);
65 #define	CHECK_SLEEPING_LOCKS(gp)	if (check_debug) \
66 						check_sleeping_locks(gp);
67 #define	CHECK_OWNER_LOCKS(gp, pid, sysid, vp) 	\
68 		if (check_debug)	\
69 			check_owner_locks(gp, pid, sysid, vp);
70 #define	CHECK_LOCK_TRANSITION(old_state, new_state) \
71 	{ \
72 		if (check_lock_transition(old_state, new_state)) { \
73 			cmn_err(CE_PANIC, "Illegal lock transition \
74 			    from %d to %d", old_state, new_state); \
75 		} \
76 	}
77 #else
78 
79 #define	CHECK_ACTIVE_LOCKS(gp)
80 #define	CHECK_SLEEPING_LOCKS(gp)
81 #define	CHECK_OWNER_LOCKS(gp, pid, sysid, vp)
82 #define	CHECK_LOCK_TRANSITION(old_state, new_state)
83 
84 #endif /* DEBUG */
85 
86 struct kmem_cache	*flk_edge_cache;
87 
88 graph_t		*lock_graph[HASH_SIZE];
89 proc_graph_t	pgraph;
90 
91 /*
92  * Clustering.
93  *
94  * NLM REGISTRY TYPE IMPLEMENTATION
95  *
96  * Assumptions:
97  *  1.  Nodes in a cluster are numbered starting at 1; always non-negative
98  *	integers; maximum node id is returned by clconf_maximum_nodeid().
99  *  2.  We use this node id to identify the node an NLM server runs on.
100  */
101 
102 /*
103  * NLM registry object keeps track of NLM servers via their
104  * nlmids (which are the node ids of the node in the cluster they run on)
105  * that have requested locks at this LLM with which this registry is
106  * associated.
107  *
108  * Representation of abstraction:
109  *    rep = record[	states: array[nlm_state],
110  *			lock: mutex]
111  *
112  *    Representation invariants:
113  *	1. index i of rep.states is between 0 and n - 1 where n is number
114  *	   of elements in the array, which happen to be the maximum number
115  *	   of nodes in the cluster configuration + 1.
116  *	2. map nlmid to index i of rep.states
117  *		0   -> 0
118  *		1   -> 1
119  *		2   -> 2
120  *		n-1 -> clconf_maximum_nodeid()+1
121  *	3.  This 1-1 mapping is quite convenient and it avoids errors resulting
122  *	    from forgetting to subtract 1 from the index.
123  *	4.  The reason we keep the 0th index is the following.  A legitimate
124  *	    cluster configuration includes making a UFS file system NFS
125  *	    exportable.  The code is structured so that if you're in a cluster
126  *	    you do one thing; otherwise, you do something else.  The problem
127  *	    is what to do if you think you're in a cluster with PXFS loaded,
128  *	    but you're using UFS not PXFS?  The upper two bytes of the sysid
129  *	    encode the node id of the node where NLM server runs; these bytes
130  *	    are zero for UFS.  Since the nodeid is used to index into the
131  *	    registry, we can record the NLM server state information at index
132  *	    0 using the same mechanism used for PXFS file locks!
133  */
134 static flk_nlm_status_t *nlm_reg_status = NULL;	/* state array 0..N-1 */
135 static kmutex_t nlm_reg_lock;			/* lock to protect arrary */
136 static uint_t nlm_status_size;			/* size of state array */
137 
138 /*
139  * Although we need a global lock dependency graph (and associated data
140  * structures), we also need a per-zone notion of whether the lock manager is
141  * running, and so whether to allow lock manager requests or not.
142  *
143  * Thus, on a per-zone basis we maintain a ``global'' variable
144  * (flk_lockmgr_status), protected by flock_lock, and set when the lock
145  * manager is determined to be changing state (starting or stopping).
146  *
147  * Each graph/zone pair also has a copy of this variable, which is protected by
148  * the graph's mutex.
149  *
150  * The per-graph copies are used to synchronize lock requests with shutdown
151  * requests.  The global copy is used to initialize the per-graph field when a
152  * new graph is created.
153  */
154 struct flock_globals {
155 	flk_lockmgr_status_t flk_lockmgr_status;
156 	flk_lockmgr_status_t lockmgr_status[HASH_SIZE];
157 };
158 
159 zone_key_t flock_zone_key;
160 
161 static void create_flock(lock_descriptor_t *, flock64_t *);
162 static lock_descriptor_t	*flk_get_lock(void);
163 static void	flk_free_lock(lock_descriptor_t	*lock);
164 static void	flk_get_first_blocking_lock(lock_descriptor_t *request);
165 static int flk_process_request(lock_descriptor_t *);
166 static int flk_add_edge(lock_descriptor_t *, lock_descriptor_t *, int, int);
167 static edge_t *flk_get_edge(void);
168 static int flk_wait_execute_request(lock_descriptor_t *);
169 static int flk_relation(lock_descriptor_t *, lock_descriptor_t *);
170 static void flk_insert_active_lock(lock_descriptor_t *);
171 static void flk_delete_active_lock(lock_descriptor_t *, int);
172 static void flk_insert_sleeping_lock(lock_descriptor_t *);
173 static void flk_graph_uncolor(graph_t *);
174 static void flk_wakeup(lock_descriptor_t *, int);
175 static void flk_free_edge(edge_t *);
176 static void flk_recompute_dependencies(lock_descriptor_t *,
177 			lock_descriptor_t **,  int, int);
178 static int flk_find_barriers(lock_descriptor_t *);
179 static void flk_update_barriers(lock_descriptor_t *);
180 static int flk_color_reachables(lock_descriptor_t *);
181 static int flk_canceled(lock_descriptor_t *);
182 static void flk_delete_locks_by_sysid(lock_descriptor_t *);
183 static void report_blocker(lock_descriptor_t *, lock_descriptor_t *);
184 static void wait_for_lock(lock_descriptor_t *);
185 static void unlock_lockmgr_granted(struct flock_globals *);
186 static void wakeup_sleeping_lockmgr_locks(struct flock_globals *);
187 
188 /* Clustering hooks */
189 static void cl_flk_change_nlm_state_all_locks(int, flk_nlm_status_t);
190 static void cl_flk_wakeup_sleeping_nlm_locks(int);
191 static void cl_flk_unlock_nlm_granted(int);
192 
193 #ifdef DEBUG
194 static int check_lock_transition(int, int);
195 static void check_sleeping_locks(graph_t *);
196 static void check_active_locks(graph_t *);
197 static int no_path(lock_descriptor_t *, lock_descriptor_t *);
198 static void path(lock_descriptor_t *, lock_descriptor_t *);
199 static void check_owner_locks(graph_t *, pid_t, int, vnode_t *);
200 static int level_one_path(lock_descriptor_t *, lock_descriptor_t *);
201 static int level_two_path(lock_descriptor_t *, lock_descriptor_t *, int);
202 #endif
203 
204 /*	proc_graph function definitions */
205 static int flk_check_deadlock(lock_descriptor_t *);
206 static void flk_proc_graph_uncolor(void);
207 static proc_vertex_t *flk_get_proc_vertex(lock_descriptor_t *);
208 static proc_edge_t *flk_get_proc_edge(void);
209 static void flk_proc_release(proc_vertex_t *);
210 static void flk_free_proc_edge(proc_edge_t *);
211 static void flk_update_proc_graph(edge_t *, int);
212 
213 /* Non-blocking mandatory locking */
214 static int lock_blocks_io(nbl_op_t, u_offset_t, ssize_t, int, u_offset_t,
215 			u_offset_t);
216 
217 static struct flock_globals *
218 flk_get_globals(void)
219 {
220 	/*
221 	 * The KLM module had better be loaded if we're attempting to handle
222 	 * lockmgr requests.
223 	 */
224 	ASSERT(flock_zone_key != ZONE_KEY_UNINITIALIZED);
225 	return (zone_getspecific(flock_zone_key, curproc->p_zone));
226 }
227 
228 static flk_lockmgr_status_t
229 flk_get_lockmgr_status(void)
230 {
231 	struct flock_globals *fg;
232 
233 	ASSERT(MUTEX_HELD(&flock_lock));
234 
235 	if (flock_zone_key == ZONE_KEY_UNINITIALIZED) {
236 		/*
237 		 * KLM module not loaded; lock manager definitely not running.
238 		 */
239 		return (FLK_LOCKMGR_DOWN);
240 	}
241 	fg = flk_get_globals();
242 	return (fg->flk_lockmgr_status);
243 }
244 
245 /*
246  * Routine called from fs_frlock in fs/fs_subr.c
247  */
248 
249 int
250 reclock(vnode_t		*vp,
251 	flock64_t	*lckdat,
252 	int		cmd,
253 	int		flag,
254 	u_offset_t	offset,
255 	flk_callback_t	*flk_cbp)
256 {
257 	lock_descriptor_t	stack_lock_request;
258 	lock_descriptor_t	*lock_request;
259 	int error = 0;
260 	graph_t	*gp;
261 	int			nlmid;
262 
263 	/*
264 	 * Check access permissions
265 	 */
266 	if ((cmd & SETFLCK) &&
267 	    ((lckdat->l_type == F_RDLCK && (flag & FREAD) == 0) ||
268 	    (lckdat->l_type == F_WRLCK && (flag & FWRITE) == 0)))
269 			return (EBADF);
270 
271 	/*
272 	 * for query and unlock we use the stack_lock_request
273 	 */
274 
275 	if ((lckdat->l_type == F_UNLCK) ||
276 	    !((cmd & INOFLCK) || (cmd & SETFLCK))) {
277 		lock_request = &stack_lock_request;
278 		(void) bzero((caddr_t)lock_request,
279 		    sizeof (lock_descriptor_t));
280 
281 		/*
282 		 * following is added to make the assertions in
283 		 * flk_execute_request() to pass through
284 		 */
285 
286 		lock_request->l_edge.edge_in_next = &lock_request->l_edge;
287 		lock_request->l_edge.edge_in_prev = &lock_request->l_edge;
288 		lock_request->l_edge.edge_adj_next = &lock_request->l_edge;
289 		lock_request->l_edge.edge_adj_prev = &lock_request->l_edge;
290 		lock_request->l_status = FLK_INITIAL_STATE;
291 	} else {
292 		lock_request = flk_get_lock();
293 	}
294 	lock_request->l_state = 0;
295 	lock_request->l_vnode = vp;
296 	lock_request->l_zoneid = getzoneid();
297 
298 	/*
299 	 * Convert the request range into the canonical start and end
300 	 * values.  The NLM protocol supports locking over the entire
301 	 * 32-bit range, so there's no range checking for remote requests,
302 	 * but we still need to verify that local requests obey the rules.
303 	 */
304 	/* Clustering */
305 	if ((cmd & (RCMDLCK | PCMDLCK)) != 0) {
306 		ASSERT(lckdat->l_whence == 0);
307 		lock_request->l_start = lckdat->l_start;
308 		lock_request->l_end = (lckdat->l_len == 0) ? MAX_U_OFFSET_T :
309 		    lckdat->l_start + (lckdat->l_len - 1);
310 	} else {
311 		/* check the validity of the lock range */
312 		error = flk_convert_lock_data(vp, lckdat,
313 		    &lock_request->l_start, &lock_request->l_end,
314 		    offset);
315 		if (error) {
316 			goto done;
317 		}
318 		error = flk_check_lock_data(lock_request->l_start,
319 		    lock_request->l_end, MAXEND);
320 		if (error) {
321 			goto done;
322 		}
323 	}
324 
325 	ASSERT(lock_request->l_end >= lock_request->l_start);
326 
327 	lock_request->l_type = lckdat->l_type;
328 	if (cmd & INOFLCK)
329 		lock_request->l_state |= IO_LOCK;
330 	if (cmd & SLPFLCK)
331 		lock_request->l_state |= WILLING_TO_SLEEP_LOCK;
332 	if (cmd & RCMDLCK)
333 		lock_request->l_state |= LOCKMGR_LOCK;
334 	if (cmd & NBMLCK)
335 		lock_request->l_state |= NBMAND_LOCK;
336 	/*
337 	 * Clustering: set flag for PXFS locks
338 	 * We do not _only_ check for the PCMDLCK flag because PXFS locks could
339 	 * also be of type 'RCMDLCK'.
340 	 * We do not _only_ check the GETPXFSID() macro because local PXFS
341 	 * clients use a pxfsid of zero to permit deadlock detection in the LLM.
342 	 */
343 
344 	if ((cmd & PCMDLCK) || (GETPXFSID(lckdat->l_sysid) != 0)) {
345 		lock_request->l_state |= PXFS_LOCK;
346 	}
347 	if (!((cmd & SETFLCK) || (cmd & INOFLCK))) {
348 		if (lock_request->l_type == F_RDLCK ||
349 		    lock_request->l_type == F_WRLCK)
350 			lock_request->l_state |= QUERY_LOCK;
351 	}
352 	lock_request->l_flock = (*lckdat);
353 	lock_request->l_callbacks = flk_cbp;
354 
355 	/*
356 	 * We are ready for processing the request
357 	 */
358 	if (IS_LOCKMGR(lock_request)) {
359 		/*
360 		 * If the lock request is an NLM server request ....
361 		 */
362 		if (nlm_status_size == 0) { /* not booted as cluster */
363 			mutex_enter(&flock_lock);
364 			/*
365 			 * Bail out if this is a lock manager request and the
366 			 * lock manager is not supposed to be running.
367 			 */
368 			if (flk_get_lockmgr_status() != FLK_LOCKMGR_UP) {
369 				mutex_exit(&flock_lock);
370 				error = ENOLCK;
371 				goto done;
372 			}
373 			mutex_exit(&flock_lock);
374 		} else {			/* booted as a cluster */
375 			nlmid = GETNLMID(lock_request->l_flock.l_sysid);
376 			ASSERT(nlmid <= nlm_status_size && nlmid >= 0);
377 
378 			mutex_enter(&nlm_reg_lock);
379 			/*
380 			 * If the NLM registry does not know about this
381 			 * NLM server making the request, add its nlmid
382 			 * to the registry.
383 			 */
384 			if (FLK_REGISTRY_IS_NLM_UNKNOWN(nlm_reg_status,
385 			    nlmid)) {
386 				FLK_REGISTRY_ADD_NLMID(nlm_reg_status, nlmid);
387 			} else if (!FLK_REGISTRY_IS_NLM_UP(nlm_reg_status,
388 			    nlmid)) {
389 				/*
390 				 * If the NLM server is already known (has made
391 				 * previous lock requests) and its state is
392 				 * not NLM_UP (means that NLM server is
393 				 * shutting down), then bail out with an
394 				 * error to deny the lock request.
395 				 */
396 				mutex_exit(&nlm_reg_lock);
397 				error = ENOLCK;
398 				goto done;
399 			}
400 			mutex_exit(&nlm_reg_lock);
401 		}
402 	}
403 
404 	/* Now get the lock graph for a particular vnode */
405 	gp = flk_get_lock_graph(vp, FLK_INIT_GRAPH);
406 
407 	/*
408 	 * We drop rwlock here otherwise this might end up causing a
409 	 * deadlock if this IOLOCK sleeps. (bugid # 1183392).
410 	 */
411 
412 	if (IS_IO_LOCK(lock_request)) {
413 		VOP_RWUNLOCK(vp,
414 		    (lock_request->l_type == F_RDLCK) ?
415 		    V_WRITELOCK_FALSE : V_WRITELOCK_TRUE, NULL);
416 	}
417 	mutex_enter(&gp->gp_mutex);
418 
419 	lock_request->l_state |= REFERENCED_LOCK;
420 	lock_request->l_graph = gp;
421 
422 	switch (lock_request->l_type) {
423 	case F_RDLCK:
424 	case F_WRLCK:
425 		if (IS_QUERY_LOCK(lock_request)) {
426 			flk_get_first_blocking_lock(lock_request);
427 			(*lckdat) = lock_request->l_flock;
428 			break;
429 		}
430 
431 		/* process the request now */
432 
433 		error = flk_process_request(lock_request);
434 		break;
435 
436 	case F_UNLCK:
437 		/* unlock request will not block so execute it immediately */
438 
439 		if (IS_LOCKMGR(lock_request) &&
440 		    flk_canceled(lock_request)) {
441 			error = 0;
442 		} else {
443 			error = flk_execute_request(lock_request);
444 		}
445 		break;
446 
447 	case F_UNLKSYS:
448 		/*
449 		 * Recovery mechanism to release lock manager locks when
450 		 * NFS client crashes and restart. NFS server will clear
451 		 * old locks and grant new locks.
452 		 */
453 
454 		if (lock_request->l_flock.l_sysid == 0) {
455 			mutex_exit(&gp->gp_mutex);
456 			return (EINVAL);
457 		}
458 		if (secpolicy_nfs(CRED()) != 0) {
459 			mutex_exit(&gp->gp_mutex);
460 			return (EPERM);
461 		}
462 		flk_delete_locks_by_sysid(lock_request);
463 		lock_request->l_state &= ~REFERENCED_LOCK;
464 		flk_set_state(lock_request, FLK_DEAD_STATE);
465 		flk_free_lock(lock_request);
466 		mutex_exit(&gp->gp_mutex);
467 		return (0);
468 
469 	default:
470 		error = EINVAL;
471 		break;
472 	}
473 
474 	/* Clustering: For blocked PXFS locks, return */
475 	if (error == PXFS_LOCK_BLOCKED) {
476 		lock_request->l_state &= ~REFERENCED_LOCK;
477 		mutex_exit(&gp->gp_mutex);
478 		return (error);
479 	}
480 
481 	/*
482 	 * Now that we have seen the status of locks in the system for
483 	 * this vnode we acquire the rwlock if it is an IO_LOCK.
484 	 */
485 
486 	if (IS_IO_LOCK(lock_request)) {
487 		(void) VOP_RWLOCK(vp,
488 		    (lock_request->l_type == F_RDLCK) ?
489 		    V_WRITELOCK_FALSE : V_WRITELOCK_TRUE, NULL);
490 		if (!error) {
491 			lckdat->l_type = F_UNLCK;
492 
493 			/*
494 			 * This wake up is needed otherwise
495 			 * if IO_LOCK has slept the dependents on this
496 			 * will not be woken up at all. (bugid # 1185482).
497 			 */
498 
499 			flk_wakeup(lock_request, 1);
500 			flk_set_state(lock_request, FLK_DEAD_STATE);
501 			flk_free_lock(lock_request);
502 		}
503 		/*
504 		 * else if error had occurred either flk_process_request()
505 		 * has returned EDEADLK in which case there will be no
506 		 * dependents for this lock or EINTR from flk_wait_execute_
507 		 * request() in which case flk_cancel_sleeping_lock()
508 		 * would have been done. same is true with EBADF.
509 		 */
510 	}
511 
512 	if (lock_request == &stack_lock_request) {
513 		flk_set_state(lock_request, FLK_DEAD_STATE);
514 	} else {
515 		lock_request->l_state &= ~REFERENCED_LOCK;
516 		if ((error != 0) || IS_DELETED(lock_request)) {
517 			flk_set_state(lock_request, FLK_DEAD_STATE);
518 			flk_free_lock(lock_request);
519 		}
520 	}
521 
522 	mutex_exit(&gp->gp_mutex);
523 	return (error);
524 
525 done:
526 	flk_set_state(lock_request, FLK_DEAD_STATE);
527 	if (lock_request != &stack_lock_request)
528 		flk_free_lock(lock_request);
529 	return (error);
530 }
531 
532 /*
533  * Invoke the callbacks in the given list.  If before sleeping, invoke in
534  * list order.  If after sleeping, invoke in reverse order.
535  *
536  * CPR (suspend/resume) support: if one of the callbacks returns a
537  * callb_cpr_t, return it.   This will be used to make the thread CPR-safe
538  * while it is sleeping.  There should be at most one callb_cpr_t for the
539  * thread.
540  * XXX This is unnecessarily complicated.  The CPR information should just
541  * get passed in directly through VOP_FRLOCK and reclock, rather than
542  * sneaking it in via a callback.
543  */
544 
545 callb_cpr_t *
546 flk_invoke_callbacks(flk_callback_t *cblist, flk_cb_when_t when)
547 {
548 	callb_cpr_t *cpr_callbackp = NULL;
549 	callb_cpr_t *one_result;
550 	flk_callback_t *cb;
551 
552 	if (cblist == NULL)
553 		return (NULL);
554 
555 	if (when == FLK_BEFORE_SLEEP) {
556 		cb = cblist;
557 		do {
558 			one_result = (*cb->cb_callback)(when, cb->cb_data);
559 			if (one_result != NULL) {
560 				ASSERT(cpr_callbackp == NULL);
561 				cpr_callbackp = one_result;
562 			}
563 			cb = cb->cb_next;
564 		} while (cb != cblist);
565 	} else {
566 		cb = cblist->cb_prev;
567 		do {
568 			one_result = (*cb->cb_callback)(when, cb->cb_data);
569 			if (one_result != NULL) {
570 				cpr_callbackp = one_result;
571 			}
572 			cb = cb->cb_prev;
573 		} while (cb != cblist->cb_prev);
574 	}
575 
576 	return (cpr_callbackp);
577 }
578 
579 /*
580  * Initialize a flk_callback_t to hold the given callback.
581  */
582 
583 void
584 flk_init_callback(flk_callback_t *flk_cb,
585 	callb_cpr_t *(*cb_fcn)(flk_cb_when_t, void *), void *cbdata)
586 {
587 	flk_cb->cb_next = flk_cb;
588 	flk_cb->cb_prev = flk_cb;
589 	flk_cb->cb_callback = cb_fcn;
590 	flk_cb->cb_data = cbdata;
591 }
592 
593 /*
594  * Initialize an flk_callback_t and then link it into the head of an
595  * existing list (which may be NULL).
596  */
597 
598 void
599 flk_add_callback(flk_callback_t *newcb,
600 		callb_cpr_t *(*cb_fcn)(flk_cb_when_t, void *),
601 		void *cbdata, flk_callback_t *cblist)
602 {
603 	flk_init_callback(newcb, cb_fcn, cbdata);
604 
605 	if (cblist == NULL)
606 		return;
607 
608 	newcb->cb_prev = cblist->cb_prev;
609 	newcb->cb_next = cblist;
610 	cblist->cb_prev->cb_next = newcb;
611 	cblist->cb_prev = newcb;
612 }
613 
614 /*
615  * Initialize the flk_edge_cache data structure and create the
616  * nlm_reg_status array.
617  */
618 
619 void
620 flk_init(void)
621 {
622 	uint_t	i;
623 
624 	flk_edge_cache = kmem_cache_create("flk_edges",
625 	    sizeof (struct edge), 0, NULL, NULL, NULL, NULL, NULL, 0);
626 	if (flk_edge_cache == NULL) {
627 		cmn_err(CE_PANIC, "Couldn't create flk_edge_cache\n");
628 	}
629 	/*
630 	 * Create the NLM registry object.
631 	 */
632 
633 	if (cluster_bootflags & CLUSTER_BOOTED) {
634 		/*
635 		 * This routine tells you the maximum node id that will be used
636 		 * in the cluster.  This number will be the size of the nlm
637 		 * registry status array.  We add 1 because we will be using
638 		 * all entries indexed from 0 to maxnodeid; e.g., from 0
639 		 * to 64, for a total of 65 entries.
640 		 */
641 		nlm_status_size = clconf_maximum_nodeid() + 1;
642 	} else {
643 		nlm_status_size = 0;
644 	}
645 
646 	if (nlm_status_size != 0) {	/* booted as a cluster */
647 		nlm_reg_status = (flk_nlm_status_t *)
648 		    kmem_alloc(sizeof (flk_nlm_status_t) * nlm_status_size,
649 		    KM_SLEEP);
650 
651 		/* initialize all NLM states in array to NLM_UNKNOWN */
652 		for (i = 0; i < nlm_status_size; i++) {
653 			nlm_reg_status[i] = FLK_NLM_UNKNOWN;
654 		}
655 	}
656 }
657 
658 /*
659  * Zone constructor/destructor callbacks to be executed when a zone is
660  * created/destroyed.
661  */
662 /* ARGSUSED */
663 void *
664 flk_zone_init(zoneid_t zoneid)
665 {
666 	struct flock_globals *fg;
667 	uint_t i;
668 
669 	fg = kmem_alloc(sizeof (*fg), KM_SLEEP);
670 	fg->flk_lockmgr_status = FLK_LOCKMGR_UP;
671 	for (i = 0; i < HASH_SIZE; i++)
672 		fg->lockmgr_status[i] = FLK_LOCKMGR_UP;
673 	return (fg);
674 }
675 
676 /* ARGSUSED */
677 void
678 flk_zone_fini(zoneid_t zoneid, void *data)
679 {
680 	struct flock_globals *fg = data;
681 
682 	kmem_free(fg, sizeof (*fg));
683 }
684 
685 /*
686  * Get a lock_descriptor structure with initialization of edge lists.
687  */
688 
689 static lock_descriptor_t *
690 flk_get_lock(void)
691 {
692 	lock_descriptor_t	*l;
693 
694 	l = kmem_zalloc(sizeof (lock_descriptor_t), KM_SLEEP);
695 
696 	cv_init(&l->l_cv, NULL, CV_DRIVER, NULL);
697 	l->l_edge.edge_in_next = &l->l_edge;
698 	l->l_edge.edge_in_prev = &l->l_edge;
699 	l->l_edge.edge_adj_next = &l->l_edge;
700 	l->l_edge.edge_adj_prev = &l->l_edge;
701 	l->pvertex = -1;
702 	l->l_status = FLK_INITIAL_STATE;
703 	flk_lock_allocs++;
704 	return (l);
705 }
706 
707 /*
708  * Free a lock_descriptor structure. Just sets the DELETED_LOCK flag
709  * when some thread has a reference to it as in reclock().
710  */
711 
712 void
713 flk_free_lock(lock_descriptor_t	*lock)
714 {
715 	ASSERT(IS_DEAD(lock));
716 	if (IS_REFERENCED(lock)) {
717 		lock->l_state |= DELETED_LOCK;
718 		return;
719 	}
720 	flk_lock_frees++;
721 	kmem_free((void *)lock, sizeof (lock_descriptor_t));
722 }
723 
724 void
725 flk_set_state(lock_descriptor_t *lock, int new_state)
726 {
727 	/*
728 	 * Locks in the sleeping list may be woken up in a number of ways,
729 	 * and more than once.  If a sleeping lock is signaled awake more
730 	 * than once, then it may or may not change state depending on its
731 	 * current state.
732 	 * Also note that NLM locks that are sleeping could be moved to an
733 	 * interrupted state more than once if the unlock request is
734 	 * retransmitted by the NLM client - the second time around, this is
735 	 * just a nop.
736 	 * The ordering of being signaled awake is:
737 	 * INTERRUPTED_STATE > CANCELLED_STATE > GRANTED_STATE.
738 	 * The checks below implement this ordering.
739 	 */
740 	if (IS_INTERRUPTED(lock)) {
741 		if ((new_state == FLK_CANCELLED_STATE) ||
742 		    (new_state == FLK_GRANTED_STATE) ||
743 		    (new_state == FLK_INTERRUPTED_STATE)) {
744 			return;
745 		}
746 	}
747 	if (IS_CANCELLED(lock)) {
748 		if ((new_state == FLK_GRANTED_STATE) ||
749 		    (new_state == FLK_CANCELLED_STATE)) {
750 			return;
751 		}
752 	}
753 	CHECK_LOCK_TRANSITION(lock->l_status, new_state);
754 	if (IS_PXFS(lock)) {
755 		cl_flk_state_transition_notify(lock, lock->l_status, new_state);
756 	}
757 	lock->l_status = new_state;
758 }
759 
760 /*
761  * Routine that checks whether there are any blocking locks in the system.
762  *
763  * The policy followed is if a write lock is sleeping we don't allow read
764  * locks before this write lock even though there may not be any active
765  * locks corresponding to the read locks' region.
766  *
767  * flk_add_edge() function adds an edge between l1 and l2 iff there
768  * is no path between l1 and l2. This is done to have a "minimum
769  * storage representation" of the dependency graph.
770  *
771  * Another property of the graph is since only the new request throws
772  * edges to the existing locks in the graph, the graph is always topologically
773  * ordered.
774  */
775 
776 static int
777 flk_process_request(lock_descriptor_t *request)
778 {
779 	graph_t	*gp = request->l_graph;
780 	lock_descriptor_t *lock;
781 	int request_blocked_by_active = 0;
782 	int request_blocked_by_granted = 0;
783 	int request_blocked_by_sleeping = 0;
784 	vnode_t	*vp = request->l_vnode;
785 	int	error = 0;
786 	int request_will_wait = 0;
787 	int found_covering_lock = 0;
788 	lock_descriptor_t *covered_by = NULL;
789 
790 	ASSERT(MUTEX_HELD(&gp->gp_mutex));
791 	request_will_wait = IS_WILLING_TO_SLEEP(request);
792 
793 	/*
794 	 * check active locks
795 	 */
796 
797 	SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
798 
799 
800 	if (lock) {
801 		do {
802 			if (BLOCKS(lock, request)) {
803 				if (!request_will_wait)
804 					return (EAGAIN);
805 				request_blocked_by_active = 1;
806 				break;
807 			}
808 			/*
809 			 * Grant lock if it is for the same owner holding active
810 			 * lock that covers the request.
811 			 */
812 
813 			if (SAME_OWNER(lock, request) &&
814 			    COVERS(lock, request) &&
815 			    (request->l_type == F_RDLCK))
816 				return (flk_execute_request(request));
817 			lock = lock->l_next;
818 		} while (lock->l_vnode == vp);
819 	}
820 
821 	if (!request_blocked_by_active) {
822 			lock_descriptor_t *lk[1];
823 			lock_descriptor_t *first_glock = NULL;
824 		/*
825 		 * Shall we grant this?! NO!!
826 		 * What about those locks that were just granted and still
827 		 * in sleep queue. Those threads are woken up and so locks
828 		 * are almost active.
829 		 */
830 		SET_LOCK_TO_FIRST_SLEEP_VP(gp, lock, vp);
831 		if (lock) {
832 			do {
833 				if (BLOCKS(lock, request)) {
834 					if (IS_GRANTED(lock)) {
835 						request_blocked_by_granted = 1;
836 					} else {
837 						request_blocked_by_sleeping = 1;
838 					}
839 				}
840 
841 				lock = lock->l_next;
842 			} while ((lock->l_vnode == vp));
843 			first_glock = lock->l_prev;
844 			ASSERT(first_glock->l_vnode == vp);
845 		}
846 
847 		if (request_blocked_by_granted)
848 			goto block;
849 
850 		if (!request_blocked_by_sleeping) {
851 			/*
852 			 * If the request isn't going to be blocked by a
853 			 * sleeping request, we know that it isn't going to
854 			 * be blocked; we can just execute the request --
855 			 * without performing costly deadlock detection.
856 			 */
857 			ASSERT(!request_blocked_by_active);
858 			return (flk_execute_request(request));
859 		} else if (request->l_type == F_RDLCK) {
860 			/*
861 			 * If we have a sleeping writer in the requested
862 			 * lock's range, block.
863 			 */
864 			goto block;
865 		}
866 
867 		lk[0] = request;
868 		request->l_state |= RECOMPUTE_LOCK;
869 		SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
870 		if (lock) {
871 			do {
872 				flk_recompute_dependencies(lock, lk, 1, 0);
873 				lock = lock->l_next;
874 			} while (lock->l_vnode == vp);
875 		}
876 		lock = first_glock;
877 		if (lock) {
878 			do {
879 				if (IS_GRANTED(lock)) {
880 				flk_recompute_dependencies(lock, lk, 1, 0);
881 				}
882 				lock = lock->l_prev;
883 			} while ((lock->l_vnode == vp));
884 		}
885 		request->l_state &= ~RECOMPUTE_LOCK;
886 		if (!NO_DEPENDENTS(request) && flk_check_deadlock(request))
887 			return (EDEADLK);
888 		return (flk_execute_request(request));
889 	}
890 
891 block:
892 	if (request_will_wait)
893 		flk_graph_uncolor(gp);
894 
895 	/* check sleeping locks */
896 
897 	SET_LOCK_TO_FIRST_SLEEP_VP(gp, lock, vp);
898 
899 	/*
900 	 * If we find a sleeping write lock that is a superset of the
901 	 * region wanted by request we can be assured that by adding an
902 	 * edge to this write lock we have paths to all locks in the
903 	 * graph that blocks the request except in one case and that is why
904 	 * another check for SAME_OWNER in the loop below. The exception
905 	 * case is when this process that owns the sleeping write lock 'l1'
906 	 * has other locks l2, l3, l4 that are in the system and arrived
907 	 * before l1. l1 does not have path to these locks as they are from
908 	 * same process. We break when we find a second covering sleeping
909 	 * lock l5 owned by a process different from that owning l1, because
910 	 * there cannot be any of l2, l3, l4, etc., arrived before l5, and if
911 	 * it has l1 would have produced a deadlock already.
912 	 */
913 
914 	if (lock) {
915 		do {
916 			if (BLOCKS(lock, request)) {
917 				if (!request_will_wait)
918 					return (EAGAIN);
919 				if (COVERS(lock, request) &&
920 				    lock->l_type == F_WRLCK) {
921 					if (found_covering_lock &&
922 					    !SAME_OWNER(lock, covered_by)) {
923 						found_covering_lock++;
924 						break;
925 					}
926 					found_covering_lock = 1;
927 					covered_by = lock;
928 				}
929 				if (found_covering_lock &&
930 				    !SAME_OWNER(lock, covered_by)) {
931 					lock = lock->l_next;
932 					continue;
933 				}
934 				if ((error = flk_add_edge(request, lock,
935 				    !found_covering_lock, 0)))
936 					return (error);
937 			}
938 			lock = lock->l_next;
939 		} while (lock->l_vnode == vp);
940 	}
941 
942 /*
943  * found_covering_lock == 2 iff at this point 'request' has paths
944  * to all locks that blocks 'request'. found_covering_lock == 1 iff at this
945  * point 'request' has paths to all locks that blocks 'request' whose owners
946  * are not same as the one that covers 'request' (covered_by above) and
947  * we can have locks whose owner is same as covered_by in the active list.
948  */
949 
950 	if (request_blocked_by_active && found_covering_lock != 2) {
951 		SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
952 		ASSERT(lock != NULL);
953 		do {
954 			if (BLOCKS(lock, request)) {
955 				if (found_covering_lock &&
956 				    !SAME_OWNER(lock, covered_by)) {
957 					lock = lock->l_next;
958 					continue;
959 				}
960 				if ((error = flk_add_edge(request, lock,
961 				    CHECK_CYCLE, 0)))
962 					return (error);
963 			}
964 			lock = lock->l_next;
965 		} while (lock->l_vnode == vp);
966 	}
967 
968 	if (NOT_BLOCKED(request)) {
969 		/*
970 		 * request not dependent on any other locks
971 		 * so execute this request
972 		 */
973 		return (flk_execute_request(request));
974 	} else {
975 		/*
976 		 * check for deadlock
977 		 */
978 		if (flk_check_deadlock(request))
979 			return (EDEADLK);
980 		/*
981 		 * this thread has to sleep
982 		 */
983 		return (flk_wait_execute_request(request));
984 	}
985 }
986 
987 /*
988  * The actual execution of the request in the simple case is only to
989  * insert the 'request' in the list of active locks if it is not an
990  * UNLOCK.
991  * We have to consider the existing active locks' relation to
992  * this 'request' if they are owned by same process. flk_relation() does
993  * this job and sees to that the dependency graph information is maintained
994  * properly.
995  */
996 
997 int
998 flk_execute_request(lock_descriptor_t *request)
999 {
1000 	graph_t	*gp = request->l_graph;
1001 	vnode_t	*vp = request->l_vnode;
1002 	lock_descriptor_t	*lock, *lock1;
1003 	int done_searching = 0;
1004 
1005 	CHECK_SLEEPING_LOCKS(gp);
1006 	CHECK_ACTIVE_LOCKS(gp);
1007 
1008 	ASSERT(MUTEX_HELD(&gp->gp_mutex));
1009 
1010 	flk_set_state(request, FLK_START_STATE);
1011 
1012 	ASSERT(NOT_BLOCKED(request));
1013 
1014 	/* IO_LOCK requests are only to check status */
1015 
1016 	if (IS_IO_LOCK(request))
1017 		return (0);
1018 
1019 	SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
1020 
1021 	if (lock == NULL && request->l_type == F_UNLCK)
1022 		return (0);
1023 	if (lock == NULL) {
1024 		flk_insert_active_lock(request);
1025 		return (0);
1026 	}
1027 
1028 	do {
1029 		lock1 = lock->l_next;
1030 		if (SAME_OWNER(request, lock)) {
1031 			done_searching = flk_relation(lock, request);
1032 		}
1033 		lock = lock1;
1034 	} while (lock->l_vnode == vp && !done_searching);
1035 
1036 	/*
1037 	 * insert in active queue
1038 	 */
1039 
1040 	if (request->l_type != F_UNLCK)
1041 		flk_insert_active_lock(request);
1042 
1043 	return (0);
1044 }
1045 
1046 /*
1047  * 'request' is blocked by some one therefore we put it into sleep queue.
1048  */
1049 static int
1050 flk_wait_execute_request(lock_descriptor_t *request)
1051 {
1052 	graph_t	*gp = request->l_graph;
1053 	callb_cpr_t 	*cprp;		/* CPR info from callback */
1054 	struct flock_globals *fg;
1055 	int index;
1056 
1057 	ASSERT(MUTEX_HELD(&gp->gp_mutex));
1058 	ASSERT(IS_WILLING_TO_SLEEP(request));
1059 
1060 	flk_insert_sleeping_lock(request);
1061 
1062 	if (IS_LOCKMGR(request)) {
1063 		index = HASH_INDEX(request->l_vnode);
1064 		fg = flk_get_globals();
1065 
1066 		if (nlm_status_size == 0) {	/* not booted as a cluster */
1067 			if (fg->lockmgr_status[index] != FLK_LOCKMGR_UP) {
1068 				flk_cancel_sleeping_lock(request, 1);
1069 				return (ENOLCK);
1070 			}
1071 		} else {			/* booted as a cluster */
1072 			/*
1073 			 * If the request is an NLM server lock request,
1074 			 * and the NLM state of the lock request is not
1075 			 * NLM_UP (because the NLM server is shutting
1076 			 * down), then cancel the sleeping lock and
1077 			 * return error ENOLCK that will encourage the
1078 			 * client to retransmit.
1079 			 */
1080 			if (!IS_NLM_UP(request)) {
1081 				flk_cancel_sleeping_lock(request, 1);
1082 				return (ENOLCK);
1083 			}
1084 		}
1085 	}
1086 
1087 	/* Clustering: For blocking PXFS locks, return */
1088 	if (IS_PXFS(request)) {
1089 		/*
1090 		 * PXFS locks sleep on the client side.
1091 		 * The callback argument is used to wake up the sleeper
1092 		 * when the lock is granted.
1093 		 * We return -1 (rather than an errno value) to indicate
1094 		 * the client side should sleep
1095 		 */
1096 		return (PXFS_LOCK_BLOCKED);
1097 	}
1098 
1099 	if (request->l_callbacks != NULL) {
1100 		/*
1101 		 * To make sure the shutdown code works correctly, either
1102 		 * the callback must happen after putting the lock on the
1103 		 * sleep list, or we must check the shutdown status after
1104 		 * returning from the callback (and before sleeping).  At
1105 		 * least for now, we'll use the first option.  If a
1106 		 * shutdown or signal or whatever happened while the graph
1107 		 * mutex was dropped, that will be detected by
1108 		 * wait_for_lock().
1109 		 */
1110 		mutex_exit(&gp->gp_mutex);
1111 
1112 		cprp = flk_invoke_callbacks(request->l_callbacks,
1113 		    FLK_BEFORE_SLEEP);
1114 
1115 		mutex_enter(&gp->gp_mutex);
1116 
1117 		if (cprp == NULL) {
1118 			wait_for_lock(request);
1119 		} else {
1120 			mutex_enter(cprp->cc_lockp);
1121 			CALLB_CPR_SAFE_BEGIN(cprp);
1122 			mutex_exit(cprp->cc_lockp);
1123 			wait_for_lock(request);
1124 			mutex_enter(cprp->cc_lockp);
1125 			CALLB_CPR_SAFE_END(cprp, cprp->cc_lockp);
1126 			mutex_exit(cprp->cc_lockp);
1127 		}
1128 
1129 		mutex_exit(&gp->gp_mutex);
1130 		(void) flk_invoke_callbacks(request->l_callbacks,
1131 		    FLK_AFTER_SLEEP);
1132 		mutex_enter(&gp->gp_mutex);
1133 	} else {
1134 		wait_for_lock(request);
1135 	}
1136 
1137 	if (IS_LOCKMGR(request)) {
1138 		/*
1139 		 * If the lock manager is shutting down, return an
1140 		 * error that will encourage the client to retransmit.
1141 		 */
1142 		if (fg->lockmgr_status[index] != FLK_LOCKMGR_UP &&
1143 		    !IS_GRANTED(request)) {
1144 			flk_cancel_sleeping_lock(request, 1);
1145 			return (ENOLCK);
1146 		}
1147 	}
1148 
1149 	if (IS_INTERRUPTED(request)) {
1150 		/* we got a signal, or act like we did */
1151 		flk_cancel_sleeping_lock(request, 1);
1152 		return (EINTR);
1153 	}
1154 
1155 	/* Cancelled if some other thread has closed the file */
1156 
1157 	if (IS_CANCELLED(request)) {
1158 		flk_cancel_sleeping_lock(request, 1);
1159 		return (EBADF);
1160 	}
1161 
1162 	request->l_state &= ~GRANTED_LOCK;
1163 	REMOVE_SLEEP_QUEUE(request);
1164 	return (flk_execute_request(request));
1165 }
1166 
1167 /*
1168  * This routine adds an edge between from and to because from depends
1169  * to. If asked to check for deadlock it checks whether there are any
1170  * reachable locks from "from_lock" that is owned by the same process
1171  * as "from_lock".
1172  * NOTE: It is the caller's responsibility to make sure that the color
1173  * of the graph is consistent between the calls to flk_add_edge as done
1174  * in flk_process_request. This routine does not color and check for
1175  * deadlock explicitly.
1176  */
1177 
1178 static int
1179 flk_add_edge(lock_descriptor_t *from_lock, lock_descriptor_t *to_lock,
1180 			int check_cycle, int update_graph)
1181 {
1182 	edge_t	*edge;
1183 	edge_t	*ep;
1184 	lock_descriptor_t	*vertex;
1185 	lock_descriptor_t *vertex_stack;
1186 
1187 	STACK_INIT(vertex_stack);
1188 
1189 	/*
1190 	 * if to vertex already has mark_color just return
1191 	 * don't add an edge as it is reachable from from vertex
1192 	 * before itself.
1193 	 */
1194 
1195 	if (COLORED(to_lock))
1196 		return (0);
1197 
1198 	edge = flk_get_edge();
1199 
1200 	/*
1201 	 * set the from and to vertex
1202 	 */
1203 
1204 	edge->from_vertex = from_lock;
1205 	edge->to_vertex = to_lock;
1206 
1207 	/*
1208 	 * put in adjacency list of from vertex
1209 	 */
1210 
1211 	from_lock->l_edge.edge_adj_next->edge_adj_prev = edge;
1212 	edge->edge_adj_next = from_lock->l_edge.edge_adj_next;
1213 	edge->edge_adj_prev = &from_lock->l_edge;
1214 	from_lock->l_edge.edge_adj_next = edge;
1215 
1216 	/*
1217 	 * put in in list of to vertex
1218 	 */
1219 
1220 	to_lock->l_edge.edge_in_next->edge_in_prev = edge;
1221 	edge->edge_in_next = to_lock->l_edge.edge_in_next;
1222 	to_lock->l_edge.edge_in_next = edge;
1223 	edge->edge_in_prev = &to_lock->l_edge;
1224 
1225 
1226 	if (update_graph) {
1227 		flk_update_proc_graph(edge, 0);
1228 		return (0);
1229 	}
1230 	if (!check_cycle) {
1231 		return (0);
1232 	}
1233 
1234 	STACK_PUSH(vertex_stack, from_lock, l_stack);
1235 
1236 	while ((vertex = STACK_TOP(vertex_stack)) != NULL) {
1237 
1238 		STACK_POP(vertex_stack, l_stack);
1239 
1240 		for (ep = FIRST_ADJ(vertex);
1241 		    ep != HEAD(vertex);
1242 		    ep = NEXT_ADJ(ep)) {
1243 			if (COLORED(ep->to_vertex))
1244 				continue;
1245 			COLOR(ep->to_vertex);
1246 			if (SAME_OWNER(ep->to_vertex, from_lock))
1247 				goto dead_lock;
1248 			STACK_PUSH(vertex_stack, ep->to_vertex, l_stack);
1249 		}
1250 	}
1251 	return (0);
1252 
1253 dead_lock:
1254 
1255 	/*
1256 	 * remove all edges
1257 	 */
1258 
1259 	ep = FIRST_ADJ(from_lock);
1260 
1261 	while (ep != HEAD(from_lock)) {
1262 		IN_LIST_REMOVE(ep);
1263 		from_lock->l_sedge = NEXT_ADJ(ep);
1264 		ADJ_LIST_REMOVE(ep);
1265 		flk_free_edge(ep);
1266 		ep = from_lock->l_sedge;
1267 	}
1268 	return (EDEADLK);
1269 }
1270 
1271 /*
1272  * Get an edge structure for representing the dependency between two locks.
1273  */
1274 
1275 static edge_t *
1276 flk_get_edge()
1277 {
1278 	edge_t	*ep;
1279 
1280 	ASSERT(flk_edge_cache != NULL);
1281 
1282 	ep = kmem_cache_alloc(flk_edge_cache, KM_SLEEP);
1283 	edge_allocs++;
1284 	return (ep);
1285 }
1286 
1287 /*
1288  * Free the edge structure.
1289  */
1290 
1291 static void
1292 flk_free_edge(edge_t *ep)
1293 {
1294 	edge_frees++;
1295 	kmem_cache_free(flk_edge_cache, (void *)ep);
1296 }
1297 
1298 /*
1299  * Check the relationship of request with lock and perform the
1300  * recomputation of dependencies, break lock if required, and return
1301  * 1 if request cannot have any more relationship with the next
1302  * active locks.
1303  * The 'lock' and 'request' are compared and in case of overlap we
1304  * delete the 'lock' and form new locks to represent the non-overlapped
1305  * portion of original 'lock'. This function has side effects such as
1306  * 'lock' will be freed, new locks will be added to the active list.
1307  */
1308 
1309 static int
1310 flk_relation(lock_descriptor_t *lock, lock_descriptor_t *request)
1311 {
1312 	int lock_effect;
1313 	lock_descriptor_t *lock1, *lock2;
1314 	lock_descriptor_t *topology[3];
1315 	int nvertex = 0;
1316 	int i;
1317 	edge_t	*ep;
1318 	graph_t	*gp = (lock->l_graph);
1319 
1320 
1321 	CHECK_SLEEPING_LOCKS(gp);
1322 	CHECK_ACTIVE_LOCKS(gp);
1323 
1324 	ASSERT(MUTEX_HELD(&gp->gp_mutex));
1325 
1326 	topology[0] = topology[1] = topology[2] = NULL;
1327 
1328 	if (request->l_type == F_UNLCK)
1329 		lock_effect = FLK_UNLOCK;
1330 	else if (request->l_type == F_RDLCK &&
1331 	    lock->l_type == F_WRLCK)
1332 		lock_effect = FLK_DOWNGRADE;
1333 	else if (request->l_type == F_WRLCK &&
1334 	    lock->l_type == F_RDLCK)
1335 		lock_effect = FLK_UPGRADE;
1336 	else
1337 		lock_effect = FLK_STAY_SAME;
1338 
1339 	if (lock->l_end < request->l_start) {
1340 		if (lock->l_end == request->l_start - 1 &&
1341 		    lock_effect == FLK_STAY_SAME) {
1342 			topology[0] = request;
1343 			request->l_start = lock->l_start;
1344 			nvertex = 1;
1345 			goto recompute;
1346 		} else {
1347 			return (0);
1348 		}
1349 	}
1350 
1351 	if (lock->l_start > request->l_end) {
1352 		if (request->l_end == lock->l_start - 1 &&
1353 		    lock_effect == FLK_STAY_SAME) {
1354 			topology[0] = request;
1355 			request->l_end = lock->l_end;
1356 			nvertex = 1;
1357 			goto recompute;
1358 		} else {
1359 			return (1);
1360 		}
1361 	}
1362 
1363 	if (request->l_end < lock->l_end) {
1364 		if (request->l_start > lock->l_start) {
1365 			if (lock_effect == FLK_STAY_SAME) {
1366 				request->l_start = lock->l_start;
1367 				request->l_end = lock->l_end;
1368 				topology[0] = request;
1369 				nvertex = 1;
1370 			} else {
1371 				lock1 = flk_get_lock();
1372 				lock2 = flk_get_lock();
1373 				COPY(lock1, lock);
1374 				COPY(lock2, lock);
1375 				lock1->l_start = lock->l_start;
1376 				lock1->l_end = request->l_start - 1;
1377 				lock2->l_start = request->l_end + 1;
1378 				lock2->l_end = lock->l_end;
1379 				topology[0] = lock1;
1380 				topology[1] = lock2;
1381 				topology[2] = request;
1382 				nvertex = 3;
1383 			}
1384 		} else if (request->l_start < lock->l_start) {
1385 			if (lock_effect == FLK_STAY_SAME) {
1386 				request->l_end = lock->l_end;
1387 				topology[0] = request;
1388 				nvertex = 1;
1389 			} else {
1390 				lock1 = flk_get_lock();
1391 				COPY(lock1, lock);
1392 				lock1->l_start = request->l_end + 1;
1393 				topology[0] = lock1;
1394 				topology[1] = request;
1395 				nvertex = 2;
1396 			}
1397 		} else  {
1398 			if (lock_effect == FLK_STAY_SAME) {
1399 				request->l_start = lock->l_start;
1400 				request->l_end = lock->l_end;
1401 				topology[0] = request;
1402 				nvertex = 1;
1403 			} else {
1404 				lock1 = flk_get_lock();
1405 				COPY(lock1, lock);
1406 				lock1->l_start = request->l_end + 1;
1407 				topology[0] = lock1;
1408 				topology[1] = request;
1409 				nvertex = 2;
1410 			}
1411 		}
1412 	} else if (request->l_end > lock->l_end) {
1413 		if (request->l_start > lock->l_start)  {
1414 			if (lock_effect == FLK_STAY_SAME) {
1415 				request->l_start = lock->l_start;
1416 				topology[0] = request;
1417 				nvertex = 1;
1418 			} else {
1419 				lock1 = flk_get_lock();
1420 				COPY(lock1, lock);
1421 				lock1->l_end = request->l_start - 1;
1422 				topology[0] = lock1;
1423 				topology[1] = request;
1424 				nvertex = 2;
1425 			}
1426 		} else if (request->l_start < lock->l_start)  {
1427 			topology[0] = request;
1428 			nvertex = 1;
1429 		} else {
1430 			topology[0] = request;
1431 			nvertex = 1;
1432 		}
1433 	} else {
1434 		if (request->l_start > lock->l_start) {
1435 			if (lock_effect == FLK_STAY_SAME) {
1436 				request->l_start = lock->l_start;
1437 				topology[0] = request;
1438 				nvertex = 1;
1439 			} else {
1440 				lock1 = flk_get_lock();
1441 				COPY(lock1, lock);
1442 				lock1->l_end = request->l_start - 1;
1443 				topology[0] = lock1;
1444 				topology[1] = request;
1445 				nvertex = 2;
1446 			}
1447 		} else if (request->l_start < lock->l_start) {
1448 			topology[0] = request;
1449 			nvertex = 1;
1450 		} else {
1451 			if (lock_effect !=  FLK_UNLOCK) {
1452 				topology[0] = request;
1453 				nvertex = 1;
1454 			} else {
1455 				flk_delete_active_lock(lock, 0);
1456 				flk_wakeup(lock, 1);
1457 				flk_free_lock(lock);
1458 				CHECK_SLEEPING_LOCKS(gp);
1459 				CHECK_ACTIVE_LOCKS(gp);
1460 				return (1);
1461 			}
1462 		}
1463 	}
1464 
1465 recompute:
1466 
1467 	/*
1468 	 * For unlock we don't send the 'request' to for recomputing
1469 	 * dependencies because no lock will add an edge to this.
1470 	 */
1471 
1472 	if (lock_effect == FLK_UNLOCK) {
1473 		topology[nvertex-1] = NULL;
1474 		nvertex--;
1475 	}
1476 	for (i = 0; i < nvertex; i++) {
1477 		topology[i]->l_state |= RECOMPUTE_LOCK;
1478 		topology[i]->l_color = NO_COLOR;
1479 	}
1480 
1481 	ASSERT(FIRST_ADJ(lock) == HEAD(lock));
1482 
1483 	/*
1484 	 * we remove the adjacent edges for all vertices' to this vertex
1485 	 * 'lock'.
1486 	 */
1487 
1488 	ep = FIRST_IN(lock);
1489 	while (ep != HEAD(lock)) {
1490 		ADJ_LIST_REMOVE(ep);
1491 		ep = NEXT_IN(ep);
1492 	}
1493 
1494 	flk_delete_active_lock(lock, 0);
1495 
1496 	/* We are ready for recomputing the dependencies now */
1497 
1498 	flk_recompute_dependencies(lock, topology, nvertex, 1);
1499 
1500 	for (i = 0; i < nvertex; i++) {
1501 		topology[i]->l_state &= ~RECOMPUTE_LOCK;
1502 		topology[i]->l_color = NO_COLOR;
1503 	}
1504 
1505 
1506 	if (lock_effect == FLK_UNLOCK) {
1507 		nvertex++;
1508 	}
1509 	for (i = 0; i < nvertex - 1; i++) {
1510 		flk_insert_active_lock(topology[i]);
1511 	}
1512 
1513 
1514 	if (lock_effect == FLK_DOWNGRADE || lock_effect == FLK_UNLOCK) {
1515 		flk_wakeup(lock, 0);
1516 	} else {
1517 		ep = FIRST_IN(lock);
1518 		while (ep != HEAD(lock)) {
1519 			lock->l_sedge = NEXT_IN(ep);
1520 			IN_LIST_REMOVE(ep);
1521 			flk_update_proc_graph(ep, 1);
1522 			flk_free_edge(ep);
1523 			ep = lock->l_sedge;
1524 		}
1525 	}
1526 	flk_free_lock(lock);
1527 
1528 	CHECK_SLEEPING_LOCKS(gp);
1529 	CHECK_ACTIVE_LOCKS(gp);
1530 	return (0);
1531 }
1532 
1533 /*
1534  * Insert a lock into the active queue.
1535  */
1536 
1537 static void
1538 flk_insert_active_lock(lock_descriptor_t *new_lock)
1539 {
1540 	graph_t	*gp = new_lock->l_graph;
1541 	vnode_t	*vp = new_lock->l_vnode;
1542 	lock_descriptor_t *first_lock, *lock;
1543 
1544 	ASSERT(MUTEX_HELD(&gp->gp_mutex));
1545 
1546 	SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
1547 	first_lock = lock;
1548 
1549 	if (first_lock != NULL) {
1550 		for (; (lock->l_vnode == vp &&
1551 		    lock->l_start < new_lock->l_start); lock = lock->l_next)
1552 			;
1553 	} else {
1554 		lock = ACTIVE_HEAD(gp);
1555 	}
1556 
1557 	lock->l_prev->l_next = new_lock;
1558 	new_lock->l_next = lock;
1559 	new_lock->l_prev = lock->l_prev;
1560 	lock->l_prev = new_lock;
1561 
1562 	if (first_lock == NULL || (new_lock->l_start <= first_lock->l_start)) {
1563 		vp->v_filocks = (struct filock *)new_lock;
1564 	}
1565 	flk_set_state(new_lock, FLK_ACTIVE_STATE);
1566 	new_lock->l_state |= ACTIVE_LOCK;
1567 
1568 	CHECK_ACTIVE_LOCKS(gp);
1569 	CHECK_SLEEPING_LOCKS(gp);
1570 }
1571 
1572 /*
1573  * Delete the active lock : Performs two functions depending on the
1574  * value of second parameter. One is to remove from the active lists
1575  * only and other is to both remove and free the lock.
1576  */
1577 
1578 static void
1579 flk_delete_active_lock(lock_descriptor_t *lock, int free_lock)
1580 {
1581 	vnode_t *vp = lock->l_vnode;
1582 	graph_t	*gp = lock->l_graph;
1583 
1584 	ASSERT(MUTEX_HELD(&gp->gp_mutex));
1585 	if (free_lock)
1586 		ASSERT(NO_DEPENDENTS(lock));
1587 	ASSERT(NOT_BLOCKED(lock));
1588 	ASSERT(IS_ACTIVE(lock));
1589 
1590 	ASSERT((vp->v_filocks != NULL));
1591 
1592 	if (vp->v_filocks == (struct filock *)lock) {
1593 		vp->v_filocks = (struct filock *)
1594 		    ((lock->l_next->l_vnode == vp) ? lock->l_next :
1595 		    NULL);
1596 	}
1597 	lock->l_next->l_prev = lock->l_prev;
1598 	lock->l_prev->l_next = lock->l_next;
1599 	lock->l_next = lock->l_prev = NULL;
1600 	flk_set_state(lock, FLK_DEAD_STATE);
1601 	lock->l_state &= ~ACTIVE_LOCK;
1602 
1603 	if (free_lock)
1604 		flk_free_lock(lock);
1605 	CHECK_ACTIVE_LOCKS(gp);
1606 	CHECK_SLEEPING_LOCKS(gp);
1607 }
1608 
1609 /*
1610  * Insert into the sleep queue.
1611  */
1612 
1613 static void
1614 flk_insert_sleeping_lock(lock_descriptor_t *request)
1615 {
1616 	graph_t *gp = request->l_graph;
1617 	vnode_t	*vp = request->l_vnode;
1618 	lock_descriptor_t	*lock;
1619 
1620 	ASSERT(MUTEX_HELD(&gp->gp_mutex));
1621 	ASSERT(IS_INITIAL(request));
1622 
1623 	for (lock = gp->sleeping_locks.l_next; (lock != &gp->sleeping_locks &&
1624 	    lock->l_vnode < vp); lock = lock->l_next)
1625 		;
1626 
1627 	lock->l_prev->l_next = request;
1628 	request->l_prev = lock->l_prev;
1629 	lock->l_prev = request;
1630 	request->l_next = lock;
1631 	flk_set_state(request, FLK_SLEEPING_STATE);
1632 	request->l_state |= SLEEPING_LOCK;
1633 }
1634 
1635 /*
1636  * Cancelling a sleeping lock implies removing a vertex from the
1637  * dependency graph and therefore we should recompute the dependencies
1638  * of all vertices that have a path  to this vertex, w.r.t. all
1639  * vertices reachable from this vertex.
1640  */
1641 
1642 void
1643 flk_cancel_sleeping_lock(lock_descriptor_t *request, int remove_from_queue)
1644 {
1645 	graph_t	*gp = request->l_graph;
1646 	vnode_t *vp = request->l_vnode;
1647 	lock_descriptor_t **topology = NULL;
1648 	edge_t	*ep;
1649 	lock_descriptor_t *vertex, *lock;
1650 	int nvertex = 0;
1651 	int i;
1652 	lock_descriptor_t *vertex_stack;
1653 
1654 	STACK_INIT(vertex_stack);
1655 
1656 	ASSERT(MUTEX_HELD(&gp->gp_mutex));
1657 	/*
1658 	 * count number of vertex pointers that has to be allocated
1659 	 * All vertices that are reachable from request.
1660 	 */
1661 
1662 	STACK_PUSH(vertex_stack, request, l_stack);
1663 
1664 	while ((vertex = STACK_TOP(vertex_stack)) != NULL) {
1665 		STACK_POP(vertex_stack, l_stack);
1666 		for (ep = FIRST_ADJ(vertex); ep != HEAD(vertex);
1667 		    ep = NEXT_ADJ(ep)) {
1668 			if (IS_RECOMPUTE(ep->to_vertex))
1669 				continue;
1670 			ep->to_vertex->l_state |= RECOMPUTE_LOCK;
1671 			STACK_PUSH(vertex_stack, ep->to_vertex, l_stack);
1672 			nvertex++;
1673 		}
1674 	}
1675 
1676 	/*
1677 	 * allocate memory for holding the vertex pointers
1678 	 */
1679 
1680 	if (nvertex) {
1681 		topology = kmem_zalloc(nvertex * sizeof (lock_descriptor_t *),
1682 		    KM_SLEEP);
1683 	}
1684 
1685 	/*
1686 	 * one more pass to actually store the vertices in the
1687 	 * allocated array.
1688 	 * We first check sleeping locks and then active locks
1689 	 * so that topology array will be in a topological
1690 	 * order.
1691 	 */
1692 
1693 	nvertex = 0;
1694 	SET_LOCK_TO_FIRST_SLEEP_VP(gp, lock, vp);
1695 
1696 	if (lock) {
1697 		do {
1698 			if (IS_RECOMPUTE(lock)) {
1699 				lock->l_index = nvertex;
1700 				topology[nvertex++] = lock;
1701 			}
1702 			lock->l_color = NO_COLOR;
1703 			lock = lock->l_next;
1704 		} while (lock->l_vnode == vp);
1705 	}
1706 
1707 	SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
1708 
1709 	if (lock) {
1710 		do {
1711 			if (IS_RECOMPUTE(lock)) {
1712 				lock->l_index = nvertex;
1713 				topology[nvertex++] = lock;
1714 			}
1715 			lock->l_color = NO_COLOR;
1716 			lock = lock->l_next;
1717 		} while (lock->l_vnode == vp);
1718 	}
1719 
1720 	/*
1721 	 * remove in and out edges of request
1722 	 * They are freed after updating proc_graph below.
1723 	 */
1724 
1725 	for (ep = FIRST_IN(request); ep != HEAD(request); ep = NEXT_IN(ep)) {
1726 		ADJ_LIST_REMOVE(ep);
1727 	}
1728 
1729 
1730 	if (remove_from_queue)
1731 		REMOVE_SLEEP_QUEUE(request);
1732 
1733 	/* we are ready to recompute */
1734 
1735 	flk_recompute_dependencies(request, topology, nvertex, 1);
1736 
1737 	ep = FIRST_ADJ(request);
1738 	while (ep != HEAD(request)) {
1739 		IN_LIST_REMOVE(ep);
1740 		request->l_sedge = NEXT_ADJ(ep);
1741 		ADJ_LIST_REMOVE(ep);
1742 		flk_update_proc_graph(ep, 1);
1743 		flk_free_edge(ep);
1744 		ep = request->l_sedge;
1745 	}
1746 
1747 
1748 	/*
1749 	 * unset the RECOMPUTE flag in those vertices
1750 	 */
1751 
1752 	for (i = 0; i < nvertex; i++) {
1753 		topology[i]->l_state &= ~RECOMPUTE_LOCK;
1754 	}
1755 
1756 	/*
1757 	 * free the topology
1758 	 */
1759 	if (nvertex)
1760 		kmem_free((void *)topology,
1761 		    (nvertex * sizeof (lock_descriptor_t *)));
1762 	/*
1763 	 * Possibility of some locks unblocked now
1764 	 */
1765 
1766 	flk_wakeup(request, 0);
1767 
1768 	/*
1769 	 * we expect to have a correctly recomputed graph  now.
1770 	 */
1771 	flk_set_state(request, FLK_DEAD_STATE);
1772 	flk_free_lock(request);
1773 	CHECK_SLEEPING_LOCKS(gp);
1774 	CHECK_ACTIVE_LOCKS(gp);
1775 
1776 }
1777 
1778 /*
1779  * Uncoloring the graph is simply to increment the mark value of the graph
1780  * And only when wrap round takes place will we color all vertices in
1781  * the graph explicitly.
1782  */
1783 
1784 static void
1785 flk_graph_uncolor(graph_t *gp)
1786 {
1787 	lock_descriptor_t *lock;
1788 
1789 	if (gp->mark == UINT_MAX) {
1790 		gp->mark = 1;
1791 	for (lock = ACTIVE_HEAD(gp)->l_next; lock != ACTIVE_HEAD(gp);
1792 	    lock = lock->l_next)
1793 			lock->l_color  = 0;
1794 
1795 	for (lock = SLEEPING_HEAD(gp)->l_next; lock != SLEEPING_HEAD(gp);
1796 	    lock = lock->l_next)
1797 			lock->l_color  = 0;
1798 	} else {
1799 		gp->mark++;
1800 	}
1801 }
1802 
1803 /*
1804  * Wake up locks that are blocked on the given lock.
1805  */
1806 
1807 static void
1808 flk_wakeup(lock_descriptor_t *lock, int adj_list_remove)
1809 {
1810 	edge_t	*ep;
1811 	graph_t	*gp = lock->l_graph;
1812 	lock_descriptor_t	*lck;
1813 
1814 	ASSERT(MUTEX_HELD(&gp->gp_mutex));
1815 	if (NO_DEPENDENTS(lock))
1816 		return;
1817 	ep = FIRST_IN(lock);
1818 	do {
1819 		/*
1820 		 * delete the edge from the adjacency list
1821 		 * of from vertex. if no more adjacent edges
1822 		 * for this vertex wake this process.
1823 		 */
1824 		lck = ep->from_vertex;
1825 		if (adj_list_remove)
1826 			ADJ_LIST_REMOVE(ep);
1827 		flk_update_proc_graph(ep, 1);
1828 		if (NOT_BLOCKED(lck)) {
1829 			GRANT_WAKEUP(lck);
1830 		}
1831 		lock->l_sedge = NEXT_IN(ep);
1832 		IN_LIST_REMOVE(ep);
1833 		flk_free_edge(ep);
1834 		ep = lock->l_sedge;
1835 	} while (ep != HEAD(lock));
1836 	ASSERT(NO_DEPENDENTS(lock));
1837 }
1838 
1839 /*
1840  * The dependents of request, is checked for its dependency against the
1841  * locks in topology (called topology because the array is and should be in
1842  * topological order for this algorithm, if not in topological order the
1843  * inner loop below might add more edges than necessary. Topological ordering
1844  * of vertices satisfies the property that all edges will be from left to
1845  * right i.e., topology[i] can have an edge to  topology[j], iff i<j)
1846  * If lock l1 in the dependent set of request is dependent (blocked by)
1847  * on lock l2 in topology but does not have a path to it, we add an edge
1848  * in the inner loop below.
1849  *
1850  * We don't want to add an edge between l1 and l2 if there exists
1851  * already a path from l1 to l2, so care has to be taken for those vertices
1852  * that  have two paths to 'request'. These vertices are referred to here
1853  * as barrier locks.
1854  *
1855  * The barriers has to be found (those vertex that originally had two paths
1856  * to request) because otherwise we may end up adding edges unnecessarily
1857  * to vertices in topology, and thus barrier vertices can have an edge
1858  * to a vertex in topology as well a path to it.
1859  */
1860 
1861 static void
1862 flk_recompute_dependencies(lock_descriptor_t *request,
1863 		lock_descriptor_t **topology,
1864 			int nvertex, int update_graph)
1865 {
1866 	lock_descriptor_t *vertex, *lock;
1867 	graph_t	*gp = request->l_graph;
1868 	int i, count;
1869 	int barrier_found = 0;
1870 	edge_t	*ep;
1871 	lock_descriptor_t *vertex_stack;
1872 
1873 	STACK_INIT(vertex_stack);
1874 
1875 	ASSERT(MUTEX_HELD(&gp->gp_mutex));
1876 	if (nvertex == 0)
1877 		return;
1878 	flk_graph_uncolor(request->l_graph);
1879 	barrier_found = flk_find_barriers(request);
1880 	request->l_state |= RECOMPUTE_DONE;
1881 
1882 	STACK_PUSH(vertex_stack, request, l_stack);
1883 	request->l_sedge = FIRST_IN(request);
1884 
1885 
1886 	while ((vertex = STACK_TOP(vertex_stack)) != NULL) {
1887 		if (vertex->l_state & RECOMPUTE_DONE) {
1888 			count = 0;
1889 			goto next_in_edge;
1890 		}
1891 		if (IS_BARRIER(vertex)) {
1892 			/* decrement the barrier count */
1893 			if (vertex->l_index) {
1894 				vertex->l_index--;
1895 				/* this guy will be pushed again anyway ? */
1896 				STACK_POP(vertex_stack, l_stack);
1897 				if (vertex->l_index == 0)  {
1898 				/*
1899 				 * barrier is over we can recompute
1900 				 * dependencies for this lock in the
1901 				 * next stack pop
1902 				 */
1903 					vertex->l_state &= ~BARRIER_LOCK;
1904 				}
1905 				continue;
1906 			}
1907 		}
1908 		vertex->l_state |= RECOMPUTE_DONE;
1909 		flk_graph_uncolor(gp);
1910 		count = flk_color_reachables(vertex);
1911 		for (i = 0; i < nvertex; i++) {
1912 			lock = topology[i];
1913 			if (COLORED(lock))
1914 				continue;
1915 			if (BLOCKS(lock, vertex)) {
1916 				(void) flk_add_edge(vertex, lock,
1917 				    NO_CHECK_CYCLE, update_graph);
1918 				COLOR(lock);
1919 				count++;
1920 				count += flk_color_reachables(lock);
1921 			}
1922 
1923 		}
1924 
1925 next_in_edge:
1926 		if (count == nvertex ||
1927 		    vertex->l_sedge == HEAD(vertex)) {
1928 			/* prune the tree below this */
1929 			STACK_POP(vertex_stack, l_stack);
1930 			vertex->l_state &= ~RECOMPUTE_DONE;
1931 			/* update the barrier locks below this! */
1932 			if (vertex->l_sedge != HEAD(vertex) && barrier_found) {
1933 				flk_graph_uncolor(gp);
1934 				flk_update_barriers(vertex);
1935 			}
1936 			continue;
1937 		}
1938 
1939 		ep = vertex->l_sedge;
1940 		lock = ep->from_vertex;
1941 		STACK_PUSH(vertex_stack, lock, l_stack);
1942 		lock->l_sedge = FIRST_IN(lock);
1943 		vertex->l_sedge = NEXT_IN(ep);
1944 	}
1945 
1946 }
1947 
1948 /*
1949  * Color all reachable vertices from vertex that belongs to topology (here
1950  * those that have RECOMPUTE_LOCK set in their state) and yet uncolored.
1951  *
1952  * Note: we need to use a different stack_link l_stack1 because this is
1953  * called from flk_recompute_dependencies() that already uses a stack with
1954  * l_stack as stack_link.
1955  */
1956 
1957 static int
1958 flk_color_reachables(lock_descriptor_t *vertex)
1959 {
1960 	lock_descriptor_t *ver, *lock;
1961 	int count;
1962 	edge_t	*ep;
1963 	lock_descriptor_t *vertex_stack;
1964 
1965 	STACK_INIT(vertex_stack);
1966 
1967 	STACK_PUSH(vertex_stack, vertex, l_stack1);
1968 	count = 0;
1969 	while ((ver = STACK_TOP(vertex_stack)) != NULL) {
1970 
1971 		STACK_POP(vertex_stack, l_stack1);
1972 		for (ep = FIRST_ADJ(ver); ep != HEAD(ver);
1973 		    ep = NEXT_ADJ(ep)) {
1974 			lock = ep->to_vertex;
1975 			if (COLORED(lock))
1976 				continue;
1977 			COLOR(lock);
1978 			if (IS_RECOMPUTE(lock))
1979 				count++;
1980 			STACK_PUSH(vertex_stack, lock, l_stack1);
1981 		}
1982 
1983 	}
1984 	return (count);
1985 }
1986 
1987 /*
1988  * Called from flk_recompute_dependencies() this routine decrements
1989  * the barrier count of barrier vertices that are reachable from lock.
1990  */
1991 
1992 static void
1993 flk_update_barriers(lock_descriptor_t *lock)
1994 {
1995 	lock_descriptor_t *vertex, *lck;
1996 	edge_t	*ep;
1997 	lock_descriptor_t *vertex_stack;
1998 
1999 	STACK_INIT(vertex_stack);
2000 
2001 	STACK_PUSH(vertex_stack, lock, l_stack1);
2002 
2003 	while ((vertex = STACK_TOP(vertex_stack)) != NULL) {
2004 		STACK_POP(vertex_stack, l_stack1);
2005 		for (ep = FIRST_IN(vertex); ep != HEAD(vertex);
2006 		    ep = NEXT_IN(ep)) {
2007 			lck = ep->from_vertex;
2008 			if (COLORED(lck)) {
2009 				if (IS_BARRIER(lck)) {
2010 					ASSERT(lck->l_index > 0);
2011 					lck->l_index--;
2012 					if (lck->l_index == 0)
2013 						lck->l_state &= ~BARRIER_LOCK;
2014 				}
2015 				continue;
2016 			}
2017 			COLOR(lck);
2018 			if (IS_BARRIER(lck)) {
2019 				ASSERT(lck->l_index > 0);
2020 				lck->l_index--;
2021 				if (lck->l_index == 0)
2022 					lck->l_state &= ~BARRIER_LOCK;
2023 			}
2024 			STACK_PUSH(vertex_stack, lck, l_stack1);
2025 		}
2026 	}
2027 }
2028 
2029 /*
2030  * Finds all vertices that are reachable from 'lock' more than once and
2031  * mark them as barrier vertices and increment their barrier count.
2032  * The barrier count is one minus the total number of paths from lock
2033  * to that vertex.
2034  */
2035 
2036 static int
2037 flk_find_barriers(lock_descriptor_t *lock)
2038 {
2039 	lock_descriptor_t *vertex, *lck;
2040 	int found = 0;
2041 	edge_t	*ep;
2042 	lock_descriptor_t *vertex_stack;
2043 
2044 	STACK_INIT(vertex_stack);
2045 
2046 	STACK_PUSH(vertex_stack, lock, l_stack1);
2047 
2048 	while ((vertex = STACK_TOP(vertex_stack)) != NULL) {
2049 		STACK_POP(vertex_stack, l_stack1);
2050 		for (ep = FIRST_IN(vertex); ep != HEAD(vertex);
2051 		    ep = NEXT_IN(ep)) {
2052 			lck = ep->from_vertex;
2053 			if (COLORED(lck)) {
2054 				/* this is a barrier */
2055 				lck->l_state |= BARRIER_LOCK;
2056 				/* index will have barrier count */
2057 				lck->l_index++;
2058 				if (!found)
2059 					found = 1;
2060 				continue;
2061 			}
2062 			COLOR(lck);
2063 			lck->l_index = 0;
2064 			STACK_PUSH(vertex_stack, lck, l_stack1);
2065 		}
2066 	}
2067 	return (found);
2068 }
2069 
2070 /*
2071  * Finds the first lock that is mainly responsible for blocking this
2072  * request.  If there is no such lock, request->l_flock.l_type is set to
2073  * F_UNLCK.  Otherwise, request->l_flock is filled in with the particulars
2074  * of the blocking lock.
2075  *
2076  * Note: It is possible a request is blocked by a sleeping lock because
2077  * of the fairness policy used in flk_process_request() to construct the
2078  * dependencies. (see comments before flk_process_request()).
2079  */
2080 
2081 static void
2082 flk_get_first_blocking_lock(lock_descriptor_t *request)
2083 {
2084 	graph_t	*gp = request->l_graph;
2085 	vnode_t *vp = request->l_vnode;
2086 	lock_descriptor_t *lock, *blocker;
2087 
2088 	ASSERT(MUTEX_HELD(&gp->gp_mutex));
2089 	blocker = NULL;
2090 	SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
2091 
2092 	if (lock) {
2093 		do {
2094 			if (BLOCKS(lock, request)) {
2095 				blocker = lock;
2096 				break;
2097 			}
2098 			lock = lock->l_next;
2099 		} while (lock->l_vnode == vp);
2100 	}
2101 
2102 	if (blocker) {
2103 		report_blocker(blocker, request);
2104 	} else
2105 		request->l_flock.l_type = F_UNLCK;
2106 }
2107 
2108 /*
2109  * Get the graph_t structure associated with a vnode.
2110  * If 'initialize' is non-zero, and the graph_t structure for this vnode has
2111  * not yet been initialized, then a new element is allocated and returned.
2112  */
2113 graph_t *
2114 flk_get_lock_graph(vnode_t *vp, int initialize)
2115 {
2116 	graph_t *gp;
2117 	graph_t *gp_alloc = NULL;
2118 	int index = HASH_INDEX(vp);
2119 
2120 	if (initialize == FLK_USE_GRAPH) {
2121 		mutex_enter(&flock_lock);
2122 		gp = lock_graph[index];
2123 		mutex_exit(&flock_lock);
2124 		return (gp);
2125 	}
2126 
2127 	ASSERT(initialize == FLK_INIT_GRAPH);
2128 
2129 	if (lock_graph[index] == NULL) {
2130 
2131 		gp_alloc = kmem_zalloc(sizeof (graph_t), KM_SLEEP);
2132 
2133 		/* Initialize the graph */
2134 
2135 		gp_alloc->active_locks.l_next =
2136 		    gp_alloc->active_locks.l_prev =
2137 		    (lock_descriptor_t *)ACTIVE_HEAD(gp_alloc);
2138 		gp_alloc->sleeping_locks.l_next =
2139 		    gp_alloc->sleeping_locks.l_prev =
2140 		    (lock_descriptor_t *)SLEEPING_HEAD(gp_alloc);
2141 		gp_alloc->index = index;
2142 		mutex_init(&gp_alloc->gp_mutex, NULL, MUTEX_DEFAULT, NULL);
2143 	}
2144 
2145 	mutex_enter(&flock_lock);
2146 
2147 	gp = lock_graph[index];
2148 
2149 	/* Recheck the value within flock_lock */
2150 	if (gp == NULL) {
2151 		struct flock_globals *fg;
2152 
2153 		/* We must have previously allocated the graph_t structure */
2154 		ASSERT(gp_alloc != NULL);
2155 		lock_graph[index] = gp = gp_alloc;
2156 		/*
2157 		 * The lockmgr status is only needed if KLM is loaded.
2158 		 */
2159 		if (flock_zone_key != ZONE_KEY_UNINITIALIZED) {
2160 			fg = flk_get_globals();
2161 			fg->lockmgr_status[index] = fg->flk_lockmgr_status;
2162 		}
2163 	}
2164 
2165 	mutex_exit(&flock_lock);
2166 
2167 	if ((gp_alloc != NULL) && (gp != gp_alloc)) {
2168 		/* There was a race to allocate the graph_t and we lost */
2169 		mutex_destroy(&gp_alloc->gp_mutex);
2170 		kmem_free(gp_alloc, sizeof (graph_t));
2171 	}
2172 
2173 	return (gp);
2174 }
2175 
2176 /*
2177  * PSARC case 1997/292
2178  */
2179 int
2180 cl_flk_has_remote_locks_for_nlmid(vnode_t *vp, int nlmid)
2181 {
2182 	lock_descriptor_t *lock;
2183 	int result = 0;
2184 	graph_t *gp;
2185 	int			lock_nlmid;
2186 
2187 	/*
2188 	 * Check to see if node is booted as a cluster. If not, return.
2189 	 */
2190 	if ((cluster_bootflags & CLUSTER_BOOTED) == 0) {
2191 		return (0);
2192 	}
2193 
2194 	gp = flk_get_lock_graph(vp, FLK_USE_GRAPH);
2195 	if (gp == NULL) {
2196 		return (0);
2197 	}
2198 
2199 	mutex_enter(&gp->gp_mutex);
2200 
2201 	SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
2202 
2203 	if (lock) {
2204 		while (lock->l_vnode == vp) {
2205 			/* get NLM id from sysid */
2206 			lock_nlmid = GETNLMID(lock->l_flock.l_sysid);
2207 
2208 			/*
2209 			 * If NLM server request _and_ nlmid of lock matches
2210 			 * nlmid of argument, then we've found a remote lock.
2211 			 */
2212 			if (IS_LOCKMGR(lock) && nlmid == lock_nlmid) {
2213 				result = 1;
2214 				goto done;
2215 			}
2216 			lock = lock->l_next;
2217 		}
2218 	}
2219 
2220 	SET_LOCK_TO_FIRST_SLEEP_VP(gp, lock, vp);
2221 
2222 	if (lock) {
2223 		while (lock->l_vnode == vp) {
2224 			/* get NLM id from sysid */
2225 			lock_nlmid = GETNLMID(lock->l_flock.l_sysid);
2226 
2227 			/*
2228 			 * If NLM server request _and_ nlmid of lock matches
2229 			 * nlmid of argument, then we've found a remote lock.
2230 			 */
2231 			if (IS_LOCKMGR(lock) && nlmid == lock_nlmid) {
2232 				result = 1;
2233 				goto done;
2234 			}
2235 			lock = lock->l_next;
2236 		}
2237 	}
2238 
2239 done:
2240 	mutex_exit(&gp->gp_mutex);
2241 	return (result);
2242 }
2243 
2244 /*
2245  * Determine whether there are any locks for the given vnode with a remote
2246  * sysid.  Returns zero if not, non-zero if there are.
2247  *
2248  * Note that the return value from this function is potentially invalid
2249  * once it has been returned.  The caller is responsible for providing its
2250  * own synchronization mechanism to ensure that the return value is useful
2251  * (e.g., see nfs_lockcompletion()).
2252  */
2253 int
2254 flk_has_remote_locks(vnode_t *vp)
2255 {
2256 	lock_descriptor_t *lock;
2257 	int result = 0;
2258 	graph_t *gp;
2259 
2260 	gp = flk_get_lock_graph(vp, FLK_USE_GRAPH);
2261 	if (gp == NULL) {
2262 		return (0);
2263 	}
2264 
2265 	mutex_enter(&gp->gp_mutex);
2266 
2267 	SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
2268 
2269 	if (lock) {
2270 		while (lock->l_vnode == vp) {
2271 			if (IS_REMOTE(lock)) {
2272 				result = 1;
2273 				goto done;
2274 			}
2275 			lock = lock->l_next;
2276 		}
2277 	}
2278 
2279 	SET_LOCK_TO_FIRST_SLEEP_VP(gp, lock, vp);
2280 
2281 	if (lock) {
2282 		while (lock->l_vnode == vp) {
2283 			if (IS_REMOTE(lock)) {
2284 				result = 1;
2285 				goto done;
2286 			}
2287 			lock = lock->l_next;
2288 		}
2289 	}
2290 
2291 done:
2292 	mutex_exit(&gp->gp_mutex);
2293 	return (result);
2294 }
2295 
2296 /*
2297  * Determine whether there are any locks for the given vnode with a remote
2298  * sysid matching given sysid.
2299  * Used by the new (open source) NFS Lock Manager (NLM)
2300  */
2301 int
2302 flk_has_remote_locks_for_sysid(vnode_t *vp, int sysid)
2303 {
2304 	lock_descriptor_t *lock;
2305 	int result = 0;
2306 	graph_t *gp;
2307 
2308 	if (sysid == 0)
2309 		return (0);
2310 
2311 	gp = flk_get_lock_graph(vp, FLK_USE_GRAPH);
2312 	if (gp == NULL) {
2313 		return (0);
2314 	}
2315 
2316 	mutex_enter(&gp->gp_mutex);
2317 
2318 	SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
2319 
2320 	if (lock) {
2321 		while (lock->l_vnode == vp) {
2322 			if (lock->l_flock.l_sysid == sysid) {
2323 				result = 1;
2324 				goto done;
2325 			}
2326 			lock = lock->l_next;
2327 		}
2328 	}
2329 
2330 	SET_LOCK_TO_FIRST_SLEEP_VP(gp, lock, vp);
2331 
2332 	if (lock) {
2333 		while (lock->l_vnode == vp) {
2334 			if (lock->l_flock.l_sysid == sysid) {
2335 				result = 1;
2336 				goto done;
2337 			}
2338 			lock = lock->l_next;
2339 		}
2340 	}
2341 
2342 done:
2343 	mutex_exit(&gp->gp_mutex);
2344 	return (result);
2345 }
2346 
2347 /*
2348  * Determine if there are any locks owned by the given sysid.
2349  * Returns zero if not, non-zero if there are.  Note that this return code
2350  * could be derived from flk_get_{sleeping,active}_locks, but this routine
2351  * avoids all the memory allocations of those routines.
2352  *
2353  * This routine has the same synchronization issues as
2354  * flk_has_remote_locks.
2355  */
2356 
2357 int
2358 flk_sysid_has_locks(int sysid, int lck_type)
2359 {
2360 	int		has_locks = 0;
2361 	lock_descriptor_t	*lock;
2362 	graph_t 	*gp;
2363 	int		i;
2364 
2365 	for (i = 0; i < HASH_SIZE && !has_locks; i++) {
2366 		mutex_enter(&flock_lock);
2367 		gp = lock_graph[i];
2368 		mutex_exit(&flock_lock);
2369 		if (gp == NULL) {
2370 			continue;
2371 		}
2372 
2373 		mutex_enter(&gp->gp_mutex);
2374 
2375 		if (lck_type & FLK_QUERY_ACTIVE) {
2376 			for (lock = ACTIVE_HEAD(gp)->l_next;
2377 			    lock != ACTIVE_HEAD(gp) && !has_locks;
2378 			    lock = lock->l_next) {
2379 				if (lock->l_flock.l_sysid == sysid)
2380 					has_locks = 1;
2381 			}
2382 		}
2383 
2384 		if (lck_type & FLK_QUERY_SLEEPING) {
2385 			for (lock = SLEEPING_HEAD(gp)->l_next;
2386 			    lock != SLEEPING_HEAD(gp) && !has_locks;
2387 			    lock = lock->l_next) {
2388 				if (lock->l_flock.l_sysid == sysid)
2389 					has_locks = 1;
2390 			}
2391 		}
2392 		mutex_exit(&gp->gp_mutex);
2393 	}
2394 
2395 	return (has_locks);
2396 }
2397 
2398 
2399 /*
2400  * PSARC case 1997/292
2401  *
2402  * Requires: "sysid" is a pair [nlmid, sysid].  The lower half is 16-bit
2403  *  quantity, the real sysid generated by the NLM server; the upper half
2404  *  identifies the node of the cluster where the NLM server ran.
2405  *  This routine is only called by an NLM server running in a cluster.
2406  * Effects: Remove all locks held on behalf of the client identified
2407  *  by "sysid."
2408  */
2409 void
2410 cl_flk_remove_locks_by_sysid(int sysid)
2411 {
2412 	graph_t	*gp;
2413 	int i;
2414 	lock_descriptor_t *lock, *nlock;
2415 
2416 	/*
2417 	 * Check to see if node is booted as a cluster. If not, return.
2418 	 */
2419 	if ((cluster_bootflags & CLUSTER_BOOTED) == 0) {
2420 		return;
2421 	}
2422 
2423 	ASSERT(sysid != 0);
2424 	for (i = 0; i < HASH_SIZE; i++) {
2425 		mutex_enter(&flock_lock);
2426 		gp = lock_graph[i];
2427 		mutex_exit(&flock_lock);
2428 
2429 		if (gp == NULL)
2430 			continue;
2431 
2432 		mutex_enter(&gp->gp_mutex);	/*  get mutex on lock graph */
2433 
2434 		/* signal sleeping requests so that they bail out */
2435 		lock = SLEEPING_HEAD(gp)->l_next;
2436 		while (lock != SLEEPING_HEAD(gp)) {
2437 			nlock = lock->l_next;
2438 			if (lock->l_flock.l_sysid == sysid) {
2439 				INTERRUPT_WAKEUP(lock);
2440 			}
2441 			lock = nlock;
2442 		}
2443 
2444 		/* delete active locks */
2445 		lock = ACTIVE_HEAD(gp)->l_next;
2446 		while (lock != ACTIVE_HEAD(gp)) {
2447 			nlock = lock->l_next;
2448 			if (lock->l_flock.l_sysid == sysid) {
2449 				flk_delete_active_lock(lock, 0);
2450 				flk_wakeup(lock, 1);
2451 				flk_free_lock(lock);
2452 			}
2453 			lock = nlock;
2454 		}
2455 		mutex_exit(&gp->gp_mutex);    /* release mutex on lock graph */
2456 	}
2457 }
2458 
2459 /*
2460  * Delete all locks in the system that belongs to the sysid of the request.
2461  */
2462 
2463 static void
2464 flk_delete_locks_by_sysid(lock_descriptor_t *request)
2465 {
2466 	int	sysid  = request->l_flock.l_sysid;
2467 	lock_descriptor_t *lock, *nlock;
2468 	graph_t	*gp;
2469 	int i;
2470 
2471 	ASSERT(MUTEX_HELD(&request->l_graph->gp_mutex));
2472 	ASSERT(sysid != 0);
2473 
2474 	mutex_exit(&request->l_graph->gp_mutex);
2475 
2476 	for (i = 0; i < HASH_SIZE; i++) {
2477 		mutex_enter(&flock_lock);
2478 		gp = lock_graph[i];
2479 		mutex_exit(&flock_lock);
2480 
2481 		if (gp == NULL)
2482 			continue;
2483 
2484 		mutex_enter(&gp->gp_mutex);
2485 
2486 		/* signal sleeping requests so that they bail out */
2487 		lock = SLEEPING_HEAD(gp)->l_next;
2488 		while (lock != SLEEPING_HEAD(gp)) {
2489 			nlock = lock->l_next;
2490 			if (lock->l_flock.l_sysid == sysid) {
2491 				INTERRUPT_WAKEUP(lock);
2492 			}
2493 			lock = nlock;
2494 		}
2495 
2496 		/* delete active locks */
2497 		lock = ACTIVE_HEAD(gp)->l_next;
2498 		while (lock != ACTIVE_HEAD(gp)) {
2499 			nlock = lock->l_next;
2500 			if (lock->l_flock.l_sysid == sysid) {
2501 				flk_delete_active_lock(lock, 0);
2502 				flk_wakeup(lock, 1);
2503 				flk_free_lock(lock);
2504 			}
2505 			lock = nlock;
2506 		}
2507 		mutex_exit(&gp->gp_mutex);
2508 	}
2509 
2510 	mutex_enter(&request->l_graph->gp_mutex);
2511 }
2512 
2513 /*
2514  * Clustering: Deletes PXFS locks
2515  * Effects: Delete all locks on files in the given file system and with the
2516  *  given PXFS id.
2517  */
2518 void
2519 cl_flk_delete_pxfs_locks(struct vfs *vfsp, int pxfsid)
2520 {
2521 	lock_descriptor_t *lock, *nlock;
2522 	graph_t	*gp;
2523 	int i;
2524 
2525 	for (i = 0; i < HASH_SIZE; i++) {
2526 		mutex_enter(&flock_lock);
2527 		gp = lock_graph[i];
2528 		mutex_exit(&flock_lock);
2529 
2530 		if (gp == NULL)
2531 			continue;
2532 
2533 		mutex_enter(&gp->gp_mutex);
2534 
2535 		/* signal sleeping requests so that they bail out */
2536 		lock = SLEEPING_HEAD(gp)->l_next;
2537 		while (lock != SLEEPING_HEAD(gp)) {
2538 			nlock = lock->l_next;
2539 			if (lock->l_vnode->v_vfsp == vfsp) {
2540 				ASSERT(IS_PXFS(lock));
2541 				if (GETPXFSID(lock->l_flock.l_sysid) ==
2542 				    pxfsid) {
2543 					flk_set_state(lock,
2544 					    FLK_CANCELLED_STATE);
2545 					flk_cancel_sleeping_lock(lock, 1);
2546 				}
2547 			}
2548 			lock = nlock;
2549 		}
2550 
2551 		/* delete active locks */
2552 		lock = ACTIVE_HEAD(gp)->l_next;
2553 		while (lock != ACTIVE_HEAD(gp)) {
2554 			nlock = lock->l_next;
2555 			if (lock->l_vnode->v_vfsp == vfsp) {
2556 				ASSERT(IS_PXFS(lock));
2557 				if (GETPXFSID(lock->l_flock.l_sysid) ==
2558 				    pxfsid) {
2559 					flk_delete_active_lock(lock, 0);
2560 					flk_wakeup(lock, 1);
2561 					flk_free_lock(lock);
2562 				}
2563 			}
2564 			lock = nlock;
2565 		}
2566 		mutex_exit(&gp->gp_mutex);
2567 	}
2568 }
2569 
2570 /*
2571  * Search for a sleeping lock manager lock which matches exactly this lock
2572  * request; if one is found, fake a signal to cancel it.
2573  *
2574  * Return 1 if a matching lock was found, 0 otherwise.
2575  */
2576 
2577 static int
2578 flk_canceled(lock_descriptor_t *request)
2579 {
2580 	lock_descriptor_t *lock, *nlock;
2581 	graph_t *gp = request->l_graph;
2582 	vnode_t *vp = request->l_vnode;
2583 
2584 	ASSERT(MUTEX_HELD(&gp->gp_mutex));
2585 	ASSERT(IS_LOCKMGR(request));
2586 	SET_LOCK_TO_FIRST_SLEEP_VP(gp, lock, vp);
2587 
2588 	if (lock) {
2589 		while (lock->l_vnode == vp) {
2590 			nlock = lock->l_next;
2591 			if (SAME_OWNER(lock, request) &&
2592 			    lock->l_start == request->l_start &&
2593 			    lock->l_end == request->l_end) {
2594 				INTERRUPT_WAKEUP(lock);
2595 				return (1);
2596 			}
2597 			lock = nlock;
2598 		}
2599 	}
2600 	return (0);
2601 }
2602 
2603 /*
2604  * Remove all the locks for the vnode belonging to the given pid and sysid.
2605  */
2606 
2607 void
2608 cleanlocks(vnode_t *vp, pid_t pid, int sysid)
2609 {
2610 	graph_t	*gp;
2611 	lock_descriptor_t *lock, *nlock;
2612 	lock_descriptor_t *link_stack;
2613 
2614 	STACK_INIT(link_stack);
2615 
2616 	gp = flk_get_lock_graph(vp, FLK_USE_GRAPH);
2617 
2618 	if (gp == NULL)
2619 		return;
2620 	mutex_enter(&gp->gp_mutex);
2621 
2622 	CHECK_SLEEPING_LOCKS(gp);
2623 	CHECK_ACTIVE_LOCKS(gp);
2624 
2625 	SET_LOCK_TO_FIRST_SLEEP_VP(gp, lock, vp);
2626 
2627 	if (lock) {
2628 		do {
2629 			nlock = lock->l_next;
2630 			if ((lock->l_flock.l_pid == pid ||
2631 			    pid == IGN_PID) &&
2632 			    lock->l_flock.l_sysid == sysid) {
2633 				CANCEL_WAKEUP(lock);
2634 			}
2635 			lock = nlock;
2636 		} while (lock->l_vnode == vp);
2637 	}
2638 
2639 	SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
2640 
2641 	if (lock) {
2642 		do {
2643 			nlock = lock->l_next;
2644 			if ((lock->l_flock.l_pid == pid ||
2645 			    pid == IGN_PID) &&
2646 			    lock->l_flock.l_sysid == sysid) {
2647 				flk_delete_active_lock(lock, 0);
2648 				STACK_PUSH(link_stack, lock, l_stack);
2649 			}
2650 			lock = nlock;
2651 		} while (lock->l_vnode == vp);
2652 	}
2653 
2654 	while ((lock = STACK_TOP(link_stack)) != NULL) {
2655 		STACK_POP(link_stack, l_stack);
2656 		flk_wakeup(lock, 1);
2657 		flk_free_lock(lock);
2658 	}
2659 
2660 	CHECK_SLEEPING_LOCKS(gp);
2661 	CHECK_ACTIVE_LOCKS(gp);
2662 	CHECK_OWNER_LOCKS(gp, pid, sysid, vp);
2663 	mutex_exit(&gp->gp_mutex);
2664 }
2665 
2666 
2667 /*
2668  * Called from 'fs' read and write routines for files that have mandatory
2669  * locking enabled.
2670  */
2671 
2672 int
2673 chklock(
2674 	struct vnode	*vp,
2675 	int 		iomode,
2676 	u_offset_t	offset,
2677 	ssize_t		len,
2678 	int 		fmode,
2679 	caller_context_t *ct)
2680 {
2681 	register int	i;
2682 	struct flock64 	bf;
2683 	int 		error = 0;
2684 
2685 	bf.l_type = (iomode & FWRITE) ? F_WRLCK : F_RDLCK;
2686 	bf.l_whence = 0;
2687 	bf.l_start = offset;
2688 	bf.l_len = len;
2689 	if (ct == NULL) {
2690 		bf.l_pid = curproc->p_pid;
2691 		bf.l_sysid = 0;
2692 	} else {
2693 		bf.l_pid = ct->cc_pid;
2694 		bf.l_sysid = ct->cc_sysid;
2695 	}
2696 	i = (fmode & (FNDELAY|FNONBLOCK)) ? INOFLCK : INOFLCK|SLPFLCK;
2697 	if ((i = reclock(vp, &bf, i, 0, offset, NULL)) != 0 ||
2698 	    bf.l_type != F_UNLCK)
2699 		error = i ? i : EAGAIN;
2700 	return (error);
2701 }
2702 
2703 /*
2704  * convoff - converts the given data (start, whence) to the
2705  * given whence.
2706  */
2707 int
2708 convoff(vp, lckdat, whence, offset)
2709 	struct vnode 	*vp;
2710 	struct flock64 	*lckdat;
2711 	int 		whence;
2712 	offset_t	offset;
2713 {
2714 	int 		error;
2715 	struct vattr 	vattr;
2716 
2717 	if ((lckdat->l_whence == 2) || (whence == 2)) {
2718 		vattr.va_mask = AT_SIZE;
2719 		if (error = VOP_GETATTR(vp, &vattr, 0, CRED(), NULL))
2720 			return (error);
2721 	}
2722 
2723 	switch (lckdat->l_whence) {
2724 	case 1:
2725 		lckdat->l_start += offset;
2726 		break;
2727 	case 2:
2728 		lckdat->l_start += vattr.va_size;
2729 		/* FALLTHRU */
2730 	case 0:
2731 		break;
2732 	default:
2733 		return (EINVAL);
2734 	}
2735 
2736 	if (lckdat->l_start < 0)
2737 		return (EINVAL);
2738 
2739 	switch (whence) {
2740 	case 1:
2741 		lckdat->l_start -= offset;
2742 		break;
2743 	case 2:
2744 		lckdat->l_start -= vattr.va_size;
2745 		/* FALLTHRU */
2746 	case 0:
2747 		break;
2748 	default:
2749 		return (EINVAL);
2750 	}
2751 
2752 	lckdat->l_whence = (short)whence;
2753 	return (0);
2754 }
2755 
2756 
2757 /* 	proc_graph function definitions */
2758 
2759 /*
2760  * Function checks for deadlock due to the new 'lock'. If deadlock found
2761  * edges of this lock are freed and returned.
2762  */
2763 
2764 static int
2765 flk_check_deadlock(lock_descriptor_t *lock)
2766 {
2767 	proc_vertex_t	*start_vertex, *pvertex;
2768 	proc_vertex_t *dvertex;
2769 	proc_edge_t *pep, *ppep;
2770 	edge_t	*ep, *nep;
2771 	proc_vertex_t *process_stack;
2772 
2773 	STACK_INIT(process_stack);
2774 
2775 	mutex_enter(&flock_lock);
2776 	start_vertex = flk_get_proc_vertex(lock);
2777 	ASSERT(start_vertex != NULL);
2778 
2779 	/* construct the edges from this process to other processes */
2780 
2781 	ep = FIRST_ADJ(lock);
2782 	while (ep != HEAD(lock)) {
2783 		proc_vertex_t *adj_proc;
2784 
2785 		adj_proc = flk_get_proc_vertex(ep->to_vertex);
2786 		for (pep = start_vertex->edge; pep != NULL; pep = pep->next) {
2787 			if (pep->to_proc == adj_proc) {
2788 				ASSERT(pep->refcount);
2789 				pep->refcount++;
2790 				break;
2791 			}
2792 		}
2793 		if (pep == NULL) {
2794 			pep = flk_get_proc_edge();
2795 			pep->to_proc = adj_proc;
2796 			pep->refcount = 1;
2797 			adj_proc->incount++;
2798 			pep->next = start_vertex->edge;
2799 			start_vertex->edge = pep;
2800 		}
2801 		ep = NEXT_ADJ(ep);
2802 	}
2803 
2804 	ep = FIRST_IN(lock);
2805 
2806 	while (ep != HEAD(lock)) {
2807 		proc_vertex_t *in_proc;
2808 
2809 		in_proc = flk_get_proc_vertex(ep->from_vertex);
2810 
2811 		for (pep = in_proc->edge; pep != NULL; pep = pep->next) {
2812 			if (pep->to_proc == start_vertex) {
2813 				ASSERT(pep->refcount);
2814 				pep->refcount++;
2815 				break;
2816 			}
2817 		}
2818 		if (pep == NULL) {
2819 			pep = flk_get_proc_edge();
2820 			pep->to_proc = start_vertex;
2821 			pep->refcount = 1;
2822 			start_vertex->incount++;
2823 			pep->next = in_proc->edge;
2824 			in_proc->edge = pep;
2825 		}
2826 		ep = NEXT_IN(ep);
2827 	}
2828 
2829 	if (start_vertex->incount == 0) {
2830 		mutex_exit(&flock_lock);
2831 		return (0);
2832 	}
2833 
2834 	flk_proc_graph_uncolor();
2835 
2836 	start_vertex->p_sedge = start_vertex->edge;
2837 
2838 	STACK_PUSH(process_stack, start_vertex, p_stack);
2839 
2840 	while ((pvertex = STACK_TOP(process_stack)) != NULL) {
2841 		for (pep = pvertex->p_sedge; pep != NULL; pep = pep->next) {
2842 			dvertex = pep->to_proc;
2843 			if (!PROC_ARRIVED(dvertex)) {
2844 				STACK_PUSH(process_stack, dvertex, p_stack);
2845 				dvertex->p_sedge = dvertex->edge;
2846 				PROC_ARRIVE(pvertex);
2847 				pvertex->p_sedge = pep->next;
2848 				break;
2849 			}
2850 			if (!PROC_DEPARTED(dvertex))
2851 				goto deadlock;
2852 		}
2853 		if (pep == NULL) {
2854 			PROC_DEPART(pvertex);
2855 			STACK_POP(process_stack, p_stack);
2856 		}
2857 	}
2858 	mutex_exit(&flock_lock);
2859 	return (0);
2860 
2861 deadlock:
2862 
2863 	/* we remove all lock edges and proc edges */
2864 
2865 	ep = FIRST_ADJ(lock);
2866 	while (ep != HEAD(lock)) {
2867 		proc_vertex_t *adj_proc;
2868 		adj_proc = flk_get_proc_vertex(ep->to_vertex);
2869 		nep = NEXT_ADJ(ep);
2870 		IN_LIST_REMOVE(ep);
2871 		ADJ_LIST_REMOVE(ep);
2872 		flk_free_edge(ep);
2873 		ppep = start_vertex->edge;
2874 		for (pep = start_vertex->edge; pep != NULL; ppep = pep,
2875 		    pep = ppep->next) {
2876 			if (pep->to_proc == adj_proc) {
2877 				pep->refcount--;
2878 				if (pep->refcount == 0) {
2879 					if (pep == ppep) {
2880 						start_vertex->edge = pep->next;
2881 					} else {
2882 						ppep->next = pep->next;
2883 					}
2884 					adj_proc->incount--;
2885 					flk_proc_release(adj_proc);
2886 					flk_free_proc_edge(pep);
2887 				}
2888 				break;
2889 			}
2890 		}
2891 		ep = nep;
2892 	}
2893 	ep = FIRST_IN(lock);
2894 	while (ep != HEAD(lock)) {
2895 		proc_vertex_t *in_proc;
2896 		in_proc = flk_get_proc_vertex(ep->from_vertex);
2897 		nep = NEXT_IN(ep);
2898 		IN_LIST_REMOVE(ep);
2899 		ADJ_LIST_REMOVE(ep);
2900 		flk_free_edge(ep);
2901 		ppep = in_proc->edge;
2902 		for (pep = in_proc->edge; pep != NULL; ppep = pep,
2903 		    pep = ppep->next) {
2904 			if (pep->to_proc == start_vertex) {
2905 				pep->refcount--;
2906 				if (pep->refcount == 0) {
2907 					if (pep == ppep) {
2908 						in_proc->edge = pep->next;
2909 					} else {
2910 						ppep->next = pep->next;
2911 					}
2912 					start_vertex->incount--;
2913 					flk_proc_release(in_proc);
2914 					flk_free_proc_edge(pep);
2915 				}
2916 				break;
2917 			}
2918 		}
2919 		ep = nep;
2920 	}
2921 	flk_proc_release(start_vertex);
2922 	mutex_exit(&flock_lock);
2923 	return (1);
2924 }
2925 
2926 /*
2927  * Get a proc vertex. If lock's pvertex value gets a correct proc vertex
2928  * from the list we return that, otherwise we allocate one. If necessary,
2929  * we grow the list of vertices also.
2930  */
2931 
2932 static proc_vertex_t *
2933 flk_get_proc_vertex(lock_descriptor_t *lock)
2934 {
2935 	int i;
2936 	proc_vertex_t	*pv;
2937 	proc_vertex_t	**palloc;
2938 
2939 	ASSERT(MUTEX_HELD(&flock_lock));
2940 	if (lock->pvertex != -1) {
2941 		ASSERT(lock->pvertex >= 0);
2942 		pv = pgraph.proc[lock->pvertex];
2943 		if (pv != NULL && PROC_SAME_OWNER(lock, pv)) {
2944 			return (pv);
2945 		}
2946 	}
2947 	for (i = 0; i < pgraph.gcount; i++) {
2948 		pv = pgraph.proc[i];
2949 		if (pv != NULL && PROC_SAME_OWNER(lock, pv)) {
2950 			lock->pvertex = pv->index = i;
2951 			return (pv);
2952 		}
2953 	}
2954 	pv = kmem_zalloc(sizeof (struct proc_vertex), KM_SLEEP);
2955 	pv->pid = lock->l_flock.l_pid;
2956 	pv->sysid = lock->l_flock.l_sysid;
2957 	flk_proc_vertex_allocs++;
2958 	if (pgraph.free != 0) {
2959 		for (i = 0; i < pgraph.gcount; i++) {
2960 			if (pgraph.proc[i] == NULL) {
2961 				pgraph.proc[i] = pv;
2962 				lock->pvertex = pv->index = i;
2963 				pgraph.free--;
2964 				return (pv);
2965 			}
2966 		}
2967 	}
2968 	palloc = kmem_zalloc((pgraph.gcount + PROC_CHUNK) *
2969 	    sizeof (proc_vertex_t *), KM_SLEEP);
2970 
2971 	if (pgraph.proc) {
2972 		bcopy(pgraph.proc, palloc,
2973 		    pgraph.gcount * sizeof (proc_vertex_t *));
2974 
2975 		kmem_free(pgraph.proc,
2976 		    pgraph.gcount * sizeof (proc_vertex_t *));
2977 	}
2978 	pgraph.proc = palloc;
2979 	pgraph.free += (PROC_CHUNK - 1);
2980 	pv->index = lock->pvertex = pgraph.gcount;
2981 	pgraph.gcount += PROC_CHUNK;
2982 	pgraph.proc[pv->index] = pv;
2983 	return (pv);
2984 }
2985 
2986 /*
2987  * Allocate a proc edge.
2988  */
2989 
2990 static proc_edge_t *
2991 flk_get_proc_edge()
2992 {
2993 	proc_edge_t *pep;
2994 
2995 	pep = kmem_zalloc(sizeof (proc_edge_t), KM_SLEEP);
2996 	flk_proc_edge_allocs++;
2997 	return (pep);
2998 }
2999 
3000 /*
3001  * Free the proc edge. Called whenever its reference count goes to zero.
3002  */
3003 
3004 static void
3005 flk_free_proc_edge(proc_edge_t *pep)
3006 {
3007 	ASSERT(pep->refcount == 0);
3008 	kmem_free((void *)pep, sizeof (proc_edge_t));
3009 	flk_proc_edge_frees++;
3010 }
3011 
3012 /*
3013  * Color the graph explicitly done only when the mark value hits max value.
3014  */
3015 
3016 static void
3017 flk_proc_graph_uncolor()
3018 {
3019 	int i;
3020 
3021 	if (pgraph.mark == UINT_MAX) {
3022 		for (i = 0; i < pgraph.gcount; i++)
3023 			if (pgraph.proc[i] != NULL) {
3024 				pgraph.proc[i]->atime = 0;
3025 				pgraph.proc[i]->dtime = 0;
3026 			}
3027 		pgraph.mark = 1;
3028 	} else {
3029 		pgraph.mark++;
3030 	}
3031 }
3032 
3033 /*
3034  * Release the proc vertex iff both there are no in edges and out edges
3035  */
3036 
3037 static void
3038 flk_proc_release(proc_vertex_t *proc)
3039 {
3040 	ASSERT(MUTEX_HELD(&flock_lock));
3041 	if (proc->edge == NULL && proc->incount == 0) {
3042 		pgraph.proc[proc->index] = NULL;
3043 		pgraph.free++;
3044 		kmem_free(proc, sizeof (proc_vertex_t));
3045 		flk_proc_vertex_frees++;
3046 	}
3047 }
3048 
3049 /*
3050  * Updates process graph to reflect change in a lock_graph.
3051  * Note: We should call this function only after we have a correctly
3052  * recomputed lock graph. Otherwise we might miss a deadlock detection.
3053  * eg: in function flk_relation() we call this function after flk_recompute_
3054  * dependencies() otherwise if a process tries to lock a vnode hashed
3055  * into another graph it might sleep for ever.
3056  */
3057 
3058 static void
3059 flk_update_proc_graph(edge_t *ep, int delete)
3060 {
3061 	proc_vertex_t *toproc, *fromproc;
3062 	proc_edge_t *pep, *prevpep;
3063 
3064 	mutex_enter(&flock_lock);
3065 	toproc = flk_get_proc_vertex(ep->to_vertex);
3066 	fromproc = flk_get_proc_vertex(ep->from_vertex);
3067 
3068 	if (!delete)
3069 		goto add;
3070 	pep = prevpep = fromproc->edge;
3071 
3072 	ASSERT(pep != NULL);
3073 	while (pep != NULL) {
3074 		if (pep->to_proc == toproc) {
3075 			ASSERT(pep->refcount > 0);
3076 			pep->refcount--;
3077 			if (pep->refcount == 0) {
3078 				if (pep == prevpep) {
3079 					fromproc->edge = pep->next;
3080 				} else {
3081 					prevpep->next = pep->next;
3082 				}
3083 				toproc->incount--;
3084 				flk_proc_release(toproc);
3085 				flk_free_proc_edge(pep);
3086 			}
3087 			break;
3088 		}
3089 		prevpep = pep;
3090 		pep = pep->next;
3091 	}
3092 	flk_proc_release(fromproc);
3093 	mutex_exit(&flock_lock);
3094 	return;
3095 add:
3096 
3097 	pep = fromproc->edge;
3098 
3099 	while (pep != NULL) {
3100 		if (pep->to_proc == toproc) {
3101 			ASSERT(pep->refcount > 0);
3102 			pep->refcount++;
3103 			break;
3104 		}
3105 		pep = pep->next;
3106 	}
3107 	if (pep == NULL) {
3108 		pep = flk_get_proc_edge();
3109 		pep->to_proc = toproc;
3110 		pep->refcount = 1;
3111 		toproc->incount++;
3112 		pep->next = fromproc->edge;
3113 		fromproc->edge = pep;
3114 	}
3115 	mutex_exit(&flock_lock);
3116 }
3117 
3118 /*
3119  * Set the control status for lock manager requests.
3120  *
3121  */
3122 
3123 /*
3124  * PSARC case 1997/292
3125  *
3126  * Requires: "nlmid" must be >= 1 and <= clconf_maximum_nodeid().
3127  * Effects: Set the state of the NLM server identified by "nlmid"
3128  *   in the NLM registry to state "nlm_state."
3129  *   Raises exception no_such_nlm if "nlmid" doesn't identify a known
3130  *   NLM server to this LLM.
3131  *   Note that when this routine is called with NLM_SHUTTING_DOWN there
3132  *   may be locks requests that have gotten started but not finished.  In
3133  *   particular, there may be blocking requests that are in the callback code
3134  *   before sleeping (so they're not holding the lock for the graph).  If
3135  *   such a thread reacquires the graph's lock (to go to sleep) after
3136  *   NLM state in the NLM registry  is set to a non-up value,
3137  *   it will notice the status and bail out.  If the request gets
3138  *   granted before the thread can check the NLM registry, let it
3139  *   continue normally.  It will get flushed when we are called with NLM_DOWN.
3140  *
3141  * Modifies: nlm_reg_obj (global)
3142  * Arguments:
3143  *    nlmid	(IN):    id uniquely identifying an NLM server
3144  *    nlm_state (IN):    NLM server state to change "nlmid" to
3145  */
3146 void
3147 cl_flk_set_nlm_status(int nlmid, flk_nlm_status_t nlm_state)
3148 {
3149 	/*
3150 	 * Check to see if node is booted as a cluster. If not, return.
3151 	 */
3152 	if ((cluster_bootflags & CLUSTER_BOOTED) == 0) {
3153 		return;
3154 	}
3155 
3156 	/*
3157 	 * Check for development/debugging.  It is possible to boot a node
3158 	 * in non-cluster mode, and then run a special script, currently
3159 	 * available only to developers, to bring up the node as part of a
3160 	 * cluster.  The problem is that running such a script does not
3161 	 * result in the routine flk_init() being called and hence global array
3162 	 * nlm_reg_status is NULL.  The NLM thinks it's in cluster mode,
3163 	 * but the LLM needs to do an additional check to see if the global
3164 	 * array has been created or not. If nlm_reg_status is NULL, then
3165 	 * return, else continue.
3166 	 */
3167 	if (nlm_reg_status == NULL) {
3168 		return;
3169 	}
3170 
3171 	ASSERT(nlmid <= nlm_status_size && nlmid >= 0);
3172 	mutex_enter(&nlm_reg_lock);
3173 
3174 	if (FLK_REGISTRY_IS_NLM_UNKNOWN(nlm_reg_status, nlmid)) {
3175 		/*
3176 		 * If the NLM server "nlmid" is unknown in the NLM registry,
3177 		 * add it to the registry in the nlm shutting down state.
3178 		 */
3179 		FLK_REGISTRY_CHANGE_NLM_STATE(nlm_reg_status, nlmid,
3180 		    FLK_NLM_SHUTTING_DOWN);
3181 	} else {
3182 		/*
3183 		 * Change the state of the NLM server identified by "nlmid"
3184 		 * in the NLM registry to the argument "nlm_state."
3185 		 */
3186 		FLK_REGISTRY_CHANGE_NLM_STATE(nlm_reg_status, nlmid,
3187 		    nlm_state);
3188 	}
3189 
3190 	/*
3191 	 *  The reason we must register the NLM server that is shutting down
3192 	 *  with an LLM that doesn't already know about it (never sent a lock
3193 	 *  request) is to handle correctly a race between shutdown and a new
3194 	 *  lock request.  Suppose that a shutdown request from the NLM server
3195 	 *  invokes this routine at the LLM, and a thread is spawned to
3196 	 *  service the request. Now suppose a new lock request is in
3197 	 *  progress and has already passed the first line of defense in
3198 	 *  reclock(), which denies new locks requests from NLM servers
3199 	 *  that are not in the NLM_UP state.  After the current routine
3200 	 *  is invoked for both phases of shutdown, the routine will return,
3201 	 *  having done nothing, and the lock request will proceed and
3202 	 *  probably be granted.  The problem is that the shutdown was ignored
3203 	 *  by the lock request because there was no record of that NLM server
3204 	 *  shutting down.   We will be in the peculiar position of thinking
3205 	 *  that we've shutdown the NLM server and all locks at all LLMs have
3206 	 *  been discarded, but in fact there's still one lock held.
3207 	 *  The solution is to record the existence of NLM server and change
3208 	 *  its state immediately to NLM_SHUTTING_DOWN.  The lock request in
3209 	 *  progress may proceed because the next phase NLM_DOWN will catch
3210 	 *  this lock and discard it.
3211 	 */
3212 	mutex_exit(&nlm_reg_lock);
3213 
3214 	switch (nlm_state) {
3215 	case FLK_NLM_UP:
3216 		/*
3217 		 * Change the NLM state of all locks still held on behalf of
3218 		 * the NLM server identified by "nlmid" to NLM_UP.
3219 		 */
3220 		cl_flk_change_nlm_state_all_locks(nlmid, FLK_NLM_UP);
3221 		break;
3222 
3223 	case FLK_NLM_SHUTTING_DOWN:
3224 		/*
3225 		 * Wake up all sleeping locks for the NLM server identified
3226 		 * by "nlmid." Note that eventually all woken threads will
3227 		 * have their lock requests cancelled and descriptors
3228 		 * removed from the sleeping lock list.  Note that the NLM
3229 		 * server state associated with each lock descriptor is
3230 		 * changed to FLK_NLM_SHUTTING_DOWN.
3231 		 */
3232 		cl_flk_wakeup_sleeping_nlm_locks(nlmid);
3233 		break;
3234 
3235 	case FLK_NLM_DOWN:
3236 		/*
3237 		 * Discard all active, granted locks for this NLM server
3238 		 * identified by "nlmid."
3239 		 */
3240 		cl_flk_unlock_nlm_granted(nlmid);
3241 		break;
3242 
3243 	default:
3244 		panic("cl_set_nlm_status: bad status (%d)", nlm_state);
3245 	}
3246 }
3247 
3248 /*
3249  * Set the control status for lock manager requests.
3250  *
3251  * Note that when this routine is called with FLK_WAKEUP_SLEEPERS, there
3252  * may be locks requests that have gotten started but not finished.  In
3253  * particular, there may be blocking requests that are in the callback code
3254  * before sleeping (so they're not holding the lock for the graph).  If
3255  * such a thread reacquires the graph's lock (to go to sleep) after
3256  * flk_lockmgr_status is set to a non-up value, it will notice the status
3257  * and bail out.  If the request gets granted before the thread can check
3258  * flk_lockmgr_status, let it continue normally.  It will get flushed when
3259  * we are called with FLK_LOCKMGR_DOWN.
3260  */
3261 
3262 void
3263 flk_set_lockmgr_status(flk_lockmgr_status_t status)
3264 {
3265 	int i;
3266 	graph_t *gp;
3267 	struct flock_globals *fg;
3268 
3269 	fg = flk_get_globals();
3270 	ASSERT(fg != NULL);
3271 
3272 	mutex_enter(&flock_lock);
3273 	fg->flk_lockmgr_status = status;
3274 	mutex_exit(&flock_lock);
3275 
3276 	/*
3277 	 * If the lock manager is coming back up, all that's needed is to
3278 	 * propagate this information to the graphs.  If the lock manager
3279 	 * is going down, additional action is required, and each graph's
3280 	 * copy of the state is updated atomically with this other action.
3281 	 */
3282 	switch (status) {
3283 	case FLK_LOCKMGR_UP:
3284 		for (i = 0; i < HASH_SIZE; i++) {
3285 			mutex_enter(&flock_lock);
3286 			gp = lock_graph[i];
3287 			mutex_exit(&flock_lock);
3288 			if (gp == NULL)
3289 				continue;
3290 			mutex_enter(&gp->gp_mutex);
3291 			fg->lockmgr_status[i] = status;
3292 			mutex_exit(&gp->gp_mutex);
3293 		}
3294 		break;
3295 	case FLK_WAKEUP_SLEEPERS:
3296 		wakeup_sleeping_lockmgr_locks(fg);
3297 		break;
3298 	case FLK_LOCKMGR_DOWN:
3299 		unlock_lockmgr_granted(fg);
3300 		break;
3301 	default:
3302 		panic("flk_set_lockmgr_status: bad status (%d)", status);
3303 		break;
3304 	}
3305 }
3306 
3307 /*
3308  * This routine returns all the locks that are active or sleeping and are
3309  * associated with a particular set of identifiers.  If lock_state != 0, then
3310  * only locks that match the lock_state are returned. If lock_state == 0, then
3311  * all locks are returned. If pid == NOPID, the pid is ignored.  If
3312  * use_sysid is FALSE, then the sysid is ignored.  If vp is NULL, then the
3313  * vnode pointer is ignored.
3314  *
3315  * A list containing the vnode pointer and an flock structure
3316  * describing the lock is returned.  Each element in the list is
3317  * dynamically allocated and must be freed by the caller.  The
3318  * last item in the list is denoted by a NULL value in the ll_next
3319  * field.
3320  *
3321  * The vnode pointers returned are held.  The caller is responsible
3322  * for releasing these.  Note that the returned list is only a snapshot of
3323  * the current lock information, and that it is a snapshot of a moving
3324  * target (only one graph is locked at a time).
3325  */
3326 
3327 locklist_t *
3328 get_lock_list(int list_type, int lock_state, int sysid, boolean_t use_sysid,
3329 		pid_t pid, const vnode_t *vp, zoneid_t zoneid)
3330 {
3331 	lock_descriptor_t	*lock;
3332 	lock_descriptor_t	*graph_head;
3333 	locklist_t		listhead;
3334 	locklist_t		*llheadp;
3335 	locklist_t		*llp;
3336 	locklist_t		*lltp;
3337 	graph_t			*gp;
3338 	int			i;
3339 	int			first_index; /* graph index */
3340 	int			num_indexes; /* graph index */
3341 
3342 	ASSERT((list_type == FLK_ACTIVE_STATE) ||
3343 	    (list_type == FLK_SLEEPING_STATE));
3344 
3345 	/*
3346 	 * Get a pointer to something to use as a list head while building
3347 	 * the rest of the list.
3348 	 */
3349 	llheadp = &listhead;
3350 	lltp = llheadp;
3351 	llheadp->ll_next = (locklist_t *)NULL;
3352 
3353 	/* Figure out which graphs we want to look at. */
3354 	if (vp == NULL) {
3355 		first_index = 0;
3356 		num_indexes = HASH_SIZE;
3357 	} else {
3358 		first_index = HASH_INDEX(vp);
3359 		num_indexes = 1;
3360 	}
3361 
3362 	for (i = first_index; i < first_index + num_indexes; i++) {
3363 		mutex_enter(&flock_lock);
3364 		gp = lock_graph[i];
3365 		mutex_exit(&flock_lock);
3366 		if (gp == NULL) {
3367 			continue;
3368 		}
3369 
3370 		mutex_enter(&gp->gp_mutex);
3371 		graph_head = (list_type == FLK_ACTIVE_STATE) ?
3372 		    ACTIVE_HEAD(gp) : SLEEPING_HEAD(gp);
3373 		for (lock = graph_head->l_next;
3374 		    lock != graph_head;
3375 		    lock = lock->l_next) {
3376 			if (use_sysid && lock->l_flock.l_sysid != sysid)
3377 				continue;
3378 			if (pid != NOPID && lock->l_flock.l_pid != pid)
3379 				continue;
3380 			if (vp != NULL && lock->l_vnode != vp)
3381 				continue;
3382 			if (lock_state && !(lock_state & lock->l_state))
3383 				continue;
3384 			if (zoneid != lock->l_zoneid && zoneid != ALL_ZONES)
3385 				continue;
3386 			/*
3387 			 * A matching lock was found.  Allocate
3388 			 * space for a new locklist entry and fill
3389 			 * it in.
3390 			 */
3391 			llp = kmem_alloc(sizeof (locklist_t), KM_SLEEP);
3392 			lltp->ll_next = llp;
3393 			VN_HOLD(lock->l_vnode);
3394 			llp->ll_vp = lock->l_vnode;
3395 			create_flock(lock, &(llp->ll_flock));
3396 			llp->ll_next = (locklist_t *)NULL;
3397 			lltp = llp;
3398 		}
3399 		mutex_exit(&gp->gp_mutex);
3400 	}
3401 
3402 	llp = llheadp->ll_next;
3403 	return (llp);
3404 }
3405 
3406 /*
3407  * These two functions are simply interfaces to get_lock_list.  They return
3408  * a list of sleeping or active locks for the given sysid and pid.  See
3409  * get_lock_list for details.
3410  *
3411  * In either case we don't particularly care to specify the zone of interest;
3412  * the sysid-space is global across zones, so the sysid will map to exactly one
3413  * zone, and we'll return information for that zone.
3414  */
3415 
3416 locklist_t *
3417 flk_get_sleeping_locks(int sysid, pid_t pid)
3418 {
3419 	return (get_lock_list(FLK_SLEEPING_STATE, 0, sysid, B_TRUE, pid, NULL,
3420 	    ALL_ZONES));
3421 }
3422 
3423 locklist_t *
3424 flk_get_active_locks(int sysid, pid_t pid)
3425 {
3426 	return (get_lock_list(FLK_ACTIVE_STATE, 0, sysid, B_TRUE, pid, NULL,
3427 	    ALL_ZONES));
3428 }
3429 
3430 /*
3431  * Another interface to get_lock_list.  This one returns all the active
3432  * locks for a given vnode.  Again, see get_lock_list for details.
3433  *
3434  * We don't need to specify which zone's locks we're interested in.  The matter
3435  * would only be interesting if the vnode belonged to NFS, and NFS vnodes can't
3436  * be used by multiple zones, so the list of locks will all be from the right
3437  * zone.
3438  */
3439 
3440 locklist_t *
3441 flk_active_locks_for_vp(const vnode_t *vp)
3442 {
3443 	return (get_lock_list(FLK_ACTIVE_STATE, 0, 0, B_FALSE, NOPID, vp,
3444 	    ALL_ZONES));
3445 }
3446 
3447 /*
3448  * Another interface to get_lock_list.  This one returns all the active
3449  * nbmand locks for a given vnode.  Again, see get_lock_list for details.
3450  *
3451  * See the comment for flk_active_locks_for_vp() for why we don't care to
3452  * specify the particular zone of interest.
3453  */
3454 locklist_t *
3455 flk_active_nbmand_locks_for_vp(const vnode_t *vp)
3456 {
3457 	return (get_lock_list(FLK_ACTIVE_STATE, NBMAND_LOCK, 0, B_FALSE,
3458 	    NOPID, vp, ALL_ZONES));
3459 }
3460 
3461 /*
3462  * Another interface to get_lock_list.  This one returns all the active
3463  * nbmand locks for a given pid.  Again, see get_lock_list for details.
3464  *
3465  * The zone doesn't need to be specified here; the locks held by a
3466  * particular process will either be local (ie, non-NFS) or from the zone
3467  * the process is executing in.  This is because other parts of the system
3468  * ensure that an NFS vnode can't be used in a zone other than that in
3469  * which it was opened.
3470  */
3471 locklist_t *
3472 flk_active_nbmand_locks(pid_t pid)
3473 {
3474 	return (get_lock_list(FLK_ACTIVE_STATE, NBMAND_LOCK, 0, B_FALSE,
3475 	    pid, NULL, ALL_ZONES));
3476 }
3477 
3478 /*
3479  * Free up all entries in the locklist.
3480  */
3481 void
3482 flk_free_locklist(locklist_t *llp)
3483 {
3484 	locklist_t *next_llp;
3485 
3486 	while (llp) {
3487 		next_llp = llp->ll_next;
3488 		VN_RELE(llp->ll_vp);
3489 		kmem_free(llp, sizeof (*llp));
3490 		llp = next_llp;
3491 	}
3492 }
3493 
3494 static void
3495 cl_flk_change_nlm_state_all_locks(int nlmid, flk_nlm_status_t nlm_state)
3496 {
3497 	/*
3498 	 * For each graph "lg" in the hash table lock_graph do
3499 	 * a.  Get the list of sleeping locks
3500 	 * b.  For each lock descriptor in the list do
3501 	 *	i.   If the requested lock is an NLM server request AND
3502 	 *		the nlmid is the same as the routine argument then
3503 	 *		change the lock descriptor's state field to
3504 	 *		"nlm_state."
3505 	 * c.  Get the list of active locks
3506 	 * d.  For each lock descriptor in the list do
3507 	 *	i.   If the requested lock is an NLM server request AND
3508 	 *		the nlmid is the same as the routine argument then
3509 	 *		change the lock descriptor's state field to
3510 	 *		"nlm_state."
3511 	 */
3512 
3513 	int			i;
3514 	graph_t			*gp;			/* lock graph */
3515 	lock_descriptor_t	*lock;			/* lock */
3516 	lock_descriptor_t	*nlock = NULL;		/* next lock */
3517 	int			lock_nlmid;
3518 
3519 	for (i = 0; i < HASH_SIZE; i++) {
3520 		mutex_enter(&flock_lock);
3521 		gp = lock_graph[i];
3522 		mutex_exit(&flock_lock);
3523 		if (gp == NULL) {
3524 			continue;
3525 		}
3526 
3527 		/* Get list of sleeping locks in current lock graph. */
3528 		mutex_enter(&gp->gp_mutex);
3529 		for (lock = SLEEPING_HEAD(gp)->l_next;
3530 		    lock != SLEEPING_HEAD(gp);
3531 		    lock = nlock) {
3532 			nlock = lock->l_next;
3533 			/* get NLM id */
3534 			lock_nlmid = GETNLMID(lock->l_flock.l_sysid);
3535 
3536 			/*
3537 			 * If NLM server request AND nlmid of lock matches
3538 			 * nlmid of argument, then set the NLM state of the
3539 			 * lock to "nlm_state."
3540 			 */
3541 			if (IS_LOCKMGR(lock) && nlmid == lock_nlmid) {
3542 				SET_NLM_STATE(lock, nlm_state);
3543 			}
3544 		}
3545 
3546 		/* Get list of active locks in current lock graph. */
3547 		for (lock = ACTIVE_HEAD(gp)->l_next;
3548 		    lock != ACTIVE_HEAD(gp);
3549 		    lock = nlock) {
3550 			nlock = lock->l_next;
3551 			/* get NLM id */
3552 			lock_nlmid = GETNLMID(lock->l_flock.l_sysid);
3553 
3554 			/*
3555 			 * If NLM server request AND nlmid of lock matches
3556 			 * nlmid of argument, then set the NLM state of the
3557 			 * lock to "nlm_state."
3558 			 */
3559 			if (IS_LOCKMGR(lock) && nlmid == lock_nlmid) {
3560 				ASSERT(IS_ACTIVE(lock));
3561 				SET_NLM_STATE(lock, nlm_state);
3562 			}
3563 		}
3564 		mutex_exit(&gp->gp_mutex);
3565 	}
3566 }
3567 
3568 /*
3569  * Requires: "nlmid" >= 1 and <= clconf_maximum_nodeid().
3570  * Effects: Find all sleeping lock manager requests _only_ for the NLM server
3571  *   identified by "nlmid." Poke those lock requests.
3572  */
3573 static void
3574 cl_flk_wakeup_sleeping_nlm_locks(int nlmid)
3575 {
3576 	lock_descriptor_t *lock;
3577 	lock_descriptor_t *nlock = NULL; /* next lock */
3578 	int i;
3579 	graph_t *gp;
3580 	int	lock_nlmid;
3581 
3582 	for (i = 0; i < HASH_SIZE; i++) {
3583 		mutex_enter(&flock_lock);
3584 		gp = lock_graph[i];
3585 		mutex_exit(&flock_lock);
3586 		if (gp == NULL) {
3587 			continue;
3588 		}
3589 
3590 		mutex_enter(&gp->gp_mutex);
3591 		for (lock = SLEEPING_HEAD(gp)->l_next;
3592 		    lock != SLEEPING_HEAD(gp);
3593 		    lock = nlock) {
3594 			nlock = lock->l_next;
3595 			/*
3596 			 * If NLM server request _and_ nlmid of lock matches
3597 			 * nlmid of argument, then set the NLM state of the
3598 			 * lock to NLM_SHUTTING_DOWN, and wake up sleeping
3599 			 * request.
3600 			 */
3601 			if (IS_LOCKMGR(lock)) {
3602 				/* get NLM id */
3603 				lock_nlmid =
3604 				    GETNLMID(lock->l_flock.l_sysid);
3605 				if (nlmid == lock_nlmid) {
3606 					SET_NLM_STATE(lock,
3607 					    FLK_NLM_SHUTTING_DOWN);
3608 					INTERRUPT_WAKEUP(lock);
3609 				}
3610 			}
3611 		}
3612 		mutex_exit(&gp->gp_mutex);
3613 	}
3614 }
3615 
3616 /*
3617  * Requires: "nlmid" >= 1 and <= clconf_maximum_nodeid()
3618  * Effects:  Find all active (granted) lock manager locks _only_ for the
3619  *   NLM server identified by "nlmid" and release them.
3620  */
3621 static void
3622 cl_flk_unlock_nlm_granted(int nlmid)
3623 {
3624 	lock_descriptor_t *lock;
3625 	lock_descriptor_t *nlock = NULL; /* next lock */
3626 	int i;
3627 	graph_t *gp;
3628 	int	lock_nlmid;
3629 
3630 	for (i = 0; i < HASH_SIZE; i++) {
3631 		mutex_enter(&flock_lock);
3632 		gp = lock_graph[i];
3633 		mutex_exit(&flock_lock);
3634 		if (gp == NULL) {
3635 			continue;
3636 		}
3637 
3638 		mutex_enter(&gp->gp_mutex);
3639 		for (lock = ACTIVE_HEAD(gp)->l_next;
3640 		    lock != ACTIVE_HEAD(gp);
3641 		    lock = nlock) {
3642 			nlock = lock->l_next;
3643 			ASSERT(IS_ACTIVE(lock));
3644 
3645 			/*
3646 			 * If it's an  NLM server request _and_ nlmid of
3647 			 * the lock matches nlmid of argument, then
3648 			 * remove the active lock the list, wakup blocked
3649 			 * threads, and free the storage for the lock.
3650 			 * Note that there's no need to mark the NLM state
3651 			 * of this lock to NLM_DOWN because the lock will
3652 			 * be deleted anyway and its storage freed.
3653 			 */
3654 			if (IS_LOCKMGR(lock)) {
3655 				/* get NLM id */
3656 				lock_nlmid = GETNLMID(lock->l_flock.l_sysid);
3657 				if (nlmid == lock_nlmid) {
3658 					flk_delete_active_lock(lock, 0);
3659 					flk_wakeup(lock, 1);
3660 					flk_free_lock(lock);
3661 				}
3662 			}
3663 		}
3664 		mutex_exit(&gp->gp_mutex);
3665 	}
3666 }
3667 
3668 /*
3669  * Find all sleeping lock manager requests and poke them.
3670  */
3671 static void
3672 wakeup_sleeping_lockmgr_locks(struct flock_globals *fg)
3673 {
3674 	lock_descriptor_t *lock;
3675 	lock_descriptor_t *nlock = NULL; /* next lock */
3676 	int i;
3677 	graph_t *gp;
3678 	zoneid_t zoneid = getzoneid();
3679 
3680 	for (i = 0; i < HASH_SIZE; i++) {
3681 		mutex_enter(&flock_lock);
3682 		gp = lock_graph[i];
3683 		mutex_exit(&flock_lock);
3684 		if (gp == NULL) {
3685 			continue;
3686 		}
3687 
3688 		mutex_enter(&gp->gp_mutex);
3689 		fg->lockmgr_status[i] = FLK_WAKEUP_SLEEPERS;
3690 		for (lock = SLEEPING_HEAD(gp)->l_next;
3691 		    lock != SLEEPING_HEAD(gp);
3692 		    lock = nlock) {
3693 			nlock = lock->l_next;
3694 			if (IS_LOCKMGR(lock) && lock->l_zoneid == zoneid) {
3695 				INTERRUPT_WAKEUP(lock);
3696 			}
3697 		}
3698 		mutex_exit(&gp->gp_mutex);
3699 	}
3700 }
3701 
3702 
3703 /*
3704  * Find all active (granted) lock manager locks and release them.
3705  */
3706 static void
3707 unlock_lockmgr_granted(struct flock_globals *fg)
3708 {
3709 	lock_descriptor_t *lock;
3710 	lock_descriptor_t *nlock = NULL; /* next lock */
3711 	int i;
3712 	graph_t *gp;
3713 	zoneid_t zoneid = getzoneid();
3714 
3715 	for (i = 0; i < HASH_SIZE; i++) {
3716 		mutex_enter(&flock_lock);
3717 		gp = lock_graph[i];
3718 		mutex_exit(&flock_lock);
3719 		if (gp == NULL) {
3720 			continue;
3721 		}
3722 
3723 		mutex_enter(&gp->gp_mutex);
3724 		fg->lockmgr_status[i] = FLK_LOCKMGR_DOWN;
3725 		for (lock = ACTIVE_HEAD(gp)->l_next;
3726 		    lock != ACTIVE_HEAD(gp);
3727 		    lock = nlock) {
3728 			nlock = lock->l_next;
3729 			if (IS_LOCKMGR(lock) && lock->l_zoneid == zoneid) {
3730 				ASSERT(IS_ACTIVE(lock));
3731 				flk_delete_active_lock(lock, 0);
3732 				flk_wakeup(lock, 1);
3733 				flk_free_lock(lock);
3734 			}
3735 		}
3736 		mutex_exit(&gp->gp_mutex);
3737 	}
3738 }
3739 
3740 
3741 /*
3742  * Wait until a lock is granted, cancelled, or interrupted.
3743  */
3744 
3745 static void
3746 wait_for_lock(lock_descriptor_t *request)
3747 {
3748 	graph_t *gp = request->l_graph;
3749 
3750 	ASSERT(MUTEX_HELD(&gp->gp_mutex));
3751 
3752 	while (!(IS_GRANTED(request)) && !(IS_CANCELLED(request)) &&
3753 	    !(IS_INTERRUPTED(request))) {
3754 		if (!cv_wait_sig(&request->l_cv, &gp->gp_mutex)) {
3755 			flk_set_state(request, FLK_INTERRUPTED_STATE);
3756 			request->l_state |= INTERRUPTED_LOCK;
3757 		}
3758 	}
3759 }
3760 
3761 /*
3762  * Create an flock structure from the existing lock information
3763  *
3764  * This routine is used to create flock structures for the lock manager
3765  * to use in a reclaim request.  Since the lock was originated on this
3766  * host, it must be conforming to UNIX semantics, so no checking is
3767  * done to make sure it falls within the lower half of the 32-bit range.
3768  */
3769 
3770 static void
3771 create_flock(lock_descriptor_t *lp, flock64_t *flp)
3772 {
3773 	ASSERT(lp->l_end == MAX_U_OFFSET_T || lp->l_end <= MAXEND);
3774 	ASSERT(lp->l_end >= lp->l_start);
3775 
3776 	flp->l_type = lp->l_type;
3777 	flp->l_whence = 0;
3778 	flp->l_start = lp->l_start;
3779 	flp->l_len = (lp->l_end == MAX_U_OFFSET_T) ? 0 :
3780 	    (lp->l_end - lp->l_start + 1);
3781 	flp->l_sysid = lp->l_flock.l_sysid;
3782 	flp->l_pid = lp->l_flock.l_pid;
3783 }
3784 
3785 /*
3786  * Convert flock_t data describing a lock range into unsigned long starting
3787  * and ending points, which are put into lock_request.  Returns 0 or an
3788  * errno value.
3789  * Large Files: max is passed by the caller and we return EOVERFLOW
3790  * as defined by LFS API.
3791  */
3792 
3793 int
3794 flk_convert_lock_data(vnode_t *vp, flock64_t *flp,
3795     u_offset_t *start, u_offset_t *end, offset_t offset)
3796 {
3797 	struct vattr	vattr;
3798 	int	error;
3799 
3800 	/*
3801 	 * Determine the starting point of the request
3802 	 */
3803 	switch (flp->l_whence) {
3804 	case 0:		/* SEEK_SET */
3805 		*start = (u_offset_t)flp->l_start;
3806 		break;
3807 	case 1:		/* SEEK_CUR */
3808 		*start = (u_offset_t)(flp->l_start + offset);
3809 		break;
3810 	case 2:		/* SEEK_END */
3811 		vattr.va_mask = AT_SIZE;
3812 		if (error = VOP_GETATTR(vp, &vattr, 0, CRED(), NULL))
3813 			return (error);
3814 		*start = (u_offset_t)(flp->l_start + vattr.va_size);
3815 		break;
3816 	default:
3817 		return (EINVAL);
3818 	}
3819 
3820 	/*
3821 	 * Determine the range covered by the request.
3822 	 */
3823 	if (flp->l_len == 0)
3824 		*end = MAX_U_OFFSET_T;
3825 	else if ((offset_t)flp->l_len > 0) {
3826 		*end = (u_offset_t)(*start + (flp->l_len - 1));
3827 	} else {
3828 		/*
3829 		 * Negative length; why do we even allow this ?
3830 		 * Because this allows easy specification of
3831 		 * the last n bytes of the file.
3832 		 */
3833 		*end = *start;
3834 		*start += (u_offset_t)flp->l_len;
3835 		(*start)++;
3836 	}
3837 	return (0);
3838 }
3839 
3840 /*
3841  * Check the validity of lock data.  This can used by the NFS
3842  * frlock routines to check data before contacting the server.  The
3843  * server must support semantics that aren't as restrictive as
3844  * the UNIX API, so the NFS client is required to check.
3845  * The maximum is now passed in by the caller.
3846  */
3847 
3848 int
3849 flk_check_lock_data(u_offset_t start, u_offset_t end, offset_t max)
3850 {
3851 	/*
3852 	 * The end (length) for local locking should never be greater
3853 	 * than MAXEND. However, the representation for
3854 	 * the entire file is MAX_U_OFFSET_T.
3855 	 */
3856 	if ((start > max) ||
3857 	    ((end > max) && (end != MAX_U_OFFSET_T))) {
3858 		return (EINVAL);
3859 	}
3860 	if (start > end) {
3861 		return (EINVAL);
3862 	}
3863 	return (0);
3864 }
3865 
3866 /*
3867  * Fill in request->l_flock with information about the lock blocking the
3868  * request.  The complexity here is that lock manager requests are allowed
3869  * to see into the upper part of the 32-bit address range, whereas local
3870  * requests are only allowed to see signed values.
3871  *
3872  * What should be done when "blocker" is a lock manager lock that uses the
3873  * upper portion of the 32-bit range, but "request" is local?  Since the
3874  * request has already been determined to have been blocked by the blocker,
3875  * at least some portion of "blocker" must be in the range of the request,
3876  * or the request extends to the end of file.  For the first case, the
3877  * portion in the lower range is returned with the indication that it goes
3878  * "to EOF."  For the second case, the last byte of the lower range is
3879  * returned with the indication that it goes "to EOF."
3880  */
3881 
3882 static void
3883 report_blocker(lock_descriptor_t *blocker, lock_descriptor_t *request)
3884 {
3885 	flock64_t *flrp;			/* l_flock portion of request */
3886 
3887 	ASSERT(blocker != NULL);
3888 
3889 	flrp = &request->l_flock;
3890 	flrp->l_whence = 0;
3891 	flrp->l_type = blocker->l_type;
3892 	flrp->l_pid = blocker->l_flock.l_pid;
3893 	flrp->l_sysid = blocker->l_flock.l_sysid;
3894 
3895 	if (IS_LOCKMGR(request)) {
3896 		flrp->l_start = blocker->l_start;
3897 		if (blocker->l_end == MAX_U_OFFSET_T)
3898 			flrp->l_len = 0;
3899 		else
3900 			flrp->l_len = blocker->l_end - blocker->l_start + 1;
3901 	} else {
3902 		if (blocker->l_start > MAXEND) {
3903 			flrp->l_start = MAXEND;
3904 			flrp->l_len = 0;
3905 		} else {
3906 			flrp->l_start = blocker->l_start;
3907 			if (blocker->l_end == MAX_U_OFFSET_T)
3908 				flrp->l_len = 0;
3909 			else
3910 				flrp->l_len = blocker->l_end -
3911 				    blocker->l_start + 1;
3912 		}
3913 	}
3914 }
3915 
3916 /*
3917  * PSARC case 1997/292
3918  */
3919 /*
3920  * This is the public routine exported by flock.h.
3921  */
3922 void
3923 cl_flk_change_nlm_state_to_unknown(int nlmid)
3924 {
3925 	/*
3926 	 * Check to see if node is booted as a cluster. If not, return.
3927 	 */
3928 	if ((cluster_bootflags & CLUSTER_BOOTED) == 0) {
3929 		return;
3930 	}
3931 
3932 	/*
3933 	 * See comment in cl_flk_set_nlm_status().
3934 	 */
3935 	if (nlm_reg_status == NULL) {
3936 		return;
3937 	}
3938 
3939 	/*
3940 	 * protect NLM registry state with a mutex.
3941 	 */
3942 	ASSERT(nlmid <= nlm_status_size && nlmid >= 0);
3943 	mutex_enter(&nlm_reg_lock);
3944 	FLK_REGISTRY_CHANGE_NLM_STATE(nlm_reg_status, nlmid, FLK_NLM_UNKNOWN);
3945 	mutex_exit(&nlm_reg_lock);
3946 }
3947 
3948 /*
3949  * Return non-zero if the given I/O request conflicts with an active NBMAND
3950  * lock.
3951  * If svmand is non-zero, it means look at all active locks, not just NBMAND
3952  * locks.
3953  */
3954 
3955 int
3956 nbl_lock_conflict(vnode_t *vp, nbl_op_t op, u_offset_t offset,
3957 		ssize_t length, int svmand, caller_context_t *ct)
3958 {
3959 	int conflict = 0;
3960 	graph_t			*gp;
3961 	lock_descriptor_t	*lock;
3962 	pid_t pid;
3963 	int sysid;
3964 
3965 	if (ct == NULL) {
3966 		pid = curproc->p_pid;
3967 		sysid = 0;
3968 	} else {
3969 		pid = ct->cc_pid;
3970 		sysid = ct->cc_sysid;
3971 	}
3972 
3973 	mutex_enter(&flock_lock);
3974 	gp = lock_graph[HASH_INDEX(vp)];
3975 	mutex_exit(&flock_lock);
3976 	if (gp == NULL)
3977 		return (0);
3978 
3979 	mutex_enter(&gp->gp_mutex);
3980 	SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
3981 
3982 	for (; lock && lock->l_vnode == vp; lock = lock->l_next) {
3983 		if ((svmand || (lock->l_state & NBMAND_LOCK)) &&
3984 		    (lock->l_flock.l_sysid != sysid ||
3985 		    lock->l_flock.l_pid != pid) &&
3986 		    lock_blocks_io(op, offset, length,
3987 		    lock->l_type, lock->l_start, lock->l_end)) {
3988 			conflict = 1;
3989 			break;
3990 		}
3991 	}
3992 	mutex_exit(&gp->gp_mutex);
3993 
3994 	return (conflict);
3995 }
3996 
3997 /*
3998  * Return non-zero if the given I/O request conflicts with the given lock.
3999  */
4000 
4001 static int
4002 lock_blocks_io(nbl_op_t op, u_offset_t offset, ssize_t length,
4003 	    int lock_type, u_offset_t lock_start, u_offset_t lock_end)
4004 {
4005 	ASSERT(op == NBL_READ || op == NBL_WRITE || op == NBL_READWRITE);
4006 	ASSERT(lock_type == F_RDLCK || lock_type == F_WRLCK);
4007 
4008 	if (op == NBL_READ && lock_type == F_RDLCK)
4009 		return (0);
4010 
4011 	if (offset <= lock_start && lock_start < offset + length)
4012 		return (1);
4013 	if (lock_start <= offset && offset <= lock_end)
4014 		return (1);
4015 
4016 	return (0);
4017 }
4018 
4019 #ifdef DEBUG
4020 static void
4021 check_active_locks(graph_t *gp)
4022 {
4023 	lock_descriptor_t *lock, *lock1;
4024 	edge_t	*ep;
4025 
4026 	for (lock = ACTIVE_HEAD(gp)->l_next; lock != ACTIVE_HEAD(gp);
4027 	    lock = lock->l_next) {
4028 		ASSERT(IS_ACTIVE(lock));
4029 		ASSERT(NOT_BLOCKED(lock));
4030 		ASSERT(!IS_BARRIER(lock));
4031 
4032 		ep = FIRST_IN(lock);
4033 
4034 		while (ep != HEAD(lock)) {
4035 			ASSERT(IS_SLEEPING(ep->from_vertex));
4036 			ASSERT(!NOT_BLOCKED(ep->from_vertex));
4037 			ep = NEXT_IN(ep);
4038 		}
4039 
4040 		for (lock1 = lock->l_next; lock1 != ACTIVE_HEAD(gp);
4041 		    lock1 = lock1->l_next) {
4042 			if (lock1->l_vnode == lock->l_vnode) {
4043 			if (BLOCKS(lock1, lock)) {
4044 				cmn_err(CE_PANIC,
4045 				    "active lock %p blocks %p",
4046 				    (void *)lock1, (void *)lock);
4047 			} else if (BLOCKS(lock, lock1)) {
4048 				cmn_err(CE_PANIC,
4049 				    "active lock %p blocks %p",
4050 				    (void *)lock, (void *)lock1);
4051 			}
4052 			}
4053 		}
4054 	}
4055 }
4056 
4057 /*
4058  * Effect: This functions checks to see if the transition from 'old_state' to
4059  *	'new_state' is a valid one.  It returns 0 if the transition is valid
4060  *	and 1 if it is not.
4061  *	For a map of valid transitions, see sys/flock_impl.h
4062  */
4063 static int
4064 check_lock_transition(int old_state, int new_state)
4065 {
4066 	switch (old_state) {
4067 	case FLK_INITIAL_STATE:
4068 		if ((new_state == FLK_START_STATE) ||
4069 		    (new_state == FLK_SLEEPING_STATE) ||
4070 		    (new_state == FLK_ACTIVE_STATE) ||
4071 		    (new_state == FLK_DEAD_STATE)) {
4072 			return (0);
4073 		} else {
4074 			return (1);
4075 		}
4076 	case FLK_START_STATE:
4077 		if ((new_state == FLK_ACTIVE_STATE) ||
4078 		    (new_state == FLK_DEAD_STATE)) {
4079 			return (0);
4080 		} else {
4081 			return (1);
4082 		}
4083 	case FLK_ACTIVE_STATE:
4084 		if (new_state == FLK_DEAD_STATE) {
4085 			return (0);
4086 		} else {
4087 			return (1);
4088 		}
4089 	case FLK_SLEEPING_STATE:
4090 		if ((new_state == FLK_GRANTED_STATE) ||
4091 		    (new_state == FLK_INTERRUPTED_STATE) ||
4092 		    (new_state == FLK_CANCELLED_STATE)) {
4093 			return (0);
4094 		} else {
4095 			return (1);
4096 		}
4097 	case FLK_GRANTED_STATE:
4098 		if ((new_state == FLK_START_STATE) ||
4099 		    (new_state == FLK_INTERRUPTED_STATE) ||
4100 		    (new_state == FLK_CANCELLED_STATE)) {
4101 			return (0);
4102 		} else {
4103 			return (1);
4104 		}
4105 	case FLK_CANCELLED_STATE:
4106 		if ((new_state == FLK_INTERRUPTED_STATE) ||
4107 		    (new_state == FLK_DEAD_STATE)) {
4108 			return (0);
4109 		} else {
4110 			return (1);
4111 		}
4112 	case FLK_INTERRUPTED_STATE:
4113 		if (new_state == FLK_DEAD_STATE) {
4114 			return (0);
4115 		} else {
4116 			return (1);
4117 		}
4118 	case FLK_DEAD_STATE:
4119 		/* May be set more than once */
4120 		if (new_state == FLK_DEAD_STATE) {
4121 			return (0);
4122 		} else {
4123 			return (1);
4124 		}
4125 	default:
4126 		return (1);
4127 	}
4128 }
4129 
4130 static void
4131 check_sleeping_locks(graph_t *gp)
4132 {
4133 	lock_descriptor_t *lock1, *lock2;
4134 	edge_t *ep;
4135 	for (lock1 = SLEEPING_HEAD(gp)->l_next; lock1 != SLEEPING_HEAD(gp);
4136 	    lock1 = lock1->l_next) {
4137 				ASSERT(!IS_BARRIER(lock1));
4138 	for (lock2 = lock1->l_next; lock2 != SLEEPING_HEAD(gp);
4139 	    lock2 = lock2->l_next) {
4140 		if (lock1->l_vnode == lock2->l_vnode) {
4141 			if (BLOCKS(lock2, lock1)) {
4142 				ASSERT(!IS_GRANTED(lock1));
4143 				ASSERT(!NOT_BLOCKED(lock1));
4144 				path(lock1, lock2);
4145 			}
4146 		}
4147 	}
4148 
4149 	for (lock2 = ACTIVE_HEAD(gp)->l_next; lock2 != ACTIVE_HEAD(gp);
4150 	    lock2 = lock2->l_next) {
4151 				ASSERT(!IS_BARRIER(lock1));
4152 		if (lock1->l_vnode == lock2->l_vnode) {
4153 			if (BLOCKS(lock2, lock1)) {
4154 				ASSERT(!IS_GRANTED(lock1));
4155 				ASSERT(!NOT_BLOCKED(lock1));
4156 				path(lock1, lock2);
4157 			}
4158 		}
4159 	}
4160 	ep = FIRST_ADJ(lock1);
4161 	while (ep != HEAD(lock1)) {
4162 		ASSERT(BLOCKS(ep->to_vertex, lock1));
4163 		ep = NEXT_ADJ(ep);
4164 	}
4165 	}
4166 }
4167 
4168 static int
4169 level_two_path(lock_descriptor_t *lock1, lock_descriptor_t *lock2, int no_path)
4170 {
4171 	edge_t	*ep;
4172 	lock_descriptor_t	*vertex;
4173 	lock_descriptor_t *vertex_stack;
4174 
4175 	STACK_INIT(vertex_stack);
4176 
4177 	flk_graph_uncolor(lock1->l_graph);
4178 	ep = FIRST_ADJ(lock1);
4179 	ASSERT(ep != HEAD(lock1));
4180 	while (ep != HEAD(lock1)) {
4181 		if (no_path)
4182 			ASSERT(ep->to_vertex != lock2);
4183 		STACK_PUSH(vertex_stack, ep->to_vertex, l_dstack);
4184 		COLOR(ep->to_vertex);
4185 		ep = NEXT_ADJ(ep);
4186 	}
4187 
4188 	while ((vertex = STACK_TOP(vertex_stack)) != NULL) {
4189 		STACK_POP(vertex_stack, l_dstack);
4190 		for (ep = FIRST_ADJ(vertex); ep != HEAD(vertex);
4191 		    ep = NEXT_ADJ(ep)) {
4192 			if (COLORED(ep->to_vertex))
4193 				continue;
4194 			COLOR(ep->to_vertex);
4195 			if (ep->to_vertex == lock2)
4196 				return (1);
4197 
4198 			STACK_PUSH(vertex_stack, ep->to_vertex, l_dstack);
4199 		}
4200 	}
4201 	return (0);
4202 }
4203 
4204 static void
4205 check_owner_locks(graph_t *gp, pid_t pid, int sysid, vnode_t *vp)
4206 {
4207 	lock_descriptor_t *lock;
4208 
4209 	SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
4210 
4211 	if (lock) {
4212 		while (lock != ACTIVE_HEAD(gp) && (lock->l_vnode == vp)) {
4213 			if (lock->l_flock.l_pid == pid &&
4214 			    lock->l_flock.l_sysid == sysid)
4215 				cmn_err(CE_PANIC,
4216 				    "owner pid %d's lock %p in active queue",
4217 				    pid, (void *)lock);
4218 			lock = lock->l_next;
4219 		}
4220 	}
4221 	SET_LOCK_TO_FIRST_SLEEP_VP(gp, lock, vp);
4222 
4223 	if (lock) {
4224 		while (lock != SLEEPING_HEAD(gp) && (lock->l_vnode == vp)) {
4225 			if (lock->l_flock.l_pid == pid &&
4226 			    lock->l_flock.l_sysid == sysid)
4227 				cmn_err(CE_PANIC,
4228 				    "owner pid %d's lock %p in sleep queue",
4229 				    pid, (void *)lock);
4230 			lock = lock->l_next;
4231 		}
4232 	}
4233 }
4234 
4235 static int
4236 level_one_path(lock_descriptor_t *lock1, lock_descriptor_t *lock2)
4237 {
4238 	edge_t *ep = FIRST_ADJ(lock1);
4239 
4240 	while (ep != HEAD(lock1)) {
4241 		if (ep->to_vertex == lock2)
4242 			return (1);
4243 		else
4244 			ep = NEXT_ADJ(ep);
4245 	}
4246 	return (0);
4247 }
4248 
4249 static int
4250 no_path(lock_descriptor_t *lock1, lock_descriptor_t *lock2)
4251 {
4252 	return (!level_two_path(lock1, lock2, 1));
4253 }
4254 
4255 static void
4256 path(lock_descriptor_t *lock1, lock_descriptor_t *lock2)
4257 {
4258 	if (level_one_path(lock1, lock2)) {
4259 		if (level_two_path(lock1, lock2, 0) != 0) {
4260 			cmn_err(CE_WARN,
4261 			    "one edge one path from lock1 %p lock2 %p",
4262 			    (void *)lock1, (void *)lock2);
4263 		}
4264 	} else if (no_path(lock1, lock2)) {
4265 		cmn_err(CE_PANIC,
4266 		    "No path from  lock1 %p to lock2 %p",
4267 		    (void *)lock1, (void *)lock2);
4268 	}
4269 }
4270 #endif /* DEBUG */
4271