xref: /titanic_52/usr/src/uts/common/os/flock.c (revision 03831d35f7499c87d51205817c93e9a8d42c4bae)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License, Version 1.0 only
6  * (the "License").  You may not use this file except in compliance
7  * with the License.
8  *
9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10  * or http://www.opensolaris.org/os/licensing.
11  * See the License for the specific language governing permissions
12  * and limitations under the License.
13  *
14  * When distributing Covered Code, include this CDDL HEADER in each
15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16  * If applicable, add the following below this CDDL HEADER, with the
17  * fields enclosed by brackets "[]" replaced with your own identifying
18  * information: Portions Copyright [yyyy] [name of copyright owner]
19  *
20  * CDDL HEADER END
21  */
22 /* ONC_PLUS EXTRACT START */
23 
24 /*
25  * Copyright 2004 Sun Microsystems, Inc.  All rights reserved.
26  * Use is subject to license terms.
27  */
28 
29 /*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
30 /*	All Rights Reserved */
31 
32 #pragma ident	"%Z%%M%	%I%	%E% SMI"
33 
34 #include <sys/flock_impl.h>
35 #include <sys/vfs.h>
36 #include <sys/t_lock.h>		/* for <sys/callb.h> */
37 #include <sys/callb.h>
38 #include <sys/clconf.h>
39 #include <sys/cladm.h>
40 #include <sys/nbmlock.h>
41 #include <sys/cred.h>
42 #include <sys/policy.h>
43 
44 /*
45  * The following four variables are for statistics purposes and they are
46  * not protected by locks. They may not be accurate but will at least be
47  * close to the actual value.
48  */
49 
50 int	flk_lock_allocs;
51 int	flk_lock_frees;
52 int 	edge_allocs;
53 int	edge_frees;
54 int 	flk_proc_vertex_allocs;
55 int 	flk_proc_edge_allocs;
56 int	flk_proc_vertex_frees;
57 int	flk_proc_edge_frees;
58 
59 static kmutex_t flock_lock;
60 
61 #ifdef DEBUG
62 int check_debug = 0;
63 #define	CHECK_ACTIVE_LOCKS(gp)	if (check_debug) \
64 					check_active_locks(gp);
65 #define	CHECK_SLEEPING_LOCKS(gp)	if (check_debug) \
66 						check_sleeping_locks(gp);
67 #define	CHECK_OWNER_LOCKS(gp, pid, sysid, vp) 	\
68 		if (check_debug)	\
69 			check_owner_locks(gp, pid, sysid, vp);
70 #define	CHECK_LOCK_TRANSITION(old_state, new_state) \
71 	{ \
72 		if (check_lock_transition(old_state, new_state)) { \
73 			cmn_err(CE_PANIC, "Illegal lock transition \
74 			    from %d to %d", old_state, new_state); \
75 		} \
76 	}
77 #else
78 
79 #define	CHECK_ACTIVE_LOCKS(gp)
80 #define	CHECK_SLEEPING_LOCKS(gp)
81 #define	CHECK_OWNER_LOCKS(gp, pid, sysid, vp)
82 #define	CHECK_LOCK_TRANSITION(old_state, new_state)
83 
84 #endif /* DEBUG */
85 
86 struct kmem_cache	*flk_edge_cache;
87 
88 graph_t		*lock_graph[HASH_SIZE];
89 proc_graph_t	pgraph;
90 
91 /*
92  * Clustering.
93  *
94  * NLM REGISTRY TYPE IMPLEMENTATION
95  *
96  * Assumptions:
97  *  1.  Nodes in a cluster are numbered starting at 1; always non-negative
98  *	integers; maximum node id is returned by clconf_maximum_nodeid().
99  *  2.  We use this node id to identify the node an NLM server runs on.
100  */
101 
102 /*
103  * NLM registry object keeps track of NLM servers via their
104  * nlmids (which are the node ids of the node in the cluster they run on)
105  * that have requested locks at this LLM with which this registry is
106  * associated.
107  *
108  * Representation of abstraction:
109  *    rep = record[	states: array[nlm_state],
110  *			lock: mutex]
111  *
112  *    Representation invariants:
113  *	1. index i of rep.states is between 0 and n - 1 where n is number
114  *	   of elements in the array, which happen to be the maximum number
115  *	   of nodes in the cluster configuration + 1.
116  *	2. map nlmid to index i of rep.states
117  *		0   -> 0
118  *		1   -> 1
119  *		2   -> 2
120  *		n-1 -> clconf_maximum_nodeid()+1
121  *	3.  This 1-1 mapping is quite convenient and it avoids errors resulting
122  *	    from forgetting to subtract 1 from the index.
123  *	4.  The reason we keep the 0th index is the following.  A legitimate
124  *	    cluster configuration includes making a UFS file system NFS
125  *	    exportable.  The code is structured so that if you're in a cluster
126  *	    you do one thing; otherwise, you do something else.  The problem
127  *	    is what to do if you think you're in a cluster with PXFS loaded,
128  *	    but you're using UFS not PXFS?  The upper two bytes of the sysid
129  *	    encode the node id of the node where NLM server runs; these bytes
130  *	    are zero for UFS.  Since the nodeid is used to index into the
131  *	    registry, we can record the NLM server state information at index
132  *	    0 using the same mechanism used for PXFS file locks!
133  */
134 static flk_nlm_status_t *nlm_reg_status = NULL;	/* state array 0..N-1 */
135 static kmutex_t nlm_reg_lock;			/* lock to protect arrary */
136 static uint_t nlm_status_size;			/* size of state array */
137 
138 /*
139  * Although we need a global lock dependency graph (and associated data
140  * structures), we also need a per-zone notion of whether the lock manager is
141  * running, and so whether to allow lock manager requests or not.
142  *
143  * Thus, on a per-zone basis we maintain a ``global'' variable
144  * (flk_lockmgr_status), protected by flock_lock, and set when the lock
145  * manager is determined to be changing state (starting or stopping).
146  *
147  * Each graph/zone pair also has a copy of this variable, which is protected by
148  * the graph's mutex.
149  *
150  * The per-graph copies are used to synchronize lock requests with shutdown
151  * requests.  The global copy is used to initialize the per-graph field when a
152  * new graph is created.
153  */
154 struct flock_globals {
155 	flk_lockmgr_status_t flk_lockmgr_status;
156 	flk_lockmgr_status_t lockmgr_status[HASH_SIZE];
157 };
158 
159 zone_key_t flock_zone_key;
160 
161 static void create_flock(lock_descriptor_t *, flock64_t *);
162 static lock_descriptor_t	*flk_get_lock(void);
163 static void	flk_free_lock(lock_descriptor_t	*lock);
164 static void	flk_get_first_blocking_lock(lock_descriptor_t *request);
165 static int flk_process_request(lock_descriptor_t *);
166 static int flk_add_edge(lock_descriptor_t *, lock_descriptor_t *, int, int);
167 static edge_t *flk_get_edge(void);
168 static int flk_wait_execute_request(lock_descriptor_t *);
169 static int flk_relation(lock_descriptor_t *, lock_descriptor_t *);
170 static void flk_insert_active_lock(lock_descriptor_t *);
171 static void flk_delete_active_lock(lock_descriptor_t *, int);
172 static void flk_insert_sleeping_lock(lock_descriptor_t *);
173 static void flk_graph_uncolor(graph_t *);
174 static void flk_wakeup(lock_descriptor_t *, int);
175 static void flk_free_edge(edge_t *);
176 static void flk_recompute_dependencies(lock_descriptor_t *,
177 			lock_descriptor_t **,  int, int);
178 static int flk_find_barriers(lock_descriptor_t *);
179 static void flk_update_barriers(lock_descriptor_t *);
180 static int flk_color_reachables(lock_descriptor_t *);
181 static int flk_canceled(lock_descriptor_t *);
182 static void flk_delete_locks_by_sysid(lock_descriptor_t *);
183 static void report_blocker(lock_descriptor_t *, lock_descriptor_t *);
184 static void wait_for_lock(lock_descriptor_t *);
185 static void unlock_lockmgr_granted(struct flock_globals *);
186 static void wakeup_sleeping_lockmgr_locks(struct flock_globals *);
187 
188 /* Clustering hooks */
189 static void cl_flk_change_nlm_state_all_locks(int, flk_nlm_status_t);
190 static void cl_flk_wakeup_sleeping_nlm_locks(int);
191 static void cl_flk_unlock_nlm_granted(int);
192 
193 #ifdef DEBUG
194 static int check_lock_transition(int, int);
195 static void check_sleeping_locks(graph_t *);
196 static void check_active_locks(graph_t *);
197 static int no_path(lock_descriptor_t *, lock_descriptor_t *);
198 static void path(lock_descriptor_t *, lock_descriptor_t *);
199 static void check_owner_locks(graph_t *, pid_t, int, vnode_t *);
200 static int level_one_path(lock_descriptor_t *, lock_descriptor_t *);
201 static int level_two_path(lock_descriptor_t *, lock_descriptor_t *, int);
202 #endif
203 
204 /*	proc_graph function definitons */
205 static int flk_check_deadlock(lock_descriptor_t *);
206 static void flk_proc_graph_uncolor(void);
207 static proc_vertex_t *flk_get_proc_vertex(lock_descriptor_t *);
208 static proc_edge_t *flk_get_proc_edge(void);
209 static void flk_proc_release(proc_vertex_t *);
210 static void flk_free_proc_edge(proc_edge_t *);
211 static void flk_update_proc_graph(edge_t *, int);
212 
213 /* Non-blocking mandatory locking */
214 static int lock_blocks_io(nbl_op_t, u_offset_t, ssize_t, int, u_offset_t,
215 			u_offset_t);
216 
217 static struct flock_globals *
218 flk_get_globals(void)
219 {
220 	/*
221 	 * The KLM module had better be loaded if we're attempting to handle
222 	 * lockmgr requests.
223 	 */
224 	ASSERT(flock_zone_key != ZONE_KEY_UNINITIALIZED);
225 	return (zone_getspecific(flock_zone_key, curproc->p_zone));
226 }
227 
228 static flk_lockmgr_status_t
229 flk_get_lockmgr_status(void)
230 {
231 	struct flock_globals *fg;
232 
233 	ASSERT(MUTEX_HELD(&flock_lock));
234 
235 	if (flock_zone_key == ZONE_KEY_UNINITIALIZED) {
236 		/*
237 		 * KLM module not loaded; lock manager definitely not running.
238 		 */
239 		return (FLK_LOCKMGR_DOWN);
240 	}
241 	fg = flk_get_globals();
242 	return (fg->flk_lockmgr_status);
243 }
244 
245 /*
246  * Routine called from fs_frlock in fs/fs_subr.c
247  */
248 
249 int
250 reclock(vnode_t		*vp,
251 	flock64_t	*lckdat,
252 	int		cmd,
253 	int		flag,
254 	u_offset_t	offset,
255 	flk_callback_t	*flk_cbp)
256 {
257 	lock_descriptor_t	stack_lock_request;
258 	lock_descriptor_t	*lock_request;
259 	int error = 0;
260 	graph_t	*gp;
261 	int			nlmid;
262 
263 	/*
264 	 * Check access permissions
265 	 */
266 	if ((cmd & SETFLCK) &&
267 		((lckdat->l_type == F_RDLCK && (flag & FREAD) == 0) ||
268 		(lckdat->l_type == F_WRLCK && (flag & FWRITE) == 0)))
269 			return (EBADF);
270 
271 	/*
272 	 * for query and unlock we use the stack_lock_request
273 	 */
274 
275 	if ((lckdat->l_type == F_UNLCK) ||
276 			!((cmd & INOFLCK) || (cmd & SETFLCK))) {
277 		lock_request = &stack_lock_request;
278 		(void) bzero((caddr_t)lock_request,
279 				sizeof (lock_descriptor_t));
280 
281 		/*
282 		 * following is added to make the assertions in
283 		 * flk_execute_request() to pass through
284 		 */
285 
286 		lock_request->l_edge.edge_in_next = &lock_request->l_edge;
287 		lock_request->l_edge.edge_in_prev = &lock_request->l_edge;
288 		lock_request->l_edge.edge_adj_next = &lock_request->l_edge;
289 		lock_request->l_edge.edge_adj_prev = &lock_request->l_edge;
290 		lock_request->l_status = FLK_INITIAL_STATE;
291 	} else {
292 		lock_request = flk_get_lock();
293 	}
294 	lock_request->l_state = 0;
295 	lock_request->l_vnode = vp;
296 	lock_request->l_zoneid = getzoneid();
297 
298 	/*
299 	 * Convert the request range into the canonical start and end
300 	 * values.  The NLM protocol supports locking over the entire
301 	 * 32-bit range, so there's no range checking for remote requests,
302 	 * but we still need to verify that local requests obey the rules.
303 	 */
304 	/* Clustering */
305 	if ((cmd & (RCMDLCK | PCMDLCK)) != 0) {
306 		ASSERT(lckdat->l_whence == 0);
307 		lock_request->l_start = lckdat->l_start;
308 		lock_request->l_end = (lckdat->l_len == 0) ? MAX_U_OFFSET_T :
309 			lckdat->l_start + (lckdat->l_len - 1);
310 	} else {
311 		/* check the validity of the lock range */
312 		error = flk_convert_lock_data(vp, lckdat,
313 			&lock_request->l_start, &lock_request->l_end,
314 			offset);
315 		if (error) {
316 			goto done;
317 		}
318 		error = flk_check_lock_data(lock_request->l_start,
319 					    lock_request->l_end, MAXEND);
320 		if (error) {
321 			goto done;
322 		}
323 	}
324 
325 	ASSERT(lock_request->l_end >= lock_request->l_start);
326 
327 	lock_request->l_type = lckdat->l_type;
328 	if (cmd & INOFLCK)
329 		lock_request->l_state |= IO_LOCK;
330 	if (cmd & SLPFLCK)
331 		lock_request->l_state |= WILLING_TO_SLEEP_LOCK;
332 	if (cmd & RCMDLCK)
333 		lock_request->l_state |= LOCKMGR_LOCK;
334 	if (cmd & NBMLCK)
335 		lock_request->l_state |= NBMAND_LOCK;
336 	/*
337 	 * Clustering: set flag for PXFS locks
338 	 * We do not _only_ check for the PCMDLCK flag because PXFS locks could
339 	 * also be of type 'RCMDLCK'.
340 	 * We do not _only_ check the GETPXFSID() macro because local PXFS
341 	 * clients use a pxfsid of zero to permit deadlock detection in the LLM.
342 	 */
343 
344 	if ((cmd & PCMDLCK) || (GETPXFSID(lckdat->l_sysid) != 0)) {
345 		lock_request->l_state |= PXFS_LOCK;
346 	}
347 	if (!((cmd & SETFLCK) || (cmd & INOFLCK))) {
348 		if (lock_request->l_type == F_RDLCK ||
349 			lock_request->l_type == F_WRLCK)
350 			lock_request->l_state |= QUERY_LOCK;
351 	}
352 	lock_request->l_flock = (*lckdat);
353 	lock_request->l_callbacks = flk_cbp;
354 
355 	/*
356 	 * We are ready for processing the request
357 	 */
358 	if (IS_LOCKMGR(lock_request)) {
359 		/*
360 		 * If the lock request is an NLM server request ....
361 		 */
362 		if (nlm_status_size == 0) { /* not booted as cluster */
363 			mutex_enter(&flock_lock);
364 			/*
365 			 * Bail out if this is a lock manager request and the
366 			 * lock manager is not supposed to be running.
367 			 */
368 			if (flk_get_lockmgr_status() != FLK_LOCKMGR_UP) {
369 				mutex_exit(&flock_lock);
370 				error = ENOLCK;
371 				goto done;
372 			}
373 			mutex_exit(&flock_lock);
374 		} else {			/* booted as a cluster */
375 			nlmid = GETNLMID(lock_request->l_flock.l_sysid);
376 			ASSERT(nlmid <= nlm_status_size && nlmid >= 0);
377 
378 			mutex_enter(&nlm_reg_lock);
379 			/*
380 			 * If the NLM registry does not know about this
381 			 * NLM server making the request, add its nlmid
382 			 * to the registry.
383 			 */
384 			if (FLK_REGISTRY_IS_NLM_UNKNOWN(nlm_reg_status,
385 				nlmid)) {
386 				FLK_REGISTRY_ADD_NLMID(nlm_reg_status, nlmid);
387 			} else if (!FLK_REGISTRY_IS_NLM_UP(nlm_reg_status,
388 				nlmid)) {
389 				/*
390 				 * If the NLM server is already known (has made
391 				 * previous lock requests) and its state is
392 				 * not NLM_UP (means that NLM server is
393 				 * shutting down), then bail out with an
394 				 * error to deny the lock request.
395 				 */
396 				mutex_exit(&nlm_reg_lock);
397 				error = ENOLCK;
398 				goto done;
399 			}
400 			mutex_exit(&nlm_reg_lock);
401 		}
402 	}
403 
404 	/* Now get the lock graph for a particular vnode */
405 	gp = flk_get_lock_graph(vp, FLK_INIT_GRAPH);
406 
407 	/*
408 	 * We drop rwlock here otherwise this might end up causing a
409 	 * deadlock if this IOLOCK sleeps. (bugid # 1183392).
410 	 */
411 
412 	if (IS_IO_LOCK(lock_request)) {
413 		VOP_RWUNLOCK(vp,
414 			(lock_request->l_type == F_RDLCK) ?
415 				V_WRITELOCK_FALSE : V_WRITELOCK_TRUE, NULL);
416 	}
417 	mutex_enter(&gp->gp_mutex);
418 
419 	lock_request->l_state |= REFERENCED_LOCK;
420 	lock_request->l_graph = gp;
421 
422 	switch (lock_request->l_type) {
423 	case F_RDLCK:
424 	case F_WRLCK:
425 		if (IS_QUERY_LOCK(lock_request)) {
426 			flk_get_first_blocking_lock(lock_request);
427 			(*lckdat) = lock_request->l_flock;
428 			break;
429 		}
430 
431 		/* process the request now */
432 
433 		error = flk_process_request(lock_request);
434 		break;
435 
436 	case F_UNLCK:
437 		/* unlock request will not block so execute it immediately */
438 
439 		if (IS_LOCKMGR(lock_request) &&
440 		    flk_canceled(lock_request)) {
441 			error = 0;
442 		} else {
443 			error = flk_execute_request(lock_request);
444 		}
445 		break;
446 
447 	case F_UNLKSYS:
448 		/*
449 		 * Recovery mechanism to release lock manager locks when
450 		 * NFS client crashes and restart. NFS server will clear
451 		 * old locks and grant new locks.
452 		 */
453 
454 		if (lock_request->l_flock.l_sysid == 0) {
455 			mutex_exit(&gp->gp_mutex);
456 			return (EINVAL);
457 		}
458 		if (secpolicy_nfs(CRED()) != 0) {
459 			mutex_exit(&gp->gp_mutex);
460 			return (EPERM);
461 		}
462 		flk_delete_locks_by_sysid(lock_request);
463 		lock_request->l_state &= ~REFERENCED_LOCK;
464 		flk_set_state(lock_request, FLK_DEAD_STATE);
465 		flk_free_lock(lock_request);
466 		mutex_exit(&gp->gp_mutex);
467 		return (0);
468 
469 	default:
470 		error = EINVAL;
471 		break;
472 	}
473 
474 	/* Clustering: For blocked PXFS locks, return */
475 	if (error == PXFS_LOCK_BLOCKED) {
476 		lock_request->l_state &= ~REFERENCED_LOCK;
477 		mutex_exit(&gp->gp_mutex);
478 		return (error);
479 	}
480 
481 	/*
482 	 * Now that we have seen the status of locks in the system for
483 	 * this vnode we acquire the rwlock if it is an IO_LOCK.
484 	 */
485 
486 	if (IS_IO_LOCK(lock_request)) {
487 		(void) VOP_RWLOCK(vp,
488 			(lock_request->l_type == F_RDLCK) ?
489 				V_WRITELOCK_FALSE : V_WRITELOCK_TRUE, NULL);
490 		if (!error) {
491 			lckdat->l_type = F_UNLCK;
492 
493 			/*
494 			 * This wake up is needed otherwise
495 			 * if IO_LOCK has slept the dependents on this
496 			 * will not be woken up at all. (bugid # 1185482).
497 			 */
498 
499 			flk_wakeup(lock_request, 1);
500 			flk_set_state(lock_request, FLK_DEAD_STATE);
501 			flk_free_lock(lock_request);
502 		}
503 		/*
504 		 * else if error had occurred either flk_process_request()
505 		 * has returned EDEADLK in which case there will be no
506 		 * dependents for this lock or EINTR from flk_wait_execute_
507 		 * request() in which case flk_cancel_sleeping_lock()
508 		 * would have been done. same is true with EBADF.
509 		 */
510 	}
511 
512 	if (lock_request == &stack_lock_request) {
513 		flk_set_state(lock_request, FLK_DEAD_STATE);
514 	} else {
515 		lock_request->l_state &= ~REFERENCED_LOCK;
516 		if ((error != 0) || IS_DELETED(lock_request)) {
517 			flk_set_state(lock_request, FLK_DEAD_STATE);
518 			flk_free_lock(lock_request);
519 		}
520 	}
521 
522 	mutex_exit(&gp->gp_mutex);
523 	return (error);
524 
525 done:
526 	flk_set_state(lock_request, FLK_DEAD_STATE);
527 	if (lock_request != &stack_lock_request)
528 		flk_free_lock(lock_request);
529 	return (error);
530 }
531 
532 /*
533  * Invoke the callbacks in the given list.  If before sleeping, invoke in
534  * list order.  If after sleeping, invoke in reverse order.
535  *
536  * CPR (suspend/resume) support: if one of the callbacks returns a
537  * callb_cpr_t, return it.   This will be used to make the thread CPR-safe
538  * while it is sleeping.  There should be at most one callb_cpr_t for the
539  * thread.
540  * XXX This is unnecessarily complicated.  The CPR information should just
541  * get passed in directly through VOP_FRLOCK and reclock, rather than
542  * sneaking it in via a callback.
543  */
544 
545 callb_cpr_t *
546 flk_invoke_callbacks(flk_callback_t *cblist, flk_cb_when_t when)
547 {
548 	callb_cpr_t *cpr_callbackp = NULL;
549 	callb_cpr_t *one_result;
550 	flk_callback_t *cb;
551 
552 	if (cblist == NULL)
553 		return (NULL);
554 
555 	if (when == FLK_BEFORE_SLEEP) {
556 		cb = cblist;
557 		do {
558 			one_result = (*cb->cb_callback)(when, cb->cb_data);
559 			if (one_result != NULL) {
560 				ASSERT(cpr_callbackp == NULL);
561 				cpr_callbackp = one_result;
562 			}
563 			cb = cb->cb_next;
564 		} while (cb != cblist);
565 	} else {
566 		cb = cblist->cb_prev;
567 		do {
568 			one_result = (*cb->cb_callback)(when, cb->cb_data);
569 			if (one_result != NULL) {
570 				cpr_callbackp = one_result;
571 			}
572 			cb = cb->cb_prev;
573 		} while (cb != cblist->cb_prev);
574 	}
575 
576 	return (cpr_callbackp);
577 }
578 
579 /*
580  * Initialize a flk_callback_t to hold the given callback.
581  */
582 
583 void
584 flk_init_callback(flk_callback_t *flk_cb,
585 	callb_cpr_t *(*cb_fcn)(flk_cb_when_t, void *), void *cbdata)
586 {
587 	flk_cb->cb_next = flk_cb;
588 	flk_cb->cb_prev = flk_cb;
589 	flk_cb->cb_callback = cb_fcn;
590 	flk_cb->cb_data = cbdata;
591 }
592 
593 /*
594  * Initialize an flk_callback_t and then link it into the head of an
595  * existing list (which may be NULL).
596  */
597 
598 void
599 flk_add_callback(flk_callback_t *newcb,
600 		callb_cpr_t *(*cb_fcn)(flk_cb_when_t, void *),
601 		void *cbdata, flk_callback_t *cblist)
602 {
603 	flk_init_callback(newcb, cb_fcn, cbdata);
604 
605 	if (cblist == NULL)
606 		return;
607 
608 	newcb->cb_prev = cblist->cb_prev;
609 	newcb->cb_next = cblist;
610 	cblist->cb_prev->cb_next = newcb;
611 	cblist->cb_prev = newcb;
612 }
613 /* ONC_PLUS EXTRACT END */
614 
615 /*
616  * Initialize the flk_edge_cache data structure and create the
617  * nlm_reg_status array.
618  */
619 
620 void
621 flk_init(void)
622 {
623 	uint_t	i;
624 
625 	flk_edge_cache = kmem_cache_create("flk_edges",
626 		sizeof (struct edge), 0, NULL, NULL, NULL, NULL, NULL, 0);
627 	if (flk_edge_cache == NULL) {
628 		cmn_err(CE_PANIC, "Couldn't create flk_edge_cache\n");
629 	}
630 	/*
631 	 * Create the NLM registry object.
632 	 */
633 
634 	if (cluster_bootflags & CLUSTER_BOOTED) {
635 		/*
636 		 * This routine tells you the maximum node id that will be used
637 		 * in the cluster.  This number will be the size of the nlm
638 		 * registry status array.  We add 1 because we will be using
639 		 * all entries indexed from 0 to maxnodeid; e.g., from 0
640 		 * to 64, for a total of 65 entries.
641 		 */
642 		nlm_status_size = clconf_maximum_nodeid() + 1;
643 	} else {
644 		nlm_status_size = 0;
645 	}
646 
647 	if (nlm_status_size != 0) {	/* booted as a cluster */
648 		nlm_reg_status = (flk_nlm_status_t *)
649 			kmem_alloc(sizeof (flk_nlm_status_t) * nlm_status_size,
650 				KM_SLEEP);
651 
652 		/* initialize all NLM states in array to NLM_UNKNOWN */
653 		for (i = 0; i < nlm_status_size; i++) {
654 			nlm_reg_status[i] = FLK_NLM_UNKNOWN;
655 		}
656 	}
657 }
658 
659 /*
660  * Zone constructor/destructor callbacks to be executed when a zone is
661  * created/destroyed.
662  */
663 /* ARGSUSED */
664 void *
665 flk_zone_init(zoneid_t zoneid)
666 {
667 	struct flock_globals *fg;
668 	uint_t i;
669 
670 	fg = kmem_alloc(sizeof (*fg), KM_SLEEP);
671 	fg->flk_lockmgr_status = FLK_LOCKMGR_UP;
672 	for (i = 0; i < HASH_SIZE; i++)
673 		fg->lockmgr_status[i] = FLK_LOCKMGR_UP;
674 	return (fg);
675 }
676 
677 /* ARGSUSED */
678 void
679 flk_zone_fini(zoneid_t zoneid, void *data)
680 {
681 	struct flock_globals *fg = data;
682 
683 	kmem_free(fg, sizeof (*fg));
684 }
685 
686 /*
687  * Get a lock_descriptor structure with initialisation of edge lists.
688  */
689 
690 static lock_descriptor_t *
691 flk_get_lock(void)
692 {
693 	lock_descriptor_t	*l;
694 
695 	l = kmem_zalloc(sizeof (lock_descriptor_t), KM_SLEEP);
696 
697 	cv_init(&l->l_cv, NULL, CV_DRIVER, NULL);
698 	l->l_edge.edge_in_next = &l->l_edge;
699 	l->l_edge.edge_in_prev = &l->l_edge;
700 	l->l_edge.edge_adj_next = &l->l_edge;
701 	l->l_edge.edge_adj_prev = &l->l_edge;
702 	l->pvertex = -1;
703 	l->l_status = FLK_INITIAL_STATE;
704 	flk_lock_allocs++;
705 	return (l);
706 }
707 
708 /*
709  * Free a lock_descriptor structure. Just sets the DELETED_LOCK flag
710  * when some thread has a reference to it as in reclock().
711  */
712 
713 void
714 flk_free_lock(lock_descriptor_t	*lock)
715 {
716 	ASSERT(IS_DEAD(lock));
717 	if (IS_REFERENCED(lock)) {
718 		lock->l_state |= DELETED_LOCK;
719 		return;
720 	}
721 	flk_lock_frees++;
722 	kmem_free((void *)lock, sizeof (lock_descriptor_t));
723 }
724 
725 void
726 flk_set_state(lock_descriptor_t *lock, int new_state)
727 {
728 	/*
729 	 * Locks in the sleeping list may be woken up in a number of ways,
730 	 * and more than once.  If a sleeping lock is signalled awake more
731 	 * than once, then it may or may not change state depending on its
732 	 * current state.
733 	 * Also note that NLM locks that are sleeping could be moved to an
734 	 * interrupted state more than once if the unlock request is
735 	 * retransmitted by the NLM client - the second time around, this is
736 	 * just a nop.
737 	 * The ordering of being signalled awake is:
738 	 * INTERRUPTED_STATE > CANCELLED_STATE > GRANTED_STATE.
739 	 * The checks below implement this ordering.
740 	 */
741 	if (IS_INTERRUPTED(lock)) {
742 		if ((new_state == FLK_CANCELLED_STATE) ||
743 		    (new_state == FLK_GRANTED_STATE) ||
744 		    (new_state == FLK_INTERRUPTED_STATE)) {
745 			return;
746 		}
747 	}
748 	if (IS_CANCELLED(lock)) {
749 		if ((new_state == FLK_GRANTED_STATE) ||
750 		    (new_state == FLK_CANCELLED_STATE)) {
751 			return;
752 		}
753 	}
754 	CHECK_LOCK_TRANSITION(lock->l_status, new_state);
755 	if (IS_PXFS(lock)) {
756 		cl_flk_state_transition_notify(lock, lock->l_status, new_state);
757 	}
758 	lock->l_status = new_state;
759 }
760 
761 /*
762  * Routine that checks whether there are any blocking locks in the system.
763  *
764  * The policy followed is if a write lock is sleeping we don't allow read
765  * locks before this write lock even though there may not be any active
766  * locks corresponding to the read locks' region.
767  *
768  * flk_add_edge() function adds an edge between l1 and l2 iff there
769  * is no path between l1 and l2. This is done to have a "minimum
770  * storage representation" of the dependency graph.
771  *
772  * Another property of the graph is since only the new request throws
773  * edges to the existing locks in the graph, the graph is always topologically
774  * ordered.
775  */
776 
777 static int
778 flk_process_request(lock_descriptor_t *request)
779 {
780 	graph_t	*gp = request->l_graph;
781 	lock_descriptor_t *lock;
782 	int request_blocked_by_active = 0;
783 	int request_blocked_by_granted = 0;
784 	int request_blocked_by_sleeping = 0;
785 	vnode_t	*vp = request->l_vnode;
786 	int	error = 0;
787 	int request_will_wait = 0;
788 	int found_covering_lock = 0;
789 	lock_descriptor_t *covered_by = NULL;
790 
791 	ASSERT(MUTEX_HELD(&gp->gp_mutex));
792 	request_will_wait = IS_WILLING_TO_SLEEP(request);
793 
794 	/*
795 	 * check active locks
796 	 */
797 
798 	SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
799 
800 
801 	if (lock) {
802 		do {
803 			if (BLOCKS(lock, request)) {
804 				if (!request_will_wait)
805 					return (EAGAIN);
806 				request_blocked_by_active = 1;
807 				break;
808 			}
809 			/*
810 			 * Grant lock if it is for the same owner holding active
811 			 * lock that covers the request.
812 			 */
813 
814 			if (SAME_OWNER(lock, request) &&
815 					COVERS(lock, request) &&
816 						(request->l_type == F_RDLCK))
817 				return (flk_execute_request(request));
818 			lock = lock->l_next;
819 		} while (lock->l_vnode == vp);
820 	}
821 
822 	if (!request_blocked_by_active) {
823 			lock_descriptor_t *lk[1];
824 			lock_descriptor_t *first_glock = NULL;
825 		/*
826 		 * Shall we grant this?! NO!!
827 		 * What about those locks that were just granted and still
828 		 * in sleep queue. Those threads are woken up and so locks
829 		 * are almost active.
830 		 */
831 		SET_LOCK_TO_FIRST_SLEEP_VP(gp, lock, vp);
832 		if (lock) {
833 			do {
834 				if (BLOCKS(lock, request)) {
835 					if (IS_GRANTED(lock)) {
836 						request_blocked_by_granted = 1;
837 					} else {
838 						request_blocked_by_sleeping = 1;
839 					}
840 				}
841 
842 				lock = lock->l_next;
843 			} while ((lock->l_vnode == vp));
844 			first_glock = lock->l_prev;
845 			ASSERT(first_glock->l_vnode == vp);
846 		}
847 
848 		if (request_blocked_by_granted)
849 			goto block;
850 
851 		if (!request_blocked_by_sleeping) {
852 			/*
853 			 * If the request isn't going to be blocked by a
854 			 * sleeping request, we know that it isn't going to
855 			 * be blocked; we can just execute the request --
856 			 * without performing costly deadlock detection.
857 			 */
858 			ASSERT(!request_blocked_by_active);
859 			return (flk_execute_request(request));
860 		} else if (request->l_type == F_RDLCK) {
861 			/*
862 			 * If we have a sleeping writer in the requested
863 			 * lock's range, block.
864 			 */
865 			goto block;
866 		}
867 
868 		lk[0] = request;
869 		request->l_state |= RECOMPUTE_LOCK;
870 		SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
871 		if (lock) {
872 			do {
873 				flk_recompute_dependencies(lock, lk, 1, 0);
874 				lock = lock->l_next;
875 			} while (lock->l_vnode == vp);
876 		}
877 		lock = first_glock;
878 		if (lock) {
879 			do {
880 				if (IS_GRANTED(lock)) {
881 				flk_recompute_dependencies(lock, lk, 1, 0);
882 				}
883 				lock = lock->l_prev;
884 			} while ((lock->l_vnode == vp));
885 		}
886 		request->l_state &= ~RECOMPUTE_LOCK;
887 		if (!NO_DEPENDENTS(request) && flk_check_deadlock(request))
888 			return (EDEADLK);
889 		return (flk_execute_request(request));
890 	}
891 
892 block:
893 	if (request_will_wait)
894 		flk_graph_uncolor(gp);
895 
896 	/* check sleeping locks */
897 
898 	SET_LOCK_TO_FIRST_SLEEP_VP(gp, lock, vp);
899 
900 	/*
901 	 * If we find a sleeping write lock that is a superset of the
902 	 * region wanted by request we can be assured that by adding an
903 	 * edge to this write lock we have paths to all locks in the
904 	 * graph that blocks the request except in one case and that is why
905 	 * another check for SAME_OWNER in the loop below. The exception
906 	 * case is when this process that owns the sleeping write lock 'l1'
907 	 * has other locks l2, l3, l4 that are in the system and arrived
908 	 * before l1. l1 does not have path to these locks as they are from
909 	 * same process. We break when we find a second covering sleeping
910 	 * lock l5 owned by a process different from that owning l1, because
911 	 * there cannot be any of l2, l3, l4, etc., arrived before l5, and if
912 	 * it has l1 would have produced a deadlock already.
913 	 */
914 
915 	if (lock) {
916 		do {
917 			if (BLOCKS(lock, request)) {
918 				if (!request_will_wait)
919 					return (EAGAIN);
920 				if (COVERS(lock, request) &&
921 						lock->l_type == F_WRLCK) {
922 					if (found_covering_lock &&
923 					    !SAME_OWNER(lock, covered_by)) {
924 						found_covering_lock++;
925 						break;
926 					}
927 					found_covering_lock = 1;
928 					covered_by = lock;
929 				}
930 				if (found_covering_lock &&
931 					!SAME_OWNER(lock, covered_by)) {
932 					lock = lock->l_next;
933 					continue;
934 				}
935 				if ((error = flk_add_edge(request, lock,
936 						!found_covering_lock, 0)))
937 					return (error);
938 			}
939 			lock = lock->l_next;
940 		} while (lock->l_vnode == vp);
941 	}
942 
943 /*
944  * found_covering_lock == 2 iff at this point 'request' has paths
945  * to all locks that blocks 'request'. found_covering_lock == 1 iff at this
946  * point 'request' has paths to all locks that blocks 'request' whose owners
947  * are not same as the one that covers 'request' (covered_by above) and
948  * we can have locks whose owner is same as covered_by in the active list.
949  */
950 
951 	if (request_blocked_by_active && found_covering_lock != 2) {
952 		SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
953 		ASSERT(lock != NULL);
954 		do {
955 			if (BLOCKS(lock, request)) {
956 				if (found_covering_lock &&
957 					!SAME_OWNER(lock, covered_by)) {
958 					lock = lock->l_next;
959 					continue;
960 				}
961 				if ((error = flk_add_edge(request, lock,
962 							CHECK_CYCLE, 0)))
963 					return (error);
964 			}
965 			lock = lock->l_next;
966 		} while (lock->l_vnode == vp);
967 	}
968 
969 	if (NOT_BLOCKED(request)) {
970 		/*
971 		 * request not dependent on any other locks
972 		 * so execute this request
973 		 */
974 		return (flk_execute_request(request));
975 	} else {
976 		/*
977 		 * check for deadlock
978 		 */
979 		if (flk_check_deadlock(request))
980 			return (EDEADLK);
981 		/*
982 		 * this thread has to sleep
983 		 */
984 		return (flk_wait_execute_request(request));
985 	}
986 }
987 
988 /* ONC_PLUS EXTRACT START */
989 /*
990  * The actual execution of the request in the simple case is only to
991  * insert the 'request' in the list of active locks if it is not an
992  * UNLOCK.
993  * We have to consider the existing active locks' relation to
994  * this 'request' if they are owned by same process. flk_relation() does
995  * this job and sees to that the dependency graph information is maintained
996  * properly.
997  */
998 
999 int
1000 flk_execute_request(lock_descriptor_t *request)
1001 {
1002 	graph_t	*gp = request->l_graph;
1003 	vnode_t	*vp = request->l_vnode;
1004 	lock_descriptor_t	*lock, *lock1;
1005 	int done_searching = 0;
1006 
1007 	CHECK_SLEEPING_LOCKS(gp);
1008 	CHECK_ACTIVE_LOCKS(gp);
1009 
1010 	ASSERT(MUTEX_HELD(&gp->gp_mutex));
1011 
1012 	flk_set_state(request, FLK_START_STATE);
1013 
1014 	ASSERT(NOT_BLOCKED(request));
1015 
1016 	/* IO_LOCK requests are only to check status */
1017 
1018 	if (IS_IO_LOCK(request))
1019 		return (0);
1020 
1021 	SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
1022 
1023 	if (lock == NULL && request->l_type == F_UNLCK)
1024 		return (0);
1025 	if (lock == NULL) {
1026 		flk_insert_active_lock(request);
1027 		return (0);
1028 	}
1029 
1030 	do {
1031 		lock1 = lock->l_next;
1032 		if (SAME_OWNER(request, lock)) {
1033 			done_searching = flk_relation(lock, request);
1034 		}
1035 		lock = lock1;
1036 	} while (lock->l_vnode == vp && !done_searching);
1037 
1038 	/*
1039 	 * insert in active queue
1040 	 */
1041 
1042 	if (request->l_type != F_UNLCK)
1043 		flk_insert_active_lock(request);
1044 
1045 	return (0);
1046 }
1047 /* ONC_PLUS EXTRACT END */
1048 
1049 /*
1050  * 'request' is blocked by some one therefore we put it into sleep queue.
1051  */
1052 static int
1053 flk_wait_execute_request(lock_descriptor_t *request)
1054 {
1055 	graph_t	*gp = request->l_graph;
1056 	callb_cpr_t 	*cprp;		/* CPR info from callback */
1057 	struct flock_globals *fg;
1058 	int index;
1059 
1060 	ASSERT(MUTEX_HELD(&gp->gp_mutex));
1061 	ASSERT(IS_WILLING_TO_SLEEP(request));
1062 
1063 	flk_insert_sleeping_lock(request);
1064 
1065 	if (IS_LOCKMGR(request)) {
1066 		index = HASH_INDEX(request->l_vnode);
1067 		fg = flk_get_globals();
1068 
1069 		if (nlm_status_size == 0) {	/* not booted as a cluster */
1070 			if (fg->lockmgr_status[index] != FLK_LOCKMGR_UP) {
1071 				flk_cancel_sleeping_lock(request, 1);
1072 				return (ENOLCK);
1073 			}
1074 		} else {			/* booted as a cluster */
1075 			/*
1076 			 * If the request is an NLM server lock request,
1077 			 * and the NLM state of the lock request is not
1078 			 * NLM_UP (because the NLM server is shutting
1079 			 * down), then cancel the sleeping lock and
1080 			 * return error ENOLCK that will encourage the
1081 			 * client to retransmit.
1082 			 */
1083 			if (!IS_NLM_UP(request)) {
1084 				flk_cancel_sleeping_lock(request, 1);
1085 				return (ENOLCK);
1086 			}
1087 		}
1088 	}
1089 
1090 	/* Clustering: For blocking PXFS locks, return */
1091 	if (IS_PXFS(request)) {
1092 		/*
1093 		 * PXFS locks sleep on the client side.
1094 		 * The callback argument is used to wake up the sleeper
1095 		 * when the lock is granted.
1096 		 * We return -1 (rather than an errno value) to indicate
1097 		 * the client side should sleep
1098 		 */
1099 		return (PXFS_LOCK_BLOCKED);
1100 	}
1101 
1102 	if (request->l_callbacks != NULL) {
1103 		/*
1104 		 * To make sure the shutdown code works correctly, either
1105 		 * the callback must happen after putting the lock on the
1106 		 * sleep list, or we must check the shutdown status after
1107 		 * returning from the callback (and before sleeping).  At
1108 		 * least for now, we'll use the first option.  If a
1109 		 * shutdown or signal or whatever happened while the graph
1110 		 * mutex was dropped, that will be detected by
1111 		 * wait_for_lock().
1112 		 */
1113 		mutex_exit(&gp->gp_mutex);
1114 
1115 		cprp = flk_invoke_callbacks(request->l_callbacks,
1116 					    FLK_BEFORE_SLEEP);
1117 
1118 		mutex_enter(&gp->gp_mutex);
1119 
1120 		if (cprp == NULL) {
1121 			wait_for_lock(request);
1122 		} else {
1123 			mutex_enter(cprp->cc_lockp);
1124 			CALLB_CPR_SAFE_BEGIN(cprp);
1125 			mutex_exit(cprp->cc_lockp);
1126 			wait_for_lock(request);
1127 			mutex_enter(cprp->cc_lockp);
1128 			CALLB_CPR_SAFE_END(cprp, cprp->cc_lockp);
1129 			mutex_exit(cprp->cc_lockp);
1130 		}
1131 
1132 		mutex_exit(&gp->gp_mutex);
1133 		(void) flk_invoke_callbacks(request->l_callbacks,
1134 					    FLK_AFTER_SLEEP);
1135 		mutex_enter(&gp->gp_mutex);
1136 	} else {
1137 		wait_for_lock(request);
1138 	}
1139 
1140 	if (IS_LOCKMGR(request)) {
1141 		/*
1142 		 * If the lock manager is shutting down, return an
1143 		 * error that will encourage the client to retransmit.
1144 		 */
1145 		if (fg->lockmgr_status[index] != FLK_LOCKMGR_UP &&
1146 			!IS_GRANTED(request)) {
1147 			flk_cancel_sleeping_lock(request, 1);
1148 			return (ENOLCK);
1149 		}
1150 	}
1151 
1152 	if (IS_INTERRUPTED(request)) {
1153 		/* we got a signal, or act like we did */
1154 		flk_cancel_sleeping_lock(request, 1);
1155 		return (EINTR);
1156 	}
1157 
1158 	/* Cancelled if some other thread has closed the file */
1159 
1160 	if (IS_CANCELLED(request)) {
1161 		flk_cancel_sleeping_lock(request, 1);
1162 		return (EBADF);
1163 	}
1164 
1165 	request->l_state &= ~GRANTED_LOCK;
1166 	REMOVE_SLEEP_QUEUE(request);
1167 	return (flk_execute_request(request));
1168 }
1169 
1170 /*
1171  * This routine adds an edge between from and to because from depends
1172  * to. If asked to check for deadlock it checks whether there are any
1173  * reachable locks from "from_lock" that is owned by the same process
1174  * as "from_lock".
1175  * NOTE: It is the caller's responsibility to make sure that the color
1176  * of the graph is consistent between the calls to flk_add_edge as done
1177  * in flk_process_request. This routine does not color and check for
1178  * deadlock explicitly.
1179  */
1180 
1181 static int
1182 flk_add_edge(lock_descriptor_t *from_lock, lock_descriptor_t *to_lock,
1183 			int check_cycle, int update_graph)
1184 {
1185 	edge_t	*edge;
1186 	edge_t	*ep;
1187 	lock_descriptor_t	*vertex;
1188 	lock_descriptor_t *vertex_stack;
1189 
1190 	STACK_INIT(vertex_stack);
1191 
1192 	/*
1193 	 * if to vertex already has mark_color just return
1194 	 * don't add an edge as it is reachable from from vertex
1195 	 * before itself.
1196 	 */
1197 
1198 	if (COLORED(to_lock))
1199 		return (0);
1200 
1201 	edge = flk_get_edge();
1202 
1203 	/*
1204 	 * set the from and to vertex
1205 	 */
1206 
1207 	edge->from_vertex = from_lock;
1208 	edge->to_vertex = to_lock;
1209 
1210 	/*
1211 	 * put in adjacency list of from vertex
1212 	 */
1213 
1214 	from_lock->l_edge.edge_adj_next->edge_adj_prev = edge;
1215 	edge->edge_adj_next = from_lock->l_edge.edge_adj_next;
1216 	edge->edge_adj_prev = &from_lock->l_edge;
1217 	from_lock->l_edge.edge_adj_next = edge;
1218 
1219 	/*
1220 	 * put in in list of to vertex
1221 	 */
1222 
1223 	to_lock->l_edge.edge_in_next->edge_in_prev = edge;
1224 	edge->edge_in_next = to_lock->l_edge.edge_in_next;
1225 	to_lock->l_edge.edge_in_next = edge;
1226 	edge->edge_in_prev = &to_lock->l_edge;
1227 
1228 
1229 	if (update_graph) {
1230 		flk_update_proc_graph(edge, 0);
1231 		return (0);
1232 	}
1233 	if (!check_cycle) {
1234 		return (0);
1235 	}
1236 
1237 	STACK_PUSH(vertex_stack, from_lock, l_stack);
1238 
1239 	while ((vertex = STACK_TOP(vertex_stack)) != NULL) {
1240 
1241 		STACK_POP(vertex_stack, l_stack);
1242 
1243 		for (ep = FIRST_ADJ(vertex);
1244 			ep != HEAD(vertex);
1245 				ep = NEXT_ADJ(ep)) {
1246 			if (COLORED(ep->to_vertex))
1247 				continue;
1248 			COLOR(ep->to_vertex);
1249 			if (SAME_OWNER(ep->to_vertex, from_lock))
1250 				goto dead_lock;
1251 			STACK_PUSH(vertex_stack, ep->to_vertex, l_stack);
1252 		}
1253 	}
1254 	return (0);
1255 
1256 dead_lock:
1257 
1258 	/*
1259 	 * remove all edges
1260 	 */
1261 
1262 	ep = FIRST_ADJ(from_lock);
1263 
1264 	while (ep != HEAD(from_lock)) {
1265 		IN_LIST_REMOVE(ep);
1266 		from_lock->l_sedge = NEXT_ADJ(ep);
1267 		ADJ_LIST_REMOVE(ep);
1268 		flk_free_edge(ep);
1269 		ep = from_lock->l_sedge;
1270 	}
1271 	return (EDEADLK);
1272 }
1273 
1274 /*
1275  * Get an edge structure for representing the dependency between two locks.
1276  */
1277 
1278 static edge_t *
1279 flk_get_edge()
1280 {
1281 	edge_t	*ep;
1282 
1283 	ASSERT(flk_edge_cache != NULL);
1284 
1285 	ep = kmem_cache_alloc(flk_edge_cache, KM_SLEEP);
1286 	edge_allocs++;
1287 	return (ep);
1288 }
1289 
1290 /*
1291  * Free the edge structure.
1292  */
1293 
1294 static void
1295 flk_free_edge(edge_t *ep)
1296 {
1297 	edge_frees++;
1298 	kmem_cache_free(flk_edge_cache, (void *)ep);
1299 }
1300 
1301 /*
1302  * Check the relationship of request with lock and perform the
1303  * recomputation of dependencies, break lock if required, and return
1304  * 1 if request cannot have any more relationship with the next
1305  * active locks.
1306  * The 'lock' and 'request' are compared and in case of overlap we
1307  * delete the 'lock' and form new locks to represent the non-overlapped
1308  * portion of original 'lock'. This function has side effects such as
1309  * 'lock' will be freed, new locks will be added to the active list.
1310  */
1311 
1312 static int
1313 flk_relation(lock_descriptor_t *lock, lock_descriptor_t *request)
1314 {
1315 	int lock_effect;
1316 	lock_descriptor_t *lock1, *lock2;
1317 	lock_descriptor_t *topology[3];
1318 	int nvertex = 0;
1319 	int i;
1320 	edge_t	*ep;
1321 	graph_t	*gp = (lock->l_graph);
1322 
1323 
1324 	CHECK_SLEEPING_LOCKS(gp);
1325 	CHECK_ACTIVE_LOCKS(gp);
1326 
1327 	ASSERT(MUTEX_HELD(&gp->gp_mutex));
1328 
1329 	topology[0] = topology[1] = topology[2] = NULL;
1330 
1331 	if (request->l_type == F_UNLCK)
1332 		lock_effect = FLK_UNLOCK;
1333 	else if (request->l_type == F_RDLCK &&
1334 			lock->l_type == F_WRLCK)
1335 		lock_effect = FLK_DOWNGRADE;
1336 	else if (request->l_type == F_WRLCK &&
1337 			lock->l_type == F_RDLCK)
1338 		lock_effect = FLK_UPGRADE;
1339 	else
1340 		lock_effect = FLK_STAY_SAME;
1341 
1342 	if (lock->l_end < request->l_start) {
1343 		if (lock->l_end == request->l_start - 1 &&
1344 				lock_effect == FLK_STAY_SAME) {
1345 			topology[0] = request;
1346 			request->l_start = lock->l_start;
1347 			nvertex = 1;
1348 			goto recompute;
1349 		} else {
1350 			return (0);
1351 		}
1352 	}
1353 
1354 	if (lock->l_start > request->l_end) {
1355 		if (request->l_end == lock->l_start - 1 &&
1356 					lock_effect == FLK_STAY_SAME) {
1357 			topology[0] = request;
1358 			request->l_end = lock->l_end;
1359 			nvertex = 1;
1360 			goto recompute;
1361 		} else {
1362 			return (1);
1363 		}
1364 	}
1365 
1366 	if (request->l_end < lock->l_end) {
1367 		if (request->l_start > lock->l_start) {
1368 			if (lock_effect == FLK_STAY_SAME) {
1369 				request->l_start = lock->l_start;
1370 				request->l_end = lock->l_end;
1371 				topology[0] = request;
1372 				nvertex = 1;
1373 			} else {
1374 				lock1 = flk_get_lock();
1375 				lock2 = flk_get_lock();
1376 				COPY(lock1, lock);
1377 				COPY(lock2, lock);
1378 				lock1->l_start = lock->l_start;
1379 				lock1->l_end = request->l_start - 1;
1380 				lock2->l_start = request->l_end + 1;
1381 				lock2->l_end = lock->l_end;
1382 				topology[0] = lock1;
1383 				topology[1] = lock2;
1384 				topology[2] = request;
1385 				nvertex = 3;
1386 			}
1387 		} else if (request->l_start < lock->l_start) {
1388 			if (lock_effect == FLK_STAY_SAME) {
1389 				request->l_end = lock->l_end;
1390 				topology[0] = request;
1391 				nvertex = 1;
1392 			} else {
1393 				lock1 = flk_get_lock();
1394 				COPY(lock1, lock);
1395 				lock1->l_start = request->l_end + 1;
1396 				topology[0] = lock1;
1397 				topology[1] = request;
1398 				nvertex = 2;
1399 			}
1400 		} else  {
1401 			if (lock_effect == FLK_STAY_SAME) {
1402 				request->l_start = lock->l_start;
1403 				request->l_end = lock->l_end;
1404 				topology[0] = request;
1405 				nvertex = 1;
1406 			} else {
1407 				lock1 = flk_get_lock();
1408 				COPY(lock1, lock);
1409 				lock1->l_start = request->l_end + 1;
1410 				topology[0] = lock1;
1411 				topology[1] = request;
1412 				nvertex = 2;
1413 			}
1414 		}
1415 	} else if (request->l_end > lock->l_end) {
1416 		if (request->l_start > lock->l_start)  {
1417 			if (lock_effect == FLK_STAY_SAME) {
1418 				request->l_start = lock->l_start;
1419 				topology[0] = request;
1420 				nvertex = 1;
1421 			} else {
1422 				lock1 = flk_get_lock();
1423 				COPY(lock1, lock);
1424 				lock1->l_end = request->l_start - 1;
1425 				topology[0] = lock1;
1426 				topology[1] = request;
1427 				nvertex = 2;
1428 			}
1429 		} else if (request->l_start < lock->l_start)  {
1430 			topology[0] = request;
1431 			nvertex = 1;
1432 		} else {
1433 			topology[0] = request;
1434 			nvertex = 1;
1435 		}
1436 	} else {
1437 		if (request->l_start > lock->l_start) {
1438 			if (lock_effect == FLK_STAY_SAME) {
1439 				request->l_start = lock->l_start;
1440 				topology[0] = request;
1441 				nvertex = 1;
1442 			} else {
1443 				lock1 = flk_get_lock();
1444 				COPY(lock1, lock);
1445 				lock1->l_end = request->l_start - 1;
1446 				topology[0] = lock1;
1447 				topology[1] = request;
1448 				nvertex = 2;
1449 			}
1450 		} else if (request->l_start < lock->l_start) {
1451 			topology[0] = request;
1452 			nvertex = 1;
1453 		} else {
1454 			if (lock_effect !=  FLK_UNLOCK) {
1455 				topology[0] = request;
1456 				nvertex = 1;
1457 			} else {
1458 				flk_delete_active_lock(lock, 0);
1459 				flk_wakeup(lock, 1);
1460 				flk_free_lock(lock);
1461 				CHECK_SLEEPING_LOCKS(gp);
1462 				CHECK_ACTIVE_LOCKS(gp);
1463 				return (1);
1464 			}
1465 		}
1466 	}
1467 
1468 recompute:
1469 
1470 	/*
1471 	 * For unlock we don't send the 'request' to for recomputing
1472 	 * dependencies because no lock will add an edge to this.
1473 	 */
1474 
1475 	if (lock_effect == FLK_UNLOCK) {
1476 		topology[nvertex-1] = NULL;
1477 		nvertex--;
1478 	}
1479 	for (i = 0; i < nvertex; i++) {
1480 		topology[i]->l_state |= RECOMPUTE_LOCK;
1481 		topology[i]->l_color = NO_COLOR;
1482 	}
1483 
1484 	ASSERT(FIRST_ADJ(lock) == HEAD(lock));
1485 
1486 	/*
1487 	 * we remove the adjacent edges for all vertices' to this vertex
1488 	 * 'lock'.
1489 	 */
1490 
1491 	ep = FIRST_IN(lock);
1492 	while (ep != HEAD(lock)) {
1493 		ADJ_LIST_REMOVE(ep);
1494 		ep = NEXT_IN(ep);
1495 	}
1496 
1497 	flk_delete_active_lock(lock, 0);
1498 
1499 	/* We are ready for recomputing the dependencies now */
1500 
1501 	flk_recompute_dependencies(lock, topology, nvertex, 1);
1502 
1503 	for (i = 0; i < nvertex; i++) {
1504 		topology[i]->l_state &= ~RECOMPUTE_LOCK;
1505 		topology[i]->l_color = NO_COLOR;
1506 	}
1507 
1508 
1509 	if (lock_effect == FLK_UNLOCK) {
1510 		nvertex++;
1511 	}
1512 	for (i = 0; i < nvertex - 1; i++) {
1513 		flk_insert_active_lock(topology[i]);
1514 	}
1515 
1516 
1517 	if (lock_effect == FLK_DOWNGRADE || lock_effect == FLK_UNLOCK) {
1518 		flk_wakeup(lock, 0);
1519 	} else {
1520 		ep = FIRST_IN(lock);
1521 		while (ep != HEAD(lock)) {
1522 			lock->l_sedge = NEXT_IN(ep);
1523 			IN_LIST_REMOVE(ep);
1524 			flk_update_proc_graph(ep, 1);
1525 			flk_free_edge(ep);
1526 			ep = lock->l_sedge;
1527 		}
1528 	}
1529 	flk_free_lock(lock);
1530 
1531 	CHECK_SLEEPING_LOCKS(gp);
1532 	CHECK_ACTIVE_LOCKS(gp);
1533 	return (0);
1534 }
1535 
1536 /*
1537  * Insert a lock into the active queue.
1538  */
1539 
1540 static void
1541 flk_insert_active_lock(lock_descriptor_t *new_lock)
1542 {
1543 	graph_t	*gp = new_lock->l_graph;
1544 	vnode_t	*vp = new_lock->l_vnode;
1545 	lock_descriptor_t *first_lock, *lock;
1546 
1547 	ASSERT(MUTEX_HELD(&gp->gp_mutex));
1548 
1549 	SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
1550 	first_lock = lock;
1551 
1552 	if (first_lock != NULL) {
1553 		for (; (lock->l_vnode == vp &&
1554 			lock->l_start < new_lock->l_start); lock = lock->l_next)
1555 			;
1556 	} else {
1557 		lock = ACTIVE_HEAD(gp);
1558 	}
1559 
1560 	lock->l_prev->l_next = new_lock;
1561 	new_lock->l_next = lock;
1562 	new_lock->l_prev = lock->l_prev;
1563 	lock->l_prev = new_lock;
1564 
1565 	if (first_lock == NULL || (new_lock->l_start <= first_lock->l_start)) {
1566 		vp->v_filocks = (struct filock *)new_lock;
1567 	}
1568 	flk_set_state(new_lock, FLK_ACTIVE_STATE);
1569 	new_lock->l_state |= ACTIVE_LOCK;
1570 
1571 	CHECK_ACTIVE_LOCKS(gp);
1572 	CHECK_SLEEPING_LOCKS(gp);
1573 }
1574 
1575 /*
1576  * Delete the active lock : Performs two functions depending on the
1577  * value of second parameter. One is to remove from the active lists
1578  * only and other is to both remove and free the lock.
1579  */
1580 
1581 static void
1582 flk_delete_active_lock(lock_descriptor_t *lock, int free_lock)
1583 {
1584 	vnode_t *vp = lock->l_vnode;
1585 	graph_t	*gp = lock->l_graph;
1586 
1587 	ASSERT(MUTEX_HELD(&gp->gp_mutex));
1588 	if (free_lock)
1589 		ASSERT(NO_DEPENDENTS(lock));
1590 	ASSERT(NOT_BLOCKED(lock));
1591 	ASSERT(IS_ACTIVE(lock));
1592 
1593 	ASSERT((vp->v_filocks != NULL));
1594 
1595 	if (vp->v_filocks == (struct filock *)lock) {
1596 		vp->v_filocks = (struct filock *)
1597 				((lock->l_next->l_vnode == vp) ? lock->l_next :
1598 								NULL);
1599 	}
1600 	lock->l_next->l_prev = lock->l_prev;
1601 	lock->l_prev->l_next = lock->l_next;
1602 	lock->l_next = lock->l_prev = NULL;
1603 	flk_set_state(lock, FLK_DEAD_STATE);
1604 	lock->l_state &= ~ACTIVE_LOCK;
1605 
1606 	if (free_lock)
1607 		flk_free_lock(lock);
1608 	CHECK_ACTIVE_LOCKS(gp);
1609 	CHECK_SLEEPING_LOCKS(gp);
1610 }
1611 
1612 /*
1613  * Insert into the sleep queue.
1614  */
1615 
1616 static void
1617 flk_insert_sleeping_lock(lock_descriptor_t *request)
1618 {
1619 	graph_t *gp = request->l_graph;
1620 	vnode_t	*vp = request->l_vnode;
1621 	lock_descriptor_t	*lock;
1622 
1623 	ASSERT(MUTEX_HELD(&gp->gp_mutex));
1624 	ASSERT(IS_INITIAL(request));
1625 
1626 	for (lock = gp->sleeping_locks.l_next; (lock != &gp->sleeping_locks &&
1627 		lock->l_vnode < vp); lock = lock->l_next)
1628 		;
1629 
1630 	lock->l_prev->l_next = request;
1631 	request->l_prev = lock->l_prev;
1632 	lock->l_prev = request;
1633 	request->l_next = lock;
1634 	flk_set_state(request, FLK_SLEEPING_STATE);
1635 	request->l_state |= SLEEPING_LOCK;
1636 }
1637 
1638 /*
1639  * Cancelling a sleeping lock implies removing a vertex from the
1640  * dependency graph and therefore we should recompute the dependencies
1641  * of all vertices that have a path  to this vertex, w.r.t. all
1642  * vertices reachable from this vertex.
1643  */
1644 
1645 void
1646 flk_cancel_sleeping_lock(lock_descriptor_t *request, int remove_from_queue)
1647 {
1648 	graph_t	*gp = request->l_graph;
1649 	vnode_t *vp = request->l_vnode;
1650 	lock_descriptor_t **topology = NULL;
1651 	edge_t	*ep;
1652 	lock_descriptor_t *vertex, *lock;
1653 	int nvertex = 0;
1654 	int i;
1655 	lock_descriptor_t *vertex_stack;
1656 
1657 	STACK_INIT(vertex_stack);
1658 
1659 	ASSERT(MUTEX_HELD(&gp->gp_mutex));
1660 	/*
1661 	 * count number of vertex pointers that has to be allocated
1662 	 * All vertices that are reachable from request.
1663 	 */
1664 
1665 	STACK_PUSH(vertex_stack, request, l_stack);
1666 
1667 	while ((vertex = STACK_TOP(vertex_stack)) != NULL) {
1668 		STACK_POP(vertex_stack, l_stack);
1669 		for (ep = FIRST_ADJ(vertex); ep != HEAD(vertex);
1670 					ep = NEXT_ADJ(ep)) {
1671 			if (IS_RECOMPUTE(ep->to_vertex))
1672 				continue;
1673 			ep->to_vertex->l_state |= RECOMPUTE_LOCK;
1674 			STACK_PUSH(vertex_stack, ep->to_vertex, l_stack);
1675 			nvertex++;
1676 		}
1677 	}
1678 
1679 	/*
1680 	 * allocate memory for holding the vertex pointers
1681 	 */
1682 
1683 	if (nvertex) {
1684 		topology = kmem_zalloc(nvertex * sizeof (lock_descriptor_t *),
1685 						KM_SLEEP);
1686 	}
1687 
1688 	/*
1689 	 * one more pass to actually store the vertices in the
1690 	 * allocated array.
1691 	 * We first check sleeping locks and then active locks
1692 	 * so that topology array will be in a topological
1693 	 * order.
1694 	 */
1695 
1696 	nvertex = 0;
1697 	SET_LOCK_TO_FIRST_SLEEP_VP(gp, lock, vp);
1698 
1699 	if (lock) {
1700 		do {
1701 			if (IS_RECOMPUTE(lock)) {
1702 				lock->l_index = nvertex;
1703 				topology[nvertex++] = lock;
1704 			}
1705 			lock->l_color = NO_COLOR;
1706 			lock = lock->l_next;
1707 		} while (lock->l_vnode == vp);
1708 	}
1709 
1710 	SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
1711 
1712 	if (lock) {
1713 		do {
1714 			if (IS_RECOMPUTE(lock)) {
1715 				lock->l_index = nvertex;
1716 				topology[nvertex++] = lock;
1717 			}
1718 			lock->l_color = NO_COLOR;
1719 			lock = lock->l_next;
1720 		} while (lock->l_vnode == vp);
1721 	}
1722 
1723 	/*
1724 	 * remove in and out edges of request
1725 	 * They are freed after updating proc_graph below.
1726 	 */
1727 
1728 	for (ep = FIRST_IN(request); ep != HEAD(request); ep = NEXT_IN(ep)) {
1729 		ADJ_LIST_REMOVE(ep);
1730 	}
1731 
1732 
1733 	if (remove_from_queue)
1734 		REMOVE_SLEEP_QUEUE(request);
1735 
1736 	/* we are ready to recompute */
1737 
1738 	flk_recompute_dependencies(request, topology, nvertex, 1);
1739 
1740 	ep = FIRST_ADJ(request);
1741 	while (ep != HEAD(request)) {
1742 		IN_LIST_REMOVE(ep);
1743 		request->l_sedge = NEXT_ADJ(ep);
1744 		ADJ_LIST_REMOVE(ep);
1745 		flk_update_proc_graph(ep, 1);
1746 		flk_free_edge(ep);
1747 		ep = request->l_sedge;
1748 	}
1749 
1750 
1751 	/*
1752 	 * unset the RECOMPUTE flag in those vertices
1753 	 */
1754 
1755 	for (i = 0; i < nvertex; i++) {
1756 		topology[i]->l_state &= ~RECOMPUTE_LOCK;
1757 	}
1758 
1759 	/*
1760 	 * free the topology
1761 	 */
1762 	if (nvertex)
1763 		kmem_free((void *)topology,
1764 			(nvertex * sizeof (lock_descriptor_t *)));
1765 	/*
1766 	 * Possibility of some locks unblocked now
1767 	 */
1768 
1769 	flk_wakeup(request, 0);
1770 
1771 	/*
1772 	 * we expect to have a correctly recomputed graph  now.
1773 	 */
1774 	flk_set_state(request, FLK_DEAD_STATE);
1775 	flk_free_lock(request);
1776 	CHECK_SLEEPING_LOCKS(gp);
1777 	CHECK_ACTIVE_LOCKS(gp);
1778 
1779 }
1780 
1781 /*
1782  * Uncoloring the graph is simply to increment the mark value of the graph
1783  * And only when wrap round takes place will we color all vertices in
1784  * the graph explicitly.
1785  */
1786 
1787 static void
1788 flk_graph_uncolor(graph_t *gp)
1789 {
1790 	lock_descriptor_t *lock;
1791 
1792 	if (gp->mark == UINT_MAX) {
1793 		gp->mark = 1;
1794 	for (lock = ACTIVE_HEAD(gp)->l_next; lock != ACTIVE_HEAD(gp);
1795 					lock = lock->l_next)
1796 			lock->l_color  = 0;
1797 
1798 	for (lock = SLEEPING_HEAD(gp)->l_next; lock != SLEEPING_HEAD(gp);
1799 					lock = lock->l_next)
1800 			lock->l_color  = 0;
1801 	} else {
1802 		gp->mark++;
1803 	}
1804 }
1805 
1806 /*
1807  * Wake up locks that are blocked on the given lock.
1808  */
1809 
1810 static void
1811 flk_wakeup(lock_descriptor_t *lock, int adj_list_remove)
1812 {
1813 	edge_t	*ep;
1814 	graph_t	*gp = lock->l_graph;
1815 	lock_descriptor_t	*lck;
1816 
1817 	ASSERT(MUTEX_HELD(&gp->gp_mutex));
1818 	if (NO_DEPENDENTS(lock))
1819 		return;
1820 	ep = FIRST_IN(lock);
1821 	do {
1822 		/*
1823 		 * delete the edge from the adjacency list
1824 		 * of from vertex. if no more adjacent edges
1825 		 * for this vertex wake this process.
1826 		 */
1827 		lck = ep->from_vertex;
1828 		if (adj_list_remove)
1829 			ADJ_LIST_REMOVE(ep);
1830 		flk_update_proc_graph(ep, 1);
1831 		if (NOT_BLOCKED(lck)) {
1832 			GRANT_WAKEUP(lck);
1833 		}
1834 		lock->l_sedge = NEXT_IN(ep);
1835 		IN_LIST_REMOVE(ep);
1836 		flk_free_edge(ep);
1837 		ep = lock->l_sedge;
1838 	} while (ep != HEAD(lock));
1839 	ASSERT(NO_DEPENDENTS(lock));
1840 }
1841 
1842 /*
1843  * The dependents of request, is checked for its dependency against the
1844  * locks in topology (called topology because the array is and should be in
1845  * topological order for this algorithm, if not in topological order the
1846  * inner loop below might add more edges than necessary. Topological ordering
1847  * of vertices satisfies the property that all edges will be from left to
1848  * right i.e., topology[i] can have an edge to  topology[j], iff i<j)
1849  * If lock l1 in the dependent set of request is dependent (blocked by)
1850  * on lock l2 in topology but does not have a path to it, we add an edge
1851  * in the inner loop below.
1852  *
1853  * We don't want to add an edge between l1 and l2 if there exists
1854  * already a path from l1 to l2, so care has to be taken for those vertices
1855  * that  have two paths to 'request'. These vertices are referred to here
1856  * as barrier locks.
1857  *
1858  * The barriers has to be found (those vertex that originally had two paths
1859  * to request) because otherwise we may end up adding edges unnecessarily
1860  * to vertices in topology, and thus barrier vertices can have an edge
1861  * to a vertex in topology as well a path to it.
1862  */
1863 
1864 static void
1865 flk_recompute_dependencies(lock_descriptor_t *request,
1866 		lock_descriptor_t **topology,
1867 			int nvertex, int update_graph)
1868 {
1869 	lock_descriptor_t *vertex, *lock;
1870 	graph_t	*gp = request->l_graph;
1871 	int i, count;
1872 	int barrier_found = 0;
1873 	edge_t	*ep;
1874 	lock_descriptor_t *vertex_stack;
1875 
1876 	STACK_INIT(vertex_stack);
1877 
1878 	ASSERT(MUTEX_HELD(&gp->gp_mutex));
1879 	if (nvertex == 0)
1880 		return;
1881 	flk_graph_uncolor(request->l_graph);
1882 	barrier_found = flk_find_barriers(request);
1883 	request->l_state |= RECOMPUTE_DONE;
1884 
1885 	STACK_PUSH(vertex_stack, request, l_stack);
1886 	request->l_sedge = FIRST_IN(request);
1887 
1888 
1889 	while ((vertex = STACK_TOP(vertex_stack)) != NULL) {
1890 		if (vertex->l_state & RECOMPUTE_DONE) {
1891 			count = 0;
1892 			goto next_in_edge;
1893 		}
1894 		if (IS_BARRIER(vertex)) {
1895 			/* decrement the barrier count */
1896 			if (vertex->l_index) {
1897 				vertex->l_index--;
1898 				/* this guy will be pushed again anyway ? */
1899 				STACK_POP(vertex_stack, l_stack);
1900 				if (vertex->l_index == 0)  {
1901 				/*
1902 				 * barrier is over we can recompute
1903 				 * dependencies for this lock in the
1904 				 * next stack pop
1905 				 */
1906 					vertex->l_state &= ~BARRIER_LOCK;
1907 				}
1908 				continue;
1909 			}
1910 		}
1911 		vertex->l_state |= RECOMPUTE_DONE;
1912 		flk_graph_uncolor(gp);
1913 		count = flk_color_reachables(vertex);
1914 		for (i = 0; i < nvertex; i++) {
1915 			lock = topology[i];
1916 			if (COLORED(lock))
1917 				continue;
1918 			if (BLOCKS(lock, vertex)) {
1919 				(void) flk_add_edge(vertex, lock,
1920 				    NO_CHECK_CYCLE, update_graph);
1921 				COLOR(lock);
1922 				count++;
1923 				count += flk_color_reachables(lock);
1924 			}
1925 
1926 		}
1927 
1928 next_in_edge:
1929 		if (count == nvertex ||
1930 				vertex->l_sedge == HEAD(vertex)) {
1931 			/* prune the tree below this */
1932 			STACK_POP(vertex_stack, l_stack);
1933 			vertex->l_state &= ~RECOMPUTE_DONE;
1934 			/* update the barrier locks below this! */
1935 			if (vertex->l_sedge != HEAD(vertex) && barrier_found) {
1936 				flk_graph_uncolor(gp);
1937 				flk_update_barriers(vertex);
1938 			}
1939 			continue;
1940 		}
1941 
1942 		ep = vertex->l_sedge;
1943 		lock = ep->from_vertex;
1944 		STACK_PUSH(vertex_stack, lock, l_stack);
1945 		lock->l_sedge = FIRST_IN(lock);
1946 		vertex->l_sedge = NEXT_IN(ep);
1947 	}
1948 
1949 }
1950 
1951 /*
1952  * Color all reachable vertices from vertex that belongs to topology (here
1953  * those that have RECOMPUTE_LOCK set in their state) and yet uncolored.
1954  *
1955  * Note: we need to use a different stack_link l_stack1 because this is
1956  * called from flk_recompute_dependencies() that already uses a stack with
1957  * l_stack as stack_link.
1958  */
1959 
1960 static int
1961 flk_color_reachables(lock_descriptor_t *vertex)
1962 {
1963 	lock_descriptor_t *ver, *lock;
1964 	int count;
1965 	edge_t	*ep;
1966 	lock_descriptor_t *vertex_stack;
1967 
1968 	STACK_INIT(vertex_stack);
1969 
1970 	STACK_PUSH(vertex_stack, vertex, l_stack1);
1971 	count = 0;
1972 	while ((ver = STACK_TOP(vertex_stack)) != NULL) {
1973 
1974 		STACK_POP(vertex_stack, l_stack1);
1975 		for (ep = FIRST_ADJ(ver); ep != HEAD(ver);
1976 					ep = NEXT_ADJ(ep)) {
1977 			lock = ep->to_vertex;
1978 			if (COLORED(lock))
1979 				continue;
1980 			COLOR(lock);
1981 			if (IS_RECOMPUTE(lock))
1982 				count++;
1983 			STACK_PUSH(vertex_stack, lock, l_stack1);
1984 		}
1985 
1986 	}
1987 	return (count);
1988 }
1989 
1990 /*
1991  * Called from flk_recompute_dependencies() this routine decrements
1992  * the barrier count of barrier vertices that are reachable from lock.
1993  */
1994 
1995 static void
1996 flk_update_barriers(lock_descriptor_t *lock)
1997 {
1998 	lock_descriptor_t *vertex, *lck;
1999 	edge_t	*ep;
2000 	lock_descriptor_t *vertex_stack;
2001 
2002 	STACK_INIT(vertex_stack);
2003 
2004 	STACK_PUSH(vertex_stack, lock, l_stack1);
2005 
2006 	while ((vertex = STACK_TOP(vertex_stack)) != NULL) {
2007 		STACK_POP(vertex_stack, l_stack1);
2008 		for (ep = FIRST_IN(vertex); ep != HEAD(vertex);
2009 						ep = NEXT_IN(ep)) {
2010 			lck = ep->from_vertex;
2011 			if (COLORED(lck)) {
2012 				if (IS_BARRIER(lck)) {
2013 					ASSERT(lck->l_index > 0);
2014 					lck->l_index--;
2015 					if (lck->l_index == 0)
2016 						lck->l_state &= ~BARRIER_LOCK;
2017 				}
2018 				continue;
2019 			}
2020 			COLOR(lck);
2021 			if (IS_BARRIER(lck)) {
2022 				ASSERT(lck->l_index > 0);
2023 				lck->l_index--;
2024 				if (lck->l_index == 0)
2025 					lck->l_state &= ~BARRIER_LOCK;
2026 			}
2027 			STACK_PUSH(vertex_stack, lck, l_stack1);
2028 		}
2029 	}
2030 }
2031 
2032 /*
2033  * Finds all vertices that are reachable from 'lock' more than once and
2034  * mark them as barrier vertices and increment their barrier count.
2035  * The barrier count is one minus the total number of paths from lock
2036  * to that vertex.
2037  */
2038 
2039 static int
2040 flk_find_barriers(lock_descriptor_t *lock)
2041 {
2042 	lock_descriptor_t *vertex, *lck;
2043 	int found = 0;
2044 	edge_t	*ep;
2045 	lock_descriptor_t *vertex_stack;
2046 
2047 	STACK_INIT(vertex_stack);
2048 
2049 	STACK_PUSH(vertex_stack, lock, l_stack1);
2050 
2051 	while ((vertex = STACK_TOP(vertex_stack)) != NULL) {
2052 		STACK_POP(vertex_stack, l_stack1);
2053 		for (ep = FIRST_IN(vertex); ep != HEAD(vertex);
2054 						ep = NEXT_IN(ep)) {
2055 			lck = ep->from_vertex;
2056 			if (COLORED(lck)) {
2057 				/* this is a barrier */
2058 				lck->l_state |= BARRIER_LOCK;
2059 				/* index will have barrier count */
2060 				lck->l_index++;
2061 				if (!found)
2062 					found = 1;
2063 				continue;
2064 			}
2065 			COLOR(lck);
2066 			lck->l_index = 0;
2067 			STACK_PUSH(vertex_stack, lck, l_stack1);
2068 		}
2069 	}
2070 	return (found);
2071 }
2072 
2073 /*
2074  * Finds the first lock that is mainly responsible for blocking this
2075  * request.  If there is no such lock, request->l_flock.l_type is set to
2076  * F_UNLCK.  Otherwise, request->l_flock is filled in with the particulars
2077  * of the blocking lock.
2078  *
2079  * Note: It is possible a request is blocked by a sleeping lock because
2080  * of the fairness policy used in flk_process_request() to construct the
2081  * dependencies. (see comments before flk_process_request()).
2082  */
2083 
2084 static void
2085 flk_get_first_blocking_lock(lock_descriptor_t *request)
2086 {
2087 	graph_t	*gp = request->l_graph;
2088 	vnode_t *vp = request->l_vnode;
2089 	lock_descriptor_t *lock, *blocker;
2090 
2091 	ASSERT(MUTEX_HELD(&gp->gp_mutex));
2092 	blocker = NULL;
2093 	SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
2094 
2095 	if (lock) {
2096 		do {
2097 			if (BLOCKS(lock, request)) {
2098 				blocker = lock;
2099 				break;
2100 			}
2101 			lock = lock->l_next;
2102 		} while (lock->l_vnode == vp);
2103 	}
2104 
2105 	if (blocker) {
2106 		report_blocker(blocker, request);
2107 	} else
2108 		request->l_flock.l_type = F_UNLCK;
2109 }
2110 
2111 /*
2112  * Get the graph_t structure associated with a vnode.
2113  * If 'initialize' is non-zero, and the graph_t structure for this vnode has
2114  * not yet been initialized, then a new element is allocated and returned.
2115  */
2116 graph_t *
2117 flk_get_lock_graph(vnode_t *vp, int initialize)
2118 {
2119 	graph_t *gp;
2120 	graph_t *gp_alloc = NULL;
2121 	int index = HASH_INDEX(vp);
2122 
2123 	if (initialize == FLK_USE_GRAPH) {
2124 		mutex_enter(&flock_lock);
2125 		gp = lock_graph[index];
2126 		mutex_exit(&flock_lock);
2127 		return (gp);
2128 	}
2129 
2130 	ASSERT(initialize == FLK_INIT_GRAPH);
2131 
2132 	if (lock_graph[index] == NULL) {
2133 
2134 		gp_alloc = kmem_zalloc(sizeof (graph_t), KM_SLEEP);
2135 
2136 		/* Initialize the graph */
2137 
2138 		gp_alloc->active_locks.l_next =
2139 		    gp_alloc->active_locks.l_prev =
2140 		    (lock_descriptor_t *)ACTIVE_HEAD(gp_alloc);
2141 		gp_alloc->sleeping_locks.l_next =
2142 		    gp_alloc->sleeping_locks.l_prev =
2143 		    (lock_descriptor_t *)SLEEPING_HEAD(gp_alloc);
2144 		gp_alloc->index = index;
2145 		mutex_init(&gp_alloc->gp_mutex, NULL, MUTEX_DEFAULT, NULL);
2146 	}
2147 
2148 	mutex_enter(&flock_lock);
2149 
2150 	gp = lock_graph[index];
2151 
2152 	/* Recheck the value within flock_lock */
2153 	if (gp == NULL) {
2154 		struct flock_globals *fg;
2155 
2156 		/* We must have previously allocated the graph_t structure */
2157 		ASSERT(gp_alloc != NULL);
2158 		lock_graph[index] = gp = gp_alloc;
2159 		/*
2160 		 * The lockmgr status is only needed if KLM is loaded.
2161 		 */
2162 		if (flock_zone_key != ZONE_KEY_UNINITIALIZED) {
2163 			fg = flk_get_globals();
2164 			fg->lockmgr_status[index] = fg->flk_lockmgr_status;
2165 		}
2166 	}
2167 
2168 	mutex_exit(&flock_lock);
2169 
2170 	if ((gp_alloc != NULL) && (gp != gp_alloc)) {
2171 		/* There was a race to allocate the graph_t and we lost */
2172 		mutex_destroy(&gp_alloc->gp_mutex);
2173 		kmem_free(gp_alloc, sizeof (graph_t));
2174 	}
2175 
2176 	return (gp);
2177 }
2178 
2179 /*
2180  * PSARC case 1997/292
2181  */
2182 int
2183 cl_flk_has_remote_locks_for_nlmid(vnode_t *vp, int nlmid)
2184 {
2185 	lock_descriptor_t *lock;
2186 	int result = 0;
2187 	graph_t *gp;
2188 	int			lock_nlmid;
2189 
2190 	/*
2191 	 * Check to see if node is booted as a cluster. If not, return.
2192 	 */
2193 	if ((cluster_bootflags & CLUSTER_BOOTED) == 0) {
2194 		return (0);
2195 	}
2196 
2197 	gp = flk_get_lock_graph(vp, FLK_USE_GRAPH);
2198 	if (gp == NULL) {
2199 		return (0);
2200 	}
2201 
2202 	mutex_enter(&gp->gp_mutex);
2203 
2204 	SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
2205 
2206 	if (lock) {
2207 		while (lock->l_vnode == vp) {
2208 			/* get NLM id from sysid */
2209 			lock_nlmid = GETNLMID(lock->l_flock.l_sysid);
2210 
2211 			/*
2212 			 * If NLM server request _and_ nlmid of lock matches
2213 			 * nlmid of argument, then we've found a remote lock.
2214 			 */
2215 			if (IS_LOCKMGR(lock) && nlmid == lock_nlmid) {
2216 				result = 1;
2217 				goto done;
2218 			}
2219 			lock = lock->l_next;
2220 		}
2221 	}
2222 
2223 	SET_LOCK_TO_FIRST_SLEEP_VP(gp, lock, vp);
2224 
2225 	if (lock) {
2226 		while (lock->l_vnode == vp) {
2227 			/* get NLM id from sysid */
2228 			lock_nlmid = GETNLMID(lock->l_flock.l_sysid);
2229 
2230 			/*
2231 			 * If NLM server request _and_ nlmid of lock matches
2232 			 * nlmid of argument, then we've found a remote lock.
2233 			 */
2234 			if (IS_LOCKMGR(lock) && nlmid == lock_nlmid) {
2235 				result = 1;
2236 				goto done;
2237 			}
2238 			lock = lock->l_next;
2239 		}
2240 	}
2241 
2242 done:
2243 	mutex_exit(&gp->gp_mutex);
2244 	return (result);
2245 }
2246 
2247 /* ONC_PLUS EXTRACT START */
2248 /*
2249  * Determine whether there are any locks for the given vnode with a remote
2250  * sysid.  Returns zero if not, non-zero if there are.
2251  *
2252  * Note that the return value from this function is potentially invalid
2253  * once it has been returned.  The caller is responsible for providing its
2254  * own synchronization mechanism to ensure that the return value is useful
2255  * (e.g., see nfs_lockcompletion()).
2256  */
2257 int
2258 flk_has_remote_locks(vnode_t *vp)
2259 {
2260 	lock_descriptor_t *lock;
2261 	int result = 0;
2262 	graph_t *gp;
2263 
2264 	gp = flk_get_lock_graph(vp, FLK_USE_GRAPH);
2265 	if (gp == NULL) {
2266 		return (0);
2267 	}
2268 
2269 	mutex_enter(&gp->gp_mutex);
2270 
2271 	SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
2272 
2273 	if (lock) {
2274 		while (lock->l_vnode == vp) {
2275 			if (IS_REMOTE(lock)) {
2276 				result = 1;
2277 				goto done;
2278 			}
2279 			lock = lock->l_next;
2280 		}
2281 	}
2282 
2283 	SET_LOCK_TO_FIRST_SLEEP_VP(gp, lock, vp);
2284 
2285 	if (lock) {
2286 		while (lock->l_vnode == vp) {
2287 			if (IS_REMOTE(lock)) {
2288 				result = 1;
2289 				goto done;
2290 			}
2291 			lock = lock->l_next;
2292 		}
2293 	}
2294 
2295 done:
2296 	mutex_exit(&gp->gp_mutex);
2297 	return (result);
2298 }
2299 
2300 /*
2301  * Determine if there are any locks owned by the given sysid.
2302  * Returns zero if not, non-zero if there are.  Note that this return code
2303  * could be derived from flk_get_{sleeping,active}_locks, but this routine
2304  * avoids all the memory allocations of those routines.
2305  *
2306  * This routine has the same synchronization issues as
2307  * flk_has_remote_locks.
2308  */
2309 
2310 int
2311 flk_sysid_has_locks(int sysid, int lck_type)
2312 {
2313 	int		has_locks = 0;
2314 	lock_descriptor_t	*lock;
2315 	graph_t 	*gp;
2316 	int		i;
2317 
2318 	for (i = 0; i < HASH_SIZE && !has_locks; i++) {
2319 		mutex_enter(&flock_lock);
2320 		gp = lock_graph[i];
2321 		mutex_exit(&flock_lock);
2322 		if (gp == NULL) {
2323 			continue;
2324 		}
2325 
2326 		mutex_enter(&gp->gp_mutex);
2327 
2328 		if (lck_type & FLK_QUERY_ACTIVE) {
2329 			for (lock = ACTIVE_HEAD(gp)->l_next;
2330 			    lock != ACTIVE_HEAD(gp) && !has_locks;
2331 			    lock = lock->l_next) {
2332 				if (lock->l_flock.l_sysid == sysid)
2333 					has_locks = 1;
2334 			}
2335 		}
2336 
2337 		if (lck_type & FLK_QUERY_SLEEPING) {
2338 			for (lock = SLEEPING_HEAD(gp)->l_next;
2339 				lock != SLEEPING_HEAD(gp) && !has_locks;
2340 				lock = lock->l_next) {
2341 				if (lock->l_flock.l_sysid == sysid)
2342 					has_locks = 1;
2343 			}
2344 		}
2345 		mutex_exit(&gp->gp_mutex);
2346 	}
2347 
2348 	return (has_locks);
2349 }
2350 
2351 
2352 /*
2353  * PSARC case 1997/292
2354  *
2355  * Requires: "sysid" is a pair [nlmid, sysid].  The lower half is 16-bit
2356  *  quantity, the real sysid generated by the NLM server; the upper half
2357  *  identifies the node of the cluster where the NLM server ran.
2358  *  This routine is only called by an NLM server running in a cluster.
2359  * Effects: Remove all locks held on behalf of the client identified
2360  *  by "sysid."
2361  */
2362 void
2363 cl_flk_remove_locks_by_sysid(int sysid)
2364 {
2365 	graph_t	*gp;
2366 	int i;
2367 	lock_descriptor_t *lock, *nlock;
2368 
2369 	/*
2370 	 * Check to see if node is booted as a cluster. If not, return.
2371 	 */
2372 	if ((cluster_bootflags & CLUSTER_BOOTED) == 0) {
2373 		return;
2374 	}
2375 
2376 	ASSERT(sysid != 0);
2377 	for (i = 0; i < HASH_SIZE; i++) {
2378 		mutex_enter(&flock_lock);
2379 		gp = lock_graph[i];
2380 		mutex_exit(&flock_lock);
2381 
2382 		if (gp == NULL)
2383 			continue;
2384 
2385 		mutex_enter(&gp->gp_mutex);	/*  get mutex on lock graph */
2386 
2387 		/* signal sleeping requests so that they bail out */
2388 		lock = SLEEPING_HEAD(gp)->l_next;
2389 		while (lock != SLEEPING_HEAD(gp)) {
2390 			nlock = lock->l_next;
2391 			if (lock->l_flock.l_sysid == sysid) {
2392 				INTERRUPT_WAKEUP(lock);
2393 			}
2394 			lock = nlock;
2395 		}
2396 
2397 		/* delete active locks */
2398 		lock = ACTIVE_HEAD(gp)->l_next;
2399 		while (lock != ACTIVE_HEAD(gp)) {
2400 			nlock = lock->l_next;
2401 			if (lock->l_flock.l_sysid == sysid) {
2402 				flk_delete_active_lock(lock, 0);
2403 				flk_wakeup(lock, 1);
2404 				flk_free_lock(lock);
2405 			}
2406 			lock = nlock;
2407 		}
2408 		mutex_exit(&gp->gp_mutex);    /* release mutex on lock graph */
2409 	}
2410 }
2411 
2412 /*
2413  * Delete all locks in the system that belongs to the sysid of the request.
2414  */
2415 
2416 static void
2417 flk_delete_locks_by_sysid(lock_descriptor_t *request)
2418 {
2419 	int	sysid  = request->l_flock.l_sysid;
2420 	lock_descriptor_t *lock, *nlock;
2421 	graph_t	*gp;
2422 	int i;
2423 
2424 	ASSERT(MUTEX_HELD(&request->l_graph->gp_mutex));
2425 	ASSERT(sysid != 0);
2426 
2427 	mutex_exit(&request->l_graph->gp_mutex);
2428 
2429 	for (i = 0; i < HASH_SIZE; i++) {
2430 		mutex_enter(&flock_lock);
2431 		gp = lock_graph[i];
2432 		mutex_exit(&flock_lock);
2433 
2434 		if (gp == NULL)
2435 			continue;
2436 
2437 		mutex_enter(&gp->gp_mutex);
2438 
2439 		/* signal sleeping requests so that they bail out */
2440 		lock = SLEEPING_HEAD(gp)->l_next;
2441 		while (lock != SLEEPING_HEAD(gp)) {
2442 			nlock = lock->l_next;
2443 			if (lock->l_flock.l_sysid == sysid) {
2444 				INTERRUPT_WAKEUP(lock);
2445 			}
2446 			lock = nlock;
2447 		}
2448 
2449 		/* delete active locks */
2450 		lock = ACTIVE_HEAD(gp)->l_next;
2451 		while (lock != ACTIVE_HEAD(gp)) {
2452 			nlock = lock->l_next;
2453 			if (lock->l_flock.l_sysid == sysid) {
2454 				flk_delete_active_lock(lock, 0);
2455 				flk_wakeup(lock, 1);
2456 				flk_free_lock(lock);
2457 			}
2458 			lock = nlock;
2459 		}
2460 		mutex_exit(&gp->gp_mutex);
2461 	}
2462 
2463 	mutex_enter(&request->l_graph->gp_mutex);
2464 }
2465 
2466 /*
2467  * Clustering: Deletes PXFS locks
2468  * Effects: Delete all locks on files in the given file system and with the
2469  *  given PXFS id.
2470  */
2471 void
2472 cl_flk_delete_pxfs_locks(struct vfs *vfsp, int pxfsid)
2473 {
2474 	lock_descriptor_t *lock, *nlock;
2475 	graph_t	*gp;
2476 	int i;
2477 
2478 	for (i = 0; i < HASH_SIZE; i++) {
2479 		mutex_enter(&flock_lock);
2480 		gp = lock_graph[i];
2481 		mutex_exit(&flock_lock);
2482 
2483 		if (gp == NULL)
2484 			continue;
2485 
2486 		mutex_enter(&gp->gp_mutex);
2487 
2488 		/* signal sleeping requests so that they bail out */
2489 		lock = SLEEPING_HEAD(gp)->l_next;
2490 		while (lock != SLEEPING_HEAD(gp)) {
2491 			nlock = lock->l_next;
2492 			if (lock->l_vnode->v_vfsp == vfsp) {
2493 				ASSERT(IS_PXFS(lock));
2494 				if (GETPXFSID(lock->l_flock.l_sysid) ==
2495 				    pxfsid) {
2496 					flk_set_state(lock,
2497 					    FLK_CANCELLED_STATE);
2498 					flk_cancel_sleeping_lock(lock, 1);
2499 				}
2500 			}
2501 			lock = nlock;
2502 		}
2503 
2504 		/* delete active locks */
2505 		lock = ACTIVE_HEAD(gp)->l_next;
2506 		while (lock != ACTIVE_HEAD(gp)) {
2507 			nlock = lock->l_next;
2508 			if (lock->l_vnode->v_vfsp == vfsp) {
2509 				ASSERT(IS_PXFS(lock));
2510 				if (GETPXFSID(lock->l_flock.l_sysid) ==
2511 				    pxfsid) {
2512 					flk_delete_active_lock(lock, 0);
2513 					flk_wakeup(lock, 1);
2514 					flk_free_lock(lock);
2515 				}
2516 			}
2517 			lock = nlock;
2518 		}
2519 		mutex_exit(&gp->gp_mutex);
2520 	}
2521 }
2522 
2523 /*
2524  * Search for a sleeping lock manager lock which matches exactly this lock
2525  * request; if one is found, fake a signal to cancel it.
2526  *
2527  * Return 1 if a matching lock was found, 0 otherwise.
2528  */
2529 
2530 static int
2531 flk_canceled(lock_descriptor_t *request)
2532 {
2533 	lock_descriptor_t *lock, *nlock;
2534 	graph_t *gp = request->l_graph;
2535 	vnode_t *vp = request->l_vnode;
2536 
2537 	ASSERT(MUTEX_HELD(&gp->gp_mutex));
2538 	ASSERT(IS_LOCKMGR(request));
2539 	SET_LOCK_TO_FIRST_SLEEP_VP(gp, lock, vp);
2540 
2541 	if (lock) {
2542 		while (lock->l_vnode == vp) {
2543 			nlock = lock->l_next;
2544 			if (SAME_OWNER(lock, request) &&
2545 				lock->l_start == request->l_start &&
2546 					lock->l_end == request->l_end) {
2547 				INTERRUPT_WAKEUP(lock);
2548 				return (1);
2549 			}
2550 			lock = nlock;
2551 		}
2552 	}
2553 	return (0);
2554 }
2555 
2556 /*
2557  * Remove all the locks for the vnode belonging to the given pid and sysid.
2558  */
2559 
2560 void
2561 cleanlocks(vnode_t *vp, pid_t pid, int sysid)
2562 {
2563 	graph_t	*gp;
2564 	lock_descriptor_t *lock, *nlock;
2565 	lock_descriptor_t *link_stack;
2566 
2567 	STACK_INIT(link_stack);
2568 
2569 	gp = flk_get_lock_graph(vp, FLK_USE_GRAPH);
2570 
2571 	if (gp == NULL)
2572 		return;
2573 	mutex_enter(&gp->gp_mutex);
2574 
2575 	CHECK_SLEEPING_LOCKS(gp);
2576 	CHECK_ACTIVE_LOCKS(gp);
2577 
2578 	SET_LOCK_TO_FIRST_SLEEP_VP(gp, lock, vp);
2579 
2580 	if (lock) {
2581 		do {
2582 			nlock = lock->l_next;
2583 			if ((lock->l_flock.l_pid == pid ||
2584 					pid == IGN_PID) &&
2585 				lock->l_flock.l_sysid == sysid) {
2586 				CANCEL_WAKEUP(lock);
2587 			}
2588 			lock = nlock;
2589 		} while (lock->l_vnode == vp);
2590 	}
2591 
2592 	SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
2593 
2594 	if (lock) {
2595 		do {
2596 			nlock = lock->l_next;
2597 			if ((lock->l_flock.l_pid == pid ||
2598 					pid == IGN_PID) &&
2599 				lock->l_flock.l_sysid == sysid) {
2600 				flk_delete_active_lock(lock, 0);
2601 				STACK_PUSH(link_stack, lock, l_stack);
2602 			}
2603 			lock = nlock;
2604 		} while (lock->l_vnode == vp);
2605 	}
2606 
2607 	while ((lock = STACK_TOP(link_stack)) != NULL) {
2608 		STACK_POP(link_stack, l_stack);
2609 		flk_wakeup(lock, 1);
2610 		flk_free_lock(lock);
2611 	}
2612 
2613 	CHECK_SLEEPING_LOCKS(gp);
2614 	CHECK_ACTIVE_LOCKS(gp);
2615 	CHECK_OWNER_LOCKS(gp, pid, sysid, vp);
2616 	mutex_exit(&gp->gp_mutex);
2617 }
2618 /* ONC_PLUS EXTRACT END */
2619 
2620 
2621 /*
2622  * Called from 'fs' read and write routines for files that have mandatory
2623  * locking enabled.
2624  */
2625 
2626 int
2627 chklock(
2628 	struct vnode	*vp,
2629 	int 		iomode,
2630 	u_offset_t	offset,
2631 	ssize_t		len,
2632 	int 		fmode,
2633 	caller_context_t *ct)
2634 {
2635 	register int	i;
2636 	struct flock64 	bf;
2637 	int 		error = 0;
2638 
2639 	bf.l_type = (iomode & FWRITE) ? F_WRLCK : F_RDLCK;
2640 	bf.l_whence = 0;
2641 	bf.l_start = offset;
2642 	bf.l_len = len;
2643 	if (ct == NULL) {
2644 		bf.l_pid = curproc->p_pid;
2645 		bf.l_sysid = 0;
2646 	} else {
2647 		bf.l_pid = ct->cc_pid;
2648 		bf.l_sysid = ct->cc_sysid;
2649 	}
2650 	i = (fmode & (FNDELAY|FNONBLOCK)) ? INOFLCK : INOFLCK|SLPFLCK;
2651 	if ((i = reclock(vp, &bf, i, 0, offset, NULL)) != 0 ||
2652 	    bf.l_type != F_UNLCK)
2653 		error = i ? i : EAGAIN;
2654 	return (error);
2655 }
2656 
2657 /* ONC_PLUS EXTRACT START */
2658 /*
2659  * convoff - converts the given data (start, whence) to the
2660  * given whence.
2661  */
2662 int
2663 convoff(vp, lckdat, whence, offset)
2664 	struct vnode 	*vp;
2665 	struct flock64 	*lckdat;
2666 	int 		whence;
2667 	offset_t	offset;
2668 {
2669 	int 		error;
2670 	struct vattr 	vattr;
2671 
2672 	if ((lckdat->l_whence == 2) || (whence == 2)) {
2673 		vattr.va_mask = AT_SIZE;
2674 		if (error = VOP_GETATTR(vp, &vattr, 0, CRED()))
2675 			return (error);
2676 	}
2677 
2678 	switch (lckdat->l_whence) {
2679 	case 1:
2680 		lckdat->l_start += offset;
2681 		break;
2682 	case 2:
2683 		lckdat->l_start += vattr.va_size;
2684 		/* FALLTHRU */
2685 	case 0:
2686 		break;
2687 	default:
2688 		return (EINVAL);
2689 	}
2690 
2691 	if (lckdat->l_start < 0)
2692 		return (EINVAL);
2693 
2694 	switch (whence) {
2695 	case 1:
2696 		lckdat->l_start -= offset;
2697 		break;
2698 	case 2:
2699 		lckdat->l_start -= vattr.va_size;
2700 		/* FALLTHRU */
2701 	case 0:
2702 		break;
2703 	default:
2704 		return (EINVAL);
2705 	}
2706 
2707 	lckdat->l_whence = (short)whence;
2708 	return (0);
2709 }
2710 /* ONC_PLUS EXTRACT END */
2711 
2712 
2713 /* 	proc_graph function definitions */
2714 
2715 /*
2716  * Function checks for deadlock due to the new 'lock'. If deadlock found
2717  * edges of this lock are freed and returned.
2718  */
2719 
2720 static int
2721 flk_check_deadlock(lock_descriptor_t *lock)
2722 {
2723 	proc_vertex_t	*start_vertex, *pvertex;
2724 	proc_vertex_t *dvertex;
2725 	proc_edge_t *pep, *ppep;
2726 	edge_t	*ep, *nep;
2727 	proc_vertex_t *process_stack;
2728 
2729 	STACK_INIT(process_stack);
2730 
2731 	mutex_enter(&flock_lock);
2732 	start_vertex = flk_get_proc_vertex(lock);
2733 	ASSERT(start_vertex != NULL);
2734 
2735 	/* construct the edges from this process to other processes */
2736 
2737 	ep = FIRST_ADJ(lock);
2738 	while (ep != HEAD(lock)) {
2739 		proc_vertex_t *adj_proc;
2740 
2741 		adj_proc = flk_get_proc_vertex(ep->to_vertex);
2742 		for (pep = start_vertex->edge; pep != NULL; pep = pep->next) {
2743 			if (pep->to_proc == adj_proc) {
2744 				ASSERT(pep->refcount);
2745 				pep->refcount++;
2746 				break;
2747 			}
2748 		}
2749 		if (pep == NULL) {
2750 			pep = flk_get_proc_edge();
2751 			pep->to_proc = adj_proc;
2752 			pep->refcount = 1;
2753 			adj_proc->incount++;
2754 			pep->next = start_vertex->edge;
2755 			start_vertex->edge = pep;
2756 		}
2757 		ep = NEXT_ADJ(ep);
2758 	}
2759 
2760 	ep = FIRST_IN(lock);
2761 
2762 	while (ep != HEAD(lock)) {
2763 		proc_vertex_t *in_proc;
2764 
2765 		in_proc = flk_get_proc_vertex(ep->from_vertex);
2766 
2767 		for (pep = in_proc->edge; pep != NULL; pep = pep->next) {
2768 			if (pep->to_proc == start_vertex) {
2769 				ASSERT(pep->refcount);
2770 				pep->refcount++;
2771 				break;
2772 			}
2773 		}
2774 		if (pep == NULL) {
2775 			pep = flk_get_proc_edge();
2776 			pep->to_proc = start_vertex;
2777 			pep->refcount = 1;
2778 			start_vertex->incount++;
2779 			pep->next = in_proc->edge;
2780 			in_proc->edge = pep;
2781 		}
2782 		ep = NEXT_IN(ep);
2783 	}
2784 
2785 	if (start_vertex->incount == 0) {
2786 		mutex_exit(&flock_lock);
2787 		return (0);
2788 	}
2789 
2790 	flk_proc_graph_uncolor();
2791 
2792 	start_vertex->p_sedge = start_vertex->edge;
2793 
2794 	STACK_PUSH(process_stack, start_vertex, p_stack);
2795 
2796 	while ((pvertex = STACK_TOP(process_stack)) != NULL) {
2797 		for (pep = pvertex->p_sedge; pep != NULL; pep = pep->next) {
2798 			dvertex = pep->to_proc;
2799 			if (!PROC_ARRIVED(dvertex)) {
2800 				STACK_PUSH(process_stack, dvertex, p_stack);
2801 				dvertex->p_sedge = dvertex->edge;
2802 				PROC_ARRIVE(pvertex);
2803 				pvertex->p_sedge = pep->next;
2804 				break;
2805 			}
2806 			if (!PROC_DEPARTED(dvertex))
2807 				goto deadlock;
2808 		}
2809 		if (pep == NULL) {
2810 			PROC_DEPART(pvertex);
2811 			STACK_POP(process_stack, p_stack);
2812 		}
2813 	}
2814 	mutex_exit(&flock_lock);
2815 	return (0);
2816 
2817 deadlock:
2818 
2819 	/* we remove all lock edges and proc edges */
2820 
2821 	ep = FIRST_ADJ(lock);
2822 	while (ep != HEAD(lock)) {
2823 		proc_vertex_t *adj_proc;
2824 		adj_proc = flk_get_proc_vertex(ep->to_vertex);
2825 		nep = NEXT_ADJ(ep);
2826 		IN_LIST_REMOVE(ep);
2827 		ADJ_LIST_REMOVE(ep);
2828 		flk_free_edge(ep);
2829 		ppep = start_vertex->edge;
2830 		for (pep = start_vertex->edge; pep != NULL; ppep = pep,
2831 						pep = ppep->next) {
2832 			if (pep->to_proc == adj_proc) {
2833 				pep->refcount--;
2834 				if (pep->refcount == 0) {
2835 					if (pep == ppep) {
2836 						start_vertex->edge = pep->next;
2837 					} else {
2838 						ppep->next = pep->next;
2839 					}
2840 					adj_proc->incount--;
2841 					flk_proc_release(adj_proc);
2842 					flk_free_proc_edge(pep);
2843 				}
2844 				break;
2845 			}
2846 		}
2847 		ep = nep;
2848 	}
2849 	ep = FIRST_IN(lock);
2850 	while (ep != HEAD(lock)) {
2851 		proc_vertex_t *in_proc;
2852 		in_proc = flk_get_proc_vertex(ep->from_vertex);
2853 		nep = NEXT_IN(ep);
2854 		IN_LIST_REMOVE(ep);
2855 		ADJ_LIST_REMOVE(ep);
2856 		flk_free_edge(ep);
2857 		ppep = in_proc->edge;
2858 		for (pep = in_proc->edge; pep != NULL; ppep = pep,
2859 						pep = ppep->next) {
2860 			if (pep->to_proc == start_vertex) {
2861 				pep->refcount--;
2862 				if (pep->refcount == 0) {
2863 					if (pep == ppep) {
2864 						in_proc->edge = pep->next;
2865 					} else {
2866 						ppep->next = pep->next;
2867 					}
2868 					start_vertex->incount--;
2869 					flk_proc_release(in_proc);
2870 					flk_free_proc_edge(pep);
2871 				}
2872 				break;
2873 			}
2874 		}
2875 		ep = nep;
2876 	}
2877 	flk_proc_release(start_vertex);
2878 	mutex_exit(&flock_lock);
2879 	return (1);
2880 }
2881 
2882 /*
2883  * Get a proc vertex. If lock's pvertex value gets a correct proc vertex
2884  * from the list we return that, otherwise we allocate one. If necessary,
2885  * we grow the list of vertices also.
2886  */
2887 
2888 static proc_vertex_t *
2889 flk_get_proc_vertex(lock_descriptor_t *lock)
2890 {
2891 	int i;
2892 	proc_vertex_t	*pv;
2893 	proc_vertex_t	**palloc;
2894 
2895 	ASSERT(MUTEX_HELD(&flock_lock));
2896 	if (lock->pvertex != -1) {
2897 		ASSERT(lock->pvertex >= 0);
2898 		pv = pgraph.proc[lock->pvertex];
2899 		if (pv != NULL && PROC_SAME_OWNER(lock, pv)) {
2900 			return (pv);
2901 		}
2902 	}
2903 	for (i = 0; i < pgraph.gcount; i++) {
2904 		pv = pgraph.proc[i];
2905 		if (pv != NULL && PROC_SAME_OWNER(lock, pv)) {
2906 			lock->pvertex = pv->index = i;
2907 			return (pv);
2908 		}
2909 	}
2910 	pv = kmem_zalloc(sizeof (struct proc_vertex), KM_SLEEP);
2911 	pv->pid = lock->l_flock.l_pid;
2912 	pv->sysid = lock->l_flock.l_sysid;
2913 	flk_proc_vertex_allocs++;
2914 	if (pgraph.free != 0) {
2915 		for (i = 0; i < pgraph.gcount; i++) {
2916 			if (pgraph.proc[i] == NULL) {
2917 				pgraph.proc[i] = pv;
2918 				lock->pvertex = pv->index = i;
2919 				pgraph.free--;
2920 				return (pv);
2921 			}
2922 		}
2923 	}
2924 	palloc = kmem_zalloc((pgraph.gcount + PROC_CHUNK) *
2925 				sizeof (proc_vertex_t *), KM_SLEEP);
2926 
2927 	if (pgraph.proc) {
2928 		bcopy(pgraph.proc, palloc,
2929 			pgraph.gcount * sizeof (proc_vertex_t *));
2930 
2931 		kmem_free(pgraph.proc,
2932 			pgraph.gcount * sizeof (proc_vertex_t *));
2933 	}
2934 	pgraph.proc = palloc;
2935 	pgraph.free += (PROC_CHUNK - 1);
2936 	pv->index = lock->pvertex = pgraph.gcount;
2937 	pgraph.gcount += PROC_CHUNK;
2938 	pgraph.proc[pv->index] = pv;
2939 	return (pv);
2940 }
2941 
2942 /*
2943  * Allocate a proc edge.
2944  */
2945 
2946 static proc_edge_t *
2947 flk_get_proc_edge()
2948 {
2949 	proc_edge_t *pep;
2950 
2951 	pep = kmem_zalloc(sizeof (proc_edge_t), KM_SLEEP);
2952 	flk_proc_edge_allocs++;
2953 	return (pep);
2954 }
2955 
2956 /*
2957  * Free the proc edge. Called whenever its reference count goes to zero.
2958  */
2959 
2960 static void
2961 flk_free_proc_edge(proc_edge_t *pep)
2962 {
2963 	ASSERT(pep->refcount == 0);
2964 	kmem_free((void *)pep, sizeof (proc_edge_t));
2965 	flk_proc_edge_frees++;
2966 }
2967 
2968 /*
2969  * Color the graph explicitly done only when the mark value hits max value.
2970  */
2971 
2972 static void
2973 flk_proc_graph_uncolor()
2974 {
2975 	int i;
2976 
2977 	if (pgraph.mark == UINT_MAX) {
2978 		for (i = 0; i < pgraph.gcount; i++)
2979 			if (pgraph.proc[i] != NULL) {
2980 				pgraph.proc[i]->atime = 0;
2981 				pgraph.proc[i]->dtime = 0;
2982 			}
2983 		pgraph.mark = 1;
2984 	} else {
2985 		pgraph.mark++;
2986 	}
2987 }
2988 
2989 /*
2990  * Release the proc vertex iff both there are no in edges and out edges
2991  */
2992 
2993 static void
2994 flk_proc_release(proc_vertex_t *proc)
2995 {
2996 	ASSERT(MUTEX_HELD(&flock_lock));
2997 	if (proc->edge == NULL && proc->incount == 0) {
2998 		pgraph.proc[proc->index] = NULL;
2999 		pgraph.free++;
3000 		kmem_free(proc, sizeof (proc_vertex_t));
3001 		flk_proc_vertex_frees++;
3002 	}
3003 }
3004 
3005 /*
3006  * Updates process graph to reflect change in a lock_graph.
3007  * Note: We should call this function only after we have a correctly
3008  * recomputed lock graph. Otherwise we might miss a deadlock detection.
3009  * eg: in function flk_relation() we call this function after flk_recompute_
3010  * dependencies() otherwise if a process tries to lock a vnode hashed
3011  * into another graph it might sleep for ever.
3012  */
3013 
3014 static void
3015 flk_update_proc_graph(edge_t *ep, int delete)
3016 {
3017 	proc_vertex_t *toproc, *fromproc;
3018 	proc_edge_t *pep, *prevpep;
3019 
3020 	mutex_enter(&flock_lock);
3021 	toproc = flk_get_proc_vertex(ep->to_vertex);
3022 	fromproc = flk_get_proc_vertex(ep->from_vertex);
3023 
3024 	if (!delete)
3025 		goto add;
3026 	pep = prevpep = fromproc->edge;
3027 
3028 	ASSERT(pep != NULL);
3029 	while (pep != NULL) {
3030 		if (pep->to_proc == toproc) {
3031 			ASSERT(pep->refcount > 0);
3032 			pep->refcount--;
3033 			if (pep->refcount == 0) {
3034 				if (pep == prevpep) {
3035 					fromproc->edge = pep->next;
3036 				} else {
3037 					prevpep->next = pep->next;
3038 				}
3039 				toproc->incount--;
3040 				flk_proc_release(toproc);
3041 				flk_free_proc_edge(pep);
3042 			}
3043 			break;
3044 		}
3045 		prevpep = pep;
3046 		pep = pep->next;
3047 	}
3048 	flk_proc_release(fromproc);
3049 	mutex_exit(&flock_lock);
3050 	return;
3051 add:
3052 
3053 	pep = fromproc->edge;
3054 
3055 	while (pep != NULL) {
3056 		if (pep->to_proc == toproc) {
3057 			ASSERT(pep->refcount > 0);
3058 			pep->refcount++;
3059 			break;
3060 		}
3061 		pep = pep->next;
3062 	}
3063 	if (pep == NULL) {
3064 		pep = flk_get_proc_edge();
3065 		pep->to_proc = toproc;
3066 		pep->refcount = 1;
3067 		toproc->incount++;
3068 		pep->next = fromproc->edge;
3069 		fromproc->edge = pep;
3070 	}
3071 	mutex_exit(&flock_lock);
3072 }
3073 
3074 /* ONC_PLUS EXTRACT START */
3075 /*
3076  * Set the control status for lock manager requests.
3077  *
3078  */
3079 
3080 /*
3081  * PSARC case 1997/292
3082  *
3083  * Requires: "nlmid" must be >= 1 and <= clconf_maximum_nodeid().
3084  * Effects: Set the state of the NLM server identified by "nlmid"
3085  *   in the NLM registry to state "nlm_state."
3086  *   Raises exception no_such_nlm if "nlmid" doesn't identify a known
3087  *   NLM server to this LLM.
3088  *   Note that when this routine is called with NLM_SHUTTING_DOWN there
3089  *   may be locks requests that have gotten started but not finished.  In
3090  *   particular, there may be blocking requests that are in the callback code
3091  *   before sleeping (so they're not holding the lock for the graph).  If
3092  *   such a thread reacquires the graph's lock (to go to sleep) after
3093  *   NLM state in the NLM registry  is set to a non-up value,
3094  *   it will notice the status and bail out.  If the request gets
3095  *   granted before the thread can check the NLM registry, let it
3096  *   continue normally.  It will get flushed when we are called with NLM_DOWN.
3097  *
3098  * Modifies: nlm_reg_obj (global)
3099  * Arguments:
3100  *    nlmid	(IN):    id uniquely identifying an NLM server
3101  *    nlm_state (IN):    NLM server state to change "nlmid" to
3102  */
3103 void
3104 cl_flk_set_nlm_status(int nlmid, flk_nlm_status_t nlm_state)
3105 {
3106 	/*
3107 	 * Check to see if node is booted as a cluster. If not, return.
3108 	 */
3109 	if ((cluster_bootflags & CLUSTER_BOOTED) == 0) {
3110 		return;
3111 	}
3112 
3113 	/*
3114 	 * Check for development/debugging.  It is possible to boot a node
3115 	 * in non-cluster mode, and then run a special script, currently
3116 	 * available only to developers, to bring up the node as part of a
3117 	 * cluster.  The problem is that running such a script does not
3118 	 * result in the routine flk_init() being called and hence global array
3119 	 * nlm_reg_status is NULL.  The NLM thinks it's in cluster mode,
3120 	 * but the LLM needs to do an additional check to see if the global
3121 	 * array has been created or not. If nlm_reg_status is NULL, then
3122 	 * return, else continue.
3123 	 */
3124 	if (nlm_reg_status == NULL) {
3125 		return;
3126 	}
3127 
3128 	ASSERT(nlmid <= nlm_status_size && nlmid >= 0);
3129 	mutex_enter(&nlm_reg_lock);
3130 
3131 	if (FLK_REGISTRY_IS_NLM_UNKNOWN(nlm_reg_status, nlmid)) {
3132 		/*
3133 		 * If the NLM server "nlmid" is unknown in the NLM registry,
3134 		 * add it to the registry in the nlm shutting down state.
3135 		 */
3136 		FLK_REGISTRY_CHANGE_NLM_STATE(nlm_reg_status, nlmid,
3137 			FLK_NLM_SHUTTING_DOWN);
3138 	} else {
3139 		/*
3140 		 * Change the state of the NLM server identified by "nlmid"
3141 		 * in the NLM registry to the argument "nlm_state."
3142 		 */
3143 		FLK_REGISTRY_CHANGE_NLM_STATE(nlm_reg_status, nlmid,
3144 			nlm_state);
3145 	}
3146 
3147 	/*
3148 	 *  The reason we must register the NLM server that is shutting down
3149 	 *  with an LLM that doesn't already know about it (never sent a lock
3150 	 *  request) is to handle correctly a race between shutdown and a new
3151 	 *  lock request.  Suppose that a shutdown request from the NLM server
3152 	 *  invokes this routine at the LLM, and a thread is spawned to
3153 	 *  service the request. Now suppose a new lock request is in
3154 	 *  progress and has already passed the first line of defense in
3155 	 *  reclock(), which denies new locks requests from NLM servers
3156 	 *  that are not in the NLM_UP state.  After the current routine
3157 	 *  is invoked for both phases of shutdown, the routine will return,
3158 	 *  having done nothing, and the lock request will proceed and
3159 	 *  probably be granted.  The problem is that the shutdown was ignored
3160 	 *  by the lock request because there was no record of that NLM server
3161 	 *  shutting down.   We will be in the peculiar position of thinking
3162 	 *  that we've shutdown the NLM server and all locks at all LLMs have
3163 	 *  been discarded, but in fact there's still one lock held.
3164 	 *  The solution is to record the existence of NLM server and change
3165 	 *  its state immediately to NLM_SHUTTING_DOWN.  The lock request in
3166 	 *  progress may proceed because the next phase NLM_DOWN will catch
3167 	 *  this lock and discard it.
3168 	 */
3169 	mutex_exit(&nlm_reg_lock);
3170 
3171 	switch (nlm_state) {
3172 	case FLK_NLM_UP:
3173 		/*
3174 		 * Change the NLM state of all locks still held on behalf of
3175 		 * the NLM server identified by "nlmid" to NLM_UP.
3176 		 */
3177 		cl_flk_change_nlm_state_all_locks(nlmid, FLK_NLM_UP);
3178 		break;
3179 
3180 	case FLK_NLM_SHUTTING_DOWN:
3181 		/*
3182 		 * Wake up all sleeping locks for the NLM server identified
3183 		 * by "nlmid." Note that eventually all woken threads will
3184 		 * have their lock requests cancelled and descriptors
3185 		 * removed from the sleeping lock list.  Note that the NLM
3186 		 * server state associated with each lock descriptor is
3187 		 * changed to FLK_NLM_SHUTTING_DOWN.
3188 		 */
3189 		cl_flk_wakeup_sleeping_nlm_locks(nlmid);
3190 		break;
3191 
3192 	case FLK_NLM_DOWN:
3193 		/*
3194 		 * Discard all active, granted locks for this NLM server
3195 		 * identified by "nlmid."
3196 		 */
3197 		cl_flk_unlock_nlm_granted(nlmid);
3198 		break;
3199 
3200 	default:
3201 		panic("cl_set_nlm_status: bad status (%d)", nlm_state);
3202 	}
3203 }
3204 
3205 /*
3206  * Set the control status for lock manager requests.
3207  *
3208  * Note that when this routine is called with FLK_WAKEUP_SLEEPERS, there
3209  * may be locks requests that have gotten started but not finished.  In
3210  * particular, there may be blocking requests that are in the callback code
3211  * before sleeping (so they're not holding the lock for the graph).  If
3212  * such a thread reacquires the graph's lock (to go to sleep) after
3213  * flk_lockmgr_status is set to a non-up value, it will notice the status
3214  * and bail out.  If the request gets granted before the thread can check
3215  * flk_lockmgr_status, let it continue normally.  It will get flushed when
3216  * we are called with FLK_LOCKMGR_DOWN.
3217  */
3218 
3219 void
3220 flk_set_lockmgr_status(flk_lockmgr_status_t status)
3221 {
3222 	int i;
3223 	graph_t *gp;
3224 	struct flock_globals *fg;
3225 
3226 	fg = flk_get_globals();
3227 	ASSERT(fg != NULL);
3228 
3229 	mutex_enter(&flock_lock);
3230 	fg->flk_lockmgr_status = status;
3231 	mutex_exit(&flock_lock);
3232 
3233 	/*
3234 	 * If the lock manager is coming back up, all that's needed is to
3235 	 * propagate this information to the graphs.  If the lock manager
3236 	 * is going down, additional action is required, and each graph's
3237 	 * copy of the state is updated atomically with this other action.
3238 	 */
3239 	switch (status) {
3240 	case FLK_LOCKMGR_UP:
3241 		for (i = 0; i < HASH_SIZE; i++) {
3242 			mutex_enter(&flock_lock);
3243 			gp = lock_graph[i];
3244 			mutex_exit(&flock_lock);
3245 			if (gp == NULL)
3246 				continue;
3247 			mutex_enter(&gp->gp_mutex);
3248 			fg->lockmgr_status[i] = status;
3249 			mutex_exit(&gp->gp_mutex);
3250 		}
3251 		break;
3252 	case FLK_WAKEUP_SLEEPERS:
3253 		wakeup_sleeping_lockmgr_locks(fg);
3254 		break;
3255 	case FLK_LOCKMGR_DOWN:
3256 		unlock_lockmgr_granted(fg);
3257 		break;
3258 	default:
3259 		panic("flk_set_lockmgr_status: bad status (%d)", status);
3260 		break;
3261 	}
3262 }
3263 
3264 /*
3265  * This routine returns all the locks that are active or sleeping and are
3266  * associated with a particular set of identifiers.  If lock_state != 0, then
3267  * only locks that match the lock_state are returned. If lock_state == 0, then
3268  * all locks are returned. If pid == NOPID, the pid is ignored.  If
3269  * use_sysid is FALSE, then the sysid is ignored.  If vp is NULL, then the
3270  * vnode pointer is ignored.
3271  *
3272  * A list containing the vnode pointer and an flock structure
3273  * describing the lock is returned.  Each element in the list is
3274  * dynammically allocated and must be freed by the caller.  The
3275  * last item in the list is denoted by a NULL value in the ll_next
3276  * field.
3277  *
3278  * The vnode pointers returned are held.  The caller is responsible
3279  * for releasing these.  Note that the returned list is only a snapshot of
3280  * the current lock information, and that it is a snapshot of a moving
3281  * target (only one graph is locked at a time).
3282  */
3283 
3284 locklist_t *
3285 get_lock_list(int list_type, int lock_state, int sysid, boolean_t use_sysid,
3286 		pid_t pid, const vnode_t *vp, zoneid_t zoneid)
3287 {
3288 	lock_descriptor_t	*lock;
3289 	lock_descriptor_t	*graph_head;
3290 	locklist_t		listhead;
3291 	locklist_t		*llheadp;
3292 	locklist_t		*llp;
3293 	locklist_t		*lltp;
3294 	graph_t			*gp;
3295 	int			i;
3296 	int			first_index; /* graph index */
3297 	int			num_indexes; /* graph index */
3298 
3299 	ASSERT((list_type == FLK_ACTIVE_STATE) ||
3300 	    (list_type == FLK_SLEEPING_STATE));
3301 
3302 	/*
3303 	 * Get a pointer to something to use as a list head while building
3304 	 * the rest of the list.
3305 	 */
3306 	llheadp = &listhead;
3307 	lltp = llheadp;
3308 	llheadp->ll_next = (locklist_t *)NULL;
3309 
3310 	/* Figure out which graphs we want to look at. */
3311 	if (vp == NULL) {
3312 		first_index = 0;
3313 		num_indexes = HASH_SIZE;
3314 	} else {
3315 		first_index = HASH_INDEX(vp);
3316 		num_indexes = 1;
3317 	}
3318 
3319 	for (i = first_index; i < first_index + num_indexes; i++) {
3320 		mutex_enter(&flock_lock);
3321 		gp = lock_graph[i];
3322 		mutex_exit(&flock_lock);
3323 		if (gp == NULL) {
3324 			continue;
3325 		}
3326 
3327 		mutex_enter(&gp->gp_mutex);
3328 		graph_head = (list_type == FLK_ACTIVE_STATE) ?
3329 			ACTIVE_HEAD(gp) : SLEEPING_HEAD(gp);
3330 		for (lock = graph_head->l_next;
3331 		    lock != graph_head;
3332 		    lock = lock->l_next) {
3333 			if (use_sysid && lock->l_flock.l_sysid != sysid)
3334 				continue;
3335 			if (pid != NOPID && lock->l_flock.l_pid != pid)
3336 				continue;
3337 			if (vp != NULL && lock->l_vnode != vp)
3338 				continue;
3339 			if (lock_state && !(lock_state & lock->l_state))
3340 				continue;
3341 			if (zoneid != lock->l_zoneid && zoneid != ALL_ZONES)
3342 				continue;
3343 			/*
3344 			 * A matching lock was found.  Allocate
3345 			 * space for a new locklist entry and fill
3346 			 * it in.
3347 			 */
3348 			llp = kmem_alloc(sizeof (locklist_t), KM_SLEEP);
3349 			lltp->ll_next = llp;
3350 			VN_HOLD(lock->l_vnode);
3351 			llp->ll_vp = lock->l_vnode;
3352 			create_flock(lock, &(llp->ll_flock));
3353 			llp->ll_next = (locklist_t *)NULL;
3354 			lltp = llp;
3355 		}
3356 		mutex_exit(&gp->gp_mutex);
3357 	}
3358 
3359 	llp = llheadp->ll_next;
3360 	return (llp);
3361 }
3362 
3363 /*
3364  * These two functions are simply interfaces to get_lock_list.  They return
3365  * a list of sleeping or active locks for the given sysid and pid.  See
3366  * get_lock_list for details.
3367  *
3368  * In either case we don't particularly care to specify the zone of interest;
3369  * the sysid-space is global across zones, so the sysid will map to exactly one
3370  * zone, and we'll return information for that zone.
3371  */
3372 
3373 locklist_t *
3374 flk_get_sleeping_locks(int sysid, pid_t pid)
3375 {
3376 	return (get_lock_list(FLK_SLEEPING_STATE, 0, sysid, B_TRUE, pid, NULL,
3377 		    ALL_ZONES));
3378 }
3379 
3380 locklist_t *
3381 flk_get_active_locks(int sysid, pid_t pid)
3382 {
3383 	return (get_lock_list(FLK_ACTIVE_STATE, 0, sysid, B_TRUE, pid, NULL,
3384 		    ALL_ZONES));
3385 }
3386 
3387 /*
3388  * Another interface to get_lock_list.  This one returns all the active
3389  * locks for a given vnode.  Again, see get_lock_list for details.
3390  *
3391  * We don't need to specify which zone's locks we're interested in.  The matter
3392  * would only be interesting if the vnode belonged to NFS, and NFS vnodes can't
3393  * be used by multiple zones, so the list of locks will all be from the right
3394  * zone.
3395  */
3396 
3397 locklist_t *
3398 flk_active_locks_for_vp(const vnode_t *vp)
3399 {
3400 	return (get_lock_list(FLK_ACTIVE_STATE, 0, 0, B_FALSE, NOPID, vp,
3401 		    ALL_ZONES));
3402 }
3403 
3404 /*
3405  * Another interface to get_lock_list.  This one returns all the active
3406  * nbmand locks for a given vnode.  Again, see get_lock_list for details.
3407  *
3408  * See the comment for flk_active_locks_for_vp() for why we don't care to
3409  * specify the particular zone of interest.
3410  */
3411 locklist_t *
3412 flk_active_nbmand_locks_for_vp(const vnode_t *vp)
3413 {
3414 	return (get_lock_list(FLK_ACTIVE_STATE, NBMAND_LOCK, 0, B_FALSE,
3415 				NOPID, vp, ALL_ZONES));
3416 }
3417 
3418 /*
3419  * Another interface to get_lock_list.  This one returns all the active
3420  * nbmand locks for a given pid.  Again, see get_lock_list for details.
3421  *
3422  * The zone doesn't need to be specified here; the locks held by a
3423  * particular process will either be local (ie, non-NFS) or from the zone
3424  * the process is executing in.  This is because other parts of the system
3425  * ensure that an NFS vnode can't be used in a zone other than that in
3426  * which it was opened.
3427  */
3428 locklist_t *
3429 flk_active_nbmand_locks(pid_t pid)
3430 {
3431 	return (get_lock_list(FLK_ACTIVE_STATE, NBMAND_LOCK, 0, B_FALSE,
3432 				pid, NULL, ALL_ZONES));
3433 }
3434 
3435 /*
3436  * Free up all entries in the locklist.
3437  */
3438 void
3439 flk_free_locklist(locklist_t *llp)
3440 {
3441 	locklist_t *next_llp;
3442 
3443 	while (llp) {
3444 		next_llp = llp->ll_next;
3445 		VN_RELE(llp->ll_vp);
3446 		kmem_free(llp, sizeof (*llp));
3447 		llp = next_llp;
3448 	}
3449 }
3450 
3451 static void
3452 cl_flk_change_nlm_state_all_locks(int nlmid, flk_nlm_status_t nlm_state)
3453 {
3454 	/*
3455 	 * For each graph "lg" in the hash table lock_graph do
3456 	 * a.  Get the list of sleeping locks
3457 	 * b.  For each lock descriptor in the list do
3458 	 *	i.   If the requested lock is an NLM server request AND
3459 	 *		the nlmid is the same as the routine argument then
3460 	 *		change the lock descriptor's state field to
3461 	 *		"nlm_state."
3462 	 * c.  Get the list of active locks
3463 	 * d.  For each lock descriptor in the list do
3464 	 *	i.   If the requested lock is an NLM server request AND
3465 	 *		the nlmid is the same as the routine argument then
3466 	 *		change the lock descriptor's state field to
3467 	 *		"nlm_state."
3468 	 */
3469 
3470 	int			i;
3471 	graph_t			*gp;			/* lock graph */
3472 	lock_descriptor_t	*lock;			/* lock */
3473 	lock_descriptor_t	*nlock = NULL;		/* next lock */
3474 	int			lock_nlmid;
3475 
3476 	for (i = 0; i < HASH_SIZE; i++) {
3477 		mutex_enter(&flock_lock);
3478 		gp = lock_graph[i];
3479 		mutex_exit(&flock_lock);
3480 		if (gp == NULL) {
3481 			continue;
3482 		}
3483 
3484 		/* Get list of sleeping locks in current lock graph. */
3485 		mutex_enter(&gp->gp_mutex);
3486 		for (lock = SLEEPING_HEAD(gp)->l_next;
3487 		    lock != SLEEPING_HEAD(gp);
3488 		    lock = nlock) {
3489 			nlock = lock->l_next;
3490 			/* get NLM id */
3491 			lock_nlmid = GETNLMID(lock->l_flock.l_sysid);
3492 
3493 			/*
3494 			 * If NLM server request AND nlmid of lock matches
3495 			 * nlmid of argument, then set the NLM state of the
3496 			 * lock to "nlm_state."
3497 			 */
3498 			if (IS_LOCKMGR(lock) && nlmid == lock_nlmid) {
3499 				SET_NLM_STATE(lock, nlm_state);
3500 			}
3501 		}
3502 
3503 		/* Get list of active locks in current lock graph. */
3504 		for (lock = ACTIVE_HEAD(gp)->l_next;
3505 		    lock != ACTIVE_HEAD(gp);
3506 		    lock = nlock) {
3507 			nlock = lock->l_next;
3508 			/* get NLM id */
3509 			lock_nlmid = GETNLMID(lock->l_flock.l_sysid);
3510 
3511 			/*
3512 			 * If NLM server request AND nlmid of lock matches
3513 			 * nlmid of argument, then set the NLM state of the
3514 			 * lock to "nlm_state."
3515 			 */
3516 			if (IS_LOCKMGR(lock) && nlmid == lock_nlmid) {
3517 				ASSERT(IS_ACTIVE(lock));
3518 				SET_NLM_STATE(lock, nlm_state);
3519 			}
3520 		}
3521 		mutex_exit(&gp->gp_mutex);
3522 	}
3523 }
3524 
3525 /*
3526  * Requires: "nlmid" >= 1 and <= clconf_maximum_nodeid().
3527  * Effects: Find all sleeping lock manager requests _only_ for the NLM server
3528  *   identified by "nlmid." Poke those lock requests.
3529  */
3530 static void
3531 cl_flk_wakeup_sleeping_nlm_locks(int nlmid)
3532 {
3533 	lock_descriptor_t *lock;
3534 	lock_descriptor_t *nlock = NULL; /* next lock */
3535 	int i;
3536 	graph_t *gp;
3537 	int	lock_nlmid;
3538 
3539 	for (i = 0; i < HASH_SIZE; i++) {
3540 		mutex_enter(&flock_lock);
3541 		gp = lock_graph[i];
3542 		mutex_exit(&flock_lock);
3543 		if (gp == NULL) {
3544 			continue;
3545 		}
3546 
3547 		mutex_enter(&gp->gp_mutex);
3548 		for (lock = SLEEPING_HEAD(gp)->l_next;
3549 		    lock != SLEEPING_HEAD(gp);
3550 		    lock = nlock) {
3551 			nlock = lock->l_next;
3552 			/*
3553 			 * If NLM server request _and_ nlmid of lock matches
3554 			 * nlmid of argument, then set the NLM state of the
3555 			 * lock to NLM_SHUTTING_DOWN, and wake up sleeping
3556 			 * request.
3557 			 */
3558 			if (IS_LOCKMGR(lock)) {
3559 				/* get NLM id */
3560 				lock_nlmid =
3561 					GETNLMID(lock->l_flock.l_sysid);
3562 				if (nlmid == lock_nlmid) {
3563 					SET_NLM_STATE(lock,
3564 						FLK_NLM_SHUTTING_DOWN);
3565 					INTERRUPT_WAKEUP(lock);
3566 				}
3567 			}
3568 		}
3569 		mutex_exit(&gp->gp_mutex);
3570 	}
3571 }
3572 
3573 /*
3574  * Requires: "nlmid" >= 1 and <= clconf_maximum_nodeid()
3575  * Effects:  Find all active (granted) lock manager locks _only_ for the
3576  *   NLM server identified by "nlmid" and release them.
3577  */
3578 static void
3579 cl_flk_unlock_nlm_granted(int nlmid)
3580 {
3581 	lock_descriptor_t *lock;
3582 	lock_descriptor_t *nlock = NULL; /* next lock */
3583 	int i;
3584 	graph_t *gp;
3585 	int	lock_nlmid;
3586 
3587 	for (i = 0; i < HASH_SIZE; i++) {
3588 		mutex_enter(&flock_lock);
3589 		gp = lock_graph[i];
3590 		mutex_exit(&flock_lock);
3591 		if (gp == NULL) {
3592 			continue;
3593 		}
3594 
3595 		mutex_enter(&gp->gp_mutex);
3596 		for (lock = ACTIVE_HEAD(gp)->l_next;
3597 		    lock != ACTIVE_HEAD(gp);
3598 		    lock = nlock) {
3599 			nlock = lock->l_next;
3600 			ASSERT(IS_ACTIVE(lock));
3601 
3602 			/*
3603 			 * If it's an  NLM server request _and_ nlmid of
3604 			 * the lock matches nlmid of argument, then
3605 			 * remove the active lock the list, wakup blocked
3606 			 * threads, and free the storage for the lock.
3607 			 * Note that there's no need to mark the NLM state
3608 			 * of this lock to NLM_DOWN because the lock will
3609 			 * be deleted anyway and its storage freed.
3610 			 */
3611 			if (IS_LOCKMGR(lock)) {
3612 				/* get NLM id */
3613 				lock_nlmid = GETNLMID(lock->l_flock.l_sysid);
3614 				if (nlmid == lock_nlmid) {
3615 					flk_delete_active_lock(lock, 0);
3616 					flk_wakeup(lock, 1);
3617 					flk_free_lock(lock);
3618 				}
3619 			}
3620 		}
3621 		mutex_exit(&gp->gp_mutex);
3622 	}
3623 }
3624 
3625 /*
3626  * Find all sleeping lock manager requests and poke them.
3627  */
3628 static void
3629 wakeup_sleeping_lockmgr_locks(struct flock_globals *fg)
3630 {
3631 	lock_descriptor_t *lock;
3632 	lock_descriptor_t *nlock = NULL; /* next lock */
3633 	int i;
3634 	graph_t *gp;
3635 	zoneid_t zoneid = getzoneid();
3636 
3637 	for (i = 0; i < HASH_SIZE; i++) {
3638 		mutex_enter(&flock_lock);
3639 		gp = lock_graph[i];
3640 		mutex_exit(&flock_lock);
3641 		if (gp == NULL) {
3642 			continue;
3643 		}
3644 
3645 		mutex_enter(&gp->gp_mutex);
3646 		fg->lockmgr_status[i] = FLK_WAKEUP_SLEEPERS;
3647 		for (lock = SLEEPING_HEAD(gp)->l_next;
3648 		    lock != SLEEPING_HEAD(gp);
3649 		    lock = nlock) {
3650 			nlock = lock->l_next;
3651 			if (IS_LOCKMGR(lock) && lock->l_zoneid == zoneid) {
3652 				INTERRUPT_WAKEUP(lock);
3653 			}
3654 		}
3655 		mutex_exit(&gp->gp_mutex);
3656 	}
3657 }
3658 
3659 
3660 /*
3661  * Find all active (granted) lock manager locks and release them.
3662  */
3663 static void
3664 unlock_lockmgr_granted(struct flock_globals *fg)
3665 {
3666 	lock_descriptor_t *lock;
3667 	lock_descriptor_t *nlock = NULL; /* next lock */
3668 	int i;
3669 	graph_t *gp;
3670 	zoneid_t zoneid = getzoneid();
3671 
3672 	for (i = 0; i < HASH_SIZE; i++) {
3673 		mutex_enter(&flock_lock);
3674 		gp = lock_graph[i];
3675 		mutex_exit(&flock_lock);
3676 		if (gp == NULL) {
3677 			continue;
3678 		}
3679 
3680 		mutex_enter(&gp->gp_mutex);
3681 		fg->lockmgr_status[i] = FLK_LOCKMGR_DOWN;
3682 		for (lock = ACTIVE_HEAD(gp)->l_next;
3683 		    lock != ACTIVE_HEAD(gp);
3684 		    lock = nlock) {
3685 			nlock = lock->l_next;
3686 			if (IS_LOCKMGR(lock) && lock->l_zoneid == zoneid) {
3687 				ASSERT(IS_ACTIVE(lock));
3688 				flk_delete_active_lock(lock, 0);
3689 				flk_wakeup(lock, 1);
3690 				flk_free_lock(lock);
3691 			}
3692 		}
3693 		mutex_exit(&gp->gp_mutex);
3694 	}
3695 }
3696 /* ONC_PLUS EXTRACT END */
3697 
3698 
3699 /*
3700  * Wait until a lock is granted, cancelled, or interrupted.
3701  */
3702 
3703 static void
3704 wait_for_lock(lock_descriptor_t *request)
3705 {
3706 	graph_t *gp = request->l_graph;
3707 
3708 	ASSERT(MUTEX_HELD(&gp->gp_mutex));
3709 
3710 	while (!(IS_GRANTED(request)) && !(IS_CANCELLED(request)) &&
3711 	    !(IS_INTERRUPTED(request))) {
3712 		if (!cv_wait_sig(&request->l_cv, &gp->gp_mutex)) {
3713 			flk_set_state(request, FLK_INTERRUPTED_STATE);
3714 			request->l_state |= INTERRUPTED_LOCK;
3715 		}
3716 	}
3717 }
3718 
3719 /* ONC_PLUS EXTRACT START */
3720 /*
3721  * Create an flock structure from the existing lock information
3722  *
3723  * This routine is used to create flock structures for the lock manager
3724  * to use in a reclaim request.  Since the lock was orginated on this
3725  * host, it must be conforming to UNIX semantics, so no checking is
3726  * done to make sure it falls within the lower half of the 32-bit range.
3727  */
3728 
3729 static void
3730 create_flock(lock_descriptor_t *lp, flock64_t *flp)
3731 {
3732 	ASSERT(lp->l_end == MAX_U_OFFSET_T || lp->l_end <= MAXEND);
3733 	ASSERT(lp->l_end >= lp->l_start);
3734 
3735 	flp->l_type = lp->l_type;
3736 	flp->l_whence = 0;
3737 	flp->l_start = lp->l_start;
3738 	flp->l_len = (lp->l_end == MAX_U_OFFSET_T) ? 0 :
3739 		(lp->l_end - lp->l_start + 1);
3740 	flp->l_sysid = lp->l_flock.l_sysid;
3741 	flp->l_pid = lp->l_flock.l_pid;
3742 }
3743 
3744 /*
3745  * Convert flock_t data describing a lock range into unsigned long starting
3746  * and ending points, which are put into lock_request.  Returns 0 or an
3747  * errno value.
3748  * Large Files: max is passed by the caller and we return EOVERFLOW
3749  * as defined by LFS API.
3750  */
3751 
3752 int
3753 flk_convert_lock_data(vnode_t *vp, flock64_t *flp,
3754     u_offset_t *start, u_offset_t *end, offset_t offset)
3755 {
3756 	struct vattr	vattr;
3757 	int	error;
3758 
3759 	/*
3760 	 * Determine the starting point of the request
3761 	 */
3762 	switch (flp->l_whence) {
3763 	case 0:		/* SEEK_SET */
3764 		*start = (u_offset_t)flp->l_start;
3765 		break;
3766 	case 1:		/* SEEK_CUR */
3767 		*start = (u_offset_t)(flp->l_start + offset);
3768 		break;
3769 	case 2:		/* SEEK_END */
3770 		vattr.va_mask = AT_SIZE;
3771 		if (error = VOP_GETATTR(vp, &vattr, 0, CRED()))
3772 			return (error);
3773 		*start = (u_offset_t)(flp->l_start + vattr.va_size);
3774 		break;
3775 	default:
3776 		return (EINVAL);
3777 	}
3778 
3779 	/*
3780 	 * Determine the range covered by the request.
3781 	 */
3782 	if (flp->l_len == 0)
3783 		*end = MAX_U_OFFSET_T;
3784 	else if ((offset_t)flp->l_len > 0) {
3785 		*end = (u_offset_t)(*start + (flp->l_len - 1));
3786 	} else {
3787 		/*
3788 		 * Negative length; why do we even allow this ?
3789 		 * Because this allows easy specification of
3790 		 * the last n bytes of the file.
3791 		 */
3792 		*end = *start;
3793 		*start += (u_offset_t)flp->l_len;
3794 		(*start)++;
3795 	}
3796 	return (0);
3797 }
3798 
3799 /*
3800  * Check the validity of lock data.  This can used by the NFS
3801  * frlock routines to check data before contacting the server.  The
3802  * server must support semantics that aren't as restrictive as
3803  * the UNIX API, so the NFS client is required to check.
3804  * The maximum is now passed in by the caller.
3805  */
3806 
3807 int
3808 flk_check_lock_data(u_offset_t start, u_offset_t end, offset_t max)
3809 {
3810 	/*
3811 	 * The end (length) for local locking should never be greater
3812 	 * than MAXEND. However, the representation for
3813 	 * the entire file is MAX_U_OFFSET_T.
3814 	 */
3815 	if ((start > max) ||
3816 	    ((end > max) && (end != MAX_U_OFFSET_T))) {
3817 		return (EINVAL);
3818 	}
3819 	if (start > end) {
3820 	    return (EINVAL);
3821 	}
3822 	return (0);
3823 }
3824 
3825 /*
3826  * Fill in request->l_flock with information about the lock blocking the
3827  * request.  The complexity here is that lock manager requests are allowed
3828  * to see into the upper part of the 32-bit address range, whereas local
3829  * requests are only allowed to see signed values.
3830  *
3831  * What should be done when "blocker" is a lock manager lock that uses the
3832  * upper portion of the 32-bit range, but "request" is local?  Since the
3833  * request has already been determined to have been blocked by the blocker,
3834  * at least some portion of "blocker" must be in the range of the request,
3835  * or the request extends to the end of file.  For the first case, the
3836  * portion in the lower range is returned with the indication that it goes
3837  * "to EOF."  For the second case, the last byte of the lower range is
3838  * returned with the indication that it goes "to EOF."
3839  */
3840 
3841 static void
3842 report_blocker(lock_descriptor_t *blocker, lock_descriptor_t *request)
3843 {
3844 	flock64_t *flrp;			/* l_flock portion of request */
3845 
3846 	ASSERT(blocker != NULL);
3847 
3848 	flrp = &request->l_flock;
3849 	flrp->l_whence = 0;
3850 	flrp->l_type = blocker->l_type;
3851 	flrp->l_pid = blocker->l_flock.l_pid;
3852 	flrp->l_sysid = blocker->l_flock.l_sysid;
3853 
3854 	if (IS_LOCKMGR(request)) {
3855 		flrp->l_start = blocker->l_start;
3856 		if (blocker->l_end == MAX_U_OFFSET_T)
3857 			flrp->l_len = 0;
3858 		else
3859 			flrp->l_len = blocker->l_end - blocker->l_start + 1;
3860 	} else {
3861 		if (blocker->l_start > MAXEND) {
3862 			flrp->l_start = MAXEND;
3863 			flrp->l_len = 0;
3864 		} else {
3865 			flrp->l_start = blocker->l_start;
3866 			if (blocker->l_end == MAX_U_OFFSET_T)
3867 				flrp->l_len = 0;
3868 			else
3869 				flrp->l_len = blocker->l_end -
3870 					blocker->l_start + 1;
3871 		}
3872 	}
3873 }
3874 /* ONC_PLUS EXTRACT END */
3875 
3876 /*
3877  * PSARC case 1997/292
3878  */
3879 /*
3880  * This is the public routine exported by flock.h.
3881  */
3882 void
3883 cl_flk_change_nlm_state_to_unknown(int nlmid)
3884 {
3885 	/*
3886 	 * Check to see if node is booted as a cluster. If not, return.
3887 	 */
3888 	if ((cluster_bootflags & CLUSTER_BOOTED) == 0) {
3889 		return;
3890 	}
3891 
3892 	/*
3893 	 * See comment in cl_flk_set_nlm_status().
3894 	 */
3895 	if (nlm_reg_status == NULL) {
3896 		return;
3897 	}
3898 
3899 	/*
3900 	 * protect NLM registry state with a mutex.
3901 	 */
3902 	ASSERT(nlmid <= nlm_status_size && nlmid >= 0);
3903 	mutex_enter(&nlm_reg_lock);
3904 	FLK_REGISTRY_CHANGE_NLM_STATE(nlm_reg_status, nlmid, FLK_NLM_UNKNOWN);
3905 	mutex_exit(&nlm_reg_lock);
3906 }
3907 
3908 /*
3909  * Return non-zero if the given I/O request conflicts with an active NBMAND
3910  * lock.
3911  * If svmand is non-zero, it means look at all active locks, not just NBMAND
3912  * locks.
3913  */
3914 
3915 int
3916 nbl_lock_conflict(vnode_t *vp, nbl_op_t op, u_offset_t offset,
3917 		ssize_t length, int svmand)
3918 {
3919 	int conflict = 0;
3920 	graph_t			*gp;
3921 	lock_descriptor_t	*lock;
3922 
3923 	mutex_enter(&flock_lock);
3924 	gp = lock_graph[HASH_INDEX(vp)];
3925 	mutex_exit(&flock_lock);
3926 	if (gp == NULL)
3927 		return (0);
3928 
3929 	mutex_enter(&gp->gp_mutex);
3930 	SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
3931 
3932 	for (; lock && lock->l_vnode == vp; lock = lock->l_next) {
3933 		if ((svmand || (lock->l_state & NBMAND_LOCK)) &&
3934 		    lock->l_flock.l_sysid == 0 &&
3935 		    lock->l_flock.l_pid != curproc->p_pid &&
3936 		    lock_blocks_io(op, offset, length,
3937 				lock->l_type, lock->l_start, lock->l_end)) {
3938 			conflict = 1;
3939 			break;
3940 		}
3941 	}
3942 	mutex_exit(&gp->gp_mutex);
3943 
3944 	return (conflict);
3945 }
3946 
3947 /*
3948  * Return non-zero if the given I/O request conflicts with the given lock.
3949  */
3950 
3951 static int
3952 lock_blocks_io(nbl_op_t op, u_offset_t offset, ssize_t length,
3953 	    int lock_type, u_offset_t lock_start, u_offset_t lock_end)
3954 {
3955 	ASSERT(op == NBL_READ || op == NBL_WRITE || op == NBL_READWRITE);
3956 	ASSERT(lock_type == F_RDLCK || lock_type == F_WRLCK);
3957 
3958 	if (op == NBL_READ && lock_type == F_RDLCK)
3959 		return (0);
3960 
3961 	if (offset <= lock_start && lock_start < offset + length)
3962 		return (1);
3963 	if (lock_start <= offset && offset <= lock_end)
3964 		return (1);
3965 
3966 	return (0);
3967 }
3968 
3969 #ifdef DEBUG
3970 static void
3971 check_active_locks(graph_t *gp)
3972 {
3973 	lock_descriptor_t *lock, *lock1;
3974 	edge_t	*ep;
3975 
3976 	for (lock = ACTIVE_HEAD(gp)->l_next; lock != ACTIVE_HEAD(gp);
3977 						lock = lock->l_next) {
3978 		ASSERT(IS_ACTIVE(lock));
3979 		ASSERT(NOT_BLOCKED(lock));
3980 		ASSERT(!IS_BARRIER(lock));
3981 
3982 		ep = FIRST_IN(lock);
3983 
3984 		while (ep != HEAD(lock)) {
3985 			ASSERT(IS_SLEEPING(ep->from_vertex));
3986 			ASSERT(!NOT_BLOCKED(ep->from_vertex));
3987 			ep = NEXT_IN(ep);
3988 		}
3989 
3990 		for (lock1 = lock->l_next; lock1 != ACTIVE_HEAD(gp);
3991 					lock1 = lock1->l_next) {
3992 			if (lock1->l_vnode == lock->l_vnode) {
3993 			if (BLOCKS(lock1, lock)) {
3994 				cmn_err(CE_PANIC,
3995 				    "active lock %p blocks %p",
3996 				    (void *)lock1, (void *)lock);
3997 			} else if (BLOCKS(lock, lock1)) {
3998 				cmn_err(CE_PANIC,
3999 				    "active lock %p blocks %p",
4000 				    (void *)lock, (void *)lock1);
4001 			}
4002 			}
4003 		}
4004 	}
4005 }
4006 
4007 /*
4008  * Effect: This functions checks to see if the transition from 'old_state' to
4009  *	'new_state' is a valid one.  It returns 0 if the transition is valid
4010  *	and 1 if it is not.
4011  *	For a map of valid transitions, see sys/flock_impl.h
4012  */
4013 static int
4014 check_lock_transition(int old_state, int new_state)
4015 {
4016 	switch (old_state) {
4017 	case FLK_INITIAL_STATE:
4018 		if ((new_state == FLK_START_STATE) ||
4019 		    (new_state == FLK_SLEEPING_STATE) ||
4020 		    (new_state == FLK_ACTIVE_STATE) ||
4021 		    (new_state == FLK_DEAD_STATE)) {
4022 			return (0);
4023 		} else {
4024 			return (1);
4025 		}
4026 	case FLK_START_STATE:
4027 		if ((new_state == FLK_ACTIVE_STATE) ||
4028 		    (new_state == FLK_DEAD_STATE)) {
4029 			return (0);
4030 		} else {
4031 			return (1);
4032 		}
4033 	case FLK_ACTIVE_STATE:
4034 		if (new_state == FLK_DEAD_STATE) {
4035 			return (0);
4036 		} else {
4037 			return (1);
4038 		}
4039 	case FLK_SLEEPING_STATE:
4040 		if ((new_state == FLK_GRANTED_STATE) ||
4041 		    (new_state == FLK_INTERRUPTED_STATE) ||
4042 		    (new_state == FLK_CANCELLED_STATE)) {
4043 			return (0);
4044 		} else {
4045 			return (1);
4046 		}
4047 	case FLK_GRANTED_STATE:
4048 		if ((new_state == FLK_START_STATE) ||
4049 		    (new_state == FLK_INTERRUPTED_STATE) ||
4050 		    (new_state == FLK_CANCELLED_STATE)) {
4051 			return (0);
4052 		} else {
4053 			return (1);
4054 		}
4055 	case FLK_CANCELLED_STATE:
4056 		if ((new_state == FLK_INTERRUPTED_STATE) ||
4057 		    (new_state == FLK_DEAD_STATE)) {
4058 			return (0);
4059 		} else {
4060 			return (1);
4061 		}
4062 	case FLK_INTERRUPTED_STATE:
4063 		if (new_state == FLK_DEAD_STATE) {
4064 			return (0);
4065 		} else {
4066 			return (1);
4067 		}
4068 	case FLK_DEAD_STATE:
4069 		/* May be set more than once */
4070 		if (new_state == FLK_DEAD_STATE) {
4071 			return (0);
4072 		} else {
4073 			return (1);
4074 		}
4075 	default:
4076 		return (1);
4077 	}
4078 }
4079 
4080 static void
4081 check_sleeping_locks(graph_t *gp)
4082 {
4083 	lock_descriptor_t *lock1, *lock2;
4084 	edge_t *ep;
4085 	for (lock1 = SLEEPING_HEAD(gp)->l_next; lock1 != SLEEPING_HEAD(gp);
4086 				lock1 = lock1->l_next) {
4087 				ASSERT(!IS_BARRIER(lock1));
4088 	for (lock2 = lock1->l_next; lock2 != SLEEPING_HEAD(gp);
4089 				lock2 = lock2->l_next) {
4090 		if (lock1->l_vnode == lock2->l_vnode) {
4091 			if (BLOCKS(lock2, lock1)) {
4092 				ASSERT(!IS_GRANTED(lock1));
4093 				ASSERT(!NOT_BLOCKED(lock1));
4094 				path(lock1, lock2);
4095 			}
4096 		}
4097 	}
4098 
4099 	for (lock2 = ACTIVE_HEAD(gp)->l_next; lock2 != ACTIVE_HEAD(gp);
4100 					lock2 = lock2->l_next) {
4101 				ASSERT(!IS_BARRIER(lock1));
4102 		if (lock1->l_vnode == lock2->l_vnode) {
4103 			if (BLOCKS(lock2, lock1)) {
4104 				ASSERT(!IS_GRANTED(lock1));
4105 				ASSERT(!NOT_BLOCKED(lock1));
4106 				path(lock1, lock2);
4107 			}
4108 		}
4109 	}
4110 	ep = FIRST_ADJ(lock1);
4111 	while (ep != HEAD(lock1)) {
4112 		ASSERT(BLOCKS(ep->to_vertex, lock1));
4113 		ep = NEXT_ADJ(ep);
4114 	}
4115 	}
4116 }
4117 
4118 static int
4119 level_two_path(lock_descriptor_t *lock1, lock_descriptor_t *lock2, int no_path)
4120 {
4121 	edge_t	*ep;
4122 	lock_descriptor_t	*vertex;
4123 	lock_descriptor_t *vertex_stack;
4124 
4125 	STACK_INIT(vertex_stack);
4126 
4127 	flk_graph_uncolor(lock1->l_graph);
4128 	ep = FIRST_ADJ(lock1);
4129 	ASSERT(ep != HEAD(lock1));
4130 	while (ep != HEAD(lock1)) {
4131 		if (no_path)
4132 			ASSERT(ep->to_vertex != lock2);
4133 		STACK_PUSH(vertex_stack, ep->to_vertex, l_dstack);
4134 		COLOR(ep->to_vertex);
4135 		ep = NEXT_ADJ(ep);
4136 	}
4137 
4138 	while ((vertex = STACK_TOP(vertex_stack)) != NULL) {
4139 		STACK_POP(vertex_stack, l_dstack);
4140 		for (ep = FIRST_ADJ(vertex); ep != HEAD(vertex);
4141 						ep = NEXT_ADJ(ep)) {
4142 			if (COLORED(ep->to_vertex))
4143 				continue;
4144 			COLOR(ep->to_vertex);
4145 			if (ep->to_vertex == lock2)
4146 				return (1);
4147 
4148 			STACK_PUSH(vertex_stack, ep->to_vertex, l_dstack);
4149 		}
4150 	}
4151 	return (0);
4152 }
4153 
4154 static void
4155 check_owner_locks(graph_t *gp, pid_t pid, int sysid, vnode_t *vp)
4156 {
4157 	lock_descriptor_t *lock;
4158 
4159 	SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
4160 
4161 	if (lock) {
4162 		while (lock != ACTIVE_HEAD(gp) && (lock->l_vnode == vp)) {
4163 			if (lock->l_flock.l_pid == pid &&
4164 			    lock->l_flock.l_sysid == sysid)
4165 				cmn_err(CE_PANIC,
4166 				    "owner pid %d's lock %p in active queue",
4167 				    pid, (void *)lock);
4168 			lock = lock->l_next;
4169 		}
4170 	}
4171 	SET_LOCK_TO_FIRST_SLEEP_VP(gp, lock, vp);
4172 
4173 	if (lock) {
4174 		while (lock != SLEEPING_HEAD(gp) && (lock->l_vnode == vp)) {
4175 			if (lock->l_flock.l_pid == pid &&
4176 			    lock->l_flock.l_sysid == sysid)
4177 				cmn_err(CE_PANIC,
4178 				    "owner pid %d's lock %p in sleep queue",
4179 				    pid, (void *)lock);
4180 			lock = lock->l_next;
4181 		}
4182 	}
4183 }
4184 
4185 static int
4186 level_one_path(lock_descriptor_t *lock1, lock_descriptor_t *lock2)
4187 {
4188 	edge_t *ep = FIRST_ADJ(lock1);
4189 
4190 	while (ep != HEAD(lock1)) {
4191 		if (ep->to_vertex == lock2)
4192 			return (1);
4193 		else
4194 			ep = NEXT_ADJ(ep);
4195 	}
4196 	return (0);
4197 }
4198 
4199 static int
4200 no_path(lock_descriptor_t *lock1, lock_descriptor_t *lock2)
4201 {
4202 	return (!level_two_path(lock1, lock2, 1));
4203 }
4204 
4205 static void
4206 path(lock_descriptor_t *lock1, lock_descriptor_t *lock2)
4207 {
4208 	if (level_one_path(lock1, lock2)) {
4209 		if (level_two_path(lock1, lock2, 0) != 0) {
4210 			cmn_err(CE_WARN,
4211 			    "one edge one path from lock1 %p lock2 %p",
4212 			    (void *)lock1, (void *)lock2);
4213 		}
4214 	} else if (no_path(lock1, lock2)) {
4215 		cmn_err(CE_PANIC,
4216 		    "No path from  lock1 %p to lock2 %p",
4217 		    (void *)lock1, (void *)lock2);
4218 	}
4219 }
4220 #endif /* DEBUG */
4221