xref: /titanic_41/usr/src/uts/common/os/flock.c (revision af33352a19f03e8fe054fe9b44b19c20aeed0da6)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /* ONC_PLUS EXTRACT START */
22 
23 /*
24  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
25  * Use is subject to license terms.
26  */
27 
28 /*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
29 /*	All Rights Reserved */
30 
31 #pragma ident	"%Z%%M%	%I%	%E% SMI"
32 
33 #include <sys/flock_impl.h>
34 #include <sys/vfs.h>
35 #include <sys/t_lock.h>		/* for <sys/callb.h> */
36 #include <sys/callb.h>
37 #include <sys/clconf.h>
38 #include <sys/cladm.h>
39 #include <sys/nbmlock.h>
40 #include <sys/cred.h>
41 #include <sys/policy.h>
42 
43 /*
44  * The following four variables are for statistics purposes and they are
45  * not protected by locks. They may not be accurate but will at least be
46  * close to the actual value.
47  */
48 
49 int	flk_lock_allocs;
50 int	flk_lock_frees;
51 int 	edge_allocs;
52 int	edge_frees;
53 int 	flk_proc_vertex_allocs;
54 int 	flk_proc_edge_allocs;
55 int	flk_proc_vertex_frees;
56 int	flk_proc_edge_frees;
57 
58 static kmutex_t flock_lock;
59 
60 #ifdef DEBUG
61 int check_debug = 0;
62 #define	CHECK_ACTIVE_LOCKS(gp)	if (check_debug) \
63 					check_active_locks(gp);
64 #define	CHECK_SLEEPING_LOCKS(gp)	if (check_debug) \
65 						check_sleeping_locks(gp);
66 #define	CHECK_OWNER_LOCKS(gp, pid, sysid, vp) 	\
67 		if (check_debug)	\
68 			check_owner_locks(gp, pid, sysid, vp);
69 #define	CHECK_LOCK_TRANSITION(old_state, new_state) \
70 	{ \
71 		if (check_lock_transition(old_state, new_state)) { \
72 			cmn_err(CE_PANIC, "Illegal lock transition \
73 			    from %d to %d", old_state, new_state); \
74 		} \
75 	}
76 #else
77 
78 #define	CHECK_ACTIVE_LOCKS(gp)
79 #define	CHECK_SLEEPING_LOCKS(gp)
80 #define	CHECK_OWNER_LOCKS(gp, pid, sysid, vp)
81 #define	CHECK_LOCK_TRANSITION(old_state, new_state)
82 
83 #endif /* DEBUG */
84 
85 struct kmem_cache	*flk_edge_cache;
86 
87 graph_t		*lock_graph[HASH_SIZE];
88 proc_graph_t	pgraph;
89 
90 /*
91  * Clustering.
92  *
93  * NLM REGISTRY TYPE IMPLEMENTATION
94  *
95  * Assumptions:
96  *  1.  Nodes in a cluster are numbered starting at 1; always non-negative
97  *	integers; maximum node id is returned by clconf_maximum_nodeid().
98  *  2.  We use this node id to identify the node an NLM server runs on.
99  */
100 
101 /*
102  * NLM registry object keeps track of NLM servers via their
103  * nlmids (which are the node ids of the node in the cluster they run on)
104  * that have requested locks at this LLM with which this registry is
105  * associated.
106  *
107  * Representation of abstraction:
108  *    rep = record[	states: array[nlm_state],
109  *			lock: mutex]
110  *
111  *    Representation invariants:
112  *	1. index i of rep.states is between 0 and n - 1 where n is number
113  *	   of elements in the array, which happen to be the maximum number
114  *	   of nodes in the cluster configuration + 1.
115  *	2. map nlmid to index i of rep.states
116  *		0   -> 0
117  *		1   -> 1
118  *		2   -> 2
119  *		n-1 -> clconf_maximum_nodeid()+1
120  *	3.  This 1-1 mapping is quite convenient and it avoids errors resulting
121  *	    from forgetting to subtract 1 from the index.
122  *	4.  The reason we keep the 0th index is the following.  A legitimate
123  *	    cluster configuration includes making a UFS file system NFS
124  *	    exportable.  The code is structured so that if you're in a cluster
125  *	    you do one thing; otherwise, you do something else.  The problem
126  *	    is what to do if you think you're in a cluster with PXFS loaded,
127  *	    but you're using UFS not PXFS?  The upper two bytes of the sysid
128  *	    encode the node id of the node where NLM server runs; these bytes
129  *	    are zero for UFS.  Since the nodeid is used to index into the
130  *	    registry, we can record the NLM server state information at index
131  *	    0 using the same mechanism used for PXFS file locks!
132  */
133 static flk_nlm_status_t *nlm_reg_status = NULL;	/* state array 0..N-1 */
134 static kmutex_t nlm_reg_lock;			/* lock to protect arrary */
135 static uint_t nlm_status_size;			/* size of state array */
136 
137 /*
138  * Although we need a global lock dependency graph (and associated data
139  * structures), we also need a per-zone notion of whether the lock manager is
140  * running, and so whether to allow lock manager requests or not.
141  *
142  * Thus, on a per-zone basis we maintain a ``global'' variable
143  * (flk_lockmgr_status), protected by flock_lock, and set when the lock
144  * manager is determined to be changing state (starting or stopping).
145  *
146  * Each graph/zone pair also has a copy of this variable, which is protected by
147  * the graph's mutex.
148  *
149  * The per-graph copies are used to synchronize lock requests with shutdown
150  * requests.  The global copy is used to initialize the per-graph field when a
151  * new graph is created.
152  */
153 struct flock_globals {
154 	flk_lockmgr_status_t flk_lockmgr_status;
155 	flk_lockmgr_status_t lockmgr_status[HASH_SIZE];
156 };
157 
158 zone_key_t flock_zone_key;
159 
160 static void create_flock(lock_descriptor_t *, flock64_t *);
161 static lock_descriptor_t	*flk_get_lock(void);
162 static void	flk_free_lock(lock_descriptor_t	*lock);
163 static void	flk_get_first_blocking_lock(lock_descriptor_t *request);
164 static int flk_process_request(lock_descriptor_t *);
165 static int flk_add_edge(lock_descriptor_t *, lock_descriptor_t *, int, int);
166 static edge_t *flk_get_edge(void);
167 static int flk_wait_execute_request(lock_descriptor_t *);
168 static int flk_relation(lock_descriptor_t *, lock_descriptor_t *);
169 static void flk_insert_active_lock(lock_descriptor_t *);
170 static void flk_delete_active_lock(lock_descriptor_t *, int);
171 static void flk_insert_sleeping_lock(lock_descriptor_t *);
172 static void flk_graph_uncolor(graph_t *);
173 static void flk_wakeup(lock_descriptor_t *, int);
174 static void flk_free_edge(edge_t *);
175 static void flk_recompute_dependencies(lock_descriptor_t *,
176 			lock_descriptor_t **,  int, int);
177 static int flk_find_barriers(lock_descriptor_t *);
178 static void flk_update_barriers(lock_descriptor_t *);
179 static int flk_color_reachables(lock_descriptor_t *);
180 static int flk_canceled(lock_descriptor_t *);
181 static void flk_delete_locks_by_sysid(lock_descriptor_t *);
182 static void report_blocker(lock_descriptor_t *, lock_descriptor_t *);
183 static void wait_for_lock(lock_descriptor_t *);
184 static void unlock_lockmgr_granted(struct flock_globals *);
185 static void wakeup_sleeping_lockmgr_locks(struct flock_globals *);
186 
187 /* Clustering hooks */
188 static void cl_flk_change_nlm_state_all_locks(int, flk_nlm_status_t);
189 static void cl_flk_wakeup_sleeping_nlm_locks(int);
190 static void cl_flk_unlock_nlm_granted(int);
191 
192 #ifdef DEBUG
193 static int check_lock_transition(int, int);
194 static void check_sleeping_locks(graph_t *);
195 static void check_active_locks(graph_t *);
196 static int no_path(lock_descriptor_t *, lock_descriptor_t *);
197 static void path(lock_descriptor_t *, lock_descriptor_t *);
198 static void check_owner_locks(graph_t *, pid_t, int, vnode_t *);
199 static int level_one_path(lock_descriptor_t *, lock_descriptor_t *);
200 static int level_two_path(lock_descriptor_t *, lock_descriptor_t *, int);
201 #endif
202 
203 /*	proc_graph function definitions */
204 static int flk_check_deadlock(lock_descriptor_t *);
205 static void flk_proc_graph_uncolor(void);
206 static proc_vertex_t *flk_get_proc_vertex(lock_descriptor_t *);
207 static proc_edge_t *flk_get_proc_edge(void);
208 static void flk_proc_release(proc_vertex_t *);
209 static void flk_free_proc_edge(proc_edge_t *);
210 static void flk_update_proc_graph(edge_t *, int);
211 
212 /* Non-blocking mandatory locking */
213 static int lock_blocks_io(nbl_op_t, u_offset_t, ssize_t, int, u_offset_t,
214 			u_offset_t);
215 
216 static struct flock_globals *
217 flk_get_globals(void)
218 {
219 	/*
220 	 * The KLM module had better be loaded if we're attempting to handle
221 	 * lockmgr requests.
222 	 */
223 	ASSERT(flock_zone_key != ZONE_KEY_UNINITIALIZED);
224 	return (zone_getspecific(flock_zone_key, curproc->p_zone));
225 }
226 
227 static flk_lockmgr_status_t
228 flk_get_lockmgr_status(void)
229 {
230 	struct flock_globals *fg;
231 
232 	ASSERT(MUTEX_HELD(&flock_lock));
233 
234 	if (flock_zone_key == ZONE_KEY_UNINITIALIZED) {
235 		/*
236 		 * KLM module not loaded; lock manager definitely not running.
237 		 */
238 		return (FLK_LOCKMGR_DOWN);
239 	}
240 	fg = flk_get_globals();
241 	return (fg->flk_lockmgr_status);
242 }
243 
244 /*
245  * Routine called from fs_frlock in fs/fs_subr.c
246  */
247 
248 int
249 reclock(vnode_t		*vp,
250 	flock64_t	*lckdat,
251 	int		cmd,
252 	int		flag,
253 	u_offset_t	offset,
254 	flk_callback_t	*flk_cbp)
255 {
256 	lock_descriptor_t	stack_lock_request;
257 	lock_descriptor_t	*lock_request;
258 	int error = 0;
259 	graph_t	*gp;
260 	int			nlmid;
261 
262 	/*
263 	 * Check access permissions
264 	 */
265 	if ((cmd & SETFLCK) &&
266 		((lckdat->l_type == F_RDLCK && (flag & FREAD) == 0) ||
267 		(lckdat->l_type == F_WRLCK && (flag & FWRITE) == 0)))
268 			return (EBADF);
269 
270 	/*
271 	 * for query and unlock we use the stack_lock_request
272 	 */
273 
274 	if ((lckdat->l_type == F_UNLCK) ||
275 			!((cmd & INOFLCK) || (cmd & SETFLCK))) {
276 		lock_request = &stack_lock_request;
277 		(void) bzero((caddr_t)lock_request,
278 				sizeof (lock_descriptor_t));
279 
280 		/*
281 		 * following is added to make the assertions in
282 		 * flk_execute_request() to pass through
283 		 */
284 
285 		lock_request->l_edge.edge_in_next = &lock_request->l_edge;
286 		lock_request->l_edge.edge_in_prev = &lock_request->l_edge;
287 		lock_request->l_edge.edge_adj_next = &lock_request->l_edge;
288 		lock_request->l_edge.edge_adj_prev = &lock_request->l_edge;
289 		lock_request->l_status = FLK_INITIAL_STATE;
290 	} else {
291 		lock_request = flk_get_lock();
292 	}
293 	lock_request->l_state = 0;
294 	lock_request->l_vnode = vp;
295 	lock_request->l_zoneid = getzoneid();
296 
297 	/*
298 	 * Convert the request range into the canonical start and end
299 	 * values.  The NLM protocol supports locking over the entire
300 	 * 32-bit range, so there's no range checking for remote requests,
301 	 * but we still need to verify that local requests obey the rules.
302 	 */
303 	/* Clustering */
304 	if ((cmd & (RCMDLCK | PCMDLCK)) != 0) {
305 		ASSERT(lckdat->l_whence == 0);
306 		lock_request->l_start = lckdat->l_start;
307 		lock_request->l_end = (lckdat->l_len == 0) ? MAX_U_OFFSET_T :
308 			lckdat->l_start + (lckdat->l_len - 1);
309 	} else {
310 		/* check the validity of the lock range */
311 		error = flk_convert_lock_data(vp, lckdat,
312 			&lock_request->l_start, &lock_request->l_end,
313 			offset);
314 		if (error) {
315 			goto done;
316 		}
317 		error = flk_check_lock_data(lock_request->l_start,
318 					    lock_request->l_end, MAXEND);
319 		if (error) {
320 			goto done;
321 		}
322 	}
323 
324 	ASSERT(lock_request->l_end >= lock_request->l_start);
325 
326 	lock_request->l_type = lckdat->l_type;
327 	if (cmd & INOFLCK)
328 		lock_request->l_state |= IO_LOCK;
329 	if (cmd & SLPFLCK)
330 		lock_request->l_state |= WILLING_TO_SLEEP_LOCK;
331 	if (cmd & RCMDLCK)
332 		lock_request->l_state |= LOCKMGR_LOCK;
333 	if (cmd & NBMLCK)
334 		lock_request->l_state |= NBMAND_LOCK;
335 	/*
336 	 * Clustering: set flag for PXFS locks
337 	 * We do not _only_ check for the PCMDLCK flag because PXFS locks could
338 	 * also be of type 'RCMDLCK'.
339 	 * We do not _only_ check the GETPXFSID() macro because local PXFS
340 	 * clients use a pxfsid of zero to permit deadlock detection in the LLM.
341 	 */
342 
343 	if ((cmd & PCMDLCK) || (GETPXFSID(lckdat->l_sysid) != 0)) {
344 		lock_request->l_state |= PXFS_LOCK;
345 	}
346 	if (!((cmd & SETFLCK) || (cmd & INOFLCK))) {
347 		if (lock_request->l_type == F_RDLCK ||
348 			lock_request->l_type == F_WRLCK)
349 			lock_request->l_state |= QUERY_LOCK;
350 	}
351 	lock_request->l_flock = (*lckdat);
352 	lock_request->l_callbacks = flk_cbp;
353 
354 	/*
355 	 * We are ready for processing the request
356 	 */
357 	if (IS_LOCKMGR(lock_request)) {
358 		/*
359 		 * If the lock request is an NLM server request ....
360 		 */
361 		if (nlm_status_size == 0) { /* not booted as cluster */
362 			mutex_enter(&flock_lock);
363 			/*
364 			 * Bail out if this is a lock manager request and the
365 			 * lock manager is not supposed to be running.
366 			 */
367 			if (flk_get_lockmgr_status() != FLK_LOCKMGR_UP) {
368 				mutex_exit(&flock_lock);
369 				error = ENOLCK;
370 				goto done;
371 			}
372 			mutex_exit(&flock_lock);
373 		} else {			/* booted as a cluster */
374 			nlmid = GETNLMID(lock_request->l_flock.l_sysid);
375 			ASSERT(nlmid <= nlm_status_size && nlmid >= 0);
376 
377 			mutex_enter(&nlm_reg_lock);
378 			/*
379 			 * If the NLM registry does not know about this
380 			 * NLM server making the request, add its nlmid
381 			 * to the registry.
382 			 */
383 			if (FLK_REGISTRY_IS_NLM_UNKNOWN(nlm_reg_status,
384 				nlmid)) {
385 				FLK_REGISTRY_ADD_NLMID(nlm_reg_status, nlmid);
386 			} else if (!FLK_REGISTRY_IS_NLM_UP(nlm_reg_status,
387 				nlmid)) {
388 				/*
389 				 * If the NLM server is already known (has made
390 				 * previous lock requests) and its state is
391 				 * not NLM_UP (means that NLM server is
392 				 * shutting down), then bail out with an
393 				 * error to deny the lock request.
394 				 */
395 				mutex_exit(&nlm_reg_lock);
396 				error = ENOLCK;
397 				goto done;
398 			}
399 			mutex_exit(&nlm_reg_lock);
400 		}
401 	}
402 
403 	/* Now get the lock graph for a particular vnode */
404 	gp = flk_get_lock_graph(vp, FLK_INIT_GRAPH);
405 
406 	/*
407 	 * We drop rwlock here otherwise this might end up causing a
408 	 * deadlock if this IOLOCK sleeps. (bugid # 1183392).
409 	 */
410 
411 	if (IS_IO_LOCK(lock_request)) {
412 		VOP_RWUNLOCK(vp,
413 			(lock_request->l_type == F_RDLCK) ?
414 				V_WRITELOCK_FALSE : V_WRITELOCK_TRUE, NULL);
415 	}
416 	mutex_enter(&gp->gp_mutex);
417 
418 	lock_request->l_state |= REFERENCED_LOCK;
419 	lock_request->l_graph = gp;
420 
421 	switch (lock_request->l_type) {
422 	case F_RDLCK:
423 	case F_WRLCK:
424 		if (IS_QUERY_LOCK(lock_request)) {
425 			flk_get_first_blocking_lock(lock_request);
426 			(*lckdat) = lock_request->l_flock;
427 			break;
428 		}
429 
430 		/* process the request now */
431 
432 		error = flk_process_request(lock_request);
433 		break;
434 
435 	case F_UNLCK:
436 		/* unlock request will not block so execute it immediately */
437 
438 		if (IS_LOCKMGR(lock_request) &&
439 		    flk_canceled(lock_request)) {
440 			error = 0;
441 		} else {
442 			error = flk_execute_request(lock_request);
443 		}
444 		break;
445 
446 	case F_UNLKSYS:
447 		/*
448 		 * Recovery mechanism to release lock manager locks when
449 		 * NFS client crashes and restart. NFS server will clear
450 		 * old locks and grant new locks.
451 		 */
452 
453 		if (lock_request->l_flock.l_sysid == 0) {
454 			mutex_exit(&gp->gp_mutex);
455 			return (EINVAL);
456 		}
457 		if (secpolicy_nfs(CRED()) != 0) {
458 			mutex_exit(&gp->gp_mutex);
459 			return (EPERM);
460 		}
461 		flk_delete_locks_by_sysid(lock_request);
462 		lock_request->l_state &= ~REFERENCED_LOCK;
463 		flk_set_state(lock_request, FLK_DEAD_STATE);
464 		flk_free_lock(lock_request);
465 		mutex_exit(&gp->gp_mutex);
466 		return (0);
467 
468 	default:
469 		error = EINVAL;
470 		break;
471 	}
472 
473 	/* Clustering: For blocked PXFS locks, return */
474 	if (error == PXFS_LOCK_BLOCKED) {
475 		lock_request->l_state &= ~REFERENCED_LOCK;
476 		mutex_exit(&gp->gp_mutex);
477 		return (error);
478 	}
479 
480 	/*
481 	 * Now that we have seen the status of locks in the system for
482 	 * this vnode we acquire the rwlock if it is an IO_LOCK.
483 	 */
484 
485 	if (IS_IO_LOCK(lock_request)) {
486 		(void) VOP_RWLOCK(vp,
487 			(lock_request->l_type == F_RDLCK) ?
488 				V_WRITELOCK_FALSE : V_WRITELOCK_TRUE, NULL);
489 		if (!error) {
490 			lckdat->l_type = F_UNLCK;
491 
492 			/*
493 			 * This wake up is needed otherwise
494 			 * if IO_LOCK has slept the dependents on this
495 			 * will not be woken up at all. (bugid # 1185482).
496 			 */
497 
498 			flk_wakeup(lock_request, 1);
499 			flk_set_state(lock_request, FLK_DEAD_STATE);
500 			flk_free_lock(lock_request);
501 		}
502 		/*
503 		 * else if error had occurred either flk_process_request()
504 		 * has returned EDEADLK in which case there will be no
505 		 * dependents for this lock or EINTR from flk_wait_execute_
506 		 * request() in which case flk_cancel_sleeping_lock()
507 		 * would have been done. same is true with EBADF.
508 		 */
509 	}
510 
511 	if (lock_request == &stack_lock_request) {
512 		flk_set_state(lock_request, FLK_DEAD_STATE);
513 	} else {
514 		lock_request->l_state &= ~REFERENCED_LOCK;
515 		if ((error != 0) || IS_DELETED(lock_request)) {
516 			flk_set_state(lock_request, FLK_DEAD_STATE);
517 			flk_free_lock(lock_request);
518 		}
519 	}
520 
521 	mutex_exit(&gp->gp_mutex);
522 	return (error);
523 
524 done:
525 	flk_set_state(lock_request, FLK_DEAD_STATE);
526 	if (lock_request != &stack_lock_request)
527 		flk_free_lock(lock_request);
528 	return (error);
529 }
530 
531 /*
532  * Invoke the callbacks in the given list.  If before sleeping, invoke in
533  * list order.  If after sleeping, invoke in reverse order.
534  *
535  * CPR (suspend/resume) support: if one of the callbacks returns a
536  * callb_cpr_t, return it.   This will be used to make the thread CPR-safe
537  * while it is sleeping.  There should be at most one callb_cpr_t for the
538  * thread.
539  * XXX This is unnecessarily complicated.  The CPR information should just
540  * get passed in directly through VOP_FRLOCK and reclock, rather than
541  * sneaking it in via a callback.
542  */
543 
544 callb_cpr_t *
545 flk_invoke_callbacks(flk_callback_t *cblist, flk_cb_when_t when)
546 {
547 	callb_cpr_t *cpr_callbackp = NULL;
548 	callb_cpr_t *one_result;
549 	flk_callback_t *cb;
550 
551 	if (cblist == NULL)
552 		return (NULL);
553 
554 	if (when == FLK_BEFORE_SLEEP) {
555 		cb = cblist;
556 		do {
557 			one_result = (*cb->cb_callback)(when, cb->cb_data);
558 			if (one_result != NULL) {
559 				ASSERT(cpr_callbackp == NULL);
560 				cpr_callbackp = one_result;
561 			}
562 			cb = cb->cb_next;
563 		} while (cb != cblist);
564 	} else {
565 		cb = cblist->cb_prev;
566 		do {
567 			one_result = (*cb->cb_callback)(when, cb->cb_data);
568 			if (one_result != NULL) {
569 				cpr_callbackp = one_result;
570 			}
571 			cb = cb->cb_prev;
572 		} while (cb != cblist->cb_prev);
573 	}
574 
575 	return (cpr_callbackp);
576 }
577 
578 /*
579  * Initialize a flk_callback_t to hold the given callback.
580  */
581 
582 void
583 flk_init_callback(flk_callback_t *flk_cb,
584 	callb_cpr_t *(*cb_fcn)(flk_cb_when_t, void *), void *cbdata)
585 {
586 	flk_cb->cb_next = flk_cb;
587 	flk_cb->cb_prev = flk_cb;
588 	flk_cb->cb_callback = cb_fcn;
589 	flk_cb->cb_data = cbdata;
590 }
591 
592 /*
593  * Initialize an flk_callback_t and then link it into the head of an
594  * existing list (which may be NULL).
595  */
596 
597 void
598 flk_add_callback(flk_callback_t *newcb,
599 		callb_cpr_t *(*cb_fcn)(flk_cb_when_t, void *),
600 		void *cbdata, flk_callback_t *cblist)
601 {
602 	flk_init_callback(newcb, cb_fcn, cbdata);
603 
604 	if (cblist == NULL)
605 		return;
606 
607 	newcb->cb_prev = cblist->cb_prev;
608 	newcb->cb_next = cblist;
609 	cblist->cb_prev->cb_next = newcb;
610 	cblist->cb_prev = newcb;
611 }
612 /* ONC_PLUS EXTRACT END */
613 
614 /*
615  * Initialize the flk_edge_cache data structure and create the
616  * nlm_reg_status array.
617  */
618 
619 void
620 flk_init(void)
621 {
622 	uint_t	i;
623 
624 	flk_edge_cache = kmem_cache_create("flk_edges",
625 		sizeof (struct edge), 0, NULL, NULL, NULL, NULL, NULL, 0);
626 	if (flk_edge_cache == NULL) {
627 		cmn_err(CE_PANIC, "Couldn't create flk_edge_cache\n");
628 	}
629 	/*
630 	 * Create the NLM registry object.
631 	 */
632 
633 	if (cluster_bootflags & CLUSTER_BOOTED) {
634 		/*
635 		 * This routine tells you the maximum node id that will be used
636 		 * in the cluster.  This number will be the size of the nlm
637 		 * registry status array.  We add 1 because we will be using
638 		 * all entries indexed from 0 to maxnodeid; e.g., from 0
639 		 * to 64, for a total of 65 entries.
640 		 */
641 		nlm_status_size = clconf_maximum_nodeid() + 1;
642 	} else {
643 		nlm_status_size = 0;
644 	}
645 
646 	if (nlm_status_size != 0) {	/* booted as a cluster */
647 		nlm_reg_status = (flk_nlm_status_t *)
648 			kmem_alloc(sizeof (flk_nlm_status_t) * nlm_status_size,
649 				KM_SLEEP);
650 
651 		/* initialize all NLM states in array to NLM_UNKNOWN */
652 		for (i = 0; i < nlm_status_size; i++) {
653 			nlm_reg_status[i] = FLK_NLM_UNKNOWN;
654 		}
655 	}
656 }
657 
658 /*
659  * Zone constructor/destructor callbacks to be executed when a zone is
660  * created/destroyed.
661  */
662 /* ARGSUSED */
663 void *
664 flk_zone_init(zoneid_t zoneid)
665 {
666 	struct flock_globals *fg;
667 	uint_t i;
668 
669 	fg = kmem_alloc(sizeof (*fg), KM_SLEEP);
670 	fg->flk_lockmgr_status = FLK_LOCKMGR_UP;
671 	for (i = 0; i < HASH_SIZE; i++)
672 		fg->lockmgr_status[i] = FLK_LOCKMGR_UP;
673 	return (fg);
674 }
675 
676 /* ARGSUSED */
677 void
678 flk_zone_fini(zoneid_t zoneid, void *data)
679 {
680 	struct flock_globals *fg = data;
681 
682 	kmem_free(fg, sizeof (*fg));
683 }
684 
685 /*
686  * Get a lock_descriptor structure with initialization of edge lists.
687  */
688 
689 static lock_descriptor_t *
690 flk_get_lock(void)
691 {
692 	lock_descriptor_t	*l;
693 
694 	l = kmem_zalloc(sizeof (lock_descriptor_t), KM_SLEEP);
695 
696 	cv_init(&l->l_cv, NULL, CV_DRIVER, NULL);
697 	l->l_edge.edge_in_next = &l->l_edge;
698 	l->l_edge.edge_in_prev = &l->l_edge;
699 	l->l_edge.edge_adj_next = &l->l_edge;
700 	l->l_edge.edge_adj_prev = &l->l_edge;
701 	l->pvertex = -1;
702 	l->l_status = FLK_INITIAL_STATE;
703 	flk_lock_allocs++;
704 	return (l);
705 }
706 
707 /*
708  * Free a lock_descriptor structure. Just sets the DELETED_LOCK flag
709  * when some thread has a reference to it as in reclock().
710  */
711 
712 void
713 flk_free_lock(lock_descriptor_t	*lock)
714 {
715 	ASSERT(IS_DEAD(lock));
716 	if (IS_REFERENCED(lock)) {
717 		lock->l_state |= DELETED_LOCK;
718 		return;
719 	}
720 	flk_lock_frees++;
721 	kmem_free((void *)lock, sizeof (lock_descriptor_t));
722 }
723 
724 void
725 flk_set_state(lock_descriptor_t *lock, int new_state)
726 {
727 	/*
728 	 * Locks in the sleeping list may be woken up in a number of ways,
729 	 * and more than once.  If a sleeping lock is signaled awake more
730 	 * than once, then it may or may not change state depending on its
731 	 * current state.
732 	 * Also note that NLM locks that are sleeping could be moved to an
733 	 * interrupted state more than once if the unlock request is
734 	 * retransmitted by the NLM client - the second time around, this is
735 	 * just a nop.
736 	 * The ordering of being signaled awake is:
737 	 * INTERRUPTED_STATE > CANCELLED_STATE > GRANTED_STATE.
738 	 * The checks below implement this ordering.
739 	 */
740 	if (IS_INTERRUPTED(lock)) {
741 		if ((new_state == FLK_CANCELLED_STATE) ||
742 		    (new_state == FLK_GRANTED_STATE) ||
743 		    (new_state == FLK_INTERRUPTED_STATE)) {
744 			return;
745 		}
746 	}
747 	if (IS_CANCELLED(lock)) {
748 		if ((new_state == FLK_GRANTED_STATE) ||
749 		    (new_state == FLK_CANCELLED_STATE)) {
750 			return;
751 		}
752 	}
753 	CHECK_LOCK_TRANSITION(lock->l_status, new_state);
754 	if (IS_PXFS(lock)) {
755 		cl_flk_state_transition_notify(lock, lock->l_status, new_state);
756 	}
757 	lock->l_status = new_state;
758 }
759 
760 /*
761  * Routine that checks whether there are any blocking locks in the system.
762  *
763  * The policy followed is if a write lock is sleeping we don't allow read
764  * locks before this write lock even though there may not be any active
765  * locks corresponding to the read locks' region.
766  *
767  * flk_add_edge() function adds an edge between l1 and l2 iff there
768  * is no path between l1 and l2. This is done to have a "minimum
769  * storage representation" of the dependency graph.
770  *
771  * Another property of the graph is since only the new request throws
772  * edges to the existing locks in the graph, the graph is always topologically
773  * ordered.
774  */
775 
776 static int
777 flk_process_request(lock_descriptor_t *request)
778 {
779 	graph_t	*gp = request->l_graph;
780 	lock_descriptor_t *lock;
781 	int request_blocked_by_active = 0;
782 	int request_blocked_by_granted = 0;
783 	int request_blocked_by_sleeping = 0;
784 	vnode_t	*vp = request->l_vnode;
785 	int	error = 0;
786 	int request_will_wait = 0;
787 	int found_covering_lock = 0;
788 	lock_descriptor_t *covered_by = NULL;
789 
790 	ASSERT(MUTEX_HELD(&gp->gp_mutex));
791 	request_will_wait = IS_WILLING_TO_SLEEP(request);
792 
793 	/*
794 	 * check active locks
795 	 */
796 
797 	SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
798 
799 
800 	if (lock) {
801 		do {
802 			if (BLOCKS(lock, request)) {
803 				if (!request_will_wait)
804 					return (EAGAIN);
805 				request_blocked_by_active = 1;
806 				break;
807 			}
808 			/*
809 			 * Grant lock if it is for the same owner holding active
810 			 * lock that covers the request.
811 			 */
812 
813 			if (SAME_OWNER(lock, request) &&
814 					COVERS(lock, request) &&
815 						(request->l_type == F_RDLCK))
816 				return (flk_execute_request(request));
817 			lock = lock->l_next;
818 		} while (lock->l_vnode == vp);
819 	}
820 
821 	if (!request_blocked_by_active) {
822 			lock_descriptor_t *lk[1];
823 			lock_descriptor_t *first_glock = NULL;
824 		/*
825 		 * Shall we grant this?! NO!!
826 		 * What about those locks that were just granted and still
827 		 * in sleep queue. Those threads are woken up and so locks
828 		 * are almost active.
829 		 */
830 		SET_LOCK_TO_FIRST_SLEEP_VP(gp, lock, vp);
831 		if (lock) {
832 			do {
833 				if (BLOCKS(lock, request)) {
834 					if (IS_GRANTED(lock)) {
835 						request_blocked_by_granted = 1;
836 					} else {
837 						request_blocked_by_sleeping = 1;
838 					}
839 				}
840 
841 				lock = lock->l_next;
842 			} while ((lock->l_vnode == vp));
843 			first_glock = lock->l_prev;
844 			ASSERT(first_glock->l_vnode == vp);
845 		}
846 
847 		if (request_blocked_by_granted)
848 			goto block;
849 
850 		if (!request_blocked_by_sleeping) {
851 			/*
852 			 * If the request isn't going to be blocked by a
853 			 * sleeping request, we know that it isn't going to
854 			 * be blocked; we can just execute the request --
855 			 * without performing costly deadlock detection.
856 			 */
857 			ASSERT(!request_blocked_by_active);
858 			return (flk_execute_request(request));
859 		} else if (request->l_type == F_RDLCK) {
860 			/*
861 			 * If we have a sleeping writer in the requested
862 			 * lock's range, block.
863 			 */
864 			goto block;
865 		}
866 
867 		lk[0] = request;
868 		request->l_state |= RECOMPUTE_LOCK;
869 		SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
870 		if (lock) {
871 			do {
872 				flk_recompute_dependencies(lock, lk, 1, 0);
873 				lock = lock->l_next;
874 			} while (lock->l_vnode == vp);
875 		}
876 		lock = first_glock;
877 		if (lock) {
878 			do {
879 				if (IS_GRANTED(lock)) {
880 				flk_recompute_dependencies(lock, lk, 1, 0);
881 				}
882 				lock = lock->l_prev;
883 			} while ((lock->l_vnode == vp));
884 		}
885 		request->l_state &= ~RECOMPUTE_LOCK;
886 		if (!NO_DEPENDENTS(request) && flk_check_deadlock(request))
887 			return (EDEADLK);
888 		return (flk_execute_request(request));
889 	}
890 
891 block:
892 	if (request_will_wait)
893 		flk_graph_uncolor(gp);
894 
895 	/* check sleeping locks */
896 
897 	SET_LOCK_TO_FIRST_SLEEP_VP(gp, lock, vp);
898 
899 	/*
900 	 * If we find a sleeping write lock that is a superset of the
901 	 * region wanted by request we can be assured that by adding an
902 	 * edge to this write lock we have paths to all locks in the
903 	 * graph that blocks the request except in one case and that is why
904 	 * another check for SAME_OWNER in the loop below. The exception
905 	 * case is when this process that owns the sleeping write lock 'l1'
906 	 * has other locks l2, l3, l4 that are in the system and arrived
907 	 * before l1. l1 does not have path to these locks as they are from
908 	 * same process. We break when we find a second covering sleeping
909 	 * lock l5 owned by a process different from that owning l1, because
910 	 * there cannot be any of l2, l3, l4, etc., arrived before l5, and if
911 	 * it has l1 would have produced a deadlock already.
912 	 */
913 
914 	if (lock) {
915 		do {
916 			if (BLOCKS(lock, request)) {
917 				if (!request_will_wait)
918 					return (EAGAIN);
919 				if (COVERS(lock, request) &&
920 						lock->l_type == F_WRLCK) {
921 					if (found_covering_lock &&
922 					    !SAME_OWNER(lock, covered_by)) {
923 						found_covering_lock++;
924 						break;
925 					}
926 					found_covering_lock = 1;
927 					covered_by = lock;
928 				}
929 				if (found_covering_lock &&
930 					!SAME_OWNER(lock, covered_by)) {
931 					lock = lock->l_next;
932 					continue;
933 				}
934 				if ((error = flk_add_edge(request, lock,
935 						!found_covering_lock, 0)))
936 					return (error);
937 			}
938 			lock = lock->l_next;
939 		} while (lock->l_vnode == vp);
940 	}
941 
942 /*
943  * found_covering_lock == 2 iff at this point 'request' has paths
944  * to all locks that blocks 'request'. found_covering_lock == 1 iff at this
945  * point 'request' has paths to all locks that blocks 'request' whose owners
946  * are not same as the one that covers 'request' (covered_by above) and
947  * we can have locks whose owner is same as covered_by in the active list.
948  */
949 
950 	if (request_blocked_by_active && found_covering_lock != 2) {
951 		SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
952 		ASSERT(lock != NULL);
953 		do {
954 			if (BLOCKS(lock, request)) {
955 				if (found_covering_lock &&
956 					!SAME_OWNER(lock, covered_by)) {
957 					lock = lock->l_next;
958 					continue;
959 				}
960 				if ((error = flk_add_edge(request, lock,
961 							CHECK_CYCLE, 0)))
962 					return (error);
963 			}
964 			lock = lock->l_next;
965 		} while (lock->l_vnode == vp);
966 	}
967 
968 	if (NOT_BLOCKED(request)) {
969 		/*
970 		 * request not dependent on any other locks
971 		 * so execute this request
972 		 */
973 		return (flk_execute_request(request));
974 	} else {
975 		/*
976 		 * check for deadlock
977 		 */
978 		if (flk_check_deadlock(request))
979 			return (EDEADLK);
980 		/*
981 		 * this thread has to sleep
982 		 */
983 		return (flk_wait_execute_request(request));
984 	}
985 }
986 
987 /* ONC_PLUS EXTRACT START */
988 /*
989  * The actual execution of the request in the simple case is only to
990  * insert the 'request' in the list of active locks if it is not an
991  * UNLOCK.
992  * We have to consider the existing active locks' relation to
993  * this 'request' if they are owned by same process. flk_relation() does
994  * this job and sees to that the dependency graph information is maintained
995  * properly.
996  */
997 
998 int
999 flk_execute_request(lock_descriptor_t *request)
1000 {
1001 	graph_t	*gp = request->l_graph;
1002 	vnode_t	*vp = request->l_vnode;
1003 	lock_descriptor_t	*lock, *lock1;
1004 	int done_searching = 0;
1005 
1006 	CHECK_SLEEPING_LOCKS(gp);
1007 	CHECK_ACTIVE_LOCKS(gp);
1008 
1009 	ASSERT(MUTEX_HELD(&gp->gp_mutex));
1010 
1011 	flk_set_state(request, FLK_START_STATE);
1012 
1013 	ASSERT(NOT_BLOCKED(request));
1014 
1015 	/* IO_LOCK requests are only to check status */
1016 
1017 	if (IS_IO_LOCK(request))
1018 		return (0);
1019 
1020 	SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
1021 
1022 	if (lock == NULL && request->l_type == F_UNLCK)
1023 		return (0);
1024 	if (lock == NULL) {
1025 		flk_insert_active_lock(request);
1026 		return (0);
1027 	}
1028 
1029 	do {
1030 		lock1 = lock->l_next;
1031 		if (SAME_OWNER(request, lock)) {
1032 			done_searching = flk_relation(lock, request);
1033 		}
1034 		lock = lock1;
1035 	} while (lock->l_vnode == vp && !done_searching);
1036 
1037 	/*
1038 	 * insert in active queue
1039 	 */
1040 
1041 	if (request->l_type != F_UNLCK)
1042 		flk_insert_active_lock(request);
1043 
1044 	return (0);
1045 }
1046 /* ONC_PLUS EXTRACT END */
1047 
1048 /*
1049  * 'request' is blocked by some one therefore we put it into sleep queue.
1050  */
1051 static int
1052 flk_wait_execute_request(lock_descriptor_t *request)
1053 {
1054 	graph_t	*gp = request->l_graph;
1055 	callb_cpr_t 	*cprp;		/* CPR info from callback */
1056 	struct flock_globals *fg;
1057 	int index;
1058 
1059 	ASSERT(MUTEX_HELD(&gp->gp_mutex));
1060 	ASSERT(IS_WILLING_TO_SLEEP(request));
1061 
1062 	flk_insert_sleeping_lock(request);
1063 
1064 	if (IS_LOCKMGR(request)) {
1065 		index = HASH_INDEX(request->l_vnode);
1066 		fg = flk_get_globals();
1067 
1068 		if (nlm_status_size == 0) {	/* not booted as a cluster */
1069 			if (fg->lockmgr_status[index] != FLK_LOCKMGR_UP) {
1070 				flk_cancel_sleeping_lock(request, 1);
1071 				return (ENOLCK);
1072 			}
1073 		} else {			/* booted as a cluster */
1074 			/*
1075 			 * If the request is an NLM server lock request,
1076 			 * and the NLM state of the lock request is not
1077 			 * NLM_UP (because the NLM server is shutting
1078 			 * down), then cancel the sleeping lock and
1079 			 * return error ENOLCK that will encourage the
1080 			 * client to retransmit.
1081 			 */
1082 			if (!IS_NLM_UP(request)) {
1083 				flk_cancel_sleeping_lock(request, 1);
1084 				return (ENOLCK);
1085 			}
1086 		}
1087 	}
1088 
1089 	/* Clustering: For blocking PXFS locks, return */
1090 	if (IS_PXFS(request)) {
1091 		/*
1092 		 * PXFS locks sleep on the client side.
1093 		 * The callback argument is used to wake up the sleeper
1094 		 * when the lock is granted.
1095 		 * We return -1 (rather than an errno value) to indicate
1096 		 * the client side should sleep
1097 		 */
1098 		return (PXFS_LOCK_BLOCKED);
1099 	}
1100 
1101 	if (request->l_callbacks != NULL) {
1102 		/*
1103 		 * To make sure the shutdown code works correctly, either
1104 		 * the callback must happen after putting the lock on the
1105 		 * sleep list, or we must check the shutdown status after
1106 		 * returning from the callback (and before sleeping).  At
1107 		 * least for now, we'll use the first option.  If a
1108 		 * shutdown or signal or whatever happened while the graph
1109 		 * mutex was dropped, that will be detected by
1110 		 * wait_for_lock().
1111 		 */
1112 		mutex_exit(&gp->gp_mutex);
1113 
1114 		cprp = flk_invoke_callbacks(request->l_callbacks,
1115 					    FLK_BEFORE_SLEEP);
1116 
1117 		mutex_enter(&gp->gp_mutex);
1118 
1119 		if (cprp == NULL) {
1120 			wait_for_lock(request);
1121 		} else {
1122 			mutex_enter(cprp->cc_lockp);
1123 			CALLB_CPR_SAFE_BEGIN(cprp);
1124 			mutex_exit(cprp->cc_lockp);
1125 			wait_for_lock(request);
1126 			mutex_enter(cprp->cc_lockp);
1127 			CALLB_CPR_SAFE_END(cprp, cprp->cc_lockp);
1128 			mutex_exit(cprp->cc_lockp);
1129 		}
1130 
1131 		mutex_exit(&gp->gp_mutex);
1132 		(void) flk_invoke_callbacks(request->l_callbacks,
1133 					    FLK_AFTER_SLEEP);
1134 		mutex_enter(&gp->gp_mutex);
1135 	} else {
1136 		wait_for_lock(request);
1137 	}
1138 
1139 	if (IS_LOCKMGR(request)) {
1140 		/*
1141 		 * If the lock manager is shutting down, return an
1142 		 * error that will encourage the client to retransmit.
1143 		 */
1144 		if (fg->lockmgr_status[index] != FLK_LOCKMGR_UP &&
1145 			!IS_GRANTED(request)) {
1146 			flk_cancel_sleeping_lock(request, 1);
1147 			return (ENOLCK);
1148 		}
1149 	}
1150 
1151 	if (IS_INTERRUPTED(request)) {
1152 		/* we got a signal, or act like we did */
1153 		flk_cancel_sleeping_lock(request, 1);
1154 		return (EINTR);
1155 	}
1156 
1157 	/* Cancelled if some other thread has closed the file */
1158 
1159 	if (IS_CANCELLED(request)) {
1160 		flk_cancel_sleeping_lock(request, 1);
1161 		return (EBADF);
1162 	}
1163 
1164 	request->l_state &= ~GRANTED_LOCK;
1165 	REMOVE_SLEEP_QUEUE(request);
1166 	return (flk_execute_request(request));
1167 }
1168 
1169 /*
1170  * This routine adds an edge between from and to because from depends
1171  * to. If asked to check for deadlock it checks whether there are any
1172  * reachable locks from "from_lock" that is owned by the same process
1173  * as "from_lock".
1174  * NOTE: It is the caller's responsibility to make sure that the color
1175  * of the graph is consistent between the calls to flk_add_edge as done
1176  * in flk_process_request. This routine does not color and check for
1177  * deadlock explicitly.
1178  */
1179 
1180 static int
1181 flk_add_edge(lock_descriptor_t *from_lock, lock_descriptor_t *to_lock,
1182 			int check_cycle, int update_graph)
1183 {
1184 	edge_t	*edge;
1185 	edge_t	*ep;
1186 	lock_descriptor_t	*vertex;
1187 	lock_descriptor_t *vertex_stack;
1188 
1189 	STACK_INIT(vertex_stack);
1190 
1191 	/*
1192 	 * if to vertex already has mark_color just return
1193 	 * don't add an edge as it is reachable from from vertex
1194 	 * before itself.
1195 	 */
1196 
1197 	if (COLORED(to_lock))
1198 		return (0);
1199 
1200 	edge = flk_get_edge();
1201 
1202 	/*
1203 	 * set the from and to vertex
1204 	 */
1205 
1206 	edge->from_vertex = from_lock;
1207 	edge->to_vertex = to_lock;
1208 
1209 	/*
1210 	 * put in adjacency list of from vertex
1211 	 */
1212 
1213 	from_lock->l_edge.edge_adj_next->edge_adj_prev = edge;
1214 	edge->edge_adj_next = from_lock->l_edge.edge_adj_next;
1215 	edge->edge_adj_prev = &from_lock->l_edge;
1216 	from_lock->l_edge.edge_adj_next = edge;
1217 
1218 	/*
1219 	 * put in in list of to vertex
1220 	 */
1221 
1222 	to_lock->l_edge.edge_in_next->edge_in_prev = edge;
1223 	edge->edge_in_next = to_lock->l_edge.edge_in_next;
1224 	to_lock->l_edge.edge_in_next = edge;
1225 	edge->edge_in_prev = &to_lock->l_edge;
1226 
1227 
1228 	if (update_graph) {
1229 		flk_update_proc_graph(edge, 0);
1230 		return (0);
1231 	}
1232 	if (!check_cycle) {
1233 		return (0);
1234 	}
1235 
1236 	STACK_PUSH(vertex_stack, from_lock, l_stack);
1237 
1238 	while ((vertex = STACK_TOP(vertex_stack)) != NULL) {
1239 
1240 		STACK_POP(vertex_stack, l_stack);
1241 
1242 		for (ep = FIRST_ADJ(vertex);
1243 			ep != HEAD(vertex);
1244 				ep = NEXT_ADJ(ep)) {
1245 			if (COLORED(ep->to_vertex))
1246 				continue;
1247 			COLOR(ep->to_vertex);
1248 			if (SAME_OWNER(ep->to_vertex, from_lock))
1249 				goto dead_lock;
1250 			STACK_PUSH(vertex_stack, ep->to_vertex, l_stack);
1251 		}
1252 	}
1253 	return (0);
1254 
1255 dead_lock:
1256 
1257 	/*
1258 	 * remove all edges
1259 	 */
1260 
1261 	ep = FIRST_ADJ(from_lock);
1262 
1263 	while (ep != HEAD(from_lock)) {
1264 		IN_LIST_REMOVE(ep);
1265 		from_lock->l_sedge = NEXT_ADJ(ep);
1266 		ADJ_LIST_REMOVE(ep);
1267 		flk_free_edge(ep);
1268 		ep = from_lock->l_sedge;
1269 	}
1270 	return (EDEADLK);
1271 }
1272 
1273 /*
1274  * Get an edge structure for representing the dependency between two locks.
1275  */
1276 
1277 static edge_t *
1278 flk_get_edge()
1279 {
1280 	edge_t	*ep;
1281 
1282 	ASSERT(flk_edge_cache != NULL);
1283 
1284 	ep = kmem_cache_alloc(flk_edge_cache, KM_SLEEP);
1285 	edge_allocs++;
1286 	return (ep);
1287 }
1288 
1289 /*
1290  * Free the edge structure.
1291  */
1292 
1293 static void
1294 flk_free_edge(edge_t *ep)
1295 {
1296 	edge_frees++;
1297 	kmem_cache_free(flk_edge_cache, (void *)ep);
1298 }
1299 
1300 /*
1301  * Check the relationship of request with lock and perform the
1302  * recomputation of dependencies, break lock if required, and return
1303  * 1 if request cannot have any more relationship with the next
1304  * active locks.
1305  * The 'lock' and 'request' are compared and in case of overlap we
1306  * delete the 'lock' and form new locks to represent the non-overlapped
1307  * portion of original 'lock'. This function has side effects such as
1308  * 'lock' will be freed, new locks will be added to the active list.
1309  */
1310 
1311 static int
1312 flk_relation(lock_descriptor_t *lock, lock_descriptor_t *request)
1313 {
1314 	int lock_effect;
1315 	lock_descriptor_t *lock1, *lock2;
1316 	lock_descriptor_t *topology[3];
1317 	int nvertex = 0;
1318 	int i;
1319 	edge_t	*ep;
1320 	graph_t	*gp = (lock->l_graph);
1321 
1322 
1323 	CHECK_SLEEPING_LOCKS(gp);
1324 	CHECK_ACTIVE_LOCKS(gp);
1325 
1326 	ASSERT(MUTEX_HELD(&gp->gp_mutex));
1327 
1328 	topology[0] = topology[1] = topology[2] = NULL;
1329 
1330 	if (request->l_type == F_UNLCK)
1331 		lock_effect = FLK_UNLOCK;
1332 	else if (request->l_type == F_RDLCK &&
1333 			lock->l_type == F_WRLCK)
1334 		lock_effect = FLK_DOWNGRADE;
1335 	else if (request->l_type == F_WRLCK &&
1336 			lock->l_type == F_RDLCK)
1337 		lock_effect = FLK_UPGRADE;
1338 	else
1339 		lock_effect = FLK_STAY_SAME;
1340 
1341 	if (lock->l_end < request->l_start) {
1342 		if (lock->l_end == request->l_start - 1 &&
1343 				lock_effect == FLK_STAY_SAME) {
1344 			topology[0] = request;
1345 			request->l_start = lock->l_start;
1346 			nvertex = 1;
1347 			goto recompute;
1348 		} else {
1349 			return (0);
1350 		}
1351 	}
1352 
1353 	if (lock->l_start > request->l_end) {
1354 		if (request->l_end == lock->l_start - 1 &&
1355 					lock_effect == FLK_STAY_SAME) {
1356 			topology[0] = request;
1357 			request->l_end = lock->l_end;
1358 			nvertex = 1;
1359 			goto recompute;
1360 		} else {
1361 			return (1);
1362 		}
1363 	}
1364 
1365 	if (request->l_end < lock->l_end) {
1366 		if (request->l_start > lock->l_start) {
1367 			if (lock_effect == FLK_STAY_SAME) {
1368 				request->l_start = lock->l_start;
1369 				request->l_end = lock->l_end;
1370 				topology[0] = request;
1371 				nvertex = 1;
1372 			} else {
1373 				lock1 = flk_get_lock();
1374 				lock2 = flk_get_lock();
1375 				COPY(lock1, lock);
1376 				COPY(lock2, lock);
1377 				lock1->l_start = lock->l_start;
1378 				lock1->l_end = request->l_start - 1;
1379 				lock2->l_start = request->l_end + 1;
1380 				lock2->l_end = lock->l_end;
1381 				topology[0] = lock1;
1382 				topology[1] = lock2;
1383 				topology[2] = request;
1384 				nvertex = 3;
1385 			}
1386 		} else if (request->l_start < lock->l_start) {
1387 			if (lock_effect == FLK_STAY_SAME) {
1388 				request->l_end = lock->l_end;
1389 				topology[0] = request;
1390 				nvertex = 1;
1391 			} else {
1392 				lock1 = flk_get_lock();
1393 				COPY(lock1, lock);
1394 				lock1->l_start = request->l_end + 1;
1395 				topology[0] = lock1;
1396 				topology[1] = request;
1397 				nvertex = 2;
1398 			}
1399 		} else  {
1400 			if (lock_effect == FLK_STAY_SAME) {
1401 				request->l_start = lock->l_start;
1402 				request->l_end = lock->l_end;
1403 				topology[0] = request;
1404 				nvertex = 1;
1405 			} else {
1406 				lock1 = flk_get_lock();
1407 				COPY(lock1, lock);
1408 				lock1->l_start = request->l_end + 1;
1409 				topology[0] = lock1;
1410 				topology[1] = request;
1411 				nvertex = 2;
1412 			}
1413 		}
1414 	} else if (request->l_end > lock->l_end) {
1415 		if (request->l_start > lock->l_start)  {
1416 			if (lock_effect == FLK_STAY_SAME) {
1417 				request->l_start = lock->l_start;
1418 				topology[0] = request;
1419 				nvertex = 1;
1420 			} else {
1421 				lock1 = flk_get_lock();
1422 				COPY(lock1, lock);
1423 				lock1->l_end = request->l_start - 1;
1424 				topology[0] = lock1;
1425 				topology[1] = request;
1426 				nvertex = 2;
1427 			}
1428 		} else if (request->l_start < lock->l_start)  {
1429 			topology[0] = request;
1430 			nvertex = 1;
1431 		} else {
1432 			topology[0] = request;
1433 			nvertex = 1;
1434 		}
1435 	} else {
1436 		if (request->l_start > lock->l_start) {
1437 			if (lock_effect == FLK_STAY_SAME) {
1438 				request->l_start = lock->l_start;
1439 				topology[0] = request;
1440 				nvertex = 1;
1441 			} else {
1442 				lock1 = flk_get_lock();
1443 				COPY(lock1, lock);
1444 				lock1->l_end = request->l_start - 1;
1445 				topology[0] = lock1;
1446 				topology[1] = request;
1447 				nvertex = 2;
1448 			}
1449 		} else if (request->l_start < lock->l_start) {
1450 			topology[0] = request;
1451 			nvertex = 1;
1452 		} else {
1453 			if (lock_effect !=  FLK_UNLOCK) {
1454 				topology[0] = request;
1455 				nvertex = 1;
1456 			} else {
1457 				flk_delete_active_lock(lock, 0);
1458 				flk_wakeup(lock, 1);
1459 				flk_free_lock(lock);
1460 				CHECK_SLEEPING_LOCKS(gp);
1461 				CHECK_ACTIVE_LOCKS(gp);
1462 				return (1);
1463 			}
1464 		}
1465 	}
1466 
1467 recompute:
1468 
1469 	/*
1470 	 * For unlock we don't send the 'request' to for recomputing
1471 	 * dependencies because no lock will add an edge to this.
1472 	 */
1473 
1474 	if (lock_effect == FLK_UNLOCK) {
1475 		topology[nvertex-1] = NULL;
1476 		nvertex--;
1477 	}
1478 	for (i = 0; i < nvertex; i++) {
1479 		topology[i]->l_state |= RECOMPUTE_LOCK;
1480 		topology[i]->l_color = NO_COLOR;
1481 	}
1482 
1483 	ASSERT(FIRST_ADJ(lock) == HEAD(lock));
1484 
1485 	/*
1486 	 * we remove the adjacent edges for all vertices' to this vertex
1487 	 * 'lock'.
1488 	 */
1489 
1490 	ep = FIRST_IN(lock);
1491 	while (ep != HEAD(lock)) {
1492 		ADJ_LIST_REMOVE(ep);
1493 		ep = NEXT_IN(ep);
1494 	}
1495 
1496 	flk_delete_active_lock(lock, 0);
1497 
1498 	/* We are ready for recomputing the dependencies now */
1499 
1500 	flk_recompute_dependencies(lock, topology, nvertex, 1);
1501 
1502 	for (i = 0; i < nvertex; i++) {
1503 		topology[i]->l_state &= ~RECOMPUTE_LOCK;
1504 		topology[i]->l_color = NO_COLOR;
1505 	}
1506 
1507 
1508 	if (lock_effect == FLK_UNLOCK) {
1509 		nvertex++;
1510 	}
1511 	for (i = 0; i < nvertex - 1; i++) {
1512 		flk_insert_active_lock(topology[i]);
1513 	}
1514 
1515 
1516 	if (lock_effect == FLK_DOWNGRADE || lock_effect == FLK_UNLOCK) {
1517 		flk_wakeup(lock, 0);
1518 	} else {
1519 		ep = FIRST_IN(lock);
1520 		while (ep != HEAD(lock)) {
1521 			lock->l_sedge = NEXT_IN(ep);
1522 			IN_LIST_REMOVE(ep);
1523 			flk_update_proc_graph(ep, 1);
1524 			flk_free_edge(ep);
1525 			ep = lock->l_sedge;
1526 		}
1527 	}
1528 	flk_free_lock(lock);
1529 
1530 	CHECK_SLEEPING_LOCKS(gp);
1531 	CHECK_ACTIVE_LOCKS(gp);
1532 	return (0);
1533 }
1534 
1535 /*
1536  * Insert a lock into the active queue.
1537  */
1538 
1539 static void
1540 flk_insert_active_lock(lock_descriptor_t *new_lock)
1541 {
1542 	graph_t	*gp = new_lock->l_graph;
1543 	vnode_t	*vp = new_lock->l_vnode;
1544 	lock_descriptor_t *first_lock, *lock;
1545 
1546 	ASSERT(MUTEX_HELD(&gp->gp_mutex));
1547 
1548 	SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
1549 	first_lock = lock;
1550 
1551 	if (first_lock != NULL) {
1552 		for (; (lock->l_vnode == vp &&
1553 			lock->l_start < new_lock->l_start); lock = lock->l_next)
1554 			;
1555 	} else {
1556 		lock = ACTIVE_HEAD(gp);
1557 	}
1558 
1559 	lock->l_prev->l_next = new_lock;
1560 	new_lock->l_next = lock;
1561 	new_lock->l_prev = lock->l_prev;
1562 	lock->l_prev = new_lock;
1563 
1564 	if (first_lock == NULL || (new_lock->l_start <= first_lock->l_start)) {
1565 		vp->v_filocks = (struct filock *)new_lock;
1566 	}
1567 	flk_set_state(new_lock, FLK_ACTIVE_STATE);
1568 	new_lock->l_state |= ACTIVE_LOCK;
1569 
1570 	CHECK_ACTIVE_LOCKS(gp);
1571 	CHECK_SLEEPING_LOCKS(gp);
1572 }
1573 
1574 /*
1575  * Delete the active lock : Performs two functions depending on the
1576  * value of second parameter. One is to remove from the active lists
1577  * only and other is to both remove and free the lock.
1578  */
1579 
1580 static void
1581 flk_delete_active_lock(lock_descriptor_t *lock, int free_lock)
1582 {
1583 	vnode_t *vp = lock->l_vnode;
1584 	graph_t	*gp = lock->l_graph;
1585 
1586 	ASSERT(MUTEX_HELD(&gp->gp_mutex));
1587 	if (free_lock)
1588 		ASSERT(NO_DEPENDENTS(lock));
1589 	ASSERT(NOT_BLOCKED(lock));
1590 	ASSERT(IS_ACTIVE(lock));
1591 
1592 	ASSERT((vp->v_filocks != NULL));
1593 
1594 	if (vp->v_filocks == (struct filock *)lock) {
1595 		vp->v_filocks = (struct filock *)
1596 				((lock->l_next->l_vnode == vp) ? lock->l_next :
1597 								NULL);
1598 	}
1599 	lock->l_next->l_prev = lock->l_prev;
1600 	lock->l_prev->l_next = lock->l_next;
1601 	lock->l_next = lock->l_prev = NULL;
1602 	flk_set_state(lock, FLK_DEAD_STATE);
1603 	lock->l_state &= ~ACTIVE_LOCK;
1604 
1605 	if (free_lock)
1606 		flk_free_lock(lock);
1607 	CHECK_ACTIVE_LOCKS(gp);
1608 	CHECK_SLEEPING_LOCKS(gp);
1609 }
1610 
1611 /*
1612  * Insert into the sleep queue.
1613  */
1614 
1615 static void
1616 flk_insert_sleeping_lock(lock_descriptor_t *request)
1617 {
1618 	graph_t *gp = request->l_graph;
1619 	vnode_t	*vp = request->l_vnode;
1620 	lock_descriptor_t	*lock;
1621 
1622 	ASSERT(MUTEX_HELD(&gp->gp_mutex));
1623 	ASSERT(IS_INITIAL(request));
1624 
1625 	for (lock = gp->sleeping_locks.l_next; (lock != &gp->sleeping_locks &&
1626 		lock->l_vnode < vp); lock = lock->l_next)
1627 		;
1628 
1629 	lock->l_prev->l_next = request;
1630 	request->l_prev = lock->l_prev;
1631 	lock->l_prev = request;
1632 	request->l_next = lock;
1633 	flk_set_state(request, FLK_SLEEPING_STATE);
1634 	request->l_state |= SLEEPING_LOCK;
1635 }
1636 
1637 /*
1638  * Cancelling a sleeping lock implies removing a vertex from the
1639  * dependency graph and therefore we should recompute the dependencies
1640  * of all vertices that have a path  to this vertex, w.r.t. all
1641  * vertices reachable from this vertex.
1642  */
1643 
1644 void
1645 flk_cancel_sleeping_lock(lock_descriptor_t *request, int remove_from_queue)
1646 {
1647 	graph_t	*gp = request->l_graph;
1648 	vnode_t *vp = request->l_vnode;
1649 	lock_descriptor_t **topology = NULL;
1650 	edge_t	*ep;
1651 	lock_descriptor_t *vertex, *lock;
1652 	int nvertex = 0;
1653 	int i;
1654 	lock_descriptor_t *vertex_stack;
1655 
1656 	STACK_INIT(vertex_stack);
1657 
1658 	ASSERT(MUTEX_HELD(&gp->gp_mutex));
1659 	/*
1660 	 * count number of vertex pointers that has to be allocated
1661 	 * All vertices that are reachable from request.
1662 	 */
1663 
1664 	STACK_PUSH(vertex_stack, request, l_stack);
1665 
1666 	while ((vertex = STACK_TOP(vertex_stack)) != NULL) {
1667 		STACK_POP(vertex_stack, l_stack);
1668 		for (ep = FIRST_ADJ(vertex); ep != HEAD(vertex);
1669 					ep = NEXT_ADJ(ep)) {
1670 			if (IS_RECOMPUTE(ep->to_vertex))
1671 				continue;
1672 			ep->to_vertex->l_state |= RECOMPUTE_LOCK;
1673 			STACK_PUSH(vertex_stack, ep->to_vertex, l_stack);
1674 			nvertex++;
1675 		}
1676 	}
1677 
1678 	/*
1679 	 * allocate memory for holding the vertex pointers
1680 	 */
1681 
1682 	if (nvertex) {
1683 		topology = kmem_zalloc(nvertex * sizeof (lock_descriptor_t *),
1684 						KM_SLEEP);
1685 	}
1686 
1687 	/*
1688 	 * one more pass to actually store the vertices in the
1689 	 * allocated array.
1690 	 * We first check sleeping locks and then active locks
1691 	 * so that topology array will be in a topological
1692 	 * order.
1693 	 */
1694 
1695 	nvertex = 0;
1696 	SET_LOCK_TO_FIRST_SLEEP_VP(gp, lock, vp);
1697 
1698 	if (lock) {
1699 		do {
1700 			if (IS_RECOMPUTE(lock)) {
1701 				lock->l_index = nvertex;
1702 				topology[nvertex++] = lock;
1703 			}
1704 			lock->l_color = NO_COLOR;
1705 			lock = lock->l_next;
1706 		} while (lock->l_vnode == vp);
1707 	}
1708 
1709 	SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
1710 
1711 	if (lock) {
1712 		do {
1713 			if (IS_RECOMPUTE(lock)) {
1714 				lock->l_index = nvertex;
1715 				topology[nvertex++] = lock;
1716 			}
1717 			lock->l_color = NO_COLOR;
1718 			lock = lock->l_next;
1719 		} while (lock->l_vnode == vp);
1720 	}
1721 
1722 	/*
1723 	 * remove in and out edges of request
1724 	 * They are freed after updating proc_graph below.
1725 	 */
1726 
1727 	for (ep = FIRST_IN(request); ep != HEAD(request); ep = NEXT_IN(ep)) {
1728 		ADJ_LIST_REMOVE(ep);
1729 	}
1730 
1731 
1732 	if (remove_from_queue)
1733 		REMOVE_SLEEP_QUEUE(request);
1734 
1735 	/* we are ready to recompute */
1736 
1737 	flk_recompute_dependencies(request, topology, nvertex, 1);
1738 
1739 	ep = FIRST_ADJ(request);
1740 	while (ep != HEAD(request)) {
1741 		IN_LIST_REMOVE(ep);
1742 		request->l_sedge = NEXT_ADJ(ep);
1743 		ADJ_LIST_REMOVE(ep);
1744 		flk_update_proc_graph(ep, 1);
1745 		flk_free_edge(ep);
1746 		ep = request->l_sedge;
1747 	}
1748 
1749 
1750 	/*
1751 	 * unset the RECOMPUTE flag in those vertices
1752 	 */
1753 
1754 	for (i = 0; i < nvertex; i++) {
1755 		topology[i]->l_state &= ~RECOMPUTE_LOCK;
1756 	}
1757 
1758 	/*
1759 	 * free the topology
1760 	 */
1761 	if (nvertex)
1762 		kmem_free((void *)topology,
1763 			(nvertex * sizeof (lock_descriptor_t *)));
1764 	/*
1765 	 * Possibility of some locks unblocked now
1766 	 */
1767 
1768 	flk_wakeup(request, 0);
1769 
1770 	/*
1771 	 * we expect to have a correctly recomputed graph  now.
1772 	 */
1773 	flk_set_state(request, FLK_DEAD_STATE);
1774 	flk_free_lock(request);
1775 	CHECK_SLEEPING_LOCKS(gp);
1776 	CHECK_ACTIVE_LOCKS(gp);
1777 
1778 }
1779 
1780 /*
1781  * Uncoloring the graph is simply to increment the mark value of the graph
1782  * And only when wrap round takes place will we color all vertices in
1783  * the graph explicitly.
1784  */
1785 
1786 static void
1787 flk_graph_uncolor(graph_t *gp)
1788 {
1789 	lock_descriptor_t *lock;
1790 
1791 	if (gp->mark == UINT_MAX) {
1792 		gp->mark = 1;
1793 	for (lock = ACTIVE_HEAD(gp)->l_next; lock != ACTIVE_HEAD(gp);
1794 					lock = lock->l_next)
1795 			lock->l_color  = 0;
1796 
1797 	for (lock = SLEEPING_HEAD(gp)->l_next; lock != SLEEPING_HEAD(gp);
1798 					lock = lock->l_next)
1799 			lock->l_color  = 0;
1800 	} else {
1801 		gp->mark++;
1802 	}
1803 }
1804 
1805 /*
1806  * Wake up locks that are blocked on the given lock.
1807  */
1808 
1809 static void
1810 flk_wakeup(lock_descriptor_t *lock, int adj_list_remove)
1811 {
1812 	edge_t	*ep;
1813 	graph_t	*gp = lock->l_graph;
1814 	lock_descriptor_t	*lck;
1815 
1816 	ASSERT(MUTEX_HELD(&gp->gp_mutex));
1817 	if (NO_DEPENDENTS(lock))
1818 		return;
1819 	ep = FIRST_IN(lock);
1820 	do {
1821 		/*
1822 		 * delete the edge from the adjacency list
1823 		 * of from vertex. if no more adjacent edges
1824 		 * for this vertex wake this process.
1825 		 */
1826 		lck = ep->from_vertex;
1827 		if (adj_list_remove)
1828 			ADJ_LIST_REMOVE(ep);
1829 		flk_update_proc_graph(ep, 1);
1830 		if (NOT_BLOCKED(lck)) {
1831 			GRANT_WAKEUP(lck);
1832 		}
1833 		lock->l_sedge = NEXT_IN(ep);
1834 		IN_LIST_REMOVE(ep);
1835 		flk_free_edge(ep);
1836 		ep = lock->l_sedge;
1837 	} while (ep != HEAD(lock));
1838 	ASSERT(NO_DEPENDENTS(lock));
1839 }
1840 
1841 /*
1842  * The dependents of request, is checked for its dependency against the
1843  * locks in topology (called topology because the array is and should be in
1844  * topological order for this algorithm, if not in topological order the
1845  * inner loop below might add more edges than necessary. Topological ordering
1846  * of vertices satisfies the property that all edges will be from left to
1847  * right i.e., topology[i] can have an edge to  topology[j], iff i<j)
1848  * If lock l1 in the dependent set of request is dependent (blocked by)
1849  * on lock l2 in topology but does not have a path to it, we add an edge
1850  * in the inner loop below.
1851  *
1852  * We don't want to add an edge between l1 and l2 if there exists
1853  * already a path from l1 to l2, so care has to be taken for those vertices
1854  * that  have two paths to 'request'. These vertices are referred to here
1855  * as barrier locks.
1856  *
1857  * The barriers has to be found (those vertex that originally had two paths
1858  * to request) because otherwise we may end up adding edges unnecessarily
1859  * to vertices in topology, and thus barrier vertices can have an edge
1860  * to a vertex in topology as well a path to it.
1861  */
1862 
1863 static void
1864 flk_recompute_dependencies(lock_descriptor_t *request,
1865 		lock_descriptor_t **topology,
1866 			int nvertex, int update_graph)
1867 {
1868 	lock_descriptor_t *vertex, *lock;
1869 	graph_t	*gp = request->l_graph;
1870 	int i, count;
1871 	int barrier_found = 0;
1872 	edge_t	*ep;
1873 	lock_descriptor_t *vertex_stack;
1874 
1875 	STACK_INIT(vertex_stack);
1876 
1877 	ASSERT(MUTEX_HELD(&gp->gp_mutex));
1878 	if (nvertex == 0)
1879 		return;
1880 	flk_graph_uncolor(request->l_graph);
1881 	barrier_found = flk_find_barriers(request);
1882 	request->l_state |= RECOMPUTE_DONE;
1883 
1884 	STACK_PUSH(vertex_stack, request, l_stack);
1885 	request->l_sedge = FIRST_IN(request);
1886 
1887 
1888 	while ((vertex = STACK_TOP(vertex_stack)) != NULL) {
1889 		if (vertex->l_state & RECOMPUTE_DONE) {
1890 			count = 0;
1891 			goto next_in_edge;
1892 		}
1893 		if (IS_BARRIER(vertex)) {
1894 			/* decrement the barrier count */
1895 			if (vertex->l_index) {
1896 				vertex->l_index--;
1897 				/* this guy will be pushed again anyway ? */
1898 				STACK_POP(vertex_stack, l_stack);
1899 				if (vertex->l_index == 0)  {
1900 				/*
1901 				 * barrier is over we can recompute
1902 				 * dependencies for this lock in the
1903 				 * next stack pop
1904 				 */
1905 					vertex->l_state &= ~BARRIER_LOCK;
1906 				}
1907 				continue;
1908 			}
1909 		}
1910 		vertex->l_state |= RECOMPUTE_DONE;
1911 		flk_graph_uncolor(gp);
1912 		count = flk_color_reachables(vertex);
1913 		for (i = 0; i < nvertex; i++) {
1914 			lock = topology[i];
1915 			if (COLORED(lock))
1916 				continue;
1917 			if (BLOCKS(lock, vertex)) {
1918 				(void) flk_add_edge(vertex, lock,
1919 				    NO_CHECK_CYCLE, update_graph);
1920 				COLOR(lock);
1921 				count++;
1922 				count += flk_color_reachables(lock);
1923 			}
1924 
1925 		}
1926 
1927 next_in_edge:
1928 		if (count == nvertex ||
1929 				vertex->l_sedge == HEAD(vertex)) {
1930 			/* prune the tree below this */
1931 			STACK_POP(vertex_stack, l_stack);
1932 			vertex->l_state &= ~RECOMPUTE_DONE;
1933 			/* update the barrier locks below this! */
1934 			if (vertex->l_sedge != HEAD(vertex) && barrier_found) {
1935 				flk_graph_uncolor(gp);
1936 				flk_update_barriers(vertex);
1937 			}
1938 			continue;
1939 		}
1940 
1941 		ep = vertex->l_sedge;
1942 		lock = ep->from_vertex;
1943 		STACK_PUSH(vertex_stack, lock, l_stack);
1944 		lock->l_sedge = FIRST_IN(lock);
1945 		vertex->l_sedge = NEXT_IN(ep);
1946 	}
1947 
1948 }
1949 
1950 /*
1951  * Color all reachable vertices from vertex that belongs to topology (here
1952  * those that have RECOMPUTE_LOCK set in their state) and yet uncolored.
1953  *
1954  * Note: we need to use a different stack_link l_stack1 because this is
1955  * called from flk_recompute_dependencies() that already uses a stack with
1956  * l_stack as stack_link.
1957  */
1958 
1959 static int
1960 flk_color_reachables(lock_descriptor_t *vertex)
1961 {
1962 	lock_descriptor_t *ver, *lock;
1963 	int count;
1964 	edge_t	*ep;
1965 	lock_descriptor_t *vertex_stack;
1966 
1967 	STACK_INIT(vertex_stack);
1968 
1969 	STACK_PUSH(vertex_stack, vertex, l_stack1);
1970 	count = 0;
1971 	while ((ver = STACK_TOP(vertex_stack)) != NULL) {
1972 
1973 		STACK_POP(vertex_stack, l_stack1);
1974 		for (ep = FIRST_ADJ(ver); ep != HEAD(ver);
1975 					ep = NEXT_ADJ(ep)) {
1976 			lock = ep->to_vertex;
1977 			if (COLORED(lock))
1978 				continue;
1979 			COLOR(lock);
1980 			if (IS_RECOMPUTE(lock))
1981 				count++;
1982 			STACK_PUSH(vertex_stack, lock, l_stack1);
1983 		}
1984 
1985 	}
1986 	return (count);
1987 }
1988 
1989 /*
1990  * Called from flk_recompute_dependencies() this routine decrements
1991  * the barrier count of barrier vertices that are reachable from lock.
1992  */
1993 
1994 static void
1995 flk_update_barriers(lock_descriptor_t *lock)
1996 {
1997 	lock_descriptor_t *vertex, *lck;
1998 	edge_t	*ep;
1999 	lock_descriptor_t *vertex_stack;
2000 
2001 	STACK_INIT(vertex_stack);
2002 
2003 	STACK_PUSH(vertex_stack, lock, l_stack1);
2004 
2005 	while ((vertex = STACK_TOP(vertex_stack)) != NULL) {
2006 		STACK_POP(vertex_stack, l_stack1);
2007 		for (ep = FIRST_IN(vertex); ep != HEAD(vertex);
2008 						ep = NEXT_IN(ep)) {
2009 			lck = ep->from_vertex;
2010 			if (COLORED(lck)) {
2011 				if (IS_BARRIER(lck)) {
2012 					ASSERT(lck->l_index > 0);
2013 					lck->l_index--;
2014 					if (lck->l_index == 0)
2015 						lck->l_state &= ~BARRIER_LOCK;
2016 				}
2017 				continue;
2018 			}
2019 			COLOR(lck);
2020 			if (IS_BARRIER(lck)) {
2021 				ASSERT(lck->l_index > 0);
2022 				lck->l_index--;
2023 				if (lck->l_index == 0)
2024 					lck->l_state &= ~BARRIER_LOCK;
2025 			}
2026 			STACK_PUSH(vertex_stack, lck, l_stack1);
2027 		}
2028 	}
2029 }
2030 
2031 /*
2032  * Finds all vertices that are reachable from 'lock' more than once and
2033  * mark them as barrier vertices and increment their barrier count.
2034  * The barrier count is one minus the total number of paths from lock
2035  * to that vertex.
2036  */
2037 
2038 static int
2039 flk_find_barriers(lock_descriptor_t *lock)
2040 {
2041 	lock_descriptor_t *vertex, *lck;
2042 	int found = 0;
2043 	edge_t	*ep;
2044 	lock_descriptor_t *vertex_stack;
2045 
2046 	STACK_INIT(vertex_stack);
2047 
2048 	STACK_PUSH(vertex_stack, lock, l_stack1);
2049 
2050 	while ((vertex = STACK_TOP(vertex_stack)) != NULL) {
2051 		STACK_POP(vertex_stack, l_stack1);
2052 		for (ep = FIRST_IN(vertex); ep != HEAD(vertex);
2053 						ep = NEXT_IN(ep)) {
2054 			lck = ep->from_vertex;
2055 			if (COLORED(lck)) {
2056 				/* this is a barrier */
2057 				lck->l_state |= BARRIER_LOCK;
2058 				/* index will have barrier count */
2059 				lck->l_index++;
2060 				if (!found)
2061 					found = 1;
2062 				continue;
2063 			}
2064 			COLOR(lck);
2065 			lck->l_index = 0;
2066 			STACK_PUSH(vertex_stack, lck, l_stack1);
2067 		}
2068 	}
2069 	return (found);
2070 }
2071 
2072 /*
2073  * Finds the first lock that is mainly responsible for blocking this
2074  * request.  If there is no such lock, request->l_flock.l_type is set to
2075  * F_UNLCK.  Otherwise, request->l_flock is filled in with the particulars
2076  * of the blocking lock.
2077  *
2078  * Note: It is possible a request is blocked by a sleeping lock because
2079  * of the fairness policy used in flk_process_request() to construct the
2080  * dependencies. (see comments before flk_process_request()).
2081  */
2082 
2083 static void
2084 flk_get_first_blocking_lock(lock_descriptor_t *request)
2085 {
2086 	graph_t	*gp = request->l_graph;
2087 	vnode_t *vp = request->l_vnode;
2088 	lock_descriptor_t *lock, *blocker;
2089 
2090 	ASSERT(MUTEX_HELD(&gp->gp_mutex));
2091 	blocker = NULL;
2092 	SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
2093 
2094 	if (lock) {
2095 		do {
2096 			if (BLOCKS(lock, request)) {
2097 				blocker = lock;
2098 				break;
2099 			}
2100 			lock = lock->l_next;
2101 		} while (lock->l_vnode == vp);
2102 	}
2103 
2104 	if (blocker) {
2105 		report_blocker(blocker, request);
2106 	} else
2107 		request->l_flock.l_type = F_UNLCK;
2108 }
2109 
2110 /*
2111  * Get the graph_t structure associated with a vnode.
2112  * If 'initialize' is non-zero, and the graph_t structure for this vnode has
2113  * not yet been initialized, then a new element is allocated and returned.
2114  */
2115 graph_t *
2116 flk_get_lock_graph(vnode_t *vp, int initialize)
2117 {
2118 	graph_t *gp;
2119 	graph_t *gp_alloc = NULL;
2120 	int index = HASH_INDEX(vp);
2121 
2122 	if (initialize == FLK_USE_GRAPH) {
2123 		mutex_enter(&flock_lock);
2124 		gp = lock_graph[index];
2125 		mutex_exit(&flock_lock);
2126 		return (gp);
2127 	}
2128 
2129 	ASSERT(initialize == FLK_INIT_GRAPH);
2130 
2131 	if (lock_graph[index] == NULL) {
2132 
2133 		gp_alloc = kmem_zalloc(sizeof (graph_t), KM_SLEEP);
2134 
2135 		/* Initialize the graph */
2136 
2137 		gp_alloc->active_locks.l_next =
2138 		    gp_alloc->active_locks.l_prev =
2139 		    (lock_descriptor_t *)ACTIVE_HEAD(gp_alloc);
2140 		gp_alloc->sleeping_locks.l_next =
2141 		    gp_alloc->sleeping_locks.l_prev =
2142 		    (lock_descriptor_t *)SLEEPING_HEAD(gp_alloc);
2143 		gp_alloc->index = index;
2144 		mutex_init(&gp_alloc->gp_mutex, NULL, MUTEX_DEFAULT, NULL);
2145 	}
2146 
2147 	mutex_enter(&flock_lock);
2148 
2149 	gp = lock_graph[index];
2150 
2151 	/* Recheck the value within flock_lock */
2152 	if (gp == NULL) {
2153 		struct flock_globals *fg;
2154 
2155 		/* We must have previously allocated the graph_t structure */
2156 		ASSERT(gp_alloc != NULL);
2157 		lock_graph[index] = gp = gp_alloc;
2158 		/*
2159 		 * The lockmgr status is only needed if KLM is loaded.
2160 		 */
2161 		if (flock_zone_key != ZONE_KEY_UNINITIALIZED) {
2162 			fg = flk_get_globals();
2163 			fg->lockmgr_status[index] = fg->flk_lockmgr_status;
2164 		}
2165 	}
2166 
2167 	mutex_exit(&flock_lock);
2168 
2169 	if ((gp_alloc != NULL) && (gp != gp_alloc)) {
2170 		/* There was a race to allocate the graph_t and we lost */
2171 		mutex_destroy(&gp_alloc->gp_mutex);
2172 		kmem_free(gp_alloc, sizeof (graph_t));
2173 	}
2174 
2175 	return (gp);
2176 }
2177 
2178 /*
2179  * PSARC case 1997/292
2180  */
2181 int
2182 cl_flk_has_remote_locks_for_nlmid(vnode_t *vp, int nlmid)
2183 {
2184 	lock_descriptor_t *lock;
2185 	int result = 0;
2186 	graph_t *gp;
2187 	int			lock_nlmid;
2188 
2189 	/*
2190 	 * Check to see if node is booted as a cluster. If not, return.
2191 	 */
2192 	if ((cluster_bootflags & CLUSTER_BOOTED) == 0) {
2193 		return (0);
2194 	}
2195 
2196 	gp = flk_get_lock_graph(vp, FLK_USE_GRAPH);
2197 	if (gp == NULL) {
2198 		return (0);
2199 	}
2200 
2201 	mutex_enter(&gp->gp_mutex);
2202 
2203 	SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
2204 
2205 	if (lock) {
2206 		while (lock->l_vnode == vp) {
2207 			/* get NLM id from sysid */
2208 			lock_nlmid = GETNLMID(lock->l_flock.l_sysid);
2209 
2210 			/*
2211 			 * If NLM server request _and_ nlmid of lock matches
2212 			 * nlmid of argument, then we've found a remote lock.
2213 			 */
2214 			if (IS_LOCKMGR(lock) && nlmid == lock_nlmid) {
2215 				result = 1;
2216 				goto done;
2217 			}
2218 			lock = lock->l_next;
2219 		}
2220 	}
2221 
2222 	SET_LOCK_TO_FIRST_SLEEP_VP(gp, lock, vp);
2223 
2224 	if (lock) {
2225 		while (lock->l_vnode == vp) {
2226 			/* get NLM id from sysid */
2227 			lock_nlmid = GETNLMID(lock->l_flock.l_sysid);
2228 
2229 			/*
2230 			 * If NLM server request _and_ nlmid of lock matches
2231 			 * nlmid of argument, then we've found a remote lock.
2232 			 */
2233 			if (IS_LOCKMGR(lock) && nlmid == lock_nlmid) {
2234 				result = 1;
2235 				goto done;
2236 			}
2237 			lock = lock->l_next;
2238 		}
2239 	}
2240 
2241 done:
2242 	mutex_exit(&gp->gp_mutex);
2243 	return (result);
2244 }
2245 
2246 /* ONC_PLUS EXTRACT START */
2247 /*
2248  * Determine whether there are any locks for the given vnode with a remote
2249  * sysid.  Returns zero if not, non-zero if there are.
2250  *
2251  * Note that the return value from this function is potentially invalid
2252  * once it has been returned.  The caller is responsible for providing its
2253  * own synchronization mechanism to ensure that the return value is useful
2254  * (e.g., see nfs_lockcompletion()).
2255  */
2256 int
2257 flk_has_remote_locks(vnode_t *vp)
2258 {
2259 	lock_descriptor_t *lock;
2260 	int result = 0;
2261 	graph_t *gp;
2262 
2263 	gp = flk_get_lock_graph(vp, FLK_USE_GRAPH);
2264 	if (gp == NULL) {
2265 		return (0);
2266 	}
2267 
2268 	mutex_enter(&gp->gp_mutex);
2269 
2270 	SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
2271 
2272 	if (lock) {
2273 		while (lock->l_vnode == vp) {
2274 			if (IS_REMOTE(lock)) {
2275 				result = 1;
2276 				goto done;
2277 			}
2278 			lock = lock->l_next;
2279 		}
2280 	}
2281 
2282 	SET_LOCK_TO_FIRST_SLEEP_VP(gp, lock, vp);
2283 
2284 	if (lock) {
2285 		while (lock->l_vnode == vp) {
2286 			if (IS_REMOTE(lock)) {
2287 				result = 1;
2288 				goto done;
2289 			}
2290 			lock = lock->l_next;
2291 		}
2292 	}
2293 
2294 done:
2295 	mutex_exit(&gp->gp_mutex);
2296 	return (result);
2297 }
2298 
2299 /*
2300  * Determine if there are any locks owned by the given sysid.
2301  * Returns zero if not, non-zero if there are.  Note that this return code
2302  * could be derived from flk_get_{sleeping,active}_locks, but this routine
2303  * avoids all the memory allocations of those routines.
2304  *
2305  * This routine has the same synchronization issues as
2306  * flk_has_remote_locks.
2307  */
2308 
2309 int
2310 flk_sysid_has_locks(int sysid, int lck_type)
2311 {
2312 	int		has_locks = 0;
2313 	lock_descriptor_t	*lock;
2314 	graph_t 	*gp;
2315 	int		i;
2316 
2317 	for (i = 0; i < HASH_SIZE && !has_locks; i++) {
2318 		mutex_enter(&flock_lock);
2319 		gp = lock_graph[i];
2320 		mutex_exit(&flock_lock);
2321 		if (gp == NULL) {
2322 			continue;
2323 		}
2324 
2325 		mutex_enter(&gp->gp_mutex);
2326 
2327 		if (lck_type & FLK_QUERY_ACTIVE) {
2328 			for (lock = ACTIVE_HEAD(gp)->l_next;
2329 			    lock != ACTIVE_HEAD(gp) && !has_locks;
2330 			    lock = lock->l_next) {
2331 				if (lock->l_flock.l_sysid == sysid)
2332 					has_locks = 1;
2333 			}
2334 		}
2335 
2336 		if (lck_type & FLK_QUERY_SLEEPING) {
2337 			for (lock = SLEEPING_HEAD(gp)->l_next;
2338 				lock != SLEEPING_HEAD(gp) && !has_locks;
2339 				lock = lock->l_next) {
2340 				if (lock->l_flock.l_sysid == sysid)
2341 					has_locks = 1;
2342 			}
2343 		}
2344 		mutex_exit(&gp->gp_mutex);
2345 	}
2346 
2347 	return (has_locks);
2348 }
2349 
2350 
2351 /*
2352  * PSARC case 1997/292
2353  *
2354  * Requires: "sysid" is a pair [nlmid, sysid].  The lower half is 16-bit
2355  *  quantity, the real sysid generated by the NLM server; the upper half
2356  *  identifies the node of the cluster where the NLM server ran.
2357  *  This routine is only called by an NLM server running in a cluster.
2358  * Effects: Remove all locks held on behalf of the client identified
2359  *  by "sysid."
2360  */
2361 void
2362 cl_flk_remove_locks_by_sysid(int sysid)
2363 {
2364 	graph_t	*gp;
2365 	int i;
2366 	lock_descriptor_t *lock, *nlock;
2367 
2368 	/*
2369 	 * Check to see if node is booted as a cluster. If not, return.
2370 	 */
2371 	if ((cluster_bootflags & CLUSTER_BOOTED) == 0) {
2372 		return;
2373 	}
2374 
2375 	ASSERT(sysid != 0);
2376 	for (i = 0; i < HASH_SIZE; i++) {
2377 		mutex_enter(&flock_lock);
2378 		gp = lock_graph[i];
2379 		mutex_exit(&flock_lock);
2380 
2381 		if (gp == NULL)
2382 			continue;
2383 
2384 		mutex_enter(&gp->gp_mutex);	/*  get mutex on lock graph */
2385 
2386 		/* signal sleeping requests so that they bail out */
2387 		lock = SLEEPING_HEAD(gp)->l_next;
2388 		while (lock != SLEEPING_HEAD(gp)) {
2389 			nlock = lock->l_next;
2390 			if (lock->l_flock.l_sysid == sysid) {
2391 				INTERRUPT_WAKEUP(lock);
2392 			}
2393 			lock = nlock;
2394 		}
2395 
2396 		/* delete active locks */
2397 		lock = ACTIVE_HEAD(gp)->l_next;
2398 		while (lock != ACTIVE_HEAD(gp)) {
2399 			nlock = lock->l_next;
2400 			if (lock->l_flock.l_sysid == sysid) {
2401 				flk_delete_active_lock(lock, 0);
2402 				flk_wakeup(lock, 1);
2403 				flk_free_lock(lock);
2404 			}
2405 			lock = nlock;
2406 		}
2407 		mutex_exit(&gp->gp_mutex);    /* release mutex on lock graph */
2408 	}
2409 }
2410 
2411 /*
2412  * Delete all locks in the system that belongs to the sysid of the request.
2413  */
2414 
2415 static void
2416 flk_delete_locks_by_sysid(lock_descriptor_t *request)
2417 {
2418 	int	sysid  = request->l_flock.l_sysid;
2419 	lock_descriptor_t *lock, *nlock;
2420 	graph_t	*gp;
2421 	int i;
2422 
2423 	ASSERT(MUTEX_HELD(&request->l_graph->gp_mutex));
2424 	ASSERT(sysid != 0);
2425 
2426 	mutex_exit(&request->l_graph->gp_mutex);
2427 
2428 	for (i = 0; i < HASH_SIZE; i++) {
2429 		mutex_enter(&flock_lock);
2430 		gp = lock_graph[i];
2431 		mutex_exit(&flock_lock);
2432 
2433 		if (gp == NULL)
2434 			continue;
2435 
2436 		mutex_enter(&gp->gp_mutex);
2437 
2438 		/* signal sleeping requests so that they bail out */
2439 		lock = SLEEPING_HEAD(gp)->l_next;
2440 		while (lock != SLEEPING_HEAD(gp)) {
2441 			nlock = lock->l_next;
2442 			if (lock->l_flock.l_sysid == sysid) {
2443 				INTERRUPT_WAKEUP(lock);
2444 			}
2445 			lock = nlock;
2446 		}
2447 
2448 		/* delete active locks */
2449 		lock = ACTIVE_HEAD(gp)->l_next;
2450 		while (lock != ACTIVE_HEAD(gp)) {
2451 			nlock = lock->l_next;
2452 			if (lock->l_flock.l_sysid == sysid) {
2453 				flk_delete_active_lock(lock, 0);
2454 				flk_wakeup(lock, 1);
2455 				flk_free_lock(lock);
2456 			}
2457 			lock = nlock;
2458 		}
2459 		mutex_exit(&gp->gp_mutex);
2460 	}
2461 
2462 	mutex_enter(&request->l_graph->gp_mutex);
2463 }
2464 
2465 /*
2466  * Clustering: Deletes PXFS locks
2467  * Effects: Delete all locks on files in the given file system and with the
2468  *  given PXFS id.
2469  */
2470 void
2471 cl_flk_delete_pxfs_locks(struct vfs *vfsp, int pxfsid)
2472 {
2473 	lock_descriptor_t *lock, *nlock;
2474 	graph_t	*gp;
2475 	int i;
2476 
2477 	for (i = 0; i < HASH_SIZE; i++) {
2478 		mutex_enter(&flock_lock);
2479 		gp = lock_graph[i];
2480 		mutex_exit(&flock_lock);
2481 
2482 		if (gp == NULL)
2483 			continue;
2484 
2485 		mutex_enter(&gp->gp_mutex);
2486 
2487 		/* signal sleeping requests so that they bail out */
2488 		lock = SLEEPING_HEAD(gp)->l_next;
2489 		while (lock != SLEEPING_HEAD(gp)) {
2490 			nlock = lock->l_next;
2491 			if (lock->l_vnode->v_vfsp == vfsp) {
2492 				ASSERT(IS_PXFS(lock));
2493 				if (GETPXFSID(lock->l_flock.l_sysid) ==
2494 				    pxfsid) {
2495 					flk_set_state(lock,
2496 					    FLK_CANCELLED_STATE);
2497 					flk_cancel_sleeping_lock(lock, 1);
2498 				}
2499 			}
2500 			lock = nlock;
2501 		}
2502 
2503 		/* delete active locks */
2504 		lock = ACTIVE_HEAD(gp)->l_next;
2505 		while (lock != ACTIVE_HEAD(gp)) {
2506 			nlock = lock->l_next;
2507 			if (lock->l_vnode->v_vfsp == vfsp) {
2508 				ASSERT(IS_PXFS(lock));
2509 				if (GETPXFSID(lock->l_flock.l_sysid) ==
2510 				    pxfsid) {
2511 					flk_delete_active_lock(lock, 0);
2512 					flk_wakeup(lock, 1);
2513 					flk_free_lock(lock);
2514 				}
2515 			}
2516 			lock = nlock;
2517 		}
2518 		mutex_exit(&gp->gp_mutex);
2519 	}
2520 }
2521 
2522 /*
2523  * Search for a sleeping lock manager lock which matches exactly this lock
2524  * request; if one is found, fake a signal to cancel it.
2525  *
2526  * Return 1 if a matching lock was found, 0 otherwise.
2527  */
2528 
2529 static int
2530 flk_canceled(lock_descriptor_t *request)
2531 {
2532 	lock_descriptor_t *lock, *nlock;
2533 	graph_t *gp = request->l_graph;
2534 	vnode_t *vp = request->l_vnode;
2535 
2536 	ASSERT(MUTEX_HELD(&gp->gp_mutex));
2537 	ASSERT(IS_LOCKMGR(request));
2538 	SET_LOCK_TO_FIRST_SLEEP_VP(gp, lock, vp);
2539 
2540 	if (lock) {
2541 		while (lock->l_vnode == vp) {
2542 			nlock = lock->l_next;
2543 			if (SAME_OWNER(lock, request) &&
2544 				lock->l_start == request->l_start &&
2545 					lock->l_end == request->l_end) {
2546 				INTERRUPT_WAKEUP(lock);
2547 				return (1);
2548 			}
2549 			lock = nlock;
2550 		}
2551 	}
2552 	return (0);
2553 }
2554 
2555 /*
2556  * Remove all the locks for the vnode belonging to the given pid and sysid.
2557  */
2558 
2559 void
2560 cleanlocks(vnode_t *vp, pid_t pid, int sysid)
2561 {
2562 	graph_t	*gp;
2563 	lock_descriptor_t *lock, *nlock;
2564 	lock_descriptor_t *link_stack;
2565 
2566 	STACK_INIT(link_stack);
2567 
2568 	gp = flk_get_lock_graph(vp, FLK_USE_GRAPH);
2569 
2570 	if (gp == NULL)
2571 		return;
2572 	mutex_enter(&gp->gp_mutex);
2573 
2574 	CHECK_SLEEPING_LOCKS(gp);
2575 	CHECK_ACTIVE_LOCKS(gp);
2576 
2577 	SET_LOCK_TO_FIRST_SLEEP_VP(gp, lock, vp);
2578 
2579 	if (lock) {
2580 		do {
2581 			nlock = lock->l_next;
2582 			if ((lock->l_flock.l_pid == pid ||
2583 					pid == IGN_PID) &&
2584 				lock->l_flock.l_sysid == sysid) {
2585 				CANCEL_WAKEUP(lock);
2586 			}
2587 			lock = nlock;
2588 		} while (lock->l_vnode == vp);
2589 	}
2590 
2591 	SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
2592 
2593 	if (lock) {
2594 		do {
2595 			nlock = lock->l_next;
2596 			if ((lock->l_flock.l_pid == pid ||
2597 					pid == IGN_PID) &&
2598 				lock->l_flock.l_sysid == sysid) {
2599 				flk_delete_active_lock(lock, 0);
2600 				STACK_PUSH(link_stack, lock, l_stack);
2601 			}
2602 			lock = nlock;
2603 		} while (lock->l_vnode == vp);
2604 	}
2605 
2606 	while ((lock = STACK_TOP(link_stack)) != NULL) {
2607 		STACK_POP(link_stack, l_stack);
2608 		flk_wakeup(lock, 1);
2609 		flk_free_lock(lock);
2610 	}
2611 
2612 	CHECK_SLEEPING_LOCKS(gp);
2613 	CHECK_ACTIVE_LOCKS(gp);
2614 	CHECK_OWNER_LOCKS(gp, pid, sysid, vp);
2615 	mutex_exit(&gp->gp_mutex);
2616 }
2617 /* ONC_PLUS EXTRACT END */
2618 
2619 
2620 /*
2621  * Called from 'fs' read and write routines for files that have mandatory
2622  * locking enabled.
2623  */
2624 
2625 int
2626 chklock(
2627 	struct vnode	*vp,
2628 	int 		iomode,
2629 	u_offset_t	offset,
2630 	ssize_t		len,
2631 	int 		fmode,
2632 	caller_context_t *ct)
2633 {
2634 	register int	i;
2635 	struct flock64 	bf;
2636 	int 		error = 0;
2637 
2638 	bf.l_type = (iomode & FWRITE) ? F_WRLCK : F_RDLCK;
2639 	bf.l_whence = 0;
2640 	bf.l_start = offset;
2641 	bf.l_len = len;
2642 	if (ct == NULL) {
2643 		bf.l_pid = curproc->p_pid;
2644 		bf.l_sysid = 0;
2645 	} else {
2646 		bf.l_pid = ct->cc_pid;
2647 		bf.l_sysid = ct->cc_sysid;
2648 	}
2649 	i = (fmode & (FNDELAY|FNONBLOCK)) ? INOFLCK : INOFLCK|SLPFLCK;
2650 	if ((i = reclock(vp, &bf, i, 0, offset, NULL)) != 0 ||
2651 	    bf.l_type != F_UNLCK)
2652 		error = i ? i : EAGAIN;
2653 	return (error);
2654 }
2655 
2656 /* ONC_PLUS EXTRACT START */
2657 /*
2658  * convoff - converts the given data (start, whence) to the
2659  * given whence.
2660  */
2661 int
2662 convoff(vp, lckdat, whence, offset)
2663 	struct vnode 	*vp;
2664 	struct flock64 	*lckdat;
2665 	int 		whence;
2666 	offset_t	offset;
2667 {
2668 	int 		error;
2669 	struct vattr 	vattr;
2670 
2671 	if ((lckdat->l_whence == 2) || (whence == 2)) {
2672 		vattr.va_mask = AT_SIZE;
2673 		if (error = VOP_GETATTR(vp, &vattr, 0, CRED(), NULL))
2674 			return (error);
2675 	}
2676 
2677 	switch (lckdat->l_whence) {
2678 	case 1:
2679 		lckdat->l_start += offset;
2680 		break;
2681 	case 2:
2682 		lckdat->l_start += vattr.va_size;
2683 		/* FALLTHRU */
2684 	case 0:
2685 		break;
2686 	default:
2687 		return (EINVAL);
2688 	}
2689 
2690 	if (lckdat->l_start < 0)
2691 		return (EINVAL);
2692 
2693 	switch (whence) {
2694 	case 1:
2695 		lckdat->l_start -= offset;
2696 		break;
2697 	case 2:
2698 		lckdat->l_start -= vattr.va_size;
2699 		/* FALLTHRU */
2700 	case 0:
2701 		break;
2702 	default:
2703 		return (EINVAL);
2704 	}
2705 
2706 	lckdat->l_whence = (short)whence;
2707 	return (0);
2708 }
2709 /* ONC_PLUS EXTRACT END */
2710 
2711 
2712 /* 	proc_graph function definitions */
2713 
2714 /*
2715  * Function checks for deadlock due to the new 'lock'. If deadlock found
2716  * edges of this lock are freed and returned.
2717  */
2718 
2719 static int
2720 flk_check_deadlock(lock_descriptor_t *lock)
2721 {
2722 	proc_vertex_t	*start_vertex, *pvertex;
2723 	proc_vertex_t *dvertex;
2724 	proc_edge_t *pep, *ppep;
2725 	edge_t	*ep, *nep;
2726 	proc_vertex_t *process_stack;
2727 
2728 	STACK_INIT(process_stack);
2729 
2730 	mutex_enter(&flock_lock);
2731 	start_vertex = flk_get_proc_vertex(lock);
2732 	ASSERT(start_vertex != NULL);
2733 
2734 	/* construct the edges from this process to other processes */
2735 
2736 	ep = FIRST_ADJ(lock);
2737 	while (ep != HEAD(lock)) {
2738 		proc_vertex_t *adj_proc;
2739 
2740 		adj_proc = flk_get_proc_vertex(ep->to_vertex);
2741 		for (pep = start_vertex->edge; pep != NULL; pep = pep->next) {
2742 			if (pep->to_proc == adj_proc) {
2743 				ASSERT(pep->refcount);
2744 				pep->refcount++;
2745 				break;
2746 			}
2747 		}
2748 		if (pep == NULL) {
2749 			pep = flk_get_proc_edge();
2750 			pep->to_proc = adj_proc;
2751 			pep->refcount = 1;
2752 			adj_proc->incount++;
2753 			pep->next = start_vertex->edge;
2754 			start_vertex->edge = pep;
2755 		}
2756 		ep = NEXT_ADJ(ep);
2757 	}
2758 
2759 	ep = FIRST_IN(lock);
2760 
2761 	while (ep != HEAD(lock)) {
2762 		proc_vertex_t *in_proc;
2763 
2764 		in_proc = flk_get_proc_vertex(ep->from_vertex);
2765 
2766 		for (pep = in_proc->edge; pep != NULL; pep = pep->next) {
2767 			if (pep->to_proc == start_vertex) {
2768 				ASSERT(pep->refcount);
2769 				pep->refcount++;
2770 				break;
2771 			}
2772 		}
2773 		if (pep == NULL) {
2774 			pep = flk_get_proc_edge();
2775 			pep->to_proc = start_vertex;
2776 			pep->refcount = 1;
2777 			start_vertex->incount++;
2778 			pep->next = in_proc->edge;
2779 			in_proc->edge = pep;
2780 		}
2781 		ep = NEXT_IN(ep);
2782 	}
2783 
2784 	if (start_vertex->incount == 0) {
2785 		mutex_exit(&flock_lock);
2786 		return (0);
2787 	}
2788 
2789 	flk_proc_graph_uncolor();
2790 
2791 	start_vertex->p_sedge = start_vertex->edge;
2792 
2793 	STACK_PUSH(process_stack, start_vertex, p_stack);
2794 
2795 	while ((pvertex = STACK_TOP(process_stack)) != NULL) {
2796 		for (pep = pvertex->p_sedge; pep != NULL; pep = pep->next) {
2797 			dvertex = pep->to_proc;
2798 			if (!PROC_ARRIVED(dvertex)) {
2799 				STACK_PUSH(process_stack, dvertex, p_stack);
2800 				dvertex->p_sedge = dvertex->edge;
2801 				PROC_ARRIVE(pvertex);
2802 				pvertex->p_sedge = pep->next;
2803 				break;
2804 			}
2805 			if (!PROC_DEPARTED(dvertex))
2806 				goto deadlock;
2807 		}
2808 		if (pep == NULL) {
2809 			PROC_DEPART(pvertex);
2810 			STACK_POP(process_stack, p_stack);
2811 		}
2812 	}
2813 	mutex_exit(&flock_lock);
2814 	return (0);
2815 
2816 deadlock:
2817 
2818 	/* we remove all lock edges and proc edges */
2819 
2820 	ep = FIRST_ADJ(lock);
2821 	while (ep != HEAD(lock)) {
2822 		proc_vertex_t *adj_proc;
2823 		adj_proc = flk_get_proc_vertex(ep->to_vertex);
2824 		nep = NEXT_ADJ(ep);
2825 		IN_LIST_REMOVE(ep);
2826 		ADJ_LIST_REMOVE(ep);
2827 		flk_free_edge(ep);
2828 		ppep = start_vertex->edge;
2829 		for (pep = start_vertex->edge; pep != NULL; ppep = pep,
2830 						pep = ppep->next) {
2831 			if (pep->to_proc == adj_proc) {
2832 				pep->refcount--;
2833 				if (pep->refcount == 0) {
2834 					if (pep == ppep) {
2835 						start_vertex->edge = pep->next;
2836 					} else {
2837 						ppep->next = pep->next;
2838 					}
2839 					adj_proc->incount--;
2840 					flk_proc_release(adj_proc);
2841 					flk_free_proc_edge(pep);
2842 				}
2843 				break;
2844 			}
2845 		}
2846 		ep = nep;
2847 	}
2848 	ep = FIRST_IN(lock);
2849 	while (ep != HEAD(lock)) {
2850 		proc_vertex_t *in_proc;
2851 		in_proc = flk_get_proc_vertex(ep->from_vertex);
2852 		nep = NEXT_IN(ep);
2853 		IN_LIST_REMOVE(ep);
2854 		ADJ_LIST_REMOVE(ep);
2855 		flk_free_edge(ep);
2856 		ppep = in_proc->edge;
2857 		for (pep = in_proc->edge; pep != NULL; ppep = pep,
2858 						pep = ppep->next) {
2859 			if (pep->to_proc == start_vertex) {
2860 				pep->refcount--;
2861 				if (pep->refcount == 0) {
2862 					if (pep == ppep) {
2863 						in_proc->edge = pep->next;
2864 					} else {
2865 						ppep->next = pep->next;
2866 					}
2867 					start_vertex->incount--;
2868 					flk_proc_release(in_proc);
2869 					flk_free_proc_edge(pep);
2870 				}
2871 				break;
2872 			}
2873 		}
2874 		ep = nep;
2875 	}
2876 	flk_proc_release(start_vertex);
2877 	mutex_exit(&flock_lock);
2878 	return (1);
2879 }
2880 
2881 /*
2882  * Get a proc vertex. If lock's pvertex value gets a correct proc vertex
2883  * from the list we return that, otherwise we allocate one. If necessary,
2884  * we grow the list of vertices also.
2885  */
2886 
2887 static proc_vertex_t *
2888 flk_get_proc_vertex(lock_descriptor_t *lock)
2889 {
2890 	int i;
2891 	proc_vertex_t	*pv;
2892 	proc_vertex_t	**palloc;
2893 
2894 	ASSERT(MUTEX_HELD(&flock_lock));
2895 	if (lock->pvertex != -1) {
2896 		ASSERT(lock->pvertex >= 0);
2897 		pv = pgraph.proc[lock->pvertex];
2898 		if (pv != NULL && PROC_SAME_OWNER(lock, pv)) {
2899 			return (pv);
2900 		}
2901 	}
2902 	for (i = 0; i < pgraph.gcount; i++) {
2903 		pv = pgraph.proc[i];
2904 		if (pv != NULL && PROC_SAME_OWNER(lock, pv)) {
2905 			lock->pvertex = pv->index = i;
2906 			return (pv);
2907 		}
2908 	}
2909 	pv = kmem_zalloc(sizeof (struct proc_vertex), KM_SLEEP);
2910 	pv->pid = lock->l_flock.l_pid;
2911 	pv->sysid = lock->l_flock.l_sysid;
2912 	flk_proc_vertex_allocs++;
2913 	if (pgraph.free != 0) {
2914 		for (i = 0; i < pgraph.gcount; i++) {
2915 			if (pgraph.proc[i] == NULL) {
2916 				pgraph.proc[i] = pv;
2917 				lock->pvertex = pv->index = i;
2918 				pgraph.free--;
2919 				return (pv);
2920 			}
2921 		}
2922 	}
2923 	palloc = kmem_zalloc((pgraph.gcount + PROC_CHUNK) *
2924 				sizeof (proc_vertex_t *), KM_SLEEP);
2925 
2926 	if (pgraph.proc) {
2927 		bcopy(pgraph.proc, palloc,
2928 			pgraph.gcount * sizeof (proc_vertex_t *));
2929 
2930 		kmem_free(pgraph.proc,
2931 			pgraph.gcount * sizeof (proc_vertex_t *));
2932 	}
2933 	pgraph.proc = palloc;
2934 	pgraph.free += (PROC_CHUNK - 1);
2935 	pv->index = lock->pvertex = pgraph.gcount;
2936 	pgraph.gcount += PROC_CHUNK;
2937 	pgraph.proc[pv->index] = pv;
2938 	return (pv);
2939 }
2940 
2941 /*
2942  * Allocate a proc edge.
2943  */
2944 
2945 static proc_edge_t *
2946 flk_get_proc_edge()
2947 {
2948 	proc_edge_t *pep;
2949 
2950 	pep = kmem_zalloc(sizeof (proc_edge_t), KM_SLEEP);
2951 	flk_proc_edge_allocs++;
2952 	return (pep);
2953 }
2954 
2955 /*
2956  * Free the proc edge. Called whenever its reference count goes to zero.
2957  */
2958 
2959 static void
2960 flk_free_proc_edge(proc_edge_t *pep)
2961 {
2962 	ASSERT(pep->refcount == 0);
2963 	kmem_free((void *)pep, sizeof (proc_edge_t));
2964 	flk_proc_edge_frees++;
2965 }
2966 
2967 /*
2968  * Color the graph explicitly done only when the mark value hits max value.
2969  */
2970 
2971 static void
2972 flk_proc_graph_uncolor()
2973 {
2974 	int i;
2975 
2976 	if (pgraph.mark == UINT_MAX) {
2977 		for (i = 0; i < pgraph.gcount; i++)
2978 			if (pgraph.proc[i] != NULL) {
2979 				pgraph.proc[i]->atime = 0;
2980 				pgraph.proc[i]->dtime = 0;
2981 			}
2982 		pgraph.mark = 1;
2983 	} else {
2984 		pgraph.mark++;
2985 	}
2986 }
2987 
2988 /*
2989  * Release the proc vertex iff both there are no in edges and out edges
2990  */
2991 
2992 static void
2993 flk_proc_release(proc_vertex_t *proc)
2994 {
2995 	ASSERT(MUTEX_HELD(&flock_lock));
2996 	if (proc->edge == NULL && proc->incount == 0) {
2997 		pgraph.proc[proc->index] = NULL;
2998 		pgraph.free++;
2999 		kmem_free(proc, sizeof (proc_vertex_t));
3000 		flk_proc_vertex_frees++;
3001 	}
3002 }
3003 
3004 /*
3005  * Updates process graph to reflect change in a lock_graph.
3006  * Note: We should call this function only after we have a correctly
3007  * recomputed lock graph. Otherwise we might miss a deadlock detection.
3008  * eg: in function flk_relation() we call this function after flk_recompute_
3009  * dependencies() otherwise if a process tries to lock a vnode hashed
3010  * into another graph it might sleep for ever.
3011  */
3012 
3013 static void
3014 flk_update_proc_graph(edge_t *ep, int delete)
3015 {
3016 	proc_vertex_t *toproc, *fromproc;
3017 	proc_edge_t *pep, *prevpep;
3018 
3019 	mutex_enter(&flock_lock);
3020 	toproc = flk_get_proc_vertex(ep->to_vertex);
3021 	fromproc = flk_get_proc_vertex(ep->from_vertex);
3022 
3023 	if (!delete)
3024 		goto add;
3025 	pep = prevpep = fromproc->edge;
3026 
3027 	ASSERT(pep != NULL);
3028 	while (pep != NULL) {
3029 		if (pep->to_proc == toproc) {
3030 			ASSERT(pep->refcount > 0);
3031 			pep->refcount--;
3032 			if (pep->refcount == 0) {
3033 				if (pep == prevpep) {
3034 					fromproc->edge = pep->next;
3035 				} else {
3036 					prevpep->next = pep->next;
3037 				}
3038 				toproc->incount--;
3039 				flk_proc_release(toproc);
3040 				flk_free_proc_edge(pep);
3041 			}
3042 			break;
3043 		}
3044 		prevpep = pep;
3045 		pep = pep->next;
3046 	}
3047 	flk_proc_release(fromproc);
3048 	mutex_exit(&flock_lock);
3049 	return;
3050 add:
3051 
3052 	pep = fromproc->edge;
3053 
3054 	while (pep != NULL) {
3055 		if (pep->to_proc == toproc) {
3056 			ASSERT(pep->refcount > 0);
3057 			pep->refcount++;
3058 			break;
3059 		}
3060 		pep = pep->next;
3061 	}
3062 	if (pep == NULL) {
3063 		pep = flk_get_proc_edge();
3064 		pep->to_proc = toproc;
3065 		pep->refcount = 1;
3066 		toproc->incount++;
3067 		pep->next = fromproc->edge;
3068 		fromproc->edge = pep;
3069 	}
3070 	mutex_exit(&flock_lock);
3071 }
3072 
3073 /* ONC_PLUS EXTRACT START */
3074 /*
3075  * Set the control status for lock manager requests.
3076  *
3077  */
3078 
3079 /*
3080  * PSARC case 1997/292
3081  *
3082  * Requires: "nlmid" must be >= 1 and <= clconf_maximum_nodeid().
3083  * Effects: Set the state of the NLM server identified by "nlmid"
3084  *   in the NLM registry to state "nlm_state."
3085  *   Raises exception no_such_nlm if "nlmid" doesn't identify a known
3086  *   NLM server to this LLM.
3087  *   Note that when this routine is called with NLM_SHUTTING_DOWN there
3088  *   may be locks requests that have gotten started but not finished.  In
3089  *   particular, there may be blocking requests that are in the callback code
3090  *   before sleeping (so they're not holding the lock for the graph).  If
3091  *   such a thread reacquires the graph's lock (to go to sleep) after
3092  *   NLM state in the NLM registry  is set to a non-up value,
3093  *   it will notice the status and bail out.  If the request gets
3094  *   granted before the thread can check the NLM registry, let it
3095  *   continue normally.  It will get flushed when we are called with NLM_DOWN.
3096  *
3097  * Modifies: nlm_reg_obj (global)
3098  * Arguments:
3099  *    nlmid	(IN):    id uniquely identifying an NLM server
3100  *    nlm_state (IN):    NLM server state to change "nlmid" to
3101  */
3102 void
3103 cl_flk_set_nlm_status(int nlmid, flk_nlm_status_t nlm_state)
3104 {
3105 	/*
3106 	 * Check to see if node is booted as a cluster. If not, return.
3107 	 */
3108 	if ((cluster_bootflags & CLUSTER_BOOTED) == 0) {
3109 		return;
3110 	}
3111 
3112 	/*
3113 	 * Check for development/debugging.  It is possible to boot a node
3114 	 * in non-cluster mode, and then run a special script, currently
3115 	 * available only to developers, to bring up the node as part of a
3116 	 * cluster.  The problem is that running such a script does not
3117 	 * result in the routine flk_init() being called and hence global array
3118 	 * nlm_reg_status is NULL.  The NLM thinks it's in cluster mode,
3119 	 * but the LLM needs to do an additional check to see if the global
3120 	 * array has been created or not. If nlm_reg_status is NULL, then
3121 	 * return, else continue.
3122 	 */
3123 	if (nlm_reg_status == NULL) {
3124 		return;
3125 	}
3126 
3127 	ASSERT(nlmid <= nlm_status_size && nlmid >= 0);
3128 	mutex_enter(&nlm_reg_lock);
3129 
3130 	if (FLK_REGISTRY_IS_NLM_UNKNOWN(nlm_reg_status, nlmid)) {
3131 		/*
3132 		 * If the NLM server "nlmid" is unknown in the NLM registry,
3133 		 * add it to the registry in the nlm shutting down state.
3134 		 */
3135 		FLK_REGISTRY_CHANGE_NLM_STATE(nlm_reg_status, nlmid,
3136 			FLK_NLM_SHUTTING_DOWN);
3137 	} else {
3138 		/*
3139 		 * Change the state of the NLM server identified by "nlmid"
3140 		 * in the NLM registry to the argument "nlm_state."
3141 		 */
3142 		FLK_REGISTRY_CHANGE_NLM_STATE(nlm_reg_status, nlmid,
3143 			nlm_state);
3144 	}
3145 
3146 	/*
3147 	 *  The reason we must register the NLM server that is shutting down
3148 	 *  with an LLM that doesn't already know about it (never sent a lock
3149 	 *  request) is to handle correctly a race between shutdown and a new
3150 	 *  lock request.  Suppose that a shutdown request from the NLM server
3151 	 *  invokes this routine at the LLM, and a thread is spawned to
3152 	 *  service the request. Now suppose a new lock request is in
3153 	 *  progress and has already passed the first line of defense in
3154 	 *  reclock(), which denies new locks requests from NLM servers
3155 	 *  that are not in the NLM_UP state.  After the current routine
3156 	 *  is invoked for both phases of shutdown, the routine will return,
3157 	 *  having done nothing, and the lock request will proceed and
3158 	 *  probably be granted.  The problem is that the shutdown was ignored
3159 	 *  by the lock request because there was no record of that NLM server
3160 	 *  shutting down.   We will be in the peculiar position of thinking
3161 	 *  that we've shutdown the NLM server and all locks at all LLMs have
3162 	 *  been discarded, but in fact there's still one lock held.
3163 	 *  The solution is to record the existence of NLM server and change
3164 	 *  its state immediately to NLM_SHUTTING_DOWN.  The lock request in
3165 	 *  progress may proceed because the next phase NLM_DOWN will catch
3166 	 *  this lock and discard it.
3167 	 */
3168 	mutex_exit(&nlm_reg_lock);
3169 
3170 	switch (nlm_state) {
3171 	case FLK_NLM_UP:
3172 		/*
3173 		 * Change the NLM state of all locks still held on behalf of
3174 		 * the NLM server identified by "nlmid" to NLM_UP.
3175 		 */
3176 		cl_flk_change_nlm_state_all_locks(nlmid, FLK_NLM_UP);
3177 		break;
3178 
3179 	case FLK_NLM_SHUTTING_DOWN:
3180 		/*
3181 		 * Wake up all sleeping locks for the NLM server identified
3182 		 * by "nlmid." Note that eventually all woken threads will
3183 		 * have their lock requests cancelled and descriptors
3184 		 * removed from the sleeping lock list.  Note that the NLM
3185 		 * server state associated with each lock descriptor is
3186 		 * changed to FLK_NLM_SHUTTING_DOWN.
3187 		 */
3188 		cl_flk_wakeup_sleeping_nlm_locks(nlmid);
3189 		break;
3190 
3191 	case FLK_NLM_DOWN:
3192 		/*
3193 		 * Discard all active, granted locks for this NLM server
3194 		 * identified by "nlmid."
3195 		 */
3196 		cl_flk_unlock_nlm_granted(nlmid);
3197 		break;
3198 
3199 	default:
3200 		panic("cl_set_nlm_status: bad status (%d)", nlm_state);
3201 	}
3202 }
3203 
3204 /*
3205  * Set the control status for lock manager requests.
3206  *
3207  * Note that when this routine is called with FLK_WAKEUP_SLEEPERS, there
3208  * may be locks requests that have gotten started but not finished.  In
3209  * particular, there may be blocking requests that are in the callback code
3210  * before sleeping (so they're not holding the lock for the graph).  If
3211  * such a thread reacquires the graph's lock (to go to sleep) after
3212  * flk_lockmgr_status is set to a non-up value, it will notice the status
3213  * and bail out.  If the request gets granted before the thread can check
3214  * flk_lockmgr_status, let it continue normally.  It will get flushed when
3215  * we are called with FLK_LOCKMGR_DOWN.
3216  */
3217 
3218 void
3219 flk_set_lockmgr_status(flk_lockmgr_status_t status)
3220 {
3221 	int i;
3222 	graph_t *gp;
3223 	struct flock_globals *fg;
3224 
3225 	fg = flk_get_globals();
3226 	ASSERT(fg != NULL);
3227 
3228 	mutex_enter(&flock_lock);
3229 	fg->flk_lockmgr_status = status;
3230 	mutex_exit(&flock_lock);
3231 
3232 	/*
3233 	 * If the lock manager is coming back up, all that's needed is to
3234 	 * propagate this information to the graphs.  If the lock manager
3235 	 * is going down, additional action is required, and each graph's
3236 	 * copy of the state is updated atomically with this other action.
3237 	 */
3238 	switch (status) {
3239 	case FLK_LOCKMGR_UP:
3240 		for (i = 0; i < HASH_SIZE; i++) {
3241 			mutex_enter(&flock_lock);
3242 			gp = lock_graph[i];
3243 			mutex_exit(&flock_lock);
3244 			if (gp == NULL)
3245 				continue;
3246 			mutex_enter(&gp->gp_mutex);
3247 			fg->lockmgr_status[i] = status;
3248 			mutex_exit(&gp->gp_mutex);
3249 		}
3250 		break;
3251 	case FLK_WAKEUP_SLEEPERS:
3252 		wakeup_sleeping_lockmgr_locks(fg);
3253 		break;
3254 	case FLK_LOCKMGR_DOWN:
3255 		unlock_lockmgr_granted(fg);
3256 		break;
3257 	default:
3258 		panic("flk_set_lockmgr_status: bad status (%d)", status);
3259 		break;
3260 	}
3261 }
3262 
3263 /*
3264  * This routine returns all the locks that are active or sleeping and are
3265  * associated with a particular set of identifiers.  If lock_state != 0, then
3266  * only locks that match the lock_state are returned. If lock_state == 0, then
3267  * all locks are returned. If pid == NOPID, the pid is ignored.  If
3268  * use_sysid is FALSE, then the sysid is ignored.  If vp is NULL, then the
3269  * vnode pointer is ignored.
3270  *
3271  * A list containing the vnode pointer and an flock structure
3272  * describing the lock is returned.  Each element in the list is
3273  * dynamically allocated and must be freed by the caller.  The
3274  * last item in the list is denoted by a NULL value in the ll_next
3275  * field.
3276  *
3277  * The vnode pointers returned are held.  The caller is responsible
3278  * for releasing these.  Note that the returned list is only a snapshot of
3279  * the current lock information, and that it is a snapshot of a moving
3280  * target (only one graph is locked at a time).
3281  */
3282 
3283 locklist_t *
3284 get_lock_list(int list_type, int lock_state, int sysid, boolean_t use_sysid,
3285 		pid_t pid, const vnode_t *vp, zoneid_t zoneid)
3286 {
3287 	lock_descriptor_t	*lock;
3288 	lock_descriptor_t	*graph_head;
3289 	locklist_t		listhead;
3290 	locklist_t		*llheadp;
3291 	locklist_t		*llp;
3292 	locklist_t		*lltp;
3293 	graph_t			*gp;
3294 	int			i;
3295 	int			first_index; /* graph index */
3296 	int			num_indexes; /* graph index */
3297 
3298 	ASSERT((list_type == FLK_ACTIVE_STATE) ||
3299 	    (list_type == FLK_SLEEPING_STATE));
3300 
3301 	/*
3302 	 * Get a pointer to something to use as a list head while building
3303 	 * the rest of the list.
3304 	 */
3305 	llheadp = &listhead;
3306 	lltp = llheadp;
3307 	llheadp->ll_next = (locklist_t *)NULL;
3308 
3309 	/* Figure out which graphs we want to look at. */
3310 	if (vp == NULL) {
3311 		first_index = 0;
3312 		num_indexes = HASH_SIZE;
3313 	} else {
3314 		first_index = HASH_INDEX(vp);
3315 		num_indexes = 1;
3316 	}
3317 
3318 	for (i = first_index; i < first_index + num_indexes; i++) {
3319 		mutex_enter(&flock_lock);
3320 		gp = lock_graph[i];
3321 		mutex_exit(&flock_lock);
3322 		if (gp == NULL) {
3323 			continue;
3324 		}
3325 
3326 		mutex_enter(&gp->gp_mutex);
3327 		graph_head = (list_type == FLK_ACTIVE_STATE) ?
3328 			ACTIVE_HEAD(gp) : SLEEPING_HEAD(gp);
3329 		for (lock = graph_head->l_next;
3330 		    lock != graph_head;
3331 		    lock = lock->l_next) {
3332 			if (use_sysid && lock->l_flock.l_sysid != sysid)
3333 				continue;
3334 			if (pid != NOPID && lock->l_flock.l_pid != pid)
3335 				continue;
3336 			if (vp != NULL && lock->l_vnode != vp)
3337 				continue;
3338 			if (lock_state && !(lock_state & lock->l_state))
3339 				continue;
3340 			if (zoneid != lock->l_zoneid && zoneid != ALL_ZONES)
3341 				continue;
3342 			/*
3343 			 * A matching lock was found.  Allocate
3344 			 * space for a new locklist entry and fill
3345 			 * it in.
3346 			 */
3347 			llp = kmem_alloc(sizeof (locklist_t), KM_SLEEP);
3348 			lltp->ll_next = llp;
3349 			VN_HOLD(lock->l_vnode);
3350 			llp->ll_vp = lock->l_vnode;
3351 			create_flock(lock, &(llp->ll_flock));
3352 			llp->ll_next = (locklist_t *)NULL;
3353 			lltp = llp;
3354 		}
3355 		mutex_exit(&gp->gp_mutex);
3356 	}
3357 
3358 	llp = llheadp->ll_next;
3359 	return (llp);
3360 }
3361 
3362 /*
3363  * These two functions are simply interfaces to get_lock_list.  They return
3364  * a list of sleeping or active locks for the given sysid and pid.  See
3365  * get_lock_list for details.
3366  *
3367  * In either case we don't particularly care to specify the zone of interest;
3368  * the sysid-space is global across zones, so the sysid will map to exactly one
3369  * zone, and we'll return information for that zone.
3370  */
3371 
3372 locklist_t *
3373 flk_get_sleeping_locks(int sysid, pid_t pid)
3374 {
3375 	return (get_lock_list(FLK_SLEEPING_STATE, 0, sysid, B_TRUE, pid, NULL,
3376 		    ALL_ZONES));
3377 }
3378 
3379 locklist_t *
3380 flk_get_active_locks(int sysid, pid_t pid)
3381 {
3382 	return (get_lock_list(FLK_ACTIVE_STATE, 0, sysid, B_TRUE, pid, NULL,
3383 		    ALL_ZONES));
3384 }
3385 
3386 /*
3387  * Another interface to get_lock_list.  This one returns all the active
3388  * locks for a given vnode.  Again, see get_lock_list for details.
3389  *
3390  * We don't need to specify which zone's locks we're interested in.  The matter
3391  * would only be interesting if the vnode belonged to NFS, and NFS vnodes can't
3392  * be used by multiple zones, so the list of locks will all be from the right
3393  * zone.
3394  */
3395 
3396 locklist_t *
3397 flk_active_locks_for_vp(const vnode_t *vp)
3398 {
3399 	return (get_lock_list(FLK_ACTIVE_STATE, 0, 0, B_FALSE, NOPID, vp,
3400 		    ALL_ZONES));
3401 }
3402 
3403 /*
3404  * Another interface to get_lock_list.  This one returns all the active
3405  * nbmand locks for a given vnode.  Again, see get_lock_list for details.
3406  *
3407  * See the comment for flk_active_locks_for_vp() for why we don't care to
3408  * specify the particular zone of interest.
3409  */
3410 locklist_t *
3411 flk_active_nbmand_locks_for_vp(const vnode_t *vp)
3412 {
3413 	return (get_lock_list(FLK_ACTIVE_STATE, NBMAND_LOCK, 0, B_FALSE,
3414 				NOPID, vp, ALL_ZONES));
3415 }
3416 
3417 /*
3418  * Another interface to get_lock_list.  This one returns all the active
3419  * nbmand locks for a given pid.  Again, see get_lock_list for details.
3420  *
3421  * The zone doesn't need to be specified here; the locks held by a
3422  * particular process will either be local (ie, non-NFS) or from the zone
3423  * the process is executing in.  This is because other parts of the system
3424  * ensure that an NFS vnode can't be used in a zone other than that in
3425  * which it was opened.
3426  */
3427 locklist_t *
3428 flk_active_nbmand_locks(pid_t pid)
3429 {
3430 	return (get_lock_list(FLK_ACTIVE_STATE, NBMAND_LOCK, 0, B_FALSE,
3431 				pid, NULL, ALL_ZONES));
3432 }
3433 
3434 /*
3435  * Free up all entries in the locklist.
3436  */
3437 void
3438 flk_free_locklist(locklist_t *llp)
3439 {
3440 	locklist_t *next_llp;
3441 
3442 	while (llp) {
3443 		next_llp = llp->ll_next;
3444 		VN_RELE(llp->ll_vp);
3445 		kmem_free(llp, sizeof (*llp));
3446 		llp = next_llp;
3447 	}
3448 }
3449 
3450 static void
3451 cl_flk_change_nlm_state_all_locks(int nlmid, flk_nlm_status_t nlm_state)
3452 {
3453 	/*
3454 	 * For each graph "lg" in the hash table lock_graph do
3455 	 * a.  Get the list of sleeping locks
3456 	 * b.  For each lock descriptor in the list do
3457 	 *	i.   If the requested lock is an NLM server request AND
3458 	 *		the nlmid is the same as the routine argument then
3459 	 *		change the lock descriptor's state field to
3460 	 *		"nlm_state."
3461 	 * c.  Get the list of active locks
3462 	 * d.  For each lock descriptor in the list do
3463 	 *	i.   If the requested lock is an NLM server request AND
3464 	 *		the nlmid is the same as the routine argument then
3465 	 *		change the lock descriptor's state field to
3466 	 *		"nlm_state."
3467 	 */
3468 
3469 	int			i;
3470 	graph_t			*gp;			/* lock graph */
3471 	lock_descriptor_t	*lock;			/* lock */
3472 	lock_descriptor_t	*nlock = NULL;		/* next lock */
3473 	int			lock_nlmid;
3474 
3475 	for (i = 0; i < HASH_SIZE; i++) {
3476 		mutex_enter(&flock_lock);
3477 		gp = lock_graph[i];
3478 		mutex_exit(&flock_lock);
3479 		if (gp == NULL) {
3480 			continue;
3481 		}
3482 
3483 		/* Get list of sleeping locks in current lock graph. */
3484 		mutex_enter(&gp->gp_mutex);
3485 		for (lock = SLEEPING_HEAD(gp)->l_next;
3486 		    lock != SLEEPING_HEAD(gp);
3487 		    lock = nlock) {
3488 			nlock = lock->l_next;
3489 			/* get NLM id */
3490 			lock_nlmid = GETNLMID(lock->l_flock.l_sysid);
3491 
3492 			/*
3493 			 * If NLM server request AND nlmid of lock matches
3494 			 * nlmid of argument, then set the NLM state of the
3495 			 * lock to "nlm_state."
3496 			 */
3497 			if (IS_LOCKMGR(lock) && nlmid == lock_nlmid) {
3498 				SET_NLM_STATE(lock, nlm_state);
3499 			}
3500 		}
3501 
3502 		/* Get list of active locks in current lock graph. */
3503 		for (lock = ACTIVE_HEAD(gp)->l_next;
3504 		    lock != ACTIVE_HEAD(gp);
3505 		    lock = nlock) {
3506 			nlock = lock->l_next;
3507 			/* get NLM id */
3508 			lock_nlmid = GETNLMID(lock->l_flock.l_sysid);
3509 
3510 			/*
3511 			 * If NLM server request AND nlmid of lock matches
3512 			 * nlmid of argument, then set the NLM state of the
3513 			 * lock to "nlm_state."
3514 			 */
3515 			if (IS_LOCKMGR(lock) && nlmid == lock_nlmid) {
3516 				ASSERT(IS_ACTIVE(lock));
3517 				SET_NLM_STATE(lock, nlm_state);
3518 			}
3519 		}
3520 		mutex_exit(&gp->gp_mutex);
3521 	}
3522 }
3523 
3524 /*
3525  * Requires: "nlmid" >= 1 and <= clconf_maximum_nodeid().
3526  * Effects: Find all sleeping lock manager requests _only_ for the NLM server
3527  *   identified by "nlmid." Poke those lock requests.
3528  */
3529 static void
3530 cl_flk_wakeup_sleeping_nlm_locks(int nlmid)
3531 {
3532 	lock_descriptor_t *lock;
3533 	lock_descriptor_t *nlock = NULL; /* next lock */
3534 	int i;
3535 	graph_t *gp;
3536 	int	lock_nlmid;
3537 
3538 	for (i = 0; i < HASH_SIZE; i++) {
3539 		mutex_enter(&flock_lock);
3540 		gp = lock_graph[i];
3541 		mutex_exit(&flock_lock);
3542 		if (gp == NULL) {
3543 			continue;
3544 		}
3545 
3546 		mutex_enter(&gp->gp_mutex);
3547 		for (lock = SLEEPING_HEAD(gp)->l_next;
3548 		    lock != SLEEPING_HEAD(gp);
3549 		    lock = nlock) {
3550 			nlock = lock->l_next;
3551 			/*
3552 			 * If NLM server request _and_ nlmid of lock matches
3553 			 * nlmid of argument, then set the NLM state of the
3554 			 * lock to NLM_SHUTTING_DOWN, and wake up sleeping
3555 			 * request.
3556 			 */
3557 			if (IS_LOCKMGR(lock)) {
3558 				/* get NLM id */
3559 				lock_nlmid =
3560 					GETNLMID(lock->l_flock.l_sysid);
3561 				if (nlmid == lock_nlmid) {
3562 					SET_NLM_STATE(lock,
3563 						FLK_NLM_SHUTTING_DOWN);
3564 					INTERRUPT_WAKEUP(lock);
3565 				}
3566 			}
3567 		}
3568 		mutex_exit(&gp->gp_mutex);
3569 	}
3570 }
3571 
3572 /*
3573  * Requires: "nlmid" >= 1 and <= clconf_maximum_nodeid()
3574  * Effects:  Find all active (granted) lock manager locks _only_ for the
3575  *   NLM server identified by "nlmid" and release them.
3576  */
3577 static void
3578 cl_flk_unlock_nlm_granted(int nlmid)
3579 {
3580 	lock_descriptor_t *lock;
3581 	lock_descriptor_t *nlock = NULL; /* next lock */
3582 	int i;
3583 	graph_t *gp;
3584 	int	lock_nlmid;
3585 
3586 	for (i = 0; i < HASH_SIZE; i++) {
3587 		mutex_enter(&flock_lock);
3588 		gp = lock_graph[i];
3589 		mutex_exit(&flock_lock);
3590 		if (gp == NULL) {
3591 			continue;
3592 		}
3593 
3594 		mutex_enter(&gp->gp_mutex);
3595 		for (lock = ACTIVE_HEAD(gp)->l_next;
3596 		    lock != ACTIVE_HEAD(gp);
3597 		    lock = nlock) {
3598 			nlock = lock->l_next;
3599 			ASSERT(IS_ACTIVE(lock));
3600 
3601 			/*
3602 			 * If it's an  NLM server request _and_ nlmid of
3603 			 * the lock matches nlmid of argument, then
3604 			 * remove the active lock the list, wakup blocked
3605 			 * threads, and free the storage for the lock.
3606 			 * Note that there's no need to mark the NLM state
3607 			 * of this lock to NLM_DOWN because the lock will
3608 			 * be deleted anyway and its storage freed.
3609 			 */
3610 			if (IS_LOCKMGR(lock)) {
3611 				/* get NLM id */
3612 				lock_nlmid = GETNLMID(lock->l_flock.l_sysid);
3613 				if (nlmid == lock_nlmid) {
3614 					flk_delete_active_lock(lock, 0);
3615 					flk_wakeup(lock, 1);
3616 					flk_free_lock(lock);
3617 				}
3618 			}
3619 		}
3620 		mutex_exit(&gp->gp_mutex);
3621 	}
3622 }
3623 
3624 /*
3625  * Find all sleeping lock manager requests and poke them.
3626  */
3627 static void
3628 wakeup_sleeping_lockmgr_locks(struct flock_globals *fg)
3629 {
3630 	lock_descriptor_t *lock;
3631 	lock_descriptor_t *nlock = NULL; /* next lock */
3632 	int i;
3633 	graph_t *gp;
3634 	zoneid_t zoneid = getzoneid();
3635 
3636 	for (i = 0; i < HASH_SIZE; i++) {
3637 		mutex_enter(&flock_lock);
3638 		gp = lock_graph[i];
3639 		mutex_exit(&flock_lock);
3640 		if (gp == NULL) {
3641 			continue;
3642 		}
3643 
3644 		mutex_enter(&gp->gp_mutex);
3645 		fg->lockmgr_status[i] = FLK_WAKEUP_SLEEPERS;
3646 		for (lock = SLEEPING_HEAD(gp)->l_next;
3647 		    lock != SLEEPING_HEAD(gp);
3648 		    lock = nlock) {
3649 			nlock = lock->l_next;
3650 			if (IS_LOCKMGR(lock) && lock->l_zoneid == zoneid) {
3651 				INTERRUPT_WAKEUP(lock);
3652 			}
3653 		}
3654 		mutex_exit(&gp->gp_mutex);
3655 	}
3656 }
3657 
3658 
3659 /*
3660  * Find all active (granted) lock manager locks and release them.
3661  */
3662 static void
3663 unlock_lockmgr_granted(struct flock_globals *fg)
3664 {
3665 	lock_descriptor_t *lock;
3666 	lock_descriptor_t *nlock = NULL; /* next lock */
3667 	int i;
3668 	graph_t *gp;
3669 	zoneid_t zoneid = getzoneid();
3670 
3671 	for (i = 0; i < HASH_SIZE; i++) {
3672 		mutex_enter(&flock_lock);
3673 		gp = lock_graph[i];
3674 		mutex_exit(&flock_lock);
3675 		if (gp == NULL) {
3676 			continue;
3677 		}
3678 
3679 		mutex_enter(&gp->gp_mutex);
3680 		fg->lockmgr_status[i] = FLK_LOCKMGR_DOWN;
3681 		for (lock = ACTIVE_HEAD(gp)->l_next;
3682 		    lock != ACTIVE_HEAD(gp);
3683 		    lock = nlock) {
3684 			nlock = lock->l_next;
3685 			if (IS_LOCKMGR(lock) && lock->l_zoneid == zoneid) {
3686 				ASSERT(IS_ACTIVE(lock));
3687 				flk_delete_active_lock(lock, 0);
3688 				flk_wakeup(lock, 1);
3689 				flk_free_lock(lock);
3690 			}
3691 		}
3692 		mutex_exit(&gp->gp_mutex);
3693 	}
3694 }
3695 /* ONC_PLUS EXTRACT END */
3696 
3697 
3698 /*
3699  * Wait until a lock is granted, cancelled, or interrupted.
3700  */
3701 
3702 static void
3703 wait_for_lock(lock_descriptor_t *request)
3704 {
3705 	graph_t *gp = request->l_graph;
3706 
3707 	ASSERT(MUTEX_HELD(&gp->gp_mutex));
3708 
3709 	while (!(IS_GRANTED(request)) && !(IS_CANCELLED(request)) &&
3710 	    !(IS_INTERRUPTED(request))) {
3711 		if (!cv_wait_sig(&request->l_cv, &gp->gp_mutex)) {
3712 			flk_set_state(request, FLK_INTERRUPTED_STATE);
3713 			request->l_state |= INTERRUPTED_LOCK;
3714 		}
3715 	}
3716 }
3717 
3718 /* ONC_PLUS EXTRACT START */
3719 /*
3720  * Create an flock structure from the existing lock information
3721  *
3722  * This routine is used to create flock structures for the lock manager
3723  * to use in a reclaim request.  Since the lock was originated on this
3724  * host, it must be conforming to UNIX semantics, so no checking is
3725  * done to make sure it falls within the lower half of the 32-bit range.
3726  */
3727 
3728 static void
3729 create_flock(lock_descriptor_t *lp, flock64_t *flp)
3730 {
3731 	ASSERT(lp->l_end == MAX_U_OFFSET_T || lp->l_end <= MAXEND);
3732 	ASSERT(lp->l_end >= lp->l_start);
3733 
3734 	flp->l_type = lp->l_type;
3735 	flp->l_whence = 0;
3736 	flp->l_start = lp->l_start;
3737 	flp->l_len = (lp->l_end == MAX_U_OFFSET_T) ? 0 :
3738 		(lp->l_end - lp->l_start + 1);
3739 	flp->l_sysid = lp->l_flock.l_sysid;
3740 	flp->l_pid = lp->l_flock.l_pid;
3741 }
3742 
3743 /*
3744  * Convert flock_t data describing a lock range into unsigned long starting
3745  * and ending points, which are put into lock_request.  Returns 0 or an
3746  * errno value.
3747  * Large Files: max is passed by the caller and we return EOVERFLOW
3748  * as defined by LFS API.
3749  */
3750 
3751 int
3752 flk_convert_lock_data(vnode_t *vp, flock64_t *flp,
3753     u_offset_t *start, u_offset_t *end, offset_t offset)
3754 {
3755 	struct vattr	vattr;
3756 	int	error;
3757 
3758 	/*
3759 	 * Determine the starting point of the request
3760 	 */
3761 	switch (flp->l_whence) {
3762 	case 0:		/* SEEK_SET */
3763 		*start = (u_offset_t)flp->l_start;
3764 		break;
3765 	case 1:		/* SEEK_CUR */
3766 		*start = (u_offset_t)(flp->l_start + offset);
3767 		break;
3768 	case 2:		/* SEEK_END */
3769 		vattr.va_mask = AT_SIZE;
3770 		if (error = VOP_GETATTR(vp, &vattr, 0, CRED(), NULL))
3771 			return (error);
3772 		*start = (u_offset_t)(flp->l_start + vattr.va_size);
3773 		break;
3774 	default:
3775 		return (EINVAL);
3776 	}
3777 
3778 	/*
3779 	 * Determine the range covered by the request.
3780 	 */
3781 	if (flp->l_len == 0)
3782 		*end = MAX_U_OFFSET_T;
3783 	else if ((offset_t)flp->l_len > 0) {
3784 		*end = (u_offset_t)(*start + (flp->l_len - 1));
3785 	} else {
3786 		/*
3787 		 * Negative length; why do we even allow this ?
3788 		 * Because this allows easy specification of
3789 		 * the last n bytes of the file.
3790 		 */
3791 		*end = *start;
3792 		*start += (u_offset_t)flp->l_len;
3793 		(*start)++;
3794 	}
3795 	return (0);
3796 }
3797 
3798 /*
3799  * Check the validity of lock data.  This can used by the NFS
3800  * frlock routines to check data before contacting the server.  The
3801  * server must support semantics that aren't as restrictive as
3802  * the UNIX API, so the NFS client is required to check.
3803  * The maximum is now passed in by the caller.
3804  */
3805 
3806 int
3807 flk_check_lock_data(u_offset_t start, u_offset_t end, offset_t max)
3808 {
3809 	/*
3810 	 * The end (length) for local locking should never be greater
3811 	 * than MAXEND. However, the representation for
3812 	 * the entire file is MAX_U_OFFSET_T.
3813 	 */
3814 	if ((start > max) ||
3815 	    ((end > max) && (end != MAX_U_OFFSET_T))) {
3816 		return (EINVAL);
3817 	}
3818 	if (start > end) {
3819 	    return (EINVAL);
3820 	}
3821 	return (0);
3822 }
3823 
3824 /*
3825  * Fill in request->l_flock with information about the lock blocking the
3826  * request.  The complexity here is that lock manager requests are allowed
3827  * to see into the upper part of the 32-bit address range, whereas local
3828  * requests are only allowed to see signed values.
3829  *
3830  * What should be done when "blocker" is a lock manager lock that uses the
3831  * upper portion of the 32-bit range, but "request" is local?  Since the
3832  * request has already been determined to have been blocked by the blocker,
3833  * at least some portion of "blocker" must be in the range of the request,
3834  * or the request extends to the end of file.  For the first case, the
3835  * portion in the lower range is returned with the indication that it goes
3836  * "to EOF."  For the second case, the last byte of the lower range is
3837  * returned with the indication that it goes "to EOF."
3838  */
3839 
3840 static void
3841 report_blocker(lock_descriptor_t *blocker, lock_descriptor_t *request)
3842 {
3843 	flock64_t *flrp;			/* l_flock portion of request */
3844 
3845 	ASSERT(blocker != NULL);
3846 
3847 	flrp = &request->l_flock;
3848 	flrp->l_whence = 0;
3849 	flrp->l_type = blocker->l_type;
3850 	flrp->l_pid = blocker->l_flock.l_pid;
3851 	flrp->l_sysid = blocker->l_flock.l_sysid;
3852 
3853 	if (IS_LOCKMGR(request)) {
3854 		flrp->l_start = blocker->l_start;
3855 		if (blocker->l_end == MAX_U_OFFSET_T)
3856 			flrp->l_len = 0;
3857 		else
3858 			flrp->l_len = blocker->l_end - blocker->l_start + 1;
3859 	} else {
3860 		if (blocker->l_start > MAXEND) {
3861 			flrp->l_start = MAXEND;
3862 			flrp->l_len = 0;
3863 		} else {
3864 			flrp->l_start = blocker->l_start;
3865 			if (blocker->l_end == MAX_U_OFFSET_T)
3866 				flrp->l_len = 0;
3867 			else
3868 				flrp->l_len = blocker->l_end -
3869 					blocker->l_start + 1;
3870 		}
3871 	}
3872 }
3873 /* ONC_PLUS EXTRACT END */
3874 
3875 /*
3876  * PSARC case 1997/292
3877  */
3878 /*
3879  * This is the public routine exported by flock.h.
3880  */
3881 void
3882 cl_flk_change_nlm_state_to_unknown(int nlmid)
3883 {
3884 	/*
3885 	 * Check to see if node is booted as a cluster. If not, return.
3886 	 */
3887 	if ((cluster_bootflags & CLUSTER_BOOTED) == 0) {
3888 		return;
3889 	}
3890 
3891 	/*
3892 	 * See comment in cl_flk_set_nlm_status().
3893 	 */
3894 	if (nlm_reg_status == NULL) {
3895 		return;
3896 	}
3897 
3898 	/*
3899 	 * protect NLM registry state with a mutex.
3900 	 */
3901 	ASSERT(nlmid <= nlm_status_size && nlmid >= 0);
3902 	mutex_enter(&nlm_reg_lock);
3903 	FLK_REGISTRY_CHANGE_NLM_STATE(nlm_reg_status, nlmid, FLK_NLM_UNKNOWN);
3904 	mutex_exit(&nlm_reg_lock);
3905 }
3906 
3907 /*
3908  * Return non-zero if the given I/O request conflicts with an active NBMAND
3909  * lock.
3910  * If svmand is non-zero, it means look at all active locks, not just NBMAND
3911  * locks.
3912  */
3913 
3914 int
3915 nbl_lock_conflict(vnode_t *vp, nbl_op_t op, u_offset_t offset,
3916 		ssize_t length, int svmand, caller_context_t *ct)
3917 {
3918 	int conflict = 0;
3919 	graph_t			*gp;
3920 	lock_descriptor_t	*lock;
3921 	pid_t pid;
3922 	int sysid;
3923 
3924 	if (ct == NULL) {
3925 		pid = curproc->p_pid;
3926 		sysid = 0;
3927 	} else {
3928 		pid = ct->cc_pid;
3929 		sysid = ct->cc_sysid;
3930 	}
3931 
3932 	mutex_enter(&flock_lock);
3933 	gp = lock_graph[HASH_INDEX(vp)];
3934 	mutex_exit(&flock_lock);
3935 	if (gp == NULL)
3936 		return (0);
3937 
3938 	mutex_enter(&gp->gp_mutex);
3939 	SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
3940 
3941 	for (; lock && lock->l_vnode == vp; lock = lock->l_next) {
3942 		if ((svmand || (lock->l_state & NBMAND_LOCK)) &&
3943 		    (lock->l_flock.l_sysid != sysid ||
3944 		    lock->l_flock.l_pid != pid) &&
3945 		    lock_blocks_io(op, offset, length,
3946 				lock->l_type, lock->l_start, lock->l_end)) {
3947 			conflict = 1;
3948 			break;
3949 		}
3950 	}
3951 	mutex_exit(&gp->gp_mutex);
3952 
3953 	return (conflict);
3954 }
3955 
3956 /*
3957  * Return non-zero if the given I/O request conflicts with the given lock.
3958  */
3959 
3960 static int
3961 lock_blocks_io(nbl_op_t op, u_offset_t offset, ssize_t length,
3962 	    int lock_type, u_offset_t lock_start, u_offset_t lock_end)
3963 {
3964 	ASSERT(op == NBL_READ || op == NBL_WRITE || op == NBL_READWRITE);
3965 	ASSERT(lock_type == F_RDLCK || lock_type == F_WRLCK);
3966 
3967 	if (op == NBL_READ && lock_type == F_RDLCK)
3968 		return (0);
3969 
3970 	if (offset <= lock_start && lock_start < offset + length)
3971 		return (1);
3972 	if (lock_start <= offset && offset <= lock_end)
3973 		return (1);
3974 
3975 	return (0);
3976 }
3977 
3978 #ifdef DEBUG
3979 static void
3980 check_active_locks(graph_t *gp)
3981 {
3982 	lock_descriptor_t *lock, *lock1;
3983 	edge_t	*ep;
3984 
3985 	for (lock = ACTIVE_HEAD(gp)->l_next; lock != ACTIVE_HEAD(gp);
3986 						lock = lock->l_next) {
3987 		ASSERT(IS_ACTIVE(lock));
3988 		ASSERT(NOT_BLOCKED(lock));
3989 		ASSERT(!IS_BARRIER(lock));
3990 
3991 		ep = FIRST_IN(lock);
3992 
3993 		while (ep != HEAD(lock)) {
3994 			ASSERT(IS_SLEEPING(ep->from_vertex));
3995 			ASSERT(!NOT_BLOCKED(ep->from_vertex));
3996 			ep = NEXT_IN(ep);
3997 		}
3998 
3999 		for (lock1 = lock->l_next; lock1 != ACTIVE_HEAD(gp);
4000 					lock1 = lock1->l_next) {
4001 			if (lock1->l_vnode == lock->l_vnode) {
4002 			if (BLOCKS(lock1, lock)) {
4003 				cmn_err(CE_PANIC,
4004 				    "active lock %p blocks %p",
4005 				    (void *)lock1, (void *)lock);
4006 			} else if (BLOCKS(lock, lock1)) {
4007 				cmn_err(CE_PANIC,
4008 				    "active lock %p blocks %p",
4009 				    (void *)lock, (void *)lock1);
4010 			}
4011 			}
4012 		}
4013 	}
4014 }
4015 
4016 /*
4017  * Effect: This functions checks to see if the transition from 'old_state' to
4018  *	'new_state' is a valid one.  It returns 0 if the transition is valid
4019  *	and 1 if it is not.
4020  *	For a map of valid transitions, see sys/flock_impl.h
4021  */
4022 static int
4023 check_lock_transition(int old_state, int new_state)
4024 {
4025 	switch (old_state) {
4026 	case FLK_INITIAL_STATE:
4027 		if ((new_state == FLK_START_STATE) ||
4028 		    (new_state == FLK_SLEEPING_STATE) ||
4029 		    (new_state == FLK_ACTIVE_STATE) ||
4030 		    (new_state == FLK_DEAD_STATE)) {
4031 			return (0);
4032 		} else {
4033 			return (1);
4034 		}
4035 	case FLK_START_STATE:
4036 		if ((new_state == FLK_ACTIVE_STATE) ||
4037 		    (new_state == FLK_DEAD_STATE)) {
4038 			return (0);
4039 		} else {
4040 			return (1);
4041 		}
4042 	case FLK_ACTIVE_STATE:
4043 		if (new_state == FLK_DEAD_STATE) {
4044 			return (0);
4045 		} else {
4046 			return (1);
4047 		}
4048 	case FLK_SLEEPING_STATE:
4049 		if ((new_state == FLK_GRANTED_STATE) ||
4050 		    (new_state == FLK_INTERRUPTED_STATE) ||
4051 		    (new_state == FLK_CANCELLED_STATE)) {
4052 			return (0);
4053 		} else {
4054 			return (1);
4055 		}
4056 	case FLK_GRANTED_STATE:
4057 		if ((new_state == FLK_START_STATE) ||
4058 		    (new_state == FLK_INTERRUPTED_STATE) ||
4059 		    (new_state == FLK_CANCELLED_STATE)) {
4060 			return (0);
4061 		} else {
4062 			return (1);
4063 		}
4064 	case FLK_CANCELLED_STATE:
4065 		if ((new_state == FLK_INTERRUPTED_STATE) ||
4066 		    (new_state == FLK_DEAD_STATE)) {
4067 			return (0);
4068 		} else {
4069 			return (1);
4070 		}
4071 	case FLK_INTERRUPTED_STATE:
4072 		if (new_state == FLK_DEAD_STATE) {
4073 			return (0);
4074 		} else {
4075 			return (1);
4076 		}
4077 	case FLK_DEAD_STATE:
4078 		/* May be set more than once */
4079 		if (new_state == FLK_DEAD_STATE) {
4080 			return (0);
4081 		} else {
4082 			return (1);
4083 		}
4084 	default:
4085 		return (1);
4086 	}
4087 }
4088 
4089 static void
4090 check_sleeping_locks(graph_t *gp)
4091 {
4092 	lock_descriptor_t *lock1, *lock2;
4093 	edge_t *ep;
4094 	for (lock1 = SLEEPING_HEAD(gp)->l_next; lock1 != SLEEPING_HEAD(gp);
4095 				lock1 = lock1->l_next) {
4096 				ASSERT(!IS_BARRIER(lock1));
4097 	for (lock2 = lock1->l_next; lock2 != SLEEPING_HEAD(gp);
4098 				lock2 = lock2->l_next) {
4099 		if (lock1->l_vnode == lock2->l_vnode) {
4100 			if (BLOCKS(lock2, lock1)) {
4101 				ASSERT(!IS_GRANTED(lock1));
4102 				ASSERT(!NOT_BLOCKED(lock1));
4103 				path(lock1, lock2);
4104 			}
4105 		}
4106 	}
4107 
4108 	for (lock2 = ACTIVE_HEAD(gp)->l_next; lock2 != ACTIVE_HEAD(gp);
4109 					lock2 = lock2->l_next) {
4110 				ASSERT(!IS_BARRIER(lock1));
4111 		if (lock1->l_vnode == lock2->l_vnode) {
4112 			if (BLOCKS(lock2, lock1)) {
4113 				ASSERT(!IS_GRANTED(lock1));
4114 				ASSERT(!NOT_BLOCKED(lock1));
4115 				path(lock1, lock2);
4116 			}
4117 		}
4118 	}
4119 	ep = FIRST_ADJ(lock1);
4120 	while (ep != HEAD(lock1)) {
4121 		ASSERT(BLOCKS(ep->to_vertex, lock1));
4122 		ep = NEXT_ADJ(ep);
4123 	}
4124 	}
4125 }
4126 
4127 static int
4128 level_two_path(lock_descriptor_t *lock1, lock_descriptor_t *lock2, int no_path)
4129 {
4130 	edge_t	*ep;
4131 	lock_descriptor_t	*vertex;
4132 	lock_descriptor_t *vertex_stack;
4133 
4134 	STACK_INIT(vertex_stack);
4135 
4136 	flk_graph_uncolor(lock1->l_graph);
4137 	ep = FIRST_ADJ(lock1);
4138 	ASSERT(ep != HEAD(lock1));
4139 	while (ep != HEAD(lock1)) {
4140 		if (no_path)
4141 			ASSERT(ep->to_vertex != lock2);
4142 		STACK_PUSH(vertex_stack, ep->to_vertex, l_dstack);
4143 		COLOR(ep->to_vertex);
4144 		ep = NEXT_ADJ(ep);
4145 	}
4146 
4147 	while ((vertex = STACK_TOP(vertex_stack)) != NULL) {
4148 		STACK_POP(vertex_stack, l_dstack);
4149 		for (ep = FIRST_ADJ(vertex); ep != HEAD(vertex);
4150 						ep = NEXT_ADJ(ep)) {
4151 			if (COLORED(ep->to_vertex))
4152 				continue;
4153 			COLOR(ep->to_vertex);
4154 			if (ep->to_vertex == lock2)
4155 				return (1);
4156 
4157 			STACK_PUSH(vertex_stack, ep->to_vertex, l_dstack);
4158 		}
4159 	}
4160 	return (0);
4161 }
4162 
4163 static void
4164 check_owner_locks(graph_t *gp, pid_t pid, int sysid, vnode_t *vp)
4165 {
4166 	lock_descriptor_t *lock;
4167 
4168 	SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
4169 
4170 	if (lock) {
4171 		while (lock != ACTIVE_HEAD(gp) && (lock->l_vnode == vp)) {
4172 			if (lock->l_flock.l_pid == pid &&
4173 			    lock->l_flock.l_sysid == sysid)
4174 				cmn_err(CE_PANIC,
4175 				    "owner pid %d's lock %p in active queue",
4176 				    pid, (void *)lock);
4177 			lock = lock->l_next;
4178 		}
4179 	}
4180 	SET_LOCK_TO_FIRST_SLEEP_VP(gp, lock, vp);
4181 
4182 	if (lock) {
4183 		while (lock != SLEEPING_HEAD(gp) && (lock->l_vnode == vp)) {
4184 			if (lock->l_flock.l_pid == pid &&
4185 			    lock->l_flock.l_sysid == sysid)
4186 				cmn_err(CE_PANIC,
4187 				    "owner pid %d's lock %p in sleep queue",
4188 				    pid, (void *)lock);
4189 			lock = lock->l_next;
4190 		}
4191 	}
4192 }
4193 
4194 static int
4195 level_one_path(lock_descriptor_t *lock1, lock_descriptor_t *lock2)
4196 {
4197 	edge_t *ep = FIRST_ADJ(lock1);
4198 
4199 	while (ep != HEAD(lock1)) {
4200 		if (ep->to_vertex == lock2)
4201 			return (1);
4202 		else
4203 			ep = NEXT_ADJ(ep);
4204 	}
4205 	return (0);
4206 }
4207 
4208 static int
4209 no_path(lock_descriptor_t *lock1, lock_descriptor_t *lock2)
4210 {
4211 	return (!level_two_path(lock1, lock2, 1));
4212 }
4213 
4214 static void
4215 path(lock_descriptor_t *lock1, lock_descriptor_t *lock2)
4216 {
4217 	if (level_one_path(lock1, lock2)) {
4218 		if (level_two_path(lock1, lock2, 0) != 0) {
4219 			cmn_err(CE_WARN,
4220 			    "one edge one path from lock1 %p lock2 %p",
4221 			    (void *)lock1, (void *)lock2);
4222 		}
4223 	} else if (no_path(lock1, lock2)) {
4224 		cmn_err(CE_PANIC,
4225 		    "No path from  lock1 %p to lock2 %p",
4226 		    (void *)lock1, (void *)lock2);
4227 	}
4228 }
4229 #endif /* DEBUG */
4230