xref: /titanic_41/usr/src/uts/common/os/flock.c (revision c04062bdc8dc59c4474832b0ca88e697ead10df2)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 /*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
28 /*	All Rights Reserved */
29 
30 /*
31  * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
32  */
33 
34 #include <sys/flock_impl.h>
35 #include <sys/vfs.h>
36 #include <sys/t_lock.h>		/* for <sys/callb.h> */
37 #include <sys/callb.h>
38 #include <sys/clconf.h>
39 #include <sys/cladm.h>
40 #include <sys/nbmlock.h>
41 #include <sys/cred.h>
42 #include <sys/policy.h>
43 
44 /*
45  * The following four variables are for statistics purposes and they are
46  * not protected by locks. They may not be accurate but will at least be
47  * close to the actual value.
48  */
49 
50 int	flk_lock_allocs;
51 int	flk_lock_frees;
52 int 	edge_allocs;
53 int	edge_frees;
54 int 	flk_proc_vertex_allocs;
55 int 	flk_proc_edge_allocs;
56 int	flk_proc_vertex_frees;
57 int	flk_proc_edge_frees;
58 
59 static kmutex_t flock_lock;
60 
61 #ifdef DEBUG
62 int check_debug = 0;
63 #define	CHECK_ACTIVE_LOCKS(gp)	if (check_debug) \
64 					check_active_locks(gp);
65 #define	CHECK_SLEEPING_LOCKS(gp)	if (check_debug) \
66 						check_sleeping_locks(gp);
67 #define	CHECK_OWNER_LOCKS(gp, pid, sysid, vp) 	\
68 		if (check_debug)	\
69 			check_owner_locks(gp, pid, sysid, vp);
70 #define	CHECK_LOCK_TRANSITION(old_state, new_state) \
71 	{ \
72 		if (check_lock_transition(old_state, new_state)) { \
73 			cmn_err(CE_PANIC, "Illegal lock transition \
74 			    from %d to %d", old_state, new_state); \
75 		} \
76 	}
77 #else
78 
79 #define	CHECK_ACTIVE_LOCKS(gp)
80 #define	CHECK_SLEEPING_LOCKS(gp)
81 #define	CHECK_OWNER_LOCKS(gp, pid, sysid, vp)
82 #define	CHECK_LOCK_TRANSITION(old_state, new_state)
83 
84 #endif /* DEBUG */
85 
86 struct kmem_cache	*flk_edge_cache;
87 
88 graph_t		*lock_graph[HASH_SIZE];
89 proc_graph_t	pgraph;
90 
91 /*
92  * Clustering.
93  *
94  * NLM REGISTRY TYPE IMPLEMENTATION
95  *
96  * Assumptions:
97  *  1.  Nodes in a cluster are numbered starting at 1; always non-negative
98  *	integers; maximum node id is returned by clconf_maximum_nodeid().
99  *  2.  We use this node id to identify the node an NLM server runs on.
100  */
101 
102 /*
103  * NLM registry object keeps track of NLM servers via their
104  * nlmids (which are the node ids of the node in the cluster they run on)
105  * that have requested locks at this LLM with which this registry is
106  * associated.
107  *
108  * Representation of abstraction:
109  *    rep = record[	states: array[nlm_state],
110  *			lock: mutex]
111  *
112  *    Representation invariants:
113  *	1. index i of rep.states is between 0 and n - 1 where n is number
114  *	   of elements in the array, which happen to be the maximum number
115  *	   of nodes in the cluster configuration + 1.
116  *	2. map nlmid to index i of rep.states
117  *		0   -> 0
118  *		1   -> 1
119  *		2   -> 2
120  *		n-1 -> clconf_maximum_nodeid()
121  *	3.  This 1-1 mapping is quite convenient and it avoids errors resulting
122  *	    from forgetting to subtract 1 from the index.
123  *	4.  The reason we keep the 0th index is the following.  A legitimate
124  *	    cluster configuration includes making a UFS file system NFS
125  *	    exportable.  The code is structured so that if you're in a cluster
126  *	    you do one thing; otherwise, you do something else.  The problem
127  *	    is what to do if you think you're in a cluster with PXFS loaded,
128  *	    but you're using UFS not PXFS?  The upper two bytes of the sysid
129  *	    encode the node id of the node where NLM server runs; these bytes
130  *	    are zero for UFS.  Since the nodeid is used to index into the
131  *	    registry, we can record the NLM server state information at index
132  *	    0 using the same mechanism used for PXFS file locks!
133  */
134 static flk_nlm_status_t *nlm_reg_status = NULL;	/* state array 0..N-1 */
135 static kmutex_t nlm_reg_lock;			/* lock to protect array */
136 static uint_t nlm_status_size;			/* size of state array */
137 
138 /*
139  * Although we need a global lock dependency graph (and associated data
140  * structures), we also need a per-zone notion of whether the lock manager is
141  * running, and so whether to allow lock manager requests or not.
142  *
143  * Thus, on a per-zone basis we maintain a ``global'' variable
144  * (flk_lockmgr_status), protected by flock_lock, and set when the lock
145  * manager is determined to be changing state (starting or stopping).
146  *
147  * Each graph/zone pair also has a copy of this variable, which is protected by
148  * the graph's mutex.
149  *
150  * The per-graph copies are used to synchronize lock requests with shutdown
151  * requests.  The global copy is used to initialize the per-graph field when a
152  * new graph is created.
153  */
154 struct flock_globals {
155 	flk_lockmgr_status_t flk_lockmgr_status;
156 	flk_lockmgr_status_t lockmgr_status[HASH_SIZE];
157 };
158 
159 zone_key_t flock_zone_key;
160 
161 static void create_flock(lock_descriptor_t *, flock64_t *);
162 static lock_descriptor_t	*flk_get_lock(void);
163 static void	flk_free_lock(lock_descriptor_t	*lock);
164 static void	flk_get_first_blocking_lock(lock_descriptor_t *request);
165 static int flk_process_request(lock_descriptor_t *);
166 static int flk_add_edge(lock_descriptor_t *, lock_descriptor_t *, int, int);
167 static edge_t *flk_get_edge(void);
168 static int flk_wait_execute_request(lock_descriptor_t *);
169 static int flk_relation(lock_descriptor_t *, lock_descriptor_t *);
170 static void flk_insert_active_lock(lock_descriptor_t *);
171 static void flk_delete_active_lock(lock_descriptor_t *, int);
172 static void flk_insert_sleeping_lock(lock_descriptor_t *);
173 static void flk_graph_uncolor(graph_t *);
174 static void flk_wakeup(lock_descriptor_t *, int);
175 static void flk_free_edge(edge_t *);
176 static void flk_recompute_dependencies(lock_descriptor_t *,
177 			lock_descriptor_t **,  int, int);
178 static int flk_find_barriers(lock_descriptor_t *);
179 static void flk_update_barriers(lock_descriptor_t *);
180 static int flk_color_reachables(lock_descriptor_t *);
181 static int flk_canceled(lock_descriptor_t *);
182 static void flk_delete_locks_by_sysid(lock_descriptor_t *);
183 static void report_blocker(lock_descriptor_t *, lock_descriptor_t *);
184 static void wait_for_lock(lock_descriptor_t *);
185 static void unlock_lockmgr_granted(struct flock_globals *);
186 static void wakeup_sleeping_lockmgr_locks(struct flock_globals *);
187 
188 /* Clustering hooks */
189 static void cl_flk_change_nlm_state_all_locks(int, flk_nlm_status_t);
190 static void cl_flk_wakeup_sleeping_nlm_locks(int);
191 static void cl_flk_unlock_nlm_granted(int);
192 
193 #ifdef DEBUG
194 static int check_lock_transition(int, int);
195 static void check_sleeping_locks(graph_t *);
196 static void check_active_locks(graph_t *);
197 static int no_path(lock_descriptor_t *, lock_descriptor_t *);
198 static void path(lock_descriptor_t *, lock_descriptor_t *);
199 static void check_owner_locks(graph_t *, pid_t, int, vnode_t *);
200 static int level_one_path(lock_descriptor_t *, lock_descriptor_t *);
201 static int level_two_path(lock_descriptor_t *, lock_descriptor_t *, int);
202 #endif
203 
204 /*	proc_graph function definitions */
205 static int flk_check_deadlock(lock_descriptor_t *);
206 static void flk_proc_graph_uncolor(void);
207 static proc_vertex_t *flk_get_proc_vertex(lock_descriptor_t *);
208 static proc_edge_t *flk_get_proc_edge(void);
209 static void flk_proc_release(proc_vertex_t *);
210 static void flk_free_proc_edge(proc_edge_t *);
211 static void flk_update_proc_graph(edge_t *, int);
212 
213 /* Non-blocking mandatory locking */
214 static int lock_blocks_io(nbl_op_t, u_offset_t, ssize_t, int, u_offset_t,
215 			u_offset_t);
216 
217 static struct flock_globals *
flk_get_globals(void)218 flk_get_globals(void)
219 {
220 	/*
221 	 * The KLM module had better be loaded if we're attempting to handle
222 	 * lockmgr requests.
223 	 */
224 	ASSERT(flock_zone_key != ZONE_KEY_UNINITIALIZED);
225 	return (zone_getspecific(flock_zone_key, curproc->p_zone));
226 }
227 
228 static flk_lockmgr_status_t
flk_get_lockmgr_status(void)229 flk_get_lockmgr_status(void)
230 {
231 	struct flock_globals *fg;
232 
233 	ASSERT(MUTEX_HELD(&flock_lock));
234 
235 	if (flock_zone_key == ZONE_KEY_UNINITIALIZED) {
236 		/*
237 		 * KLM module not loaded; lock manager definitely not running.
238 		 */
239 		return (FLK_LOCKMGR_DOWN);
240 	}
241 	fg = flk_get_globals();
242 	return (fg->flk_lockmgr_status);
243 }
244 
/*
 * Routine called from fs_frlock in fs/fs_subr.c
 *
 * Top-level entry point for record-lock operations on vnode 'vp':
 *  - verifies that a set request matches the open mode (EBADF otherwise),
 *  - builds a lock_descriptor for the request (on the stack for query and
 *    unlock requests, which never outlive this call; heap-allocated via
 *    flk_get_lock() otherwise),
 *  - canonicalizes the requested range (remote/cluster requests skip the
 *    local range validation),
 *  - refuses lock manager requests while the lock manager is not up,
 *  - dispatches on l_type under the per-graph mutex.
 *
 * 'cmd' carries flag bits (SETFLCK, INOFLCK, SLPFLCK, RCMDLCK, PCMDLCK,
 * NBMLCK), not an fcntl command.  Returns 0 on success or an errno value
 * (EBADF, EINVAL, EPERM, ENOLCK, EAGAIN, EDEADLK, EINTR, ...).
 */

int
reclock(vnode_t *vp, flock64_t *lckdat, int cmd, int flag, u_offset_t offset,
    flk_callback_t *flk_cbp)
{
	lock_descriptor_t	stack_lock_request;
	lock_descriptor_t	*lock_request;
	int error = 0;
	graph_t	*gp;
	int			nlmid;

	/*
	 * Check access permissions: a set request for a read (write) lock
	 * requires the file to be open for reading (writing).
	 */
	if ((cmd & SETFLCK) &&
	    ((lckdat->l_type == F_RDLCK && (flag & FREAD) == 0) ||
	    (lckdat->l_type == F_WRLCK && (flag & FWRITE) == 0)))
			return (EBADF);

	/*
	 * for query and unlock we use the stack_lock_request
	 */

	if ((lckdat->l_type == F_UNLCK) ||
	    !((cmd & INOFLCK) || (cmd & SETFLCK))) {
		lock_request = &stack_lock_request;
		(void) bzero((caddr_t)lock_request,
		    sizeof (lock_descriptor_t));

		/*
		 * following is added to make the assertions in
		 * flk_execute_request() to pass through
		 * (empty edge lists are self-referential sentinels)
		 */

		lock_request->l_edge.edge_in_next = &lock_request->l_edge;
		lock_request->l_edge.edge_in_prev = &lock_request->l_edge;
		lock_request->l_edge.edge_adj_next = &lock_request->l_edge;
		lock_request->l_edge.edge_adj_prev = &lock_request->l_edge;
		lock_request->l_status = FLK_INITIAL_STATE;
	} else {
		lock_request = flk_get_lock();
	}
	lock_request->l_state = 0;
	lock_request->l_vnode = vp;
	lock_request->l_zoneid = getzoneid();

	/*
	 * Convert the request range into the canonical start and end
	 * values.  The NLM protocol supports locking over the entire
	 * 32-bit range, so there's no range checking for remote requests,
	 * but we still need to verify that local requests obey the rules.
	 */
	/* Clustering */
	if ((cmd & (RCMDLCK | PCMDLCK)) != 0) {
		ASSERT(lckdat->l_whence == 0);
		lock_request->l_start = lckdat->l_start;
		/* l_len == 0 means "to end of file", i.e. maximum offset */
		lock_request->l_end = (lckdat->l_len == 0) ? MAX_U_OFFSET_T :
		    lckdat->l_start + (lckdat->l_len - 1);
	} else {
		/* check the validity of the lock range */
		error = flk_convert_lock_data(vp, lckdat,
		    &lock_request->l_start, &lock_request->l_end,
		    offset);
		if (error) {
			goto done;
		}
		error = flk_check_lock_data(lock_request->l_start,
		    lock_request->l_end, MAXEND);
		if (error) {
			goto done;
		}
	}

	ASSERT(lock_request->l_end >= lock_request->l_start);

	/* Translate the command flag bits into l_state flags. */
	lock_request->l_type = lckdat->l_type;
	if (cmd & INOFLCK)
		lock_request->l_state |= IO_LOCK;
	if (cmd & SLPFLCK)
		lock_request->l_state |= WILLING_TO_SLEEP_LOCK;
	if (cmd & RCMDLCK)
		lock_request->l_state |= LOCKMGR_LOCK;
	if (cmd & NBMLCK)
		lock_request->l_state |= NBMAND_LOCK;
	/*
	 * Clustering: set flag for PXFS locks
	 * We do not _only_ check for the PCMDLCK flag because PXFS locks could
	 * also be of type 'RCMDLCK'.
	 * We do not _only_ check the GETPXFSID() macro because local PXFS
	 * clients use a pxfsid of zero to permit deadlock detection in the LLM.
	 */

	if ((cmd & PCMDLCK) || (GETPXFSID(lckdat->l_sysid) != 0)) {
		lock_request->l_state |= PXFS_LOCK;
	}
	/* Neither set nor I/O check: this is a pure query (F_GETLK-style). */
	if (!((cmd & SETFLCK) || (cmd & INOFLCK))) {
		if (lock_request->l_type == F_RDLCK ||
		    lock_request->l_type == F_WRLCK)
			lock_request->l_state |= QUERY_LOCK;
	}
	lock_request->l_flock = (*lckdat);
	lock_request->l_callbacks = flk_cbp;

	/*
	 * We are ready for processing the request
	 */
	if (IS_LOCKMGR(lock_request)) {
		/*
		 * If the lock request is an NLM server request ....
		 */
		if (nlm_status_size == 0) { /* not booted as cluster */
			mutex_enter(&flock_lock);
			/*
			 * Bail out if this is a lock manager request and the
			 * lock manager is not supposed to be running.
			 */
			if (flk_get_lockmgr_status() != FLK_LOCKMGR_UP) {
				mutex_exit(&flock_lock);
				error = ENOLCK;
				goto done;
			}
			mutex_exit(&flock_lock);
		} else {			/* booted as a cluster */
			nlmid = GETNLMID(lock_request->l_flock.l_sysid);
			/*
			 * NOTE(review): valid registry indices are
			 * 0..nlm_status_size-1, so '<=' here looks one too
			 * permissive; should probably be '<' -- confirm.
			 */
			ASSERT(nlmid <= nlm_status_size && nlmid >= 0);

			mutex_enter(&nlm_reg_lock);
			/*
			 * If the NLM registry does not know about this
			 * NLM server making the request, add its nlmid
			 * to the registry.
			 */
			if (FLK_REGISTRY_IS_NLM_UNKNOWN(nlm_reg_status,
			    nlmid)) {
				FLK_REGISTRY_ADD_NLMID(nlm_reg_status, nlmid);
			} else if (!FLK_REGISTRY_IS_NLM_UP(nlm_reg_status,
			    nlmid)) {
				/*
				 * If the NLM server is already known (has made
				 * previous lock requests) and its state is
				 * not NLM_UP (means that NLM server is
				 * shutting down), then bail out with an
				 * error to deny the lock request.
				 */
				mutex_exit(&nlm_reg_lock);
				error = ENOLCK;
				goto done;
			}
			mutex_exit(&nlm_reg_lock);
		}
	}

	/* Now get the lock graph for a particular vnode */
	gp = flk_get_lock_graph(vp, FLK_INIT_GRAPH);

	/*
	 * We drop rwlock here otherwise this might end up causing a
	 * deadlock if this IOLOCK sleeps. (bugid # 1183392).
	 */

	if (IS_IO_LOCK(lock_request)) {
		VOP_RWUNLOCK(vp,
		    (lock_request->l_type == F_RDLCK) ?
		    V_WRITELOCK_FALSE : V_WRITELOCK_TRUE, NULL);
	}
	mutex_enter(&gp->gp_mutex);

	lock_request->l_state |= REFERENCED_LOCK;
	lock_request->l_graph = gp;

	switch (lock_request->l_type) {
	case F_RDLCK:
	case F_WRLCK:
		if (IS_QUERY_LOCK(lock_request)) {
			/* Query: report first blocker back via *lckdat. */
			flk_get_first_blocking_lock(lock_request);
			(*lckdat) = lock_request->l_flock;
			break;
		}

		/* process the request now */

		error = flk_process_request(lock_request);
		break;

	case F_UNLCK:
		/* unlock request will not block so execute it immediately */

		if (IS_LOCKMGR(lock_request) &&
		    flk_canceled(lock_request)) {
			/* Retransmitted NLM cancel: already handled. */
			error = 0;
		} else {
			error = flk_execute_request(lock_request);
		}
		break;

	case F_UNLKSYS:
		/*
		 * Recovery mechanism to release lock manager locks when
		 * NFS client crashes and restart. NFS server will clear
		 * old locks and grant new locks.
		 */

		if (lock_request->l_flock.l_sysid == 0) {
			mutex_exit(&gp->gp_mutex);
			return (EINVAL);
		}
		if (secpolicy_nfs(CRED()) != 0) {
			mutex_exit(&gp->gp_mutex);
			return (EPERM);
		}
		flk_delete_locks_by_sysid(lock_request);
		lock_request->l_state &= ~REFERENCED_LOCK;
		flk_set_state(lock_request, FLK_DEAD_STATE);
		flk_free_lock(lock_request);
		mutex_exit(&gp->gp_mutex);
		return (0);

	default:
		error = EINVAL;
		break;
	}

	/* Clustering: For blocked PXFS locks, return */
	if (error == PXFS_LOCK_BLOCKED) {
		lock_request->l_state &= ~REFERENCED_LOCK;
		mutex_exit(&gp->gp_mutex);
		return (error);
	}

	/*
	 * Now that we have seen the status of locks in the system for
	 * this vnode we acquire the rwlock if it is an IO_LOCK.
	 */

	if (IS_IO_LOCK(lock_request)) {
		(void) VOP_RWLOCK(vp,
		    (lock_request->l_type == F_RDLCK) ?
		    V_WRITELOCK_FALSE : V_WRITELOCK_TRUE, NULL);
		if (!error) {
			/* IO_LOCK was only a status check; report "unlocked" */
			lckdat->l_type = F_UNLCK;

			/*
			 * This wake up is needed otherwise
			 * if IO_LOCK has slept the dependents on this
			 * will not be woken up at all. (bugid # 1185482).
			 */

			flk_wakeup(lock_request, 1);
			flk_set_state(lock_request, FLK_DEAD_STATE);
			flk_free_lock(lock_request);
		}
		/*
		 * else if error had occurred either flk_process_request()
		 * has returned EDEADLK in which case there will be no
		 * dependents for this lock or EINTR from flk_wait_execute_
		 * request() in which case flk_cancel_sleeping_lock()
		 * would have been done. same is true with EBADF.
		 */
	}

	/* Common exit: retire the request descriptor. */
	if (lock_request == &stack_lock_request) {
		flk_set_state(lock_request, FLK_DEAD_STATE);
	} else {
		lock_request->l_state &= ~REFERENCED_LOCK;
		if ((error != 0) || IS_DELETED(lock_request)) {
			flk_set_state(lock_request, FLK_DEAD_STATE);
			flk_free_lock(lock_request);
		}
	}

	mutex_exit(&gp->gp_mutex);
	return (error);

done:
	/* Error exit taken before the request entered any lock graph. */
	flk_set_state(lock_request, FLK_DEAD_STATE);
	if (lock_request != &stack_lock_request)
		flk_free_lock(lock_request);
	return (error);
}
527 
528 /*
529  * Invoke the callbacks in the given list.  If before sleeping, invoke in
530  * list order.  If after sleeping, invoke in reverse order.
531  *
532  * CPR (suspend/resume) support: if one of the callbacks returns a
533  * callb_cpr_t, return it.   This will be used to make the thread CPR-safe
534  * while it is sleeping.  There should be at most one callb_cpr_t for the
535  * thread.
536  * XXX This is unnecessarily complicated.  The CPR information should just
537  * get passed in directly through VOP_FRLOCK and reclock, rather than
538  * sneaking it in via a callback.
539  */
540 
541 callb_cpr_t *
flk_invoke_callbacks(flk_callback_t * cblist,flk_cb_when_t when)542 flk_invoke_callbacks(flk_callback_t *cblist, flk_cb_when_t when)
543 {
544 	callb_cpr_t *cpr_callbackp = NULL;
545 	callb_cpr_t *one_result;
546 	flk_callback_t *cb;
547 
548 	if (cblist == NULL)
549 		return (NULL);
550 
551 	if (when == FLK_BEFORE_SLEEP) {
552 		cb = cblist;
553 		do {
554 			one_result = (*cb->cb_callback)(when, cb->cb_data);
555 			if (one_result != NULL) {
556 				ASSERT(cpr_callbackp == NULL);
557 				cpr_callbackp = one_result;
558 			}
559 			cb = cb->cb_next;
560 		} while (cb != cblist);
561 	} else {
562 		cb = cblist->cb_prev;
563 		do {
564 			one_result = (*cb->cb_callback)(when, cb->cb_data);
565 			if (one_result != NULL) {
566 				cpr_callbackp = one_result;
567 			}
568 			cb = cb->cb_prev;
569 		} while (cb != cblist->cb_prev);
570 	}
571 
572 	return (cpr_callbackp);
573 }
574 
575 /*
576  * Initialize a flk_callback_t to hold the given callback.
577  */
578 
579 void
flk_init_callback(flk_callback_t * flk_cb,callb_cpr_t * (* cb_fcn)(flk_cb_when_t,void *),void * cbdata)580 flk_init_callback(flk_callback_t *flk_cb,
581     callb_cpr_t *(*cb_fcn)(flk_cb_when_t, void *), void *cbdata)
582 {
583 	flk_cb->cb_next = flk_cb;
584 	flk_cb->cb_prev = flk_cb;
585 	flk_cb->cb_callback = cb_fcn;
586 	flk_cb->cb_data = cbdata;
587 }
588 
589 /*
590  * Initialize an flk_callback_t and then link it into the head of an
591  * existing list (which may be NULL).
592  */
593 
594 void
flk_add_callback(flk_callback_t * newcb,callb_cpr_t * (* cb_fcn)(flk_cb_when_t,void *),void * cbdata,flk_callback_t * cblist)595 flk_add_callback(flk_callback_t *newcb,
596     callb_cpr_t *(*cb_fcn)(flk_cb_when_t, void *),
597     void *cbdata, flk_callback_t *cblist)
598 {
599 	flk_init_callback(newcb, cb_fcn, cbdata);
600 
601 	if (cblist == NULL)
602 		return;
603 
604 	newcb->cb_prev = cblist->cb_prev;
605 	newcb->cb_next = cblist;
606 	cblist->cb_prev->cb_next = newcb;
607 	cblist->cb_prev = newcb;
608 }
609 
610 /*
611  * Remove the callback from a list.
612  */
613 
614 void
flk_del_callback(flk_callback_t * flk_cb)615 flk_del_callback(flk_callback_t *flk_cb)
616 {
617 	flk_cb->cb_next->cb_prev = flk_cb->cb_prev;
618 	flk_cb->cb_prev->cb_next = flk_cb->cb_next;
619 
620 	flk_cb->cb_prev = flk_cb;
621 	flk_cb->cb_next = flk_cb;
622 }
623 
624 /*
625  * Initialize the flk_edge_cache data structure and create the
626  * nlm_reg_status array.
627  */
628 
629 void
flk_init(void)630 flk_init(void)
631 {
632 	uint_t	i;
633 
634 	flk_edge_cache = kmem_cache_create("flk_edges",
635 	    sizeof (struct edge), 0, NULL, NULL, NULL, NULL, NULL, 0);
636 	if (flk_edge_cache == NULL) {
637 		cmn_err(CE_PANIC, "Couldn't create flk_edge_cache\n");
638 	}
639 	/*
640 	 * Create the NLM registry object.
641 	 */
642 
643 	if (cluster_bootflags & CLUSTER_BOOTED) {
644 		/*
645 		 * This routine tells you the maximum node id that will be used
646 		 * in the cluster.  This number will be the size of the nlm
647 		 * registry status array.  We add 1 because we will be using
648 		 * all entries indexed from 0 to maxnodeid; e.g., from 0
649 		 * to 64, for a total of 65 entries.
650 		 */
651 		nlm_status_size = clconf_maximum_nodeid() + 1;
652 	} else {
653 		nlm_status_size = 0;
654 	}
655 
656 	if (nlm_status_size != 0) {	/* booted as a cluster */
657 		nlm_reg_status = (flk_nlm_status_t *)
658 		    kmem_alloc(sizeof (flk_nlm_status_t) * nlm_status_size,
659 		    KM_SLEEP);
660 
661 		/* initialize all NLM states in array to NLM_UNKNOWN */
662 		for (i = 0; i < nlm_status_size; i++) {
663 			nlm_reg_status[i] = FLK_NLM_UNKNOWN;
664 		}
665 	}
666 }
667 
668 /*
669  * Zone constructor/destructor callbacks to be executed when a zone is
670  * created/destroyed.
671  */
672 /* ARGSUSED */
673 void *
flk_zone_init(zoneid_t zoneid)674 flk_zone_init(zoneid_t zoneid)
675 {
676 	struct flock_globals *fg;
677 	uint_t i;
678 
679 	fg = kmem_alloc(sizeof (*fg), KM_SLEEP);
680 	fg->flk_lockmgr_status = FLK_LOCKMGR_UP;
681 	for (i = 0; i < HASH_SIZE; i++)
682 		fg->lockmgr_status[i] = FLK_LOCKMGR_UP;
683 	return (fg);
684 }
685 
686 /* ARGSUSED */
687 void
flk_zone_fini(zoneid_t zoneid,void * data)688 flk_zone_fini(zoneid_t zoneid, void *data)
689 {
690 	struct flock_globals *fg = data;
691 
692 	kmem_free(fg, sizeof (*fg));
693 }
694 
695 /*
696  * Get a lock_descriptor structure with initialization of edge lists.
697  */
698 
699 static lock_descriptor_t *
flk_get_lock(void)700 flk_get_lock(void)
701 {
702 	lock_descriptor_t	*l;
703 
704 	l = kmem_zalloc(sizeof (lock_descriptor_t), KM_SLEEP);
705 
706 	cv_init(&l->l_cv, NULL, CV_DRIVER, NULL);
707 	l->l_edge.edge_in_next = &l->l_edge;
708 	l->l_edge.edge_in_prev = &l->l_edge;
709 	l->l_edge.edge_adj_next = &l->l_edge;
710 	l->l_edge.edge_adj_prev = &l->l_edge;
711 	l->pvertex = -1;
712 	l->l_status = FLK_INITIAL_STATE;
713 	flk_lock_allocs++;
714 	return (l);
715 }
716 
717 /*
718  * Free a lock_descriptor structure. Just sets the DELETED_LOCK flag
719  * when some thread has a reference to it as in reclock().
720  */
721 
722 void
flk_free_lock(lock_descriptor_t * lock)723 flk_free_lock(lock_descriptor_t	*lock)
724 {
725 	ASSERT(IS_DEAD(lock));
726 	if (IS_REFERENCED(lock)) {
727 		lock->l_state |= DELETED_LOCK;
728 		return;
729 	}
730 	flk_lock_frees++;
731 	kmem_free((void *)lock, sizeof (lock_descriptor_t));
732 }
733 
734 void
flk_set_state(lock_descriptor_t * lock,int new_state)735 flk_set_state(lock_descriptor_t *lock, int new_state)
736 {
737 	/*
738 	 * Locks in the sleeping list may be woken up in a number of ways,
739 	 * and more than once.  If a sleeping lock is signaled awake more
740 	 * than once, then it may or may not change state depending on its
741 	 * current state.
742 	 * Also note that NLM locks that are sleeping could be moved to an
743 	 * interrupted state more than once if the unlock request is
744 	 * retransmitted by the NLM client - the second time around, this is
745 	 * just a nop.
746 	 * The ordering of being signaled awake is:
747 	 * INTERRUPTED_STATE > CANCELLED_STATE > GRANTED_STATE.
748 	 * The checks below implement this ordering.
749 	 */
750 	if (IS_INTERRUPTED(lock)) {
751 		if ((new_state == FLK_CANCELLED_STATE) ||
752 		    (new_state == FLK_GRANTED_STATE) ||
753 		    (new_state == FLK_INTERRUPTED_STATE)) {
754 			return;
755 		}
756 	}
757 	if (IS_CANCELLED(lock)) {
758 		if ((new_state == FLK_GRANTED_STATE) ||
759 		    (new_state == FLK_CANCELLED_STATE)) {
760 			return;
761 		}
762 	}
763 	CHECK_LOCK_TRANSITION(lock->l_status, new_state);
764 	if (IS_PXFS(lock)) {
765 		cl_flk_state_transition_notify(lock, lock->l_status, new_state);
766 	}
767 	lock->l_status = new_state;
768 }
769 
/*
 * Routine that checks whether there are any blocking locks in the system.
 *
 * The policy followed is if a write lock is sleeping we don't allow read
 * locks before this write lock even though there may not be any active
 * locks corresponding to the read locks' region.
 *
 * flk_add_edge() function adds an edge between l1 and l2 iff there
 * is no path between l1 and l2. This is done to have a "minimum
 * storage representation" of the dependency graph.
 *
 * Another property of the graph is since only the new request throws
 * edges to the existing locks in the graph, the graph is always topologically
 * ordered.
 *
 * Returns 0 if the request was executed, EAGAIN if it would block and the
 * caller is unwilling to sleep, EDEADLK on detected deadlock, or the
 * result of flk_wait_execute_request() if the request must sleep.
 * Caller must hold gp->gp_mutex.
 */

static int
flk_process_request(lock_descriptor_t *request)
{
	graph_t	*gp = request->l_graph;
	lock_descriptor_t *lock;
	int request_blocked_by_active = 0;
	int request_blocked_by_granted = 0;
	int request_blocked_by_sleeping = 0;
	vnode_t	*vp = request->l_vnode;
	int	error = 0;
	int request_will_wait = 0;
	int found_covering_lock = 0;
	lock_descriptor_t *covered_by = NULL;

	ASSERT(MUTEX_HELD(&gp->gp_mutex));
	request_will_wait = IS_WILLING_TO_SLEEP(request);

	/*
	 * check active locks
	 */

	SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);


	if (lock) {
		do {
			if (BLOCKS(lock, request)) {
				if (!request_will_wait)
					return (EAGAIN);
				request_blocked_by_active = 1;
				break;
			}
			/*
			 * Grant lock if it is for the same owner holding active
			 * lock that covers the request.
			 */

			if (SAME_OWNER(lock, request) &&
			    COVERS(lock, request) &&
			    (request->l_type == F_RDLCK))
				return (flk_execute_request(request));
			lock = lock->l_next;
		} while (lock->l_vnode == vp);
	}

	if (!request_blocked_by_active) {
			lock_descriptor_t *lk[1];
			lock_descriptor_t *first_glock = NULL;
		/*
		 * Shall we grant this?! NO!!
		 * What about those locks that were just granted and still
		 * in sleep queue. Those threads are woken up and so locks
		 * are almost active.
		 */
		SET_LOCK_TO_FIRST_SLEEP_VP(gp, lock, vp);
		if (lock) {
			do {
				if (BLOCKS(lock, request)) {
					if (IS_GRANTED(lock)) {
						request_blocked_by_granted = 1;
					} else {
						request_blocked_by_sleeping = 1;
					}
				}

				lock = lock->l_next;
			} while ((lock->l_vnode == vp));
			/*
			 * Remember the last sleeping lock for this vnode;
			 * the granted-lock recomputation below walks
			 * backwards from here.
			 */
			first_glock = lock->l_prev;
			ASSERT(first_glock->l_vnode == vp);
		}

		if (request_blocked_by_granted)
			goto block;

		if (!request_blocked_by_sleeping) {
			/*
			 * If the request isn't going to be blocked by a
			 * sleeping request, we know that it isn't going to
			 * be blocked; we can just execute the request --
			 * without performing costly deadlock detection.
			 */
			ASSERT(!request_blocked_by_active);
			return (flk_execute_request(request));
		} else if (request->l_type == F_RDLCK) {
			/*
			 * If we have a sleeping writer in the requested
			 * lock's range, block.
			 */
			goto block;
		}

		/*
		 * Write request blocked only by sleeping (non-granted)
		 * locks: temporarily treat the request as an active lock
		 * (RECOMPUTE_LOCK) and recompute the dependencies of the
		 * active and granted locks against it to see whether
		 * executing it now would create a cycle.
		 */
		lk[0] = request;
		request->l_state |= RECOMPUTE_LOCK;
		SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
		if (lock) {
			do {
				flk_recompute_dependencies(lock, lk, 1, 0);
				lock = lock->l_next;
			} while (lock->l_vnode == vp);
		}
		lock = first_glock;
		if (lock) {
			do {
				if (IS_GRANTED(lock)) {
				flk_recompute_dependencies(lock, lk, 1, 0);
				}
				lock = lock->l_prev;
			} while ((lock->l_vnode == vp));
		}
		request->l_state &= ~RECOMPUTE_LOCK;
		if (!NO_DEPENDENTS(request) && flk_check_deadlock(request))
			return (EDEADLK);
		return (flk_execute_request(request));
	}

block:
	/* The request must sleep: build its edges into the graph. */
	if (request_will_wait)
		flk_graph_uncolor(gp);

	/* check sleeping locks */

	SET_LOCK_TO_FIRST_SLEEP_VP(gp, lock, vp);

	/*
	 * If we find a sleeping write lock that is a superset of the
	 * region wanted by request we can be assured that by adding an
	 * edge to this write lock we have paths to all locks in the
	 * graph that blocks the request except in one case and that is why
	 * another check for SAME_OWNER in the loop below. The exception
	 * case is when this process that owns the sleeping write lock 'l1'
	 * has other locks l2, l3, l4 that are in the system and arrived
	 * before l1. l1 does not have path to these locks as they are from
	 * same process. We break when we find a second covering sleeping
	 * lock l5 owned by a process different from that owning l1, because
	 * there cannot be any of l2, l3, l4, etc., arrived before l5, and if
	 * it has l1 would have produced a deadlock already.
	 */

	if (lock) {
		do {
			if (BLOCKS(lock, request)) {
				if (!request_will_wait)
					return (EAGAIN);
				if (COVERS(lock, request) &&
				    lock->l_type == F_WRLCK) {
					if (found_covering_lock &&
					    !SAME_OWNER(lock, covered_by)) {
						found_covering_lock++;
						break;
					}
					found_covering_lock = 1;
					covered_by = lock;
				}
				if (found_covering_lock &&
				    !SAME_OWNER(lock, covered_by)) {
					lock = lock->l_next;
					continue;
				}
				if ((error = flk_add_edge(request, lock,
				    !found_covering_lock, 0)))
					return (error);
			}
			lock = lock->l_next;
		} while (lock->l_vnode == vp);
	}

/*
 * found_covering_lock == 2 iff at this point 'request' has paths
 * to all locks that blocks 'request'. found_covering_lock == 1 iff at this
 * point 'request' has paths to all locks that blocks 'request' whose owners
 * are not same as the one that covers 'request' (covered_by above) and
 * we can have locks whose owner is same as covered_by in the active list.
 */

	if (request_blocked_by_active && found_covering_lock != 2) {
		SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
		ASSERT(lock != NULL);
		do {
			if (BLOCKS(lock, request)) {
				if (found_covering_lock &&
				    !SAME_OWNER(lock, covered_by)) {
					lock = lock->l_next;
					continue;
				}
				if ((error = flk_add_edge(request, lock,
				    CHECK_CYCLE, 0)))
					return (error);
			}
			lock = lock->l_next;
		} while (lock->l_vnode == vp);
	}

	if (NOT_BLOCKED(request)) {
		/*
		 * request not dependent on any other locks
		 * so execute this request
		 */
		return (flk_execute_request(request));
	} else {
		/*
		 * check for deadlock
		 */
		if (flk_check_deadlock(request))
			return (EDEADLK);
		/*
		 * this thread has to sleep
		 */
		return (flk_wait_execute_request(request));
	}
}
996 
997 /*
998  * The actual execution of the request in the simple case is only to
999  * insert the 'request' in the list of active locks if it is not an
1000  * UNLOCK.
1001  * We have to consider the existing active locks' relation to
1002  * this 'request' if they are owned by same process. flk_relation() does
1003  * this job and sees to that the dependency graph information is maintained
1004  * properly.
1005  */
1006 
1007 int
flk_execute_request(lock_descriptor_t * request)1008 flk_execute_request(lock_descriptor_t *request)
1009 {
1010 	graph_t	*gp = request->l_graph;
1011 	vnode_t	*vp = request->l_vnode;
1012 	lock_descriptor_t	*lock, *lock1;
1013 	int done_searching = 0;
1014 
1015 	CHECK_SLEEPING_LOCKS(gp);
1016 	CHECK_ACTIVE_LOCKS(gp);
1017 
1018 	ASSERT(MUTEX_HELD(&gp->gp_mutex));
1019 
1020 	flk_set_state(request, FLK_START_STATE);
1021 
1022 	ASSERT(NOT_BLOCKED(request));
1023 
1024 	/* IO_LOCK requests are only to check status */
1025 
1026 	if (IS_IO_LOCK(request))
1027 		return (0);
1028 
1029 	SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
1030 
1031 	if (lock == NULL && request->l_type == F_UNLCK)
1032 		return (0);
1033 	if (lock == NULL) {
1034 		flk_insert_active_lock(request);
1035 		return (0);
1036 	}
1037 
1038 	do {
1039 		lock1 = lock->l_next;
1040 		if (SAME_OWNER(request, lock)) {
1041 			done_searching = flk_relation(lock, request);
1042 		}
1043 		lock = lock1;
1044 	} while (lock->l_vnode == vp && !done_searching);
1045 
1046 	/*
1047 	 * insert in active queue
1048 	 */
1049 
1050 	if (request->l_type != F_UNLCK)
1051 		flk_insert_active_lock(request);
1052 
1053 	return (0);
1054 }
1055 
1056 /*
1057  * 'request' is blocked by some one therefore we put it into sleep queue.
1058  */
1059 static int
flk_wait_execute_request(lock_descriptor_t * request)1060 flk_wait_execute_request(lock_descriptor_t *request)
1061 {
1062 	graph_t	*gp = request->l_graph;
1063 	callb_cpr_t 	*cprp;		/* CPR info from callback */
1064 	struct flock_globals *fg;
1065 	int index;
1066 
1067 	ASSERT(MUTEX_HELD(&gp->gp_mutex));
1068 	ASSERT(IS_WILLING_TO_SLEEP(request));
1069 
1070 	flk_insert_sleeping_lock(request);
1071 
1072 	if (IS_LOCKMGR(request)) {
1073 		index = HASH_INDEX(request->l_vnode);
1074 		fg = flk_get_globals();
1075 
1076 		if (nlm_status_size == 0) {	/* not booted as a cluster */
1077 			if (fg->lockmgr_status[index] != FLK_LOCKMGR_UP) {
1078 				flk_cancel_sleeping_lock(request, 1);
1079 				return (ENOLCK);
1080 			}
1081 		} else {			/* booted as a cluster */
1082 			/*
1083 			 * If the request is an NLM server lock request,
1084 			 * and the NLM state of the lock request is not
1085 			 * NLM_UP (because the NLM server is shutting
1086 			 * down), then cancel the sleeping lock and
1087 			 * return error ENOLCK that will encourage the
1088 			 * client to retransmit.
1089 			 */
1090 			if (!IS_NLM_UP(request)) {
1091 				flk_cancel_sleeping_lock(request, 1);
1092 				return (ENOLCK);
1093 			}
1094 		}
1095 	}
1096 
1097 	/* Clustering: For blocking PXFS locks, return */
1098 	if (IS_PXFS(request)) {
1099 		/*
1100 		 * PXFS locks sleep on the client side.
1101 		 * The callback argument is used to wake up the sleeper
1102 		 * when the lock is granted.
1103 		 * We return -1 (rather than an errno value) to indicate
1104 		 * the client side should sleep
1105 		 */
1106 		return (PXFS_LOCK_BLOCKED);
1107 	}
1108 
1109 	if (request->l_callbacks != NULL) {
1110 		/*
1111 		 * To make sure the shutdown code works correctly, either
1112 		 * the callback must happen after putting the lock on the
1113 		 * sleep list, or we must check the shutdown status after
1114 		 * returning from the callback (and before sleeping).  At
1115 		 * least for now, we'll use the first option.  If a
1116 		 * shutdown or signal or whatever happened while the graph
1117 		 * mutex was dropped, that will be detected by
1118 		 * wait_for_lock().
1119 		 */
1120 		mutex_exit(&gp->gp_mutex);
1121 
1122 		cprp = flk_invoke_callbacks(request->l_callbacks,
1123 		    FLK_BEFORE_SLEEP);
1124 
1125 		mutex_enter(&gp->gp_mutex);
1126 
1127 		if (cprp == NULL) {
1128 			wait_for_lock(request);
1129 		} else {
1130 			mutex_enter(cprp->cc_lockp);
1131 			CALLB_CPR_SAFE_BEGIN(cprp);
1132 			mutex_exit(cprp->cc_lockp);
1133 			wait_for_lock(request);
1134 			mutex_enter(cprp->cc_lockp);
1135 			CALLB_CPR_SAFE_END(cprp, cprp->cc_lockp);
1136 			mutex_exit(cprp->cc_lockp);
1137 		}
1138 
1139 		mutex_exit(&gp->gp_mutex);
1140 		(void) flk_invoke_callbacks(request->l_callbacks,
1141 		    FLK_AFTER_SLEEP);
1142 		mutex_enter(&gp->gp_mutex);
1143 	} else {
1144 		wait_for_lock(request);
1145 	}
1146 
1147 	if (IS_LOCKMGR(request)) {
1148 		/*
1149 		 * If the lock manager is shutting down, return an
1150 		 * error that will encourage the client to retransmit.
1151 		 */
1152 		if (fg->lockmgr_status[index] != FLK_LOCKMGR_UP &&
1153 		    !IS_GRANTED(request)) {
1154 			flk_cancel_sleeping_lock(request, 1);
1155 			return (ENOLCK);
1156 		}
1157 	}
1158 
1159 	if (IS_INTERRUPTED(request)) {
1160 		/* we got a signal, or act like we did */
1161 		flk_cancel_sleeping_lock(request, 1);
1162 		return (EINTR);
1163 	}
1164 
1165 	/* Cancelled if some other thread has closed the file */
1166 
1167 	if (IS_CANCELLED(request)) {
1168 		flk_cancel_sleeping_lock(request, 1);
1169 		return (EBADF);
1170 	}
1171 
1172 	request->l_state &= ~GRANTED_LOCK;
1173 	REMOVE_SLEEP_QUEUE(request);
1174 	return (flk_execute_request(request));
1175 }
1176 
1177 /*
1178  * This routine adds an edge between from and to because from depends
1179  * to. If asked to check for deadlock it checks whether there are any
1180  * reachable locks from "from_lock" that is owned by the same process
1181  * as "from_lock".
1182  * NOTE: It is the caller's responsibility to make sure that the color
1183  * of the graph is consistent between the calls to flk_add_edge as done
1184  * in flk_process_request. This routine does not color and check for
1185  * deadlock explicitly.
1186  */
1187 
1188 static int
flk_add_edge(lock_descriptor_t * from_lock,lock_descriptor_t * to_lock,int check_cycle,int update_graph)1189 flk_add_edge(lock_descriptor_t *from_lock, lock_descriptor_t *to_lock,
1190     int check_cycle, int update_graph)
1191 {
1192 	edge_t	*edge;
1193 	edge_t	*ep;
1194 	lock_descriptor_t	*vertex;
1195 	lock_descriptor_t *vertex_stack;
1196 
1197 	STACK_INIT(vertex_stack);
1198 
1199 	/*
1200 	 * if to vertex already has mark_color just return
1201 	 * don't add an edge as it is reachable from from vertex
1202 	 * before itself.
1203 	 */
1204 
1205 	if (COLORED(to_lock))
1206 		return (0);
1207 
1208 	edge = flk_get_edge();
1209 
1210 	/*
1211 	 * set the from and to vertex
1212 	 */
1213 
1214 	edge->from_vertex = from_lock;
1215 	edge->to_vertex = to_lock;
1216 
1217 	/*
1218 	 * put in adjacency list of from vertex
1219 	 */
1220 
1221 	from_lock->l_edge.edge_adj_next->edge_adj_prev = edge;
1222 	edge->edge_adj_next = from_lock->l_edge.edge_adj_next;
1223 	edge->edge_adj_prev = &from_lock->l_edge;
1224 	from_lock->l_edge.edge_adj_next = edge;
1225 
1226 	/*
1227 	 * put in in list of to vertex
1228 	 */
1229 
1230 	to_lock->l_edge.edge_in_next->edge_in_prev = edge;
1231 	edge->edge_in_next = to_lock->l_edge.edge_in_next;
1232 	to_lock->l_edge.edge_in_next = edge;
1233 	edge->edge_in_prev = &to_lock->l_edge;
1234 
1235 
1236 	if (update_graph) {
1237 		flk_update_proc_graph(edge, 0);
1238 		return (0);
1239 	}
1240 	if (!check_cycle) {
1241 		return (0);
1242 	}
1243 
1244 	STACK_PUSH(vertex_stack, from_lock, l_stack);
1245 
1246 	while ((vertex = STACK_TOP(vertex_stack)) != NULL) {
1247 
1248 		STACK_POP(vertex_stack, l_stack);
1249 
1250 		for (ep = FIRST_ADJ(vertex);
1251 		    ep != HEAD(vertex);
1252 		    ep = NEXT_ADJ(ep)) {
1253 			if (COLORED(ep->to_vertex))
1254 				continue;
1255 			COLOR(ep->to_vertex);
1256 			if (SAME_OWNER(ep->to_vertex, from_lock))
1257 				goto dead_lock;
1258 			STACK_PUSH(vertex_stack, ep->to_vertex, l_stack);
1259 		}
1260 	}
1261 	return (0);
1262 
1263 dead_lock:
1264 
1265 	/*
1266 	 * remove all edges
1267 	 */
1268 
1269 	ep = FIRST_ADJ(from_lock);
1270 
1271 	while (ep != HEAD(from_lock)) {
1272 		IN_LIST_REMOVE(ep);
1273 		from_lock->l_sedge = NEXT_ADJ(ep);
1274 		ADJ_LIST_REMOVE(ep);
1275 		flk_free_edge(ep);
1276 		ep = from_lock->l_sedge;
1277 	}
1278 	return (EDEADLK);
1279 }
1280 
1281 /*
1282  * Get an edge structure for representing the dependency between two locks.
1283  */
1284 
1285 static edge_t *
flk_get_edge()1286 flk_get_edge()
1287 {
1288 	edge_t	*ep;
1289 
1290 	ASSERT(flk_edge_cache != NULL);
1291 
1292 	ep = kmem_cache_alloc(flk_edge_cache, KM_SLEEP);
1293 	edge_allocs++;
1294 	return (ep);
1295 }
1296 
1297 /*
1298  * Free the edge structure.
1299  */
1300 
1301 static void
flk_free_edge(edge_t * ep)1302 flk_free_edge(edge_t *ep)
1303 {
1304 	edge_frees++;
1305 	kmem_cache_free(flk_edge_cache, (void *)ep);
1306 }
1307 
1308 /*
1309  * Check the relationship of request with lock and perform the
1310  * recomputation of dependencies, break lock if required, and return
1311  * 1 if request cannot have any more relationship with the next
1312  * active locks.
1313  * The 'lock' and 'request' are compared and in case of overlap we
1314  * delete the 'lock' and form new locks to represent the non-overlapped
1315  * portion of original 'lock'. This function has side effects such as
1316  * 'lock' will be freed, new locks will be added to the active list.
1317  */
1318 
1319 static int
flk_relation(lock_descriptor_t * lock,lock_descriptor_t * request)1320 flk_relation(lock_descriptor_t *lock, lock_descriptor_t *request)
1321 {
1322 	int lock_effect;
1323 	lock_descriptor_t *lock1, *lock2;
1324 	lock_descriptor_t *topology[3];
1325 	int nvertex = 0;
1326 	int i;
1327 	edge_t	*ep;
1328 	graph_t	*gp = (lock->l_graph);
1329 
1330 
1331 	CHECK_SLEEPING_LOCKS(gp);
1332 	CHECK_ACTIVE_LOCKS(gp);
1333 
1334 	ASSERT(MUTEX_HELD(&gp->gp_mutex));
1335 
1336 	topology[0] = topology[1] = topology[2] = NULL;
1337 
1338 	if (request->l_type == F_UNLCK)
1339 		lock_effect = FLK_UNLOCK;
1340 	else if (request->l_type == F_RDLCK &&
1341 	    lock->l_type == F_WRLCK)
1342 		lock_effect = FLK_DOWNGRADE;
1343 	else if (request->l_type == F_WRLCK &&
1344 	    lock->l_type == F_RDLCK)
1345 		lock_effect = FLK_UPGRADE;
1346 	else
1347 		lock_effect = FLK_STAY_SAME;
1348 
1349 	if (lock->l_end < request->l_start) {
1350 		if (lock->l_end == request->l_start - 1 &&
1351 		    lock_effect == FLK_STAY_SAME) {
1352 			topology[0] = request;
1353 			request->l_start = lock->l_start;
1354 			nvertex = 1;
1355 			goto recompute;
1356 		} else {
1357 			return (0);
1358 		}
1359 	}
1360 
1361 	if (lock->l_start > request->l_end) {
1362 		if (request->l_end == lock->l_start - 1 &&
1363 		    lock_effect == FLK_STAY_SAME) {
1364 			topology[0] = request;
1365 			request->l_end = lock->l_end;
1366 			nvertex = 1;
1367 			goto recompute;
1368 		} else {
1369 			return (1);
1370 		}
1371 	}
1372 
1373 	if (request->l_end < lock->l_end) {
1374 		if (request->l_start > lock->l_start) {
1375 			if (lock_effect == FLK_STAY_SAME) {
1376 				request->l_start = lock->l_start;
1377 				request->l_end = lock->l_end;
1378 				topology[0] = request;
1379 				nvertex = 1;
1380 			} else {
1381 				lock1 = flk_get_lock();
1382 				lock2 = flk_get_lock();
1383 				COPY(lock1, lock);
1384 				COPY(lock2, lock);
1385 				lock1->l_start = lock->l_start;
1386 				lock1->l_end = request->l_start - 1;
1387 				lock2->l_start = request->l_end + 1;
1388 				lock2->l_end = lock->l_end;
1389 				topology[0] = lock1;
1390 				topology[1] = lock2;
1391 				topology[2] = request;
1392 				nvertex = 3;
1393 			}
1394 		} else if (request->l_start < lock->l_start) {
1395 			if (lock_effect == FLK_STAY_SAME) {
1396 				request->l_end = lock->l_end;
1397 				topology[0] = request;
1398 				nvertex = 1;
1399 			} else {
1400 				lock1 = flk_get_lock();
1401 				COPY(lock1, lock);
1402 				lock1->l_start = request->l_end + 1;
1403 				topology[0] = lock1;
1404 				topology[1] = request;
1405 				nvertex = 2;
1406 			}
1407 		} else  {
1408 			if (lock_effect == FLK_STAY_SAME) {
1409 				request->l_start = lock->l_start;
1410 				request->l_end = lock->l_end;
1411 				topology[0] = request;
1412 				nvertex = 1;
1413 			} else {
1414 				lock1 = flk_get_lock();
1415 				COPY(lock1, lock);
1416 				lock1->l_start = request->l_end + 1;
1417 				topology[0] = lock1;
1418 				topology[1] = request;
1419 				nvertex = 2;
1420 			}
1421 		}
1422 	} else if (request->l_end > lock->l_end) {
1423 		if (request->l_start > lock->l_start)  {
1424 			if (lock_effect == FLK_STAY_SAME) {
1425 				request->l_start = lock->l_start;
1426 				topology[0] = request;
1427 				nvertex = 1;
1428 			} else {
1429 				lock1 = flk_get_lock();
1430 				COPY(lock1, lock);
1431 				lock1->l_end = request->l_start - 1;
1432 				topology[0] = lock1;
1433 				topology[1] = request;
1434 				nvertex = 2;
1435 			}
1436 		} else if (request->l_start < lock->l_start)  {
1437 			topology[0] = request;
1438 			nvertex = 1;
1439 		} else {
1440 			topology[0] = request;
1441 			nvertex = 1;
1442 		}
1443 	} else {
1444 		if (request->l_start > lock->l_start) {
1445 			if (lock_effect == FLK_STAY_SAME) {
1446 				request->l_start = lock->l_start;
1447 				topology[0] = request;
1448 				nvertex = 1;
1449 			} else {
1450 				lock1 = flk_get_lock();
1451 				COPY(lock1, lock);
1452 				lock1->l_end = request->l_start - 1;
1453 				topology[0] = lock1;
1454 				topology[1] = request;
1455 				nvertex = 2;
1456 			}
1457 		} else if (request->l_start < lock->l_start) {
1458 			topology[0] = request;
1459 			nvertex = 1;
1460 		} else {
1461 			if (lock_effect !=  FLK_UNLOCK) {
1462 				topology[0] = request;
1463 				nvertex = 1;
1464 			} else {
1465 				flk_delete_active_lock(lock, 0);
1466 				flk_wakeup(lock, 1);
1467 				flk_free_lock(lock);
1468 				CHECK_SLEEPING_LOCKS(gp);
1469 				CHECK_ACTIVE_LOCKS(gp);
1470 				return (1);
1471 			}
1472 		}
1473 	}
1474 
1475 recompute:
1476 
1477 	/*
1478 	 * For unlock we don't send the 'request' to for recomputing
1479 	 * dependencies because no lock will add an edge to this.
1480 	 */
1481 
1482 	if (lock_effect == FLK_UNLOCK) {
1483 		topology[nvertex-1] = NULL;
1484 		nvertex--;
1485 	}
1486 	for (i = 0; i < nvertex; i++) {
1487 		topology[i]->l_state |= RECOMPUTE_LOCK;
1488 		topology[i]->l_color = NO_COLOR;
1489 	}
1490 
1491 	ASSERT(FIRST_ADJ(lock) == HEAD(lock));
1492 
1493 	/*
1494 	 * we remove the adjacent edges for all vertices' to this vertex
1495 	 * 'lock'.
1496 	 */
1497 
1498 	ep = FIRST_IN(lock);
1499 	while (ep != HEAD(lock)) {
1500 		ADJ_LIST_REMOVE(ep);
1501 		ep = NEXT_IN(ep);
1502 	}
1503 
1504 	flk_delete_active_lock(lock, 0);
1505 
1506 	/* We are ready for recomputing the dependencies now */
1507 
1508 	flk_recompute_dependencies(lock, topology, nvertex, 1);
1509 
1510 	for (i = 0; i < nvertex; i++) {
1511 		topology[i]->l_state &= ~RECOMPUTE_LOCK;
1512 		topology[i]->l_color = NO_COLOR;
1513 	}
1514 
1515 
1516 	if (lock_effect == FLK_UNLOCK) {
1517 		nvertex++;
1518 	}
1519 	for (i = 0; i < nvertex - 1; i++) {
1520 		flk_insert_active_lock(topology[i]);
1521 	}
1522 
1523 
1524 	if (lock_effect == FLK_DOWNGRADE || lock_effect == FLK_UNLOCK) {
1525 		flk_wakeup(lock, 0);
1526 	} else {
1527 		ep = FIRST_IN(lock);
1528 		while (ep != HEAD(lock)) {
1529 			lock->l_sedge = NEXT_IN(ep);
1530 			IN_LIST_REMOVE(ep);
1531 			flk_update_proc_graph(ep, 1);
1532 			flk_free_edge(ep);
1533 			ep = lock->l_sedge;
1534 		}
1535 	}
1536 	flk_free_lock(lock);
1537 
1538 	CHECK_SLEEPING_LOCKS(gp);
1539 	CHECK_ACTIVE_LOCKS(gp);
1540 	return (0);
1541 }
1542 
1543 /*
1544  * Insert a lock into the active queue.
1545  */
1546 
1547 static void
flk_insert_active_lock(lock_descriptor_t * new_lock)1548 flk_insert_active_lock(lock_descriptor_t *new_lock)
1549 {
1550 	graph_t	*gp = new_lock->l_graph;
1551 	vnode_t	*vp = new_lock->l_vnode;
1552 	lock_descriptor_t *first_lock, *lock;
1553 
1554 	ASSERT(MUTEX_HELD(&gp->gp_mutex));
1555 
1556 	SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
1557 	first_lock = lock;
1558 
1559 	if (first_lock != NULL) {
1560 		for (; (lock->l_vnode == vp &&
1561 		    lock->l_start < new_lock->l_start); lock = lock->l_next)
1562 			;
1563 	} else {
1564 		lock = ACTIVE_HEAD(gp);
1565 	}
1566 
1567 	lock->l_prev->l_next = new_lock;
1568 	new_lock->l_next = lock;
1569 	new_lock->l_prev = lock->l_prev;
1570 	lock->l_prev = new_lock;
1571 
1572 	if (first_lock == NULL || (new_lock->l_start <= first_lock->l_start)) {
1573 		vp->v_filocks = (struct filock *)new_lock;
1574 	}
1575 	flk_set_state(new_lock, FLK_ACTIVE_STATE);
1576 	new_lock->l_state |= ACTIVE_LOCK;
1577 
1578 	CHECK_ACTIVE_LOCKS(gp);
1579 	CHECK_SLEEPING_LOCKS(gp);
1580 }
1581 
1582 /*
1583  * Delete the active lock : Performs two functions depending on the
1584  * value of second parameter. One is to remove from the active lists
1585  * only and other is to both remove and free the lock.
1586  */
1587 
1588 static void
flk_delete_active_lock(lock_descriptor_t * lock,int free_lock)1589 flk_delete_active_lock(lock_descriptor_t *lock, int free_lock)
1590 {
1591 	vnode_t *vp = lock->l_vnode;
1592 	graph_t	*gp = lock->l_graph;
1593 
1594 	ASSERT(MUTEX_HELD(&gp->gp_mutex));
1595 	if (free_lock)
1596 		ASSERT(NO_DEPENDENTS(lock));
1597 	ASSERT(NOT_BLOCKED(lock));
1598 	ASSERT(IS_ACTIVE(lock));
1599 
1600 	ASSERT((vp->v_filocks != NULL));
1601 
1602 	if (vp->v_filocks == (struct filock *)lock) {
1603 		vp->v_filocks = (struct filock *)
1604 		    ((lock->l_next->l_vnode == vp) ? lock->l_next :
1605 		    NULL);
1606 	}
1607 	lock->l_next->l_prev = lock->l_prev;
1608 	lock->l_prev->l_next = lock->l_next;
1609 	lock->l_next = lock->l_prev = NULL;
1610 	flk_set_state(lock, FLK_DEAD_STATE);
1611 	lock->l_state &= ~ACTIVE_LOCK;
1612 
1613 	if (free_lock)
1614 		flk_free_lock(lock);
1615 	CHECK_ACTIVE_LOCKS(gp);
1616 	CHECK_SLEEPING_LOCKS(gp);
1617 }
1618 
1619 /*
1620  * Insert into the sleep queue.
1621  */
1622 
1623 static void
flk_insert_sleeping_lock(lock_descriptor_t * request)1624 flk_insert_sleeping_lock(lock_descriptor_t *request)
1625 {
1626 	graph_t *gp = request->l_graph;
1627 	vnode_t	*vp = request->l_vnode;
1628 	lock_descriptor_t	*lock;
1629 
1630 	ASSERT(MUTEX_HELD(&gp->gp_mutex));
1631 	ASSERT(IS_INITIAL(request));
1632 
1633 	for (lock = gp->sleeping_locks.l_next; (lock != &gp->sleeping_locks &&
1634 	    lock->l_vnode < vp); lock = lock->l_next)
1635 		;
1636 
1637 	lock->l_prev->l_next = request;
1638 	request->l_prev = lock->l_prev;
1639 	lock->l_prev = request;
1640 	request->l_next = lock;
1641 	flk_set_state(request, FLK_SLEEPING_STATE);
1642 	request->l_state |= SLEEPING_LOCK;
1643 }
1644 
1645 /*
1646  * Cancelling a sleeping lock implies removing a vertex from the
1647  * dependency graph and therefore we should recompute the dependencies
1648  * of all vertices that have a path  to this vertex, w.r.t. all
1649  * vertices reachable from this vertex.
1650  */
1651 
1652 void
flk_cancel_sleeping_lock(lock_descriptor_t * request,int remove_from_queue)1653 flk_cancel_sleeping_lock(lock_descriptor_t *request, int remove_from_queue)
1654 {
1655 	graph_t	*gp = request->l_graph;
1656 	vnode_t *vp = request->l_vnode;
1657 	lock_descriptor_t **topology = NULL;
1658 	edge_t	*ep;
1659 	lock_descriptor_t *vertex, *lock;
1660 	int nvertex = 0;
1661 	int i;
1662 	lock_descriptor_t *vertex_stack;
1663 
1664 	STACK_INIT(vertex_stack);
1665 
1666 	ASSERT(MUTEX_HELD(&gp->gp_mutex));
1667 	/*
1668 	 * count number of vertex pointers that has to be allocated
1669 	 * All vertices that are reachable from request.
1670 	 */
1671 
1672 	STACK_PUSH(vertex_stack, request, l_stack);
1673 
1674 	while ((vertex = STACK_TOP(vertex_stack)) != NULL) {
1675 		STACK_POP(vertex_stack, l_stack);
1676 		for (ep = FIRST_ADJ(vertex); ep != HEAD(vertex);
1677 		    ep = NEXT_ADJ(ep)) {
1678 			if (IS_RECOMPUTE(ep->to_vertex))
1679 				continue;
1680 			ep->to_vertex->l_state |= RECOMPUTE_LOCK;
1681 			STACK_PUSH(vertex_stack, ep->to_vertex, l_stack);
1682 			nvertex++;
1683 		}
1684 	}
1685 
1686 	/*
1687 	 * allocate memory for holding the vertex pointers
1688 	 */
1689 
1690 	if (nvertex) {
1691 		topology = kmem_zalloc(nvertex * sizeof (lock_descriptor_t *),
1692 		    KM_SLEEP);
1693 	}
1694 
1695 	/*
1696 	 * one more pass to actually store the vertices in the
1697 	 * allocated array.
1698 	 * We first check sleeping locks and then active locks
1699 	 * so that topology array will be in a topological
1700 	 * order.
1701 	 */
1702 
1703 	nvertex = 0;
1704 	SET_LOCK_TO_FIRST_SLEEP_VP(gp, lock, vp);
1705 
1706 	if (lock) {
1707 		do {
1708 			if (IS_RECOMPUTE(lock)) {
1709 				lock->l_index = nvertex;
1710 				topology[nvertex++] = lock;
1711 			}
1712 			lock->l_color = NO_COLOR;
1713 			lock = lock->l_next;
1714 		} while (lock->l_vnode == vp);
1715 	}
1716 
1717 	SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
1718 
1719 	if (lock) {
1720 		do {
1721 			if (IS_RECOMPUTE(lock)) {
1722 				lock->l_index = nvertex;
1723 				topology[nvertex++] = lock;
1724 			}
1725 			lock->l_color = NO_COLOR;
1726 			lock = lock->l_next;
1727 		} while (lock->l_vnode == vp);
1728 	}
1729 
1730 	/*
1731 	 * remove in and out edges of request
1732 	 * They are freed after updating proc_graph below.
1733 	 */
1734 
1735 	for (ep = FIRST_IN(request); ep != HEAD(request); ep = NEXT_IN(ep)) {
1736 		ADJ_LIST_REMOVE(ep);
1737 	}
1738 
1739 
1740 	if (remove_from_queue)
1741 		REMOVE_SLEEP_QUEUE(request);
1742 
1743 	/* we are ready to recompute */
1744 
1745 	flk_recompute_dependencies(request, topology, nvertex, 1);
1746 
1747 	ep = FIRST_ADJ(request);
1748 	while (ep != HEAD(request)) {
1749 		IN_LIST_REMOVE(ep);
1750 		request->l_sedge = NEXT_ADJ(ep);
1751 		ADJ_LIST_REMOVE(ep);
1752 		flk_update_proc_graph(ep, 1);
1753 		flk_free_edge(ep);
1754 		ep = request->l_sedge;
1755 	}
1756 
1757 
1758 	/*
1759 	 * unset the RECOMPUTE flag in those vertices
1760 	 */
1761 
1762 	for (i = 0; i < nvertex; i++) {
1763 		topology[i]->l_state &= ~RECOMPUTE_LOCK;
1764 	}
1765 
1766 	/*
1767 	 * free the topology
1768 	 */
1769 	if (nvertex)
1770 		kmem_free((void *)topology,
1771 		    (nvertex * sizeof (lock_descriptor_t *)));
1772 	/*
1773 	 * Possibility of some locks unblocked now
1774 	 */
1775 
1776 	flk_wakeup(request, 0);
1777 
1778 	/*
1779 	 * we expect to have a correctly recomputed graph  now.
1780 	 */
1781 	flk_set_state(request, FLK_DEAD_STATE);
1782 	flk_free_lock(request);
1783 	CHECK_SLEEPING_LOCKS(gp);
1784 	CHECK_ACTIVE_LOCKS(gp);
1785 
1786 }
1787 
1788 /*
1789  * Uncoloring the graph is simply to increment the mark value of the graph
1790  * And only when wrap round takes place will we color all vertices in
1791  * the graph explicitly.
1792  */
1793 
1794 static void
flk_graph_uncolor(graph_t * gp)1795 flk_graph_uncolor(graph_t *gp)
1796 {
1797 	lock_descriptor_t *lock;
1798 
1799 	if (gp->mark == UINT_MAX) {
1800 		gp->mark = 1;
1801 	for (lock = ACTIVE_HEAD(gp)->l_next; lock != ACTIVE_HEAD(gp);
1802 	    lock = lock->l_next)
1803 			lock->l_color  = 0;
1804 
1805 	for (lock = SLEEPING_HEAD(gp)->l_next; lock != SLEEPING_HEAD(gp);
1806 	    lock = lock->l_next)
1807 			lock->l_color  = 0;
1808 	} else {
1809 		gp->mark++;
1810 	}
1811 }
1812 
1813 /*
1814  * Wake up locks that are blocked on the given lock.
1815  */
1816 
1817 static void
flk_wakeup(lock_descriptor_t * lock,int adj_list_remove)1818 flk_wakeup(lock_descriptor_t *lock, int adj_list_remove)
1819 {
1820 	edge_t	*ep;
1821 	graph_t	*gp = lock->l_graph;
1822 	lock_descriptor_t	*lck;
1823 
1824 	ASSERT(MUTEX_HELD(&gp->gp_mutex));
1825 	if (NO_DEPENDENTS(lock))
1826 		return;
1827 	ep = FIRST_IN(lock);
1828 	do {
1829 		/*
1830 		 * delete the edge from the adjacency list
1831 		 * of from vertex. if no more adjacent edges
1832 		 * for this vertex wake this process.
1833 		 */
1834 		lck = ep->from_vertex;
1835 		if (adj_list_remove)
1836 			ADJ_LIST_REMOVE(ep);
1837 		flk_update_proc_graph(ep, 1);
1838 		if (NOT_BLOCKED(lck)) {
1839 			GRANT_WAKEUP(lck);
1840 		}
1841 		lock->l_sedge = NEXT_IN(ep);
1842 		IN_LIST_REMOVE(ep);
1843 		flk_free_edge(ep);
1844 		ep = lock->l_sedge;
1845 	} while (ep != HEAD(lock));
1846 	ASSERT(NO_DEPENDENTS(lock));
1847 }
1848 
1849 /*
1850  * The dependents of request, is checked for its dependency against the
1851  * locks in topology (called topology because the array is and should be in
1852  * topological order for this algorithm, if not in topological order the
1853  * inner loop below might add more edges than necessary. Topological ordering
1854  * of vertices satisfies the property that all edges will be from left to
1855  * right i.e., topology[i] can have an edge to  topology[j], iff i<j)
1856  * If lock l1 in the dependent set of request is dependent (blocked by)
1857  * on lock l2 in topology but does not have a path to it, we add an edge
1858  * in the inner loop below.
1859  *
1860  * We don't want to add an edge between l1 and l2 if there exists
1861  * already a path from l1 to l2, so care has to be taken for those vertices
1862  * that  have two paths to 'request'. These vertices are referred to here
1863  * as barrier locks.
1864  *
1865  * The barriers has to be found (those vertex that originally had two paths
1866  * to request) because otherwise we may end up adding edges unnecessarily
1867  * to vertices in topology, and thus barrier vertices can have an edge
1868  * to a vertex in topology as well a path to it.
1869  */
1870 
1871 static void
flk_recompute_dependencies(lock_descriptor_t * request,lock_descriptor_t ** topology,int nvertex,int update_graph)1872 flk_recompute_dependencies(lock_descriptor_t *request,
1873     lock_descriptor_t **topology, int nvertex, int update_graph)
1874 {
1875 	lock_descriptor_t *vertex, *lock;
1876 	graph_t	*gp = request->l_graph;
1877 	int i, count;
1878 	int barrier_found = 0;
1879 	edge_t	*ep;
1880 	lock_descriptor_t *vertex_stack;
1881 
1882 	STACK_INIT(vertex_stack);
1883 
1884 	ASSERT(MUTEX_HELD(&gp->gp_mutex));
1885 	if (nvertex == 0)
1886 		return;
1887 	flk_graph_uncolor(request->l_graph);
1888 	barrier_found = flk_find_barriers(request);
1889 	request->l_state |= RECOMPUTE_DONE;
1890 
1891 	STACK_PUSH(vertex_stack, request, l_stack);
1892 	request->l_sedge = FIRST_IN(request);
1893 
1894 
1895 	while ((vertex = STACK_TOP(vertex_stack)) != NULL) {
1896 		if (vertex->l_state & RECOMPUTE_DONE) {
1897 			count = 0;
1898 			goto next_in_edge;
1899 		}
1900 		if (IS_BARRIER(vertex)) {
1901 			/* decrement the barrier count */
1902 			if (vertex->l_index) {
1903 				vertex->l_index--;
1904 				/* this guy will be pushed again anyway ? */
1905 				STACK_POP(vertex_stack, l_stack);
1906 				if (vertex->l_index == 0)  {
1907 				/*
1908 				 * barrier is over we can recompute
1909 				 * dependencies for this lock in the
1910 				 * next stack pop
1911 				 */
1912 					vertex->l_state &= ~BARRIER_LOCK;
1913 				}
1914 				continue;
1915 			}
1916 		}
1917 		vertex->l_state |= RECOMPUTE_DONE;
1918 		flk_graph_uncolor(gp);
1919 		count = flk_color_reachables(vertex);
1920 		for (i = 0; i < nvertex; i++) {
1921 			lock = topology[i];
1922 			if (COLORED(lock))
1923 				continue;
1924 			if (BLOCKS(lock, vertex)) {
1925 				(void) flk_add_edge(vertex, lock,
1926 				    NO_CHECK_CYCLE, update_graph);
1927 				COLOR(lock);
1928 				count++;
1929 				count += flk_color_reachables(lock);
1930 			}
1931 
1932 		}
1933 
1934 next_in_edge:
1935 		if (count == nvertex ||
1936 		    vertex->l_sedge == HEAD(vertex)) {
1937 			/* prune the tree below this */
1938 			STACK_POP(vertex_stack, l_stack);
1939 			vertex->l_state &= ~RECOMPUTE_DONE;
1940 			/* update the barrier locks below this! */
1941 			if (vertex->l_sedge != HEAD(vertex) && barrier_found) {
1942 				flk_graph_uncolor(gp);
1943 				flk_update_barriers(vertex);
1944 			}
1945 			continue;
1946 		}
1947 
1948 		ep = vertex->l_sedge;
1949 		lock = ep->from_vertex;
1950 		STACK_PUSH(vertex_stack, lock, l_stack);
1951 		lock->l_sedge = FIRST_IN(lock);
1952 		vertex->l_sedge = NEXT_IN(ep);
1953 	}
1954 
1955 }
1956 
1957 /*
1958  * Color all reachable vertices from vertex that belongs to topology (here
1959  * those that have RECOMPUTE_LOCK set in their state) and yet uncolored.
1960  *
1961  * Note: we need to use a different stack_link l_stack1 because this is
1962  * called from flk_recompute_dependencies() that already uses a stack with
1963  * l_stack as stack_link.
1964  */
1965 
1966 static int
flk_color_reachables(lock_descriptor_t * vertex)1967 flk_color_reachables(lock_descriptor_t *vertex)
1968 {
1969 	lock_descriptor_t *ver, *lock;
1970 	int count;
1971 	edge_t	*ep;
1972 	lock_descriptor_t *vertex_stack;
1973 
1974 	STACK_INIT(vertex_stack);
1975 
1976 	STACK_PUSH(vertex_stack, vertex, l_stack1);
1977 	count = 0;
1978 	while ((ver = STACK_TOP(vertex_stack)) != NULL) {
1979 
1980 		STACK_POP(vertex_stack, l_stack1);
1981 		for (ep = FIRST_ADJ(ver); ep != HEAD(ver);
1982 		    ep = NEXT_ADJ(ep)) {
1983 			lock = ep->to_vertex;
1984 			if (COLORED(lock))
1985 				continue;
1986 			COLOR(lock);
1987 			if (IS_RECOMPUTE(lock))
1988 				count++;
1989 			STACK_PUSH(vertex_stack, lock, l_stack1);
1990 		}
1991 
1992 	}
1993 	return (count);
1994 }
1995 
1996 /*
1997  * Called from flk_recompute_dependencies() this routine decrements
1998  * the barrier count of barrier vertices that are reachable from lock.
1999  */
2000 
2001 static void
flk_update_barriers(lock_descriptor_t * lock)2002 flk_update_barriers(lock_descriptor_t *lock)
2003 {
2004 	lock_descriptor_t *vertex, *lck;
2005 	edge_t	*ep;
2006 	lock_descriptor_t *vertex_stack;
2007 
2008 	STACK_INIT(vertex_stack);
2009 
2010 	STACK_PUSH(vertex_stack, lock, l_stack1);
2011 
2012 	while ((vertex = STACK_TOP(vertex_stack)) != NULL) {
2013 		STACK_POP(vertex_stack, l_stack1);
2014 		for (ep = FIRST_IN(vertex); ep != HEAD(vertex);
2015 		    ep = NEXT_IN(ep)) {
2016 			lck = ep->from_vertex;
2017 			if (COLORED(lck)) {
2018 				if (IS_BARRIER(lck)) {
2019 					ASSERT(lck->l_index > 0);
2020 					lck->l_index--;
2021 					if (lck->l_index == 0)
2022 						lck->l_state &= ~BARRIER_LOCK;
2023 				}
2024 				continue;
2025 			}
2026 			COLOR(lck);
2027 			if (IS_BARRIER(lck)) {
2028 				ASSERT(lck->l_index > 0);
2029 				lck->l_index--;
2030 				if (lck->l_index == 0)
2031 					lck->l_state &= ~BARRIER_LOCK;
2032 			}
2033 			STACK_PUSH(vertex_stack, lck, l_stack1);
2034 		}
2035 	}
2036 }
2037 
2038 /*
2039  * Finds all vertices that are reachable from 'lock' more than once and
2040  * mark them as barrier vertices and increment their barrier count.
2041  * The barrier count is one minus the total number of paths from lock
2042  * to that vertex.
2043  */
2044 
2045 static int
flk_find_barriers(lock_descriptor_t * lock)2046 flk_find_barriers(lock_descriptor_t *lock)
2047 {
2048 	lock_descriptor_t *vertex, *lck;
2049 	int found = 0;
2050 	edge_t	*ep;
2051 	lock_descriptor_t *vertex_stack;
2052 
2053 	STACK_INIT(vertex_stack);
2054 
2055 	STACK_PUSH(vertex_stack, lock, l_stack1);
2056 
2057 	while ((vertex = STACK_TOP(vertex_stack)) != NULL) {
2058 		STACK_POP(vertex_stack, l_stack1);
2059 		for (ep = FIRST_IN(vertex); ep != HEAD(vertex);
2060 		    ep = NEXT_IN(ep)) {
2061 			lck = ep->from_vertex;
2062 			if (COLORED(lck)) {
2063 				/* this is a barrier */
2064 				lck->l_state |= BARRIER_LOCK;
2065 				/* index will have barrier count */
2066 				lck->l_index++;
2067 				if (!found)
2068 					found = 1;
2069 				continue;
2070 			}
2071 			COLOR(lck);
2072 			lck->l_index = 0;
2073 			STACK_PUSH(vertex_stack, lck, l_stack1);
2074 		}
2075 	}
2076 	return (found);
2077 }
2078 
2079 /*
2080  * Finds the first lock that is mainly responsible for blocking this
2081  * request.  If there is no such lock, request->l_flock.l_type is set to
2082  * F_UNLCK.  Otherwise, request->l_flock is filled in with the particulars
2083  * of the blocking lock.
2084  *
2085  * Note: It is possible a request is blocked by a sleeping lock because
2086  * of the fairness policy used in flk_process_request() to construct the
2087  * dependencies. (see comments before flk_process_request()).
2088  */
2089 
2090 static void
flk_get_first_blocking_lock(lock_descriptor_t * request)2091 flk_get_first_blocking_lock(lock_descriptor_t *request)
2092 {
2093 	graph_t	*gp = request->l_graph;
2094 	vnode_t *vp = request->l_vnode;
2095 	lock_descriptor_t *lock, *blocker;
2096 
2097 	ASSERT(MUTEX_HELD(&gp->gp_mutex));
2098 	blocker = NULL;
2099 	SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
2100 
2101 	if (lock) {
2102 		do {
2103 			if (BLOCKS(lock, request)) {
2104 				blocker = lock;
2105 				break;
2106 			}
2107 			lock = lock->l_next;
2108 		} while (lock->l_vnode == vp);
2109 	}
2110 
2111 	if (blocker == NULL && request->l_flock.l_type == F_RDLCK) {
2112 		/*
2113 		 * No active lock is blocking this request, but if a read
2114 		 * lock is requested, it may also get blocked by a waiting
2115 		 * writer. So search all sleeping locks and see if there is
2116 		 * a writer waiting.
2117 		 */
2118 		SET_LOCK_TO_FIRST_SLEEP_VP(gp, lock, vp);
2119 		if (lock) {
2120 			do {
2121 				if (BLOCKS(lock, request)) {
2122 					blocker = lock;
2123 					break;
2124 				}
2125 				lock = lock->l_next;
2126 			} while (lock->l_vnode == vp);
2127 		}
2128 	}
2129 
2130 	if (blocker) {
2131 		report_blocker(blocker, request);
2132 	} else
2133 		request->l_flock.l_type = F_UNLCK;
2134 }
2135 
/*
 * Get the graph_t structure associated with a vnode.
 * If 'initialize' is non-zero, and the graph_t structure for this vnode has
 * not yet been initialized, then a new element is allocated and returned.
 */
graph_t *
flk_get_lock_graph(vnode_t *vp, int initialize)
{
	graph_t *gp;
	graph_t *gp_alloc = NULL;
	int index = HASH_INDEX(vp);

	if (initialize == FLK_USE_GRAPH) {
		/* Lookup only; NULL means no graph exists for this bucket. */
		mutex_enter(&flock_lock);
		gp = lock_graph[index];
		mutex_exit(&flock_lock);
		return (gp);
	}

	ASSERT(initialize == FLK_INIT_GRAPH);

	if (lock_graph[index] == NULL) {

		/*
		 * Optimistic allocation outside of flock_lock, since
		 * KM_SLEEP may block.  The unlocked read above can race
		 * with another initializer; we recheck under the lock
		 * below and free the loser's copy.
		 */
		gp_alloc = kmem_zalloc(sizeof (graph_t), KM_SLEEP);

		/* Initialize the graph */

		/* both lock lists start as empty circular lists */
		gp_alloc->active_locks.l_next =
		    gp_alloc->active_locks.l_prev =
		    (lock_descriptor_t *)ACTIVE_HEAD(gp_alloc);
		gp_alloc->sleeping_locks.l_next =
		    gp_alloc->sleeping_locks.l_prev =
		    (lock_descriptor_t *)SLEEPING_HEAD(gp_alloc);
		gp_alloc->index = index;
		mutex_init(&gp_alloc->gp_mutex, NULL, MUTEX_DEFAULT, NULL);
	}

	mutex_enter(&flock_lock);

	gp = lock_graph[index];

	/* Recheck the value within flock_lock */
	if (gp == NULL) {
		struct flock_globals *fg;

		/* We must have previously allocated the graph_t structure */
		ASSERT(gp_alloc != NULL);
		lock_graph[index] = gp = gp_alloc;
		/*
		 * The lockmgr status is only needed if KLM is loaded.
		 */
		if (flock_zone_key != ZONE_KEY_UNINITIALIZED) {
			fg = flk_get_globals();
			fg->lockmgr_status[index] = fg->flk_lockmgr_status;
		}
	}

	mutex_exit(&flock_lock);

	if ((gp_alloc != NULL) && (gp != gp_alloc)) {
		/* There was a race to allocate the graph_t and we lost */
		mutex_destroy(&gp_alloc->gp_mutex);
		kmem_free(gp_alloc, sizeof (graph_t));
	}

	return (gp);
}
2203 
2204 /*
2205  * PSARC case 1997/292
2206  */
2207 int
cl_flk_has_remote_locks_for_nlmid(vnode_t * vp,int nlmid)2208 cl_flk_has_remote_locks_for_nlmid(vnode_t *vp, int nlmid)
2209 {
2210 	lock_descriptor_t *lock;
2211 	int result = 0;
2212 	graph_t *gp;
2213 	int			lock_nlmid;
2214 
2215 	/*
2216 	 * Check to see if node is booted as a cluster. If not, return.
2217 	 */
2218 	if ((cluster_bootflags & CLUSTER_BOOTED) == 0) {
2219 		return (0);
2220 	}
2221 
2222 	gp = flk_get_lock_graph(vp, FLK_USE_GRAPH);
2223 	if (gp == NULL) {
2224 		return (0);
2225 	}
2226 
2227 	mutex_enter(&gp->gp_mutex);
2228 
2229 	SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
2230 
2231 	if (lock) {
2232 		while (lock->l_vnode == vp) {
2233 			/* get NLM id from sysid */
2234 			lock_nlmid = GETNLMID(lock->l_flock.l_sysid);
2235 
2236 			/*
2237 			 * If NLM server request _and_ nlmid of lock matches
2238 			 * nlmid of argument, then we've found a remote lock.
2239 			 */
2240 			if (IS_LOCKMGR(lock) && nlmid == lock_nlmid) {
2241 				result = 1;
2242 				goto done;
2243 			}
2244 			lock = lock->l_next;
2245 		}
2246 	}
2247 
2248 	SET_LOCK_TO_FIRST_SLEEP_VP(gp, lock, vp);
2249 
2250 	if (lock) {
2251 		while (lock->l_vnode == vp) {
2252 			/* get NLM id from sysid */
2253 			lock_nlmid = GETNLMID(lock->l_flock.l_sysid);
2254 
2255 			/*
2256 			 * If NLM server request _and_ nlmid of lock matches
2257 			 * nlmid of argument, then we've found a remote lock.
2258 			 */
2259 			if (IS_LOCKMGR(lock) && nlmid == lock_nlmid) {
2260 				result = 1;
2261 				goto done;
2262 			}
2263 			lock = lock->l_next;
2264 		}
2265 	}
2266 
2267 done:
2268 	mutex_exit(&gp->gp_mutex);
2269 	return (result);
2270 }
2271 
2272 /*
2273  * Determine whether there are any locks for the given vnode with a remote
2274  * sysid.  Returns zero if not, non-zero if there are.
2275  *
2276  * Note that the return value from this function is potentially invalid
2277  * once it has been returned.  The caller is responsible for providing its
2278  * own synchronization mechanism to ensure that the return value is useful
2279  * (e.g., see nfs_lockcompletion()).
2280  */
2281 int
flk_has_remote_locks(vnode_t * vp)2282 flk_has_remote_locks(vnode_t *vp)
2283 {
2284 	lock_descriptor_t *lock;
2285 	int result = 0;
2286 	graph_t *gp;
2287 
2288 	gp = flk_get_lock_graph(vp, FLK_USE_GRAPH);
2289 	if (gp == NULL) {
2290 		return (0);
2291 	}
2292 
2293 	mutex_enter(&gp->gp_mutex);
2294 
2295 	SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
2296 
2297 	if (lock) {
2298 		while (lock->l_vnode == vp) {
2299 			if (IS_REMOTE(lock)) {
2300 				result = 1;
2301 				goto done;
2302 			}
2303 			lock = lock->l_next;
2304 		}
2305 	}
2306 
2307 	SET_LOCK_TO_FIRST_SLEEP_VP(gp, lock, vp);
2308 
2309 	if (lock) {
2310 		while (lock->l_vnode == vp) {
2311 			if (IS_REMOTE(lock)) {
2312 				result = 1;
2313 				goto done;
2314 			}
2315 			lock = lock->l_next;
2316 		}
2317 	}
2318 
2319 done:
2320 	mutex_exit(&gp->gp_mutex);
2321 	return (result);
2322 }
2323 
2324 /*
2325  * Determine whether there are any locks for the given vnode with a remote
2326  * sysid matching given sysid.
2327  * Used by the new (open source) NFS Lock Manager (NLM)
2328  */
2329 int
flk_has_remote_locks_for_sysid(vnode_t * vp,int sysid)2330 flk_has_remote_locks_for_sysid(vnode_t *vp, int sysid)
2331 {
2332 	lock_descriptor_t *lock;
2333 	int result = 0;
2334 	graph_t *gp;
2335 
2336 	if (sysid == 0)
2337 		return (0);
2338 
2339 	gp = flk_get_lock_graph(vp, FLK_USE_GRAPH);
2340 	if (gp == NULL) {
2341 		return (0);
2342 	}
2343 
2344 	mutex_enter(&gp->gp_mutex);
2345 
2346 	SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
2347 
2348 	if (lock) {
2349 		while (lock->l_vnode == vp) {
2350 			if (lock->l_flock.l_sysid == sysid) {
2351 				result = 1;
2352 				goto done;
2353 			}
2354 			lock = lock->l_next;
2355 		}
2356 	}
2357 
2358 	SET_LOCK_TO_FIRST_SLEEP_VP(gp, lock, vp);
2359 
2360 	if (lock) {
2361 		while (lock->l_vnode == vp) {
2362 			if (lock->l_flock.l_sysid == sysid) {
2363 				result = 1;
2364 				goto done;
2365 			}
2366 			lock = lock->l_next;
2367 		}
2368 	}
2369 
2370 done:
2371 	mutex_exit(&gp->gp_mutex);
2372 	return (result);
2373 }
2374 
2375 /*
2376  * Determine if there are any locks owned by the given sysid.
2377  * Returns zero if not, non-zero if there are.  Note that this return code
2378  * could be derived from flk_get_{sleeping,active}_locks, but this routine
2379  * avoids all the memory allocations of those routines.
2380  *
2381  * This routine has the same synchronization issues as
2382  * flk_has_remote_locks.
2383  */
2384 
2385 int
flk_sysid_has_locks(int sysid,int lck_type)2386 flk_sysid_has_locks(int sysid, int lck_type)
2387 {
2388 	int		has_locks = 0;
2389 	lock_descriptor_t	*lock;
2390 	graph_t 	*gp;
2391 	int		i;
2392 
2393 	for (i = 0; i < HASH_SIZE && !has_locks; i++) {
2394 		mutex_enter(&flock_lock);
2395 		gp = lock_graph[i];
2396 		mutex_exit(&flock_lock);
2397 		if (gp == NULL) {
2398 			continue;
2399 		}
2400 
2401 		mutex_enter(&gp->gp_mutex);
2402 
2403 		if (lck_type & FLK_QUERY_ACTIVE) {
2404 			for (lock = ACTIVE_HEAD(gp)->l_next;
2405 			    lock != ACTIVE_HEAD(gp) && !has_locks;
2406 			    lock = lock->l_next) {
2407 				if (lock->l_flock.l_sysid == sysid)
2408 					has_locks = 1;
2409 			}
2410 		}
2411 
2412 		if (lck_type & FLK_QUERY_SLEEPING) {
2413 			for (lock = SLEEPING_HEAD(gp)->l_next;
2414 			    lock != SLEEPING_HEAD(gp) && !has_locks;
2415 			    lock = lock->l_next) {
2416 				if (lock->l_flock.l_sysid == sysid)
2417 					has_locks = 1;
2418 			}
2419 		}
2420 		mutex_exit(&gp->gp_mutex);
2421 	}
2422 
2423 	return (has_locks);
2424 }
2425 
2426 
2427 /*
2428  * PSARC case 1997/292
2429  *
2430  * Requires: "sysid" is a pair [nlmid, sysid].  The lower half is 16-bit
2431  *  quantity, the real sysid generated by the NLM server; the upper half
2432  *  identifies the node of the cluster where the NLM server ran.
2433  *  This routine is only called by an NLM server running in a cluster.
2434  * Effects: Remove all locks held on behalf of the client identified
2435  *  by "sysid."
2436  */
2437 void
cl_flk_remove_locks_by_sysid(int sysid)2438 cl_flk_remove_locks_by_sysid(int sysid)
2439 {
2440 	graph_t	*gp;
2441 	int i;
2442 	lock_descriptor_t *lock, *nlock;
2443 
2444 	/*
2445 	 * Check to see if node is booted as a cluster. If not, return.
2446 	 */
2447 	if ((cluster_bootflags & CLUSTER_BOOTED) == 0) {
2448 		return;
2449 	}
2450 
2451 	ASSERT(sysid != 0);
2452 	for (i = 0; i < HASH_SIZE; i++) {
2453 		mutex_enter(&flock_lock);
2454 		gp = lock_graph[i];
2455 		mutex_exit(&flock_lock);
2456 
2457 		if (gp == NULL)
2458 			continue;
2459 
2460 		mutex_enter(&gp->gp_mutex);	/*  get mutex on lock graph */
2461 
2462 		/* signal sleeping requests so that they bail out */
2463 		lock = SLEEPING_HEAD(gp)->l_next;
2464 		while (lock != SLEEPING_HEAD(gp)) {
2465 			nlock = lock->l_next;
2466 			if (lock->l_flock.l_sysid == sysid) {
2467 				INTERRUPT_WAKEUP(lock);
2468 			}
2469 			lock = nlock;
2470 		}
2471 
2472 		/* delete active locks */
2473 		lock = ACTIVE_HEAD(gp)->l_next;
2474 		while (lock != ACTIVE_HEAD(gp)) {
2475 			nlock = lock->l_next;
2476 			if (lock->l_flock.l_sysid == sysid) {
2477 				flk_delete_active_lock(lock, 0);
2478 				flk_wakeup(lock, 1);
2479 				flk_free_lock(lock);
2480 			}
2481 			lock = nlock;
2482 		}
2483 		mutex_exit(&gp->gp_mutex);    /* release mutex on lock graph */
2484 	}
2485 }
2486 
2487 /*
2488  * Delete all locks in the system that belongs to the sysid of the request.
2489  */
2490 
2491 static void
flk_delete_locks_by_sysid(lock_descriptor_t * request)2492 flk_delete_locks_by_sysid(lock_descriptor_t *request)
2493 {
2494 	int	sysid  = request->l_flock.l_sysid;
2495 	lock_descriptor_t *lock, *nlock;
2496 	graph_t	*gp;
2497 	int i;
2498 
2499 	ASSERT(MUTEX_HELD(&request->l_graph->gp_mutex));
2500 	ASSERT(sysid != 0);
2501 
2502 	mutex_exit(&request->l_graph->gp_mutex);
2503 
2504 	for (i = 0; i < HASH_SIZE; i++) {
2505 		mutex_enter(&flock_lock);
2506 		gp = lock_graph[i];
2507 		mutex_exit(&flock_lock);
2508 
2509 		if (gp == NULL)
2510 			continue;
2511 
2512 		mutex_enter(&gp->gp_mutex);
2513 
2514 		/* signal sleeping requests so that they bail out */
2515 		lock = SLEEPING_HEAD(gp)->l_next;
2516 		while (lock != SLEEPING_HEAD(gp)) {
2517 			nlock = lock->l_next;
2518 			if (lock->l_flock.l_sysid == sysid) {
2519 				INTERRUPT_WAKEUP(lock);
2520 			}
2521 			lock = nlock;
2522 		}
2523 
2524 		/* delete active locks */
2525 		lock = ACTIVE_HEAD(gp)->l_next;
2526 		while (lock != ACTIVE_HEAD(gp)) {
2527 			nlock = lock->l_next;
2528 			if (lock->l_flock.l_sysid == sysid) {
2529 				flk_delete_active_lock(lock, 0);
2530 				flk_wakeup(lock, 1);
2531 				flk_free_lock(lock);
2532 			}
2533 			lock = nlock;
2534 		}
2535 		mutex_exit(&gp->gp_mutex);
2536 	}
2537 
2538 	mutex_enter(&request->l_graph->gp_mutex);
2539 }
2540 
/*
 * Clustering: Deletes PXFS locks
 * Effects: Delete all locks on files in the given file system and with the
 *  given PXFS id.
 */
void
cl_flk_delete_pxfs_locks(struct vfs *vfsp, int pxfsid)
{
	lock_descriptor_t *lock, *nlock;
	graph_t	*gp;
	int i;

	/* Walk every lock graph in the hash table. */
	for (i = 0; i < HASH_SIZE; i++) {
		mutex_enter(&flock_lock);
		gp = lock_graph[i];
		mutex_exit(&flock_lock);

		if (gp == NULL)
			continue;

		mutex_enter(&gp->gp_mutex);

		/* signal sleeping requests so that they bail out */
		lock = SLEEPING_HEAD(gp)->l_next;
		while (lock != SLEEPING_HEAD(gp)) {
			/* save next: cancellation unlinks this descriptor */
			nlock = lock->l_next;
			if (lock->l_vnode->v_vfsp == vfsp) {
				/* all locks on a PXFS vfs must be PXFS locks */
				ASSERT(IS_PXFS(lock));
				if (GETPXFSID(lock->l_flock.l_sysid) ==
				    pxfsid) {
					flk_set_state(lock,
					    FLK_CANCELLED_STATE);
					flk_cancel_sleeping_lock(lock, 1);
				}
			}
			lock = nlock;
		}

		/* delete active locks */
		lock = ACTIVE_HEAD(gp)->l_next;
		while (lock != ACTIVE_HEAD(gp)) {
			/* save next: deletion unlinks and frees this one */
			nlock = lock->l_next;
			if (lock->l_vnode->v_vfsp == vfsp) {
				ASSERT(IS_PXFS(lock));
				if (GETPXFSID(lock->l_flock.l_sysid) ==
				    pxfsid) {
					flk_delete_active_lock(lock, 0);
					flk_wakeup(lock, 1);
					flk_free_lock(lock);
				}
			}
			lock = nlock;
		}
		mutex_exit(&gp->gp_mutex);
	}
}
2597 
2598 /*
2599  * Search for a sleeping lock manager lock which matches exactly this lock
2600  * request; if one is found, fake a signal to cancel it.
2601  *
2602  * Return 1 if a matching lock was found, 0 otherwise.
2603  */
2604 
2605 static int
flk_canceled(lock_descriptor_t * request)2606 flk_canceled(lock_descriptor_t *request)
2607 {
2608 	lock_descriptor_t *lock, *nlock;
2609 	graph_t *gp = request->l_graph;
2610 	vnode_t *vp = request->l_vnode;
2611 
2612 	ASSERT(MUTEX_HELD(&gp->gp_mutex));
2613 	ASSERT(IS_LOCKMGR(request));
2614 	SET_LOCK_TO_FIRST_SLEEP_VP(gp, lock, vp);
2615 
2616 	if (lock) {
2617 		while (lock->l_vnode == vp) {
2618 			nlock = lock->l_next;
2619 			if (SAME_OWNER(lock, request) &&
2620 			    lock->l_start == request->l_start &&
2621 			    lock->l_end == request->l_end) {
2622 				INTERRUPT_WAKEUP(lock);
2623 				return (1);
2624 			}
2625 			lock = nlock;
2626 		}
2627 	}
2628 	return (0);
2629 }
2630 
/*
 * Remove all the locks for the vnode belonging to the given pid and sysid.
 * A pid of IGN_PID matches every pid with the given sysid.
 */

void
cleanlocks(vnode_t *vp, pid_t pid, int sysid)
{
	graph_t	*gp;
	lock_descriptor_t *lock, *nlock;
	lock_descriptor_t *link_stack;

	STACK_INIT(link_stack);

	gp = flk_get_lock_graph(vp, FLK_USE_GRAPH);

	/* no graph for this vnode means it has no locks at all */
	if (gp == NULL)
		return;
	mutex_enter(&gp->gp_mutex);

	CHECK_SLEEPING_LOCKS(gp);
	CHECK_ACTIVE_LOCKS(gp);

	SET_LOCK_TO_FIRST_SLEEP_VP(gp, lock, vp);

	if (lock) {
		do {
			/* save next: the wakeup unlinks this descriptor */
			nlock = lock->l_next;
			if ((lock->l_flock.l_pid == pid ||
			    pid == IGN_PID) &&
			    lock->l_flock.l_sysid == sysid) {
				CANCEL_WAKEUP(lock);
			}
			lock = nlock;
		} while (lock->l_vnode == vp);
	}

	SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);

	if (lock) {
		do {
			nlock = lock->l_next;
			if ((lock->l_flock.l_pid == pid ||
			    pid == IGN_PID) &&
			    lock->l_flock.l_sysid == sysid) {
				/*
				 * Unlink now but defer the wakeup/free
				 * (via the stack) until the list walk is
				 * done, so the walk stays consistent.
				 */
				flk_delete_active_lock(lock, 0);
				STACK_PUSH(link_stack, lock, l_stack);
			}
			lock = nlock;
		} while (lock->l_vnode == vp);
	}

	/* now wake the waiters on, and free, each deleted lock */
	while ((lock = STACK_TOP(link_stack)) != NULL) {
		STACK_POP(link_stack, l_stack);
		flk_wakeup(lock, 1);
		flk_free_lock(lock);
	}

	CHECK_SLEEPING_LOCKS(gp);
	CHECK_ACTIVE_LOCKS(gp);
	CHECK_OWNER_LOCKS(gp, pid, sysid, vp);
	mutex_exit(&gp->gp_mutex);
}
2693 
2694 
2695 /*
2696  * Called from 'fs' read and write routines for files that have mandatory
2697  * locking enabled.
2698  */
2699 
2700 int
chklock(struct vnode * vp,int iomode,u_offset_t offset,ssize_t len,int fmode,caller_context_t * ct)2701 chklock(struct vnode *vp, int iomode, u_offset_t offset, ssize_t len, int fmode,
2702     caller_context_t *ct)
2703 {
2704 	register int	i;
2705 	struct flock64 	bf;
2706 	int 		error = 0;
2707 
2708 	bf.l_type = (iomode & FWRITE) ? F_WRLCK : F_RDLCK;
2709 	bf.l_whence = 0;
2710 	bf.l_start = offset;
2711 	bf.l_len = len;
2712 	if (ct == NULL) {
2713 		bf.l_pid = curproc->p_pid;
2714 		bf.l_sysid = 0;
2715 	} else {
2716 		bf.l_pid = ct->cc_pid;
2717 		bf.l_sysid = ct->cc_sysid;
2718 	}
2719 	i = (fmode & (FNDELAY|FNONBLOCK)) ? INOFLCK : INOFLCK|SLPFLCK;
2720 	if ((i = reclock(vp, &bf, i, 0, offset, NULL)) != 0 ||
2721 	    bf.l_type != F_UNLCK)
2722 		error = i ? i : EAGAIN;
2723 	return (error);
2724 }
2725 
2726 /*
2727  * convoff - converts the given data (start, whence) to the
2728  * given whence.
2729  */
2730 int
convoff(struct vnode * vp,struct flock64 * lckdat,int whence,offset_t offset)2731 convoff(struct vnode *vp, struct flock64 *lckdat, int whence, offset_t offset)
2732 {
2733 	int 		error;
2734 	struct vattr 	vattr;
2735 
2736 	if ((lckdat->l_whence == 2) || (whence == 2)) {
2737 		vattr.va_mask = AT_SIZE;
2738 		if (error = VOP_GETATTR(vp, &vattr, 0, CRED(), NULL))
2739 			return (error);
2740 	}
2741 
2742 	switch (lckdat->l_whence) {
2743 	case 1:
2744 		lckdat->l_start += offset;
2745 		break;
2746 	case 2:
2747 		lckdat->l_start += vattr.va_size;
2748 		/* FALLTHRU */
2749 	case 0:
2750 		break;
2751 	default:
2752 		return (EINVAL);
2753 	}
2754 
2755 	if (lckdat->l_start < 0)
2756 		return (EINVAL);
2757 
2758 	switch (whence) {
2759 	case 1:
2760 		lckdat->l_start -= offset;
2761 		break;
2762 	case 2:
2763 		lckdat->l_start -= vattr.va_size;
2764 		/* FALLTHRU */
2765 	case 0:
2766 		break;
2767 	default:
2768 		return (EINVAL);
2769 	}
2770 
2771 	lckdat->l_whence = (short)whence;
2772 	return (0);
2773 }
2774 
2775 
2776 /* 	proc_graph function definitions */
2777 
/*
 * Function checks for deadlock due to the new 'lock'. If deadlock found
 * edges of this lock are freed and returned.
 */

static int
flk_check_deadlock(lock_descriptor_t *lock)
{
	proc_vertex_t	*start_vertex, *pvertex;
	proc_vertex_t *dvertex;
	proc_edge_t *pep, *ppep;
	edge_t	*ep, *nep;
	proc_vertex_t *process_stack;

	STACK_INIT(process_stack);

	mutex_enter(&flock_lock);
	start_vertex = flk_get_proc_vertex(lock);
	ASSERT(start_vertex != NULL);

	/* construct the edges from this process to other processes */

	ep = FIRST_ADJ(lock);
	while (ep != HEAD(lock)) {
		proc_vertex_t *adj_proc;

		adj_proc = flk_get_proc_vertex(ep->to_vertex);
		/* reuse an existing proc edge if one already exists */
		for (pep = start_vertex->edge; pep != NULL; pep = pep->next) {
			if (pep->to_proc == adj_proc) {
				ASSERT(pep->refcount);
				pep->refcount++;
				break;
			}
		}
		if (pep == NULL) {
			pep = flk_get_proc_edge();
			pep->to_proc = adj_proc;
			pep->refcount = 1;
			adj_proc->incount++;
			pep->next = start_vertex->edge;
			start_vertex->edge = pep;
		}
		ep = NEXT_ADJ(ep);
	}

	/* now the edges from other processes to this one */

	ep = FIRST_IN(lock);

	while (ep != HEAD(lock)) {
		proc_vertex_t *in_proc;

		in_proc = flk_get_proc_vertex(ep->from_vertex);

		for (pep = in_proc->edge; pep != NULL; pep = pep->next) {
			if (pep->to_proc == start_vertex) {
				ASSERT(pep->refcount);
				pep->refcount++;
				break;
			}
		}
		if (pep == NULL) {
			pep = flk_get_proc_edge();
			pep->to_proc = start_vertex;
			pep->refcount = 1;
			start_vertex->incount++;
			pep->next = in_proc->edge;
			in_proc->edge = pep;
		}
		ep = NEXT_IN(ep);
	}

	/* with no incoming edges, this vertex cannot be on a cycle */
	if (start_vertex->incount == 0) {
		mutex_exit(&flock_lock);
		return (0);
	}

	flk_proc_graph_uncolor();

	/*
	 * Iterative DFS from start_vertex using arrival/departure
	 * stamps: reaching a vertex that has arrived but not departed
	 * means a back edge, i.e. a cycle (deadlock).
	 */
	start_vertex->p_sedge = start_vertex->edge;

	STACK_PUSH(process_stack, start_vertex, p_stack);

	while ((pvertex = STACK_TOP(process_stack)) != NULL) {
		for (pep = pvertex->p_sedge; pep != NULL; pep = pep->next) {
			dvertex = pep->to_proc;
			if (!PROC_ARRIVED(dvertex)) {
				/* descend; remember where to resume */
				STACK_PUSH(process_stack, dvertex, p_stack);
				dvertex->p_sedge = dvertex->edge;
				PROC_ARRIVE(pvertex);
				pvertex->p_sedge = pep->next;
				break;
			}
			if (!PROC_DEPARTED(dvertex))
				goto deadlock;
		}
		if (pep == NULL) {
			/* all edges explored; finish this vertex */
			PROC_DEPART(pvertex);
			STACK_POP(process_stack, p_stack);
		}
	}
	mutex_exit(&flock_lock);
	return (0);

deadlock:

	/* we remove all lock edges and proc edges */

	ep = FIRST_ADJ(lock);
	while (ep != HEAD(lock)) {
		proc_vertex_t *adj_proc;
		adj_proc = flk_get_proc_vertex(ep->to_vertex);
		nep = NEXT_ADJ(ep);
		IN_LIST_REMOVE(ep);
		ADJ_LIST_REMOVE(ep);
		flk_free_edge(ep);
		/* drop the proc edge reference that mirrored this edge */
		ppep = start_vertex->edge;
		for (pep = start_vertex->edge; pep != NULL; ppep = pep,
		    pep = ppep->next) {
			if (pep->to_proc == adj_proc) {
				pep->refcount--;
				if (pep->refcount == 0) {
					if (pep == ppep) {
						start_vertex->edge = pep->next;
					} else {
						ppep->next = pep->next;
					}
					adj_proc->incount--;
					flk_proc_release(adj_proc);
					flk_free_proc_edge(pep);
				}
				break;
			}
		}
		ep = nep;
	}
	ep = FIRST_IN(lock);
	while (ep != HEAD(lock)) {
		proc_vertex_t *in_proc;
		in_proc = flk_get_proc_vertex(ep->from_vertex);
		nep = NEXT_IN(ep);
		IN_LIST_REMOVE(ep);
		ADJ_LIST_REMOVE(ep);
		flk_free_edge(ep);
		ppep = in_proc->edge;
		for (pep = in_proc->edge; pep != NULL; ppep = pep,
		    pep = ppep->next) {
			if (pep->to_proc == start_vertex) {
				pep->refcount--;
				if (pep->refcount == 0) {
					if (pep == ppep) {
						in_proc->edge = pep->next;
					} else {
						ppep->next = pep->next;
					}
					start_vertex->incount--;
					flk_proc_release(in_proc);
					flk_free_proc_edge(pep);
				}
				break;
			}
		}
		ep = nep;
	}
	flk_proc_release(start_vertex);
	mutex_exit(&flock_lock);
	return (1);
}
2944 
/*
 * Get a proc vertex. If lock's pvertex value gets a correct proc vertex
 * from the list we return that, otherwise we allocate one. If necessary,
 * we grow the list of vertices also.
 */

static proc_vertex_t *
flk_get_proc_vertex(lock_descriptor_t *lock)
{
	int i;
	proc_vertex_t	*pv;
	proc_vertex_t	**palloc;

	ASSERT(MUTEX_HELD(&flock_lock));
	/* fast path: the cached index may still point at our vertex */
	if (lock->pvertex != -1) {
		ASSERT(lock->pvertex >= 0);
		pv = pgraph.proc[lock->pvertex];
		if (pv != NULL && PROC_SAME_OWNER(lock, pv)) {
			return (pv);
		}
	}
	/* slow path: linear search of the whole vertex table */
	for (i = 0; i < pgraph.gcount; i++) {
		pv = pgraph.proc[i];
		if (pv != NULL && PROC_SAME_OWNER(lock, pv)) {
			lock->pvertex = pv->index = i;
			return (pv);
		}
	}
	/* no vertex for this owner yet: allocate a new one */
	pv = kmem_zalloc(sizeof (struct proc_vertex), KM_SLEEP);
	pv->pid = lock->l_flock.l_pid;
	pv->sysid = lock->l_flock.l_sysid;
	flk_proc_vertex_allocs++;
	/* prefer reusing a free slot in the existing table */
	if (pgraph.free != 0) {
		for (i = 0; i < pgraph.gcount; i++) {
			if (pgraph.proc[i] == NULL) {
				pgraph.proc[i] = pv;
				lock->pvertex = pv->index = i;
				pgraph.free--;
				return (pv);
			}
		}
	}
	/* table is full: grow it by PROC_CHUNK entries */
	palloc = kmem_zalloc((pgraph.gcount + PROC_CHUNK) *
	    sizeof (proc_vertex_t *), KM_SLEEP);

	if (pgraph.proc) {
		bcopy(pgraph.proc, palloc,
		    pgraph.gcount * sizeof (proc_vertex_t *));

		kmem_free(pgraph.proc,
		    pgraph.gcount * sizeof (proc_vertex_t *));
	}
	pgraph.proc = palloc;
	/* new chunk minus the slot the new vertex occupies */
	pgraph.free += (PROC_CHUNK - 1);
	pv->index = lock->pvertex = pgraph.gcount;
	pgraph.gcount += PROC_CHUNK;
	pgraph.proc[pv->index] = pv;
	return (pv);
}
3004 
3005 /*
3006  * Allocate a proc edge.
3007  */
3008 
3009 static proc_edge_t *
flk_get_proc_edge()3010 flk_get_proc_edge()
3011 {
3012 	proc_edge_t *pep;
3013 
3014 	pep = kmem_zalloc(sizeof (proc_edge_t), KM_SLEEP);
3015 	flk_proc_edge_allocs++;
3016 	return (pep);
3017 }
3018 
3019 /*
3020  * Free the proc edge. Called whenever its reference count goes to zero.
3021  */
3022 
3023 static void
flk_free_proc_edge(proc_edge_t * pep)3024 flk_free_proc_edge(proc_edge_t *pep)
3025 {
3026 	ASSERT(pep->refcount == 0);
3027 	kmem_free((void *)pep, sizeof (proc_edge_t));
3028 	flk_proc_edge_frees++;
3029 }
3030 
3031 /*
3032  * Color the graph explicitly done only when the mark value hits max value.
3033  */
3034 
3035 static void
flk_proc_graph_uncolor()3036 flk_proc_graph_uncolor()
3037 {
3038 	int i;
3039 
3040 	if (pgraph.mark == UINT_MAX) {
3041 		for (i = 0; i < pgraph.gcount; i++)
3042 			if (pgraph.proc[i] != NULL) {
3043 				pgraph.proc[i]->atime = 0;
3044 				pgraph.proc[i]->dtime = 0;
3045 			}
3046 		pgraph.mark = 1;
3047 	} else {
3048 		pgraph.mark++;
3049 	}
3050 }
3051 
3052 /*
3053  * Release the proc vertex iff both there are no in edges and out edges
3054  */
3055 
3056 static void
flk_proc_release(proc_vertex_t * proc)3057 flk_proc_release(proc_vertex_t *proc)
3058 {
3059 	ASSERT(MUTEX_HELD(&flock_lock));
3060 	if (proc->edge == NULL && proc->incount == 0) {
3061 		pgraph.proc[proc->index] = NULL;
3062 		pgraph.free++;
3063 		kmem_free(proc, sizeof (proc_vertex_t));
3064 		flk_proc_vertex_frees++;
3065 	}
3066 }
3067 
/*
 * Updates process graph to reflect change in a lock_graph.
 * Note: We should call this function only after we have a correctly
 * recomputed lock graph. Otherwise we might miss a deadlock detection.
 * eg: in function flk_relation() we call this function after flk_recompute_
 * dependencies() otherwise if a process tries to lock a vnode hashed
 * into another graph it might sleep for ever.
 */

static void
flk_update_proc_graph(edge_t *ep, int delete)
{
	proc_vertex_t *toproc, *fromproc;
	proc_edge_t *pep, *prevpep;

	mutex_enter(&flock_lock);
	toproc = flk_get_proc_vertex(ep->to_vertex);
	fromproc = flk_get_proc_vertex(ep->from_vertex);

	if (!delete)
		goto add;
	/* delete: drop one reference on the fromproc -> toproc edge */
	pep = prevpep = fromproc->edge;

	ASSERT(pep != NULL);
	while (pep != NULL) {
		if (pep->to_proc == toproc) {
			ASSERT(pep->refcount > 0);
			pep->refcount--;
			if (pep->refcount == 0) {
				/* unlink: pep == prevpep means list head */
				if (pep == prevpep) {
					fromproc->edge = pep->next;
				} else {
					prevpep->next = pep->next;
				}
				toproc->incount--;
				flk_proc_release(toproc);
				flk_free_proc_edge(pep);
			}
			break;
		}
		prevpep = pep;
		pep = pep->next;
	}
	/* fromproc may now be free of edges and releasable */
	flk_proc_release(fromproc);
	mutex_exit(&flock_lock);
	return;
add:

	/* add: bump the existing edge's refcount or create a new one */
	pep = fromproc->edge;

	while (pep != NULL) {
		if (pep->to_proc == toproc) {
			ASSERT(pep->refcount > 0);
			pep->refcount++;
			break;
		}
		pep = pep->next;
	}
	if (pep == NULL) {
		pep = flk_get_proc_edge();
		pep->to_proc = toproc;
		pep->refcount = 1;
		toproc->incount++;
		pep->next = fromproc->edge;
		fromproc->edge = pep;
	}
	mutex_exit(&flock_lock);
}
3136 
3137 /*
3138  * Set the control status for lock manager requests.
3139  *
3140  */
3141 
3142 /*
3143  * PSARC case 1997/292
3144  *
3145  * Requires: "nlmid" must be >= 1 and <= clconf_maximum_nodeid().
3146  * Effects: Set the state of the NLM server identified by "nlmid"
3147  *   in the NLM registry to state "nlm_state."
3148  *   Raises exception no_such_nlm if "nlmid" doesn't identify a known
3149  *   NLM server to this LLM.
3150  *   Note that when this routine is called with NLM_SHUTTING_DOWN there
3151  *   may be locks requests that have gotten started but not finished.  In
3152  *   particular, there may be blocking requests that are in the callback code
3153  *   before sleeping (so they're not holding the lock for the graph).  If
3154  *   such a thread reacquires the graph's lock (to go to sleep) after
3155  *   NLM state in the NLM registry  is set to a non-up value,
3156  *   it will notice the status and bail out.  If the request gets
3157  *   granted before the thread can check the NLM registry, let it
3158  *   continue normally.  It will get flushed when we are called with NLM_DOWN.
3159  *
3160  * Modifies: nlm_reg_obj (global)
3161  * Arguments:
3162  *    nlmid	(IN):    id uniquely identifying an NLM server
3163  *    nlm_state (IN):    NLM server state to change "nlmid" to
3164  */
3165 void
cl_flk_set_nlm_status(int nlmid,flk_nlm_status_t nlm_state)3166 cl_flk_set_nlm_status(int nlmid, flk_nlm_status_t nlm_state)
3167 {
3168 	/*
3169 	 * Check to see if node is booted as a cluster. If not, return.
3170 	 */
3171 	if ((cluster_bootflags & CLUSTER_BOOTED) == 0) {
3172 		return;
3173 	}
3174 
3175 	/*
3176 	 * Check for development/debugging.  It is possible to boot a node
3177 	 * in non-cluster mode, and then run a special script, currently
3178 	 * available only to developers, to bring up the node as part of a
3179 	 * cluster.  The problem is that running such a script does not
3180 	 * result in the routine flk_init() being called and hence global array
3181 	 * nlm_reg_status is NULL.  The NLM thinks it's in cluster mode,
3182 	 * but the LLM needs to do an additional check to see if the global
3183 	 * array has been created or not. If nlm_reg_status is NULL, then
3184 	 * return, else continue.
3185 	 */
3186 	if (nlm_reg_status == NULL) {
3187 		return;
3188 	}
3189 
3190 	ASSERT(nlmid <= nlm_status_size && nlmid >= 0);
3191 	mutex_enter(&nlm_reg_lock);
3192 
3193 	if (FLK_REGISTRY_IS_NLM_UNKNOWN(nlm_reg_status, nlmid)) {
3194 		/*
3195 		 * If the NLM server "nlmid" is unknown in the NLM registry,
3196 		 * add it to the registry in the nlm shutting down state.
3197 		 */
3198 		FLK_REGISTRY_CHANGE_NLM_STATE(nlm_reg_status, nlmid,
3199 		    FLK_NLM_SHUTTING_DOWN);
3200 	} else {
3201 		/*
3202 		 * Change the state of the NLM server identified by "nlmid"
3203 		 * in the NLM registry to the argument "nlm_state."
3204 		 */
3205 		FLK_REGISTRY_CHANGE_NLM_STATE(nlm_reg_status, nlmid,
3206 		    nlm_state);
3207 	}
3208 
3209 	/*
3210 	 *  The reason we must register the NLM server that is shutting down
3211 	 *  with an LLM that doesn't already know about it (never sent a lock
3212 	 *  request) is to handle correctly a race between shutdown and a new
3213 	 *  lock request.  Suppose that a shutdown request from the NLM server
3214 	 *  invokes this routine at the LLM, and a thread is spawned to
3215 	 *  service the request. Now suppose a new lock request is in
3216 	 *  progress and has already passed the first line of defense in
3217 	 *  reclock(), which denies new locks requests from NLM servers
3218 	 *  that are not in the NLM_UP state.  After the current routine
3219 	 *  is invoked for both phases of shutdown, the routine will return,
3220 	 *  having done nothing, and the lock request will proceed and
3221 	 *  probably be granted.  The problem is that the shutdown was ignored
3222 	 *  by the lock request because there was no record of that NLM server
3223 	 *  shutting down.   We will be in the peculiar position of thinking
3224 	 *  that we've shutdown the NLM server and all locks at all LLMs have
3225 	 *  been discarded, but in fact there's still one lock held.
3226 	 *  The solution is to record the existence of NLM server and change
3227 	 *  its state immediately to NLM_SHUTTING_DOWN.  The lock request in
3228 	 *  progress may proceed because the next phase NLM_DOWN will catch
3229 	 *  this lock and discard it.
3230 	 */
3231 	mutex_exit(&nlm_reg_lock);
3232 
3233 	switch (nlm_state) {
3234 	case FLK_NLM_UP:
3235 		/*
3236 		 * Change the NLM state of all locks still held on behalf of
3237 		 * the NLM server identified by "nlmid" to NLM_UP.
3238 		 */
3239 		cl_flk_change_nlm_state_all_locks(nlmid, FLK_NLM_UP);
3240 		break;
3241 
3242 	case FLK_NLM_SHUTTING_DOWN:
3243 		/*
3244 		 * Wake up all sleeping locks for the NLM server identified
3245 		 * by "nlmid." Note that eventually all woken threads will
3246 		 * have their lock requests cancelled and descriptors
3247 		 * removed from the sleeping lock list.  Note that the NLM
3248 		 * server state associated with each lock descriptor is
3249 		 * changed to FLK_NLM_SHUTTING_DOWN.
3250 		 */
3251 		cl_flk_wakeup_sleeping_nlm_locks(nlmid);
3252 		break;
3253 
3254 	case FLK_NLM_DOWN:
3255 		/*
3256 		 * Discard all active, granted locks for this NLM server
3257 		 * identified by "nlmid."
3258 		 */
3259 		cl_flk_unlock_nlm_granted(nlmid);
3260 		break;
3261 
3262 	default:
3263 		panic("cl_set_nlm_status: bad status (%d)", nlm_state);
3264 	}
3265 }
3266 
3267 /*
3268  * Set the control status for lock manager requests.
3269  *
3270  * Note that when this routine is called with FLK_WAKEUP_SLEEPERS, there
3271  * may be locks requests that have gotten started but not finished.  In
3272  * particular, there may be blocking requests that are in the callback code
3273  * before sleeping (so they're not holding the lock for the graph).  If
3274  * such a thread reacquires the graph's lock (to go to sleep) after
3275  * flk_lockmgr_status is set to a non-up value, it will notice the status
3276  * and bail out.  If the request gets granted before the thread can check
3277  * flk_lockmgr_status, let it continue normally.  It will get flushed when
3278  * we are called with FLK_LOCKMGR_DOWN.
3279  */
3280 
3281 void
flk_set_lockmgr_status(flk_lockmgr_status_t status)3282 flk_set_lockmgr_status(flk_lockmgr_status_t status)
3283 {
3284 	int i;
3285 	graph_t *gp;
3286 	struct flock_globals *fg;
3287 
3288 	fg = flk_get_globals();
3289 	ASSERT(fg != NULL);
3290 
3291 	mutex_enter(&flock_lock);
3292 	fg->flk_lockmgr_status = status;
3293 	mutex_exit(&flock_lock);
3294 
3295 	/*
3296 	 * If the lock manager is coming back up, all that's needed is to
3297 	 * propagate this information to the graphs.  If the lock manager
3298 	 * is going down, additional action is required, and each graph's
3299 	 * copy of the state is updated atomically with this other action.
3300 	 */
3301 	switch (status) {
3302 	case FLK_LOCKMGR_UP:
3303 		for (i = 0; i < HASH_SIZE; i++) {
3304 			mutex_enter(&flock_lock);
3305 			gp = lock_graph[i];
3306 			mutex_exit(&flock_lock);
3307 			if (gp == NULL)
3308 				continue;
3309 			mutex_enter(&gp->gp_mutex);
3310 			fg->lockmgr_status[i] = status;
3311 			mutex_exit(&gp->gp_mutex);
3312 		}
3313 		break;
3314 	case FLK_WAKEUP_SLEEPERS:
3315 		wakeup_sleeping_lockmgr_locks(fg);
3316 		break;
3317 	case FLK_LOCKMGR_DOWN:
3318 		unlock_lockmgr_granted(fg);
3319 		break;
3320 	default:
3321 		panic("flk_set_lockmgr_status: bad status (%d)", status);
3322 		break;
3323 	}
3324 }
3325 
3326 /*
3327  * This routine returns all the locks that are active or sleeping and are
3328  * associated with a particular set of identifiers.  If lock_state != 0, then
3329  * only locks that match the lock_state are returned. If lock_state == 0, then
3330  * all locks are returned. If pid == NOPID, the pid is ignored.  If
3331  * use_sysid is FALSE, then the sysid is ignored.  If vp is NULL, then the
3332  * vnode pointer is ignored.
3333  *
3334  * A list containing the vnode pointer and an flock structure
3335  * describing the lock is returned.  Each element in the list is
3336  * dynamically allocated and must be freed by the caller.  The
3337  * last item in the list is denoted by a NULL value in the ll_next
3338  * field.
3339  *
3340  * The vnode pointers returned are held.  The caller is responsible
3341  * for releasing these.  Note that the returned list is only a snapshot of
3342  * the current lock information, and that it is a snapshot of a moving
3343  * target (only one graph is locked at a time).
3344  */
3345 
locklist_t *
get_lock_list(int list_type, int lock_state, int sysid, boolean_t use_sysid,
    pid_t pid, const vnode_t *vp, zoneid_t zoneid)
{
	lock_descriptor_t	*lock;
	lock_descriptor_t	*graph_head;
	locklist_t		listhead;	/* dummy head, on the stack */
	locklist_t		*llheadp;
	locklist_t		*llp;
	locklist_t		*lltp;		/* tail of the list so far */
	graph_t			*gp;
	int			i;
	int			first_index; /* graph index */
	int			num_indexes; /* graph index */

	ASSERT((list_type == FLK_ACTIVE_STATE) ||
	    (list_type == FLK_SLEEPING_STATE));

	/*
	 * Get a pointer to something to use as a list head while building
	 * the rest of the list.
	 */
	llheadp = &listhead;
	lltp = llheadp;
	llheadp->ll_next = (locklist_t *)NULL;

	/* Figure out which graphs we want to look at. */
	if (vp == NULL) {
		first_index = 0;
		num_indexes = HASH_SIZE;
	} else {
		first_index = HASH_INDEX(vp);
		num_indexes = 1;
	}

	for (i = first_index; i < first_index + num_indexes; i++) {
		/* lock_graph[] entries are protected by flock_lock */
		mutex_enter(&flock_lock);
		gp = lock_graph[i];
		mutex_exit(&flock_lock);
		if (gp == NULL) {
			continue;
		}

		/* only one graph is locked at a time (snapshot semantics) */
		mutex_enter(&gp->gp_mutex);
		graph_head = (list_type == FLK_ACTIVE_STATE) ?
		    ACTIVE_HEAD(gp) : SLEEPING_HEAD(gp);
		for (lock = graph_head->l_next;
		    lock != graph_head;
		    lock = lock->l_next) {
			/* apply each caller-requested filter in turn */
			if (use_sysid && lock->l_flock.l_sysid != sysid)
				continue;
			if (pid != NOPID && lock->l_flock.l_pid != pid)
				continue;
			if (vp != NULL && lock->l_vnode != vp)
				continue;
			if (lock_state && !(lock_state & lock->l_state))
				continue;
			if (zoneid != lock->l_zoneid && zoneid != ALL_ZONES)
				continue;
			/*
			 * A matching lock was found.  Allocate
			 * space for a new locklist entry and fill
			 * it in.
			 */
			llp = kmem_alloc(sizeof (locklist_t), KM_SLEEP);
			lltp->ll_next = llp;
			/* hold the vnode; the caller must VN_RELE it */
			VN_HOLD(lock->l_vnode);
			llp->ll_vp = lock->l_vnode;
			create_flock(lock, &(llp->ll_flock));
			llp->ll_next = (locklist_t *)NULL;
			lltp = llp;
		}
		mutex_exit(&gp->gp_mutex);
	}

	/* skip the dummy head; NULL if no locks matched */
	llp = llheadp->ll_next;
	return (llp);
}
3424 
3425 /*
3426  * These two functions are simply interfaces to get_lock_list.  They return
3427  * a list of sleeping or active locks for the given sysid and pid.  See
3428  * get_lock_list for details.
3429  *
3430  * In either case we don't particularly care to specify the zone of interest;
3431  * the sysid-space is global across zones, so the sysid will map to exactly one
3432  * zone, and we'll return information for that zone.
3433  */
3434 
3435 locklist_t *
flk_get_sleeping_locks(int sysid,pid_t pid)3436 flk_get_sleeping_locks(int sysid, pid_t pid)
3437 {
3438 	return (get_lock_list(FLK_SLEEPING_STATE, 0, sysid, B_TRUE, pid, NULL,
3439 	    ALL_ZONES));
3440 }
3441 
3442 locklist_t *
flk_get_active_locks(int sysid,pid_t pid)3443 flk_get_active_locks(int sysid, pid_t pid)
3444 {
3445 	return (get_lock_list(FLK_ACTIVE_STATE, 0, sysid, B_TRUE, pid, NULL,
3446 	    ALL_ZONES));
3447 }
3448 
3449 /*
3450  * Another interface to get_lock_list.  This one returns all the active
3451  * locks for a given vnode.  Again, see get_lock_list for details.
3452  *
3453  * We don't need to specify which zone's locks we're interested in.  The matter
3454  * would only be interesting if the vnode belonged to NFS, and NFS vnodes can't
3455  * be used by multiple zones, so the list of locks will all be from the right
3456  * zone.
3457  */
3458 
3459 locklist_t *
flk_active_locks_for_vp(const vnode_t * vp)3460 flk_active_locks_for_vp(const vnode_t *vp)
3461 {
3462 	return (get_lock_list(FLK_ACTIVE_STATE, 0, 0, B_FALSE, NOPID, vp,
3463 	    ALL_ZONES));
3464 }
3465 
3466 /*
3467  * Another interface to get_lock_list.  This one returns all the active
3468  * nbmand locks for a given vnode.  Again, see get_lock_list for details.
3469  *
3470  * See the comment for flk_active_locks_for_vp() for why we don't care to
3471  * specify the particular zone of interest.
3472  */
3473 locklist_t *
flk_active_nbmand_locks_for_vp(const vnode_t * vp)3474 flk_active_nbmand_locks_for_vp(const vnode_t *vp)
3475 {
3476 	return (get_lock_list(FLK_ACTIVE_STATE, NBMAND_LOCK, 0, B_FALSE,
3477 	    NOPID, vp, ALL_ZONES));
3478 }
3479 
3480 /*
3481  * Another interface to get_lock_list.  This one returns all the active
3482  * nbmand locks for a given pid.  Again, see get_lock_list for details.
3483  *
3484  * The zone doesn't need to be specified here; the locks held by a
3485  * particular process will either be local (ie, non-NFS) or from the zone
3486  * the process is executing in.  This is because other parts of the system
3487  * ensure that an NFS vnode can't be used in a zone other than that in
3488  * which it was opened.
3489  */
3490 locklist_t *
flk_active_nbmand_locks(pid_t pid)3491 flk_active_nbmand_locks(pid_t pid)
3492 {
3493 	return (get_lock_list(FLK_ACTIVE_STATE, NBMAND_LOCK, 0, B_FALSE,
3494 	    pid, NULL, ALL_ZONES));
3495 }
3496 
3497 /*
3498  * Free up all entries in the locklist.
3499  */
3500 void
flk_free_locklist(locklist_t * llp)3501 flk_free_locklist(locklist_t *llp)
3502 {
3503 	locklist_t *next_llp;
3504 
3505 	while (llp) {
3506 		next_llp = llp->ll_next;
3507 		VN_RELE(llp->ll_vp);
3508 		kmem_free(llp, sizeof (*llp));
3509 		llp = next_llp;
3510 	}
3511 }
3512 
static void
cl_flk_change_nlm_state_all_locks(int nlmid, flk_nlm_status_t nlm_state)
{
	/*
	 * For each graph "lg" in the hash table lock_graph do
	 * a.  Get the list of sleeping locks
	 * b.  For each lock descriptor in the list do
	 *	i.   If the requested lock is an NLM server request AND
	 *		the nlmid is the same as the routine argument then
	 *		change the lock descriptor's state field to
	 *		"nlm_state."
	 * c.  Get the list of active locks
	 * d.  For each lock descriptor in the list do
	 *	i.   If the requested lock is an NLM server request AND
	 *		the nlmid is the same as the routine argument then
	 *		change the lock descriptor's state field to
	 *		"nlm_state."
	 */

	int			i;
	graph_t			*gp;			/* lock graph */
	lock_descriptor_t	*lock;			/* lock */
	lock_descriptor_t	*nlock = NULL;		/* next lock */
	int			lock_nlmid;

	for (i = 0; i < HASH_SIZE; i++) {
		/* lock_graph[] entries are protected by flock_lock */
		mutex_enter(&flock_lock);
		gp = lock_graph[i];
		mutex_exit(&flock_lock);
		if (gp == NULL) {
			continue;
		}

		/* Get list of sleeping locks in current lock graph. */
		mutex_enter(&gp->gp_mutex);
		/* nlock is fetched up front so iteration survives updates */
		for (lock = SLEEPING_HEAD(gp)->l_next;
		    lock != SLEEPING_HEAD(gp);
		    lock = nlock) {
			nlock = lock->l_next;
			/* get NLM id */
			lock_nlmid = GETNLMID(lock->l_flock.l_sysid);

			/*
			 * If NLM server request AND nlmid of lock matches
			 * nlmid of argument, then set the NLM state of the
			 * lock to "nlm_state."
			 */
			if (IS_LOCKMGR(lock) && nlmid == lock_nlmid) {
				SET_NLM_STATE(lock, nlm_state);
			}
		}

		/* Get list of active locks in current lock graph. */
		for (lock = ACTIVE_HEAD(gp)->l_next;
		    lock != ACTIVE_HEAD(gp);
		    lock = nlock) {
			nlock = lock->l_next;
			/* get NLM id */
			lock_nlmid = GETNLMID(lock->l_flock.l_sysid);

			/*
			 * If NLM server request AND nlmid of lock matches
			 * nlmid of argument, then set the NLM state of the
			 * lock to "nlm_state."
			 */
			if (IS_LOCKMGR(lock) && nlmid == lock_nlmid) {
				ASSERT(IS_ACTIVE(lock));
				SET_NLM_STATE(lock, nlm_state);
			}
		}
		mutex_exit(&gp->gp_mutex);
	}
}
3586 
3587 /*
3588  * Requires: "nlmid" >= 1 and <= clconf_maximum_nodeid().
3589  * Effects: Find all sleeping lock manager requests _only_ for the NLM server
3590  *   identified by "nlmid." Poke those lock requests.
3591  */
static void
cl_flk_wakeup_sleeping_nlm_locks(int nlmid)
{
	lock_descriptor_t *lock;
	lock_descriptor_t *nlock = NULL; /* next lock */
	int i;
	graph_t *gp;
	int	lock_nlmid;

	for (i = 0; i < HASH_SIZE; i++) {
		/* lock_graph[] entries are protected by flock_lock */
		mutex_enter(&flock_lock);
		gp = lock_graph[i];
		mutex_exit(&flock_lock);
		if (gp == NULL) {
			continue;
		}

		mutex_enter(&gp->gp_mutex);
		/*
		 * Walk the sleeping list with a saved successor: the woken
		 * request may remove itself from the list.
		 */
		for (lock = SLEEPING_HEAD(gp)->l_next;
		    lock != SLEEPING_HEAD(gp);
		    lock = nlock) {
			nlock = lock->l_next;
			/*
			 * If NLM server request _and_ nlmid of lock matches
			 * nlmid of argument, then set the NLM state of the
			 * lock to NLM_SHUTTING_DOWN, and wake up sleeping
			 * request.
			 */
			if (IS_LOCKMGR(lock)) {
				/* get NLM id */
				lock_nlmid =
				    GETNLMID(lock->l_flock.l_sysid);
				if (nlmid == lock_nlmid) {
					SET_NLM_STATE(lock,
					    FLK_NLM_SHUTTING_DOWN);
					INTERRUPT_WAKEUP(lock);
				}
			}
		}
		mutex_exit(&gp->gp_mutex);
	}
}
3634 
3635 /*
3636  * Requires: "nlmid" >= 1 and <= clconf_maximum_nodeid()
3637  * Effects:  Find all active (granted) lock manager locks _only_ for the
3638  *   NLM server identified by "nlmid" and release them.
3639  */
static void
cl_flk_unlock_nlm_granted(int nlmid)
{
	lock_descriptor_t *lock;
	lock_descriptor_t *nlock = NULL; /* next lock */
	int i;
	graph_t *gp;
	int	lock_nlmid;

	for (i = 0; i < HASH_SIZE; i++) {
		/* lock_graph[] entries are protected by flock_lock */
		mutex_enter(&flock_lock);
		gp = lock_graph[i];
		mutex_exit(&flock_lock);
		if (gp == NULL) {
			continue;
		}

		mutex_enter(&gp->gp_mutex);
		/*
		 * Use a saved successor: flk_delete_active_lock() unlinks
		 * the current descriptor and flk_free_lock() frees it.
		 */
		for (lock = ACTIVE_HEAD(gp)->l_next;
		    lock != ACTIVE_HEAD(gp);
		    lock = nlock) {
			nlock = lock->l_next;
			ASSERT(IS_ACTIVE(lock));

			/*
			 * If it's an  NLM server request _and_ nlmid of
			 * the lock matches nlmid of argument, then
			 * remove the active lock from the list, wake up
			 * blocked threads, and free the storage for the
			 * lock.  Note that there's no need to mark the NLM
			 * state of this lock to NLM_DOWN because the lock
			 * will be deleted anyway and its storage freed.
			 */
			if (IS_LOCKMGR(lock)) {
				/* get NLM id */
				lock_nlmid = GETNLMID(lock->l_flock.l_sysid);
				if (nlmid == lock_nlmid) {
					flk_delete_active_lock(lock, 0);
					flk_wakeup(lock, 1);
					flk_free_lock(lock);
				}
			}
		}
		mutex_exit(&gp->gp_mutex);
	}
}
3686 
3687 /*
3688  * Find all sleeping lock manager requests and poke them.
3689  */
static void
wakeup_sleeping_lockmgr_locks(struct flock_globals *fg)
{
	lock_descriptor_t *lock;
	lock_descriptor_t *nlock = NULL; /* next lock */
	int i;
	graph_t *gp;
	zoneid_t zoneid = getzoneid();	/* only this zone's locks */

	for (i = 0; i < HASH_SIZE; i++) {
		/* lock_graph[] entries are protected by flock_lock */
		mutex_enter(&flock_lock);
		gp = lock_graph[i];
		mutex_exit(&flock_lock);
		if (gp == NULL) {
			continue;
		}

		mutex_enter(&gp->gp_mutex);
		/* update the graph's copy of the status under its mutex */
		fg->lockmgr_status[i] = FLK_WAKEUP_SLEEPERS;
		/* saved successor: the woken request may unlink itself */
		for (lock = SLEEPING_HEAD(gp)->l_next;
		    lock != SLEEPING_HEAD(gp);
		    lock = nlock) {
			nlock = lock->l_next;
			if (IS_LOCKMGR(lock) && lock->l_zoneid == zoneid) {
				INTERRUPT_WAKEUP(lock);
			}
		}
		mutex_exit(&gp->gp_mutex);
	}
}
3720 
3721 
3722 /*
3723  * Find all active (granted) lock manager locks and release them.
3724  */
static void
unlock_lockmgr_granted(struct flock_globals *fg)
{
	lock_descriptor_t *lock;
	lock_descriptor_t *nlock = NULL; /* next lock */
	int i;
	graph_t *gp;
	zoneid_t zoneid = getzoneid();	/* only this zone's locks */

	for (i = 0; i < HASH_SIZE; i++) {
		/* lock_graph[] entries are protected by flock_lock */
		mutex_enter(&flock_lock);
		gp = lock_graph[i];
		mutex_exit(&flock_lock);
		if (gp == NULL) {
			continue;
		}

		mutex_enter(&gp->gp_mutex);
		/* update the graph's copy of the status under its mutex */
		fg->lockmgr_status[i] = FLK_LOCKMGR_DOWN;
		/*
		 * Saved successor: flk_delete_active_lock() unlinks the
		 * current descriptor and flk_free_lock() frees it.
		 */
		for (lock = ACTIVE_HEAD(gp)->l_next;
		    lock != ACTIVE_HEAD(gp);
		    lock = nlock) {
			nlock = lock->l_next;
			if (IS_LOCKMGR(lock) && lock->l_zoneid == zoneid) {
				ASSERT(IS_ACTIVE(lock));
				flk_delete_active_lock(lock, 0);
				flk_wakeup(lock, 1);
				flk_free_lock(lock);
			}
		}
		mutex_exit(&gp->gp_mutex);
	}
}
3758 
3759 
3760 /*
3761  * Wait until a lock is granted, cancelled, or interrupted.
3762  */
3763 
3764 static void
wait_for_lock(lock_descriptor_t * request)3765 wait_for_lock(lock_descriptor_t *request)
3766 {
3767 	graph_t *gp = request->l_graph;
3768 
3769 	ASSERT(MUTEX_HELD(&gp->gp_mutex));
3770 
3771 	while (!(IS_GRANTED(request)) && !(IS_CANCELLED(request)) &&
3772 	    !(IS_INTERRUPTED(request))) {
3773 		if (!cv_wait_sig(&request->l_cv, &gp->gp_mutex)) {
3774 			flk_set_state(request, FLK_INTERRUPTED_STATE);
3775 			request->l_state |= INTERRUPTED_LOCK;
3776 		}
3777 	}
3778 }
3779 
3780 /*
3781  * Create an flock structure from the existing lock information
3782  *
3783  * This routine is used to create flock structures for the lock manager
3784  * to use in a reclaim request.  Since the lock was originated on this
3785  * host, it must be conforming to UNIX semantics, so no checking is
3786  * done to make sure it falls within the lower half of the 32-bit range.
3787  */
3788 
3789 static void
create_flock(lock_descriptor_t * lp,flock64_t * flp)3790 create_flock(lock_descriptor_t *lp, flock64_t *flp)
3791 {
3792 	ASSERT(lp->l_end == MAX_U_OFFSET_T || lp->l_end <= MAXEND);
3793 	ASSERT(lp->l_end >= lp->l_start);
3794 
3795 	flp->l_type = lp->l_type;
3796 	flp->l_whence = 0;
3797 	flp->l_start = lp->l_start;
3798 	flp->l_len = (lp->l_end == MAX_U_OFFSET_T) ? 0 :
3799 	    (lp->l_end - lp->l_start + 1);
3800 	flp->l_sysid = lp->l_flock.l_sysid;
3801 	flp->l_pid = lp->l_flock.l_pid;
3802 }
3803 
3804 /*
3805  * Convert flock_t data describing a lock range into unsigned long starting
3806  * and ending points, which are put into lock_request.  Returns 0 or an
3807  * errno value.
3808  */
3809 
3810 int
flk_convert_lock_data(vnode_t * vp,flock64_t * flp,u_offset_t * start,u_offset_t * end,offset_t offset)3811 flk_convert_lock_data(vnode_t *vp, flock64_t *flp,
3812     u_offset_t *start, u_offset_t *end, offset_t offset)
3813 {
3814 	struct vattr	vattr;
3815 	int	error;
3816 
3817 	/*
3818 	 * Determine the starting point of the request
3819 	 */
3820 	switch (flp->l_whence) {
3821 	case 0:		/* SEEK_SET */
3822 		*start = (u_offset_t)flp->l_start;
3823 		break;
3824 	case 1:		/* SEEK_CUR */
3825 		*start = (u_offset_t)(flp->l_start + offset);
3826 		break;
3827 	case 2:		/* SEEK_END */
3828 		vattr.va_mask = AT_SIZE;
3829 		if (error = VOP_GETATTR(vp, &vattr, 0, CRED(), NULL))
3830 			return (error);
3831 		*start = (u_offset_t)(flp->l_start + vattr.va_size);
3832 		break;
3833 	default:
3834 		return (EINVAL);
3835 	}
3836 
3837 	/*
3838 	 * Determine the range covered by the request.
3839 	 */
3840 	if (flp->l_len == 0)
3841 		*end = MAX_U_OFFSET_T;
3842 	else if ((offset_t)flp->l_len > 0) {
3843 		*end = (u_offset_t)(*start + (flp->l_len - 1));
3844 	} else {
3845 		/*
3846 		 * Negative length; why do we even allow this ?
3847 		 * Because this allows easy specification of
3848 		 * the last n bytes of the file.
3849 		 */
3850 		*end = *start;
3851 		*start += (u_offset_t)flp->l_len;
3852 		(*start)++;
3853 	}
3854 	return (0);
3855 }
3856 
3857 /*
3858  * Check the validity of lock data.  This can used by the NFS
3859  * frlock routines to check data before contacting the server.  The
3860  * server must support semantics that aren't as restrictive as
3861  * the UNIX API, so the NFS client is required to check.
3862  * The maximum is now passed in by the caller.
3863  */
3864 
3865 int
flk_check_lock_data(u_offset_t start,u_offset_t end,offset_t max)3866 flk_check_lock_data(u_offset_t start, u_offset_t end, offset_t max)
3867 {
3868 	/*
3869 	 * The end (length) for local locking should never be greater
3870 	 * than MAXEND. However, the representation for
3871 	 * the entire file is MAX_U_OFFSET_T.
3872 	 */
3873 	if ((start > max) ||
3874 	    ((end > max) && (end != MAX_U_OFFSET_T))) {
3875 		return (EINVAL);
3876 	}
3877 	if (start > end) {
3878 		return (EINVAL);
3879 	}
3880 	return (0);
3881 }
3882 
3883 /*
3884  * Fill in request->l_flock with information about the lock blocking the
3885  * request.  The complexity here is that lock manager requests are allowed
3886  * to see into the upper part of the 32-bit address range, whereas local
3887  * requests are only allowed to see signed values.
3888  *
3889  * What should be done when "blocker" is a lock manager lock that uses the
3890  * upper portion of the 32-bit range, but "request" is local?  Since the
3891  * request has already been determined to have been blocked by the blocker,
3892  * at least some portion of "blocker" must be in the range of the request,
3893  * or the request extends to the end of file.  For the first case, the
3894  * portion in the lower range is returned with the indication that it goes
3895  * "to EOF."  For the second case, the last byte of the lower range is
3896  * returned with the indication that it goes "to EOF."
3897  */
3898 
3899 static void
report_blocker(lock_descriptor_t * blocker,lock_descriptor_t * request)3900 report_blocker(lock_descriptor_t *blocker, lock_descriptor_t *request)
3901 {
3902 	flock64_t *flrp;			/* l_flock portion of request */
3903 
3904 	ASSERT(blocker != NULL);
3905 
3906 	flrp = &request->l_flock;
3907 	flrp->l_whence = 0;
3908 	flrp->l_type = blocker->l_type;
3909 	flrp->l_pid = blocker->l_flock.l_pid;
3910 	flrp->l_sysid = blocker->l_flock.l_sysid;
3911 
3912 	if (IS_LOCKMGR(request)) {
3913 		flrp->l_start = blocker->l_start;
3914 		if (blocker->l_end == MAX_U_OFFSET_T)
3915 			flrp->l_len = 0;
3916 		else
3917 			flrp->l_len = blocker->l_end - blocker->l_start + 1;
3918 	} else {
3919 		if (blocker->l_start > MAXEND) {
3920 			flrp->l_start = MAXEND;
3921 			flrp->l_len = 0;
3922 		} else {
3923 			flrp->l_start = blocker->l_start;
3924 			if (blocker->l_end == MAX_U_OFFSET_T)
3925 				flrp->l_len = 0;
3926 			else
3927 				flrp->l_len = blocker->l_end -
3928 				    blocker->l_start + 1;
3929 		}
3930 	}
3931 }
3932 
3933 /*
3934  * PSARC case 1997/292
3935  */
3936 /*
3937  * This is the public routine exported by flock.h.
3938  */
3939 void
cl_flk_change_nlm_state_to_unknown(int nlmid)3940 cl_flk_change_nlm_state_to_unknown(int nlmid)
3941 {
3942 	/*
3943 	 * Check to see if node is booted as a cluster. If not, return.
3944 	 */
3945 	if ((cluster_bootflags & CLUSTER_BOOTED) == 0) {
3946 		return;
3947 	}
3948 
3949 	/*
3950 	 * See comment in cl_flk_set_nlm_status().
3951 	 */
3952 	if (nlm_reg_status == NULL) {
3953 		return;
3954 	}
3955 
3956 	/*
3957 	 * protect NLM registry state with a mutex.
3958 	 */
3959 	ASSERT(nlmid <= nlm_status_size && nlmid >= 0);
3960 	mutex_enter(&nlm_reg_lock);
3961 	FLK_REGISTRY_CHANGE_NLM_STATE(nlm_reg_status, nlmid, FLK_NLM_UNKNOWN);
3962 	mutex_exit(&nlm_reg_lock);
3963 }
3964 
3965 /*
3966  * Return non-zero if the given I/O request conflicts with an active NBMAND
3967  * lock.
3968  * If svmand is non-zero, it means look at all active locks, not just NBMAND
3969  * locks.
3970  */
3971 
3972 int
nbl_lock_conflict(vnode_t * vp,nbl_op_t op,u_offset_t offset,ssize_t length,int svmand,caller_context_t * ct)3973 nbl_lock_conflict(vnode_t *vp, nbl_op_t op, u_offset_t offset,
3974     ssize_t length, int svmand, caller_context_t *ct)
3975 {
3976 	int conflict = 0;
3977 	graph_t			*gp;
3978 	lock_descriptor_t	*lock;
3979 	pid_t pid;
3980 	int sysid;
3981 
3982 	if (ct == NULL) {
3983 		pid = curproc->p_pid;
3984 		sysid = 0;
3985 	} else {
3986 		pid = ct->cc_pid;
3987 		sysid = ct->cc_sysid;
3988 	}
3989 
3990 	mutex_enter(&flock_lock);
3991 	gp = lock_graph[HASH_INDEX(vp)];
3992 	mutex_exit(&flock_lock);
3993 	if (gp == NULL)
3994 		return (0);
3995 
3996 	mutex_enter(&gp->gp_mutex);
3997 	SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
3998 
3999 	for (; lock && lock->l_vnode == vp; lock = lock->l_next) {
4000 		if ((svmand || (lock->l_state & NBMAND_LOCK)) &&
4001 		    (lock->l_flock.l_sysid != sysid ||
4002 		    lock->l_flock.l_pid != pid) &&
4003 		    lock_blocks_io(op, offset, length,
4004 		    lock->l_type, lock->l_start, lock->l_end)) {
4005 			conflict = 1;
4006 			break;
4007 		}
4008 	}
4009 	mutex_exit(&gp->gp_mutex);
4010 
4011 	return (conflict);
4012 }
4013 
4014 /*
4015  * Return non-zero if the given I/O request conflicts with the given lock.
4016  */
4017 
4018 static int
lock_blocks_io(nbl_op_t op,u_offset_t offset,ssize_t length,int lock_type,u_offset_t lock_start,u_offset_t lock_end)4019 lock_blocks_io(nbl_op_t op, u_offset_t offset, ssize_t length,
4020     int lock_type, u_offset_t lock_start, u_offset_t lock_end)
4021 {
4022 	ASSERT(op == NBL_READ || op == NBL_WRITE || op == NBL_READWRITE);
4023 	ASSERT(lock_type == F_RDLCK || lock_type == F_WRLCK);
4024 
4025 	if (op == NBL_READ && lock_type == F_RDLCK)
4026 		return (0);
4027 
4028 	if (offset <= lock_start && lock_start < offset + length)
4029 		return (1);
4030 	if (lock_start <= offset && offset <= lock_end)
4031 		return (1);
4032 
4033 	return (0);
4034 }
4035 
4036 #ifdef DEBUG
/*
 * DEBUG invariant check: every lock on the active list must be granted,
 * unblocked, and non-barrier; its in-edges must come from blocked
 * sleeping locks; and no two active locks on the same vnode may block
 * each other (panics if they do).
 */
static void
check_active_locks(graph_t *gp)
{
	lock_descriptor_t *lock, *lock1;
	edge_t	*ep;

	for (lock = ACTIVE_HEAD(gp)->l_next; lock != ACTIVE_HEAD(gp);
	    lock = lock->l_next) {
		ASSERT(IS_ACTIVE(lock));
		ASSERT(NOT_BLOCKED(lock));
		ASSERT(!IS_BARRIER(lock));

		ep = FIRST_IN(lock);

		/* every in-edge must originate at a blocked sleeper */
		while (ep != HEAD(lock)) {
			ASSERT(IS_SLEEPING(ep->from_vertex));
			ASSERT(!NOT_BLOCKED(ep->from_vertex));
			ep = NEXT_IN(ep);
		}

		/* no pair of active locks on one vnode may conflict */
		for (lock1 = lock->l_next; lock1 != ACTIVE_HEAD(gp);
		    lock1 = lock1->l_next) {
			if (lock1->l_vnode == lock->l_vnode) {
			if (BLOCKS(lock1, lock)) {
				cmn_err(CE_PANIC,
				    "active lock %p blocks %p",
				    (void *)lock1, (void *)lock);
			} else if (BLOCKS(lock, lock1)) {
				cmn_err(CE_PANIC,
				    "active lock %p blocks %p",
				    (void *)lock, (void *)lock1);
			}
			}
		}
	}
}
4073 
4074 /*
4075  * Effect: This functions checks to see if the transition from 'old_state' to
4076  *	'new_state' is a valid one.  It returns 0 if the transition is valid
4077  *	and 1 if it is not.
4078  *	For a map of valid transitions, see sys/flock_impl.h
4079  */
4080 static int
check_lock_transition(int old_state,int new_state)4081 check_lock_transition(int old_state, int new_state)
4082 {
4083 	switch (old_state) {
4084 	case FLK_INITIAL_STATE:
4085 		if ((new_state == FLK_START_STATE) ||
4086 		    (new_state == FLK_SLEEPING_STATE) ||
4087 		    (new_state == FLK_ACTIVE_STATE) ||
4088 		    (new_state == FLK_DEAD_STATE)) {
4089 			return (0);
4090 		} else {
4091 			return (1);
4092 		}
4093 	case FLK_START_STATE:
4094 		if ((new_state == FLK_ACTIVE_STATE) ||
4095 		    (new_state == FLK_DEAD_STATE)) {
4096 			return (0);
4097 		} else {
4098 			return (1);
4099 		}
4100 	case FLK_ACTIVE_STATE:
4101 		if (new_state == FLK_DEAD_STATE) {
4102 			return (0);
4103 		} else {
4104 			return (1);
4105 		}
4106 	case FLK_SLEEPING_STATE:
4107 		if ((new_state == FLK_GRANTED_STATE) ||
4108 		    (new_state == FLK_INTERRUPTED_STATE) ||
4109 		    (new_state == FLK_CANCELLED_STATE)) {
4110 			return (0);
4111 		} else {
4112 			return (1);
4113 		}
4114 	case FLK_GRANTED_STATE:
4115 		if ((new_state == FLK_START_STATE) ||
4116 		    (new_state == FLK_INTERRUPTED_STATE) ||
4117 		    (new_state == FLK_CANCELLED_STATE)) {
4118 			return (0);
4119 		} else {
4120 			return (1);
4121 		}
4122 	case FLK_CANCELLED_STATE:
4123 		if ((new_state == FLK_INTERRUPTED_STATE) ||
4124 		    (new_state == FLK_DEAD_STATE)) {
4125 			return (0);
4126 		} else {
4127 			return (1);
4128 		}
4129 	case FLK_INTERRUPTED_STATE:
4130 		if (new_state == FLK_DEAD_STATE) {
4131 			return (0);
4132 		} else {
4133 			return (1);
4134 		}
4135 	case FLK_DEAD_STATE:
4136 		/* May be set more than once */
4137 		if (new_state == FLK_DEAD_STATE) {
4138 			return (0);
4139 		} else {
4140 			return (1);
4141 		}
4142 	default:
4143 		return (1);
4144 	}
4145 }
4146 
4147 static void
check_sleeping_locks(graph_t * gp)4148 check_sleeping_locks(graph_t *gp)
4149 {
4150 	lock_descriptor_t *lock1, *lock2;
4151 	edge_t *ep;
4152 	for (lock1 = SLEEPING_HEAD(gp)->l_next; lock1 != SLEEPING_HEAD(gp);
4153 	    lock1 = lock1->l_next) {
4154 				ASSERT(!IS_BARRIER(lock1));
4155 	for (lock2 = lock1->l_next; lock2 != SLEEPING_HEAD(gp);
4156 	    lock2 = lock2->l_next) {
4157 		if (lock1->l_vnode == lock2->l_vnode) {
4158 			if (BLOCKS(lock2, lock1)) {
4159 				ASSERT(!IS_GRANTED(lock1));
4160 				ASSERT(!NOT_BLOCKED(lock1));
4161 				path(lock1, lock2);
4162 			}
4163 		}
4164 	}
4165 
4166 	for (lock2 = ACTIVE_HEAD(gp)->l_next; lock2 != ACTIVE_HEAD(gp);
4167 	    lock2 = lock2->l_next) {
4168 				ASSERT(!IS_BARRIER(lock1));
4169 		if (lock1->l_vnode == lock2->l_vnode) {
4170 			if (BLOCKS(lock2, lock1)) {
4171 				ASSERT(!IS_GRANTED(lock1));
4172 				ASSERT(!NOT_BLOCKED(lock1));
4173 				path(lock1, lock2);
4174 			}
4175 		}
4176 	}
4177 	ep = FIRST_ADJ(lock1);
4178 	while (ep != HEAD(lock1)) {
4179 		ASSERT(BLOCKS(ep->to_vertex, lock1));
4180 		ep = NEXT_ADJ(ep);
4181 	}
4182 	}
4183 }
4184 
4185 static int
level_two_path(lock_descriptor_t * lock1,lock_descriptor_t * lock2,int no_path)4186 level_two_path(lock_descriptor_t *lock1, lock_descriptor_t *lock2, int no_path)
4187 {
4188 	edge_t	*ep;
4189 	lock_descriptor_t	*vertex;
4190 	lock_descriptor_t *vertex_stack;
4191 
4192 	STACK_INIT(vertex_stack);
4193 
4194 	flk_graph_uncolor(lock1->l_graph);
4195 	ep = FIRST_ADJ(lock1);
4196 	ASSERT(ep != HEAD(lock1));
4197 	while (ep != HEAD(lock1)) {
4198 		if (no_path)
4199 			ASSERT(ep->to_vertex != lock2);
4200 		STACK_PUSH(vertex_stack, ep->to_vertex, l_dstack);
4201 		COLOR(ep->to_vertex);
4202 		ep = NEXT_ADJ(ep);
4203 	}
4204 
4205 	while ((vertex = STACK_TOP(vertex_stack)) != NULL) {
4206 		STACK_POP(vertex_stack, l_dstack);
4207 		for (ep = FIRST_ADJ(vertex); ep != HEAD(vertex);
4208 		    ep = NEXT_ADJ(ep)) {
4209 			if (COLORED(ep->to_vertex))
4210 				continue;
4211 			COLOR(ep->to_vertex);
4212 			if (ep->to_vertex == lock2)
4213 				return (1);
4214 
4215 			STACK_PUSH(vertex_stack, ep->to_vertex, l_dstack);
4216 		}
4217 	}
4218 	return (0);
4219 }
4220 
4221 static void
check_owner_locks(graph_t * gp,pid_t pid,int sysid,vnode_t * vp)4222 check_owner_locks(graph_t *gp, pid_t pid, int sysid, vnode_t *vp)
4223 {
4224 	lock_descriptor_t *lock;
4225 
4226 	SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
4227 
4228 	if (lock) {
4229 		while (lock != ACTIVE_HEAD(gp) && (lock->l_vnode == vp)) {
4230 			if (lock->l_flock.l_pid == pid &&
4231 			    lock->l_flock.l_sysid == sysid)
4232 				cmn_err(CE_PANIC,
4233 				    "owner pid %d's lock %p in active queue",
4234 				    pid, (void *)lock);
4235 			lock = lock->l_next;
4236 		}
4237 	}
4238 	SET_LOCK_TO_FIRST_SLEEP_VP(gp, lock, vp);
4239 
4240 	if (lock) {
4241 		while (lock != SLEEPING_HEAD(gp) && (lock->l_vnode == vp)) {
4242 			if (lock->l_flock.l_pid == pid &&
4243 			    lock->l_flock.l_sysid == sysid)
4244 				cmn_err(CE_PANIC,
4245 				    "owner pid %d's lock %p in sleep queue",
4246 				    pid, (void *)lock);
4247 			lock = lock->l_next;
4248 		}
4249 	}
4250 }
4251 
4252 static int
level_one_path(lock_descriptor_t * lock1,lock_descriptor_t * lock2)4253 level_one_path(lock_descriptor_t *lock1, lock_descriptor_t *lock2)
4254 {
4255 	edge_t *ep = FIRST_ADJ(lock1);
4256 
4257 	while (ep != HEAD(lock1)) {
4258 		if (ep->to_vertex == lock2)
4259 			return (1);
4260 		else
4261 			ep = NEXT_ADJ(ep);
4262 	}
4263 	return (0);
4264 }
4265 
4266 static int
no_path(lock_descriptor_t * lock1,lock_descriptor_t * lock2)4267 no_path(lock_descriptor_t *lock1, lock_descriptor_t *lock2)
4268 {
4269 	return (!level_two_path(lock1, lock2, 1));
4270 }
4271 
4272 static void
path(lock_descriptor_t * lock1,lock_descriptor_t * lock2)4273 path(lock_descriptor_t *lock1, lock_descriptor_t *lock2)
4274 {
4275 	if (level_one_path(lock1, lock2)) {
4276 		if (level_two_path(lock1, lock2, 0) != 0) {
4277 			cmn_err(CE_WARN,
4278 			    "one edge one path from lock1 %p lock2 %p",
4279 			    (void *)lock1, (void *)lock2);
4280 		}
4281 	} else if (no_path(lock1, lock2)) {
4282 		cmn_err(CE_PANIC,
4283 		    "No path from  lock1 %p to lock2 %p",
4284 		    (void *)lock1, (void *)lock2);
4285 	}
4286 }
4287 #endif /* DEBUG */
4288