1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22 /*
23 * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
24 * Use is subject to license terms.
25 */
26
27 /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
28 /* All Rights Reserved */
29
30 /*
31 * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
32 */
33
34 #include <sys/flock_impl.h>
35 #include <sys/vfs.h>
36 #include <sys/t_lock.h> /* for <sys/callb.h> */
37 #include <sys/callb.h>
38 #include <sys/clconf.h>
39 #include <sys/cladm.h>
40 #include <sys/nbmlock.h>
41 #include <sys/cred.h>
42 #include <sys/policy.h>
43
44 /*
45  * The following variables are for statistics purposes and they are
46  * not protected by locks. They may not be exactly accurate but will
47  * be close to the actual values.
48 */
49
50 int flk_lock_allocs;
51 int flk_lock_frees;
52 int edge_allocs;
53 int edge_frees;
54 int flk_proc_vertex_allocs;
55 int flk_proc_edge_allocs;
56 int flk_proc_vertex_frees;
57 int flk_proc_edge_frees;
58
59 static kmutex_t flock_lock;
60
61 #ifdef DEBUG
62 int check_debug = 0;
63 #define CHECK_ACTIVE_LOCKS(gp) if (check_debug) \
64 check_active_locks(gp);
65 #define CHECK_SLEEPING_LOCKS(gp) if (check_debug) \
66 check_sleeping_locks(gp);
67 #define CHECK_OWNER_LOCKS(gp, pid, sysid, vp) \
68 if (check_debug) \
69 check_owner_locks(gp, pid, sysid, vp);
70 #define CHECK_LOCK_TRANSITION(old_state, new_state) \
71 { \
72 if (check_lock_transition(old_state, new_state)) { \
73 cmn_err(CE_PANIC, "Illegal lock transition \
74 from %d to %d", old_state, new_state); \
75 } \
76 }
77 #else
78
79 #define CHECK_ACTIVE_LOCKS(gp)
80 #define CHECK_SLEEPING_LOCKS(gp)
81 #define CHECK_OWNER_LOCKS(gp, pid, sysid, vp)
82 #define CHECK_LOCK_TRANSITION(old_state, new_state)
83
84 #endif /* DEBUG */
85
86 struct kmem_cache *flk_edge_cache;
87
88 graph_t *lock_graph[HASH_SIZE];
89 proc_graph_t pgraph;
90
91 /*
92 * Clustering.
93 *
94 * NLM REGISTRY TYPE IMPLEMENTATION
95 *
96 * Assumptions:
97 * 1. Nodes in a cluster are numbered starting at 1; always non-negative
98 * integers; maximum node id is returned by clconf_maximum_nodeid().
99 * 2. We use this node id to identify the node an NLM server runs on.
100 */
101
102 /*
103  * The NLM registry object keeps track of the NLM servers that have
104  * requested locks at the LLM with which this registry is associated.
105  * Servers are tracked via their nlmids, which are the node ids of the
106  * cluster nodes they run on.
107 *
108 * Representation of abstraction:
109 * rep = record[ states: array[nlm_state],
110 * lock: mutex]
111 *
112 * Representation invariants:
113  * 1. index i of rep.states is between 0 and n - 1, where n is the number
114  *    of elements in the array, which happens to be the maximum number
115  *    of nodes in the cluster configuration + 1.
116  * 2. the mapping from nlmid to index i of rep.states is the identity:
117  *    0 -> 0
118  *    1 -> 1
119  *    2 -> 2
120  *    clconf_maximum_nodeid() -> n - 1
121  * 3. This 1-1 mapping is quite convenient and it avoids errors resulting
122  *    from forgetting to subtract 1 from the index.
123 * 4. The reason we keep the 0th index is the following. A legitimate
124 * cluster configuration includes making a UFS file system NFS
125 * exportable. The code is structured so that if you're in a cluster
126 * you do one thing; otherwise, you do something else. The problem
127 * is what to do if you think you're in a cluster with PXFS loaded,
128 * but you're using UFS not PXFS? The upper two bytes of the sysid
129 * encode the node id of the node where NLM server runs; these bytes
130  * encode the node id of the node where the NLM server runs; these bytes
131 * registry, we can record the NLM server state information at index
132 * 0 using the same mechanism used for PXFS file locks!
133 */
134 static flk_nlm_status_t *nlm_reg_status = NULL; /* state array 0..N-1 */
135 static kmutex_t nlm_reg_lock;	/* lock to protect array */
136 static uint_t nlm_status_size; /* size of state array */
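/*
 * Illustrative sketch (not part of the build): how a sysid maps onto the
 * registry above.  GETNLMID() extracts the node id from the upper bytes of
 * the sysid, and that node id is used directly as the index into
 * nlm_reg_status; local/UFS requests (node id 0) land in slot 0.
 *
 *	int nlmid = GETNLMID(lock->l_flock.l_sysid);
 *	ASSERT(nlmid >= 0 && nlmid < nlm_status_size);
 *	mutex_enter(&nlm_reg_lock);
 *	state = nlm_reg_status[nlmid];
 *	mutex_exit(&nlm_reg_lock);
 */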
137
138 /*
139 * Although we need a global lock dependency graph (and associated data
140 * structures), we also need a per-zone notion of whether the lock manager is
141 * running, and so whether to allow lock manager requests or not.
142 *
143 * Thus, on a per-zone basis we maintain a ``global'' variable
144 * (flk_lockmgr_status), protected by flock_lock, and set when the lock
145 * manager is determined to be changing state (starting or stopping).
146 *
147 * Each graph/zone pair also has a copy of this variable, which is protected by
148 * the graph's mutex.
149 *
150 * The per-graph copies are used to synchronize lock requests with shutdown
151 * requests. The global copy is used to initialize the per-graph field when a
152 * new graph is created.
153 */
154 struct flock_globals {
155 flk_lockmgr_status_t flk_lockmgr_status;
156 flk_lockmgr_status_t lockmgr_status[HASH_SIZE];
157 };
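/*
 * Illustrative sketch of the locking rules described above: readers of the
 * per-zone "global" status take flock_lock, while readers of a per-graph
 * copy take that graph's mutex.  ('fg', 'gp' and 'vp' stand for the usual
 * flock_globals, graph and vnode pointers.)
 *
 *	mutex_enter(&flock_lock);
 *	status = flk_get_lockmgr_status();
 *	mutex_exit(&flock_lock);
 *
 *	mutex_enter(&gp->gp_mutex);
 *	status = fg->lockmgr_status[HASH_INDEX(vp)];
 *	mutex_exit(&gp->gp_mutex);
 */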
158
159 zone_key_t flock_zone_key;
160
161 static void create_flock(lock_descriptor_t *, flock64_t *);
162 static lock_descriptor_t *flk_get_lock(void);
163 static void flk_free_lock(lock_descriptor_t *lock);
164 static void flk_get_first_blocking_lock(lock_descriptor_t *request);
165 static int flk_process_request(lock_descriptor_t *);
166 static int flk_add_edge(lock_descriptor_t *, lock_descriptor_t *, int, int);
167 static edge_t *flk_get_edge(void);
168 static int flk_wait_execute_request(lock_descriptor_t *);
169 static int flk_relation(lock_descriptor_t *, lock_descriptor_t *);
170 static void flk_insert_active_lock(lock_descriptor_t *);
171 static void flk_delete_active_lock(lock_descriptor_t *, int);
172 static void flk_insert_sleeping_lock(lock_descriptor_t *);
173 static void flk_graph_uncolor(graph_t *);
174 static void flk_wakeup(lock_descriptor_t *, int);
175 static void flk_free_edge(edge_t *);
176 static void flk_recompute_dependencies(lock_descriptor_t *,
177 lock_descriptor_t **, int, int);
178 static int flk_find_barriers(lock_descriptor_t *);
179 static void flk_update_barriers(lock_descriptor_t *);
180 static int flk_color_reachables(lock_descriptor_t *);
181 static int flk_canceled(lock_descriptor_t *);
182 static void flk_delete_locks_by_sysid(lock_descriptor_t *);
183 static void report_blocker(lock_descriptor_t *, lock_descriptor_t *);
184 static void wait_for_lock(lock_descriptor_t *);
185 static void unlock_lockmgr_granted(struct flock_globals *);
186 static void wakeup_sleeping_lockmgr_locks(struct flock_globals *);
187
188 /* Clustering hooks */
189 static void cl_flk_change_nlm_state_all_locks(int, flk_nlm_status_t);
190 static void cl_flk_wakeup_sleeping_nlm_locks(int);
191 static void cl_flk_unlock_nlm_granted(int);
192
193 #ifdef DEBUG
194 static int check_lock_transition(int, int);
195 static void check_sleeping_locks(graph_t *);
196 static void check_active_locks(graph_t *);
197 static int no_path(lock_descriptor_t *, lock_descriptor_t *);
198 static void path(lock_descriptor_t *, lock_descriptor_t *);
199 static void check_owner_locks(graph_t *, pid_t, int, vnode_t *);
200 static int level_one_path(lock_descriptor_t *, lock_descriptor_t *);
201 static int level_two_path(lock_descriptor_t *, lock_descriptor_t *, int);
202 #endif
203
204 /* proc_graph function definitions */
205 static int flk_check_deadlock(lock_descriptor_t *);
206 static void flk_proc_graph_uncolor(void);
207 static proc_vertex_t *flk_get_proc_vertex(lock_descriptor_t *);
208 static proc_edge_t *flk_get_proc_edge(void);
209 static void flk_proc_release(proc_vertex_t *);
210 static void flk_free_proc_edge(proc_edge_t *);
211 static void flk_update_proc_graph(edge_t *, int);
212
213 /* Non-blocking mandatory locking */
214 static int lock_blocks_io(nbl_op_t, u_offset_t, ssize_t, int, u_offset_t,
215 u_offset_t);
216
217 static struct flock_globals *
218 flk_get_globals(void)
219 {
220 /*
221 * The KLM module had better be loaded if we're attempting to handle
222 * lockmgr requests.
223 */
224 ASSERT(flock_zone_key != ZONE_KEY_UNINITIALIZED);
225 return (zone_getspecific(flock_zone_key, curproc->p_zone));
226 }
227
228 static flk_lockmgr_status_t
229 flk_get_lockmgr_status(void)
230 {
231 struct flock_globals *fg;
232
233 ASSERT(MUTEX_HELD(&flock_lock));
234
235 if (flock_zone_key == ZONE_KEY_UNINITIALIZED) {
236 /*
237 * KLM module not loaded; lock manager definitely not running.
238 */
239 return (FLK_LOCKMGR_DOWN);
240 }
241 fg = flk_get_globals();
242 return (fg->flk_lockmgr_status);
243 }
244
245 /*
246 * Routine called from fs_frlock in fs/fs_subr.c
247 */
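/*
 * Hedged usage sketch (caller side, not part of this file): a file system's
 * VOP_FRLOCK handler typically forwards to fs_frlock(), which validates the
 * arguments and then ends up invoking reclock() roughly as follows, where
 * 'bfp' and 'flk_cbp' are the caller's flock64_t and callback list:
 *
 *	error = reclock(vp, bfp, cmd, flag, offset, flk_cbp);
 */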
248
249 int
250 reclock(vnode_t *vp, flock64_t *lckdat, int cmd, int flag, u_offset_t offset,
251 flk_callback_t *flk_cbp)
252 {
253 lock_descriptor_t stack_lock_request;
254 lock_descriptor_t *lock_request;
255 int error = 0;
256 graph_t *gp;
257 int nlmid;
258
259 /*
260 * Check access permissions
261 */
262 if ((cmd & SETFLCK) &&
263 ((lckdat->l_type == F_RDLCK && (flag & FREAD) == 0) ||
264 (lckdat->l_type == F_WRLCK && (flag & FWRITE) == 0)))
265 return (EBADF);
266
267 /*
268 * for query and unlock we use the stack_lock_request
269 */
270
271 if ((lckdat->l_type == F_UNLCK) ||
272 !((cmd & INOFLCK) || (cmd & SETFLCK))) {
273 lock_request = &stack_lock_request;
274 (void) bzero((caddr_t)lock_request,
275 sizeof (lock_descriptor_t));
276
277 /*
278  * The following is done so that the assertions in
279  * flk_execute_request() pass.
280 */
281
282 lock_request->l_edge.edge_in_next = &lock_request->l_edge;
283 lock_request->l_edge.edge_in_prev = &lock_request->l_edge;
284 lock_request->l_edge.edge_adj_next = &lock_request->l_edge;
285 lock_request->l_edge.edge_adj_prev = &lock_request->l_edge;
286 lock_request->l_status = FLK_INITIAL_STATE;
287 } else {
288 lock_request = flk_get_lock();
289 }
290 lock_request->l_state = 0;
291 lock_request->l_vnode = vp;
292 lock_request->l_zoneid = getzoneid();
293
294 /*
295 * Convert the request range into the canonical start and end
296 * values. The NLM protocol supports locking over the entire
297 * 32-bit range, so there's no range checking for remote requests,
298 * but we still need to verify that local requests obey the rules.
299 */
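/*
 * Worked example (illustrative): with l_whence already folded into
 * l_start, a request of l_start = 100, l_len = 10 canonicalizes to the
 * inclusive range [100, 109]; l_len = 0 means "to end of file", which is
 * represented as [100, MAX_U_OFFSET_T] for remote requests, while local
 * requests must additionally pass flk_check_lock_data() against MAXEND.
 */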
300 /* Clustering */
301 if ((cmd & (RCMDLCK | PCMDLCK)) != 0) {
302 ASSERT(lckdat->l_whence == 0);
303 lock_request->l_start = lckdat->l_start;
304 lock_request->l_end = (lckdat->l_len == 0) ? MAX_U_OFFSET_T :
305 lckdat->l_start + (lckdat->l_len - 1);
306 } else {
307 /* check the validity of the lock range */
308 error = flk_convert_lock_data(vp, lckdat,
309 &lock_request->l_start, &lock_request->l_end,
310 offset);
311 if (error) {
312 goto done;
313 }
314 error = flk_check_lock_data(lock_request->l_start,
315 lock_request->l_end, MAXEND);
316 if (error) {
317 goto done;
318 }
319 }
320
321 ASSERT(lock_request->l_end >= lock_request->l_start);
322
323 lock_request->l_type = lckdat->l_type;
324 if (cmd & INOFLCK)
325 lock_request->l_state |= IO_LOCK;
326 if (cmd & SLPFLCK)
327 lock_request->l_state |= WILLING_TO_SLEEP_LOCK;
328 if (cmd & RCMDLCK)
329 lock_request->l_state |= LOCKMGR_LOCK;
330 if (cmd & NBMLCK)
331 lock_request->l_state |= NBMAND_LOCK;
332 /*
333 * Clustering: set flag for PXFS locks
334 * We do not _only_ check for the PCMDLCK flag because PXFS locks could
335 * also be of type 'RCMDLCK'.
336 * We do not _only_ check the GETPXFSID() macro because local PXFS
337 * clients use a pxfsid of zero to permit deadlock detection in the LLM.
338 */
339
340 if ((cmd & PCMDLCK) || (GETPXFSID(lckdat->l_sysid) != 0)) {
341 lock_request->l_state |= PXFS_LOCK;
342 }
343 if (!((cmd & SETFLCK) || (cmd & INOFLCK))) {
344 if (lock_request->l_type == F_RDLCK ||
345 lock_request->l_type == F_WRLCK)
346 lock_request->l_state |= QUERY_LOCK;
347 }
348 lock_request->l_flock = (*lckdat);
349 lock_request->l_callbacks = flk_cbp;
350
351 /*
352 * We are ready for processing the request
353 */
354 if (IS_LOCKMGR(lock_request)) {
355 /*
356 * If the lock request is an NLM server request ....
357 */
358 if (nlm_status_size == 0) { /* not booted as cluster */
359 mutex_enter(&flock_lock);
360 /*
361 * Bail out if this is a lock manager request and the
362 * lock manager is not supposed to be running.
363 */
364 if (flk_get_lockmgr_status() != FLK_LOCKMGR_UP) {
365 mutex_exit(&flock_lock);
366 error = ENOLCK;
367 goto done;
368 }
369 mutex_exit(&flock_lock);
370 } else { /* booted as a cluster */
371 nlmid = GETNLMID(lock_request->l_flock.l_sysid);
372 ASSERT(nlmid <= nlm_status_size && nlmid >= 0);
373
374 mutex_enter(&nlm_reg_lock);
375 /*
376 * If the NLM registry does not know about this
377 * NLM server making the request, add its nlmid
378 * to the registry.
379 */
380 if (FLK_REGISTRY_IS_NLM_UNKNOWN(nlm_reg_status,
381 nlmid)) {
382 FLK_REGISTRY_ADD_NLMID(nlm_reg_status, nlmid);
383 } else if (!FLK_REGISTRY_IS_NLM_UP(nlm_reg_status,
384 nlmid)) {
385 /*
386 * If the NLM server is already known (has made
387 * previous lock requests) and its state is
388 * not NLM_UP (means that NLM server is
389 * shutting down), then bail out with an
390 * error to deny the lock request.
391 */
392 mutex_exit(&nlm_reg_lock);
393 error = ENOLCK;
394 goto done;
395 }
396 mutex_exit(&nlm_reg_lock);
397 }
398 }
399
400 /* Now get the lock graph for a particular vnode */
401 gp = flk_get_lock_graph(vp, FLK_INIT_GRAPH);
402
403 /*
404  * We drop the rwlock here; otherwise this might end up causing a
405  * deadlock if this IOLOCK sleeps. (bugid # 1183392).
406 */
407
408 if (IS_IO_LOCK(lock_request)) {
409 VOP_RWUNLOCK(vp,
410 (lock_request->l_type == F_RDLCK) ?
411 V_WRITELOCK_FALSE : V_WRITELOCK_TRUE, NULL);
412 }
413 mutex_enter(&gp->gp_mutex);
414
415 lock_request->l_state |= REFERENCED_LOCK;
416 lock_request->l_graph = gp;
417
418 switch (lock_request->l_type) {
419 case F_RDLCK:
420 case F_WRLCK:
421 if (IS_QUERY_LOCK(lock_request)) {
422 flk_get_first_blocking_lock(lock_request);
423 (*lckdat) = lock_request->l_flock;
424 break;
425 }
426
427 /* process the request now */
428
429 error = flk_process_request(lock_request);
430 break;
431
432 case F_UNLCK:
433 /* unlock request will not block so execute it immediately */
434
435 if (IS_LOCKMGR(lock_request) &&
436 flk_canceled(lock_request)) {
437 error = 0;
438 } else {
439 error = flk_execute_request(lock_request);
440 }
441 break;
442
443 case F_UNLKSYS:
444 /*
445  * Recovery mechanism to release lock manager locks when an
446  * NFS client crashes and restarts. The NFS server will clear
447  * the old locks and grant new ones.
448 */
449
450 if (lock_request->l_flock.l_sysid == 0) {
451 mutex_exit(&gp->gp_mutex);
452 return (EINVAL);
453 }
454 if (secpolicy_nfs(CRED()) != 0) {
455 mutex_exit(&gp->gp_mutex);
456 return (EPERM);
457 }
458 flk_delete_locks_by_sysid(lock_request);
459 lock_request->l_state &= ~REFERENCED_LOCK;
460 flk_set_state(lock_request, FLK_DEAD_STATE);
461 flk_free_lock(lock_request);
462 mutex_exit(&gp->gp_mutex);
463 return (0);
464
465 default:
466 error = EINVAL;
467 break;
468 }
469
470 /* Clustering: For blocked PXFS locks, return */
471 if (error == PXFS_LOCK_BLOCKED) {
472 lock_request->l_state &= ~REFERENCED_LOCK;
473 mutex_exit(&gp->gp_mutex);
474 return (error);
475 }
476
477 /*
478 * Now that we have seen the status of locks in the system for
479 * this vnode we acquire the rwlock if it is an IO_LOCK.
480 */
481
482 if (IS_IO_LOCK(lock_request)) {
483 (void) VOP_RWLOCK(vp,
484 (lock_request->l_type == F_RDLCK) ?
485 V_WRITELOCK_FALSE : V_WRITELOCK_TRUE, NULL);
486 if (!error) {
487 lckdat->l_type = F_UNLCK;
488
489 /*
490  * This wakeup is needed; otherwise, if the
491  * IO_LOCK has slept, the locks dependent on it
492  * will never be woken up. (bugid # 1185482).
493 */
494
495 flk_wakeup(lock_request, 1);
496 flk_set_state(lock_request, FLK_DEAD_STATE);
497 flk_free_lock(lock_request);
498 }
499 /*
500  * If an error occurred, then either flk_process_request()
501  * returned EDEADLK, in which case there are no dependents
502  * for this lock, or flk_wait_execute_request() returned
503  * EINTR, in which case flk_cancel_sleeping_lock() has
504  * already been done. The same is true for EBADF.
505 */
506 }
507
508 if (lock_request == &stack_lock_request) {
509 flk_set_state(lock_request, FLK_DEAD_STATE);
510 } else {
511 lock_request->l_state &= ~REFERENCED_LOCK;
512 if ((error != 0) || IS_DELETED(lock_request)) {
513 flk_set_state(lock_request, FLK_DEAD_STATE);
514 flk_free_lock(lock_request);
515 }
516 }
517
518 mutex_exit(&gp->gp_mutex);
519 return (error);
520
521 done:
522 flk_set_state(lock_request, FLK_DEAD_STATE);
523 if (lock_request != &stack_lock_request)
524 flk_free_lock(lock_request);
525 return (error);
526 }
527
528 /*
529 * Invoke the callbacks in the given list. If before sleeping, invoke in
530 * list order. If after sleeping, invoke in reverse order.
531 *
532 * CPR (suspend/resume) support: if one of the callbacks returns a
533 * callb_cpr_t, return it. This will be used to make the thread CPR-safe
534 * while it is sleeping. There should be at most one callb_cpr_t for the
535 * thread.
536 * XXX This is unnecessarily complicated. The CPR information should just
537 * get passed in directly through VOP_FRLOCK and reclock, rather than
538 * sneaking it in via a callback.
539 */
540
541 callb_cpr_t *
542 flk_invoke_callbacks(flk_callback_t *cblist, flk_cb_when_t when)
543 {
544 callb_cpr_t *cpr_callbackp = NULL;
545 callb_cpr_t *one_result;
546 flk_callback_t *cb;
547
548 if (cblist == NULL)
549 return (NULL);
550
551 if (when == FLK_BEFORE_SLEEP) {
552 cb = cblist;
553 do {
554 one_result = (*cb->cb_callback)(when, cb->cb_data);
555 if (one_result != NULL) {
556 ASSERT(cpr_callbackp == NULL);
557 cpr_callbackp = one_result;
558 }
559 cb = cb->cb_next;
560 } while (cb != cblist);
561 } else {
562 cb = cblist->cb_prev;
563 do {
564 one_result = (*cb->cb_callback)(when, cb->cb_data);
565 if (one_result != NULL) {
566 cpr_callbackp = one_result;
567 }
568 cb = cb->cb_prev;
569 } while (cb != cblist->cb_prev);
570 }
571
572 return (cpr_callbackp);
573 }
574
575 /*
576 * Initialize a flk_callback_t to hold the given callback.
577 */
578
579 void
580 flk_init_callback(flk_callback_t *flk_cb,
581 callb_cpr_t *(*cb_fcn)(flk_cb_when_t, void *), void *cbdata)
582 {
583 flk_cb->cb_next = flk_cb;
584 flk_cb->cb_prev = flk_cb;
585 flk_cb->cb_callback = cb_fcn;
586 flk_cb->cb_data = cbdata;
587 }
588
589 /*
590 * Initialize an flk_callback_t and then link it into the head of an
591 * existing list (which may be NULL).
592 */
593
594 void
595 flk_add_callback(flk_callback_t *newcb,
596 callb_cpr_t *(*cb_fcn)(flk_cb_when_t, void *),
597 void *cbdata, flk_callback_t *cblist)
598 {
599 flk_init_callback(newcb, cb_fcn, cbdata);
600
601 if (cblist == NULL)
602 return;
603
604 newcb->cb_prev = cblist->cb_prev;
605 newcb->cb_next = cblist;
606 cblist->cb_prev->cb_next = newcb;
607 cblist->cb_prev = newcb;
608 }
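/*
 * Illustrative sketch (hypothetical caller, not part of this file): a lock
 * manager that must stay CPR-safe while blocked could hang a callback on
 * the list handed to reclock().  The callback returns its callb_cpr_t
 * before the sleep and NULL afterwards, which is what
 * flk_invoke_callbacks() expects:
 *
 *	static callb_cpr_t *
 *	my_frlock_callback(flk_cb_when_t when, void *arg)
 *	{
 *		callb_cpr_t *cprp = arg;
 *
 *		return (when == FLK_BEFORE_SLEEP ? cprp : NULL);
 *	}
 *
 *	flk_callback_t flk_cb;
 *	flk_add_callback(&flk_cb, my_frlock_callback, &cpr_info, cb_list);
 */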
609
610 /*
611 * Remove the callback from a list.
612 */
613
614 void
615 flk_del_callback(flk_callback_t *flk_cb)
616 {
617 flk_cb->cb_next->cb_prev = flk_cb->cb_prev;
618 flk_cb->cb_prev->cb_next = flk_cb->cb_next;
619
620 flk_cb->cb_prev = flk_cb;
621 flk_cb->cb_next = flk_cb;
622 }
623
624 /*
625 * Initialize the flk_edge_cache data structure and create the
626 * nlm_reg_status array.
627 */
628
629 void
630 flk_init(void)
631 {
632 uint_t i;
633
634 flk_edge_cache = kmem_cache_create("flk_edges",
635 sizeof (struct edge), 0, NULL, NULL, NULL, NULL, NULL, 0);
636 if (flk_edge_cache == NULL) {
637 cmn_err(CE_PANIC, "Couldn't create flk_edge_cache\n");
638 }
639 /*
640 * Create the NLM registry object.
641 */
642
643 if (cluster_bootflags & CLUSTER_BOOTED) {
644 /*
645 * This routine tells you the maximum node id that will be used
646 * in the cluster. This number will be the size of the nlm
647 * registry status array. We add 1 because we will be using
648 * all entries indexed from 0 to maxnodeid; e.g., from 0
649 * to 64, for a total of 65 entries.
650 */
651 nlm_status_size = clconf_maximum_nodeid() + 1;
652 } else {
653 nlm_status_size = 0;
654 }
655
656 if (nlm_status_size != 0) { /* booted as a cluster */
657 nlm_reg_status = (flk_nlm_status_t *)
658 kmem_alloc(sizeof (flk_nlm_status_t) * nlm_status_size,
659 KM_SLEEP);
660
661 /* initialize all NLM states in array to NLM_UNKNOWN */
662 for (i = 0; i < nlm_status_size; i++) {
663 nlm_reg_status[i] = FLK_NLM_UNKNOWN;
664 }
665 }
666 }
667
668 /*
669 * Zone constructor/destructor callbacks to be executed when a zone is
670 * created/destroyed.
671 */
672 /* ARGSUSED */
673 void *
674 flk_zone_init(zoneid_t zoneid)
675 {
676 struct flock_globals *fg;
677 uint_t i;
678
679 fg = kmem_alloc(sizeof (*fg), KM_SLEEP);
680 fg->flk_lockmgr_status = FLK_LOCKMGR_UP;
681 for (i = 0; i < HASH_SIZE; i++)
682 fg->lockmgr_status[i] = FLK_LOCKMGR_UP;
683 return (fg);
684 }
685
686 /* ARGSUSED */
687 void
688 flk_zone_fini(zoneid_t zoneid, void *data)
689 {
690 struct flock_globals *fg = data;
691
692 kmem_free(fg, sizeof (*fg));
693 }
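/*
 * Hedged sketch (the registration itself lives outside this file): the
 * zone hooks above are expected to be wired up through the standard zone
 * key interface, roughly as follows, so that each zone gets its own
 * flock_globals:
 *
 *	zone_key_create(&flock_zone_key, flk_zone_init, NULL, flk_zone_fini);
 */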
694
695 /*
696 * Get a lock_descriptor structure with initialization of edge lists.
697 */
698
699 static lock_descriptor_t *
700 flk_get_lock(void)
701 {
702 lock_descriptor_t *l;
703
704 l = kmem_zalloc(sizeof (lock_descriptor_t), KM_SLEEP);
705
706 cv_init(&l->l_cv, NULL, CV_DRIVER, NULL);
707 l->l_edge.edge_in_next = &l->l_edge;
708 l->l_edge.edge_in_prev = &l->l_edge;
709 l->l_edge.edge_adj_next = &l->l_edge;
710 l->l_edge.edge_adj_prev = &l->l_edge;
711 l->pvertex = -1;
712 l->l_status = FLK_INITIAL_STATE;
713 flk_lock_allocs++;
714 return (l);
715 }
716
717 /*
718  * Free a lock_descriptor structure. If some thread still holds a
719  * reference to it (as in reclock()), just set the DELETED_LOCK flag.
720 */
721
722 void
723 flk_free_lock(lock_descriptor_t *lock)
724 {
725 ASSERT(IS_DEAD(lock));
726 if (IS_REFERENCED(lock)) {
727 lock->l_state |= DELETED_LOCK;
728 return;
729 }
730 flk_lock_frees++;
731 kmem_free((void *)lock, sizeof (lock_descriptor_t));
732 }
733
734 void
735 flk_set_state(lock_descriptor_t *lock, int new_state)
736 {
737 /*
738 * Locks in the sleeping list may be woken up in a number of ways,
739 * and more than once. If a sleeping lock is signaled awake more
740 * than once, then it may or may not change state depending on its
741 * current state.
742 * Also note that NLM locks that are sleeping could be moved to an
743 * interrupted state more than once if the unlock request is
744 * retransmitted by the NLM client - the second time around, this is
745 * just a nop.
746 * The ordering of being signaled awake is:
747 * INTERRUPTED_STATE > CANCELLED_STATE > GRANTED_STATE.
748 * The checks below implement this ordering.
749 */
750 if (IS_INTERRUPTED(lock)) {
751 if ((new_state == FLK_CANCELLED_STATE) ||
752 (new_state == FLK_GRANTED_STATE) ||
753 (new_state == FLK_INTERRUPTED_STATE)) {
754 return;
755 }
756 }
757 if (IS_CANCELLED(lock)) {
758 if ((new_state == FLK_GRANTED_STATE) ||
759 (new_state == FLK_CANCELLED_STATE)) {
760 return;
761 }
762 }
763 CHECK_LOCK_TRANSITION(lock->l_status, new_state);
764 if (IS_PXFS(lock)) {
765 cl_flk_state_transition_notify(lock, lock->l_status, new_state);
766 }
767 lock->l_status = new_state;
768 }
769
770 /*
771 * Routine that checks whether there are any blocking locks in the system.
772 *
773  * The policy followed is that if a write lock is sleeping, we don't allow
774  * read locks to get ahead of that write lock, even though there may not
775  * be any active locks covering the read locks' region.
776  *
777  * The flk_add_edge() function adds an edge between l1 and l2 iff there
778  * is no existing path between l1 and l2. This is done to keep a "minimum
779  * storage representation" of the dependency graph.
780  *
781  * Another property of the graph is that, since only the new request throws
782  * edges to the existing locks in the graph, the graph is always topologically
783  * ordered.
784 */
785
786 static int
787 flk_process_request(lock_descriptor_t *request)
788 {
789 graph_t *gp = request->l_graph;
790 lock_descriptor_t *lock;
791 int request_blocked_by_active = 0;
792 int request_blocked_by_granted = 0;
793 int request_blocked_by_sleeping = 0;
794 vnode_t *vp = request->l_vnode;
795 int error = 0;
796 int request_will_wait = 0;
797 int found_covering_lock = 0;
798 lock_descriptor_t *covered_by = NULL;
799
800 ASSERT(MUTEX_HELD(&gp->gp_mutex));
801 request_will_wait = IS_WILLING_TO_SLEEP(request);
802
803 /*
804 * check active locks
805 */
806
807 SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
808
809
810 if (lock) {
811 do {
812 if (BLOCKS(lock, request)) {
813 if (!request_will_wait)
814 return (EAGAIN);
815 request_blocked_by_active = 1;
816 break;
817 }
818 /*
819  * Grant the request if the same owner already holds an
820  * active lock that covers it.
821 */
822
823 if (SAME_OWNER(lock, request) &&
824 COVERS(lock, request) &&
825 (request->l_type == F_RDLCK))
826 return (flk_execute_request(request));
827 lock = lock->l_next;
828 } while (lock->l_vnode == vp);
829 }
830
831 if (!request_blocked_by_active) {
832 lock_descriptor_t *lk[1];
833 lock_descriptor_t *first_glock = NULL;
834 /*
835  * Shall we grant this?! NO!!
836  * What about the locks that were just granted and are still
837  * on the sleep queue? Their threads have been woken up, so
838  * those locks are effectively active.
839 */
840 SET_LOCK_TO_FIRST_SLEEP_VP(gp, lock, vp);
841 if (lock) {
842 do {
843 if (BLOCKS(lock, request)) {
844 if (IS_GRANTED(lock)) {
845 request_blocked_by_granted = 1;
846 } else {
847 request_blocked_by_sleeping = 1;
848 }
849 }
850
851 lock = lock->l_next;
852 } while ((lock->l_vnode == vp));
853 first_glock = lock->l_prev;
854 ASSERT(first_glock->l_vnode == vp);
855 }
856
857 if (request_blocked_by_granted)
858 goto block;
859
860 if (!request_blocked_by_sleeping) {
861 /*
862 * If the request isn't going to be blocked by a
863 * sleeping request, we know that it isn't going to
864 * be blocked; we can just execute the request --
865 * without performing costly deadlock detection.
866 */
867 ASSERT(!request_blocked_by_active);
868 return (flk_execute_request(request));
869 } else if (request->l_type == F_RDLCK) {
870 /*
871 * If we have a sleeping writer in the requested
872 * lock's range, block.
873 */
874 goto block;
875 }
876
877 lk[0] = request;
878 request->l_state |= RECOMPUTE_LOCK;
879 SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
880 if (lock) {
881 do {
882 flk_recompute_dependencies(lock, lk, 1, 0);
883 lock = lock->l_next;
884 } while (lock->l_vnode == vp);
885 }
886 lock = first_glock;
887 if (lock) {
888 do {
889 if (IS_GRANTED(lock)) {
890 flk_recompute_dependencies(lock, lk, 1, 0);
891 }
892 lock = lock->l_prev;
893 } while ((lock->l_vnode == vp));
894 }
895 request->l_state &= ~RECOMPUTE_LOCK;
896 if (!NO_DEPENDENTS(request) && flk_check_deadlock(request))
897 return (EDEADLK);
898 return (flk_execute_request(request));
899 }
900
901 block:
902 if (request_will_wait)
903 flk_graph_uncolor(gp);
904
905 /* check sleeping locks */
906
907 SET_LOCK_TO_FIRST_SLEEP_VP(gp, lock, vp);
908
909 /*
910  * If we find a sleeping write lock that is a superset of the
911  * region wanted by the request, we can be assured that by adding an
912  * edge to this write lock we have paths to all locks in the
913  * graph that block the request, except in one case; that is why
914  * there is another check for SAME_OWNER in the loop below. The
915  * exception is when the process that owns the sleeping write lock
916  * 'l1' has other locks l2, l3, l4 in the system that arrived
917  * before l1. l1 has no path to these locks, as they are from the
918  * same process. We break when we find a second covering sleeping
919  * lock l5 owned by a process different from the one owning l1,
920  * because none of l2, l3, l4, etc. can have arrived before l5; if
921  * one had, l1 would already have produced a deadlock.
922 */
923
924 if (lock) {
925 do {
926 if (BLOCKS(lock, request)) {
927 if (!request_will_wait)
928 return (EAGAIN);
929 if (COVERS(lock, request) &&
930 lock->l_type == F_WRLCK) {
931 if (found_covering_lock &&
932 !SAME_OWNER(lock, covered_by)) {
933 found_covering_lock++;
934 break;
935 }
936 found_covering_lock = 1;
937 covered_by = lock;
938 }
939 if (found_covering_lock &&
940 !SAME_OWNER(lock, covered_by)) {
941 lock = lock->l_next;
942 continue;
943 }
944 if ((error = flk_add_edge(request, lock,
945 !found_covering_lock, 0)))
946 return (error);
947 }
948 lock = lock->l_next;
949 } while (lock->l_vnode == vp);
950 }
951
952 /*
953  * found_covering_lock == 2 iff at this point 'request' has paths
954  * to all locks that block 'request'. found_covering_lock == 1 iff at this
955  * point 'request' has paths to all blocking locks whose owners are not
956  * the same as the one that covers 'request' (covered_by above); locks
957  * whose owner is the same as covered_by may still be on the active list.
958 */
959
960 if (request_blocked_by_active && found_covering_lock != 2) {
961 SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
962 ASSERT(lock != NULL);
963 do {
964 if (BLOCKS(lock, request)) {
965 if (found_covering_lock &&
966 !SAME_OWNER(lock, covered_by)) {
967 lock = lock->l_next;
968 continue;
969 }
970 if ((error = flk_add_edge(request, lock,
971 CHECK_CYCLE, 0)))
972 return (error);
973 }
974 lock = lock->l_next;
975 } while (lock->l_vnode == vp);
976 }
977
978 if (NOT_BLOCKED(request)) {
979 /*
980 * request not dependent on any other locks
981 * so execute this request
982 */
983 return (flk_execute_request(request));
984 } else {
985 /*
986 * check for deadlock
987 */
988 if (flk_check_deadlock(request))
989 return (EDEADLK);
990 /*
991 * this thread has to sleep
992 */
993 return (flk_wait_execute_request(request));
994 }
995 }
996
997 /*
998  * In the simple case, executing the request just means inserting
999  * 'request' into the list of active locks, provided it is not an
1000  * UNLOCK.
1001  * We have to consider each existing active lock's relation to
1002  * this 'request' if they are owned by the same process. flk_relation()
1003  * does this job and sees to it that the dependency graph information
1004  * is maintained properly.
1005 */
1006
1007 int
1008 flk_execute_request(lock_descriptor_t *request)
1009 {
1010 graph_t *gp = request->l_graph;
1011 vnode_t *vp = request->l_vnode;
1012 lock_descriptor_t *lock, *lock1;
1013 int done_searching = 0;
1014
1015 CHECK_SLEEPING_LOCKS(gp);
1016 CHECK_ACTIVE_LOCKS(gp);
1017
1018 ASSERT(MUTEX_HELD(&gp->gp_mutex));
1019
1020 flk_set_state(request, FLK_START_STATE);
1021
1022 ASSERT(NOT_BLOCKED(request));
1023
1024 /* IO_LOCK requests are only to check status */
1025
1026 if (IS_IO_LOCK(request))
1027 return (0);
1028
1029 SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
1030
1031 if (lock == NULL && request->l_type == F_UNLCK)
1032 return (0);
1033 if (lock == NULL) {
1034 flk_insert_active_lock(request);
1035 return (0);
1036 }
1037
1038 do {
1039 lock1 = lock->l_next;
1040 if (SAME_OWNER(request, lock)) {
1041 done_searching = flk_relation(lock, request);
1042 }
1043 lock = lock1;
1044 } while (lock->l_vnode == vp && !done_searching);
1045
1046 /*
1047 * insert in active queue
1048 */
1049
1050 if (request->l_type != F_UNLCK)
1051 flk_insert_active_lock(request);
1052
1053 return (0);
1054 }
1055
1056 /*
1057  * 'request' is blocked by some other lock, so we put it on the sleep queue.
1058 */
1059 static int
1060 flk_wait_execute_request(lock_descriptor_t *request)
1061 {
1062 graph_t *gp = request->l_graph;
1063 callb_cpr_t *cprp; /* CPR info from callback */
1064 struct flock_globals *fg;
1065 int index;
1066
1067 ASSERT(MUTEX_HELD(&gp->gp_mutex));
1068 ASSERT(IS_WILLING_TO_SLEEP(request));
1069
1070 flk_insert_sleeping_lock(request);
1071
1072 if (IS_LOCKMGR(request)) {
1073 index = HASH_INDEX(request->l_vnode);
1074 fg = flk_get_globals();
1075
1076 if (nlm_status_size == 0) { /* not booted as a cluster */
1077 if (fg->lockmgr_status[index] != FLK_LOCKMGR_UP) {
1078 flk_cancel_sleeping_lock(request, 1);
1079 return (ENOLCK);
1080 }
1081 } else { /* booted as a cluster */
1082 /*
1083 * If the request is an NLM server lock request,
1084 * and the NLM state of the lock request is not
1085 * NLM_UP (because the NLM server is shutting
1086 * down), then cancel the sleeping lock and
1087 * return error ENOLCK that will encourage the
1088 * client to retransmit.
1089 */
1090 if (!IS_NLM_UP(request)) {
1091 flk_cancel_sleeping_lock(request, 1);
1092 return (ENOLCK);
1093 }
1094 }
1095 }
1096
1097 /* Clustering: For blocking PXFS locks, return */
1098 if (IS_PXFS(request)) {
1099 /*
1100 * PXFS locks sleep on the client side.
1101 * The callback argument is used to wake up the sleeper
1102 * when the lock is granted.
1103 * We return -1 (rather than an errno value) to indicate
1104 * the client side should sleep
1105 */
1106 return (PXFS_LOCK_BLOCKED);
1107 }
1108
1109 if (request->l_callbacks != NULL) {
1110 /*
1111 * To make sure the shutdown code works correctly, either
1112 * the callback must happen after putting the lock on the
1113 * sleep list, or we must check the shutdown status after
1114 * returning from the callback (and before sleeping). At
1115 * least for now, we'll use the first option. If a
1116 * shutdown or signal or whatever happened while the graph
1117 * mutex was dropped, that will be detected by
1118 * wait_for_lock().
1119 */
1120 mutex_exit(&gp->gp_mutex);
1121
1122 cprp = flk_invoke_callbacks(request->l_callbacks,
1123 FLK_BEFORE_SLEEP);
1124
1125 mutex_enter(&gp->gp_mutex);
1126
1127 if (cprp == NULL) {
1128 wait_for_lock(request);
1129 } else {
1130 mutex_enter(cprp->cc_lockp);
1131 CALLB_CPR_SAFE_BEGIN(cprp);
1132 mutex_exit(cprp->cc_lockp);
1133 wait_for_lock(request);
1134 mutex_enter(cprp->cc_lockp);
1135 CALLB_CPR_SAFE_END(cprp, cprp->cc_lockp);
1136 mutex_exit(cprp->cc_lockp);
1137 }
1138
1139 mutex_exit(&gp->gp_mutex);
1140 (void) flk_invoke_callbacks(request->l_callbacks,
1141 FLK_AFTER_SLEEP);
1142 mutex_enter(&gp->gp_mutex);
1143 } else {
1144 wait_for_lock(request);
1145 }
1146
1147 if (IS_LOCKMGR(request)) {
1148 /*
1149 * If the lock manager is shutting down, return an
1150 * error that will encourage the client to retransmit.
1151 */
1152 if (fg->lockmgr_status[index] != FLK_LOCKMGR_UP &&
1153 !IS_GRANTED(request)) {
1154 flk_cancel_sleeping_lock(request, 1);
1155 return (ENOLCK);
1156 }
1157 }
1158
1159 if (IS_INTERRUPTED(request)) {
1160 /* we got a signal, or act like we did */
1161 flk_cancel_sleeping_lock(request, 1);
1162 return (EINTR);
1163 }
1164
1165 /* Cancelled if some other thread has closed the file */
1166
1167 if (IS_CANCELLED(request)) {
1168 flk_cancel_sleeping_lock(request, 1);
1169 return (EBADF);
1170 }
1171
1172 request->l_state &= ~GRANTED_LOCK;
1173 REMOVE_SLEEP_QUEUE(request);
1174 return (flk_execute_request(request));
1175 }
1176
1177 /*
1178  * This routine adds an edge between from_lock and to_lock because
1179  * from_lock depends on to_lock. If asked to check for a cycle, it checks
1180  * whether any lock reachable from "from_lock" is owned by the same
1181  * process as "from_lock".
1182  * NOTE: It is the caller's responsibility to make sure that the color
1183  * of the graph is consistent between the calls to flk_add_edge as done
1184  * in flk_process_request. This routine does not color and check for
1185  * deadlock explicitly.
1186 */
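/*
 * Illustrative example: if process P1 holds lock L1 and is about to sleep
 * on L2, while process P2 holds L2 and is already sleeping on L1, then the
 * cycle check below, started from P1's request, reaches L1 (same owner as
 * the request); the newly added edges are torn down and EDEADLK returned.
 */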
1187
1188 static int
1189 flk_add_edge(lock_descriptor_t *from_lock, lock_descriptor_t *to_lock,
1190 int check_cycle, int update_graph)
1191 {
1192 edge_t *edge;
1193 edge_t *ep;
1194 lock_descriptor_t *vertex;
1195 lock_descriptor_t *vertex_stack;
1196
1197 STACK_INIT(vertex_stack);
1198
1199 /*
1200  * If the to vertex is already colored with the current mark, just
1201  * return without adding an edge, since it is already reachable
1202  * from the from vertex.
1203 */
1204
1205 if (COLORED(to_lock))
1206 return (0);
1207
1208 edge = flk_get_edge();
1209
1210 /*
1211 * set the from and to vertex
1212 */
1213
1214 edge->from_vertex = from_lock;
1215 edge->to_vertex = to_lock;
1216
1217 /*
1218 * put in adjacency list of from vertex
1219 */
1220
1221 from_lock->l_edge.edge_adj_next->edge_adj_prev = edge;
1222 edge->edge_adj_next = from_lock->l_edge.edge_adj_next;
1223 edge->edge_adj_prev = &from_lock->l_edge;
1224 from_lock->l_edge.edge_adj_next = edge;
1225
1226 /*
1227  * put in the in-list of the to vertex
1228 */
1229
1230 to_lock->l_edge.edge_in_next->edge_in_prev = edge;
1231 edge->edge_in_next = to_lock->l_edge.edge_in_next;
1232 to_lock->l_edge.edge_in_next = edge;
1233 edge->edge_in_prev = &to_lock->l_edge;
1234
1235
1236 if (update_graph) {
1237 flk_update_proc_graph(edge, 0);
1238 return (0);
1239 }
1240 if (!check_cycle) {
1241 return (0);
1242 }
1243
1244 STACK_PUSH(vertex_stack, from_lock, l_stack);
1245
1246 while ((vertex = STACK_TOP(vertex_stack)) != NULL) {
1247
1248 STACK_POP(vertex_stack, l_stack);
1249
1250 for (ep = FIRST_ADJ(vertex);
1251 ep != HEAD(vertex);
1252 ep = NEXT_ADJ(ep)) {
1253 if (COLORED(ep->to_vertex))
1254 continue;
1255 COLOR(ep->to_vertex);
1256 if (SAME_OWNER(ep->to_vertex, from_lock))
1257 goto dead_lock;
1258 STACK_PUSH(vertex_stack, ep->to_vertex, l_stack);
1259 }
1260 }
1261 return (0);
1262
1263 dead_lock:
1264
1265 /*
1266 * remove all edges
1267 */
1268
1269 ep = FIRST_ADJ(from_lock);
1270
1271 while (ep != HEAD(from_lock)) {
1272 IN_LIST_REMOVE(ep);
1273 from_lock->l_sedge = NEXT_ADJ(ep);
1274 ADJ_LIST_REMOVE(ep);
1275 flk_free_edge(ep);
1276 ep = from_lock->l_sedge;
1277 }
1278 return (EDEADLK);
1279 }
1280
1281 /*
1282 * Get an edge structure for representing the dependency between two locks.
1283 */
1284
1285 static edge_t *
1286 flk_get_edge()
1287 {
1288 edge_t *ep;
1289
1290 ASSERT(flk_edge_cache != NULL);
1291
1292 ep = kmem_cache_alloc(flk_edge_cache, KM_SLEEP);
1293 edge_allocs++;
1294 return (ep);
1295 }
1296
1297 /*
1298 * Free the edge structure.
1299 */
1300
1301 static void
1302 flk_free_edge(edge_t *ep)
1303 {
1304 edge_frees++;
1305 kmem_cache_free(flk_edge_cache, (void *)ep);
1306 }
1307
1308 /*
1309  * Check the relationship of 'request' with 'lock', perform the
1310  * recomputation of dependencies, break the lock if required, and return
1311  * 1 if 'request' cannot have any further relationship with the remaining
1312  * active locks.
1313  * 'lock' and 'request' are compared, and in case of overlap we
1314  * delete 'lock' and form new locks to represent the non-overlapped
1315  * portion of the original 'lock'. This function has side effects:
1316  * 'lock' may be freed and new locks may be added to the active list.
1317 */
1318
1319 static int
1320 flk_relation(lock_descriptor_t *lock, lock_descriptor_t *request)
1321 {
1322 int lock_effect;
1323 lock_descriptor_t *lock1, *lock2;
1324 lock_descriptor_t *topology[3];
1325 int nvertex = 0;
1326 int i;
1327 edge_t *ep;
1328 graph_t *gp = (lock->l_graph);
1329
1330
1331 CHECK_SLEEPING_LOCKS(gp);
1332 CHECK_ACTIVE_LOCKS(gp);
1333
1334 ASSERT(MUTEX_HELD(&gp->gp_mutex));
1335
1336 topology[0] = topology[1] = topology[2] = NULL;
1337
1338 if (request->l_type == F_UNLCK)
1339 lock_effect = FLK_UNLOCK;
1340 else if (request->l_type == F_RDLCK &&
1341 lock->l_type == F_WRLCK)
1342 lock_effect = FLK_DOWNGRADE;
1343 else if (request->l_type == F_WRLCK &&
1344 lock->l_type == F_RDLCK)
1345 lock_effect = FLK_UPGRADE;
1346 else
1347 lock_effect = FLK_STAY_SAME;
1348
1349 if (lock->l_end < request->l_start) {
1350 if (lock->l_end == request->l_start - 1 &&
1351 lock_effect == FLK_STAY_SAME) {
1352 topology[0] = request;
1353 request->l_start = lock->l_start;
1354 nvertex = 1;
1355 goto recompute;
1356 } else {
1357 return (0);
1358 }
1359 }
1360
1361 if (lock->l_start > request->l_end) {
1362 if (request->l_end == lock->l_start - 1 &&
1363 lock_effect == FLK_STAY_SAME) {
1364 topology[0] = request;
1365 request->l_end = lock->l_end;
1366 nvertex = 1;
1367 goto recompute;
1368 } else {
1369 return (1);
1370 }
1371 }
1372
1373 if (request->l_end < lock->l_end) {
1374 if (request->l_start > lock->l_start) {
1375 if (lock_effect == FLK_STAY_SAME) {
1376 request->l_start = lock->l_start;
1377 request->l_end = lock->l_end;
1378 topology[0] = request;
1379 nvertex = 1;
1380 } else {
1381 lock1 = flk_get_lock();
1382 lock2 = flk_get_lock();
1383 COPY(lock1, lock);
1384 COPY(lock2, lock);
1385 lock1->l_start = lock->l_start;
1386 lock1->l_end = request->l_start - 1;
1387 lock2->l_start = request->l_end + 1;
1388 lock2->l_end = lock->l_end;
1389 topology[0] = lock1;
1390 topology[1] = lock2;
1391 topology[2] = request;
1392 nvertex = 3;
1393 }
1394 } else if (request->l_start < lock->l_start) {
1395 if (lock_effect == FLK_STAY_SAME) {
1396 request->l_end = lock->l_end;
1397 topology[0] = request;
1398 nvertex = 1;
1399 } else {
1400 lock1 = flk_get_lock();
1401 COPY(lock1, lock);
1402 lock1->l_start = request->l_end + 1;
1403 topology[0] = lock1;
1404 topology[1] = request;
1405 nvertex = 2;
1406 }
1407 } else {
1408 if (lock_effect == FLK_STAY_SAME) {
1409 request->l_start = lock->l_start;
1410 request->l_end = lock->l_end;
1411 topology[0] = request;
1412 nvertex = 1;
1413 } else {
1414 lock1 = flk_get_lock();
1415 COPY(lock1, lock);
1416 lock1->l_start = request->l_end + 1;
1417 topology[0] = lock1;
1418 topology[1] = request;
1419 nvertex = 2;
1420 }
1421 }
1422 } else if (request->l_end > lock->l_end) {
1423 if (request->l_start > lock->l_start) {
1424 if (lock_effect == FLK_STAY_SAME) {
1425 request->l_start = lock->l_start;
1426 topology[0] = request;
1427 nvertex = 1;
1428 } else {
1429 lock1 = flk_get_lock();
1430 COPY(lock1, lock);
1431 lock1->l_end = request->l_start - 1;
1432 topology[0] = lock1;
1433 topology[1] = request;
1434 nvertex = 2;
1435 }
1436 } else if (request->l_start < lock->l_start) {
1437 topology[0] = request;
1438 nvertex = 1;
1439 } else {
1440 topology[0] = request;
1441 nvertex = 1;
1442 }
1443 } else {
1444 if (request->l_start > lock->l_start) {
1445 if (lock_effect == FLK_STAY_SAME) {
1446 request->l_start = lock->l_start;
1447 topology[0] = request;
1448 nvertex = 1;
1449 } else {
1450 lock1 = flk_get_lock();
1451 COPY(lock1, lock);
1452 lock1->l_end = request->l_start - 1;
1453 topology[0] = lock1;
1454 topology[1] = request;
1455 nvertex = 2;
1456 }
1457 } else if (request->l_start < lock->l_start) {
1458 topology[0] = request;
1459 nvertex = 1;
1460 } else {
1461 if (lock_effect != FLK_UNLOCK) {
1462 topology[0] = request;
1463 nvertex = 1;
1464 } else {
1465 flk_delete_active_lock(lock, 0);
1466 flk_wakeup(lock, 1);
1467 flk_free_lock(lock);
1468 CHECK_SLEEPING_LOCKS(gp);
1469 CHECK_ACTIVE_LOCKS(gp);
1470 return (1);
1471 }
1472 }
1473 }
1474
1475 recompute:
1476
1477 /*
1478  * For unlock we don't send the 'request' for recomputing
1479  * dependencies because no lock will add an edge to it.
1480 */
1481
1482 if (lock_effect == FLK_UNLOCK) {
1483 topology[nvertex-1] = NULL;
1484 nvertex--;
1485 }
1486 for (i = 0; i < nvertex; i++) {
1487 topology[i]->l_state |= RECOMPUTE_LOCK;
1488 topology[i]->l_color = NO_COLOR;
1489 }
1490
1491 ASSERT(FIRST_ADJ(lock) == HEAD(lock));
1492
1493 /*
1494  * we remove, from the adjacency lists of all vertices, the edges
1495  * that point to this vertex 'lock'.
1496 */
1497
1498 ep = FIRST_IN(lock);
1499 while (ep != HEAD(lock)) {
1500 ADJ_LIST_REMOVE(ep);
1501 ep = NEXT_IN(ep);
1502 }
1503
1504 flk_delete_active_lock(lock, 0);
1505
1506 /* We are ready for recomputing the dependencies now */
1507
1508 flk_recompute_dependencies(lock, topology, nvertex, 1);
1509
1510 for (i = 0; i < nvertex; i++) {
1511 topology[i]->l_state &= ~RECOMPUTE_LOCK;
1512 topology[i]->l_color = NO_COLOR;
1513 }
1514
1515
1516 if (lock_effect == FLK_UNLOCK) {
1517 nvertex++;
1518 }
1519 for (i = 0; i < nvertex - 1; i++) {
1520 flk_insert_active_lock(topology[i]);
1521 }
1522
1523
1524 if (lock_effect == FLK_DOWNGRADE || lock_effect == FLK_UNLOCK) {
1525 flk_wakeup(lock, 0);
1526 } else {
1527 ep = FIRST_IN(lock);
1528 while (ep != HEAD(lock)) {
1529 lock->l_sedge = NEXT_IN(ep);
1530 IN_LIST_REMOVE(ep);
1531 flk_update_proc_graph(ep, 1);
1532 flk_free_edge(ep);
1533 ep = lock->l_sedge;
1534 }
1535 }
1536 flk_free_lock(lock);
1537
1538 CHECK_SLEEPING_LOCKS(gp);
1539 CHECK_ACTIVE_LOCKS(gp);
1540 return (0);
1541 }
1542
1543 /*
1544 * Insert a lock into the active queue.
1545 */
1546
1547 static void
1548 flk_insert_active_lock(lock_descriptor_t *new_lock)
1549 {
1550 graph_t *gp = new_lock->l_graph;
1551 vnode_t *vp = new_lock->l_vnode;
1552 lock_descriptor_t *first_lock, *lock;
1553
1554 ASSERT(MUTEX_HELD(&gp->gp_mutex));
1555
1556 SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
1557 first_lock = lock;
1558
1559 if (first_lock != NULL) {
1560 for (; (lock->l_vnode == vp &&
1561 lock->l_start < new_lock->l_start); lock = lock->l_next)
1562 ;
1563 } else {
1564 lock = ACTIVE_HEAD(gp);
1565 }
1566
1567 lock->l_prev->l_next = new_lock;
1568 new_lock->l_next = lock;
1569 new_lock->l_prev = lock->l_prev;
1570 lock->l_prev = new_lock;
1571
1572 if (first_lock == NULL || (new_lock->l_start <= first_lock->l_start)) {
1573 vp->v_filocks = (struct filock *)new_lock;
1574 }
1575 flk_set_state(new_lock, FLK_ACTIVE_STATE);
1576 new_lock->l_state |= ACTIVE_LOCK;
1577
1578 CHECK_ACTIVE_LOCKS(gp);
1579 CHECK_SLEEPING_LOCKS(gp);
1580 }
1581
1582 /*
1583  * Delete the active lock: performs one of two functions depending on
1584  * the value of the second parameter. One is to remove it from the active
1585  * lists only; the other is to both remove and free the lock.
1586 */
1587
1588 static void
1589 flk_delete_active_lock(lock_descriptor_t *lock, int free_lock)
1590 {
1591 vnode_t *vp = lock->l_vnode;
1592 graph_t *gp = lock->l_graph;
1593
1594 ASSERT(MUTEX_HELD(&gp->gp_mutex));
1595 if (free_lock)
1596 ASSERT(NO_DEPENDENTS(lock));
1597 ASSERT(NOT_BLOCKED(lock));
1598 ASSERT(IS_ACTIVE(lock));
1599
1600 ASSERT((vp->v_filocks != NULL));
1601
1602 if (vp->v_filocks == (struct filock *)lock) {
1603 vp->v_filocks = (struct filock *)
1604 ((lock->l_next->l_vnode == vp) ? lock->l_next :
1605 NULL);
1606 }
1607 lock->l_next->l_prev = lock->l_prev;
1608 lock->l_prev->l_next = lock->l_next;
1609 lock->l_next = lock->l_prev = NULL;
1610 flk_set_state(lock, FLK_DEAD_STATE);
1611 lock->l_state &= ~ACTIVE_LOCK;
1612
1613 if (free_lock)
1614 flk_free_lock(lock);
1615 CHECK_ACTIVE_LOCKS(gp);
1616 CHECK_SLEEPING_LOCKS(gp);
1617 }
1618
1619 /*
1620 * Insert into the sleep queue.
1621 */
1622
1623 static void
1624 flk_insert_sleeping_lock(lock_descriptor_t *request)
1625 {
1626 graph_t *gp = request->l_graph;
1627 vnode_t *vp = request->l_vnode;
1628 lock_descriptor_t *lock;
1629
1630 ASSERT(MUTEX_HELD(&gp->gp_mutex));
1631 ASSERT(IS_INITIAL(request));
1632
1633 for (lock = gp->sleeping_locks.l_next; (lock != &gp->sleeping_locks &&
1634 lock->l_vnode < vp); lock = lock->l_next)
1635 ;
1636
1637 lock->l_prev->l_next = request;
1638 request->l_prev = lock->l_prev;
1639 lock->l_prev = request;
1640 request->l_next = lock;
1641 flk_set_state(request, FLK_SLEEPING_STATE);
1642 request->l_state |= SLEEPING_LOCK;
1643 }
1644
1645 /*
1646 * Cancelling a sleeping lock implies removing a vertex from the
1647 * dependency graph and therefore we should recompute the dependencies
1648 * of all vertices that have a path to this vertex, w.r.t. all
1649 * vertices reachable from this vertex.
1650 */
1651
1652 void
1653 flk_cancel_sleeping_lock(lock_descriptor_t *request, int remove_from_queue)
1654 {
1655 graph_t *gp = request->l_graph;
1656 vnode_t *vp = request->l_vnode;
1657 lock_descriptor_t **topology = NULL;
1658 edge_t *ep;
1659 lock_descriptor_t *vertex, *lock;
1660 int nvertex = 0;
1661 int i;
1662 lock_descriptor_t *vertex_stack;
1663
1664 STACK_INIT(vertex_stack);
1665
1666 ASSERT(MUTEX_HELD(&gp->gp_mutex));
1667 /*
1668  * Count the number of vertex pointers that have to be allocated:
1669  * all vertices that are reachable from 'request'.
1670 */
1671
1672 STACK_PUSH(vertex_stack, request, l_stack);
1673
1674 while ((vertex = STACK_TOP(vertex_stack)) != NULL) {
1675 STACK_POP(vertex_stack, l_stack);
1676 for (ep = FIRST_ADJ(vertex); ep != HEAD(vertex);
1677 ep = NEXT_ADJ(ep)) {
1678 if (IS_RECOMPUTE(ep->to_vertex))
1679 continue;
1680 ep->to_vertex->l_state |= RECOMPUTE_LOCK;
1681 STACK_PUSH(vertex_stack, ep->to_vertex, l_stack);
1682 nvertex++;
1683 }
1684 }
1685
1686 /*
1687 * allocate memory for holding the vertex pointers
1688 */
1689
1690 if (nvertex) {
1691 topology = kmem_zalloc(nvertex * sizeof (lock_descriptor_t *),
1692 KM_SLEEP);
1693 }
1694
1695 /*
1696 * one more pass to actually store the vertices in the
1697 * allocated array.
1698 * We first check sleeping locks and then active locks
1699 * so that topology array will be in a topological
1700 * order.
1701 */
1702
1703 nvertex = 0;
1704 SET_LOCK_TO_FIRST_SLEEP_VP(gp, lock, vp);
1705
1706 if (lock) {
1707 do {
1708 if (IS_RECOMPUTE(lock)) {
1709 lock->l_index = nvertex;
1710 topology[nvertex++] = lock;
1711 }
1712 lock->l_color = NO_COLOR;
1713 lock = lock->l_next;
1714 } while (lock->l_vnode == vp);
1715 }
1716
1717 SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
1718
1719 if (lock) {
1720 do {
1721 if (IS_RECOMPUTE(lock)) {
1722 lock->l_index = nvertex;
1723 topology[nvertex++] = lock;
1724 }
1725 lock->l_color = NO_COLOR;
1726 lock = lock->l_next;
1727 } while (lock->l_vnode == vp);
1728 }
1729
1730 /*
1731 * remove in and out edges of request
1732 * They are freed after updating proc_graph below.
1733 */
1734
1735 for (ep = FIRST_IN(request); ep != HEAD(request); ep = NEXT_IN(ep)) {
1736 ADJ_LIST_REMOVE(ep);
1737 }
1738
1739
1740 if (remove_from_queue)
1741 REMOVE_SLEEP_QUEUE(request);
1742
1743 /* we are ready to recompute */
1744
1745 flk_recompute_dependencies(request, topology, nvertex, 1);
1746
1747 ep = FIRST_ADJ(request);
1748 while (ep != HEAD(request)) {
1749 IN_LIST_REMOVE(ep);
1750 request->l_sedge = NEXT_ADJ(ep);
1751 ADJ_LIST_REMOVE(ep);
1752 flk_update_proc_graph(ep, 1);
1753 flk_free_edge(ep);
1754 ep = request->l_sedge;
1755 }
1756
1757
1758 /*
1759 * unset the RECOMPUTE flag in those vertices
1760 */
1761
1762 for (i = 0; i < nvertex; i++) {
1763 topology[i]->l_state &= ~RECOMPUTE_LOCK;
1764 }
1765
1766 /*
1767 * free the topology
1768 */
1769 if (nvertex)
1770 kmem_free((void *)topology,
1771 (nvertex * sizeof (lock_descriptor_t *)));
1772 /*
1773 * Possibility of some locks unblocked now
1774 */
1775
1776 flk_wakeup(request, 0);
1777
1778 /*
1779 * we expect to have a correctly recomputed graph now.
1780 */
1781 flk_set_state(request, FLK_DEAD_STATE);
1782 flk_free_lock(request);
1783 CHECK_SLEEPING_LOCKS(gp);
1784 CHECK_ACTIVE_LOCKS(gp);
1785
1786 }
1787
1788 /*
1789  * Uncoloring the graph simply increments the graph's mark value.
1790  * Only when the mark wraps around do we explicitly reset the color of
1791  * every vertex in the graph.
1792 */
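/*
 * Illustrative note (macro details live in flock_impl.h and are assumed
 * here): COLOR(lock) records the graph's current mark in l_color and
 * COLORED(lock) compares l_color against the mark, so bumping gp->mark
 * implicitly uncolors every vertex in constant time:
 *
 *	flk_graph_uncolor(gp);		(every COLORED() test now fails)
 *	COLOR(lock);			(lock->l_color gets gp->mark)
 *	ASSERT(COLORED(lock));
 */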
1793
1794 static void
1795 flk_graph_uncolor(graph_t *gp)
1796 {
1797 lock_descriptor_t *lock;
1798
1799 if (gp->mark == UINT_MAX) {
1800 gp->mark = 1;
1801 for (lock = ACTIVE_HEAD(gp)->l_next; lock != ACTIVE_HEAD(gp);
1802 lock = lock->l_next)
1803 lock->l_color = 0;
1804
1805 for (lock = SLEEPING_HEAD(gp)->l_next; lock != SLEEPING_HEAD(gp);
1806 lock = lock->l_next)
1807 lock->l_color = 0;
1808 } else {
1809 gp->mark++;
1810 }
1811 }
1812
1813 /*
1814 * Wake up locks that are blocked on the given lock.
1815 */
1816
1817 static void
1818 flk_wakeup(lock_descriptor_t *lock, int adj_list_remove)
1819 {
1820 edge_t *ep;
1821 graph_t *gp = lock->l_graph;
1822 lock_descriptor_t *lck;
1823
1824 ASSERT(MUTEX_HELD(&gp->gp_mutex));
1825 if (NO_DEPENDENTS(lock))
1826 return;
1827 ep = FIRST_IN(lock);
1828 do {
1829 /*
1830 * delete the edge from the adjacency list
1831 * of from vertex. if no more adjacent edges
1832 * for this vertex wake this process.
1833 */
1834 lck = ep->from_vertex;
1835 if (adj_list_remove)
1836 ADJ_LIST_REMOVE(ep);
1837 flk_update_proc_graph(ep, 1);
1838 if (NOT_BLOCKED(lck)) {
1839 GRANT_WAKEUP(lck);
1840 }
1841 lock->l_sedge = NEXT_IN(ep);
1842 IN_LIST_REMOVE(ep);
1843 flk_free_edge(ep);
1844 ep = lock->l_sedge;
1845 } while (ep != HEAD(lock));
1846 ASSERT(NO_DEPENDENTS(lock));
1847 }
1848
1849 /*
1850  * Each dependent of 'request' is checked for dependency against the
1851  * locks in 'topology' (so called because the array is, and should be, in
1852  * topological order for this algorithm; if it is not in topological order,
1853  * the inner loop below might add more edges than necessary. Topological
1854  * ordering of vertices satisfies the property that all edges go from left
1855  * to right, i.e., topology[i] can have an edge to topology[j] iff i < j).
1856  * If lock l1 in the dependent set of 'request' is dependent on (blocked
1857  * by) lock l2 in topology but does not have a path to it, we add an edge
1858  * in the inner loop below.
1859  *
1860  * We don't want to add an edge between l1 and l2 if there already
1861  * exists a path from l1 to l2, so care has to be taken for those vertices
1862  * that have two paths to 'request'. These vertices are referred to here
1863  * as barrier locks.
1864  *
1865  * The barriers have to be found (those vertices that originally had two
1866  * paths to 'request') because otherwise we may end up adding unnecessary
1867  * edges to vertices in topology; a barrier vertex could end up with both
1868  * an edge to a vertex in topology and a path to it.
1869 */
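/*
 * Worked example (illustrative): if a dependent lock l1 can reach 'request'
 * both directly and through another dependent lock l3, then l1 is a barrier
 * with l_index = 1 (number of paths minus one).  Its dependencies are
 * recomputed only after both paths have been popped, which keeps us from
 * adding an edge from l1 to a topology vertex it already reaches via l3.
 */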
1870
1871 static void
1872 flk_recompute_dependencies(lock_descriptor_t *request,
1873 lock_descriptor_t **topology, int nvertex, int update_graph)
1874 {
1875 lock_descriptor_t *vertex, *lock;
1876 graph_t *gp = request->l_graph;
1877 int i, count;
1878 int barrier_found = 0;
1879 edge_t *ep;
1880 lock_descriptor_t *vertex_stack;
1881
1882 STACK_INIT(vertex_stack);
1883
1884 ASSERT(MUTEX_HELD(&gp->gp_mutex));
1885 if (nvertex == 0)
1886 return;
1887 flk_graph_uncolor(request->l_graph);
1888 barrier_found = flk_find_barriers(request);
1889 request->l_state |= RECOMPUTE_DONE;
1890
1891 STACK_PUSH(vertex_stack, request, l_stack);
1892 request->l_sedge = FIRST_IN(request);
1893
1894
1895 while ((vertex = STACK_TOP(vertex_stack)) != NULL) {
1896 if (vertex->l_state & RECOMPUTE_DONE) {
1897 count = 0;
1898 goto next_in_edge;
1899 }
1900 if (IS_BARRIER(vertex)) {
1901 /* decrement the barrier count */
1902 if (vertex->l_index) {
1903 vertex->l_index--;
1904 /* this guy will be pushed again anyway ? */
1905 STACK_POP(vertex_stack, l_stack);
1906 if (vertex->l_index == 0) {
1907 /*
1908 * barrier is over we can recompute
1909 * dependencies for this lock in the
1910 * next stack pop
1911 */
1912 vertex->l_state &= ~BARRIER_LOCK;
1913 }
1914 continue;
1915 }
1916 }
1917 vertex->l_state |= RECOMPUTE_DONE;
1918 flk_graph_uncolor(gp);
1919 count = flk_color_reachables(vertex);
1920 for (i = 0; i < nvertex; i++) {
1921 lock = topology[i];
1922 if (COLORED(lock))
1923 continue;
1924 if (BLOCKS(lock, vertex)) {
1925 (void) flk_add_edge(vertex, lock,
1926 NO_CHECK_CYCLE, update_graph);
1927 COLOR(lock);
1928 count++;
1929 count += flk_color_reachables(lock);
1930 }
1931
1932 }
1933
1934 next_in_edge:
1935 if (count == nvertex ||
1936 vertex->l_sedge == HEAD(vertex)) {
1937 /* prune the tree below this */
1938 STACK_POP(vertex_stack, l_stack);
1939 vertex->l_state &= ~RECOMPUTE_DONE;
1940 /* update the barrier locks below this! */
1941 if (vertex->l_sedge != HEAD(vertex) && barrier_found) {
1942 flk_graph_uncolor(gp);
1943 flk_update_barriers(vertex);
1944 }
1945 continue;
1946 }
1947
1948 ep = vertex->l_sedge;
1949 lock = ep->from_vertex;
1950 STACK_PUSH(vertex_stack, lock, l_stack);
1951 lock->l_sedge = FIRST_IN(lock);
1952 vertex->l_sedge = NEXT_IN(ep);
1953 }
1954
1955 }
1956
1957 /*
1958  * Color all as-yet-uncolored vertices reachable from 'vertex' and count
1959  * those that belong to topology (here, those with RECOMPUTE_LOCK set).
1960 *
1961 * Note: we need to use a different stack_link l_stack1 because this is
1962 * called from flk_recompute_dependencies() that already uses a stack with
1963 * l_stack as stack_link.
1964 */
1965
1966 static int
1967 flk_color_reachables(lock_descriptor_t *vertex)
1968 {
1969 lock_descriptor_t *ver, *lock;
1970 int count;
1971 edge_t *ep;
1972 lock_descriptor_t *vertex_stack;
1973
1974 STACK_INIT(vertex_stack);
1975
1976 STACK_PUSH(vertex_stack, vertex, l_stack1);
1977 count = 0;
1978 while ((ver = STACK_TOP(vertex_stack)) != NULL) {
1979
1980 STACK_POP(vertex_stack, l_stack1);
1981 for (ep = FIRST_ADJ(ver); ep != HEAD(ver);
1982 ep = NEXT_ADJ(ep)) {
1983 lock = ep->to_vertex;
1984 if (COLORED(lock))
1985 continue;
1986 COLOR(lock);
1987 if (IS_RECOMPUTE(lock))
1988 count++;
1989 STACK_PUSH(vertex_stack, lock, l_stack1);
1990 }
1991
1992 }
1993 return (count);
1994 }
1995
1996 /*
1997 * Called from flk_recompute_dependencies() this routine decrements
1998 * the barrier count of barrier vertices that are reachable from lock.
1999 */
2000
2001 static void
2002 flk_update_barriers(lock_descriptor_t *lock)
2003 {
2004 lock_descriptor_t *vertex, *lck;
2005 edge_t *ep;
2006 lock_descriptor_t *vertex_stack;
2007
2008 STACK_INIT(vertex_stack);
2009
2010 STACK_PUSH(vertex_stack, lock, l_stack1);
2011
2012 while ((vertex = STACK_TOP(vertex_stack)) != NULL) {
2013 STACK_POP(vertex_stack, l_stack1);
2014 for (ep = FIRST_IN(vertex); ep != HEAD(vertex);
2015 ep = NEXT_IN(ep)) {
2016 lck = ep->from_vertex;
2017 if (COLORED(lck)) {
2018 if (IS_BARRIER(lck)) {
2019 ASSERT(lck->l_index > 0);
2020 lck->l_index--;
2021 if (lck->l_index == 0)
2022 lck->l_state &= ~BARRIER_LOCK;
2023 }
2024 continue;
2025 }
2026 COLOR(lck);
2027 if (IS_BARRIER(lck)) {
2028 ASSERT(lck->l_index > 0);
2029 lck->l_index--;
2030 if (lck->l_index == 0)
2031 lck->l_state &= ~BARRIER_LOCK;
2032 }
2033 STACK_PUSH(vertex_stack, lck, l_stack1);
2034 }
2035 }
2036 }
2037
2038 /*
2039  * Finds all vertices that are reachable from 'lock' more than once,
2040  * marks them as barrier vertices and increments their barrier count.
2041  * The barrier count is the total number of paths from lock to that
2042  * vertex, minus one.
2043 */
2044
2045 static int
2046 flk_find_barriers(lock_descriptor_t *lock)
2047 {
2048 lock_descriptor_t *vertex, *lck;
2049 int found = 0;
2050 edge_t *ep;
2051 lock_descriptor_t *vertex_stack;
2052
2053 STACK_INIT(vertex_stack);
2054
2055 STACK_PUSH(vertex_stack, lock, l_stack1);
2056
2057 while ((vertex = STACK_TOP(vertex_stack)) != NULL) {
2058 STACK_POP(vertex_stack, l_stack1);
2059 for (ep = FIRST_IN(vertex); ep != HEAD(vertex);
2060 ep = NEXT_IN(ep)) {
2061 lck = ep->from_vertex;
2062 if (COLORED(lck)) {
2063 /* this is a barrier */
2064 lck->l_state |= BARRIER_LOCK;
2065 /* index will have barrier count */
2066 lck->l_index++;
2067 if (!found)
2068 found = 1;
2069 continue;
2070 }
2071 COLOR(lck);
2072 lck->l_index = 0;
2073 STACK_PUSH(vertex_stack, lck, l_stack1);
2074 }
2075 }
2076 return (found);
2077 }
2078
2079 /*
2080 * Finds the first lock that is mainly responsible for blocking this
2081 * request. If there is no such lock, request->l_flock.l_type is set to
2082 * F_UNLCK. Otherwise, request->l_flock is filled in with the particulars
2083 * of the blocking lock.
2084 *
2085 * Note: It is possible a request is blocked by a sleeping lock because
2086 * of the fairness policy used in flk_process_request() to construct the
2087 * dependencies. (see comments before flk_process_request()).
2088 */
2089
2090 static void
2091 flk_get_first_blocking_lock(lock_descriptor_t *request)
2092 {
2093 graph_t *gp = request->l_graph;
2094 vnode_t *vp = request->l_vnode;
2095 lock_descriptor_t *lock, *blocker;
2096
2097 ASSERT(MUTEX_HELD(&gp->gp_mutex));
2098 blocker = NULL;
2099 SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
2100
2101 if (lock) {
2102 do {
2103 if (BLOCKS(lock, request)) {
2104 blocker = lock;
2105 break;
2106 }
2107 lock = lock->l_next;
2108 } while (lock->l_vnode == vp);
2109 }
2110
2111 if (blocker == NULL && request->l_flock.l_type == F_RDLCK) {
2112 /*
2113 * No active lock is blocking this request, but if a read
2114 * lock is requested, it may also get blocked by a waiting
2115 * writer. So search all sleeping locks and see if there is
2116 * a writer waiting.
2117 */
2118 SET_LOCK_TO_FIRST_SLEEP_VP(gp, lock, vp);
2119 if (lock) {
2120 do {
2121 if (BLOCKS(lock, request)) {
2122 blocker = lock;
2123 break;
2124 }
2125 lock = lock->l_next;
2126 } while (lock->l_vnode == vp);
2127 }
2128 }
2129
2130 if (blocker) {
2131 report_blocker(blocker, request);
2132 } else
2133 request->l_flock.l_type = F_UNLCK;
2134 }
2135
2136 /*
2137 * Get the graph_t structure associated with a vnode.
2138 * If 'initialize' is non-zero, and the graph_t structure for this vnode has
2139 * not yet been initialized, then a new element is allocated and returned.
2140 */
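/*
 * Usage sketch (hedged; the calling code below is hypothetical): callers
 * that merely want to inspect existing locks pass FLK_USE_GRAPH and must
 * tolerate a NULL return, while callers about to insert a lock pass
 * FLK_INIT_GRAPH and always get a graph back.
 *
 *	gp = flk_get_lock_graph(vp, FLK_USE_GRAPH);
 *	if (gp == NULL)
 *		return (0);		(nothing locked on this vnode yet)
 *
 *	gp = flk_get_lock_graph(vp, FLK_INIT_GRAPH);	(never NULL)
 */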
2141 graph_t *
2142 flk_get_lock_graph(vnode_t *vp, int initialize)
2143 {
2144 graph_t *gp;
2145 graph_t *gp_alloc = NULL;
2146 int index = HASH_INDEX(vp);
2147
2148 if (initialize == FLK_USE_GRAPH) {
2149 mutex_enter(&flock_lock);
2150 gp = lock_graph[index];
2151 mutex_exit(&flock_lock);
2152 return (gp);
2153 }
2154
2155 ASSERT(initialize == FLK_INIT_GRAPH);
2156
2157 if (lock_graph[index] == NULL) {
2158
2159 gp_alloc = kmem_zalloc(sizeof (graph_t), KM_SLEEP);
2160
2161 /* Initialize the graph */
2162
2163 gp_alloc->active_locks.l_next =
2164 gp_alloc->active_locks.l_prev =
2165 (lock_descriptor_t *)ACTIVE_HEAD(gp_alloc);
2166 gp_alloc->sleeping_locks.l_next =
2167 gp_alloc->sleeping_locks.l_prev =
2168 (lock_descriptor_t *)SLEEPING_HEAD(gp_alloc);
2169 gp_alloc->index = index;
2170 mutex_init(&gp_alloc->gp_mutex, NULL, MUTEX_DEFAULT, NULL);
2171 }
2172
2173 mutex_enter(&flock_lock);
2174
2175 gp = lock_graph[index];
2176
2177 /* Recheck the value within flock_lock */
2178 if (gp == NULL) {
2179 struct flock_globals *fg;
2180
2181 /* We must have previously allocated the graph_t structure */
2182 ASSERT(gp_alloc != NULL);
2183 lock_graph[index] = gp = gp_alloc;
2184 /*
2185 * The lockmgr status is only needed if KLM is loaded.
2186 */
2187 if (flock_zone_key != ZONE_KEY_UNINITIALIZED) {
2188 fg = flk_get_globals();
2189 fg->lockmgr_status[index] = fg->flk_lockmgr_status;
2190 }
2191 }
2192
2193 mutex_exit(&flock_lock);
2194
2195 if ((gp_alloc != NULL) && (gp != gp_alloc)) {
2196 /* There was a race to allocate the graph_t and we lost */
2197 mutex_destroy(&gp_alloc->gp_mutex);
2198 kmem_free(gp_alloc, sizeof (graph_t));
2199 }
2200
2201 return (gp);
2202 }
2203
2204 /*
2205 * PSARC case 1997/292
2206 */
2207 int
2208 cl_flk_has_remote_locks_for_nlmid(vnode_t *vp, int nlmid)
2209 {
2210 lock_descriptor_t *lock;
2211 int result = 0;
2212 graph_t *gp;
2213 int lock_nlmid;
2214
2215 /*
2216 * Check to see if node is booted as a cluster. If not, return.
2217 */
2218 if ((cluster_bootflags & CLUSTER_BOOTED) == 0) {
2219 return (0);
2220 }
2221
2222 gp = flk_get_lock_graph(vp, FLK_USE_GRAPH);
2223 if (gp == NULL) {
2224 return (0);
2225 }
2226
2227 mutex_enter(&gp->gp_mutex);
2228
2229 SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
2230
2231 if (lock) {
2232 while (lock->l_vnode == vp) {
2233 /* get NLM id from sysid */
2234 lock_nlmid = GETNLMID(lock->l_flock.l_sysid);
2235
2236 /*
2237 * If NLM server request _and_ nlmid of lock matches
2238 * nlmid of argument, then we've found a remote lock.
2239 */
2240 if (IS_LOCKMGR(lock) && nlmid == lock_nlmid) {
2241 result = 1;
2242 goto done;
2243 }
2244 lock = lock->l_next;
2245 }
2246 }
2247
2248 SET_LOCK_TO_FIRST_SLEEP_VP(gp, lock, vp);
2249
2250 if (lock) {
2251 while (lock->l_vnode == vp) {
2252 /* get NLM id from sysid */
2253 lock_nlmid = GETNLMID(lock->l_flock.l_sysid);
2254
2255 /*
2256 * If NLM server request _and_ nlmid of lock matches
2257 * nlmid of argument, then we've found a remote lock.
2258 */
2259 if (IS_LOCKMGR(lock) && nlmid == lock_nlmid) {
2260 result = 1;
2261 goto done;
2262 }
2263 lock = lock->l_next;
2264 }
2265 }
2266
2267 done:
2268 mutex_exit(&gp->gp_mutex);
2269 return (result);
2270 }
2271
2272 /*
2273 * Determine whether there are any locks for the given vnode with a remote
2274 * sysid. Returns zero if not, non-zero if there are.
2275 *
2276 * Note that the return value from this function is potentially invalid
2277 * once it has been returned. The caller is responsible for providing its
2278 * own synchronization mechanism to ensure that the return value is useful
2279 * (e.g., see nfs_lockcompletion()).
2280 */
2281 int
2282 flk_has_remote_locks(vnode_t *vp)
2283 {
2284 lock_descriptor_t *lock;
2285 int result = 0;
2286 graph_t *gp;
2287
2288 gp = flk_get_lock_graph(vp, FLK_USE_GRAPH);
2289 if (gp == NULL) {
2290 return (0);
2291 }
2292
2293 mutex_enter(&gp->gp_mutex);
2294
2295 SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
2296
2297 if (lock) {
2298 while (lock->l_vnode == vp) {
2299 if (IS_REMOTE(lock)) {
2300 result = 1;
2301 goto done;
2302 }
2303 lock = lock->l_next;
2304 }
2305 }
2306
2307 SET_LOCK_TO_FIRST_SLEEP_VP(gp, lock, vp);
2308
2309 if (lock) {
2310 while (lock->l_vnode == vp) {
2311 if (IS_REMOTE(lock)) {
2312 result = 1;
2313 goto done;
2314 }
2315 lock = lock->l_next;
2316 }
2317 }
2318
2319 done:
2320 mutex_exit(&gp->gp_mutex);
2321 return (result);
2322 }
2323
2324 /*
2325 * Determine whether there are any locks for the given vnode with a remote
2326  * sysid matching the given sysid.
2327 * Used by the new (open source) NFS Lock Manager (NLM)
2328 */
2329 int
2330 flk_has_remote_locks_for_sysid(vnode_t *vp, int sysid)
2331 {
2332 lock_descriptor_t *lock;
2333 int result = 0;
2334 graph_t *gp;
2335
2336 if (sysid == 0)
2337 return (0);
2338
2339 gp = flk_get_lock_graph(vp, FLK_USE_GRAPH);
2340 if (gp == NULL) {
2341 return (0);
2342 }
2343
2344 mutex_enter(&gp->gp_mutex);
2345
2346 SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
2347
2348 if (lock) {
2349 while (lock->l_vnode == vp) {
2350 if (lock->l_flock.l_sysid == sysid) {
2351 result = 1;
2352 goto done;
2353 }
2354 lock = lock->l_next;
2355 }
2356 }
2357
2358 SET_LOCK_TO_FIRST_SLEEP_VP(gp, lock, vp);
2359
2360 if (lock) {
2361 while (lock->l_vnode == vp) {
2362 if (lock->l_flock.l_sysid == sysid) {
2363 result = 1;
2364 goto done;
2365 }
2366 lock = lock->l_next;
2367 }
2368 }
2369
2370 done:
2371 mutex_exit(&gp->gp_mutex);
2372 return (result);
2373 }
2374
2375 /*
2376 * Determine if there are any locks owned by the given sysid.
2377 * Returns zero if not, non-zero if there are. Note that this return code
2378 * could be derived from flk_get_{sleeping,active}_locks, but this routine
2379 * avoids all the memory allocations of those routines.
2380 *
2381 * This routine has the same synchronization issues as
2382 * flk_has_remote_locks.
2383 */
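/*
 * Usage sketch (hypothetical caller): check whether a remote client
 * identified by sysid still holds or is waiting on any locks before
 * tearing down its state.
 *
 *	if (flk_sysid_has_locks(sysid,
 *	    FLK_QUERY_ACTIVE | FLK_QUERY_SLEEPING))
 *		cleanup_client_state(sysid);	(hypothetical helper)
 */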
2384
2385 int
2386 flk_sysid_has_locks(int sysid, int lck_type)
2387 {
2388 int has_locks = 0;
2389 lock_descriptor_t *lock;
2390 graph_t *gp;
2391 int i;
2392
2393 for (i = 0; i < HASH_SIZE && !has_locks; i++) {
2394 mutex_enter(&flock_lock);
2395 gp = lock_graph[i];
2396 mutex_exit(&flock_lock);
2397 if (gp == NULL) {
2398 continue;
2399 }
2400
2401 mutex_enter(&gp->gp_mutex);
2402
2403 if (lck_type & FLK_QUERY_ACTIVE) {
2404 for (lock = ACTIVE_HEAD(gp)->l_next;
2405 lock != ACTIVE_HEAD(gp) && !has_locks;
2406 lock = lock->l_next) {
2407 if (lock->l_flock.l_sysid == sysid)
2408 has_locks = 1;
2409 }
2410 }
2411
2412 if (lck_type & FLK_QUERY_SLEEPING) {
2413 for (lock = SLEEPING_HEAD(gp)->l_next;
2414 lock != SLEEPING_HEAD(gp) && !has_locks;
2415 lock = lock->l_next) {
2416 if (lock->l_flock.l_sysid == sysid)
2417 has_locks = 1;
2418 }
2419 }
2420 mutex_exit(&gp->gp_mutex);
2421 }
2422
2423 return (has_locks);
2424 }
2425
2426
2427 /*
2428 * PSARC case 1997/292
2429 *
2430  * Requires: "sysid" is a pair [nlmid, sysid]. The lower half is a 16-bit
2431 * quantity, the real sysid generated by the NLM server; the upper half
2432 * identifies the node of the cluster where the NLM server ran.
2433 * This routine is only called by an NLM server running in a cluster.
2434 * Effects: Remove all locks held on behalf of the client identified
2435 * by "sysid."
2436 */
2437 void
2438 cl_flk_remove_locks_by_sysid(int sysid)
2439 {
2440 graph_t *gp;
2441 int i;
2442 lock_descriptor_t *lock, *nlock;
2443
2444 /*
2445 * Check to see if node is booted as a cluster. If not, return.
2446 */
2447 if ((cluster_bootflags & CLUSTER_BOOTED) == 0) {
2448 return;
2449 }
2450
2451 ASSERT(sysid != 0);
2452 for (i = 0; i < HASH_SIZE; i++) {
2453 mutex_enter(&flock_lock);
2454 gp = lock_graph[i];
2455 mutex_exit(&flock_lock);
2456
2457 if (gp == NULL)
2458 continue;
2459
2460 mutex_enter(&gp->gp_mutex); /* get mutex on lock graph */
2461
2462 /* signal sleeping requests so that they bail out */
2463 lock = SLEEPING_HEAD(gp)->l_next;
2464 while (lock != SLEEPING_HEAD(gp)) {
2465 nlock = lock->l_next;
2466 if (lock->l_flock.l_sysid == sysid) {
2467 INTERRUPT_WAKEUP(lock);
2468 }
2469 lock = nlock;
2470 }
2471
2472 /* delete active locks */
2473 lock = ACTIVE_HEAD(gp)->l_next;
2474 while (lock != ACTIVE_HEAD(gp)) {
2475 nlock = lock->l_next;
2476 if (lock->l_flock.l_sysid == sysid) {
2477 flk_delete_active_lock(lock, 0);
2478 flk_wakeup(lock, 1);
2479 flk_free_lock(lock);
2480 }
2481 lock = nlock;
2482 }
2483 mutex_exit(&gp->gp_mutex); /* release mutex on lock graph */
2484 }
2485 }
2486
2487 /*
2488  * Delete all locks in the system that belong to the sysid of the request.
2489 */
2490
2491 static void
2492 flk_delete_locks_by_sysid(lock_descriptor_t *request)
2493 {
2494 int sysid = request->l_flock.l_sysid;
2495 lock_descriptor_t *lock, *nlock;
2496 graph_t *gp;
2497 int i;
2498
2499 ASSERT(MUTEX_HELD(&request->l_graph->gp_mutex));
2500 ASSERT(sysid != 0);
2501
2502 mutex_exit(&request->l_graph->gp_mutex);
2503
2504 for (i = 0; i < HASH_SIZE; i++) {
2505 mutex_enter(&flock_lock);
2506 gp = lock_graph[i];
2507 mutex_exit(&flock_lock);
2508
2509 if (gp == NULL)
2510 continue;
2511
2512 mutex_enter(&gp->gp_mutex);
2513
2514 /* signal sleeping requests so that they bail out */
2515 lock = SLEEPING_HEAD(gp)->l_next;
2516 while (lock != SLEEPING_HEAD(gp)) {
2517 nlock = lock->l_next;
2518 if (lock->l_flock.l_sysid == sysid) {
2519 INTERRUPT_WAKEUP(lock);
2520 }
2521 lock = nlock;
2522 }
2523
2524 /* delete active locks */
2525 lock = ACTIVE_HEAD(gp)->l_next;
2526 while (lock != ACTIVE_HEAD(gp)) {
2527 nlock = lock->l_next;
2528 if (lock->l_flock.l_sysid == sysid) {
2529 flk_delete_active_lock(lock, 0);
2530 flk_wakeup(lock, 1);
2531 flk_free_lock(lock);
2532 }
2533 lock = nlock;
2534 }
2535 mutex_exit(&gp->gp_mutex);
2536 }
2537
2538 mutex_enter(&request->l_graph->gp_mutex);
2539 }
2540
2541 /*
2542 * Clustering: Deletes PXFS locks
2543 * Effects: Delete all locks on files in the given file system and with the
2544 * given PXFS id.
2545 */
2546 void
2547 cl_flk_delete_pxfs_locks(struct vfs *vfsp, int pxfsid)
2548 {
2549 lock_descriptor_t *lock, *nlock;
2550 graph_t *gp;
2551 int i;
2552
2553 for (i = 0; i < HASH_SIZE; i++) {
2554 mutex_enter(&flock_lock);
2555 gp = lock_graph[i];
2556 mutex_exit(&flock_lock);
2557
2558 if (gp == NULL)
2559 continue;
2560
2561 mutex_enter(&gp->gp_mutex);
2562
2563 /* signal sleeping requests so that they bail out */
2564 lock = SLEEPING_HEAD(gp)->l_next;
2565 while (lock != SLEEPING_HEAD(gp)) {
2566 nlock = lock->l_next;
2567 if (lock->l_vnode->v_vfsp == vfsp) {
2568 ASSERT(IS_PXFS(lock));
2569 if (GETPXFSID(lock->l_flock.l_sysid) ==
2570 pxfsid) {
2571 flk_set_state(lock,
2572 FLK_CANCELLED_STATE);
2573 flk_cancel_sleeping_lock(lock, 1);
2574 }
2575 }
2576 lock = nlock;
2577 }
2578
2579 /* delete active locks */
2580 lock = ACTIVE_HEAD(gp)->l_next;
2581 while (lock != ACTIVE_HEAD(gp)) {
2582 nlock = lock->l_next;
2583 if (lock->l_vnode->v_vfsp == vfsp) {
2584 ASSERT(IS_PXFS(lock));
2585 if (GETPXFSID(lock->l_flock.l_sysid) ==
2586 pxfsid) {
2587 flk_delete_active_lock(lock, 0);
2588 flk_wakeup(lock, 1);
2589 flk_free_lock(lock);
2590 }
2591 }
2592 lock = nlock;
2593 }
2594 mutex_exit(&gp->gp_mutex);
2595 }
2596 }
2597
2598 /*
2599 * Search for a sleeping lock manager lock which matches exactly this lock
2600 * request; if one is found, fake a signal to cancel it.
2601 *
2602 * Return 1 if a matching lock was found, 0 otherwise.
2603 */
2604
2605 static int
2606 flk_canceled(lock_descriptor_t *request)
2607 {
2608 lock_descriptor_t *lock, *nlock;
2609 graph_t *gp = request->l_graph;
2610 vnode_t *vp = request->l_vnode;
2611
2612 ASSERT(MUTEX_HELD(&gp->gp_mutex));
2613 ASSERT(IS_LOCKMGR(request));
2614 SET_LOCK_TO_FIRST_SLEEP_VP(gp, lock, vp);
2615
2616 if (lock) {
2617 while (lock->l_vnode == vp) {
2618 nlock = lock->l_next;
2619 if (SAME_OWNER(lock, request) &&
2620 lock->l_start == request->l_start &&
2621 lock->l_end == request->l_end) {
2622 INTERRUPT_WAKEUP(lock);
2623 return (1);
2624 }
2625 lock = nlock;
2626 }
2627 }
2628 return (0);
2629 }
2630
2631 /*
2632 * Remove all the locks for the vnode belonging to the given pid and sysid.
2633 */
2634
2635 void
2636 cleanlocks(vnode_t *vp, pid_t pid, int sysid)
2637 {
2638 graph_t *gp;
2639 lock_descriptor_t *lock, *nlock;
2640 lock_descriptor_t *link_stack;
2641
2642 STACK_INIT(link_stack);
2643
2644 gp = flk_get_lock_graph(vp, FLK_USE_GRAPH);
2645
2646 if (gp == NULL)
2647 return;
2648 mutex_enter(&gp->gp_mutex);
2649
2650 CHECK_SLEEPING_LOCKS(gp);
2651 CHECK_ACTIVE_LOCKS(gp);
2652
2653 SET_LOCK_TO_FIRST_SLEEP_VP(gp, lock, vp);
2654
2655 if (lock) {
2656 do {
2657 nlock = lock->l_next;
2658 if ((lock->l_flock.l_pid == pid ||
2659 pid == IGN_PID) &&
2660 lock->l_flock.l_sysid == sysid) {
2661 CANCEL_WAKEUP(lock);
2662 }
2663 lock = nlock;
2664 } while (lock->l_vnode == vp);
2665 }
2666
2667 SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
2668
2669 if (lock) {
2670 do {
2671 nlock = lock->l_next;
2672 if ((lock->l_flock.l_pid == pid ||
2673 pid == IGN_PID) &&
2674 lock->l_flock.l_sysid == sysid) {
2675 flk_delete_active_lock(lock, 0);
2676 STACK_PUSH(link_stack, lock, l_stack);
2677 }
2678 lock = nlock;
2679 } while (lock->l_vnode == vp);
2680 }
2681
2682 while ((lock = STACK_TOP(link_stack)) != NULL) {
2683 STACK_POP(link_stack, l_stack);
2684 flk_wakeup(lock, 1);
2685 flk_free_lock(lock);
2686 }
2687
2688 CHECK_SLEEPING_LOCKS(gp);
2689 CHECK_ACTIVE_LOCKS(gp);
2690 CHECK_OWNER_LOCKS(gp, pid, sysid, vp);
2691 mutex_exit(&gp->gp_mutex);
2692 }
2693
2694
2695 /*
2696 * Called from 'fs' read and write routines for files that have mandatory
2697 * locking enabled.
2698 */
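/*
 * Caller sketch (hypothetical; the surrounding read path, "va", "uiop"
 * and "ct" are assumed, not taken from this file): a filesystem whose
 * vnode has mandatory locking enabled would typically check the I/O
 * range before doing the transfer.
 *
 *	if (MANDLOCK(vp, va.va_mode)) {
 *		error = chklock(vp, FREAD, uiop->uio_loffset,
 *		    uiop->uio_resid, uiop->uio_fmode, ct);
 *		if (error != 0)
 *			return (error);
 *	}
 */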
2699
2700 int
2701 chklock(struct vnode *vp, int iomode, u_offset_t offset, ssize_t len, int fmode,
2702 caller_context_t *ct)
2703 {
2704 register int i;
2705 struct flock64 bf;
2706 int error = 0;
2707
2708 bf.l_type = (iomode & FWRITE) ? F_WRLCK : F_RDLCK;
2709 bf.l_whence = 0;
2710 bf.l_start = offset;
2711 bf.l_len = len;
2712 if (ct == NULL) {
2713 bf.l_pid = curproc->p_pid;
2714 bf.l_sysid = 0;
2715 } else {
2716 bf.l_pid = ct->cc_pid;
2717 bf.l_sysid = ct->cc_sysid;
2718 }
2719 i = (fmode & (FNDELAY|FNONBLOCK)) ? INOFLCK : INOFLCK|SLPFLCK;
2720 if ((i = reclock(vp, &bf, i, 0, offset, NULL)) != 0 ||
2721 bf.l_type != F_UNLCK)
2722 error = i ? i : EAGAIN;
2723 return (error);
2724 }
2725
2726 /*
2727 * convoff - converts the given data (start, whence) to the
2728 * given whence.
2729 */
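/*
 * Example (sketch, hypothetical caller): rewrite a caller-supplied
 * flock64 so that l_start becomes relative to the beginning of the file
 * (whence 0), given the current offset of the open file.
 *
 *	error = convoff(vp, &bf, 0, fp->f_offset);
 *	if (error != 0)
 *		return (error);
 */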
2730 int
2731 convoff(struct vnode *vp, struct flock64 *lckdat, int whence, offset_t offset)
2732 {
2733 int error;
2734 struct vattr vattr;
2735
2736 if ((lckdat->l_whence == 2) || (whence == 2)) {
2737 vattr.va_mask = AT_SIZE;
2738 if (error = VOP_GETATTR(vp, &vattr, 0, CRED(), NULL))
2739 return (error);
2740 }
2741
2742 switch (lckdat->l_whence) {
2743 case 1:
2744 lckdat->l_start += offset;
2745 break;
2746 case 2:
2747 lckdat->l_start += vattr.va_size;
2748 /* FALLTHRU */
2749 case 0:
2750 break;
2751 default:
2752 return (EINVAL);
2753 }
2754
2755 if (lckdat->l_start < 0)
2756 return (EINVAL);
2757
2758 switch (whence) {
2759 case 1:
2760 lckdat->l_start -= offset;
2761 break;
2762 case 2:
2763 lckdat->l_start -= vattr.va_size;
2764 /* FALLTHRU */
2765 case 0:
2766 break;
2767 default:
2768 return (EINVAL);
2769 }
2770
2771 lckdat->l_whence = (short)whence;
2772 return (0);
2773 }
2774
2775
2776 /* proc_graph function definitions */
2777
2778 /*
2779  * Checks for deadlock due to the new 'lock'. If a deadlock is found, the
2780  * edges of this lock are freed and 1 is returned.
2781 */
2782
2783 static int
2784 flk_check_deadlock(lock_descriptor_t *lock)
2785 {
2786 proc_vertex_t *start_vertex, *pvertex;
2787 proc_vertex_t *dvertex;
2788 proc_edge_t *pep, *ppep;
2789 edge_t *ep, *nep;
2790 proc_vertex_t *process_stack;
2791
2792 STACK_INIT(process_stack);
2793
2794 mutex_enter(&flock_lock);
2795 start_vertex = flk_get_proc_vertex(lock);
2796 ASSERT(start_vertex != NULL);
2797
2798 /* construct the edges from this process to other processes */
2799
2800 ep = FIRST_ADJ(lock);
2801 while (ep != HEAD(lock)) {
2802 proc_vertex_t *adj_proc;
2803
2804 adj_proc = flk_get_proc_vertex(ep->to_vertex);
2805 for (pep = start_vertex->edge; pep != NULL; pep = pep->next) {
2806 if (pep->to_proc == adj_proc) {
2807 ASSERT(pep->refcount);
2808 pep->refcount++;
2809 break;
2810 }
2811 }
2812 if (pep == NULL) {
2813 pep = flk_get_proc_edge();
2814 pep->to_proc = adj_proc;
2815 pep->refcount = 1;
2816 adj_proc->incount++;
2817 pep->next = start_vertex->edge;
2818 start_vertex->edge = pep;
2819 }
2820 ep = NEXT_ADJ(ep);
2821 }
2822
2823 ep = FIRST_IN(lock);
2824
2825 while (ep != HEAD(lock)) {
2826 proc_vertex_t *in_proc;
2827
2828 in_proc = flk_get_proc_vertex(ep->from_vertex);
2829
2830 for (pep = in_proc->edge; pep != NULL; pep = pep->next) {
2831 if (pep->to_proc == start_vertex) {
2832 ASSERT(pep->refcount);
2833 pep->refcount++;
2834 break;
2835 }
2836 }
2837 if (pep == NULL) {
2838 pep = flk_get_proc_edge();
2839 pep->to_proc = start_vertex;
2840 pep->refcount = 1;
2841 start_vertex->incount++;
2842 pep->next = in_proc->edge;
2843 in_proc->edge = pep;
2844 }
2845 ep = NEXT_IN(ep);
2846 }
2847
2848 if (start_vertex->incount == 0) {
2849 mutex_exit(&flock_lock);
2850 return (0);
2851 }
2852
2853 flk_proc_graph_uncolor();
2854
2855 start_vertex->p_sedge = start_vertex->edge;
2856
2857 STACK_PUSH(process_stack, start_vertex, p_stack);
2858
2859 while ((pvertex = STACK_TOP(process_stack)) != NULL) {
2860 for (pep = pvertex->p_sedge; pep != NULL; pep = pep->next) {
2861 dvertex = pep->to_proc;
2862 if (!PROC_ARRIVED(dvertex)) {
2863 STACK_PUSH(process_stack, dvertex, p_stack);
2864 dvertex->p_sedge = dvertex->edge;
2865 PROC_ARRIVE(pvertex);
2866 pvertex->p_sedge = pep->next;
2867 break;
2868 }
2869 if (!PROC_DEPARTED(dvertex))
2870 goto deadlock;
2871 }
2872 if (pep == NULL) {
2873 PROC_DEPART(pvertex);
2874 STACK_POP(process_stack, p_stack);
2875 }
2876 }
2877 mutex_exit(&flock_lock);
2878 return (0);
2879
2880 deadlock:
2881
2882 /* we remove all lock edges and proc edges */
2883
2884 ep = FIRST_ADJ(lock);
2885 while (ep != HEAD(lock)) {
2886 proc_vertex_t *adj_proc;
2887 adj_proc = flk_get_proc_vertex(ep->to_vertex);
2888 nep = NEXT_ADJ(ep);
2889 IN_LIST_REMOVE(ep);
2890 ADJ_LIST_REMOVE(ep);
2891 flk_free_edge(ep);
2892 ppep = start_vertex->edge;
2893 for (pep = start_vertex->edge; pep != NULL; ppep = pep,
2894 pep = ppep->next) {
2895 if (pep->to_proc == adj_proc) {
2896 pep->refcount--;
2897 if (pep->refcount == 0) {
2898 if (pep == ppep) {
2899 start_vertex->edge = pep->next;
2900 } else {
2901 ppep->next = pep->next;
2902 }
2903 adj_proc->incount--;
2904 flk_proc_release(adj_proc);
2905 flk_free_proc_edge(pep);
2906 }
2907 break;
2908 }
2909 }
2910 ep = nep;
2911 }
2912 ep = FIRST_IN(lock);
2913 while (ep != HEAD(lock)) {
2914 proc_vertex_t *in_proc;
2915 in_proc = flk_get_proc_vertex(ep->from_vertex);
2916 nep = NEXT_IN(ep);
2917 IN_LIST_REMOVE(ep);
2918 ADJ_LIST_REMOVE(ep);
2919 flk_free_edge(ep);
2920 ppep = in_proc->edge;
2921 for (pep = in_proc->edge; pep != NULL; ppep = pep,
2922 pep = ppep->next) {
2923 if (pep->to_proc == start_vertex) {
2924 pep->refcount--;
2925 if (pep->refcount == 0) {
2926 if (pep == ppep) {
2927 in_proc->edge = pep->next;
2928 } else {
2929 ppep->next = pep->next;
2930 }
2931 start_vertex->incount--;
2932 flk_proc_release(in_proc);
2933 flk_free_proc_edge(pep);
2934 }
2935 break;
2936 }
2937 }
2938 ep = nep;
2939 }
2940 flk_proc_release(start_vertex);
2941 mutex_exit(&flock_lock);
2942 return (1);
2943 }
2944
2945 /*
2946  * Get a proc vertex. If the lock's pvertex value indexes a matching proc
2947  * vertex in the list, we return it; otherwise we allocate one. If
2948  * necessary, we also grow the list of vertices.
2949 */
2950
2951 static proc_vertex_t *
2952 flk_get_proc_vertex(lock_descriptor_t *lock)
2953 {
2954 int i;
2955 proc_vertex_t *pv;
2956 proc_vertex_t **palloc;
2957
2958 ASSERT(MUTEX_HELD(&flock_lock));
2959 if (lock->pvertex != -1) {
2960 ASSERT(lock->pvertex >= 0);
2961 pv = pgraph.proc[lock->pvertex];
2962 if (pv != NULL && PROC_SAME_OWNER(lock, pv)) {
2963 return (pv);
2964 }
2965 }
2966 for (i = 0; i < pgraph.gcount; i++) {
2967 pv = pgraph.proc[i];
2968 if (pv != NULL && PROC_SAME_OWNER(lock, pv)) {
2969 lock->pvertex = pv->index = i;
2970 return (pv);
2971 }
2972 }
2973 pv = kmem_zalloc(sizeof (struct proc_vertex), KM_SLEEP);
2974 pv->pid = lock->l_flock.l_pid;
2975 pv->sysid = lock->l_flock.l_sysid;
2976 flk_proc_vertex_allocs++;
2977 if (pgraph.free != 0) {
2978 for (i = 0; i < pgraph.gcount; i++) {
2979 if (pgraph.proc[i] == NULL) {
2980 pgraph.proc[i] = pv;
2981 lock->pvertex = pv->index = i;
2982 pgraph.free--;
2983 return (pv);
2984 }
2985 }
2986 }
2987 palloc = kmem_zalloc((pgraph.gcount + PROC_CHUNK) *
2988 sizeof (proc_vertex_t *), KM_SLEEP);
2989
2990 if (pgraph.proc) {
2991 bcopy(pgraph.proc, palloc,
2992 pgraph.gcount * sizeof (proc_vertex_t *));
2993
2994 kmem_free(pgraph.proc,
2995 pgraph.gcount * sizeof (proc_vertex_t *));
2996 }
2997 pgraph.proc = palloc;
2998 pgraph.free += (PROC_CHUNK - 1);
2999 pv->index = lock->pvertex = pgraph.gcount;
3000 pgraph.gcount += PROC_CHUNK;
3001 pgraph.proc[pv->index] = pv;
3002 return (pv);
3003 }
3004
3005 /*
3006 * Allocate a proc edge.
3007 */
3008
3009 static proc_edge_t *
3010 flk_get_proc_edge()
3011 {
3012 proc_edge_t *pep;
3013
3014 pep = kmem_zalloc(sizeof (proc_edge_t), KM_SLEEP);
3015 flk_proc_edge_allocs++;
3016 return (pep);
3017 }
3018
3019 /*
3020 * Free the proc edge. Called whenever its reference count goes to zero.
3021 */
3022
3023 static void
3024 flk_free_proc_edge(proc_edge_t *pep)
3025 {
3026 ASSERT(pep->refcount == 0);
3027 kmem_free((void *)pep, sizeof (proc_edge_t));
3028 flk_proc_edge_frees++;
3029 }
3030
3031 /*
3032  * Uncolor the proc graph; a full pass is needed only when the mark value hits its maximum.
3033 */
3034
3035 static void
3036 flk_proc_graph_uncolor()
3037 {
3038 int i;
3039
3040 if (pgraph.mark == UINT_MAX) {
3041 for (i = 0; i < pgraph.gcount; i++)
3042 if (pgraph.proc[i] != NULL) {
3043 pgraph.proc[i]->atime = 0;
3044 pgraph.proc[i]->dtime = 0;
3045 }
3046 pgraph.mark = 1;
3047 } else {
3048 pgraph.mark++;
3049 }
3050 }
3051
3052 /*
3053  * Release the proc vertex iff it has neither in edges nor out edges.
3054 */
3055
3056 static void
3057 flk_proc_release(proc_vertex_t *proc)
3058 {
3059 ASSERT(MUTEX_HELD(&flock_lock));
3060 if (proc->edge == NULL && proc->incount == 0) {
3061 pgraph.proc[proc->index] = NULL;
3062 pgraph.free++;
3063 kmem_free(proc, sizeof (proc_vertex_t));
3064 flk_proc_vertex_frees++;
3065 }
3066 }
3067
3068 /*
3069  * Updates the process graph to reflect a change in a lock graph.
3070  * Note: We should call this function only after we have a correctly
3071  * recomputed lock graph; otherwise we might miss a deadlock detection.
3072  * E.g., in flk_relation() we call this function after
3073  * flk_recompute_dependencies(); otherwise, if a process tries to lock a
3074  * vnode hashed into another graph, it might sleep forever.
3075 */
3076
3077 static void
3078 flk_update_proc_graph(edge_t *ep, int delete)
3079 {
3080 proc_vertex_t *toproc, *fromproc;
3081 proc_edge_t *pep, *prevpep;
3082
3083 mutex_enter(&flock_lock);
3084 toproc = flk_get_proc_vertex(ep->to_vertex);
3085 fromproc = flk_get_proc_vertex(ep->from_vertex);
3086
3087 if (!delete)
3088 goto add;
3089 pep = prevpep = fromproc->edge;
3090
3091 ASSERT(pep != NULL);
3092 while (pep != NULL) {
3093 if (pep->to_proc == toproc) {
3094 ASSERT(pep->refcount > 0);
3095 pep->refcount--;
3096 if (pep->refcount == 0) {
3097 if (pep == prevpep) {
3098 fromproc->edge = pep->next;
3099 } else {
3100 prevpep->next = pep->next;
3101 }
3102 toproc->incount--;
3103 flk_proc_release(toproc);
3104 flk_free_proc_edge(pep);
3105 }
3106 break;
3107 }
3108 prevpep = pep;
3109 pep = pep->next;
3110 }
3111 flk_proc_release(fromproc);
3112 mutex_exit(&flock_lock);
3113 return;
3114 add:
3115
3116 pep = fromproc->edge;
3117
3118 while (pep != NULL) {
3119 if (pep->to_proc == toproc) {
3120 ASSERT(pep->refcount > 0);
3121 pep->refcount++;
3122 break;
3123 }
3124 pep = pep->next;
3125 }
3126 if (pep == NULL) {
3127 pep = flk_get_proc_edge();
3128 pep->to_proc = toproc;
3129 pep->refcount = 1;
3130 toproc->incount++;
3131 pep->next = fromproc->edge;
3132 fromproc->edge = pep;
3133 }
3134 mutex_exit(&flock_lock);
3135 }
3136
3137 /*
3138 * Set the control status for lock manager requests.
3139 *
3140 */
3141
3142 /*
3143 * PSARC case 1997/292
3144 *
3145 * Requires: "nlmid" must be >= 1 and <= clconf_maximum_nodeid().
3146 * Effects: Set the state of the NLM server identified by "nlmid"
3147 * in the NLM registry to state "nlm_state."
3148 * Raises exception no_such_nlm if "nlmid" doesn't identify a known
3149 * NLM server to this LLM.
3150 * Note that when this routine is called with NLM_SHUTTING_DOWN there
3151  * may be lock requests that have gotten started but not finished. In
3152 * particular, there may be blocking requests that are in the callback code
3153 * before sleeping (so they're not holding the lock for the graph). If
3154 * such a thread reacquires the graph's lock (to go to sleep) after
3155 * NLM state in the NLM registry is set to a non-up value,
3156 * it will notice the status and bail out. If the request gets
3157 * granted before the thread can check the NLM registry, let it
3158 * continue normally. It will get flushed when we are called with NLM_DOWN.
3159 *
3160 * Modifies: nlm_reg_obj (global)
3161 * Arguments:
3162 * nlmid (IN): id uniquely identifying an NLM server
3163 * nlm_state (IN): NLM server state to change "nlmid" to
3164 */
3165 void
3166 cl_flk_set_nlm_status(int nlmid, flk_nlm_status_t nlm_state)
3167 {
3168 /*
3169 * Check to see if node is booted as a cluster. If not, return.
3170 */
3171 if ((cluster_bootflags & CLUSTER_BOOTED) == 0) {
3172 return;
3173 }
3174
3175 /*
3176 * Check for development/debugging. It is possible to boot a node
3177 * in non-cluster mode, and then run a special script, currently
3178 * available only to developers, to bring up the node as part of a
3179 * cluster. The problem is that running such a script does not
3180 * result in the routine flk_init() being called and hence global array
3181 * nlm_reg_status is NULL. The NLM thinks it's in cluster mode,
3182 * but the LLM needs to do an additional check to see if the global
3183 * array has been created or not. If nlm_reg_status is NULL, then
3184 * return, else continue.
3185 */
3186 if (nlm_reg_status == NULL) {
3187 return;
3188 }
3189
3190 ASSERT(nlmid <= nlm_status_size && nlmid >= 0);
3191 mutex_enter(&nlm_reg_lock);
3192
3193 if (FLK_REGISTRY_IS_NLM_UNKNOWN(nlm_reg_status, nlmid)) {
3194 /*
3195 * If the NLM server "nlmid" is unknown in the NLM registry,
3196 * add it to the registry in the nlm shutting down state.
3197 */
3198 FLK_REGISTRY_CHANGE_NLM_STATE(nlm_reg_status, nlmid,
3199 FLK_NLM_SHUTTING_DOWN);
3200 } else {
3201 /*
3202 * Change the state of the NLM server identified by "nlmid"
3203 * in the NLM registry to the argument "nlm_state."
3204 */
3205 FLK_REGISTRY_CHANGE_NLM_STATE(nlm_reg_status, nlmid,
3206 nlm_state);
3207 }
3208
3209 /*
3210 * The reason we must register the NLM server that is shutting down
3211 * with an LLM that doesn't already know about it (never sent a lock
3212 * request) is to handle correctly a race between shutdown and a new
3213 * lock request. Suppose that a shutdown request from the NLM server
3214 * invokes this routine at the LLM, and a thread is spawned to
3215 * service the request. Now suppose a new lock request is in
3216 * progress and has already passed the first line of defense in
3217  * reclock(), which denies new lock requests from NLM servers
3218 * that are not in the NLM_UP state. After the current routine
3219 * is invoked for both phases of shutdown, the routine will return,
3220 * having done nothing, and the lock request will proceed and
3221 * probably be granted. The problem is that the shutdown was ignored
3222 * by the lock request because there was no record of that NLM server
3223 * shutting down. We will be in the peculiar position of thinking
3224 * that we've shutdown the NLM server and all locks at all LLMs have
3225 * been discarded, but in fact there's still one lock held.
3226 * The solution is to record the existence of NLM server and change
3227 * its state immediately to NLM_SHUTTING_DOWN. The lock request in
3228 * progress may proceed because the next phase NLM_DOWN will catch
3229 * this lock and discard it.
3230 */
3231 mutex_exit(&nlm_reg_lock);
3232
3233 switch (nlm_state) {
3234 case FLK_NLM_UP:
3235 /*
3236 * Change the NLM state of all locks still held on behalf of
3237 * the NLM server identified by "nlmid" to NLM_UP.
3238 */
3239 cl_flk_change_nlm_state_all_locks(nlmid, FLK_NLM_UP);
3240 break;
3241
3242 case FLK_NLM_SHUTTING_DOWN:
3243 /*
3244 * Wake up all sleeping locks for the NLM server identified
3245 * by "nlmid." Note that eventually all woken threads will
3246 * have their lock requests cancelled and descriptors
3247 * removed from the sleeping lock list. Note that the NLM
3248 * server state associated with each lock descriptor is
3249 * changed to FLK_NLM_SHUTTING_DOWN.
3250 */
3251 cl_flk_wakeup_sleeping_nlm_locks(nlmid);
3252 break;
3253
3254 case FLK_NLM_DOWN:
3255 /*
3256 * Discard all active, granted locks for this NLM server
3257 * identified by "nlmid."
3258 */
3259 cl_flk_unlock_nlm_granted(nlmid);
3260 break;
3261
3262 default:
3263 panic("cl_set_nlm_status: bad status (%d)", nlm_state);
3264 }
3265 }
3266
3267 /*
3268 * Set the control status for lock manager requests.
3269 *
3270 * Note that when this routine is called with FLK_WAKEUP_SLEEPERS, there
3271  * may be lock requests that have gotten started but not finished. In
3272 * particular, there may be blocking requests that are in the callback code
3273 * before sleeping (so they're not holding the lock for the graph). If
3274 * such a thread reacquires the graph's lock (to go to sleep) after
3275 * flk_lockmgr_status is set to a non-up value, it will notice the status
3276 * and bail out. If the request gets granted before the thread can check
3277 * flk_lockmgr_status, let it continue normally. It will get flushed when
3278 * we are called with FLK_LOCKMGR_DOWN.
3279 */
3280
3281 void
3282 flk_set_lockmgr_status(flk_lockmgr_status_t status)
3283 {
3284 int i;
3285 graph_t *gp;
3286 struct flock_globals *fg;
3287
3288 fg = flk_get_globals();
3289 ASSERT(fg != NULL);
3290
3291 mutex_enter(&flock_lock);
3292 fg->flk_lockmgr_status = status;
3293 mutex_exit(&flock_lock);
3294
3295 /*
3296 * If the lock manager is coming back up, all that's needed is to
3297 * propagate this information to the graphs. If the lock manager
3298 * is going down, additional action is required, and each graph's
3299 * copy of the state is updated atomically with this other action.
3300 */
3301 switch (status) {
3302 case FLK_LOCKMGR_UP:
3303 for (i = 0; i < HASH_SIZE; i++) {
3304 mutex_enter(&flock_lock);
3305 gp = lock_graph[i];
3306 mutex_exit(&flock_lock);
3307 if (gp == NULL)
3308 continue;
3309 mutex_enter(&gp->gp_mutex);
3310 fg->lockmgr_status[i] = status;
3311 mutex_exit(&gp->gp_mutex);
3312 }
3313 break;
3314 case FLK_WAKEUP_SLEEPERS:
3315 wakeup_sleeping_lockmgr_locks(fg);
3316 break;
3317 case FLK_LOCKMGR_DOWN:
3318 unlock_lockmgr_granted(fg);
3319 break;
3320 default:
3321 panic("flk_set_lockmgr_status: bad status (%d)", status);
3322 break;
3323 }
3324 }
3325
3326 /*
3327 * This routine returns all the locks that are active or sleeping and are
3328 * associated with a particular set of identifiers. If lock_state != 0, then
3329 * only locks that match the lock_state are returned. If lock_state == 0, then
3330 * all locks are returned. If pid == NOPID, the pid is ignored. If
3331 * use_sysid is FALSE, then the sysid is ignored. If vp is NULL, then the
3332 * vnode pointer is ignored.
3333 *
3334 * A list containing the vnode pointer and an flock structure
3335 * describing the lock is returned. Each element in the list is
3336 * dynamically allocated and must be freed by the caller. The
3337 * last item in the list is denoted by a NULL value in the ll_next
3338 * field.
3339 *
3340 * The vnode pointers returned are held. The caller is responsible
3341 * for releasing these. Note that the returned list is only a snapshot of
3342 * the current lock information, and that it is a snapshot of a moving
3343 * target (only one graph is locked at a time).
3344 */
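/*
 * Consumer sketch (hypothetical): walk a returned list, then release it.
 * Each ll_vp is held; flk_free_locklist() below drops those holds.
 *
 *	locklist_t *llp, *lp;
 *
 *	llp = flk_get_active_locks(sysid, NOPID);
 *	for (lp = llp; lp != NULL; lp = lp->ll_next)
 *		cmn_err(CE_NOTE, "lock on vp %p start %lld",
 *		    (void *)lp->ll_vp, (longlong_t)lp->ll_flock.l_start);
 *	flk_free_locklist(llp);
 */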
3345
3346 locklist_t *
3347 get_lock_list(int list_type, int lock_state, int sysid, boolean_t use_sysid,
3348 pid_t pid, const vnode_t *vp, zoneid_t zoneid)
3349 {
3350 lock_descriptor_t *lock;
3351 lock_descriptor_t *graph_head;
3352 locklist_t listhead;
3353 locklist_t *llheadp;
3354 locklist_t *llp;
3355 locklist_t *lltp;
3356 graph_t *gp;
3357 int i;
3358 int first_index; /* graph index */
3359 int num_indexes; /* graph index */
3360
3361 ASSERT((list_type == FLK_ACTIVE_STATE) ||
3362 (list_type == FLK_SLEEPING_STATE));
3363
3364 /*
3365 * Get a pointer to something to use as a list head while building
3366 * the rest of the list.
3367 */
3368 llheadp = &listhead;
3369 lltp = llheadp;
3370 llheadp->ll_next = (locklist_t *)NULL;
3371
3372 /* Figure out which graphs we want to look at. */
3373 if (vp == NULL) {
3374 first_index = 0;
3375 num_indexes = HASH_SIZE;
3376 } else {
3377 first_index = HASH_INDEX(vp);
3378 num_indexes = 1;
3379 }
3380
3381 for (i = first_index; i < first_index + num_indexes; i++) {
3382 mutex_enter(&flock_lock);
3383 gp = lock_graph[i];
3384 mutex_exit(&flock_lock);
3385 if (gp == NULL) {
3386 continue;
3387 }
3388
3389 mutex_enter(&gp->gp_mutex);
3390 graph_head = (list_type == FLK_ACTIVE_STATE) ?
3391 ACTIVE_HEAD(gp) : SLEEPING_HEAD(gp);
3392 for (lock = graph_head->l_next;
3393 lock != graph_head;
3394 lock = lock->l_next) {
3395 if (use_sysid && lock->l_flock.l_sysid != sysid)
3396 continue;
3397 if (pid != NOPID && lock->l_flock.l_pid != pid)
3398 continue;
3399 if (vp != NULL && lock->l_vnode != vp)
3400 continue;
3401 if (lock_state && !(lock_state & lock->l_state))
3402 continue;
3403 if (zoneid != lock->l_zoneid && zoneid != ALL_ZONES)
3404 continue;
3405 /*
3406 * A matching lock was found. Allocate
3407 * space for a new locklist entry and fill
3408 * it in.
3409 */
3410 llp = kmem_alloc(sizeof (locklist_t), KM_SLEEP);
3411 lltp->ll_next = llp;
3412 VN_HOLD(lock->l_vnode);
3413 llp->ll_vp = lock->l_vnode;
3414 create_flock(lock, &(llp->ll_flock));
3415 llp->ll_next = (locklist_t *)NULL;
3416 lltp = llp;
3417 }
3418 mutex_exit(&gp->gp_mutex);
3419 }
3420
3421 llp = llheadp->ll_next;
3422 return (llp);
3423 }
3424
3425 /*
3426 * These two functions are simply interfaces to get_lock_list. They return
3427 * a list of sleeping or active locks for the given sysid and pid. See
3428 * get_lock_list for details.
3429 *
3430 * In either case we don't particularly care to specify the zone of interest;
3431 * the sysid-space is global across zones, so the sysid will map to exactly one
3432 * zone, and we'll return information for that zone.
3433 */
3434
3435 locklist_t *
3436 flk_get_sleeping_locks(int sysid, pid_t pid)
3437 {
3438 return (get_lock_list(FLK_SLEEPING_STATE, 0, sysid, B_TRUE, pid, NULL,
3439 ALL_ZONES));
3440 }
3441
3442 locklist_t *
3443 flk_get_active_locks(int sysid, pid_t pid)
3444 {
3445 return (get_lock_list(FLK_ACTIVE_STATE, 0, sysid, B_TRUE, pid, NULL,
3446 ALL_ZONES));
3447 }
3448
3449 /*
3450 * Another interface to get_lock_list. This one returns all the active
3451 * locks for a given vnode. Again, see get_lock_list for details.
3452 *
3453 * We don't need to specify which zone's locks we're interested in. The matter
3454 * would only be interesting if the vnode belonged to NFS, and NFS vnodes can't
3455 * be used by multiple zones, so the list of locks will all be from the right
3456 * zone.
3457 */
3458
3459 locklist_t *
3460 flk_active_locks_for_vp(const vnode_t *vp)
3461 {
3462 return (get_lock_list(FLK_ACTIVE_STATE, 0, 0, B_FALSE, NOPID, vp,
3463 ALL_ZONES));
3464 }
3465
3466 /*
3467 * Another interface to get_lock_list. This one returns all the active
3468 * nbmand locks for a given vnode. Again, see get_lock_list for details.
3469 *
3470 * See the comment for flk_active_locks_for_vp() for why we don't care to
3471 * specify the particular zone of interest.
3472 */
3473 locklist_t *
3474 flk_active_nbmand_locks_for_vp(const vnode_t *vp)
3475 {
3476 return (get_lock_list(FLK_ACTIVE_STATE, NBMAND_LOCK, 0, B_FALSE,
3477 NOPID, vp, ALL_ZONES));
3478 }
3479
3480 /*
3481 * Another interface to get_lock_list. This one returns all the active
3482 * nbmand locks for a given pid. Again, see get_lock_list for details.
3483 *
3484 * The zone doesn't need to be specified here; the locks held by a
3485 * particular process will either be local (ie, non-NFS) or from the zone
3486 * the process is executing in. This is because other parts of the system
3487 * ensure that an NFS vnode can't be used in a zone other than that in
3488 * which it was opened.
3489 */
3490 locklist_t *
3491 flk_active_nbmand_locks(pid_t pid)
3492 {
3493 return (get_lock_list(FLK_ACTIVE_STATE, NBMAND_LOCK, 0, B_FALSE,
3494 pid, NULL, ALL_ZONES));
3495 }
3496
3497 /*
3498 * Free up all entries in the locklist.
3499 */
3500 void
3501 flk_free_locklist(locklist_t *llp)
3502 {
3503 locklist_t *next_llp;
3504
3505 while (llp) {
3506 next_llp = llp->ll_next;
3507 VN_RELE(llp->ll_vp);
3508 kmem_free(llp, sizeof (*llp));
3509 llp = next_llp;
3510 }
3511 }
3512
3513 static void
3514 cl_flk_change_nlm_state_all_locks(int nlmid, flk_nlm_status_t nlm_state)
3515 {
3516 /*
3517 * For each graph "lg" in the hash table lock_graph do
3518 * a. Get the list of sleeping locks
3519 * b. For each lock descriptor in the list do
3520 * i. If the requested lock is an NLM server request AND
3521 * the nlmid is the same as the routine argument then
3522 * change the lock descriptor's state field to
3523 * "nlm_state."
3524 * c. Get the list of active locks
3525 * d. For each lock descriptor in the list do
3526 * i. If the requested lock is an NLM server request AND
3527 * the nlmid is the same as the routine argument then
3528 * change the lock descriptor's state field to
3529 * "nlm_state."
3530 */
3531
3532 int i;
3533 graph_t *gp; /* lock graph */
3534 lock_descriptor_t *lock; /* lock */
3535 lock_descriptor_t *nlock = NULL; /* next lock */
3536 int lock_nlmid;
3537
3538 for (i = 0; i < HASH_SIZE; i++) {
3539 mutex_enter(&flock_lock);
3540 gp = lock_graph[i];
3541 mutex_exit(&flock_lock);
3542 if (gp == NULL) {
3543 continue;
3544 }
3545
3546 /* Get list of sleeping locks in current lock graph. */
3547 mutex_enter(&gp->gp_mutex);
3548 for (lock = SLEEPING_HEAD(gp)->l_next;
3549 lock != SLEEPING_HEAD(gp);
3550 lock = nlock) {
3551 nlock = lock->l_next;
3552 /* get NLM id */
3553 lock_nlmid = GETNLMID(lock->l_flock.l_sysid);
3554
3555 /*
3556 * If NLM server request AND nlmid of lock matches
3557 * nlmid of argument, then set the NLM state of the
3558 * lock to "nlm_state."
3559 */
3560 if (IS_LOCKMGR(lock) && nlmid == lock_nlmid) {
3561 SET_NLM_STATE(lock, nlm_state);
3562 }
3563 }
3564
3565 /* Get list of active locks in current lock graph. */
3566 for (lock = ACTIVE_HEAD(gp)->l_next;
3567 lock != ACTIVE_HEAD(gp);
3568 lock = nlock) {
3569 nlock = lock->l_next;
3570 /* get NLM id */
3571 lock_nlmid = GETNLMID(lock->l_flock.l_sysid);
3572
3573 /*
3574 * If NLM server request AND nlmid of lock matches
3575 * nlmid of argument, then set the NLM state of the
3576 * lock to "nlm_state."
3577 */
3578 if (IS_LOCKMGR(lock) && nlmid == lock_nlmid) {
3579 ASSERT(IS_ACTIVE(lock));
3580 SET_NLM_STATE(lock, nlm_state);
3581 }
3582 }
3583 mutex_exit(&gp->gp_mutex);
3584 }
3585 }
3586
3587 /*
3588 * Requires: "nlmid" >= 1 and <= clconf_maximum_nodeid().
3589 * Effects: Find all sleeping lock manager requests _only_ for the NLM server
3590 * identified by "nlmid." Poke those lock requests.
3591 */
3592 static void
3593 cl_flk_wakeup_sleeping_nlm_locks(int nlmid)
3594 {
3595 lock_descriptor_t *lock;
3596 lock_descriptor_t *nlock = NULL; /* next lock */
3597 int i;
3598 graph_t *gp;
3599 int lock_nlmid;
3600
3601 for (i = 0; i < HASH_SIZE; i++) {
3602 mutex_enter(&flock_lock);
3603 gp = lock_graph[i];
3604 mutex_exit(&flock_lock);
3605 if (gp == NULL) {
3606 continue;
3607 }
3608
3609 mutex_enter(&gp->gp_mutex);
3610 for (lock = SLEEPING_HEAD(gp)->l_next;
3611 lock != SLEEPING_HEAD(gp);
3612 lock = nlock) {
3613 nlock = lock->l_next;
3614 /*
3615 * If NLM server request _and_ nlmid of lock matches
3616 * nlmid of argument, then set the NLM state of the
3617 * lock to NLM_SHUTTING_DOWN, and wake up sleeping
3618 * request.
3619 */
3620 if (IS_LOCKMGR(lock)) {
3621 /* get NLM id */
3622 lock_nlmid =
3623 GETNLMID(lock->l_flock.l_sysid);
3624 if (nlmid == lock_nlmid) {
3625 SET_NLM_STATE(lock,
3626 FLK_NLM_SHUTTING_DOWN);
3627 INTERRUPT_WAKEUP(lock);
3628 }
3629 }
3630 }
3631 mutex_exit(&gp->gp_mutex);
3632 }
3633 }
3634
3635 /*
3636 * Requires: "nlmid" >= 1 and <= clconf_maximum_nodeid()
3637 * Effects: Find all active (granted) lock manager locks _only_ for the
3638 * NLM server identified by "nlmid" and release them.
3639 */
3640 static void
3641 cl_flk_unlock_nlm_granted(int nlmid)
3642 {
3643 lock_descriptor_t *lock;
3644 lock_descriptor_t *nlock = NULL; /* next lock */
3645 int i;
3646 graph_t *gp;
3647 int lock_nlmid;
3648
3649 for (i = 0; i < HASH_SIZE; i++) {
3650 mutex_enter(&flock_lock);
3651 gp = lock_graph[i];
3652 mutex_exit(&flock_lock);
3653 if (gp == NULL) {
3654 continue;
3655 }
3656
3657 mutex_enter(&gp->gp_mutex);
3658 for (lock = ACTIVE_HEAD(gp)->l_next;
3659 lock != ACTIVE_HEAD(gp);
3660 lock = nlock) {
3661 nlock = lock->l_next;
3662 ASSERT(IS_ACTIVE(lock));
3663
3664 /*
3665 * If it's an NLM server request _and_ nlmid of
3666 * the lock matches nlmid of argument, then
3667  * remove the active lock from the list, wake up blocked
3668 * threads, and free the storage for the lock.
3669 * Note that there's no need to mark the NLM state
3670 * of this lock to NLM_DOWN because the lock will
3671 * be deleted anyway and its storage freed.
3672 */
3673 if (IS_LOCKMGR(lock)) {
3674 /* get NLM id */
3675 lock_nlmid = GETNLMID(lock->l_flock.l_sysid);
3676 if (nlmid == lock_nlmid) {
3677 flk_delete_active_lock(lock, 0);
3678 flk_wakeup(lock, 1);
3679 flk_free_lock(lock);
3680 }
3681 }
3682 }
3683 mutex_exit(&gp->gp_mutex);
3684 }
3685 }
3686
3687 /*
3688 * Find all sleeping lock manager requests and poke them.
3689 */
3690 static void
3691 wakeup_sleeping_lockmgr_locks(struct flock_globals *fg)
3692 {
3693 lock_descriptor_t *lock;
3694 lock_descriptor_t *nlock = NULL; /* next lock */
3695 int i;
3696 graph_t *gp;
3697 zoneid_t zoneid = getzoneid();
3698
3699 for (i = 0; i < HASH_SIZE; i++) {
3700 mutex_enter(&flock_lock);
3701 gp = lock_graph[i];
3702 mutex_exit(&flock_lock);
3703 if (gp == NULL) {
3704 continue;
3705 }
3706
3707 mutex_enter(&gp->gp_mutex);
3708 fg->lockmgr_status[i] = FLK_WAKEUP_SLEEPERS;
3709 for (lock = SLEEPING_HEAD(gp)->l_next;
3710 lock != SLEEPING_HEAD(gp);
3711 lock = nlock) {
3712 nlock = lock->l_next;
3713 if (IS_LOCKMGR(lock) && lock->l_zoneid == zoneid) {
3714 INTERRUPT_WAKEUP(lock);
3715 }
3716 }
3717 mutex_exit(&gp->gp_mutex);
3718 }
3719 }
3720
3721
3722 /*
3723 * Find all active (granted) lock manager locks and release them.
3724 */
3725 static void
3726 unlock_lockmgr_granted(struct flock_globals *fg)
3727 {
3728 lock_descriptor_t *lock;
3729 lock_descriptor_t *nlock = NULL; /* next lock */
3730 int i;
3731 graph_t *gp;
3732 zoneid_t zoneid = getzoneid();
3733
3734 for (i = 0; i < HASH_SIZE; i++) {
3735 mutex_enter(&flock_lock);
3736 gp = lock_graph[i];
3737 mutex_exit(&flock_lock);
3738 if (gp == NULL) {
3739 continue;
3740 }
3741
3742 mutex_enter(&gp->gp_mutex);
3743 fg->lockmgr_status[i] = FLK_LOCKMGR_DOWN;
3744 for (lock = ACTIVE_HEAD(gp)->l_next;
3745 lock != ACTIVE_HEAD(gp);
3746 lock = nlock) {
3747 nlock = lock->l_next;
3748 if (IS_LOCKMGR(lock) && lock->l_zoneid == zoneid) {
3749 ASSERT(IS_ACTIVE(lock));
3750 flk_delete_active_lock(lock, 0);
3751 flk_wakeup(lock, 1);
3752 flk_free_lock(lock);
3753 }
3754 }
3755 mutex_exit(&gp->gp_mutex);
3756 }
3757 }
3758
3759
3760 /*
3761 * Wait until a lock is granted, cancelled, or interrupted.
3762 */
3763
3764 static void
3765 wait_for_lock(lock_descriptor_t *request)
3766 {
3767 graph_t *gp = request->l_graph;
3768
3769 ASSERT(MUTEX_HELD(&gp->gp_mutex));
3770
3771 while (!(IS_GRANTED(request)) && !(IS_CANCELLED(request)) &&
3772 !(IS_INTERRUPTED(request))) {
3773 if (!cv_wait_sig(&request->l_cv, &gp->gp_mutex)) {
3774 flk_set_state(request, FLK_INTERRUPTED_STATE);
3775 request->l_state |= INTERRUPTED_LOCK;
3776 }
3777 }
3778 }
3779
3780 /*
3781 * Create an flock structure from the existing lock information
3782 *
3783 * This routine is used to create flock structures for the lock manager
3784 * to use in a reclaim request. Since the lock was originated on this
3785  * host, it must conform to UNIX semantics, so no checking is
3786 * done to make sure it falls within the lower half of the 32-bit range.
3787 */
3788
3789 static void
3790 create_flock(lock_descriptor_t *lp, flock64_t *flp)
3791 {
3792 ASSERT(lp->l_end == MAX_U_OFFSET_T || lp->l_end <= MAXEND);
3793 ASSERT(lp->l_end >= lp->l_start);
3794
3795 flp->l_type = lp->l_type;
3796 flp->l_whence = 0;
3797 flp->l_start = lp->l_start;
3798 flp->l_len = (lp->l_end == MAX_U_OFFSET_T) ? 0 :
3799 (lp->l_end - lp->l_start + 1);
3800 flp->l_sysid = lp->l_flock.l_sysid;
3801 flp->l_pid = lp->l_flock.l_pid;
3802 }
3803
3804 /*
3805 * Convert flock_t data describing a lock range into unsigned long starting
3806 * and ending points, which are put into lock_request. Returns 0 or an
3807 * errno value.
3808 */
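/*
 * Worked example (illustrative only): for a 100-byte file, a request for
 * its last 10 bytes can be expressed with l_whence = 2 (SEEK_END),
 * l_start = -10 and l_len = 10; the conversion yields *start == 90 and
 * *end == 99.  With l_len == 0 the request covers to end of file and
 * *end becomes MAX_U_OFFSET_T.
 */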
3809
3810 int
3811 flk_convert_lock_data(vnode_t *vp, flock64_t *flp,
3812 u_offset_t *start, u_offset_t *end, offset_t offset)
3813 {
3814 struct vattr vattr;
3815 int error;
3816
3817 /*
3818 * Determine the starting point of the request
3819 */
3820 switch (flp->l_whence) {
3821 case 0: /* SEEK_SET */
3822 *start = (u_offset_t)flp->l_start;
3823 break;
3824 case 1: /* SEEK_CUR */
3825 *start = (u_offset_t)(flp->l_start + offset);
3826 break;
3827 case 2: /* SEEK_END */
3828 vattr.va_mask = AT_SIZE;
3829 if (error = VOP_GETATTR(vp, &vattr, 0, CRED(), NULL))
3830 return (error);
3831 *start = (u_offset_t)(flp->l_start + vattr.va_size);
3832 break;
3833 default:
3834 return (EINVAL);
3835 }
3836
3837 /*
3838 * Determine the range covered by the request.
3839 */
3840 if (flp->l_len == 0)
3841 *end = MAX_U_OFFSET_T;
3842 else if ((offset_t)flp->l_len > 0) {
3843 *end = (u_offset_t)(*start + (flp->l_len - 1));
3844 } else {
3845 /*
3846 * Negative length; why do we even allow this ?
3847 * Because this allows easy specification of
3848 * the last n bytes of the file.
3849 */
3850 *end = *start;
3851 *start += (u_offset_t)flp->l_len;
3852 (*start)++;
3853 }
3854 return (0);
3855 }
3856
3857 /*
3858  * Check the validity of lock data. This can be used by the NFS
3859 * frlock routines to check data before contacting the server. The
3860 * server must support semantics that aren't as restrictive as
3861 * the UNIX API, so the NFS client is required to check.
3862 * The maximum is now passed in by the caller.
3863 */
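/*
 * Usage sketch (hypothetical caller): after converting an flock into a
 * [start, end] pair, a local caller would validate it against MAXEND.
 *
 *	error = flk_check_lock_data(start, end, MAXEND);
 *	if (error != 0)
 *		return (error);
 */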
3864
3865 int
3866 flk_check_lock_data(u_offset_t start, u_offset_t end, offset_t max)
3867 {
3868 /*
3869 * The end (length) for local locking should never be greater
3870 * than MAXEND. However, the representation for
3871 * the entire file is MAX_U_OFFSET_T.
3872 */
3873 if ((start > max) ||
3874 ((end > max) && (end != MAX_U_OFFSET_T))) {
3875 return (EINVAL);
3876 }
3877 if (start > end) {
3878 return (EINVAL);
3879 }
3880 return (0);
3881 }
3882
3883 /*
3884 * Fill in request->l_flock with information about the lock blocking the
3885 * request. The complexity here is that lock manager requests are allowed
3886 * to see into the upper part of the 32-bit address range, whereas local
3887 * requests are only allowed to see signed values.
3888 *
3889 * What should be done when "blocker" is a lock manager lock that uses the
3890 * upper portion of the 32-bit range, but "request" is local? Since the
3891 * request has already been determined to have been blocked by the blocker,
3892 * at least some portion of "blocker" must be in the range of the request,
3893 * or the request extends to the end of file. For the first case, the
3894 * portion in the lower range is returned with the indication that it goes
3895 * "to EOF." For the second case, the last byte of the lower range is
3896 * returned with the indication that it goes "to EOF."
3897 */
3898
3899 static void
3900 report_blocker(lock_descriptor_t *blocker, lock_descriptor_t *request)
3901 {
3902 flock64_t *flrp; /* l_flock portion of request */
3903
3904 ASSERT(blocker != NULL);
3905
3906 flrp = &request->l_flock;
3907 flrp->l_whence = 0;
3908 flrp->l_type = blocker->l_type;
3909 flrp->l_pid = blocker->l_flock.l_pid;
3910 flrp->l_sysid = blocker->l_flock.l_sysid;
3911
3912 if (IS_LOCKMGR(request)) {
3913 flrp->l_start = blocker->l_start;
3914 if (blocker->l_end == MAX_U_OFFSET_T)
3915 flrp->l_len = 0;
3916 else
3917 flrp->l_len = blocker->l_end - blocker->l_start + 1;
3918 } else {
3919 if (blocker->l_start > MAXEND) {
3920 flrp->l_start = MAXEND;
3921 flrp->l_len = 0;
3922 } else {
3923 flrp->l_start = blocker->l_start;
3924 if (blocker->l_end == MAX_U_OFFSET_T)
3925 flrp->l_len = 0;
3926 else
3927 flrp->l_len = blocker->l_end -
3928 blocker->l_start + 1;
3929 }
3930 }
3931 }
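/*
* Worked example (illustrative only): if "blocker" is a lock manager
* lock starting above MAXEND and "request" is a local lock extending
* to end of file, the reported range is clamped to l_start == MAXEND
* with l_len == 0 ("to EOF"), so a 32-bit local caller never sees an
* offset it cannot represent.
*/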
3932
3933 /*
3934 * PSARC case 1997/292
3935 */
3936 /*
3937 * This is the public routine exported by flock.h.
3938 */
3939 void
3940 cl_flk_change_nlm_state_to_unknown(int nlmid)
3941 {
3942 /*
3943 * Check to see if the node is booted as a cluster. If not, return.
3944 */
3945 if ((cluster_bootflags & CLUSTER_BOOTED) == 0) {
3946 return;
3947 }
3948
3949 /*
3950 * See comment in cl_flk_set_nlm_status().
3951 */
3952 if (nlm_reg_status == NULL) {
3953 return;
3954 }
3955
3956 /*
3957 * Protect the NLM registry state with a mutex.
3958 */
3959 ASSERT(nlmid <= nlm_status_size && nlmid >= 0);
3960 mutex_enter(&nlm_reg_lock);
3961 FLK_REGISTRY_CHANGE_NLM_STATE(nlm_reg_status, nlmid, FLK_NLM_UNKNOWN);
3962 mutex_exit(&nlm_reg_lock);
3963 }
3964
3965 /*
3966 * Return non-zero if the given I/O request conflicts with an active NBMAND
3967 * lock.
3968 * If svmand is non-zero, check against all active locks, not just NBMAND
3969 * locks.
3970 */
3971
3972 int
3973 nbl_lock_conflict(vnode_t *vp, nbl_op_t op, u_offset_t offset,
3974 ssize_t length, int svmand, caller_context_t *ct)
3975 {
3976 int conflict = 0;
3977 graph_t *gp;
3978 lock_descriptor_t *lock;
3979 pid_t pid;
3980 int sysid;
3981
3982 if (ct == NULL) {
3983 pid = curproc->p_pid;
3984 sysid = 0;
3985 } else {
3986 pid = ct->cc_pid;
3987 sysid = ct->cc_sysid;
3988 }
3989
3990 mutex_enter(&flock_lock);
3991 gp = lock_graph[HASH_INDEX(vp)];
3992 mutex_exit(&flock_lock);
3993 if (gp == NULL)
3994 return (0);
3995
3996 mutex_enter(&gp->gp_mutex);
3997 SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
3998
3999 for (; lock && lock->l_vnode == vp; lock = lock->l_next) {
4000 if ((svmand || (lock->l_state & NBMAND_LOCK)) &&
4001 (lock->l_flock.l_sysid != sysid ||
4002 lock->l_flock.l_pid != pid) &&
4003 lock_blocks_io(op, offset, length,
4004 lock->l_type, lock->l_start, lock->l_end)) {
4005 conflict = 1;
4006 break;
4007 }
4008 }
4009 mutex_exit(&gp->gp_mutex);
4010
4011 return (conflict);
4012 }
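/*
* Typical caller pattern (an illustrative sketch; see sys/nbmlock.h
* for the authoritative interfaces).  File systems normally reach
* this routine indirectly through nbl_conflict() while holding the
* vnode in an nbmand critical region, roughly:
*
*	if (nbl_need_check(vp)) {
*		nbl_start_crit(vp, RW_READER);
*		in_crit = 1;
*		error = nbl_svmand(vp, cr, &svmand);
*		if (error == 0 &&
*		    nbl_conflict(vp, NBL_WRITE, off, len, svmand, ct))
*			error = EACCES;
*	}
*	...
*	if (in_crit)
*		nbl_end_crit(vp);
*/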
4013
4014 /*
4015 * Return non-zero if the given I/O request conflicts with the given lock.
4016 */
4017
4018 static int
4019 lock_blocks_io(nbl_op_t op, u_offset_t offset, ssize_t length,
4020 int lock_type, u_offset_t lock_start, u_offset_t lock_end)
4021 {
4022 ASSERT(op == NBL_READ || op == NBL_WRITE || op == NBL_READWRITE);
4023 ASSERT(lock_type == F_RDLCK || lock_type == F_WRLCK);
4024
4025 if (op == NBL_READ && lock_type == F_RDLCK)
4026 return (0);
4027
4028 if (offset <= lock_start && lock_start < offset + length)
4029 return (1);
4030 if (lock_start <= offset && offset <= lock_end)
4031 return (1);
4032
4033 return (0);
4034 }
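/*
* Worked example (illustrative only): a write at offset 100 of
* length 100 (bytes [100, 199]) against a read lock over [150, 250]
* satisfies the first test (100 <= 150 && 150 < 200) and conflicts;
* the same write against a lock over [200, 250] satisfies neither
* test and does not conflict.  A read against a read lock never
* conflicts, regardless of overlap.
*/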
4035
4036 #ifdef DEBUG
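/*
* Debug check: every lock on the active list must be active, not
* blocked and not a barrier; every lock sleeping on it must be marked
* as blocked; and no two active locks on the same vnode may block
* each other.
*/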
4037 static void
4038 check_active_locks(graph_t *gp)
4039 {
4040 lock_descriptor_t *lock, *lock1;
4041 edge_t *ep;
4042
4043 for (lock = ACTIVE_HEAD(gp)->l_next; lock != ACTIVE_HEAD(gp);
4044 lock = lock->l_next) {
4045 ASSERT(IS_ACTIVE(lock));
4046 ASSERT(NOT_BLOCKED(lock));
4047 ASSERT(!IS_BARRIER(lock));
4048
4049 ep = FIRST_IN(lock);
4050
4051 while (ep != HEAD(lock)) {
4052 ASSERT(IS_SLEEPING(ep->from_vertex));
4053 ASSERT(!NOT_BLOCKED(ep->from_vertex));
4054 ep = NEXT_IN(ep);
4055 }
4056
4057 for (lock1 = lock->l_next; lock1 != ACTIVE_HEAD(gp);
4058 lock1 = lock1->l_next) {
4059 if (lock1->l_vnode == lock->l_vnode) {
4060 if (BLOCKS(lock1, lock)) {
4061 cmn_err(CE_PANIC,
4062 "active lock %p blocks %p",
4063 (void *)lock1, (void *)lock);
4064 } else if (BLOCKS(lock, lock1)) {
4065 cmn_err(CE_PANIC,
4066 "active lock %p blocks %p",
4067 (void *)lock, (void *)lock1);
4068 }
4069 }
4070 }
4071 }
4072 }
4073
4074 /*
4075 * Effect: This function checks whether the transition from 'old_state' to
4076 * 'new_state' is valid. It returns 0 if the transition is valid
4077 * and 1 if it is not.
4078 * For a map of valid transitions, see sys/flock_impl.h
4079 */
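/*
* Summary of the transitions accepted below (old state -> permitted
* new states):
*
*	INITIAL		-> START, SLEEPING, ACTIVE, DEAD
*	START		-> ACTIVE, DEAD
*	ACTIVE		-> DEAD
*	SLEEPING	-> GRANTED, INTERRUPTED, CANCELLED
*	GRANTED		-> START, INTERRUPTED, CANCELLED
*	CANCELLED	-> INTERRUPTED, DEAD
*	INTERRUPTED	-> DEAD
*	DEAD		-> DEAD (may be set more than once)
*/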
4080 static int
4081 check_lock_transition(int old_state, int new_state)
4082 {
4083 switch (old_state) {
4084 case FLK_INITIAL_STATE:
4085 if ((new_state == FLK_START_STATE) ||
4086 (new_state == FLK_SLEEPING_STATE) ||
4087 (new_state == FLK_ACTIVE_STATE) ||
4088 (new_state == FLK_DEAD_STATE)) {
4089 return (0);
4090 } else {
4091 return (1);
4092 }
4093 case FLK_START_STATE:
4094 if ((new_state == FLK_ACTIVE_STATE) ||
4095 (new_state == FLK_DEAD_STATE)) {
4096 return (0);
4097 } else {
4098 return (1);
4099 }
4100 case FLK_ACTIVE_STATE:
4101 if (new_state == FLK_DEAD_STATE) {
4102 return (0);
4103 } else {
4104 return (1);
4105 }
4106 case FLK_SLEEPING_STATE:
4107 if ((new_state == FLK_GRANTED_STATE) ||
4108 (new_state == FLK_INTERRUPTED_STATE) ||
4109 (new_state == FLK_CANCELLED_STATE)) {
4110 return (0);
4111 } else {
4112 return (1);
4113 }
4114 case FLK_GRANTED_STATE:
4115 if ((new_state == FLK_START_STATE) ||
4116 (new_state == FLK_INTERRUPTED_STATE) ||
4117 (new_state == FLK_CANCELLED_STATE)) {
4118 return (0);
4119 } else {
4120 return (1);
4121 }
4122 case FLK_CANCELLED_STATE:
4123 if ((new_state == FLK_INTERRUPTED_STATE) ||
4124 (new_state == FLK_DEAD_STATE)) {
4125 return (0);
4126 } else {
4127 return (1);
4128 }
4129 case FLK_INTERRUPTED_STATE:
4130 if (new_state == FLK_DEAD_STATE) {
4131 return (0);
4132 } else {
4133 return (1);
4134 }
4135 case FLK_DEAD_STATE:
4136 /* May be set more than once */
4137 if (new_state == FLK_DEAD_STATE) {
4138 return (0);
4139 } else {
4140 return (1);
4141 }
4142 default:
4143 return (1);
4144 }
4145 }
4146
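/*
* Debug check: a sleeping lock must not be a barrier; if it is blocked
* by another sleeping or active lock on the same vnode it must not be
* granted or unblocked and there must be a path to its blocker in the
* graph; every outgoing edge must point to a lock that actually
* blocks it.
*/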
4147 static void
4148 check_sleeping_locks(graph_t *gp)
4149 {
4150 lock_descriptor_t *lock1, *lock2;
4151 edge_t *ep;
4152 for (lock1 = SLEEPING_HEAD(gp)->l_next; lock1 != SLEEPING_HEAD(gp);
4153 lock1 = lock1->l_next) {
4154 ASSERT(!IS_BARRIER(lock1));
4155 for (lock2 = lock1->l_next; lock2 != SLEEPING_HEAD(gp);
4156 lock2 = lock2->l_next) {
4157 if (lock1->l_vnode == lock2->l_vnode) {
4158 if (BLOCKS(lock2, lock1)) {
4159 ASSERT(!IS_GRANTED(lock1));
4160 ASSERT(!NOT_BLOCKED(lock1));
4161 path(lock1, lock2);
4162 }
4163 }
4164 }
4165
4166 for (lock2 = ACTIVE_HEAD(gp)->l_next; lock2 != ACTIVE_HEAD(gp);
4167 lock2 = lock2->l_next) {
4168 ASSERT(!IS_BARRIER(lock1));
4169 if (lock1->l_vnode == lock2->l_vnode) {
4170 if (BLOCKS(lock2, lock1)) {
4171 ASSERT(!IS_GRANTED(lock1));
4172 ASSERT(!NOT_BLOCKED(lock1));
4173 path(lock1, lock2);
4174 }
4175 }
4176 }
4177 ep = FIRST_ADJ(lock1);
4178 while (ep != HEAD(lock1)) {
4179 ASSERT(BLOCKS(ep->to_vertex, lock1));
4180 ep = NEXT_ADJ(ep);
4181 }
4182 }
4183 }
4184
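/*
* Return non-zero if lock2 is reachable from lock1 through a path of
* two or more edges (a depth-first search that starts from lock1's
* direct successors).  When 'no_path' is set, assert that lock1 has
* no direct edge to lock2.
*/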
4185 static int
4186 level_two_path(lock_descriptor_t *lock1, lock_descriptor_t *lock2, int no_path)
4187 {
4188 edge_t *ep;
4189 lock_descriptor_t *vertex;
4190 lock_descriptor_t *vertex_stack;
4191
4192 STACK_INIT(vertex_stack);
4193
4194 flk_graph_uncolor(lock1->l_graph);
4195 ep = FIRST_ADJ(lock1);
4196 ASSERT(ep != HEAD(lock1));
4197 while (ep != HEAD(lock1)) {
4198 if (no_path)
4199 ASSERT(ep->to_vertex != lock2);
4200 STACK_PUSH(vertex_stack, ep->to_vertex, l_dstack);
4201 COLOR(ep->to_vertex);
4202 ep = NEXT_ADJ(ep);
4203 }
4204
4205 while ((vertex = STACK_TOP(vertex_stack)) != NULL) {
4206 STACK_POP(vertex_stack, l_dstack);
4207 for (ep = FIRST_ADJ(vertex); ep != HEAD(vertex);
4208 ep = NEXT_ADJ(ep)) {
4209 if (COLORED(ep->to_vertex))
4210 continue;
4211 COLOR(ep->to_vertex);
4212 if (ep->to_vertex == lock2)
4213 return (1);
4214
4215 STACK_PUSH(vertex_stack, ep->to_vertex, l_dstack);
4216 }
4217 }
4218 return (0);
4219 }
4220
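/*
* Debug check: panic if any lock owned by (pid, sysid) on the given
* vnode is still present on the active or sleeping queue.
*/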
4221 static void
4222 check_owner_locks(graph_t *gp, pid_t pid, int sysid, vnode_t *vp)
4223 {
4224 lock_descriptor_t *lock;
4225
4226 SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
4227
4228 if (lock) {
4229 while (lock != ACTIVE_HEAD(gp) && (lock->l_vnode == vp)) {
4230 if (lock->l_flock.l_pid == pid &&
4231 lock->l_flock.l_sysid == sysid)
4232 cmn_err(CE_PANIC,
4233 "owner pid %d's lock %p in active queue",
4234 pid, (void *)lock);
4235 lock = lock->l_next;
4236 }
4237 }
4238 SET_LOCK_TO_FIRST_SLEEP_VP(gp, lock, vp);
4239
4240 if (lock) {
4241 while (lock != SLEEPING_HEAD(gp) && (lock->l_vnode == vp)) {
4242 if (lock->l_flock.l_pid == pid &&
4243 lock->l_flock.l_sysid == sysid)
4244 cmn_err(CE_PANIC,
4245 "owner pid %d's lock %p in sleep queue",
4246 pid, (void *)lock);
4247 lock = lock->l_next;
4248 }
4249 }
4250 }
4251
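/*
* Return non-zero if there is a direct edge from lock1 to lock2.
*/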
4252 static int
4253 level_one_path(lock_descriptor_t *lock1, lock_descriptor_t *lock2)
4254 {
4255 edge_t *ep = FIRST_ADJ(lock1);
4256
4257 while (ep != HEAD(lock1)) {
4258 if (ep->to_vertex == lock2)
4259 return (1);
4260 else
4261 ep = NEXT_ADJ(ep);
4262 }
4263 return (0);
4264 }
4265
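/*
* Return non-zero if there is no multi-edge path from lock1 to lock2.
*/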
4266 static int
4267 no_path(lock_descriptor_t *lock1, lock_descriptor_t *lock2)
4268 {
4269 return (!level_two_path(lock1, lock2, 1));
4270 }
4271
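/*
* Debug check: lock2 blocks lock1, so a path from lock1 to lock2 must
* exist; warn if both a direct edge and a longer path exist, and
* panic if there is no path at all.
*/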
4272 static void
4273 path(lock_descriptor_t *lock1, lock_descriptor_t *lock2)
4274 {
4275 if (level_one_path(lock1, lock2)) {
4276 if (level_two_path(lock1, lock2, 0) != 0) {
4277 cmn_err(CE_WARN,
4278 "one edge one path from lock1 %p lock2 %p",
4279 (void *)lock1, (void *)lock2);
4280 }
4281 } else if (no_path(lock1, lock2)) {
4282 cmn_err(CE_PANIC,
4283 "No path from lock1 %p to lock2 %p",
4284 (void *)lock1, (void *)lock2);
4285 }
4286 }
4287 #endif /* DEBUG */
4288