1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22 /*
23 * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
24 * Use is subject to license terms.
25 */
26
27 /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
28 /* All Rights Reserved */
29
30 /*
31 * Copyright 2019 Nexenta by DDN, Inc. All rights reserved.
32 * Copyright 2015 Joyent, Inc.
33 */
34
35 #include <sys/flock_impl.h>
36 #include <sys/vfs.h>
37 #include <sys/t_lock.h> /* for <sys/callb.h> */
38 #include <sys/callb.h>
39 #include <sys/clconf.h>
40 #include <sys/cladm.h>
41 #include <sys/nbmlock.h>
42 #include <sys/cred.h>
43 #include <sys/policy.h>
44
45 /*
46  * The following variables are used for statistics purposes and they are
47  * not protected by locks. They may not be accurate but will at least be
48  * close to the actual values.
49 */
50
51 int flk_lock_allocs;
52 int flk_lock_frees;
53 int edge_allocs;
54 int edge_frees;
55 int flk_proc_vertex_allocs;
56 int flk_proc_edge_allocs;
57 int flk_proc_vertex_frees;
58 int flk_proc_edge_frees;
59
60 static kmutex_t flock_lock;
61
62 #ifdef DEBUG
63 int check_debug = 0;
64 #define CHECK_ACTIVE_LOCKS(gp) if (check_debug) \
65 check_active_locks(gp);
66 #define CHECK_SLEEPING_LOCKS(gp) if (check_debug) \
67 check_sleeping_locks(gp);
68 #define CHECK_OWNER_LOCKS(gp, pid, sysid, vp) \
69 if (check_debug) \
70 check_owner_locks(gp, pid, sysid, vp);
71 #define CHECK_LOCK_TRANSITION(old_state, new_state) \
72 { \
73 if (check_lock_transition(old_state, new_state)) { \
74 cmn_err(CE_PANIC, "Illegal lock transition \
75 from %d to %d", old_state, new_state); \
76 } \
77 }
78 #else
79
80 #define CHECK_ACTIVE_LOCKS(gp)
81 #define CHECK_SLEEPING_LOCKS(gp)
82 #define CHECK_OWNER_LOCKS(gp, pid, sysid, vp)
83 #define CHECK_LOCK_TRANSITION(old_state, new_state)
84
85 #endif /* DEBUG */
86
87 struct kmem_cache *flk_edge_cache;
88
89 graph_t *lock_graph[HASH_SIZE];
90 proc_graph_t pgraph;
91
92 /*
93 * Clustering.
94 *
95 * NLM REGISTRY TYPE IMPLEMENTATION
96 *
97 * Assumptions:
98  *	1. Nodes in a cluster are numbered starting at 1; they are always positive
99 * integers; maximum node id is returned by clconf_maximum_nodeid().
100 * 2. We use this node id to identify the node an NLM server runs on.
101 */
102
103 /*
104  * The NLM registry object keeps track of the NLM servers that have
105  * requested locks at the LLM with which this registry is associated,
106  * identified by their nlmids (the node ids of the cluster nodes they
107  * run on).
108 *
109 * Representation of abstraction:
110 * rep = record[ states: array[nlm_state],
111 * lock: mutex]
112 *
113 * Representation invariants:
114  *	1. index i of rep.states is between 0 and n - 1 where n is the number
115  *	   of elements in the array, which happens to be the maximum number
116  *	   of nodes in the cluster configuration + 1.
117  *	2. map nlmid to index i of rep.states
118  *		0   -> 0
119  *		1   -> 1
120  *		2   -> 2
121  *		n-1 -> clconf_maximum_nodeid()
122 * 3. This 1-1 mapping is quite convenient and it avoids errors resulting
123 * from forgetting to subtract 1 from the index.
124 * 4. The reason we keep the 0th index is the following. A legitimate
125 * cluster configuration includes making a UFS file system NFS
126 * exportable. The code is structured so that if you're in a cluster
127 * you do one thing; otherwise, you do something else. The problem
128 * is what to do if you think you're in a cluster with PXFS loaded,
129 * but you're using UFS not PXFS? The upper two bytes of the sysid
130 * encode the node id of the node where NLM server runs; these bytes
131 * are zero for UFS. Since the nodeid is used to index into the
132 * registry, we can record the NLM server state information at index
133 * 0 using the same mechanism used for PXFS file locks!
134 */
135 static flk_nlm_status_t *nlm_reg_status = NULL; /* state array 0..N-1 */
136 static kmutex_t nlm_reg_lock;			/* lock to protect array */
137 static uint_t nlm_status_size; /* size of state array */
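
/*
 * Illustrative sketch (comment only, not compiled): how a remote request's
 * sysid selects a registry slot under the scheme described above.
 * GETNLMID() is the macro used in reclock() below; the variable names are
 * made up for the example.
 *
 *	nlmid = GETNLMID(lckdat->l_sysid);   upper two bytes carry the node id
 *	state = nlm_reg_status[nlmid];	     slot 0 covers the local/UFS case
 */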
138
139 /*
140 * Although we need a global lock dependency graph (and associated data
141 * structures), we also need a per-zone notion of whether the lock manager is
142 * running, and so whether to allow lock manager requests or not.
143 *
144 * Thus, on a per-zone basis we maintain a ``global'' variable
145 * (flk_lockmgr_status), protected by flock_lock, and set when the lock
146 * manager is determined to be changing state (starting or stopping).
147 *
148 * Each graph/zone pair also has a copy of this variable, which is protected by
149 * the graph's mutex.
150 *
151 * The per-graph copies are used to synchronize lock requests with shutdown
152 * requests. The global copy is used to initialize the per-graph field when a
153 * new graph is created.
154 */
155 struct flock_globals {
156 flk_lockmgr_status_t flk_lockmgr_status;
157 flk_lockmgr_status_t lockmgr_status[HASH_SIZE];
158 };
159
160 zone_key_t flock_zone_key;
161
162 static void create_flock(lock_descriptor_t *, flock64_t *);
163 static lock_descriptor_t *flk_get_lock(void);
164 static void flk_free_lock(lock_descriptor_t *lock);
165 static void flk_get_first_blocking_lock(lock_descriptor_t *request);
166 static int flk_process_request(lock_descriptor_t *);
167 static int flk_add_edge(lock_descriptor_t *, lock_descriptor_t *, int, int);
168 static edge_t *flk_get_edge(void);
169 static int flk_wait_execute_request(lock_descriptor_t *);
170 static int flk_relation(lock_descriptor_t *, lock_descriptor_t *);
171 static void flk_insert_active_lock(lock_descriptor_t *);
172 static void flk_delete_active_lock(lock_descriptor_t *, int);
173 static void flk_insert_sleeping_lock(lock_descriptor_t *);
174 static void flk_graph_uncolor(graph_t *);
175 static void flk_wakeup(lock_descriptor_t *, int);
176 static void flk_free_edge(edge_t *);
177 static void flk_recompute_dependencies(lock_descriptor_t *,
178 lock_descriptor_t **, int, int);
179 static int flk_find_barriers(lock_descriptor_t *);
180 static void flk_update_barriers(lock_descriptor_t *);
181 static int flk_color_reachables(lock_descriptor_t *);
182 static int flk_canceled(lock_descriptor_t *);
183 static void flk_delete_locks_by_sysid(lock_descriptor_t *);
184 static void report_blocker(lock_descriptor_t *, lock_descriptor_t *);
185 static void wait_for_lock(lock_descriptor_t *);
186 static void unlock_lockmgr_granted(struct flock_globals *);
187 static void wakeup_sleeping_lockmgr_locks(struct flock_globals *);
188
189 /* Clustering hooks */
190 static void cl_flk_change_nlm_state_all_locks(int, flk_nlm_status_t);
191 static void cl_flk_wakeup_sleeping_nlm_locks(int);
192 static void cl_flk_unlock_nlm_granted(int);
193
194 #ifdef DEBUG
195 static int check_lock_transition(int, int);
196 static void check_sleeping_locks(graph_t *);
197 static void check_active_locks(graph_t *);
198 static int no_path(lock_descriptor_t *, lock_descriptor_t *);
199 static void path(lock_descriptor_t *, lock_descriptor_t *);
200 static void check_owner_locks(graph_t *, pid_t, int, vnode_t *);
201 static int level_one_path(lock_descriptor_t *, lock_descriptor_t *);
202 static int level_two_path(lock_descriptor_t *, lock_descriptor_t *, int);
203 #endif
204
205 /* proc_graph function definitions */
206 static int flk_check_deadlock(lock_descriptor_t *);
207 static void flk_proc_graph_uncolor(void);
208 static proc_vertex_t *flk_get_proc_vertex(lock_descriptor_t *);
209 static proc_edge_t *flk_get_proc_edge(void);
210 static void flk_proc_release(proc_vertex_t *);
211 static void flk_free_proc_edge(proc_edge_t *);
212 static void flk_update_proc_graph(edge_t *, int);
213
214 /* Non-blocking mandatory locking */
215 static int lock_blocks_io(nbl_op_t, u_offset_t, ssize_t, int, u_offset_t,
216 u_offset_t);
217
218 static struct flock_globals *
219 flk_get_globals(void)
220 {
221 /*
222 * The KLM module had better be loaded if we're attempting to handle
223 * lockmgr requests.
224 */
225 ASSERT(flock_zone_key != ZONE_KEY_UNINITIALIZED);
226 return (zone_getspecific(flock_zone_key, curproc->p_zone));
227 }
228
229 static flk_lockmgr_status_t
230 flk_get_lockmgr_status(void)
231 {
232 struct flock_globals *fg;
233
234 ASSERT(MUTEX_HELD(&flock_lock));
235
236 if (flock_zone_key == ZONE_KEY_UNINITIALIZED) {
237 /*
238 * KLM module not loaded; lock manager definitely not running.
239 */
240 return (FLK_LOCKMGR_DOWN);
241 }
242 fg = flk_get_globals();
243 return (fg->flk_lockmgr_status);
244 }
245
246 /*
247 * This implements Open File Description (not descriptor) style record locking.
248 * These locks can also be thought of as pid-less since they are not tied to a
249 * specific process, thus they're preserved across fork.
250 *
251 * Called directly from fcntl.
252 *
253 * See reclock() for the implementation of the traditional POSIX style record
254 * locking scheme (pid-ful). This function is derived from reclock() but
255 * simplified and modified to work for OFD style locking.
256 *
257 * The two primary advantages of OFD style of locking are:
258  * 1) It is per file description, so closing a file descriptor that refers to a
259  *    different file description for the same file will not drop the lock (i.e.
260  *    two opens of the same file get different descriptions, but a dup or fork
261  *    will refer to the same description).
262 * 2) Locks are preserved across fork(2).
263 *
264 * Because these locks are per-description a lock ptr lives at the f_filocks
265 * member of the file_t and the lock_descriptor includes a file_t pointer
266 * to enable unique lock identification and management.
267 *
268 * Since these locks are pid-less we cannot do deadlock detection with the
269 * current process-oriented implementation. This is consistent with OFD locking
270 * behavior on other operating systems such as Linux. Since we don't do
271 * deadlock detection we never interact with the process graph that is
272 * maintained for deadlock detection on the traditional POSIX-style locks.
273 *
274 * Future Work:
275 *
276 * The current implementation does not support record locks. That is,
277 * currently the single lock must cover the entire file. This is validated in
278 * fcntl. To support record locks the f_filock pointer in the file_t needs to
279 * be changed to a list of pointers to the locks. That list needs to be
280 * managed independently of the lock list on the vnode itself and it needs to
281 * be maintained as record locks are created, split, coalesced and deleted.
282 *
283 * The current implementation does not support remote file systems (e.g.
284 * NFS or CIFS). This is handled in fs_frlock(). The design of how OFD locks
285 * interact with the NLM is not clear since the NLM protocol/implementation
286 * appears to be oriented around locks associated with a process. A further
287 * problem is that a design is needed for what nlm_send_siglost() should do and
288 * where it will send SIGLOST. More recent versions of Linux apparently try to
289 * emulate OFD locks on NFS by converting them to traditional POSIX style locks
290 * that work with the NLM. It is not clear that this provides the correct
291 * semantics in all cases.
292 */
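
/*
 * For reference, a minimal hypothetical userland sequence that exercises
 * this path (names and flags as seen by fcntl(2)); the whole-file shape
 * matches the restriction verified below (l_whence, l_start and l_len all
 * zero):
 *
 *	struct flock64 fl = { 0 };
 *	fl.l_type = F_WRLCK;
 *	(void) fcntl(fd, F_OFD_SETLKW, &fl);	sleeps until granted
 *	...
 *	fl.l_type = F_UNLCK;
 *	(void) fcntl(fd, F_OFD_SETLK, &fl);	also dropped on last close
 */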
293 int
294 ofdlock(file_t *fp, int fcmd, flock64_t *lckdat, int flag, u_offset_t offset)
295 {
296 int cmd = 0;
297 vnode_t *vp;
298 lock_descriptor_t stack_lock_request;
299 lock_descriptor_t *lock_request;
300 int error = 0;
301 graph_t *gp;
302 int serialize = 0;
303
304 if (fcmd != F_OFD_GETLK)
305 cmd = SETFLCK;
306
307 if (fcmd == F_OFD_SETLKW || fcmd == F_FLOCKW)
308 cmd |= SLPFLCK;
309
310 /* see block comment */
311 VERIFY(lckdat->l_whence == 0);
312 VERIFY(lckdat->l_start == 0);
313 VERIFY(lckdat->l_len == 0);
314
315 vp = fp->f_vnode;
316
317 /*
318 * For reclock fs_frlock() would normally have set these in a few
319 * places but for us it's cleaner to centralize it here. Note that
320 * IGN_PID is -1. We use 0 for our pid-less locks.
321 */
322 lckdat->l_pid = 0;
323 lckdat->l_sysid = 0;
324
325 /*
326 * Check access permissions
327 */
328 if ((fcmd == F_OFD_SETLK || fcmd == F_OFD_SETLKW) &&
329 ((lckdat->l_type == F_RDLCK && (flag & FREAD) == 0) ||
330 (lckdat->l_type == F_WRLCK && (flag & FWRITE) == 0)))
331 return (EBADF);
332
333 /*
334 * for query and unlock we use the stack_lock_request
335 */
336 if (lckdat->l_type == F_UNLCK || !(cmd & SETFLCK)) {
337 lock_request = &stack_lock_request;
338 (void) bzero((caddr_t)lock_request,
339 sizeof (lock_descriptor_t));
340
341 /*
342 * following is added to make the assertions in
343 * flk_execute_request() pass
344 */
345 lock_request->l_edge.edge_in_next = &lock_request->l_edge;
346 lock_request->l_edge.edge_in_prev = &lock_request->l_edge;
347 lock_request->l_edge.edge_adj_next = &lock_request->l_edge;
348 lock_request->l_edge.edge_adj_prev = &lock_request->l_edge;
349 lock_request->l_status = FLK_INITIAL_STATE;
350 } else {
351 lock_request = flk_get_lock();
352 fp->f_filock = (struct filock *)lock_request;
353 }
354 lock_request->l_state = 0;
355 lock_request->l_vnode = vp;
356 lock_request->l_zoneid = getzoneid();
357 lock_request->l_ofd = fp;
358
359 /*
360 * Convert the request range into the canonical start and end
361 * values then check the validity of the lock range.
362 */
363 error = flk_convert_lock_data(vp, lckdat, &lock_request->l_start,
364 &lock_request->l_end, offset);
365 if (error)
366 goto done;
367
368 error = flk_check_lock_data(lock_request->l_start, lock_request->l_end,
369 MAXEND);
370 if (error)
371 goto done;
372
373 ASSERT(lock_request->l_end >= lock_request->l_start);
374
375 lock_request->l_type = lckdat->l_type;
376 if (cmd & SLPFLCK)
377 lock_request->l_state |= WILLING_TO_SLEEP_LOCK;
378
379 if (!(cmd & SETFLCK)) {
380 if (lock_request->l_type == F_RDLCK ||
381 lock_request->l_type == F_WRLCK)
382 lock_request->l_state |= QUERY_LOCK;
383 }
384 lock_request->l_flock = (*lckdat);
385
386 /*
387 * We are ready for processing the request
388 */
389
390 if (fcmd != F_OFD_GETLK && lock_request->l_type != F_UNLCK &&
391 nbl_need_check(vp)) {
392 nbl_start_crit(vp, RW_WRITER);
393 serialize = 1;
394 }
395
396 /* Get the lock graph for a particular vnode */
397 gp = flk_get_lock_graph(vp, FLK_INIT_GRAPH);
398
399 mutex_enter(&gp->gp_mutex);
400
401 lock_request->l_state |= REFERENCED_LOCK;
402 lock_request->l_graph = gp;
403
404 switch (lock_request->l_type) {
405 case F_RDLCK:
406 case F_WRLCK:
407 if (IS_QUERY_LOCK(lock_request)) {
408 flk_get_first_blocking_lock(lock_request);
409 if (lock_request->l_ofd != NULL)
410 lock_request->l_flock.l_pid = -1;
411 (*lckdat) = lock_request->l_flock;
412 } else {
413 /* process the request now */
414 error = flk_process_request(lock_request);
415 }
416 break;
417
418 case F_UNLCK:
419 /* unlock request will not block so execute it immediately */
420 error = flk_execute_request(lock_request);
421 break;
422
423 default:
424 error = EINVAL;
425 break;
426 }
427
428 if (lock_request == &stack_lock_request) {
429 flk_set_state(lock_request, FLK_DEAD_STATE);
430 } else {
431 lock_request->l_state &= ~REFERENCED_LOCK;
432 if ((error != 0) || IS_DELETED(lock_request)) {
433 flk_set_state(lock_request, FLK_DEAD_STATE);
434 flk_free_lock(lock_request);
435 }
436 }
437
438 mutex_exit(&gp->gp_mutex);
439 if (serialize)
440 nbl_end_crit(vp);
441
442 return (error);
443
444 done:
445 flk_set_state(lock_request, FLK_DEAD_STATE);
446 if (lock_request != &stack_lock_request)
447 flk_free_lock(lock_request);
448 return (error);
449 }
450
451 /*
452 * Remove any lock on the vnode belonging to the given file_t.
453 * Called from closef on last close, file_t is locked.
454 *
455 * This is modeled on the cleanlocks() function but only removes the single
456 * lock associated with fp.
457 */
458 void
459 ofdcleanlock(file_t *fp)
460 {
461 lock_descriptor_t *fplock, *lock, *nlock;
462 vnode_t *vp;
463 graph_t *gp;
464
465 ASSERT(MUTEX_HELD(&fp->f_tlock));
466
467 if ((fplock = (lock_descriptor_t *)fp->f_filock) == NULL)
468 return;
469
470 fp->f_filock = NULL;
471 vp = fp->f_vnode;
472
473 gp = flk_get_lock_graph(vp, FLK_USE_GRAPH);
474
475 if (gp == NULL)
476 return;
477 mutex_enter(&gp->gp_mutex);
478
479 CHECK_SLEEPING_LOCKS(gp);
480 CHECK_ACTIVE_LOCKS(gp);
481
482 SET_LOCK_TO_FIRST_SLEEP_VP(gp, lock, vp);
483
484 if (lock) {
485 do {
486 nlock = lock->l_next;
487 if (fplock == lock) {
488 CANCEL_WAKEUP(lock);
489 break;
490 }
491 lock = nlock;
492 } while (lock->l_vnode == vp);
493 }
494
495 SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
496
497 if (lock) {
498 do {
499 nlock = lock->l_next;
500 if (fplock == lock) {
501 flk_delete_active_lock(lock, 0);
502 flk_wakeup(lock, 1);
503 flk_free_lock(lock);
504 break;
505 }
506 lock = nlock;
507 } while (lock->l_vnode == vp);
508 }
509
510 CHECK_SLEEPING_LOCKS(gp);
511 CHECK_ACTIVE_LOCKS(gp);
512 mutex_exit(&gp->gp_mutex);
513 }
514
515 /*
516 * Routine called from fs_frlock in fs/fs_subr.c
517 *
518 * This implements traditional POSIX style record locking. The two primary
519 * drawbacks to this style of locking are:
520  * 1) It is per-process, so any close of a file descriptor that refers to the
521  *    file will drop the lock (e.g. lock /etc/passwd, call a library function
522  *    which opens /etc/passwd to read the file; when the library closes its
523  *    file descriptor the application loses its lock and does not know).
524 * 2) Locks are not preserved across fork(2).
525 *
526 * Because these locks are only associated with a PID, they are per-process.
527 * This is why any close will drop the lock and is also why, once the process
528 * forks, the lock is no longer related to the new process. These locks can
529 * be considered as PID-ful.
530 *
531 * See ofdlock() for the implementation of a similar but improved locking
532 * scheme.
533 */
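
/*
 * For contrast, a hypothetical userland sequence that reaches this function
 * through fcntl(2) -> VOP_FRLOCK() -> fs_frlock() (illustrative only):
 *
 *	struct flock64 fl;
 *	fl.l_type = F_WRLCK;
 *	fl.l_whence = SEEK_SET;
 *	fl.l_start = 0;
 *	fl.l_len = 0;				0 means "through end of file"
 *	(void) fcntl(fd, F_SETLKW, &fl);	lock owned by (l_pid, l_sysid)
 */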
534 int
535 reclock(vnode_t *vp, flock64_t *lckdat, int cmd, int flag, u_offset_t offset,
536 flk_callback_t *flk_cbp)
537 {
538 lock_descriptor_t stack_lock_request;
539 lock_descriptor_t *lock_request;
540 int error = 0;
541 graph_t *gp;
542 int nlmid;
543
544 /*
545 * Check access permissions
546 */
547 if ((cmd & SETFLCK) &&
548 ((lckdat->l_type == F_RDLCK && (flag & FREAD) == 0) ||
549 (lckdat->l_type == F_WRLCK && (flag & FWRITE) == 0)))
550 return (EBADF);
551
552 /*
553 * for query and unlock we use the stack_lock_request
554 */
555
556 if ((lckdat->l_type == F_UNLCK) ||
557 !((cmd & INOFLCK) || (cmd & SETFLCK))) {
558 lock_request = &stack_lock_request;
559 (void) bzero((caddr_t)lock_request,
560 sizeof (lock_descriptor_t));
561
562 /*
563 * following is added to make the assertions in
564  * flk_execute_request() pass
565 */
566
567 lock_request->l_edge.edge_in_next = &lock_request->l_edge;
568 lock_request->l_edge.edge_in_prev = &lock_request->l_edge;
569 lock_request->l_edge.edge_adj_next = &lock_request->l_edge;
570 lock_request->l_edge.edge_adj_prev = &lock_request->l_edge;
571 lock_request->l_status = FLK_INITIAL_STATE;
572 } else {
573 lock_request = flk_get_lock();
574 }
575 lock_request->l_state = 0;
576 lock_request->l_vnode = vp;
577 lock_request->l_zoneid = getzoneid();
578
579 /*
580 * Convert the request range into the canonical start and end
581 * values. The NLM protocol supports locking over the entire
582 * 32-bit range, so there's no range checking for remote requests,
583 * but we still need to verify that local requests obey the rules.
584 */
585 /* Clustering */
586 if ((cmd & (RCMDLCK | PCMDLCK)) != 0) {
587 ASSERT(lckdat->l_whence == 0);
588 lock_request->l_start = lckdat->l_start;
589 lock_request->l_end = (lckdat->l_len == 0) ? MAX_U_OFFSET_T :
590 lckdat->l_start + (lckdat->l_len - 1);
591 } else {
592 /* check the validity of the lock range */
593 error = flk_convert_lock_data(vp, lckdat,
594 &lock_request->l_start, &lock_request->l_end,
595 offset);
596 if (error) {
597 goto done;
598 }
599 error = flk_check_lock_data(lock_request->l_start,
600 lock_request->l_end, MAXEND);
601 if (error) {
602 goto done;
603 }
604 }
605
606 ASSERT(lock_request->l_end >= lock_request->l_start);
607
608 lock_request->l_type = lckdat->l_type;
609 if (cmd & INOFLCK)
610 lock_request->l_state |= IO_LOCK;
611 if (cmd & SLPFLCK)
612 lock_request->l_state |= WILLING_TO_SLEEP_LOCK;
613 if (cmd & RCMDLCK)
614 lock_request->l_state |= LOCKMGR_LOCK;
615 if (cmd & NBMLCK)
616 lock_request->l_state |= NBMAND_LOCK;
617 /*
618 * Clustering: set flag for PXFS locks
619 * We do not _only_ check for the PCMDLCK flag because PXFS locks could
620 * also be of type 'RCMDLCK'.
621 * We do not _only_ check the GETPXFSID() macro because local PXFS
622 * clients use a pxfsid of zero to permit deadlock detection in the LLM.
623 */
624
625 if ((cmd & PCMDLCK) || (GETPXFSID(lckdat->l_sysid) != 0)) {
626 lock_request->l_state |= PXFS_LOCK;
627 }
628 if (!((cmd & SETFLCK) || (cmd & INOFLCK))) {
629 if (lock_request->l_type == F_RDLCK ||
630 lock_request->l_type == F_WRLCK)
631 lock_request->l_state |= QUERY_LOCK;
632 }
633 lock_request->l_flock = (*lckdat);
634 lock_request->l_callbacks = flk_cbp;
635
636 /*
637 * We are ready for processing the request
638 */
639 if (IS_LOCKMGR(lock_request)) {
640 /*
641 * If the lock request is an NLM server request ....
642 */
643 if (nlm_status_size == 0) { /* not booted as cluster */
644 mutex_enter(&flock_lock);
645 /*
646 * Bail out if this is a lock manager request and the
647 * lock manager is not supposed to be running.
648 */
649 if (flk_get_lockmgr_status() != FLK_LOCKMGR_UP) {
650 mutex_exit(&flock_lock);
651 error = ENOLCK;
652 goto done;
653 }
654 mutex_exit(&flock_lock);
655 } else { /* booted as a cluster */
656 nlmid = GETNLMID(lock_request->l_flock.l_sysid);
657 ASSERT(nlmid <= nlm_status_size && nlmid >= 0);
658
659 mutex_enter(&nlm_reg_lock);
660 /*
661 * If the NLM registry does not know about this
662 * NLM server making the request, add its nlmid
663 * to the registry.
664 */
665 if (FLK_REGISTRY_IS_NLM_UNKNOWN(nlm_reg_status,
666 nlmid)) {
667 FLK_REGISTRY_ADD_NLMID(nlm_reg_status, nlmid);
668 } else if (!FLK_REGISTRY_IS_NLM_UP(nlm_reg_status,
669 nlmid)) {
670 /*
671 * If the NLM server is already known (has made
672 * previous lock requests) and its state is
673 * not NLM_UP (means that NLM server is
674 * shutting down), then bail out with an
675 * error to deny the lock request.
676 */
677 mutex_exit(&nlm_reg_lock);
678 error = ENOLCK;
679 goto done;
680 }
681 mutex_exit(&nlm_reg_lock);
682 }
683 }
684
685 /* Now get the lock graph for a particular vnode */
686 gp = flk_get_lock_graph(vp, FLK_INIT_GRAPH);
687
688 /*
689 * We drop rwlock here otherwise this might end up causing a
690 * deadlock if this IOLOCK sleeps. (bugid # 1183392).
691 */
692
693 if (IS_IO_LOCK(lock_request)) {
694 VOP_RWUNLOCK(vp,
695 (lock_request->l_type == F_RDLCK) ?
696 V_WRITELOCK_FALSE : V_WRITELOCK_TRUE, NULL);
697 }
698 mutex_enter(&gp->gp_mutex);
699
700 lock_request->l_state |= REFERENCED_LOCK;
701 lock_request->l_graph = gp;
702
703 switch (lock_request->l_type) {
704 case F_RDLCK:
705 case F_WRLCK:
706 if (IS_QUERY_LOCK(lock_request)) {
707 flk_get_first_blocking_lock(lock_request);
708 if (lock_request->l_ofd != NULL)
709 lock_request->l_flock.l_pid = -1;
710 (*lckdat) = lock_request->l_flock;
711 break;
712 }
713
714 /* process the request now */
715
716 error = flk_process_request(lock_request);
717 break;
718
719 case F_UNLCK:
720 /* unlock request will not block so execute it immediately */
721
722 if (IS_LOCKMGR(lock_request) &&
723 flk_canceled(lock_request)) {
724 error = 0;
725 } else {
726 error = flk_execute_request(lock_request);
727 }
728 break;
729
730 case F_UNLKSYS:
731 /*
732 * Recovery mechanism to release lock manager locks when
733  * NFS client crashes and restarts. NFS server will clear
734 * old locks and grant new locks.
735 */
736
737 if (lock_request->l_flock.l_sysid == 0) {
738 mutex_exit(&gp->gp_mutex);
739 return (EINVAL);
740 }
741 if (secpolicy_nfs(CRED()) != 0) {
742 mutex_exit(&gp->gp_mutex);
743 return (EPERM);
744 }
745 flk_delete_locks_by_sysid(lock_request);
746 lock_request->l_state &= ~REFERENCED_LOCK;
747 flk_set_state(lock_request, FLK_DEAD_STATE);
748 flk_free_lock(lock_request);
749 mutex_exit(&gp->gp_mutex);
750 return (0);
751
752 default:
753 error = EINVAL;
754 break;
755 }
756
757 /* Clustering: For blocked PXFS locks, return */
758 if (error == PXFS_LOCK_BLOCKED) {
759 lock_request->l_state &= ~REFERENCED_LOCK;
760 mutex_exit(&gp->gp_mutex);
761 return (error);
762 }
763
764 /*
765 * Now that we have seen the status of locks in the system for
766 * this vnode we acquire the rwlock if it is an IO_LOCK.
767 */
768
769 if (IS_IO_LOCK(lock_request)) {
770 (void) VOP_RWLOCK(vp,
771 (lock_request->l_type == F_RDLCK) ?
772 V_WRITELOCK_FALSE : V_WRITELOCK_TRUE, NULL);
773 if (!error) {
774 lckdat->l_type = F_UNLCK;
775
776 /*
777  * This wakeup is needed; otherwise, if the IO_LOCK
778  * has slept, the dependents on this lock
779  * will not be woken up at all. (bugid # 1185482).
780 */
781
782 flk_wakeup(lock_request, 1);
783 flk_set_state(lock_request, FLK_DEAD_STATE);
784 flk_free_lock(lock_request);
785 }
786 /*
787  * If an error occurred, then either flk_process_request()
788  * returned EDEADLK, in which case there will be no
789  * dependents for this lock, or flk_wait_execute_request()
790  * returned EINTR, in which case flk_cancel_sleeping_lock()
791  * has already been done. The same is true with EBADF.
792 */
793 }
794
795 if (lock_request == &stack_lock_request) {
796 flk_set_state(lock_request, FLK_DEAD_STATE);
797 } else {
798 lock_request->l_state &= ~REFERENCED_LOCK;
799 if ((error != 0) || IS_DELETED(lock_request)) {
800 flk_set_state(lock_request, FLK_DEAD_STATE);
801 flk_free_lock(lock_request);
802 }
803 }
804
805 mutex_exit(&gp->gp_mutex);
806 return (error);
807
808 done:
809 flk_set_state(lock_request, FLK_DEAD_STATE);
810 if (lock_request != &stack_lock_request)
811 flk_free_lock(lock_request);
812 return (error);
813 }
814
815 /*
816 * Invoke the callbacks in the given list. If before sleeping, invoke in
817 * list order. If after sleeping, invoke in reverse order.
818 *
819 * CPR (suspend/resume) support: if one of the callbacks returns a
820 * callb_cpr_t, return it. This will be used to make the thread CPR-safe
821 * while it is sleeping. There should be at most one callb_cpr_t for the
822 * thread.
823 * XXX This is unnecessarily complicated. The CPR information should just
824 * get passed in directly through VOP_FRLOCK and reclock, rather than
825 * sneaking it in via a callback.
826 */
827
828 callb_cpr_t *
829 flk_invoke_callbacks(flk_callback_t *cblist, flk_cb_when_t when)
830 {
831 callb_cpr_t *cpr_callbackp = NULL;
832 callb_cpr_t *one_result;
833 flk_callback_t *cb;
834
835 if (cblist == NULL)
836 return (NULL);
837
838 if (when == FLK_BEFORE_SLEEP) {
839 cb = cblist;
840 do {
841 one_result = (*cb->cb_callback)(when, cb->cb_data);
842 if (one_result != NULL) {
843 ASSERT(cpr_callbackp == NULL);
844 cpr_callbackp = one_result;
845 }
846 cb = cb->cb_next;
847 } while (cb != cblist);
848 } else {
849 cb = cblist->cb_prev;
850 do {
851 one_result = (*cb->cb_callback)(when, cb->cb_data);
852 if (one_result != NULL) {
853 cpr_callbackp = one_result;
854 }
855 cb = cb->cb_prev;
856 } while (cb != cblist->cb_prev);
857 }
858
859 return (cpr_callbackp);
860 }
861
862 /*
863 * Initialize a flk_callback_t to hold the given callback.
864 */
865
866 void
867 flk_init_callback(flk_callback_t *flk_cb,
868 callb_cpr_t *(*cb_fcn)(flk_cb_when_t, void *), void *cbdata)
869 {
870 flk_cb->cb_next = flk_cb;
871 flk_cb->cb_prev = flk_cb;
872 flk_cb->cb_callback = cb_fcn;
873 flk_cb->cb_data = cbdata;
874 }
875
876 /*
877 * Initialize an flk_callback_t and then link it into the head of an
878 * existing list (which may be NULL).
879 */
880
881 void
882 flk_add_callback(flk_callback_t *newcb,
883 callb_cpr_t *(*cb_fcn)(flk_cb_when_t, void *),
884 void *cbdata, flk_callback_t *cblist)
885 {
886 flk_init_callback(newcb, cb_fcn, cbdata);
887
888 if (cblist == NULL)
889 return;
890
891 newcb->cb_prev = cblist->cb_prev;
892 newcb->cb_next = cblist;
893 cblist->cb_prev->cb_next = newcb;
894 cblist->cb_prev = newcb;
895 }
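
/*
 * Usage sketch (an assumption, not copied from a real caller): a caller
 * that needs CPR-safe sleeping can hand reclock() a callback that returns
 * its callb_cpr_t on the FLK_BEFORE_SLEEP invocation:
 *
 *	static callb_cpr_t *
 *	my_frlock_cb(flk_cb_when_t when, void *arg)
 *	{
 *		return (when == FLK_BEFORE_SLEEP ? (callb_cpr_t *)arg : NULL);
 *	}
 *
 *	flk_callback_t cb;
 *	flk_init_callback(&cb, my_frlock_cb, &my_cpr_info);
 *	error = reclock(vp, &fl, cmd, flag, offset, &cb);
 */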
896
897 /*
898 * Remove the callback from a list.
899 */
900
901 void
902 flk_del_callback(flk_callback_t *flk_cb)
903 {
904 flk_cb->cb_next->cb_prev = flk_cb->cb_prev;
905 flk_cb->cb_prev->cb_next = flk_cb->cb_next;
906
907 flk_cb->cb_prev = flk_cb;
908 flk_cb->cb_next = flk_cb;
909 }
910
911 /*
912 * Initialize the flk_edge_cache data structure and create the
913 * nlm_reg_status array.
914 */
915
916 void
917 flk_init(void)
918 {
919 uint_t i;
920
921 flk_edge_cache = kmem_cache_create("flk_edges",
922 sizeof (struct edge), 0, NULL, NULL, NULL, NULL, NULL, 0);
923 if (flk_edge_cache == NULL) {
924 cmn_err(CE_PANIC, "Couldn't create flk_edge_cache\n");
925 }
926 /*
927 * Create the NLM registry object.
928 */
929
930 if (cluster_bootflags & CLUSTER_BOOTED) {
931 /*
932 * This routine tells you the maximum node id that will be used
933  * in the cluster. This number plus 1 will be the size of the nlm
934  * registry status array. We add 1 because we will be using
935 * all entries indexed from 0 to maxnodeid; e.g., from 0
936 * to 64, for a total of 65 entries.
937 */
938 nlm_status_size = clconf_maximum_nodeid() + 1;
939 } else {
940 nlm_status_size = 0;
941 }
942
943 if (nlm_status_size != 0) { /* booted as a cluster */
944 nlm_reg_status = (flk_nlm_status_t *)
945 kmem_alloc(sizeof (flk_nlm_status_t) * nlm_status_size,
946 KM_SLEEP);
947
948 /* initialize all NLM states in array to NLM_UNKNOWN */
949 for (i = 0; i < nlm_status_size; i++) {
950 nlm_reg_status[i] = FLK_NLM_UNKNOWN;
951 }
952 }
953 }
954
955 /*
956 * Zone constructor/destructor callbacks to be executed when a zone is
957 * created/destroyed.
958 */
959 /* ARGSUSED */
960 void *
961 flk_zone_init(zoneid_t zoneid)
962 {
963 struct flock_globals *fg;
964 uint_t i;
965
966 fg = kmem_alloc(sizeof (*fg), KM_SLEEP);
967 fg->flk_lockmgr_status = FLK_LOCKMGR_UP;
968 for (i = 0; i < HASH_SIZE; i++)
969 fg->lockmgr_status[i] = FLK_LOCKMGR_UP;
970 return (fg);
971 }
972
973 /* ARGSUSED */
974 void
975 flk_zone_fini(zoneid_t zoneid, void *data)
976 {
977 struct flock_globals *fg = data;
978
979 kmem_free(fg, sizeof (*fg));
980 }
981
982 /*
983 * Get a lock_descriptor structure with initialization of edge lists.
984 */
985
986 static lock_descriptor_t *
987 flk_get_lock(void)
988 {
989 lock_descriptor_t *l;
990
991 l = kmem_zalloc(sizeof (lock_descriptor_t), KM_SLEEP);
992
993 cv_init(&l->l_cv, NULL, CV_DRIVER, NULL);
994 l->l_edge.edge_in_next = &l->l_edge;
995 l->l_edge.edge_in_prev = &l->l_edge;
996 l->l_edge.edge_adj_next = &l->l_edge;
997 l->l_edge.edge_adj_prev = &l->l_edge;
998 l->pvertex = -1;
999 l->l_status = FLK_INITIAL_STATE;
1000 flk_lock_allocs++;
1001 return (l);
1002 }
1003
1004 /*
1005  * Free a lock_descriptor structure. If some thread still has a
1006  * reference to it, as in reclock(), just set the DELETED_LOCK flag.
1007 */
1008
1009 void
1010 flk_free_lock(lock_descriptor_t *lock)
1011 {
1012 file_t *fp;
1013
1014 ASSERT(IS_DEAD(lock));
1015
1016 if ((fp = lock->l_ofd) != NULL && fp->f_filock == (struct filock *)lock)
1017 fp->f_filock = NULL;
1018
1019 if (IS_REFERENCED(lock)) {
1020 lock->l_state |= DELETED_LOCK;
1021 return;
1022 }
1023 flk_lock_frees++;
1024 kmem_free((void *)lock, sizeof (lock_descriptor_t));
1025 }
1026
1027 void
1028 flk_set_state(lock_descriptor_t *lock, int new_state)
1029 {
1030 /*
1031 * Locks in the sleeping list may be woken up in a number of ways,
1032 * and more than once. If a sleeping lock is signaled awake more
1033 * than once, then it may or may not change state depending on its
1034 * current state.
1035 * Also note that NLM locks that are sleeping could be moved to an
1036 * interrupted state more than once if the unlock request is
1037 * retransmitted by the NLM client - the second time around, this is
1038 * just a nop.
1039 * The ordering of being signaled awake is:
1040 * INTERRUPTED_STATE > CANCELLED_STATE > GRANTED_STATE.
1041 * The checks below implement this ordering.
1042 */
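	/*
	 * Illustrative example of that ordering: a sleeping lock that has
	 * already moved to FLK_INTERRUPTED_STATE and is later signaled with
	 * FLK_GRANTED_STATE stays interrupted; the checks below simply
	 * return without changing l_status.
	 */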
1043 if (IS_INTERRUPTED(lock)) {
1044 if ((new_state == FLK_CANCELLED_STATE) ||
1045 (new_state == FLK_GRANTED_STATE) ||
1046 (new_state == FLK_INTERRUPTED_STATE)) {
1047 return;
1048 }
1049 }
1050 if (IS_CANCELLED(lock)) {
1051 if ((new_state == FLK_GRANTED_STATE) ||
1052 (new_state == FLK_CANCELLED_STATE)) {
1053 return;
1054 }
1055 }
1056 CHECK_LOCK_TRANSITION(lock->l_status, new_state);
1057 if (IS_PXFS(lock)) {
1058 cl_flk_state_transition_notify(lock, lock->l_status, new_state);
1059 }
1060 lock->l_status = new_state;
1061 }
1062
1063 /*
1064 * Routine that checks whether there are any blocking locks in the system.
1065 *
1066  * The policy followed is that if a write lock is sleeping, we don't allow
1067  * read locks before this write lock, even though there may not be any
1068  * active locks corresponding to the read locks' region.
1069  *
1070  * The flk_add_edge() function adds an edge between l1 and l2 iff there
1071  * is no path between l1 and l2. This is done to have a "minimum
1072  * storage representation" of the dependency graph.
1073  *
1074  * Another property of the graph is that, since only the new request adds
1075  * edges to the existing locks in the graph, the graph is always
1076  * topologically ordered.
1077 */
1078
1079 static int
1080 flk_process_request(lock_descriptor_t *request)
1081 {
1082 graph_t *gp = request->l_graph;
1083 lock_descriptor_t *lock;
1084 int request_blocked_by_active = 0;
1085 int request_blocked_by_granted = 0;
1086 int request_blocked_by_sleeping = 0;
1087 vnode_t *vp = request->l_vnode;
1088 int error = 0;
1089 int request_will_wait = 0;
1090 int found_covering_lock = 0;
1091 lock_descriptor_t *covered_by = NULL;
1092
1093 ASSERT(MUTEX_HELD(&gp->gp_mutex));
1094 request_will_wait = IS_WILLING_TO_SLEEP(request);
1095
1096 /*
1097 * check active locks
1098 */
1099
1100 SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
1101
1102
1103 if (lock) {
1104 do {
1105 if (BLOCKS(lock, request)) {
1106 if (!request_will_wait)
1107 return (EAGAIN);
1108 request_blocked_by_active = 1;
1109 break;
1110 }
1111 /*
1112 * Grant lock if it is for the same owner holding active
1113 * lock that covers the request.
1114 */
1115
1116 if (SAME_OWNER(lock, request) &&
1117 COVERS(lock, request) &&
1118 (request->l_type == F_RDLCK))
1119 return (flk_execute_request(request));
1120 lock = lock->l_next;
1121 } while (lock->l_vnode == vp);
1122 }
1123
1124 if (!request_blocked_by_active) {
1125 lock_descriptor_t *lk[1];
1126 lock_descriptor_t *first_glock = NULL;
1127 /*
1128 * Shall we grant this?! NO!!
1129 * What about those locks that were just granted and still
1130 * in sleep queue. Those threads are woken up and so locks
1131 * are almost active.
1132 */
1133 SET_LOCK_TO_FIRST_SLEEP_VP(gp, lock, vp);
1134 if (lock) {
1135 do {
1136 if (BLOCKS(lock, request)) {
1137 if (IS_GRANTED(lock)) {
1138 request_blocked_by_granted = 1;
1139 } else {
1140 request_blocked_by_sleeping = 1;
1141 }
1142 }
1143
1144 lock = lock->l_next;
1145 } while ((lock->l_vnode == vp));
1146 first_glock = lock->l_prev;
1147 ASSERT(first_glock->l_vnode == vp);
1148 }
1149
1150 if (request_blocked_by_granted)
1151 goto block;
1152
1153 if (!request_blocked_by_sleeping) {
1154 /*
1155 * If the request isn't going to be blocked by a
1156 * sleeping request, we know that it isn't going to
1157 * be blocked; we can just execute the request --
1158 * without performing costly deadlock detection.
1159 */
1160 ASSERT(!request_blocked_by_active);
1161 return (flk_execute_request(request));
1162 } else if (request->l_type == F_RDLCK) {
1163 /*
1164 * If we have a sleeping writer in the requested
1165 * lock's range, block.
1166 */
1167 goto block;
1168 }
1169
1170 lk[0] = request;
1171 request->l_state |= RECOMPUTE_LOCK;
1172 SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
1173 if (lock) {
1174 do {
1175 flk_recompute_dependencies(lock, lk, 1, 0);
1176 lock = lock->l_next;
1177 } while (lock->l_vnode == vp);
1178 }
1179 lock = first_glock;
1180 if (lock) {
1181 do {
1182 if (IS_GRANTED(lock)) {
1183 flk_recompute_dependencies(lock, lk, 1, 0);
1184 }
1185 lock = lock->l_prev;
1186 } while ((lock->l_vnode == vp));
1187 }
1188 request->l_state &= ~RECOMPUTE_LOCK;
1189 if (!NO_DEPENDENTS(request) && flk_check_deadlock(request))
1190 return (EDEADLK);
1191 return (flk_execute_request(request));
1192 }
1193
1194 block:
1195 if (request_will_wait)
1196 flk_graph_uncolor(gp);
1197
1198 /* check sleeping locks */
1199
1200 SET_LOCK_TO_FIRST_SLEEP_VP(gp, lock, vp);
1201
1202 /*
1203  * If we find a sleeping write lock that is a superset of the
1204  * region wanted by request, we can be assured that by adding an
1205  * edge to this write lock we have paths to all locks in the
1206  * graph that block the request, except in one case, which is why
1207  * there is another check for SAME_OWNER in the loop below. The
1208  * exception is when the process that owns the sleeping write lock
1209  * 'l1' has other locks l2, l3, l4 in the system that arrived
1210  * before l1. l1 has no path to these locks as they are from the
1211  * same process. We break when we find a second covering sleeping
1212  * lock l5 owned by a process different from the one owning l1,
1213  * because none of l2, l3, l4, etc. can have arrived before l5; if
1214  * any had, l1 would already have produced a deadlock.
1215 */
1216
1217 if (lock) {
1218 do {
1219 if (BLOCKS(lock, request)) {
1220 if (!request_will_wait)
1221 return (EAGAIN);
1222 if (COVERS(lock, request) &&
1223 lock->l_type == F_WRLCK) {
1224 if (found_covering_lock &&
1225 !SAME_OWNER(lock, covered_by)) {
1226 found_covering_lock++;
1227 break;
1228 }
1229 found_covering_lock = 1;
1230 covered_by = lock;
1231 }
1232 if (found_covering_lock &&
1233 !SAME_OWNER(lock, covered_by)) {
1234 lock = lock->l_next;
1235 continue;
1236 }
1237 if ((error = flk_add_edge(request, lock,
1238 !found_covering_lock, 0)))
1239 return (error);
1240 }
1241 lock = lock->l_next;
1242 } while (lock->l_vnode == vp);
1243 }
1244
1245 /*
1246  * found_covering_lock == 2 iff at this point 'request' has paths
1247  * to all locks that block 'request'. found_covering_lock == 1 iff at this
1248  * point 'request' has paths to all locks that block 'request' whose owners
1249  * are not the same as the one that covers 'request' (covered_by above), and
1250  * the active list may contain locks whose owner is the same as covered_by.
1251 */
1252
1253 if (request_blocked_by_active && found_covering_lock != 2) {
1254 SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
1255 ASSERT(lock != NULL);
1256 do {
1257 if (BLOCKS(lock, request)) {
1258 if (found_covering_lock &&
1259 !SAME_OWNER(lock, covered_by)) {
1260 lock = lock->l_next;
1261 continue;
1262 }
1263 if ((error = flk_add_edge(request, lock,
1264 CHECK_CYCLE, 0)))
1265 return (error);
1266 }
1267 lock = lock->l_next;
1268 } while (lock->l_vnode == vp);
1269 }
1270
1271 if (NOT_BLOCKED(request)) {
1272 /*
1273 * request not dependent on any other locks
1274 * so execute this request
1275 */
1276 return (flk_execute_request(request));
1277 } else {
1278 /*
1279 * check for deadlock
1280 */
1281 if (flk_check_deadlock(request))
1282 return (EDEADLK);
1283 /*
1284 * this thread has to sleep
1285 */
1286 return (flk_wait_execute_request(request));
1287 }
1288 }
1289
1290 /*
1291 * The actual execution of the request in the simple case is only to
1292 * insert the 'request' in the list of active locks if it is not an
1293 * UNLOCK.
1294  * We have to consider the existing active locks' relation to
1295  * this 'request' if they are owned by the same process. flk_relation()
1296  * does this job and ensures that the dependency graph information is
1297  * maintained properly.
1298 */
1299
1300 int
1301 flk_execute_request(lock_descriptor_t *request)
1302 {
1303 graph_t *gp = request->l_graph;
1304 vnode_t *vp = request->l_vnode;
1305 lock_descriptor_t *lock, *lock1;
1306 int done_searching = 0;
1307
1308 CHECK_SLEEPING_LOCKS(gp);
1309 CHECK_ACTIVE_LOCKS(gp);
1310
1311 ASSERT(MUTEX_HELD(&gp->gp_mutex));
1312
1313 flk_set_state(request, FLK_START_STATE);
1314
1315 ASSERT(NOT_BLOCKED(request));
1316
1317 /* IO_LOCK requests are only to check status */
1318
1319 if (IS_IO_LOCK(request))
1320 return (0);
1321
1322 SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
1323
1324 if (lock == NULL && request->l_type == F_UNLCK)
1325 return (0);
1326 if (lock == NULL) {
1327 flk_insert_active_lock(request);
1328 return (0);
1329 }
1330
1331 do {
1332 lock1 = lock->l_next;
1333 if (SAME_OWNER(request, lock)) {
1334 done_searching = flk_relation(lock, request);
1335 }
1336 lock = lock1;
1337 } while (lock->l_vnode == vp && !done_searching);
1338
1339 /*
1340 * insert in active queue
1341 */
1342
1343 if (request->l_type != F_UNLCK)
1344 flk_insert_active_lock(request);
1345
1346 return (0);
1347 }
1348
1349 /*
1350  * 'request' is blocked by someone, therefore we put it into the sleep queue.
1351 */
1352 static int
1353 flk_wait_execute_request(lock_descriptor_t *request)
1354 {
1355 graph_t *gp = request->l_graph;
1356 callb_cpr_t *cprp; /* CPR info from callback */
1357 struct flock_globals *fg;
1358 int index;
1359
1360 ASSERT(MUTEX_HELD(&gp->gp_mutex));
1361 ASSERT(IS_WILLING_TO_SLEEP(request));
1362
1363 flk_insert_sleeping_lock(request);
1364
1365 index = 0; /* quiesce compiler warning. */
1366 fg = NULL;
1367 if (IS_LOCKMGR(request)) {
1368 index = HASH_INDEX(request->l_vnode);
1369 fg = flk_get_globals();
1370
1371 if (nlm_status_size == 0) { /* not booted as a cluster */
1372 if (fg->lockmgr_status[index] != FLK_LOCKMGR_UP) {
1373 flk_cancel_sleeping_lock(request, 1);
1374 return (ENOLCK);
1375 }
1376 } else { /* booted as a cluster */
1377 /*
1378 * If the request is an NLM server lock request,
1379 * and the NLM state of the lock request is not
1380 * NLM_UP (because the NLM server is shutting
1381 * down), then cancel the sleeping lock and
1382 * return error ENOLCK that will encourage the
1383 * client to retransmit.
1384 */
1385 if (!IS_NLM_UP(request)) {
1386 flk_cancel_sleeping_lock(request, 1);
1387 return (ENOLCK);
1388 }
1389 }
1390 }
1391
1392 /* Clustering: For blocking PXFS locks, return */
1393 if (IS_PXFS(request)) {
1394 /*
1395 * PXFS locks sleep on the client side.
1396 * The callback argument is used to wake up the sleeper
1397 * when the lock is granted.
1398 * We return -1 (rather than an errno value) to indicate
1399 * the client side should sleep
1400 */
1401 return (PXFS_LOCK_BLOCKED);
1402 }
1403
1404 if (request->l_callbacks != NULL) {
1405 /*
1406 * To make sure the shutdown code works correctly, either
1407 * the callback must happen after putting the lock on the
1408 * sleep list, or we must check the shutdown status after
1409 * returning from the callback (and before sleeping). At
1410 * least for now, we'll use the first option. If a
1411 * shutdown or signal or whatever happened while the graph
1412 * mutex was dropped, that will be detected by
1413 * wait_for_lock().
1414 */
1415 mutex_exit(&gp->gp_mutex);
1416
1417 cprp = flk_invoke_callbacks(request->l_callbacks,
1418 FLK_BEFORE_SLEEP);
1419
1420 mutex_enter(&gp->gp_mutex);
1421
1422 if (cprp == NULL) {
1423 wait_for_lock(request);
1424 } else {
1425 mutex_enter(cprp->cc_lockp);
1426 CALLB_CPR_SAFE_BEGIN(cprp);
1427 mutex_exit(cprp->cc_lockp);
1428 wait_for_lock(request);
1429 mutex_enter(cprp->cc_lockp);
1430 CALLB_CPR_SAFE_END(cprp, cprp->cc_lockp);
1431 mutex_exit(cprp->cc_lockp);
1432 }
1433
1434 mutex_exit(&gp->gp_mutex);
1435 (void) flk_invoke_callbacks(request->l_callbacks,
1436 FLK_AFTER_SLEEP);
1437 mutex_enter(&gp->gp_mutex);
1438 } else {
1439 wait_for_lock(request);
1440 }
1441
1442 if (IS_LOCKMGR(request)) {
1443 /*
1444 * If the lock manager is shutting down, return an
1445 * error that will encourage the client to retransmit.
1446 */
1447 if (fg->lockmgr_status[index] != FLK_LOCKMGR_UP &&
1448 !IS_GRANTED(request)) {
1449 flk_cancel_sleeping_lock(request, 1);
1450 return (ENOLCK);
1451 }
1452 }
1453
1454 if (IS_INTERRUPTED(request)) {
1455 /* we got a signal, or act like we did */
1456 flk_cancel_sleeping_lock(request, 1);
1457 return (EINTR);
1458 }
1459
1460 /* Cancelled if some other thread has closed the file */
1461
1462 if (IS_CANCELLED(request)) {
1463 flk_cancel_sleeping_lock(request, 1);
1464 return (EBADF);
1465 }
1466
1467 request->l_state &= ~GRANTED_LOCK;
1468 REMOVE_SLEEP_QUEUE(request);
1469 return (flk_execute_request(request));
1470 }
1471
1472 /*
1473  * This routine adds an edge between from and to because from depends
1474  * on to. If asked to check for deadlock it checks whether there are any
1475  * locks reachable from "from_lock" that are owned by the same process
1476  * as "from_lock".
1477 * NOTE: It is the caller's responsibility to make sure that the color
1478 * of the graph is consistent between the calls to flk_add_edge as done
1479 * in flk_process_request. This routine does not color and check for
1480 * deadlock explicitly.
1481 */
1482
1483 static int
1484 flk_add_edge(lock_descriptor_t *from_lock, lock_descriptor_t *to_lock,
1485 int check_cycle, int update_graph)
1486 {
1487 edge_t *edge;
1488 edge_t *ep;
1489 lock_descriptor_t *vertex;
1490 lock_descriptor_t *vertex_stack;
1491
1492 STACK_INIT(vertex_stack);
1493
1494 /*
1495  * if the to vertex is already colored, just return and don't add
1496  * an edge, since it is already reachable from the from vertex.
1498 */
1499
1500 if (COLORED(to_lock))
1501 return (0);
1502
1503 edge = flk_get_edge();
1504
1505 /*
1506 * set the from and to vertex
1507 */
1508
1509 edge->from_vertex = from_lock;
1510 edge->to_vertex = to_lock;
1511
1512 /*
1513 * put in adjacency list of from vertex
1514 */
1515
1516 from_lock->l_edge.edge_adj_next->edge_adj_prev = edge;
1517 edge->edge_adj_next = from_lock->l_edge.edge_adj_next;
1518 edge->edge_adj_prev = &from_lock->l_edge;
1519 from_lock->l_edge.edge_adj_next = edge;
1520
1521 /*
1522 * put in list of to vertex
1523 */
1524
1525 to_lock->l_edge.edge_in_next->edge_in_prev = edge;
1526 edge->edge_in_next = to_lock->l_edge.edge_in_next;
1527 to_lock->l_edge.edge_in_next = edge;
1528 edge->edge_in_prev = &to_lock->l_edge;
1529
1530
1531 if (update_graph) {
1532 flk_update_proc_graph(edge, 0);
1533 return (0);
1534 }
1535 if (!check_cycle) {
1536 return (0);
1537 }
1538
1539 STACK_PUSH(vertex_stack, from_lock, l_stack);
1540
1541 while ((vertex = STACK_TOP(vertex_stack)) != NULL) {
1542
1543 STACK_POP(vertex_stack, l_stack);
1544
1545 for (ep = FIRST_ADJ(vertex);
1546 ep != HEAD(vertex);
1547 ep = NEXT_ADJ(ep)) {
1548 if (COLORED(ep->to_vertex))
1549 continue;
1550 COLOR(ep->to_vertex);
1551 if (SAME_OWNER(ep->to_vertex, from_lock))
1552 goto dead_lock;
1553 STACK_PUSH(vertex_stack, ep->to_vertex, l_stack);
1554 }
1555 }
1556 return (0);
1557
1558 dead_lock:
1559
1560 /*
1561 * remove all edges
1562 */
1563
1564 ep = FIRST_ADJ(from_lock);
1565
1566 while (ep != HEAD(from_lock)) {
1567 IN_LIST_REMOVE(ep);
1568 from_lock->l_sedge = NEXT_ADJ(ep);
1569 ADJ_LIST_REMOVE(ep);
1570 flk_free_edge(ep);
1571 ep = from_lock->l_sedge;
1572 }
1573 return (EDEADLK);
1574 }
1575
1576 /*
1577 * Get an edge structure for representing the dependency between two locks.
1578 */
1579
1580 static edge_t *
1581 flk_get_edge()
1582 {
1583 edge_t *ep;
1584
1585 ASSERT(flk_edge_cache != NULL);
1586
1587 ep = kmem_cache_alloc(flk_edge_cache, KM_SLEEP);
1588 edge_allocs++;
1589 return (ep);
1590 }
1591
1592 /*
1593 * Free the edge structure.
1594 */
1595
1596 static void
1597 flk_free_edge(edge_t *ep)
1598 {
1599 edge_frees++;
1600 kmem_cache_free(flk_edge_cache, (void *)ep);
1601 }
1602
1603 /*
1604  * Check the relationship of request with lock and perform the
1605  * recomputation of dependencies, break the lock if required, and return
1606  * 1 if request cannot have any further relationship with the remaining
1607  * active locks.
1608  * The 'lock' and 'request' are compared, and in case of overlap we
1609  * delete 'lock' and form new locks to represent the non-overlapping
1610  * portion of the original 'lock'. This function has side effects:
1611  * 'lock' may be freed and new locks added to the active list.
1612 */
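
/*
 * A worked example with illustrative values: if the active 'lock' covers
 * the range [100, 200] and an unlock 'request' arrives for [120, 150],
 * 'lock' is deleted and two new active locks are created covering
 * [100, 119] and [151, 200]; the unlock request itself is not inserted
 * into the active list.
 */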
1613
1614 static int
1615 flk_relation(lock_descriptor_t *lock, lock_descriptor_t *request)
1616 {
1617 int lock_effect;
1618 lock_descriptor_t *lock1, *lock2;
1619 lock_descriptor_t *topology[3];
1620 int nvertex = 0;
1621 int i;
1622 edge_t *ep;
1623 graph_t *gp = (lock->l_graph);
1624
1625
1626 CHECK_SLEEPING_LOCKS(gp);
1627 CHECK_ACTIVE_LOCKS(gp);
1628
1629 ASSERT(MUTEX_HELD(&gp->gp_mutex));
1630
1631 topology[0] = topology[1] = topology[2] = NULL;
1632
1633 if (request->l_type == F_UNLCK)
1634 lock_effect = FLK_UNLOCK;
1635 else if (request->l_type == F_RDLCK &&
1636 lock->l_type == F_WRLCK)
1637 lock_effect = FLK_DOWNGRADE;
1638 else if (request->l_type == F_WRLCK &&
1639 lock->l_type == F_RDLCK)
1640 lock_effect = FLK_UPGRADE;
1641 else
1642 lock_effect = FLK_STAY_SAME;
1643
1644 if (lock->l_end < request->l_start) {
1645 if (lock->l_end == request->l_start - 1 &&
1646 lock_effect == FLK_STAY_SAME) {
1647 topology[0] = request;
1648 request->l_start = lock->l_start;
1649 nvertex = 1;
1650 goto recompute;
1651 } else {
1652 return (0);
1653 }
1654 }
1655
1656 if (lock->l_start > request->l_end) {
1657 if (request->l_end == lock->l_start - 1 &&
1658 lock_effect == FLK_STAY_SAME) {
1659 topology[0] = request;
1660 request->l_end = lock->l_end;
1661 nvertex = 1;
1662 goto recompute;
1663 } else {
1664 return (1);
1665 }
1666 }
1667
1668 if (request->l_end < lock->l_end) {
1669 if (request->l_start > lock->l_start) {
1670 if (lock_effect == FLK_STAY_SAME) {
1671 request->l_start = lock->l_start;
1672 request->l_end = lock->l_end;
1673 topology[0] = request;
1674 nvertex = 1;
1675 } else {
1676 lock1 = flk_get_lock();
1677 lock2 = flk_get_lock();
1678 COPY(lock1, lock);
1679 COPY(lock2, lock);
1680 lock1->l_start = lock->l_start;
1681 lock1->l_end = request->l_start - 1;
1682 lock2->l_start = request->l_end + 1;
1683 lock2->l_end = lock->l_end;
1684 topology[0] = lock1;
1685 topology[1] = lock2;
1686 topology[2] = request;
1687 nvertex = 3;
1688 }
1689 } else if (request->l_start < lock->l_start) {
1690 if (lock_effect == FLK_STAY_SAME) {
1691 request->l_end = lock->l_end;
1692 topology[0] = request;
1693 nvertex = 1;
1694 } else {
1695 lock1 = flk_get_lock();
1696 COPY(lock1, lock);
1697 lock1->l_start = request->l_end + 1;
1698 topology[0] = lock1;
1699 topology[1] = request;
1700 nvertex = 2;
1701 }
1702 } else {
1703 if (lock_effect == FLK_STAY_SAME) {
1704 request->l_start = lock->l_start;
1705 request->l_end = lock->l_end;
1706 topology[0] = request;
1707 nvertex = 1;
1708 } else {
1709 lock1 = flk_get_lock();
1710 COPY(lock1, lock);
1711 lock1->l_start = request->l_end + 1;
1712 topology[0] = lock1;
1713 topology[1] = request;
1714 nvertex = 2;
1715 }
1716 }
1717 } else if (request->l_end > lock->l_end) {
1718 if (request->l_start > lock->l_start) {
1719 if (lock_effect == FLK_STAY_SAME) {
1720 request->l_start = lock->l_start;
1721 topology[0] = request;
1722 nvertex = 1;
1723 } else {
1724 lock1 = flk_get_lock();
1725 COPY(lock1, lock);
1726 lock1->l_end = request->l_start - 1;
1727 topology[0] = lock1;
1728 topology[1] = request;
1729 nvertex = 2;
1730 }
1731 } else if (request->l_start < lock->l_start) {
1732 topology[0] = request;
1733 nvertex = 1;
1734 } else {
1735 topology[0] = request;
1736 nvertex = 1;
1737 }
1738 } else {
1739 if (request->l_start > lock->l_start) {
1740 if (lock_effect == FLK_STAY_SAME) {
1741 request->l_start = lock->l_start;
1742 topology[0] = request;
1743 nvertex = 1;
1744 } else {
1745 lock1 = flk_get_lock();
1746 COPY(lock1, lock);
1747 lock1->l_end = request->l_start - 1;
1748 topology[0] = lock1;
1749 topology[1] = request;
1750 nvertex = 2;
1751 }
1752 } else if (request->l_start < lock->l_start) {
1753 topology[0] = request;
1754 nvertex = 1;
1755 } else {
1756 if (lock_effect != FLK_UNLOCK) {
1757 topology[0] = request;
1758 nvertex = 1;
1759 } else {
1760 flk_delete_active_lock(lock, 0);
1761 flk_wakeup(lock, 1);
1762 flk_free_lock(lock);
1763 CHECK_SLEEPING_LOCKS(gp);
1764 CHECK_ACTIVE_LOCKS(gp);
1765 return (1);
1766 }
1767 }
1768 }
1769
1770 recompute:
1771
1772 /*
1773  * For unlock we don't send the 'request' for recomputing
1774  * dependencies because no lock will add an edge to it.
1775 */
1776
1777 if (lock_effect == FLK_UNLOCK) {
1778 topology[nvertex-1] = NULL;
1779 nvertex--;
1780 }
1781 for (i = 0; i < nvertex; i++) {
1782 topology[i]->l_state |= RECOMPUTE_LOCK;
1783 topology[i]->l_color = NO_COLOR;
1784 }
1785
1786 ASSERT(FIRST_ADJ(lock) == HEAD(lock));
1787
1788 /*
1789  * we remove the adjacency list entries of all edges coming in
1790  * to this vertex 'lock'.
1791 */
1792
1793 ep = FIRST_IN(lock);
1794 while (ep != HEAD(lock)) {
1795 ADJ_LIST_REMOVE(ep);
1796 ep = NEXT_IN(ep);
1797 }
1798
1799 flk_delete_active_lock(lock, 0);
1800
1801 /* We are ready for recomputing the dependencies now */
1802
1803 flk_recompute_dependencies(lock, topology, nvertex, 1);
1804
1805 for (i = 0; i < nvertex; i++) {
1806 topology[i]->l_state &= ~RECOMPUTE_LOCK;
1807 topology[i]->l_color = NO_COLOR;
1808 }
1809
1810
1811 if (lock_effect == FLK_UNLOCK) {
1812 nvertex++;
1813 }
1814 for (i = 0; i < nvertex - 1; i++) {
1815 flk_insert_active_lock(topology[i]);
1816 }
1817
1818
1819 if (lock_effect == FLK_DOWNGRADE || lock_effect == FLK_UNLOCK) {
1820 flk_wakeup(lock, 0);
1821 } else {
1822 ep = FIRST_IN(lock);
1823 while (ep != HEAD(lock)) {
1824 lock->l_sedge = NEXT_IN(ep);
1825 IN_LIST_REMOVE(ep);
1826 flk_update_proc_graph(ep, 1);
1827 flk_free_edge(ep);
1828 ep = lock->l_sedge;
1829 }
1830 }
1831 flk_free_lock(lock);
1832
1833 CHECK_SLEEPING_LOCKS(gp);
1834 CHECK_ACTIVE_LOCKS(gp);
1835 return (0);
1836 }
1837
1838 /*
1839 * Insert a lock into the active queue.
1840 */
1841
1842 static void
1843 flk_insert_active_lock(lock_descriptor_t *new_lock)
1844 {
1845 graph_t *gp = new_lock->l_graph;
1846 vnode_t *vp = new_lock->l_vnode;
1847 lock_descriptor_t *first_lock, *lock;
1848
1849 ASSERT(MUTEX_HELD(&gp->gp_mutex));
1850
1851 SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
1852 first_lock = lock;
1853
1854 if (first_lock != NULL) {
1855 for (; (lock->l_vnode == vp &&
1856 lock->l_start < new_lock->l_start); lock = lock->l_next)
1857 ;
1858 } else {
1859 lock = ACTIVE_HEAD(gp);
1860 }
1861
1862 lock->l_prev->l_next = new_lock;
1863 new_lock->l_next = lock;
1864 new_lock->l_prev = lock->l_prev;
1865 lock->l_prev = new_lock;
1866
1867 if (first_lock == NULL || (new_lock->l_start <= first_lock->l_start)) {
1868 vp->v_filocks = (struct filock *)new_lock;
1869 }
1870 flk_set_state(new_lock, FLK_ACTIVE_STATE);
1871 new_lock->l_state |= ACTIVE_LOCK;
1872
1873 CHECK_ACTIVE_LOCKS(gp);
1874 CHECK_SLEEPING_LOCKS(gp);
1875 }
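
/*
 * Illustrative sketch (not compiled into this file; guarded by #if 0):
 * the sorted-insert-before-a-sentinel pattern used by
 * flk_insert_active_lock() above, shown on a toy circular doubly linked
 * list.  The toy_node name and the integer 'key' ordering are
 * hypothetical stand-ins for lock descriptors ordered by l_start.
 */
#if 0
struct toy_node {
	struct toy_node *next;
	struct toy_node *prev;
	int key;
};

/* 'head' is a sentinel; the list is kept sorted by ascending key. */
static void
toy_sorted_insert(struct toy_node *head, struct toy_node *new)
{
	struct toy_node *n;

	/* find the first node whose key is >= new->key (or the sentinel) */
	for (n = head->next; n != head && n->key < new->key; n = n->next)
		;

	/* link 'new' in just before 'n' */
	new->next = n;
	new->prev = n->prev;
	n->prev->next = new;
	n->prev = new;
}
#endif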
1876
1877 /*
1878  * Delete an active lock.  Depending on the value of the second
1879  * parameter, this either only removes the lock from the active list,
1880  * or removes it and frees it as well.
1881 */
1882
1883 static void
1884 flk_delete_active_lock(lock_descriptor_t *lock, int free_lock)
1885 {
1886 vnode_t *vp = lock->l_vnode;
1887 graph_t *gp = lock->l_graph;
1888
1889 ASSERT(MUTEX_HELD(&gp->gp_mutex));
1890 if (free_lock)
1891 ASSERT(NO_DEPENDENTS(lock));
1892 ASSERT(NOT_BLOCKED(lock));
1893 ASSERT(IS_ACTIVE(lock));
1894
1895 ASSERT((vp->v_filocks != NULL));
1896
1897 if (vp->v_filocks == (struct filock *)lock) {
1898 vp->v_filocks = (struct filock *)
1899 ((lock->l_next->l_vnode == vp) ? lock->l_next :
1900 NULL);
1901 }
1902 lock->l_next->l_prev = lock->l_prev;
1903 lock->l_prev->l_next = lock->l_next;
1904 lock->l_next = lock->l_prev = NULL;
1905 flk_set_state(lock, FLK_DEAD_STATE);
1906 lock->l_state &= ~ACTIVE_LOCK;
1907
1908 if (free_lock)
1909 flk_free_lock(lock);
1910 CHECK_ACTIVE_LOCKS(gp);
1911 CHECK_SLEEPING_LOCKS(gp);
1912 }
1913
1914 /*
1915 * Insert into the sleep queue.
1916 */
1917
1918 static void
1919 flk_insert_sleeping_lock(lock_descriptor_t *request)
1920 {
1921 graph_t *gp = request->l_graph;
1922 vnode_t *vp = request->l_vnode;
1923 lock_descriptor_t *lock;
1924
1925 ASSERT(MUTEX_HELD(&gp->gp_mutex));
1926 ASSERT(IS_INITIAL(request));
1927
1928 for (lock = gp->sleeping_locks.l_next; (lock != &gp->sleeping_locks &&
1929 lock->l_vnode < vp); lock = lock->l_next)
1930 ;
1931
1932 lock->l_prev->l_next = request;
1933 request->l_prev = lock->l_prev;
1934 lock->l_prev = request;
1935 request->l_next = lock;
1936 flk_set_state(request, FLK_SLEEPING_STATE);
1937 request->l_state |= SLEEPING_LOCK;
1938 }
1939
1940 /*
1941 * Cancelling a sleeping lock implies removing a vertex from the
1942 * dependency graph and therefore we should recompute the dependencies
1943 * of all vertices that have a path to this vertex, w.r.t. all
1944 * vertices reachable from this vertex.
1945 */
1946
1947 void
1948 flk_cancel_sleeping_lock(lock_descriptor_t *request, int remove_from_queue)
1949 {
1950 graph_t *gp = request->l_graph;
1951 vnode_t *vp = request->l_vnode;
1952 lock_descriptor_t **topology = NULL;
1953 edge_t *ep;
1954 lock_descriptor_t *vertex, *lock;
1955 int nvertex = 0;
1956 int i;
1957 lock_descriptor_t *vertex_stack;
1958
1959 STACK_INIT(vertex_stack);
1960
1961 ASSERT(MUTEX_HELD(&gp->gp_mutex));
1962 /*
1963 	 * Count the number of vertex pointers that have to be allocated,
1964 	 * i.e., all vertices that are reachable from 'request'.
1965 */
1966
1967 STACK_PUSH(vertex_stack, request, l_stack);
1968
1969 while ((vertex = STACK_TOP(vertex_stack)) != NULL) {
1970 STACK_POP(vertex_stack, l_stack);
1971 for (ep = FIRST_ADJ(vertex); ep != HEAD(vertex);
1972 ep = NEXT_ADJ(ep)) {
1973 if (IS_RECOMPUTE(ep->to_vertex))
1974 continue;
1975 ep->to_vertex->l_state |= RECOMPUTE_LOCK;
1976 STACK_PUSH(vertex_stack, ep->to_vertex, l_stack);
1977 nvertex++;
1978 }
1979 }
1980
1981 /*
1982 * allocate memory for holding the vertex pointers
1983 */
1984
1985 if (nvertex) {
1986 topology = kmem_zalloc(nvertex * sizeof (lock_descriptor_t *),
1987 KM_SLEEP);
1988 }
1989
1990 /*
1991 * one more pass to actually store the vertices in the
1992 * allocated array.
1993 * We first check sleeping locks and then active locks
1994 	 * so that the topology array will be in topological
1995 	 * order.
1996 */
1997
1998 nvertex = 0;
1999 SET_LOCK_TO_FIRST_SLEEP_VP(gp, lock, vp);
2000
2001 if (lock) {
2002 do {
2003 if (IS_RECOMPUTE(lock)) {
2004 lock->l_index = nvertex;
2005 topology[nvertex++] = lock;
2006 }
2007 lock->l_color = NO_COLOR;
2008 lock = lock->l_next;
2009 } while (lock->l_vnode == vp);
2010 }
2011
2012 SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
2013
2014 if (lock) {
2015 do {
2016 if (IS_RECOMPUTE(lock)) {
2017 lock->l_index = nvertex;
2018 topology[nvertex++] = lock;
2019 }
2020 lock->l_color = NO_COLOR;
2021 lock = lock->l_next;
2022 } while (lock->l_vnode == vp);
2023 }
2024
2025 /*
2026 	 * Remove the in and out edges of 'request'.
2027 	 * They are freed after updating the proc graph below.
2028 */
2029
2030 for (ep = FIRST_IN(request); ep != HEAD(request); ep = NEXT_IN(ep)) {
2031 ADJ_LIST_REMOVE(ep);
2032 }
2033
2034
2035 if (remove_from_queue)
2036 REMOVE_SLEEP_QUEUE(request);
2037
2038 /* we are ready to recompute */
2039
2040 flk_recompute_dependencies(request, topology, nvertex, 1);
2041
2042 ep = FIRST_ADJ(request);
2043 while (ep != HEAD(request)) {
2044 IN_LIST_REMOVE(ep);
2045 request->l_sedge = NEXT_ADJ(ep);
2046 ADJ_LIST_REMOVE(ep);
2047 flk_update_proc_graph(ep, 1);
2048 flk_free_edge(ep);
2049 ep = request->l_sedge;
2050 }
2051
2052
2053 /*
2054 * unset the RECOMPUTE flag in those vertices
2055 */
2056
2057 for (i = 0; i < nvertex; i++) {
2058 topology[i]->l_state &= ~RECOMPUTE_LOCK;
2059 }
2060
2061 /*
2062 * free the topology
2063 */
2064 if (nvertex)
2065 kmem_free((void *)topology,
2066 (nvertex * sizeof (lock_descriptor_t *)));
2067 /*
2068 	 * Some locks may become unblocked now.
2069 */
2070
2071 flk_wakeup(request, 0);
2072
2073 /*
2074 * we expect to have a correctly recomputed graph now.
2075 */
2076 flk_set_state(request, FLK_DEAD_STATE);
2077 flk_free_lock(request);
2078 CHECK_SLEEPING_LOCKS(gp);
2079 CHECK_ACTIVE_LOCKS(gp);
2080
2081 }
2082
2083 /*
2084  * Uncoloring the graph simply increments the graph's mark value; only
2085  * when the mark wraps around do we explicitly reset the color of every
2086  * vertex in the graph.
2087 */
2088
2089 static void
2090 flk_graph_uncolor(graph_t *gp)
2091 {
2092 lock_descriptor_t *lock;
2093
2094 if (gp->mark == UINT_MAX) {
2095 gp->mark = 1;
2096 for (lock = ACTIVE_HEAD(gp)->l_next; lock != ACTIVE_HEAD(gp);
2097 lock = lock->l_next)
2098 lock->l_color = 0;
2099
2100 for (lock = SLEEPING_HEAD(gp)->l_next;
2101 lock != SLEEPING_HEAD(gp); lock = lock->l_next)
2102 lock->l_color = 0;
2103 } else {
2104 gp->mark++;
2105 }
2106 }
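
/*
 * Illustrative sketch (not compiled into this file; guarded by #if 0):
 * the "uncolor by bumping a generation counter" trick used by
 * flk_graph_uncolor() above, on a toy vertex type.  A vertex counts as
 * colored iff its color equals the graph's mark, so uncoloring the whole
 * graph is O(1) except when the mark wraps.  All toy_* names are
 * hypothetical.
 */
#if 0
#include <limits.h>

struct toy_vertex { unsigned color; };
struct toy_graph { unsigned mark; struct toy_vertex *v; int nv; };

#define TOY_COLORED(g, x)	((x)->color == (g)->mark)
#define TOY_COLOR(g, x)		((x)->color = (g)->mark)

static void
toy_graph_uncolor(struct toy_graph *g)
{
	int i;

	if (g->mark == UINT_MAX) {
		/* wrap around: explicitly reset every vertex */
		g->mark = 1;
		for (i = 0; i < g->nv; i++)
			g->v[i].color = 0;
	} else {
		g->mark++;	/* all vertices become uncolored at once */
	}
}
#endif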
2107
2108 /*
2109 * Wake up locks that are blocked on the given lock.
2110 */
2111
2112 static void
2113 flk_wakeup(lock_descriptor_t *lock, int adj_list_remove)
2114 {
2115 edge_t *ep;
2116 graph_t *gp = lock->l_graph;
2117 lock_descriptor_t *lck;
2118
2119 ASSERT(MUTEX_HELD(&gp->gp_mutex));
2120 if (NO_DEPENDENTS(lock))
2121 return;
2122 ep = FIRST_IN(lock);
2123 do {
2124 /*
2125 		 * Delete the edge from the adjacency list of the
2126 		 * from-vertex.  If that vertex has no more adjacent
2127 		 * edges, wake up its process.
2128 */
2129 lck = ep->from_vertex;
2130 if (adj_list_remove)
2131 ADJ_LIST_REMOVE(ep);
2132 flk_update_proc_graph(ep, 1);
2133 if (NOT_BLOCKED(lck)) {
2134 GRANT_WAKEUP(lck);
2135 }
2136 lock->l_sedge = NEXT_IN(ep);
2137 IN_LIST_REMOVE(ep);
2138 flk_free_edge(ep);
2139 ep = lock->l_sedge;
2140 } while (ep != HEAD(lock));
2141 ASSERT(NO_DEPENDENTS(lock));
2142 }
2143
2144 /*
2145  * Each dependent of 'request' is checked for dependency against the
2146  * locks in 'topology' (so called because the array is, and should be, in
2147  * topological order for this algorithm; if it is not, the inner loop
2148  * below might add more edges than necessary.  Topological ordering of
2149  * vertices satisfies the property that all edges go from left to
2150  * right, i.e., topology[i] can have an edge to topology[j] iff i < j).
2151  * If lock l1 in the dependent set of 'request' is dependent on (blocked
2152  * by) lock l2 in topology but does not already have a path to it, we add
2153  * an edge in the inner loop below.
2154  *
2155  * We don't want to add an edge between l1 and l2 if there already
2156  * exists a path from l1 to l2, so care has to be taken for those
2157  * vertices that have two paths to 'request'.  These vertices are
2158  * referred to here as barrier locks.
2159  *
2160  * The barriers (those vertices that originally had two paths to
2161  * 'request') have to be found, because otherwise we may end up adding
2162  * unnecessary edges to vertices in topology; a barrier vertex could
2163  * otherwise get both an edge to a vertex in topology and a path to it.
2164 */
2165
2166 static void
2167 flk_recompute_dependencies(lock_descriptor_t *request,
2168 lock_descriptor_t **topology, int nvertex, int update_graph)
2169 {
2170 lock_descriptor_t *vertex, *lock;
2171 graph_t *gp = request->l_graph;
2172 int i, count;
2173 int barrier_found = 0;
2174 edge_t *ep;
2175 lock_descriptor_t *vertex_stack;
2176
2177 STACK_INIT(vertex_stack);
2178
2179 ASSERT(MUTEX_HELD(&gp->gp_mutex));
2180 if (nvertex == 0)
2181 return;
2182 flk_graph_uncolor(request->l_graph);
2183 barrier_found = flk_find_barriers(request);
2184 request->l_state |= RECOMPUTE_DONE;
2185
2186 STACK_PUSH(vertex_stack, request, l_stack);
2187 request->l_sedge = FIRST_IN(request);
2188
2189
2190 while ((vertex = STACK_TOP(vertex_stack)) != NULL) {
2191 if (vertex->l_state & RECOMPUTE_DONE) {
2192 count = 0;
2193 goto next_in_edge;
2194 }
2195 if (IS_BARRIER(vertex)) {
2196 /* decrement the barrier count */
2197 if (vertex->l_index) {
2198 vertex->l_index--;
2199 				/* this vertex will be pushed again anyway */
2200 STACK_POP(vertex_stack, l_stack);
2201 if (vertex->l_index == 0) {
2202 /*
2203 * barrier is over we can recompute
2204 * dependencies for this lock in the
2205 * next stack pop
2206 */
2207 vertex->l_state &= ~BARRIER_LOCK;
2208 }
2209 continue;
2210 }
2211 }
2212 vertex->l_state |= RECOMPUTE_DONE;
2213 flk_graph_uncolor(gp);
2214 count = flk_color_reachables(vertex);
2215 for (i = 0; i < nvertex; i++) {
2216 lock = topology[i];
2217 if (COLORED(lock))
2218 continue;
2219 if (BLOCKS(lock, vertex)) {
2220 (void) flk_add_edge(vertex, lock,
2221 NO_CHECK_CYCLE, update_graph);
2222 COLOR(lock);
2223 count++;
2224 count += flk_color_reachables(lock);
2225 }
2226
2227 }
2228
2229 next_in_edge:
2230 if (count == nvertex ||
2231 vertex->l_sedge == HEAD(vertex)) {
2232 /* prune the tree below this */
2233 STACK_POP(vertex_stack, l_stack);
2234 vertex->l_state &= ~RECOMPUTE_DONE;
2235 /* update the barrier locks below this! */
2236 if (vertex->l_sedge != HEAD(vertex) && barrier_found) {
2237 flk_graph_uncolor(gp);
2238 flk_update_barriers(vertex);
2239 }
2240 continue;
2241 }
2242
2243 ep = vertex->l_sedge;
2244 lock = ep->from_vertex;
2245 STACK_PUSH(vertex_stack, lock, l_stack);
2246 lock->l_sedge = FIRST_IN(lock);
2247 vertex->l_sedge = NEXT_IN(ep);
2248 }
2249
2250 }
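
/*
 * Illustrative sketch (not compiled; guarded by #if 0): the invariant
 * flk_recompute_dependencies() relies on -- when the 'topology' array is
 * in topological order, every edge goes from a lower index to a higher
 * one.  The toy_edge representation (from/to indices into the array) is
 * hypothetical and only demonstrates the i < j property.
 */
#if 0
struct toy_edge { int from; int to; };	/* indices into topology[] */

/* Return 1 if every edge goes left to right (from < to), else 0. */
static int
toy_is_topological(const struct toy_edge *edges, int nedges)
{
	int i;

	for (i = 0; i < nedges; i++) {
		if (edges[i].from >= edges[i].to)
			return (0);
	}
	return (1);
}
#endif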
2251
2252 /*
2253  * Color all vertices reachable from 'vertex' that belong to the topology
2254  * (those that have RECOMPUTE_LOCK set in their state) and are not yet colored.
2255 *
2256 * Note: we need to use a different stack_link l_stack1 because this is
2257 * called from flk_recompute_dependencies() that already uses a stack with
2258 * l_stack as stack_link.
2259 */
2260
2261 static int
2262 flk_color_reachables(lock_descriptor_t *vertex)
2263 {
2264 lock_descriptor_t *ver, *lock;
2265 int count;
2266 edge_t *ep;
2267 lock_descriptor_t *vertex_stack;
2268
2269 STACK_INIT(vertex_stack);
2270
2271 STACK_PUSH(vertex_stack, vertex, l_stack1);
2272 count = 0;
2273 while ((ver = STACK_TOP(vertex_stack)) != NULL) {
2274
2275 STACK_POP(vertex_stack, l_stack1);
2276 for (ep = FIRST_ADJ(ver); ep != HEAD(ver);
2277 ep = NEXT_ADJ(ep)) {
2278 lock = ep->to_vertex;
2279 if (COLORED(lock))
2280 continue;
2281 COLOR(lock);
2282 if (IS_RECOMPUTE(lock))
2283 count++;
2284 STACK_PUSH(vertex_stack, lock, l_stack1);
2285 }
2286
2287 }
2288 return (count);
2289 }
2290
2291 /*
2292  * Called from flk_recompute_dependencies(), this routine decrements
2293  * the barrier count of barrier vertices that are reachable from 'lock'.
2294 */
2295
2296 static void
2297 flk_update_barriers(lock_descriptor_t *lock)
2298 {
2299 lock_descriptor_t *vertex, *lck;
2300 edge_t *ep;
2301 lock_descriptor_t *vertex_stack;
2302
2303 STACK_INIT(vertex_stack);
2304
2305 STACK_PUSH(vertex_stack, lock, l_stack1);
2306
2307 while ((vertex = STACK_TOP(vertex_stack)) != NULL) {
2308 STACK_POP(vertex_stack, l_stack1);
2309 for (ep = FIRST_IN(vertex); ep != HEAD(vertex);
2310 ep = NEXT_IN(ep)) {
2311 lck = ep->from_vertex;
2312 if (COLORED(lck)) {
2313 if (IS_BARRIER(lck)) {
2314 ASSERT(lck->l_index > 0);
2315 lck->l_index--;
2316 if (lck->l_index == 0)
2317 lck->l_state &= ~BARRIER_LOCK;
2318 }
2319 continue;
2320 }
2321 COLOR(lck);
2322 if (IS_BARRIER(lck)) {
2323 ASSERT(lck->l_index > 0);
2324 lck->l_index--;
2325 if (lck->l_index == 0)
2326 lck->l_state &= ~BARRIER_LOCK;
2327 }
2328 STACK_PUSH(vertex_stack, lck, l_stack1);
2329 }
2330 }
2331 }
2332
2333 /*
2334  * Find all vertices that are reachable from 'lock' more than once, mark
2335  * them as barrier vertices, and increment their barrier count.
2336  * The barrier count is the total number of paths from 'lock' to that
2337  * vertex, minus one.
2338 */
2339
2340 static int
2341 flk_find_barriers(lock_descriptor_t *lock)
2342 {
2343 lock_descriptor_t *vertex, *lck;
2344 int found = 0;
2345 edge_t *ep;
2346 lock_descriptor_t *vertex_stack;
2347
2348 STACK_INIT(vertex_stack);
2349
2350 STACK_PUSH(vertex_stack, lock, l_stack1);
2351
2352 while ((vertex = STACK_TOP(vertex_stack)) != NULL) {
2353 STACK_POP(vertex_stack, l_stack1);
2354 for (ep = FIRST_IN(vertex); ep != HEAD(vertex);
2355 ep = NEXT_IN(ep)) {
2356 lck = ep->from_vertex;
2357 if (COLORED(lck)) {
2358 /* this is a barrier */
2359 lck->l_state |= BARRIER_LOCK;
2360 /* index will have barrier count */
2361 lck->l_index++;
2362 if (!found)
2363 found = 1;
2364 continue;
2365 }
2366 COLOR(lck);
2367 lck->l_index = 0;
2368 STACK_PUSH(vertex_stack, lck, l_stack1);
2369 }
2370 }
2371 return (found);
2372 }
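
/*
 * Illustrative sketch (not compiled; guarded by #if 0): the "barrier
 * count" used by flk_find_barriers() above is the number of distinct
 * paths from the source to a vertex, minus one.  The toy DAG below uses
 * an adjacency matrix and a recursive path count; all toy_* names are
 * hypothetical, and the recursion is exponential in the worst case (it
 * is only meant to make the definition concrete).
 */
#if 0
#define TOY_NV	8

/* adj[u][v] != 0 means there is an edge u -> v in a DAG */
static int
toy_count_paths(const int adj[TOY_NV][TOY_NV], int src, int dst)
{
	int v, paths = 0;

	if (src == dst)
		return (1);
	for (v = 0; v < TOY_NV; v++) {
		if (adj[src][v])
			paths += toy_count_paths(adj, v, dst);
	}
	return (paths);
}

/* barrier count of 'dst' w.r.t. 'src': paths - 1 (0 means not a barrier) */
static int
toy_barrier_count(const int adj[TOY_NV][TOY_NV], int src, int dst)
{
	int paths = toy_count_paths(adj, src, dst);

	return (paths > 1 ? paths - 1 : 0);
}
#endif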
2373
2374 /*
2375 * Finds the first lock that is mainly responsible for blocking this
2376 * request. If there is no such lock, request->l_flock.l_type is set to
2377 * F_UNLCK. Otherwise, request->l_flock is filled in with the particulars
2378 * of the blocking lock.
2379 *
2380 * Note: It is possible a request is blocked by a sleeping lock because
2381 * of the fairness policy used in flk_process_request() to construct the
2382 * dependencies. (see comments before flk_process_request()).
2383 */
2384
2385 static void
2386 flk_get_first_blocking_lock(lock_descriptor_t *request)
2387 {
2388 graph_t *gp = request->l_graph;
2389 vnode_t *vp = request->l_vnode;
2390 lock_descriptor_t *lock, *blocker;
2391
2392 ASSERT(MUTEX_HELD(&gp->gp_mutex));
2393 blocker = NULL;
2394 SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
2395
2396 if (lock) {
2397 do {
2398 if (BLOCKS(lock, request)) {
2399 blocker = lock;
2400 break;
2401 }
2402 lock = lock->l_next;
2403 } while (lock->l_vnode == vp);
2404 }
2405
2406 if (blocker == NULL && request->l_flock.l_type == F_RDLCK) {
2407 /*
2408 * No active lock is blocking this request, but if a read
2409 * lock is requested, it may also get blocked by a waiting
2410 * writer. So search all sleeping locks and see if there is
2411 * a writer waiting.
2412 */
2413 SET_LOCK_TO_FIRST_SLEEP_VP(gp, lock, vp);
2414 if (lock) {
2415 do {
2416 if (BLOCKS(lock, request)) {
2417 blocker = lock;
2418 break;
2419 }
2420 lock = lock->l_next;
2421 } while (lock->l_vnode == vp);
2422 }
2423 }
2424
2425 if (blocker) {
2426 report_blocker(blocker, request);
2427 } else
2428 request->l_flock.l_type = F_UNLCK;
2429 }
2430
2431 /*
2432 * Get the graph_t structure associated with a vnode.
2433 * If 'initialize' is non-zero, and the graph_t structure for this vnode has
2434 * not yet been initialized, then a new element is allocated and returned.
2435 */
2436 graph_t *
2437 flk_get_lock_graph(vnode_t *vp, int initialize)
2438 {
2439 graph_t *gp;
2440 graph_t *gp_alloc = NULL;
2441 int index = HASH_INDEX(vp);
2442
2443 if (initialize == FLK_USE_GRAPH) {
2444 mutex_enter(&flock_lock);
2445 gp = lock_graph[index];
2446 mutex_exit(&flock_lock);
2447 return (gp);
2448 }
2449
2450 ASSERT(initialize == FLK_INIT_GRAPH);
2451
2452 if (lock_graph[index] == NULL) {
2453
2454 gp_alloc = kmem_zalloc(sizeof (graph_t), KM_SLEEP);
2455
2456 /* Initialize the graph */
2457
2458 gp_alloc->active_locks.l_next =
2459 gp_alloc->active_locks.l_prev =
2460 (lock_descriptor_t *)ACTIVE_HEAD(gp_alloc);
2461 gp_alloc->sleeping_locks.l_next =
2462 gp_alloc->sleeping_locks.l_prev =
2463 (lock_descriptor_t *)SLEEPING_HEAD(gp_alloc);
2464 gp_alloc->index = index;
2465 mutex_init(&gp_alloc->gp_mutex, NULL, MUTEX_DEFAULT, NULL);
2466 }
2467
2468 mutex_enter(&flock_lock);
2469
2470 gp = lock_graph[index];
2471
2472 /* Recheck the value within flock_lock */
2473 if (gp == NULL) {
2474 struct flock_globals *fg;
2475
2476 /* We must have previously allocated the graph_t structure */
2477 ASSERT(gp_alloc != NULL);
2478 lock_graph[index] = gp = gp_alloc;
2479 /*
2480 * The lockmgr status is only needed if KLM is loaded.
2481 */
2482 if (flock_zone_key != ZONE_KEY_UNINITIALIZED) {
2483 fg = flk_get_globals();
2484 fg->lockmgr_status[index] = fg->flk_lockmgr_status;
2485 }
2486 }
2487
2488 mutex_exit(&flock_lock);
2489
2490 if ((gp_alloc != NULL) && (gp != gp_alloc)) {
2491 /* There was a race to allocate the graph_t and we lost */
2492 mutex_destroy(&gp_alloc->gp_mutex);
2493 kmem_free(gp_alloc, sizeof (graph_t));
2494 }
2495
2496 return (gp);
2497 }
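
/*
 * Illustrative sketch (not compiled; guarded by #if 0): the
 * allocate-outside-the-lock, recheck-inside, free-the-loser pattern used
 * by flk_get_lock_graph() above, reduced to a single userland slot.
 * The toy_* names are hypothetical; the unlocked peek mirrors the
 * unlocked check of lock_graph[index] in the code above.
 */
#if 0
#include <stdlib.h>
#include <pthread.h>

static void *toy_slot;
static pthread_mutex_t toy_lock = PTHREAD_MUTEX_INITIALIZER;

static void *
toy_get_slot(size_t size)
{
	void *p, *alloc = NULL;

	/* optimistic allocation outside the lock (may lose the race) */
	if (toy_slot == NULL)
		alloc = calloc(1, size);

	pthread_mutex_lock(&toy_lock);
	p = toy_slot;
	if (p == NULL)			/* recheck under the lock */
		toy_slot = p = alloc;
	pthread_mutex_unlock(&toy_lock);

	if (alloc != NULL && p != alloc)
		free(alloc);		/* we lost the race; discard ours */
	return (p);
}
#endif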
2498
2499 /*
2500 * PSARC case 1997/292
2501 */
2502 int
2503 cl_flk_has_remote_locks_for_nlmid(vnode_t *vp, int nlmid)
2504 {
2505 lock_descriptor_t *lock;
2506 int result = 0;
2507 graph_t *gp;
2508 int lock_nlmid;
2509
2510 /*
2511 * Check to see if node is booted as a cluster. If not, return.
2512 */
2513 if ((cluster_bootflags & CLUSTER_BOOTED) == 0) {
2514 return (0);
2515 }
2516
2517 gp = flk_get_lock_graph(vp, FLK_USE_GRAPH);
2518 if (gp == NULL) {
2519 return (0);
2520 }
2521
2522 mutex_enter(&gp->gp_mutex);
2523
2524 SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
2525
2526 if (lock) {
2527 while (lock->l_vnode == vp) {
2528 /* get NLM id from sysid */
2529 lock_nlmid = GETNLMID(lock->l_flock.l_sysid);
2530
2531 /*
2532 * If NLM server request _and_ nlmid of lock matches
2533 * nlmid of argument, then we've found a remote lock.
2534 */
2535 if (IS_LOCKMGR(lock) && nlmid == lock_nlmid) {
2536 result = 1;
2537 goto done;
2538 }
2539 lock = lock->l_next;
2540 }
2541 }
2542
2543 SET_LOCK_TO_FIRST_SLEEP_VP(gp, lock, vp);
2544
2545 if (lock) {
2546 while (lock->l_vnode == vp) {
2547 /* get NLM id from sysid */
2548 lock_nlmid = GETNLMID(lock->l_flock.l_sysid);
2549
2550 /*
2551 * If NLM server request _and_ nlmid of lock matches
2552 * nlmid of argument, then we've found a remote lock.
2553 */
2554 if (IS_LOCKMGR(lock) && nlmid == lock_nlmid) {
2555 result = 1;
2556 goto done;
2557 }
2558 lock = lock->l_next;
2559 }
2560 }
2561
2562 done:
2563 mutex_exit(&gp->gp_mutex);
2564 return (result);
2565 }
2566
2567 /*
2568 * Determine whether there are any locks for the given vnode with a remote
2569 * sysid. Returns zero if not, non-zero if there are.
2570 *
2571 * Note that the return value from this function is potentially invalid
2572 * once it has been returned. The caller is responsible for providing its
2573 * own synchronization mechanism to ensure that the return value is useful
2574 * (e.g., see nfs_lockcompletion()).
2575 */
2576 int
2577 flk_has_remote_locks(vnode_t *vp)
2578 {
2579 lock_descriptor_t *lock;
2580 int result = 0;
2581 graph_t *gp;
2582
2583 gp = flk_get_lock_graph(vp, FLK_USE_GRAPH);
2584 if (gp == NULL) {
2585 return (0);
2586 }
2587
2588 mutex_enter(&gp->gp_mutex);
2589
2590 SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
2591
2592 if (lock) {
2593 while (lock->l_vnode == vp) {
2594 if (IS_REMOTE(lock)) {
2595 result = 1;
2596 goto done;
2597 }
2598 lock = lock->l_next;
2599 }
2600 }
2601
2602 SET_LOCK_TO_FIRST_SLEEP_VP(gp, lock, vp);
2603
2604 if (lock) {
2605 while (lock->l_vnode == vp) {
2606 if (IS_REMOTE(lock)) {
2607 result = 1;
2608 goto done;
2609 }
2610 lock = lock->l_next;
2611 }
2612 }
2613
2614 done:
2615 mutex_exit(&gp->gp_mutex);
2616 return (result);
2617 }
2618
2619 /*
2620  * Determine whether there are any locks for the given vnode with a remote
2621  * sysid matching the given sysid.
2622  * Used by the new (open source) NFS Lock Manager (NLM).
2623 */
2624 int
2625 flk_has_remote_locks_for_sysid(vnode_t *vp, int sysid)
2626 {
2627 lock_descriptor_t *lock;
2628 int result = 0;
2629 graph_t *gp;
2630
2631 if (sysid == 0)
2632 return (0);
2633
2634 gp = flk_get_lock_graph(vp, FLK_USE_GRAPH);
2635 if (gp == NULL) {
2636 return (0);
2637 }
2638
2639 mutex_enter(&gp->gp_mutex);
2640
2641 SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
2642
2643 if (lock) {
2644 while (lock->l_vnode == vp) {
2645 if (lock->l_flock.l_sysid == sysid) {
2646 result = 1;
2647 goto done;
2648 }
2649 lock = lock->l_next;
2650 }
2651 }
2652
2653 SET_LOCK_TO_FIRST_SLEEP_VP(gp, lock, vp);
2654
2655 if (lock) {
2656 while (lock->l_vnode == vp) {
2657 if (lock->l_flock.l_sysid == sysid) {
2658 result = 1;
2659 goto done;
2660 }
2661 lock = lock->l_next;
2662 }
2663 }
2664
2665 done:
2666 mutex_exit(&gp->gp_mutex);
2667 return (result);
2668 }
2669
2670 /*
2671 * Determine if there are any locks owned by the given sysid.
2672 * Returns zero if not, non-zero if there are. Note that this return code
2673 * could be derived from flk_get_{sleeping,active}_locks, but this routine
2674 * avoids all the memory allocations of those routines.
2675 *
2676 * This routine has the same synchronization issues as
2677 * flk_has_remote_locks.
2678 */
2679
2680 int
2681 flk_sysid_has_locks(int sysid, int lck_type)
2682 {
2683 int has_locks = 0;
2684 lock_descriptor_t *lock;
2685 graph_t *gp;
2686 int i;
2687
2688 for (i = 0; i < HASH_SIZE && !has_locks; i++) {
2689 mutex_enter(&flock_lock);
2690 gp = lock_graph[i];
2691 mutex_exit(&flock_lock);
2692 if (gp == NULL) {
2693 continue;
2694 }
2695
2696 mutex_enter(&gp->gp_mutex);
2697
2698 if (lck_type & FLK_QUERY_ACTIVE) {
2699 for (lock = ACTIVE_HEAD(gp)->l_next;
2700 lock != ACTIVE_HEAD(gp) && !has_locks;
2701 lock = lock->l_next) {
2702 if (lock->l_flock.l_sysid == sysid)
2703 has_locks = 1;
2704 }
2705 }
2706
2707 if (lck_type & FLK_QUERY_SLEEPING) {
2708 for (lock = SLEEPING_HEAD(gp)->l_next;
2709 lock != SLEEPING_HEAD(gp) && !has_locks;
2710 lock = lock->l_next) {
2711 if (lock->l_flock.l_sysid == sysid)
2712 has_locks = 1;
2713 }
2714 }
2715 mutex_exit(&gp->gp_mutex);
2716 }
2717
2718 return (has_locks);
2719 }
2720
2721
2722 /*
2723 * PSARC case 1997/292
2724 *
2725  * Requires: "sysid" is a pair [nlmid, sysid].  The lower half is a 16-bit
2726  * quantity, the real sysid generated by the NLM server; the upper half
2727 * identifies the node of the cluster where the NLM server ran.
2728 * This routine is only called by an NLM server running in a cluster.
2729 * Effects: Remove all locks held on behalf of the client identified
2730 * by "sysid."
2731 */
2732 void
2733 cl_flk_remove_locks_by_sysid(int sysid)
2734 {
2735 graph_t *gp;
2736 int i;
2737 lock_descriptor_t *lock, *nlock;
2738
2739 /*
2740 * Check to see if node is booted as a cluster. If not, return.
2741 */
2742 if ((cluster_bootflags & CLUSTER_BOOTED) == 0) {
2743 return;
2744 }
2745
2746 ASSERT(sysid != 0);
2747 for (i = 0; i < HASH_SIZE; i++) {
2748 mutex_enter(&flock_lock);
2749 gp = lock_graph[i];
2750 mutex_exit(&flock_lock);
2751
2752 if (gp == NULL)
2753 continue;
2754
2755 mutex_enter(&gp->gp_mutex); /* get mutex on lock graph */
2756
2757 /* signal sleeping requests so that they bail out */
2758 lock = SLEEPING_HEAD(gp)->l_next;
2759 while (lock != SLEEPING_HEAD(gp)) {
2760 nlock = lock->l_next;
2761 if (lock->l_flock.l_sysid == sysid) {
2762 INTERRUPT_WAKEUP(lock);
2763 }
2764 lock = nlock;
2765 }
2766
2767 /* delete active locks */
2768 lock = ACTIVE_HEAD(gp)->l_next;
2769 while (lock != ACTIVE_HEAD(gp)) {
2770 nlock = lock->l_next;
2771 if (lock->l_flock.l_sysid == sysid) {
2772 flk_delete_active_lock(lock, 0);
2773 flk_wakeup(lock, 1);
2774 flk_free_lock(lock);
2775 }
2776 lock = nlock;
2777 }
2778 mutex_exit(&gp->gp_mutex); /* release mutex on lock graph */
2779 }
2780 }
2781
2782 /*
2783  * Delete all locks in the system that belong to the sysid of the request.
2784 */
2785
2786 static void
2787 flk_delete_locks_by_sysid(lock_descriptor_t *request)
2788 {
2789 int sysid = request->l_flock.l_sysid;
2790 lock_descriptor_t *lock, *nlock;
2791 graph_t *gp;
2792 int i;
2793
2794 ASSERT(MUTEX_HELD(&request->l_graph->gp_mutex));
2795 ASSERT(sysid != 0);
2796
2797 mutex_exit(&request->l_graph->gp_mutex);
2798
2799 for (i = 0; i < HASH_SIZE; i++) {
2800 mutex_enter(&flock_lock);
2801 gp = lock_graph[i];
2802 mutex_exit(&flock_lock);
2803
2804 if (gp == NULL)
2805 continue;
2806
2807 mutex_enter(&gp->gp_mutex);
2808
2809 /* signal sleeping requests so that they bail out */
2810 lock = SLEEPING_HEAD(gp)->l_next;
2811 while (lock != SLEEPING_HEAD(gp)) {
2812 nlock = lock->l_next;
2813 if (lock->l_flock.l_sysid == sysid) {
2814 INTERRUPT_WAKEUP(lock);
2815 }
2816 lock = nlock;
2817 }
2818
2819 /* delete active locks */
2820 lock = ACTIVE_HEAD(gp)->l_next;
2821 while (lock != ACTIVE_HEAD(gp)) {
2822 nlock = lock->l_next;
2823 if (lock->l_flock.l_sysid == sysid) {
2824 flk_delete_active_lock(lock, 0);
2825 flk_wakeup(lock, 1);
2826 flk_free_lock(lock);
2827 }
2828 lock = nlock;
2829 }
2830 mutex_exit(&gp->gp_mutex);
2831 }
2832
2833 mutex_enter(&request->l_graph->gp_mutex);
2834 }
2835
2836 /*
2837 * Clustering: Deletes PXFS locks
2838 * Effects: Delete all locks on files in the given file system and with the
2839 * given PXFS id.
2840 */
2841 void
2842 cl_flk_delete_pxfs_locks(struct vfs *vfsp, int pxfsid)
2843 {
2844 lock_descriptor_t *lock, *nlock;
2845 graph_t *gp;
2846 int i;
2847
2848 for (i = 0; i < HASH_SIZE; i++) {
2849 mutex_enter(&flock_lock);
2850 gp = lock_graph[i];
2851 mutex_exit(&flock_lock);
2852
2853 if (gp == NULL)
2854 continue;
2855
2856 mutex_enter(&gp->gp_mutex);
2857
2858 /* signal sleeping requests so that they bail out */
2859 lock = SLEEPING_HEAD(gp)->l_next;
2860 while (lock != SLEEPING_HEAD(gp)) {
2861 nlock = lock->l_next;
2862 if (lock->l_vnode->v_vfsp == vfsp) {
2863 ASSERT(IS_PXFS(lock));
2864 if (GETPXFSID(lock->l_flock.l_sysid) ==
2865 pxfsid) {
2866 flk_set_state(lock,
2867 FLK_CANCELLED_STATE);
2868 flk_cancel_sleeping_lock(lock, 1);
2869 }
2870 }
2871 lock = nlock;
2872 }
2873
2874 /* delete active locks */
2875 lock = ACTIVE_HEAD(gp)->l_next;
2876 while (lock != ACTIVE_HEAD(gp)) {
2877 nlock = lock->l_next;
2878 if (lock->l_vnode->v_vfsp == vfsp) {
2879 ASSERT(IS_PXFS(lock));
2880 if (GETPXFSID(lock->l_flock.l_sysid) ==
2881 pxfsid) {
2882 flk_delete_active_lock(lock, 0);
2883 flk_wakeup(lock, 1);
2884 flk_free_lock(lock);
2885 }
2886 }
2887 lock = nlock;
2888 }
2889 mutex_exit(&gp->gp_mutex);
2890 }
2891 }
2892
2893 /*
2894 * Search for a sleeping lock manager lock which matches exactly this lock
2895 * request; if one is found, fake a signal to cancel it.
2896 *
2897 * Return 1 if a matching lock was found, 0 otherwise.
2898 */
2899
2900 static int
2901 flk_canceled(lock_descriptor_t *request)
2902 {
2903 lock_descriptor_t *lock, *nlock;
2904 graph_t *gp = request->l_graph;
2905 vnode_t *vp = request->l_vnode;
2906
2907 ASSERT(MUTEX_HELD(&gp->gp_mutex));
2908 ASSERT(IS_LOCKMGR(request));
2909 SET_LOCK_TO_FIRST_SLEEP_VP(gp, lock, vp);
2910
2911 if (lock) {
2912 while (lock->l_vnode == vp) {
2913 nlock = lock->l_next;
2914 if (SAME_OWNER(lock, request) &&
2915 lock->l_start == request->l_start &&
2916 lock->l_end == request->l_end) {
2917 INTERRUPT_WAKEUP(lock);
2918 return (1);
2919 }
2920 lock = nlock;
2921 }
2922 }
2923 return (0);
2924 }
2925
2926 /*
2927 * Remove all non-OFD locks for the vnode belonging to the given pid and sysid.
2928 * That is, since OFD locks are pid-less we'll never match on the incoming
2929 * pid. OFD locks are removed earlier in the close() path via closef() and
2930 * ofdcleanlock().
2931 */
2932 void
2933 cleanlocks(vnode_t *vp, pid_t pid, int sysid)
2934 {
2935 graph_t *gp;
2936 lock_descriptor_t *lock, *nlock;
2937 lock_descriptor_t *link_stack;
2938
2939 STACK_INIT(link_stack);
2940
2941 gp = flk_get_lock_graph(vp, FLK_USE_GRAPH);
2942
2943 if (gp == NULL)
2944 return;
2945 mutex_enter(&gp->gp_mutex);
2946
2947 CHECK_SLEEPING_LOCKS(gp);
2948 CHECK_ACTIVE_LOCKS(gp);
2949
2950 SET_LOCK_TO_FIRST_SLEEP_VP(gp, lock, vp);
2951
2952 if (lock) {
2953 do {
2954 nlock = lock->l_next;
2955 if ((lock->l_flock.l_pid == pid ||
2956 pid == IGN_PID) &&
2957 lock->l_flock.l_sysid == sysid) {
2958 CANCEL_WAKEUP(lock);
2959 }
2960 lock = nlock;
2961 } while (lock->l_vnode == vp);
2962 }
2963
2964 SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);
2965
2966 if (lock) {
2967 do {
2968 nlock = lock->l_next;
2969 if ((lock->l_flock.l_pid == pid ||
2970 pid == IGN_PID) &&
2971 lock->l_flock.l_sysid == sysid) {
2972 flk_delete_active_lock(lock, 0);
2973 STACK_PUSH(link_stack, lock, l_stack);
2974 }
2975 lock = nlock;
2976 } while (lock->l_vnode == vp);
2977 }
2978
2979 while ((lock = STACK_TOP(link_stack)) != NULL) {
2980 STACK_POP(link_stack, l_stack);
2981 flk_wakeup(lock, 1);
2982 flk_free_lock(lock);
2983 }
2984
2985 CHECK_SLEEPING_LOCKS(gp);
2986 CHECK_ACTIVE_LOCKS(gp);
2987 CHECK_OWNER_LOCKS(gp, pid, sysid, vp);
2988 mutex_exit(&gp->gp_mutex);
2989 }
2990
2991
2992 /*
2993 * Called from 'fs' read and write routines for files that have mandatory
2994 * locking enabled.
2995 */
2996
2997 int
2998 chklock(struct vnode *vp, int iomode, u_offset_t offset, ssize_t len, int fmode,
2999 caller_context_t *ct)
3000 {
3001 register int i;
3002 struct flock64 bf;
3003 int error = 0;
3004
3005 bf.l_type = (iomode & FWRITE) ? F_WRLCK : F_RDLCK;
3006 bf.l_whence = 0;
3007 bf.l_start = offset;
3008 bf.l_len = len;
3009 if (ct == NULL) {
3010 bf.l_pid = curproc->p_pid;
3011 bf.l_sysid = 0;
3012 } else {
3013 bf.l_pid = ct->cc_pid;
3014 bf.l_sysid = ct->cc_sysid;
3015 }
3016 i = (fmode & (FNDELAY|FNONBLOCK)) ? INOFLCK : INOFLCK|SLPFLCK;
3017 if ((i = reclock(vp, &bf, i, 0, offset, NULL)) != 0 ||
3018 bf.l_type != F_UNLCK)
3019 error = i ? i : EAGAIN;
3020 return (error);
3021 }
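
/*
 * Illustrative sketch (not compiled; guarded by #if 0): a userland
 * analogue of the check chklock() performs above.  It builds the same
 * kind of flock record (F_WRLCK for writes, F_RDLCK for reads) for the
 * I/O range and asks the kernel, via fcntl(F_GETLK), whether a
 * conflicting lock exists.  This only demonstrates the idea; chklock()
 * itself runs in the kernel and calls reclock() directly.  The toy_*
 * name is hypothetical.
 */
#if 0
#include <fcntl.h>
#include <unistd.h>
#include <errno.h>
#include <sys/types.h>

/* Return 0 if [offset, offset+len) is free of conflicts, else an errno. */
static int
toy_chklock(int fd, int writing, off_t offset, off_t len)
{
	struct flock fl;

	fl.l_type = writing ? F_WRLCK : F_RDLCK;
	fl.l_whence = SEEK_SET;
	fl.l_start = offset;
	fl.l_len = len;

	if (fcntl(fd, F_GETLK, &fl) == -1)
		return (errno);
	/* F_GETLK sets l_type to F_UNLCK when nothing would block us */
	return (fl.l_type == F_UNLCK ? 0 : EAGAIN);
}
#endif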
3022
3023 /*
3024  * convoff - converts the given lock's (l_start, l_whence) pair to be
3025  * relative to the given whence.
3026 */
3027 int
3028 convoff(struct vnode *vp, struct flock64 *lckdat, int whence, offset_t offset)
3029 {
3030 int error;
3031 struct vattr vattr;
3032
3033 if ((lckdat->l_whence == 2) || (whence == 2)) {
3034 vattr.va_mask = AT_SIZE;
3035 if (error = VOP_GETATTR(vp, &vattr, 0, CRED(), NULL))
3036 return (error);
3037 }
3038
3039 switch (lckdat->l_whence) {
3040 case 1:
3041 lckdat->l_start += offset;
3042 break;
3043 case 2:
3044 lckdat->l_start += vattr.va_size;
3045 /* FALLTHRU */
3046 case 0:
3047 break;
3048 default:
3049 return (EINVAL);
3050 }
3051
3052 if (lckdat->l_start < 0)
3053 return (EINVAL);
3054
3055 switch (whence) {
3056 case 1:
3057 lckdat->l_start -= offset;
3058 break;
3059 case 2:
3060 lckdat->l_start -= vattr.va_size;
3061 /* FALLTHRU */
3062 case 0:
3063 break;
3064 default:
3065 return (EINVAL);
3066 }
3067
3068 lckdat->l_whence = (short)whence;
3069 return (0);
3070 }
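
/*
 * Illustrative sketch (not compiled; guarded by #if 0): the arithmetic
 * convoff() performs, shown without the vnode/VOP_GETATTR plumbing.
 * The start offset is first rebased to an absolute (SEEK_SET-style)
 * value using the current whence, then rebased again relative to the
 * requested whence.  The toy_convoff name and its (cur_offset, filesize)
 * parameters are hypothetical.
 */
#if 0
#include <errno.h>
#include <unistd.h>
#include <sys/types.h>

static int
toy_convoff(off_t *start, short *whencep, int new_whence,
    off_t cur_offset, off_t filesize)
{
	/* rebase to an absolute offset */
	switch (*whencep) {
	case SEEK_SET:				break;
	case SEEK_CUR:	*start += cur_offset;	break;
	case SEEK_END:	*start += filesize;	break;
	default:	return (EINVAL);
	}
	if (*start < 0)
		return (EINVAL);

	/* rebase from absolute to the requested whence */
	switch (new_whence) {
	case SEEK_SET:				break;
	case SEEK_CUR:	*start -= cur_offset;	break;
	case SEEK_END:	*start -= filesize;	break;
	default:	return (EINVAL);
	}
	*whencep = (short)new_whence;
	return (0);
}
#endif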
3071
3072
3073 /* proc_graph function definitions */
3074
3075 /*
3076 * Function checks for deadlock due to the new 'lock'. If deadlock found
3077 * edges of this lock are freed and returned.
3078 */
3079
3080 static int
3081 flk_check_deadlock(lock_descriptor_t *lock)
3082 {
3083 proc_vertex_t *start_vertex, *pvertex;
3084 proc_vertex_t *dvertex;
3085 proc_edge_t *pep, *ppep;
3086 edge_t *ep, *nep;
3087 proc_vertex_t *process_stack;
3088
3089 /*
3090 * OFD style locks are not associated with any process so there is
3091 * no proc graph for these. Thus we cannot, and do not, do deadlock
3092 * detection.
3093 */
3094 if (lock->l_ofd != NULL)
3095 return (0);
3096
3097 STACK_INIT(process_stack);
3098
3099 mutex_enter(&flock_lock);
3100 start_vertex = flk_get_proc_vertex(lock);
3101 ASSERT(start_vertex != NULL);
3102
3103 /* construct the edges from this process to other processes */
3104
3105 ep = FIRST_ADJ(lock);
3106 while (ep != HEAD(lock)) {
3107 proc_vertex_t *adj_proc;
3108
3109 adj_proc = flk_get_proc_vertex(ep->to_vertex);
3110 for (pep = start_vertex->edge; pep != NULL; pep = pep->next) {
3111 if (pep->to_proc == adj_proc) {
3112 ASSERT(pep->refcount);
3113 pep->refcount++;
3114 break;
3115 }
3116 }
3117 if (pep == NULL) {
3118 pep = flk_get_proc_edge();
3119 pep->to_proc = adj_proc;
3120 pep->refcount = 1;
3121 adj_proc->incount++;
3122 pep->next = start_vertex->edge;
3123 start_vertex->edge = pep;
3124 }
3125 ep = NEXT_ADJ(ep);
3126 }
3127
3128 ep = FIRST_IN(lock);
3129
3130 while (ep != HEAD(lock)) {
3131 proc_vertex_t *in_proc;
3132
3133 in_proc = flk_get_proc_vertex(ep->from_vertex);
3134
3135 for (pep = in_proc->edge; pep != NULL; pep = pep->next) {
3136 if (pep->to_proc == start_vertex) {
3137 ASSERT(pep->refcount);
3138 pep->refcount++;
3139 break;
3140 }
3141 }
3142 if (pep == NULL) {
3143 pep = flk_get_proc_edge();
3144 pep->to_proc = start_vertex;
3145 pep->refcount = 1;
3146 start_vertex->incount++;
3147 pep->next = in_proc->edge;
3148 in_proc->edge = pep;
3149 }
3150 ep = NEXT_IN(ep);
3151 }
3152
3153 if (start_vertex->incount == 0) {
3154 mutex_exit(&flock_lock);
3155 return (0);
3156 }
3157
3158 flk_proc_graph_uncolor();
3159
3160 start_vertex->p_sedge = start_vertex->edge;
3161
3162 STACK_PUSH(process_stack, start_vertex, p_stack);
3163
3164 while ((pvertex = STACK_TOP(process_stack)) != NULL) {
3165 for (pep = pvertex->p_sedge; pep != NULL; pep = pep->next) {
3166 dvertex = pep->to_proc;
3167 if (!PROC_ARRIVED(dvertex)) {
3168 STACK_PUSH(process_stack, dvertex, p_stack);
3169 dvertex->p_sedge = dvertex->edge;
3170 PROC_ARRIVE(pvertex);
3171 pvertex->p_sedge = pep->next;
3172 break;
3173 }
3174 if (!PROC_DEPARTED(dvertex))
3175 goto deadlock;
3176 }
3177 if (pep == NULL) {
3178 PROC_DEPART(pvertex);
3179 STACK_POP(process_stack, p_stack);
3180 }
3181 }
3182 mutex_exit(&flock_lock);
3183 return (0);
3184
3185 deadlock:
3186
3187 /* we remove all lock edges and proc edges */
3188
3189 ep = FIRST_ADJ(lock);
3190 while (ep != HEAD(lock)) {
3191 proc_vertex_t *adj_proc;
3192 adj_proc = flk_get_proc_vertex(ep->to_vertex);
3193 nep = NEXT_ADJ(ep);
3194 IN_LIST_REMOVE(ep);
3195 ADJ_LIST_REMOVE(ep);
3196 flk_free_edge(ep);
3197 ppep = start_vertex->edge;
3198 for (pep = start_vertex->edge; pep != NULL; ppep = pep,
3199 pep = ppep->next) {
3200 if (pep->to_proc == adj_proc) {
3201 pep->refcount--;
3202 if (pep->refcount == 0) {
3203 if (pep == ppep) {
3204 start_vertex->edge = pep->next;
3205 } else {
3206 ppep->next = pep->next;
3207 }
3208 adj_proc->incount--;
3209 flk_proc_release(adj_proc);
3210 flk_free_proc_edge(pep);
3211 }
3212 break;
3213 }
3214 }
3215 ep = nep;
3216 }
3217 ep = FIRST_IN(lock);
3218 while (ep != HEAD(lock)) {
3219 proc_vertex_t *in_proc;
3220 in_proc = flk_get_proc_vertex(ep->from_vertex);
3221 nep = NEXT_IN(ep);
3222 IN_LIST_REMOVE(ep);
3223 ADJ_LIST_REMOVE(ep);
3224 flk_free_edge(ep);
3225 ppep = in_proc->edge;
3226 for (pep = in_proc->edge; pep != NULL; ppep = pep,
3227 pep = ppep->next) {
3228 if (pep->to_proc == start_vertex) {
3229 pep->refcount--;
3230 if (pep->refcount == 0) {
3231 if (pep == ppep) {
3232 in_proc->edge = pep->next;
3233 } else {
3234 ppep->next = pep->next;
3235 }
3236 start_vertex->incount--;
3237 flk_proc_release(in_proc);
3238 flk_free_proc_edge(pep);
3239 }
3240 break;
3241 }
3242 }
3243 ep = nep;
3244 }
3245 flk_proc_release(start_vertex);
3246 mutex_exit(&flock_lock);
3247 return (1);
3248 }
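
/*
 * Illustrative sketch (not compiled; guarded by #if 0): the
 * arrive/depart ("gray/black") depth-first search used by
 * flk_check_deadlock() above, reduced to a recursive cycle check on a
 * toy adjacency matrix.  Reaching a vertex that has arrived but not yet
 * departed means we found a back edge, i.e., a cycle (a deadlock in the
 * process graph).  All toy_* names are hypothetical.
 */
#if 0
#define TOY_NPROC	8

enum toy_mark { TOY_WHITE, TOY_ARRIVED, TOY_DEPARTED };

static int
toy_has_cycle(const int adj[TOY_NPROC][TOY_NPROC], enum toy_mark *mark,
    int u)
{
	int v;

	mark[u] = TOY_ARRIVED;
	for (v = 0; v < TOY_NPROC; v++) {
		if (!adj[u][v])
			continue;
		if (mark[v] == TOY_ARRIVED)
			return (1);		/* back edge: cycle found */
		if (mark[v] == TOY_WHITE && toy_has_cycle(adj, mark, v))
			return (1);
	}
	mark[u] = TOY_DEPARTED;
	return (0);
}
#endif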
3249
3250 /*
3251  * Get a proc vertex.  If the lock's pvertex value indexes a matching proc
3252  * vertex in the list we return that one; otherwise we allocate a new one.
3253  * If necessary, we also grow the list of vertices.
3254 */
3255
3256 static proc_vertex_t *
3257 flk_get_proc_vertex(lock_descriptor_t *lock)
3258 {
3259 int i;
3260 proc_vertex_t *pv;
3261 proc_vertex_t **palloc;
3262
3263 ASSERT(MUTEX_HELD(&flock_lock));
3264 if (lock->pvertex != -1) {
3265 ASSERT(lock->pvertex >= 0);
3266 pv = pgraph.proc[lock->pvertex];
3267 if (pv != NULL && PROC_SAME_OWNER(lock, pv)) {
3268 return (pv);
3269 }
3270 }
3271 for (i = 0; i < pgraph.gcount; i++) {
3272 pv = pgraph.proc[i];
3273 if (pv != NULL && PROC_SAME_OWNER(lock, pv)) {
3274 lock->pvertex = pv->index = i;
3275 return (pv);
3276 }
3277 }
3278 pv = kmem_zalloc(sizeof (struct proc_vertex), KM_SLEEP);
3279 pv->pid = lock->l_flock.l_pid;
3280 pv->sysid = lock->l_flock.l_sysid;
3281 flk_proc_vertex_allocs++;
3282 if (pgraph.free != 0) {
3283 for (i = 0; i < pgraph.gcount; i++) {
3284 if (pgraph.proc[i] == NULL) {
3285 pgraph.proc[i] = pv;
3286 lock->pvertex = pv->index = i;
3287 pgraph.free--;
3288 return (pv);
3289 }
3290 }
3291 }
3292 palloc = kmem_zalloc((pgraph.gcount + PROC_CHUNK) *
3293 sizeof (proc_vertex_t *), KM_SLEEP);
3294
3295 if (pgraph.proc) {
3296 bcopy(pgraph.proc, palloc,
3297 pgraph.gcount * sizeof (proc_vertex_t *));
3298
3299 kmem_free(pgraph.proc,
3300 pgraph.gcount * sizeof (proc_vertex_t *));
3301 }
3302 pgraph.proc = palloc;
3303 pgraph.free += (PROC_CHUNK - 1);
3304 pv->index = lock->pvertex = pgraph.gcount;
3305 pgraph.gcount += PROC_CHUNK;
3306 pgraph.proc[pv->index] = pv;
3307 return (pv);
3308 }
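
/*
 * Illustrative sketch (not compiled; guarded by #if 0): the
 * grow-by-chunk pattern used by flk_get_proc_vertex() above -- reuse a
 * free slot if one exists, otherwise reallocate the pointer array a
 * chunk at a time.  The toy_* names and TOY_CHUNK size are hypothetical,
 * and allocation-failure handling is omitted for brevity.
 */
#if 0
#include <stdlib.h>
#include <string.h>

#define TOY_CHUNK	20

struct toy_table {
	void **slots;
	int count;	/* total number of slots */
	int free;	/* number of empty (NULL) slots */
};

/* Store 'item' in a slot, growing the table if necessary; return index. */
static int
toy_store(struct toy_table *t, void *item)
{
	void **nslots;
	int i;

	if (t->free != 0) {
		for (i = 0; i < t->count; i++) {
			if (t->slots[i] == NULL) {
				t->slots[i] = item;
				t->free--;
				return (i);
			}
		}
	}
	nslots = calloc(t->count + TOY_CHUNK, sizeof (void *));
	if (t->slots != NULL) {
		memcpy(nslots, t->slots, t->count * sizeof (void *));
		free(t->slots);
	}
	t->slots = nslots;
	t->free += TOY_CHUNK - 1;
	i = t->count;
	t->count += TOY_CHUNK;
	t->slots[i] = item;
	return (i);
}
#endif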
3309
3310 /*
3311 * Allocate a proc edge.
3312 */
3313
3314 static proc_edge_t *
3315 flk_get_proc_edge()
3316 {
3317 proc_edge_t *pep;
3318
3319 pep = kmem_zalloc(sizeof (proc_edge_t), KM_SLEEP);
3320 flk_proc_edge_allocs++;
3321 return (pep);
3322 }
3323
3324 /*
3325 * Free the proc edge. Called whenever its reference count goes to zero.
3326 */
3327
3328 static void
3329 flk_free_proc_edge(proc_edge_t *pep)
3330 {
3331 ASSERT(pep->refcount == 0);
3332 kmem_free((void *)pep, sizeof (proc_edge_t));
3333 flk_proc_edge_frees++;
3334 }
3335
3336 /*
3337  * Uncolor the process graph; vertices are reset explicitly only when the mark wraps.
3338 */
3339
3340 static void
3341 flk_proc_graph_uncolor()
3342 {
3343 int i;
3344
3345 if (pgraph.mark == UINT_MAX) {
3346 for (i = 0; i < pgraph.gcount; i++)
3347 if (pgraph.proc[i] != NULL) {
3348 pgraph.proc[i]->atime = 0;
3349 pgraph.proc[i]->dtime = 0;
3350 }
3351 pgraph.mark = 1;
3352 } else {
3353 pgraph.mark++;
3354 }
3355 }
3356
3357 /*
3358  * Release the proc vertex iff it has neither in-edges nor out-edges.
3359 */
3360
3361 static void
3362 flk_proc_release(proc_vertex_t *proc)
3363 {
3364 ASSERT(MUTEX_HELD(&flock_lock));
3365 if (proc->edge == NULL && proc->incount == 0) {
3366 pgraph.proc[proc->index] = NULL;
3367 pgraph.free++;
3368 kmem_free(proc, sizeof (proc_vertex_t));
3369 flk_proc_vertex_frees++;
3370 }
3371 }
3372
3373 /*
3374  * Updates the process graph to reflect a change in a lock graph.
3375  * Note: we should call this function only after we have a correctly
3376  * recomputed lock graph; otherwise we might miss a deadlock.
3377  * E.g., in flk_relation() we call this function after
3378  * flk_recompute_dependencies(); otherwise, if a process tries to lock a
3379  * vnode hashed into another graph, it might sleep forever.
3380 */
3381
3382 static void
3383 flk_update_proc_graph(edge_t *ep, int delete)
3384 {
3385 proc_vertex_t *toproc, *fromproc;
3386 proc_edge_t *pep, *prevpep;
3387
3388 mutex_enter(&flock_lock);
3389
3390 /*
3391 * OFD style locks are not associated with any process so there is
3392 * no proc graph for these.
3393 */
3394 if (ep->from_vertex->l_ofd != NULL) {
3395 mutex_exit(&flock_lock);
3396 return;
3397 }
3398
3399 toproc = flk_get_proc_vertex(ep->to_vertex);
3400 fromproc = flk_get_proc_vertex(ep->from_vertex);
3401
3402 if (!delete)
3403 goto add;
3404 pep = prevpep = fromproc->edge;
3405
3406 ASSERT(pep != NULL);
3407 while (pep != NULL) {
3408 if (pep->to_proc == toproc) {
3409 ASSERT(pep->refcount > 0);
3410 pep->refcount--;
3411 if (pep->refcount == 0) {
3412 if (pep == prevpep) {
3413 fromproc->edge = pep->next;
3414 } else {
3415 prevpep->next = pep->next;
3416 }
3417 toproc->incount--;
3418 flk_proc_release(toproc);
3419 flk_free_proc_edge(pep);
3420 }
3421 break;
3422 }
3423 prevpep = pep;
3424 pep = pep->next;
3425 }
3426 flk_proc_release(fromproc);
3427 mutex_exit(&flock_lock);
3428 return;
3429 add:
3430
3431 pep = fromproc->edge;
3432
3433 while (pep != NULL) {
3434 if (pep->to_proc == toproc) {
3435 ASSERT(pep->refcount > 0);
3436 pep->refcount++;
3437 break;
3438 }
3439 pep = pep->next;
3440 }
3441 if (pep == NULL) {
3442 pep = flk_get_proc_edge();
3443 pep->to_proc = toproc;
3444 pep->refcount = 1;
3445 toproc->incount++;
3446 pep->next = fromproc->edge;
3447 fromproc->edge = pep;
3448 }
3449 mutex_exit(&flock_lock);
3450 }
3451
3452 /*
3453 * Set the control status for lock manager requests.
3454 *
3455 */
3456
3457 /*
3458 * PSARC case 1997/292
3459 *
3460 * Requires: "nlmid" must be >= 1 and <= clconf_maximum_nodeid().
3461 * Effects: Set the state of the NLM server identified by "nlmid"
3462 * in the NLM registry to state "nlm_state."
3463 * Raises exception no_such_nlm if "nlmid" doesn't identify a known
3464 * NLM server to this LLM.
3465 * Note that when this routine is called with NLM_SHUTTING_DOWN there
3466  * may be lock requests that have gotten started but not finished. In
3467 * particular, there may be blocking requests that are in the callback code
3468 * before sleeping (so they're not holding the lock for the graph). If
3469 * such a thread reacquires the graph's lock (to go to sleep) after
3470 * NLM state in the NLM registry is set to a non-up value,
3471 * it will notice the status and bail out. If the request gets
3472 * granted before the thread can check the NLM registry, let it
3473 * continue normally. It will get flushed when we are called with NLM_DOWN.
3474 *
3475 * Modifies: nlm_reg_obj (global)
3476 * Arguments:
3477 * nlmid (IN): id uniquely identifying an NLM server
3478 * nlm_state (IN): NLM server state to change "nlmid" to
3479 */
3480 void
3481 cl_flk_set_nlm_status(int nlmid, flk_nlm_status_t nlm_state)
3482 {
3483 /*
3484 * Check to see if node is booted as a cluster. If not, return.
3485 */
3486 if ((cluster_bootflags & CLUSTER_BOOTED) == 0) {
3487 return;
3488 }
3489
3490 /*
3491 * Check for development/debugging. It is possible to boot a node
3492 * in non-cluster mode, and then run a special script, currently
3493 * available only to developers, to bring up the node as part of a
3494 * cluster. The problem is that running such a script does not
3495 * result in the routine flk_init() being called and hence global array
3496 * nlm_reg_status is NULL. The NLM thinks it's in cluster mode,
3497 * but the LLM needs to do an additional check to see if the global
3498 * array has been created or not. If nlm_reg_status is NULL, then
3499 * return, else continue.
3500 */
3501 if (nlm_reg_status == NULL) {
3502 return;
3503 }
3504
3505 ASSERT(nlmid <= nlm_status_size && nlmid >= 0);
3506 mutex_enter(&nlm_reg_lock);
3507
3508 if (FLK_REGISTRY_IS_NLM_UNKNOWN(nlm_reg_status, nlmid)) {
3509 /*
3510 * If the NLM server "nlmid" is unknown in the NLM registry,
3511 * add it to the registry in the nlm shutting down state.
3512 */
3513 FLK_REGISTRY_CHANGE_NLM_STATE(nlm_reg_status, nlmid,
3514 FLK_NLM_SHUTTING_DOWN);
3515 } else {
3516 /*
3517 * Change the state of the NLM server identified by "nlmid"
3518 * in the NLM registry to the argument "nlm_state."
3519 */
3520 FLK_REGISTRY_CHANGE_NLM_STATE(nlm_reg_status, nlmid,
3521 nlm_state);
3522 }
3523
3524 /*
3525 * The reason we must register the NLM server that is shutting down
3526 * with an LLM that doesn't already know about it (never sent a lock
3527 * request) is to handle correctly a race between shutdown and a new
3528 * lock request. Suppose that a shutdown request from the NLM server
3529 * invokes this routine at the LLM, and a thread is spawned to
3530 * service the request. Now suppose a new lock request is in
3531 * progress and has already passed the first line of defense in
3532  * reclock(), which denies new lock requests from NLM servers
3533 * that are not in the NLM_UP state. After the current routine
3534 * is invoked for both phases of shutdown, the routine will return,
3535 * having done nothing, and the lock request will proceed and
3536 * probably be granted. The problem is that the shutdown was ignored
3537 * by the lock request because there was no record of that NLM server
3538 * shutting down. We will be in the peculiar position of thinking
3539  * that we've shut down the NLM server and all locks at all LLMs have
3540 * been discarded, but in fact there's still one lock held.
3541 * The solution is to record the existence of NLM server and change
3542 * its state immediately to NLM_SHUTTING_DOWN. The lock request in
3543 * progress may proceed because the next phase NLM_DOWN will catch
3544 * this lock and discard it.
3545 */
3546 mutex_exit(&nlm_reg_lock);
3547
3548 switch (nlm_state) {
3549 case FLK_NLM_UP:
3550 /*
3551 * Change the NLM state of all locks still held on behalf of
3552 * the NLM server identified by "nlmid" to NLM_UP.
3553 */
3554 cl_flk_change_nlm_state_all_locks(nlmid, FLK_NLM_UP);
3555 break;
3556
3557 case FLK_NLM_SHUTTING_DOWN:
3558 /*
3559 * Wake up all sleeping locks for the NLM server identified
3560 * by "nlmid." Note that eventually all woken threads will
3561 * have their lock requests cancelled and descriptors
3562 * removed from the sleeping lock list. Note that the NLM
3563 * server state associated with each lock descriptor is
3564 * changed to FLK_NLM_SHUTTING_DOWN.
3565 */
3566 cl_flk_wakeup_sleeping_nlm_locks(nlmid);
3567 break;
3568
3569 case FLK_NLM_DOWN:
3570 /*
3571 * Discard all active, granted locks for this NLM server
3572 * identified by "nlmid."
3573 */
3574 cl_flk_unlock_nlm_granted(nlmid);
3575 break;
3576
3577 default:
3578 panic("cl_set_nlm_status: bad status (%d)", nlm_state);
3579 }
3580 }
3581
3582 /*
3583 * Set the control status for lock manager requests.
3584 *
3585 * Note that when this routine is called with FLK_WAKEUP_SLEEPERS, there
3586  * may be lock requests that have gotten started but not finished. In
3587 * particular, there may be blocking requests that are in the callback code
3588 * before sleeping (so they're not holding the lock for the graph). If
3589 * such a thread reacquires the graph's lock (to go to sleep) after
3590 * flk_lockmgr_status is set to a non-up value, it will notice the status
3591 * and bail out. If the request gets granted before the thread can check
3592 * flk_lockmgr_status, let it continue normally. It will get flushed when
3593 * we are called with FLK_LOCKMGR_DOWN.
3594 */
3595
3596 void
3597 flk_set_lockmgr_status(flk_lockmgr_status_t status)
3598 {
3599 int i;
3600 graph_t *gp;
3601 struct flock_globals *fg;
3602
3603 fg = flk_get_globals();
3604 ASSERT(fg != NULL);
3605
3606 mutex_enter(&flock_lock);
3607 fg->flk_lockmgr_status = status;
3608 mutex_exit(&flock_lock);
3609
3610 /*
3611 * If the lock manager is coming back up, all that's needed is to
3612 * propagate this information to the graphs. If the lock manager
3613 * is going down, additional action is required, and each graph's
3614 * copy of the state is updated atomically with this other action.
3615 */
3616 switch (status) {
3617 case FLK_LOCKMGR_UP:
3618 for (i = 0; i < HASH_SIZE; i++) {
3619 mutex_enter(&flock_lock);
3620 gp = lock_graph[i];
3621 mutex_exit(&flock_lock);
3622 if (gp == NULL)
3623 continue;
3624 mutex_enter(&gp->gp_mutex);
3625 fg->lockmgr_status[i] = status;
3626 mutex_exit(&gp->gp_mutex);
3627 }
3628 break;
3629 case FLK_WAKEUP_SLEEPERS:
3630 wakeup_sleeping_lockmgr_locks(fg);
3631 break;
3632 case FLK_LOCKMGR_DOWN:
3633 unlock_lockmgr_granted(fg);
3634 break;
3635 default:
3636 panic("flk_set_lockmgr_status: bad status (%d)", status);
3637 break;
3638 }
3639 }
3640
3641 /*
3642 * This routine returns all the locks that are active or sleeping and are
3643 * associated with a particular set of identifiers. If lock_state != 0, then
3644 * only locks that match the lock_state are returned. If lock_state == 0, then
3645 * all locks are returned. If pid == NOPID, the pid is ignored. If
3646 * use_sysid is FALSE, then the sysid is ignored. If vp is NULL, then the
3647 * vnode pointer is ignored.
3648 *
3649 * A list containing the vnode pointer and an flock structure
3650 * describing the lock is returned. Each element in the list is
3651 * dynamically allocated and must be freed by the caller. The
3652 * last item in the list is denoted by a NULL value in the ll_next
3653 * field.
3654 *
3655 * The vnode pointers returned are held. The caller is responsible
3656 * for releasing these. Note that the returned list is only a snapshot of
3657 * the current lock information, and that it is a snapshot of a moving
3658 * target (only one graph is locked at a time).
3659 */
3660
3661 locklist_t *
3662 get_lock_list(int list_type, int lock_state, int sysid, boolean_t use_sysid,
3663 pid_t pid, const vnode_t *vp, zoneid_t zoneid)
3664 {
3665 lock_descriptor_t *lock;
3666 lock_descriptor_t *graph_head;
3667 locklist_t listhead;
3668 locklist_t *llheadp;
3669 locklist_t *llp;
3670 locklist_t *lltp;
3671 graph_t *gp;
3672 int i;
3673 int first_index; /* graph index */
3674 int num_indexes; /* graph index */
3675
3676 ASSERT((list_type == FLK_ACTIVE_STATE) ||
3677 (list_type == FLK_SLEEPING_STATE));
3678
3679 /*
3680 * Get a pointer to something to use as a list head while building
3681 * the rest of the list.
3682 */
3683 llheadp = &listhead;
3684 lltp = llheadp;
3685 llheadp->ll_next = (locklist_t *)NULL;
3686
3687 /* Figure out which graphs we want to look at. */
3688 if (vp == NULL) {
3689 first_index = 0;
3690 num_indexes = HASH_SIZE;
3691 } else {
3692 first_index = HASH_INDEX(vp);
3693 num_indexes = 1;
3694 }
3695
3696 for (i = first_index; i < first_index + num_indexes; i++) {
3697 mutex_enter(&flock_lock);
3698 gp = lock_graph[i];
3699 mutex_exit(&flock_lock);
3700 if (gp == NULL) {
3701 continue;
3702 }
3703
3704 mutex_enter(&gp->gp_mutex);
3705 graph_head = (list_type == FLK_ACTIVE_STATE) ?
3706 ACTIVE_HEAD(gp) : SLEEPING_HEAD(gp);
3707 for (lock = graph_head->l_next;
3708 lock != graph_head;
3709 lock = lock->l_next) {
3710 if (use_sysid && lock->l_flock.l_sysid != sysid)
3711 continue;
3712 if (pid != NOPID && lock->l_flock.l_pid != pid)
3713 continue;
3714 if (vp != NULL && lock->l_vnode != vp)
3715 continue;
3716 if (lock_state && !(lock_state & lock->l_state))
3717 continue;
3718 if (zoneid != lock->l_zoneid && zoneid != ALL_ZONES)
3719 continue;
3720 /*
3721 * A matching lock was found. Allocate
3722 * space for a new locklist entry and fill
3723 * it in.
3724 */
3725 llp = kmem_alloc(sizeof (locklist_t), KM_SLEEP);
3726 lltp->ll_next = llp;
3727 VN_HOLD(lock->l_vnode);
3728 llp->ll_vp = lock->l_vnode;
3729 create_flock(lock, &(llp->ll_flock));
3730 llp->ll_next = (locklist_t *)NULL;
3731 lltp = llp;
3732 }
3733 mutex_exit(&gp->gp_mutex);
3734 }
3735
3736 llp = llheadp->ll_next;
3737 return (llp);
3738 }
3739
3740 /*
3741 * These two functions are simply interfaces to get_lock_list. They return
3742 * a list of sleeping or active locks for the given sysid and pid. See
3743 * get_lock_list for details.
3744 *
3745 * In either case we don't particularly care to specify the zone of interest;
3746 * the sysid-space is global across zones, so the sysid will map to exactly one
3747 * zone, and we'll return information for that zone.
3748 */
3749
3750 locklist_t *
3751 flk_get_sleeping_locks(int sysid, pid_t pid)
3752 {
3753 return (get_lock_list(FLK_SLEEPING_STATE, 0, sysid, B_TRUE, pid, NULL,
3754 ALL_ZONES));
3755 }
3756
3757 locklist_t *
3758 flk_get_active_locks(int sysid, pid_t pid)
3759 {
3760 return (get_lock_list(FLK_ACTIVE_STATE, 0, sysid, B_TRUE, pid, NULL,
3761 ALL_ZONES));
3762 }
3763
3764 /*
3765 * Another interface to get_lock_list. This one returns all the active
3766 * locks for a given vnode. Again, see get_lock_list for details.
3767 *
3768 * We don't need to specify which zone's locks we're interested in. The matter
3769 * would only be interesting if the vnode belonged to NFS, and NFS vnodes can't
3770 * be used by multiple zones, so the list of locks will all be from the right
3771 * zone.
3772 */
3773
3774 locklist_t *
3775 flk_active_locks_for_vp(const vnode_t *vp)
3776 {
3777 return (get_lock_list(FLK_ACTIVE_STATE, 0, 0, B_FALSE, NOPID, vp,
3778 ALL_ZONES));
3779 }
3780
3781 /*
3782 * Another interface to get_lock_list. This one returns all the active
3783 * nbmand locks for a given vnode. Again, see get_lock_list for details.
3784 *
3785 * See the comment for flk_active_locks_for_vp() for why we don't care to
3786 * specify the particular zone of interest.
3787 */
3788 locklist_t *
3789 flk_active_nbmand_locks_for_vp(const vnode_t *vp)
3790 {
3791 return (get_lock_list(FLK_ACTIVE_STATE, NBMAND_LOCK, 0, B_FALSE,
3792 NOPID, vp, ALL_ZONES));
3793 }
3794
3795 /*
3796 * Another interface to get_lock_list. This one returns all the active
3797 * nbmand locks for a given pid. Again, see get_lock_list for details.
3798 *
3799 * The zone doesn't need to be specified here; the locks held by a
3800 * particular process will either be local (ie, non-NFS) or from the zone
3801 * the process is executing in. This is because other parts of the system
3802 * ensure that an NFS vnode can't be used in a zone other than that in
3803 * which it was opened.
3804 */
3805 locklist_t *
3806 flk_active_nbmand_locks(pid_t pid)
3807 {
3808 return (get_lock_list(FLK_ACTIVE_STATE, NBMAND_LOCK, 0, B_FALSE,
3809 pid, NULL, ALL_ZONES));
3810 }
3811
3812 /*
3813 * Free up all entries in the locklist.
3814 */
3815 void
3816 flk_free_locklist(locklist_t *llp)
3817 {
3818 locklist_t *next_llp;
3819
3820 while (llp) {
3821 next_llp = llp->ll_next;
3822 VN_RELE(llp->ll_vp);
3823 kmem_free(llp, sizeof (*llp));
3824 llp = next_llp;
3825 }
3826 }
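
/*
 * Illustrative sketch (not compiled; guarded by #if 0): how a kernel
 * consumer might walk the list returned by flk_get_active_locks() and
 * then release it with flk_free_locklist(), which also drops the vnode
 * holds taken by get_lock_list().  The dumping routine and the chosen
 * output format are hypothetical.
 */
#if 0
static void
toy_dump_active_locks(int sysid, pid_t pid)
{
	locklist_t *llp, *head;

	head = flk_get_active_locks(sysid, pid);
	for (llp = head; llp != NULL; llp = llp->ll_next) {
		cmn_err(CE_CONT, "lock: vp=%p start=%lld len=%lld type=%d\n",
		    (void *)llp->ll_vp,
		    (longlong_t)llp->ll_flock.l_start,
		    (longlong_t)llp->ll_flock.l_len,
		    llp->ll_flock.l_type);
	}
	flk_free_locklist(head);	/* frees entries and releases vnodes */
}
#endif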
3827
3828 static void
3829 cl_flk_change_nlm_state_all_locks(int nlmid, flk_nlm_status_t nlm_state)
3830 {
3831 /*
3832 * For each graph "lg" in the hash table lock_graph do
3833 * a. Get the list of sleeping locks
3834 * b. For each lock descriptor in the list do
3835 * i. If the requested lock is an NLM server request AND
3836 * the nlmid is the same as the routine argument then
3837 * change the lock descriptor's state field to
3838 * "nlm_state."
3839 * c. Get the list of active locks
3840 * d. For each lock descriptor in the list do
3841 * i. If the requested lock is an NLM server request AND
3842 * the nlmid is the same as the routine argument then
3843 * change the lock descriptor's state field to
3844 * "nlm_state."
3845 */
3846
3847 int i;
3848 graph_t *gp; /* lock graph */
3849 lock_descriptor_t *lock; /* lock */
3850 lock_descriptor_t *nlock = NULL; /* next lock */
3851 int lock_nlmid;
3852
3853 for (i = 0; i < HASH_SIZE; i++) {
3854 mutex_enter(&flock_lock);
3855 gp = lock_graph[i];
3856 mutex_exit(&flock_lock);
3857 if (gp == NULL) {
3858 continue;
3859 }
3860
3861 /* Get list of sleeping locks in current lock graph. */
3862 mutex_enter(&gp->gp_mutex);
3863 for (lock = SLEEPING_HEAD(gp)->l_next;
3864 lock != SLEEPING_HEAD(gp);
3865 lock = nlock) {
3866 nlock = lock->l_next;
3867 /* get NLM id */
3868 lock_nlmid = GETNLMID(lock->l_flock.l_sysid);
3869
3870 /*
3871 * If NLM server request AND nlmid of lock matches
3872 * nlmid of argument, then set the NLM state of the
3873 * lock to "nlm_state."
3874 */
3875 if (IS_LOCKMGR(lock) && nlmid == lock_nlmid) {
3876 SET_NLM_STATE(lock, nlm_state);
3877 }
3878 }
3879
3880 /* Get list of active locks in current lock graph. */
3881 for (lock = ACTIVE_HEAD(gp)->l_next;
3882 lock != ACTIVE_HEAD(gp);
3883 lock = nlock) {
3884 nlock = lock->l_next;
3885 /* get NLM id */
3886 lock_nlmid = GETNLMID(lock->l_flock.l_sysid);
3887
3888 /*
3889 * If NLM server request AND nlmid of lock matches
3890 * nlmid of argument, then set the NLM state of the
3891 * lock to "nlm_state."
3892 */
3893 if (IS_LOCKMGR(lock) && nlmid == lock_nlmid) {
3894 ASSERT(IS_ACTIVE(lock));
3895 SET_NLM_STATE(lock, nlm_state);
3896 }
3897 }
3898 mutex_exit(&gp->gp_mutex);
3899 }
3900 }
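
/*
 * Note on the iteration pattern used above (and by the routines below):
 * flock_lock is taken only around the lock_graph[i] lookup and dropped
 * before the graph is walked; the walk itself is done under the graph's
 * own gp_mutex.  In outline (illustrative only, not part of the build):
 *
 *	for (i = 0; i < HASH_SIZE; i++) {
 *		mutex_enter(&flock_lock);
 *		gp = lock_graph[i];
 *		mutex_exit(&flock_lock);
 *		if (gp == NULL)
 *			continue;
 *		mutex_enter(&gp->gp_mutex);
 *		... walk the ACTIVE_HEAD(gp)/SLEEPING_HEAD(gp) lists ...
 *		mutex_exit(&gp->gp_mutex);
 *	}
 */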

/*
 * Requires: "nlmid" >= 1 and <= clconf_maximum_nodeid().
 * Effects: Find all sleeping lock manager requests _only_ for the NLM server
 *   identified by "nlmid."  Poke those lock requests.
 */
static void
cl_flk_wakeup_sleeping_nlm_locks(int nlmid)
{
	lock_descriptor_t *lock;
	lock_descriptor_t *nlock = NULL;	/* next lock */
	int i;
	graph_t *gp;
	int lock_nlmid;

	for (i = 0; i < HASH_SIZE; i++) {
		mutex_enter(&flock_lock);
		gp = lock_graph[i];
		mutex_exit(&flock_lock);
		if (gp == NULL) {
			continue;
		}

		mutex_enter(&gp->gp_mutex);
		for (lock = SLEEPING_HEAD(gp)->l_next;
		    lock != SLEEPING_HEAD(gp);
		    lock = nlock) {
			nlock = lock->l_next;
			/*
			 * If NLM server request _and_ nlmid of lock matches
			 * nlmid of argument, then set the NLM state of the
			 * lock to NLM_SHUTTING_DOWN, and wake up sleeping
			 * request.
			 */
			if (IS_LOCKMGR(lock)) {
				/* get NLM id */
				lock_nlmid =
				    GETNLMID(lock->l_flock.l_sysid);
				if (nlmid == lock_nlmid) {
					SET_NLM_STATE(lock,
					    FLK_NLM_SHUTTING_DOWN);
					INTERRUPT_WAKEUP(lock);
				}
			}
		}
		mutex_exit(&gp->gp_mutex);
	}
}

/*
 * Requires: "nlmid" >= 1 and <= clconf_maximum_nodeid()
 * Effects: Find all active (granted) lock manager locks _only_ for the
 *   NLM server identified by "nlmid" and release them.
 */
static void
cl_flk_unlock_nlm_granted(int nlmid)
{
	lock_descriptor_t *lock;
	lock_descriptor_t *nlock = NULL;	/* next lock */
	int i;
	graph_t *gp;
	int lock_nlmid;

	for (i = 0; i < HASH_SIZE; i++) {
		mutex_enter(&flock_lock);
		gp = lock_graph[i];
		mutex_exit(&flock_lock);
		if (gp == NULL) {
			continue;
		}

		mutex_enter(&gp->gp_mutex);
		for (lock = ACTIVE_HEAD(gp)->l_next;
		    lock != ACTIVE_HEAD(gp);
		    lock = nlock) {
			nlock = lock->l_next;
			ASSERT(IS_ACTIVE(lock));

			/*
			 * If it's an NLM server request _and_ nlmid of
			 * the lock matches nlmid of argument, then
			 * remove the active lock from the list, wake up
			 * blocked threads, and free the storage for the
			 * lock.  Note that there's no need to mark the NLM
			 * state of this lock to NLM_DOWN because the lock
			 * will be deleted anyway and its storage freed.
			 */
			if (IS_LOCKMGR(lock)) {
				/* get NLM id */
				lock_nlmid = GETNLMID(lock->l_flock.l_sysid);
				if (nlmid == lock_nlmid) {
					flk_delete_active_lock(lock, 0);
					flk_wakeup(lock, 1);
					flk_free_lock(lock);
				}
			}
		}
		mutex_exit(&gp->gp_mutex);
	}
}

/*
 * Find all sleeping lock manager requests and poke them.
 */
static void
wakeup_sleeping_lockmgr_locks(struct flock_globals *fg)
{
	lock_descriptor_t *lock;
	lock_descriptor_t *nlock = NULL;	/* next lock */
	int i;
	graph_t *gp;
	zoneid_t zoneid = getzoneid();

	for (i = 0; i < HASH_SIZE; i++) {
		mutex_enter(&flock_lock);
		gp = lock_graph[i];
		mutex_exit(&flock_lock);
		if (gp == NULL) {
			continue;
		}

		mutex_enter(&gp->gp_mutex);
		fg->lockmgr_status[i] = FLK_WAKEUP_SLEEPERS;
		for (lock = SLEEPING_HEAD(gp)->l_next;
		    lock != SLEEPING_HEAD(gp);
		    lock = nlock) {
			nlock = lock->l_next;
			if (IS_LOCKMGR(lock) && lock->l_zoneid == zoneid) {
				INTERRUPT_WAKEUP(lock);
			}
		}
		mutex_exit(&gp->gp_mutex);
	}
}


/*
 * Find all active (granted) lock manager locks and release them.
 */
static void
unlock_lockmgr_granted(struct flock_globals *fg)
{
	lock_descriptor_t *lock;
	lock_descriptor_t *nlock = NULL;	/* next lock */
	int i;
	graph_t *gp;
	zoneid_t zoneid = getzoneid();

	for (i = 0; i < HASH_SIZE; i++) {
		mutex_enter(&flock_lock);
		gp = lock_graph[i];
		mutex_exit(&flock_lock);
		if (gp == NULL) {
			continue;
		}

		mutex_enter(&gp->gp_mutex);
		fg->lockmgr_status[i] = FLK_LOCKMGR_DOWN;
		for (lock = ACTIVE_HEAD(gp)->l_next;
		    lock != ACTIVE_HEAD(gp);
		    lock = nlock) {
			nlock = lock->l_next;
			if (IS_LOCKMGR(lock) && lock->l_zoneid == zoneid) {
				ASSERT(IS_ACTIVE(lock));
				flk_delete_active_lock(lock, 0);
				flk_wakeup(lock, 1);
				flk_free_lock(lock);
			}
		}
		mutex_exit(&gp->gp_mutex);
	}
}


/*
 * Wait until a lock is granted, cancelled, or interrupted.
 */

static void
wait_for_lock(lock_descriptor_t *request)
{
	graph_t *gp = request->l_graph;

	ASSERT(MUTEX_HELD(&gp->gp_mutex));

	while (!(IS_GRANTED(request)) && !(IS_CANCELLED(request)) &&
	    !(IS_INTERRUPTED(request))) {
		if (!cv_wait_sig(&request->l_cv, &gp->gp_mutex)) {
			flk_set_state(request, FLK_INTERRUPTED_STATE);
			request->l_state |= INTERRUPTED_LOCK;
		}
	}
}
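
/*
 * cv_wait_sig() returns 0 when the sleep is interrupted by a signal, so
 * the loop in wait_for_lock() marks the request
 * FLK_INTERRUPTED_STATE/INTERRUPTED_LOCK in that case and the loop
 * condition then terminates the wait.  The caller re-acquires nothing:
 * gp_mutex is still held on return, and the caller is expected to check
 * the request's GRANTED/CANCELLED/INTERRUPTED state to decide what to do.
 */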

/*
 * Create an flock structure from the existing lock information
 *
 * This routine is used to create flock structures for the lock manager
 * to use in a reclaim request.  Since the lock was originated on this
 * host, it must conform to UNIX semantics, so no checking is
 * done to make sure it falls within the lower half of the 32-bit range.
 */

static void
create_flock(lock_descriptor_t *lp, flock64_t *flp)
{
	ASSERT(lp->l_end == MAX_U_OFFSET_T || lp->l_end <= MAXEND);
	ASSERT(lp->l_end >= lp->l_start);

	flp->l_type = lp->l_type;
	flp->l_whence = 0;
	flp->l_start = lp->l_start;
	flp->l_len = (lp->l_end == MAX_U_OFFSET_T) ? 0 :
	    (lp->l_end - lp->l_start + 1);
	flp->l_sysid = lp->l_flock.l_sysid;
	flp->l_pid = lp->l_flock.l_pid;
}
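
/*
 * Worked example of the conversion above: an internal lock covering bytes
 * 10 through 19 (l_start == 10, l_end == 19) becomes l_start = 10,
 * l_len = 19 - 10 + 1 = 10; a whole-file lock (l_end == MAX_U_OFFSET_T)
 * becomes l_len = 0, the flock convention for "to end of file."
 */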

/*
 * Convert flock_t data describing a lock range into unsigned long starting
 * and ending points, which are put into lock_request.  Returns 0 or an
 * errno value.
 */

int
flk_convert_lock_data(vnode_t *vp, flock64_t *flp,
    u_offset_t *start, u_offset_t *end, offset_t offset)
{
	struct vattr vattr;
	int error;

	/*
	 * Determine the starting point of the request
	 */
	switch (flp->l_whence) {
	case 0:		/* SEEK_SET */
		*start = (u_offset_t)flp->l_start;
		break;
	case 1:		/* SEEK_CUR */
		*start = (u_offset_t)(flp->l_start + offset);
		break;
	case 2:		/* SEEK_END */
		vattr.va_mask = AT_SIZE;
		if (error = VOP_GETATTR(vp, &vattr, 0, CRED(), NULL))
			return (error);
		*start = (u_offset_t)(flp->l_start + vattr.va_size);
		break;
	default:
		return (EINVAL);
	}

	/*
	 * Determine the range covered by the request.
	 */
	if (flp->l_len == 0)
		*end = MAX_U_OFFSET_T;
	else if ((offset_t)flp->l_len > 0) {
		*end = (u_offset_t)(*start + (flp->l_len - 1));
	} else {
		/*
		 * Negative length; why do we even allow this?
		 * Because this allows easy specification of
		 * the last n bytes of the file.
		 */
		*end = *start;
		*start += (u_offset_t)flp->l_len;
		(*start)++;
	}
	return (0);
}
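
/*
 * Worked examples of the range computation above (the l_whence adjustment
 * has already produced *start):
 *
 *	l_len == 0:			*end = MAX_U_OFFSET_T, i.e. the
 *					lock runs to end of file.
 *	*start == 100, l_len == 50:	*end = 100 + 50 - 1 = 149,
 *					covering bytes 100..149.
 *	*start == 100, l_len == -10:	*end = 100, *start = 100 - 10 + 1
 *					= 91, covering the ten bytes
 *					91..100 that end at the original
 *					starting offset.
 */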

/*
 * Check the validity of lock data.  This can be used by the NFS
 * frlock routines to check data before contacting the server.  The
 * server must support semantics that aren't as restrictive as
 * the UNIX API, so the NFS client is required to check.
 * The maximum is now passed in by the caller.
 */

int
flk_check_lock_data(u_offset_t start, u_offset_t end, offset_t max)
{
	/*
	 * The end (length) for local locking should never be greater
	 * than MAXEND.  However, the representation for
	 * the entire file is MAX_U_OFFSET_T.
	 */
	if ((start > max) ||
	    ((end > max) && (end != MAX_U_OFFSET_T))) {
		return (EINVAL);
	}
	if (start > end) {
		return (EINVAL);
	}
	return (0);
}
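
/*
 * For example, with max == MAXEND a request of start == 0 and
 * end == MAX_U_OFFSET_T (the whole file) passes, while start == 100,
 * end == 50 fails with EINVAL because the range is inverted, and any
 * start beyond "max" fails regardless of the end point.
 */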

/*
 * Fill in request->l_flock with information about the lock blocking the
 * request.  The complexity here is that lock manager requests are allowed
 * to see into the upper part of the 32-bit address range, whereas local
 * requests are only allowed to see signed values.
 *
 * What should be done when "blocker" is a lock manager lock that uses the
 * upper portion of the 32-bit range, but "request" is local?  Since the
 * request has already been determined to have been blocked by the blocker,
 * at least some portion of "blocker" must be in the range of the request,
 * or the request extends to the end of file.  For the first case, the
 * portion in the lower range is returned with the indication that it goes
 * "to EOF."  For the second case, the last byte of the lower range is
 * returned with the indication that it goes "to EOF."
 */

static void
report_blocker(lock_descriptor_t *blocker, lock_descriptor_t *request)
{
	flock64_t *flrp;		/* l_flock portion of request */

	ASSERT(blocker != NULL);

	flrp = &request->l_flock;
	flrp->l_whence = 0;
	flrp->l_type = blocker->l_type;
	flrp->l_pid = blocker->l_flock.l_pid;
	flrp->l_sysid = blocker->l_flock.l_sysid;
	request->l_ofd = blocker->l_ofd;

	if (IS_LOCKMGR(request)) {
		flrp->l_start = blocker->l_start;
		if (blocker->l_end == MAX_U_OFFSET_T)
			flrp->l_len = 0;
		else
			flrp->l_len = blocker->l_end - blocker->l_start + 1;
	} else {
		if (blocker->l_start > MAXEND) {
			flrp->l_start = MAXEND;
			flrp->l_len = 0;
		} else {
			flrp->l_start = blocker->l_start;
			if (blocker->l_end == MAX_U_OFFSET_T)
				flrp->l_len = 0;
			else
				flrp->l_len = blocker->l_end -
				    blocker->l_start + 1;
		}
	}
}
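
/*
 * Worked example: when a local request is blocked by a lock manager lock
 * whose l_start lies above MAXEND, the blocker is reported to the local
 * caller as starting at MAXEND with l_len == 0 ("to EOF"), since a local
 * caller cannot represent offsets in the upper part of the range.  A
 * blocker over [100, MAX_U_OFFSET_T] is reported as l_start = 100,
 * l_len = 0, and one over [100, 199] as l_start = 100, l_len = 100.
 */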

/*
 * PSARC case 1997/292
 */
/*
 * This is the public routine exported by flock.h.
 */
void
cl_flk_change_nlm_state_to_unknown(int nlmid)
{
	/*
	 * Check to see if node is booted as a cluster.  If not, return.
	 */
	if ((cluster_bootflags & CLUSTER_BOOTED) == 0) {
		return;
	}

	/*
	 * See comment in cl_flk_set_nlm_status().
	 */
	if (nlm_reg_status == NULL) {
		return;
	}

	/*
	 * protect NLM registry state with a mutex.
	 */
	ASSERT(nlmid <= nlm_status_size && nlmid >= 0);
	mutex_enter(&nlm_reg_lock);
	FLK_REGISTRY_CHANGE_NLM_STATE(nlm_reg_status, nlmid, FLK_NLM_UNKNOWN);
	mutex_exit(&nlm_reg_lock);
}

/*
 * Return non-zero if the given I/O request conflicts with an active NBMAND
 * lock.
 * If svmand is non-zero, it means look at all active locks, not just NBMAND
 * locks.
 */

int
nbl_lock_conflict(vnode_t *vp, nbl_op_t op, u_offset_t offset,
    ssize_t length, int svmand, caller_context_t *ct)
{
	int conflict = 0;
	graph_t *gp;
	lock_descriptor_t *lock;
	pid_t pid;
	int sysid;

	if (ct == NULL) {
		pid = curproc->p_pid;
		sysid = 0;
	} else {
		pid = ct->cc_pid;
		sysid = ct->cc_sysid;
	}

	mutex_enter(&flock_lock);
	gp = lock_graph[HASH_INDEX(vp)];
	mutex_exit(&flock_lock);
	if (gp == NULL)
		return (0);

	mutex_enter(&gp->gp_mutex);
	SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);

	for (; lock && lock->l_vnode == vp; lock = lock->l_next) {
		if ((svmand || (lock->l_state & NBMAND_LOCK)) &&
		    (lock->l_flock.l_sysid != sysid ||
		    lock->l_flock.l_pid != pid) &&
		    lock_blocks_io(op, offset, length,
		    lock->l_type, lock->l_start, lock->l_end)) {
			DTRACE_PROBE1(conflict_lock,
			    lock_descriptor_t *, lock);
			conflict = 1;
			break;
		}
	}
	mutex_exit(&gp->gp_mutex);

	return (conflict);
}
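
/*
 * Illustrative only (not part of the build): a file system checking
 * whether a write it is about to perform collides with someone else's
 * mandatory (nbmand) lock might call the routine above roughly as
 * follows; the surrounding critical-region protocol from sys/nbmlock.h
 * and the error chosen (EACCES here) are elided/illustrative.
 *
 *	if (nbl_lock_conflict(vp, NBL_WRITE, uiop->uio_loffset,
 *	    uiop->uio_resid, svmand, ct)) {
 *		error = EACCES;		(I/O overlaps a foreign lock)
 *	}
 */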

/*
 * Return non-zero if the given I/O request conflicts with the given lock.
 */

static int
lock_blocks_io(nbl_op_t op, u_offset_t offset, ssize_t length,
    int lock_type, u_offset_t lock_start, u_offset_t lock_end)
{
	ASSERT(op == NBL_READ || op == NBL_WRITE || op == NBL_READWRITE);
	ASSERT(lock_type == F_RDLCK || lock_type == F_WRLCK);

	if (op == NBL_READ && lock_type == F_RDLCK)
		return (0);

	if (offset <= lock_start && lock_start < offset + length)
		return (1);
	if (lock_start <= offset && offset <= lock_end)
		return (1);

	return (0);
}
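
/*
 * Worked example of the overlap test above: an NBL_WRITE of 50 bytes at
 * offset 100 (covering 100..149) conflicts with an F_RDLCK over
 * [120, 200] because 100 <= 120 && 120 < 150; the same 50-byte NBL_READ
 * does not conflict, since a read never collides with a read lock.  A
 * request starting at offset 150 against a lock over [100, 149] matches
 * neither test (150 > 149), so no conflict is reported.
 */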

#ifdef DEBUG
/*
 * Debug sanity check: every lock on the active list must be active and
 * unblocked, all of its incoming edges must come from sleeping (blocked)
 * requests, and no two active locks on the same vnode may block each
 * other.
 */
static void
check_active_locks(graph_t *gp)
{
	lock_descriptor_t *lock, *lock1;
	edge_t *ep;

	for (lock = ACTIVE_HEAD(gp)->l_next; lock != ACTIVE_HEAD(gp);
	    lock = lock->l_next) {
		ASSERT(IS_ACTIVE(lock));
		ASSERT(NOT_BLOCKED(lock));
		ASSERT(!IS_BARRIER(lock));

		ep = FIRST_IN(lock);

		while (ep != HEAD(lock)) {
			ASSERT(IS_SLEEPING(ep->from_vertex));
			ASSERT(!NOT_BLOCKED(ep->from_vertex));
			ep = NEXT_IN(ep);
		}

		for (lock1 = lock->l_next; lock1 != ACTIVE_HEAD(gp);
		    lock1 = lock1->l_next) {
			if (lock1->l_vnode == lock->l_vnode) {
				if (BLOCKS(lock1, lock)) {
					cmn_err(CE_PANIC,
					    "active lock %p blocks %p",
					    (void *)lock1, (void *)lock);
				} else if (BLOCKS(lock, lock1)) {
					cmn_err(CE_PANIC,
					    "active lock %p blocks %p",
					    (void *)lock, (void *)lock1);
				}
			}
		}
	}
}

/*
 * Effect: This function checks to see if the transition from 'old_state' to
 *	'new_state' is a valid one.  It returns 0 if the transition is valid
 *	and 1 if it is not.
 *	For a map of valid transitions, see sys/flock_impl.h
 */
static int
check_lock_transition(int old_state, int new_state)
{
	switch (old_state) {
	case FLK_INITIAL_STATE:
		if ((new_state == FLK_START_STATE) ||
		    (new_state == FLK_SLEEPING_STATE) ||
		    (new_state == FLK_ACTIVE_STATE) ||
		    (new_state == FLK_DEAD_STATE)) {
			return (0);
		} else {
			return (1);
		}
	case FLK_START_STATE:
		if ((new_state == FLK_ACTIVE_STATE) ||
		    (new_state == FLK_DEAD_STATE)) {
			return (0);
		} else {
			return (1);
		}
	case FLK_ACTIVE_STATE:
		if (new_state == FLK_DEAD_STATE) {
			return (0);
		} else {
			return (1);
		}
	case FLK_SLEEPING_STATE:
		if ((new_state == FLK_GRANTED_STATE) ||
		    (new_state == FLK_INTERRUPTED_STATE) ||
		    (new_state == FLK_CANCELLED_STATE)) {
			return (0);
		} else {
			return (1);
		}
	case FLK_GRANTED_STATE:
		if ((new_state == FLK_START_STATE) ||
		    (new_state == FLK_INTERRUPTED_STATE) ||
		    (new_state == FLK_CANCELLED_STATE)) {
			return (0);
		} else {
			return (1);
		}
	case FLK_CANCELLED_STATE:
		if ((new_state == FLK_INTERRUPTED_STATE) ||
		    (new_state == FLK_DEAD_STATE)) {
			return (0);
		} else {
			return (1);
		}
	case FLK_INTERRUPTED_STATE:
		if (new_state == FLK_DEAD_STATE) {
			return (0);
		} else {
			return (1);
		}
	case FLK_DEAD_STATE:
		/* May be set more than once */
		if (new_state == FLK_DEAD_STATE) {
			return (0);
		} else {
			return (1);
		}
	default:
		return (1);
	}
}

/*
 * Debug sanity check: for every sleeping request, any lock (sleeping or
 * active) on the same vnode that blocks it must be reachable from it in
 * the graph, and every outgoing edge must point at a lock that really
 * blocks it.
 */
static void
check_sleeping_locks(graph_t *gp)
{
	lock_descriptor_t *lock1, *lock2;
	edge_t *ep;

	for (lock1 = SLEEPING_HEAD(gp)->l_next; lock1 != SLEEPING_HEAD(gp);
	    lock1 = lock1->l_next) {
		ASSERT(!IS_BARRIER(lock1));
		for (lock2 = lock1->l_next; lock2 != SLEEPING_HEAD(gp);
		    lock2 = lock2->l_next) {
			if (lock1->l_vnode == lock2->l_vnode) {
				if (BLOCKS(lock2, lock1)) {
					ASSERT(!IS_GRANTED(lock1));
					ASSERT(!NOT_BLOCKED(lock1));
					path(lock1, lock2);
				}
			}
		}

		for (lock2 = ACTIVE_HEAD(gp)->l_next; lock2 != ACTIVE_HEAD(gp);
		    lock2 = lock2->l_next) {
			ASSERT(!IS_BARRIER(lock1));
			if (lock1->l_vnode == lock2->l_vnode) {
				if (BLOCKS(lock2, lock1)) {
					ASSERT(!IS_GRANTED(lock1));
					ASSERT(!NOT_BLOCKED(lock1));
					path(lock1, lock2);
				}
			}
		}
		ep = FIRST_ADJ(lock1);
		while (ep != HEAD(lock1)) {
			ASSERT(BLOCKS(ep->to_vertex, lock1));
			ep = NEXT_ADJ(ep);
		}
	}
}

/*
 * Debug helper: iterative depth-first search over the blocking graph.
 * Starting from the vertices directly adjacent to lock1, return 1 if
 * lock2 is reached through at least one intermediate vertex, and 0
 * otherwise.  If no_path is set, assert along the way that lock1 has no
 * direct edge to lock2.
 */
static int
level_two_path(lock_descriptor_t *lock1, lock_descriptor_t *lock2, int no_path)
{
	edge_t *ep;
	lock_descriptor_t *vertex;
	lock_descriptor_t *vertex_stack;

	STACK_INIT(vertex_stack);

	flk_graph_uncolor(lock1->l_graph);
	ep = FIRST_ADJ(lock1);
	ASSERT(ep != HEAD(lock1));
	while (ep != HEAD(lock1)) {
		if (no_path)
			ASSERT(ep->to_vertex != lock2);
		STACK_PUSH(vertex_stack, ep->to_vertex, l_dstack);
		COLOR(ep->to_vertex);
		ep = NEXT_ADJ(ep);
	}

	while ((vertex = STACK_TOP(vertex_stack)) != NULL) {
		STACK_POP(vertex_stack, l_dstack);
		for (ep = FIRST_ADJ(vertex); ep != HEAD(vertex);
		    ep = NEXT_ADJ(ep)) {
			if (COLORED(ep->to_vertex))
				continue;
			COLOR(ep->to_vertex);
			if (ep->to_vertex == lock2)
				return (1);

			STACK_PUSH(vertex_stack, ep->to_vertex, l_dstack);
		}
	}
	return (0);
}

/*
 * Debug check: panic if any active or sleeping lock on vnode vp is owned
 * by the given (pid, sysid) pair.
 */
static void
check_owner_locks(graph_t *gp, pid_t pid, int sysid, vnode_t *vp)
{
	lock_descriptor_t *lock;

	/* Ignore OFD style locks since they're not process-wide. */
	if (pid == 0)
		return;

	SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);

	if (lock) {
		while (lock != ACTIVE_HEAD(gp) && (lock->l_vnode == vp)) {
			if (lock->l_flock.l_pid == pid &&
			    lock->l_flock.l_sysid == sysid)
				cmn_err(CE_PANIC,
				    "owner pid %d's lock %p in active queue",
				    pid, (void *)lock);
			lock = lock->l_next;
		}
	}
	SET_LOCK_TO_FIRST_SLEEP_VP(gp, lock, vp);

	if (lock) {
		while (lock != SLEEPING_HEAD(gp) && (lock->l_vnode == vp)) {
			if (lock->l_flock.l_pid == pid &&
			    lock->l_flock.l_sysid == sysid)
				cmn_err(CE_PANIC,
				    "owner pid %d's lock %p in sleep queue",
				    pid, (void *)lock);
			lock = lock->l_next;
		}
	}
}

/*
 * Return 1 if there is a direct edge from lock1 to lock2, 0 otherwise.
 */
static int
level_one_path(lock_descriptor_t *lock1, lock_descriptor_t *lock2)
{
	edge_t *ep = FIRST_ADJ(lock1);

	while (ep != HEAD(lock1)) {
		if (ep->to_vertex == lock2)
			return (1);
		else
			ep = NEXT_ADJ(ep);
	}
	return (0);
}

/*
 * Return 1 if lock2 cannot be reached from lock1 at all (asserting that
 * no direct edge exists either), 0 otherwise.
 */
static int
no_path(lock_descriptor_t *lock1, lock_descriptor_t *lock2)
{
	return (!level_two_path(lock1, lock2, 1));
}

/*
 * Consistency check used by check_sleeping_locks(): lock2 blocks lock1,
 * so some path from lock1 to lock2 must exist in the graph.  Warn if
 * there is both a direct edge and a longer path, and panic if there is
 * no path at all.
 */
static void
path(lock_descriptor_t *lock1, lock_descriptor_t *lock2)
{
	if (level_one_path(lock1, lock2)) {
		if (level_two_path(lock1, lock2, 0) != 0) {
			cmn_err(CE_WARN,
			    "one edge one path from lock1 %p lock2 %p",
			    (void *)lock1, (void *)lock2);
		}
	} else if (no_path(lock1, lock2)) {
		cmn_err(CE_PANIC,
		    "No path from lock1 %p to lock2 %p",
		    (void *)lock1, (void *)lock2);
	}
}
#endif /* DEBUG */