/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
/*	All Rights Reserved	*/

/*
 * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
 * Copyright 2015 Joyent, Inc.
 */

#include <sys/flock_impl.h>
#include <sys/vfs.h>
#include <sys/t_lock.h>		/* for <sys/callb.h> */
#include <sys/callb.h>
#include <sys/clconf.h>
#include <sys/cladm.h>
#include <sys/nbmlock.h>
#include <sys/cred.h>
#include <sys/policy.h>

/*
 * The following variables are for statistics purposes and they are
 * not protected by locks. They may not be accurate but will at least be
 * close to the actual value.
 */

int	flk_lock_allocs;
int	flk_lock_frees;
int	edge_allocs;
int	edge_frees;
int	flk_proc_vertex_allocs;
int	flk_proc_edge_allocs;
int	flk_proc_vertex_frees;
int	flk_proc_edge_frees;

static kmutex_t flock_lock;

#ifdef DEBUG
int check_debug = 0;
#define	CHECK_ACTIVE_LOCKS(gp)	if (check_debug) \
					check_active_locks(gp);
#define	CHECK_SLEEPING_LOCKS(gp)	if (check_debug) \
						check_sleeping_locks(gp);
#define	CHECK_OWNER_LOCKS(gp, pid, sysid, vp)	\
		if (check_debug)	\
			check_owner_locks(gp, pid, sysid, vp);
#define	CHECK_LOCK_TRANSITION(old_state, new_state) \
	{ \
		if (check_lock_transition(old_state, new_state)) { \
			cmn_err(CE_PANIC, "Illegal lock transition \
			    from %d to %d", old_state, new_state); \
		} \
	}
#else

#define	CHECK_ACTIVE_LOCKS(gp)
#define	CHECK_SLEEPING_LOCKS(gp)
#define	CHECK_OWNER_LOCKS(gp, pid, sysid, vp)
#define	CHECK_LOCK_TRANSITION(old_state, new_state)

#endif /* DEBUG */

struct kmem_cache	*flk_edge_cache;

graph_t		*lock_graph[HASH_SIZE];
proc_graph_t	pgraph;

/*
 * Clustering.
 *
 * NLM REGISTRY TYPE IMPLEMENTATION
 *
 * Assumptions:
 *  1.  Nodes in a cluster are numbered starting at 1; always non-negative
 *	integers; the maximum node id is returned by clconf_maximum_nodeid().
 *  2.  We use this node id to identify the node an NLM server runs on.
 */

/*
 * NLM registry object keeps track of NLM servers via their
 * nlmids (which are the node ids of the node in the cluster they run on)
 * that have requested locks at this LLM with which this registry is
 * associated.
 *
 * Representation of abstraction:
 *    rep = record[	states: array[nlm_state],
 *			lock:	mutex]
 *
 * Representation invariants:
 *	1.  Index i of rep.states is between 0 and n - 1, where n is the
 *	    number of elements in the array, which happens to be the maximum
 *	    number of nodes in the cluster configuration + 1.
 *	2.  Map nlmid to index i of rep.states
 *		0   -> 0
 *		1   -> 1
 *		2   -> 2
 *		n-1 -> clconf_maximum_nodeid()+1
 *	3.  This 1-1 mapping is quite convenient and it avoids errors resulting
 *	    from forgetting to subtract 1 from the index.
 *	4.  The reason we keep the 0th index is the following.  A legitimate
 *	    cluster configuration includes making a UFS file system NFS
 *	    exportable.  The code is structured so that if you're in a cluster
 *	    you do one thing; otherwise, you do something else.  The problem
 *	    is what to do if you think you're in a cluster with PXFS loaded,
 *	    but you're using UFS not PXFS?  The upper two bytes of the sysid
 *	    encode the node id of the node where the NLM server runs; these
 *	    bytes are zero for UFS.  Since the nodeid is used to index into
 *	    the registry, we can record the NLM server state information at
 *	    index 0 using the same mechanism used for PXFS file locks!
 */
static flk_nlm_status_t *nlm_reg_status = NULL;	/* state array 0..N-1 */
static kmutex_t nlm_reg_lock;			/* lock to protect array */
static uint_t nlm_status_size;			/* size of state array */

/*
 * Although we need a global lock dependency graph (and associated data
 * structures), we also need a per-zone notion of whether the lock manager is
 * running, and so whether to allow lock manager requests or not.
 *
 * Thus, on a per-zone basis we maintain a ``global'' variable
 * (flk_lockmgr_status), protected by flock_lock, and set when the lock
 * manager is determined to be changing state (starting or stopping).
 *
 * Each graph/zone pair also has a copy of this variable, which is protected by
 * the graph's mutex.
 *
 * The per-graph copies are used to synchronize lock requests with shutdown
 * requests.  The global copy is used to initialize the per-graph field when a
 * new graph is created.
 */
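/*
 * Illustrative sketch only (not compiled): the two-level status check
 * described above, paraphrasing checks performed later in this file.  A new
 * lock manager request consults the zone-wide copy under flock_lock, while a
 * request already queued against a graph re-checks that graph's per-bucket
 * copy (indexed by the vnode's hash bucket) under gp_mutex after waking:
 *
 *	mutex_enter(&flock_lock);
 *	ok = (flk_get_lockmgr_status() == FLK_LOCKMGR_UP);
 *	mutex_exit(&flock_lock);
 *	...
 *	if (fg->lockmgr_status[HASH_INDEX(vp)] != FLK_LOCKMGR_UP)
 *		... cancel or reject the request ...
 */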
struct flock_globals {
	flk_lockmgr_status_t flk_lockmgr_status;
	flk_lockmgr_status_t lockmgr_status[HASH_SIZE];
};

zone_key_t flock_zone_key;

static void create_flock(lock_descriptor_t *, flock64_t *);
static lock_descriptor_t *flk_get_lock(void);
static void flk_free_lock(lock_descriptor_t *lock);
static void flk_get_first_blocking_lock(lock_descriptor_t *request);
static int flk_process_request(lock_descriptor_t *);
static int flk_add_edge(lock_descriptor_t *, lock_descriptor_t *, int, int);
static edge_t *flk_get_edge(void);
static int flk_wait_execute_request(lock_descriptor_t *);
static int flk_relation(lock_descriptor_t *, lock_descriptor_t *);
static void flk_insert_active_lock(lock_descriptor_t *);
static void flk_delete_active_lock(lock_descriptor_t *, int);
static void flk_insert_sleeping_lock(lock_descriptor_t *);
static void flk_graph_uncolor(graph_t *);
static void flk_wakeup(lock_descriptor_t *, int);
static void flk_free_edge(edge_t *);
static void flk_recompute_dependencies(lock_descriptor_t *,
			lock_descriptor_t **, int, int);
static int flk_find_barriers(lock_descriptor_t *);
static void flk_update_barriers(lock_descriptor_t *);
static int flk_color_reachables(lock_descriptor_t *);
static int flk_canceled(lock_descriptor_t *);
static void flk_delete_locks_by_sysid(lock_descriptor_t *);
static void report_blocker(lock_descriptor_t *, lock_descriptor_t *);
static void wait_for_lock(lock_descriptor_t *);
static void unlock_lockmgr_granted(struct flock_globals *);
static void wakeup_sleeping_lockmgr_locks(struct flock_globals *);

/* Clustering hooks */
static void cl_flk_change_nlm_state_all_locks(int, flk_nlm_status_t);
static void cl_flk_wakeup_sleeping_nlm_locks(int);
static void cl_flk_unlock_nlm_granted(int);

#ifdef DEBUG
static int check_lock_transition(int, int);
static void check_sleeping_locks(graph_t *);
static void check_active_locks(graph_t *);
static int no_path(lock_descriptor_t *, lock_descriptor_t *);
static void path(lock_descriptor_t *, lock_descriptor_t *);
static void check_owner_locks(graph_t *, pid_t, int, vnode_t *);
static int level_one_path(lock_descriptor_t *, lock_descriptor_t *);
static int level_two_path(lock_descriptor_t *, lock_descriptor_t *, int);
#endif

/* proc_graph function definitions */
static int flk_check_deadlock(lock_descriptor_t *);
static void flk_proc_graph_uncolor(void);
static proc_vertex_t *flk_get_proc_vertex(lock_descriptor_t *);
static proc_edge_t *flk_get_proc_edge(void);
static void flk_proc_release(proc_vertex_t *);
static void flk_free_proc_edge(proc_edge_t *);
static void flk_update_proc_graph(edge_t *, int);

/* Non-blocking mandatory locking */
static int lock_blocks_io(nbl_op_t, u_offset_t, ssize_t, int, u_offset_t,
			u_offset_t);

static struct flock_globals *
flk_get_globals(void)
{
	/*
	 * The KLM module had better be loaded if we're attempting to handle
	 * lockmgr requests.
	 */
	ASSERT(flock_zone_key != ZONE_KEY_UNINITIALIZED);
	return (zone_getspecific(flock_zone_key, curproc->p_zone));
}

static flk_lockmgr_status_t
flk_get_lockmgr_status(void)
{
	struct flock_globals *fg;

	ASSERT(MUTEX_HELD(&flock_lock));

	if (flock_zone_key == ZONE_KEY_UNINITIALIZED) {
		/*
		 * KLM module not loaded; lock manager definitely not running.
		 */
		return (FLK_LOCKMGR_DOWN);
	}
	fg = flk_get_globals();
	return (fg->flk_lockmgr_status);
}

/*
 * This implements Open File Description (not descriptor) style record locking.
 * These locks can also be thought of as pid-less since they are not tied to a
 * specific process, thus they're preserved across fork.
 *
 * Called directly from fcntl.
 *
 * See reclock() for the implementation of the traditional POSIX style record
 * locking scheme (pid-ful). This function is derived from reclock() but
 * simplified and modified to work for OFD style locking.
 *
 * The two primary advantages of OFD style of locking are:
 * 1) It is per-file description, so closing a file descriptor that refers to a
 *    different file description for the same file will not drop the lock (i.e.
 *    two opens of the same file get different descriptions, but a dup or fork
 *    will refer to the same description).
 * 2) Locks are preserved across fork(2).
 *
 * Because these locks are per-description, a lock ptr lives at the f_filock
 * member of the file_t and the lock_descriptor includes a file_t pointer
 * to enable unique lock identification and management.
 *
 * Since these locks are pid-less we cannot do deadlock detection with the
 * current process-oriented implementation. This is consistent with OFD locking
 * behavior on other operating systems such as Linux. Since we don't do
 * deadlock detection we never interact with the process graph that is
 * maintained for deadlock detection on the traditional POSIX-style locks.
 *
 * Future Work:
 *
 * The current implementation does not support record locks. That is,
 * currently the single lock must cover the entire file. This is validated in
 * fcntl. To support record locks the f_filock pointer in the file_t needs to
 * be changed to a list of pointers to the locks. That list needs to be
 * managed independently of the lock list on the vnode itself and it needs to
 * be maintained as record locks are created, split, coalesced and deleted.
 *
 * The current implementation does not support remote file systems (e.g.
 * NFS or CIFS). This is handled in fs_frlock(). The design of how OFD locks
 * interact with the NLM is not clear since the NLM protocol/implementation
 * appears to be oriented around locks associated with a process. A further
 * problem is that a design is needed for what nlm_send_siglost() should do and
 * where it will send SIGLOST. More recent versions of Linux apparently try to
 * emulate OFD locks on NFS by converting them to traditional POSIX style locks
 * that work with the NLM. It is not clear that this provides the correct
 * semantics in all cases.
 */
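/*
 * Illustrative sketch only (not part of this file): a userland caller
 * reaches ofdlock() through fcntl(2) roughly as follows.  Because only
 * whole-file locks are supported today (see "Future Work" above),
 * l_whence, l_start and l_len must all remain zero.  The userland
 * structure and error handling shown here are for orientation only:
 *
 *	struct flock fl;
 *
 *	bzero(&fl, sizeof (fl));
 *	fl.l_type = F_WRLCK;		(l_whence/l_start/l_len stay 0)
 *	if (fcntl(fd, F_OFD_SETLK, &fl) == -1)
 *		... handle EAGAIN (conflicting lock) or EBADF ...
 */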
int
ofdlock(file_t *fp, int fcmd, flock64_t *lckdat, int flag, u_offset_t offset)
{
	int cmd = 0;
	vnode_t *vp;
	lock_descriptor_t	stack_lock_request;
	lock_descriptor_t	*lock_request;
	int error = 0;
	graph_t	*gp;
	int serialize = 0;

	if (fcmd != F_OFD_GETLK)
		cmd = SETFLCK;

	if (fcmd == F_OFD_SETLKW || fcmd == F_FLOCKW)
		cmd |= SLPFLCK;

	/* see block comment */
	VERIFY(lckdat->l_whence == 0);
	VERIFY(lckdat->l_start == 0);
	VERIFY(lckdat->l_len == 0);

	vp = fp->f_vnode;

	/*
	 * For reclock fs_frlock() would normally have set these in a few
	 * places but for us it's cleaner to centralize it here. Note that
	 * IGN_PID is -1. We use 0 for our pid-less locks.
	 */
	lckdat->l_pid = 0;
	lckdat->l_sysid = 0;

	/*
	 * Check access permissions
	 */
	if ((fcmd == F_OFD_SETLK || fcmd == F_OFD_SETLKW) &&
	    ((lckdat->l_type == F_RDLCK && (flag & FREAD) == 0) ||
	    (lckdat->l_type == F_WRLCK && (flag & FWRITE) == 0)))
		return (EBADF);

	/*
	 * for query and unlock we use the stack_lock_request
	 */
	if (lckdat->l_type == F_UNLCK || !(cmd & SETFLCK)) {
		lock_request = &stack_lock_request;
		(void) bzero((caddr_t)lock_request,
		    sizeof (lock_descriptor_t));

		/*
		 * following is added to make the assertions in
		 * flk_execute_request() pass
		 */
		lock_request->l_edge.edge_in_next = &lock_request->l_edge;
		lock_request->l_edge.edge_in_prev = &lock_request->l_edge;
		lock_request->l_edge.edge_adj_next = &lock_request->l_edge;
		lock_request->l_edge.edge_adj_prev = &lock_request->l_edge;
		lock_request->l_status = FLK_INITIAL_STATE;
	} else {
		lock_request = flk_get_lock();
		fp->f_filock = (struct filock *)lock_request;
	}
	lock_request->l_state = 0;
	lock_request->l_vnode = vp;
	lock_request->l_zoneid = getzoneid();
	lock_request->l_ofd = fp;

	/*
	 * Convert the request range into the canonical start and end
	 * values then check the validity of the lock range.
	 */
	error = flk_convert_lock_data(vp, lckdat, &lock_request->l_start,
	    &lock_request->l_end, offset);
	if (error)
		goto done;

	error = flk_check_lock_data(lock_request->l_start, lock_request->l_end,
	    MAXEND);
	if (error)
		goto done;

	ASSERT(lock_request->l_end >= lock_request->l_start);

	lock_request->l_type = lckdat->l_type;
	if (cmd & SLPFLCK)
		lock_request->l_state |= WILLING_TO_SLEEP_LOCK;

	if (!(cmd & SETFLCK)) {
		if (lock_request->l_type == F_RDLCK ||
		    lock_request->l_type == F_WRLCK)
			lock_request->l_state |= QUERY_LOCK;
	}
	lock_request->l_flock = (*lckdat);

	/*
	 * We are ready for processing the request
	 */

	if (fcmd != F_OFD_GETLK && lock_request->l_type != F_UNLCK &&
	    nbl_need_check(vp)) {
		nbl_start_crit(vp, RW_WRITER);
		serialize = 1;
	}

	/* Get the lock graph for a particular vnode */
	gp = flk_get_lock_graph(vp, FLK_INIT_GRAPH);

	mutex_enter(&gp->gp_mutex);

	lock_request->l_state |= REFERENCED_LOCK;
	lock_request->l_graph = gp;

	switch (lock_request->l_type) {
	case F_RDLCK:
	case F_WRLCK:
		if (IS_QUERY_LOCK(lock_request)) {
			flk_get_first_blocking_lock(lock_request);
			if (lock_request->l_ofd != NULL)
				lock_request->l_flock.l_pid = -1;
			(*lckdat) = lock_request->l_flock;
		} else {
			/* process the request now */
			error = flk_process_request(lock_request);
		}
		break;

	case F_UNLCK:
		/* unlock request will not block so execute it immediately */
		error = flk_execute_request(lock_request);
		break;

	default:
		error = EINVAL;
		break;
	}

	if (lock_request == &stack_lock_request) {
		flk_set_state(lock_request, FLK_DEAD_STATE);
	} else {
		lock_request->l_state &= ~REFERENCED_LOCK;
		if ((error != 0) || IS_DELETED(lock_request)) {
			flk_set_state(lock_request, FLK_DEAD_STATE);
			flk_free_lock(lock_request);
		}
	}

	mutex_exit(&gp->gp_mutex);
	if (serialize)
		nbl_end_crit(vp);

	return (error);

done:
	flk_set_state(lock_request, FLK_DEAD_STATE);
	if (lock_request != &stack_lock_request)
		flk_free_lock(lock_request);
	return (error);
}

/*
 * Remove any lock on the vnode belonging to the given file_t.
 * Called from closef on last close, file_t is locked.
 *
 * This is modeled on the cleanlocks() function but only removes the single
 * lock associated with fp.
 */
void
ofdcleanlock(file_t *fp)
{
	lock_descriptor_t *fplock, *lock, *nlock;
	vnode_t *vp;
	graph_t	*gp;

	ASSERT(MUTEX_HELD(&fp->f_tlock));

	if ((fplock = (lock_descriptor_t *)fp->f_filock) == NULL)
		return;

	fp->f_filock = NULL;
	vp = fp->f_vnode;

	gp = flk_get_lock_graph(vp, FLK_USE_GRAPH);

	if (gp == NULL)
		return;
	mutex_enter(&gp->gp_mutex);

	CHECK_SLEEPING_LOCKS(gp);
	CHECK_ACTIVE_LOCKS(gp);

	SET_LOCK_TO_FIRST_SLEEP_VP(gp, lock, vp);

	if (lock) {
		do {
			nlock = lock->l_next;
			if (fplock == lock) {
				CANCEL_WAKEUP(lock);
				break;
			}
			lock = nlock;
		} while (lock->l_vnode == vp);
	}

	SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);

	if (lock) {
		do {
			nlock = lock->l_next;
			if (fplock == lock) {
				flk_delete_active_lock(lock, 0);
				flk_wakeup(lock, 1);
				flk_free_lock(lock);
				break;
			}
			lock = nlock;
		} while (lock->l_vnode == vp);
	}

	CHECK_SLEEPING_LOCKS(gp);
	CHECK_ACTIVE_LOCKS(gp);
	mutex_exit(&gp->gp_mutex);
}

/*
 * Routine called from fs_frlock in fs/fs_subr.c
 *
 * This implements traditional POSIX style record locking. The two primary
 * drawbacks to this style of locking are:
 * 1) It is per-process, so any close of a file descriptor that refers to the
 *    file will drop the lock (e.g. lock /etc/passwd, call a library function
 *    which opens /etc/passwd to read the file; when the library closes its
 *    file descriptor the application loses its lock and does not know).
 * 2) Locks are not preserved across fork(2).
 *
 * Because these locks are only associated with a PID, they are per-process.
 * This is why any close will drop the lock and is also why, once the process
 * forks, the lock is no longer related to the new process. These locks can
 * be considered PID-ful.
 *
 * See ofdlock() for the implementation of a similar but improved locking
 * scheme.
 */
int
reclock(vnode_t *vp,
	flock64_t *lckdat,
	int cmd,
	int flag,
	u_offset_t offset,
	flk_callback_t *flk_cbp)
{
	lock_descriptor_t	stack_lock_request;
	lock_descriptor_t	*lock_request;
	int error = 0;
	graph_t	*gp;
	int	nlmid;

	/*
	 * Check access permissions
	 */
	if ((cmd & SETFLCK) &&
	    ((lckdat->l_type == F_RDLCK && (flag & FREAD) == 0) ||
	    (lckdat->l_type == F_WRLCK && (flag & FWRITE) == 0)))
		return (EBADF);

	/*
	 * for query and unlock we use the stack_lock_request
	 */
	if ((lckdat->l_type == F_UNLCK) ||
	    !((cmd & INOFLCK) || (cmd & SETFLCK))) {
		lock_request = &stack_lock_request;
		(void) bzero((caddr_t)lock_request,
		    sizeof (lock_descriptor_t));

		/*
		 * following is added to make the assertions in
		 * flk_execute_request() pass
		 */
		lock_request->l_edge.edge_in_next = &lock_request->l_edge;
		lock_request->l_edge.edge_in_prev = &lock_request->l_edge;
		lock_request->l_edge.edge_adj_next = &lock_request->l_edge;
		lock_request->l_edge.edge_adj_prev = &lock_request->l_edge;
		lock_request->l_status = FLK_INITIAL_STATE;
	} else {
		lock_request = flk_get_lock();
	}
	lock_request->l_state = 0;
	lock_request->l_vnode = vp;
	lock_request->l_zoneid = getzoneid();

	/*
	 * Convert the request range into the canonical start and end
	 * values.
The NLM protocol supports locking over the entire 586 * 32-bit range, so there's no range checking for remote requests, 587 * but we still need to verify that local requests obey the rules. 588 */ 589 /* Clustering */ 590 if ((cmd & (RCMDLCK | PCMDLCK)) != 0) { 591 ASSERT(lckdat->l_whence == 0); 592 lock_request->l_start = lckdat->l_start; 593 lock_request->l_end = (lckdat->l_len == 0) ? MAX_U_OFFSET_T : 594 lckdat->l_start + (lckdat->l_len - 1); 595 } else { 596 /* check the validity of the lock range */ 597 error = flk_convert_lock_data(vp, lckdat, 598 &lock_request->l_start, &lock_request->l_end, 599 offset); 600 if (error) { 601 goto done; 602 } 603 error = flk_check_lock_data(lock_request->l_start, 604 lock_request->l_end, MAXEND); 605 if (error) { 606 goto done; 607 } 608 } 609 610 ASSERT(lock_request->l_end >= lock_request->l_start); 611 612 lock_request->l_type = lckdat->l_type; 613 if (cmd & INOFLCK) 614 lock_request->l_state |= IO_LOCK; 615 if (cmd & SLPFLCK) 616 lock_request->l_state |= WILLING_TO_SLEEP_LOCK; 617 if (cmd & RCMDLCK) 618 lock_request->l_state |= LOCKMGR_LOCK; 619 if (cmd & NBMLCK) 620 lock_request->l_state |= NBMAND_LOCK; 621 /* 622 * Clustering: set flag for PXFS locks 623 * We do not _only_ check for the PCMDLCK flag because PXFS locks could 624 * also be of type 'RCMDLCK'. 625 * We do not _only_ check the GETPXFSID() macro because local PXFS 626 * clients use a pxfsid of zero to permit deadlock detection in the LLM. 627 */ 628 629 if ((cmd & PCMDLCK) || (GETPXFSID(lckdat->l_sysid) != 0)) { 630 lock_request->l_state |= PXFS_LOCK; 631 } 632 if (!((cmd & SETFLCK) || (cmd & INOFLCK))) { 633 if (lock_request->l_type == F_RDLCK || 634 lock_request->l_type == F_WRLCK) 635 lock_request->l_state |= QUERY_LOCK; 636 } 637 lock_request->l_flock = (*lckdat); 638 lock_request->l_callbacks = flk_cbp; 639 640 /* 641 * We are ready for processing the request 642 */ 643 if (IS_LOCKMGR(lock_request)) { 644 /* 645 * If the lock request is an NLM server request .... 646 */ 647 if (nlm_status_size == 0) { /* not booted as cluster */ 648 mutex_enter(&flock_lock); 649 /* 650 * Bail out if this is a lock manager request and the 651 * lock manager is not supposed to be running. 652 */ 653 if (flk_get_lockmgr_status() != FLK_LOCKMGR_UP) { 654 mutex_exit(&flock_lock); 655 error = ENOLCK; 656 goto done; 657 } 658 mutex_exit(&flock_lock); 659 } else { /* booted as a cluster */ 660 nlmid = GETNLMID(lock_request->l_flock.l_sysid); 661 ASSERT(nlmid <= nlm_status_size && nlmid >= 0); 662 663 mutex_enter(&nlm_reg_lock); 664 /* 665 * If the NLM registry does not know about this 666 * NLM server making the request, add its nlmid 667 * to the registry. 668 */ 669 if (FLK_REGISTRY_IS_NLM_UNKNOWN(nlm_reg_status, 670 nlmid)) { 671 FLK_REGISTRY_ADD_NLMID(nlm_reg_status, nlmid); 672 } else if (!FLK_REGISTRY_IS_NLM_UP(nlm_reg_status, 673 nlmid)) { 674 /* 675 * If the NLM server is already known (has made 676 * previous lock requests) and its state is 677 * not NLM_UP (means that NLM server is 678 * shutting down), then bail out with an 679 * error to deny the lock request. 680 */ 681 mutex_exit(&nlm_reg_lock); 682 error = ENOLCK; 683 goto done; 684 } 685 mutex_exit(&nlm_reg_lock); 686 } 687 } 688 689 /* Now get the lock graph for a particular vnode */ 690 gp = flk_get_lock_graph(vp, FLK_INIT_GRAPH); 691 692 /* 693 * We drop rwlock here otherwise this might end up causing a 694 * deadlock if this IOLOCK sleeps. (bugid # 1183392). 
695 */ 696 697 if (IS_IO_LOCK(lock_request)) { 698 VOP_RWUNLOCK(vp, 699 (lock_request->l_type == F_RDLCK) ? 700 V_WRITELOCK_FALSE : V_WRITELOCK_TRUE, NULL); 701 } 702 mutex_enter(&gp->gp_mutex); 703 704 lock_request->l_state |= REFERENCED_LOCK; 705 lock_request->l_graph = gp; 706 707 switch (lock_request->l_type) { 708 case F_RDLCK: 709 case F_WRLCK: 710 if (IS_QUERY_LOCK(lock_request)) { 711 flk_get_first_blocking_lock(lock_request); 712 if (lock_request->l_ofd != NULL) 713 lock_request->l_flock.l_pid = -1; 714 (*lckdat) = lock_request->l_flock; 715 break; 716 } 717 718 /* process the request now */ 719 720 error = flk_process_request(lock_request); 721 break; 722 723 case F_UNLCK: 724 /* unlock request will not block so execute it immediately */ 725 726 if (IS_LOCKMGR(lock_request) && 727 flk_canceled(lock_request)) { 728 error = 0; 729 } else { 730 error = flk_execute_request(lock_request); 731 } 732 break; 733 734 case F_UNLKSYS: 735 /* 736 * Recovery mechanism to release lock manager locks when 737 * NFS client crashes and restart. NFS server will clear 738 * old locks and grant new locks. 739 */ 740 741 if (lock_request->l_flock.l_sysid == 0) { 742 mutex_exit(&gp->gp_mutex); 743 return (EINVAL); 744 } 745 if (secpolicy_nfs(CRED()) != 0) { 746 mutex_exit(&gp->gp_mutex); 747 return (EPERM); 748 } 749 flk_delete_locks_by_sysid(lock_request); 750 lock_request->l_state &= ~REFERENCED_LOCK; 751 flk_set_state(lock_request, FLK_DEAD_STATE); 752 flk_free_lock(lock_request); 753 mutex_exit(&gp->gp_mutex); 754 return (0); 755 756 default: 757 error = EINVAL; 758 break; 759 } 760 761 /* Clustering: For blocked PXFS locks, return */ 762 if (error == PXFS_LOCK_BLOCKED) { 763 lock_request->l_state &= ~REFERENCED_LOCK; 764 mutex_exit(&gp->gp_mutex); 765 return (error); 766 } 767 768 /* 769 * Now that we have seen the status of locks in the system for 770 * this vnode we acquire the rwlock if it is an IO_LOCK. 771 */ 772 773 if (IS_IO_LOCK(lock_request)) { 774 (void) VOP_RWLOCK(vp, 775 (lock_request->l_type == F_RDLCK) ? 776 V_WRITELOCK_FALSE : V_WRITELOCK_TRUE, NULL); 777 if (!error) { 778 lckdat->l_type = F_UNLCK; 779 780 /* 781 * This wake up is needed otherwise 782 * if IO_LOCK has slept the dependents on this 783 * will not be woken up at all. (bugid # 1185482). 784 */ 785 786 flk_wakeup(lock_request, 1); 787 flk_set_state(lock_request, FLK_DEAD_STATE); 788 flk_free_lock(lock_request); 789 } 790 /* 791 * else if error had occurred either flk_process_request() 792 * has returned EDEADLK in which case there will be no 793 * dependents for this lock or EINTR from flk_wait_execute_ 794 * request() in which case flk_cancel_sleeping_lock() 795 * would have been done. same is true with EBADF. 796 */ 797 } 798 799 if (lock_request == &stack_lock_request) { 800 flk_set_state(lock_request, FLK_DEAD_STATE); 801 } else { 802 lock_request->l_state &= ~REFERENCED_LOCK; 803 if ((error != 0) || IS_DELETED(lock_request)) { 804 flk_set_state(lock_request, FLK_DEAD_STATE); 805 flk_free_lock(lock_request); 806 } 807 } 808 809 mutex_exit(&gp->gp_mutex); 810 return (error); 811 812 done: 813 flk_set_state(lock_request, FLK_DEAD_STATE); 814 if (lock_request != &stack_lock_request) 815 flk_free_lock(lock_request); 816 return (error); 817 } 818 819 /* 820 * Invoke the callbacks in the given list. If before sleeping, invoke in 821 * list order. If after sleeping, invoke in reverse order. 822 * 823 * CPR (suspend/resume) support: if one of the callbacks returns a 824 * callb_cpr_t, return it. 
This will be used to make the thread CPR-safe 825 * while it is sleeping. There should be at most one callb_cpr_t for the 826 * thread. 827 * XXX This is unnecessarily complicated. The CPR information should just 828 * get passed in directly through VOP_FRLOCK and reclock, rather than 829 * sneaking it in via a callback. 830 */ 831 832 callb_cpr_t * 833 flk_invoke_callbacks(flk_callback_t *cblist, flk_cb_when_t when) 834 { 835 callb_cpr_t *cpr_callbackp = NULL; 836 callb_cpr_t *one_result; 837 flk_callback_t *cb; 838 839 if (cblist == NULL) 840 return (NULL); 841 842 if (when == FLK_BEFORE_SLEEP) { 843 cb = cblist; 844 do { 845 one_result = (*cb->cb_callback)(when, cb->cb_data); 846 if (one_result != NULL) { 847 ASSERT(cpr_callbackp == NULL); 848 cpr_callbackp = one_result; 849 } 850 cb = cb->cb_next; 851 } while (cb != cblist); 852 } else { 853 cb = cblist->cb_prev; 854 do { 855 one_result = (*cb->cb_callback)(when, cb->cb_data); 856 if (one_result != NULL) { 857 cpr_callbackp = one_result; 858 } 859 cb = cb->cb_prev; 860 } while (cb != cblist->cb_prev); 861 } 862 863 return (cpr_callbackp); 864 } 865 866 /* 867 * Initialize a flk_callback_t to hold the given callback. 868 */ 869 870 void 871 flk_init_callback(flk_callback_t *flk_cb, 872 callb_cpr_t *(*cb_fcn)(flk_cb_when_t, void *), void *cbdata) 873 { 874 flk_cb->cb_next = flk_cb; 875 flk_cb->cb_prev = flk_cb; 876 flk_cb->cb_callback = cb_fcn; 877 flk_cb->cb_data = cbdata; 878 } 879 880 /* 881 * Initialize an flk_callback_t and then link it into the head of an 882 * existing list (which may be NULL). 883 */ 884 885 void 886 flk_add_callback(flk_callback_t *newcb, 887 callb_cpr_t *(*cb_fcn)(flk_cb_when_t, void *), 888 void *cbdata, flk_callback_t *cblist) 889 { 890 flk_init_callback(newcb, cb_fcn, cbdata); 891 892 if (cblist == NULL) 893 return; 894 895 newcb->cb_prev = cblist->cb_prev; 896 newcb->cb_next = cblist; 897 cblist->cb_prev->cb_next = newcb; 898 cblist->cb_prev = newcb; 899 } 900 901 /* 902 * Initialize the flk_edge_cache data structure and create the 903 * nlm_reg_status array. 904 */ 905 906 void 907 flk_init(void) 908 { 909 uint_t i; 910 911 flk_edge_cache = kmem_cache_create("flk_edges", 912 sizeof (struct edge), 0, NULL, NULL, NULL, NULL, NULL, 0); 913 if (flk_edge_cache == NULL) { 914 cmn_err(CE_PANIC, "Couldn't create flk_edge_cache\n"); 915 } 916 /* 917 * Create the NLM registry object. 918 */ 919 920 if (cluster_bootflags & CLUSTER_BOOTED) { 921 /* 922 * This routine tells you the maximum node id that will be used 923 * in the cluster. This number will be the size of the nlm 924 * registry status array. We add 1 because we will be using 925 * all entries indexed from 0 to maxnodeid; e.g., from 0 926 * to 64, for a total of 65 entries. 927 */ 928 nlm_status_size = clconf_maximum_nodeid() + 1; 929 } else { 930 nlm_status_size = 0; 931 } 932 933 if (nlm_status_size != 0) { /* booted as a cluster */ 934 nlm_reg_status = (flk_nlm_status_t *) 935 kmem_alloc(sizeof (flk_nlm_status_t) * nlm_status_size, 936 KM_SLEEP); 937 938 /* initialize all NLM states in array to NLM_UNKNOWN */ 939 for (i = 0; i < nlm_status_size; i++) { 940 nlm_reg_status[i] = FLK_NLM_UNKNOWN; 941 } 942 } 943 } 944 945 /* 946 * Zone constructor/destructor callbacks to be executed when a zone is 947 * created/destroyed. 
948 */ 949 /* ARGSUSED */ 950 void * 951 flk_zone_init(zoneid_t zoneid) 952 { 953 struct flock_globals *fg; 954 uint_t i; 955 956 fg = kmem_alloc(sizeof (*fg), KM_SLEEP); 957 fg->flk_lockmgr_status = FLK_LOCKMGR_UP; 958 for (i = 0; i < HASH_SIZE; i++) 959 fg->lockmgr_status[i] = FLK_LOCKMGR_UP; 960 return (fg); 961 } 962 963 /* ARGSUSED */ 964 void 965 flk_zone_fini(zoneid_t zoneid, void *data) 966 { 967 struct flock_globals *fg = data; 968 969 kmem_free(fg, sizeof (*fg)); 970 } 971 972 /* 973 * Get a lock_descriptor structure with initialization of edge lists. 974 */ 975 976 static lock_descriptor_t * 977 flk_get_lock(void) 978 { 979 lock_descriptor_t *l; 980 981 l = kmem_zalloc(sizeof (lock_descriptor_t), KM_SLEEP); 982 983 cv_init(&l->l_cv, NULL, CV_DRIVER, NULL); 984 l->l_edge.edge_in_next = &l->l_edge; 985 l->l_edge.edge_in_prev = &l->l_edge; 986 l->l_edge.edge_adj_next = &l->l_edge; 987 l->l_edge.edge_adj_prev = &l->l_edge; 988 l->pvertex = -1; 989 l->l_status = FLK_INITIAL_STATE; 990 flk_lock_allocs++; 991 return (l); 992 } 993 994 /* 995 * Free a lock_descriptor structure. Just sets the DELETED_LOCK flag 996 * when some thread has a reference to it as in reclock(). 997 */ 998 999 void 1000 flk_free_lock(lock_descriptor_t *lock) 1001 { 1002 file_t *fp; 1003 1004 ASSERT(IS_DEAD(lock)); 1005 1006 if ((fp = lock->l_ofd) != NULL && fp->f_filock == (struct filock *)lock) 1007 fp->f_filock = NULL; 1008 1009 if (IS_REFERENCED(lock)) { 1010 lock->l_state |= DELETED_LOCK; 1011 return; 1012 } 1013 flk_lock_frees++; 1014 kmem_free((void *)lock, sizeof (lock_descriptor_t)); 1015 } 1016 1017 void 1018 flk_set_state(lock_descriptor_t *lock, int new_state) 1019 { 1020 /* 1021 * Locks in the sleeping list may be woken up in a number of ways, 1022 * and more than once. If a sleeping lock is signaled awake more 1023 * than once, then it may or may not change state depending on its 1024 * current state. 1025 * Also note that NLM locks that are sleeping could be moved to an 1026 * interrupted state more than once if the unlock request is 1027 * retransmitted by the NLM client - the second time around, this is 1028 * just a nop. 1029 * The ordering of being signaled awake is: 1030 * INTERRUPTED_STATE > CANCELLED_STATE > GRANTED_STATE. 1031 * The checks below implement this ordering. 1032 */ 1033 if (IS_INTERRUPTED(lock)) { 1034 if ((new_state == FLK_CANCELLED_STATE) || 1035 (new_state == FLK_GRANTED_STATE) || 1036 (new_state == FLK_INTERRUPTED_STATE)) { 1037 return; 1038 } 1039 } 1040 if (IS_CANCELLED(lock)) { 1041 if ((new_state == FLK_GRANTED_STATE) || 1042 (new_state == FLK_CANCELLED_STATE)) { 1043 return; 1044 } 1045 } 1046 CHECK_LOCK_TRANSITION(lock->l_status, new_state); 1047 if (IS_PXFS(lock)) { 1048 cl_flk_state_transition_notify(lock, lock->l_status, new_state); 1049 } 1050 lock->l_status = new_state; 1051 } 1052 1053 /* 1054 * Routine that checks whether there are any blocking locks in the system. 1055 * 1056 * The policy followed is if a write lock is sleeping we don't allow read 1057 * locks before this write lock even though there may not be any active 1058 * locks corresponding to the read locks' region. 1059 * 1060 * flk_add_edge() function adds an edge between l1 and l2 iff there 1061 * is no path between l1 and l2. This is done to have a "minimum 1062 * storage representation" of the dependency graph. 1063 * 1064 * Another property of the graph is since only the new request throws 1065 * edges to the existing locks in the graph, the graph is always topologically 1066 * ordered. 
1067 */ 1068 1069 static int 1070 flk_process_request(lock_descriptor_t *request) 1071 { 1072 graph_t *gp = request->l_graph; 1073 lock_descriptor_t *lock; 1074 int request_blocked_by_active = 0; 1075 int request_blocked_by_granted = 0; 1076 int request_blocked_by_sleeping = 0; 1077 vnode_t *vp = request->l_vnode; 1078 int error = 0; 1079 int request_will_wait = 0; 1080 int found_covering_lock = 0; 1081 lock_descriptor_t *covered_by = NULL; 1082 1083 ASSERT(MUTEX_HELD(&gp->gp_mutex)); 1084 request_will_wait = IS_WILLING_TO_SLEEP(request); 1085 1086 /* 1087 * check active locks 1088 */ 1089 1090 SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp); 1091 1092 1093 if (lock) { 1094 do { 1095 if (BLOCKS(lock, request)) { 1096 if (!request_will_wait) 1097 return (EAGAIN); 1098 request_blocked_by_active = 1; 1099 break; 1100 } 1101 /* 1102 * Grant lock if it is for the same owner holding active 1103 * lock that covers the request. 1104 */ 1105 1106 if (SAME_OWNER(lock, request) && 1107 COVERS(lock, request) && 1108 (request->l_type == F_RDLCK)) 1109 return (flk_execute_request(request)); 1110 lock = lock->l_next; 1111 } while (lock->l_vnode == vp); 1112 } 1113 1114 if (!request_blocked_by_active) { 1115 lock_descriptor_t *lk[1]; 1116 lock_descriptor_t *first_glock = NULL; 1117 /* 1118 * Shall we grant this?! NO!! 1119 * What about those locks that were just granted and still 1120 * in sleep queue. Those threads are woken up and so locks 1121 * are almost active. 1122 */ 1123 SET_LOCK_TO_FIRST_SLEEP_VP(gp, lock, vp); 1124 if (lock) { 1125 do { 1126 if (BLOCKS(lock, request)) { 1127 if (IS_GRANTED(lock)) { 1128 request_blocked_by_granted = 1; 1129 } else { 1130 request_blocked_by_sleeping = 1; 1131 } 1132 } 1133 1134 lock = lock->l_next; 1135 } while ((lock->l_vnode == vp)); 1136 first_glock = lock->l_prev; 1137 ASSERT(first_glock->l_vnode == vp); 1138 } 1139 1140 if (request_blocked_by_granted) 1141 goto block; 1142 1143 if (!request_blocked_by_sleeping) { 1144 /* 1145 * If the request isn't going to be blocked by a 1146 * sleeping request, we know that it isn't going to 1147 * be blocked; we can just execute the request -- 1148 * without performing costly deadlock detection. 1149 */ 1150 ASSERT(!request_blocked_by_active); 1151 return (flk_execute_request(request)); 1152 } else if (request->l_type == F_RDLCK) { 1153 /* 1154 * If we have a sleeping writer in the requested 1155 * lock's range, block. 
1156 */ 1157 goto block; 1158 } 1159 1160 lk[0] = request; 1161 request->l_state |= RECOMPUTE_LOCK; 1162 SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp); 1163 if (lock) { 1164 do { 1165 flk_recompute_dependencies(lock, lk, 1, 0); 1166 lock = lock->l_next; 1167 } while (lock->l_vnode == vp); 1168 } 1169 lock = first_glock; 1170 if (lock) { 1171 do { 1172 if (IS_GRANTED(lock)) { 1173 flk_recompute_dependencies(lock, lk, 1, 0); 1174 } 1175 lock = lock->l_prev; 1176 } while ((lock->l_vnode == vp)); 1177 } 1178 request->l_state &= ~RECOMPUTE_LOCK; 1179 if (!NO_DEPENDENTS(request) && flk_check_deadlock(request)) 1180 return (EDEADLK); 1181 return (flk_execute_request(request)); 1182 } 1183 1184 block: 1185 if (request_will_wait) 1186 flk_graph_uncolor(gp); 1187 1188 /* check sleeping locks */ 1189 1190 SET_LOCK_TO_FIRST_SLEEP_VP(gp, lock, vp); 1191 1192 /* 1193 * If we find a sleeping write lock that is a superset of the 1194 * region wanted by request we can be assured that by adding an 1195 * edge to this write lock we have paths to all locks in the 1196 * graph that blocks the request except in one case and that is why 1197 * another check for SAME_OWNER in the loop below. The exception 1198 * case is when this process that owns the sleeping write lock 'l1' 1199 * has other locks l2, l3, l4 that are in the system and arrived 1200 * before l1. l1 does not have path to these locks as they are from 1201 * same process. We break when we find a second covering sleeping 1202 * lock l5 owned by a process different from that owning l1, because 1203 * there cannot be any of l2, l3, l4, etc., arrived before l5, and if 1204 * it has l1 would have produced a deadlock already. 1205 */ 1206 1207 if (lock) { 1208 do { 1209 if (BLOCKS(lock, request)) { 1210 if (!request_will_wait) 1211 return (EAGAIN); 1212 if (COVERS(lock, request) && 1213 lock->l_type == F_WRLCK) { 1214 if (found_covering_lock && 1215 !SAME_OWNER(lock, covered_by)) { 1216 found_covering_lock++; 1217 break; 1218 } 1219 found_covering_lock = 1; 1220 covered_by = lock; 1221 } 1222 if (found_covering_lock && 1223 !SAME_OWNER(lock, covered_by)) { 1224 lock = lock->l_next; 1225 continue; 1226 } 1227 if ((error = flk_add_edge(request, lock, 1228 !found_covering_lock, 0))) 1229 return (error); 1230 } 1231 lock = lock->l_next; 1232 } while (lock->l_vnode == vp); 1233 } 1234 1235 /* 1236 * found_covering_lock == 2 iff at this point 'request' has paths 1237 * to all locks that blocks 'request'. found_covering_lock == 1 iff at this 1238 * point 'request' has paths to all locks that blocks 'request' whose owners 1239 * are not same as the one that covers 'request' (covered_by above) and 1240 * we can have locks whose owner is same as covered_by in the active list. 
1241 */ 1242 1243 if (request_blocked_by_active && found_covering_lock != 2) { 1244 SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp); 1245 ASSERT(lock != NULL); 1246 do { 1247 if (BLOCKS(lock, request)) { 1248 if (found_covering_lock && 1249 !SAME_OWNER(lock, covered_by)) { 1250 lock = lock->l_next; 1251 continue; 1252 } 1253 if ((error = flk_add_edge(request, lock, 1254 CHECK_CYCLE, 0))) 1255 return (error); 1256 } 1257 lock = lock->l_next; 1258 } while (lock->l_vnode == vp); 1259 } 1260 1261 if (NOT_BLOCKED(request)) { 1262 /* 1263 * request not dependent on any other locks 1264 * so execute this request 1265 */ 1266 return (flk_execute_request(request)); 1267 } else { 1268 /* 1269 * check for deadlock 1270 */ 1271 if (flk_check_deadlock(request)) 1272 return (EDEADLK); 1273 /* 1274 * this thread has to sleep 1275 */ 1276 return (flk_wait_execute_request(request)); 1277 } 1278 } 1279 1280 /* 1281 * The actual execution of the request in the simple case is only to 1282 * insert the 'request' in the list of active locks if it is not an 1283 * UNLOCK. 1284 * We have to consider the existing active locks' relation to 1285 * this 'request' if they are owned by same process. flk_relation() does 1286 * this job and sees to that the dependency graph information is maintained 1287 * properly. 1288 */ 1289 1290 int 1291 flk_execute_request(lock_descriptor_t *request) 1292 { 1293 graph_t *gp = request->l_graph; 1294 vnode_t *vp = request->l_vnode; 1295 lock_descriptor_t *lock, *lock1; 1296 int done_searching = 0; 1297 1298 CHECK_SLEEPING_LOCKS(gp); 1299 CHECK_ACTIVE_LOCKS(gp); 1300 1301 ASSERT(MUTEX_HELD(&gp->gp_mutex)); 1302 1303 flk_set_state(request, FLK_START_STATE); 1304 1305 ASSERT(NOT_BLOCKED(request)); 1306 1307 /* IO_LOCK requests are only to check status */ 1308 1309 if (IS_IO_LOCK(request)) 1310 return (0); 1311 1312 SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp); 1313 1314 if (lock == NULL && request->l_type == F_UNLCK) 1315 return (0); 1316 if (lock == NULL) { 1317 flk_insert_active_lock(request); 1318 return (0); 1319 } 1320 1321 do { 1322 lock1 = lock->l_next; 1323 if (SAME_OWNER(request, lock)) { 1324 done_searching = flk_relation(lock, request); 1325 } 1326 lock = lock1; 1327 } while (lock->l_vnode == vp && !done_searching); 1328 1329 /* 1330 * insert in active queue 1331 */ 1332 1333 if (request->l_type != F_UNLCK) 1334 flk_insert_active_lock(request); 1335 1336 return (0); 1337 } 1338 1339 /* 1340 * 'request' is blocked by some one therefore we put it into sleep queue. 1341 */ 1342 static int 1343 flk_wait_execute_request(lock_descriptor_t *request) 1344 { 1345 graph_t *gp = request->l_graph; 1346 callb_cpr_t *cprp; /* CPR info from callback */ 1347 struct flock_globals *fg; 1348 int index; 1349 1350 ASSERT(MUTEX_HELD(&gp->gp_mutex)); 1351 ASSERT(IS_WILLING_TO_SLEEP(request)); 1352 1353 flk_insert_sleeping_lock(request); 1354 1355 if (IS_LOCKMGR(request)) { 1356 index = HASH_INDEX(request->l_vnode); 1357 fg = flk_get_globals(); 1358 1359 if (nlm_status_size == 0) { /* not booted as a cluster */ 1360 if (fg->lockmgr_status[index] != FLK_LOCKMGR_UP) { 1361 flk_cancel_sleeping_lock(request, 1); 1362 return (ENOLCK); 1363 } 1364 } else { /* booted as a cluster */ 1365 /* 1366 * If the request is an NLM server lock request, 1367 * and the NLM state of the lock request is not 1368 * NLM_UP (because the NLM server is shutting 1369 * down), then cancel the sleeping lock and 1370 * return error ENOLCK that will encourage the 1371 * client to retransmit. 
1372 */ 1373 if (!IS_NLM_UP(request)) { 1374 flk_cancel_sleeping_lock(request, 1); 1375 return (ENOLCK); 1376 } 1377 } 1378 } 1379 1380 /* Clustering: For blocking PXFS locks, return */ 1381 if (IS_PXFS(request)) { 1382 /* 1383 * PXFS locks sleep on the client side. 1384 * The callback argument is used to wake up the sleeper 1385 * when the lock is granted. 1386 * We return -1 (rather than an errno value) to indicate 1387 * the client side should sleep 1388 */ 1389 return (PXFS_LOCK_BLOCKED); 1390 } 1391 1392 if (request->l_callbacks != NULL) { 1393 /* 1394 * To make sure the shutdown code works correctly, either 1395 * the callback must happen after putting the lock on the 1396 * sleep list, or we must check the shutdown status after 1397 * returning from the callback (and before sleeping). At 1398 * least for now, we'll use the first option. If a 1399 * shutdown or signal or whatever happened while the graph 1400 * mutex was dropped, that will be detected by 1401 * wait_for_lock(). 1402 */ 1403 mutex_exit(&gp->gp_mutex); 1404 1405 cprp = flk_invoke_callbacks(request->l_callbacks, 1406 FLK_BEFORE_SLEEP); 1407 1408 mutex_enter(&gp->gp_mutex); 1409 1410 if (cprp == NULL) { 1411 wait_for_lock(request); 1412 } else { 1413 mutex_enter(cprp->cc_lockp); 1414 CALLB_CPR_SAFE_BEGIN(cprp); 1415 mutex_exit(cprp->cc_lockp); 1416 wait_for_lock(request); 1417 mutex_enter(cprp->cc_lockp); 1418 CALLB_CPR_SAFE_END(cprp, cprp->cc_lockp); 1419 mutex_exit(cprp->cc_lockp); 1420 } 1421 1422 mutex_exit(&gp->gp_mutex); 1423 (void) flk_invoke_callbacks(request->l_callbacks, 1424 FLK_AFTER_SLEEP); 1425 mutex_enter(&gp->gp_mutex); 1426 } else { 1427 wait_for_lock(request); 1428 } 1429 1430 if (IS_LOCKMGR(request)) { 1431 /* 1432 * If the lock manager is shutting down, return an 1433 * error that will encourage the client to retransmit. 1434 */ 1435 if (fg->lockmgr_status[index] != FLK_LOCKMGR_UP && 1436 !IS_GRANTED(request)) { 1437 flk_cancel_sleeping_lock(request, 1); 1438 return (ENOLCK); 1439 } 1440 } 1441 1442 if (IS_INTERRUPTED(request)) { 1443 /* we got a signal, or act like we did */ 1444 flk_cancel_sleeping_lock(request, 1); 1445 return (EINTR); 1446 } 1447 1448 /* Cancelled if some other thread has closed the file */ 1449 1450 if (IS_CANCELLED(request)) { 1451 flk_cancel_sleeping_lock(request, 1); 1452 return (EBADF); 1453 } 1454 1455 request->l_state &= ~GRANTED_LOCK; 1456 REMOVE_SLEEP_QUEUE(request); 1457 return (flk_execute_request(request)); 1458 } 1459 1460 /* 1461 * This routine adds an edge between from and to because from depends 1462 * to. If asked to check for deadlock it checks whether there are any 1463 * reachable locks from "from_lock" that is owned by the same process 1464 * as "from_lock". 1465 * NOTE: It is the caller's responsibility to make sure that the color 1466 * of the graph is consistent between the calls to flk_add_edge as done 1467 * in flk_process_request. This routine does not color and check for 1468 * deadlock explicitly. 1469 */ 1470 1471 static int 1472 flk_add_edge(lock_descriptor_t *from_lock, lock_descriptor_t *to_lock, 1473 int check_cycle, int update_graph) 1474 { 1475 edge_t *edge; 1476 edge_t *ep; 1477 lock_descriptor_t *vertex; 1478 lock_descriptor_t *vertex_stack; 1479 1480 STACK_INIT(vertex_stack); 1481 1482 /* 1483 * if to vertex already has mark_color just return 1484 * don't add an edge as it is reachable from from vertex 1485 * before itself. 
1486 */ 1487 1488 if (COLORED(to_lock)) 1489 return (0); 1490 1491 edge = flk_get_edge(); 1492 1493 /* 1494 * set the from and to vertex 1495 */ 1496 1497 edge->from_vertex = from_lock; 1498 edge->to_vertex = to_lock; 1499 1500 /* 1501 * put in adjacency list of from vertex 1502 */ 1503 1504 from_lock->l_edge.edge_adj_next->edge_adj_prev = edge; 1505 edge->edge_adj_next = from_lock->l_edge.edge_adj_next; 1506 edge->edge_adj_prev = &from_lock->l_edge; 1507 from_lock->l_edge.edge_adj_next = edge; 1508 1509 /* 1510 * put in list of to vertex 1511 */ 1512 1513 to_lock->l_edge.edge_in_next->edge_in_prev = edge; 1514 edge->edge_in_next = to_lock->l_edge.edge_in_next; 1515 to_lock->l_edge.edge_in_next = edge; 1516 edge->edge_in_prev = &to_lock->l_edge; 1517 1518 1519 if (update_graph) { 1520 flk_update_proc_graph(edge, 0); 1521 return (0); 1522 } 1523 if (!check_cycle) { 1524 return (0); 1525 } 1526 1527 STACK_PUSH(vertex_stack, from_lock, l_stack); 1528 1529 while ((vertex = STACK_TOP(vertex_stack)) != NULL) { 1530 1531 STACK_POP(vertex_stack, l_stack); 1532 1533 for (ep = FIRST_ADJ(vertex); 1534 ep != HEAD(vertex); 1535 ep = NEXT_ADJ(ep)) { 1536 if (COLORED(ep->to_vertex)) 1537 continue; 1538 COLOR(ep->to_vertex); 1539 if (SAME_OWNER(ep->to_vertex, from_lock)) 1540 goto dead_lock; 1541 STACK_PUSH(vertex_stack, ep->to_vertex, l_stack); 1542 } 1543 } 1544 return (0); 1545 1546 dead_lock: 1547 1548 /* 1549 * remove all edges 1550 */ 1551 1552 ep = FIRST_ADJ(from_lock); 1553 1554 while (ep != HEAD(from_lock)) { 1555 IN_LIST_REMOVE(ep); 1556 from_lock->l_sedge = NEXT_ADJ(ep); 1557 ADJ_LIST_REMOVE(ep); 1558 flk_free_edge(ep); 1559 ep = from_lock->l_sedge; 1560 } 1561 return (EDEADLK); 1562 } 1563 1564 /* 1565 * Get an edge structure for representing the dependency between two locks. 1566 */ 1567 1568 static edge_t * 1569 flk_get_edge() 1570 { 1571 edge_t *ep; 1572 1573 ASSERT(flk_edge_cache != NULL); 1574 1575 ep = kmem_cache_alloc(flk_edge_cache, KM_SLEEP); 1576 edge_allocs++; 1577 return (ep); 1578 } 1579 1580 /* 1581 * Free the edge structure. 1582 */ 1583 1584 static void 1585 flk_free_edge(edge_t *ep) 1586 { 1587 edge_frees++; 1588 kmem_cache_free(flk_edge_cache, (void *)ep); 1589 } 1590 1591 /* 1592 * Check the relationship of request with lock and perform the 1593 * recomputation of dependencies, break lock if required, and return 1594 * 1 if request cannot have any more relationship with the next 1595 * active locks. 1596 * The 'lock' and 'request' are compared and in case of overlap we 1597 * delete the 'lock' and form new locks to represent the non-overlapped 1598 * portion of original 'lock'. This function has side effects such as 1599 * 'lock' will be freed, new locks will be added to the active list. 
1600 */ 1601 1602 static int 1603 flk_relation(lock_descriptor_t *lock, lock_descriptor_t *request) 1604 { 1605 int lock_effect; 1606 lock_descriptor_t *lock1, *lock2; 1607 lock_descriptor_t *topology[3]; 1608 int nvertex = 0; 1609 int i; 1610 edge_t *ep; 1611 graph_t *gp = (lock->l_graph); 1612 1613 1614 CHECK_SLEEPING_LOCKS(gp); 1615 CHECK_ACTIVE_LOCKS(gp); 1616 1617 ASSERT(MUTEX_HELD(&gp->gp_mutex)); 1618 1619 topology[0] = topology[1] = topology[2] = NULL; 1620 1621 if (request->l_type == F_UNLCK) 1622 lock_effect = FLK_UNLOCK; 1623 else if (request->l_type == F_RDLCK && 1624 lock->l_type == F_WRLCK) 1625 lock_effect = FLK_DOWNGRADE; 1626 else if (request->l_type == F_WRLCK && 1627 lock->l_type == F_RDLCK) 1628 lock_effect = FLK_UPGRADE; 1629 else 1630 lock_effect = FLK_STAY_SAME; 1631 1632 if (lock->l_end < request->l_start) { 1633 if (lock->l_end == request->l_start - 1 && 1634 lock_effect == FLK_STAY_SAME) { 1635 topology[0] = request; 1636 request->l_start = lock->l_start; 1637 nvertex = 1; 1638 goto recompute; 1639 } else { 1640 return (0); 1641 } 1642 } 1643 1644 if (lock->l_start > request->l_end) { 1645 if (request->l_end == lock->l_start - 1 && 1646 lock_effect == FLK_STAY_SAME) { 1647 topology[0] = request; 1648 request->l_end = lock->l_end; 1649 nvertex = 1; 1650 goto recompute; 1651 } else { 1652 return (1); 1653 } 1654 } 1655 1656 if (request->l_end < lock->l_end) { 1657 if (request->l_start > lock->l_start) { 1658 if (lock_effect == FLK_STAY_SAME) { 1659 request->l_start = lock->l_start; 1660 request->l_end = lock->l_end; 1661 topology[0] = request; 1662 nvertex = 1; 1663 } else { 1664 lock1 = flk_get_lock(); 1665 lock2 = flk_get_lock(); 1666 COPY(lock1, lock); 1667 COPY(lock2, lock); 1668 lock1->l_start = lock->l_start; 1669 lock1->l_end = request->l_start - 1; 1670 lock2->l_start = request->l_end + 1; 1671 lock2->l_end = lock->l_end; 1672 topology[0] = lock1; 1673 topology[1] = lock2; 1674 topology[2] = request; 1675 nvertex = 3; 1676 } 1677 } else if (request->l_start < lock->l_start) { 1678 if (lock_effect == FLK_STAY_SAME) { 1679 request->l_end = lock->l_end; 1680 topology[0] = request; 1681 nvertex = 1; 1682 } else { 1683 lock1 = flk_get_lock(); 1684 COPY(lock1, lock); 1685 lock1->l_start = request->l_end + 1; 1686 topology[0] = lock1; 1687 topology[1] = request; 1688 nvertex = 2; 1689 } 1690 } else { 1691 if (lock_effect == FLK_STAY_SAME) { 1692 request->l_start = lock->l_start; 1693 request->l_end = lock->l_end; 1694 topology[0] = request; 1695 nvertex = 1; 1696 } else { 1697 lock1 = flk_get_lock(); 1698 COPY(lock1, lock); 1699 lock1->l_start = request->l_end + 1; 1700 topology[0] = lock1; 1701 topology[1] = request; 1702 nvertex = 2; 1703 } 1704 } 1705 } else if (request->l_end > lock->l_end) { 1706 if (request->l_start > lock->l_start) { 1707 if (lock_effect == FLK_STAY_SAME) { 1708 request->l_start = lock->l_start; 1709 topology[0] = request; 1710 nvertex = 1; 1711 } else { 1712 lock1 = flk_get_lock(); 1713 COPY(lock1, lock); 1714 lock1->l_end = request->l_start - 1; 1715 topology[0] = lock1; 1716 topology[1] = request; 1717 nvertex = 2; 1718 } 1719 } else if (request->l_start < lock->l_start) { 1720 topology[0] = request; 1721 nvertex = 1; 1722 } else { 1723 topology[0] = request; 1724 nvertex = 1; 1725 } 1726 } else { 1727 if (request->l_start > lock->l_start) { 1728 if (lock_effect == FLK_STAY_SAME) { 1729 request->l_start = lock->l_start; 1730 topology[0] = request; 1731 nvertex = 1; 1732 } else { 1733 lock1 = flk_get_lock(); 1734 COPY(lock1, lock); 1735 
lock1->l_end = request->l_start - 1; 1736 topology[0] = lock1; 1737 topology[1] = request; 1738 nvertex = 2; 1739 } 1740 } else if (request->l_start < lock->l_start) { 1741 topology[0] = request; 1742 nvertex = 1; 1743 } else { 1744 if (lock_effect != FLK_UNLOCK) { 1745 topology[0] = request; 1746 nvertex = 1; 1747 } else { 1748 flk_delete_active_lock(lock, 0); 1749 flk_wakeup(lock, 1); 1750 flk_free_lock(lock); 1751 CHECK_SLEEPING_LOCKS(gp); 1752 CHECK_ACTIVE_LOCKS(gp); 1753 return (1); 1754 } 1755 } 1756 } 1757 1758 recompute: 1759 1760 /* 1761 * For unlock we don't send the 'request' to for recomputing 1762 * dependencies because no lock will add an edge to this. 1763 */ 1764 1765 if (lock_effect == FLK_UNLOCK) { 1766 topology[nvertex-1] = NULL; 1767 nvertex--; 1768 } 1769 for (i = 0; i < nvertex; i++) { 1770 topology[i]->l_state |= RECOMPUTE_LOCK; 1771 topology[i]->l_color = NO_COLOR; 1772 } 1773 1774 ASSERT(FIRST_ADJ(lock) == HEAD(lock)); 1775 1776 /* 1777 * we remove the adjacent edges for all vertices' to this vertex 1778 * 'lock'. 1779 */ 1780 1781 ep = FIRST_IN(lock); 1782 while (ep != HEAD(lock)) { 1783 ADJ_LIST_REMOVE(ep); 1784 ep = NEXT_IN(ep); 1785 } 1786 1787 flk_delete_active_lock(lock, 0); 1788 1789 /* We are ready for recomputing the dependencies now */ 1790 1791 flk_recompute_dependencies(lock, topology, nvertex, 1); 1792 1793 for (i = 0; i < nvertex; i++) { 1794 topology[i]->l_state &= ~RECOMPUTE_LOCK; 1795 topology[i]->l_color = NO_COLOR; 1796 } 1797 1798 1799 if (lock_effect == FLK_UNLOCK) { 1800 nvertex++; 1801 } 1802 for (i = 0; i < nvertex - 1; i++) { 1803 flk_insert_active_lock(topology[i]); 1804 } 1805 1806 1807 if (lock_effect == FLK_DOWNGRADE || lock_effect == FLK_UNLOCK) { 1808 flk_wakeup(lock, 0); 1809 } else { 1810 ep = FIRST_IN(lock); 1811 while (ep != HEAD(lock)) { 1812 lock->l_sedge = NEXT_IN(ep); 1813 IN_LIST_REMOVE(ep); 1814 flk_update_proc_graph(ep, 1); 1815 flk_free_edge(ep); 1816 ep = lock->l_sedge; 1817 } 1818 } 1819 flk_free_lock(lock); 1820 1821 CHECK_SLEEPING_LOCKS(gp); 1822 CHECK_ACTIVE_LOCKS(gp); 1823 return (0); 1824 } 1825 1826 /* 1827 * Insert a lock into the active queue. 1828 */ 1829 1830 static void 1831 flk_insert_active_lock(lock_descriptor_t *new_lock) 1832 { 1833 graph_t *gp = new_lock->l_graph; 1834 vnode_t *vp = new_lock->l_vnode; 1835 lock_descriptor_t *first_lock, *lock; 1836 1837 ASSERT(MUTEX_HELD(&gp->gp_mutex)); 1838 1839 SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp); 1840 first_lock = lock; 1841 1842 if (first_lock != NULL) { 1843 for (; (lock->l_vnode == vp && 1844 lock->l_start < new_lock->l_start); lock = lock->l_next) 1845 ; 1846 } else { 1847 lock = ACTIVE_HEAD(gp); 1848 } 1849 1850 lock->l_prev->l_next = new_lock; 1851 new_lock->l_next = lock; 1852 new_lock->l_prev = lock->l_prev; 1853 lock->l_prev = new_lock; 1854 1855 if (first_lock == NULL || (new_lock->l_start <= first_lock->l_start)) { 1856 vp->v_filocks = (struct filock *)new_lock; 1857 } 1858 flk_set_state(new_lock, FLK_ACTIVE_STATE); 1859 new_lock->l_state |= ACTIVE_LOCK; 1860 1861 CHECK_ACTIVE_LOCKS(gp); 1862 CHECK_SLEEPING_LOCKS(gp); 1863 } 1864 1865 /* 1866 * Delete the active lock : Performs two functions depending on the 1867 * value of second parameter. One is to remove from the active lists 1868 * only and other is to both remove and free the lock. 
1869 */ 1870 1871 static void 1872 flk_delete_active_lock(lock_descriptor_t *lock, int free_lock) 1873 { 1874 vnode_t *vp = lock->l_vnode; 1875 graph_t *gp = lock->l_graph; 1876 1877 ASSERT(MUTEX_HELD(&gp->gp_mutex)); 1878 if (free_lock) 1879 ASSERT(NO_DEPENDENTS(lock)); 1880 ASSERT(NOT_BLOCKED(lock)); 1881 ASSERT(IS_ACTIVE(lock)); 1882 1883 ASSERT((vp->v_filocks != NULL)); 1884 1885 if (vp->v_filocks == (struct filock *)lock) { 1886 vp->v_filocks = (struct filock *) 1887 ((lock->l_next->l_vnode == vp) ? lock->l_next : 1888 NULL); 1889 } 1890 lock->l_next->l_prev = lock->l_prev; 1891 lock->l_prev->l_next = lock->l_next; 1892 lock->l_next = lock->l_prev = NULL; 1893 flk_set_state(lock, FLK_DEAD_STATE); 1894 lock->l_state &= ~ACTIVE_LOCK; 1895 1896 if (free_lock) 1897 flk_free_lock(lock); 1898 CHECK_ACTIVE_LOCKS(gp); 1899 CHECK_SLEEPING_LOCKS(gp); 1900 } 1901 1902 /* 1903 * Insert into the sleep queue. 1904 */ 1905 1906 static void 1907 flk_insert_sleeping_lock(lock_descriptor_t *request) 1908 { 1909 graph_t *gp = request->l_graph; 1910 vnode_t *vp = request->l_vnode; 1911 lock_descriptor_t *lock; 1912 1913 ASSERT(MUTEX_HELD(&gp->gp_mutex)); 1914 ASSERT(IS_INITIAL(request)); 1915 1916 for (lock = gp->sleeping_locks.l_next; (lock != &gp->sleeping_locks && 1917 lock->l_vnode < vp); lock = lock->l_next) 1918 ; 1919 1920 lock->l_prev->l_next = request; 1921 request->l_prev = lock->l_prev; 1922 lock->l_prev = request; 1923 request->l_next = lock; 1924 flk_set_state(request, FLK_SLEEPING_STATE); 1925 request->l_state |= SLEEPING_LOCK; 1926 } 1927 1928 /* 1929 * Cancelling a sleeping lock implies removing a vertex from the 1930 * dependency graph and therefore we should recompute the dependencies 1931 * of all vertices that have a path to this vertex, w.r.t. all 1932 * vertices reachable from this vertex. 1933 */ 1934 1935 void 1936 flk_cancel_sleeping_lock(lock_descriptor_t *request, int remove_from_queue) 1937 { 1938 graph_t *gp = request->l_graph; 1939 vnode_t *vp = request->l_vnode; 1940 lock_descriptor_t **topology = NULL; 1941 edge_t *ep; 1942 lock_descriptor_t *vertex, *lock; 1943 int nvertex = 0; 1944 int i; 1945 lock_descriptor_t *vertex_stack; 1946 1947 STACK_INIT(vertex_stack); 1948 1949 ASSERT(MUTEX_HELD(&gp->gp_mutex)); 1950 /* 1951 * count number of vertex pointers that has to be allocated 1952 * All vertices that are reachable from request. 1953 */ 1954 1955 STACK_PUSH(vertex_stack, request, l_stack); 1956 1957 while ((vertex = STACK_TOP(vertex_stack)) != NULL) { 1958 STACK_POP(vertex_stack, l_stack); 1959 for (ep = FIRST_ADJ(vertex); ep != HEAD(vertex); 1960 ep = NEXT_ADJ(ep)) { 1961 if (IS_RECOMPUTE(ep->to_vertex)) 1962 continue; 1963 ep->to_vertex->l_state |= RECOMPUTE_LOCK; 1964 STACK_PUSH(vertex_stack, ep->to_vertex, l_stack); 1965 nvertex++; 1966 } 1967 } 1968 1969 /* 1970 * allocate memory for holding the vertex pointers 1971 */ 1972 1973 if (nvertex) { 1974 topology = kmem_zalloc(nvertex * sizeof (lock_descriptor_t *), 1975 KM_SLEEP); 1976 } 1977 1978 /* 1979 * one more pass to actually store the vertices in the 1980 * allocated array. 1981 * We first check sleeping locks and then active locks 1982 * so that topology array will be in a topological 1983 * order. 
1984 */ 1985 1986 nvertex = 0; 1987 SET_LOCK_TO_FIRST_SLEEP_VP(gp, lock, vp); 1988 1989 if (lock) { 1990 do { 1991 if (IS_RECOMPUTE(lock)) { 1992 lock->l_index = nvertex; 1993 topology[nvertex++] = lock; 1994 } 1995 lock->l_color = NO_COLOR; 1996 lock = lock->l_next; 1997 } while (lock->l_vnode == vp); 1998 } 1999 2000 SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp); 2001 2002 if (lock) { 2003 do { 2004 if (IS_RECOMPUTE(lock)) { 2005 lock->l_index = nvertex; 2006 topology[nvertex++] = lock; 2007 } 2008 lock->l_color = NO_COLOR; 2009 lock = lock->l_next; 2010 } while (lock->l_vnode == vp); 2011 } 2012 2013 /* 2014 * remove in and out edges of request 2015 * They are freed after updating proc_graph below. 2016 */ 2017 2018 for (ep = FIRST_IN(request); ep != HEAD(request); ep = NEXT_IN(ep)) { 2019 ADJ_LIST_REMOVE(ep); 2020 } 2021 2022 2023 if (remove_from_queue) 2024 REMOVE_SLEEP_QUEUE(request); 2025 2026 /* we are ready to recompute */ 2027 2028 flk_recompute_dependencies(request, topology, nvertex, 1); 2029 2030 ep = FIRST_ADJ(request); 2031 while (ep != HEAD(request)) { 2032 IN_LIST_REMOVE(ep); 2033 request->l_sedge = NEXT_ADJ(ep); 2034 ADJ_LIST_REMOVE(ep); 2035 flk_update_proc_graph(ep, 1); 2036 flk_free_edge(ep); 2037 ep = request->l_sedge; 2038 } 2039 2040 2041 /* 2042 * unset the RECOMPUTE flag in those vertices 2043 */ 2044 2045 for (i = 0; i < nvertex; i++) { 2046 topology[i]->l_state &= ~RECOMPUTE_LOCK; 2047 } 2048 2049 /* 2050 * free the topology 2051 */ 2052 if (nvertex) 2053 kmem_free((void *)topology, 2054 (nvertex * sizeof (lock_descriptor_t *))); 2055 /* 2056 * Possibility of some locks unblocked now 2057 */ 2058 2059 flk_wakeup(request, 0); 2060 2061 /* 2062 * we expect to have a correctly recomputed graph now. 2063 */ 2064 flk_set_state(request, FLK_DEAD_STATE); 2065 flk_free_lock(request); 2066 CHECK_SLEEPING_LOCKS(gp); 2067 CHECK_ACTIVE_LOCKS(gp); 2068 2069 } 2070 2071 /* 2072 * Uncoloring the graph is simply to increment the mark value of the graph 2073 * And only when wrap round takes place will we color all vertices in 2074 * the graph explicitly. 2075 */ 2076 2077 static void 2078 flk_graph_uncolor(graph_t *gp) 2079 { 2080 lock_descriptor_t *lock; 2081 2082 if (gp->mark == UINT_MAX) { 2083 gp->mark = 1; 2084 for (lock = ACTIVE_HEAD(gp)->l_next; lock != ACTIVE_HEAD(gp); 2085 lock = lock->l_next) 2086 lock->l_color = 0; 2087 2088 for (lock = SLEEPING_HEAD(gp)->l_next; lock != SLEEPING_HEAD(gp); 2089 lock = lock->l_next) 2090 lock->l_color = 0; 2091 } else { 2092 gp->mark++; 2093 } 2094 } 2095 2096 /* 2097 * Wake up locks that are blocked on the given lock. 2098 */ 2099 2100 static void 2101 flk_wakeup(lock_descriptor_t *lock, int adj_list_remove) 2102 { 2103 edge_t *ep; 2104 graph_t *gp = lock->l_graph; 2105 lock_descriptor_t *lck; 2106 2107 ASSERT(MUTEX_HELD(&gp->gp_mutex)); 2108 if (NO_DEPENDENTS(lock)) 2109 return; 2110 ep = FIRST_IN(lock); 2111 do { 2112 /* 2113 * delete the edge from the adjacency list 2114 * of from vertex. if no more adjacent edges 2115 * for this vertex wake this process. 
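		 *
		 * put differently: a waiter is granted only once the
		 * edge being deleted here (or already deleted by the
		 * caller when adj_list_remove is zero) was the last
		 * entry on its adjacency list, which is essentially
		 * what the NOT_BLOCKED() test below detects.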
2116 */ 2117 lck = ep->from_vertex; 2118 if (adj_list_remove) 2119 ADJ_LIST_REMOVE(ep); 2120 flk_update_proc_graph(ep, 1); 2121 if (NOT_BLOCKED(lck)) { 2122 GRANT_WAKEUP(lck); 2123 } 2124 lock->l_sedge = NEXT_IN(ep); 2125 IN_LIST_REMOVE(ep); 2126 flk_free_edge(ep); 2127 ep = lock->l_sedge; 2128 } while (ep != HEAD(lock)); 2129 ASSERT(NO_DEPENDENTS(lock)); 2130 } 2131 2132 /* 2133 * The dependents of request, is checked for its dependency against the 2134 * locks in topology (called topology because the array is and should be in 2135 * topological order for this algorithm, if not in topological order the 2136 * inner loop below might add more edges than necessary. Topological ordering 2137 * of vertices satisfies the property that all edges will be from left to 2138 * right i.e., topology[i] can have an edge to topology[j], iff i<j) 2139 * If lock l1 in the dependent set of request is dependent (blocked by) 2140 * on lock l2 in topology but does not have a path to it, we add an edge 2141 * in the inner loop below. 2142 * 2143 * We don't want to add an edge between l1 and l2 if there exists 2144 * already a path from l1 to l2, so care has to be taken for those vertices 2145 * that have two paths to 'request'. These vertices are referred to here 2146 * as barrier locks. 2147 * 2148 * The barriers has to be found (those vertex that originally had two paths 2149 * to request) because otherwise we may end up adding edges unnecessarily 2150 * to vertices in topology, and thus barrier vertices can have an edge 2151 * to a vertex in topology as well a path to it. 2152 */ 2153 2154 static void 2155 flk_recompute_dependencies(lock_descriptor_t *request, 2156 lock_descriptor_t **topology, 2157 int nvertex, int update_graph) 2158 { 2159 lock_descriptor_t *vertex, *lock; 2160 graph_t *gp = request->l_graph; 2161 int i, count; 2162 int barrier_found = 0; 2163 edge_t *ep; 2164 lock_descriptor_t *vertex_stack; 2165 2166 STACK_INIT(vertex_stack); 2167 2168 ASSERT(MUTEX_HELD(&gp->gp_mutex)); 2169 if (nvertex == 0) 2170 return; 2171 flk_graph_uncolor(request->l_graph); 2172 barrier_found = flk_find_barriers(request); 2173 request->l_state |= RECOMPUTE_DONE; 2174 2175 STACK_PUSH(vertex_stack, request, l_stack); 2176 request->l_sedge = FIRST_IN(request); 2177 2178 2179 while ((vertex = STACK_TOP(vertex_stack)) != NULL) { 2180 if (vertex->l_state & RECOMPUTE_DONE) { 2181 count = 0; 2182 goto next_in_edge; 2183 } 2184 if (IS_BARRIER(vertex)) { 2185 /* decrement the barrier count */ 2186 if (vertex->l_index) { 2187 vertex->l_index--; 2188 /* this guy will be pushed again anyway ? */ 2189 STACK_POP(vertex_stack, l_stack); 2190 if (vertex->l_index == 0) { 2191 /* 2192 * barrier is over we can recompute 2193 * dependencies for this lock in the 2194 * next stack pop 2195 */ 2196 vertex->l_state &= ~BARRIER_LOCK; 2197 } 2198 continue; 2199 } 2200 } 2201 vertex->l_state |= RECOMPUTE_DONE; 2202 flk_graph_uncolor(gp); 2203 count = flk_color_reachables(vertex); 2204 for (i = 0; i < nvertex; i++) { 2205 lock = topology[i]; 2206 if (COLORED(lock)) 2207 continue; 2208 if (BLOCKS(lock, vertex)) { 2209 (void) flk_add_edge(vertex, lock, 2210 NO_CHECK_CYCLE, update_graph); 2211 COLOR(lock); 2212 count++; 2213 count += flk_color_reachables(lock); 2214 } 2215 2216 } 2217 2218 next_in_edge: 2219 if (count == nvertex || 2220 vertex->l_sedge == HEAD(vertex)) { 2221 /* prune the tree below this */ 2222 STACK_POP(vertex_stack, l_stack); 2223 vertex->l_state &= ~RECOMPUTE_DONE; 2224 /* update the barrier locks below this! 
*/ 2225 if (vertex->l_sedge != HEAD(vertex) && barrier_found) { 2226 flk_graph_uncolor(gp); 2227 flk_update_barriers(vertex); 2228 } 2229 continue; 2230 } 2231 2232 ep = vertex->l_sedge; 2233 lock = ep->from_vertex; 2234 STACK_PUSH(vertex_stack, lock, l_stack); 2235 lock->l_sedge = FIRST_IN(lock); 2236 vertex->l_sedge = NEXT_IN(ep); 2237 } 2238 2239 } 2240 2241 /* 2242 * Color all reachable vertices from vertex that belongs to topology (here 2243 * those that have RECOMPUTE_LOCK set in their state) and yet uncolored. 2244 * 2245 * Note: we need to use a different stack_link l_stack1 because this is 2246 * called from flk_recompute_dependencies() that already uses a stack with 2247 * l_stack as stack_link. 2248 */ 2249 2250 static int 2251 flk_color_reachables(lock_descriptor_t *vertex) 2252 { 2253 lock_descriptor_t *ver, *lock; 2254 int count; 2255 edge_t *ep; 2256 lock_descriptor_t *vertex_stack; 2257 2258 STACK_INIT(vertex_stack); 2259 2260 STACK_PUSH(vertex_stack, vertex, l_stack1); 2261 count = 0; 2262 while ((ver = STACK_TOP(vertex_stack)) != NULL) { 2263 2264 STACK_POP(vertex_stack, l_stack1); 2265 for (ep = FIRST_ADJ(ver); ep != HEAD(ver); 2266 ep = NEXT_ADJ(ep)) { 2267 lock = ep->to_vertex; 2268 if (COLORED(lock)) 2269 continue; 2270 COLOR(lock); 2271 if (IS_RECOMPUTE(lock)) 2272 count++; 2273 STACK_PUSH(vertex_stack, lock, l_stack1); 2274 } 2275 2276 } 2277 return (count); 2278 } 2279 2280 /* 2281 * Called from flk_recompute_dependencies() this routine decrements 2282 * the barrier count of barrier vertices that are reachable from lock. 2283 */ 2284 2285 static void 2286 flk_update_barriers(lock_descriptor_t *lock) 2287 { 2288 lock_descriptor_t *vertex, *lck; 2289 edge_t *ep; 2290 lock_descriptor_t *vertex_stack; 2291 2292 STACK_INIT(vertex_stack); 2293 2294 STACK_PUSH(vertex_stack, lock, l_stack1); 2295 2296 while ((vertex = STACK_TOP(vertex_stack)) != NULL) { 2297 STACK_POP(vertex_stack, l_stack1); 2298 for (ep = FIRST_IN(vertex); ep != HEAD(vertex); 2299 ep = NEXT_IN(ep)) { 2300 lck = ep->from_vertex; 2301 if (COLORED(lck)) { 2302 if (IS_BARRIER(lck)) { 2303 ASSERT(lck->l_index > 0); 2304 lck->l_index--; 2305 if (lck->l_index == 0) 2306 lck->l_state &= ~BARRIER_LOCK; 2307 } 2308 continue; 2309 } 2310 COLOR(lck); 2311 if (IS_BARRIER(lck)) { 2312 ASSERT(lck->l_index > 0); 2313 lck->l_index--; 2314 if (lck->l_index == 0) 2315 lck->l_state &= ~BARRIER_LOCK; 2316 } 2317 STACK_PUSH(vertex_stack, lck, l_stack1); 2318 } 2319 } 2320 } 2321 2322 /* 2323 * Finds all vertices that are reachable from 'lock' more than once and 2324 * mark them as barrier vertices and increment their barrier count. 2325 * The barrier count is one minus the total number of paths from lock 2326 * to that vertex. 
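 *
 * A small illustrative example (not taken from the original source),
 * with arrows running from a blocked lock to the lock blocking it, as
 * elsewhere in this graph:
 *
 *	v3 ----> v1 ----> lock
 *	  \                ^
 *	   +-----> v2 -----+
 *
 * Walking the in-edges from 'lock' reaches v3 along two distinct paths,
 * so v3 is marked BARRIER_LOCK with a barrier count (l_index) of 1;
 * v1 and v2 are reached only once and are left unmarked.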
2327 */ 2328 2329 static int 2330 flk_find_barriers(lock_descriptor_t *lock) 2331 { 2332 lock_descriptor_t *vertex, *lck; 2333 int found = 0; 2334 edge_t *ep; 2335 lock_descriptor_t *vertex_stack; 2336 2337 STACK_INIT(vertex_stack); 2338 2339 STACK_PUSH(vertex_stack, lock, l_stack1); 2340 2341 while ((vertex = STACK_TOP(vertex_stack)) != NULL) { 2342 STACK_POP(vertex_stack, l_stack1); 2343 for (ep = FIRST_IN(vertex); ep != HEAD(vertex); 2344 ep = NEXT_IN(ep)) { 2345 lck = ep->from_vertex; 2346 if (COLORED(lck)) { 2347 /* this is a barrier */ 2348 lck->l_state |= BARRIER_LOCK; 2349 /* index will have barrier count */ 2350 lck->l_index++; 2351 if (!found) 2352 found = 1; 2353 continue; 2354 } 2355 COLOR(lck); 2356 lck->l_index = 0; 2357 STACK_PUSH(vertex_stack, lck, l_stack1); 2358 } 2359 } 2360 return (found); 2361 } 2362 2363 /* 2364 * Finds the first lock that is mainly responsible for blocking this 2365 * request. If there is no such lock, request->l_flock.l_type is set to 2366 * F_UNLCK. Otherwise, request->l_flock is filled in with the particulars 2367 * of the blocking lock. 2368 * 2369 * Note: It is possible a request is blocked by a sleeping lock because 2370 * of the fairness policy used in flk_process_request() to construct the 2371 * dependencies. (see comments before flk_process_request()). 2372 */ 2373 2374 static void 2375 flk_get_first_blocking_lock(lock_descriptor_t *request) 2376 { 2377 graph_t *gp = request->l_graph; 2378 vnode_t *vp = request->l_vnode; 2379 lock_descriptor_t *lock, *blocker; 2380 2381 ASSERT(MUTEX_HELD(&gp->gp_mutex)); 2382 blocker = NULL; 2383 SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp); 2384 2385 if (lock) { 2386 do { 2387 if (BLOCKS(lock, request)) { 2388 blocker = lock; 2389 break; 2390 } 2391 lock = lock->l_next; 2392 } while (lock->l_vnode == vp); 2393 } 2394 2395 if (blocker == NULL && request->l_flock.l_type == F_RDLCK) { 2396 /* 2397 * No active lock is blocking this request, but if a read 2398 * lock is requested, it may also get blocked by a waiting 2399 * writer. So search all sleeping locks and see if there is 2400 * a writer waiting. 2401 */ 2402 SET_LOCK_TO_FIRST_SLEEP_VP(gp, lock, vp); 2403 if (lock) { 2404 do { 2405 if (BLOCKS(lock, request)) { 2406 blocker = lock; 2407 break; 2408 } 2409 lock = lock->l_next; 2410 } while (lock->l_vnode == vp); 2411 } 2412 } 2413 2414 if (blocker) { 2415 report_blocker(blocker, request); 2416 } else 2417 request->l_flock.l_type = F_UNLCK; 2418 } 2419 2420 /* 2421 * Get the graph_t structure associated with a vnode. 2422 * If 'initialize' is non-zero, and the graph_t structure for this vnode has 2423 * not yet been initialized, then a new element is allocated and returned. 
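 *
 * Two call patterns are supported: flk_get_lock_graph(vp, FLK_USE_GRAPH)
 * only looks up the hash bucket and may return NULL, while
 * flk_get_lock_graph(vp, FLK_INIT_GRAPH) allocates and installs a new
 * graph_t if the bucket is still empty.  The usual read-only pattern,
 * as used by flk_has_remote_locks() and friends below, is:
 *
 *	gp = flk_get_lock_graph(vp, FLK_USE_GRAPH);
 *	if (gp == NULL)
 *		return (0);
 *	mutex_enter(&gp->gp_mutex);
 *	... walk the active and/or sleeping lists ...
 *	mutex_exit(&gp->gp_mutex);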
2424 */ 2425 graph_t * 2426 flk_get_lock_graph(vnode_t *vp, int initialize) 2427 { 2428 graph_t *gp; 2429 graph_t *gp_alloc = NULL; 2430 int index = HASH_INDEX(vp); 2431 2432 if (initialize == FLK_USE_GRAPH) { 2433 mutex_enter(&flock_lock); 2434 gp = lock_graph[index]; 2435 mutex_exit(&flock_lock); 2436 return (gp); 2437 } 2438 2439 ASSERT(initialize == FLK_INIT_GRAPH); 2440 2441 if (lock_graph[index] == NULL) { 2442 2443 gp_alloc = kmem_zalloc(sizeof (graph_t), KM_SLEEP); 2444 2445 /* Initialize the graph */ 2446 2447 gp_alloc->active_locks.l_next = 2448 gp_alloc->active_locks.l_prev = 2449 (lock_descriptor_t *)ACTIVE_HEAD(gp_alloc); 2450 gp_alloc->sleeping_locks.l_next = 2451 gp_alloc->sleeping_locks.l_prev = 2452 (lock_descriptor_t *)SLEEPING_HEAD(gp_alloc); 2453 gp_alloc->index = index; 2454 mutex_init(&gp_alloc->gp_mutex, NULL, MUTEX_DEFAULT, NULL); 2455 } 2456 2457 mutex_enter(&flock_lock); 2458 2459 gp = lock_graph[index]; 2460 2461 /* Recheck the value within flock_lock */ 2462 if (gp == NULL) { 2463 struct flock_globals *fg; 2464 2465 /* We must have previously allocated the graph_t structure */ 2466 ASSERT(gp_alloc != NULL); 2467 lock_graph[index] = gp = gp_alloc; 2468 /* 2469 * The lockmgr status is only needed if KLM is loaded. 2470 */ 2471 if (flock_zone_key != ZONE_KEY_UNINITIALIZED) { 2472 fg = flk_get_globals(); 2473 fg->lockmgr_status[index] = fg->flk_lockmgr_status; 2474 } 2475 } 2476 2477 mutex_exit(&flock_lock); 2478 2479 if ((gp_alloc != NULL) && (gp != gp_alloc)) { 2480 /* There was a race to allocate the graph_t and we lost */ 2481 mutex_destroy(&gp_alloc->gp_mutex); 2482 kmem_free(gp_alloc, sizeof (graph_t)); 2483 } 2484 2485 return (gp); 2486 } 2487 2488 /* 2489 * PSARC case 1997/292 2490 */ 2491 int 2492 cl_flk_has_remote_locks_for_nlmid(vnode_t *vp, int nlmid) 2493 { 2494 lock_descriptor_t *lock; 2495 int result = 0; 2496 graph_t *gp; 2497 int lock_nlmid; 2498 2499 /* 2500 * Check to see if node is booted as a cluster. If not, return. 2501 */ 2502 if ((cluster_bootflags & CLUSTER_BOOTED) == 0) { 2503 return (0); 2504 } 2505 2506 gp = flk_get_lock_graph(vp, FLK_USE_GRAPH); 2507 if (gp == NULL) { 2508 return (0); 2509 } 2510 2511 mutex_enter(&gp->gp_mutex); 2512 2513 SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp); 2514 2515 if (lock) { 2516 while (lock->l_vnode == vp) { 2517 /* get NLM id from sysid */ 2518 lock_nlmid = GETNLMID(lock->l_flock.l_sysid); 2519 2520 /* 2521 * If NLM server request _and_ nlmid of lock matches 2522 * nlmid of argument, then we've found a remote lock. 2523 */ 2524 if (IS_LOCKMGR(lock) && nlmid == lock_nlmid) { 2525 result = 1; 2526 goto done; 2527 } 2528 lock = lock->l_next; 2529 } 2530 } 2531 2532 SET_LOCK_TO_FIRST_SLEEP_VP(gp, lock, vp); 2533 2534 if (lock) { 2535 while (lock->l_vnode == vp) { 2536 /* get NLM id from sysid */ 2537 lock_nlmid = GETNLMID(lock->l_flock.l_sysid); 2538 2539 /* 2540 * If NLM server request _and_ nlmid of lock matches 2541 * nlmid of argument, then we've found a remote lock. 2542 */ 2543 if (IS_LOCKMGR(lock) && nlmid == lock_nlmid) { 2544 result = 1; 2545 goto done; 2546 } 2547 lock = lock->l_next; 2548 } 2549 } 2550 2551 done: 2552 mutex_exit(&gp->gp_mutex); 2553 return (result); 2554 } 2555 2556 /* 2557 * Determine whether there are any locks for the given vnode with a remote 2558 * sysid. Returns zero if not, non-zero if there are. 2559 * 2560 * Note that the return value from this function is potentially invalid 2561 * once it has been returned. 
The caller is responsible for providing its 2562 * own synchronization mechanism to ensure that the return value is useful 2563 * (e.g., see nfs_lockcompletion()). 2564 */ 2565 int 2566 flk_has_remote_locks(vnode_t *vp) 2567 { 2568 lock_descriptor_t *lock; 2569 int result = 0; 2570 graph_t *gp; 2571 2572 gp = flk_get_lock_graph(vp, FLK_USE_GRAPH); 2573 if (gp == NULL) { 2574 return (0); 2575 } 2576 2577 mutex_enter(&gp->gp_mutex); 2578 2579 SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp); 2580 2581 if (lock) { 2582 while (lock->l_vnode == vp) { 2583 if (IS_REMOTE(lock)) { 2584 result = 1; 2585 goto done; 2586 } 2587 lock = lock->l_next; 2588 } 2589 } 2590 2591 SET_LOCK_TO_FIRST_SLEEP_VP(gp, lock, vp); 2592 2593 if (lock) { 2594 while (lock->l_vnode == vp) { 2595 if (IS_REMOTE(lock)) { 2596 result = 1; 2597 goto done; 2598 } 2599 lock = lock->l_next; 2600 } 2601 } 2602 2603 done: 2604 mutex_exit(&gp->gp_mutex); 2605 return (result); 2606 } 2607 2608 /* 2609 * Determine whether there are any locks for the given vnode with a remote 2610 * sysid matching given sysid. 2611 * Used by the new (open source) NFS Lock Manager (NLM) 2612 */ 2613 int 2614 flk_has_remote_locks_for_sysid(vnode_t *vp, int sysid) 2615 { 2616 lock_descriptor_t *lock; 2617 int result = 0; 2618 graph_t *gp; 2619 2620 if (sysid == 0) 2621 return (0); 2622 2623 gp = flk_get_lock_graph(vp, FLK_USE_GRAPH); 2624 if (gp == NULL) { 2625 return (0); 2626 } 2627 2628 mutex_enter(&gp->gp_mutex); 2629 2630 SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp); 2631 2632 if (lock) { 2633 while (lock->l_vnode == vp) { 2634 if (lock->l_flock.l_sysid == sysid) { 2635 result = 1; 2636 goto done; 2637 } 2638 lock = lock->l_next; 2639 } 2640 } 2641 2642 SET_LOCK_TO_FIRST_SLEEP_VP(gp, lock, vp); 2643 2644 if (lock) { 2645 while (lock->l_vnode == vp) { 2646 if (lock->l_flock.l_sysid == sysid) { 2647 result = 1; 2648 goto done; 2649 } 2650 lock = lock->l_next; 2651 } 2652 } 2653 2654 done: 2655 mutex_exit(&gp->gp_mutex); 2656 return (result); 2657 } 2658 2659 /* 2660 * Determine if there are any locks owned by the given sysid. 2661 * Returns zero if not, non-zero if there are. Note that this return code 2662 * could be derived from flk_get_{sleeping,active}_locks, but this routine 2663 * avoids all the memory allocations of those routines. 2664 * 2665 * This routine has the same synchronization issues as 2666 * flk_has_remote_locks. 2667 */ 2668 2669 int 2670 flk_sysid_has_locks(int sysid, int lck_type) 2671 { 2672 int has_locks = 0; 2673 lock_descriptor_t *lock; 2674 graph_t *gp; 2675 int i; 2676 2677 for (i = 0; i < HASH_SIZE && !has_locks; i++) { 2678 mutex_enter(&flock_lock); 2679 gp = lock_graph[i]; 2680 mutex_exit(&flock_lock); 2681 if (gp == NULL) { 2682 continue; 2683 } 2684 2685 mutex_enter(&gp->gp_mutex); 2686 2687 if (lck_type & FLK_QUERY_ACTIVE) { 2688 for (lock = ACTIVE_HEAD(gp)->l_next; 2689 lock != ACTIVE_HEAD(gp) && !has_locks; 2690 lock = lock->l_next) { 2691 if (lock->l_flock.l_sysid == sysid) 2692 has_locks = 1; 2693 } 2694 } 2695 2696 if (lck_type & FLK_QUERY_SLEEPING) { 2697 for (lock = SLEEPING_HEAD(gp)->l_next; 2698 lock != SLEEPING_HEAD(gp) && !has_locks; 2699 lock = lock->l_next) { 2700 if (lock->l_flock.l_sysid == sysid) 2701 has_locks = 1; 2702 } 2703 } 2704 mutex_exit(&gp->gp_mutex); 2705 } 2706 2707 return (has_locks); 2708 } 2709 2710 2711 /* 2712 * PSARC case 1997/292 2713 * 2714 * Requires: "sysid" is a pair [nlmid, sysid]. 
The lower half is 16-bit 2715 * quantity, the real sysid generated by the NLM server; the upper half 2716 * identifies the node of the cluster where the NLM server ran. 2717 * This routine is only called by an NLM server running in a cluster. 2718 * Effects: Remove all locks held on behalf of the client identified 2719 * by "sysid." 2720 */ 2721 void 2722 cl_flk_remove_locks_by_sysid(int sysid) 2723 { 2724 graph_t *gp; 2725 int i; 2726 lock_descriptor_t *lock, *nlock; 2727 2728 /* 2729 * Check to see if node is booted as a cluster. If not, return. 2730 */ 2731 if ((cluster_bootflags & CLUSTER_BOOTED) == 0) { 2732 return; 2733 } 2734 2735 ASSERT(sysid != 0); 2736 for (i = 0; i < HASH_SIZE; i++) { 2737 mutex_enter(&flock_lock); 2738 gp = lock_graph[i]; 2739 mutex_exit(&flock_lock); 2740 2741 if (gp == NULL) 2742 continue; 2743 2744 mutex_enter(&gp->gp_mutex); /* get mutex on lock graph */ 2745 2746 /* signal sleeping requests so that they bail out */ 2747 lock = SLEEPING_HEAD(gp)->l_next; 2748 while (lock != SLEEPING_HEAD(gp)) { 2749 nlock = lock->l_next; 2750 if (lock->l_flock.l_sysid == sysid) { 2751 INTERRUPT_WAKEUP(lock); 2752 } 2753 lock = nlock; 2754 } 2755 2756 /* delete active locks */ 2757 lock = ACTIVE_HEAD(gp)->l_next; 2758 while (lock != ACTIVE_HEAD(gp)) { 2759 nlock = lock->l_next; 2760 if (lock->l_flock.l_sysid == sysid) { 2761 flk_delete_active_lock(lock, 0); 2762 flk_wakeup(lock, 1); 2763 flk_free_lock(lock); 2764 } 2765 lock = nlock; 2766 } 2767 mutex_exit(&gp->gp_mutex); /* release mutex on lock graph */ 2768 } 2769 } 2770 2771 /* 2772 * Delete all locks in the system that belongs to the sysid of the request. 2773 */ 2774 2775 static void 2776 flk_delete_locks_by_sysid(lock_descriptor_t *request) 2777 { 2778 int sysid = request->l_flock.l_sysid; 2779 lock_descriptor_t *lock, *nlock; 2780 graph_t *gp; 2781 int i; 2782 2783 ASSERT(MUTEX_HELD(&request->l_graph->gp_mutex)); 2784 ASSERT(sysid != 0); 2785 2786 mutex_exit(&request->l_graph->gp_mutex); 2787 2788 for (i = 0; i < HASH_SIZE; i++) { 2789 mutex_enter(&flock_lock); 2790 gp = lock_graph[i]; 2791 mutex_exit(&flock_lock); 2792 2793 if (gp == NULL) 2794 continue; 2795 2796 mutex_enter(&gp->gp_mutex); 2797 2798 /* signal sleeping requests so that they bail out */ 2799 lock = SLEEPING_HEAD(gp)->l_next; 2800 while (lock != SLEEPING_HEAD(gp)) { 2801 nlock = lock->l_next; 2802 if (lock->l_flock.l_sysid == sysid) { 2803 INTERRUPT_WAKEUP(lock); 2804 } 2805 lock = nlock; 2806 } 2807 2808 /* delete active locks */ 2809 lock = ACTIVE_HEAD(gp)->l_next; 2810 while (lock != ACTIVE_HEAD(gp)) { 2811 nlock = lock->l_next; 2812 if (lock->l_flock.l_sysid == sysid) { 2813 flk_delete_active_lock(lock, 0); 2814 flk_wakeup(lock, 1); 2815 flk_free_lock(lock); 2816 } 2817 lock = nlock; 2818 } 2819 mutex_exit(&gp->gp_mutex); 2820 } 2821 2822 mutex_enter(&request->l_graph->gp_mutex); 2823 } 2824 2825 /* 2826 * Clustering: Deletes PXFS locks 2827 * Effects: Delete all locks on files in the given file system and with the 2828 * given PXFS id. 
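 *
 * Like cl_flk_remove_locks_by_sysid() above, this walks every lock
 * graph in the hash table.  Matching sleeping requests are cancelled
 * here via flk_set_state(FLK_CANCELLED_STATE) and
 * flk_cancel_sleeping_lock() (rather than by simply waking the
 * sleeper), and matching active locks are torn down with
 * flk_delete_active_lock(), flk_wakeup() and flk_free_lock(); only the
 * match criterion (vfsp plus PXFS id instead of sysid) differs.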
2829 */ 2830 void 2831 cl_flk_delete_pxfs_locks(struct vfs *vfsp, int pxfsid) 2832 { 2833 lock_descriptor_t *lock, *nlock; 2834 graph_t *gp; 2835 int i; 2836 2837 for (i = 0; i < HASH_SIZE; i++) { 2838 mutex_enter(&flock_lock); 2839 gp = lock_graph[i]; 2840 mutex_exit(&flock_lock); 2841 2842 if (gp == NULL) 2843 continue; 2844 2845 mutex_enter(&gp->gp_mutex); 2846 2847 /* signal sleeping requests so that they bail out */ 2848 lock = SLEEPING_HEAD(gp)->l_next; 2849 while (lock != SLEEPING_HEAD(gp)) { 2850 nlock = lock->l_next; 2851 if (lock->l_vnode->v_vfsp == vfsp) { 2852 ASSERT(IS_PXFS(lock)); 2853 if (GETPXFSID(lock->l_flock.l_sysid) == 2854 pxfsid) { 2855 flk_set_state(lock, 2856 FLK_CANCELLED_STATE); 2857 flk_cancel_sleeping_lock(lock, 1); 2858 } 2859 } 2860 lock = nlock; 2861 } 2862 2863 /* delete active locks */ 2864 lock = ACTIVE_HEAD(gp)->l_next; 2865 while (lock != ACTIVE_HEAD(gp)) { 2866 nlock = lock->l_next; 2867 if (lock->l_vnode->v_vfsp == vfsp) { 2868 ASSERT(IS_PXFS(lock)); 2869 if (GETPXFSID(lock->l_flock.l_sysid) == 2870 pxfsid) { 2871 flk_delete_active_lock(lock, 0); 2872 flk_wakeup(lock, 1); 2873 flk_free_lock(lock); 2874 } 2875 } 2876 lock = nlock; 2877 } 2878 mutex_exit(&gp->gp_mutex); 2879 } 2880 } 2881 2882 /* 2883 * Search for a sleeping lock manager lock which matches exactly this lock 2884 * request; if one is found, fake a signal to cancel it. 2885 * 2886 * Return 1 if a matching lock was found, 0 otherwise. 2887 */ 2888 2889 static int 2890 flk_canceled(lock_descriptor_t *request) 2891 { 2892 lock_descriptor_t *lock, *nlock; 2893 graph_t *gp = request->l_graph; 2894 vnode_t *vp = request->l_vnode; 2895 2896 ASSERT(MUTEX_HELD(&gp->gp_mutex)); 2897 ASSERT(IS_LOCKMGR(request)); 2898 SET_LOCK_TO_FIRST_SLEEP_VP(gp, lock, vp); 2899 2900 if (lock) { 2901 while (lock->l_vnode == vp) { 2902 nlock = lock->l_next; 2903 if (SAME_OWNER(lock, request) && 2904 lock->l_start == request->l_start && 2905 lock->l_end == request->l_end) { 2906 INTERRUPT_WAKEUP(lock); 2907 return (1); 2908 } 2909 lock = nlock; 2910 } 2911 } 2912 return (0); 2913 } 2914 2915 /* 2916 * Remove all non-OFD locks for the vnode belonging to the given pid and sysid. 2917 * That is, since OFD locks are pid-less we'll never match on the incoming 2918 * pid. OFD locks are removed earlier in the close() path via closef() and 2919 * ofdcleanlock(). 
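 *
 * To restate the matching rule used below: when pid == IGN_PID every
 * lock for the given sysid on this vnode is discarded regardless of
 * owning process; otherwise both the pid and the sysid must match.
 * Matching sleeping requests are woken with CANCEL_WAKEUP(); matching
 * active locks are unlinked during the scan, parked on link_stack, and
 * only woken up and freed once the scan of the active list is done.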
2920 */ 2921 void 2922 cleanlocks(vnode_t *vp, pid_t pid, int sysid) 2923 { 2924 graph_t *gp; 2925 lock_descriptor_t *lock, *nlock; 2926 lock_descriptor_t *link_stack; 2927 2928 STACK_INIT(link_stack); 2929 2930 gp = flk_get_lock_graph(vp, FLK_USE_GRAPH); 2931 2932 if (gp == NULL) 2933 return; 2934 mutex_enter(&gp->gp_mutex); 2935 2936 CHECK_SLEEPING_LOCKS(gp); 2937 CHECK_ACTIVE_LOCKS(gp); 2938 2939 SET_LOCK_TO_FIRST_SLEEP_VP(gp, lock, vp); 2940 2941 if (lock) { 2942 do { 2943 nlock = lock->l_next; 2944 if ((lock->l_flock.l_pid == pid || 2945 pid == IGN_PID) && 2946 lock->l_flock.l_sysid == sysid) { 2947 CANCEL_WAKEUP(lock); 2948 } 2949 lock = nlock; 2950 } while (lock->l_vnode == vp); 2951 } 2952 2953 SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp); 2954 2955 if (lock) { 2956 do { 2957 nlock = lock->l_next; 2958 if ((lock->l_flock.l_pid == pid || 2959 pid == IGN_PID) && 2960 lock->l_flock.l_sysid == sysid) { 2961 flk_delete_active_lock(lock, 0); 2962 STACK_PUSH(link_stack, lock, l_stack); 2963 } 2964 lock = nlock; 2965 } while (lock->l_vnode == vp); 2966 } 2967 2968 while ((lock = STACK_TOP(link_stack)) != NULL) { 2969 STACK_POP(link_stack, l_stack); 2970 flk_wakeup(lock, 1); 2971 flk_free_lock(lock); 2972 } 2973 2974 CHECK_SLEEPING_LOCKS(gp); 2975 CHECK_ACTIVE_LOCKS(gp); 2976 CHECK_OWNER_LOCKS(gp, pid, sysid, vp); 2977 mutex_exit(&gp->gp_mutex); 2978 } 2979 2980 2981 /* 2982 * Called from 'fs' read and write routines for files that have mandatory 2983 * locking enabled. 2984 */ 2985 2986 int 2987 chklock( 2988 struct vnode *vp, 2989 int iomode, 2990 u_offset_t offset, 2991 ssize_t len, 2992 int fmode, 2993 caller_context_t *ct) 2994 { 2995 register int i; 2996 struct flock64 bf; 2997 int error = 0; 2998 2999 bf.l_type = (iomode & FWRITE) ? F_WRLCK : F_RDLCK; 3000 bf.l_whence = 0; 3001 bf.l_start = offset; 3002 bf.l_len = len; 3003 if (ct == NULL) { 3004 bf.l_pid = curproc->p_pid; 3005 bf.l_sysid = 0; 3006 } else { 3007 bf.l_pid = ct->cc_pid; 3008 bf.l_sysid = ct->cc_sysid; 3009 } 3010 i = (fmode & (FNDELAY|FNONBLOCK)) ? INOFLCK : INOFLCK|SLPFLCK; 3011 if ((i = reclock(vp, &bf, i, 0, offset, NULL)) != 0 || 3012 bf.l_type != F_UNLCK) 3013 error = i ? i : EAGAIN; 3014 return (error); 3015 } 3016 3017 /* 3018 * convoff - converts the given data (start, whence) to the 3019 * given whence. 3020 */ 3021 int 3022 convoff(vp, lckdat, whence, offset) 3023 struct vnode *vp; 3024 struct flock64 *lckdat; 3025 int whence; 3026 offset_t offset; 3027 { 3028 int error; 3029 struct vattr vattr; 3030 3031 if ((lckdat->l_whence == 2) || (whence == 2)) { 3032 vattr.va_mask = AT_SIZE; 3033 if (error = VOP_GETATTR(vp, &vattr, 0, CRED(), NULL)) 3034 return (error); 3035 } 3036 3037 switch (lckdat->l_whence) { 3038 case 1: 3039 lckdat->l_start += offset; 3040 break; 3041 case 2: 3042 lckdat->l_start += vattr.va_size; 3043 /* FALLTHRU */ 3044 case 0: 3045 break; 3046 default: 3047 return (EINVAL); 3048 } 3049 3050 if (lckdat->l_start < 0) 3051 return (EINVAL); 3052 3053 switch (whence) { 3054 case 1: 3055 lckdat->l_start -= offset; 3056 break; 3057 case 2: 3058 lckdat->l_start -= vattr.va_size; 3059 /* FALLTHRU */ 3060 case 0: 3061 break; 3062 default: 3063 return (EINVAL); 3064 } 3065 3066 lckdat->l_whence = (short)whence; 3067 return (0); 3068 } 3069 3070 3071 /* proc_graph function definitions */ 3072 3073 /* 3074 * Function checks for deadlock due to the new 'lock'. If deadlock found 3075 * edges of this lock are freed and returned. 
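 *
 * In outline (a restatement of the code below, not new behaviour): the
 * lock's edges are first mirrored into the per-process graph 'pgraph',
 * then a depth first search is run from this process's vertex.
 * Encountering a vertex that has been arrived at but not yet departed
 * from (see the PROC_ARRIVED()/PROC_DEPARTED() tests) means the search
 * found a back edge, i.e. a cycle of processes each waiting on the
 * next; in that case the edges added for this lock are torn down again
 * and 1 is returned.  If the search completes without such an
 * encounter, 0 is returned and the request may safely go to sleep.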
3076 */ 3077 3078 static int 3079 flk_check_deadlock(lock_descriptor_t *lock) 3080 { 3081 proc_vertex_t *start_vertex, *pvertex; 3082 proc_vertex_t *dvertex; 3083 proc_edge_t *pep, *ppep; 3084 edge_t *ep, *nep; 3085 proc_vertex_t *process_stack; 3086 3087 /* 3088 * OFD style locks are not associated with any process so there is 3089 * no proc graph for these. Thus we cannot, and do not, do deadlock 3090 * detection. 3091 */ 3092 if (lock->l_ofd != NULL) 3093 return (0); 3094 3095 STACK_INIT(process_stack); 3096 3097 mutex_enter(&flock_lock); 3098 start_vertex = flk_get_proc_vertex(lock); 3099 ASSERT(start_vertex != NULL); 3100 3101 /* construct the edges from this process to other processes */ 3102 3103 ep = FIRST_ADJ(lock); 3104 while (ep != HEAD(lock)) { 3105 proc_vertex_t *adj_proc; 3106 3107 adj_proc = flk_get_proc_vertex(ep->to_vertex); 3108 for (pep = start_vertex->edge; pep != NULL; pep = pep->next) { 3109 if (pep->to_proc == adj_proc) { 3110 ASSERT(pep->refcount); 3111 pep->refcount++; 3112 break; 3113 } 3114 } 3115 if (pep == NULL) { 3116 pep = flk_get_proc_edge(); 3117 pep->to_proc = adj_proc; 3118 pep->refcount = 1; 3119 adj_proc->incount++; 3120 pep->next = start_vertex->edge; 3121 start_vertex->edge = pep; 3122 } 3123 ep = NEXT_ADJ(ep); 3124 } 3125 3126 ep = FIRST_IN(lock); 3127 3128 while (ep != HEAD(lock)) { 3129 proc_vertex_t *in_proc; 3130 3131 in_proc = flk_get_proc_vertex(ep->from_vertex); 3132 3133 for (pep = in_proc->edge; pep != NULL; pep = pep->next) { 3134 if (pep->to_proc == start_vertex) { 3135 ASSERT(pep->refcount); 3136 pep->refcount++; 3137 break; 3138 } 3139 } 3140 if (pep == NULL) { 3141 pep = flk_get_proc_edge(); 3142 pep->to_proc = start_vertex; 3143 pep->refcount = 1; 3144 start_vertex->incount++; 3145 pep->next = in_proc->edge; 3146 in_proc->edge = pep; 3147 } 3148 ep = NEXT_IN(ep); 3149 } 3150 3151 if (start_vertex->incount == 0) { 3152 mutex_exit(&flock_lock); 3153 return (0); 3154 } 3155 3156 flk_proc_graph_uncolor(); 3157 3158 start_vertex->p_sedge = start_vertex->edge; 3159 3160 STACK_PUSH(process_stack, start_vertex, p_stack); 3161 3162 while ((pvertex = STACK_TOP(process_stack)) != NULL) { 3163 for (pep = pvertex->p_sedge; pep != NULL; pep = pep->next) { 3164 dvertex = pep->to_proc; 3165 if (!PROC_ARRIVED(dvertex)) { 3166 STACK_PUSH(process_stack, dvertex, p_stack); 3167 dvertex->p_sedge = dvertex->edge; 3168 PROC_ARRIVE(pvertex); 3169 pvertex->p_sedge = pep->next; 3170 break; 3171 } 3172 if (!PROC_DEPARTED(dvertex)) 3173 goto deadlock; 3174 } 3175 if (pep == NULL) { 3176 PROC_DEPART(pvertex); 3177 STACK_POP(process_stack, p_stack); 3178 } 3179 } 3180 mutex_exit(&flock_lock); 3181 return (0); 3182 3183 deadlock: 3184 3185 /* we remove all lock edges and proc edges */ 3186 3187 ep = FIRST_ADJ(lock); 3188 while (ep != HEAD(lock)) { 3189 proc_vertex_t *adj_proc; 3190 adj_proc = flk_get_proc_vertex(ep->to_vertex); 3191 nep = NEXT_ADJ(ep); 3192 IN_LIST_REMOVE(ep); 3193 ADJ_LIST_REMOVE(ep); 3194 flk_free_edge(ep); 3195 ppep = start_vertex->edge; 3196 for (pep = start_vertex->edge; pep != NULL; ppep = pep, 3197 pep = ppep->next) { 3198 if (pep->to_proc == adj_proc) { 3199 pep->refcount--; 3200 if (pep->refcount == 0) { 3201 if (pep == ppep) { 3202 start_vertex->edge = pep->next; 3203 } else { 3204 ppep->next = pep->next; 3205 } 3206 adj_proc->incount--; 3207 flk_proc_release(adj_proc); 3208 flk_free_proc_edge(pep); 3209 } 3210 break; 3211 } 3212 } 3213 ep = nep; 3214 } 3215 ep = FIRST_IN(lock); 3216 while (ep != HEAD(lock)) { 3217 proc_vertex_t *in_proc; 
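		/*
		 * Mirror image of the loop above: strip the lock's
		 * remaining in-edges and drop the corresponding
		 * proc_edge reference that points from the peer
		 * process back at start_vertex, releasing the peer's
		 * vertex if nothing references it any more.
		 */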
3218 in_proc = flk_get_proc_vertex(ep->from_vertex); 3219 nep = NEXT_IN(ep); 3220 IN_LIST_REMOVE(ep); 3221 ADJ_LIST_REMOVE(ep); 3222 flk_free_edge(ep); 3223 ppep = in_proc->edge; 3224 for (pep = in_proc->edge; pep != NULL; ppep = pep, 3225 pep = ppep->next) { 3226 if (pep->to_proc == start_vertex) { 3227 pep->refcount--; 3228 if (pep->refcount == 0) { 3229 if (pep == ppep) { 3230 in_proc->edge = pep->next; 3231 } else { 3232 ppep->next = pep->next; 3233 } 3234 start_vertex->incount--; 3235 flk_proc_release(in_proc); 3236 flk_free_proc_edge(pep); 3237 } 3238 break; 3239 } 3240 } 3241 ep = nep; 3242 } 3243 flk_proc_release(start_vertex); 3244 mutex_exit(&flock_lock); 3245 return (1); 3246 } 3247 3248 /* 3249 * Get a proc vertex. If lock's pvertex value gets a correct proc vertex 3250 * from the list we return that, otherwise we allocate one. If necessary, 3251 * we grow the list of vertices also. 3252 */ 3253 3254 static proc_vertex_t * 3255 flk_get_proc_vertex(lock_descriptor_t *lock) 3256 { 3257 int i; 3258 proc_vertex_t *pv; 3259 proc_vertex_t **palloc; 3260 3261 ASSERT(MUTEX_HELD(&flock_lock)); 3262 if (lock->pvertex != -1) { 3263 ASSERT(lock->pvertex >= 0); 3264 pv = pgraph.proc[lock->pvertex]; 3265 if (pv != NULL && PROC_SAME_OWNER(lock, pv)) { 3266 return (pv); 3267 } 3268 } 3269 for (i = 0; i < pgraph.gcount; i++) { 3270 pv = pgraph.proc[i]; 3271 if (pv != NULL && PROC_SAME_OWNER(lock, pv)) { 3272 lock->pvertex = pv->index = i; 3273 return (pv); 3274 } 3275 } 3276 pv = kmem_zalloc(sizeof (struct proc_vertex), KM_SLEEP); 3277 pv->pid = lock->l_flock.l_pid; 3278 pv->sysid = lock->l_flock.l_sysid; 3279 flk_proc_vertex_allocs++; 3280 if (pgraph.free != 0) { 3281 for (i = 0; i < pgraph.gcount; i++) { 3282 if (pgraph.proc[i] == NULL) { 3283 pgraph.proc[i] = pv; 3284 lock->pvertex = pv->index = i; 3285 pgraph.free--; 3286 return (pv); 3287 } 3288 } 3289 } 3290 palloc = kmem_zalloc((pgraph.gcount + PROC_CHUNK) * 3291 sizeof (proc_vertex_t *), KM_SLEEP); 3292 3293 if (pgraph.proc) { 3294 bcopy(pgraph.proc, palloc, 3295 pgraph.gcount * sizeof (proc_vertex_t *)); 3296 3297 kmem_free(pgraph.proc, 3298 pgraph.gcount * sizeof (proc_vertex_t *)); 3299 } 3300 pgraph.proc = palloc; 3301 pgraph.free += (PROC_CHUNK - 1); 3302 pv->index = lock->pvertex = pgraph.gcount; 3303 pgraph.gcount += PROC_CHUNK; 3304 pgraph.proc[pv->index] = pv; 3305 return (pv); 3306 } 3307 3308 /* 3309 * Allocate a proc edge. 3310 */ 3311 3312 static proc_edge_t * 3313 flk_get_proc_edge() 3314 { 3315 proc_edge_t *pep; 3316 3317 pep = kmem_zalloc(sizeof (proc_edge_t), KM_SLEEP); 3318 flk_proc_edge_allocs++; 3319 return (pep); 3320 } 3321 3322 /* 3323 * Free the proc edge. Called whenever its reference count goes to zero. 3324 */ 3325 3326 static void 3327 flk_free_proc_edge(proc_edge_t *pep) 3328 { 3329 ASSERT(pep->refcount == 0); 3330 kmem_free((void *)pep, sizeof (proc_edge_t)); 3331 flk_proc_edge_frees++; 3332 } 3333 3334 /* 3335 * Color the graph explicitly done only when the mark value hits max value. 
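 *
 * As with flk_graph_uncolor() above, "uncoloring" normally just bumps
 * pgraph.mark: a process vertex is treated as visited only while its
 * atime/dtime match the current mark (which is how the
 * PROC_ARRIVE()/PROC_ARRIVED() style macros are expected to behave,
 * given the reset below), so incrementing the mark unvisits every
 * vertex at once.  The explicit walk that clears atime and dtime is
 * needed only when the mark is about to wrap at UINT_MAX.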
3336 */ 3337 3338 static void 3339 flk_proc_graph_uncolor() 3340 { 3341 int i; 3342 3343 if (pgraph.mark == UINT_MAX) { 3344 for (i = 0; i < pgraph.gcount; i++) 3345 if (pgraph.proc[i] != NULL) { 3346 pgraph.proc[i]->atime = 0; 3347 pgraph.proc[i]->dtime = 0; 3348 } 3349 pgraph.mark = 1; 3350 } else { 3351 pgraph.mark++; 3352 } 3353 } 3354 3355 /* 3356 * Release the proc vertex iff both there are no in edges and out edges 3357 */ 3358 3359 static void 3360 flk_proc_release(proc_vertex_t *proc) 3361 { 3362 ASSERT(MUTEX_HELD(&flock_lock)); 3363 if (proc->edge == NULL && proc->incount == 0) { 3364 pgraph.proc[proc->index] = NULL; 3365 pgraph.free++; 3366 kmem_free(proc, sizeof (proc_vertex_t)); 3367 flk_proc_vertex_frees++; 3368 } 3369 } 3370 3371 /* 3372 * Updates process graph to reflect change in a lock_graph. 3373 * Note: We should call this function only after we have a correctly 3374 * recomputed lock graph. Otherwise we might miss a deadlock detection. 3375 * eg: in function flk_relation() we call this function after flk_recompute_ 3376 * dependencies() otherwise if a process tries to lock a vnode hashed 3377 * into another graph it might sleep for ever. 3378 */ 3379 3380 static void 3381 flk_update_proc_graph(edge_t *ep, int delete) 3382 { 3383 proc_vertex_t *toproc, *fromproc; 3384 proc_edge_t *pep, *prevpep; 3385 3386 mutex_enter(&flock_lock); 3387 3388 /* 3389 * OFD style locks are not associated with any process so there is 3390 * no proc graph for these. 3391 */ 3392 if (ep->from_vertex->l_ofd != NULL) { 3393 mutex_exit(&flock_lock); 3394 return; 3395 } 3396 3397 toproc = flk_get_proc_vertex(ep->to_vertex); 3398 fromproc = flk_get_proc_vertex(ep->from_vertex); 3399 3400 if (!delete) 3401 goto add; 3402 pep = prevpep = fromproc->edge; 3403 3404 ASSERT(pep != NULL); 3405 while (pep != NULL) { 3406 if (pep->to_proc == toproc) { 3407 ASSERT(pep->refcount > 0); 3408 pep->refcount--; 3409 if (pep->refcount == 0) { 3410 if (pep == prevpep) { 3411 fromproc->edge = pep->next; 3412 } else { 3413 prevpep->next = pep->next; 3414 } 3415 toproc->incount--; 3416 flk_proc_release(toproc); 3417 flk_free_proc_edge(pep); 3418 } 3419 break; 3420 } 3421 prevpep = pep; 3422 pep = pep->next; 3423 } 3424 flk_proc_release(fromproc); 3425 mutex_exit(&flock_lock); 3426 return; 3427 add: 3428 3429 pep = fromproc->edge; 3430 3431 while (pep != NULL) { 3432 if (pep->to_proc == toproc) { 3433 ASSERT(pep->refcount > 0); 3434 pep->refcount++; 3435 break; 3436 } 3437 pep = pep->next; 3438 } 3439 if (pep == NULL) { 3440 pep = flk_get_proc_edge(); 3441 pep->to_proc = toproc; 3442 pep->refcount = 1; 3443 toproc->incount++; 3444 pep->next = fromproc->edge; 3445 fromproc->edge = pep; 3446 } 3447 mutex_exit(&flock_lock); 3448 } 3449 3450 /* 3451 * Set the control status for lock manager requests. 3452 * 3453 */ 3454 3455 /* 3456 * PSARC case 1997/292 3457 * 3458 * Requires: "nlmid" must be >= 1 and <= clconf_maximum_nodeid(). 3459 * Effects: Set the state of the NLM server identified by "nlmid" 3460 * in the NLM registry to state "nlm_state." 3461 * Raises exception no_such_nlm if "nlmid" doesn't identify a known 3462 * NLM server to this LLM. 3463 * Note that when this routine is called with NLM_SHUTTING_DOWN there 3464 * may be locks requests that have gotten started but not finished. In 3465 * particular, there may be blocking requests that are in the callback code 3466 * before sleeping (so they're not holding the lock for the graph). 
If 3467 * such a thread reacquires the graph's lock (to go to sleep) after 3468 * NLM state in the NLM registry is set to a non-up value, 3469 * it will notice the status and bail out. If the request gets 3470 * granted before the thread can check the NLM registry, let it 3471 * continue normally. It will get flushed when we are called with NLM_DOWN. 3472 * 3473 * Modifies: nlm_reg_obj (global) 3474 * Arguments: 3475 * nlmid (IN): id uniquely identifying an NLM server 3476 * nlm_state (IN): NLM server state to change "nlmid" to 3477 */ 3478 void 3479 cl_flk_set_nlm_status(int nlmid, flk_nlm_status_t nlm_state) 3480 { 3481 /* 3482 * Check to see if node is booted as a cluster. If not, return. 3483 */ 3484 if ((cluster_bootflags & CLUSTER_BOOTED) == 0) { 3485 return; 3486 } 3487 3488 /* 3489 * Check for development/debugging. It is possible to boot a node 3490 * in non-cluster mode, and then run a special script, currently 3491 * available only to developers, to bring up the node as part of a 3492 * cluster. The problem is that running such a script does not 3493 * result in the routine flk_init() being called and hence global array 3494 * nlm_reg_status is NULL. The NLM thinks it's in cluster mode, 3495 * but the LLM needs to do an additional check to see if the global 3496 * array has been created or not. If nlm_reg_status is NULL, then 3497 * return, else continue. 3498 */ 3499 if (nlm_reg_status == NULL) { 3500 return; 3501 } 3502 3503 ASSERT(nlmid <= nlm_status_size && nlmid >= 0); 3504 mutex_enter(&nlm_reg_lock); 3505 3506 if (FLK_REGISTRY_IS_NLM_UNKNOWN(nlm_reg_status, nlmid)) { 3507 /* 3508 * If the NLM server "nlmid" is unknown in the NLM registry, 3509 * add it to the registry in the nlm shutting down state. 3510 */ 3511 FLK_REGISTRY_CHANGE_NLM_STATE(nlm_reg_status, nlmid, 3512 FLK_NLM_SHUTTING_DOWN); 3513 } else { 3514 /* 3515 * Change the state of the NLM server identified by "nlmid" 3516 * in the NLM registry to the argument "nlm_state." 3517 */ 3518 FLK_REGISTRY_CHANGE_NLM_STATE(nlm_reg_status, nlmid, 3519 nlm_state); 3520 } 3521 3522 /* 3523 * The reason we must register the NLM server that is shutting down 3524 * with an LLM that doesn't already know about it (never sent a lock 3525 * request) is to handle correctly a race between shutdown and a new 3526 * lock request. Suppose that a shutdown request from the NLM server 3527 * invokes this routine at the LLM, and a thread is spawned to 3528 * service the request. Now suppose a new lock request is in 3529 * progress and has already passed the first line of defense in 3530 * reclock(), which denies new locks requests from NLM servers 3531 * that are not in the NLM_UP state. After the current routine 3532 * is invoked for both phases of shutdown, the routine will return, 3533 * having done nothing, and the lock request will proceed and 3534 * probably be granted. The problem is that the shutdown was ignored 3535 * by the lock request because there was no record of that NLM server 3536 * shutting down. We will be in the peculiar position of thinking 3537 * that we've shutdown the NLM server and all locks at all LLMs have 3538 * been discarded, but in fact there's still one lock held. 3539 * The solution is to record the existence of NLM server and change 3540 * its state immediately to NLM_SHUTTING_DOWN. The lock request in 3541 * progress may proceed because the next phase NLM_DOWN will catch 3542 * this lock and discard it. 
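	 *
	 * To summarize the three phases handled by the switch below:
	 * FLK_NLM_UP re-tags every lock still held on behalf of this
	 * NLM server, FLK_NLM_SHUTTING_DOWN wakes up (and thereby
	 * cancels) its sleeping requests, and FLK_NLM_DOWN discards
	 * its granted locks.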
3543 */ 3544 mutex_exit(&nlm_reg_lock); 3545 3546 switch (nlm_state) { 3547 case FLK_NLM_UP: 3548 /* 3549 * Change the NLM state of all locks still held on behalf of 3550 * the NLM server identified by "nlmid" to NLM_UP. 3551 */ 3552 cl_flk_change_nlm_state_all_locks(nlmid, FLK_NLM_UP); 3553 break; 3554 3555 case FLK_NLM_SHUTTING_DOWN: 3556 /* 3557 * Wake up all sleeping locks for the NLM server identified 3558 * by "nlmid." Note that eventually all woken threads will 3559 * have their lock requests cancelled and descriptors 3560 * removed from the sleeping lock list. Note that the NLM 3561 * server state associated with each lock descriptor is 3562 * changed to FLK_NLM_SHUTTING_DOWN. 3563 */ 3564 cl_flk_wakeup_sleeping_nlm_locks(nlmid); 3565 break; 3566 3567 case FLK_NLM_DOWN: 3568 /* 3569 * Discard all active, granted locks for this NLM server 3570 * identified by "nlmid." 3571 */ 3572 cl_flk_unlock_nlm_granted(nlmid); 3573 break; 3574 3575 default: 3576 panic("cl_set_nlm_status: bad status (%d)", nlm_state); 3577 } 3578 } 3579 3580 /* 3581 * Set the control status for lock manager requests. 3582 * 3583 * Note that when this routine is called with FLK_WAKEUP_SLEEPERS, there 3584 * may be locks requests that have gotten started but not finished. In 3585 * particular, there may be blocking requests that are in the callback code 3586 * before sleeping (so they're not holding the lock for the graph). If 3587 * such a thread reacquires the graph's lock (to go to sleep) after 3588 * flk_lockmgr_status is set to a non-up value, it will notice the status 3589 * and bail out. If the request gets granted before the thread can check 3590 * flk_lockmgr_status, let it continue normally. It will get flushed when 3591 * we are called with FLK_LOCKMGR_DOWN. 3592 */ 3593 3594 void 3595 flk_set_lockmgr_status(flk_lockmgr_status_t status) 3596 { 3597 int i; 3598 graph_t *gp; 3599 struct flock_globals *fg; 3600 3601 fg = flk_get_globals(); 3602 ASSERT(fg != NULL); 3603 3604 mutex_enter(&flock_lock); 3605 fg->flk_lockmgr_status = status; 3606 mutex_exit(&flock_lock); 3607 3608 /* 3609 * If the lock manager is coming back up, all that's needed is to 3610 * propagate this information to the graphs. If the lock manager 3611 * is going down, additional action is required, and each graph's 3612 * copy of the state is updated atomically with this other action. 3613 */ 3614 switch (status) { 3615 case FLK_LOCKMGR_UP: 3616 for (i = 0; i < HASH_SIZE; i++) { 3617 mutex_enter(&flock_lock); 3618 gp = lock_graph[i]; 3619 mutex_exit(&flock_lock); 3620 if (gp == NULL) 3621 continue; 3622 mutex_enter(&gp->gp_mutex); 3623 fg->lockmgr_status[i] = status; 3624 mutex_exit(&gp->gp_mutex); 3625 } 3626 break; 3627 case FLK_WAKEUP_SLEEPERS: 3628 wakeup_sleeping_lockmgr_locks(fg); 3629 break; 3630 case FLK_LOCKMGR_DOWN: 3631 unlock_lockmgr_granted(fg); 3632 break; 3633 default: 3634 panic("flk_set_lockmgr_status: bad status (%d)", status); 3635 break; 3636 } 3637 } 3638 3639 /* 3640 * This routine returns all the locks that are active or sleeping and are 3641 * associated with a particular set of identifiers. If lock_state != 0, then 3642 * only locks that match the lock_state are returned. If lock_state == 0, then 3643 * all locks are returned. If pid == NOPID, the pid is ignored. If 3644 * use_sysid is FALSE, then the sysid is ignored. If vp is NULL, then the 3645 * vnode pointer is ignored. 3646 * 3647 * A list containing the vnode pointer and an flock structure 3648 * describing the lock is returned. 
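 *
 * For example, a caller wanting a snapshot of every active lock held by
 * a particular sysid could do something along these lines (illustrative
 * only; examine_lock is a hypothetical consumer, and
 * flk_get_active_locks() below is just such a wrapper around this
 * routine):
 *
 *	locklist_t *llhead, *llp;
 *
 *	llhead = flk_get_active_locks(sysid, NOPID);
 *	for (llp = llhead; llp != NULL; llp = llp->ll_next)
 *		examine_lock(llp->ll_vp, &llp->ll_flock);
 *	flk_free_locklist(llhead);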
Each element in the list is 3649 * dynamically allocated and must be freed by the caller. The 3650 * last item in the list is denoted by a NULL value in the ll_next 3651 * field. 3652 * 3653 * The vnode pointers returned are held. The caller is responsible 3654 * for releasing these. Note that the returned list is only a snapshot of 3655 * the current lock information, and that it is a snapshot of a moving 3656 * target (only one graph is locked at a time). 3657 */ 3658 3659 locklist_t * 3660 get_lock_list(int list_type, int lock_state, int sysid, boolean_t use_sysid, 3661 pid_t pid, const vnode_t *vp, zoneid_t zoneid) 3662 { 3663 lock_descriptor_t *lock; 3664 lock_descriptor_t *graph_head; 3665 locklist_t listhead; 3666 locklist_t *llheadp; 3667 locklist_t *llp; 3668 locklist_t *lltp; 3669 graph_t *gp; 3670 int i; 3671 int first_index; /* graph index */ 3672 int num_indexes; /* graph index */ 3673 3674 ASSERT((list_type == FLK_ACTIVE_STATE) || 3675 (list_type == FLK_SLEEPING_STATE)); 3676 3677 /* 3678 * Get a pointer to something to use as a list head while building 3679 * the rest of the list. 3680 */ 3681 llheadp = &listhead; 3682 lltp = llheadp; 3683 llheadp->ll_next = (locklist_t *)NULL; 3684 3685 /* Figure out which graphs we want to look at. */ 3686 if (vp == NULL) { 3687 first_index = 0; 3688 num_indexes = HASH_SIZE; 3689 } else { 3690 first_index = HASH_INDEX(vp); 3691 num_indexes = 1; 3692 } 3693 3694 for (i = first_index; i < first_index + num_indexes; i++) { 3695 mutex_enter(&flock_lock); 3696 gp = lock_graph[i]; 3697 mutex_exit(&flock_lock); 3698 if (gp == NULL) { 3699 continue; 3700 } 3701 3702 mutex_enter(&gp->gp_mutex); 3703 graph_head = (list_type == FLK_ACTIVE_STATE) ? 3704 ACTIVE_HEAD(gp) : SLEEPING_HEAD(gp); 3705 for (lock = graph_head->l_next; 3706 lock != graph_head; 3707 lock = lock->l_next) { 3708 if (use_sysid && lock->l_flock.l_sysid != sysid) 3709 continue; 3710 if (pid != NOPID && lock->l_flock.l_pid != pid) 3711 continue; 3712 if (vp != NULL && lock->l_vnode != vp) 3713 continue; 3714 if (lock_state && !(lock_state & lock->l_state)) 3715 continue; 3716 if (zoneid != lock->l_zoneid && zoneid != ALL_ZONES) 3717 continue; 3718 /* 3719 * A matching lock was found. Allocate 3720 * space for a new locklist entry and fill 3721 * it in. 3722 */ 3723 llp = kmem_alloc(sizeof (locklist_t), KM_SLEEP); 3724 lltp->ll_next = llp; 3725 VN_HOLD(lock->l_vnode); 3726 llp->ll_vp = lock->l_vnode; 3727 create_flock(lock, &(llp->ll_flock)); 3728 llp->ll_next = (locklist_t *)NULL; 3729 lltp = llp; 3730 } 3731 mutex_exit(&gp->gp_mutex); 3732 } 3733 3734 llp = llheadp->ll_next; 3735 return (llp); 3736 } 3737 3738 /* 3739 * These two functions are simply interfaces to get_lock_list. They return 3740 * a list of sleeping or active locks for the given sysid and pid. See 3741 * get_lock_list for details. 3742 * 3743 * In either case we don't particularly care to specify the zone of interest; 3744 * the sysid-space is global across zones, so the sysid will map to exactly one 3745 * zone, and we'll return information for that zone. 3746 */ 3747 3748 locklist_t * 3749 flk_get_sleeping_locks(int sysid, pid_t pid) 3750 { 3751 return (get_lock_list(FLK_SLEEPING_STATE, 0, sysid, B_TRUE, pid, NULL, 3752 ALL_ZONES)); 3753 } 3754 3755 locklist_t * 3756 flk_get_active_locks(int sysid, pid_t pid) 3757 { 3758 return (get_lock_list(FLK_ACTIVE_STATE, 0, sysid, B_TRUE, pid, NULL, 3759 ALL_ZONES)); 3760 } 3761 3762 /* 3763 * Another interface to get_lock_list. 
This one returns all the active 3764 * locks for a given vnode. Again, see get_lock_list for details. 3765 * 3766 * We don't need to specify which zone's locks we're interested in. The matter 3767 * would only be interesting if the vnode belonged to NFS, and NFS vnodes can't 3768 * be used by multiple zones, so the list of locks will all be from the right 3769 * zone. 3770 */ 3771 3772 locklist_t * 3773 flk_active_locks_for_vp(const vnode_t *vp) 3774 { 3775 return (get_lock_list(FLK_ACTIVE_STATE, 0, 0, B_FALSE, NOPID, vp, 3776 ALL_ZONES)); 3777 } 3778 3779 /* 3780 * Another interface to get_lock_list. This one returns all the active 3781 * nbmand locks for a given vnode. Again, see get_lock_list for details. 3782 * 3783 * See the comment for flk_active_locks_for_vp() for why we don't care to 3784 * specify the particular zone of interest. 3785 */ 3786 locklist_t * 3787 flk_active_nbmand_locks_for_vp(const vnode_t *vp) 3788 { 3789 return (get_lock_list(FLK_ACTIVE_STATE, NBMAND_LOCK, 0, B_FALSE, 3790 NOPID, vp, ALL_ZONES)); 3791 } 3792 3793 /* 3794 * Another interface to get_lock_list. This one returns all the active 3795 * nbmand locks for a given pid. Again, see get_lock_list for details. 3796 * 3797 * The zone doesn't need to be specified here; the locks held by a 3798 * particular process will either be local (ie, non-NFS) or from the zone 3799 * the process is executing in. This is because other parts of the system 3800 * ensure that an NFS vnode can't be used in a zone other than that in 3801 * which it was opened. 3802 */ 3803 locklist_t * 3804 flk_active_nbmand_locks(pid_t pid) 3805 { 3806 return (get_lock_list(FLK_ACTIVE_STATE, NBMAND_LOCK, 0, B_FALSE, 3807 pid, NULL, ALL_ZONES)); 3808 } 3809 3810 /* 3811 * Free up all entries in the locklist. 3812 */ 3813 void 3814 flk_free_locklist(locklist_t *llp) 3815 { 3816 locklist_t *next_llp; 3817 3818 while (llp) { 3819 next_llp = llp->ll_next; 3820 VN_RELE(llp->ll_vp); 3821 kmem_free(llp, sizeof (*llp)); 3822 llp = next_llp; 3823 } 3824 } 3825 3826 static void 3827 cl_flk_change_nlm_state_all_locks(int nlmid, flk_nlm_status_t nlm_state) 3828 { 3829 /* 3830 * For each graph "lg" in the hash table lock_graph do 3831 * a. Get the list of sleeping locks 3832 * b. For each lock descriptor in the list do 3833 * i. If the requested lock is an NLM server request AND 3834 * the nlmid is the same as the routine argument then 3835 * change the lock descriptor's state field to 3836 * "nlm_state." 3837 * c. Get the list of active locks 3838 * d. For each lock descriptor in the list do 3839 * i. If the requested lock is an NLM server request AND 3840 * the nlmid is the same as the routine argument then 3841 * change the lock descriptor's state field to 3842 * "nlm_state." 3843 */ 3844 3845 int i; 3846 graph_t *gp; /* lock graph */ 3847 lock_descriptor_t *lock; /* lock */ 3848 lock_descriptor_t *nlock = NULL; /* next lock */ 3849 int lock_nlmid; 3850 3851 for (i = 0; i < HASH_SIZE; i++) { 3852 mutex_enter(&flock_lock); 3853 gp = lock_graph[i]; 3854 mutex_exit(&flock_lock); 3855 if (gp == NULL) { 3856 continue; 3857 } 3858 3859 /* Get list of sleeping locks in current lock graph. 
*/ 3860 mutex_enter(&gp->gp_mutex); 3861 for (lock = SLEEPING_HEAD(gp)->l_next; 3862 lock != SLEEPING_HEAD(gp); 3863 lock = nlock) { 3864 nlock = lock->l_next; 3865 /* get NLM id */ 3866 lock_nlmid = GETNLMID(lock->l_flock.l_sysid); 3867 3868 /* 3869 * If NLM server request AND nlmid of lock matches 3870 * nlmid of argument, then set the NLM state of the 3871 * lock to "nlm_state." 3872 */ 3873 if (IS_LOCKMGR(lock) && nlmid == lock_nlmid) { 3874 SET_NLM_STATE(lock, nlm_state); 3875 } 3876 } 3877 3878 /* Get list of active locks in current lock graph. */ 3879 for (lock = ACTIVE_HEAD(gp)->l_next; 3880 lock != ACTIVE_HEAD(gp); 3881 lock = nlock) { 3882 nlock = lock->l_next; 3883 /* get NLM id */ 3884 lock_nlmid = GETNLMID(lock->l_flock.l_sysid); 3885 3886 /* 3887 * If NLM server request AND nlmid of lock matches 3888 * nlmid of argument, then set the NLM state of the 3889 * lock to "nlm_state." 3890 */ 3891 if (IS_LOCKMGR(lock) && nlmid == lock_nlmid) { 3892 ASSERT(IS_ACTIVE(lock)); 3893 SET_NLM_STATE(lock, nlm_state); 3894 } 3895 } 3896 mutex_exit(&gp->gp_mutex); 3897 } 3898 } 3899 3900 /* 3901 * Requires: "nlmid" >= 1 and <= clconf_maximum_nodeid(). 3902 * Effects: Find all sleeping lock manager requests _only_ for the NLM server 3903 * identified by "nlmid." Poke those lock requests. 3904 */ 3905 static void 3906 cl_flk_wakeup_sleeping_nlm_locks(int nlmid) 3907 { 3908 lock_descriptor_t *lock; 3909 lock_descriptor_t *nlock = NULL; /* next lock */ 3910 int i; 3911 graph_t *gp; 3912 int lock_nlmid; 3913 3914 for (i = 0; i < HASH_SIZE; i++) { 3915 mutex_enter(&flock_lock); 3916 gp = lock_graph[i]; 3917 mutex_exit(&flock_lock); 3918 if (gp == NULL) { 3919 continue; 3920 } 3921 3922 mutex_enter(&gp->gp_mutex); 3923 for (lock = SLEEPING_HEAD(gp)->l_next; 3924 lock != SLEEPING_HEAD(gp); 3925 lock = nlock) { 3926 nlock = lock->l_next; 3927 /* 3928 * If NLM server request _and_ nlmid of lock matches 3929 * nlmid of argument, then set the NLM state of the 3930 * lock to NLM_SHUTTING_DOWN, and wake up sleeping 3931 * request. 3932 */ 3933 if (IS_LOCKMGR(lock)) { 3934 /* get NLM id */ 3935 lock_nlmid = 3936 GETNLMID(lock->l_flock.l_sysid); 3937 if (nlmid == lock_nlmid) { 3938 SET_NLM_STATE(lock, 3939 FLK_NLM_SHUTTING_DOWN); 3940 INTERRUPT_WAKEUP(lock); 3941 } 3942 } 3943 } 3944 mutex_exit(&gp->gp_mutex); 3945 } 3946 } 3947 3948 /* 3949 * Requires: "nlmid" >= 1 and <= clconf_maximum_nodeid() 3950 * Effects: Find all active (granted) lock manager locks _only_ for the 3951 * NLM server identified by "nlmid" and release them. 3952 */ 3953 static void 3954 cl_flk_unlock_nlm_granted(int nlmid) 3955 { 3956 lock_descriptor_t *lock; 3957 lock_descriptor_t *nlock = NULL; /* next lock */ 3958 int i; 3959 graph_t *gp; 3960 int lock_nlmid; 3961 3962 for (i = 0; i < HASH_SIZE; i++) { 3963 mutex_enter(&flock_lock); 3964 gp = lock_graph[i]; 3965 mutex_exit(&flock_lock); 3966 if (gp == NULL) { 3967 continue; 3968 } 3969 3970 mutex_enter(&gp->gp_mutex); 3971 for (lock = ACTIVE_HEAD(gp)->l_next; 3972 lock != ACTIVE_HEAD(gp); 3973 lock = nlock) { 3974 nlock = lock->l_next; 3975 ASSERT(IS_ACTIVE(lock)); 3976 3977 /* 3978 * If it's an NLM server request _and_ nlmid of 3979 * the lock matches nlmid of argument, then 3980 * remove the active lock the list, wakup blocked 3981 * threads, and free the storage for the lock. 3982 * Note that there's no need to mark the NLM state 3983 * of this lock to NLM_DOWN because the lock will 3984 * be deleted anyway and its storage freed. 
3985 */ 3986 if (IS_LOCKMGR(lock)) { 3987 /* get NLM id */ 3988 lock_nlmid = GETNLMID(lock->l_flock.l_sysid); 3989 if (nlmid == lock_nlmid) { 3990 flk_delete_active_lock(lock, 0); 3991 flk_wakeup(lock, 1); 3992 flk_free_lock(lock); 3993 } 3994 } 3995 } 3996 mutex_exit(&gp->gp_mutex); 3997 } 3998 } 3999 4000 /* 4001 * Find all sleeping lock manager requests and poke them. 4002 */ 4003 static void 4004 wakeup_sleeping_lockmgr_locks(struct flock_globals *fg) 4005 { 4006 lock_descriptor_t *lock; 4007 lock_descriptor_t *nlock = NULL; /* next lock */ 4008 int i; 4009 graph_t *gp; 4010 zoneid_t zoneid = getzoneid(); 4011 4012 for (i = 0; i < HASH_SIZE; i++) { 4013 mutex_enter(&flock_lock); 4014 gp = lock_graph[i]; 4015 mutex_exit(&flock_lock); 4016 if (gp == NULL) { 4017 continue; 4018 } 4019 4020 mutex_enter(&gp->gp_mutex); 4021 fg->lockmgr_status[i] = FLK_WAKEUP_SLEEPERS; 4022 for (lock = SLEEPING_HEAD(gp)->l_next; 4023 lock != SLEEPING_HEAD(gp); 4024 lock = nlock) { 4025 nlock = lock->l_next; 4026 if (IS_LOCKMGR(lock) && lock->l_zoneid == zoneid) { 4027 INTERRUPT_WAKEUP(lock); 4028 } 4029 } 4030 mutex_exit(&gp->gp_mutex); 4031 } 4032 } 4033 4034 4035 /* 4036 * Find all active (granted) lock manager locks and release them. 4037 */ 4038 static void 4039 unlock_lockmgr_granted(struct flock_globals *fg) 4040 { 4041 lock_descriptor_t *lock; 4042 lock_descriptor_t *nlock = NULL; /* next lock */ 4043 int i; 4044 graph_t *gp; 4045 zoneid_t zoneid = getzoneid(); 4046 4047 for (i = 0; i < HASH_SIZE; i++) { 4048 mutex_enter(&flock_lock); 4049 gp = lock_graph[i]; 4050 mutex_exit(&flock_lock); 4051 if (gp == NULL) { 4052 continue; 4053 } 4054 4055 mutex_enter(&gp->gp_mutex); 4056 fg->lockmgr_status[i] = FLK_LOCKMGR_DOWN; 4057 for (lock = ACTIVE_HEAD(gp)->l_next; 4058 lock != ACTIVE_HEAD(gp); 4059 lock = nlock) { 4060 nlock = lock->l_next; 4061 if (IS_LOCKMGR(lock) && lock->l_zoneid == zoneid) { 4062 ASSERT(IS_ACTIVE(lock)); 4063 flk_delete_active_lock(lock, 0); 4064 flk_wakeup(lock, 1); 4065 flk_free_lock(lock); 4066 } 4067 } 4068 mutex_exit(&gp->gp_mutex); 4069 } 4070 } 4071 4072 4073 /* 4074 * Wait until a lock is granted, cancelled, or interrupted. 4075 */ 4076 4077 static void 4078 wait_for_lock(lock_descriptor_t *request) 4079 { 4080 graph_t *gp = request->l_graph; 4081 4082 ASSERT(MUTEX_HELD(&gp->gp_mutex)); 4083 4084 while (!(IS_GRANTED(request)) && !(IS_CANCELLED(request)) && 4085 !(IS_INTERRUPTED(request))) { 4086 if (!cv_wait_sig(&request->l_cv, &gp->gp_mutex)) { 4087 flk_set_state(request, FLK_INTERRUPTED_STATE); 4088 request->l_state |= INTERRUPTED_LOCK; 4089 } 4090 } 4091 } 4092 4093 /* 4094 * Create an flock structure from the existing lock information 4095 * 4096 * This routine is used to create flock structures for the lock manager 4097 * to use in a reclaim request. Since the lock was originated on this 4098 * host, it must be conforming to UNIX semantics, so no checking is 4099 * done to make sure it falls within the lower half of the 32-bit range. 4100 */ 4101 4102 static void 4103 create_flock(lock_descriptor_t *lp, flock64_t *flp) 4104 { 4105 ASSERT(lp->l_end == MAX_U_OFFSET_T || lp->l_end <= MAXEND); 4106 ASSERT(lp->l_end >= lp->l_start); 4107 4108 flp->l_type = lp->l_type; 4109 flp->l_whence = 0; 4110 flp->l_start = lp->l_start; 4111 flp->l_len = (lp->l_end == MAX_U_OFFSET_T) ? 

/*
 * Convert flock_t data describing a lock range into unsigned long starting
 * and ending points, which are put into lock_request.  Returns 0 or an
 * errno value.
 * Large Files: max is passed by the caller and we return EOVERFLOW
 * as defined by LFS API.
 */

int
flk_convert_lock_data(vnode_t *vp, flock64_t *flp,
    u_offset_t *start, u_offset_t *end, offset_t offset)
{
	struct vattr vattr;
	int error;

	/*
	 * Determine the starting point of the request.
	 */
	switch (flp->l_whence) {
	case 0:		/* SEEK_SET */
		*start = (u_offset_t)flp->l_start;
		break;
	case 1:		/* SEEK_CUR */
		*start = (u_offset_t)(flp->l_start + offset);
		break;
	case 2:		/* SEEK_END */
		vattr.va_mask = AT_SIZE;
		if (error = VOP_GETATTR(vp, &vattr, 0, CRED(), NULL))
			return (error);
		*start = (u_offset_t)(flp->l_start + vattr.va_size);
		break;
	default:
		return (EINVAL);
	}

	/*
	 * Determine the range covered by the request.
	 */
	if (flp->l_len == 0)
		*end = MAX_U_OFFSET_T;
	else if ((offset_t)flp->l_len > 0) {
		*end = (u_offset_t)(*start + (flp->l_len - 1));
	} else {
		/*
		 * Negative length; why do we even allow this?
		 * Because this allows easy specification of
		 * the last n bytes of the file.
		 */
		*end = *start;
		*start += (u_offset_t)flp->l_len;
		(*start)++;
	}
	return (0);
}

/*
 * Check the validity of lock data.  This can be used by the NFS
 * frlock routines to check data before contacting the server.  The
 * server must support semantics that aren't as restrictive as
 * the UNIX API, so the NFS client is required to check.
 * The maximum is now passed in by the caller.
 */

int
flk_check_lock_data(u_offset_t start, u_offset_t end, offset_t max)
{
	/*
	 * The end (length) for local locking should never be greater
	 * than MAXEND.  However, the representation for
	 * the entire file is MAX_U_OFFSET_T.
	 */
	if ((start > max) ||
	    ((end > max) && (end != MAX_U_OFFSET_T))) {
		return (EINVAL);
	}
	if (start > end) {
		return (EINVAL);
	}
	return (0);
}
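
/*
 * Worked example for the two routines above (illustrative, tracing the
 * code rather than specifying new behavior): with l_whence == 0 (SEEK_SET),
 * l_start == 100 and l_len == 50, flk_convert_lock_data() yields the range
 * [100, 149]; with l_len == 0 the end becomes MAX_U_OFFSET_T, i.e. "to
 * EOF."  A negative length names the bytes just before the computed start:
 * l_start == 1000 with l_len == -100 sets end to 1000 and pulls start back
 * to 1000 - 100 + 1 == 901, a 100-byte range ending at offset 1000.
 * flk_check_lock_data() then returns EINVAL if start exceeds the caller's
 * max, if end exceeds max without being the whole-file value, or if start
 * lies beyond end; otherwise it returns 0.
 */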

/*
 * Fill in request->l_flock with information about the lock blocking the
 * request.  The complexity here is that lock manager requests are allowed
 * to see into the upper part of the 32-bit address range, whereas local
 * requests are only allowed to see signed values.
 *
 * What should be done when "blocker" is a lock manager lock that uses the
 * upper portion of the 32-bit range, but "request" is local?  Since the
 * request has already been determined to have been blocked by the blocker,
 * at least some portion of "blocker" must be in the range of the request,
 * or the request extends to the end of file.  For the first case, the
 * portion in the lower range is returned with the indication that it goes
 * "to EOF."  For the second case, the last byte of the lower range is
 * returned with the indication that it goes "to EOF."
 */

static void
report_blocker(lock_descriptor_t *blocker, lock_descriptor_t *request)
{
	flock64_t *flrp;	/* l_flock portion of request */

	ASSERT(blocker != NULL);

	flrp = &request->l_flock;
	flrp->l_whence = 0;
	flrp->l_type = blocker->l_type;
	flrp->l_pid = blocker->l_flock.l_pid;
	flrp->l_sysid = blocker->l_flock.l_sysid;
	request->l_ofd = blocker->l_ofd;

	if (IS_LOCKMGR(request)) {
		flrp->l_start = blocker->l_start;
		if (blocker->l_end == MAX_U_OFFSET_T)
			flrp->l_len = 0;
		else
			flrp->l_len = blocker->l_end - blocker->l_start + 1;
	} else {
		if (blocker->l_start > MAXEND) {
			flrp->l_start = MAXEND;
			flrp->l_len = 0;
		} else {
			flrp->l_start = blocker->l_start;
			if (blocker->l_end == MAX_U_OFFSET_T)
				flrp->l_len = 0;
			else
				flrp->l_len = blocker->l_end -
				    blocker->l_start + 1;
		}
	}
}
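
/*
 * Worked example for report_blocker() (illustrative only): a blocker on
 * [100, 199] is reported as l_start == 100, l_len == 100, and a whole-file
 * blocker (l_end == MAX_U_OFFSET_T) as l_start == 100, l_len == 0.  If the
 * blocker is a lock manager lock starting beyond MAXEND and the blocked
 * request is local, the caller is told l_start == MAXEND with l_len == 0
 * ("to EOF"), since local requests may only see signed offsets.
 */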

/*
 * PSARC case 1997/292
 */
/*
 * This is the public routine exported by flock.h.
 */
void
cl_flk_change_nlm_state_to_unknown(int nlmid)
{
	/*
	 * Check to see if node is booted as a cluster. If not, return.
	 */
	if ((cluster_bootflags & CLUSTER_BOOTED) == 0) {
		return;
	}

	/*
	 * See comment in cl_flk_set_nlm_status().
	 */
	if (nlm_reg_status == NULL) {
		return;
	}

	/*
	 * protect NLM registry state with a mutex.
	 */
	ASSERT(nlmid <= nlm_status_size && nlmid >= 0);
	mutex_enter(&nlm_reg_lock);
	FLK_REGISTRY_CHANGE_NLM_STATE(nlm_reg_status, nlmid, FLK_NLM_UNKNOWN);
	mutex_exit(&nlm_reg_lock);
}

/*
 * Return non-zero if the given I/O request conflicts with an active NBMAND
 * lock.
 * If svmand is non-zero, it means look at all active locks, not just NBMAND
 * locks.
 */

int
nbl_lock_conflict(vnode_t *vp, nbl_op_t op, u_offset_t offset,
    ssize_t length, int svmand, caller_context_t *ct)
{
	int conflict = 0;
	graph_t *gp;
	lock_descriptor_t *lock;
	pid_t pid;
	int sysid;

	if (ct == NULL) {
		pid = curproc->p_pid;
		sysid = 0;
	} else {
		pid = ct->cc_pid;
		sysid = ct->cc_sysid;
	}

	mutex_enter(&flock_lock);
	gp = lock_graph[HASH_INDEX(vp)];
	mutex_exit(&flock_lock);
	if (gp == NULL)
		return (0);

	mutex_enter(&gp->gp_mutex);
	SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);

	for (; lock && lock->l_vnode == vp; lock = lock->l_next) {
		if ((svmand || (lock->l_state & NBMAND_LOCK)) &&
		    (lock->l_flock.l_sysid != sysid ||
		    lock->l_flock.l_pid != pid) &&
		    lock_blocks_io(op, offset, length,
		    lock->l_type, lock->l_start, lock->l_end)) {
			conflict = 1;
			break;
		}
	}
	mutex_exit(&gp->gp_mutex);

	return (conflict);
}

/*
 * Return non-zero if the given I/O request conflicts with the given lock.
 */

static int
lock_blocks_io(nbl_op_t op, u_offset_t offset, ssize_t length,
    int lock_type, u_offset_t lock_start, u_offset_t lock_end)
{
	ASSERT(op == NBL_READ || op == NBL_WRITE || op == NBL_READWRITE);
	ASSERT(lock_type == F_RDLCK || lock_type == F_WRLCK);

	if (op == NBL_READ && lock_type == F_RDLCK)
		return (0);

	if (offset <= lock_start && lock_start < offset + length)
		return (1);
	if (lock_start <= offset && offset <= lock_end)
		return (1);

	return (0);
}
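
/*
 * Worked example for lock_blocks_io() (illustrative only): a write of 50
 * bytes at offset 100 spans [100, 149].  A mandatory lock on [120, 200]
 * conflicts via the first test (100 <= 120 < 150) and a lock on [0, 110]
 * via the second (0 <= 100 <= 110); a read never conflicts with an
 * F_RDLCK.  File systems typically reach these checks through the
 * higher-level nbl_* interfaces rather than calling nbl_lock_conflict()
 * directly; a rough caller-side sketch (not an exact caller; "off", "len",
 * "svmand", "ct" and "in_crit" are placeholders) is:
 *
 *	if (nbl_need_check(vp)) {
 *		nbl_start_crit(vp, RW_READER);
 *		in_crit = 1;
 *		if (nbl_conflict(vp, NBL_WRITE, off, len, svmand, ct))
 *			error = EACCES;
 *	}
 *	... perform the I/O if there was no conflict ...
 *	if (in_crit)
 *		nbl_end_crit(vp);
 */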

#ifdef DEBUG
static void
check_active_locks(graph_t *gp)
{
	lock_descriptor_t *lock, *lock1;
	edge_t *ep;

	for (lock = ACTIVE_HEAD(gp)->l_next; lock != ACTIVE_HEAD(gp);
	    lock = lock->l_next) {
		ASSERT(IS_ACTIVE(lock));
		ASSERT(NOT_BLOCKED(lock));
		ASSERT(!IS_BARRIER(lock));

		ep = FIRST_IN(lock);

		while (ep != HEAD(lock)) {
			ASSERT(IS_SLEEPING(ep->from_vertex));
			ASSERT(!NOT_BLOCKED(ep->from_vertex));
			ep = NEXT_IN(ep);
		}

		for (lock1 = lock->l_next; lock1 != ACTIVE_HEAD(gp);
		    lock1 = lock1->l_next) {
			if (lock1->l_vnode == lock->l_vnode) {
				if (BLOCKS(lock1, lock)) {
					cmn_err(CE_PANIC,
					    "active lock %p blocks %p",
					    (void *)lock1, (void *)lock);
				} else if (BLOCKS(lock, lock1)) {
					cmn_err(CE_PANIC,
					    "active lock %p blocks %p",
					    (void *)lock, (void *)lock1);
				}
			}
		}
	}
}

/*
 * Effect: This function checks to see if the transition from 'old_state' to
 *	'new_state' is a valid one.  It returns 0 if the transition is valid
 *	and 1 if it is not.
 * For a map of valid transitions, see sys/flock_impl.h
 */
static int
check_lock_transition(int old_state, int new_state)
{
	switch (old_state) {
	case FLK_INITIAL_STATE:
		if ((new_state == FLK_START_STATE) ||
		    (new_state == FLK_SLEEPING_STATE) ||
		    (new_state == FLK_ACTIVE_STATE) ||
		    (new_state == FLK_DEAD_STATE)) {
			return (0);
		} else {
			return (1);
		}
	case FLK_START_STATE:
		if ((new_state == FLK_ACTIVE_STATE) ||
		    (new_state == FLK_DEAD_STATE)) {
			return (0);
		} else {
			return (1);
		}
	case FLK_ACTIVE_STATE:
		if (new_state == FLK_DEAD_STATE) {
			return (0);
		} else {
			return (1);
		}
	case FLK_SLEEPING_STATE:
		if ((new_state == FLK_GRANTED_STATE) ||
		    (new_state == FLK_INTERRUPTED_STATE) ||
		    (new_state == FLK_CANCELLED_STATE)) {
			return (0);
		} else {
			return (1);
		}
	case FLK_GRANTED_STATE:
		if ((new_state == FLK_START_STATE) ||
		    (new_state == FLK_INTERRUPTED_STATE) ||
		    (new_state == FLK_CANCELLED_STATE)) {
			return (0);
		} else {
			return (1);
		}
	case FLK_CANCELLED_STATE:
		if ((new_state == FLK_INTERRUPTED_STATE) ||
		    (new_state == FLK_DEAD_STATE)) {
			return (0);
		} else {
			return (1);
		}
	case FLK_INTERRUPTED_STATE:
		if (new_state == FLK_DEAD_STATE) {
			return (0);
		} else {
			return (1);
		}
	case FLK_DEAD_STATE:
		/* May be set more than once */
		if (new_state == FLK_DEAD_STATE) {
			return (0);
		} else {
			return (1);
		}
	default:
		return (1);
	}
}
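
/*
 * Example (illustrative): check_lock_transition(FLK_SLEEPING_STATE,
 * FLK_GRANTED_STATE) returns 0 because a sleeping request may be granted,
 * whereas check_lock_transition(FLK_ACTIVE_STATE, FLK_START_STATE) returns
 * 1, since the only transition allowed out of the active state is to
 * FLK_DEAD_STATE.
 */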

static void
check_sleeping_locks(graph_t *gp)
{
	lock_descriptor_t *lock1, *lock2;
	edge_t *ep;

	for (lock1 = SLEEPING_HEAD(gp)->l_next; lock1 != SLEEPING_HEAD(gp);
	    lock1 = lock1->l_next) {
		ASSERT(!IS_BARRIER(lock1));
		for (lock2 = lock1->l_next; lock2 != SLEEPING_HEAD(gp);
		    lock2 = lock2->l_next) {
			if (lock1->l_vnode == lock2->l_vnode) {
				if (BLOCKS(lock2, lock1)) {
					ASSERT(!IS_GRANTED(lock1));
					ASSERT(!NOT_BLOCKED(lock1));
					path(lock1, lock2);
				}
			}
		}

		for (lock2 = ACTIVE_HEAD(gp)->l_next; lock2 != ACTIVE_HEAD(gp);
		    lock2 = lock2->l_next) {
			ASSERT(!IS_BARRIER(lock1));
			if (lock1->l_vnode == lock2->l_vnode) {
				if (BLOCKS(lock2, lock1)) {
					ASSERT(!IS_GRANTED(lock1));
					ASSERT(!NOT_BLOCKED(lock1));
					path(lock1, lock2);
				}
			}
		}
		ep = FIRST_ADJ(lock1);
		while (ep != HEAD(lock1)) {
			ASSERT(BLOCKS(ep->to_vertex, lock1));
			ep = NEXT_ADJ(ep);
		}
	}
}

static int
level_two_path(lock_descriptor_t *lock1, lock_descriptor_t *lock2, int no_path)
{
	edge_t *ep;
	lock_descriptor_t *vertex;
	lock_descriptor_t *vertex_stack;

	STACK_INIT(vertex_stack);

	flk_graph_uncolor(lock1->l_graph);
	ep = FIRST_ADJ(lock1);
	ASSERT(ep != HEAD(lock1));
	while (ep != HEAD(lock1)) {
		if (no_path)
			ASSERT(ep->to_vertex != lock2);
		STACK_PUSH(vertex_stack, ep->to_vertex, l_dstack);
		COLOR(ep->to_vertex);
		ep = NEXT_ADJ(ep);
	}

	while ((vertex = STACK_TOP(vertex_stack)) != NULL) {
		STACK_POP(vertex_stack, l_dstack);
		for (ep = FIRST_ADJ(vertex); ep != HEAD(vertex);
		    ep = NEXT_ADJ(ep)) {
			if (COLORED(ep->to_vertex))
				continue;
			COLOR(ep->to_vertex);
			if (ep->to_vertex == lock2)
				return (1);

			STACK_PUSH(vertex_stack, ep->to_vertex, l_dstack);
		}
	}
	return (0);
}

static void
check_owner_locks(graph_t *gp, pid_t pid, int sysid, vnode_t *vp)
{
	lock_descriptor_t *lock;

	/*
	 * Ignore OFD style locks since they're not process-wide.
	 */
	if (pid == 0)
		return;

	SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);

	if (lock) {
		while (lock != ACTIVE_HEAD(gp) && (lock->l_vnode == vp)) {
			if (lock->l_flock.l_pid == pid &&
			    lock->l_flock.l_sysid == sysid)
				cmn_err(CE_PANIC,
				    "owner pid %d's lock %p in active queue",
				    pid, (void *)lock);
			lock = lock->l_next;
		}
	}
	SET_LOCK_TO_FIRST_SLEEP_VP(gp, lock, vp);

	if (lock) {
		while (lock != SLEEPING_HEAD(gp) && (lock->l_vnode == vp)) {
			if (lock->l_flock.l_pid == pid &&
			    lock->l_flock.l_sysid == sysid)
				cmn_err(CE_PANIC,
				    "owner pid %d's lock %p in sleep queue",
				    pid, (void *)lock);
			lock = lock->l_next;
		}
	}
}

static int
level_one_path(lock_descriptor_t *lock1, lock_descriptor_t *lock2)
{
	edge_t *ep = FIRST_ADJ(lock1);

	while (ep != HEAD(lock1)) {
		if (ep->to_vertex == lock2)
			return (1);
		else
			ep = NEXT_ADJ(ep);
	}
	return (0);
}

static int
no_path(lock_descriptor_t *lock1, lock_descriptor_t *lock2)
{
	return (!level_two_path(lock1, lock2, 1));
}

static void
path(lock_descriptor_t *lock1, lock_descriptor_t *lock2)
{
	if (level_one_path(lock1, lock2)) {
		if (level_two_path(lock1, lock2, 0) != 0) {
			cmn_err(CE_WARN,
			    "one edge one path from lock1 %p lock2 %p",
			    (void *)lock1, (void *)lock2);
		}
	} else if (no_path(lock1, lock2)) {
		cmn_err(CE_PANIC,
		    "No path from lock1 %p to lock2 %p",
		    (void *)lock1, (void *)lock2);
	}
}
#endif /* DEBUG */