1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright 2007 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 */ 26 27 /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ 28 /* All Rights Reserved */ 29 30 /* 31 * Copyright 2019 Nexenta by DDN, Inc. All rights reserved. 32 * Copyright 2015 Joyent, Inc. 33 */ 34 35 #include <sys/flock_impl.h> 36 #include <sys/vfs.h> 37 #include <sys/t_lock.h> /* for <sys/callb.h> */ 38 #include <sys/callb.h> 39 #include <sys/clconf.h> 40 #include <sys/cladm.h> 41 #include <sys/nbmlock.h> 42 #include <sys/cred.h> 43 #include <sys/policy.h> 44 45 /* 46 * The following four variables are for statistics purposes and they are 47 * not protected by locks. They may not be accurate but will at least be 48 * close to the actual value. 49 */ 50 51 int flk_lock_allocs; 52 int flk_lock_frees; 53 int edge_allocs; 54 int edge_frees; 55 int flk_proc_vertex_allocs; 56 int flk_proc_edge_allocs; 57 int flk_proc_vertex_frees; 58 int flk_proc_edge_frees; 59 60 static kmutex_t flock_lock; 61 62 #ifdef DEBUG 63 int check_debug = 0; 64 #define CHECK_ACTIVE_LOCKS(gp) if (check_debug) \ 65 check_active_locks(gp); 66 #define CHECK_SLEEPING_LOCKS(gp) if (check_debug) \ 67 check_sleeping_locks(gp); 68 #define CHECK_OWNER_LOCKS(gp, pid, sysid, vp) \ 69 if (check_debug) \ 70 check_owner_locks(gp, pid, sysid, vp); 71 #define CHECK_LOCK_TRANSITION(old_state, new_state) \ 72 { \ 73 if (check_lock_transition(old_state, new_state)) { \ 74 cmn_err(CE_PANIC, "Illegal lock transition \ 75 from %d to %d", old_state, new_state); \ 76 } \ 77 } 78 #else 79 80 #define CHECK_ACTIVE_LOCKS(gp) 81 #define CHECK_SLEEPING_LOCKS(gp) 82 #define CHECK_OWNER_LOCKS(gp, pid, sysid, vp) 83 #define CHECK_LOCK_TRANSITION(old_state, new_state) 84 85 #endif /* DEBUG */ 86 87 struct kmem_cache *flk_edge_cache; 88 89 graph_t *lock_graph[HASH_SIZE]; 90 proc_graph_t pgraph; 91 92 /* 93 * Clustering. 94 * 95 * NLM REGISTRY TYPE IMPLEMENTATION 96 * 97 * Assumptions: 98 * 1. Nodes in a cluster are numbered starting at 1; always non-negative 99 * integers; maximum node id is returned by clconf_maximum_nodeid(). 100 * 2. We use this node id to identify the node an NLM server runs on. 101 */ 102 103 /* 104 * NLM registry object keeps track of NLM servers via their 105 * nlmids (which are the node ids of the node in the cluster they run on) 106 * that have requested locks at this LLM with which this registry is 107 * associated. 108 * 109 * Representation of abstraction: 110 * rep = record[ states: array[nlm_state], 111 * lock: mutex] 112 * 113 * Representation invariants: 114 * 1. 
index i of rep.states is between 0 and n - 1 where n is the number 115 * of elements in the array, which happens to be the maximum number 116 * of nodes in the cluster configuration + 1. 117 * 2. map nlmid to index i of rep.states 118 * 0 -> 0 119 * 1 -> 1 120 * 2 -> 2 121 * n-1 -> clconf_maximum_nodeid() 122 * 3. This 1-1 mapping is quite convenient and it avoids errors resulting 123 * from forgetting to subtract 1 from the index. 124 * 4. The reason we keep the 0th index is the following. A legitimate 125 * cluster configuration includes making a UFS file system NFS 126 * exportable. The code is structured so that if you're in a cluster 127 * you do one thing; otherwise, you do something else. The problem 128 * is what to do if you think you're in a cluster with PXFS loaded, 129 * but you're using UFS not PXFS? The upper two bytes of the sysid 130 * encode the node id of the node where the NLM server runs; these bytes 131 * are zero for UFS. Since the nodeid is used to index into the 132 * registry, we can record the NLM server state information at index 133 * 0 using the same mechanism used for PXFS file locks! 134 */ 135 static flk_nlm_status_t *nlm_reg_status = NULL; /* state array 0..N-1 */ 136 static kmutex_t nlm_reg_lock; /* lock to protect array */ 137 static uint_t nlm_status_size; /* size of state array */ 138 139 /* 140 * Although we need a global lock dependency graph (and associated data 141 * structures), we also need a per-zone notion of whether the lock manager is 142 * running, and so whether to allow lock manager requests or not. 143 * 144 * Thus, on a per-zone basis we maintain a ``global'' variable 145 * (flk_lockmgr_status), protected by flock_lock, and set when the lock 146 * manager is determined to be changing state (starting or stopping). 147 * 148 * Each graph/zone pair also has a copy of this variable, which is protected by 149 * the graph's mutex. 150 * 151 * The per-graph copies are used to synchronize lock requests with shutdown 152 * requests. The global copy is used to initialize the per-graph field when a 153 * new graph is created.
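 *
 * As a minimal illustrative sketch (not the actual shutdown code in this
 * file) of how a state change touches both copies: the global copy is
 * written under flock_lock, and each existing graph's copy is then
 * written under that graph's own mutex.  Every name used below appears
 * elsewhere in this file.
 *
 *      struct flock_globals *fg = flk_get_globals();
 *      int i;
 *
 *      mutex_enter(&flock_lock);
 *      fg->flk_lockmgr_status = FLK_LOCKMGR_DOWN;
 *      mutex_exit(&flock_lock);
 *
 *      for (i = 0; i < HASH_SIZE; i++) {
 *              graph_t *gp = lock_graph[i];
 *
 *              if (gp == NULL)
 *                      continue;
 *              mutex_enter(&gp->gp_mutex);
 *              fg->lockmgr_status[i] = FLK_LOCKMGR_DOWN;
 *              mutex_exit(&gp->gp_mutex);
 *      }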
154 */ 155 struct flock_globals { 156 flk_lockmgr_status_t flk_lockmgr_status; 157 flk_lockmgr_status_t lockmgr_status[HASH_SIZE]; 158 }; 159 160 zone_key_t flock_zone_key; 161 162 static void create_flock(lock_descriptor_t *, flock64_t *); 163 static lock_descriptor_t *flk_get_lock(void); 164 static void flk_free_lock(lock_descriptor_t *lock); 165 static void flk_get_first_blocking_lock(lock_descriptor_t *request); 166 static int flk_process_request(lock_descriptor_t *); 167 static int flk_add_edge(lock_descriptor_t *, lock_descriptor_t *, int, int); 168 static edge_t *flk_get_edge(void); 169 static int flk_wait_execute_request(lock_descriptor_t *); 170 static int flk_relation(lock_descriptor_t *, lock_descriptor_t *); 171 static void flk_insert_active_lock(lock_descriptor_t *); 172 static void flk_delete_active_lock(lock_descriptor_t *, int); 173 static void flk_insert_sleeping_lock(lock_descriptor_t *); 174 static void flk_graph_uncolor(graph_t *); 175 static void flk_wakeup(lock_descriptor_t *, int); 176 static void flk_free_edge(edge_t *); 177 static void flk_recompute_dependencies(lock_descriptor_t *, 178 lock_descriptor_t **, int, int); 179 static int flk_find_barriers(lock_descriptor_t *); 180 static void flk_update_barriers(lock_descriptor_t *); 181 static int flk_color_reachables(lock_descriptor_t *); 182 static int flk_canceled(lock_descriptor_t *); 183 static void flk_delete_locks_by_sysid(lock_descriptor_t *); 184 static void report_blocker(lock_descriptor_t *, lock_descriptor_t *); 185 static void wait_for_lock(lock_descriptor_t *); 186 static void unlock_lockmgr_granted(struct flock_globals *); 187 static void wakeup_sleeping_lockmgr_locks(struct flock_globals *); 188 189 /* Clustering hooks */ 190 static void cl_flk_change_nlm_state_all_locks(int, flk_nlm_status_t); 191 static void cl_flk_wakeup_sleeping_nlm_locks(int); 192 static void cl_flk_unlock_nlm_granted(int); 193 194 #ifdef DEBUG 195 static int check_lock_transition(int, int); 196 static void check_sleeping_locks(graph_t *); 197 static void check_active_locks(graph_t *); 198 static int no_path(lock_descriptor_t *, lock_descriptor_t *); 199 static void path(lock_descriptor_t *, lock_descriptor_t *); 200 static void check_owner_locks(graph_t *, pid_t, int, vnode_t *); 201 static int level_one_path(lock_descriptor_t *, lock_descriptor_t *); 202 static int level_two_path(lock_descriptor_t *, lock_descriptor_t *, int); 203 #endif 204 205 /* proc_graph function definitions */ 206 static int flk_check_deadlock(lock_descriptor_t *); 207 static void flk_proc_graph_uncolor(void); 208 static proc_vertex_t *flk_get_proc_vertex(lock_descriptor_t *); 209 static proc_edge_t *flk_get_proc_edge(void); 210 static void flk_proc_release(proc_vertex_t *); 211 static void flk_free_proc_edge(proc_edge_t *); 212 static void flk_update_proc_graph(edge_t *, int); 213 214 /* Non-blocking mandatory locking */ 215 static int lock_blocks_io(nbl_op_t, u_offset_t, ssize_t, int, u_offset_t, 216 u_offset_t); 217 218 static struct flock_globals * 219 flk_get_globals(void) 220 { 221 /* 222 * The KLM module had better be loaded if we're attempting to handle 223 * lockmgr requests. 
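 *
 * The zone-specific data is set up by registering flk_zone_init() and
 * flk_zone_fini() (both defined later in this file) against
 * flock_zone_key.  That registration lives outside this excerpt and is
 * tied to KLM module load, but it is roughly of this form, shown here
 * only as an illustrative sketch:
 *
 *      zone_key_create(&flock_zone_key, flk_zone_init, NULL,
 *          flk_zone_fini);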
224 */ 225 ASSERT(flock_zone_key != ZONE_KEY_UNINITIALIZED); 226 return (zone_getspecific(flock_zone_key, curproc->p_zone)); 227 } 228 229 static flk_lockmgr_status_t 230 flk_get_lockmgr_status(void) 231 { 232 struct flock_globals *fg; 233 234 ASSERT(MUTEX_HELD(&flock_lock)); 235 236 if (flock_zone_key == ZONE_KEY_UNINITIALIZED) { 237 /* 238 * KLM module not loaded; lock manager definitely not running. 239 */ 240 return (FLK_LOCKMGR_DOWN); 241 } 242 fg = flk_get_globals(); 243 return (fg->flk_lockmgr_status); 244 } 245 246 /* 247 * This implements Open File Description (not descriptor) style record locking. 248 * These locks can also be thought of as pid-less since they are not tied to a 249 * specific process, thus they're preserved across fork. 250 * 251 * Called directly from fcntl. 252 * 253 * See reclock() for the implementation of the traditional POSIX style record 254 * locking scheme (pid-ful). This function is derived from reclock() but 255 * simplified and modified to work for OFD style locking. 256 * 257 * The two primary advantages of OFD style of locking are: 258 * 1) It is per-file description, so closing a file descriptor that refers to a 259 * different file description for the same file will not drop the lock (i.e. 260 * two open's of the same file get different descriptions but a dup or fork 261 * will refer to the same description). 262 * 2) Locks are preserved across fork(2). 263 * 264 * Because these locks are per-description a lock ptr lives at the f_filocks 265 * member of the file_t and the lock_descriptor includes a file_t pointer 266 * to enable unique lock identification and management. 267 * 268 * Since these locks are pid-less we cannot do deadlock detection with the 269 * current process-oriented implementation. This is consistent with OFD locking 270 * behavior on other operating systems such as Linux. Since we don't do 271 * deadlock detection we never interact with the process graph that is 272 * maintained for deadlock detection on the traditional POSIX-style locks. 273 * 274 * Future Work: 275 * 276 * The current implementation does not support record locks. That is, 277 * currently the single lock must cover the entire file. This is validated in 278 * fcntl. To support record locks the f_filock pointer in the file_t needs to 279 * be changed to a list of pointers to the locks. That list needs to be 280 * managed independently of the lock list on the vnode itself and it needs to 281 * be maintained as record locks are created, split, coalesced and deleted. 282 * 283 * The current implementation does not support remote file systems (e.g. 284 * NFS or CIFS). This is handled in fs_frlock(). The design of how OFD locks 285 * interact with the NLM is not clear since the NLM protocol/implementation 286 * appears to be oriented around locks associated with a process. A further 287 * problem is that a design is needed for what nlm_send_siglost() should do and 288 * where it will send SIGLOST. More recent versions of Linux apparently try to 289 * emulate OFD locks on NFS by converting them to traditional POSIX style locks 290 * that work with the NLM. It is not clear that this provides the correct 291 * semantics in all cases. 
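 *
 * For illustration only, the user-level usage that reaches this code looks
 * roughly like the following (error handling omitted, the file name is
 * arbitrary).  The struct is zeroed, so l_whence, l_start and l_len are all
 * zero (whole-file, as required above); dup(2) yields a descriptor for the
 * same description, so closing it does not drop the lock; the second
 * open(2) creates a new description whose conflicting request fails
 * (typically with EAGAIN); and the final close of the first description
 * releases the lock via ofdcleanlock():
 *
 *      #include <fcntl.h>
 *      #include <unistd.h>
 *      #include <string.h>
 *
 *      int
 *      main(void)
 *      {
 *              struct flock fl;
 *              int fd, fd2, fd3;
 *
 *              fd = open("/tmp/f", O_RDWR | O_CREAT, 0644);
 *
 *              (void) memset(&fl, 0, sizeof (fl));
 *              fl.l_type = F_WRLCK;
 *              (void) fcntl(fd, F_OFD_SETLK, &fl);
 *
 *              fd2 = dup(fd);
 *              (void) close(fd2);
 *
 *              fd3 = open("/tmp/f", O_RDWR);
 *              fl.l_type = F_WRLCK;
 *              (void) fcntl(fd3, F_OFD_SETLK, &fl);
 *
 *              (void) close(fd);
 *              return (0);
 *      }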
292 */ 293 int 294 ofdlock(file_t *fp, int fcmd, flock64_t *lckdat, int flag, u_offset_t offset) 295 { 296 int cmd = 0; 297 vnode_t *vp; 298 lock_descriptor_t stack_lock_request; 299 lock_descriptor_t *lock_request; 300 int error = 0; 301 graph_t *gp; 302 int serialize = 0; 303 304 if (fcmd != F_OFD_GETLK) 305 cmd = SETFLCK; 306 307 if (fcmd == F_OFD_SETLKW || fcmd == F_FLOCKW) 308 cmd |= SLPFLCK; 309 310 /* see block comment */ 311 VERIFY(lckdat->l_whence == 0); 312 VERIFY(lckdat->l_start == 0); 313 VERIFY(lckdat->l_len == 0); 314 315 vp = fp->f_vnode; 316 317 /* 318 * For reclock fs_frlock() would normally have set these in a few 319 * places but for us it's cleaner to centralize it here. Note that 320 * IGN_PID is -1. We use 0 for our pid-less locks. 321 */ 322 lckdat->l_pid = 0; 323 lckdat->l_sysid = 0; 324 325 /* 326 * Check access permissions 327 */ 328 if ((fcmd == F_OFD_SETLK || fcmd == F_OFD_SETLKW) && 329 ((lckdat->l_type == F_RDLCK && (flag & FREAD) == 0) || 330 (lckdat->l_type == F_WRLCK && (flag & FWRITE) == 0))) 331 return (EBADF); 332 333 /* 334 * for query and unlock we use the stack_lock_request 335 */ 336 if (lckdat->l_type == F_UNLCK || !(cmd & SETFLCK)) { 337 lock_request = &stack_lock_request; 338 (void) bzero((caddr_t)lock_request, 339 sizeof (lock_descriptor_t)); 340 341 /* 342 * following is added to make the assertions in 343 * flk_execute_request() pass 344 */ 345 lock_request->l_edge.edge_in_next = &lock_request->l_edge; 346 lock_request->l_edge.edge_in_prev = &lock_request->l_edge; 347 lock_request->l_edge.edge_adj_next = &lock_request->l_edge; 348 lock_request->l_edge.edge_adj_prev = &lock_request->l_edge; 349 lock_request->l_status = FLK_INITIAL_STATE; 350 } else { 351 lock_request = flk_get_lock(); 352 fp->f_filock = (struct filock *)lock_request; 353 } 354 lock_request->l_state = 0; 355 lock_request->l_vnode = vp; 356 lock_request->l_zoneid = getzoneid(); 357 lock_request->l_ofd = fp; 358 359 /* 360 * Convert the request range into the canonical start and end 361 * values then check the validity of the lock range. 
362 */ 363 error = flk_convert_lock_data(vp, lckdat, &lock_request->l_start, 364 &lock_request->l_end, offset); 365 if (error) 366 goto done; 367 368 error = flk_check_lock_data(lock_request->l_start, lock_request->l_end, 369 MAXEND); 370 if (error) 371 goto done; 372 373 ASSERT(lock_request->l_end >= lock_request->l_start); 374 375 lock_request->l_type = lckdat->l_type; 376 if (cmd & SLPFLCK) 377 lock_request->l_state |= WILLING_TO_SLEEP_LOCK; 378 379 if (!(cmd & SETFLCK)) { 380 if (lock_request->l_type == F_RDLCK || 381 lock_request->l_type == F_WRLCK) 382 lock_request->l_state |= QUERY_LOCK; 383 } 384 lock_request->l_flock = (*lckdat); 385 386 /* 387 * We are ready for processing the request 388 */ 389 390 if (fcmd != F_OFD_GETLK && lock_request->l_type != F_UNLCK && 391 nbl_need_check(vp)) { 392 nbl_start_crit(vp, RW_WRITER); 393 serialize = 1; 394 } 395 396 /* Get the lock graph for a particular vnode */ 397 gp = flk_get_lock_graph(vp, FLK_INIT_GRAPH); 398 399 mutex_enter(&gp->gp_mutex); 400 401 lock_request->l_state |= REFERENCED_LOCK; 402 lock_request->l_graph = gp; 403 404 switch (lock_request->l_type) { 405 case F_RDLCK: 406 case F_WRLCK: 407 if (IS_QUERY_LOCK(lock_request)) { 408 flk_get_first_blocking_lock(lock_request); 409 if (lock_request->l_ofd != NULL) 410 lock_request->l_flock.l_pid = -1; 411 (*lckdat) = lock_request->l_flock; 412 } else { 413 /* process the request now */ 414 error = flk_process_request(lock_request); 415 } 416 break; 417 418 case F_UNLCK: 419 /* unlock request will not block so execute it immediately */ 420 error = flk_execute_request(lock_request); 421 break; 422 423 default: 424 error = EINVAL; 425 break; 426 } 427 428 if (lock_request == &stack_lock_request) { 429 flk_set_state(lock_request, FLK_DEAD_STATE); 430 } else { 431 lock_request->l_state &= ~REFERENCED_LOCK; 432 if ((error != 0) || IS_DELETED(lock_request)) { 433 flk_set_state(lock_request, FLK_DEAD_STATE); 434 flk_free_lock(lock_request); 435 } 436 } 437 438 mutex_exit(&gp->gp_mutex); 439 if (serialize) 440 nbl_end_crit(vp); 441 442 return (error); 443 444 done: 445 flk_set_state(lock_request, FLK_DEAD_STATE); 446 if (lock_request != &stack_lock_request) 447 flk_free_lock(lock_request); 448 return (error); 449 } 450 451 /* 452 * Remove any lock on the vnode belonging to the given file_t. 453 * Called from closef on last close, file_t is locked. 454 * 455 * This is modeled on the cleanlocks() function but only removes the single 456 * lock associated with fp. 
457 */ 458 void 459 ofdcleanlock(file_t *fp) 460 { 461 lock_descriptor_t *fplock, *lock, *nlock; 462 vnode_t *vp; 463 graph_t *gp; 464 465 ASSERT(MUTEX_HELD(&fp->f_tlock)); 466 467 if ((fplock = (lock_descriptor_t *)fp->f_filock) == NULL) 468 return; 469 470 fp->f_filock = NULL; 471 vp = fp->f_vnode; 472 473 gp = flk_get_lock_graph(vp, FLK_USE_GRAPH); 474 475 if (gp == NULL) 476 return; 477 mutex_enter(&gp->gp_mutex); 478 479 CHECK_SLEEPING_LOCKS(gp); 480 CHECK_ACTIVE_LOCKS(gp); 481 482 SET_LOCK_TO_FIRST_SLEEP_VP(gp, lock, vp); 483 484 if (lock) { 485 do { 486 nlock = lock->l_next; 487 if (fplock == lock) { 488 CANCEL_WAKEUP(lock); 489 break; 490 } 491 lock = nlock; 492 } while (lock->l_vnode == vp); 493 } 494 495 SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp); 496 497 if (lock) { 498 do { 499 nlock = lock->l_next; 500 if (fplock == lock) { 501 flk_delete_active_lock(lock, 0); 502 flk_wakeup(lock, 1); 503 flk_free_lock(lock); 504 break; 505 } 506 lock = nlock; 507 } while (lock->l_vnode == vp); 508 } 509 510 CHECK_SLEEPING_LOCKS(gp); 511 CHECK_ACTIVE_LOCKS(gp); 512 mutex_exit(&gp->gp_mutex); 513 } 514 515 /* 516 * Routine called from fs_frlock in fs/fs_subr.c 517 * 518 * This implements traditional POSIX style record locking. The two primary 519 * drawbacks to this style of locking are: 520 * 1) It is per-process, so any close of a file descriptor that refers to the 521 * file will drop the lock (e.g. lock /etc/passwd, call a library function 522 * which opens /etc/passwd to read the file, when the library closes it's 523 * file descriptor the application loses its lock and does not know). 524 * 2) Locks are not preserved across fork(2). 525 * 526 * Because these locks are only associated with a PID, they are per-process. 527 * This is why any close will drop the lock and is also why, once the process 528 * forks, the lock is no longer related to the new process. These locks can 529 * be considered as PID-ful. 530 * 531 * See ofdlock() for the implementation of a similar but improved locking 532 * scheme. 533 */ 534 int 535 reclock(vnode_t *vp, flock64_t *lckdat, int cmd, int flag, u_offset_t offset, 536 flk_callback_t *flk_cbp) 537 { 538 lock_descriptor_t stack_lock_request; 539 lock_descriptor_t *lock_request; 540 int error = 0; 541 graph_t *gp; 542 int nlmid; 543 544 /* 545 * Check access permissions 546 */ 547 if ((cmd & SETFLCK) && 548 ((lckdat->l_type == F_RDLCK && (flag & FREAD) == 0) || 549 (lckdat->l_type == F_WRLCK && (flag & FWRITE) == 0))) 550 return (EBADF); 551 552 /* 553 * for query and unlock we use the stack_lock_request 554 */ 555 556 if ((lckdat->l_type == F_UNLCK) || 557 !((cmd & INOFLCK) || (cmd & SETFLCK))) { 558 lock_request = &stack_lock_request; 559 (void) bzero((caddr_t)lock_request, 560 sizeof (lock_descriptor_t)); 561 562 /* 563 * following is added to make the assertions in 564 * flk_execute_request() to pass through 565 */ 566 567 lock_request->l_edge.edge_in_next = &lock_request->l_edge; 568 lock_request->l_edge.edge_in_prev = &lock_request->l_edge; 569 lock_request->l_edge.edge_adj_next = &lock_request->l_edge; 570 lock_request->l_edge.edge_adj_prev = &lock_request->l_edge; 571 lock_request->l_status = FLK_INITIAL_STATE; 572 } else { 573 lock_request = flk_get_lock(); 574 } 575 lock_request->l_state = 0; 576 lock_request->l_vnode = vp; 577 lock_request->l_zoneid = getzoneid(); 578 579 /* 580 * Convert the request range into the canonical start and end 581 * values. 
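 * For illustration, using the same rule the clustering branch below
 * applies: a request with l_whence == 0, l_start == 100 and l_len == 50
 * covers the byte range [100, 149] (l_end = l_start + l_len - 1), while
 * l_len == 0 means "through the largest possible offset", so l_end
 * becomes MAX_U_OFFSET_T.
 *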
The NLM protocol supports locking over the entire 582 * 32-bit range, so there's no range checking for remote requests, 583 * but we still need to verify that local requests obey the rules. 584 */ 585 /* Clustering */ 586 if ((cmd & (RCMDLCK | PCMDLCK)) != 0) { 587 ASSERT(lckdat->l_whence == 0); 588 lock_request->l_start = lckdat->l_start; 589 lock_request->l_end = (lckdat->l_len == 0) ? MAX_U_OFFSET_T : 590 lckdat->l_start + (lckdat->l_len - 1); 591 } else { 592 /* check the validity of the lock range */ 593 error = flk_convert_lock_data(vp, lckdat, 594 &lock_request->l_start, &lock_request->l_end, 595 offset); 596 if (error) { 597 goto done; 598 } 599 error = flk_check_lock_data(lock_request->l_start, 600 lock_request->l_end, MAXEND); 601 if (error) { 602 goto done; 603 } 604 } 605 606 ASSERT(lock_request->l_end >= lock_request->l_start); 607 608 lock_request->l_type = lckdat->l_type; 609 if (cmd & INOFLCK) 610 lock_request->l_state |= IO_LOCK; 611 if (cmd & SLPFLCK) 612 lock_request->l_state |= WILLING_TO_SLEEP_LOCK; 613 if (cmd & RCMDLCK) 614 lock_request->l_state |= LOCKMGR_LOCK; 615 if (cmd & NBMLCK) 616 lock_request->l_state |= NBMAND_LOCK; 617 /* 618 * Clustering: set flag for PXFS locks 619 * We do not _only_ check for the PCMDLCK flag because PXFS locks could 620 * also be of type 'RCMDLCK'. 621 * We do not _only_ check the GETPXFSID() macro because local PXFS 622 * clients use a pxfsid of zero to permit deadlock detection in the LLM. 623 */ 624 625 if ((cmd & PCMDLCK) || (GETPXFSID(lckdat->l_sysid) != 0)) { 626 lock_request->l_state |= PXFS_LOCK; 627 } 628 if (!((cmd & SETFLCK) || (cmd & INOFLCK))) { 629 if (lock_request->l_type == F_RDLCK || 630 lock_request->l_type == F_WRLCK) 631 lock_request->l_state |= QUERY_LOCK; 632 } 633 lock_request->l_flock = (*lckdat); 634 lock_request->l_callbacks = flk_cbp; 635 636 /* 637 * We are ready for processing the request 638 */ 639 if (IS_LOCKMGR(lock_request)) { 640 /* 641 * If the lock request is an NLM server request .... 642 */ 643 if (nlm_status_size == 0) { /* not booted as cluster */ 644 mutex_enter(&flock_lock); 645 /* 646 * Bail out if this is a lock manager request and the 647 * lock manager is not supposed to be running. 648 */ 649 if (flk_get_lockmgr_status() != FLK_LOCKMGR_UP) { 650 mutex_exit(&flock_lock); 651 error = ENOLCK; 652 goto done; 653 } 654 mutex_exit(&flock_lock); 655 } else { /* booted as a cluster */ 656 nlmid = GETNLMID(lock_request->l_flock.l_sysid); 657 ASSERT(nlmid <= nlm_status_size && nlmid >= 0); 658 659 mutex_enter(&nlm_reg_lock); 660 /* 661 * If the NLM registry does not know about this 662 * NLM server making the request, add its nlmid 663 * to the registry. 664 */ 665 if (FLK_REGISTRY_IS_NLM_UNKNOWN(nlm_reg_status, 666 nlmid)) { 667 FLK_REGISTRY_ADD_NLMID(nlm_reg_status, nlmid); 668 } else if (!FLK_REGISTRY_IS_NLM_UP(nlm_reg_status, 669 nlmid)) { 670 /* 671 * If the NLM server is already known (has made 672 * previous lock requests) and its state is 673 * not NLM_UP (means that NLM server is 674 * shutting down), then bail out with an 675 * error to deny the lock request. 676 */ 677 mutex_exit(&nlm_reg_lock); 678 error = ENOLCK; 679 goto done; 680 } 681 mutex_exit(&nlm_reg_lock); 682 } 683 } 684 685 /* Now get the lock graph for a particular vnode */ 686 gp = flk_get_lock_graph(vp, FLK_INIT_GRAPH); 687 688 /* 689 * We drop rwlock here otherwise this might end up causing a 690 * deadlock if this IOLOCK sleeps. (bugid # 1183392). 
691 */ 692 693 if (IS_IO_LOCK(lock_request)) { 694 VOP_RWUNLOCK(vp, 695 (lock_request->l_type == F_RDLCK) ? 696 V_WRITELOCK_FALSE : V_WRITELOCK_TRUE, NULL); 697 } 698 mutex_enter(&gp->gp_mutex); 699 700 lock_request->l_state |= REFERENCED_LOCK; 701 lock_request->l_graph = gp; 702 703 switch (lock_request->l_type) { 704 case F_RDLCK: 705 case F_WRLCK: 706 if (IS_QUERY_LOCK(lock_request)) { 707 flk_get_first_blocking_lock(lock_request); 708 if (lock_request->l_ofd != NULL) 709 lock_request->l_flock.l_pid = -1; 710 (*lckdat) = lock_request->l_flock; 711 break; 712 } 713 714 /* process the request now */ 715 716 error = flk_process_request(lock_request); 717 break; 718 719 case F_UNLCK: 720 /* unlock request will not block so execute it immediately */ 721 722 if (IS_LOCKMGR(lock_request) && 723 flk_canceled(lock_request)) { 724 error = 0; 725 } else { 726 error = flk_execute_request(lock_request); 727 } 728 break; 729 730 case F_UNLKSYS: 731 /* 732 * Recovery mechanism to release lock manager locks when 733 * NFS client crashes and restart. NFS server will clear 734 * old locks and grant new locks. 735 */ 736 737 if (lock_request->l_flock.l_sysid == 0) { 738 mutex_exit(&gp->gp_mutex); 739 return (EINVAL); 740 } 741 if (secpolicy_nfs(CRED()) != 0) { 742 mutex_exit(&gp->gp_mutex); 743 return (EPERM); 744 } 745 flk_delete_locks_by_sysid(lock_request); 746 lock_request->l_state &= ~REFERENCED_LOCK; 747 flk_set_state(lock_request, FLK_DEAD_STATE); 748 flk_free_lock(lock_request); 749 mutex_exit(&gp->gp_mutex); 750 return (0); 751 752 default: 753 error = EINVAL; 754 break; 755 } 756 757 /* Clustering: For blocked PXFS locks, return */ 758 if (error == PXFS_LOCK_BLOCKED) { 759 lock_request->l_state &= ~REFERENCED_LOCK; 760 mutex_exit(&gp->gp_mutex); 761 return (error); 762 } 763 764 /* 765 * Now that we have seen the status of locks in the system for 766 * this vnode we acquire the rwlock if it is an IO_LOCK. 767 */ 768 769 if (IS_IO_LOCK(lock_request)) { 770 (void) VOP_RWLOCK(vp, 771 (lock_request->l_type == F_RDLCK) ? 772 V_WRITELOCK_FALSE : V_WRITELOCK_TRUE, NULL); 773 if (!error) { 774 lckdat->l_type = F_UNLCK; 775 776 /* 777 * This wake up is needed otherwise 778 * if IO_LOCK has slept the dependents on this 779 * will not be woken up at all. (bugid # 1185482). 780 */ 781 782 flk_wakeup(lock_request, 1); 783 flk_set_state(lock_request, FLK_DEAD_STATE); 784 flk_free_lock(lock_request); 785 } 786 /* 787 * else if error had occurred either flk_process_request() 788 * has returned EDEADLK in which case there will be no 789 * dependents for this lock or EINTR from flk_wait_execute_ 790 * request() in which case flk_cancel_sleeping_lock() 791 * would have been done. same is true with EBADF. 792 */ 793 } 794 795 if (lock_request == &stack_lock_request) { 796 flk_set_state(lock_request, FLK_DEAD_STATE); 797 } else { 798 lock_request->l_state &= ~REFERENCED_LOCK; 799 if ((error != 0) || IS_DELETED(lock_request)) { 800 flk_set_state(lock_request, FLK_DEAD_STATE); 801 flk_free_lock(lock_request); 802 } 803 } 804 805 mutex_exit(&gp->gp_mutex); 806 return (error); 807 808 done: 809 flk_set_state(lock_request, FLK_DEAD_STATE); 810 if (lock_request != &stack_lock_request) 811 flk_free_lock(lock_request); 812 return (error); 813 } 814 815 /* 816 * Invoke the callbacks in the given list. If before sleeping, invoke in 817 * list order. If after sleeping, invoke in reverse order. 818 * 819 * CPR (suspend/resume) support: if one of the callbacks returns a 820 * callb_cpr_t, return it. 
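 * For example, a caller that wants to be CPR-safe while blocked can pass
 * a callback along the lines of the following purely illustrative sketch
 * (my_frlock_cb and its argument are hypothetical, not part of this
 * file), registered with flk_init_callback() and handed to reclock()
 * through the flk_callback_t argument:
 *
 *      static callb_cpr_t *
 *      my_frlock_cb(flk_cb_when_t when, void *arg)
 *      {
 *              callb_cpr_t *cprp = arg;
 *
 *              return (when == FLK_BEFORE_SLEEP ? cprp : NULL);
 *      }
 *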
This will be used to make the thread CPR-safe 821 * while it is sleeping. There should be at most one callb_cpr_t for the 822 * thread. 823 * XXX This is unnecessarily complicated. The CPR information should just 824 * get passed in directly through VOP_FRLOCK and reclock, rather than 825 * sneaking it in via a callback. 826 */ 827 828 callb_cpr_t * 829 flk_invoke_callbacks(flk_callback_t *cblist, flk_cb_when_t when) 830 { 831 callb_cpr_t *cpr_callbackp = NULL; 832 callb_cpr_t *one_result; 833 flk_callback_t *cb; 834 835 if (cblist == NULL) 836 return (NULL); 837 838 if (when == FLK_BEFORE_SLEEP) { 839 cb = cblist; 840 do { 841 one_result = (*cb->cb_callback)(when, cb->cb_data); 842 if (one_result != NULL) { 843 ASSERT(cpr_callbackp == NULL); 844 cpr_callbackp = one_result; 845 } 846 cb = cb->cb_next; 847 } while (cb != cblist); 848 } else { 849 cb = cblist->cb_prev; 850 do { 851 one_result = (*cb->cb_callback)(when, cb->cb_data); 852 if (one_result != NULL) { 853 cpr_callbackp = one_result; 854 } 855 cb = cb->cb_prev; 856 } while (cb != cblist->cb_prev); 857 } 858 859 return (cpr_callbackp); 860 } 861 862 /* 863 * Initialize a flk_callback_t to hold the given callback. 864 */ 865 866 void 867 flk_init_callback(flk_callback_t *flk_cb, 868 callb_cpr_t *(*cb_fcn)(flk_cb_when_t, void *), void *cbdata) 869 { 870 flk_cb->cb_next = flk_cb; 871 flk_cb->cb_prev = flk_cb; 872 flk_cb->cb_callback = cb_fcn; 873 flk_cb->cb_data = cbdata; 874 } 875 876 /* 877 * Initialize an flk_callback_t and then link it into the head of an 878 * existing list (which may be NULL). 879 */ 880 881 void 882 flk_add_callback(flk_callback_t *newcb, 883 callb_cpr_t *(*cb_fcn)(flk_cb_when_t, void *), 884 void *cbdata, flk_callback_t *cblist) 885 { 886 flk_init_callback(newcb, cb_fcn, cbdata); 887 888 if (cblist == NULL) 889 return; 890 891 newcb->cb_prev = cblist->cb_prev; 892 newcb->cb_next = cblist; 893 cblist->cb_prev->cb_next = newcb; 894 cblist->cb_prev = newcb; 895 } 896 897 /* 898 * Remove the callback from a list. 899 */ 900 901 void 902 flk_del_callback(flk_callback_t *flk_cb) 903 { 904 flk_cb->cb_next->cb_prev = flk_cb->cb_prev; 905 flk_cb->cb_prev->cb_next = flk_cb->cb_next; 906 907 flk_cb->cb_prev = flk_cb; 908 flk_cb->cb_next = flk_cb; 909 } 910 911 /* 912 * Initialize the flk_edge_cache data structure and create the 913 * nlm_reg_status array. 914 */ 915 916 void 917 flk_init(void) 918 { 919 uint_t i; 920 921 flk_edge_cache = kmem_cache_create("flk_edges", 922 sizeof (struct edge), 0, NULL, NULL, NULL, NULL, NULL, 0); 923 if (flk_edge_cache == NULL) { 924 cmn_err(CE_PANIC, "Couldn't create flk_edge_cache\n"); 925 } 926 /* 927 * Create the NLM registry object. 928 */ 929 930 if (cluster_bootflags & CLUSTER_BOOTED) { 931 /* 932 * This routine tells you the maximum node id that will be used 933 * in the cluster. This number will be the size of the nlm 934 * registry status array. We add 1 because we will be using 935 * all entries indexed from 0 to maxnodeid; e.g., from 0 936 * to 64, for a total of 65 entries. 
937 */ 938 nlm_status_size = clconf_maximum_nodeid() + 1; 939 } else { 940 nlm_status_size = 0; 941 } 942 943 if (nlm_status_size != 0) { /* booted as a cluster */ 944 nlm_reg_status = (flk_nlm_status_t *) 945 kmem_alloc(sizeof (flk_nlm_status_t) * nlm_status_size, 946 KM_SLEEP); 947 948 /* initialize all NLM states in array to NLM_UNKNOWN */ 949 for (i = 0; i < nlm_status_size; i++) { 950 nlm_reg_status[i] = FLK_NLM_UNKNOWN; 951 } 952 } 953 } 954 955 /* 956 * Zone constructor/destructor callbacks to be executed when a zone is 957 * created/destroyed. 958 */ 959 /* ARGSUSED */ 960 void * 961 flk_zone_init(zoneid_t zoneid) 962 { 963 struct flock_globals *fg; 964 uint_t i; 965 966 fg = kmem_alloc(sizeof (*fg), KM_SLEEP); 967 fg->flk_lockmgr_status = FLK_LOCKMGR_UP; 968 for (i = 0; i < HASH_SIZE; i++) 969 fg->lockmgr_status[i] = FLK_LOCKMGR_UP; 970 return (fg); 971 } 972 973 /* ARGSUSED */ 974 void 975 flk_zone_fini(zoneid_t zoneid, void *data) 976 { 977 struct flock_globals *fg = data; 978 979 kmem_free(fg, sizeof (*fg)); 980 } 981 982 /* 983 * Get a lock_descriptor structure with initialization of edge lists. 984 */ 985 986 static lock_descriptor_t * 987 flk_get_lock(void) 988 { 989 lock_descriptor_t *l; 990 991 l = kmem_zalloc(sizeof (lock_descriptor_t), KM_SLEEP); 992 993 cv_init(&l->l_cv, NULL, CV_DRIVER, NULL); 994 l->l_edge.edge_in_next = &l->l_edge; 995 l->l_edge.edge_in_prev = &l->l_edge; 996 l->l_edge.edge_adj_next = &l->l_edge; 997 l->l_edge.edge_adj_prev = &l->l_edge; 998 l->pvertex = -1; 999 l->l_status = FLK_INITIAL_STATE; 1000 flk_lock_allocs++; 1001 return (l); 1002 } 1003 1004 /* 1005 * Free a lock_descriptor structure. Just sets the DELETED_LOCK flag 1006 * when some thread has a reference to it as in reclock(). 1007 */ 1008 1009 void 1010 flk_free_lock(lock_descriptor_t *lock) 1011 { 1012 file_t *fp; 1013 1014 ASSERT(IS_DEAD(lock)); 1015 1016 if ((fp = lock->l_ofd) != NULL && fp->f_filock == (struct filock *)lock) 1017 fp->f_filock = NULL; 1018 1019 if (IS_REFERENCED(lock)) { 1020 lock->l_state |= DELETED_LOCK; 1021 return; 1022 } 1023 flk_lock_frees++; 1024 kmem_free((void *)lock, sizeof (lock_descriptor_t)); 1025 } 1026 1027 void 1028 flk_set_state(lock_descriptor_t *lock, int new_state) 1029 { 1030 /* 1031 * Locks in the sleeping list may be woken up in a number of ways, 1032 * and more than once. If a sleeping lock is signaled awake more 1033 * than once, then it may or may not change state depending on its 1034 * current state. 1035 * Also note that NLM locks that are sleeping could be moved to an 1036 * interrupted state more than once if the unlock request is 1037 * retransmitted by the NLM client - the second time around, this is 1038 * just a nop. 1039 * The ordering of being signaled awake is: 1040 * INTERRUPTED_STATE > CANCELLED_STATE > GRANTED_STATE. 1041 * The checks below implement this ordering. 1042 */ 1043 if (IS_INTERRUPTED(lock)) { 1044 if ((new_state == FLK_CANCELLED_STATE) || 1045 (new_state == FLK_GRANTED_STATE) || 1046 (new_state == FLK_INTERRUPTED_STATE)) { 1047 return; 1048 } 1049 } 1050 if (IS_CANCELLED(lock)) { 1051 if ((new_state == FLK_GRANTED_STATE) || 1052 (new_state == FLK_CANCELLED_STATE)) { 1053 return; 1054 } 1055 } 1056 CHECK_LOCK_TRANSITION(lock->l_status, new_state); 1057 if (IS_PXFS(lock)) { 1058 cl_flk_state_transition_notify(lock, lock->l_status, new_state); 1059 } 1060 lock->l_status = new_state; 1061 } 1062 1063 /* 1064 * Routine that checks whether there are any blocking locks in the system. 
1065 * 1066 * The policy followed is if a write lock is sleeping we don't allow read 1067 * locks before this write lock even though there may not be any active 1068 * locks corresponding to the read locks' region. 1069 * 1070 * flk_add_edge() function adds an edge between l1 and l2 iff there 1071 * is no path between l1 and l2. This is done to have a "minimum 1072 * storage representation" of the dependency graph. 1073 * 1074 * Another property of the graph is since only the new request throws 1075 * edges to the existing locks in the graph, the graph is always topologically 1076 * ordered. 1077 */ 1078 1079 static int 1080 flk_process_request(lock_descriptor_t *request) 1081 { 1082 graph_t *gp = request->l_graph; 1083 lock_descriptor_t *lock; 1084 int request_blocked_by_active = 0; 1085 int request_blocked_by_granted = 0; 1086 int request_blocked_by_sleeping = 0; 1087 vnode_t *vp = request->l_vnode; 1088 int error = 0; 1089 int request_will_wait = 0; 1090 int found_covering_lock = 0; 1091 lock_descriptor_t *covered_by = NULL; 1092 1093 ASSERT(MUTEX_HELD(&gp->gp_mutex)); 1094 request_will_wait = IS_WILLING_TO_SLEEP(request); 1095 1096 /* 1097 * check active locks 1098 */ 1099 1100 SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp); 1101 1102 1103 if (lock) { 1104 do { 1105 if (BLOCKS(lock, request)) { 1106 if (!request_will_wait) 1107 return (EAGAIN); 1108 request_blocked_by_active = 1; 1109 break; 1110 } 1111 /* 1112 * Grant lock if it is for the same owner holding active 1113 * lock that covers the request. 1114 */ 1115 1116 if (SAME_OWNER(lock, request) && 1117 COVERS(lock, request) && 1118 (request->l_type == F_RDLCK)) 1119 return (flk_execute_request(request)); 1120 lock = lock->l_next; 1121 } while (lock->l_vnode == vp); 1122 } 1123 1124 if (!request_blocked_by_active) { 1125 lock_descriptor_t *lk[1]; 1126 lock_descriptor_t *first_glock = NULL; 1127 /* 1128 * Shall we grant this?! NO!! 1129 * What about those locks that were just granted and still 1130 * in sleep queue. Those threads are woken up and so locks 1131 * are almost active. 1132 */ 1133 SET_LOCK_TO_FIRST_SLEEP_VP(gp, lock, vp); 1134 if (lock) { 1135 do { 1136 if (BLOCKS(lock, request)) { 1137 if (IS_GRANTED(lock)) { 1138 request_blocked_by_granted = 1; 1139 } else { 1140 request_blocked_by_sleeping = 1; 1141 } 1142 } 1143 1144 lock = lock->l_next; 1145 } while ((lock->l_vnode == vp)); 1146 first_glock = lock->l_prev; 1147 ASSERT(first_glock->l_vnode == vp); 1148 } 1149 1150 if (request_blocked_by_granted) 1151 goto block; 1152 1153 if (!request_blocked_by_sleeping) { 1154 /* 1155 * If the request isn't going to be blocked by a 1156 * sleeping request, we know that it isn't going to 1157 * be blocked; we can just execute the request -- 1158 * without performing costly deadlock detection. 1159 */ 1160 ASSERT(!request_blocked_by_active); 1161 return (flk_execute_request(request)); 1162 } else if (request->l_type == F_RDLCK) { 1163 /* 1164 * If we have a sleeping writer in the requested 1165 * lock's range, block. 
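 * For example (illustrative): with an active read lock on
 * [0, 99] held by P1 and a write request for [0, 99] from P2
 * sleeping behind it, a new read request for [0, 99] from P3
 * blocks here even though it does not conflict with P1's
 * active lock; this is the writer-priority policy described
 * at the top of this routine, and it keeps a stream of later
 * readers from starving the waiting writer.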
1166 */ 1167 goto block; 1168 } 1169 1170 lk[0] = request; 1171 request->l_state |= RECOMPUTE_LOCK; 1172 SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp); 1173 if (lock) { 1174 do { 1175 flk_recompute_dependencies(lock, lk, 1, 0); 1176 lock = lock->l_next; 1177 } while (lock->l_vnode == vp); 1178 } 1179 lock = first_glock; 1180 if (lock) { 1181 do { 1182 if (IS_GRANTED(lock)) { 1183 flk_recompute_dependencies(lock, lk, 1, 0); 1184 } 1185 lock = lock->l_prev; 1186 } while ((lock->l_vnode == vp)); 1187 } 1188 request->l_state &= ~RECOMPUTE_LOCK; 1189 if (!NO_DEPENDENTS(request) && flk_check_deadlock(request)) 1190 return (EDEADLK); 1191 return (flk_execute_request(request)); 1192 } 1193 1194 block: 1195 if (request_will_wait) 1196 flk_graph_uncolor(gp); 1197 1198 /* check sleeping locks */ 1199 1200 SET_LOCK_TO_FIRST_SLEEP_VP(gp, lock, vp); 1201 1202 /* 1203 * If we find a sleeping write lock that is a superset of the 1204 * region wanted by request we can be assured that by adding an 1205 * edge to this write lock we have paths to all locks in the 1206 * graph that blocks the request except in one case and that is why 1207 * another check for SAME_OWNER in the loop below. The exception 1208 * case is when this process that owns the sleeping write lock 'l1' 1209 * has other locks l2, l3, l4 that are in the system and arrived 1210 * before l1. l1 does not have path to these locks as they are from 1211 * same process. We break when we find a second covering sleeping 1212 * lock l5 owned by a process different from that owning l1, because 1213 * there cannot be any of l2, l3, l4, etc., arrived before l5, and if 1214 * it has l1 would have produced a deadlock already. 1215 */ 1216 1217 if (lock) { 1218 do { 1219 if (BLOCKS(lock, request)) { 1220 if (!request_will_wait) 1221 return (EAGAIN); 1222 if (COVERS(lock, request) && 1223 lock->l_type == F_WRLCK) { 1224 if (found_covering_lock && 1225 !SAME_OWNER(lock, covered_by)) { 1226 found_covering_lock++; 1227 break; 1228 } 1229 found_covering_lock = 1; 1230 covered_by = lock; 1231 } 1232 if (found_covering_lock && 1233 !SAME_OWNER(lock, covered_by)) { 1234 lock = lock->l_next; 1235 continue; 1236 } 1237 if ((error = flk_add_edge(request, lock, 1238 !found_covering_lock, 0))) 1239 return (error); 1240 } 1241 lock = lock->l_next; 1242 } while (lock->l_vnode == vp); 1243 } 1244 1245 /* 1246 * found_covering_lock == 2 iff at this point 'request' has paths 1247 * to all locks that blocks 'request'. found_covering_lock == 1 iff at this 1248 * point 'request' has paths to all locks that blocks 'request' whose owners 1249 * are not same as the one that covers 'request' (covered_by above) and 1250 * we can have locks whose owner is same as covered_by in the active list. 
1251 */ 1252 1253 if (request_blocked_by_active && found_covering_lock != 2) { 1254 SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp); 1255 ASSERT(lock != NULL); 1256 do { 1257 if (BLOCKS(lock, request)) { 1258 if (found_covering_lock && 1259 !SAME_OWNER(lock, covered_by)) { 1260 lock = lock->l_next; 1261 continue; 1262 } 1263 if ((error = flk_add_edge(request, lock, 1264 CHECK_CYCLE, 0))) 1265 return (error); 1266 } 1267 lock = lock->l_next; 1268 } while (lock->l_vnode == vp); 1269 } 1270 1271 if (NOT_BLOCKED(request)) { 1272 /* 1273 * request not dependent on any other locks 1274 * so execute this request 1275 */ 1276 return (flk_execute_request(request)); 1277 } else { 1278 /* 1279 * check for deadlock 1280 */ 1281 if (flk_check_deadlock(request)) 1282 return (EDEADLK); 1283 /* 1284 * this thread has to sleep 1285 */ 1286 return (flk_wait_execute_request(request)); 1287 } 1288 } 1289 1290 /* 1291 * The actual execution of the request in the simple case is only to 1292 * insert the 'request' in the list of active locks if it is not an 1293 * UNLOCK. 1294 * We have to consider the existing active locks' relation to 1295 * this 'request' if they are owned by same process. flk_relation() does 1296 * this job and sees to that the dependency graph information is maintained 1297 * properly. 1298 */ 1299 1300 int 1301 flk_execute_request(lock_descriptor_t *request) 1302 { 1303 graph_t *gp = request->l_graph; 1304 vnode_t *vp = request->l_vnode; 1305 lock_descriptor_t *lock, *lock1; 1306 int done_searching = 0; 1307 1308 CHECK_SLEEPING_LOCKS(gp); 1309 CHECK_ACTIVE_LOCKS(gp); 1310 1311 ASSERT(MUTEX_HELD(&gp->gp_mutex)); 1312 1313 flk_set_state(request, FLK_START_STATE); 1314 1315 ASSERT(NOT_BLOCKED(request)); 1316 1317 /* IO_LOCK requests are only to check status */ 1318 1319 if (IS_IO_LOCK(request)) 1320 return (0); 1321 1322 SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp); 1323 1324 if (lock == NULL && request->l_type == F_UNLCK) 1325 return (0); 1326 if (lock == NULL) { 1327 flk_insert_active_lock(request); 1328 return (0); 1329 } 1330 1331 do { 1332 lock1 = lock->l_next; 1333 if (SAME_OWNER(request, lock)) { 1334 done_searching = flk_relation(lock, request); 1335 } 1336 lock = lock1; 1337 } while (lock->l_vnode == vp && !done_searching); 1338 1339 /* 1340 * insert in active queue 1341 */ 1342 1343 if (request->l_type != F_UNLCK) 1344 flk_insert_active_lock(request); 1345 1346 return (0); 1347 } 1348 1349 /* 1350 * 'request' is blocked by some one therefore we put it into sleep queue. 1351 */ 1352 static int 1353 flk_wait_execute_request(lock_descriptor_t *request) 1354 { 1355 graph_t *gp = request->l_graph; 1356 callb_cpr_t *cprp; /* CPR info from callback */ 1357 struct flock_globals *fg; 1358 int index; 1359 1360 ASSERT(MUTEX_HELD(&gp->gp_mutex)); 1361 ASSERT(IS_WILLING_TO_SLEEP(request)); 1362 1363 flk_insert_sleeping_lock(request); 1364 1365 index = 0; /* quiesce compiler warning. 
*/ 1366 fg = NULL; 1367 if (IS_LOCKMGR(request)) { 1368 index = HASH_INDEX(request->l_vnode); 1369 fg = flk_get_globals(); 1370 1371 if (nlm_status_size == 0) { /* not booted as a cluster */ 1372 if (fg->lockmgr_status[index] != FLK_LOCKMGR_UP) { 1373 flk_cancel_sleeping_lock(request, 1); 1374 return (ENOLCK); 1375 } 1376 } else { /* booted as a cluster */ 1377 /* 1378 * If the request is an NLM server lock request, 1379 * and the NLM state of the lock request is not 1380 * NLM_UP (because the NLM server is shutting 1381 * down), then cancel the sleeping lock and 1382 * return error ENOLCK that will encourage the 1383 * client to retransmit. 1384 */ 1385 if (!IS_NLM_UP(request)) { 1386 flk_cancel_sleeping_lock(request, 1); 1387 return (ENOLCK); 1388 } 1389 } 1390 } 1391 1392 /* Clustering: For blocking PXFS locks, return */ 1393 if (IS_PXFS(request)) { 1394 /* 1395 * PXFS locks sleep on the client side. 1396 * The callback argument is used to wake up the sleeper 1397 * when the lock is granted. 1398 * We return -1 (rather than an errno value) to indicate 1399 * the client side should sleep 1400 */ 1401 return (PXFS_LOCK_BLOCKED); 1402 } 1403 1404 if (request->l_callbacks != NULL) { 1405 /* 1406 * To make sure the shutdown code works correctly, either 1407 * the callback must happen after putting the lock on the 1408 * sleep list, or we must check the shutdown status after 1409 * returning from the callback (and before sleeping). At 1410 * least for now, we'll use the first option. If a 1411 * shutdown or signal or whatever happened while the graph 1412 * mutex was dropped, that will be detected by 1413 * wait_for_lock(). 1414 */ 1415 mutex_exit(&gp->gp_mutex); 1416 1417 cprp = flk_invoke_callbacks(request->l_callbacks, 1418 FLK_BEFORE_SLEEP); 1419 1420 mutex_enter(&gp->gp_mutex); 1421 1422 if (cprp == NULL) { 1423 wait_for_lock(request); 1424 } else { 1425 mutex_enter(cprp->cc_lockp); 1426 CALLB_CPR_SAFE_BEGIN(cprp); 1427 mutex_exit(cprp->cc_lockp); 1428 wait_for_lock(request); 1429 mutex_enter(cprp->cc_lockp); 1430 CALLB_CPR_SAFE_END(cprp, cprp->cc_lockp); 1431 mutex_exit(cprp->cc_lockp); 1432 } 1433 1434 mutex_exit(&gp->gp_mutex); 1435 (void) flk_invoke_callbacks(request->l_callbacks, 1436 FLK_AFTER_SLEEP); 1437 mutex_enter(&gp->gp_mutex); 1438 } else { 1439 wait_for_lock(request); 1440 } 1441 1442 if (IS_LOCKMGR(request)) { 1443 /* 1444 * If the lock manager is shutting down, return an 1445 * error that will encourage the client to retransmit. 1446 */ 1447 if (fg->lockmgr_status[index] != FLK_LOCKMGR_UP && 1448 !IS_GRANTED(request)) { 1449 flk_cancel_sleeping_lock(request, 1); 1450 return (ENOLCK); 1451 } 1452 } 1453 1454 if (IS_INTERRUPTED(request)) { 1455 /* we got a signal, or act like we did */ 1456 flk_cancel_sleeping_lock(request, 1); 1457 return (EINTR); 1458 } 1459 1460 /* Cancelled if some other thread has closed the file */ 1461 1462 if (IS_CANCELLED(request)) { 1463 flk_cancel_sleeping_lock(request, 1); 1464 return (EBADF); 1465 } 1466 1467 request->l_state &= ~GRANTED_LOCK; 1468 REMOVE_SLEEP_QUEUE(request); 1469 return (flk_execute_request(request)); 1470 } 1471 1472 /* 1473 * This routine adds an edge between from and to because from depends 1474 * to. If asked to check for deadlock it checks whether there are any 1475 * reachable locks from "from_lock" that is owned by the same process 1476 * as "from_lock". 
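 * For example (illustrative): if the request R being processed belongs
 * to owner O and the edge R -> L1 has just been added, and L1 (itself a
 * blocked request) already has an edge to a lock L2 owned by O, the walk
 * below reaches L2, SAME_OWNER(L2, R) holds, and EDEADLK is returned
 * after the edges out of R are torn down again.
 *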
1477 * NOTE: It is the caller's responsibility to make sure that the color 1478 * of the graph is consistent between the calls to flk_add_edge as done 1479 * in flk_process_request. This routine does not color and check for 1480 * deadlock explicitly. 1481 */ 1482 1483 static int 1484 flk_add_edge(lock_descriptor_t *from_lock, lock_descriptor_t *to_lock, 1485 int check_cycle, int update_graph) 1486 { 1487 edge_t *edge; 1488 edge_t *ep; 1489 lock_descriptor_t *vertex; 1490 lock_descriptor_t *vertex_stack; 1491 1492 STACK_INIT(vertex_stack); 1493 1494 /* 1495 * if to vertex already has mark_color just return 1496 * don't add an edge as it is reachable from from vertex 1497 * before itself. 1498 */ 1499 1500 if (COLORED(to_lock)) 1501 return (0); 1502 1503 edge = flk_get_edge(); 1504 1505 /* 1506 * set the from and to vertex 1507 */ 1508 1509 edge->from_vertex = from_lock; 1510 edge->to_vertex = to_lock; 1511 1512 /* 1513 * put in adjacency list of from vertex 1514 */ 1515 1516 from_lock->l_edge.edge_adj_next->edge_adj_prev = edge; 1517 edge->edge_adj_next = from_lock->l_edge.edge_adj_next; 1518 edge->edge_adj_prev = &from_lock->l_edge; 1519 from_lock->l_edge.edge_adj_next = edge; 1520 1521 /* 1522 * put in list of to vertex 1523 */ 1524 1525 to_lock->l_edge.edge_in_next->edge_in_prev = edge; 1526 edge->edge_in_next = to_lock->l_edge.edge_in_next; 1527 to_lock->l_edge.edge_in_next = edge; 1528 edge->edge_in_prev = &to_lock->l_edge; 1529 1530 1531 if (update_graph) { 1532 flk_update_proc_graph(edge, 0); 1533 return (0); 1534 } 1535 if (!check_cycle) { 1536 return (0); 1537 } 1538 1539 STACK_PUSH(vertex_stack, from_lock, l_stack); 1540 1541 while ((vertex = STACK_TOP(vertex_stack)) != NULL) { 1542 1543 STACK_POP(vertex_stack, l_stack); 1544 1545 for (ep = FIRST_ADJ(vertex); 1546 ep != HEAD(vertex); 1547 ep = NEXT_ADJ(ep)) { 1548 if (COLORED(ep->to_vertex)) 1549 continue; 1550 COLOR(ep->to_vertex); 1551 if (SAME_OWNER(ep->to_vertex, from_lock)) 1552 goto dead_lock; 1553 STACK_PUSH(vertex_stack, ep->to_vertex, l_stack); 1554 } 1555 } 1556 return (0); 1557 1558 dead_lock: 1559 1560 /* 1561 * remove all edges 1562 */ 1563 1564 ep = FIRST_ADJ(from_lock); 1565 1566 while (ep != HEAD(from_lock)) { 1567 IN_LIST_REMOVE(ep); 1568 from_lock->l_sedge = NEXT_ADJ(ep); 1569 ADJ_LIST_REMOVE(ep); 1570 flk_free_edge(ep); 1571 ep = from_lock->l_sedge; 1572 } 1573 return (EDEADLK); 1574 } 1575 1576 /* 1577 * Get an edge structure for representing the dependency between two locks. 1578 */ 1579 1580 static edge_t * 1581 flk_get_edge() 1582 { 1583 edge_t *ep; 1584 1585 ASSERT(flk_edge_cache != NULL); 1586 1587 ep = kmem_cache_alloc(flk_edge_cache, KM_SLEEP); 1588 edge_allocs++; 1589 return (ep); 1590 } 1591 1592 /* 1593 * Free the edge structure. 1594 */ 1595 1596 static void 1597 flk_free_edge(edge_t *ep) 1598 { 1599 edge_frees++; 1600 kmem_cache_free(flk_edge_cache, (void *)ep); 1601 } 1602 1603 /* 1604 * Check the relationship of request with lock and perform the 1605 * recomputation of dependencies, break lock if required, and return 1606 * 1 if request cannot have any more relationship with the next 1607 * active locks. 1608 * The 'lock' and 'request' are compared and in case of overlap we 1609 * delete the 'lock' and form new locks to represent the non-overlapped 1610 * portion of original 'lock'. This function has side effects such as 1611 * 'lock' will be freed, new locks will be added to the active list. 
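 * For example (illustrative): if the owner holds an active read lock on
 * [0, 999] and now requests a write lock on [400, 499] (an upgrade, so
 * lock_effect != FLK_STAY_SAME), the original lock is split: two new
 * locks covering [0, 399] and [500, 999] are created and inserted into
 * the active list, the request keeps [400, 499], and topology[] holds
 * all three vertices for the dependency recomputation below.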
1612 */ 1613 1614 static int 1615 flk_relation(lock_descriptor_t *lock, lock_descriptor_t *request) 1616 { 1617 int lock_effect; 1618 lock_descriptor_t *lock1, *lock2; 1619 lock_descriptor_t *topology[3]; 1620 int nvertex = 0; 1621 int i; 1622 edge_t *ep; 1623 graph_t *gp = (lock->l_graph); 1624 1625 1626 CHECK_SLEEPING_LOCKS(gp); 1627 CHECK_ACTIVE_LOCKS(gp); 1628 1629 ASSERT(MUTEX_HELD(&gp->gp_mutex)); 1630 1631 topology[0] = topology[1] = topology[2] = NULL; 1632 1633 if (request->l_type == F_UNLCK) 1634 lock_effect = FLK_UNLOCK; 1635 else if (request->l_type == F_RDLCK && 1636 lock->l_type == F_WRLCK) 1637 lock_effect = FLK_DOWNGRADE; 1638 else if (request->l_type == F_WRLCK && 1639 lock->l_type == F_RDLCK) 1640 lock_effect = FLK_UPGRADE; 1641 else 1642 lock_effect = FLK_STAY_SAME; 1643 1644 if (lock->l_end < request->l_start) { 1645 if (lock->l_end == request->l_start - 1 && 1646 lock_effect == FLK_STAY_SAME) { 1647 topology[0] = request; 1648 request->l_start = lock->l_start; 1649 nvertex = 1; 1650 goto recompute; 1651 } else { 1652 return (0); 1653 } 1654 } 1655 1656 if (lock->l_start > request->l_end) { 1657 if (request->l_end == lock->l_start - 1 && 1658 lock_effect == FLK_STAY_SAME) { 1659 topology[0] = request; 1660 request->l_end = lock->l_end; 1661 nvertex = 1; 1662 goto recompute; 1663 } else { 1664 return (1); 1665 } 1666 } 1667 1668 if (request->l_end < lock->l_end) { 1669 if (request->l_start > lock->l_start) { 1670 if (lock_effect == FLK_STAY_SAME) { 1671 request->l_start = lock->l_start; 1672 request->l_end = lock->l_end; 1673 topology[0] = request; 1674 nvertex = 1; 1675 } else { 1676 lock1 = flk_get_lock(); 1677 lock2 = flk_get_lock(); 1678 COPY(lock1, lock); 1679 COPY(lock2, lock); 1680 lock1->l_start = lock->l_start; 1681 lock1->l_end = request->l_start - 1; 1682 lock2->l_start = request->l_end + 1; 1683 lock2->l_end = lock->l_end; 1684 topology[0] = lock1; 1685 topology[1] = lock2; 1686 topology[2] = request; 1687 nvertex = 3; 1688 } 1689 } else if (request->l_start < lock->l_start) { 1690 if (lock_effect == FLK_STAY_SAME) { 1691 request->l_end = lock->l_end; 1692 topology[0] = request; 1693 nvertex = 1; 1694 } else { 1695 lock1 = flk_get_lock(); 1696 COPY(lock1, lock); 1697 lock1->l_start = request->l_end + 1; 1698 topology[0] = lock1; 1699 topology[1] = request; 1700 nvertex = 2; 1701 } 1702 } else { 1703 if (lock_effect == FLK_STAY_SAME) { 1704 request->l_start = lock->l_start; 1705 request->l_end = lock->l_end; 1706 topology[0] = request; 1707 nvertex = 1; 1708 } else { 1709 lock1 = flk_get_lock(); 1710 COPY(lock1, lock); 1711 lock1->l_start = request->l_end + 1; 1712 topology[0] = lock1; 1713 topology[1] = request; 1714 nvertex = 2; 1715 } 1716 } 1717 } else if (request->l_end > lock->l_end) { 1718 if (request->l_start > lock->l_start) { 1719 if (lock_effect == FLK_STAY_SAME) { 1720 request->l_start = lock->l_start; 1721 topology[0] = request; 1722 nvertex = 1; 1723 } else { 1724 lock1 = flk_get_lock(); 1725 COPY(lock1, lock); 1726 lock1->l_end = request->l_start - 1; 1727 topology[0] = lock1; 1728 topology[1] = request; 1729 nvertex = 2; 1730 } 1731 } else if (request->l_start < lock->l_start) { 1732 topology[0] = request; 1733 nvertex = 1; 1734 } else { 1735 topology[0] = request; 1736 nvertex = 1; 1737 } 1738 } else { 1739 if (request->l_start > lock->l_start) { 1740 if (lock_effect == FLK_STAY_SAME) { 1741 request->l_start = lock->l_start; 1742 topology[0] = request; 1743 nvertex = 1; 1744 } else { 1745 lock1 = flk_get_lock(); 1746 COPY(lock1, lock); 1747 
lock1->l_end = request->l_start - 1; 1748 topology[0] = lock1; 1749 topology[1] = request; 1750 nvertex = 2; 1751 } 1752 } else if (request->l_start < lock->l_start) { 1753 topology[0] = request; 1754 nvertex = 1; 1755 } else { 1756 if (lock_effect != FLK_UNLOCK) { 1757 topology[0] = request; 1758 nvertex = 1; 1759 } else { 1760 flk_delete_active_lock(lock, 0); 1761 flk_wakeup(lock, 1); 1762 flk_free_lock(lock); 1763 CHECK_SLEEPING_LOCKS(gp); 1764 CHECK_ACTIVE_LOCKS(gp); 1765 return (1); 1766 } 1767 } 1768 } 1769 1770 recompute: 1771 1772 /* 1773 * For unlock we don't send the 'request' to for recomputing 1774 * dependencies because no lock will add an edge to this. 1775 */ 1776 1777 if (lock_effect == FLK_UNLOCK) { 1778 topology[nvertex-1] = NULL; 1779 nvertex--; 1780 } 1781 for (i = 0; i < nvertex; i++) { 1782 topology[i]->l_state |= RECOMPUTE_LOCK; 1783 topology[i]->l_color = NO_COLOR; 1784 } 1785 1786 ASSERT(FIRST_ADJ(lock) == HEAD(lock)); 1787 1788 /* 1789 * we remove the adjacent edges for all vertices' to this vertex 1790 * 'lock'. 1791 */ 1792 1793 ep = FIRST_IN(lock); 1794 while (ep != HEAD(lock)) { 1795 ADJ_LIST_REMOVE(ep); 1796 ep = NEXT_IN(ep); 1797 } 1798 1799 flk_delete_active_lock(lock, 0); 1800 1801 /* We are ready for recomputing the dependencies now */ 1802 1803 flk_recompute_dependencies(lock, topology, nvertex, 1); 1804 1805 for (i = 0; i < nvertex; i++) { 1806 topology[i]->l_state &= ~RECOMPUTE_LOCK; 1807 topology[i]->l_color = NO_COLOR; 1808 } 1809 1810 1811 if (lock_effect == FLK_UNLOCK) { 1812 nvertex++; 1813 } 1814 for (i = 0; i < nvertex - 1; i++) { 1815 flk_insert_active_lock(topology[i]); 1816 } 1817 1818 1819 if (lock_effect == FLK_DOWNGRADE || lock_effect == FLK_UNLOCK) { 1820 flk_wakeup(lock, 0); 1821 } else { 1822 ep = FIRST_IN(lock); 1823 while (ep != HEAD(lock)) { 1824 lock->l_sedge = NEXT_IN(ep); 1825 IN_LIST_REMOVE(ep); 1826 flk_update_proc_graph(ep, 1); 1827 flk_free_edge(ep); 1828 ep = lock->l_sedge; 1829 } 1830 } 1831 flk_free_lock(lock); 1832 1833 CHECK_SLEEPING_LOCKS(gp); 1834 CHECK_ACTIVE_LOCKS(gp); 1835 return (0); 1836 } 1837 1838 /* 1839 * Insert a lock into the active queue. 1840 */ 1841 1842 static void 1843 flk_insert_active_lock(lock_descriptor_t *new_lock) 1844 { 1845 graph_t *gp = new_lock->l_graph; 1846 vnode_t *vp = new_lock->l_vnode; 1847 lock_descriptor_t *first_lock, *lock; 1848 1849 ASSERT(MUTEX_HELD(&gp->gp_mutex)); 1850 1851 SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp); 1852 first_lock = lock; 1853 1854 if (first_lock != NULL) { 1855 for (; (lock->l_vnode == vp && 1856 lock->l_start < new_lock->l_start); lock = lock->l_next) 1857 ; 1858 } else { 1859 lock = ACTIVE_HEAD(gp); 1860 } 1861 1862 lock->l_prev->l_next = new_lock; 1863 new_lock->l_next = lock; 1864 new_lock->l_prev = lock->l_prev; 1865 lock->l_prev = new_lock; 1866 1867 if (first_lock == NULL || (new_lock->l_start <= first_lock->l_start)) { 1868 vp->v_filocks = (struct filock *)new_lock; 1869 } 1870 flk_set_state(new_lock, FLK_ACTIVE_STATE); 1871 new_lock->l_state |= ACTIVE_LOCK; 1872 1873 CHECK_ACTIVE_LOCKS(gp); 1874 CHECK_SLEEPING_LOCKS(gp); 1875 } 1876 1877 /* 1878 * Delete the active lock : Performs two functions depending on the 1879 * value of second parameter. One is to remove from the active lists 1880 * only and other is to both remove and free the lock. 
1881 */ 1882 1883 static void 1884 flk_delete_active_lock(lock_descriptor_t *lock, int free_lock) 1885 { 1886 vnode_t *vp = lock->l_vnode; 1887 graph_t *gp = lock->l_graph; 1888 1889 ASSERT(MUTEX_HELD(&gp->gp_mutex)); 1890 if (free_lock) 1891 ASSERT(NO_DEPENDENTS(lock)); 1892 ASSERT(NOT_BLOCKED(lock)); 1893 ASSERT(IS_ACTIVE(lock)); 1894 1895 ASSERT((vp->v_filocks != NULL)); 1896 1897 if (vp->v_filocks == (struct filock *)lock) { 1898 vp->v_filocks = (struct filock *) 1899 ((lock->l_next->l_vnode == vp) ? lock->l_next : 1900 NULL); 1901 } 1902 lock->l_next->l_prev = lock->l_prev; 1903 lock->l_prev->l_next = lock->l_next; 1904 lock->l_next = lock->l_prev = NULL; 1905 flk_set_state(lock, FLK_DEAD_STATE); 1906 lock->l_state &= ~ACTIVE_LOCK; 1907 1908 if (free_lock) 1909 flk_free_lock(lock); 1910 CHECK_ACTIVE_LOCKS(gp); 1911 CHECK_SLEEPING_LOCKS(gp); 1912 } 1913 1914 /* 1915 * Insert into the sleep queue. 1916 */ 1917 1918 static void 1919 flk_insert_sleeping_lock(lock_descriptor_t *request) 1920 { 1921 graph_t *gp = request->l_graph; 1922 vnode_t *vp = request->l_vnode; 1923 lock_descriptor_t *lock; 1924 1925 ASSERT(MUTEX_HELD(&gp->gp_mutex)); 1926 ASSERT(IS_INITIAL(request)); 1927 1928 for (lock = gp->sleeping_locks.l_next; (lock != &gp->sleeping_locks && 1929 lock->l_vnode < vp); lock = lock->l_next) 1930 ; 1931 1932 lock->l_prev->l_next = request; 1933 request->l_prev = lock->l_prev; 1934 lock->l_prev = request; 1935 request->l_next = lock; 1936 flk_set_state(request, FLK_SLEEPING_STATE); 1937 request->l_state |= SLEEPING_LOCK; 1938 } 1939 1940 /* 1941 * Cancelling a sleeping lock implies removing a vertex from the 1942 * dependency graph and therefore we should recompute the dependencies 1943 * of all vertices that have a path to this vertex, w.r.t. all 1944 * vertices reachable from this vertex. 1945 */ 1946 1947 void 1948 flk_cancel_sleeping_lock(lock_descriptor_t *request, int remove_from_queue) 1949 { 1950 graph_t *gp = request->l_graph; 1951 vnode_t *vp = request->l_vnode; 1952 lock_descriptor_t **topology = NULL; 1953 edge_t *ep; 1954 lock_descriptor_t *vertex, *lock; 1955 int nvertex = 0; 1956 int i; 1957 lock_descriptor_t *vertex_stack; 1958 1959 STACK_INIT(vertex_stack); 1960 1961 ASSERT(MUTEX_HELD(&gp->gp_mutex)); 1962 /* 1963 * count number of vertex pointers that has to be allocated 1964 * All vertices that are reachable from request. 1965 */ 1966 1967 STACK_PUSH(vertex_stack, request, l_stack); 1968 1969 while ((vertex = STACK_TOP(vertex_stack)) != NULL) { 1970 STACK_POP(vertex_stack, l_stack); 1971 for (ep = FIRST_ADJ(vertex); ep != HEAD(vertex); 1972 ep = NEXT_ADJ(ep)) { 1973 if (IS_RECOMPUTE(ep->to_vertex)) 1974 continue; 1975 ep->to_vertex->l_state |= RECOMPUTE_LOCK; 1976 STACK_PUSH(vertex_stack, ep->to_vertex, l_stack); 1977 nvertex++; 1978 } 1979 } 1980 1981 /* 1982 * allocate memory for holding the vertex pointers 1983 */ 1984 1985 if (nvertex) { 1986 topology = kmem_zalloc(nvertex * sizeof (lock_descriptor_t *), 1987 KM_SLEEP); 1988 } 1989 1990 /* 1991 * one more pass to actually store the vertices in the 1992 * allocated array. 1993 * We first check sleeping locks and then active locks 1994 * so that topology array will be in a topological 1995 * order. 
1996 */ 1997 1998 nvertex = 0; 1999 SET_LOCK_TO_FIRST_SLEEP_VP(gp, lock, vp); 2000 2001 if (lock) { 2002 do { 2003 if (IS_RECOMPUTE(lock)) { 2004 lock->l_index = nvertex; 2005 topology[nvertex++] = lock; 2006 } 2007 lock->l_color = NO_COLOR; 2008 lock = lock->l_next; 2009 } while (lock->l_vnode == vp); 2010 } 2011 2012 SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp); 2013 2014 if (lock) { 2015 do { 2016 if (IS_RECOMPUTE(lock)) { 2017 lock->l_index = nvertex; 2018 topology[nvertex++] = lock; 2019 } 2020 lock->l_color = NO_COLOR; 2021 lock = lock->l_next; 2022 } while (lock->l_vnode == vp); 2023 } 2024 2025 /* 2026 * remove in and out edges of request 2027 * They are freed after updating proc_graph below. 2028 */ 2029 2030 for (ep = FIRST_IN(request); ep != HEAD(request); ep = NEXT_IN(ep)) { 2031 ADJ_LIST_REMOVE(ep); 2032 } 2033 2034 2035 if (remove_from_queue) 2036 REMOVE_SLEEP_QUEUE(request); 2037 2038 /* we are ready to recompute */ 2039 2040 flk_recompute_dependencies(request, topology, nvertex, 1); 2041 2042 ep = FIRST_ADJ(request); 2043 while (ep != HEAD(request)) { 2044 IN_LIST_REMOVE(ep); 2045 request->l_sedge = NEXT_ADJ(ep); 2046 ADJ_LIST_REMOVE(ep); 2047 flk_update_proc_graph(ep, 1); 2048 flk_free_edge(ep); 2049 ep = request->l_sedge; 2050 } 2051 2052 2053 /* 2054 * unset the RECOMPUTE flag in those vertices 2055 */ 2056 2057 for (i = 0; i < nvertex; i++) { 2058 topology[i]->l_state &= ~RECOMPUTE_LOCK; 2059 } 2060 2061 /* 2062 * free the topology 2063 */ 2064 if (nvertex) 2065 kmem_free((void *)topology, 2066 (nvertex * sizeof (lock_descriptor_t *))); 2067 /* 2068 * Possibility of some locks unblocked now 2069 */ 2070 2071 flk_wakeup(request, 0); 2072 2073 /* 2074 * we expect to have a correctly recomputed graph now. 2075 */ 2076 flk_set_state(request, FLK_DEAD_STATE); 2077 flk_free_lock(request); 2078 CHECK_SLEEPING_LOCKS(gp); 2079 CHECK_ACTIVE_LOCKS(gp); 2080 2081 } 2082 2083 /* 2084 * Uncoloring the graph is simply to increment the mark value of the graph 2085 * And only when wrap round takes place will we color all vertices in 2086 * the graph explicitly. 2087 */ 2088 2089 static void 2090 flk_graph_uncolor(graph_t *gp) 2091 { 2092 lock_descriptor_t *lock; 2093 2094 if (gp->mark == UINT_MAX) { 2095 gp->mark = 1; 2096 for (lock = ACTIVE_HEAD(gp)->l_next; lock != ACTIVE_HEAD(gp); 2097 lock = lock->l_next) 2098 lock->l_color = 0; 2099 2100 for (lock = SLEEPING_HEAD(gp)->l_next; 2101 lock != SLEEPING_HEAD(gp); lock = lock->l_next) 2102 lock->l_color = 0; 2103 } else { 2104 gp->mark++; 2105 } 2106 } 2107 2108 /* 2109 * Wake up locks that are blocked on the given lock. 2110 */ 2111 2112 static void 2113 flk_wakeup(lock_descriptor_t *lock, int adj_list_remove) 2114 { 2115 edge_t *ep; 2116 graph_t *gp = lock->l_graph; 2117 lock_descriptor_t *lck; 2118 2119 ASSERT(MUTEX_HELD(&gp->gp_mutex)); 2120 if (NO_DEPENDENTS(lock)) 2121 return; 2122 ep = FIRST_IN(lock); 2123 do { 2124 /* 2125 * delete the edge from the adjacency list 2126 * of from vertex. if no more adjacent edges 2127 * for this vertex wake this process. 
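 * (once a blocked request has no remaining adjacent edges it is no
 * longer waiting on anything, so NOT_BLOCKED() becomes true and
 * GRANT_WAKEUP() wakes the waiting request)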
2128 */ 2129 lck = ep->from_vertex; 2130 if (adj_list_remove) 2131 ADJ_LIST_REMOVE(ep); 2132 flk_update_proc_graph(ep, 1); 2133 if (NOT_BLOCKED(lck)) { 2134 GRANT_WAKEUP(lck); 2135 } 2136 lock->l_sedge = NEXT_IN(ep); 2137 IN_LIST_REMOVE(ep); 2138 flk_free_edge(ep); 2139 ep = lock->l_sedge; 2140 } while (ep != HEAD(lock)); 2141 ASSERT(NO_DEPENDENTS(lock)); 2142 } 2143 2144 /* 2145 * The dependents of request, is checked for its dependency against the 2146 * locks in topology (called topology because the array is and should be in 2147 * topological order for this algorithm, if not in topological order the 2148 * inner loop below might add more edges than necessary. Topological ordering 2149 * of vertices satisfies the property that all edges will be from left to 2150 * right i.e., topology[i] can have an edge to topology[j], iff i<j) 2151 * If lock l1 in the dependent set of request is dependent (blocked by) 2152 * on lock l2 in topology but does not have a path to it, we add an edge 2153 * in the inner loop below. 2154 * 2155 * We don't want to add an edge between l1 and l2 if there exists 2156 * already a path from l1 to l2, so care has to be taken for those vertices 2157 * that have two paths to 'request'. These vertices are referred to here 2158 * as barrier locks. 2159 * 2160 * The barriers has to be found (those vertex that originally had two paths 2161 * to request) because otherwise we may end up adding edges unnecessarily 2162 * to vertices in topology, and thus barrier vertices can have an edge 2163 * to a vertex in topology as well a path to it. 2164 */ 2165 2166 static void 2167 flk_recompute_dependencies(lock_descriptor_t *request, 2168 lock_descriptor_t **topology, int nvertex, int update_graph) 2169 { 2170 lock_descriptor_t *vertex, *lock; 2171 graph_t *gp = request->l_graph; 2172 int i, count; 2173 int barrier_found = 0; 2174 edge_t *ep; 2175 lock_descriptor_t *vertex_stack; 2176 2177 STACK_INIT(vertex_stack); 2178 2179 ASSERT(MUTEX_HELD(&gp->gp_mutex)); 2180 if (nvertex == 0) 2181 return; 2182 flk_graph_uncolor(request->l_graph); 2183 barrier_found = flk_find_barriers(request); 2184 request->l_state |= RECOMPUTE_DONE; 2185 2186 STACK_PUSH(vertex_stack, request, l_stack); 2187 request->l_sedge = FIRST_IN(request); 2188 2189 2190 while ((vertex = STACK_TOP(vertex_stack)) != NULL) { 2191 if (vertex->l_state & RECOMPUTE_DONE) { 2192 count = 0; 2193 goto next_in_edge; 2194 } 2195 if (IS_BARRIER(vertex)) { 2196 /* decrement the barrier count */ 2197 if (vertex->l_index) { 2198 vertex->l_index--; 2199 /* this guy will be pushed again anyway ? */ 2200 STACK_POP(vertex_stack, l_stack); 2201 if (vertex->l_index == 0) { 2202 /* 2203 * barrier is over we can recompute 2204 * dependencies for this lock in the 2205 * next stack pop 2206 */ 2207 vertex->l_state &= ~BARRIER_LOCK; 2208 } 2209 continue; 2210 } 2211 } 2212 vertex->l_state |= RECOMPUTE_DONE; 2213 flk_graph_uncolor(gp); 2214 count = flk_color_reachables(vertex); 2215 for (i = 0; i < nvertex; i++) { 2216 lock = topology[i]; 2217 if (COLORED(lock)) 2218 continue; 2219 if (BLOCKS(lock, vertex)) { 2220 (void) flk_add_edge(vertex, lock, 2221 NO_CHECK_CYCLE, update_graph); 2222 COLOR(lock); 2223 count++; 2224 count += flk_color_reachables(lock); 2225 } 2226 2227 } 2228 2229 next_in_edge: 2230 if (count == nvertex || 2231 vertex->l_sedge == HEAD(vertex)) { 2232 /* prune the tree below this */ 2233 STACK_POP(vertex_stack, l_stack); 2234 vertex->l_state &= ~RECOMPUTE_DONE; 2235 /* update the barrier locks below this! 
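 * (their path counts were set up by flk_find_barriers(); now that one
 * path through this subtree has been fully processed, the counts of
 * the barrier vertices reachable from here are decremented)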
*/ 2236 if (vertex->l_sedge != HEAD(vertex) && barrier_found) { 2237 flk_graph_uncolor(gp); 2238 flk_update_barriers(vertex); 2239 } 2240 continue; 2241 } 2242 2243 ep = vertex->l_sedge; 2244 lock = ep->from_vertex; 2245 STACK_PUSH(vertex_stack, lock, l_stack); 2246 lock->l_sedge = FIRST_IN(lock); 2247 vertex->l_sedge = NEXT_IN(ep); 2248 } 2249 2250 } 2251 2252 /* 2253 * Color all reachable vertices from vertex that belongs to topology (here 2254 * those that have RECOMPUTE_LOCK set in their state) and yet uncolored. 2255 * 2256 * Note: we need to use a different stack_link l_stack1 because this is 2257 * called from flk_recompute_dependencies() that already uses a stack with 2258 * l_stack as stack_link. 2259 */ 2260 2261 static int 2262 flk_color_reachables(lock_descriptor_t *vertex) 2263 { 2264 lock_descriptor_t *ver, *lock; 2265 int count; 2266 edge_t *ep; 2267 lock_descriptor_t *vertex_stack; 2268 2269 STACK_INIT(vertex_stack); 2270 2271 STACK_PUSH(vertex_stack, vertex, l_stack1); 2272 count = 0; 2273 while ((ver = STACK_TOP(vertex_stack)) != NULL) { 2274 2275 STACK_POP(vertex_stack, l_stack1); 2276 for (ep = FIRST_ADJ(ver); ep != HEAD(ver); 2277 ep = NEXT_ADJ(ep)) { 2278 lock = ep->to_vertex; 2279 if (COLORED(lock)) 2280 continue; 2281 COLOR(lock); 2282 if (IS_RECOMPUTE(lock)) 2283 count++; 2284 STACK_PUSH(vertex_stack, lock, l_stack1); 2285 } 2286 2287 } 2288 return (count); 2289 } 2290 2291 /* 2292 * Called from flk_recompute_dependencies() this routine decrements 2293 * the barrier count of barrier vertices that are reachable from lock. 2294 */ 2295 2296 static void 2297 flk_update_barriers(lock_descriptor_t *lock) 2298 { 2299 lock_descriptor_t *vertex, *lck; 2300 edge_t *ep; 2301 lock_descriptor_t *vertex_stack; 2302 2303 STACK_INIT(vertex_stack); 2304 2305 STACK_PUSH(vertex_stack, lock, l_stack1); 2306 2307 while ((vertex = STACK_TOP(vertex_stack)) != NULL) { 2308 STACK_POP(vertex_stack, l_stack1); 2309 for (ep = FIRST_IN(vertex); ep != HEAD(vertex); 2310 ep = NEXT_IN(ep)) { 2311 lck = ep->from_vertex; 2312 if (COLORED(lck)) { 2313 if (IS_BARRIER(lck)) { 2314 ASSERT(lck->l_index > 0); 2315 lck->l_index--; 2316 if (lck->l_index == 0) 2317 lck->l_state &= ~BARRIER_LOCK; 2318 } 2319 continue; 2320 } 2321 COLOR(lck); 2322 if (IS_BARRIER(lck)) { 2323 ASSERT(lck->l_index > 0); 2324 lck->l_index--; 2325 if (lck->l_index == 0) 2326 lck->l_state &= ~BARRIER_LOCK; 2327 } 2328 STACK_PUSH(vertex_stack, lck, l_stack1); 2329 } 2330 } 2331 } 2332 2333 /* 2334 * Finds all vertices that are reachable from 'lock' more than once and 2335 * mark them as barrier vertices and increment their barrier count. 2336 * The barrier count is one minus the total number of paths from lock 2337 * to that vertex. 
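 *
 * For example (illustration only): if requests A and B are each
 * blocked by 'lock', and a third request C is blocked by both A and
 * B, then C is reachable from 'lock' along two paths, so it gets
 * BARRIER_LOCK set and l_index == 1 (2 paths - 1).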
2338 */ 2339 2340 static int 2341 flk_find_barriers(lock_descriptor_t *lock) 2342 { 2343 lock_descriptor_t *vertex, *lck; 2344 int found = 0; 2345 edge_t *ep; 2346 lock_descriptor_t *vertex_stack; 2347 2348 STACK_INIT(vertex_stack); 2349 2350 STACK_PUSH(vertex_stack, lock, l_stack1); 2351 2352 while ((vertex = STACK_TOP(vertex_stack)) != NULL) { 2353 STACK_POP(vertex_stack, l_stack1); 2354 for (ep = FIRST_IN(vertex); ep != HEAD(vertex); 2355 ep = NEXT_IN(ep)) { 2356 lck = ep->from_vertex; 2357 if (COLORED(lck)) { 2358 /* this is a barrier */ 2359 lck->l_state |= BARRIER_LOCK; 2360 /* index will have barrier count */ 2361 lck->l_index++; 2362 if (!found) 2363 found = 1; 2364 continue; 2365 } 2366 COLOR(lck); 2367 lck->l_index = 0; 2368 STACK_PUSH(vertex_stack, lck, l_stack1); 2369 } 2370 } 2371 return (found); 2372 } 2373 2374 /* 2375 * Finds the first lock that is mainly responsible for blocking this 2376 * request. If there is no such lock, request->l_flock.l_type is set to 2377 * F_UNLCK. Otherwise, request->l_flock is filled in with the particulars 2378 * of the blocking lock. 2379 * 2380 * Note: It is possible a request is blocked by a sleeping lock because 2381 * of the fairness policy used in flk_process_request() to construct the 2382 * dependencies. (see comments before flk_process_request()). 2383 */ 2384 2385 static void 2386 flk_get_first_blocking_lock(lock_descriptor_t *request) 2387 { 2388 graph_t *gp = request->l_graph; 2389 vnode_t *vp = request->l_vnode; 2390 lock_descriptor_t *lock, *blocker; 2391 2392 ASSERT(MUTEX_HELD(&gp->gp_mutex)); 2393 blocker = NULL; 2394 SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp); 2395 2396 if (lock) { 2397 do { 2398 if (BLOCKS(lock, request)) { 2399 blocker = lock; 2400 break; 2401 } 2402 lock = lock->l_next; 2403 } while (lock->l_vnode == vp); 2404 } 2405 2406 if (blocker == NULL && request->l_flock.l_type == F_RDLCK) { 2407 /* 2408 * No active lock is blocking this request, but if a read 2409 * lock is requested, it may also get blocked by a waiting 2410 * writer. So search all sleeping locks and see if there is 2411 * a writer waiting. 2412 */ 2413 SET_LOCK_TO_FIRST_SLEEP_VP(gp, lock, vp); 2414 if (lock) { 2415 do { 2416 if (BLOCKS(lock, request)) { 2417 blocker = lock; 2418 break; 2419 } 2420 lock = lock->l_next; 2421 } while (lock->l_vnode == vp); 2422 } 2423 } 2424 2425 if (blocker) { 2426 report_blocker(blocker, request); 2427 } else 2428 request->l_flock.l_type = F_UNLCK; 2429 } 2430 2431 /* 2432 * Get the graph_t structure associated with a vnode. 2433 * If 'initialize' is non-zero, and the graph_t structure for this vnode has 2434 * not yet been initialized, then a new element is allocated and returned. 
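 *
 * A read-only lookup therefore follows the pattern used by the query
 * routines later in this file (sketch):
 *
 *	gp = flk_get_lock_graph(vp, FLK_USE_GRAPH);
 *	if (gp == NULL)
 *		return (0);
 *	mutex_enter(&gp->gp_mutex);
 *	(walk the active and/or sleeping lists)
 *	mutex_exit(&gp->gp_mutex);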
2435 */ 2436 graph_t * 2437 flk_get_lock_graph(vnode_t *vp, int initialize) 2438 { 2439 graph_t *gp; 2440 graph_t *gp_alloc = NULL; 2441 int index = HASH_INDEX(vp); 2442 2443 if (initialize == FLK_USE_GRAPH) { 2444 mutex_enter(&flock_lock); 2445 gp = lock_graph[index]; 2446 mutex_exit(&flock_lock); 2447 return (gp); 2448 } 2449 2450 ASSERT(initialize == FLK_INIT_GRAPH); 2451 2452 if (lock_graph[index] == NULL) { 2453 2454 gp_alloc = kmem_zalloc(sizeof (graph_t), KM_SLEEP); 2455 2456 /* Initialize the graph */ 2457 2458 gp_alloc->active_locks.l_next = 2459 gp_alloc->active_locks.l_prev = 2460 (lock_descriptor_t *)ACTIVE_HEAD(gp_alloc); 2461 gp_alloc->sleeping_locks.l_next = 2462 gp_alloc->sleeping_locks.l_prev = 2463 (lock_descriptor_t *)SLEEPING_HEAD(gp_alloc); 2464 gp_alloc->index = index; 2465 mutex_init(&gp_alloc->gp_mutex, NULL, MUTEX_DEFAULT, NULL); 2466 } 2467 2468 mutex_enter(&flock_lock); 2469 2470 gp = lock_graph[index]; 2471 2472 /* Recheck the value within flock_lock */ 2473 if (gp == NULL) { 2474 struct flock_globals *fg; 2475 2476 /* We must have previously allocated the graph_t structure */ 2477 ASSERT(gp_alloc != NULL); 2478 lock_graph[index] = gp = gp_alloc; 2479 /* 2480 * The lockmgr status is only needed if KLM is loaded. 2481 */ 2482 if (flock_zone_key != ZONE_KEY_UNINITIALIZED) { 2483 fg = flk_get_globals(); 2484 fg->lockmgr_status[index] = fg->flk_lockmgr_status; 2485 } 2486 } 2487 2488 mutex_exit(&flock_lock); 2489 2490 if ((gp_alloc != NULL) && (gp != gp_alloc)) { 2491 /* There was a race to allocate the graph_t and we lost */ 2492 mutex_destroy(&gp_alloc->gp_mutex); 2493 kmem_free(gp_alloc, sizeof (graph_t)); 2494 } 2495 2496 return (gp); 2497 } 2498 2499 /* 2500 * PSARC case 1997/292 2501 */ 2502 int 2503 cl_flk_has_remote_locks_for_nlmid(vnode_t *vp, int nlmid) 2504 { 2505 lock_descriptor_t *lock; 2506 int result = 0; 2507 graph_t *gp; 2508 int lock_nlmid; 2509 2510 /* 2511 * Check to see if node is booted as a cluster. If not, return. 2512 */ 2513 if ((cluster_bootflags & CLUSTER_BOOTED) == 0) { 2514 return (0); 2515 } 2516 2517 gp = flk_get_lock_graph(vp, FLK_USE_GRAPH); 2518 if (gp == NULL) { 2519 return (0); 2520 } 2521 2522 mutex_enter(&gp->gp_mutex); 2523 2524 SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp); 2525 2526 if (lock) { 2527 while (lock->l_vnode == vp) { 2528 /* get NLM id from sysid */ 2529 lock_nlmid = GETNLMID(lock->l_flock.l_sysid); 2530 2531 /* 2532 * If NLM server request _and_ nlmid of lock matches 2533 * nlmid of argument, then we've found a remote lock. 2534 */ 2535 if (IS_LOCKMGR(lock) && nlmid == lock_nlmid) { 2536 result = 1; 2537 goto done; 2538 } 2539 lock = lock->l_next; 2540 } 2541 } 2542 2543 SET_LOCK_TO_FIRST_SLEEP_VP(gp, lock, vp); 2544 2545 if (lock) { 2546 while (lock->l_vnode == vp) { 2547 /* get NLM id from sysid */ 2548 lock_nlmid = GETNLMID(lock->l_flock.l_sysid); 2549 2550 /* 2551 * If NLM server request _and_ nlmid of lock matches 2552 * nlmid of argument, then we've found a remote lock. 2553 */ 2554 if (IS_LOCKMGR(lock) && nlmid == lock_nlmid) { 2555 result = 1; 2556 goto done; 2557 } 2558 lock = lock->l_next; 2559 } 2560 } 2561 2562 done: 2563 mutex_exit(&gp->gp_mutex); 2564 return (result); 2565 } 2566 2567 /* 2568 * Determine whether there are any locks for the given vnode with a remote 2569 * sysid. Returns zero if not, non-zero if there are. 2570 * 2571 * Note that the return value from this function is potentially invalid 2572 * once it has been returned. 
The caller is responsible for providing its 2573 * own synchronization mechanism to ensure that the return value is useful 2574 * (e.g., see nfs_lockcompletion()). 2575 */ 2576 int 2577 flk_has_remote_locks(vnode_t *vp) 2578 { 2579 lock_descriptor_t *lock; 2580 int result = 0; 2581 graph_t *gp; 2582 2583 gp = flk_get_lock_graph(vp, FLK_USE_GRAPH); 2584 if (gp == NULL) { 2585 return (0); 2586 } 2587 2588 mutex_enter(&gp->gp_mutex); 2589 2590 SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp); 2591 2592 if (lock) { 2593 while (lock->l_vnode == vp) { 2594 if (IS_REMOTE(lock)) { 2595 result = 1; 2596 goto done; 2597 } 2598 lock = lock->l_next; 2599 } 2600 } 2601 2602 SET_LOCK_TO_FIRST_SLEEP_VP(gp, lock, vp); 2603 2604 if (lock) { 2605 while (lock->l_vnode == vp) { 2606 if (IS_REMOTE(lock)) { 2607 result = 1; 2608 goto done; 2609 } 2610 lock = lock->l_next; 2611 } 2612 } 2613 2614 done: 2615 mutex_exit(&gp->gp_mutex); 2616 return (result); 2617 } 2618 2619 /* 2620 * Determine whether there are any locks for the given vnode with a remote 2621 * sysid matching given sysid. 2622 * Used by the new (open source) NFS Lock Manager (NLM) 2623 */ 2624 int 2625 flk_has_remote_locks_for_sysid(vnode_t *vp, int sysid) 2626 { 2627 lock_descriptor_t *lock; 2628 int result = 0; 2629 graph_t *gp; 2630 2631 if (sysid == 0) 2632 return (0); 2633 2634 gp = flk_get_lock_graph(vp, FLK_USE_GRAPH); 2635 if (gp == NULL) { 2636 return (0); 2637 } 2638 2639 mutex_enter(&gp->gp_mutex); 2640 2641 SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp); 2642 2643 if (lock) { 2644 while (lock->l_vnode == vp) { 2645 if (lock->l_flock.l_sysid == sysid) { 2646 result = 1; 2647 goto done; 2648 } 2649 lock = lock->l_next; 2650 } 2651 } 2652 2653 SET_LOCK_TO_FIRST_SLEEP_VP(gp, lock, vp); 2654 2655 if (lock) { 2656 while (lock->l_vnode == vp) { 2657 if (lock->l_flock.l_sysid == sysid) { 2658 result = 1; 2659 goto done; 2660 } 2661 lock = lock->l_next; 2662 } 2663 } 2664 2665 done: 2666 mutex_exit(&gp->gp_mutex); 2667 return (result); 2668 } 2669 2670 /* 2671 * Determine if there are any locks owned by the given sysid. 2672 * Returns zero if not, non-zero if there are. Note that this return code 2673 * could be derived from flk_get_{sleeping,active}_locks, but this routine 2674 * avoids all the memory allocations of those routines. 2675 * 2676 * This routine has the same synchronization issues as 2677 * flk_has_remote_locks. 2678 */ 2679 2680 int 2681 flk_sysid_has_locks(int sysid, int lck_type) 2682 { 2683 int has_locks = 0; 2684 lock_descriptor_t *lock; 2685 graph_t *gp; 2686 int i; 2687 2688 for (i = 0; i < HASH_SIZE && !has_locks; i++) { 2689 mutex_enter(&flock_lock); 2690 gp = lock_graph[i]; 2691 mutex_exit(&flock_lock); 2692 if (gp == NULL) { 2693 continue; 2694 } 2695 2696 mutex_enter(&gp->gp_mutex); 2697 2698 if (lck_type & FLK_QUERY_ACTIVE) { 2699 for (lock = ACTIVE_HEAD(gp)->l_next; 2700 lock != ACTIVE_HEAD(gp) && !has_locks; 2701 lock = lock->l_next) { 2702 if (lock->l_flock.l_sysid == sysid) 2703 has_locks = 1; 2704 } 2705 } 2706 2707 if (lck_type & FLK_QUERY_SLEEPING) { 2708 for (lock = SLEEPING_HEAD(gp)->l_next; 2709 lock != SLEEPING_HEAD(gp) && !has_locks; 2710 lock = lock->l_next) { 2711 if (lock->l_flock.l_sysid == sysid) 2712 has_locks = 1; 2713 } 2714 } 2715 mutex_exit(&gp->gp_mutex); 2716 } 2717 2718 return (has_locks); 2719 } 2720 2721 2722 /* 2723 * PSARC case 1997/292 2724 * 2725 * Requires: "sysid" is a pair [nlmid, sysid]. 
The lower half is 16-bit 2726 * quantity, the real sysid generated by the NLM server; the upper half 2727 * identifies the node of the cluster where the NLM server ran. 2728 * This routine is only called by an NLM server running in a cluster. 2729 * Effects: Remove all locks held on behalf of the client identified 2730 * by "sysid." 2731 */ 2732 void 2733 cl_flk_remove_locks_by_sysid(int sysid) 2734 { 2735 graph_t *gp; 2736 int i; 2737 lock_descriptor_t *lock, *nlock; 2738 2739 /* 2740 * Check to see if node is booted as a cluster. If not, return. 2741 */ 2742 if ((cluster_bootflags & CLUSTER_BOOTED) == 0) { 2743 return; 2744 } 2745 2746 ASSERT(sysid != 0); 2747 for (i = 0; i < HASH_SIZE; i++) { 2748 mutex_enter(&flock_lock); 2749 gp = lock_graph[i]; 2750 mutex_exit(&flock_lock); 2751 2752 if (gp == NULL) 2753 continue; 2754 2755 mutex_enter(&gp->gp_mutex); /* get mutex on lock graph */ 2756 2757 /* signal sleeping requests so that they bail out */ 2758 lock = SLEEPING_HEAD(gp)->l_next; 2759 while (lock != SLEEPING_HEAD(gp)) { 2760 nlock = lock->l_next; 2761 if (lock->l_flock.l_sysid == sysid) { 2762 INTERRUPT_WAKEUP(lock); 2763 } 2764 lock = nlock; 2765 } 2766 2767 /* delete active locks */ 2768 lock = ACTIVE_HEAD(gp)->l_next; 2769 while (lock != ACTIVE_HEAD(gp)) { 2770 nlock = lock->l_next; 2771 if (lock->l_flock.l_sysid == sysid) { 2772 flk_delete_active_lock(lock, 0); 2773 flk_wakeup(lock, 1); 2774 flk_free_lock(lock); 2775 } 2776 lock = nlock; 2777 } 2778 mutex_exit(&gp->gp_mutex); /* release mutex on lock graph */ 2779 } 2780 } 2781 2782 /* 2783 * Delete all locks in the system that belongs to the sysid of the request. 2784 */ 2785 2786 static void 2787 flk_delete_locks_by_sysid(lock_descriptor_t *request) 2788 { 2789 int sysid = request->l_flock.l_sysid; 2790 lock_descriptor_t *lock, *nlock; 2791 graph_t *gp; 2792 int i; 2793 2794 ASSERT(MUTEX_HELD(&request->l_graph->gp_mutex)); 2795 ASSERT(sysid != 0); 2796 2797 mutex_exit(&request->l_graph->gp_mutex); 2798 2799 for (i = 0; i < HASH_SIZE; i++) { 2800 mutex_enter(&flock_lock); 2801 gp = lock_graph[i]; 2802 mutex_exit(&flock_lock); 2803 2804 if (gp == NULL) 2805 continue; 2806 2807 mutex_enter(&gp->gp_mutex); 2808 2809 /* signal sleeping requests so that they bail out */ 2810 lock = SLEEPING_HEAD(gp)->l_next; 2811 while (lock != SLEEPING_HEAD(gp)) { 2812 nlock = lock->l_next; 2813 if (lock->l_flock.l_sysid == sysid) { 2814 INTERRUPT_WAKEUP(lock); 2815 } 2816 lock = nlock; 2817 } 2818 2819 /* delete active locks */ 2820 lock = ACTIVE_HEAD(gp)->l_next; 2821 while (lock != ACTIVE_HEAD(gp)) { 2822 nlock = lock->l_next; 2823 if (lock->l_flock.l_sysid == sysid) { 2824 flk_delete_active_lock(lock, 0); 2825 flk_wakeup(lock, 1); 2826 flk_free_lock(lock); 2827 } 2828 lock = nlock; 2829 } 2830 mutex_exit(&gp->gp_mutex); 2831 } 2832 2833 mutex_enter(&request->l_graph->gp_mutex); 2834 } 2835 2836 /* 2837 * Clustering: Deletes PXFS locks 2838 * Effects: Delete all locks on files in the given file system and with the 2839 * given PXFS id. 
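 *
 * Sleeping requests are cancelled through flk_cancel_sleeping_lock()
 * so that the dependency graph is recomputed; granted locks are
 * simply deleted and any requests blocked on them are woken up.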
2840 */ 2841 void 2842 cl_flk_delete_pxfs_locks(struct vfs *vfsp, int pxfsid) 2843 { 2844 lock_descriptor_t *lock, *nlock; 2845 graph_t *gp; 2846 int i; 2847 2848 for (i = 0; i < HASH_SIZE; i++) { 2849 mutex_enter(&flock_lock); 2850 gp = lock_graph[i]; 2851 mutex_exit(&flock_lock); 2852 2853 if (gp == NULL) 2854 continue; 2855 2856 mutex_enter(&gp->gp_mutex); 2857 2858 /* signal sleeping requests so that they bail out */ 2859 lock = SLEEPING_HEAD(gp)->l_next; 2860 while (lock != SLEEPING_HEAD(gp)) { 2861 nlock = lock->l_next; 2862 if (lock->l_vnode->v_vfsp == vfsp) { 2863 ASSERT(IS_PXFS(lock)); 2864 if (GETPXFSID(lock->l_flock.l_sysid) == 2865 pxfsid) { 2866 flk_set_state(lock, 2867 FLK_CANCELLED_STATE); 2868 flk_cancel_sleeping_lock(lock, 1); 2869 } 2870 } 2871 lock = nlock; 2872 } 2873 2874 /* delete active locks */ 2875 lock = ACTIVE_HEAD(gp)->l_next; 2876 while (lock != ACTIVE_HEAD(gp)) { 2877 nlock = lock->l_next; 2878 if (lock->l_vnode->v_vfsp == vfsp) { 2879 ASSERT(IS_PXFS(lock)); 2880 if (GETPXFSID(lock->l_flock.l_sysid) == 2881 pxfsid) { 2882 flk_delete_active_lock(lock, 0); 2883 flk_wakeup(lock, 1); 2884 flk_free_lock(lock); 2885 } 2886 } 2887 lock = nlock; 2888 } 2889 mutex_exit(&gp->gp_mutex); 2890 } 2891 } 2892 2893 /* 2894 * Search for a sleeping lock manager lock which matches exactly this lock 2895 * request; if one is found, fake a signal to cancel it. 2896 * 2897 * Return 1 if a matching lock was found, 0 otherwise. 2898 */ 2899 2900 static int 2901 flk_canceled(lock_descriptor_t *request) 2902 { 2903 lock_descriptor_t *lock, *nlock; 2904 graph_t *gp = request->l_graph; 2905 vnode_t *vp = request->l_vnode; 2906 2907 ASSERT(MUTEX_HELD(&gp->gp_mutex)); 2908 ASSERT(IS_LOCKMGR(request)); 2909 SET_LOCK_TO_FIRST_SLEEP_VP(gp, lock, vp); 2910 2911 if (lock) { 2912 while (lock->l_vnode == vp) { 2913 nlock = lock->l_next; 2914 if (SAME_OWNER(lock, request) && 2915 lock->l_start == request->l_start && 2916 lock->l_end == request->l_end) { 2917 INTERRUPT_WAKEUP(lock); 2918 return (1); 2919 } 2920 lock = nlock; 2921 } 2922 } 2923 return (0); 2924 } 2925 2926 /* 2927 * Remove all non-OFD locks for the vnode belonging to the given pid and sysid. 2928 * That is, since OFD locks are pid-less we'll never match on the incoming 2929 * pid. OFD locks are removed earlier in the close() path via closef() and 2930 * ofdcleanlock(). 
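 *
 * Passing pid == IGN_PID removes every lock on the vnode that matches
 * the given sysid, regardless of the owning process, e.g. (sketch):
 *
 *	cleanlocks(vp, IGN_PID, sysid);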
2931 */ 2932 void 2933 cleanlocks(vnode_t *vp, pid_t pid, int sysid) 2934 { 2935 graph_t *gp; 2936 lock_descriptor_t *lock, *nlock; 2937 lock_descriptor_t *link_stack; 2938 2939 STACK_INIT(link_stack); 2940 2941 gp = flk_get_lock_graph(vp, FLK_USE_GRAPH); 2942 2943 if (gp == NULL) 2944 return; 2945 mutex_enter(&gp->gp_mutex); 2946 2947 CHECK_SLEEPING_LOCKS(gp); 2948 CHECK_ACTIVE_LOCKS(gp); 2949 2950 SET_LOCK_TO_FIRST_SLEEP_VP(gp, lock, vp); 2951 2952 if (lock) { 2953 do { 2954 nlock = lock->l_next; 2955 if ((lock->l_flock.l_pid == pid || 2956 pid == IGN_PID) && 2957 lock->l_flock.l_sysid == sysid) { 2958 CANCEL_WAKEUP(lock); 2959 } 2960 lock = nlock; 2961 } while (lock->l_vnode == vp); 2962 } 2963 2964 SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp); 2965 2966 if (lock) { 2967 do { 2968 nlock = lock->l_next; 2969 if ((lock->l_flock.l_pid == pid || 2970 pid == IGN_PID) && 2971 lock->l_flock.l_sysid == sysid) { 2972 flk_delete_active_lock(lock, 0); 2973 STACK_PUSH(link_stack, lock, l_stack); 2974 } 2975 lock = nlock; 2976 } while (lock->l_vnode == vp); 2977 } 2978 2979 while ((lock = STACK_TOP(link_stack)) != NULL) { 2980 STACK_POP(link_stack, l_stack); 2981 flk_wakeup(lock, 1); 2982 flk_free_lock(lock); 2983 } 2984 2985 CHECK_SLEEPING_LOCKS(gp); 2986 CHECK_ACTIVE_LOCKS(gp); 2987 CHECK_OWNER_LOCKS(gp, pid, sysid, vp); 2988 mutex_exit(&gp->gp_mutex); 2989 } 2990 2991 2992 /* 2993 * Called from 'fs' read and write routines for files that have mandatory 2994 * locking enabled. 2995 */ 2996 2997 int 2998 chklock(struct vnode *vp, int iomode, u_offset_t offset, ssize_t len, int fmode, 2999 caller_context_t *ct) 3000 { 3001 register int i; 3002 struct flock64 bf; 3003 int error = 0; 3004 3005 bf.l_type = (iomode & FWRITE) ? F_WRLCK : F_RDLCK; 3006 bf.l_whence = 0; 3007 bf.l_start = offset; 3008 bf.l_len = len; 3009 if (ct == NULL) { 3010 bf.l_pid = curproc->p_pid; 3011 bf.l_sysid = 0; 3012 } else { 3013 bf.l_pid = ct->cc_pid; 3014 bf.l_sysid = ct->cc_sysid; 3015 } 3016 i = (fmode & (FNDELAY|FNONBLOCK)) ? INOFLCK : INOFLCK|SLPFLCK; 3017 if ((i = reclock(vp, &bf, i, 0, offset, NULL)) != 0 || 3018 bf.l_type != F_UNLCK) 3019 error = i ? i : EAGAIN; 3020 return (error); 3021 } 3022 3023 /* 3024 * convoff - converts the given data (start, whence) to the 3025 * given whence. 3026 */ 3027 int 3028 convoff(struct vnode *vp, struct flock64 *lckdat, int whence, offset_t offset) 3029 { 3030 int error; 3031 struct vattr vattr; 3032 3033 if ((lckdat->l_whence == 2) || (whence == 2)) { 3034 vattr.va_mask = AT_SIZE; 3035 if (error = VOP_GETATTR(vp, &vattr, 0, CRED(), NULL)) 3036 return (error); 3037 } 3038 3039 switch (lckdat->l_whence) { 3040 case 1: 3041 lckdat->l_start += offset; 3042 break; 3043 case 2: 3044 lckdat->l_start += vattr.va_size; 3045 /* FALLTHRU */ 3046 case 0: 3047 break; 3048 default: 3049 return (EINVAL); 3050 } 3051 3052 if (lckdat->l_start < 0) 3053 return (EINVAL); 3054 3055 switch (whence) { 3056 case 1: 3057 lckdat->l_start -= offset; 3058 break; 3059 case 2: 3060 lckdat->l_start -= vattr.va_size; 3061 /* FALLTHRU */ 3062 case 0: 3063 break; 3064 default: 3065 return (EINVAL); 3066 } 3067 3068 lckdat->l_whence = (short)whence; 3069 return (0); 3070 } 3071 3072 3073 /* proc_graph function definitions */ 3074 3075 /* 3076 * Function checks for deadlock due to the new 'lock'. If deadlock found 3077 * edges of this lock are freed and returned. 
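 * (A deadlock here is a cycle in the process graph built below: for
 * example, if adding 'lock' would make process P1 wait on a lock held
 * by P2 while P2 is already waiting on a lock held by P1, then P1
 * becomes reachable from itself and 1 is returned so that the caller
 * can refuse the request rather than let it sleep.)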
3078 */ 3079 3080 static int 3081 flk_check_deadlock(lock_descriptor_t *lock) 3082 { 3083 proc_vertex_t *start_vertex, *pvertex; 3084 proc_vertex_t *dvertex; 3085 proc_edge_t *pep, *ppep; 3086 edge_t *ep, *nep; 3087 proc_vertex_t *process_stack; 3088 3089 /* 3090 * OFD style locks are not associated with any process so there is 3091 * no proc graph for these. Thus we cannot, and do not, do deadlock 3092 * detection. 3093 */ 3094 if (lock->l_ofd != NULL) 3095 return (0); 3096 3097 STACK_INIT(process_stack); 3098 3099 mutex_enter(&flock_lock); 3100 start_vertex = flk_get_proc_vertex(lock); 3101 ASSERT(start_vertex != NULL); 3102 3103 /* construct the edges from this process to other processes */ 3104 3105 ep = FIRST_ADJ(lock); 3106 while (ep != HEAD(lock)) { 3107 proc_vertex_t *adj_proc; 3108 3109 adj_proc = flk_get_proc_vertex(ep->to_vertex); 3110 for (pep = start_vertex->edge; pep != NULL; pep = pep->next) { 3111 if (pep->to_proc == adj_proc) { 3112 ASSERT(pep->refcount); 3113 pep->refcount++; 3114 break; 3115 } 3116 } 3117 if (pep == NULL) { 3118 pep = flk_get_proc_edge(); 3119 pep->to_proc = adj_proc; 3120 pep->refcount = 1; 3121 adj_proc->incount++; 3122 pep->next = start_vertex->edge; 3123 start_vertex->edge = pep; 3124 } 3125 ep = NEXT_ADJ(ep); 3126 } 3127 3128 ep = FIRST_IN(lock); 3129 3130 while (ep != HEAD(lock)) { 3131 proc_vertex_t *in_proc; 3132 3133 in_proc = flk_get_proc_vertex(ep->from_vertex); 3134 3135 for (pep = in_proc->edge; pep != NULL; pep = pep->next) { 3136 if (pep->to_proc == start_vertex) { 3137 ASSERT(pep->refcount); 3138 pep->refcount++; 3139 break; 3140 } 3141 } 3142 if (pep == NULL) { 3143 pep = flk_get_proc_edge(); 3144 pep->to_proc = start_vertex; 3145 pep->refcount = 1; 3146 start_vertex->incount++; 3147 pep->next = in_proc->edge; 3148 in_proc->edge = pep; 3149 } 3150 ep = NEXT_IN(ep); 3151 } 3152 3153 if (start_vertex->incount == 0) { 3154 mutex_exit(&flock_lock); 3155 return (0); 3156 } 3157 3158 flk_proc_graph_uncolor(); 3159 3160 start_vertex->p_sedge = start_vertex->edge; 3161 3162 STACK_PUSH(process_stack, start_vertex, p_stack); 3163 3164 while ((pvertex = STACK_TOP(process_stack)) != NULL) { 3165 for (pep = pvertex->p_sedge; pep != NULL; pep = pep->next) { 3166 dvertex = pep->to_proc; 3167 if (!PROC_ARRIVED(dvertex)) { 3168 STACK_PUSH(process_stack, dvertex, p_stack); 3169 dvertex->p_sedge = dvertex->edge; 3170 PROC_ARRIVE(pvertex); 3171 pvertex->p_sedge = pep->next; 3172 break; 3173 } 3174 if (!PROC_DEPARTED(dvertex)) 3175 goto deadlock; 3176 } 3177 if (pep == NULL) { 3178 PROC_DEPART(pvertex); 3179 STACK_POP(process_stack, p_stack); 3180 } 3181 } 3182 mutex_exit(&flock_lock); 3183 return (0); 3184 3185 deadlock: 3186 3187 /* we remove all lock edges and proc edges */ 3188 3189 ep = FIRST_ADJ(lock); 3190 while (ep != HEAD(lock)) { 3191 proc_vertex_t *adj_proc; 3192 adj_proc = flk_get_proc_vertex(ep->to_vertex); 3193 nep = NEXT_ADJ(ep); 3194 IN_LIST_REMOVE(ep); 3195 ADJ_LIST_REMOVE(ep); 3196 flk_free_edge(ep); 3197 ppep = start_vertex->edge; 3198 for (pep = start_vertex->edge; pep != NULL; ppep = pep, 3199 pep = ppep->next) { 3200 if (pep->to_proc == adj_proc) { 3201 pep->refcount--; 3202 if (pep->refcount == 0) { 3203 if (pep == ppep) { 3204 start_vertex->edge = pep->next; 3205 } else { 3206 ppep->next = pep->next; 3207 } 3208 adj_proc->incount--; 3209 flk_proc_release(adj_proc); 3210 flk_free_proc_edge(pep); 3211 } 3212 break; 3213 } 3214 } 3215 ep = nep; 3216 } 3217 ep = FIRST_IN(lock); 3218 while (ep != HEAD(lock)) { 3219 proc_vertex_t *in_proc; 
3220 in_proc = flk_get_proc_vertex(ep->from_vertex); 3221 nep = NEXT_IN(ep); 3222 IN_LIST_REMOVE(ep); 3223 ADJ_LIST_REMOVE(ep); 3224 flk_free_edge(ep); 3225 ppep = in_proc->edge; 3226 for (pep = in_proc->edge; pep != NULL; ppep = pep, 3227 pep = ppep->next) { 3228 if (pep->to_proc == start_vertex) { 3229 pep->refcount--; 3230 if (pep->refcount == 0) { 3231 if (pep == ppep) { 3232 in_proc->edge = pep->next; 3233 } else { 3234 ppep->next = pep->next; 3235 } 3236 start_vertex->incount--; 3237 flk_proc_release(in_proc); 3238 flk_free_proc_edge(pep); 3239 } 3240 break; 3241 } 3242 } 3243 ep = nep; 3244 } 3245 flk_proc_release(start_vertex); 3246 mutex_exit(&flock_lock); 3247 return (1); 3248 } 3249 3250 /* 3251 * Get a proc vertex. If lock's pvertex value gets a correct proc vertex 3252 * from the list we return that, otherwise we allocate one. If necessary, 3253 * we grow the list of vertices also. 3254 */ 3255 3256 static proc_vertex_t * 3257 flk_get_proc_vertex(lock_descriptor_t *lock) 3258 { 3259 int i; 3260 proc_vertex_t *pv; 3261 proc_vertex_t **palloc; 3262 3263 ASSERT(MUTEX_HELD(&flock_lock)); 3264 if (lock->pvertex != -1) { 3265 ASSERT(lock->pvertex >= 0); 3266 pv = pgraph.proc[lock->pvertex]; 3267 if (pv != NULL && PROC_SAME_OWNER(lock, pv)) { 3268 return (pv); 3269 } 3270 } 3271 for (i = 0; i < pgraph.gcount; i++) { 3272 pv = pgraph.proc[i]; 3273 if (pv != NULL && PROC_SAME_OWNER(lock, pv)) { 3274 lock->pvertex = pv->index = i; 3275 return (pv); 3276 } 3277 } 3278 pv = kmem_zalloc(sizeof (struct proc_vertex), KM_SLEEP); 3279 pv->pid = lock->l_flock.l_pid; 3280 pv->sysid = lock->l_flock.l_sysid; 3281 flk_proc_vertex_allocs++; 3282 if (pgraph.free != 0) { 3283 for (i = 0; i < pgraph.gcount; i++) { 3284 if (pgraph.proc[i] == NULL) { 3285 pgraph.proc[i] = pv; 3286 lock->pvertex = pv->index = i; 3287 pgraph.free--; 3288 return (pv); 3289 } 3290 } 3291 } 3292 palloc = kmem_zalloc((pgraph.gcount + PROC_CHUNK) * 3293 sizeof (proc_vertex_t *), KM_SLEEP); 3294 3295 if (pgraph.proc) { 3296 bcopy(pgraph.proc, palloc, 3297 pgraph.gcount * sizeof (proc_vertex_t *)); 3298 3299 kmem_free(pgraph.proc, 3300 pgraph.gcount * sizeof (proc_vertex_t *)); 3301 } 3302 pgraph.proc = palloc; 3303 pgraph.free += (PROC_CHUNK - 1); 3304 pv->index = lock->pvertex = pgraph.gcount; 3305 pgraph.gcount += PROC_CHUNK; 3306 pgraph.proc[pv->index] = pv; 3307 return (pv); 3308 } 3309 3310 /* 3311 * Allocate a proc edge. 3312 */ 3313 3314 static proc_edge_t * 3315 flk_get_proc_edge() 3316 { 3317 proc_edge_t *pep; 3318 3319 pep = kmem_zalloc(sizeof (proc_edge_t), KM_SLEEP); 3320 flk_proc_edge_allocs++; 3321 return (pep); 3322 } 3323 3324 /* 3325 * Free the proc edge. Called whenever its reference count goes to zero. 3326 */ 3327 3328 static void 3329 flk_free_proc_edge(proc_edge_t *pep) 3330 { 3331 ASSERT(pep->refcount == 0); 3332 kmem_free((void *)pep, sizeof (proc_edge_t)); 3333 flk_proc_edge_frees++; 3334 } 3335 3336 /* 3337 * Color the graph explicitly done only when the mark value hits max value. 
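 * (Normally "uncoloring" is just pgraph.mark++; the atime/dtime
 * fields of every process vertex are reset explicitly only when the
 * mark value wraps around, exactly as flk_graph_uncolor() does for
 * lock vertices.)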
3338 */ 3339 3340 static void 3341 flk_proc_graph_uncolor() 3342 { 3343 int i; 3344 3345 if (pgraph.mark == UINT_MAX) { 3346 for (i = 0; i < pgraph.gcount; i++) 3347 if (pgraph.proc[i] != NULL) { 3348 pgraph.proc[i]->atime = 0; 3349 pgraph.proc[i]->dtime = 0; 3350 } 3351 pgraph.mark = 1; 3352 } else { 3353 pgraph.mark++; 3354 } 3355 } 3356 3357 /* 3358 * Release the proc vertex iff both there are no in edges and out edges 3359 */ 3360 3361 static void 3362 flk_proc_release(proc_vertex_t *proc) 3363 { 3364 ASSERT(MUTEX_HELD(&flock_lock)); 3365 if (proc->edge == NULL && proc->incount == 0) { 3366 pgraph.proc[proc->index] = NULL; 3367 pgraph.free++; 3368 kmem_free(proc, sizeof (proc_vertex_t)); 3369 flk_proc_vertex_frees++; 3370 } 3371 } 3372 3373 /* 3374 * Updates process graph to reflect change in a lock_graph. 3375 * Note: We should call this function only after we have a correctly 3376 * recomputed lock graph. Otherwise we might miss a deadlock detection. 3377 * eg: in function flk_relation() we call this function after flk_recompute_ 3378 * dependencies() otherwise if a process tries to lock a vnode hashed 3379 * into another graph it might sleep for ever. 3380 */ 3381 3382 static void 3383 flk_update_proc_graph(edge_t *ep, int delete) 3384 { 3385 proc_vertex_t *toproc, *fromproc; 3386 proc_edge_t *pep, *prevpep; 3387 3388 mutex_enter(&flock_lock); 3389 3390 /* 3391 * OFD style locks are not associated with any process so there is 3392 * no proc graph for these. 3393 */ 3394 if (ep->from_vertex->l_ofd != NULL) { 3395 mutex_exit(&flock_lock); 3396 return; 3397 } 3398 3399 toproc = flk_get_proc_vertex(ep->to_vertex); 3400 fromproc = flk_get_proc_vertex(ep->from_vertex); 3401 3402 if (!delete) 3403 goto add; 3404 pep = prevpep = fromproc->edge; 3405 3406 ASSERT(pep != NULL); 3407 while (pep != NULL) { 3408 if (pep->to_proc == toproc) { 3409 ASSERT(pep->refcount > 0); 3410 pep->refcount--; 3411 if (pep->refcount == 0) { 3412 if (pep == prevpep) { 3413 fromproc->edge = pep->next; 3414 } else { 3415 prevpep->next = pep->next; 3416 } 3417 toproc->incount--; 3418 flk_proc_release(toproc); 3419 flk_free_proc_edge(pep); 3420 } 3421 break; 3422 } 3423 prevpep = pep; 3424 pep = pep->next; 3425 } 3426 flk_proc_release(fromproc); 3427 mutex_exit(&flock_lock); 3428 return; 3429 add: 3430 3431 pep = fromproc->edge; 3432 3433 while (pep != NULL) { 3434 if (pep->to_proc == toproc) { 3435 ASSERT(pep->refcount > 0); 3436 pep->refcount++; 3437 break; 3438 } 3439 pep = pep->next; 3440 } 3441 if (pep == NULL) { 3442 pep = flk_get_proc_edge(); 3443 pep->to_proc = toproc; 3444 pep->refcount = 1; 3445 toproc->incount++; 3446 pep->next = fromproc->edge; 3447 fromproc->edge = pep; 3448 } 3449 mutex_exit(&flock_lock); 3450 } 3451 3452 /* 3453 * Set the control status for lock manager requests. 3454 * 3455 */ 3456 3457 /* 3458 * PSARC case 1997/292 3459 * 3460 * Requires: "nlmid" must be >= 1 and <= clconf_maximum_nodeid(). 3461 * Effects: Set the state of the NLM server identified by "nlmid" 3462 * in the NLM registry to state "nlm_state." 3463 * Raises exception no_such_nlm if "nlmid" doesn't identify a known 3464 * NLM server to this LLM. 3465 * Note that when this routine is called with NLM_SHUTTING_DOWN there 3466 * may be locks requests that have gotten started but not finished. In 3467 * particular, there may be blocking requests that are in the callback code 3468 * before sleeping (so they're not holding the lock for the graph). 
If 3469 * such a thread reacquires the graph's lock (to go to sleep) after 3470 * NLM state in the NLM registry is set to a non-up value, 3471 * it will notice the status and bail out. If the request gets 3472 * granted before the thread can check the NLM registry, let it 3473 * continue normally. It will get flushed when we are called with NLM_DOWN. 3474 * 3475 * Modifies: nlm_reg_obj (global) 3476 * Arguments: 3477 * nlmid (IN): id uniquely identifying an NLM server 3478 * nlm_state (IN): NLM server state to change "nlmid" to 3479 */ 3480 void 3481 cl_flk_set_nlm_status(int nlmid, flk_nlm_status_t nlm_state) 3482 { 3483 /* 3484 * Check to see if node is booted as a cluster. If not, return. 3485 */ 3486 if ((cluster_bootflags & CLUSTER_BOOTED) == 0) { 3487 return; 3488 } 3489 3490 /* 3491 * Check for development/debugging. It is possible to boot a node 3492 * in non-cluster mode, and then run a special script, currently 3493 * available only to developers, to bring up the node as part of a 3494 * cluster. The problem is that running such a script does not 3495 * result in the routine flk_init() being called and hence global array 3496 * nlm_reg_status is NULL. The NLM thinks it's in cluster mode, 3497 * but the LLM needs to do an additional check to see if the global 3498 * array has been created or not. If nlm_reg_status is NULL, then 3499 * return, else continue. 3500 */ 3501 if (nlm_reg_status == NULL) { 3502 return; 3503 } 3504 3505 ASSERT(nlmid <= nlm_status_size && nlmid >= 0); 3506 mutex_enter(&nlm_reg_lock); 3507 3508 if (FLK_REGISTRY_IS_NLM_UNKNOWN(nlm_reg_status, nlmid)) { 3509 /* 3510 * If the NLM server "nlmid" is unknown in the NLM registry, 3511 * add it to the registry in the FLK_NLM_SHUTTING_DOWN state. 3512 */ 3513 FLK_REGISTRY_CHANGE_NLM_STATE(nlm_reg_status, nlmid, 3514 FLK_NLM_SHUTTING_DOWN); 3515 } else { 3516 /* 3517 * Change the state of the NLM server identified by "nlmid" 3518 * in the NLM registry to the argument "nlm_state." 3519 */ 3520 FLK_REGISTRY_CHANGE_NLM_STATE(nlm_reg_status, nlmid, 3521 nlm_state); 3522 } 3523 3524 /* 3525 * The reason we must register the NLM server that is shutting down 3526 * with an LLM that doesn't already know about it (never sent a lock 3527 * request) is to correctly handle a race between shutdown and a new 3528 * lock request. Suppose that a shutdown request from the NLM server 3529 * invokes this routine at the LLM, and a thread is spawned to 3530 * service the request. Now suppose a new lock request is in 3531 * progress and has already passed the first line of defense in 3532 * reclock(), which denies new lock requests from NLM servers 3533 * that are not in the NLM_UP state. After the current routine 3534 * is invoked for both phases of shutdown, the routine will return, 3535 * having done nothing, and the lock request will proceed and 3536 * probably be granted. The problem is that the shutdown was ignored 3537 * by the lock request because there was no record of that NLM server 3538 * shutting down. We will be in the peculiar position of thinking 3539 * that we've shut down the NLM server and all locks at all LLMs have 3540 * been discarded, but in fact there's still one lock held. 3541 * The solution is to record the existence of the NLM server and change 3542 * its state immediately to NLM_SHUTTING_DOWN. The lock request in 3543 * progress may proceed because the next phase NLM_DOWN will catch 3544 * this lock and discard it.
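 *
 * Shutdown is therefore two-phased: FLK_NLM_SHUTTING_DOWN wakes up
 * and cancels the sleeping requests, and the later FLK_NLM_DOWN phase
 * discards whatever locks were granted in the meantime (see the
 * switch statement below).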
3545 */ 3546 mutex_exit(&nlm_reg_lock); 3547 3548 switch (nlm_state) { 3549 case FLK_NLM_UP: 3550 /* 3551 * Change the NLM state of all locks still held on behalf of 3552 * the NLM server identified by "nlmid" to NLM_UP. 3553 */ 3554 cl_flk_change_nlm_state_all_locks(nlmid, FLK_NLM_UP); 3555 break; 3556 3557 case FLK_NLM_SHUTTING_DOWN: 3558 /* 3559 * Wake up all sleeping locks for the NLM server identified 3560 * by "nlmid." Note that eventually all woken threads will 3561 * have their lock requests cancelled and descriptors 3562 * removed from the sleeping lock list. Note that the NLM 3563 * server state associated with each lock descriptor is 3564 * changed to FLK_NLM_SHUTTING_DOWN. 3565 */ 3566 cl_flk_wakeup_sleeping_nlm_locks(nlmid); 3567 break; 3568 3569 case FLK_NLM_DOWN: 3570 /* 3571 * Discard all active, granted locks for this NLM server 3572 * identified by "nlmid." 3573 */ 3574 cl_flk_unlock_nlm_granted(nlmid); 3575 break; 3576 3577 default: 3578 panic("cl_set_nlm_status: bad status (%d)", nlm_state); 3579 } 3580 } 3581 3582 /* 3583 * Set the control status for lock manager requests. 3584 * 3585 * Note that when this routine is called with FLK_WAKEUP_SLEEPERS, there 3586 * may be locks requests that have gotten started but not finished. In 3587 * particular, there may be blocking requests that are in the callback code 3588 * before sleeping (so they're not holding the lock for the graph). If 3589 * such a thread reacquires the graph's lock (to go to sleep) after 3590 * flk_lockmgr_status is set to a non-up value, it will notice the status 3591 * and bail out. If the request gets granted before the thread can check 3592 * flk_lockmgr_status, let it continue normally. It will get flushed when 3593 * we are called with FLK_LOCKMGR_DOWN. 3594 */ 3595 3596 void 3597 flk_set_lockmgr_status(flk_lockmgr_status_t status) 3598 { 3599 int i; 3600 graph_t *gp; 3601 struct flock_globals *fg; 3602 3603 fg = flk_get_globals(); 3604 ASSERT(fg != NULL); 3605 3606 mutex_enter(&flock_lock); 3607 fg->flk_lockmgr_status = status; 3608 mutex_exit(&flock_lock); 3609 3610 /* 3611 * If the lock manager is coming back up, all that's needed is to 3612 * propagate this information to the graphs. If the lock manager 3613 * is going down, additional action is required, and each graph's 3614 * copy of the state is updated atomically with this other action. 3615 */ 3616 switch (status) { 3617 case FLK_LOCKMGR_UP: 3618 for (i = 0; i < HASH_SIZE; i++) { 3619 mutex_enter(&flock_lock); 3620 gp = lock_graph[i]; 3621 mutex_exit(&flock_lock); 3622 if (gp == NULL) 3623 continue; 3624 mutex_enter(&gp->gp_mutex); 3625 fg->lockmgr_status[i] = status; 3626 mutex_exit(&gp->gp_mutex); 3627 } 3628 break; 3629 case FLK_WAKEUP_SLEEPERS: 3630 wakeup_sleeping_lockmgr_locks(fg); 3631 break; 3632 case FLK_LOCKMGR_DOWN: 3633 unlock_lockmgr_granted(fg); 3634 break; 3635 default: 3636 panic("flk_set_lockmgr_status: bad status (%d)", status); 3637 break; 3638 } 3639 } 3640 3641 /* 3642 * This routine returns all the locks that are active or sleeping and are 3643 * associated with a particular set of identifiers. If lock_state != 0, then 3644 * only locks that match the lock_state are returned. If lock_state == 0, then 3645 * all locks are returned. If pid == NOPID, the pid is ignored. If 3646 * use_sysid is FALSE, then the sysid is ignored. If vp is NULL, then the 3647 * vnode pointer is ignored. 3648 * 3649 * A list containing the vnode pointer and an flock structure 3650 * describing the lock is returned. 
Each element in the list is 3651 * dynamically allocated and must be freed by the caller. The 3652 * last item in the list is denoted by a NULL value in the ll_next 3653 * field. 3654 * 3655 * The vnode pointers returned are held. The caller is responsible 3656 * for releasing these. Note that the returned list is only a snapshot of 3657 * the current lock information, and that it is a snapshot of a moving 3658 * target (only one graph is locked at a time). 3659 */ 3660 3661 locklist_t * 3662 get_lock_list(int list_type, int lock_state, int sysid, boolean_t use_sysid, 3663 pid_t pid, const vnode_t *vp, zoneid_t zoneid) 3664 { 3665 lock_descriptor_t *lock; 3666 lock_descriptor_t *graph_head; 3667 locklist_t listhead; 3668 locklist_t *llheadp; 3669 locklist_t *llp; 3670 locklist_t *lltp; 3671 graph_t *gp; 3672 int i; 3673 int first_index; /* graph index */ 3674 int num_indexes; /* graph index */ 3675 3676 ASSERT((list_type == FLK_ACTIVE_STATE) || 3677 (list_type == FLK_SLEEPING_STATE)); 3678 3679 /* 3680 * Get a pointer to something to use as a list head while building 3681 * the rest of the list. 3682 */ 3683 llheadp = &listhead; 3684 lltp = llheadp; 3685 llheadp->ll_next = (locklist_t *)NULL; 3686 3687 /* Figure out which graphs we want to look at. */ 3688 if (vp == NULL) { 3689 first_index = 0; 3690 num_indexes = HASH_SIZE; 3691 } else { 3692 first_index = HASH_INDEX(vp); 3693 num_indexes = 1; 3694 } 3695 3696 for (i = first_index; i < first_index + num_indexes; i++) { 3697 mutex_enter(&flock_lock); 3698 gp = lock_graph[i]; 3699 mutex_exit(&flock_lock); 3700 if (gp == NULL) { 3701 continue; 3702 } 3703 3704 mutex_enter(&gp->gp_mutex); 3705 graph_head = (list_type == FLK_ACTIVE_STATE) ? 3706 ACTIVE_HEAD(gp) : SLEEPING_HEAD(gp); 3707 for (lock = graph_head->l_next; 3708 lock != graph_head; 3709 lock = lock->l_next) { 3710 if (use_sysid && lock->l_flock.l_sysid != sysid) 3711 continue; 3712 if (pid != NOPID && lock->l_flock.l_pid != pid) 3713 continue; 3714 if (vp != NULL && lock->l_vnode != vp) 3715 continue; 3716 if (lock_state && !(lock_state & lock->l_state)) 3717 continue; 3718 if (zoneid != lock->l_zoneid && zoneid != ALL_ZONES) 3719 continue; 3720 /* 3721 * A matching lock was found. Allocate 3722 * space for a new locklist entry and fill 3723 * it in. 3724 */ 3725 llp = kmem_alloc(sizeof (locklist_t), KM_SLEEP); 3726 lltp->ll_next = llp; 3727 VN_HOLD(lock->l_vnode); 3728 llp->ll_vp = lock->l_vnode; 3729 create_flock(lock, &(llp->ll_flock)); 3730 llp->ll_next = (locklist_t *)NULL; 3731 lltp = llp; 3732 } 3733 mutex_exit(&gp->gp_mutex); 3734 } 3735 3736 llp = llheadp->ll_next; 3737 return (llp); 3738 } 3739 3740 /* 3741 * These two functions are simply interfaces to get_lock_list. They return 3742 * a list of sleeping or active locks for the given sysid and pid. See 3743 * get_lock_list for details. 3744 * 3745 * In either case we don't particularly care to specify the zone of interest; 3746 * the sysid-space is global across zones, so the sysid will map to exactly one 3747 * zone, and we'll return information for that zone. 3748 */ 3749 3750 locklist_t * 3751 flk_get_sleeping_locks(int sysid, pid_t pid) 3752 { 3753 return (get_lock_list(FLK_SLEEPING_STATE, 0, sysid, B_TRUE, pid, NULL, 3754 ALL_ZONES)); 3755 } 3756 3757 locklist_t * 3758 flk_get_active_locks(int sysid, pid_t pid) 3759 { 3760 return (get_lock_list(FLK_ACTIVE_STATE, 0, sysid, B_TRUE, pid, NULL, 3761 ALL_ZONES)); 3762 } 3763 3764 /* 3765 * Another interface to get_lock_list. 
This one returns all the active 3766 * locks for a given vnode. Again, see get_lock_list for details. 3767 * 3768 * We don't need to specify which zone's locks we're interested in. The matter 3769 * would only be interesting if the vnode belonged to NFS, and NFS vnodes can't 3770 * be used by multiple zones, so the list of locks will all be from the right 3771 * zone. 3772 */ 3773 3774 locklist_t * 3775 flk_active_locks_for_vp(const vnode_t *vp) 3776 { 3777 return (get_lock_list(FLK_ACTIVE_STATE, 0, 0, B_FALSE, NOPID, vp, 3778 ALL_ZONES)); 3779 } 3780 3781 /* 3782 * Another interface to get_lock_list. This one returns all the active 3783 * nbmand locks for a given vnode. Again, see get_lock_list for details. 3784 * 3785 * See the comment for flk_active_locks_for_vp() for why we don't care to 3786 * specify the particular zone of interest. 3787 */ 3788 locklist_t * 3789 flk_active_nbmand_locks_for_vp(const vnode_t *vp) 3790 { 3791 return (get_lock_list(FLK_ACTIVE_STATE, NBMAND_LOCK, 0, B_FALSE, 3792 NOPID, vp, ALL_ZONES)); 3793 } 3794 3795 /* 3796 * Another interface to get_lock_list. This one returns all the active 3797 * nbmand locks for a given pid. Again, see get_lock_list for details. 3798 * 3799 * The zone doesn't need to be specified here; the locks held by a 3800 * particular process will either be local (ie, non-NFS) or from the zone 3801 * the process is executing in. This is because other parts of the system 3802 * ensure that an NFS vnode can't be used in a zone other than that in 3803 * which it was opened. 3804 */ 3805 locklist_t * 3806 flk_active_nbmand_locks(pid_t pid) 3807 { 3808 return (get_lock_list(FLK_ACTIVE_STATE, NBMAND_LOCK, 0, B_FALSE, 3809 pid, NULL, ALL_ZONES)); 3810 } 3811 3812 /* 3813 * Free up all entries in the locklist. 3814 */ 3815 void 3816 flk_free_locklist(locklist_t *llp) 3817 { 3818 locklist_t *next_llp; 3819 3820 while (llp) { 3821 next_llp = llp->ll_next; 3822 VN_RELE(llp->ll_vp); 3823 kmem_free(llp, sizeof (*llp)); 3824 llp = next_llp; 3825 } 3826 } 3827 3828 static void 3829 cl_flk_change_nlm_state_all_locks(int nlmid, flk_nlm_status_t nlm_state) 3830 { 3831 /* 3832 * For each graph "lg" in the hash table lock_graph do 3833 * a. Get the list of sleeping locks 3834 * b. For each lock descriptor in the list do 3835 * i. If the requested lock is an NLM server request AND 3836 * the nlmid is the same as the routine argument then 3837 * change the lock descriptor's state field to 3838 * "nlm_state." 3839 * c. Get the list of active locks 3840 * d. For each lock descriptor in the list do 3841 * i. If the requested lock is an NLM server request AND 3842 * the nlmid is the same as the routine argument then 3843 * change the lock descriptor's state field to 3844 * "nlm_state." 3845 */ 3846 3847 int i; 3848 graph_t *gp; /* lock graph */ 3849 lock_descriptor_t *lock; /* lock */ 3850 lock_descriptor_t *nlock = NULL; /* next lock */ 3851 int lock_nlmid; 3852 3853 for (i = 0; i < HASH_SIZE; i++) { 3854 mutex_enter(&flock_lock); 3855 gp = lock_graph[i]; 3856 mutex_exit(&flock_lock); 3857 if (gp == NULL) { 3858 continue; 3859 } 3860 3861 /* Get list of sleeping locks in current lock graph. 
*/ 3862 mutex_enter(&gp->gp_mutex); 3863 for (lock = SLEEPING_HEAD(gp)->l_next; 3864 lock != SLEEPING_HEAD(gp); 3865 lock = nlock) { 3866 nlock = lock->l_next; 3867 /* get NLM id */ 3868 lock_nlmid = GETNLMID(lock->l_flock.l_sysid); 3869 3870 /* 3871 * If NLM server request AND nlmid of lock matches 3872 * nlmid of argument, then set the NLM state of the 3873 * lock to "nlm_state." 3874 */ 3875 if (IS_LOCKMGR(lock) && nlmid == lock_nlmid) { 3876 SET_NLM_STATE(lock, nlm_state); 3877 } 3878 } 3879 3880 /* Get list of active locks in current lock graph. */ 3881 for (lock = ACTIVE_HEAD(gp)->l_next; 3882 lock != ACTIVE_HEAD(gp); 3883 lock = nlock) { 3884 nlock = lock->l_next; 3885 /* get NLM id */ 3886 lock_nlmid = GETNLMID(lock->l_flock.l_sysid); 3887 3888 /* 3889 * If NLM server request AND nlmid of lock matches 3890 * nlmid of argument, then set the NLM state of the 3891 * lock to "nlm_state." 3892 */ 3893 if (IS_LOCKMGR(lock) && nlmid == lock_nlmid) { 3894 ASSERT(IS_ACTIVE(lock)); 3895 SET_NLM_STATE(lock, nlm_state); 3896 } 3897 } 3898 mutex_exit(&gp->gp_mutex); 3899 } 3900 } 3901 3902 /* 3903 * Requires: "nlmid" >= 1 and <= clconf_maximum_nodeid(). 3904 * Effects: Find all sleeping lock manager requests _only_ for the NLM server 3905 * identified by "nlmid." Poke those lock requests. 3906 */ 3907 static void 3908 cl_flk_wakeup_sleeping_nlm_locks(int nlmid) 3909 { 3910 lock_descriptor_t *lock; 3911 lock_descriptor_t *nlock = NULL; /* next lock */ 3912 int i; 3913 graph_t *gp; 3914 int lock_nlmid; 3915 3916 for (i = 0; i < HASH_SIZE; i++) { 3917 mutex_enter(&flock_lock); 3918 gp = lock_graph[i]; 3919 mutex_exit(&flock_lock); 3920 if (gp == NULL) { 3921 continue; 3922 } 3923 3924 mutex_enter(&gp->gp_mutex); 3925 for (lock = SLEEPING_HEAD(gp)->l_next; 3926 lock != SLEEPING_HEAD(gp); 3927 lock = nlock) { 3928 nlock = lock->l_next; 3929 /* 3930 * If NLM server request _and_ nlmid of lock matches 3931 * nlmid of argument, then set the NLM state of the 3932 * lock to NLM_SHUTTING_DOWN, and wake up sleeping 3933 * request. 3934 */ 3935 if (IS_LOCKMGR(lock)) { 3936 /* get NLM id */ 3937 lock_nlmid = 3938 GETNLMID(lock->l_flock.l_sysid); 3939 if (nlmid == lock_nlmid) { 3940 SET_NLM_STATE(lock, 3941 FLK_NLM_SHUTTING_DOWN); 3942 INTERRUPT_WAKEUP(lock); 3943 } 3944 } 3945 } 3946 mutex_exit(&gp->gp_mutex); 3947 } 3948 } 3949 3950 /* 3951 * Requires: "nlmid" >= 1 and <= clconf_maximum_nodeid() 3952 * Effects: Find all active (granted) lock manager locks _only_ for the 3953 * NLM server identified by "nlmid" and release them. 3954 */ 3955 static void 3956 cl_flk_unlock_nlm_granted(int nlmid) 3957 { 3958 lock_descriptor_t *lock; 3959 lock_descriptor_t *nlock = NULL; /* next lock */ 3960 int i; 3961 graph_t *gp; 3962 int lock_nlmid; 3963 3964 for (i = 0; i < HASH_SIZE; i++) { 3965 mutex_enter(&flock_lock); 3966 gp = lock_graph[i]; 3967 mutex_exit(&flock_lock); 3968 if (gp == NULL) { 3969 continue; 3970 } 3971 3972 mutex_enter(&gp->gp_mutex); 3973 for (lock = ACTIVE_HEAD(gp)->l_next; 3974 lock != ACTIVE_HEAD(gp); 3975 lock = nlock) { 3976 nlock = lock->l_next; 3977 ASSERT(IS_ACTIVE(lock)); 3978 3979 /* 3980 * If it's an NLM server request _and_ nlmid of 3981 * the lock matches nlmid of argument, then 3982 * remove the active lock the list, wakup blocked 3983 * threads, and free the storage for the lock. 3984 * Note that there's no need to mark the NLM state 3985 * of this lock to NLM_DOWN because the lock will 3986 * be deleted anyway and its storage freed. 
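 * (flk_wakeup() below also removes the corresponding edges from the
 * process graph via flk_update_proc_graph())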
3987 */ 3988 if (IS_LOCKMGR(lock)) { 3989 /* get NLM id */ 3990 lock_nlmid = GETNLMID(lock->l_flock.l_sysid); 3991 if (nlmid == lock_nlmid) { 3992 flk_delete_active_lock(lock, 0); 3993 flk_wakeup(lock, 1); 3994 flk_free_lock(lock); 3995 } 3996 } 3997 } 3998 mutex_exit(&gp->gp_mutex); 3999 } 4000 } 4001 4002 /* 4003 * Find all sleeping lock manager requests and poke them. 4004 */ 4005 static void 4006 wakeup_sleeping_lockmgr_locks(struct flock_globals *fg) 4007 { 4008 lock_descriptor_t *lock; 4009 lock_descriptor_t *nlock = NULL; /* next lock */ 4010 int i; 4011 graph_t *gp; 4012 zoneid_t zoneid = getzoneid(); 4013 4014 for (i = 0; i < HASH_SIZE; i++) { 4015 mutex_enter(&flock_lock); 4016 gp = lock_graph[i]; 4017 mutex_exit(&flock_lock); 4018 if (gp == NULL) { 4019 continue; 4020 } 4021 4022 mutex_enter(&gp->gp_mutex); 4023 fg->lockmgr_status[i] = FLK_WAKEUP_SLEEPERS; 4024 for (lock = SLEEPING_HEAD(gp)->l_next; 4025 lock != SLEEPING_HEAD(gp); 4026 lock = nlock) { 4027 nlock = lock->l_next; 4028 if (IS_LOCKMGR(lock) && lock->l_zoneid == zoneid) { 4029 INTERRUPT_WAKEUP(lock); 4030 } 4031 } 4032 mutex_exit(&gp->gp_mutex); 4033 } 4034 } 4035 4036 4037 /* 4038 * Find all active (granted) lock manager locks and release them. 4039 */ 4040 static void 4041 unlock_lockmgr_granted(struct flock_globals *fg) 4042 { 4043 lock_descriptor_t *lock; 4044 lock_descriptor_t *nlock = NULL; /* next lock */ 4045 int i; 4046 graph_t *gp; 4047 zoneid_t zoneid = getzoneid(); 4048 4049 for (i = 0; i < HASH_SIZE; i++) { 4050 mutex_enter(&flock_lock); 4051 gp = lock_graph[i]; 4052 mutex_exit(&flock_lock); 4053 if (gp == NULL) { 4054 continue; 4055 } 4056 4057 mutex_enter(&gp->gp_mutex); 4058 fg->lockmgr_status[i] = FLK_LOCKMGR_DOWN; 4059 for (lock = ACTIVE_HEAD(gp)->l_next; 4060 lock != ACTIVE_HEAD(gp); 4061 lock = nlock) { 4062 nlock = lock->l_next; 4063 if (IS_LOCKMGR(lock) && lock->l_zoneid == zoneid) { 4064 ASSERT(IS_ACTIVE(lock)); 4065 flk_delete_active_lock(lock, 0); 4066 flk_wakeup(lock, 1); 4067 flk_free_lock(lock); 4068 } 4069 } 4070 mutex_exit(&gp->gp_mutex); 4071 } 4072 } 4073 4074 4075 /* 4076 * Wait until a lock is granted, cancelled, or interrupted. 4077 */ 4078 4079 static void 4080 wait_for_lock(lock_descriptor_t *request) 4081 { 4082 graph_t *gp = request->l_graph; 4083 4084 ASSERT(MUTEX_HELD(&gp->gp_mutex)); 4085 4086 while (!(IS_GRANTED(request)) && !(IS_CANCELLED(request)) && 4087 !(IS_INTERRUPTED(request))) { 4088 if (!cv_wait_sig(&request->l_cv, &gp->gp_mutex)) { 4089 flk_set_state(request, FLK_INTERRUPTED_STATE); 4090 request->l_state |= INTERRUPTED_LOCK; 4091 } 4092 } 4093 } 4094 4095 /* 4096 * Create an flock structure from the existing lock information 4097 * 4098 * This routine is used to create flock structures for the lock manager 4099 * to use in a reclaim request. Since the lock was originated on this 4100 * host, it must be conforming to UNIX semantics, so no checking is 4101 * done to make sure it falls within the lower half of the 32-bit range. 4102 */ 4103 4104 static void 4105 create_flock(lock_descriptor_t *lp, flock64_t *flp) 4106 { 4107 ASSERT(lp->l_end == MAX_U_OFFSET_T || lp->l_end <= MAXEND); 4108 ASSERT(lp->l_end >= lp->l_start); 4109 4110 flp->l_type = lp->l_type; 4111 flp->l_whence = 0; 4112 flp->l_start = lp->l_start; 4113 flp->l_len = (lp->l_end == MAX_U_OFFSET_T) ? 

/*
 * Convert flock_t data describing a lock range into unsigned long starting
 * and ending points, which are put into lock_request.  Returns 0 or an
 * errno value.
 */
int
flk_convert_lock_data(vnode_t *vp, flock64_t *flp,
    u_offset_t *start, u_offset_t *end, offset_t offset)
{
	struct vattr vattr;
	int error;

	/*
	 * Determine the starting point of the request.
	 */
	switch (flp->l_whence) {
	case 0:		/* SEEK_SET */
		*start = (u_offset_t)flp->l_start;
		break;
	case 1:		/* SEEK_CUR */
		*start = (u_offset_t)(flp->l_start + offset);
		break;
	case 2:		/* SEEK_END */
		vattr.va_mask = AT_SIZE;
		if (error = VOP_GETATTR(vp, &vattr, 0, CRED(), NULL))
			return (error);
		*start = (u_offset_t)(flp->l_start + vattr.va_size);
		break;
	default:
		return (EINVAL);
	}

	/*
	 * Determine the range covered by the request.
	 */
	if (flp->l_len == 0)
		*end = MAX_U_OFFSET_T;
	else if ((offset_t)flp->l_len > 0) {
		*end = (u_offset_t)(*start + (flp->l_len - 1));
	} else {
		/*
		 * Negative length; why do we even allow this?
		 * Because this allows easy specification of
		 * the last n bytes of the file.
		 */
		*end = *start;
		*start += (u_offset_t)flp->l_len;
		(*start)++;
	}
	return (0);
}

/*
 * Check the validity of lock data.  This can be used by the NFS
 * frlock routines to check data before contacting the server.  The
 * server must support semantics that aren't as restrictive as
 * the UNIX API, so the NFS client is required to check.
 * The maximum is now passed in by the caller.
 */
int
flk_check_lock_data(u_offset_t start, u_offset_t end, offset_t max)
{
	/*
	 * The end (length) for local locking should never be greater
	 * than MAXEND.  However, the representation for
	 * the entire file is MAX_U_OFFSET_T.
	 */
	if ((start > max) ||
	    ((end > max) && (end != MAX_U_OFFSET_T))) {
		return (EINVAL);
	}
	if (start > end) {
		return (EINVAL);
	}
	return (0);
}
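
/*
 * An illustrative sketch of the conversion done by flk_convert_lock_data()
 * above, including the negative l_len case, followed by the
 * flk_check_lock_data() range check.  The FLOCK_EXAMPLES guard and the
 * flk_convert_lock_data_example() routine are hypothetical, introduced
 * only for illustration; they are never compiled into a normal build.
 */
#ifdef FLOCK_EXAMPLES
static int
flk_convert_lock_data_example(vnode_t *vp)
{
	flock64_t fl;
	u_offset_t start, end;
	int error;

	/*
	 * A negative l_len describes the |l_len| bytes that end at l_start:
	 * with l_start == 100 and l_len == -10 the converted (inclusive)
	 * range is [91, 100].
	 */
	fl.l_whence = 0;	/* l_start is an absolute offset */
	fl.l_start = 100;
	fl.l_len = -10;
	if ((error = flk_convert_lock_data(vp, &fl, &start, &end, 0)) != 0)
		return (error);
	ASSERT(start == 91 && end == 100);

	/* l_len == 0 means "from l_start through end of file". */
	fl.l_len = 0;
	if ((error = flk_convert_lock_data(vp, &fl, &start, &end, 0)) != 0)
		return (error);
	ASSERT(end == MAX_U_OFFSET_T);

	/* The converted range must still pass the range check. */
	return (flk_check_lock_data(start, end, MAXEND));
}
#endif	/* FLOCK_EXAMPLES */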

/*
 * Fill in request->l_flock with information about the lock blocking the
 * request.  The complexity here is that lock manager requests are allowed
 * to see into the upper part of the 32-bit address range, whereas local
 * requests are only allowed to see signed values.
 *
 * What should be done when "blocker" is a lock manager lock that uses the
 * upper portion of the 32-bit range, but "request" is local?  Since the
 * request has already been determined to have been blocked by the blocker,
 * at least some portion of "blocker" must be in the range of the request,
 * or the request extends to the end of file.  For the first case, the
 * portion in the lower range is returned with the indication that it goes
 * "to EOF."  For the second case, the last byte of the lower range is
 * returned with the indication that it goes "to EOF."
 */
static void
report_blocker(lock_descriptor_t *blocker, lock_descriptor_t *request)
{
	flock64_t *flrp;	/* l_flock portion of request */

	ASSERT(blocker != NULL);

	flrp = &request->l_flock;
	flrp->l_whence = 0;
	flrp->l_type = blocker->l_type;
	flrp->l_pid = blocker->l_flock.l_pid;
	flrp->l_sysid = blocker->l_flock.l_sysid;
	request->l_ofd = blocker->l_ofd;

	if (IS_LOCKMGR(request)) {
		flrp->l_start = blocker->l_start;
		if (blocker->l_end == MAX_U_OFFSET_T)
			flrp->l_len = 0;
		else
			flrp->l_len = blocker->l_end - blocker->l_start + 1;
	} else {
		if (blocker->l_start > MAXEND) {
			flrp->l_start = MAXEND;
			flrp->l_len = 0;
		} else {
			flrp->l_start = blocker->l_start;
			if (blocker->l_end == MAX_U_OFFSET_T)
				flrp->l_len = 0;
			else
				flrp->l_len = blocker->l_end -
				    blocker->l_start + 1;
		}
	}
}
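
/*
 * An illustrative sketch of the clamping described above, for a local
 * (non lock manager) request blocked by a lock that lies entirely above
 * MAXEND.  The FLOCK_EXAMPLES guard and the report_blocker_example()
 * routine are hypothetical, introduced only for illustration; they are
 * never compiled into a normal build.
 */
#ifdef FLOCK_EXAMPLES
static void
report_blocker_example(void)
{
	lock_descriptor_t req, blk;

	bzero(&req, sizeof (req));	/* a zeroed descriptor: plain local request */
	bzero(&blk, sizeof (blk));
	blk.l_type = F_WRLCK;
	blk.l_start = (u_offset_t)MAXEND + 1;	/* above the local range */
	blk.l_end = MAX_U_OFFSET_T;

	report_blocker(&blk, &req);

	/* The blocker is reported as starting at MAXEND and running to EOF. */
	ASSERT(req.l_flock.l_start == MAXEND && req.l_flock.l_len == 0);
}
#endif	/* FLOCK_EXAMPLES */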

/*
 * PSARC case 1997/292
 */
/*
 * This is the public routine exported by flock.h.
 */
void
cl_flk_change_nlm_state_to_unknown(int nlmid)
{
	/*
	 * Check to see if the node is booted as a cluster.  If not, return.
	 */
	if ((cluster_bootflags & CLUSTER_BOOTED) == 0) {
		return;
	}

	/*
	 * See comment in cl_flk_set_nlm_status().
	 */
	if (nlm_reg_status == NULL) {
		return;
	}

	/*
	 * Protect the NLM registry state with a mutex.
	 */
	ASSERT(nlmid <= nlm_status_size && nlmid >= 0);
	mutex_enter(&nlm_reg_lock);
	FLK_REGISTRY_CHANGE_NLM_STATE(nlm_reg_status, nlmid, FLK_NLM_UNKNOWN);
	mutex_exit(&nlm_reg_lock);
}

/*
 * Return non-zero if the given I/O request conflicts with an active NBMAND
 * lock.
 * If svmand is non-zero, look at all active locks, not just NBMAND locks.
 */
int
nbl_lock_conflict(vnode_t *vp, nbl_op_t op, u_offset_t offset,
    ssize_t length, int svmand, caller_context_t *ct)
{
	int conflict = 0;
	graph_t *gp;
	lock_descriptor_t *lock;
	pid_t pid;
	int sysid;

	if (ct == NULL) {
		pid = curproc->p_pid;
		sysid = 0;
	} else {
		pid = ct->cc_pid;
		sysid = ct->cc_sysid;
	}

	mutex_enter(&flock_lock);
	gp = lock_graph[HASH_INDEX(vp)];
	mutex_exit(&flock_lock);
	if (gp == NULL)
		return (0);

	mutex_enter(&gp->gp_mutex);
	SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);

	for (; lock && lock->l_vnode == vp; lock = lock->l_next) {
		if ((svmand || (lock->l_state & NBMAND_LOCK)) &&
		    (lock->l_flock.l_sysid != sysid ||
		    lock->l_flock.l_pid != pid) &&
		    lock_blocks_io(op, offset, length,
		    lock->l_type, lock->l_start, lock->l_end)) {
			DTRACE_PROBE1(conflict_lock,
			    lock_descriptor_t *, lock);
			conflict = 1;
			break;
		}
	}
	mutex_exit(&gp->gp_mutex);

	return (conflict);
}

/*
 * Return non-zero if the given I/O request conflicts with the given lock.
 */
static int
lock_blocks_io(nbl_op_t op, u_offset_t offset, ssize_t length,
    int lock_type, u_offset_t lock_start, u_offset_t lock_end)
{
	ASSERT(op == NBL_READ || op == NBL_WRITE || op == NBL_READWRITE);
	ASSERT(lock_type == F_RDLCK || lock_type == F_WRLCK);

	if (op == NBL_READ && lock_type == F_RDLCK)
		return (0);

	if (offset <= lock_start && lock_start < offset + length)
		return (1);
	if (lock_start <= offset && offset <= lock_end)
		return (1);

	return (0);
}
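
/*
 * An illustrative sketch of the overlap test in lock_blocks_io() above:
 * [lock_start, lock_end] is an inclusive range, and only a read against a
 * read lock is always allowed through.  The FLOCK_EXAMPLES guard and the
 * lock_blocks_io_example() routine are hypothetical, introduced only for
 * illustration; they are never compiled into a normal build.
 */
#ifdef FLOCK_EXAMPLES
static void
lock_blocks_io_example(void)
{
	/* A write overlapping bytes [100, 199] of an existing lock blocks. */
	ASSERT(lock_blocks_io(NBL_WRITE, 150, 10, F_RDLCK, 100, 199) == 1);

	/* A read never conflicts with a read lock. */
	ASSERT(lock_blocks_io(NBL_READ, 150, 10, F_RDLCK, 100, 199) == 0);

	/* A write entirely below the locked range does not conflict. */
	ASSERT(lock_blocks_io(NBL_WRITE, 0, 50, F_WRLCK, 100, 199) == 0);
}
#endif	/* FLOCK_EXAMPLES */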

#ifdef DEBUG
static void
check_active_locks(graph_t *gp)
{
	lock_descriptor_t *lock, *lock1;
	edge_t *ep;

	for (lock = ACTIVE_HEAD(gp)->l_next; lock != ACTIVE_HEAD(gp);
	    lock = lock->l_next) {
		ASSERT(IS_ACTIVE(lock));
		ASSERT(NOT_BLOCKED(lock));
		ASSERT(!IS_BARRIER(lock));

		ep = FIRST_IN(lock);

		while (ep != HEAD(lock)) {
			ASSERT(IS_SLEEPING(ep->from_vertex));
			ASSERT(!NOT_BLOCKED(ep->from_vertex));
			ep = NEXT_IN(ep);
		}

		for (lock1 = lock->l_next; lock1 != ACTIVE_HEAD(gp);
		    lock1 = lock1->l_next) {
			if (lock1->l_vnode == lock->l_vnode) {
				if (BLOCKS(lock1, lock)) {
					cmn_err(CE_PANIC,
					    "active lock %p blocks %p",
					    (void *)lock1, (void *)lock);
				} else if (BLOCKS(lock, lock1)) {
					cmn_err(CE_PANIC,
					    "active lock %p blocks %p",
					    (void *)lock, (void *)lock1);
				}
			}
		}
	}
}

/*
 * Effect: This function checks to see if the transition from 'old_state' to
 * 'new_state' is a valid one.  It returns 0 if the transition is valid
 * and 1 if it is not.
 * For a map of valid transitions, see sys/flock_impl.h
 */
static int
check_lock_transition(int old_state, int new_state)
{
	switch (old_state) {
	case FLK_INITIAL_STATE:
		if ((new_state == FLK_START_STATE) ||
		    (new_state == FLK_SLEEPING_STATE) ||
		    (new_state == FLK_ACTIVE_STATE) ||
		    (new_state == FLK_DEAD_STATE)) {
			return (0);
		} else {
			return (1);
		}
	case FLK_START_STATE:
		if ((new_state == FLK_ACTIVE_STATE) ||
		    (new_state == FLK_DEAD_STATE)) {
			return (0);
		} else {
			return (1);
		}
	case FLK_ACTIVE_STATE:
		if (new_state == FLK_DEAD_STATE) {
			return (0);
		} else {
			return (1);
		}
	case FLK_SLEEPING_STATE:
		if ((new_state == FLK_GRANTED_STATE) ||
		    (new_state == FLK_INTERRUPTED_STATE) ||
		    (new_state == FLK_CANCELLED_STATE)) {
			return (0);
		} else {
			return (1);
		}
	case FLK_GRANTED_STATE:
		if ((new_state == FLK_START_STATE) ||
		    (new_state == FLK_INTERRUPTED_STATE) ||
		    (new_state == FLK_CANCELLED_STATE)) {
			return (0);
		} else {
			return (1);
		}
	case FLK_CANCELLED_STATE:
		if ((new_state == FLK_INTERRUPTED_STATE) ||
		    (new_state == FLK_DEAD_STATE)) {
			return (0);
		} else {
			return (1);
		}
	case FLK_INTERRUPTED_STATE:
		if (new_state == FLK_DEAD_STATE) {
			return (0);
		} else {
			return (1);
		}
	case FLK_DEAD_STATE:
		/* May be set more than once */
		if (new_state == FLK_DEAD_STATE) {
			return (0);
		} else {
			return (1);
		}
	default:
		return (1);
	}
}
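
/*
 * An illustrative sketch of a few transitions run through
 * check_lock_transition() above: a granted request may be restarted, while
 * an active lock may only move to the dead state.  The FLOCK_EXAMPLES
 * guard and the check_lock_transition_example() routine are hypothetical,
 * introduced only for illustration; they are never compiled into a normal
 * build.
 */
#ifdef FLOCK_EXAMPLES
static void
check_lock_transition_example(void)
{
	/* Valid transitions return 0 ... */
	ASSERT(check_lock_transition(FLK_GRANTED_STATE, FLK_START_STATE) == 0);
	ASSERT(check_lock_transition(FLK_ACTIVE_STATE, FLK_DEAD_STATE) == 0);
	/* ... and invalid ones return 1. */
	ASSERT(check_lock_transition(FLK_ACTIVE_STATE,
	    FLK_SLEEPING_STATE) == 1);
}
#endif	/* FLOCK_EXAMPLES */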

static void
check_sleeping_locks(graph_t *gp)
{
	lock_descriptor_t *lock1, *lock2;
	edge_t *ep;
	for (lock1 = SLEEPING_HEAD(gp)->l_next; lock1 != SLEEPING_HEAD(gp);
	    lock1 = lock1->l_next) {
		ASSERT(!IS_BARRIER(lock1));
		for (lock2 = lock1->l_next; lock2 != SLEEPING_HEAD(gp);
		    lock2 = lock2->l_next) {
			if (lock1->l_vnode == lock2->l_vnode) {
				if (BLOCKS(lock2, lock1)) {
					ASSERT(!IS_GRANTED(lock1));
					ASSERT(!NOT_BLOCKED(lock1));
					path(lock1, lock2);
				}
			}
		}

		for (lock2 = ACTIVE_HEAD(gp)->l_next; lock2 != ACTIVE_HEAD(gp);
		    lock2 = lock2->l_next) {
			ASSERT(!IS_BARRIER(lock1));
			if (lock1->l_vnode == lock2->l_vnode) {
				if (BLOCKS(lock2, lock1)) {
					ASSERT(!IS_GRANTED(lock1));
					ASSERT(!NOT_BLOCKED(lock1));
					path(lock1, lock2);
				}
			}
		}
		ep = FIRST_ADJ(lock1);
		while (ep != HEAD(lock1)) {
			ASSERT(BLOCKS(ep->to_vertex, lock1));
			ep = NEXT_ADJ(ep);
		}
	}
}

static int
level_two_path(lock_descriptor_t *lock1, lock_descriptor_t *lock2, int no_path)
{
	edge_t *ep;
	lock_descriptor_t *vertex;
	lock_descriptor_t *vertex_stack;

	STACK_INIT(vertex_stack);

	flk_graph_uncolor(lock1->l_graph);
	ep = FIRST_ADJ(lock1);
	ASSERT(ep != HEAD(lock1));
	while (ep != HEAD(lock1)) {
		if (no_path)
			ASSERT(ep->to_vertex != lock2);
		STACK_PUSH(vertex_stack, ep->to_vertex, l_dstack);
		COLOR(ep->to_vertex);
		ep = NEXT_ADJ(ep);
	}

	while ((vertex = STACK_TOP(vertex_stack)) != NULL) {
		STACK_POP(vertex_stack, l_dstack);
		for (ep = FIRST_ADJ(vertex); ep != HEAD(vertex);
		    ep = NEXT_ADJ(ep)) {
			if (COLORED(ep->to_vertex))
				continue;
			COLOR(ep->to_vertex);
			if (ep->to_vertex == lock2)
				return (1);

			STACK_PUSH(vertex_stack, ep->to_vertex, l_dstack);
		}
	}
	return (0);
}

static void
check_owner_locks(graph_t *gp, pid_t pid, int sysid, vnode_t *vp)
{
	lock_descriptor_t *lock;

	/* Ignore OFD style locks since they're not process-wide. */
	if (pid == 0)
		return;

	SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp);

	if (lock) {
		while (lock != ACTIVE_HEAD(gp) && (lock->l_vnode == vp)) {
			if (lock->l_flock.l_pid == pid &&
			    lock->l_flock.l_sysid == sysid)
				cmn_err(CE_PANIC,
				    "owner pid %d's lock %p in active queue",
				    pid, (void *)lock);
			lock = lock->l_next;
		}
	}
	SET_LOCK_TO_FIRST_SLEEP_VP(gp, lock, vp);

	if (lock) {
		while (lock != SLEEPING_HEAD(gp) && (lock->l_vnode == vp)) {
			if (lock->l_flock.l_pid == pid &&
			    lock->l_flock.l_sysid == sysid)
				cmn_err(CE_PANIC,
				    "owner pid %d's lock %p in sleep queue",
				    pid, (void *)lock);
			lock = lock->l_next;
		}
	}
}

static int
level_one_path(lock_descriptor_t *lock1, lock_descriptor_t *lock2)
{
	edge_t *ep = FIRST_ADJ(lock1);

	while (ep != HEAD(lock1)) {
		if (ep->to_vertex == lock2)
			return (1);
		else
			ep = NEXT_ADJ(ep);
	}
	return (0);
}

static int
no_path(lock_descriptor_t *lock1, lock_descriptor_t *lock2)
{
	return (!level_two_path(lock1, lock2, 1));
}

static void
path(lock_descriptor_t *lock1, lock_descriptor_t *lock2)
{
	if (level_one_path(lock1, lock2)) {
		if (level_two_path(lock1, lock2, 0) != 0) {
			cmn_err(CE_WARN,
			    "one edge one path from lock1 %p lock2 %p",
			    (void *)lock1, (void *)lock2);
		}
	} else if (no_path(lock1, lock2)) {
		cmn_err(CE_PANIC,
		    "No path from lock1 %p to lock2 %p",
		    (void *)lock1, (void *)lock2);
	}
}
#endif /* DEBUG */