1 /*- 2 * SPDX-License-Identifier: BSD-3-Clause 3 * 4 * Copyright (c) 2008 Isilon Inc http://www.isilon.com/ 5 * Authors: Doug Rabson <dfr@rabson.org> 6 * Developed with Red Inc: Alfred Perlstein <alfred@freebsd.org> 7 * 8 * Redistribution and use in source and binary forms, with or without 9 * modification, are permitted provided that the following conditions 10 * are met: 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in the 15 * documentation and/or other materials provided with the distribution. 16 * 17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 20 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 21 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 23 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 24 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 25 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 26 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 27 * SUCH DAMAGE. 28 */ 29 /*- 30 * Copyright (c) 1982, 1986, 1989, 1993 31 * The Regents of the University of California. All rights reserved. 32 * 33 * This code is derived from software contributed to Berkeley by 34 * Scooter Morris at Genentech Inc. 35 * 36 * Redistribution and use in source and binary forms, with or without 37 * modification, are permitted provided that the following conditions 38 * are met: 39 * 1. 
Redistributions of source code must retain the above copyright 40 * notice, this list of conditions and the following disclaimer. 41 * 2. Redistributions in binary form must reproduce the above copyright 42 * notice, this list of conditions and the following disclaimer in the 43 * documentation and/or other materials provided with the distribution. 44 * 3. Neither the name of the University nor the names of its contributors 45 * may be used to endorse or promote products derived from this software 46 * without specific prior written permission. 47 * 48 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 49 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 50 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 51 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 52 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 53 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 54 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 55 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 56 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 57 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 58 * SUCH DAMAGE. 
*
 *      @(#)ufs_lockf.c	8.3 (Berkeley) 1/6/94
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_debug_lockf.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/hash.h>
#include <sys/jail.h>
#include <sys/kernel.h>
#include <sys/limits.h>
#include <sys/lock.h>
#include <sys/mount.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/sbuf.h>
#include <sys/stat.h>
#include <sys/sx.h>
#include <sys/unistd.h>
#include <sys/user.h>
#include <sys/vnode.h>
#include <sys/malloc.h>
#include <sys/fcntl.h>
#include <sys/lockf.h>
#include <sys/taskqueue.h>

#ifdef LOCKF_DEBUG
#include <sys/sysctl.h>

/*
 * Bitmask selecting debug output; this file tests bit 1 for lock/owner
 * tracing and bit 4 for allocation/free tracing.
 */
static int	lockf_debug = 0; /* control debug output */
SYSCTL_INT(_debug, OID_AUTO, lockf_debug, CTLFLAG_RW, &lockf_debug, 0, "");
#endif

/* Malloc type used for every allocation made by this file. */
static MALLOC_DEFINE(M_LOCKF, "lockf", "Byte-range locking structures");

/* Deadlock-detection graph types, defined further down in this file. */
struct owner_edge;
struct owner_vertex;
struct owner_vertex_list;
struct owner_graph;

#define	NOLOCKF	(struct lockf_entry *)0
/* NOTE(review): SELF/OTHERS look like selectors for lf_findoverlap() - confirm. */
#define SELF	0x1
#define OTHERS	0x2

/*
 * Prototypes for the file-local helpers: lock entry management, the
 * per-vnode lock graph, and the global lock owner graph.
 */
static void	 lf_init(void *);
static int	 lf_hash_owner(caddr_t, struct vnode *, struct flock *, int);
static int	 lf_owner_matches(struct lock_owner *, caddr_t, struct flock *,
    int);
static struct lockf_entry *
		 lf_alloc_lock(struct lock_owner *);
static int	 lf_free_lock(struct lockf_entry *);
static int	 lf_clearlock(struct lockf *, struct lockf_entry *);
static int	 lf_overlaps(struct lockf_entry *, struct lockf_entry *);
static int	 lf_blocks(struct lockf_entry *, struct lockf_entry *);
static void	 lf_free_edge(struct lockf_edge *);
static struct lockf_edge *
		 lf_alloc_edge(void);
static void	 lf_alloc_vertex(struct lockf_entry *);
static int	 lf_add_edge(struct lockf_entry *, struct lockf_entry *);
static void	 lf_remove_edge(struct lockf_edge *);
static void	 lf_remove_outgoing(struct lockf_entry *);
static void	 lf_remove_incoming(struct lockf_entry *);
static int	 lf_add_outgoing(struct lockf *, struct lockf_entry *);
static int	 lf_add_incoming(struct lockf *, struct lockf_entry *);
static int	 lf_findoverlap(struct lockf_entry **, struct lockf_entry *,
    int);
static struct lockf_entry *
		 lf_getblock(struct lockf *, struct lockf_entry *);
static int	 lf_getlock(struct lockf *, struct lockf_entry *, struct flock *);
static void	 lf_insert_lock(struct lockf *, struct lockf_entry *);
static void	 lf_wakeup_lock(struct lockf *, struct lockf_entry *);
static void	 lf_update_dependancies(struct lockf *, struct lockf_entry *,
    int all, struct lockf_entry_list *);
static void	 lf_set_start(struct lockf *, struct lockf_entry *, off_t,
	struct lockf_entry_list*);
static void	 lf_set_end(struct lockf *, struct lockf_entry *, off_t,
	struct lockf_entry_list*);
static int	 lf_setlock(struct lockf *, struct lockf_entry *,
    struct vnode *, void **cookiep);
static int	 lf_cancel(struct lockf *, struct lockf_entry *, void *);
static void	 lf_split(struct lockf *, struct lockf_entry *,
    struct lockf_entry *, struct lockf_entry_list *);
#ifdef LOCKF_DEBUG
static int	 graph_reaches(struct owner_vertex *x, struct owner_vertex *y,
    struct owner_vertex_list *path);
static void	 graph_check(struct owner_graph *g, int checkorder);
static void	 graph_print_vertices(struct owner_vertex_list *set);
#endif
static int	 graph_delta_forward(struct owner_graph *g,
    struct owner_vertex *x, struct owner_vertex *y,
    struct owner_vertex_list *delta);
static int	 graph_delta_backward(struct owner_graph *g,
    struct owner_vertex *x, struct owner_vertex *y,
    struct owner_vertex_list *delta);
static int	 graph_add_indices(int *indices, int n,
    struct owner_vertex_list *set);
static int	 graph_assign_indices(struct owner_graph *g, int *indices,
    int nextunused, struct owner_vertex_list *set);
static int	 graph_add_edge(struct owner_graph *g,
    struct owner_vertex *x, struct owner_vertex *y);
static void	 graph_remove_edge(struct owner_graph *g,
    struct owner_vertex *x, struct owner_vertex *y);
static struct owner_vertex *graph_alloc_vertex(struct owner_graph *g,
    struct lock_owner *lo);
static void	 graph_free_vertex(struct owner_graph *g,
    struct owner_vertex *v);
static struct owner_graph * graph_init(struct owner_graph *g);
#ifdef LOCKF_DEBUG
static void	 lf_print(char *, struct lockf_entry *);
static void	 lf_printlist(char *, struct lockf_entry *);
static void	 lf_print_owner(struct lock_owner *);
#endif

/*
 * This structure is used to keep track of both local and remote lock
 * owners. The lf_owner field of the struct lockf_entry points back at
 * the lock owner structure. Each possible lock owner (local proc for
 * POSIX fcntl locks, local file for BSD flock locks or <pid,sysid>
 * pair for remote locks) is represented by a unique instance of
 * struct lock_owner.
 *
 * If a lock owner has a lock that blocks some other lock or a lock
 * that is waiting for some other lock, it also has a vertex in the
 * owner_graph below.
*
 * Locks:
 * (s)		locked by state->ls_lock
 * (S)		locked by lf_lock_states_lock
 * (l)		locked by the owner's hash chain lock, i.e.
 *		lf_lock_owners[lo_hash].lock
 * (g)		locked by lf_owner_graph_lock
 * (c)		const until freeing
 */
#define	LOCK_OWNER_HASH_SIZE	256

struct lock_owner {
	LIST_ENTRY(lock_owner) lo_link; /* (l) hash chain */
	int	lo_refs;	    /* (l) Number of locks referring to this */
	int	lo_flags;	    /* (c) Flags passed to lf_advlock */
	caddr_t	lo_id;		    /* (c) Id value passed to lf_advlock */
	pid_t	lo_pid;		    /* (c) Process Id of the lock owner */
	int	lo_sysid;	    /* (c) System Id of the lock owner */
	int	lo_hash;	    /* (c) Used to lock the appropriate chain */
	struct owner_vertex *lo_vertex; /* (g) entry in deadlock graph */
};

LIST_HEAD(lock_owner_list, lock_owner);

/* One bucket of the lock owner hash table: a list and the lock guarding it. */
struct lock_owner_chain {
	struct sx		lock;
	struct lock_owner_list	list;
};

static struct sx		lf_lock_states_lock;
static struct lockf_list	lf_lock_states; /* (S) */
/* Lock owner hash table, indexed by the value returned by lf_hash_owner(). */
static struct lock_owner_chain	lf_lock_owners[LOCK_OWNER_HASH_SIZE];

/*
 * Structures for deadlock detection.
 *
 * We have two types of directed graph, the first is the set of locks,
 * both active and pending on a vnode. Within this graph, active locks
 * are terminal nodes in the graph (i.e. have no out-going
 * edges). Pending locks have out-going edges to each blocking active
 * lock that prevents the lock from being granted and also to each
 * older pending lock that would block them if it was active. The
 * graph for each vnode is naturally acyclic; new edges are only ever
 * added to or from new nodes (either new pending locks which only add
 * out-going edges or new active locks which only add in-coming edges)
 * therefore they cannot create loops in the lock graph.
 *
 * The second graph is a global graph of lock owners.
Each lock owner 232 * is a vertex in that graph and an edge is added to the graph 233 * whenever an edge is added to a vnode graph, with end points 234 * corresponding to owner of the new pending lock and the owner of the 235 * lock upon which it waits. In order to prevent deadlock, we only add 236 * an edge to this graph if the new edge would not create a cycle. 237 * 238 * The lock owner graph is topologically sorted, i.e. if a node has 239 * any outgoing edges, then it has an order strictly less than any 240 * node to which it has an outgoing edge. We preserve this ordering 241 * (and detect cycles) on edge insertion using Algorithm PK from the 242 * paper "A Dynamic Topological Sort Algorithm for Directed Acyclic 243 * Graphs" (ACM Journal of Experimental Algorithms, Vol 11, Article 244 * No. 1.7) 245 */ 246 struct owner_vertex; 247 248 struct owner_edge { 249 LIST_ENTRY(owner_edge) e_outlink; /* (g) link from's out-edge list */ 250 LIST_ENTRY(owner_edge) e_inlink; /* (g) link to's in-edge list */ 251 int e_refs; /* (g) number of times added */ 252 struct owner_vertex *e_from; /* (c) out-going from here */ 253 struct owner_vertex *e_to; /* (c) in-coming to here */ 254 }; 255 LIST_HEAD(owner_edge_list, owner_edge); 256 257 struct owner_vertex { 258 TAILQ_ENTRY(owner_vertex) v_link; /* (g) workspace for edge insertion */ 259 uint32_t v_gen; /* (g) workspace for edge insertion */ 260 int v_order; /* (g) order of vertex in graph */ 261 struct owner_edge_list v_outedges;/* (g) list of out-edges */ 262 struct owner_edge_list v_inedges; /* (g) list of in-edges */ 263 struct lock_owner *v_owner; /* (c) corresponding lock owner */ 264 }; 265 TAILQ_HEAD(owner_vertex_list, owner_vertex); 266 267 struct owner_graph { 268 struct owner_vertex** g_vertices; /* (g) pointers to vertices */ 269 int g_size; /* (g) number of vertices */ 270 int g_space; /* (g) space allocated for vertices */ 271 int *g_indexbuf; /* (g) workspace for loop detection */ 272 uint32_t g_gen; /* (g) 
increment when re-ordering */ 273 }; 274 275 static struct sx lf_owner_graph_lock; 276 static struct owner_graph lf_owner_graph; 277 278 /* 279 * Initialise various structures and locks. 280 */ 281 static void 282 lf_init(void *dummy) 283 { 284 int i; 285 286 sx_init(&lf_lock_states_lock, "lock states lock"); 287 LIST_INIT(&lf_lock_states); 288 289 for (i = 0; i < LOCK_OWNER_HASH_SIZE; i++) { 290 sx_init(&lf_lock_owners[i].lock, "lock owners lock"); 291 LIST_INIT(&lf_lock_owners[i].list); 292 } 293 294 sx_init(&lf_owner_graph_lock, "owner graph lock"); 295 graph_init(&lf_owner_graph); 296 } 297 SYSINIT(lf_init, SI_SUB_LOCK, SI_ORDER_FIRST, lf_init, NULL); 298 299 /* 300 * Generate a hash value for a lock owner. 301 */ 302 static int 303 lf_hash_owner(caddr_t id, struct vnode *vp, struct flock *fl, int flags) 304 { 305 uint32_t h; 306 307 if (flags & F_REMOTE) { 308 h = HASHSTEP(0, fl->l_pid); 309 h = HASHSTEP(h, fl->l_sysid); 310 } else if (flags & F_FLOCK) { 311 h = ((uintptr_t) id) >> 7; 312 } else { 313 h = ((uintptr_t) vp) >> 7; 314 } 315 316 return (h % LOCK_OWNER_HASH_SIZE); 317 } 318 319 /* 320 * Return true if a lock owner matches the details passed to 321 * lf_advlock. 
*/
static int
lf_owner_matches(struct lock_owner *lo, caddr_t id, struct flock *fl,
    int flags)
{
	/* Remote owners are identified by <pid, sysid>, local ones by 'id'. */
	if (flags & F_REMOTE) {
		return lo->lo_pid == fl->l_pid
		    && lo->lo_sysid == fl->l_sysid;
	} else {
		return lo->lo_id == id;
	}
}

/*
 * Allocate a zero-filled lockf_entry.  If 'lo' is non-NULL, take a
 * reference on that owner (under its hash chain lock) and record it
 * in lf_owner.  The entry's own lf_refs is left at zero for the
 * caller to set.
 */
static struct lockf_entry *
lf_alloc_lock(struct lock_owner *lo)
{
	struct lockf_entry *lf;

	lf = malloc(sizeof(struct lockf_entry), M_LOCKF, M_WAITOK|M_ZERO);

#ifdef LOCKF_DEBUG
	if (lockf_debug & 4)
		printf("Allocated lock %p\n", lf);
#endif
	if (lo) {
		sx_xlock(&lf_lock_owners[lo->lo_hash].lock);
		lo->lo_refs++;
		sx_xunlock(&lf_lock_owners[lo->lo_hash].lock);
		lf->lf_owner = lo;
	}

	return (lf);
}

/*
 * Drop one reference on 'lock'.  Returns 0 if other references remain
 * and 1 if this was the last reference and the entry was freed.  When
 * the entry is freed, its owner's reference count is dropped too and
 * the owner (including any vertex in the owner graph) is released
 * once no lock refers to it.  For F_REMOTE locks the vnode reference
 * held by the entry is released as well.
 */
static int
lf_free_lock(struct lockf_entry *lock)
{
	struct sx *chainlock;

	KASSERT(lock->lf_refs > 0, ("lockf_entry negative ref count %p", lock));
	if (--lock->lf_refs > 0)
		return (0);
	/*
	 * Adjust the lock_owner reference count and
	 * reclaim the entry if this is the last lock
	 * for that owner.
	 */
	struct lock_owner *lo = lock->lf_owner;
	if (lo) {
		KASSERT(LIST_EMPTY(&lock->lf_outedges),
		    ("freeing lock with dependencies"));
		KASSERT(LIST_EMPTY(&lock->lf_inedges),
		    ("freeing lock with dependants"));
		/*
		 * Remember the chain lock now: it lives in the static
		 * hash table, so it remains valid for the unlock below
		 * even after 'lo' itself has been freed.
		 */
		chainlock = &lf_lock_owners[lo->lo_hash].lock;
		sx_xlock(chainlock);
		KASSERT(lo->lo_refs > 0, ("lock owner refcount"));
		lo->lo_refs--;
		if (lo->lo_refs == 0) {
#ifdef LOCKF_DEBUG
			if (lockf_debug & 1)
				printf("lf_free_lock: freeing lock owner %p\n",
				    lo);
#endif
			if (lo->lo_vertex) {
				sx_xlock(&lf_owner_graph_lock);
				graph_free_vertex(&lf_owner_graph,
				    lo->lo_vertex);
				sx_xunlock(&lf_owner_graph_lock);
			}
			LIST_REMOVE(lo, lo_link);
			free(lo, M_LOCKF);
#ifdef LOCKF_DEBUG
			/* Only the pointer value is printed; 'lo' is not dereferenced. */
			if (lockf_debug & 4)
				printf("Freed lock owner %p\n", lo);
#endif
		}
		sx_unlock(chainlock);
	}
	if ((lock->lf_flags & F_REMOTE) && lock->lf_vnode) {
		vrele(lock->lf_vnode);
		lock->lf_vnode = NULL;
	}
#ifdef LOCKF_DEBUG
	if (lockf_debug & 4)
		printf("Freed lock %p\n", lock);
#endif
	free(lock, M_LOCKF);
	return (1);
}

/*
 * Advisory record locking support
 *
 * Perform the operation ap->a_op (F_SETLK, F_UNLCK, F_GETLK, F_CANCEL
 * or F_UNLCKSYS) described by ap->a_fl on vnode ap->a_vp on behalf of
 * the owner identified by ap->a_id / ap->a_flags.  '*statep' is the
 * caller's pointer to the per-vnode lock state, which is allocated
 * here on first use.  'size' is the file size used to resolve
 * SEEK_END offsets.
 *
 * Returns 0 on success, EINVAL for a bad whence, range or operation,
 * EOVERFLOW if the range does not fit in off_t, ENOENT if the vnode
 * is doomed, or the error returned by the operation itself.
 */
int
lf_advlockasync(struct vop_advlockasync_args *ap, struct lockf **statep,
    u_quad_t size)
{
	struct lockf *state;
	struct flock *fl = ap->a_fl;
	struct lockf_entry *lock;
	struct vnode *vp = ap->a_vp;
	caddr_t id = ap->a_id;
	int flags = ap->a_flags;
	int hash;
	struct lock_owner *lo;
	off_t start, end, oadd;
	int error;

	/*
	 * Handle the F_UNLCKSYS case first - no need to mess about
	 * creating a lock owner for this one.
	 */
	if (ap->a_op == F_UNLCKSYS) {
		lf_clearremotesys(fl->l_sysid);
		return (0);
	}

	/*
	 * Convert the flock structure into a start and end.
	 */
	switch (fl->l_whence) {
	case SEEK_SET:
	case SEEK_CUR:
		/*
		 * Caller is responsible for adding any necessary offset
		 * when SEEK_CUR is used.
		 */
		start = fl->l_start;
		break;

	case SEEK_END:
		/* Reject sizes for which size + l_start would overflow off_t. */
		if (size > OFF_MAX ||
		    (fl->l_start > 0 && size > OFF_MAX - fl->l_start))
			return (EOVERFLOW);
		start = size + fl->l_start;
		break;

	default:
		return (EINVAL);
	}
	if (start < 0)
		return (EINVAL);
	if (fl->l_len < 0) {
		/* A negative length covers the bytes preceding 'start'. */
		if (start == 0)
			return (EINVAL);
		end = start - 1;
		start += fl->l_len;
		if (start < 0)
			return (EINVAL);
	} else if (fl->l_len == 0) {
		/* A zero length means "through to the largest offset". */
		end = OFF_MAX;
	} else {
		oadd = fl->l_len - 1;
		if (oadd > OFF_MAX - start)
			return (EOVERFLOW);
		end = start + oadd;
	}

retry_setlock:

	/*
	 * Avoid the common case of unlocking when inode has no locks.
	 */
	if (ap->a_op != F_SETLK && (*statep) == NULL) {
		/* Re-check under the vnode interlock before trusting it. */
		VI_LOCK(vp);
		if ((*statep) == NULL) {
			fl->l_type = F_UNLCK;
			VI_UNLOCK(vp);
			return (0);
		}
		VI_UNLOCK(vp);
	}

	/*
	 * Map our arguments to an existing lock owner or create one
	 * if this is the first time we have seen this owner.
	 */
	hash = lf_hash_owner(id, vp, fl, flags);
	sx_xlock(&lf_lock_owners[hash].lock);
	LIST_FOREACH(lo, &lf_lock_owners[hash].list, lo_link)
		if (lf_owner_matches(lo, id, fl, flags))
			break;
	if (!lo) {
		/*
		 * We initialise the lock with a reference
		 * count which matches the new lockf_entry
		 * structure created below.
		 */
		lo = malloc(sizeof(struct lock_owner), M_LOCKF,
		    M_WAITOK|M_ZERO);
#ifdef LOCKF_DEBUG
		if (lockf_debug & 4)
			printf("Allocated lock owner %p\n", lo);
#endif

		lo->lo_refs = 1;
		lo->lo_flags = flags;
		lo->lo_id = id;
		lo->lo_hash = hash;
		if (flags & F_REMOTE) {
			lo->lo_pid = fl->l_pid;
			lo->lo_sysid = fl->l_sysid;
		} else if (flags & F_FLOCK) {
			lo->lo_pid = -1;
			lo->lo_sysid = 0;
		} else {
			/* Otherwise 'id' is treated as the owning process. */
			struct proc *p = (struct proc *) id;
			lo->lo_pid = p->p_pid;
			lo->lo_sysid = 0;
		}
		lo->lo_vertex = NULL;

#ifdef LOCKF_DEBUG
		if (lockf_debug & 1) {
			printf("lf_advlockasync: new lock owner %p ", lo);
			lf_print_owner(lo);
			printf("\n");
		}
#endif

		LIST_INSERT_HEAD(&lf_lock_owners[hash].list, lo, lo_link);
	} else {
		/*
		 * We have seen this lock owner before, increase its
		 * reference count to account for the new lockf_entry
		 * structure we create below.
		 */
		lo->lo_refs++;
	}
	sx_xunlock(&lf_lock_owners[hash].lock);

	/*
	 * Create the lockf structure. We initialise the lf_owner
	 * field here instead of in lf_alloc_lock() to avoid paying
	 * the lf_lock_owners_lock tax twice.
	 */
	lock = lf_alloc_lock(NULL);
	lock->lf_refs = 1;
	lock->lf_start = start;
	lock->lf_end = end;
	lock->lf_owner = lo;
	lock->lf_vnode = vp;
	if (flags & F_REMOTE) {
		/*
		 * For remote locks, the caller may release its ref to
		 * the vnode at any time - we have to ref it here to
		 * prevent it from being recycled unexpectedly.
		 */
		vref(vp);
	}

	lock->lf_type = fl->l_type;
	LIST_INIT(&lock->lf_outedges);
	LIST_INIT(&lock->lf_inedges);
	lock->lf_async_task = ap->a_task;
	lock->lf_flags = ap->a_flags;

	/*
	 * Do the requested operation. First find our state structure
	 * and create a new one if necessary - the caller's *statep
	 * variable and the state's ls_threads count are protected by
	 * the vnode interlock.
	 */
	VI_LOCK(vp);
	if (VN_IS_DOOMED(vp)) {
		VI_UNLOCK(vp);
		lf_free_lock(lock);
		return (ENOENT);
	}

	/*
	 * Allocate a state structure if necessary.
	 */
	state = *statep;
	if (state == NULL) {
		struct lockf *ls;

		/* The interlock is dropped around the M_WAITOK allocation. */
		VI_UNLOCK(vp);

		ls = malloc(sizeof(struct lockf), M_LOCKF, M_WAITOK|M_ZERO);
		sx_init(&ls->ls_lock, "ls_lock");
		LIST_INIT(&ls->ls_active);
		LIST_INIT(&ls->ls_pending);
		/* The new state starts out with this thread counted in it. */
		ls->ls_threads = 1;

		sx_xlock(&lf_lock_states_lock);
		LIST_INSERT_HEAD(&lf_lock_states, ls, ls_link);
		sx_xunlock(&lf_lock_states_lock);

		/*
		 * Cope if we lost a race with some other thread while
		 * trying to allocate memory.
		 */
		VI_LOCK(vp);
		if (VN_IS_DOOMED(vp)) {
			VI_UNLOCK(vp);
			sx_xlock(&lf_lock_states_lock);
			LIST_REMOVE(ls, ls_link);
			sx_xunlock(&lf_lock_states_lock);
			sx_destroy(&ls->ls_lock);
			free(ls, M_LOCKF);
			lf_free_lock(lock);
			return (ENOENT);
		}
		if ((*statep) == NULL) {
			state = *statep = ls;
			VI_UNLOCK(vp);
		} else {
			/* Another thread installed a state first; use it. */
			state = *statep;
			MPASS(state->ls_threads >= 0);
			state->ls_threads++;
			VI_UNLOCK(vp);

			sx_xlock(&lf_lock_states_lock);
			LIST_REMOVE(ls, ls_link);
			sx_xunlock(&lf_lock_states_lock);
			sx_destroy(&ls->ls_lock);
			free(ls, M_LOCKF);
		}
	} else {
		MPASS(state->ls_threads >= 0);
		state->ls_threads++;
		VI_UNLOCK(vp);
	}

	sx_xlock(&state->ls_lock);
	/*
	 * Recheck the doomed vnode after state->ls_lock is
	 * locked. lf_purgelocks() requires that no new threads add
	 * pending locks when vnode is marked by VIRF_DOOMED flag.
	 */
	if (VN_IS_DOOMED(vp)) {
		VI_LOCK(vp);
		MPASS(state->ls_threads > 0);
		state->ls_threads--;
		/* lf_purgelocks() sleeps on 'state' until the threads leave. */
		wakeup(state);
		VI_UNLOCK(vp);
		sx_xunlock(&state->ls_lock);
		lf_free_lock(lock);
		return (ENOENT);
	}

	/*
	 * Every operation except F_SETLK only uses 'lock' as a
	 * description of the range and owner, so it is released here
	 * as soon as the operation returns.
	 */
	switch (ap->a_op) {
	case F_SETLK:
		error = lf_setlock(state, lock, vp, ap->a_cookiep);
		break;

	case F_UNLCK:
		error = lf_clearlock(state, lock);
		lf_free_lock(lock);
		break;

	case F_GETLK:
		error = lf_getlock(state, lock, fl);
		lf_free_lock(lock);
		break;

	case F_CANCEL:
		if (ap->a_cookiep)
			error = lf_cancel(state, lock, *ap->a_cookiep);
		else
			error = EINVAL;
		lf_free_lock(lock);
		break;

	default:
		lf_free_lock(lock);
		error = EINVAL;
		break;
	}

#ifdef DIAGNOSTIC
	/*
	 * Check for some can't happen stuff. In this case, the active
	 * lock list becoming disordered or containing mutually
	 * blocking locks. We also check the pending list for locks
	 * which should be active (i.e. have no out-going edges).
	 */
	LIST_FOREACH(lock, &state->ls_active, lf_link) {
		struct lockf_entry *lf;
		if (LIST_NEXT(lock, lf_link))
			KASSERT((lock->lf_start
				<= LIST_NEXT(lock, lf_link)->lf_start),
			    ("locks disordered"));
		LIST_FOREACH(lf, &state->ls_active, lf_link) {
			if (lock == lf)
				break;
			KASSERT(!lf_blocks(lock, lf),
			    ("two conflicting active locks"));
			if (lock->lf_owner == lf->lf_owner)
				KASSERT(!lf_overlaps(lock, lf),
				    ("two overlapping locks from same owner"));
		}
	}
	LIST_FOREACH(lock, &state->ls_pending, lf_link) {
		KASSERT(!LIST_EMPTY(&lock->lf_outedges),
		    ("pending lock which should be active"));
	}
#endif
	sx_xunlock(&state->ls_lock);

	VI_LOCK(vp);
	MPASS(state->ls_threads > 0);
	state->ls_threads--;
	if (state->ls_threads != 0) {
		wakeup(state);
	}
	VI_UNLOCK(vp);

	/*
	 * NOTE(review): EDOOFUS is used as an internal "start over"
	 * signal from lf_setlock(); the condition that produces it is
	 * not visible in this function - confirm against lf_setlock().
	 */
	if (error == EDOOFUS) {
		KASSERT(ap->a_op == F_SETLK, ("EDOOFUS"));
		goto retry_setlock;
	}
	return (error);
}

/*
 * Synchronous variant of lf_advlockasync(): forwards the request with
 * no async task and no cancellation cookie.
 */
int
lf_advlock(struct vop_advlock_args *ap, struct lockf **statep, u_quad_t size)
{
	struct vop_advlockasync_args a;

	a.a_vp = ap->a_vp;
	a.a_id = ap->a_id;
	a.a_op = ap->a_op;
	a.a_fl = ap->a_fl;
	a.a_flags = ap->a_flags;
	a.a_task = NULL;
	a.a_cookiep = NULL;

	return (lf_advlockasync(&a, statep, size));
}

/*
 * Tear down all lock state attached to a doomed vnode: detach the
 * state from '*statep', abort every pending lock, wait for the other
 * threads using the state to leave, then free the active locks and
 * the state itself.
 */
void
lf_purgelocks(struct vnode *vp, struct lockf **statep)
{
	struct lockf *state;
	struct lockf_entry *lock, *nlock;

	/*
	 * For this to work correctly, the caller must ensure that no
	 * other threads enter the locking system for this vnode,
	 * e.g. by checking VIRF_DOOMED. We wake up any threads that are
	 * sleeping waiting for locks on this vnode and then free all
	 * the remaining locks.
	 */
	VI_LOCK(vp);
	KASSERT(VN_IS_DOOMED(vp),
	    ("lf_purgelocks: vp %p has not vgone yet", vp));
	state = *statep;
	if (state == NULL) {
		VI_UNLOCK(vp);
		return;
	}
	*statep = NULL;
	/* Fast path: nothing is locked and nobody else is using the state. */
	if (LIST_EMPTY(&state->ls_active) && state->ls_threads == 0) {
		KASSERT(LIST_EMPTY(&state->ls_pending),
		    ("freeing state with pending locks"));
		VI_UNLOCK(vp);
		goto out_free;
	}
	MPASS(state->ls_threads >= 0);
	/* Count ourselves, so the wait below is for ls_threads to reach 1. */
	state->ls_threads++;
	VI_UNLOCK(vp);

	sx_xlock(&state->ls_lock);
	sx_xlock(&lf_owner_graph_lock);
	LIST_FOREACH_SAFE(lock, &state->ls_pending, lf_link, nlock) {
		LIST_REMOVE(lock, lf_link);
		lf_remove_outgoing(lock);
		lf_remove_incoming(lock);

		/*
		 * If it's an async lock, we can just free it
		 * here, otherwise we let the sleeping thread
		 * free it.
		 */
		if (lock->lf_async_task) {
			lf_free_lock(lock);
		} else {
			lock->lf_flags |= F_INTR;
			wakeup(lock);
		}
	}
	sx_xunlock(&lf_owner_graph_lock);
	sx_xunlock(&state->ls_lock);

	/*
	 * Wait for all other threads, sleeping and otherwise
	 * to leave.
	 */
	VI_LOCK(vp);
	while (state->ls_threads > 1)
		msleep(state, VI_MTX(vp), 0, "purgelocks", 0);
	VI_UNLOCK(vp);

	/*
	 * We can just free all the active locks since they
	 * will have no dependencies (we removed them all
	 * above). We don't need to bother locking since we
	 * are the last thread using this state structure.
	 */
	KASSERT(LIST_EMPTY(&state->ls_pending),
	    ("lock pending for %p", state));
	LIST_FOREACH_SAFE(lock, &state->ls_active, lf_link, nlock) {
		LIST_REMOVE(lock, lf_link);
		lf_free_lock(lock);
	}
out_free:
	sx_xlock(&lf_lock_states_lock);
	LIST_REMOVE(state, ls_link);
	sx_xunlock(&lf_lock_states_lock);
	sx_destroy(&state->ls_lock);
	free(state, M_LOCKF);
}

/*
 * Return non-zero if locks 'x' and 'y' overlap.
839 */ 840 static int 841 lf_overlaps(struct lockf_entry *x, struct lockf_entry *y) 842 { 843 844 return (x->lf_start <= y->lf_end && x->lf_end >= y->lf_start); 845 } 846 847 /* 848 * Return non-zero if lock 'x' is blocked by lock 'y' (or vice versa). 849 */ 850 static int 851 lf_blocks(struct lockf_entry *x, struct lockf_entry *y) 852 { 853 854 return x->lf_owner != y->lf_owner 855 && (x->lf_type == F_WRLCK || y->lf_type == F_WRLCK) 856 && lf_overlaps(x, y); 857 } 858 859 /* 860 * Allocate a lock edge from the free list 861 */ 862 static struct lockf_edge * 863 lf_alloc_edge(void) 864 { 865 866 return (malloc(sizeof(struct lockf_edge), M_LOCKF, M_WAITOK|M_ZERO)); 867 } 868 869 /* 870 * Free a lock edge. 871 */ 872 static void 873 lf_free_edge(struct lockf_edge *e) 874 { 875 876 free(e, M_LOCKF); 877 } 878 879 /* 880 * Ensure that the lock's owner has a corresponding vertex in the 881 * owner graph. 882 */ 883 static void 884 lf_alloc_vertex(struct lockf_entry *lock) 885 { 886 struct owner_graph *g = &lf_owner_graph; 887 888 if (!lock->lf_owner->lo_vertex) 889 lock->lf_owner->lo_vertex = 890 graph_alloc_vertex(g, lock->lf_owner); 891 } 892 893 /* 894 * Attempt to record an edge from lock x to lock y. Return EDEADLK if 895 * the new edge would cause a cycle in the owner graph. 896 */ 897 static int 898 lf_add_edge(struct lockf_entry *x, struct lockf_entry *y) 899 { 900 struct owner_graph *g = &lf_owner_graph; 901 struct lockf_edge *e; 902 int error; 903 904 #ifdef DIAGNOSTIC 905 LIST_FOREACH(e, &x->lf_outedges, le_outlink) 906 KASSERT(e->le_to != y, ("adding lock edge twice")); 907 #endif 908 909 /* 910 * Make sure the two owners have entries in the owner graph. 
911 */ 912 lf_alloc_vertex(x); 913 lf_alloc_vertex(y); 914 915 error = graph_add_edge(g, x->lf_owner->lo_vertex, 916 y->lf_owner->lo_vertex); 917 if (error) 918 return (error); 919 920 e = lf_alloc_edge(); 921 LIST_INSERT_HEAD(&x->lf_outedges, e, le_outlink); 922 LIST_INSERT_HEAD(&y->lf_inedges, e, le_inlink); 923 e->le_from = x; 924 e->le_to = y; 925 926 return (0); 927 } 928 929 /* 930 * Remove an edge from the lock graph. 931 */ 932 static void 933 lf_remove_edge(struct lockf_edge *e) 934 { 935 struct owner_graph *g = &lf_owner_graph; 936 struct lockf_entry *x = e->le_from; 937 struct lockf_entry *y = e->le_to; 938 939 graph_remove_edge(g, x->lf_owner->lo_vertex, y->lf_owner->lo_vertex); 940 LIST_REMOVE(e, le_outlink); 941 LIST_REMOVE(e, le_inlink); 942 e->le_from = NULL; 943 e->le_to = NULL; 944 lf_free_edge(e); 945 } 946 947 /* 948 * Remove all out-going edges from lock x. 949 */ 950 static void 951 lf_remove_outgoing(struct lockf_entry *x) 952 { 953 struct lockf_edge *e; 954 955 while ((e = LIST_FIRST(&x->lf_outedges)) != NULL) { 956 lf_remove_edge(e); 957 } 958 } 959 960 /* 961 * Remove all in-coming edges from lock x. 962 */ 963 static void 964 lf_remove_incoming(struct lockf_entry *x) 965 { 966 struct lockf_edge *e; 967 968 while ((e = LIST_FIRST(&x->lf_inedges)) != NULL) { 969 lf_remove_edge(e); 970 } 971 } 972 973 /* 974 * Walk the list of locks for the file and create an out-going edge 975 * from lock to each blocking lock. 976 */ 977 static int 978 lf_add_outgoing(struct lockf *state, struct lockf_entry *lock) 979 { 980 struct lockf_entry *overlap; 981 int error; 982 983 LIST_FOREACH(overlap, &state->ls_active, lf_link) { 984 /* 985 * We may assume that the active list is sorted by 986 * lf_start. 987 */ 988 if (overlap->lf_start > lock->lf_end) 989 break; 990 if (!lf_blocks(lock, overlap)) 991 continue; 992 993 /* 994 * We've found a blocking lock. Add the corresponding 995 * edge to the graphs and see if it would cause a 996 * deadlock. 
997 */ 998 error = lf_add_edge(lock, overlap); 999 1000 /* 1001 * The only error that lf_add_edge returns is EDEADLK. 1002 * Remove any edges we added and return the error. 1003 */ 1004 if (error) { 1005 lf_remove_outgoing(lock); 1006 return (error); 1007 } 1008 } 1009 1010 /* 1011 * We also need to add edges to sleeping locks that block 1012 * us. This ensures that lf_wakeup_lock cannot grant two 1013 * mutually blocking locks simultaneously and also enforces a 1014 * 'first come, first served' fairness model. Note that this 1015 * only happens if we are blocked by at least one active lock 1016 * due to the call to lf_getblock in lf_setlock below. 1017 */ 1018 LIST_FOREACH(overlap, &state->ls_pending, lf_link) { 1019 if (!lf_blocks(lock, overlap)) 1020 continue; 1021 /* 1022 * We've found a blocking lock. Add the corresponding 1023 * edge to the graphs and see if it would cause a 1024 * deadlock. 1025 */ 1026 error = lf_add_edge(lock, overlap); 1027 1028 /* 1029 * The only error that lf_add_edge returns is EDEADLK. 1030 * Remove any edges we added and return the error. 1031 */ 1032 if (error) { 1033 lf_remove_outgoing(lock); 1034 return (error); 1035 } 1036 } 1037 1038 return (0); 1039 } 1040 1041 /* 1042 * Walk the list of pending locks for the file and create an in-coming 1043 * edge from lock to each blocking lock. 1044 */ 1045 static int 1046 lf_add_incoming(struct lockf *state, struct lockf_entry *lock) 1047 { 1048 struct lockf_entry *overlap; 1049 int error; 1050 1051 sx_assert(&state->ls_lock, SX_XLOCKED); 1052 if (LIST_EMPTY(&state->ls_pending)) 1053 return (0); 1054 1055 error = 0; 1056 sx_xlock(&lf_owner_graph_lock); 1057 LIST_FOREACH(overlap, &state->ls_pending, lf_link) { 1058 if (!lf_blocks(lock, overlap)) 1059 continue; 1060 1061 /* 1062 * We've found a blocking lock. Add the corresponding 1063 * edge to the graphs and see if it would cause a 1064 * deadlock. 
1065 */ 1066 error = lf_add_edge(overlap, lock); 1067 1068 /* 1069 * The only error that lf_add_edge returns is EDEADLK. 1070 * Remove any edges we added and return the error. 1071 */ 1072 if (error) { 1073 lf_remove_incoming(lock); 1074 break; 1075 } 1076 } 1077 sx_xunlock(&lf_owner_graph_lock); 1078 return (error); 1079 } 1080 1081 /* 1082 * Insert lock into the active list, keeping list entries ordered by 1083 * increasing values of lf_start. 1084 */ 1085 static void 1086 lf_insert_lock(struct lockf *state, struct lockf_entry *lock) 1087 { 1088 struct lockf_entry *lf, *lfprev; 1089 1090 if (LIST_EMPTY(&state->ls_active)) { 1091 LIST_INSERT_HEAD(&state->ls_active, lock, lf_link); 1092 return; 1093 } 1094 1095 lfprev = NULL; 1096 LIST_FOREACH(lf, &state->ls_active, lf_link) { 1097 if (lf->lf_start > lock->lf_start) { 1098 LIST_INSERT_BEFORE(lf, lock, lf_link); 1099 return; 1100 } 1101 lfprev = lf; 1102 } 1103 LIST_INSERT_AFTER(lfprev, lock, lf_link); 1104 } 1105 1106 /* 1107 * Wake up a sleeping lock and remove it from the pending list now 1108 * that all its dependencies have been resolved. The caller should 1109 * arrange for the lock to be added to the active list, adjusting any 1110 * existing locks for the same owner as needed. 1111 */ 1112 static void 1113 lf_wakeup_lock(struct lockf *state, struct lockf_entry *wakelock) 1114 { 1115 1116 /* 1117 * Remove from ls_pending list and wake up the caller 1118 * or start the async notification, as appropriate. 1119 */ 1120 LIST_REMOVE(wakelock, lf_link); 1121 #ifdef LOCKF_DEBUG 1122 if (lockf_debug & 1) 1123 lf_print("lf_wakeup_lock: awakening", wakelock); 1124 #endif /* LOCKF_DEBUG */ 1125 if (wakelock->lf_async_task) { 1126 taskqueue_enqueue(taskqueue_thread, wakelock->lf_async_task); 1127 } else { 1128 wakeup(wakelock); 1129 } 1130 } 1131 1132 /* 1133 * Re-check all dependent locks and remove edges to locks that we no 1134 * longer block. 
If 'all' is non-zero, the lock has been removed and 1135 * we must remove all the dependencies, otherwise it has simply been 1136 * reduced but remains active. Any pending locks which have been been 1137 * unblocked are added to 'granted' 1138 */ 1139 static void 1140 lf_update_dependancies(struct lockf *state, struct lockf_entry *lock, int all, 1141 struct lockf_entry_list *granted) 1142 { 1143 struct lockf_edge *e, *ne; 1144 struct lockf_entry *deplock; 1145 1146 LIST_FOREACH_SAFE(e, &lock->lf_inedges, le_inlink, ne) { 1147 deplock = e->le_from; 1148 if (all || !lf_blocks(lock, deplock)) { 1149 sx_xlock(&lf_owner_graph_lock); 1150 lf_remove_edge(e); 1151 sx_xunlock(&lf_owner_graph_lock); 1152 if (LIST_EMPTY(&deplock->lf_outedges)) { 1153 lf_wakeup_lock(state, deplock); 1154 LIST_INSERT_HEAD(granted, deplock, lf_link); 1155 } 1156 } 1157 } 1158 } 1159 1160 /* 1161 * Set the start of an existing active lock, updating dependencies and 1162 * adding any newly woken locks to 'granted'. 1163 */ 1164 static void 1165 lf_set_start(struct lockf *state, struct lockf_entry *lock, off_t new_start, 1166 struct lockf_entry_list *granted) 1167 { 1168 1169 KASSERT(new_start >= lock->lf_start, ("can't increase lock")); 1170 lock->lf_start = new_start; 1171 LIST_REMOVE(lock, lf_link); 1172 lf_insert_lock(state, lock); 1173 lf_update_dependancies(state, lock, FALSE, granted); 1174 } 1175 1176 /* 1177 * Set the end of an existing active lock, updating dependencies and 1178 * adding any newly woken locks to 'granted'. 
*/
static void
lf_set_end(struct lockf *state, struct lockf_entry *lock, off_t new_end,
	struct lockf_entry_list *granted)
{

	/* The range may only shrink; lf_start is unchanged so no re-sort. */
	KASSERT(new_end <= lock->lf_end, ("can't increase lock"));
	lock->lf_end = new_end;
	lf_update_dependancies(state, lock, FALSE, granted);
}

/*
 * Add a lock to the active list, updating or removing any current
 * locks owned by the same owner and processing any pending locks that
 * become unblocked as a result.  This code is also used for unlock
 * since the logic for updating existing locks is identical.
 *
 * As a result of processing the new lock, we may unblock existing
 * pending locks as a result of downgrading/unlocking.  We simply
 * activate the newly granted locks by looping.
 *
 * Since the new lock already has its dependencies set up, we always
 * add it to the list (unless it's an unlock request).  This may
 * fragment the lock list in some pathological cases but it's probably
 * not a real problem.
 */
static void
lf_activate_lock(struct lockf *state, struct lockf_entry *lock)
{
	struct lockf_entry *overlap, *lf;
	struct lockf_entry_list granted;
	int ovcase;

	/*
	 * 'granted' is a work list: it starts with the lock we were
	 * given, and the helpers below append any pending locks they
	 * wake up while we process it.
	 */
	LIST_INIT(&granted);
	LIST_INSERT_HEAD(&granted, lock, lf_link);

	while (!LIST_EMPTY(&granted)) {
		lock = LIST_FIRST(&granted);
		LIST_REMOVE(lock, lf_link);

		/*
		 * Skip over locks owned by other processes.  Handle
		 * any locks that overlap and are owned by ourselves.
		 */
		overlap = LIST_FIRST(&state->ls_active);
		for (;;) {
			ovcase = lf_findoverlap(&overlap, lock, SELF);

#ifdef LOCKF_DEBUG
			if (ovcase && (lockf_debug & 2)) {
				printf("lf_setlock: overlap %d", ovcase);
				lf_print("", overlap);
			}
#endif
			/*
			 * Six cases:
			 *	0) no overlap
			 *	1) overlap == lock
			 *	2) overlap contains lock
			 *	3) lock contains overlap
			 *	4) overlap starts before lock
			 *	5) overlap ends after lock
			 *
			 * Cases that 'continue' keep scanning the
			 * active list for further overlaps; cases
			 * that 'break' fall through to the 'break'
			 * after the switch and end the scan.
			 */
			switch (ovcase) {
			case 0: /* no overlap */
				break;

			case 1: /* overlap == lock */
				/*
				 * We have already setup the
				 * dependants for the new lock, taking
				 * into account a possible downgrade
				 * or unlock.  Remove the old lock.
				 */
				LIST_REMOVE(overlap, lf_link);
				lf_update_dependancies(state, overlap, TRUE,
					&granted);
				lf_free_lock(overlap);
				break;

			case 2: /* overlap contains lock */
				/*
				 * Just split the existing lock.
				 */
				lf_split(state, overlap, lock, &granted);
				break;

			case 3: /* lock contains overlap */
				/*
				 * Delete the overlap and advance to
				 * the next entry in the list.  The
				 * successor is fetched before the
				 * entry is unlinked and freed.
				 */
				lf = LIST_NEXT(overlap, lf_link);
				LIST_REMOVE(overlap, lf_link);
				lf_update_dependancies(state, overlap, TRUE,
					&granted);
				lf_free_lock(overlap);
				overlap = lf;
				continue;

			case 4: /* overlap starts before lock */
				/*
				 * Just update the overlap end and
				 * move on.
				 */
				lf_set_end(state, overlap, lock->lf_start - 1,
				    &granted);
				overlap = LIST_NEXT(overlap, lf_link);
				continue;

			case 5: /* overlap ends after lock */
				/*
				 * Change the start of overlap and
				 * re-insert.
				 */
				lf_set_start(state, overlap, lock->lf_end + 1,
				    &granted);
				break;
			}
			break;
		}
#ifdef LOCKF_DEBUG
		if (lockf_debug & 1) {
			if (lock->lf_type != F_UNLCK)
				lf_print("lf_activate_lock: activated", lock);
			else
				lf_print("lf_activate_lock: unlocked", lock);
			lf_printlist("lf_activate_lock", lock);
		}
#endif /* LOCKF_DEBUG */
		/* An unlock request only trims existing locks. */
		if (lock->lf_type != F_UNLCK)
			lf_insert_lock(state, lock);
	}
}

/*
 * Cancel a pending lock request, either as a result of a signal or a
 * cancel request for an async lock.
 *
 * The entry is unlinked from the list it is on, all of its edges are
 * removed, it is passed to lf_free_lock, and any pending locks that
 * become unblocked as a result are activated.
 */
static void
lf_cancel_lock(struct lockf *state, struct lockf_entry *lock)
{
	struct lockf_entry_list granted;

	/*
	 * Note it is theoretically possible that cancelling this lock
	 * may allow some other pending lock to become
	 * active.  Consider this case:
	 *
	 * Owner	Action		Result		Dependencies
	 *
	 * A:		lock [0..0]	succeeds
	 * B:		lock [2..2]	succeeds
	 * C:		lock [1..2]	blocked		C->B
	 * D:		lock [0..1]	blocked		C->B,D->A,D->C
	 * A:		unlock [0..0]			C->B,D->C
	 * C:		cancel [1..2]
	 */

	LIST_REMOVE(lock, lf_link);

	/*
	 * Removing out-going edges is simple.
	 */
	sx_xlock(&lf_owner_graph_lock);
	lf_remove_outgoing(lock);
	sx_xunlock(&lf_owner_graph_lock);

	/*
	 * Removing in-coming edges may allow some other lock to
	 * become active - we use lf_update_dependancies to figure
	 * this out.
	 */
	LIST_INIT(&granted);
	lf_update_dependancies(state, lock, TRUE, &granted);
	lf_free_lock(lock);

	/*
	 * Feed any newly active locks to lf_activate_lock.  'lock' is
	 * reused as the iteration variable from here on.
	 */
	while (!LIST_EMPTY(&granted)) {
		lock = LIST_FIRST(&granted);
		LIST_REMOVE(lock, lf_link);
		lf_activate_lock(state, lock);
	}
}

/*
 * Set a byte-range lock.
*/
/*
 * Arguments:
 *	state		Per-file lock state holding the active and
 *			pending lists
 *	lock		The requested lock; on every error path below
 *			it is passed to lf_free_lock (directly or via
 *			lf_cancel_lock)
 *	vp		Not referenced in this function
 *	cookiep		For async requests that block, receives the
 *			pending entry as an opaque cancel cookie
 *
 * Returns 0 when the lock was granted, EAGAIN when it is blocked and
 * the request is neither F_WAIT nor async, EINPROGRESS when an async
 * request was queued, EINTR when the sleeper was flagged F_INTR,
 * EDOOFUS when lf_free_lock reports non-zero after the sleep, the
 * error from lf_add_outgoing/lf_add_incoming (deadlock), or the
 * sx_sleep error when we woke up still blocked.
 */
static int
lf_setlock(struct lockf *state, struct lockf_entry *lock, struct vnode *vp,
    void **cookiep)
{
	static char lockstr[] = "lockf";
	int error, priority, stops_deferred;

#ifdef LOCKF_DEBUG
	if (lockf_debug & 1)
		lf_print("lf_setlock", lock);
#endif /* LOCKF_DEBUG */

	/*
	 * Set the priority
	 */
	priority = PLOCK;
	if (lock->lf_type == F_WRLCK)
		priority += 4;
	if (!(lock->lf_flags & F_NOINTR))
		priority |= PCATCH;
	/*
	 * Scan lock list for this file looking for locks that would block us.
	 */
	if (lf_getblock(state, lock)) {
		/*
		 * Free the structure and return if nonblocking.
		 */
		if ((lock->lf_flags & F_WAIT) == 0
		    && lock->lf_async_task == NULL) {
			lf_free_lock(lock);
			error = EAGAIN;
			goto out;
		}

		/*
		 * For flock type locks, we must first remove
		 * any shared locks that we hold before we sleep
		 * waiting for an exclusive lock.  This is done by
		 * temporarily presenting the request as an unlock.
		 */
		if ((lock->lf_flags & F_FLOCK) &&
		    lock->lf_type == F_WRLCK) {
			lock->lf_type = F_UNLCK;
			lf_activate_lock(state, lock);
			lock->lf_type = F_WRLCK;
		}

		/*
		 * We are blocked.  Create edges to each blocking lock,
		 * checking for deadlock using the owner graph.  For
		 * simplicity, we run deadlock detection for all
		 * locks, posix and otherwise.
		 */
		sx_xlock(&lf_owner_graph_lock);
		error = lf_add_outgoing(state, lock);
		sx_xunlock(&lf_owner_graph_lock);

		if (error) {
#ifdef LOCKF_DEBUG
			if (lockf_debug & 1)
				lf_print("lf_setlock: deadlock", lock);
#endif
			lf_free_lock(lock);
			goto out;
		}

		/*
		 * We have added edges to everything that blocks
		 * us.  Sleep until they all go away.
		 */
		LIST_INSERT_HEAD(&state->ls_pending, lock, lf_link);
#ifdef LOCKF_DEBUG
		if (lockf_debug & 1) {
			struct lockf_edge *e;
			LIST_FOREACH(e, &lock->lf_outedges, le_outlink) {
				lf_print("lf_setlock: blocking on", e->le_to);
				lf_printlist("lf_setlock", e->le_to);
			}
		}
#endif /* LOCKF_DEBUG */

		if ((lock->lf_flags & F_WAIT) == 0) {
			/*
			 * The caller requested async notification -
			 * this callback happens when the blocking
			 * lock is released, allowing the caller to
			 * make another attempt to take the lock.
			 */
			*cookiep = (void *) lock;
			error = EINPROGRESS;
			goto out;
		}

		/*
		 * Hold an extra reference across the sleep; the
		 * lf_free_lock call right after it drops that
		 * reference again.
		 * NOTE(review): presumably lf_free_lock returns
		 * non-zero when it released the last reference, i.e.
		 * the entry vanished while we slept - confirm against
		 * lf_free_lock.
		 */
		lock->lf_refs++;
		stops_deferred = sigdeferstop(SIGDEFERSTOP_ERESTART);
		error = sx_sleep(lock, &state->ls_lock, priority, lockstr, 0);
		sigallowstop(stops_deferred);
		if (lf_free_lock(lock)) {
			error = EDOOFUS;
			goto out;
		}

		/*
		 * We may have been awakened by a signal and/or by a
		 * debugger continuing us (in which cases we must
		 * remove our lock graph edges) and/or by another
		 * process releasing a lock (in which case our edges
		 * have already been removed and we have been moved to
		 * the active list).  We may also have been woken by
		 * lf_purgelocks which we report to the caller as
		 * EINTR.  In that case, lf_purgelocks will have
		 * removed our lock graph edges.
		 *
		 * Note that it is possible to receive a signal after
		 * we were successfully woken (and moved to the active
		 * list) but before we resumed execution.  In this
		 * case, our lf_outedges list will be clear.  We
		 * pretend there was no error.
		 *
		 * Note also, if we have been sleeping long enough, we
		 * may now have incoming edges from some newer lock
		 * which is waiting behind us in the queue.
		 */
		if (lock->lf_flags & F_INTR) {
			error = EINTR;
			lf_free_lock(lock);
			goto out;
		}
		if (LIST_EMPTY(&lock->lf_outedges)) {
			error = 0;
		} else {
			/* Still blocked: back out, keeping sx_sleep's error. */
			lf_cancel_lock(state, lock);
			goto out;
		}
#ifdef LOCKF_DEBUG
		if (lockf_debug & 1) {
			lf_print("lf_setlock: granted", lock);
		}
#endif
		goto out;
	}
	/*
	 * It looks like we are going to grant the lock.  First add
	 * edges from any currently pending lock that the new lock
	 * would block.
	 */
	error = lf_add_incoming(state, lock);
	if (error) {
#ifdef LOCKF_DEBUG
		if (lockf_debug & 1)
			lf_print("lf_setlock: deadlock", lock);
#endif
		lf_free_lock(lock);
		goto out;
	}

	/*
	 * No blocks!!  Add the lock.  Note that we will
	 * downgrade or upgrade any overlapping locks this
	 * process already owns.
	 */
	lf_activate_lock(state, lock);
	error = 0;
out:
	return (error);
}

/*
 * Remove a byte-range lock on an inode.
 *
 * Generally, find the lock (or an overlap to that lock)
 * and remove it (or shrink it), then wakeup anyone we can.
 *
 * The work is done by lf_activate_lock with an F_UNLCK request.
 * Always returns 0.
 */
static int
lf_clearlock(struct lockf *state, struct lockf_entry *unlock)
{
	struct lockf_entry *overlap;

	overlap = LIST_FIRST(&state->ls_active);

	/* No active locks at all: nothing to clear. */
	if (overlap == NOLOCKF)
		return (0);
#ifdef LOCKF_DEBUG
	if (unlock->lf_type != F_UNLCK)
		panic("lf_clearlock: bad type");
	if (lockf_debug & 1)
		lf_print("lf_clearlock", unlock);
#endif /* LOCKF_DEBUG */

	lf_activate_lock(state, unlock);

	return (0);
}

/*
 * Check whether there is a blocking lock, and if so return its
 * details in '*fl'.
1565 */ 1566 static int 1567 lf_getlock(struct lockf *state, struct lockf_entry *lock, struct flock *fl) 1568 { 1569 struct lockf_entry *block; 1570 1571 #ifdef LOCKF_DEBUG 1572 if (lockf_debug & 1) 1573 lf_print("lf_getlock", lock); 1574 #endif /* LOCKF_DEBUG */ 1575 1576 if ((block = lf_getblock(state, lock))) { 1577 fl->l_type = block->lf_type; 1578 fl->l_whence = SEEK_SET; 1579 fl->l_start = block->lf_start; 1580 if (block->lf_end == OFF_MAX) 1581 fl->l_len = 0; 1582 else 1583 fl->l_len = block->lf_end - block->lf_start + 1; 1584 fl->l_pid = block->lf_owner->lo_pid; 1585 fl->l_sysid = block->lf_owner->lo_sysid; 1586 } else { 1587 fl->l_type = F_UNLCK; 1588 } 1589 return (0); 1590 } 1591 1592 /* 1593 * Cancel an async lock request. 1594 */ 1595 static int 1596 lf_cancel(struct lockf *state, struct lockf_entry *lock, void *cookie) 1597 { 1598 struct lockf_entry *reallock; 1599 1600 /* 1601 * We need to match this request with an existing lock 1602 * request. 1603 */ 1604 LIST_FOREACH(reallock, &state->ls_pending, lf_link) { 1605 if ((void *) reallock == cookie) { 1606 /* 1607 * Double-check that this lock looks right 1608 * (maybe use a rolling ID for the cancel 1609 * cookie instead?) 1610 */ 1611 if (!(reallock->lf_vnode == lock->lf_vnode 1612 && reallock->lf_start == lock->lf_start 1613 && reallock->lf_end == lock->lf_end)) { 1614 return (ENOENT); 1615 } 1616 1617 /* 1618 * Make sure this lock was async and then just 1619 * remove it from its wait lists. 1620 */ 1621 if (!reallock->lf_async_task) { 1622 return (ENOENT); 1623 } 1624 1625 /* 1626 * Note that since any other thread must take 1627 * state->ls_lock before it can possibly 1628 * trigger the async callback, we are safe 1629 * from a race with lf_wakeup_lock, i.e. we 1630 * can free the lock (actually our caller does 1631 * this). 1632 */ 1633 lf_cancel_lock(state, reallock); 1634 return (0); 1635 } 1636 } 1637 1638 /* 1639 * We didn't find a matching lock - not much we can do here. 
1640 */ 1641 return (ENOENT); 1642 } 1643 1644 /* 1645 * Walk the list of locks for an inode and 1646 * return the first blocking lock. 1647 */ 1648 static struct lockf_entry * 1649 lf_getblock(struct lockf *state, struct lockf_entry *lock) 1650 { 1651 struct lockf_entry *overlap; 1652 1653 LIST_FOREACH(overlap, &state->ls_active, lf_link) { 1654 /* 1655 * We may assume that the active list is sorted by 1656 * lf_start. 1657 */ 1658 if (overlap->lf_start > lock->lf_end) 1659 break; 1660 if (!lf_blocks(lock, overlap)) 1661 continue; 1662 return (overlap); 1663 } 1664 return (NOLOCKF); 1665 } 1666 1667 /* 1668 * Walk the list of locks for an inode to find an overlapping lock (if 1669 * any) and return a classification of that overlap. 1670 * 1671 * Arguments: 1672 * *overlap The place in the lock list to start looking 1673 * lock The lock which is being tested 1674 * type Pass 'SELF' to test only locks with the same 1675 * owner as lock, or 'OTHER' to test only locks 1676 * with a different owner 1677 * 1678 * Returns one of six values: 1679 * 0) no overlap 1680 * 1) overlap == lock 1681 * 2) overlap contains lock 1682 * 3) lock contains overlap 1683 * 4) overlap starts before lock 1684 * 5) overlap ends after lock 1685 * 1686 * If there is an overlapping lock, '*overlap' is set to point at the 1687 * overlapping lock. 1688 * 1689 * NOTE: this returns only the FIRST overlapping lock. There 1690 * may be more than one. 
1691 */ 1692 static int 1693 lf_findoverlap(struct lockf_entry **overlap, struct lockf_entry *lock, int type) 1694 { 1695 struct lockf_entry *lf; 1696 off_t start, end; 1697 int res; 1698 1699 if ((*overlap) == NOLOCKF) { 1700 return (0); 1701 } 1702 #ifdef LOCKF_DEBUG 1703 if (lockf_debug & 2) 1704 lf_print("lf_findoverlap: looking for overlap in", lock); 1705 #endif /* LOCKF_DEBUG */ 1706 start = lock->lf_start; 1707 end = lock->lf_end; 1708 res = 0; 1709 while (*overlap) { 1710 lf = *overlap; 1711 if (lf->lf_start > end) 1712 break; 1713 if (((type & SELF) && lf->lf_owner != lock->lf_owner) || 1714 ((type & OTHERS) && lf->lf_owner == lock->lf_owner)) { 1715 *overlap = LIST_NEXT(lf, lf_link); 1716 continue; 1717 } 1718 #ifdef LOCKF_DEBUG 1719 if (lockf_debug & 2) 1720 lf_print("\tchecking", lf); 1721 #endif /* LOCKF_DEBUG */ 1722 /* 1723 * OK, check for overlap 1724 * 1725 * Six cases: 1726 * 0) no overlap 1727 * 1) overlap == lock 1728 * 2) overlap contains lock 1729 * 3) lock contains overlap 1730 * 4) overlap starts before lock 1731 * 5) overlap ends after lock 1732 */ 1733 if (start > lf->lf_end) { 1734 /* Case 0 */ 1735 #ifdef LOCKF_DEBUG 1736 if (lockf_debug & 2) 1737 printf("no overlap\n"); 1738 #endif /* LOCKF_DEBUG */ 1739 *overlap = LIST_NEXT(lf, lf_link); 1740 continue; 1741 } 1742 if (lf->lf_start == start && lf->lf_end == end) { 1743 /* Case 1 */ 1744 #ifdef LOCKF_DEBUG 1745 if (lockf_debug & 2) 1746 printf("overlap == lock\n"); 1747 #endif /* LOCKF_DEBUG */ 1748 res = 1; 1749 break; 1750 } 1751 if (lf->lf_start <= start && lf->lf_end >= end) { 1752 /* Case 2 */ 1753 #ifdef LOCKF_DEBUG 1754 if (lockf_debug & 2) 1755 printf("overlap contains lock\n"); 1756 #endif /* LOCKF_DEBUG */ 1757 res = 2; 1758 break; 1759 } 1760 if (start <= lf->lf_start && end >= lf->lf_end) { 1761 /* Case 3 */ 1762 #ifdef LOCKF_DEBUG 1763 if (lockf_debug & 2) 1764 printf("lock contains overlap\n"); 1765 #endif /* LOCKF_DEBUG */ 1766 res = 3; 1767 break; 1768 } 1769 if 
(lf->lf_start < start && lf->lf_end >= start) { 1770 /* Case 4 */ 1771 #ifdef LOCKF_DEBUG 1772 if (lockf_debug & 2) 1773 printf("overlap starts before lock\n"); 1774 #endif /* LOCKF_DEBUG */ 1775 res = 4; 1776 break; 1777 } 1778 if (lf->lf_start > start && lf->lf_end > end) { 1779 /* Case 5 */ 1780 #ifdef LOCKF_DEBUG 1781 if (lockf_debug & 2) 1782 printf("overlap ends after lock\n"); 1783 #endif /* LOCKF_DEBUG */ 1784 res = 5; 1785 break; 1786 } 1787 panic("lf_findoverlap: default"); 1788 } 1789 return (res); 1790 } 1791 1792 /* 1793 * Split an the existing 'lock1', based on the extent of the lock 1794 * described by 'lock2'. The existing lock should cover 'lock2' 1795 * entirely. 1796 * 1797 * Any pending locks which have been been unblocked are added to 1798 * 'granted' 1799 */ 1800 static void 1801 lf_split(struct lockf *state, struct lockf_entry *lock1, 1802 struct lockf_entry *lock2, struct lockf_entry_list *granted) 1803 { 1804 struct lockf_entry *splitlock; 1805 1806 #ifdef LOCKF_DEBUG 1807 if (lockf_debug & 2) { 1808 lf_print("lf_split", lock1); 1809 lf_print("splitting from", lock2); 1810 } 1811 #endif /* LOCKF_DEBUG */ 1812 /* 1813 * Check to see if we don't need to split at all. 1814 */ 1815 if (lock1->lf_start == lock2->lf_start) { 1816 lf_set_start(state, lock1, lock2->lf_end + 1, granted); 1817 return; 1818 } 1819 if (lock1->lf_end == lock2->lf_end) { 1820 lf_set_end(state, lock1, lock2->lf_start - 1, granted); 1821 return; 1822 } 1823 /* 1824 * Make a new lock consisting of the last part of 1825 * the encompassing lock. 1826 */ 1827 splitlock = lf_alloc_lock(lock1->lf_owner); 1828 memcpy(splitlock, lock1, sizeof *splitlock); 1829 splitlock->lf_refs = 1; 1830 if (splitlock->lf_flags & F_REMOTE) 1831 vref(splitlock->lf_vnode); 1832 1833 /* 1834 * This cannot cause a deadlock since any edges we would add 1835 * to splitlock already exist in lock1. 
We must be sure to add 1836 * necessary dependencies to splitlock before we reduce lock1 1837 * otherwise we may accidentally grant a pending lock that 1838 * was blocked by the tail end of lock1. 1839 */ 1840 splitlock->lf_start = lock2->lf_end + 1; 1841 LIST_INIT(&splitlock->lf_outedges); 1842 LIST_INIT(&splitlock->lf_inedges); 1843 lf_add_incoming(state, splitlock); 1844 1845 lf_set_end(state, lock1, lock2->lf_start - 1, granted); 1846 1847 /* 1848 * OK, now link it in 1849 */ 1850 lf_insert_lock(state, splitlock); 1851 } 1852 1853 struct lockdesc { 1854 STAILQ_ENTRY(lockdesc) link; 1855 struct vnode *vp; 1856 struct flock fl; 1857 }; 1858 STAILQ_HEAD(lockdesclist, lockdesc); 1859 1860 int 1861 lf_iteratelocks_sysid(int sysid, lf_iterator *fn, void *arg) 1862 { 1863 struct lockf *ls; 1864 struct lockf_entry *lf; 1865 struct lockdesc *ldesc; 1866 struct lockdesclist locks; 1867 int error; 1868 1869 /* 1870 * In order to keep the locking simple, we iterate over the 1871 * active lock lists to build a list of locks that need 1872 * releasing. We then call the iterator for each one in turn. 1873 * 1874 * We take an extra reference to the vnode for the duration to 1875 * make sure it doesn't go away before we are finished. 
1876 */ 1877 STAILQ_INIT(&locks); 1878 sx_xlock(&lf_lock_states_lock); 1879 LIST_FOREACH(ls, &lf_lock_states, ls_link) { 1880 sx_xlock(&ls->ls_lock); 1881 LIST_FOREACH(lf, &ls->ls_active, lf_link) { 1882 if (lf->lf_owner->lo_sysid != sysid) 1883 continue; 1884 1885 ldesc = malloc(sizeof(struct lockdesc), M_LOCKF, 1886 M_WAITOK); 1887 ldesc->vp = lf->lf_vnode; 1888 vref(ldesc->vp); 1889 ldesc->fl.l_start = lf->lf_start; 1890 if (lf->lf_end == OFF_MAX) 1891 ldesc->fl.l_len = 0; 1892 else 1893 ldesc->fl.l_len = 1894 lf->lf_end - lf->lf_start + 1; 1895 ldesc->fl.l_whence = SEEK_SET; 1896 ldesc->fl.l_type = F_UNLCK; 1897 ldesc->fl.l_pid = lf->lf_owner->lo_pid; 1898 ldesc->fl.l_sysid = sysid; 1899 STAILQ_INSERT_TAIL(&locks, ldesc, link); 1900 } 1901 sx_xunlock(&ls->ls_lock); 1902 } 1903 sx_xunlock(&lf_lock_states_lock); 1904 1905 /* 1906 * Call the iterator function for each lock in turn. If the 1907 * iterator returns an error code, just free the rest of the 1908 * lockdesc structures. 1909 */ 1910 error = 0; 1911 while ((ldesc = STAILQ_FIRST(&locks)) != NULL) { 1912 STAILQ_REMOVE_HEAD(&locks, link); 1913 if (!error) 1914 error = fn(ldesc->vp, &ldesc->fl, arg); 1915 vrele(ldesc->vp); 1916 free(ldesc, M_LOCKF); 1917 } 1918 1919 return (error); 1920 } 1921 1922 int 1923 lf_iteratelocks_vnode(struct vnode *vp, lf_iterator *fn, void *arg) 1924 { 1925 struct lockf *ls; 1926 struct lockf_entry *lf; 1927 struct lockdesc *ldesc; 1928 struct lockdesclist locks; 1929 int error; 1930 1931 /* 1932 * In order to keep the locking simple, we iterate over the 1933 * active lock lists to build a list of locks that need 1934 * releasing. We then call the iterator for each one in turn. 1935 * 1936 * We take an extra reference to the vnode for the duration to 1937 * make sure it doesn't go away before we are finished. 
1938 */ 1939 STAILQ_INIT(&locks); 1940 VI_LOCK(vp); 1941 ls = vp->v_lockf; 1942 if (!ls) { 1943 VI_UNLOCK(vp); 1944 return (0); 1945 } 1946 MPASS(ls->ls_threads >= 0); 1947 ls->ls_threads++; 1948 VI_UNLOCK(vp); 1949 1950 sx_xlock(&ls->ls_lock); 1951 LIST_FOREACH(lf, &ls->ls_active, lf_link) { 1952 ldesc = malloc(sizeof(struct lockdesc), M_LOCKF, 1953 M_WAITOK); 1954 ldesc->vp = lf->lf_vnode; 1955 vref(ldesc->vp); 1956 ldesc->fl.l_start = lf->lf_start; 1957 if (lf->lf_end == OFF_MAX) 1958 ldesc->fl.l_len = 0; 1959 else 1960 ldesc->fl.l_len = 1961 lf->lf_end - lf->lf_start + 1; 1962 ldesc->fl.l_whence = SEEK_SET; 1963 ldesc->fl.l_type = F_UNLCK; 1964 ldesc->fl.l_pid = lf->lf_owner->lo_pid; 1965 ldesc->fl.l_sysid = lf->lf_owner->lo_sysid; 1966 STAILQ_INSERT_TAIL(&locks, ldesc, link); 1967 } 1968 sx_xunlock(&ls->ls_lock); 1969 VI_LOCK(vp); 1970 MPASS(ls->ls_threads > 0); 1971 ls->ls_threads--; 1972 wakeup(ls); 1973 VI_UNLOCK(vp); 1974 1975 /* 1976 * Call the iterator function for each lock in turn. If the 1977 * iterator returns an error code, just free the rest of the 1978 * lockdesc structures. 
1979 */ 1980 error = 0; 1981 while ((ldesc = STAILQ_FIRST(&locks)) != NULL) { 1982 STAILQ_REMOVE_HEAD(&locks, link); 1983 if (!error) 1984 error = fn(ldesc->vp, &ldesc->fl, arg); 1985 vrele(ldesc->vp); 1986 free(ldesc, M_LOCKF); 1987 } 1988 1989 return (error); 1990 } 1991 1992 static int 1993 lf_clearremotesys_iterator(struct vnode *vp, struct flock *fl, void *arg) 1994 { 1995 1996 VOP_ADVLOCK(vp, 0, F_UNLCK, fl, F_REMOTE); 1997 return (0); 1998 } 1999 2000 void 2001 lf_clearremotesys(int sysid) 2002 { 2003 2004 KASSERT(sysid != 0, ("Can't clear local locks with F_UNLCKSYS")); 2005 lf_iteratelocks_sysid(sysid, lf_clearremotesys_iterator, NULL); 2006 } 2007 2008 int 2009 lf_countlocks(int sysid) 2010 { 2011 int i; 2012 struct lock_owner *lo; 2013 int count; 2014 2015 count = 0; 2016 for (i = 0; i < LOCK_OWNER_HASH_SIZE; i++) { 2017 sx_xlock(&lf_lock_owners[i].lock); 2018 LIST_FOREACH(lo, &lf_lock_owners[i].list, lo_link) 2019 if (lo->lo_sysid == sysid) 2020 count += lo->lo_refs; 2021 sx_xunlock(&lf_lock_owners[i].lock); 2022 } 2023 2024 return (count); 2025 } 2026 2027 #ifdef LOCKF_DEBUG 2028 2029 /* 2030 * Return non-zero if y is reachable from x using a brute force 2031 * search. If reachable and path is non-null, return the route taken 2032 * in path. 2033 */ 2034 static int 2035 graph_reaches(struct owner_vertex *x, struct owner_vertex *y, 2036 struct owner_vertex_list *path) 2037 { 2038 struct owner_edge *e; 2039 2040 if (x == y) { 2041 if (path) 2042 TAILQ_INSERT_HEAD(path, x, v_link); 2043 return 1; 2044 } 2045 2046 LIST_FOREACH(e, &x->v_outedges, e_outlink) { 2047 if (graph_reaches(e->e_to, y, path)) { 2048 if (path) 2049 TAILQ_INSERT_HEAD(path, x, v_link); 2050 return 1; 2051 } 2052 } 2053 return 0; 2054 } 2055 2056 /* 2057 * Perform consistency checks on the graph. Make sure the values of 2058 * v_order are correct. If checkorder is non-zero, check no vertex can 2059 * reach any other vertex with a smaller order. 
*/
static void
graph_check(struct owner_graph *g, int checkorder)
{
	int i, j;

	for (i = 0; i < g->g_size; i++) {
		/* Slots whose vertex has no owner are skipped. */
		if (!g->g_vertices[i]->v_owner)
			continue;
		KASSERT(g->g_vertices[i]->v_order == i,
		    ("lock graph vertices disordered"));
		if (checkorder) {
			for (j = 0; j < i; j++) {
				if (!g->g_vertices[j]->v_owner)
					continue;
				KASSERT(!graph_reaches(g->g_vertices[i],
					g->g_vertices[j], NULL),
				    ("lock graph vertices disordered"));
			}
		}
	}
}

/*
 * Print the vertices in 'set' as a brace-enclosed, comma-separated
 * list of "order:owner" entries.
 */
static void
graph_print_vertices(struct owner_vertex_list *set)
{
	struct owner_vertex *v;

	printf("{ ");
	TAILQ_FOREACH(v, set, v_link) {
		printf("%d:", v->v_order);
		lf_print_owner(v->v_owner);
		if (TAILQ_NEXT(v, v_link))
			printf(", ");
	}
	printf(" }\n");
}

#endif

/*
 * Calculate the sub-set of vertices v from the affected region [y..x]
 * where v is reachable from y.  Return -1 if a loop was detected
 * (i.e. x is reachable from y), otherwise the number of vertices in
 * this subset.
 */
static int
graph_delta_forward(struct owner_graph *g, struct owner_vertex *x,
    struct owner_vertex *y, struct owner_vertex_list *delta)
{
	uint32_t gen;
	struct owner_vertex *v;
	struct owner_edge *e;
	int n;

	/*
	 * We start with a set containing just y.  Then for each vertex
	 * v in the set so far unprocessed, we add each vertex that v
	 * has an out-edge to and that is within the affected region
	 * [y..x].  If we see the vertex x on our travels, stop
	 * immediately.
	 *
	 * 'delta' doubles as the work queue: new vertices are appended
	 * at the tail while 'v' advances from the head.  A vertex is
	 * stamped with the current generation when queued so that it
	 * is added at most once.
	 */
	TAILQ_INIT(delta);
	TAILQ_INSERT_TAIL(delta, y, v_link);
	v = y;
	n = 1;
	gen = g->g_gen;
	while (v) {
		LIST_FOREACH(e, &v->v_outedges, e_outlink) {
			if (e->e_to == x)
				return -1;
			if (e->e_to->v_order < x->v_order
			    && e->e_to->v_gen != gen) {
				e->e_to->v_gen = gen;
				TAILQ_INSERT_TAIL(delta, e->e_to, v_link);
				n++;
			}
		}
		v = TAILQ_NEXT(v, v_link);
	}

	return (n);
}

/*
 * Calculate the sub-set of vertices v from the affected region [y..x]
 * where v reaches x.  Return the number of vertices in this subset.
 */
static int
graph_delta_backward(struct owner_graph *g, struct owner_vertex *x,
    struct owner_vertex *y, struct owner_vertex_list *delta)
{
	uint32_t gen;
	struct owner_vertex *v;
	struct owner_edge *e;
	int n;

	/*
	 * We start with a set containing just x.  Then for each vertex
	 * v in the set so far unprocessed, we add each vertex that v
	 * has an in-edge from and that is within the affected region
	 * [y..x].
	 *
	 * This mirrors graph_delta_forward: new vertices are inserted
	 * at the head of 'delta' while 'v' walks from the tail towards
	 * the head.
	 */
	TAILQ_INIT(delta);
	TAILQ_INSERT_TAIL(delta, x, v_link);
	v = x;
	n = 1;
	gen = g->g_gen;
	while (v) {
		LIST_FOREACH(e, &v->v_inedges, e_inlink) {
			if (e->e_from->v_order > y->v_order
			    && e->e_from->v_gen != gen) {
				e->e_from->v_gen = gen;
				TAILQ_INSERT_HEAD(delta, e->e_from, v_link);
				n++;
			}
		}
		v = TAILQ_PREV(v, owner_vertex_list, v_link);
	}

	return (n);
}

/*
 * Insert the v_order of every vertex in 'set' into the array
 * 'indices', which already holds 'n' entries.  Each value is placed
 * by insertion sort, so if the first 'n' entries were in ascending
 * order on entry the whole array is ascending on return.  Returns the
 * new number of entries.
 */
static int
graph_add_indices(int *indices, int n, struct owner_vertex_list *set)
{
	struct owner_vertex *v;
	int i, j;

	TAILQ_FOREACH(v, set, v_link) {
		/* Find the insertion point, scanning down from the end. */
		for (i = n;
		     i > 0 && indices[i - 1] > v->v_order; i--)
			;
		/* Shift the larger entries up by one slot. */
		for (j = n - 1; j >= i; j--)
			indices[j + 1] = indices[j];
		indices[i] = v->v_order;
		n++;
	}

	return (n);
}

/*
 * Empty 'set', giving its vertices new order values taken from
 * 'indices' starting at position 'nextunused'.  Vertices are taken in
 * increasing order of their current v_order, so their relative
 * ordering is preserved.  g_vertices is updated to match.  Returns
 * the position in 'indices' following the last one used.
 */
static int
graph_assign_indices(struct owner_graph *g, int *indices, int nextunused,
    struct owner_vertex_list *set)
{
	struct owner_vertex *v, *vlowest;

	while (!TAILQ_EMPTY(set)) {
		/* Selection step: find the lowest-ordered vertex left. */
		vlowest = NULL;
		TAILQ_FOREACH(v, set, v_link) {
			if (!vlowest || v->v_order < vlowest->v_order)
				vlowest = v;
		}
		TAILQ_REMOVE(set, vlowest, v_link);
		vlowest->v_order = indices[nextunused];
		g->g_vertices[vlowest->v_order] = vlowest;
		nextunused++;
	}

	return (nextunused);
}

/*
 * Add an edge x->y to the owner graph.  The caller must hold
 * lf_owner_graph_lock exclusively.
 *
 * If the edge already exists, only its reference count is bumped.
 * Otherwise, if y is currently ordered before x, the affected
 * vertices are re-ordered so that x comes before y; if that is
 * impossible because x is reachable from y, the edge would create a
 * cycle and EDEADLK is returned with no edge added.  Returns 0 on
 * success.
 */
static int
graph_add_edge(struct owner_graph *g, struct owner_vertex *x,
    struct owner_vertex *y)
{
	struct owner_edge *e;
	struct owner_vertex_list deltaF, deltaB;
	int nF, n, vi, i;
	int *indices;
	int nB __unused;	/* only read by the LOCKF_DEBUG code below */

	sx_assert(&lf_owner_graph_lock, SX_XLOCKED);

	LIST_FOREACH(e, &x->v_outedges, e_outlink) {
		if (e->e_to == y) {
			e->e_refs++;
			return (0);
		}
	}

#ifdef LOCKF_DEBUG
	if (lockf_debug & 8) {
		printf("adding edge %d:", x->v_order);
		lf_print_owner(x->v_owner);
		printf(" -> %d:", y->v_order);
		lf_print_owner(y->v_owner);
		printf("\n");
	}
#endif
	if (y->v_order < x->v_order) {
		/*
		 * The new edge violates the order.  First find the set
		 * of affected vertices reachable from y (deltaF) and
		 * the set of affected vertices that reach x
		 * (deltaB), using the graph generation number to
		 * detect whether we have visited a given vertex
		 * already.  We re-order the graph so that each vertex
		 * in deltaB appears before each vertex in deltaF.
		 *
		 * If x is a member of deltaF, then the new edge would
		 * create a cycle.  Otherwise, we may assume that
		 * deltaF and deltaB are disjoint.
		 */
		g->g_gen++;
		if (g->g_gen == 0) {
			/*
			 * Generation wrap.  Reset every vertex's stamp
			 * and move on to generation 1 so that no stale
			 * stamp can match the current generation.
			 */
			for (vi = 0; vi < g->g_size; vi++) {
				g->g_vertices[vi]->v_gen = 0;
			}
			g->g_gen++;
		}
		nF = graph_delta_forward(g, x, y, &deltaF);
		if (nF < 0) {
#ifdef LOCKF_DEBUG
			if (lockf_debug & 8) {
				struct owner_vertex_list path;
				printf("deadlock: ");
				TAILQ_INIT(&path);
				graph_reaches(y, x, &path);
				graph_print_vertices(&path);
			}
#endif
			return (EDEADLK);
		}

#ifdef LOCKF_DEBUG
		if (lockf_debug & 8) {
			printf("re-ordering graph vertices\n");
			printf("deltaF = ");
			graph_print_vertices(&deltaF);
		}
#endif

		nB = graph_delta_backward(g, x, y, &deltaB);

#ifdef LOCKF_DEBUG
		if (lockf_debug & 8) {
			printf("deltaB = ");
			graph_print_vertices(&deltaB);
		}
#endif

		/*
		 * We first build a set of vertex indices (vertex
		 * order values) that we may use, then we re-assign
		 * orders first to those vertices in deltaB, then to
		 * deltaF.  Note that the contents of deltaF and deltaB
		 * may be partially disordered - we perform an
		 * insertion sort while building our index set.
		 */
		indices = g->g_indexbuf;
		n = graph_add_indices(indices, 0, &deltaF);
		graph_add_indices(indices, n, &deltaB);

		/*
		 * We must also be sure to maintain the relative
		 * ordering of deltaF and deltaB when re-assigning
		 * vertices.  We do this by iteratively removing the
		 * lowest ordered element from the set and assigning
		 * it the next value from our new ordering.
		 */
		i = graph_assign_indices(g, indices, 0, &deltaB);
		graph_assign_indices(g, indices, i, &deltaF);

#ifdef LOCKF_DEBUG
		if (lockf_debug & 8) {
			struct owner_vertex_list set;
			TAILQ_INIT(&set);
			for (i = 0; i < nB + nF; i++)
				TAILQ_INSERT_TAIL(&set,
				    g->g_vertices[indices[i]], v_link);
			printf("new ordering = ");
			graph_print_vertices(&set);
		}
#endif
	}

	KASSERT(x->v_order < y->v_order, ("Failed to re-order graph"));

#ifdef LOCKF_DEBUG
	if (lockf_debug & 8) {
		graph_check(g, TRUE);
	}
#endif

	e = malloc(sizeof(struct owner_edge), M_LOCKF, M_WAITOK);

	LIST_INSERT_HEAD(&x->v_outedges, e, e_outlink);
	LIST_INSERT_HEAD(&y->v_inedges, e, e_inlink);
	e->e_refs = 1;
	e->e_from = x;
	e->e_to = y;

	return (0);
}

/*
 * Remove an edge x->y from the graph.
2362 */ 2363 static void 2364 graph_remove_edge(struct owner_graph *g, struct owner_vertex *x, 2365 struct owner_vertex *y) 2366 { 2367 struct owner_edge *e; 2368 2369 sx_assert(&lf_owner_graph_lock, SX_XLOCKED); 2370 2371 LIST_FOREACH(e, &x->v_outedges, e_outlink) { 2372 if (e->e_to == y) 2373 break; 2374 } 2375 KASSERT(e, ("Removing non-existent edge from deadlock graph")); 2376 2377 e->e_refs--; 2378 if (e->e_refs == 0) { 2379 #ifdef LOCKF_DEBUG 2380 if (lockf_debug & 8) { 2381 printf("removing edge %d:", x->v_order); 2382 lf_print_owner(x->v_owner); 2383 printf(" -> %d:", y->v_order); 2384 lf_print_owner(y->v_owner); 2385 printf("\n"); 2386 } 2387 #endif 2388 LIST_REMOVE(e, e_outlink); 2389 LIST_REMOVE(e, e_inlink); 2390 free(e, M_LOCKF); 2391 } 2392 } 2393 2394 /* 2395 * Allocate a vertex from the free list. Return ENOMEM if there are 2396 * none. 2397 */ 2398 static struct owner_vertex * 2399 graph_alloc_vertex(struct owner_graph *g, struct lock_owner *lo) 2400 { 2401 struct owner_vertex *v; 2402 2403 sx_assert(&lf_owner_graph_lock, SX_XLOCKED); 2404 2405 v = malloc(sizeof(struct owner_vertex), M_LOCKF, M_WAITOK); 2406 if (g->g_size == g->g_space) { 2407 g->g_vertices = realloc(g->g_vertices, 2408 2 * g->g_space * sizeof(struct owner_vertex *), 2409 M_LOCKF, M_WAITOK); 2410 free(g->g_indexbuf, M_LOCKF); 2411 g->g_indexbuf = malloc(2 * g->g_space * sizeof(int), 2412 M_LOCKF, M_WAITOK); 2413 g->g_space = 2 * g->g_space; 2414 } 2415 v->v_order = g->g_size; 2416 v->v_gen = g->g_gen; 2417 g->g_vertices[g->g_size] = v; 2418 g->g_size++; 2419 2420 LIST_INIT(&v->v_outedges); 2421 LIST_INIT(&v->v_inedges); 2422 v->v_owner = lo; 2423 2424 return (v); 2425 } 2426 2427 static void 2428 graph_free_vertex(struct owner_graph *g, struct owner_vertex *v) 2429 { 2430 struct owner_vertex *w; 2431 int i; 2432 2433 sx_assert(&lf_owner_graph_lock, SX_XLOCKED); 2434 2435 KASSERT(LIST_EMPTY(&v->v_outedges), ("Freeing vertex with edges")); 2436 KASSERT(LIST_EMPTY(&v->v_inedges), 
("Freeing vertex with edges")); 2437 2438 /* 2439 * Remove from the graph's array and close up the gap, 2440 * renumbering the other vertices. 2441 */ 2442 for (i = v->v_order + 1; i < g->g_size; i++) { 2443 w = g->g_vertices[i]; 2444 w->v_order--; 2445 g->g_vertices[i - 1] = w; 2446 } 2447 g->g_size--; 2448 2449 free(v, M_LOCKF); 2450 } 2451 2452 static struct owner_graph * 2453 graph_init(struct owner_graph *g) 2454 { 2455 2456 g->g_vertices = malloc(10 * sizeof(struct owner_vertex *), 2457 M_LOCKF, M_WAITOK); 2458 g->g_size = 0; 2459 g->g_space = 10; 2460 g->g_indexbuf = malloc(g->g_space * sizeof(int), M_LOCKF, M_WAITOK); 2461 g->g_gen = 0; 2462 2463 return (g); 2464 } 2465 2466 struct kinfo_lockf_linked { 2467 struct kinfo_lockf kl; 2468 struct vnode *vp; 2469 STAILQ_ENTRY(kinfo_lockf_linked) link; 2470 }; 2471 2472 int 2473 vfs_report_lockf(struct mount *mp, struct sbuf *sb) 2474 { 2475 struct lockf *ls; 2476 struct lockf_entry *lf; 2477 struct kinfo_lockf_linked *klf; 2478 struct vnode *vp; 2479 struct ucred *ucred; 2480 char *fullpath, *freepath; 2481 struct stat stt; 2482 fsid_t fsidx; 2483 STAILQ_HEAD(, kinfo_lockf_linked) locks; 2484 int error, gerror; 2485 2486 STAILQ_INIT(&locks); 2487 sx_slock(&lf_lock_states_lock); 2488 LIST_FOREACH(ls, &lf_lock_states, ls_link) { 2489 sx_slock(&ls->ls_lock); 2490 LIST_FOREACH(lf, &ls->ls_active, lf_link) { 2491 vp = lf->lf_vnode; 2492 if (VN_IS_DOOMED(vp) || vp->v_mount != mp) 2493 continue; 2494 vhold(vp); 2495 klf = malloc(sizeof(struct kinfo_lockf_linked), 2496 M_LOCKF, M_WAITOK | M_ZERO); 2497 klf->vp = vp; 2498 klf->kl.kl_structsize = sizeof(struct kinfo_lockf); 2499 klf->kl.kl_start = lf->lf_start; 2500 klf->kl.kl_len = lf->lf_end == OFF_MAX ? 0 : 2501 lf->lf_end - lf->lf_start + 1; 2502 klf->kl.kl_rw = lf->lf_type == F_RDLCK ? 
2503 KLOCKF_RW_READ : KLOCKF_RW_WRITE; 2504 if (lf->lf_owner->lo_sysid != 0) { 2505 klf->kl.kl_pid = lf->lf_owner->lo_pid; 2506 klf->kl.kl_sysid = lf->lf_owner->lo_sysid; 2507 klf->kl.kl_type = KLOCKF_TYPE_REMOTE; 2508 } else if (lf->lf_owner->lo_pid == -1) { 2509 klf->kl.kl_pid = -1; 2510 klf->kl.kl_sysid = 0; 2511 klf->kl.kl_type = KLOCKF_TYPE_FLOCK; 2512 } else { 2513 klf->kl.kl_pid = lf->lf_owner->lo_pid; 2514 klf->kl.kl_sysid = 0; 2515 klf->kl.kl_type = KLOCKF_TYPE_PID; 2516 } 2517 STAILQ_INSERT_TAIL(&locks, klf, link); 2518 } 2519 sx_sunlock(&ls->ls_lock); 2520 } 2521 sx_sunlock(&lf_lock_states_lock); 2522 2523 gerror = 0; 2524 ucred = curthread->td_ucred; 2525 fsidx = mp->mnt_stat.f_fsid; 2526 while ((klf = STAILQ_FIRST(&locks)) != NULL) { 2527 STAILQ_REMOVE_HEAD(&locks, link); 2528 vp = klf->vp; 2529 if (gerror == 0 && vn_lock(vp, LK_SHARED) == 0) { 2530 error = prison_canseemount(ucred, vp->v_mount); 2531 if (error == 0) 2532 error = VOP_STAT(vp, &stt, ucred, NOCRED); 2533 VOP_UNLOCK(vp); 2534 if (error == 0) { 2535 memcpy(&klf->kl.kl_file_fsid, &fsidx, 2536 sizeof(fsidx)); 2537 klf->kl.kl_file_rdev = stt.st_rdev; 2538 klf->kl.kl_file_fileid = stt.st_ino; 2539 freepath = NULL; 2540 fullpath = "-"; 2541 error = vn_fullpath(vp, &fullpath, &freepath); 2542 if (error == 0) 2543 strlcpy(klf->kl.kl_path, fullpath, 2544 sizeof(klf->kl.kl_path)); 2545 free(freepath, M_TEMP); 2546 if (sbuf_bcat(sb, &klf->kl, 2547 klf->kl.kl_structsize) != 0) { 2548 gerror = sbuf_error(sb); 2549 } 2550 } 2551 } 2552 vdrop(vp); 2553 free(klf, M_LOCKF); 2554 } 2555 2556 return (gerror); 2557 } 2558 2559 static int 2560 sysctl_kern_lockf_run(struct sbuf *sb) 2561 { 2562 struct mount *mp; 2563 int error; 2564 2565 error = 0; 2566 mtx_lock(&mountlist_mtx); 2567 TAILQ_FOREACH(mp, &mountlist, mnt_list) { 2568 error = vfs_busy(mp, MBF_MNTLSTLOCK); 2569 if (error != 0) 2570 continue; 2571 error = mp->mnt_op->vfs_report_lockf(mp, sb); 2572 mtx_lock(&mountlist_mtx); 2573 vfs_unbusy(mp); 2574 
if (error != 0) 2575 break; 2576 } 2577 mtx_unlock(&mountlist_mtx); 2578 return (error); 2579 } 2580 2581 static int 2582 sysctl_kern_lockf(SYSCTL_HANDLER_ARGS) 2583 { 2584 struct sbuf sb; 2585 int error, error2; 2586 2587 sbuf_new_for_sysctl(&sb, NULL, sizeof(struct kinfo_lockf) * 5, req); 2588 sbuf_clear_flags(&sb, SBUF_INCLUDENUL); 2589 error = sysctl_kern_lockf_run(&sb); 2590 error2 = sbuf_finish(&sb); 2591 sbuf_delete(&sb); 2592 return (error != 0 ? error : error2); 2593 } 2594 SYSCTL_PROC(_kern, KERN_LOCKF, lockf, 2595 CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_MPSAFE, 2596 0, 0, sysctl_kern_lockf, "S,lockf", 2597 "Advisory locks table"); 2598 2599 #ifdef LOCKF_DEBUG 2600 /* 2601 * Print description of a lock owner 2602 */ 2603 static void 2604 lf_print_owner(struct lock_owner *lo) 2605 { 2606 2607 if (lo->lo_flags & F_REMOTE) { 2608 printf("remote pid %d, system %d", 2609 lo->lo_pid, lo->lo_sysid); 2610 } else if (lo->lo_flags & F_FLOCK) { 2611 printf("file %p", lo->lo_id); 2612 } else { 2613 printf("local pid %d", lo->lo_pid); 2614 } 2615 } 2616 2617 /* 2618 * Print out a lock. 2619 */ 2620 static void 2621 lf_print(char *tag, struct lockf_entry *lock) 2622 { 2623 2624 printf("%s: lock %p for ", tag, (void *)lock); 2625 lf_print_owner(lock->lf_owner); 2626 printf("\nvnode %p", lock->lf_vnode); 2627 VOP_PRINT(lock->lf_vnode); 2628 printf(" %s, start %jd, end ", 2629 lock->lf_type == F_RDLCK ? "shared" : 2630 lock->lf_type == F_WRLCK ? "exclusive" : 2631 lock->lf_type == F_UNLCK ? 
"unlock" : "unknown", 2632 (intmax_t)lock->lf_start); 2633 if (lock->lf_end == OFF_MAX) 2634 printf("EOF"); 2635 else 2636 printf("%jd", (intmax_t)lock->lf_end); 2637 if (!LIST_EMPTY(&lock->lf_outedges)) 2638 printf(" block %p\n", 2639 (void *)LIST_FIRST(&lock->lf_outedges)->le_to); 2640 else 2641 printf("\n"); 2642 } 2643 2644 static void 2645 lf_printlist(char *tag, struct lockf_entry *lock) 2646 { 2647 struct lockf_entry *lf, *blk; 2648 struct lockf_edge *e; 2649 2650 printf("%s: Lock list for vnode %p:\n", tag, lock->lf_vnode); 2651 LIST_FOREACH(lf, &lock->lf_vnode->v_lockf->ls_active, lf_link) { 2652 printf("\tlock %p for ",(void *)lf); 2653 lf_print_owner(lock->lf_owner); 2654 printf(", %s, start %jd, end %jd", 2655 lf->lf_type == F_RDLCK ? "shared" : 2656 lf->lf_type == F_WRLCK ? "exclusive" : 2657 lf->lf_type == F_UNLCK ? "unlock" : 2658 "unknown", (intmax_t)lf->lf_start, (intmax_t)lf->lf_end); 2659 LIST_FOREACH(e, &lf->lf_outedges, le_outlink) { 2660 blk = e->le_to; 2661 printf("\n\t\tlock request %p for ", (void *)blk); 2662 lf_print_owner(blk->lf_owner); 2663 printf(", %s, start %jd, end %jd", 2664 blk->lf_type == F_RDLCK ? "shared" : 2665 blk->lf_type == F_WRLCK ? "exclusive" : 2666 blk->lf_type == F_UNLCK ? "unlock" : 2667 "unknown", (intmax_t)blk->lf_start, 2668 (intmax_t)blk->lf_end); 2669 if (!LIST_EMPTY(&blk->lf_inedges)) 2670 panic("lf_printlist: bad list"); 2671 } 2672 printf("\n"); 2673 } 2674 } 2675 #endif /* LOCKF_DEBUG */ 2676