1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License, Version 1.0 only 6 * (the "License"). You may not use this file except in compliance 7 * with the License. 8 * 9 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 10 * or http://www.opensolaris.org/os/licensing. 11 * See the License for the specific language governing permissions 12 * and limitations under the License. 13 * 14 * When distributing Covered Code, include this CDDL HEADER in each 15 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 16 * If applicable, add the following below this CDDL HEADER, with the 17 * fields enclosed by brackets "[]" replaced with your own identifying 18 * information: Portions Copyright [yyyy] [name of copyright owner] 19 * 20 * CDDL HEADER END 21 */ 22 /* 23 * Copyright 2004 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 * Copyright 2015 Joyent, Inc. 
 */

#ifndef _SYS_FLOCK_IMPL_H
#define _SYS_FLOCK_IMPL_H

/*
 * Implementation-private definitions for the kernel's local lock manager
 * (LLM), which implements fcntl()/flock() byte-range file locking.  Locks
 * on each vnode are kept in per-hash-bucket "graphs"; blocked requests are
 * edges in a dependency graph used for deadlock detection.  Everything in
 * this header is private to the lock manager and must not be used
 * externally (see the l_state compatibility note below).
 */

#include <sys/types.h>
#include <sys/fcntl.h>		/* flock definition */
#include <sys/file.h>		/* FREAD etc */
#include <sys/flock.h>		/* RCMD etc */
#include <sys/kmem.h>
#include <sys/user.h>
#include <sys/thread.h>
#include <sys/proc.h>
#include <sys/cred.h>
#include <sys/debug.h>
#include <sys/cmn_err.h>
#include <sys/errno.h>
#include <sys/systm.h>
#include <sys/vnode.h>
#include <sys/share.h>		/* just to get GETSYSID def */

#ifdef __cplusplus
extern "C" {
#endif

/*
 * A directed edge in the lock dependency graph: from_vertex is blocked by
 * to_vertex.  Each edge sits on two circular doubly-linked lists: the
 * adjacency (outgoing) list of from_vertex and the incoming list of
 * to_vertex.
 */
struct edge {
	struct edge *edge_adj_next;	/* adjacency list next */
	struct edge *edge_adj_prev;	/* adjacency list prev */
	struct edge *edge_in_next;	/* incoming edges list next */
	struct edge *edge_in_prev;	/* incoming edges list prev */
	struct lock_descriptor *from_vertex;	/* edge emanating from lock */
	struct lock_descriptor *to_vertex;	/* edge pointing to lock */
};

typedef struct edge edge_t;

/*
 * One lock request / held lock.  A descriptor is simultaneously a list
 * element (l_next/l_prev on its graph's active or sleeping list) and a
 * vertex in the dependency graph (l_edge serves as the dummy head of its
 * adjacency and incoming edge lists).
 */
struct lock_descriptor {
	struct lock_descriptor *l_next;	/* next active/sleep lock */
	struct lock_descriptor *l_prev;	/* previous active/sleep lock */
	struct edge l_edge;		/* edge for adj and in lists */
	struct lock_descriptor *l_stack;	/* for stack operations */
	struct lock_descriptor *l_stack1;	/* for stack operations */
	struct lock_descriptor *l_dstack;	/* stack for debug functions */
	struct edge *l_sedge;		/* start edge for graph alg. */
	int l_index;			/* used for barrier count */
	struct graph *l_graph;		/* graph this belongs to */
	vnode_t *l_vnode;		/* vnode being locked */
	int l_type;			/* type of lock */
	int l_state;			/* state described below */
	u_offset_t l_start;		/* start offset */
	u_offset_t l_end;		/* end offset */
	flock64_t l_flock;		/* original flock request */
	int l_color;			/* color used for graph alg */
	kcondvar_t l_cv;		/* wait condition for lock */
	int pvertex;			/* index to proc vertex */
	int l_status;			/* status described below */
	flk_nlm_status_t l_nlm_state;	/* state of NLM server */
	flk_callback_t *l_callbacks;	/* callbacks, or NULL */
	zoneid_t l_zoneid;		/* zone of request */
	file_t *l_ofd;			/* OFD-style reference */
};

typedef struct lock_descriptor lock_descriptor_t;

/*
 * Each graph holds locking information for some number of vnodes.  The
 * active and sleeping lists are circular, with a dummy head element.
 */

struct graph {
	kmutex_t gp_mutex;	/* mutex for this graph */
	struct lock_descriptor active_locks;
	struct lock_descriptor sleeping_locks;
	int index;	/* index of this graph into the hash table */
	int mark;	/* used for coloring the graph */
};

typedef struct graph graph_t;

/*
 * The possible states a lock can be in.  These states are stored in the
 * 'l_status' member of the 'lock_descriptor_t' structure.  All locks start
 * life in the INITIAL state, and end up in the DEAD state.  Possible state
 * transitions are :
 *
 * INITIAL--> START --> ACTIVE --> DEAD
 *
 *		     --> DEAD
 *
 *	  --> ACTIVE --> DEAD		(new locks from flk_relation)
 *
 *	  --> SLEEPING --> GRANTED --> START --> ACTIVE --> DEAD
 *
 *				   --> INTR --> DEAD
 *
 *				   --> CANCELLED --> DEAD
 *
 *			--> INTR --> DEAD
 *
 *		       --> INTR --> DEAD
 *
 *	  --> CANCELLED --> DEAD
 *
 *			--> INTR --> DEAD
 *
 * Lock transitions are done in the following functions:
 * --> INITIAL		flk_get_lock(), reclock()
 * --> START		flk_execute_request()
 * --> ACTIVE		flk_insert_active_lock()
 * --> SLEEPING		flk_insert_sleeping_lock()
 * --> GRANTED		GRANT_WAKEUP
 * --> INTERRUPTED	INTERRUPT_WAKEUP
 * --> CANCELLED	CANCEL_WAKEUP
 * --> DEAD		reclock(), flk_delete_active_lock(), and
 *			flk_cancel_sleeping_lock()
 */

#define FLK_INITIAL_STATE	1	/* Initial state of all requests */
#define FLK_START_STATE		2	/* Request has started execution */
#define FLK_ACTIVE_STATE	3	/* In active queue */
#define FLK_SLEEPING_STATE	4	/* Request is blocked */
#define FLK_GRANTED_STATE	5	/* Request is granted */
#define FLK_INTERRUPTED_STATE	6	/* Request is interrupted */
#define FLK_CANCELLED_STATE	7	/* Request is cancelled */
#define FLK_DEAD_STATE		8	/* Request is done - will be deleted */

/* flags defining state of locks */

/*
 * The LLM design has been modified so that lock states are now stored
 * in the l_status field of lock_descriptor_t.  The l_state field is
 * currently preserved for binary compatibility, but may be modified or
 * removed in a minor release of Solaris.  Note that both of these
 * fields (and the rest of the lock_descriptor_t structure) are private
 * to the implementation of the lock manager and should not be used
 * externally.
 */

#define ACTIVE_LOCK		0x0001	/* in active queue */
#define SLEEPING_LOCK		0x0002	/* in sleep queue */
#define IO_LOCK			0x0004	/* is an IO lock */
#define REFERENCED_LOCK		0x0008	/* referenced somewhere */
#define QUERY_LOCK		0x0010	/* querying about lock */
#define WILLING_TO_SLEEP_LOCK	0x0020	/* lock can be put in sleep queue */
#define RECOMPUTE_LOCK		0x0040	/* used for recomputing dependencies */
#define RECOMPUTE_DONE		0x0080	/* used for recomputing dependencies */
#define BARRIER_LOCK		0x0100	/* used for recomputing dependencies */
#define GRANTED_LOCK		0x0200	/* granted but still in sleep queue */
#define CANCELLED_LOCK		0x0400	/* cancelled will be thrown out */
#define DELETED_LOCK		0x0800	/* deleted - free at earliest */
#define INTERRUPTED_LOCK	0x1000	/* pretend signal */
#define LOCKMGR_LOCK		0x2000	/* remote lock (server-side) */
/* Clustering: flag for PXFS locks */
#define PXFS_LOCK		0x4000	/* lock created by PXFS file system */
#define NBMAND_LOCK		0x8000	/* non-blocking mandatory locking */

/*
 * Hashing vnodes into lock_graph[].  Despite its name, HASH_SHIFT is a
 * mask, not a shift count; HASH_SIZE must remain a power of two for
 * HASH_INDEX to work.  The >> 7 discards low address bits that are
 * constant due to vnode alignment.
 */
#define HASH_SIZE	32
#define HASH_SHIFT	(HASH_SIZE - 1)
#define HASH_INDEX(vp)	(((uintptr_t)vp >> 7) & HASH_SHIFT)

/* extern definitions */

extern struct graph *lock_graph[HASH_SIZE];
extern struct kmem_cache *flk_edge_cache;

/* Clustering: functions called by PXFS */
int flk_execute_request(lock_descriptor_t *);
void flk_cancel_sleeping_lock(lock_descriptor_t *, int);
void flk_set_state(lock_descriptor_t *, int);
graph_t *flk_get_lock_graph(vnode_t *, int);

/* flags used for readability in flock.c */

#define FLK_USE_GRAPH	0	/* don't initialize the lock_graph */
#define FLK_INIT_GRAPH	1	/* initialize the lock graph */
#define NO_COLOR	0	/* vertex is not colored */
#define NO_CHECK_CYCLE	0	/* don't mark vertex's in flk_add_edge */
#define CHECK_CYCLE	1	/* mark vertex's in flk_add_edge */

/*
 * Two locks have the same owner iff pid, sysid, and the OFD file_t
 * reference all match.  l_ofd is NULL for non-OFD (process-owned) locks,
 * so OFD-style locks never share an owner with traditional ones.
 */
#define SAME_OWNER(lock1, lock2) \
	(((lock1)->l_flock.l_pid == (lock2)->l_flock.l_pid) && \
	((lock1)->l_flock.l_sysid == (lock2)->l_flock.l_sysid) && \
	((lock1)->l_ofd == (lock2)->l_ofd))

/* graph coloring: a vertex is "visited" when its color matches the mark */
#define COLORED(vertex)	((vertex)->l_color == (vertex)->l_graph->mark)
#define COLOR(vertex)	((vertex)->l_color = (vertex)->l_graph->mark)

/*
 * stack data structure and operations
 * (intrusive: stack_link names the lock_descriptor member used as the link)
 */

#define STACK_INIT(stack)	((stack) = NULL)
#define STACK_PUSH(stack, ptr, stack_link)	(ptr)->stack_link = (stack),\
				(stack) = (ptr)
#define STACK_POP(stack, stack_link)	(stack) = (stack)->stack_link
#define STACK_TOP(stack)	(stack)
#define STACK_EMPTY(stack)	((stack) == NULL)


#define ACTIVE_HEAD(gp)	(&(gp)->active_locks)

#define SLEEPING_HEAD(gp)	(&(gp)->sleeping_locks)

/*
 * Active locks for a vnode hang directly off vp->v_filocks, so 'gp' is
 * unused here; it is accepted only for symmetry with the sleep-queue
 * variant below, which must scan the graph's sleeping list.
 */
#define SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp) \
{ \
	(lock) = (lock_descriptor_t *)vp->v_filocks; \
}

/* find the first sleeping lock for 'vp', or NULL if there is none */
#define SET_LOCK_TO_FIRST_SLEEP_VP(gp, lock, vp) \
{ \
	for ((lock) = SLEEPING_HEAD((gp))->l_next; ((lock) != SLEEPING_HEAD((gp)) && \
			(lock)->l_vnode != (vp)); (lock) = (lock)->l_next) \
		; \
	(lock) = ((lock) == SLEEPING_HEAD((gp))) ? NULL : (lock); \
}

/* byte ranges [l_start, l_end] intersect (both endpoints inclusive) */
#define OVERLAP(lock1, lock2) \
	(((lock1)->l_start <= (lock2)->l_start && \
		(lock2)->l_start <= (lock1)->l_end) || \
	((lock2)->l_start <= (lock1)->l_start && \
		(lock1)->l_start <= (lock2)->l_end))

#define IS_INITIAL(lock)	((lock)->l_status == FLK_INITIAL_STATE)
#define IS_ACTIVE(lock)		((lock)->l_status == FLK_ACTIVE_STATE)
#define IS_SLEEPING(lock)	((lock)->l_status == FLK_SLEEPING_STATE)
#define IS_GRANTED(lock)	((lock)->l_status == FLK_GRANTED_STATE)
#define IS_INTERRUPTED(lock)	((lock)->l_status == FLK_INTERRUPTED_STATE)
#define IS_CANCELLED(lock)	((lock)->l_status == FLK_CANCELLED_STATE)
#define IS_DEAD(lock)		((lock)->l_status == FLK_DEAD_STATE)

#define IS_QUERY_LOCK(lock)	((lock)->l_state & QUERY_LOCK)
#define IS_RECOMPUTE(lock)	((lock)->l_state & RECOMPUTE_LOCK)
#define IS_BARRIER(lock)	((lock)->l_state & BARRIER_LOCK)
#define IS_DELETED(lock)	((lock)->l_state & DELETED_LOCK)
#define IS_REFERENCED(lock)	((lock)->l_state & REFERENCED_LOCK)
#define IS_IO_LOCK(lock)	((lock)->l_state & IO_LOCK)
#define IS_WILLING_TO_SLEEP(lock)	\
		((lock)->l_state & WILLING_TO_SLEEP_LOCK)
#define IS_LOCKMGR(lock)	((lock)->l_state & LOCKMGR_LOCK)
#define IS_NLM_UP(lock)		((lock)->l_nlm_state == FLK_NLM_UP)
/* Clustering: Macro for PXFS locks */
#define IS_PXFS(lock)		((lock)->l_state & PXFS_LOCK)

/*
 * "local" requests don't involve the NFS lock manager in any way.
 * "remote" requests can be on the server (requests from a remote client),
 * in which case they should be associated with a local vnode (UFS, tmpfs,
 * etc.).  These requests are flagged with LOCKMGR_LOCK and are made using
 * kernel service threads.  Remote requests can also be on an NFS client,
 * because the NFS lock manager uses local locking for some of its
 * bookkeeping.  These requests are made by regular user processes.
 */
#define IS_LOCAL(lock)	(GETSYSID((lock)->l_flock.l_sysid) == 0)
#define IS_REMOTE(lock)	(! IS_LOCAL(lock))

/* Clustering: Return value for blocking PXFS locks */
/*
 * For PXFS locks, reclock() will return this error code for requests that
 * need to block
 */
#define PXFS_LOCK_BLOCKED -1

/* Clustering: PXFS callback function */
/*
 * This function is a callback from the LLM into the PXFS server module.  It
 * is initialized as a weak stub, and is functional when the pxfs server module
 * is loaded.
 */
extern void cl_flk_state_transition_notify(lock_descriptor_t *lock,
    int old_state, int new_state);

/*
 * lock1 blocks lock2 iff they have different owners, at least one is a
 * write lock, and their byte ranges overlap.
 */
#define BLOCKS(lock1, lock2)	(!SAME_OWNER((lock1), (lock2)) && \
					(((lock1)->l_type == F_WRLCK) || \
					((lock2)->l_type == F_WRLCK)) && \
				OVERLAP((lock1), (lock2)))

/* lock1's range entirely contains lock2's range */
#define COVERS(lock1, lock2)	\
		(((lock1)->l_start <= (lock2)->l_start) && \
			((lock1)->l_end >= (lock2)->l_end))

/* unlink an edge from its to_vertex's incoming list */
#define IN_LIST_REMOVE(ep)	\
{ \
	(ep)->edge_in_next->edge_in_prev = (ep)->edge_in_prev; \
	(ep)->edge_in_prev->edge_in_next = (ep)->edge_in_next; \
}

/* unlink an edge from its from_vertex's adjacency list */
#define ADJ_LIST_REMOVE(ep)	\
{ \
	(ep)->edge_adj_next->edge_adj_prev = (ep)->edge_adj_prev; \
	(ep)->edge_adj_prev->edge_adj_next = (ep)->edge_adj_next; \
}

/* no outgoing edges (empty adjacency list) and not already granted */
#define NOT_BLOCKED(lock)	\
	((lock)->l_edge.edge_adj_next == &(lock)->l_edge && !IS_GRANTED(lock))

#define GRANT_WAKEUP(lock)	\
{	\
	flk_set_state(lock, FLK_GRANTED_STATE); \
	(lock)->l_state |= GRANTED_LOCK; \
	/* \
	 * Clustering: PXFS locks do not sleep in the LLM, \
	 * so there is no need to signal them \
	 */ \
	if (!IS_PXFS(lock)) { \
		cv_signal(&(lock)->l_cv); \
	} \
}

#define CANCEL_WAKEUP(lock)	\
{ \
	flk_set_state(lock, FLK_CANCELLED_STATE); \
	(lock)->l_state |= CANCELLED_LOCK; \
	/* \
	 * Clustering: PXFS locks do not sleep in the LLM, \
	 * so there is no need to signal them \
	 */ \
	if (!IS_PXFS(lock)) { \
		cv_signal(&(lock)->l_cv); \
	} \
}

#define INTERRUPT_WAKEUP(lock)	\
{ \
	flk_set_state(lock, FLK_INTERRUPTED_STATE); \
	(lock)->l_state |= INTERRUPTED_LOCK; \
	/* \
	 * Clustering: PXFS locks do not sleep in the LLM, \
	 * so there is no need to signal them \
	 */ \
	if (!IS_PXFS(lock)) { \
		cv_signal(&(lock)->l_cv); \
	} \
}

/* unlink a lock from the sleeping list; its l_status must justify it */
#define REMOVE_SLEEP_QUEUE(lock)	\
{ \
	ASSERT(IS_SLEEPING(lock) || IS_GRANTED(lock) || \
	    IS_INTERRUPTED(lock) || IS_CANCELLED(lock)); \
	(lock)->l_state &= ~SLEEPING_LOCK; \
	(lock)->l_next->l_prev = (lock)->l_prev; \
	(lock)->l_prev->l_next = (lock)->l_next; \
	(lock)->l_next = (lock)->l_prev = (lock_descriptor_t *)NULL; \
}

/* no incoming edges: nothing is waiting on this lock */
#define NO_DEPENDENTS(lock)	\
	((lock)->l_edge.edge_in_next == &(lock)->l_edge)

#define GRANT(lock)	\
{ \
	(lock)->l_state |= GRANTED_LOCK; \
	flk_set_state(lock, FLK_GRANTED_STATE); \
}

/* edge-list traversal helpers; l_edge is the dummy head of both lists */
#define FIRST_IN(lock)	((lock)->l_edge.edge_in_next)
#define FIRST_ADJ(lock)	((lock)->l_edge.edge_adj_next)
#define HEAD(lock)	(&(lock)->l_edge)
#define NEXT_ADJ(ep)	((ep)->edge_adj_next)
#define NEXT_IN(ep)	((ep)->edge_in_next)
#define IN_ADJ_INIT(lock)	\
{	\
	(lock)->l_edge.edge_adj_next = (lock)->l_edge.edge_adj_prev = &(lock)->l_edge; \
	(lock)->l_edge.edge_in_next = (lock)->l_edge.edge_in_prev = &(lock)->l_edge; \
}

/*
 * Copy the request-describing fields of lock2 into lock1.  List linkage,
 * edges, l_cv, and the status fields are deliberately not copied.
 */
#define COPY(lock1, lock2)	\
{	\
	(lock1)->l_graph = (lock2)->l_graph; \
	(lock1)->l_vnode = (lock2)->l_vnode; \
	(lock1)->l_type = (lock2)->l_type; \
	(lock1)->l_state = (lock2)->l_state; \
	(lock1)->l_start = (lock2)->l_start; \
	(lock1)->l_end = (lock2)->l_end; \
	(lock1)->l_flock = (lock2)->l_flock; \
	(lock1)->l_zoneid = (lock2)->l_zoneid; \
	(lock1)->pvertex = (lock2)->pvertex; \
}

/*
 * Clustering
 */
/* Routines to set and get the NLM state in a lock request */
#define SET_NLM_STATE(lock, nlm_state)	((lock)->l_nlm_state = nlm_state)
#define GET_NLM_STATE(lock)	((lock)->l_nlm_state)
/*
 * NLM registry abstraction:
 *   Abstraction overview:
 *   This registry keeps track of the NLM servers via their nlmids
 *   that have requested locks at the LLM this registry is associated
 *   with.
 */
/* Routines to manipulate the NLM registry object state */
#define FLK_REGISTRY_IS_NLM_UNKNOWN(nlmreg, nlmid)	\
			((nlmreg)[nlmid] == FLK_NLM_UNKNOWN)
#define FLK_REGISTRY_IS_NLM_UP(nlmreg, nlmid)	\
			((nlmreg)[nlmid] == FLK_NLM_UP)
#define FLK_REGISTRY_ADD_NLMID(nlmreg, nlmid)	\
			((nlmreg)[nlmid] = FLK_NLM_UP)
#define FLK_REGISTRY_CHANGE_NLM_STATE(nlmreg, nlmid, state)	\
			((nlmreg)[nlmid] = state)

/* Indicates the effect of executing a request on the existing locks */

#define FLK_UNLOCK	0x1	/* request unlocks the existing lock */
#define FLK_DOWNGRADE	0x2	/* request downgrades the existing lock */
#define FLK_UPGRADE	0x3	/* request upgrades the existing lock */
#define FLK_STAY_SAME	0x4	/* request type is same as existing lock */


/* proc graph definitions */

/*
 * Proc graph is the global process graph that maintains information
 * about the dependencies between processes.  An edge is added between two
 * processes represented by proc_vertex's A and B, iff there exists l1
 * owned by process A in any of the lock_graph's dependent on l2
 * (thus having an edge to l2) owned by process B.
 */
struct proc_vertex {
	pid_t	pid;	/* pid of the process */
	long	sysid;	/* sysid of the process */
	struct proc_edge	*edge;	/* adjacent edges of this process */
	int incount;	/* Number of inedges to this process */
	struct proc_edge *p_sedge;	/* used for implementing stack alg. */
	struct proc_vertex	*p_stack;	/* used for stack alg. */
	int atime;	/* used for cycle detection algorithm */
	int dtime;	/* used for cycle detection algorithm */
	int index;	/* index into the array of proc_graph vertices */
};

typedef struct proc_vertex proc_vertex_t;

struct proc_edge {
	struct proc_edge	*next;	/* next edge in adjacency list */
	int  refcount;		/* reference count of this edge */
	struct proc_vertex	*to_proc;	/* process this points to */
};

typedef struct proc_edge proc_edge_t;


/* growth increment for the proc_graph vertex array */
#define PROC_CHUNK	100

struct proc_graph {
	struct proc_vertex **proc;	/* list of proc_vertexes */
	int gcount;		/* list size */
	int free;		/* number of free slots in the list */
	int mark;		/* used for graph coloring */
};

typedef struct proc_graph proc_graph_t;

extern struct proc_graph	pgraph;

#define PROC_SAME_OWNER(lock, pvertex)	\
	(((lock)->l_flock.l_pid == (pvertex)->pid) && \
	((lock)->l_flock.l_sysid == (pvertex)->sysid))

/* DFS bookkeeping for cycle detection, keyed to the current graph mark */
#define PROC_ARRIVE(pvertex)	((pvertex)->atime = pgraph.mark)
#define PROC_DEPART(pvertex)	((pvertex)->dtime = pgraph.mark)
#define PROC_ARRIVED(pvertex)	((pvertex)->atime == pgraph.mark)
#define PROC_DEPARTED(pvertex)	((pvertex)->dtime == pgraph.mark)

#ifdef __cplusplus
}
#endif

#endif	/* _SYS_FLOCK_IMPL_H */