xref: /titanic_41/usr/src/cmd/sendmail/db/lock/lock.c (revision 3d7072f8bd27709dba14f6fe336f149d25d9e207)
1 /*-
2  * See the file LICENSE for redistribution information.
3  *
4  * Copyright (c) 1996, 1997, 1998
5  *	Sleepycat Software.  All rights reserved.
6  */
7 
8 #include "config.h"
9 
10 #ifndef lint
11 static const char sccsid[] = "@(#)lock.c	10.61 (Sleepycat) 1/3/99";
12 #endif /* not lint */
13 
14 #ifndef NO_SYSTEM_INCLUDES
15 #include <sys/types.h>
16 
17 #include <errno.h>
18 #include <string.h>
19 #endif
20 
21 #include "db_int.h"
22 #include "shqueue.h"
23 #include "db_page.h"
24 #include "db_shash.h"
25 #include "lock.h"
26 #include "db_am.h"
27 #include "txn_auto.h"
28 #include "txn_ext.h"
29 #include "common_ext.h"
30 
31 static void __lock_checklocker __P((DB_LOCKTAB *, struct __db_lock *, int));
32 static void __lock_freeobj __P((DB_LOCKTAB *, DB_LOCKOBJ *));
33 static int  __lock_get_internal __P((DB_LOCKTAB *, u_int32_t, DB_TXN *,
34     u_int32_t, const DBT *, db_lockmode_t, struct __db_lock **));
35 static int  __lock_is_parent __P((u_int32_t, DB_TXN *));
36 static int  __lock_promote __P((DB_LOCKTAB *, DB_LOCKOBJ *));
37 static int  __lock_put_internal __P((DB_LOCKTAB *, struct __db_lock *, int));
38 static void __lock_remove_waiter
39     __P((DB_LOCKTAB *, DB_LOCKOBJ *, struct __db_lock *, db_status_t));
40 static int  __lock_vec_internal __P((DB_LOCKTAB *, u_int32_t, DB_TXN *,
41 	    u_int32_t, DB_LOCKREQ *, int, DB_LOCKREQ **elistp));
42 
43 int
44 lock_id(lt, idp)
45 	DB_LOCKTAB *lt;
46 	u_int32_t *idp;
47 {
48 	u_int32_t id;
49 
50 	LOCK_PANIC_CHECK(lt);
51 
52 	LOCK_LOCKREGION(lt);
53 	if (lt->region->id >= DB_LOCK_MAXID)
54 		lt->region->id = 0;
55 	id = ++lt->region->id;
56 	UNLOCK_LOCKREGION(lt);
57 
58 	*idp = id;
59 	return (0);
60 }
61 
62 int
63 lock_vec(lt, locker, flags, list, nlist, elistp)
64 	DB_LOCKTAB *lt;
65 	u_int32_t locker, flags;
66 	int nlist;
67 	DB_LOCKREQ *list, **elistp;
68 {
69 	return (__lock_vec_internal(lt,
70 	    locker, NULL, flags, list, nlist, elistp));
71 }
72 
73 int
74 lock_tvec(lt, txn, flags, list, nlist, elistp)
75 	DB_LOCKTAB *lt;
76 	DB_TXN *txn;
77 	u_int32_t flags;
78 	int nlist;
79 	DB_LOCKREQ *list, **elistp;
80 {
81 	return (__lock_vec_internal(lt,
82 	    txn->txnid, txn, flags, list, nlist, elistp));
83 }
84 
85 static int
86 __lock_vec_internal(lt, locker, txn, flags, list, nlist, elistp)
87 	DB_LOCKTAB *lt;
88 	u_int32_t locker;
89 	DB_TXN *txn;
90 	u_int32_t flags;
91 	int nlist;
92 	DB_LOCKREQ *list, **elistp;
93 {
94 	struct __db_lock *lp;
95 	DB_LOCKOBJ *sh_obj, *sh_locker, *sh_parent;
96 	int i, ret, run_dd;
97 
98 	LOCK_PANIC_CHECK(lt);
99 
100 	/* Validate arguments. */
101 	if ((ret =
102 	    __db_fchk(lt->dbenv, "lock_vec", flags, DB_LOCK_NOWAIT)) != 0)
103 		return (ret);
104 
105 	LOCK_LOCKREGION(lt);
106 
107 	if ((ret = __lock_validate_region(lt)) != 0) {
108 		UNLOCK_LOCKREGION(lt);
109 		return (ret);
110 	}
111 
112 	ret = 0;
113 	for (i = 0; i < nlist && ret == 0; i++) {
114 		switch (list[i].op) {
115 		case DB_LOCK_GET:
116 			ret = __lock_get_internal(lt, locker, txn, flags,
117 			    list[i].obj, list[i].mode, &lp);
118 			if (ret == 0) {
119 				list[i].lock = LOCK_TO_OFFSET(lt, lp);
120 				lt->region->nrequests++;
121 			}
122 			break;
123 		case DB_LOCK_INHERIT:
124 			/* Find the locker. */
125 			if ((ret = __lock_getobj(lt, locker,
126 			    NULL, DB_LOCK_LOCKER, &sh_locker)) != 0)
127 				break;
128 			if (txn == NULL || txn->parent == NULL) {
129 				ret = EINVAL;
130 				break;
131 			}
132 
133 			if ((ret = __lock_getobj(lt, txn->parent->txnid,
134 			    NULL, DB_LOCK_LOCKER, &sh_parent)) != 0)
135 				break;
136 
137 			/*
138 			 * Traverse all the locks held by this locker.  Remove
139 			 * the locks from the locker's list and put them on the
140 			 * parent's list.
141 			 */
142 			for (lp = SH_LIST_FIRST(&sh_locker->heldby, __db_lock);
143 			    lp != NULL;
144 			    lp = SH_LIST_FIRST(&sh_locker->heldby, __db_lock)) {
145 				SH_LIST_REMOVE(lp, locker_links, __db_lock);
146 				SH_LIST_INSERT_HEAD(&sh_parent->heldby, lp,
147 				    locker_links, __db_lock);
148 				lp->holder = txn->parent->txnid;
149 			}
150 			__lock_freeobj(lt, sh_locker);
151 			lt->region->nlockers--;
152 			break;
153 		case DB_LOCK_PUT:
154 			lp = OFFSET_TO_LOCK(lt, list[i].lock);
155 			if (lp->holder != locker) {
156 				ret = DB_LOCK_NOTHELD;
157 				break;
158 			}
159 			list[i].mode = lp->mode;
160 
161 			ret = __lock_put_internal(lt, lp, 0);
162 			__lock_checklocker(lt, lp, 0);
163 			break;
164 		case DB_LOCK_PUT_ALL:
165 			/* Find the locker. */
166 			if ((ret = __lock_getobj(lt, locker,
167 			    NULL, DB_LOCK_LOCKER, &sh_locker)) != 0)
168 				break;
169 
170 			for (lp = SH_LIST_FIRST(&sh_locker->heldby, __db_lock);
171 			    lp != NULL;
172 			    lp = SH_LIST_FIRST(&sh_locker->heldby, __db_lock)) {
173 				if ((ret = __lock_put_internal(lt, lp, 1)) != 0)
174 					break;
175 			}
176 			__lock_freeobj(lt, sh_locker);
177 			lt->region->nlockers--;
178 			break;
179 		case DB_LOCK_PUT_OBJ:
180 
181 			/* Look up the object in the hash table. */
182 			HASHLOOKUP(lt->hashtab, __db_lockobj, links,
183 			    list[i].obj, sh_obj, lt->region->table_size,
184 			    __lock_ohash, __lock_cmp);
185 			if (sh_obj == NULL) {
186 				ret = EINVAL;
187 				break;
188 			}
189 			/*
190 			 * Release waiters first, because they won't cause
191 			 * anyone else to be awakened.  If we release the
192 			 * lockers first, all the waiters get awakened
193 			 * needlessly.
194 			 */
195 			for (lp = SH_TAILQ_FIRST(&sh_obj->waiters, __db_lock);
196 			    lp != NULL;
197 			    lp = SH_TAILQ_FIRST(&sh_obj->waiters, __db_lock)) {
198 				lt->region->nreleases += lp->refcount;
199 				__lock_remove_waiter(lt, sh_obj, lp,
200 				    DB_LSTAT_NOGRANT);
201 				__lock_checklocker(lt, lp, 1);
202 			}
203 
204 			for (lp = SH_TAILQ_FIRST(&sh_obj->holders, __db_lock);
205 			    lp != NULL;
206 			    lp = SH_TAILQ_FIRST(&sh_obj->holders, __db_lock)) {
207 
208 				lt->region->nreleases += lp->refcount;
209 				SH_LIST_REMOVE(lp, locker_links, __db_lock);
210 				SH_TAILQ_REMOVE(&sh_obj->holders, lp, links,
211 				    __db_lock);
212 				lp->status = DB_LSTAT_FREE;
213 				SH_TAILQ_INSERT_HEAD(&lt->region->free_locks,
214 				    lp, links, __db_lock);
215 			}
216 
217 			/* Now free the object. */
218 			__lock_freeobj(lt, sh_obj);
219 			break;
220 #ifdef DEBUG
221 		case DB_LOCK_DUMP:
222 			/* Find the locker. */
223 			if ((ret = __lock_getobj(lt, locker,
224 			    NULL, DB_LOCK_LOCKER, &sh_locker)) != 0)
225 				break;
226 
227 			for (lp = SH_LIST_FIRST(&sh_locker->heldby, __db_lock);
228 			    lp != NULL;
229 			    lp = SH_LIST_NEXT(lp, locker_links, __db_lock)) {
230 				__lock_printlock(lt, lp, 1);
231 				ret = EINVAL;
232 			}
233 			if (ret == 0) {
234 				__lock_freeobj(lt, sh_locker);
235 				lt->region->nlockers--;
236 			}
237 			break;
238 #endif
239 		default:
240 			ret = EINVAL;
241 			break;
242 		}
243 	}
244 
245 	if (lt->region->need_dd && lt->region->detect != DB_LOCK_NORUN) {
246 		run_dd = 1;
247 		lt->region->need_dd = 0;
248 	} else
249 		run_dd = 0;
250 
251 	UNLOCK_LOCKREGION(lt);
252 
253 	if (ret == 0 && run_dd)
254 		lock_detect(lt, 0, lt->region->detect);
255 
256 	if (elistp && ret != 0)
257 		*elistp = &list[i - 1];
258 	return (ret);
259 }
260 
261 int
262 lock_get(lt, locker, flags, obj, lock_mode, lock)
263 	DB_LOCKTAB *lt;
264 	u_int32_t locker, flags;
265 	const DBT *obj;
266 	db_lockmode_t lock_mode;
267 	DB_LOCK *lock;
268 {
269 	struct __db_lock *lockp;
270 	int ret;
271 
272 	LOCK_PANIC_CHECK(lt);
273 
274 	/* Validate arguments. */
275 	if ((ret = __db_fchk(lt->dbenv,
276 	    "lock_get", flags, DB_LOCK_NOWAIT | DB_LOCK_UPGRADE)) != 0)
277 		return (ret);
278 
279 	LOCK_LOCKREGION(lt);
280 
281 	if ((ret = __lock_validate_region(lt)) == 0) {
282 		if (LF_ISSET(DB_LOCK_UPGRADE))
283 			lockp = OFFSET_TO_LOCK(lt, *lock);
284 
285 		if ((ret = __lock_get_internal(lt,
286 		    locker, NULL, flags, obj, lock_mode, &lockp)) == 0) {
287 			if (!LF_ISSET(DB_LOCK_UPGRADE))
288 				*lock = LOCK_TO_OFFSET(lt, lockp);
289 			lt->region->nrequests++;
290 		}
291 	}
292 
293 	UNLOCK_LOCKREGION(lt);
294 	return (ret);
295 }
296 
297 int
298 lock_tget(lt, txn, flags, obj, lock_mode, lock)
299 	DB_LOCKTAB *lt;
300 	DB_TXN *txn;
301 	u_int32_t flags;
302 	const DBT *obj;
303 	db_lockmode_t lock_mode;
304 	DB_LOCK *lock;
305 {
306 	struct __db_lock *lockp;
307 	int ret;
308 
309 	LOCK_PANIC_CHECK(lt);
310 
311 	/* Validate arguments. */
312 	if ((ret = __db_fchk(lt->dbenv,
313 	    "lock_get", flags, DB_LOCK_NOWAIT | DB_LOCK_UPGRADE)) != 0)
314 		return (ret);
315 
316 	LOCK_LOCKREGION(lt);
317 
318 	if ((ret = __lock_validate_region(lt)) == 0) {
319 		if (LF_ISSET(DB_LOCK_UPGRADE))
320 			lockp = OFFSET_TO_LOCK(lt, *lock);
321 
322 		if ((ret = __lock_get_internal(lt,
323 		    txn->txnid, txn, flags, obj, lock_mode, &lockp)) == 0) {
324 			if (!LF_ISSET(DB_LOCK_UPGRADE))
325 				*lock = LOCK_TO_OFFSET(lt, lockp);
326 			lt->region->nrequests++;
327 		}
328 	}
329 
330 	UNLOCK_LOCKREGION(lt);
331 	return (ret);
332 }
333 int
334 lock_put(lt, lock)
335 	DB_LOCKTAB *lt;
336 	DB_LOCK lock;
337 {
338 	struct __db_lock *lockp;
339 	int ret, run_dd;
340 
341 	LOCK_PANIC_CHECK(lt);
342 
343 	LOCK_LOCKREGION(lt);
344 
345 	if ((ret = __lock_validate_region(lt)) != 0)
346 		return (ret);
347 	else {
348 		lockp = OFFSET_TO_LOCK(lt, lock);
349 		ret = __lock_put_internal(lt, lockp, 0);
350 	}
351 
352 	__lock_checklocker(lt, lockp, 0);
353 
354 	if (lt->region->need_dd && lt->region->detect != DB_LOCK_NORUN) {
355 		run_dd = 1;
356 		lt->region->need_dd = 0;
357 	} else
358 		run_dd = 0;
359 
360 	UNLOCK_LOCKREGION(lt);
361 
362 	if (ret == 0 && run_dd)
363 		lock_detect(lt, 0, lt->region->detect);
364 
365 	return (ret);
366 }
367 
368 static int
369 __lock_put_internal(lt, lockp, do_all)
370 	DB_LOCKTAB *lt;
371 	struct __db_lock *lockp;
372 	int do_all;
373 {
374 	DB_LOCKOBJ *sh_obj;
375 	int state_changed;
376 
377 	if (lockp->refcount == 0 || (lockp->status != DB_LSTAT_HELD &&
378 	    lockp->status != DB_LSTAT_WAITING) || lockp->obj == 0) {
379 		__db_err(lt->dbenv, "lock_put: invalid lock %lu",
380 		    (u_long)((u_int8_t *)lockp - (u_int8_t *)lt->region));
381 		return (EINVAL);
382 	}
383 
384 	if (do_all)
385 		lt->region->nreleases += lockp->refcount;
386 	else
387 		lt->region->nreleases++;
388 	if (do_all == 0 && lockp->refcount > 1) {
389 		lockp->refcount--;
390 		return (0);
391 	}
392 
393 	/* Get the object associated with this lock. */
394 	sh_obj = (DB_LOCKOBJ *)((u_int8_t *)lockp + lockp->obj);
395 
396 	/* Remove lock from locker list. */
397 	SH_LIST_REMOVE(lockp, locker_links, __db_lock);
398 
399 	/* Remove this lock from its holders/waitlist. */
400 	if (lockp->status != DB_LSTAT_HELD)
401 		__lock_remove_waiter(lt, sh_obj, lockp, DB_LSTAT_FREE);
402 	else
403 		SH_TAILQ_REMOVE(&sh_obj->holders, lockp, links, __db_lock);
404 
405 	state_changed = __lock_promote(lt, sh_obj);
406 
407 	/* Check if object should be reclaimed. */
408 	if (SH_TAILQ_FIRST(&sh_obj->holders, __db_lock) == NULL) {
409 		HASHREMOVE_EL(lt->hashtab, __db_lockobj,
410 		    links, sh_obj, lt->region->table_size, __lock_lhash);
411 		if (sh_obj->lockobj.size > sizeof(sh_obj->objdata))
412 			__db_shalloc_free(lt->mem,
413 			    SH_DBT_PTR(&sh_obj->lockobj));
414 		SH_TAILQ_INSERT_HEAD(&lt->region->free_objs, sh_obj, links,
415 		    __db_lockobj);
416 		state_changed = 1;
417 	}
418 
419 	/* Free lock. */
420 	lockp->status = DB_LSTAT_FREE;
421 	SH_TAILQ_INSERT_HEAD(&lt->region->free_locks, lockp, links, __db_lock);
422 
423 	/*
424 	 * If we did not promote anyone; we need to run the deadlock
425 	 * detector again.
426 	 */
427 	if (state_changed == 0)
428 		lt->region->need_dd = 1;
429 
430 	return (0);
431 }
432 
433 static int
434 __lock_get_internal(lt, locker, txn, flags, obj, lock_mode, lockp)
435 	DB_LOCKTAB *lt;
436 	u_int32_t locker, flags;
437 	DB_TXN *txn;
438 	const DBT *obj;
439 	db_lockmode_t lock_mode;
440 	struct __db_lock **lockp;
441 {
442 	struct __db_lock *newl, *lp;
443 	DB_LOCKOBJ *sh_obj, *sh_locker;
444 	DB_LOCKREGION *lrp;
445 	size_t newl_off;
446 	int ihold, no_dd, ret;
447 
448 	no_dd = ret = 0;
449 
450 	/*
451 	 * Check that lock mode is valid.
452 	 */
453 	lrp = lt->region;
454 	if ((u_int32_t)lock_mode >= lrp->nmodes) {
455 		__db_err(lt->dbenv,
456 		    "lock_get: invalid lock mode %lu\n", (u_long)lock_mode);
457 		return (EINVAL);
458 	}
459 
460 	/* Allocate a new lock.  Optimize for the common case of a grant. */
461 	if ((newl = SH_TAILQ_FIRST(&lrp->free_locks, __db_lock)) == NULL) {
462 		if ((ret = __lock_grow_region(lt, DB_LOCK_LOCK, 0)) != 0)
463 			return (ret);
464 		lrp = lt->region;
465 		newl = SH_TAILQ_FIRST(&lrp->free_locks, __db_lock);
466 	}
467 	newl_off = LOCK_TO_OFFSET(lt, newl);
468 
469 	/* Optimize for common case of granting a lock. */
470 	SH_TAILQ_REMOVE(&lrp->free_locks, newl, links, __db_lock);
471 
472 	newl->mode = lock_mode;
473 	newl->status = DB_LSTAT_HELD;
474 	newl->holder = locker;
475 	newl->refcount = 1;
476 
477 	if ((ret = __lock_getobj(lt, 0, obj, DB_LOCK_OBJTYPE, &sh_obj)) != 0)
478 		return (ret);
479 
480 	lrp = lt->region;			/* getobj might have grown */
481 	newl = OFFSET_TO_LOCK(lt, newl_off);
482 
483 	/* Now make new lock point to object */
484 	newl->obj = SH_PTR_TO_OFF(newl, sh_obj);
485 
486 	/*
487 	 * Now we have a lock and an object and we need to see if we should
488 	 * grant the lock.  We use a FIFO ordering so we can only grant a
489 	 * new lock if it does not conflict with anyone on the holders list
490 	 * OR anyone on the waiters list.  The reason that we don't grant if
491 	 * there's a conflict is that this can lead to starvation (a writer
492 	 * waiting on a popularly read item will never be granted).  The
493 	 * downside of this is that a waiting reader can prevent an upgrade
494 	 * from reader to writer, which is not uncommon.
495 	 *
496 	 * There is one exception to the no-conflict rule.  If a lock is held
497 	 * by the requesting locker AND the new lock does not conflict with
498 	 * any other holders, then we grant the lock.  The most common place
499 	 * this happens is when the holder has a WRITE lock and a READ lock
500 	 * request comes in for the same locker.  If we do not grant the read
501 	 * lock, then we guarantee deadlock.
502 	 *
503 	 * In case of conflict, we put the new lock on the end of the waiters
504 	 * list, unless we are upgrading in which case the locker goes on the
505 	 * front of the list.
506 	 */
507 	ihold = 0;
508 	for (lp = SH_TAILQ_FIRST(&sh_obj->holders, __db_lock);
509 	    lp != NULL;
510 	    lp = SH_TAILQ_NEXT(lp, links, __db_lock)) {
511 		if (locker == lp->holder ||
512 		    __lock_is_parent(lp->holder, txn)) {
513 			if (lp->mode == lock_mode &&
514 			    lp->status == DB_LSTAT_HELD) {
515 				if (LF_ISSET(DB_LOCK_UPGRADE))
516 					goto upgrade;
517 
518 				/*
519 				 * Lock is held, so we can increment the
520 				 * reference count and return this lock.
521 				 */
522 				lp->refcount++;
523 				*lockp = lp;
524 				SH_TAILQ_INSERT_HEAD(&lrp->free_locks,
525 				    newl, links, __db_lock);
526 				return (0);
527 			} else
528 				ihold = 1;
529 		} else if (CONFLICTS(lt, lp->mode, lock_mode))
530 			break;
531     	}
532 
533 	/*
534 	 * If we are upgrading, then there are two scenarios.  Either
535 	 * we had no conflicts, so we can do the upgrade.  Or, there
536 	 * is a conflict and we should wait at the HEAD of the waiters
537 	 * list.
538 	 */
539 	if (LF_ISSET(DB_LOCK_UPGRADE)) {
540 		if (lp == NULL)
541 			goto upgrade;
542 
543 		/* There was a conflict, wait. */
544 		SH_TAILQ_INSERT_HEAD(&sh_obj->waiters, newl, links, __db_lock);
545 		goto wait;
546 	}
547 
548 	if (lp == NULL && !ihold)
549 		for (lp = SH_TAILQ_FIRST(&sh_obj->waiters, __db_lock);
550 		    lp != NULL;
551 		    lp = SH_TAILQ_NEXT(lp, links, __db_lock)) {
552 			if (CONFLICTS(lt, lp->mode, lock_mode) &&
553 			    locker != lp->holder)
554 				break;
555 		}
556 	if (lp == NULL)
557 		SH_TAILQ_INSERT_TAIL(&sh_obj->holders, newl, links);
558 	else if (!(flags & DB_LOCK_NOWAIT))
559 		SH_TAILQ_INSERT_TAIL(&sh_obj->waiters, newl, links);
560 	else {
561 		/* Free the lock and return an error. */
562 		newl->status = DB_LSTAT_FREE;
563 		SH_TAILQ_INSERT_HEAD(&lrp->free_locks, newl, links, __db_lock);
564 		return (DB_LOCK_NOTGRANTED);
565 	}
566 
567 	/*
568 	 * Now, insert the lock onto its locker's list.  If the locker does
569 	 * not currently hold any locks, there's no reason to run a deadlock
570 	 * detector, save that information.
571 	 */
572 	if ((ret =
573 	    __lock_getobj(lt, locker, NULL, DB_LOCK_LOCKER, &sh_locker)) != 0)
574 		return (ret);
575 	no_dd = SH_LIST_FIRST(&sh_locker->heldby, __db_lock) == NULL;
576 
577 	lrp = lt->region;
578 	SH_LIST_INSERT_HEAD(&sh_locker->heldby, newl, locker_links, __db_lock);
579 
580 	if (lp != NULL) {
581 		/*
582 		 * This is really a blocker for the process, so initialize it
583 		 * set.  That way the current process will block when it tries
584 		 * to get it and the waking process will release it.
585 		 */
586 wait:		(void)__db_mutex_init(&newl->mutex,
587 		    MUTEX_LOCK_OFFSET(lt->region, &newl->mutex));
588 		(void)__db_mutex_lock(&newl->mutex, lt->reginfo.fd);
589 
590 		newl->status = DB_LSTAT_WAITING;
591 		lrp->nconflicts++;
592 
593 		/*
594 		 * We are about to wait; must release the region mutex.  Then,
595 		 * when we wakeup, we need to reacquire the region mutex before
596 		 * continuing.
597 		 */
598 		if (lrp->detect == DB_LOCK_NORUN)
599 			lt->region->need_dd = 1;
600 		UNLOCK_LOCKREGION(lt);
601 
602 		/*
603 		 * We are about to wait; before waiting, see if the deadlock
604 		 * detector should be run.
605 		 */
606 		if (lrp->detect != DB_LOCK_NORUN && !no_dd)
607 			(void)lock_detect(lt, 0, lrp->detect);
608 
609 		(void)__db_mutex_lock(&newl->mutex, lt->reginfo.fd);
610 
611 		LOCK_LOCKREGION(lt);
612 		if (newl->status != DB_LSTAT_PENDING) {
613 			/*
614 			 * If this lock errored due to a deadlock, then
615 			 * we have waiters that require promotion.
616 			 */
617 			if (newl->status == DB_LSTAT_ABORTED)
618 				(void)__lock_promote(lt, sh_obj);
619 			/* Return to free list. */
620 			__lock_checklocker(lt, newl, 0);
621 			SH_TAILQ_INSERT_HEAD(&lrp->free_locks, newl, links,
622 			    __db_lock);
623 			switch (newl->status) {
624 				case DB_LSTAT_ABORTED:
625 					ret = DB_LOCK_DEADLOCK;
626 					break;
627 				case DB_LSTAT_NOGRANT:
628 					ret = DB_LOCK_NOTGRANTED;
629 					break;
630 				default:
631 					ret = EINVAL;
632 					break;
633 			}
634 			newl->status = DB_LSTAT_FREE;
635 			newl = NULL;
636 		} else if (LF_ISSET(DB_LOCK_UPGRADE)) {
637 			/*
638 			 * The lock that was just granted got put on the
639 			 * holders list.  Since we're upgrading some other
640 			 * lock, we've got to remove it here.
641 			 */
642 			SH_TAILQ_REMOVE(&sh_obj->holders,
643 			    newl, links, __db_lock);
644 			goto upgrade;
645 		} else
646 			newl->status = DB_LSTAT_HELD;
647 	}
648 
649 	*lockp = newl;
650 	return (ret);
651 
652 upgrade:
653 	/*
654 	 * This was an upgrade, so return the new lock to the free list and
655 	 * upgrade the mode.
656 	 */
657 	(*lockp)->mode = lock_mode;
658 	newl->status = DB_LSTAT_FREE;
659 	SH_TAILQ_INSERT_HEAD(&lrp->free_locks, newl, links, __db_lock);
660 	return (0);
661 }
662 
663 /*
664  * __lock_is_locked --
665  *
666  * PUBLIC: int __lock_is_locked
667  * PUBLIC:    __P((DB_LOCKTAB *, u_int32_t, DBT *, db_lockmode_t));
668  */
669 int
670 __lock_is_locked(lt, locker, dbt, mode)
671 	DB_LOCKTAB *lt;
672 	u_int32_t locker;
673 	DBT *dbt;
674 	db_lockmode_t mode;
675 {
676 	struct __db_lock *lp;
677 	DB_LOCKOBJ *sh_obj;
678 	DB_LOCKREGION *lrp;
679 
680 	lrp = lt->region;
681 
682 	/* Look up the object in the hash table. */
683 	HASHLOOKUP(lt->hashtab, __db_lockobj, links,
684 	    dbt, sh_obj, lrp->table_size, __lock_ohash, __lock_cmp);
685 	if (sh_obj == NULL)
686 		return (0);
687 
688 	for (lp = SH_TAILQ_FIRST(&sh_obj->holders, __db_lock);
689 	    lp != NULL;
690 	    lp = SH_TAILQ_FIRST(&sh_obj->holders, __db_lock)) {
691 		if (lp->holder == locker && lp->mode == mode)
692 			return (1);
693 	}
694 
695 	return (0);
696 }
697 
698 /*
699  * __lock_printlock --
700  *
701  * PUBLIC: void __lock_printlock __P((DB_LOCKTAB *, struct __db_lock *, int));
702  */
703 void
704 __lock_printlock(lt, lp, ispgno)
705 	DB_LOCKTAB *lt;
706 	struct __db_lock *lp;
707 	int ispgno;
708 {
709 	DB_LOCKOBJ *lockobj;
710 	db_pgno_t pgno;
711 	size_t obj;
712 	u_int8_t *ptr;
713 	const char *mode, *status;
714 
715 	switch (lp->mode) {
716 	case DB_LOCK_IREAD:
717 		mode = "IREAD";
718 		break;
719 	case DB_LOCK_IWR:
720 		mode = "IWR";
721 		break;
722 	case DB_LOCK_IWRITE:
723 		mode = "IWRITE";
724 		break;
725 	case DB_LOCK_NG:
726 		mode = "NG";
727 		break;
728 	case DB_LOCK_READ:
729 		mode = "READ";
730 		break;
731 	case DB_LOCK_WRITE:
732 		mode = "WRITE";
733 		break;
734 	default:
735 		mode = "UNKNOWN";
736 		break;
737 	}
738 	switch (lp->status) {
739 	case DB_LSTAT_ABORTED:
740 		status = "ABORT";
741 		break;
742 	case DB_LSTAT_ERR:
743 		status = "ERROR";
744 		break;
745 	case DB_LSTAT_FREE:
746 		status = "FREE";
747 		break;
748 	case DB_LSTAT_HELD:
749 		status = "HELD";
750 		break;
751 	case DB_LSTAT_NOGRANT:
752 		status = "NONE";
753 		break;
754 	case DB_LSTAT_WAITING:
755 		status = "WAIT";
756 		break;
757 	case DB_LSTAT_PENDING:
758 		status = "PENDING";
759 		break;
760 	default:
761 		status = "UNKNOWN";
762 		break;
763 	}
764 	printf("\t%lx\t%s\t%lu\t%s\t",
765 	    (u_long)lp->holder, mode, (u_long)lp->refcount, status);
766 
767 	lockobj = (DB_LOCKOBJ *)((u_int8_t *)lp + lp->obj);
768 	ptr = SH_DBT_PTR(&lockobj->lockobj);
769 	if (ispgno) {
770 		/* Assume this is a DBT lock. */
771 		memcpy(&pgno, ptr, sizeof(db_pgno_t));
772 		printf("page %lu\n", (u_long)pgno);
773 	} else {
774 		obj = (u_int8_t *)lp + lp->obj - (u_int8_t *)lt->region;
775 		printf("0x%lx ", (u_long)obj);
776 		__db_pr(ptr, lockobj->lockobj.size);
777 		printf("\n");
778 	}
779 }
780 
781 /*
782  * PUBLIC: int __lock_getobj  __P((DB_LOCKTAB *,
783  * PUBLIC:     u_int32_t, const DBT *, u_int32_t type, DB_LOCKOBJ **));
784  */
785 int
786 __lock_getobj(lt, locker, dbt, type, objp)
787 	DB_LOCKTAB *lt;
788 	u_int32_t locker, type;
789 	const DBT *dbt;
790 	DB_LOCKOBJ **objp;
791 {
792 	DB_LOCKREGION *lrp;
793 	DB_LOCKOBJ *sh_obj;
794 	u_int32_t obj_size;
795 	int ret;
796 	void *p, *src;
797 
798 	lrp = lt->region;
799 
800 	/* Look up the object in the hash table. */
801 	if (type == DB_LOCK_OBJTYPE) {
802 		HASHLOOKUP(lt->hashtab, __db_lockobj, links, dbt, sh_obj,
803 		    lrp->table_size, __lock_ohash, __lock_cmp);
804 		obj_size = dbt->size;
805 	} else {
806 		HASHLOOKUP(lt->hashtab, __db_lockobj, links, locker,
807 		    sh_obj, lrp->table_size, __lock_locker_hash,
808 		    __lock_locker_cmp);
809 		obj_size = sizeof(locker);
810 	}
811 
812 	/*
813 	 * If we found the object, then we can just return it.  If
814 	 * we didn't find the object, then we need to create it.
815 	 */
816 	if (sh_obj == NULL) {
817 		/* Create new object and then insert it into hash table. */
818 		if ((sh_obj =
819 		    SH_TAILQ_FIRST(&lrp->free_objs, __db_lockobj)) == NULL) {
820 			if ((ret = __lock_grow_region(lt, DB_LOCK_OBJ, 0)) != 0)
821 				return (ret);
822 			lrp = lt->region;
823 			sh_obj = SH_TAILQ_FIRST(&lrp->free_objs, __db_lockobj);
824 		}
825 
826 		/*
827 		 * If we can fit this object in the structure, do so instead
828 		 * of shalloc-ing space for it.
829 		 */
830 		if (obj_size <= sizeof(sh_obj->objdata))
831 			p = sh_obj->objdata;
832 		else
833 			if ((ret =
834 			    __db_shalloc(lt->mem, obj_size, 0, &p)) != 0) {
835 				if ((ret = __lock_grow_region(lt,
836 				    DB_LOCK_MEM, obj_size)) != 0)
837 					return (ret);
838 				lrp = lt->region;
839 				/* Reacquire the head of the list. */
840 				sh_obj = SH_TAILQ_FIRST(&lrp->free_objs,
841 				    __db_lockobj);
842 				(void)__db_shalloc(lt->mem, obj_size, 0, &p);
843 			}
844 
845 		src = type == DB_LOCK_OBJTYPE ? dbt->data : (void *)&locker;
846 		memcpy(p, src, obj_size);
847 
848 		sh_obj->type = type;
849 		SH_TAILQ_REMOVE(&lrp->free_objs, sh_obj, links, __db_lockobj);
850 
851 		SH_TAILQ_INIT(&sh_obj->waiters);
852 		if (type == DB_LOCK_LOCKER)
853 			SH_LIST_INIT(&sh_obj->heldby);
854 		else
855 			SH_TAILQ_INIT(&sh_obj->holders);
856 		sh_obj->lockobj.size = obj_size;
857 		sh_obj->lockobj.off = SH_PTR_TO_OFF(&sh_obj->lockobj, p);
858 
859 		HASHINSERT(lt->hashtab,
860 		    __db_lockobj, links, sh_obj, lrp->table_size, __lock_lhash);
861 
862 		if (type == DB_LOCK_LOCKER)
863 			lrp->nlockers++;
864 	}
865 
866 	*objp = sh_obj;
867 	return (0);
868 }
869 
870 /*
871  * Any lock on the waitlist has a process waiting for it.  Therefore, we
872  * can't return the lock to the freelist immediately.  Instead, we can
873  * remove the lock from the list of waiters, set the status field of the
874  * lock, and then let the process waking up return the lock to the
875  * free list.
876  */
877 static void
878 __lock_remove_waiter(lt, sh_obj, lockp, status)
879 	DB_LOCKTAB *lt;
880 	DB_LOCKOBJ *sh_obj;
881 	struct __db_lock *lockp;
882 	db_status_t status;
883 {
884 	SH_TAILQ_REMOVE(&sh_obj->waiters, lockp, links, __db_lock);
885 	lockp->status = status;
886 
887 	/* Wake whoever is waiting on this lock. */
888 	(void)__db_mutex_unlock(&lockp->mutex, lt->reginfo.fd);
889 }
890 
891 static void
892 __lock_checklocker(lt, lockp, do_remove)
893 	DB_LOCKTAB *lt;
894 	struct __db_lock *lockp;
895 	int do_remove;
896 {
897 	DB_LOCKOBJ *sh_locker;
898 
899 	if (do_remove)
900 		SH_LIST_REMOVE(lockp, locker_links, __db_lock);
901 
902 	/* if the locker list is NULL, free up the object. */
903 	if (__lock_getobj(lt, lockp->holder, NULL, DB_LOCK_LOCKER, &sh_locker)
904 	    == 0 && SH_LIST_FIRST(&sh_locker->heldby, __db_lock) == NULL) {
905 		__lock_freeobj(lt, sh_locker);
906 		    lt->region->nlockers--;
907 	}
908 }
909 
910 static void
911 __lock_freeobj(lt, obj)
912 	DB_LOCKTAB *lt;
913 	DB_LOCKOBJ *obj;
914 {
915 	HASHREMOVE_EL(lt->hashtab,
916 	    __db_lockobj, links, obj, lt->region->table_size, __lock_lhash);
917 	if (obj->lockobj.size > sizeof(obj->objdata))
918 		__db_shalloc_free(lt->mem, SH_DBT_PTR(&obj->lockobj));
919 	SH_TAILQ_INSERT_HEAD(&lt->region->free_objs, obj, links, __db_lockobj);
920 }
921 
922 /*
923  * __lock_downgrade --
924  *	Used by the concurrent access product to downgrade write locks
925  * back to iwrite locks.
926  *
927  * PUBLIC: int __lock_downgrade __P((DB_LOCKTAB *,
928  * PUBLIC:     DB_LOCK, db_lockmode_t, u_int32_t));
929  */
930 int
931 __lock_downgrade(lt, lock, new_mode, flags)
932 	DB_LOCKTAB *lt;
933 	DB_LOCK lock;
934 	db_lockmode_t new_mode;
935 	u_int32_t flags;
936 {
937 	struct __db_lock *lockp;
938 	DB_LOCKOBJ *obj;
939 	int ret;
940 
941 	COMPQUIET(flags, 0);
942 	LOCK_PANIC_CHECK(lt);
943 	LOCK_LOCKREGION(lt);
944 
945 	if ((ret = __lock_validate_region(lt)) == 0) {
946 		lockp = OFFSET_TO_LOCK(lt, lock);
947 		lockp->mode = new_mode;
948 
949 		/* Get the object associated with this lock. */
950 		obj = (DB_LOCKOBJ *)((u_int8_t *)lockp + lockp->obj);
951 		(void)__lock_promote(lt, obj);
952 		++lt->region->nreleases;
953 	}
954 
955 	UNLOCK_LOCKREGION(lt);
956 
957 	return (ret);
958 }
959 
960 /*
961  * __lock_promote --
962  *
963  * Look through the waiters and holders lists and decide which (if any)
964  * locks can be promoted.   Promote any that are eligible.
965  */
966 static int
967 __lock_promote(lt, obj)
968 	DB_LOCKTAB *lt;
969 	DB_LOCKOBJ *obj;
970 {
971 	struct __db_lock *lp_w, *lp_h, *next_waiter;
972 	int state_changed, waiter_is_txn;
973 
974 	/*
975 	 * We need to do lock promotion.  We also need to determine if
976 	 * we're going to need to run the deadlock detector again.  If
977 	 * we release locks, and there are waiters, but no one gets promoted,
978 	 * then we haven't fundamentally changed the lockmgr state, so
979 	 * we may still have a deadlock and we have to run again.  However,
980 	 * if there were no waiters, or we actually promoted someone, then
981 	 * we are OK and we don't have to run it immediately.
982 	 *
983 	 * During promotion, we look for state changes so we can return
984 	 * this information to the caller.
985 	 */
986 	for (lp_w = SH_TAILQ_FIRST(&obj->waiters, __db_lock),
987 	    state_changed = lp_w == NULL;
988 	    lp_w != NULL;
989 	    lp_w = next_waiter) {
990 		waiter_is_txn = TXN_IS_HOLDING(lp_w);
991 		next_waiter = SH_TAILQ_NEXT(lp_w, links, __db_lock);
992 		for (lp_h = SH_TAILQ_FIRST(&obj->holders, __db_lock);
993 		    lp_h != NULL;
994 		    lp_h = SH_TAILQ_NEXT(lp_h, links, __db_lock)) {
995 			if (CONFLICTS(lt, lp_h->mode, lp_w->mode) &&
996 			    lp_h->holder != lp_w->holder &&
997 			    !(waiter_is_txn &&
998 			    TXN_IS_HOLDING(lp_h) &&
999 			    __txn_is_ancestor(lt->dbenv->tx_info,
1000 			        lp_h->txnoff, lp_w->txnoff)))
1001 				break;
1002 		}
1003 		if (lp_h != NULL)	/* Found a conflict. */
1004 			break;
1005 
1006 		/* No conflict, promote the waiting lock. */
1007 		SH_TAILQ_REMOVE(&obj->waiters, lp_w, links, __db_lock);
1008 		lp_w->status = DB_LSTAT_PENDING;
1009 		SH_TAILQ_INSERT_TAIL(&obj->holders, lp_w, links);
1010 
1011 		/* Wake up waiter. */
1012 		(void)__db_mutex_unlock(&lp_w->mutex, lt->reginfo.fd);
1013 		state_changed = 1;
1014 	}
1015 
1016 	return (state_changed);
1017 }
1018 
1019 static int
1020 __lock_is_parent(locker, txn)
1021 	u_int32_t locker;
1022 	DB_TXN *txn;
1023 {
1024 	DB_TXN *t;
1025 
1026 	if (txn == NULL)
1027 		return (0);
1028 
1029 	for (t = txn->parent; t != NULL; t = t->parent)
1030 		if (t->txnid == locker)
1031 			return (1);
1032 
1033 	return (0);
1034 }
1035