/*-
 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
 *
 * Copyright (c) 2009 Konstantin Belousov <kib@FreeBSD.org>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice unmodified, this list of conditions, and the following
 *    disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/rangelock.h>
#include <sys/systm.h>

#include <vm/uma.h>

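/*
 * A rangelock arbitrates access to byte ranges of an object, typically
 * a file: any number of read locks may cover the same bytes, while a
 * write lock excludes all other locks over its range.  Each request is
 * an rl_q_entry, defined below.  All requests, granted and waiting, sit
 * on the rl_waiters queue, and every granted request precedes
 * lock->rl_currdep, which points to the first request still blocked by
 * a conflicting request ahead of it (NULL when nothing waits).  The
 * queue is protected by an interlock mutex that the caller passes to
 * every function here.
 *
 * A minimal usage sketch (hypothetical caller; the interlock name and
 * the offsets are illustrative only):
 *
 *	struct rangelock rl;
 *	struct mtx ilk;
 *	void *cookie;
 *
 *	mtx_init(&ilk, "rlilk", NULL, MTX_DEF);
 *	rangelock_init(&rl);
 *	cookie = rangelock_wlock(&rl, off, off + len, &ilk);
 *	... modify the bytes in [off, off + len) ...
 *	rangelock_unlock(&rl, cookie, &ilk);
 *	rangelock_destroy(&rl);
 */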
struct rl_q_entry {
	TAILQ_ENTRY(rl_q_entry) rl_q_link;
	off_t		rl_q_start, rl_q_end;
	int		rl_q_flags;
};

static uma_zone_t rl_entry_zone;

static void
rangelock_sys_init(void)
{

	rl_entry_zone = uma_zcreate("rl_entry", sizeof(struct rl_q_entry),
	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
}
SYSINIT(vfs, SI_SUB_LOCK, SI_ORDER_ANY, rangelock_sys_init, NULL);

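/*
 * Each thread keeps one spare queue entry cached in td_rlqe so that the
 * common lock/unlock cycle avoids the UMA zone: rangelock_enqueue()
 * consumes the cached entry when one is present, and
 * rangelock_unlock_locked() refills the cache with the entry being
 * released, falling back to rlqentry_alloc() and rlqentry_free() when
 * the cache is empty or already full.
 */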
static struct rl_q_entry *
rlqentry_alloc(void)
{

	return (uma_zalloc(rl_entry_zone, M_WAITOK));
}

void
rlqentry_free(struct rl_q_entry *rleq)
{

	uma_zfree(rl_entry_zone, rleq);
}

void
rangelock_init(struct rangelock *lock)
{

	TAILQ_INIT(&lock->rl_waiters);
	lock->rl_currdep = NULL;
}

void
rangelock_destroy(struct rangelock *lock)
{

	KASSERT(TAILQ_EMPTY(&lock->rl_waiters), ("Dangling waiters"));
}

/*
 * Two lock requests are compatible if their ranges do not overlap, or
 * if both requests are for read.  ranges_overlap() checks only the
 * first condition; the read/read case is handled by the caller.
 */
static int
ranges_overlap(const struct rl_q_entry *e1,
    const struct rl_q_entry *e2)
{

	if (e1->rl_q_start < e2->rl_q_end && e1->rl_q_end > e2->rl_q_start)
		return (1);
	return (0);
}
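
/*
 * The ranges are half-open, [rl_q_start, rl_q_end), so adjacent
 * requests never conflict.  For example (illustrative offsets),
 * [0, 10) and [10, 20) do not overlap, while [0, 10) and [5, 15) do.
 */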

/*
 * Recalculate the lock->rl_currdep after an unlock: walk the queue
 * starting at the old rl_currdep, grant every request that no longer
 * conflicts with the requests ahead of it, and leave rl_currdep
 * pointing at the first request that is still blocked (NULL if none
 * remains).
 */
static void
rangelock_calc_block(struct rangelock *lock)
{
	struct rl_q_entry *entry, *nextentry, *entry1;

	for (entry = lock->rl_currdep; entry != NULL; entry = nextentry) {
		nextentry = TAILQ_NEXT(entry, rl_q_link);
		if (entry->rl_q_flags & RL_LOCK_READ) {
			/* Reads must not overlap with granted writes. */
			for (entry1 = TAILQ_FIRST(&lock->rl_waiters);
			    !(entry1->rl_q_flags & RL_LOCK_READ);
			    entry1 = TAILQ_NEXT(entry1, rl_q_link)) {
				if (ranges_overlap(entry, entry1))
					goto out;
			}
		} else {
			/* Write must not overlap with any granted locks. */
			for (entry1 = TAILQ_FIRST(&lock->rl_waiters);
			    entry1 != entry;
			    entry1 = TAILQ_NEXT(entry1, rl_q_link)) {
				if (ranges_overlap(entry, entry1))
					goto out;
			}

			/* Move grantable write locks to the front. */
			TAILQ_REMOVE(&lock->rl_waiters, entry, rl_q_link);
			TAILQ_INSERT_HEAD(&lock->rl_waiters, entry, rl_q_link);
		}

		/* Grant this lock. */
		entry->rl_q_flags |= RL_LOCK_GRANTED;
		wakeup(entry);
	}
out:
	lock->rl_currdep = entry;
}
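
/*
 * A worked example (illustrative offsets): let the queue hold a granted
 * write W [0, 10) followed by waiting reads R1 [5, 15) and R2 [20, 30),
 * with rl_currdep == R1.  The walk stops at R1, which overlaps W, so
 * even R2, which conflicts with nothing, keeps waiting: requests are
 * granted strictly in queue order, which keeps later arrivals from
 * starving a blocked request.  Once W is unlocked, the recalculation
 * grants both R1 and R2 and rl_currdep becomes NULL.
 */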

static void
rangelock_unlock_locked(struct rangelock *lock, struct rl_q_entry *entry,
    struct mtx *ilk, bool do_calc_block)
{

	MPASS(lock != NULL && entry != NULL && ilk != NULL);
	mtx_assert(ilk, MA_OWNED);

	if (!do_calc_block) {
		/*
		 * This is the case where rangelock_enqueue() has been called
		 * with trylock == true and just inserted this entry in the
		 * queue.  If rl_currdep is this entry, it must be advanced
		 * to the next entry in the rl_waiters list.  However, since
		 * this entry is the last entry in the list, the next entry
		 * is NULL.
		 */
		if (lock->rl_currdep == entry) {
			KASSERT(TAILQ_NEXT(lock->rl_currdep, rl_q_link) == NULL,
			    ("rangelock_enqueue: next entry not NULL"));
			lock->rl_currdep = NULL;
		}
	} else
		KASSERT(entry != lock->rl_currdep, ("stuck currdep"));

	TAILQ_REMOVE(&lock->rl_waiters, entry, rl_q_link);
	if (do_calc_block)
		rangelock_calc_block(lock);
	mtx_unlock(ilk);
	if (curthread->td_rlqe == NULL)
		curthread->td_rlqe = entry;
	else
		rlqentry_free(entry);
}

void
rangelock_unlock(struct rangelock *lock, void *cookie, struct mtx *ilk)
{

	MPASS(lock != NULL && cookie != NULL && ilk != NULL);

	mtx_lock(ilk);
	rangelock_unlock_locked(lock, cookie, ilk, true);
}

/*
 * Unlock a sub-range of the granted lock: trim the locked range to
 * [start, end), releasing the bytes from end up to the old end of the
 * range.  Returns the cookie for the remaining range, or NULL if the
 * whole range was unlocked.
 */
void *
rangelock_unlock_range(struct rangelock *lock, void *cookie, off_t start,
    off_t end, struct mtx *ilk)
{
	struct rl_q_entry *entry;

	MPASS(lock != NULL && cookie != NULL && ilk != NULL);
	entry = cookie;
	KASSERT(entry->rl_q_flags & RL_LOCK_GRANTED,
	    ("Unlocking non-granted lock"));
	KASSERT(entry->rl_q_start == start, ("wrong start"));
	KASSERT(entry->rl_q_end >= end, ("wrong end"));

	mtx_lock(ilk);
	if (entry->rl_q_end == end) {
		rangelock_unlock_locked(lock, cookie, ilk, true);
		return (NULL);
	}
	entry->rl_q_end = end;
	rangelock_calc_block(lock);
	mtx_unlock(ilk);
	return (cookie);
}
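
/*
 * For example (illustrative offsets): a thread holding a write lock on
 * [0, 100) that is done with the tail of the range can shrink the lock
 * with
 *
 *	cookie = rangelock_unlock_range(&rl, cookie, 0, 50, &ilk);
 *
 * after which only [0, 50) remains locked and waiters overlapping
 * [50, 100) may be granted.
 */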

/*
 * Add the lock request to the queue of pending requests for the
 * rangelock.  Unless trylock is true, sleep until the request can be
 * granted; with trylock, return NULL instead of sleeping when the
 * request cannot be granted immediately.
 */
static void *
rangelock_enqueue(struct rangelock *lock, off_t start, off_t end, int mode,
    struct mtx *ilk, bool trylock)
{
	struct rl_q_entry *entry;
	struct thread *td;

	MPASS(lock != NULL && ilk != NULL);

	td = curthread;
	if (td->td_rlqe != NULL) {
		entry = td->td_rlqe;
		td->td_rlqe = NULL;
	} else
		entry = rlqentry_alloc();
	MPASS(entry != NULL);
	entry->rl_q_flags = mode;
	entry->rl_q_start = start;
	entry->rl_q_end = end;

	mtx_lock(ilk);
	/*
	 * XXXKIB TODO. Check that a thread does not try to enqueue a
	 * lock that is incompatible with another request from the same
	 * thread.
	 */

	TAILQ_INSERT_TAIL(&lock->rl_waiters, entry, rl_q_link);
	/*
	 * If rl_currdep == NULL, there is no entry waiting for a conflicting
	 * range to be resolved, so set rl_currdep to this entry.  If there is
	 * no conflicting entry for this entry, rl_currdep will be set back to
	 * NULL by rangelock_calc_block().
	 */
	if (lock->rl_currdep == NULL)
		lock->rl_currdep = entry;
	rangelock_calc_block(lock);
	while (!(entry->rl_q_flags & RL_LOCK_GRANTED)) {
		if (trylock) {
			/*
			 * For this case, the range is not actually locked
			 * yet, but removal from the list requires the same
			 * steps, except for not doing a rangelock_calc_block()
			 * call, since rangelock_calc_block() was called above.
			 */
			rangelock_unlock_locked(lock, entry, ilk, false);
			return (NULL);
		}
		msleep(entry, ilk, 0, "range", 0);
	}
	mtx_unlock(ilk);
	return (entry);
}

void *
rangelock_rlock(struct rangelock *lock, off_t start, off_t end, struct mtx *ilk)
{

	return (rangelock_enqueue(lock, start, end, RL_LOCK_READ, ilk, false));
}

void *
rangelock_tryrlock(struct rangelock *lock, off_t start, off_t end,
    struct mtx *ilk)
{

	return (rangelock_enqueue(lock, start, end, RL_LOCK_READ, ilk, true));
}

void *
rangelock_wlock(struct rangelock *lock, off_t start, off_t end, struct mtx *ilk)
{

	return (rangelock_enqueue(lock, start, end, RL_LOCK_WRITE, ilk, false));
}

void *
rangelock_trywlock(struct rangelock *lock, off_t start, off_t end,
    struct mtx *ilk)
{

	return (rangelock_enqueue(lock, start, end, RL_LOCK_WRITE, ilk, true));
}
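
/*
 * A trylock sketch (hypothetical caller; the EWOULDBLOCK policy is the
 * caller's choice, not part of this interface):
 *
 *	void *cookie;
 *
 *	cookie = rangelock_trywlock(&rl, off, off + len, &ilk);
 *	if (cookie == NULL)
 *		return (EWOULDBLOCK);	(the range is busy)
 *	... access the bytes in [off, off + len) ...
 *	rangelock_unlock(&rl, cookie, &ilk);
 */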