xref: /freebsd/sys/kern/kern_rangelock.c (revision a58ece87303f882367105c92a27268ed6befa655)
/*-
 * SPDX-License-Identifier: BSD-2-Clause
 *
 * Copyright (c) 2009 Konstantin Belousov <kib@FreeBSD.org>
 * Copyright (c) 2023 The FreeBSD Foundation
 *
 * Portions of this software were developed by
 * Konstantin Belousov <kib@FreeBSD.org> under sponsorship from
 * the FreeBSD Foundation.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice unmodified, this list of conditions, and the following
 *    disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include <sys/param.h>
#include <sys/kassert.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/rangelock.h>
#include <sys/sleepqueue.h>
#include <sys/smr.h>
#include <sys/sysctl.h>

#include <vm/uma.h>
/*
 * Immediately after initialization (subject to 'rangelock_cheat'
 * below), and until a request arrives that conflicts with the
 * granted ones based on type, rangelocks serve requests in the
 * "cheating" mode.  In this mode, a rangelock behaves like an
 * sxlock, as if each request covered the whole range of the
 * protected object.  On receiving a conflicting request (any request
 * while a write request is effective, or any write request while
 * some read requests are effective), all requests granted in
 * "cheating" mode are drained, and the rangelock then switches to
 * keeping track of the precise range of each new request.
 *
 * The normal sx(9) implementation is not used, to avoid bloating the
 * structures (most importantly, vnodes) that embed rangelocks.
 *
 * Cheating greatly helps the very common pattern where a file is
 * first written single-threaded and then read by many processes.
 *
 * A lock is in cheat mode when the RL_CHEAT_CHEATING bit is set in
 * lock->head.  Special cookies are returned in this mode, and
 * trylocks behave the same as normal locks but do not drain.
 */

static int rangelock_cheat = 1;
SYSCTL_INT(_debug, OID_AUTO, rangelock_cheat, CTLFLAG_RWTUN,
    &rangelock_cheat, 0,
    "");

#define	RL_CHEAT_MASK		0x7
#define	RL_CHEAT_CHEATING	0x1
/* #define	RL_CHEAT_RLOCKED	0x0 */
#define	RL_CHEAT_WLOCKED	0x2
#define	RL_CHEAT_DRAINING	0x4

#define	RL_CHEAT_READER		0x8

#define	RL_RET_CHEAT_RLOCKED	0x1100
#define	RL_RET_CHEAT_WLOCKED	0x2200
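
/*
 * Illustrative sketch (inferred from the flag definitions above and
 * the code below, not an authoritative layout): while cheating,
 * lock->head packs the whole state into one word.  Bits 0-2 hold the
 * flags above, and the remaining bits count readers in
 * RL_CHEAT_READER units, for example:
 *
 *	RL_CHEAT_CHEATING				unlocked
 *	RL_CHEAT_CHEATING | RL_CHEAT_WLOCKED		write-locked
 *	RL_CHEAT_CHEATING | 2 * RL_CHEAT_READER		two readers
 *	any of the above | RL_CHEAT_DRAINING		draining; new
 *							requests must wait
 */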

static void
rangelock_cheat_drain(struct rangelock *lock)
{
	uintptr_t v;

	DROP_GIANT();
	for (;;) {
		v = atomic_load_ptr(&lock->head);
		if ((v & RL_CHEAT_DRAINING) == 0)
			break;
		sleepq_add(&lock->head, NULL, "ranged1", 0, 0);
		sleepq_wait(&lock->head, PRI_USER);
		sleepq_lock(&lock->head);
	}
	sleepq_release(&lock->head);
	PICKUP_GIANT();
}

static bool
rangelock_cheat_lock(struct rangelock *lock, int locktype, bool trylock,
    void **cookie)
{
	uintptr_t v, x;

	v = atomic_load_ptr(&lock->head);
	if ((v & RL_CHEAT_CHEATING) == 0)
		return (false);
	if ((v & RL_CHEAT_DRAINING) != 0) {
drain:
		if (trylock) {
			*cookie = NULL;
			return (true);
		}
		sleepq_lock(&lock->head);
drain1:
		rangelock_cheat_drain(lock);
		return (false);
	}

	switch (locktype) {
	case RL_LOCK_READ:
		for (;;) {
			if ((v & RL_CHEAT_WLOCKED) != 0) {
				if (trylock) {
					*cookie = NULL;
					return (true);
				}
				x = v | RL_CHEAT_DRAINING;
				sleepq_lock(&lock->head);
				if (atomic_fcmpset_rel_ptr(&lock->head, &v,
				    x) != 0)
					goto drain1;
				sleepq_release(&lock->head);
				/* Possibly forgive passed conflict */
			} else {
				x = (v & ~RL_CHEAT_MASK) + RL_CHEAT_READER;
				x |= RL_CHEAT_CHEATING;
				if (atomic_fcmpset_acq_ptr(&lock->head, &v,
				    x) != 0)
					break;
			}
			if ((v & RL_CHEAT_CHEATING) == 0)
				return (false);
			if ((v & RL_CHEAT_DRAINING) != 0)
				goto drain;
		}
		*(uintptr_t *)cookie = RL_RET_CHEAT_RLOCKED;
		break;
	case RL_LOCK_WRITE:
		for (;;) {
			if ((v & ~RL_CHEAT_MASK) >= RL_CHEAT_READER ||
			    (v & RL_CHEAT_WLOCKED) != 0) {
				if (trylock) {
					*cookie = NULL;
					return (true);
				}
				x = v | RL_CHEAT_DRAINING;
				sleepq_lock(&lock->head);
				if (atomic_fcmpset_rel_ptr(&lock->head, &v,
				    x) != 0)
					goto drain1;
				sleepq_release(&lock->head);
				/* Possibly forgive passed conflict */
			} else {
				x = RL_CHEAT_WLOCKED | RL_CHEAT_CHEATING;
				if (atomic_fcmpset_acq_ptr(&lock->head, &v,
				    x) != 0)
					break;
			}
			if ((v & RL_CHEAT_CHEATING) == 0)
				return (false);
			if ((v & RL_CHEAT_DRAINING) != 0)
				goto drain;
		}
		*(uintptr_t *)cookie = RL_RET_CHEAT_WLOCKED;
		break;
	default:
		__assert_unreachable();
		break;
	}
	return (true);
}

static bool
rangelock_cheat_unlock(struct rangelock *lock, void *cookie)
{
	uintptr_t v, x;

	v = atomic_load_ptr(&lock->head);
	if ((v & RL_CHEAT_CHEATING) == 0)
		return (false);

	MPASS((uintptr_t)cookie == RL_RET_CHEAT_WLOCKED ||
	    (uintptr_t)cookie == RL_RET_CHEAT_RLOCKED);

	switch ((uintptr_t)cookie) {
	case RL_RET_CHEAT_RLOCKED:
		for (;;) {
			MPASS((v & ~RL_CHEAT_MASK) >= RL_CHEAT_READER);
			MPASS((v & RL_CHEAT_WLOCKED) == 0);
			x = (v & ~RL_CHEAT_MASK) - RL_CHEAT_READER;
			if ((v & RL_CHEAT_DRAINING) != 0) {
				if (x != 0) {
					x |= RL_CHEAT_DRAINING |
					    RL_CHEAT_CHEATING;
					if (atomic_fcmpset_rel_ptr(&lock->head,
					    &v, x) != 0)
						break;
				} else {
					sleepq_lock(&lock->head);
					if (atomic_fcmpset_rel_ptr(&lock->head,
					    &v, x) != 0) {
						sleepq_broadcast(
						    &lock->head,
						    SLEEPQ_SLEEP, 0, 0);
						sleepq_release(&lock->head);
						break;
					}
					sleepq_release(&lock->head);
				}
			} else {
				x |= RL_CHEAT_CHEATING;
				if (atomic_fcmpset_rel_ptr(&lock->head, &v,
				    x) != 0)
					break;
			}
		}
		break;
	case RL_RET_CHEAT_WLOCKED:
		for (;;) {
			MPASS((v & RL_CHEAT_WLOCKED) != 0);
			if ((v & RL_CHEAT_DRAINING) != 0) {
				sleepq_lock(&lock->head);
				atomic_store_ptr(&lock->head, 0);
				sleepq_broadcast(&lock->head,
				    SLEEPQ_SLEEP, 0, 0);
				sleepq_release(&lock->head);
				break;
			} else {
				if (atomic_fcmpset_ptr(&lock->head, &v,
				    RL_CHEAT_CHEATING) != 0)
					break;
			}
		}
		break;
	default:
		__assert_unreachable();
		break;
	}
	return (true);
}

static bool
rangelock_cheat_destroy(struct rangelock *lock)
{
	uintptr_t v;

	v = atomic_load_ptr(&lock->head);
	if ((v & RL_CHEAT_CHEATING) == 0)
		return (false);
	MPASS(v == RL_CHEAT_CHEATING);
	return (true);
}

/*
 * Implementation of range locks based on the paper
 * https://doi.org/10.1145/3342195.3387533
 * arXiv:2006.12144v1 [cs.OS] 22 Jun 2020
 * Scalable Range Locks for Scalable Address Spaces and Beyond
 * by Alex Kogan, Dave Dice, and Shady Issa
 */

static struct rl_q_entry *rl_e_unmark(const struct rl_q_entry *e);

/*
 * rl_q_next links all granted ranges in the lock.  We cannot free an
 * rl_q_entry while inside the smr section, and cannot reuse the
 * rl_q_next linkage since other threads might still follow it even
 * after a CAS has removed the range.  Use rl_q_free for the local
 * list of ranges to be removed after the smr section is dropped.
 */
struct rl_q_entry {
	struct rl_q_entry *rl_q_next;
	struct rl_q_entry *rl_q_free;
	off_t		rl_q_start, rl_q_end;
	int		rl_q_flags;
#ifdef INVARIANTS
	struct thread	*rl_q_owner;
#endif
};
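
/*
 * Illustrative note (drawn from the marking helpers below, not an
 * addition to the design): a range is logically removed by setting
 * bit 0 of its rl_q_next pointer, so a walker can both detect the
 * removal and still recover the real successor:
 *
 *	next = rl_q_load(&e->rl_q_next);
 *	if (rl_e_is_marked(next))	(e has been unlocked)
 *		next = rl_e_unmark(next);
 */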

static uma_zone_t rl_entry_zone;
static smr_t rl_smr;

static void rangelock_free_free(struct rl_q_entry *free);
static void rangelock_noncheating_destroy(struct rangelock *lock);

static void
rangelock_sys_init(void)
{
	rl_entry_zone = uma_zcreate("rl_entry", sizeof(struct rl_q_entry),
	    NULL, NULL, NULL, NULL, UMA_ALIGNOF(struct rl_q_entry),
	    UMA_ZONE_SMR);
	rl_smr = uma_zone_get_smr(rl_entry_zone);
}
SYSINIT(rl, SI_SUB_LOCK, SI_ORDER_ANY, rangelock_sys_init, NULL);

static struct rl_q_entry *
rlqentry_alloc(vm_ooffset_t start, vm_ooffset_t end, int flags)
{
	struct rl_q_entry *e;
	struct thread *td;

	td = curthread;
	if (td->td_rlqe != NULL) {
		e = td->td_rlqe;
		td->td_rlqe = NULL;
	} else {
		e = uma_zalloc_smr(rl_entry_zone, M_WAITOK);
	}
	e->rl_q_next = NULL;
	e->rl_q_free = NULL;
	e->rl_q_start = start;
	e->rl_q_end = end;
	e->rl_q_flags = flags;
#ifdef INVARIANTS
	e->rl_q_owner = curthread;
#endif
	return (e);
}

void
rangelock_entry_free(struct rl_q_entry *e)
{
	uma_zfree_smr(rl_entry_zone, e);
}

void
rangelock_init(struct rangelock *lock)
{
	lock->sleepers = false;
	atomic_store_ptr(&lock->head, rangelock_cheat ? RL_CHEAT_CHEATING : 0);
}

void
rangelock_destroy(struct rangelock *lock)
{
	MPASS(!lock->sleepers);
	if (!rangelock_cheat_destroy(lock))
		rangelock_noncheating_destroy(lock);
	DEBUG_POISON_POINTER(*(void **)&lock->head);
}

static bool
rl_e_is_marked(const struct rl_q_entry *e)
{
	return (((uintptr_t)e & 1) != 0);
}

static struct rl_q_entry *
rl_e_unmark_unchecked(const struct rl_q_entry *e)
{
	return ((struct rl_q_entry *)((uintptr_t)e & ~1));
}

static struct rl_q_entry *
rl_e_unmark(const struct rl_q_entry *e)
{
	MPASS(rl_e_is_marked(e));
	return (rl_e_unmark_unchecked(e));
}

static void
rl_e_mark(struct rl_q_entry *e)
{
#if defined(INVARIANTS) && defined(__LP64__)
	int r = atomic_testandset_long((uintptr_t *)&e->rl_q_next, 0);
	MPASS(r == 0);
#else
	atomic_set_ptr((uintptr_t *)&e->rl_q_next, 1);
#endif
}

static struct rl_q_entry *
rl_q_load(struct rl_q_entry **p)
{
	return ((struct rl_q_entry *)atomic_load_acq_ptr((uintptr_t *)p));
}

static bool
rl_e_is_rlock(const struct rl_q_entry *e)
{
	return ((e->rl_q_flags & RL_LOCK_TYPE_MASK) == RL_LOCK_READ);
}

static void
rangelock_free_free(struct rl_q_entry *free)
{
	struct rl_q_entry *x, *xp;
	struct thread *td;

	td = curthread;
	for (x = free; x != NULL; x = xp) {
		MPASS(!rl_e_is_marked(x));
		xp = x->rl_q_free;
		MPASS(!rl_e_is_marked(xp));
		if (td->td_rlqe == NULL) {
			smr_synchronize(rl_smr);
			td->td_rlqe = x;
		} else {
			uma_zfree_smr(rl_entry_zone, x);
		}
	}
}

static void
rangelock_unlock_int(struct rangelock *lock, struct rl_q_entry *e)
{
	bool sleepers;

	MPASS(lock != NULL && e != NULL);
	MPASS(!rl_e_is_marked(rl_q_load(&e->rl_q_next)));
	MPASS(e->rl_q_owner == curthread);

	rl_e_mark(e);
	sleepers = lock->sleepers;
	lock->sleepers = false;
	if (sleepers)
		sleepq_broadcast(&lock->sleepers, SLEEPQ_SLEEP, 0, 0);
}

void
rangelock_unlock(struct rangelock *lock, void *cookie)
{
	if (rangelock_cheat_unlock(lock, cookie))
		return;

	sleepq_lock(&lock->sleepers);
	rangelock_unlock_int(lock, cookie);
	sleepq_release(&lock->sleepers);
}
/*
 * result: -1 if e1 is entirely before e2, or both locks are readers
 *		and e1 starts at or before e2
 *          1 if e1 is entirely after e2, or both locks are readers
 *		and e1 starts after e2
 *          0 if e1 and e2 overlap and at least one lock is a writer
 */
static int
rl_e_compare(const struct rl_q_entry *e1, const struct rl_q_entry *e2)
{
	bool rds;

	if (e1 == NULL)
		return (1);
	if (e2->rl_q_start >= e1->rl_q_end)
		return (-1);
	rds = rl_e_is_rlock(e1) && rl_e_is_rlock(e2);
	if (e2->rl_q_start >= e1->rl_q_start && rds)
		return (-1);
	if (e1->rl_q_start >= e2->rl_q_end)
		return (1);
	if (e1->rl_q_start >= e2->rl_q_start && rds)
		return (1);
	return (0);
}
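
/*
 * Worked example (illustrative values only): with both entries
 * readers,
 *	e1 = [0, 10), e2 = [5, 15)	=> -1 (e1 sorts before e2 by start)
 *	e1 = [5, 15), e2 = [0, 10)	=>  1
 * while if either entry is a writer,
 *	e1 = [0, 10), e2 = [5, 15)	=>  0 (conflict; the requester
 *					       must wait or fail a trylock)
 */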

static void
rl_insert_sleep(struct rangelock *lock)
{
	smr_exit(rl_smr);
	DROP_GIANT();
	lock->sleepers = true;
	sleepq_add(&lock->sleepers, NULL, "rangelk", 0, 0);
	sleepq_wait(&lock->sleepers, PRI_USER);
	PICKUP_GIANT();
	smr_enter(rl_smr);
}

static bool
rl_q_cas(struct rl_q_entry **prev, struct rl_q_entry *old,
    struct rl_q_entry *new)
{
	MPASS(!rl_e_is_marked(old));
	return (atomic_cmpset_rel_ptr((uintptr_t *)prev, (uintptr_t)old,
	    (uintptr_t)new) != 0);
}

static void
rangelock_noncheating_destroy(struct rangelock *lock)
{
	struct rl_q_entry *cur, *free, *next, **prev;

	free = NULL;
again:
	smr_enter(rl_smr);
	prev = (struct rl_q_entry **)&lock->head;
	cur = rl_q_load(prev);
	MPASS(!rl_e_is_marked(cur));

	for (;;) {
		if (cur == NULL)
			break;
		if (rl_e_is_marked(cur))
			goto again;

		next = rl_q_load(&cur->rl_q_next);
		if (rl_e_is_marked(next)) {
			next = rl_e_unmark(next);
			if (rl_q_cas(prev, cur, next)) {
#ifdef INVARIANTS
				cur->rl_q_owner = NULL;
#endif
				cur->rl_q_free = free;
				free = cur;
				cur = next;
				continue;
			}
			smr_exit(rl_smr);
			goto again;
		}

		sleepq_lock(&lock->sleepers);
		if (!rl_e_is_marked(cur)) {
			rl_insert_sleep(lock);
			goto again;
		}
	}
	smr_exit(rl_smr);
	rangelock_free_free(free);
}

enum RL_INSERT_RES {
	RL_TRYLOCK_FAILED,
	RL_LOCK_SUCCESS,
	RL_LOCK_RETRY,
};

static enum RL_INSERT_RES
rl_r_validate(struct rangelock *lock, struct rl_q_entry *e, bool trylock,
    struct rl_q_entry **free)
{
	struct rl_q_entry *cur, *next, **prev;

again:
	prev = &e->rl_q_next;
	cur = rl_q_load(prev);
	MPASS(!rl_e_is_marked(cur));	/* nobody can unlock e yet */
	for (;;) {
		if (cur == NULL || cur->rl_q_start >= e->rl_q_end)
			return (RL_LOCK_SUCCESS);
		next = rl_q_load(&cur->rl_q_next);
		if (rl_e_is_marked(next)) {
			next = rl_e_unmark(next);
			if (rl_q_cas(prev, cur, next)) {
				cur->rl_q_free = *free;
				*free = cur;
				cur = next;
				continue;
			}
			goto again;
		}
		if (rl_e_is_rlock(cur)) {
			prev = &cur->rl_q_next;
			cur = rl_e_unmark_unchecked(rl_q_load(prev));
			continue;
		}
		if (!rl_e_is_marked(rl_q_load(&cur->rl_q_next))) {
			sleepq_lock(&lock->sleepers);
			if (rl_e_is_marked(rl_q_load(&cur->rl_q_next))) {
				sleepq_release(&lock->sleepers);
				continue;
			}
			rangelock_unlock_int(lock, e);
			if (trylock) {
				sleepq_release(&lock->sleepers);
				return (RL_TRYLOCK_FAILED);
			}
			rl_insert_sleep(lock);
			return (RL_LOCK_RETRY);
		}
	}
}

static enum RL_INSERT_RES
rl_w_validate(struct rangelock *lock, struct rl_q_entry *e,
    bool trylock, struct rl_q_entry **free)
{
	struct rl_q_entry *cur, *next, **prev;

again:
	prev = (struct rl_q_entry **)&lock->head;
	cur = rl_q_load(prev);
	MPASS(!rl_e_is_marked(cur));	/* head is not marked */
	for (;;) {
		if (cur == e)
			return (RL_LOCK_SUCCESS);
		next = rl_q_load(&cur->rl_q_next);
		if (rl_e_is_marked(next)) {
			next = rl_e_unmark(next);
			if (rl_q_cas(prev, cur, next)) {
				cur->rl_q_free = *free;
				*free = cur;
				cur = next;
				continue;
			}
			goto again;
		}
		if (cur->rl_q_end <= e->rl_q_start) {
			prev = &cur->rl_q_next;
			cur = rl_e_unmark_unchecked(rl_q_load(prev));
			continue;
		}
		sleepq_lock(&lock->sleepers);
		/* Reload after sleepq is locked */
		next = rl_q_load(&cur->rl_q_next);
		if (rl_e_is_marked(next)) {
			sleepq_release(&lock->sleepers);
			goto again;
		}
		rangelock_unlock_int(lock, e);
		if (trylock) {
			sleepq_release(&lock->sleepers);
			return (RL_TRYLOCK_FAILED);
		}
		rl_insert_sleep(lock);
		return (RL_LOCK_RETRY);
	}
}

static enum RL_INSERT_RES
rl_insert(struct rangelock *lock, struct rl_q_entry *e, bool trylock,
    struct rl_q_entry **free)
{
	struct rl_q_entry *cur, *next, **prev;
	int r;

again:
	prev = (struct rl_q_entry **)&lock->head;
	cur = rl_q_load(prev);
	if (cur == NULL && rl_q_cas(prev, NULL, e))
		return (RL_LOCK_SUCCESS);

	for (;;) {
		if (cur != NULL) {
			if (rl_e_is_marked(cur))
				goto again;

			next = rl_q_load(&cur->rl_q_next);
			if (rl_e_is_marked(next)) {
				next = rl_e_unmark(next);
				if (rl_q_cas(prev, cur, next)) {
#ifdef INVARIANTS
					cur->rl_q_owner = NULL;
#endif
					cur->rl_q_free = *free;
					*free = cur;
					cur = next;
					continue;
				}
				goto again;
			}
		}

		MPASS(!rl_e_is_marked(cur));
		r = rl_e_compare(cur, e);
		if (r == -1) {
			prev = &cur->rl_q_next;
			cur = rl_q_load(prev);
		} else if (r == 0) {
			sleepq_lock(&lock->sleepers);
			if (__predict_false(rl_e_is_marked(rl_q_load(
			    &cur->rl_q_next)))) {
				sleepq_release(&lock->sleepers);
				continue;
			}
			if (trylock) {
				sleepq_release(&lock->sleepers);
				return (RL_TRYLOCK_FAILED);
			}
			rl_insert_sleep(lock);
			/* e is still valid */
			goto again;
		} else /* r == 1 */ {
			e->rl_q_next = cur;
			if (rl_q_cas(prev, cur, e)) {
				atomic_thread_fence_acq();
				return (rl_e_is_rlock(e) ?
				    rl_r_validate(lock, e, trylock, free) :
				    rl_w_validate(lock, e, trylock, free));
			}
			/* Reset rl_q_next in case we hit fast path. */
			e->rl_q_next = NULL;
			cur = rl_q_load(prev);
		}
	}
}

static struct rl_q_entry *
rangelock_lock_int(struct rangelock *lock, bool trylock, vm_ooffset_t start,
    vm_ooffset_t end, int locktype)
{
	struct rl_q_entry *e, *free;
	void *cookie;
	enum RL_INSERT_RES res;

	if (rangelock_cheat_lock(lock, locktype, trylock, &cookie))
		return (cookie);
	for (res = RL_LOCK_RETRY; res == RL_LOCK_RETRY;) {
		free = NULL;
		e = rlqentry_alloc(start, end, locktype);
		smr_enter(rl_smr);
		res = rl_insert(lock, e, trylock, &free);
		smr_exit(rl_smr);
		if (res == RL_TRYLOCK_FAILED) {
			MPASS(trylock);
			e->rl_q_free = free;
			free = e;
			e = NULL;
		}
		rangelock_free_free(free);
	}
	return (e);
}

void *
rangelock_rlock(struct rangelock *lock, vm_ooffset_t start, vm_ooffset_t end)
{
	return (rangelock_lock_int(lock, false, start, end, RL_LOCK_READ));
}

void *
rangelock_tryrlock(struct rangelock *lock, vm_ooffset_t start, vm_ooffset_t end)
{
	return (rangelock_lock_int(lock, true, start, end, RL_LOCK_READ));
}

void *
rangelock_wlock(struct rangelock *lock, vm_ooffset_t start, vm_ooffset_t end)
{
	return (rangelock_lock_int(lock, false, start, end, RL_LOCK_WRITE));
}

void *
rangelock_trywlock(struct rangelock *lock, vm_ooffset_t start, vm_ooffset_t end)
{
	return (rangelock_lock_int(lock, true, start, end, RL_LOCK_WRITE));
}

/*
 * If the caller asserts that it may acquire several range locks on
 * the same lock simultaneously, switch to the non-cheat mode.  Cheat
 * mode cannot handle that and would hang in the drain or in trylock
 * retries.
 */
void
rangelock_may_recurse(struct rangelock *lock)
{
	uintptr_t v, x;

	v = atomic_load_ptr(&lock->head);
	if ((v & RL_CHEAT_CHEATING) == 0)
		return;

	sleepq_lock(&lock->head);
	for (;;) {
		if ((v & RL_CHEAT_CHEATING) == 0) {
			sleepq_release(&lock->head);
			return;
		}

		/* Cheating and locked, drain. */
		if ((v & RL_CHEAT_WLOCKED) != 0 ||
		    (v & ~RL_CHEAT_MASK) >= RL_CHEAT_READER) {
			x = v | RL_CHEAT_DRAINING;
			if (atomic_fcmpset_ptr(&lock->head, &v, x) != 0) {
				rangelock_cheat_drain(lock);
				return;
			}
			continue;
		}

		/* Cheating and unlocked, clear RL_CHEAT_CHEATING. */
		x = 0;
		if (atomic_fcmpset_ptr(&lock->head, &v, x) != 0) {
			sleepq_release(&lock->head);
			return;
		}
	}
}
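
/*
 * Hypothetical usage sketch (not taken from an in-tree caller; 'obj'
 * and its 'rl' member are made-up names): a subsystem that needs to
 * hold two ranges of the same object at once opts out of cheating
 * first, since cheat mode treats every request as covering the whole
 * object and the second acquisition would otherwise drain against
 * the first one forever:
 *
 *	rangelock_may_recurse(&obj->rl);
 *	c1 = rangelock_wlock(&obj->rl, 0, 4096);
 *	c2 = rangelock_rlock(&obj->rl, 8192, 16384);
 *	...
 *	rangelock_unlock(&obj->rl, c2);
 *	rangelock_unlock(&obj->rl, c1);
 */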

#ifdef INVARIANT_SUPPORT
void
_rangelock_cookie_assert(void *cookie, int what, const char *file, int line)
{
}
#endif	/* INVARIANT_SUPPORT */

#include "opt_ddb.h"
#ifdef DDB
#include <ddb/ddb.h>

DB_SHOW_COMMAND(rangelock, db_show_rangelock)
{
	struct rangelock *lock;
	struct rl_q_entry *e, *x;
	uintptr_t v;

	if (!have_addr) {
		db_printf("show rangelock addr\n");
		return;
	}

	lock = (struct rangelock *)addr;
	db_printf("rangelock %p sleepers %d\n", lock, lock->sleepers);
	v = lock->head;
	if ((v & RL_CHEAT_CHEATING) != 0) {
		db_printf("  cheating head %#jx\n", (uintmax_t)v);
		return;
	}
	for (e = (struct rl_q_entry *)(lock->head);;) {
		x = rl_e_is_marked(e) ? rl_e_unmark(e) : e;
		if (x == NULL)
			break;
		db_printf("  entry %p marked %d %d start %#jx end %#jx "
		    "flags %x next %p",
		    e, rl_e_is_marked(e), rl_e_is_marked(x->rl_q_next),
		    x->rl_q_start, x->rl_q_end, x->rl_q_flags, x->rl_q_next);
#ifdef INVARIANTS
		db_printf(" owner %p (%d)", x->rl_q_owner,
		    x->rl_q_owner != NULL ? x->rl_q_owner->td_tid : -1);
#endif
		db_printf("\n");
		e = x->rl_q_next;
	}
}

#endif	/* DDB */