xref: /freebsd/sys/kern/kern_rangelock.c (revision f5541f9f473430a3e608e07f623294322853d25a)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause
3  *
4  * Copyright (c) 2009 Konstantin Belousov <kib@FreeBSD.org>
5  * Copyright (c) 2023 The FreeBSD Foundation
6  *
7  * Portions of this software were developed by
8  * Konstantin Belousov <kib@FreeBSD.org> under sponsorship from
9  * the FreeBSD Foundation.
10  *
11  * Redistribution and use in source and binary forms, with or without
12  * modification, are permitted provided that the following conditions
13  * are met:
14  * 1. Redistributions of source code must retain the above copyright
15  *    notice unmodified, this list of conditions, and the following
16  *    disclaimer.
17  * 2. Redistributions in binary form must reproduce the above copyright
18  *    notice, this list of conditions and the following disclaimer in the
19  *    documentation and/or other materials provided with the distribution.
20  *
21  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
22  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
23  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
24  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
25  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
26  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
27  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
28  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
29  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
30  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
31  */
32 
33 #include <sys/param.h>
34 #include <sys/kassert.h>
35 #include <sys/kernel.h>
36 #include <sys/lock.h>
37 #include <sys/mutex.h>
38 #include <sys/proc.h>
39 #include <sys/rangelock.h>
40 #include <sys/sleepqueue.h>
41 #include <sys/smr.h>
42 #include <sys/sysctl.h>
43 
44 #include <vm/uma.h>
45 
46 /*
47  * Immediately after initialization (subject to 'rangelock_cheat'
48  * below), and until a request comes that conflicts with granted ones
49  * based on type, rangelocks serve requests in the "cheating" mode.
50  * In this mode, a rangelock behaves like a sxlock, as if each request
51  * covered the whole range of the protected object.  On receiving a
52  * conflicting request (any request while a write request is
53  * effective, or any write request while some read ones are
54  * effective), all requests granted in "cheating" mode are drained,
55  * and the rangelock then switches to effectively keeping track of the
56  * precise range of each new request.
57  *
58  * Normal sx implementation is not used to not bloat structures (most
59  * important, vnodes) which embeds rangelocks.
60  *
61  * The cheating greatly helps very common pattern where file is first
62  * written single-threaded, and then read by many processes.
63  *
64  * Lock is in cheat mode when RL_CHEAT_CHEATING bit is set in the
65  * lock->head.  Special cookies are returned in this mode, and
66  * trylocks are same as normal locks but do not drain.
67  */
68 
69 static int rangelock_cheat = 1;
70 SYSCTL_INT(_debug, OID_AUTO, rangelock_cheat, CTLFLAG_RWTUN,
71     &rangelock_cheat, 0,
72     "");
73 
74 #define	RL_CHEAT_MASK		0x7
75 #define	RL_CHEAT_CHEATING	0x1
76 /* #define	RL_CHEAT_RLOCKED	0x0 */
77 #define	RL_CHEAT_WLOCKED	0x2
78 #define	RL_CHEAT_DRAINING	0x4
79 
80 #define	RL_CHEAT_READER		0x8
81 
82 #define	RL_RET_CHEAT_RLOCKED	0x1100
83 #define	RL_RET_CHEAT_WLOCKED	0x2200
84 
85 static void
86 rangelock_cheat_drain(struct rangelock *lock)
87 {
88 	uintptr_t v;
89 
90 	DROP_GIANT();
91 	for (;;) {
92 		v = atomic_load_ptr(&lock->head);
93 		if ((v & RL_CHEAT_DRAINING) == 0)
94 			break;
95 		sleepq_add(&lock->head, NULL, "ranged1", 0, 0);
96 		sleepq_wait(&lock->head, PRI_USER);
97 		sleepq_lock(&lock->head);
98 	}
99 	sleepq_release(&lock->head);
100 	PICKUP_GIANT();
101 }
102 
103 static bool
104 rangelock_cheat_lock(struct rangelock *lock, int locktype, bool trylock,
105     void **cookie)
106 {
107 	uintptr_t v, x;
108 
109 	v = atomic_load_ptr(&lock->head);
110 	if ((v & RL_CHEAT_CHEATING) == 0)
111 		return (false);
112 	if ((v & RL_CHEAT_DRAINING) != 0) {
113 drain:
114 		if (trylock) {
115 			*cookie = NULL;
116 			return (true);
117 		}
118 		sleepq_lock(&lock->head);
119 drain1:
120 		rangelock_cheat_drain(lock);
121 		return (false);
122 	}
123 
124 	switch (locktype) {
125 	case RL_LOCK_READ:
126 		for (;;) {
127 			if ((v & RL_CHEAT_WLOCKED) != 0) {
128 				if (trylock) {
129 					*cookie = NULL;
130 					return (true);
131 				}
132 				x = v | RL_CHEAT_DRAINING;
133 				sleepq_lock(&lock->head);
134 				if (atomic_fcmpset_rel_ptr(&lock->head, &v,
135 				    x) != 0)
136 					goto drain1;
137 				sleepq_release(&lock->head);
138 				/* Possibly forgive passed conflict */
139 			} else {
140 				x = (v & ~RL_CHEAT_MASK) + RL_CHEAT_READER;
141 				x |= RL_CHEAT_CHEATING;
142 				if (atomic_fcmpset_acq_ptr(&lock->head, &v,
143 				    x) != 0)
144 					break;
145 			}
146 			if ((v & RL_CHEAT_CHEATING) == 0)
147 				return (false);
148 			if ((v & RL_CHEAT_DRAINING) != 0)
149 				goto drain;
150 		}
151 		*(uintptr_t *)cookie = RL_RET_CHEAT_RLOCKED;
152 		break;
153 	case RL_LOCK_WRITE:
154 		for (;;) {
155 			if ((v & ~RL_CHEAT_MASK) >= RL_CHEAT_READER ||
156 			    (v & RL_CHEAT_WLOCKED) != 0) {
157 				if (trylock) {
158 					*cookie = NULL;
159 					return (true);
160 				}
161 				x = v | RL_CHEAT_DRAINING;
162 				sleepq_lock(&lock->head);
163 				if (atomic_fcmpset_rel_ptr(&lock->head, &v,
164 				    x) != 0)
165 					goto drain1;
166 				sleepq_release(&lock->head);
167 				/* Possibly forgive passed conflict */
168 			} else {
169 				x = RL_CHEAT_WLOCKED | RL_CHEAT_CHEATING;
170 				if (atomic_fcmpset_acq_ptr(&lock->head, &v,
171 				    x) != 0)
172 					break;
173 			}
174 			if ((v & RL_CHEAT_CHEATING) == 0)
175 				return (false);
176 			if ((v & RL_CHEAT_DRAINING) != 0)
177 				goto drain;
178 		}
179 		*(uintptr_t *)cookie = RL_RET_CHEAT_WLOCKED;
180 		break;
181 	default:
182 		__assert_unreachable();
183 		break;
184 	}
185 	return (true);
186 }
187 
188 static bool
189 rangelock_cheat_unlock(struct rangelock *lock, void *cookie)
190 {
191 	uintptr_t v, x;
192 
193 	v = atomic_load_ptr(&lock->head);
194 	if ((v & RL_CHEAT_CHEATING) == 0)
195 		return (false);
196 
197 	MPASS((uintptr_t)cookie == RL_RET_CHEAT_WLOCKED ||
198 	    (uintptr_t)cookie == RL_RET_CHEAT_RLOCKED);
199 
200 	switch ((uintptr_t)cookie) {
201 	case RL_RET_CHEAT_RLOCKED:
202 		for (;;) {
203 			MPASS((v & ~RL_CHEAT_MASK) >= RL_CHEAT_READER);
204 			MPASS((v & RL_CHEAT_WLOCKED) == 0);
205 			x = (v & ~RL_CHEAT_MASK) - RL_CHEAT_READER;
206 			if ((v & RL_CHEAT_DRAINING) != 0) {
207 				if (x != 0) {
208 					x |= RL_CHEAT_DRAINING |
209 					    RL_CHEAT_CHEATING;
210 					if (atomic_fcmpset_rel_ptr(&lock->head,
211 					    &v, x) != 0)
212 						break;
213 				} else {
214 					sleepq_lock(&lock->head);
215 					if (atomic_fcmpset_rel_ptr(&lock->head,
216 					    &v, x) != 0) {
217 						sleepq_broadcast(
218 						    &lock->head,
219 						    SLEEPQ_SLEEP, 0, 0);
220 						sleepq_release(&lock->head);
221 						break;
222 					}
223 					sleepq_release(&lock->head);
224 				}
225 			} else {
226 				x |= RL_CHEAT_CHEATING;
227 				if (atomic_fcmpset_rel_ptr(&lock->head, &v,
228 				    x) != 0)
229 					break;
230 			}
231 		}
232 		break;
233 	case RL_RET_CHEAT_WLOCKED:
234 		for (;;) {
235 			MPASS((v & RL_CHEAT_WLOCKED) != 0);
236 			if ((v & RL_CHEAT_DRAINING) != 0) {
237 				sleepq_lock(&lock->head);
238 				atomic_store_ptr(&lock->head, 0);
239 				sleepq_broadcast(&lock->head,
240 				    SLEEPQ_SLEEP, 0, 0);
241 				sleepq_release(&lock->head);
242 				break;
243 			} else {
244 				if (atomic_fcmpset_ptr(&lock->head, &v,
245 				    RL_CHEAT_CHEATING) != 0)
246 					break;
247 			}
248 		}
249 		break;
250 	default:
251 		__assert_unreachable();
252 		break;
253 	}
254 	return (true);
255 }
256 
257 static bool
258 rangelock_cheat_destroy(struct rangelock *lock)
259 {
260 	uintptr_t v;
261 
262 	v = atomic_load_ptr(&lock->head);
263 	if ((v & RL_CHEAT_CHEATING) == 0)
264 		return (false);
265 	MPASS(v == RL_CHEAT_CHEATING);
266 	return (true);
267 }
268 
269 /*
270  * Implementation of range locks based on the paper
271  * https://doi.org/10.1145/3342195.3387533
272  * arXiv:2006.12144v1 [cs.OS] 22 Jun 2020
273  * Scalable Range Locks for Scalable Address Spaces and Beyond
274  * by Alex Kogan, Dave Dice, and Shady Issa
275  */
276 
277 static struct rl_q_entry *rl_e_unmark(const struct rl_q_entry *e);
278 
279 /*
280  * rl_q_next links all granted ranges in the lock.  We cannot free an
281  * rl_q_entry while in the smr section, and cannot reuse rl_q_next
282  * linkage since other threads might follow it even after CAS removed
283  * the range.  Use rl_q_free for local list of ranges to remove after
284  * the smr section is dropped.
285  */
286 struct rl_q_entry {
287 	struct rl_q_entry *rl_q_next;
288 	struct rl_q_entry *rl_q_free;
289 	off_t		rl_q_start, rl_q_end;
290 	int		rl_q_flags;
291 #ifdef INVARIANTS
292 	struct thread	*rl_q_owner;
293 #endif
294 };
295 
296 static uma_zone_t rl_entry_zone;
297 static smr_t rl_smr;
298 
299 static void rangelock_free_free(struct rl_q_entry *free);
300 static void rangelock_noncheating_destroy(struct rangelock *lock);
301 
302 static void
303 rangelock_sys_init(void)
304 {
305 	rl_entry_zone = uma_zcreate("rl_entry", sizeof(struct rl_q_entry),
306 	    NULL, NULL, NULL, NULL, UMA_ALIGNOF(struct rl_q_entry),
307 	    UMA_ZONE_SMR);
308 	rl_smr = uma_zone_get_smr(rl_entry_zone);
309 }
310 SYSINIT(rl, SI_SUB_LOCK, SI_ORDER_ANY, rangelock_sys_init, NULL);
311 
312 static struct rl_q_entry *
313 rlqentry_alloc(vm_ooffset_t start, vm_ooffset_t end, int flags)
314 {
315 	struct rl_q_entry *e;
316 
317 	e = uma_zalloc_smr(rl_entry_zone, M_WAITOK);
318 	e->rl_q_next = NULL;
319 	e->rl_q_free = NULL;
320 	e->rl_q_start = start;
321 	e->rl_q_end = end;
322 	e->rl_q_flags = flags;
323 #ifdef INVARIANTS
324 	e->rl_q_owner = curthread;
325 #endif
326 	return (e);
327 }
328 
329 void
330 rangelock_init(struct rangelock *lock)
331 {
332 	lock->sleepers = false;
333 	atomic_store_ptr(&lock->head, rangelock_cheat ? RL_CHEAT_CHEATING : 0);
334 }
335 
336 void
337 rangelock_destroy(struct rangelock *lock)
338 {
339 	MPASS(!lock->sleepers);
340 	if (!rangelock_cheat_destroy(lock))
341 		rangelock_noncheating_destroy(lock);
342 	DEBUG_POISON_POINTER(*(void **)&lock->head);
343 }
344 
345 static bool
346 rl_e_is_marked(const struct rl_q_entry *e)
347 {
348 	return (((uintptr_t)e & 1) != 0);
349 }
350 
351 static struct rl_q_entry *
352 rl_e_unmark_unchecked(const struct rl_q_entry *e)
353 {
354 	return ((struct rl_q_entry *)((uintptr_t)e & ~1));
355 }
356 
357 static struct rl_q_entry *
358 rl_e_unmark(const struct rl_q_entry *e)
359 {
360 	MPASS(rl_e_is_marked(e));
361 	return (rl_e_unmark_unchecked(e));
362 }
363 
364 static void
365 rl_e_mark(struct rl_q_entry *e)
366 {
367 #if defined(INVARIANTS) && defined(__LP64__)
368 	int r = atomic_testandset_long((uintptr_t *)&e->rl_q_next, 0);
369 	MPASS(r == 0);
370 #else
371 	atomic_set_ptr((uintptr_t *)&e->rl_q_next, 1);
372 #endif
373 }
374 
375 static struct rl_q_entry *
376 rl_q_load(struct rl_q_entry **p)
377 {
378 	return ((struct rl_q_entry *)atomic_load_acq_ptr((uintptr_t *)p));
379 }
380 
381 static bool
382 rl_e_is_rlock(const struct rl_q_entry *e)
383 {
384 	return ((e->rl_q_flags & RL_LOCK_TYPE_MASK) == RL_LOCK_READ);
385 }
386 
387 static void
388 rangelock_free_free(struct rl_q_entry *free)
389 {
390 	struct rl_q_entry *x, *xp;
391 
392 	for (x = free; x != NULL; x = xp) {
393 		MPASS(!rl_e_is_marked(x));
394 		xp = x->rl_q_free;
395 		MPASS(!rl_e_is_marked(xp));
396 		uma_zfree_smr(rl_entry_zone, x);
397 	}
398 }
399 
400 static void
401 rangelock_unlock_int(struct rangelock *lock, struct rl_q_entry *e)
402 {
403 	bool sleepers;
404 
405 	MPASS(lock != NULL && e != NULL);
406 	MPASS(!rl_e_is_marked(rl_q_load(&e->rl_q_next)));
407 	MPASS(e->rl_q_owner == curthread);
408 
409 	rl_e_mark(e);
410 	sleepers = lock->sleepers;
411 	lock->sleepers = false;
412 	if (sleepers)
413 		sleepq_broadcast(&lock->sleepers, SLEEPQ_SLEEP, 0, 0);
414 }
415 
416 void
417 rangelock_unlock(struct rangelock *lock, void *cookie)
418 {
419 	if (rangelock_cheat_unlock(lock, cookie))
420 		return;
421 
422 	sleepq_lock(&lock->sleepers);
423 	rangelock_unlock_int(lock, cookie);
424 	sleepq_release(&lock->sleepers);
425 }
426 
427 /*
428  * result: -1 if e1 before e2, or both locks are readers and e1
429  *		starts before or at e2
430  *          1 if e1 after e2, or both locks are readers and e1
431  *		starts after e2
432  *          0 if e1 and e2 overlap and at least one lock is writer
433  */
434 static int
435 rl_e_compare(const struct rl_q_entry *e1, const struct rl_q_entry *e2)
436 {
437 	bool rds;
438 
439 	if (e1 == NULL)
440 		return (1);
441 	if (e2->rl_q_start >= e1->rl_q_end)
442 		return (-1);
443 	rds = rl_e_is_rlock(e1) && rl_e_is_rlock(e2);
444 	if (e2->rl_q_start >= e1->rl_q_start && rds)
445 		return (-1);
446 	if (e1->rl_q_start >= e2->rl_q_end)
447 		return (1);
448 	if (e1->rl_q_start >= e2->rl_q_start && rds)
449 		return (1);
450 	return (0);
451 }
452 
453 static void
454 rl_insert_sleep(struct rangelock *lock)
455 {
456 	smr_exit(rl_smr);
457 	DROP_GIANT();
458 	lock->sleepers = true;
459 	sleepq_add(&lock->sleepers, NULL, "rangelk", 0, 0);
460 	sleepq_wait(&lock->sleepers, PRI_USER);
461 	PICKUP_GIANT();
462 	smr_enter(rl_smr);
463 }
464 
465 static bool
466 rl_q_cas(struct rl_q_entry **prev, struct rl_q_entry *old,
467     struct rl_q_entry *new)
468 {
469 	MPASS(!rl_e_is_marked(old));
470 	return (atomic_cmpset_rel_ptr((uintptr_t *)prev, (uintptr_t)old,
471 	    (uintptr_t)new) != 0);
472 }
473 
474 static void
475 rangelock_noncheating_destroy(struct rangelock *lock)
476 {
477 	struct rl_q_entry *cur, *free, *next, **prev;
478 
479 	free = NULL;
480 again:
481 	smr_enter(rl_smr);
482 	prev = (struct rl_q_entry **)&lock->head;
483 	cur = rl_q_load(prev);
484 	MPASS(!rl_e_is_marked(cur));
485 
486 	for (;;) {
487 		if (cur == NULL)
488 			break;
489 		if (rl_e_is_marked(cur))
490 			goto again;
491 
492 		next = rl_q_load(&cur->rl_q_next);
493 		if (rl_e_is_marked(next)) {
494 			next = rl_e_unmark(next);
495 			if (rl_q_cas(prev, cur, next)) {
496 #ifdef INVARIANTS
497 				cur->rl_q_owner = NULL;
498 #endif
499 				cur->rl_q_free = free;
500 				free = cur;
501 				cur = next;
502 				continue;
503 			}
504 			smr_exit(rl_smr);
505 			goto again;
506 		}
507 
508 		sleepq_lock(&lock->sleepers);
509 		if (!rl_e_is_marked(cur)) {
510 			rl_insert_sleep(lock);
511 			goto again;
512 		}
513 	}
514 	smr_exit(rl_smr);
515 	rangelock_free_free(free);
516 }
517 
518 enum RL_INSERT_RES {
519 	RL_TRYLOCK_FAILED,
520 	RL_LOCK_SUCCESS,
521 	RL_LOCK_RETRY,
522 };
523 
524 static enum RL_INSERT_RES
525 rl_r_validate(struct rangelock *lock, struct rl_q_entry *e, bool trylock,
526     struct rl_q_entry **free)
527 {
528 	struct rl_q_entry *cur, *next, **prev;
529 
530 again:
531 	prev = &e->rl_q_next;
532 	cur = rl_q_load(prev);
533 	MPASS(!rl_e_is_marked(cur));	/* nobody can unlock e yet */
534 	for (;;) {
535 		if (cur == NULL || cur->rl_q_start >= e->rl_q_end)
536 			return (RL_LOCK_SUCCESS);
537 		next = rl_q_load(&cur->rl_q_next);
538 		if (rl_e_is_marked(next)) {
539 			next = rl_e_unmark(next);
540 			if (rl_q_cas(prev, cur, next)) {
541 				cur->rl_q_free = *free;
542 				*free = cur;
543 				cur = next;
544 				continue;
545 			}
546 			goto again;
547 		}
548 		if (rl_e_is_rlock(cur)) {
549 			prev = &cur->rl_q_next;
550 			cur = rl_e_unmark_unchecked(rl_q_load(prev));
551 			continue;
552 		}
553 		if (!rl_e_is_marked(rl_q_load(&cur->rl_q_next))) {
554 			sleepq_lock(&lock->sleepers);
555 			if (rl_e_is_marked(rl_q_load(&cur->rl_q_next))) {
556 				sleepq_release(&lock->sleepers);
557 				continue;
558 			}
559 			rangelock_unlock_int(lock, e);
560 			if (trylock) {
561 				sleepq_release(&lock->sleepers);
562 				return (RL_TRYLOCK_FAILED);
563 			}
564 			rl_insert_sleep(lock);
565 			return (RL_LOCK_RETRY);
566 		}
567 	}
568 }
569 
570 static enum RL_INSERT_RES
571 rl_w_validate(struct rangelock *lock, struct rl_q_entry *e,
572     bool trylock, struct rl_q_entry **free)
573 {
574 	struct rl_q_entry *cur, *next, **prev;
575 
576 again:
577 	prev = (struct rl_q_entry **)&lock->head;
578 	cur = rl_q_load(prev);
579 	MPASS(!rl_e_is_marked(cur));	/* head is not marked */
580 	for (;;) {
581 		if (cur == e)
582 			return (RL_LOCK_SUCCESS);
583 		next = rl_q_load(&cur->rl_q_next);
584 		if (rl_e_is_marked(next)) {
585 			next = rl_e_unmark(next);
586 			if (rl_q_cas(prev, cur, next)) {
587 				cur->rl_q_free = *free;
588 				*free = cur;
589 				cur = next;
590 				continue;
591 			}
592 			goto again;
593 		}
594 		if (cur->rl_q_end <= e->rl_q_start) {
595 			prev = &cur->rl_q_next;
596 			cur = rl_e_unmark_unchecked(rl_q_load(prev));
597 			continue;
598 		}
599 		sleepq_lock(&lock->sleepers);
600 		/* Reload after sleepq is locked */
601 		next = rl_q_load(&cur->rl_q_next);
602 		if (rl_e_is_marked(next)) {
603 			sleepq_release(&lock->sleepers);
604 			goto again;
605 		}
606 		rangelock_unlock_int(lock, e);
607 		if (trylock) {
608 			sleepq_release(&lock->sleepers);
609 			return (RL_TRYLOCK_FAILED);
610 		}
611 		rl_insert_sleep(lock);
612 		return (RL_LOCK_RETRY);
613 	}
614 }
615 
616 static enum RL_INSERT_RES
617 rl_insert(struct rangelock *lock, struct rl_q_entry *e, bool trylock,
618     struct rl_q_entry **free)
619 {
620 	struct rl_q_entry *cur, *next, **prev;
621 	int r;
622 
623 again:
624 	prev = (struct rl_q_entry **)&lock->head;
625 	cur = rl_q_load(prev);
626 	if (cur == NULL && rl_q_cas(prev, NULL, e))
627 		return (RL_LOCK_SUCCESS);
628 
629 	for (;;) {
630 		if (cur != NULL) {
631 			if (rl_e_is_marked(cur))
632 				goto again;
633 
634 			next = rl_q_load(&cur->rl_q_next);
635 			if (rl_e_is_marked(next)) {
636 				next = rl_e_unmark(next);
637 				if (rl_q_cas(prev, cur, next)) {
638 #ifdef INVARIANTS
639 					cur->rl_q_owner = NULL;
640 #endif
641 					cur->rl_q_free = *free;
642 					*free = cur;
643 					cur = next;
644 					continue;
645 				}
646 				goto again;
647 			}
648 		}
649 
650 		MPASS(!rl_e_is_marked(cur));
651 		r = rl_e_compare(cur, e);
652 		if (r == -1) {
653 			prev = &cur->rl_q_next;
654 			cur = rl_q_load(prev);
655 		} else if (r == 0) {
656 			sleepq_lock(&lock->sleepers);
657 			if (__predict_false(rl_e_is_marked(rl_q_load(
658 			    &cur->rl_q_next)))) {
659 				sleepq_release(&lock->sleepers);
660 				continue;
661 			}
662 			if (trylock) {
663 				sleepq_release(&lock->sleepers);
664 				return (RL_TRYLOCK_FAILED);
665 			}
666 			rl_insert_sleep(lock);
667 			/* e is still valid */
668 			goto again;
669 		} else /* r == 1 */ {
670 			e->rl_q_next = cur;
671 			if (rl_q_cas(prev, cur, e)) {
672 				atomic_thread_fence_acq();
673 				return (rl_e_is_rlock(e) ?
674 				    rl_r_validate(lock, e, trylock, free) :
675 				    rl_w_validate(lock, e, trylock, free));
676 			}
677 			/* Reset rl_q_next in case we hit fast path. */
678 			e->rl_q_next = NULL;
679 			cur = rl_q_load(prev);
680 		}
681 	}
682 }
683 
684 static struct rl_q_entry *
685 rangelock_lock_int(struct rangelock *lock, bool trylock, vm_ooffset_t start,
686     vm_ooffset_t end, int locktype)
687 {
688 	struct rl_q_entry *e, *free;
689 	void *cookie;
690 	enum RL_INSERT_RES res;
691 
692 	if (rangelock_cheat_lock(lock, locktype, trylock, &cookie))
693 		return (cookie);
694 	for (res = RL_LOCK_RETRY; res == RL_LOCK_RETRY;) {
695 		free = NULL;
696 		e = rlqentry_alloc(start, end, locktype);
697 		smr_enter(rl_smr);
698 		res = rl_insert(lock, e, trylock, &free);
699 		smr_exit(rl_smr);
700 		if (res == RL_TRYLOCK_FAILED) {
701 			MPASS(trylock);
702 			e->rl_q_free = free;
703 			free = e;
704 			e = NULL;
705 		}
706 		rangelock_free_free(free);
707 	}
708 	return (e);
709 }
710 
711 void *
712 rangelock_rlock(struct rangelock *lock, vm_ooffset_t start, vm_ooffset_t end)
713 {
714 	return (rangelock_lock_int(lock, false, start, end, RL_LOCK_READ));
715 }
716 
717 void *
718 rangelock_tryrlock(struct rangelock *lock, vm_ooffset_t start, vm_ooffset_t end)
719 {
720 	return (rangelock_lock_int(lock, true, start, end, RL_LOCK_READ));
721 }
722 
723 void *
724 rangelock_wlock(struct rangelock *lock, vm_ooffset_t start, vm_ooffset_t end)
725 {
726 	return (rangelock_lock_int(lock, false, start, end, RL_LOCK_WRITE));
727 }
728 
729 void *
730 rangelock_trywlock(struct rangelock *lock, vm_ooffset_t start, vm_ooffset_t end)
731 {
732 	return (rangelock_lock_int(lock, true, start, end, RL_LOCK_WRITE));
733 }
734 
735 /*
736  * If the caller asserts that it can obtain the range locks on the
737  * same lock simultaneously, switch to the non-cheat mode.  Cheat mode
738  * cannot handle it, hanging in drain or trylock retries.
739  */
740 void
741 rangelock_may_recurse(struct rangelock *lock)
742 {
743 	uintptr_t v, x;
744 
745 	v = atomic_load_ptr(&lock->head);
746 	if ((v & RL_CHEAT_CHEATING) == 0)
747 		return;
748 
749 	sleepq_lock(&lock->head);
750 	for (;;) {
751 		if ((v & RL_CHEAT_CHEATING) == 0) {
752 			sleepq_release(&lock->head);
753 			return;
754 		}
755 
756 		/* Cheating and locked, drain. */
757 		if ((v & RL_CHEAT_WLOCKED) != 0 ||
758 		    (v & ~RL_CHEAT_MASK) >= RL_CHEAT_READER) {
759 			x = v | RL_CHEAT_DRAINING;
760 			if (atomic_fcmpset_ptr(&lock->head, &v, x) != 0) {
761 				rangelock_cheat_drain(lock);
762 				return;
763 			}
764 			continue;
765 		}
766 
767 		/* Cheating and unlocked, clear RL_CHEAT_CHEATING. */
768 		x = 0;
769 		if (atomic_fcmpset_ptr(&lock->head, &v, x) != 0) {
770 			sleepq_release(&lock->head);
771 			return;
772 		}
773 	}
774 }
775 
776 #ifdef INVARIANT_SUPPORT
777 void
778 _rangelock_cookie_assert(void *cookie, int what, const char *file, int line)
779 {
780 }
781 #endif	/* INVARIANT_SUPPORT */
782 
783 #include "opt_ddb.h"
784 #ifdef DDB
785 #include <ddb/ddb.h>
786 
787 DB_SHOW_COMMAND(rangelock, db_show_rangelock)
788 {
789 	struct rangelock *lock;
790 	struct rl_q_entry *e, *x;
791 	uintptr_t v;
792 
793 	if (!have_addr) {
794 		db_printf("show rangelock addr\n");
795 		return;
796 	}
797 
798 	lock = (struct rangelock *)addr;
799 	db_printf("rangelock %p sleepers %d\n", lock, lock->sleepers);
800 	v = lock->head;
801 	if ((v & RL_CHEAT_CHEATING) != 0) {
802 		db_printf("  cheating head %#jx\n", (uintmax_t)v);
803 		return;
804 	}
805 	for (e = (struct rl_q_entry *)(lock->head);;) {
806 		x = rl_e_is_marked(e) ? rl_e_unmark(e) : e;
807 		if (x == NULL)
808 			break;
809 		db_printf("  entry %p marked %d %d start %#jx end %#jx "
810 		    "flags %x next %p",
811 		    e, rl_e_is_marked(e), rl_e_is_marked(x->rl_q_next),
812 		    x->rl_q_start, x->rl_q_end, x->rl_q_flags, x->rl_q_next);
813 #ifdef INVARIANTS
814 		db_printf(" owner %p (%d)", x->rl_q_owner,
815 		    x->rl_q_owner != NULL ? x->rl_q_owner->td_tid : -1);
816 #endif
817 		db_printf("\n");
818 		e = x->rl_q_next;
819 	}
820 }
821 
822 #endif	/* DDB */
823