/*-
 * SPDX-License-Identifier: BSD-2-Clause
 *
 * Copyright (c) 2009 Konstantin Belousov <kib@FreeBSD.org>
 * Copyright (c) 2023 The FreeBSD Foundation
 *
 * Portions of this software were developed by
 * Konstantin Belousov <kib@FreeBSD.org> under sponsorship from
 * the FreeBSD Foundation.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice unmodified, this list of conditions, and the following
 *    disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include <sys/param.h>
#include <sys/kassert.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/rangelock.h>
#include <sys/sleepqueue.h>
#include <sys/smr.h>
#include <sys/sysctl.h>

#include <vm/uma.h>

/*
 * Immediately after initialization (subject to 'rangelock_cheat'
 * below), and until a request arrives that conflicts with the granted
 * ones based on type, rangelocks serve requests in the "cheating"
 * mode.  In this mode, a rangelock behaves like an sx lock, as if
 * each request covered the whole range of the protected object.  On
 * receiving a conflicting request (any request while a write request
 * is effective, or any write request while some read requests are
 * effective), all requests granted in "cheating" mode are drained,
 * and the rangelock then switches to keeping track of the precise
 * range of each new request.
 *
 * The normal sx implementation is not reused in order not to bloat
 * the structures (most importantly, vnodes) which embed rangelocks.
 *
 * Cheating greatly helps the very common pattern where a file is
 * first written single-threaded and then read by many processes.
 *
 * The lock is in cheat mode when the RL_CHEAT_CHEATING bit is set in
 * lock->head.  Special cookies are returned in this mode, and
 * trylocks behave the same as normal locks but do not drain.
 */

static int rangelock_cheat = 1;
SYSCTL_INT(_debug, OID_AUTO, rangelock_cheat, CTLFLAG_RWTUN,
    &rangelock_cheat, 0,
    "Initialize new rangelocks in the cheating (whole-range) mode");

#define	RL_CHEAT_MASK		0x7
#define	RL_CHEAT_CHEATING	0x1
/* #define RL_CHEAT_RLOCKED	0x0 */
#define	RL_CHEAT_WLOCKED	0x2
#define	RL_CHEAT_DRAINING	0x4

#define	RL_CHEAT_READER		0x8

#define	RL_RET_CHEAT_RLOCKED	0x1100
#define	RL_RET_CHEAT_WLOCKED	0x2200
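
/*
 * Illustrative sketch of the cheat-mode encoding of lock->head (the
 * RL_CHEAT_* definitions above are authoritative; the values below are
 * just worked examples):
 *
 *	0x1	RL_CHEAT_CHEATING			cheating, unlocked
 *	0x3	RL_CHEAT_CHEATING | RL_CHEAT_WLOCKED	cheating, write-locked
 *	0x11	RL_CHEAT_CHEATING | 2 * RL_CHEAT_READER	cheating, two readers
 *	0x7	0x3 | RL_CHEAT_DRAINING			draining towards the
 *							precise (non-cheat) mode
 *
 * Once the low bit (RL_CHEAT_CHEATING) is clear, head instead holds a
 * pointer to the first struct rl_q_entry of the precise-mode list, or
 * 0 when that list is empty.
 */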

/*
 * Sleep until the lock leaves the RL_CHEAT_DRAINING state.  Entered
 * with the sleepqueue chain lock for &lock->head held; the chain lock
 * is released before returning.
 */
static void
rangelock_cheat_drain(struct rangelock *lock)
{
	uintptr_t v;

	DROP_GIANT();
	for (;;) {
		v = atomic_load_ptr(&lock->head);
		if ((v & RL_CHEAT_DRAINING) == 0)
			break;
		sleepq_add(&lock->head, NULL, "ranged1", 0, 0);
		sleepq_wait(&lock->head, PRI_USER);
		sleepq_lock(&lock->head);
	}
	sleepq_release(&lock->head);
	PICKUP_GIANT();
}

static bool
rangelock_cheat_lock(struct rangelock *lock, int locktype, bool trylock,
    void **cookie)
{
	uintptr_t v, x;

	v = atomic_load_ptr(&lock->head);
	if ((v & RL_CHEAT_CHEATING) == 0)
		return (false);
	if ((v & RL_CHEAT_DRAINING) != 0) {
drain:
		if (trylock) {
			*cookie = NULL;
			return (true);
		}
		sleepq_lock(&lock->head);
drain1:
		rangelock_cheat_drain(lock);
		return (false);
	}

	switch (locktype) {
	case RL_LOCK_READ:
		for (;;) {
			if ((v & RL_CHEAT_WLOCKED) != 0) {
				if (trylock) {
					*cookie = NULL;
					return (true);
				}
				x = v | RL_CHEAT_DRAINING;
				sleepq_lock(&lock->head);
				if (atomic_fcmpset_rel_ptr(&lock->head, &v,
				    x) != 0)
					goto drain1;
				sleepq_release(&lock->head);
				/* Possibly forgive passed conflict */
			} else {
				x = (v & ~RL_CHEAT_MASK) + RL_CHEAT_READER;
				x |= RL_CHEAT_CHEATING;
				if (atomic_fcmpset_acq_ptr(&lock->head, &v,
				    x) != 0)
					break;
			}
			if ((v & RL_CHEAT_CHEATING) == 0)
				return (false);
			if ((v & RL_CHEAT_DRAINING) != 0)
				goto drain;
		}
		*(uintptr_t *)cookie = RL_RET_CHEAT_RLOCKED;
		break;
	case RL_LOCK_WRITE:
		for (;;) {
			if ((v & ~RL_CHEAT_MASK) >= RL_CHEAT_READER ||
			    (v & RL_CHEAT_WLOCKED) != 0) {
				if (trylock) {
					*cookie = NULL;
					return (true);
				}
				x = v | RL_CHEAT_DRAINING;
				sleepq_lock(&lock->head);
				if (atomic_fcmpset_rel_ptr(&lock->head, &v,
				    x) != 0)
					goto drain1;
				sleepq_release(&lock->head);
				/* Possibly forgive passed conflict */
			} else {
				x = RL_CHEAT_WLOCKED | RL_CHEAT_CHEATING;
				if (atomic_fcmpset_acq_ptr(&lock->head, &v,
				    x) != 0)
					break;
			}
			if ((v & RL_CHEAT_CHEATING) == 0)
				return (false);
			if ((v & RL_CHEAT_DRAINING) != 0)
				goto drain;
		}
		*(uintptr_t *)cookie = RL_RET_CHEAT_WLOCKED;
		break;
	default:
		__assert_unreachable();
		break;
	}
	return (true);
}

static bool
rangelock_cheat_unlock(struct rangelock *lock, void *cookie)
{
	uintptr_t v, x;

	v = atomic_load_ptr(&lock->head);
	if ((v & RL_CHEAT_CHEATING) == 0)
		return (false);

	MPASS((uintptr_t)cookie == RL_RET_CHEAT_WLOCKED ||
	    (uintptr_t)cookie == RL_RET_CHEAT_RLOCKED);

	switch ((uintptr_t)cookie) {
	case RL_RET_CHEAT_RLOCKED:
		for (;;) {
			MPASS((v & ~RL_CHEAT_MASK) >= RL_CHEAT_READER);
			MPASS((v & RL_CHEAT_WLOCKED) == 0);
			x = (v & ~RL_CHEAT_MASK) - RL_CHEAT_READER;
			if ((v & RL_CHEAT_DRAINING) != 0) {
				if (x != 0) {
					x |= RL_CHEAT_DRAINING |
					    RL_CHEAT_CHEATING;
					if (atomic_fcmpset_rel_ptr(&lock->head,
					    &v, x) != 0)
						break;
				} else {
					sleepq_lock(&lock->head);
					if (atomic_fcmpset_rel_ptr(&lock->head,
					    &v, x) != 0) {
						sleepq_broadcast(
						    &lock->head,
						    SLEEPQ_SLEEP, 0, 0);
						sleepq_release(&lock->head);
						break;
					}
					sleepq_release(&lock->head);
				}
			} else {
				x |= RL_CHEAT_CHEATING;
				if (atomic_fcmpset_rel_ptr(&lock->head, &v,
				    x) != 0)
					break;
			}
		}
		break;
	case RL_RET_CHEAT_WLOCKED:
		for (;;) {
			MPASS((v & RL_CHEAT_WLOCKED) != 0);
			if ((v & RL_CHEAT_DRAINING) != 0) {
				sleepq_lock(&lock->head);
				atomic_store_ptr(&lock->head, 0);
				sleepq_broadcast(&lock->head,
				    SLEEPQ_SLEEP, 0, 0);
				sleepq_release(&lock->head);
				break;
			} else {
				if (atomic_fcmpset_ptr(&lock->head, &v,
				    RL_CHEAT_CHEATING) != 0)
					break;
			}
		}
		break;
	default:
		__assert_unreachable();
		break;
	}
	return (true);
}

static bool
rangelock_cheat_destroy(struct rangelock *lock)
{
	uintptr_t v;

	v = atomic_load_ptr(&lock->head);
	if ((v & RL_CHEAT_CHEATING) == 0)
		return (false);
	MPASS(v == RL_CHEAT_CHEATING);
	return (true);
}

/*
 * Implementation of range locks based on the paper
 * https://doi.org/10.1145/3342195.3387533
 * arXiv:2006.12144v1 [cs.OS] 22 Jun 2020
 * Scalable Range Locks for Scalable Address Spaces and Beyond
 * by Alex Kogan, Dave Dice, and Shady Issa
 */
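
/*
 * Rough sketch of the resulting structure, for orientation (the code
 * below is authoritative): in the non-cheat mode, lock->head points to
 * a singly-linked list of rl_q_entry ranges kept sorted by their start
 * offset, for example:
 *
 *	head -> R[0, 4096) -> R[0, 4096) -> W[8192, 12288) -> NULL
 *
 * Read locks with overlapping ranges may coexist in the list; a
 * request that conflicts with an existing entry either fails (trylock)
 * or sleeps and retries.  Insertion and removal are performed
 * lock-free with CAS on the rl_q_next linkage, under SMR protection.
 */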

static struct rl_q_entry *rl_e_unmark(const struct rl_q_entry *e);

/*
 * rl_q_next links all granted ranges in the lock.  We cannot free an
 * rl_q_entry while in the smr section, and cannot reuse the rl_q_next
 * linkage since other threads might follow it even after a CAS has
 * removed the range.  Use rl_q_free for a local list of ranges to be
 * freed after the smr section is dropped.
 */
struct rl_q_entry {
	struct rl_q_entry *rl_q_next;
	struct rl_q_entry *rl_q_free;
	off_t		rl_q_start, rl_q_end;
	int		rl_q_flags;
#ifdef INVARIANTS
	struct thread	*rl_q_owner;
#endif
};
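
/*
 * Informal sketch of the marking convention used below (derived from
 * rl_e_mark() and rl_e_unmark(); illustrative only): the low bit of
 * e->rl_q_next doubles as the "removed" mark for the entry e that
 * contains it.  rangelock_unlock_int() sets the bit; a traversal that
 * observes a marked rl_q_next in its current entry knows that entry is
 * dead and tries to unlink it, roughly:
 *
 *	next = rl_q_load(&cur->rl_q_next);
 *	if (rl_e_is_marked(next))
 *		unlinked = rl_q_cas(prev, cur, rl_e_unmark(next));
 *
 * Any pointer loaded from the list must therefore be unmarked before
 * it is dereferenced.
 */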

static uma_zone_t rl_entry_zone;
static smr_t rl_smr;

static void rangelock_free_free(struct rl_q_entry *free);
static void rangelock_noncheating_destroy(struct rangelock *lock);

static void
rangelock_sys_init(void)
{
	rl_entry_zone = uma_zcreate("rl_entry", sizeof(struct rl_q_entry),
	    NULL, NULL, NULL, NULL, UMA_ALIGNOF(struct rl_q_entry),
	    UMA_ZONE_SMR);
	rl_smr = uma_zone_get_smr(rl_entry_zone);
}
SYSINIT(rl, SI_SUB_LOCK, SI_ORDER_ANY, rangelock_sys_init, NULL);

static struct rl_q_entry *
rlqentry_alloc(vm_ooffset_t start, vm_ooffset_t end, int flags)
{
	struct rl_q_entry *e;

	e = uma_zalloc_smr(rl_entry_zone, M_WAITOK);
	e->rl_q_next = NULL;
	e->rl_q_free = NULL;
	e->rl_q_start = start;
	e->rl_q_end = end;
	e->rl_q_flags = flags;
#ifdef INVARIANTS
	e->rl_q_owner = curthread;
#endif
	return (e);
}

void
rangelock_init(struct rangelock *lock)
{
	lock->sleepers = false;
	atomic_store_ptr(&lock->head, rangelock_cheat ? RL_CHEAT_CHEATING : 0);
}

void
rangelock_destroy(struct rangelock *lock)
{
	MPASS(!lock->sleepers);
	if (!rangelock_cheat_destroy(lock))
		rangelock_noncheating_destroy(lock);
	DEBUG_POISON_POINTER(*(void **)&lock->head);
}

static bool
rl_e_is_marked(const struct rl_q_entry *e)
{
	return (((uintptr_t)e & 1) != 0);
}

static struct rl_q_entry *
rl_e_unmark_unchecked(const struct rl_q_entry *e)
{
	return ((struct rl_q_entry *)((uintptr_t)e & ~1));
}

static struct rl_q_entry *
rl_e_unmark(const struct rl_q_entry *e)
{
	MPASS(rl_e_is_marked(e));
	return (rl_e_unmark_unchecked(e));
}

static void
rl_e_mark(struct rl_q_entry *e)
{
#if defined(INVARIANTS)
	int r = atomic_testandset_ptr((uintptr_t *)&e->rl_q_next, 0);
	MPASS(r == 0);
#else
	atomic_set_ptr((uintptr_t *)&e->rl_q_next, 1);
#endif
}

static struct rl_q_entry *
rl_q_load(struct rl_q_entry **p)
{
	return ((struct rl_q_entry *)atomic_load_acq_ptr((uintptr_t *)p));
}

static bool
rl_e_is_rlock(const struct rl_q_entry *e)
{
	return ((e->rl_q_flags & RL_LOCK_TYPE_MASK) == RL_LOCK_READ);
}

static void
rangelock_free_free(struct rl_q_entry *free)
{
	struct rl_q_entry *x, *xp;

	for (x = free; x != NULL; x = xp) {
		MPASS(!rl_e_is_marked(x));
		xp = x->rl_q_free;
		MPASS(!rl_e_is_marked(xp));
		uma_zfree_smr(rl_entry_zone, x);
	}
}

static void
rangelock_unlock_int(struct rangelock *lock, struct rl_q_entry *e)
{
	bool sleepers;

	MPASS(lock != NULL && e != NULL);
	MPASS(!rl_e_is_marked(rl_q_load(&e->rl_q_next)));
	MPASS(e->rl_q_owner == curthread);

	rl_e_mark(e);
	sleepers = lock->sleepers;
	lock->sleepers = false;
	if (sleepers)
		sleepq_broadcast(&lock->sleepers, SLEEPQ_SLEEP, 0, 0);
}

void
rangelock_unlock(struct rangelock *lock, void *cookie)
{
	if (rangelock_cheat_unlock(lock, cookie))
		return;

	sleepq_lock(&lock->sleepers);
	rangelock_unlock_int(lock, cookie);
	sleepq_release(&lock->sleepers);
}

/*
 * result:	-1 if e1 is entirely before e2, or both locks are read
 *		   locks and e1 starts at or before e2
 *		 1 if e1 is entirely after e2, or both locks are read
 *		   locks and e1 starts after e2 (a NULL e1 also sorts
 *		   after e2)
 *		 0 if e1 and e2 overlap and at least one of the locks
 *		   is a write lock
 */
static int
rl_e_compare(const struct rl_q_entry *e1, const struct rl_q_entry *e2)
{
	bool rds;

	if (e1 == NULL)
		return (1);
	if (e2->rl_q_start >= e1->rl_q_end)
		return (-1);
	rds = rl_e_is_rlock(e1) && rl_e_is_rlock(e2);
	if (e2->rl_q_start >= e1->rl_q_start && rds)
		return (-1);
	if (e1->rl_q_start >= e2->rl_q_end)
		return (1);
	if (e1->rl_q_start >= e2->rl_q_start && rds)
		return (1);
	return (0);
}
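
/*
 * Worked examples for rl_e_compare() (hypothetical ranges; byte
 * offsets are half-open intervals [start, end)):
 *
 *	e1 = W[0, 10),  e2 = W[10, 20)	-> -1	disjoint, e1 first
 *	e1 = R[0, 10),  e2 = R[5, 15)	-> -1	readers may overlap,
 *						ordered by start offset
 *	e1 = W[0, 10),  e2 = R[5, 15)	->  0	overlap with a writer,
 *						i.e. a conflict
 *	e1 = W[20, 30), e2 = W[10, 20)	->  1	disjoint, e1 second
 */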

static void
rl_insert_sleep(struct rangelock *lock)
{
	smr_exit(rl_smr);
	DROP_GIANT();
	lock->sleepers = true;
	sleepq_add(&lock->sleepers, NULL, "rangelk", 0, 0);
	sleepq_wait(&lock->sleepers, PRI_USER);
	PICKUP_GIANT();
	smr_enter(rl_smr);
}

/*
 * Try to swing the linkage pointer *prev from old to new.  Returns
 * true if the CAS succeeded, false otherwise.  Used both to insert new
 * entries and to unlink marked (removed) ones.
 */
static bool
rl_q_cas(struct rl_q_entry **prev, struct rl_q_entry *old,
    struct rl_q_entry *new)
{
	MPASS(!rl_e_is_marked(old));
	return (atomic_cmpset_rel_ptr((uintptr_t *)prev, (uintptr_t)old,
	    (uintptr_t)new) != 0);
}

static void
rangelock_noncheating_destroy(struct rangelock *lock)
{
	struct rl_q_entry *cur, *free, *next, **prev;

	free = NULL;
again:
	smr_enter(rl_smr);
	prev = (struct rl_q_entry **)&lock->head;
	cur = rl_q_load(prev);
	MPASS(!rl_e_is_marked(cur));

	for (;;) {
		if (cur == NULL)
			break;
		if (rl_e_is_marked(cur))
			goto again;

		next = rl_q_load(&cur->rl_q_next);
		if (rl_e_is_marked(next)) {
			next = rl_e_unmark(next);
			if (rl_q_cas(prev, cur, next)) {
#ifdef INVARIANTS
				cur->rl_q_owner = NULL;
#endif
				cur->rl_q_free = free;
				free = cur;
				cur = next;
				continue;
			}
			smr_exit(rl_smr);
			goto again;
		}

		sleepq_lock(&lock->sleepers);
		if (!rl_e_is_marked(cur)) {
			rl_insert_sleep(lock);
			goto again;
		}
	}
	smr_exit(rl_smr);
	rangelock_free_free(free);
}

enum RL_INSERT_RES {
	RL_TRYLOCK_FAILED,		/* conflict, e not yet published */
	RL_TRYLOCK_FAILED_MARKED,	/* conflict, e published and marked */
	RL_LOCK_SUCCESS,
	RL_LOCK_RETRY,
};

/*
 * Handle a possible lock conflict between cur and e.  "inserted" is true if e
 * is already inserted into the queue.
 */
static enum RL_INSERT_RES
rl_conflict(struct rangelock *lock, struct rl_q_entry *cur, struct rl_q_entry *e,
    bool trylock, bool inserted)
{
	sleepq_lock(&lock->sleepers);
	if (rl_e_is_marked(rl_q_load(&cur->rl_q_next))) {
		sleepq_release(&lock->sleepers);
		return (RL_LOCK_SUCCESS); /* no conflict after all */
	}

	/*
	 * Make sure we're not conflicting with one of our own locks.  This
	 * scenario could conceivably happen for one of two reasons: a bug in
	 * the rangelock consumer (if "inserted" is true), or a bug in the
	 * rangelock implementation itself (if "inserted" is false).
	 */
	KASSERT(cur->rl_q_owner != curthread,
	    ("%s: conflicting range is locked by the current thread",
	    __func__));

	if (inserted)
		rangelock_unlock_int(lock, e);
	if (trylock) {
		sleepq_release(&lock->sleepers);

		/*
		 * If the new entry e has been enqueued and is thus visible to
		 * other threads, it cannot be safely freed.
		 */
		return (inserted ? RL_TRYLOCK_FAILED_MARKED :
		    RL_TRYLOCK_FAILED);
	}
	rl_insert_sleep(lock);
	return (RL_LOCK_RETRY);
}

/*
 * Having inserted entry e, verify that no conflicting write locks are present;
 * clean up dead entries that we encounter along the way.
 */
static enum RL_INSERT_RES
rl_r_validate(struct rangelock *lock, struct rl_q_entry *e, bool trylock,
    struct rl_q_entry **free)
{
	struct rl_q_entry *cur, *next, **prev;
	enum RL_INSERT_RES res;

again:
	prev = &e->rl_q_next;
	cur = rl_q_load(prev);
	MPASS(!rl_e_is_marked(cur));	/* nobody can unlock e yet */
	for (;;) {
		if (cur == NULL || cur->rl_q_start >= e->rl_q_end)
			return (RL_LOCK_SUCCESS);
		next = rl_q_load(&cur->rl_q_next);
		if (rl_e_is_marked(next)) {
			next = rl_e_unmark(next);
			if (rl_q_cas(prev, cur, next)) {
				cur->rl_q_free = *free;
				*free = cur;
				cur = next;
				continue;
			}
			goto again;
		}
		if (rl_e_is_rlock(cur)) {
			prev = &cur->rl_q_next;
			cur = rl_e_unmark_unchecked(rl_q_load(prev));
			continue;
		}

		res = rl_conflict(lock, cur, e, trylock, true);
		if (res != RL_LOCK_SUCCESS)
			return (res);
	}
}

/*
 * Having inserted entry e, verify that no conflicting locks are present;
 * clean up dead entries that we encounter along the way.
 */
static enum RL_INSERT_RES
rl_w_validate(struct rangelock *lock, struct rl_q_entry *e,
    bool trylock, struct rl_q_entry **free)
{
	struct rl_q_entry *cur, *next, **prev;
	enum RL_INSERT_RES res;

again:
	prev = (struct rl_q_entry **)&lock->head;
	cur = rl_q_load(prev);
	MPASS(!rl_e_is_marked(cur));	/* head is not marked */
	for (;;) {
		if (cur == e)
			return (RL_LOCK_SUCCESS);
		next = rl_q_load(&cur->rl_q_next);
		if (rl_e_is_marked(next)) {
			next = rl_e_unmark(next);
			if (rl_q_cas(prev, cur, next)) {
				cur->rl_q_free = *free;
				*free = cur;
				cur = next;
				continue;
			}
			goto again;
		}
		if (cur->rl_q_end <= e->rl_q_start) {
			prev = &cur->rl_q_next;
			cur = rl_e_unmark_unchecked(rl_q_load(prev));
			continue;
		}

		res = rl_conflict(lock, cur, e, trylock, true);
		if (res != RL_LOCK_SUCCESS)
			return (res);
	}
}

static enum RL_INSERT_RES
rl_insert(struct rangelock *lock, struct rl_q_entry *e, bool trylock,
    struct rl_q_entry **free)
{
	struct rl_q_entry *cur, *next, **prev;
	int r;

again:
	prev = (struct rl_q_entry **)&lock->head;
	cur = rl_q_load(prev);
	if (cur == NULL && rl_q_cas(prev, NULL, e))
		return (RL_LOCK_SUCCESS);

	for (;;) {
		if (cur != NULL) {
			if (rl_e_is_marked(cur))
				goto again;

			next = rl_q_load(&cur->rl_q_next);
			if (rl_e_is_marked(next)) {
				next = rl_e_unmark(next);
				if (rl_q_cas(prev, cur, next)) {
#ifdef INVARIANTS
					cur->rl_q_owner = NULL;
#endif
					cur->rl_q_free = *free;
					*free = cur;
					cur = next;
					continue;
				}
				goto again;
			}
		}

		MPASS(!rl_e_is_marked(cur));
		r = rl_e_compare(cur, e);
		if (r == -1) {
			prev = &cur->rl_q_next;
			cur = rl_q_load(prev);
		} else if (r == 0) {
			enum RL_INSERT_RES res;

			res = rl_conflict(lock, cur, e, trylock, false);
			switch (res) {
			case RL_LOCK_SUCCESS:
				/* cur does not conflict after all. */
				break;
			case RL_LOCK_RETRY:
				/* e is still valid. */
				goto again;
			default:
				return (res);
			}
		} else /* r == 1 */ {
			e->rl_q_next = cur;
			if (rl_q_cas(prev, cur, e)) {
				atomic_thread_fence_acq();
				return (rl_e_is_rlock(e) ?
				    rl_r_validate(lock, e, trylock, free) :
				    rl_w_validate(lock, e, trylock, free));
			}
			/* Reset rl_q_next in case we hit fast path. */
			e->rl_q_next = NULL;
			cur = rl_q_load(prev);
		}
	}
}

static struct rl_q_entry *
rangelock_lock_int(struct rangelock *lock, bool trylock, vm_ooffset_t start,
    vm_ooffset_t end, int locktype)
{
	struct rl_q_entry *e, *free;
	void *cookie;
	enum RL_INSERT_RES res;

	if (rangelock_cheat_lock(lock, locktype, trylock, &cookie))
		return (cookie);
	for (res = RL_LOCK_RETRY; res == RL_LOCK_RETRY;) {
		free = NULL;
		e = rlqentry_alloc(start, end, locktype);
		smr_enter(rl_smr);
		res = rl_insert(lock, e, trylock, &free);
		smr_exit(rl_smr);
		if (res == RL_TRYLOCK_FAILED || res == RL_TRYLOCK_FAILED_MARKED) {
			MPASS(trylock);
			if (res == RL_TRYLOCK_FAILED) {
				e->rl_q_free = free;
				free = e;
			}
			e = NULL;
		}
		rangelock_free_free(free);
	}
	return (e);
}

void *
rangelock_rlock(struct rangelock *lock, vm_ooffset_t start, vm_ooffset_t end)
{
	return (rangelock_lock_int(lock, false, start, end, RL_LOCK_READ));
}

void *
rangelock_tryrlock(struct rangelock *lock, vm_ooffset_t start, vm_ooffset_t end)
{
	return (rangelock_lock_int(lock, true, start, end, RL_LOCK_READ));
}

void *
rangelock_wlock(struct rangelock *lock, vm_ooffset_t start, vm_ooffset_t end)
{
	return (rangelock_lock_int(lock, false, start, end, RL_LOCK_WRITE));
}

void *
rangelock_trywlock(struct rangelock *lock, vm_ooffset_t start, vm_ooffset_t end)
{
	return (rangelock_lock_int(lock, true, start, end, RL_LOCK_WRITE));
}
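
/*
 * Typical consumer pattern (an illustrative sketch only; in the tree,
 * vnodes embed a struct rangelock and wrap these calls behind the
 * vn_rangelock_*() macros):
 *
 *	void *cookie;
 *
 *	cookie = rangelock_wlock(&obj->rl, start, end);
 *	... operate on the byte range [start, end) of obj ...
 *	rangelock_unlock(&obj->rl, cookie);
 *
 * The trylock variants return NULL instead of sleeping when the range
 * cannot be granted immediately, in which case no unlock is needed.
 */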

/*
 * If the caller asserts that it can hold several range locks on the
 * same lock simultaneously, switch to the non-cheat mode.  Cheat mode
 * cannot handle recursion and would hang in drain or in trylock
 * retries; e.g. a thread already holding a cheat-mode read lock that
 * requests a write lock would wait for its own lock to drain.
 */
void
rangelock_may_recurse(struct rangelock *lock)
{
	uintptr_t v, x;

	v = atomic_load_ptr(&lock->head);
	if ((v & RL_CHEAT_CHEATING) == 0)
		return;

	sleepq_lock(&lock->head);
	for (;;) {
		if ((v & RL_CHEAT_CHEATING) == 0) {
			sleepq_release(&lock->head);
			return;
		}

		/* Cheating and locked, drain. */
		if ((v & RL_CHEAT_WLOCKED) != 0 ||
		    (v & ~RL_CHEAT_MASK) >= RL_CHEAT_READER) {
			x = v | RL_CHEAT_DRAINING;
			if (atomic_fcmpset_ptr(&lock->head, &v, x) != 0) {
				rangelock_cheat_drain(lock);
				return;
			}
			continue;
		}

		/* Cheating and unlocked, clear RL_CHEAT_CHEATING. */
		x = 0;
		if (atomic_fcmpset_ptr(&lock->head, &v, x) != 0) {
			sleepq_release(&lock->head);
			return;
		}
	}
}

#ifdef INVARIANT_SUPPORT
void
_rangelock_cookie_assert(void *cookie, int what, const char *file, int line)
{
}
#endif	/* INVARIANT_SUPPORT */

#include "opt_ddb.h"
#ifdef DDB
#include <ddb/ddb.h>

DB_SHOW_COMMAND(rangelock, db_show_rangelock)
{
	struct rangelock *lock;
	struct rl_q_entry *e, *x;
	uintptr_t v;

	if (!have_addr) {
		db_printf("show rangelock addr\n");
		return;
	}

	lock = (struct rangelock *)addr;
	db_printf("rangelock %p sleepers %d\n", lock, lock->sleepers);
	v = lock->head;
	if ((v & RL_CHEAT_CHEATING) != 0) {
		db_printf("  cheating head %#jx\n", (uintmax_t)v);
		return;
	}
	for (e = (struct rl_q_entry *)(lock->head);;) {
		x = rl_e_is_marked(e) ? rl_e_unmark(e) : e;
		if (x == NULL)
			break;
		db_printf("  entry %p marked %d %d start %#jx end %#jx "
		    "flags %x next %p",
		    e, rl_e_is_marked(e), rl_e_is_marked(x->rl_q_next),
		    x->rl_q_start, x->rl_q_end, x->rl_q_flags, x->rl_q_next);
#ifdef INVARIANTS
		db_printf(" owner %p (%d)", x->rl_q_owner,
		    x->rl_q_owner != NULL ? x->rl_q_owner->td_tid : -1);
#endif
		db_printf("\n");
		e = x->rl_q_next;
	}
}

#endif	/* DDB */