xref: /freebsd/sys/kern/kern_rmlock.c (revision 8ddb146abcdf061be9f2c0db7e391697dafad85c)
1 /*-
2  * SPDX-License-Identifier: BSD-3-Clause
3  *
4  * Copyright (c) 2007 Stephan Uphoff <ups@FreeBSD.org>
5  * All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice, this list of conditions and the following disclaimer.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in the
14  *    documentation and/or other materials provided with the distribution.
15  * 3. Neither the name of the author nor the names of any co-contributors
16  *    may be used to endorse or promote products derived from this software
17  *    without specific prior written permission.
18  *
19  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
20  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
23  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
25  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
26  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
28  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
29  * SUCH DAMAGE.
30  */
31 
32 /*
33  * Machine independent bits of reader/writer lock implementation.
34  */
35 
36 #include <sys/cdefs.h>
37 __FBSDID("$FreeBSD$");
38 
39 #include "opt_ddb.h"
40 
41 #include <sys/param.h>
42 #include <sys/systm.h>
43 
44 #include <sys/kernel.h>
45 #include <sys/kdb.h>
46 #include <sys/ktr.h>
47 #include <sys/lock.h>
48 #include <sys/mutex.h>
49 #include <sys/proc.h>
50 #include <sys/rmlock.h>
51 #include <sys/sched.h>
52 #include <sys/smp.h>
53 #include <sys/turnstile.h>
54 #include <sys/lock_profile.h>
55 #include <machine/cpu.h>
56 #include <vm/uma.h>
57 
58 #ifdef DDB
59 #include <ddb/ddb.h>
60 #endif
61 
/*
 * A cookie to mark destroyed rmlocks.  This is stored in the head of
 * rm_activeReaders.
 */
#define	RM_DESTROYED	((void *)0xdead)

/* True when rm_destroy() has stamped the lock with RM_DESTROYED. */
#define	rm_destroyed(rm)						\
	(LIST_FIRST(&(rm)->rm_activeReaders) == RM_DESTROYED)

/*
 * Values stored in rm_priotracker.rmp_flags.
 *
 * RMPF_ONQUEUE: the tracker has been linked onto rm_activeReaders.
 * RMPF_SIGNAL:  set by _rm_wlock() before it blocks on the turnstile;
 *               _rm_unlock_hard() signals the turnstile when it sees it.
 */
#define RMPF_ONQUEUE	1
#define RMPF_SIGNAL	2

/* Without INVARIANTS the internal assertion calls compile to nothing. */
#ifndef INVARIANTS
#define	_rm_assert(c, what, file, line)
#endif
77 
/*
 * Forward declarations of the lock_class hooks installed in lock_class_rm
 * and lock_class_rm_sleepable below.
 */
static void	assert_rm(const struct lock_object *lock, int what);
#ifdef DDB
static void	db_show_rm(const struct lock_object *lock);
#endif
static void	lock_rm(struct lock_object *lock, uintptr_t how);
#ifdef KDTRACE_HOOKS
static int	owner_rm(const struct lock_object *lock, struct thread **owner);
#endif
static uintptr_t unlock_rm(struct lock_object *lock);
87 
/*
 * Lock class used by rm_init_flags() for rmlocks created without
 * RM_SLEEPABLE.
 */
struct lock_class lock_class_rm = {
	.lc_name = "rm",
	.lc_flags = LC_SLEEPLOCK | LC_RECURSABLE,
	.lc_assert = assert_rm,
#ifdef DDB
	.lc_ddb_show = db_show_rm,
#endif
	.lc_lock = lock_rm,
	.lc_unlock = unlock_rm,
#ifdef KDTRACE_HOOKS
	.lc_owner = owner_rm,
#endif
};
101 
/*
 * Lock class used by rm_init_flags() for rmlocks created with RM_SLEEPABLE.
 * It shares every hook with lock_class_rm and differs only in its name and
 * the additional LC_SLEEPABLE flag.
 */
struct lock_class lock_class_rm_sleepable = {
	.lc_name = "sleepable rm",
	.lc_flags = LC_SLEEPLOCK | LC_SLEEPABLE | LC_RECURSABLE,
	.lc_assert = assert_rm,
#ifdef DDB
	.lc_ddb_show = db_show_rm,
#endif
	.lc_lock = lock_rm,
	.lc_unlock = unlock_rm,
#ifdef KDTRACE_HOOKS
	.lc_owner = owner_rm,
#endif
};
115 
/*
 * lc_assert hook: view the generic lock object as its rmlock and forward
 * the requested assertion 'what' to rm_assert().
 */
static void
assert_rm(const struct lock_object *lock, int what)
{

	rm_assert((const struct rmlock *)lock, what);
}
122 
123 static void
124 lock_rm(struct lock_object *lock, uintptr_t how)
125 {
126 	struct rmlock *rm;
127 	struct rm_priotracker *tracker;
128 
129 	rm = (struct rmlock *)lock;
130 	if (how == 0)
131 		rm_wlock(rm);
132 	else {
133 		tracker = (struct rm_priotracker *)how;
134 		rm_rlock(rm, tracker);
135 	}
136 }
137 
138 static uintptr_t
139 unlock_rm(struct lock_object *lock)
140 {
141 	struct thread *td;
142 	struct pcpu *pc;
143 	struct rmlock *rm;
144 	struct rm_queue *queue;
145 	struct rm_priotracker *tracker;
146 	uintptr_t how;
147 
148 	rm = (struct rmlock *)lock;
149 	tracker = NULL;
150 	how = 0;
151 	rm_assert(rm, RA_LOCKED | RA_NOTRECURSED);
152 	if (rm_wowned(rm))
153 		rm_wunlock(rm);
154 	else {
155 		/*
156 		 * Find the right rm_priotracker structure for curthread.
157 		 * The guarantee about its uniqueness is given by the fact
158 		 * we already asserted the lock wasn't recursively acquired.
159 		 */
160 		critical_enter();
161 		td = curthread;
162 		pc = get_pcpu();
163 		for (queue = pc->pc_rm_queue.rmq_next;
164 		    queue != &pc->pc_rm_queue; queue = queue->rmq_next) {
165 			tracker = (struct rm_priotracker *)queue;
166 				if ((tracker->rmp_rmlock == rm) &&
167 				    (tracker->rmp_thread == td)) {
168 					how = (uintptr_t)tracker;
169 					break;
170 				}
171 		}
172 		KASSERT(tracker != NULL,
173 		    ("rm_priotracker is non-NULL when lock held in read mode"));
174 		critical_exit();
175 		rm_runlock(rm, tracker);
176 	}
177 	return (how);
178 }
179 
180 #ifdef KDTRACE_HOOKS
181 static int
182 owner_rm(const struct lock_object *lock, struct thread **owner)
183 {
184 	const struct rmlock *rm;
185 	struct lock_class *lc;
186 
187 	rm = (const struct rmlock *)lock;
188 	lc = LOCK_CLASS(&rm->rm_wlock_object);
189 	return (lc->lc_owner(&rm->rm_wlock_object, owner));
190 }
191 #endif
192 
/*
 * Global spin mutex.  In this file every insertion into, removal from and
 * head inspection of an rmlock's rm_activeReaders list is done while
 * holding it.
 */
static struct mtx rm_spinlock;

MTX_SYSINIT(rm_spinlock, &rm_spinlock, "rm_spinlock", MTX_SPIN);
196 
/*
 * Add or remove tracker from per-cpu list.
 *
 * The per-cpu list can be traversed at any time in forward direction from an
 * interrupt on the *local* cpu.
 *
 * rm_tracker_add() links 'tracker' in at the head of pc's tracker queue.
 * The tracker's own pointers are filled in before the head pointer is
 * switched over to it.
 */
static void inline
rm_tracker_add(struct pcpu *pc, struct rm_priotracker *tracker)
{
	struct rm_queue *next;

	/* Initialize all tracker pointers */
	tracker->rmp_cpuQueue.rmq_prev = &pc->pc_rm_queue;
	next = pc->pc_rm_queue.rmq_next;
	tracker->rmp_cpuQueue.rmq_next = next;

	/* rmq_prev is not used during forward traversal. */
	next->rmq_prev = &tracker->rmp_cpuQueue;

	/* Update pointer to first element. */
	pc->pc_rm_queue.rmq_next = &tracker->rmp_cpuQueue;
}
219 
220 /*
221  * Return a count of the number of trackers the thread 'td' already
222  * has on this CPU for the lock 'rm'.
223  */
224 static int
225 rm_trackers_present(const struct pcpu *pc, const struct rmlock *rm,
226     const struct thread *td)
227 {
228 	struct rm_queue *queue;
229 	struct rm_priotracker *tracker;
230 	int count;
231 
232 	count = 0;
233 	for (queue = pc->pc_rm_queue.rmq_next; queue != &pc->pc_rm_queue;
234 	    queue = queue->rmq_next) {
235 		tracker = (struct rm_priotracker *)queue;
236 		if ((tracker->rmp_rmlock == rm) && (tracker->rmp_thread == td))
237 			count++;
238 	}
239 	return (count);
240 }
241 
/*
 * Unlink 'tracker' from the per-cpu tracker queue it is on.  'pc' is not
 * referenced; the neighbours are reached through the tracker's own links.
 */
static void inline
rm_tracker_remove(struct pcpu *pc, struct rm_priotracker *tracker)
{
	struct rm_queue *next, *prev;

	next = tracker->rmp_cpuQueue.rmq_next;
	prev = tracker->rmp_cpuQueue.rmq_prev;

	/* Not used during forward traversal. */
	next->rmq_prev = prev;

	/* Remove from list. */
	prev->rmq_next = next;
}
256 
257 static void
258 rm_cleanIPI(void *arg)
259 {
260 	struct pcpu *pc;
261 	struct rmlock *rm = arg;
262 	struct rm_priotracker *tracker;
263 	struct rm_queue *queue;
264 	pc = get_pcpu();
265 
266 	for (queue = pc->pc_rm_queue.rmq_next; queue != &pc->pc_rm_queue;
267 	    queue = queue->rmq_next) {
268 		tracker = (struct rm_priotracker *)queue;
269 		if (tracker->rmp_rmlock == rm && tracker->rmp_flags == 0) {
270 			tracker->rmp_flags = RMPF_ONQUEUE;
271 			mtx_lock_spin(&rm_spinlock);
272 			LIST_INSERT_HEAD(&rm->rm_activeReaders, tracker,
273 			    rmp_qentry);
274 			mtx_unlock_spin(&rm_spinlock);
275 		}
276 	}
277 }
278 
279 void
280 rm_init_flags(struct rmlock *rm, const char *name, int opts)
281 {
282 	struct lock_class *lc;
283 	int liflags, xflags;
284 
285 	liflags = 0;
286 	if (!(opts & RM_NOWITNESS))
287 		liflags |= LO_WITNESS;
288 	if (opts & RM_RECURSE)
289 		liflags |= LO_RECURSABLE;
290 	if (opts & RM_NEW)
291 		liflags |= LO_NEW;
292 	if (opts & RM_DUPOK)
293 		liflags |= LO_DUPOK;
294 	rm->rm_writecpus = all_cpus;
295 	LIST_INIT(&rm->rm_activeReaders);
296 	if (opts & RM_SLEEPABLE) {
297 		liflags |= LO_SLEEPABLE;
298 		lc = &lock_class_rm_sleepable;
299 		xflags = (opts & RM_NEW ? SX_NEW : 0);
300 		sx_init_flags(&rm->rm_lock_sx, "rmlock_sx",
301 		    xflags | SX_NOWITNESS);
302 	} else {
303 		lc = &lock_class_rm;
304 		xflags = (opts & RM_NEW ? MTX_NEW : 0);
305 		mtx_init(&rm->rm_lock_mtx, name, "rmlock_mtx",
306 		    xflags | MTX_NOWITNESS);
307 	}
308 	lock_init(&rm->lock_object, lc, name, NULL, liflags);
309 }
310 
/*
 * Initialize 'rm' with default options (equivalent to rm_init_flags()
 * with opts == 0).
 */
void
rm_init(struct rmlock *rm, const char *name)
{

	rm_init_flags(rm, name, 0);
}
317 
318 void
319 rm_destroy(struct rmlock *rm)
320 {
321 
322 	rm_assert(rm, RA_UNLOCKED);
323 	LIST_FIRST(&rm->rm_activeReaders) = RM_DESTROYED;
324 	if (rm->lock_object.lo_flags & LO_SLEEPABLE)
325 		sx_destroy(&rm->rm_lock_sx);
326 	else
327 		mtx_destroy(&rm->rm_lock_mtx);
328 	lock_destroy(&rm->lock_object);
329 }
330 
331 int
332 rm_wowned(const struct rmlock *rm)
333 {
334 
335 	if (rm->lock_object.lo_flags & LO_SLEEPABLE)
336 		return (sx_xlocked(&rm->rm_lock_sx));
337 	else
338 		return (mtx_owned(&rm->rm_lock_mtx));
339 }
340 
341 void
342 rm_sysinit(void *arg)
343 {
344 	struct rm_args *args;
345 
346 	args = arg;
347 	rm_init_flags(args->ra_rm, args->ra_desc, args->ra_flags);
348 }
349 
/*
 * Slow path of the read lock, entered from _rm_rlock() when its fast-path
 * test fails (a preemption is owed or this CPU is in rm_writecpus).  On
 * entry 'tracker' is already linked on the current CPU's queue and the
 * thread is pinned.
 *
 * Returns 1 when the read lock is held.  Returns 0 only when 'trylock' is
 * non-zero and the backing write lock could not be taken; in that case the
 * tracker has been unlinked and the thread unpinned.
 */
static __noinline int
_rm_rlock_hard(struct rmlock *rm, struct rm_priotracker *tracker, int trylock)
{
	struct pcpu *pc;

	critical_enter();
	pc = get_pcpu();

	/* Check if we just need to do a proper critical_exit. */
	if (!CPU_ISSET(pc->pc_cpuid, &rm->rm_writecpus)) {
		critical_exit();
		return (1);
	}

	/* Remove our tracker from the per-cpu list. */
	rm_tracker_remove(pc, tracker);

	/*
	 * Check to see if the IPI granted us the lock after all.  The load of
	 * rmp_flags must happen after the tracker is removed from the list.
	 */
	atomic_interrupt_fence();
	if (tracker->rmp_flags) {
		/* Just add back tracker - we hold the lock. */
		rm_tracker_add(pc, tracker);
		critical_exit();
		return (1);
	}

	/*
	 * We allow readers to acquire a lock even if a writer is blocked if
	 * the lock is recursive and the reader already holds the lock.
	 */
	if ((rm->lock_object.lo_flags & LO_RECURSABLE) != 0) {
		/*
		 * Just grant the lock if this thread already has a tracker
		 * for this lock on the per-cpu queue.
		 */
		if (rm_trackers_present(pc, rm, curthread) != 0) {
			/*
			 * Register directly on rm_activeReaders, as
			 * rm_cleanIPI() would have done for us.
			 */
			mtx_lock_spin(&rm_spinlock);
			LIST_INSERT_HEAD(&rm->rm_activeReaders, tracker,
			    rmp_qentry);
			tracker->rmp_flags = RMPF_ONQUEUE;
			mtx_unlock_spin(&rm_spinlock);
			rm_tracker_add(pc, tracker);
			critical_exit();
			return (1);
		}
	}

	/* Drop the pin taken in _rm_rlock() before possibly blocking. */
	sched_unpin();
	critical_exit();

	/* Take the backing write lock. */
	if (trylock) {
		if (rm->lock_object.lo_flags & LO_SLEEPABLE) {
			if (!sx_try_xlock(&rm->rm_lock_sx))
				return (0);
		} else {
			if (!mtx_trylock(&rm->rm_lock_mtx))
				return (0);
		}
	} else {
		if (rm->lock_object.lo_flags & LO_SLEEPABLE) {
			/*
			 * _rm_rlock() marked the thread no-sleeping; lift
			 * that only around the sx acquisition.
			 */
			THREAD_SLEEPING_OK();
			sx_xlock(&rm->rm_lock_sx);
			THREAD_NO_SLEEPING();
		} else
			mtx_lock(&rm->rm_lock_mtx);
	}

	/*
	 * With the write lock held, clear the CPU we are now running on
	 * from rm_writecpus, link the tracker on that CPU and pin again.
	 */
	critical_enter();
	pc = get_pcpu();
	CPU_CLR(pc->pc_cpuid, &rm->rm_writecpus);
	rm_tracker_add(pc, tracker);
	sched_pin();
	critical_exit();

	if (rm->lock_object.lo_flags & LO_SLEEPABLE)
		sx_xunlock(&rm->rm_lock_sx);
	else
		mtx_unlock(&rm->rm_lock_mtx);

	return (1);
}
434 
/*
 * Acquire 'rm' for reading, recording the acquisition in 'tracker' on the
 * current CPU's tracker queue.
 *
 * Returns 1 when the read lock is held (and immediately, without touching
 * anything, when the scheduler is stopped).  Returns 0 only if 'trylock'
 * is non-zero and the slow path could not take the backing write lock.
 * On success the thread is left pinned; _rm_runlock() unpins it.
 */
int
_rm_rlock(struct rmlock *rm, struct rm_priotracker *tracker, int trylock)
{
	struct thread *td = curthread;
	struct pcpu *pc;

	if (SCHEDULER_STOPPED())
		return (1);

	tracker->rmp_flags  = 0;
	tracker->rmp_thread = td;
	tracker->rmp_rmlock = rm;

	/* Readers of a sleepable rmlock may not sleep while holding it. */
	if (rm->lock_object.lo_flags & LO_SLEEPABLE)
		THREAD_NO_SLEEPING();

	td->td_critnest++;	/* critical_enter(); */
	atomic_interrupt_fence();

	pc = cpuid_to_pcpu[td->td_oncpu];
	rm_tracker_add(pc, tracker);
	sched_pin();

	/* Open-coded exit from the critical section entered above. */
	atomic_interrupt_fence();
	td->td_critnest--;

	/*
	 * Fast path to combine two common conditions into a single
	 * conditional jump.
	 */
	if (__predict_true(0 == (td->td_owepreempt |
	    CPU_ISSET(pc->pc_cpuid, &rm->rm_writecpus))))
		return (1);

	/* We do not have a read token and need to acquire one. */
	return _rm_rlock_hard(rm, tracker, trylock);
}
472 
/*
 * Slow path of the read unlock, entered from _rm_runlock() when a
 * preemption is owed or the tracker has flags set.  The tracker has
 * already been removed from the per-cpu queue by the caller.
 */
static __noinline void
_rm_unlock_hard(struct thread *td,struct rm_priotracker *tracker)
{

	/*
	 * The caller left its critical section by hand; re-enter and leave
	 * through critical_exit(), presumably so that it deals with the
	 * owed preemption.
	 */
	if (td->td_owepreempt) {
		td->td_critnest++;
		critical_exit();
	}

	/* No flags: the tracker never made it onto rm_activeReaders. */
	if (!tracker->rmp_flags)
		return;

	mtx_lock_spin(&rm_spinlock);
	LIST_REMOVE(tracker, rmp_qentry);

	if (tracker->rmp_flags & RMPF_SIGNAL) {
		struct rmlock *rm;
		struct turnstile *ts;

		rm = tracker->rmp_rmlock;

		/*
		 * _rm_wlock() is blocked on the turnstile waiting for this
		 * tracker.  The turnstile chain lock is taken before the
		 * spin lock is dropped.
		 */
		turnstile_chain_lock(&rm->lock_object);
		mtx_unlock_spin(&rm_spinlock);

		ts = turnstile_lookup(&rm->lock_object);

		turnstile_signal(ts, TS_EXCLUSIVE_QUEUE);
		turnstile_unpend(ts);
		turnstile_chain_unlock(&rm->lock_object);
	} else
		mtx_unlock_spin(&rm_spinlock);
}
505 
/*
 * Release a read lock on 'rm' that was acquired with 'tracker'.  The
 * tracker is unlinked from the per-cpu queue, the thread is unpinned and,
 * if a preemption is owed or the tracker carries flags, the slow path
 * _rm_unlock_hard() finishes the job.  Does nothing when the scheduler
 * is stopped.
 */
void
_rm_runlock(struct rmlock *rm, struct rm_priotracker *tracker)
{
	struct pcpu *pc;
	struct thread *td = tracker->rmp_thread;

	if (SCHEDULER_STOPPED())
		return;

	td->td_critnest++;	/* critical_enter(); */
	atomic_interrupt_fence();

	pc = cpuid_to_pcpu[td->td_oncpu];
	rm_tracker_remove(pc, tracker);

	/* Open-coded exit from the critical section entered above. */
	atomic_interrupt_fence();
	td->td_critnest--;
	sched_unpin();

	/* Undo the THREAD_NO_SLEEPING() done by _rm_rlock(). */
	if (rm->lock_object.lo_flags & LO_SLEEPABLE)
		THREAD_SLEEPING_OK();

	/* Fast path: no owed preemption and no flags on the tracker. */
	if (__predict_true(0 == (td->td_owepreempt | tracker->rmp_flags)))
		return;

	_rm_unlock_hard(td, tracker);
}
533 
/*
 * Acquire 'rm' for writing.  Takes the backing write lock, then, if any
 * CPU has been cleared from rm_writecpus, revokes those read tokens and
 * waits until every reader registered on rm_activeReaders has released
 * the lock.  Does nothing when the scheduler is stopped.
 */
void
_rm_wlock(struct rmlock *rm)
{
	struct rm_priotracker *prio;
	struct turnstile *ts;
	cpuset_t readcpus;

	if (SCHEDULER_STOPPED())
		return;

	if (rm->lock_object.lo_flags & LO_SLEEPABLE)
		sx_xlock(&rm->rm_lock_sx);
	else
		mtx_lock(&rm->rm_lock_mtx);

	/* CPU_CMP() is non-zero when the two sets differ. */
	if (CPU_CMP(&rm->rm_writecpus, &all_cpus)) {
		/* Get all read tokens back */
		readcpus = all_cpus;
		CPU_ANDNOT(&readcpus, &readcpus, &rm->rm_writecpus);
		rm->rm_writecpus = all_cpus;

		/*
		 * Assumes rm->rm_writecpus update is visible on other CPUs
		 * before rm_cleanIPI is called.
		 */
#ifdef SMP
		smp_rendezvous_cpus(readcpus,
		    smp_no_rendezvous_barrier,
		    rm_cleanIPI,
		    smp_no_rendezvous_barrier,
		    rm);

#else
		rm_cleanIPI(rm);
#endif

		/*
		 * Block on the turnstile once per remaining active reader.
		 * RMPF_SIGNAL asks that reader's unlock path to signal the
		 * turnstile; the reader thread is passed to turnstile_wait()
		 * as the owner.
		 */
		mtx_lock_spin(&rm_spinlock);
		while ((prio = LIST_FIRST(&rm->rm_activeReaders)) != NULL) {
			ts = turnstile_trywait(&rm->lock_object);
			prio->rmp_flags = RMPF_ONQUEUE | RMPF_SIGNAL;
			mtx_unlock_spin(&rm_spinlock);
			turnstile_wait(ts, prio->rmp_thread,
			    TS_EXCLUSIVE_QUEUE);
			mtx_lock_spin(&rm_spinlock);
		}
		mtx_unlock_spin(&rm_spinlock);
	}
}
582 
583 void
584 _rm_wunlock(struct rmlock *rm)
585 {
586 
587 	if (rm->lock_object.lo_flags & LO_SLEEPABLE)
588 		sx_xunlock(&rm->rm_lock_sx);
589 	else
590 		mtx_unlock(&rm->rm_lock_mtx);
591 }
592 
593 #if LOCK_DEBUG > 0
594 
/*
 * Debugging variant of _rm_wlock(): checks that the caller is not the
 * idle thread, that the lock has not been destroyed and is not already
 * held by the caller, runs the witness order check, takes the lock and
 * then logs and accounts for it.  'file' and 'line' identify the caller.
 */
void
_rm_wlock_debug(struct rmlock *rm, const char *file, int line)
{

	if (SCHEDULER_STOPPED())
		return;

	KASSERT(kdb_active != 0 || !TD_IS_IDLETHREAD(curthread),
	    ("rm_wlock() by idle thread %p on rmlock %s @ %s:%d",
	    curthread, rm->lock_object.lo_name, file, line));
	KASSERT(!rm_destroyed(rm),
	    ("rm_wlock() of destroyed rmlock @ %s:%d", file, line));
	_rm_assert(rm, RA_UNLOCKED, file, line);

	WITNESS_CHECKORDER(&rm->lock_object, LOP_NEWORDER | LOP_EXCLUSIVE,
	    file, line, NULL);

	_rm_wlock(rm);

	LOCK_LOG_LOCK("RMWLOCK", &rm->lock_object, 0, 0, file, line);
	WITNESS_LOCK(&rm->lock_object, LOP_EXCLUSIVE, file, line);
	TD_LOCKS_INC(curthread);
}
618 
/*
 * Debugging variant of _rm_wunlock(): checks that the lock has not been
 * destroyed and is write locked by the caller, updates witness and the
 * lock log, then releases the lock and drops the thread's lock count.
 */
void
_rm_wunlock_debug(struct rmlock *rm, const char *file, int line)
{

	if (SCHEDULER_STOPPED())
		return;

	KASSERT(!rm_destroyed(rm),
	    ("rm_wunlock() of destroyed rmlock @ %s:%d", file, line));
	_rm_assert(rm, RA_WLOCKED, file, line);
	WITNESS_UNLOCK(&rm->lock_object, LOP_EXCLUSIVE, file, line);
	LOCK_LOG_LOCK("RMWUNLOCK", &rm->lock_object, 0, 0, file, line);
	_rm_wunlock(rm);
	TD_LOCKS_DEC(curthread);
}
634 
635 int
636 _rm_rlock_debug(struct rmlock *rm, struct rm_priotracker *tracker,
637     int trylock, const char *file, int line)
638 {
639 
640 	if (SCHEDULER_STOPPED())
641 		return (1);
642 
643 #ifdef INVARIANTS
644 	if (!(rm->lock_object.lo_flags & LO_RECURSABLE) && !trylock) {
645 		critical_enter();
646 		KASSERT(rm_trackers_present(get_pcpu(), rm,
647 		    curthread) == 0,
648 		    ("rm_rlock: recursed on non-recursive rmlock %s @ %s:%d\n",
649 		    rm->lock_object.lo_name, file, line));
650 		critical_exit();
651 	}
652 #endif
653 	KASSERT(kdb_active != 0 || !TD_IS_IDLETHREAD(curthread),
654 	    ("rm_rlock() by idle thread %p on rmlock %s @ %s:%d",
655 	    curthread, rm->lock_object.lo_name, file, line));
656 	KASSERT(!rm_destroyed(rm),
657 	    ("rm_rlock() of destroyed rmlock @ %s:%d", file, line));
658 	if (!trylock) {
659 		KASSERT(!rm_wowned(rm),
660 		    ("rm_rlock: wlock already held for %s @ %s:%d",
661 		    rm->lock_object.lo_name, file, line));
662 		WITNESS_CHECKORDER(&rm->lock_object,
663 		    LOP_NEWORDER | LOP_NOSLEEP, file, line, NULL);
664 	}
665 
666 	if (_rm_rlock(rm, tracker, trylock)) {
667 		if (trylock)
668 			LOCK_LOG_TRY("RMRLOCK", &rm->lock_object, 0, 1, file,
669 			    line);
670 		else
671 			LOCK_LOG_LOCK("RMRLOCK", &rm->lock_object, 0, 0, file,
672 			    line);
673 		WITNESS_LOCK(&rm->lock_object, LOP_NOSLEEP, file, line);
674 		TD_LOCKS_INC(curthread);
675 		return (1);
676 	} else if (trylock)
677 		LOCK_LOG_TRY("RMRLOCK", &rm->lock_object, 0, 0, file, line);
678 
679 	return (0);
680 }
681 
/*
 * Debugging variant of _rm_runlock(): checks that the lock has not been
 * destroyed and is read locked by the caller, updates witness and the
 * lock log, then releases the lock and drops the thread's lock count.
 */
void
_rm_runlock_debug(struct rmlock *rm, struct rm_priotracker *tracker,
    const char *file, int line)
{

	if (SCHEDULER_STOPPED())
		return;

	KASSERT(!rm_destroyed(rm),
	    ("rm_runlock() of destroyed rmlock @ %s:%d", file, line));
	_rm_assert(rm, RA_RLOCKED, file, line);
	WITNESS_UNLOCK(&rm->lock_object, 0, file, line);
	LOCK_LOG_LOCK("RMRUNLOCK", &rm->lock_object, 0, 0, file, line);
	_rm_runlock(rm, tracker);
	TD_LOCKS_DEC(curthread);
}
698 
699 #else
700 
/*
 * Just strip out file and line arguments if no lock debugging is enabled in
 * the kernel - we are called from a kernel module.
 */
/* Write-lock stub: 'file' and 'line' are ignored. */
void
_rm_wlock_debug(struct rmlock *rm, const char *file, int line)
{

	_rm_wlock(rm);
}
711 
/* Write-unlock stub used when LOCK_DEBUG is off; ignores 'file'/'line'. */
void
_rm_wunlock_debug(struct rmlock *rm, const char *file, int line)
{

	_rm_wunlock(rm);
}
718 
/*
 * Read-lock stub used when LOCK_DEBUG is off; ignores 'file'/'line' and
 * returns whatever _rm_rlock() returns.
 */
int
_rm_rlock_debug(struct rmlock *rm, struct rm_priotracker *tracker,
    int trylock, const char *file, int line)
{

	return _rm_rlock(rm, tracker, trylock);
}
726 
/* Read-unlock stub used when LOCK_DEBUG is off; ignores 'file'/'line'. */
void
_rm_runlock_debug(struct rmlock *rm, struct rm_priotracker *tracker,
    const char *file, int line)
{

	_rm_runlock(rm, tracker);
}
734 
735 #endif
736 
737 #ifdef INVARIANT_SUPPORT
738 #ifndef INVARIANTS
739 #undef _rm_assert
740 #endif
741 
742 /*
743  * Note that this does not need to use witness_assert() for read lock
744  * assertions since an exact count of read locks held by this thread
745  * is computable.
746  */
747 void
748 _rm_assert(const struct rmlock *rm, int what, const char *file, int line)
749 {
750 	int count;
751 
752 	if (SCHEDULER_STOPPED())
753 		return;
754 	switch (what) {
755 	case RA_LOCKED:
756 	case RA_LOCKED | RA_RECURSED:
757 	case RA_LOCKED | RA_NOTRECURSED:
758 	case RA_RLOCKED:
759 	case RA_RLOCKED | RA_RECURSED:
760 	case RA_RLOCKED | RA_NOTRECURSED:
761 		/*
762 		 * Handle the write-locked case.  Unlike other
763 		 * primitives, writers can never recurse.
764 		 */
765 		if (rm_wowned(rm)) {
766 			if (what & RA_RLOCKED)
767 				panic("Lock %s exclusively locked @ %s:%d\n",
768 				    rm->lock_object.lo_name, file, line);
769 			if (what & RA_RECURSED)
770 				panic("Lock %s not recursed @ %s:%d\n",
771 				    rm->lock_object.lo_name, file, line);
772 			break;
773 		}
774 
775 		critical_enter();
776 		count = rm_trackers_present(get_pcpu(), rm, curthread);
777 		critical_exit();
778 
779 		if (count == 0)
780 			panic("Lock %s not %slocked @ %s:%d\n",
781 			    rm->lock_object.lo_name, (what & RA_RLOCKED) ?
782 			    "read " : "", file, line);
783 		if (count > 1) {
784 			if (what & RA_NOTRECURSED)
785 				panic("Lock %s recursed @ %s:%d\n",
786 				    rm->lock_object.lo_name, file, line);
787 		} else if (what & RA_RECURSED)
788 			panic("Lock %s not recursed @ %s:%d\n",
789 			    rm->lock_object.lo_name, file, line);
790 		break;
791 	case RA_WLOCKED:
792 		if (!rm_wowned(rm))
793 			panic("Lock %s not exclusively locked @ %s:%d\n",
794 			    rm->lock_object.lo_name, file, line);
795 		break;
796 	case RA_UNLOCKED:
797 		if (rm_wowned(rm))
798 			panic("Lock %s exclusively locked @ %s:%d\n",
799 			    rm->lock_object.lo_name, file, line);
800 
801 		critical_enter();
802 		count = rm_trackers_present(get_pcpu(), rm, curthread);
803 		critical_exit();
804 
805 		if (count != 0)
806 			panic("Lock %s read locked @ %s:%d\n",
807 			    rm->lock_object.lo_name, file, line);
808 		break;
809 	default:
810 		panic("Unknown rm lock assertion: %d @ %s:%d", what, file,
811 		    line);
812 	}
813 }
814 #endif /* INVARIANT_SUPPORT */
815 
816 #ifdef DDB
817 static void
818 print_tracker(struct rm_priotracker *tr)
819 {
820 	struct thread *td;
821 
822 	td = tr->rmp_thread;
823 	db_printf("   thread %p (tid %d, pid %d, \"%s\") {", td, td->td_tid,
824 	    td->td_proc->p_pid, td->td_name);
825 	if (tr->rmp_flags & RMPF_ONQUEUE) {
826 		db_printf("ONQUEUE");
827 		if (tr->rmp_flags & RMPF_SIGNAL)
828 			db_printf(",SIGNAL");
829 	} else
830 		db_printf("0");
831 	db_printf("}\n");
832 }
833 
834 static void
835 db_show_rm(const struct lock_object *lock)
836 {
837 	struct rm_priotracker *tr;
838 	struct rm_queue *queue;
839 	const struct rmlock *rm;
840 	struct lock_class *lc;
841 	struct pcpu *pc;
842 
843 	rm = (const struct rmlock *)lock;
844 	db_printf(" writecpus: ");
845 	ddb_display_cpuset(__DEQUALIFY(const cpuset_t *, &rm->rm_writecpus));
846 	db_printf("\n");
847 	db_printf(" per-CPU readers:\n");
848 	STAILQ_FOREACH(pc, &cpuhead, pc_allcpu)
849 		for (queue = pc->pc_rm_queue.rmq_next;
850 		    queue != &pc->pc_rm_queue; queue = queue->rmq_next) {
851 			tr = (struct rm_priotracker *)queue;
852 			if (tr->rmp_rmlock == rm)
853 				print_tracker(tr);
854 		}
855 	db_printf(" active readers:\n");
856 	LIST_FOREACH(tr, &rm->rm_activeReaders, rmp_qentry)
857 		print_tracker(tr);
858 	lc = LOCK_CLASS(&rm->rm_wlock_object);
859 	db_printf("Backing write-lock (%s):\n", lc->lc_name);
860 	lc->lc_ddb_show(&rm->rm_wlock_object);
861 }
862 #endif
863 
864 /*
865  * Read-mostly sleepable locks.
866  *
867  * These primitives allow both readers and writers to sleep. However, neither
868  * readers nor writers are tracked and subsequently there is no priority
869  * propagation.
870  *
871  * They are intended to be only used when write-locking is almost never needed
872  * (e.g., they can guard against unloading a kernel module) while read-locking
873  * happens all the time.
874  *
875  * Concurrent writers take turns taking the lock while going off cpu. If this is
876  * of concern for your usecase, this is not the right primitive.
877  *
878  * Neither rms_rlock nor rms_runlock use thread fences. Instead interrupt
879  * fences are inserted to ensure ordering with the code executed in the IPI
880  * handler.
881  *
882  * No attempt is made to track which CPUs read locked at least once,
883  * consequently write locking sends IPIs to all of them. This will become a
884  * problem at some point. The easiest way to lessen it is to provide a bitmap.
885  */
886 
/*
 * Special values of rmslock.owner (otherwise it holds the writing thread).
 *
 * RMS_NOOWNER:   no writer; set by rms_init() and by rms_wunlock() when no
 *                other writer is waiting.
 * RMS_TRANSIENT: set by rms_wunlock() while handing the lock to a waiting
 *                writer, which expects to see it when it wakes up.
 * RMS_FLAGMASK:  not referenced in this part of the file; presumably a
 *                mask covering the special values above - TODO confirm.
 */
#define	RMS_NOOWNER	((void *)0x1)
#define	RMS_TRANSIENT	((void *)0x2)
#define	RMS_FLAGMASK	0xf
890 
/*
 * Per-CPU state of an rmslock.
 */
struct rmslock_pcpu {
	int influx;	/* 1 while this CPU is between influx_enter/exit */
	int readers;	/* reader count kept on this CPU */
};

/* Instances are allocated from pcpu_zone_8 in rms_init(). */
_Static_assert(sizeof(struct rmslock_pcpu) == 8, "bad size");
897 
/*
 * Internal routines
 */

/*
 * Return the current CPU's rmslock_pcpu for 'rms'.  Must be called from
 * within a critical section (asserted).
 */
static struct rmslock_pcpu *
rms_int_pcpu(struct rmslock *rms)
{

	CRITICAL_ASSERT(curthread);
	return (zpcpu_get(rms->pcpu));
}
908 
/*
 * Return the rmslock_pcpu of 'rms' that belongs to CPU number 'cpu'.
 */
static struct rmslock_pcpu *
rms_int_remote_pcpu(struct rmslock *rms, int cpu)
{

	return (zpcpu_get_cpu(rms->pcpu, cpu));
}
915 
/*
 * Mark this CPU's 'pcpu' state as in flux.  While the flag is set,
 * rms_action_func() leaves this CPU's reader count alone and
 * rms_wait_func() spins.  Requires a critical section; the flag must
 * currently be clear.  'rms' is unused.
 */
static void
rms_int_influx_enter(struct rmslock *rms, struct rmslock_pcpu *pcpu)
{

	CRITICAL_ASSERT(curthread);
	MPASS(pcpu->influx == 0);
	pcpu->influx = 1;
}
924 
/*
 * Clear the in-flux mark set by rms_int_influx_enter().  Requires a
 * critical section; the flag must currently be set.  'rms' is unused.
 */
static void
rms_int_influx_exit(struct rmslock *rms, struct rmslock_pcpu *pcpu)
{

	CRITICAL_ASSERT(curthread);
	MPASS(pcpu->influx == 1);
	pcpu->influx = 0;
}
933 
934 #ifdef INVARIANTS
935 static void
936 rms_int_debug_readers_inc(struct rmslock *rms)
937 {
938 	int old;
939 	old = atomic_fetchadd_int(&rms->debug_readers, 1);
940 	KASSERT(old >= 0, ("%s: bad readers count %d\n", __func__, old));
941 }
942 
943 static void
944 rms_int_debug_readers_dec(struct rmslock *rms)
945 {
946 	int old;
947 
948 	old = atomic_fetchadd_int(&rms->debug_readers, -1);
949 	KASSERT(old > 0, ("%s: bad readers count %d\n", __func__, old));
950 }
951 #else
/* Without INVARIANTS the debug reader count is not maintained. */
static void
rms_int_debug_readers_inc(struct rmslock *rms)
{
}
956 
/* Without INVARIANTS the debug reader count is not maintained. */
static void
rms_int_debug_readers_dec(struct rmslock *rms)
{
}
961 #endif
962 
/*
 * Account for one more reader in the per-CPU state 'pcpu' (and in the
 * debug count).  Requires a critical section.
 */
static void
rms_int_readers_inc(struct rmslock *rms, struct rmslock_pcpu *pcpu)
{

	CRITICAL_ASSERT(curthread);
	rms_int_debug_readers_inc(rms);
	pcpu->readers++;
}
971 
/*
 * Account for one reader less in the per-CPU state 'pcpu' (and in the
 * debug count).  Requires a critical section.
 */
static void
rms_int_readers_dec(struct rmslock *rms, struct rmslock_pcpu *pcpu)
{

	CRITICAL_ASSERT(curthread);
	rms_int_debug_readers_dec(rms);
	pcpu->readers--;
}
980 
/*
 * Public API
 */

/*
 * Initialize 'rms': no owner, all counters zero, a default mutex named
 * 'name' and zeroed per-CPU state.  The per-CPU allocation uses M_WAITOK.
 */
void
rms_init(struct rmslock *rms, const char *name)
{

	rms->owner = RMS_NOOWNER;
	rms->writers = 0;
	rms->readers = 0;
	rms->debug_readers = 0;
	mtx_init(&rms->mtx, name, NULL, MTX_DEF | MTX_NEW);
	rms->pcpu = uma_zalloc_pcpu(pcpu_zone_8, M_WAITOK | M_ZERO);
}
995 
/*
 * Destroy 'rms', releasing its mutex and per-CPU state.  Asserts that
 * the writer count and the lock-wide reader count are both zero.
 */
void
rms_destroy(struct rmslock *rms)
{

	MPASS(rms->writers == 0);
	MPASS(rms->readers == 0);
	mtx_destroy(&rms->mtx);
	uma_zfree_pcpu(pcpu_zone_8, rms->pcpu);
}
1005 
/*
 * Slow path of rms_rlock(), taken when a writer is present.  Entered in a
 * critical section with this CPU's influx flag set; both are dropped
 * first.  The thread then sleeps under rms->mtx until no writers remain
 * and registers itself as a reader on whatever CPU it is running on.
 */
static void __noinline
rms_rlock_fallback(struct rmslock *rms)
{

	rms_int_influx_exit(rms, rms_int_pcpu(rms));
	critical_exit();

	mtx_lock(&rms->mtx);
	/* rms_wunlock() wakes this channel when the last writer leaves. */
	while (rms->writers > 0)
		msleep(&rms->readers, &rms->mtx, PUSER - 1, mtx_name(&rms->mtx), 0);
	/* Register as a reader before the mutex is released. */
	critical_enter();
	rms_int_readers_inc(rms, rms_int_pcpu(rms));
	mtx_unlock(&rms->mtx);
	critical_exit();
	TD_LOCKS_INC(curthread);
}
1022 
/*
 * Acquire 'rms' for reading; may sleep if a writer is present.  The
 * caller must not be the current write owner (asserted).
 *
 * Fast path: inside a critical section, with this CPU flagged as in flux,
 * check that there are no writers and bump the per-CPU reader count.
 */
void
rms_rlock(struct rmslock *rms)
{
	struct rmslock_pcpu *pcpu;

	WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL, __func__);
	MPASS(atomic_load_ptr(&rms->owner) != curthread);

	critical_enter();
	pcpu = rms_int_pcpu(rms);
	rms_int_influx_enter(rms, pcpu);
	atomic_interrupt_fence();
	if (__predict_false(rms->writers > 0)) {
		/* The fallback clears influx and leaves the section. */
		rms_rlock_fallback(rms);
		return;
	}
	atomic_interrupt_fence();
	rms_int_readers_inc(rms, pcpu);
	atomic_interrupt_fence();
	rms_int_influx_exit(rms, pcpu);
	critical_exit();
	TD_LOCKS_INC(curthread);
}
1046 
/*
 * Try to acquire 'rms' for reading without sleeping.  Returns 1 on
 * success and 0 if a writer is present.  The caller must not be the
 * current write owner (asserted).
 */
int
rms_try_rlock(struct rmslock *rms)
{
	struct rmslock_pcpu *pcpu;

	MPASS(atomic_load_ptr(&rms->owner) != curthread);

	critical_enter();
	pcpu = rms_int_pcpu(rms);
	rms_int_influx_enter(rms, pcpu);
	atomic_interrupt_fence();
	if (__predict_false(rms->writers > 0)) {
		/* Writer present: back out instead of waiting. */
		rms_int_influx_exit(rms, pcpu);
		critical_exit();
		return (0);
	}
	atomic_interrupt_fence();
	rms_int_readers_inc(rms, pcpu);
	atomic_interrupt_fence();
	rms_int_influx_exit(rms, pcpu);
	critical_exit();
	TD_LOCKS_INC(curthread);
	return (1);
}
1071 
/*
 * Slow path of rms_runlock(), taken when a writer is present.  Entered in
 * a critical section with this CPU's influx flag set; both are dropped
 * first.  The release is then charged against the lock-wide rms->readers
 * count under rms->mtx, and the waiting writer is woken when that count
 * reaches zero.
 */
static void __noinline
rms_runlock_fallback(struct rmslock *rms)
{

	rms_int_influx_exit(rms, rms_int_pcpu(rms));
	critical_exit();

	mtx_lock(&rms->mtx);
	MPASS(rms->writers > 0);
	MPASS(rms->readers > 0);
	MPASS(rms->debug_readers == rms->readers);
	rms_int_debug_readers_dec(rms);
	rms->readers--;
	/* rms_wlock() sleeps on &rms->writers until readers drain. */
	if (rms->readers == 0)
		wakeup_one(&rms->writers);
	mtx_unlock(&rms->mtx);
	TD_LOCKS_DEC(curthread);
}
1090 
/*
 * Release a read lock on 'rms'.
 *
 * Fast path: inside a critical section, with this CPU flagged as in flux,
 * check that there are no writers and decrement the per-CPU reader count.
 * The decrement happens on whatever CPU the thread is running on now.
 */
void
rms_runlock(struct rmslock *rms)
{
	struct rmslock_pcpu *pcpu;

	critical_enter();
	pcpu = rms_int_pcpu(rms);
	rms_int_influx_enter(rms, pcpu);
	atomic_interrupt_fence();
	if (__predict_false(rms->writers > 0)) {
		/* The fallback clears influx and leaves the section. */
		rms_runlock_fallback(rms);
		return;
	}
	atomic_interrupt_fence();
	rms_int_readers_dec(rms, pcpu);
	atomic_interrupt_fence();
	rms_int_influx_exit(rms, pcpu);
	critical_exit();
	TD_LOCKS_DEC(curthread);
}
1111 
/*
 * Argument bundle for the write-lock rendezvous.  The callbacks receive a
 * pointer to 'srcra' and recover the enclosing structure (and thus 'rms')
 * with __containerof().
 */
struct rmslock_ipi {
	struct rmslock *rms;
	struct smp_rendezvous_cpus_retry_arg srcra;
};
1116 
/*
 * Rendezvous action for rms_wlock_switch().  Unless the CPU it runs on is
 * in flux, folds that CPU's reader count into rms->readers, zeroes it and
 * reports completion for this CPU.
 */
static void
rms_action_func(void *arg)
{
	struct rmslock_ipi *rmsipi;
	struct rmslock_pcpu *pcpu;
	struct rmslock *rms;

	rmsipi = __containerof(arg, struct rmslock_ipi, srcra);
	rms = rmsipi->rms;
	pcpu = rms_int_pcpu(rms);

	/*
	 * A lock or unlock fast path was interrupted on this CPU.  Return
	 * without reporting completion; presumably the retry machinery
	 * runs this CPU again after rms_wait_func() - TODO confirm.
	 */
	if (pcpu->influx)
		return;
	if (pcpu->readers != 0) {
		atomic_add_int(&rms->readers, pcpu->readers);
		pcpu->readers = 0;
	}
	smp_rendezvous_cpus_done(arg);
}
1136 
/*
 * Wait callback for rms_wlock_switch(): spin until CPU 'cpu' has cleared
 * its influx flag.
 */
static void
rms_wait_func(void *arg, int cpu)
{
	struct rmslock_ipi *rmsipi;
	struct rmslock_pcpu *pcpu;
	struct rmslock *rms;

	rmsipi = __containerof(arg, struct rmslock_ipi, srcra);
	rms = rmsipi->rms;
	pcpu = rms_int_remote_pcpu(rms, cpu);

	while (atomic_load_int(&pcpu->influx))
		cpu_spinwait();
}
1151 
1152 #ifdef INVARIANTS
1153 static void
1154 rms_assert_no_pcpu_readers(struct rmslock *rms)
1155 {
1156 	struct rmslock_pcpu *pcpu;
1157 	int cpu;
1158 
1159 	CPU_FOREACH(cpu) {
1160 		pcpu = rms_int_remote_pcpu(rms, cpu);
1161 		if (pcpu->readers != 0) {
1162 			panic("%s: got %d readers on cpu %d\n", __func__,
1163 			    pcpu->readers, cpu);
1164 		}
1165 	}
1166 }
1167 #else
/* Without INVARIANTS the per-CPU reader counts are not checked. */
static void
rms_assert_no_pcpu_readers(struct rmslock *rms)
{
}
1172 #endif
1173 
/*
 * Run rms_action_func() on all CPUs, with rms_wait_func() as the wait
 * callback, to gather the per-CPU reader counts into rms->readers.
 * Called by the first writer only (writers == 1, readers == 0 asserted).
 */
static void
rms_wlock_switch(struct rmslock *rms)
{
	struct rmslock_ipi rmsipi;

	MPASS(rms->readers == 0);
	MPASS(rms->writers == 1);

	rmsipi.rms = rms;

	smp_rendezvous_cpus_retry(all_cpus,
	    smp_no_rendezvous_barrier,
	    rms_action_func,
	    smp_no_rendezvous_barrier,
	    rms_wait_func,
	    &rmsipi.srcra);
}
1191 
/*
 * Acquire 'rms' for writing; may sleep.  The caller must not already be
 * the write owner (asserted).
 *
 * The first writer (writers goes from 0 to 1) gathers the per-CPU reader
 * counts via rms_wlock_switch() and, if readers remain, sleeps until the
 * last one wakes it.  A later writer sleeps on &rms->owner and takes over
 * when rms_wunlock() wakes it, finding owner == RMS_TRANSIENT.
 */
void
rms_wlock(struct rmslock *rms)
{

	WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL, __func__);
	MPASS(atomic_load_ptr(&rms->owner) != curthread);

	mtx_lock(&rms->mtx);
	rms->writers++;
	if (rms->writers > 1) {
		/* Another writer is active: wait to be handed the lock. */
		msleep(&rms->owner, &rms->mtx, (PUSER - 1),
		    mtx_name(&rms->mtx), 0);
		MPASS(rms->readers == 0);
		KASSERT(rms->owner == RMS_TRANSIENT,
		    ("%s: unexpected owner value %p\n", __func__,
		    rms->owner));
		goto out_grab;
	}

	KASSERT(rms->owner == RMS_NOOWNER,
	    ("%s: unexpected owner value %p\n", __func__, rms->owner));

	rms_wlock_switch(rms);
	rms_assert_no_pcpu_readers(rms);

	/* rms_runlock_fallback() wakes us when readers reaches zero. */
	if (rms->readers > 0) {
		msleep(&rms->writers, &rms->mtx, (PUSER - 1),
		    mtx_name(&rms->mtx), 0);
	}

out_grab:
	rms->owner = curthread;
	rms_assert_no_pcpu_readers(rms);
	mtx_unlock(&rms->mtx);
	MPASS(rms->readers == 0);
	TD_LOCKS_INC(curthread);
}
1229 
/*
 * Release a write lock on 'rms'.  The caller must be the current owner
 * (asserted).  If other writers are waiting, one is woken and the owner
 * is set to RMS_TRANSIENT; otherwise all sleeping readers are woken and
 * the owner becomes RMS_NOOWNER.
 */
void
rms_wunlock(struct rmslock *rms)
{

	mtx_lock(&rms->mtx);
	KASSERT(rms->owner == curthread,
	    ("%s: unexpected owner value %p\n", __func__, rms->owner));
	MPASS(rms->writers >= 1);
	MPASS(rms->readers == 0);
	rms->writers--;
	if (rms->writers > 0) {
		/* Hand over to the next writer sleeping in rms_wlock(). */
		wakeup_one(&rms->owner);
		rms->owner = RMS_TRANSIENT;
	} else {
		/* Let readers sleeping in rms_rlock_fallback() proceed. */
		wakeup(&rms->readers);
		rms->owner = RMS_NOOWNER;
	}
	mtx_unlock(&rms->mtx);
	TD_LOCKS_DEC(curthread);
}
1250 
/*
 * Release 'rms' in whichever mode it is held: the write unlock if
 * rms_wowned() reports the caller as the write owner, the read unlock
 * otherwise.
 */
void
rms_unlock(struct rmslock *rms)
{

	if (rms_wowned(rms))
		rms_wunlock(rms);
	else
		rms_runlock(rms);
}
1260