xref: /freebsd/sys/kern/kern_rwlock.c (revision f5147e312f43a9050468de539aeafa072caa1a60)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3  *
4  * Copyright (c) 2006 John Baldwin <jhb@FreeBSD.org>
5  * All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice, this list of conditions and the following disclaimer.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in the
14  *    documentation and/or other materials provided with the distribution.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
20  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26  * SUCH DAMAGE.
27  */
28 
29 /*
30  * Machine independent bits of reader/writer lock implementation.
31  */
32 
33 #include <sys/cdefs.h>
34 __FBSDID("$FreeBSD$");
35 
36 #include "opt_ddb.h"
37 #include "opt_hwpmc_hooks.h"
38 #include "opt_no_adaptive_rwlocks.h"
39 
40 #include <sys/param.h>
41 #include <sys/kdb.h>
42 #include <sys/ktr.h>
43 #include <sys/kernel.h>
44 #include <sys/lock.h>
45 #include <sys/mutex.h>
46 #include <sys/proc.h>
47 #include <sys/rwlock.h>
48 #include <sys/sched.h>
49 #include <sys/smp.h>
50 #include <sys/sysctl.h>
51 #include <sys/systm.h>
52 #include <sys/turnstile.h>
53 
54 #include <machine/cpu.h>
55 
56 #if defined(SMP) && !defined(NO_ADAPTIVE_RWLOCKS)
57 #define	ADAPTIVE_RWLOCKS
58 #endif
59 
60 #ifdef HWPMC_HOOKS
61 #include <sys/pmckern.h>
62 PMC_SOFT_DECLARE( , , lock, failed);
63 #endif
64 
65 /*
66  * Return the rwlock address when the lock cookie address is provided.
67  * This functionality assumes that struct rwlock* has a member named rw_lock.
68  */
69 #define	rwlock2rw(c)	(__containerof(c, struct rwlock, rw_lock))
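
/*
 * Illustrative sketch: for any struct rwlock *rw, passing the cookie
 * address &rw->rw_lock back through rwlock2rw() recovers rw itself,
 * e.g. MPASS(rwlock2rw(&rw->rw_lock) == rw).
 */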
70 
71 #ifdef DDB
72 #include <ddb/ddb.h>
73 
74 static void	db_show_rwlock(const struct lock_object *lock);
75 #endif
76 static void	assert_rw(const struct lock_object *lock, int what);
77 static void	lock_rw(struct lock_object *lock, uintptr_t how);
78 #ifdef KDTRACE_HOOKS
79 static int	owner_rw(const struct lock_object *lock, struct thread **owner);
80 #endif
81 static uintptr_t unlock_rw(struct lock_object *lock);
82 
83 struct lock_class lock_class_rw = {
84 	.lc_name = "rw",
85 	.lc_flags = LC_SLEEPLOCK | LC_RECURSABLE | LC_UPGRADABLE,
86 	.lc_assert = assert_rw,
87 #ifdef DDB
88 	.lc_ddb_show = db_show_rwlock,
89 #endif
90 	.lc_lock = lock_rw,
91 	.lc_unlock = unlock_rw,
92 #ifdef KDTRACE_HOOKS
93 	.lc_owner = owner_rw,
94 #endif
95 };
96 
97 #ifdef ADAPTIVE_RWLOCKS
98 static int __read_frequently rowner_retries = 10;
99 static int __read_frequently rowner_loops = 10000;
100 static SYSCTL_NODE(_debug, OID_AUTO, rwlock, CTLFLAG_RD, NULL,
101     "rwlock debugging");
102 SYSCTL_INT(_debug_rwlock, OID_AUTO, retry, CTLFLAG_RW, &rowner_retries, 0, "");
103 SYSCTL_INT(_debug_rwlock, OID_AUTO, loops, CTLFLAG_RW, &rowner_loops, 0, "");
104 
105 static struct lock_delay_config __read_frequently rw_delay;
106 
107 SYSCTL_INT(_debug_rwlock, OID_AUTO, delay_base, CTLFLAG_RW, &rw_delay.base,
108     0, "");
109 SYSCTL_INT(_debug_rwlock, OID_AUTO, delay_max, CTLFLAG_RW, &rw_delay.max,
110     0, "");
111 
112 LOCK_DELAY_SYSINIT_DEFAULT(rw_delay);
113 #endif
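
/*
 * Illustrative sketch (assuming a kernel built with ADAPTIVE_RWLOCKS):
 * the knobs above are exported under the debug.rwlock sysctl tree, so the
 * adaptive spinning behaviour can be inspected and tuned at run time, e.g.
 *
 *	# sysctl debug.rwlock.retry=20
 *	# sysctl debug.rwlock.loops=20000
 *	# sysctl debug.rwlock.delay_base debug.rwlock.delay_max
 */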
114 
115 /*
116  * Return a pointer to the owning thread if the lock is write-locked or
117  * NULL if the lock is unlocked or read-locked.
118  */
119 
120 #define	lv_rw_wowner(v)							\
121 	((v) & RW_LOCK_READ ? NULL :					\
122 	 (struct thread *)RW_OWNER((v)))
123 
124 #define	rw_wowner(rw)	lv_rw_wowner(RW_READ_VALUE(rw))
125 
126 /*
127  * Returns true if the write owner is recursed.  Write ownership is not
128  * assured here and should be checked beforehand.
129  */
130 #define	rw_recursed(rw)		((rw)->rw_recurse != 0)
131 
132 /*
133  * Return true if curthread holds the lock.
134  */
135 #define	rw_wlocked(rw)		(rw_wowner((rw)) == curthread)
136 
137 /*
138  * Return a pointer to the owning thread for this lock that should receive
139  * any priority lent by threads that block on this lock.  Currently this
140  * is identical to rw_wowner().
141  */
142 #define	rw_owner(rw)		rw_wowner(rw)
143 
144 #ifndef INVARIANTS
145 #define	__rw_assert(c, what, file, line)
146 #endif
147 
148 void
149 assert_rw(const struct lock_object *lock, int what)
150 {
151 
152 	rw_assert((const struct rwlock *)lock, what);
153 }
154 
155 void
156 lock_rw(struct lock_object *lock, uintptr_t how)
157 {
158 	struct rwlock *rw;
159 
160 	rw = (struct rwlock *)lock;
161 	if (how)
162 		rw_rlock(rw);
163 	else
164 		rw_wlock(rw);
165 }
166 
167 uintptr_t
168 unlock_rw(struct lock_object *lock)
169 {
170 	struct rwlock *rw;
171 
172 	rw = (struct rwlock *)lock;
173 	rw_assert(rw, RA_LOCKED | LA_NOTRECURSED);
174 	if (rw->rw_lock & RW_LOCK_READ) {
175 		rw_runlock(rw);
176 		return (1);
177 	} else {
178 		rw_wunlock(rw);
179 		return (0);
180 	}
181 }
182 
183 #ifdef KDTRACE_HOOKS
184 int
185 owner_rw(const struct lock_object *lock, struct thread **owner)
186 {
187 	const struct rwlock *rw = (const struct rwlock *)lock;
188 	uintptr_t x = rw->rw_lock;
189 
190 	*owner = rw_wowner(rw);
191 	return ((x & RW_LOCK_READ) != 0 ?  (RW_READERS(x) != 0) :
192 	    (*owner != NULL));
193 }
194 #endif
195 
196 void
197 _rw_init_flags(volatile uintptr_t *c, const char *name, int opts)
198 {
199 	struct rwlock *rw;
200 	int flags;
201 
202 	rw = rwlock2rw(c);
203 
204 	MPASS((opts & ~(RW_DUPOK | RW_NOPROFILE | RW_NOWITNESS | RW_QUIET |
205 	    RW_RECURSE | RW_NEW)) == 0);
206 	ASSERT_ATOMIC_LOAD_PTR(rw->rw_lock,
207 	    ("%s: rw_lock not aligned for %s: %p", __func__, name,
208 	    &rw->rw_lock));
209 
210 	flags = LO_UPGRADABLE;
211 	if (opts & RW_DUPOK)
212 		flags |= LO_DUPOK;
213 	if (opts & RW_NOPROFILE)
214 		flags |= LO_NOPROFILE;
215 	if (!(opts & RW_NOWITNESS))
216 		flags |= LO_WITNESS;
217 	if (opts & RW_RECURSE)
218 		flags |= LO_RECURSABLE;
219 	if (opts & RW_QUIET)
220 		flags |= LO_QUIET;
221 	if (opts & RW_NEW)
222 		flags |= LO_NEW;
223 
224 	lock_init(&rw->lock_object, &lock_class_rw, name, NULL, flags);
225 	rw->rw_lock = RW_UNLOCKED;
226 	rw->rw_recurse = 0;
227 }
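
/*
 * A minimal usage sketch of the rwlock(9) wrappers that reach this
 * initialization path (the "foo" names are purely illustrative):
 *
 *	static struct rwlock foo_lock;
 *
 *	rw_init_flags(&foo_lock, "foo lock", RW_RECURSE);
 *	rw_rlock(&foo_lock);
 *	... read shared state ...
 *	rw_runlock(&foo_lock);
 *	rw_wlock(&foo_lock);
 *	... modify shared state ...
 *	rw_wunlock(&foo_lock);
 *	rw_destroy(&foo_lock);
 */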
228 
229 void
230 _rw_destroy(volatile uintptr_t *c)
231 {
232 	struct rwlock *rw;
233 
234 	rw = rwlock2rw(c);
235 
236 	KASSERT(rw->rw_lock == RW_UNLOCKED, ("rw lock %p not unlocked", rw));
237 	KASSERT(rw->rw_recurse == 0, ("rw lock %p still recursed", rw));
238 	rw->rw_lock = RW_DESTROYED;
239 	lock_destroy(&rw->lock_object);
240 }
241 
242 void
243 rw_sysinit(void *arg)
244 {
245 	struct rw_args *args;
246 
247 	args = arg;
248 	rw_init_flags((struct rwlock *)args->ra_rw, args->ra_desc,
249 	    args->ra_flags);
250 }
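
/*
 * A hedged sketch of how rw_sysinit() is normally reached: the
 * RW_SYSINIT() helper in <sys/rwlock.h> wraps a struct rw_args in a
 * SYSINIT(9) so the lock is initialized during boot (names illustrative):
 *
 *	static struct rwlock foo_lock;
 *	RW_SYSINIT(foo_lock_init, &foo_lock, "foo lock");
 */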
251 
252 int
253 _rw_wowned(const volatile uintptr_t *c)
254 {
255 
256 	return (rw_wowner(rwlock2rw(c)) == curthread);
257 }
258 
259 void
260 _rw_wlock_cookie(volatile uintptr_t *c, const char *file, int line)
261 {
262 	struct rwlock *rw;
263 	uintptr_t tid, v;
264 
265 	rw = rwlock2rw(c);
266 
267 	KASSERT(kdb_active != 0 || SCHEDULER_STOPPED() ||
268 	    !TD_IS_IDLETHREAD(curthread),
269 	    ("rw_wlock() by idle thread %p on rwlock %s @ %s:%d",
270 	    curthread, rw->lock_object.lo_name, file, line));
271 	KASSERT(rw->rw_lock != RW_DESTROYED,
272 	    ("rw_wlock() of destroyed rwlock @ %s:%d", file, line));
273 	WITNESS_CHECKORDER(&rw->lock_object, LOP_NEWORDER | LOP_EXCLUSIVE, file,
274 	    line, NULL);
275 	tid = (uintptr_t)curthread;
276 	v = RW_UNLOCKED;
277 	if (!_rw_write_lock_fetch(rw, &v, tid))
278 		_rw_wlock_hard(rw, v, file, line);
279 	else
280 		LOCKSTAT_PROFILE_OBTAIN_RWLOCK_SUCCESS(rw__acquire, rw,
281 		    0, 0, file, line, LOCKSTAT_WRITER);
282 
283 	LOCK_LOG_LOCK("WLOCK", &rw->lock_object, 0, rw->rw_recurse, file, line);
284 	WITNESS_LOCK(&rw->lock_object, LOP_EXCLUSIVE, file, line);
285 	TD_LOCKS_INC(curthread);
286 }
287 
288 int
289 __rw_try_wlock_int(struct rwlock *rw LOCK_FILE_LINE_ARG_DEF)
290 {
291 	struct thread *td;
292 	uintptr_t tid, v;
293 	int rval;
294 	bool recursed;
295 
296 	td = curthread;
297 	tid = (uintptr_t)td;
298 	if (SCHEDULER_STOPPED_TD(td))
299 		return (1);
300 
301 	KASSERT(kdb_active != 0 || !TD_IS_IDLETHREAD(td),
302 	    ("rw_try_wlock() by idle thread %p on rwlock %s @ %s:%d",
303 	    curthread, rw->lock_object.lo_name, file, line));
304 	KASSERT(rw->rw_lock != RW_DESTROYED,
305 	    ("rw_try_wlock() of destroyed rwlock @ %s:%d", file, line));
306 
307 	rval = 1;
308 	recursed = false;
309 	v = RW_UNLOCKED;
310 	for (;;) {
311 		if (atomic_fcmpset_acq_ptr(&rw->rw_lock, &v, tid))
312 			break;
313 		if (v == RW_UNLOCKED)
314 			continue;
315 		if (v == tid && (rw->lock_object.lo_flags & LO_RECURSABLE)) {
316 			rw->rw_recurse++;
317 			atomic_set_ptr(&rw->rw_lock, RW_LOCK_WRITER_RECURSED);
318 			break;
319 		}
320 		rval = 0;
321 		break;
322 	}
323 
324 	LOCK_LOG_TRY("WLOCK", &rw->lock_object, 0, rval, file, line);
325 	if (rval) {
326 		WITNESS_LOCK(&rw->lock_object, LOP_EXCLUSIVE | LOP_TRYLOCK,
327 		    file, line);
328 		if (!recursed)
329 			LOCKSTAT_PROFILE_OBTAIN_RWLOCK_SUCCESS(rw__acquire,
330 			    rw, 0, 0, file, line, LOCKSTAT_WRITER);
331 		TD_LOCKS_INC(curthread);
332 	}
333 	return (rval);
334 }
335 
336 int
337 __rw_try_wlock(volatile uintptr_t *c, const char *file, int line)
338 {
339 	struct rwlock *rw;
340 
341 	rw = rwlock2rw(c);
342 	return (__rw_try_wlock_int(rw LOCK_FILE_LINE_ARG));
343 }
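
/*
 * A hedged usage sketch for the try-lock path above ("foo_lock" is
 * illustrative): the caller must cope with failure, and recursion is only
 * permitted if the lock was created with RW_RECURSE.
 *
 *	if (rw_try_wlock(&foo_lock)) {
 *		... update shared state ...
 *		rw_wunlock(&foo_lock);
 *	} else {
 *		... fall back, e.g. defer the work ...
 *	}
 */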
344 
345 void
346 _rw_wunlock_cookie(volatile uintptr_t *c, const char *file, int line)
347 {
348 	struct rwlock *rw;
349 
350 	rw = rwlock2rw(c);
351 
352 	KASSERT(rw->rw_lock != RW_DESTROYED,
353 	    ("rw_wunlock() of destroyed rwlock @ %s:%d", file, line));
354 	__rw_assert(c, RA_WLOCKED, file, line);
355 	WITNESS_UNLOCK(&rw->lock_object, LOP_EXCLUSIVE, file, line);
356 	LOCK_LOG_LOCK("WUNLOCK", &rw->lock_object, 0, rw->rw_recurse, file,
357 	    line);
358 
359 #ifdef LOCK_PROFILING
360 	_rw_wunlock_hard(rw, (uintptr_t)curthread, file, line);
361 #else
362 	__rw_wunlock(rw, curthread, file, line);
363 #endif
364 
365 	TD_LOCKS_DEC(curthread);
366 }
367 
368 /*
369  * Determines whether a new reader can acquire a lock.  Succeeds if the
370  * reader already owns a read lock and the lock is locked for read, so
371  * that reader recursion cannot deadlock.  Also succeeds if the lock is
372  * unlocked or read-locked and has no writer waiters or spinners.  Failing
373  * otherwise gives writers priority over readers.
374  */
375 static bool __always_inline
376 __rw_can_read(struct thread *td, uintptr_t v, bool fp)
377 {
378 
379 	if ((v & (RW_LOCK_READ | RW_LOCK_WRITE_WAITERS | RW_LOCK_WRITE_SPINNER))
380 	    == RW_LOCK_READ)
381 		return (true);
382 	if (!fp && td->td_rw_rlocks && (v & RW_LOCK_READ))
383 		return (true);
384 	return (false);
385 }
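
/*
 * Worked examples of the test above, derived from the checks themselves:
 * an unlocked lock is encoded as a read lock with no waiters, so a new
 * reader succeeds immediately; a read-locked lock with write waiters or a
 * write spinner fails for a first-time reader (writers get priority), but
 * still succeeds for a thread that already holds read locks
 * (td_rw_rlocks != 0), avoiding self-deadlock on read recursion.
 */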
386 
387 static bool __always_inline
388 __rw_rlock_try(struct rwlock *rw, struct thread *td, uintptr_t *vp, bool fp
389     LOCK_FILE_LINE_ARG_DEF)
390 {
391 
392 	/*
393 	 * Handle the easy case.  If no other thread has a write
394 	 * lock, then try to bump up the count of read locks.  Note
395 	 * that we have to preserve the current state of the
396 	 * RW_LOCK_WRITE_WAITERS flag.  If we fail to acquire a
397 	 * read lock, then rw_lock must have changed, so restart
398 	 * the loop.  Note that this handles the case of a
399 	 * completely unlocked rwlock since such a lock is encoded
400 	 * as a read lock with no waiters.
401 	 */
402 	while (__rw_can_read(td, *vp, fp)) {
403 		if (atomic_fcmpset_acq_ptr(&rw->rw_lock, vp,
404 			*vp + RW_ONE_READER)) {
405 			if (LOCK_LOG_TEST(&rw->lock_object, 0))
406 				CTR4(KTR_LOCK,
407 				    "%s: %p succeed %p -> %p", __func__,
408 				    rw, (void *)*vp,
409 				    (void *)(*vp + RW_ONE_READER));
410 			td->td_rw_rlocks++;
411 			return (true);
412 		}
413 	}
414 	return (false);
415 }
416 
417 static void __noinline
418 __rw_rlock_hard(struct rwlock *rw, struct thread *td, uintptr_t v
419     LOCK_FILE_LINE_ARG_DEF)
420 {
421 	struct turnstile *ts;
422 	struct thread *owner;
423 #ifdef ADAPTIVE_RWLOCKS
424 	int spintries = 0;
425 	int i, n;
426 #endif
427 #ifdef LOCK_PROFILING
428 	uint64_t waittime = 0;
429 	int contested = 0;
430 #endif
431 #if defined(ADAPTIVE_RWLOCKS) || defined(KDTRACE_HOOKS)
432 	struct lock_delay_arg lda;
433 #endif
434 #ifdef KDTRACE_HOOKS
435 	u_int sleep_cnt = 0;
436 	int64_t sleep_time = 0;
437 	int64_t all_time = 0;
438 #endif
439 #if defined(KDTRACE_HOOKS) || defined(LOCK_PROFILING)
440 	uintptr_t state;
441 	int doing_lockprof = 0;
442 #endif
443 
444 #ifdef KDTRACE_HOOKS
445 	if (LOCKSTAT_PROFILE_ENABLED(rw__acquire)) {
446 		if (__rw_rlock_try(rw, td, &v, false LOCK_FILE_LINE_ARG))
447 			goto out_lockstat;
448 		doing_lockprof = 1;
449 		all_time -= lockstat_nsecs(&rw->lock_object);
450 		state = v;
451 	}
452 #endif
453 #ifdef LOCK_PROFILING
454 	doing_lockprof = 1;
455 	state = v;
456 #endif
457 
458 	if (SCHEDULER_STOPPED())
459 		return;
460 
461 #if defined(ADAPTIVE_RWLOCKS)
462 	lock_delay_arg_init(&lda, &rw_delay);
463 #elif defined(KDTRACE_HOOKS)
464 	lock_delay_arg_init(&lda, NULL);
465 #endif
466 
467 #ifdef HWPMC_HOOKS
468 	PMC_SOFT_CALL( , , lock, failed);
469 #endif
470 	lock_profile_obtain_lock_failed(&rw->lock_object,
471 	    &contested, &waittime);
472 
473 	for (;;) {
474 		if (__rw_rlock_try(rw, td, &v, false LOCK_FILE_LINE_ARG))
475 			break;
476 #ifdef KDTRACE_HOOKS
477 		lda.spin_cnt++;
478 #endif
479 
480 #ifdef ADAPTIVE_RWLOCKS
481 		/*
482 		 * If the owner is running on another CPU, spin until
483 		 * the owner stops running or the state of the lock
484 		 * changes.
485 		 */
486 		if ((v & RW_LOCK_READ) == 0) {
487 			owner = (struct thread *)RW_OWNER(v);
488 			if (TD_IS_RUNNING(owner)) {
489 				if (LOCK_LOG_TEST(&rw->lock_object, 0))
490 					CTR3(KTR_LOCK,
491 					    "%s: spinning on %p held by %p",
492 					    __func__, rw, owner);
493 				KTR_STATE1(KTR_SCHED, "thread",
494 				    sched_tdname(curthread), "spinning",
495 				    "lockname:\"%s\"", rw->lock_object.lo_name);
496 				do {
497 					lock_delay(&lda);
498 					v = RW_READ_VALUE(rw);
499 					owner = lv_rw_wowner(v);
500 				} while (owner != NULL && TD_IS_RUNNING(owner));
501 				KTR_STATE0(KTR_SCHED, "thread",
502 				    sched_tdname(curthread), "running");
503 				continue;
504 			}
505 		} else if (spintries < rowner_retries) {
506 			spintries++;
507 			KTR_STATE1(KTR_SCHED, "thread", sched_tdname(curthread),
508 			    "spinning", "lockname:\"%s\"",
509 			    rw->lock_object.lo_name);
510 			for (i = 0; i < rowner_loops; i += n) {
511 				n = RW_READERS(v);
512 				lock_delay_spin(n);
513 				v = RW_READ_VALUE(rw);
514 				if ((v & RW_LOCK_READ) == 0 || __rw_can_read(td, v, false))
515 					break;
516 			}
517 #ifdef KDTRACE_HOOKS
518 			lda.spin_cnt += rowner_loops - i;
519 #endif
520 			KTR_STATE0(KTR_SCHED, "thread", sched_tdname(curthread),
521 			    "running");
522 			if (i < rowner_loops)
523 				continue;
524 		}
525 #endif
526 
527 		/*
528 		 * Okay, now it's the hard case.  Some other thread already
529 		 * has a write lock or there are write waiters present, so
530 		 * acquire the turnstile lock and begin the process
531 		 * of blocking.
532 		 */
533 		ts = turnstile_trywait(&rw->lock_object);
534 
535 		/*
536 		 * The lock might have been released while we spun, so
537 		 * recheck its state and restart the loop if needed.
538 		 */
539 		v = RW_READ_VALUE(rw);
540 retry_ts:
541 		if (__rw_can_read(td, v, false)) {
542 			turnstile_cancel(ts);
543 			continue;
544 		}
545 
546 		owner = lv_rw_wowner(v);
547 
548 #ifdef ADAPTIVE_RWLOCKS
549 		/*
550 		 * The current lock owner might have started executing
551 		 * on another CPU (or the lock could have changed
552 		 * owners) while we were waiting on the turnstile
553 		 * chain lock.  If so, drop the turnstile lock and try
554 		 * again.
555 		 */
556 		if (owner != NULL) {
557 			if (TD_IS_RUNNING(owner)) {
558 				turnstile_cancel(ts);
559 				continue;
560 			}
561 		}
562 #endif
563 
564 		/*
565 		 * The lock is held in write mode or it already has waiters.
566 		 */
567 		MPASS(!__rw_can_read(td, v, false));
568 
569 		/*
570 		 * If the RW_LOCK_READ_WAITERS flag is already set, then
571 		 * we can go ahead and block.  If it is not set then try
572 		 * to set it.  If we fail to set it drop the turnstile
573 		 * lock and restart the loop.
574 		 */
575 		if (!(v & RW_LOCK_READ_WAITERS)) {
576 			if (!atomic_fcmpset_ptr(&rw->rw_lock, &v,
577 			    v | RW_LOCK_READ_WAITERS))
578 				goto retry_ts;
579 			if (LOCK_LOG_TEST(&rw->lock_object, 0))
580 				CTR2(KTR_LOCK, "%s: %p set read waiters flag",
581 				    __func__, rw);
582 		}
583 
584 		/*
585 		 * We were unable to acquire the lock and the read waiters
586 		 * flag is set, so we must block on the turnstile.
587 		 */
588 		if (LOCK_LOG_TEST(&rw->lock_object, 0))
589 			CTR2(KTR_LOCK, "%s: %p blocking on turnstile", __func__,
590 			    rw);
591 #ifdef KDTRACE_HOOKS
592 		sleep_time -= lockstat_nsecs(&rw->lock_object);
593 #endif
594 		MPASS(owner == rw_owner(rw));
595 		turnstile_wait(ts, owner, TS_SHARED_QUEUE);
596 #ifdef KDTRACE_HOOKS
597 		sleep_time += lockstat_nsecs(&rw->lock_object);
598 		sleep_cnt++;
599 #endif
600 		if (LOCK_LOG_TEST(&rw->lock_object, 0))
601 			CTR2(KTR_LOCK, "%s: %p resuming from turnstile",
602 			    __func__, rw);
603 		v = RW_READ_VALUE(rw);
604 	}
605 #if defined(KDTRACE_HOOKS) || defined(LOCK_PROFILING)
606 	if (__predict_true(!doing_lockprof))
607 		return;
608 #endif
609 #ifdef KDTRACE_HOOKS
610 	all_time += lockstat_nsecs(&rw->lock_object);
611 	if (sleep_time)
612 		LOCKSTAT_RECORD4(rw__block, rw, sleep_time,
613 		    LOCKSTAT_READER, (state & RW_LOCK_READ) == 0,
614 		    (state & RW_LOCK_READ) == 0 ? 0 : RW_READERS(state));
615 
616 	/* Record only the loops spinning and not sleeping. */
617 	if (lda.spin_cnt > sleep_cnt)
618 		LOCKSTAT_RECORD4(rw__spin, rw, all_time - sleep_time,
619 		    LOCKSTAT_READER, (state & RW_LOCK_READ) == 0,
620 		    (state & RW_LOCK_READ) == 0 ? 0 : RW_READERS(state));
621 out_lockstat:
622 #endif
623 	/*
624 	 * TODO: acquire "owner of record" here.  Here be turnstile dragons
625 	 * however.  turnstiles don't like owners changing between calls to
626 	 * turnstile_wait() currently.
627 	 */
628 	LOCKSTAT_PROFILE_OBTAIN_RWLOCK_SUCCESS(rw__acquire, rw, contested,
629 	    waittime, file, line, LOCKSTAT_READER);
630 }
631 
632 void
633 __rw_rlock_int(struct rwlock *rw LOCK_FILE_LINE_ARG_DEF)
634 {
635 	struct thread *td;
636 	uintptr_t v;
637 
638 	td = curthread;
639 
640 	KASSERT(kdb_active != 0 || SCHEDULER_STOPPED_TD(td) ||
641 	    !TD_IS_IDLETHREAD(td),
642 	    ("rw_rlock() by idle thread %p on rwlock %s @ %s:%d",
643 	    td, rw->lock_object.lo_name, file, line));
644 	KASSERT(rw->rw_lock != RW_DESTROYED,
645 	    ("rw_rlock() of destroyed rwlock @ %s:%d", file, line));
646 	KASSERT(rw_wowner(rw) != td,
647 	    ("rw_rlock: wlock already held for %s @ %s:%d",
648 	    rw->lock_object.lo_name, file, line));
649 	WITNESS_CHECKORDER(&rw->lock_object, LOP_NEWORDER, file, line, NULL);
650 
651 	v = RW_READ_VALUE(rw);
652 	if (__predict_false(LOCKSTAT_PROFILE_ENABLED(rw__acquire) ||
653 	    !__rw_rlock_try(rw, td, &v, true LOCK_FILE_LINE_ARG)))
654 		__rw_rlock_hard(rw, td, v LOCK_FILE_LINE_ARG);
655 	else
656 		lock_profile_obtain_lock_success(&rw->lock_object, 0, 0,
657 		    file, line);
658 
659 	LOCK_LOG_LOCK("RLOCK", &rw->lock_object, 0, 0, file, line);
660 	WITNESS_LOCK(&rw->lock_object, 0, file, line);
661 	TD_LOCKS_INC(curthread);
662 }
663 
664 void
665 __rw_rlock(volatile uintptr_t *c, const char *file, int line)
666 {
667 	struct rwlock *rw;
668 
669 	rw = rwlock2rw(c);
670 	__rw_rlock_int(rw LOCK_FILE_LINE_ARG);
671 }
672 
673 int
674 __rw_try_rlock_int(struct rwlock *rw LOCK_FILE_LINE_ARG_DEF)
675 {
676 	uintptr_t x;
677 
678 	if (SCHEDULER_STOPPED())
679 		return (1);
680 
681 	KASSERT(kdb_active != 0 || !TD_IS_IDLETHREAD(curthread),
682 	    ("rw_try_rlock() by idle thread %p on rwlock %s @ %s:%d",
683 	    curthread, rw->lock_object.lo_name, file, line));
684 
685 	x = rw->rw_lock;
686 	for (;;) {
687 		KASSERT(rw->rw_lock != RW_DESTROYED,
688 		    ("rw_try_rlock() of destroyed rwlock @ %s:%d", file, line));
689 		if (!(x & RW_LOCK_READ))
690 			break;
691 		if (atomic_fcmpset_acq_ptr(&rw->rw_lock, &x, x + RW_ONE_READER)) {
692 			LOCK_LOG_TRY("RLOCK", &rw->lock_object, 0, 1, file,
693 			    line);
694 			WITNESS_LOCK(&rw->lock_object, LOP_TRYLOCK, file, line);
695 			LOCKSTAT_PROFILE_OBTAIN_RWLOCK_SUCCESS(rw__acquire,
696 			    rw, 0, 0, file, line, LOCKSTAT_READER);
697 			TD_LOCKS_INC(curthread);
698 			curthread->td_rw_rlocks++;
699 			return (1);
700 		}
701 	}
702 
703 	LOCK_LOG_TRY("RLOCK", &rw->lock_object, 0, 0, file, line);
704 	return (0);
705 }
706 
707 int
708 __rw_try_rlock(volatile uintptr_t *c, const char *file, int line)
709 {
710 	struct rwlock *rw;
711 
712 	rw = rwlock2rw(c);
713 	return (__rw_try_rlock_int(rw LOCK_FILE_LINE_ARG));
714 }
715 
716 static bool __always_inline
717 __rw_runlock_try(struct rwlock *rw, struct thread *td, uintptr_t *vp)
718 {
719 
720 	for (;;) {
721 		/*
722 		 * See if there is more than one read lock held.  If so,
723 		 * just drop one and return.
724 		 */
725 		if (RW_READERS(*vp) > 1) {
726 			if (atomic_fcmpset_rel_ptr(&rw->rw_lock, vp,
727 			    *vp - RW_ONE_READER)) {
728 				if (LOCK_LOG_TEST(&rw->lock_object, 0))
729 					CTR4(KTR_LOCK,
730 					    "%s: %p succeeded %p -> %p",
731 					    __func__, rw, (void *)*vp,
732 					    (void *)(*vp - RW_ONE_READER));
733 				td->td_rw_rlocks--;
734 				return (true);
735 			}
736 			continue;
737 		}
738 		/*
739 		 * If there aren't any waiters for a write lock, then try
740 		 * to drop it quickly.
741 		 */
742 		if (!(*vp & RW_LOCK_WAITERS)) {
743 			MPASS((*vp & ~RW_LOCK_WRITE_SPINNER) ==
744 			    RW_READERS_LOCK(1));
745 			if (atomic_fcmpset_rel_ptr(&rw->rw_lock, vp,
746 			    RW_UNLOCKED)) {
747 				if (LOCK_LOG_TEST(&rw->lock_object, 0))
748 					CTR2(KTR_LOCK, "%s: %p last succeeded",
749 					    __func__, rw);
750 				td->td_rw_rlocks--;
751 				return (true);
752 			}
753 			continue;
754 		}
755 		break;
756 	}
757 	return (false);
758 }
759 
760 static void __noinline
761 __rw_runlock_hard(struct rwlock *rw, struct thread *td, uintptr_t v
762     LOCK_FILE_LINE_ARG_DEF)
763 {
764 	struct turnstile *ts;
765 	uintptr_t setv, queue;
766 
767 	if (SCHEDULER_STOPPED())
768 		return;
769 
770 	if (__rw_runlock_try(rw, td, &v))
771 		goto out_lockstat;
772 
773 	/*
774 	 * Ok, we know we have waiters and we think we are the
775 	 * last reader, so grab the turnstile lock.
776 	 */
777 	turnstile_chain_lock(&rw->lock_object);
778 	v = RW_READ_VALUE(rw);
779 	for (;;) {
780 		if (__rw_runlock_try(rw, td, &v))
781 			break;
782 
783 		v &= (RW_LOCK_WAITERS | RW_LOCK_WRITE_SPINNER);
784 		MPASS(v & RW_LOCK_WAITERS);
785 
786 		/*
787 		 * Try to drop our lock leaving the lock in an unlocked
788 		 * state.
789 		 *
790 		 * If you wanted to do explicit lock handoff you'd have to
791 		 * do it here.  You'd also want to use turnstile_signal()
792 		 * and you'd have to handle the race where a higher
793 		 * priority thread blocks on the write lock before the
794 		 * thread you wakeup actually runs and have the new thread
795 		 * "steal" the lock.  For now it's a lot simpler to just
796 		 * wakeup all of the waiters.
797 		 *
798 		 * As above, if we fail, then another thread might have
799 		 * acquired a read lock, so drop the turnstile lock and
800 		 * restart.
801 		 */
802 		setv = RW_UNLOCKED;
803 		queue = TS_SHARED_QUEUE;
804 		if (v & RW_LOCK_WRITE_WAITERS) {
805 			queue = TS_EXCLUSIVE_QUEUE;
806 			setv |= (v & RW_LOCK_READ_WAITERS);
807 		}
808 		v |= RW_READERS_LOCK(1);
809 		if (!atomic_fcmpset_rel_ptr(&rw->rw_lock, &v, setv))
810 			continue;
811 		if (LOCK_LOG_TEST(&rw->lock_object, 0))
812 			CTR2(KTR_LOCK, "%s: %p last succeeded with waiters",
813 			    __func__, rw);
814 
815 		/*
816 		 * Ok.  The lock is released and all that's left is to
817 		 * wake up the waiters.  Note that the lock might not be
818 		 * free anymore, but in that case the writers will just
819 		 * block again if they run before the new lock holder(s)
820 		 * release the lock.
821 		 */
822 		ts = turnstile_lookup(&rw->lock_object);
823 		MPASS(ts != NULL);
824 		turnstile_broadcast(ts, queue);
825 		turnstile_unpend(ts, TS_SHARED_LOCK);
826 		td->td_rw_rlocks--;
827 		break;
828 	}
829 	turnstile_chain_unlock(&rw->lock_object);
830 out_lockstat:
831 	LOCKSTAT_PROFILE_RELEASE_RWLOCK(rw__release, rw, LOCKSTAT_READER);
832 }
833 
834 void
835 _rw_runlock_cookie_int(struct rwlock *rw LOCK_FILE_LINE_ARG_DEF)
836 {
837 	struct thread *td;
838 	uintptr_t v;
839 
840 	KASSERT(rw->rw_lock != RW_DESTROYED,
841 	    ("rw_runlock() of destroyed rwlock @ %s:%d", file, line));
842 	__rw_assert(&rw->rw_lock, RA_RLOCKED, file, line);
843 	WITNESS_UNLOCK(&rw->lock_object, 0, file, line);
844 	LOCK_LOG_LOCK("RUNLOCK", &rw->lock_object, 0, 0, file, line);
845 
846 	td = curthread;
847 	v = RW_READ_VALUE(rw);
848 
849 	if (__predict_false(LOCKSTAT_PROFILE_ENABLED(rw__release) ||
850 	    !__rw_runlock_try(rw, td, &v)))
851 		__rw_runlock_hard(rw, td, v LOCK_FILE_LINE_ARG);
852 	else
853 		lock_profile_release_lock(&rw->lock_object);
854 
855 	TD_LOCKS_DEC(curthread);
856 }
857 
858 void
859 _rw_runlock_cookie(volatile uintptr_t *c, const char *file, int line)
860 {
861 	struct rwlock *rw;
862 
863 	rw = rwlock2rw(c);
864 	_rw_runlock_cookie_int(rw LOCK_FILE_LINE_ARG);
865 }
866 
867 /*
868  * This function is called when we are unable to obtain a write lock on the
869  * first try.  This means that at least one other thread holds either a
870  * read or write lock.
871  */
872 void
873 __rw_wlock_hard(volatile uintptr_t *c, uintptr_t v LOCK_FILE_LINE_ARG_DEF)
874 {
875 	uintptr_t tid;
876 	struct rwlock *rw;
877 	struct turnstile *ts;
878 	struct thread *owner;
879 #ifdef ADAPTIVE_RWLOCKS
880 	int spintries = 0;
881 	int i, n;
882 	enum { READERS, WRITER } sleep_reason;
883 #endif
884 	uintptr_t x;
885 #ifdef LOCK_PROFILING
886 	uint64_t waittime = 0;
887 	int contested = 0;
888 #endif
889 #if defined(ADAPTIVE_RWLOCKS) || defined(KDTRACE_HOOKS)
890 	struct lock_delay_arg lda;
891 #endif
892 #ifdef KDTRACE_HOOKS
893 	u_int sleep_cnt = 0;
894 	int64_t sleep_time = 0;
895 	int64_t all_time = 0;
896 #endif
897 #if defined(KDTRACE_HOOKS) || defined(LOCK_PROFILING)
898 	uintptr_t state;
899 	int doing_lockprof = 0;
900 #endif
901 
902 	tid = (uintptr_t)curthread;
903 	rw = rwlock2rw(c);
904 
905 #ifdef KDTRACE_HOOKS
906 	if (LOCKSTAT_PROFILE_ENABLED(rw__acquire)) {
907 		while (v == RW_UNLOCKED) {
908 			if (_rw_write_lock_fetch(rw, &v, tid))
909 				goto out_lockstat;
910 		}
911 		doing_lockprof = 1;
912 		all_time -= lockstat_nsecs(&rw->lock_object);
913 		state = v;
914 	}
915 #endif
916 #ifdef LOCK_PROFILING
917 	doing_lockprof = 1;
918 	state = v;
919 #endif
920 
921 	if (SCHEDULER_STOPPED())
922 		return;
923 
924 #if defined(ADAPTIVE_RWLOCKS)
925 	lock_delay_arg_init(&lda, &rw_delay);
926 #elif defined(KDTRACE_HOOKS)
927 	lock_delay_arg_init(&lda, NULL);
928 #endif
929 	if (__predict_false(v == RW_UNLOCKED))
930 		v = RW_READ_VALUE(rw);
931 
932 	if (__predict_false(lv_rw_wowner(v) == (struct thread *)tid)) {
933 		KASSERT(rw->lock_object.lo_flags & LO_RECURSABLE,
934 		    ("%s: recursing but non-recursive rw %s @ %s:%d\n",
935 		    __func__, rw->lock_object.lo_name, file, line));
936 		rw->rw_recurse++;
937 		atomic_set_ptr(&rw->rw_lock, RW_LOCK_WRITER_RECURSED);
938 		if (LOCK_LOG_TEST(&rw->lock_object, 0))
939 			CTR2(KTR_LOCK, "%s: %p recursing", __func__, rw);
940 		return;
941 	}
942 
943 	if (LOCK_LOG_TEST(&rw->lock_object, 0))
944 		CTR5(KTR_LOCK, "%s: %s contested (lock=%p) at %s:%d", __func__,
945 		    rw->lock_object.lo_name, (void *)rw->rw_lock, file, line);
946 
947 #ifdef HWPMC_HOOKS
948 	PMC_SOFT_CALL( , , lock, failed);
949 #endif
950 	lock_profile_obtain_lock_failed(&rw->lock_object,
951 	    &contested, &waittime);
952 
953 	for (;;) {
954 		if (v == RW_UNLOCKED) {
955 			if (_rw_write_lock_fetch(rw, &v, tid))
956 				break;
957 			continue;
958 		}
959 #ifdef KDTRACE_HOOKS
960 		lda.spin_cnt++;
961 #endif
962 
963 #ifdef ADAPTIVE_RWLOCKS
964 		/*
965 		 * If the lock is write locked and the owner is
966 		 * running on another CPU, spin until the owner stops
967 		 * running or the state of the lock changes.
968 		 */
969 		if (!(v & RW_LOCK_READ)) {
970 			sleep_reason = WRITER;
971 			owner = lv_rw_wowner(v);
972 			if (!TD_IS_RUNNING(owner))
973 				goto ts;
974 			if (LOCK_LOG_TEST(&rw->lock_object, 0))
975 				CTR3(KTR_LOCK, "%s: spinning on %p held by %p",
976 				    __func__, rw, owner);
977 			KTR_STATE1(KTR_SCHED, "thread", sched_tdname(curthread),
978 			    "spinning", "lockname:\"%s\"",
979 			    rw->lock_object.lo_name);
980 			do {
981 				lock_delay(&lda);
982 				v = RW_READ_VALUE(rw);
983 				owner = lv_rw_wowner(v);
984 			} while (owner != NULL && TD_IS_RUNNING(owner));
985 			KTR_STATE0(KTR_SCHED, "thread", sched_tdname(curthread),
986 			    "running");
987 			continue;
988 		} else if (RW_READERS(v) > 0) {
989 			sleep_reason = READERS;
990 			if (spintries == rowner_retries)
991 				goto ts;
992 			if (!(v & RW_LOCK_WRITE_SPINNER)) {
993 				if (!atomic_fcmpset_ptr(&rw->rw_lock, &v,
994 				    v | RW_LOCK_WRITE_SPINNER)) {
995 					continue;
996 				}
997 			}
998 			spintries++;
999 			KTR_STATE1(KTR_SCHED, "thread", sched_tdname(curthread),
1000 			    "spinning", "lockname:\"%s\"",
1001 			    rw->lock_object.lo_name);
1002 			for (i = 0; i < rowner_loops; i += n) {
1003 				n = RW_READERS(v);
1004 				lock_delay_spin(n);
1005 				v = RW_READ_VALUE(rw);
1006 				if ((v & RW_LOCK_WRITE_SPINNER) == 0)
1007 					break;
1008 			}
1009 #ifdef KDTRACE_HOOKS
1010 			lda.spin_cnt += i;
1011 #endif
1012 			KTR_STATE0(KTR_SCHED, "thread", sched_tdname(curthread),
1013 			    "running");
1014 			if (i < rowner_loops)
1015 				continue;
1016 		}
1017 ts:
1018 #endif
1019 		ts = turnstile_trywait(&rw->lock_object);
1020 		v = RW_READ_VALUE(rw);
1021 retry_ts:
1022 		owner = lv_rw_wowner(v);
1023 
1024 #ifdef ADAPTIVE_RWLOCKS
1025 		/*
1026 		 * The current lock owner might have started executing
1027 		 * on another CPU (or the lock could have changed
1028 		 * owners) while we were waiting on the turnstile
1029 		 * chain lock.  If so, drop the turnstile lock and try
1030 		 * again.
1031 		 */
1032 		if (owner != NULL) {
1033 			if (TD_IS_RUNNING(owner)) {
1034 				turnstile_cancel(ts);
1035 				continue;
1036 			}
1037 		} else if (RW_READERS(v) > 0 && sleep_reason == WRITER) {
1038 			turnstile_cancel(ts);
1039 			continue;
1040 		}
1041 #endif
1042 		/*
1043 		 * Check the waiters flags on this rwlock.
1044 		 * If the lock was released without leaving any pending
1045 		 * waiters queue, simply try to acquire it.
1046 		 * If a pending waiters queue is present, claim lock
1047 		 * ownership and preserve the pending queue.
1048 		 */
1049 		x = v & (RW_LOCK_WAITERS | RW_LOCK_WRITE_SPINNER);
1050 		if ((v & ~x) == RW_UNLOCKED) {
1051 			x &= ~RW_LOCK_WRITE_SPINNER;
1052 			if (atomic_fcmpset_acq_ptr(&rw->rw_lock, &v, tid | x)) {
1053 				if (x)
1054 					turnstile_claim(ts);
1055 				else
1056 					turnstile_cancel(ts);
1057 				break;
1058 			}
1059 			goto retry_ts;
1060 		}
1061 		/*
1062 		 * If the RW_LOCK_WRITE_WAITERS flag isn't set, then try to
1063 		 * set it.  If we fail to set it, then loop back and try
1064 		 * again.
1065 		 */
1066 		if (!(v & RW_LOCK_WRITE_WAITERS)) {
1067 			if (!atomic_fcmpset_ptr(&rw->rw_lock, &v,
1068 			    v | RW_LOCK_WRITE_WAITERS))
1069 				goto retry_ts;
1070 			if (LOCK_LOG_TEST(&rw->lock_object, 0))
1071 				CTR2(KTR_LOCK, "%s: %p set write waiters flag",
1072 				    __func__, rw);
1073 		}
1074 		/*
1075 		 * We were unable to acquire the lock and the write waiters
1076 		 * flag is set, so we must block on the turnstile.
1077 		 */
1078 		if (LOCK_LOG_TEST(&rw->lock_object, 0))
1079 			CTR2(KTR_LOCK, "%s: %p blocking on turnstile", __func__,
1080 			    rw);
1081 #ifdef KDTRACE_HOOKS
1082 		sleep_time -= lockstat_nsecs(&rw->lock_object);
1083 #endif
1084 		MPASS(owner == rw_owner(rw));
1085 		turnstile_wait(ts, owner, TS_EXCLUSIVE_QUEUE);
1086 #ifdef KDTRACE_HOOKS
1087 		sleep_time += lockstat_nsecs(&rw->lock_object);
1088 		sleep_cnt++;
1089 #endif
1090 		if (LOCK_LOG_TEST(&rw->lock_object, 0))
1091 			CTR2(KTR_LOCK, "%s: %p resuming from turnstile",
1092 			    __func__, rw);
1093 #ifdef ADAPTIVE_RWLOCKS
1094 		spintries = 0;
1095 #endif
1096 		v = RW_READ_VALUE(rw);
1097 	}
1098 #if defined(KDTRACE_HOOKS) || defined(LOCK_PROFILING)
1099 	if (__predict_true(!doing_lockprof))
1100 		return;
1101 #endif
1102 #ifdef KDTRACE_HOOKS
1103 	all_time += lockstat_nsecs(&rw->lock_object);
1104 	if (sleep_time)
1105 		LOCKSTAT_RECORD4(rw__block, rw, sleep_time,
1106 		    LOCKSTAT_WRITER, (state & RW_LOCK_READ) == 0,
1107 		    (state & RW_LOCK_READ) == 0 ? 0 : RW_READERS(state));
1108 
1109 	/* Record only the loops spinning and not sleeping. */
1110 	if (lda.spin_cnt > sleep_cnt)
1111 		LOCKSTAT_RECORD4(rw__spin, rw, all_time - sleep_time,
1112 		    LOCKSTAT_WRITER, (state & RW_LOCK_READ) == 0,
1113 		    (state & RW_LOCK_READ) == 0 ? 0 : RW_READERS(state));
1114 out_lockstat:
1115 #endif
1116 	LOCKSTAT_PROFILE_OBTAIN_RWLOCK_SUCCESS(rw__acquire, rw, contested,
1117 	    waittime, file, line, LOCKSTAT_WRITER);
1118 }
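
/*
 * The LOCKSTAT_* and lockstat_nsecs() hooks above feed the DTrace lockstat
 * provider; a hedged example of watching rwlock write contention from
 * userland (probe naming assumes the usual "__" to "-" SDT translation):
 *
 *	# dtrace -n 'lockstat:::rw-block { @[stack()] = count(); }'
 */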
1119 
1120 /*
1121  * This function is called if lockstat is active or the first try at releasing
1122  * a write lock failed.  The latter means that the lock is either recursed
1123  * or has at least one of the two waiter bits set, indicating that at least
1124  * one thread is waiting on this lock.
1125  */
1126 void
1127 __rw_wunlock_hard(volatile uintptr_t *c, uintptr_t v LOCK_FILE_LINE_ARG_DEF)
1128 {
1129 	struct rwlock *rw;
1130 	struct turnstile *ts;
1131 	uintptr_t tid, setv;
1132 	int queue;
1133 
1134 	tid = (uintptr_t)curthread;
1135 	if (SCHEDULER_STOPPED())
1136 		return;
1137 
1138 	rw = rwlock2rw(c);
1139 	if (__predict_false(v == tid))
1140 		v = RW_READ_VALUE(rw);
1141 
1142 	if (v & RW_LOCK_WRITER_RECURSED) {
1143 		if (--(rw->rw_recurse) == 0)
1144 			atomic_clear_ptr(&rw->rw_lock, RW_LOCK_WRITER_RECURSED);
1145 		if (LOCK_LOG_TEST(&rw->lock_object, 0))
1146 			CTR2(KTR_LOCK, "%s: %p unrecursing", __func__, rw);
1147 		return;
1148 	}
1149 
1150 	LOCKSTAT_PROFILE_RELEASE_RWLOCK(rw__release, rw, LOCKSTAT_WRITER);
1151 	if (v == tid && _rw_write_unlock(rw, tid))
1152 		return;
1153 
1154 	KASSERT(rw->rw_lock & (RW_LOCK_READ_WAITERS | RW_LOCK_WRITE_WAITERS),
1155 	    ("%s: neither of the waiter flags are set", __func__));
1156 
1157 	if (LOCK_LOG_TEST(&rw->lock_object, 0))
1158 		CTR2(KTR_LOCK, "%s: %p contested", __func__, rw);
1159 
1160 	turnstile_chain_lock(&rw->lock_object);
1161 
1162 	/*
1163 	 * Use the same algorithm as sx locks for now.  Prefer waking up shared
1164 	 * waiters, if we have any, over writers.  This is probably not ideal.
1165 	 *
1166 	 * 'v' is the value we are going to write back to rw_lock.  If we
1167 	 * have waiters on both queues, we need to preserve the state of
1168 	 * the waiter flag for the queue we don't wake up.  For now this is
1169 	 * hardcoded for the algorithm mentioned above.
1170 	 *
1171 	 * In the case of both readers and writers waiting we wakeup the
1172 	 * readers but leave the RW_LOCK_WRITE_WAITERS flag set.  If a
1173 	 * new writer comes in before a reader it will claim the lock up
1174 	 * above.  There is probably a potential priority inversion in
1175 	 * there that could be worked around either by waking both queues
1176 	 * of waiters or doing some complicated lock handoff gymnastics.
1177 	 */
1178 	setv = RW_UNLOCKED;
1179 	v = RW_READ_VALUE(rw);
1180 	queue = TS_SHARED_QUEUE;
1181 	if (v & RW_LOCK_WRITE_WAITERS) {
1182 		queue = TS_EXCLUSIVE_QUEUE;
1183 		setv |= (v & RW_LOCK_READ_WAITERS);
1184 	}
1185 	atomic_store_rel_ptr(&rw->rw_lock, setv);
1186 
1187 	/* Wake up all waiters for the specific queue. */
1188 	if (LOCK_LOG_TEST(&rw->lock_object, 0))
1189 		CTR3(KTR_LOCK, "%s: %p waking up %s waiters", __func__, rw,
1190 		    queue == TS_SHARED_QUEUE ? "read" : "write");
1191 
1192 	ts = turnstile_lookup(&rw->lock_object);
1193 	MPASS(ts != NULL);
1194 	turnstile_broadcast(ts, queue);
1195 	turnstile_unpend(ts, TS_EXCLUSIVE_LOCK);
1196 	turnstile_chain_unlock(&rw->lock_object);
1197 }
1198 
1199 /*
1200  * Attempt to do a non-blocking upgrade from a read lock to a write
1201  * lock.  This will only succeed if this thread holds a single read
1202  * lock.  Returns true if the upgrade succeeded and false otherwise.
1203  */
1204 int
1205 __rw_try_upgrade_int(struct rwlock *rw LOCK_FILE_LINE_ARG_DEF)
1206 {
1207 	uintptr_t v, x, tid;
1208 	struct turnstile *ts;
1209 	int success;
1210 
1211 	if (SCHEDULER_STOPPED())
1212 		return (1);
1213 
1214 	KASSERT(rw->rw_lock != RW_DESTROYED,
1215 	    ("rw_try_upgrade() of destroyed rwlock @ %s:%d", file, line));
1216 	__rw_assert(&rw->rw_lock, RA_RLOCKED, file, line);
1217 
1218 	/*
1219 	 * Attempt to switch from one reader to a writer.  If there
1220 	 * are any write waiters, then we will have to lock the
1221 	 * turnstile first to prevent races with another writer
1222 	 * calling turnstile_wait() before we have claimed this
1223 	 * turnstile.  So, do the simple case of no waiters first.
1224 	 */
1225 	tid = (uintptr_t)curthread;
1226 	success = 0;
1227 	for (;;) {
1228 		v = rw->rw_lock;
1229 		if (RW_READERS(v) > 1)
1230 			break;
1231 		if (!(v & RW_LOCK_WAITERS)) {
1232 			success = atomic_cmpset_acq_ptr(&rw->rw_lock, v, tid);
1233 			if (!success)
1234 				continue;
1235 			break;
1236 		}
1237 
1238 		/*
1239 		 * Ok, we think we have waiters, so lock the turnstile.
1240 		 */
1241 		ts = turnstile_trywait(&rw->lock_object);
1242 		v = rw->rw_lock;
1243 		if (RW_READERS(v) > 1) {
1244 			turnstile_cancel(ts);
1245 			break;
1246 		}
1247 		/*
1248 		 * Try to switch from one reader to a writer again.  This time
1249 		 * we honor the current state of the waiters flags.
1250 		 * If we obtain the lock with the flags set, then claim
1251 		 * ownership of the turnstile.
1252 		 */
1253 		x = rw->rw_lock & RW_LOCK_WAITERS;
1254 		success = atomic_cmpset_ptr(&rw->rw_lock, v, tid | x);
1255 		if (success) {
1256 			if (x)
1257 				turnstile_claim(ts);
1258 			else
1259 				turnstile_cancel(ts);
1260 			break;
1261 		}
1262 		turnstile_cancel(ts);
1263 	}
1264 	LOCK_LOG_TRY("WUPGRADE", &rw->lock_object, 0, success, file, line);
1265 	if (success) {
1266 		curthread->td_rw_rlocks--;
1267 		WITNESS_UPGRADE(&rw->lock_object, LOP_EXCLUSIVE | LOP_TRYLOCK,
1268 		    file, line);
1269 		LOCKSTAT_RECORD0(rw__upgrade, rw);
1270 	}
1271 	return (success);
1272 }
1273 
1274 int
1275 __rw_try_upgrade(volatile uintptr_t *c, const char *file, int line)
1276 {
1277 	struct rwlock *rw;
1278 
1279 	rw = rwlock2rw(c);
1280 	return (__rw_try_upgrade_int(rw LOCK_FILE_LINE_ARG));
1281 }
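
/*
 * A hedged sketch of the common upgrade pattern ("foo" names illustrative):
 * on failure the read lock is still held, so callers typically drop it,
 * take the write lock and revalidate whatever they computed while reading.
 *
 *	rw_rlock(&foo_lock);
 *	if (needs_update && !rw_try_upgrade(&foo_lock)) {
 *		rw_runlock(&foo_lock);
 *		rw_wlock(&foo_lock);
 *		... re-check needs_update; it may have changed ...
 *	}
 */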
1282 
1283 /*
1284  * Downgrade a write lock into a single read lock.
1285  */
1286 void
1287 __rw_downgrade_int(struct rwlock *rw LOCK_FILE_LINE_ARG_DEF)
1288 {
1289 	struct turnstile *ts;
1290 	uintptr_t tid, v;
1291 	int rwait, wwait;
1292 
1293 	if (SCHEDULER_STOPPED())
1294 		return;
1295 
1296 	KASSERT(rw->rw_lock != RW_DESTROYED,
1297 	    ("rw_downgrade() of destroyed rwlock @ %s:%d", file, line));
1298 	__rw_assert(&rw->rw_lock, RA_WLOCKED | RA_NOTRECURSED, file, line);
1299 #ifndef INVARIANTS
1300 	if (rw_recursed(rw))
1301 		panic("downgrade of a recursed lock");
1302 #endif
1303 
1304 	WITNESS_DOWNGRADE(&rw->lock_object, 0, file, line);
1305 
1306 	/*
1307 	 * Convert from a writer to a single reader.  First we handle
1308 	 * the easy case with no waiters.  If there are any waiters, we
1309 	 * lock the turnstile and "disown" the lock.
1310 	 */
1311 	tid = (uintptr_t)curthread;
1312 	if (atomic_cmpset_rel_ptr(&rw->rw_lock, tid, RW_READERS_LOCK(1)))
1313 		goto out;
1314 
1315 	/*
1316 	 * Ok, we think we have waiters, so lock the turnstile so we can
1317 	 * read the waiter flags without any races.
1318 	 */
1319 	turnstile_chain_lock(&rw->lock_object);
1320 	v = rw->rw_lock & RW_LOCK_WAITERS;
1321 	rwait = v & RW_LOCK_READ_WAITERS;
1322 	wwait = v & RW_LOCK_WRITE_WAITERS;
1323 	MPASS(rwait | wwait);
1324 
1325 	/*
1326 	 * Downgrade from a write lock while preserving waiters flag
1327 	 * and give up ownership of the turnstile.
1328 	 */
1329 	ts = turnstile_lookup(&rw->lock_object);
1330 	MPASS(ts != NULL);
1331 	if (!wwait)
1332 		v &= ~RW_LOCK_READ_WAITERS;
1333 	atomic_store_rel_ptr(&rw->rw_lock, RW_READERS_LOCK(1) | v);
1334 	/*
1335 	 * Wake other readers if there are no writers pending.  Otherwise they
1336 	 * won't be able to acquire the lock anyway.
1337 	 */
1338 	if (rwait && !wwait) {
1339 		turnstile_broadcast(ts, TS_SHARED_QUEUE);
1340 		turnstile_unpend(ts, TS_EXCLUSIVE_LOCK);
1341 	} else
1342 		turnstile_disown(ts);
1343 	turnstile_chain_unlock(&rw->lock_object);
1344 out:
1345 	curthread->td_rw_rlocks++;
1346 	LOCK_LOG_LOCK("WDOWNGRADE", &rw->lock_object, 0, 0, file, line);
1347 	LOCKSTAT_RECORD0(rw__downgrade, rw);
1348 }
1349 
1350 void
1351 __rw_downgrade(volatile uintptr_t *c, const char *file, int line)
1352 {
1353 	struct rwlock *rw;
1354 
1355 	rw = rwlock2rw(c);
1356 	__rw_downgrade_int(rw LOCK_FILE_LINE_ARG);
1357 }
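
/*
 * A hedged sketch of a typical downgrade ("foo" names illustrative): the
 * writer publishes its update and keeps reading without ever fully
 * releasing the lock:
 *
 *	rw_wlock(&foo_lock);
 *	... modify shared state ...
 *	rw_downgrade(&foo_lock);
 *	... keep reading the state just written ...
 *	rw_runlock(&foo_lock);
 */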
1358 
1359 #ifdef INVARIANT_SUPPORT
1360 #ifndef INVARIANTS
1361 #undef __rw_assert
1362 #endif
1363 
1364 /*
1365  * In the non-WITNESS case, rw_assert() can only detect that at least
1366  * *some* thread owns an rlock, but it cannot guarantee that *this*
1367  * thread owns an rlock.
1368  */
1369 void
1370 __rw_assert(const volatile uintptr_t *c, int what, const char *file, int line)
1371 {
1372 	const struct rwlock *rw;
1373 
1374 	if (panicstr != NULL)
1375 		return;
1376 
1377 	rw = rwlock2rw(c);
1378 
1379 	switch (what) {
1380 	case RA_LOCKED:
1381 	case RA_LOCKED | RA_RECURSED:
1382 	case RA_LOCKED | RA_NOTRECURSED:
1383 	case RA_RLOCKED:
1384 	case RA_RLOCKED | RA_RECURSED:
1385 	case RA_RLOCKED | RA_NOTRECURSED:
1386 #ifdef WITNESS
1387 		witness_assert(&rw->lock_object, what, file, line);
1388 #else
1389 		/*
1390 		 * If some other thread has a write lock or we have one
1391 		 * and are asserting a read lock, fail.  Also, if no one
1392 		 * has a lock at all, fail.
1393 		 */
1394 		if (rw->rw_lock == RW_UNLOCKED ||
1395 		    (!(rw->rw_lock & RW_LOCK_READ) && (what & RA_RLOCKED ||
1396 		    rw_wowner(rw) != curthread)))
1397 			panic("Lock %s not %slocked @ %s:%d\n",
1398 			    rw->lock_object.lo_name, (what & RA_RLOCKED) ?
1399 			    "read " : "", file, line);
1400 
1401 		if (!(rw->rw_lock & RW_LOCK_READ) && !(what & RA_RLOCKED)) {
1402 			if (rw_recursed(rw)) {
1403 				if (what & RA_NOTRECURSED)
1404 					panic("Lock %s recursed @ %s:%d\n",
1405 					    rw->lock_object.lo_name, file,
1406 					    line);
1407 			} else if (what & RA_RECURSED)
1408 				panic("Lock %s not recursed @ %s:%d\n",
1409 				    rw->lock_object.lo_name, file, line);
1410 		}
1411 #endif
1412 		break;
1413 	case RA_WLOCKED:
1414 	case RA_WLOCKED | RA_RECURSED:
1415 	case RA_WLOCKED | RA_NOTRECURSED:
1416 		if (rw_wowner(rw) != curthread)
1417 			panic("Lock %s not exclusively locked @ %s:%d\n",
1418 			    rw->lock_object.lo_name, file, line);
1419 		if (rw_recursed(rw)) {
1420 			if (what & RA_NOTRECURSED)
1421 				panic("Lock %s recursed @ %s:%d\n",
1422 				    rw->lock_object.lo_name, file, line);
1423 		} else if (what & RA_RECURSED)
1424 			panic("Lock %s not recursed @ %s:%d\n",
1425 			    rw->lock_object.lo_name, file, line);
1426 		break;
1427 	case RA_UNLOCKED:
1428 #ifdef WITNESS
1429 		witness_assert(&rw->lock_object, what, file, line);
1430 #else
1431 		/*
1432 		 * If we hold a write lock, fail.  We can't reliably check
1433 		 * to see if we hold a read lock or not.
1434 		 */
1435 		if (rw_wowner(rw) == curthread)
1436 			panic("Lock %s exclusively locked @ %s:%d\n",
1437 			    rw->lock_object.lo_name, file, line);
1438 #endif
1439 		break;
1440 	default:
1441 		panic("Unknown rw lock assertion: %d @ %s:%d", what, file,
1442 		    line);
1443 	}
1444 }
1445 #endif /* INVARIANT_SUPPORT */
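
/*
 * A short, hedged example of the assertion interface exercised above
 * (only enforced in kernels built with INVARIANTS/INVARIANT_SUPPORT;
 * "foo_lock" is illustrative):
 *
 *	rw_assert(&foo_lock, RA_WLOCKED);
 *	rw_assert(&foo_lock, RA_RLOCKED | RA_NOTRECURSED);
 *	rw_assert(&foo_lock, RA_UNLOCKED);
 */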
1446 
1447 #ifdef DDB
1448 void
1449 db_show_rwlock(const struct lock_object *lock)
1450 {
1451 	const struct rwlock *rw;
1452 	struct thread *td;
1453 
1454 	rw = (const struct rwlock *)lock;
1455 
1456 	db_printf(" state: ");
1457 	if (rw->rw_lock == RW_UNLOCKED)
1458 		db_printf("UNLOCKED\n");
1459 	else if (rw->rw_lock == RW_DESTROYED) {
1460 		db_printf("DESTROYED\n");
1461 		return;
1462 	} else if (rw->rw_lock & RW_LOCK_READ)
1463 		db_printf("RLOCK: %ju locks\n",
1464 		    (uintmax_t)(RW_READERS(rw->rw_lock)));
1465 	else {
1466 		td = rw_wowner(rw);
1467 		db_printf("WLOCK: %p (tid %d, pid %d, \"%s\")\n", td,
1468 		    td->td_tid, td->td_proc->p_pid, td->td_name);
1469 		if (rw_recursed(rw))
1470 			db_printf(" recursed: %u\n", rw->rw_recurse);
1471 	}
1472 	db_printf(" waiters: ");
1473 	switch (rw->rw_lock & (RW_LOCK_READ_WAITERS | RW_LOCK_WRITE_WAITERS)) {
1474 	case RW_LOCK_READ_WAITERS:
1475 		db_printf("readers\n");
1476 		break;
1477 	case RW_LOCK_WRITE_WAITERS:
1478 		db_printf("writers\n");
1479 		break;
1480 	case RW_LOCK_READ_WAITERS | RW_LOCK_WRITE_WAITERS:
1481 		db_printf("readers and writers\n");
1482 		break;
1483 	default:
1484 		db_printf("none\n");
1485 		break;
1486 	}
1487 }
1488 
1489 #endif
1490