xref: /linux/kernel/locking/rwsem.c (revision 3fd6c59042dbba50391e30862beac979491145fe)
1  // SPDX-License-Identifier: GPL-2.0
2  /* kernel/rwsem.c: R/W semaphores, public implementation
3   *
4   * Written by David Howells (dhowells@redhat.com).
5   * Derived from asm-i386/semaphore.h
6   *
7   * Writer lock-stealing by Alex Shi <alex.shi@intel.com>
8   * and Michel Lespinasse <walken@google.com>
9   *
10   * Optimistic spinning by Tim Chen <tim.c.chen@intel.com>
11   * and Davidlohr Bueso <davidlohr@hp.com>. Based on mutexes.
12   *
13   * Rwsem count bit fields re-definition and rwsem rearchitecture by
14   * Waiman Long <longman@redhat.com> and
15   * Peter Zijlstra <peterz@infradead.org>.
16   */
17  
18  #include <linux/types.h>
19  #include <linux/kernel.h>
20  #include <linux/sched.h>
21  #include <linux/sched/rt.h>
22  #include <linux/sched/task.h>
23  #include <linux/sched/debug.h>
24  #include <linux/sched/wake_q.h>
25  #include <linux/sched/signal.h>
26  #include <linux/sched/clock.h>
27  #include <linux/export.h>
28  #include <linux/rwsem.h>
29  #include <linux/atomic.h>
30  #include <trace/events/lock.h>
31  
32  #ifndef CONFIG_PREEMPT_RT
33  #include "lock_events.h"
34  
35  /*
36   * The least significant 2 bits of the owner value have the following
37   * meanings when set.
38   *  - Bit 0: RWSEM_READER_OWNED - rwsem may be owned by readers (just a hint)
39   *  - Bit 1: RWSEM_NONSPINNABLE - Cannot spin on a reader-owned lock
40   *
41   * When the rwsem is reader-owned and a spinning writer has timed out,
42   * the nonspinnable bit will be set to disable optimistic spinning.
43   *
44   * When a writer acquires a rwsem, it puts its task_struct pointer
45   * into the owner field. It is cleared after an unlock.
46   *
47   * When a reader acquires a rwsem, it will also put its task_struct
48   * pointer into the owner field with the RWSEM_READER_OWNED bit set.
49   * On unlock, the owner field will largely be left untouched. So
50   * for a free or reader-owned rwsem, the owner value may contain
51   * information about the last reader that acquired the rwsem.
52   *
53   * That information may be helpful in debugging cases where the system
54   * seems to hang on a reader-owned rwsem, especially if only one reader
55   * is involved. Ideally we would like to track all the readers that own
56   * a rwsem, but the overhead is simply too big.
57   *
58   * Fast path reader optimistic lock stealing is supported when the rwsem
59   * was previously owned by a writer and the following conditions are met:
60   *  - rwsem is not currently writer owned
61   *  - the handoff isn't set.
62   */
63  #define RWSEM_READER_OWNED	(1UL << 0)
64  #define RWSEM_NONSPINNABLE	(1UL << 1)
65  #define RWSEM_OWNER_FLAGS_MASK	(RWSEM_READER_OWNED | RWSEM_NONSPINNABLE)
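/*
 * As an illustrative sketch only (not part of this file's API), an owner
 * word stored by a reader decomposes as follows, assuming a hypothetical
 * task_struct pointer 'tsk' (task_struct is aligned well beyond 4 bytes,
 * so its low two bits are always zero and can carry the flags):
 *
 *	unsigned long owner = (unsigned long)tsk | RWSEM_READER_OWNED;
 *
 *	(struct task_struct *)(owner & ~RWSEM_OWNER_FLAGS_MASK) == tsk
 *	(owner & RWSEM_READER_OWNED) != 0	reader-owned hint is set
 *	(owner & RWSEM_NONSPINNABLE) == 0	optimistic spinning still allowed
 */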
66  
67  #ifdef CONFIG_DEBUG_RWSEMS
68  # define DEBUG_RWSEMS_WARN_ON(c, sem)	do {			\
69  	if (!debug_locks_silent &&				\
70  	    WARN_ONCE(c, "DEBUG_RWSEMS_WARN_ON(%s): count = 0x%lx, magic = 0x%lx, owner = 0x%lx, curr 0x%lx, list %sempty\n",\
71  		#c, atomic_long_read(&(sem)->count),		\
72  		(unsigned long) sem->magic,			\
73  		atomic_long_read(&(sem)->owner), (long)current,	\
74  		list_empty(&(sem)->wait_list) ? "" : "not "))	\
75  			debug_locks_off();			\
76  	} while (0)
77  #else
78  # define DEBUG_RWSEMS_WARN_ON(c, sem)
79  #endif
80  
81  /*
82   * On 64-bit architectures, the bit definitions of the count are:
83   *
84   * Bit  0    - writer locked bit
85   * Bit  1    - waiters present bit
86   * Bit  2    - lock handoff bit
87   * Bits 3-7  - reserved
88   * Bits 8-62 - 55-bit reader count
89   * Bit  63   - read fail bit
90   *
91   * On 32-bit architectures, the bit definitions of the count are:
92   *
93   * Bit  0    - writer locked bit
94   * Bit  1    - waiters present bit
95   * Bit  2    - lock handoff bit
96   * Bits 3-7  - reserved
97   * Bits 8-30 - 23-bit reader count
98   * Bit  31   - read fail bit
99   *
100   * It is not likely that the most significant bit (read fail bit) will ever
101   * be set. This guard bit is still checked anyway in the down_read() fastpath
102   * just in case we need to use up more of the reader bits for other purposes
103   * in the future.
104   *
105   * atomic_long_fetch_add() is used to obtain the reader lock, whereas
106   * atomic_long_cmpxchg() is used to obtain the writer lock.
107   *
108   * There are three places where the lock handoff bit may be set or cleared.
109   * 1) rwsem_mark_wake() for readers		-- set, clear
110   * 2) rwsem_try_write_lock() for writers	-- set, clear
111   * 3) rwsem_del_waiter()			-- clear
112   *
113   * For all the above cases, wait_lock will be held. A writer must also
114   * be the first one in the wait_list to be eligible for setting the handoff
115   * bit. So concurrent setting/clearing of handoff bit is not possible.
116   */
117  #define RWSEM_WRITER_LOCKED	(1UL << 0)
118  #define RWSEM_FLAG_WAITERS	(1UL << 1)
119  #define RWSEM_FLAG_HANDOFF	(1UL << 2)
120  #define RWSEM_FLAG_READFAIL	(1UL << (BITS_PER_LONG - 1))
121  
122  #define RWSEM_READER_SHIFT	8
123  #define RWSEM_READER_BIAS	(1UL << RWSEM_READER_SHIFT)
124  #define RWSEM_READER_MASK	(~(RWSEM_READER_BIAS - 1))
125  #define RWSEM_WRITER_MASK	RWSEM_WRITER_LOCKED
126  #define RWSEM_LOCK_MASK		(RWSEM_WRITER_MASK|RWSEM_READER_MASK)
127  #define RWSEM_READ_FAILED_MASK	(RWSEM_WRITER_MASK|RWSEM_FLAG_WAITERS|\
128  				 RWSEM_FLAG_HANDOFF|RWSEM_FLAG_READFAIL)
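/*
 * A minimal worked example (illustrative values only) of how a count word
 * decomposes with the masks above, for an rwsem held by two readers while
 * at least one waiter is queued:
 *
 *	long count = 2 * RWSEM_READER_BIAS + RWSEM_FLAG_WAITERS;	// 0x202
 *
 *	(count & RWSEM_WRITER_MASK) == 0			no writer holds the lock
 *	(count & RWSEM_READER_MASK) >> RWSEM_READER_SHIFT == 2	two active readers
 *	(count & RWSEM_FLAG_WAITERS) != 0			wait_list is non-empty
 *	(count & RWSEM_FLAG_HANDOFF) == 0			no handoff requested
 */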
129  
130  /*
131   * All writes to owner are protected by WRITE_ONCE() to make sure that
132   * store tearing can't happen as optimistic spinners may read and use
133   * the owner value concurrently without the lock. Reads from owner, however,
134   * may not need READ_ONCE() as long as the pointer value is only used
135   * for comparison and isn't being dereferenced.
136   *
137   * Both rwsem_{set,clear}_owner() functions should be in the same
138   * preempt disable section as the atomic op that changes sem->count.
139   */
140  static inline void rwsem_set_owner(struct rw_semaphore *sem)
141  {
142  	lockdep_assert_preemption_disabled();
143  	atomic_long_set(&sem->owner, (long)current);
144  }
145  
146  static inline void rwsem_clear_owner(struct rw_semaphore *sem)
147  {
148  	lockdep_assert_preemption_disabled();
149  	atomic_long_set(&sem->owner, 0);
150  }
151  
152  /*
153   * Test the flags in the owner field.
154   */
155  static inline bool rwsem_test_oflags(struct rw_semaphore *sem, long flags)
156  {
157  	return atomic_long_read(&sem->owner) & flags;
158  }
159  
160  /*
161   * The task_struct pointer of the last owning reader will be left in
162   * the owner field.
163   *
164   * Note that the owner value just indicates the task has owned the rwsem
165   * previously; it may not be the real owner or one of the real owners
166   * anymore when that field is examined, so take it with a grain of salt.
167   *
168   * The reader non-spinnable bit is preserved.
169   */
170  static inline void __rwsem_set_reader_owned(struct rw_semaphore *sem,
171  					    struct task_struct *owner)
172  {
173  	unsigned long val = (unsigned long)owner | RWSEM_READER_OWNED |
174  		(atomic_long_read(&sem->owner) & RWSEM_NONSPINNABLE);
175  
176  	atomic_long_set(&sem->owner, val);
177  }
178  
179  static inline void rwsem_set_reader_owned(struct rw_semaphore *sem)
180  {
181  	__rwsem_set_reader_owned(sem, current);
182  }
183  
184  #ifdef CONFIG_DEBUG_RWSEMS
185  /*
186   * Return just the real task structure pointer of the owner
187   */
188  static inline struct task_struct *rwsem_owner(struct rw_semaphore *sem)
189  {
190  	return (struct task_struct *)
191  		(atomic_long_read(&sem->owner) & ~RWSEM_OWNER_FLAGS_MASK);
192  }
193  
194  /*
195   * Return true if the rwsem is owned by a reader.
196   */
197  static inline bool is_rwsem_reader_owned(struct rw_semaphore *sem)
198  {
199  	/*
200  	 * Check the count to see if it is write-locked.
201  	 */
202  	long count = atomic_long_read(&sem->count);
203  
204  	if (count & RWSEM_WRITER_MASK)
205  		return false;
206  	return rwsem_test_oflags(sem, RWSEM_READER_OWNED);
207  }
208  
209  /*
210   * With CONFIG_DEBUG_RWSEMS configured, it will make sure that if there
211   * is a task pointer in the owner field of a reader-owned rwsem, it will be the
212   * real owner or one of the real owners. The only exception is when the
213   * unlock is done by up_read_non_owner().
214   */
215  static inline void rwsem_clear_reader_owned(struct rw_semaphore *sem)
216  {
217  	unsigned long val = atomic_long_read(&sem->owner);
218  
219  	while ((val & ~RWSEM_OWNER_FLAGS_MASK) == (unsigned long)current) {
220  		if (atomic_long_try_cmpxchg(&sem->owner, &val,
221  					    val & RWSEM_OWNER_FLAGS_MASK))
222  			return;
223  	}
224  }
225  #else
226  static inline void rwsem_clear_reader_owned(struct rw_semaphore *sem)
227  {
228  }
229  #endif
230  
231  /*
232   * Set the RWSEM_NONSPINNABLE bit if the RWSEM_READER_OWNED flag
233   * remains set. Otherwise, the operation will be aborted.
234   */
235  static inline void rwsem_set_nonspinnable(struct rw_semaphore *sem)
236  {
237  	unsigned long owner = atomic_long_read(&sem->owner);
238  
239  	do {
240  		if (!(owner & RWSEM_READER_OWNED))
241  			break;
242  		if (owner & RWSEM_NONSPINNABLE)
243  			break;
244  	} while (!atomic_long_try_cmpxchg(&sem->owner, &owner,
245  					  owner | RWSEM_NONSPINNABLE));
246  }
247  
248  static inline bool rwsem_read_trylock(struct rw_semaphore *sem, long *cntp)
249  {
250  	*cntp = atomic_long_add_return_acquire(RWSEM_READER_BIAS, &sem->count);
251  
252  	if (WARN_ON_ONCE(*cntp < 0))
253  		rwsem_set_nonspinnable(sem);
254  
255  	if (!(*cntp & RWSEM_READ_FAILED_MASK)) {
256  		rwsem_set_reader_owned(sem);
257  		return true;
258  	}
259  
260  	return false;
261  }
262  
263  static inline bool rwsem_write_trylock(struct rw_semaphore *sem)
264  {
265  	long tmp = RWSEM_UNLOCKED_VALUE;
266  
267  	if (atomic_long_try_cmpxchg_acquire(&sem->count, &tmp, RWSEM_WRITER_LOCKED)) {
268  		rwsem_set_owner(sem);
269  		return true;
270  	}
271  
272  	return false;
273  }
274  
275  /*
276   * Return the real task structure pointer of the owner and the embedded
277   * flags in the owner. pflags must be non-NULL.
278   */
279  static inline struct task_struct *
280  rwsem_owner_flags(struct rw_semaphore *sem, unsigned long *pflags)
281  {
282  	unsigned long owner = atomic_long_read(&sem->owner);
283  
284  	*pflags = owner & RWSEM_OWNER_FLAGS_MASK;
285  	return (struct task_struct *)(owner & ~RWSEM_OWNER_FLAGS_MASK);
286  }
287  
288  /*
289   * Guide to the rw_semaphore's count field.
290   *
291   * When the RWSEM_WRITER_LOCKED bit in count is set, the lock is owned
292   * by a writer.
293   *
294   * The lock is owned by readers when
295   * (1) the RWSEM_WRITER_LOCKED isn't set in count,
296   * (2) some of the reader bits are set in count, and
297   * (3) the owner field has the RWSEM_READER_OWNED bit set.
298   *
299   * Having some reader bits set is not enough to guarantee a reader-owned
300   * lock as the readers may be in the process of backing out from the count
301   * and a writer has just released the lock. So another writer may steal
302   * the lock immediately after that.
303   */
304  
305  /*
306   * Initialize an rwsem:
307   */
308  void __init_rwsem(struct rw_semaphore *sem, const char *name,
309  		  struct lock_class_key *key)
310  {
311  #ifdef CONFIG_DEBUG_LOCK_ALLOC
312  	/*
313  	 * Make sure we are not reinitializing a held semaphore:
314  	 */
315  	debug_check_no_locks_freed((void *)sem, sizeof(*sem));
316  	lockdep_init_map_wait(&sem->dep_map, name, key, 0, LD_WAIT_SLEEP);
317  #endif
318  #ifdef CONFIG_DEBUG_RWSEMS
319  	sem->magic = sem;
320  #endif
321  	atomic_long_set(&sem->count, RWSEM_UNLOCKED_VALUE);
322  	raw_spin_lock_init(&sem->wait_lock);
323  	INIT_LIST_HEAD(&sem->wait_list);
324  	atomic_long_set(&sem->owner, 0L);
325  #ifdef CONFIG_RWSEM_SPIN_ON_OWNER
326  	osq_lock_init(&sem->osq);
327  #endif
328  }
329  EXPORT_SYMBOL(__init_rwsem);
330  
331  enum rwsem_waiter_type {
332  	RWSEM_WAITING_FOR_WRITE,
333  	RWSEM_WAITING_FOR_READ
334  };
335  
336  struct rwsem_waiter {
337  	struct list_head list;
338  	struct task_struct *task;
339  	enum rwsem_waiter_type type;
340  	unsigned long timeout;
341  	bool handoff_set;
342  };
343  #define rwsem_first_waiter(sem) \
344  	list_first_entry(&sem->wait_list, struct rwsem_waiter, list)
345  
346  enum rwsem_wake_type {
347  	RWSEM_WAKE_ANY,		/* Wake whatever's at head of wait list */
348  	RWSEM_WAKE_READERS,	/* Wake readers only */
349  	RWSEM_WAKE_READ_OWNED	/* Waker thread holds the read lock */
350  };
351  
352  /*
353   * The typical HZ value is either 250 or 1000. So set the minimum waiting
354   * time to at least 4ms or 1 jiffy (if it is higher than 4ms) in the wait
355   * queue before initiating the handoff protocol.
356   */
357  #define RWSEM_WAIT_TIMEOUT	DIV_ROUND_UP(HZ, 250)
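/*
 * Worked examples of the timeout above (assumed HZ values, for illustration
 * only): with HZ=1000, DIV_ROUND_UP(1000, 250) = 4 jiffies = 4ms; with
 * HZ=250 it is 1 jiffy, which is also 4ms; with HZ=100 it is 1 jiffy,
 * i.e. 10ms, matching the "1 jiffy if that is higher than 4ms" rule.
 */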
358  
359  /*
360   * Magic number to batch-wakeup waiting readers, even when writers are
361   * also present in the queue. This both limits the amount of work the
362   * waking thread must do and also prevents any potential counter overflow,
363   * however unlikely.
364   */
365  #define MAX_READERS_WAKEUP	0x100
366  
367  static inline void
368  rwsem_add_waiter(struct rw_semaphore *sem, struct rwsem_waiter *waiter)
369  {
370  	lockdep_assert_held(&sem->wait_lock);
371  	list_add_tail(&waiter->list, &sem->wait_list);
372  	/* caller will set RWSEM_FLAG_WAITERS */
373  }
374  
375  /*
376   * Remove a waiter from the wait_list and clear flags.
377   *
378   * Both rwsem_mark_wake() and rwsem_try_write_lock() contain a full 'copy' of
379   * this function. Modify with care.
380   *
381   * Return: true if wait_list isn't empty and false otherwise
382   */
383  static inline bool
384  rwsem_del_waiter(struct rw_semaphore *sem, struct rwsem_waiter *waiter)
385  {
386  	lockdep_assert_held(&sem->wait_lock);
387  	list_del(&waiter->list);
388  	if (likely(!list_empty(&sem->wait_list)))
389  		return true;
390  
391  	atomic_long_andnot(RWSEM_FLAG_HANDOFF | RWSEM_FLAG_WAITERS, &sem->count);
392  	return false;
393  }
394  
395  /*
396   * handle the lock release when processes blocked on it can now run
397   * - if we come here from up_xxxx(), then the RWSEM_FLAG_WAITERS bit must
398   *   have been set.
399   * - there must be someone on the queue
400   * - the wait_lock must be held by the caller
401   * - tasks are marked for wakeup, the caller must later invoke wake_up_q()
402   *   to actually wakeup the blocked task(s) and drop the reference count,
403   *   preferably when the wait_lock is released
404   * - woken process blocks are discarded from the list after having task zeroed
405   * - writers are only marked woken if downgrading is false
406   *
407   * Implies rwsem_del_waiter() for all woken readers.
408   */
409  static void rwsem_mark_wake(struct rw_semaphore *sem,
410  			    enum rwsem_wake_type wake_type,
411  			    struct wake_q_head *wake_q)
412  {
413  	struct rwsem_waiter *waiter, *tmp;
414  	long oldcount, woken = 0, adjustment = 0;
415  	struct list_head wlist;
416  
417  	lockdep_assert_held(&sem->wait_lock);
418  
419  	/*
420  	 * Take a peek at the queue head waiter such that we can determine
421  	 * the wakeup(s) to perform.
422  	 */
423  	waiter = rwsem_first_waiter(sem);
424  
425  	if (waiter->type == RWSEM_WAITING_FOR_WRITE) {
426  		if (wake_type == RWSEM_WAKE_ANY) {
427  			/*
428  			 * Mark writer at the front of the queue for wakeup.
429  			 * Until the task is actually awoken later by
430  			 * the caller, other writers are able to steal it.
431  			 * Readers, on the other hand, will block as they
432  			 * will notice the queued writer.
433  			 */
434  			wake_q_add(wake_q, waiter->task);
435  			lockevent_inc(rwsem_wake_writer);
436  		}
437  
438  		return;
439  	}
440  
441  	/*
442  	 * No reader wakeup if there are too many of them already.
443  	 */
444  	if (unlikely(atomic_long_read(&sem->count) < 0))
445  		return;
446  
447  	/*
448  	 * Writers might steal the lock before we grant it to the next reader.
449  	 * We prefer to do the first reader grant before counting readers
450  	 * so we can bail out early if a writer stole the lock.
451  	 */
452  	if (wake_type != RWSEM_WAKE_READ_OWNED) {
453  		struct task_struct *owner;
454  
455  		adjustment = RWSEM_READER_BIAS;
456  		oldcount = atomic_long_fetch_add(adjustment, &sem->count);
457  		if (unlikely(oldcount & RWSEM_WRITER_MASK)) {
458  			/*
459  			 * When we've been waiting "too" long (for writers
460  			 * to give up the lock), request a HANDOFF to
461  			 * force the issue.
462  			 */
463  			if (time_after(jiffies, waiter->timeout)) {
464  				if (!(oldcount & RWSEM_FLAG_HANDOFF)) {
465  					adjustment -= RWSEM_FLAG_HANDOFF;
466  					lockevent_inc(rwsem_rlock_handoff);
467  				}
468  				waiter->handoff_set = true;
469  			}
470  
471  			atomic_long_add(-adjustment, &sem->count);
472  			return;
473  		}
474  		/*
475  		 * Set it to reader-owned to give spinners an early
476  		 * indication that readers now have the lock.
477  		 * The reader nonspinnable bit seen at slowpath entry of
478  		 * the reader is copied over.
479  		 */
480  		owner = waiter->task;
481  		__rwsem_set_reader_owned(sem, owner);
482  	}
483  
484  	/*
485  	 * Grant up to MAX_READERS_WAKEUP read locks to all the readers in the
486  	 * queue. We know that the number woken will be at least 1 as we accounted
487  	 * for above. Note we increment the 'active part' of the count by the
488  	 * number of readers before waking any processes up.
489  	 *
490  	 * This is an adaptation of the phase-fair R/W locks where at the
491  	 * reader phase (first waiter is a reader), all readers are eligible
492  	 * to acquire the lock at the same time irrespective of their order
493  	 * in the queue. The writers acquire the lock according to their
494  	 * order in the queue.
495  	 *
496  	 * We have to do wakeup in 2 passes to prevent the possibility that
497  	 * the reader count may be decremented before it is incremented. It
498  	 * is because the to-be-woken waiter may not have slept yet. So it
499  	 * may see waiter->task cleared, finish its critical section and
500  	 * do an unlock before the reader count is incremented.
501  	 *
502  	 * 1) Collect the read-waiters in a separate list, count them and
503  	 *    fully increment the reader count in rwsem.
504  	 * 2) For each waiter in the new list, clear waiter->task and
505  	 *    put them into wake_q to be woken up later.
506  	 */
507  	INIT_LIST_HEAD(&wlist);
508  	list_for_each_entry_safe(waiter, tmp, &sem->wait_list, list) {
509  		if (waiter->type == RWSEM_WAITING_FOR_WRITE)
510  			continue;
511  
512  		woken++;
513  		list_move_tail(&waiter->list, &wlist);
514  
515  		/*
516  		 * Limit # of readers that can be woken up per wakeup call.
517  		 */
518  		if (unlikely(woken >= MAX_READERS_WAKEUP))
519  			break;
520  	}
521  
522  	adjustment = woken * RWSEM_READER_BIAS - adjustment;
523  	lockevent_cond_inc(rwsem_wake_reader, woken);
524  
525  	oldcount = atomic_long_read(&sem->count);
526  	if (list_empty(&sem->wait_list)) {
527  		/*
528  		 * Combined with list_move_tail() above, this implies
529  		 * rwsem_del_waiter().
530  		 */
531  		adjustment -= RWSEM_FLAG_WAITERS;
532  		if (oldcount & RWSEM_FLAG_HANDOFF)
533  			adjustment -= RWSEM_FLAG_HANDOFF;
534  	} else if (woken) {
535  		/*
536  		 * When we've woken a reader, we no longer need to force
537  		 * writers to give up the lock and we can clear HANDOFF.
538  		 */
539  		if (oldcount & RWSEM_FLAG_HANDOFF)
540  			adjustment -= RWSEM_FLAG_HANDOFF;
541  	}
542  
543  	if (adjustment)
544  		atomic_long_add(adjustment, &sem->count);
545  
546  	/* 2nd pass */
547  	list_for_each_entry_safe(waiter, tmp, &wlist, list) {
548  		struct task_struct *tsk;
549  
550  		tsk = waiter->task;
551  		get_task_struct(tsk);
552  
553  		/*
554  		 * Ensure calling get_task_struct() before setting the reader
555  		 * waiter to nil such that rwsem_down_read_slowpath() cannot
556  		 * race with do_exit() by always holding a reference count
557  		 * to the task to wakeup.
558  		 */
559  		smp_store_release(&waiter->task, NULL);
560  		/*
561  		 * Ensure issuing the wakeup (either by us or someone else)
562  		 * after setting the reader waiter to nil.
563  		 */
564  		wake_q_add_safe(wake_q, tsk);
565  	}
566  }
567  
568  /*
569   * Remove a waiter and try to wake up other waiters in the wait queue.
570   * This function is called from the out_nolock path of both the reader and
571   * writer slowpaths with wait_lock held. It releases the wait_lock and
572   * optionally wakes up waiters before it returns.
573   */
574  static inline void
575  rwsem_del_wake_waiter(struct rw_semaphore *sem, struct rwsem_waiter *waiter,
576  		      struct wake_q_head *wake_q)
577  		      __releases(&sem->wait_lock)
578  {
579  	bool first = rwsem_first_waiter(sem) == waiter;
580  
581  	wake_q_init(wake_q);
582  
583  	/*
584  	 * If the wait_list isn't empty and the waiter to be deleted is
585  	 * the first waiter, we wake up the remaining waiters as they may
586  	 * be eligible to acquire or spin on the lock.
587  	 */
588  	if (rwsem_del_waiter(sem, waiter) && first)
589  		rwsem_mark_wake(sem, RWSEM_WAKE_ANY, wake_q);
590  	raw_spin_unlock_irq(&sem->wait_lock);
591  	if (!wake_q_empty(wake_q))
592  		wake_up_q(wake_q);
593  }
594  
595  /*
596   * This function must be called with the sem->wait_lock held to prevent
597   * race conditions between checking the rwsem wait list and setting the
598   * sem->count accordingly.
599   *
600   * Implies rwsem_del_waiter() on success.
601   */
602  static inline bool rwsem_try_write_lock(struct rw_semaphore *sem,
603  					struct rwsem_waiter *waiter)
604  {
605  	struct rwsem_waiter *first = rwsem_first_waiter(sem);
606  	long count, new;
607  
608  	lockdep_assert_held(&sem->wait_lock);
609  
610  	count = atomic_long_read(&sem->count);
611  	do {
612  		bool has_handoff = !!(count & RWSEM_FLAG_HANDOFF);
613  
614  		if (has_handoff) {
615  			/*
616  			 * Honor handoff bit and yield only when the first
617  			 * waiter is the one that set it. Otherwise, we
618  			 * still try to acquire the rwsem.
619  			 */
620  			if (first->handoff_set && (waiter != first))
621  				return false;
622  		}
623  
624  		new = count;
625  
626  		if (count & RWSEM_LOCK_MASK) {
627  			/*
628  			 * A waiter (first or not) can set the handoff bit
629  			 * if it is an RT task or has waited in the wait queue
630  			 * for too long.
631  			 */
632  			if (has_handoff || (!rt_or_dl_task(waiter->task) &&
633  					    !time_after(jiffies, waiter->timeout)))
634  				return false;
635  
636  			new |= RWSEM_FLAG_HANDOFF;
637  		} else {
638  			new |= RWSEM_WRITER_LOCKED;
639  			new &= ~RWSEM_FLAG_HANDOFF;
640  
641  			if (list_is_singular(&sem->wait_list))
642  				new &= ~RWSEM_FLAG_WAITERS;
643  		}
644  	} while (!atomic_long_try_cmpxchg_acquire(&sem->count, &count, new));
645  
646  	/*
647  	 * We have either acquired the lock with handoff bit cleared or set
648  	 * the handoff bit. Only the first waiter can have its handoff_set
649  	 * set here to enable optimistic spinning in slowpath loop.
650  	 */
651  	if (new & RWSEM_FLAG_HANDOFF) {
652  		first->handoff_set = true;
653  		lockevent_inc(rwsem_wlock_handoff);
654  		return false;
655  	}
656  
657  	/*
658  	 * Have rwsem_try_write_lock() fully imply rwsem_del_waiter() on
659  	 * success.
660  	 */
661  	list_del(&waiter->list);
662  	rwsem_set_owner(sem);
663  	return true;
664  }
665  
666  /*
667   * The rwsem_spin_on_owner() function returns the following 4 values
668   * depending on the lock owner state.
669   *   OWNER_NULL  : owner is currently NULL
670   *   OWNER_WRITER: when owner changes and is a writer
671   *   OWNER_READER: when owner changes and the new owner may be a reader.
672   *   OWNER_NONSPINNABLE:
673   *		   when optimistic spinning has to stop because either the
674   *		   owner stops running, is unknown, or its timeslice has
675   *		   been used up.
676   */
677  enum owner_state {
678  	OWNER_NULL		= 1 << 0,
679  	OWNER_WRITER		= 1 << 1,
680  	OWNER_READER		= 1 << 2,
681  	OWNER_NONSPINNABLE	= 1 << 3,
682  };
683  
684  #ifdef CONFIG_RWSEM_SPIN_ON_OWNER
685  /*
686   * Try to acquire write lock before the writer has been put on wait queue.
687   */
688  static inline bool rwsem_try_write_lock_unqueued(struct rw_semaphore *sem)
689  {
690  	long count = atomic_long_read(&sem->count);
691  
692  	while (!(count & (RWSEM_LOCK_MASK|RWSEM_FLAG_HANDOFF))) {
693  		if (atomic_long_try_cmpxchg_acquire(&sem->count, &count,
694  					count | RWSEM_WRITER_LOCKED)) {
695  			rwsem_set_owner(sem);
696  			lockevent_inc(rwsem_opt_lock);
697  			return true;
698  		}
699  	}
700  	return false;
701  }
702  
703  static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem)
704  {
705  	struct task_struct *owner;
706  	unsigned long flags;
707  	bool ret = true;
708  
709  	if (need_resched()) {
710  		lockevent_inc(rwsem_opt_fail);
711  		return false;
712  	}
713  
714  	/*
715  	 * Disabling preemption is equivalent to an RCU read-side critical
716  	 * section, thus the task_struct structure won't go away.
717  	 */
718  	owner = rwsem_owner_flags(sem, &flags);
719  	/*
720  	 * Don't check the read-owner as the entry may be stale.
721  	 */
722  	if ((flags & RWSEM_NONSPINNABLE) ||
723  	    (owner && !(flags & RWSEM_READER_OWNED) && !owner_on_cpu(owner)))
724  		ret = false;
725  
726  	lockevent_cond_inc(rwsem_opt_fail, !ret);
727  	return ret;
728  }
729  
730  #define OWNER_SPINNABLE		(OWNER_NULL | OWNER_WRITER | OWNER_READER)
731  
732  static inline enum owner_state
733  rwsem_owner_state(struct task_struct *owner, unsigned long flags)
734  {
735  	if (flags & RWSEM_NONSPINNABLE)
736  		return OWNER_NONSPINNABLE;
737  
738  	if (flags & RWSEM_READER_OWNED)
739  		return OWNER_READER;
740  
741  	return owner ? OWNER_WRITER : OWNER_NULL;
742  }
743  
744  static noinline enum owner_state
745  rwsem_spin_on_owner(struct rw_semaphore *sem)
746  {
747  	struct task_struct *new, *owner;
748  	unsigned long flags, new_flags;
749  	enum owner_state state;
750  
751  	lockdep_assert_preemption_disabled();
752  
753  	owner = rwsem_owner_flags(sem, &flags);
754  	state = rwsem_owner_state(owner, flags);
755  	if (state != OWNER_WRITER)
756  		return state;
757  
758  	for (;;) {
759  		/*
760  		 * When a waiting writer sets the handoff flag, it may spin
761  		 * on the owner as well. Once that writer acquires the lock,
762  		 * we can spin on it. So we don't need to quit even when the
763  		 * handoff bit is set.
764  		 */
765  		new = rwsem_owner_flags(sem, &new_flags);
766  		if ((new != owner) || (new_flags != flags)) {
767  			state = rwsem_owner_state(new, new_flags);
768  			break;
769  		}
770  
771  		/*
772  		 * Ensure we emit the owner->on_cpu dereference _after_
773  		 * checking that sem->owner still matches owner. If that fails,
774  		 * owner might point to free()d memory. If it still matches,
775  		 * our spinning context has already disabled preemption, which
776  		 * is equivalent to an RCU read-side critical section and
777  		 * ensures the memory stays valid.
778  		 */
779  		barrier();
780  
781  		if (need_resched() || !owner_on_cpu(owner)) {
782  			state = OWNER_NONSPINNABLE;
783  			break;
784  		}
785  
786  		cpu_relax();
787  	}
788  
789  	return state;
790  }
791  
792  /*
793   * Calculate reader-owned rwsem spinning threshold for writer
794   *
795   * The more readers own the rwsem, the longer it will take for them to
796   * wind down and free the rwsem. So the empirical formula used to
797   * determine the actual spinning time limit here is:
798   *
799   *   Spinning threshold = (10 + nr_readers/2)us
800   *
801   * The limit is capped to a maximum of 25us (30 readers). This is just
802   * a heuristic and is subject to change in the future.
803   */
804  static inline u64 rwsem_rspin_threshold(struct rw_semaphore *sem)
805  {
806  	long count = atomic_long_read(&sem->count);
807  	int readers = count >> RWSEM_READER_SHIFT;
808  	u64 delta;
809  
810  	if (readers > 30)
811  		readers = 30;
812  	delta = (20 + readers) * NSEC_PER_USEC / 2;
813  
814  	return sched_clock() + delta;
815  }
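/*
 * A worked example of the threshold computation above (illustrative
 * numbers only): with 8 readers holding the lock,
 * delta = (20 + 8) * NSEC_PER_USEC / 2 = 14000ns, i.e. the (10 + 8/2)us
 * = 14us budget from the formula in the comment. With 40 readers the
 * count is clamped to 30 and the budget is the 25us maximum.
 */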
816  
817  static bool rwsem_optimistic_spin(struct rw_semaphore *sem)
818  {
819  	bool taken = false;
820  	int prev_owner_state = OWNER_NULL;
821  	int loop = 0;
822  	u64 rspin_threshold = 0;
823  
824  	/* sem->wait_lock should not be held when doing optimistic spinning */
825  	if (!osq_lock(&sem->osq))
826  		goto done;
827  
828  	/*
829  	 * Optimistically spin on the owner field and attempt to acquire the
830  	 * lock whenever the owner changes. Spinning will be stopped when:
831  	 *  1) the owning writer isn't running; or
832  	 *  2) readers own the lock and spinning time has exceeded the limit.
833  	 */
834  	for (;;) {
835  		enum owner_state owner_state;
836  
837  		owner_state = rwsem_spin_on_owner(sem);
838  		if (!(owner_state & OWNER_SPINNABLE))
839  			break;
840  
841  		/*
842  		 * Try to acquire the lock
843  		 */
844  		taken = rwsem_try_write_lock_unqueued(sem);
845  
846  		if (taken)
847  			break;
848  
849  		/*
850  		 * Time-based reader-owned rwsem optimistic spinning
851  		 */
852  		if (owner_state == OWNER_READER) {
853  			/*
854  			 * Re-initialize rspin_threshold every time
855  			 * the owner state changes from non-reader to reader.
856  			 * This allows a writer to steal the lock in between
857  			 * 2 reader phases and have the threshold reset at
858  			 * the beginning of the 2nd reader phase.
859  			 */
860  			if (prev_owner_state != OWNER_READER) {
861  				if (rwsem_test_oflags(sem, RWSEM_NONSPINNABLE))
862  					break;
863  				rspin_threshold = rwsem_rspin_threshold(sem);
864  				loop = 0;
865  			}
866  
867  			/*
868  			 * Check time threshold once every 16 iterations to
869  			 * avoid calling sched_clock() too frequently so
870  			 * as to reduce the average latency between the times
871  			 * when the lock becomes free and when the spinner
872  			 * is ready to do a trylock.
873  			 */
874  			else if (!(++loop & 0xf) && (sched_clock() > rspin_threshold)) {
875  				rwsem_set_nonspinnable(sem);
876  				lockevent_inc(rwsem_opt_nospin);
877  				break;
878  			}
879  		}
880  
881  		/*
882  		 * An RT task cannot do optimistic spinning if it cannot
883  		 * be sure the lock holder is running or live-lock may
884  		 * happen if the current task and the lock holder happen
885  		 * to run on the same CPU. However, aborting optimistic
886  		 * spinning while a NULL owner is detected may miss some
887  		 * opportunities where spinning can continue without causing
888  		 * problems.
889  		 *
890  		 * There are 2 possible cases where an RT task may be able
891  		 * to continue spinning.
892  		 *
893  		 * 1) The lock owner is in the process of releasing the
894  		 *    lock, sem->owner is cleared but the lock has not
895  		 *    been released yet.
896  		 * 2) The lock was free and owner cleared, but another
897  		 *    task just comes in and acquires the lock before
898  		 *    we try to get it. The new owner may be a spinnable
899  		 *    writer.
900  		 *
901  		 * To take advantage of the two scenarios listed above, the RT
902  		 * task is made to retry one more time to see if it can
903  		 * acquire the lock or continue spinning on the new owning
904  		 * writer. Of course, if the time lag is long enough or the
905  		 * new owner is not a writer or spinnable, the RT task will
906  		 * quit spinning.
907  		 *
908  		 * If the owner is a writer, the need_resched() check is
909  		 * done inside rwsem_spin_on_owner(). If the owner is not
910  		 * a writer, need_resched() check needs to be done here.
911  		 */
912  		if (owner_state != OWNER_WRITER) {
913  			if (need_resched())
914  				break;
915  			if (rt_or_dl_task(current) &&
916  			   (prev_owner_state != OWNER_WRITER))
917  				break;
918  		}
919  		prev_owner_state = owner_state;
920  
921  		/*
922  		 * The cpu_relax() call is a compiler barrier which forces
923  		 * everything in this loop to be re-loaded. We don't need
924  		 * memory barriers as we'll eventually observe the right
925  		 * values at the cost of a few extra spins.
926  		 */
927  		cpu_relax();
928  	}
929  	osq_unlock(&sem->osq);
930  done:
931  	lockevent_cond_inc(rwsem_opt_fail, !taken);
932  	return taken;
933  }
934  
935  /*
936   * Clear the owner's RWSEM_NONSPINNABLE bit if it is set. This should
937   * only be called when the reader count reaches 0.
938   */
939  static inline void clear_nonspinnable(struct rw_semaphore *sem)
940  {
941  	if (unlikely(rwsem_test_oflags(sem, RWSEM_NONSPINNABLE)))
942  		atomic_long_andnot(RWSEM_NONSPINNABLE, &sem->owner);
943  }
944  
945  #else
946  static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem)
947  {
948  	return false;
949  }
950  
951  static inline bool rwsem_optimistic_spin(struct rw_semaphore *sem)
952  {
953  	return false;
954  }
955  
956  static inline void clear_nonspinnable(struct rw_semaphore *sem) { }
957  
958  static inline enum owner_state
959  rwsem_spin_on_owner(struct rw_semaphore *sem)
960  {
961  	return OWNER_NONSPINNABLE;
962  }
963  #endif
964  
965  /*
966   * Prepare to wake up waiter(s) in the wait queue by putting them into the
967   * given wake_q if the rwsem lock owner isn't a writer. If rwsem is likely
968   * reader-owned, wake up the read lock waiters at the front of the queue;
969   * otherwise wake up whichever waiter is at the front.
970   *
971   * This is being called from both reader and writer slow paths.
972   */
973  static inline void rwsem_cond_wake_waiter(struct rw_semaphore *sem, long count,
974  					  struct wake_q_head *wake_q)
975  {
976  	enum rwsem_wake_type wake_type;
977  
978  	if (count & RWSEM_WRITER_MASK)
979  		return;
980  
981  	if (count & RWSEM_READER_MASK) {
982  		wake_type = RWSEM_WAKE_READERS;
983  	} else {
984  		wake_type = RWSEM_WAKE_ANY;
985  		clear_nonspinnable(sem);
986  	}
987  	rwsem_mark_wake(sem, wake_type, wake_q);
988  }
989  
990  /*
991   * Wait for the read lock to be granted
992   */
993  static struct rw_semaphore __sched *
994  rwsem_down_read_slowpath(struct rw_semaphore *sem, long count, unsigned int state)
995  {
996  	long adjustment = -RWSEM_READER_BIAS;
997  	long rcnt = (count >> RWSEM_READER_SHIFT);
998  	struct rwsem_waiter waiter;
999  	DEFINE_WAKE_Q(wake_q);
1000  
1001  	/*
1002  	 * To prevent a constant stream of readers from starving a sleeping
1003  	 * writer, don't attempt optimistic lock stealing if the lock is
1004  	 * very likely owned by readers.
1005  	 */
1006  	if ((atomic_long_read(&sem->owner) & RWSEM_READER_OWNED) &&
1007  	    (rcnt > 1) && !(count & RWSEM_WRITER_LOCKED))
1008  		goto queue;
1009  
1010  	/*
1011  	 * Reader optimistic lock stealing.
1012  	 */
1013  	if (!(count & (RWSEM_WRITER_LOCKED | RWSEM_FLAG_HANDOFF))) {
1014  		rwsem_set_reader_owned(sem);
1015  		lockevent_inc(rwsem_rlock_steal);
1016  
1017  		/*
1018  		 * Wake up other readers in the wait queue if it is
1019  		 * the first reader.
1020  		 */
1021  		if ((rcnt == 1) && (count & RWSEM_FLAG_WAITERS)) {
1022  			raw_spin_lock_irq(&sem->wait_lock);
1023  			if (!list_empty(&sem->wait_list))
1024  				rwsem_mark_wake(sem, RWSEM_WAKE_READ_OWNED,
1025  						&wake_q);
1026  			raw_spin_unlock_irq(&sem->wait_lock);
1027  			wake_up_q(&wake_q);
1028  		}
1029  		return sem;
1030  	}
1031  
1032  queue:
1033  	waiter.task = current;
1034  	waiter.type = RWSEM_WAITING_FOR_READ;
1035  	waiter.timeout = jiffies + RWSEM_WAIT_TIMEOUT;
1036  	waiter.handoff_set = false;
1037  
1038  	raw_spin_lock_irq(&sem->wait_lock);
1039  	if (list_empty(&sem->wait_list)) {
1040  		/*
1041  		 * If the wait queue is empty and the lock isn't owned
1042  		 * by a writer, this reader can exit the slowpath and return
1043  		 * immediately as its RWSEM_READER_BIAS has already been set
1044  		 * in the count.
1045  		 */
1046  		if (!(atomic_long_read(&sem->count) & RWSEM_WRITER_MASK)) {
1047  			/* Provide lock ACQUIRE */
1048  			smp_acquire__after_ctrl_dep();
1049  			raw_spin_unlock_irq(&sem->wait_lock);
1050  			rwsem_set_reader_owned(sem);
1051  			lockevent_inc(rwsem_rlock_fast);
1052  			return sem;
1053  		}
1054  		adjustment += RWSEM_FLAG_WAITERS;
1055  	}
1056  	rwsem_add_waiter(sem, &waiter);
1057  
1058  	/* we're now waiting on the lock, but no longer actively locking */
1059  	count = atomic_long_add_return(adjustment, &sem->count);
1060  
1061  	rwsem_cond_wake_waiter(sem, count, &wake_q);
1062  	raw_spin_unlock_irq(&sem->wait_lock);
1063  
1064  	if (!wake_q_empty(&wake_q))
1065  		wake_up_q(&wake_q);
1066  
1067  	trace_contention_begin(sem, LCB_F_READ);
1068  
1069  	/* wait to be given the lock */
1070  	for (;;) {
1071  		set_current_state(state);
1072  		if (!smp_load_acquire(&waiter.task)) {
1073  			/* Matches rwsem_mark_wake()'s smp_store_release(). */
1074  			break;
1075  		}
1076  		if (signal_pending_state(state, current)) {
1077  			raw_spin_lock_irq(&sem->wait_lock);
1078  			if (waiter.task)
1079  				goto out_nolock;
1080  			raw_spin_unlock_irq(&sem->wait_lock);
1081  			/* Ordered by sem->wait_lock against rwsem_mark_wake(). */
1082  			break;
1083  		}
1084  		schedule_preempt_disabled();
1085  		lockevent_inc(rwsem_sleep_reader);
1086  	}
1087  
1088  	__set_current_state(TASK_RUNNING);
1089  	lockevent_inc(rwsem_rlock);
1090  	trace_contention_end(sem, 0);
1091  	return sem;
1092  
1093  out_nolock:
1094  	rwsem_del_wake_waiter(sem, &waiter, &wake_q);
1095  	__set_current_state(TASK_RUNNING);
1096  	lockevent_inc(rwsem_rlock_fail);
1097  	trace_contention_end(sem, -EINTR);
1098  	return ERR_PTR(-EINTR);
1099  }
1100  
1101  /*
1102   * Wait until we successfully acquire the write lock
1103   */
1104  static struct rw_semaphore __sched *
1105  rwsem_down_write_slowpath(struct rw_semaphore *sem, int state)
1106  {
1107  	struct rwsem_waiter waiter;
1108  	DEFINE_WAKE_Q(wake_q);
1109  
1110  	/* do optimistic spinning and steal lock if possible */
1111  	if (rwsem_can_spin_on_owner(sem) && rwsem_optimistic_spin(sem)) {
1112  		/* rwsem_optimistic_spin() implies ACQUIRE on success */
1113  		return sem;
1114  	}
1115  
1116  	/*
1117  	 * Optimistic spinning failed, proceed to the slowpath
1118  	 * and block until we can acquire the sem.
1119  	 */
1120  	waiter.task = current;
1121  	waiter.type = RWSEM_WAITING_FOR_WRITE;
1122  	waiter.timeout = jiffies + RWSEM_WAIT_TIMEOUT;
1123  	waiter.handoff_set = false;
1124  
1125  	raw_spin_lock_irq(&sem->wait_lock);
1126  	rwsem_add_waiter(sem, &waiter);
1127  
1128  	/* we're now waiting on the lock */
1129  	if (rwsem_first_waiter(sem) != &waiter) {
1130  		rwsem_cond_wake_waiter(sem, atomic_long_read(&sem->count),
1131  				       &wake_q);
1132  		if (!wake_q_empty(&wake_q)) {
1133  			/*
1134  			 * We want to minimize wait_lock hold time especially
1135  			 * when a large number of readers are to be woken up.
1136  			 */
1137  			raw_spin_unlock_irq(&sem->wait_lock);
1138  			wake_up_q(&wake_q);
1139  			raw_spin_lock_irq(&sem->wait_lock);
1140  		}
1141  	} else {
1142  		atomic_long_or(RWSEM_FLAG_WAITERS, &sem->count);
1143  	}
1144  
1145  	/* wait until we successfully acquire the lock */
1146  	set_current_state(state);
1147  	trace_contention_begin(sem, LCB_F_WRITE);
1148  
1149  	for (;;) {
1150  		if (rwsem_try_write_lock(sem, &waiter)) {
1151  			/* rwsem_try_write_lock() implies ACQUIRE on success */
1152  			break;
1153  		}
1154  
1155  		raw_spin_unlock_irq(&sem->wait_lock);
1156  
1157  		if (signal_pending_state(state, current))
1158  			goto out_nolock;
1159  
1160  		/*
1161  		 * After setting the handoff bit and failing to acquire
1162  		 * the lock, attempt to spin on owner to accelerate lock
1163  		 * transfer. If the previous owner is an on-cpu writer and it
1164  		 * has just released the lock, OWNER_NULL will be returned.
1165  		 * In this case, we attempt to acquire the lock again
1166  		 * without sleeping.
1167  		 */
1168  		if (waiter.handoff_set) {
1169  			enum owner_state owner_state;
1170  
1171  			owner_state = rwsem_spin_on_owner(sem);
1172  			if (owner_state == OWNER_NULL)
1173  				goto trylock_again;
1174  		}
1175  
1176  		schedule_preempt_disabled();
1177  		lockevent_inc(rwsem_sleep_writer);
1178  		set_current_state(state);
1179  trylock_again:
1180  		raw_spin_lock_irq(&sem->wait_lock);
1181  	}
1182  	__set_current_state(TASK_RUNNING);
1183  	raw_spin_unlock_irq(&sem->wait_lock);
1184  	lockevent_inc(rwsem_wlock);
1185  	trace_contention_end(sem, 0);
1186  	return sem;
1187  
1188  out_nolock:
1189  	__set_current_state(TASK_RUNNING);
1190  	raw_spin_lock_irq(&sem->wait_lock);
1191  	rwsem_del_wake_waiter(sem, &waiter, &wake_q);
1192  	lockevent_inc(rwsem_wlock_fail);
1193  	trace_contention_end(sem, -EINTR);
1194  	return ERR_PTR(-EINTR);
1195  }
1196  
1197  /*
1198   * handle waking up a waiter on the semaphore
1199   * - up_read/up_write has decremented the active part of count if we come here
1200   */
1201  static struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem)
1202  {
1203  	unsigned long flags;
1204  	DEFINE_WAKE_Q(wake_q);
1205  
1206  	raw_spin_lock_irqsave(&sem->wait_lock, flags);
1207  
1208  	if (!list_empty(&sem->wait_list))
1209  		rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q);
1210  
1211  	raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
1212  	wake_up_q(&wake_q);
1213  
1214  	return sem;
1215  }
1216  
1217  /*
1218   * downgrade a write lock into a read lock
1219   * - caller has replaced its writer hold with a reader hold and found waiters
1220   * - just wake up any readers at the front of the queue
1221   */
1222  static struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem)
1223  {
1224  	unsigned long flags;
1225  	DEFINE_WAKE_Q(wake_q);
1226  
1227  	raw_spin_lock_irqsave(&sem->wait_lock, flags);
1228  
1229  	if (!list_empty(&sem->wait_list))
1230  		rwsem_mark_wake(sem, RWSEM_WAKE_READ_OWNED, &wake_q);
1231  
1232  	raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
1233  	wake_up_q(&wake_q);
1234  
1235  	return sem;
1236  }
1237  
1238  /*
1239   * lock for reading
1240   */
1241  static __always_inline int __down_read_common(struct rw_semaphore *sem, int state)
1242  {
1243  	int ret = 0;
1244  	long count;
1245  
1246  	preempt_disable();
1247  	if (!rwsem_read_trylock(sem, &count)) {
1248  		if (IS_ERR(rwsem_down_read_slowpath(sem, count, state))) {
1249  			ret = -EINTR;
1250  			goto out;
1251  		}
1252  		DEBUG_RWSEMS_WARN_ON(!is_rwsem_reader_owned(sem), sem);
1253  	}
1254  out:
1255  	preempt_enable();
1256  	return ret;
1257  }
1258  
1259  static __always_inline void __down_read(struct rw_semaphore *sem)
1260  {
1261  	__down_read_common(sem, TASK_UNINTERRUPTIBLE);
1262  }
1263  
1264  static __always_inline int __down_read_interruptible(struct rw_semaphore *sem)
1265  {
1266  	return __down_read_common(sem, TASK_INTERRUPTIBLE);
1267  }
1268  
1269  static __always_inline int __down_read_killable(struct rw_semaphore *sem)
1270  {
1271  	return __down_read_common(sem, TASK_KILLABLE);
1272  }
1273  
1274  static inline int __down_read_trylock(struct rw_semaphore *sem)
1275  {
1276  	int ret = 0;
1277  	long tmp;
1278  
1279  	DEBUG_RWSEMS_WARN_ON(sem->magic != sem, sem);
1280  
1281  	preempt_disable();
1282  	tmp = atomic_long_read(&sem->count);
1283  	while (!(tmp & RWSEM_READ_FAILED_MASK)) {
1284  		if (atomic_long_try_cmpxchg_acquire(&sem->count, &tmp,
1285  						    tmp + RWSEM_READER_BIAS)) {
1286  			rwsem_set_reader_owned(sem);
1287  			ret = 1;
1288  			break;
1289  		}
1290  	}
1291  	preempt_enable();
1292  	return ret;
1293  }
1294  
1295  /*
1296   * lock for writing
1297   */
1298  static __always_inline int __down_write_common(struct rw_semaphore *sem, int state)
1299  {
1300  	int ret = 0;
1301  
1302  	preempt_disable();
1303  	if (unlikely(!rwsem_write_trylock(sem))) {
1304  		if (IS_ERR(rwsem_down_write_slowpath(sem, state)))
1305  			ret = -EINTR;
1306  	}
1307  	preempt_enable();
1308  	return ret;
1309  }
1310  
1311  static __always_inline void __down_write(struct rw_semaphore *sem)
1312  {
1313  	__down_write_common(sem, TASK_UNINTERRUPTIBLE);
1314  }
1315  
1316  static __always_inline int __down_write_killable(struct rw_semaphore *sem)
1317  {
1318  	return __down_write_common(sem, TASK_KILLABLE);
1319  }
1320  
1321  static inline int __down_write_trylock(struct rw_semaphore *sem)
1322  {
1323  	int ret;
1324  
1325  	preempt_disable();
1326  	DEBUG_RWSEMS_WARN_ON(sem->magic != sem, sem);
1327  	ret = rwsem_write_trylock(sem);
1328  	preempt_enable();
1329  
1330  	return ret;
1331  }
1332  
1333  /*
1334   * unlock after reading
1335   */
1336  static inline void __up_read(struct rw_semaphore *sem)
1337  {
1338  	long tmp;
1339  
1340  	DEBUG_RWSEMS_WARN_ON(sem->magic != sem, sem);
1341  	DEBUG_RWSEMS_WARN_ON(!is_rwsem_reader_owned(sem), sem);
1342  
1343  	preempt_disable();
1344  	rwsem_clear_reader_owned(sem);
1345  	tmp = atomic_long_add_return_release(-RWSEM_READER_BIAS, &sem->count);
1346  	DEBUG_RWSEMS_WARN_ON(tmp < 0, sem);
1347  	if (unlikely((tmp & (RWSEM_LOCK_MASK|RWSEM_FLAG_WAITERS)) ==
1348  		      RWSEM_FLAG_WAITERS)) {
1349  		clear_nonspinnable(sem);
1350  		rwsem_wake(sem);
1351  	}
1352  	preempt_enable();
1353  }
1354  
1355  /*
1356   * unlock after writing
1357   */
1358  static inline void __up_write(struct rw_semaphore *sem)
1359  {
1360  	long tmp;
1361  
1362  	DEBUG_RWSEMS_WARN_ON(sem->magic != sem, sem);
1363  	/*
1364  	 * sem->owner may differ from current if the ownership is transferred
1365  	 * to an anonymous writer by setting the RWSEM_NONSPINNABLE bit.
1366  	 */
1367  	DEBUG_RWSEMS_WARN_ON((rwsem_owner(sem) != current) &&
1368  			    !rwsem_test_oflags(sem, RWSEM_NONSPINNABLE), sem);
1369  
1370  	preempt_disable();
1371  	rwsem_clear_owner(sem);
1372  	tmp = atomic_long_fetch_add_release(-RWSEM_WRITER_LOCKED, &sem->count);
1373  	if (unlikely(tmp & RWSEM_FLAG_WAITERS))
1374  		rwsem_wake(sem);
1375  	preempt_enable();
1376  }
1377  
1378  /*
1379   * downgrade write lock to read lock
1380   */
1381  static inline void __downgrade_write(struct rw_semaphore *sem)
1382  {
1383  	long tmp;
1384  
1385  	/*
1386  	 * When downgrading from exclusive to shared ownership,
1387  	 * anything inside the write-locked region cannot leak
1388  	 * into the read side. In contrast, anything in the
1389  	 * read-locked region is ok to be re-ordered into the
1390  	 * write side. As such, rely on RELEASE semantics.
1391  	 */
1392  	DEBUG_RWSEMS_WARN_ON(rwsem_owner(sem) != current, sem);
1393  	preempt_disable();
1394  	tmp = atomic_long_fetch_add_release(
1395  		-RWSEM_WRITER_LOCKED+RWSEM_READER_BIAS, &sem->count);
1396  	rwsem_set_reader_owned(sem);
1397  	if (tmp & RWSEM_FLAG_WAITERS)
1398  		rwsem_downgrade_wake(sem);
1399  	preempt_enable();
1400  }
1401  
1402  #else /* !CONFIG_PREEMPT_RT */
1403  
1404  #define RT_MUTEX_BUILD_MUTEX
1405  #include "rtmutex.c"
1406  
1407  #define rwbase_set_and_save_current_state(state)	\
1408  	set_current_state(state)
1409  
1410  #define rwbase_restore_current_state()			\
1411  	__set_current_state(TASK_RUNNING)
1412  
1413  #define rwbase_rtmutex_lock_state(rtm, state)		\
1414  	__rt_mutex_lock(rtm, state)
1415  
1416  #define rwbase_rtmutex_slowlock_locked(rtm, state, wq)	\
1417  	__rt_mutex_slowlock_locked(rtm, NULL, state, wq)
1418  
1419  #define rwbase_rtmutex_unlock(rtm)			\
1420  	__rt_mutex_unlock(rtm)
1421  
1422  #define rwbase_rtmutex_trylock(rtm)			\
1423  	__rt_mutex_trylock(rtm)
1424  
1425  #define rwbase_signal_pending_state(state, current)	\
1426  	signal_pending_state(state, current)
1427  
1428  #define rwbase_pre_schedule()				\
1429  	rt_mutex_pre_schedule()
1430  
1431  #define rwbase_schedule()				\
1432  	rt_mutex_schedule()
1433  
1434  #define rwbase_post_schedule()				\
1435  	rt_mutex_post_schedule()
1436  
1437  #include "rwbase_rt.c"
1438  
1439  void __init_rwsem(struct rw_semaphore *sem, const char *name,
1440  		  struct lock_class_key *key)
1441  {
1442  	init_rwbase_rt(&(sem)->rwbase);
1443  
1444  #ifdef CONFIG_DEBUG_LOCK_ALLOC
1445  	debug_check_no_locks_freed((void *)sem, sizeof(*sem));
1446  	lockdep_init_map_wait(&sem->dep_map, name, key, 0, LD_WAIT_SLEEP);
1447  #endif
1448  }
1449  EXPORT_SYMBOL(__init_rwsem);
1450  
1451  static inline void __down_read(struct rw_semaphore *sem)
1452  {
1453  	rwbase_read_lock(&sem->rwbase, TASK_UNINTERRUPTIBLE);
1454  }
1455  
1456  static inline int __down_read_interruptible(struct rw_semaphore *sem)
1457  {
1458  	return rwbase_read_lock(&sem->rwbase, TASK_INTERRUPTIBLE);
1459  }
1460  
1461  static inline int __down_read_killable(struct rw_semaphore *sem)
1462  {
1463  	return rwbase_read_lock(&sem->rwbase, TASK_KILLABLE);
1464  }
1465  
1466  static inline int __down_read_trylock(struct rw_semaphore *sem)
1467  {
1468  	return rwbase_read_trylock(&sem->rwbase);
1469  }
1470  
1471  static inline void __up_read(struct rw_semaphore *sem)
1472  {
1473  	rwbase_read_unlock(&sem->rwbase, TASK_NORMAL);
1474  }
1475  
1476  static inline void __sched __down_write(struct rw_semaphore *sem)
1477  {
1478  	rwbase_write_lock(&sem->rwbase, TASK_UNINTERRUPTIBLE);
1479  }
1480  
1481  static inline int __sched __down_write_killable(struct rw_semaphore *sem)
1482  {
1483  	return rwbase_write_lock(&sem->rwbase, TASK_KILLABLE);
1484  }
1485  
1486  static inline int __down_write_trylock(struct rw_semaphore *sem)
1487  {
1488  	return rwbase_write_trylock(&sem->rwbase);
1489  }
1490  
1491  static inline void __up_write(struct rw_semaphore *sem)
1492  {
1493  	rwbase_write_unlock(&sem->rwbase);
1494  }
1495  
1496  static inline void __downgrade_write(struct rw_semaphore *sem)
1497  {
1498  	rwbase_write_downgrade(&sem->rwbase);
1499  }
1500  
1501  /* Debug stubs for the common API */
1502  #define DEBUG_RWSEMS_WARN_ON(c, sem)
1503  
1504  static inline void __rwsem_set_reader_owned(struct rw_semaphore *sem,
1505  					    struct task_struct *owner)
1506  {
1507  }
1508  
1509  static inline bool is_rwsem_reader_owned(struct rw_semaphore *sem)
1510  {
1511  	int count = atomic_read(&sem->rwbase.readers);
1512  
1513  	return count < 0 && count != READER_BIAS;
1514  }
1515  
1516  #endif /* CONFIG_PREEMPT_RT */
1517  
1518  /*
1519   * lock for reading
1520   */
1521  void __sched down_read(struct rw_semaphore *sem)
1522  {
1523  	might_sleep();
1524  	rwsem_acquire_read(&sem->dep_map, 0, 0, _RET_IP_);
1525  
1526  	LOCK_CONTENDED(sem, __down_read_trylock, __down_read);
1527  }
1528  EXPORT_SYMBOL(down_read);
1529  
1530  int __sched down_read_interruptible(struct rw_semaphore *sem)
1531  {
1532  	might_sleep();
1533  	rwsem_acquire_read(&sem->dep_map, 0, 0, _RET_IP_);
1534  
1535  	if (LOCK_CONTENDED_RETURN(sem, __down_read_trylock, __down_read_interruptible)) {
1536  		rwsem_release(&sem->dep_map, _RET_IP_);
1537  		return -EINTR;
1538  	}
1539  
1540  	return 0;
1541  }
1542  EXPORT_SYMBOL(down_read_interruptible);
1543  
1544  int __sched down_read_killable(struct rw_semaphore *sem)
1545  {
1546  	might_sleep();
1547  	rwsem_acquire_read(&sem->dep_map, 0, 0, _RET_IP_);
1548  
1549  	if (LOCK_CONTENDED_RETURN(sem, __down_read_trylock, __down_read_killable)) {
1550  		rwsem_release(&sem->dep_map, _RET_IP_);
1551  		return -EINTR;
1552  	}
1553  
1554  	return 0;
1555  }
1556  EXPORT_SYMBOL(down_read_killable);
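/*
 * A minimal caller-side sketch (not part of this file; 'my_sem',
 * 'my_shared_value' and read_my_value() are hypothetical names used only
 * for illustration): the killable variant returns -EINTR when a fatal
 * signal interrupts the wait, in which case the lock was not taken and
 * up_read() must not be called.
 *
 *	static DECLARE_RWSEM(my_sem);
 *	static int my_shared_value;
 *
 *	static int read_my_value(int *out)
 *	{
 *		int err = down_read_killable(&my_sem);
 *
 *		if (err)
 *			return err;
 *		*out = my_shared_value;
 *		up_read(&my_sem);
 *		return 0;
 *	}
 */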
1557  
1558  /*
1559   * trylock for reading -- returns 1 if successful, 0 if contention
1560   */
1561  int down_read_trylock(struct rw_semaphore *sem)
1562  {
1563  	int ret = __down_read_trylock(sem);
1564  
1565  	if (ret == 1)
1566  		rwsem_acquire_read(&sem->dep_map, 0, 1, _RET_IP_);
1567  	return ret;
1568  }
1569  EXPORT_SYMBOL(down_read_trylock);
1570  
1571  /*
1572   * lock for writing
1573   */
1574  void __sched down_write(struct rw_semaphore *sem)
1575  {
1576  	might_sleep();
1577  	rwsem_acquire(&sem->dep_map, 0, 0, _RET_IP_);
1578  	LOCK_CONTENDED(sem, __down_write_trylock, __down_write);
1579  }
1580  EXPORT_SYMBOL(down_write);
1581  
1582  /*
1583   * lock for writing
1584   */
1585  int __sched down_write_killable(struct rw_semaphore *sem)
1586  {
1587  	might_sleep();
1588  	rwsem_acquire(&sem->dep_map, 0, 0, _RET_IP_);
1589  
1590  	if (LOCK_CONTENDED_RETURN(sem, __down_write_trylock,
1591  				  __down_write_killable)) {
1592  		rwsem_release(&sem->dep_map, _RET_IP_);
1593  		return -EINTR;
1594  	}
1595  
1596  	return 0;
1597  }
1598  EXPORT_SYMBOL(down_write_killable);
1599  
1600  /*
1601   * trylock for writing -- returns 1 if successful, 0 if contention
1602   */
1603  int down_write_trylock(struct rw_semaphore *sem)
1604  {
1605  	int ret = __down_write_trylock(sem);
1606  
1607  	if (ret == 1)
1608  		rwsem_acquire(&sem->dep_map, 0, 1, _RET_IP_);
1609  
1610  	return ret;
1611  }
1612  EXPORT_SYMBOL(down_write_trylock);
1613  
1614  /*
1615   * release a read lock
1616   */
1617  void up_read(struct rw_semaphore *sem)
1618  {
1619  	rwsem_release(&sem->dep_map, _RET_IP_);
1620  	__up_read(sem);
1621  }
1622  EXPORT_SYMBOL(up_read);
1623  
1624  /*
1625   * release a write lock
1626   */
1627  void up_write(struct rw_semaphore *sem)
1628  {
1629  	rwsem_release(&sem->dep_map, _RET_IP_);
1630  	__up_write(sem);
1631  }
1632  EXPORT_SYMBOL(up_write);
1633  
1634  /*
1635   * downgrade write lock to read lock
1636   */
1637  void downgrade_write(struct rw_semaphore *sem)
1638  {
1639  	lock_downgrade(&sem->dep_map, _RET_IP_);
1640  	__downgrade_write(sem);
1641  }
1642  EXPORT_SYMBOL(downgrade_write);
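/*
 * A minimal sketch of the downgrade pattern (illustrative only; 'my_sem',
 * 'my_shared_value', compute_new_value() and report_value() are hypothetical
 * names, not defined in this file): a writer updates the protected data,
 * then downgrades so that other readers may proceed while it keeps a stable
 * read-side view of what it just wrote, and finally releases with up_read()
 * rather than up_write().
 *
 *	down_write(&my_sem);
 *	my_shared_value = compute_new_value();
 *	downgrade_write(&my_sem);
 *	report_value(my_shared_value);
 *	up_read(&my_sem);
 */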
1643  
1644  #ifdef CONFIG_DEBUG_LOCK_ALLOC
1645  
1646  void down_read_nested(struct rw_semaphore *sem, int subclass)
1647  {
1648  	might_sleep();
1649  	rwsem_acquire_read(&sem->dep_map, subclass, 0, _RET_IP_);
1650  	LOCK_CONTENDED(sem, __down_read_trylock, __down_read);
1651  }
1652  EXPORT_SYMBOL(down_read_nested);
1653  
1654  int down_read_killable_nested(struct rw_semaphore *sem, int subclass)
1655  {
1656  	might_sleep();
1657  	rwsem_acquire_read(&sem->dep_map, subclass, 0, _RET_IP_);
1658  
1659  	if (LOCK_CONTENDED_RETURN(sem, __down_read_trylock, __down_read_killable)) {
1660  		rwsem_release(&sem->dep_map, _RET_IP_);
1661  		return -EINTR;
1662  	}
1663  
1664  	return 0;
1665  }
1666  EXPORT_SYMBOL(down_read_killable_nested);
1667  
1668  void _down_write_nest_lock(struct rw_semaphore *sem, struct lockdep_map *nest)
1669  {
1670  	might_sleep();
1671  	rwsem_acquire_nest(&sem->dep_map, 0, 0, nest, _RET_IP_);
1672  	LOCK_CONTENDED(sem, __down_write_trylock, __down_write);
1673  }
1674  EXPORT_SYMBOL(_down_write_nest_lock);
1675  
1676  void down_read_non_owner(struct rw_semaphore *sem)
1677  {
1678  	might_sleep();
1679  	__down_read(sem);
1680  	/*
1681  	 * The owner value for a reader-owned lock is mostly for debugging
1682  	 * purposes only and is not critical to the correct functioning of
1683  	 * rwsem. So it is perfectly fine to set it in a preempt-enabled
1684  	 * context here.
1685  	 */
1686  	__rwsem_set_reader_owned(sem, NULL);
1687  }
1688  EXPORT_SYMBOL(down_read_non_owner);
1689  
1690  void down_write_nested(struct rw_semaphore *sem, int subclass)
1691  {
1692  	might_sleep();
1693  	rwsem_acquire(&sem->dep_map, subclass, 0, _RET_IP_);
1694  	LOCK_CONTENDED(sem, __down_write_trylock, __down_write);
1695  }
1696  EXPORT_SYMBOL(down_write_nested);
1697  
1698  int __sched down_write_killable_nested(struct rw_semaphore *sem, int subclass)
1699  {
1700  	might_sleep();
1701  	rwsem_acquire(&sem->dep_map, subclass, 0, _RET_IP_);
1702  
1703  	if (LOCK_CONTENDED_RETURN(sem, __down_write_trylock,
1704  				  __down_write_killable)) {
1705  		rwsem_release(&sem->dep_map, _RET_IP_);
1706  		return -EINTR;
1707  	}
1708  
1709  	return 0;
1710  }
1711  EXPORT_SYMBOL(down_write_killable_nested);
1712  
1713  void up_read_non_owner(struct rw_semaphore *sem)
1714  {
1715  	DEBUG_RWSEMS_WARN_ON(!is_rwsem_reader_owned(sem), sem);
1716  	__up_read(sem);
1717  }
1718  EXPORT_SYMBOL(up_read_non_owner);
1719  
1720  #endif
1721