xref: /linux/Documentation/RCU/rcuref.rst (revision 4b4193256c8d3bc3a5397b5cd9494c2ad386317d)
1*90c73cb2SMauro Carvalho Chehab.. SPDX-License-Identifier: GPL-2.0
2*90c73cb2SMauro Carvalho Chehab
3*90c73cb2SMauro Carvalho Chehab====================================================================
4*90c73cb2SMauro Carvalho ChehabReference-count design for elements of lists/arrays protected by RCU
5*90c73cb2SMauro Carvalho Chehab====================================================================
6*90c73cb2SMauro Carvalho Chehab
7*90c73cb2SMauro Carvalho Chehab
8*90c73cb2SMauro Carvalho ChehabPlease note that the percpu-ref feature is likely your first
9*90c73cb2SMauro Carvalho Chehabstop if you need to combine reference counts and RCU.  Please see
10*90c73cb2SMauro Carvalho Chehabinclude/linux/percpu-refcount.h for more information.  However, in
11*90c73cb2SMauro Carvalho Chehabthose unusual cases where percpu-ref would consume too much memory,
12*90c73cb2SMauro Carvalho Chehabplease read on.
13*90c73cb2SMauro Carvalho Chehab
14*90c73cb2SMauro Carvalho Chehab------------------------------------------------------------------------
15*90c73cb2SMauro Carvalho Chehab
16*90c73cb2SMauro Carvalho ChehabReference counting on elements of lists which are protected by traditional
17*90c73cb2SMauro Carvalho Chehabreader/writer spinlocks or semaphores are straightforward:
18*90c73cb2SMauro Carvalho Chehab
19*90c73cb2SMauro Carvalho ChehabCODE LISTING A::
20*90c73cb2SMauro Carvalho Chehab
21*90c73cb2SMauro Carvalho Chehab    1.					    2.
22*90c73cb2SMauro Carvalho Chehab    add()				    search_and_reference()
23*90c73cb2SMauro Carvalho Chehab    {					    {
24*90c73cb2SMauro Carvalho Chehab	alloc_object				read_lock(&list_lock);
25*90c73cb2SMauro Carvalho Chehab	...					search_for_element
26*90c73cb2SMauro Carvalho Chehab	atomic_set(&el->rc, 1);			atomic_inc(&el->rc);
27*90c73cb2SMauro Carvalho Chehab	write_lock(&list_lock);			 ...
28*90c73cb2SMauro Carvalho Chehab	add_element				read_unlock(&list_lock);
29*90c73cb2SMauro Carvalho Chehab	...					...
30*90c73cb2SMauro Carvalho Chehab	write_unlock(&list_lock);	   }
31*90c73cb2SMauro Carvalho Chehab    }
32*90c73cb2SMauro Carvalho Chehab
33*90c73cb2SMauro Carvalho Chehab    3.					    4.
34*90c73cb2SMauro Carvalho Chehab    release_referenced()		    delete()
35*90c73cb2SMauro Carvalho Chehab    {					    {
36*90c73cb2SMauro Carvalho Chehab	...					write_lock(&list_lock);
37*90c73cb2SMauro Carvalho Chehab	if(atomic_dec_and_test(&el->rc))	...
38*90c73cb2SMauro Carvalho Chehab	    kfree(el);
39*90c73cb2SMauro Carvalho Chehab	...					remove_element
40*90c73cb2SMauro Carvalho Chehab    }						write_unlock(&list_lock);
41*90c73cb2SMauro Carvalho Chehab						...
42*90c73cb2SMauro Carvalho Chehab						if (atomic_dec_and_test(&el->rc))
43*90c73cb2SMauro Carvalho Chehab						    kfree(el);
44*90c73cb2SMauro Carvalho Chehab						...
45*90c73cb2SMauro Carvalho Chehab					    }
46*90c73cb2SMauro Carvalho Chehab
47*90c73cb2SMauro Carvalho ChehabIf this list/array is made lock free using RCU as in changing the
48*90c73cb2SMauro Carvalho Chehabwrite_lock() in add() and delete() to spin_lock() and changing read_lock()
49*90c73cb2SMauro Carvalho Chehabin search_and_reference() to rcu_read_lock(), the atomic_inc() in
50*90c73cb2SMauro Carvalho Chehabsearch_and_reference() could potentially hold reference to an element which
51*90c73cb2SMauro Carvalho Chehabhas already been deleted from the list/array.  Use atomic_inc_not_zero()
52*90c73cb2SMauro Carvalho Chehabin this scenario as follows:
53*90c73cb2SMauro Carvalho Chehab
54*90c73cb2SMauro Carvalho ChehabCODE LISTING B::
55*90c73cb2SMauro Carvalho Chehab
56*90c73cb2SMauro Carvalho Chehab    1.					    2.
57*90c73cb2SMauro Carvalho Chehab    add()				    search_and_reference()
58*90c73cb2SMauro Carvalho Chehab    {					    {
59*90c73cb2SMauro Carvalho Chehab	alloc_object				rcu_read_lock();
60*90c73cb2SMauro Carvalho Chehab	...					search_for_element
61*90c73cb2SMauro Carvalho Chehab	atomic_set(&el->rc, 1);			if (!atomic_inc_not_zero(&el->rc)) {
62*90c73cb2SMauro Carvalho Chehab	spin_lock(&list_lock);			    rcu_read_unlock();
63*90c73cb2SMauro Carvalho Chehab						    return FAIL;
64*90c73cb2SMauro Carvalho Chehab	add_element				}
65*90c73cb2SMauro Carvalho Chehab	...					...
66*90c73cb2SMauro Carvalho Chehab	spin_unlock(&list_lock);		rcu_read_unlock();
67*90c73cb2SMauro Carvalho Chehab    }					    }
68*90c73cb2SMauro Carvalho Chehab    3.					    4.
69*90c73cb2SMauro Carvalho Chehab    release_referenced()		    delete()
70*90c73cb2SMauro Carvalho Chehab    {					    {
71*90c73cb2SMauro Carvalho Chehab	...					spin_lock(&list_lock);
72*90c73cb2SMauro Carvalho Chehab	if (atomic_dec_and_test(&el->rc))	...
73*90c73cb2SMauro Carvalho Chehab	    call_rcu(&el->head, el_free);	remove_element
74*90c73cb2SMauro Carvalho Chehab	...					spin_unlock(&list_lock);
75*90c73cb2SMauro Carvalho Chehab    }						...
76*90c73cb2SMauro Carvalho Chehab						if (atomic_dec_and_test(&el->rc))
77*90c73cb2SMauro Carvalho Chehab						    call_rcu(&el->head, el_free);
78*90c73cb2SMauro Carvalho Chehab						...
79*90c73cb2SMauro Carvalho Chehab					    }
80*90c73cb2SMauro Carvalho Chehab
81*90c73cb2SMauro Carvalho ChehabSometimes, a reference to the element needs to be obtained in the
82*90c73cb2SMauro Carvalho Chehabupdate (write) stream.	In such cases, atomic_inc_not_zero() might be
83*90c73cb2SMauro Carvalho Chehaboverkill, since we hold the update-side spinlock.  One might instead
84*90c73cb2SMauro Carvalho Chehabuse atomic_inc() in such cases.
85*90c73cb2SMauro Carvalho Chehab
86*90c73cb2SMauro Carvalho ChehabIt is not always convenient to deal with "FAIL" in the
87*90c73cb2SMauro Carvalho Chehabsearch_and_reference() code path.  In such cases, the
88*90c73cb2SMauro Carvalho Chehabatomic_dec_and_test() may be moved from delete() to el_free()
89*90c73cb2SMauro Carvalho Chehabas follows:
90*90c73cb2SMauro Carvalho Chehab
91*90c73cb2SMauro Carvalho ChehabCODE LISTING C::
92*90c73cb2SMauro Carvalho Chehab
93*90c73cb2SMauro Carvalho Chehab    1.					    2.
94*90c73cb2SMauro Carvalho Chehab    add()				    search_and_reference()
95*90c73cb2SMauro Carvalho Chehab    {					    {
96*90c73cb2SMauro Carvalho Chehab	alloc_object				rcu_read_lock();
97*90c73cb2SMauro Carvalho Chehab	...					search_for_element
98*90c73cb2SMauro Carvalho Chehab	atomic_set(&el->rc, 1);			atomic_inc(&el->rc);
99*90c73cb2SMauro Carvalho Chehab	spin_lock(&list_lock);			...
100*90c73cb2SMauro Carvalho Chehab
101*90c73cb2SMauro Carvalho Chehab	add_element				rcu_read_unlock();
102*90c73cb2SMauro Carvalho Chehab	...				    }
103*90c73cb2SMauro Carvalho Chehab	spin_unlock(&list_lock);	    4.
104*90c73cb2SMauro Carvalho Chehab    }					    delete()
105*90c73cb2SMauro Carvalho Chehab    3.					    {
106*90c73cb2SMauro Carvalho Chehab    release_referenced()			spin_lock(&list_lock);
107*90c73cb2SMauro Carvalho Chehab    {						...
108*90c73cb2SMauro Carvalho Chehab	...					remove_element
109*90c73cb2SMauro Carvalho Chehab	if (atomic_dec_and_test(&el->rc))	spin_unlock(&list_lock);
110*90c73cb2SMauro Carvalho Chehab	    kfree(el);				...
111*90c73cb2SMauro Carvalho Chehab	...					call_rcu(&el->head, el_free);
112*90c73cb2SMauro Carvalho Chehab    }						...
113*90c73cb2SMauro Carvalho Chehab    5.					    }
114*90c73cb2SMauro Carvalho Chehab    void el_free(struct rcu_head *rhp)
115*90c73cb2SMauro Carvalho Chehab    {
116*90c73cb2SMauro Carvalho Chehab	release_referenced();
117*90c73cb2SMauro Carvalho Chehab    }
118*90c73cb2SMauro Carvalho Chehab
119*90c73cb2SMauro Carvalho ChehabThe key point is that the initial reference added by add() is not removed
120*90c73cb2SMauro Carvalho Chehabuntil after a grace period has elapsed following removal.  This means that
121*90c73cb2SMauro Carvalho Chehabsearch_and_reference() cannot find this element, which means that the value
122*90c73cb2SMauro Carvalho Chehabof el->rc cannot increase.  Thus, once it reaches zero, there are no
123*90c73cb2SMauro Carvalho Chehabreaders that can or ever will be able to reference the element.	 The
124*90c73cb2SMauro Carvalho Chehabelement can therefore safely be freed.	This in turn guarantees that if
125*90c73cb2SMauro Carvalho Chehabany reader finds the element, that reader may safely acquire a reference
126*90c73cb2SMauro Carvalho Chehabwithout checking the value of the reference counter.
127*90c73cb2SMauro Carvalho Chehab
128*90c73cb2SMauro Carvalho ChehabA clear advantage of the RCU-based pattern in listing C over the one
129*90c73cb2SMauro Carvalho Chehabin listing B is that any call to search_and_reference() that locates
130*90c73cb2SMauro Carvalho Chehaba given object will succeed in obtaining a reference to that object,
131*90c73cb2SMauro Carvalho Chehabeven given a concurrent invocation of delete() for that same object.
132*90c73cb2SMauro Carvalho ChehabSimilarly, a clear advantage of both listings B and C over listing A is
133*90c73cb2SMauro Carvalho Chehabthat a call to delete() is not delayed even if there are an arbitrarily
134*90c73cb2SMauro Carvalho Chehablarge number of calls to search_and_reference() searching for the same
135*90c73cb2SMauro Carvalho Chehabobject that delete() was invoked on.  Instead, all that is delayed is
136*90c73cb2SMauro Carvalho Chehabthe eventual invocation of kfree(), which is usually not a problem on
137*90c73cb2SMauro Carvalho Chehabmodern computer systems, even the small ones.
138*90c73cb2SMauro Carvalho Chehab
139*90c73cb2SMauro Carvalho ChehabIn cases where delete() can sleep, synchronize_rcu() can be called from
140*90c73cb2SMauro Carvalho Chehabdelete(), so that el_free() can be subsumed into delete as follows::
141*90c73cb2SMauro Carvalho Chehab
142*90c73cb2SMauro Carvalho Chehab    4.
143*90c73cb2SMauro Carvalho Chehab    delete()
144*90c73cb2SMauro Carvalho Chehab    {
145*90c73cb2SMauro Carvalho Chehab	spin_lock(&list_lock);
146*90c73cb2SMauro Carvalho Chehab	...
147*90c73cb2SMauro Carvalho Chehab	remove_element
148*90c73cb2SMauro Carvalho Chehab	spin_unlock(&list_lock);
149*90c73cb2SMauro Carvalho Chehab	...
150*90c73cb2SMauro Carvalho Chehab	synchronize_rcu();
151*90c73cb2SMauro Carvalho Chehab	if (atomic_dec_and_test(&el->rc))
152*90c73cb2SMauro Carvalho Chehab	    kfree(el);
153*90c73cb2SMauro Carvalho Chehab	...
154*90c73cb2SMauro Carvalho Chehab    }
155*90c73cb2SMauro Carvalho Chehab
156*90c73cb2SMauro Carvalho ChehabAs additional examples in the kernel, the pattern in listing C is used by
157*90c73cb2SMauro Carvalho Chehabreference counting of struct pid, while the pattern in listing B is used by
158*90c73cb2SMauro Carvalho Chehabstruct posix_acl.
159