xref: /linux/drivers/android/binder/page_range.rs (revision 5c75125672443a209a40033f0df5fb823e356452)
1 // SPDX-License-Identifier: GPL-2.0
2 
3 // Copyright (C) 2025 Google LLC.
4 
5 //! This module has utilities for managing a page range where unused pages may be reclaimed by a
6 //! vma shrinker.
7 
8 // To avoid deadlocks, locks are taken in the order:
9 //
10 //  1. mmap lock
11 //  2. spinlock
12 //  3. lru spinlock
13 //
14 // The shrinker will use trylock methods because it locks them in a different order.
15 
16 use core::{
17     marker::PhantomPinned,
18     mem::{size_of, size_of_val, MaybeUninit},
19     ptr,
20 };
21 
22 use kernel::{
23     bindings,
24     error::Result,
25     ffi::{c_ulong, c_void},
26     mm::{virt, Mm, MmWithUser},
27     new_mutex, new_spinlock,
28     page::{Page, PAGE_SHIFT, PAGE_SIZE},
29     prelude::*,
30     str::CStr,
31     sync::{aref::ARef, Mutex, SpinLock},
32     task::Pid,
33     transmute::FromBytes,
34     types::Opaque,
35     uaccess::UserSliceReader,
36 };
37 
/// Represents a shrinker that can be registered with the kernel.
///
/// Each shrinker can be used by many `ShrinkablePageRange` objects.
#[repr(C)]
pub(crate) struct Shrinker {
    /// Pointer to the kernel `struct shrinker`. Written by `register`; uninitialized before
    /// `register` has been called (see the safety contract on `new`).
    inner: Opaque<*mut bindings::shrinker>,
    /// The lru list holding the pages that are currently "available", i.e. the pages the
    /// shrinker is allowed to free under memory pressure.
    list_lru: Opaque<bindings::list_lru>,
}

// SAFETY: The shrinker and list_lru are thread safe.
unsafe impl Send for Shrinker {}
// SAFETY: The shrinker and list_lru are thread safe.
unsafe impl Sync for Shrinker {}
51 
52 impl Shrinker {
53     /// Create a new shrinker.
54     ///
55     /// # Safety
56     ///
57     /// Before using this shrinker with a `ShrinkablePageRange`, the `register` method must have
58     /// been called exactly once, and it must not have returned an error.
new() -> Self59     pub(crate) const unsafe fn new() -> Self {
60         Self {
61             inner: Opaque::uninit(),
62             list_lru: Opaque::uninit(),
63         }
64     }
65 
66     /// Register this shrinker with the kernel.
register(&'static self, name: &CStr) -> Result<()>67     pub(crate) fn register(&'static self, name: &CStr) -> Result<()> {
68         // SAFETY: These fields are not yet used, so it's okay to zero them.
69         unsafe {
70             self.inner.get().write(ptr::null_mut());
71             self.list_lru.get().write_bytes(0, 1);
72         }
73 
74         // SAFETY: The field is not yet used, so we can initialize it.
75         let ret = unsafe { bindings::__list_lru_init(self.list_lru.get(), false, ptr::null_mut()) };
76         if ret != 0 {
77             return Err(Error::from_errno(ret));
78         }
79 
80         // SAFETY: The `name` points at a valid c string.
81         let shrinker = unsafe { bindings::shrinker_alloc(0, name.as_char_ptr()) };
82         if shrinker.is_null() {
83             // SAFETY: We initialized it, so its okay to destroy it.
84             unsafe { bindings::list_lru_destroy(self.list_lru.get()) };
85             return Err(Error::from_errno(ret));
86         }
87 
88         // SAFETY: We're about to register the shrinker, and these are the fields we need to
89         // initialize. (All other fields are already zeroed.)
90         unsafe {
91             (&raw mut (*shrinker).count_objects).write(Some(rust_shrink_count));
92             (&raw mut (*shrinker).scan_objects).write(Some(rust_shrink_scan));
93             (&raw mut (*shrinker).private_data).write(self.list_lru.get().cast());
94         }
95 
96         // SAFETY: The new shrinker has been fully initialized, so we can register it.
97         unsafe { bindings::shrinker_register(shrinker) };
98 
99         // SAFETY: This initializes the pointer to the shrinker so that we can use it.
100         unsafe { self.inner.get().write(shrinker) };
101 
102         Ok(())
103     }
104 }
105 
/// A container that manages a page range in a vma.
///
/// The pages can be thought of as an array of booleans of whether the pages are usable. The
/// methods `use_range` and `stop_using_range` set all booleans in a range to true or false
/// respectively. Initially, no pages are allocated. When a page is not used, it is not freed
/// immediately. Instead, it is made available to the memory shrinker to free it if the device is
/// under memory pressure.
///
/// It's okay for `use_range` and `stop_using_range` to race with each other, although there's no
/// way to know whether an index ends up with true or false if a call to `use_range` races with
/// another call to `stop_using_range` on a given index.
///
/// It's also okay for the two methods to race with themselves, e.g. if two threads call
/// `use_range` on the same index, then that's fine and neither call will return until the page is
/// allocated and mapped.
///
/// The methods that read or write to a range require that the page is marked as in use. So it is
/// _not_ okay to call `stop_using_range` on a page that is in use by the methods that read or
/// write to the page.
#[pin_data(PinnedDrop)]
pub(crate) struct ShrinkablePageRange {
    /// Shrinker object registered with the kernel.
    shrinker: &'static Shrinker,
    /// Pid using this page range. Only used as debugging information.
    pid: Pid,
    /// The mm for the relevant process.
    mm: ARef<Mm>,
    /// Used to synchronize calls to `vm_insert_page` and `zap_page_range_single`.
    ///
    /// Lock ordering: taken before `lock` (see the lock-order comment at the top of this file).
    #[pin]
    mm_lock: Mutex<()>,
    /// Spinlock protecting changes to pages.
    #[pin]
    lock: SpinLock<Inner>,

    /// Must not move, since page info has pointers back.
    #[pin]
    _pin: PhantomPinned,
}
144 
// We do not define any ops. For now, used only to check identity of vmas: `check_vma` compares
// a vma's `vm_ops` pointer against the address of this static.
static BINDER_VM_OPS: bindings::vm_operations_struct = pin_init::zeroed();
147 
148 // To ensure that we do not accidentally install pages into or zap pages from the wrong vma, we
149 // check its vm_ops and private data before using it.
check_vma(vma: &virt::VmaRef, owner: *const ShrinkablePageRange) -> Option<&virt::VmaMixedMap>150 fn check_vma(vma: &virt::VmaRef, owner: *const ShrinkablePageRange) -> Option<&virt::VmaMixedMap> {
151     // SAFETY: Just reading the vm_ops pointer of any active vma is safe.
152     let vm_ops = unsafe { (*vma.as_ptr()).vm_ops };
153     if !ptr::eq(vm_ops, &BINDER_VM_OPS) {
154         return None;
155     }
156 
157     // SAFETY: Reading the vm_private_data pointer of a binder-owned vma is safe.
158     let vm_private_data = unsafe { (*vma.as_ptr()).vm_private_data };
159     // The ShrinkablePageRange is only dropped when the Process is dropped, which only happens once
160     // the file's ->release handler is invoked, which means the ShrinkablePageRange outlives any
161     // VMA associated with it, so there can't be any false positives due to pointer reuse here.
162     if !ptr::eq(vm_private_data, owner.cast()) {
163         return None;
164     }
165 
166     vma.as_mixedmap_vma()
167 }
168 
struct Inner {
    /// Array of pages.
    ///
    /// Since this is also accessed by the shrinker, we can't use a `Box`, which asserts exclusive
    /// ownership. To deal with that, we manage it using raw pointers.
    pages: *mut PageInfo,
    /// Length of the `pages` array.
    size: usize,
    /// The address of the vma to insert the pages into.
    vma_addr: usize,
}

// SAFETY: proper locking is in place for `Inner`
unsafe impl Send for Inner {}

/// A guard for `ShrinkablePageRange::mm_lock` whose lifetime is not tied to the range.
///
/// Obtained via `ShrinkablePageRange::stable_trylock_mm`, whose SAFETY comment justifies
/// extending the lifetime to `'static`.
type StableMmGuard =
    kernel::sync::lock::Guard<'static, (), kernel::sync::lock::mutex::MutexBackend>;
186 
/// An array element that describes the current state of a page.
///
/// There are three states:
///
///  * Free. The page is None. The `lru` element is not queued.
///  * Available. The page is Some. The `lru` element is queued to the shrinker's lru.
///  * Used. The page is Some. The `lru` element is not queued.
///
/// When an element is available, the shrinker is able to free the page.
#[repr(C)]
struct PageInfo {
    // Must stay the first field: the shrinker callback casts a `list_head` pointer back to
    // `PageInfo` (see `rust_shrink_free_page`), which relies on this `#[repr(C)]` layout.
    lru: bindings::list_head,
    page: Option<Page>,
    // Pointer back to the owning range; immutable after initialization.
    range: *const ShrinkablePageRange,
}
202 
impl PageInfo {
    /// Store `page` in this slot.
    ///
    /// # Safety
    ///
    /// The caller ensures that writing to `me.page` is ok, and that the page is not currently set.
    unsafe fn set_page(me: *mut PageInfo, page: Page) {
        // SAFETY: This pointer offset is in bounds.
        let ptr = unsafe { &raw mut (*me).page };

        // SAFETY: The pointer is valid for writing, so also valid for reading.
        if unsafe { (*ptr).is_some() } {
            // A caller violated the safety contract. Log it, then recover by dropping the old
            // page rather than leaking it.
            pr_err!("set_page called when there is already a page");
            // SAFETY: We will initialize the page again below.
            unsafe { ptr::drop_in_place(ptr) };
        }

        // SAFETY: The pointer is valid for writing.
        unsafe { ptr::write(ptr, Some(page)) };
    }

    /// Get a shared reference to the page, if any.
    ///
    /// # Safety
    ///
    /// The caller ensures that reading from `me.page` is ok for the duration of 'a.
    unsafe fn get_page<'a>(me: *const PageInfo) -> Option<&'a Page> {
        // SAFETY: This pointer offset is in bounds.
        let ptr = unsafe { &raw const (*me).page };

        // SAFETY: The pointer is valid for reading.
        unsafe { (*ptr).as_ref() }
    }

    /// Remove and return the page, leaving `None` behind.
    ///
    /// # Safety
    ///
    /// The caller ensures that writing to `me.page` is ok for the duration of 'a.
    unsafe fn take_page(me: *mut PageInfo) -> Option<Page> {
        // SAFETY: This pointer offset is in bounds.
        let ptr = unsafe { &raw mut (*me).page };

        // SAFETY: The pointer is valid for writing (`take` both reads and clears the slot), so
        // also valid for reading.
        unsafe { (*ptr).take() }
    }

    /// Add this page to the lru list, if not already in the list.
    ///
    /// # Safety
    ///
    /// The pointer must be valid, and it must be the right shrinker and nid.
    unsafe fn list_lru_add(me: *mut PageInfo, nid: i32, shrinker: &'static Shrinker) {
        // SAFETY: This pointer offset is in bounds.
        let lru_ptr = unsafe { &raw mut (*me).lru };
        // SAFETY: The lru pointer is valid, and we're not using it with any other lru list.
        unsafe { bindings::list_lru_add(shrinker.list_lru.get(), lru_ptr, nid, ptr::null_mut()) };
    }

    /// Remove this page from the lru list, if it is in the list.
    ///
    /// # Safety
    ///
    /// The pointer must be valid, and it must be the right shrinker and nid.
    unsafe fn list_lru_del(me: *mut PageInfo, nid: i32, shrinker: &'static Shrinker) {
        // SAFETY: This pointer offset is in bounds.
        let lru_ptr = unsafe { &raw mut (*me).lru };
        // SAFETY: The lru pointer is valid, and we're not using it with any other lru list.
        unsafe { bindings::list_lru_del(shrinker.list_lru.get(), lru_ptr, nid, ptr::null_mut()) };
    }
}
268 
impl ShrinkablePageRange {
    /// Create a new `ShrinkablePageRange` using the given shrinker.
    ///
    /// Captures the current task's pid and mm; fails with `ESRCH` if the current task has no mm.
    pub(crate) fn new(shrinker: &'static Shrinker) -> impl PinInit<Self, Error> {
        try_pin_init!(Self {
            shrinker,
            pid: kernel::current!().pid(),
            mm: ARef::from(&**kernel::current!().mm().ok_or(ESRCH)?),
            mm_lock <- new_mutex!((), "ShrinkablePageRange::mm"),
            lock <- new_spinlock!(Inner {
                pages: ptr::null_mut(),
                size: 0,
                vma_addr: 0,
            }, "ShrinkablePageRange"),
            _pin: PhantomPinned,
        })
    }

    /// Try to take `mm_lock`, returning a guard whose lifetime is not tied to `&self`.
    ///
    /// Used by the shrinker callback (`rust_shrink_free_page`), which must use trylock because
    /// it acquires the locks in a different order (see the lock-order comment at the top of this
    /// file).
    pub(crate) fn stable_trylock_mm(&self) -> Option<StableMmGuard> {
        // SAFETY: This extends the duration of the reference. Since this call happens before
        // `mm_lock` is taken in the destructor of `ShrinkablePageRange`, the destructor will block
        // until the returned guard is dropped. This ensures that the guard is valid until dropped.
        let mm_lock = unsafe { &*ptr::from_ref(&self.mm_lock) };

        mm_lock.try_lock()
    }

    /// Register a vma with this page range. Returns the size of the region.
    pub(crate) fn register_with_vma(&self, vma: &virt::VmaNew) -> Result<usize> {
        // The mapping is capped at 4 MiB.
        let num_bytes = usize::min(vma.end() - vma.start(), bindings::SZ_4M as usize);
        let num_pages = num_bytes >> PAGE_SHIFT;

        if !ptr::eq::<Mm>(&*self.mm, &**vma.mm()) {
            pr_debug!("Failed to register with vma: invalid vma->vm_mm");
            return Err(EINVAL);
        }
        if num_pages == 0 {
            pr_debug!("Failed to register with vma: size zero");
            return Err(EINVAL);
        }

        let mut pages = KVVec::<PageInfo>::with_capacity(num_pages, GFP_KERNEL)?;

        // SAFETY: This just initializes the pages array.
        unsafe {
            let self_ptr = self as *const ShrinkablePageRange;
            for i in 0..num_pages {
                let info = pages.as_mut_ptr().add(i);
                (&raw mut (*info).range).write(self_ptr);
                (&raw mut (*info).page).write(None);
                // Make the lru list_head point at itself, i.e. "not queued on any lru list".
                let lru = &raw mut (*info).lru;
                (&raw mut (*lru).next).write(lru);
                (&raw mut (*lru).prev).write(lru);
            }
        }

        let mut inner = self.lock.lock();
        if inner.size > 0 {
            pr_debug!("Failed to register with vma: already registered");
            drop(inner);
            return Err(EBUSY);
        }

        inner.pages = pages.into_raw_parts().0;
        inner.size = num_pages;
        inner.vma_addr = vma.start();

        // This pointer is only used for comparison - it's not dereferenced.
        //
        // SAFETY: We own the vma, and we don't use any methods on VmaNew that rely on
        // `vm_private_data`.
        unsafe {
            (*vma.as_ptr()).vm_private_data = ptr::from_ref(self).cast_mut().cast::<c_void>()
        };

        // SAFETY: We own the vma, and we don't use any methods on VmaNew that rely on
        // `vm_ops`.
        unsafe { (*vma.as_ptr()).vm_ops = &BINDER_VM_OPS };

        Ok(num_pages)
    }

    /// Make sure that the given pages are allocated and mapped.
    ///
    /// Must not be called from an atomic context.
    pub(crate) fn use_range(&self, start: usize, end: usize) -> Result<()> {
        if start >= end {
            return Ok(());
        }
        let mut inner = self.lock.lock();
        assert!(end <= inner.size);

        for i in start..end {
            // SAFETY: This pointer offset is in bounds.
            let page_info = unsafe { inner.pages.add(i) };

            // SAFETY: The pointer is valid, and we hold the lock so reading from the page is okay.
            if let Some(page) = unsafe { PageInfo::get_page(page_info) } {
                // Since we're going to use the page, we should remove it from the lru list so that
                // the shrinker will not free it.
                //
                // SAFETY: The pointer is valid, and this is the right shrinker.
                //
                // The shrinker can't free the page between the check and this call to
                // `list_lru_del` because we hold the lock.
                unsafe { PageInfo::list_lru_del(page_info, page.nid(), self.shrinker) };
            } else {
                // We have to allocate a new page. Use the slow path.
                drop(inner);
                // SAFETY: `i < end <= inner.size` so `i` is in bounds.
                match unsafe { self.use_page_slow(i) } {
                    Ok(()) => {}
                    Err(err) => {
                        pr_warn!("Error in use_page_slow: {:?}", err);
                        return Err(err);
                    }
                }
                // Reacquire the spinlock before checking the next index.
                inner = self.lock.lock();
            }
        }
        Ok(())
    }

    /// Mark the given page as in use, slow path.
    ///
    /// Must not be called from an atomic context.
    ///
    /// # Safety
    ///
    /// Assumes that `i` is in bounds.
    #[cold]
    unsafe fn use_page_slow(&self, i: usize) -> Result<()> {
        let new_page = Page::alloc_page(GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO)?;

        // Lock order: mm_lock before the spinlock (see top-of-file comment).
        let mm_mutex = self.mm_lock.lock();
        let inner = self.lock.lock();

        // SAFETY: This pointer offset is in bounds.
        let page_info = unsafe { inner.pages.add(i) };

        // SAFETY: The pointer is valid, and we hold the lock so reading from the page is okay.
        if let Some(page) = unsafe { PageInfo::get_page(page_info) } {
            // The page was already there, or someone else added the page while we didn't hold the
            // spinlock. (The freshly allocated `new_page` is simply dropped on return.)
            //
            // SAFETY: The pointer is valid, and this is the right shrinker.
            //
            // The shrinker can't free the page between the check and this call to
            // `list_lru_del` because we hold the lock.
            unsafe { PageInfo::list_lru_del(page_info, page.nid(), self.shrinker) };
            return Ok(());
        }

        let vma_addr = inner.vma_addr;
        // Release the spinlock while we insert the page into the vma.
        drop(inner);

        // No overflow since we stay in bounds of the vma.
        let user_page_addr = vma_addr + (i << PAGE_SHIFT);

        // We use `mmput_async` when dropping the `mm` because `use_page_slow` is usually used from
        // a remote process. If the call to `mmput` races with the process shutting down, then the
        // caller of `use_page_slow` becomes responsible for cleaning up the `mm`, which doesn't
        // happen until it returns to userspace. However, the caller might instead go to sleep and
        // wait for the owner of the `mm` to wake it up, which doesn't happen because it's in the
        // middle of a shutdown process that won't complete until the `mm` is dropped. This can
        // amount to a deadlock.
        //
        // Using `mmput_async` avoids this, because then the `mm` cleanup is instead queued to a
        // workqueue.
        let mm = MmWithUser::into_mmput_async(self.mm.mmget_not_zero().ok_or(ESRCH)?);
        {
            let vma_read;
            let mmap_read;
            // Fast path: per-vma locking under RCU; fall back to taking the mmap read lock.
            let vma = if let Some(ret) = mm.lock_vma_under_rcu(vma_addr) {
                vma_read = ret;
                check_vma(&vma_read, self)
            } else {
                mmap_read = mm.mmap_read_lock();
                mmap_read
                    .vma_lookup(vma_addr)
                    .and_then(|vma| check_vma(vma, self))
            };

            match vma {
                Some(vma) => vma.vm_insert_page(user_page_addr, &new_page)?,
                None => return Err(ESRCH),
            }
        }

        let inner = self.lock.lock();

        // SAFETY: The `page_info` pointer is valid and currently does not have a page. The page
        // can be written to since we hold the lock.
        //
        // We released and reacquired the spinlock since we checked that the page is null, but we
        // always hold the mm_lock mutex when setting the page to a non-null value, so it's not
        // possible for someone else to have changed it since our check.
        unsafe { PageInfo::set_page(page_info, new_page) };

        drop(inner);
        drop(mm_mutex);

        Ok(())
    }

    /// If the given page is in use, then mark it as available so that the shrinker can free it.
    ///
    /// May be called from an atomic context.
    pub(crate) fn stop_using_range(&self, start: usize, end: usize) {
        if start >= end {
            return;
        }
        let inner = self.lock.lock();
        assert!(end <= inner.size);

        for i in (start..end).rev() {
            // SAFETY: The pointer is in bounds.
            let page_info = unsafe { inner.pages.add(i) };

            // SAFETY: Okay for reading since we have the lock.
            if let Some(page) = unsafe { PageInfo::get_page(page_info) } {
                // SAFETY: The pointer is valid, and it's the right shrinker.
                unsafe { PageInfo::list_lru_add(page_info, page.nid(), self.shrinker) };
            }
        }
    }

    /// Helper for reading or writing to a range of bytes that may overlap with several pages.
    ///
    /// Calls `cb(page, offset_within_page, length)` once per touched page.
    ///
    /// # Safety
    ///
    /// All pages touched by this operation must be in use for the duration of this call.
    unsafe fn iterate<T>(&self, mut offset: usize, mut size: usize, mut cb: T) -> Result
    where
        T: FnMut(&Page, usize, usize) -> Result,
    {
        if size == 0 {
            return Ok(());
        }

        // Snapshot the array pointer and length; the array itself outlives this call since the
        // pages are in use (see safety requirements).
        let (pages, num_pages) = {
            let inner = self.lock.lock();
            (inner.pages, inner.size)
        };
        let num_bytes = num_pages << PAGE_SHIFT;

        // Check that the request is within the buffer.
        if offset.checked_add(size).ok_or(EFAULT)? > num_bytes {
            return Err(EFAULT);
        }

        let mut page_index = offset >> PAGE_SHIFT;
        offset &= PAGE_SIZE - 1;
        while size > 0 {
            let available = usize::min(size, PAGE_SIZE - offset);
            // SAFETY: The pointer is in bounds.
            let page_info = unsafe { pages.add(page_index) };
            // SAFETY: The caller guarantees that this page is in the "in use" state for the
            // duration of this call to `iterate`, so nobody will change the page.
            let page = unsafe { PageInfo::get_page(page_info) };
            if page.is_none() {
                pr_warn!("Page is null!");
            }
            let page = page.ok_or(EFAULT)?;
            cb(page, offset, available)?;
            size -= available;
            page_index += 1;
            // Only the first page can start at a non-zero offset.
            offset = 0;
        }
        Ok(())
    }

    /// Copy from userspace into this page range.
    ///
    /// # Safety
    ///
    /// All pages touched by this operation must be in use for the duration of this call.
    pub(crate) unsafe fn copy_from_user_slice(
        &self,
        reader: &mut UserSliceReader,
        offset: usize,
        size: usize,
    ) -> Result {
        // SAFETY: `self.iterate` has the same safety requirements as `copy_from_user_slice`.
        unsafe {
            self.iterate(offset, size, |page, offset, to_copy| {
                page.copy_from_user_slice_raw(reader, offset, to_copy)
            })
        }
    }

    /// Copy from this page range into kernel space.
    ///
    /// # Safety
    ///
    /// All pages touched by this operation must be in use for the duration of this call.
    pub(crate) unsafe fn read<T: FromBytes>(&self, offset: usize) -> Result<T> {
        let mut out = MaybeUninit::<T>::uninit();
        let mut out_offset = 0;
        // SAFETY: `self.iterate` has the same safety requirements as `read`.
        unsafe {
            self.iterate(offset, size_of::<T>(), |page, offset, to_copy| {
                // SAFETY: The sum of `offset` and `to_copy` is bounded by the size of T.
                let obj_ptr = (out.as_mut_ptr() as *mut u8).add(out_offset);
                // SAFETY: The pointer points is in-bounds of the `out` variable, so it is valid.
                page.read_raw(obj_ptr, offset, to_copy)?;
                out_offset += to_copy;
                Ok(())
            })?;
        }
        // SAFETY: We just initialised the data.
        Ok(unsafe { out.assume_init() })
    }

    /// Copy from kernel space into this page range.
    ///
    /// # Safety
    ///
    /// All pages touched by this operation must be in use for the duration of this call.
    pub(crate) unsafe fn write<T: ?Sized>(&self, offset: usize, obj: &T) -> Result {
        let mut obj_offset = 0;
        // SAFETY: `self.iterate` has the same safety requirements as `write`.
        unsafe {
            self.iterate(offset, size_of_val(obj), |page, offset, to_copy| {
                // SAFETY: The sum of `offset` and `to_copy` is bounded by the size of T.
                let obj_ptr = (obj as *const T as *const u8).add(obj_offset);
                // SAFETY: We have a reference to the object, so the pointer is valid.
                page.write_raw(obj_ptr, offset, to_copy)?;
                obj_offset += to_copy;
                Ok(())
            })
        }
    }

    /// Write zeroes to the given range.
    ///
    /// # Safety
    ///
    /// All pages touched by this operation must be in use for the duration of this call.
    pub(crate) unsafe fn fill_zero(&self, offset: usize, size: usize) -> Result {
        // SAFETY: `self.iterate` has the same safety requirements as `fill_zero`.
        unsafe {
            self.iterate(offset, size, |page, offset, len| {
                page.fill_zero_raw(offset, len)
            })
        }
    }
}
617 
#[pinned_drop]
impl PinnedDrop for ShrinkablePageRange {
    fn drop(self: Pin<&mut Self>) {
        let (pages, size) = {
            let lock = self.lock.lock();
            (lock.pages, lock.size)
        };

        // `size == 0` means no vma was ever registered, so there is no pages array to free.
        if size == 0 {
            return;
        }

        // Note: This call is also necessary for the safety of `stable_trylock_mm`.
        let mm_lock = self.mm_lock.lock();

        // This is the destructor, so unlike the other methods, we only need to worry about races
        // with the shrinker here. Since we hold the `mm_lock`, we also can't race with the
        // shrinker, and after this loop, the shrinker will not access any of our pages since we
        // removed them from the lru list.
        for i in 0..size {
            // SAFETY: Loop is in-bounds of the size.
            let p_ptr = unsafe { pages.add(i) };
            // SAFETY: No other readers, so we can read.
            if let Some(p) = unsafe { PageInfo::get_page(p_ptr) } {
                // SAFETY: The pointer is valid and it's the right shrinker.
                unsafe { PageInfo::list_lru_del(p_ptr, p.nid(), self.shrinker) };
            }
        }

        drop(mm_lock);

        // SAFETY: `pages` was allocated as an `KVVec<PageInfo>` with capacity `size`. Furthermore,
        // all `size` elements are initialized. Also, the array is no longer shared with the
        // shrinker due to the above loop.
        drop(unsafe { KVVec::from_raw_parts(pages, size, size) });
    }
}
655 
656 /// # Safety
657 /// Called by the shrinker.
658 #[no_mangle]
rust_shrink_count( shrink: *mut bindings::shrinker, _sc: *mut bindings::shrink_control, ) -> c_ulong659 unsafe extern "C" fn rust_shrink_count(
660     shrink: *mut bindings::shrinker,
661     _sc: *mut bindings::shrink_control,
662 ) -> c_ulong {
663     // SAFETY: We can access our own private data.
664     let list_lru = unsafe { (*shrink).private_data.cast::<bindings::list_lru>() };
665     // SAFETY: Accessing the lru list is okay. Just an FFI call.
666     unsafe { bindings::list_lru_count(list_lru) }
667 }
668 
669 /// # Safety
670 /// Called by the shrinker.
671 #[no_mangle]
rust_shrink_scan( shrink: *mut bindings::shrinker, sc: *mut bindings::shrink_control, ) -> c_ulong672 unsafe extern "C" fn rust_shrink_scan(
673     shrink: *mut bindings::shrinker,
674     sc: *mut bindings::shrink_control,
675 ) -> c_ulong {
676     // SAFETY: We can access our own private data.
677     let list_lru = unsafe { (*shrink).private_data.cast::<bindings::list_lru>() };
678     // SAFETY: Caller guarantees that it is safe to read this field.
679     let nr_to_scan = unsafe { (*sc).nr_to_scan };
680     // SAFETY: Accessing the lru list is okay. Just an FFI call.
681     unsafe {
682         bindings::list_lru_walk(
683             list_lru,
684             Some(bindings::rust_shrink_free_page_wrap),
685             ptr::null_mut(),
686             nr_to_scan,
687         )
688     }
689 }
690 
const LRU_SKIP: bindings::lru_status = bindings::lru_status_LRU_SKIP;
// We use the RETRY variant because `rust_shrink_free_page` drops the lru spinlock before
// returning, so the walk cannot simply continue from the removed entry.
const LRU_REMOVED_ENTRY: bindings::lru_status = bindings::lru_status_LRU_REMOVED_RETRY;
693 
/// Shrinker callback invoked for each lru entry: unmap and free the corresponding page.
///
/// Runs with the lru spinlock held; every other lock is taken with trylock because this path
/// acquires locks in a different order than the rest of the file (see top-of-file comment).
///
/// # Safety
/// Called by the shrinker.
#[no_mangle]
unsafe extern "C" fn rust_shrink_free_page(
    item: *mut bindings::list_head,
    lru: *mut bindings::list_lru_one,
    _cb_arg: *mut c_void,
) -> bindings::lru_status {
    // Fields that should survive after unlocking the lru lock.
    let page;
    let page_index;
    let mm;
    let mmap_read;
    let mm_mutex;
    let vma_addr;
    let range_ptr;

    {
        // CAST: The `list_head` field is first in `PageInfo`.
        let info = item as *mut PageInfo;
        // SAFETY: The `range` field of `PageInfo` is immutable.
        range_ptr = unsafe { (*info).range };
        // SAFETY: The `range` outlives its `PageInfo` values.
        let range = unsafe { &*range_ptr };

        mm = match range.mm.mmget_not_zero() {
            Some(mm) => MmWithUser::into_mmput_async(mm),
            None => return LRU_SKIP,
        };

        mm_mutex = match range.stable_trylock_mm() {
            Some(guard) => guard,
            None => return LRU_SKIP,
        };

        mmap_read = match mm.mmap_read_trylock() {
            Some(guard) => guard,
            None => return LRU_SKIP,
        };

        // We can't lock it normally here, since we hold the lru lock.
        let inner = match range.lock.try_lock() {
            Some(inner) => inner,
            None => return LRU_SKIP,
        };

        // SAFETY: The item is in this lru list, so it's okay to remove it.
        unsafe { bindings::list_lru_isolate(lru, item) };

        // SAFETY: Both pointers are in bounds of the same allocation.
        page_index = unsafe { info.offset_from(inner.pages) } as usize;

        // SAFETY: We hold the spinlock, so we can take the page.
        //
        // This sets the page pointer to zero before we unmap it from the vma. However, we call
        // `zap_page_range` before we release the mmap lock, so `use_page_slow` will not be able to
        // insert a new page until after our call to `zap_page_range`.
        page = unsafe { PageInfo::take_page(info) };
        vma_addr = inner.vma_addr;

        // From this point on, we don't access this PageInfo or ShrinkablePageRange again, because
        // they can be freed at any point after we unlock `lru_lock`. This is with the exception of
        // `mm_mutex` which is kept alive by holding the lock.
    }

    // SAFETY: The lru lock is locked when this method is called.
    unsafe { bindings::spin_unlock(&raw mut (*lru).lock) };

    if let Some(unchecked_vma) = mmap_read.vma_lookup(vma_addr) {
        if let Some(vma) = check_vma(unchecked_vma, range_ptr) {
            let user_page_addr = vma_addr + (page_index << PAGE_SHIFT);
            vma.zap_page_range_single(user_page_addr, PAGE_SIZE);
        }
    }

    // Drop order: release both locks before the page itself is freed.
    drop(mmap_read);
    drop(mm_mutex);
    drop(mm);
    drop(page);

    // We removed the entry and dropped the lru lock, so report it with the RETRY variant.
    LRU_REMOVED_ENTRY
}
776