xref: /linux/rust/kernel/mm/virt.rs (revision 8026aed072e1221f0a61e5acc48c64546341bd4d)
// SPDX-License-Identifier: GPL-2.0

// Copyright (C) 2024 Google LLC.

//! Virtual memory.
//!
//! This module deals with managing a single VMA in the address space of a userspace process. Each
//! VMA corresponds to a region of memory that the userspace process can access, and the VMA lets
//! you control what happens when userspace reads or writes to that region of memory.
//!
//! The module has several different Rust types that all correspond to the C type called
//! `vm_area_struct`. The different structs represent what kind of access you have to the VMA, e.g.
//! [`VmaRef`] is used when you hold the mmap or vma read lock. Using the appropriate struct
//! ensures that you can't, for example, accidentally call a function that requires holding the
//! write lock when you only hold the read lock.
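//!
//! A minimal sketch of how a [`VmaRef`] might be used is shown below. The
//! `vma_len_and_readable` helper is illustrative only and not part of this module; the caller is
//! assumed to already hold the mmap or vma read lock, which is exactly what holding a `&VmaRef`
//! encodes.
//!
//! ```
//! use kernel::mm::virt::{flags, VmaRef};
//!
//! /// Returns the length of the VMA and whether userspace mapped it as readable.
//! fn vma_len_and_readable(vma: &VmaRef) -> (usize, bool) {
//!     let len = vma.end() - vma.start();
//!     let readable = (vma.flags() & flags::READ) != 0;
//!     (len, readable)
//! }
//! ```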

use crate::{
    bindings,
    error::{code::EINVAL, to_result, Result},
    mm::MmWithUser,
    page::Page,
    types::Opaque,
};

use core::ops::Deref;

/// A wrapper for the kernel's `struct vm_area_struct` with read access.
///
/// It represents an area of virtual memory.
///
/// # Invariants
///
/// The caller must hold the mmap read lock or the vma read lock.
#[repr(transparent)]
pub struct VmaRef {
    vma: Opaque<bindings::vm_area_struct>,
}

// Methods you can call when holding the mmap or vma read lock (or stronger). They must be usable
// no matter what the vma flags are.
impl VmaRef {
    /// Access a virtual memory area given a raw pointer.
    ///
    /// # Safety
    ///
    /// Callers must ensure that `vma` is valid for the duration of 'a, and that the mmap or vma
    /// read lock (or stronger) is held for at least the duration of 'a.
    #[inline]
    pub unsafe fn from_raw<'a>(vma: *const bindings::vm_area_struct) -> &'a Self {
        // SAFETY: The caller ensures that the invariants are satisfied for the duration of 'a.
        unsafe { &*vma.cast() }
    }

    /// Returns a raw pointer to this area.
    #[inline]
    pub fn as_ptr(&self) -> *mut bindings::vm_area_struct {
        self.vma.get()
    }

    /// Access the underlying `mm_struct`.
    #[inline]
    pub fn mm(&self) -> &MmWithUser {
        // SAFETY: By the type invariants, this `vm_area_struct` is valid and we hold the mmap/vma
        // read lock or stronger. This implies that the underlying mm has a non-zero value of
        // `mm_users`.
        unsafe { MmWithUser::from_raw((*self.as_ptr()).vm_mm) }
    }

    /// Returns the flags associated with the virtual memory area.
    ///
    /// The possible flags are a combination of the constants in [`flags`].
    #[inline]
    pub fn flags(&self) -> vm_flags_t {
        // SAFETY: By the type invariants, the caller holds at least the mmap read lock, so this
        // access is not a data race.
        unsafe { (*self.as_ptr()).__bindgen_anon_2.vm_flags }
    }

    /// Returns the (inclusive) start address of the virtual memory area.
    #[inline]
    pub fn start(&self) -> usize {
        // SAFETY: By the type invariants, the caller holds at least the mmap read lock, so this
        // access is not a data race.
        unsafe { (*self.as_ptr()).__bindgen_anon_1.__bindgen_anon_1.vm_start }
    }

    /// Returns the (exclusive) end address of the virtual memory area.
    #[inline]
    pub fn end(&self) -> usize {
        // SAFETY: By the type invariants, the caller holds at least the mmap read lock, so this
        // access is not a data race.
        unsafe { (*self.as_ptr()).__bindgen_anon_1.__bindgen_anon_1.vm_end }
    }

    /// Zap pages in the given page range.
    ///
    /// This clears page table mappings for the range at the leaf level, leaving all other page
    /// tables intact, and frees any memory referenced by the VMA in this range. That is,
    /// anonymous memory is completely freed, file-backed memory has its reference count on page
    /// cache folios dropped, and any dirty data is still written back to disk as usual.
    ///
    /// It may seem odd that we clear only at the leaf level; this is, however, a product of the
    /// page table structure used to map physical memory into a virtual address space - each
    /// virtual address is effectively a series of array indices into page tables, which form a
    /// hierarchical page table level structure.
    ///
    /// As a result, each page table level maps a multiple of the ranges mapped by the level
    /// below it, and thus spans an ever increasing range of pages. At the leaf or PTE level, we
    /// map the actual physical memory.
    ///
    /// It is here that a zap operates, as it is the only place we can be certain of clearing
    /// without impacting any other virtual mappings. It is an implementation detail as to
    /// whether the kernel goes further in freeing unused page tables, but for the purposes of
    /// this operation we must only assume that the leaf level is cleared.
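    ///
    /// For example, a driver that holds the read lock could drop every leaf mapping covered by
    /// its VMA with something like the following (an illustrative sketch, not part of this type):
    ///
    /// ```
    /// use kernel::mm::virt::VmaRef;
    ///
    /// fn zap_whole_vma(vma: &VmaRef) {
    ///     // Zapping any sub-range of [start, end) is equally valid; requests outside the VMA
    ///     // are ignored by this method.
    ///     vma.zap_page_range_single(vma.start(), vma.end() - vma.start());
    /// }
    /// ```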
    #[inline]
    pub fn zap_page_range_single(&self, address: usize, size: usize) {
        let (end, did_overflow) = address.overflowing_add(size);
        if did_overflow || address < self.start() || self.end() < end {
            // TODO: call WARN_ONCE once Rust version of it is added
            return;
        }

        // SAFETY: By the type invariants, the caller has read access to this VMA, which is
        // sufficient for this method call. This method has no requirements on the vma flags. The
        // address range is checked to be within the vma.
        unsafe {
            bindings::zap_page_range_single(self.as_ptr(), address, size, core::ptr::null_mut())
        };
    }

    /// If the [`VM_MIXEDMAP`] flag is set, returns a [`VmaMixedMap`] to this VMA, otherwise
    /// returns `None`.
    ///
    /// This can be used to access methods that require [`VM_MIXEDMAP`] to be set.
    ///
    /// [`VM_MIXEDMAP`]: flags::MIXEDMAP
    #[inline]
    pub fn as_mixedmap_vma(&self) -> Option<&VmaMixedMap> {
        if self.flags() & flags::MIXEDMAP != 0 {
            // SAFETY: We just checked that `VM_MIXEDMAP` is set. All other requirements are
            // satisfied by the type invariants of `VmaRef`.
            Some(unsafe { VmaMixedMap::from_raw(self.as_ptr()) })
        } else {
            None
        }
    }
}

/// A wrapper for the kernel's `struct vm_area_struct` with read access and [`VM_MIXEDMAP`] set.
///
/// It represents an area of virtual memory.
///
/// This struct is identical to [`VmaRef`] except that it must only be used when the
/// [`VM_MIXEDMAP`] flag is set on the vma.
///
/// # Invariants
///
/// The caller must hold the mmap read lock or the vma read lock. The `VM_MIXEDMAP` flag must be
/// set.
///
/// [`VM_MIXEDMAP`]: flags::MIXEDMAP
#[repr(transparent)]
pub struct VmaMixedMap {
    vma: VmaRef,
}

// Make all `VmaRef` methods available on `VmaMixedMap`.
impl Deref for VmaMixedMap {
    type Target = VmaRef;

    #[inline]
    fn deref(&self) -> &VmaRef {
        &self.vma
    }
}

impl VmaMixedMap {
    /// Access a virtual memory area given a raw pointer.
    ///
    /// # Safety
    ///
    /// Callers must ensure that `vma` is valid for the duration of 'a, and that the mmap read lock
    /// (or stronger) is held for at least the duration of 'a. The `VM_MIXEDMAP` flag must be set.
    #[inline]
    pub unsafe fn from_raw<'a>(vma: *const bindings::vm_area_struct) -> &'a Self {
        // SAFETY: The caller ensures that the invariants are satisfied for the duration of 'a.
        unsafe { &*vma.cast() }
    }

    /// Maps a single page at the given address within the virtual memory area.
    ///
    /// This operation does not take ownership of the page.
    #[inline]
    pub fn vm_insert_page(&self, address: usize, page: &Page) -> Result {
        // SAFETY: By the type invariant of `Self`, the caller has read access and has verified
        // that `VM_MIXEDMAP` is set. By the invariant on `Page`, the page has order 0.
        to_result(unsafe { bindings::vm_insert_page(self.as_ptr(), address, page.as_ptr()) })
    }
}

/// A configuration object for setting up a VMA in an `f_ops->mmap()` hook.
///
/// The `f_ops->mmap()` hook is called when a new VMA is being created, and the hook is able to
/// configure the VMA in various ways to fit the driver that owns it. Using `VmaNew` indicates that
/// you are allowed to perform operations on the VMA that can only be performed before the VMA is
/// fully initialized.
///
/// # Invariants
///
/// For the duration of 'a, the referenced vma must be undergoing initialization in an
/// `f_ops->mmap()` hook.
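///
/// A sketch of how a driver's mmap hook might configure a freshly created VMA is shown below. The
/// `configure` helper and the policy it applies are illustrative only; the exact hook signature
/// depends on the abstraction used to register it.
///
/// ```
/// use kernel::mm::virt::VmaNew;
/// use kernel::prelude::*;
///
/// fn configure(vma: &VmaNew) -> Result {
///     // Device-style mapping: keep other kernel code away from the pages and pin the mapping
///     // down so that userspace cannot grow it with mremap().
///     vma.set_io();
///     vma.set_dontexpand();
///     // Reject mappings that are writable, and prevent them from becoming writable later via
///     // mprotect().
///     vma.try_clear_maywrite()?;
///     Ok(())
/// }
/// ```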
#[repr(transparent)]
pub struct VmaNew {
    vma: VmaRef,
}

// Make all `VmaRef` methods available on `VmaNew`.
impl Deref for VmaNew {
    type Target = VmaRef;

    #[inline]
    fn deref(&self) -> &VmaRef {
        &self.vma
    }
}

impl VmaNew {
    /// Access a virtual memory area given a raw pointer.
    ///
    /// # Safety
    ///
    /// Callers must ensure that `vma` is undergoing initial vma setup for the duration of 'a.
    #[inline]
    pub unsafe fn from_raw<'a>(vma: *mut bindings::vm_area_struct) -> &'a Self {
        // SAFETY: The caller ensures that the invariants are satisfied for the duration of 'a.
        unsafe { &*vma.cast() }
    }

    /// Internal method for updating the vma flags.
    ///
    /// # Safety
    ///
    /// This must not be used to set the flags to an invalid value.
    #[inline]
    unsafe fn update_flags(&self, set: vm_flags_t, unset: vm_flags_t) {
        let mut flags = self.flags();
        flags |= set;
        flags &= !unset;

        // SAFETY: This is not a data race: the vma is undergoing initial setup, so it's not yet
        // shared. Additionally, `VmaNew` is `!Sync`, so it cannot be used to write in parallel.
        // The caller promises that this does not set the flags to an invalid value.
        unsafe { (*self.as_ptr()).__bindgen_anon_2.__vm_flags = flags };
    }

    /// Set the `VM_MIXEDMAP` flag on this vma.
    ///
    /// This enables the vma to contain both `struct page` and pure PFN pages. Returns a reference
    /// that can be used to call `vm_insert_page` on the vma.
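    ///
    /// A minimal sketch combining this with [`VmaMixedMap::vm_insert_page`] is shown below; the
    /// `page` argument is assumed to have been allocated elsewhere by the driver.
    ///
    /// ```
    /// use kernel::mm::virt::VmaNew;
    /// use kernel::page::Page;
    /// use kernel::prelude::*;
    ///
    /// fn map_first_page(vma: &VmaNew, page: &Page) -> Result {
    ///     // Mark the VMA as mixedmap, then insert the page at the start of the mapping. The
    ///     // call does not take ownership of `page`.
    ///     vma.set_mixedmap().vm_insert_page(vma.start(), page)
    /// }
    /// ```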
    #[inline]
    pub fn set_mixedmap(&self) -> &VmaMixedMap {
        // SAFETY: We don't yet provide a way to set VM_PFNMAP, so this cannot put the flags in an
        // invalid state.
        unsafe { self.update_flags(flags::MIXEDMAP, 0) };

        // SAFETY: We just set `VM_MIXEDMAP` on the vma.
        unsafe { VmaMixedMap::from_raw(self.vma.as_ptr()) }
    }

    /// Set the `VM_IO` flag on this vma.
    ///
    /// This is used for memory mapped IO and similar. The flag tells other parts of the kernel to
    /// avoid looking at the pages. For memory mapped IO this is useful as accesses to the pages
    /// could have side effects.
    #[inline]
    pub fn set_io(&self) {
        // SAFETY: Setting the VM_IO flag is always okay.
        unsafe { self.update_flags(flags::IO, 0) };
    }

    /// Set the `VM_DONTEXPAND` flag on this vma.
    ///
    /// This prevents the vma from being expanded with `mremap()`.
    #[inline]
    pub fn set_dontexpand(&self) {
        // SAFETY: Setting the VM_DONTEXPAND flag is always okay.
        unsafe { self.update_flags(flags::DONTEXPAND, 0) };
    }

    /// Set the `VM_DONTCOPY` flag on this vma.
    ///
    /// This prevents the vma from being copied on fork. This option is only permanent if `VM_IO`
    /// is set.
    #[inline]
    pub fn set_dontcopy(&self) {
        // SAFETY: Setting the VM_DONTCOPY flag is always okay.
        unsafe { self.update_flags(flags::DONTCOPY, 0) };
    }

    /// Set the `VM_DONTDUMP` flag on this vma.
    ///
    /// This prevents the vma from being included in core dumps. This option is only permanent if
    /// `VM_IO` is set.
    #[inline]
    pub fn set_dontdump(&self) {
        // SAFETY: Setting the VM_DONTDUMP flag is always okay.
        unsafe { self.update_flags(flags::DONTDUMP, 0) };
    }

    /// Returns whether `VM_READ` is set.
    ///
    /// This flag indicates whether userspace is mapping this vma as readable.
    #[inline]
    pub fn readable(&self) -> bool {
        (self.flags() & flags::READ) != 0
    }

    /// Try to clear the `VM_MAYREAD` flag, failing if `VM_READ` is set.
    ///
    /// This flag indicates whether userspace is allowed to make this vma readable with
    /// `mprotect()`.
    ///
    /// Note that this operation is irreversible. Once `VM_MAYREAD` has been cleared, it can never
    /// be set again.
    #[inline]
    pub fn try_clear_mayread(&self) -> Result {
        if self.readable() {
            return Err(EINVAL);
        }
        // SAFETY: Clearing `VM_MAYREAD` is okay when `VM_READ` is not set.
        unsafe { self.update_flags(0, flags::MAYREAD) };
        Ok(())
    }

    /// Returns whether `VM_WRITE` is set.
    ///
    /// This flag indicates whether userspace is mapping this vma as writable.
    #[inline]
    pub fn writable(&self) -> bool {
        (self.flags() & flags::WRITE) != 0
    }

    /// Try to clear the `VM_MAYWRITE` flag, failing if `VM_WRITE` is set.
    ///
    /// This flag indicates whether userspace is allowed to make this vma writable with
    /// `mprotect()`.
    ///
    /// Note that this operation is irreversible. Once `VM_MAYWRITE` has been cleared, it can never
    /// be set again.
    #[inline]
    pub fn try_clear_maywrite(&self) -> Result {
        if self.writable() {
            return Err(EINVAL);
        }
        // SAFETY: Clearing `VM_MAYWRITE` is okay when `VM_WRITE` is not set.
        unsafe { self.update_flags(0, flags::MAYWRITE) };
        Ok(())
    }

    /// Returns whether `VM_EXEC` is set.
    ///
    /// This flag indicates whether userspace is mapping this vma as executable.
    #[inline]
    pub fn executable(&self) -> bool {
        (self.flags() & flags::EXEC) != 0
    }

    /// Try to clear the `VM_MAYEXEC` flag, failing if `VM_EXEC` is set.
    ///
    /// This flag indicates whether userspace is allowed to make this vma executable with
    /// `mprotect()`.
    ///
    /// Note that this operation is irreversible. Once `VM_MAYEXEC` has been cleared, it can never
    /// be set again.
    #[inline]
    pub fn try_clear_mayexec(&self) -> Result {
        if self.executable() {
            return Err(EINVAL);
        }
        // SAFETY: Clearing `VM_MAYEXEC` is okay when `VM_EXEC` is not set.
        unsafe { self.update_flags(0, flags::MAYEXEC) };
        Ok(())
    }
}

/// The integer type used for vma flags.
#[doc(inline)]
pub use bindings::vm_flags_t;

/// All possible flags for [`VmaRef`].
pub mod flags {
    use super::vm_flags_t;
    use crate::bindings;

    /// No flags are set.
    pub const NONE: vm_flags_t = bindings::VM_NONE as vm_flags_t;

    /// Mapping allows reads.
    pub const READ: vm_flags_t = bindings::VM_READ as vm_flags_t;

    /// Mapping allows writes.
    pub const WRITE: vm_flags_t = bindings::VM_WRITE as vm_flags_t;

    /// Mapping allows execution.
    pub const EXEC: vm_flags_t = bindings::VM_EXEC as vm_flags_t;

    /// Mapping is shared.
    pub const SHARED: vm_flags_t = bindings::VM_SHARED as vm_flags_t;

    /// Mapping may be updated to allow reads.
    pub const MAYREAD: vm_flags_t = bindings::VM_MAYREAD as vm_flags_t;

    /// Mapping may be updated to allow writes.
    pub const MAYWRITE: vm_flags_t = bindings::VM_MAYWRITE as vm_flags_t;

    /// Mapping may be updated to allow execution.
    pub const MAYEXEC: vm_flags_t = bindings::VM_MAYEXEC as vm_flags_t;

    /// Mapping may be updated to be shared.
    pub const MAYSHARE: vm_flags_t = bindings::VM_MAYSHARE as vm_flags_t;

    /// Page-ranges managed without `struct page`, just pure PFN.
    pub const PFNMAP: vm_flags_t = bindings::VM_PFNMAP as vm_flags_t;

    /// Memory mapped I/O or similar.
    pub const IO: vm_flags_t = bindings::VM_IO as vm_flags_t;

    /// Do not copy this vma on fork.
    pub const DONTCOPY: vm_flags_t = bindings::VM_DONTCOPY as vm_flags_t;

    /// Cannot expand with mremap().
    pub const DONTEXPAND: vm_flags_t = bindings::VM_DONTEXPAND as vm_flags_t;

    /// Lock the pages covered when they are faulted in.
    pub const LOCKONFAULT: vm_flags_t = bindings::VM_LOCKONFAULT as vm_flags_t;

    /// Is a VM accounted object.
    pub const ACCOUNT: vm_flags_t = bindings::VM_ACCOUNT as vm_flags_t;

    /// Should the VM suppress accounting.
    pub const NORESERVE: vm_flags_t = bindings::VM_NORESERVE as vm_flags_t;

    /// Huge TLB Page VM.
    pub const HUGETLB: vm_flags_t = bindings::VM_HUGETLB as vm_flags_t;

    /// Synchronous page faults. (DAX-specific)
    pub const SYNC: vm_flags_t = bindings::VM_SYNC as vm_flags_t;

    /// Architecture-specific flag.
    pub const ARCH_1: vm_flags_t = bindings::VM_ARCH_1 as vm_flags_t;

    /// Wipe VMA contents in child on fork.
    pub const WIPEONFORK: vm_flags_t = bindings::VM_WIPEONFORK as vm_flags_t;

    /// Do not include in the core dump.
    pub const DONTDUMP: vm_flags_t = bindings::VM_DONTDUMP as vm_flags_t;

    /// Not soft dirty clean area.
    pub const SOFTDIRTY: vm_flags_t = bindings::VM_SOFTDIRTY as vm_flags_t;

    /// Can contain `struct page` and pure PFN pages.
    pub const MIXEDMAP: vm_flags_t = bindings::VM_MIXEDMAP as vm_flags_t;

    /// MADV_HUGEPAGE marked this vma.
    pub const HUGEPAGE: vm_flags_t = bindings::VM_HUGEPAGE as vm_flags_t;

    /// MADV_NOHUGEPAGE marked this vma.
    pub const NOHUGEPAGE: vm_flags_t = bindings::VM_NOHUGEPAGE as vm_flags_t;

    /// KSM may merge identical pages.
    pub const MERGEABLE: vm_flags_t = bindings::VM_MERGEABLE as vm_flags_t;
}