// SPDX-License-Identifier: GPL-2.0

// Copyright (C) 2025 Google LLC.

use kernel::{
    page::PAGE_SIZE,
    prelude::*,
    rbtree::{RBTree, RBTreeNode, RBTreeNodeReservation},
    seq_file::SeqFile,
    seq_print,
    task::Pid,
};

use crate::range_alloc::{DescriptorState, FreedRange, Range};

/// Keeps track of allocations in a process' mmap.
///
/// Each process has an mmap where the data for incoming transactions will be placed. This struct
/// keeps track of allocations made in the mmap. For each allocation, we store a descriptor that
/// has metadata related to the allocation. We also keep track of available free space.
pub(super) struct TreeRangeAllocator<T> {
    /// This collection contains descriptors for *both* ranges containing an allocation, *and* free
    /// ranges between allocations. The free ranges get merged, so there are never two free ranges
    /// next to each other.
    tree: RBTree<usize, Descriptor<T>>,
    /// Contains an entry for every free range in `self.tree`. This tree sorts the ranges by size,
    /// letting us look up the smallest range whose size is at least some lower bound.
    free_tree: RBTree<FreeKey, ()>,
    size: usize,
    free_oneway_space: usize,
}

impl<T> TreeRangeAllocator<T> {
    /// Builds a tree allocator for an mmap of the given size from an existing list of ranges,
    /// consuming the preallocated tree nodes in `alloc`.
    pub(crate) fn from_array(
        size: usize,
        ranges: &mut KVec<Range<T>>,
        alloc: &mut FromArrayAllocs<T>,
    ) -> Self {
        let mut tree = TreeRangeAllocator {
            tree: RBTree::new(),
            free_tree: RBTree::new(),
            size,
            free_oneway_space: size / 2,
        };

        let mut free_offset = 0;
        for range in ranges.drain_all() {
            let free_size = range.offset - free_offset;
            if free_size > 0 {
                let free_node = alloc.free_tree.pop().unwrap();
                tree.free_tree
                    .insert(free_node.into_node((free_size, free_offset), ()));
                let tree_node = alloc.tree.pop().unwrap();
                tree.tree.insert(
                    tree_node.into_node(free_offset, Descriptor::new(free_offset, free_size)),
                );
            }
            free_offset = range.endpoint();

            if range.state.is_oneway() {
                tree.free_oneway_space = tree.free_oneway_space.saturating_sub(range.size);
            }

            let free_res = alloc.free_tree.pop().unwrap();
            let tree_node = alloc.tree.pop().unwrap();
            let mut desc = Descriptor::new(range.offset, range.size);
            desc.state = Some((range.state, free_res));
            tree.tree.insert(tree_node.into_node(range.offset, desc));
        }

        // After the last range, we may need a free range.
        if free_offset < size {
            let free_size = size - free_offset;
            let free_node = alloc.free_tree.pop().unwrap();
            tree.free_tree
                .insert(free_node.into_node((free_size, free_offset), ()));
            let tree_node = alloc.tree.pop().unwrap();
            tree.tree
                .insert(tree_node.into_node(free_offset, Descriptor::new(free_offset, free_size)));
        }

        tree
    }

    /// Returns `true` if the allocator contains no reserved or allocated ranges.
    pub(crate) fn is_empty(&self) -> bool {
        let mut tree_iter = self.tree.values();
        // There's always at least one range, because index zero is either the start of a free or
        // allocated range.
        let first_value = tree_iter.next().unwrap();
        if tree_iter.next().is_some() {
            // There are never two free ranges next to each other, so if there is more than one
            // descriptor, then at least one of them must hold an allocated range.
            return false;
        }
        // There is only one descriptor. Return true if it is for a free range.
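        // (A `None` state means the descriptor covers a free range; `Some` means it is reserved
        // or allocated.)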
        first_value.state.is_none()
    }

    pub(crate) fn total_size(&self) -> usize {
        self.size
    }

    pub(crate) fn free_oneway_space(&self) -> usize {
        self.free_oneway_space
    }

    pub(crate) fn count_buffers(&self) -> usize {
        self.tree
            .values()
            .filter(|desc| desc.state.is_some())
            .count()
    }

    pub(crate) fn debug_print(&self, m: &SeqFile) -> Result<()> {
        for desc in self.tree.values() {
            let state = match &desc.state {
                Some(state) => &state.0,
                None => continue,
            };
            seq_print!(
                m,
                " buffer: {} size {} pid {}",
                desc.offset,
                desc.size,
                state.pid(),
            );
            if state.is_oneway() {
                seq_print!(m, " oneway");
            }
            match state {
                DescriptorState::Reserved(_res) => {
                    seq_print!(m, " reserved\n");
                }
                DescriptorState::Allocated(_alloc) => {
                    seq_print!(m, " allocated\n");
                }
            }
        }
        Ok(())
    }

    /// Finds the smallest free range whose size is at least `size`, returning its descriptor
    /// from `self.tree`. (See the illustrative sketch at the end of this file.)
    fn find_best_match(&mut self, size: usize) -> Option<&mut Descriptor<T>> {
        let free_cursor = self.free_tree.cursor_lower_bound(&(size, 0))?;
        let ((_, offset), ()) = free_cursor.current();
        self.tree.get_mut(offset)
    }

    /// Try to reserve a new buffer, using the provided allocation if necessary.
    pub(crate) fn reserve_new(
        &mut self,
        debug_id: usize,
        size: usize,
        is_oneway: bool,
        pid: Pid,
        alloc: ReserveNewTreeAlloc<T>,
    ) -> Result<(usize, bool)> {
        // Compute new value of free_oneway_space, which is set only on success.
        let new_oneway_space = if is_oneway {
            match self.free_oneway_space.checked_sub(size) {
                Some(new_oneway_space) => new_oneway_space,
                None => return Err(ENOSPC),
            }
        } else {
            self.free_oneway_space
        };

        // Start detecting spammers once we have less than 20%
        // of async space left (which is less than 10% of total
        // buffer size).
        //
        // (This will short-circuit, so `low_oneway_space` is
        // only called when necessary.)
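        //
        // Illustrative numbers (not taken from this code): with a 1 MiB mmap, async space
        // starts at 512 KiB (`size / 2`), and the spam check only runs if fewer than
        // 104857 bytes (`size / 10`, roughly 102 KiB) of it would remain after this
        // reservation.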
        let oneway_spam_detected =
            is_oneway && new_oneway_space < self.size / 10 && self.low_oneway_space(pid);

        let (found_size, found_off, tree_node, free_tree_node) = match self.find_best_match(size) {
            None => {
                pr_warn!("ENOSPC from range_alloc.reserve_new - size: {}", size);
                return Err(ENOSPC);
            }
            Some(desc) => {
                let found_size = desc.size;
                let found_offset = desc.offset;

                // In case we need to break up the descriptor
                let new_desc = Descriptor::new(found_offset + size, found_size - size);
                let (tree_node, free_tree_node, desc_node_res) = alloc.initialize(new_desc);

                desc.state = Some((
                    DescriptorState::new(is_oneway, debug_id, pid),
                    desc_node_res,
                ));
                desc.size = size;

                (found_size, found_offset, tree_node, free_tree_node)
            }
        };
        self.free_oneway_space = new_oneway_space;
        self.free_tree.remove(&(found_size, found_off));

        if found_size != size {
            self.tree.insert(tree_node);
            self.free_tree.insert(free_tree_node);
        }

        Ok((found_off, oneway_spam_detected))
    }

    pub(crate) fn reservation_abort(&mut self, offset: usize) -> Result<FreedRange> {
        let mut cursor = self.tree.cursor_lower_bound(&offset).ok_or_else(|| {
            pr_warn!(
                "EINVAL from range_alloc.reservation_abort - offset: {}",
                offset
            );
            EINVAL
        })?;

        let (_, desc) = cursor.current_mut();

        if desc.offset != offset {
            pr_warn!(
                "EINVAL from range_alloc.reservation_abort - offset: {}",
                offset
            );
            return Err(EINVAL);
        }

        let (reservation, free_node_res) = desc.try_change_state(|state| match state {
            Some((DescriptorState::Reserved(reservation), free_node_res)) => {
                (None, Ok((reservation, free_node_res)))
            }
            None => {
                pr_warn!(
                    "EINVAL from range_alloc.reservation_abort - offset: {}",
                    offset
                );
                (None, Err(EINVAL))
            }
            allocated => {
                pr_warn!(
                    "EPERM from range_alloc.reservation_abort - offset: {}",
                    offset
                );
                (allocated, Err(EPERM))
            }
        })?;

        let mut size = desc.size;
        let mut offset = desc.offset;
        let free_oneway_space_add = if reservation.is_oneway { size } else { 0 };

        self.free_oneway_space += free_oneway_space_add;

        let mut freed_range = FreedRange::interior_pages(offset, size);
        // Compute how large the next free region needs to be to include one more page in
        // the newly freed range.
        let add_next_page_needed = match (offset + size) % PAGE_SIZE {
            0 => usize::MAX,
            unalign => PAGE_SIZE - unalign,
        };
        // Compute how large the previous free region needs to be to include one more page
        // in the newly freed range.
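        //
        // Illustrative example (assuming PAGE_SIZE == 0x1000): for a freed range starting at
        // offset 0x1230, the first interior page is page 2 (starting at 0x2000); page 1 is
        // also counted as freed only if the preceding free region is at least 0x230 bytes,
        // i.e. it reaches back to the 0x1000 page boundary.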
        let add_prev_page_needed = match offset % PAGE_SIZE {
            0 => usize::MAX,
            unalign => unalign,
        };

        // Merge next into current if next is free
        let remove_next = match cursor.peek_next() {
            Some((_, next)) if next.state.is_none() => {
                if next.size >= add_next_page_needed {
                    freed_range.end_page_idx += 1;
                }
                self.free_tree.remove(&(next.size, next.offset));
                size += next.size;
                true
            }
            _ => false,
        };

        if remove_next {
            let (_, desc) = cursor.current_mut();
            desc.size = size;
            cursor.remove_next();
        }

        // Merge current into prev if prev is free
        match cursor.peek_prev_mut() {
            Some((_, prev)) if prev.state.is_none() => {
                if prev.size >= add_prev_page_needed {
                    freed_range.start_page_idx -= 1;
                }
                // merge previous with current, remove current
                self.free_tree.remove(&(prev.size, prev.offset));
                offset = prev.offset;
                size += prev.size;
                prev.size = size;
                cursor.remove_current();
            }
            _ => {}
        };

        self.free_tree
            .insert(free_node_res.into_node((size, offset), ()));

        Ok(freed_range)
    }

    pub(crate) fn reservation_commit(&mut self, offset: usize, data: &mut Option<T>) -> Result {
        let desc = self.tree.get_mut(&offset).ok_or(ENOENT)?;

        desc.try_change_state(|state| match state {
            Some((DescriptorState::Reserved(reservation), free_node_res)) => (
                Some((
                    DescriptorState::Allocated(reservation.allocate(data.take())),
                    free_node_res,
                )),
                Ok(()),
            ),
            other => (other, Err(ENOENT)),
        })
    }

    /// Takes an entry at the given offset from [`DescriptorState::Allocated`] to
    /// [`DescriptorState::Reserved`].
    ///
    /// Returns the size of the existing entry, its debug id, and the data associated with it.
    pub(crate) fn reserve_existing(&mut self, offset: usize) -> Result<(usize, usize, Option<T>)> {
        let desc = self.tree.get_mut(&offset).ok_or_else(|| {
            pr_warn!(
                "ENOENT from range_alloc.reserve_existing - offset: {}",
                offset
            );
            ENOENT
        })?;

        let (debug_id, data) = desc.try_change_state(|state| match state {
            Some((DescriptorState::Allocated(allocation), free_node_res)) => {
                let (reservation, data) = allocation.deallocate();
                let debug_id = reservation.debug_id;
                (
                    Some((DescriptorState::Reserved(reservation), free_node_res)),
                    Ok((debug_id, data)),
                )
            }
            other => {
                pr_warn!(
                    "ENOENT from range_alloc.reserve_existing - offset: {}",
                    offset
                );
                (other, Err(ENOENT))
            }
        })?;

        Ok((desc.size, debug_id, data))
    }

    /// Call the provided callback at every allocated region.
    ///
    /// This destroys the range allocator. Used only during shutdown.
    pub(crate) fn take_for_each<F: Fn(usize, usize, usize, Option<T>)>(&mut self, callback: F) {
        for (_, desc) in self.tree.iter_mut() {
            if let Some((DescriptorState::Allocated(allocation), _)) = &mut desc.state {
                callback(
                    desc.offset,
                    desc.size,
                    allocation.debug_id(),
                    allocation.take(),
                );
            }
        }
    }

    /// Counts the number and total size of oneway buffers allocated by the calling process, and
    /// reports whether they exceed the oneway-spam thresholds.
    ///
    /// The idea is that once we cross the threshold, whoever is responsible
    /// for the low async space is likely to try to send another async transaction,
    /// and at some point we'll catch them in the act. This is more efficient
    /// than keeping a map per pid.
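    ///
    /// Illustrative numbers (not taken from this code): with a 4 MiB mmap, low async space is
    /// reported once the calling pid holds more than 50 oneway buffers, or more than 1 MiB
    /// (a quarter of the mmap) of oneway allocations.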
    fn low_oneway_space(&self, calling_pid: Pid) -> bool {
        let mut total_alloc_size = 0;
        let mut num_buffers = 0;
        for (_, desc) in self.tree.iter() {
            if let Some((state, _)) = &desc.state {
                if state.is_oneway() && state.pid() == calling_pid {
                    total_alloc_size += desc.size;
                    num_buffers += 1;
                }
            }
        }

        // Warn if this pid has more than 50 transactions, or more than 50% of
        // async space (which is 25% of total buffer size). Oneway spam is only
        // detected when the threshold is exceeded.
        num_buffers > 50 || total_alloc_size > self.size / 4
    }
}

type TreeDescriptorState<T> = (DescriptorState<T>, FreeNodeRes);
struct Descriptor<T> {
    size: usize,
    offset: usize,
    state: Option<TreeDescriptorState<T>>,
}

impl<T> Descriptor<T> {
    fn new(offset: usize, size: usize) -> Self {
        Self {
            size,
            offset,
            state: None,
        }
    }

    fn try_change_state<F, Data>(&mut self, f: F) -> Result<Data>
    where
        F: FnOnce(Option<TreeDescriptorState<T>>) -> (Option<TreeDescriptorState<T>>, Result<Data>),
    {
        let (new_state, result) = f(self.state.take());
        self.state = new_state;
        result
    }
}

// (Descriptor.size, Descriptor.offset)
type FreeKey = (usize, usize);
type FreeNodeRes = RBTreeNodeReservation<FreeKey, ()>;

/// An allocation for use by `reserve_new`.
pub(crate) struct ReserveNewTreeAlloc<T> {
    tree_node_res: RBTreeNodeReservation<usize, Descriptor<T>>,
    free_tree_node_res: FreeNodeRes,
    desc_node_res: FreeNodeRes,
}

impl<T> ReserveNewTreeAlloc<T> {
    pub(crate) fn try_new() -> Result<Self> {
        let tree_node_res = RBTreeNodeReservation::new(GFP_KERNEL)?;
        let free_tree_node_res = RBTreeNodeReservation::new(GFP_KERNEL)?;
        let desc_node_res = RBTreeNodeReservation::new(GFP_KERNEL)?;
        Ok(Self {
            tree_node_res,
            free_tree_node_res,
            desc_node_res,
        })
    }

    fn initialize(
        self,
        desc: Descriptor<T>,
    ) -> (
        RBTreeNode<usize, Descriptor<T>>,
        RBTreeNode<FreeKey, ()>,
        FreeNodeRes,
    ) {
        let size = desc.size;
        let offset = desc.offset;
        (
            self.tree_node_res.into_node(offset, desc),
            self.free_tree_node_res.into_node((size, offset), ()),
            self.desc_node_res,
        )
    }
}

/// An allocation for creating a tree from an `ArrayRangeAllocator`.
pub(crate) struct FromArrayAllocs<T> {
    tree: KVec<RBTreeNodeReservation<usize, Descriptor<T>>>,
    free_tree: KVec<RBTreeNodeReservation<FreeKey, ()>>,
}

impl<T> FromArrayAllocs<T> {
    pub(crate) fn try_new(len: usize) -> Result<Self> {
        // Each of the `len` existing ranges needs a tree node, and there can be at most
        // `len + 1` free gaps between and around them.
        let num_descriptors = 2 * len + 1;

        let mut tree = KVec::with_capacity(num_descriptors, GFP_KERNEL)?;
        for _ in 0..num_descriptors {
            tree.push(RBTreeNodeReservation::new(GFP_KERNEL)?, GFP_KERNEL)?;
        }

        let mut free_tree = KVec::with_capacity(num_descriptors, GFP_KERNEL)?;
        for _ in 0..num_descriptors {
            free_tree.push(RBTreeNodeReservation::new(GFP_KERNEL)?, GFP_KERNEL)?;
        }

        Ok(Self { tree, free_tree })
    }
}
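
// Illustrative sketch (not part of this file's build): `find_best_match` relies on the
// lexicographic ordering of `FreeKey = (size, offset)`, so a lower-bound lookup on
// `(size, 0)` lands on the smallest free range whose size is at least `size`, with ties
// broken by lowest offset. A userspace analogue using `std::collections::BTreeMap`:
//
//     use std::collections::BTreeMap;
//
//     fn best_fit(free: &BTreeMap<(usize, usize), ()>, want: usize) -> Option<(usize, usize)> {
//         // Smallest key that is >= (want, 0), i.e. the best-fitting free range.
//         free.range((want, 0)..).next().map(|(&key, _)| key)
//     }
//
//     let mut free = BTreeMap::new();
//     free.insert((0x80, 0x1000), ());  // 0x80-byte free range at offset 0x1000
//     free.insert((0x200, 0x3000), ()); // 0x200-byte free range at offset 0x3000
//     assert_eq!(best_fit(&free, 0x100), Some((0x200, 0x3000)));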