// SPDX-License-Identifier: GPL-2.0
#include <linux/mm.h>
#include <linux/mmzone.h>
#include <linux/page_reporting.h>
#include <linux/gfp.h>
#include <linux/export.h>
#include <linux/module.h>
#include <linux/delay.h>
#include <linux/scatterlist.h>

#include "page_reporting.h"
#include "internal.h"

/* Initialize to an unsupported value */
unsigned int page_reporting_order = -1;

static int page_order_update_notify(const char *val, const struct kernel_param *kp)
{
	/*
	 * If param is set beyond this limit, order is set to default
	 * pageblock_order value
	 */
	return param_set_uint_minmax(val, kp, 0, MAX_PAGE_ORDER);
}

static const struct kernel_param_ops page_reporting_param_ops = {
	.set = &page_order_update_notify,
	/*
	 * For the get op, use param_get_int instead of param_get_uint.
	 * This is to make sure that when unset the initialized value of
	 * -1 is shown correctly
	 */
	.get = &param_get_int,
};

module_param_cb(page_reporting_order, &page_reporting_param_ops,
		&page_reporting_order, 0644);
MODULE_PARM_DESC(page_reporting_order, "Set page reporting order");

/*
 * This symbol is also a kernel parameter. Export the page_reporting_order
 * symbol so that other drivers can access it to control order values without
 * having to introduce another configurable parameter. Only one driver can
 * register with the page_reporting driver for the service, so we have just
 * one control parameter for the use case (which can be accessed in both
 * drivers).
 */
EXPORT_SYMBOL_GPL(page_reporting_order);
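
/*
 * Illustrative usage sketch (the value 9 below is only an example): since
 * this code is built in when enabled, the parameter is prefixed with
 * "page_reporting", so the order can be forced on the kernel command line
 * or adjusted at runtime through sysfs:
 *
 *	page_reporting.page_reporting_order=9
 *	echo 9 > /sys/module/page_reporting/parameters/page_reporting_order
 */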

#define PAGE_REPORTING_DELAY	(2 * HZ)
static struct page_reporting_dev_info __rcu *pr_dev_info __read_mostly;

enum {
	PAGE_REPORTING_IDLE = 0,
	PAGE_REPORTING_REQUESTED,
	PAGE_REPORTING_ACTIVE
};

/* request page reporting */
static void
__page_reporting_request(struct page_reporting_dev_info *prdev)
{
	unsigned int state;

	/* Check to see if we are in desired state */
	state = atomic_read(&prdev->state);
	if (state == PAGE_REPORTING_REQUESTED)
		return;

	/*
	 * If reporting is already active there is nothing we need to do.
	 * Test against 0 as that represents PAGE_REPORTING_IDLE.
	 */
	state = atomic_xchg(&prdev->state, PAGE_REPORTING_REQUESTED);
	if (state != PAGE_REPORTING_IDLE)
		return;

	/*
	 * Delay the start of work to allow a sizable queue to build. For
	 * now we are limiting this to running no more than once every
	 * couple of seconds.
	 */
	schedule_delayed_work(&prdev->work, PAGE_REPORTING_DELAY);
}

/* notify prdev of free page reporting request */
void __page_reporting_notify(void)
{
	struct page_reporting_dev_info *prdev;

	/*
	 * We use RCU to protect the pr_dev_info pointer. In almost all
	 * cases this should be present, however in the unlikely case of
	 * a shutdown this will be NULL and we should exit.
	 */
	rcu_read_lock();
	prdev = rcu_dereference(pr_dev_info);
	if (likely(prdev))
		__page_reporting_request(prdev);

	rcu_read_unlock();
}

static void
page_reporting_drain(struct page_reporting_dev_info *prdev,
		     struct scatterlist *sgl, unsigned int nents, bool reported)
{
	struct scatterlist *sg = sgl;

	/*
	 * Drain the now reported pages back into their respective
	 * free lists/areas. We assume at least one page is populated.
	 */
	do {
		struct page *page = sg_page(sg);
		int mt = get_pageblock_migratetype(page);
		unsigned int order = get_order(sg->length);

		__putback_isolated_page(page, order, mt);

		/* If the pages were not reported due to error skip flagging */
		if (!reported)
			continue;

		/*
		 * If page was not comingled with another page we can
		 * consider the result to be "reported" since the page
		 * hasn't been modified, otherwise we will need to
		 * report on the new larger page when we make our way
		 * up to that higher order.
		 */
		if (PageBuddy(page) && buddy_order(page) == order)
			__SetPageReported(page);
	} while ((sg = sg_next(sg)));

	/* reinitialize scatterlist now that it is empty */
	sg_init_table(sgl, nents);
}

/*
 * The page reporting cycle consists of 4 stages, fill, report, drain, and
 * idle. We will cycle through the first 3 stages until we cannot obtain a
 * full scatterlist of pages, in that case we will switch to idle.
 */
static int
page_reporting_cycle(struct page_reporting_dev_info *prdev, struct zone *zone,
		     unsigned int order, unsigned int mt,
		     struct scatterlist *sgl, unsigned int *offset)
{
	struct free_area *area = &zone->free_area[order];
	struct list_head *list = &area->free_list[mt];
	unsigned int page_len = PAGE_SIZE << order;
	struct page *page, *next;
	long budget;
	int err = 0;

	/*
	 * Perform early check, if free area is empty there is
	 * nothing to process so we can skip this free_list.
	 */
	if (list_empty(list))
		return err;

	spin_lock_irq(&zone->lock);

	/*
	 * Limit how many calls we will be making to the page reporting
	 * device for this list. By doing this we avoid processing any
	 * given list for too long.
	 *
	 * The current value used allows us enough calls to process over a
	 * sixteenth of the current list plus one additional call to handle
	 * any pages that may have already been present from the previous
	 * list processed. This should result in us reporting all pages on
	 * an idle system in about 30 seconds.
	 *
	 * The division here should be cheap since PAGE_REPORTING_CAPACITY
	 * should always be a power of 2.
	 */
	budget = DIV_ROUND_UP(area->nr_free, PAGE_REPORTING_CAPACITY * 16);

	/* loop through free list adding unreported pages to sg list */
	list_for_each_entry_safe(page, next, list, lru) {
		/* We are going to skip over the reported pages. */
		if (PageReported(page))
			continue;

		/*
		 * If we fully consumed our budget then update our
		 * state to indicate that we are requesting additional
		 * processing and exit this list.
		 */
		if (budget < 0) {
			atomic_set(&prdev->state, PAGE_REPORTING_REQUESTED);
			next = page;
			break;
		}

		/* Attempt to pull page from list and place in scatterlist */
		if (*offset) {
			if (!__isolate_free_page(page, order)) {
				next = page;
				break;
			}

			/* Add page to scatter list */
			--(*offset);
			sg_set_page(&sgl[*offset], page, page_len, 0);

			continue;
		}

		/*
		 * Make the first non-reported page in the free list
		 * the new head of the free list before we release the
		 * zone lock.
		 */
		if (!list_is_first(&page->lru, list))
			list_rotate_to_front(&page->lru, list);

		/* release lock before waiting on report processing */
		spin_unlock_irq(&zone->lock);

		/* begin processing pages in local list */
		err = prdev->report(prdev, sgl, PAGE_REPORTING_CAPACITY);

		/* reset offset since the full list was reported */
		*offset = PAGE_REPORTING_CAPACITY;

		/* update budget to reflect call to report function */
		budget--;

		/* reacquire zone lock and resume processing */
		spin_lock_irq(&zone->lock);

		/* flush reported pages from the sg list */
		page_reporting_drain(prdev, sgl, PAGE_REPORTING_CAPACITY, !err);

		/*
		 * Reset next to first entry, the old next isn't valid
		 * since we dropped the lock to report the pages
		 */
		next = list_first_entry(list, struct page, lru);

		/* exit on error */
		if (err)
			break;
	}

	/* Rotate any leftover pages to the head of the freelist */
	if (!list_entry_is_head(next, list, lru) && !list_is_first(&next->lru, list))
		list_rotate_to_front(&next->lru, list);

	spin_unlock_irq(&zone->lock);

	return err;
}

static int
page_reporting_process_zone(struct page_reporting_dev_info *prdev,
			    struct scatterlist *sgl, struct zone *zone)
{
	unsigned int order, mt, leftover, offset = PAGE_REPORTING_CAPACITY;
	unsigned long watermark;
	int err = 0;

	/* Generate minimum watermark to be able to guarantee progress */
	watermark = low_wmark_pages(zone) +
		    (PAGE_REPORTING_CAPACITY << page_reporting_order);

	/*
	 * Cancel request if insufficient free memory or if we failed
	 * to allocate page reporting statistics for the zone.
	 */
	if (!zone_watermark_ok(zone, 0, watermark, 0, ALLOC_CMA))
		return err;

	/* Process each free list starting from lowest order/mt */
	for (order = page_reporting_order; order < NR_PAGE_ORDERS; order++) {
		for (mt = 0; mt < MIGRATE_TYPES; mt++) {
			/* We do not pull pages from the isolate free list */
			if (is_migrate_isolate(mt))
				continue;

			err = page_reporting_cycle(prdev, zone, order, mt,
						   sgl, &offset);
			if (err)
				return err;
		}
	}

	/* report the leftover pages before going idle */
	leftover = PAGE_REPORTING_CAPACITY - offset;
	if (leftover) {
		sgl = &sgl[offset];
		err = prdev->report(prdev, sgl, leftover);

		/* flush any remaining pages out from the last report */
		spin_lock_irq(&zone->lock);
		page_reporting_drain(prdev, sgl, leftover, !err);
		spin_unlock_irq(&zone->lock);
	}

	return err;
}

static void page_reporting_process(struct work_struct *work)
{
	struct delayed_work *d_work = to_delayed_work(work);
	struct page_reporting_dev_info *prdev =
		container_of(d_work, struct page_reporting_dev_info, work);
	int err = 0, state = PAGE_REPORTING_ACTIVE;
	struct scatterlist *sgl;
	struct zone *zone;

	/*
	 * Change the state to "Active" so that we can track whether anyone
	 * requests page reporting after we complete our pass. If the state
	 * is not altered by the end of the pass we will switch to idle and
	 * quit scheduling reporting runs.
	 */
	atomic_set(&prdev->state, state);

	/* allocate scatterlist to store pages being reported on */
	sgl = kmalloc_array(PAGE_REPORTING_CAPACITY, sizeof(*sgl), GFP_KERNEL);
	if (!sgl)
		goto err_out;

	sg_init_table(sgl, PAGE_REPORTING_CAPACITY);

	for_each_zone(zone) {
		err = page_reporting_process_zone(prdev, sgl, zone);
		if (err)
			break;
	}

	kfree(sgl);
err_out:
	/*
	 * If the state has reverted back to requested then there may be
	 * additional pages to be processed. We will defer for 2s to allow
	 * more pages to accumulate.
	 */
	state = atomic_cmpxchg(&prdev->state, state, PAGE_REPORTING_IDLE);
	if (state == PAGE_REPORTING_REQUESTED)
		schedule_delayed_work(&prdev->work, PAGE_REPORTING_DELAY);
}

static DEFINE_MUTEX(page_reporting_mutex);
DEFINE_STATIC_KEY_FALSE(page_reporting_enabled);

int page_reporting_register(struct page_reporting_dev_info *prdev)
{
	int err = 0;

	mutex_lock(&page_reporting_mutex);

	/* nothing to do if already in use */
	if (rcu_dereference_protected(pr_dev_info,
				lockdep_is_held(&page_reporting_mutex))) {
		err = -EBUSY;
		goto err_out;
	}

	/*
	 * If the page_reporting_order value is not set, we check if
	 * an order is provided from the driver that is performing the
	 * registration. If that is not provided either, we default to
	 * pageblock_order.
	 */

	if (page_reporting_order == -1) {
		if (prdev->order > 0 && prdev->order <= MAX_PAGE_ORDER)
			page_reporting_order = prdev->order;
		else
			page_reporting_order = pageblock_order;
	}

	/* initialize state and work structures */
	atomic_set(&prdev->state, PAGE_REPORTING_IDLE);
	INIT_DELAYED_WORK(&prdev->work, &page_reporting_process);

	/* Begin initial flush of zones */
	__page_reporting_request(prdev);

	/* Assign device to allow notifications */
	rcu_assign_pointer(pr_dev_info, prdev);

	/* enable page reporting notification */
	if (!static_key_enabled(&page_reporting_enabled)) {
		static_branch_enable(&page_reporting_enabled);
		pr_info("Free page reporting enabled\n");
	}
err_out:
	mutex_unlock(&page_reporting_mutex);

	return err;
}
EXPORT_SYMBOL_GPL(page_reporting_register);

void page_reporting_unregister(struct page_reporting_dev_info *prdev)
{
	mutex_lock(&page_reporting_mutex);

	if (prdev == rcu_dereference_protected(pr_dev_info,
					lockdep_is_held(&page_reporting_mutex))) {
		/* Disable page reporting notification */
		RCU_INIT_POINTER(pr_dev_info, NULL);
		synchronize_rcu();

		/* Flush any existing work, and lock it out */
		cancel_delayed_work_sync(&prdev->work);
	}

	mutex_unlock(&page_reporting_mutex);
}
EXPORT_SYMBOL_GPL(page_reporting_unregister);
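
/*
 * A minimal consumer sketch (hypothetical, for illustration only; the
 * example_* names below are not part of this file or of any in-tree
 * driver): a device that wants free pages reported to it supplies a
 * report() callback that receives up to PAGE_REPORTING_CAPACITY
 * scatterlist entries of isolated free pages and returns 0 on success
 * (non-zero aborts the current reporting pass), then registers once,
 * typically at probe time, and unregisters on teardown.
 */
static int example_report_pages(struct page_reporting_dev_info *prdev,
				struct scatterlist *sgl, unsigned int nents)
{
	struct scatterlist *sg;
	unsigned int i;

	for_each_sg(sgl, sg, nents, i) {
		/* hand each physical range off to the hypervisor here */
		pr_debug("reporting pfn %lx, %u pages\n",
			 page_to_pfn(sg_page(sg)), sg->length >> PAGE_SHIFT);
	}

	return 0;
}

static struct page_reporting_dev_info example_prdev = {
	.report = example_report_pages,
	/* 0 lets the core fall back to pageblock_order (if the param is unset) */
	.order = 0,
};

static int __maybe_unused example_probe(void)
{
	/* returns -EBUSY if another device has already registered */
	return page_reporting_register(&example_prdev);
}

static void __maybe_unused example_remove(void)
{
	page_reporting_unregister(&example_prdev);
}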