1 // SPDX-License-Identifier: GPL-2.0 2 #include <linux/mm.h> 3 #include <linux/mmzone.h> 4 #include <linux/page_reporting.h> 5 #include <linux/gfp.h> 6 #include <linux/export.h> 7 #include <linux/module.h> 8 #include <linux/delay.h> 9 #include <linux/scatterlist.h> 10 11 #include "page_reporting.h" 12 #include "internal.h" 13 14 unsigned int page_reporting_order = MAX_ORDER; 15 module_param(page_reporting_order, uint, 0644); 16 MODULE_PARM_DESC(page_reporting_order, "Set page reporting order"); 17 18 #define PAGE_REPORTING_DELAY (2 * HZ) 19 static struct page_reporting_dev_info __rcu *pr_dev_info __read_mostly; 20 21 enum { 22 PAGE_REPORTING_IDLE = 0, 23 PAGE_REPORTING_REQUESTED, 24 PAGE_REPORTING_ACTIVE 25 }; 26 27 /* request page reporting */ 28 static void 29 __page_reporting_request(struct page_reporting_dev_info *prdev) 30 { 31 unsigned int state; 32 33 /* Check to see if we are in desired state */ 34 state = atomic_read(&prdev->state); 35 if (state == PAGE_REPORTING_REQUESTED) 36 return; 37 38 /* 39 * If reporting is already active there is nothing we need to do. 40 * Test against 0 as that represents PAGE_REPORTING_IDLE. 41 */ 42 state = atomic_xchg(&prdev->state, PAGE_REPORTING_REQUESTED); 43 if (state != PAGE_REPORTING_IDLE) 44 return; 45 46 /* 47 * Delay the start of work to allow a sizable queue to build. For 48 * now we are limiting this to running no more than once every 49 * couple of seconds. 50 */ 51 schedule_delayed_work(&prdev->work, PAGE_REPORTING_DELAY); 52 } 53 54 /* notify prdev of free page reporting request */ 55 void __page_reporting_notify(void) 56 { 57 struct page_reporting_dev_info *prdev; 58 59 /* 60 * We use RCU to protect the pr_dev_info pointer. In almost all 61 * cases this should be present, however in the unlikely case of 62 * a shutdown this will be NULL and we should exit. 63 */ 64 rcu_read_lock(); 65 prdev = rcu_dereference(pr_dev_info); 66 if (likely(prdev)) 67 __page_reporting_request(prdev); 68 69 rcu_read_unlock(); 70 } 71 72 static void 73 page_reporting_drain(struct page_reporting_dev_info *prdev, 74 struct scatterlist *sgl, unsigned int nents, bool reported) 75 { 76 struct scatterlist *sg = sgl; 77 78 /* 79 * Drain the now reported pages back into their respective 80 * free lists/areas. We assume at least one page is populated. 81 */ 82 do { 83 struct page *page = sg_page(sg); 84 int mt = get_pageblock_migratetype(page); 85 unsigned int order = get_order(sg->length); 86 87 __putback_isolated_page(page, order, mt); 88 89 /* If the pages were not reported due to error skip flagging */ 90 if (!reported) 91 continue; 92 93 /* 94 * If page was not comingled with another page we can 95 * consider the result to be "reported" since the page 96 * hasn't been modified, otherwise we will need to 97 * report on the new larger page when we make our way 98 * up to that higher order. 99 */ 100 if (PageBuddy(page) && buddy_order(page) == order) 101 __SetPageReported(page); 102 } while ((sg = sg_next(sg))); 103 104 /* reinitialize scatterlist now that it is empty */ 105 sg_init_table(sgl, nents); 106 } 107 108 /* 109 * The page reporting cycle consists of 4 stages, fill, report, drain, and 110 * idle. We will cycle through the first 3 stages until we cannot obtain a 111 * full scatterlist of pages, in that case we will switch to idle. 112 */ 113 static int 114 page_reporting_cycle(struct page_reporting_dev_info *prdev, struct zone *zone, 115 unsigned int order, unsigned int mt, 116 struct scatterlist *sgl, unsigned int *offset) 117 { 118 struct free_area *area = &zone->free_area[order]; 119 struct list_head *list = &area->free_list[mt]; 120 unsigned int page_len = PAGE_SIZE << order; 121 struct page *page, *next; 122 long budget; 123 int err = 0; 124 125 /* 126 * Perform early check, if free area is empty there is 127 * nothing to process so we can skip this free_list. 128 */ 129 if (list_empty(list)) 130 return err; 131 132 spin_lock_irq(&zone->lock); 133 134 /* 135 * Limit how many calls we will be making to the page reporting 136 * device for this list. By doing this we avoid processing any 137 * given list for too long. 138 * 139 * The current value used allows us enough calls to process over a 140 * sixteenth of the current list plus one additional call to handle 141 * any pages that may have already been present from the previous 142 * list processed. This should result in us reporting all pages on 143 * an idle system in about 30 seconds. 144 * 145 * The division here should be cheap since PAGE_REPORTING_CAPACITY 146 * should always be a power of 2. 147 */ 148 budget = DIV_ROUND_UP(area->nr_free, PAGE_REPORTING_CAPACITY * 16); 149 150 /* loop through free list adding unreported pages to sg list */ 151 list_for_each_entry_safe(page, next, list, lru) { 152 /* We are going to skip over the reported pages. */ 153 if (PageReported(page)) 154 continue; 155 156 /* 157 * If we fully consumed our budget then update our 158 * state to indicate that we are requesting additional 159 * processing and exit this list. 160 */ 161 if (budget < 0) { 162 atomic_set(&prdev->state, PAGE_REPORTING_REQUESTED); 163 next = page; 164 break; 165 } 166 167 /* Attempt to pull page from list and place in scatterlist */ 168 if (*offset) { 169 if (!__isolate_free_page(page, order)) { 170 next = page; 171 break; 172 } 173 174 /* Add page to scatter list */ 175 --(*offset); 176 sg_set_page(&sgl[*offset], page, page_len, 0); 177 178 continue; 179 } 180 181 /* 182 * Make the first non-reported page in the free list 183 * the new head of the free list before we release the 184 * zone lock. 185 */ 186 if (!list_is_first(&page->lru, list)) 187 list_rotate_to_front(&page->lru, list); 188 189 /* release lock before waiting on report processing */ 190 spin_unlock_irq(&zone->lock); 191 192 /* begin processing pages in local list */ 193 err = prdev->report(prdev, sgl, PAGE_REPORTING_CAPACITY); 194 195 /* reset offset since the full list was reported */ 196 *offset = PAGE_REPORTING_CAPACITY; 197 198 /* update budget to reflect call to report function */ 199 budget--; 200 201 /* reacquire zone lock and resume processing */ 202 spin_lock_irq(&zone->lock); 203 204 /* flush reported pages from the sg list */ 205 page_reporting_drain(prdev, sgl, PAGE_REPORTING_CAPACITY, !err); 206 207 /* 208 * Reset next to first entry, the old next isn't valid 209 * since we dropped the lock to report the pages 210 */ 211 next = list_first_entry(list, struct page, lru); 212 213 /* exit on error */ 214 if (err) 215 break; 216 } 217 218 /* Rotate any leftover pages to the head of the freelist */ 219 if (!list_entry_is_head(next, list, lru) && !list_is_first(&next->lru, list)) 220 list_rotate_to_front(&next->lru, list); 221 222 spin_unlock_irq(&zone->lock); 223 224 return err; 225 } 226 227 static int 228 page_reporting_process_zone(struct page_reporting_dev_info *prdev, 229 struct scatterlist *sgl, struct zone *zone) 230 { 231 unsigned int order, mt, leftover, offset = PAGE_REPORTING_CAPACITY; 232 unsigned long watermark; 233 int err = 0; 234 235 /* Generate minimum watermark to be able to guarantee progress */ 236 watermark = low_wmark_pages(zone) + 237 (PAGE_REPORTING_CAPACITY << page_reporting_order); 238 239 /* 240 * Cancel request if insufficient free memory or if we failed 241 * to allocate page reporting statistics for the zone. 242 */ 243 if (!zone_watermark_ok(zone, 0, watermark, 0, ALLOC_CMA)) 244 return err; 245 246 /* Process each free list starting from lowest order/mt */ 247 for (order = page_reporting_order; order < MAX_ORDER; order++) { 248 for (mt = 0; mt < MIGRATE_TYPES; mt++) { 249 /* We do not pull pages from the isolate free list */ 250 if (is_migrate_isolate(mt)) 251 continue; 252 253 err = page_reporting_cycle(prdev, zone, order, mt, 254 sgl, &offset); 255 if (err) 256 return err; 257 } 258 } 259 260 /* report the leftover pages before going idle */ 261 leftover = PAGE_REPORTING_CAPACITY - offset; 262 if (leftover) { 263 sgl = &sgl[offset]; 264 err = prdev->report(prdev, sgl, leftover); 265 266 /* flush any remaining pages out from the last report */ 267 spin_lock_irq(&zone->lock); 268 page_reporting_drain(prdev, sgl, leftover, !err); 269 spin_unlock_irq(&zone->lock); 270 } 271 272 return err; 273 } 274 275 static void page_reporting_process(struct work_struct *work) 276 { 277 struct delayed_work *d_work = to_delayed_work(work); 278 struct page_reporting_dev_info *prdev = 279 container_of(d_work, struct page_reporting_dev_info, work); 280 int err = 0, state = PAGE_REPORTING_ACTIVE; 281 struct scatterlist *sgl; 282 struct zone *zone; 283 284 /* 285 * Change the state to "Active" so that we can track if there is 286 * anyone requests page reporting after we complete our pass. If 287 * the state is not altered by the end of the pass we will switch 288 * to idle and quit scheduling reporting runs. 289 */ 290 atomic_set(&prdev->state, state); 291 292 /* allocate scatterlist to store pages being reported on */ 293 sgl = kmalloc_array(PAGE_REPORTING_CAPACITY, sizeof(*sgl), GFP_KERNEL); 294 if (!sgl) 295 goto err_out; 296 297 sg_init_table(sgl, PAGE_REPORTING_CAPACITY); 298 299 for_each_zone(zone) { 300 err = page_reporting_process_zone(prdev, sgl, zone); 301 if (err) 302 break; 303 } 304 305 kfree(sgl); 306 err_out: 307 /* 308 * If the state has reverted back to requested then there may be 309 * additional pages to be processed. We will defer for 2s to allow 310 * more pages to accumulate. 311 */ 312 state = atomic_cmpxchg(&prdev->state, state, PAGE_REPORTING_IDLE); 313 if (state == PAGE_REPORTING_REQUESTED) 314 schedule_delayed_work(&prdev->work, PAGE_REPORTING_DELAY); 315 } 316 317 static DEFINE_MUTEX(page_reporting_mutex); 318 DEFINE_STATIC_KEY_FALSE(page_reporting_enabled); 319 320 int page_reporting_register(struct page_reporting_dev_info *prdev) 321 { 322 int err = 0; 323 324 mutex_lock(&page_reporting_mutex); 325 326 /* nothing to do if already in use */ 327 if (rcu_access_pointer(pr_dev_info)) { 328 err = -EBUSY; 329 goto err_out; 330 } 331 332 /* 333 * Update the page reporting order if it's specified by driver. 334 * Otherwise, it falls back to @pageblock_order. 335 */ 336 page_reporting_order = prdev->order ? : pageblock_order; 337 338 /* initialize state and work structures */ 339 atomic_set(&prdev->state, PAGE_REPORTING_IDLE); 340 INIT_DELAYED_WORK(&prdev->work, &page_reporting_process); 341 342 /* Begin initial flush of zones */ 343 __page_reporting_request(prdev); 344 345 /* Assign device to allow notifications */ 346 rcu_assign_pointer(pr_dev_info, prdev); 347 348 /* enable page reporting notification */ 349 if (!static_key_enabled(&page_reporting_enabled)) { 350 static_branch_enable(&page_reporting_enabled); 351 pr_info("Free page reporting enabled\n"); 352 } 353 err_out: 354 mutex_unlock(&page_reporting_mutex); 355 356 return err; 357 } 358 EXPORT_SYMBOL_GPL(page_reporting_register); 359 360 void page_reporting_unregister(struct page_reporting_dev_info *prdev) 361 { 362 mutex_lock(&page_reporting_mutex); 363 364 if (rcu_access_pointer(pr_dev_info) == prdev) { 365 /* Disable page reporting notification */ 366 RCU_INIT_POINTER(pr_dev_info, NULL); 367 synchronize_rcu(); 368 369 /* Flush any existing work, and lock it out */ 370 cancel_delayed_work_sync(&prdev->work); 371 } 372 373 mutex_unlock(&page_reporting_mutex); 374 } 375 EXPORT_SYMBOL_GPL(page_reporting_unregister); 376