/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*
 * Page Retire - Big Theory Statement.
 *
 * This file handles removing sections of faulty memory from use when the
 * user land FMA Diagnosis Engine requests that a page be removed or when
 * a CE or UE is detected by the hardware.
 *
 * In the bad old days, the kernel side of Page Retire did a lot of the work
 * on its own. Now, with the DE keeping track of errors, the kernel side is
 * rather simple-minded on most platforms.
 *
 * Errors are all reflected to the DE, and after digesting the error and
 * looking at all previously reported errors, the DE decides what should
 * be done about the current error. If the DE wants a particular page to
 * be retired, then the kernel page retire code is invoked via an ioctl.
 * On non-FMA platforms, the ue_drain and ce_drain paths end up calling
 * page retire to handle the error. Since page retire is just a simple
 * mechanism, it doesn't need to differentiate between the different callers.
 *
 * The p_toxic field in the page_t is used to indicate which errors have
 * occurred and what action has been taken on a given page. Because errors are
 * reported without regard to the locked state of a page, no locks are used
 * to SET the error bits in p_toxic. However, in order to clear the error
 * bits, the page_t must be held exclusively locked.
 *
 * When page_retire() is called, it must be able to acquire locks, sleep, etc.
 * It must not be called from high-level interrupt context.
 *
 * Depending on how the requested page is being used at the time of the retire
 * request (and on the availability of sufficient system resources), the page
 * may be retired immediately, or just marked for retirement later. For
 * example, locked pages are marked, while free pages are retired. Multiple
 * requests may be made to retire the same page, although there is no need
 * to: once the p_toxic flags are set, the page will be retired as soon as it
 * can be exclusively locked.
 *
 * The retire mechanism is driven centrally out of page_unlock(). To expedite
 * the retirement of pages, further requests for SE_SHARED locks are denied
 * as long as a page retirement is pending. In addition, as long as pages are
 * pending retirement, a background thread runs periodically trying to retire
 * those pages. Pages which could not be retired while the system is running
 * are scrubbed prior to rebooting to avoid latent errors on the next boot.
 *
 * UE pages without persistent errors are scrubbed and returned to service.
 * Recidivist pages, as well as FMA-directed requests for retirement, result
 * in the page being taken out of service. Once the decision is made to take
 * a page out of service, the page is cleared, hashed onto the retired_pages
 * vnode, marked as retired, and it is unlocked. No other requesters (except
 * for unretire) are allowed to lock retired pages.
 *
 * The public routines return (sadly) 0 if they worked and a non-zero error
 * value if something went wrong. This is done for the ioctl side of the
 * world to allow errors to be reflected all the way out to user land. The
 * non-zero values are explained in comments atop each function.
 */
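
/*
 * For illustration, a minimal sketch (hypothetical code, not an actual
 * platform drain routine) of how a caller might use the front door and
 * interpret its result. The return values are the ones documented atop
 * page_retire() below:
 *
 *	static void
 *	hypothetical_ue_drain(uint64_t pa)
 *	{
 *		int err = page_retire(pa, PR_UE);
 *
 *		if (err == EAGAIN)
 *			cmn_err(CE_NOTE, "retire of 0x%llx pending",
 *			    (u_longlong_t)pa);
 *		else if (err == EIO)
 *			cmn_err(CE_NOTE, "0x%llx already retired or pending",
 *			    (u_longlong_t)pa);
 *		else if (err == EINVAL)
 *			cmn_err(CE_WARN, "0x%llx is not retirable memory",
 *			    (u_longlong_t)pa);
 *	}
 */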

/*
 * Things to fix:
 *
 *	1. Trying to retire non-relocatable kvp pages may result in a
 *	quagmire. This is because seg_kmem() no longer keeps its pages locked,
 *	and calls page_lookup() in the free path; since kvp pages are modified
 *	and don't have a usable backing store, page_retire() can't do anything
 *	with them, and we'll keep denying the lock to seg_kmem_free() in a
 *	vicious cycle. To prevent that, we don't deny locks to kvp pages, and
 *	hence only try to retire a page from page_unlock() in the free path.
 *	Since most kernel pages are indefinitely held anyway, and don't
 *	participate in I/O, this is of little consequence.
 *
 *	2. Low memory situations will be interesting. If we don't have
 *	enough memory for page_relocate() to succeed, we won't be able to
 *	retire dirty pages; nobody will be able to push them out to disk
 *	either, since we aggressively deny the page lock. We could change
 *	fsflush so it can recognize this situation, grab the lock, and push
 *	the page out, where we'll catch it in the free path and retire it.
 *
 *	3. Beware of places that have code like this in them:
 *
 *		if (! page_tryupgrade(pp)) {
 *			page_unlock(pp);
 *			while (! page_lock(pp, SE_EXCL, NULL, P_RECLAIM)) {
 *				/ *NOTHING* /
 *			}
 *		}
 *		page_free(pp);
 *
 *	The problem is that pp can change identity right after the
 *	page_unlock() call. In particular, page_retire() can step in
 *	there, change pp's identity, and hash pp onto the retired_pages
 *	vnode.
 *
 *	Of course, other functions besides page_retire() can have the
 *	same effect. A kmem reader can waltz by, set up a mapping to the
 *	page, and then unlock the page. Page_free() will then go castors
 *	up. So if anybody is doing this, it's already a bug (see the
 *	sketch following this comment for one safer alternative).
 *
 *	4. mdboot()'s call into page_retire_mdboot() should probably be
 *	moved lower. Where the call is made now, we can get into trouble
 *	by scrubbing a kernel page that is then accessed later.
 */
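
/*
 * The sketch below shows one safer shape for the code in item 3 above:
 * instead of spinning on a page_t whose identity may have changed, drop
 * the lock and re-lookup the (vnode, offset) pair. This is illustrative
 * only; real callers must fit it to their own retry logic.
 *
 *	if (! page_tryupgrade(pp)) {
 *		vnode_t *vp = pp->p_vnode;
 *		u_offset_t off = pp->p_offset;
 *
 *		page_unlock(pp);
 *		pp = page_lookup(vp, off, SE_EXCL);
 *		if (pp == NULL) {
 *			return;		/ * page is gone; nothing to free * /
 *		}
 *	}
 *	page_free(pp);
 */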

#include <sys/types.h>
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/mman.h>
#include <sys/vnode.h>
#include <sys/vfs_opreg.h>
#include <sys/cmn_err.h>
#include <sys/ksynch.h>
#include <sys/thread.h>
#include <sys/disp.h>
#include <sys/ontrap.h>
#include <sys/vmsystm.h>
#include <sys/mem_config.h>
#include <sys/atomic.h>
#include <sys/callb.h>
#include <sys/kobj.h>
#include <vm/page.h>
#include <vm/vm_dep.h>
#include <vm/as.h>
#include <vm/hat.h>
#include <vm/seg_kmem.h>

/*
 * vnode for all pages which are retired from the VM system.
 */
vnode_t *retired_pages;

static int page_retire_pp_finish(page_t *, void *, uint_t);

/*
 * Make a list of all of the pages that have been marked for retirement
 * but are not yet retired. At system shutdown, we will scrub all of the
 * pages in the list in case there are outstanding UEs. Then, we
 * cross-check this list against the number of pages that are yet to be
 * retired, and if we find inconsistencies, we scan every page_t in the
 * whole system looking for any pages that need to be scrubbed for UEs.
 * The background thread also uses this queue to determine which pages
 * it should keep trying to retire.
 */
#ifdef DEBUG
#define PR_PENDING_QMAX 32
#else /* DEBUG */
#define PR_PENDING_QMAX 256
#endif /* DEBUG */
page_t  *pr_pending_q[PR_PENDING_QMAX];
kmutex_t pr_q_mutex;

/*
 * Page retire global kstats
 */
struct page_retire_kstat {
        kstat_named_t   pr_retired;
        kstat_named_t   pr_requested;
        kstat_named_t   pr_requested_free;
        kstat_named_t   pr_enqueue_fail;
        kstat_named_t   pr_dequeue_fail;
        kstat_named_t   pr_pending;
        kstat_named_t   pr_pending_kas;
        kstat_named_t   pr_failed;
        kstat_named_t   pr_failed_kernel;
        kstat_named_t   pr_limit;
        kstat_named_t   pr_limit_exceeded;
        kstat_named_t   pr_fma;
        kstat_named_t   pr_mce;
        kstat_named_t   pr_ue;
        kstat_named_t   pr_ue_cleared_retire;
        kstat_named_t   pr_ue_cleared_free;
        kstat_named_t   pr_ue_persistent;
        kstat_named_t   pr_unretired;
};

static struct page_retire_kstat page_retire_kstat = {
        { "pages_retired",              KSTAT_DATA_UINT64},
        { "pages_retire_request",       KSTAT_DATA_UINT64},
        { "pages_retire_request_free",  KSTAT_DATA_UINT64},
        { "pages_notenqueued",          KSTAT_DATA_UINT64},
        { "pages_notdequeued",          KSTAT_DATA_UINT64},
        { "pages_pending",              KSTAT_DATA_UINT64},
        { "pages_pending_kas",          KSTAT_DATA_UINT64},
        { "pages_deferred",             KSTAT_DATA_UINT64},
        { "pages_deferred_kernel",      KSTAT_DATA_UINT64},
        { "pages_limit",                KSTAT_DATA_UINT64},
        { "pages_limit_exceeded",       KSTAT_DATA_UINT64},
        { "pages_fma",                  KSTAT_DATA_UINT64},
        { "pages_multiple_ce",          KSTAT_DATA_UINT64},
        { "pages_ue",                   KSTAT_DATA_UINT64},
        { "pages_ue_cleared_retired",   KSTAT_DATA_UINT64},
        { "pages_ue_cleared_freed",     KSTAT_DATA_UINT64},
        { "pages_ue_persistent",        KSTAT_DATA_UINT64},
        { "pages_unretired",            KSTAT_DATA_UINT64},
};

static kstat_t  *page_retire_ksp = NULL;

#define PR_INCR_KSTAT(stat)     \
        atomic_inc_64(&(page_retire_kstat.stat.value.ui64))
#define PR_DECR_KSTAT(stat)     \
        atomic_dec_64(&(page_retire_kstat.stat.value.ui64))

#define PR_KSTAT_RETIRED_CE     (page_retire_kstat.pr_mce.value.ui64)
#define PR_KSTAT_RETIRED_FMA    (page_retire_kstat.pr_fma.value.ui64)
#define PR_KSTAT_RETIRED_NOTUE  (PR_KSTAT_RETIRED_CE + PR_KSTAT_RETIRED_FMA)
#define PR_KSTAT_PENDING        (page_retire_kstat.pr_pending.value.ui64)
#define PR_KSTAT_PENDING_KAS    (page_retire_kstat.pr_pending_kas.value.ui64)
#define PR_KSTAT_EQFAIL         (page_retire_kstat.pr_enqueue_fail.value.ui64)
#define PR_KSTAT_DQFAIL         (page_retire_kstat.pr_dequeue_fail.value.ui64)
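
/*
 * The named counters above are visible from userland, e.g. via kstat(1M):
 *
 *	$ kstat -m unix -n page_retire
 *
 * which prints the kstat installed by page_retire_init() below.
 */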

/*
 * page retire kstats to list all retired pages
 */
static int pr_list_kstat_update(kstat_t *ksp, int rw);
static int pr_list_kstat_snapshot(kstat_t *ksp, void *buf, int rw);
kmutex_t pr_list_kstat_mutex;

/*
 * Limit the number of multiple CE page retires.
 * The default is 0.1% of physmem, or 1 in 1000 pages. This is set in
 * basis points, where 100 basis points equals one percent.
 */
#define MCE_BPT 10
uint64_t        max_pages_retired_bps = MCE_BPT;
#define PAGE_RETIRE_LIMIT       ((physmem * max_pages_retired_bps) / 10000)
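
/*
 * Worked example: with the default of 10 basis points, a machine with
 * physmem of 1,048,576 pages (8 GB of 8K pages) has
 * PAGE_RETIRE_LIMIT = (1048576 * 10) / 10000 = 1048 pages; roughly one
 * page in a thousand may be retired for non-UE (FMA/multiple-CE) faults
 * before page_retire_limit() starts refusing requests.
 */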

/*
 * Control over the verbosity of page retirement.
 *
 * When set to zero (the default), no messages will be printed.
 * When set to one, summary messages will be printed.
 * When set > one, all messages will be printed.
 *
 * A value of one will trigger detailed messages for retirement operations,
 * and is intended as a platform tunable for processors where FMA's DE does
 * not run (e.g., spitfire). Values > one are intended for debugging only.
 */
int page_retire_messages = 0;

/*
 * Control whether or not we return scrubbed UE pages to service.
 * By default we do not since FMA wants to run its diagnostics first
 * and then ask us to unretire the page if it passes. Non-FMA platforms
 * may set this to zero so we will only retire recidivist pages. It should
 * not be changed by the user.
 */
int page_retire_first_ue = 1;

/*
 * Master enable for page retire. This prevents a CE or UE early in boot
 * from trying to retire a page before page_retire_init() has finished
 * setting things up. This is internal only and is not a tunable!
 */
static int pr_enable = 0;

static void (*memscrub_notify_func)(uint64_t);

#ifdef DEBUG
struct page_retire_debug {
        int prd_dup1;
        int prd_dup2;
        int prd_qdup;
        int prd_noaction;
        int prd_queued;
        int prd_notqueued;
        int prd_dequeue;
        int prd_top;
        int prd_locked;
        int prd_reloc;
        int prd_relocfail;
        int prd_mod;
        int prd_mod_late;
        int prd_kern;
        int prd_free;
        int prd_noreclaim;
        int prd_hashout;
        int prd_fma;
        int prd_uescrubbed;
        int prd_uenotscrubbed;
        int prd_mce;
        int prd_prlocked;
        int prd_prnotlocked;
        int prd_prretired;
        int prd_ulocked;
        int prd_unotretired;
        int prd_udestroy;
        int prd_uhashout;
        int prd_uunretired;
        int prd_unotlocked;
        int prd_checkhit;
        int prd_checkmiss_pend;
        int prd_checkmiss_noerr;
        int prd_tctop;
        int prd_tclocked;
        int prd_hunt;
        int prd_dohunt;
        int prd_earlyhunt;
        int prd_latehunt;
        int prd_nofreedemote;
        int prd_nodemote;
        int prd_demoted;
} pr_debug;

#define PR_DEBUG(foo)   ((pr_debug.foo)++)

/*
 * A type histogram. We record the incidence of the various toxic
 * flag combinations along with the interesting page attributes. The
 * goal is to get as many combinations as we can while driving all
 * pr_debug values nonzero (indicating we've exercised all possible
 * code paths across all possible page types). Not all combinations
 * will make sense -- e.g. PRT_MOD|PRT_KERNEL.
 *
 * pr_type offset bit encoding (when examining with a debugger): these
 * are byte offsets into the pr_types[] array below, i.e. the PRT_*
 * index times sizeof (int):
 *
 *   PRT_NAMED - 0x4
 *   PRT_KERNEL - 0x8
 *   PRT_FREE - 0x10
 *   PRT_MOD - 0x20
 *   PRT_FMA - 0x0
 *   PRT_MCE - 0x40
 *   PRT_UE - 0x80
 */

#define PRT_NAMED       0x01
#define PRT_KERNEL      0x02
#define PRT_FREE        0x04
#define PRT_MOD         0x08
#define PRT_FMA         0x00    /* yes, this is not a mistake */
#define PRT_MCE         0x10
#define PRT_UE          0x20
#define PRT_ALL         0x3F

int     pr_types[PRT_ALL+1];

#define PR_TYPES(pp)    {                       \
        int whichtype = 0;                      \
        if (pp->p_vnode)                        \
                whichtype |= PRT_NAMED;         \
        if (PP_ISKAS(pp))                       \
                whichtype |= PRT_KERNEL;        \
        if (PP_ISFREE(pp))                      \
                whichtype |= PRT_FREE;          \
        if (hat_ismod(pp))                      \
                whichtype |= PRT_MOD;           \
        if (pp->p_toxic & PR_UE)                \
                whichtype |= PRT_UE;            \
        if (pp->p_toxic & PR_MCE)               \
                whichtype |= PRT_MCE;           \
        pr_types[whichtype]++;                  \
}

int     recl_calls;
int     recl_mtbf = 3;
int     reloc_calls;
int     reloc_mtbf = 7;
int     pr_calls;
int     pr_mtbf = 15;

#define MTBF(v, f)      (((++(v)) & (f)) != (f))

#else /* DEBUG */

#define PR_DEBUG(foo)   /* nothing */
#define PR_TYPES(foo)   /* nothing */
#define MTBF(v, f)      (1)

#endif /* DEBUG */
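
/*
 * A note on MTBF() above: it is crude fault injection for DEBUG kernels.
 * With f = 2^n - 1, (((++v) & f) != f) is false exactly once every 2^n
 * calls. For example, with pr_mtbf = 15, one call in every 16 to
 * page_retire() deliberately skips the capture attempt so that the
 * deferred-retirement paths get regular exercise.
 */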

/*
 * page_retire_done() - completion processing
 *
 * Used by the page_retire code for common completion processing.
 * It keeps track of how many times a given result has happened,
 * and writes out an occasional message.
 *
 * May be called with a NULL pp (PRD_INVALID_PA case).
 */
#define PRD_INVALID_KEY         -1
#define PRD_SUCCESS             0
#define PRD_PENDING             1
#define PRD_FAILED              2
#define PRD_DUPLICATE           3
#define PRD_INVALID_PA          4
#define PRD_LIMIT               5
#define PRD_UE_SCRUBBED         6
#define PRD_UNR_SUCCESS         7
#define PRD_UNR_CANTLOCK        8
#define PRD_UNR_NOT             9

typedef struct page_retire_op {
        int     pr_key;         /* one of the PRD_* defines from above */
        int     pr_count;       /* How many times this has happened */
        int     pr_retval;      /* return value */
        int     pr_msglvl;      /* message level - when to print */
        char    *pr_message;    /* Cryptic message for field service */
} page_retire_op_t;

static page_retire_op_t page_retire_ops[] = {
        /* key                  count   retval  msglvl  message */
        {PRD_SUCCESS,           0,      0,      1,
                "Page 0x%08x.%08x removed from service"},
        {PRD_PENDING,           0,      EAGAIN, 2,
                "Page 0x%08x.%08x will be retired on free"},
        {PRD_FAILED,            0,      EAGAIN, 0, NULL},
        {PRD_DUPLICATE,         0,      EIO,    2,
                "Page 0x%08x.%08x already retired or pending"},
        {PRD_INVALID_PA,        0,      EINVAL, 2,
                "PA 0x%08x.%08x is not a relocatable page"},
        {PRD_LIMIT,             0,      0,      1,
                "Page 0x%08x.%08x not retired due to limit exceeded"},
        {PRD_UE_SCRUBBED,       0,      0,      1,
                "Previously reported error on page 0x%08x.%08x cleared"},
        {PRD_UNR_SUCCESS,       0,      0,      1,
                "Page 0x%08x.%08x returned to service"},
        {PRD_UNR_CANTLOCK,      0,      EAGAIN, 2,
                "Page 0x%08x.%08x could not be unretired"},
        {PRD_UNR_NOT,           0,      EIO,    2,
                "Page 0x%08x.%08x is not retired"},
        {PRD_INVALID_KEY,       0,      0,      0, NULL} /* MUST BE LAST! */
};

/*
 * Print a message if page_retire_messages is at or above the given
 * message level and a message string is present.
 */
#define PR_MESSAGE(debuglvl, msglvl, msg, pa)                           \
{                                                                       \
        uint64_t p = (uint64_t)pa;                                      \
        if (page_retire_messages >= msglvl && msg != NULL) {            \
                cmn_err(debuglvl, msg,                                  \
                    (uint32_t)(p >> 32), (uint32_t)p);                  \
        }                                                               \
}

/*
 * Note that multiple bits may be set in a single settoxic operation.
 * May be called without the page locked.
 */
void
page_settoxic(page_t *pp, uchar_t bits)
{
        atomic_or_8(&pp->p_toxic, bits);
}

/*
 * Note that multiple bits may be cleared in a single clrtoxic operation.
 * Must be called with the page exclusively locked to prevent races which
 * may attempt to retire a page without any toxic bits set.
 * Note that the PR_CAPTURE bit can be cleared without the exclusive lock
 * being held as there is a separate mutex which protects that bit.
 */
void
page_clrtoxic(page_t *pp, uchar_t bits)
{
        ASSERT((bits & PR_CAPTURE) || PAGE_EXCL(pp));
        atomic_and_8(&pp->p_toxic, ~bits);
}
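
/*
 * The asymmetry between the two functions above is the locking rule for
 * all p_toxic updates: bits may be ORed in from any context, but may only
 * be cleared while the page is held SE_EXCL (PR_CAPTURE, protected by its
 * own mutex, is the one exception). A sketch of the intended usage:
 *
 *	page_settoxic(pp, PR_UE);		/ * no lock required * /
 *	...
 *	if (page_try_reclaim_lock(pp, SE_EXCL, SE_RETIRED)) {
 *		page_clrtoxic(pp, PR_UE);	/ * SE_EXCL held * /
 *		page_unlock(pp);
 *	}
 */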

/*
 * Prints any page retire messages to the user, and decides what
 * error code is appropriate for the condition reported.
 */
static int
page_retire_done(page_t *pp, int code)
{
        page_retire_op_t *prop;
        uint64_t        pa = 0;
        int             i;

        if (pp != NULL) {
                pa = mmu_ptob((uint64_t)pp->p_pagenum);
        }

        prop = NULL;
        for (i = 0; page_retire_ops[i].pr_key != PRD_INVALID_KEY; i++) {
                if (page_retire_ops[i].pr_key == code) {
                        prop = &page_retire_ops[i];
                        break;
                }
        }

#ifdef DEBUG
        if (page_retire_ops[i].pr_key == PRD_INVALID_KEY) {
                cmn_err(CE_PANIC, "page_retire_done: Invalid opcode %d", code);
        }
#endif

        ASSERT(prop->pr_key == code);

        prop->pr_count++;

        PR_MESSAGE(CE_NOTE, prop->pr_msglvl, prop->pr_message, pa);
        if (pp != NULL) {
                page_settoxic(pp, PR_MSG);
        }

        return (prop->pr_retval);
}

/*
 * Act like page_destroy(), but instead of freeing the page, hash it onto
 * the retired_pages vnode, and mark it retired.
 *
 * For fun, we try to scrub the page until it's squeaky clean.
 * availrmem is adjusted here.
 */
static void
page_retire_destroy(page_t *pp)
{
        /* The page_t's own address serves as its unique vnode offset. */
        u_offset_t off = (u_offset_t)((uintptr_t)pp);

        ASSERT(PAGE_EXCL(pp));
        ASSERT(!PP_ISFREE(pp));
        ASSERT(pp->p_szc == 0);
        ASSERT(!hat_page_is_mapped(pp));
        ASSERT(!pp->p_vnode);

        page_clr_all_props(pp);
        pagescrub(pp, 0, MMU_PAGESIZE);

        pp->p_next = NULL;
        pp->p_prev = NULL;
        if (page_hashin(pp, retired_pages, off, NULL) == 0) {
                cmn_err(CE_PANIC, "retired page %p hashin failed", (void *)pp);
        }

        page_settoxic(pp, PR_RETIRED);
        PR_INCR_KSTAT(pr_retired);

        if (pp->p_toxic & PR_FMA) {
                PR_INCR_KSTAT(pr_fma);
        } else if (pp->p_toxic & PR_UE) {
                PR_INCR_KSTAT(pr_ue);
        } else {
                PR_INCR_KSTAT(pr_mce);
        }

        mutex_enter(&freemem_lock);
        availrmem--;
        mutex_exit(&freemem_lock);

        page_unlock(pp);
}

/*
 * Check whether the number of pages which have been retired already exceeds
 * the maximum allowable percentage of memory which may be retired.
 *
 * Returns 1 if the limit has been exceeded.
 */
static int
page_retire_limit(void)
{
        if (PR_KSTAT_RETIRED_NOTUE >= (uint64_t)PAGE_RETIRE_LIMIT) {
                PR_INCR_KSTAT(pr_limit_exceeded);
                return (1);
        }

        return (0);
}

#define MSG_DM  "Data Mismatch occurred at PA 0x%08x.%08x "             \
        "[ 0x%x != 0x%x ] while attempting to clear previously "        \
        "reported error; page removed from service"

#define MSG_UE  "Uncorrectable Error occurred at PA 0x%08x.%08x while " \
        "attempting to clear previously reported error; page removed " \
        "from service"

/*
 * Attempt to clear a UE from a page.
 * Returns 1 if the error has been successfully cleared.
 */
static int
page_clear_transient_ue(page_t *pp)
{
        caddr_t         kaddr;
        uint8_t         rb, wb;
        uint64_t        pa;
        uint32_t        pa_hi, pa_lo;
        on_trap_data_t  otd;
        int             errors = 0;
        int             i;

        ASSERT(PAGE_EXCL(pp));
        ASSERT(PP_PR_REQ(pp));
        ASSERT(pp->p_szc == 0);
        ASSERT(!hat_page_is_mapped(pp));

        /*
         * Clear the page and attempt to clear the UE. If we trap
         * on the next access to the page, we know the UE has recurred.
         */
        pagescrub(pp, 0, PAGESIZE);

        /*
         * Map the page and write a bunch of bit patterns to compare
         * what we wrote with what we read back. This isn't a perfect
         * test but it should be good enough to catch most of the
         * recurring UEs. If this fails to catch a recurrent UE, we'll
         * retire the page the next time we see a UE on the page.
         */
        kaddr = ppmapin(pp, PROT_READ|PROT_WRITE, (caddr_t)-1);

        pa = ptob((uint64_t)page_pptonum(pp));
        pa_hi = (uint32_t)(pa >> 32);
        pa_lo = (uint32_t)pa;

        /*
         * Disable preemption to prevent the off chance that
         * we migrate while in the middle of running through
         * the bit pattern and run on a different processor
         * than what we started on.
         */
        kpreempt_disable();

        /*
         * Fill the page with each (0x00, 0xFF] bit pattern, flushing
         * the cache in between reading and writing. We do this under
         * on_trap() protection to avoid recursion.
         */
        if (on_trap(&otd, OT_DATA_EC)) {
                PR_MESSAGE(CE_WARN, 1, MSG_UE, pa);
                errors = 1;
        } else {
                for (wb = 0xff; wb > 0; wb--) {
                        for (i = 0; i < PAGESIZE; i++) {
                                kaddr[i] = wb;
                        }

                        sync_data_memory(kaddr, PAGESIZE);

                        for (i = 0; i < PAGESIZE; i++) {
                                rb = kaddr[i];
                                if (rb != wb) {
                                        /*
                                         * We had a mismatch without a trap.
                                         * Uh-oh. Something is really wrong
                                         * with this system.
                                         */
                                        if (page_retire_messages) {
                                                cmn_err(CE_WARN, MSG_DM,
                                                    pa_hi, pa_lo, rb, wb);
                                        }
                                        errors = 1;
                                        goto out;       /* double break */
                                }
                        }
                }
        }
out:
        no_trap();
        kpreempt_enable();
        ppmapout(kaddr);

        return (errors ? 0 : 1);
}

/*
 * Try to clear a page_t with a single UE. If the UE was transient, it is
 * returned to service, and we return 1. Otherwise we return 0 meaning
 * that further processing is required to retire the page.
 */
static int
page_retire_transient_ue(page_t *pp)
{
        ASSERT(PAGE_EXCL(pp));
        ASSERT(!hat_page_is_mapped(pp));

        /*
         * If this page is a repeat offender, retire him under the
         * "two strikes and you're out" rule. The caller is responsible
         * for scrubbing the page to try to clear the error.
         */
        if (pp->p_toxic & PR_UE_SCRUBBED) {
                PR_INCR_KSTAT(pr_ue_persistent);
                return (0);
        }

        if (page_clear_transient_ue(pp)) {
                /*
                 * We set the PR_UE_SCRUBBED bit; if we ever see this
                 * page again, we will retire it, no questions asked.
                 */
                page_settoxic(pp, PR_UE_SCRUBBED);

                if (page_retire_first_ue) {
                        PR_INCR_KSTAT(pr_ue_cleared_retire);
                        return (0);
                } else {
                        PR_INCR_KSTAT(pr_ue_cleared_free);

                        page_clrtoxic(pp, PR_UE | PR_MCE | PR_MSG);

                        /* LINTED: CONSTCOND */
                        VN_DISPOSE(pp, B_FREE, 1, kcred);
                        return (1);
                }
        }

        PR_INCR_KSTAT(pr_ue_persistent);
        return (0);
}

/*
 * Update the statistics dynamically when our kstat is read.
 */
static int
page_retire_kstat_update(kstat_t *ksp, int rw)
{
        struct page_retire_kstat *pr;

        if (ksp == NULL)
                return (EINVAL);

        switch (rw) {

        case KSTAT_READ:
                pr = (struct page_retire_kstat *)ksp->ks_data;
                ASSERT(pr == &page_retire_kstat);
                pr->pr_limit.value.ui64 = PAGE_RETIRE_LIMIT;
                return (0);

        case KSTAT_WRITE:
                return (EACCES);

        default:
                return (EINVAL);
        }
        /*NOTREACHED*/
}

static int
pr_list_kstat_update(kstat_t *ksp, int rw)
{
        uint_t count;
        page_t *pp;
        kmutex_t *vphm;

        if (rw == KSTAT_WRITE)
                return (EACCES);

        vphm = page_vnode_mutex(retired_pages);
        mutex_enter(vphm);
        /* Needs to be under a lock so that for loop will work right */
        if (retired_pages->v_pages == NULL) {
                mutex_exit(vphm);
                ksp->ks_ndata = 0;
                ksp->ks_data_size = 0;
                return (0);
        }

        count = 1;
        for (pp = retired_pages->v_pages->p_vpnext;
            pp != retired_pages->v_pages; pp = pp->p_vpnext) {
                count++;
        }
        mutex_exit(vphm);

        ksp->ks_ndata = count;
        ksp->ks_data_size = count * 2 * sizeof (uint64_t);

        return (0);
}

/*
 * All spans will be PAGESIZE and no coalescing will be done with the
 * list produced.
 */
static int
pr_list_kstat_snapshot(kstat_t *ksp, void *buf, int rw)
{
        kmutex_t *vphm;
        page_t *pp;
        struct memunit {
                uint64_t address;
                uint64_t size;
        } *kspmem;

        if (rw == KSTAT_WRITE)
                return (EACCES);

        ksp->ks_snaptime = gethrtime();

        kspmem = (struct memunit *)buf;

        vphm = page_vnode_mutex(retired_pages);
        mutex_enter(vphm);
        pp = retired_pages->v_pages;
        if (((caddr_t)kspmem >= (caddr_t)buf + ksp->ks_data_size) ||
            (pp == NULL)) {
                mutex_exit(vphm);
                return (0);
        }
        kspmem->address = ptob(pp->p_pagenum);
        kspmem->size = PAGESIZE;
        kspmem++;
        for (pp = pp->p_vpnext; pp != retired_pages->v_pages;
            pp = pp->p_vpnext, kspmem++) {
                if ((caddr_t)kspmem >= (caddr_t)buf + ksp->ks_data_size)
                        break;
                kspmem->address = ptob(pp->p_pagenum);
                kspmem->size = PAGESIZE;
        }
        mutex_exit(vphm);

        return (0);
}
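
/*
 * Each snapshot entry is thus an (address, size) pair describing one
 * retired page. A hedged sketch of reading the raw kstat from userland
 * with libkstat (error handling omitted; the layout mirrors struct
 * memunit above):
 *
 *	kstat_ctl_t *kc = kstat_open();
 *	kstat_t *ksp = kstat_lookup(kc, "unix", 0, "page_retire_list");
 *
 *	if (ksp != NULL && kstat_read(kc, ksp, NULL) != -1) {
 *		/ * ksp->ks_data holds ks_ndata {address, size} pairs * /
 *	}
 *	(void) kstat_close(kc);
 */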

/*
 * page_retire_pend_count -- helper function for page_capture_thread,
 * returns the number of pages pending retirement.
 */
uint64_t
page_retire_pend_count(void)
{
        return (PR_KSTAT_PENDING);
}

uint64_t
page_retire_pend_kas_count(void)
{
        return (PR_KSTAT_PENDING_KAS);
}

void
page_retire_incr_pend_count(void *datap)
{
        PR_INCR_KSTAT(pr_pending);

        if ((datap == &kvp) || (datap == &zvp)) {
                PR_INCR_KSTAT(pr_pending_kas);
        }
}

void
page_retire_decr_pend_count(void *datap)
{
        PR_DECR_KSTAT(pr_pending);

        if ((datap == &kvp) || (datap == &zvp)) {
                PR_DECR_KSTAT(pr_pending_kas);
        }
}

/*
 * Initialize the page retire mechanism:
 *
 *	- Establish the correctable error retire limit.
 *	- Initialize locks.
 *	- Build the retired_pages vnode.
 *	- Set up the kstats.
 *	- Fire off the background thread.
 *	- Tell page_retire() it's OK to start retiring pages.
 */
void
page_retire_init(void)
{
        const fs_operation_def_t retired_vnodeops_template[] = {
                { NULL, NULL }
        };
        struct vnodeops *vops;
        kstat_t *ksp;

        const uint_t page_retire_ndata =
            sizeof (page_retire_kstat) / sizeof (kstat_named_t);

        ASSERT(page_retire_ksp == NULL);

        if (max_pages_retired_bps <= 0) {
                max_pages_retired_bps = MCE_BPT;
        }

        mutex_init(&pr_q_mutex, NULL, MUTEX_DEFAULT, NULL);

        retired_pages = vn_alloc(KM_SLEEP);
        if (vn_make_ops("retired_pages", retired_vnodeops_template, &vops)) {
                cmn_err(CE_PANIC,
                    "page_retire_init: can't make retired vnodeops");
        }
        vn_setops(retired_pages, vops);

        if ((page_retire_ksp = kstat_create("unix", 0, "page_retire",
            "misc", KSTAT_TYPE_NAMED, page_retire_ndata,
            KSTAT_FLAG_VIRTUAL)) == NULL) {
                cmn_err(CE_WARN, "kstat_create for page_retire failed");
        } else {
                page_retire_ksp->ks_data = (void *)&page_retire_kstat;
                page_retire_ksp->ks_update = page_retire_kstat_update;
                kstat_install(page_retire_ksp);
        }

        mutex_init(&pr_list_kstat_mutex, NULL, MUTEX_DEFAULT, NULL);
        ksp = kstat_create("unix", 0, "page_retire_list", "misc",
            KSTAT_TYPE_RAW, 0, KSTAT_FLAG_VAR_SIZE | KSTAT_FLAG_VIRTUAL);
        if (ksp != NULL) {
                ksp->ks_update = pr_list_kstat_update;
                ksp->ks_snapshot = pr_list_kstat_snapshot;
                ksp->ks_lock = &pr_list_kstat_mutex;
                kstat_install(ksp);
        }

        memscrub_notify_func =
            (void(*)(uint64_t))kobj_getsymvalue("memscrub_notify", 0);

        page_capture_register_callback(PC_RETIRE, -1, page_retire_pp_finish);
        pr_enable = 1;
}

/*
 * page_retire_hunt() callback for the retire thread.
 */
static void
page_retire_thread_cb(page_t *pp)
{
        PR_DEBUG(prd_tctop);
        if (!PP_ISKAS(pp) && page_trylock(pp, SE_EXCL)) {
                PR_DEBUG(prd_tclocked);
                page_unlock(pp);
        }
}

/*
 * Callback used by page_trycapture() to finish off retiring a page.
 * The page has already been cleaned and we've been given sole access to
 * it.
 * Always returns 0 to indicate that the callback succeeded; it never
 * fails to finish retiring the given page.
 */
/*ARGSUSED*/
static int
page_retire_pp_finish(page_t *pp, void *notused, uint_t flags)
{
        int             toxic;

        ASSERT(PAGE_EXCL(pp));
        ASSERT(pp->p_iolock_state == 0);
        ASSERT(pp->p_szc == 0);

        toxic = pp->p_toxic;

        /*
         * The problem page is locked, demoted, unmapped, not free,
         * hashed out, and not COW or mlocked (whew!).
         *
         * Now we select our ammunition, take it around back, and shoot it.
         */
        if (toxic & PR_UE) {
ue_error:
                if (page_retire_transient_ue(pp)) {
                        PR_DEBUG(prd_uescrubbed);
                        (void) page_retire_done(pp, PRD_UE_SCRUBBED);
                } else {
                        PR_DEBUG(prd_uenotscrubbed);
                        page_retire_destroy(pp);
                        (void) page_retire_done(pp, PRD_SUCCESS);
                }
                return (0);
        } else if (toxic & PR_FMA) {
                PR_DEBUG(prd_fma);
                page_retire_destroy(pp);
                (void) page_retire_done(pp, PRD_SUCCESS);
                return (0);
        } else if (toxic & PR_MCE) {
                PR_DEBUG(prd_mce);
                page_retire_destroy(pp);
                (void) page_retire_done(pp, PRD_SUCCESS);
                return (0);
        }

        /*
         * When page_retire_first_ue is set to zero and a transient UE
         * occurs, we may clear flags that were set by a second UE which
         * arrived while the first was being handled, so we must cope with
         * the case where none of the bits above are set. In that instance
         * PR_UE_SCRUBBED should still be set, and we execute the UE code
         * above.
         */
        if (toxic & PR_UE_SCRUBBED) {
                goto ue_error;
        }

        /*
         * It's impossible to get here.
         */
        panic("bad toxic flags 0x%x in page_retire_pp_finish\n", toxic);
        return (0);
}

/*
 * page_retire() - the front door in to retire a page.
 *
 * Ideally, page_retire() would instantly retire the requested page.
 * Unfortunately, some pages are locked or otherwise tied up and cannot be
 * retired right away. We use the page capture logic to deal with this
 * situation as it will continuously try to retire the page in the background
 * if the first attempt fails. Success is determined by looking to see whether
 * the page has been retired after the page_trycapture() attempt.
 *
 * Returns:
 *
 *   - 0 on success,
 *   - EINVAL when the PA is whacko,
 *   - EIO if the page is already retired or already pending retirement, or
 *   - EAGAIN if the page could not be _immediately_ retired but is pending.
 */
int
page_retire(uint64_t pa, uchar_t reason)
{
        page_t  *pp;

        ASSERT(reason & PR_REASONS);            /* there must be a reason */
        ASSERT(!(reason & ~PR_REASONS));        /* but no other bits */

        pp = page_numtopp_nolock(mmu_btop(pa));
        if (pp == NULL) {
                PR_MESSAGE(CE_WARN, 1, "Cannot schedule clearing of error on"
                    " page 0x%08x.%08x; page is not relocatable memory", pa);
                return (page_retire_done(pp, PRD_INVALID_PA));
        }
        if (PP_RETIRED(pp)) {
                PR_DEBUG(prd_dup1);
                return (page_retire_done(pp, PRD_DUPLICATE));
        }

        if (memscrub_notify_func != NULL) {
                (void) memscrub_notify_func(pa);
        }

        if ((reason & PR_UE) && !PP_TOXIC(pp)) {
                PR_MESSAGE(CE_NOTE, 1, "Scheduling clearing of error on"
                    " page 0x%08x.%08x", pa);
        } else if (PP_PR_REQ(pp)) {
                PR_DEBUG(prd_dup2);
                return (page_retire_done(pp, PRD_DUPLICATE));
        } else {
                PR_MESSAGE(CE_NOTE, 1, "Scheduling removal of"
                    " page 0x%08x.%08x", pa);
        }

        /* Avoid setting toxic bits in the first place */
        if ((reason & (PR_FMA | PR_MCE)) && !(reason & PR_UE) &&
            page_retire_limit()) {
                return (page_retire_done(pp, PRD_LIMIT));
        }

        if (MTBF(pr_calls, pr_mtbf)) {
                page_settoxic(pp, reason);
                if (page_trycapture(pp, 0, CAPTURE_RETIRE, pp->p_vnode) == 0) {
                        PR_DEBUG(prd_prlocked);
                } else {
                        PR_DEBUG(prd_prnotlocked);
                }
        } else {
                PR_DEBUG(prd_prnotlocked);
        }

        if (PP_RETIRED(pp)) {
                PR_DEBUG(prd_prretired);
                return (0);
        } else {
                cv_signal(&pc_cv);
                PR_INCR_KSTAT(pr_failed);

                if (pp->p_toxic & PR_MSG) {
                        return (page_retire_done(pp, PRD_FAILED));
                } else {
                        return (page_retire_done(pp, PRD_PENDING));
                }
        }
}

/*
 * Take a retired page off the retired-pages vnode and clear the toxic flags.
 * Whether the page is then freed, or unretired and handed back to the
 * caller, depends on "flags" as described below.
 *
 * Any unretire messages are printed from this routine.
 *
 * Returns 0 if page pp was unretired; else an error code.
 *
 * If flags is:
 *	PR_UNR_FREE - lock the page, clear the toxic flags and free it
 *	    to the freelist.
 *	PR_UNR_TEMP - lock the page, unretire it, leave the toxic
 *	    bits set as is and return it to the caller.
 *	PR_UNR_CLEAN - page is SE_EXCL locked, unretire it, clear the
 *	    toxic flags and return it to caller as is.
 */
int
page_unretire_pp(page_t *pp, int flags)
{
        /*
         * To be retired, a page has to be hashed onto the retired_pages vnode
         * and have PR_RETIRED set in p_toxic.
         */
        if (flags == PR_UNR_CLEAN ||
            page_try_reclaim_lock(pp, SE_EXCL, SE_RETIRED)) {
                ASSERT(PAGE_EXCL(pp));
                PR_DEBUG(prd_ulocked);
                if (!PP_RETIRED(pp)) {
                        PR_DEBUG(prd_unotretired);
                        page_unlock(pp);
                        return (page_retire_done(pp, PRD_UNR_NOT));
                }

                PR_MESSAGE(CE_NOTE, 1, "unretiring retired"
                    " page 0x%08x.%08x", mmu_ptob((uint64_t)pp->p_pagenum));
                if (pp->p_toxic & PR_FMA) {
                        PR_DECR_KSTAT(pr_fma);
                } else if (pp->p_toxic & PR_UE) {
                        PR_DECR_KSTAT(pr_ue);
                } else {
                        PR_DECR_KSTAT(pr_mce);
                }

                if (flags == PR_UNR_TEMP)
                        page_clrtoxic(pp, PR_RETIRED);
                else
                        page_clrtoxic(pp, PR_TOXICFLAGS);

                if (flags == PR_UNR_FREE) {
                        PR_DEBUG(prd_udestroy);
                        page_destroy(pp, 0);
                } else {
                        PR_DEBUG(prd_uhashout);
                        page_hashout(pp, NULL);
                }

                mutex_enter(&freemem_lock);
                availrmem++;
                mutex_exit(&freemem_lock);

                PR_DEBUG(prd_uunretired);
                PR_DECR_KSTAT(pr_retired);
                PR_INCR_KSTAT(pr_unretired);
                return (page_retire_done(pp, PRD_UNR_SUCCESS));
        }
        PR_DEBUG(prd_unotlocked);
        return (page_retire_done(pp, PRD_UNR_CANTLOCK));
}

/*
 * Return a page to service by moving it from the retired_pages vnode
 * onto the freelist.
 *
 * Called from mmioctl_page_retire() on behalf of the FMA DE.
 *
 * Returns:
 *
 *   - 0 if the page is unretired,
 *   - EAGAIN if the pp can not be locked,
 *   - EINVAL if the PA is whacko, and
 *   - EIO if the pp is not retired.
 */
int
page_unretire(uint64_t pa)
{
        page_t  *pp;

        pp = page_numtopp_nolock(mmu_btop(pa));
        if (pp == NULL) {
                return (page_retire_done(pp, PRD_INVALID_PA));
        }

        return (page_unretire_pp(pp, PR_UNR_FREE));
}

/*
 * Test a page to see if it is retired. If errors is non-NULL, the toxic
 * bits of the page are returned. Returns 0 on success, error code on failure.
 */
int
page_retire_check_pp(page_t *pp, uint64_t *errors)
{
        int rc;

        if (PP_RETIRED(pp)) {
                PR_DEBUG(prd_checkhit);
                rc = 0;
        } else if (PP_PR_REQ(pp)) {
                PR_DEBUG(prd_checkmiss_pend);
                rc = EAGAIN;
        } else {
                PR_DEBUG(prd_checkmiss_noerr);
                rc = EIO;
        }

        /*
         * We have magically arranged the bit values returned to fmd(1M)
         * to line up with the FMA, MCE, and UE bits of the page_t.
         */
        if (errors) {
                uint64_t toxic = (uint64_t)(pp->p_toxic & PR_ERRMASK);
                if (toxic & PR_UE_SCRUBBED) {
                        toxic &= ~PR_UE_SCRUBBED;
                        toxic |= PR_UE;
                }
                *errors = toxic;
        }

        return (rc);
}

/*
 * Test to see if the page_t for a given PA is retired, and return the
 * hardware errors we have seen on the page if requested.
 *
 * Called from mmioctl_page_retire on behalf of the FMA DE.
 *
 * Returns:
 *
 *   - 0 if the page is retired,
 *   - EIO if the page is not retired and has no errors,
 *   - EAGAIN if the page is not retired but is pending; and
 *   - EINVAL if the PA is whacko.
 */
int
page_retire_check(uint64_t pa, uint64_t *errors)
{
        page_t  *pp;

        if (errors) {
                *errors = 0;
        }

        pp = page_numtopp_nolock(mmu_btop(pa));
        if (pp == NULL) {
                return (page_retire_done(pp, PRD_INVALID_PA));
        }

        return (page_retire_check_pp(pp, errors));
}
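
/*
 * A sketch of the check-side flow as a hypothetical caller might write it
 * (the real caller is the mmioctl_page_retire() path acting for fmd):
 *
 *	uint64_t errs;
 *
 *	switch (page_retire_check(pa, &errs)) {
 *	case 0:
 *		/ * retired; errs holds the FMA/MCE/UE bits * /
 *		break;
 *	case EAGAIN:
 *		/ * not yet retired, but retirement is pending * /
 *		break;
 *	case EIO:
 *		/ * not retired and no errors recorded * /
 *		break;
 *	case EINVAL:
 *		/ * pa does not correspond to a real page * /
 *		break;
 *	}
 */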

/*
 * Page retire self-test. For now, it always returns 0.
 */
int
page_retire_test(void)
{
        page_t *first, *pp, *cpp, *cpp2, *lpp;

        /*
         * Tests the corner case where a large page can't be retired
         * because one of the constituent pages is locked. We mark
         * one page to be retired and try to retire it, and mark the
         * other page to be retired but don't try to retire it, so
         * that page_unlock() in the failure path will recurse and try
         * to retire THAT page. This is the worst possible situation
         * we can get ourselves into.
         */
        memsegs_lock(0);
        pp = first = page_first();
        do {
                if (pp->p_szc && PP_PAGEROOT(pp) == pp) {
                        cpp = pp + 1;
                        lpp = PP_ISFREE(pp)? pp : pp + 2;
                        cpp2 = pp + 3;
                        if (!page_trylock(lpp, pp == lpp? SE_EXCL : SE_SHARED))
                                continue;
                        if (!page_trylock(cpp, SE_EXCL)) {
                                page_unlock(lpp);
                                continue;
                        }

                        /* fails */
                        (void) page_retire(ptob(cpp->p_pagenum), PR_FMA);

                        page_unlock(lpp);
                        page_unlock(cpp);
                        (void) page_retire(ptob(cpp->p_pagenum), PR_FMA);
                        (void) page_retire(ptob(cpp2->p_pagenum), PR_FMA);
                }
        } while ((pp = page_next(pp)) != first);
        memsegs_unlock(0);

        return (0);
}