/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License, Version 1.0 only
 * (the "License").  You may not use this file except in compliance
 * with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#ifndef _CHEETAHASM_H
#define	_CHEETAHASM_H

#pragma ident	"%Z%%M%	%I%	%E% SMI"

#ifdef __cplusplus
extern "C" {
#endif

#ifdef _ASM
/* BEGIN CSTYLED */

#define	ASM_LD(reg, symbol) \
	sethi	%hi(symbol), reg; \
	ld	[reg + %lo(symbol)], reg; \

#define	ASM_LDX(reg, symbol) \
	sethi	%hi(symbol), reg; \
	ldx	[reg + %lo(symbol)], reg; \

#define	ASM_JMP(reg, symbol) \
	sethi	%hi(symbol), reg; \
	jmp	reg + %lo(symbol); \
	nop

/*
 * Macro for getting to offset from 'cpu_private' ptr.  The 'cpu_private'
 * ptr is in the machcpu structure.
 *  off_reg:  Register offset from 'cpu_private' ptr.
 *  scr1:     Scratch, ptr is returned in this register.
 *  scr2:     Scratch
 *  label:    Label to branch to if cpu_private ptr is null/zero.
 */
#define	GET_CPU_PRIVATE_PTR(off_reg, scr1, scr2, label) \
	CPU_ADDR(scr1, scr2); \
	ldn	[scr1 + CPU_PRIVATE], scr1; \
	cmp	scr1, 0; \
	be	label; \
	nop; \
	add	scr1, off_reg, scr1

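/*
 * Illustrative usage (added; not part of the original header).  A caller
 * with an offset into the cpu_private area -- the offset name below is
 * hypothetical -- might fetch a per-CPU field as:
 *
 *	GET_CPU_PRIVATE_PTR(CHPR_SCRATCH_OFFSET, %g1, %g2, 1f);
 *	ldx	[%g1], %g1	! %g1 = the per-CPU field
 *	...
 * 1:	! reached only if this CPU's cpu_private ptr was NULL
 */
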
/*
 * Macro version of get_dcache_dtag.  We use this macro in the
 * CPU logout code.  Since the Dcache is virtually indexed, only
 * bits [12:5] of the AFAR can be used so we need to search through
 * 8 indexes (4 ways + bit 13) in order to find the tag we want.
 *   afar:  input AFAR, not modified.
 *   datap: input ptr to ch_dc_data_t, at end pts to end of ch_dc_data_t.
 *   scr1:  scratch.
 *   scr2:  scratch, will hold tag to look for.
 *   scr3:  used for Dcache index, loops through 4 ways.
 */
#define	GET_DCACHE_DTAG(afar, datap, scr1, scr2, scr3) \
	set	CH_DCACHE_IDX_MASK, scr3; \
	and	afar, scr3, scr3; \
	srlx	afar, CH_DCTAG_PA_SHIFT, scr2; \
	b	1f; \
	or	scr2, CH_DCTAG_VALID_BIT, scr2;	/* tag we want */ \
	.align	128; \
1: \
	ldxa	[scr3]ASI_DC_TAG, scr1;		/* read tag */ \
	cmp	scr1, scr2; \
	bne	4f;				/* not found? */ \
	nop; \
	stxa	scr3, [datap + CH_DC_IDX]%asi;	/* store index */ \
	stxa	scr1, [datap + CH_DC_TAG]%asi;	/* store tag */ \
	membar	#Sync;				/* Cheetah PRM 10.6.3 */ \
	ldxa	[scr3]ASI_DC_UTAG, scr1;	/* read utag */ \
	membar	#Sync;				/* Cheetah PRM 10.6.3 */ \
	stxa	scr1, [datap + CH_DC_UTAG]%asi; \
	ldxa	[scr3]ASI_DC_SNP_TAG, scr1;	/* read snoop tag */ \
	stxa	scr1, [datap + CH_DC_SNTAG]%asi; \
	add	datap, CH_DC_DATA, datap; \
	clr	scr2; \
2: \
	membar	#Sync;				/* Cheetah PRM 10.6.1 */ \
	ldxa	[scr3 + scr2]ASI_DC_DATA, scr1;	/* read data */ \
	membar	#Sync;				/* Cheetah PRM 10.6.1 */ \
	stxa	scr1, [datap]%asi; \
	add	datap, 8, datap; \
	cmp	scr2, CH_DC_DATA_REG_SIZE - 8; \
	blt	2b; \
	add	scr2, 8, scr2; \
\
	GET_CPU_IMPL(scr2);		/* Parity bits are elsewhere for */ \
	cmp	scr2, PANTHER_IMPL;	/* panther processors. */ \
	bne,a	5f;			/* Done if not panther. */ \
	add	datap, 8, datap;	/* Skip to the end of the struct. */ \
	clr	scr2; \
	add	datap, 7, datap;	/* offset of the last parity byte */ \
	mov	1, scr1; \
	sll	scr1, PN_DC_DATA_PARITY_BIT_SHIFT, scr1; \
	or	scr3, scr1, scr3;	/* add DC_data_parity bit to index */ \
3: \
	membar	#Sync;				/* Cheetah PRM 10.6.1 */ \
	ldxa	[scr3 + scr2]ASI_DC_DATA, scr1;	/* read parity bits */ \
	membar	#Sync;				/* Cheetah PRM 10.6.1 */ \
	stba	scr1, [datap]%asi; \
	dec	datap; \
	cmp	scr2, CH_DC_DATA_REG_SIZE - 8; \
	blt	3b; \
	add	scr2, 8, scr2; \
	b	5f; \
	add	datap, 5, datap;	/* set pointer to end of our struct */ \
4: \
	set	CH_DCACHE_IDX_INCR, scr1;	/* incr. idx (scr3) */ \
	add	scr3, scr1, scr3; \
	set	CH_DCACHE_IDX_LIMIT, scr1;	/* done? */ \
	cmp	scr3, scr1; \
	blt	1b; \
	nop; \
	add	datap, CH_DC_DATA_SIZE, datap; \
5:

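/*
 * Sketch (added for illustration) of the search GET_DCACHE_DTAG performs,
 * in C-like pseudo-code:
 *
 *	tag = (afar >> CH_DCTAG_PA_SHIFT) | CH_DCTAG_VALID_BIT;
 *	for (idx = afar & CH_DCACHE_IDX_MASK; idx < CH_DCACHE_IDX_LIMIT;
 *	    idx += CH_DCACHE_IDX_INCR) {
 *		if (dcache_tag(idx) == tag) {
 *			capture tag, utag, snoop tag and data at idx
 *			(plus the parity byte on Panther) and stop;
 *		}
 *	}
 */
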
/*
 * Macro version of get_icache_dtag.  We use this macro in the CPU
 * logout code.  If the Icache is on, we don't want to capture the data.
 *   afar:  input AFAR, not modified.
 *   datap: input ptr to ch_ic_data_t, at end pts to end of ch_ic_data_t.
 *   scr1:  scratch.
 *   scr2:  scratch, will hold tag to look for.
 *   scr3:  used for Icache index, loops through 4 ways.
 * Note: For Panther, the Icache is virtually indexed and increases in
 * size to 64KB (instead of 32KB) with a line size of 64 bytes (instead
 * of 32).  This means the IC_addr index bits[14:7] for Panther now
 * correspond to VA bits[13:6].  But since it is virtually indexed, we
 * still mask out only bits[12:5] from the AFAR (we have to manually
 * check bit 13).  In order to make this code work for all processors,
 * we end up checking twice as many indexes (8 instead of 4) as required
 * for non-Panther CPUs and saving off twice as much data (16 instructions
 * instead of just 8).
 */
#define	GET_ICACHE_DTAG(afar, datap, scr1, scr2, scr3) \
	ldxa	[%g0]ASI_DCU, scr1; \
	btst	DCU_IC, scr1;			/* is Icache enabled? */ \
	bne,a	6f;				/* yes, don't capture */ \
	add	datap, CH_IC_DATA_SIZE, datap;	/* annul if no branch */ \
	GET_CPU_IMPL(scr2);		/* Panther only uses VA[13:6] */ \
	cmp	scr2, PANTHER_IMPL;	/* and we also want to mask */ \
	be	1f;			/* out bit 13 since the */ \
	nop;				/* Panther I$ is VIPT. */ \
	set	CH_ICACHE_IDX_MASK, scr3; \
	b	2f; \
	nop; \
1: \
	set	PN_ICACHE_VA_IDX_MASK, scr3; \
2: \
	and	afar, scr3, scr3; \
	sllx	scr3, CH_ICACHE_IDX_SHIFT, scr3; \
	srlx	afar, CH_ICPATAG_SHIFT, scr2;	/* pa tag we want */ \
	andn	scr2, CH_ICPATAG_LBITS, scr2;	/* mask off lower */ \
	b	3f; \
	nop; \
	.align	128; \
3: \
	ldxa	[scr3]ASI_IC_TAG, scr1;		/* read pa tag */ \
	andn	scr1, CH_ICPATAG_LBITS, scr1;	/* mask off lower */ \
	cmp	scr1, scr2; \
	bne	5f;				/* not found? */ \
	nop; \
	stxa	scr3, [datap + CH_IC_IDX]%asi;	/* store index */ \
	stxa	scr1, [datap + CH_IC_PATAG]%asi; /* store pa tag */ \
	add	scr3, CH_ICTAG_UTAG, scr3;	/* read utag */ \
	ldxa	[scr3]ASI_IC_TAG, scr1; \
	add	scr3, (CH_ICTAG_UPPER - CH_ICTAG_UTAG), scr3; \
	stxa	scr1, [datap + CH_IC_UTAG]%asi; \
	ldxa	[scr3]ASI_IC_TAG, scr1;		/* read upper tag */ \
	add	scr3, (CH_ICTAG_LOWER - CH_ICTAG_UPPER), scr3; \
	stxa	scr1, [datap + CH_IC_UPPER]%asi; \
	ldxa	[scr3]ASI_IC_TAG, scr1;		/* read lower tag */ \
	andn	scr3, CH_ICTAG_TMASK, scr3; \
	stxa	scr1, [datap + CH_IC_LOWER]%asi; \
	ldxa	[scr3]ASI_IC_SNP_TAG, scr1;	/* read snoop tag */ \
	stxa	scr1, [datap + CH_IC_SNTAG]%asi; \
	add	datap, CH_IC_DATA, datap; \
	clr	scr2; \
4: \
	ldxa	[scr3 + scr2]ASI_IC_DATA, scr1;	/* read ins. data */ \
	stxa	scr1, [datap]%asi; \
	add	datap, 8, datap; \
	cmp	scr2, PN_IC_DATA_REG_SIZE - 8; \
	blt	4b; \
	add	scr2, 8, scr2; \
	b	6f; \
	nop; \
5: \
	set	CH_ICACHE_IDX_INCR, scr1;	/* incr. idx (scr3) */ \
	add	scr3, scr1, scr3; \
	set	PN_ICACHE_IDX_LIMIT, scr1;	/* done? */ \
	cmp	scr3, scr1; \
	blt	3b; \
	nop; \
	add	datap, CH_IC_DATA_SIZE, datap; \
6:

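/*
 * Note (added for illustration): the index search above is the Icache
 * analogue of the Dcache sketch earlier, i.e. roughly
 *
 *	for (idx = (afar & mask) << CH_ICACHE_IDX_SHIFT;
 *	    idx < PN_ICACHE_IDX_LIMIT; idx += CH_ICACHE_IDX_INCR)
 *		compare the PA tag at idx;
 *
 * where mask is CH_ICACHE_IDX_MASK, or PN_ICACHE_VA_IDX_MASK on Panther.
 */
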
#if defined(JALAPENO) || defined(SERRANO)
/*
 * Macro version of get_ecache_dtag.  We use this macro in the
 * CPU logout code.
 *   afar:	input AFAR, not modified
 *   datap:	Ptr to ch_ec_data_t, at end pts just past ch_ec_data_t.
 *   ec_way:	Constant value (way number)
 *   scr1:	Scratch
 *   scr2:	Scratch.
 *   scr3:	Scratch.
 */
#define	GET_ECACHE_DTAG(afar, datap, ec_way, scr1, scr2, scr3) \
	mov	ec_way, scr1; \
	and	scr1, JP_ECACHE_NWAY - 1, scr1;	/* mask E$ way bits */ \
	sllx	scr1, JP_EC_TAG_DATA_WAY_SHIFT, scr1; \
	set	((JP_ECACHE_MAX_SIZE / JP_ECACHE_NWAY) - 1), scr2; \
	and	afar, scr2, scr3;		/* get set offset */ \
	andn	scr3, (JP_ECACHE_MAX_LSIZE - 1), scr3;	/* VA<5:0>=0 */ \
	or	scr3, scr1, scr3;		/* or WAY bits */ \
	b	1f; \
	stxa	scr3, [datap + CH_EC_IDX]%asi;	/* store E$ index */ \
	.align	64; \
1: \
	JP_EC_DIAG_ACCESS_MEMBAR; \
	ldxa	[scr3]ASI_EC_DIAG, scr1;	/* get E$ tag */ \
	JP_EC_DIAG_ACCESS_MEMBAR; \
	stxa	scr1, [datap + CH_EC_TAG]%asi; \
	add	datap, CH_EC_DATA, datap; \
2: \
	ldxa	[scr3]ASI_EC_R, %g0;		/* ld E$ stging regs */ \
	clr	scr1; \
3:						/* loop thru 5 regs */ \
	ldxa	[scr1]ASI_EC_DATA, scr2; \
	stxa	scr2, [datap]%asi; \
	add	datap, 8, datap; \
	cmp	scr1, CH_ECACHE_STGREG_TOTALSIZE - 8; \
	bne	3b; \
	add	scr1, 8, scr1; \
	btst	CH_ECACHE_STGREG_SIZE, scr3;	/* done? */ \
	beq	2b; \
	add	scr3, CH_ECACHE_STGREG_SIZE, scr3

#define	GET_ECACHE_DTAGS(afar, datap, scr1, scr2, scr3) \
	GET_ECACHE_DTAG(afar, datap, 0, scr1, scr2, scr3); \
	GET_ECACHE_DTAG(afar, datap, 1, scr1, scr2, scr3); \
	GET_ECACHE_DTAG(afar, datap, 2, scr1, scr2, scr3); \
	GET_ECACHE_DTAG(afar, datap, 3, scr1, scr2, scr3); \
	add	datap, (CHD_EC_DATA_SETS-4)*CH_EC_DATA_SIZE, datap; \
	add	datap, CH_EC_DATA_SIZE * PN_L2_NWAYS, datap; \

/*
 * Jalapeno does not have cores so these macros are null.
 */
#define	PARK_SIBLING_CORE(dcucr_reg, scr1, scr2)
#define	UNPARK_SIBLING_CORE(dcucr_reg, scr1, scr2)

#if defined(JALAPENO)
/*
 * Jalapeno gets primary AFSR and AFAR.  All bits in the AFSR except
 * the fatal error bits are cleared.
 *	datap:	pointer to cpu logout structure.
 *	afar:	returned primary AFAR value.
 *	scr1:	scratch
 *	scr2:	scratch
 */
#define	GET_AFSR_AFAR(datap, afar, scr1, scr2) \
	ldxa	[%g0]ASI_AFAR, afar; \
	stxa	afar, [datap + (CH_CLO_DATA + CH_CHD_AFAR)]%asi; \
	ldxa	[%g0]ASI_AFSR, scr2; \
	stxa	scr2, [datap + (CH_CLO_DATA + CH_CHD_AFSR)]%asi; \
	sethi	%hh(C_AFSR_FATAL_ERRS), scr1; \
	sllx	scr1, 32, scr1; \
	bclr	scr1, scr2;	/* Clear fatal error bits here, so */ \
	stxa	scr2, [%g0]ASI_AFSR;	/* they're left as is in AFSR */ \
	membar	#Sync

/*
 * Jalapeno has no shadow AFAR, null operation.
 */
#define	GET_SHADOW_DATA(afar, datap, scr1, scr2, scr3)

#elif defined(SERRANO)
/*
 * Serrano gets primary AFSR and AFAR.  All bits in the AFSR except
 * the fatal error bits are cleared.  For Serrano, we also save the
 * AFAR2 register.
 *	datap:	pointer to cpu logout structure.
 *	afar:	returned primary AFAR value.
 *	scr1:	scratch
 *	scr2:	scratch
 */
#define	GET_AFSR_AFAR(datap, afar, scr1, scr2) \
	set	ASI_MCU_AFAR2_VA, scr1; \
	ldxa	[scr1]ASI_MCU_CTRL, afar; \
	stxa	afar, [datap + (CH_CLO_DATA + CH_CHD_AFAR2)]%asi; \
	ldxa	[%g0]ASI_AFAR, afar; \
	stxa	afar, [datap + (CH_CLO_DATA + CH_CHD_AFAR)]%asi; \
	ldxa	[%g0]ASI_AFSR, scr2; \
	stxa	scr2, [datap + (CH_CLO_DATA + CH_CHD_AFSR)]%asi; \
	sethi	%hh(C_AFSR_FATAL_ERRS), scr1; \
	sllx	scr1, 32, scr1; \
	bclr	scr1, scr2;	/* Clear fatal error bits here, so */ \
	stxa	scr2, [%g0]ASI_AFSR;	/* they're left as is in AFSR */ \
	membar	#Sync

/*
 * Serrano needs to capture E$, D$ and I$ lines associated with afar2.
 *	afar:	scratch, holds afar2.
 *	datap:	pointer to cpu logout structure
 *	scr1:	scratch
 *	scr2:	scratch
 *	scr3:	scratch
 */
#define	GET_SHADOW_DATA(afar, datap, scr1, scr2, scr3) \
	ldxa	[datap + (CH_CLO_DATA + CH_CHD_AFAR2)]%asi, afar; \
	add	datap, CH_CLO_SDW_DATA + CH_CHD_EC_DATA, datap; \
	GET_ECACHE_DTAGS(afar, datap, scr1, scr2, scr3); \
	GET_DCACHE_DTAG(afar, datap, scr1, scr2, scr3); \
	GET_ICACHE_DTAG(afar, datap, scr1, scr2, scr3); \
	sub	datap, CH_CPU_LOGOUT_SIZE, datap
#endif	/* SERRANO */

#elif defined(CHEETAH_PLUS)
/*
 * Macro version of get_ecache_dtag.  We use this macro in the
 * CPU logout code.
 *   afar:	input AFAR, not modified.
 *   datap:	Ptr to ch_ec_data_t, at end pts just past ch_ec_data_t.
 *   pn_way:	ecache way for panther (value = 0-3).  For non-panther
 *		cpus, this macro will be called with pn_way = 0.
 *   scr1:	Scratch.
 *   scr2:	Scratch.
 *   scr3:	Scratch.
 */
#define	GET_ECACHE_DTAG(afar, datap, pn_way, scr1, scr2, scr3) \
	mov	afar, scr3; \
	andn	scr3, (CH_ECACHE_SUBBLK_SIZE - 1), scr3; /* VA<5:0>=0 */ \
	set	(CH_ECACHE_8M_SIZE - 1), scr2; \
	and	scr3, scr2, scr3;		/* VA<63:23>=0 */ \
	mov	pn_way, scr1;	/* panther L3$ is 4-way so we ... */ \
	sllx	scr1, PN_L3_WAY_SHIFT, scr1;	/* need to mask... */ \
	or	scr3, scr1, scr3;	/* in the way bits <24:23>. */ \
	b	1f; \
	stxa	scr3, [datap + CH_EC_IDX]%asi;	/* store E$ index */ \
	.align	64; \
1: \
	ldxa	[scr3]ASI_EC_DIAG, scr1;	/* get E$ tag */ \
	stxa	scr1, [datap + CH_EC_TAG]%asi; \
	set	CHP_ECACHE_IDX_TAG_ECC, scr1; \
	or	scr3, scr1, scr1; \
	ldxa	[scr1]ASI_EC_DIAG, scr1;	/* get E$ tag ECC */ \
	stxa	scr1, [datap + CH_EC_TAG_ECC]%asi; \
	add	datap, CH_EC_DATA, datap; \
2: \
	ldxa	[scr3]ASI_EC_R, %g0;		/* ld E$ stging regs */ \
	clr	scr1; \
3:						/* loop thru 5 regs */ \
	ldxa	[scr1]ASI_EC_DATA, scr2; \
	stxa	scr2, [datap]%asi; \
	add	datap, 8, datap; \
	cmp	scr1, CH_ECACHE_STGREG_TOTALSIZE - 8; \
	bne	3b; \
	add	scr1, 8, scr1; \
	btst	CH_ECACHE_STGREG_SIZE, scr3;	/* done? */ \
	beq	2b; \
	add	scr3, CH_ECACHE_STGREG_SIZE, scr3

/*
 * If this is a panther, we need to make sure the sibling core is
 * parked so that we avoid any race conditions during diagnostic
 * accesses to the shared L2 and L3 caches.
 * dcucr_reg:	This register will be used to keep track of whether
 *		or not we need to unpark the core later.
 *		It just so happens that we also use this same register
 *		to keep track of our saved DCUCR value so we only touch
 *		bit 4 of the register (which is a "reserved" bit in the
 *		DCUCR) for keeping track of core parking.
 * scr1:	Scratch register.
 * scr2:	Scratch register.
 */
#define	PARK_SIBLING_CORE(dcucr_reg, scr1, scr2) \
	GET_CPU_IMPL(scr1); \
	cmp	scr1, PANTHER_IMPL;	/* only park for panthers */ \
	bne,a	%xcc, 2f; \
	andn	dcucr_reg, PN_PARKED_OTHER_CORE, dcucr_reg; \
	set	ASI_CORE_RUNNING_STATUS, scr1;	/* check other core */ \
	ldxa	[scr1]ASI_CMP_SHARED, scr2;	/* is it running? */ \
	cmp	scr2, PN_BOTH_CORES_RUNNING; \
	bne,a	%xcc, 2f;	/* if not running, we are done */ \
	andn	dcucr_reg, PN_PARKED_OTHER_CORE, dcucr_reg; \
	or	dcucr_reg, PN_PARKED_OTHER_CORE, dcucr_reg; \
	set	ASI_CORE_ID, scr1; \
	ldxa	[scr1]ASI_CMP_PER_CORE, scr2; \
	and	scr2, COREID_MASK, scr2; \
	or	%g0, 1, scr1;		/* find out which core... */ \
	sll	scr1, scr2, scr2;	/* ... we need to park... */ \
1: \
	set	ASI_CORE_RUNNING_RW, scr1; \
	stxa	scr2, [scr1]ASI_CMP_SHARED;	/* ... and park it. */ \
	membar	#Sync;			/* spin until... */ \
	ldxa	[scr1]ASI_CMP_SHARED, scr1;	/* ...the other core... */ \
	cmp	scr1, scr2;	/* ...is parked according to... */ \
	bne,a	%xcc, 1b;	/* ...the core running status reg. */ \
	nop; \
2:

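/*
 * Illustrative pairing (added; not from the original header).  A
 * diagnostic sequence would typically bracket its L2/L3 ASI accesses
 * with these macros, passing the same dcucr register to both so that
 * UNPARK_SIBLING_CORE can tell whether this core actually parked its
 * sibling:
 *
 *	PARK_SIBLING_CORE(%g2, %g3, %g4);
 *	... diagnostic ASI accesses to the shared caches ...
 *	UNPARK_SIBLING_CORE(%g2, %g3, %g4);
 */
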
/*
 * The core running this code will unpark its sibling core if the
 * sibling core had been parked by the current core earlier in this
 * trap handler.
 * dcucr_reg:	This register is used to keep track of whether or not
 *		we need to unpark our sibling core.
 *		It just so happens that we also use this same register
 *		to keep track of our saved DCUCR value so we only touch
 *		bit 4 of the register (which is a "reserved" bit in the
 *		DCUCR) for keeping track of core parking.
 * scr1:	Scratch register.
 * scr2:	Scratch register.
 */
#define	UNPARK_SIBLING_CORE(dcucr_reg, scr1, scr2) \
	btst	PN_PARKED_OTHER_CORE, dcucr_reg; \
	bz,pt	%xcc, 1f;	/* if nothing to unpark, we are done */ \
	andn	dcucr_reg, PN_PARKED_OTHER_CORE, dcucr_reg; \
	set	ASI_CORE_RUNNING_RW, scr1; \
	set	PN_BOTH_CORES_RUNNING, scr2;	/* we want both... */ \
	stxa	scr2, [scr1]ASI_CMP_SHARED;	/* ...cores running. */ \
	membar	#Sync; \
1:

/*
 * Cheetah+ and Jaguar get both primary and secondary AFSR/AFAR.  All bits
 * in the primary AFSR are cleared except the fatal error bits.  For Panther,
 * we also have to read and clear the AFSR_EXT, again leaving the fatal
 * error bits alone.
 *	datap:	pointer to cpu logout structure.
 *	afar:	returned primary AFAR value.
 *	scr1:	scratch
 *	scr2:	scratch
 */
#define	GET_AFSR_AFAR(datap, afar, scr1, scr2) \
	set	ASI_SHADOW_REG_VA, scr1; \
	ldxa	[scr1]ASI_AFAR, scr2; \
	stxa	scr2, [datap + (CH_CLO_SDW_DATA + CH_CHD_AFAR)]%asi; \
	ldxa	[scr1]ASI_AFSR, scr2; \
	stxa	scr2, [datap + (CH_CLO_SDW_DATA + CH_CHD_AFSR)]%asi; \
	ldxa	[%g0]ASI_AFAR, afar; \
	stxa	afar, [datap + (CH_CLO_DATA + CH_CHD_AFAR)]%asi; \
	ldxa	[%g0]ASI_AFSR, scr2; \
	stxa	scr2, [datap + (CH_CLO_DATA + CH_CHD_AFSR)]%asi; \
	sethi	%hh(C_AFSR_FATAL_ERRS), scr1; \
	sllx	scr1, 32, scr1; \
	bclr	scr1, scr2;	/* Clear fatal error bits here, so */ \
	stxa	scr2, [%g0]ASI_AFSR;	/* they're left as is in AFSR */ \
	membar	#Sync; \
	GET_CPU_IMPL(scr1); \
	cmp	scr1, PANTHER_IMPL; \
	bne	%xcc, 1f; \
	nop; \
	set	ASI_SHADOW_AFSR_EXT_VA, scr1;	/* shadow AFSR_EXT */ \
	ldxa	[scr1]ASI_AFSR, scr2; \
	stxa	scr2, [datap + (CH_CLO_SDW_DATA + CH_CHD_AFSR_EXT)]%asi; \
	set	ASI_AFSR_EXT_VA, scr1;		/* primary AFSR_EXT */ \
	ldxa	[scr1]ASI_AFSR, scr2; \
	stxa	scr2, [datap + (CH_CLO_DATA + CH_CHD_AFSR_EXT)]%asi; \
	set	C_AFSR_EXT_FATAL_ERRS, scr1; \
	bclr	scr1, scr2;		/* Clear fatal error bits here, */ \
	set	ASI_AFSR_EXT_VA, scr1;	/* so they're left */ \
	stxa	scr2, [scr1]ASI_AFSR;	/* as is in AFSR_EXT */ \
	membar	#Sync; \
1:

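/*
 * Note (added for illustration): the sethi %hh()/sllx 32 pair above
 * builds the C_AFSR_FATAL_ERRS constant without a full setx sequence.
 * sethi %hh() places bits <63:42> of the constant into bits <31:10> of
 * the register and the sllx by 32 moves them back up into place; the
 * construction only covers bits that lie within <63:42> of the AFSR.
 */
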
/*
 * This macro is used in the CPU logout code to capture diagnostic
 * information from the L2 cache on panther processors.
 *   afar:	input AFAR, not modified.
 *   datap:	Ptr to pn_l2_data_t, at end pts just past pn_l2_data_t.
 *   scr1:	Scratch.
 *   scr2:	Scratch.
 *   scr3:	Scratch.
 */
#define	GET_PN_L2_CACHE_DTAGS(afar, datap, scr1, scr2, scr3) \
	mov	afar, scr3; \
	set	PN_L2_INDEX_MASK, scr1; \
	and	scr3, scr1, scr3; \
	b	1f;	/* code to read tags and data should be ... */ \
	nop;	/* ...on the same cache line if possible. */ \
	.align	128;	/* update this line if you add lines below. */ \
1: \
	stxa	scr3, [datap + CH_EC_IDX]%asi;	/* store L2$ index */ \
	ldxa	[scr3]ASI_L2_TAG, scr1;		/* read the L2$ tag */ \
	stxa	scr1, [datap + CH_EC_TAG]%asi; \
	add	datap, CH_EC_DATA, datap; \
	clr	scr1; \
2: \
	ldxa	[scr3 + scr1]ASI_L2_DATA, scr2;	/* loop through */ \
	stxa	scr2, [datap]%asi;		/* <511:256> of L2 */ \
	add	datap, 8, datap;		/* data and record */ \
	cmp	scr1, (PN_L2_LINESIZE / 2) - 8;	/* it in the cpu */ \
	bne	2b;				/* logout struct. */ \
	add	scr1, 8, scr1; \
	set	PN_L2_DATA_ECC_SEL, scr2;	/* ECC_sel bit. */ \
	ldxa	[scr3 + scr2]ASI_L2_DATA, scr2;	/* Read and record */ \
	stxa	scr2, [datap]%asi;		/* ecc of <511:256> */ \
	add	datap, 8, datap; \
3: \
	ldxa	[scr3 + scr1]ASI_L2_DATA, scr2;	/* loop through */ \
	stxa	scr2, [datap]%asi;		/* <255:0> of L2 */ \
	add	datap, 8, datap;		/* data and record */ \
	cmp	scr1, PN_L2_LINESIZE - 8;	/* it in the cpu */ \
	bne	3b;				/* logout struct. */ \
	add	scr1, 8, scr1; \
	set	PN_L2_DATA_ECC_SEL, scr2;	/* ECC_sel bit. */ \
	add	scr2, PN_L2_ECC_LO_REG, scr2; \
	ldxa	[scr3 + scr2]ASI_L2_DATA, scr2;	/* Read and record */ \
	stxa	scr2, [datap]%asi;		/* ecc of <255:0>. */ \
	add	datap, 8, datap;		/* Advance pointer */ \
	set	PN_L2_SET_SIZE, scr2; \
	set	PN_L2_MAX_SET, scr1; \
	cmp	scr1, scr3;	/* more ways to try for this line? */ \
	bg,a	%xcc, 1b;	/* if so, start over with next way */ \
	add	scr3, scr2, scr3

/*
 * Cheetah+ assumes E$ is 2-way and grabs both E$ lines associated with afar.
 *	afar:	AFAR from access.
 *	datap:	pointer to cpu logout structure.
 *	scr1:	scratch
 *	scr2:	scratch
 *	scr3:	scratch
 */
#define	GET_ECACHE_DTAGS(afar, datap, scr1, scr2, scr3) \
	GET_CPU_IMPL(scr1); \
	cmp	scr1, PANTHER_IMPL; \
	bne	%xcc, 4f; \
	nop; \
	GET_ECACHE_DTAG(afar, datap, 0, scr1, scr2, scr3); \
	GET_ECACHE_DTAG(afar, datap, 1, scr1, scr2, scr3); \
	GET_ECACHE_DTAG(afar, datap, 2, scr1, scr2, scr3); \
	GET_ECACHE_DTAG(afar, datap, 3, scr1, scr2, scr3); \
	add	datap, (CHD_EC_DATA_SETS-4)*CH_EC_DATA_SIZE, datap; \
	GET_PN_L2_CACHE_DTAGS(afar, datap, scr1, scr2, scr3); \
	b	5f; \
	nop; \
4: \
	GET_ECACHE_DTAG(afar, datap, 0, scr1, scr2, scr3); \
	GET_ECACHE_WAY_BIT(scr1, scr2); \
	xor	afar, scr1, afar; \
	GET_ECACHE_DTAG(afar, datap, 0, scr1, scr2, scr3); \
	GET_ECACHE_WAY_BIT(scr1, scr2);		/* restore AFAR */ \
	xor	afar, scr1, afar; \
	add	datap, (CHD_EC_DATA_SETS-2)*CH_EC_DATA_SIZE, datap; \
	add	datap, CH_EC_DATA_SIZE * PN_L2_NWAYS, datap; \
5:

/*
 * Cheetah+ needs to capture E$, D$ and I$ lines associated with
 * shadow afar.
 *	afar:	scratch, holds shadow afar.
 *	datap:	pointer to cpu logout structure
 *	scr1:	scratch
 *	scr2:	scratch
 *	scr3:	scratch
 */
#define	GET_SHADOW_DATA(afar, datap, scr1, scr2, scr3) \
	ldxa	[datap + (CH_CLO_SDW_DATA + CH_CHD_AFAR)]%asi, afar; \
	add	datap, CH_CLO_SDW_DATA + CH_CHD_EC_DATA, datap; \
	GET_ECACHE_DTAGS(afar, datap, scr1, scr2, scr3); \
	GET_DCACHE_DTAG(afar, datap, scr1, scr2, scr3); \
	GET_ICACHE_DTAG(afar, datap, scr1, scr2, scr3); \
	sub	datap, CH_CPU_LOGOUT_SIZE, datap

/*
 * Compute the "Way" bit for 2-way Ecache for Cheetah+.
 */
#define	GET_ECACHE_WAY_BIT(scr1, scr2) \
	CPU_INDEX(scr1, scr2); \
	mulx	scr1, CPU_NODE_SIZE, scr1; \
	add	scr1, ECACHE_SIZE, scr1; \
	set	cpunodes, scr2; \
	ld	[scr1 + scr2], scr1; \
	srlx	scr1, 1, scr1

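/*
 * Worked example (added for illustration): GET_ECACHE_WAY_BIT loads
 * this CPU's ecache_size from its cpunode entry and halves it, so a
 * 2-way 8MB E$ yields 0x400000, the one index bit that distinguishes
 * the two ways.  GET_ECACHE_DTAGS above XORs that bit into the AFAR to
 * capture the line in the other way, then XORs it again to restore the
 * original AFAR.
 */
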
#else	/* CHEETAH_PLUS */
/*
 * Macro version of get_ecache_dtag.  We use this macro in the
 * CPU logout code.
 *   afar:	input AFAR, not modified.
 *   datap:	Ptr to ch_ec_data_t, at end pts just past ch_ec_data_t.
 *   scr1:	Scratch.
 *   scr2:	Scratch.
 *   scr3:	Scratch.
 */
#define	GET_ECACHE_DTAG(afar, datap, scr1, scr2, scr3) \
	mov	afar, scr3; \
	andn	scr3, (CH_ECACHE_SUBBLK_SIZE - 1), scr3; /* VA<5:0>=0 */ \
	set	(CH_ECACHE_8M_SIZE - 1), scr2; \
	and	scr3, scr2, scr3;		/* VA<63:23>=0 */ \
	b	1f; \
	stxa	scr3, [datap + CH_EC_IDX]%asi;	/* store E$ index */ \
	.align	64; \
1: \
	ldxa	[scr3]ASI_EC_DIAG, scr1;	/* get E$ tag */ \
	stxa	scr1, [datap + CH_EC_TAG]%asi; \
	add	datap, CH_EC_DATA, datap; \
2: \
	ldxa	[scr3]ASI_EC_R, %g0;		/* ld E$ stging regs */ \
	clr	scr1; \
3:						/* loop thru 5 regs */ \
	ldxa	[scr1]ASI_EC_DATA, scr2; \
	stxa	scr2, [datap]%asi; \
	add	datap, 8, datap; \
	cmp	scr1, CH_ECACHE_STGREG_TOTALSIZE - 8; \
	bne	3b; \
	add	scr1, 8, scr1; \
	btst	CH_ECACHE_STGREG_SIZE, scr3;	/* done? */ \
	beq	2b; \
	add	scr3, CH_ECACHE_STGREG_SIZE, scr3

/*
 * Cheetah does not have cores so these macros are null.
 */
#define	PARK_SIBLING_CORE(dcucr_reg, scr1, scr2)
#define	UNPARK_SIBLING_CORE(dcucr_reg, scr1, scr2)

/*
 * Cheetah gets primary AFSR and AFAR and clears the AFSR, except for the
 * fatal error bits.
 *	datap:	pointer to cpu logout structure.
 *	afar:	returned primary AFAR value.
 *	scr1:	scratch
 *	scr2:	scratch
 */
#define	GET_AFSR_AFAR(datap, afar, scr1, scr2) \
	ldxa	[%g0]ASI_AFAR, afar; \
	stxa	afar, [datap + (CH_CLO_DATA + CH_CHD_AFAR)]%asi; \
	ldxa	[%g0]ASI_AFSR, scr2; \
	stxa	scr2, [datap + (CH_CLO_DATA + CH_CHD_AFSR)]%asi; \
	sethi	%hh(C_AFSR_FATAL_ERRS), scr1; \
	sllx	scr1, 32, scr1; \
	bclr	scr1, scr2;	/* Clear fatal error bits here, so */ \
	stxa	scr2, [%g0]ASI_AFSR;	/* they're left as is in AFSR */ \
	membar	#Sync

/*
 * Cheetah E$ is direct-mapped, so we grab line data and skip second line.
 *	afar:	AFAR from access.
 *	datap:	pointer to cpu logout structure.
 *	scr1:	scratch
 *	scr2:	scratch
 *	scr3:	scratch
 */
#define	GET_ECACHE_DTAGS(afar, datap, scr1, scr2, scr3) \
	GET_ECACHE_DTAG(afar, datap, scr1, scr2, scr3); \
	add	datap, (CHD_EC_DATA_SETS-1)*CH_EC_DATA_SIZE, datap; \
	add	datap, CH_EC_DATA_SIZE * PN_L2_NWAYS, datap; \

/*
 * Cheetah has no shadow AFAR, null operation.
 */
#define	GET_SHADOW_DATA(afar, datap, scr1, scr2, scr3)

#endif	/* CHEETAH_PLUS */

/*
 * Cheetah/(Cheetah+ Jaguar Panther)/Jalapeno Macro for capturing CPU
 * logout data at TL>0.  r_val is a register that returns the "failure count"
 * to the caller, and may be used as a scratch register until the end of
 * the macro.  afar is used to return the primary AFAR value to the caller
 * and it too can be used as a scratch register until the end.  datap
 * points to the logout data area where the macro deposits the logout
 * data.  t_flags is a register that has the trap-type/trap-level/CEEN
 * info.  This t_flags register may be used after the GET_AFSR_AFAR macro.
 *
 * The CPU logout operation will fail (r_val > 0) if the logout
 * structure in question is already being used.  Otherwise, the CPU
 * logout operation will succeed (r_val = 0).  For failures, r_val
 * returns the busy count (# of times we tried using this CPU logout
 * structure when it was busy.)
 *
 * Register usage:
 *	%asi:	Must be set to either ASI_MEM if the address in datap
 *		is a physical address or to ASI_N if the address in
 *		datap is a virtual address.
 *	r_val:	This register is the return value which tells the
 *		caller whether or not the LOGOUT operation was successful.
 *		For failures, r_val returns the fail count (i.e. number of
 *		times we have tried to use this logout structure when it
 *		was already being used.)
 *	afar:	output: contains AFAR on exit
 *	t_flags: input trap type info, may be used as scratch after stored
 *		to cpu log out structure.
 *	datap:	Points to log out data area.
 *	scr1:	Scratch
 *	scr2:	Scratch (may be r_val)
 *	scr3:	Scratch (may be t_flags)
 */
#define	DO_TL1_CPU_LOGOUT(r_val, afar, t_flags, datap, scr1, scr2, scr3) \
	setx	LOGOUT_INVALID, scr2, scr1; \
	ldxa	[datap + (CH_CLO_DATA + CH_CHD_AFAR)]%asi, scr2; \
	cmp	scr2, scr1; \
	bne	8f; \
	nop; \
	stxa	t_flags, [datap + CH_CLO_FLAGS]%asi; \
	GET_AFSR_AFAR(datap, afar, scr1, scr2); \
	add	datap, CH_CLO_DATA + CH_CHD_EC_DATA, datap; \
	GET_ECACHE_DTAGS(afar, datap, scr1, scr2, scr3); \
	GET_DCACHE_DTAG(afar, datap, scr1, scr2, scr3); \
	GET_ICACHE_DTAG(afar, datap, scr1, scr2, scr3); \
	sub	datap, CH_CLO_DATA + CH_DIAG_DATA_SIZE, datap; \
	GET_SHADOW_DATA(afar, datap, scr1, scr2, scr3); \
	ldxa	[datap + (CH_CLO_DATA + CH_CHD_AFAR)]%asi, afar; \
	set	0, r_val;	/* return value for success */ \
	ba	9f; \
	nop; \
8: \
	ldxa	[%g0]ASI_AFAR, afar; \
	ldxa	[datap + CH_CLO_NEST_CNT]%asi, r_val; \
	inc	r_val;		/* return value for failure */ \
	stxa	r_val, [datap + CH_CLO_NEST_CNT]%asi; \
	membar	#Sync; \
9:

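/*
 * Illustrative usage (added; not from the original header).  A TL>0
 * handler that has the physical address of its logout area in %g3
 * might invoke the macro as:
 *
 *	wr	%g0, ASI_MEM, %asi
 *	DO_TL1_CPU_LOGOUT(%g4, %g2, %g5, %g3, %g6, %g4, %g5)
 *
 * afterwards %g4 is 0 on success (else the busy count) and %g2 holds
 * the primary AFAR.  Note that scr2/scr3 may legally alias
 * r_val/t_flags, as they do here.
 */
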
/*
 * Cheetah/(Cheetah+ Jaguar Panther)/Jalapeno Macro for capturing CPU
 * logout data.  Uses DO_TL1_CPU_LOGOUT macro defined above, and sets
 * up the expected data pointer in the scr1 register and sets the %asi
 * register to ASI_N for kernel virtual addresses instead of ASI_MEM as
 * is used at TL>0.
 *
 * The CPU logout operation will fail (r_val > 0) if the logout
 * structure in question is already being used.  Otherwise, the CPU
 * logout operation will succeed (r_val = 0).  For failures, r_val
 * returns the busy count (# of times we tried using this CPU logout
 * structure when it was busy.)
 *
 * Register usage:
 *	r_val:	This register is the return value which tells the
 *		caller whether or not the LOGOUT operation was successful.
 *		For failures, r_val returns the fail count (i.e. number of
 *		times we have tried to use this logout structure when it
 *		was already being used.)
 *	afar:	returns AFAR, used internally as afar value.
 *		output: if the cpu_private struct has not been initialized,
 *		then we return the t_flags value listed below.
 *	r_or_s:	input offset, either register or constant (symbol).  It's
 *		OK for r_or_s to be a register as long as it's not scr1 or
 *		scr3.
 *	t_flags: input trap type info, may be used as scratch after stored
 *		to cpu log out structure.
 *	scr1:	Scratch, points to log out data area.
 *	scr2:	Scratch (may be r_or_s)
 *	scr3:	Scratch (may be r_val)
 *	scr4:	Scratch (may be t_flags)
 */
#define	DO_CPU_LOGOUT(r_val, afar, r_or_s, t_flags, scr1, scr2, scr3, scr4) \
	GET_CPU_PRIVATE_PTR(r_or_s, scr1, scr3, 7f); /* can't use scr2/4 */ \
	wr	%g0, ASI_N, %asi; \
	DO_TL1_CPU_LOGOUT(r_val, afar, t_flags, scr1, scr2, scr3, scr4) \
	ba	6f; \
	nop; \
7: \
	mov	t_flags, afar;	/* depends on afar = %g2 */ \
	set	0, r_val;	/* success in this case. */ \
6:

/*
 * The P$ is flushed as a side effect of writing to the Primary
 * or Secondary Context Register.  After writing to a context
 * register, every line of the P$ in the Valid state is invalidated,
 * regardless of which context it belongs to.
 * This routine simply touches the Primary context register by
 * reading the current value and writing it back.  The Primary
 * context is not changed.
 */
#define	PCACHE_FLUSHALL(tmp1, tmp2, tmp3) \
	sethi	%hi(FLUSH_ADDR), tmp1 ;\
	set	MMU_PCONTEXT, tmp2 ;\
	ldxa	[tmp2]ASI_DMMU, tmp3 ;\
	stxa	tmp3, [tmp2]ASI_DMMU ;\
	flush	tmp1	/* See Cheetah PRM 8.10.2 */

/*
 * Macro that flushes the entire Dcache.
 *
 * arg1 = dcache size
 * arg2 = dcache linesize
 */
#define	CH_DCACHE_FLUSHALL(arg1, arg2, tmp1) \
	sub	arg1, arg2, tmp1; \
1: \
	stxa	%g0, [tmp1]ASI_DC_TAG; \
	membar	#Sync; \
	cmp	%g0, tmp1; \
	bne,pt	%icc, 1b; \
	sub	tmp1, arg2, tmp1;

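/*
 * Illustrative invocation (added; register choice is only an example,
 * sized for Cheetah's 64KB, 32-byte-line D$):
 *
 *	set	(64 * 1024), %o0
 *	set	32, %o1
 *	CH_DCACHE_FLUSHALL(%o0, %o1, %o2)
 *
 * This writes a zero tag to every line, walking from the top of the
 * cache down to index 0.
 */
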
/*
 * Macro that flushes the entire Icache.
 *
 * Note that we cannot access ASI 0x67 (ASI_IC_TAG) with the Icache on,
 * because accesses to ASI 0x67 interfere with Icache coherency.  We
 * must make sure the Icache is off, then turn it back on after the entire
 * cache has been invalidated.  If the Icache is originally off, we'll just
 * clear the tags but not turn the Icache on.
 *
 * arg1 = icache size
 * arg2 = icache linesize
 */
#define	CH_ICACHE_FLUSHALL(arg1, arg2, tmp1, tmp2) \
	ldxa	[%g0]ASI_DCU, tmp2; \
	andn	tmp2, DCU_IC, tmp1; \
	stxa	tmp1, [%g0]ASI_DCU; \
	flush	%g0;	/* flush required after changing the IC bit */ \
	sllx	arg2, 1, arg2;		/* arg2 = linesize * 2 */ \
	sllx	arg1, 1, arg1;		/* arg1 = size * 2 */ \
	sub	arg1, arg2, arg1; \
	or	arg1, CH_ICTAG_LOWER, arg1;	/* "write" tag */ \
1: \
	stxa	%g0, [arg1]ASI_IC_TAG; \
	membar	#Sync;			/* Cheetah PRM 8.9.3 */ \
	cmp	arg1, CH_ICTAG_LOWER; \
	bne,pt	%icc, 1b; \
	sub	arg1, arg2, arg1; \
	stxa	tmp2, [%g0]ASI_DCU; \
	flush	%g0;	/* flush required after changing the IC bit */


#if defined(JALAPENO) || defined(SERRANO)

/*
 * ASI access to the L2 tag or L2 flush can hang the cpu when interacting
 * with combinations of L2 snoops, victims and stores.
 *
 * A possible workaround is to surround each L2 ASI access with membars
 * and make sure that the code is hitting in the Icache.  This requires
 * aligning code sequence at E$ boundary and forcing I$ fetch by
 * jumping to selected offsets so that we don't take any I$ misses
 * during ASI access to the L2 tag or L2 flush.  This also requires
 * making sure that we don't take any interrupts or traps (such as
 * fast ECC trap, I$/D$ tag parity error) which can result in eviction
 * of this code sequence from I$, thus causing a miss.
 *
 * Because of the complexity/risk, we have decided to do a partial fix
 * of adding membar around each ASI access to the L2 tag or L2 flush.
 */

#define	JP_EC_DIAG_ACCESS_MEMBAR \
	membar	#Sync

/*
 * Jalapeno version of macro that flushes the entire Ecache.
 *
 * Uses Jalapeno displacement flush feature of ASI_EC_DIAG.
 *
 * arg1 = ecache size
 * arg2 = ecache linesize - not modified; can be an immediate constant.
 */
#define	ECACHE_FLUSHALL(arg1, arg2, tmp1, tmp2) \
	CPU_INDEX(tmp1, tmp2); \
	set	JP_ECACHE_IDX_DISP_FLUSH, tmp2; \
	sllx	tmp1, JP_ECFLUSH_PORTID_SHIFT, tmp1; \
	or	tmp1, tmp2, tmp1; \
	srlx	arg1, JP_EC_TO_SET_SIZE_SHIFT, tmp2; \
1: \
	subcc	tmp2, arg2, tmp2; \
	JP_EC_DIAG_ACCESS_MEMBAR; \
	ldxa	[tmp1 + tmp2]ASI_EC_DIAG, %g0; \
	JP_EC_DIAG_ACCESS_MEMBAR; \
	bg,pt	%xcc, 1b; \
	nop; \
	mov	1, tmp2; \
	sllx	tmp2, JP_ECFLUSH_EC_WAY_SHIFT, tmp2; \
	add	tmp1, tmp2, tmp1; \
	mov	(JP_ECACHE_NWAY-1), tmp2; \
	sllx	tmp2, JP_ECFLUSH_EC_WAY_SHIFT, tmp2; \
	andcc	tmp1, tmp2, tmp2; \
	bnz,pt	%xcc, 1b; \
	srlx	arg1, JP_EC_TO_SET_SIZE_SHIFT, tmp2

#else	/* JALAPENO || SERRANO */

/*
 * Cheetah version of macro that flushes the entire Ecache.
 *
 * Need to displacement flush 2x ecache size from Ecache flush area.
 *
 * arg1 = ecache size
 * arg2 = ecache linesize
 * arg3 = ecache flush address - for cheetah only
 */
#define	CH_ECACHE_FLUSHALL(arg1, arg2, arg3) \
	sllx	arg1, 1, arg1; \
1: \
	subcc	arg1, arg2, arg1; \
	bg,pt	%xcc, 1b; \
	ldxa	[arg1 + arg3]ASI_MEM, %g0;

/*
 * Cheetah+ version of macro that flushes the entire Ecache.
 *
 * Uses the displacement flush feature.
 *
 * arg1 = ecache size
 * arg2 = ecache linesize
 * impl = CPU implementation as returned from GET_CPU_IMPL()
 *	The value in this register is destroyed during execution
 *	of the macro.
 */
#if defined(CHEETAH_PLUS)
#define	CHP_ECACHE_FLUSHALL(arg1, arg2, impl) \
	cmp	impl, PANTHER_IMPL; \
	bne	%xcc, 1f; \
	nop; \
	set	PN_L3_IDX_DISP_FLUSH, impl; \
	b	2f; \
	nop; \
1: \
	set	CHP_ECACHE_IDX_DISP_FLUSH, impl; \
2: \
	subcc	arg1, arg2, arg1; \
	bg,pt	%xcc, 2b; \
	ldxa	[arg1 + impl]ASI_EC_DIAG, %g0;
#else	/* CHEETAH_PLUS */
#define	CHP_ECACHE_FLUSHALL(arg1, arg2, impl)
#endif	/* CHEETAH_PLUS */

/*
 * Macro that flushes the entire Ecache.
 *
 * arg1 = ecache size
 * arg2 = ecache linesize
 * arg3 = ecache flush address - for cheetah only
 */
#define	ECACHE_FLUSHALL(arg1, arg2, arg3, tmp1) \
	GET_CPU_IMPL(tmp1); \
	cmp	tmp1, CHEETAH_IMPL; \
	bne	%xcc, 2f; \
	nop; \
	CH_ECACHE_FLUSHALL(arg1, arg2, arg3); \
	ba	3f; \
	nop; \
2: \
	CHP_ECACHE_FLUSHALL(arg1, arg2, tmp1); \
3:

#endif	/* JALAPENO || SERRANO */

/*
 * Macro that flushes the Panther L2 cache.
 */
#if defined(CHEETAH_PLUS)
#define	PN_L2_FLUSHALL(scr1, scr2, scr3) \
	GET_CPU_IMPL(scr3); \
	cmp	scr3, PANTHER_IMPL; \
	bne	%xcc, 2f; \
	nop; \
	set	PN_L2_SIZE, scr1; \
	set	PN_L2_LINESIZE, scr2; \
	set	PN_L2_IDX_DISP_FLUSH, scr3; \
1: \
	subcc	scr1, scr2, scr1; \
	bg,pt	%xcc, 1b; \
	ldxa	[scr1 + scr3]ASI_L2_TAG, %g0; \
2:
#else	/* CHEETAH_PLUS */
#define	PN_L2_FLUSHALL(scr1, scr2, scr3)
#endif	/* CHEETAH_PLUS */

/*
 * Given a VA and page size (page size as encoded in ASI_MMU_TAG_ACCESS_EXT),
 * this macro returns the TLB index for that mapping based on a 512 entry
 * (2-way set associative) TLB.  Aside from the 16 entry fully associative
 * TLBs, all TLBs in Panther are 512 entry, 2-way set associative.
 *
 * To find the index, we shift the VA right by 13 + (3 * pg_sz) and then
 * mask out all but the lower 8 bits because:
 *
 *	ASI_[D|I]MMU_TAG_ACCESS_EXT.PgSz = 0 for   8K
 *	ASI_[D|I]MMU_TAG_ACCESS_EXT.PgSz = 1 for  64K
 *	ASI_[D|I]MMU_TAG_ACCESS_EXT.PgSz = 2 for 512K
 *	ASI_[D|I]MMU_TAG_ACCESS_EXT.PgSz = 3 for   4M
 *	ASI_[D|I]MMU_TAG_ACCESS_EXT.PgSz = 4 for  32M
 *	ASI_[D|I]MMU_TAG_ACCESS_EXT.PgSz = 5 for 256M
 *
 * and
 *
 *	array index for   8K pages = VA[20:13]
 *	array index for  64K pages = VA[23:16]
 *	array index for 512K pages = VA[26:19]
 *	array index for   4M pages = VA[29:22]
 *	array index for  32M pages = VA[32:25]
 *	array index for 256M pages = VA[35:28]
 *
 * Inputs:
 *
 *	va	- Register.
 *		  Input: Virtual address in which we are interested.
 *		  Output: TLB index value.
 *	pg_sz	- Register.  Page Size of the TLB in question as encoded
 *		  in the ASI_[D|I]MMU_TAG_ACCESS_EXT register.
 */
#if defined(CHEETAH_PLUS)
#define	PN_GET_TLB_INDEX(va, pg_sz) \
	srlx	va, 13, va;	/* first shift by 13 bits and then */ \
	srlx	va, pg_sz, va;	/* shift by pg_sz three times. */ \
	srlx	va, pg_sz, va; \
	srlx	va, pg_sz, va; \
	and	va, 0xff, va;	/* mask out all but the lower 8 bits */
#endif	/* CHEETAH_PLUS */

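/*
 * Worked example (added for illustration): for a 64K page, pg_sz = 1,
 * so va = 0x12345678 is shifted right by 13 + (3 * 1) = 16 to give
 * 0x1234; masking with 0xff leaves index 0x34, which is VA[23:16] as
 * the table above requires.
 */
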
/*
 * The following macros are for error traps at TL>0.
 * The issue with error traps at TL>0 is that there are no safely
 * available global registers.  So we use the trick of generating a
 * software trap, then using the %tpc, %tnpc and %tstate registers to
 * temporarily save the values of %g1 and %g2.
 */

/*
 * Macro to generate 8-instruction trap table entry for TL>0 trap handlers.
 * Does the following steps:
 *	1. membar #Sync - required for USIII family errors.
 *	2. Specified software trap.
 * NB: Must be 8 instructions or less to fit in trap table and code must
 *     be relocatable.
 */
#define	CH_ERR_TL1_TRAPENTRY(trapno) \
	membar	#Sync; \
	ta	trapno; \
	nop; nop; nop; nop; nop; nop

/*
 * Macro to generate 8-instruction trap table entry for TL>0 software trap.
 * We save the values of %g1 and %g2 in %tpc, %tnpc and %tstate (since
 * the low-order two bits of %tpc/%tnpc are reserved and read as zero,
 * we need to put the low-order two bits of %g1 and %g2 in %tstate).
 * Note that %tstate has a reserved hole from bits 3-7, so we put the
 * low-order two bits of %g1 in bits 0-1 and the low-order two bits of
 * %g2 in bits 10-11 (ensuring bits 8-9 are zero for use by the D$/I$
 * state bits).  Note that we must do a jmp instruction, since this
 * is moved into the trap table entry.
 * NB: Must be 8 instructions or less to fit in trap table and code must
 *     be relocatable.
 */
#define	CH_ERR_TL1_SWTRAPENTRY(label) \
	wrpr	%g1, %tpc; \
	and	%g1, 3, %g1; \
	wrpr	%g2, %tnpc; \
	sllx	%g2, CH_ERR_G2_TO_TSTATE_SHFT, %g2; \
	or	%g1, %g2, %g2; \
	sethi	%hi(label), %g1; \
	jmp	%g1+%lo(label); \
	wrpr	%g2, %tstate

/*
 * Macro to get ptr to ch_err_tl1_data.
 * reg1 will either point to a physaddr with ASI_MEM in %asi OR it
 * will point to a kernel nucleus virtual address with ASI_N in %asi.
 * This allows us to:
 *	1. Avoid getting MMU misses.  We may have gotten the original
 *	   Fast ECC error in an MMU handler and if we get an MMU trap
 *	   in the TL>0 handlers, we'll scribble on the MMU regs.
 *	2. Allows us to use the same code in the TL>0 handlers whether
 *	   we're accessing kernel nucleus virtual addresses or physical
 *	   addresses.
 * pseudo-code:
 *	reg1 <- ch_err_tl1_paddrs[CPUID];
 *	if (reg1 == NULL) {
 *		reg1 <- &ch_err_tl1_data
 *		%asi <- ASI_N
 *	} else {
 *		reg1 <- reg1 + offset +
 *		    sizeof (ch_err_tl1_data) * (%tl - 3)
 *		%asi <- ASI_MEM
 *	}
 */
#define	GET_CH_ERR_TL1_PTR(reg1, reg2, offset) \
	CPU_INDEX(reg1, reg2); \
	sllx	reg1, 3, reg1; \
	set	ch_err_tl1_paddrs, reg2; \
	ldx	[reg1+reg2], reg1; \
	brnz	reg1, 1f; \
	add	reg1, offset, reg1; \
	set	ch_err_tl1_data, reg1; \
	ba	2f; \
	wr	%g0, ASI_N, %asi; \
1:	rdpr	%tl, reg2; \
	sub	reg2, 3, reg2; \
	mulx	reg2, CH_ERR_TL1_DATA_SIZE, reg2; \
	add	reg1, reg2, reg1; \
	wr	%g0, ASI_MEM, %asi; \
2:

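/*
 * Sketch (added for illustration) of how CH_ERR_TL1_ENTER below
 * reconstitutes the globals that CH_ERR_TL1_SWTRAPENTRY stashed away:
 *
 *	%g1 = (%tpc  & ~3) |  (%tstate & 3)
 *	%g2 = (%tnpc & ~3) | ((%tstate >> CH_ERR_G2_TO_TSTATE_SHFT) & 3)
 */
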
/*
 * Macro to generate entry code for TL>0 error handlers.
 * At the end of this macro, %g1 will point to the ch_err_tl1_data
 * structure and %g2 will have the original flags in the ch_err_tl1_data
 * structure and %g5 will have the value of %tstate, in which the Fast ECC
 * routines will save the state of the D$ (CH_ERR_TSTATE_DC_ON).
 * All %g registers except for %g1, %g2 and %g5 will be available after
 * this macro.
 * Does the following steps:
 *	1. Compute physical address of per-cpu/per-tl save area using
 *	   only %g1+%g2 (which we've saved in %tpc, %tnpc, %tstate)
 *	   leaving address in %g1 and updating the %asi register.
 *	   If no per-cpu physical area is available, the kernel nucleus
 *	   ch_err_tl1_data area is used instead.
 *	2. Save %g3-%g7 in save area.
 *	3. Save %tpc->%g3, %tnpc->%g4, %tstate->%g5, which contain
 *	   original %g1+%g2 values (because we're going to change %tl).
 *	4. set %tl <- %tl - 1.  We do this ASAP to make window of
 *	   running at %tl+1 as small as possible.
 *	5. Reconstitute %g1+%g2 from %tpc (%g3), %tnpc (%g4),
 *	   %tstate (%g5) and save in save area, carefully preserving %g5
 *	   because it has the CH_ERR_TSTATE_DC_ON value.
 *	6. Load existing ch_err_tl1_data flags in %g2
 *	7. Compute the new flags
 *	8. If %g2 is non-zero (the structure was busy), shift the new
 *	   flags by CH_ERR_ME_SHIFT and or them with the old flags.
 *	9. Store the updated flags into ch_err_tl1_data flags.
 *	10. If %g2 was zero (the structure was not busy), save the
 *	    %tpc in ch_err_tl1_data.
 */
#define	CH_ERR_TL1_ENTER(flags) \
	GET_CH_ERR_TL1_PTR(%g1, %g2, CHPR_TL1_ERR_DATA); \
	stxa	%g3, [%g1 + CH_ERR_TL1_G3]%asi; \
	stxa	%g4, [%g1 + CH_ERR_TL1_G4]%asi; \
	stxa	%g5, [%g1 + CH_ERR_TL1_G5]%asi; \
	stxa	%g6, [%g1 + CH_ERR_TL1_G6]%asi; \
	stxa	%g7, [%g1 + CH_ERR_TL1_G7]%asi; \
	rdpr	%tpc, %g3; \
	rdpr	%tnpc, %g4; \
	rdpr	%tstate, %g5; \
	rdpr	%tl, %g6; \
	sub	%g6, 1, %g6; \
	wrpr	%g6, %tl; \
	and	%g5, 3, %g6; \
	andn	%g3, 3, %g3; \
	or	%g3, %g6, %g3; \
	stxa	%g3, [%g1 + CH_ERR_TL1_G1]%asi; \
	srlx	%g5, CH_ERR_G2_TO_TSTATE_SHFT, %g6; \
	and	%g6, 3, %g6; \
	andn	%g4, 3, %g4; \
	or	%g6, %g4, %g4; \
	stxa	%g4, [%g1 + CH_ERR_TL1_G2]%asi; \
	ldxa	[%g1 + CH_ERR_TL1_FLAGS]%asi, %g2; \
	set	flags | CH_ERR_TL, %g3; \
	brz	%g2, 9f; \
	sllx	%g3, CH_ERR_ME_SHIFT, %g4; \
	or	%g2, %g4, %g3; \
9:	stxa	%g3, [%g1 + CH_ERR_TL1_FLAGS]%asi; \
	brnz	%g2, 8f; \
	rdpr	%tpc, %g4; \
	stxa	%g4, [%g1 + CH_ERR_TL1_TPC]%asi; \
8:

/*
 * Turns off D$/I$ and saves the state of DCU_DC+DCU_IC in %tstate Bits 8+9
 * (CH_ERR_TSTATE_DC_ON/CH_ERR_TSTATE_IC_ON).  This is invoked on Fast ECC
 * at TL>0 handlers because the D$ may have corrupted data and we need to
 * turn off the I$ to allow for diagnostic accesses.  We then invoke
 * the normal entry macro and after it is done we save the values of
 * the original D$/I$ state, which is in %g5 bits CH_ERR_TSTATE_DC_ON/
 * CH_ERR_TSTATE_IC_ON, in ch_err_tl1_tmp.
 */
#define	CH_ERR_TL1_FECC_ENTER \
	ldxa	[%g0]ASI_DCU, %g1; \
	andn	%g1, DCU_DC + DCU_IC, %g2; \
	stxa	%g2, [%g0]ASI_DCU; \
	flush	%g0;	/* DCU_IC need flush */ \
	rdpr	%tstate, %g2; \
	and	%g1, DCU_DC + DCU_IC, %g1; \
	sllx	%g1, CH_ERR_DCU_TO_TSTATE_SHFT, %g1; \
	or	%g1, %g2, %g2; \
	wrpr	%g2, %tstate; \
	CH_ERR_TL1_ENTER(CH_ERR_FECC); \
	and	%g5, CH_ERR_TSTATE_DC_ON + CH_ERR_TSTATE_IC_ON, %g5; \
	stxa	%g5, [%g1 + CH_ERR_TL1_TMP]%asi

/*
 * Macro to generate exit code for TL>0 error handlers.
 * We fall into this macro if we've successfully logged the error in
 * the ch_err_tl1_data structure and want the PIL15 softint to pick
 * it up and log it.
 * Does the following steps:
 *	1. Set pending flag for this cpu in ch_err_tl1_pending.
 *	2. Write %set_softint with (1<<pil) to cause a pil level trap
 *	3. Restore registers from ch_err_tl1_data, which is pointed to
 *	   by %g1, last register to restore is %g1 since it's pointing
 *	   to the save area.
 *	4. Execute retry
 */
#define	CH_ERR_TL1_EXIT \
	CPU_INDEX(%g2, %g3); \
	set	ch_err_tl1_pending, %g3; \
	set	-1, %g4; \
	stb	%g4, [%g2 + %g3]; \
	mov	1, %g2; \
	sll	%g2, PIL_15, %g2; \
	wr	%g2, SET_SOFTINT; \
	ldxa	[%g1 + CH_ERR_TL1_G7]%asi, %g7; \
	ldxa	[%g1 + CH_ERR_TL1_G6]%asi, %g6; \
	ldxa	[%g1 + CH_ERR_TL1_G5]%asi, %g5; \
	ldxa	[%g1 + CH_ERR_TL1_G4]%asi, %g4; \
	ldxa	[%g1 + CH_ERR_TL1_G3]%asi, %g3; \
	ldxa	[%g1 + CH_ERR_TL1_G2]%asi, %g2; \
	ldxa	[%g1 + CH_ERR_TL1_G1]%asi, %g1; \
	retry

/*
 * Generates unrecoverable error label for TL>0 handlers.
 * At label (Unrecoverable error routine)
 *	1. Sets flags in ch_err_tl1_data and leaves in %g2 (first
 *	   argument to cpu_tl1_err_panic).
 *	2. Call cpu_tl1_err_panic via systrap at PIL 15
 */
#define	CH_ERR_TL1_PANIC_EXIT(label) \
label:	ldxa	[%g1 + CH_ERR_TL1_FLAGS]%asi, %g2; \
	or	%g2, CH_ERR_TL | CH_ERR_PANIC, %g2; \
	stxa	%g2, [%g1 + CH_ERR_TL1_FLAGS]%asi; \
	set	cpu_tl1_err_panic, %g1; \
	ba	sys_trap; \
	mov	PIL_15, %g4


/* END CSTYLED */
#endif	/* _ASM */

#ifdef __cplusplus
}
#endif

#endif	/* _CHEETAHASM_H */