/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License, Version 1.0 only
 * (the "License").  You may not use this file except in compliance
 * with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#ifndef _CHEETAHASM_H
#define _CHEETAHASM_H

#pragma ident "%Z%%M% %I% %E% SMI"

#ifdef __cplusplus
extern "C" {
#endif

#ifdef _ASM
/* BEGIN CSTYLED */

#define ASM_LD(reg, symbol) \
        sethi %hi(symbol), reg; \
        ld [reg + %lo(symbol)], reg; \

#define ASM_LDX(reg, symbol) \
        sethi %hi(symbol), reg; \
        ldx [reg + %lo(symbol)], reg; \

#define ASM_JMP(reg, symbol) \
        sethi %hi(symbol), reg; \
        jmp reg + %lo(symbol); \
        nop

/*
 * Macro for getting a pointer at a given offset from the 'cpu_private'
 * ptr.  The 'cpu_private' ptr is in the machcpu structure.
 *   off_reg: Register offset from 'cpu_private' ptr.
 *   scr1:    Scratch, ptr is returned in this register.
 *   scr2:    Scratch
 *   label:   Label to branch to if cpu_private ptr is null/zero.
 */
#define GET_CPU_PRIVATE_PTR(off_reg, scr1, scr2, label) \
        CPU_ADDR(scr1, scr2); \
        ldn [scr1 + CPU_PRIVATE], scr1; \
        cmp scr1, 0; \
        be label; \
        nop; \
        add scr1, off_reg, scr1
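
/*
 * A minimal usage sketch (illustrative only; the offset, registers and
 * label here are arbitrary choices, not taken from this file's callers):
 *
 *      GET_CPU_PRIVATE_PTR(CHPR_TL1_ERR_DATA, %g1, %g2, no_private)
 *      ! %g1 now points CHPR_TL1_ERR_DATA bytes into this CPU's
 *      ! cpu_private area; control transfers to no_private if the
 *      ! cpu_private ptr was still NULL.
 */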

/*
 * Macro version of get_dcache_dtag.  We use this macro in the
 * CPU logout code.  Since the Dcache is virtually indexed, only
 * bits [12:5] of the AFAR can be used, so we need to search through
 * 8 indexes (4 ways + bit 13) in order to find the tag we want.
 *   afar:  input AFAR, not modified.
 *   datap: input ptr to ch_dc_data_t, at end pts to end of ch_dc_data_t.
 *   scr1:  scratch.
 *   scr2:  scratch, will hold tag to look for.
 *   scr3:  used for Dcache index, loops through 4 ways.
 */
#define GET_DCACHE_DTAG(afar, datap, scr1, scr2, scr3) \
        set CH_DCACHE_IDX_MASK, scr3; \
        and afar, scr3, scr3; \
        srlx afar, CH_DCTAG_PA_SHIFT, scr2; \
        b 1f; \
        or scr2, CH_DCTAG_VALID_BIT, scr2; /* tag we want */ \
        .align 128; \
1: \
        ldxa [scr3]ASI_DC_TAG, scr1; /* read tag */ \
        cmp scr1, scr2; \
        bne 4f; /* not found? */ \
        nop; \
        stxa scr3, [datap + CH_DC_IDX]%asi; /* store index */ \
        stxa scr1, [datap + CH_DC_TAG]%asi; /* store tag */ \
        membar #Sync; /* Cheetah PRM 10.6.3 */ \
        ldxa [scr3]ASI_DC_UTAG, scr1; /* read utag */ \
        membar #Sync; /* Cheetah PRM 10.6.3 */ \
        stxa scr1, [datap + CH_DC_UTAG]%asi; \
        ldxa [scr3]ASI_DC_SNP_TAG, scr1; /* read snoop tag */ \
        stxa scr1, [datap + CH_DC_SNTAG]%asi; \
        add datap, CH_DC_DATA, datap; \
        clr scr2; \
2: \
        membar #Sync; /* Cheetah PRM 10.6.1 */ \
        ldxa [scr3 + scr2]ASI_DC_DATA, scr1; /* read data */ \
        membar #Sync; /* Cheetah PRM 10.6.1 */ \
        stxa scr1, [datap]%asi; \
        add datap, 8, datap; \
        cmp scr2, CH_DC_DATA_REG_SIZE - 8; \
        blt 2b; \
        add scr2, 8, scr2; \
\
        GET_CPU_IMPL(scr2); /* Parity bits are elsewhere for */ \
        cmp scr2, PANTHER_IMPL; /* panther processors. */ \
        bne,a 5f; /* Done if not panther. */ \
        add datap, 8, datap; /* Skip to the end of the struct. */ \
        clr scr2; \
        add datap, 7, datap; /* offset of the last parity byte */ \
        mov 1, scr1; \
        sll scr1, PN_DC_DATA_PARITY_BIT_SHIFT, scr1; \
        or scr3, scr1, scr3; /* add DC_data_parity bit to index */ \
3: \
        membar #Sync; /* Cheetah PRM 10.6.1 */ \
        ldxa [scr3 + scr2]ASI_DC_DATA, scr1; /* read parity bits */ \
        membar #Sync; /* Cheetah PRM 10.6.1 */ \
        stba scr1, [datap]%asi; \
        dec datap; \
        cmp scr2, CH_DC_DATA_REG_SIZE - 8; \
        blt 3b; \
        add scr2, 8, scr2; \
        b 5f; \
        add datap, 5, datap; /* set pointer to end of our struct */ \
4: \
        set CH_DCACHE_IDX_INCR, scr1; /* incr. idx (scr3) */ \
        add scr3, scr1, scr3; \
        set CH_DCACHE_IDX_LIMIT, scr1; /* done? */ \
        cmp scr3, scr1; \
        blt 1b; \
        nop; \
        add datap, CH_DC_DATA_SIZE, datap; \
5:
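
/*
 * Why eight indexes (a reader's note, reasoning from the comment above
 * and assuming the Cheetah-family 64KB, 4-way, 32-byte-line Dcache):
 * each 16KB way is indexed by VA[13:5], but with 8KB pages only
 * VA[12:0] is guaranteed to equal the corresponding physical AFAR
 * bits.  Bit 13 is therefore unknown, so both settings of bit 13 must
 * be probed for each of the 4 ways: 4 ways * 2 = 8 candidate indexes.
 */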

/*
 * Macro version of get_icache_dtag.  We use this macro in the CPU
 * logout code.  If the Icache is on, we don't want to capture the data.
 *   afar:  input AFAR, not modified.
 *   datap: input ptr to ch_ic_data_t, at end pts to end of ch_ic_data_t.
 *   scr1:  scratch.
 *   scr2:  scratch, will hold tag to look for.
 *   scr3:  used for Icache index, loops through 4 ways.
 * Note: For Panther, the Icache is virtually indexed and increases in
 * size to 64KB (instead of 32KB) with a line size of 64 bytes (instead
 * of 32).  This means the IC_addr index bits[14:7] for Panther now
 * correspond to VA bits[13:6].  But since it is virtually indexed, we
 * still mask out only bits[12:5] from the AFAR (we have to manually
 * check bit 13).  In order to make this code work for all processors,
 * we end up checking twice as many indexes (8 instead of 4) as required
 * for non-Panther CPUs and saving off twice as much data (16 instructions
 * instead of just 8).
 */
#define GET_ICACHE_DTAG(afar, datap, scr1, scr2, scr3) \
        ldxa [%g0]ASI_DCU, scr1; \
        btst DCU_IC, scr1; /* is Icache enabled? */ \
        bne,a 6f; /* yes, don't capture */ \
        add datap, CH_IC_DATA_SIZE, datap; /* annul if no branch */ \
        GET_CPU_IMPL(scr2); /* Panther only uses VA[13:6] */ \
        cmp scr2, PANTHER_IMPL; /* and we also want to mask */ \
        be 1f; /* out bit 13 since the */ \
        nop; /* Panther I$ is VIPT. */ \
        set CH_ICACHE_IDX_MASK, scr3; \
        b 2f; \
        nop; \
1: \
        set PN_ICACHE_VA_IDX_MASK, scr3; \
2: \
        and afar, scr3, scr3; \
        sllx scr3, CH_ICACHE_IDX_SHIFT, scr3; \
        srlx afar, CH_ICPATAG_SHIFT, scr2; /* pa tag we want */ \
        andn scr2, CH_ICPATAG_LBITS, scr2; /* mask off lower */ \
        b 3f; \
        nop; \
        .align 128; \
3: \
        ldxa [scr3]ASI_IC_TAG, scr1; /* read pa tag */ \
        andn scr1, CH_ICPATAG_LBITS, scr1; /* mask off lower */ \
        cmp scr1, scr2; \
        bne 5f; /* not found? */ \
        nop; \
        stxa scr3, [datap + CH_IC_IDX]%asi; /* store index */ \
        stxa scr1, [datap + CH_IC_PATAG]%asi; /* store pa tag */ \
        add scr3, CH_ICTAG_UTAG, scr3; /* read utag */ \
        ldxa [scr3]ASI_IC_TAG, scr1; \
        add scr3, (CH_ICTAG_UPPER - CH_ICTAG_UTAG), scr3; \
        stxa scr1, [datap + CH_IC_UTAG]%asi; \
        ldxa [scr3]ASI_IC_TAG, scr1; /* read upper tag */ \
        add scr3, (CH_ICTAG_LOWER - CH_ICTAG_UPPER), scr3; \
        stxa scr1, [datap + CH_IC_UPPER]%asi; \
        ldxa [scr3]ASI_IC_TAG, scr1; /* read lower tag */ \
        andn scr3, CH_ICTAG_TMASK, scr3; \
        stxa scr1, [datap + CH_IC_LOWER]%asi; \
        ldxa [scr3]ASI_IC_SNP_TAG, scr1; /* read snoop tag */ \
        stxa scr1, [datap + CH_IC_SNTAG]%asi; \
        add datap, CH_IC_DATA, datap; \
        clr scr2; \
4: \
        ldxa [scr3 + scr2]ASI_IC_DATA, scr1; /* read ins. data */ \
        stxa scr1, [datap]%asi; \
        add datap, 8, datap; \
        cmp scr2, PN_IC_DATA_REG_SIZE - 8; \
        blt 4b; \
        add scr2, 8, scr2; \
        b 6f; \
        nop; \
5: \
        set CH_ICACHE_IDX_INCR, scr1; /* incr. idx (scr3) */ \
        add scr3, scr1, scr3; \
        set PN_ICACHE_IDX_LIMIT, scr1; /* done? */ \
        cmp scr3, scr1; \
        blt 3b; \
        nop; \
        add datap, CH_IC_DATA_SIZE, datap; \
6:

#if defined(JALAPENO) || defined(SERRANO)
/*
 * Macro version of get_ecache_dtag.  We use this macro in the
 * CPU logout code.
 *   afar:   input AFAR, not modified.
 *   datap:  Ptr to ch_ec_data_t, at end pts just past ch_ec_data_t.
 *   ec_way: Constant value (way number).
 *   scr1:   Scratch.
 *   scr2:   Scratch.
 *   scr3:   Scratch.
 */
#define GET_ECACHE_DTAG(afar, datap, ec_way, scr1, scr2, scr3) \
        mov ec_way, scr1; \
        and scr1, JP_ECACHE_NWAY - 1, scr1; /* mask E$ way bits */ \
        sllx scr1, JP_EC_TAG_DATA_WAY_SHIFT, scr1; \
        set ((JP_ECACHE_MAX_SIZE / JP_ECACHE_NWAY) - 1), scr2; \
        and afar, scr2, scr3; /* get set offset */ \
        andn scr3, (JP_ECACHE_MAX_LSIZE - 1), scr3; /* VA<5:0>=0 */ \
        or scr3, scr1, scr3; /* or WAY bits */ \
        b 1f; \
        stxa scr3, [datap + CH_EC_IDX]%asi; /* store E$ index */ \
        .align 64; \
1: \
        JP_EC_DIAG_ACCESS_MEMBAR; \
        ldxa [scr3]ASI_EC_DIAG, scr1; /* get E$ tag */ \
        JP_EC_DIAG_ACCESS_MEMBAR; \
        stxa scr1, [datap + CH_EC_TAG]%asi; \
        add datap, CH_EC_DATA, datap; \
2: \
        ldxa [scr3]ASI_EC_R, %g0; /* ld E$ stging regs */ \
        clr scr1; \
3: /* loop thru 5 regs */ \
        ldxa [scr1]ASI_EC_DATA, scr2; \
        stxa scr2, [datap]%asi; \
        add datap, 8, datap; \
        cmp scr1, CH_ECACHE_STGREG_TOTALSIZE - 8; \
        bne 3b; \
        add scr1, 8, scr1; \
        btst CH_ECACHE_STGREG_SIZE, scr3; /* done? */ \
        beq 2b; \
        add scr3, CH_ECACHE_STGREG_SIZE, scr3

#define GET_ECACHE_DTAGS(afar, datap, scr1, scr2, scr3) \
        GET_ECACHE_DTAG(afar, datap, 0, scr1, scr2, scr3); \
        GET_ECACHE_DTAG(afar, datap, 1, scr1, scr2, scr3); \
        GET_ECACHE_DTAG(afar, datap, 2, scr1, scr2, scr3); \
        GET_ECACHE_DTAG(afar, datap, 3, scr1, scr2, scr3); \
        add datap, (CHD_EC_DATA_SETS-4)*CH_EC_DATA_SIZE, datap; \
        add datap, CH_EC_DATA_SIZE * PN_L2_NWAYS, datap; \

/*
 * Jalapeno does not have cores so these macros are null.
 */
#define PARK_SIBLING_CORE(dcucr_reg, scr1, scr2)
#define UNPARK_SIBLING_CORE(dcucr_reg, scr1, scr2)

#if defined(JALAPENO)
/*
 * Jalapeno gets primary AFSR and AFAR.  All bits in the AFSR except
 * the fatal error bits are cleared.
 *   datap: pointer to cpu logout structure.
 *   afar:  returned primary AFAR value.
 *   scr1:  scratch
 *   scr2:  scratch
 */
#define GET_AFSR_AFAR(datap, afar, scr1, scr2) \
        ldxa [%g0]ASI_AFAR, afar; \
        stxa afar, [datap + (CH_CLO_DATA + CH_CHD_AFAR)]%asi; \
        ldxa [%g0]ASI_AFSR, scr2; \
        stxa scr2, [datap + (CH_CLO_DATA + CH_CHD_AFSR)]%asi; \
        sethi %hh(C_AFSR_FATAL_ERRS), scr1; \
        sllx scr1, 32, scr1; \
        bclr scr1, scr2; /* Clear fatal error bits here, so */ \
        stxa scr2, [%g0]ASI_AFSR; /* they're left as is in AFSR */ \
        membar #Sync
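
/*
 * Note on the sethi/sllx pair above (a reader's note): %hh() extracts
 * bits <63:42> of C_AFSR_FATAL_ERRS, sethi deposits them in bits
 * <31:10> of scr1, and the 32-bit left shift moves them back up to
 * <63:42>, so bclr clears exactly the fatal error bits from the value
 * written back to the AFSR.
 */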

/*
 * Jalapeno has no shadow AFAR, null operation.
 */
#define GET_SHADOW_DATA(afar, datap, scr1, scr2, scr3)

#elif defined(SERRANO)
/*
 * Serrano gets primary AFSR and AFAR.  All bits in the AFSR except
 * the fatal error bits are cleared.  For Serrano, we also save the
 * AFAR2 register.
 *   datap: pointer to cpu logout structure.
 *   afar:  returned primary AFAR value.
 *   scr1:  scratch
 *   scr2:  scratch
 */
#define GET_AFSR_AFAR(datap, afar, scr1, scr2) \
        set ASI_MCU_AFAR2_VA, scr1; \
        ldxa [scr1]ASI_MCU_CTRL, afar; \
        stxa afar, [datap + (CH_CLO_DATA + CH_CHD_AFAR2)]%asi; \
        ldxa [%g0]ASI_AFAR, afar; \
        stxa afar, [datap + (CH_CLO_DATA + CH_CHD_AFAR)]%asi; \
        ldxa [%g0]ASI_AFSR, scr2; \
        stxa scr2, [datap + (CH_CLO_DATA + CH_CHD_AFSR)]%asi; \
        sethi %hh(C_AFSR_FATAL_ERRS), scr1; \
        sllx scr1, 32, scr1; \
        bclr scr1, scr2; /* Clear fatal error bits here, so */ \
        stxa scr2, [%g0]ASI_AFSR; /* they're left as is in AFSR */ \
        membar #Sync

/*
 * Serrano needs to capture E$, D$ and I$ lines associated with afar2.
 *   afar:  scratch, holds afar2.
 *   datap: pointer to cpu logout structure.
 *   scr1:  scratch
 *   scr2:  scratch
 *   scr3:  scratch
 */
#define GET_SHADOW_DATA(afar, datap, scr1, scr2, scr3) \
        ldxa [datap + (CH_CLO_DATA + CH_CHD_AFAR2)]%asi, afar; \
        add datap, CH_CLO_SDW_DATA + CH_CHD_EC_DATA, datap; \
        GET_ECACHE_DTAGS(afar, datap, scr1, scr2, scr3); \
        GET_DCACHE_DTAG(afar, datap, scr1, scr2, scr3); \
        GET_ICACHE_DTAG(afar, datap, scr1, scr2, scr3); \
        sub datap, CH_CPU_LOGOUT_SIZE, datap
#endif /* SERRANO */

#elif defined(CHEETAH_PLUS)
/*
 * Macro version of get_ecache_dtag.  We use this macro in the
 * CPU logout code.
 *   afar:   input AFAR, not modified.
 *   datap:  Ptr to ch_ec_data_t, at end pts just past ch_ec_data_t.
 *   pn_way: ecache way for panther (value = 0-3).  For non-panther
 *           cpus, this macro will be called with pn_way = 0.
 *   scr1:   Scratch.
 *   scr2:   Scratch.
 *   scr3:   Scratch.
 */
#define GET_ECACHE_DTAG(afar, datap, pn_way, scr1, scr2, scr3) \
        mov afar, scr3; \
        andn scr3, (CH_ECACHE_SUBBLK_SIZE - 1), scr3; /* VA<5:0>=0 */ \
        set (CH_ECACHE_8M_SIZE - 1), scr2; \
        and scr3, scr2, scr3; /* VA<63:23>=0 */ \
        mov pn_way, scr1; /* panther L3$ is 4-way so we ... */ \
        sllx scr1, PN_L3_WAY_SHIFT, scr1; /* need to mask... */ \
        or scr3, scr1, scr3; /* in the way bits <24:23>. */ \
        b 1f; \
        stxa scr3, [datap + CH_EC_IDX]%asi; /* store E$ index */ \
        .align 64; \
1: \
        ldxa [scr3]ASI_EC_DIAG, scr1; /* get E$ tag */ \
        stxa scr1, [datap + CH_EC_TAG]%asi; \
        set CHP_ECACHE_IDX_TAG_ECC, scr1; \
        or scr3, scr1, scr1; \
        ldxa [scr1]ASI_EC_DIAG, scr1; /* get E$ tag ECC */ \
        stxa scr1, [datap + CH_EC_TAG_ECC]%asi; \
        add datap, CH_EC_DATA, datap; \
2: \
        ldxa [scr3]ASI_EC_R, %g0; /* ld E$ stging regs */ \
        clr scr1; \
3: /* loop thru 5 regs */ \
        ldxa [scr1]ASI_EC_DATA, scr2; \
        stxa scr2, [datap]%asi; \
        add datap, 8, datap; \
        cmp scr1, CH_ECACHE_STGREG_TOTALSIZE - 8; \
        bne 3b; \
        add scr1, 8, scr1; \
        btst CH_ECACHE_STGREG_SIZE, scr3; /* done? */ \
        beq 2b; \
        add scr3, CH_ECACHE_STGREG_SIZE, scr3

/*
 * If this is a panther, we need to make sure the sibling core is
 * parked so that we avoid any race conditions during diagnostic
 * accesses to the shared L2 and L3 caches.
 *   dcucr_reg: This register will be used to keep track of whether
 *              or not we need to unpark the core later.
 *              It just so happens that we also use this same register
 *              to keep track of our saved DCUCR value so we only touch
 *              bit 4 of the register (which is a "reserved" bit in the
 *              DCUCR) for keeping track of core parking.
 *   scr1:      Scratch register.
 *   scr2:      Scratch register.
 */
#define PARK_SIBLING_CORE(dcucr_reg, scr1, scr2) \
        GET_CPU_IMPL(scr1); \
        cmp scr1, PANTHER_IMPL; /* only park for panthers */ \
        bne,a %xcc, 2f; \
        andn dcucr_reg, PN_PARKED_OTHER_CORE, dcucr_reg; \
        set ASI_CORE_RUNNING_STATUS, scr1; /* check other core */ \
        ldxa [scr1]ASI_CMP_SHARED, scr2; /* is it running? */ \
        cmp scr2, PN_BOTH_CORES_RUNNING; \
        bne,a %xcc, 2f; /* if not running, we are done */ \
        andn dcucr_reg, PN_PARKED_OTHER_CORE, dcucr_reg; \
        or dcucr_reg, PN_PARKED_OTHER_CORE, dcucr_reg; \
        set ASI_CORE_ID, scr1; \
        ldxa [scr1]ASI_CMP_PER_CORE, scr2; \
        and scr2, COREID_MASK, scr2; \
        or %g0, 1, scr1; /* find out which core... */ \
        sll scr1, scr2, scr2; /* ... we need to park... */ \
1: \
        set ASI_CORE_RUNNING_RW, scr1; \
        stxa scr2, [scr1]ASI_CMP_SHARED; /* ... and park it. */ \
        membar #Sync; \
        set ASI_CORE_RUNNING_STATUS, scr1; /* spin until... */ \
        ldxa [scr1]ASI_CMP_SHARED, scr1; /* ... the other... */ \
        cmp scr1, scr2; /* ...core is parked according to... */ \
        bne,a %xcc, 1b; /* ...the core running status reg. */ \
        nop; \
2:

/*
 * The core running this code will unpark its sibling core if the
 * sibling core had been parked by the current core earlier in this
 * trap handler.
 *   dcucr_reg: This register is used to keep track of whether or not
 *              we need to unpark our sibling core.
 *              It just so happens that we also use this same register
 *              to keep track of our saved DCUCR value so we only touch
 *              bit 4 of the register (which is a "reserved" bit in the
 *              DCUCR) for keeping track of core parking.
 *   scr1:      Scratch register.
 *   scr2:      Scratch register.
 */
#define UNPARK_SIBLING_CORE(dcucr_reg, scr1, scr2) \
        btst PN_PARKED_OTHER_CORE, dcucr_reg; \
        bz,pt %xcc, 1f; /* if nothing to unpark, we are done */ \
        andn dcucr_reg, PN_PARKED_OTHER_CORE, dcucr_reg; \
        set ASI_CORE_RUNNING_RW, scr1; \
        set PN_BOTH_CORES_RUNNING, scr2; /* we want both... */ \
        stxa scr2, [scr1]ASI_CMP_SHARED; /* ...cores running. */ \
        membar #Sync; \
1:
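
/*
 * Illustrative pairing of the two macros above (register choices are
 * an assumption): diagnostic code brackets its shared-cache accesses
 * like this, passing the same dcucr_reg to both macros.
 *
 *      PARK_SIBLING_CORE(%g3, %g4, %g5)
 *      ! ... diagnostic ASI accesses to the shared L2/L3 ...
 *      UNPARK_SIBLING_CORE(%g3, %g4, %g5)
 */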

/*
 * Cheetah+ and Jaguar get both primary and secondary AFSR/AFAR.  All bits
 * in the primary AFSR are cleared except the fatal error bits.  For Panther,
 * we also have to read and clear the AFSR_EXT, again leaving the fatal
 * error bits alone.
 *   datap: pointer to cpu logout structure.
 *   afar:  returned primary AFAR value.
 *   scr1:  scratch
 *   scr2:  scratch
 */
#define GET_AFSR_AFAR(datap, afar, scr1, scr2) \
        set ASI_SHADOW_REG_VA, scr1; \
        ldxa [scr1]ASI_AFAR, scr2; \
        stxa scr2, [datap + (CH_CLO_SDW_DATA + CH_CHD_AFAR)]%asi; \
        ldxa [scr1]ASI_AFSR, scr2; \
        stxa scr2, [datap + (CH_CLO_SDW_DATA + CH_CHD_AFSR)]%asi; \
        ldxa [%g0]ASI_AFAR, afar; \
        stxa afar, [datap + (CH_CLO_DATA + CH_CHD_AFAR)]%asi; \
        ldxa [%g0]ASI_AFSR, scr2; \
        stxa scr2, [datap + (CH_CLO_DATA + CH_CHD_AFSR)]%asi; \
        sethi %hh(C_AFSR_FATAL_ERRS), scr1; \
        sllx scr1, 32, scr1; \
        bclr scr1, scr2; /* Clear fatal error bits here, so */ \
        stxa scr2, [%g0]ASI_AFSR; /* they're left as is in AFSR */ \
        membar #Sync; \
        GET_CPU_IMPL(scr1); \
        cmp scr1, PANTHER_IMPL; \
        bne %xcc, 1f; \
        nop; \
        set ASI_SHADOW_AFSR_EXT_VA, scr1; /* shadow AFSR_EXT */ \
        ldxa [scr1]ASI_AFSR, scr2; \
        stxa scr2, [datap + (CH_CLO_SDW_DATA + CH_CHD_AFSR_EXT)]%asi; \
        set ASI_AFSR_EXT_VA, scr1; /* primary AFSR_EXT */ \
        ldxa [scr1]ASI_AFSR, scr2; \
        stxa scr2, [datap + (CH_CLO_DATA + CH_CHD_AFSR_EXT)]%asi; \
        set C_AFSR_EXT_FATAL_ERRS, scr1; \
        bclr scr1, scr2; /* Clear fatal error bits here, */ \
        set ASI_AFSR_EXT_VA, scr1; /* so they're left */ \
        stxa scr2, [scr1]ASI_AFSR; /* as is in AFSR_EXT */ \
        membar #Sync; \
1:

/*
 * This macro is used in the CPU logout code to capture diagnostic
 * information from the L2 cache on panther processors.
 *   afar:  input AFAR, not modified.
 *   datap: Ptr to pn_l2_data_t, at end pts just past pn_l2_data_t.
 *   scr1:  Scratch.
 *   scr2:  Scratch.
 *   scr3:  Scratch.
 */
#define GET_PN_L2_CACHE_DTAGS(afar, datap, scr1, scr2, scr3) \
        mov afar, scr3; \
        set PN_L2_INDEX_MASK, scr1; \
        and scr3, scr1, scr3; \
        b 1f; /* code to read tags and data should be ... */ \
        nop; /* ...on the same cache line if possible. */ \
        .align 128; /* update this line if you add lines below. */ \
1: \
        stxa scr3, [datap + CH_EC_IDX]%asi; /* store L2$ index */ \
        ldxa [scr3]ASI_L2_TAG, scr1; /* read the L2$ tag */ \
        stxa scr1, [datap + CH_EC_TAG]%asi; \
        add datap, CH_EC_DATA, datap; \
        clr scr1; \
2: \
        ldxa [scr3 + scr1]ASI_L2_DATA, scr2; /* loop through */ \
        stxa scr2, [datap]%asi; /* <511:256> of L2 */ \
        add datap, 8, datap; /* data and record */ \
        cmp scr1, (PN_L2_LINESIZE / 2) - 8; /* it in the cpu */ \
        bne 2b; /* logout struct. */ \
        add scr1, 8, scr1; \
        set PN_L2_DATA_ECC_SEL, scr2; /* ECC_sel bit. */ \
        ldxa [scr3 + scr2]ASI_L2_DATA, scr2; /* Read and record */ \
        stxa scr2, [datap]%asi; /* ecc of <511:256> */ \
        add datap, 8, datap; \
3: \
        ldxa [scr3 + scr1]ASI_L2_DATA, scr2; /* loop through */ \
        stxa scr2, [datap]%asi; /* <255:0> of L2 */ \
        add datap, 8, datap; /* data and record */ \
        cmp scr1, PN_L2_LINESIZE - 8; /* it in the cpu */ \
        bne 3b; /* logout struct. */ \
        add scr1, 8, scr1; \
        set PN_L2_DATA_ECC_SEL, scr2; /* ECC_sel bit. */ \
        add scr2, PN_L2_ECC_LO_REG, scr2; \
        ldxa [scr3 + scr2]ASI_L2_DATA, scr2; /* Read and record */ \
        stxa scr2, [datap]%asi; /* ecc of <255:0>. */ \
        add datap, 8, datap; /* Advance pointer */ \
        set PN_L2_SET_SIZE, scr2; \
        set PN_L2_MAX_SET, scr1; \
        cmp scr1, scr3; /* more ways to try for this line? */ \
        bg,a %xcc, 1b; /* if so, start over with next way */ \
        add scr3, scr2, scr3

/*
 * Cheetah+ assumes E$ is 2-way and grabs both E$ lines associated with afar.
 *   afar:  AFAR from access.
 *   datap: pointer to cpu logout structure.
 *   scr1:  scratch
 *   scr2:  scratch
 *   scr3:  scratch
 */
#define GET_ECACHE_DTAGS(afar, datap, scr1, scr2, scr3) \
        GET_CPU_IMPL(scr1); \
        cmp scr1, PANTHER_IMPL; \
        bne %xcc, 4f; \
        nop; \
        GET_ECACHE_DTAG(afar, datap, 0, scr1, scr2, scr3); \
        GET_ECACHE_DTAG(afar, datap, 1, scr1, scr2, scr3); \
        GET_ECACHE_DTAG(afar, datap, 2, scr1, scr2, scr3); \
        GET_ECACHE_DTAG(afar, datap, 3, scr1, scr2, scr3); \
        add datap, (CHD_EC_DATA_SETS-4)*CH_EC_DATA_SIZE, datap; \
        GET_PN_L2_CACHE_DTAGS(afar, datap, scr1, scr2, scr3); \
        b 5f; \
        nop; \
4: \
        GET_ECACHE_DTAG(afar, datap, 0, scr1, scr2, scr3); \
        GET_ECACHE_WAY_BIT(scr1, scr2); \
        xor afar, scr1, afar; \
        GET_ECACHE_DTAG(afar, datap, 0, scr1, scr2, scr3); \
        GET_ECACHE_WAY_BIT(scr1, scr2); /* restore AFAR */ \
        xor afar, scr1, afar; \
        add datap, (CHD_EC_DATA_SETS-2)*CH_EC_DATA_SIZE, datap; \
        add datap, CH_EC_DATA_SIZE * PN_L2_NWAYS, datap; \
5:

/*
 * Cheetah+ needs to capture E$, D$ and I$ lines associated with the
 * shadow afar.
 *   afar:  scratch, holds shadow afar.
 *   datap: pointer to cpu logout structure.
 *   scr1:  scratch
 *   scr2:  scratch
 *   scr3:  scratch
 */
#define GET_SHADOW_DATA(afar, datap, scr1, scr2, scr3) \
        ldxa [datap + (CH_CLO_SDW_DATA + CH_CHD_AFAR)]%asi, afar; \
        add datap, CH_CLO_SDW_DATA + CH_CHD_EC_DATA, datap; \
        GET_ECACHE_DTAGS(afar, datap, scr1, scr2, scr3); \
        GET_DCACHE_DTAG(afar, datap, scr1, scr2, scr3); \
        GET_ICACHE_DTAG(afar, datap, scr1, scr2, scr3); \
        sub datap, CH_CPU_LOGOUT_SIZE, datap

/*
 * Compute the "Way" bit for the 2-way Ecache of Cheetah+.
 */
#define GET_ECACHE_WAY_BIT(scr1, scr2) \
        CPU_INDEX(scr1, scr2); \
        mulx scr1, CPU_NODE_SIZE, scr1; \
        add scr1, ECACHE_SIZE, scr1; \
        set cpunodes, scr2; \
        ld [scr1 + scr2], scr1; \
        srlx scr1, 1, scr1
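
/*
 * A reader's note on the macro above: for a 2-way E$, the way-select
 * bit is half the total size, hence the load of this CPU's ecache
 * size from cpunodes followed by the shift right by one.
 * GET_ECACHE_DTAGS above XORs this bit into the AFAR to capture the
 * line in the other way, then XORs it again to restore the original
 * AFAR.
 */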

#else /* CHEETAH_PLUS */
/*
 * Macro version of get_ecache_dtag.  We use this macro in the
 * CPU logout code.
 *   afar:  input AFAR, not modified.
 *   datap: Ptr to ch_ec_data_t, at end pts just past ch_ec_data_t.
 *   scr1:  Scratch.
 *   scr2:  Scratch.
 *   scr3:  Scratch.
 */
#define GET_ECACHE_DTAG(afar, datap, scr1, scr2, scr3) \
        mov afar, scr3; \
        andn scr3, (CH_ECACHE_SUBBLK_SIZE - 1), scr3; /* VA<5:0>=0 */ \
        set (CH_ECACHE_8M_SIZE - 1), scr2; \
        and scr3, scr2, scr3; /* VA<63:23>=0 */ \
        b 1f; \
        stxa scr3, [datap + CH_EC_IDX]%asi; /* store E$ index */ \
        .align 64; \
1: \
        ldxa [scr3]ASI_EC_DIAG, scr1; /* get E$ tag */ \
        stxa scr1, [datap + CH_EC_TAG]%asi; \
        add datap, CH_EC_DATA, datap; \
2: \
        ldxa [scr3]ASI_EC_R, %g0; /* ld E$ stging regs */ \
        clr scr1; \
3: /* loop thru 5 regs */ \
        ldxa [scr1]ASI_EC_DATA, scr2; \
        stxa scr2, [datap]%asi; \
        add datap, 8, datap; \
        cmp scr1, CH_ECACHE_STGREG_TOTALSIZE - 8; \
        bne 3b; \
        add scr1, 8, scr1; \
        btst CH_ECACHE_STGREG_SIZE, scr3; /* done? */ \
        beq 2b; \
        add scr3, CH_ECACHE_STGREG_SIZE, scr3

/*
 * Cheetah does not have cores so these macros are null.
 */
#define PARK_SIBLING_CORE(dcucr_reg, scr1, scr2)
#define UNPARK_SIBLING_CORE(dcucr_reg, scr1, scr2)

/*
 * Cheetah gets primary AFSR and AFAR and clears the AFSR, except for the
 * fatal error bits.
 *   datap: pointer to cpu logout structure.
 *   afar:  returned primary AFAR value.
 *   scr1:  scratch
 *   scr2:  scratch
 */
#define GET_AFSR_AFAR(datap, afar, scr1, scr2) \
        ldxa [%g0]ASI_AFAR, afar; \
        stxa afar, [datap + (CH_CLO_DATA + CH_CHD_AFAR)]%asi; \
        ldxa [%g0]ASI_AFSR, scr2; \
        stxa scr2, [datap + (CH_CLO_DATA + CH_CHD_AFSR)]%asi; \
        sethi %hh(C_AFSR_FATAL_ERRS), scr1; \
        sllx scr1, 32, scr1; \
        bclr scr1, scr2; /* Clear fatal error bits here, so */ \
        stxa scr2, [%g0]ASI_AFSR; /* they're left as is in AFSR */ \
        membar #Sync

/*
 * Cheetah's E$ is direct-mapped, so we grab the line data and skip the
 * second line.
 *   afar:  AFAR from access.
 *   datap: pointer to cpu logout structure.
 *   scr1:  scratch
 *   scr2:  scratch
 *   scr3:  scratch
 */
#define GET_ECACHE_DTAGS(afar, datap, scr1, scr2, scr3) \
        GET_ECACHE_DTAG(afar, datap, scr1, scr2, scr3); \
        add datap, (CHD_EC_DATA_SETS-1)*CH_EC_DATA_SIZE, datap; \
        add datap, CH_EC_DATA_SIZE * PN_L2_NWAYS, datap; \

/*
 * Cheetah has no shadow AFAR, null operation.
 */
#define GET_SHADOW_DATA(afar, datap, scr1, scr2, scr3)

#endif /* CHEETAH_PLUS */

/*
 * Cheetah/(Cheetah+ Jaguar Panther)/Jalapeno Macro for capturing CPU
 * logout data at TL>0.  r_val is a register that returns the "failure count"
 * to the caller, and may be used as a scratch register until the end of
 * the macro.  afar is used to return the primary AFAR value to the caller
 * and it too can be used as a scratch register until the end.  r_or_s is
 * a reg or symbol that has the offset within the "cpu_private" data area
 * to deposit the logout data.  t_flags is a register that has the
 * trap-type/trap-level/CEEN info.  This t_flags register may be used after
 * the GET_AFSR_AFAR macro.
 *
 * The CPU logout operation will fail (r_val > 0) if the logout
 * structure in question is already being used.  Otherwise, the CPU
 * logout operation will succeed (r_val = 0).  For failures, r_val
 * returns the busy count (# of times we tried using this CPU logout
 * structure when it was busy).
 *
 * Register usage:
 *   %asi:    Must be set to either ASI_MEM if the address in datap
 *            is a physical address or to ASI_N if the address in
 *            datap is a virtual address.
 *   r_val:   This register is the return value which tells the
 *            caller whether or not the LOGOUT operation was successful.
 *            For failures, r_val returns the fail count (i.e. the number
 *            of times we have tried to use this logout structure when it
 *            was already being used).
 *   afar:    output: contains AFAR on exit.
 *   t_flags: input trap type info, may be used as scratch after it is
 *            stored to the cpu logout structure.
 *   datap:   Points to the logout data area.
 *   scr1:    Scratch
 *   scr2:    Scratch (may be r_val)
 *   scr3:    Scratch (may be t_flags)
 */
#define DO_TL1_CPU_LOGOUT(r_val, afar, t_flags, datap, scr1, scr2, scr3) \
        setx LOGOUT_INVALID, scr2, scr1; \
        ldxa [datap + (CH_CLO_DATA + CH_CHD_AFAR)]%asi, scr2; \
        cmp scr2, scr1; \
        bne 8f; \
        nop; \
        stxa t_flags, [datap + CH_CLO_FLAGS]%asi; \
        GET_AFSR_AFAR(datap, afar, scr1, scr2); \
        add datap, CH_CLO_DATA + CH_CHD_EC_DATA, datap; \
        GET_ECACHE_DTAGS(afar, datap, scr1, scr2, scr3); \
        GET_DCACHE_DTAG(afar, datap, scr1, scr2, scr3); \
        GET_ICACHE_DTAG(afar, datap, scr1, scr2, scr3); \
        sub datap, CH_CLO_DATA + CH_DIAG_DATA_SIZE, datap; \
        GET_SHADOW_DATA(afar, datap, scr1, scr2, scr3); \
        ldxa [datap + (CH_CLO_DATA + CH_CHD_AFAR)]%asi, afar; \
        set 0, r_val; /* return value for success */ \
        ba 9f; \
        nop; \
8: \
        ldxa [%g0]ASI_AFAR, afar; \
        ldxa [datap + CH_CLO_NEST_CNT]%asi, r_val; \
        inc r_val; /* return value for failure */ \
        stxa r_val, [datap + CH_CLO_NEST_CNT]%asi; \
        membar #Sync; \
9:

/*
 * Cheetah/(Cheetah+ Jaguar Panther)/Jalapeno Macro for capturing CPU
 * logout data.  Uses the DO_TL1_CPU_LOGOUT macro defined above, sets
 * up the expected data pointer in the scr1 register, and sets the %asi
 * register to ASI_N for kernel virtual addresses instead of ASI_MEM as
 * is used at TL>0.
 *
 * The CPU logout operation will fail (r_val > 0) if the logout
 * structure in question is already being used.  Otherwise, the CPU
 * logout operation will succeed (r_val = 0).  For failures, r_val
 * returns the busy count (# of times we tried using this CPU logout
 * structure when it was busy).
 *
 * Register usage:
 *   r_val:   This register is the return value which tells the
 *            caller whether or not the LOGOUT operation was successful.
 *            For failures, r_val returns the fail count (i.e. the number
 *            of times we have tried to use this logout structure when it
 *            was already being used).
 *   afar:    returns AFAR, used internally as the afar value.
 *            output: if the cpu_private struct has not been initialized,
 *            then we return the t_flags value listed below.
 *   r_or_s:  input offset, either register or constant (symbol).  It's
 *            OK for r_or_s to be a register as long as it's not scr1 or
 *            scr3.
 *   t_flags: input trap type info, may be used as scratch after it is
 *            stored to the cpu logout structure.
 *   scr1:    Scratch, points to the logout data area.
 *   scr2:    Scratch (may be r_or_s)
 *   scr3:    Scratch (may be r_val)
 *   scr4:    Scratch (may be t_flags)
 */
#define DO_CPU_LOGOUT(r_val, afar, r_or_s, t_flags, scr1, scr2, scr3, scr4) \
        GET_CPU_PRIVATE_PTR(r_or_s, scr1, scr3, 7f); /* can't use scr2/4 */ \
        wr %g0, ASI_N, %asi; \
        DO_TL1_CPU_LOGOUT(r_val, afar, t_flags, scr1, scr2, scr3, scr4) \
        ba 6f; \
        nop; \
7: \
        mov t_flags, afar; /* depends on afar = %g2 */ \
        set 0, r_val; /* success in this case. */ \
6:

/*
 * The P$ is flushed as a side effect of writing to the Primary
 * or Secondary Context Register.  After writing to a context
 * register, every line of the P$ in the Valid state is invalidated,
 * regardless of which context it belongs to.
 * This routine simply touches the Primary context register by
 * reading the current value and writing it back.  The Primary
 * context is not changed.
 */
#define PCACHE_FLUSHALL(tmp1, tmp2, tmp3) \
        sethi %hi(FLUSH_ADDR), tmp1 ;\
        set MMU_PCONTEXT, tmp2 ;\
        ldxa [tmp2]ASI_DMMU, tmp3 ;\
        stxa tmp3, [tmp2]ASI_DMMU ;\
        flush tmp1 /* See Cheetah PRM 8.10.2 */
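
/*
 * A minimal usage sketch (register choices are arbitrary; all three
 * are clobbered):
 *
 *      PCACHE_FLUSHALL(%o0, %o1, %o2)
 */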

/*
 * Macro that flushes the entire Dcache.
 *
 *   arg1 = dcache size
 *   arg2 = dcache linesize
 */
#define CH_DCACHE_FLUSHALL(arg1, arg2, tmp1) \
        sub arg1, arg2, tmp1; \
1: \
        stxa %g0, [tmp1]ASI_DC_TAG; \
        membar #Sync; \
        cmp %g0, tmp1; \
        bne,pt %icc, 1b; \
        sub tmp1, arg2, tmp1;
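
/*
 * A minimal usage sketch (the 64KB size and 32-byte linesize are
 * assumptions for the example; real callers would obtain the actual
 * cache geometry at run time):
 *
 *      set (64 * 1024), %o0            ! dcache size
 *      mov 32, %o1                     ! dcache linesize
 *      CH_DCACHE_FLUSHALL(%o0, %o1, %o2)
 */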

/*
 * Macro that flushes the entire Icache.
 *
 * Note that we cannot access ASI 0x67 (ASI_IC_TAG) with the Icache on,
 * because accesses to ASI 0x67 interfere with Icache coherency.  We
 * must make sure the Icache is off, then turn it back on after the entire
 * cache has been invalidated.  If the Icache is originally off, we'll just
 * clear the tags but not turn the Icache on.
 *
 *   arg1 = icache size
 *   arg2 = icache linesize
 */
#define CH_ICACHE_FLUSHALL(arg1, arg2, tmp1, tmp2) \
        ldxa [%g0]ASI_DCU, tmp2; \
        andn tmp2, DCU_IC, tmp1; \
        stxa tmp1, [%g0]ASI_DCU; \
        flush %g0; /* flush required after changing the IC bit */ \
        sllx arg2, 1, arg2; /* arg2 = linesize * 2 */ \
        sllx arg1, 1, arg1; /* arg1 = size * 2 */ \
        sub arg1, arg2, arg1; \
        or arg1, CH_ICTAG_LOWER, arg1; /* "write" tag */ \
1: \
        stxa %g0, [arg1]ASI_IC_TAG; \
        membar #Sync; /* Cheetah PRM 8.9.3 */ \
        cmp arg1, CH_ICTAG_LOWER; \
        bne,pt %icc, 1b; \
        sub arg1, arg2, arg1; \
        stxa tmp2, [%g0]ASI_DCU; \
        flush %g0; /* flush required after changing the IC bit */


#if defined(JALAPENO) || defined(SERRANO)

/*
 * ASI access to the L2 tag or L2 flush can hang the cpu when interacting
 * with combinations of L2 snoops, victims and stores.
 *
 * A possible workaround is to surround each L2 ASI access with membars
 * and make sure that the code is hitting in the Icache.  This requires
 * aligning the code sequence on an E$ boundary and forcing I$ fetches by
 * jumping to selected offsets, so that we don't take any I$ misses
 * during ASI access to the L2 tag or L2 flush.  It also requires
 * making sure that we don't take any interrupts or traps (such as a
 * fast ECC trap or an I$/D$ tag parity error) which can result in the
 * eviction of this code sequence from the I$, thus causing a miss.
 *
 * Because of the complexity/risk, we have decided to do a partial fix
 * of adding a membar around each ASI access to the L2 tag or L2 flush.
 */

#define JP_EC_DIAG_ACCESS_MEMBAR \
        membar #Sync

/*
 * Jalapeno version of the macro that flushes the entire Ecache.
 *
 * Uses the Jalapeno displacement flush feature of ASI_EC_DIAG.
 *
 *   arg1 = ecache size
 *   arg2 = ecache linesize - not modified; can be an immediate constant.
 */
#define ECACHE_FLUSHALL(arg1, arg2, tmp1, tmp2) \
        CPU_INDEX(tmp1, tmp2); \
        set JP_ECACHE_IDX_DISP_FLUSH, tmp2; \
        sllx tmp1, JP_ECFLUSH_PORTID_SHIFT, tmp1; \
        or tmp1, tmp2, tmp1; \
        srlx arg1, JP_EC_TO_SET_SIZE_SHIFT, tmp2; \
1: \
        subcc tmp2, arg2, tmp2; \
        JP_EC_DIAG_ACCESS_MEMBAR; \
        ldxa [tmp1 + tmp2]ASI_EC_DIAG, %g0; \
        JP_EC_DIAG_ACCESS_MEMBAR; \
        bg,pt %xcc, 1b; \
        nop; \
        mov 1, tmp2; \
        sllx tmp2, JP_ECFLUSH_EC_WAY_SHIFT, tmp2; \
        add tmp1, tmp2, tmp1; \
        mov (JP_ECACHE_NWAY-1), tmp2; \
        sllx tmp2, JP_ECFLUSH_EC_WAY_SHIFT, tmp2; \
        andcc tmp1, tmp2, tmp2; \
        bnz,pt %xcc, 1b; \
        srlx arg1, JP_EC_TO_SET_SIZE_SHIFT, tmp2

#else /* JALAPENO || SERRANO */

/*
 * Cheetah version of the macro that flushes the entire Ecache.
 *
 * Need to displacement flush 2x the ecache size from the Ecache flush
 * area.
 *
 *   arg1 = ecache size
 *   arg2 = ecache linesize
 *   arg3 = ecache flush address - for cheetah only
 */
#define CH_ECACHE_FLUSHALL(arg1, arg2, arg3) \
        sllx arg1, 1, arg1; \
1: \
        subcc arg1, arg2, arg1; \
        bg,pt %xcc, 1b; \
        ldxa [arg1 + arg3]ASI_MEM, %g0;

/*
 * Cheetah+ version of the macro that flushes the entire Ecache.
 *
 * Uses the displacement flush feature.
 *
 *   arg1 = ecache size
 *   arg2 = ecache linesize
 *   impl = CPU implementation as returned from GET_CPU_IMPL()
 *          The value in this register is destroyed during execution
 *          of the macro.
 */
#if defined(CHEETAH_PLUS)
#define CHP_ECACHE_FLUSHALL(arg1, arg2, impl) \
        cmp impl, PANTHER_IMPL; \
        bne %xcc, 1f; \
        nop; \
        set PN_L3_IDX_DISP_FLUSH, impl; \
        b 2f; \
        nop; \
1: \
        set CHP_ECACHE_IDX_DISP_FLUSH, impl; \
2: \
        subcc arg1, arg2, arg1; \
        bg,pt %xcc, 2b; \
        ldxa [arg1 + impl]ASI_EC_DIAG, %g0;
#else /* CHEETAH_PLUS */
#define CHP_ECACHE_FLUSHALL(arg1, arg2, impl)
#endif /* CHEETAH_PLUS */

/*
 * Macro that flushes the entire Ecache.
 *
 *   arg1 = ecache size
 *   arg2 = ecache linesize
 *   arg3 = ecache flush address - for cheetah only
 */
#define ECACHE_FLUSHALL(arg1, arg2, arg3, tmp1) \
        GET_CPU_IMPL(tmp1); \
        cmp tmp1, CHEETAH_IMPL; \
        bne %xcc, 2f; \
        nop; \
        CH_ECACHE_FLUSHALL(arg1, arg2, arg3); \
        ba 3f; \
        nop; \
2: \
        CHP_ECACHE_FLUSHALL(arg1, arg2, tmp1); \
3:

#endif /* JALAPENO || SERRANO */

/*
 * Macro that flushes the Panther L2 cache.
 */
#if defined(CHEETAH_PLUS)
#define PN_L2_FLUSHALL(scr1, scr2, scr3) \
        GET_CPU_IMPL(scr3); \
        cmp scr3, PANTHER_IMPL; \
        bne %xcc, 2f; \
        nop; \
        set PN_L2_SIZE, scr1; \
        set PN_L2_LINESIZE, scr2; \
        set PN_L2_IDX_DISP_FLUSH, scr3; \
1: \
        subcc scr1, scr2, scr1; \
        bg,pt %xcc, 1b; \
        ldxa [scr1 + scr3]ASI_L2_TAG, %g0; \
2:
#else /* CHEETAH_PLUS */
#define PN_L2_FLUSHALL(scr1, scr2, scr3)
#endif /* CHEETAH_PLUS */

/*
 * Given a VA and page size (page size as encoded in ASI_MMU_TAG_ACCESS_EXT),
 * this macro returns the TLB index for that mapping based on a 512 entry
 * (2-way set associative) TLB.  Aside from the 16 entry fully associative
 * TLBs, all TLBs in Panther are 512 entry, 2-way set associative.
 *
 * To find the index, we shift the VA right by 13 + (3 * pg_sz) and then
 * mask out all but the lower 8 bits because:
 *
 *   ASI_[D|I]MMU_TAG_ACCESS_EXT.PgSz = 0 for 8K
 *   ASI_[D|I]MMU_TAG_ACCESS_EXT.PgSz = 1 for 64K
 *   ASI_[D|I]MMU_TAG_ACCESS_EXT.PgSz = 2 for 512K
 *   ASI_[D|I]MMU_TAG_ACCESS_EXT.PgSz = 3 for 4M
 *   ASI_[D|I]MMU_TAG_ACCESS_EXT.PgSz = 4 for 32M
 *   ASI_[D|I]MMU_TAG_ACCESS_EXT.PgSz = 5 for 256M
 *
 * and
 *
 *   array index for 8K pages   = VA[20:13]
 *   array index for 64K pages  = VA[23:16]
 *   array index for 512K pages = VA[26:19]
 *   array index for 4M pages   = VA[29:22]
 *   array index for 32M pages  = VA[32:25]
 *   array index for 256M pages = VA[35:28]
 *
 * Inputs:
 *
 *   va    - Register.
 *           Input:  Virtual address in which we are interested.
 *           Output: TLB index value.
 *   pg_sz - Register.  Page Size of the TLB in question as encoded
 *           in the ASI_[D|I]MMU_TAG_ACCESS_EXT register.
 */
#if defined(CHEETAH_PLUS)
#define PN_GET_TLB_INDEX(va, pg_sz) \
        srlx va, 13, va; /* first shift by 13 bits and then */ \
        srlx va, pg_sz, va; /* shift by pg_sz three times. */ \
        srlx va, pg_sz, va; \
        srlx va, pg_sz, va; \
        and va, 0xff, va; /* mask out all but the lower 8 bits */
#endif /* CHEETAH_PLUS */
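
/*
 * Worked example: for a 64K page, pg_sz = 1, so the macro shifts va
 * right by 13 + (3 * 1) = 16 bits and masks with 0xff, leaving
 * VA[23:16], exactly the 64K row in the table above.
 */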

/*
 * The following macros are for error traps at TL>0.
 * The issue with error traps at TL>0 is that there are no safely
 * available global registers.  So we use the trick of generating a
 * software trap, then using the %tpc, %tnpc and %tstate registers to
 * temporarily save the values of %g1 and %g2.
 */

/*
 * Macro to generate an 8-instruction trap table entry for TL>0 trap handlers.
 * Does the following steps:
 *   1. membar #Sync - required for USIII family errors.
 *   2. Specified software trap.
 * NB: Must be 8 instructions or less to fit in the trap table and the code
 * must be relocatable.
 */
#define CH_ERR_TL1_TRAPENTRY(trapno) \
        membar #Sync; \
        ta trapno; \
        nop; nop; nop; nop; nop; nop

/*
 * Macro to generate an 8-instruction trap table entry for a TL>0 software
 * trap.  We save the values of %g1 and %g2 in %tpc, %tnpc and %tstate (since
 * the low-order two bits of %tpc/%tnpc are reserved and read as zero,
 * we need to put the low-order two bits of %g1 and %g2 in %tstate).
 * Note that %tstate has a reserved hole from bits 3-7, so we put the
 * low-order two bits of %g1 in bits 0-1 and the low-order two bits of
 * %g2 in bits 10-11 (ensuring bits 8-9 are zero for use by the D$/I$
 * state bits).  Note that we must do a jmp instruction, since this
 * is moved into the trap table entry.
 * NB: Must be 8 instructions or less to fit in the trap table and the code
 * must be relocatable.
 */
#define CH_ERR_TL1_SWTRAPENTRY(label) \
        wrpr %g1, %tpc; \
        and %g1, 3, %g1; \
        wrpr %g2, %tnpc; \
        sllx %g2, CH_ERR_G2_TO_TSTATE_SHFT, %g2; \
        or %g1, %g2, %g2; \
        sethi %hi(label), %g1; \
        jmp %g1+%lo(label); \
        wrpr %g2, %tstate

/*
 * Macro to get a ptr to ch_err_tl1_data.
 * reg1 will either point to a physaddr with ASI_MEM in %asi OR it
 * will point to a kernel nucleus virtual address with ASI_N in %asi.
 * This allows us to:
 *   1. Avoid getting MMU misses.  We may have gotten the original
 *      Fast ECC error in an MMU handler and if we get an MMU trap
 *      in the TL>0 handlers, we'll scribble on the MMU regs.
 *   2. Use the same code in the TL>0 handlers whether we're
 *      accessing kernel nucleus virtual addresses or physical
 *      addresses.
 * pseudo-code:
 *      reg1 <- ch_err_tl1_paddrs[CPUID];
 *      if (reg1 == NULL) {
 *              reg1 <- &ch_err_tl1_data
 *              %asi <- ASI_N
 *      } else {
 *              reg1 <- reg1 + offset +
 *                  sizeof (ch_err_tl1_data) * (%tl - 3)
 *              %asi <- ASI_MEM
 *      }
 */
#define GET_CH_ERR_TL1_PTR(reg1, reg2, offset) \
        CPU_INDEX(reg1, reg2); \
        sllx reg1, 3, reg1; \
        set ch_err_tl1_paddrs, reg2; \
        ldx [reg1+reg2], reg1; \
        brnz reg1, 1f; \
        add reg1, offset, reg1; \
        set ch_err_tl1_data, reg1; \
        ba 2f; \
        wr %g0, ASI_N, %asi; \
1:      rdpr %tl, reg2; \
        sub reg2, 3, reg2; \
        mulx reg2, CH_ERR_TL1_DATA_SIZE, reg2; \
        add reg1, reg2, reg1; \
        wr %g0, ASI_MEM, %asi; \
2:

/*
 * Macro to generate entry code for TL>0 error handlers.
 * At the end of this macro, %g1 will point to the ch_err_tl1_data
 * structure, %g2 will have the original flags from the ch_err_tl1_data
 * structure, and %g5 will have the value of %tstate, in which the Fast
 * ECC routines save the state of the D$ (bit CH_ERR_TSTATE_DC_ON).
 * All %g registers except for %g1, %g2 and %g5 will be available after
 * this macro.
 * Does the following steps:
 *   1. Compute the physical address of the per-cpu/per-tl save area
 *      using only %g1+%g2 (which we've saved in %tpc, %tnpc, %tstate),
 *      leaving the address in %g1 and updating the %asi register.
 *      If there is no data area available, we branch to label.
 *   2. Save %g3-%g7 in the save area.
 *   3. Save %tpc->%g3, %tnpc->%g4, %tstate->%g5, which contain the
 *      original %g1+%g2 values (because we're going to change %tl).
 *   4. Set %tl <- %tl - 1.  We do this ASAP to make the window of
 *      running at %tl+1 as small as possible.
 *   5. Reconstitute %g1+%g2 from %tpc (%g3), %tnpc (%g4) and
 *      %tstate (%g5) and save them in the save area, carefully
 *      preserving %g5 because it has the CH_ERR_TSTATE_DC_ON value.
 *   6. Load the existing ch_err_tl1_data flags into %g2.
 *   7. Compute the new flags.
 *   8. If %g2 is non-zero (the structure was busy), shift the new
 *      flags by CH_ERR_ME_SHIFT and or them with the old flags.
 *   9. Store the updated flags into ch_err_tl1_data flags.
 *   10. If %g2 is zero (the structure was not busy), read the %tpc
 *       and store it in ch_err_tl1_data.
 */
#define CH_ERR_TL1_ENTER(flags) \
        GET_CH_ERR_TL1_PTR(%g1, %g2, CHPR_TL1_ERR_DATA); \
        stxa %g3, [%g1 + CH_ERR_TL1_G3]%asi; \
        stxa %g4, [%g1 + CH_ERR_TL1_G4]%asi; \
        stxa %g5, [%g1 + CH_ERR_TL1_G5]%asi; \
        stxa %g6, [%g1 + CH_ERR_TL1_G6]%asi; \
        stxa %g7, [%g1 + CH_ERR_TL1_G7]%asi; \
        rdpr %tpc, %g3; \
        rdpr %tnpc, %g4; \
        rdpr %tstate, %g5; \
        rdpr %tl, %g6; \
        sub %g6, 1, %g6; \
        wrpr %g6, %tl; \
        and %g5, 3, %g6; \
        andn %g3, 3, %g3; \
        or %g3, %g6, %g3; \
        stxa %g3, [%g1 + CH_ERR_TL1_G1]%asi; \
        srlx %g5, CH_ERR_G2_TO_TSTATE_SHFT, %g6; \
        and %g6, 3, %g6; \
        andn %g4, 3, %g4; \
        or %g6, %g4, %g4; \
        stxa %g4, [%g1 + CH_ERR_TL1_G2]%asi; \
        ldxa [%g1 + CH_ERR_TL1_FLAGS]%asi, %g2; \
        set flags | CH_ERR_TL, %g3; \
        brz %g2, 9f; \
        sllx %g3, CH_ERR_ME_SHIFT, %g4; \
        or %g2, %g4, %g3; \
9:      stxa %g3, [%g1 + CH_ERR_TL1_FLAGS]%asi; \
        brnz %g2, 8f; \
        rdpr %tpc, %g4; \
        stxa %g4, [%g1 + CH_ERR_TL1_TPC]%asi; \
8:

/*
 * Turns off D$/I$ and saves the state of DCU_DC+DCU_IC in %tstate Bits 8+9
 * (CH_ERR_TSTATE_DC_ON/CH_ERR_TSTATE_IC_ON).  This is invoked in Fast ECC
 * at TL>0 handlers because the D$ may have corrupted data and we need to
 * turn off the I$ to allow for diagnostic accesses.  We then invoke
 * the normal entry macro, and after it is done we save the values of
 * the original D$/I$ state, which is in %g5 bits CH_ERR_TSTATE_DC_ON/
 * CH_ERR_TSTATE_IC_ON, in ch_err_tl1_tmp.
 */
#define CH_ERR_TL1_FECC_ENTER \
        ldxa [%g0]ASI_DCU, %g1; \
        andn %g1, DCU_DC + DCU_IC, %g2; \
        stxa %g2, [%g0]ASI_DCU; \
        flush %g0; /* DCU_IC need flush */ \
        rdpr %tstate, %g2; \
        and %g1, DCU_DC + DCU_IC, %g1; \
        sllx %g1, CH_ERR_DCU_TO_TSTATE_SHFT, %g1; \
        or %g1, %g2, %g2; \
        wrpr %g2, %tstate; \
        CH_ERR_TL1_ENTER(CH_ERR_FECC); \
        and %g5, CH_ERR_TSTATE_DC_ON + CH_ERR_TSTATE_IC_ON, %g5; \
        stxa %g5, [%g1 + CH_ERR_TL1_TMP]%asi

/*
 * Macro to generate exit code for TL>0 error handlers.
 * We fall into this macro if we've successfully logged the error in
 * the ch_err_tl1_data structure and want the PIL15 softint to pick
 * it up and log it.
 * Does the following steps:
 *   1. Set the pending flag for this cpu in ch_err_tl1_pending.
 *   2. Write %set_softint with (1<<pil) to cause a pil level trap.
 *   3. Restore registers from ch_err_tl1_data, which is pointed to
 *      by %g1; the last register to restore is %g1 since it's pointing
 *      to the save area.
 *   4. Execute retry.
 */
#define CH_ERR_TL1_EXIT \
        CPU_INDEX(%g2, %g3); \
        set ch_err_tl1_pending, %g3; \
        set -1, %g4; \
        stb %g4, [%g2 + %g3]; \
        mov 1, %g2; \
        sll %g2, PIL_15, %g2; \
        wr %g2, SET_SOFTINT; \
        ldxa [%g1 + CH_ERR_TL1_G7]%asi, %g7; \
        ldxa [%g1 + CH_ERR_TL1_G6]%asi, %g6; \
        ldxa [%g1 + CH_ERR_TL1_G5]%asi, %g5; \
        ldxa [%g1 + CH_ERR_TL1_G4]%asi, %g4; \
        ldxa [%g1 + CH_ERR_TL1_G3]%asi, %g3; \
        ldxa [%g1 + CH_ERR_TL1_G2]%asi, %g2; \
        ldxa [%g1 + CH_ERR_TL1_G1]%asi, %g1; \
        retry

/*
 * Generates the unrecoverable error label for TL>0 handlers.
 * At label (the unrecoverable error routine):
 *   1. Sets flags in ch_err_tl1_data and leaves them in %g2 (the first
 *      argument to cpu_tl1_err_panic).
 *   2. Calls cpu_tl1_err_panic via systrap at PIL 15.
 */
#define CH_ERR_TL1_PANIC_EXIT(label) \
label:  ldxa [%g1 + CH_ERR_TL1_FLAGS]%asi, %g2; \
        or %g2, CH_ERR_TL | CH_ERR_PANIC, %g2; \
        stxa %g2, [%g1 + CH_ERR_TL1_FLAGS]%asi; \
        set cpu_tl1_err_panic, %g1; \
        ba sys_trap; \
        mov PIL_15, %g4

/* END CSTYLED */
#endif /* _ASM */

#ifdef __cplusplus
}
#endif

#endif /* _CHEETAHASM_H */