/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#ifndef _CHEETAHASM_H
#define _CHEETAHASM_H

#pragma ident "%Z%%M% %I% %E% SMI"

#ifdef __cplusplus
extern "C" {
#endif

#ifdef _ASM
/* BEGIN CSTYLED */

#define ASM_LD(reg, symbol) \
        sethi   %hi(symbol), reg; \
        ld      [reg + %lo(symbol)], reg; \

#define ASM_LDX(reg, symbol) \
        sethi   %hi(symbol), reg; \
        ldx     [reg + %lo(symbol)], reg; \

#define ASM_JMP(reg, symbol) \
        sethi   %hi(symbol), reg; \
        jmp     reg + %lo(symbol); \
        nop

/*
 * Macro for getting to offset from 'cpu_private' ptr.  The 'cpu_private'
 * ptr is in the machcpu structure.
 *  off_reg: Register offset from 'cpu_private' ptr.
 *  scr1:    Scratch, ptr is returned in this register.
 *  scr2:    Scratch
 *  label:   Label to branch to if cpu_private ptr is null/zero.
 */
#define GET_CPU_PRIVATE_PTR(off_reg, scr1, scr2, label) \
        CPU_ADDR(scr1, scr2); \
        ldn     [scr1 + CPU_PRIVATE], scr1; \
        cmp     scr1, 0; \
        be      label; \
        nop; \
        add     scr1, off_reg, scr1

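/*
 * Illustrative C-level sketch of GET_CPU_PRIVATE_PTR; the struct and
 * field names below are assumptions for exposition only, not part of
 * this header:
 *
 *      if (CPU->cpu_m.cpu_private == NULL)
 *              goto label;
 *      scr1 = (uintptr_t)CPU->cpu_m.cpu_private + off_reg;
 */
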
/*
 * Macro version of get_dcache_dtag.  We use this macro in the
 * CPU logout code.  Since the Dcache is virtually indexed, only
 * bits [12:5] of the AFAR can be used so we need to search through
 * 8 indexes (4 ways + bit 13) in order to find the tag we want.
 *   afar:  input AFAR, not modified.
 *   datap: input ptr to ch_dc_data_t, at end pts to end of ch_dc_data_t.
 *   scr1:  scratch.
 *   scr2:  scratch, will hold tag to look for.
 *   scr3:  used for Dcache index, loops through 4 ways.
 */
#define GET_DCACHE_DTAG(afar, datap, scr1, scr2, scr3) \
        set     CH_DCACHE_IDX_MASK, scr3; \
        and     afar, scr3, scr3; \
        srlx    afar, CH_DCTAG_PA_SHIFT, scr2; \
        b       1f; \
        or      scr2, CH_DCTAG_VALID_BIT, scr2; /* tag we want */ \
        .align  128; \
1: \
        ldxa    [scr3]ASI_DC_TAG, scr1;         /* read tag */ \
        cmp     scr1, scr2; \
        bne     4f;                             /* not found? */ \
        nop; \
        stxa    scr3, [datap + CH_DC_IDX]%asi;  /* store index */ \
        stxa    scr1, [datap + CH_DC_TAG]%asi;  /* store tag */ \
        membar  #Sync;                          /* Cheetah PRM 10.6.3 */ \
        ldxa    [scr3]ASI_DC_UTAG, scr1;        /* read utag */ \
        membar  #Sync;                          /* Cheetah PRM 10.6.3 */ \
        stxa    scr1, [datap + CH_DC_UTAG]%asi; \
        ldxa    [scr3]ASI_DC_SNP_TAG, scr1;     /* read snoop tag */ \
        stxa    scr1, [datap + CH_DC_SNTAG]%asi; \
        add     datap, CH_DC_DATA, datap; \
        clr     scr2; \
2: \
        membar  #Sync;                          /* Cheetah PRM 10.6.1 */ \
        ldxa    [scr3 + scr2]ASI_DC_DATA, scr1; /* read data */ \
        membar  #Sync;                          /* Cheetah PRM 10.6.1 */ \
        stxa    scr1, [datap]%asi; \
        add     datap, 8, datap; \
        cmp     scr2, CH_DC_DATA_REG_SIZE - 8; \
        blt     2b; \
        add     scr2, 8, scr2; \
\
        GET_CPU_IMPL(scr2);             /* Parity bits are elsewhere for */ \
        cmp     scr2, PANTHER_IMPL;     /* panther processors. */ \
        bne,a   5f;                     /* Done if not panther. */ \
        add     datap, 8, datap;        /* Skip to the end of the struct. */ \
        clr     scr2; \
        add     datap, 7, datap;        /* offset of the last parity byte */ \
        mov     1, scr1; \
        sll     scr1, PN_DC_DATA_PARITY_BIT_SHIFT, scr1; \
        or      scr3, scr1, scr3;       /* add DC_data_parity bit to index */ \
3: \
        membar  #Sync;                          /* Cheetah PRM 10.6.1 */ \
        ldxa    [scr3 + scr2]ASI_DC_DATA, scr1; /* read parity bits */ \
        membar  #Sync;                          /* Cheetah PRM 10.6.1 */ \
        stba    scr1, [datap]%asi; \
        dec     datap; \
        cmp     scr2, CH_DC_DATA_REG_SIZE - 8; \
        blt     3b; \
        add     scr2, 8, scr2; \
        b       5f; \
        add     datap, 5, datap;        /* set pointer to end of our struct */ \
4: \
        set     CH_DCACHE_IDX_INCR, scr1;       /* incr. idx (scr3) */ \
        add     scr3, scr1, scr3; \
        set     CH_DCACHE_IDX_LIMIT, scr1;      /* done? */ \
        cmp     scr3, scr1; \
        blt     1b; \
        nop; \
        add     datap, CH_DC_DATA_SIZE, datap; \
5:

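/*
 * Illustrative pseudo-code for the index search above (assumed C-level
 * names, for exposition only):
 *
 *      tag = (afar >> CH_DCTAG_PA_SHIFT) | CH_DCTAG_VALID_BIT;
 *      for (idx = afar & CH_DCACHE_IDX_MASK; idx < CH_DCACHE_IDX_LIMIT;
 *          idx += CH_DCACHE_IDX_INCR) {
 *              if (dc_tag(idx) == tag)
 *                      ... capture tag, utag, snoop tag and data ...
 *      }
 */
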
/*
 * Macro version of get_icache_dtag.  We use this macro in the CPU
 * logout code.  If the Icache is on, we don't want to capture the data.
 *   afar:  input AFAR, not modified.
 *   datap: input ptr to ch_ic_data_t, at end pts to end of ch_ic_data_t.
 *   scr1:  scratch.
 *   scr2:  scratch, will hold tag to look for.
 *   scr3:  used for Icache index, loops through 4 ways.
 * Note: For Panther, the Icache is virtually indexed and increases in
 * size to 64KB (instead of 32KB) with a line size of 64 bytes (instead
 * of 32).  This means the IC_addr index bits[14:7] for Panther now
 * correspond to VA bits[13:6].  But since it is virtually indexed, we
 * still mask out only bits[12:5] from the AFAR (we have to manually
 * check bit 13).  In order to make this code work for all processors,
 * we end up checking twice as many indexes (8 instead of 4) as required
 * for non-Panther CPUs and saving off twice as much data (16 instructions
 * instead of just 8).
 */
#define GET_ICACHE_DTAG(afar, datap, scr1, scr2, scr3) \
        ldxa    [%g0]ASI_DCU, scr1; \
        btst    DCU_IC, scr1;           /* is Icache enabled? */ \
        bne,a   6f;                     /* yes, don't capture */ \
        add     datap, CH_IC_DATA_SIZE, datap;  /* annul if no branch */ \
        GET_CPU_IMPL(scr2);             /* Panther only uses VA[13:6] */ \
        cmp     scr2, PANTHER_IMPL;     /* and we also want to mask */ \
        be      1f;                     /* out bit 13 since the */ \
        nop;                            /* Panther I$ is VIPT. */ \
        set     CH_ICACHE_IDX_MASK, scr3; \
        b       2f; \
        nop; \
1: \
        set     PN_ICACHE_VA_IDX_MASK, scr3; \
2: \
        and     afar, scr3, scr3; \
        sllx    scr3, CH_ICACHE_IDX_SHIFT, scr3; \
        srlx    afar, CH_ICPATAG_SHIFT, scr2;   /* pa tag we want */ \
        andn    scr2, CH_ICPATAG_LBITS, scr2;   /* mask off lower */ \
        b       3f; \
        nop; \
        .align  128; \
3: \
        ldxa    [scr3]ASI_IC_TAG, scr1;         /* read pa tag */ \
        andn    scr1, CH_ICPATAG_LBITS, scr1;   /* mask off lower */ \
        cmp     scr1, scr2; \
        bne     5f;                             /* not found? */ \
        nop; \
        stxa    scr3, [datap + CH_IC_IDX]%asi;  /* store index */ \
        stxa    scr1, [datap + CH_IC_PATAG]%asi; /* store pa tag */ \
        add     scr3, CH_ICTAG_UTAG, scr3;      /* read utag */ \
        ldxa    [scr3]ASI_IC_TAG, scr1; \
        add     scr3, (CH_ICTAG_UPPER - CH_ICTAG_UTAG), scr3; \
        stxa    scr1, [datap + CH_IC_UTAG]%asi; \
        ldxa    [scr3]ASI_IC_TAG, scr1;         /* read upper tag */ \
        add     scr3, (CH_ICTAG_LOWER - CH_ICTAG_UPPER), scr3; \
        stxa    scr1, [datap + CH_IC_UPPER]%asi; \
        ldxa    [scr3]ASI_IC_TAG, scr1;         /* read lower tag */ \
        andn    scr3, CH_ICTAG_TMASK, scr3; \
        stxa    scr1, [datap + CH_IC_LOWER]%asi; \
        ldxa    [scr3]ASI_IC_SNP_TAG, scr1;     /* read snoop tag */ \
        stxa    scr1, [datap + CH_IC_SNTAG]%asi; \
        add     datap, CH_IC_DATA, datap; \
        clr     scr2; \
4: \
        ldxa    [scr3 + scr2]ASI_IC_DATA, scr1; /* read ins. data */ \
        stxa    scr1, [datap]%asi; \
        add     datap, 8, datap; \
        cmp     scr2, PN_IC_DATA_REG_SIZE - 8; \
        blt     4b; \
        add     scr2, 8, scr2; \
        b       6f; \
        nop; \
5: \
        set     CH_ICACHE_IDX_INCR, scr1;       /* incr. idx (scr3) */ \
        add     scr3, scr1, scr3; \
        set     PN_ICACHE_IDX_LIMIT, scr1;      /* done? */ \
        cmp     scr3, scr1; \
        blt     3b; \
        nop; \
        add     datap, CH_IC_DATA_SIZE, datap; \
6:

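/*
 * Illustrative pseudo-code for the index search above (assumed C-level
 * names, for exposition only):
 *
 *      mask = (IMPL == PANTHER) ? PN_ICACHE_VA_IDX_MASK
 *                               : CH_ICACHE_IDX_MASK;
 *      patag = (afar >> CH_ICPATAG_SHIFT) & ~CH_ICPATAG_LBITS;
 *      for (idx = (afar & mask) << CH_ICACHE_IDX_SHIFT;
 *          idx < PN_ICACHE_IDX_LIMIT; idx += CH_ICACHE_IDX_INCR) {
 *              if ((ic_patag(idx) & ~CH_ICPATAG_LBITS) == patag)
 *                      ... capture tags and instruction data ...
 *      }
 */
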
#if defined(JALAPENO) || defined(SERRANO)
/*
 * Macro version of get_ecache_dtag.  We use this macro in the
 * CPU logout code.
 *   afar:   input AFAR, not modified
 *   datap:  Ptr to ch_ec_data_t, at end pts just past ch_ec_data_t.
 *   ec_way: Constant value (way number)
 *   scr1:   Scratch
 *   scr2:   Scratch.
 *   scr3:   Scratch.
 */
#define GET_ECACHE_DTAG(afar, datap, ec_way, scr1, scr2, scr3) \
        mov     ec_way, scr1; \
        and     scr1, JP_ECACHE_NWAY - 1, scr1; /* mask E$ way bits */ \
        sllx    scr1, JP_EC_TAG_DATA_WAY_SHIFT, scr1; \
        set     ((JP_ECACHE_MAX_SIZE / JP_ECACHE_NWAY) - 1), scr2; \
        and     afar, scr2, scr3;               /* get set offset */ \
        andn    scr3, (JP_ECACHE_MAX_LSIZE - 1), scr3;  /* VA<5:0>=0 */ \
        or      scr3, scr1, scr3;               /* or WAY bits */ \
        b       1f; \
        stxa    scr3, [datap + CH_EC_IDX]%asi;  /* store E$ index */ \
        .align  64; \
1: \
        JP_EC_DIAG_ACCESS_MEMBAR; \
        ldxa    [scr3]ASI_EC_DIAG, scr1;        /* get E$ tag */ \
        JP_EC_DIAG_ACCESS_MEMBAR; \
        stxa    scr1, [datap + CH_EC_TAG]%asi; \
        add     datap, CH_EC_DATA, datap; \
2: \
        ldxa    [scr3]ASI_EC_R, %g0;            /* ld E$ stging regs */ \
        clr     scr1; \
3:                                              /* loop thru 5 regs */ \
        ldxa    [scr1]ASI_EC_DATA, scr2; \
        stxa    scr2, [datap]%asi; \
        add     datap, 8, datap; \
        cmp     scr1, CH_ECACHE_STGREG_TOTALSIZE - 8; \
        bne     3b; \
        add     scr1, 8, scr1; \
        btst    CH_ECACHE_STGREG_SIZE, scr3;    /* done? */ \
        beq     2b; \
        add     scr3, CH_ECACHE_STGREG_SIZE, scr3

#define GET_ECACHE_DTAGS(afar, datap, scr1, scr2, scr3) \
        GET_ECACHE_DTAG(afar, datap, 0, scr1, scr2, scr3); \
        GET_ECACHE_DTAG(afar, datap, 1, scr1, scr2, scr3); \
        GET_ECACHE_DTAG(afar, datap, 2, scr1, scr2, scr3); \
        GET_ECACHE_DTAG(afar, datap, 3, scr1, scr2, scr3); \
        add     datap, (CHD_EC_DATA_SETS-4)*CH_EC_DATA_SIZE, datap; \
        add     datap, CH_EC_DATA_SIZE * PN_L2_NWAYS, datap; \

/*
 * Jalapeno does not have cores so these macros are null.
 */
#define PARK_SIBLING_CORE(dcucr_reg, scr1, scr2)
#define UNPARK_SIBLING_CORE(dcucr_reg, scr1, scr2)

#if defined(JALAPENO)
/*
 * Jalapeno gets primary AFSR and AFAR.  All bits in the AFSR except
 * the fatal error bits are cleared.
 *   datap: pointer to cpu logout structure.
 *   afar:  returned primary AFAR value.
 *   scr1:  scratch
 *   scr2:  scratch
 */
#define GET_AFSR_AFAR(datap, afar, scr1, scr2) \
        ldxa    [%g0]ASI_AFAR, afar; \
        stxa    afar, [datap + (CH_CLO_DATA + CH_CHD_AFAR)]%asi; \
        ldxa    [%g0]ASI_AFSR, scr2; \
        stxa    scr2, [datap + (CH_CLO_DATA + CH_CHD_AFSR)]%asi; \
        sethi   %hh(C_AFSR_FATAL_ERRS), scr1; \
        sllx    scr1, 32, scr1; \
        bclr    scr1, scr2;     /* Clear fatal error bits here, so */ \
        stxa    scr2, [%g0]ASI_AFSR;    /* they're left as is in AFSR */ \
        membar  #Sync

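/*
 * Illustrative pseudo-code for the capture/clear sequence above,
 * assuming write-one-to-clear AFSR semantics (as the inline comments
 * imply); clearing the fatal bits in the value written back leaves
 * those bits set in the hardware AFSR:
 *
 *      afar = *ASI_AFAR;  afsr = *ASI_AFSR;   save both in logout area
 *      *ASI_AFSR = afsr & ~C_AFSR_FATAL_ERRS; clears all but fatal bits
 */
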
/*
 * Jalapeno has no shadow AFAR, null operation.
 */
#define GET_SHADOW_DATA(afar, datap, scr1, scr2, scr3)

#elif defined(SERRANO)
/*
 * Serrano gets primary AFSR and AFAR.  All bits in the AFSR except
 * the fatal error bits are cleared.  For Serrano, we also save the
 * AFAR2 register.
 *   datap: pointer to cpu logout structure.
 *   afar:  returned primary AFAR value.
 *   scr1:  scratch
 *   scr2:  scratch
 */
#define GET_AFSR_AFAR(datap, afar, scr1, scr2) \
        set     ASI_MCU_AFAR2_VA, scr1; \
        ldxa    [scr1]ASI_MCU_CTRL, afar; \
        stxa    afar, [datap + (CH_CLO_DATA + CH_CHD_AFAR2)]%asi; \
        ldxa    [%g0]ASI_AFAR, afar; \
        stxa    afar, [datap + (CH_CLO_DATA + CH_CHD_AFAR)]%asi; \
        ldxa    [%g0]ASI_AFSR, scr2; \
        stxa    scr2, [datap + (CH_CLO_DATA + CH_CHD_AFSR)]%asi; \
        sethi   %hh(C_AFSR_FATAL_ERRS), scr1; \
        sllx    scr1, 32, scr1; \
        bclr    scr1, scr2;     /* Clear fatal error bits here, so */ \
        stxa    scr2, [%g0]ASI_AFSR;    /* they're left as is in AFSR */ \
        membar  #Sync

/*
 * Serrano needs to capture E$, D$ and I$ lines associated with afar2.
 *   afar:  scratch, holds afar2.
 *   datap: pointer to cpu logout structure
 *   scr1:  scratch
 *   scr2:  scratch
 *   scr3:  scratch
 */
#define GET_SHADOW_DATA(afar, datap, scr1, scr2, scr3) \
        ldxa    [datap + (CH_CLO_DATA + CH_CHD_AFAR2)]%asi, afar; \
        add     datap, CH_CLO_SDW_DATA + CH_CHD_EC_DATA, datap; \
        GET_ECACHE_DTAGS(afar, datap, scr1, scr2, scr3); \
        GET_DCACHE_DTAG(afar, datap, scr1, scr2, scr3); \
        GET_ICACHE_DTAG(afar, datap, scr1, scr2, scr3); \
        sub     datap, CH_CPU_LOGOUT_SIZE, datap
#endif /* SERRANO */

#elif defined(CHEETAH_PLUS)
/*
 * Macro version of get_ecache_dtag.  We use this macro in the
 * CPU logout code.
 *   afar:   input AFAR, not modified.
 *   datap:  Ptr to ch_ec_data_t, at end pts just past ch_ec_data_t.
 *   pn_way: ecache way for panther (value = 0-3).  For non-panther
 *           cpus, this macro will be called with pn_way = 0.
 *   scr1:   Scratch.
 *   scr2:   Scratch.
 *   scr3:   Scratch.
 */
#define GET_ECACHE_DTAG(afar, datap, pn_way, scr1, scr2, scr3) \
        mov     afar, scr3; \
        andn    scr3, (CH_ECACHE_SUBBLK_SIZE - 1), scr3;  /* VA<5:0>=0 */ \
        set     (CH_ECACHE_8M_SIZE - 1), scr2; \
        and     scr3, scr2, scr3;               /* VA<63:23>=0 */ \
        mov     pn_way, scr1;   /* panther L3$ is 4-way so we ... */ \
        sllx    scr1, PN_L3_WAY_SHIFT, scr1;    /* need to mask... */ \
        or      scr3, scr1, scr3;       /* in the way bits <24:23>. */ \
        b       1f; \
        stxa    scr3, [datap + CH_EC_IDX]%asi;  /* store E$ index */ \
        .align  64; \
1: \
        ldxa    [scr3]ASI_EC_DIAG, scr1;        /* get E$ tag */ \
        stxa    scr1, [datap + CH_EC_TAG]%asi; \
        set     CHP_ECACHE_IDX_TAG_ECC, scr1; \
        or      scr3, scr1, scr1; \
        ldxa    [scr1]ASI_EC_DIAG, scr1;        /* get E$ tag ECC */ \
        stxa    scr1, [datap + CH_EC_TAG_ECC]%asi; \
        add     datap, CH_EC_DATA, datap; \
2: \
        ldxa    [scr3]ASI_EC_R, %g0;            /* ld E$ stging regs */ \
        clr     scr1; \
3:                                              /* loop thru 5 regs */ \
        ldxa    [scr1]ASI_EC_DATA, scr2; \
        stxa    scr2, [datap]%asi; \
        add     datap, 8, datap; \
        cmp     scr1, CH_ECACHE_STGREG_TOTALSIZE - 8; \
        bne     3b; \
        add     scr1, 8, scr1; \
        btst    CH_ECACHE_STGREG_SIZE, scr3;    /* done? */ \
        beq     2b; \
        add     scr3, CH_ECACHE_STGREG_SIZE, scr3

/*
 * If this is a panther, we need to make sure the sibling core is
 * parked so that we avoid any race conditions during diagnostic
 * accesses to the shared L2 and L3 caches.
 * dcucr_reg:   This register will be used to keep track of whether
 *              or not we need to unpark the core later.
 *              It just so happens that we also use this same register
 *              to keep track of our saved DCUCR value so we only touch
 *              bit 4 of the register (which is a "reserved" bit in the
 *              DCUCR) for keeping track of core parking.
 * scr1:        Scratch register.
 * scr2:        Scratch register.
 */
#define PARK_SIBLING_CORE(dcucr_reg, scr1, scr2) \
        GET_CPU_IMPL(scr1); \
        cmp     scr1, PANTHER_IMPL;     /* only park for panthers */ \
        bne,a   %xcc, 2f; \
        andn    dcucr_reg, PN_PARKED_OTHER_CORE, dcucr_reg; \
        set     ASI_CORE_RUNNING_STATUS, scr1;  /* check other core */ \
        ldxa    [scr1]ASI_CMP_SHARED, scr2;     /* is it running? */ \
        cmp     scr2, PN_BOTH_CORES_RUNNING; \
        bne,a   %xcc, 2f;       /* if not running, we are done */ \
        andn    dcucr_reg, PN_PARKED_OTHER_CORE, dcucr_reg; \
        or      dcucr_reg, PN_PARKED_OTHER_CORE, dcucr_reg; \
        set     ASI_CORE_ID, scr1; \
        ldxa    [scr1]ASI_CMP_PER_CORE, scr2; \
        and     scr2, COREID_MASK, scr2; \
        or      %g0, 1, scr1;           /* find out which core... */ \
        sll     scr1, scr2, scr2;       /* ... we need to park... */ \
1: \
        set     ASI_CORE_RUNNING_RW, scr1; \
        ldxa    [scr1]ASI_CMP_SHARED, scr1;     /* ...but are we? */ \
        btst    scr1, scr2;     /* check our own parked status */ \
        bz      %xcc, 1b;       /* if we are then go round again */ \
        nop; \
        set     ASI_CORE_RUNNING_RW, scr1;      /* else proceed... */ \
        stxa    scr2, [scr1]ASI_CMP_SHARED;     /* ... and park it. */ \
        membar  #Sync; \
        set     ASI_CORE_RUNNING_STATUS, scr1;  /* spin until... */ \
        ldxa    [scr1]ASI_CMP_SHARED, scr1;     /* ... the other... */ \
        cmp     scr1, scr2;     /* ...core is parked according to... */ \
        bne,a   %xcc, 1b;       /* ...the core running status reg. */ \
        nop; \
2:

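/*
 * Illustrative pseudo-code for the parking protocol above (assumed
 * C-level names, for exposition only):
 *
 *      if (IMPL != PANTHER || core_running_status != BOTH_RUNNING)
 *              return;                         nothing to park
 *      dcucr_reg |= PN_PARKED_OTHER_CORE;      remember to unpark later
 *      mask = 1 << our_core_id;                keep only our own core
 *      while ((core_running_rw & mask) == 0)
 *              spin;                           we must be running first
 *      core_running_rw = mask;                 park the sibling
 *      while (core_running_status != mask)
 *              spin;                           wait for it to park
 */
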
/*
 * The core running this code will unpark its sibling core if the
 * sibling core had been parked by the current core earlier in this
 * trap handler.
 * dcucr_reg:   This register is used to keep track of whether or not
 *              we need to unpark our sibling core.
 *              It just so happens that we also use this same register
 *              to keep track of our saved DCUCR value so we only touch
 *              bit 4 of the register (which is a "reserved" bit in the
 *              DCUCR) for keeping track of core parking.
 * scr1:        Scratch register.
 * scr2:        Scratch register.
 */
#define UNPARK_SIBLING_CORE(dcucr_reg, scr1, scr2) \
        btst    PN_PARKED_OTHER_CORE, dcucr_reg; \
        bz,pt   %xcc, 1f;       /* if nothing to unpark, we are done */ \
        andn    dcucr_reg, PN_PARKED_OTHER_CORE, dcucr_reg; \
        set     ASI_CORE_RUNNING_RW, scr1; \
        set     PN_BOTH_CORES_RUNNING, scr2;    /* we want both... */ \
        stxa    scr2, [scr1]ASI_CMP_SHARED;     /* ...cores running. */ \
        membar  #Sync; \
1:

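/*
 * Typical pairing (illustrative only; the register choices here are
 * hypothetical):
 *
 *      PARK_SIBLING_CORE(%g4, %g5, %g6);       park before diag access
 *      ... diagnostic ASI accesses to the shared L2/L3 caches ...
 *      UNPARK_SIBLING_CORE(%g4, %g5, %g6);     restore the sibling
 */
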
/*
 * Cheetah+ and Jaguar get both primary and secondary AFSR/AFAR.  All bits
 * in the primary AFSR are cleared except the fatal error bits.  For Panther,
 * we also have to read and clear the AFSR_EXT, again leaving the fatal
 * error bits alone.
 *   datap: pointer to cpu logout structure.
 *   afar:  returned primary AFAR value.
 *   scr1:  scratch
 *   scr2:  scratch
 */
#define GET_AFSR_AFAR(datap, afar, scr1, scr2) \
        set     ASI_SHADOW_REG_VA, scr1; \
        ldxa    [scr1]ASI_AFAR, scr2; \
        stxa    scr2, [datap + (CH_CLO_SDW_DATA + CH_CHD_AFAR)]%asi; \
        ldxa    [scr1]ASI_AFSR, scr2; \
        stxa    scr2, [datap + (CH_CLO_SDW_DATA + CH_CHD_AFSR)]%asi; \
        ldxa    [%g0]ASI_AFAR, afar; \
        stxa    afar, [datap + (CH_CLO_DATA + CH_CHD_AFAR)]%asi; \
        ldxa    [%g0]ASI_AFSR, scr2; \
        stxa    scr2, [datap + (CH_CLO_DATA + CH_CHD_AFSR)]%asi; \
        sethi   %hh(C_AFSR_FATAL_ERRS), scr1; \
        sllx    scr1, 32, scr1; \
        bclr    scr1, scr2;     /* Clear fatal error bits here, so */ \
        stxa    scr2, [%g0]ASI_AFSR;    /* they're left as is in AFSR */ \
        membar  #Sync; \
        GET_CPU_IMPL(scr1); \
        cmp     scr1, PANTHER_IMPL; \
        bne     %xcc, 1f; \
        nop; \
        set     ASI_SHADOW_AFSR_EXT_VA, scr1;   /* shadow AFSR_EXT */ \
        ldxa    [scr1]ASI_AFSR, scr2; \
        stxa    scr2, [datap + (CH_CLO_SDW_DATA + CH_CHD_AFSR_EXT)]%asi; \
        set     ASI_AFSR_EXT_VA, scr1;          /* primary AFSR_EXT */ \
        ldxa    [scr1]ASI_AFSR, scr2; \
        stxa    scr2, [datap + (CH_CLO_DATA + CH_CHD_AFSR_EXT)]%asi; \
        set     C_AFSR_EXT_FATAL_ERRS, scr1; \
        bclr    scr1, scr2;             /* Clear fatal error bits here, */ \
        set     ASI_AFSR_EXT_VA, scr1;  /* so they're left */ \
        stxa    scr2, [scr1]ASI_AFSR;   /* as is in AFSR_EXT */ \
        membar  #Sync; \
1:

/*
 * This macro is used in the CPU logout code to capture diagnostic
 * information from the L2 cache on panther processors.
 *   afar:  input AFAR, not modified.
 *   datap: Ptr to pn_l2_data_t, at end pts just past pn_l2_data_t.
 *   scr1:  Scratch.
 *   scr2:  Scratch.
 *   scr3:  Scratch.
 */
#define GET_PN_L2_CACHE_DTAGS(afar, datap, scr1, scr2, scr3) \
        mov     afar, scr3; \
        set     PN_L2_INDEX_MASK, scr1; \
        and     scr3, scr1, scr3; \
        b       1f;     /* code to read tags and data should be ... */ \
        nop;            /* ...on the same cache line if possible. */ \
        .align  128;    /* update this line if you add lines below. */ \
1: \
        stxa    scr3, [datap + CH_EC_IDX]%asi;  /* store L2$ index */ \
        ldxa    [scr3]ASI_L2_TAG, scr1;         /* read the L2$ tag */ \
        stxa    scr1, [datap + CH_EC_TAG]%asi; \
        add     datap, CH_EC_DATA, datap; \
        clr     scr1; \
2: \
        ldxa    [scr3 + scr1]ASI_L2_DATA, scr2; /* loop through */ \
        stxa    scr2, [datap]%asi;              /* <511:256> of L2 */ \
        add     datap, 8, datap;                /* data and record */ \
        cmp     scr1, (PN_L2_LINESIZE / 2) - 8; /* it in the cpu */ \
        bne     2b;                             /* logout struct. */ \
        add     scr1, 8, scr1; \
        set     PN_L2_DATA_ECC_SEL, scr2;       /* ECC_sel bit. */ \
        ldxa    [scr3 + scr2]ASI_L2_DATA, scr2; /* Read and record */ \
        stxa    scr2, [datap]%asi;              /* ecc of <511:256> */ \
        add     datap, 8, datap; \
3: \
        ldxa    [scr3 + scr1]ASI_L2_DATA, scr2; /* loop through */ \
        stxa    scr2, [datap]%asi;              /* <255:0> of L2 */ \
        add     datap, 8, datap;                /* data and record */ \
        cmp     scr1, PN_L2_LINESIZE - 8;       /* it in the cpu */ \
        bne     3b;                             /* logout struct. */ \
        add     scr1, 8, scr1; \
        set     PN_L2_DATA_ECC_SEL, scr2;       /* ECC_sel bit. */ \
        add     scr2, PN_L2_ECC_LO_REG, scr2; \
        ldxa    [scr3 + scr2]ASI_L2_DATA, scr2; /* Read and record */ \
        stxa    scr2, [datap]%asi;              /* ecc of <255:0>. */ \
        add     datap, 8, datap;                /* Advance pointer */ \
        set     PN_L2_SET_SIZE, scr2; \
        set     PN_L2_MAX_SET, scr1; \
        cmp     scr1, scr3;     /* more ways to try for this line? */ \
        bg,a    %xcc, 1b;       /* if so, start over with next way */ \
        add     scr3, scr2, scr3

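/*
 * Illustrative pseudo-code for the way loop above (assumed C-level
 * names, for exposition only):
 *
 *      for (idx = afar & PN_L2_INDEX_MASK; idx <= PN_L2_MAX_SET;
 *          idx += PN_L2_SET_SIZE) {
 *              record index and tag;
 *              record the line data 8 bytes at a time, plus the
 *              ECC words covering <511:256> and <255:0>;
 *      }
 */
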
/*
 * Cheetah+ assumes E$ is 2-way and grabs both E$ lines associated with afar.
 *   afar:  AFAR from access.
 *   datap: pointer to cpu logout structure.
 *   scr1:  scratch
 *   scr2:  scratch
 *   scr3:  scratch
 */
#define GET_ECACHE_DTAGS(afar, datap, scr1, scr2, scr3) \
        GET_CPU_IMPL(scr1); \
        cmp     scr1, PANTHER_IMPL; \
        bne     %xcc, 4f; \
        nop; \
        GET_ECACHE_DTAG(afar, datap, 0, scr1, scr2, scr3); \
        GET_ECACHE_DTAG(afar, datap, 1, scr1, scr2, scr3); \
        GET_ECACHE_DTAG(afar, datap, 2, scr1, scr2, scr3); \
        GET_ECACHE_DTAG(afar, datap, 3, scr1, scr2, scr3); \
        add     datap, (CHD_EC_DATA_SETS-4)*CH_EC_DATA_SIZE, datap; \
        GET_PN_L2_CACHE_DTAGS(afar, datap, scr1, scr2, scr3); \
        b       5f; \
        nop; \
4: \
        GET_ECACHE_DTAG(afar, datap, 0, scr1, scr2, scr3); \
        GET_ECACHE_WAY_BIT(scr1, scr2); \
        xor     afar, scr1, afar; \
        GET_ECACHE_DTAG(afar, datap, 0, scr1, scr2, scr3); \
        GET_ECACHE_WAY_BIT(scr1, scr2);         /* restore AFAR */ \
        xor     afar, scr1, afar; \
        add     datap, (CHD_EC_DATA_SETS-2)*CH_EC_DATA_SIZE, datap; \
        add     datap, CH_EC_DATA_SIZE * PN_L2_NWAYS, datap; \
5:

/*
 * Cheetah+ needs to capture E$, D$ and I$ lines associated with
 * shadow afar.
 *   afar:  scratch, holds shadow afar.
 *   datap: pointer to cpu logout structure
 *   scr1:  scratch
 *   scr2:  scratch
 *   scr3:  scratch
 */
#define GET_SHADOW_DATA(afar, datap, scr1, scr2, scr3) \
        ldxa    [datap + (CH_CLO_SDW_DATA + CH_CHD_AFAR)]%asi, afar; \
        add     datap, CH_CLO_SDW_DATA + CH_CHD_EC_DATA, datap; \
        GET_ECACHE_DTAGS(afar, datap, scr1, scr2, scr3); \
        GET_DCACHE_DTAG(afar, datap, scr1, scr2, scr3); \
        GET_ICACHE_DTAG(afar, datap, scr1, scr2, scr3); \
        sub     datap, CH_CPU_LOGOUT_SIZE, datap

/*
 * Compute the "Way" bit for 2-way Ecache for Cheetah+.
 */
#define GET_ECACHE_WAY_BIT(scr1, scr2) \
        CPU_INDEX(scr1, scr2); \
        mulx    scr1, CPU_NODE_SIZE, scr1; \
        add     scr1, ECACHE_SIZE, scr1; \
        set     cpunodes, scr2; \
        ld      [scr1 + scr2], scr1; \
        srlx    scr1, 1, scr1

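/*
 * Illustrative C-level sketch of GET_ECACHE_WAY_BIT (assumed field
 * name, for exposition only):
 *
 *      scr1 = cpunodes[cpu_index].ecache_size / 2;
 *
 * XORing the AFAR with this value flips the index between the two
 * ways of a 2-way E$, which is how GET_ECACHE_DTAGS above reaches
 * the second line.
 */
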
#else /* CHEETAH_PLUS */
/*
 * Macro version of get_ecache_dtag.  We use this macro in the
 * CPU logout code.
 *   afar:  input AFAR, not modified.
 *   datap: Ptr to ch_ec_data_t, at end pts just past ch_ec_data_t.
 *   scr1:  Scratch.
 *   scr2:  Scratch.
 *   scr3:  Scratch.
 */
#define GET_ECACHE_DTAG(afar, datap, scr1, scr2, scr3) \
        mov     afar, scr3; \
        andn    scr3, (CH_ECACHE_SUBBLK_SIZE - 1), scr3;  /* VA<5:0>=0 */ \
        set     (CH_ECACHE_8M_SIZE - 1), scr2; \
        and     scr3, scr2, scr3;               /* VA<63:23>=0 */ \
        b       1f; \
        stxa    scr3, [datap + CH_EC_IDX]%asi;  /* store E$ index */ \
        .align  64; \
1: \
        ldxa    [scr3]ASI_EC_DIAG, scr1;        /* get E$ tag */ \
        stxa    scr1, [datap + CH_EC_TAG]%asi; \
        add     datap, CH_EC_DATA, datap; \
2: \
        ldxa    [scr3]ASI_EC_R, %g0;            /* ld E$ stging regs */ \
        clr     scr1; \
3:                                              /* loop thru 5 regs */ \
        ldxa    [scr1]ASI_EC_DATA, scr2; \
        stxa    scr2, [datap]%asi; \
        add     datap, 8, datap; \
        cmp     scr1, CH_ECACHE_STGREG_TOTALSIZE - 8; \
        bne     3b; \
        add     scr1, 8, scr1; \
        btst    CH_ECACHE_STGREG_SIZE, scr3;    /* done? */ \
        beq     2b; \
        add     scr3, CH_ECACHE_STGREG_SIZE, scr3

/*
 * Cheetah does not have cores so these macros are null.
 */
#define PARK_SIBLING_CORE(dcucr_reg, scr1, scr2)
#define UNPARK_SIBLING_CORE(dcucr_reg, scr1, scr2)

/*
 * Cheetah gets primary AFSR and AFAR and clears the AFSR, except for the
 * fatal error bits.
 *   datap: pointer to cpu logout structure.
 *   afar:  returned primary AFAR value.
 *   scr1:  scratch
 *   scr2:  scratch
 */
#define GET_AFSR_AFAR(datap, afar, scr1, scr2) \
        ldxa    [%g0]ASI_AFAR, afar; \
        stxa    afar, [datap + (CH_CLO_DATA + CH_CHD_AFAR)]%asi; \
        ldxa    [%g0]ASI_AFSR, scr2; \
        stxa    scr2, [datap + (CH_CLO_DATA + CH_CHD_AFSR)]%asi; \
        sethi   %hh(C_AFSR_FATAL_ERRS), scr1; \
        sllx    scr1, 32, scr1; \
        bclr    scr1, scr2;     /* Clear fatal error bits here, so */ \
        stxa    scr2, [%g0]ASI_AFSR;    /* they're left as is in AFSR */ \
        membar  #Sync

/*
 * Cheetah E$ is direct-mapped, so we grab line data and skip second line.
 *   afar:  AFAR from access.
 *   datap: pointer to cpu logout structure.
 *   scr1:  scratch
 *   scr2:  scratch
 *   scr3:  scratch
 */
#define GET_ECACHE_DTAGS(afar, datap, scr1, scr2, scr3) \
        GET_ECACHE_DTAG(afar, datap, scr1, scr2, scr3); \
        add     datap, (CHD_EC_DATA_SETS-1)*CH_EC_DATA_SIZE, datap; \
        add     datap, CH_EC_DATA_SIZE * PN_L2_NWAYS, datap; \

/*
 * Cheetah has no shadow AFAR, null operation.
 */
#define GET_SHADOW_DATA(afar, datap, scr1, scr2, scr3)

#endif /* CHEETAH_PLUS */

/*
 * Cheetah/(Cheetah+ Jaguar Panther)/Jalapeno Macro for capturing CPU
 * logout data at TL>0.  r_val is a register that returns the "failure count"
 * to the caller, and may be used as a scratch register until the end of
 * the macro.  afar is used to return the primary AFAR value to the caller
 * and it too can be used as a scratch register until the end.  r_or_s is
 * a reg or symbol that has the offset within the "cpu_private" data area
 * to deposit the logout data.  t_flags is a register that has the
 * trap-type/trap-level/CEEN info.  This t_flags register may be used after
 * the GET_AFSR_AFAR macro.
 *
 * The CPU logout operation will fail (r_val > 0) if the logout
 * structure in question is already being used.  Otherwise, the CPU
 * logout operation will succeed (r_val = 0).  For failures, r_val
 * returns the busy count (# of times we tried using this CPU logout
 * structure when it was busy.)
 *
 * Register usage:
 *   %asi:   Must be set to either ASI_MEM if the address in datap
 *           is a physical address or to ASI_N if the address in
 *           datap is a virtual address.
 *   r_val:  This register is the return value which tells the
 *           caller whether or not the LOGOUT operation was successful.
 *           For failures, r_val returns the fail count (i.e. number of
 *           times we have tried to use this logout structure when it
 *           was already being used).
 *   afar:   output: contains AFAR on exit
 *   t_flags: input trap type info, may be used as scratch after stored
 *           to cpu log out structure.
 *   datap:  Points to log out data area.
 *   scr1:   Scratch
 *   scr2:   Scratch (may be r_val)
 *   scr3:   Scratch (may be t_flags)
 */
#define DO_TL1_CPU_LOGOUT(r_val, afar, t_flags, datap, scr1, scr2, scr3) \
        setx    LOGOUT_INVALID, scr2, scr1; \
        ldxa    [datap + (CH_CLO_DATA + CH_CHD_AFAR)]%asi, scr2; \
        cmp     scr2, scr1; \
        bne     8f; \
        nop; \
        stxa    t_flags, [datap + CH_CLO_FLAGS]%asi; \
        GET_AFSR_AFAR(datap, afar, scr1, scr2); \
        add     datap, CH_CLO_DATA + CH_CHD_EC_DATA, datap; \
        GET_ECACHE_DTAGS(afar, datap, scr1, scr2, scr3); \
        GET_DCACHE_DTAG(afar, datap, scr1, scr2, scr3); \
        GET_ICACHE_DTAG(afar, datap, scr1, scr2, scr3); \
        sub     datap, CH_CLO_DATA + CH_DIAG_DATA_SIZE, datap; \
        GET_SHADOW_DATA(afar, datap, scr1, scr2, scr3); \
        ldxa    [datap + (CH_CLO_DATA + CH_CHD_AFAR)]%asi, afar; \
        set     0, r_val;       /* return value for success */ \
        ba      9f; \
        nop; \
8: \
        ldxa    [%g0]ASI_AFAR, afar; \
        ldxa    [datap + CH_CLO_NEST_CNT]%asi, r_val; \
        inc     r_val;          /* return value for failure */ \
        stxa    r_val, [datap + CH_CLO_NEST_CNT]%asi; \
        membar  #Sync; \
9:

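/*
 * Illustrative pseudo-code for the busy-check protocol above (assumed
 * C-level field names, for exposition only):
 *
 *      if (clop->clo_data.chd_afar != LOGOUT_INVALID) {
 *              r_val = ++clop->clo_nest_cnt;   structure is busy
 *              afar = *ASI_AFAR;
 *      } else {
 *              capture AFSR/AFAR, E$/D$/I$ lines, then shadow data;
 *              r_val = 0;                      success
 *      }
 */
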
/*
 * Cheetah/(Cheetah+ Jaguar Panther)/Jalapeno Macro for capturing CPU
 * logout data.  Uses DO_TL1_CPU_LOGOUT macro defined above, and sets
 * up the expected data pointer in the scr1 register and sets the %asi
 * register to ASI_N for kernel virtual addresses instead of ASI_MEM as
 * is used at TL>0.
 *
 * The CPU logout operation will fail (r_val > 0) if the logout
 * structure in question is already being used.  Otherwise, the CPU
 * logout operation will succeed (r_val = 0).  For failures, r_val
 * returns the busy count (# of times we tried using this CPU logout
 * structure when it was busy.)
 *
 * Register usage:
 *   r_val:  This register is the return value which tells the
 *           caller whether or not the LOGOUT operation was successful.
 *           For failures, r_val returns the fail count (i.e. number of
 *           times we have tried to use this logout structure when it
 *           was already being used).
 *   afar:   returns AFAR, used internally as afar value.
 *           output: if the cpu_private struct has not been initialized,
 *           then we return the t_flags value listed below.
 *   r_or_s: input offset, either register or constant (symbol).  It's
 *           OK for r_or_s to be a register as long as it's not scr1 or
 *           scr3.
 *   t_flags: input trap type info, may be used as scratch after stored
 *           to cpu log out structure.
 *   scr1:   Scratch, points to log out data area.
 *   scr2:   Scratch (may be r_or_s)
 *   scr3:   Scratch (may be r_val)
 *   scr4:   Scratch (may be t_flags)
 */
#define DO_CPU_LOGOUT(r_val, afar, r_or_s, t_flags, scr1, scr2, scr3, scr4) \
        GET_CPU_PRIVATE_PTR(r_or_s, scr1, scr3, 7f);  /* can't use scr2/4 */ \
        wr      %g0, ASI_N, %asi; \
        DO_TL1_CPU_LOGOUT(r_val, afar, t_flags, scr1, scr2, scr3, scr4) \
        ba      6f; \
        nop; \
7: \
        mov     t_flags, afar;  /* depends on afar = %g2 */ \
        set     0, r_val;       /* success in this case. */ \
6:

/*
 * The P$ is flushed as a side effect of writing to the Primary
 * or Secondary Context Register.  After writing to a context
 * register, every line of the P$ in the Valid state is invalidated,
 * regardless of which context it belongs to.
 * This routine simply touches the Primary context register by
 * reading the current value and writing it back.  The Primary
 * context is not changed.
 */
#define PCACHE_FLUSHALL(tmp1, tmp2, tmp3) \
        sethi   %hi(FLUSH_ADDR), tmp1 ;\
        set     MMU_PCONTEXT, tmp2 ;\
        ldxa    [tmp2]ASI_DMMU, tmp3 ;\
        stxa    tmp3, [tmp2]ASI_DMMU ;\
        flush   tmp1    /* See Cheetah PRM 8.10.2 */

/*
 * Macro that flushes the entire Dcache.
 *
 * arg1 = dcache size
 * arg2 = dcache linesize
 */
#define CH_DCACHE_FLUSHALL(arg1, arg2, tmp1) \
        sub     arg1, arg2, tmp1; \
1: \
        stxa    %g0, [tmp1]ASI_DC_TAG; \
        membar  #Sync; \
        cmp     %g0, tmp1; \
        bne,pt  %icc, 1b; \
        sub     tmp1, arg2, tmp1;

/*
 * Macro that flushes the entire Icache.
 *
 * Note that we cannot access ASI 0x67 (ASI_IC_TAG) with the Icache on,
 * because accesses to ASI 0x67 interfere with Icache coherency.  We
 * must make sure the Icache is off, then turn it back on after the entire
 * cache has been invalidated.  If the Icache is originally off, we'll just
 * clear the tags but not turn the Icache on.
 *
 * arg1 = icache size
 * arg2 = icache linesize
 */
#define CH_ICACHE_FLUSHALL(arg1, arg2, tmp1, tmp2) \
        ldxa    [%g0]ASI_DCU, tmp2; \
        andn    tmp2, DCU_IC, tmp1; \
        stxa    tmp1, [%g0]ASI_DCU; \
        flush   %g0;    /* flush required after changing the IC bit */ \
        sllx    arg2, 1, arg2;  /* arg2 = linesize * 2 */ \
        sllx    arg1, 1, arg1;  /* arg1 = size * 2 */ \
        sub     arg1, arg2, arg1; \
        or      arg1, CH_ICTAG_LOWER, arg1;     /* "write" tag */ \
1: \
        stxa    %g0, [arg1]ASI_IC_TAG; \
        membar  #Sync;  /* Cheetah PRM 8.9.3 */ \
        cmp     arg1, CH_ICTAG_LOWER; \
        bne,pt  %icc, 1b; \
        sub     arg1, arg2, arg1; \
        stxa    tmp2, [%g0]ASI_DCU; \
        flush   %g0;    /* flush required after changing the IC bit */

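/*
 * Illustrative pseudo-code for CH_ICACHE_FLUSHALL (exposition only):
 *
 *      dcu = *ASI_DCU;
 *      *ASI_DCU = dcu & ~DCU_IC;               Icache off
 *      for (va = 2*size - 2*lsize; va >= 0; va -= 2*lsize)
 *              ic_tag[va | CH_ICTAG_LOWER] = 0;
 *      *ASI_DCU = dcu;                         restore original IC bit
 */
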
#if defined(JALAPENO) || defined(SERRANO)

/*
 * ASI access to the L2 tag or L2 flush can hang the cpu when interacting
 * with combinations of L2 snoops, victims and stores.
 *
 * A possible workaround is to surround each L2 ASI access with membars
 * and make sure that the code is hitting in the Icache.  This requires
 * aligning code sequence at E$ boundary and forcing I$ fetch by
 * jumping to selected offsets so that we don't take any I$ misses
 * during ASI access to the L2 tag or L2 flush.  This also requires
 * making sure that we don't take any interrupts or traps (such as
 * fast ECC trap, I$/D$ tag parity error) which can result in eviction
 * of this code sequence from I$, thus causing a miss.
 *
 * Because of the complexity/risk, we have decided to do a partial fix
 * of adding membar around each ASI access to the L2 tag or L2 flush.
 */

#define JP_EC_DIAG_ACCESS_MEMBAR \
        membar  #Sync

/*
 * Jalapeno version of macro that flushes the entire Ecache.
 *
 * Uses Jalapeno displacement flush feature of ASI_EC_DIAG.
 *
 * arg1 = ecache size
 * arg2 = ecache linesize - not modified; can be an immediate constant.
 */
#define ECACHE_FLUSHALL(arg1, arg2, tmp1, tmp2) \
        CPU_INDEX(tmp1, tmp2); \
        set     JP_ECACHE_IDX_DISP_FLUSH, tmp2; \
        sllx    tmp1, JP_ECFLUSH_PORTID_SHIFT, tmp1; \
        or      tmp1, tmp2, tmp1; \
        srlx    arg1, JP_EC_TO_SET_SIZE_SHIFT, tmp2; \
1: \
        subcc   tmp2, arg2, tmp2; \
        JP_EC_DIAG_ACCESS_MEMBAR; \
        ldxa    [tmp1 + tmp2]ASI_EC_DIAG, %g0; \
        JP_EC_DIAG_ACCESS_MEMBAR; \
        bg,pt   %xcc, 1b; \
        nop; \
        mov     1, tmp2; \
        sllx    tmp2, JP_ECFLUSH_EC_WAY_SHIFT, tmp2; \
        add     tmp1, tmp2, tmp1; \
        mov     (JP_ECACHE_NWAY-1), tmp2; \
        sllx    tmp2, JP_ECFLUSH_EC_WAY_SHIFT, tmp2; \
        andcc   tmp1, tmp2, tmp2; \
        bnz,pt  %xcc, 1b; \
        srlx    arg1, JP_EC_TO_SET_SIZE_SHIFT, tmp2

#else /* JALAPENO || SERRANO */

/*
 * Cheetah version of macro that flushes the entire Ecache.
 *
 * Need to displacement flush 2x ecache size from Ecache flush area.
 *
 * arg1 = ecache size
 * arg2 = ecache linesize
 * arg3 = ecache flush address - for cheetah only
 */
#define CH_ECACHE_FLUSHALL(arg1, arg2, arg3) \
        sllx    arg1, 1, arg1; \
1: \
        subcc   arg1, arg2, arg1; \
        bg,pt   %xcc, 1b; \
        ldxa    [arg1 + arg3]ASI_MEM, %g0;

/*
 * Cheetah+ version of macro that flushes the entire Ecache.
 *
 * Uses the displacement flush feature.
 *
 * arg1 = ecache size
 * arg2 = ecache linesize
 * impl = CPU implementation as returned from GET_CPU_IMPL()
 *        The value in this register is destroyed during execution
 *        of the macro.
 */
#if defined(CHEETAH_PLUS)
#define CHP_ECACHE_FLUSHALL(arg1, arg2, impl) \
        cmp     impl, PANTHER_IMPL; \
        bne     %xcc, 1f; \
        nop; \
        set     PN_L3_IDX_DISP_FLUSH, impl; \
        b       2f; \
        nop; \
1: \
        set     CHP_ECACHE_IDX_DISP_FLUSH, impl; \
2: \
        subcc   arg1, arg2, arg1; \
        bg,pt   %xcc, 2b; \
        ldxa    [arg1 + impl]ASI_EC_DIAG, %g0;
#else /* CHEETAH_PLUS */
#define CHP_ECACHE_FLUSHALL(arg1, arg2, impl)
#endif /* CHEETAH_PLUS */

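/*
 * Illustrative pseudo-code for the Cheetah displacement flush above
 * (exposition only): every E$ line is evicted by reading 2x the E$
 * size of unrelated data from the flush area:
 *
 *      for (off = 2*ecache_size - linesize; off >= 0; off -= linesize)
 *              (void) *(uint64_t *)(flushaddr + off);  via ASI_MEM
 */
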
/*
 * Macro that flushes the entire Ecache.
 *
 * arg1 = ecache size
 * arg2 = ecache linesize
 * arg3 = ecache flush address - for cheetah only
 */
#define ECACHE_FLUSHALL(arg1, arg2, arg3, tmp1) \
        GET_CPU_IMPL(tmp1); \
        cmp     tmp1, CHEETAH_IMPL; \
        bne     %xcc, 2f; \
        nop; \
        CH_ECACHE_FLUSHALL(arg1, arg2, arg3); \
        ba      3f; \
        nop; \
2: \
        CHP_ECACHE_FLUSHALL(arg1, arg2, tmp1); \
3:

#endif /* JALAPENO || SERRANO */

/*
 * Macro that flushes the Panther L2 cache.
 */
#if defined(CHEETAH_PLUS)
#define PN_L2_FLUSHALL(scr1, scr2, scr3) \
        GET_CPU_IMPL(scr3); \
        cmp     scr3, PANTHER_IMPL; \
        bne     %xcc, 2f; \
        nop; \
        set     PN_L2_SIZE, scr1; \
        set     PN_L2_LINESIZE, scr2; \
        set     PN_L2_IDX_DISP_FLUSH, scr3; \
1: \
        subcc   scr1, scr2, scr1; \
        bg,pt   %xcc, 1b; \
        ldxa    [scr1 + scr3]ASI_L2_TAG, %g0; \
2:
#else /* CHEETAH_PLUS */
#define PN_L2_FLUSHALL(scr1, scr2, scr3)
#endif /* CHEETAH_PLUS */

/*
 * Given a VA and page size (page size as encoded in ASI_MMU_TAG_ACCESS_EXT),
 * this macro returns the TLB index for that mapping based on a 512 entry
 * (2-way set associative) TLB.  Aside from the 16 entry fully associative
 * TLBs, all TLBs in Panther are 512 entry, 2-way set associative.
 *
 * To find the index, we shift the VA right by 13 + (3 * pg_sz) and then
 * mask out all but the lower 8 bits because:
 *
 *      ASI_[D|I]MMU_TAG_ACCESS_EXT.PgSz = 0 for 8K
 *      ASI_[D|I]MMU_TAG_ACCESS_EXT.PgSz = 1 for 64K
 *      ASI_[D|I]MMU_TAG_ACCESS_EXT.PgSz = 2 for 512K
 *      ASI_[D|I]MMU_TAG_ACCESS_EXT.PgSz = 3 for 4M
 *      ASI_[D|I]MMU_TAG_ACCESS_EXT.PgSz = 4 for 32M
 *      ASI_[D|I]MMU_TAG_ACCESS_EXT.PgSz = 5 for 256M
 *
 * and
 *
 *      array index for   8K pages = VA[20:13]
 *      array index for  64K pages = VA[23:16]
 *      array index for 512K pages = VA[26:19]
 *      array index for   4M pages = VA[29:22]
 *      array index for  32M pages = VA[32:25]
 *      array index for 256M pages = VA[35:28]
 *
 * Inputs:
 *
 *      va     - Register.
 *               Input: Virtual address in which we are interested.
 *               Output: TLB index value.
 *      pg_sz  - Register.  Page Size of the TLB in question as encoded
 *               in the ASI_[D|I]MMU_TAG_ACCESS_EXT register.
 */
#if defined(CHEETAH_PLUS)
#define PN_GET_TLB_INDEX(va, pg_sz) \
        srlx    va, 13, va;     /* first shift by 13 bits and then */ \
        srlx    va, pg_sz, va;  /* shift by pg_sz three times. */ \
        srlx    va, pg_sz, va; \
        srlx    va, pg_sz, va; \
        and     va, 0xff, va;   /* mask out all but the lower 8 bits */

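/*
 * Worked example (illustrative): for a 64K page (pg_sz = 1) and
 * va = 0x12345678, the index is (va >> (13 + 3*1)) & 0xff =
 * (0x12345678 >> 16) & 0xff = 0x34, i.e. VA[23:16] as tabulated above.
 */
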
#endif /* CHEETAH_PLUS */

/*
 * The following macros are for error traps at TL>0.
 * The issue with error traps at TL>0 is that there are no safely
 * available global registers.  So we use the trick of generating a
 * software trap, then using the %tpc, %tnpc and %tstate registers to
 * temporarily save the values of %g1 and %g2.
 */

/*
 * Macro to generate 8-instruction trap table entry for TL>0 trap handlers.
 * Does the following steps:
 *      1. membar #Sync - required for USIII family errors.
 *      2. Specified software trap.
 * NB: Must be 8 instructions or less to fit in trap table and code must
 *     be relocatable.
 */
#define CH_ERR_TL1_TRAPENTRY(trapno) \
        membar  #Sync; \
        ta      trapno; \
        nop; nop; nop; nop; nop; nop

/*
 * Macro to generate 8-instruction trap table entry for TL>0 software trap.
 * We save the values of %g1 and %g2 in %tpc, %tnpc and %tstate (since
 * the low-order two bits of %tpc/%tnpc are reserved and read as zero,
 * we need to put the low-order two bits of %g1 and %g2 in %tstate).
 * Note that %tstate has a reserved hole from bits 3-7, so we put the
 * low-order two bits of %g1 in bits 0-1 and the low-order two bits of
 * %g2 in bits 10-11 (ensuring bits 8-9 are zero for use by the D$/I$
 * state bits).  Note that we must do a jmp instruction, since this
 * is moved into the trap table entry.
 * NB: Must be 8 instructions or less to fit in trap table and code must
 *     be relocatable.
 */
#define CH_ERR_TL1_SWTRAPENTRY(label) \
        wrpr    %g1, %tpc; \
        and     %g1, 3, %g1; \
        wrpr    %g2, %tnpc; \
        sllx    %g2, CH_ERR_G2_TO_TSTATE_SHFT, %g2; \
        or      %g1, %g2, %g2; \
        sethi   %hi(label), %g1; \
        jmp     %g1+%lo(label); \
        wrpr    %g2, %tstate

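/*
 * Resulting save layout (restating the comment above; illustrative):
 *
 *      %tpc           = %g1 (bits <1:0> read as zero)
 *      %tnpc          = %g2 (bits <1:0> read as zero)
 *      %tstate<1:0>   = %g1<1:0>
 *      %tstate<11:10> = %g2<1:0>  (bits <9:8> stay clear for the
 *                                  D$/I$ state bits)
 */
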
/*
 * Macro to get ptr to ch_err_tl1_data.
 * reg1 will either point to a physaddr with ASI_MEM in %asi OR it
 * will point to a kernel nucleus virtual address with ASI_N in %asi.
 * This allows us to:
 *      1. Avoid getting MMU misses.  We may have gotten the original
 *         Fast ECC error in an MMU handler and if we get an MMU trap
 *         in the TL>0 handlers, we'll scribble on the MMU regs.
 *      2. Allows us to use the same code in the TL>0 handlers whether
 *         we're accessing kernel nucleus virtual addresses or physical
 *         addresses.
 * pseudo-code:
 *      reg1 <- ch_err_tl1_paddrs[CPUID];
 *      if (reg1 == NULL) {
 *              reg1 <- &ch_err_tl1_data
 *              %asi <- ASI_N
 *      } else {
 *              reg1 <- reg1 + offset +
 *                  sizeof (ch_err_tl1_data) * (%tl - 3)
 *              %asi <- ASI_MEM
 *      }
 */
#define GET_CH_ERR_TL1_PTR(reg1, reg2, offset) \
        CPU_INDEX(reg1, reg2); \
        sllx    reg1, 3, reg1; \
        set     ch_err_tl1_paddrs, reg2; \
        ldx     [reg1+reg2], reg1; \
        brnz    reg1, 1f; \
        add     reg1, offset, reg1; \
        set     ch_err_tl1_data, reg1; \
        ba      2f; \
        wr      %g0, ASI_N, %asi; \
1:      rdpr    %tl, reg2; \
        sub     reg2, 3, reg2; \
        mulx    reg2, CH_ERR_TL1_DATA_SIZE, reg2; \
        add     reg1, reg2, reg1; \
        wr      %g0, ASI_MEM, %asi; \
2:

/*
 * Macro to generate entry code for TL>0 error handlers.
 * At the end of this macro, %g1 will point to the ch_err_tl1_data
 * structure and %g2 will have the original flags in the ch_err_tl1_data
 * structure and %g5 will have the value of %tstate where the Fast ECC
 * routines will save the state of the D$ in Bit2 CH_ERR_TSTATE_DC_ON.
 * All %g registers except for %g1, %g2 and %g5 will be available after
 * this macro.
 * Does the following steps:
 *      1. Compute physical address of per-cpu/per-tl save area using
 *         only %g1+%g2 (which we've saved in %tpc, %tnpc, %tstate)
 *         leaving address in %g1 and updating the %asi register.
 *         If there is no data area available, we branch to label.
 *      2. Save %g3-%g7 in save area.
 *      3. Save %tpc->%g3, %tnpc->%g4, %tstate->%g5, which contain
 *         original %g1+%g2 values (because we're going to change %tl).
 *      4. set %tl <- %tl - 1.  We do this ASAP to make window of
 *         running at %tl+1 as small as possible.
 *      5. Reconstitute %g1+%g2 from %tpc (%g3), %tnpc (%g4),
 *         %tstate (%g5) and save in save area, carefully preserving %g5
 *         because it has the CH_ERR_TSTATE_DC_ON value.
 *      6. Load existing ch_err_tl1_data flags in %g2
 *      7. Compute the new flags
 *      8. If %g2 is non-zero (the structure was busy), shift the new
 *         flags by CH_ERR_ME_SHIFT and or them with the old flags.
 *      9. Store the updated flags into ch_err_tl1_data flags.
 *      10. If %g2 is non-zero, read the %tpc and store it in
 *          ch_err_tl1_data.
 */
#define CH_ERR_TL1_ENTER(flags) \
        GET_CH_ERR_TL1_PTR(%g1, %g2, CHPR_TL1_ERR_DATA); \
        stxa    %g3, [%g1 + CH_ERR_TL1_G3]%asi; \
        stxa    %g4, [%g1 + CH_ERR_TL1_G4]%asi; \
        stxa    %g5, [%g1 + CH_ERR_TL1_G5]%asi; \
        stxa    %g6, [%g1 + CH_ERR_TL1_G6]%asi; \
        stxa    %g7, [%g1 + CH_ERR_TL1_G7]%asi; \
        rdpr    %tpc, %g3; \
        rdpr    %tnpc, %g4; \
        rdpr    %tstate, %g5; \
        rdpr    %tl, %g6; \
        sub     %g6, 1, %g6; \
        wrpr    %g6, %tl; \
        and     %g5, 3, %g6; \
        andn    %g3, 3, %g3; \
        or      %g3, %g6, %g3; \
        stxa    %g3, [%g1 + CH_ERR_TL1_G1]%asi; \
        srlx    %g5, CH_ERR_G2_TO_TSTATE_SHFT, %g6; \
        and     %g6, 3, %g6; \
        andn    %g4, 3, %g4; \
        or      %g6, %g4, %g4; \
        stxa    %g4, [%g1 + CH_ERR_TL1_G2]%asi; \
        ldxa    [%g1 + CH_ERR_TL1_FLAGS]%asi, %g2; \
        set     flags | CH_ERR_TL, %g3; \
        brz     %g2, 9f; \
        sllx    %g3, CH_ERR_ME_SHIFT, %g4; \
        or      %g2, %g4, %g3; \
9:      stxa    %g3, [%g1 + CH_ERR_TL1_FLAGS]%asi; \
        brnz    %g2, 8f; \
        rdpr    %tpc, %g4; \
        stxa    %g4, [%g1 + CH_ERR_TL1_TPC]%asi; \
8:

/*
 * Turns off D$/I$ and saves the state of DCU_DC+DCU_IC in %tstate Bits 8+9
 * (CH_ERR_TSTATE_DC_ON/CH_ERR_TSTATE_IC_ON).  This is invoked on Fast ECC
 * at TL>0 handlers because the D$ may have corrupted data and we need to
 * turn off the I$ to allow for diagnostic accesses.  We then invoke
 * the normal entry macro and after it is done we save the values of
 * the original D$/I$ state, which is in %g5 bits CH_ERR_TSTATE_DC_ON/
 * CH_ERR_TSTATE_IC_ON in ch_err_tl1_tmp.
 */
#define CH_ERR_TL1_FECC_ENTER \
        ldxa    [%g0]ASI_DCU, %g1; \
        andn    %g1, DCU_DC + DCU_IC, %g2; \
        stxa    %g2, [%g0]ASI_DCU; \
        flush   %g0;    /* DCU_IC need flush */ \
        rdpr    %tstate, %g2; \
        and     %g1, DCU_DC + DCU_IC, %g1; \
        sllx    %g1, CH_ERR_DCU_TO_TSTATE_SHFT, %g1; \
        or      %g1, %g2, %g2; \
        wrpr    %g2, %tstate; \
        CH_ERR_TL1_ENTER(CH_ERR_FECC); \
        and     %g5, CH_ERR_TSTATE_DC_ON + CH_ERR_TSTATE_IC_ON, %g5; \
        stxa    %g5, [%g1 + CH_ERR_TL1_TMP]%asi

/*
 * Macro to generate exit code for TL>0 error handlers.
 * We fall into this macro if we've successfully logged the error in
 * the ch_err_tl1_data structure and want the PIL15 softint to pick
 * it up and log it.
 * Does the following steps:
 *      1. Set pending flag for this cpu in ch_err_tl1_pending.
 *      2. Write %set_softint with (1<<pil) to cause a pil level trap
 *      3. Restore registers from ch_err_tl1_data, which is pointed to
 *         by %g1, last register to restore is %g1 since it's pointing
 *         to the save area.
 *      4. Execute retry
 */
#define CH_ERR_TL1_EXIT \
        CPU_INDEX(%g2, %g3); \
        set     ch_err_tl1_pending, %g3; \
        set     -1, %g4; \
        stb     %g4, [%g2 + %g3]; \
        mov     1, %g2; \
        sll     %g2, PIL_15, %g2; \
        wr      %g2, SET_SOFTINT; \
        ldxa    [%g1 + CH_ERR_TL1_G7]%asi, %g7; \
        ldxa    [%g1 + CH_ERR_TL1_G6]%asi, %g6; \
        ldxa    [%g1 + CH_ERR_TL1_G5]%asi, %g5; \
        ldxa    [%g1 + CH_ERR_TL1_G4]%asi, %g4; \
        ldxa    [%g1 + CH_ERR_TL1_G3]%asi, %g3; \
        ldxa    [%g1 + CH_ERR_TL1_G2]%asi, %g2; \
        ldxa    [%g1 + CH_ERR_TL1_G1]%asi, %g1; \
        retry

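/*
 * Illustrative pseudo-code for CH_ERR_TL1_EXIT (assumed C-level names,
 * for exposition only):
 *
 *      ch_err_tl1_pending[CPUID] = -1;         flag for the PIL15 softint
 *      set_softint(1 << PIL_15);
 *      restore %g7..%g1 from the save area;
 *      retry;
 */
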
/*
 * Generates unrecoverable error label for TL>0 handlers.
 * At label (Unrecoverable error routine)
 *      1. Sets flags in ch_err_tl1_data and leaves in %g2 (first
 *         argument to cpu_tl1_err_panic).
 *      2. Call cpu_tl1_err_panic via systrap at PIL 15
 */
#define CH_ERR_TL1_PANIC_EXIT(label) \
label:  ldxa    [%g1 + CH_ERR_TL1_FLAGS]%asi, %g2; \
        or      %g2, CH_ERR_TL | CH_ERR_PANIC, %g2; \
        stxa    %g2, [%g1 + CH_ERR_TL1_FLAGS]%asi; \
        set     cpu_tl1_err_panic, %g1; \
        ba      sys_trap; \
        mov     PIL_15, %g4


/* END CSTYLED */
#endif /* _ASM */

#ifdef __cplusplus
}
#endif

#endif /* _CHEETAHASM_H */