/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 */


/*
 * Support routines for managing per-Lxcache state.
 */

#include <cmd_Lxcache.h>
#include <cmd_mem.h>
#include <cmd_cpu.h>
#include <cmd.h>
#include <errno.h>
#include <fcntl.h>
#include <unistd.h>
#include <stdio.h>
#include <strings.h>
#include <fm/fmd_api.h>
#include <sys/fm/protocol.h>
#include <sys/cheetahregs.h>
#include <sys/mem_cache.h>

#define	PN_ECSTATE_NA	5
/*
 * These values are our threshold values for SERDing CPUs based on the
 * number of times we have retired a cache line for each category.
 */

#define	CMD_CPU_SERD_AGG_1	64
#define	CMD_CPU_SERD_AGG_2	64

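/*
 * Lookup tables indexed by a 4-bit mask of cache ways (one bit per way):
 * cmd_lowest_way[] gives the lowest way whose bit is set in the mask
 * (-1 if none), and cmd_num_of_bits[] gives the number of bits set.
 */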
static int8_t cmd_lowest_way[16] = {
	/* 0x0 0x1 0x2 0x3 0x4 0x5 0x6 0x7 0x8 0x9 0xa 0xb 0xc 0xd 0xe 0xf */
	-1, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0};
static int cmd_num_of_bits[16] = {
	/* 0x0 0x1 0x2 0x3 0x4 0x5 0x6 0x7 0x8 0x9 0xa 0xb 0xc 0xd 0xe 0xf */
	0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4};


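/*
 * Persist the Lxcache state by writing it to its named fmd buffer.
 */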
void
cmd_Lxcache_write(fmd_hdl_t *hdl, cmd_Lxcache_t *Lxcache)
{
	fmd_buf_write(hdl, NULL, Lxcache->Lxcache_bufname, Lxcache,
	    sizeof (cmd_Lxcache_pers_t));
}

const char *
cmd_type_to_str(cmd_ptrsubtype_t pstype)
{
	switch (pstype) {
	case CMD_PTR_CPU_L2DATA:
		return ("l2data");
		break;
	case CMD_PTR_CPU_L3DATA:
		return ("l3data");
		break;
	case CMD_PTR_CPU_L2TAG:
		return ("l2tag");
		break;
	case CMD_PTR_CPU_L3TAG:
		return ("l3tag");
		break;
	default:
		return ("unknown");
		break;
	}
}

const char *
cmd_flags_to_str(int flags)
{
	switch (flags) {
	case CMD_LxCACHE_F_ACTIVE:
		return ("ACTIVE");
	case CMD_LxCACHE_F_FAULTING:
		return ("FAULTING");
	case CMD_LxCACHE_F_RETIRED:
		return ("RETIRED");
	case CMD_LxCACHE_F_UNRETIRED:
		return ("UNRETIRED");
	case CMD_LxCACHE_F_RERETIRED:
		return ("RERETIRED");
	default:
		return ("Unknown_flags");
	}
}

const char *
cmd_reason_to_str(int reason)
{
	switch (reason) {
	case CMD_LXSUSPECT_DATA:
		return ("SUSPECT_DATA");
	case CMD_LXSUSPECT_0_TAG:
		return ("SUSPECT_0_TAG");
	case CMD_LXSUSPECT_1_TAG:
		return ("SUSPECT_1_TAG");
	case CMD_LXCONVICTED:
		return ("CONVICTED");
	case CMD_LXFUNCTIONING:
		return ("FUNCTIONING");
	default:
		return ("Unknown_reason");
	}
}

static void
cmd_pretty_print_Lxcache(fmd_hdl_t *hdl, cmd_Lxcache_t *Lxcache)
{
	fmd_hdl_debug(hdl,
	    "\n"
	    " cpu = %s\n"
	    " type = %s\n"
	    " index = %d\n"
	    " way = %d\n"
	    " bit = %d\n"
	    " reason = %s\n"
	    " flags = %s\n",
	    Lxcache->Lxcache_cpu_bufname,
	    cmd_type_to_str(Lxcache->Lxcache_type),
	    Lxcache->Lxcache_index,
	    Lxcache->Lxcache_way,
	    Lxcache->Lxcache_bit,
	    cmd_reason_to_str(Lxcache->Lxcache_reason),
	    cmd_flags_to_str(Lxcache->Lxcache_flags));
}

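/*
 * Release the in-memory state for an Lxcache: its case, SERD engines,
 * nvlist and ASRU.  If destroy is set, the persistent buffer backing
 * this Lxcache is removed as well.
 */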
void
cmd_Lxcache_free(fmd_hdl_t *hdl, cmd_cpu_t *cpu, cmd_Lxcache_t *Lxcache,
    int destroy)
{
	cmd_case_t *cc = &Lxcache->Lxcache_case;

	fmd_hdl_debug(hdl, "Entering cmd_Lxcache_free for %s destroy = %d\n",
	    Lxcache->Lxcache_bufname, destroy);

	if (cc->cc_cp != NULL)
		cmd_case_fini(hdl, cc->cc_cp, destroy);
	if (cc->cc_serdnm != NULL) {
		if (fmd_serd_exists(hdl, cc->cc_serdnm) && destroy) {
			fmd_serd_destroy(hdl, cc->cc_serdnm);
			fmd_hdl_strfree(hdl, cc->cc_serdnm);
			cc->cc_serdnm = NULL;
		}
	}
	if (Lxcache->Lxcache_nvl) {
		nvlist_free(Lxcache->Lxcache_nvl);
		Lxcache->Lxcache_nvl = NULL;
	}
	/*
	 * Clean up the SERD engine created to handle recheck of TAGS.
	 * This SERD engine was created to save the event pointer.
	 */
	if (Lxcache->Lxcache_serdnm != NULL) {
		if (fmd_serd_exists(hdl, Lxcache->Lxcache_serdnm) && destroy) {
			fmd_serd_destroy(hdl, Lxcache->Lxcache_serdnm);
			fmd_hdl_strfree(hdl, Lxcache->Lxcache_serdnm);
			Lxcache->Lxcache_serdnm = NULL;
		}
	}
	Lxcache->Lxcache_timeout_id = -1;
	Lxcache->Lxcache_ep = NULL;
	Lxcache->Lxcache_retry_count = 0;
	if (destroy)
		fmd_buf_destroy(hdl, NULL, Lxcache->Lxcache_bufname);
	cmd_fmri_fini(hdl, &Lxcache->Lxcache_asru, destroy);
	cmd_list_delete(&cpu->cpu_Lxcaches, Lxcache);
	fmd_hdl_free(hdl, Lxcache, sizeof (cmd_Lxcache_t));
}

void
cmd_Lxcache_destroy(fmd_hdl_t *hdl, cmd_cpu_t *cpu, cmd_Lxcache_t *Lxcache)
{
	cmd_Lxcache_free(hdl, cpu, Lxcache, FMD_B_TRUE);
}

cmd_Lxcache_t *
cmd_Lxcache_lookup_by_type_index_way_bit(cmd_cpu_t *cpu,
    cmd_ptrsubtype_t pstype, int32_t index, int8_t way, int16_t bit)
{
	cmd_Lxcache_t *Lxcache;

	for (Lxcache = cmd_list_next(&cpu->cpu_Lxcaches); Lxcache != NULL;
	    Lxcache = cmd_list_next(Lxcache)) {
		if ((Lxcache->Lxcache_type == pstype) &&
		    (Lxcache->Lxcache_index == (uint32_t)index) &&
		    (Lxcache->Lxcache_way == (uint32_t)way) &&
		    (Lxcache->Lxcache_bit == (uint16_t)bit))
			return (Lxcache);
	}

	return (NULL);
}

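/*
 * Allocate and initialize a new Lxcache for the given cpu, cache type,
 * index, way and bit, build its cache-line ASRU from the supplied ASRU
 * nvlist, and persist the new state.
 */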
cmd_Lxcache_t *
cmd_Lxcache_create(fmd_hdl_t *hdl, cmd_xr_t *xr, cmd_cpu_t *cpu,
    nvlist_t *modasru, cmd_ptrsubtype_t pstype, int32_t index,
    int8_t way, int16_t bit)
{
	cmd_Lxcache_t *Lxcache;
	nvlist_t *asru;
	const char *pstype_name;
	uint8_t fmri_Lxcache_type;

	pstype_name = cmd_type_to_str(pstype);
	fmd_hdl_debug(hdl,
	    "\n%s:cpu_id %d:Creating new Lxcache for index=%d way=%d bit=%d\n",
	    pstype_name, cpu->cpu_cpuid, index, way, bit);

	CMD_CPU_STAT_BUMP(cpu, Lxcache_creat);

	Lxcache = fmd_hdl_zalloc(hdl, sizeof (cmd_Lxcache_t), FMD_SLEEP);
	(void) strncpy(Lxcache->Lxcache_cpu_bufname,
	    cpu->cpu_bufname, CMD_BUFNMLEN);
	Lxcache->Lxcache_nodetype = CMD_NT_LxCACHE;
	Lxcache->Lxcache_version = CMD_LxCACHE_VERSION;
	Lxcache->Lxcache_type = pstype;
	Lxcache->Lxcache_index = (uint32_t)index;
	Lxcache->Lxcache_way = (uint32_t)way;
	Lxcache->Lxcache_bit = (uint16_t)bit;
	Lxcache->Lxcache_reason = CMD_LXFUNCTIONING;
	Lxcache->Lxcache_flags = CMD_LxCACHE_F_ACTIVE;
	Lxcache->Lxcache_timeout_id = -1;
	Lxcache->Lxcache_retry_count = 0;
	Lxcache->Lxcache_nvl = NULL;
	Lxcache->Lxcache_ep = NULL;
	Lxcache->Lxcache_serdnm = NULL;
	Lxcache->Lxcache_clcode = 0;
	Lxcache->xr = xr;
	Lxcache->Lxcache_retired_fmri[0] = '\0';
	switch (pstype) {
	case CMD_PTR_CPU_L2DATA:
		fmri_Lxcache_type = FM_FMRI_CPU_CACHE_TYPE_L2;
		break;
	case CMD_PTR_CPU_L3DATA:
		fmri_Lxcache_type = FM_FMRI_CPU_CACHE_TYPE_L3;
		break;
	case CMD_PTR_CPU_L2TAG:
		fmri_Lxcache_type = FM_FMRI_CPU_CACHE_TYPE_L2;
		break;
	case CMD_PTR_CPU_L3TAG:
		fmri_Lxcache_type = FM_FMRI_CPU_CACHE_TYPE_L3;
		break;
	default:
		break;
	}

	cmd_bufname(Lxcache->Lxcache_bufname, sizeof (Lxcache->Lxcache_bufname),
	    "Lxcache_%s_%d_%d_%d_%d", pstype_name, cpu->cpu_cpuid,
	    index, way, bit);
	fmd_hdl_debug(hdl,
	    "\n%s:cpu_id %d: new Lxcache name is %s\n",
	    pstype_name, cpu->cpu_cpuid, Lxcache->Lxcache_bufname);
	if ((errno = nvlist_dup(modasru, &asru, 0)) != 0 ||
	    (errno = nvlist_add_uint32(asru, FM_FMRI_CPU_CACHE_INDEX,
	    index)) != 0 ||
	    (errno = nvlist_add_uint32(asru, FM_FMRI_CPU_CACHE_WAY,
	    (uint32_t)way)) != 0 ||
	    (errno = nvlist_add_uint16(asru, FM_FMRI_CPU_CACHE_BIT,
	    bit)) != 0 ||
	    (errno = nvlist_add_uint8(asru, FM_FMRI_CPU_CACHE_TYPE,
	    fmri_Lxcache_type)) != 0 ||
	    (errno = fmd_nvl_fmri_expand(hdl, asru)) != 0)
		fmd_hdl_abort(hdl, "failed to build Lxcache fmri");
	asru->nvl_nvflag |= NV_UNIQUE_NAME_TYPE;

	cmd_fmri_init(hdl, &Lxcache->Lxcache_asru, asru,
	    "%s_asru_%d_%d_%d", pstype_name, index, way, bit);

	nvlist_free(asru);

	cmd_list_append(&cpu->cpu_Lxcaches, Lxcache);
	cmd_Lxcache_write(hdl, Lxcache);

	return (Lxcache);
}

cmd_Lxcache_t *
cmd_Lxcache_lookup_by_index_way(cmd_cpu_t *cpu, cmd_ptrsubtype_t pstype,
    int32_t index, int8_t way)
{
	cmd_Lxcache_t *cache;

	for (cache = cmd_list_next(&cpu->cpu_Lxcaches); cache != NULL;
	    cache = cmd_list_next(cache)) {
		if ((cache->Lxcache_index == (uint32_t)index) &&
		    (cache->Lxcache_way == (uint32_t)way) &&
		    (cache->Lxcache_type == pstype)) {
			return (cache);
		}
	}

	return (NULL);
}

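/*
 * Wrap a version-1 persistent Lxcache image in a full cmd_Lxcache_t and
 * free the persistent image that was read from the fmd buffer.
 */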
static cmd_Lxcache_t *
Lxcache_wrapv1(fmd_hdl_t *hdl, cmd_Lxcache_pers_t *pers, size_t psz)
{
	cmd_Lxcache_t *Lxcache;

	if (psz != sizeof (cmd_Lxcache_pers_t)) {
		fmd_hdl_abort(hdl, "size of state doesn't match size of "
		    "version 1 state (%u bytes).\n",
		    sizeof (cmd_Lxcache_pers_t));
	}

	Lxcache = fmd_hdl_zalloc(hdl, sizeof (cmd_Lxcache_t), FMD_SLEEP);
	bcopy(pers, Lxcache, sizeof (cmd_Lxcache_pers_t));
	fmd_hdl_free(hdl, pers, psz);
	return (Lxcache);
}

void *
cmd_Lxcache_restore(fmd_hdl_t *hdl, fmd_case_t *cp, cmd_case_ptr_t *ptr)
{
	cmd_Lxcache_t *Lxcache;
	cmd_Lxcache_t *recovered_Lxcache;
	cmd_cpu_t *cpu;
	size_t Lxcachesz;
	char *serdnm;

	/*
	 * We first extract the cpu name by reading directly from the fmd
	 * buffers so that we can search for the Lxcache in the appropriate
	 * cpu list.  Once we have identified the cpu list from the buf name,
	 * we look in that list for our Lxcache state.
	 */
	fmd_hdl_debug(hdl, "restoring Lxcache from %s\n", ptr->ptr_name);

	if ((Lxcachesz = fmd_buf_size(hdl, NULL, ptr->ptr_name)) == 0) {
		fmd_hdl_abort(hdl, "Lxcache referenced by case %s does "
		    "not exist in saved state\n",
		    fmd_case_uuid(hdl, cp));
	} else if (Lxcachesz != sizeof (cmd_Lxcache_pers_t)) {
		fmd_hdl_abort(hdl, "Lxcache buffer referenced by case %s "
		    "is %d bytes. Expected size is %d bytes\n",
		    fmd_case_uuid(hdl, cp), Lxcachesz,
		    sizeof (cmd_Lxcache_pers_t));
	}

	if ((Lxcache = cmd_buf_read(hdl, NULL, ptr->ptr_name,
	    Lxcachesz)) == NULL) {
		fmd_hdl_abort(hdl, "failed to read Lxcache buf %s",
		    ptr->ptr_name);
	}
	cmd_pretty_print_Lxcache(hdl, Lxcache);

	fmd_hdl_debug(hdl, "found %d in version field\n",
	    Lxcache->Lxcache_version);
	cpu = cmd_restore_cpu_only(hdl, cp, Lxcache->Lxcache_cpu_bufname);
	if (cpu == NULL) {
		fmd_hdl_debug(hdl,
		    "\nCould not restore cpu %s\n",
		    Lxcache->Lxcache_cpu_bufname);
		return (NULL);
	}
	recovered_Lxcache = Lxcache;	/* save the recovered Lxcache */

	for (Lxcache = cmd_list_next(&cpu->cpu_Lxcaches); Lxcache != NULL;
	    Lxcache = cmd_list_next(Lxcache)) {
		if (strcmp(Lxcache->Lxcache_bufname, ptr->ptr_name) == 0)
			break;
	}

	if (Lxcache == NULL) {
		switch (recovered_Lxcache->Lxcache_version) {
		case CMD_LxCACHE_VERSION_1:
			Lxcache = Lxcache_wrapv1(hdl,
			    (cmd_Lxcache_pers_t *)recovered_Lxcache,
			    Lxcachesz);
			break;
		default:
			fmd_hdl_abort(hdl, "unknown version (found %d) "
			    "for Lxcache state referenced by case %s.\n",
			    recovered_Lxcache->Lxcache_version,
			    fmd_case_uuid(hdl, cp));
			break;
		}

		cmd_fmri_restore(hdl, &Lxcache->Lxcache_asru);
		/*
		 * We need to clean up the information associated with
		 * the timeout routine because these are not checkpointed
		 * and cannot be restored.
		 */
		Lxcache->Lxcache_timeout_id = -1;
		Lxcache->Lxcache_retry_count = 0;
		Lxcache->Lxcache_nvl = NULL;
		Lxcache->Lxcache_ep = NULL;
		Lxcache->Lxcache_serdnm = NULL;

		cmd_list_append(&cpu->cpu_Lxcaches, Lxcache);
	}
	serdnm = cmd_Lxcache_serdnm_create(hdl, cpu->cpu_cpuid,
	    Lxcache->Lxcache_type, Lxcache->Lxcache_index,
	    Lxcache->Lxcache_way, Lxcache->Lxcache_bit);
	fmd_hdl_debug(hdl,
	    "cpu_id %d: serdname for the case is %s\n",
	    cpu->cpu_cpuid, serdnm);
	fmd_hdl_debug(hdl,
	    "cpu_id %d: restoring the case for index %d way %d bit %d\n",
	    cpu->cpu_cpuid, Lxcache->Lxcache_index,
	    Lxcache->Lxcache_way, Lxcache->Lxcache_bit);
	cmd_case_restore(hdl, &Lxcache->Lxcache_case, cp, serdnm);

	return (Lxcache);
}

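/*
 * Destroy any Lxcache on this cpu whose cache-line ASRU fmd reports as
 * unusable.
 */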
/*ARGSUSED*/
void
cmd_Lxcache_validate(fmd_hdl_t *hdl, cmd_cpu_t *cpu)
{
	cmd_Lxcache_t *Lxcache, *next;

	for (Lxcache = cmd_list_next(&cpu->cpu_Lxcaches);
	    Lxcache != NULL; Lxcache = next) {
		next = cmd_list_next(Lxcache);

		if (fmd_nvl_fmri_unusable(hdl, Lxcache->Lxcache_asru_nvl)) {
			cmd_Lxcache_destroy(hdl, cpu, Lxcache);
		}
	}
}

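/*
 * Rewrite the persistent copy of an Lxcache whose in-memory state has
 * changed, recreating the buffer if its size no longer matches.
 */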
void
cmd_Lxcache_dirty(fmd_hdl_t *hdl, cmd_Lxcache_t *Lxcache)
{
	if (fmd_buf_size(hdl, NULL, Lxcache->Lxcache_bufname) !=
	    sizeof (cmd_Lxcache_pers_t))
		fmd_buf_destroy(hdl, NULL, Lxcache->Lxcache_bufname);

	/* No need to rewrite the FMRIs in the Lxcache - they don't change */
	fmd_buf_write(hdl, NULL,
	    Lxcache->Lxcache_bufname, &Lxcache->Lxcache_pers,
	    sizeof (cmd_Lxcache_pers_t));
}

void
cmd_Lxcache_fini(fmd_hdl_t *hdl, cmd_cpu_t *cpu)
{
	cmd_Lxcache_t *Lxcache;

	while ((Lxcache = cmd_list_next(&cpu->cpu_Lxcaches)) != NULL)
		cmd_Lxcache_free(hdl, cpu, Lxcache, FMD_B_FALSE);
}

char *
cmd_Lxcache_serdnm_create(fmd_hdl_t *hdl, uint32_t cpu_id,
    cmd_ptrsubtype_t pstype,
    int32_t index, int8_t way, int16_t bit)
{
	const char *fmt = "cpu_%d:%s_%d_%d_%d_serd";
	const char *serdbase;
	size_t sz;
	char *nm;

	serdbase = cmd_type_to_str(pstype);
	sz = (snprintf(NULL, 0, fmt, cpu_id, serdbase, index, way, bit) + 1);
	nm = fmd_hdl_alloc(hdl, sz, FMD_SLEEP);
	(void) snprintf(nm, sz, fmt, cpu_id, serdbase, index, way, bit);
	return (nm);
}

char *
cmd_Lxcache_anonymous_serdnm_create(fmd_hdl_t *hdl, uint32_t cpu_id,
    cmd_ptrsubtype_t pstype,
    int32_t index, int8_t way, int16_t bit)
{
	const char *fmt = "cpu_%d:%s_%d_%d_%d_anonymous_serd";
	const char *serdbase;
	size_t sz;
	char *nm;

	serdbase = cmd_type_to_str(pstype);
	sz = (snprintf(NULL, 0, fmt, cpu_id, serdbase, index, way, bit) + 1);
	nm = fmd_hdl_alloc(hdl, sz, FMD_SLEEP);
	(void) snprintf(nm, sz, fmt, cpu_id, serdbase, index, way, bit);
	return (nm);
}

/*
 * Count the number of SERD type 2 ways retired for a given cpu.
 * These are defined to be L3 cache data retirements.
 */

uint32_t
cmd_Lx_index_count_type2_ways(cmd_cpu_t *cpu)
{
	cmd_Lxcache_t *cache = NULL;
	uint32_t ret_count = 0;

	for (cache = cmd_list_next(&cpu->cpu_Lxcaches); cache != NULL;
	    cache = cmd_list_next(cache)) {
		if ((cache->Lxcache_flags & CMD_LxCACHE_F_RETIRED) &&
		    (cache->Lxcache_type == CMD_PTR_CPU_L3DATA)) {
			ret_count++;
		}
	}
	return (ret_count);
}

/*
 * Count the number of SERD type 1 ways retired for a given cpu.
 * These are defined to be L2 data, L2 tag and L3 tag retirements.
 */

uint32_t
cmd_Lx_index_count_type1_ways(cmd_cpu_t *cpu)
{
	cmd_Lxcache_t *cache = NULL;
	uint32_t ret_count = 0;

	for (cache = cmd_list_next(&cpu->cpu_Lxcaches); cache != NULL;
	    cache = cmd_list_next(cache)) {
		if ((cache->Lxcache_flags & CMD_LxCACHE_F_RETIRED) &&
		    ((cache->Lxcache_type == CMD_PTR_CPU_L2DATA) ||
		    IS_TAG(cache->Lxcache_type))) {
			ret_count++;
		}
	}
	return (ret_count);
}

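/*
 * Create a new case against the cpu, add a cpu-level fault list at 100%
 * certainty and solve the case.  Called when the cache-way retirement
 * limits have been exceeded.
 */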
void
cmd_fault_the_cpu(fmd_hdl_t *hdl, cmd_cpu_t *cpu, cmd_ptrsubtype_t pstype,
    const char *fltnm)
{
	fmd_case_t *cp;
	const char *uuid;

	cp = cmd_case_create(hdl, &cpu->cpu_header, pstype,
	    &uuid);
	fmd_hdl_debug(hdl,
	    "\n%s:cpu_id %d Created case %s to retire CPU\n",
	    fltnm, cpu->cpu_cpuid, uuid);

	if ((errno = fmd_nvl_fmri_expand(hdl, cpu->cpu_asru_nvl)) != 0)
		fmd_hdl_abort(hdl, "failed to build CPU fmri");

	cmd_cpu_create_faultlist(hdl, cp, cpu, fltnm, NULL, HUNDRED_PERCENT);
	fmd_case_solve(hdl, cp);
}

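/*
 * Fault the cpu if the number of retired ways in either SERD category
 * has exceeded its aggregate threshold and the cpu is not already being
 * faulted.
 */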
void
cmd_retire_cpu_if_limits_exceeded(fmd_hdl_t *hdl, cmd_cpu_t *cpu,
    cmd_ptrsubtype_t pstype, const char *fltnm)
{
	int cpu_retired_1, cpu_retired_2;

	/* Retrieve the number of retired ways for each category */

	cpu_retired_1 = cmd_Lx_index_count_type1_ways(cpu);
	cpu_retired_2 = cmd_Lx_index_count_type2_ways(cpu);
	fmd_hdl_debug(hdl,
	    "\n%s:CPU %d retired Type 1 way count is: %d\n",
	    fltnm, cpu->cpu_cpuid, cpu_retired_1);
	fmd_hdl_debug(hdl, "\n%s:CPU %d retired Type 2 way count is: %d\n",
	    fltnm, cpu->cpu_cpuid, cpu_retired_2);

	if (((cpu_retired_1 > CMD_CPU_SERD_AGG_1) ||
	    (cpu_retired_2 > CMD_CPU_SERD_AGG_2)) &&
	    (cpu->cpu_faulting != FMD_B_TRUE)) {
		cmd_fault_the_cpu(hdl, cpu, pstype, fltnm);
	}
}

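/*
 * Add a cache-line fault to the Lxcache's case, solve the case, and
 * remember the string form of the retired fmri so that it can be
 * repaired later if the line is exonerated.
 */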
void
cmd_Lxcache_fault(fmd_hdl_t *hdl, cmd_cpu_t *cpu, cmd_Lxcache_t *Lxcache,
    const char *fltnm, nvlist_t *rsrc, uint_t cert)
{
	char fltmsg[64];
	nvlist_t *flt;

	(void) snprintf(fltmsg, sizeof (fltmsg), "fault.cpu.%s.%s-line",
	    cmd_cpu_type2name(hdl, cpu->cpu_type), fltnm);
	fmd_hdl_debug(hdl,
	    "\n%s:cpu_id %d: fltmsg = %s\n",
	    fltnm, cpu->cpu_cpuid, fltmsg);
	if (Lxcache->Lxcache_flags & CMD_LxCACHE_F_FAULTING) {
		return;
	}
	Lxcache->Lxcache_flags |= CMD_LxCACHE_F_FAULTING;
	flt = fmd_nvl_create_fault(hdl, fltmsg, cert,
	    Lxcache->Lxcache_asru.fmri_nvl, cpu->cpu_fru_nvl, rsrc);
	if (nvlist_add_boolean_value(flt, FM_SUSPECT_MESSAGE, B_FALSE) != 0)
		fmd_hdl_abort(hdl, "failed to add no-message member to fault");

	fmd_hdl_debug(hdl,
	    "\n%s:cpu_id %d: adding suspect list to case %s\n",
	    fltnm, cpu->cpu_cpuid,
	    fmd_case_uuid(hdl, Lxcache->Lxcache_case.cc_cp));
	fmd_case_add_suspect(hdl, Lxcache->Lxcache_case.cc_cp, flt);
	fmd_case_solve(hdl, Lxcache->Lxcache_case.cc_cp);
	if (Lxcache->Lxcache_retired_fmri[0] == 0) {
		if (cmd_fmri_nvl2str(hdl, Lxcache->Lxcache_asru.fmri_nvl,
		    Lxcache->Lxcache_retired_fmri,
		    sizeof (Lxcache->Lxcache_retired_fmri)) == -1)
			fmd_hdl_debug(hdl,
			    "\n%s:cpu_id %d: Failed to save the"
			    " retired fmri string\n",
			    fltnm, cpu->cpu_cpuid);
		else
			fmd_hdl_debug(hdl,
			    "\n%s:cpu_id %d:Saved the retired fmri string %s\n",
			    fltnm, cpu->cpu_cpuid,
			    Lxcache->Lxcache_retired_fmri);
	}
	Lxcache->Lxcache_flags &= ~(CMD_LxCACHE_F_FAULTING);
}

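/*
 * Case-close callback for an Lxcache case: tear down the case and its
 * SERD engine while leaving the Lxcache structure itself in place.
 */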
void
cmd_Lxcache_close(fmd_hdl_t *hdl, void *arg)
{
	cmd_cpu_t *cpu;
	cmd_Lxcache_t *Lxcache;
	cmd_case_t *cc;

	Lxcache = (cmd_Lxcache_t *)arg;
	fmd_hdl_debug(hdl, "cmd_Lxcache_close called for %s\n",
	    Lxcache->Lxcache_bufname);
	cc = &Lxcache->Lxcache_case;

	for (cpu = cmd_list_next(&cmd.cmd_cpus); cpu != NULL;
	    cpu = cmd_list_next(cpu)) {
		if (strcmp(cpu->cpu_bufname,
		    Lxcache->Lxcache_cpu_bufname) == 0)
			break;
	}
	if (cpu == NULL)
		fmd_hdl_abort(hdl, "failed to find the cpu %s for %s\n",
		    Lxcache->Lxcache_cpu_bufname,
		    Lxcache->Lxcache_bufname);
	/*
	 * We will destroy the case and SERD engine.
	 * The rest will be destroyed when we retire the CPU;
	 * until then we keep the Lxcache structures alive.
	 */
	if (cc->cc_cp != NULL) {
		cmd_case_fini(hdl, cc->cc_cp, FMD_B_TRUE);
		cc->cc_cp = NULL;
	}
	if (cc->cc_serdnm != NULL) {
		if (fmd_serd_exists(hdl, cc->cc_serdnm))
			fmd_serd_destroy(hdl, cc->cc_serdnm);
		fmd_hdl_strfree(hdl, cc->cc_serdnm);
		cc->cc_serdnm = NULL;
	}
}

cmd_Lxcache_t *
cmd_Lxcache_lookup_by_timeout_id(id_t id)
{
	cmd_cpu_t *cpu;
	cmd_Lxcache_t *cmd_Lxcache;

	for (cpu = cmd_list_next(&cmd.cmd_cpus); cpu != NULL;
	    cpu = cmd_list_next(cpu)) {
		for (cmd_Lxcache = cmd_list_next(&cpu->cpu_Lxcaches);
		    cmd_Lxcache != NULL;
		    cmd_Lxcache = cmd_list_next(cmd_Lxcache)) {
			if (cmd_Lxcache->Lxcache_timeout_id == id)
				return (cmd_Lxcache);
		}
	}
	return (NULL);
}

void
cmd_Lxcache_gc(fmd_hdl_t *hdl)
{
	cmd_cpu_t *cpu;

	for (cpu = cmd_list_next(&cmd.cmd_cpus); cpu != NULL;
	    cpu = cmd_list_next(cpu))
		cmd_Lxcache_validate(hdl, cpu);
}

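/*
 * Read the tag data for every way of the given cache index from the
 * mem_cache driver into tag_data[].  In test mode the error-injected
 * tags are read instead.
 */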
cmd_evdisp_t
get_tagdata(cmd_cpu_t *cpu, cmd_ptrsubtype_t pstype,
    int32_t index, uint64_t *tag_data)
{
	int fd;
	cache_info_t cache_info;

	fd = open(mem_cache_device, O_RDONLY);
	if (fd == -1) {
		(void) printf(
		    "cpu_id = %d could not open %s to read tag info.\n",
		    cpu->cpu_cpuid, mem_cache_device);
		return (CMD_EVD_BAD);
	}
	switch (pstype) {
	case CMD_PTR_CPU_L2TAG:
	case CMD_PTR_CPU_L2DATA:
		cache_info.cache = L2_CACHE_TAG;
		break;
	case CMD_PTR_CPU_L3TAG:
	case CMD_PTR_CPU_L3DATA:
		cache_info.cache = L3_CACHE_TAG;
		break;
	}
	cache_info.cpu_id = cpu->cpu_cpuid;
	cache_info.index = index;
	cache_info.datap = tag_data;
	cache_info.way = 0;

	if (test_mode) {
		if (ioctl(fd, MEM_CACHE_READ_ERROR_INJECTED_TAGS, &cache_info)
		    == -1) {
			(void) printf("cpu_id = %d ioctl"
			    " MEM_CACHE_READ_ERROR_INJECTED_TAGS failed"
			    " errno = %d\n",
			    cpu->cpu_cpuid, errno);
			(void) close(fd);
			return (CMD_EVD_BAD);
		}
	} else {
		if (ioctl(fd, MEM_CACHE_READ_TAGS, &cache_info)
		    == -1) {
			(void) printf("cpu_id = %d ioctl"
			    " MEM_CACHE_READ_TAGS failed"
			    " errno = %d\n",
			    cpu->cpu_cpuid, errno);
			(void) close(fd);
			return (CMD_EVD_BAD);
		}
	}
	(void) close(fd);
	return (CMD_EVD_OK);
}

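/*
 * Count how many ways at this index are already in the NA (retired)
 * e-cache state according to the tag data read from the driver.
 */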
int
get_index_retired_ways(cmd_cpu_t *cpu, cmd_ptrsubtype_t pstype, int32_t index)
{
	int i, retired_ways;
	uint64_t tag_data[PN_CACHE_NWAYS];

	if (get_tagdata(cpu, pstype, index, tag_data) != 0) {
		return (-1);
	}
	retired_ways = 0;
	for (i = 0; i < PN_CACHE_NWAYS; i++) {
		if ((tag_data[i] & CH_ECSTATE_MASK) ==
		    PN_ECSTATE_NA)
			retired_ways++;
	}
	return (retired_ways);
}

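/*
 * Ask the mem_cache driver to retire the cache line described by the
 * Lxcache (cpu, cache, index, way, bit) via the MEM_CACHE_RETIRE ioctl.
 */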
boolean_t
cmd_cache_way_retire(fmd_hdl_t *hdl, cmd_cpu_t *cpu, cmd_Lxcache_t *Lxcache)
{
	const char *fltnm;
	cache_info_t cache_info;
	int ret, fd;

	fltnm = cmd_type_to_str(Lxcache->Lxcache_type);
	fd = open(mem_cache_device, O_RDWR);
	if (fd == -1) {
		fmd_hdl_debug(hdl,
		    "%s:cpu_id %d open of %s failed\n",
		    fltnm, cpu->cpu_cpuid, mem_cache_device);
		return (B_FALSE);
	}
	cache_info.cpu_id = cpu->cpu_cpuid;
	cache_info.way = Lxcache->Lxcache_way;
	cache_info.bit = Lxcache->Lxcache_bit;
	cache_info.index = Lxcache->Lxcache_index;

	switch (Lxcache->Lxcache_type) {
	case CMD_PTR_CPU_L2TAG:
		cache_info.cache = L2_CACHE_TAG;
		break;
	case CMD_PTR_CPU_L2DATA:
		cache_info.cache = L2_CACHE_DATA;
		break;
	case CMD_PTR_CPU_L3TAG:
		cache_info.cache = L3_CACHE_TAG;
		break;
	case CMD_PTR_CPU_L3DATA:
		cache_info.cache = L3_CACHE_DATA;
		break;
	}

	fmd_hdl_debug(hdl,
	    "\n%s:cpu %d: Retiring index %d, way %d bit %d\n",
	    fltnm, cpu->cpu_cpuid, cache_info.index, cache_info.way,
	    (int16_t)cache_info.bit);
	ret = ioctl(fd, MEM_CACHE_RETIRE, &cache_info);
	(void) close(fd);
	if (ret == -1) {
		fmd_hdl_debug(hdl,
		    "%s:cpu_id %d MEM_CACHE_RETIRE ioctl failed\n",
		    fltnm, cpu->cpu_cpuid);
		return (B_FALSE);
	}

	return (B_TRUE);
}

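/*
 * Ask the mem_cache driver to bring a previously retired cache line
 * back into service via the MEM_CACHE_UNRETIRE ioctl.
 */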
boolean_t
cmd_cache_way_unretire(fmd_hdl_t *hdl, cmd_cpu_t *cpu, cmd_Lxcache_t *Lxcache)
{
	const char *fltnm;
	cache_info_t cache_info;
	int ret, fd;

	fltnm = cmd_type_to_str(Lxcache->Lxcache_type);
	fd = open(mem_cache_device, O_RDWR);
	if (fd == -1) {
		fmd_hdl_debug(hdl,
		    "%s:cpu_id %d open of %s failed\n",
		    fltnm, cpu->cpu_cpuid, mem_cache_device);
		return (B_FALSE);
	}
	cache_info.cpu_id = cpu->cpu_cpuid;
	cache_info.way = Lxcache->Lxcache_way;
	cache_info.bit = Lxcache->Lxcache_bit;
	cache_info.index = Lxcache->Lxcache_index;

	switch (Lxcache->Lxcache_type) {
	case CMD_PTR_CPU_L2TAG:
		cache_info.cache = L2_CACHE_TAG;
		break;
	case CMD_PTR_CPU_L2DATA:
		cache_info.cache = L2_CACHE_DATA;
		break;
	case CMD_PTR_CPU_L3TAG:
		cache_info.cache = L3_CACHE_TAG;
		break;
	case CMD_PTR_CPU_L3DATA:
		cache_info.cache = L3_CACHE_DATA;
		break;
	}

	fmd_hdl_debug(hdl,
	    "\n%s:cpu %d: Unretiring index %d, way %d bit %d\n",
	    fltnm, cpu->cpu_cpuid, cache_info.index, cache_info.way,
	    (int16_t)cache_info.bit);
	ret = ioctl(fd, MEM_CACHE_UNRETIRE, &cache_info);
	(void) close(fd);
	if (ret == -1) {
		fmd_hdl_debug(hdl,
		    "%s:cpu_id %d MEM_CACHE_UNRETIRE ioctl failed\n",
		    fltnm, cpu->cpu_cpuid);
		return (B_FALSE);
	}

	return (B_TRUE);
}

static cmd_Lxcache_t *
cmd_Lxcache_lookup_by_type_index_way_flags(cmd_cpu_t *cpu,
    cmd_ptrsubtype_t type, int32_t index, int8_t way, int32_t flags)
{
	cmd_Lxcache_t *cmd_Lxcache;

	for (cmd_Lxcache = cmd_list_next(&cpu->cpu_Lxcaches);
	    cmd_Lxcache != NULL;
	    cmd_Lxcache = cmd_list_next(cmd_Lxcache)) {
		if ((cmd_Lxcache->Lxcache_index == index) &&
		    (cmd_Lxcache->Lxcache_way == way) &&
		    (cmd_Lxcache->Lxcache_type == type) &&
		    (cmd_Lxcache->Lxcache_flags & flags))
			return (cmd_Lxcache);
	}
	return (NULL);
}

static int8_t
cmd_Lxcache_get_bit_array_of_available_ways(cmd_cpu_t *cpu,
    cmd_ptrsubtype_t type, int32_t index)
{
	uint8_t bit_array_of_unavailable_ways;
	uint8_t bit_array_of_available_ways;
	cmd_ptrsubtype_t match_type;
	cmd_Lxcache_t *cmd_Lxcache;
	uint8_t bit_array_of_retired_ways;

	/*
	 * We scan the Lxcache structures for this CPU and collect the
	 * following two pieces of information:
	 * - bit_array_of_retired_ways
	 * - bit_array_of_unavailable_ways
	 * If type is Lx_TAG then unavailable_ways will not include ways that
	 * were retired due to DATA faults, because these ways can still be
	 * re-retired for TAG faults.
	 * If 3 ways have been retired then we protect the only remaining
	 * unretired way by marking it as unavailable.
	 */
	bit_array_of_unavailable_ways = 0;
	bit_array_of_retired_ways = 0;
	switch (type) {
	case CMD_PTR_CPU_L2TAG:
		match_type = CMD_PTR_CPU_L2DATA;
		break;
	case CMD_PTR_CPU_L2DATA:
		match_type = CMD_PTR_CPU_L2TAG;
		break;
	case CMD_PTR_CPU_L3TAG:
		match_type = CMD_PTR_CPU_L3DATA;
		break;
	case CMD_PTR_CPU_L3DATA:
		match_type = CMD_PTR_CPU_L3TAG;
		break;
	}

	for (cmd_Lxcache = cmd_list_next(&cpu->cpu_Lxcaches);
	    cmd_Lxcache != NULL;
	    cmd_Lxcache = cmd_list_next(cmd_Lxcache)) {
		if ((cmd_Lxcache->Lxcache_index == index) &&
		    ((cmd_Lxcache->Lxcache_type == type) ||
		    (cmd_Lxcache->Lxcache_type == match_type)) &&
		    (cmd_Lxcache->Lxcache_flags &
		    (CMD_LxCACHE_F_RETIRED | CMD_LxCACHE_F_RERETIRED))) {
			bit_array_of_retired_ways |=
			    (1 << cmd_Lxcache->Lxcache_way);
			/*
			 * If we are called while handling TAG errors, cache
			 * lines that were retired due to DATA errors can be
			 * re-retired, so we do not count the cache lines
			 * retired due to DATA faults as unavailable.
			 */
			if ((type == CMD_PTR_CPU_L2TAG) &&
			    (cmd_Lxcache->Lxcache_type == CMD_PTR_CPU_L2DATA))
				continue;
			if ((type == CMD_PTR_CPU_L3TAG) &&
			    (cmd_Lxcache->Lxcache_type == CMD_PTR_CPU_L3DATA))
				continue;
			bit_array_of_unavailable_ways |=
			    (1 << cmd_Lxcache->Lxcache_way);
		}
	}
	if (cmd_num_of_bits[bit_array_of_retired_ways & 0xf] == 3) {
		/*
		 * Special case: 3 ways are already retired.
		 * The lone unretired way is set to 1, the rest are set to 0.
		 * We now OR this with bit_array_of_unavailable_ways
		 * so that this unretired way will not be allocated.
		 */
		bit_array_of_retired_ways ^= 0xf;
		bit_array_of_retired_ways &= 0xf;
		bit_array_of_unavailable_ways |= bit_array_of_retired_ways;
	}
	bit_array_of_available_ways =
	    ((bit_array_of_unavailable_ways ^ 0xf) & 0xf);
	return (bit_array_of_available_ways);
}


/*
 * Look for the lowest retirable way above the specified way, i.e. one
 * that is not in a retired state.
 * We stop when way 3 is reached.
 */
int8_t
cmd_Lxcache_get_next_retirable_way(cmd_cpu_t *cpu,
    int32_t index, cmd_ptrsubtype_t pstype, int8_t specified_way)
{
	uint8_t bit_array_of_ways;
	int8_t mask;

	if (specified_way == 3)
		return (-1);
	bit_array_of_ways = cmd_Lxcache_get_bit_array_of_available_ways(
	    cpu,
	    pstype, index);
	if (specified_way == 2)
		mask = 0x8;
	else if (specified_way == 1)
		mask = 0xc;
	else
		mask = 0xe;
	return (cmd_lowest_way[bit_array_of_ways & mask]);
}

int8_t
cmd_Lxcache_get_lowest_retirable_way(cmd_cpu_t *cpu,
    int32_t index, cmd_ptrsubtype_t pstype)
{
	uint8_t bit_array_of_ways;

	bit_array_of_ways = cmd_Lxcache_get_bit_array_of_available_ways(
	    cpu,
	    pstype, index);
	return (cmd_lowest_way[bit_array_of_ways]);
}

cmd_Lxcache_t *
cmd_Lxcache_lookup_by_type_index_way_reason(cmd_cpu_t *cpu,
    cmd_ptrsubtype_t pstype, int32_t index, int8_t way, int32_t reason)
{
	cmd_Lxcache_t *cmd_Lxcache;

	for (cmd_Lxcache = cmd_list_next(&cpu->cpu_Lxcaches);
	    cmd_Lxcache != NULL;
	    cmd_Lxcache = cmd_list_next(cmd_Lxcache)) {
		if ((cmd_Lxcache->Lxcache_index == (uint32_t)index) &&
		    (cmd_Lxcache->Lxcache_way == (uint32_t)way) &&
		    (cmd_Lxcache->Lxcache_reason & reason) &&
		    (cmd_Lxcache->Lxcache_type == pstype)) {
			return (cmd_Lxcache);
		}
	}
	return (NULL);
}

cmd_Lxcache_t *
cmd_Lxcache_lookup_by_type_index_bit_reason(cmd_cpu_t *cpu,
    cmd_ptrsubtype_t pstype, int32_t index, int16_t bit, int32_t reason)
{
	cmd_Lxcache_t *cmd_Lxcache;

	for (cmd_Lxcache = cmd_list_next(&cpu->cpu_Lxcaches);
	    cmd_Lxcache != NULL;
	    cmd_Lxcache = cmd_list_next(cmd_Lxcache)) {
		if ((cmd_Lxcache->Lxcache_index == (uint32_t)index) &&
		    (cmd_Lxcache->Lxcache_bit == (uint16_t)bit) &&
		    (cmd_Lxcache->Lxcache_reason & reason) &&
		    (cmd_Lxcache->Lxcache_type == pstype)) {
			return (cmd_Lxcache);
		}
	}
	return (NULL);
}

void
cmd_Lxcache_destroy_anonymous_serd_engines(fmd_hdl_t *hdl, cmd_cpu_t *cpu,
    cmd_ptrsubtype_t type, int32_t index, int16_t bit)
{
	cmd_Lxcache_t *cmd_Lxcache;
	cmd_case_t *cc;

	for (cmd_Lxcache = cmd_list_next(&cpu->cpu_Lxcaches);
	    cmd_Lxcache != NULL;
	    cmd_Lxcache = cmd_list_next(cmd_Lxcache)) {
		if ((cmd_Lxcache->Lxcache_type == type) &&
		    (cmd_Lxcache->Lxcache_index == (uint32_t)index) &&
		    (cmd_Lxcache->Lxcache_bit == (uint16_t)bit) &&
		    (cmd_Lxcache->Lxcache_way == (uint32_t)CMD_ANON_WAY)) {
			cc = &cmd_Lxcache->Lxcache_case;
			if (cc == NULL)
				continue;
			if (cc->cc_serdnm != NULL) {
				if (fmd_serd_exists(hdl, cc->cc_serdnm)) {
					fmd_hdl_debug(hdl,
					    "\n%s:cpu_id %d destroying SERD"
					    " engine %s\n",
					    cmd_type_to_str(type),
					    cpu->cpu_cpuid, cc->cc_serdnm);
					fmd_serd_destroy(hdl, cc->cc_serdnm);
				}
				fmd_hdl_strfree(hdl, cc->cc_serdnm);
				cc->cc_serdnm = NULL;
			}
		}
	}
}

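/*
 * Format a cache-line ASRU nvlist as a cpu scheme fmri string.  Returns
 * -1 if any of the required members is missing from the nvlist.
 */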
ssize_t
cmd_fmri_nvl2str(fmd_hdl_t *hdl, nvlist_t *nvl, char *buf, size_t buflen)
{
	uint8_t type;
	uint32_t cpuid, way;
	uint32_t index;
	uint16_t bit;
	char *serstr = NULL;
	char missing_list[128];

	missing_list[0] = 0;
	if (nvlist_lookup_uint32(nvl, FM_FMRI_CPU_ID, &cpuid) != 0)
		(void) strcat(missing_list, FM_FMRI_CPU_ID);
	if (nvlist_lookup_string(nvl, FM_FMRI_CPU_SERIAL_ID, &serstr) != 0)
		(void) strcat(missing_list, FM_FMRI_CPU_SERIAL_ID);
	if (nvlist_lookup_uint32(nvl, FM_FMRI_CPU_CACHE_INDEX, &index) != 0)
		(void) strcat(missing_list, FM_FMRI_CPU_CACHE_INDEX);
	if (nvlist_lookup_uint32(nvl, FM_FMRI_CPU_CACHE_WAY, &way) != 0)
		(void) strcat(missing_list, FM_FMRI_CPU_CACHE_WAY);
	if (nvlist_lookup_uint16(nvl, FM_FMRI_CPU_CACHE_BIT, &bit) != 0)
		(void) strcat(missing_list, FM_FMRI_CPU_CACHE_BIT);
	if (nvlist_lookup_uint8(nvl, FM_FMRI_CPU_CACHE_TYPE, &type) != 0)
		(void) strcat(missing_list, FM_FMRI_CPU_CACHE_TYPE);

	if (strlen(missing_list) != 0) {
		fmd_hdl_debug(hdl,
		    "\ncmd_fmri_nvl2str: missing %s in fmri\n",
		    missing_list);
		return (-1);
	}

	return (snprintf(buf, buflen,
	    "cpu:///%s=%u/%s=%s/%s=%u/%s=%u/%s=%d/%s=%d",
	    FM_FMRI_CPU_ID, cpuid,
	    FM_FMRI_CPU_SERIAL_ID, serstr,
	    FM_FMRI_CPU_CACHE_INDEX, index,
	    FM_FMRI_CPU_CACHE_WAY, way,
	    FM_FMRI_CPU_CACHE_BIT, bit,
	    FM_FMRI_CPU_CACHE_TYPE, type));
}

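/*
 * Create a new fmd case for the Lxcache if it does not already have one,
 * and attach the saved ereport, if any, to the new case.
 */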
boolean_t
cmd_create_case_for_Lxcache(fmd_hdl_t *hdl, cmd_cpu_t *cpu,
    cmd_Lxcache_t *cmd_Lxcache)
{
	const char *fltnm;
	const char *uuid;

	if (cmd_Lxcache->Lxcache_case.cc_cp != NULL)
		return (B_TRUE);
	cmd_Lxcache->Lxcache_case.cc_cp = cmd_case_create(hdl,
	    &cmd_Lxcache->Lxcache_header, CMD_PTR_LxCACHE_CASE,
	    &uuid);
	fltnm = cmd_type_to_str(cmd_Lxcache->Lxcache_type);
	if (cmd_Lxcache->Lxcache_case.cc_cp == NULL) {
		fmd_hdl_debug(hdl,
		    "\n%s:cpu_id %d:Failed to create a case for"
		    " index %d way %d bit %d\n",
		    fltnm, cpu->cpu_cpuid,
		    cmd_Lxcache->Lxcache_index,
		    cmd_Lxcache->Lxcache_way, cmd_Lxcache->Lxcache_bit);
		return (B_FALSE);
	}
	fmd_hdl_debug(hdl,
	    "\n%s:cpu_id %d: New case %s created.\n",
	    fltnm, cpu->cpu_cpuid, uuid);
	if (cmd_Lxcache->Lxcache_ep)
		fmd_case_add_ereport(hdl, cmd_Lxcache->Lxcache_case.cc_cp,
		    cmd_Lxcache->Lxcache_ep);
	return (B_TRUE);
}

static int
cmd_repair_fmri(fmd_hdl_t *hdl, char *buf)
{
	int err;

	err = fmd_repair_asru(hdl, buf);
	if (err) {
		fmd_hdl_debug(hdl,
		    "Failed to repair %s err = %d\n", buf, err);
	}
	return (err);
}

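/*
 * Bring a retired cache line back into service, unless a fault of the
 * other kind (TAG vs. DATA) still requires the line to stay retired.
 * Repairs the previously retired fmri with fmd so that the line is not
 * re-retired on replay.
 */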
boolean_t
cmd_Lxcache_unretire(fmd_hdl_t *hdl, cmd_cpu_t *cpu,
    cmd_Lxcache_t *unretire_this_Lxcache, const char *fltnm)
{
	cmd_ptrsubtype_t data_type;
	cmd_Lxcache_t *previously_retired_Lxcache;
	int found_reretired_cacheline = 0;
	int certainty;

	/*
	 * If we are unretiring a cacheline retired due to a suspected TAG
	 * fault, then we must first check whether we are using a cacheline
	 * that was retired earlier for a DATA fault.
	 * If so we will not unretire the cacheline.
	 * We will change the flags to reflect the current condition.
	 * We will return success, though.
	 */
	if (IS_TAG(unretire_this_Lxcache->Lxcache_type)) {
		if (unretire_this_Lxcache->Lxcache_type == CMD_PTR_CPU_L2TAG)
			data_type = CMD_PTR_CPU_L2DATA;
		if (unretire_this_Lxcache->Lxcache_type == CMD_PTR_CPU_L3TAG)
			data_type = CMD_PTR_CPU_L3DATA;
		fmd_hdl_debug(hdl,
		    "\n%s:cpuid %d checking if there is a %s"
		    " cacheline re-retired at this index %d and way %d\n",
		    fltnm, cpu->cpu_cpuid, cmd_type_to_str(data_type),
		    unretire_this_Lxcache->Lxcache_index,
		    unretire_this_Lxcache->Lxcache_way);
		previously_retired_Lxcache =
		    cmd_Lxcache_lookup_by_type_index_way_flags(
		    cpu, data_type, unretire_this_Lxcache->Lxcache_index,
		    unretire_this_Lxcache->Lxcache_way,
		    CMD_LxCACHE_F_RERETIRED);
		if (previously_retired_Lxcache) {
			fmd_hdl_debug(hdl,
			    "\n%s:cpuid %d Found a %s cacheline re-retired at"
			    " this index %d and way %d. Will mark this"
			    " RETIRED\n",
			    fltnm, cpu->cpu_cpuid, cmd_type_to_str(data_type),
			    unretire_this_Lxcache->Lxcache_index,
			    unretire_this_Lxcache->Lxcache_way);
			/*
			 * We call cmd_Lxcache_fault to inform fmd about the
			 * suspect fmri.  The cacheline is already retired,
			 * but the existing suspect fmri is for a TAG fault,
			 * which will be removed in this routine.
			 */
			if (previously_retired_Lxcache->Lxcache_reason
			    == CMD_LXCONVICTED)
				certainty = HUNDRED_PERCENT;
			else
				certainty = SUSPECT_PERCENT;
			cmd_Lxcache_fault(hdl, cpu, previously_retired_Lxcache,
			    fltnm, cpu->cpu_fru_nvl, certainty);
			previously_retired_Lxcache->Lxcache_flags =
			    CMD_LxCACHE_F_RETIRED;
			/*
			 * Update persistent storage.
			 */
			cmd_Lxcache_write(hdl, previously_retired_Lxcache);
			found_reretired_cacheline = 1;
		}
	} else {
		/*
		 * We have been called to unretire a cacheline retired
		 * earlier due to DATA errors.
		 * If this cacheline is marked RERETIRED then it means that
		 * the cacheline has been retired due to TAG errors and
		 * we should not be unretiring the cacheline.
		 */
		if (unretire_this_Lxcache->Lxcache_flags &
		    CMD_LxCACHE_F_RERETIRED) {
			fmd_hdl_debug(hdl,
			    "\n%s:cpuid %d The cacheline at index %d and"
			    " way %d which we are attempting to unretire"
			    " is in RERETIRED state. Therefore we will not"
			    " unretire it but will mark it as RETIRED.\n",
			    fltnm, cpu->cpu_cpuid,
			    unretire_this_Lxcache->Lxcache_index,
			    unretire_this_Lxcache->Lxcache_way);
			found_reretired_cacheline = 1;
		}
	}
	/*
	 * If we did not find a RERETIRED cacheline above,
	 * unretire the cacheline.
	 */
	if (!found_reretired_cacheline) {
		if (cmd_cache_way_unretire(hdl, cpu, unretire_this_Lxcache)
		    == B_FALSE)
			return (B_FALSE);
	}
	unretire_this_Lxcache->Lxcache_flags = CMD_LxCACHE_F_UNRETIRED;
	/*
	 * We have exonerated the cacheline.  We need to inform fmd that we
	 * have repaired the suspect fmri that we retired earlier.
	 * The cpumem agent will not unretire the cacheline in response to
	 * the list.repair events it receives.
	 */
	if (unretire_this_Lxcache->Lxcache_retired_fmri[0] != 0) {
		fmd_hdl_debug(hdl,
		    "\n%s:cpuid %d Repairing the retired fmri %s",
		    fltnm, cpu->cpu_cpuid,
		    unretire_this_Lxcache->Lxcache_retired_fmri);
		if (cmd_repair_fmri(hdl,
		    unretire_this_Lxcache->Lxcache_retired_fmri) != 0) {
			fmd_hdl_debug(hdl,
			    "\n%s:cpuid %d Failed to repair retired fmri.",
			    fltnm, cpu->cpu_cpuid);
			/*
			 * We need to retire the cacheline that we just
			 * unretired.
			 */
			if (cmd_cache_way_retire(hdl, cpu,
			    unretire_this_Lxcache) == B_FALSE) {
				/*
				 * A hopeless situation: we cannot maintain
				 * consistency of the cacheline state between
				 * fmd and the DE.  Aborting the DE.
				 */
				fmd_hdl_abort(hdl,
				    "\n%s:cpuid %d We are unable to repair"
				    " the fmri we just unretired and are"
				    " unable to restore the DE and fmd to"
				    " a sane state.\n",
				    fltnm, cpu->cpu_cpuid);
			}
			return (B_FALSE);
		} else {
			unretire_this_Lxcache->Lxcache_retired_fmri[0] = 0;
		}
	}
	return (B_TRUE);
}

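/*
 * Retire the given cache line: solve its case with a cache-line fault at
 * the requested certainty, handle re-retirement of a line previously
 * retired for the other (TAG/DATA) reason, and fault the cpu if the
 * retirement limits have been exceeded.
 */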
boolean_t
cmd_Lxcache_retire(fmd_hdl_t *hdl, cmd_cpu_t *cpu,
    cmd_Lxcache_t *retire_this_Lxcache, const char *fltnm, uint_t cert)
{
	cmd_Lxcache_t *previously_retired_Lxcache;
	cmd_ptrsubtype_t data_type;
	const char *uuid;
	char suspect_list[128];

	fmd_hdl_debug(hdl,
	    "\n%s:cpu_id %d: cmd_Lxcache_retire called for index %d"
	    " way %d bit %d\n",
	    fltnm, cpu->cpu_cpuid, retire_this_Lxcache->Lxcache_index,
	    retire_this_Lxcache->Lxcache_way, retire_this_Lxcache->Lxcache_bit);
	if (fmd_case_solved(hdl, retire_this_Lxcache->Lxcache_case.cc_cp)) {
		/*
		 * Case solved implies that the cache line is already
		 * retired as SUSPECT_0_TAG and we are here to retire this
		 * as SUSPECT_1_TAG.
		 * We will first repair the retired cacheline
		 * so that it does not get retired during replay for the
		 * wrong reason.
		 * If we are able to repair the retired cacheline we close the
		 * case and open a new case for it.
		 */
		if (retire_this_Lxcache->Lxcache_reason !=
		    CMD_LXSUSPECT_0_TAG) {
			fmd_hdl_debug(hdl,
			    "\n%s:cpu_id %d: Unexpected condition encountered."
			    " Expected the reason for retirement to be"
			    " SUSPECT_0_TAG, however found the reason"
			    " to be %s\n",
			    fltnm, cpu->cpu_cpuid,
			    cmd_reason_to_str(
			    retire_this_Lxcache->Lxcache_reason));
			return (B_FALSE);
		}
		fmd_hdl_debug(hdl,
		    "\n%s:cpu_id %d: We are re-retiring SUSPECT_0_TAG as"
		    " SUSPECT_1_TAG index %d way %d bit %d\n",
		    fltnm, cpu->cpu_cpuid,
		    retire_this_Lxcache->Lxcache_index,
		    retire_this_Lxcache->Lxcache_way,
		    retire_this_Lxcache->Lxcache_bit);
		fmd_hdl_debug(hdl,
		    "\n%s:cpu_id %d: The existing case for this Lxcache has"
		    " already been solved. We will first repair the suspect"
		    " cacheline and if we are successful then close this case,"
		    " and open a new case.\n",
		    fltnm, cpu->cpu_cpuid);
		/*
		 * Repair the retired cacheline.
		 */
		if (retire_this_Lxcache->Lxcache_retired_fmri[0] != 0) {
			fmd_hdl_debug(hdl,
			    "\n%s:cpuid %d Repairing the retired suspect"
			    " cacheline %s\n",
			    fltnm, cpu->cpu_cpuid,
			    retire_this_Lxcache->Lxcache_retired_fmri);
			if (cmd_repair_fmri(hdl,
			    retire_this_Lxcache->Lxcache_retired_fmri) != 0) {
				fmd_hdl_debug(hdl,
				    "\n%s:cpuid %d Failed to repair the"
				    " retired fmri.",
				    fltnm, cpu->cpu_cpuid);
				return (B_FALSE);
			} else {
				retire_this_Lxcache->Lxcache_retired_fmri[0] =
				    0;
			}
		}
		uuid = fmd_case_uuid(hdl,
		    retire_this_Lxcache->Lxcache_case.cc_cp);
		fmd_hdl_debug(hdl,
		    "\n%s:cpuid %d: Closing the case %s\n",
		    fltnm, cpu->cpu_cpuid, uuid);
		cmd_case_fini(hdl, retire_this_Lxcache->Lxcache_case.cc_cp,
		    FMD_B_TRUE);
		retire_this_Lxcache->Lxcache_case.cc_cp = NULL;
		if (cmd_create_case_for_Lxcache(hdl, cpu, retire_this_Lxcache)
		    == B_FALSE)
			return (B_FALSE);
	} else {
		/*
		 * Not a SUSPECT_0_TAG.
		 * We should be entering this path only if the cacheline is
		 * transitioning from the ACTIVE/UNRETIRED to the RETIRED
		 * state.  If the cacheline state is not as expected we print
		 * a debug message and return failure.
		 */
		if ((retire_this_Lxcache->Lxcache_flags !=
		    CMD_LxCACHE_F_ACTIVE) &&
		    (retire_this_Lxcache->Lxcache_flags
		    != CMD_LxCACHE_F_UNRETIRED)) {
			/*
			 * Unexpected condition.
			 */
			fmd_hdl_debug(hdl,
			    "\n%s:cpu_id %d:Unexpected state %s for the"
			    " cacheline at index %d way %d encountered.\n",
			    fltnm, cpu->cpu_cpuid,
			    cmd_flags_to_str(
			    retire_this_Lxcache->Lxcache_flags),
			    retire_this_Lxcache->Lxcache_index,
			    retire_this_Lxcache->Lxcache_way);
			return (B_FALSE);
		}
	}
	suspect_list[0] = 0;
	(void) cmd_fmri_nvl2str(hdl, retire_this_Lxcache->Lxcache_asru.fmri_nvl,
	    suspect_list, sizeof (suspect_list));
	fmd_hdl_debug(hdl,
	    "\n%s:cpu_id %d:current suspect list is %s\n",
	    fltnm, cpu->cpu_cpuid, suspect_list);
	cmd_Lxcache_fault(hdl, cpu, retire_this_Lxcache, fltnm,
	    cpu->cpu_fru_nvl,
	    cert);
	retire_this_Lxcache->Lxcache_flags = CMD_LxCACHE_F_RETIRED;
	if (IS_TAG(retire_this_Lxcache->Lxcache_type)) {
		/*
		 * If the cacheline we just retired was retired earlier
		 * due to DATA faults we mark the Lxcache
		 * corresponding to DATA as RERETIRED.
		 */
		if (retire_this_Lxcache->Lxcache_type == CMD_PTR_CPU_L2TAG)
			data_type = CMD_PTR_CPU_L2DATA;
		if (retire_this_Lxcache->Lxcache_type == CMD_PTR_CPU_L3TAG)
			data_type = CMD_PTR_CPU_L3DATA;
		fmd_hdl_debug(hdl,
		    "\n%s:cpuid %d checking if there is a %s"
		    " cacheline retired at this index %d way %d\n",
		    fltnm, cpu->cpu_cpuid,
		    cmd_type_to_str(data_type),
		    retire_this_Lxcache->Lxcache_index,
		    retire_this_Lxcache->Lxcache_way);
		previously_retired_Lxcache =
		    cmd_Lxcache_lookup_by_type_index_way_flags(cpu,
		    data_type, retire_this_Lxcache->Lxcache_index,
		    retire_this_Lxcache->Lxcache_way, CMD_LxCACHE_F_RETIRED);
		if (previously_retired_Lxcache) {
			fmd_hdl_debug(hdl,
			    "\n%s:cpu_id %d: Found index %d way %d"
			    " retired earlier. Will mark this Lxcache"
			    " as RERETIRED.\n",
			    fltnm, cpu->cpu_cpuid,
			    retire_this_Lxcache->Lxcache_index,
			    retire_this_Lxcache->Lxcache_way);
			/*
			 * First repair the retired cacheline and if successful
			 * close the existing case and create a new case.
			 */

			/*
			 * This cacheline has already been retired for a
			 * TAG fault.
			 * Repair the previously retired DATA fault cacheline
			 * so that it does not get retired by fmd during
			 * replay.
			 */
			if (previously_retired_Lxcache->Lxcache_retired_fmri[0]
			    != 0) {
				fmd_hdl_debug(hdl,
				    "\n%s:cpuid %d Repairing the cacheline"
				    " retired due to data errors. %s\n",
				    fltnm, cpu->cpu_cpuid,
				    previously_retired_Lxcache->
				    Lxcache_retired_fmri);
				if (cmd_repair_fmri(hdl,
				    previously_retired_Lxcache->
				    Lxcache_retired_fmri)
				    != 0) {
					fmd_hdl_debug(hdl,
					    "\n%s:cpuid %d Failed to repair the"
					    " retired fmri.",
					    fltnm, cpu->cpu_cpuid);
					return (B_FALSE);
				} else {
					previously_retired_Lxcache->
					    Lxcache_retired_fmri[0] = 0;
				}
			}
			cmd_case_fini(hdl,
			    previously_retired_Lxcache->Lxcache_case.cc_cp,
			    FMD_B_TRUE);
			previously_retired_Lxcache->Lxcache_case.cc_cp = NULL;
			previously_retired_Lxcache->Lxcache_flags =
			    CMD_LxCACHE_F_RERETIRED;
			/*
			 * Update persistent storage.
			 */
			cmd_Lxcache_write(hdl, previously_retired_Lxcache);
			/*
			 * Create a new case so that this Lxcache structure
			 * gets restored on replay.
			 */
			if (cmd_create_case_for_Lxcache(hdl, cpu,
			    previously_retired_Lxcache) == B_FALSE)
				return (B_FALSE);
		}
	}
	cmd_retire_cpu_if_limits_exceeded(hdl, cpu,
	    retire_this_Lxcache->Lxcache_type,
	    fltnm);
	return (B_TRUE);
}