1 /*
2 * Copyright (c) 2004-2009 Voltaire Inc. All rights reserved.
3 * Copyright (c) 2007 Xsigo Systems Inc. All rights reserved.
4 * Copyright (c) 2008 Lawrence Livermore National Lab. All rights reserved.
5 * Copyright (c) 2009 HNR Consulting. All rights reserved.
6 * Copyright (c) 2010,2011 Mellanox Technologies LTD. All rights reserved.
7 *
8 * This software is available to you under a choice of one of two
9 * licenses. You may choose to be licensed under the terms of the GNU
10 * General Public License (GPL) Version 2, available from the file
11 * COPYING in the main directory of this source tree, or the
12 * OpenIB.org BSD license below:
13 *
14 * Redistribution and use in source and binary forms, with or
15 * without modification, are permitted provided that the following
16 * conditions are met:
17 *
18 * - Redistributions of source code must retain the above
19 * copyright notice, this list of conditions and the following
20 * disclaimer.
21 *
22 * - Redistributions in binary form must reproduce the above
23 * copyright notice, this list of conditions and the following
24 * disclaimer in the documentation and/or other materials
25 * provided with the distribution.
26 *
27 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
28 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
29 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
30 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
31 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
32 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
33 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
34 * SOFTWARE.
35 *
36 */
37
38 #if HAVE_CONFIG_H
39 # include <config.h>
40 #endif /* HAVE_CONFIG_H */
41
42 #define _GNU_SOURCE
43 #include <stdio.h>
44 #include <stdlib.h>
45 #include <unistd.h>
46 #include <stdarg.h>
47 #include <time.h>
48 #include <string.h>
49 #include <getopt.h>
50 #include <errno.h>
51 #include <inttypes.h>
52
53 #include <complib/cl_nodenamemap.h>
54 #include <infiniband/ibnetdisc.h>
55 #include <infiniband/mad.h>
56
57 #include "ibdiag_common.h"
58 #include "ibdiag_sa.h"
59
60 struct ibmad_port *ibmad_port;
61 static char *node_name_map_file = NULL;
62 static nn_map_t *node_name_map = NULL;
63 static char *load_cache_file = NULL;
64 static uint16_t lid2sl_table[sizeof(uint8_t) * 1024 * 48] = { 0 };
65 static int obtain_sl = 1;
66
67 int data_counters = 0;
68 int data_counters_only = 0;
69 int port_config = 0;
70 uint64_t port_guid = 0;
71 char *port_guid_str = NULL;
72 #define SUP_MAX 64
73 int sup_total = 0;
74 enum MAD_FIELDS suppressed_fields[SUP_MAX];
75 char *dr_path = NULL;
76 uint8_t node_type_to_print = 0;
77 unsigned clear_errors = 0, clear_counts = 0, details = 0;
78
79 #define PRINT_SWITCH 0x1
80 #define PRINT_CA 0x2
81 #define PRINT_ROUTER 0x4
82 #define PRINT_ALL 0xFF /* all nodes default flag */
83
84 #define DEFAULT_HALF_WORLD_PR_TIMEOUT (3000)
85
86 struct {
87 int nodes_checked;
88 int bad_nodes;
89 int ports_checked;
90 int bad_ports;
91 int pma_query_failures;
92 } summary = { 0 };
93
94 #define DEF_THRES_FILE IBDIAG_CONFIG_PATH"/error_thresholds"
95 static char *threshold_file = DEF_THRES_FILE;
96
97 /* define a "packet" with threshold values in it */
98 uint8_t thresholds[1204] = { 0 };
99 char * threshold_str = "";
100
/*
 * Compare *gid byte-wise against an all-zero GID.
 *
 * Returns the memcmp() result converted to unsigned: zero when every byte
 * of *gid is zero, non-zero otherwise.
 */
static unsigned valid_gid(ib_gid_t * gid)
{
	ib_gid_t all_zero;

	memset(&all_zero, 0, sizeof(all_zero));

	return memcmp(&all_zero, gid, sizeof(*gid));
}
107
set_thres(char * name,uint32_t val)108 static void set_thres(char *name, uint32_t val)
109 {
110 int f;
111 int n;
112 char tmp[256];
113 for (f = IB_PC_FIRST_F; f <= IB_PC_LAST_F; f++) {
114 if (strcmp(name, mad_field_name(f)) == 0) {
115 mad_encode_field(thresholds, f, &val);
116 snprintf(tmp, 255, "[%s = %u]", name, val);
117 threshold_str = realloc(threshold_str,
118 strlen(threshold_str)+strlen(tmp)+1);
119 if (!threshold_str) {
120 fprintf(stderr, "Failed to allocate memory: "
121 "%s\n", strerror(errno));
122 exit(1);
123 }
124 n = strlen(threshold_str);
125 strcpy(threshold_str+n, tmp);
126 }
127 }
128 }
129
set_thresholds(char * threshold_file)130 static void set_thresholds(char *threshold_file)
131 {
132 char buf[1024];
133 char orig_buf[1024];
134 int val = 0;
135 FILE *thresf = fopen(threshold_file, "r");
136 char *p_prefix, *p_last;
137 char *name;
138 char *val_str;
139 char str[64];
140
141 if (!thresf)
142 return;
143
144 snprintf(str, 63, "Thresholds: ");
145 threshold_str = malloc(strlen(str)+1);
146 if (!threshold_str) {
147 fprintf(stderr, "Failed to allocate memory: %s\n",
148 strerror(errno));
149 exit(1);
150 }
151 strcpy(threshold_str, str);
152 while (fgets(buf, sizeof buf, thresf) != NULL) {
153 p_prefix = strtok_r(buf, "\n", &p_last);
154 if (!p_prefix)
155 continue; /* ignore blank lines */
156
157 if (*p_prefix == '#')
158 continue; /* ignore comment lines */
159
160 strlcpy(orig_buf, buf, sizeof(orig_buf));
161 name = strtok_r(p_prefix, "=", &p_last);
162 val_str = strtok_r(NULL, "\n", &p_last);
163 if (!name || !val_str) {
164 fprintf(stderr, "malformed line in \"%s\":\n%s\n",
165 threshold_file, orig_buf);
166 continue;
167 }
168
169 val = strtoul(val_str, NULL, 0);
170 set_thres(name, val);
171 }
172
173 fclose(thresf);
174 }
175
exceeds_threshold(int field,unsigned val)176 static int exceeds_threshold(int field, unsigned val)
177 {
178 uint32_t thres = 0;
179 mad_decode_field(thresholds, field, &thres);
180 return (val > thres);
181 }
182
/*
 * Print one "Link info" line for port "portnum" of "node": the local LID,
 * port number, link width/speed/state, and a description of the remote
 * port when one is known.  Does nothing if the node has no such port.
 */
static void print_port_config(ibnd_node_t * node, int portnum)
{
	char width[64], speed[64], state[64], physstate[64];
	char remote_str[256];
	char link_str[256];
	char width_msg[256];
	char speed_msg[256];
	char ext_port_str[256];
	int iwidth, ispeed, fdr10, espeed, istate, iphystate, cap_mask;
	uint8_t *info;
	char *state_txt, *phys_txt;
	ibnd_port_t *peer;
	ibnd_port_t *port = node->ports[portnum];

	if (!port)
		return;

	remote_str[0] = '\0';
	link_str[0] = '\0';
	width_msg[0] = '\0';
	speed_msg[0] = '\0';

	iwidth = mad_get_field(port->info, 0, IB_PORT_LINK_WIDTH_ACTIVE_F);
	ispeed = mad_get_field(port->info, 0, IB_PORT_LINK_SPEED_ACTIVE_F);
	fdr10 = mad_get_field(port->ext_info, 0,
			      IB_MLNX_EXT_PORT_LINK_SPEED_ACTIVE_F) & FDR10;
	istate = mad_get_field(port->info, 0, IB_PORT_STATE_F);
	iphystate = mad_get_field(port->info, 0, IB_PORT_PHYS_STATE_F);

	/* on a switch the capability mask is read from port 0 */
	if (port->node->type == IB_NODE_SWITCH)
		info = (uint8_t *)&port->node->ports[0]->info;
	else
		info = (uint8_t *)&port->info;
	cap_mask = mad_get_field(info, 0, IB_PORT_CAPMASK_F);

	espeed = 0;
	if (cap_mask & CL_NTOH32(IB_PORT_CAP_HAS_EXT_SPEEDS))
		espeed = mad_get_field(port->info, 0,
				       IB_PORT_LINK_SPEED_EXT_ACTIVE_F);

	state_txt = mad_dump_val(IB_PORT_STATE_F, state, 64, &istate);
	phys_txt = mad_dump_val(IB_PORT_PHYS_STATE_F, physstate, 64,
				&iphystate);

	/* C14-24.2.1 states that a down port allows for invalid data to be
	 * returned for all PortInfo components except PortState and
	 * PortPhysicalState */
	if (istate == IB_LINK_DOWN) {
		snprintf(link_str, sizeof(link_str), "( %6s/%8s)",
			 state_txt, phys_txt);
	} else {
		char *width_txt;

		if (espeed)
			mad_dump_val(IB_PORT_LINK_SPEED_EXT_ACTIVE_F, speed,
				     64, &espeed);
		else if (fdr10)
			strcpy(speed, "10.0 Gbps (FDR10)");
		else
			mad_dump_val(IB_PORT_LINK_SPEED_ACTIVE_F, speed,
				     64, &ispeed);

		width_txt = mad_dump_val(IB_PORT_LINK_WIDTH_ACTIVE_F, width,
					 64, &iwidth);
		snprintf(link_str, sizeof(link_str), "(%3s %18s %6s/%8s)",
			 width_txt, speed, state_txt, phys_txt);
	}

	peer = port->remoteport;
	if (!peer) {
		snprintf(remote_str, sizeof(remote_str),
			 " [ ] \"\" ( )\n");
	} else {
		char *rem_node_name = NULL;
		int peer_lid;

		if (peer->ext_portnum)
			snprintf(ext_port_str, sizeof(ext_port_str), "%d",
				 peer->ext_portnum);
		else
			ext_port_str[0] = '\0';

		get_max_msg(width_msg, speed_msg, 256, port);

		rem_node_name = remap_node_name(node_name_map,
						peer->node->guid,
						peer->node->nodedesc);

		/* fall back to the node's SMA LID when the port has none */
		peer_lid = peer->base_lid ? peer->base_lid :
			   peer->node->smalid;

		snprintf(remote_str, sizeof(remote_str),
			 "0x%016" PRIx64 " %6d %4d[%2s] \"%s\" (%s %s)\n",
			 peer->guid, peer_lid, peer->portnum, ext_port_str,
			 rem_node_name, width_msg, speed_msg);

		free(rem_node_name);
	}

	/* ext_port_str is reused here for the local port */
	if (port->ext_portnum)
		snprintf(ext_port_str, sizeof(ext_port_str), "%d",
			 port->ext_portnum);
	else
		ext_port_str[0] = '\0';

	printf(" Link info: %6d",
	       node->type == IB_NODE_SWITCH ? node->smalid : port->base_lid);

	printf("%4d[%2s] ==%s==> %s",
	       port->portnum, ext_port_str, link_str, remote_str);
}
288
suppress(enum MAD_FIELDS field)289 static int suppress(enum MAD_FIELDS field)
290 {
291 int i = 0;
292 for (i = 0; i < sup_total; i++)
293 if (field == suppressed_fields[i])
294 return 1;
295 return 0;
296 }
297
report_suppressed(void)298 static void report_suppressed(void)
299 {
300 int i = 0;
301 printf("## Suppressed:");
302 for (i = 0; i < sup_total; i++)
303 printf(" %s", mad_field_name(suppressed_fields[i]));
304 printf("\n");
305 }
306
print_summary(void)307 static int print_summary(void)
308 {
309 printf("\n## Summary: %d nodes checked, %d bad nodes found\n",
310 summary.nodes_checked, summary.bad_nodes);
311 printf("## %d ports checked, %d ports have errors beyond threshold\n",
312 summary.ports_checked, summary.bad_ports);
313 printf("## %s\n", threshold_str);
314 if (summary.pma_query_failures)
315 printf("## %d PMA query failures\n", summary.pma_query_failures);
316 report_suppressed();
317 return (summary.bad_ports);
318 }
319
insert_lid2sl_table(struct sa_query_result * r)320 static void insert_lid2sl_table(struct sa_query_result *r)
321 {
322 unsigned int i;
323 for (i = 0; i < r->result_cnt; i++) {
324 ib_path_rec_t *p_pr = (ib_path_rec_t *)sa_get_query_rec(r->p_result_madw, i);
325 lid2sl_table[cl_ntoh16(p_pr->dlid)] = ib_path_rec_sl(p_pr);
326 }
327 }
328
/*
 * Ask the SA for path records starting at "sgid" and feed the answers to
 * insert_lid2sl_table().  When "dguid" is non-zero the query is narrowed to
 * the destination GID built from sgid with its GUID field replaced by
 * dguid.  As a side effect ibd_timeout is set to
 * DEFAULT_HALF_WORLD_PR_TIMEOUT.
 *
 * Returns 0 on success, -1 if no SA handle could be obtained, the
 * sa_query() error code if the query itself failed, or EIO if the SA
 * answered with a non-success status.
 */
static int path_record_query(ib_gid_t sgid,uint64_t dguid)
{
	ib_path_rec_t pr;
	/* NOTE(review): the CHECK_AND_SET_* macros appear to update a local
	 * named comp_mask — confirm in ibdiag_sa.h before renaming it */
	ib_net64_t comp_mask = 0;
	uint8_t reversible = 0;
	struct sa_handle *handle;
	struct sa_query_result result;
	int rc;

	handle = sa_get_handle();
	if (!handle)
		return -1;

	ibd_timeout = DEFAULT_HALF_WORLD_PR_TIMEOUT;
	memset(&pr, 0, sizeof(pr));

	CHECK_AND_SET_GID(sgid, pr.sgid, PR, SGID);
	if (dguid) {
		mad_encode_field(sgid.raw, IB_GID_GUID_F, &dguid);
		CHECK_AND_SET_GID(sgid, pr.dgid, PR, DGID);
	}

	/* to get only one PathRecord for each source and destination pair */
	CHECK_AND_SET_VAL(1, 8, -1, pr.num_path, PR, NUMBPATH);
	/* for a reversible path */
	CHECK_AND_SET_VAL(1, 8, -1, reversible, PR, REVERSIBLE);
	pr.num_path |= reversible << 7;

	rc = sa_query(handle, IB_MAD_METHOD_GET_TABLE,
		      (uint16_t)IB_SA_ATTR_PATHRECORD, 0,
		      cl_ntoh64(comp_mask), ibd_sakey,
		      &pr, sizeof(pr), &result);
	if (rc) {
		/* there is no result to release when the query itself failed */
		sa_free_handle(handle);
		fprintf(stderr, "Query SA failed: %s; sa call path_query failed\n", strerror(rc));
		return rc;
	}

	if (result.status == IB_SA_MAD_STATUS_SUCCESS) {
		insert_lid2sl_table(&result);
	} else {
		sa_report_err(result.status);
		rc = EIO;
	}

	sa_free_handle(handle);
	sa_free_result_mad(&result);
	return rc;
}
372
/*
 * Query PMA attribute "attr_id" on "portid"/"portnum" and append
 * " [name == value]" to "buf" for every non-zero field in
 * [start_field, end_field).
 *
 * buf/size   - destination buffer and its capacity in bytes
 * attr_name  - attribute name used in the warning on query failure
 * node_name  - node name used in the warning on query failure
 *
 * Returns the number of characters stored in buf (excluding the
 * terminating NUL); 0 if the query failed or nothing was written.  The
 * result never exceeds size - 1: output that does not fit is truncated
 * rather than letting the running offset run past the end of the buffer.
 */
static int query_and_dump(char *buf, size_t size, ib_portid_t * portid,
			  char *node_name, int portnum,
			  const char *attr_name, uint16_t attr_id,
			  int start_field, int end_field)
{
	uint8_t pc[1024];
	uint32_t val = 0;
	int i, n = 0;

	memset(pc, 0, sizeof(pc));

	if (!pma_query_via(pc, portid, portnum, ibd_timeout, attr_id,
			   ibmad_port)) {
		IBWARN("%s query failed on %s, %s port %d", attr_name,
		       node_name, portid2str(portid), portnum);
		summary.pma_query_failures++;
		return 0;
	}

	for (i = start_field; i < end_field; i++) {
		int len;

		mad_decode_field(pc, i, (void *)&val);
		if (!val)
			continue;

		/* need room for at least one character plus the NUL */
		if ((size_t)n + 1 >= size)
			break;

		len = snprintf(buf + n, size - n, " [%s == %u]",
			       mad_field_name(i), val);
		if (len < 0)
			break;
		if ((size_t)len >= size - (size_t)n) {
			/* truncated: the buffer is full up to its last byte */
			n = (int)(size - 1);
			break;
		}
		n += len;
	}

	return n;
}
401
402
/*
 * Advance a running buffer offset after an snprintf-style write.
 * "used" must be smaller than "size".  Returns used + wrote, clamped to
 * size - 1 when the write was truncated; a non-positive "wrote" leaves the
 * offset unchanged.
 */
static int buf_advance(int used, int wrote, size_t size)
{
	if (wrote <= 0)
		return used;
	if ((size_t)wrote >= size - (size_t)used)
		return (int)size - 1;
	return used + wrote;
}

/*
 * Report the error counters of one port that exceed their thresholds.
 *
 * pc             - PortCounters data for the port
 * pce            - PortCountersExtended data, or NULL when not available
 * portnum        - port number, or 0xFF for the "all ports" aggregate
 * header_printed - in/out flag; the per-node header is printed once
 * cap_mask       - PMA capability mask, selects the extended field range
 *
 * Fields listed in suppressed_fields are skipped.  With "details" set,
 * PortXmitDiscards and PortRcvErrors are followed by their detail
 * counters.  With "data_counters" set, non-zero data counters are added
 * for ports that have errors.  Updates summary.bad_nodes / bad_ports.
 *
 * Returns the length of the generated error text; 0 means no counter
 * exceeded its threshold and nothing was printed.
 */
static int print_results(ib_portid_t * portid, char *node_name,
			 ibnd_node_t * node, uint8_t * pc, int portnum,
			 int *header_printed, uint8_t *pce, uint16_t cap_mask)
{
	char buf[1024];
	char *str = buf;
	uint32_t val = 0;
	int i, n;

	for (n = 0, i = IB_PC_ERR_SYM_F; i <= IB_PC_VL15_DROPPED_F; i++) {
		if (suppress(i))
			continue;

		/* this is not a counter, skip it */
		if (i == IB_PC_COUNTER_SELECT2_F)
			continue;

		mad_decode_field(pc, i, (void *)&val);
		if (!exceeds_threshold(i, val))
			continue;

		n = buf_advance(n,
				snprintf(str + n, sizeof(buf) - n,
					 " [%s == %u]",
					 mad_field_name(i), val),
				sizeof(buf));

		if (!details)
			continue;

		/*
		 * Each detail attribute is decoded with its own field
		 * range: the IB_PC_XMT_*_DISC fields for
		 * PortXmitDiscardDetails and the IB_PC_RCV_*_ERR fields
		 * for PortRcvErrorDetails.
		 */
		if (i == IB_PC_XMT_DISCARDS_F) {
			/* If there are PortXmitDiscards, get details (if supported) */
			n = buf_advance(n,
					query_and_dump(str + n,
						       sizeof(buf) - n, portid,
						       node_name, portnum,
						       "PortXmitDiscardDetails",
						       IB_GSI_PORT_XMIT_DISCARD_DETAILS,
						       IB_PC_XMT_INACT_DISC_F,
						       IB_PC_XMT_DISC_LAST_F),
					sizeof(buf));
		} else if (i == IB_PC_ERR_RCV_F) {
			/* If there are PortRcvErrors, get details (if supported) */
			n = buf_advance(n,
					query_and_dump(str + n,
						       sizeof(buf) - n, portid,
						       node_name, portnum,
						       "PortRcvErrorDetails",
						       IB_GSI_PORT_RCV_ERROR_DETAILS,
						       IB_PC_RCV_LOCAL_PHY_ERR_F,
						       IB_PC_RCV_ERR_LAST_F),
					sizeof(buf));
		}
	}

	if (!suppress(IB_PC_XMT_WAIT_F)) {
		mad_decode_field(pc, IB_PC_XMT_WAIT_F, (void *)&val);
		if (exceeds_threshold(IB_PC_XMT_WAIT_F, val))
			n = buf_advance(n,
					snprintf(str + n, sizeof(buf) - n,
						 " [%s == %u]",
						 mad_field_name(IB_PC_XMT_WAIT_F),
						 val),
					sizeof(buf));
	}

	/* nothing beyond threshold: nothing to print */
	if (n == 0)
		return (n);

	if (data_counters) {
		uint8_t *pkt = pc;
		int start_field = IB_PC_XMT_BYTES_F;
		int end_field = IB_PC_RCV_PKTS_F;

		if (pce) {
			pkt = pce;
			start_field = IB_PC_EXT_XMT_BYTES_F;
			if (cap_mask & IB_PM_EXT_WIDTH_SUPPORTED)
				end_field = IB_PC_EXT_RCV_MPKTS_F;
			else
				end_field = IB_PC_EXT_RCV_PKTS_F;
		}

		for (i = start_field; i <= end_field; i++) {
			uint64_t val64 = 0;
			float scaled = 0;
			char *unit = "";
			int data = 0;

			mad_decode_field(pkt, i, (void *)&val64);
			if (!val64)
				continue;

			/* byte counters are flagged as data for the conversion */
			if (i == IB_PC_EXT_XMT_BYTES_F ||
			    i == IB_PC_EXT_RCV_BYTES_F ||
			    i == IB_PC_XMT_BYTES_F ||
			    i == IB_PC_RCV_BYTES_F)
				data = 1;

			unit = conv_cnt_human_readable(val64, &scaled, data);
			n = buf_advance(n,
					snprintf(str + n, sizeof(buf) - n,
						 " [%s == %" PRIu64
						 " (%5.3f%s)]",
						 mad_field_name(i), val64,
						 scaled, unit),
					sizeof(buf));
		}
	}

	if (!*header_printed) {
		if (node->type == IB_NODE_SWITCH)
			printf("Errors for 0x%" PRIx64 " \"%s\"\n",
			       node->ports[0]->guid, node_name);
		else
			printf("Errors for \"%s\"\n", node_name);
		*header_printed = 1;
		summary.bad_nodes++;
	}

	if (portnum == 0xFF) {
		if (node->type == IB_NODE_SWITCH)
			printf(" GUID 0x%" PRIx64 " port ALL:%s\n",
			       node->ports[0]->guid, str);
	} else {
		printf(" GUID 0x%" PRIx64 " port %d:%s\n",
		       node->ports[portnum]->guid, portnum, str);
		if (port_config)
			print_port_config(node, portnum);
		summary.bad_ports++;
	}

	return (n);
}
515
/*
 * Read the PMA ClassPortInfo of "portid"/"portnum" and store its
 * CapabilityMask, exactly as found in the reply bytes, in *cap_mask.
 * portid->sl is first set from lid2sl_table.
 *
 * Returns 0 on success; on query failure warns, counts the failure in
 * summary.pma_query_failures and returns -1 with *cap_mask untouched.
 */
static int query_cap_mask(ib_portid_t * portid, char *node_name, int portnum,
			  uint16_t * cap_mask)
{
	uint8_t reply[1024] = { 0 };

	portid->sl = lid2sl_table[portid->lid];

	/* PerfMgt ClassPortInfo is a required attribute */
	if (pma_query_via(reply, portid, portnum, ibd_timeout,
			  CLASS_PORT_INFO, ibmad_port)) {
		/* ClassPortInfo should be supported as part of libibmad */
		/* CapabilityMask */
		memcpy(cap_mask, reply + 2, sizeof(*cap_mask));
		return 0;
	}

	IBWARN("classportinfo query failed on %s, %s port %d",
	       node_name, portid2str(portid), portnum);
	summary.pma_query_failures++;
	return -1;
}
539
/*
 * Query and print the data counters of one port.
 *
 * When cap_mask advertises extended counters PortCountersExtended is
 * queried, otherwise PortCounters.  Every field of the selected range is
 * printed, including zero values, followed by the link description when
 * port_config is set and portnum is not the 0xFF "all ports" selector.
 *
 * Returns 0 on success, 1 if the PMA query failed.
 */
static int print_data_cnts(ib_portid_t * portid, uint16_t cap_mask,
			   char *node_name, ibnd_node_t * node, int portnum,
			   int *header_printed)
{
	uint8_t pc[1024] = { 0 };
	int field;
	int first_field;
	int last_field;

	portid->sl = lid2sl_table[portid->lid];

	if (cap_mask & (IB_PM_EXT_WIDTH_SUPPORTED | IB_PM_EXT_WIDTH_NOIETF_SUP)) {
		if (!pma_query_via(pc, portid, portnum, ibd_timeout,
				   IB_GSI_PORT_COUNTERS_EXT, ibmad_port)) {
			IBWARN("IB_GSI_PORT_COUNTERS_EXT query failed on %s, %s port %d",
			       node_name, portid2str(portid), portnum);
			summary.pma_query_failures++;
			return (1);
		}
		first_field = IB_PC_EXT_XMT_BYTES_F;
		last_field = (cap_mask & IB_PM_EXT_WIDTH_SUPPORTED) ?
		    IB_PC_EXT_RCV_MPKTS_F : IB_PC_EXT_RCV_PKTS_F;
	} else {
		if (!pma_query_via(pc, portid, portnum, ibd_timeout,
				   IB_GSI_PORT_COUNTERS, ibmad_port)) {
			IBWARN("IB_GSI_PORT_COUNTERS query failed on %s, %s port %d",
			       node_name, portid2str(portid), portnum);
			summary.pma_query_failures++;
			return (1);
		}
		first_field = IB_PC_XMT_BYTES_F;
		last_field = IB_PC_RCV_PKTS_F;
	}

	if (!*header_printed) {
		printf("Data Counters for 0x%" PRIx64 " \"%s\"\n", node->guid,
		       node_name);
		*header_printed = 1;
	}

	if (portnum != 0xFF)
		printf(" GUID 0x%" PRIx64 " port %d:",
		       node->guid, portnum);
	else
		printf(" GUID 0x%" PRIx64 " port ALL:", node->guid);

	for (field = first_field; field <= last_field; field++) {
		uint64_t raw = 0;
		float scaled = 0;
		char *unit = "";
		int is_bytes;

		mad_decode_field(pc, field, (void *)&raw);

		/* byte counters are flagged as data for the conversion */
		is_bytes = (field == IB_PC_EXT_XMT_BYTES_F ||
			    field == IB_PC_EXT_RCV_BYTES_F ||
			    field == IB_PC_XMT_BYTES_F ||
			    field == IB_PC_RCV_BYTES_F) ? 1 : 0;

		unit = conv_cnt_human_readable(raw, &scaled, is_bytes);
		printf(" [%s == %" PRIu64 " (%5.3f%s)]",
		       mad_field_name(field), raw, scaled, unit);
	}
	printf("\n");

	if (port_config && portnum != 0xFF)
		print_port_config(node, portnum);

	return (0);
}
610
/*
 * Query PortCounters (and PortCountersExtended when cap_mask advertises
 * it) for one port and hand the data to print_results().
 *
 * PortXmitWait is forced to zero when cap_mask does not advertise it, so
 * it can never be reported.
 *
 * Returns the print_results() value, or 0 when either PMA query failed.
 */
static int print_errors(ib_portid_t * portid, uint16_t cap_mask,
			char *node_name, ibnd_node_t * node, int portnum,
			int *header_printed)
{
	uint8_t counters[1024] = { 0 };
	uint8_t ext_counters[1024] = { 0 };
	uint8_t *ext = NULL;

	portid->sl = lid2sl_table[portid->lid];

	if (!pma_query_via(counters, portid, portnum, ibd_timeout,
			   IB_GSI_PORT_COUNTERS, ibmad_port)) {
		IBWARN("IB_GSI_PORT_COUNTERS query failed on %s, %s port %d",
		       node_name, portid2str(portid), portnum);
		summary.pma_query_failures++;
		return (0);
	}

	if (cap_mask & (IB_PM_EXT_WIDTH_SUPPORTED | IB_PM_EXT_WIDTH_NOIETF_SUP)) {
		if (!pma_query_via(ext_counters, portid, portnum, ibd_timeout,
				   IB_GSI_PORT_COUNTERS_EXT, ibmad_port)) {
			IBWARN("IB_GSI_PORT_COUNTERS_EXT query failed on %s, %s port %d",
			       node_name, portid2str(portid), portnum);
			summary.pma_query_failures++;
			return (0);
		}
		ext = ext_counters;
	}

	if ((cap_mask & IB_PM_PC_XMIT_WAIT_SUP) == 0) {
		/* if PortCounters:PortXmitWait not supported clear this counter */
		uint32_t zero = 0;

		mad_encode_field(counters, IB_PC_XMT_WAIT_F, &zero);
	}

	return (print_results(portid, node_name, node, counters, portnum,
			      header_printed, ext, cap_mask));
}
651
/*
 * Send a PortCountersExtended Set to "dest" carrying port select "port"
 * and counter select "mask"; a zero mask is replaced by all ones.
 *
 * rcvbuf is cleared, used to build the request and receives the reply.
 * dest->qp and dest->qkey are filled with QP1 defaults when they are zero.
 *
 * Returns the mad_rpc() result, or NULL when dest->lid is -1.
 */
uint8_t *reset_pc_ext(void *rcvbuf, ib_portid_t * dest,
		      int port, unsigned mask, unsigned timeout,
		      const struct ibmad_port * srcport)
{
	ib_rpc_t rpc = { 0 };
	int lid = dest->lid;

	DEBUG("lid %u port %d mask 0x%x", lid, port, mask);

	if (lid == -1) {
		IBWARN("only lid routed is supported");
		return NULL;
	}

	if (mask == 0)
		mask = ~0;

	/* request header */
	rpc.mgtclass = IB_PERFORMANCE_CLASS;
	rpc.method = IB_MAD_METHOD_SET;
	rpc.attr.id = IB_GSI_PORT_COUNTERS_EXT;
	rpc.attr.mod = 0;
	rpc.timeout = timeout;
	rpc.datasz = IB_PC_DATA_SZ;
	rpc.dataoffs = IB_PC_DATA_OFFS;

	/* request payload */
	memset(rcvbuf, 0, IB_MAD_SIZE);
	/* Same for attribute IDs */
	mad_set_field(rcvbuf, 0, IB_PC_EXT_PORT_SELECT_F, port);
	mad_set_field(rcvbuf, 0, IB_PC_EXT_COUNTER_SELECT_F, mask);

	if (dest->qp == 0)
		dest->qp = 1;
	if (dest->qkey == 0)
		dest->qkey = IB_DEFAULT_QP1_QKEY;

	return mad_rpc(srcport, &rpc, dest, rcvbuf, rcvbuf);
}
689
/*
 * Reset the counters of "port" on "portid" according to the clear_errors,
 * clear_counts and details flags:
 *   - PortCounters, with a counter-select mask built from the flags;
 *   - the two detail attributes when clear_errors and details are set
 *     (their results are not checked);
 *   - PortCountersExtended when clear_counts is set and cap_mask
 *     advertises extended counters.
 * Failures of the PortCounters and extended resets are reported on stderr.
 */
static void clear_port(ib_portid_t * portid, uint16_t cap_mask,
		       char *node_name, int port)
{
	uint8_t pc[1024] = { 0 };
	/* bits defined in Table 228 PortCounters CounterSelect and
	 * CounterSelect2
	 */
	uint32_t mask = 0;

	if (clear_errors)
		mask |= (cap_mask & IB_PM_PC_XMIT_WAIT_SUP) ?
		    (0xFFF | 0x10000) : 0xFFF;
	if (clear_counts)
		mask |= 0xF000;

	if (mask && !performance_reset_via(pc, portid, port, mask,
					   ibd_timeout, IB_GSI_PORT_COUNTERS,
					   ibmad_port))
		fprintf(stderr, "Failed to reset errors %s port %d\n", node_name,
			port);

	if (clear_errors && details) {
		memset(pc, 0, sizeof(pc));
		performance_reset_via(pc, portid, port, 0xf, ibd_timeout,
				      IB_GSI_PORT_XMIT_DISCARD_DETAILS,
				      ibmad_port);
		memset(pc, 0, sizeof(pc));
		performance_reset_via(pc, portid, port, 0x3f, ibd_timeout,
				      IB_GSI_PORT_RCV_ERROR_DETAILS,
				      ibmad_port);
	}

	if (!clear_counts)
		return;
	if (!(cap_mask &
	      (IB_PM_EXT_WIDTH_SUPPORTED | IB_PM_EXT_WIDTH_NOIETF_SUP)))
		return;

	mask = (cap_mask & IB_PM_EXT_WIDTH_SUPPORTED) ? 0xFF : 0x0F;

	if (!reset_pc_ext(pc, portid, port, mask, ibd_timeout, ibmad_port))
		fprintf(stderr, "Failed to reset extended data counters %s, "
			"%s port %d\n", node_name, portid2str(portid),
			port);
}
739
/*
 * Report one node; used directly and as the ibnd_iter_nodes() callback.
 *
 * Nodes whose type is not selected by node_type_to_print are skipped.
 * Otherwise the PMA capability mask is queried and either data counters
 * (data_counters_only) or error counters are printed per port.  When the
 * node supports the "all ports" selector and the aggregate error query
 * reports nothing, the per-port error queries are skipped.  Counters are
 * cleared per port, or once with port 0xFF when the selector is supported.
 * user_data is not used.
 */
void print_node(ibnd_node_t * node, void *user_data)
{
	int header_printed = 0;
	int p = 0;
	int first_port = 1;
	int type = 0;
	int all_port_sup = 0;
	int is_switch = (node->type == IB_NODE_SWITCH);
	ib_portid_t portid = { 0 };
	uint16_t cap_mask = 0;
	char *node_name = NULL;

	switch (node->type) {
	case IB_NODE_SWITCH:
		type = PRINT_SWITCH;
		break;
	case IB_NODE_CA:
		type = PRINT_CA;
		break;
	case IB_NODE_ROUTER:
		type = PRINT_ROUTER;
		break;
	default:
		break;
	}

	if (!(type & node_type_to_print))
		return;

	/* a switch with an enhanced port 0 has counters on port 0 as well */
	if (is_switch && node->smaenhsp0)
		first_port = 0;

	node_name = remap_node_name(node_name_map, node->guid, node->nodedesc);

	/* pick the port that is asked for the capability mask */
	if (is_switch) {
		ib_portid_set(&portid, node->smalid, 0, 0);
		p = 0;
	} else {
		p = 1;
		while (p <= node->numports && !node->ports[p])
			p++;
		if (p <= node->numports)
			ib_portid_set(&portid, node->ports[p]->base_lid,
				      0, 0);
	}

	if (query_cap_mask(&portid, node_name, p, &cap_mask) == 0 &&
	    (cap_mask & IB_PM_ALL_PORT_SELECT))
		all_port_sup = 1;

	if (!data_counters_only && all_port_sup &&
	    !print_errors(&portid, cap_mask, node_name, node,
			  0xFF, &header_printed)) {
		/* aggregate query found nothing: no per-port queries */
		summary.ports_checked += node->numports;
	} else {
		for (p = first_port; p <= node->numports; p++) {
			if (!node->ports[p])
				continue;

			if (is_switch)
				ib_portid_set(&portid, node->smalid, 0, 0);
			else
				ib_portid_set(&portid,
					      node->ports[p]->base_lid,
					      0, 0);

			if (data_counters_only)
				print_data_cnts(&portid, cap_mask, node_name,
						node, p, &header_printed);
			else
				print_errors(&portid, cap_mask, node_name,
					     node, p, &header_printed);

			summary.ports_checked++;
			if (!all_port_sup)
				clear_port(&portid, cap_mask, node_name, p);
		}
	}

	summary.nodes_checked++;
	if (all_port_sup)
		clear_port(&portid, cap_mask, node_name, 0xFF);

	free(node_name);
}
837
add_suppressed(enum MAD_FIELDS field)838 static void add_suppressed(enum MAD_FIELDS field)
839 {
840 if (sup_total >= SUP_MAX) {
841 IBWARN("Maximum (%d) fields have been suppressed; skipping %s",
842 sup_total, mad_field_name(field));
843 return;
844 }
845 suppressed_fields[sup_total++] = field;
846 }
847
calculate_suppressed_fields(char * str)848 static void calculate_suppressed_fields(char *str)
849 {
850 enum MAD_FIELDS f;
851 char *val, *lasts = NULL;
852 char *tmp = strdup(str);
853
854 val = strtok_r(tmp, ",", &lasts);
855 while (val) {
856 for (f = IB_PC_FIRST_F; f <= IB_PC_LAST_F; f++)
857 if (strcmp(val, mad_field_name(f)) == 0)
858 add_suppressed(f);
859 val = strtok_r(NULL, ",", &lasts);
860 }
861
862 free(tmp);
863 }
864
process_opt(void * context,int ch,char * optarg)865 static int process_opt(void *context, int ch, char *optarg)
866 {
867 struct ibnd_config *cfg = context;
868 switch (ch) {
869 case 's':
870 calculate_suppressed_fields(optarg);
871 break;
872 case 'c':
873 /* Right now this is the only "common" error */
874 add_suppressed(IB_PC_ERR_SWITCH_REL_F);
875 break;
876 case 1:
877 node_name_map_file = strdup(optarg);
878 break;
879 case 2:
880 data_counters++;
881 break;
882 case 3:
883 node_type_to_print |= PRINT_SWITCH;
884 break;
885 case 4:
886 node_type_to_print |= PRINT_CA;
887 break;
888 case 5:
889 node_type_to_print |= PRINT_ROUTER;
890 break;
891 case 6:
892 details = 1;
893 break;
894 case 7:
895 load_cache_file = strdup(optarg);
896 break;
897 case 8:
898 threshold_file = strdup(optarg);
899 break;
900 case 9:
901 data_counters_only = 1;
902 break;
903 case 10:
904 obtain_sl = 0;
905 break;
906 case 'G':
907 case 'S':
908 port_guid_str = optarg;
909 port_guid = strtoull(optarg, 0, 0);
910 break;
911 case 'D':
912 dr_path = strdup(optarg);
913 break;
914 case 'r':
915 port_config++;
916 break;
917 case 'R': /* nop */
918 break;
919 case 'k':
920 clear_errors = 1;
921 break;
922 case 'K':
923 clear_counts = 1;
924 break;
925 case 'o':
926 cfg->max_smps = strtoul(optarg, NULL, 0);
927 break;
928 default:
929 return -1;
930 }
931
932 return 0;
933 }
934
main(int argc,char ** argv)935 int main(int argc, char **argv)
936 {
937 struct ibnd_config config = { 0 };
938 int resolved = -1;
939 ib_portid_t portid = { 0 };
940 ib_portid_t self_portid = { 0 };
941 int rc = 0;
942 ibnd_fabric_t *fabric = NULL;
943 ib_gid_t self_gid;
944 int port = 0;
945
946 int mgmt_classes[4] = { IB_SMI_CLASS, IB_SMI_DIRECT_CLASS, IB_SA_CLASS,
947 IB_PERFORMANCE_CLASS
948 };
949
950 const struct ibdiag_opt opts[] = {
951 {"suppress", 's', 1, "<err1,err2,...>",
952 "suppress errors listed"},
953 {"suppress-common", 'c', 0, NULL,
954 "suppress some of the common counters"},
955 {"node-name-map", 1, 1, "<file>", "node name map file"},
956 {"port-guid", 'G', 1, "<port_guid>",
957 "report the node containing the port specified by <port_guid>"},
958 {"", 'S', 1, "<port_guid>",
959 "Same as \"-G\" for backward compatibility"},
960 {"Direct", 'D', 1, "<dr_path>",
961 "report the node containing the port specified by <dr_path>"},
962 {"skip-sl", 10, 0, NULL,"don't obtain SL to all destinations"},
963 {"report-port", 'r', 0, NULL,
964 "report port link information"},
965 {"threshold-file", 8, 1, NULL,
966 "specify an alternate threshold file, default: " DEF_THRES_FILE},
967 {"GNDN", 'R', 0, NULL,
968 "(This option is obsolete and does nothing)"},
969 {"data", 2, 0, NULL, "include data counters for ports with errors"},
970 {"switch", 3, 0, NULL, "print data for switches only"},
971 {"ca", 4, 0, NULL, "print data for CA's only"},
972 {"router", 5, 0, NULL, "print data for routers only"},
973 {"details", 6, 0, NULL, "include transmit discard details"},
974 {"counters", 9, 0, NULL, "print data counters only"},
975 {"clear-errors", 'k', 0, NULL,
976 "Clear error counters after read"},
977 {"clear-counts", 'K', 0, NULL,
978 "Clear data counters after read"},
979 {"load-cache", 7, 1, "<file>",
980 "filename of ibnetdiscover cache to load"},
981 {"outstanding_smps", 'o', 1, NULL,
982 "specify the number of outstanding SMP's which should be "
983 "issued during the scan"},
984 {0}
985 };
986 char usage_args[] = "";
987
988 memset(suppressed_fields, 0, sizeof suppressed_fields);
989 ibdiag_process_opts(argc, argv, &config, "cDGKLnRrSs", opts, process_opt,
990 usage_args, NULL);
991
992 argc -= optind;
993 argv += optind;
994
995 if (!node_type_to_print)
996 node_type_to_print = PRINT_ALL;
997
998 ibmad_port = mad_rpc_open_port(ibd_ca, ibd_ca_port, mgmt_classes, 4);
999 if (!ibmad_port)
1000 IBEXIT("Failed to open port; %s:%d\n", ibd_ca, ibd_ca_port);
1001
1002 smp_mkey_set(ibmad_port, ibd_mkey);
1003
1004 if (ibd_timeout) {
1005 mad_rpc_set_timeout(ibmad_port, ibd_timeout);
1006 config.timeout_ms = ibd_timeout;
1007 }
1008
1009 config.flags = ibd_ibnetdisc_flags;
1010 config.mkey = ibd_mkey;
1011
1012 if (dr_path && load_cache_file) {
1013 mad_rpc_close_port(ibmad_port);
1014 fprintf(stderr, "Cannot specify cache and direct route path\n");
1015 exit(-1);
1016 }
1017
1018 if (resolve_self(ibd_ca, ibd_ca_port, &self_portid, &port, &self_gid.raw) < 0) {
1019 mad_rpc_close_port(ibmad_port);
1020 IBEXIT("can't resolve self port %s", argv[0]);
1021 }
1022
1023 node_name_map = open_node_name_map(node_name_map_file);
1024
1025 /* limit the scan the fabric around the target */
1026 if (dr_path) {
1027 if ((resolved =
1028 resolve_portid_str(ibd_ca, ibd_ca_port, &portid, dr_path,
1029 IB_DEST_DRPATH, NULL, ibmad_port)) < 0)
1030 IBWARN("Failed to resolve %s; attempting full scan",
1031 dr_path);
1032 } else if (port_guid_str) {
1033 if ((resolved =
1034 resolve_portid_str(ibd_ca, ibd_ca_port, &portid,
1035 port_guid_str, IB_DEST_GUID, ibd_sm_id,
1036 ibmad_port)) < 0)
1037 IBWARN("Failed to resolve %s; attempting full scan",
1038 port_guid_str);
1039 if(obtain_sl)
1040 lid2sl_table[portid.lid] = portid.sl;
1041 }
1042
1043 mad_rpc_close_port(ibmad_port);
1044
1045 if (load_cache_file) {
1046 if ((fabric = ibnd_load_fabric(load_cache_file, 0)) == NULL) {
1047 fprintf(stderr, "loading cached fabric failed\n");
1048 rc = -1;
1049 goto close_port;
1050 }
1051 } else {
1052 if (resolved >= 0) {
1053 if (!config.max_hops)
1054 config.max_hops = 1;
1055 if (!(fabric = ibnd_discover_fabric(ibd_ca, ibd_ca_port,
1056 &portid, &config)))
1057 IBWARN("Single node discover failed;"
1058 " attempting full scan");
1059 }
1060
1061 if (!fabric && !(fabric = ibnd_discover_fabric(ibd_ca,
1062 ibd_ca_port,
1063 NULL,
1064 &config))) {
1065 fprintf(stderr, "discover failed\n");
1066 rc = -1;
1067 goto close_port;
1068 }
1069 }
1070
1071 set_thresholds(threshold_file);
1072
1073 /* reopen the global ibmad_port */
1074 ibmad_port = mad_rpc_open_port(ibd_ca, ibd_ca_port,
1075 mgmt_classes, 4);
1076 if (!ibmad_port) {
1077 ibnd_destroy_fabric(fabric);
1078 close_node_name_map(node_name_map);
1079 IBEXIT("Failed to reopen port: %s:%d\n",
1080 ibd_ca, ibd_ca_port);
1081 }
1082
1083 smp_mkey_set(ibmad_port, ibd_mkey);
1084
1085 if (ibd_timeout)
1086 mad_rpc_set_timeout(ibmad_port, ibd_timeout);
1087
1088 if (port_guid_str) {
1089 ibnd_port_t *port = ibnd_find_port_guid(fabric, port_guid);
1090 if (port)
1091 print_node(port->node, NULL);
1092 else
1093 fprintf(stderr, "Failed to find node: %s\n",
1094 port_guid_str);
1095 } else if (dr_path) {
1096 ibnd_port_t *port;
1097 uint8_t ni[IB_SMP_DATA_SIZE] = { 0 };
1098 if (!smp_query_via(ni, &portid, IB_ATTR_NODE_INFO, 0,
1099 ibd_timeout, ibmad_port)) {
1100 fprintf(stderr, "Failed to query local Node Info\n");
1101 goto destroy_fabric;
1102 }
1103
1104 mad_decode_field(ni, IB_NODE_PORT_GUID_F, &(port_guid));
1105
1106 port = ibnd_find_port_guid(fabric, port_guid);
1107 if (port) {
1108 if(obtain_sl)
1109 if(path_record_query(self_gid,port->guid))
1110 goto destroy_fabric;
1111 print_node(port->node, NULL);
1112 } else
1113 fprintf(stderr, "Failed to find node: %s\n", dr_path);
1114 } else {
1115 if(obtain_sl)
1116 if(path_record_query(self_gid,0))
1117 goto destroy_fabric;
1118
1119 ibnd_iter_nodes(fabric, print_node, NULL);
1120 }
1121
1122 rc = print_summary();
1123 if (rc)
1124 rc = 1;
1125
1126 destroy_fabric:
1127 mad_rpc_close_port(ibmad_port);
1128 ibnd_destroy_fabric(fabric);
1129
1130 close_port:
1131 close_node_name_map(node_name_map);
1132 exit(rc);
1133 }
1134