1 /* 2 * Copyright (c) 2004-2009 Voltaire Inc. All rights reserved. 3 * Copyright (c) 2007 Xsigo Systems Inc. All rights reserved. 4 * Copyright (c) 2008 Lawrence Livermore National Lab. All rights reserved. 5 * Copyright (c) 2009 HNR Consulting. All rights reserved. 6 * Copyright (c) 2010,2011 Mellanox Technologies LTD. All rights reserved. 7 * 8 * This software is available to you under a choice of one of two 9 * licenses. You may choose to be licensed under the terms of the GNU 10 * General Public License (GPL) Version 2, available from the file 11 * COPYING in the main directory of this source tree, or the 12 * OpenIB.org BSD license below: 13 * 14 * Redistribution and use in source and binary forms, with or 15 * without modification, are permitted provided that the following 16 * conditions are met: 17 * 18 * - Redistributions of source code must retain the above 19 * copyright notice, this list of conditions and the following 20 * disclaimer. 21 * 22 * - Redistributions in binary form must reproduce the above 23 * copyright notice, this list of conditions and the following 24 * disclaimer in the documentation and/or other materials 25 * provided with the distribution. 26 * 27 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 28 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 29 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 30 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 31 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN 32 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 33 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 34 * SOFTWARE. 35 * 36 */ 37 38 #if HAVE_CONFIG_H 39 # include <config.h> 40 #endif /* HAVE_CONFIG_H */ 41 42 #define _GNU_SOURCE 43 #include <stdio.h> 44 #include <stdlib.h> 45 #include <unistd.h> 46 #include <stdarg.h> 47 #include <time.h> 48 #include <string.h> 49 #include <getopt.h> 50 #include <errno.h> 51 #include <inttypes.h> 52 53 #include <complib/cl_nodenamemap.h> 54 #include <infiniband/ibnetdisc.h> 55 #include <infiniband/mad.h> 56 57 #include "ibdiag_common.h" 58 #include "ibdiag_sa.h" 59 60 struct ibmad_port *ibmad_port; 61 static char *node_name_map_file = NULL; 62 static nn_map_t *node_name_map = NULL; 63 static char *load_cache_file = NULL; 64 static uint16_t lid2sl_table[sizeof(uint8_t) * 1024 * 48] = { 0 }; 65 static int obtain_sl = 1; 66 67 int data_counters = 0; 68 int data_counters_only = 0; 69 int port_config = 0; 70 uint64_t port_guid = 0; 71 char *port_guid_str = NULL; 72 #define SUP_MAX 64 73 int sup_total = 0; 74 enum MAD_FIELDS suppressed_fields[SUP_MAX]; 75 char *dr_path = NULL; 76 uint8_t node_type_to_print = 0; 77 unsigned clear_errors = 0, clear_counts = 0, details = 0; 78 79 #define PRINT_SWITCH 0x1 80 #define PRINT_CA 0x2 81 #define PRINT_ROUTER 0x4 82 #define PRINT_ALL 0xFF /* all nodes default flag */ 83 84 #define DEFAULT_HALF_WORLD_PR_TIMEOUT (3000) 85 86 struct { 87 int nodes_checked; 88 int bad_nodes; 89 int ports_checked; 90 int bad_ports; 91 int pma_query_failures; 92 } summary = { 0 }; 93 94 #define DEF_THRES_FILE IBDIAG_CONFIG_PATH"/error_thresholds" 95 static char *threshold_file = DEF_THRES_FILE; 96 97 /* define a "packet" with threshold values in it */ 98 uint8_t thresholds[1204] = { 0 }; 99 char * threshold_str = ""; 100 101 static unsigned valid_gid(ib_gid_t * gid) 102 { 103 ib_gid_t zero_gid; 104 memset(&zero_gid, 0, sizeof zero_gid); 105 return memcmp(&zero_gid, gid, sizeof(*gid)); 106 } 107 108 static void set_thres(char *name, uint32_t val) 109 { 110 int f; 111 int n; 112 char tmp[256]; 113 for (f = IB_PC_FIRST_F; f <= IB_PC_LAST_F; f++) { 114 if (strcmp(name, mad_field_name(f)) == 0) { 115 mad_encode_field(thresholds, f, &val); 116 snprintf(tmp, 255, "[%s = %u]", name, val); 117 threshold_str = realloc(threshold_str, 118 strlen(threshold_str)+strlen(tmp)+1); 119 if (!threshold_str) { 120 fprintf(stderr, "Failed to allocate memory: " 121 "%s\n", strerror(errno)); 122 exit(1); 123 } 124 n = strlen(threshold_str); 125 strcpy(threshold_str+n, tmp); 126 } 127 } 128 } 129 130 static void set_thresholds(char *threshold_file) 131 { 132 char buf[1024]; 133 int val = 0; 134 FILE *thresf = fopen(threshold_file, "r"); 135 char *p_prefix, *p_last; 136 char *name; 137 char *val_str; 138 char str[64]; 139 140 if (!thresf) 141 return; 142 143 snprintf(str, 63, "Thresholds: "); 144 threshold_str = malloc(strlen(str)+1); 145 if (!threshold_str) { 146 fprintf(stderr, "Failed to allocate memory: %s\n", 147 strerror(errno)); 148 exit(1); 149 } 150 strcpy(threshold_str, str); 151 while (fgets(buf, sizeof buf, thresf) != NULL) { 152 p_prefix = strtok_r(buf, "\n", &p_last); 153 if (!p_prefix) 154 continue; /* ignore blank lines */ 155 156 if (*p_prefix == '#') 157 continue; /* ignore comment lines */ 158 159 name = strtok_r(p_prefix, "=", &p_last); 160 val_str = strtok_r(NULL, "\n", &p_last); 161 162 val = strtoul(val_str, NULL, 0); 163 set_thres(name, val); 164 } 165 166 fclose(thresf); 167 } 168 169 static int exceeds_threshold(int field, unsigned val) 170 { 171 uint32_t thres = 0; 172 mad_decode_field(thresholds, field, &thres); 173 return (val > thres); 174 } 175 176 static void print_port_config(ibnd_node_t * node, int portnum) 177 { 178 char width[64], speed[64], state[64], physstate[64]; 179 char remote_str[256]; 180 char link_str[256]; 181 char width_msg[256]; 182 char speed_msg[256]; 183 char ext_port_str[256]; 184 int iwidth, ispeed, fdr10, espeed, istate, iphystate, cap_mask; 185 uint8_t *info; 186 187 ibnd_port_t *port = node->ports[portnum]; 188 189 if (!port) 190 return; 191 192 iwidth = mad_get_field(port->info, 0, IB_PORT_LINK_WIDTH_ACTIVE_F); 193 ispeed = mad_get_field(port->info, 0, IB_PORT_LINK_SPEED_ACTIVE_F); 194 fdr10 = mad_get_field(port->ext_info, 0, 195 IB_MLNX_EXT_PORT_LINK_SPEED_ACTIVE_F) & FDR10; 196 197 if (port->node->type == IB_NODE_SWITCH) 198 info = (uint8_t *)&port->node->ports[0]->info; 199 else 200 info = (uint8_t *)&port->info; 201 cap_mask = mad_get_field(info, 0, IB_PORT_CAPMASK_F); 202 if (cap_mask & CL_NTOH32(IB_PORT_CAP_HAS_EXT_SPEEDS)) 203 espeed = mad_get_field(port->info, 0, 204 IB_PORT_LINK_SPEED_EXT_ACTIVE_F); 205 else 206 espeed = 0; 207 istate = mad_get_field(port->info, 0, IB_PORT_STATE_F); 208 iphystate = mad_get_field(port->info, 0, IB_PORT_PHYS_STATE_F); 209 210 remote_str[0] = '\0'; 211 link_str[0] = '\0'; 212 width_msg[0] = '\0'; 213 speed_msg[0] = '\0'; 214 215 /* C14-24.2.1 states that a down port allows for invalid data to be 216 * returned for all PortInfo components except PortState and 217 * PortPhysicalState */ 218 if (istate != IB_LINK_DOWN) { 219 if (!espeed) { 220 if (fdr10) 221 sprintf(speed, "10.0 Gbps (FDR10)"); 222 else 223 mad_dump_val(IB_PORT_LINK_SPEED_ACTIVE_F, speed, 224 64, &ispeed); 225 } else 226 mad_dump_val(IB_PORT_LINK_SPEED_EXT_ACTIVE_F, speed, 227 64, &espeed); 228 229 snprintf(link_str, 256, "(%3s %18s %6s/%8s)", 230 mad_dump_val(IB_PORT_LINK_WIDTH_ACTIVE_F, width, 64, &iwidth), 231 speed, 232 mad_dump_val(IB_PORT_STATE_F, state, 64, &istate), 233 mad_dump_val(IB_PORT_PHYS_STATE_F, physstate, 64, &iphystate)); 234 } else { 235 snprintf(link_str, 256, "( %6s/%8s)", 236 mad_dump_val(IB_PORT_STATE_F, state, 64, &istate), 237 mad_dump_val(IB_PORT_PHYS_STATE_F, physstate, 64, &iphystate)); 238 } 239 240 if (port->remoteport) { 241 char *rem_node_name = NULL; 242 243 if (port->remoteport->ext_portnum) 244 snprintf(ext_port_str, 256, "%d", 245 port->remoteport->ext_portnum); 246 else 247 ext_port_str[0] = '\0'; 248 249 get_max_msg(width_msg, speed_msg, 256, port); 250 251 rem_node_name = remap_node_name(node_name_map, 252 port->remoteport->node->guid, 253 port->remoteport->node-> 254 nodedesc); 255 256 snprintf(remote_str, 256, 257 "0x%016" PRIx64 " %6d %4d[%2s] \"%s\" (%s %s)\n", 258 port->remoteport->guid, 259 port->remoteport->base_lid ? port->remoteport-> 260 base_lid : port->remoteport->node->smalid, 261 port->remoteport->portnum, ext_port_str, rem_node_name, 262 width_msg, speed_msg); 263 264 free(rem_node_name); 265 } else 266 snprintf(remote_str, 256, " [ ] \"\" ( )\n"); 267 268 if (port->ext_portnum) 269 snprintf(ext_port_str, 256, "%d", port->ext_portnum); 270 else 271 ext_port_str[0] = '\0'; 272 273 if (node->type == IB_NODE_SWITCH) 274 printf(" Link info: %6d", node->smalid); 275 else 276 printf(" Link info: %6d", port->base_lid); 277 278 printf("%4d[%2s] ==%s==> %s", 279 port->portnum, ext_port_str, link_str, remote_str); 280 } 281 282 static int suppress(enum MAD_FIELDS field) 283 { 284 int i = 0; 285 for (i = 0; i < sup_total; i++) 286 if (field == suppressed_fields[i]) 287 return 1; 288 return 0; 289 } 290 291 static void report_suppressed(void) 292 { 293 int i = 0; 294 printf("## Suppressed:"); 295 for (i = 0; i < sup_total; i++) 296 printf(" %s", mad_field_name(suppressed_fields[i])); 297 printf("\n"); 298 } 299 300 static int print_summary(void) 301 { 302 printf("\n## Summary: %d nodes checked, %d bad nodes found\n", 303 summary.nodes_checked, summary.bad_nodes); 304 printf("## %d ports checked, %d ports have errors beyond threshold\n", 305 summary.ports_checked, summary.bad_ports); 306 printf("## %s\n", threshold_str); 307 if (summary.pma_query_failures) 308 printf("## %d PMA query failures\n", summary.pma_query_failures); 309 report_suppressed(); 310 return (summary.bad_ports); 311 } 312 313 static void insert_lid2sl_table(struct sa_query_result *r) 314 { 315 unsigned int i; 316 for (i = 0; i < r->result_cnt; i++) { 317 ib_path_rec_t *p_pr = (ib_path_rec_t *)sa_get_query_rec(r->p_result_madw, i); 318 lid2sl_table[cl_ntoh16(p_pr->dlid)] = ib_path_rec_sl(p_pr); 319 } 320 } 321 322 static int path_record_query(ib_gid_t sgid,uint64_t dguid) 323 { 324 ib_path_rec_t pr; 325 ib_net64_t comp_mask = 0; 326 uint8_t reversible = 0; 327 struct sa_handle * h; 328 329 if (!(h = sa_get_handle())) 330 return -1; 331 332 ibd_timeout = DEFAULT_HALF_WORLD_PR_TIMEOUT; 333 memset(&pr, 0, sizeof(pr)); 334 335 CHECK_AND_SET_GID(sgid, pr.sgid, PR, SGID); 336 if(dguid) { 337 mad_encode_field(sgid.raw, IB_GID_GUID_F, &dguid); 338 CHECK_AND_SET_GID(sgid, pr.dgid, PR, DGID); 339 } 340 341 CHECK_AND_SET_VAL(1, 8, -1, pr.num_path, PR, NUMBPATH);/*to get only one PathRecord for each source and destination pair*/ 342 CHECK_AND_SET_VAL(1, 8, -1, reversible, PR, REVERSIBLE);/*for a reversible path*/ 343 pr.num_path |= reversible << 7; 344 struct sa_query_result result; 345 int ret = sa_query(h, IB_MAD_METHOD_GET_TABLE, 346 (uint16_t)IB_SA_ATTR_PATHRECORD,0,cl_ntoh64(comp_mask),ibd_sakey, 347 &pr, sizeof(pr), &result); 348 if (ret) { 349 sa_free_handle(h); 350 fprintf(stderr, "Query SA failed: %s; sa call path_query failed\n", strerror(ret)); 351 return ret; 352 } 353 if (result.status != IB_SA_MAD_STATUS_SUCCESS) { 354 sa_report_err(result.status); 355 ret = EIO; 356 goto Exit; 357 } 358 359 insert_lid2sl_table(&result); 360 Exit: 361 sa_free_handle(h); 362 sa_free_result_mad(&result); 363 return ret; 364 } 365 366 static int query_and_dump(char *buf, size_t size, ib_portid_t * portid, 367 char *node_name, int portnum, 368 const char *attr_name, uint16_t attr_id, 369 int start_field, int end_field) 370 { 371 uint8_t pc[1024]; 372 uint32_t val = 0; 373 int i, n; 374 375 memset(pc, 0, sizeof(pc)); 376 377 if (!pma_query_via(pc, portid, portnum, ibd_timeout, attr_id, 378 ibmad_port)) { 379 IBWARN("%s query failed on %s, %s port %d", attr_name, 380 node_name, portid2str(portid), portnum); 381 summary.pma_query_failures++; 382 return 0; 383 } 384 385 for (n = 0, i = start_field; i < end_field; i++) { 386 mad_decode_field(pc, i, (void *)&val); 387 if (val) 388 n += snprintf(buf + n, size - n, " [%s == %u]", 389 mad_field_name(i), val); 390 } 391 392 return n; 393 } 394 395 396 static int print_results(ib_portid_t * portid, char *node_name, 397 ibnd_node_t * node, uint8_t * pc, int portnum, 398 int *header_printed, uint8_t *pce, uint16_t cap_mask) 399 { 400 char buf[1024]; 401 char *str = buf; 402 uint32_t val = 0; 403 int i, n; 404 405 for (n = 0, i = IB_PC_ERR_SYM_F; i <= IB_PC_VL15_DROPPED_F; i++) { 406 if (suppress(i)) 407 continue; 408 409 /* this is not a counter, skip it */ 410 if (i == IB_PC_COUNTER_SELECT2_F) 411 continue; 412 413 mad_decode_field(pc, i, (void *)&val); 414 if (exceeds_threshold(i, val)) { 415 n += snprintf(str + n, 1024 - n, " [%s == %u]", 416 mad_field_name(i), val); 417 418 /* If there are PortXmitDiscards, get details (if supported) */ 419 if (i == IB_PC_XMT_DISCARDS_F && details) { 420 n += query_and_dump(str + n, sizeof(buf) - n, portid, 421 node_name, portnum, 422 "PortXmitDiscardDetails", 423 IB_GSI_PORT_XMIT_DISCARD_DETAILS, 424 IB_PC_RCV_LOCAL_PHY_ERR_F, 425 IB_PC_RCV_ERR_LAST_F); 426 /* If there are PortRcvErrors, get details (if supported) */ 427 } else if (i == IB_PC_ERR_RCV_F && details) { 428 n += query_and_dump(str + n, sizeof(buf) - n, portid, 429 node_name, portnum, 430 "PortRcvErrorDetails", 431 IB_GSI_PORT_RCV_ERROR_DETAILS, 432 IB_PC_XMT_INACT_DISC_F, 433 IB_PC_XMT_DISC_LAST_F); 434 } 435 } 436 } 437 438 if (!suppress(IB_PC_XMT_WAIT_F)) { 439 mad_decode_field(pc, IB_PC_XMT_WAIT_F, (void *)&val); 440 if (exceeds_threshold(IB_PC_XMT_WAIT_F, val)) 441 n += snprintf(str + n, 1024 - n, " [%s == %u]", 442 mad_field_name(IB_PC_XMT_WAIT_F), val); 443 } 444 445 /* if we found errors. */ 446 if (n != 0) { 447 if (data_counters) { 448 uint8_t *pkt = pc; 449 int start_field = IB_PC_XMT_BYTES_F; 450 int end_field = IB_PC_RCV_PKTS_F; 451 452 if (pce) { 453 pkt = pce; 454 start_field = IB_PC_EXT_XMT_BYTES_F; 455 if (cap_mask & IB_PM_EXT_WIDTH_SUPPORTED) 456 end_field = IB_PC_EXT_RCV_MPKTS_F; 457 else 458 end_field = IB_PC_EXT_RCV_PKTS_F; 459 } 460 461 for (i = start_field; i <= end_field; i++) { 462 uint64_t val64 = 0; 463 float val = 0; 464 char *unit = ""; 465 mad_decode_field(pkt, i, (void *)&val64); 466 if (val64) { 467 int data = 0; 468 if (i == IB_PC_EXT_XMT_BYTES_F || 469 i == IB_PC_EXT_RCV_BYTES_F || 470 i == IB_PC_XMT_BYTES_F || 471 i == IB_PC_RCV_BYTES_F) 472 data = 1; 473 unit = conv_cnt_human_readable(val64, 474 &val, data); 475 n += snprintf(str + n, 1024 - n, 476 " [%s == %" PRIu64 477 " (%5.3f%s)]", 478 mad_field_name(i), val64, val, 479 unit); 480 } 481 } 482 } 483 484 if (!*header_printed) { 485 if (node->type == IB_NODE_SWITCH) 486 printf("Errors for 0x%" PRIx64 " \"%s\"\n", 487 node->ports[0]->guid, node_name); 488 else 489 printf("Errors for \"%s\"\n", node_name); 490 *header_printed = 1; 491 summary.bad_nodes++; 492 } 493 494 if (portnum == 0xFF) { 495 if (node->type == IB_NODE_SWITCH) 496 printf(" GUID 0x%" PRIx64 " port ALL:%s\n", 497 node->ports[0]->guid, str); 498 } else { 499 printf(" GUID 0x%" PRIx64 " port %d:%s\n", 500 node->ports[portnum]->guid, portnum, str); 501 if (port_config) 502 print_port_config(node, portnum); 503 summary.bad_ports++; 504 } 505 } 506 return (n); 507 } 508 509 static int query_cap_mask(ib_portid_t * portid, char *node_name, int portnum, 510 uint16_t * cap_mask) 511 { 512 uint8_t pc[1024] = { 0 }; 513 uint16_t rc_cap_mask; 514 515 portid->sl = lid2sl_table[portid->lid]; 516 517 /* PerfMgt ClassPortInfo is a required attribute */ 518 if (!pma_query_via(pc, portid, portnum, ibd_timeout, CLASS_PORT_INFO, 519 ibmad_port)) { 520 IBWARN("classportinfo query failed on %s, %s port %d", 521 node_name, portid2str(portid), portnum); 522 summary.pma_query_failures++; 523 return -1; 524 } 525 526 /* ClassPortInfo should be supported as part of libibmad */ 527 memcpy(&rc_cap_mask, pc + 2, sizeof(rc_cap_mask)); /* CapabilityMask */ 528 529 *cap_mask = rc_cap_mask; 530 return 0; 531 } 532 533 static int print_data_cnts(ib_portid_t * portid, uint16_t cap_mask, 534 char *node_name, ibnd_node_t * node, int portnum, 535 int *header_printed) 536 { 537 uint8_t pc[1024]; 538 int i; 539 int start_field = IB_PC_XMT_BYTES_F; 540 int end_field = IB_PC_RCV_PKTS_F; 541 542 memset(pc, 0, 1024); 543 544 portid->sl = lid2sl_table[portid->lid]; 545 546 if (cap_mask & (IB_PM_EXT_WIDTH_SUPPORTED | IB_PM_EXT_WIDTH_NOIETF_SUP)) { 547 if (!pma_query_via(pc, portid, portnum, ibd_timeout, 548 IB_GSI_PORT_COUNTERS_EXT, ibmad_port)) { 549 IBWARN("IB_GSI_PORT_COUNTERS_EXT query failed on %s, %s port %d", 550 node_name, portid2str(portid), portnum); 551 summary.pma_query_failures++; 552 return (1); 553 } 554 start_field = IB_PC_EXT_XMT_BYTES_F; 555 if (cap_mask & IB_PM_EXT_WIDTH_SUPPORTED) 556 end_field = IB_PC_EXT_RCV_MPKTS_F; 557 else 558 end_field = IB_PC_EXT_RCV_PKTS_F; 559 } else { 560 if (!pma_query_via(pc, portid, portnum, ibd_timeout, 561 IB_GSI_PORT_COUNTERS, ibmad_port)) { 562 IBWARN("IB_GSI_PORT_COUNTERS query failed on %s, %s port %d", 563 node_name, portid2str(portid), portnum); 564 summary.pma_query_failures++; 565 return (1); 566 } 567 start_field = IB_PC_XMT_BYTES_F; 568 end_field = IB_PC_RCV_PKTS_F; 569 } 570 571 if (!*header_printed) { 572 printf("Data Counters for 0x%" PRIx64 " \"%s\"\n", node->guid, 573 node_name); 574 *header_printed = 1; 575 } 576 577 if (portnum == 0xFF) 578 printf(" GUID 0x%" PRIx64 " port ALL:", node->guid); 579 else 580 printf(" GUID 0x%" PRIx64 " port %d:", 581 node->guid, portnum); 582 583 for (i = start_field; i <= end_field; i++) { 584 uint64_t val64 = 0; 585 float val = 0; 586 char *unit = ""; 587 int data = 0; 588 mad_decode_field(pc, i, (void *)&val64); 589 if (i == IB_PC_EXT_XMT_BYTES_F || i == IB_PC_EXT_RCV_BYTES_F || 590 i == IB_PC_XMT_BYTES_F || i == IB_PC_RCV_BYTES_F) 591 data = 1; 592 unit = conv_cnt_human_readable(val64, &val, data); 593 printf(" [%s == %" PRIu64 " (%5.3f%s)]", mad_field_name(i), 594 val64, val, unit); 595 } 596 printf("\n"); 597 598 if (portnum != 0xFF && port_config) 599 print_port_config(node, portnum); 600 601 return (0); 602 } 603 604 static int print_errors(ib_portid_t * portid, uint16_t cap_mask, 605 char *node_name, ibnd_node_t * node, int portnum, 606 int *header_printed) 607 { 608 uint8_t pc[1024]; 609 uint8_t pce[1024]; 610 uint8_t *pc_ext = NULL; 611 612 memset(pc, 0, 1024); 613 memset(pce, 0, 1024); 614 615 portid->sl = lid2sl_table[portid->lid]; 616 617 if (!pma_query_via(pc, portid, portnum, ibd_timeout, 618 IB_GSI_PORT_COUNTERS, ibmad_port)) { 619 IBWARN("IB_GSI_PORT_COUNTERS query failed on %s, %s port %d", 620 node_name, portid2str(portid), portnum); 621 summary.pma_query_failures++; 622 return (0); 623 } 624 625 if (cap_mask & (IB_PM_EXT_WIDTH_SUPPORTED | IB_PM_EXT_WIDTH_NOIETF_SUP)) { 626 if (!pma_query_via(pce, portid, portnum, ibd_timeout, 627 IB_GSI_PORT_COUNTERS_EXT, ibmad_port)) { 628 IBWARN("IB_GSI_PORT_COUNTERS_EXT query failed on %s, %s port %d", 629 node_name, portid2str(portid), portnum); 630 summary.pma_query_failures++; 631 return (0); 632 } 633 pc_ext = pce; 634 } 635 636 if (!(cap_mask & IB_PM_PC_XMIT_WAIT_SUP)) { 637 /* if PortCounters:PortXmitWait not supported clear this counter */ 638 uint32_t foo = 0; 639 mad_encode_field(pc, IB_PC_XMT_WAIT_F, &foo); 640 } 641 return (print_results(portid, node_name, node, pc, portnum, 642 header_printed, pc_ext, cap_mask)); 643 } 644 645 uint8_t *reset_pc_ext(void *rcvbuf, ib_portid_t * dest, 646 int port, unsigned mask, unsigned timeout, 647 const struct ibmad_port * srcport) 648 { 649 ib_rpc_t rpc = { 0 }; 650 int lid = dest->lid; 651 652 DEBUG("lid %u port %d mask 0x%x", lid, port, mask); 653 654 if (lid == -1) { 655 IBWARN("only lid routed is supported"); 656 return NULL; 657 } 658 659 if (!mask) 660 mask = ~0; 661 662 rpc.mgtclass = IB_PERFORMANCE_CLASS; 663 rpc.method = IB_MAD_METHOD_SET; 664 rpc.attr.id = IB_GSI_PORT_COUNTERS_EXT; 665 666 memset(rcvbuf, 0, IB_MAD_SIZE); 667 668 /* Same for attribute IDs */ 669 mad_set_field(rcvbuf, 0, IB_PC_EXT_PORT_SELECT_F, port); 670 mad_set_field(rcvbuf, 0, IB_PC_EXT_COUNTER_SELECT_F, mask); 671 rpc.attr.mod = 0; 672 rpc.timeout = timeout; 673 rpc.datasz = IB_PC_DATA_SZ; 674 rpc.dataoffs = IB_PC_DATA_OFFS; 675 if (!dest->qp) 676 dest->qp = 1; 677 if (!dest->qkey) 678 dest->qkey = IB_DEFAULT_QP1_QKEY; 679 680 return mad_rpc(srcport, &rpc, dest, rcvbuf, rcvbuf); 681 } 682 683 static void clear_port(ib_portid_t * portid, uint16_t cap_mask, 684 char *node_name, int port) 685 { 686 uint8_t pc[1024] = { 0 }; 687 /* bits defined in Table 228 PortCounters CounterSelect and 688 * CounterSelect2 689 */ 690 uint32_t mask = 0; 691 692 if (clear_errors) { 693 mask |= 0xFFF; 694 if (cap_mask & IB_PM_PC_XMIT_WAIT_SUP) 695 mask |= 0x10000; 696 } 697 if (clear_counts) 698 mask |= 0xF000; 699 700 if (mask) 701 if (!performance_reset_via(pc, portid, port, mask, ibd_timeout, 702 IB_GSI_PORT_COUNTERS, ibmad_port)) 703 fprintf(stderr, "Failed to reset errors %s port %d\n", node_name, 704 port); 705 706 if (clear_errors && details) { 707 memset(pc, 0, 1024); 708 performance_reset_via(pc, portid, port, 0xf, ibd_timeout, 709 IB_GSI_PORT_XMIT_DISCARD_DETAILS, 710 ibmad_port); 711 memset(pc, 0, 1024); 712 performance_reset_via(pc, portid, port, 0x3f, ibd_timeout, 713 IB_GSI_PORT_RCV_ERROR_DETAILS, 714 ibmad_port); 715 } 716 717 if (clear_counts && 718 (cap_mask & 719 (IB_PM_EXT_WIDTH_SUPPORTED | IB_PM_EXT_WIDTH_NOIETF_SUP))) { 720 if (cap_mask & IB_PM_EXT_WIDTH_SUPPORTED) 721 mask = 0xFF; 722 else 723 mask = 0x0F; 724 725 if (!reset_pc_ext(pc, portid, port, mask, ibd_timeout, 726 ibmad_port)) 727 fprintf(stderr, "Failed to reset extended data counters %s, " 728 "%s port %d\n", node_name, portid2str(portid), 729 port); 730 } 731 } 732 733 void print_node(ibnd_node_t * node, void *user_data) 734 { 735 int header_printed = 0; 736 int p = 0; 737 int startport = 1; 738 int type = 0; 739 int all_port_sup = 0; 740 ib_portid_t portid = { 0 }; 741 uint16_t cap_mask = 0; 742 char *node_name = NULL; 743 744 switch (node->type) { 745 case IB_NODE_SWITCH: 746 type = PRINT_SWITCH; 747 break; 748 case IB_NODE_CA: 749 type = PRINT_CA; 750 break; 751 case IB_NODE_ROUTER: 752 type = PRINT_ROUTER; 753 break; 754 } 755 756 if ((type & node_type_to_print) == 0) 757 return; 758 759 if (node->type == IB_NODE_SWITCH && node->smaenhsp0) 760 startport = 0; 761 762 node_name = remap_node_name(node_name_map, node->guid, node->nodedesc); 763 764 if (node->type == IB_NODE_SWITCH) { 765 ib_portid_set(&portid, node->smalid, 0, 0); 766 p = 0; 767 } else { 768 for (p = 1; p <= node->numports; p++) { 769 if (node->ports[p]) { 770 ib_portid_set(&portid, 771 node->ports[p]->base_lid, 772 0, 0); 773 break; 774 } 775 } 776 } 777 778 if ((query_cap_mask(&portid, node_name, p, &cap_mask) == 0) && 779 (cap_mask & IB_PM_ALL_PORT_SELECT)) 780 all_port_sup = 1; 781 782 if (data_counters_only) { 783 for (p = startport; p <= node->numports; p++) { 784 if (node->ports[p]) { 785 if (node->type == IB_NODE_SWITCH) 786 ib_portid_set(&portid, node->smalid, 0, 0); 787 else 788 ib_portid_set(&portid, node->ports[p]->base_lid, 789 0, 0); 790 791 print_data_cnts(&portid, cap_mask, node_name, node, p, 792 &header_printed); 793 summary.ports_checked++; 794 if (!all_port_sup) 795 clear_port(&portid, cap_mask, node_name, p); 796 } 797 } 798 } else { 799 if (all_port_sup) 800 if (!print_errors(&portid, cap_mask, node_name, node, 801 0xFF, &header_printed)) { 802 summary.ports_checked += node->numports; 803 goto clear; 804 } 805 806 for (p = startport; p <= node->numports; p++) { 807 if (node->ports[p]) { 808 if (node->type == IB_NODE_SWITCH) 809 ib_portid_set(&portid, node->smalid, 0, 0); 810 else 811 ib_portid_set(&portid, node->ports[p]->base_lid, 812 0, 0); 813 814 print_errors(&portid, cap_mask, node_name, node, p, 815 &header_printed); 816 summary.ports_checked++; 817 if (!all_port_sup) 818 clear_port(&portid, cap_mask, node_name, p); 819 } 820 } 821 } 822 823 clear: 824 summary.nodes_checked++; 825 if (all_port_sup) 826 clear_port(&portid, cap_mask, node_name, 0xFF); 827 828 free(node_name); 829 } 830 831 static void add_suppressed(enum MAD_FIELDS field) 832 { 833 if (sup_total >= SUP_MAX) { 834 IBWARN("Maximum (%d) fields have been suppressed; skipping %s", 835 sup_total, mad_field_name(field)); 836 return; 837 } 838 suppressed_fields[sup_total++] = field; 839 } 840 841 static void calculate_suppressed_fields(char *str) 842 { 843 enum MAD_FIELDS f; 844 char *val, *lasts = NULL; 845 char *tmp = strdup(str); 846 847 val = strtok_r(tmp, ",", &lasts); 848 while (val) { 849 for (f = IB_PC_FIRST_F; f <= IB_PC_LAST_F; f++) 850 if (strcmp(val, mad_field_name(f)) == 0) 851 add_suppressed(f); 852 val = strtok_r(NULL, ",", &lasts); 853 } 854 855 free(tmp); 856 } 857 858 static int process_opt(void *context, int ch, char *optarg) 859 { 860 struct ibnd_config *cfg = context; 861 switch (ch) { 862 case 's': 863 calculate_suppressed_fields(optarg); 864 break; 865 case 'c': 866 /* Right now this is the only "common" error */ 867 add_suppressed(IB_PC_ERR_SWITCH_REL_F); 868 break; 869 case 1: 870 node_name_map_file = strdup(optarg); 871 break; 872 case 2: 873 data_counters++; 874 break; 875 case 3: 876 node_type_to_print |= PRINT_SWITCH; 877 break; 878 case 4: 879 node_type_to_print |= PRINT_CA; 880 break; 881 case 5: 882 node_type_to_print |= PRINT_ROUTER; 883 break; 884 case 6: 885 details = 1; 886 break; 887 case 7: 888 load_cache_file = strdup(optarg); 889 break; 890 case 8: 891 threshold_file = strdup(optarg); 892 break; 893 case 9: 894 data_counters_only = 1; 895 break; 896 case 10: 897 obtain_sl = 0; 898 break; 899 case 'G': 900 case 'S': 901 port_guid_str = optarg; 902 port_guid = strtoull(optarg, 0, 0); 903 break; 904 case 'D': 905 dr_path = strdup(optarg); 906 break; 907 case 'r': 908 port_config++; 909 break; 910 case 'R': /* nop */ 911 break; 912 case 'k': 913 clear_errors = 1; 914 break; 915 case 'K': 916 clear_counts = 1; 917 break; 918 case 'o': 919 cfg->max_smps = strtoul(optarg, NULL, 0); 920 break; 921 default: 922 return -1; 923 } 924 925 return 0; 926 } 927 928 int main(int argc, char **argv) 929 { 930 struct ibnd_config config = { 0 }; 931 int resolved = -1; 932 ib_portid_t portid = { 0 }; 933 ib_portid_t self_portid = { 0 }; 934 int rc = 0; 935 ibnd_fabric_t *fabric = NULL; 936 ib_gid_t self_gid; 937 int port = 0; 938 939 int mgmt_classes[4] = { IB_SMI_CLASS, IB_SMI_DIRECT_CLASS, IB_SA_CLASS, 940 IB_PERFORMANCE_CLASS 941 }; 942 943 const struct ibdiag_opt opts[] = { 944 {"suppress", 's', 1, "<err1,err2,...>", 945 "suppress errors listed"}, 946 {"suppress-common", 'c', 0, NULL, 947 "suppress some of the common counters"}, 948 {"node-name-map", 1, 1, "<file>", "node name map file"}, 949 {"port-guid", 'G', 1, "<port_guid>", 950 "report the node containing the port specified by <port_guid>"}, 951 {"", 'S', 1, "<port_guid>", 952 "Same as \"-G\" for backward compatibility"}, 953 {"Direct", 'D', 1, "<dr_path>", 954 "report the node containing the port specified by <dr_path>"}, 955 {"skip-sl", 10, 0, NULL,"don't obtain SL to all destinations"}, 956 {"report-port", 'r', 0, NULL, 957 "report port link information"}, 958 {"threshold-file", 8, 1, NULL, 959 "specify an alternate threshold file, default: " DEF_THRES_FILE}, 960 {"GNDN", 'R', 0, NULL, 961 "(This option is obsolete and does nothing)"}, 962 {"data", 2, 0, NULL, "include data counters for ports with errors"}, 963 {"switch", 3, 0, NULL, "print data for switches only"}, 964 {"ca", 4, 0, NULL, "print data for CA's only"}, 965 {"router", 5, 0, NULL, "print data for routers only"}, 966 {"details", 6, 0, NULL, "include transmit discard details"}, 967 {"counters", 9, 0, NULL, "print data counters only"}, 968 {"clear-errors", 'k', 0, NULL, 969 "Clear error counters after read"}, 970 {"clear-counts", 'K', 0, NULL, 971 "Clear data counters after read"}, 972 {"load-cache", 7, 1, "<file>", 973 "filename of ibnetdiscover cache to load"}, 974 {"outstanding_smps", 'o', 1, NULL, 975 "specify the number of outstanding SMP's which should be " 976 "issued during the scan"}, 977 {0} 978 }; 979 char usage_args[] = ""; 980 981 memset(suppressed_fields, 0, sizeof suppressed_fields); 982 ibdiag_process_opts(argc, argv, &config, "cDGKLnRrSs", opts, process_opt, 983 usage_args, NULL); 984 985 argc -= optind; 986 argv += optind; 987 988 if (!node_type_to_print) 989 node_type_to_print = PRINT_ALL; 990 991 ibmad_port = mad_rpc_open_port(ibd_ca, ibd_ca_port, mgmt_classes, 4); 992 if (!ibmad_port) 993 IBEXIT("Failed to open port; %s:%d\n", ibd_ca, ibd_ca_port); 994 995 smp_mkey_set(ibmad_port, ibd_mkey); 996 997 if (ibd_timeout) { 998 mad_rpc_set_timeout(ibmad_port, ibd_timeout); 999 config.timeout_ms = ibd_timeout; 1000 } 1001 1002 config.flags = ibd_ibnetdisc_flags; 1003 config.mkey = ibd_mkey; 1004 1005 if (dr_path && load_cache_file) { 1006 mad_rpc_close_port(ibmad_port); 1007 fprintf(stderr, "Cannot specify cache and direct route path\n"); 1008 exit(-1); 1009 } 1010 1011 if (resolve_self(ibd_ca, ibd_ca_port, &self_portid, &port, &self_gid.raw) < 0) { 1012 mad_rpc_close_port(ibmad_port); 1013 IBEXIT("can't resolve self port %s", argv[0]); 1014 } 1015 1016 node_name_map = open_node_name_map(node_name_map_file); 1017 1018 /* limit the scan the fabric around the target */ 1019 if (dr_path) { 1020 if ((resolved = 1021 resolve_portid_str(ibd_ca, ibd_ca_port, &portid, dr_path, 1022 IB_DEST_DRPATH, NULL, ibmad_port)) < 0) 1023 IBWARN("Failed to resolve %s; attempting full scan", 1024 dr_path); 1025 } else if (port_guid_str) { 1026 if ((resolved = 1027 resolve_portid_str(ibd_ca, ibd_ca_port, &portid, 1028 port_guid_str, IB_DEST_GUID, ibd_sm_id, 1029 ibmad_port)) < 0) 1030 IBWARN("Failed to resolve %s; attempting full scan", 1031 port_guid_str); 1032 if(obtain_sl) 1033 lid2sl_table[portid.lid] = portid.sl; 1034 } 1035 1036 mad_rpc_close_port(ibmad_port); 1037 1038 if (load_cache_file) { 1039 if ((fabric = ibnd_load_fabric(load_cache_file, 0)) == NULL) { 1040 fprintf(stderr, "loading cached fabric failed\n"); 1041 rc = -1; 1042 goto close_port; 1043 } 1044 } else { 1045 if (resolved >= 0) { 1046 if (!config.max_hops) 1047 config.max_hops = 1; 1048 if (!(fabric = ibnd_discover_fabric(ibd_ca, ibd_ca_port, 1049 &portid, &config))) 1050 IBWARN("Single node discover failed;" 1051 " attempting full scan"); 1052 } 1053 1054 if (!fabric && !(fabric = ibnd_discover_fabric(ibd_ca, 1055 ibd_ca_port, 1056 NULL, 1057 &config))) { 1058 fprintf(stderr, "discover failed\n"); 1059 rc = -1; 1060 goto close_port; 1061 } 1062 } 1063 1064 set_thresholds(threshold_file); 1065 1066 /* reopen the global ibmad_port */ 1067 ibmad_port = mad_rpc_open_port(ibd_ca, ibd_ca_port, 1068 mgmt_classes, 4); 1069 if (!ibmad_port) { 1070 ibnd_destroy_fabric(fabric); 1071 close_node_name_map(node_name_map); 1072 IBEXIT("Failed to reopen port: %s:%d\n", 1073 ibd_ca, ibd_ca_port); 1074 } 1075 1076 smp_mkey_set(ibmad_port, ibd_mkey); 1077 1078 if (ibd_timeout) 1079 mad_rpc_set_timeout(ibmad_port, ibd_timeout); 1080 1081 if (port_guid_str) { 1082 ibnd_port_t *port = ibnd_find_port_guid(fabric, port_guid); 1083 if (port) 1084 print_node(port->node, NULL); 1085 else 1086 fprintf(stderr, "Failed to find node: %s\n", 1087 port_guid_str); 1088 } else if (dr_path) { 1089 ibnd_port_t *port; 1090 uint8_t ni[IB_SMP_DATA_SIZE] = { 0 }; 1091 if (!smp_query_via(ni, &portid, IB_ATTR_NODE_INFO, 0, 1092 ibd_timeout, ibmad_port)) { 1093 fprintf(stderr, "Failed to query local Node Info\n"); 1094 goto destroy_fabric; 1095 } 1096 1097 mad_decode_field(ni, IB_NODE_PORT_GUID_F, &(port_guid)); 1098 1099 port = ibnd_find_port_guid(fabric, port_guid); 1100 if (port) { 1101 if(obtain_sl) 1102 if(path_record_query(self_gid,port->guid)) 1103 goto destroy_fabric; 1104 print_node(port->node, NULL); 1105 } else 1106 fprintf(stderr, "Failed to find node: %s\n", dr_path); 1107 } else { 1108 if(obtain_sl) 1109 if(path_record_query(self_gid,0)) 1110 goto destroy_fabric; 1111 1112 ibnd_iter_nodes(fabric, print_node, NULL); 1113 } 1114 1115 rc = print_summary(); 1116 if (rc) 1117 rc = 1; 1118 1119 destroy_fabric: 1120 mad_rpc_close_port(ibmad_port); 1121 ibnd_destroy_fabric(fabric); 1122 1123 close_port: 1124 close_node_name_map(node_name_map); 1125 exit(rc); 1126 } 1127