1 // SPDX-License-Identifier: GPL-2.0 2 /* Copyright (c) 2019 Facebook 3 * 4 * This program is free software; you can redistribute it and/or 5 * modify it under the terms of version 2 of the GNU General Public 6 * License as published by the Free Software Foundation. 7 * 8 * Example program for Host Bandwidth Managment 9 * 10 * This program loads a cgroup skb BPF program to enforce cgroup output 11 * (egress) or input (ingress) bandwidth limits. 12 * 13 * USAGE: hbm [-d] [-l] [-n <id>] [-r <rate>] [-s] [-t <secs>] [-w] [-h] [prog] 14 * Where: 15 * -d Print BPF trace debug buffer 16 * -l Also limit flows doing loopback 17 * -n <#> To create cgroup \"/hbm#\" and attach prog 18 * Default is /hbm1 19 * -r <rate> Rate limit in Mbps 20 * -s Get HBM stats (marked, dropped, etc.) 21 * -t <time> Exit after specified seconds (default is 0) 22 * -w Work conserving flag. cgroup can increase its bandwidth 23 * beyond the rate limit specified while there is available 24 * bandwidth. Current implementation assumes there is only 25 * NIC (eth0), but can be extended to support multiple NICs. 26 * Currrently only supported for egress. 27 * -h Print this info 28 * prog BPF program file name. Name defaults to hbm_out_kern.o 29 */ 30 31 #define _GNU_SOURCE 32 33 #include <stdio.h> 34 #include <stdlib.h> 35 #include <assert.h> 36 #include <sys/resource.h> 37 #include <sys/time.h> 38 #include <unistd.h> 39 #include <errno.h> 40 #include <fcntl.h> 41 #include <linux/unistd.h> 42 43 #include <linux/bpf.h> 44 #include <bpf/bpf.h> 45 46 #include "bpf_load.h" 47 #include "bpf_rlimit.h" 48 #include "cgroup_helpers.h" 49 #include "hbm.h" 50 #include "bpf_util.h" 51 #include "bpf/bpf.h" 52 #include "bpf/libbpf.h" 53 54 bool outFlag = true; 55 int minRate = 1000; /* cgroup rate limit in Mbps */ 56 int rate = 1000; /* can grow if rate conserving is enabled */ 57 int dur = 1; 58 bool stats_flag; 59 bool loopback_flag; 60 bool debugFlag; 61 bool work_conserving_flag; 62 63 static void Usage(void); 64 static void read_trace_pipe2(void); 65 static void do_error(char *msg, bool errno_flag); 66 67 #define DEBUGFS "/sys/kernel/debug/tracing/" 68 69 struct bpf_object *obj; 70 int bpfprog_fd; 71 int cgroup_storage_fd; 72 73 static void read_trace_pipe2(void) 74 { 75 int trace_fd; 76 FILE *outf; 77 char *outFname = "hbm_out.log"; 78 79 trace_fd = open(DEBUGFS "trace_pipe", O_RDONLY, 0); 80 if (trace_fd < 0) { 81 printf("Error opening trace_pipe\n"); 82 return; 83 } 84 85 // Future support of ingress 86 // if (!outFlag) 87 // outFname = "hbm_in.log"; 88 outf = fopen(outFname, "w"); 89 90 if (outf == NULL) 91 printf("Error creating %s\n", outFname); 92 93 while (1) { 94 static char buf[4097]; 95 ssize_t sz; 96 97 sz = read(trace_fd, buf, sizeof(buf) - 1); 98 if (sz > 0) { 99 buf[sz] = 0; 100 puts(buf); 101 if (outf != NULL) { 102 fprintf(outf, "%s\n", buf); 103 fflush(outf); 104 } 105 } 106 } 107 } 108 109 static void do_error(char *msg, bool errno_flag) 110 { 111 if (errno_flag) 112 printf("ERROR: %s, errno: %d\n", msg, errno); 113 else 114 printf("ERROR: %s\n", msg); 115 exit(1); 116 } 117 118 static int prog_load(char *prog) 119 { 120 struct bpf_prog_load_attr prog_load_attr = { 121 .prog_type = BPF_PROG_TYPE_CGROUP_SKB, 122 .file = prog, 123 .expected_attach_type = BPF_CGROUP_INET_EGRESS, 124 }; 125 int map_fd; 126 struct bpf_map *map; 127 128 int ret = 0; 129 130 if (access(prog, O_RDONLY) < 0) { 131 printf("Error accessing file %s: %s\n", prog, strerror(errno)); 132 return 1; 133 } 134 if (bpf_prog_load_xattr(&prog_load_attr, &obj, &bpfprog_fd)) 135 ret = 1; 136 if (!ret) { 137 map = bpf_object__find_map_by_name(obj, "queue_stats"); 138 map_fd = bpf_map__fd(map); 139 if (map_fd < 0) { 140 printf("Map not found: %s\n", strerror(map_fd)); 141 ret = 1; 142 } 143 } 144 145 if (ret) { 146 printf("ERROR: load_bpf_file failed for: %s\n", prog); 147 printf(" Output from verifier:\n%s\n------\n", bpf_log_buf); 148 ret = -1; 149 } else { 150 ret = map_fd; 151 } 152 153 return ret; 154 } 155 156 static int run_bpf_prog(char *prog, int cg_id) 157 { 158 int map_fd; 159 int rc = 0; 160 int key = 0; 161 int cg1 = 0; 162 int type = BPF_CGROUP_INET_EGRESS; 163 char cg_dir[100]; 164 struct hbm_queue_stats qstats = {0}; 165 166 sprintf(cg_dir, "/hbm%d", cg_id); 167 map_fd = prog_load(prog); 168 if (map_fd == -1) 169 return 1; 170 171 if (setup_cgroup_environment()) { 172 printf("ERROR: setting cgroup environment\n"); 173 goto err; 174 } 175 cg1 = create_and_get_cgroup(cg_dir); 176 if (!cg1) { 177 printf("ERROR: create_and_get_cgroup\n"); 178 goto err; 179 } 180 if (join_cgroup(cg_dir)) { 181 printf("ERROR: join_cgroup\n"); 182 goto err; 183 } 184 185 qstats.rate = rate; 186 qstats.stats = stats_flag ? 1 : 0; 187 qstats.loopback = loopback_flag ? 1 : 0; 188 if (bpf_map_update_elem(map_fd, &key, &qstats, BPF_ANY)) { 189 printf("ERROR: Could not update map element\n"); 190 goto err; 191 } 192 193 if (!outFlag) 194 type = BPF_CGROUP_INET_INGRESS; 195 if (bpf_prog_attach(bpfprog_fd, cg1, type, 0)) { 196 printf("ERROR: bpf_prog_attach fails!\n"); 197 log_err("Attaching prog"); 198 goto err; 199 } 200 201 if (work_conserving_flag) { 202 struct timeval t0, t_last, t_new; 203 FILE *fin; 204 unsigned long long last_eth_tx_bytes, new_eth_tx_bytes; 205 signed long long last_cg_tx_bytes, new_cg_tx_bytes; 206 signed long long delta_time, delta_bytes, delta_rate; 207 int delta_ms; 208 #define DELTA_RATE_CHECK 10000 /* in us */ 209 #define RATE_THRESHOLD 9500000000 /* 9.5 Gbps */ 210 211 bpf_map_lookup_elem(map_fd, &key, &qstats); 212 if (gettimeofday(&t0, NULL) < 0) 213 do_error("gettimeofday failed", true); 214 t_last = t0; 215 fin = fopen("/sys/class/net/eth0/statistics/tx_bytes", "r"); 216 if (fscanf(fin, "%llu", &last_eth_tx_bytes) != 1) 217 do_error("fscanf fails", false); 218 fclose(fin); 219 last_cg_tx_bytes = qstats.bytes_total; 220 while (true) { 221 usleep(DELTA_RATE_CHECK); 222 if (gettimeofday(&t_new, NULL) < 0) 223 do_error("gettimeofday failed", true); 224 delta_ms = (t_new.tv_sec - t0.tv_sec) * 1000 + 225 (t_new.tv_usec - t0.tv_usec)/1000; 226 if (delta_ms > dur * 1000) 227 break; 228 delta_time = (t_new.tv_sec - t_last.tv_sec) * 1000000 + 229 (t_new.tv_usec - t_last.tv_usec); 230 if (delta_time == 0) 231 continue; 232 t_last = t_new; 233 fin = fopen("/sys/class/net/eth0/statistics/tx_bytes", 234 "r"); 235 if (fscanf(fin, "%llu", &new_eth_tx_bytes) != 1) 236 do_error("fscanf fails", false); 237 fclose(fin); 238 printf(" new_eth_tx_bytes:%llu\n", 239 new_eth_tx_bytes); 240 bpf_map_lookup_elem(map_fd, &key, &qstats); 241 new_cg_tx_bytes = qstats.bytes_total; 242 delta_bytes = new_eth_tx_bytes - last_eth_tx_bytes; 243 last_eth_tx_bytes = new_eth_tx_bytes; 244 delta_rate = (delta_bytes * 8000000) / delta_time; 245 printf("%5d - eth_rate:%.1fGbps cg_rate:%.3fGbps", 246 delta_ms, delta_rate/1000000000.0, 247 rate/1000.0); 248 if (delta_rate < RATE_THRESHOLD) { 249 /* can increase cgroup rate limit, but first 250 * check if we are using the current limit. 251 * Currently increasing by 6.25%, unknown 252 * if that is the optimal rate. 253 */ 254 int rate_diff100; 255 256 delta_bytes = new_cg_tx_bytes - 257 last_cg_tx_bytes; 258 last_cg_tx_bytes = new_cg_tx_bytes; 259 delta_rate = (delta_bytes * 8000000) / 260 delta_time; 261 printf(" rate:%.3fGbps", 262 delta_rate/1000000000.0); 263 rate_diff100 = (((long long)rate)*1000000 - 264 delta_rate) * 100 / 265 (((long long) rate) * 1000000); 266 printf(" rdiff:%d", rate_diff100); 267 if (rate_diff100 <= 3) { 268 rate += (rate >> 4); 269 if (rate > RATE_THRESHOLD / 1000000) 270 rate = RATE_THRESHOLD / 1000000; 271 qstats.rate = rate; 272 printf(" INC\n"); 273 } else { 274 printf("\n"); 275 } 276 } else { 277 /* Need to decrease cgroup rate limit. 278 * Currently decreasing by 12.5%, unknown 279 * if that is optimal 280 */ 281 printf(" DEC\n"); 282 rate -= (rate >> 3); 283 if (rate < minRate) 284 rate = minRate; 285 qstats.rate = rate; 286 } 287 if (bpf_map_update_elem(map_fd, &key, &qstats, BPF_ANY)) 288 do_error("update map element fails", false); 289 } 290 } else { 291 sleep(dur); 292 } 293 // Get stats! 294 if (stats_flag && bpf_map_lookup_elem(map_fd, &key, &qstats)) { 295 char fname[100]; 296 FILE *fout; 297 298 if (!outFlag) 299 sprintf(fname, "hbm.%d.in", cg_id); 300 else 301 sprintf(fname, "hbm.%d.out", cg_id); 302 fout = fopen(fname, "w"); 303 fprintf(fout, "id:%d\n", cg_id); 304 fprintf(fout, "ERROR: Could not lookup queue_stats\n"); 305 } else if (stats_flag && qstats.lastPacketTime > 306 qstats.firstPacketTime) { 307 long long delta_us = (qstats.lastPacketTime - 308 qstats.firstPacketTime)/1000; 309 unsigned int rate_mbps = ((qstats.bytes_total - 310 qstats.bytes_dropped) * 8 / 311 delta_us); 312 double percent_pkts, percent_bytes; 313 char fname[100]; 314 FILE *fout; 315 316 // Future support of ingress 317 // if (!outFlag) 318 // sprintf(fname, "hbm.%d.in", cg_id); 319 // else 320 sprintf(fname, "hbm.%d.out", cg_id); 321 fout = fopen(fname, "w"); 322 fprintf(fout, "id:%d\n", cg_id); 323 fprintf(fout, "rate_mbps:%d\n", rate_mbps); 324 fprintf(fout, "duration:%.1f secs\n", 325 (qstats.lastPacketTime - qstats.firstPacketTime) / 326 1000000000.0); 327 fprintf(fout, "packets:%d\n", (int)qstats.pkts_total); 328 fprintf(fout, "bytes_MB:%d\n", (int)(qstats.bytes_total / 329 1000000)); 330 fprintf(fout, "pkts_dropped:%d\n", (int)qstats.pkts_dropped); 331 fprintf(fout, "bytes_dropped_MB:%d\n", 332 (int)(qstats.bytes_dropped / 333 1000000)); 334 // Marked Pkts and Bytes 335 percent_pkts = (qstats.pkts_marked * 100.0) / 336 (qstats.pkts_total + 1); 337 percent_bytes = (qstats.bytes_marked * 100.0) / 338 (qstats.bytes_total + 1); 339 fprintf(fout, "pkts_marked_percent:%6.2f\n", percent_pkts); 340 fprintf(fout, "bytes_marked_percent:%6.2f\n", percent_bytes); 341 342 // Dropped Pkts and Bytes 343 percent_pkts = (qstats.pkts_dropped * 100.0) / 344 (qstats.pkts_total + 1); 345 percent_bytes = (qstats.bytes_dropped * 100.0) / 346 (qstats.bytes_total + 1); 347 fprintf(fout, "pkts_dropped_percent:%6.2f\n", percent_pkts); 348 fprintf(fout, "bytes_dropped_percent:%6.2f\n", percent_bytes); 349 fclose(fout); 350 } 351 352 if (debugFlag) 353 read_trace_pipe2(); 354 return rc; 355 err: 356 rc = 1; 357 358 if (cg1) 359 close(cg1); 360 cleanup_cgroup_environment(); 361 362 return rc; 363 } 364 365 static void Usage(void) 366 { 367 printf("This program loads a cgroup skb BPF program to enforce\n" 368 "cgroup output (egress) bandwidth limits.\n\n" 369 "USAGE: hbm [-o] [-d] [-l] [-n <id>] [-r <rate>] [-s]\n" 370 " [-t <secs>] [-w] [-h] [prog]\n" 371 " Where:\n" 372 " -o indicates egress direction (default)\n" 373 " -d print BPF trace debug buffer\n" 374 " -l also limit flows using loopback\n" 375 " -n <#> to create cgroup \"/hbm#\" and attach prog\n" 376 " Default is /hbm1\n" 377 " -r <rate> Rate in Mbps\n" 378 " -s Update HBM stats\n" 379 " -t <time> Exit after specified seconds (default is 0)\n" 380 " -w Work conserving flag. cgroup can increase\n" 381 " bandwidth beyond the rate limit specified\n" 382 " while there is available bandwidth. Current\n" 383 " implementation assumes there is only eth0\n" 384 " but can be extended to support multiple NICs\n" 385 " -h print this info\n" 386 " prog BPF program file name. Name defaults to\n" 387 " hbm_out_kern.o\n"); 388 } 389 390 int main(int argc, char **argv) 391 { 392 char *prog = "hbm_out_kern.o"; 393 int k; 394 int cg_id = 1; 395 char *optstring = "iodln:r:st:wh"; 396 397 while ((k = getopt(argc, argv, optstring)) != -1) { 398 switch (k) { 399 case'o': 400 break; 401 case 'd': 402 debugFlag = true; 403 break; 404 case 'l': 405 loopback_flag = true; 406 break; 407 case 'n': 408 cg_id = atoi(optarg); 409 break; 410 case 'r': 411 minRate = atoi(optarg) * 1.024; 412 rate = minRate; 413 break; 414 case 's': 415 stats_flag = true; 416 break; 417 case 't': 418 dur = atoi(optarg); 419 break; 420 case 'w': 421 work_conserving_flag = true; 422 break; 423 case '?': 424 if (optopt == 'n' || optopt == 'r' || optopt == 't') 425 fprintf(stderr, 426 "Option -%c requires an argument.\n\n", 427 optopt); 428 case 'h': 429 // fallthrough 430 default: 431 Usage(); 432 return 0; 433 } 434 } 435 436 if (optind < argc) 437 prog = argv[optind]; 438 printf("HBM prog: %s\n", prog != NULL ? prog : "NULL"); 439 440 return run_bpf_prog(prog, cg_id); 441 } 442