// SPDX-License-Identifier: GPL-2.0-only

/* Highlights:
 * 1. The major difference between this bpf program and tcp_cubic.c
 *    is that this bpf program relies on `cong_control` rather than
 *    `cong_avoid` in the struct tcp_congestion_ops.
 * 2. Logic such as tcp_cwnd_reduction, tcp_cong_avoid, and
 *    tcp_update_pacing_rate is bypassed when `cong_control` is
 *    defined, so that logic is reimplemented in `cong_control` here.
 * 3. WARNING: This bpf program is NOT the same as tcp_cubic.c.
 *    The main purpose is to show use cases of the arguments in
 *    `cong_control`. For simplicity's sake, it reuses tcp cubic's
 *    kernel functions.
 */
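
/* This program assumes the `cong_control` hook variant that also receives the
 * ack and flag arguments (matching bpf_cubic_cong_control() below):
 *
 *	void (*cong_control)(struct sock *sk, __u32 ack, int flag,
 *			     const struct rate_sample *rs);
 */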

#include "bpf_tracing_net.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>

#define USEC_PER_SEC 1000000UL
#define TCP_PACING_SS_RATIO (200)
#define TCP_PACING_CA_RATIO (120)
#define TCP_REORDERING (12)

#define min(a, b) ((a) < (b) ? (a) : (b))
#define max(a, b) ((a) > (b) ? (a) : (b))
#define after(seq2, seq1) before(seq1, seq2)

extern void cubictcp_init(struct sock *sk) __ksym;
extern void cubictcp_cwnd_event(struct sock *sk, enum tcp_ca_event event) __ksym;
extern __u32 cubictcp_recalc_ssthresh(struct sock *sk) __ksym;
extern void cubictcp_state(struct sock *sk, __u8 new_state) __ksym;
extern __u32 tcp_reno_undo_cwnd(struct sock *sk) __ksym;
extern void cubictcp_acked(struct sock *sk, const struct ack_sample *sample) __ksym;
extern void cubictcp_cong_avoid(struct sock *sk, __u32 ack, __u32 acked) __ksym;

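/* Minimal local stand-ins for kernel helpers that are not available to this
 * BPF program. before() uses the usual signed 32-bit wraparound comparison
 * of TCP sequence numbers; div64_u64() is a plain 64-bit division here.
 */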
static bool before(__u32 seq1, __u32 seq2)
{
	return (__s32)(seq1 - seq2) < 0;
}

static __u64 div64_u64(__u64 dividend, __u64 divisor)
{
	return dividend / divisor;
}

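/* Mirrors the kernel's tcp_update_pacing_rate() in net/ipv4/tcp_input.c,
 * which the stack skips when `cong_control` is defined (see highlight 2
 * above), so the pacing rate must be refreshed from this program instead.
 */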
static void tcp_update_pacing_rate(struct sock *sk)
{
	const struct tcp_sock *tp = tcp_sk(sk);
	__u64 rate;

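	/* Note: tp->srtt_us is the smoothed RTT in usec left-shifted by 3; the
	 * << 3 below cancels that shift, and the / 100 accounts for the pacing
	 * ratios being expressed in percent.
	 */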
	/* set sk_pacing_rate to 200 % of current rate (mss * cwnd / srtt) */
	rate = (__u64)tp->mss_cache * ((USEC_PER_SEC / 100) << 3);

	/* current rate is (cwnd * mss) / srtt
	 * In Slow Start [1], set sk_pacing_rate to 200 % of the current rate.
	 * In Congestion Avoidance phase, set it to 120 % of the current rate.
	 *
	 * [1] : Normal Slow Start condition is (tp->snd_cwnd < tp->snd_ssthresh)
	 *	 If snd_cwnd >= (tp->snd_ssthresh / 2), we are approaching
	 *	 the end of slow start and should slow down.
	 */
	if (tp->snd_cwnd < tp->snd_ssthresh / 2)
		rate *= TCP_PACING_SS_RATIO;
	else
		rate *= TCP_PACING_CA_RATIO;

	rate *= max(tp->snd_cwnd, tp->packets_out);

	if (tp->srtt_us)
		rate = div64_u64(rate, (__u64)tp->srtt_us);

	sk->sk_pacing_rate = min(rate, sk->sk_max_pacing_rate);
}

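/* Mirrors the kernel's tcp_cwnd_reduction(), i.e. Proportional Rate Reduction
 * (PRR, RFC 6937): while in CWR or Recovery, cwnd is walked down toward
 * ssthresh in proportion to newly delivered packets rather than being cut in
 * a single step. pkts_in_flight below open-codes tcp_packets_in_flight().
 */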
static void tcp_cwnd_reduction(struct sock *sk, int newly_acked_sacked,
			       int newly_lost, int flag)
{
	struct tcp_sock *tp = tcp_sk(sk);
	int sndcnt = 0;
	__u32 pkts_in_flight = tp->packets_out - (tp->sacked_out + tp->lost_out) + tp->retrans_out;
	int delta = tp->snd_ssthresh - pkts_in_flight;

	if (newly_acked_sacked <= 0 || !tp->prior_cwnd)
		return;

	__u32 prr_delivered = tp->prr_delivered + newly_acked_sacked;

	if (delta < 0) {
		__u64 dividend =
			(__u64)tp->snd_ssthresh * prr_delivered + tp->prior_cwnd - 1;
		sndcnt = (__u32)div64_u64(dividend, (__u64)tp->prior_cwnd) - tp->prr_out;
	} else {
		sndcnt = max(prr_delivered - tp->prr_out, newly_acked_sacked);
		if (flag & FLAG_SND_UNA_ADVANCED && !newly_lost)
			sndcnt++;
		sndcnt = min(delta, sndcnt);
	}
	/* Force a fast retransmit upon entering fast recovery */
	sndcnt = max(sndcnt, (tp->prr_out ? 0 : 1));
	tp->snd_cwnd = pkts_in_flight + sndcnt;
}

/* Decide whether to run the increase function of congestion control.
 * With high reordering, grow cwnd whenever forward progress is made;
 * otherwise only raise it on ACKs that acknowledge new data.
 */
static bool tcp_may_raise_cwnd(const struct sock *sk, const int flag)
{
	if (tcp_sk(sk)->reordering > TCP_REORDERING)
		return flag & FLAG_FORWARD_PROGRESS;

	return flag & FLAG_DATA_ACKED;
}

SEC("struct_ops")
void BPF_PROG(bpf_cubic_init, struct sock *sk)
{
	cubictcp_init(sk);
}

SEC("struct_ops")
void BPF_PROG(bpf_cubic_cwnd_event, struct sock *sk, enum tcp_ca_event event)
{
	cubictcp_cwnd_event(sk, event);
}

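/* Main entry point when `cong_control` is in use: called for every incoming
 * ACK. In CWR/Recovery it applies the PRR-style reduction above; otherwise,
 * if the ACK flags allow it, cwnd is advanced via cubic's cong_avoid. The
 * pacing rate is refreshed at the end in either case.
 */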
SEC("struct_ops")
void BPF_PROG(bpf_cubic_cong_control, struct sock *sk, __u32 ack, int flag,
	      const struct rate_sample *rs)
{
	struct tcp_sock *tp = tcp_sk(sk);

	if (((1 << TCP_CA_CWR) | (1 << TCP_CA_Recovery)) &
	    (1 << inet_csk(sk)->icsk_ca_state)) {
		/* Reduce cwnd if state mandates */
		tcp_cwnd_reduction(sk, rs->acked_sacked, rs->losses, flag);

		if (!before(tp->snd_una, tp->high_seq)) {
			/* Reset cwnd to ssthresh in CWR or Recovery (unless it's undone) */
			if (tp->snd_ssthresh < TCP_INFINITE_SSTHRESH &&
			    inet_csk(sk)->icsk_ca_state == TCP_CA_CWR) {
				tp->snd_cwnd = tp->snd_ssthresh;
				tp->snd_cwnd_stamp = tcp_jiffies32;
			}
		}
	} else if (tcp_may_raise_cwnd(sk, flag)) {
		/* Advance cwnd if state allows */
		cubictcp_cong_avoid(sk, ack, rs->acked_sacked);
		tp->snd_cwnd_stamp = tcp_jiffies32;
	}

	tcp_update_pacing_rate(sk);
}

SEC("struct_ops")
__u32 BPF_PROG(bpf_cubic_recalc_ssthresh, struct sock *sk)
{
	return cubictcp_recalc_ssthresh(sk);
}

SEC("struct_ops")
void BPF_PROG(bpf_cubic_state, struct sock *sk, __u8 new_state)
{
	cubictcp_state(sk, new_state);
}

SEC("struct_ops")
void BPF_PROG(bpf_cubic_acked, struct sock *sk, const struct ack_sample *sample)
{
	cubictcp_acked(sk, sample);
}

SEC("struct_ops")
__u32 BPF_PROG(bpf_cubic_undo_cwnd, struct sock *sk)
{
	return tcp_reno_undo_cwnd(sk);
}

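/* Register the callbacks above as a tcp_congestion_ops struct_ops map. Once
 * the map is attached, a socket can opt in by name; a minimal userspace
 * sketch, assuming a connected socket fd and ignoring error handling:
 *
 *	setsockopt(fd, IPPROTO_TCP, TCP_CONGESTION,
 *		   "bpf_cc_cubic", sizeof("bpf_cc_cubic"));
 */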
SEC(".struct_ops")
struct tcp_congestion_ops cc_cubic = {
	.init = (void *)bpf_cubic_init,
	.ssthresh = (void *)bpf_cubic_recalc_ssthresh,
	.cong_control = (void *)bpf_cubic_cong_control,
	.set_state = (void *)bpf_cubic_state,
	.undo_cwnd = (void *)bpf_cubic_undo_cwnd,
	.cwnd_event = (void *)bpf_cubic_cwnd_event,
	.pkts_acked = (void *)bpf_cubic_acked,
	.name = "bpf_cc_cubic",
};

char _license[] SEC("license") = "GPL";