1 // SPDX-License-Identifier: GPL-2.0+
2 /*
3 * PCIe bandwidth controller
4 *
5 * Author: Alexandru Gagniuc <mr.nuke.me@gmail.com>
6 *
7 * Copyright (C) 2019 Dell Inc
8 * Copyright (C) 2023-2024 Intel Corporation
9 *
10 * The PCIe bandwidth controller provides a way to alter PCIe Link Speeds
11 * and notify the operating system when the Link Width or Speed changes. The
12 * notification capability is required for all Root Ports and Downstream
13 * Ports supporting Link Width wider than x1 and/or multiple Link Speeds.
14 *
15 * This service port driver hooks into the Bandwidth Notification interrupt
16 * watching for changes or links becoming degraded in operation. It updates
17 * the cached Current Link Speed that is exposed to user space through sysfs.
18 */
19
20 #define dev_fmt(fmt) "bwctrl: " fmt
21
22 #include <linux/atomic.h>
23 #include <linux/bitops.h>
24 #include <linux/bits.h>
25 #include <linux/cleanup.h>
26 #include <linux/errno.h>
27 #include <linux/interrupt.h>
28 #include <linux/mutex.h>
29 #include <linux/pci.h>
30 #include <linux/pci-bwctrl.h>
31 #include <linux/rwsem.h>
32 #include <linux/slab.h>
33 #include <linux/types.h>
34
35 #include "../pci.h"
36 #include "portdrv.h"
37
38 /**
39 * struct pcie_bwctrl_data - PCIe bandwidth controller
40 * @set_speed_mutex: Serializes link speed changes
41 * @lbms_count: Count for LBMS (since last reset)
42 * @cdev: Thermal cooling device associated with the port
43 */
44 struct pcie_bwctrl_data {
45 struct mutex set_speed_mutex;
46 atomic_t lbms_count;
47 struct thermal_cooling_device *cdev;
48 };
49
/*
 * These rwsems keep a Port from being removed while its bandwidth controller
 * data is in use: one covers the LBMS count accessors, the other covers Link
 * Speed changes. Probe and remove take both of them for writing.
 *
 * A single rwsem cannot do both jobs. pcie_bwctrl_change_speed() calls
 * pcie_retrain_link(), which uses the LBMS count reset accessor on success,
 * so sharing one rwsem triggers a "possible recursive locking detected"
 * warning.
 */
static DECLARE_RWSEM(pcie_bwctrl_lbms_rwsem);
static DECLARE_RWSEM(pcie_bwctrl_setspeed_rwsem);
60
pcie_valid_speed(enum pci_bus_speed speed)61 static bool pcie_valid_speed(enum pci_bus_speed speed)
62 {
63 return (speed >= PCIE_SPEED_2_5GT) && (speed <= PCIE_SPEED_64_0GT);
64 }
65
pci_bus_speed2lnkctl2(enum pci_bus_speed speed)66 static u16 pci_bus_speed2lnkctl2(enum pci_bus_speed speed)
67 {
68 static const u8 speed_conv[] = {
69 [PCIE_SPEED_2_5GT] = PCI_EXP_LNKCTL2_TLS_2_5GT,
70 [PCIE_SPEED_5_0GT] = PCI_EXP_LNKCTL2_TLS_5_0GT,
71 [PCIE_SPEED_8_0GT] = PCI_EXP_LNKCTL2_TLS_8_0GT,
72 [PCIE_SPEED_16_0GT] = PCI_EXP_LNKCTL2_TLS_16_0GT,
73 [PCIE_SPEED_32_0GT] = PCI_EXP_LNKCTL2_TLS_32_0GT,
74 [PCIE_SPEED_64_0GT] = PCI_EXP_LNKCTL2_TLS_64_0GT,
75 };
76
77 if (WARN_ON_ONCE(!pcie_valid_speed(speed)))
78 return 0;
79
80 return speed_conv[speed];
81 }
82
pcie_supported_speeds2target_speed(u8 supported_speeds)83 static inline u16 pcie_supported_speeds2target_speed(u8 supported_speeds)
84 {
85 return __fls(supported_speeds);
86 }
87
88 /**
89 * pcie_bwctrl_select_speed - Select Target Link Speed
90 * @port: PCIe Port
91 * @speed_req: Requested PCIe Link Speed
92 *
93 * Select Target Link Speed by take into account Supported Link Speeds of
94 * both the Root Port and the Endpoint.
95 *
96 * Return: Target Link Speed (1=2.5GT/s, 2=5GT/s, 3=8GT/s, etc.)
97 */
pcie_bwctrl_select_speed(struct pci_dev * port,enum pci_bus_speed speed_req)98 static u16 pcie_bwctrl_select_speed(struct pci_dev *port, enum pci_bus_speed speed_req)
99 {
100 struct pci_bus *bus = port->subordinate;
101 u8 desired_speeds, supported_speeds;
102 struct pci_dev *dev;
103
104 desired_speeds = GENMASK(pci_bus_speed2lnkctl2(speed_req),
105 __fls(PCI_EXP_LNKCAP2_SLS_2_5GB));
106
107 supported_speeds = port->supported_speeds;
108 if (bus) {
109 down_read(&pci_bus_sem);
110 dev = list_first_entry_or_null(&bus->devices, struct pci_dev, bus_list);
111 if (dev)
112 supported_speeds &= dev->supported_speeds;
113 up_read(&pci_bus_sem);
114 }
115 if (!supported_speeds)
116 return PCI_EXP_LNKCAP2_SLS_2_5GB;
117
118 return pcie_supported_speeds2target_speed(supported_speeds & desired_speeds);
119 }
120
/*
 * Program @target_speed into the Target Link Speed field of Link Control 2
 * and retrain the link. On success, the cached link speed of the subordinate
 * bus (if any) is refreshed as well.
 *
 * Return: 0 on success, a negative errno if the config access or the
 * retraining failed.
 */
static int pcie_bwctrl_change_speed(struct pci_dev *port, u16 target_speed, bool use_lt)
{
	struct pci_bus *child;
	int err;

	err = pcie_capability_clear_and_set_word(port, PCI_EXP_LNKCTL2,
						 PCI_EXP_LNKCTL2_TLS,
						 target_speed);
	if (err != PCIBIOS_SUCCESSFUL)
		return pcibios_err_to_errno(err);

	err = pcie_retrain_link(port, use_lt);
	if (err < 0)
		return err;

	/*
	 * Platforms that have problems with notifications would otherwise
	 * keep a stale link speed, so update it here too.
	 */
	child = port->subordinate;
	if (child)
		pcie_update_link_speed(child);

	return 0;
}
143
144 /**
145 * pcie_set_target_speed - Set downstream Link Speed for PCIe Port
146 * @port: PCIe Port
147 * @speed_req: Requested PCIe Link Speed
148 * @use_lt: Wait for the LT or DLLLA bit to detect the end of link training
149 *
150 * Attempt to set PCIe Port Link Speed to @speed_req. @speed_req may be
151 * adjusted downwards to the best speed supported by both the Port and PCIe
152 * Device underneath it.
153 *
154 * Return:
155 * * 0 - on success
156 * * -EINVAL - @speed_req is not a PCIe Link Speed
157 * * -ENODEV - @port is not controllable
158 * * -ETIMEDOUT - changing Link Speed took too long
159 * * -EAGAIN - Link Speed was changed but @speed_req was not achieved
160 */
pcie_set_target_speed(struct pci_dev * port,enum pci_bus_speed speed_req,bool use_lt)161 int pcie_set_target_speed(struct pci_dev *port, enum pci_bus_speed speed_req,
162 bool use_lt)
163 {
164 struct pci_bus *bus = port->subordinate;
165 u16 target_speed;
166 int ret;
167
168 if (WARN_ON_ONCE(!pcie_valid_speed(speed_req)))
169 return -EINVAL;
170
171 if (bus && bus->cur_bus_speed == speed_req)
172 return 0;
173
174 target_speed = pcie_bwctrl_select_speed(port, speed_req);
175
176 scoped_guard(rwsem_read, &pcie_bwctrl_setspeed_rwsem) {
177 struct pcie_bwctrl_data *data = port->link_bwctrl;
178
179 /*
180 * port->link_bwctrl is NULL during initial scan when called
181 * e.g. from the Target Speed quirk.
182 */
183 if (data)
184 mutex_lock(&data->set_speed_mutex);
185
186 ret = pcie_bwctrl_change_speed(port, target_speed, use_lt);
187
188 if (data)
189 mutex_unlock(&data->set_speed_mutex);
190 }
191
192 /*
193 * Despite setting higher speed into the Target Link Speed, empty
194 * bus won't train to 5GT+ speeds.
195 */
196 if (!ret && bus && bus->cur_bus_speed != speed_req &&
197 !list_empty(&bus->devices))
198 ret = -EAGAIN;
199
200 return ret;
201 }
202
pcie_bwnotif_enable(struct pcie_device * srv)203 static void pcie_bwnotif_enable(struct pcie_device *srv)
204 {
205 struct pcie_bwctrl_data *data = srv->port->link_bwctrl;
206 struct pci_dev *port = srv->port;
207 u16 link_status;
208 int ret;
209
210 /* Count LBMS seen so far as one */
211 ret = pcie_capability_read_word(port, PCI_EXP_LNKSTA, &link_status);
212 if (ret == PCIBIOS_SUCCESSFUL && link_status & PCI_EXP_LNKSTA_LBMS)
213 atomic_inc(&data->lbms_count);
214
215 pcie_capability_set_word(port, PCI_EXP_LNKCTL,
216 PCI_EXP_LNKCTL_LBMIE | PCI_EXP_LNKCTL_LABIE);
217 pcie_capability_write_word(port, PCI_EXP_LNKSTA,
218 PCI_EXP_LNKSTA_LBMS | PCI_EXP_LNKSTA_LABS);
219
220 /*
221 * Update after enabling notifications & clearing status bits ensures
222 * link speed is up to date.
223 */
224 pcie_update_link_speed(port->subordinate);
225 }
226
pcie_bwnotif_disable(struct pci_dev * port)227 static void pcie_bwnotif_disable(struct pci_dev *port)
228 {
229 pcie_capability_clear_word(port, PCI_EXP_LNKCTL,
230 PCI_EXP_LNKCTL_LBMIE | PCI_EXP_LNKCTL_LABIE);
231 }
232
pcie_bwnotif_irq(int irq,void * context)233 static irqreturn_t pcie_bwnotif_irq(int irq, void *context)
234 {
235 struct pcie_device *srv = context;
236 struct pcie_bwctrl_data *data = srv->port->link_bwctrl;
237 struct pci_dev *port = srv->port;
238 u16 link_status, events;
239 int ret;
240
241 ret = pcie_capability_read_word(port, PCI_EXP_LNKSTA, &link_status);
242 if (ret != PCIBIOS_SUCCESSFUL)
243 return IRQ_NONE;
244
245 events = link_status & (PCI_EXP_LNKSTA_LBMS | PCI_EXP_LNKSTA_LABS);
246 if (!events)
247 return IRQ_NONE;
248
249 if (events & PCI_EXP_LNKSTA_LBMS)
250 atomic_inc(&data->lbms_count);
251
252 pcie_capability_write_word(port, PCI_EXP_LNKSTA, events);
253
254 /*
255 * Interrupts will not be triggered from any further Link Speed
256 * change until LBMS is cleared by the write. Therefore, re-read the
257 * speed (inside pcie_update_link_speed()) after LBMS has been
258 * cleared to avoid missing link speed changes.
259 */
260 pcie_update_link_speed(port->subordinate);
261
262 return IRQ_HANDLED;
263 }
264
pcie_reset_lbms_count(struct pci_dev * port)265 void pcie_reset_lbms_count(struct pci_dev *port)
266 {
267 struct pcie_bwctrl_data *data;
268
269 guard(rwsem_read)(&pcie_bwctrl_lbms_rwsem);
270 data = port->link_bwctrl;
271 if (data)
272 atomic_set(&data->lbms_count, 0);
273 else
274 pcie_capability_write_word(port, PCI_EXP_LNKSTA,
275 PCI_EXP_LNKSTA_LBMS);
276 }
277
pcie_lbms_count(struct pci_dev * port,unsigned long * val)278 int pcie_lbms_count(struct pci_dev *port, unsigned long *val)
279 {
280 struct pcie_bwctrl_data *data;
281
282 guard(rwsem_read)(&pcie_bwctrl_lbms_rwsem);
283 data = port->link_bwctrl;
284 if (!data)
285 return -ENOTTY;
286
287 *val = atomic_read(&data->lbms_count);
288
289 return 0;
290 }
291
pcie_bwnotif_probe(struct pcie_device * srv)292 static int pcie_bwnotif_probe(struct pcie_device *srv)
293 {
294 struct pci_dev *port = srv->port;
295 int ret;
296
297 struct pcie_bwctrl_data *data = devm_kzalloc(&srv->device,
298 sizeof(*data), GFP_KERNEL);
299 if (!data)
300 return -ENOMEM;
301
302 ret = devm_mutex_init(&srv->device, &data->set_speed_mutex);
303 if (ret)
304 return ret;
305
306 scoped_guard(rwsem_write, &pcie_bwctrl_setspeed_rwsem) {
307 scoped_guard(rwsem_write, &pcie_bwctrl_lbms_rwsem) {
308 port->link_bwctrl = data;
309
310 ret = request_irq(srv->irq, pcie_bwnotif_irq,
311 IRQF_SHARED, "PCIe bwctrl", srv);
312 if (ret) {
313 port->link_bwctrl = NULL;
314 return ret;
315 }
316
317 pcie_bwnotif_enable(srv);
318 }
319 }
320
321 pci_dbg(port, "enabled with IRQ %d\n", srv->irq);
322
323 /* Don't fail on errors. Don't leave IS_ERR() "pointer" into ->cdev */
324 port->link_bwctrl->cdev = pcie_cooling_device_register(port);
325 if (IS_ERR(port->link_bwctrl->cdev))
326 port->link_bwctrl->cdev = NULL;
327
328 return 0;
329 }
330
pcie_bwnotif_remove(struct pcie_device * srv)331 static void pcie_bwnotif_remove(struct pcie_device *srv)
332 {
333 struct pcie_bwctrl_data *data = srv->port->link_bwctrl;
334
335 pcie_cooling_device_unregister(data->cdev);
336
337 scoped_guard(rwsem_write, &pcie_bwctrl_setspeed_rwsem) {
338 scoped_guard(rwsem_write, &pcie_bwctrl_lbms_rwsem) {
339 pcie_bwnotif_disable(srv->port);
340
341 free_irq(srv->irq, srv);
342
343 srv->port->link_bwctrl = NULL;
344 }
345 }
346 }
347
pcie_bwnotif_suspend(struct pcie_device * srv)348 static int pcie_bwnotif_suspend(struct pcie_device *srv)
349 {
350 pcie_bwnotif_disable(srv->port);
351 return 0;
352 }
353
/*
 * Resume callback: re-enable bandwidth notifications, which also refreshes
 * the cached link speed. Always returns 0.
 */
static int pcie_bwnotif_resume(struct pcie_device *srv)
{
	pcie_bwnotif_enable(srv);

	return 0;
}
359
360 static struct pcie_port_service_driver pcie_bwctrl_driver = {
361 .name = "pcie_bwctrl",
362 .port_type = PCIE_ANY_PORT,
363 .service = PCIE_PORT_SERVICE_BWCTRL,
364 .probe = pcie_bwnotif_probe,
365 .suspend = pcie_bwnotif_suspend,
366 .resume = pcie_bwnotif_resume,
367 .remove = pcie_bwnotif_remove,
368 };
369
pcie_bwctrl_init(void)370 int __init pcie_bwctrl_init(void)
371 {
372 return pcie_port_service_register(&pcie_bwctrl_driver);
373 }
374