1 // SPDX-License-Identifier: GPL-2.0-only
2 /* Copyright(c) 2022 Intel Corporation. All rights reserved. */
3 #include <linux/memregion.h>
4 #include <linux/genalloc.h>
5 #include <linux/debugfs.h>
6 #include <linux/device.h>
7 #include <linux/module.h>
8 #include <linux/memory.h>
9 #include <linux/slab.h>
10 #include <linux/uuid.h>
11 #include <linux/sort.h>
12 #include <linux/idr.h>
13 #include <linux/memory-tiers.h>
14 #include <linux/string_choices.h>
15 #include <cxlmem.h>
16 #include <cxl.h>
17 #include "core.h"
18
19 /**
20 * DOC: cxl core region
21 *
22 * CXL Regions represent mapped memory capacity in system physical address
23 * space. Whereas the CXL Root Decoders identify the bounds of potential CXL
24 * Memory ranges, Regions represent the active mapped capacity by the HDM
25 * Decoder Capability structures throughout the Host Bridges, Switches, and
26 * Endpoints in the topology.
27 *
28 * Region configuration has ordering constraints. UUID may be set at any time
29 * but is only visible for persistent regions.
30 * 1. Interleave granularity
31 * 2. Interleave size
32 * 3. Decoder targets
33 */
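/*
 * Illustrative sysfs flow (editorial note, not part of this file; device
 * names such as decoder0.0, region0 and decoder2.0 are hypothetical) showing
 * one ordering that satisfies the constraints above for a persistent region:
 *
 *   echo region0 > /sys/bus/cxl/devices/decoder0.0/create_pmem_region
 *   uuidgen > /sys/bus/cxl/devices/region0/uuid
 *   echo 256 > /sys/bus/cxl/devices/region0/interleave_granularity
 *   echo 1 > /sys/bus/cxl/devices/region0/interleave_ways
 *   echo $((256 << 20)) > /sys/bus/cxl/devices/region0/size
 *   echo decoder2.0 > /sys/bus/cxl/devices/region0/target0
 *   echo 1 > /sys/bus/cxl/devices/region0/commit
 */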
34
35 /*
36 * Nodemask with a bit set per node once the access_coordinates for that
37 * node have been updated by the CXL memory hotplug notifier.
38 */
39 static nodemask_t nodemask_region_seen = NODE_MASK_NONE;
40
41 static struct cxl_region *to_cxl_region(struct device *dev);
42
43 #define __ACCESS_ATTR_RO(_level, _name) { \
44 .attr = { .name = __stringify(_name), .mode = 0444 }, \
45 .show = _name##_access##_level##_show, \
46 }
47
48 #define ACCESS_DEVICE_ATTR_RO(level, name) \
49 struct device_attribute dev_attr_access##level##_##name = __ACCESS_ATTR_RO(level, name)
50
51 #define ACCESS_ATTR_RO(level, attrib) \
52 static ssize_t attrib##_access##level##_show(struct device *dev, \
53 struct device_attribute *attr, \
54 char *buf) \
55 { \
56 struct cxl_region *cxlr = to_cxl_region(dev); \
57 \
58 if (cxlr->coord[level].attrib == 0) \
59 return -ENOENT; \
60 \
61 return sysfs_emit(buf, "%u\n", cxlr->coord[level].attrib); \
62 } \
63 static ACCESS_DEVICE_ATTR_RO(level, attrib)
64
65 ACCESS_ATTR_RO(0, read_bandwidth);
66 ACCESS_ATTR_RO(0, read_latency);
67 ACCESS_ATTR_RO(0, write_bandwidth);
68 ACCESS_ATTR_RO(0, write_latency);
69
70 #define ACCESS_ATTR_DECLARE(level, attrib) \
71 (&dev_attr_access##level##_##attrib.attr)
72
73 static struct attribute *access0_coordinate_attrs[] = {
74 ACCESS_ATTR_DECLARE(0, read_bandwidth),
75 ACCESS_ATTR_DECLARE(0, write_bandwidth),
76 ACCESS_ATTR_DECLARE(0, read_latency),
77 ACCESS_ATTR_DECLARE(0, write_latency),
78 NULL
79 };
80
81 ACCESS_ATTR_RO(1, read_bandwidth);
82 ACCESS_ATTR_RO(1, read_latency);
83 ACCESS_ATTR_RO(1, write_bandwidth);
84 ACCESS_ATTR_RO(1, write_latency);
85
86 static struct attribute *access1_coordinate_attrs[] = {
87 ACCESS_ATTR_DECLARE(1, read_bandwidth),
88 ACCESS_ATTR_DECLARE(1, write_bandwidth),
89 ACCESS_ATTR_DECLARE(1, read_latency),
90 ACCESS_ATTR_DECLARE(1, write_latency),
91 NULL
92 };
93
94 #define ACCESS_VISIBLE(level) \
95 static umode_t cxl_region_access##level##_coordinate_visible( \
96 struct kobject *kobj, struct attribute *a, int n) \
97 { \
98 struct device *dev = kobj_to_dev(kobj); \
99 struct cxl_region *cxlr = to_cxl_region(dev); \
100 \
101 if (a == &dev_attr_access##level##_read_latency.attr && \
102 cxlr->coord[level].read_latency == 0) \
103 return 0; \
104 \
105 if (a == &dev_attr_access##level##_write_latency.attr && \
106 cxlr->coord[level].write_latency == 0) \
107 return 0; \
108 \
109 if (a == &dev_attr_access##level##_read_bandwidth.attr && \
110 cxlr->coord[level].read_bandwidth == 0) \
111 return 0; \
112 \
113 if (a == &dev_attr_access##level##_write_bandwidth.attr && \
114 cxlr->coord[level].write_bandwidth == 0) \
115 return 0; \
116 \
117 return a->mode; \
118 }
119
120 ACCESS_VISIBLE(0);
121 ACCESS_VISIBLE(1);
122
123 static const struct attribute_group cxl_region_access0_coordinate_group = {
124 .name = "access0",
125 .attrs = access0_coordinate_attrs,
126 .is_visible = cxl_region_access0_coordinate_visible,
127 };
128
129 static const struct attribute_group *get_cxl_region_access0_group(void)
130 {
131 return &cxl_region_access0_coordinate_group;
132 }
133
134 static const struct attribute_group cxl_region_access1_coordinate_group = {
135 .name = "access1",
136 .attrs = access1_coordinate_attrs,
137 .is_visible = cxl_region_access1_coordinate_visible,
138 };
139
140 static const struct attribute_group *get_cxl_region_access1_group(void)
141 {
142 return &cxl_region_access1_coordinate_group;
143 }
144
145 static ssize_t uuid_show(struct device *dev, struct device_attribute *attr,
146 char *buf)
147 {
148 struct cxl_region *cxlr = to_cxl_region(dev);
149 struct cxl_region_params *p = &cxlr->params;
150 ssize_t rc;
151
152 ACQUIRE(rwsem_read_intr, region_rwsem)(&cxl_rwsem.region);
153 if ((rc = ACQUIRE_ERR(rwsem_read_intr, &region_rwsem)))
154 return rc;
155 if (cxlr->mode != CXL_PARTMODE_PMEM)
156 return sysfs_emit(buf, "\n");
157 return sysfs_emit(buf, "%pUb\n", &p->uuid);
158 }
159
160 static int is_dup(struct device *match, void *data)
161 {
162 struct cxl_region_params *p;
163 struct cxl_region *cxlr;
164 uuid_t *uuid = data;
165
166 if (!is_cxl_region(match))
167 return 0;
168
169 lockdep_assert_held(&cxl_rwsem.region);
170 cxlr = to_cxl_region(match);
171 p = &cxlr->params;
172
173 if (uuid_equal(&p->uuid, uuid)) {
174 dev_dbg(match, "already has uuid: %pUb\n", uuid);
175 return -EBUSY;
176 }
177
178 return 0;
179 }
180
181 static ssize_t uuid_store(struct device *dev, struct device_attribute *attr,
182 const char *buf, size_t len)
183 {
184 struct cxl_region *cxlr = to_cxl_region(dev);
185 struct cxl_region_params *p = &cxlr->params;
186 uuid_t temp;
187 ssize_t rc;
188
189 if (len != UUID_STRING_LEN + 1)
190 return -EINVAL;
191
192 rc = uuid_parse(buf, &temp);
193 if (rc)
194 return rc;
195
196 if (uuid_is_null(&temp))
197 return -EINVAL;
198
199 ACQUIRE(rwsem_write_kill, region_rwsem)(&cxl_rwsem.region);
200 if ((rc = ACQUIRE_ERR(rwsem_write_kill, &region_rwsem)))
201 return rc;
202
203 if (uuid_equal(&p->uuid, &temp))
204 return len;
205
206 if (p->state >= CXL_CONFIG_ACTIVE)
207 return -EBUSY;
208
209 rc = bus_for_each_dev(&cxl_bus_type, NULL, &temp, is_dup);
210 if (rc < 0)
211 return rc;
212
213 uuid_copy(&p->uuid, &temp);
214
215 return len;
216 }
217 static DEVICE_ATTR_RW(uuid);
218
219 static struct cxl_region_ref *cxl_rr_load(struct cxl_port *port,
220 struct cxl_region *cxlr)
221 {
222 return xa_load(&port->regions, (unsigned long)cxlr);
223 }
224
225 static int cxl_region_invalidate_memregion(struct cxl_region *cxlr)
226 {
227 if (!cpu_cache_has_invalidate_memregion()) {
228 if (IS_ENABLED(CONFIG_CXL_REGION_INVALIDATION_TEST)) {
229 dev_info_once(
230 &cxlr->dev,
231 "Bypassing cpu_cache_invalidate_memregion() for testing!\n");
232 return 0;
233 }
234 dev_WARN(&cxlr->dev,
235 "Failed to synchronize CPU cache state\n");
236 return -ENXIO;
237 }
238
239 if (!cxlr->params.res)
240 return -ENXIO;
241 cpu_cache_invalidate_memregion(cxlr->params.res->start,
242 resource_size(cxlr->params.res));
243 return 0;
244 }
245
246 static void cxl_region_decode_reset(struct cxl_region *cxlr, int count)
247 {
248 struct cxl_region_params *p = &cxlr->params;
249 int i;
250
251 if (test_bit(CXL_REGION_F_LOCK, &cxlr->flags))
252 return;
253
254 /*
255 * Before region teardown, attempt to flush and evict any data cached for
256 * this region, or scream loudly about missing arch / platform support
257 * for CXL teardown.
258 */
259 cxl_region_invalidate_memregion(cxlr);
260
261 for (i = count - 1; i >= 0; i--) {
262 struct cxl_endpoint_decoder *cxled = p->targets[i];
263 struct cxl_memdev *cxlmd = cxled_to_memdev(cxled);
264 struct cxl_port *iter = cxled_to_port(cxled);
265 struct cxl_dev_state *cxlds = cxlmd->cxlds;
266 struct cxl_ep *ep;
267
268 if (cxlds->rcd)
269 goto endpoint_reset;
270
271 while (!is_cxl_root(to_cxl_port(iter->dev.parent)))
272 iter = to_cxl_port(iter->dev.parent);
273
274 for (ep = cxl_ep_load(iter, cxlmd); iter;
275 iter = ep->next, ep = cxl_ep_load(iter, cxlmd)) {
276 struct cxl_region_ref *cxl_rr;
277 struct cxl_decoder *cxld;
278
279 cxl_rr = cxl_rr_load(iter, cxlr);
280 cxld = cxl_rr->decoder;
281 if (cxld->reset)
282 cxld->reset(cxld);
283 set_bit(CXL_REGION_F_NEEDS_RESET, &cxlr->flags);
284 }
285
286 endpoint_reset:
287 cxled->cxld.reset(&cxled->cxld);
288 set_bit(CXL_REGION_F_NEEDS_RESET, &cxlr->flags);
289 }
290
291 /* all decoders associated with this region have been torn down */
292 clear_bit(CXL_REGION_F_NEEDS_RESET, &cxlr->flags);
293 }
294
295 static int commit_decoder(struct cxl_decoder *cxld)
296 {
297 struct cxl_switch_decoder *cxlsd = NULL;
298
299 if (cxld->commit)
300 return cxld->commit(cxld);
301
302 if (is_switch_decoder(&cxld->dev))
303 cxlsd = to_cxl_switch_decoder(&cxld->dev);
304
305 if (dev_WARN_ONCE(&cxld->dev, !cxlsd || cxlsd->nr_targets > 1,
306 "->commit() is required\n"))
307 return -ENXIO;
308 return 0;
309 }
310
311 static int cxl_region_decode_commit(struct cxl_region *cxlr)
312 {
313 struct cxl_region_params *p = &cxlr->params;
314 int i, rc = 0;
315
316 for (i = 0; i < p->nr_targets; i++) {
317 struct cxl_endpoint_decoder *cxled = p->targets[i];
318 struct cxl_memdev *cxlmd = cxled_to_memdev(cxled);
319 struct cxl_region_ref *cxl_rr;
320 struct cxl_decoder *cxld;
321 struct cxl_port *iter;
322 struct cxl_ep *ep;
323
324 /* commit bottom up */
325 for (iter = cxled_to_port(cxled); !is_cxl_root(iter);
326 iter = to_cxl_port(iter->dev.parent)) {
327 cxl_rr = cxl_rr_load(iter, cxlr);
328 cxld = cxl_rr->decoder;
329 rc = commit_decoder(cxld);
330 if (rc)
331 break;
332 }
333
334 if (rc) {
335 /* programming @iter failed, teardown */
336 for (ep = cxl_ep_load(iter, cxlmd); ep && iter;
337 iter = ep->next, ep = cxl_ep_load(iter, cxlmd)) {
338 cxl_rr = cxl_rr_load(iter, cxlr);
339 cxld = cxl_rr->decoder;
340 if (cxld->reset)
341 cxld->reset(cxld);
342 }
343
344 cxled->cxld.reset(&cxled->cxld);
345 goto err;
346 }
347 }
348
349 return 0;
350
351 err:
352 /* undo the targets that were successfully committed */
353 cxl_region_decode_reset(cxlr, i);
354 return rc;
355 }
356
357 static int queue_reset(struct cxl_region *cxlr)
358 {
359 struct cxl_region_params *p = &cxlr->params;
360 int rc;
361
362 ACQUIRE(rwsem_write_kill, rwsem)(&cxl_rwsem.region);
363 if ((rc = ACQUIRE_ERR(rwsem_write_kill, &rwsem)))
364 return rc;
365
366 /* Already in the requested state? */
367 if (p->state < CXL_CONFIG_COMMIT)
368 return 0;
369
370 p->state = CXL_CONFIG_RESET_PENDING;
371
372 return 0;
373 }
374
375 static int __commit(struct cxl_region *cxlr)
376 {
377 struct cxl_region_params *p = &cxlr->params;
378 int rc;
379
380 ACQUIRE(rwsem_write_kill, rwsem)(&cxl_rwsem.region);
381 if ((rc = ACQUIRE_ERR(rwsem_write_kill, &rwsem)))
382 return rc;
383
384 /* Already in the requested state? */
385 if (p->state >= CXL_CONFIG_COMMIT)
386 return 0;
387
388 /* Not ready to commit? */
389 if (p->state < CXL_CONFIG_ACTIVE)
390 return -ENXIO;
391
392 /*
393 * Invalidate caches before region setup to drop any speculative
394 * consumption of this address space
395 */
396 rc = cxl_region_invalidate_memregion(cxlr);
397 if (rc)
398 return rc;
399
400 rc = cxl_region_decode_commit(cxlr);
401 if (rc)
402 return rc;
403
404 p->state = CXL_CONFIG_COMMIT;
405
406 return 0;
407 }
408
409 static ssize_t commit_store(struct device *dev, struct device_attribute *attr,
410 const char *buf, size_t len)
411 {
412 struct cxl_region *cxlr = to_cxl_region(dev);
413 struct cxl_region_params *p = &cxlr->params;
414 bool commit;
415 ssize_t rc;
416
417 rc = kstrtobool(buf, &commit);
418 if (rc)
419 return rc;
420
421 if (commit) {
422 rc = __commit(cxlr);
423 if (rc)
424 return rc;
425 return len;
426 }
427
428 if (test_bit(CXL_REGION_F_LOCK, &cxlr->flags))
429 return -EPERM;
430
431 rc = queue_reset(cxlr);
432 if (rc)
433 return rc;
434
435 /*
436 * Unmap the region and rely on the reset-pending state to ensure
437 * it does not go active again until after the reset
438 */
439 device_release_driver(&cxlr->dev);
440
441 /*
442 * With the reset pending take cxl_rwsem.region unconditionally
443 * to ensure the reset gets handled before returning.
444 */
445 guard(rwsem_write)(&cxl_rwsem.region);
446
447 /*
448 * Revalidate that the reset is still pending in case another
449 * thread already handled this reset.
450 */
451 if (p->state == CXL_CONFIG_RESET_PENDING) {
452 cxl_region_decode_reset(cxlr, p->interleave_ways);
453 p->state = CXL_CONFIG_ACTIVE;
454 }
455
456 return len;
457 }
458
459 static ssize_t commit_show(struct device *dev, struct device_attribute *attr,
460 char *buf)
461 {
462 struct cxl_region *cxlr = to_cxl_region(dev);
463 struct cxl_region_params *p = &cxlr->params;
464 ssize_t rc;
465
466 ACQUIRE(rwsem_read_intr, rwsem)(&cxl_rwsem.region);
467 if ((rc = ACQUIRE_ERR(rwsem_read_intr, &rwsem)))
468 return rc;
469 return sysfs_emit(buf, "%d\n", p->state >= CXL_CONFIG_COMMIT);
470 }
471 static DEVICE_ATTR_RW(commit);
472
473 static ssize_t interleave_ways_show(struct device *dev,
474 struct device_attribute *attr, char *buf)
475 {
476 struct cxl_region *cxlr = to_cxl_region(dev);
477 struct cxl_region_params *p = &cxlr->params;
478 int rc;
479
480 ACQUIRE(rwsem_read_intr, rwsem)(&cxl_rwsem.region);
481 if ((rc = ACQUIRE_ERR(rwsem_read_intr, &rwsem)))
482 return rc;
483 return sysfs_emit(buf, "%d\n", p->interleave_ways);
484 }
485
486 static const struct attribute_group *get_cxl_region_target_group(void);
487
488 static int set_interleave_ways(struct cxl_region *cxlr, int val)
489 {
490 struct cxl_root_decoder *cxlrd = cxlr->cxlrd;
491 struct cxl_decoder *cxld = &cxlrd->cxlsd.cxld;
492 struct cxl_region_params *p = &cxlr->params;
493 int save, rc;
494 u8 iw;
495
496 rc = ways_to_eiw(val, &iw);
497 if (rc)
498 return rc;
499
500 /*
501 * Even for x3, x6, and x12 interleaves the region interleave must be a
502 * power of 2 multiple of the host bridge interleave.
503 */
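/*
 * For example (editorial illustration of the check below): with a x3 host
 * bridge interleave at the root, x3, x6 and x12 regions pass (6 / 3 = 2 and
 * 12 / 3 = 4 are powers of 2), while x4 and x9 are rejected (4 % 3 != 0,
 * and 9 / 3 = 3 is not a power of 2).
 */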
504 if (!is_power_of_2(val / cxld->interleave_ways) ||
505 (val % cxld->interleave_ways)) {
506 dev_dbg(&cxlr->dev, "invalid interleave: %d\n", val);
507 return -EINVAL;
508 }
509
510 lockdep_assert_held_write(&cxl_rwsem.region);
511
512 if (p->state >= CXL_CONFIG_INTERLEAVE_ACTIVE)
513 return -EBUSY;
514
515 save = p->interleave_ways;
516 p->interleave_ways = val;
517 rc = sysfs_update_group(&cxlr->dev.kobj, get_cxl_region_target_group());
518 if (rc)
519 p->interleave_ways = save;
520
521 return rc;
522 }
523
524 static ssize_t interleave_ways_store(struct device *dev,
525 struct device_attribute *attr,
526 const char *buf, size_t len)
527 {
528 struct cxl_region *cxlr = to_cxl_region(dev);
529 int val;
530 int rc;
531
532 rc = kstrtoint(buf, 0, &val);
533 if (rc)
534 return rc;
535
536 ACQUIRE(rwsem_write_kill, rwsem)(&cxl_rwsem.region);
537 if ((rc = ACQUIRE_ERR(rwsem_write_kill, &rwsem)))
538 return rc;
539
540 rc = set_interleave_ways(cxlr, val);
541 if (rc)
542 return rc;
543
544 return len;
545 }
546 static DEVICE_ATTR_RW(interleave_ways);
547
548 static ssize_t interleave_granularity_show(struct device *dev,
549 struct device_attribute *attr,
550 char *buf)
551 {
552 struct cxl_region *cxlr = to_cxl_region(dev);
553 struct cxl_region_params *p = &cxlr->params;
554 int rc;
555
556 ACQUIRE(rwsem_read_intr, rwsem)(&cxl_rwsem.region);
557 if ((rc = ACQUIRE_ERR(rwsem_read_intr, &rwsem)))
558 return rc;
559 return sysfs_emit(buf, "%d\n", p->interleave_granularity);
560 }
561
562 static int set_interleave_granularity(struct cxl_region *cxlr, int val)
563 {
564 struct cxl_root_decoder *cxlrd = cxlr->cxlrd;
565 struct cxl_decoder *cxld = &cxlrd->cxlsd.cxld;
566 struct cxl_region_params *p = &cxlr->params;
567 int rc;
568 u16 ig;
569
570 rc = granularity_to_eig(val, &ig);
571 if (rc)
572 return rc;
573
574 /*
575 * When the host-bridge is interleaved, disallow region granularity !=
576 * root granularity. Regions with a granularity less than the root
577 * interleave result in needing multiple endpoints to support a single
578 * slot in the interleave (possible to support in the future). Regions
579 * with a granularity greater than the root interleave result in invalid
580 * DPA translations (invalid to support).
581 */
582 if (cxld->interleave_ways > 1 && val != cxld->interleave_granularity)
583 return -EINVAL;
584
585 lockdep_assert_held_write(&cxl_rwsem.region);
586
587 if (p->state >= CXL_CONFIG_INTERLEAVE_ACTIVE)
588 return -EBUSY;
589
590 p->interleave_granularity = val;
591 return 0;
592 }
593
594 static ssize_t interleave_granularity_store(struct device *dev,
595 struct device_attribute *attr,
596 const char *buf, size_t len)
597 {
598 struct cxl_region *cxlr = to_cxl_region(dev);
599 int rc, val;
600
601 rc = kstrtoint(buf, 0, &val);
602 if (rc)
603 return rc;
604
605 ACQUIRE(rwsem_write_kill, rwsem)(&cxl_rwsem.region);
606 if ((rc = ACQUIRE_ERR(rwsem_write_kill, &rwsem)))
607 return rc;
608
609 rc = set_interleave_granularity(cxlr, val);
610 if (rc)
611 return rc;
612
613 return len;
614 }
615 static DEVICE_ATTR_RW(interleave_granularity);
616
617 static ssize_t resource_show(struct device *dev, struct device_attribute *attr,
618 char *buf)
619 {
620 struct cxl_region *cxlr = to_cxl_region(dev);
621 struct cxl_region_params *p = &cxlr->params;
622 u64 resource = -1ULL;
623 int rc;
624
625 ACQUIRE(rwsem_read_intr, rwsem)(&cxl_rwsem.region);
626 if ((rc = ACQUIRE_ERR(rwsem_read_intr, &rwsem)))
627 return rc;
628
629 if (p->res)
630 resource = p->res->start;
631 return sysfs_emit(buf, "%#llx\n", resource);
632 }
633 static DEVICE_ATTR_RO(resource);
634
635 static ssize_t mode_show(struct device *dev, struct device_attribute *attr,
636 char *buf)
637 {
638 struct cxl_region *cxlr = to_cxl_region(dev);
639 const char *desc;
640
641 if (cxlr->mode == CXL_PARTMODE_RAM)
642 desc = "ram";
643 else if (cxlr->mode == CXL_PARTMODE_PMEM)
644 desc = "pmem";
645 else
646 desc = "";
647
648 return sysfs_emit(buf, "%s\n", desc);
649 }
650 static DEVICE_ATTR_RO(mode);
651
652 static int alloc_hpa(struct cxl_region *cxlr, resource_size_t size)
653 {
654 struct cxl_root_decoder *cxlrd = cxlr->cxlrd;
655 struct cxl_region_params *p = &cxlr->params;
656 struct resource *res;
657 u64 remainder = 0;
658
659 lockdep_assert_held_write(&cxl_rwsem.region);
660
661 /* Nothing to do... */
662 if (p->res && resource_size(p->res) == size)
663 return 0;
664
665 /* To change size the old size must be freed first */
666 if (p->res)
667 return -EBUSY;
668
669 if (p->state >= CXL_CONFIG_INTERLEAVE_ACTIVE)
670 return -EBUSY;
671
672 /* ways, granularity and uuid (if PMEM) need to be set before HPA */
673 if (!p->interleave_ways || !p->interleave_granularity ||
674 (cxlr->mode == CXL_PARTMODE_PMEM && uuid_is_null(&p->uuid)))
675 return -ENXIO;
676
677 div64_u64_rem(size, (u64)SZ_256M * p->interleave_ways, &remainder);
678 if (remainder)
679 return -EINVAL;
680
681 res = alloc_free_mem_region(cxlrd->res, size, SZ_256M,
682 dev_name(&cxlr->dev));
683 if (IS_ERR(res)) {
684 dev_dbg(&cxlr->dev,
685 "HPA allocation error (%ld) for size:%pap in %s %pr\n",
686 PTR_ERR(res), &size, cxlrd->res->name, cxlrd->res);
687 return PTR_ERR(res);
688 }
689
690 cxlr->hpa_range = DEFINE_RANGE(res->start, res->end);
691
692 p->res = res;
693 p->state = CXL_CONFIG_INTERLEAVE_ACTIVE;
694
695 return 0;
696 }
697
698 static void cxl_region_iomem_release(struct cxl_region *cxlr)
699 {
700 struct cxl_region_params *p = &cxlr->params;
701
702 if (device_is_registered(&cxlr->dev))
703 lockdep_assert_held_write(&cxl_rwsem.region);
704 if (p->res) {
705 /*
706 * Autodiscovered regions may not have been able to insert their
707 * resource.
708 */
709 if (p->res->parent)
710 remove_resource(p->res);
711 kfree(p->res);
712 p->res = NULL;
713 }
714 }
715
716 static int free_hpa(struct cxl_region *cxlr)
717 {
718 struct cxl_region_params *p = &cxlr->params;
719
720 lockdep_assert_held_write(&cxl_rwsem.region);
721
722 if (!p->res)
723 return 0;
724
725 if (p->state >= CXL_CONFIG_ACTIVE)
726 return -EBUSY;
727
728 cxlr->hpa_range = DEFINE_RANGE(0, -1);
729
730 cxl_region_iomem_release(cxlr);
731 p->state = CXL_CONFIG_IDLE;
732 return 0;
733 }
734
735 static ssize_t size_store(struct device *dev, struct device_attribute *attr,
736 const char *buf, size_t len)
737 {
738 struct cxl_region *cxlr = to_cxl_region(dev);
739 u64 val;
740 int rc;
741
742 rc = kstrtou64(buf, 0, &val);
743 if (rc)
744 return rc;
745
746 ACQUIRE(rwsem_write_kill, rwsem)(&cxl_rwsem.region);
747 if ((rc = ACQUIRE_ERR(rwsem_write_kill, &rwsem)))
748 return rc;
749
750 if (val)
751 rc = alloc_hpa(cxlr, val);
752 else
753 rc = free_hpa(cxlr);
754
755 if (rc)
756 return rc;
757
758 return len;
759 }
760
761 static ssize_t size_show(struct device *dev, struct device_attribute *attr,
762 char *buf)
763 {
764 struct cxl_region *cxlr = to_cxl_region(dev);
765 struct cxl_region_params *p = &cxlr->params;
766 u64 size = 0;
767 ssize_t rc;
768
769 ACQUIRE(rwsem_read_intr, rwsem)(&cxl_rwsem.region);
770 if ((rc = ACQUIRE_ERR(rwsem_read_intr, &rwsem)))
771 return rc;
772 if (p->res)
773 size = resource_size(p->res);
774 return sysfs_emit(buf, "%#llx\n", size);
775 }
776 static DEVICE_ATTR_RW(size);
777
778 static ssize_t extended_linear_cache_size_show(struct device *dev,
779 struct device_attribute *attr,
780 char *buf)
781 {
782 struct cxl_region *cxlr = to_cxl_region(dev);
783 struct cxl_region_params *p = &cxlr->params;
784 ssize_t rc;
785
786 ACQUIRE(rwsem_read_intr, rwsem)(&cxl_rwsem.region);
787 if ((rc = ACQUIRE_ERR(rwsem_read_intr, &rwsem)))
788 return rc;
789 return sysfs_emit(buf, "%pap\n", &p->cache_size);
790 }
791 static DEVICE_ATTR_RO(extended_linear_cache_size);
792
793 static ssize_t locked_show(struct device *dev,
794 struct device_attribute *attr,
795 char *buf)
796 {
797 struct cxl_region *cxlr = to_cxl_region(dev);
798 int rc;
799
800 ACQUIRE(rwsem_read_intr, rwsem)(&cxl_rwsem.region);
801 if ((rc = ACQUIRE_ERR(rwsem_read_intr, &rwsem)))
802 return rc;
803
804 rc = test_bit(CXL_REGION_F_LOCK, &cxlr->flags);
805 return sysfs_emit(buf, "%d\n", rc);
806 }
807 static DEVICE_ATTR_RO(locked);
808
809 static struct attribute *cxl_region_attrs[] = {
810 &dev_attr_uuid.attr,
811 &dev_attr_commit.attr,
812 &dev_attr_interleave_ways.attr,
813 &dev_attr_interleave_granularity.attr,
814 &dev_attr_resource.attr,
815 &dev_attr_size.attr,
816 &dev_attr_mode.attr,
817 &dev_attr_extended_linear_cache_size.attr,
818 &dev_attr_locked.attr,
819 NULL,
820 };
821
822 static umode_t cxl_region_visible(struct kobject *kobj, struct attribute *a,
823 int n)
824 {
825 struct device *dev = kobj_to_dev(kobj);
826 struct cxl_region *cxlr = to_cxl_region(dev);
827
828 /*
829 * Support tooling that expects to find a 'uuid' attribute for all
830 * regions regardless of mode.
831 */
832 if (a == &dev_attr_uuid.attr && cxlr->mode != CXL_PARTMODE_PMEM)
833 return 0444;
834
835 /*
836 * Don't display extended linear cache attribute if there is no
837 * extended linear cache.
838 */
839 if (a == &dev_attr_extended_linear_cache_size.attr &&
840 cxlr->params.cache_size == 0)
841 return 0;
842
843 return a->mode;
844 }
845
846 static const struct attribute_group cxl_region_group = {
847 .attrs = cxl_region_attrs,
848 .is_visible = cxl_region_visible,
849 };
850
851 static size_t show_targetN(struct cxl_region *cxlr, char *buf, int pos)
852 {
853 struct cxl_region_params *p = &cxlr->params;
854 struct cxl_endpoint_decoder *cxled;
855 int rc;
856
857 ACQUIRE(rwsem_read_intr, rwsem)(&cxl_rwsem.region);
858 if ((rc = ACQUIRE_ERR(rwsem_read_intr, &rwsem)))
859 return rc;
860
861 if (pos >= p->interleave_ways) {
862 dev_dbg(&cxlr->dev, "position %d out of range %d\n", pos,
863 p->interleave_ways);
864 return -ENXIO;
865 }
866
867 cxled = p->targets[pos];
868 if (!cxled)
869 return sysfs_emit(buf, "\n");
870 return sysfs_emit(buf, "%s\n", dev_name(&cxled->cxld.dev));
871 }
872
873 static int check_commit_order(struct device *dev, void *data)
874 {
875 struct cxl_decoder *cxld = to_cxl_decoder(dev);
876
877 /*
878 * if port->commit_end is not the only free decoder, then out of
879 * order shutdown has occurred, block further allocations until
880 * that is resolved
881 */
882 if (((cxld->flags & CXL_DECODER_F_ENABLE) == 0))
883 return -EBUSY;
884 return 0;
885 }
886
887 static int match_free_decoder(struct device *dev, const void *data)
888 {
889 struct cxl_port *port = to_cxl_port(dev->parent);
890 struct cxl_decoder *cxld;
891 int rc;
892
893 if (!is_switch_decoder(dev))
894 return 0;
895
896 cxld = to_cxl_decoder(dev);
897
898 if (cxld->id != port->commit_end + 1)
899 return 0;
900
901 if (cxld->region) {
902 dev_dbg(dev->parent,
903 "next decoder to commit (%s) is already reserved (%s)\n",
904 dev_name(dev), dev_name(&cxld->region->dev));
905 return 0;
906 }
907
908 rc = device_for_each_child_reverse_from(dev->parent, dev, NULL,
909 check_commit_order);
910 if (rc) {
911 dev_dbg(dev->parent,
912 "unable to allocate %s due to out of order shutdown\n",
913 dev_name(dev));
914 return 0;
915 }
916 return 1;
917 }
918
919 static bool spa_maps_hpa(const struct cxl_region_params *p,
920 const struct range *range)
921 {
922 if (!p->res)
923 return false;
924
925 /*
926 * The extended linear cache region is constructed by a 1:1 ratio
927 * where the SPA maps equal amounts of DRAM and CXL HPA capacity with
928 * CXL decoders at the high end of the SPA range.
929 */
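/*
 * Editorial example derived from the check below: with p->res spanning
 * [S, E] and cache_size C, the decoder's HPA range must be exactly
 * [S + C, E] for the SPA to be considered a match.
 */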
930 return p->res->start + p->cache_size == range->start &&
931 p->res->end == range->end;
932 }
933
934 static int match_auto_decoder(struct device *dev, const void *data)
935 {
936 const struct cxl_region_params *p = data;
937 struct cxl_decoder *cxld;
938 struct range *r;
939
940 if (!is_switch_decoder(dev))
941 return 0;
942
943 cxld = to_cxl_decoder(dev);
944 r = &cxld->hpa_range;
945
946 if (spa_maps_hpa(p, r))
947 return 1;
948
949 return 0;
950 }
951
952 /**
953 * cxl_port_pick_region_decoder() - assign or lookup a decoder for a region
954 * @port: a port in the ancestry of the endpoint implied by @cxled
955 * @cxled: endpoint decoder to be, or currently, mapped by @port
956 * @cxlr: region to establish, or validate, decode @port
957 *
958 * In the region creation path cxl_port_pick_region_decoder() is an
959 * allocator to find a free port. In the region assembly path, it is
960 * recalling the decoder that platform firmware picked for validation
961 * purposes.
962 *
963 * The result is recorded in a 'struct cxl_region_ref' in @port.
964 */
965 static struct cxl_decoder *
966 cxl_port_pick_region_decoder(struct cxl_port *port,
967 struct cxl_endpoint_decoder *cxled,
968 struct cxl_region *cxlr)
969 {
970 struct device *dev;
971
972 if (port == cxled_to_port(cxled))
973 return &cxled->cxld;
974
975 if (test_bit(CXL_REGION_F_AUTO, &cxlr->flags))
976 dev = device_find_child(&port->dev, &cxlr->params,
977 match_auto_decoder);
978 else
979 dev = device_find_child(&port->dev, NULL, match_free_decoder);
980 if (!dev)
981 return NULL;
982 /*
983 * This decoder is pinned, i.e. stays registered, as long as the endpoint
984 * decoder is registered, and endpoint decoder unregistration holds the
985 * cxl_rwsem.region over unregister events, so no need to hold on to
986 * this extra reference.
987 */
988 put_device(dev);
989 return to_cxl_decoder(dev);
990 }
991
992 static bool auto_order_ok(struct cxl_port *port, struct cxl_region *cxlr_iter,
993 struct cxl_decoder *cxld)
994 {
995 struct cxl_region_ref *rr = cxl_rr_load(port, cxlr_iter);
996 struct cxl_decoder *cxld_iter = rr->decoder;
997
998 /*
999 * Allow the out of order assembly of auto-discovered regions.
1000 * Per CXL Spec 3.1 8.2.4.20.12 software must commit decoders
1001 * in HPA order. Confirm that the decoder with the lesser HPA
1002 * starting address has the lesser id.
1003 */
1004 dev_dbg(&cxld->dev, "check for HPA violation %s:%d < %s:%d\n",
1005 dev_name(&cxld->dev), cxld->id,
1006 dev_name(&cxld_iter->dev), cxld_iter->id);
1007
1008 if (cxld_iter->id > cxld->id)
1009 return true;
1010
1011 return false;
1012 }
1013
1014 static struct cxl_region_ref *
1015 alloc_region_ref(struct cxl_port *port, struct cxl_region *cxlr,
1016 struct cxl_endpoint_decoder *cxled,
1017 struct cxl_decoder *cxld)
1018 {
1019 struct cxl_region_params *p = &cxlr->params;
1020 struct cxl_region_ref *cxl_rr, *iter;
1021 unsigned long index;
1022 int rc;
1023
1024 xa_for_each(&port->regions, index, iter) {
1025 struct cxl_region_params *ip = &iter->region->params;
1026
1027 if (!ip->res || ip->res->start < p->res->start)
1028 continue;
1029
1030 if (test_bit(CXL_REGION_F_AUTO, &cxlr->flags)) {
1031 if (auto_order_ok(port, iter->region, cxld))
1032 continue;
1033 }
1034 dev_dbg(&cxlr->dev, "%s: HPA order violation %s:%pr vs %pr\n",
1035 dev_name(&port->dev),
1036 dev_name(&iter->region->dev), ip->res, p->res);
1037
1038 return ERR_PTR(-EBUSY);
1039 }
1040
1041 cxl_rr = kzalloc_obj(*cxl_rr);
1042 if (!cxl_rr)
1043 return ERR_PTR(-ENOMEM);
1044 cxl_rr->port = port;
1045 cxl_rr->region = cxlr;
1046 cxl_rr->nr_targets = 1;
1047 xa_init(&cxl_rr->endpoints);
1048
1049 rc = xa_insert(&port->regions, (unsigned long)cxlr, cxl_rr, GFP_KERNEL);
1050 if (rc) {
1051 dev_dbg(&cxlr->dev,
1052 "%s: failed to track region reference: %d\n",
1053 dev_name(&port->dev), rc);
1054 kfree(cxl_rr);
1055 return ERR_PTR(rc);
1056 }
1057
1058 return cxl_rr;
1059 }
1060
1061 static void cxl_rr_free_decoder(struct cxl_region_ref *cxl_rr)
1062 {
1063 struct cxl_region *cxlr = cxl_rr->region;
1064 struct cxl_decoder *cxld = cxl_rr->decoder;
1065
1066 if (!cxld)
1067 return;
1068
1069 dev_WARN_ONCE(&cxlr->dev, cxld->region != cxlr, "region mismatch\n");
1070 if (cxld->region == cxlr) {
1071 cxld->region = NULL;
1072 put_device(&cxlr->dev);
1073 }
1074 }
1075
1076 static void free_region_ref(struct cxl_region_ref *cxl_rr)
1077 {
1078 struct cxl_port *port = cxl_rr->port;
1079 struct cxl_region *cxlr = cxl_rr->region;
1080
1081 cxl_rr_free_decoder(cxl_rr);
1082 xa_erase(&port->regions, (unsigned long)cxlr);
1083 xa_destroy(&cxl_rr->endpoints);
1084 kfree(cxl_rr);
1085 }
1086
1087 static int cxl_rr_ep_add(struct cxl_region_ref *cxl_rr,
1088 struct cxl_endpoint_decoder *cxled)
1089 {
1090 int rc;
1091 struct cxl_port *port = cxl_rr->port;
1092 struct cxl_region *cxlr = cxl_rr->region;
1093 struct cxl_decoder *cxld = cxl_rr->decoder;
1094 struct cxl_ep *ep = cxl_ep_load(port, cxled_to_memdev(cxled));
1095
1096 if (ep) {
1097 rc = xa_insert(&cxl_rr->endpoints, (unsigned long)cxled, ep,
1098 GFP_KERNEL);
1099 if (rc)
1100 return rc;
1101 }
1102 cxl_rr->nr_eps++;
1103
1104 if (!cxld->region) {
1105 cxld->region = cxlr;
1106
1107 /*
1108 * Now that cxld->region is set the intermediate staging state
1109 * can be cleared.
1110 */
1111 if (cxld == &cxled->cxld &&
1112 cxled->state == CXL_DECODER_STATE_AUTO_STAGED)
1113 cxled->state = CXL_DECODER_STATE_AUTO;
1114 get_device(&cxlr->dev);
1115 }
1116
1117 return 0;
1118 }
1119
1120 static int cxl_rr_assign_decoder(struct cxl_port *port, struct cxl_region *cxlr,
1121 struct cxl_endpoint_decoder *cxled,
1122 struct cxl_region_ref *cxl_rr,
1123 struct cxl_decoder *cxld)
1124 {
1125 if (cxld->region) {
1126 dev_dbg(&cxlr->dev, "%s: %s already attached to %s\n",
1127 dev_name(&port->dev), dev_name(&cxld->dev),
1128 dev_name(&cxld->region->dev));
1129 return -EBUSY;
1130 }
1131
1132 /*
1133 * Endpoints should already match the region type, but backstop that
1134 * assumption with an assertion. Switch-decoders change mapping-type
1135 * based on what is mapped when they are assigned to a region.
1136 */
1137 dev_WARN_ONCE(&cxlr->dev,
1138 port == cxled_to_port(cxled) &&
1139 cxld->target_type != cxlr->type,
1140 "%s:%s mismatch decoder type %d -> %d\n",
1141 dev_name(&cxled_to_memdev(cxled)->dev),
1142 dev_name(&cxld->dev), cxld->target_type, cxlr->type);
1143 cxld->target_type = cxlr->type;
1144 cxl_rr->decoder = cxld;
1145 return 0;
1146 }
1147
1148 static void cxl_region_setup_flags(struct cxl_region *cxlr,
1149 struct cxl_decoder *cxld)
1150 {
1151 if (cxld->flags & CXL_DECODER_F_LOCK) {
1152 set_bit(CXL_REGION_F_LOCK, &cxlr->flags);
1153 clear_bit(CXL_REGION_F_NEEDS_RESET, &cxlr->flags);
1154 }
1155
1156 if (cxld->flags & CXL_DECODER_F_NORMALIZED_ADDRESSING)
1157 set_bit(CXL_REGION_F_NORMALIZED_ADDRESSING, &cxlr->flags);
1158 }
1159
1160 /**
1161 * cxl_port_attach_region() - track a region's interest in a port by endpoint
1162 * @port: port to add a new region reference 'struct cxl_region_ref'
1163 * @cxlr: region to attach to @port
1164 * @cxled: endpoint decoder used to create or further pin a region reference
1165 * @pos: interleave position of @cxled in @cxlr
1166 *
1167 * The attach event is an opportunity to validate CXL decode setup
1168 * constraints and record metadata needed for programming HDM decoders,
1169 * in particular decoder target lists.
1170 *
1171 * The steps are:
1172 *
1173 * - validate that there are no other regions with a higher HPA already
1174 * associated with @port
1175 * - establish a region reference if one is not already present
1176 *
1177 * - additionally allocate a decoder instance that will host @cxlr on
1178 * @port
1179 *
1180 * - pin the region reference by the endpoint
1181 * - account for how many entries in @port's target list are needed to
1182 * cover all of the added endpoints.
1183 */
1184 static int cxl_port_attach_region(struct cxl_port *port,
1185 struct cxl_region *cxlr,
1186 struct cxl_endpoint_decoder *cxled, int pos)
1187 {
1188 struct cxl_memdev *cxlmd = cxled_to_memdev(cxled);
1189 struct cxl_ep *ep = cxl_ep_load(port, cxlmd);
1190 struct cxl_region_ref *cxl_rr;
1191 bool nr_targets_inc = false;
1192 struct cxl_decoder *cxld;
1193 unsigned long index;
1194 int rc = -EBUSY;
1195
1196 lockdep_assert_held_write(&cxl_rwsem.region);
1197
1198 cxl_rr = cxl_rr_load(port, cxlr);
1199 if (cxl_rr) {
1200 struct cxl_ep *ep_iter;
1201 int found = 0;
1202
1203 /*
1204 * Walk the existing endpoints that have been attached to
1205 * @cxlr at @port and see if they share the same 'next' port
1206 * in the downstream direction, i.e. endpoints that share a common
1207 * upstream switch.
1208 */
1209 xa_for_each(&cxl_rr->endpoints, index, ep_iter) {
1210 if (ep_iter == ep)
1211 continue;
1212 if (ep_iter->next == ep->next) {
1213 found++;
1214 break;
1215 }
1216 }
1217
1218 /*
1219 * New target port, or @port is an endpoint port that always
1220 * accounts its own local decode as a target.
1221 */
1222 if (!found || !ep->next) {
1223 cxl_rr->nr_targets++;
1224 nr_targets_inc = true;
1225 }
1226 } else {
1227 struct cxl_decoder *cxld;
1228
1229 cxld = cxl_port_pick_region_decoder(port, cxled, cxlr);
1230 if (!cxld) {
1231 dev_dbg(&cxlr->dev, "%s: no decoder available\n",
1232 dev_name(&port->dev));
1233 return -EBUSY;
1234 }
1235
1236 cxl_rr = alloc_region_ref(port, cxlr, cxled, cxld);
1237 if (IS_ERR(cxl_rr)) {
1238 dev_dbg(&cxlr->dev,
1239 "%s: failed to allocate region reference\n",
1240 dev_name(&port->dev));
1241 return PTR_ERR(cxl_rr);
1242 }
1243 nr_targets_inc = true;
1244
1245 rc = cxl_rr_assign_decoder(port, cxlr, cxled, cxl_rr, cxld);
1246 if (rc)
1247 goto out_erase;
1248 }
1249 cxld = cxl_rr->decoder;
1250
1251 /*
1252 * the number of targets should not exceed the target_count
1253 * of the decoder
1254 */
1255 if (is_switch_decoder(&cxld->dev)) {
1256 struct cxl_switch_decoder *cxlsd;
1257
1258 cxlsd = to_cxl_switch_decoder(&cxld->dev);
1259 if (cxl_rr->nr_targets > cxlsd->nr_targets) {
1260 dev_dbg(&cxlr->dev,
1261 "%s:%s %s add: %s:%s @ %d overflows targets: %d\n",
1262 dev_name(port->uport_dev), dev_name(&port->dev),
1263 dev_name(&cxld->dev), dev_name(&cxlmd->dev),
1264 dev_name(&cxled->cxld.dev), pos,
1265 cxlsd->nr_targets);
1266 rc = -ENXIO;
1267 goto out_erase;
1268 }
1269 }
1270
1271 cxl_region_setup_flags(cxlr, cxld);
1272
1273 rc = cxl_rr_ep_add(cxl_rr, cxled);
1274 if (rc) {
1275 dev_dbg(&cxlr->dev,
1276 "%s: failed to track endpoint %s:%s reference\n",
1277 dev_name(&port->dev), dev_name(&cxlmd->dev),
1278 dev_name(&cxld->dev));
1279 goto out_erase;
1280 }
1281
1282 dev_dbg(&cxlr->dev,
1283 "%s:%s %s add: %s:%s @ %d next: %s nr_eps: %d nr_targets: %d\n",
1284 dev_name(port->uport_dev), dev_name(&port->dev),
1285 dev_name(&cxld->dev), dev_name(&cxlmd->dev),
1286 dev_name(&cxled->cxld.dev), pos,
1287 ep ? ep->next ? dev_name(ep->next->uport_dev) :
1288 dev_name(&cxlmd->dev) :
1289 "none",
1290 cxl_rr->nr_eps, cxl_rr->nr_targets);
1291
1292 return 0;
1293 out_erase:
1294 if (nr_targets_inc)
1295 cxl_rr->nr_targets--;
1296 if (cxl_rr->nr_eps == 0)
1297 free_region_ref(cxl_rr);
1298 return rc;
1299 }
1300
1301 static void cxl_port_detach_region(struct cxl_port *port,
1302 struct cxl_region *cxlr,
1303 struct cxl_endpoint_decoder *cxled)
1304 {
1305 struct cxl_region_ref *cxl_rr;
1306 struct cxl_ep *ep = NULL;
1307
1308 lockdep_assert_held_write(&cxl_rwsem.region);
1309
1310 cxl_rr = cxl_rr_load(port, cxlr);
1311 if (!cxl_rr)
1312 return;
1313
1314 /*
1315 * Endpoint ports do not carry cxl_ep references, and they
1316 * never target more than one endpoint by definition
1317 */
1318 if (cxl_rr->decoder == &cxled->cxld)
1319 cxl_rr->nr_eps--;
1320 else
1321 ep = xa_erase(&cxl_rr->endpoints, (unsigned long)cxled);
1322 if (ep) {
1323 struct cxl_ep *ep_iter;
1324 unsigned long index;
1325 int found = 0;
1326
1327 cxl_rr->nr_eps--;
1328 xa_for_each(&cxl_rr->endpoints, index, ep_iter) {
1329 if (ep_iter->next == ep->next) {
1330 found++;
1331 break;
1332 }
1333 }
1334 if (!found)
1335 cxl_rr->nr_targets--;
1336 }
1337
1338 if (cxl_rr->nr_eps == 0)
1339 free_region_ref(cxl_rr);
1340 }
1341
1342 static int check_last_peer(struct cxl_endpoint_decoder *cxled,
1343 struct cxl_ep *ep, struct cxl_region_ref *cxl_rr,
1344 int distance)
1345 {
1346 struct cxl_memdev *cxlmd = cxled_to_memdev(cxled);
1347 struct cxl_region *cxlr = cxl_rr->region;
1348 struct cxl_region_params *p = &cxlr->params;
1349 struct cxl_endpoint_decoder *cxled_peer;
1350 struct cxl_port *port = cxl_rr->port;
1351 struct cxl_memdev *cxlmd_peer;
1352 struct cxl_ep *ep_peer;
1353 int pos = cxled->pos;
1354
1355 /*
1356 * If this position wants to share a dport with the last endpoint mapped
1357 * then that endpoint, at index 'position - distance', must also be
1358 * mapped by this dport.
1359 */
1360 if (pos < distance) {
1361 dev_dbg(&cxlr->dev, "%s:%s: cannot host %s:%s at %d\n",
1362 dev_name(port->uport_dev), dev_name(&port->dev),
1363 dev_name(&cxlmd->dev), dev_name(&cxled->cxld.dev), pos);
1364 return -ENXIO;
1365 }
1366 cxled_peer = p->targets[pos - distance];
1367 cxlmd_peer = cxled_to_memdev(cxled_peer);
1368 ep_peer = cxl_ep_load(port, cxlmd_peer);
1369 if (ep->dport != ep_peer->dport) {
1370 dev_dbg(&cxlr->dev,
1371 "%s:%s: %s:%s pos %d mismatched peer %s:%s\n",
1372 dev_name(port->uport_dev), dev_name(&port->dev),
1373 dev_name(&cxlmd->dev), dev_name(&cxled->cxld.dev), pos,
1374 dev_name(&cxlmd_peer->dev),
1375 dev_name(&cxled_peer->cxld.dev));
1376 return -ENXIO;
1377 }
1378
1379 return 0;
1380 }
1381
1382 static int check_interleave_cap(struct cxl_decoder *cxld, int iw, int ig)
1383 {
1384 struct cxl_port *port = to_cxl_port(cxld->dev.parent);
1385 struct cxl_hdm *cxlhdm = dev_get_drvdata(&port->dev);
1386 unsigned int interleave_mask;
1387 u8 eiw;
1388 u16 eig;
1389 int high_pos, low_pos;
1390
1391 if (!test_bit(iw, &cxlhdm->iw_cap_mask))
1392 return -ENXIO;
1393 /*
1394 * Per CXL specification r3.1(8.2.4.20.13 Decoder Protection),
1395 * if eiw < 8:
1396 * DPAOFFSET[51: eig + 8] = HPAOFFSET[51: eig + 8 + eiw]
1397 * DPAOFFSET[eig + 7: 0] = HPAOFFSET[eig + 7: 0]
1398 *
1399 * when the eiw is 0, all the bits of HPAOFFSET[51: 0] are used, the
1400 * interleave bits are none.
1401 *
1402 * if eiw >= 8:
1403 * DPAOFFSET[51: eig + 8] = HPAOFFSET[51: eig + eiw] / 3
1404 * DPAOFFSET[eig + 7: 0] = HPAOFFSET[eig + 7: 0]
1405 *
1406 * when the eiw is 8, all the bits of HPAOFFSET[51: 0] are used, the
1407 * interleave bits are none.
1408 */
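/*
 * Editorial worked example: iw = 4 (eiw = 2) with ig = 1024 (eig = 2)
 * gives high_pos = 2 + 2 + 7 = 11 and low_pos = 2 + 8 = 10, i.e. the
 * decoder must be able to route on HPA bits GENMASK(11, 10).
 */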
1409 ways_to_eiw(iw, &eiw);
1410 if (eiw == 0 || eiw == 8)
1411 return 0;
1412
1413 granularity_to_eig(ig, &eig);
1414 if (eiw > 8)
1415 high_pos = eiw + eig - 1;
1416 else
1417 high_pos = eiw + eig + 7;
1418 low_pos = eig + 8;
1419 interleave_mask = GENMASK(high_pos, low_pos);
1420 if (interleave_mask & ~cxlhdm->interleave_mask)
1421 return -ENXIO;
1422
1423 return 0;
1424 }
1425
1426 static int cxl_port_setup_targets(struct cxl_port *port,
1427 struct cxl_region *cxlr,
1428 struct cxl_endpoint_decoder *cxled)
1429 {
1430 struct cxl_root_decoder *cxlrd = cxlr->cxlrd;
1431 int parent_iw, parent_ig, ig, iw, rc, pos = cxled->pos;
1432 struct cxl_port *parent_port = to_cxl_port(port->dev.parent);
1433 struct cxl_region_ref *cxl_rr = cxl_rr_load(port, cxlr);
1434 struct cxl_memdev *cxlmd = cxled_to_memdev(cxled);
1435 struct cxl_ep *ep = cxl_ep_load(port, cxlmd);
1436 struct cxl_region_params *p = &cxlr->params;
1437 struct cxl_decoder *cxld = cxl_rr->decoder;
1438 struct cxl_switch_decoder *cxlsd;
1439 struct cxl_port *iter = port;
1440 u16 eig, peig;
1441 u8 eiw, peiw;
1442
1443 /*
1444 * While root level decoders support x3, x6, x12, switch level
1445 * decoders only support powers of 2 up to x16.
1446 */
1447 if (!is_power_of_2(cxl_rr->nr_targets)) {
1448 dev_dbg(&cxlr->dev, "%s:%s: invalid target count %d\n",
1449 dev_name(port->uport_dev), dev_name(&port->dev),
1450 cxl_rr->nr_targets);
1451 return -EINVAL;
1452 }
1453
1454 cxlsd = to_cxl_switch_decoder(&cxld->dev);
1455 if (cxl_rr->nr_targets_set) {
1456 int i, distance = 1;
1457 struct cxl_region_ref *cxl_rr_iter;
1458
1459 /*
1460 * The "distance" between peer downstream ports represents which
1461 * endpoint positions in the region interleave a given port can
1462 * host.
1463 *
1464 * For example, at the root of a hierarchy the distance is
1465 * always 1 as every index targets a different host-bridge. At
1466 * each subsequent switch level those ports map every Nth region
1467 * position where N is the width of the switch == distance.
1468 */
1469 do {
1470 cxl_rr_iter = cxl_rr_load(iter, cxlr);
1471 distance *= cxl_rr_iter->nr_targets;
1472 iter = to_cxl_port(iter->dev.parent);
1473 } while (!is_cxl_root(iter));
1474 distance *= cxlrd->cxlsd.cxld.interleave_ways;
1475
1476 for (i = 0; i < cxl_rr->nr_targets_set; i++)
1477 if (ep->dport == cxlsd->target[i]) {
1478 rc = check_last_peer(cxled, ep, cxl_rr,
1479 distance);
1480 if (rc)
1481 return rc;
1482 goto out_target_set;
1483 }
1484 goto add_target;
1485 }
1486
1487 if (is_cxl_root(parent_port)) {
1488 /*
1489 * Root decoder IG is always set to value in CFMWS which
1490 * may be different than this region's IG. We can use the
1491 * region's IG here since interleave_granularity_store()
1492 * does not allow interleaved host-bridges with
1493 * root IG != region IG.
1494 */
1495 parent_ig = p->interleave_granularity;
1496 parent_iw = cxlrd->cxlsd.cxld.interleave_ways;
1497 /*
1498 * For purposes of address bit routing, use power-of-2 math for
1499 * switch ports.
1500 */
1501 if (!is_power_of_2(parent_iw))
1502 parent_iw /= 3;
1503 } else {
1504 struct cxl_region_ref *parent_rr;
1505 struct cxl_decoder *parent_cxld;
1506
1507 parent_rr = cxl_rr_load(parent_port, cxlr);
1508 parent_cxld = parent_rr->decoder;
1509 parent_ig = parent_cxld->interleave_granularity;
1510 parent_iw = parent_cxld->interleave_ways;
1511 }
1512
1513 rc = granularity_to_eig(parent_ig, &peig);
1514 if (rc) {
1515 dev_dbg(&cxlr->dev, "%s:%s: invalid parent granularity: %d\n",
1516 dev_name(parent_port->uport_dev),
1517 dev_name(&parent_port->dev), parent_ig);
1518 return rc;
1519 }
1520
1521 rc = ways_to_eiw(parent_iw, &peiw);
1522 if (rc) {
1523 dev_dbg(&cxlr->dev, "%s:%s: invalid parent interleave: %d\n",
1524 dev_name(parent_port->uport_dev),
1525 dev_name(&parent_port->dev), parent_iw);
1526 return rc;
1527 }
1528
1529 iw = cxl_rr->nr_targets;
1530 rc = ways_to_eiw(iw, &eiw);
1531 if (rc) {
1532 dev_dbg(&cxlr->dev, "%s:%s: invalid port interleave: %d\n",
1533 dev_name(port->uport_dev), dev_name(&port->dev), iw);
1534 return rc;
1535 }
1536
1537 /*
1538 * Interleave granularity is a multiple of @parent_port granularity.
1539 * Multiplier is the parent port interleave ways.
1540 */
1541 rc = granularity_to_eig(parent_ig * parent_iw, &eig);
1542 if (rc) {
1543 dev_dbg(&cxlr->dev,
1544 "%s: invalid granularity calculation (%d * %d)\n",
1545 dev_name(&parent_port->dev), parent_ig, parent_iw);
1546 return rc;
1547 }
1548
1549 rc = eig_to_granularity(eig, &ig);
1550 if (rc) {
1551 dev_dbg(&cxlr->dev, "%s:%s: invalid interleave: %d\n",
1552 dev_name(port->uport_dev), dev_name(&port->dev),
1553 256 << eig);
1554 return rc;
1555 }
1556
1557 if (iw > 8 || iw > cxlsd->nr_targets) {
1558 dev_dbg(&cxlr->dev,
1559 "%s:%s:%s: ways: %d overflows targets: %d\n",
1560 dev_name(port->uport_dev), dev_name(&port->dev),
1561 dev_name(&cxld->dev), iw, cxlsd->nr_targets);
1562 return -ENXIO;
1563 }
1564
1565 if (test_bit(CXL_REGION_F_AUTO, &cxlr->flags)) {
1566 if (cxld->interleave_ways != iw ||
1567 (iw > 1 && cxld->interleave_granularity != ig) ||
1568 !spa_maps_hpa(p, &cxld->hpa_range) ||
1569 ((cxld->flags & CXL_DECODER_F_ENABLE) == 0)) {
1570 dev_err(&cxlr->dev,
1571 "%s:%s %s expected iw: %d ig: %d %pr\n",
1572 dev_name(port->uport_dev), dev_name(&port->dev),
1573 __func__, iw, ig, p->res);
1574 dev_err(&cxlr->dev,
1575 "%s:%s %s got iw: %d ig: %d state: %s %#llx:%#llx\n",
1576 dev_name(port->uport_dev), dev_name(&port->dev),
1577 __func__, cxld->interleave_ways,
1578 cxld->interleave_granularity,
1579 str_enabled_disabled(cxld->flags & CXL_DECODER_F_ENABLE),
1580 cxld->hpa_range.start, cxld->hpa_range.end);
1581 return -ENXIO;
1582 }
1583 } else {
1584 rc = check_interleave_cap(cxld, iw, ig);
1585 if (rc) {
1586 dev_dbg(&cxlr->dev,
1587 "%s:%s iw: %d ig: %d is not supported\n",
1588 dev_name(port->uport_dev),
1589 dev_name(&port->dev), iw, ig);
1590 return rc;
1591 }
1592
1593 cxld->interleave_ways = iw;
1594 cxld->interleave_granularity = ig;
1595 cxld->hpa_range = (struct range) {
1596 .start = p->res->start,
1597 .end = p->res->end,
1598 };
1599 }
1600 dev_dbg(&cxlr->dev, "%s:%s iw: %d ig: %d\n", dev_name(port->uport_dev),
1601 dev_name(&port->dev), iw, ig);
1602 add_target:
1603 if (cxl_rr->nr_targets_set == cxl_rr->nr_targets) {
1604 dev_dbg(&cxlr->dev,
1605 "%s:%s: targets full trying to add %s:%s at %d\n",
1606 dev_name(port->uport_dev), dev_name(&port->dev),
1607 dev_name(&cxlmd->dev), dev_name(&cxled->cxld.dev), pos);
1608 return -ENXIO;
1609 }
1610 if (test_bit(CXL_REGION_F_AUTO, &cxlr->flags)) {
1611 if (cxlsd->target[cxl_rr->nr_targets_set] != ep->dport) {
1612 dev_dbg(&cxlr->dev, "%s:%s: %s expected %s at %d\n",
1613 dev_name(port->uport_dev), dev_name(&port->dev),
1614 dev_name(&cxlsd->cxld.dev),
1615 dev_name(ep->dport->dport_dev),
1616 cxl_rr->nr_targets_set);
1617 return -ENXIO;
1618 }
1619 } else {
1620 cxlsd->target[cxl_rr->nr_targets_set] = ep->dport;
1621 cxlsd->cxld.target_map[cxl_rr->nr_targets_set] = ep->dport->port_id;
1622 }
1623 cxl_rr->nr_targets_set++;
1624 out_target_set:
1625 dev_dbg(&cxlr->dev, "%s:%s target[%d] = %s for %s:%s @ %d\n",
1626 dev_name(port->uport_dev), dev_name(&port->dev),
1627 cxl_rr->nr_targets_set - 1, dev_name(ep->dport->dport_dev),
1628 dev_name(&cxlmd->dev), dev_name(&cxled->cxld.dev), pos);
1629
1630 return 0;
1631 }
1632
1633 static void cxl_port_reset_targets(struct cxl_port *port,
1634 struct cxl_region *cxlr)
1635 {
1636 struct cxl_region_ref *cxl_rr = cxl_rr_load(port, cxlr);
1637 struct cxl_decoder *cxld;
1638
1639 /*
1640 * After the last endpoint has been detached the entire cxl_rr may now
1641 * be gone.
1642 */
1643 if (!cxl_rr)
1644 return;
1645 cxl_rr->nr_targets_set = 0;
1646
1647 cxld = cxl_rr->decoder;
1648 cxld->hpa_range = (struct range) {
1649 .start = 0,
1650 .end = -1,
1651 };
1652 }
1653
1654 static void cxl_region_teardown_targets(struct cxl_region *cxlr)
1655 {
1656 struct cxl_region_params *p = &cxlr->params;
1657 struct cxl_endpoint_decoder *cxled;
1658 struct cxl_dev_state *cxlds;
1659 struct cxl_memdev *cxlmd;
1660 struct cxl_port *iter;
1661 struct cxl_ep *ep;
1662 int i;
1663
1664 /*
1665 * In the auto-discovery case skip automatic teardown since the
1666 * address space is already active
1667 */
1668 if (test_bit(CXL_REGION_F_AUTO, &cxlr->flags))
1669 return;
1670
1671 for (i = 0; i < p->nr_targets; i++) {
1672 cxled = p->targets[i];
1673 cxlmd = cxled_to_memdev(cxled);
1674 cxlds = cxlmd->cxlds;
1675
1676 if (cxlds->rcd)
1677 continue;
1678
1679 iter = cxled_to_port(cxled);
1680 while (!is_cxl_root(to_cxl_port(iter->dev.parent)))
1681 iter = to_cxl_port(iter->dev.parent);
1682
1683 for (ep = cxl_ep_load(iter, cxlmd); iter;
1684 iter = ep->next, ep = cxl_ep_load(iter, cxlmd))
1685 cxl_port_reset_targets(iter, cxlr);
1686 }
1687 }
1688
1689 static int cxl_region_setup_targets(struct cxl_region *cxlr)
1690 {
1691 struct cxl_region_params *p = &cxlr->params;
1692 struct cxl_endpoint_decoder *cxled;
1693 struct cxl_dev_state *cxlds;
1694 int i, rc, rch = 0, vh = 0;
1695 struct cxl_memdev *cxlmd;
1696 struct cxl_port *iter;
1697 struct cxl_ep *ep;
1698
1699 for (i = 0; i < p->nr_targets; i++) {
1700 cxled = p->targets[i];
1701 cxlmd = cxled_to_memdev(cxled);
1702 cxlds = cxlmd->cxlds;
1703
1704 /* validate that all targets agree on topology */
1705 if (!cxlds->rcd) {
1706 vh++;
1707 } else {
1708 rch++;
1709 continue;
1710 }
1711
1712 iter = cxled_to_port(cxled);
1713 while (!is_cxl_root(to_cxl_port(iter->dev.parent)))
1714 iter = to_cxl_port(iter->dev.parent);
1715
1716 /*
1717 * Descend the topology tree programming / validating
1718 * targets while looking for conflicts.
1719 */
1720 for (ep = cxl_ep_load(iter, cxlmd); iter;
1721 iter = ep->next, ep = cxl_ep_load(iter, cxlmd)) {
1722 rc = cxl_port_setup_targets(iter, cxlr, cxled);
1723 if (rc) {
1724 cxl_region_teardown_targets(cxlr);
1725 return rc;
1726 }
1727 }
1728 }
1729
1730 if (rch && vh) {
1731 dev_err(&cxlr->dev, "mismatched CXL topologies detected\n");
1732 cxl_region_teardown_targets(cxlr);
1733 return -ENXIO;
1734 }
1735
1736 return 0;
1737 }
1738
1739 static int cxl_region_validate_position(struct cxl_region *cxlr,
1740 struct cxl_endpoint_decoder *cxled,
1741 int pos)
1742 {
1743 struct cxl_memdev *cxlmd = cxled_to_memdev(cxled);
1744 struct cxl_region_params *p = &cxlr->params;
1745 int i;
1746
1747 if (pos < 0 || pos >= p->interleave_ways) {
1748 dev_dbg(&cxlr->dev, "position %d out of range %d\n", pos,
1749 p->interleave_ways);
1750 return -ENXIO;
1751 }
1752
1753 if (p->targets[pos] == cxled)
1754 return 0;
1755
1756 if (p->targets[pos]) {
1757 struct cxl_endpoint_decoder *cxled_target = p->targets[pos];
1758 struct cxl_memdev *cxlmd_target = cxled_to_memdev(cxled_target);
1759
1760 dev_dbg(&cxlr->dev, "position %d already assigned to %s:%s\n",
1761 pos, dev_name(&cxlmd_target->dev),
1762 dev_name(&cxled_target->cxld.dev));
1763 return -EBUSY;
1764 }
1765
1766 for (i = 0; i < p->interleave_ways; i++) {
1767 struct cxl_endpoint_decoder *cxled_target;
1768 struct cxl_memdev *cxlmd_target;
1769
1770 cxled_target = p->targets[i];
1771 if (!cxled_target)
1772 continue;
1773
1774 cxlmd_target = cxled_to_memdev(cxled_target);
1775 if (cxlmd_target == cxlmd) {
1776 dev_dbg(&cxlr->dev,
1777 "%s already specified at position %d via: %s\n",
1778 dev_name(&cxlmd->dev), pos,
1779 dev_name(&cxled_target->cxld.dev));
1780 return -EBUSY;
1781 }
1782 }
1783
1784 return 0;
1785 }
1786
1787 static int cxl_region_attach_position(struct cxl_region *cxlr,
1788 struct cxl_endpoint_decoder *cxled,
1789 const struct cxl_dport *dport, int pos)
1790 {
1791 struct cxl_root_decoder *cxlrd = cxlr->cxlrd;
1792 struct cxl_memdev *cxlmd = cxled_to_memdev(cxled);
1793 struct cxl_switch_decoder *cxlsd = &cxlrd->cxlsd;
1794 struct cxl_decoder *cxld = &cxlsd->cxld;
1795 int iw = cxld->interleave_ways;
1796 struct cxl_port *iter;
1797 int rc;
1798
1799 if (dport != cxlrd->cxlsd.target[pos % iw]) {
1800 dev_dbg(&cxlr->dev, "%s:%s invalid target position for %s\n",
1801 dev_name(&cxlmd->dev), dev_name(&cxled->cxld.dev),
1802 dev_name(&cxlrd->cxlsd.cxld.dev));
1803 return -ENXIO;
1804 }
1805
1806 for (iter = cxled_to_port(cxled); !is_cxl_root(iter);
1807 iter = to_cxl_port(iter->dev.parent)) {
1808 rc = cxl_port_attach_region(iter, cxlr, cxled, pos);
1809 if (rc)
1810 goto err;
1811 }
1812
1813 return 0;
1814
1815 err:
1816 for (iter = cxled_to_port(cxled); !is_cxl_root(iter);
1817 iter = to_cxl_port(iter->dev.parent))
1818 cxl_port_detach_region(iter, cxlr, cxled);
1819 return rc;
1820 }
1821
1822 static int cxl_region_attach_auto(struct cxl_region *cxlr,
1823 struct cxl_endpoint_decoder *cxled, int pos)
1824 {
1825 struct cxl_region_params *p = &cxlr->params;
1826
1827 if (cxled->state != CXL_DECODER_STATE_AUTO) {
1828 dev_err(&cxlr->dev,
1829 "%s: unable to add decoder to autodetected region\n",
1830 dev_name(&cxled->cxld.dev));
1831 return -EINVAL;
1832 }
1833
1834 if (pos >= 0) {
1835 dev_dbg(&cxlr->dev, "%s: expected auto position, not %d\n",
1836 dev_name(&cxled->cxld.dev), pos);
1837 return -EINVAL;
1838 }
1839
1840 if (p->nr_targets >= p->interleave_ways) {
1841 dev_err(&cxlr->dev, "%s: no more target slots available\n",
1842 dev_name(&cxled->cxld.dev));
1843 return -ENXIO;
1844 }
1845
1846 /*
1847 * Temporarily record the endpoint decoder into the target array. Yes,
1848 * this means that userspace can view devices in the wrong position
1849 * before the region activates, and must be careful to understand when
1850 * it might be racing region autodiscovery.
1851 */
1852 pos = p->nr_targets;
1853 p->targets[pos] = cxled;
1854 cxled->pos = pos;
1855 cxled->state = CXL_DECODER_STATE_AUTO_STAGED;
1856 p->nr_targets++;
1857
1858 return 0;
1859 }
1860
1861 static int cmp_interleave_pos(const void *a, const void *b)
1862 {
1863 struct cxl_endpoint_decoder *cxled_a = *(typeof(cxled_a) *)a;
1864 struct cxl_endpoint_decoder *cxled_b = *(typeof(cxled_b) *)b;
1865
1866 return cxled_a->pos - cxled_b->pos;
1867 }
1868
1869 static int match_switch_decoder_by_range(struct device *dev,
1870 const void *data)
1871 {
1872 struct cxl_switch_decoder *cxlsd;
1873 const struct range *r1, *r2 = data;
1874
1875
1876 if (!is_switch_decoder(dev))
1877 return 0;
1878
1879 cxlsd = to_cxl_switch_decoder(dev);
1880 r1 = &cxlsd->cxld.hpa_range;
1881
1882 if (is_root_decoder(dev))
1883 return range_contains(r1, r2);
1884 return (r1->start == r2->start && r1->end == r2->end);
1885 }
1886
1887 static int find_pos_and_ways(struct cxl_port *port, struct range *range,
1888 int *pos, int *ways)
1889 {
1890 struct cxl_switch_decoder *cxlsd;
1891 struct cxl_port *parent;
1892 struct device *dev;
1893 int rc = -ENXIO;
1894
1895 parent = parent_port_of(port);
1896 if (!parent)
1897 return rc;
1898
1899 dev = device_find_child(&parent->dev, range,
1900 match_switch_decoder_by_range);
1901 if (!dev) {
1902 dev_err(port->uport_dev,
1903 "failed to find decoder mapping %#llx-%#llx\n",
1904 range->start, range->end);
1905 return rc;
1906 }
1907 cxlsd = to_cxl_switch_decoder(dev);
1908 *ways = cxlsd->cxld.interleave_ways;
1909
1910 for (int i = 0; i < *ways; i++) {
1911 if (cxlsd->target[i] == port->parent_dport) {
1912 *pos = i;
1913 rc = 0;
1914 break;
1915 }
1916 }
1917 put_device(dev);
1918
1919 if (rc)
1920 dev_err(port->uport_dev,
1921 "failed to find %s:%s in target list of %s\n",
1922 dev_name(&port->dev),
1923 dev_name(port->parent_dport->dport_dev),
1924 dev_name(&cxlsd->cxld.dev));
1925
1926 return rc;
1927 }
1928
1929 /**
1930 * cxl_calc_interleave_pos() - calculate an endpoint position in a region
1931 * @cxled: endpoint decoder member of given region
1932 * @hpa_range: translated HPA range of the endpoint
1933 *
1934 * The endpoint position is calculated by traversing the topology from
1935 * the endpoint to the root decoder and iteratively applying this
1936 * calculation:
1937 *
1938 * position = position * parent_ways + parent_pos;
1939 *
1940 * ...where @position is inferred from switch and root decoder target lists.
1941 *
1942 * Return: position >= 0 on success
1943 * -ENXIO on failure
1944 */
1945 static int cxl_calc_interleave_pos(struct cxl_endpoint_decoder *cxled,
1946 struct range *hpa_range)
1947 {
1948 struct cxl_port *iter, *port = cxled_to_port(cxled);
1949 struct cxl_memdev *cxlmd = cxled_to_memdev(cxled);
1950 int parent_ways = 0, parent_pos = 0, pos = 0;
1951 int rc;
1952
1953 /*
1954 * Example: the expected interleave order of the 4-way region shown
1955 * below is: mem0, mem2, mem1, mem3
1956 *
1957 * root_port
1958 * / \
1959 * host_bridge_0 host_bridge_1
1960 * | | | |
1961 * mem0 mem1 mem2 mem3
1962 *
1963 * In the example the calculator will iterate twice. The first iteration
1964 * uses the mem position in the host-bridge and the ways of the host-
1965 * bridge to generate the first, or local, position. The second
1966 * iteration uses the host-bridge position in the root_port and the ways
1967 * of the root_port to refine the position.
1968 *
1969 * A trace of the calculation per endpoint looks like this:
1970 * mem0: pos = 0 * 2 + 0 mem2: pos = 0 * 2 + 0
1971 * pos = 0 * 2 + 0 pos = 0 * 2 + 1
1972 * pos: 0 pos: 1
1973 *
1974 * mem1: pos = 0 * 2 + 1 mem3: pos = 0 * 2 + 1
1975 * pos = 1 * 2 + 0 pos = 1 * 2 + 1
1976 *       pos: 2                   pos: 3
1977 *
1978 * Note that while this example is simple, the method applies to more
1979 * complex topologies, including those with switches.
1980 */
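	/*
	 * As a further illustration (values assumed, not taken from the
	 * diagram above): with a switch level added, an 8-way region may
	 * interleave 2 host bridges, 2 switches per host bridge, and 2
	 * endpoints per switch. An endpoint at position 1 under its
	 * switch, whose switch is at position 0 under its host bridge,
	 * and whose host bridge is at position 1 under the root, resolves
	 * in three iterations:
	 *
	 *	pos = 0 * 2 + 1 = 1	(endpoint within switch)
	 *	pos = 1 * 2 + 0 = 2	(switch within host bridge)
	 *	pos = 2 * 2 + 1 = 5	(host bridge within root)
	 */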
1981
1982 /* Iterate from endpoint to root_port refining the position */
1983 for (iter = port; iter; iter = parent_port_of(iter)) {
1984 if (is_cxl_root(iter))
1985 break;
1986
1987 rc = find_pos_and_ways(iter, hpa_range, &parent_pos,
1988 &parent_ways);
1989 if (rc)
1990 return rc;
1991
1992 pos = pos * parent_ways + parent_pos;
1993 }
1994
1995 dev_dbg(&cxlmd->dev,
1996 "decoder:%s parent:%s port:%s range:%#llx-%#llx pos:%d\n",
1997 dev_name(&cxled->cxld.dev), dev_name(cxlmd->dev.parent),
1998 dev_name(&port->dev), hpa_range->start, hpa_range->end, pos);
1999
2000 return pos;
2001 }
2002
2003 static int cxl_region_sort_targets(struct cxl_region *cxlr)
2004 {
2005 struct cxl_region_params *p = &cxlr->params;
2006 int i, rc = 0;
2007
2008 for (i = 0; i < p->nr_targets; i++) {
2009 struct cxl_endpoint_decoder *cxled = p->targets[i];
2010
2011 cxled->pos = cxl_calc_interleave_pos(cxled, &cxlr->hpa_range);
2012 /*
2013 * Record that sorting failed, but still continue to calc
2014 * cxled->pos so that follow-on code paths can reliably
2015 * do p->targets[cxled->pos] to self-reference their entry.
2016 */
2017 if (cxled->pos < 0)
2018 rc = -ENXIO;
2019 }
2020 /* Keep the cxlr target list in interleave position order */
2021 sort(p->targets, p->nr_targets, sizeof(p->targets[0]),
2022 cmp_interleave_pos, NULL);
2023
2024 dev_dbg(&cxlr->dev, "region sort %s\n", rc ? "failed" : "successful");
2025 return rc;
2026 }
2027
2028 static int cxl_region_attach(struct cxl_region *cxlr,
2029 struct cxl_endpoint_decoder *cxled, int pos)
2030 {
2031 struct cxl_root_decoder *cxlrd = cxlr->cxlrd;
2032 struct cxl_memdev *cxlmd = cxled_to_memdev(cxled);
2033 struct cxl_dev_state *cxlds = cxlmd->cxlds;
2034 struct cxl_region_params *p = &cxlr->params;
2035 struct cxl_port *ep_port, *root_port;
2036 struct cxl_dport *dport;
2037 int rc = -ENXIO;
2038
2039 rc = check_interleave_cap(&cxled->cxld, p->interleave_ways,
2040 p->interleave_granularity);
2041 if (rc) {
2042 dev_dbg(&cxlr->dev, "%s iw: %d ig: %d is not supported\n",
2043 dev_name(&cxled->cxld.dev), p->interleave_ways,
2044 p->interleave_granularity);
2045 return rc;
2046 }
2047
2048 if (cxled->part < 0) {
2049 dev_dbg(&cxlr->dev, "%s dead\n", dev_name(&cxled->cxld.dev));
2050 return -ENODEV;
2051 }
2052
2053 if (cxlds->part[cxled->part].mode != cxlr->mode) {
2054 dev_dbg(&cxlr->dev, "%s region mode: %d mismatch\n",
2055 dev_name(&cxled->cxld.dev), cxlr->mode);
2056 return -EINVAL;
2057 }
2058
2059 /* all full of members, or interleave config not established? */
2060 if (p->state > CXL_CONFIG_INTERLEAVE_ACTIVE) {
2061 dev_dbg(&cxlr->dev, "region already active\n");
2062 return -EBUSY;
2063 }
2064
2065 if (p->state < CXL_CONFIG_INTERLEAVE_ACTIVE) {
2066 dev_dbg(&cxlr->dev, "interleave config missing\n");
2067 return -ENXIO;
2068 }
2069
2070 if (p->nr_targets >= p->interleave_ways) {
2071 dev_dbg(&cxlr->dev, "region already has %d endpoints\n",
2072 p->nr_targets);
2073 return -EINVAL;
2074 }
2075
2076 ep_port = cxled_to_port(cxled);
2077 root_port = cxlrd_to_port(cxlrd);
2078 dport = cxl_find_dport_by_dev(root_port, ep_port->host_bridge);
2079 if (!dport) {
2080 dev_dbg(&cxlr->dev, "%s:%s invalid target for %s\n",
2081 dev_name(&cxlmd->dev), dev_name(&cxled->cxld.dev),
2082 dev_name(cxlr->dev.parent));
2083 return -ENXIO;
2084 }
2085
2086 if (cxled->cxld.target_type != cxlr->type) {
2087 dev_dbg(&cxlr->dev, "%s:%s type mismatch: %d vs %d\n",
2088 dev_name(&cxlmd->dev), dev_name(&cxled->cxld.dev),
2089 cxled->cxld.target_type, cxlr->type);
2090 return -ENXIO;
2091 }
2092
2093 if (!cxled->dpa_res) {
2094 dev_dbg(&cxlr->dev, "%s:%s: missing DPA allocation.\n",
2095 dev_name(&cxlmd->dev), dev_name(&cxled->cxld.dev));
2096 return -ENXIO;
2097 }
2098
2099 if (resource_size(cxled->dpa_res) * p->interleave_ways + p->cache_size !=
2100 resource_size(p->res)) {
2101 dev_dbg(&cxlr->dev,
2102 "%s:%s-size-%#llx * ways-%d + cache-%#llx != region-size-%#llx\n",
2103 dev_name(&cxlmd->dev), dev_name(&cxled->cxld.dev),
2104 (u64)resource_size(cxled->dpa_res), p->interleave_ways,
2105 (u64)p->cache_size, (u64)resource_size(p->res));
2106 return -EINVAL;
2107 }
2108
2109 cxl_region_perf_data_calculate(cxlr, cxled);
2110
2111 if (test_bit(CXL_REGION_F_AUTO, &cxlr->flags)) {
2112 int i;
2113
2114 rc = cxl_region_attach_auto(cxlr, cxled, pos);
2115 if (rc)
2116 return rc;
2117
2118 /* await more targets to arrive... */
2119 if (p->nr_targets < p->interleave_ways)
2120 return 0;
2121
2122 /*
2123 * All targets are here, which implies all PCI enumeration that
2124 * affects this region has been completed. Walk the topology to
2125 * sort the devices into their relative region decode position.
2126 */
2127 rc = cxl_region_sort_targets(cxlr);
2128 if (rc)
2129 return rc;
2130
2131 for (i = 0; i < p->nr_targets; i++) {
2132 cxled = p->targets[i];
2133 ep_port = cxled_to_port(cxled);
2134 dport = cxl_find_dport_by_dev(root_port,
2135 ep_port->host_bridge);
2136 rc = cxl_region_attach_position(cxlr, cxled, dport, i);
2137 if (rc)
2138 return rc;
2139 }
2140
2141 rc = cxl_region_setup_targets(cxlr);
2142 if (rc)
2143 return rc;
2144
2145 /*
2146 * If target setup succeeds in the autodiscovery case
2147 * then the region is already committed.
2148 */
2149 p->state = CXL_CONFIG_COMMIT;
2150 cxl_region_shared_upstream_bandwidth_update(cxlr);
2151
2152 return 0;
2153 }
2154
2155 rc = cxl_region_validate_position(cxlr, cxled, pos);
2156 if (rc)
2157 return rc;
2158
2159 rc = cxl_region_attach_position(cxlr, cxled, dport, pos);
2160 if (rc)
2161 return rc;
2162
2163 p->targets[pos] = cxled;
2164 cxled->pos = pos;
2165 p->nr_targets++;
2166
2167 if (p->nr_targets == p->interleave_ways) {
2168 rc = cxl_region_setup_targets(cxlr);
2169 if (rc)
2170 return rc;
2171 p->state = CXL_CONFIG_ACTIVE;
2172 cxl_region_shared_upstream_bandwidth_update(cxlr);
2173 }
2174
2175 cxled->cxld.interleave_ways = p->interleave_ways;
2176 cxled->cxld.interleave_granularity = p->interleave_granularity;
2177 cxled->cxld.hpa_range = (struct range) {
2178 .start = p->res->start,
2179 .end = p->res->end,
2180 };
2181
2182 if (p->nr_targets != p->interleave_ways)
2183 return 0;
2184
2185 /*
2186 * Test the auto-discovery position calculator function
2187 * against this successfully created user-defined region.
2188 * A fail message here means that this interleave config
2189 * will fail when presented as CXL_REGION_F_AUTO.
2190 */
2191 for (int i = 0; i < p->nr_targets; i++) {
2192 struct cxl_endpoint_decoder *cxled = p->targets[i];
2193 int test_pos;
2194
2195 test_pos = cxl_calc_interleave_pos(cxled, &cxlr->hpa_range);
2196 dev_dbg(&cxled->cxld.dev,
2197 "Test cxl_calc_interleave_pos(): %s test_pos:%d cxled->pos:%d\n",
2198 (test_pos == cxled->pos) ? "success" : "fail",
2199 test_pos, cxled->pos);
2200 }
2201
2202 return 0;
2203 }
2204
2205 static int cxl_region_by_target(struct device *dev, const void *data)
2206 {
2207 const struct cxl_endpoint_decoder *cxled = data;
2208 struct cxl_region_params *p;
2209 struct cxl_region *cxlr;
2210
2211 if (!is_cxl_region(dev))
2212 return 0;
2213
2214 cxlr = to_cxl_region(dev);
2215 p = &cxlr->params;
2216 return p->targets[cxled->pos] == cxled;
2217 }
2218
2219 /*
2220 * When an auto-region fails to assemble, the decoder may be listed as a
2221 * target, but not fully attached.
2222 */
2223 static void cxl_cancel_auto_attach(struct cxl_endpoint_decoder *cxled)
2224 {
2225 struct cxl_region_params *p;
2226 struct cxl_region *cxlr;
2227 int pos = cxled->pos;
2228
2229 if (cxled->state != CXL_DECODER_STATE_AUTO_STAGED)
2230 return;
2231
2232 struct device *dev __free(put_device) =
2233 bus_find_device(&cxl_bus_type, NULL, cxled, cxl_region_by_target);
2234 if (!dev)
2235 return;
2236
2237 cxlr = to_cxl_region(dev);
2238 p = &cxlr->params;
2239
2240 p->nr_targets--;
2241 cxled->state = CXL_DECODER_STATE_AUTO;
2242 cxled->pos = -1;
2243 p->targets[pos] = NULL;
2244 }
2245
2246 static struct cxl_region *
2247 __cxl_decoder_detach(struct cxl_region *cxlr,
2248 struct cxl_endpoint_decoder *cxled, int pos,
2249 enum cxl_detach_mode mode)
2250 {
2251 struct cxl_region_params *p;
2252
2253 lockdep_assert_held_write(&cxl_rwsem.region);
2254
2255 if (!cxled) {
2256 p = &cxlr->params;
2257
2258 if (pos >= p->interleave_ways) {
2259 dev_dbg(&cxlr->dev, "position %d out of range %d\n",
2260 pos, p->interleave_ways);
2261 return NULL;
2262 }
2263
2264 if (!p->targets[pos])
2265 return NULL;
2266 cxled = p->targets[pos];
2267 } else {
2268 cxlr = cxled->cxld.region;
2269 if (!cxlr) {
2270 cxl_cancel_auto_attach(cxled);
2271 return NULL;
2272 }
2273 p = &cxlr->params;
2274 }
2275
2276 if (mode == DETACH_INVALIDATE)
2277 cxled->part = -1;
2278
2279 if (p->state > CXL_CONFIG_ACTIVE) {
2280 cxl_region_decode_reset(cxlr, p->interleave_ways);
2281 p->state = CXL_CONFIG_ACTIVE;
2282 }
2283
2284 for (struct cxl_port *iter = cxled_to_port(cxled); !is_cxl_root(iter);
2285 iter = to_cxl_port(iter->dev.parent))
2286 cxl_port_detach_region(iter, cxlr, cxled);
2287
2288 if (cxled->pos < 0 || cxled->pos >= p->interleave_ways ||
2289 p->targets[cxled->pos] != cxled) {
2290 struct cxl_memdev *cxlmd = cxled_to_memdev(cxled);
2291
2292 dev_WARN_ONCE(&cxlr->dev, 1, "expected %s:%s at position %d\n",
2293 dev_name(&cxlmd->dev), dev_name(&cxled->cxld.dev),
2294 cxled->pos);
2295 return NULL;
2296 }
2297
2298 if (p->state == CXL_CONFIG_ACTIVE) {
2299 p->state = CXL_CONFIG_INTERLEAVE_ACTIVE;
2300 cxl_region_teardown_targets(cxlr);
2301 }
2302 p->targets[cxled->pos] = NULL;
2303 p->nr_targets--;
2304 cxled->cxld.hpa_range = (struct range) {
2305 .start = 0,
2306 .end = -1,
2307 };
2308
2309 get_device(&cxlr->dev);
2310 return cxlr;
2311 }
2312
2313 /*
2314 * Cleanup a decoder's interest in a region. There are 2 cases to
2315 * handle: removing an unknown @cxled from a known position in a region
2316 * (detach_target()) or removing a known @cxled from an unknown @cxlr
2317 * (cxld_unregister()).
2318 *
2319 * When the detachment finds a region, release the region driver.
2320 */
2321 int cxl_decoder_detach(struct cxl_region *cxlr,
2322 struct cxl_endpoint_decoder *cxled, int pos,
2323 enum cxl_detach_mode mode)
2324 {
2325 struct cxl_region *detach;
2326
2327 /* when the decoder is being destroyed, lock unconditionally */
2328 if (mode == DETACH_INVALIDATE) {
2329 guard(rwsem_write)(&cxl_rwsem.region);
2330 detach = __cxl_decoder_detach(cxlr, cxled, pos, mode);
2331 } else {
2332 int rc;
2333
2334 ACQUIRE(rwsem_write_kill, rwsem)(&cxl_rwsem.region);
2335 if ((rc = ACQUIRE_ERR(rwsem_write_kill, &rwsem)))
2336 return rc;
2337 detach = __cxl_decoder_detach(cxlr, cxled, pos, mode);
2338 }
2339
2340 if (detach) {
2341 device_release_driver(&detach->dev);
2342 put_device(&detach->dev);
2343 }
2344 return 0;
2345 }
2346
2347 static int __attach_target(struct cxl_region *cxlr,
2348 struct cxl_endpoint_decoder *cxled, int pos,
2349 unsigned int state)
2350 {
2351 int rc;
2352
2353 if (state == TASK_INTERRUPTIBLE) {
2354 ACQUIRE(rwsem_write_kill, rwsem)(&cxl_rwsem.region);
2355 if ((rc = ACQUIRE_ERR(rwsem_write_kill, &rwsem)))
2356 return rc;
2357 guard(rwsem_read)(&cxl_rwsem.dpa);
2358 return cxl_region_attach(cxlr, cxled, pos);
2359 }
2360 guard(rwsem_write)(&cxl_rwsem.region);
2361 guard(rwsem_read)(&cxl_rwsem.dpa);
2362 return cxl_region_attach(cxlr, cxled, pos);
2363 }
2364
2365 static int attach_target(struct cxl_region *cxlr,
2366 struct cxl_endpoint_decoder *cxled, int pos,
2367 unsigned int state)
2368 {
2369 int rc = __attach_target(cxlr, cxled, pos, state);
2370
2371 if (rc == 0)
2372 return 0;
2373
2374 dev_warn(cxled->cxld.dev.parent, "failed to attach %s to %s: %d\n",
2375 dev_name(&cxled->cxld.dev), dev_name(&cxlr->dev), rc);
2376 return rc;
2377 }
2378
2379 static int detach_target(struct cxl_region *cxlr, int pos)
2380 {
2381 return cxl_decoder_detach(cxlr, NULL, pos, DETACH_ONLY);
2382 }
2383
2384 static ssize_t store_targetN(struct cxl_region *cxlr, const char *buf, int pos,
2385 size_t len)
2386 {
2387 int rc;
2388
2389 if (sysfs_streq(buf, "\n"))
2390 rc = detach_target(cxlr, pos);
2391 else {
2392 struct device *dev;
2393
2394 dev = bus_find_device_by_name(&cxl_bus_type, NULL, buf);
2395 if (!dev)
2396 return -ENODEV;
2397
2398 if (!is_endpoint_decoder(dev)) {
2399 rc = -EINVAL;
2400 goto out;
2401 }
2402
2403 rc = attach_target(cxlr, to_cxl_endpoint_decoder(dev), pos,
2404 TASK_INTERRUPTIBLE);
2405 out:
2406 put_device(dev);
2407 }
2408
2409 if (rc < 0)
2410 return rc;
2411 return len;
2412 }
2413
2414 #define TARGET_ATTR_RW(n) \
2415 static ssize_t target##n##_show( \
2416 struct device *dev, struct device_attribute *attr, char *buf) \
2417 { \
2418 return show_targetN(to_cxl_region(dev), buf, (n)); \
2419 } \
2420 static ssize_t target##n##_store(struct device *dev, \
2421 struct device_attribute *attr, \
2422 const char *buf, size_t len) \
2423 { \
2424 return store_targetN(to_cxl_region(dev), buf, (n), len); \
2425 } \
2426 static DEVICE_ATTR_RW(target##n)
2427
2428 TARGET_ATTR_RW(0);
2429 TARGET_ATTR_RW(1);
2430 TARGET_ATTR_RW(2);
2431 TARGET_ATTR_RW(3);
2432 TARGET_ATTR_RW(4);
2433 TARGET_ATTR_RW(5);
2434 TARGET_ATTR_RW(6);
2435 TARGET_ATTR_RW(7);
2436 TARGET_ATTR_RW(8);
2437 TARGET_ATTR_RW(9);
2438 TARGET_ATTR_RW(10);
2439 TARGET_ATTR_RW(11);
2440 TARGET_ATTR_RW(12);
2441 TARGET_ATTR_RW(13);
2442 TARGET_ATTR_RW(14);
2443 TARGET_ATTR_RW(15);
2444
2445 static struct attribute *target_attrs[] = {
2446 &dev_attr_target0.attr,
2447 &dev_attr_target1.attr,
2448 &dev_attr_target2.attr,
2449 &dev_attr_target3.attr,
2450 &dev_attr_target4.attr,
2451 &dev_attr_target5.attr,
2452 &dev_attr_target6.attr,
2453 &dev_attr_target7.attr,
2454 &dev_attr_target8.attr,
2455 &dev_attr_target9.attr,
2456 &dev_attr_target10.attr,
2457 &dev_attr_target11.attr,
2458 &dev_attr_target12.attr,
2459 &dev_attr_target13.attr,
2460 &dev_attr_target14.attr,
2461 &dev_attr_target15.attr,
2462 NULL,
2463 };
2464
2465 static umode_t cxl_region_target_visible(struct kobject *kobj,
2466 struct attribute *a, int n)
2467 {
2468 struct device *dev = kobj_to_dev(kobj);
2469 struct cxl_region *cxlr = to_cxl_region(dev);
2470 struct cxl_region_params *p = &cxlr->params;
2471
2472 if (n < p->interleave_ways)
2473 return a->mode;
2474 return 0;
2475 }
2476
2477 static const struct attribute_group cxl_region_target_group = {
2478 .attrs = target_attrs,
2479 .is_visible = cxl_region_target_visible,
2480 };
2481
2482 static const struct attribute_group *get_cxl_region_target_group(void)
2483 {
2484 return &cxl_region_target_group;
2485 }
2486
2487 static const struct attribute_group *region_groups[] = {
2488 &cxl_base_attribute_group,
2489 &cxl_region_group,
2490 &cxl_region_target_group,
2491 &cxl_region_access0_coordinate_group,
2492 &cxl_region_access1_coordinate_group,
2493 NULL,
2494 };
2495
2496 static void cxl_region_release(struct device *dev)
2497 {
2498 struct cxl_region *cxlr = to_cxl_region(dev);
2499 struct cxl_root_decoder *cxlrd = cxlr->cxlrd;
2500 int id = atomic_read(&cxlrd->region_id);
2501
2502 /*
2503 * Try to reuse the recently idled id rather than the cached
2504 * next id to prevent the region id space from increasing
2505 * unnecessarily.
2506 */
2507 if (cxlr->id < id)
2508 if (atomic_try_cmpxchg(&cxlrd->region_id, &id, cxlr->id)) {
2509 memregion_free(id);
2510 goto out;
2511 }
2512
2513 memregion_free(cxlr->id);
2514 out:
2515 put_device(dev->parent);
2516 kfree(cxlr);
2517 }
2518
2519 const struct device_type cxl_region_type = {
2520 .name = "cxl_region",
2521 .release = cxl_region_release,
2522 .groups = region_groups
2523 };
2524
2525 bool is_cxl_region(struct device *dev)
2526 {
2527 return dev->type == &cxl_region_type;
2528 }
2529 EXPORT_SYMBOL_NS_GPL(is_cxl_region, "CXL");
2530
2531 static struct cxl_region *to_cxl_region(struct device *dev)
2532 {
2533 if (dev_WARN_ONCE(dev, dev->type != &cxl_region_type,
2534 "not a cxl_region device\n"))
2535 return NULL;
2536
2537 return container_of(dev, struct cxl_region, dev);
2538 }
2539
2540 static void unregister_region(void *_cxlr)
2541 {
2542 struct cxl_region *cxlr = _cxlr;
2543 struct cxl_region_params *p = &cxlr->params;
2544 int i;
2545
2546 device_del(&cxlr->dev);
2547
2548 /*
2549 * Now that region sysfs is shut down, the parameter block is
2550 * read-only, so there is no need to hold the region rwsem to access the
2551 * region parameters.
2552 */
2553 for (i = 0; i < p->interleave_ways; i++)
2554 detach_target(cxlr, i);
2555
2556 cxlr->hpa_range = DEFINE_RANGE(0, -1);
2557
2558 cxl_region_iomem_release(cxlr);
2559 put_device(&cxlr->dev);
2560 }
2561
2562 static struct lock_class_key cxl_region_key;
2563
2564 static struct cxl_region *cxl_region_alloc(struct cxl_root_decoder *cxlrd, int id)
2565 {
2566 struct cxl_region *cxlr;
2567 struct device *dev;
2568
2569 cxlr = kzalloc_obj(*cxlr);
2570 if (!cxlr) {
2571 memregion_free(id);
2572 return ERR_PTR(-ENOMEM);
2573 }
2574
2575 dev = &cxlr->dev;
2576 device_initialize(dev);
2577 lockdep_set_class(&dev->mutex, &cxl_region_key);
2578 dev->parent = &cxlrd->cxlsd.cxld.dev;
2579 /*
2580 * Keep root decoder pinned through cxl_region_release to fixup
2581 * region id allocations
2582 */
2583 get_device(dev->parent);
2584 cxlr->cxlrd = cxlrd;
2585 cxlr->id = id;
2586
2587 device_set_pm_not_required(dev);
2588 dev->bus = &cxl_bus_type;
2589 dev->type = &cxl_region_type;
2590 cxl_region_setup_flags(cxlr, &cxlrd->cxlsd.cxld);
2591
2592 return cxlr;
2593 }
2594
2595 static bool cxl_region_update_coordinates(struct cxl_region *cxlr, int nid)
2596 {
2597 int cset = 0;
2598 int rc;
2599
2600 for (int i = 0; i < ACCESS_COORDINATE_MAX; i++) {
2601 if (cxlr->coord[i].read_bandwidth) {
2602 node_update_perf_attrs(nid, &cxlr->coord[i], i);
2603 cset++;
2604 }
2605 }
2606
2607 if (!cset)
2608 return false;
2609
2610 rc = sysfs_update_group(&cxlr->dev.kobj, get_cxl_region_access0_group());
2611 if (rc)
2612 dev_dbg(&cxlr->dev, "Failed to update access0 group\n");
2613
2614 rc = sysfs_update_group(&cxlr->dev.kobj, get_cxl_region_access1_group());
2615 if (rc)
2616 dev_dbg(&cxlr->dev, "Failed to update access1 group\n");
2617
2618 return true;
2619 }
2620
2621 static int cxl_region_perf_attrs_callback(struct notifier_block *nb,
2622 unsigned long action, void *arg)
2623 {
2624 struct cxl_region *cxlr = container_of(nb, struct cxl_region,
2625 node_notifier);
2626 struct node_notify *nn = arg;
2627 int nid = nn->nid;
2628 int region_nid;
2629
2630 if (action != NODE_ADDED_FIRST_MEMORY)
2631 return NOTIFY_DONE;
2632
2633 /*
2634 * No need to hold cxl_rwsem.region; region parameters are stable
2635 * within the cxl_region driver.
2636 */
2637 region_nid = phys_to_target_node(cxlr->params.res->start);
2638 if (nid != region_nid)
2639 return NOTIFY_DONE;
2640
2641 /* No action needed if node bit already set */
2642 if (node_test_and_set(nid, nodemask_region_seen))
2643 return NOTIFY_DONE;
2644
2645 if (!cxl_region_update_coordinates(cxlr, nid))
2646 return NOTIFY_DONE;
2647
2648 return NOTIFY_OK;
2649 }
2650
2651 static int cxl_region_calculate_adistance(struct notifier_block *nb,
2652 unsigned long nid, void *data)
2653 {
2654 struct cxl_region *cxlr = container_of(nb, struct cxl_region,
2655 adist_notifier);
2656 struct access_coordinate *perf;
2657 int *adist = data;
2658 int region_nid;
2659
2660 /*
2661 * No need to hold cxl_rwsem.region; region parameters are stable
2662 * within the cxl_region driver.
2663 */
2664 region_nid = phys_to_target_node(cxlr->params.res->start);
2665 if (nid != region_nid)
2666 return NOTIFY_OK;
2667
2668 perf = &cxlr->coord[ACCESS_COORDINATE_CPU];
2669
2670 if (mt_perf_to_adistance(perf, adist))
2671 return NOTIFY_OK;
2672
2673 return NOTIFY_STOP;
2674 }
2675
2676 /**
2677 * devm_cxl_add_region - Adds a region to a decoder
2678 * @cxlrd: root decoder
2679 * @id: memregion id to create, or memregion_free() on failure
2680 * @mode: mode for the endpoint decoders of this region
2681 * @type: select whether this is an expander or accelerator (type-2 or type-3)
2682 *
2683 * This is the second step of region initialization. Regions exist within an
2684 * address space which is mapped by a @cxlrd.
2685 *
2686 * Return: a new &struct cxl_region on success, else an ERR_PTR() encoded error.
2687 * The region will be named "regionZ" where Z is the unique region number.
2688 */
2689 static struct cxl_region *devm_cxl_add_region(struct cxl_root_decoder *cxlrd,
2690 int id,
2691 enum cxl_partition_mode mode,
2692 enum cxl_decoder_type type)
2693 {
2694 struct cxl_port *port = to_cxl_port(cxlrd->cxlsd.cxld.dev.parent);
2695 struct cxl_region *cxlr;
2696 struct device *dev;
2697 int rc;
2698
2699 cxlr = cxl_region_alloc(cxlrd, id);
2700 if (IS_ERR(cxlr))
2701 return cxlr;
2702 cxlr->mode = mode;
2703 cxlr->type = type;
2704
2705 dev = &cxlr->dev;
2706 rc = dev_set_name(dev, "region%d", id);
2707 if (rc)
2708 goto err;
2709
2710 rc = device_add(dev);
2711 if (rc)
2712 goto err;
2713
2714 rc = devm_add_action_or_reset(port->uport_dev, unregister_region, cxlr);
2715 if (rc)
2716 return ERR_PTR(rc);
2717
2718 dev_dbg(port->uport_dev, "%s: created %s\n",
2719 dev_name(&cxlrd->cxlsd.cxld.dev), dev_name(dev));
2720 return cxlr;
2721
2722 err:
2723 put_device(dev);
2724 return ERR_PTR(rc);
2725 }
2726
2727 static ssize_t __create_region_show(struct cxl_root_decoder *cxlrd, char *buf)
2728 {
2729 return sysfs_emit(buf, "region%u\n", atomic_read(&cxlrd->region_id));
2730 }
2731
2732 static ssize_t create_pmem_region_show(struct device *dev,
2733 struct device_attribute *attr, char *buf)
2734 {
2735 return __create_region_show(to_cxl_root_decoder(dev), buf);
2736 }
2737
2738 static ssize_t create_ram_region_show(struct device *dev,
2739 struct device_attribute *attr, char *buf)
2740 {
2741 return __create_region_show(to_cxl_root_decoder(dev), buf);
2742 }
2743
2744 static struct cxl_region *__create_region(struct cxl_root_decoder *cxlrd,
2745 enum cxl_partition_mode mode, int id,
2746 enum cxl_decoder_type target_type)
2747 {
2748 int rc;
2749
2750 switch (mode) {
2751 case CXL_PARTMODE_RAM:
2752 case CXL_PARTMODE_PMEM:
2753 break;
2754 default:
2755 dev_err(&cxlrd->cxlsd.cxld.dev, "unsupported mode %d\n", mode);
2756 return ERR_PTR(-EINVAL);
2757 }
2758
2759 rc = memregion_alloc(GFP_KERNEL);
2760 if (rc < 0)
2761 return ERR_PTR(rc);
2762
2763 if (atomic_cmpxchg(&cxlrd->region_id, id, rc) != id) {
2764 memregion_free(rc);
2765 return ERR_PTR(-EBUSY);
2766 }
2767
2768 return devm_cxl_add_region(cxlrd, id, mode, target_type);
2769 }
2770
2771 static ssize_t create_region_store(struct device *dev, const char *buf,
2772 size_t len, enum cxl_partition_mode mode)
2773 {
2774 struct cxl_root_decoder *cxlrd = to_cxl_root_decoder(dev);
2775 struct cxl_region *cxlr;
2776 int rc, id;
2777
2778 rc = sscanf(buf, "region%d\n", &id);
2779 if (rc != 1)
2780 return -EINVAL;
2781
2782 cxlr = __create_region(cxlrd, mode, id, CXL_DECODER_HOSTONLYMEM);
2783 if (IS_ERR(cxlr))
2784 return PTR_ERR(cxlr);
2785
2786 return len;
2787 }
2788
2789 static ssize_t create_pmem_region_store(struct device *dev,
2790 struct device_attribute *attr,
2791 const char *buf, size_t len)
2792 {
2793 return create_region_store(dev, buf, len, CXL_PARTMODE_PMEM);
2794 }
2795 DEVICE_ATTR_RW(create_pmem_region);
2796
2797 static ssize_t create_ram_region_store(struct device *dev,
2798 struct device_attribute *attr,
2799 const char *buf, size_t len)
2800 {
2801 return create_region_store(dev, buf, len, CXL_PARTMODE_RAM);
2802 }
2803 DEVICE_ATTR_RW(create_ram_region);
2804
2805 static ssize_t region_show(struct device *dev, struct device_attribute *attr,
2806 char *buf)
2807 {
2808 struct cxl_decoder *cxld = to_cxl_decoder(dev);
2809 ssize_t rc;
2810
2811 ACQUIRE(rwsem_read_intr, rwsem)(&cxl_rwsem.region);
2812 if ((rc = ACQUIRE_ERR(rwsem_read_intr, &rwsem)))
2813 return rc;
2814
2815 if (cxld->region)
2816 return sysfs_emit(buf, "%s\n", dev_name(&cxld->region->dev));
2817 return sysfs_emit(buf, "\n");
2818 }
2819 DEVICE_ATTR_RO(region);
2820
2821 static struct cxl_region *
2822 cxl_find_region_by_name(struct cxl_root_decoder *cxlrd, const char *name)
2823 {
2824 struct cxl_decoder *cxld = &cxlrd->cxlsd.cxld;
2825 struct device *region_dev;
2826
2827 region_dev = device_find_child_by_name(&cxld->dev, name);
2828 if (!region_dev)
2829 return ERR_PTR(-ENODEV);
2830
2831 return to_cxl_region(region_dev);
2832 }
2833
2834 static ssize_t delete_region_store(struct device *dev,
2835 struct device_attribute *attr,
2836 const char *buf, size_t len)
2837 {
2838 struct cxl_root_decoder *cxlrd = to_cxl_root_decoder(dev);
2839 struct cxl_port *port = to_cxl_port(dev->parent);
2840 struct cxl_region *cxlr;
2841
2842 cxlr = cxl_find_region_by_name(cxlrd, buf);
2843 if (IS_ERR(cxlr))
2844 return PTR_ERR(cxlr);
2845
2846 devm_release_action(port->uport_dev, unregister_region, cxlr);
2847 put_device(&cxlr->dev);
2848
2849 return len;
2850 }
2851 DEVICE_ATTR_WO(delete_region);
2852
2853 struct cxl_poison_context {
2854 struct cxl_port *port;
2855 int part;
2856 u64 offset;
2857 };
2858
2859 static int cxl_get_poison_unmapped(struct cxl_memdev *cxlmd,
2860 struct cxl_poison_context *ctx)
2861 {
2862 struct cxl_dev_state *cxlds = cxlmd->cxlds;
2863 const struct resource *res;
2864 struct resource *p, *last;
2865 u64 offset, length;
2866 int rc = 0;
2867
2868 if (ctx->part < 0)
2869 return 0;
2870
2871 /*
2872 * Collect poison for the remaining unmapped resources after
2873 * poison is collected by committed endpoint decoders.
2874 */
2875 for (int i = ctx->part; i < cxlds->nr_partitions; i++) {
2876 res = &cxlds->part[i].res;
2877 for (p = res->child, last = NULL; p; p = p->sibling)
2878 last = p;
2879 if (last)
2880 offset = last->end + 1;
2881 else
2882 offset = res->start;
2883 length = res->end - offset + 1;
2884 if (!length)
2885 break;
2886 rc = cxl_mem_get_poison(cxlmd, offset, length, NULL);
2887 if (rc == -EFAULT && cxlds->part[i].mode == CXL_PARTMODE_RAM)
2888 continue;
2889 if (rc)
2890 break;
2891 }
2892
2893 return rc;
2894 }
2895
2896 static int poison_by_decoder(struct device *dev, void *arg)
2897 {
2898 struct cxl_poison_context *ctx = arg;
2899 struct cxl_endpoint_decoder *cxled;
2900 enum cxl_partition_mode mode;
2901 struct cxl_dev_state *cxlds;
2902 struct cxl_memdev *cxlmd;
2903 u64 offset, length;
2904 int rc = 0;
2905
2906 if (!is_endpoint_decoder(dev))
2907 return rc;
2908
2909 cxled = to_cxl_endpoint_decoder(dev);
2910 if (!cxled->dpa_res)
2911 return rc;
2912
2913 cxlmd = cxled_to_memdev(cxled);
2914 cxlds = cxlmd->cxlds;
2915 mode = cxlds->part[cxled->part].mode;
2916
2917 if (cxled->skip) {
2918 offset = cxled->dpa_res->start - cxled->skip;
2919 length = cxled->skip;
2920 rc = cxl_mem_get_poison(cxlmd, offset, length, NULL);
2921 if (rc == -EFAULT && mode == CXL_PARTMODE_RAM)
2922 rc = 0;
2923 if (rc)
2924 return rc;
2925 }
2926
2927 offset = cxled->dpa_res->start;
2928 length = cxled->dpa_res->end - offset + 1;
2929 rc = cxl_mem_get_poison(cxlmd, offset, length, cxled->cxld.region);
2930 if (rc == -EFAULT && mode == CXL_PARTMODE_RAM)
2931 rc = 0;
2932 if (rc)
2933 return rc;
2934
2935 /* Iterate until commit_end is reached */
2936 if (cxled->cxld.id == ctx->port->commit_end) {
2937 ctx->offset = cxled->dpa_res->end + 1;
2938 ctx->part = cxled->part;
2939 return 1;
2940 }
2941
2942 return 0;
2943 }
2944
2945 int cxl_get_poison_by_endpoint(struct cxl_port *port)
2946 {
2947 struct cxl_poison_context ctx;
2948 int rc = 0;
2949
2950 ctx = (struct cxl_poison_context) {
2951 .port = port,
2952 .part = -1,
2953 };
2954
2955 rc = device_for_each_child(&port->dev, &ctx, poison_by_decoder);
2956 if (rc == 1)
2957 rc = cxl_get_poison_unmapped(to_cxl_memdev(port->uport_dev),
2958 &ctx);
2959
2960 return rc;
2961 }
2962
2963 struct cxl_dpa_to_region_context {
2964 struct cxl_region *cxlr;
2965 u64 dpa;
2966 };
2967
2968 static int __cxl_dpa_to_region(struct device *dev, void *arg)
2969 {
2970 struct cxl_dpa_to_region_context *ctx = arg;
2971 struct cxl_endpoint_decoder *cxled;
2972 struct cxl_region *cxlr;
2973 u64 dpa = ctx->dpa;
2974
2975 if (!is_endpoint_decoder(dev))
2976 return 0;
2977
2978 cxled = to_cxl_endpoint_decoder(dev);
2979 if (!cxled || !cxled->dpa_res || !resource_size(cxled->dpa_res))
2980 return 0;
2981
2982 if (!cxl_resource_contains_addr(cxled->dpa_res, dpa))
2983 return 0;
2984
2985 /*
2986 * Stop the region search (return 1) when an endpoint mapping is
2987 * found. The region may not be fully constructed, so a valid cxlr
2988 * in the context structure is not guaranteed.
2989 */
2990 cxlr = cxled->cxld.region;
2991 if (cxlr)
2992 dev_dbg(dev, "dpa:0x%llx mapped in region:%s\n", dpa,
2993 dev_name(&cxlr->dev));
2994 else
2995 dev_dbg(dev, "dpa:0x%llx mapped in endpoint:%s\n", dpa,
2996 dev_name(dev));
2997
2998 ctx->cxlr = cxlr;
2999
3000 return 1;
3001 }
3002
3003 struct cxl_region *cxl_dpa_to_region(const struct cxl_memdev *cxlmd, u64 dpa)
3004 {
3005 struct cxl_dpa_to_region_context ctx;
3006 struct cxl_port *port = cxlmd->endpoint;
3007
3008 if (!cxlmd->dev.driver)
3009 return NULL;
3010
3011 ctx = (struct cxl_dpa_to_region_context) {
3012 .dpa = dpa,
3013 };
3014 if (cxl_num_decoders_committed(port))
3015 device_for_each_child(&port->dev, &ctx, __cxl_dpa_to_region);
3016
3017 return ctx.cxlr;
3018 }
3019
3020 static bool cxl_is_hpa_in_chunk(u64 hpa, struct cxl_region *cxlr, int pos)
3021 {
3022 struct cxl_region_params *p = &cxlr->params;
3023 int gran = p->interleave_granularity;
3024 int ways = p->interleave_ways;
3025 u64 offset;
3026
3027 /* Is the hpa in an expected chunk for its pos(-ition) */
3028 offset = hpa - p->res->start;
3029 offset = do_div(offset, gran * ways);
3030 if ((offset >= pos * gran) && (offset < (pos + 1) * gran))
3031 return true;
3032
3033 dev_dbg(&cxlr->dev,
3034 "Addr trans fail: hpa 0x%llx not in expected chunk\n", hpa);
3035
3036 return false;
3037 }
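/*
 * Worked example for the chunk check above (illustrative values): with
 * gran = 256 and ways = 2, an hpa that lands 0x300 bytes into the region
 * gives offset = 0x300 % 0x200 = 0x100. That satisfies the check only for
 * pos 1 (0x100 <= offset < 0x200), so the same hpa presented for pos 0 is
 * rejected.
 */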
3038
3039 #define CXL_POS_ZERO 0
3040 /**
3041 * cxl_validate_translation_params() - validate interleave translation parameters
3042 * @eiw: encoded interleave ways
3043 * @eig: encoded interleave granularity
3044 * @pos: position in interleave
3045 *
3046 * Callers pass CXL_POS_ZERO when no position parameter needs validating.
3047 *
3048 * Return: 0 on success, -EINVAL on first invalid parameter
3049 */
3050 int cxl_validate_translation_params(u8 eiw, u16 eig, int pos)
3051 {
3052 int ways, gran;
3053
3054 if (eiw_to_ways(eiw, &ways)) {
3055 pr_debug("%s: invalid eiw=%u\n", __func__, eiw);
3056 return -EINVAL;
3057 }
3058 if (eig_to_granularity(eig, &gran)) {
3059 pr_debug("%s: invalid eig=%u\n", __func__, eig);
3060 return -EINVAL;
3061 }
3062 if (pos < 0 || pos >= ways) {
3063 pr_debug("%s: invalid pos=%d for ways=%u\n", __func__, pos,
3064 ways);
3065 return -EINVAL;
3066 }
3067
3068 return 0;
3069 }
3070 EXPORT_SYMBOL_FOR_MODULES(cxl_validate_translation_params, "cxl_translate");
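/*
 * Example usage (values assumed, using the usual CXL encodings where
 * eiw = 1 means 2 ways and eig = 0 means 256 byte granularity):
 *
 *	cxl_validate_translation_params(1, 0, 1);		// 0
 *	cxl_validate_translation_params(1, 0, 2);		// -EINVAL
 *	cxl_validate_translation_params(1, 0, CXL_POS_ZERO);	// 0
 */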
3071
3072 u64 cxl_calculate_dpa_offset(u64 hpa_offset, u8 eiw, u16 eig)
3073 {
3074 u64 dpa_offset, bits_lower, bits_upper, temp;
3075 int ret;
3076
3077 ret = cxl_validate_translation_params(eiw, eig, CXL_POS_ZERO);
3078 if (ret)
3079 return ULLONG_MAX;
3080
3081 /*
3082 * DPA offset: CXL Spec 3.2 Section 8.2.4.20.13
3083 * Lower bits [IG+7:0] pass through unchanged
3084 * (eiw < 8)
3085 * Per spec: DPAOffset[51:IG+8] = (HPAOffset[51:IG+IW+8] >> IW)
3086 * Clear the position bits to isolate upper section, then
3087 * reverse the left shift by eiw that occurred during DPA->HPA
3088 * (eiw >= 8)
3089 * Per spec: DPAOffset[51:IG+8] = HPAOffset[51:IG+IW] / 3
3090 * Extract upper bits from the correct bit range and divide by 3
3091 * to recover the original DPA upper bits
3092 */
3093 bits_lower = hpa_offset & GENMASK_ULL(eig + 7, 0);
3094 if (eiw < 8) {
3095 temp = hpa_offset &= ~GENMASK_ULL(eig + eiw + 8 - 1, 0);
3096 dpa_offset = temp >> eiw;
3097 } else {
3098 bits_upper = div64_u64(hpa_offset >> (eig + eiw), 3);
3099 dpa_offset = bits_upper << (eig + 8);
3100 }
3101 dpa_offset |= bits_lower;
3102
3103 return dpa_offset;
3104 }
3105 EXPORT_SYMBOL_FOR_MODULES(cxl_calculate_dpa_offset, "cxl_translate");
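/*
 * Worked example (illustrative values): with eiw = 1 (2 ways) and eig = 0
 * (256 byte granularity), hpa_offset = 0x300 decodes as:
 *
 *	bits_lower = 0x300 & 0xff	= 0x0
 *	temp	   = 0x300 & ~0x1ff	= 0x200
 *	dpa_offset = (0x200 >> 1) | 0x0	= 0x100
 */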
3106
3107 int cxl_calculate_position(u64 hpa_offset, u8 eiw, u16 eig)
3108 {
3109 unsigned int ways = 0;
3110 u64 shifted, rem;
3111 int pos, ret;
3112
3113 ret = cxl_validate_translation_params(eiw, eig, CXL_POS_ZERO);
3114 if (ret)
3115 return ret;
3116
3117 if (!eiw)
3118 /* position is 0 if no interleaving */
3119 return 0;
3120
3121 /*
3122 * Interleave position: CXL Spec 3.2 Section 8.2.4.20.13
3123 * eiw < 8
3124 * Position is in the IW bits at HPA_OFFSET[IG+8+IW-1:IG+8].
3125 * Per spec "remove IW bits starting with bit position IG+8"
3126 * eiw >= 8
3127 * Position is not explicitly stored in HPA_OFFSET bits. It is
3128 * derived from the modulo operation of the upper bits using
3129 * the total number of interleave ways.
3130 */
3131 if (eiw < 8) {
3132 pos = (hpa_offset >> (eig + 8)) & GENMASK(eiw - 1, 0);
3133 } else {
3134 shifted = hpa_offset >> (eig + 8);
3135 eiw_to_ways(eiw, &ways);
3136 div64_u64_rem(shifted, ways, &rem);
3137 pos = rem;
3138 }
3139
3140 return pos;
3141 }
3142 EXPORT_SYMBOL_FOR_MODULES(cxl_calculate_position, "cxl_translate");
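/*
 * Worked example (illustrative values): continuing the eiw = 1 / eig = 0
 * case above, hpa_offset = 0x300 yields:
 *
 *	pos = (0x300 >> 8) & GENMASK(0, 0) = 0x3 & 0x1 = 1
 *
 * i.e. the second device in a 2-way interleave owns that offset.
 */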
3143
3144 u64 cxl_calculate_hpa_offset(u64 dpa_offset, int pos, u8 eiw, u16 eig)
3145 {
3146 u64 mask_upper, hpa_offset, bits_upper;
3147 int ret;
3148
3149 ret = cxl_validate_translation_params(eiw, eig, pos);
3150 if (ret)
3151 return ULLONG_MAX;
3152
3153 /*
3154 * The device position in the region interleave set was removed
3155 * from the offset at HPA->DPA translation. To reconstruct the
3156 * HPA, place the 'pos' in the offset.
3157 *
3158 * The placement of 'pos' in the HPA is determined by interleave
3159 * ways and granularity and is defined in the CXL Spec 3.0 Section
3160 * 8.2.4.19.13 Implementation Note: Device Decode Logic
3161 */
3162
3163 mask_upper = GENMASK_ULL(51, eig + 8);
3164
3165 if (eiw < 8) {
3166 hpa_offset = (dpa_offset & mask_upper) << eiw;
3167 hpa_offset |= pos << (eig + 8);
3168 } else {
3169 bits_upper = (dpa_offset & mask_upper) >> (eig + 8);
3170 bits_upper = bits_upper * 3;
3171 hpa_offset = ((bits_upper << (eiw - 8)) + pos) << (eig + 8);
3172 }
3173
3174 /* The lower bits remain unchanged */
3175 hpa_offset |= dpa_offset & GENMASK_ULL(eig + 7, 0);
3176
3177 return hpa_offset;
3178 }
3179 EXPORT_SYMBOL_FOR_MODULES(cxl_calculate_hpa_offset, "cxl_translate");
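/*
 * Worked example (illustrative values), the inverse of the decode above.
 * With dpa_offset = 0x100, pos = 1, eiw = 1, eig = 0:
 *
 *	hpa_offset  = (0x100 & GENMASK_ULL(51, 8)) << 1	-> 0x200
 *	hpa_offset |= 1 << 8				-> 0x300
 *	hpa_offset |= 0x100 & 0xff (== 0x0)		-> 0x300
 *
 * A caller can also sanity check the three helpers against each other; a
 * minimal sketch, assuming eiw/eig/pos reflect the committed decoder
 * programming:
 *
 *	u64 hpa_off = cxl_calculate_hpa_offset(dpa_off, pos, eiw, eig);
 *
 *	WARN_ON(hpa_off != ULLONG_MAX &&
 *		(cxl_calculate_position(hpa_off, eiw, eig) != pos ||
 *		 cxl_calculate_dpa_offset(hpa_off, eiw, eig) != dpa_off));
 */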
3180
3181 static int decode_pos(int region_ways, int hb_ways, int pos, int *pos_port,
3182 int *pos_hb)
3183 {
3184 int devices_per_hb;
3185
3186 /*
3187 * Decode for 3-6-12 way interleaves as defined in the CXL
3188 * Spec 4.0 9.13.1.1 Legal Interleaving Configurations.
3189 * Region creation should prevent invalid combinations but
3190 * sanity check here to avoid a silent bad decode.
3191 */
3192 switch (hb_ways) {
3193 case 3:
3194 if (region_ways != 3 && region_ways != 6 && region_ways != 12)
3195 return -EINVAL;
3196 break;
3197 case 6:
3198 if (region_ways != 6 && region_ways != 12)
3199 return -EINVAL;
3200 break;
3201 case 12:
3202 if (region_ways != 12)
3203 return -EINVAL;
3204 break;
3205 default:
3206 return -EINVAL;
3207 }
3208 /*
3209 * Each host bridge contributes an equal number of endpoints
3210 * that are laid out contiguously per host bridge. Modulo
3211 * selects the port within a host bridge and division selects
3212 * the host bridge position.
3213 */
3214 devices_per_hb = region_ways / hb_ways;
3215 *pos_port = pos % devices_per_hb;
3216 *pos_hb = pos / devices_per_hb;
3217
3218 return 0;
3219 }
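/*
 * Worked example (illustrative values): a 6-way region over a 3-way host
 * bridge interleave has devices_per_hb = 6 / 3 = 2, so pos = 4 decodes to
 * pos_port = 4 % 2 = 0 and pos_hb = 4 / 2 = 2, i.e. the first port under
 * the third host bridge.
 */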
3220
3221 /*
3222 * restore_parent() reconstructs the address in the parent
3223 *
3224 * This math, specifically the bitmask creation 'mask = gran - 1', relies
3225 * on the CXL Spec requirement that interleave granularity is always a
3226 * power of two.
3227 *
3228 * [mask] isolate the offset with the granularity
3229 * [addr & ~mask] remove the offset leaving the aligned portion
3230 * [* ways] distribute across all interleave ways
3231 * [+ (pos * gran)] add the positional offset
3232 * [+ (addr & mask)] restore the masked offset
3233 */
3234 static u64 restore_parent(u64 addr, u64 pos, u64 gran, u64 ways)
3235 {
3236 u64 mask = gran - 1;
3237
3238 return ((addr & ~mask) * ways) + (pos * gran) + (addr & mask);
3239 }
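/*
 * Worked example (illustrative values): restoring a parent address for
 * addr = 0x500, pos = 1, gran = 0x100, ways = 2:
 *
 *	mask			= 0xff
 *	(addr & ~mask) * 2	= 0xa00
 *	+ pos * gran		= 0xb00
 *	+ (addr & mask)		= 0xb00
 */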
3240
3241 /*
3242 * unaligned_dpa_to_hpa() translates a DPA to HPA when the region resource
3243 * start address is not aligned at Host Bridge Interleave Ways * 256MB.
3244 *
3245 * Unaligned start addresses only occur with MOD3 interleaves. All power-
3246 * of-two interleaves are guaranteed aligned.
3247 */
3248 static u64 unaligned_dpa_to_hpa(struct cxl_decoder *cxld,
3249 struct cxl_region_params *p, int pos, u64 dpa)
3250 {
3251 int ways_port = p->interleave_ways / cxld->interleave_ways;
3252 int gran_port = p->interleave_granularity;
3253 int gran_hb = cxld->interleave_granularity;
3254 int ways_hb = cxld->interleave_ways;
3255 int pos_port, pos_hb, gran_shift;
3256 u64 hpa_port = 0;
3257
3258 /* Decode an endpoint 'pos' into port and host-bridge components */
3259 if (decode_pos(p->interleave_ways, ways_hb, pos, &pos_port, &pos_hb)) {
3260 dev_dbg(&cxld->dev, "not supported for region ways:%d\n",
3261 p->interleave_ways);
3262 return ULLONG_MAX;
3263 }
3264
3265 /* Restore the port parent address if needed */
3266 if (gran_hb != gran_port)
3267 hpa_port = restore_parent(dpa, pos_port, gran_port, ways_port);
3268 else
3269 hpa_port = dpa;
3270
3271 /*
3272 * Complete the HPA reconstruction by restoring the address as if
3273 * each HB position is a candidate. Test against expected pos_hb
3274 * to confirm match.
3275 */
3276 gran_shift = ilog2(gran_hb);
3277 for (int position = 0; position < ways_hb; position++) {
3278 u64 shifted, hpa;
3279
3280 hpa = restore_parent(hpa_port, position, gran_hb, ways_hb);
3281 hpa += p->res->start;
3282
3283 shifted = hpa >> gran_shift;
3284 if (do_div(shifted, ways_hb) == pos_hb)
3285 return hpa;
3286 }
3287
3288 dev_dbg(&cxld->dev, "fail dpa:%#llx region:%pr pos:%d\n", dpa, p->res,
3289 pos);
3290 dev_dbg(&cxld->dev, " port-w/g/p:%d/%d/%d hb-w/g/p:%d/%d/%d\n",
3291 ways_port, gran_port, pos_port, ways_hb, gran_hb, pos_hb);
3292
3293 return ULLONG_MAX;
3294 }
3295
3296 static bool region_is_unaligned_mod3(struct cxl_region *cxlr)
3297 {
3298 struct cxl_root_decoder *cxlrd = to_cxl_root_decoder(cxlr->dev.parent);
3299 struct cxl_decoder *cxld = &cxlrd->cxlsd.cxld;
3300 struct cxl_region_params *p = &cxlr->params;
3301 int hbiw = cxld->interleave_ways;
3302 u64 rem;
3303
3304 if (is_power_of_2(hbiw))
3305 return false;
3306
3307 div64_u64_rem(p->res->start, (u64)hbiw * SZ_256M, &rem);
3308
3309 return (rem != 0);
3310 }
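/*
 * Example (illustrative values): a 3-way host bridge interleave needs the
 * region to start on a 3 * 256MB = 768MB boundary. A region starting at
 * 0x60000000 (2 * 768MB) is aligned, while one starting at 0x80000000
 * leaves a 512MB remainder and takes the unaligned MOD3 path.
 */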
3311
3312 u64 cxl_dpa_to_hpa(struct cxl_region *cxlr, const struct cxl_memdev *cxlmd,
3313 u64 dpa)
3314 {
3315 struct cxl_root_decoder *cxlrd = cxlr->cxlrd;
3316 struct cxl_decoder *cxld = &cxlrd->cxlsd.cxld;
3317 struct cxl_region_params *p = &cxlr->params;
3318 struct cxl_endpoint_decoder *cxled = NULL;
3319 u64 base, dpa_offset, hpa_offset, hpa;
3320 bool unaligned = false;
3321 u16 eig = 0;
3322 u8 eiw = 0;
3323 int pos;
3324
3325 /*
3326 * Conversion between SPA and DPA is not supported in
3327 * Normalized Address mode.
3328 */
3329 if (test_bit(CXL_REGION_F_NORMALIZED_ADDRESSING, &cxlr->flags))
3330 return ULLONG_MAX;
3331
3332 for (int i = 0; i < p->nr_targets; i++) {
3333 if (cxlmd == cxled_to_memdev(p->targets[i])) {
3334 cxled = p->targets[i];
3335 break;
3336 }
3337 }
3338 if (!cxled)
3339 return ULLONG_MAX;
3340
3341 base = cxl_dpa_resource_start(cxled);
3342 if (base == RESOURCE_SIZE_MAX)
3343 return ULLONG_MAX;
3344
3345 dpa_offset = dpa - base;
3346
3347 /* Unaligned calc for MOD3 interleaves not hbiw * 256MB aligned */
3348 unaligned = region_is_unaligned_mod3(cxlr);
3349 if (unaligned) {
3350 hpa = unaligned_dpa_to_hpa(cxld, p, cxled->pos, dpa_offset);
3351 if (hpa == ULLONG_MAX)
3352 return ULLONG_MAX;
3353
3354 goto skip_aligned;
3355 }
3356 /*
3357 * Aligned calc for all power-of-2 interleaves and for MOD3
3358 * interleaves that are aligned at hbiw * 256MB
3359 */
3360 pos = cxled->pos;
3361 ways_to_eiw(p->interleave_ways, &eiw);
3362 granularity_to_eig(p->interleave_granularity, &eig);
3363
3364 hpa_offset = cxl_calculate_hpa_offset(dpa_offset, pos, eiw, eig);
3365 if (hpa_offset == ULLONG_MAX)
3366 return ULLONG_MAX;
3367
3368 /* Apply the hpa_offset to the region base address */
3369 hpa = hpa_offset + p->res->start;
3370
3371 skip_aligned:
3372 hpa += p->cache_size;
3373
3374 /* Root decoder translation overrides typical modulo decode */
3375 if (cxlrd->ops.hpa_to_spa)
3376 hpa = cxlrd->ops.hpa_to_spa(cxlrd, hpa);
3377
3378 if (hpa == ULLONG_MAX)
3379 return ULLONG_MAX;
3380
3381 if (!cxl_resource_contains_addr(p->res, hpa)) {
3382 dev_dbg(&cxlr->dev,
3383 "Addr trans fail: hpa 0x%llx not in region\n", hpa);
3384 return ULLONG_MAX;
3385 }
3386 /* Chunk check applies to aligned modulo decodes only */
3387 if (!unaligned && !cxlrd->ops.hpa_to_spa &&
3388 !cxl_is_hpa_in_chunk(hpa, cxlr, pos))
3389 return ULLONG_MAX;
3390
3391 return hpa;
3392 }
3393
3394 struct dpa_result {
3395 struct cxl_memdev *cxlmd;
3396 u64 dpa;
3397 };
3398
3399 static int unaligned_region_offset_to_dpa_result(struct cxl_region *cxlr,
3400 u64 offset,
3401 struct dpa_result *result)
3402 {
3403 struct cxl_root_decoder *cxlrd = to_cxl_root_decoder(cxlr->dev.parent);
3404 struct cxl_decoder *cxld = &cxlrd->cxlsd.cxld;
3405 struct cxl_region_params *p = &cxlr->params;
3406 u64 interleave_width, interleave_index;
3407 u64 gran, gran_offset, dpa_offset;
3408 u64 hpa = p->res->start + offset;
3409 u64 tmp = offset;
3410
3411 /*
3412 * Unaligned addresses are not algebraically invertible. Calculate
3413 * a dpa_offset independent of the target device and then enumerate
3414 * and test that dpa_offset against each candidate endpoint decoder.
3415 */
3416 gran = cxld->interleave_granularity;
3417 interleave_width = gran * cxld->interleave_ways;
3418 interleave_index = div64_u64(offset, interleave_width);
3419 gran_offset = do_div(tmp, gran);
3420
3421 dpa_offset = interleave_index * gran + gran_offset;
3422
3423 for (int i = 0; i < p->nr_targets; i++) {
3424 struct cxl_endpoint_decoder *cxled = p->targets[i];
3425 int pos = cxled->pos;
3426 u64 test_hpa;
3427
3428 test_hpa = unaligned_dpa_to_hpa(cxld, p, pos, dpa_offset);
3429 if (test_hpa == hpa) {
3430 result->cxlmd = cxled_to_memdev(cxled);
3431 result->dpa =
3432 cxl_dpa_resource_start(cxled) + dpa_offset;
3433 return 0;
3434 }
3435 }
3436 dev_err(&cxlr->dev,
3437 "failed to resolve HPA %#llx in unaligned MOD3 region\n", hpa);
3438
3439 return -ENXIO;
3440 }
3441
3442 static int region_offset_to_dpa_result(struct cxl_region *cxlr, u64 offset,
3443 struct dpa_result *result)
3444 {
3445 struct cxl_region_params *p = &cxlr->params;
3446 struct cxl_root_decoder *cxlrd = cxlr->cxlrd;
3447 struct cxl_endpoint_decoder *cxled;
3448 u64 hpa_offset = offset;
3449 u64 dpa, dpa_offset;
3450 u16 eig = 0;
3451 u8 eiw = 0;
3452 int pos;
3453
3454 lockdep_assert_held(&cxl_rwsem.region);
3455 lockdep_assert_held(&cxl_rwsem.dpa);
3456
3457 /* Input validation ensures valid ways and gran */
3458 granularity_to_eig(p->interleave_granularity, &eig);
3459 ways_to_eiw(p->interleave_ways, &eiw);
3460
3461 /*
3462 * If the root decoder has an SPA to CXL HPA callback, use it. Otherwise
3463 * CXL HPA is assumed to equal SPA.
3464 */
3465 if (cxlrd->ops.spa_to_hpa) {
3466 hpa_offset = cxlrd->ops.spa_to_hpa(cxlrd, p->res->start + offset);
3467 if (hpa_offset == ULLONG_MAX) {
3468 dev_dbg(&cxlr->dev, "HPA not found for %pr offset %#llx\n",
3469 p->res, offset);
3470 return -ENXIO;
3471 }
3472 hpa_offset -= p->res->start;
3473 }
3474
3475 if (region_is_unaligned_mod3(cxlr))
3476 return unaligned_region_offset_to_dpa_result(cxlr, offset,
3477 result);
3478
3479 pos = cxl_calculate_position(hpa_offset, eiw, eig);
3480 if (pos < 0 || pos >= p->nr_targets) {
3481 dev_dbg(&cxlr->dev, "Invalid position %d for %d targets\n",
3482 pos, p->nr_targets);
3483 return -ENXIO;
3484 }
3485
3486 dpa_offset = cxl_calculate_dpa_offset(hpa_offset, eiw, eig);
3487
3488 /* Look-up and return the result: a memdev and a DPA */
3489 for (int i = 0; i < p->nr_targets; i++) {
3490 cxled = p->targets[i];
3491 if (cxled->pos != pos)
3492 continue;
3493
3494 dpa = cxl_dpa_resource_start(cxled);
3495 if (dpa != RESOURCE_SIZE_MAX)
3496 dpa += dpa_offset;
3497
3498 result->cxlmd = cxled_to_memdev(cxled);
3499 result->dpa = dpa;
3500
3501 return 0;
3502 }
3503 dev_err(&cxlr->dev, "No device found for position %d\n", pos);
3504
3505 return -ENXIO;
3506 }
3507
3508 static int match_root_decoder(struct device *dev, const void *data)
3509 {
3510 const struct range *r1, *r2 = data;
3511 struct cxl_root_decoder *cxlrd;
3512
3513 if (!is_root_decoder(dev))
3514 return 0;
3515
3516 cxlrd = to_cxl_root_decoder(dev);
3517 r1 = &cxlrd->cxlsd.cxld.hpa_range;
3518
3519 return range_contains(r1, r2);
3520 }
3521
3522 static int cxl_root_setup_translation(struct cxl_root *cxl_root,
3523 struct cxl_region_context *ctx)
3524 {
3525 if (!cxl_root->ops.translation_setup_root)
3526 return 0;
3527
3528 return cxl_root->ops.translation_setup_root(cxl_root, ctx);
3529 }
3530
3531 /*
3532 * Note, when finished with the device, drop the reference with
3533 * put_device() or use the put_cxl_root_decoder helper.
3534 */
3535 static struct cxl_root_decoder *
3536 get_cxl_root_decoder(struct cxl_endpoint_decoder *cxled,
3537 struct cxl_region_context *ctx)
3538 {
3539 struct cxl_memdev *cxlmd = cxled_to_memdev(cxled);
3540 struct cxl_port *port = cxled_to_port(cxled);
3541 struct cxl_root *cxl_root __free(put_cxl_root) = find_cxl_root(port);
3542 struct device *cxlrd_dev;
3543 int rc;
3544
3545 /*
3546 * Adjust the endpoint's HPA range and interleaving
3547 * configuration to the root decoder's memory space before
3548 * setting up the root decoder.
3549 */
3550 rc = cxl_root_setup_translation(cxl_root, ctx);
3551 if (rc) {
3552 dev_err(cxlmd->dev.parent,
3553 "%s:%s Failed to setup translation for address range %#llx:%#llx\n",
3554 dev_name(&cxlmd->dev), dev_name(&cxled->cxld.dev),
3555 ctx->hpa_range.start, ctx->hpa_range.end);
3556 return ERR_PTR(rc);
3557 }
3558
3559 cxlrd_dev = device_find_child(&cxl_root->port.dev, &ctx->hpa_range,
3560 match_root_decoder);
3561 if (!cxlrd_dev) {
3562 dev_err(cxlmd->dev.parent,
3563 "%s:%s no CXL window for range %#llx:%#llx\n",
3564 dev_name(&cxlmd->dev), dev_name(&cxled->cxld.dev),
3565 ctx->hpa_range.start, ctx->hpa_range.end);
3566 return ERR_PTR(-ENXIO);
3567 }
3568
3569 return to_cxl_root_decoder(cxlrd_dev);
3570 }
3571
3572 static int match_region_by_range(struct device *dev, const void *data)
3573 {
3574 struct cxl_region_params *p;
3575 struct cxl_region *cxlr;
3576 const struct range *r = data;
3577
3578 if (!is_cxl_region(dev))
3579 return 0;
3580
3581 cxlr = to_cxl_region(dev);
3582 p = &cxlr->params;
3583
3584 guard(rwsem_read)(&cxl_rwsem.region);
3585 return spa_maps_hpa(p, r);
3586 }
3587
3588 static int cxl_extended_linear_cache_resize(struct cxl_region *cxlr,
3589 struct resource *res)
3590 {
3591 struct cxl_root_decoder *cxlrd = cxlr->cxlrd;
3592 struct cxl_region_params *p = &cxlr->params;
3593 resource_size_t size = resource_size(res);
3594 resource_size_t cache_size, start;
3595
3596 cache_size = cxlrd->cache_size;
3597 if (!cache_size)
3598 return 0;
3599
3600 if (size != cache_size) {
3601 dev_warn(&cxlr->dev,
3602 "Extended Linear Cache size %pa != CXL size %pa. No Support!",
3603 &cache_size, &size);
3604 return -ENXIO;
3605 }
3606
3607 /*
3608 * Move the start of the range to where the cache range starts. The
3609 * implementation assumes that the cache range is in front of the
3610 * CXL range. This is not dictated by the HMAT spec but is how the
3611 * current known implementation is configured.
3612 *
3613 * The cache range is expected to be within the CFMWS. The adjusted
3614 * res->start should not be less than cxlrd->res->start.
3615 */
3616 start = res->start - cache_size;
3617 if (start < cxlrd->res->start)
3618 return -ENXIO;
3619
3620 res->start = start;
3621 p->cache_size = cache_size;
3622
3623 return 0;
3624 }
3625
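/*
 * Fill in an autodiscovered region: claim the HPA range from the root
 * decoder's resource, publish the interleave settings from @ctx, and
 * advance the region to CXL_CONFIG_INTERLEAVE_ACTIVE.
 */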
static int __construct_region(struct cxl_region *cxlr,
			      struct cxl_region_context *ctx)
{
	struct cxl_endpoint_decoder *cxled = ctx->cxled;
	struct cxl_root_decoder *cxlrd = cxlr->cxlrd;
	struct cxl_memdev *cxlmd = cxled_to_memdev(cxled);
	struct range *hpa_range = &ctx->hpa_range;
	struct cxl_region_params *p;
	struct resource *res;
	int rc;

	guard(rwsem_write)(&cxl_rwsem.region);
	p = &cxlr->params;
	if (p->state >= CXL_CONFIG_INTERLEAVE_ACTIVE) {
		dev_err(cxlmd->dev.parent,
			"%s:%s: %s autodiscovery interrupted\n",
			dev_name(&cxlmd->dev), dev_name(&cxled->cxld.dev),
			__func__);
		return -EBUSY;
	}

	set_bit(CXL_REGION_F_AUTO, &cxlr->flags);
	cxlr->hpa_range = *hpa_range;

	res = kmalloc_obj(*res);
	if (!res)
		return -ENOMEM;

	*res = DEFINE_RES_MEM_NAMED(hpa_range->start, range_len(hpa_range),
				    dev_name(&cxlr->dev));

	rc = cxl_extended_linear_cache_resize(cxlr, res);
	if (rc && rc != -EOPNOTSUPP) {
		/*
		 * Failure to resize for the extended linear cache does not
		 * prevent the region from functioning; it only causes the
		 * region size reported by 'cxl list' to be incorrect.
		 */
		dev_warn(cxlmd->dev.parent,
			 "Extended linear cache calculation failed rc:%d\n", rc);
	}

	rc = sysfs_update_group(&cxlr->dev.kobj, &cxl_region_group);
	if (rc) {
		kfree(res);
		return rc;
	}

	rc = insert_resource(cxlrd->res, res);
	if (rc) {
		/*
		 * Platform-firmware may not have split resources like "System
		 * RAM" on CXL window boundaries, see cxl_region_iomem_release()
		 */
		dev_warn(cxlmd->dev.parent,
			 "%s:%s: %s %s cannot insert resource\n",
			 dev_name(&cxlmd->dev), dev_name(&cxled->cxld.dev),
			 __func__, dev_name(&cxlr->dev));
	}

	p->res = res;
	p->interleave_ways = ctx->interleave_ways;
	p->interleave_granularity = ctx->interleave_granularity;
	p->state = CXL_CONFIG_INTERLEAVE_ACTIVE;

	rc = sysfs_update_group(&cxlr->dev.kobj, get_cxl_region_target_group());
	if (rc)
		return rc;

	dev_dbg(cxlmd->dev.parent, "%s:%s: %s %s res: %pr iw: %d ig: %d\n",
		dev_name(&cxlmd->dev), dev_name(&cxled->cxld.dev), __func__,
		dev_name(&cxlr->dev), p->res, p->interleave_ways,
		p->interleave_granularity);

	/* ...to match put_device() in cxl_add_to_region() */
	get_device(&cxlr->dev);

	return 0;
}

/* Establish an empty region covering the given HPA range */
static struct cxl_region *construct_region(struct cxl_root_decoder *cxlrd,
					   struct cxl_region_context *ctx)
{
	struct cxl_endpoint_decoder *cxled = ctx->cxled;
	struct cxl_memdev *cxlmd = cxled_to_memdev(cxled);
	struct cxl_port *port = cxlrd_to_port(cxlrd);
	struct cxl_dev_state *cxlds = cxlmd->cxlds;
	int rc, part = READ_ONCE(cxled->part);
	struct cxl_region *cxlr;

	do {
		cxlr = __create_region(cxlrd, cxlds->part[part].mode,
				       atomic_read(&cxlrd->region_id),
				       cxled->cxld.target_type);
	} while (IS_ERR(cxlr) && PTR_ERR(cxlr) == -EBUSY);

	if (IS_ERR(cxlr)) {
		dev_err(cxlmd->dev.parent,
			"%s:%s: %s failed assign region: %ld\n",
			dev_name(&cxlmd->dev), dev_name(&cxled->cxld.dev),
			__func__, PTR_ERR(cxlr));
		return cxlr;
	}

	rc = __construct_region(cxlr, ctx);
	if (rc) {
		devm_release_action(port->uport_dev, unregister_region, cxlr);
		return ERR_PTR(rc);
	}

	return cxlr;
}

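/* Find an existing region (reference held) that maps @hpa_range under @cxlrd */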
static struct cxl_region *
cxl_find_region_by_range(struct cxl_root_decoder *cxlrd,
			 struct range *hpa_range)
{
	struct device *region_dev;

	region_dev = device_find_child(&cxlrd->cxlsd.cxld.dev, hpa_range,
				       match_region_by_range);
	if (!region_dev)
		return NULL;

	return to_cxl_region(region_dev);
}

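/**
 * cxl_add_to_region() - attach an endpoint decoder to the region for its HPA range
 * @cxled: endpoint decoder with an HPA range programmed by platform firmware
 *
 * Look up the root decoder covering the decoder's HPA range, find or
 * construct the matching region, and attach the decoder as a target.
 * Returns 0 on success or a negative error code.
 */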
int cxl_add_to_region(struct cxl_endpoint_decoder *cxled)
{
	struct cxl_region_context ctx;
	struct cxl_region_params *p;
	bool attach = false;
	int rc;

	ctx = (struct cxl_region_context) {
		.cxled = cxled,
		.hpa_range = cxled->cxld.hpa_range,
		.interleave_ways = cxled->cxld.interleave_ways,
		.interleave_granularity = cxled->cxld.interleave_granularity,
	};

	struct cxl_root_decoder *cxlrd __free(put_cxl_root_decoder) =
		get_cxl_root_decoder(cxled, &ctx);

	if (IS_ERR(cxlrd))
		return PTR_ERR(cxlrd);

	/*
	 * Ensure that, if multiple threads race to construct_region()
	 * for the HPA range, one does the construction and the others
	 * add to that.
	 */
	mutex_lock(&cxlrd->range_lock);
	struct cxl_region *cxlr __free(put_cxl_region) =
		cxl_find_region_by_range(cxlrd, &ctx.hpa_range);
	if (!cxlr)
		cxlr = construct_region(cxlrd, &ctx);
	mutex_unlock(&cxlrd->range_lock);

	rc = PTR_ERR_OR_ZERO(cxlr);
	if (rc)
		return rc;

	attach_target(cxlr, cxled, -1, TASK_UNINTERRUPTIBLE);

	scoped_guard(rwsem_read, &cxl_rwsem.region) {
		p = &cxlr->params;
		attach = p->state == CXL_CONFIG_COMMIT;
	}

	if (attach) {
		/*
		 * If device_attach() fails the range may still be active via
		 * the platform-firmware memory map, otherwise the driver for
		 * regions is local to this file, so driver matching can't fail.
		 */
		if (device_attach(&cxlr->dev) < 0)
			dev_err(&cxlr->dev, "failed to enable, range: %pr\n",
				p->res);
	}

	return rc;
}
EXPORT_SYMBOL_NS_GPL(cxl_add_to_region, "CXL");

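/**
 * cxl_port_get_spa_cache_alias() - return the extended linear cache alias of @spa
 * @endpoint: endpoint port whose regions are searched for @spa
 * @spa: system physical address to alias
 *
 * Returns the address on the other side of the extended linear cache that
 * aliases @spa, or ~0ULL if no region with a cache covers @spa.
 */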
u64 cxl_port_get_spa_cache_alias(struct cxl_port *endpoint, u64 spa)
{
	struct cxl_region_ref *iter;
	unsigned long index;

	if (!endpoint)
		return ~0ULL;

	guard(rwsem_write)(&cxl_rwsem.region);

	xa_for_each(&endpoint->regions, index, iter) {
		struct cxl_region_params *p = &iter->region->params;

		if (cxl_resource_contains_addr(p->res, spa)) {
			if (!p->cache_size)
				return ~0ULL;

			if (spa >= p->res->start + p->cache_size)
				return spa - p->cache_size;

			return spa + p->cache_size;
		}
	}

	return ~0ULL;
}
EXPORT_SYMBOL_NS_GPL(cxl_port_get_spa_cache_alias, "CXL");

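/* walk_iomem_res_desc() callback: any match means the range is already online as System RAM */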
static int is_system_ram(struct resource *res, void *arg)
{
	struct cxl_region *cxlr = arg;
	struct cxl_region_params *p = &cxlr->params;

	dev_dbg(&cxlr->dev, "%pr has System RAM: %pr\n", p->res, res);
	return 1;
}

static void shutdown_notifiers(void *_cxlr)
{
	struct cxl_region *cxlr = _cxlr;

	unregister_node_notifier(&cxlr->node_notifier);
	unregister_mt_adistance_algorithm(&cxlr->adist_notifier);
}

static void remove_debugfs(void *dentry)
{
	debugfs_remove_recursive(dentry);
}

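/*
 * Reject debugfs poison offsets that land inside the extended linear
 * cache alias or beyond the end of the region resource.
 */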
static int validate_region_offset(struct cxl_region *cxlr, u64 offset)
{
	struct cxl_region_params *p = &cxlr->params;
	resource_size_t region_size;
	u64 hpa;

	if (offset < p->cache_size) {
		dev_err(&cxlr->dev,
			"Offset %#llx is within extended linear cache %pa\n",
			offset, &p->cache_size);
		return -EINVAL;
	}

	region_size = resource_size(p->res);
	if (offset >= region_size) {
		dev_err(&cxlr->dev, "Offset %#llx exceeds region size %pa\n",
			offset, &region_size);
		return -EINVAL;
	}

	hpa = p->res->start + offset;
	if (hpa < p->res->start || hpa > p->res->end) {
		dev_err(&cxlr->dev, "HPA %#llx not in region %pr\n", hpa,
			p->res);
		return -EINVAL;
	}

	return 0;
}

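/*
 * debugfs 'inject_poison' handler: translate a region offset to the owning
 * memdev's DPA and inject poison there.
 */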
static int cxl_region_debugfs_poison_inject(void *data, u64 offset)
{
	struct dpa_result result = { .dpa = ULLONG_MAX, .cxlmd = NULL };
	struct cxl_region *cxlr = data;
	int rc;

	ACQUIRE(rwsem_read_intr, region_rwsem)(&cxl_rwsem.region);
	if ((rc = ACQUIRE_ERR(rwsem_read_intr, &region_rwsem)))
		return rc;

	ACQUIRE(rwsem_read_intr, dpa_rwsem)(&cxl_rwsem.dpa);
	if ((rc = ACQUIRE_ERR(rwsem_read_intr, &dpa_rwsem)))
		return rc;

	if (validate_region_offset(cxlr, offset))
		return -EINVAL;

	offset -= cxlr->params.cache_size;
	rc = region_offset_to_dpa_result(cxlr, offset, &result);
	if (rc || !result.cxlmd || result.dpa == ULLONG_MAX) {
		dev_dbg(&cxlr->dev,
			"Failed to resolve DPA for region offset %#llx rc %d\n",
			offset, rc);

		return rc ? rc : -EINVAL;
	}

	return cxl_inject_poison_locked(result.cxlmd, result.dpa);
}

DEFINE_DEBUGFS_ATTRIBUTE(cxl_poison_inject_fops, NULL,
			 cxl_region_debugfs_poison_inject, "%llx\n");

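/* debugfs 'clear_poison' handler: the clear counterpart to poison injection above */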
static int cxl_region_debugfs_poison_clear(void *data, u64 offset)
{
	struct dpa_result result = { .dpa = ULLONG_MAX, .cxlmd = NULL };
	struct cxl_region *cxlr = data;
	int rc;

	ACQUIRE(rwsem_read_intr, region_rwsem)(&cxl_rwsem.region);
	if ((rc = ACQUIRE_ERR(rwsem_read_intr, &region_rwsem)))
		return rc;

	ACQUIRE(rwsem_read_intr, dpa_rwsem)(&cxl_rwsem.dpa);
	if ((rc = ACQUIRE_ERR(rwsem_read_intr, &dpa_rwsem)))
		return rc;

	if (validate_region_offset(cxlr, offset))
		return -EINVAL;

	offset -= cxlr->params.cache_size;
	rc = region_offset_to_dpa_result(cxlr, offset, &result);
	if (rc || !result.cxlmd || result.dpa == ULLONG_MAX) {
		dev_dbg(&cxlr->dev,
			"Failed to resolve DPA for region offset %#llx rc %d\n",
			offset, rc);

		return rc ? rc : -EINVAL;
	}

	return cxl_clear_poison_locked(result.cxlmd, result.dpa);
}

DEFINE_DEBUGFS_ATTRIBUTE(cxl_poison_clear_fops, NULL,
			 cxl_region_debugfs_poison_clear, "%llx\n");

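/*
 * Create the region's inject_poison/clear_poison debugfs attributes, but
 * only when every member memdev supports both commands and the region is
 * not operating in Normalized Address mode.
 */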
static int cxl_region_setup_poison(struct cxl_region *cxlr)
{
	struct device *dev = &cxlr->dev;
	struct cxl_region_params *p = &cxlr->params;
	struct dentry *dentry;

	/*
	 * Do not enable poison injection in Normalized Address mode.
	 * Conversion between SPA and DPA is required for this, but it is
	 * not supported in this mode.
	 */
	if (test_bit(CXL_REGION_F_NORMALIZED_ADDRESSING, &cxlr->flags))
		return 0;

	/* Create poison attributes if all memdevs support the capabilities */
	for (int i = 0; i < p->nr_targets; i++) {
		struct cxl_endpoint_decoder *cxled = p->targets[i];
		struct cxl_memdev *cxlmd = cxled_to_memdev(cxled);

		if (!cxl_memdev_has_poison_cmd(cxlmd, CXL_POISON_ENABLED_INJECT) ||
		    !cxl_memdev_has_poison_cmd(cxlmd, CXL_POISON_ENABLED_CLEAR))
			return 0;
	}

	dentry = cxl_debugfs_create_dir(dev_name(dev));
	debugfs_create_file("inject_poison", 0200, dentry, cxlr,
			    &cxl_poison_inject_fops);
	debugfs_create_file("clear_poison", 0200, dentry, cxlr,
			    &cxl_poison_clear_fops);

	return devm_add_action_or_reset(dev, remove_debugfs, dentry);
}

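/* bus_find_device() callback: match a committed region whose resource contains @data */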
static int region_contains_resource(struct device *dev, const void *data)
{
	const struct resource *res = data;
	struct cxl_region *cxlr;
	struct cxl_region_params *p;

	if (!is_cxl_region(dev))
		return 0;

	cxlr = to_cxl_region(dev);
	p = &cxlr->params;

	if (p->state != CXL_CONFIG_COMMIT)
		return 0;

	if (!p->res)
		return 0;

	return resource_contains(p->res, res) ? 1 : 0;
}

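/* Report whether any committed CXL region fully contains @res (used by dax_hmem) */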
bool cxl_region_contains_resource(const struct resource *res)
{
	guard(rwsem_read)(&cxl_rwsem.region);
	struct device *dev __free(put_device) = bus_find_device(
		&cxl_bus_type, NULL, res, region_contains_resource);
	return !!dev;
}
EXPORT_SYMBOL_FOR_MODULES(cxl_region_contains_resource, "dax_hmem");

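/* Validate that the region is committed and does not need a reset before probing */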
static int cxl_region_can_probe(struct cxl_region *cxlr)
{
	struct cxl_region_params *p = &cxlr->params;
	int rc;

	ACQUIRE(rwsem_read_intr, rwsem)(&cxl_rwsem.region);
	if ((rc = ACQUIRE_ERR(rwsem_read_intr, &rwsem))) {
		dev_dbg(&cxlr->dev, "probe interrupted\n");
		return rc;
	}

	if (p->state < CXL_CONFIG_COMMIT) {
		dev_dbg(&cxlr->dev, "config state: %d\n", p->state);
		return -ENXIO;
	}

	if (test_bit(CXL_REGION_F_NEEDS_RESET, &cxlr->flags)) {
		dev_err(&cxlr->dev,
			"failed to activate, re-commit region and retry\n");
		return -ENXIO;
	}

	return 0;
}

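/*
 * Region driver probe: register node and memory-tier notifiers, set up the
 * poison debugfs attributes, then hand the region to the pmem or dax
 * sub-drivers according to its partition mode.
 */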
static int cxl_region_probe(struct device *dev)
{
	struct cxl_region *cxlr = to_cxl_region(dev);
	struct cxl_region_params *p = &cxlr->params;
	int rc;

	rc = cxl_region_can_probe(cxlr);
	if (rc)
		return rc;

	/*
	 * From this point on any path that changes the region's state away from
	 * CXL_CONFIG_COMMIT is also responsible for releasing the driver.
	 */

	cxlr->node_notifier.notifier_call = cxl_region_perf_attrs_callback;
	cxlr->node_notifier.priority = CXL_CALLBACK_PRI;
	register_node_notifier(&cxlr->node_notifier);

	cxlr->adist_notifier.notifier_call = cxl_region_calculate_adistance;
	cxlr->adist_notifier.priority = 100;
	register_mt_adistance_algorithm(&cxlr->adist_notifier);

	rc = devm_add_action_or_reset(&cxlr->dev, shutdown_notifiers, cxlr);
	if (rc)
		return rc;

	rc = cxl_region_setup_poison(cxlr);
	if (rc)
		return rc;

	switch (cxlr->mode) {
	case CXL_PARTMODE_PMEM:
		rc = devm_cxl_region_edac_register(cxlr);
		if (rc)
			dev_dbg(&cxlr->dev, "CXL EDAC registration for region_id=%d failed\n",
				cxlr->id);

		return devm_cxl_add_pmem_region(cxlr);
	case CXL_PARTMODE_RAM:
		rc = devm_cxl_region_edac_register(cxlr);
		if (rc)
			dev_dbg(&cxlr->dev, "CXL EDAC registration for region_id=%d failed\n",
				cxlr->id);

		/*
		 * The region cannot be managed by CXL if any portion of
		 * it is already online as 'System RAM'
		 */
		if (walk_iomem_res_desc(IORES_DESC_NONE,
					IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY,
					p->res->start, p->res->end, cxlr,
					is_system_ram) > 0)
			return 0;
		return devm_cxl_add_dax_region(cxlr);
	default:
		dev_dbg(&cxlr->dev, "unsupported region mode: %d\n",
			cxlr->mode);
		return -ENXIO;
	}
}

static struct cxl_driver cxl_region_driver = {
	.name = "cxl_region",
	.probe = cxl_region_probe,
	.id = CXL_DEVICE_REGION,
};

int cxl_region_init(void)
{
	return cxl_driver_register(&cxl_region_driver);
}

void cxl_region_exit(void)
{
	cxl_driver_unregister(&cxl_region_driver);
}

MODULE_IMPORT_NS("CXL");
MODULE_IMPORT_NS("DEVMEM");
MODULE_ALIAS_CXL(CXL_DEVICE_REGION);