xref: /linux/drivers/cxl/core/region.c (revision f72af41a43e16276c46d44cf8a833cc0f9ba9d48)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /* Copyright(c) 2022 Intel Corporation. All rights reserved. */
3 #include <linux/memregion.h>
4 #include <linux/genalloc.h>
5 #include <linux/debugfs.h>
6 #include <linux/device.h>
7 #include <linux/module.h>
8 #include <linux/memory.h>
9 #include <linux/slab.h>
10 #include <linux/uuid.h>
11 #include <linux/sort.h>
12 #include <linux/idr.h>
13 #include <linux/memory-tiers.h>
14 #include <linux/string_choices.h>
15 #include <cxlmem.h>
16 #include <cxl.h>
17 #include "core.h"
18 
19 /**
20  * DOC: cxl core region
21  *
22  * CXL Regions represent mapped memory capacity in system physical address
23  * space. Whereas the CXL Root Decoders identify the bounds of potential CXL
24  * Memory ranges, Regions represent the active mapped capacity by the HDM
25  * Decoder Capability structures throughout the Host Bridges, Switches, and
26  * Endpoints in the topology.
27  *
28  * Region configuration has ordering constraints. UUID may be set at any time
29  * but is only visible for persistent regions.
30  * 1. Interleave granularity
31  * 2. Interleave size
32  * 3. Decoder targets
33  */
34 
/*
 * Nodemask with one bit per node. A node's bit is set once the
 * access_coordinates for that node have been updated by the CXL memory
 * hotplug notifier. The mask starts out empty (NODE_MASK_NONE).
 */
static nodemask_t nodemask_region_seen = NODE_MASK_NONE;

/* Forward declaration: the sysfs attribute helpers below need it. */
static struct cxl_region *to_cxl_region(struct device *dev);
42 
/*
 * Initializer for a read-only (mode 0444) device attribute named @_name
 * whose ->show() callback is the function _name##_access##_level##_show.
 */
#define __ACCESS_ATTR_RO(_level, _name) {				\
	.attr	= { .name = __stringify(_name), .mode = 0444 },		\
	.show	= _name##_access##_level##_show,			\
}
47 
/*
 * Define 'struct device_attribute dev_attr_access<level>_<name>' using the
 * __ACCESS_ATTR_RO() initializer. The storage class is supplied by the user
 * of the macro.
 */
#define ACCESS_DEVICE_ATTR_RO(level, name)	\
	struct device_attribute dev_attr_access##level##_##name = __ACCESS_ATTR_RO(level, name)
50 
/*
 * Generate the sysfs ->show() handler <attrib>_access<level>_show() together
 * with its static device attribute dev_attr_access<level>_<attrib>.
 *
 * The handler prints cxlr->coord[level].attrib as an unsigned decimal. A
 * value of zero is reported as -ENOENT instead of being printed.
 */
#define ACCESS_ATTR_RO(level, attrib)					      \
static ssize_t attrib##_access##level##_show(struct device *dev,	      \
					  struct device_attribute *attr,      \
					  char *buf)			      \
{									      \
	struct cxl_region *cxlr = to_cxl_region(dev);			      \
									      \
	if (cxlr->coord[level].attrib == 0)				      \
		return -ENOENT;						      \
									      \
	return sysfs_emit(buf, "%u\n", cxlr->coord[level].attrib);	      \
}									      \
static ACCESS_DEVICE_ATTR_RO(level, attrib)
64 
/* Show handlers and device attributes for the coord[0] fields */
ACCESS_ATTR_RO(0, read_bandwidth);
ACCESS_ATTR_RO(0, read_latency);
ACCESS_ATTR_RO(0, write_bandwidth);
ACCESS_ATTR_RO(0, write_latency);

/* Address of the 'struct attribute' embedded in dev_attr_access<level>_<attrib> */
#define ACCESS_ATTR_DECLARE(level, attrib)	\
	(&dev_attr_access##level##_##attrib.attr)

/* NULL-terminated attribute list for the "access0" sysfs group */
static struct attribute *access0_coordinate_attrs[] = {
	ACCESS_ATTR_DECLARE(0, read_bandwidth),
	ACCESS_ATTR_DECLARE(0, write_bandwidth),
	ACCESS_ATTR_DECLARE(0, read_latency),
	ACCESS_ATTR_DECLARE(0, write_latency),
	NULL
};
80 
/* Show handlers and device attributes for the coord[1] fields */
ACCESS_ATTR_RO(1, read_bandwidth);
ACCESS_ATTR_RO(1, read_latency);
ACCESS_ATTR_RO(1, write_bandwidth);
ACCESS_ATTR_RO(1, write_latency);

/* NULL-terminated attribute list for the "access1" sysfs group */
static struct attribute *access1_coordinate_attrs[] = {
	ACCESS_ATTR_DECLARE(1, read_bandwidth),
	ACCESS_ATTR_DECLARE(1, write_bandwidth),
	ACCESS_ATTR_DECLARE(1, read_latency),
	ACCESS_ATTR_DECLARE(1, write_latency),
	NULL
};
93 
/*
 * Generate cxl_region_access<level>_coordinate_visible(), the ->is_visible()
 * callback for the access<level> attribute group.
 *
 * Each of the four coordinate attributes is hidden (mode 0) while the
 * matching cxlr->coord[level] field is zero; otherwise the attribute keeps
 * its declared mode.
 */
#define ACCESS_VISIBLE(level)						\
static umode_t cxl_region_access##level##_coordinate_visible(		\
		struct kobject *kobj, struct attribute *a, int n)	\
{									\
	struct device *dev = kobj_to_dev(kobj);				\
	struct cxl_region *cxlr = to_cxl_region(dev);			\
									\
	if (a == &dev_attr_access##level##_read_latency.attr &&		\
	    cxlr->coord[level].read_latency == 0)			\
		return 0;						\
									\
	if (a == &dev_attr_access##level##_write_latency.attr &&	\
	    cxlr->coord[level].write_latency == 0)			\
		return 0;						\
									\
	if (a == &dev_attr_access##level##_read_bandwidth.attr &&	\
	    cxlr->coord[level].read_bandwidth == 0)			\
		return 0;						\
									\
	if (a == &dev_attr_access##level##_write_bandwidth.attr &&	\
	    cxlr->coord[level].write_bandwidth == 0)			\
		return 0;						\
									\
	return a->mode;							\
}

/* Visibility callbacks for the access0 and access1 groups */
ACCESS_VISIBLE(0);
ACCESS_VISIBLE(1);
122 
/*
 * "access0" sysfs group: the coord[0] attributes, each shown only while its
 * value is non-zero (see the is_visible callback).
 */
static const struct attribute_group cxl_region_access0_coordinate_group = {
	.name = "access0",
	.attrs = access0_coordinate_attrs,
	.is_visible = cxl_region_access0_coordinate_visible,
};

/* Accessor returning the address of the "access0" attribute group */
static const struct attribute_group *get_cxl_region_access0_group(void)
{
	return &cxl_region_access0_coordinate_group;
}
133 
/*
 * "access1" sysfs group: the coord[1] attributes, each shown only while its
 * value is non-zero (see the is_visible callback).
 */
static const struct attribute_group cxl_region_access1_coordinate_group = {
	.name = "access1",
	.attrs = access1_coordinate_attrs,
	.is_visible = cxl_region_access1_coordinate_visible,
};

/* Accessor returning the address of the "access1" attribute group */
static const struct attribute_group *get_cxl_region_access1_group(void)
{
	return &cxl_region_access1_coordinate_group;
}
144 
145 static ssize_t uuid_show(struct device *dev, struct device_attribute *attr,
146 			 char *buf)
147 {
148 	struct cxl_region *cxlr = to_cxl_region(dev);
149 	struct cxl_region_params *p = &cxlr->params;
150 	ssize_t rc;
151 
152 	ACQUIRE(rwsem_read_intr, region_rwsem)(&cxl_rwsem.region);
153 	if ((rc = ACQUIRE_ERR(rwsem_read_intr, &region_rwsem)))
154 		return rc;
155 	if (cxlr->mode != CXL_PARTMODE_PMEM)
156 		return sysfs_emit(buf, "\n");
157 	return sysfs_emit(buf, "%pUb\n", &p->uuid);
158 }
159 
160 static int is_dup(struct device *match, void *data)
161 {
162 	struct cxl_region_params *p;
163 	struct cxl_region *cxlr;
164 	uuid_t *uuid = data;
165 
166 	if (!is_cxl_region(match))
167 		return 0;
168 
169 	lockdep_assert_held(&cxl_rwsem.region);
170 	cxlr = to_cxl_region(match);
171 	p = &cxlr->params;
172 
173 	if (uuid_equal(&p->uuid, uuid)) {
174 		dev_dbg(match, "already has uuid: %pUb\n", uuid);
175 		return -EBUSY;
176 	}
177 
178 	return 0;
179 }
180 
181 static ssize_t uuid_store(struct device *dev, struct device_attribute *attr,
182 			  const char *buf, size_t len)
183 {
184 	struct cxl_region *cxlr = to_cxl_region(dev);
185 	struct cxl_region_params *p = &cxlr->params;
186 	uuid_t temp;
187 	ssize_t rc;
188 
189 	if (len != UUID_STRING_LEN + 1)
190 		return -EINVAL;
191 
192 	rc = uuid_parse(buf, &temp);
193 	if (rc)
194 		return rc;
195 
196 	if (uuid_is_null(&temp))
197 		return -EINVAL;
198 
199 	ACQUIRE(rwsem_write_kill, region_rwsem)(&cxl_rwsem.region);
200 	if ((rc = ACQUIRE_ERR(rwsem_write_kill, &region_rwsem)))
201 		return rc;
202 
203 	if (uuid_equal(&p->uuid, &temp))
204 		return len;
205 
206 	if (p->state >= CXL_CONFIG_ACTIVE)
207 		return -EBUSY;
208 
209 	rc = bus_for_each_dev(&cxl_bus_type, NULL, &temp, is_dup);
210 	if (rc < 0)
211 		return rc;
212 
213 	uuid_copy(&p->uuid, &temp);
214 
215 	return len;
216 }
217 static DEVICE_ATTR_RW(uuid);
218 
/*
 * Look up the region reference that @port tracks for @cxlr.
 *
 * port->regions is an xarray indexed by the region pointer value. Returns
 * whatever xa_load() finds for that index (NULL when nothing is stored).
 */
static struct cxl_region_ref *cxl_rr_load(struct cxl_port *port,
					  struct cxl_region *cxlr)
{
	return xa_load(&port->regions, (unsigned long)cxlr);
}
224 
225 static int cxl_region_invalidate_memregion(struct cxl_region *cxlr)
226 {
227 	if (!cpu_cache_has_invalidate_memregion()) {
228 		if (IS_ENABLED(CONFIG_CXL_REGION_INVALIDATION_TEST)) {
229 			dev_info_once(
230 				&cxlr->dev,
231 				"Bypassing cpu_cache_invalidate_memregion() for testing!\n");
232 			return 0;
233 		}
234 		dev_WARN(&cxlr->dev,
235 			"Failed to synchronize CPU cache state\n");
236 		return -ENXIO;
237 	}
238 
239 	if (!cxlr->params.res)
240 		return -ENXIO;
241 	cpu_cache_invalidate_memregion(cxlr->params.res->start,
242 				       resource_size(cxlr->params.res));
243 	return 0;
244 }
245 
/*
 * Tear down the decoders programmed for the first @count targets of @cxlr.
 * @cxlr: region being reset
 * @count: number of entries of p->targets[] to process, walked from
 *	   index @count - 1 down to 0
 *
 * Does nothing when the region has CXL_REGION_F_LOCK set. Otherwise the CPU
 * cache invalidation is attempted (its result is intentionally not checked)
 * and then, per target, the decoders tracked for this region are reset
 * followed by the endpoint decoder itself. CXL_REGION_F_NEEDS_RESET is set
 * while decoders are being reset and cleared once the whole walk finished.
 */
static void cxl_region_decode_reset(struct cxl_region *cxlr, int count)
{
	struct cxl_region_params *p = &cxlr->params;
	int i;

	if (test_bit(CXL_REGION_F_LOCK, &cxlr->flags))
		return;

	/*
	 * Before region teardown attempt to flush, evict any data cached for
	 * this region, or scream loudly about missing arch / platform support
	 * for CXL teardown.
	 */
	cxl_region_invalidate_memregion(cxlr);

	for (i = count - 1; i >= 0; i--) {
		struct cxl_endpoint_decoder *cxled = p->targets[i];
		struct cxl_memdev *cxlmd = cxled_to_memdev(cxled);
		struct cxl_port *iter = cxled_to_port(cxled);
		struct cxl_dev_state *cxlds = cxlmd->cxlds;
		struct cxl_ep *ep;

		/* When cxlds->rcd is set only the endpoint decoder is reset */
		if (cxlds->rcd)
			goto endpoint_reset;

		/* Climb until @iter is the port whose parent is the root */
		while (!is_cxl_root(to_cxl_port(iter->dev.parent)))
			iter = to_cxl_port(iter->dev.parent);

		/*
		 * Follow the ep->next links starting at that port and reset
		 * the decoder each visited port tracks for this region.
		 * NOTE(review): unlike the teardown loop in
		 * cxl_region_decode_commit(), @ep and the cxl_rr_load()
		 * result are dereferenced here without a NULL check -
		 * confirm both are guaranteed for every visited port.
		 */
		for (ep = cxl_ep_load(iter, cxlmd); iter;
		     iter = ep->next, ep = cxl_ep_load(iter, cxlmd)) {
			struct cxl_region_ref *cxl_rr;
			struct cxl_decoder *cxld;

			cxl_rr = cxl_rr_load(iter, cxlr);
			cxld = cxl_rr->decoder;
			if (cxld->reset)
				cxld->reset(cxld);
			set_bit(CXL_REGION_F_NEEDS_RESET, &cxlr->flags);
		}

endpoint_reset:
		/* The endpoint decoder's ->reset() is called unconditionally */
		cxled->cxld.reset(&cxled->cxld);
		set_bit(CXL_REGION_F_NEEDS_RESET, &cxlr->flags);
	}

	/* all decoders associated with this region have been torn down */
	clear_bit(CXL_REGION_F_NEEDS_RESET, &cxlr->flags);
}
294 
295 static int commit_decoder(struct cxl_decoder *cxld)
296 {
297 	struct cxl_switch_decoder *cxlsd = NULL;
298 
299 	if (cxld->commit)
300 		return cxld->commit(cxld);
301 
302 	if (is_switch_decoder(&cxld->dev))
303 		cxlsd = to_cxl_switch_decoder(&cxld->dev);
304 
305 	if (dev_WARN_ONCE(&cxld->dev, !cxlsd || cxlsd->nr_targets > 1,
306 			  "->commit() is required\n"))
307 		return -ENXIO;
308 	return 0;
309 }
310 
/*
 * Commit the decoders for every target of @cxlr.
 *
 * For each of the p->nr_targets endpoint decoders, the decoder tracked for
 * this region is committed on every port from the endpoint's port upwards,
 * stopping before the root port.
 *
 * On the first failure the partially programmed target is unwound: starting
 * at the port whose commit failed, the ep->next links are followed and each
 * visited decoder is reset, then the endpoint decoder is reset. Finally the
 * targets that had been fully committed before (indices below the failing
 * one) are reset via cxl_region_decode_reset().
 *
 * Returns 0 on success or the error from commit_decoder().
 */
static int cxl_region_decode_commit(struct cxl_region *cxlr)
{
	struct cxl_region_params *p = &cxlr->params;
	int i, rc = 0;

	for (i = 0; i < p->nr_targets; i++) {
		struct cxl_endpoint_decoder *cxled = p->targets[i];
		struct cxl_memdev *cxlmd = cxled_to_memdev(cxled);
		struct cxl_region_ref *cxl_rr;
		struct cxl_decoder *cxld;
		struct cxl_port *iter;
		struct cxl_ep *ep;

		/* commit bottom up */
		for (iter = cxled_to_port(cxled); !is_cxl_root(iter);
		     iter = to_cxl_port(iter->dev.parent)) {
			cxl_rr = cxl_rr_load(iter, cxlr);
			cxld = cxl_rr->decoder;
			rc = commit_decoder(cxld);
			if (rc)
				break;
		}

		if (rc) {
			/* programming @iter failed, teardown */
			for (ep = cxl_ep_load(iter, cxlmd); ep && iter;
			     iter = ep->next, ep = cxl_ep_load(iter, cxlmd)) {
				cxl_rr = cxl_rr_load(iter, cxlr);
				cxld = cxl_rr->decoder;
				if (cxld->reset)
					cxld->reset(cxld);
			}

			cxled->cxld.reset(&cxled->cxld);
			goto err;
		}
	}

	return 0;

err:
	/* undo the targets that were successfully committed */
	cxl_region_decode_reset(cxlr, i);
	return rc;
}
356 
357 static int queue_reset(struct cxl_region *cxlr)
358 {
359 	struct cxl_region_params *p = &cxlr->params;
360 	int rc;
361 
362 	ACQUIRE(rwsem_write_kill, rwsem)(&cxl_rwsem.region);
363 	if ((rc = ACQUIRE_ERR(rwsem_write_kill, &rwsem)))
364 		return rc;
365 
366 	/* Already in the requested state? */
367 	if (p->state < CXL_CONFIG_COMMIT)
368 		return 0;
369 
370 	p->state = CXL_CONFIG_RESET_PENDING;
371 
372 	return 0;
373 }
374 
375 static int __commit(struct cxl_region *cxlr)
376 {
377 	struct cxl_region_params *p = &cxlr->params;
378 	int rc;
379 
380 	ACQUIRE(rwsem_write_kill, rwsem)(&cxl_rwsem.region);
381 	if ((rc = ACQUIRE_ERR(rwsem_write_kill, &rwsem)))
382 		return rc;
383 
384 	/* Already in the requested state? */
385 	if (p->state >= CXL_CONFIG_COMMIT)
386 		return 0;
387 
388 	/* Not ready to commit? */
389 	if (p->state < CXL_CONFIG_ACTIVE)
390 		return -ENXIO;
391 
392 	/*
393 	 * Invalidate caches before region setup to drop any speculative
394 	 * consumption of this address space
395 	 */
396 	rc = cxl_region_invalidate_memregion(cxlr);
397 	if (rc)
398 		return rc;
399 
400 	rc = cxl_region_decode_commit(cxlr);
401 	if (rc)
402 		return rc;
403 
404 	p->state = CXL_CONFIG_COMMIT;
405 
406 	return 0;
407 }
408 
/*
 * sysfs 'commit' write handler.
 *
 * The input is parsed as a boolean. A true value commits the region via
 * __commit(). A false value is refused with -EPERM for a region that has
 * CXL_REGION_F_LOCK set; otherwise a reset is queued, the region driver is
 * released, and the queued reset is carried out under the region write lock.
 *
 * Returns @len on success or a negative errno.
 */
static ssize_t commit_store(struct device *dev, struct device_attribute *attr,
			    const char *buf, size_t len)
{
	struct cxl_region *cxlr = to_cxl_region(dev);
	struct cxl_region_params *p = &cxlr->params;
	bool commit;
	ssize_t rc;

	rc = kstrtobool(buf, &commit);
	if (rc)
		return rc;

	if (commit) {
		rc = __commit(cxlr);
		if (rc)
			return rc;
		return len;
	}

	if (test_bit(CXL_REGION_F_LOCK, &cxlr->flags))
		return -EPERM;

	rc = queue_reset(cxlr);
	if (rc)
		return rc;

	/*
	 * Unmap the region and depend on the reset-pending state to ensure
	 * it does not go active again until post reset
	 */
	device_release_driver(&cxlr->dev);

	/*
	 * With the reset pending take cxl_rwsem.region unconditionally
	 * to ensure the reset gets handled before returning.
	 */
	guard(rwsem_write)(&cxl_rwsem.region);

	/*
	 * Revalidate that the reset is still pending in case another
	 * thread already handled this reset.
	 */
	if (p->state == CXL_CONFIG_RESET_PENDING) {
		cxl_region_decode_reset(cxlr, p->interleave_ways);
		p->state = CXL_CONFIG_ACTIVE;
	}

	return len;
}
458 
459 static ssize_t commit_show(struct device *dev, struct device_attribute *attr,
460 			   char *buf)
461 {
462 	struct cxl_region *cxlr = to_cxl_region(dev);
463 	struct cxl_region_params *p = &cxlr->params;
464 	ssize_t rc;
465 
466 	ACQUIRE(rwsem_read_intr, rwsem)(&cxl_rwsem.region);
467 	if ((rc = ACQUIRE_ERR(rwsem_read_intr, &rwsem)))
468 		return rc;
469 	return sysfs_emit(buf, "%d\n", p->state >= CXL_CONFIG_COMMIT);
470 }
471 static DEVICE_ATTR_RW(commit);
472 
473 static ssize_t interleave_ways_show(struct device *dev,
474 				    struct device_attribute *attr, char *buf)
475 {
476 	struct cxl_region *cxlr = to_cxl_region(dev);
477 	struct cxl_region_params *p = &cxlr->params;
478 	int rc;
479 
480 	ACQUIRE(rwsem_read_intr, rwsem)(&cxl_rwsem.region);
481 	if ((rc = ACQUIRE_ERR(rwsem_read_intr, &rwsem)))
482 		return rc;
483 	return sysfs_emit(buf, "%d\n", p->interleave_ways);
484 }
485 
486 static const struct attribute_group *get_cxl_region_target_group(void);
487 
488 static int set_interleave_ways(struct cxl_region *cxlr, int val)
489 {
490 	struct cxl_root_decoder *cxlrd = cxlr->cxlrd;
491 	struct cxl_decoder *cxld = &cxlrd->cxlsd.cxld;
492 	struct cxl_region_params *p = &cxlr->params;
493 	int save, rc;
494 	u8 iw;
495 
496 	rc = ways_to_eiw(val, &iw);
497 	if (rc)
498 		return rc;
499 
500 	/*
501 	 * Even for x3, x6, and x12 interleaves the region interleave must be a
502 	 * power of 2 multiple of the host bridge interleave.
503 	 */
504 	if (!is_power_of_2(val / cxld->interleave_ways) ||
505 	    (val % cxld->interleave_ways)) {
506 		dev_dbg(&cxlr->dev, "invalid interleave: %d\n", val);
507 		return -EINVAL;
508 	}
509 
510 	lockdep_assert_held_write(&cxl_rwsem.region);
511 
512 	if (p->state >= CXL_CONFIG_INTERLEAVE_ACTIVE)
513 		return -EBUSY;
514 
515 	save = p->interleave_ways;
516 	p->interleave_ways = val;
517 	rc = sysfs_update_group(&cxlr->dev.kobj, get_cxl_region_target_group());
518 	if (rc)
519 		p->interleave_ways = save;
520 
521 	return rc;
522 }
523 
524 static ssize_t interleave_ways_store(struct device *dev,
525 				     struct device_attribute *attr,
526 				     const char *buf, size_t len)
527 {
528 	struct cxl_region *cxlr = to_cxl_region(dev);
529 	int val;
530 	int rc;
531 
532 	rc = kstrtoint(buf, 0, &val);
533 	if (rc)
534 		return rc;
535 
536 	ACQUIRE(rwsem_write_kill, rwsem)(&cxl_rwsem.region);
537 	if ((rc = ACQUIRE_ERR(rwsem_write_kill, &rwsem)))
538 		return rc;
539 
540 	rc = set_interleave_ways(cxlr, val);
541 	if (rc)
542 		return rc;
543 
544 	return len;
545 }
546 static DEVICE_ATTR_RW(interleave_ways);
547 
548 static ssize_t interleave_granularity_show(struct device *dev,
549 					   struct device_attribute *attr,
550 					   char *buf)
551 {
552 	struct cxl_region *cxlr = to_cxl_region(dev);
553 	struct cxl_region_params *p = &cxlr->params;
554 	int rc;
555 
556 	ACQUIRE(rwsem_read_intr, rwsem)(&cxl_rwsem.region);
557 	if ((rc = ACQUIRE_ERR(rwsem_read_intr, &rwsem)))
558 		return rc;
559 	return sysfs_emit(buf, "%d\n", p->interleave_granularity);
560 }
561 
562 static int set_interleave_granularity(struct cxl_region *cxlr, int val)
563 {
564 	struct cxl_root_decoder *cxlrd = cxlr->cxlrd;
565 	struct cxl_decoder *cxld = &cxlrd->cxlsd.cxld;
566 	struct cxl_region_params *p = &cxlr->params;
567 	int rc;
568 	u16 ig;
569 
570 	rc = granularity_to_eig(val, &ig);
571 	if (rc)
572 		return rc;
573 
574 	/*
575 	 * When the host-bridge is interleaved, disallow region granularity !=
576 	 * root granularity. Regions with a granularity less than the root
577 	 * interleave result in needing multiple endpoints to support a single
578 	 * slot in the interleave (possible to support in the future). Regions
579 	 * with a granularity greater than the root interleave result in invalid
580 	 * DPA translations (invalid to support).
581 	 */
582 	if (cxld->interleave_ways > 1 && val != cxld->interleave_granularity)
583 		return -EINVAL;
584 
585 	lockdep_assert_held_write(&cxl_rwsem.region);
586 
587 	if (p->state >= CXL_CONFIG_INTERLEAVE_ACTIVE)
588 		return -EBUSY;
589 
590 	p->interleave_granularity = val;
591 	return 0;
592 }
593 
594 static ssize_t interleave_granularity_store(struct device *dev,
595 					    struct device_attribute *attr,
596 					    const char *buf, size_t len)
597 {
598 	struct cxl_region *cxlr = to_cxl_region(dev);
599 	int rc, val;
600 
601 	rc = kstrtoint(buf, 0, &val);
602 	if (rc)
603 		return rc;
604 
605 	ACQUIRE(rwsem_write_kill, rwsem)(&cxl_rwsem.region);
606 	if ((rc = ACQUIRE_ERR(rwsem_write_kill, &rwsem)))
607 		return rc;
608 
609 	rc = set_interleave_granularity(cxlr, val);
610 	if (rc)
611 		return rc;
612 
613 	return len;
614 }
615 static DEVICE_ATTR_RW(interleave_granularity);
616 
617 static ssize_t resource_show(struct device *dev, struct device_attribute *attr,
618 			     char *buf)
619 {
620 	struct cxl_region *cxlr = to_cxl_region(dev);
621 	struct cxl_region_params *p = &cxlr->params;
622 	u64 resource = -1ULL;
623 	int rc;
624 
625 	ACQUIRE(rwsem_read_intr, rwsem)(&cxl_rwsem.region);
626 	if ((rc = ACQUIRE_ERR(rwsem_read_intr, &rwsem)))
627 		return rc;
628 
629 	if (p->res)
630 		resource = p->res->start;
631 	return sysfs_emit(buf, "%#llx\n", resource);
632 }
633 static DEVICE_ATTR_RO(resource);
634 
635 static ssize_t mode_show(struct device *dev, struct device_attribute *attr,
636 			 char *buf)
637 {
638 	struct cxl_region *cxlr = to_cxl_region(dev);
639 	const char *desc;
640 
641 	if (cxlr->mode == CXL_PARTMODE_RAM)
642 		desc = "ram";
643 	else if (cxlr->mode == CXL_PARTMODE_PMEM)
644 		desc = "pmem";
645 	else
646 		desc = "";
647 
648 	return sysfs_emit(buf, "%s\n", desc);
649 }
650 static DEVICE_ATTR_RO(mode);
651 
652 static int alloc_hpa(struct cxl_region *cxlr, resource_size_t size)
653 {
654 	struct cxl_root_decoder *cxlrd = cxlr->cxlrd;
655 	struct cxl_region_params *p = &cxlr->params;
656 	struct resource *res;
657 	u64 remainder = 0;
658 
659 	lockdep_assert_held_write(&cxl_rwsem.region);
660 
661 	/* Nothing to do... */
662 	if (p->res && resource_size(p->res) == size)
663 		return 0;
664 
665 	/* To change size the old size must be freed first */
666 	if (p->res)
667 		return -EBUSY;
668 
669 	if (p->state >= CXL_CONFIG_INTERLEAVE_ACTIVE)
670 		return -EBUSY;
671 
672 	/* ways, granularity and uuid (if PMEM) need to be set before HPA */
673 	if (!p->interleave_ways || !p->interleave_granularity ||
674 	    (cxlr->mode == CXL_PARTMODE_PMEM && uuid_is_null(&p->uuid)))
675 		return -ENXIO;
676 
677 	div64_u64_rem(size, (u64)SZ_256M * p->interleave_ways, &remainder);
678 	if (remainder)
679 		return -EINVAL;
680 
681 	res = alloc_free_mem_region(cxlrd->res, size, SZ_256M,
682 				    dev_name(&cxlr->dev));
683 	if (IS_ERR(res)) {
684 		dev_dbg(&cxlr->dev,
685 			"HPA allocation error (%ld) for size:%pap in %s %pr\n",
686 			PTR_ERR(res), &size, cxlrd->res->name, cxlrd->res);
687 		return PTR_ERR(res);
688 	}
689 
690 	cxlr->hpa_range = DEFINE_RANGE(res->start, res->end);
691 
692 	p->res = res;
693 	p->state = CXL_CONFIG_INTERLEAVE_ACTIVE;
694 
695 	return 0;
696 }
697 
698 static void cxl_region_iomem_release(struct cxl_region *cxlr)
699 {
700 	struct cxl_region_params *p = &cxlr->params;
701 
702 	if (device_is_registered(&cxlr->dev))
703 		lockdep_assert_held_write(&cxl_rwsem.region);
704 	if (p->res) {
705 		/*
706 		 * Autodiscovered regions may not have been able to insert their
707 		 * resource.
708 		 */
709 		if (p->res->parent)
710 			remove_resource(p->res);
711 		kfree(p->res);
712 		p->res = NULL;
713 	}
714 }
715 
716 static int free_hpa(struct cxl_region *cxlr)
717 {
718 	struct cxl_region_params *p = &cxlr->params;
719 
720 	lockdep_assert_held_write(&cxl_rwsem.region);
721 
722 	if (!p->res)
723 		return 0;
724 
725 	if (p->state >= CXL_CONFIG_ACTIVE)
726 		return -EBUSY;
727 
728 	cxlr->hpa_range = DEFINE_RANGE(0, -1);
729 
730 	cxl_region_iomem_release(cxlr);
731 	p->state = CXL_CONFIG_IDLE;
732 	return 0;
733 }
734 
735 static ssize_t size_store(struct device *dev, struct device_attribute *attr,
736 			  const char *buf, size_t len)
737 {
738 	struct cxl_region *cxlr = to_cxl_region(dev);
739 	u64 val;
740 	int rc;
741 
742 	rc = kstrtou64(buf, 0, &val);
743 	if (rc)
744 		return rc;
745 
746 	ACQUIRE(rwsem_write_kill, rwsem)(&cxl_rwsem.region);
747 	if ((rc = ACQUIRE_ERR(rwsem_write_kill, &rwsem)))
748 		return rc;
749 
750 	if (val)
751 		rc = alloc_hpa(cxlr, val);
752 	else
753 		rc = free_hpa(cxlr);
754 
755 	if (rc)
756 		return rc;
757 
758 	return len;
759 }
760 
761 static ssize_t size_show(struct device *dev, struct device_attribute *attr,
762 			 char *buf)
763 {
764 	struct cxl_region *cxlr = to_cxl_region(dev);
765 	struct cxl_region_params *p = &cxlr->params;
766 	u64 size = 0;
767 	ssize_t rc;
768 
769 	ACQUIRE(rwsem_read_intr, rwsem)(&cxl_rwsem.region);
770 	if ((rc = ACQUIRE_ERR(rwsem_read_intr, &rwsem)))
771 		return rc;
772 	if (p->res)
773 		size = resource_size(p->res);
774 	return sysfs_emit(buf, "%#llx\n", size);
775 }
776 static DEVICE_ATTR_RW(size);
777 
778 static ssize_t extended_linear_cache_size_show(struct device *dev,
779 					       struct device_attribute *attr,
780 					       char *buf)
781 {
782 	struct cxl_region *cxlr = to_cxl_region(dev);
783 	struct cxl_region_params *p = &cxlr->params;
784 	ssize_t rc;
785 
786 	ACQUIRE(rwsem_read_intr, rwsem)(&cxl_rwsem.region);
787 	if ((rc = ACQUIRE_ERR(rwsem_read_intr, &rwsem)))
788 		return rc;
789 	return sysfs_emit(buf, "%pap\n", &p->cache_size);
790 }
791 static DEVICE_ATTR_RO(extended_linear_cache_size);
792 
793 static ssize_t locked_show(struct device *dev,
794 			   struct device_attribute *attr,
795 			   char *buf)
796 {
797 	struct cxl_region *cxlr = to_cxl_region(dev);
798 	int rc;
799 
800 	ACQUIRE(rwsem_read_intr, rwsem)(&cxl_rwsem.region);
801 	if ((rc = ACQUIRE_ERR(rwsem_read_intr, &rwsem)))
802 		return rc;
803 
804 	rc = test_bit(CXL_REGION_F_LOCK, &cxlr->flags);
805 	return sysfs_emit(buf, "%d\n", rc);
806 }
807 static DEVICE_ATTR_RO(locked);
808 
/*
 * NULL-terminated list of the region's base sysfs attributes. Per-attribute
 * visibility is decided by cxl_region_visible().
 */
static struct attribute *cxl_region_attrs[] = {
	&dev_attr_uuid.attr,
	&dev_attr_commit.attr,
	&dev_attr_interleave_ways.attr,
	&dev_attr_interleave_granularity.attr,
	&dev_attr_resource.attr,
	&dev_attr_size.attr,
	&dev_attr_mode.attr,
	&dev_attr_extended_linear_cache_size.attr,
	&dev_attr_locked.attr,
	NULL,
};
821 
822 static umode_t cxl_region_visible(struct kobject *kobj, struct attribute *a,
823 				  int n)
824 {
825 	struct device *dev = kobj_to_dev(kobj);
826 	struct cxl_region *cxlr = to_cxl_region(dev);
827 
828 	/*
829 	 * Support tooling that expects to find a 'uuid' attribute for all
830 	 * regions regardless of mode.
831 	 */
832 	if (a == &dev_attr_uuid.attr && cxlr->mode != CXL_PARTMODE_PMEM)
833 		return 0444;
834 
835 	/*
836 	 * Don't display extended linear cache attribute if there is no
837 	 * extended linear cache.
838 	 */
839 	if (a == &dev_attr_extended_linear_cache_size.attr &&
840 	    cxlr->params.cache_size == 0)
841 		return 0;
842 
843 	return a->mode;
844 }
845 
/*
 * Unnamed attribute group holding the base region attributes, with
 * visibility filtered by cxl_region_visible().
 */
static const struct attribute_group cxl_region_group = {
	.attrs = cxl_region_attrs,
	.is_visible = cxl_region_visible,
};
850 
851 static size_t show_targetN(struct cxl_region *cxlr, char *buf, int pos)
852 {
853 	struct cxl_region_params *p = &cxlr->params;
854 	struct cxl_endpoint_decoder *cxled;
855 	int rc;
856 
857 	ACQUIRE(rwsem_read_intr, rwsem)(&cxl_rwsem.region);
858 	if ((rc = ACQUIRE_ERR(rwsem_read_intr, &rwsem)))
859 		return rc;
860 
861 	if (pos >= p->interleave_ways) {
862 		dev_dbg(&cxlr->dev, "position %d out of range %d\n", pos,
863 			p->interleave_ways);
864 		return -ENXIO;
865 	}
866 
867 	cxled = p->targets[pos];
868 	if (!cxled)
869 		return sysfs_emit(buf, "\n");
870 	return sysfs_emit(buf, "%s\n", dev_name(&cxled->cxld.dev));
871 }
872 
873 static int check_commit_order(struct device *dev, void *data)
874 {
875 	struct cxl_decoder *cxld = to_cxl_decoder(dev);
876 
877 	/*
878 	 * if port->commit_end is not the only free decoder, then out of
879 	 * order shutdown has occurred, block further allocations until
880 	 * that is resolved
881 	 */
882 	if (((cxld->flags & CXL_DECODER_F_ENABLE) == 0))
883 		return -EBUSY;
884 	return 0;
885 }
886 
887 static int match_free_decoder(struct device *dev, const void *data)
888 {
889 	struct cxl_port *port = to_cxl_port(dev->parent);
890 	struct cxl_decoder *cxld;
891 	int rc;
892 
893 	if (!is_switch_decoder(dev))
894 		return 0;
895 
896 	cxld = to_cxl_decoder(dev);
897 
898 	if (cxld->id != port->commit_end + 1)
899 		return 0;
900 
901 	if (cxld->region) {
902 		dev_dbg(dev->parent,
903 			"next decoder to commit (%s) is already reserved (%s)\n",
904 			dev_name(dev), dev_name(&cxld->region->dev));
905 		return 0;
906 	}
907 
908 	rc = device_for_each_child_reverse_from(dev->parent, dev, NULL,
909 						check_commit_order);
910 	if (rc) {
911 		dev_dbg(dev->parent,
912 			"unable to allocate %s due to out of order shutdown\n",
913 			dev_name(dev));
914 		return 0;
915 	}
916 	return 1;
917 }
918 
919 static bool spa_maps_hpa(const struct cxl_region_params *p,
920 			 const struct range *range)
921 {
922 	if (!p->res)
923 		return false;
924 
925 	/*
926 	 * The extended linear cache region is constructed by a 1:1 ratio
927 	 * where the SPA maps equal amounts of DRAM and CXL HPA capacity with
928 	 * CXL decoders at the high end of the SPA range.
929 	 */
930 	return p->res->start + p->cache_size == range->start &&
931 		p->res->end == range->end;
932 }
933 
934 static int match_auto_decoder(struct device *dev, const void *data)
935 {
936 	const struct cxl_region_params *p = data;
937 	struct cxl_decoder *cxld;
938 	struct range *r;
939 
940 	if (!is_switch_decoder(dev))
941 		return 0;
942 
943 	cxld = to_cxl_decoder(dev);
944 	r = &cxld->hpa_range;
945 
946 	if (spa_maps_hpa(p, r))
947 		return 1;
948 
949 	return 0;
950 }
951 
952 /**
953  * cxl_port_pick_region_decoder() - assign or lookup a decoder for a region
954  * @port: a port in the ancestry of the endpoint implied by @cxled
955  * @cxled: endpoint decoder to be, or currently, mapped by @port
956  * @cxlr: region to establish, or validate, decode @port
957  *
958  * In the region creation path cxl_port_pick_region_decoder() is an
959  * allocator to find a free port. In the region assembly path, it is
960  * recalling the decoder that platform firmware picked for validation
961  * purposes.
962  *
963  * The result is recorded in a 'struct cxl_region_ref' in @port.
964  */
965 static struct cxl_decoder *
966 cxl_port_pick_region_decoder(struct cxl_port *port,
967 			     struct cxl_endpoint_decoder *cxled,
968 			     struct cxl_region *cxlr)
969 {
970 	struct device *dev;
971 
972 	if (port == cxled_to_port(cxled))
973 		return &cxled->cxld;
974 
975 	if (test_bit(CXL_REGION_F_AUTO, &cxlr->flags))
976 		dev = device_find_child(&port->dev, &cxlr->params,
977 					match_auto_decoder);
978 	else
979 		dev = device_find_child(&port->dev, NULL, match_free_decoder);
980 	if (!dev)
981 		return NULL;
982 	/*
983 	 * This decoder is pinned registered as long as the endpoint decoder is
984 	 * registered, and endpoint decoder unregistration holds the
985 	 * cxl_rwsem.region over unregister events, so no need to hold on to
986 	 * this extra reference.
987 	 */
988 	put_device(dev);
989 	return to_cxl_decoder(dev);
990 }
991 
992 static bool auto_order_ok(struct cxl_port *port, struct cxl_region *cxlr_iter,
993 			  struct cxl_decoder *cxld)
994 {
995 	struct cxl_region_ref *rr = cxl_rr_load(port, cxlr_iter);
996 	struct cxl_decoder *cxld_iter = rr->decoder;
997 
998 	/*
999 	 * Allow the out of order assembly of auto-discovered regions.
1000 	 * Per CXL Spec 3.1 8.2.4.20.12 software must commit decoders
1001 	 * in HPA order. Confirm that the decoder with the lesser HPA
1002 	 * starting address has the lesser id.
1003 	 */
1004 	dev_dbg(&cxld->dev, "check for HPA violation %s:%d < %s:%d\n",
1005 		dev_name(&cxld->dev), cxld->id,
1006 		dev_name(&cxld_iter->dev), cxld_iter->id);
1007 
1008 	if (cxld_iter->id > cxld->id)
1009 		return true;
1010 
1011 	return false;
1012 }
1013 
1014 static struct cxl_region_ref *
1015 alloc_region_ref(struct cxl_port *port, struct cxl_region *cxlr,
1016 		 struct cxl_endpoint_decoder *cxled,
1017 		 struct cxl_decoder *cxld)
1018 {
1019 	struct cxl_region_params *p = &cxlr->params;
1020 	struct cxl_region_ref *cxl_rr, *iter;
1021 	unsigned long index;
1022 	int rc;
1023 
1024 	xa_for_each(&port->regions, index, iter) {
1025 		struct cxl_region_params *ip = &iter->region->params;
1026 
1027 		if (!ip->res || ip->res->start < p->res->start)
1028 			continue;
1029 
1030 		if (test_bit(CXL_REGION_F_AUTO, &cxlr->flags)) {
1031 			if (auto_order_ok(port, iter->region, cxld))
1032 				continue;
1033 		}
1034 		dev_dbg(&cxlr->dev, "%s: HPA order violation %s:%pr vs %pr\n",
1035 			dev_name(&port->dev),
1036 			dev_name(&iter->region->dev), ip->res, p->res);
1037 
1038 		return ERR_PTR(-EBUSY);
1039 	}
1040 
1041 	cxl_rr = kzalloc_obj(*cxl_rr);
1042 	if (!cxl_rr)
1043 		return ERR_PTR(-ENOMEM);
1044 	cxl_rr->port = port;
1045 	cxl_rr->region = cxlr;
1046 	cxl_rr->nr_targets = 1;
1047 	xa_init(&cxl_rr->endpoints);
1048 
1049 	rc = xa_insert(&port->regions, (unsigned long)cxlr, cxl_rr, GFP_KERNEL);
1050 	if (rc) {
1051 		dev_dbg(&cxlr->dev,
1052 			"%s: failed to track region reference: %d\n",
1053 			dev_name(&port->dev), rc);
1054 		kfree(cxl_rr);
1055 		return ERR_PTR(rc);
1056 	}
1057 
1058 	return cxl_rr;
1059 }
1060 
1061 static void cxl_rr_free_decoder(struct cxl_region_ref *cxl_rr)
1062 {
1063 	struct cxl_region *cxlr = cxl_rr->region;
1064 	struct cxl_decoder *cxld = cxl_rr->decoder;
1065 
1066 	if (!cxld)
1067 		return;
1068 
1069 	dev_WARN_ONCE(&cxlr->dev, cxld->region != cxlr, "region mismatch\n");
1070 	if (cxld->region == cxlr) {
1071 		cxld->region = NULL;
1072 		put_device(&cxlr->dev);
1073 	}
1074 }
1075 
/*
 * Undo alloc_region_ref(): detach the decoder recorded in @cxl_rr, remove
 * the reference from its port's regions xarray, destroy the reference's
 * endpoints xarray and free the reference itself.
 */
static void free_region_ref(struct cxl_region_ref *cxl_rr)
{
	struct cxl_port *port = cxl_rr->port;
	struct cxl_region *cxlr = cxl_rr->region;

	cxl_rr_free_decoder(cxl_rr);
	/* port->regions is keyed by the region pointer value */
	xa_erase(&port->regions, (unsigned long)cxlr);
	xa_destroy(&cxl_rr->endpoints);
	kfree(cxl_rr);
}
1086 
1087 static int cxl_rr_ep_add(struct cxl_region_ref *cxl_rr,
1088 			 struct cxl_endpoint_decoder *cxled)
1089 {
1090 	int rc;
1091 	struct cxl_port *port = cxl_rr->port;
1092 	struct cxl_region *cxlr = cxl_rr->region;
1093 	struct cxl_decoder *cxld = cxl_rr->decoder;
1094 	struct cxl_ep *ep = cxl_ep_load(port, cxled_to_memdev(cxled));
1095 
1096 	if (ep) {
1097 		rc = xa_insert(&cxl_rr->endpoints, (unsigned long)cxled, ep,
1098 			       GFP_KERNEL);
1099 		if (rc)
1100 			return rc;
1101 	}
1102 	cxl_rr->nr_eps++;
1103 
1104 	if (!cxld->region) {
1105 		cxld->region = cxlr;
1106 
1107 		/*
1108 		 * Now that cxld->region is set the intermediate staging state
1109 		 * can be cleared.
1110 		 */
1111 		if (cxld == &cxled->cxld &&
1112 		    cxled->state == CXL_DECODER_STATE_AUTO_STAGED)
1113 			cxled->state = CXL_DECODER_STATE_AUTO;
1114 		get_device(&cxlr->dev);
1115 	}
1116 
1117 	return 0;
1118 }
1119 
1120 static int cxl_rr_assign_decoder(struct cxl_port *port, struct cxl_region *cxlr,
1121 				 struct cxl_endpoint_decoder *cxled,
1122 				 struct cxl_region_ref *cxl_rr,
1123 				 struct cxl_decoder *cxld)
1124 {
1125 	if (cxld->region) {
1126 		dev_dbg(&cxlr->dev, "%s: %s already attached to %s\n",
1127 			dev_name(&port->dev), dev_name(&cxld->dev),
1128 			dev_name(&cxld->region->dev));
1129 		return -EBUSY;
1130 	}
1131 
1132 	/*
1133 	 * Endpoints should already match the region type, but backstop that
1134 	 * assumption with an assertion. Switch-decoders change mapping-type
1135 	 * based on what is mapped when they are assigned to a region.
1136 	 */
1137 	dev_WARN_ONCE(&cxlr->dev,
1138 		      port == cxled_to_port(cxled) &&
1139 			      cxld->target_type != cxlr->type,
1140 		      "%s:%s mismatch decoder type %d -> %d\n",
1141 		      dev_name(&cxled_to_memdev(cxled)->dev),
1142 		      dev_name(&cxld->dev), cxld->target_type, cxlr->type);
1143 	cxld->target_type = cxlr->type;
1144 	cxl_rr->decoder = cxld;
1145 	return 0;
1146 }
1147 
1148 static void cxl_region_setup_flags(struct cxl_region *cxlr,
1149 				   struct cxl_decoder *cxld)
1150 {
1151 	if (is_endpoint_decoder(&cxld->dev)) {
1152 		struct cxl_endpoint_decoder *cxled = to_cxl_endpoint_decoder(&cxld->dev);
1153 		struct cxl_memdev *cxlmd = cxled_to_memdev(cxled);
1154 
1155 		/*
1156 		 * When a region's memdevs specify an @attach method the attach
1157 		 * provider is responsible for dispositioning the region for
1158 		 * both probe and userspace management
1159 		 */
1160 		if (cxlmd->attach)
1161 			set_bit(CXL_REGION_F_LOCK, &cxlr->flags);
1162 	}
1163 
1164 	if (cxld->flags & CXL_DECODER_F_LOCK) {
1165 		set_bit(CXL_REGION_F_LOCK, &cxlr->flags);
1166 		clear_bit(CXL_REGION_F_NEEDS_RESET, &cxlr->flags);
1167 	}
1168 
1169 	if (cxld->flags & CXL_DECODER_F_NORMALIZED_ADDRESSING)
1170 		set_bit(CXL_REGION_F_NORMALIZED_ADDRESSING, &cxlr->flags);
1171 }
1172 
/**
 * cxl_port_attach_region() - track a region's interest in a port by endpoint
 * @port: port to add a new region reference 'struct cxl_region_ref'
 * @cxlr: region to attach to @port
 * @cxled: endpoint decoder used to create or further pin a region reference
 * @pos: interleave position of @cxled in @cxlr (only used in debug output
 *	 within this function)
 *
 * The attach event is an opportunity to validate CXL decode setup
 * constraints and record metadata needed for programming HDM decoders,
 * in particular decoder target lists.
 *
 * The steps are:
 *
 * - validate that there are no other regions with a higher HPA already
 *   associated with @port
 * - establish a region reference if one is not already present
 *
 *   - additionally allocate a decoder instance that will host @cxlr on
 *     @port
 *
 * - pin the region reference by the endpoint
 * - account for how many entries in @port's target list are needed to
 *   cover all of the added endpoints.
 *
 * Context: the caller must hold cxl_rwsem.region for write (asserted via
 *	    lockdep).
 *
 * Return: 0 on success, -EBUSY when no decoder is available, -ENXIO when
 *	   the accumulated target count exceeds what the switch decoder can
 *	   hold, or the negative errno reported by alloc_region_ref(),
 *	   cxl_rr_assign_decoder() or cxl_rr_ep_add().
 */
static int cxl_port_attach_region(struct cxl_port *port,
				  struct cxl_region *cxlr,
				  struct cxl_endpoint_decoder *cxled, int pos)
{
	struct cxl_memdev *cxlmd = cxled_to_memdev(cxled);
	struct cxl_ep *ep = cxl_ep_load(port, cxlmd);
	struct cxl_region_ref *cxl_rr;
	/* Remembers whether nr_targets must be rolled back on failure */
	bool nr_targets_inc = false;
	struct cxl_decoder *cxld;
	unsigned long index;
	int rc = -EBUSY;

	lockdep_assert_held_write(&cxl_rwsem.region);

	cxl_rr = cxl_rr_load(port, cxlr);
	if (cxl_rr) {
		struct cxl_ep *ep_iter;
		int found = 0;

		/*
		 * Walk the existing endpoints that have been attached to
		 * @cxlr at @port and see if they share the same 'next' port
		 * in the downstream direction. I.e. endpoints that share common
		 * upstream switch.
		 */
		xa_for_each(&cxl_rr->endpoints, index, ep_iter) {
			if (ep_iter == ep)
				continue;
			if (ep_iter->next == ep->next) {
				found++;
				break;
			}
		}

		/*
		 * New target port, or @port is an endpoint port that always
		 * accounts its own local decode as a target.
		 */
		if (!found || !ep->next) {
			cxl_rr->nr_targets++;
			nr_targets_inc = true;
		}
	} else {
		/* First endpoint of @cxlr at @port: set up a new reference */
		cxld = cxl_port_pick_region_decoder(port, cxled, cxlr);
		if (!cxld) {
			dev_dbg(&cxlr->dev, "%s: no decoder available\n",
				dev_name(&port->dev));
			return -EBUSY;
		}

		cxl_rr = alloc_region_ref(port, cxlr, cxled, cxld);
		if (IS_ERR(cxl_rr)) {
			dev_dbg(&cxlr->dev,
				"%s: failed to allocate region reference\n",
				dev_name(&port->dev));
			return PTR_ERR(cxl_rr);
		}
		/*
		 * Flag the target count for rollback so that the error path
		 * below decrements it before the reference is released.
		 */
		nr_targets_inc = true;

		rc = cxl_rr_assign_decoder(port, cxlr, cxled, cxl_rr, cxld);
		if (rc)
			goto out_erase;
	}
	cxld = cxl_rr->decoder;

	/*
	 * the number of targets should not exceed the target_count
	 * of the decoder
	 */
	if (is_switch_decoder(&cxld->dev)) {
		struct cxl_switch_decoder *cxlsd;

		cxlsd = to_cxl_switch_decoder(&cxld->dev);
		if (cxl_rr->nr_targets > cxlsd->nr_targets) {
			dev_dbg(&cxlr->dev,
				"%s:%s %s add: %s:%s @ %d overflows targets: %d\n",
				dev_name(port->uport_dev), dev_name(&port->dev),
				dev_name(&cxld->dev), dev_name(&cxlmd->dev),
				dev_name(&cxled->cxld.dev), pos,
				cxlsd->nr_targets);
			rc = -ENXIO;
			goto out_erase;
		}
	}

	cxl_region_setup_flags(cxlr, cxld);

	rc = cxl_rr_ep_add(cxl_rr, cxled);
	if (rc) {
		dev_dbg(&cxlr->dev,
			"%s: failed to track endpoint %s:%s reference\n",
			dev_name(&port->dev), dev_name(&cxlmd->dev),
			dev_name(&cxld->dev));
		goto out_erase;
	}

	dev_dbg(&cxlr->dev,
		"%s:%s %s add: %s:%s @ %d next: %s nr_eps: %d nr_targets: %d\n",
		dev_name(port->uport_dev), dev_name(&port->dev),
		dev_name(&cxld->dev), dev_name(&cxlmd->dev),
		dev_name(&cxled->cxld.dev), pos,
		ep ? ep->next ? dev_name(ep->next->uport_dev) :
				      dev_name(&cxlmd->dev) :
			   "none",
		cxl_rr->nr_eps, cxl_rr->nr_targets);

	return 0;
out_erase:
	/*
	 * Undo this call's target accounting, and release the region
	 * reference entirely when no endpoint is pinning it.
	 */
	if (nr_targets_inc)
		cxl_rr->nr_targets--;
	if (cxl_rr->nr_eps == 0)
		free_region_ref(cxl_rr);
	return rc;
}
1311 
1312 static void cxl_port_detach_region(struct cxl_port *port,
1313 				   struct cxl_region *cxlr,
1314 				   struct cxl_endpoint_decoder *cxled)
1315 {
1316 	struct cxl_region_ref *cxl_rr;
1317 	struct cxl_ep *ep = NULL;
1318 
1319 	lockdep_assert_held_write(&cxl_rwsem.region);
1320 
1321 	cxl_rr = cxl_rr_load(port, cxlr);
1322 	if (!cxl_rr)
1323 		return;
1324 
1325 	/*
1326 	 * Endpoint ports do not carry cxl_ep references, and they
1327 	 * never target more than one endpoint by definition
1328 	 */
1329 	if (cxl_rr->decoder == &cxled->cxld)
1330 		cxl_rr->nr_eps--;
1331 	else
1332 		ep = xa_erase(&cxl_rr->endpoints, (unsigned long)cxled);
1333 	if (ep) {
1334 		struct cxl_ep *ep_iter;
1335 		unsigned long index;
1336 		int found = 0;
1337 
1338 		cxl_rr->nr_eps--;
1339 		xa_for_each(&cxl_rr->endpoints, index, ep_iter) {
1340 			if (ep_iter->next == ep->next) {
1341 				found++;
1342 				break;
1343 			}
1344 		}
1345 		if (!found)
1346 			cxl_rr->nr_targets--;
1347 	}
1348 
1349 	if (cxl_rr->nr_eps == 0)
1350 		free_region_ref(cxl_rr);
1351 }
1352 
1353 static int check_last_peer(struct cxl_endpoint_decoder *cxled,
1354 			   struct cxl_ep *ep, struct cxl_region_ref *cxl_rr,
1355 			   int distance)
1356 {
1357 	struct cxl_memdev *cxlmd = cxled_to_memdev(cxled);
1358 	struct cxl_region *cxlr = cxl_rr->region;
1359 	struct cxl_region_params *p = &cxlr->params;
1360 	struct cxl_endpoint_decoder *cxled_peer;
1361 	struct cxl_port *port = cxl_rr->port;
1362 	struct cxl_memdev *cxlmd_peer;
1363 	struct cxl_ep *ep_peer;
1364 	int pos = cxled->pos;
1365 
1366 	/*
1367 	 * If this position wants to share a dport with the last endpoint mapped
1368 	 * then that endpoint, at index 'position - distance', must also be
1369 	 * mapped by this dport.
1370 	 */
1371 	if (pos < distance) {
1372 		dev_dbg(&cxlr->dev, "%s:%s: cannot host %s:%s at %d\n",
1373 			dev_name(port->uport_dev), dev_name(&port->dev),
1374 			dev_name(&cxlmd->dev), dev_name(&cxled->cxld.dev), pos);
1375 		return -ENXIO;
1376 	}
1377 	cxled_peer = p->targets[pos - distance];
1378 	cxlmd_peer = cxled_to_memdev(cxled_peer);
1379 	ep_peer = cxl_ep_load(port, cxlmd_peer);
1380 	if (ep->dport != ep_peer->dport) {
1381 		dev_dbg(&cxlr->dev,
1382 			"%s:%s: %s:%s pos %d mismatched peer %s:%s\n",
1383 			dev_name(port->uport_dev), dev_name(&port->dev),
1384 			dev_name(&cxlmd->dev), dev_name(&cxled->cxld.dev), pos,
1385 			dev_name(&cxlmd_peer->dev),
1386 			dev_name(&cxled_peer->cxld.dev));
1387 		return -ENXIO;
1388 	}
1389 
1390 	return 0;
1391 }
1392 
1393 static int check_interleave_cap(struct cxl_decoder *cxld, int iw, int ig)
1394 {
1395 	struct cxl_port *port = to_cxl_port(cxld->dev.parent);
1396 	struct cxl_hdm *cxlhdm = dev_get_drvdata(&port->dev);
1397 	unsigned int interleave_mask;
1398 	u8 eiw;
1399 	u16 eig;
1400 	int high_pos, low_pos;
1401 
1402 	if (!test_bit(iw, &cxlhdm->iw_cap_mask))
1403 		return -ENXIO;
1404 	/*
1405 	 * Per CXL specification r3.1(8.2.4.20.13 Decoder Protection),
1406 	 * if eiw < 8:
1407 	 *   DPAOFFSET[51: eig + 8] = HPAOFFSET[51: eig + 8 + eiw]
1408 	 *   DPAOFFSET[eig + 7: 0]  = HPAOFFSET[eig + 7: 0]
1409 	 *
1410 	 *   when the eiw is 0, all the bits of HPAOFFSET[51: 0] are used, the
1411 	 *   interleave bits are none.
1412 	 *
1413 	 * if eiw >= 8:
1414 	 *   DPAOFFSET[51: eig + 8] = HPAOFFSET[51: eig + eiw] / 3
1415 	 *   DPAOFFSET[eig + 7: 0]  = HPAOFFSET[eig + 7: 0]
1416 	 *
1417 	 *   when the eiw is 8, all the bits of HPAOFFSET[51: 0] are used, the
1418 	 *   interleave bits are none.
1419 	 */
1420 	ways_to_eiw(iw, &eiw);
1421 	if (eiw == 0 || eiw == 8)
1422 		return 0;
1423 
1424 	granularity_to_eig(ig, &eig);
1425 	if (eiw > 8)
1426 		high_pos = eiw + eig - 1;
1427 	else
1428 		high_pos = eiw + eig + 7;
1429 	low_pos = eig + 8;
1430 	interleave_mask = GENMASK(high_pos, low_pos);
1431 	if (interleave_mask & ~cxlhdm->interleave_mask)
1432 		return -ENXIO;
1433 
1434 	return 0;
1435 }
1436 
/*
 * Program, or for auto-discovered regions validate, the switch decoder at
 * @port that hosts @cxlr so that it routes to the dport leading to @cxled.
 *
 * On the first call for a given port / region pair (no targets set yet) the
 * decoder's interleave ways and granularity are derived from the parent
 * level and either compared against the decoder's current settings
 * (CXL_REGION_F_AUTO) or written into the decoder together with the
 * region's HPA range. On subsequent calls only the target list is
 * consulted: if the dport is already listed, the position of @cxled is
 * checked against its peer via check_last_peer(), otherwise the dport is
 * appended.
 *
 * Returns 0 on success or a negative errno describing the violated
 * constraint.
 */
static int cxl_port_setup_targets(struct cxl_port *port,
				  struct cxl_region *cxlr,
				  struct cxl_endpoint_decoder *cxled)
{
	struct cxl_root_decoder *cxlrd = cxlr->cxlrd;
	int parent_iw, parent_ig, ig, iw, rc, pos = cxled->pos;
	struct cxl_port *parent_port = to_cxl_port(port->dev.parent);
	struct cxl_region_ref *cxl_rr = cxl_rr_load(port, cxlr);
	struct cxl_memdev *cxlmd = cxled_to_memdev(cxled);
	struct cxl_ep *ep = cxl_ep_load(port, cxlmd);
	struct cxl_region_params *p = &cxlr->params;
	struct cxl_decoder *cxld = cxl_rr->decoder;
	struct cxl_switch_decoder *cxlsd;
	struct cxl_port *iter = port;
	u16 eig, peig;
	u8 eiw, peiw;

	/*
	 * While root level decoders support x3, x6, x12, switch level
	 * decoders only support powers of 2 up to x16.
	 */
	if (!is_power_of_2(cxl_rr->nr_targets)) {
		dev_dbg(&cxlr->dev, "%s:%s: invalid target count %d\n",
			dev_name(port->uport_dev), dev_name(&port->dev),
			cxl_rr->nr_targets);
		return -EINVAL;
	}

	cxlsd = to_cxl_switch_decoder(&cxld->dev);
	/*
	 * A previous call already established the decoder's interleave
	 * settings, so only the target list needs attention.
	 */
	if (cxl_rr->nr_targets_set) {
		int i, distance = 1;
		struct cxl_region_ref *cxl_rr_iter;

		/*
		 * The "distance" between peer downstream ports represents which
		 * endpoint positions in the region interleave a given port can
		 * host.
		 *
		 * For example, at the root of a hierarchy the distance is
		 * always 1 as every index targets a different host-bridge. At
		 * each subsequent switch level those ports map every Nth region
		 * position where N is the width of the switch == distance.
		 */
		do {
			cxl_rr_iter = cxl_rr_load(iter, cxlr);
			distance *= cxl_rr_iter->nr_targets;
			iter = to_cxl_port(iter->dev.parent);
		} while (!is_cxl_root(iter));
		distance *= cxlrd->cxlsd.cxld.interleave_ways;

		/* dport already a target: just validate the peer position */
		for (i = 0; i < cxl_rr->nr_targets_set; i++)
			if (ep->dport == cxlsd->target[i]) {
				rc = check_last_peer(cxled, ep, cxl_rr,
						     distance);
				if (rc)
					return rc;
				goto out_target_set;
			}
		goto add_target;
	}

	if (is_cxl_root(parent_port)) {
		/*
		 * Root decoder IG is always set to value in CFMWS which
		 * may be different than this region's IG.  We can use the
		 * region's IG here since interleave_granularity_store()
		 * does not allow interleaved host-bridges with
		 * root IG != region IG.
		 */
		parent_ig = p->interleave_granularity;
		parent_iw = cxlrd->cxlsd.cxld.interleave_ways;
		/*
		 * For purposes of address bit routing, use power-of-2 math for
		 * switch ports.
		 */
		if (!is_power_of_2(parent_iw))
			parent_iw /= 3;
	} else {
		struct cxl_region_ref *parent_rr;
		struct cxl_decoder *parent_cxld;

		/* Inherit from the decoder hosting @cxlr one level up */
		parent_rr = cxl_rr_load(parent_port, cxlr);
		parent_cxld = parent_rr->decoder;
		parent_ig = parent_cxld->interleave_granularity;
		parent_iw = parent_cxld->interleave_ways;
	}

	rc = granularity_to_eig(parent_ig, &peig);
	if (rc) {
		dev_dbg(&cxlr->dev, "%s:%s: invalid parent granularity: %d\n",
			dev_name(parent_port->uport_dev),
			dev_name(&parent_port->dev), parent_ig);
		return rc;
	}

	rc = ways_to_eiw(parent_iw, &peiw);
	if (rc) {
		dev_dbg(&cxlr->dev, "%s:%s: invalid parent interleave: %d\n",
			dev_name(parent_port->uport_dev),
			dev_name(&parent_port->dev), parent_iw);
		return rc;
	}

	/* This port interleaves across as many targets as were attached */
	iw = cxl_rr->nr_targets;
	rc = ways_to_eiw(iw, &eiw);
	if (rc) {
		dev_dbg(&cxlr->dev, "%s:%s: invalid port interleave: %d\n",
			dev_name(port->uport_dev), dev_name(&port->dev), iw);
		return rc;
	}

	/*
	 * Interleave granularity is a multiple of @parent_port granularity.
	 * Multiplier is the parent port interleave ways.
	 */
	rc = granularity_to_eig(parent_ig * parent_iw, &eig);
	if (rc) {
		dev_dbg(&cxlr->dev,
			"%s: invalid granularity calculation (%d * %d)\n",
			dev_name(&parent_port->dev), parent_ig, parent_iw);
		return rc;
	}

	rc = eig_to_granularity(eig, &ig);
	if (rc) {
		dev_dbg(&cxlr->dev, "%s:%s: invalid interleave: %d\n",
			dev_name(port->uport_dev), dev_name(&port->dev),
			256 << eig);
		return rc;
	}

	if (iw > 8 || iw > cxlsd->nr_targets) {
		dev_dbg(&cxlr->dev,
			"%s:%s:%s: ways: %d overflows targets: %d\n",
			dev_name(port->uport_dev), dev_name(&port->dev),
			dev_name(&cxld->dev), iw, cxlsd->nr_targets);
		return -ENXIO;
	}

	if (test_bit(CXL_REGION_F_AUTO, &cxlr->flags)) {
		/*
		 * Auto-discovered region: the decoder's existing settings
		 * are only compared against the computed values, nothing is
		 * written.
		 */
		if (cxld->interleave_ways != iw ||
		    (iw > 1 && cxld->interleave_granularity != ig) ||
		    !spa_maps_hpa(p, &cxld->hpa_range) ||
		    ((cxld->flags & CXL_DECODER_F_ENABLE) == 0)) {
			dev_err(&cxlr->dev,
				"%s:%s %s expected iw: %d ig: %d %pr\n",
				dev_name(port->uport_dev), dev_name(&port->dev),
				__func__, iw, ig, p->res);
			dev_err(&cxlr->dev,
				"%s:%s %s got iw: %d ig: %d state: %s %#llx:%#llx\n",
				dev_name(port->uport_dev), dev_name(&port->dev),
				__func__, cxld->interleave_ways,
				cxld->interleave_granularity,
				str_enabled_disabled(cxld->flags & CXL_DECODER_F_ENABLE),
				cxld->hpa_range.start, cxld->hpa_range.end);
			return -ENXIO;
		}
	} else {
		rc = check_interleave_cap(cxld, iw, ig);
		if (rc) {
			dev_dbg(&cxlr->dev,
				"%s:%s iw: %d ig: %d is not supported\n",
				dev_name(port->uport_dev),
				dev_name(&port->dev), iw, ig);
			return rc;
		}

		/* Record the computed settings and the region's HPA range */
		cxld->interleave_ways = iw;
		cxld->interleave_granularity = ig;
		cxld->hpa_range = (struct range) {
			.start = p->res->start,
			.end = p->res->end,
		};
	}
	dev_dbg(&cxlr->dev, "%s:%s iw: %d ig: %d\n", dev_name(port->uport_dev),
		dev_name(&port->dev), iw, ig);
add_target:
	if (cxl_rr->nr_targets_set == cxl_rr->nr_targets) {
		dev_dbg(&cxlr->dev,
			"%s:%s: targets full trying to add %s:%s at %d\n",
			dev_name(port->uport_dev), dev_name(&port->dev),
			dev_name(&cxlmd->dev), dev_name(&cxled->cxld.dev), pos);
		return -ENXIO;
	}
	if (test_bit(CXL_REGION_F_AUTO, &cxlr->flags)) {
		/* The next target slot must already hold this dport */
		if (cxlsd->target[cxl_rr->nr_targets_set] != ep->dport) {
			dev_dbg(&cxlr->dev, "%s:%s: %s expected %s at %d\n",
				dev_name(port->uport_dev), dev_name(&port->dev),
				dev_name(&cxlsd->cxld.dev),
				dev_name(ep->dport->dport_dev),
				cxl_rr->nr_targets_set);
			return -ENXIO;
		}
	} else {
		cxlsd->target[cxl_rr->nr_targets_set] = ep->dport;
		cxlsd->cxld.target_map[cxl_rr->nr_targets_set] = ep->dport->port_id;
	}
	cxl_rr->nr_targets_set++;
out_target_set:
	dev_dbg(&cxlr->dev, "%s:%s target[%d] = %s for %s:%s @ %d\n",
		dev_name(port->uport_dev), dev_name(&port->dev),
		cxl_rr->nr_targets_set - 1, dev_name(ep->dport->dport_dev),
		dev_name(&cxlmd->dev), dev_name(&cxled->cxld.dev), pos);

	return 0;
}
1643 
1644 static void cxl_port_reset_targets(struct cxl_port *port,
1645 				   struct cxl_region *cxlr)
1646 {
1647 	struct cxl_region_ref *cxl_rr = cxl_rr_load(port, cxlr);
1648 	struct cxl_decoder *cxld;
1649 
1650 	/*
1651 	 * After the last endpoint has been detached the entire cxl_rr may now
1652 	 * be gone.
1653 	 */
1654 	if (!cxl_rr)
1655 		return;
1656 	cxl_rr->nr_targets_set = 0;
1657 
1658 	cxld = cxl_rr->decoder;
1659 	cxld->hpa_range = (struct range) {
1660 		.start = 0,
1661 		.end = -1,
1662 	};
1663 }
1664 
1665 static void cxl_region_teardown_targets(struct cxl_region *cxlr)
1666 {
1667 	struct cxl_region_params *p = &cxlr->params;
1668 	struct cxl_endpoint_decoder *cxled;
1669 	struct cxl_dev_state *cxlds;
1670 	struct cxl_memdev *cxlmd;
1671 	struct cxl_port *iter;
1672 	struct cxl_ep *ep;
1673 	int i;
1674 
1675 	/*
1676 	 * In the auto-discovery case skip automatic teardown since the
1677 	 * address space is already active
1678 	 */
1679 	if (test_bit(CXL_REGION_F_AUTO, &cxlr->flags))
1680 		return;
1681 
1682 	for (i = 0; i < p->nr_targets; i++) {
1683 		cxled = p->targets[i];
1684 		cxlmd = cxled_to_memdev(cxled);
1685 		cxlds = cxlmd->cxlds;
1686 
1687 		if (cxlds->rcd)
1688 			continue;
1689 
1690 		iter = cxled_to_port(cxled);
1691 		while (!is_cxl_root(to_cxl_port(iter->dev.parent)))
1692 			iter = to_cxl_port(iter->dev.parent);
1693 
1694 		for (ep = cxl_ep_load(iter, cxlmd); iter;
1695 		     iter = ep->next, ep = cxl_ep_load(iter, cxlmd))
1696 			cxl_port_reset_targets(iter, cxlr);
1697 	}
1698 }
1699 
1700 static int cxl_region_setup_targets(struct cxl_region *cxlr)
1701 {
1702 	struct cxl_region_params *p = &cxlr->params;
1703 	struct cxl_endpoint_decoder *cxled;
1704 	struct cxl_dev_state *cxlds;
1705 	int i, rc, rch = 0, vh = 0;
1706 	struct cxl_memdev *cxlmd;
1707 	struct cxl_port *iter;
1708 	struct cxl_ep *ep;
1709 
1710 	for (i = 0; i < p->nr_targets; i++) {
1711 		cxled = p->targets[i];
1712 		cxlmd = cxled_to_memdev(cxled);
1713 		cxlds = cxlmd->cxlds;
1714 
1715 		/* validate that all targets agree on topology */
1716 		if (!cxlds->rcd) {
1717 			vh++;
1718 		} else {
1719 			rch++;
1720 			continue;
1721 		}
1722 
1723 		iter = cxled_to_port(cxled);
1724 		while (!is_cxl_root(to_cxl_port(iter->dev.parent)))
1725 			iter = to_cxl_port(iter->dev.parent);
1726 
1727 		/*
1728 		 * Descend the topology tree programming / validating
1729 		 * targets while looking for conflicts.
1730 		 */
1731 		for (ep = cxl_ep_load(iter, cxlmd); iter;
1732 		     iter = ep->next, ep = cxl_ep_load(iter, cxlmd)) {
1733 			rc = cxl_port_setup_targets(iter, cxlr, cxled);
1734 			if (rc) {
1735 				cxl_region_teardown_targets(cxlr);
1736 				return rc;
1737 			}
1738 		}
1739 	}
1740 
1741 	if (rch && vh) {
1742 		dev_err(&cxlr->dev, "mismatched CXL topologies detected\n");
1743 		cxl_region_teardown_targets(cxlr);
1744 		return -ENXIO;
1745 	}
1746 
1747 	return 0;
1748 }
1749 
1750 static int cxl_region_validate_position(struct cxl_region *cxlr,
1751 					struct cxl_endpoint_decoder *cxled,
1752 					int pos)
1753 {
1754 	struct cxl_memdev *cxlmd = cxled_to_memdev(cxled);
1755 	struct cxl_region_params *p = &cxlr->params;
1756 	int i;
1757 
1758 	if (pos < 0 || pos >= p->interleave_ways) {
1759 		dev_dbg(&cxlr->dev, "position %d out of range %d\n", pos,
1760 			p->interleave_ways);
1761 		return -ENXIO;
1762 	}
1763 
1764 	if (p->targets[pos] == cxled)
1765 		return 0;
1766 
1767 	if (p->targets[pos]) {
1768 		struct cxl_endpoint_decoder *cxled_target = p->targets[pos];
1769 		struct cxl_memdev *cxlmd_target = cxled_to_memdev(cxled_target);
1770 
1771 		dev_dbg(&cxlr->dev, "position %d already assigned to %s:%s\n",
1772 			pos, dev_name(&cxlmd_target->dev),
1773 			dev_name(&cxled_target->cxld.dev));
1774 		return -EBUSY;
1775 	}
1776 
1777 	for (i = 0; i < p->interleave_ways; i++) {
1778 		struct cxl_endpoint_decoder *cxled_target;
1779 		struct cxl_memdev *cxlmd_target;
1780 
1781 		cxled_target = p->targets[i];
1782 		if (!cxled_target)
1783 			continue;
1784 
1785 		cxlmd_target = cxled_to_memdev(cxled_target);
1786 		if (cxlmd_target == cxlmd) {
1787 			dev_dbg(&cxlr->dev,
1788 				"%s already specified at position %d via: %s\n",
1789 				dev_name(&cxlmd->dev), pos,
1790 				dev_name(&cxled_target->cxld.dev));
1791 			return -EBUSY;
1792 		}
1793 	}
1794 
1795 	return 0;
1796 }
1797 
1798 static int cxl_region_attach_position(struct cxl_region *cxlr,
1799 				      struct cxl_endpoint_decoder *cxled,
1800 				      const struct cxl_dport *dport, int pos)
1801 {
1802 	struct cxl_root_decoder *cxlrd = cxlr->cxlrd;
1803 	struct cxl_memdev *cxlmd = cxled_to_memdev(cxled);
1804 	struct cxl_switch_decoder *cxlsd = &cxlrd->cxlsd;
1805 	struct cxl_decoder *cxld = &cxlsd->cxld;
1806 	int iw = cxld->interleave_ways;
1807 	struct cxl_port *iter;
1808 	int rc;
1809 
1810 	if (dport != cxlrd->cxlsd.target[pos % iw]) {
1811 		dev_dbg(&cxlr->dev, "%s:%s invalid target position for %s\n",
1812 			dev_name(&cxlmd->dev), dev_name(&cxled->cxld.dev),
1813 			dev_name(&cxlrd->cxlsd.cxld.dev));
1814 		return -ENXIO;
1815 	}
1816 
1817 	for (iter = cxled_to_port(cxled); !is_cxl_root(iter);
1818 	     iter = to_cxl_port(iter->dev.parent)) {
1819 		rc = cxl_port_attach_region(iter, cxlr, cxled, pos);
1820 		if (rc)
1821 			goto err;
1822 	}
1823 
1824 	return 0;
1825 
1826 err:
1827 	for (iter = cxled_to_port(cxled); !is_cxl_root(iter);
1828 	     iter = to_cxl_port(iter->dev.parent))
1829 		cxl_port_detach_region(iter, cxlr, cxled);
1830 	return rc;
1831 }
1832 
1833 static int cxl_region_attach_auto(struct cxl_region *cxlr,
1834 				  struct cxl_endpoint_decoder *cxled, int pos)
1835 {
1836 	struct cxl_region_params *p = &cxlr->params;
1837 
1838 	if (cxled->state != CXL_DECODER_STATE_AUTO) {
1839 		dev_err(&cxlr->dev,
1840 			"%s: unable to add decoder to autodetected region\n",
1841 			dev_name(&cxled->cxld.dev));
1842 		return -EINVAL;
1843 	}
1844 
1845 	if (pos >= 0) {
1846 		dev_dbg(&cxlr->dev, "%s: expected auto position, not %d\n",
1847 			dev_name(&cxled->cxld.dev), pos);
1848 		return -EINVAL;
1849 	}
1850 
1851 	if (p->nr_targets >= p->interleave_ways) {
1852 		dev_err(&cxlr->dev, "%s: no more target slots available\n",
1853 			dev_name(&cxled->cxld.dev));
1854 		return -ENXIO;
1855 	}
1856 
1857 	/*
1858 	 * Temporarily record the endpoint decoder into the target array. Yes,
1859 	 * this means that userspace can view devices in the wrong position
1860 	 * before the region activates, and must be careful to understand when
1861 	 * it might be racing region autodiscovery.
1862 	 *
1863 	 * The endpoint decoder will be recorded into the first free slot of
1864 	 * the target array.
1865 	 */
1866 	for (pos = 0; pos < p->interleave_ways; pos++) {
1867 		if (!p->targets[pos])
1868 			break;
1869 	}
1870 
1871 	if (pos == p->interleave_ways) {
1872 		dev_err(&cxlr->dev, "%s: unable to find a free target slot\n",
1873 			dev_name(&cxled->cxld.dev));
1874 		return -ENXIO;
1875 	}
1876 
1877 	p->targets[pos] = cxled;
1878 	cxled->pos = pos;
1879 	cxled->state = CXL_DECODER_STATE_AUTO_STAGED;
1880 	p->nr_targets++;
1881 
1882 	return 0;
1883 }
1884 
1885 static int cmp_interleave_pos(const void *a, const void *b)
1886 {
1887 	struct cxl_endpoint_decoder *cxled_a = *(typeof(cxled_a) *)a;
1888 	struct cxl_endpoint_decoder *cxled_b = *(typeof(cxled_b) *)b;
1889 
1890 	return cxled_a->pos - cxled_b->pos;
1891 }
1892 
1893 static int match_switch_decoder_by_range(struct device *dev,
1894 					 const void *data)
1895 {
1896 	struct cxl_switch_decoder *cxlsd;
1897 	const struct range *r1, *r2 = data;
1898 
1899 
1900 	if (!is_switch_decoder(dev))
1901 		return 0;
1902 
1903 	cxlsd = to_cxl_switch_decoder(dev);
1904 	r1 = &cxlsd->cxld.hpa_range;
1905 
1906 	if (is_root_decoder(dev))
1907 		return range_contains(r1, r2);
1908 	return (r1->start == r2->start && r1->end == r2->end);
1909 }
1910 
1911 static int find_pos_and_ways(struct cxl_port *port, struct range *range,
1912 			     int *pos, int *ways)
1913 {
1914 	struct cxl_switch_decoder *cxlsd;
1915 	struct cxl_port *parent;
1916 	struct device *dev;
1917 	int rc = -ENXIO;
1918 
1919 	parent = parent_port_of(port);
1920 	if (!parent)
1921 		return rc;
1922 
1923 	dev = device_find_child(&parent->dev, range,
1924 				match_switch_decoder_by_range);
1925 	if (!dev) {
1926 		dev_err(port->uport_dev,
1927 			"failed to find decoder mapping %#llx-%#llx\n",
1928 			range->start, range->end);
1929 		return rc;
1930 	}
1931 	cxlsd = to_cxl_switch_decoder(dev);
1932 	*ways = cxlsd->cxld.interleave_ways;
1933 
1934 	for (int i = 0; i < *ways; i++) {
1935 		if (cxlsd->target[i] == port->parent_dport) {
1936 			*pos = i;
1937 			rc = 0;
1938 			break;
1939 		}
1940 	}
1941 	put_device(dev);
1942 
1943 	if (rc)
1944 		dev_err(port->uport_dev,
1945 			"failed to find %s:%s in target list of %s\n",
1946 			dev_name(&port->dev),
1947 			dev_name(port->parent_dport->dport_dev),
1948 			dev_name(&cxlsd->cxld.dev));
1949 
1950 	return rc;
1951 }
1952 
1953 /**
1954  * cxl_calc_interleave_pos() - calculate an endpoint position in a region
1955  * @cxled: endpoint decoder member of given region
1956  * @hpa_range: translated HPA range of the endpoint
1957  *
1958  * The endpoint position is calculated by traversing the topology from
1959  * the endpoint to the root decoder and iteratively applying this
1960  * calculation:
1961  *
1962  *    position = position * parent_ways + parent_pos;
1963  *
1964  * ...where @position is inferred from switch and root decoder target lists.
1965  *
1966  * Return: position >= 0 on success
1967  *	   -ENXIO on failure
1968  */
1969 static int cxl_calc_interleave_pos(struct cxl_endpoint_decoder *cxled,
1970 				   struct range *hpa_range)
1971 {
1972 	struct cxl_port *iter, *port = cxled_to_port(cxled);
1973 	struct cxl_memdev *cxlmd = cxled_to_memdev(cxled);
1974 	int parent_ways = 0, parent_pos = 0, pos = 0;
1975 	int rc;
1976 
1977 	/*
1978 	 * Example: the expected interleave order of the 4-way region shown
1979 	 * below is: mem0, mem2, mem1, mem3
1980 	 *
1981 	 *		  root_port
1982 	 *                 /      \
1983 	 *      host_bridge_0    host_bridge_1
1984 	 *        |    |           |    |
1985 	 *       mem0 mem1        mem2 mem3
1986 	 *
1987 	 * In the example the calculator will iterate twice. The first iteration
1988 	 * uses the mem position in the host-bridge and the ways of the host-
1989 	 * bridge to generate the first, or local, position. The second
1990 	 * iteration uses the host-bridge position in the root_port and the ways
1991 	 * of the root_port to refine the position.
1992 	 *
1993 	 * A trace of the calculation per endpoint looks like this:
1994 	 * mem0: pos = 0 * 2 + 0    mem2: pos = 0 * 2 + 0
1995 	 *       pos = 0 * 2 + 0          pos = 0 * 2 + 1
1996 	 *       pos: 0                   pos: 1
1997 	 *
1998 	 * mem1: pos = 0 * 2 + 1    mem3: pos = 0 * 2 + 1
1999 	 *       pos = 1 * 2 + 0          pos = 1 * 2 + 1
2000 	 *       pos: 2                   pos = 3
2001 	 *
2002 	 * Note that while this example is simple, the method applies to more
2003 	 * complex topologies, including those with switches.
2004 	 */
2005 
2006 	/* Iterate from endpoint to root_port refining the position */
2007 	for (iter = port; iter; iter = parent_port_of(iter)) {
2008 		if (is_cxl_root(iter))
2009 			break;
2010 
2011 		rc = find_pos_and_ways(iter, hpa_range, &parent_pos,
2012 				       &parent_ways);
2013 		if (rc)
2014 			return rc;
2015 
2016 		pos = pos * parent_ways + parent_pos;
2017 	}
2018 
2019 	dev_dbg(&cxlmd->dev,
2020 		"decoder:%s parent:%s port:%s range:%#llx-%#llx pos:%d\n",
2021 		dev_name(&cxled->cxld.dev), dev_name(cxlmd->dev.parent),
2022 		dev_name(&port->dev), hpa_range->start, hpa_range->end, pos);
2023 
2024 	return pos;
2025 }
2026 
2027 static int cxl_region_sort_targets(struct cxl_region *cxlr)
2028 {
2029 	struct cxl_region_params *p = &cxlr->params;
2030 	int i, rc = 0;
2031 
2032 	for (i = 0; i < p->nr_targets; i++) {
2033 		struct cxl_endpoint_decoder *cxled = p->targets[i];
2034 
2035 		cxled->pos = cxl_calc_interleave_pos(cxled, &cxlr->hpa_range);
2036 		/*
2037 		 * Record that sorting failed, but still continue to calc
2038 		 * cxled->pos so that cxl_calc_interleave_pos() emits its
2039 		 * dev_dbg() for every member. which is useful for auto
2040 		 * discovery debug.
2041 		 */
2042 		if (cxled->pos < 0)
2043 			rc = -ENXIO;
2044 	}
2045 	/* Keep the cxlr target list in interleave position order */
2046 	sort(p->targets, p->nr_targets, sizeof(p->targets[0]),
2047 	     cmp_interleave_pos, NULL);
2048 
2049 	dev_dbg(&cxlr->dev, "region sort %s\n", rc ? "failed" : "successful");
2050 	return rc;
2051 }
2052 
/*
 * cxl_region_attach() - validate and attach an endpoint decoder to a region
 * @cxlr: region gaining a member
 * @cxled: endpoint decoder to attach
 * @pos: requested interleave position of @cxled within @cxlr
 *
 * A series of compatibility checks (interleave capability, partition mode,
 * region state, target count, root-port reachability, decoder type, DPA
 * allocation and size) is performed before anything is modified.
 *
 * For regions flagged CXL_REGION_F_AUTO the decoder is staged via
 * cxl_region_attach_auto(); once all interleave ways have arrived the
 * targets are sorted, attached by sorted index, and the region is moved to
 * CXL_CONFIG_COMMIT. For all other regions @pos is validated and used
 * directly, and the region becomes CXL_CONFIG_ACTIVE once the last target
 * is attached.
 *
 * Returns 0 on success (including "waiting for more targets") or a
 * negative errno.
 */
static int cxl_region_attach(struct cxl_region *cxlr,
			     struct cxl_endpoint_decoder *cxled, int pos)
{
	struct cxl_root_decoder *cxlrd = cxlr->cxlrd;
	struct cxl_memdev *cxlmd = cxled_to_memdev(cxled);
	struct cxl_dev_state *cxlds = cxlmd->cxlds;
	struct cxl_region_params *p = &cxlr->params;
	struct cxl_port *ep_port, *root_port;
	struct cxl_dport *dport;
	int rc = -ENXIO;

	rc = check_interleave_cap(&cxled->cxld, p->interleave_ways,
				  p->interleave_granularity);
	if (rc) {
		dev_dbg(&cxlr->dev, "%s iw: %d ig: %d is not supported\n",
			dev_name(&cxled->cxld.dev), p->interleave_ways,
			p->interleave_granularity);
		return rc;
	}

	/* a negative partition index marks a decoder that was invalidated */
	if (cxled->part < 0) {
		dev_dbg(&cxlr->dev, "%s dead\n", dev_name(&cxled->cxld.dev));
		return -ENODEV;
	}

	if (cxlds->part[cxled->part].mode != cxlr->mode) {
		dev_dbg(&cxlr->dev, "%s region mode: %d mismatch\n",
			dev_name(&cxled->cxld.dev), cxlr->mode);
		return -EINVAL;
	}

	/*
	 * Attachment is only permitted in exactly the
	 * CXL_CONFIG_INTERLEAVE_ACTIVE state: later states mean the region
	 * is already assembled, earlier states mean the interleave
	 * configuration has not been established yet.
	 */
	if (p->state > CXL_CONFIG_INTERLEAVE_ACTIVE) {
		dev_dbg(&cxlr->dev, "region already active\n");
		return -EBUSY;
	}

	if (p->state < CXL_CONFIG_INTERLEAVE_ACTIVE) {
		dev_dbg(&cxlr->dev, "interleave config missing\n");
		return -ENXIO;
	}

	if (p->nr_targets >= p->interleave_ways) {
		dev_dbg(&cxlr->dev, "region already has %d endpoints\n",
			p->nr_targets);
		return -EINVAL;
	}

	/*
	 * The endpoint's host bridge must be one of the dports of the port
	 * that hosts the region's root decoder.
	 */
	ep_port = cxled_to_port(cxled);
	root_port = cxlrd_to_port(cxlrd);
	dport = cxl_find_dport_by_dev(root_port, ep_port->host_bridge);
	if (!dport) {
		dev_dbg(&cxlr->dev, "%s:%s invalid target for %s\n",
			dev_name(&cxlmd->dev), dev_name(&cxled->cxld.dev),
			dev_name(cxlr->dev.parent));
		return -ENXIO;
	}

	if (cxled->cxld.target_type != cxlr->type) {
		dev_dbg(&cxlr->dev, "%s:%s type mismatch: %d vs %d\n",
			dev_name(&cxlmd->dev), dev_name(&cxled->cxld.dev),
			cxled->cxld.target_type, cxlr->type);
		return -ENXIO;
	}

	if (!cxled->dpa_res) {
		dev_dbg(&cxlr->dev, "%s:%s: missing DPA allocation.\n",
			dev_name(&cxlmd->dev), dev_name(&cxled->cxld.dev));
		return -ENXIO;
	}

	/*
	 * Each member contributes an equal share: the decoder's DPA size
	 * times the number of ways, plus the region's cache_size, must equal
	 * the size of the region resource.
	 */
	if (resource_size(cxled->dpa_res) * p->interleave_ways + p->cache_size !=
	    resource_size(p->res)) {
		dev_dbg(&cxlr->dev,
			"%s:%s-size-%#llx * ways-%d + cache-%#llx != region-size-%#llx\n",
			dev_name(&cxlmd->dev), dev_name(&cxled->cxld.dev),
			(u64)resource_size(cxled->dpa_res), p->interleave_ways,
			(u64)p->cache_size, (u64)resource_size(p->res));
		return -EINVAL;
	}

	cxl_region_perf_data_calculate(cxlr, cxled);

	/* Auto-assembled region: @pos is only a hint until all targets arrive */
	if (test_bit(CXL_REGION_F_AUTO, &cxlr->flags)) {
		int i;

		rc = cxl_region_attach_auto(cxlr, cxled, pos);
		if (rc)
			return rc;

		/* await more targets to arrive... */
		if (p->nr_targets < p->interleave_ways)
			return 0;

		/*
		 * All targets are here, which implies all PCI enumeration that
		 * affects this region has been completed. Walk the topology to
		 * sort the devices into their relative region decode position.
		 */
		rc = cxl_region_sort_targets(cxlr);
		if (rc)
			return rc;

		/*
		 * Attach every staged target at its sorted index. Note that
		 * @cxled, ep_port and dport are reused as loop cursors here.
		 */
		for (i = 0; i < p->nr_targets; i++) {
			cxled = p->targets[i];
			ep_port = cxled_to_port(cxled);
			dport = cxl_find_dport_by_dev(root_port,
						      ep_port->host_bridge);
			rc = cxl_region_attach_position(cxlr, cxled, dport, i);
			if (rc)
				return rc;
		}

		rc = cxl_region_setup_targets(cxlr);
		if (rc)
			return rc;

		/*
		 * If target setup succeeds in the autodiscovery case
		 * then the region is already committed.
		 */
		p->state = CXL_CONFIG_COMMIT;
		cxl_region_shared_upstream_bandwidth_update(cxlr);

		return 0;
	}

	/* User-defined region: honor the requested position */
	rc = cxl_region_validate_position(cxlr, cxled, pos);
	if (rc)
		return rc;

	rc = cxl_region_attach_position(cxlr, cxled, dport, pos);
	if (rc)
		return rc;

	p->targets[pos] = cxled;
	cxled->pos = pos;
	p->nr_targets++;

	/* Last member arrived: program the targets and mark the region active */
	if (p->nr_targets == p->interleave_ways) {
		rc = cxl_region_setup_targets(cxlr);
		if (rc)
			return rc;
		p->state = CXL_CONFIG_ACTIVE;
		cxl_region_shared_upstream_bandwidth_update(cxlr);
	}

	/* Mirror the region's interleave settings and HPA span into the decoder */
	cxled->cxld.interleave_ways = p->interleave_ways;
	cxled->cxld.interleave_granularity = p->interleave_granularity;
	cxled->cxld.hpa_range = (struct range) {
		.start = p->res->start,
		.end = p->res->end,
	};

	if (p->nr_targets != p->interleave_ways)
		return 0;

	/*
	 * Test the auto-discovery position calculator function
	 * against this successfully created user-defined region.
	 * A fail message here means that this interleave config
	 * will fail when presented as CXL_REGION_F_AUTO.
	 * The result is only logged; it does not affect the return value.
	 */
	for (int i = 0; i < p->nr_targets; i++) {
		struct cxl_endpoint_decoder *target = p->targets[i];
		int test_pos;

		test_pos = cxl_calc_interleave_pos(target, &cxlr->hpa_range);
		dev_dbg(&target->cxld.dev,
			"Test cxl_calc_interleave_pos(): %s test_pos:%d target->pos:%d\n",
			(test_pos == target->pos) ? "success" : "fail",
			test_pos, target->pos);
	}

	return 0;
}
2229 
2230 static int cxl_region_remove_target(struct device *dev, void *data)
2231 {
2232 	struct cxl_endpoint_decoder *cxled = data;
2233 	struct cxl_region_params *p;
2234 	struct cxl_region *cxlr;
2235 	int i;
2236 
2237 	if (!is_cxl_region(dev))
2238 		return 0;
2239 
2240 	cxlr = to_cxl_region(dev);
2241 	p = &cxlr->params;
2242 	for (i = 0; i < p->interleave_ways; i++) {
2243 		if (p->targets[i] == cxled) {
2244 			p->nr_targets--;
2245 			cxled->state = CXL_DECODER_STATE_AUTO;
2246 			cxled->pos = -1;
2247 			p->targets[i] = NULL;
2248 
2249 			return 1;
2250 		}
2251 	}
2252 
2253 	return 0;
2254 }
2255 
2256 /*
2257  * When an auto-region fails to assemble the decoder may be listed as a target,
2258  * but not fully attached.
2259  */
2260 static void cxl_cancel_auto_attach(struct cxl_endpoint_decoder *cxled)
2261 {
2262 	if (cxled->state != CXL_DECODER_STATE_AUTO_STAGED)
2263 		return;
2264 
2265 	bus_for_each_dev(&cxl_bus_type, NULL, cxled, cxl_region_remove_target);
2266 }
2267 
/*
 * __cxl_decoder_detach() - detach an endpoint decoder from its region
 * @cxlr: region to detach from; only consulted when @cxled is NULL
 * @cxled: decoder to detach, or NULL to detach whatever sits at @pos
 * @pos: target position in @cxlr; only consulted when @cxled is NULL
 * @mode: DETACH_INVALIDATE additionally marks the decoder as dead
 *
 * Must be called with cxl_rwsem.region held for write.
 *
 * Returns the region the decoder was detached from with an extra device
 * reference held (the caller must put_device() it), or NULL when nothing
 * was detached.
 */
static struct cxl_region *
__cxl_decoder_detach(struct cxl_region *cxlr,
		     struct cxl_endpoint_decoder *cxled, int pos,
		     enum cxl_detach_mode mode)
{
	struct cxl_region_params *p;

	lockdep_assert_held_write(&cxl_rwsem.region);

	if (!cxled) {
		/* known region, unknown decoder: look it up by position */
		p = &cxlr->params;

		if (pos >= p->interleave_ways) {
			dev_dbg(&cxlr->dev, "position %d out of range %d\n",
				pos, p->interleave_ways);
			return NULL;
		}

		if (!p->targets[pos])
			return NULL;
		cxled = p->targets[pos];
	} else {
		/* known decoder, unknown region: follow the decoder's link */
		cxlr = cxled->cxld.region;
		if (!cxlr) {
			/* not attached, but may still be staged in a target list */
			cxl_cancel_auto_attach(cxled);
			return NULL;
		}
		p = &cxlr->params;
	}

	/* a negative partition makes cxl_region_attach() reject this decoder */
	if (mode == DETACH_INVALIDATE)
		cxled->part = -1;

	/* anything past ACTIVE has live decode that must be reset first */
	if (p->state > CXL_CONFIG_ACTIVE) {
		cxl_region_decode_reset(cxlr, p->interleave_ways);
		p->state = CXL_CONFIG_ACTIVE;
	}

	/* release the region from every port between the endpoint and the root */
	for (struct cxl_port *iter = cxled_to_port(cxled); !is_cxl_root(iter);
	     iter = to_cxl_port(iter->dev.parent))
		cxl_port_detach_region(iter, cxlr, cxled);

	/* sanity check that the decoder's recorded position matches the list */
	if (cxled->pos < 0 || cxled->pos >= p->interleave_ways ||
	    p->targets[cxled->pos] != cxled) {
		struct cxl_memdev *cxlmd = cxled_to_memdev(cxled);

		dev_WARN_ONCE(&cxlr->dev, 1, "expected %s:%s at position %d\n",
			      dev_name(&cxlmd->dev), dev_name(&cxled->cxld.dev),
			      cxled->pos);
		return NULL;
	}

	/* losing a member drops the region back to "awaiting targets" */
	if (p->state == CXL_CONFIG_ACTIVE) {
		p->state = CXL_CONFIG_INTERLEAVE_ACTIVE;
		cxl_region_teardown_targets(cxlr);
	}
	p->targets[cxled->pos] = NULL;
	p->nr_targets--;
	/* start 0 / end -1 is the "no range" value also used by unregister_region() */
	cxled->cxld.hpa_range = (struct range) {
		.start = 0,
		.end = -1,
	};

	/* pin the region for the caller, which uses it after dropping the lock */
	get_device(&cxlr->dev);
	return cxlr;
}
2334 
/*
 * Cleanup a decoder's interest in a region. There are 2 cases to
 * handle, removing an unknown @cxled from a known position in a region
 * (detach_target()) or removing a known @cxled from an unknown @cxlr
 * (cxld_unregister())
 *
 * When the detachment finds a region release the region driver.
 *
 * @cxlr: region to detach from when @cxled is NULL
 * @cxled: decoder to detach, or NULL to select by @pos
 * @pos: target position within @cxlr when @cxled is NULL
 * @mode: DETACH_INVALIDATE takes the region lock unconditionally; any
 *	  other mode uses the rwsem_write_kill acquisition, which can fail
 *
 * Returns 0, including when there was nothing to detach, or the error
 * from failing to acquire cxl_rwsem.region.
 */
int cxl_decoder_detach(struct cxl_region *cxlr,
		       struct cxl_endpoint_decoder *cxled, int pos,
		       enum cxl_detach_mode mode)
{
	struct cxl_region *detach;

	/* when the decoder is being destroyed lock unconditionally */
	if (mode == DETACH_INVALIDATE) {
		guard(rwsem_write)(&cxl_rwsem.region);
		detach = __cxl_decoder_detach(cxlr, cxled, pos, mode);
	} else {
		int rc;

		ACQUIRE(rwsem_write_kill, rwsem)(&cxl_rwsem.region);
		if ((rc = ACQUIRE_ERR(rwsem_write_kill, &rwsem)))
			return rc;
		detach = __cxl_decoder_detach(cxlr, cxled, pos, mode);
	}

	/*
	 * Both lock guards are scoped to the branches above, so the region
	 * lock is no longer held here. The reference taken by
	 * __cxl_decoder_detach() keeps @detach valid until put_device().
	 */
	if (detach) {
		device_release_driver(&detach->dev);
		put_device(&detach->dev);
	}
	return 0;
}
2368 
/*
 * Take the locks needed by cxl_region_attach() and call it.
 *
 * @state selects how cxl_rwsem.region is acquired for write:
 * TASK_INTERRUPTIBLE uses the rwsem_write_kill acquisition and returns its
 * error if the lock could not be taken; any other value acquires the lock
 * unconditionally. In both cases cxl_rwsem.dpa is additionally held for
 * read across the attach.
 */
static int __attach_target(struct cxl_region *cxlr,
			   struct cxl_endpoint_decoder *cxled, int pos,
			   unsigned int state)
{
	int rc;

	if (state == TASK_INTERRUPTIBLE) {
		ACQUIRE(rwsem_write_kill, rwsem)(&cxl_rwsem.region);
		if ((rc = ACQUIRE_ERR(rwsem_write_kill, &rwsem)))
			return rc;
		guard(rwsem_read)(&cxl_rwsem.dpa);
		return cxl_region_attach(cxlr, cxled, pos);
	}
	guard(rwsem_write)(&cxl_rwsem.region);
	guard(rwsem_read)(&cxl_rwsem.dpa);
	return cxl_region_attach(cxlr, cxled, pos);
}
2386 
2387 static int attach_target(struct cxl_region *cxlr,
2388 			 struct cxl_endpoint_decoder *cxled, int pos,
2389 			 unsigned int state)
2390 {
2391 	int rc = __attach_target(cxlr, cxled, pos, state);
2392 
2393 	if (rc == 0)
2394 		return 0;
2395 
2396 	dev_warn(cxled->cxld.dev.parent, "failed to attach %s to %s: %d\n",
2397 		 dev_name(&cxled->cxld.dev), dev_name(&cxlr->dev), rc);
2398 	return rc;
2399 }
2400 
/*
 * Detach whichever endpoint decoder occupies position @pos of @cxlr. The
 * decoder is selected by position (NULL decoder argument) and is detached
 * with DETACH_ONLY, i.e. without being invalidated.
 */
static int detach_target(struct cxl_region *cxlr, int pos)
{
	return cxl_decoder_detach(cxlr, NULL, pos, DETACH_ONLY);
}
2405 
2406 static size_t store_targetN(struct cxl_region *cxlr, const char *buf, int pos,
2407 			    size_t len)
2408 {
2409 	int rc;
2410 
2411 	if (sysfs_streq(buf, "\n"))
2412 		rc = detach_target(cxlr, pos);
2413 	else {
2414 		struct device *dev;
2415 
2416 		dev = bus_find_device_by_name(&cxl_bus_type, NULL, buf);
2417 		if (!dev)
2418 			return -ENODEV;
2419 
2420 		if (!is_endpoint_decoder(dev)) {
2421 			rc = -EINVAL;
2422 			goto out;
2423 		}
2424 
2425 		rc = attach_target(cxlr, to_cxl_endpoint_decoder(dev), pos,
2426 				   TASK_INTERRUPTIBLE);
2427 out:
2428 		put_device(dev);
2429 	}
2430 
2431 	if (rc < 0)
2432 		return rc;
2433 	return len;
2434 }
2435 
/*
 * TARGET_ATTR_RW(n) - define the read/write "target<n>" device attribute
 *
 * Expands to target<n>_show() / target<n>_store(), which forward to
 * show_targetN() / store_targetN() with position @n, plus the matching
 * dev_attr_target<n> via DEVICE_ATTR_RW().
 */
#define TARGET_ATTR_RW(n)                                              \
static ssize_t target##n##_show(                                       \
	struct device *dev, struct device_attribute *attr, char *buf)  \
{                                                                      \
	return show_targetN(to_cxl_region(dev), buf, (n));             \
}                                                                      \
static ssize_t target##n##_store(struct device *dev,                   \
				 struct device_attribute *attr,        \
				 const char *buf, size_t len)          \
{                                                                      \
	return store_targetN(to_cxl_region(dev), buf, (n), len);       \
}                                                                      \
static DEVICE_ATTR_RW(target##n)

/* One attribute per possible target position, target0 through target15 */
TARGET_ATTR_RW(0);
TARGET_ATTR_RW(1);
TARGET_ATTR_RW(2);
TARGET_ATTR_RW(3);
TARGET_ATTR_RW(4);
TARGET_ATTR_RW(5);
TARGET_ATTR_RW(6);
TARGET_ATTR_RW(7);
TARGET_ATTR_RW(8);
TARGET_ATTR_RW(9);
TARGET_ATTR_RW(10);
TARGET_ATTR_RW(11);
TARGET_ATTR_RW(12);
TARGET_ATTR_RW(13);
TARGET_ATTR_RW(14);
TARGET_ATTR_RW(15);
2466 
/*
 * NULL-terminated list of the targetN attributes. The array index of each
 * entry equals its target position, which cxl_region_target_visible()
 * relies on when it compares the index against interleave_ways.
 */
static struct attribute *target_attrs[] = {
	&dev_attr_target0.attr,
	&dev_attr_target1.attr,
	&dev_attr_target2.attr,
	&dev_attr_target3.attr,
	&dev_attr_target4.attr,
	&dev_attr_target5.attr,
	&dev_attr_target6.attr,
	&dev_attr_target7.attr,
	&dev_attr_target8.attr,
	&dev_attr_target9.attr,
	&dev_attr_target10.attr,
	&dev_attr_target11.attr,
	&dev_attr_target12.attr,
	&dev_attr_target13.attr,
	&dev_attr_target14.attr,
	&dev_attr_target15.attr,
	NULL,
};
2486 
2487 static umode_t cxl_region_target_visible(struct kobject *kobj,
2488 					 struct attribute *a, int n)
2489 {
2490 	struct device *dev = kobj_to_dev(kobj);
2491 	struct cxl_region *cxlr = to_cxl_region(dev);
2492 	struct cxl_region_params *p = &cxlr->params;
2493 
2494 	if (n < p->interleave_ways)
2495 		return a->mode;
2496 	return 0;
2497 }
2498 
/* sysfs group holding target0..target15, filtered by cxl_region_target_visible() */
static const struct attribute_group cxl_region_target_group = {
	.attrs = target_attrs,
	.is_visible = cxl_region_target_visible,
};
2503 
/* Accessor returning the address of the static target attribute group */
static const struct attribute_group *get_cxl_region_target_group(void)
{
	return &cxl_region_target_group;
}
2508 
/* NULL-terminated attribute groups installed through cxl_region_type.groups */
static const struct attribute_group *region_groups[] = {
	&cxl_base_attribute_group,
	&cxl_region_group,
	&cxl_region_target_group,
	&cxl_region_access0_coordinate_group,
	&cxl_region_access1_coordinate_group,
	NULL,
};
2517 
2518 static void cxl_region_release(struct device *dev)
2519 {
2520 	struct cxl_region *cxlr = to_cxl_region(dev);
2521 	struct cxl_root_decoder *cxlrd = cxlr->cxlrd;
2522 	int id = atomic_read(&cxlrd->region_id);
2523 
2524 	/*
2525 	 * Try to reuse the recently idled id rather than the cached
2526 	 * next id to prevent the region id space from increasing
2527 	 * unnecessarily.
2528 	 */
2529 	if (cxlr->id < id)
2530 		if (atomic_try_cmpxchg(&cxlrd->region_id, &id, cxlr->id)) {
2531 			memregion_free(id);
2532 			goto out;
2533 		}
2534 
2535 	memregion_free(cxlr->id);
2536 out:
2537 	put_device(dev->parent);
2538 	kfree(cxlr);
2539 }
2540 
/*
 * Device type shared by all region devices; is_cxl_region() and
 * to_cxl_region() identify regions by comparing against its address.
 */
const struct device_type cxl_region_type = {
	.name = "cxl_region",
	.release = cxl_region_release,
	.groups = region_groups
};
2546 
/* Returns true when @dev's device type is cxl_region_type */
bool is_cxl_region(struct device *dev)
{
	return dev->type == &cxl_region_type;
}
EXPORT_SYMBOL_NS_GPL(is_cxl_region, "CXL");
2552 
/*
 * Convert @dev to its containing struct cxl_region. Warns (once) and
 * returns NULL if @dev is not a region device.
 */
static struct cxl_region *to_cxl_region(struct device *dev)
{
	if (dev_WARN_ONCE(dev, dev->type != &cxl_region_type,
			  "not a cxl_region device\n"))
		return NULL;

	return container_of(dev, struct cxl_region, dev);
}
2561 
2562 static void unregister_region(struct cxl_region *cxlr)
2563 {
2564 	struct cxl_root_decoder *cxlrd = to_cxl_root_decoder(cxlr->dev.parent);
2565 	struct cxl_region_params *p = &cxlr->params;
2566 	int i;
2567 
2568 	xa_erase(&cxlrd->regions, cxlr->id);
2569 	device_del(&cxlr->dev);
2570 
2571 	/*
2572 	 * Now that region sysfs is shutdown, the parameter block is now
2573 	 * read-only, so no need to hold the region rwsem to access the
2574 	 * region parameters.
2575 	 */
2576 	for (i = 0; i < p->interleave_ways; i++)
2577 		detach_target(cxlr, i);
2578 
2579 	cxlr->hpa_range = DEFINE_RANGE(0, -1);
2580 
2581 	cxl_region_iomem_release(cxlr);
2582 	put_device(&cxlr->dev);
2583 }
2584 
/*
 * Callback taking an opaque region pointer: unregister the region unless
 * it has already been removed from the root decoder's region xarray, then
 * drop one device reference.
 *
 * NOTE(review): the unconditional put_device() presumably balances a
 * reference taken where this callback is registered; confirm at that site.
 */
static void endpoint_unregister_region(void *_cxlr)
{
	struct cxl_region *cxlr = _cxlr;
	struct cxl_root_decoder *cxlrd = to_cxl_root_decoder(cxlr->dev.parent);

	/* regions_lock serializes against the other unregister paths */
	guard(mutex)(&cxlrd->regions_lock);
	if (xa_load(&cxlrd->regions, cxlr->id))
		unregister_region(cxlr);
	put_device(&cxlr->dev);
}
2595 
/* lockdep class assigned to every region device mutex in cxl_region_alloc() */
static struct lock_class_key cxl_region_key;
2597 
2598 static struct cxl_region *cxl_region_alloc(struct cxl_root_decoder *cxlrd, int id)
2599 {
2600 	struct cxl_region *cxlr;
2601 	struct device *dev;
2602 
2603 	cxlr = kzalloc_obj(*cxlr);
2604 	if (!cxlr) {
2605 		memregion_free(id);
2606 		return ERR_PTR(-ENOMEM);
2607 	}
2608 
2609 	dev = &cxlr->dev;
2610 	device_initialize(dev);
2611 	lockdep_set_class(&dev->mutex, &cxl_region_key);
2612 	dev->parent = &cxlrd->cxlsd.cxld.dev;
2613 	/*
2614 	 * Keep root decoder pinned through cxl_region_release to fixup
2615 	 * region id allocations
2616 	 */
2617 	get_device(dev->parent);
2618 	cxlr->cxlrd = cxlrd;
2619 	cxlr->id = id;
2620 
2621 	device_set_pm_not_required(dev);
2622 	dev->bus = &cxl_bus_type;
2623 	dev->type = &cxl_region_type;
2624 	cxl_region_setup_flags(cxlr, &cxlrd->cxlsd.cxld);
2625 
2626 	return cxlr;
2627 }
2628 
2629 static bool cxl_region_update_coordinates(struct cxl_region *cxlr, int nid)
2630 {
2631 	int cset = 0;
2632 	int rc;
2633 
2634 	for (int i = 0; i < ACCESS_COORDINATE_MAX; i++) {
2635 		if (cxlr->coord[i].read_bandwidth) {
2636 			node_update_perf_attrs(nid, &cxlr->coord[i], i);
2637 			cset++;
2638 		}
2639 	}
2640 
2641 	if (!cset)
2642 		return false;
2643 
2644 	rc = sysfs_update_group(&cxlr->dev.kobj, get_cxl_region_access0_group());
2645 	if (rc)
2646 		dev_dbg(&cxlr->dev, "Failed to update access0 group\n");
2647 
2648 	rc = sysfs_update_group(&cxlr->dev.kobj, get_cxl_region_access1_group());
2649 	if (rc)
2650 		dev_dbg(&cxlr->dev, "Failed to update access1 group\n");
2651 
2652 	return true;
2653 }
2654 
2655 static int cxl_region_perf_attrs_callback(struct notifier_block *nb,
2656 					  unsigned long action, void *arg)
2657 {
2658 	struct cxl_region *cxlr = container_of(nb, struct cxl_region,
2659 					       node_notifier);
2660 	struct node_notify *nn = arg;
2661 	int nid = nn->nid;
2662 	int region_nid;
2663 
2664 	if (action != NODE_ADDED_FIRST_MEMORY)
2665 		return NOTIFY_DONE;
2666 
2667 	/*
2668 	 * No need to hold cxl_rwsem.region; region parameters are stable
2669 	 * within the cxl_region driver.
2670 	 */
2671 	region_nid = phys_to_target_node(cxlr->params.res->start);
2672 	if (nid != region_nid)
2673 		return NOTIFY_DONE;
2674 
2675 	/* No action needed if node bit already set */
2676 	if (node_test_and_set(nid, nodemask_region_seen))
2677 		return NOTIFY_DONE;
2678 
2679 	if (!cxl_region_update_coordinates(cxlr, nid))
2680 		return NOTIFY_DONE;
2681 
2682 	return NOTIFY_OK;
2683 }
2684 
2685 static int cxl_region_calculate_adistance(struct notifier_block *nb,
2686 					  unsigned long nid, void *data)
2687 {
2688 	struct cxl_region *cxlr = container_of(nb, struct cxl_region,
2689 					       adist_notifier);
2690 	struct access_coordinate *perf;
2691 	int *adist = data;
2692 	int region_nid;
2693 
2694 	/*
2695 	 * No need to hold cxl_rwsem.region; region parameters are stable
2696 	 * within the cxl_region driver.
2697 	 */
2698 	region_nid = phys_to_target_node(cxlr->params.res->start);
2699 	if (nid != region_nid)
2700 		return NOTIFY_OK;
2701 
2702 	perf = &cxlr->coord[ACCESS_COORDINATE_CPU];
2703 
2704 	if (mt_perf_to_adistance(perf, adist))
2705 		return NOTIFY_OK;
2706 
2707 	return NOTIFY_STOP;
2708 }
2709 
/*
 * Unwind all remaining regions of @cxlrd.
 *
 * Under regions_lock the root decoder is first marked dead, which makes
 * __create_region() fail with -ENXIO from then on, and then every region
 * still present in the regions xarray is unregistered.
 */
void kill_regions(struct cxl_root_decoder *cxlrd)
{
	unsigned long index;
	struct cxl_region *cxlr;

	guard(mutex)(&cxlrd->regions_lock);
	/* no more region creation */
	cxlrd->dead = true;
	xa_for_each(&cxlrd->regions, index, cxlr)
		unregister_region(cxlr);
}
2722 
/**
 * devm_cxl_add_region - Adds a region to a decoder
 * @cxlrd: root decoder
 * @id: memregion id to create, or memregion_free() on failure
 * @mode: mode for the endpoint decoders of this region
 * @type: select whether this is an expander or accelerator (type-2 or type-3)
 *
 * This is the second step of region initialization. Regions exist within an
 * address space which is mapped by a @cxlrd.
 *
 * Return: the new region if it was added to the @cxlrd, else an ERR_PTR()
 * encoded negative error code. The region will be named "regionZ" where Z is
 * the unique region number.
 */
static struct cxl_region *devm_cxl_add_region(struct cxl_root_decoder *cxlrd,
					      int id,
					      enum cxl_partition_mode mode,
					      enum cxl_decoder_type type)
{
	struct cxl_port *port = to_cxl_port(cxlrd->cxlsd.cxld.dev.parent);
	struct cxl_region *cxlr;
	struct device *dev;
	int rc;

	/* on failure cxl_region_alloc() has already released @id */
	cxlr = cxl_region_alloc(cxlrd, id);
	if (IS_ERR(cxlr))
		return cxlr;
	cxlr->mode = mode;
	cxlr->type = type;

	dev = &cxlr->dev;
	rc = dev_set_name(dev, "region%d", id);
	if (rc)
		goto err;

	rc = device_add(dev);
	if (rc)
		goto err;

	/* track the region so that delete_region / kill_regions() can find it */
	rc = xa_insert(&cxlrd->regions, cxlr->id, cxlr, GFP_KERNEL);
	if (rc) {
		/* the device is already added, so it needs the full teardown */
		unregister_region(cxlr);
		return ERR_PTR(rc);
	}

	dev_dbg(port->uport_dev, "%s: created %s\n",
		dev_name(&cxlrd->cxlsd.cxld.dev), dev_name(dev));
	return cxlr;
err:
	/* never added: dropping the initial reference is sufficient */
	put_device(dev);
	return ERR_PTR(rc);
}
2774 
/*
 * Emit the region name built from the root decoder's current region_id.
 * create_region_store() only accepts a name whose id matches this value.
 */
static ssize_t __create_region_show(struct cxl_root_decoder *cxlrd, char *buf)
{
	return sysfs_emit(buf, "region%u\n", atomic_read(&cxlrd->region_id));
}
2779 
/* sysfs show for create_pmem_region: reports the next region name */
static ssize_t create_pmem_region_show(struct device *dev,
				       struct device_attribute *attr, char *buf)
{
	return __create_region_show(to_cxl_root_decoder(dev), buf);
}
2785 
/* sysfs show for create_ram_region: reports the next region name */
static ssize_t create_ram_region_show(struct device *dev,
				      struct device_attribute *attr, char *buf)
{
	return __create_region_show(to_cxl_root_decoder(dev), buf);
}
2791 
2792 static struct cxl_region *__create_region(struct cxl_root_decoder *cxlrd,
2793 					  enum cxl_partition_mode mode, int id,
2794 					  enum cxl_decoder_type target_type)
2795 {
2796 	int rc;
2797 
2798 	if (cxlrd->dead)
2799 		return ERR_PTR(-ENXIO);
2800 
2801 	switch (mode) {
2802 	case CXL_PARTMODE_RAM:
2803 	case CXL_PARTMODE_PMEM:
2804 		break;
2805 	default:
2806 		dev_err(&cxlrd->cxlsd.cxld.dev, "unsupported mode %d\n", mode);
2807 		return ERR_PTR(-EINVAL);
2808 	}
2809 
2810 	rc = memregion_alloc(GFP_KERNEL);
2811 	if (rc < 0)
2812 		return ERR_PTR(rc);
2813 
2814 	if (atomic_cmpxchg(&cxlrd->region_id, id, rc) != id) {
2815 		memregion_free(rc);
2816 		return ERR_PTR(-EBUSY);
2817 	}
2818 
2819 	return devm_cxl_add_region(cxlrd, id, mode, target_type);
2820 }
2821 
2822 static ssize_t create_region_store(struct device *dev, const char *buf,
2823 				   size_t len, enum cxl_partition_mode mode)
2824 {
2825 	struct cxl_root_decoder *cxlrd = to_cxl_root_decoder(dev);
2826 	struct cxl_region *cxlr;
2827 	int rc, id;
2828 
2829 	rc = sscanf(buf, "region%d\n", &id);
2830 	if (rc != 1)
2831 		return -EINVAL;
2832 
2833 	ACQUIRE(mutex_intr, regions_lock)(&cxlrd->regions_lock);
2834 	if ((rc = ACQUIRE_ERR(mutex_intr, &regions_lock)))
2835 		return rc;
2836 
2837 	cxlr = __create_region(cxlrd, mode, id, CXL_DECODER_HOSTONLYMEM);
2838 	if (IS_ERR(cxlr))
2839 		return PTR_ERR(cxlr);
2840 
2841 	return len;
2842 }
2843 
/* sysfs store for create_pmem_region: create a CXL_PARTMODE_PMEM region */
static ssize_t create_pmem_region_store(struct device *dev,
					struct device_attribute *attr,
					const char *buf, size_t len)
{
	return create_region_store(dev, buf, len, CXL_PARTMODE_PMEM);
}
DEVICE_ATTR_RW(create_pmem_region);
2851 
/* sysfs store for create_ram_region: create a CXL_PARTMODE_RAM region */
static ssize_t create_ram_region_store(struct device *dev,
				       struct device_attribute *attr,
				       const char *buf, size_t len)
{
	return create_region_store(dev, buf, len, CXL_PARTMODE_RAM);
}
DEVICE_ATTR_RW(create_ram_region);
2859 
2860 static ssize_t region_show(struct device *dev, struct device_attribute *attr,
2861 			   char *buf)
2862 {
2863 	struct cxl_decoder *cxld = to_cxl_decoder(dev);
2864 	ssize_t rc;
2865 
2866 	ACQUIRE(rwsem_read_intr, rwsem)(&cxl_rwsem.region);
2867 	if ((rc = ACQUIRE_ERR(rwsem_read_intr, &rwsem)))
2868 		return rc;
2869 
2870 	if (cxld->region)
2871 		return sysfs_emit(buf, "%s\n", dev_name(&cxld->region->dev));
2872 	return sysfs_emit(buf, "\n");
2873 }
2874 DEVICE_ATTR_RO(region);
2875 
/*
 * sysfs store for delete_region: unregister the region named in @buf.
 *
 * The root decoder's regions_lock is taken (interruptibly) before the
 * input is parsed. The write must be "region<id>" and must match the
 * device name of the region tracked under that id in the regions xarray.
 *
 * Returns @len on success, -EINVAL on malformed input, -ENODEV when no
 * matching region is tracked, or the lock acquisition error.
 */
static ssize_t delete_region_store(struct device *dev,
				   struct device_attribute *attr,
				   const char *buf, size_t len)
{
	struct cxl_root_decoder *cxlrd = to_cxl_root_decoder(dev);
	struct cxl_region *cxlr;
	int rc, id;

	ACQUIRE(mutex_intr, regions_lock)(&cxlrd->regions_lock);
	if ((rc = ACQUIRE_ERR(mutex_intr, &regions_lock)))
		return rc;

	rc = sscanf(buf, "region%d\n", &id);
	if (rc != 1)
		return -EINVAL;

	/* the full-name compare rejects input with trailing junk after the id */
	cxlr = xa_load(&cxlrd->regions, id);
	if (!cxlr || !sysfs_streq(buf, dev_name(&cxlr->dev)))
		return -ENODEV;

	unregister_region(cxlr);

	return len;
}
DEVICE_ATTR_WO(delete_region);
2901 
/*
 * State shared between cxl_get_poison_by_endpoint() and its helpers.
 * @port: endpoint port whose decoders are being walked
 * @part: partition index of the last committed decoder, or -1 if the
 *	  walk did not reach the port's commit_end decoder
 * @offset: DPA just past the last committed decoder's allocation
 */
struct cxl_poison_context {
	struct cxl_port *port;
	int part;
	u64 offset;
};
2907 
/*
 * Query poison for DPA space that is not covered by any decoder
 * allocation, starting at partition @ctx->part and continuing through the
 * device's remaining partitions.
 *
 * Returns 0 on success (or when @ctx->part is negative), otherwise the
 * error from cxl_mem_get_poison().
 */
static int cxl_get_poison_unmapped(struct cxl_memdev *cxlmd,
				   struct cxl_poison_context *ctx)
{
	struct cxl_dev_state *cxlds = cxlmd->cxlds;
	const struct resource *res;
	struct resource *p, *last;
	u64 offset, length;
	int rc = 0;

	if (ctx->part < 0)
		return 0;

	/*
	 * Collect poison for the remaining unmapped resources after
	 * poison is collected by committed endpoints decoders.
	 */
	for (int i = ctx->part; i < cxlds->nr_partitions; i++) {
		res = &cxlds->part[i].res;
		/* find the last child resource in this partition, if any */
		for (p = res->child, last = NULL; p; p = p->sibling)
			last = p;
		/* unmapped space starts after the last child, or at the partition start */
		if (last)
			offset = last->end + 1;
		else
			offset = res->start;
		length = res->end - offset + 1;
		if (!length)
			break;
		rc = cxl_mem_get_poison(cxlmd, offset, length, NULL);
		/* -EFAULT on a RAM partition is not treated as fatal: move on */
		if (rc == -EFAULT && cxlds->part[i].mode == CXL_PARTMODE_RAM)
			continue;
		if (rc)
			break;
	}

	return rc;
}
2944 
/*
 * device_for_each_child() callback: query poison for the DPA range of one
 * endpoint decoder, including any skipped range that precedes it.
 *
 * @arg is the struct cxl_poison_context for the walk. Children that are
 * not endpoint decoders, or that have no DPA allocation, are ignored.
 *
 * Returns 0 to continue the walk, 1 to stop it once the decoder whose id
 * equals the port's commit_end has been processed (@ctx->offset and
 * @ctx->part are filled in for cxl_get_poison_unmapped()), or a negative
 * errno from cxl_mem_get_poison().
 */
static int poison_by_decoder(struct device *dev, void *arg)
{
	struct cxl_poison_context *ctx = arg;
	struct cxl_endpoint_decoder *cxled;
	enum cxl_partition_mode mode;
	struct cxl_dev_state *cxlds;
	struct cxl_memdev *cxlmd;
	u64 offset, length;
	int rc = 0;

	if (!is_endpoint_decoder(dev))
		return rc;

	cxled = to_cxl_endpoint_decoder(dev);
	if (!cxled->dpa_res)
		return rc;

	cxlmd = cxled_to_memdev(cxled);
	cxlds = cxlmd->cxlds;
	mode = cxlds->part[cxled->part].mode;

	/* the skipped span sits immediately below the decoder's allocation */
	if (cxled->skip) {
		offset = cxled->dpa_res->start - cxled->skip;
		length = cxled->skip;
		rc = cxl_mem_get_poison(cxlmd, offset, length, NULL);
		/* -EFAULT is tolerated for RAM partitions */
		if (rc == -EFAULT && mode == CXL_PARTMODE_RAM)
			rc = 0;
		if (rc)
			return rc;
	}

	/* the decoder's own allocation, attributed to its region (may be NULL) */
	offset = cxled->dpa_res->start;
	length = cxled->dpa_res->end - offset + 1;
	rc = cxl_mem_get_poison(cxlmd, offset, length, cxled->cxld.region);
	if (rc == -EFAULT && mode == CXL_PARTMODE_RAM)
		rc = 0;
	if (rc)
		return rc;

	/* Iterate until commit_end is reached */
	if (cxled->cxld.id == ctx->port->commit_end) {
		ctx->offset = cxled->dpa_res->end + 1;
		ctx->part = cxled->part;
		return 1;
	}

	return 0;
}
2993 
2994 int cxl_get_poison_by_endpoint(struct cxl_port *port)
2995 {
2996 	struct cxl_poison_context ctx;
2997 	int rc = 0;
2998 
2999 	ctx = (struct cxl_poison_context) {
3000 		.port = port,
3001 		.part = -1,
3002 	};
3003 
3004 	rc = device_for_each_child(&port->dev, &ctx, poison_by_decoder);
3005 	if (rc == 1)
3006 		rc = cxl_get_poison_unmapped(to_cxl_memdev(port->uport_dev),
3007 					     &ctx);
3008 
3009 	return rc;
3010 }
3011 
/*
 * Lookup state for cxl_dpa_to_region().
 * @cxlr: result; region of the decoder that maps @dpa, NULL if none
 * @dpa: device physical address being looked up
 */
struct cxl_dpa_to_region_context {
	struct cxl_region *cxlr;
	u64 dpa;
};
3016 
3017 static int __cxl_dpa_to_region(struct device *dev, void *arg)
3018 {
3019 	struct cxl_dpa_to_region_context *ctx = arg;
3020 	struct cxl_endpoint_decoder *cxled;
3021 	struct cxl_region *cxlr;
3022 	u64 dpa = ctx->dpa;
3023 
3024 	if (!is_endpoint_decoder(dev))
3025 		return 0;
3026 
3027 	cxled = to_cxl_endpoint_decoder(dev);
3028 	if (!cxled || !cxled->dpa_res || !resource_size(cxled->dpa_res))
3029 		return 0;
3030 
3031 	if (!cxl_resource_contains_addr(cxled->dpa_res, dpa))
3032 		return 0;
3033 
3034 	/*
3035 	 * Stop the region search (return 1) when an endpoint mapping is
3036 	 * found. The region may not be fully constructed so offering
3037 	 * the cxlr in the context structure is not guaranteed.
3038 	 */
3039 	cxlr = cxled->cxld.region;
3040 	if (cxlr)
3041 		dev_dbg(dev, "dpa:0x%llx mapped in region:%s\n", dpa,
3042 			dev_name(&cxlr->dev));
3043 	else
3044 		dev_dbg(dev, "dpa:0x%llx mapped in endpoint:%s\n", dpa,
3045 			dev_name(dev));
3046 
3047 	ctx->cxlr = cxlr;
3048 
3049 	return 1;
3050 }
3051 
3052 struct cxl_region *cxl_dpa_to_region(const struct cxl_memdev *cxlmd, u64 dpa)
3053 {
3054 	struct cxl_dpa_to_region_context ctx;
3055 	struct cxl_port *port = cxlmd->endpoint;
3056 
3057 	if (!cxlmd->dev.driver)
3058 		return NULL;
3059 
3060 	ctx = (struct cxl_dpa_to_region_context) {
3061 		.dpa = dpa,
3062 	};
3063 	if (cxl_num_decoders_committed(port))
3064 		device_for_each_child(&port->dev, &ctx, __cxl_dpa_to_region);
3065 
3066 	return ctx.cxlr;
3067 }
3068 
/*
 * Check that @hpa falls inside the granule that interleave position @pos
 * owns within one interleave set of @cxlr.
 *
 * Returns true if it does; otherwise logs a debug message and returns
 * false.
 */
static bool cxl_is_hpa_in_chunk(u64 hpa, struct cxl_region *cxlr, int pos)
{
	struct cxl_region_params *p = &cxlr->params;
	int gran = p->interleave_granularity;
	int ways = p->interleave_ways;
	u64 offset;

	/* Is the hpa in an expected chunk for its pos(-ition) */
	offset = hpa - p->res->start;
	/*
	 * do_div() divides in place and returns the remainder, so after this
	 * line offset holds the byte offset within one gran * ways stripe.
	 */
	offset = do_div(offset, gran * ways);
	if ((offset >= pos * gran) && (offset < (pos + 1) * gran))
		return true;

	dev_dbg(&cxlr->dev,
		"Addr trans fail: hpa 0x%llx not in expected chunk\n", hpa);

	return false;
}
3087 
/* Position placeholder for callers that have no position to validate */
#define CXL_POS_ZERO 0
/**
 * cxl_validate_translation_params - sanity check address translation inputs
 * @eiw: encoded interleave ways
 * @eig: encoded interleave granularity
 * @pos: position in interleave
 *
 * @eiw and @eig must decode successfully, and @pos must lie in the range
 * [0, ways) for the decoded number of ways.
 *
 * Callers pass CXL_POS_ZERO when no position parameter needs validating.
 *
 * Returns: 0 on success, -EINVAL on first invalid parameter
 */
int cxl_validate_translation_params(u8 eiw, u16 eig, int pos)
{
	int ways, gran;

	if (eiw_to_ways(eiw, &ways)) {
		pr_debug("%s: invalid eiw=%u\n", __func__, eiw);
		return -EINVAL;
	}
	/* gran is decoded only to validate @eig; its value is not used */
	if (eig_to_granularity(eig, &gran)) {
		pr_debug("%s: invalid eig=%u\n", __func__, eig);
		return -EINVAL;
	}
	if (pos < 0 || pos >= ways) {
		pr_debug("%s: invalid pos=%d for ways=%d\n", __func__, pos,
			 ways);
		return -EINVAL;
	}

	return 0;
}
EXPORT_SYMBOL_FOR_MODULES(cxl_validate_translation_params, "cxl_translate");
3120 
3121 u64 cxl_calculate_dpa_offset(u64 hpa_offset, u8 eiw, u16 eig)
3122 {
3123 	u64 dpa_offset, bits_lower, bits_upper, temp;
3124 	int ret;
3125 
3126 	ret = cxl_validate_translation_params(eiw, eig, CXL_POS_ZERO);
3127 	if (ret)
3128 		return ULLONG_MAX;
3129 
3130 	/*
3131 	 * DPA offset: CXL Spec 3.2 Section 8.2.4.20.13
3132 	 * Lower bits [IG+7:0] pass through unchanged
3133 	 * (eiw < 8)
3134 	 *	Per spec: DPAOffset[51:IG+8] = (HPAOffset[51:IG+IW+8] >> IW)
3135 	 *	Clear the position bits to isolate upper section, then
3136 	 *	reverse the left shift by eiw that occurred during DPA->HPA
3137 	 * (eiw >= 8)
3138 	 *	Per spec: DPAOffset[51:IG+8] = HPAOffset[51:IG+IW] / 3
3139 	 *	Extract upper bits from the correct bit range and divide by 3
3140 	 *	to recover the original DPA upper bits
3141 	 */
3142 	bits_lower = hpa_offset & GENMASK_ULL(eig + 7, 0);
3143 	if (eiw < 8) {
3144 		temp = hpa_offset &= ~GENMASK_ULL(eig + eiw + 8 - 1, 0);
3145 		dpa_offset = temp >> eiw;
3146 	} else {
3147 		bits_upper = div64_u64(hpa_offset >> (eig + eiw), 3);
3148 		dpa_offset = bits_upper << (eig + 8);
3149 	}
3150 	dpa_offset |= bits_lower;
3151 
3152 	return dpa_offset;
3153 }
3154 EXPORT_SYMBOL_FOR_MODULES(cxl_calculate_dpa_offset, "cxl_translate");
3155 
3156 int cxl_calculate_position(u64 hpa_offset, u8 eiw, u16 eig)
3157 {
3158 	int ways = 0;
3159 	u64 shifted, rem;
3160 	int pos, ret;
3161 
3162 	ret = cxl_validate_translation_params(eiw, eig, CXL_POS_ZERO);
3163 	if (ret)
3164 		return ret;
3165 
3166 	if (!eiw)
3167 		/* position is 0 if no interleaving */
3168 		return 0;
3169 
3170 	/*
3171 	 * Interleave position: CXL Spec 3.2 Section 8.2.4.20.13
3172 	 * eiw < 8
3173 	 *	Position is in the IW bits at HPA_OFFSET[IG+8+IW-1:IG+8].
3174 	 *	Per spec "remove IW bits starting with bit position IG+8"
3175 	 * eiw >= 8
3176 	 *	Position is not explicitly stored in HPA_OFFSET bits. It is
3177 	 *	derived from the modulo operation of the upper bits using
3178 	 *	the total number of interleave ways.
3179 	 */
3180 	if (eiw < 8) {
3181 		pos = (hpa_offset >> (eig + 8)) & GENMASK(eiw - 1, 0);
3182 	} else {
3183 		shifted = hpa_offset >> (eig + 8);
3184 		eiw_to_ways(eiw, &ways);
3185 		div64_u64_rem(shifted, ways, &rem);
3186 		pos = rem;
3187 	}
3188 
3189 	return pos;
3190 }
3191 EXPORT_SYMBOL_FOR_MODULES(cxl_calculate_position, "cxl_translate");
3192 
3193 u64 cxl_calculate_hpa_offset(u64 dpa_offset, int pos, u8 eiw, u16 eig)
3194 {
3195 	u64 mask_upper, hpa_offset, bits_upper;
3196 	int ret;
3197 
3198 	ret = cxl_validate_translation_params(eiw, eig, pos);
3199 	if (ret)
3200 		return ULLONG_MAX;
3201 
3202 	/*
3203 	 * The device position in the region interleave set was removed
3204 	 * from the offset at HPA->DPA translation. To reconstruct the
3205 	 * HPA, place the 'pos' in the offset.
3206 	 *
3207 	 * The placement of 'pos' in the HPA is determined by interleave
3208 	 * ways and granularity and is defined in the CXL Spec 3.0 Section
3209 	 * 8.2.4.19.13 Implementation Note: Device Decode Logic
3210 	 */
3211 
3212 	mask_upper = GENMASK_ULL(51, eig + 8);
3213 
3214 	if (eiw < 8) {
3215 		hpa_offset = (dpa_offset & mask_upper) << eiw;
3216 		hpa_offset |= pos << (eig + 8);
3217 	} else {
3218 		bits_upper = (dpa_offset & mask_upper) >> (eig + 8);
3219 		bits_upper = bits_upper * 3;
3220 		hpa_offset = ((bits_upper << (eiw - 8)) + pos) << (eig + 8);
3221 	}
3222 
3223 	/* The lower bits remain unchanged */
3224 	hpa_offset |= dpa_offset & GENMASK_ULL(eig + 7, 0);
3225 
3226 	return hpa_offset;
3227 }
3228 EXPORT_SYMBOL_FOR_MODULES(cxl_calculate_hpa_offset, "cxl_translate");
3229 
/*
 * decode_pos() splits a region position into its host-bridge and port parts
 *
 * @region_ways: total interleave ways of the region
 * @hb_ways: interleave ways across host bridges
 * @pos: position of the endpoint within the region
 * @pos_port: output, position within the host bridge
 * @pos_hb: output, position of the host bridge
 *
 * Returns 0 on success, -EINVAL for an unsupported ways combination, in
 * which case the outputs are left untouched.
 */
static int decode_pos(int region_ways, int hb_ways, int pos, int *pos_port,
		      int *pos_hb)
{
	int per_hb;
	int legal;

	/*
	 * Decode for 3-6-12 way interleaves as defined in the CXL
	 * Spec 4.0 9.13.1.1 Legal Interleaving Configurations.
	 * Region creation should already reject invalid combinations,
	 * the check here guards against a silent bad decode.
	 */
	if (hb_ways == 3)
		legal = region_ways == 3 || region_ways == 6 ||
			region_ways == 12;
	else if (hb_ways == 6)
		legal = region_ways == 6 || region_ways == 12;
	else if (hb_ways == 12)
		legal = region_ways == 12;
	else
		legal = 0;

	if (!legal)
		return -EINVAL;

	/*
	 * Every host bridge contributes the same number of endpoints and
	 * those endpoints are contiguous per host bridge. The remainder
	 * picks the port within a host bridge, the quotient picks the host
	 * bridge.
	 */
	per_hb = region_ways / hb_ways;
	*pos_port = pos % per_hb;
	*pos_hb = pos / per_hb;

	return 0;
}
3269 
3270 /*
3271  * restore_parent() reconstruct the address in parent
3272  *
3273  * This math, specifically the bitmask creation 'mask = gran - 1' relies
3274  * on the CXL Spec requirement that interleave granularity is always a
3275  * power of two.
3276  *
3277  * [mask]		isolate the offset with the granularity
3278  * [addr & ~mask]	remove the offset leaving the aligned portion
3279  * [* ways]		distribute across all interleave ways
3280  * [+ (pos * gran)]	add the positional offset
3281  * [+ (addr & mask)]	restore the masked offset
3282  */
3283 static u64 restore_parent(u64 addr, u64 pos, u64 gran, u64 ways)
3284 {
3285 	u64 mask = gran - 1;
3286 
3287 	return ((addr & ~mask) * ways) + (pos * gran) + (addr & mask);
3288 }
3289 
3290 /*
3291  * unaligned_dpa_to_hpa() translates a DPA to HPA when the region resource
3292  * start address is not aligned at Host Bridge Interleave Ways * 256MB.
3293  *
3294  * Unaligned start addresses only occur with MOD3 interleaves. All power-
3295  * of-two interleaves are guaranteed aligned.
3296  */
3297 static u64 unaligned_dpa_to_hpa(struct cxl_decoder *cxld,
3298 				struct cxl_region_params *p, int pos, u64 dpa)
3299 {
3300 	int ways_port = p->interleave_ways / cxld->interleave_ways;
3301 	int gran_port = p->interleave_granularity;
3302 	int gran_hb = cxld->interleave_granularity;
3303 	int ways_hb = cxld->interleave_ways;
3304 	int pos_port, pos_hb, gran_shift;
3305 	u64 hpa_port = 0;
3306 
3307 	/* Decode an endpoint 'pos' into port and host-bridge components */
3308 	if (decode_pos(p->interleave_ways, ways_hb, pos, &pos_port, &pos_hb)) {
3309 		dev_dbg(&cxld->dev, "not supported for region ways:%d\n",
3310 			p->interleave_ways);
3311 		return ULLONG_MAX;
3312 	}
3313 
3314 	/* Restore the port parent address if needed */
3315 	if (gran_hb != gran_port)
3316 		hpa_port = restore_parent(dpa, pos_port, gran_port, ways_port);
3317 	else
3318 		hpa_port = dpa;
3319 
3320 	/*
3321 	 * Complete the HPA reconstruction by restoring the address as if
3322 	 * each HB position is a candidate. Test against expected pos_hb
3323 	 * to confirm match.
3324 	 */
3325 	gran_shift = ilog2(gran_hb);
3326 	for (int position = 0; position < ways_hb; position++) {
3327 		u64 shifted, hpa;
3328 
3329 		hpa = restore_parent(hpa_port, position, gran_hb, ways_hb);
3330 		hpa += p->res->start;
3331 
3332 		shifted = hpa >> gran_shift;
3333 		if (do_div(shifted, ways_hb) == pos_hb)
3334 			return hpa;
3335 	}
3336 
3337 	dev_dbg(&cxld->dev, "fail dpa:%#llx region:%pr pos:%d\n", dpa, p->res,
3338 		pos);
3339 	dev_dbg(&cxld->dev, "     port-w/g/p:%d/%d/%d hb-w/g/p:%d/%d/%d\n",
3340 		ways_port, gran_port, pos_port, ways_hb, gran_hb, pos_hb);
3341 
3342 	return ULLONG_MAX;
3343 }
3344 
3345 static bool region_is_unaligned_mod3(struct cxl_region *cxlr)
3346 {
3347 	struct cxl_root_decoder *cxlrd = to_cxl_root_decoder(cxlr->dev.parent);
3348 	struct cxl_decoder *cxld = &cxlrd->cxlsd.cxld;
3349 	struct cxl_region_params *p = &cxlr->params;
3350 	int hbiw = cxld->interleave_ways;
3351 	u64 rem;
3352 
3353 	if (is_power_of_2(hbiw))
3354 		return false;
3355 
3356 	div64_u64_rem(p->res->start, (u64)hbiw * SZ_256M, &rem);
3357 
3358 	return (rem != 0);
3359 }
3360 
3361 u64 cxl_dpa_to_hpa(struct cxl_region *cxlr, const struct cxl_memdev *cxlmd,
3362 		   u64 dpa)
3363 {
3364 	struct cxl_root_decoder *cxlrd = cxlr->cxlrd;
3365 	struct cxl_decoder *cxld = &cxlrd->cxlsd.cxld;
3366 	struct cxl_region_params *p = &cxlr->params;
3367 	struct cxl_endpoint_decoder *cxled = NULL;
3368 	u64 base, dpa_offset, hpa_offset, hpa;
3369 	bool unaligned = false;
3370 	u16 eig = 0;
3371 	u8 eiw = 0;
3372 	int pos;
3373 
3374 	/*
3375 	 * Conversion between SPA and DPA is not supported in
3376 	 * Normalized Address mode.
3377 	 */
3378 	if (test_bit(CXL_REGION_F_NORMALIZED_ADDRESSING, &cxlr->flags))
3379 		return ULLONG_MAX;
3380 
3381 	for (int i = 0; i < p->nr_targets; i++) {
3382 		if (cxlmd == cxled_to_memdev(p->targets[i])) {
3383 			cxled = p->targets[i];
3384 			break;
3385 		}
3386 	}
3387 	if (!cxled)
3388 		return ULLONG_MAX;
3389 
3390 	base = cxl_dpa_resource_start(cxled);
3391 	if (base == RESOURCE_SIZE_MAX)
3392 		return ULLONG_MAX;
3393 
3394 	dpa_offset = dpa - base;
3395 
3396 	/* Unaligned calc for MOD3 interleaves not hbiw * 256MB aligned */
3397 	unaligned = region_is_unaligned_mod3(cxlr);
3398 	if (unaligned) {
3399 		hpa = unaligned_dpa_to_hpa(cxld, p, cxled->pos, dpa_offset);
3400 		if (hpa == ULLONG_MAX)
3401 			return ULLONG_MAX;
3402 
3403 		goto skip_aligned;
3404 	}
3405 	/*
3406 	 * Aligned calc for all power-of-2 interleaves and for MOD3
3407 	 * interleaves that are aligned at hbiw * 256MB
3408 	 */
3409 	pos = cxled->pos;
3410 	ways_to_eiw(p->interleave_ways, &eiw);
3411 	granularity_to_eig(p->interleave_granularity, &eig);
3412 
3413 	hpa_offset = cxl_calculate_hpa_offset(dpa_offset, pos, eiw, eig);
3414 	if (hpa_offset == ULLONG_MAX)
3415 		return ULLONG_MAX;
3416 
3417 	/* Apply the hpa_offset to the region base address */
3418 	hpa = hpa_offset + p->res->start;
3419 
3420 skip_aligned:
3421 	hpa += p->cache_size;
3422 
3423 	/* Root decoder translation overrides typical modulo decode */
3424 	if (cxlrd->ops.hpa_to_spa)
3425 		hpa = cxlrd->ops.hpa_to_spa(cxlrd, hpa);
3426 
3427 	if (hpa == ULLONG_MAX)
3428 		return ULLONG_MAX;
3429 
3430 	if (!cxl_resource_contains_addr(p->res, hpa)) {
3431 		dev_dbg(&cxlr->dev,
3432 			"Addr trans fail: hpa 0x%llx not in region\n", hpa);
3433 		return ULLONG_MAX;
3434 	}
3435 	/* Chunk check applies to aligned modulo decodes only */
3436 	if (!unaligned && !cxlrd->ops.hpa_to_spa &&
3437 	    !cxl_is_hpa_in_chunk(hpa, cxlr, pos))
3438 		return ULLONG_MAX;
3439 
3440 	return hpa;
3441 }
3442 
/* Outcome of resolving a region offset: a memdev and a DPA */
struct dpa_result {
	struct cxl_memdev *cxlmd;	/* memdev the offset resolved to */
	u64 dpa;			/* device physical address */
};
3447 
3448 static int unaligned_region_offset_to_dpa_result(struct cxl_region *cxlr,
3449 						 u64 offset,
3450 						 struct dpa_result *result)
3451 {
3452 	struct cxl_root_decoder *cxlrd = to_cxl_root_decoder(cxlr->dev.parent);
3453 	struct cxl_decoder *cxld = &cxlrd->cxlsd.cxld;
3454 	struct cxl_region_params *p = &cxlr->params;
3455 	u64 interleave_width, interleave_index;
3456 	u64 gran, gran_offset, dpa_offset;
3457 	u64 hpa = p->res->start + offset;
3458 	u64 tmp = offset;
3459 
3460 	/*
3461 	 * Unaligned addresses are not algebraically invertible. Calculate
3462 	 * a dpa_offset independent of the target device and then enumerate
3463 	 * and test that dpa_offset against each candidate endpoint decoder.
3464 	 */
3465 	gran = cxld->interleave_granularity;
3466 	interleave_width = gran * cxld->interleave_ways;
3467 	interleave_index = div64_u64(offset, interleave_width);
3468 	gran_offset = do_div(tmp, gran);
3469 
3470 	dpa_offset = interleave_index * gran + gran_offset;
3471 
3472 	for (int i = 0; i < p->nr_targets; i++) {
3473 		struct cxl_endpoint_decoder *cxled = p->targets[i];
3474 		int pos = cxled->pos;
3475 		u64 test_hpa;
3476 
3477 		test_hpa = unaligned_dpa_to_hpa(cxld, p, pos, dpa_offset);
3478 		if (test_hpa == hpa) {
3479 			result->cxlmd = cxled_to_memdev(cxled);
3480 			result->dpa =
3481 				cxl_dpa_resource_start(cxled) + dpa_offset;
3482 			return 0;
3483 		}
3484 	}
3485 	dev_err(&cxlr->dev,
3486 		"failed to resolve HPA %#llx in unaligned MOD3 region\n", hpa);
3487 
3488 	return -ENXIO;
3489 }
3490 
3491 static int region_offset_to_dpa_result(struct cxl_region *cxlr, u64 offset,
3492 				       struct dpa_result *result)
3493 {
3494 	struct cxl_region_params *p = &cxlr->params;
3495 	struct cxl_root_decoder *cxlrd = cxlr->cxlrd;
3496 	struct cxl_endpoint_decoder *cxled;
3497 	u64 hpa_offset = offset;
3498 	u64 dpa, dpa_offset;
3499 	u16 eig = 0;
3500 	u8 eiw = 0;
3501 	int pos;
3502 
3503 	lockdep_assert_held(&cxl_rwsem.region);
3504 	lockdep_assert_held(&cxl_rwsem.dpa);
3505 
3506 	/* Input validation ensures valid ways and gran */
3507 	granularity_to_eig(p->interleave_granularity, &eig);
3508 	ways_to_eiw(p->interleave_ways, &eiw);
3509 
3510 	/*
3511 	 * If the root decoder has SPA to CXL HPA callback, use it. Otherwise
3512 	 * CXL HPA is assumed to equal SPA.
3513 	 */
3514 	if (cxlrd->ops.spa_to_hpa) {
3515 		hpa_offset = cxlrd->ops.spa_to_hpa(cxlrd, p->res->start + offset);
3516 		if (hpa_offset == ULLONG_MAX) {
3517 			dev_dbg(&cxlr->dev, "HPA not found for %pr offset %#llx\n",
3518 				p->res, offset);
3519 			return -ENXIO;
3520 		}
3521 		hpa_offset -= p->res->start;
3522 	}
3523 
3524 	if (region_is_unaligned_mod3(cxlr))
3525 		return unaligned_region_offset_to_dpa_result(cxlr, offset,
3526 							     result);
3527 
3528 	pos = cxl_calculate_position(hpa_offset, eiw, eig);
3529 	if (pos < 0 || pos >= p->nr_targets) {
3530 		dev_dbg(&cxlr->dev, "Invalid position %d for %d targets\n",
3531 			pos, p->nr_targets);
3532 		return -ENXIO;
3533 	}
3534 
3535 	dpa_offset = cxl_calculate_dpa_offset(hpa_offset, eiw, eig);
3536 
3537 	/* Look-up and return the result: a memdev and a DPA */
3538 	for (int i = 0; i < p->nr_targets; i++) {
3539 		cxled = p->targets[i];
3540 		if (cxled->pos != pos)
3541 			continue;
3542 
3543 		dpa = cxl_dpa_resource_start(cxled);
3544 		if (dpa != RESOURCE_SIZE_MAX)
3545 			dpa += dpa_offset;
3546 
3547 		result->cxlmd = cxled_to_memdev(cxled);
3548 		result->dpa = dpa;
3549 
3550 		return 0;
3551 	}
3552 	dev_err(&cxlr->dev, "No device found for position %d\n", pos);
3553 
3554 	return -ENXIO;
3555 }
3556 
3557 static int match_root_decoder(struct device *dev, const void *data)
3558 {
3559 	const struct range *r1, *r2 = data;
3560 	struct cxl_root_decoder *cxlrd;
3561 
3562 	if (!is_root_decoder(dev))
3563 		return 0;
3564 
3565 	cxlrd = to_cxl_root_decoder(dev);
3566 	r1 = &cxlrd->cxlsd.cxld.hpa_range;
3567 
3568 	return range_contains(r1, r2);
3569 }
3570 
3571 static int cxl_root_setup_translation(struct cxl_root *cxl_root,
3572 				      struct cxl_region_context *ctx)
3573 {
3574 	if (!cxl_root->ops.translation_setup_root)
3575 		return 0;
3576 
3577 	return cxl_root->ops.translation_setup_root(cxl_root, ctx);
3578 }
3579 
3580 /*
3581  * Note, when finished with the device, drop the reference with
3582  * put_device() or use the put_cxl_root_decoder helper.
3583  */
3584 static struct cxl_root_decoder *
3585 get_cxl_root_decoder(struct cxl_endpoint_decoder *cxled,
3586 		     struct cxl_region_context *ctx)
3587 {
3588 	struct cxl_memdev *cxlmd = cxled_to_memdev(cxled);
3589 	struct cxl_port *port = cxled_to_port(cxled);
3590 	struct cxl_root *cxl_root __free(put_cxl_root) = find_cxl_root(port);
3591 	struct device *cxlrd_dev;
3592 	int rc;
3593 
3594 	/*
3595 	 * Adjust the endpoint's HPA range and interleaving
3596 	 * configuration to the root decoder’s memory space before
3597 	 * setting up the root decoder.
3598 	 */
3599 	rc = cxl_root_setup_translation(cxl_root, ctx);
3600 	if (rc) {
3601 		dev_err(cxlmd->dev.parent,
3602 			"%s:%s Failed to setup translation for address range %#llx:%#llx\n",
3603 			dev_name(&cxlmd->dev), dev_name(&cxled->cxld.dev),
3604 			ctx->hpa_range.start, ctx->hpa_range.end);
3605 		return ERR_PTR(rc);
3606 	}
3607 
3608 	cxlrd_dev = device_find_child(&cxl_root->port.dev, &ctx->hpa_range,
3609 				      match_root_decoder);
3610 	if (!cxlrd_dev) {
3611 		dev_err(cxlmd->dev.parent,
3612 			"%s:%s no CXL window for range %#llx:%#llx\n",
3613 			dev_name(&cxlmd->dev), dev_name(&cxled->cxld.dev),
3614 			ctx->hpa_range.start, ctx->hpa_range.end);
3615 		return ERR_PTR(-ENXIO);
3616 	}
3617 
3618 	return to_cxl_root_decoder(cxlrd_dev);
3619 }
3620 
3621 static int match_region_by_range(struct device *dev, const void *data)
3622 {
3623 	struct cxl_region_params *p;
3624 	struct cxl_region *cxlr;
3625 	const struct range *r = data;
3626 
3627 	if (!is_cxl_region(dev))
3628 		return 0;
3629 
3630 	cxlr = to_cxl_region(dev);
3631 	p = &cxlr->params;
3632 
3633 	guard(rwsem_read)(&cxl_rwsem.region);
3634 	return spa_maps_hpa(p, r);
3635 }
3636 
3637 static int cxl_extended_linear_cache_resize(struct cxl_region *cxlr,
3638 					    struct resource *res)
3639 {
3640 	struct cxl_root_decoder *cxlrd = cxlr->cxlrd;
3641 	struct cxl_region_params *p = &cxlr->params;
3642 	resource_size_t size = resource_size(res);
3643 	resource_size_t cache_size, start;
3644 
3645 	cache_size = cxlrd->cache_size;
3646 	if (!cache_size)
3647 		return 0;
3648 
3649 	if (size != cache_size) {
3650 		dev_warn(&cxlr->dev,
3651 			 "Extended Linear Cache size %pa != CXL size %pa. No Support!",
3652 			 &cache_size, &size);
3653 		return -ENXIO;
3654 	}
3655 
3656 	/*
3657 	 * Move the start of the range to where the cache range starts. The
3658 	 * implementation assumes that the cache range is in front of the
3659 	 * CXL range. This is not dictated by the HMAT spec but is how the
3660 	 * current known implementation is configured.
3661 	 *
3662 	 * The cache range is expected to be within the CFMWS. The adjusted
3663 	 * res->start should not be less than cxlrd->res->start.
3664 	 */
3665 	start = res->start - cache_size;
3666 	if (start < cxlrd->res->start)
3667 		return -ENXIO;
3668 
3669 	res->start = start;
3670 	p->cache_size = cache_size;
3671 
3672 	return 0;
3673 }
3674 
/*
 * Populate a freshly created region from the auto-discovered decoder
 * settings in @ctx: mark it CXL_REGION_F_AUTO, allocate and insert its
 * resource under the root decoder, copy the interleave parameters and move
 * it to CXL_CONFIG_INTERLEAVE_ACTIVE.
 *
 * Runs with cxl_rwsem.region held for write for the whole function.
 *
 * Returns 0 on success with an extra reference taken on the region device,
 * -EBUSY when the region is already at or beyond
 * CXL_CONFIG_INTERLEAVE_ACTIVE, -ENOMEM on allocation failure, or the error
 * from sysfs_update_group().
 */
static int __construct_region(struct cxl_region *cxlr,
			      struct cxl_region_context *ctx)
{
	struct cxl_endpoint_decoder *cxled = ctx->cxled;
	struct cxl_root_decoder *cxlrd = cxlr->cxlrd;
	struct cxl_memdev *cxlmd = cxled_to_memdev(cxled);
	struct range *hpa_range = &ctx->hpa_range;
	struct cxl_region_params *p;
	struct resource *res;
	int rc;

	guard(rwsem_write)(&cxl_rwsem.region);
	p = &cxlr->params;
	/* Someone else configured this region before the lock was taken */
	if (p->state >= CXL_CONFIG_INTERLEAVE_ACTIVE) {
		dev_err(cxlmd->dev.parent,
			"%s:%s: %s autodiscovery interrupted\n",
			dev_name(&cxlmd->dev), dev_name(&cxled->cxld.dev),
			__func__);
		return -EBUSY;
	}

	set_bit(CXL_REGION_F_AUTO, &cxlr->flags);
	cxlr->hpa_range = *hpa_range;

	res = kmalloc_obj(*res);
	if (!res)
		return -ENOMEM;

	*res = DEFINE_RES_MEM_NAMED(hpa_range->start, range_len(hpa_range),
				    dev_name(&cxlr->dev));

	/* May move res->start down to cover an extended linear cache */
	rc = cxl_extended_linear_cache_resize(cxlr, res);
	if (rc && rc != -EOPNOTSUPP) {
		/*
		 * Failing to support extended linear cache region resize does not
		 * prevent the region from functioning. Only causes cxl list showing
		 * incorrect region size.
		 */
		dev_warn(cxlmd->dev.parent,
			 "Extended linear cache calculation failed rc:%d\n", rc);
	}

	rc = sysfs_update_group(&cxlr->dev.kobj, &cxl_region_group);
	if (rc) {
		/* res is not yet owned by the region, free it here */
		kfree(res);
		return rc;
	}

	rc = insert_resource(cxlrd->res, res);
	if (rc) {
		/*
		 * Platform-firmware may not have split resources like "System
		 * RAM" on CXL window boundaries see cxl_region_iomem_release()
		 */
		dev_warn(cxlmd->dev.parent,
			 "%s:%s: %s %s cannot insert resource\n",
			 dev_name(&cxlmd->dev), dev_name(&cxled->cxld.dev),
			 __func__, dev_name(&cxlr->dev));
	}

	/* An insert failure is only warned about, the region keeps res */
	p->res = res;
	p->interleave_ways = ctx->interleave_ways;
	p->interleave_granularity = ctx->interleave_granularity;
	p->state = CXL_CONFIG_INTERLEAVE_ACTIVE;

	rc = sysfs_update_group(&cxlr->dev.kobj, get_cxl_region_target_group());
	if (rc)
		return rc;

	dev_dbg(cxlmd->dev.parent, "%s:%s: %s %s res: %pr iw: %d ig: %d\n",
		dev_name(&cxlmd->dev), dev_name(&cxled->cxld.dev), __func__,
		dev_name(&cxlr->dev), p->res, p->interleave_ways,
		p->interleave_granularity);

	/* ...to match put_device() in cxl_add_to_region() */
	get_device(&cxlr->dev);

	return 0;
}
3754 
3755 /* Establish an empty region covering the given HPA range */
3756 static struct cxl_region *construct_region(struct cxl_root_decoder *cxlrd,
3757 					   struct cxl_region_context *ctx)
3758 {
3759 	struct cxl_endpoint_decoder *cxled = ctx->cxled;
3760 	struct cxl_memdev *cxlmd = cxled_to_memdev(cxled);
3761 	struct cxl_dev_state *cxlds = cxlmd->cxlds;
3762 	int rc, part = READ_ONCE(cxled->part);
3763 	struct cxl_region *cxlr;
3764 
3765 	if (part < 0)
3766 		return ERR_PTR(-EBUSY);
3767 
3768 	do {
3769 		cxlr = __create_region(cxlrd, cxlds->part[part].mode,
3770 				       atomic_read(&cxlrd->region_id),
3771 				       cxled->cxld.target_type);
3772 	} while (IS_ERR(cxlr) && PTR_ERR(cxlr) == -EBUSY);
3773 
3774 	if (IS_ERR(cxlr)) {
3775 		dev_err(cxlmd->dev.parent,
3776 			"%s:%s: %s failed assign region: %ld\n",
3777 			dev_name(&cxlmd->dev), dev_name(&cxled->cxld.dev),
3778 			__func__, PTR_ERR(cxlr));
3779 		return cxlr;
3780 	}
3781 
3782 	rc = __construct_region(cxlr, ctx);
3783 	if (rc) {
3784 		unregister_region(cxlr);
3785 		return ERR_PTR(rc);
3786 	}
3787 
3788 	return cxlr;
3789 }
3790 
3791 static struct cxl_region *
3792 cxl_find_region_by_range(struct cxl_root_decoder *cxlrd,
3793 			 struct range *hpa_range)
3794 {
3795 	struct device *region_dev;
3796 
3797 	region_dev = device_find_child(&cxlrd->cxlsd.cxld.dev, hpa_range,
3798 				       match_region_by_range);
3799 	if (!region_dev)
3800 		return NULL;
3801 
3802 	return to_cxl_region(region_dev);
3803 }
3804 
3805 int cxl_add_to_region(struct cxl_endpoint_decoder *cxled)
3806 {
3807 	struct cxl_region_context ctx;
3808 	struct cxl_region_params *p;
3809 	bool attach = false;
3810 	int rc;
3811 
3812 	ctx = (struct cxl_region_context) {
3813 		.cxled = cxled,
3814 		.hpa_range = cxled->cxld.hpa_range,
3815 		.interleave_ways = cxled->cxld.interleave_ways,
3816 		.interleave_granularity = cxled->cxld.interleave_granularity,
3817 	};
3818 
3819 	struct cxl_root_decoder *cxlrd __free(put_cxl_root_decoder) =
3820 		get_cxl_root_decoder(cxled, &ctx);
3821 
3822 	if (IS_ERR(cxlrd))
3823 		return PTR_ERR(cxlrd);
3824 
3825 	/*
3826 	 * Ensure that, if multiple threads race to construct_region()
3827 	 * for the HPA range, one does the construction and the others
3828 	 * add to that.
3829 	 */
3830 	guard(mutex)(&cxlrd->regions_lock);
3831 	struct cxl_region *cxlr __free(put_cxl_region) =
3832 		cxl_find_region_by_range(cxlrd, &ctx.hpa_range);
3833 	if (!cxlr)
3834 		cxlr = construct_region(cxlrd, &ctx);
3835 
3836 	rc = PTR_ERR_OR_ZERO(cxlr);
3837 	if (rc)
3838 		return rc;
3839 
3840 	attach_target(cxlr, cxled, -1, TASK_UNINTERRUPTIBLE);
3841 
3842 	scoped_guard(rwsem_read, &cxl_rwsem.region) {
3843 		p = &cxlr->params;
3844 		attach = p->state == CXL_CONFIG_COMMIT;
3845 	}
3846 
3847 	if (attach) {
3848 		/*
3849 		 * If device_attach() fails the range may still be active via
3850 		 * the platform-firmware memory map, otherwise the driver for
3851 		 * regions is local to this file, so driver matching can't fail.
3852 		 */
3853 		if (device_attach(&cxlr->dev) < 0)
3854 			dev_err(&cxlr->dev, "failed to enable, range: %pr\n",
3855 				p->res);
3856 	}
3857 
3858 	return rc;
3859 }
3860 EXPORT_SYMBOL_NS_GPL(cxl_add_to_region, "CXL");
3861 
3862 u64 cxl_port_get_spa_cache_alias(struct cxl_port *endpoint, u64 spa)
3863 {
3864 	struct cxl_region_ref *iter;
3865 	unsigned long index;
3866 
3867 	if (!endpoint)
3868 		return ~0ULL;
3869 
3870 	guard(rwsem_write)(&cxl_rwsem.region);
3871 
3872 	xa_for_each(&endpoint->regions, index, iter) {
3873 		struct cxl_region_params *p = &iter->region->params;
3874 
3875 		if (cxl_resource_contains_addr(p->res, spa)) {
3876 			if (!p->cache_size)
3877 				return ~0ULL;
3878 
3879 			if (spa >= p->res->start + p->cache_size)
3880 				return spa - p->cache_size;
3881 
3882 			return spa + p->cache_size;
3883 		}
3884 	}
3885 
3886 	return ~0ULL;
3887 }
3888 EXPORT_SYMBOL_NS_GPL(cxl_port_get_spa_cache_alias, "CXL");
3889 
3890 static int is_system_ram(struct resource *res, void *arg)
3891 {
3892 	struct cxl_region *cxlr = arg;
3893 	struct cxl_region_params *p = &cxlr->params;
3894 
3895 	dev_dbg(&cxlr->dev, "%pr has System RAM: %pr\n", p->res, res);
3896 	return 1;
3897 }
3898 
3899 static void shutdown_notifiers(void *_cxlr)
3900 {
3901 	struct cxl_region *cxlr = _cxlr;
3902 
3903 	unregister_node_notifier(&cxlr->node_notifier);
3904 	unregister_mt_adistance_algorithm(&cxlr->adist_notifier);
3905 }
3906 
/* Action callback, @data is the debugfs dentry to remove recursively */
static void remove_debugfs(void *data)
{
	struct dentry *dir = data;

	debugfs_remove_recursive(dir);
}
3911 
3912 static int validate_region_offset(struct cxl_region *cxlr, u64 offset)
3913 {
3914 	struct cxl_region_params *p = &cxlr->params;
3915 	resource_size_t region_size;
3916 	u64 hpa;
3917 
3918 	if (offset < p->cache_size) {
3919 		dev_err(&cxlr->dev,
3920 			"Offset %#llx is within extended linear cache %pa\n",
3921 			offset, &p->cache_size);
3922 		return -EINVAL;
3923 	}
3924 
3925 	region_size = resource_size(p->res);
3926 	if (offset >= region_size) {
3927 		dev_err(&cxlr->dev, "Offset %#llx exceeds region size %pa\n",
3928 			offset, &region_size);
3929 		return -EINVAL;
3930 	}
3931 
3932 	hpa = p->res->start + offset;
3933 	if (hpa < p->res->start || hpa > p->res->end) {
3934 		dev_err(&cxlr->dev, "HPA %#llx not in region %pr\n", hpa,
3935 			p->res);
3936 		return -EINVAL;
3937 	}
3938 
3939 	return 0;
3940 }
3941 
3942 static int cxl_region_debugfs_poison_inject(void *data, u64 offset)
3943 {
3944 	struct dpa_result result = { .dpa = ULLONG_MAX, .cxlmd = NULL };
3945 	struct cxl_region *cxlr = data;
3946 	int rc;
3947 
3948 	ACQUIRE(rwsem_read_intr, region_rwsem)(&cxl_rwsem.region);
3949 	if ((rc = ACQUIRE_ERR(rwsem_read_intr, &region_rwsem)))
3950 		return rc;
3951 
3952 	ACQUIRE(rwsem_read_intr, dpa_rwsem)(&cxl_rwsem.dpa);
3953 	if ((rc = ACQUIRE_ERR(rwsem_read_intr, &dpa_rwsem)))
3954 		return rc;
3955 
3956 	if (validate_region_offset(cxlr, offset))
3957 		return -EINVAL;
3958 
3959 	offset -= cxlr->params.cache_size;
3960 	rc = region_offset_to_dpa_result(cxlr, offset, &result);
3961 	if (rc || !result.cxlmd || result.dpa == ULLONG_MAX) {
3962 		dev_dbg(&cxlr->dev,
3963 			"Failed to resolve DPA for region offset %#llx rc %d\n",
3964 			offset, rc);
3965 
3966 		return rc ? rc : -EINVAL;
3967 	}
3968 
3969 	return cxl_inject_poison_locked(result.cxlmd, result.dpa);
3970 }
3971 
3972 DEFINE_DEBUGFS_ATTRIBUTE(cxl_poison_inject_fops, NULL,
3973 			 cxl_region_debugfs_poison_inject, "%llx\n");
3974 
3975 static int cxl_region_debugfs_poison_clear(void *data, u64 offset)
3976 {
3977 	struct dpa_result result = { .dpa = ULLONG_MAX, .cxlmd = NULL };
3978 	struct cxl_region *cxlr = data;
3979 	int rc;
3980 
3981 	ACQUIRE(rwsem_read_intr, region_rwsem)(&cxl_rwsem.region);
3982 	if ((rc = ACQUIRE_ERR(rwsem_read_intr, &region_rwsem)))
3983 		return rc;
3984 
3985 	ACQUIRE(rwsem_read_intr, dpa_rwsem)(&cxl_rwsem.dpa);
3986 	if ((rc = ACQUIRE_ERR(rwsem_read_intr, &dpa_rwsem)))
3987 		return rc;
3988 
3989 	if (validate_region_offset(cxlr, offset))
3990 		return -EINVAL;
3991 
3992 	offset -= cxlr->params.cache_size;
3993 	rc = region_offset_to_dpa_result(cxlr, offset, &result);
3994 	if (rc || !result.cxlmd || result.dpa == ULLONG_MAX) {
3995 		dev_dbg(&cxlr->dev,
3996 			"Failed to resolve DPA for region offset %#llx rc %d\n",
3997 			offset, rc);
3998 
3999 		return rc ? rc : -EINVAL;
4000 	}
4001 
4002 	return cxl_clear_poison_locked(result.cxlmd, result.dpa);
4003 }
4004 
4005 DEFINE_DEBUGFS_ATTRIBUTE(cxl_poison_clear_fops, NULL,
4006 			 cxl_region_debugfs_poison_clear, "%llx\n");
4007 
4008 static int cxl_region_setup_poison(struct cxl_region *cxlr)
4009 {
4010 	struct device *dev = &cxlr->dev;
4011 	struct cxl_region_params *p = &cxlr->params;
4012 	struct dentry *dentry;
4013 
4014 	/*
4015 	 * Do not enable poison injection in Normalized Address mode.
4016 	 * Conversion between SPA and DPA is required for this, but it is
4017 	 * not supported in this mode.
4018 	 */
4019 	if (test_bit(CXL_REGION_F_NORMALIZED_ADDRESSING, &cxlr->flags))
4020 		return 0;
4021 
4022 	/* Create poison attributes if all memdevs support the capabilities */
4023 	for (int i = 0; i < p->nr_targets; i++) {
4024 		struct cxl_endpoint_decoder *cxled = p->targets[i];
4025 		struct cxl_memdev *cxlmd = cxled_to_memdev(cxled);
4026 
4027 		if (!cxl_memdev_has_poison_cmd(cxlmd, CXL_POISON_ENABLED_INJECT) ||
4028 		    !cxl_memdev_has_poison_cmd(cxlmd, CXL_POISON_ENABLED_CLEAR))
4029 			return 0;
4030 	}
4031 
4032 	dentry = cxl_debugfs_create_dir(dev_name(dev));
4033 	debugfs_create_file("inject_poison", 0200, dentry, cxlr,
4034 			    &cxl_poison_inject_fops);
4035 	debugfs_create_file("clear_poison", 0200, dentry, cxlr,
4036 			    &cxl_poison_clear_fops);
4037 
4038 	return devm_add_action_or_reset(dev, remove_debugfs, dentry);
4039 }
4040 
4041 static int region_contains_resource(struct device *dev, const void *data)
4042 {
4043 	const struct resource *res = data;
4044 	struct cxl_region *cxlr;
4045 	struct cxl_region_params *p;
4046 
4047 	if (!is_cxl_region(dev))
4048 		return 0;
4049 
4050 	cxlr = to_cxl_region(dev);
4051 	p = &cxlr->params;
4052 
4053 	if (p->state != CXL_CONFIG_COMMIT)
4054 		return 0;
4055 
4056 	if (!p->res)
4057 		return 0;
4058 
4059 	return resource_contains(p->res, res) ? 1 : 0;
4060 }
4061 
4062 bool cxl_region_contains_resource(const struct resource *res)
4063 {
4064 	guard(rwsem_read)(&cxl_rwsem.region);
4065 	struct device *dev __free(put_device) = bus_find_device(
4066 		&cxl_bus_type, NULL, res, region_contains_resource);
4067 	return !!dev;
4068 }
4069 EXPORT_SYMBOL_FOR_MODULES(cxl_region_contains_resource, "dax_hmem");
4070 
4071 static int cxl_region_can_probe(struct cxl_region *cxlr)
4072 {
4073 	struct cxl_region_params *p = &cxlr->params;
4074 	int rc;
4075 
4076 	ACQUIRE(rwsem_read_intr, rwsem)(&cxl_rwsem.region);
4077 	if ((rc = ACQUIRE_ERR(rwsem_read_intr, &rwsem))) {
4078 		dev_dbg(&cxlr->dev, "probe interrupted\n");
4079 		return rc;
4080 	}
4081 
4082 	if (p->state < CXL_CONFIG_COMMIT) {
4083 		dev_dbg(&cxlr->dev, "config state: %d\n", p->state);
4084 		return -ENXIO;
4085 	}
4086 
4087 	if (test_bit(CXL_REGION_F_NEEDS_RESET, &cxlr->flags)) {
4088 		dev_err(&cxlr->dev,
4089 			"failed to activate, re-commit region and retry\n");
4090 		return -ENXIO;
4091 	}
4092 
4093 	return 0;
4094 }
4095 
4096 static int first_mapped_decoder(struct device *dev, const void *data)
4097 {
4098 	struct cxl_endpoint_decoder *cxled;
4099 
4100 	if (!is_endpoint_decoder(dev))
4101 		return 0;
4102 
4103 	cxled = to_cxl_endpoint_decoder(dev);
4104 	if (cxled->cxld.region)
4105 		return 1;
4106 
4107 	return 0;
4108 }
4109 
/*
 * Runs in cxl_mem_probe context after successful endpoint probe, assumes the
 * simple case of single mapped decoder per memdev.
 *
 * Finds the first endpoint decoder of @cxlmd's endpoint that has a region,
 * requires that region to be committed and to have at most one target,
 * registers endpoint_unregister_region() as a devm action on the endpoint
 * and records the region's resource range in the memdev's attach context.
 *
 * Returns 0 on success, -ENXIO when the endpoint has no driver or no
 * suitable region is found, or the error from devm_add_action_or_reset().
 */
int cxl_memdev_attach_region(struct cxl_memdev *cxlmd)
{
	struct cxl_attach_region *attach =
		container_of(cxlmd->attach, typeof(*attach), attach);
	struct cxl_port *endpoint = cxlmd->endpoint;
	struct cxl_endpoint_decoder *cxled;
	struct cxl_region *cxlr;
	int rc;

	/* hold endpoint lock to setup autoremove of the region */
	guard(device)(&endpoint->dev);
	if (!endpoint->dev.driver)
		return -ENXIO;
	/* Both locks stay held for read until the function returns */
	guard(rwsem_read)(&cxl_rwsem.region);
	guard(rwsem_read)(&cxl_rwsem.dpa);

	/*
	 * TODO auto-instantiate a region, for now assume this will find an
	 * auto-region
	 */
	struct device *dev __free(put_device) =
		device_find_child(&endpoint->dev, NULL, first_mapped_decoder);

	if (!dev) {
		dev_dbg(cxlmd->cxlds->dev, "no region found for memdev %s\n",
			dev_name(&cxlmd->dev));
		return -ENXIO;
	}

	cxled = to_cxl_endpoint_decoder(dev);
	cxlr = cxled->cxld.region;

	if (cxlr->params.state < CXL_CONFIG_COMMIT) {
		dev_dbg(cxlmd->cxlds->dev,
			"region %s not committed for memdev %s\n",
			dev_name(&cxlr->dev), dev_name(&cxlmd->dev));
		return -ENXIO;
	}

	if (cxlr->params.nr_targets > 1) {
		dev_dbg(cxlmd->cxlds->dev,
			"Only attach to local non-interleaved region\n");
		return -ENXIO;
	}

	/* Only teardown regions that pass validation, ignore the rest */
	/* Reference is taken on behalf of the devm action registered below */
	get_device(&cxlr->dev);
	rc = devm_add_action_or_reset(&endpoint->dev,
				      endpoint_unregister_region, cxlr);
	if (rc)
		return rc;

	/* Report the region's address range back through the attach context */
	attach->hpa_range = (struct range) {
		.start = cxlr->params.res->start,
		.end = cxlr->params.res->end,
	};
	return 0;
}
EXPORT_SYMBOL_FOR_MODULES(cxl_memdev_attach_region, "cxl_mem");
4173 
4174 /*
4175  * The presence of an attach method indicates that the region is designated for
4176  * a purpose outside of CXL core memory expansion defaults.
4177  */
4178 static bool cxl_region_has_memdev_attach(struct cxl_region *cxlr)
4179 {
4180 	struct cxl_region_params *p = &cxlr->params;
4181 
4182 	for (int i = 0; i < p->nr_targets; i++) {
4183 		struct cxl_endpoint_decoder *cxled = p->targets[i];
4184 		struct cxl_memdev *cxlmd = cxled_to_memdev(cxled);
4185 
4186 		if (cxlmd->attach)
4187 			return true;
4188 	}
4189 
4190 	return false;
4191 }
4192 
4193 static int cxl_region_probe(struct device *dev)
4194 {
4195 	struct cxl_region *cxlr = to_cxl_region(dev);
4196 	struct cxl_region_params *p = &cxlr->params;
4197 	int rc;
4198 
4199 	rc = cxl_region_can_probe(cxlr);
4200 	if (rc)
4201 		return rc;
4202 
4203 	/*
4204 	 * From this point on any path that changes the region's state away from
4205 	 * CXL_CONFIG_COMMIT is also responsible for releasing the driver.
4206 	 */
4207 
4208 	cxlr->node_notifier.notifier_call = cxl_region_perf_attrs_callback;
4209 	cxlr->node_notifier.priority = CXL_CALLBACK_PRI;
4210 	register_node_notifier(&cxlr->node_notifier);
4211 
4212 	cxlr->adist_notifier.notifier_call = cxl_region_calculate_adistance;
4213 	cxlr->adist_notifier.priority = 100;
4214 	register_mt_adistance_algorithm(&cxlr->adist_notifier);
4215 
4216 	rc = devm_add_action_or_reset(&cxlr->dev, shutdown_notifiers, cxlr);
4217 	if (rc)
4218 		return rc;
4219 
4220 	rc = cxl_region_setup_poison(cxlr);
4221 	if (rc)
4222 		return rc;
4223 
4224 	if (cxl_region_has_memdev_attach(cxlr))
4225 		return 0;
4226 
4227 	switch (cxlr->mode) {
4228 	case CXL_PARTMODE_PMEM:
4229 		rc = devm_cxl_region_edac_register(cxlr);
4230 		if (rc)
4231 			dev_dbg(&cxlr->dev, "CXL EDAC registration for region_id=%d failed\n",
4232 				cxlr->id);
4233 
4234 		return devm_cxl_add_pmem_region(cxlr);
4235 	case CXL_PARTMODE_RAM:
4236 		rc = devm_cxl_region_edac_register(cxlr);
4237 		if (rc)
4238 			dev_dbg(&cxlr->dev, "CXL EDAC registration for region_id=%d failed\n",
4239 				cxlr->id);
4240 
4241 		/*
4242 		 * The region can not be manged by CXL if any portion of
4243 		 * it is already online as 'System RAM'
4244 		 */
4245 		if (walk_iomem_res_desc(IORES_DESC_NONE,
4246 					IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY,
4247 					p->res->start, p->res->end, cxlr,
4248 					is_system_ram) > 0)
4249 			return 0;
4250 		return devm_cxl_add_dax_region(cxlr);
4251 	default:
4252 		dev_dbg(&cxlr->dev, "unsupported region mode: %d\n",
4253 			cxlr->mode);
4254 		return -ENXIO;
4255 	}
4256 }
4257 
4258 static struct cxl_driver cxl_region_driver = {
4259 	.name = "cxl_region",
4260 	.probe = cxl_region_probe,
4261 	.id = CXL_DEVICE_REGION,
4262 };
4263 
4264 int cxl_region_init(void)
4265 {
4266 	return cxl_driver_register(&cxl_region_driver);
4267 }
4268 
4269 void cxl_region_exit(void)
4270 {
4271 	cxl_driver_unregister(&cxl_region_driver);
4272 }
4273 
4274 MODULE_IMPORT_NS("CXL");
4275 MODULE_IMPORT_NS("DEVMEM");
4276 MODULE_ALIAS_CXL(CXL_DEVICE_REGION);
4277