// SPDX-License-Identifier: GPL-2.0
/* Copyright(c) 2016-2019 Intel Corporation. All rights reserved. */
#include <linux/memremap.h>
#include <linux/pagemap.h>
#include <linux/memory.h>
#include <linux/module.h>
#include <linux/device.h>
#include <linux/slab.h>
#include <linux/dax.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/mman.h>
#include <linux/memory-tiers.h>
#include <linux/memory_hotplug.h>
#include <linux/string_helpers.h>
#include "dax-private.h"
#include "bus.h"

/*
 * Default abstract distance assigned to the NUMA node onlined
 * by DAX/kmem if the low-level platform driver didn't initialize
 * one for this NUMA node.
 */
#define MEMTIER_DEFAULT_DAX_ADISTANCE	(MEMTIER_ADISTANCE_DRAM * 5)

/* Memory resource name used for add_memory_driver_managed(). */
static const char *kmem_name;
/* Set if any memory will remain added when the driver is unloaded. */
static bool any_hotremove_failed;

static int dax_kmem_range(struct dev_dax *dev_dax, int i, struct range *r)
{
	struct dev_dax_range *dax_range = &dev_dax->ranges[i];
	struct range *range = &dax_range->range;

	/* memory-block align the hotplug range */
	r->start = ALIGN(range->start, memory_block_size_bytes());
	r->end = ALIGN_DOWN(range->end + 1, memory_block_size_bytes()) - 1;
	if (r->start >= r->end) {
		r->start = range->start;
		r->end = range->end;
		return -ENOSPC;
	}
	return 0;
}

struct dax_kmem_data {
	const char *res_name;
	int mgid;
	struct resource *res[];
};

static DEFINE_MUTEX(kmem_memory_type_lock);
static LIST_HEAD(kmem_memory_types);

static struct memory_dev_type *kmem_find_alloc_memory_type(int adist)
{
	guard(mutex)(&kmem_memory_type_lock);
	return mt_find_alloc_memory_type(adist, &kmem_memory_types);
}

static void kmem_put_memory_types(void)
{
	guard(mutex)(&kmem_memory_type_lock);
	mt_put_memory_types(&kmem_memory_types);
}

static int dev_dax_kmem_probe(struct dev_dax *dev_dax)
{
	struct device *dev = &dev_dax->dev;
	unsigned long total_len = 0, orig_len = 0;
	struct dax_kmem_data *data;
	struct memory_dev_type *mtype;
	int i, rc, mapped = 0;
	mhp_t mhp_flags;
	int numa_node;
	int adist = MEMTIER_DEFAULT_DAX_ADISTANCE;

	/*
	 * Ensure good NUMA information for the persistent memory.
	 * Without this check, there is a risk that slow memory
	 * could be mixed in a node with faster memory, causing
	 * unavoidable performance issues.
	 */
	numa_node = dev_dax->target_node;
	if (numa_node < 0) {
		dev_warn(dev, "rejecting DAX region with invalid node: %d\n",
				numa_node);
		return -EINVAL;
	}

	mt_calc_adistance(numa_node, &adist);
	mtype = kmem_find_alloc_memory_type(adist);
	if (IS_ERR(mtype))
		return PTR_ERR(mtype);

	for (i = 0; i < dev_dax->nr_range; i++) {
		struct range range;

		orig_len += range_len(&dev_dax->ranges[i].range);
		rc = dax_kmem_range(dev_dax, i, &range);
		if (rc) {
			dev_info(dev, "mapping%d: %#llx-%#llx too small after alignment\n",
					i, range.start, range.end);
			continue;
		}
		total_len += range_len(&range);
	}

	if (!total_len) {
		dev_warn(dev, "rejecting DAX region without any memory after alignment\n");
		return -EINVAL;
	} else if (total_len != orig_len) {
		char buf[16];

		string_get_size(orig_len - total_len, 1, STRING_UNITS_2,
				buf, sizeof(buf));
		dev_warn(dev, "DAX region truncated by %s due to alignment\n", buf);
	}

	init_node_memory_type(numa_node, mtype);

	rc = -ENOMEM;
	data = kzalloc(struct_size(data, res, dev_dax->nr_range), GFP_KERNEL);
	if (!data)
		goto err_dax_kmem_data;

	data->res_name = kstrdup(dev_name(dev), GFP_KERNEL);
	if (!data->res_name)
		goto err_res_name;

	rc = memory_group_register_static(numa_node, PFN_UP(total_len));
	if (rc < 0)
		goto err_reg_mgid;
	data->mgid = rc;

	for (i = 0; i < dev_dax->nr_range; i++) {
		struct resource *res;
		struct range range;

		rc = dax_kmem_range(dev_dax, i, &range);
		if (rc)
			continue;

		/* Region is permanently reserved if hotremove fails. */
		res = request_mem_region(range.start, range_len(&range), data->res_name);
		if (!res) {
			dev_warn(dev, "mapping%d: %#llx-%#llx could not reserve region\n",
					i, range.start, range.end);
			/*
			 * Once some memory has been onlined we can't
			 * assume that it can be un-onlined safely.
			 */
			if (mapped)
				continue;
			rc = -EBUSY;
			goto err_request_mem;
		}
		data->res[i] = res;

		/*
		 * Set flags appropriate for System RAM. Leave ..._BUSY clear
		 * so that add_memory() can add a child resource. Do not
		 * inherit flags from the parent since it may set new flags
		 * unknown to us that will break add_memory() below.
		 */
		res->flags = IORESOURCE_SYSTEM_RAM;

		mhp_flags = MHP_NID_IS_MGID;
		if (dev_dax->memmap_on_memory)
			mhp_flags |= MHP_MEMMAP_ON_MEMORY;

		/*
		 * Ensure that future kexec'd kernels will not treat
		 * this as RAM automatically.
		 */
		rc = add_memory_driver_managed(data->mgid, range.start,
				range_len(&range), kmem_name, mhp_flags);

		if (rc) {
			dev_warn(dev, "mapping%d: %#llx-%#llx memory add failed\n",
					i, range.start, range.end);
			remove_resource(res);
			kfree(res);
			data->res[i] = NULL;
			if (mapped)
				continue;
			goto err_request_mem;
		}
		mapped++;
	}

	dev_set_drvdata(dev, data);

	return 0;

err_request_mem:
	memory_group_unregister(data->mgid);
err_reg_mgid:
	kfree(data->res_name);
err_res_name:
	kfree(data);
err_dax_kmem_data:
	clear_node_memory_type(numa_node, mtype);
	return rc;
}

#ifdef CONFIG_MEMORY_HOTREMOVE
static void dev_dax_kmem_remove(struct dev_dax *dev_dax)
{
	int i, success = 0;
	int node = dev_dax->target_node;
	struct device *dev = &dev_dax->dev;
	struct dax_kmem_data *data = dev_get_drvdata(dev);

	/*
	 * We have one shot for removing memory: if some memory blocks were
	 * not offlined prior to calling this function, remove_memory() will
	 * fail, and there is no way to hotremove this memory until reboot
	 * because device unbind will succeed even if we return failure.
	 */
	for (i = 0; i < dev_dax->nr_range; i++) {
		struct range range;
		int rc;

		rc = dax_kmem_range(dev_dax, i, &range);
		if (rc)
			continue;

		rc = remove_memory(range.start, range_len(&range));
		if (rc == 0) {
			remove_resource(data->res[i]);
			kfree(data->res[i]);
			data->res[i] = NULL;
			success++;
			continue;
		}
		any_hotremove_failed = true;
		dev_err(dev,
			"mapping%d: %#llx-%#llx cannot be hotremoved until the next reboot\n",
			i, range.start, range.end);
	}

	if (success >= dev_dax->nr_range) {
		memory_group_unregister(data->mgid);
		kfree(data->res_name);
		kfree(data);
		dev_set_drvdata(dev, NULL);
		/*
		 * Clear the memtype association on successful unplug.
		 * Otherwise we have memory blocks left which can be
		 * offlined/onlined later, and we need to keep the
		 * memory_dev_type for that. This implies the reference
		 * will stay around until the next reboot.
		 */
		clear_node_memory_type(node, NULL);
	}
}
#else
static void dev_dax_kmem_remove(struct dev_dax *dev_dax)
{
	/*
	 * Without hotremove, purposely leak the request_mem_region() for the
	 * device-dax range and let ->remove() attempts succeed. The removal
	 * of the device from the driver always succeeds, but the region is
	 * permanently pinned as reserved by the unreleased
	 * request_mem_region().
	 */
	any_hotremove_failed = true;
}
#endif /* CONFIG_MEMORY_HOTREMOVE */

static struct dax_device_driver device_dax_kmem_driver = {
	.probe = dev_dax_kmem_probe,
	.remove = dev_dax_kmem_remove,
	.type = DAXDRV_KMEM_TYPE,
};

static int __init dax_kmem_init(void)
{
	int rc;

	/* Resource name is permanently allocated if any hotremove fails. */
	kmem_name = kstrdup_const("System RAM (kmem)", GFP_KERNEL);
	if (!kmem_name)
		return -ENOMEM;

	rc = dax_driver_register(&device_dax_kmem_driver);
	if (rc)
		goto error_dax_driver;

	return rc;

error_dax_driver:
	kmem_put_memory_types();
	kfree_const(kmem_name);
	return rc;
}

static void __exit dax_kmem_exit(void)
{
	dax_driver_unregister(&device_dax_kmem_driver);
	if (!any_hotremove_failed)
		kfree_const(kmem_name);
	kmem_put_memory_types();
}

MODULE_AUTHOR("Intel Corporation");
MODULE_DESCRIPTION("KMEM DAX: map dax-devices as System-RAM");
MODULE_LICENSE("GPL v2");
module_init(dax_kmem_init);
module_exit(dax_kmem_exit);
MODULE_ALIAS_DAX_DEVICE(0);
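
/*
 * Usage sketch (informational comment, not functional code; the device name
 * below is an example): a device-dax instance is typically switched to this
 * driver from userspace, e.g. with daxctl:
 *
 *	daxctl reconfigure-device --mode=system-ram dax0.0
 *
 * After a successful probe the hotplugged ranges show up in /proc/iomem as
 * "System RAM (kmem)" children of the device-named resource, and their
 * memory blocks can be onlined/offlined via sysfs like any other hotplugged
 * memory.
 */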