1 // SPDX-License-Identifier: GPL-2.0 2 /* Copyright(c) 2016-2019 Intel Corporation. All rights reserved. */ 3 #include <linux/memremap.h> 4 #include <linux/pagemap.h> 5 #include <linux/memory.h> 6 #include <linux/module.h> 7 #include <linux/device.h> 8 #include <linux/pfn_t.h> 9 #include <linux/slab.h> 10 #include <linux/dax.h> 11 #include <linux/fs.h> 12 #include <linux/mm.h> 13 #include <linux/mman.h> 14 #include <linux/memory-tiers.h> 15 #include "dax-private.h" 16 #include "bus.h" 17 18 /* 19 * Default abstract distance assigned to the NUMA node onlined 20 * by DAX/kmem if the low level platform driver didn't initialize 21 * one for this NUMA node. 22 */ 23 #define MEMTIER_DEFAULT_DAX_ADISTANCE (MEMTIER_ADISTANCE_DRAM * 5) 24 25 /* Memory resource name used for add_memory_driver_managed(). */ 26 static const char *kmem_name; 27 /* Set if any memory will remain added when the driver will be unloaded. */ 28 static bool any_hotremove_failed; 29 30 static int dax_kmem_range(struct dev_dax *dev_dax, int i, struct range *r) 31 { 32 struct dev_dax_range *dax_range = &dev_dax->ranges[i]; 33 struct range *range = &dax_range->range; 34 35 /* memory-block align the hotplug range */ 36 r->start = ALIGN(range->start, memory_block_size_bytes()); 37 r->end = ALIGN_DOWN(range->end + 1, memory_block_size_bytes()) - 1; 38 if (r->start >= r->end) { 39 r->start = range->start; 40 r->end = range->end; 41 return -ENOSPC; 42 } 43 return 0; 44 } 45 46 struct dax_kmem_data { 47 const char *res_name; 48 int mgid; 49 struct resource *res[]; 50 }; 51 52 static DEFINE_MUTEX(kmem_memory_type_lock); 53 static LIST_HEAD(kmem_memory_types); 54 55 static struct memory_dev_type *kmem_find_alloc_memory_type(int adist) 56 { 57 bool found = false; 58 struct memory_dev_type *mtype; 59 60 mutex_lock(&kmem_memory_type_lock); 61 list_for_each_entry(mtype, &kmem_memory_types, list) { 62 if (mtype->adistance == adist) { 63 found = true; 64 break; 65 } 66 } 67 if (!found) { 68 mtype = alloc_memory_type(adist); 69 if (!IS_ERR(mtype)) 70 list_add(&mtype->list, &kmem_memory_types); 71 } 72 mutex_unlock(&kmem_memory_type_lock); 73 74 return mtype; 75 } 76 77 static void kmem_put_memory_types(void) 78 { 79 struct memory_dev_type *mtype, *mtn; 80 81 mutex_lock(&kmem_memory_type_lock); 82 list_for_each_entry_safe(mtype, mtn, &kmem_memory_types, list) { 83 list_del(&mtype->list); 84 put_memory_type(mtype); 85 } 86 mutex_unlock(&kmem_memory_type_lock); 87 } 88 89 static int dev_dax_kmem_probe(struct dev_dax *dev_dax) 90 { 91 struct device *dev = &dev_dax->dev; 92 unsigned long total_len = 0; 93 struct dax_kmem_data *data; 94 struct memory_dev_type *mtype; 95 int i, rc, mapped = 0; 96 int numa_node; 97 int adist = MEMTIER_DEFAULT_DAX_ADISTANCE; 98 99 /* 100 * Ensure good NUMA information for the persistent memory. 101 * Without this check, there is a risk that slow memory 102 * could be mixed in a node with faster memory, causing 103 * unavoidable performance issues. 104 */ 105 numa_node = dev_dax->target_node; 106 if (numa_node < 0) { 107 dev_warn(dev, "rejecting DAX region with invalid node: %d\n", 108 numa_node); 109 return -EINVAL; 110 } 111 112 mt_calc_adistance(numa_node, &adist); 113 mtype = kmem_find_alloc_memory_type(adist); 114 if (IS_ERR(mtype)) 115 return PTR_ERR(mtype); 116 117 for (i = 0; i < dev_dax->nr_range; i++) { 118 struct range range; 119 120 rc = dax_kmem_range(dev_dax, i, &range); 121 if (rc) { 122 dev_info(dev, "mapping%d: %#llx-%#llx too small after alignment\n", 123 i, range.start, range.end); 124 continue; 125 } 126 total_len += range_len(&range); 127 } 128 129 if (!total_len) { 130 dev_warn(dev, "rejecting DAX region without any memory after alignment\n"); 131 return -EINVAL; 132 } 133 134 init_node_memory_type(numa_node, mtype); 135 136 rc = -ENOMEM; 137 data = kzalloc(struct_size(data, res, dev_dax->nr_range), GFP_KERNEL); 138 if (!data) 139 goto err_dax_kmem_data; 140 141 data->res_name = kstrdup(dev_name(dev), GFP_KERNEL); 142 if (!data->res_name) 143 goto err_res_name; 144 145 rc = memory_group_register_static(numa_node, PFN_UP(total_len)); 146 if (rc < 0) 147 goto err_reg_mgid; 148 data->mgid = rc; 149 150 for (i = 0; i < dev_dax->nr_range; i++) { 151 struct resource *res; 152 struct range range; 153 154 rc = dax_kmem_range(dev_dax, i, &range); 155 if (rc) 156 continue; 157 158 /* Region is permanently reserved if hotremove fails. */ 159 res = request_mem_region(range.start, range_len(&range), data->res_name); 160 if (!res) { 161 dev_warn(dev, "mapping%d: %#llx-%#llx could not reserve region\n", 162 i, range.start, range.end); 163 /* 164 * Once some memory has been onlined we can't 165 * assume that it can be un-onlined safely. 166 */ 167 if (mapped) 168 continue; 169 rc = -EBUSY; 170 goto err_request_mem; 171 } 172 data->res[i] = res; 173 174 /* 175 * Set flags appropriate for System RAM. Leave ..._BUSY clear 176 * so that add_memory() can add a child resource. Do not 177 * inherit flags from the parent since it may set new flags 178 * unknown to us that will break add_memory() below. 179 */ 180 res->flags = IORESOURCE_SYSTEM_RAM; 181 182 /* 183 * Ensure that future kexec'd kernels will not treat 184 * this as RAM automatically. 185 */ 186 rc = add_memory_driver_managed(data->mgid, range.start, 187 range_len(&range), kmem_name, MHP_NID_IS_MGID); 188 189 if (rc) { 190 dev_warn(dev, "mapping%d: %#llx-%#llx memory add failed\n", 191 i, range.start, range.end); 192 remove_resource(res); 193 kfree(res); 194 data->res[i] = NULL; 195 if (mapped) 196 continue; 197 goto err_request_mem; 198 } 199 mapped++; 200 } 201 202 dev_set_drvdata(dev, data); 203 204 return 0; 205 206 err_request_mem: 207 memory_group_unregister(data->mgid); 208 err_reg_mgid: 209 kfree(data->res_name); 210 err_res_name: 211 kfree(data); 212 err_dax_kmem_data: 213 clear_node_memory_type(numa_node, mtype); 214 return rc; 215 } 216 217 #ifdef CONFIG_MEMORY_HOTREMOVE 218 static void dev_dax_kmem_remove(struct dev_dax *dev_dax) 219 { 220 int i, success = 0; 221 int node = dev_dax->target_node; 222 struct device *dev = &dev_dax->dev; 223 struct dax_kmem_data *data = dev_get_drvdata(dev); 224 225 /* 226 * We have one shot for removing memory, if some memory blocks were not 227 * offline prior to calling this function remove_memory() will fail, and 228 * there is no way to hotremove this memory until reboot because device 229 * unbind will succeed even if we return failure. 230 */ 231 for (i = 0; i < dev_dax->nr_range; i++) { 232 struct range range; 233 int rc; 234 235 rc = dax_kmem_range(dev_dax, i, &range); 236 if (rc) 237 continue; 238 239 rc = remove_memory(range.start, range_len(&range)); 240 if (rc == 0) { 241 remove_resource(data->res[i]); 242 kfree(data->res[i]); 243 data->res[i] = NULL; 244 success++; 245 continue; 246 } 247 any_hotremove_failed = true; 248 dev_err(dev, 249 "mapping%d: %#llx-%#llx cannot be hotremoved until the next reboot\n", 250 i, range.start, range.end); 251 } 252 253 if (success >= dev_dax->nr_range) { 254 memory_group_unregister(data->mgid); 255 kfree(data->res_name); 256 kfree(data); 257 dev_set_drvdata(dev, NULL); 258 /* 259 * Clear the memtype association on successful unplug. 260 * If not, we have memory blocks left which can be 261 * offlined/onlined later. We need to keep memory_dev_type 262 * for that. This implies this reference will be around 263 * till next reboot. 264 */ 265 clear_node_memory_type(node, NULL); 266 } 267 } 268 #else 269 static void dev_dax_kmem_remove(struct dev_dax *dev_dax) 270 { 271 /* 272 * Without hotremove purposely leak the request_mem_region() for the 273 * device-dax range and return '0' to ->remove() attempts. The removal 274 * of the device from the driver always succeeds, but the region is 275 * permanently pinned as reserved by the unreleased 276 * request_mem_region(). 277 */ 278 any_hotremove_failed = true; 279 } 280 #endif /* CONFIG_MEMORY_HOTREMOVE */ 281 282 static struct dax_device_driver device_dax_kmem_driver = { 283 .probe = dev_dax_kmem_probe, 284 .remove = dev_dax_kmem_remove, 285 .type = DAXDRV_KMEM_TYPE, 286 }; 287 288 static int __init dax_kmem_init(void) 289 { 290 int rc; 291 292 /* Resource name is permanently allocated if any hotremove fails. */ 293 kmem_name = kstrdup_const("System RAM (kmem)", GFP_KERNEL); 294 if (!kmem_name) 295 return -ENOMEM; 296 297 rc = dax_driver_register(&device_dax_kmem_driver); 298 if (rc) 299 goto error_dax_driver; 300 301 return rc; 302 303 error_dax_driver: 304 kmem_put_memory_types(); 305 kfree_const(kmem_name); 306 return rc; 307 } 308 309 static void __exit dax_kmem_exit(void) 310 { 311 dax_driver_unregister(&device_dax_kmem_driver); 312 if (!any_hotremove_failed) 313 kfree_const(kmem_name); 314 kmem_put_memory_types(); 315 } 316 317 MODULE_AUTHOR("Intel Corporation"); 318 MODULE_LICENSE("GPL v2"); 319 module_init(dax_kmem_init); 320 module_exit(dax_kmem_exit); 321 MODULE_ALIAS_DAX_DEVICE(0); 322