// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright 2020-2021 Amazon.com, Inc. or its affiliates. All Rights Reserved.
 */

/**
 * DOC: Enclave lifetime management driver for Nitro Enclaves (NE).
 * Nitro is a hypervisor that has been developed by Amazon.
 */

#include <linux/anon_inodes.h>
#include <linux/capability.h>
#include <linux/cpu.h>
#include <linux/device.h>
#include <linux/file.h>
#include <linux/hugetlb.h>
#include <linux/limits.h>
#include <linux/list.h>
#include <linux/miscdevice.h>
#include <linux/mm.h>
#include <linux/mman.h>
#include <linux/module.h>
#include <linux/mutex.h>
#include <linux/nitro_enclaves.h>
#include <linux/pci.h>
#include <linux/poll.h>
#include <linux/range.h>
#include <linux/slab.h>
#include <linux/types.h>
#include <uapi/linux/vm_sockets.h>

#include "ne_misc_dev.h"
#include "ne_pci_dev.h"

/**
 * NE_CPUS_SIZE - Size for max 128 CPUs, for now, in a cpu-list string, comma
 *		  separated. The NE CPU pool includes CPUs from a single NUMA
 *		  node. This is the size, in bytes, of the buffer that backs
 *		  the ne_cpus kernel parameter.
 */
#define NE_CPUS_SIZE		(512)

/**
 * NE_EIF_LOAD_OFFSET - The offset (8 MiB) where to copy the Enclave Image
 *			Format (EIF) image in enclave memory.
 */
#define NE_EIF_LOAD_OFFSET	(8 * 1024UL * 1024UL)

/**
 * NE_MIN_ENCLAVE_MEM_SIZE - The minimum memory size (64 MiB) an enclave can be
 *			     launched with.
 */
#define NE_MIN_ENCLAVE_MEM_SIZE	(64 * 1024UL * 1024UL)

/**
 * NE_MIN_MEM_REGION_SIZE - The minimum size (2 MiB) of an enclave memory
 *			    region. Also used in this file as the alignment
 *			    granule for region addresses and sizes.
 */
#define NE_MIN_MEM_REGION_SIZE	(2 * 1024UL * 1024UL)

/**
 * NE_PARENT_VM_CID - The CID for the vsock device of the primary / parent VM.
 */
#define NE_PARENT_VM_CID	(3)

/* Forward declaration, needed because ne_fops below refers to the handler. */
static long ne_ioctl(struct file *file, unsigned int cmd, unsigned long arg);

/* File operations of the NE misc device: ioctl handling and a no-op llseek. */
static const struct file_operations ne_fops = {
	.owner		= THIS_MODULE,
	.llseek		= noop_llseek,
	.unlocked_ioctl	= ne_ioctl,
};

/*
 * Description of the NE misc device: dynamically assigned minor number,
 * device name "nitro_enclaves" and access mode 0660.
 */
static struct miscdevice ne_misc_dev = {
	.minor	= MISC_DYNAMIC_MINOR,
	.name	= "nitro_enclaves",
	.fops	= &ne_fops,
	.mode	= 0660,
};

/*
 * Driver-wide device handles. Only the misc device member is initialized
 * here; the ne_pci_dev member is left zeroed by this initializer and is
 * NULL-checked before use in ne_check_enclaves_created().
 */
struct ne_devs ne_devs = {
	.ne_misc_dev	= &ne_misc_dev,
};

/*
 * TODO: Update logic to create new sysfs entries instead of using
 * a kernel parameter e.g. if multiple sysfs files needed.
 */
static int ne_set_kernel_param(const char *val, const struct kernel_param *kp);

/*
 * Accessors for the ne_cpus kernel parameter: reads use the generic string
 * getter, writes go through ne_set_kernel_param().
 */
static const struct kernel_param_ops ne_cpu_pool_ops = {
	.get	= param_get_string,
	.set	= ne_set_kernel_param,
};

/* Backing storage for the ne_cpus parameter string. */
static char ne_cpus[NE_CPUS_SIZE];
static struct kparam_string ne_cpus_arg = {
	.maxlen	= sizeof(ne_cpus),
	.string	= ne_cpus,
};

module_param_cb(ne_cpus, &ne_cpu_pool_ops, &ne_cpus_arg, 0644);
/* https://www.kernel.org/doc/html/latest/admin-guide/kernel-parameters.html#cpu-lists */
MODULE_PARM_DESC(ne_cpus, "<cpu-list> - CPU pool used for Nitro Enclaves");

/**
 * struct ne_cpu_pool - CPU pool used for Nitro Enclaves.
 * @avail_threads_per_core:	Available full CPU cores to be dedicated to
 *				enclave(s). The cpumasks from the array, indexed
 *				by core id, contain all the threads from the
 *				available cores, that are not set for created
 *				enclave(s). The full CPU cores are part of the
 *				NE CPU pool.
 * @mutex:			Mutex for the access to the NE CPU pool.
 * @nr_parent_vm_cores:		The size of the available threads per core array.
 *				The total number of CPU cores available on the
 *				primary / parent VM.
 * @nr_threads_per_core:	The number of threads that a full CPU core has.
 * @numa_node:			NUMA node of the CPUs in the pool.
118 */ 119 struct ne_cpu_pool { 120 cpumask_var_t *avail_threads_per_core; 121 struct mutex mutex; 122 unsigned int nr_parent_vm_cores; 123 unsigned int nr_threads_per_core; 124 int numa_node; 125 }; 126 127 static struct ne_cpu_pool ne_cpu_pool; 128 129 /** 130 * struct ne_phys_contig_mem_regions - Contiguous physical memory regions. 131 * @num: The number of regions that currently has. 132 * @regions: The array of physical memory regions. 133 */ 134 struct ne_phys_contig_mem_regions { 135 unsigned long num; 136 struct range *regions; 137 }; 138 139 /** 140 * ne_check_enclaves_created() - Verify if at least one enclave has been created. 141 * @void: No parameters provided. 142 * 143 * Context: Process context. 144 * Return: 145 * * True if at least one enclave is created. 146 * * False otherwise. 147 */ 148 static bool ne_check_enclaves_created(void) 149 { 150 struct ne_pci_dev *ne_pci_dev = ne_devs.ne_pci_dev; 151 bool ret = false; 152 153 if (!ne_pci_dev) 154 return ret; 155 156 mutex_lock(&ne_pci_dev->enclaves_list_mutex); 157 158 if (!list_empty(&ne_pci_dev->enclaves_list)) 159 ret = true; 160 161 mutex_unlock(&ne_pci_dev->enclaves_list_mutex); 162 163 return ret; 164 } 165 166 /** 167 * ne_setup_cpu_pool() - Set the NE CPU pool after handling sanity checks such 168 * as not sharing CPU cores with the primary / parent VM 169 * or not using CPU 0, which should remain available for 170 * the primary / parent VM. Offline the CPUs from the 171 * pool after the checks passed. 172 * @ne_cpu_list: The CPU list used for setting NE CPU pool. 173 * 174 * Context: Process context. 175 * Return: 176 * * 0 on success. 177 * * Negative return value on failure. 
178 */ 179 static int ne_setup_cpu_pool(const char *ne_cpu_list) 180 { 181 int core_id = -1; 182 unsigned int cpu = 0; 183 cpumask_var_t cpu_pool; 184 unsigned int cpu_sibling = 0; 185 unsigned int i = 0; 186 int numa_node = -1; 187 int rc = -EINVAL; 188 189 if (!zalloc_cpumask_var(&cpu_pool, GFP_KERNEL)) 190 return -ENOMEM; 191 192 mutex_lock(&ne_cpu_pool.mutex); 193 194 rc = cpulist_parse(ne_cpu_list, cpu_pool); 195 if (rc < 0) { 196 pr_err("%s: Error in cpulist parse [rc=%d]\n", ne_misc_dev.name, rc); 197 198 goto free_pool_cpumask; 199 } 200 201 cpu = cpumask_any(cpu_pool); 202 if (cpu >= nr_cpu_ids) { 203 pr_err("%s: No CPUs available in CPU pool\n", ne_misc_dev.name); 204 205 rc = -EINVAL; 206 207 goto free_pool_cpumask; 208 } 209 210 /* 211 * Check if the CPUs are online, to further get info about them 212 * e.g. numa node, core id, siblings. 213 */ 214 for_each_cpu(cpu, cpu_pool) 215 if (cpu_is_offline(cpu)) { 216 pr_err("%s: CPU %d is offline, has to be online to get its metadata\n", 217 ne_misc_dev.name, cpu); 218 219 rc = -EINVAL; 220 221 goto free_pool_cpumask; 222 } 223 224 /* 225 * Check if the CPUs from the NE CPU pool are from the same NUMA node. 226 */ 227 for_each_cpu(cpu, cpu_pool) 228 if (numa_node < 0) { 229 numa_node = cpu_to_node(cpu); 230 if (numa_node < 0) { 231 pr_err("%s: Invalid NUMA node %d\n", 232 ne_misc_dev.name, numa_node); 233 234 rc = -EINVAL; 235 236 goto free_pool_cpumask; 237 } 238 } else { 239 if (numa_node != cpu_to_node(cpu)) { 240 pr_err("%s: CPUs with different NUMA nodes\n", 241 ne_misc_dev.name); 242 243 rc = -EINVAL; 244 245 goto free_pool_cpumask; 246 } 247 } 248 249 /* 250 * Check if CPU 0 and its siblings are included in the provided CPU pool 251 * They should remain available for the primary / parent VM. 
252 */ 253 if (cpumask_test_cpu(0, cpu_pool)) { 254 pr_err("%s: CPU 0 has to remain available\n", ne_misc_dev.name); 255 256 rc = -EINVAL; 257 258 goto free_pool_cpumask; 259 } 260 261 for_each_cpu(cpu_sibling, topology_sibling_cpumask(0)) { 262 if (cpumask_test_cpu(cpu_sibling, cpu_pool)) { 263 pr_err("%s: CPU sibling %d for CPU 0 is in CPU pool\n", 264 ne_misc_dev.name, cpu_sibling); 265 266 rc = -EINVAL; 267 268 goto free_pool_cpumask; 269 } 270 } 271 272 /* 273 * Check if CPU siblings are included in the provided CPU pool. The 274 * expectation is that full CPU cores are made available in the CPU pool 275 * for enclaves. 276 */ 277 for_each_cpu(cpu, cpu_pool) { 278 for_each_cpu(cpu_sibling, topology_sibling_cpumask(cpu)) { 279 if (!cpumask_test_cpu(cpu_sibling, cpu_pool)) { 280 pr_err("%s: CPU %d is not in CPU pool\n", 281 ne_misc_dev.name, cpu_sibling); 282 283 rc = -EINVAL; 284 285 goto free_pool_cpumask; 286 } 287 } 288 } 289 290 /* Calculate the number of threads from a full CPU core. */ 291 cpu = cpumask_any(cpu_pool); 292 for_each_cpu(cpu_sibling, topology_sibling_cpumask(cpu)) 293 ne_cpu_pool.nr_threads_per_core++; 294 295 ne_cpu_pool.nr_parent_vm_cores = nr_cpu_ids / ne_cpu_pool.nr_threads_per_core; 296 297 ne_cpu_pool.avail_threads_per_core = kzalloc_objs(*ne_cpu_pool.avail_threads_per_core, 298 ne_cpu_pool.nr_parent_vm_cores); 299 if (!ne_cpu_pool.avail_threads_per_core) { 300 rc = -ENOMEM; 301 302 goto free_pool_cpumask; 303 } 304 305 for (i = 0; i < ne_cpu_pool.nr_parent_vm_cores; i++) 306 if (!zalloc_cpumask_var(&ne_cpu_pool.avail_threads_per_core[i], GFP_KERNEL)) { 307 rc = -ENOMEM; 308 309 goto free_cores_cpumask; 310 } 311 312 /* 313 * Split the NE CPU pool in threads per core to keep the CPU topology 314 * after offlining the CPUs. 
315 */ 316 for_each_cpu(cpu, cpu_pool) { 317 core_id = topology_core_id(cpu); 318 if (core_id < 0 || core_id >= ne_cpu_pool.nr_parent_vm_cores) { 319 pr_err("%s: Invalid core id %d for CPU %d\n", 320 ne_misc_dev.name, core_id, cpu); 321 322 rc = -EINVAL; 323 324 goto clear_cpumask; 325 } 326 327 cpumask_set_cpu(cpu, ne_cpu_pool.avail_threads_per_core[core_id]); 328 } 329 330 /* 331 * CPUs that are given to enclave(s) should not be considered online 332 * by Linux anymore, as the hypervisor will degrade them to floating. 333 * The physical CPUs (full cores) are carved out of the primary / parent 334 * VM and given to the enclave VM. The same number of vCPUs would run 335 * on less pCPUs for the primary / parent VM. 336 * 337 * We offline them here, to not degrade performance and expose correct 338 * topology to Linux and user space. 339 */ 340 for_each_cpu(cpu, cpu_pool) { 341 rc = remove_cpu(cpu); 342 if (rc != 0) { 343 pr_err("%s: CPU %d is not offlined [rc=%d]\n", 344 ne_misc_dev.name, cpu, rc); 345 346 goto online_cpus; 347 } 348 } 349 350 free_cpumask_var(cpu_pool); 351 352 ne_cpu_pool.numa_node = numa_node; 353 354 mutex_unlock(&ne_cpu_pool.mutex); 355 356 return 0; 357 358 online_cpus: 359 for_each_cpu(cpu, cpu_pool) 360 add_cpu(cpu); 361 clear_cpumask: 362 for (i = 0; i < ne_cpu_pool.nr_parent_vm_cores; i++) 363 cpumask_clear(ne_cpu_pool.avail_threads_per_core[i]); 364 free_cores_cpumask: 365 for (i = 0; i < ne_cpu_pool.nr_parent_vm_cores; i++) 366 free_cpumask_var(ne_cpu_pool.avail_threads_per_core[i]); 367 kfree(ne_cpu_pool.avail_threads_per_core); 368 free_pool_cpumask: 369 free_cpumask_var(cpu_pool); 370 ne_cpu_pool.nr_parent_vm_cores = 0; 371 ne_cpu_pool.nr_threads_per_core = 0; 372 ne_cpu_pool.numa_node = -1; 373 mutex_unlock(&ne_cpu_pool.mutex); 374 375 return rc; 376 } 377 378 /** 379 * ne_teardown_cpu_pool() - Online the CPUs from the NE CPU pool and cleanup the 380 * CPU pool. 381 * @void: No parameters provided. 
382 * 383 * Context: Process context. 384 */ 385 static void ne_teardown_cpu_pool(void) 386 { 387 unsigned int cpu = 0; 388 unsigned int i = 0; 389 int rc = -EINVAL; 390 391 mutex_lock(&ne_cpu_pool.mutex); 392 393 if (!ne_cpu_pool.nr_parent_vm_cores) { 394 mutex_unlock(&ne_cpu_pool.mutex); 395 396 return; 397 } 398 399 for (i = 0; i < ne_cpu_pool.nr_parent_vm_cores; i++) { 400 for_each_cpu(cpu, ne_cpu_pool.avail_threads_per_core[i]) { 401 rc = add_cpu(cpu); 402 if (rc != 0) 403 pr_err("%s: CPU %d is not onlined [rc=%d]\n", 404 ne_misc_dev.name, cpu, rc); 405 } 406 407 cpumask_clear(ne_cpu_pool.avail_threads_per_core[i]); 408 409 free_cpumask_var(ne_cpu_pool.avail_threads_per_core[i]); 410 } 411 412 kfree(ne_cpu_pool.avail_threads_per_core); 413 ne_cpu_pool.nr_parent_vm_cores = 0; 414 ne_cpu_pool.nr_threads_per_core = 0; 415 ne_cpu_pool.numa_node = -1; 416 417 mutex_unlock(&ne_cpu_pool.mutex); 418 } 419 420 /** 421 * ne_set_kernel_param() - Set the NE CPU pool value via the NE kernel parameter. 422 * @val: NE CPU pool string value. 423 * @kp : NE kernel parameter associated with the NE CPU pool. 424 * 425 * Context: Process context. 426 * Return: 427 * * 0 on success. 428 * * Negative return value on failure. 
429 */ 430 static int ne_set_kernel_param(const char *val, const struct kernel_param *kp) 431 { 432 char error_val[] = ""; 433 int rc = -EINVAL; 434 435 if (!capable(CAP_SYS_ADMIN)) 436 return -EPERM; 437 438 if (ne_check_enclaves_created()) { 439 pr_err("%s: The CPU pool is used by enclave(s)\n", ne_misc_dev.name); 440 441 return -EPERM; 442 } 443 444 ne_teardown_cpu_pool(); 445 446 rc = ne_setup_cpu_pool(val); 447 if (rc < 0) { 448 pr_err("%s: Error in setup CPU pool [rc=%d]\n", ne_misc_dev.name, rc); 449 450 param_set_copystring(error_val, kp); 451 452 return rc; 453 } 454 455 rc = param_set_copystring(val, kp); 456 if (rc < 0) { 457 pr_err("%s: Error in param set copystring [rc=%d]\n", ne_misc_dev.name, rc); 458 459 ne_teardown_cpu_pool(); 460 461 param_set_copystring(error_val, kp); 462 463 return rc; 464 } 465 466 return 0; 467 } 468 469 /** 470 * ne_donated_cpu() - Check if the provided CPU is already used by the enclave. 471 * @ne_enclave : Private data associated with the current enclave. 472 * @cpu: CPU to check if already used. 473 * 474 * Context: Process context. This function is called with the ne_enclave mutex held. 475 * Return: 476 * * True if the provided CPU is already used by the enclave. 477 * * False otherwise. 478 */ 479 static bool ne_donated_cpu(struct ne_enclave *ne_enclave, unsigned int cpu) 480 { 481 if (cpumask_test_cpu(cpu, ne_enclave->vcpu_ids)) 482 return true; 483 484 return false; 485 } 486 487 /** 488 * ne_get_unused_core_from_cpu_pool() - Get the id of a full core from the 489 * NE CPU pool. 490 * @void: No parameters provided. 491 * 492 * Context: Process context. This function is called with the ne_enclave and 493 * ne_cpu_pool mutexes held. 494 * Return: 495 * * Core id. 496 * * -1 if no CPU core available in the pool. 
497 */ 498 static int ne_get_unused_core_from_cpu_pool(void) 499 { 500 int core_id = -1; 501 unsigned int i = 0; 502 503 for (i = 0; i < ne_cpu_pool.nr_parent_vm_cores; i++) 504 if (!cpumask_empty(ne_cpu_pool.avail_threads_per_core[i])) { 505 core_id = i; 506 507 break; 508 } 509 510 return core_id; 511 } 512 513 /** 514 * ne_set_enclave_threads_per_core() - Set the threads of the provided core in 515 * the enclave data structure. 516 * @ne_enclave : Private data associated with the current enclave. 517 * @core_id: Core id to get its threads from the NE CPU pool. 518 * @vcpu_id: vCPU id part of the provided core. 519 * 520 * Context: Process context. This function is called with the ne_enclave and 521 * ne_cpu_pool mutexes held. 522 * Return: 523 * * 0 on success. 524 * * Negative return value on failure. 525 */ 526 static int ne_set_enclave_threads_per_core(struct ne_enclave *ne_enclave, 527 int core_id, u32 vcpu_id) 528 { 529 unsigned int cpu = 0; 530 531 if (core_id < 0 && vcpu_id == 0) { 532 dev_err_ratelimited(ne_misc_dev.this_device, 533 "No CPUs available in NE CPU pool\n"); 534 535 return -NE_ERR_NO_CPUS_AVAIL_IN_POOL; 536 } 537 538 if (core_id < 0) { 539 dev_err_ratelimited(ne_misc_dev.this_device, 540 "CPU %d is not in NE CPU pool\n", vcpu_id); 541 542 return -NE_ERR_VCPU_NOT_IN_CPU_POOL; 543 } 544 545 if (core_id >= ne_enclave->nr_parent_vm_cores) { 546 dev_err_ratelimited(ne_misc_dev.this_device, 547 "Invalid core id %d - ne_enclave\n", core_id); 548 549 return -NE_ERR_VCPU_INVALID_CPU_CORE; 550 } 551 552 for_each_cpu(cpu, ne_cpu_pool.avail_threads_per_core[core_id]) 553 cpumask_set_cpu(cpu, ne_enclave->threads_per_core[core_id]); 554 555 cpumask_clear(ne_cpu_pool.avail_threads_per_core[core_id]); 556 557 return 0; 558 } 559 560 /** 561 * ne_get_cpu_from_cpu_pool() - Get a CPU from the NE CPU pool, either from the 562 * remaining sibling(s) of a CPU core or the first 563 * sibling of a new CPU core. 
564 * @ne_enclave : Private data associated with the current enclave. 565 * @vcpu_id: vCPU to get from the NE CPU pool. 566 * 567 * Context: Process context. This function is called with the ne_enclave mutex held. 568 * Return: 569 * * 0 on success. 570 * * Negative return value on failure. 571 */ 572 static int ne_get_cpu_from_cpu_pool(struct ne_enclave *ne_enclave, u32 *vcpu_id) 573 { 574 int core_id = -1; 575 unsigned int cpu = 0; 576 unsigned int i = 0; 577 int rc = -EINVAL; 578 579 /* 580 * If previously allocated a thread of a core to this enclave, first 581 * check remaining sibling(s) for new CPU allocations, so that full 582 * CPU cores are used for the enclave. 583 */ 584 for (i = 0; i < ne_enclave->nr_parent_vm_cores; i++) 585 for_each_cpu(cpu, ne_enclave->threads_per_core[i]) 586 if (!ne_donated_cpu(ne_enclave, cpu)) { 587 *vcpu_id = cpu; 588 589 return 0; 590 } 591 592 mutex_lock(&ne_cpu_pool.mutex); 593 594 /* 595 * If no remaining siblings, get a core from the NE CPU pool and keep 596 * track of all the threads in the enclave threads per core data structure. 597 */ 598 core_id = ne_get_unused_core_from_cpu_pool(); 599 600 rc = ne_set_enclave_threads_per_core(ne_enclave, core_id, *vcpu_id); 601 if (rc < 0) 602 goto unlock_mutex; 603 604 *vcpu_id = cpumask_any(ne_enclave->threads_per_core[core_id]); 605 606 rc = 0; 607 608 unlock_mutex: 609 mutex_unlock(&ne_cpu_pool.mutex); 610 611 return rc; 612 } 613 614 /** 615 * ne_get_vcpu_core_from_cpu_pool() - Get from the NE CPU pool the id of the 616 * core associated with the provided vCPU. 617 * @vcpu_id: Provided vCPU id to get its associated core id. 618 * 619 * Context: Process context. This function is called with the ne_enclave and 620 * ne_cpu_pool mutexes held. 621 * Return: 622 * * Core id. 623 * * -1 if the provided vCPU is not in the pool. 
624 */ 625 static int ne_get_vcpu_core_from_cpu_pool(u32 vcpu_id) 626 { 627 int core_id = -1; 628 unsigned int i = 0; 629 630 for (i = 0; i < ne_cpu_pool.nr_parent_vm_cores; i++) 631 if (cpumask_test_cpu(vcpu_id, ne_cpu_pool.avail_threads_per_core[i])) { 632 core_id = i; 633 634 break; 635 } 636 637 return core_id; 638 } 639 640 /** 641 * ne_check_cpu_in_cpu_pool() - Check if the given vCPU is in the available CPUs 642 * from the pool. 643 * @ne_enclave : Private data associated with the current enclave. 644 * @vcpu_id: ID of the vCPU to check if available in the NE CPU pool. 645 * 646 * Context: Process context. This function is called with the ne_enclave mutex held. 647 * Return: 648 * * 0 on success. 649 * * Negative return value on failure. 650 */ 651 static int ne_check_cpu_in_cpu_pool(struct ne_enclave *ne_enclave, u32 vcpu_id) 652 { 653 int core_id = -1; 654 unsigned int i = 0; 655 int rc = -EINVAL; 656 657 if (ne_donated_cpu(ne_enclave, vcpu_id)) { 658 dev_err_ratelimited(ne_misc_dev.this_device, 659 "CPU %d already used\n", vcpu_id); 660 661 return -NE_ERR_VCPU_ALREADY_USED; 662 } 663 664 /* 665 * If previously allocated a thread of a core to this enclave, but not 666 * the full core, first check remaining sibling(s). 667 */ 668 for (i = 0; i < ne_enclave->nr_parent_vm_cores; i++) 669 if (cpumask_test_cpu(vcpu_id, ne_enclave->threads_per_core[i])) 670 return 0; 671 672 mutex_lock(&ne_cpu_pool.mutex); 673 674 /* 675 * If no remaining siblings, get from the NE CPU pool the core 676 * associated with the vCPU and keep track of all the threads in the 677 * enclave threads per core data structure. 
678 */ 679 core_id = ne_get_vcpu_core_from_cpu_pool(vcpu_id); 680 681 rc = ne_set_enclave_threads_per_core(ne_enclave, core_id, vcpu_id); 682 if (rc < 0) 683 goto unlock_mutex; 684 685 rc = 0; 686 687 unlock_mutex: 688 mutex_unlock(&ne_cpu_pool.mutex); 689 690 return rc; 691 } 692 693 /** 694 * ne_add_vcpu_ioctl() - Add a vCPU to the slot associated with the current 695 * enclave. 696 * @ne_enclave : Private data associated with the current enclave. 697 * @vcpu_id: ID of the CPU to be associated with the given slot, 698 * apic id on x86. 699 * 700 * Context: Process context. This function is called with the ne_enclave mutex held. 701 * Return: 702 * * 0 on success. 703 * * Negative return value on failure. 704 */ 705 static int ne_add_vcpu_ioctl(struct ne_enclave *ne_enclave, u32 vcpu_id) 706 { 707 struct ne_pci_dev_cmd_reply cmd_reply = {}; 708 struct pci_dev *pdev = ne_devs.ne_pci_dev->pdev; 709 int rc = -EINVAL; 710 struct slot_add_vcpu_req slot_add_vcpu_req = {}; 711 712 if (ne_enclave->mm != current->mm) 713 return -EIO; 714 715 slot_add_vcpu_req.slot_uid = ne_enclave->slot_uid; 716 slot_add_vcpu_req.vcpu_id = vcpu_id; 717 718 rc = ne_do_request(pdev, SLOT_ADD_VCPU, 719 &slot_add_vcpu_req, sizeof(slot_add_vcpu_req), 720 &cmd_reply, sizeof(cmd_reply)); 721 if (rc < 0) { 722 dev_err_ratelimited(ne_misc_dev.this_device, 723 "Error in slot add vCPU [rc=%d]\n", rc); 724 725 return rc; 726 } 727 728 cpumask_set_cpu(vcpu_id, ne_enclave->vcpu_ids); 729 730 ne_enclave->nr_vcpus++; 731 732 return 0; 733 } 734 735 /** 736 * ne_sanity_check_user_mem_region() - Sanity check the user space memory 737 * region received during the set user 738 * memory region ioctl call. 739 * @ne_enclave : Private data associated with the current enclave. 740 * @mem_region : User space memory region to be sanity checked. 741 * 742 * Context: Process context. This function is called with the ne_enclave mutex held. 743 * Return: 744 * * 0 on success. 745 * * Negative return value on failure. 
746 */ 747 static int ne_sanity_check_user_mem_region(struct ne_enclave *ne_enclave, 748 struct ne_user_memory_region mem_region) 749 { 750 struct ne_mem_region *ne_mem_region = NULL; 751 752 if (ne_enclave->mm != current->mm) 753 return -EIO; 754 755 if (mem_region.memory_size & (NE_MIN_MEM_REGION_SIZE - 1)) { 756 dev_err_ratelimited(ne_misc_dev.this_device, 757 "User space memory size is not multiple of 2 MiB\n"); 758 759 return -NE_ERR_INVALID_MEM_REGION_SIZE; 760 } 761 762 if (!IS_ALIGNED(mem_region.userspace_addr, NE_MIN_MEM_REGION_SIZE)) { 763 dev_err_ratelimited(ne_misc_dev.this_device, 764 "User space address is not 2 MiB aligned\n"); 765 766 return -NE_ERR_UNALIGNED_MEM_REGION_ADDR; 767 } 768 769 if ((mem_region.userspace_addr & (NE_MIN_MEM_REGION_SIZE - 1)) || 770 !access_ok((void __user *)(unsigned long)mem_region.userspace_addr, 771 mem_region.memory_size)) { 772 dev_err_ratelimited(ne_misc_dev.this_device, 773 "Invalid user space address range\n"); 774 775 return -NE_ERR_INVALID_MEM_REGION_ADDR; 776 } 777 778 list_for_each_entry(ne_mem_region, &ne_enclave->mem_regions_list, 779 mem_region_list_entry) { 780 u64 memory_size = ne_mem_region->memory_size; 781 u64 userspace_addr = ne_mem_region->userspace_addr; 782 783 if ((userspace_addr <= mem_region.userspace_addr && 784 mem_region.userspace_addr < (userspace_addr + memory_size)) || 785 (mem_region.userspace_addr <= userspace_addr && 786 (mem_region.userspace_addr + mem_region.memory_size) > userspace_addr)) { 787 dev_err_ratelimited(ne_misc_dev.this_device, 788 "User space memory region already used\n"); 789 790 return -NE_ERR_MEM_REGION_ALREADY_USED; 791 } 792 } 793 794 return 0; 795 } 796 797 /** 798 * ne_sanity_check_user_mem_region_page() - Sanity check a page from the user space 799 * memory region received during the set 800 * user memory region ioctl call. 801 * @ne_enclave : Private data associated with the current enclave. 
802 * @mem_region_page: Page from the user space memory region to be sanity checked. 803 * 804 * Context: Process context. This function is called with the ne_enclave mutex held. 805 * Return: 806 * * 0 on success. 807 * * Negative return value on failure. 808 */ 809 static int ne_sanity_check_user_mem_region_page(struct ne_enclave *ne_enclave, 810 struct page *mem_region_page) 811 { 812 if (!PageHuge(mem_region_page)) { 813 dev_err_ratelimited(ne_misc_dev.this_device, 814 "Not a hugetlbfs page\n"); 815 816 return -NE_ERR_MEM_NOT_HUGE_PAGE; 817 } 818 819 if (page_size(mem_region_page) & (NE_MIN_MEM_REGION_SIZE - 1)) { 820 dev_err_ratelimited(ne_misc_dev.this_device, 821 "Page size not multiple of 2 MiB\n"); 822 823 return -NE_ERR_INVALID_PAGE_SIZE; 824 } 825 826 if (ne_enclave->numa_node != page_to_nid(mem_region_page)) { 827 dev_err_ratelimited(ne_misc_dev.this_device, 828 "Page is not from NUMA node %d\n", 829 ne_enclave->numa_node); 830 831 return -NE_ERR_MEM_DIFFERENT_NUMA_NODE; 832 } 833 834 return 0; 835 } 836 837 /** 838 * ne_sanity_check_phys_mem_region() - Sanity check the start address and the size 839 * of a physical memory region. 840 * @phys_mem_region_paddr : Physical start address of the region to be sanity checked. 841 * @phys_mem_region_size : Length of the region to be sanity checked. 842 * 843 * Context: Process context. This function is called with the ne_enclave mutex held. 844 * Return: 845 * * 0 on success. 846 * * Negative return value on failure. 
847 */ 848 static int ne_sanity_check_phys_mem_region(u64 phys_mem_region_paddr, 849 u64 phys_mem_region_size) 850 { 851 if (phys_mem_region_size & (NE_MIN_MEM_REGION_SIZE - 1)) { 852 dev_err_ratelimited(ne_misc_dev.this_device, 853 "Physical mem region size is not multiple of 2 MiB\n"); 854 855 return -EINVAL; 856 } 857 858 if (!IS_ALIGNED(phys_mem_region_paddr, NE_MIN_MEM_REGION_SIZE)) { 859 dev_err_ratelimited(ne_misc_dev.this_device, 860 "Physical mem region address is not 2 MiB aligned\n"); 861 862 return -EINVAL; 863 } 864 865 return 0; 866 } 867 868 /** 869 * ne_merge_phys_contig_memory_regions() - Add a memory region and merge the adjacent 870 * regions if they are physically contiguous. 871 * @phys_contig_regions : Private data associated with the contiguous physical memory regions. 872 * @page_paddr : Physical start address of the region to be added. 873 * @page_size : Length of the region to be added. 874 * 875 * Context: Process context. This function is called with the ne_enclave mutex held. 876 * Return: 877 * * 0 on success. 878 * * Negative return value on failure. 879 */ 880 static int 881 ne_merge_phys_contig_memory_regions(struct ne_phys_contig_mem_regions *phys_contig_regions, 882 u64 page_paddr, u64 page_size) 883 { 884 unsigned long num = phys_contig_regions->num; 885 int rc = 0; 886 887 rc = ne_sanity_check_phys_mem_region(page_paddr, page_size); 888 if (rc < 0) 889 return rc; 890 891 /* Physically contiguous, just merge */ 892 if (num && (phys_contig_regions->regions[num - 1].end + 1) == page_paddr) { 893 phys_contig_regions->regions[num - 1].end += page_size; 894 } else { 895 phys_contig_regions->regions[num].start = page_paddr; 896 phys_contig_regions->regions[num].end = page_paddr + page_size - 1; 897 phys_contig_regions->num++; 898 } 899 900 return 0; 901 } 902 903 /** 904 * ne_set_user_memory_region_ioctl() - Add user space memory region to the slot 905 * associated with the current enclave. 
 * @ne_enclave:	Private data associated with the current enclave.
 * @mem_region:	User space memory region to be associated with the given slot.
 *
 * The user space range is pinned one page at a time, every page is sanity
 * checked and its physical range is merged with the previous one when they
 * are contiguous. The resulting physical ranges are then sent to the device,
 * one SLOT_ADD_MEM request per range.
 *
 * Context: Process context. This function is called with the ne_enclave mutex held.
 * Return:
 * * 0 on success.
 * * Negative return value on failure.
 */
static int ne_set_user_memory_region_ioctl(struct ne_enclave *ne_enclave,
					   struct ne_user_memory_region mem_region)
{
	long gup_rc = 0;
	unsigned long i = 0;
	unsigned long max_nr_pages = 0;
	unsigned long memory_size = 0;
	struct ne_mem_region *ne_mem_region = NULL;
	struct pci_dev *pdev = ne_devs.ne_pci_dev->pdev;
	struct ne_phys_contig_mem_regions phys_contig_mem_regions = {};
	int rc = -EINVAL;

	rc = ne_sanity_check_user_mem_region(ne_enclave, mem_region);
	if (rc < 0)
		return rc;

	ne_mem_region = kzalloc_obj(*ne_mem_region);
	if (!ne_mem_region)
		return -ENOMEM;

	/*
	 * Upper bound for the number of pages: the region size divided by the
	 * minimum region size. Both arrays below are sized with it.
	 */
	max_nr_pages = mem_region.memory_size / NE_MIN_MEM_REGION_SIZE;

	ne_mem_region->pages = kzalloc_objs(*ne_mem_region->pages, max_nr_pages);
	if (!ne_mem_region->pages) {
		rc = -ENOMEM;

		goto free_mem_region;
	}

	phys_contig_mem_regions.regions = kzalloc_objs(*phys_contig_mem_regions.regions,
						       max_nr_pages);
	if (!phys_contig_mem_regions.regions) {
		rc = -ENOMEM;

		goto free_mem_region;
	}

	/*
	 * Pin the region page by page. memory_size is the number of bytes
	 * covered so far and advances by the size of each pinned page.
	 */
	do {
		i = ne_mem_region->nr_pages;

		/* Guard the pages array before storing one more entry. */
		if (i == max_nr_pages) {
			dev_err_ratelimited(ne_misc_dev.this_device,
					    "Reached max nr of pages in the pages data struct\n");

			rc = -ENOMEM;

			goto put_pages;
		}

		gup_rc = get_user_pages_unlocked(mem_region.userspace_addr + memory_size, 1,
						 ne_mem_region->pages + i, FOLL_GET);

		if (gup_rc < 0) {
			rc = gup_rc;

			dev_err_ratelimited(ne_misc_dev.this_device,
					    "Error in get user pages [rc=%d]\n", rc);

			goto put_pages;
		}

		/*
		 * NOTE(review): nr_pages is incremented only at the end of the
		 * iteration, so the page pinned above is not released by the
		 * put_pages label if one of the two checks below fails -
		 * confirm whether that is intended.
		 */
		rc = ne_sanity_check_user_mem_region_page(ne_enclave, ne_mem_region->pages[i]);
		if (rc < 0)
			goto put_pages;

		rc = ne_merge_phys_contig_memory_regions(&phys_contig_mem_regions,
							 page_to_phys(ne_mem_region->pages[i]),
							 page_size(ne_mem_region->pages[i]));
		if (rc < 0)
			goto put_pages;

		memory_size += page_size(ne_mem_region->pages[i]);

		ne_mem_region->nr_pages++;
	} while (memory_size < mem_region.memory_size);

	/* Every merged physical range counts as one enclave memory region. */
	if ((ne_enclave->nr_mem_regions + phys_contig_mem_regions.num) >
	    ne_enclave->max_mem_regions) {
		dev_err_ratelimited(ne_misc_dev.this_device,
				    "Reached max memory regions %lld\n",
				    ne_enclave->max_mem_regions);

		rc = -NE_ERR_MEM_MAX_REGIONS;

		goto put_pages;
	}

	/* Validate all the merged ranges before anything is sent to the device. */
	for (i = 0; i < phys_contig_mem_regions.num; i++) {
		u64 phys_region_addr = phys_contig_mem_regions.regions[i].start;
		u64 phys_region_size = range_len(&phys_contig_mem_regions.regions[i]);

		rc = ne_sanity_check_phys_mem_region(phys_region_addr, phys_region_size);
		if (rc < 0)
			goto put_pages;
	}

	ne_mem_region->memory_size = mem_region.memory_size;
	ne_mem_region->userspace_addr = mem_region.userspace_addr;

	/*
	 * The region is added to the enclave list before the device requests
	 * are issued, so it stays tracked even if a request fails below.
	 */
	list_add(&ne_mem_region->mem_region_list_entry, &ne_enclave->mem_regions_list);

	for (i = 0; i < phys_contig_mem_regions.num; i++) {
		struct ne_pci_dev_cmd_reply cmd_reply = {};
		struct slot_add_mem_req slot_add_mem_req = {};

		slot_add_mem_req.slot_uid = ne_enclave->slot_uid;
		slot_add_mem_req.paddr = phys_contig_mem_regions.regions[i].start;
		slot_add_mem_req.size = range_len(&phys_contig_mem_regions.regions[i]);

		rc = ne_do_request(pdev, SLOT_ADD_MEM,
				   &slot_add_mem_req, sizeof(slot_add_mem_req),
				   &cmd_reply, sizeof(cmd_reply));
		if (rc < 0) {
			dev_err_ratelimited(ne_misc_dev.this_device,
					    "Error in slot add mem [rc=%d]\n", rc);

			kfree(phys_contig_mem_regions.regions);

			/*
			 * Exit here without put pages as memory regions may
			 * already been added.
			 */
			return rc;
		}

		ne_enclave->mem_size += slot_add_mem_req.size;
		ne_enclave->nr_mem_regions++;
	}

	/* The ranges array is only needed while building the requests. */
	kfree(phys_contig_mem_regions.regions);

	return 0;

put_pages:
	/* Release the pages counted in nr_pages so far. */
	for (i = 0; i < ne_mem_region->nr_pages; i++)
		put_page(ne_mem_region->pages[i]);
free_mem_region:
	kfree(phys_contig_mem_regions.regions);
	kfree(ne_mem_region->pages);
	kfree(ne_mem_region);

	return rc;
}

/**
 * ne_start_enclave_ioctl() - Trigger enclave start after the enclave resources,
 *			      such as memory and CPU, have been set.
 * @ne_enclave :		Private data associated with the current enclave.
 * @enclave_start_info :	Enclave info that includes enclave cid and flags.
 *
 * Context: Process context. This function is called with the ne_enclave mutex held.
 * Return:
 * * 0 on success.
 * * Negative return value on failure.
1068 */ 1069 static int ne_start_enclave_ioctl(struct ne_enclave *ne_enclave, 1070 struct ne_enclave_start_info *enclave_start_info) 1071 { 1072 struct ne_pci_dev_cmd_reply cmd_reply = {}; 1073 unsigned int cpu = 0; 1074 struct enclave_start_req enclave_start_req = {}; 1075 unsigned int i = 0; 1076 struct pci_dev *pdev = ne_devs.ne_pci_dev->pdev; 1077 int rc = -EINVAL; 1078 1079 if (!ne_enclave->nr_mem_regions) { 1080 dev_err_ratelimited(ne_misc_dev.this_device, 1081 "Enclave has no mem regions\n"); 1082 1083 return -NE_ERR_NO_MEM_REGIONS_ADDED; 1084 } 1085 1086 if (ne_enclave->mem_size < NE_MIN_ENCLAVE_MEM_SIZE) { 1087 dev_err_ratelimited(ne_misc_dev.this_device, 1088 "Enclave memory is less than %ld\n", 1089 NE_MIN_ENCLAVE_MEM_SIZE); 1090 1091 return -NE_ERR_ENCLAVE_MEM_MIN_SIZE; 1092 } 1093 1094 if (!ne_enclave->nr_vcpus) { 1095 dev_err_ratelimited(ne_misc_dev.this_device, 1096 "Enclave has no vCPUs\n"); 1097 1098 return -NE_ERR_NO_VCPUS_ADDED; 1099 } 1100 1101 for (i = 0; i < ne_enclave->nr_parent_vm_cores; i++) 1102 for_each_cpu(cpu, ne_enclave->threads_per_core[i]) 1103 if (!cpumask_test_cpu(cpu, ne_enclave->vcpu_ids)) { 1104 dev_err_ratelimited(ne_misc_dev.this_device, 1105 "Full CPU cores not used\n"); 1106 1107 return -NE_ERR_FULL_CORES_NOT_USED; 1108 } 1109 1110 enclave_start_req.enclave_cid = enclave_start_info->enclave_cid; 1111 enclave_start_req.flags = enclave_start_info->flags; 1112 enclave_start_req.slot_uid = ne_enclave->slot_uid; 1113 1114 rc = ne_do_request(pdev, ENCLAVE_START, 1115 &enclave_start_req, sizeof(enclave_start_req), 1116 &cmd_reply, sizeof(cmd_reply)); 1117 if (rc < 0) { 1118 dev_err_ratelimited(ne_misc_dev.this_device, 1119 "Error in enclave start [rc=%d]\n", rc); 1120 1121 return rc; 1122 } 1123 1124 ne_enclave->state = NE_STATE_RUNNING; 1125 1126 enclave_start_info->enclave_cid = cmd_reply.enclave_cid; 1127 1128 return 0; 1129 } 1130 1131 /** 1132 * ne_enclave_ioctl() - Ioctl function provided by the enclave file. 
1133 * @file: File associated with this ioctl function. 1134 * @cmd: The command that is set for the ioctl call. 1135 * @arg: The argument that is provided for the ioctl call. 1136 * 1137 * Context: Process context. 1138 * Return: 1139 * * 0 on success. 1140 * * Negative return value on failure. 1141 */ 1142 static long ne_enclave_ioctl(struct file *file, unsigned int cmd, unsigned long arg) 1143 { 1144 struct ne_enclave *ne_enclave = file->private_data; 1145 1146 switch (cmd) { 1147 case NE_ADD_VCPU: { 1148 int rc = -EINVAL; 1149 u32 vcpu_id = 0; 1150 1151 if (copy_from_user(&vcpu_id, (void __user *)arg, sizeof(vcpu_id))) 1152 return -EFAULT; 1153 1154 mutex_lock(&ne_enclave->enclave_info_mutex); 1155 1156 if (ne_enclave->state != NE_STATE_INIT) { 1157 dev_err_ratelimited(ne_misc_dev.this_device, 1158 "Enclave is not in init state\n"); 1159 1160 mutex_unlock(&ne_enclave->enclave_info_mutex); 1161 1162 return -NE_ERR_NOT_IN_INIT_STATE; 1163 } 1164 1165 if (vcpu_id >= (ne_enclave->nr_parent_vm_cores * 1166 ne_enclave->nr_threads_per_core)) { 1167 dev_err_ratelimited(ne_misc_dev.this_device, 1168 "vCPU id higher than max CPU id\n"); 1169 1170 mutex_unlock(&ne_enclave->enclave_info_mutex); 1171 1172 return -NE_ERR_INVALID_VCPU; 1173 } 1174 1175 if (!vcpu_id) { 1176 /* Use the CPU pool for choosing a CPU for the enclave. */ 1177 rc = ne_get_cpu_from_cpu_pool(ne_enclave, &vcpu_id); 1178 if (rc < 0) { 1179 dev_err_ratelimited(ne_misc_dev.this_device, 1180 "Error in get CPU from pool [rc=%d]\n", 1181 rc); 1182 1183 mutex_unlock(&ne_enclave->enclave_info_mutex); 1184 1185 return rc; 1186 } 1187 } else { 1188 /* Check if the provided vCPU is available in the NE CPU pool. 
*/ 1189 rc = ne_check_cpu_in_cpu_pool(ne_enclave, vcpu_id); 1190 if (rc < 0) { 1191 dev_err_ratelimited(ne_misc_dev.this_device, 1192 "Error in check CPU %d in pool [rc=%d]\n", 1193 vcpu_id, rc); 1194 1195 mutex_unlock(&ne_enclave->enclave_info_mutex); 1196 1197 return rc; 1198 } 1199 } 1200 1201 rc = ne_add_vcpu_ioctl(ne_enclave, vcpu_id); 1202 if (rc < 0) { 1203 mutex_unlock(&ne_enclave->enclave_info_mutex); 1204 1205 return rc; 1206 } 1207 1208 mutex_unlock(&ne_enclave->enclave_info_mutex); 1209 1210 if (copy_to_user((void __user *)arg, &vcpu_id, sizeof(vcpu_id))) 1211 return -EFAULT; 1212 1213 return 0; 1214 } 1215 1216 case NE_GET_IMAGE_LOAD_INFO: { 1217 struct ne_image_load_info image_load_info = {}; 1218 1219 if (copy_from_user(&image_load_info, (void __user *)arg, sizeof(image_load_info))) 1220 return -EFAULT; 1221 1222 mutex_lock(&ne_enclave->enclave_info_mutex); 1223 1224 if (ne_enclave->state != NE_STATE_INIT) { 1225 dev_err_ratelimited(ne_misc_dev.this_device, 1226 "Enclave is not in init state\n"); 1227 1228 mutex_unlock(&ne_enclave->enclave_info_mutex); 1229 1230 return -NE_ERR_NOT_IN_INIT_STATE; 1231 } 1232 1233 mutex_unlock(&ne_enclave->enclave_info_mutex); 1234 1235 if (!image_load_info.flags || 1236 image_load_info.flags >= NE_IMAGE_LOAD_MAX_FLAG_VAL) { 1237 dev_err_ratelimited(ne_misc_dev.this_device, 1238 "Incorrect flag in enclave image load info\n"); 1239 1240 return -NE_ERR_INVALID_FLAG_VALUE; 1241 } 1242 1243 if (image_load_info.flags == NE_EIF_IMAGE) 1244 image_load_info.memory_offset = NE_EIF_LOAD_OFFSET; 1245 1246 if (copy_to_user((void __user *)arg, &image_load_info, sizeof(image_load_info))) 1247 return -EFAULT; 1248 1249 return 0; 1250 } 1251 1252 case NE_SET_USER_MEMORY_REGION: { 1253 struct ne_user_memory_region mem_region = {}; 1254 int rc = -EINVAL; 1255 1256 if (copy_from_user(&mem_region, (void __user *)arg, sizeof(mem_region))) 1257 return -EFAULT; 1258 1259 if (mem_region.flags >= NE_MEMORY_REGION_MAX_FLAG_VAL) { 1260 
dev_err_ratelimited(ne_misc_dev.this_device, 1261 "Incorrect flag for user memory region\n"); 1262 1263 return -NE_ERR_INVALID_FLAG_VALUE; 1264 } 1265 1266 mutex_lock(&ne_enclave->enclave_info_mutex); 1267 1268 if (ne_enclave->state != NE_STATE_INIT) { 1269 dev_err_ratelimited(ne_misc_dev.this_device, 1270 "Enclave is not in init state\n"); 1271 1272 mutex_unlock(&ne_enclave->enclave_info_mutex); 1273 1274 return -NE_ERR_NOT_IN_INIT_STATE; 1275 } 1276 1277 rc = ne_set_user_memory_region_ioctl(ne_enclave, mem_region); 1278 if (rc < 0) { 1279 mutex_unlock(&ne_enclave->enclave_info_mutex); 1280 1281 return rc; 1282 } 1283 1284 mutex_unlock(&ne_enclave->enclave_info_mutex); 1285 1286 return 0; 1287 } 1288 1289 case NE_START_ENCLAVE: { 1290 struct ne_enclave_start_info enclave_start_info = {}; 1291 int rc = -EINVAL; 1292 1293 if (copy_from_user(&enclave_start_info, (void __user *)arg, 1294 sizeof(enclave_start_info))) 1295 return -EFAULT; 1296 1297 if (enclave_start_info.flags >= NE_ENCLAVE_START_MAX_FLAG_VAL) { 1298 dev_err_ratelimited(ne_misc_dev.this_device, 1299 "Incorrect flag in enclave start info\n"); 1300 1301 return -NE_ERR_INVALID_FLAG_VALUE; 1302 } 1303 1304 /* 1305 * Do not use well-known CIDs - 0, 1, 2 - for enclaves. 1306 * VMADDR_CID_ANY = -1U 1307 * VMADDR_CID_HYPERVISOR = 0 1308 * VMADDR_CID_LOCAL = 1 1309 * VMADDR_CID_HOST = 2 1310 * Note: 0 is used as a placeholder to auto-generate an enclave CID. 
1311 * http://man7.org/linux/man-pages/man7/vsock.7.html 1312 */ 1313 if (enclave_start_info.enclave_cid > 0 && 1314 enclave_start_info.enclave_cid <= VMADDR_CID_HOST) { 1315 dev_err_ratelimited(ne_misc_dev.this_device, 1316 "Well-known CID value, not to be used for enclaves\n"); 1317 1318 return -NE_ERR_INVALID_ENCLAVE_CID; 1319 } 1320 1321 if (enclave_start_info.enclave_cid == U32_MAX) { 1322 dev_err_ratelimited(ne_misc_dev.this_device, 1323 "Well-known CID value, not to be used for enclaves\n"); 1324 1325 return -NE_ERR_INVALID_ENCLAVE_CID; 1326 } 1327 1328 /* 1329 * Do not use the CID of the primary / parent VM for enclaves. 1330 */ 1331 if (enclave_start_info.enclave_cid == NE_PARENT_VM_CID) { 1332 dev_err_ratelimited(ne_misc_dev.this_device, 1333 "CID of the parent VM, not to be used for enclaves\n"); 1334 1335 return -NE_ERR_INVALID_ENCLAVE_CID; 1336 } 1337 1338 /* 64-bit CIDs are not yet supported for the vsock device. */ 1339 if (enclave_start_info.enclave_cid > U32_MAX) { 1340 dev_err_ratelimited(ne_misc_dev.this_device, 1341 "64-bit CIDs not yet supported for the vsock device\n"); 1342 1343 return -NE_ERR_INVALID_ENCLAVE_CID; 1344 } 1345 1346 mutex_lock(&ne_enclave->enclave_info_mutex); 1347 1348 if (ne_enclave->state != NE_STATE_INIT) { 1349 dev_err_ratelimited(ne_misc_dev.this_device, 1350 "Enclave is not in init state\n"); 1351 1352 mutex_unlock(&ne_enclave->enclave_info_mutex); 1353 1354 return -NE_ERR_NOT_IN_INIT_STATE; 1355 } 1356 1357 rc = ne_start_enclave_ioctl(ne_enclave, &enclave_start_info); 1358 if (rc < 0) { 1359 mutex_unlock(&ne_enclave->enclave_info_mutex); 1360 1361 return rc; 1362 } 1363 1364 mutex_unlock(&ne_enclave->enclave_info_mutex); 1365 1366 if (copy_to_user((void __user *)arg, &enclave_start_info, 1367 sizeof(enclave_start_info))) 1368 return -EFAULT; 1369 1370 return 0; 1371 } 1372 1373 default: 1374 return -ENOTTY; 1375 } 1376 1377 return 0; 1378 } 1379 1380 /** 1381 * ne_enclave_remove_all_mem_region_entries() - Remove all 
memory region entries 1382 * from the enclave data structure. 1383 * @ne_enclave : Private data associated with the current enclave. 1384 * 1385 * Context: Process context. This function is called with the ne_enclave mutex held. 1386 */ 1387 static void ne_enclave_remove_all_mem_region_entries(struct ne_enclave *ne_enclave) 1388 { 1389 unsigned long i = 0; 1390 struct ne_mem_region *ne_mem_region = NULL; 1391 struct ne_mem_region *ne_mem_region_tmp = NULL; 1392 1393 list_for_each_entry_safe(ne_mem_region, ne_mem_region_tmp, 1394 &ne_enclave->mem_regions_list, 1395 mem_region_list_entry) { 1396 list_del(&ne_mem_region->mem_region_list_entry); 1397 1398 for (i = 0; i < ne_mem_region->nr_pages; i++) 1399 put_page(ne_mem_region->pages[i]); 1400 1401 kfree(ne_mem_region->pages); 1402 1403 kfree(ne_mem_region); 1404 } 1405 } 1406 1407 /** 1408 * ne_enclave_remove_all_vcpu_id_entries() - Remove all vCPU id entries from 1409 * the enclave data structure. 1410 * @ne_enclave : Private data associated with the current enclave. 1411 * 1412 * Context: Process context. This function is called with the ne_enclave mutex held. 1413 */ 1414 static void ne_enclave_remove_all_vcpu_id_entries(struct ne_enclave *ne_enclave) 1415 { 1416 unsigned int cpu = 0; 1417 unsigned int i = 0; 1418 1419 mutex_lock(&ne_cpu_pool.mutex); 1420 1421 for (i = 0; i < ne_enclave->nr_parent_vm_cores; i++) { 1422 for_each_cpu(cpu, ne_enclave->threads_per_core[i]) 1423 /* Update the available NE CPU pool. */ 1424 cpumask_set_cpu(cpu, ne_cpu_pool.avail_threads_per_core[i]); 1425 1426 free_cpumask_var(ne_enclave->threads_per_core[i]); 1427 } 1428 1429 mutex_unlock(&ne_cpu_pool.mutex); 1430 1431 kfree(ne_enclave->threads_per_core); 1432 1433 free_cpumask_var(ne_enclave->vcpu_ids); 1434 } 1435 1436 /** 1437 * ne_pci_dev_remove_enclave_entry() - Remove the enclave entry from the data 1438 * structure that is part of the NE PCI 1439 * device private data. 
1440 * @ne_enclave : Private data associated with the current enclave. 1441 * @ne_pci_dev : Private data associated with the PCI device. 1442 * 1443 * Context: Process context. This function is called with the ne_pci_dev enclave 1444 * mutex held. 1445 */ 1446 static void ne_pci_dev_remove_enclave_entry(struct ne_enclave *ne_enclave, 1447 struct ne_pci_dev *ne_pci_dev) 1448 { 1449 struct ne_enclave *ne_enclave_entry = NULL; 1450 struct ne_enclave *ne_enclave_entry_tmp = NULL; 1451 1452 list_for_each_entry_safe(ne_enclave_entry, ne_enclave_entry_tmp, 1453 &ne_pci_dev->enclaves_list, enclave_list_entry) { 1454 if (ne_enclave_entry->slot_uid == ne_enclave->slot_uid) { 1455 list_del(&ne_enclave_entry->enclave_list_entry); 1456 1457 break; 1458 } 1459 } 1460 } 1461 1462 /** 1463 * ne_enclave_release() - Release function provided by the enclave file. 1464 * @inode: Inode associated with this file release function. 1465 * @file: File associated with this release function. 1466 * 1467 * Context: Process context. 1468 * Return: 1469 * * 0 on success. 1470 * * Negative return value on failure. 1471 */ 1472 static int ne_enclave_release(struct inode *inode, struct file *file) 1473 { 1474 struct ne_pci_dev_cmd_reply cmd_reply = {}; 1475 struct enclave_stop_req enclave_stop_request = {}; 1476 struct ne_enclave *ne_enclave = file->private_data; 1477 struct ne_pci_dev *ne_pci_dev = ne_devs.ne_pci_dev; 1478 struct pci_dev *pdev = ne_pci_dev->pdev; 1479 int rc = -EINVAL; 1480 struct slot_free_req slot_free_req = {}; 1481 1482 if (!ne_enclave) 1483 return 0; 1484 1485 /* 1486 * Early exit in case there is an error in the enclave creation logic 1487 * and fput() is called on the cleanup path. 1488 */ 1489 if (!ne_enclave->slot_uid) 1490 return 0; 1491 1492 /* 1493 * Acquire the enclave list mutex before the enclave mutex 1494 * in order to avoid deadlocks with @ref ne_event_work_handler. 
1495 */ 1496 mutex_lock(&ne_pci_dev->enclaves_list_mutex); 1497 mutex_lock(&ne_enclave->enclave_info_mutex); 1498 1499 if (ne_enclave->state != NE_STATE_INIT && ne_enclave->state != NE_STATE_STOPPED) { 1500 enclave_stop_request.slot_uid = ne_enclave->slot_uid; 1501 1502 rc = ne_do_request(pdev, ENCLAVE_STOP, 1503 &enclave_stop_request, sizeof(enclave_stop_request), 1504 &cmd_reply, sizeof(cmd_reply)); 1505 if (rc < 0) { 1506 dev_err_ratelimited(ne_misc_dev.this_device, 1507 "Error in enclave stop [rc=%d]\n", rc); 1508 1509 goto unlock_mutex; 1510 } 1511 1512 memset(&cmd_reply, 0, sizeof(cmd_reply)); 1513 } 1514 1515 slot_free_req.slot_uid = ne_enclave->slot_uid; 1516 1517 rc = ne_do_request(pdev, SLOT_FREE, 1518 &slot_free_req, sizeof(slot_free_req), 1519 &cmd_reply, sizeof(cmd_reply)); 1520 if (rc < 0) { 1521 dev_err_ratelimited(ne_misc_dev.this_device, 1522 "Error in slot free [rc=%d]\n", rc); 1523 1524 goto unlock_mutex; 1525 } 1526 1527 ne_pci_dev_remove_enclave_entry(ne_enclave, ne_pci_dev); 1528 ne_enclave_remove_all_mem_region_entries(ne_enclave); 1529 ne_enclave_remove_all_vcpu_id_entries(ne_enclave); 1530 1531 mutex_unlock(&ne_enclave->enclave_info_mutex); 1532 mutex_unlock(&ne_pci_dev->enclaves_list_mutex); 1533 1534 kfree(ne_enclave); 1535 1536 return 0; 1537 1538 unlock_mutex: 1539 mutex_unlock(&ne_enclave->enclave_info_mutex); 1540 mutex_unlock(&ne_pci_dev->enclaves_list_mutex); 1541 1542 return rc; 1543 } 1544 1545 /** 1546 * ne_enclave_poll() - Poll functionality used for enclave out-of-band events. 1547 * @file: File associated with this poll function. 1548 * @wait: Poll table data structure. 1549 * 1550 * Context: Process context. 1551 * Return: 1552 * * Poll mask. 
1553 */ 1554 static __poll_t ne_enclave_poll(struct file *file, poll_table *wait) 1555 { 1556 __poll_t mask = 0; 1557 struct ne_enclave *ne_enclave = file->private_data; 1558 1559 poll_wait(file, &ne_enclave->eventq, wait); 1560 1561 if (ne_enclave->has_event) 1562 mask |= EPOLLHUP; 1563 1564 return mask; 1565 } 1566 1567 static const struct file_operations ne_enclave_fops = { 1568 .owner = THIS_MODULE, 1569 .llseek = noop_llseek, 1570 .poll = ne_enclave_poll, 1571 .unlocked_ioctl = ne_enclave_ioctl, 1572 .release = ne_enclave_release, 1573 }; 1574 1575 /** 1576 * ne_create_vm_ioctl() - Alloc slot to be associated with an enclave. Create 1577 * enclave file descriptor to be further used for enclave 1578 * resources handling e.g. memory regions and CPUs. 1579 * @ne_pci_dev : Private data associated with the PCI device. 1580 * @slot_uid: User pointer to store the generated unique slot id 1581 * associated with an enclave to. 1582 * 1583 * Context: Process context. This function is called with the ne_pci_dev enclave 1584 * mutex held. 1585 * Return: 1586 * * Enclave fd on success. 1587 * * Negative return value on failure. 
1588 */ 1589 static int ne_create_vm_ioctl(struct ne_pci_dev *ne_pci_dev, u64 __user *slot_uid) 1590 { 1591 struct ne_pci_dev_cmd_reply cmd_reply = {}; 1592 int enclave_fd = -1; 1593 struct file *enclave_file = NULL; 1594 unsigned int i = 0; 1595 struct ne_enclave *ne_enclave = NULL; 1596 struct pci_dev *pdev = ne_pci_dev->pdev; 1597 int rc = -EINVAL; 1598 struct slot_alloc_req slot_alloc_req = {}; 1599 1600 mutex_lock(&ne_cpu_pool.mutex); 1601 1602 for (i = 0; i < ne_cpu_pool.nr_parent_vm_cores; i++) 1603 if (!cpumask_empty(ne_cpu_pool.avail_threads_per_core[i])) 1604 break; 1605 1606 if (i == ne_cpu_pool.nr_parent_vm_cores) { 1607 dev_err_ratelimited(ne_misc_dev.this_device, 1608 "No CPUs available in CPU pool\n"); 1609 1610 mutex_unlock(&ne_cpu_pool.mutex); 1611 1612 return -NE_ERR_NO_CPUS_AVAIL_IN_POOL; 1613 } 1614 1615 mutex_unlock(&ne_cpu_pool.mutex); 1616 1617 ne_enclave = kzalloc_obj(*ne_enclave); 1618 if (!ne_enclave) 1619 return -ENOMEM; 1620 1621 mutex_lock(&ne_cpu_pool.mutex); 1622 1623 ne_enclave->nr_parent_vm_cores = ne_cpu_pool.nr_parent_vm_cores; 1624 ne_enclave->nr_threads_per_core = ne_cpu_pool.nr_threads_per_core; 1625 ne_enclave->numa_node = ne_cpu_pool.numa_node; 1626 1627 mutex_unlock(&ne_cpu_pool.mutex); 1628 1629 ne_enclave->threads_per_core = kzalloc_objs(*ne_enclave->threads_per_core, 1630 ne_enclave->nr_parent_vm_cores); 1631 if (!ne_enclave->threads_per_core) { 1632 rc = -ENOMEM; 1633 1634 goto free_ne_enclave; 1635 } 1636 1637 for (i = 0; i < ne_enclave->nr_parent_vm_cores; i++) 1638 if (!zalloc_cpumask_var(&ne_enclave->threads_per_core[i], GFP_KERNEL)) { 1639 rc = -ENOMEM; 1640 1641 goto free_cpumask; 1642 } 1643 1644 if (!zalloc_cpumask_var(&ne_enclave->vcpu_ids, GFP_KERNEL)) { 1645 rc = -ENOMEM; 1646 1647 goto free_cpumask; 1648 } 1649 1650 enclave_fd = get_unused_fd_flags(O_CLOEXEC); 1651 if (enclave_fd < 0) { 1652 rc = enclave_fd; 1653 1654 dev_err_ratelimited(ne_misc_dev.this_device, 1655 "Error in getting unused fd [rc=%d]\n", 
rc); 1656 1657 goto free_cpumask; 1658 } 1659 1660 enclave_file = anon_inode_getfile("ne-vm", &ne_enclave_fops, ne_enclave, O_RDWR); 1661 if (IS_ERR(enclave_file)) { 1662 rc = PTR_ERR(enclave_file); 1663 1664 dev_err_ratelimited(ne_misc_dev.this_device, 1665 "Error in anon inode get file [rc=%d]\n", rc); 1666 1667 goto put_fd; 1668 } 1669 1670 rc = ne_do_request(pdev, SLOT_ALLOC, 1671 &slot_alloc_req, sizeof(slot_alloc_req), 1672 &cmd_reply, sizeof(cmd_reply)); 1673 if (rc < 0) { 1674 dev_err_ratelimited(ne_misc_dev.this_device, 1675 "Error in slot alloc [rc=%d]\n", rc); 1676 1677 goto put_file; 1678 } 1679 1680 init_waitqueue_head(&ne_enclave->eventq); 1681 ne_enclave->has_event = false; 1682 mutex_init(&ne_enclave->enclave_info_mutex); 1683 ne_enclave->max_mem_regions = cmd_reply.mem_regions; 1684 INIT_LIST_HEAD(&ne_enclave->mem_regions_list); 1685 ne_enclave->mm = current->mm; 1686 ne_enclave->slot_uid = cmd_reply.slot_uid; 1687 ne_enclave->state = NE_STATE_INIT; 1688 1689 list_add(&ne_enclave->enclave_list_entry, &ne_pci_dev->enclaves_list); 1690 1691 if (copy_to_user(slot_uid, &ne_enclave->slot_uid, sizeof(ne_enclave->slot_uid))) { 1692 /* 1693 * As we're holding the only reference to 'enclave_file', fput() 1694 * will call ne_enclave_release() which will do a proper cleanup 1695 * of all so far allocated resources, leaving only the unused fd 1696 * for us to free. 
1697 */ 1698 fput(enclave_file); 1699 put_unused_fd(enclave_fd); 1700 1701 return -EFAULT; 1702 } 1703 1704 fd_install(enclave_fd, enclave_file); 1705 1706 return enclave_fd; 1707 1708 put_file: 1709 fput(enclave_file); 1710 put_fd: 1711 put_unused_fd(enclave_fd); 1712 free_cpumask: 1713 free_cpumask_var(ne_enclave->vcpu_ids); 1714 for (i = 0; i < ne_enclave->nr_parent_vm_cores; i++) 1715 free_cpumask_var(ne_enclave->threads_per_core[i]); 1716 kfree(ne_enclave->threads_per_core); 1717 free_ne_enclave: 1718 kfree(ne_enclave); 1719 1720 return rc; 1721 } 1722 1723 /** 1724 * ne_ioctl() - Ioctl function provided by the NE misc device. 1725 * @file: File associated with this ioctl function. 1726 * @cmd: The command that is set for the ioctl call. 1727 * @arg: The argument that is provided for the ioctl call. 1728 * 1729 * Context: Process context. 1730 * Return: 1731 * * Ioctl result (e.g. enclave file descriptor) on success. 1732 * * Negative return value on failure. 1733 */ 1734 static long ne_ioctl(struct file *file, unsigned int cmd, unsigned long arg) 1735 { 1736 switch (cmd) { 1737 case NE_CREATE_VM: { 1738 int enclave_fd = -1; 1739 struct ne_pci_dev *ne_pci_dev = ne_devs.ne_pci_dev; 1740 u64 __user *slot_uid = (void __user *)arg; 1741 1742 mutex_lock(&ne_pci_dev->enclaves_list_mutex); 1743 enclave_fd = ne_create_vm_ioctl(ne_pci_dev, slot_uid); 1744 mutex_unlock(&ne_pci_dev->enclaves_list_mutex); 1745 1746 return enclave_fd; 1747 } 1748 1749 default: 1750 return -ENOTTY; 1751 } 1752 1753 return 0; 1754 } 1755 1756 #if defined(CONFIG_NITRO_ENCLAVES_MISC_DEV_TEST) 1757 #include "ne_misc_dev_test.c" 1758 #endif 1759 1760 static int __init ne_init(void) 1761 { 1762 mutex_init(&ne_cpu_pool.mutex); 1763 1764 return pci_register_driver(&ne_pci_driver); 1765 } 1766 1767 static void __exit ne_exit(void) 1768 { 1769 pci_unregister_driver(&ne_pci_driver); 1770 1771 ne_teardown_cpu_pool(); 1772 } 1773 1774 module_init(ne_init); 1775 module_exit(ne_exit); 1776 1777 
MODULE_AUTHOR("Amazon.com, Inc. or its affiliates"); 1778 MODULE_DESCRIPTION("Nitro Enclaves Driver"); 1779 MODULE_LICENSE("GPL v2"); 1780