1 // SPDX-License-Identifier: MIT 2 /* 3 * Copyright © 2023-2024 Intel Corporation 4 */ 5 6 #include <drm/drm_debugfs.h> 7 #include <drm/drm_managed.h> 8 9 #include "xe_gt_sriov_vf.h" 10 #include "xe_guc.h" 11 #include "xe_sriov_printk.h" 12 #include "xe_sriov_vf.h" 13 #include "xe_sriov_vf_ccs.h" 14 15 /** 16 * DOC: VF restore procedure in PF KMD and VF KMD 17 * 18 * Restoring previously saved state of a VF is one of core features of 19 * SR-IOV. All major VM Management applications allow saving and restoring 20 * the VM state, and doing that to a VM which uses SRIOV VF as one of 21 * the accessible devices requires support from KMD on both PF and VF side. 22 * VMM initiates all required operations through VFIO module, which then 23 * translates them into PF KMD calls. This description will focus on these 24 * calls, leaving out the module which initiates these steps (VFIO). 25 * 26 * In order to start the restore procedure, GuC needs to keep the VF in 27 * proper state. The PF driver can ensure GuC set it to VF_READY state 28 * by provisioning the VF, which in turn can be done after Function Level 29 * Reset of said VF (or after it was freshly created - in that case FLR 30 * is not needed). The FLR procedure ends with GuC sending message 31 * `GUC_PF_NOTIFY_VF_FLR_DONE`, and then provisioning data is sent to GuC. 32 * After the provisioning is completed, the VF needs to be paused, and 33 * at that point the actual restore can begin. 34 * 35 * During VF Restore, state of several resources is restored. These may 36 * include local memory content (system memory is restored by VMM itself), 37 * values of MMIO registers, stateless compression metadata and others. 38 * The final resource which also needs restoring is state of the VF 39 * submission maintained within GuC. For that, `GUC_PF_OPCODE_VF_RESTORE` 40 * message is used, with reference to the state blob to be consumed by 41 * GuC. 42 * 43 * Next, when VFIO is asked to set the VM into running state, the PF driver 44 * sends `GUC_PF_TRIGGER_VF_RESUME` to GuC. When sent after restore, this 45 * changes VF state within GuC to `VF_RESFIX_BLOCKED` rather than the 46 * usual `VF_RUNNING`. At this point GuC triggers an interrupt to inform 47 * the VF KMD within the VM that it was migrated. 48 * 49 * As soon as Virtual GPU of the VM starts, the VF driver within receives 50 * the MIGRATED interrupt and schedules post-migration recovery worker. 51 * That worker sends `VF2GUC_RESFIX_START` action along with non-zero 52 * marker, queries GuC for new provisioning (using MMIO communication), 53 * and applies fixups to any non-virtualized resources used by the VF. 54 * 55 * When the VF driver is ready to continue operation on the newly connected 56 * hardware, it sends `VF2GUC_RESFIX_DONE` action along with the same 57 * marker which was sent with `VF2GUC_RESFIX_START` which causes it to 58 * enter the long awaited `VF_RUNNING` state, and therefore start handling 59 * CTB messages and scheduling workloads from the VF:: 60 * 61 * PF GuC VF 62 * [ ] | | 63 * [ ] PF2GUC_VF_CONTROL(pause) | | 64 * [ ]---------------------------> [ ] | 65 * [ ] [ ] GuC sets new VF state to | 66 * [ ] [ ]------- VF_READY_PAUSED | 67 * [ ] [ ] | | 68 * [ ] [ ] <----- | 69 * [ ] success [ ] | 70 * [ ] <---------------------------[ ] | 71 * [ ] | | 72 * [ ] PF loads resources from the | | 73 * [ ]------- saved image supplied | | 74 * [ ] | | | 75 * [ ] <----- | | 76 * [ ] | | 77 * [ ] GUC_PF_OPCODE_VF_RESTORE | | 78 * [ ]---------------------------> [ ] | 79 * [ ] [ ] GuC loads contexts and CTB | 80 * [ ] [ ]------- state from image | 81 * [ ] [ ] | | 82 * [ ] [ ] <----- | 83 * [ ] [ ] | 84 * [ ] [ ] GuC sets new VF state to | 85 * [ ] [ ]------- VF_RESFIX_PAUSED | 86 * [ ] [ ] | | 87 * [ ] success [ ] <----- | 88 * [ ] <---------------------------[ ] | 89 * [ ] | | 90 * [ ] GUC_PF_TRIGGER_VF_RESUME | | 91 * [ ]---------------------------> [ ] | 92 * [ ] [ ] GuC sets new VF state to | 93 * [ ] [ ]------- VF_RESFIX_BLOCKED | 94 * [ ] [ ] | | 95 * [ ] [ ] <----- | 96 * [ ] [ ] | 97 * [ ] [ ] GUC_INTR_SW_INT_0 | 98 * [ ] success [ ]---------------------------> [ ] 99 * [ ] <---------------------------[ ] [ ] 100 * | | VF2GUC_QUERY_SINGLE_KLV [ ] 101 * | [ ] <---------------------------[ ] 102 * | [ ] [ ] 103 * | [ ] new VF provisioning [ ] 104 * | [ ]---------------------------> [ ] 105 * | | [ ] 106 * | | VF2GUC_RESFIX_START [ ] 107 * | [ ] <---------------------------[ ] 108 * | [ ] [ ] 109 * | [ ] success [ ] 110 * | [ ]---------------------------> [ ] 111 * | | VF driver applies post [ ] 112 * | | migration fixups -------[ ] 113 * | | | [ ] 114 * | | -----> [ ] 115 * | | [ ] 116 * | | VF2GUC_RESFIX_DONE [ ] 117 * | [ ] <---------------------------[ ] 118 * | [ ] [ ] 119 * | [ ] GuC sets new VF state to [ ] 120 * | [ ]------- VF_RUNNING [ ] 121 * | [ ] | [ ] 122 * | [ ] <----- [ ] 123 * | [ ] success [ ] 124 * | [ ]---------------------------> [ ] 125 * | | | 126 * | | | 127 * 128 * Handling of VF double migration flow is shown below:: 129 * 130 * GuC1 VF 131 * | | 132 * | [ ]<--- start fixups 133 * | VF2GUC_RESFIX_START(marker) [ ] 134 * [ ] <-------------------------------------------[ ] 135 * [ ] [ ] 136 * [ ]---\ [ ] 137 * [ ] store marker [ ] 138 * [ ]<--/ [ ] 139 * [ ] [ ] 140 * [ ] success [ ] 141 * [ ] ------------------------------------------> [ ] 142 * | [ ] 143 * | [ ]---\ 144 * | [ ] do fixups 145 * | [ ]<--/ 146 * | [ ] 147 * -------------- VF paused / saved ---------------- 148 * : 149 * 150 * GuC2 151 * | 152 * ----------------- VF restored ------------------ 153 * | 154 * [ ] 155 * [ ]---\ 156 * [ ] reset marker 157 * [ ]<--/ 158 * [ ] 159 * ----------------- VF resumed ------------------ 160 * | [ ] 161 * | [ ] 162 * | VF2GUC_RESFIX_DONE(marker) [ ] 163 * [ ] <-------------------------------------------[ ] 164 * [ ] [ ] 165 * [ ]---\ [ ] 166 * [ ] check marker [ ] 167 * [ ] (mismatch) [ ] 168 * [ ]<--/ [ ] 169 * [ ] [ ] 170 * [ ] RESPONSE_VF_MIGRATED [ ] 171 * [ ] ------------------------------------------> [ ] 172 * | [ ]---\ 173 * | [ ] reschedule fixups 174 * | [ ]<--/ 175 * | | 176 */ 177 178 /** 179 * xe_sriov_vf_migration_supported - Report whether SR-IOV VF migration is 180 * supported or not. 181 * @xe: the &xe_device to check 182 * 183 * Returns: true if VF migration is supported, false otherwise. 184 */ 185 bool xe_sriov_vf_migration_supported(struct xe_device *xe) 186 { 187 xe_assert(xe, IS_SRIOV_VF(xe)); 188 return !xe->sriov.vf.migration.disabled; 189 } 190 191 /** 192 * xe_sriov_vf_migration_disable - Turn off VF migration with given log message. 193 * @xe: the &xe_device instance. 194 * @fmt: format string for the log message, to be combined with following VAs. 195 */ 196 void xe_sriov_vf_migration_disable(struct xe_device *xe, const char *fmt, ...) 197 { 198 struct va_format vaf; 199 va_list va_args; 200 201 xe_assert(xe, IS_SRIOV_VF(xe)); 202 203 va_start(va_args, fmt); 204 vaf.fmt = fmt; 205 vaf.va = &va_args; 206 xe_sriov_notice(xe, "migration disabled: %pV\n", &vaf); 207 va_end(va_args); 208 209 xe->sriov.vf.migration.disabled = true; 210 } 211 212 static void vf_migration_init_early(struct xe_device *xe) 213 { 214 if (!xe_device_has_memirq(xe)) 215 return xe_sriov_vf_migration_disable(xe, "requires memory-based IRQ support"); 216 217 } 218 219 /** 220 * xe_sriov_vf_init_early - Initialize SR-IOV VF specific data. 221 * @xe: the &xe_device to initialize 222 */ 223 void xe_sriov_vf_init_early(struct xe_device *xe) 224 { 225 vf_migration_init_early(xe); 226 } 227 228 static int vf_migration_init_late(struct xe_device *xe) 229 { 230 struct xe_gt *gt = xe_root_mmio_gt(xe); 231 struct xe_uc_fw_version guc_version; 232 233 if (!xe_sriov_vf_migration_supported(xe)) 234 return 0; 235 236 xe_gt_sriov_vf_guc_versions(gt, NULL, &guc_version); 237 if (MAKE_GUC_VER_STRUCT(guc_version) < MAKE_GUC_VER(1, 27, 0)) { 238 xe_sriov_vf_migration_disable(xe, 239 "requires GuC ABI >= 1.27.0, but only %u.%u.%u found", 240 guc_version.major, guc_version.minor, 241 guc_version.patch); 242 return 0; 243 } 244 245 return xe_sriov_vf_ccs_init(xe); 246 } 247 248 /** 249 * xe_sriov_vf_init_late() - SR-IOV VF late initialization functions. 250 * @xe: the &xe_device to initialize 251 * 252 * This function initializes code for CCS migration. 253 * 254 * Return: 0 on success or a negative error code on failure. 255 */ 256 int xe_sriov_vf_init_late(struct xe_device *xe) 257 { 258 return vf_migration_init_late(xe); 259 } 260 261 static int sa_info_vf_ccs(struct seq_file *m, void *data) 262 { 263 struct drm_info_node *node = m->private; 264 struct xe_device *xe = to_xe_device(node->minor->dev); 265 struct drm_printer p = drm_seq_file_printer(m); 266 267 xe_sriov_vf_ccs_print(xe, &p); 268 return 0; 269 } 270 271 static const struct drm_info_list debugfs_list[] = { 272 { .name = "sa_info_vf_ccs", .show = sa_info_vf_ccs }, 273 }; 274 275 /** 276 * xe_sriov_vf_debugfs_register - Register VF debugfs attributes. 277 * @xe: the &xe_device 278 * @root: the root &dentry 279 * 280 * Prepare debugfs attributes exposed by the VF. 281 */ 282 void xe_sriov_vf_debugfs_register(struct xe_device *xe, struct dentry *root) 283 { 284 drm_debugfs_create_files(debugfs_list, ARRAY_SIZE(debugfs_list), 285 root, xe->drm.primary); 286 } 287