drm/i915/huc: Allow for very slow HuC loading
authorjsg <jsg@openbsd.org>
Sat, 10 Aug 2024 23:15:28 +0000 (23:15 +0000)
committerjsg <jsg@openbsd.org>
Sat, 10 Aug 2024 23:15:28 +0000 (23:15 +0000)
From John Harrison
a797099562267ebb281acd59750f1a8dbba36eef in mainline linux

Avoids HuC auth failing due to timeout on Raptor Lake-S,
reported and fix tested by Laurence Tratt.

sys/dev/pci/drm/i915/gt/uc/intel_guc_fw.c
sys/dev/pci/drm/i915/gt/uc/intel_huc.c

index 0f79cb6..52332bb 100644 (file)
@@ -184,7 +184,7 @@ static int guc_wait_ucode(struct intel_guc *guc)
         * in the seconds range. However, there is a limit on how long an
         * individual wait_for() can wait. So wrap it in a loop.
         */
-       before_freq = intel_rps_read_actual_frequency(&uncore->gt->rps);
+       before_freq = intel_rps_read_actual_frequency(&gt->rps);
        before = ktime_get();
        for (count = 0; count < GUC_LOAD_RETRY_LIMIT; count++) {
                ret = wait_for(guc_load_done(uncore, &status, &success), 1000);
@@ -192,7 +192,7 @@ static int guc_wait_ucode(struct intel_guc *guc)
                        break;
 
                guc_dbg(guc, "load still in progress, count = %d, freq = %dMHz, status = 0x%08X [0x%02X/%02X]\n",
-                       count, intel_rps_read_actual_frequency(&uncore->gt->rps), status,
+                       count, intel_rps_read_actual_frequency(&gt->rps), status,
                        REG_FIELD_GET(GS_BOOTROM_MASK, status),
                        REG_FIELD_GET(GS_UKERNEL_MASK, status));
        }
@@ -204,7 +204,7 @@ static int guc_wait_ucode(struct intel_guc *guc)
                u32 bootrom = REG_FIELD_GET(GS_BOOTROM_MASK, status);
 
                guc_info(guc, "load failed: status = 0x%08X, time = %lldms, freq = %dMHz, ret = %d\n",
-                        status, delta_ms, intel_rps_read_actual_frequency(&uncore->gt->rps), ret);
+                        status, delta_ms, intel_rps_read_actual_frequency(&gt->rps), ret);
                guc_info(guc, "load failed: status: Reset = %d, BootROM = 0x%02X, UKernel = 0x%02X, MIA = 0x%02X, Auth = 0x%02X\n",
                         REG_FIELD_GET(GS_MIA_IN_RESET, status),
                         bootrom, ukernel,
@@ -254,11 +254,11 @@ static int guc_wait_ucode(struct intel_guc *guc)
                guc_warn(guc, "excessive init time: %lldms! [status = 0x%08X, count = %d, ret = %d]\n",
                         delta_ms, status, count, ret);
                guc_warn(guc, "excessive init time: [freq = %dMHz, before = %dMHz, perf_limit_reasons = 0x%08X]\n",
-                        intel_rps_read_actual_frequency(&uncore->gt->rps), before_freq,
+                        intel_rps_read_actual_frequency(&gt->rps), before_freq,
                         intel_uncore_read(uncore, intel_gt_perf_limit_reasons_reg(gt)));
        } else {
                guc_dbg(guc, "init took %lldms, freq = %dMHz, before = %dMHz, status = 0x%08X, count = %d, ret = %d\n",
-                       delta_ms, intel_rps_read_actual_frequency(&uncore->gt->rps),
+                       delta_ms, intel_rps_read_actual_frequency(&gt->rps),
                        before_freq, status, count, ret);
        }
 
index 9c40964..e9bf7dc 100644 (file)
@@ -6,6 +6,7 @@
 #include <linux/types.h>
 
 #include "gt/intel_gt.h"
+#include "gt/intel_rps.h"
 #include "intel_guc_reg.h"
 #include "intel_huc.h"
 #include "intel_huc_print.h"
@@ -462,17 +463,68 @@ static const char *auth_mode_string(struct intel_huc *huc,
        return partial ? "clear media" : "all workloads";
 }
 
+/*
+ * Use a longer timeout for debug builds so that problems can be detected
+ * and analysed. But a shorter timeout for releases so that user's don't
+ * wait forever to find out there is a problem. Note that the only reason
+ * an end user should hit the timeout is in case of extreme thermal throttling.
+ * And a system that is that hot during boot is probably dead anyway!
+ */
+#if defined(CONFIG_DRM_I915_DEBUG_GEM)
+#define HUC_LOAD_RETRY_LIMIT   20
+#else
+#define HUC_LOAD_RETRY_LIMIT   3
+#endif
+
 int intel_huc_wait_for_auth_complete(struct intel_huc *huc,
                                     enum intel_huc_authentication_type type)
 {
        struct intel_gt *gt = huc_to_gt(huc);
-       int ret;
+       struct intel_uncore *uncore = gt->uncore;
+       ktime_t before, after, delta;
+       int ret, count;
+       u64 delta_ms;
+       u32 before_freq;
 
-       ret = __intel_wait_for_register(gt->uncore,
-                                       huc->status[type].reg,
-                                       huc->status[type].mask,
-                                       huc->status[type].value,
-                                       2, 50, NULL);
+       /*
+        * The KMD requests maximum frequency during driver load, however thermal
+        * throttling can force the frequency down to minimum (although the board
+        * really should never get that hot in real life!). IFWI  issues have been
+        * seen to cause sporadic failures to grant the higher frequency. And at
+        * minimum frequency, the authentication time can be in the seconds range.
+        * Note that there is a limit on how long an individual wait_for() can wait.
+        * So wrap it in a loop.
+        */
+       before_freq = intel_rps_read_actual_frequency(&gt->rps);
+       before = ktime_get();
+       for (count = 0; count < HUC_LOAD_RETRY_LIMIT; count++) {
+               ret = __intel_wait_for_register(gt->uncore,
+                                               huc->status[type].reg,
+                                               huc->status[type].mask,
+                                               huc->status[type].value,
+                                               2, 1000, NULL);
+               if (!ret)
+                       break;
+
+               huc_dbg(huc, "auth still in progress, count = %d, freq = %dMHz, status = 0x%08X\n",
+                       count, intel_rps_read_actual_frequency(&gt->rps),
+                       huc->status[type].reg.reg);
+       }
+       after = ktime_get();
+       delta = ktime_sub(after, before);
+       delta_ms = ktime_to_ms(delta);
+
+       if (delta_ms > 50) {
+               huc_warn(huc, "excessive auth time: %lldms! [status = 0x%08X, count = %d, ret = %d]\n",
+                        delta_ms, huc->status[type].reg.reg, count, ret);
+               huc_warn(huc, "excessive auth time: [freq = %dMHz, before = %dMHz, perf_limit_reasons = 0x%08X]\n",
+                        intel_rps_read_actual_frequency(&gt->rps), before_freq,
+                        intel_uncore_read(uncore, intel_gt_perf_limit_reasons_reg(gt)));
+       } else {
+               huc_dbg(huc, "auth took %lldms, freq = %dMHz, before = %dMHz, status = 0x%08X, count = %d, ret = %d\n",
+                       delta_ms, intel_rps_read_actual_frequency(&gt->rps),
+                       before_freq, huc->status[type].reg.reg, count, ret);
+       }
 
        /* mark the load process as complete even if the wait failed */
        delayed_huc_load_complete(huc);