From 788b966fef162d03c5f60cf24681ffe16563fd81 Mon Sep 17 00:00:00 2001 From: jsg Date: Thu, 17 Aug 2023 03:55:03 +0000 Subject: [PATCH] drm/amd/pm: avoid unintentional shutdown due to temperature momentary fluctuation From Evan Quan 0f19195d639764d68f6f316dda363ba29821e5bc in linux-6.1.y/6.1.46 b75efe88b20c2be28b67e2821a794cc183e32374 in mainline linux --- sys/dev/pci/drm/amd/amdgpu/amdgpu.h | 3 ++ .../pci/drm/amd/pm/powerplay/amd_powerplay.c | 48 +++++++++++++++++++ .../drm/amd/pm/powerplay/hwmgr/smu_helper.c | 27 ++++------- sys/dev/pci/drm/amd/pm/powerplay/inc/hwmgr.h | 2 + sys/dev/pci/drm/amd/pm/swsmu/amdgpu_smu.c | 34 +++++++++++++ sys/dev/pci/drm/amd/pm/swsmu/inc/amdgpu_smu.h | 2 + .../pci/drm/amd/pm/swsmu/smu11/smu_v11_0.c | 9 +--- .../pci/drm/amd/pm/swsmu/smu13/smu_v13_0.c | 9 +--- 8 files changed, 102 insertions(+), 32 deletions(-) diff --git a/sys/dev/pci/drm/amd/amdgpu/amdgpu.h b/sys/dev/pci/drm/amd/amdgpu/amdgpu.h index b15b9dc227a..6bd8895f3d4 100644 --- a/sys/dev/pci/drm/amd/amdgpu/amdgpu.h +++ b/sys/dev/pci/drm/amd/amdgpu/amdgpu.h @@ -288,6 +288,9 @@ extern int amdgpu_sg_display; #define AMDGPU_SMARTSHIFT_MAX_BIAS (100) #define AMDGPU_SMARTSHIFT_MIN_BIAS (-100) +/* Extra time delay(in ms) to eliminate the influence of temperature momentary fluctuation */ +#define AMDGPU_SWCTF_EXTRA_DELAY 50 + struct amdgpu_device; struct amdgpu_irq_src; struct amdgpu_fpriv; diff --git a/sys/dev/pci/drm/amd/pm/powerplay/amd_powerplay.c b/sys/dev/pci/drm/amd/pm/powerplay/amd_powerplay.c index 012dbdb193a..aa1b7a4cb26 100644 --- a/sys/dev/pci/drm/amd/pm/powerplay/amd_powerplay.c +++ b/sys/dev/pci/drm/amd/pm/powerplay/amd_powerplay.c @@ -26,6 +26,7 @@ #include #include #include +#include #include "amd_shared.h" #include "amd_powerplay.h" #include "power_state.h" @@ -91,6 +92,45 @@ static int pp_early_init(void *handle) return 0; } +static void pp_swctf_delayed_work_handler(struct work_struct *work) +{ + struct pp_hwmgr *hwmgr = + container_of(work, struct pp_hwmgr, swctf_delayed_work.work); + struct amdgpu_device *adev = hwmgr->adev; + struct amdgpu_dpm_thermal *range = + &adev->pm.dpm.thermal; + uint32_t gpu_temperature, size; + int ret; + + /* + * If the hotspot/edge temperature is confirmed as below SW CTF setting point + * after the delay enforced, nothing will be done. + * Otherwise, a graceful shutdown will be performed to prevent further damage. + */ + if (range->sw_ctf_threshold && + hwmgr->hwmgr_func->read_sensor) { + ret = hwmgr->hwmgr_func->read_sensor(hwmgr, + AMDGPU_PP_SENSOR_HOTSPOT_TEMP, + &gpu_temperature, + &size); + /* + * For some legacy ASICs, hotspot temperature retrieving might be not + * supported. Check the edge temperature instead then. + */ + if (ret == -EOPNOTSUPP) + ret = hwmgr->hwmgr_func->read_sensor(hwmgr, + AMDGPU_PP_SENSOR_EDGE_TEMP, + &gpu_temperature, + &size); + if (!ret && gpu_temperature / 1000 < range->sw_ctf_threshold) + return; + } + + dev_emerg(adev->dev, "ERROR: GPU over temperature range(SW CTF) detected!\n"); + dev_emerg(adev->dev, "ERROR: System is going to shutdown due to GPU SW CTF!\n"); + orderly_poweroff(true); +} + static int pp_sw_init(void *handle) { struct amdgpu_device *adev = handle; @@ -101,6 +141,10 @@ static int pp_sw_init(void *handle) pr_debug("powerplay sw init %s\n", ret ? "failed" : "successfully"); + if (!ret) + INIT_DELAYED_WORK(&hwmgr->swctf_delayed_work, + pp_swctf_delayed_work_handler); + return ret; } @@ -136,6 +180,8 @@ static int pp_hw_fini(void *handle) struct amdgpu_device *adev = handle; struct pp_hwmgr *hwmgr = adev->powerplay.pp_handle; + cancel_delayed_work_sync(&hwmgr->swctf_delayed_work); + hwmgr_hw_fini(hwmgr); return 0; @@ -222,6 +268,8 @@ static int pp_suspend(void *handle) struct amdgpu_device *adev = handle; struct pp_hwmgr *hwmgr = adev->powerplay.pp_handle; + cancel_delayed_work_sync(&hwmgr->swctf_delayed_work); + return hwmgr_suspend(hwmgr); } diff --git a/sys/dev/pci/drm/amd/pm/powerplay/hwmgr/smu_helper.c b/sys/dev/pci/drm/amd/pm/powerplay/hwmgr/smu_helper.c index bfe80ac0ad8..d0b1ab6c452 100644 --- a/sys/dev/pci/drm/amd/pm/powerplay/hwmgr/smu_helper.c +++ b/sys/dev/pci/drm/amd/pm/powerplay/hwmgr/smu_helper.c @@ -603,21 +603,17 @@ int phm_irq_process(struct amdgpu_device *adev, struct amdgpu_irq_src *source, struct amdgpu_iv_entry *entry) { + struct pp_hwmgr *hwmgr = adev->powerplay.pp_handle; uint32_t client_id = entry->client_id; uint32_t src_id = entry->src_id; if (client_id == AMDGPU_IRQ_CLIENTID_LEGACY) { if (src_id == VISLANDS30_IV_SRCID_CG_TSS_THERMAL_LOW_TO_HIGH) { - dev_emerg(adev->dev, "ERROR: GPU over temperature range(SW CTF) detected!\n"); - /* - * SW CTF just occurred. - * Try to do a graceful shutdown to prevent further damage. - */ - dev_emerg(adev->dev, "ERROR: System is going to shutdown due to GPU SW CTF!\n"); - orderly_poweroff(true); - } else if (src_id == VISLANDS30_IV_SRCID_CG_TSS_THERMAL_HIGH_TO_LOW) + schedule_delayed_work(&hwmgr->swctf_delayed_work, + msecs_to_jiffies(AMDGPU_SWCTF_EXTRA_DELAY)); + } else if (src_id == VISLANDS30_IV_SRCID_CG_TSS_THERMAL_HIGH_TO_LOW) { dev_emerg(adev->dev, "ERROR: GPU under temperature range detected!\n"); - else if (src_id == VISLANDS30_IV_SRCID_GPIO_19) { + } else if (src_id == VISLANDS30_IV_SRCID_GPIO_19) { dev_emerg(adev->dev, "ERROR: GPU HW Critical Temperature Fault(aka CTF) detected!\n"); /* * HW CTF just occurred. Shutdown to prevent further damage. @@ -626,15 +622,10 @@ int phm_irq_process(struct amdgpu_device *adev, orderly_poweroff(true); } } else if (client_id == SOC15_IH_CLIENTID_THM) { - if (src_id == 0) { - dev_emerg(adev->dev, "ERROR: GPU over temperature range(SW CTF) detected!\n"); - /* - * SW CTF just occurred. - * Try to do a graceful shutdown to prevent further damage. - */ - dev_emerg(adev->dev, "ERROR: System is going to shutdown due to GPU SW CTF!\n"); - orderly_poweroff(true); - } else + if (src_id == 0) + schedule_delayed_work(&hwmgr->swctf_delayed_work, + msecs_to_jiffies(AMDGPU_SWCTF_EXTRA_DELAY)); + else dev_emerg(adev->dev, "ERROR: GPU under temperature range detected!\n"); } else if (client_id == SOC15_IH_CLIENTID_ROM_SMUIO) { dev_emerg(adev->dev, "ERROR: GPU HW Critical Temperature Fault(aka CTF) detected!\n"); diff --git a/sys/dev/pci/drm/amd/pm/powerplay/inc/hwmgr.h b/sys/dev/pci/drm/amd/pm/powerplay/inc/hwmgr.h index 06ad0aebfff..713c95725af 100644 --- a/sys/dev/pci/drm/amd/pm/powerplay/inc/hwmgr.h +++ b/sys/dev/pci/drm/amd/pm/powerplay/inc/hwmgr.h @@ -811,6 +811,8 @@ struct pp_hwmgr { bool gfxoff_state_changed_by_workload; uint32_t pstate_sclk_peak; uint32_t pstate_mclk_peak; + + struct delayed_work swctf_delayed_work; }; int hwmgr_early_init(struct pp_hwmgr *hwmgr); diff --git a/sys/dev/pci/drm/amd/pm/swsmu/amdgpu_smu.c b/sys/dev/pci/drm/amd/pm/swsmu/amdgpu_smu.c index 7a7d3a5e0d3..f68365e8a89 100644 --- a/sys/dev/pci/drm/amd/pm/swsmu/amdgpu_smu.c +++ b/sys/dev/pci/drm/amd/pm/swsmu/amdgpu_smu.c @@ -24,6 +24,7 @@ #include #include +#include #include "amdgpu.h" #include "amdgpu_smu.h" @@ -1061,6 +1062,34 @@ static void smu_interrupt_work_fn(struct work_struct *work) smu->ppt_funcs->interrupt_work(smu); } +static void smu_swctf_delayed_work_handler(struct work_struct *work) +{ + struct smu_context *smu = + container_of(work, struct smu_context, swctf_delayed_work.work); + struct smu_temperature_range *range = + &smu->thermal_range; + struct amdgpu_device *adev = smu->adev; + uint32_t hotspot_tmp, size; + + /* + * If the hotspot temperature is confirmed as below SW CTF setting point + * after the delay enforced, nothing will be done. + * Otherwise, a graceful shutdown will be performed to prevent further damage. + */ + if (range->software_shutdown_temp && + smu->ppt_funcs->read_sensor && + !smu->ppt_funcs->read_sensor(smu, + AMDGPU_PP_SENSOR_HOTSPOT_TEMP, + &hotspot_tmp, + &size) && + hotspot_tmp / 1000 < range->software_shutdown_temp) + return; + + dev_emerg(adev->dev, "ERROR: GPU over temperature range(SW CTF) detected!\n"); + dev_emerg(adev->dev, "ERROR: System is going to shutdown due to GPU SW CTF!\n"); + orderly_poweroff(true); +} + static int smu_sw_init(void *handle) { struct amdgpu_device *adev = (struct amdgpu_device *)handle; @@ -1109,6 +1138,9 @@ static int smu_sw_init(void *handle) return ret; } + INIT_DELAYED_WORK(&smu->swctf_delayed_work, + smu_swctf_delayed_work_handler); + ret = smu_smc_table_sw_init(smu); if (ret) { dev_err(adev->dev, "Failed to sw init smc table!\n"); @@ -1581,6 +1613,8 @@ static int smu_smc_hw_cleanup(struct smu_context *smu) return ret; } + cancel_delayed_work_sync(&smu->swctf_delayed_work); + ret = smu_disable_dpms(smu); if (ret) { dev_err(adev->dev, "Fail to disable dpm features!\n"); diff --git a/sys/dev/pci/drm/amd/pm/swsmu/inc/amdgpu_smu.h b/sys/dev/pci/drm/amd/pm/swsmu/inc/amdgpu_smu.h index 1bb953c9796..d070ce60fd4 100644 --- a/sys/dev/pci/drm/amd/pm/swsmu/inc/amdgpu_smu.h +++ b/sys/dev/pci/drm/amd/pm/swsmu/inc/amdgpu_smu.h @@ -573,6 +573,8 @@ struct smu_context u32 debug_param_reg; u32 debug_msg_reg; u32 debug_resp_reg; + + struct delayed_work swctf_delayed_work; }; struct i2c_adapter; diff --git a/sys/dev/pci/drm/amd/pm/swsmu/smu11/smu_v11_0.c b/sys/dev/pci/drm/amd/pm/swsmu/smu11/smu_v11_0.c index efd7d1feb0c..6de309a4a60 100644 --- a/sys/dev/pci/drm/amd/pm/swsmu/smu11/smu_v11_0.c +++ b/sys/dev/pci/drm/amd/pm/swsmu/smu11/smu_v11_0.c @@ -1438,13 +1438,8 @@ static int smu_v11_0_irq_process(struct amdgpu_device *adev, if (client_id == SOC15_IH_CLIENTID_THM) { switch (src_id) { case THM_11_0__SRCID__THM_DIG_THERM_L2H: - dev_emerg(adev->dev, "ERROR: GPU over temperature range(SW CTF) detected!\n"); - /* - * SW CTF just occurred. - * Try to do a graceful shutdown to prevent further damage. - */ - dev_emerg(adev->dev, "ERROR: System is going to shutdown due to GPU SW CTF!\n"); - orderly_poweroff(true); + schedule_delayed_work(&smu->swctf_delayed_work, + msecs_to_jiffies(AMDGPU_SWCTF_EXTRA_DELAY)); break; case THM_11_0__SRCID__THM_DIG_THERM_H2L: dev_emerg(adev->dev, "ERROR: GPU under temperature range detected\n"); diff --git a/sys/dev/pci/drm/amd/pm/swsmu/smu13/smu_v13_0.c b/sys/dev/pci/drm/amd/pm/swsmu/smu13/smu_v13_0.c index a2ad1a06806..7a4c6aef0ef 100644 --- a/sys/dev/pci/drm/amd/pm/swsmu/smu13/smu_v13_0.c +++ b/sys/dev/pci/drm/amd/pm/swsmu/smu13/smu_v13_0.c @@ -1386,13 +1386,8 @@ static int smu_v13_0_irq_process(struct amdgpu_device *adev, if (client_id == SOC15_IH_CLIENTID_THM) { switch (src_id) { case THM_11_0__SRCID__THM_DIG_THERM_L2H: - dev_emerg(adev->dev, "ERROR: GPU over temperature range(SW CTF) detected!\n"); - /* - * SW CTF just occurred. - * Try to do a graceful shutdown to prevent further damage. - */ - dev_emerg(adev->dev, "ERROR: System is going to shutdown due to GPU SW CTF!\n"); - orderly_poweroff(true); + schedule_delayed_work(&smu->swctf_delayed_work, + msecs_to_jiffies(AMDGPU_SWCTF_EXTRA_DELAY)); break; case THM_11_0__SRCID__THM_DIG_THERM_H2L: dev_emerg(adev->dev, "ERROR: GPU under temperature range detected\n"); -- 2.20.1