#define AMDGPU_SMARTSHIFT_MAX_BIAS (100)
#define AMDGPU_SMARTSHIFT_MIN_BIAS (-100)
+/* Extra delay (in ms) before acting on a SW CTF event, to filter out momentary temperature fluctuations */
+#define AMDGPU_SWCTF_EXTRA_DELAY 50
+
struct amdgpu_device;
struct amdgpu_irq_src;
struct amdgpu_fpriv;
#include <linux/gfp.h>
#include <linux/slab.h>
#include <linux/firmware.h>
+#include <linux/reboot.h>
#include "amd_shared.h"
#include "amd_powerplay.h"
#include "power_state.h"
return 0;
}
+/*
+ * Delayed-work handler for the powerplay SW CTF (software critical
+ * temperature fault) path.
+ *
+ * @work: the work_struct embedded in pp_hwmgr::swctf_delayed_work.
+ *
+ * Re-reads the GPU temperature once the delay has elapsed. If the reading
+ * is below the SW CTF threshold the event is treated as a momentary spike
+ * and ignored; in every other case (no threshold, no read_sensor callback,
+ * read failure, or still too hot) an orderly poweroff is requested.
+ */
+static void pp_swctf_delayed_work_handler(struct work_struct *work)
+{
+	struct pp_hwmgr *hwmgr =
+		container_of(work, struct pp_hwmgr, swctf_delayed_work.work);
+	struct amdgpu_device *adev = hwmgr->adev;
+	struct amdgpu_dpm_thermal *range =
+				&adev->pm.dpm.thermal;
+	uint32_t gpu_temperature;
+	/*
+	 * read_sensor() receives the size by pointer, so hand it the real
+	 * buffer size instead of an indeterminate stack value.
+	 */
+	uint32_t size = sizeof(gpu_temperature);
+	int ret;
+
+	/*
+	 * If the hotspot/edge temperature is confirmed to be below the SW CTF
+	 * set point after the enforced delay, nothing is done.
+	 * Otherwise, a graceful shutdown is performed to prevent further damage.
+	 */
+	if (range->sw_ctf_threshold &&
+	    hwmgr->hwmgr_func->read_sensor) {
+		ret = hwmgr->hwmgr_func->read_sensor(hwmgr,
+						     AMDGPU_PP_SENSOR_HOTSPOT_TEMP,
+						     &gpu_temperature,
+						     &size);
+		/*
+		 * On some legacy ASICs, reading the hotspot temperature might
+		 * not be supported. Fall back to the edge temperature then.
+		 */
+		if (ret == -EOPNOTSUPP)
+			ret = hwmgr->hwmgr_func->read_sensor(hwmgr,
+							     AMDGPU_PP_SENSOR_EDGE_TEMP,
+							     &gpu_temperature,
+							     &size);
+		/* NOTE(review): the / 1000 presumably converts millidegrees to degrees - confirm */
+		if (!ret && gpu_temperature / 1000 < range->sw_ctf_threshold)
+			return;
+	}
+
+	dev_emerg(adev->dev, "ERROR: GPU over temperature range(SW CTF) detected!\n");
+	dev_emerg(adev->dev, "ERROR: System is going to shutdown due to GPU SW CTF!\n");
+	orderly_poweroff(true);
+}
+
+
static int pp_sw_init(void *handle)
{
struct amdgpu_device *adev = handle;
pr_debug("powerplay sw init %s\n", ret ? "failed" : "successfully");
+ if (!ret)
+ INIT_DELAYED_WORK(&hwmgr->swctf_delayed_work,
+ pp_swctf_delayed_work_handler);
+
return ret;
}
struct amdgpu_device *adev = handle;
struct pp_hwmgr *hwmgr = adev->powerplay.pp_handle;
+ cancel_delayed_work_sync(&hwmgr->swctf_delayed_work);
+
hwmgr_hw_fini(hwmgr);
return 0;
struct amdgpu_device *adev = handle;
struct pp_hwmgr *hwmgr = adev->powerplay.pp_handle;
+ cancel_delayed_work_sync(&hwmgr->swctf_delayed_work);
+
return hwmgr_suspend(hwmgr);
}
struct amdgpu_irq_src *source,
struct amdgpu_iv_entry *entry)
{
+ struct pp_hwmgr *hwmgr = adev->powerplay.pp_handle;
uint32_t client_id = entry->client_id;
uint32_t src_id = entry->src_id;
if (client_id == AMDGPU_IRQ_CLIENTID_LEGACY) {
if (src_id == VISLANDS30_IV_SRCID_CG_TSS_THERMAL_LOW_TO_HIGH) {
- dev_emerg(adev->dev, "ERROR: GPU over temperature range(SW CTF) detected!\n");
- /*
- * SW CTF just occurred.
- * Try to do a graceful shutdown to prevent further damage.
- */
- dev_emerg(adev->dev, "ERROR: System is going to shutdown due to GPU SW CTF!\n");
- orderly_poweroff(true);
- } else if (src_id == VISLANDS30_IV_SRCID_CG_TSS_THERMAL_HIGH_TO_LOW)
+ schedule_delayed_work(&hwmgr->swctf_delayed_work,
+ msecs_to_jiffies(AMDGPU_SWCTF_EXTRA_DELAY));
+ } else if (src_id == VISLANDS30_IV_SRCID_CG_TSS_THERMAL_HIGH_TO_LOW) {
dev_emerg(adev->dev, "ERROR: GPU under temperature range detected!\n");
- else if (src_id == VISLANDS30_IV_SRCID_GPIO_19) {
+ } else if (src_id == VISLANDS30_IV_SRCID_GPIO_19) {
dev_emerg(adev->dev, "ERROR: GPU HW Critical Temperature Fault(aka CTF) detected!\n");
/*
* HW CTF just occurred. Shutdown to prevent further damage.
orderly_poweroff(true);
}
} else if (client_id == SOC15_IH_CLIENTID_THM) {
- if (src_id == 0) {
- dev_emerg(adev->dev, "ERROR: GPU over temperature range(SW CTF) detected!\n");
- /*
- * SW CTF just occurred.
- * Try to do a graceful shutdown to prevent further damage.
- */
- dev_emerg(adev->dev, "ERROR: System is going to shutdown due to GPU SW CTF!\n");
- orderly_poweroff(true);
- } else
+ if (src_id == 0)
+ schedule_delayed_work(&hwmgr->swctf_delayed_work,
+ msecs_to_jiffies(AMDGPU_SWCTF_EXTRA_DELAY));
+ else
dev_emerg(adev->dev, "ERROR: GPU under temperature range detected!\n");
} else if (client_id == SOC15_IH_CLIENTID_ROM_SMUIO) {
dev_emerg(adev->dev, "ERROR: GPU HW Critical Temperature Fault(aka CTF) detected!\n");
bool gfxoff_state_changed_by_workload;
uint32_t pstate_sclk_peak;
uint32_t pstate_mclk_peak;
+
+ struct delayed_work swctf_delayed_work;
};
int hwmgr_early_init(struct pp_hwmgr *hwmgr);
#include <linux/firmware.h>
#include <linux/pci.h>
+#include <linux/reboot.h>
#include "amdgpu.h"
#include "amdgpu_smu.h"
smu->ppt_funcs->interrupt_work(smu);
}
+/*
+ * Delayed-work handler for the swSMU SW CTF (software critical
+ * temperature fault) path.
+ *
+ * @work: the work_struct embedded in smu_context::swctf_delayed_work.
+ *
+ * Re-reads the hotspot temperature once the delay has elapsed. If the
+ * reading is below the software shutdown temperature the event is ignored;
+ * in every other case (no threshold, no read_sensor callback, read failure,
+ * or still too hot) an orderly poweroff is requested.
+ */
+static void smu_swctf_delayed_work_handler(struct work_struct *work)
+{
+	struct smu_context *smu =
+		container_of(work, struct smu_context, swctf_delayed_work.work);
+	struct smu_temperature_range *range =
+				&smu->thermal_range;
+	struct amdgpu_device *adev = smu->adev;
+	uint32_t hotspot_tmp;
+	/*
+	 * read_sensor() receives the size by pointer, so hand it the real
+	 * buffer size instead of an indeterminate stack value.
+	 */
+	uint32_t size = sizeof(hotspot_tmp);
+
+	/*
+	 * If the hotspot temperature is confirmed to be below the SW CTF set
+	 * point after the enforced delay, nothing is done.
+	 * Otherwise, a graceful shutdown is performed to prevent further damage.
+	 */
+	if (range->software_shutdown_temp &&
+	    smu->ppt_funcs->read_sensor &&
+	    !smu->ppt_funcs->read_sensor(smu,
+					 AMDGPU_PP_SENSOR_HOTSPOT_TEMP,
+					 &hotspot_tmp,
+					 &size) &&
+	    /* NOTE(review): the / 1000 presumably converts millidegrees to degrees - confirm */
+	    hotspot_tmp / 1000 < range->software_shutdown_temp)
+		return;
+
+	dev_emerg(adev->dev, "ERROR: GPU over temperature range(SW CTF) detected!\n");
+	dev_emerg(adev->dev, "ERROR: System is going to shutdown due to GPU SW CTF!\n");
+	orderly_poweroff(true);
+}
+
+
static int smu_sw_init(void *handle)
{
struct amdgpu_device *adev = (struct amdgpu_device *)handle;
return ret;
}
+ INIT_DELAYED_WORK(&smu->swctf_delayed_work,
+ smu_swctf_delayed_work_handler);
+
ret = smu_smc_table_sw_init(smu);
if (ret) {
dev_err(adev->dev, "Failed to sw init smc table!\n");
return ret;
}
+ cancel_delayed_work_sync(&smu->swctf_delayed_work);
+
ret = smu_disable_dpms(smu);
if (ret) {
dev_err(adev->dev, "Fail to disable dpm features!\n");
u32 debug_param_reg;
u32 debug_msg_reg;
u32 debug_resp_reg;
+
+ struct delayed_work swctf_delayed_work;
};
struct i2c_adapter;
if (client_id == SOC15_IH_CLIENTID_THM) {
switch (src_id) {
case THM_11_0__SRCID__THM_DIG_THERM_L2H:
- dev_emerg(adev->dev, "ERROR: GPU over temperature range(SW CTF) detected!\n");
- /*
- * SW CTF just occurred.
- * Try to do a graceful shutdown to prevent further damage.
- */
- dev_emerg(adev->dev, "ERROR: System is going to shutdown due to GPU SW CTF!\n");
- orderly_poweroff(true);
+ schedule_delayed_work(&smu->swctf_delayed_work,
+ msecs_to_jiffies(AMDGPU_SWCTF_EXTRA_DELAY));
break;
case THM_11_0__SRCID__THM_DIG_THERM_H2L:
dev_emerg(adev->dev, "ERROR: GPU under temperature range detected\n");
if (client_id == SOC15_IH_CLIENTID_THM) {
switch (src_id) {
case THM_11_0__SRCID__THM_DIG_THERM_L2H:
- dev_emerg(adev->dev, "ERROR: GPU over temperature range(SW CTF) detected!\n");
- /*
- * SW CTF just occurred.
- * Try to do a graceful shutdown to prevent further damage.
- */
- dev_emerg(adev->dev, "ERROR: System is going to shutdown due to GPU SW CTF!\n");
- orderly_poweroff(true);
+ schedule_delayed_work(&smu->swctf_delayed_work,
+ msecs_to_jiffies(AMDGPU_SWCTF_EXTRA_DELAY));
break;
case THM_11_0__SRCID__THM_DIG_THERM_H2L:
dev_emerg(adev->dev, "ERROR: GPU under temperature range detected\n");