From 3800fc3581d8812fd1c522b2a75e85fb3d9ce5db Mon Sep 17 00:00:00 2001 From: jmatthew Date: Fri, 13 Sep 2024 09:57:34 +0000 Subject: [PATCH] Add sensors based on information in the SMART/health log page, showing overall device health and temperature. tested by many (a while ago) tweaks from gkoehler@ kettenis@ dv@ ok kettenis@ jca@ (earlier version), dlg@ --- sys/dev/ic/nvme.c | 99 +++++++++++++++++++++++++++++++++++++++++++- sys/dev/ic/nvmereg.h | 40 +++++++++++++++++- sys/dev/ic/nvmevar.h | 9 +++- 3 files changed, 145 insertions(+), 3 deletions(-) diff --git a/sys/dev/ic/nvme.c b/sys/dev/ic/nvme.c index 7e74d293152..f54454b8db7 100644 --- a/sys/dev/ic/nvme.c +++ b/sys/dev/ic/nvme.c @@ -1,4 +1,4 @@ -/* $OpenBSD: nvme.c,v 1.122 2024/09/01 03:08:56 jsg Exp $ */ +/* $OpenBSD: nvme.c,v 1.123 2024/09/13 09:57:34 jmatthew Exp $ */ /* * Copyright (c) 2014 David Gwynne @@ -60,6 +60,10 @@ void nvme_dumpregs(struct nvme_softc *); int nvme_identify(struct nvme_softc *, u_int); void nvme_fill_identify(struct nvme_softc *, struct nvme_ccb *, void *); +#ifndef SMALL_KERNEL +void nvme_refresh_sensors(void *); +#endif + int nvme_ccbs_alloc(struct nvme_softc *, u_int); void nvme_ccbs_free(struct nvme_softc *, u_int); @@ -158,6 +162,7 @@ static const struct nvme_ops nvme_ops = { #define NVME_TIMO_QOP 5000 /* ms to create/delete queue */ #define NVME_TIMO_PT 5000 /* ms to complete passthrough */ #define NVME_TIMO_IDENT 10000 /* ms to probe/identify */ +#define NVME_TIMO_LOG_PAGE 5000 /* ms to read log pages */ #define NVME_TIMO_DELAYNS 10 /* ns to delay() in poll loop */ /* @@ -407,6 +412,31 @@ nvme_attach(struct nvme_softc *sc) saa.saa_quirks = saa.saa_flags = 0; saa.saa_wwpn = saa.saa_wwnn = 0; + strlcpy(sc->sc_sensordev.xname, DEVNAME(sc), sizeof(sc->sc_sensordev.xname)); + +#ifndef SMALL_KERNEL + sc->sc_temp_sensor.type = SENSOR_TEMP; + sc->sc_temp_sensor.status = SENSOR_S_UNKNOWN; + sensor_attach(&sc->sc_sensordev, &sc->sc_temp_sensor); + + sc->sc_usage_sensor.type = 
SENSOR_PERCENT; + sc->sc_usage_sensor.status = SENSOR_S_UNKNOWN; + strlcpy(sc->sc_usage_sensor.desc, "endurance used", + sizeof(sc->sc_usage_sensor.desc)); + sensor_attach(&sc->sc_sensordev, &sc->sc_usage_sensor); + + sc->sc_spare_sensor.type = SENSOR_PERCENT; + sc->sc_spare_sensor.status = SENSOR_S_UNKNOWN; + strlcpy(sc->sc_spare_sensor.desc, "available spare", + sizeof(sc->sc_spare_sensor.desc)); + sensor_attach(&sc->sc_sensordev, &sc->sc_spare_sensor); + + if (sensor_task_register(sc, nvme_refresh_sensors, 60) == NULL) + goto free_q; + + sensordev_install(&sc->sc_sensordev); +#endif + sc->sc_scsibus = (struct scsibus_softc *)config_found(&sc->sc_dev, &saa, scsiprint); #if NBIO > 0 @@ -2128,3 +2158,70 @@ nvme_bioctl_disk(struct nvme_softc *sc, struct bioc_disk *bd) return 0; } #endif /* NBIO > 0 */ + +#ifndef SMALL_KERNEL +void +nvme_refresh_sensors(void *arg) +{ + struct nvme_softc *sc = arg; + struct nvme_sqe sqe; + struct nvme_dmamem *mem = NULL; + struct nvme_ccb *ccb = NULL; + struct nvm_smart_health *health; + uint32_t dwlen; + uint8_t cw; + int flags; + int64_t temp; + + ccb = nvme_ccb_get(sc); + if (ccb == NULL) + goto failed; + + mem = nvme_dmamem_alloc(sc, sizeof(*health)); + if (mem == NULL) + goto failed; + nvme_dmamem_sync(sc, mem, BUS_DMASYNC_PREREAD); + + dwlen = (sizeof(*health) >> 2) - 1; + memset(&sqe, 0, sizeof(sqe)); + sqe.opcode = NVM_ADMIN_GET_LOG_PG; + htolem32(&sqe.nsid, 0xffffffff); + htolem32(&sqe.cdw10, (dwlen << 16 | NVM_LOG_PAGE_SMART_HEALTH)); + htolem64(&sqe.entry.prp[0], NVME_DMA_DVA(mem)); + + ccb->ccb_done = nvme_empty_done; + ccb->ccb_cookie = &sqe; + flags = nvme_poll(sc, sc->sc_admin_q, ccb, nvme_sqe_fill, NVME_TIMO_LOG_PAGE); + + nvme_dmamem_sync(sc, mem, BUS_DMASYNC_POSTREAD); + + if (flags != 0) + goto failed; + + health = NVME_DMA_KVA(mem); + cw = health->critical_warning; + + sc->sc_temp_sensor.status = (cw & NVM_HEALTH_CW_TEMP) ? 
+ SENSOR_S_CRIT : SENSOR_S_OK; + temp = letoh16(health->temperature); + sc->sc_temp_sensor.value = (temp * 1000000) + 150000; + + sc->sc_spare_sensor.status = (cw & NVM_HEALTH_CW_SPARE) ? + SENSOR_S_CRIT : SENSOR_S_OK; + sc->sc_spare_sensor.value = health->avail_spare * 1000; + + sc->sc_usage_sensor.status = SENSOR_S_OK; + sc->sc_usage_sensor.value = health->percent_used * 1000; + goto done; + + failed: + sc->sc_temp_sensor.status = SENSOR_S_UNKNOWN; + sc->sc_usage_sensor.status = SENSOR_S_UNKNOWN; + sc->sc_spare_sensor.status = SENSOR_S_UNKNOWN; + done: + if (mem != NULL) + nvme_dmamem_free(sc, mem); + if (ccb != NULL) + nvme_ccb_put(sc, ccb); +} +#endif /* SMALL_KERNEL */ diff --git a/sys/dev/ic/nvmereg.h b/sys/dev/ic/nvmereg.h index 2a28c6af83e..84a4533b84c 100644 --- a/sys/dev/ic/nvmereg.h +++ b/sys/dev/ic/nvmereg.h @@ -1,4 +1,4 @@ -/* $OpenBSD: nvmereg.h,v 1.15 2024/05/24 12:04:07 krw Exp $ */ +/* $OpenBSD: nvmereg.h,v 1.16 2024/09/13 09:57:34 jmatthew Exp $ */ /* * Copyright (c) 2014 David Gwynne @@ -415,3 +415,41 @@ struct nvm_identify_namespace { u_int8_t vs[3712]; } __packed __aligned(8); + +#define NVM_LOG_PAGE_SMART_HEALTH 0x02 +struct nvm_smart_health { + u_int8_t critical_warning; +#define NVM_HEALTH_CW_SPARE (1 << 0) +#define NVM_HEALTH_CW_TEMP (1 << 1) +#define NVM_HEALTH_CW_MEDIA (1 << 2) +#define NVM_HEALTH_CW_READONLY (1 << 3) +#define NVM_HEALTH_CW_VOLATILE (1 << 4) +#define NVM_HEALTH_CW_PMR (1 << 5) + u_int16_t temperature; + u_int8_t avail_spare; + u_int8_t avail_spare_threshold; + u_int8_t percent_used; + u_int8_t end_grp_summary; /* 1.4+ */ + + u_int8_t _reserved1[25]; + + u_int64_t data_units_read[2]; + u_int64_t data_units_written[2]; + u_int64_t host_read_commands[2]; + u_int64_t host_write_commands[2]; + u_int64_t busy_time[2]; + u_int64_t power_cycles[2]; + u_int64_t power_on_hours[2]; + u_int64_t unsafe_shutdowns[2]; + u_int64_t integrity_errors[2]; + u_int64_t error_log_entries[2]; + u_int32_t warn_temp_time; /* 1.2+ */ + u_int32_t 
crit_temp_time; /* 1.2+ */ + u_int16_t temp_sensors[8]; /* 1.2+ */ + u_int32_t therm_mgmt_count_1; /* 1.3+ */ + u_int32_t therm_mgmt_count_2; /* 1.3+ */ + u_int32_t therm_mgmt_time_1; /* 1.3+ */ + u_int32_t therm_mgmt_time_2; /* 1.3+ */ + + u_int8_t _reserved2[280]; +} __packed __aligned(8); diff --git a/sys/dev/ic/nvmevar.h b/sys/dev/ic/nvmevar.h index 15137057818..7fe0b26fd8f 100644 --- a/sys/dev/ic/nvmevar.h +++ b/sys/dev/ic/nvmevar.h @@ -1,4 +1,4 @@ -/* $OpenBSD: nvmevar.h,v 1.30 2024/06/26 21:41:30 asou Exp $ */ +/* $OpenBSD: nvmevar.h,v 1.31 2024/09/13 09:57:34 jmatthew Exp $ */ /* * Copyright (c) 2014 David Gwynne @@ -16,6 +16,8 @@ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ +#include <sys/sensors.h> + #define NVME_IO_Q 1 #define NVME_HIB_Q 2 #define NVME_MAXPHYS (128 * 1024) @@ -126,6 +128,11 @@ struct nvme_softc { struct scsi_iopool sc_iopool; struct rwlock sc_lock; struct scsibus_softc *sc_scsibus; + + struct ksensordev sc_sensordev; + struct ksensor sc_temp_sensor; + struct ksensor sc_spare_sensor; + struct ksensor sc_usage_sensor; }; #define DEVNAME(_sc) ((_sc)->sc_dev.dv_xname) -- 2.20.1