Add sensors based on information in the SMART/health log page,
author jmatthew <jmatthew@openbsd.org>
Fri, 13 Sep 2024 09:57:34 +0000 (09:57 +0000)
committer jmatthew <jmatthew@openbsd.org>
Fri, 13 Sep 2024 09:57:34 +0000 (09:57 +0000)
showing overall device health and temperature.

tested by many (a while ago)
tweaks from gkoehler@ kettenis@ dv@
ok kettenis@ jca@ (earlier version), dlg@

sys/dev/ic/nvme.c
sys/dev/ic/nvmereg.h
sys/dev/ic/nvmevar.h

index 7e74d29..f54454b 100644 (file)
@@ -1,4 +1,4 @@
-/*     $OpenBSD: nvme.c,v 1.122 2024/09/01 03:08:56 jsg Exp $ */
+/*     $OpenBSD: nvme.c,v 1.123 2024/09/13 09:57:34 jmatthew Exp $ */
 
 /*
  * Copyright (c) 2014 David Gwynne <dlg@openbsd.org>
@@ -60,6 +60,10 @@ void nvme_dumpregs(struct nvme_softc *);
 int    nvme_identify(struct nvme_softc *, u_int);
 void   nvme_fill_identify(struct nvme_softc *, struct nvme_ccb *, void *);
 
+#ifndef SMALL_KERNEL
+void   nvme_refresh_sensors(void *); /* registered via sensor_task_register() in nvme_attach(); arg is the nvme_softc */
+#endif /* SMALL_KERNEL */
+
 int    nvme_ccbs_alloc(struct nvme_softc *, u_int);
 void   nvme_ccbs_free(struct nvme_softc *, u_int);
 
@@ -158,6 +162,7 @@ static const struct nvme_ops nvme_ops = {
 #define NVME_TIMO_QOP                  5000    /* ms to create/delete queue */
 #define NVME_TIMO_PT                   5000    /* ms to complete passthrough */
 #define NVME_TIMO_IDENT                        10000   /* ms to probe/identify */
+#define NVME_TIMO_LOG_PAGE             5000    /* ms to read log pages */
 #define NVME_TIMO_DELAYNS              10      /* ns to delay() in poll loop */
 
 /*
@@ -407,6 +412,31 @@ nvme_attach(struct nvme_softc *sc)
        saa.saa_quirks = saa.saa_flags = 0;
        saa.saa_wwpn = saa.saa_wwnn = 0;
 
+       strlcpy(sc->sc_sensordev.xname, DEVNAME(sc), sizeof(sc->sc_sensordev.xname));
+
+#ifndef SMALL_KERNEL
+       sc->sc_temp_sensor.type = SENSOR_TEMP;
+       sc->sc_temp_sensor.status = SENSOR_S_UNKNOWN;
+       sensor_attach(&sc->sc_sensordev, &sc->sc_temp_sensor);
+
+       sc->sc_usage_sensor.type = SENSOR_PERCENT;
+       sc->sc_usage_sensor.status = SENSOR_S_UNKNOWN;
+       strlcpy(sc->sc_usage_sensor.desc, "endurance used",
+           sizeof(sc->sc_usage_sensor.desc));
+       sensor_attach(&sc->sc_sensordev, &sc->sc_usage_sensor);
+
+       sc->sc_spare_sensor.type = SENSOR_PERCENT;
+       sc->sc_spare_sensor.status = SENSOR_S_UNKNOWN;
+       strlcpy(sc->sc_spare_sensor.desc, "available spare",
+           sizeof(sc->sc_spare_sensor.desc));
+       sensor_attach(&sc->sc_sensordev, &sc->sc_spare_sensor);
+
+       if (sensor_task_register(sc, nvme_refresh_sensors, 60) == NULL)
+               goto free_q;
+
+       sensordev_install(&sc->sc_sensordev);
+#endif
+
        sc->sc_scsibus = (struct scsibus_softc *)config_found(&sc->sc_dev,
            &saa, scsiprint);
 #if NBIO > 0
@@ -2128,3 +2158,70 @@ nvme_bioctl_disk(struct nvme_softc *sc, struct bioc_disk *bd)
        return 0;
 }
 #endif /* NBIO > 0 */
+
+#ifndef SMALL_KERNEL
+void
+nvme_refresh_sensors(void *arg) /* sensor task: fetch the SMART/health log page and update the temp, spare and usage sensors */
+{
+       struct nvme_softc               *sc = arg; /* arg is the softc handed to sensor_task_register() */
+       struct nvme_sqe                  sqe;
+       struct nvme_dmamem              *mem = NULL; /* NULL so the cleanup code can tell whether it was allocated */
+       struct nvme_ccb                 *ccb = NULL; /* NULL so the cleanup code can tell whether it was obtained */
+       struct nvm_smart_health         *health;
+       uint32_t                         dwlen;
+       uint8_t                          cw;
+       int                              flags;
+       int64_t                          temp;
+
+       ccb = nvme_ccb_get(sc);
+       if (ccb == NULL)
+               goto failed;
+
+       mem = nvme_dmamem_alloc(sc, sizeof(*health)); /* buffer sized for one whole log page structure */
+       if (mem == NULL)
+               goto failed;
+       nvme_dmamem_sync(sc, mem, BUS_DMASYNC_PREREAD); /* sync before the command that fills mem is issued */
+
+       dwlen = (sizeof(*health) >> 2) - 1; /* size in 32-bit words, minus one */
+       memset(&sqe, 0, sizeof(sqe));
+       sqe.opcode = NVM_ADMIN_GET_LOG_PG;
+       htolem32(&sqe.nsid, 0xffffffff); /* NOTE(review): all-ones nsid presumably means "not namespace-specific" - confirm against the NVMe spec */
+       htolem32(&sqe.cdw10, (dwlen << 16 | NVM_LOG_PAGE_SMART_HEALTH)); /* word count from bit 16 up, log page id in the low bits */
+       htolem64(&sqe.entry.prp[0], NVME_DMA_DVA(mem)); /* prp[0] gets the address of mem as given by NVME_DMA_DVA() */
+
+       ccb->ccb_done = nvme_empty_done;
+       ccb->ccb_cookie = &sqe; /* presumably read back by nvme_sqe_fill (defined elsewhere) - confirm */
+       flags = nvme_poll(sc, sc->sc_admin_q, ccb, nvme_sqe_fill, NVME_TIMO_LOG_PAGE); /* issued on the admin queue; a nonzero result is treated as failure below */
+
+       nvme_dmamem_sync(sc, mem, BUS_DMASYNC_POSTREAD); /* performed whether or not the command succeeded */
+
+       if (flags != 0)
+               goto failed;
+
+       health = NVME_DMA_KVA(mem); 
+       cw = health->critical_warning;
+
+       sc->sc_temp_sensor.status = (cw & NVM_HEALTH_CW_TEMP) ? /* critical only when the temperature warning bit is set */
+           SENSOR_S_CRIT : SENSOR_S_OK;
+       temp = letoh16(health->temperature); /* 16-bit little-endian field */
+       sc->sc_temp_sensor.value = (temp * 1000000) + 150000; /* NOTE(review): presumably Kelvin -> microkelvin plus 0.15 K so whole-Kelvin readings land on whole degC - confirm units */
+
+       sc->sc_spare_sensor.status = (cw & NVM_HEALTH_CW_SPARE) ? /* critical only when the spare warning bit is set */
+           SENSOR_S_CRIT : SENSOR_S_OK;
+       sc->sc_spare_sensor.value = health->avail_spare * 1000; /* NOTE(review): presumably percent scaled to 0.001% sensor units - confirm */
+
+       sc->sc_usage_sensor.status = SENSOR_S_OK; /* no critical_warning bit is consulted for usage */
+       sc->sc_usage_sensor.value = health->percent_used * 1000; /* same scaling as the spare sensor */
+       goto done; /* success: skip the failure path */
+
+ failed: /* any failure above: mark all three sensors unknown */
+       sc->sc_temp_sensor.status = SENSOR_S_UNKNOWN;
+       sc->sc_usage_sensor.status = SENSOR_S_UNKNOWN;
+       sc->sc_spare_sensor.status = SENSOR_S_UNKNOWN;
+ done: /* shared cleanup: release only what was actually obtained */
+       if (mem != NULL)
+               nvme_dmamem_free(sc, mem);
+       if (ccb != NULL)
+               nvme_ccb_put(sc, ccb);
+}
+#endif /* SMALL_KERNEL */
index 2a28c6a..84a4533 100644 (file)
@@ -1,4 +1,4 @@
-/*     $OpenBSD: nvmereg.h,v 1.15 2024/05/24 12:04:07 krw Exp $ */
+/*     $OpenBSD: nvmereg.h,v 1.16 2024/09/13 09:57:34 jmatthew Exp $ */
 
 /*
  * Copyright (c) 2014 David Gwynne <dlg@openbsd.org>
@@ -415,3 +415,41 @@ struct nvm_identify_namespace {
 
        u_int8_t        vs[3712];
 } __packed __aligned(8);
+
+#define NVM_LOG_PAGE_SMART_HEALTH      0x02 /* log page id used with NVM_ADMIN_GET_LOG_PG */
+struct nvm_smart_health { /* layout of the SMART/health log page; the packed field sizes add up to 512 bytes */
+       u_int8_t        critical_warning; /* bit flags, see NVM_HEALTH_CW_* below */
+#define NVM_HEALTH_CW_SPARE            (1 << 0) /* drives the "available spare" sensor status */
+#define NVM_HEALTH_CW_TEMP             (1 << 1) /* drives the temperature sensor status */
+#define NVM_HEALTH_CW_MEDIA            (1 << 2)
+#define NVM_HEALTH_CW_READONLY         (1 << 3)
+#define NVM_HEALTH_CW_VOLATILE         (1 << 4)
+#define NVM_HEALTH_CW_PMR              (1 << 5)
+       u_int16_t       temperature; /* little-endian; the driver reads it with letoh16() */
+       u_int8_t        avail_spare; /* feeds the "available spare" sensor */
+       u_int8_t        avail_spare_threshold;
+       u_int8_t        percent_used; /* feeds the "endurance used" sensor */
+       u_int8_t        end_grp_summary;        /* 1.4+ */
+
+       u_int8_t        _reserved1[25]; /* brings the header up to 32 bytes */
+
+       u_int64_t       data_units_read[2]; /* this and the following [2] fields occupy 16 bytes each */
+       u_int64_t       data_units_written[2];
+       u_int64_t       host_read_commands[2];
+       u_int64_t       host_write_commands[2];
+       u_int64_t       busy_time[2];
+       u_int64_t       power_cycles[2];
+       u_int64_t       power_on_hours[2];
+       u_int64_t       unsafe_shutdowns[2];
+       u_int64_t       integrity_errors[2];
+       u_int64_t       error_log_entries[2];
+       u_int32_t       warn_temp_time;         /* 1.2+ */
+       u_int32_t       crit_temp_time;         /* 1.2+ */
+       u_int16_t       temp_sensors[8];        /* 1.2+ */
+       u_int32_t       therm_mgmt_count_1;     /* 1.3+ */
+       u_int32_t       therm_mgmt_count_2;     /* 1.3+ */
+       u_int32_t       therm_mgmt_time_1;      /* 1.3+ */
+       u_int32_t       therm_mgmt_time_2;      /* 1.3+ */
+       
+       u_int8_t        _reserved2[280]; /* pads the structure out to 512 bytes */
+} __packed __aligned(8);
index 1513705..7fe0b26 100644 (file)
@@ -1,4 +1,4 @@
-/*     $OpenBSD: nvmevar.h,v 1.30 2024/06/26 21:41:30 asou Exp $ */
+/*     $OpenBSD: nvmevar.h,v 1.31 2024/09/13 09:57:34 jmatthew Exp $ */
 
 /*
  * Copyright (c) 2014 David Gwynne <dlg@openbsd.org>
@@ -16,6 +16,8 @@
  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
  */
 
+#include <sys/sensors.h>
+
 #define NVME_IO_Q      1
 #define NVME_HIB_Q     2
 #define NVME_MAXPHYS   (128 * 1024)
@@ -126,6 +128,11 @@ struct nvme_softc {
        struct scsi_iopool      sc_iopool;
        struct rwlock           sc_lock;
        struct scsibus_softc    *sc_scsibus;
+
+       struct ksensordev       sc_sensordev;
+       struct ksensor          sc_temp_sensor;
+       struct ksensor          sc_spare_sensor;
+       struct ksensor          sc_usage_sensor;
 };
 
 #define DEVNAME(_sc) ((_sc)->sc_dev.dv_xname)