From 4549526877bce979ba7ae160c048dc8ce7cf4316 Mon Sep 17 00:00:00 2001
From: stsp
Date: Fri, 16 Sep 2022 12:08:27 +0000
Subject: [PATCH] Make mfii(4) recover from firmware FAULT state on startup.

In case firmware initially comes up in FAULT state, reset the device and
give it one more chance to attach successfully.

The Linux megaraid_sas driver applies the same workaround in this case.
There seems to be a bug in some firmware versions which can trigger this
behaviour; see mainline Linux commit 6431f5d7c6025f8b007af06ea090de308f7e6881

Problem observed by me with mfii(4) attached via KVM PCI-passthrough:

mfii0 at pci0 dev 2 function 0 "Symbios Logic MegaRAID SAS2208" rev 0x05: msi
mfii0: firmware fault

With this workaround in place, attachment succeeds and the device works:

mfii0 at pci0 dev 2 function 0 "Symbios Logic MegaRAID SAS2208" rev 0x05: msi
mfii0: firmware fault; attempting full device reset, this can take some time
mfii0: "RAID Ctrl SAS 6G 1GB (D3116C)", firmware 23.29.0-0019, 1024MB cache

Tested for regressions on bare metal by Hrvoje with two different adapters:

mfii0 at pci1 dev 0 function 0 "Symbios Logic MegaRAID SAS3508" rev 0x01: msi
mfii0: "PERC H740P Mini ", firmware 51.16.0-4076, 8192MB cache

mfii0 at pci4 dev 0 function 0 "Symbios Logic MegaRAID SAS2208" rev 0x05: msi
mfii0: "ServeRAID M5110", firmware 23.34.0-0023, 512MB cache

ok jmatthew@
---
/*
 * Reviewer notes.  This block sits after the "---" separator, outside the
 * commit message and outside every diff hunk; no hunk content is changed.
 *
 * NOTE(review): the line structure of this patch was restored from the
 * hunk headers (old/new line counts).  Tabs, column alignment and the
 * wrapping of the commit message are conventional reconstructions --
 * confirm against the tree before applying.
 *
 * sys/dev/ic/mfireg.h
 *   - Adds MFI_STATE_BOOT_MESSAGE_PENDING (0x90000000) and
 *     MFI_INIT_HOTPLUG (0x00000010).
 *
 * sys/dev/pci/mfii.c: mfii_reset_hard(sc)
 *   New function; returns 0 on success and 1 on failure.
 *   1. Writes 0 to MFI_OSTS.
 *   2. Writes MPII_WRITESEQ_FLUSH followed by MPII_WRITESEQ_1 through
 *      MPII_WRITESEQ_6 to MPII_WRITESEQ, calls delay(100), and fails with
 *      "failed to enable diagnostic read/write" when MPII_HOSTDIAG_DWRE is
 *      not set in MPII_HOSTDIAG.
 *   3. Writes MPII_HOSTDIAG_RESET_ADAPTER to MPII_HOSTDIAG, calls
 *      delay(240000), then re-reads MPII_HOSTDIAG up to 30000 times with
 *      delay(10000) between reads until the RESET_ADAPTER bit reads back
 *      clear; if it never does, fails with "failed to reset device".
 *   4. On success writes 0xff to MPII_WRITESEQ.
 *   NOTE(review): the "failed to reset device" return happens before
 *   step 4, so the 0xff write is skipped on that path -- confirm that this
 *   is intended.
 *   NOTE(review): the function mixes "return(1)" and "return(0)" with
 *   "return (1)"; the spacing is inconsistent.
 *
 * sys/dev/pci/mfii.c: mfii_transition_firmware(sc)
 *   - New local reset_on_fault, initialised to 1.  In MFI_STATE_FAULT with
 *     the flag set, the function prints the "attempting full device reset"
 *     message, calls mfii_reset_hard() (returning 1 if that fails), sets
 *     max_wait to 20 and clears the flag.  With the flag clear it prints
 *     "firmware fault" and returns 1, as the removed code always did.
 *   - max_wait for MFI_STATE_UNDEFINED and MFI_STATE_BB_INIT: 2 -> 20.
 *   - max_wait for MFI_STATE_FW_INIT, MFI_STATE_DEVICE_SCAN and
 *     MFI_STATE_FLUSH_CACHE: 20 -> 40; MFI_STATE_FW_INIT_2 joins the group.
 *   - New case MFI_STATE_BOOT_MESSAGE_PENDING: writes MFI_INIT_HOTPLUG to
 *     MFI_SKINNY_IDB and sets max_wait to 10.
 *   - The "unknown firmware state" message now formats fw_state with %#x
 *     instead of %d.
 *   - A DPRINTF reports cur_state, fw_state and the loop counter i in the
 *     new else branch of the "firmware stuck in state" check.
 */
 sys/dev/ic/mfireg.h |  4 ++-
 sys/dev/pci/mfii.c  | 80 +++++++++++++++++++++++++++++++++++++++++----
 2 files changed, 76 insertions(+), 8 deletions(-)

diff --git a/sys/dev/ic/mfireg.h b/sys/dev/ic/mfireg.h
index 00fe6cae2fa..ff06fd10a54 100644
--- a/sys/dev/ic/mfireg.h
+++ b/sys/dev/ic/mfireg.h
@@ -1,4 +1,4 @@
-/* $OpenBSD: mfireg.h,v 1.51 2022/01/09 05:42:38 jsg Exp $ */
+/* $OpenBSD: mfireg.h,v 1.52 2022/09/16 12:08:27 stsp Exp $ */
 /*
  * Copyright (c) 2006 Marco Peereboom
  *
@@ -60,6 +60,7 @@
 #define MFI_STATE_WAIT_HANDSHAKE	0x60000000
 #define MFI_STATE_FW_INIT_2		0x70000000
 #define MFI_STATE_DEVICE_SCAN		0x80000000
+#define MFI_STATE_BOOT_MESSAGE_PENDING	0x90000000
 #define MFI_STATE_FLUSH_CACHE		0xa0000000
 #define MFI_STATE_READY			0xb0000000
 #define MFI_STATE_OPERATIONAL		0xc0000000
@@ -72,6 +73,7 @@
 #define MFI_INIT_READY			0x00000002
 #define MFI_INIT_MFIMODE		0x00000004
 #define MFI_INIT_CLEAR_HANDSHAKE	0x00000008
+#define MFI_INIT_HOTPLUG		0x00000010
 #define MFI_RESET_FLAGS			MFI_INIT_READY|MFI_INIT_MFIMODE
 
 /* mfi Frame flags */
diff --git a/sys/dev/pci/mfii.c b/sys/dev/pci/mfii.c
index 7aa2c664c05..381a1d74889 100644
--- a/sys/dev/pci/mfii.c
+++ b/sys/dev/pci/mfii.c
@@ -1,4 +1,4 @@
-/* $OpenBSD: mfii.c,v 1.85 2022/04/16 19:19:59 naddy Exp $ */
+/* $OpenBSD: mfii.c,v 1.86 2022/09/16 12:08:28 stsp Exp $ */
 
 /*
  * Copyright (c) 2012 David Gwynne
@@ -407,6 +407,7 @@ void mfii_put_ccb(void *, void *);
 int			mfii_init_ccb(struct mfii_softc *);
 void			mfii_scrub_ccb(struct mfii_ccb *);
 
+int			mfii_reset_hard(struct mfii_softc *);
 int			mfii_transition_firmware(struct mfii_softc *);
 int			mfii_initialise_firmware(struct mfii_softc *);
 int			mfii_get_info(struct mfii_softc *);
@@ -1396,11 +1397,58 @@ mfii_aen_unregister(struct mfii_softc *sc)
 	/* XXX */
 }
 
+int
+mfii_reset_hard(struct mfii_softc *sc)
+{
+	u_int16_t		i;
+
+	mfii_write(sc, MFI_OSTS, 0);
+
+	/* enable diagnostic register */
+	mfii_write(sc, MPII_WRITESEQ, MPII_WRITESEQ_FLUSH);
+	mfii_write(sc, MPII_WRITESEQ, MPII_WRITESEQ_1);
+	mfii_write(sc, MPII_WRITESEQ, MPII_WRITESEQ_2);
+	mfii_write(sc, MPII_WRITESEQ, MPII_WRITESEQ_3);
+	mfii_write(sc, MPII_WRITESEQ, MPII_WRITESEQ_4);
+	mfii_write(sc, MPII_WRITESEQ, MPII_WRITESEQ_5);
+	mfii_write(sc, MPII_WRITESEQ, MPII_WRITESEQ_6);
+
+	delay(100);
+
+	if ((mfii_read(sc, MPII_HOSTDIAG) & MPII_HOSTDIAG_DWRE) == 0) {
+		printf("%s: failed to enable diagnostic read/write\n",
+		    DEVNAME(sc));
+		return(1);
+	}
+
+	/* reset ioc */
+	mfii_write(sc, MPII_HOSTDIAG, MPII_HOSTDIAG_RESET_ADAPTER);
+
+	/* 240 milliseconds */
+	delay(240000);
+
+	for (i = 0; i < 30000; i++) {
+		if ((mfii_read(sc, MPII_HOSTDIAG) &
+		    MPII_HOSTDIAG_RESET_ADAPTER) == 0)
+			break;
+		delay(10000);
+	}
+	if (i >= 30000) {
+		printf("%s: failed to reset device\n", DEVNAME(sc));
+		return (1);
+	}
+
+	/* disable diagnostic register */
+	mfii_write(sc, MPII_WRITESEQ, 0xff);
+
+	return(0);
+}
+
 int
 mfii_transition_firmware(struct mfii_softc *sc)
 {
 	int32_t			fw_state, cur_state;
-	int			max_wait, i;
+	int			max_wait, i, reset_on_fault = 1;
 
 	fw_state = mfii_fw_state(sc) & MFI_STATE_MASK;
 
@@ -1408,8 +1456,17 @@ mfii_transition_firmware(struct mfii_softc *sc)
 		cur_state = fw_state;
 		switch (fw_state) {
 		case MFI_STATE_FAULT:
-			printf("%s: firmware fault\n", DEVNAME(sc));
-			return (1);
+			if (!reset_on_fault) {
+				printf("%s: firmware fault\n", DEVNAME(sc));
+				return (1);
+			}
+			printf("%s: firmware fault; attempting full device "
+			    "reset, this can take some time\n", DEVNAME(sc));
+			if (mfii_reset_hard(sc))
+				return (1);
+			max_wait = 20;
+			reset_on_fault = 0;
+			break;
 		case MFI_STATE_WAIT_HANDSHAKE:
 			mfii_write(sc, MFI_SKINNY_IDB,
 			    MFI_INIT_CLEAR_HANDSHAKE);
@@ -1421,15 +1478,20 @@ mfii_transition_firmware(struct mfii_softc *sc)
 			break;
 		case MFI_STATE_UNDEFINED:
 		case MFI_STATE_BB_INIT:
-			max_wait = 2;
+			max_wait = 20;
 			break;
 		case MFI_STATE_FW_INIT:
+		case MFI_STATE_FW_INIT_2:
 		case MFI_STATE_DEVICE_SCAN:
 		case MFI_STATE_FLUSH_CACHE:
-			max_wait = 20;
+			max_wait = 40;
+			break;
+		case MFI_STATE_BOOT_MESSAGE_PENDING:
+			mfii_write(sc, MFI_SKINNY_IDB, MFI_INIT_HOTPLUG);
+			max_wait = 10;
 			break;
 		default:
-			printf("%s: unknown firmware state %d\n",
+			printf("%s: unknown firmware state %#x\n",
 			    DEVNAME(sc), fw_state);
 			return (1);
 		}
@@ -1444,6 +1506,10 @@ mfii_transition_firmware(struct mfii_softc *sc)
 			printf("%s: firmware stuck in state %#x\n",
 			    DEVNAME(sc), fw_state);
 			return (1);
+		} else {
+			DPRINTF("%s: firmware state change %#x -> %#x after "
+			    "%d iterations\n",
+			    DEVNAME(sc), cur_state, fw_state, i);
 		}
 	}
 
-- 
2.20.1