diff --git a/modules/monitoring.nix b/modules/monitoring.nix
index 1723107..d05f343 100644
--- a/modules/monitoring.nix
+++ b/modules/monitoring.nix
@@ -15,6 +15,69 @@
       echo "zpool_health,pool=$pool health=''${val}i"
     done
   '';
+
+  # Telegraf's inputs.smart parses SATA/NVMe attribute tables but not the
+  # SAS-specific sections of `smartctl -a` output. This script extracts the
+  # SAS predictive failure metrics that the SMART plugin misses and prints
+  # one `smart_sas` line (InfluxDB line protocol) per SAS drive.
+  sasSmartScript = pkgs.writeShellScript "sas-smart" ''
+    set -u
+
+    # Append ",<key>=<value>i" to $fields, but only when <value> is a plain
+    # unsigned integer. Anything else (missing, "<not available>", stray
+    # text) is dropped so it cannot produce invalid line protocol.
+    add_field() {
+      case "$2" in
+        "" | *[!0-9]*) return 0 ;;
+      esac
+      fields="$fields,$1=''${2}i"
+    }
+
+    # sd[a-z][a-z] also covers hosts with more than 26 disks (sdaa, ...);
+    # neither pattern matches partitions such as sda1.
+    for dev in /dev/sd[a-z] /dev/sd[a-z][a-z]; do
+      [ -b "$dev" ] || continue
+
+      # smartctl's exit status is a bitmask. Bits 0-1 mean the command line
+      # or the device open failed (a failing sudo also exits 1); the higher
+      # bits report drive health problems, and those are exactly the drives
+      # whose metrics must still be emitted.
+      out=$(${pkgs.sudo}/bin/sudo ${pkgs.smartmontools}/bin/smartctl -a "$dev" 2>/dev/null)
+      rc=$?
+      [ $(( rc & 3 )) -eq 0 ] || continue
+
+      # Only SAS drives — skip everything else
+      printf '%s\n' "$out" | ${pkgs.gnugrep}/bin/grep -q "Transport protocol:.*SAS" || continue
+
+      name=''${dev##*/}
+
+      # "Accumulated power on time, hours:minutes 12345:30" -> 12345
+      poh=$(printf '%s\n' "$out" | ${pkgs.gnugrep}/bin/grep -m1 "Accumulated power on time" | ${pkgs.gawk}/bin/awk '{print $6}' | ${pkgs.coreutils}/bin/cut -d: -f1)
+      grown_defects=$(printf '%s\n' "$out" | ${pkgs.gnugrep}/bin/grep -m1 "Elements in grown defect list" | ${pkgs.gawk}/bin/awk -F: '{gsub(/ /,"",$2); print $2}')
+      [ -z "$grown_defects" ] && grown_defects=$(printf '%s\n' "$out" | ${pkgs.gnugrep}/bin/grep -m1 "Grown defects during certification" | ${pkgs.gawk}/bin/awk -F= '{gsub(/ /,"",$2); print $2}')
+      non_medium=$(printf '%s\n' "$out" | ${pkgs.gnugrep}/bin/grep -m1 "Non-medium error count" | ${pkgs.gawk}/bin/awk -F: '{gsub(/ /,"",$2); print $2}')
+      # smartctl prints this as "Pending defect count:0 Pending Defects", so
+      # take the first word after the colon rather than a fixed column.
+      pending=$(printf '%s\n' "$out" | ${pkgs.gnugrep}/bin/grep -m1 "Pending defect count" | ${pkgs.gawk}/bin/awk -F: '{split($2, w, " "); print w[1]}')
+
+      # The last column of the "read:" / "write:" rows of the error counter
+      # log is the total number of uncorrected errors.
+      read_uncorr=$(printf '%s\n' "$out" | ${pkgs.gawk}/bin/awk '/^read:/ && !seen++ {print $NF}')
+      write_uncorr=$(printf '%s\n' "$out" | ${pkgs.gawk}/bin/awk '/^write:/ && !seen++ {print $NF}')
+
+      fields=""
+      add_field power_on_hours "$poh"
+      add_field grown_defects "$grown_defects"
+      add_field non_medium_errors "$non_medium"
+      add_field pending_defects "$pending"
+      add_field read_uncorrected "$read_uncorr"
+      add_field write_uncorrected "$write_uncorr"
+
+      if [ -n "$fields" ]; then
+        echo "smart_sas,device=$name ''${fields#,}"
+      fi
+    done
+  '';
 in {
   options.skyworks.monitoring = {
     enable = lib.mkEnableOption "Telegraf monitoring to InfluxDB";
@@ -81,12 +144,20 @@
           use_sudo = true;
           attributes = true;
         }];
-        exec = [{
-          commands = [ "${zpoolHealthScript}" ];
-          interval = "60s";
-          timeout = "10s";
-          data_format = "influx";
-        }];
+        exec = [
+          {
+            commands = [ "${zpoolHealthScript}" ];
+            interval = "60s";
+            timeout = "10s";
+            data_format = "influx";
+          }
+          {
+            commands = [ "${sasSmartScript}" ];
+            interval = "30m";
+            timeout = "60s";
+            data_format = "influx";
+          }
+        ];
       };
     };
   };