diff --git a/modules/monitoring.nix b/modules/monitoring.nix
index 0a9f12f..e9212fc 100644
--- a/modules/monitoring.nix
+++ b/modules/monitoring.nix
@@ -176,6 +176,120 @@
       "d /var/lib/node_exporter/textfile 0755 root root -"
     ];
 
+    # Custom textfile collectors: SAS SMART + ZFS pool health.
+    # node-exporter only parses SATA/NVMe SMART; SAS-specific predictive
+    # failure metrics (grown defects, non-medium errors, pending defects,
+    # ECC error counts) are not included by default and must be parsed
+    # manually from `smartctl -a` SAS output. ZFS pool health enum is
+    # also not exposed by the zfs collector (it gives ARC + pool I/O only).
+    #
+    # The unit sets no User=, so the script runs as root and calls smartctl
+    # directly.
+    systemd.services.skyw-textfile-collectors = lib.mkIf cfg.nodeExporter.enable {
+      description = "Write SAS SMART + zpool health textfile metrics";
+      serviceConfig = {
+        Type = "oneshot";
+        ExecStart = pkgs.writeShellScript "skyw-textfile-collectors" ''
+          set -u
+          OUT_DIR=/var/lib/node_exporter/textfile
+
+          # Staging files are created beside their final names so the mv below
+          # is a same-directory rename; the trap removes any that are left over.
+          TMP_SAS=
+          TMP_ZFS=
+          trap 'rm -f -- "$TMP_SAS" "$TMP_ZFS"' EXIT
+          TMP_SAS=$(mktemp "$OUT_DIR/sas_smart.prom.XXXXXX") || exit 1
+          TMP_ZFS=$(mktemp "$OUT_DIR/zpool_health.prom.XXXXXX") || exit 1
+
+          # emit METRIC DEVICE VALUE: print one sample, but only when VALUE is
+          # a plain unsigned integer. An empty or multi-line parse result is
+          # dropped rather than written out as a malformed sample line.
+          emit() {
+            case "$3" in
+              "" | *[!0-9]*) return 0 ;;
+            esac
+            echo "$1{device=\"$2\"} $3"
+          }
+
+          # SAS SMART
+          {
+            echo "# HELP smart_sas_power_on_hours Drive power-on hours"
+            echo "# TYPE smart_sas_power_on_hours counter"
+            echo "# HELP smart_sas_grown_defects Reallocated/grown defects"
+            echo "# TYPE smart_sas_grown_defects gauge"
+            echo "# HELP smart_sas_non_medium_errors SAS non-medium error count"
+            echo "# TYPE smart_sas_non_medium_errors counter"
+            echo "# HELP smart_sas_pending_defects Pending defects (predictive)"
+            echo "# TYPE smart_sas_pending_defects gauge"
+            echo "# HELP smart_sas_read_uncorrected Uncorrected read errors"
+            echo "# TYPE smart_sas_read_uncorrected counter"
+            echo "# HELP smart_sas_write_uncorrected Uncorrected write errors"
+            echo "# TYPE smart_sas_write_uncorrected counter"
+            for dev in /dev/sd?; do
+              [ -b "$dev" ] || continue
+              # smartctl's exit status is a bit mask. Bits 0-1 (bad command line,
+              # device open failed) mean there is no report; the higher bits flag
+              # disk problems and still come with output, so only bits 0-1 skip
+              # the drive.
+              rc=0
+              out=$(${pkgs.smartmontools}/bin/smartctl -a "$dev" 2>/dev/null) || rc=$?
+              [ $((rc & 3)) -eq 0 ] || continue
+              echo "$out" | ${pkgs.gnugrep}/bin/grep -q "Transport protocol:.*SAS" || continue
+              name=$(basename "$dev")
+              # Each field has a primary pattern; poh and gd fall back to a second
+              # pattern when the first one matches nothing.
+              poh=$(echo "$out" | ${pkgs.gnugrep}/bin/grep "Accumulated power on time" | ${pkgs.gawk}/bin/awk '{print $6}' | ${pkgs.coreutils}/bin/cut -d: -f1)
+              [ -n "$poh" ] || poh=$(echo "$out" | ${pkgs.gnugrep}/bin/grep "number of hours powered up" | ${pkgs.gawk}/bin/awk -F= '{gsub(/ /,"",$2); print $2}' | ${pkgs.coreutils}/bin/cut -d. -f1)
+              gd=$(echo "$out" | ${pkgs.gnugrep}/bin/grep "Elements in grown defect list" | ${pkgs.gawk}/bin/awk -F: '{gsub(/ /,"",$2); print $2}')
+              [ -n "$gd" ] || gd=$(echo "$out" | ${pkgs.gnugrep}/bin/grep "Grown defects during certification" | ${pkgs.gawk}/bin/awk -F= '{gsub(/ /,"",$2); print $2}')
+              nm=$(echo "$out" | ${pkgs.gnugrep}/bin/grep "Non-medium error count" | ${pkgs.gawk}/bin/awk -F: '{gsub(/ /,"",$2); print $2}')
+              pd=$(echo "$out" | ${pkgs.gnugrep}/bin/grep "Pending defect count" | ${pkgs.gnused}/bin/sed 's/.*count:\([0-9]*\).*/\1/')
+              # ru/wu: last field of the "read:" and "write:" rows.
+              ru=$(echo "$out" | ${pkgs.gnugrep}/bin/grep -A1 "^read:" | ${pkgs.coreutils}/bin/head -1 | ${pkgs.gawk}/bin/awk '{print $NF}')
+              wu=$(echo "$out" | ${pkgs.gnugrep}/bin/grep -A1 "^write:" | ${pkgs.coreutils}/bin/head -1 | ${pkgs.gawk}/bin/awk '{print $NF}')
+              emit smart_sas_power_on_hours "$name" "$poh"
+              emit smart_sas_grown_defects "$name" "$gd"
+              emit smart_sas_non_medium_errors "$name" "$nm"
+              emit smart_sas_pending_defects "$name" "$pd"
+              emit smart_sas_read_uncorrected "$name" "$ru"
+              emit smart_sas_write_uncorrected "$name" "$wu"
+            done
+          } > "$TMP_SAS"
+          chmod 0644 "$TMP_SAS"
+          mv -f "$TMP_SAS" "$OUT_DIR/sas_smart.prom"
+
+          # ZFS pool health
+          {
+            echo "# HELP zpool_health Pool health (0=ONLINE 1=DEGRADED 2=FAULTED 3=OFFLINE 4=UNAVAIL 5=REMOVED 6=other)"
+            echo "# TYPE zpool_health gauge"
+            ${pkgs.zfs}/bin/zpool list -H -o name,health 2>/dev/null | while read -r pool health; do
+              case "$health" in
+                ONLINE) v=0 ;;
+                DEGRADED) v=1 ;;
+                FAULTED) v=2 ;;
+                OFFLINE) v=3 ;;
+                UNAVAIL) v=4 ;;
+                REMOVED) v=5 ;;
+                *) v=6 ;;
+              esac
+              echo "zpool_health{pool=\"$pool\",state=\"$health\"} $v"
+            done
+          } > "$TMP_ZFS"
+          chmod 0644 "$TMP_ZFS"
+          mv -f "$TMP_ZFS" "$OUT_DIR/zpool_health.prom"
+        '';
+      };
+    };
+
+    # First run 30s after boot, then 5min after each activation of the service.
+    systemd.timers.skyw-textfile-collectors = lib.mkIf cfg.nodeExporter.enable {
+      wantedBy = [ "timers.target" ];
+      timerConfig = {
+        OnBootSec = "30s";
+        OnUnitActiveSec = "5min";
+      };
+    };
+
     networking.firewall.allowedTCPPorts = lib.mkIf cfg.nodeExporter.enable [ cfg.nodeExporter.port ];
   };
 