{ config, pkgs, lib, ... }:
let
cfg = config.skyworks.monitoring;
# Emits one InfluxDB line-protocol point per ZFS pool:
#   zpool_health,pool=<name> health=<code>i
# where <code> is 0 ONLINE, 1 DEGRADED, 2 FAULTED, 3 OFFLINE, 4 UNAVAIL,
# 5 REMOVED and 6 for any other state reported by `zpool list`.
zpoolHealthScript = pkgs.writeShellScript "zpool-health" ''
  # -H prints the columns tab-separated and without a header. Split on tabs
  # only: pool names may contain spaces, and the default IFS would cut such
  # a name in two and push the remainder into the health column.
  ${pkgs.zfs}/bin/zpool list -H -o name,health | while IFS=$'\t' read -r pool health; do
    case "$health" in
      ONLINE)   val=0 ;;
      DEGRADED) val=1 ;;
      FAULTED)  val=2 ;;
      OFFLINE)  val=3 ;;
      UNAVAIL)  val=4 ;;
      REMOVED)  val=5 ;;
      *)        val=6 ;;
    esac
    # Line protocol requires spaces inside tag values to be backslash-escaped.
    echo "zpool_health,pool=''${pool// /\\ } health=''${val}i"
  done
'';
# Telegraf's inputs.smart parses SATA/NVMe attribute tables but not the
# SAS-specific sections of `smartctl -a` output. This script extracts the
# SAS predictive failure metrics that the SMART plugin misses and prints one
# InfluxDB line-protocol point per SAS drive:
#   smart_sas,device=<sdX> power_on_hours=<n>i,grown_defects=<n>i,...
# Fields that cannot be parsed as a non-negative integer are left out; a
# drive with no parseable field produces no line at all.
sasSmartScript = pkgs.writeShellScript "sas-smart" ''
  set -u
  awk=${pkgs.gawk}/bin/awk
  smartctl=${pkgs.smartmontools}/bin/smartctl

  # Run an awk program (plus optional awk flags) over the captured smartctl
  # output of the current drive.
  extract() {
    printf '%s\n' "$out" | "$awk" "$@"
  }

  # Append ",<key>=<value>i" to $fields, but only when <value> is a plain
  # non-negative integer. An empty or garbled value would otherwise make the
  # whole line unparseable and lose every other field with it.
  add_field() {
    case "$2" in
      ""|*[!0-9]*) return 0 ;;
    esac
    fields="$fields,$1=$2i"
  }

  # Whole disks only (names not ending in a partition digit). Unlike
  # /dev/sd? this also matches sdaa, sdab, ... on hosts with more than 26 disks.
  for dev in /dev/sd*[!0-9]; do
    [ -b "$dev" ] || continue

    # Use NixOS security wrapper sudo (the Nix store sudo lacks setuid bit).
    # -n makes sudo fail instead of prompting if the NOPASSWD rule is missing.
    rc=0
    out=$(/run/wrappers/bin/sudo -n "$smartctl" -a "$dev" 2>/dev/null) || rc=$?
    # The smartctl exit status is a bitmask. Bits 0 and 1 mean the command
    # line was rejected or the device could not be opened. The higher bits
    # flag failing or erroring disks whose output is still valid, and those
    # are exactly the drives that must keep reporting.
    [ $(( rc & 3 )) -eq 0 ] || continue
    [ -n "$out" ] || continue

    # Only SAS drives: skip anything that does not report a SAS transport.
    printf '%s\n' "$out" | ${pkgs.gnugrep}/bin/grep -q "Transport protocol:.*SAS" || continue
    name=''${dev##*/}

    # Power on hours: try both formats.
    #   "Accumulated power on time, hours:minutes 12345:30" -> 12345
    #   "number of hours powered up = 12345.67"             -> 12345
    poh=$(extract '/Accumulated power on time/ { split($6, t, ":"); print t[1]; exit }')
    [ -z "$poh" ] && poh=$(extract -F= '/number of hours powered up/ { gsub(/ /, "", $2); split($2, t, "."); print t[1]; exit }')

    grown_defects=$(extract -F: '/Elements in grown defect list/ { gsub(/ /, "", $2); print $2; exit }')
    [ -z "$grown_defects" ] && grown_defects=$(extract -F= '/Grown defects during certification/ { gsub(/ /, "", $2); print $2; exit }')

    non_medium=$(extract -F: '/Non-medium error count/ { gsub(/ /, "", $2); print $2; exit }')

    # "Pending defect count:0 Pending Defects": take the digits right after "count:".
    pending=$(extract '/Pending defect count/ { if (match($0, /count:[0-9]+/)) print substr($0, RSTART + 6, RLENGTH - 6); exit }')

    # Total uncorrected errors: last column of the read/write rows of the
    # error counter log.
    read_uncorr=$(extract '/^read:/ { print $NF; exit }')
    write_uncorr=$(extract '/^write:/ { print $NF; exit }')

    fields=""
    add_field power_on_hours    "$poh"
    add_field grown_defects     "$grown_defects"
    add_field non_medium_errors "$non_medium"
    add_field pending_defects   "$pending"
    add_field read_uncorrected  "$read_uncorr"
    add_field write_uncorrected "$write_uncorr"

    if [ -n "$fields" ]; then
      echo "smart_sas,device=$name ''${fields#,}"
    fi
  done
'';
in {
# Options for the Telegraf -> InfluxDB monitoring module.
options.skyworks.monitoring = {
  enable = lib.mkEnableOption "Telegraf monitoring to InfluxDB";

  # No default: a host that enables the module must name its bucket.
  bucket = lib.mkOption {
    type = lib.types.str;
    example = "servers";
    description = "InfluxDB bucket name";
  };

  influxUrl = lib.mkOption {
    type = lib.types.str;
    default = "http://10.0.1.1:8086";
    example = "https://influx.example.org:8086";
    description = "InfluxDB v2 HTTP API URL";
  };

  netInterfaces = lib.mkOption {
    type = lib.types.listOf lib.types.str;
    default = [ "*" ];
    example = [ "eno1" "bond0" ];
    description = "Network interfaces passed to the `interfaces` setting of Telegraf's net input";
  };
};
config = lib.mkIf cfg.enable (
  let
    # Store path of smartctl; this is the command the sudo rule below permits.
    smartctlBin = "${pkgs.smartmontools}/bin/smartctl";

    # One inputs.exec entry that runs `script` and parses its stdout as
    # InfluxDB line protocol.
    influxExec = script: every: limit: {
      commands = [ "${script}" ];
      interval = every;
      timeout = limit;
      data_format = "influx";
    };

    # Inputs enabled with a single, empty (all-defaults) instance.
    defaultInputs = lib.genAttrs [ "mem" "swap" "system" "kernel" ] (_: [ { } ]);
  in
  {
    # agenix-managed secret, readable by the telegraf user only.
    age.secrets.influxdb-token = {
      file = ../secrets/influxdb-token.age;
      owner = "telegraf";
      group = "telegraf";
      mode = "0400";
    };

    systemd.services.telegraf = {
      # The decrypted secret is loaded as an environment file; the
      # "$INFLUX_TOKEN" reference in the output section below is expected to
      # be defined there.
      serviceConfig.EnvironmentFile = config.age.secrets.influxdb-token.path;
      # /run/wrappers puts the setuid sudo wrapper on the unit's PATH.
      path = [
        "/run/wrappers"
        pkgs.lm_sensors
        pkgs.smartmontools
        pkgs.nvme-cli
      ];
    };

    services.telegraf = {
      enable = true;
      extraConfig = {
        agent = {
          interval = "10s";
          round_interval = true;
          metric_batch_size = 1000;
          metric_buffer_limit = 10000;
          flush_interval = "10s";
          hostname = config.networking.hostName;
        };

        outputs = {
          influxdb_v2 = [
            {
              urls = [ cfg.influxUrl ];
              token = "$INFLUX_TOKEN";
              organization = "door1";
              inherit (cfg) bucket;
            }
          ];
        };

        inputs = defaultInputs // {
          cpu = [
            {
              percpu = true;
              totalcpu = true;
            }
          ];
          disk = [
            {
              ignore_fs = [
                "tmpfs"
                "devtmpfs"
                "devfs"
                "iso9660"
                "overlay"
                "aufs"
                "squashfs"
              ];
            }
          ];
          diskio = [ { devices = [ "*" ]; } ];
          net = [ { interfaces = cfg.netInterfaces; } ];
          sensors = [ { timeout = "5s"; } ];
          zfs = [ { poolMetrics = true; } ];
          smart = [
            {
              interval = "30m";
              use_sudo = true;
              attributes = true;
              # "never" rather than the default "standby": drives are queried
              # even when spun down, so a marginal drive sitting in low power
              # mode does not silently vanish from the metrics. The cost is
              # that a sleeping drive is woken on every 30m scrape, which is
              # acceptable because the ZFS pool drives should not be spinning
              # down in the first place.
              nocheck = "never";
            }
          ];
          exec = [
            (influxExec zpoolHealthScript "60s" "10s")
            (influxExec sasSmartScript "5m" "60s")
          ];
        };
      };
    };

    # Let telegraf run smartctl through sudo without a password.
    security.sudo.extraRules = lib.mkAfter [
      {
        users = [ "telegraf" ];
        commands = [
          {
            command = smartctlBin;
            options = [ "NOPASSWD" ];
          }
        ];
      }
    ];
  }
);
}