skyworks-Nix-infra/modules/monitoring.nix at 91e7bec3853693ee22a71bc628dd57a8b2d72c26

Fork: 0
Skyworks / skyworks-Nix-infra
Find file
Newer
Older
skyworks-Nix-infra / modules / monitoring.nix
Dixiao-L 4 days ago 14 KB skydick/monitoring: drop InfluxDB zpool_io, add Prometheus pool-member map
Raw Blame History
# NixOS module behind the `skyworks.monitoring` options: Telegraf metrics sent
# to an InfluxDB v2 endpoint, plus an optional Prometheus node exporter fed by
# custom textfile collectors.
{ config, pkgs, lib, ... }:
let
  # Shorthand for this module's own option values.
  cfg = config.skyworks.monitoring;
  # Telegraf `exec` input script. Prints one InfluxDB line-protocol record per
  # pool, `zpool_health,pool=<name> health=<code>i`, where the code is
  # 0=ONLINE 1=DEGRADED 2=FAULTED 3=OFFLINE 4=UNAVAIL 5=REMOVED 6=anything else.
  zpoolHealthScript = pkgs.writeShellScript "zpool-health" ''
    # `zpool list -H` separates columns with a single tab, and pool names may
    # legally contain spaces, so split on tabs only.
    ${pkgs.zfs}/bin/zpool list -H -o name,health | while IFS=$'\t' read -r pool health; do
      case "$health" in
        ONLINE)   val=0 ;;
        DEGRADED) val=1 ;;
        FAULTED)  val=2 ;;
        OFFLINE)  val=3 ;;
        UNAVAIL)  val=4 ;;
        REMOVED)  val=5 ;;
        *)        val=6 ;;
      esac
      # Line protocol requires spaces inside a tag value to be backslash-escaped.
      echo "zpool_health,pool=''${pool// /\\ } health=''${val}i"
    done
  '';

  # Telegraf's inputs.smart parses SATA/NVMe attribute tables but not the
  # SAS-specific sections of `smartctl -a` output. This script extracts the
  # SAS predictive failure metrics that the SMART plugin misses and prints one
  # InfluxDB line-protocol record (`smart_sas,device=<name> ...`) per SAS drive.
  sasSmartScript = pkgs.writeShellScript "sas-smart" ''
    set -u

    # add_field NAME VALUE: append "NAME=VALUEi" to $fields. Anything that is
    # not a plain unsigned integer is dropped, because one malformed field
    # would invalidate the whole line for that drive.
    add_field() {
      case "$2" in
        "" | *[!0-9]*) return 0 ;;
      esac
      fields="$fields,$1=$2i"
    }

    # Whole disks only: names ending in a letter. Unlike /dev/sd? this also
    # covers sdaa, sdab, ... on hosts with more than 26 drives.
    for dev in /dev/sd*[!0-9]; do
      [ -b "$dev" ] || continue
      # Use NixOS security wrapper sudo (the Nix store sudo lacks setuid bit)
      out=$(/run/wrappers/bin/sudo ${pkgs.smartmontools}/bin/smartctl -a "$dev" 2>/dev/null)
      rc=$?
      # smartctl exits with a bitmask. Only bits 0-1 (bad command line, device
      # could not be opened) mean there is nothing to parse; the higher bits
      # flag failing or error-logging drives, which are exactly the ones that
      # must keep reporting.
      [ $(( rc & 3 )) -eq 0 ] || continue

      # Only SAS drives — skip if not SAS
      echo "$out" | ${pkgs.gnugrep}/bin/grep -q "Transport protocol:.*SAS" || continue

      name=''${dev##*/}

      # Power on hours: try both formats
      poh=$(echo "$out" | ${pkgs.gnugrep}/bin/grep "Accumulated power on time" | ${pkgs.gawk}/bin/awk '{print $6}' | ${pkgs.coreutils}/bin/cut -d: -f1)
      [ -z "''${poh:-}" ] && poh=$(echo "$out" | ${pkgs.gnugrep}/bin/grep "number of hours powered up" | ${pkgs.gawk}/bin/awk -F= '{gsub(/ /,"",$2); print $2}' | ${pkgs.coreutils}/bin/cut -d. -f1)

      grown_defects=$(echo "$out" | ${pkgs.gnugrep}/bin/grep "Elements in grown defect list" | ${pkgs.gawk}/bin/awk -F: '{gsub(/ /,"",$2); print $2}')
      [ -z "''${grown_defects:-}" ] && grown_defects=$(echo "$out" | ${pkgs.gnugrep}/bin/grep "Grown defects during certification" | ${pkgs.gawk}/bin/awk -F= '{gsub(/ /,"",$2); print $2}')

      non_medium=$(echo "$out" | ${pkgs.gnugrep}/bin/grep "Non-medium error count" | ${pkgs.gawk}/bin/awk -F: '{gsub(/ /,"",$2); print $2}')

      # "Pending defect count:0 Pending Defects" — extract number between : and space
      pending=$(echo "$out" | ${pkgs.gnugrep}/bin/grep "Pending defect count" | ${pkgs.gnused}/bin/sed 's/.*count:\([0-9]*\).*/\1/')

      # Total uncorrected errors: last column of the "read:" / "write:" rows
      # of the error counter log.
      read_uncorr=$(echo "$out" | ${pkgs.gnugrep}/bin/grep -m1 "^read:" | ${pkgs.gawk}/bin/awk '{print $NF}')
      write_uncorr=$(echo "$out" | ${pkgs.gnugrep}/bin/grep -m1 "^write:" | ${pkgs.gawk}/bin/awk '{print $NF}')

      fields=""
      add_field power_on_hours "''${poh:-}"
      add_field grown_defects "''${grown_defects:-}"
      add_field non_medium_errors "''${non_medium:-}"
      add_field pending_defects "''${pending:-}"
      add_field read_uncorrected "''${read_uncorr:-}"
      add_field write_uncorrected "''${write_uncorr:-}"

      # Stay silent for a drive that yielded no usable field at all.
      if [ -n "$fields" ]; then
        echo "smart_sas,device=$name ''${fields#,}"
      fi
    done
  '';
in {
  # Option declarations. `bucket` has no default, so it must be set whenever
  # the module is enabled.
  options.skyworks.monitoring = {
    enable = lib.mkEnableOption "Telegraf monitoring to InfluxDB";
    bucket = lib.mkOption {
      type = lib.types.str;
      description = "InfluxDB bucket name";
    };
    influxUrl = lib.mkOption {
      type = lib.types.str;
      default = "http://10.0.1.1:8086";
      description = "InfluxDB v2 HTTP API URL";
    };
    netInterfaces = lib.mkOption {
      type = lib.types.listOf lib.types.str;
      default = [ "*" ];
      description = "Interface names passed to Telegraf's net input as its `interfaces` setting";
    };
    nodeExporter = {
      enable = lib.mkEnableOption "prometheus-node-exporter (scraped by skyw-gw)";
      port = lib.mkOption {
        type = lib.types.port;
        default = 9100;
        description = "TCP port the node exporter listens on; the same port is opened in the firewall";
      };
    };
  };

  config = lib.mkIf cfg.enable {
    # Encrypted InfluxDB token; the decrypted file is owned by and readable
    # only to the telegraf user.
    age.secrets."influxdb-token" = {
      file = ../secrets/influxdb-token.age;
      mode = "0400";
      owner = "telegraf";
      group = "telegraf";
    };

    systemd.services.telegraf = {
      # The decrypted secret is loaded as the unit's environment file.
      serviceConfig.EnvironmentFile = config.age.secrets."influxdb-token".path;

      # Extra PATH entries for the telegraf unit.
      path = [
        "/run/wrappers"
        pkgs.lm_sensors
        pkgs.smartmontools
        pkgs.nvme-cli
      ];
    };

    services.telegraf = {
      enable = true;

      # Agent-wide settings: 10s collection and flush cadence, host name taken
      # from the system configuration.
      extraConfig.agent = {
        hostname = config.networking.hostName;
        interval = "10s";
        flush_interval = "10s";
        round_interval = true;
        metric_batch_size = 1000;
        metric_buffer_limit = 10000;
      };

      # Single output: the InfluxDB v2 endpoint. "$INFLUX_TOKEN" is written
      # literally into the config for Telegraf to resolve from its environment.
      extraConfig.outputs.influxdb_v2 = [
        {
          inherit (cfg) bucket;
          urls = [ cfg.influxUrl ];
          organization = "door1";
          token = "$INFLUX_TOKEN";
        }
      ];

      extraConfig.inputs =
        let
          # Plugin instance with no settings of its own.
          defaults = [ { } ];

          # One `exec` input running `script` and reading Influx line protocol.
          execInput = script: interval: timeout: {
            inherit interval timeout;
            commands = [ "${script}" ];
            data_format = "influx";
          };
        in
        {
          cpu = [
            {
              percpu = true;
              totalcpu = true;
            }
          ];
          mem = defaults;
          swap = defaults;
          system = defaults;
          kernel = defaults;

          disk = [
            {
              ignore_fs = [
                "tmpfs"
                "devtmpfs"
                "devfs"
                "iso9660"
                "overlay"
                "aufs"
                "squashfs"
              ];
            }
          ];
          diskio = [ { devices = [ "*" ]; } ];
          net = [ { interfaces = cfg.netInterfaces; } ];
          sensors = [ { timeout = "5s"; } ];
          zfs = [ { poolMetrics = true; } ];

          smart = [
            {
              interval = "30m";
              use_sudo = true;
              attributes = true;
              # Deliberately "never" rather than the default "standby": drives
              # are woken on every scrape so that a marginal drive sitting in
              # low power mode cannot silently vanish from the metrics. The
              # cost is a spin-up every 30m, acceptable because the ZFS pool
              # drives are not expected to spin down in the first place.
              nocheck = "never";
            }
          ];

          # Custom scripts defined at the top of this file.
          exec = [
            (execInput zpoolHealthScript "60s" "10s")
            (execInput sasSmartScript "5m" "60s")
          ];
        };
    };

    # Passwordless sudo for the telegraf user, limited to smartctl; appended
    # after any other sudo rules.
    security.sudo.extraRules = lib.mkAfter [
      {
        users = [ "telegraf" ];
        commands = [
          {
            command = "${pkgs.smartmontools}/bin/smartctl";
            options = [ "NOPASSWD" ];
          }
        ];
      }
    ];

    # Optional Prometheus node exporter; the textfile collector is pointed at
    # the directory created by the tmpfiles rule below.
    services.prometheus.exporters.node = lib.mkIf cfg.nodeExporter.enable {
      inherit (cfg.nodeExporter) port;
      enable = true;
      enabledCollectors = [
        "systemd"
        "zfs"
        "processes"
        "logind"
      ];
      extraFlags = [ "--collector.textfile.directory=/var/lib/node_exporter/textfile" ];
    };

    # Root-owned, world-readable directory for the textfile metrics.
    systemd.tmpfiles.rules = lib.mkIf cfg.nodeExporter.enable [
      "d /var/lib/node_exporter/textfile 0755 root root -"
    ];

    # Custom textfile collectors: SAS SMART + ZFS pool health + pool members.
    # node-exporter only parses SATA/NVMe SMART; SAS-specific predictive
    # failure metrics (grown defects, non-medium errors, pending defects,
    # ECC error counts) are not included by default and must be parsed
    # manually from `smartctl -a` SAS output. ZFS pool health enum is
    # also not exposed by the zfs collector (it gives ARC + pool I/O only).
    # Every file is written under a temporary name and renamed into place, so
    # a scrape never sees a half-written file.
    systemd.services.skyw-textfile-collectors = lib.mkIf cfg.nodeExporter.enable {
      description = "Write SAS SMART + zpool health textfile metrics";
      serviceConfig = {
        Type = "oneshot";
        ExecStart = pkgs.writeShellScript "skyw-textfile-collectors" ''
          set -u
          OUT_DIR=/var/lib/node_exporter/textfile

          # Clean up temp files left behind when the run is interrupted or a
          # rename fails; after a successful rename the path no longer exists.
          TMP_SAS="" TMP_ZFS="" TMP_MEM=""
          trap 'rm -f "$TMP_SAS" "$TMP_ZFS" "$TMP_MEM"' EXIT
          TMP_SAS=$(mktemp "$OUT_DIR/sas_smart.prom.XXXXXX") || exit 1
          TMP_ZFS=$(mktemp "$OUT_DIR/zpool_health.prom.XXXXXX") || exit 1
          TMP_MEM=$(mktemp "$OUT_DIR/zpool_members.prom.XXXXXX") || exit 1

          # emit METRIC DEVICE VALUE: print one sample, but only when VALUE is
          # a plain unsigned integer, so a parsing surprise cannot make the
          # whole metrics file unparseable.
          emit() {
            case "$3" in
              "" | *[!0-9]*) return 0 ;;
            esac
            printf '%s{device="%s"} %s\n' "$1" "$2" "$3"
          }

          # Filter: escape backslashes and double quotes for a label value.
          esc() { ${pkgs.gnused}/bin/sed 's/\\/\\\\/g; s/"/\\"/g'; }

          # SAS SMART
          {
            echo "# HELP smart_sas_power_on_hours Drive power-on hours"
            echo "# TYPE smart_sas_power_on_hours counter"
            echo "# HELP smart_sas_grown_defects Reallocated/grown defects"
            echo "# TYPE smart_sas_grown_defects gauge"
            echo "# HELP smart_sas_non_medium_errors SAS non-medium error count"
            echo "# TYPE smart_sas_non_medium_errors counter"
            echo "# HELP smart_sas_pending_defects Pending defects (predictive)"
            echo "# TYPE smart_sas_pending_defects gauge"
            echo "# HELP smart_sas_read_uncorrected Uncorrected read errors"
            echo "# TYPE smart_sas_read_uncorrected counter"
            echo "# HELP smart_sas_write_uncorrected Uncorrected write errors"
            echo "# TYPE smart_sas_write_uncorrected counter"
            echo "# HELP smart_sas_info Drive identification (vendor/product/revision/serial as labels)"
            echo "# TYPE smart_sas_info gauge"
            # Whole disks only: names ending in a letter. Unlike /dev/sd? this
            # also covers sdaa, sdab, ... on hosts with more than 26 drives.
            for dev in /dev/sd*[!0-9]; do
              [ -b "$dev" ] || continue
              out=$(/run/wrappers/bin/sudo ${pkgs.smartmontools}/bin/smartctl -a "$dev" 2>/dev/null)
              rc=$?
              # smartctl exits with a bitmask. Only bits 0-1 (bad command
              # line, device could not be opened) mean there is nothing to
              # parse; the higher bits flag failing or error-logging drives,
              # which are exactly the ones that must keep reporting.
              [ $(( rc & 3 )) -eq 0 ] || continue
              echo "$out" | ${pkgs.gnugrep}/bin/grep -q "Transport protocol:.*SAS" || continue
              name=''${dev##*/}
              poh=$(echo "$out" | ${pkgs.gnugrep}/bin/grep "Accumulated power on time" | ${pkgs.gawk}/bin/awk '{print $6}' | ${pkgs.coreutils}/bin/cut -d: -f1)
              [ -z "''${poh:-}" ] && poh=$(echo "$out" | ${pkgs.gnugrep}/bin/grep "number of hours powered up" | ${pkgs.gawk}/bin/awk -F= '{gsub(/ /,"",$2); print $2}' | ${pkgs.coreutils}/bin/cut -d. -f1)
              gd=$(echo "$out" | ${pkgs.gnugrep}/bin/grep "Elements in grown defect list" | ${pkgs.gawk}/bin/awk -F: '{gsub(/ /,"",$2); print $2}')
              [ -z "''${gd:-}" ] && gd=$(echo "$out" | ${pkgs.gnugrep}/bin/grep "Grown defects during certification" | ${pkgs.gawk}/bin/awk -F= '{gsub(/ /,"",$2); print $2}')
              nm=$(echo "$out" | ${pkgs.gnugrep}/bin/grep "Non-medium error count" | ${pkgs.gawk}/bin/awk -F: '{gsub(/ /,"",$2); print $2}')
              pd=$(echo "$out" | ${pkgs.gnugrep}/bin/grep "Pending defect count" | ${pkgs.gnused}/bin/sed 's/.*count:\([0-9]*\).*/\1/')
              # Total uncorrected errors: last column of the read:/write: rows.
              ru=$(echo "$out" | ${pkgs.gnugrep}/bin/grep -m1 "^read:" | ${pkgs.gawk}/bin/awk '{print $NF}')
              wu=$(echo "$out" | ${pkgs.gnugrep}/bin/grep -m1 "^write:" | ${pkgs.gawk}/bin/awk '{print $NF}')
              # Identification strings become label values: first match only, escaped.
              vendor=$(echo "$out" | ${pkgs.gnugrep}/bin/grep -m1 "^Vendor:" | ${pkgs.gnused}/bin/sed 's/^Vendor:[[:space:]]*//;s/[[:space:]]*$//' | esc)
              product=$(echo "$out" | ${pkgs.gnugrep}/bin/grep -m1 "^Product:" | ${pkgs.gnused}/bin/sed 's/^Product:[[:space:]]*//;s/[[:space:]]*$//' | esc)
              rev=$(echo "$out" | ${pkgs.gnugrep}/bin/grep -m1 "^Revision:" | ${pkgs.gnused}/bin/sed 's/^Revision:[[:space:]]*//;s/[[:space:]]*$//' | esc)
              serial=$(echo "$out" | ${pkgs.gnugrep}/bin/grep -m1 "^Serial number:" | ${pkgs.gnused}/bin/sed 's/^Serial number:[[:space:]]*//;s/[[:space:]]*$//' | esc)
              emit smart_sas_power_on_hours "$name" "''${poh:-}"
              emit smart_sas_grown_defects "$name" "''${gd:-}"
              emit smart_sas_non_medium_errors "$name" "''${nm:-}"
              emit smart_sas_pending_defects "$name" "''${pd:-}"
              emit smart_sas_read_uncorrected "$name" "''${ru:-}"
              emit smart_sas_write_uncorrected "$name" "''${wu:-}"
              echo "smart_sas_info{device=\"$name\",vendor=\"''${vendor:-?}\",product=\"''${product:-?}\",revision=\"''${rev:-?}\",serial=\"''${serial:-?}\"} 1"
            done
          } > "$TMP_SAS"
          chmod 0644 "$TMP_SAS"
          mv -f "$TMP_SAS" "$OUT_DIR/sas_smart.prom"

          # ZFS pool health
          {
            echo "# HELP zpool_health Pool health (0=ONLINE 1=DEGRADED 2=FAULTED 3=OFFLINE 4=UNAVAIL 5=REMOVED 6=other)"
            echo "# TYPE zpool_health gauge"
            # -H output is tab-separated and pool names may contain spaces,
            # so split on tabs only.
            ${pkgs.zfs}/bin/zpool list -H -o name,health 2>/dev/null | while IFS=$'\t' read -r pool health; do
              case "$health" in
                ONLINE)   v=0 ;;
                DEGRADED) v=1 ;;
                FAULTED)  v=2 ;;
                OFFLINE)  v=3 ;;
                UNAVAIL)  v=4 ;;
                REMOVED)  v=5 ;;
                *)        v=6 ;;
              esac
              echo "zpool_health{pool=\"$pool\",state=\"$health\"} $v"
            done
          } > "$TMP_ZFS"
          chmod 0644 "$TMP_ZFS"
          mv -f "$TMP_ZFS" "$OUT_DIR/zpool_health.prom"

          # ZFS pool -> block-device mapping. Lets Grafana compute *physical*
          # per-pool IOPS by joining node_disk_*_completed_total on `device`
          # (node_zfs_zpool_dataset_* is LOGICAL — it counts ARC cache hits, so
          # RAM-served reads look like huge phantom pool IOPS). Re-read from live
          # `zpool status` each run, so kernel device renames across reboots
          # can't break the dashboard. Partition suffix is stripped to match
          # node-exporter's whole-disk `device` label (sdb1->sdb, nvme0n1p1->nvme0n1).
          {
            echo "# HELP skyw_zpool_member 1 if the block device is a member of the pool"
            echo "# TYPE skyw_zpool_member gauge"
            ${pkgs.zfs}/bin/zpool status -LP 2>/dev/null | ${pkgs.gawk}/bin/awk '
              /^[[:space:]]*pool:/ {
                # Keep everything after "pool:" since names may contain spaces.
                pool = $0; sub(/^[[:space:]]*pool:[[:space:]]*/, "", pool); next
              }
              /\/dev\// {
                for (i = 1; i <= NF; i++) if ($i ~ /^\/dev\//) {
                  dev = $i; sub(/^\/dev\//, "", dev)
                  # Strip a partition suffix only for naming schemes that have
                  # one. Names such as dm-0 or md0 end in a digit yet are whole
                  # devices and must stay intact to match the device label.
                  if (dev ~ /^(nvme[0-9]+n[0-9]+|mmcblk[0-9]+)p[0-9]+$/) sub(/p[0-9]+$/, "", dev)
                  else if (dev ~ /^(sd|hd|vd|xvd)[a-z]+[0-9]+$/) sub(/[0-9]+$/, "", dev)
                  line = "skyw_zpool_member{pool=\"" pool "\",device=\"" dev "\"} 1"
                  if (!(line in seen)) { seen[line] = 1; print line }
                }
              }'
          } > "$TMP_MEM"
          chmod 0644 "$TMP_MEM"
          mv -f "$TMP_MEM" "$OUT_DIR/zpool_members.prom"
        '';
      };
    };

    # Schedule for the textfile collector unit of the same name: first run 30s
    # after boot, then 5 minutes after each activation.
    systemd.timers.skyw-textfile-collectors = lib.mkIf cfg.nodeExporter.enable {
      timerConfig.OnBootSec = "30s";
      timerConfig.OnUnitActiveSec = "5min";
      wantedBy = [ "timers.target" ];
    };

    # Open the node exporter's port, but only when the exporter is enabled.
    networking.firewall.allowedTCPPorts =
      lib.mkIf cfg.nodeExporter.enable [ cfg.nodeExporter.port ];
  };
}