skyworks-Nix-infra/modules/monitoring.nix at dd38237df06ee16fae5dbabe5d619ed9be90a4e6

Fork: 0
Skyworks / skyworks-Nix-infra
Find file
Newer
Older
skyworks-Nix-infra / modules / monitoring.nix
Dixiao-L on 7 Apr 6 KB sas-smart: reduce exec interval from 30m to 5m
Raw Blame History
{ config, pkgs, lib, ... }:
let
  cfg = config.skyworks.monitoring;
  # Shell script for Telegraf's inputs.exec: prints one InfluxDB line-protocol
  # line per ZFS pool, encoding the pool health state as an integer
  # (0 = ONLINE ... 5 = REMOVED, 6 = any other state).
  zpoolHealthScript = pkgs.writeShellScript "zpool-health" ''
    # `zpool list -H` separates columns with a single tab. Split on tabs only,
    # so a pool name that contains a space is not cut in half.
    ${pkgs.zfs}/bin/zpool list -H -o name,health | while IFS=$'\t' read -r pool health; do
      case "$health" in
        ONLINE)   val=0 ;;
        DEGRADED) val=1 ;;
        FAULTED)  val=2 ;;
        OFFLINE)  val=3 ;;
        UNAVAIL)  val=4 ;;
        REMOVED)  val=5 ;;
        *)        val=6 ;;
      esac
      # Line protocol requires spaces, commas and equals signs inside a tag
      # value to be backslash-escaped; names without them pass through as-is.
      tag=$(printf '%s' "$pool" | ${pkgs.gnused}/bin/sed 's/[ ,=]/\\&/g')
      echo "zpool_health,pool=$tag health=''${val}i"
    done
  '';

  # Telegraf's inputs.smart parses SATA/NVMe attribute tables but not the
  # SAS-specific sections of `smartctl -a` output. This script extracts the
  # SAS predictive failure metrics that the SMART plugin misses.
  sasSmartScript = pkgs.writeShellScript "sas-smart" ''
    # Prints one InfluxDB line-protocol line ("smart_sas,device=<name> ...") per
    # SAS disk, parsed from the text output of `smartctl -a`. A field is only
    # emitted when its value is an unsigned integer; disks that yield no field
    # at all produce no line.
    set -u

    # Succeeds only when $1 is a non-empty string of decimal digits.
    is_uint() {
      case "''${1:-}" in
        ""|*[!0-9]*) return 1 ;;
      esac
      return 0
    }

    # add_field KEY VALUE: appends ",KEY=VALUEi" to $fields when VALUE is an
    # unsigned integer. Anything else (empty, "<notavailable>", multi-line
    # matches) is dropped, because a single malformed integer field would make
    # the whole line unparsable.
    add_field() {
      if is_uint "''${2:-}"; then
        fields="$fields,$1=''${2}i"
      fi
    }

    # sd[a-z] covers sda..sdz, sd[a-z][a-z] covers sdaa..sdzz. Partitions end
    # in a digit and therefore never match. Unmatched patterns stay literal
    # and are rejected by the block-device test below.
    for dev in /dev/sd[a-z] /dev/sd[a-z][a-z]; do
      [ -b "$dev" ] || continue

      # Use NixOS security wrapper sudo (the Nix store sudo lacks setuid bit)
      out=$(/run/wrappers/bin/sudo ${pkgs.smartmontools}/bin/smartctl -a "$dev" 2>/dev/null)
      rc=$?
      # smartctl reports its result as a bit mask. Only bit 0 (command line
      # did not parse) and bit 1 (device open failed) mean there is no usable
      # output. The higher bits flag disk health problems and come with a full
      # report, so those drives must not be skipped. A sudo failure exits with
      # status 1 and is caught by the same mask.
      [ $((rc & 3)) -eq 0 ] || continue

      # Only SAS drives — skip if not SAS
      echo "$out" | ${pkgs.gnugrep}/bin/grep -q "Transport protocol:.*SAS" || continue

      # Strip the directory part without depending on basename being on PATH.
      name="''${dev##*/}"

      # Power on hours: try both formats
      poh=$(echo "$out" | ${pkgs.gnugrep}/bin/grep "Accumulated power on time" | ${pkgs.gawk}/bin/awk '{print $6}' | ${pkgs.coreutils}/bin/cut -d: -f1)
      if [ -z "''${poh:-}" ]; then
        poh=$(echo "$out" | ${pkgs.gnugrep}/bin/grep "number of hours powered up" | ${pkgs.gawk}/bin/awk -F= '{gsub(/ /,"",$2); print $2}' | ${pkgs.coreutils}/bin/cut -d. -f1)
      fi

      grown_defects=$(echo "$out" | ${pkgs.gnugrep}/bin/grep "Elements in grown defect list" | ${pkgs.gawk}/bin/awk -F: '{gsub(/ /,"",$2); print $2}')
      if [ -z "''${grown_defects:-}" ]; then
        grown_defects=$(echo "$out" | ${pkgs.gnugrep}/bin/grep "Grown defects during certification" | ${pkgs.gawk}/bin/awk -F= '{gsub(/ /,"",$2); print $2}')
      fi

      non_medium=$(echo "$out" | ${pkgs.gnugrep}/bin/grep "Non-medium error count" | ${pkgs.gawk}/bin/awk -F: '{gsub(/ /,"",$2); print $2}')

      # "Pending defect count:0 Pending Defects" — extract the number after the
      # colon, tolerating optional whitespace in between.
      pending=$(echo "$out" | ${pkgs.gnugrep}/bin/grep "Pending defect count" | ${pkgs.gnused}/bin/sed 's/.*count:[[:space:]]*\([0-9]*\).*/\1/')

      # Uncorrected errors (read/write) — last column of the first "read:" and
      # "write:" rows of the error counter log.
      read_uncorr=$(echo "$out" | ${pkgs.gnugrep}/bin/grep -m1 "^read:" | ${pkgs.gawk}/bin/awk '{print $NF}')
      write_uncorr=$(echo "$out" | ${pkgs.gnugrep}/bin/grep -m1 "^write:" | ${pkgs.gawk}/bin/awk '{print $NF}')

      fields=""
      add_field power_on_hours "''${poh:-}"
      add_field grown_defects "''${grown_defects:-}"
      add_field non_medium_errors "''${non_medium:-}"
      add_field pending_defects "''${pending:-}"
      add_field read_uncorrected "''${read_uncorr:-}"
      add_field write_uncorrected "''${write_uncorr:-}"

      if [ -n "$fields" ]; then
        echo "smart_sas,device=$name ''${fields#,}"
      fi
    done
  '';
in {
  # Options consumed by the `config` section of this module via `cfg`.
  options.skyworks.monitoring = {
    enable = lib.mkEnableOption "Telegraf monitoring to InfluxDB";
    # No default: a host that enables monitoring has to name its bucket.
    bucket = lib.mkOption {
      type = lib.types.str;
      example = "storage-hosts";
      description = "InfluxDB bucket name";
    };
    influxUrl = lib.mkOption {
      type = lib.types.str;
      default = "http://10.0.1.1:8086";
      description = "InfluxDB v2 HTTP API URL";
    };
    netInterfaces = lib.mkOption {
      type = lib.types.listOf lib.types.str;
      default = [ "*" ];
      example = [ "eno1" "bond0" ];
      description = "Interface names or glob patterns passed to the `interfaces` setting of Telegraf's inputs.net";
    };
  };

  # Everything below only takes effect when skyworks.monitoring.enable is set.
  config = lib.mkIf cfg.enable (
    let
      # Builds one inputs.exec entry that runs `script` on its own interval and
      # parses what it prints as InfluxDB line protocol.
      execInput = script: interval: timeout: {
        commands = [ "${script}" ];
        inherit interval timeout;
        data_format = "influx";
      };
    in
    {
      # Encrypted token file, readable only by the telegraf user and group.
      age.secrets.influxdb-token = {
        file = ../secrets/influxdb-token.age;
        owner = "telegraf";
        group = "telegraf";
        mode = "0400";
      };

      systemd.services.telegraf = {
        # The decrypted secret is loaded as an environment file; presumably it
        # defines INFLUX_TOKEN, which the output section below references.
        serviceConfig.EnvironmentFile = config.age.secrets.influxdb-token.path;
        # Extra tools for the unit: the wrappers directory (where the setuid
        # sudo lives) plus the sensor and SMART utilities.
        path = [
          "/run/wrappers"
          pkgs.lm_sensors
          pkgs.smartmontools
          pkgs.nvme-cli
        ];
      };

      services.telegraf.enable = true;
      services.telegraf.extraConfig = {
        agent = {
          hostname = config.networking.hostName;
          interval = "10s";
          flush_interval = "10s";
          round_interval = true;
          metric_batch_size = 1000;
          metric_buffer_limit = 10000;
        };

        outputs.influxdb_v2 = [
          {
            urls = [ cfg.influxUrl ];
            token = "$INFLUX_TOKEN";
            organization = "door1";
            inherit (cfg) bucket;
          }
        ];

        inputs = {
          # Plugins that run with their default settings.
          mem = [ { } ];
          swap = [ { } ];
          system = [ { } ];
          kernel = [ { } ];

          cpu = [
            {
              percpu = true;
              totalcpu = true;
            }
          ];
          disk = [
            {
              ignore_fs = [
                "tmpfs"
                "devtmpfs"
                "devfs"
                "iso9660"
                "overlay"
                "aufs"
                "squashfs"
              ];
            }
          ];
          diskio = [ { devices = [ "*" ]; } ];
          net = [ { interfaces = cfg.netInterfaces; } ];
          sensors = [ { timeout = "5s"; } ];
          zfs = [ { poolMetrics = true; } ];

          smart = [
            {
              interval = "30m";
              use_sudo = true;
              attributes = true;
              # "never" rather than the default "standby": drives are queried
              # (and woken, if spun down) on every scrape, so a marginal drive
              # sitting in a low power mode does not silently vanish from the
              # metrics. The cost is a spin-up every 30m for any drive that
              # does spin down, which our ZFS pool drives should not do anyway.
              nocheck = "never";
            }
          ];

          exec = [
            (execInput zpoolHealthScript "60s" "10s")
            (execInput sasSmartScript "5m" "60s")
          ];
        };
      };

      # Passwordless sudo for smartctl, needed by inputs.smart (use_sudo) and
      # by the sas-smart script, both of which run as the telegraf user.
      security.sudo.extraRules = lib.mkAfter [
        {
          users = [ "telegraf" ];
          commands = [
            {
              command = "${pkgs.smartmontools}/bin/smartctl";
              options = [ "NOPASSWD" ];
            }
          ];
        }
      ];
    }
  );
}