# skyworks-Nix-infra/modules/monitoring.nix
{ config, pkgs, lib, ... }:
let
  cfg = config.skyworks.monitoring;
  # Helper for Telegraf's exec input: prints one InfluxDB line-protocol point
  # per ZFS pool, with the textual health state mapped to an integer field:
  #   ONLINE=0 DEGRADED=1 FAULTED=2 OFFLINE=3 UNAVAIL=4 REMOVED=5, anything else=6.
  zpoolHealthScript = pkgs.writeShellScript "zpool-health" ''
    # With -H, zpool separates columns with a single tab. Pool names may
    # contain spaces, so split on tabs only instead of on any whitespace.
    ${pkgs.zfs}/bin/zpool list -H -o name,health | while IFS=$'\t' read -r pool health; do
      case "$health" in
        ONLINE)   val=0 ;;
        DEGRADED) val=1 ;;
        FAULTED)  val=2 ;;
        OFFLINE)  val=3 ;;
        UNAVAIL)  val=4 ;;
        REMOVED)  val=5 ;;
        *)        val=6 ;;
      esac
      # Line protocol requires spaces inside a tag value to be backslash-escaped.
      pool=''${pool// /\\ }
      echo "zpool_health,pool=$pool health=''${val}i"
    done
  '';
in {
  # Options exposed by this module under skyworks.monitoring.
  options.skyworks.monitoring = {
    enable = lib.mkEnableOption "Telegraf monitoring to door1 InfluxDB";

    # No default is declared for the bucket; it is used as the influxdb_v2
    # output's bucket in the config section.
    bucket = lib.mkOption {
      type = lib.types.str;
      description = "InfluxDB bucket name";
    };

    # Passed through unchanged to the interfaces setting of Telegraf's net input.
    netInterfaces = lib.mkOption {
      type = lib.types.listOf lib.types.str;
      default = [ "*" ];
      description = "Network interface names handed to the Telegraf net input";
    };
  };

  # Everything below only takes effect when skyworks.monitoring.enable is set.
  config = lib.mkIf cfg.enable {
    # agenix-managed secret, readable only by the telegraf user/group.
    # NOTE(review): the file is loaded as a systemd EnvironmentFile and the
    # output below references $INFLUX_TOKEN, so the decrypted content is
    # presumably a line of the form INFLUX_TOKEN=... -- confirm.
    age.secrets.influxdb-token = {
      file = ../secrets/influxdb-token.age;
      owner = "telegraf";
      group = "telegraf";
      mode = "0400";
    };

    # Expose the decrypted secret to the Telegraf unit as environment variables.
    systemd.services.telegraf.serviceConfig.EnvironmentFile =
      config.age.secrets.influxdb-token.path;

    # Extra entries for the unit's PATH, covering the tools the inputs shell
    # out to. "/run/wrappers" makes the setuid sudo wrapper resolvable: the
    # smart input below sets use_sudo = true and runs sudo by name, and NixOS
    # systemd units do not have the wrapper directory on PATH by default.
    systemd.services.telegraf.path = [
      pkgs.lm_sensors
      pkgs.smartmontools
      pkgs.nvme-cli
      "/run/wrappers"
    ];

    services.telegraf = {
      enable = true;
      extraConfig = {
        # Collect and flush every 10 seconds; metrics are tagged with this
        # machine's NixOS host name.
        agent = {
          interval = "10s";
          round_interval = true;
          metric_batch_size = 1000;
          metric_buffer_limit = 10000;
          flush_interval = "10s";
          hostname = config.networking.hostName;
        };

        # Single InfluxDB v2 output; the token comes from the environment file
        # configured above, the bucket from this module's option.
        outputs.influxdb_v2 = [{
          urls = [ "http://10.0.91.30:8086" ];
          token = "$INFLUX_TOKEN";
          organization = "door1";
          bucket = cfg.bucket;
        }];

        inputs = {
          cpu = [{ percpu = true; totalcpu = true; }];
          mem = [{}];
          swap = [{}];
          system = [{}];
          kernel = [{}];
          # Skip the listed pseudo / read-only filesystem types.
          disk = [{ ignore_fs = [ "tmpfs" "devtmpfs" "devfs" "iso9660" "overlay" "aufs" "squashfs" ]; }];
          diskio = [{ devices = [ "*" ]; }];
          net = [{ interfaces = cfg.netInterfaces; }];
          sensors = [{ timeout = "5s"; }];
          zfs = [{ poolMetrics = true; }];
          # SMART data is polled on its own 6h interval instead of the agent's
          # 10s one. use_sudo relies on both the sudoers rule at the end of
          # this block and sudo being on the unit's PATH (see above).
          smart = [{
            interval = "6h";
            use_sudo = true;
            attributes = true;
          }];
          # Runs the zpool health helper once a minute and parses its output
          # as InfluxDB line protocol.
          exec = [{
            commands = [ "${zpoolHealthScript}" ];
            interval = "60s";
            timeout = "10s";
            data_format = "influx";
          }];
        };
      };
    };

    # Allow the telegraf user to run smartctl via sudo without a password.
    # mkAfter orders this rule after other definitions of extraRules.
    security.sudo.extraRules = lib.mkAfter [{
      users = [ "telegraf" ];
      commands = [
        { command = "${pkgs.smartmontools}/bin/smartctl"; options = [ "NOPASSWD" ]; }
      ];
    }];
  };
}