diff --git a/modules/monitoring.nix b/modules/monitoring.nix
index e5b13f1..7865202 100644
--- a/modules/monitoring.nix
+++ b/modules/monitoring.nix
@@ -60,6 +60,33 @@
       fi
     done
   '';
+
+  # Physical pool IOPS/bandwidth straight from the vdev layer. The telegraf
+  # `inputs.zfs` poolMetrics emits per-objset *logical* counters, which include
+  # ARC cache hits — so RAM-served reads show up as huge "pool IOPS" with zero
+  # actual disk activity (this misled a 2026-05-30 mountd investigation). This
+  # collector reports what `zpool iostat` reports: real disk ops.
+  #
+  # `-y` suppresses the since-boot report, so `iostat -y 1 1` prints exactly one
+  # live 1s sample per pool — no pool counting or `tail` needed. The operations
+  # and bandwidth columns are already per-second rates, so these are gauges (no
+  # derivative in Grafana).
+  #
+  # `-H` separates fields with single tabs, so splitting on tab only keeps pool
+  # names containing spaces intact; spaces are then escaped for line protocol.
+  # Rows with any missing or non-numeric counter are skipped rather than
+  # emitted as invalid points.
+  zpoolIostatScript = pkgs.writeShellScript "zpool-iostat" ''
+    ${pkgs.zfs}/bin/zpool iostat -Hpy 1 1 \
+      | while IFS=$'\t' read -r name alloc free rops wops rbytes wbytes; do
+          for v in "$rops" "$wops" "$rbytes" "$wbytes"; do
+            case "$v" in "" | *[!0-9]*) continue 2 ;; esac
+          done
+          tag=''${name// /\\ }
+          printf 'zpool_io,pool=%s read_ops=%si,write_ops=%si,read_bytes=%si,write_bytes=%si\n' \
+            "$tag" "$rops" "$wops" "$rbytes" "$wbytes"
+        done
+  '';
 in {
   options.skyworks.monitoring = {
     enable = lib.mkEnableOption "Telegraf monitoring to InfluxDB";
@@ -151,6 +178,14 @@
             timeout = "60s";
             data_format = "influx";
           }
+          {
+            # Physical pool IOPS (measurement `zpool_io`) — the honest version
+            # of the ARC-inclusive `zfs` poolMetrics counters.
+            commands = [ "${zpoolIostatScript}" ];
+            interval = "10s";
+            timeout = "10s";
+            data_format = "influx";
+          }
         ];
       };
     };