# Alerting rule: warn when a shop is nearly out of ice cream on a hot day.
- alert: IceCreamStockLow
  # Left side selects outside temperatures above 25; `and on (shop)` keeps
  # only those shops whose ice-cream stock level is also below 5.
  expr: temperature_c{area="outside"} > 25 and on (shop) stock_level{item="ice-cream"} < 5
  # The condition must hold for 5 minutes before the alert fires.
  for: 5m
  labels:
    severity: major
  annotations:
    # `>-` folds the lines into a single line and strips the trailing
    # newline, so the rendered text has no stray "\n" at the end.
    description: >-
      Low on ice cream at the {{.Labels.shop}} shop and it is
      {{.Value | printf "%.1f"}}C {{.Labels.area}}
# Unit tests for the IceCreamStockLow alert defined in ice-cream.rules.yml.
evaluation_interval: 1m

rule_files:
  - ice-cream.rules.yml

tests:
  # Fires for London only: Birmingham is low on stock but never above 25C.
  - interval: 1m
    input_series:
      - series: stock_level{item="ice-cream",shop="Birmingham"}
        values: 2 2 2 2 2 2
      - series: stock_level{item="ice-cream",shop="London"}
        values: 4 4 4 4 4 4
      - series: temperature_c{area="outside",shop="Birmingham"}
        values: 23.98 23.98 23.98 23.98 23.98 23.98
      - series: temperature_c{area="outside",shop="London"}
        # `a+bxn` expands to n+1 samples; the last one (33.75) is the value
        # seen at eval_time 5m and is rendered as 33.8 by printf "%.1f".
        values: 28.75+1x5  # 28.75 29.75 30.75 31.75 32.75 33.75
    alert_rule_test:
      - alertname: IceCreamStockLow
        eval_time: 5m
        exp_alerts:
          - exp_labels:
              area: outside
              severity: major
              shop: London
            exp_annotations:
              description: Low on ice cream at the London shop and it is 33.8C outside
# Doesn't fire if stock levels are high enough:
# it is hot (25.1 > 25) but the stock level of 5 is not below 5.
- interval: 1m
  input_series:
    # Six samples of 5, one per minute.
    - series: stock_level{item="ice-cream",shop="London"}
      values: 5+0x5
    # Six samples of 25.1, one per minute.
    - series: temperature_c{area="outside",shop="London"}
      values: 25.1+0x5
  alert_rule_test:
    - alertname: IceCreamStockLow
      eval_time: 5m
      exp_alerts: []
# Doesn't fire for other products:
# stock is low and it is hot, but the item is hot-chocolate, not ice-cream.
- interval: 1m
  input_series:
    - series: stock_level{item="hot-chocolate",shop="London"}
      # x5 (not x4) so that there are six samples, covering eval_time 5m
      # like the temperature series below.
      values: 4+0x5  # 4 4 4 4 4 4
    - series: temperature_c{area="outside",shop="London"}
      values: 25.1+0x5  # 25.1 25.1 25.1 25.1 25.1 25.1
  alert_rule_test:
    - alertname: IceCreamStockLow
      eval_time: 5m
      exp_alerts: []
# Doesn't fire if temperature reduces within 5m:
# the final sample drops to 25, which is not above 25, so the condition
# no longer holds at eval_time 5m.
- interval: 1m
  input_series:
    - series: stock_level{item="ice-cream",shop="London"}
      values: 4+0x5  # 4 4 4 4 4 4
    - series: temperature_c{area="outside",shop="London"}
      values: 25.1+0x4 25  # 25.1 25.1 25.1 25.1 25.1 25
  alert_rule_test:
    - alertname: IceCreamStockLow
      eval_time: 5m
      exp_alerts: []
```yaml
evaluation_interval: 1m
rule_files:
  - disk-io.rules.yml
tests:
  - interval: 1m
    input_series:
      - series: node_disk_reads_completed_total{device="sda"}
        values: 0+74040x2  # 1234/sec for a minute == 74040/min
    alert_rule_test:
      - alertname: HighDiskIO
        eval_time: 2m
        exp_alerts:
          - exp_labels:
              severity: minor
              device: sda
            exp_annotations:
              description: 'High I/O on sda: 1234 reads/sec'
```

--

### Sample interval `1s`

Per-second rates can be easier to work with

```yaml
  - interval: 1s
    input_series:
      - series: node_disk_reads_completed_total{device="sda"}
        # rising at 1001/sec for almost 2 mins, then at 10/sec for last second
        values: 0+1001x119 119129  # +10 to final sample in seq (119119)
      - series: node_disk_reads_completed_total{device="sdb"}
        # rising at 1005/sec for 4 mins
        values: 0+1005x240
      - series: node_disk_reads_completed_total{device="sdc"}
        # rising at 1000/sec for almost 4m, then 1200/sec for last second
        values: 0+1000x239 240200  # +1200 to final value in seq (239000)
    alert_rule_test:
      - alertname: HighDiskIO
        eval_time: 2m
        exp_alerts:
          - exp_labels:
              severity: minor
              device: sdb
            exp_annotations:
              description: 'High I/O on sdb: 1005 reads/sec'
      - alertname: HighDiskIO
        eval_time: 4m
        exp_alerts:
          - exp_labels:
              severity: minor
              device: sdb
            exp_annotations:
              description: 'High I/O on sdb: 1005 reads/sec'
          - exp_labels:
              severity: minor
              device: sdc
            exp_annotations:
              # Actual value without rounding is 1001.6666666666666
              description: 'High I/O on sdc: 1002 reads/sec'
```

---

## Testing recording rules

Consider metrics whose values are timestamps

```
job_completed_at{job="job-one"} 1514321353 # Tue 26 Dec 2017 20:49:13 GMT
job_completed_at{job="job-two"} 1554321353 # Wed 3 Apr 2019 20:55:53 BST
```

Record a new time series containing only those jobs that completed in the past two hours

```yaml
rules:
  - record: job_completed_recently
    expr: job_completed_at >= (time() - (2*60*60))
```

How can we test it when we can't control the current time?
--

Value of `time()` increases from zero throughout the test

```yaml
evaluation_interval: 1h
rule_files:
  - job_completion.rules.yml
tests:
  - interval: 1h
    input_series:
      - series: job_completed_at{job="job-one"}
        values: 0+0x3  # t=0h
      - series: job_completed_at{job="job-two"}
        values: 3600+0x3  # t=1h
    promql_expr_test:
      - expr: job_completed_recently
        eval_time: 2h
        exp_samples:
          - labels: job_completed_recently{job="job-one"}
            value: 0
          - labels: job_completed_recently{job="job-two"}
            value: 3600
      - expr: job_completed_recently
        eval_time: 3h
        exp_samples:
          - labels: job_completed_recently{job="job-two"}
            value: 3600
```

---

## Test in Docker

If you're not yet using Prometheus >=2.5.0, you can use a more recent version just for testing

```dockerfile
FROM prom/prometheus:v2.9.2
WORKDIR /data
COPY *.rules.yml .
COPY *.rules.test.yml .
RUN promtool test rules *.test.yml
```

Note:

* Will only build successfully if tests pass

--

You can trigger the build from Gradle

```gradle
task testUsingRecentPrometheus(type:Exec) {
  commandLine "docker", "build", "-t", "prometheus-test", "."
}
```

---

## Summary

Advantages:

* Repeatable unit tests with precise control
* Very quick to run
* Can test alerts that only fire after an extended time period

But:

* They are only unit tests
* Only as good as the simulated metrics
* Cannot pick up on changes in the _sources_ of metrics

Note:

* e.g. when [kube-state-metrics](https://github.com/kubernetes/kube-state-metrics) makes changes to metric names and labels
# HighDiskIO test with one sample per second, so that each increment in
# `values` is directly a per-second read rate.
- interval: 1s
  input_series:
    # sda: rising at 1001/sec for almost 2 mins, then at 10/sec for the
    # last second (final sample is 119119 + 10).
    - series: node_disk_reads_completed_total{device="sda"}
      values: 0+1001x119 119129
    # sdb: rising at 1005/sec for 4 mins.
    - series: node_disk_reads_completed_total{device="sdb"}
      values: 0+1005x240
    # sdc: rising at 1000/sec for almost 4m, then 1200/sec for the last
    # second (final sample is 239000 + 1200).
    - series: node_disk_reads_completed_total{device="sdc"}
      values: 0+1000x239 240200
  alert_rule_test:
    # After 2 minutes only sdb is expected to alert.
    - alertname: HighDiskIO
      eval_time: 2m
      exp_alerts:
        - exp_labels:
            severity: minor
            device: sdb
          exp_annotations:
            description: 'High I/O on sdb: 1005 reads/sec'
    # After 4 minutes sdb still alerts and sdc has joined it.
    - alertname: HighDiskIO
      eval_time: 4m
      exp_alerts:
        - exp_labels:
            severity: minor
            device: sdb
          exp_annotations:
            description: 'High I/O on sdb: 1005 reads/sec'
        - exp_labels:
            severity: minor
            device: sdc
          exp_annotations:
            # Actual value without rounding is 1001.6666666666666
            description: 'High I/O on sdc: 1002 reads/sec'
# Example samples whose values are Unix timestamps (seconds) of job completion.
# Comments are kept on their own lines: the text exposition format only
# treats `#` as a comment marker at the start of a line.
# Tue 26 Dec 2017 20:49:13 GMT
job_completed_at{job="job-one"} 1514321353
# Wed 3 Apr 2019 20:55:53 BST
job_completed_at{job="job-two"} 1554321353
# Recording rule that derives a new series from job_completed_at.
rules:
  - record: job_completed_recently
    # Keeps only the series whose value is at or after the evaluation time
    # minus 2*60*60 seconds, i.e. within the past two hours.
    expr: job_completed_at >= (time() - (2*60*60))
# Unit tests for the job_completed_recently recording rule, using one
# sample and one rule evaluation per hour.
evaluation_interval: 1h

rule_files:
  - job_completion.rules.yml

tests:
  - interval: 1h
    input_series:
      # job-one reports a completion timestamp of 0 (t=0h) in all 4 samples.
      - series: job_completed_at{job="job-one"}
        values: 0+0x3
      # job-two reports a completion timestamp of 3600 (t=1h) in all 4 samples.
      - series: job_completed_at{job="job-two"}
        values: 3600+0x3
    promql_expr_test:
      # At 2h both completion times are within the past two hours.
      - expr: job_completed_recently
        eval_time: 2h
        exp_samples:
          - labels: job_completed_recently{job="job-one"}
            value: 0
          - labels: job_completed_recently{job="job-two"}
            value: 3600
      # At 3h only job-two (completed at 1h) is still within two hours.
      - expr: job_completed_recently
        eval_time: 3h
        exp_samples:
          - labels: job_completed_recently{job="job-two"}
            value: 3600