let
grpcPort = 19090;
queryPort = 9090;
minioPort = 9000;
pushgwPort = 9091;
frontPort = 9092;
s3 = {
accessKey = "BKIKJAA5BMMU2RHO6IBB";
secretKey = "V7f1CwQqAcwo80UEIJEjc5gVQUSSx5ohQ9GSrr12";
};
objstore.config = {
type = "S3";
config = {
bucket = "thanos-bucket";
endpoint = "s3:${toString minioPort}";
region = "us-east-1";
access_key = s3.accessKey;
secret_key = s3.secretKey;
insecure = true;
signature_version2 = false;
put_user_metadata = { };
http_config = {
idle_conn_timeout = "0s";
insecure_skip_verify = false;
trace = {
enable = false;
in
import ./make-test-python.nix {
name = "prometheus";
nodes = {
prometheus =
{ pkgs, ... }:
{
virtualisation.diskSize = 2 * 1024;
virtualisation.memorySize = 2048;
environment.systemPackages = [ pkgs.jq ];
networking.firewall.allowedTCPPorts = [ grpcPort ];
services.prometheus = {
enable = true;
enableReload = true;
scrapeConfigs = [
job_name = "prometheus";
static_configs = [
targets = [ "127.0.0.1:${toString queryPort}" ];
labels = {
instance = "localhost";
}
];
job_name = "pushgateway";
scrape_interval = "1s";
targets = [ "127.0.0.1:${toString pushgwPort}" ];
rules = [
''
groups:
- name: test
rules:
- record: testrule
expr: count(up{job="prometheus"})
globalConfig = {
external_labels = {
some_label = "required by thanos";
extraFlags = [
# Required by thanos
"--storage.tsdb.min-block-duration=5s"
"--storage.tsdb.max-block-duration=5s"
services.prometheus.pushgateway = {
web.listen-address = ":${toString pushgwPort}";
persistMetrics = true;
persistence.interval = "1s";
stateDir = "prometheus-pushgateway";
services.thanos = {
sidecar = {
grpc-address = "0.0.0.0:${toString grpcPort}";
inherit objstore;
# TODO: Add some tests for these services:
#rule = {
# enable = true;
# http-address = "0.0.0.0:19194";
# grpc-address = "0.0.0.0:19193";
# query.addresses = [
# "localhost:19191"
# ];
# labels = {
# just = "some";
# nice = "labels";
# };
#};
#
#receive = {
# http-address = "0.0.0.0:19195";
# Adds a "specialisation" of the above config which allows us to
# "switch" to it and see if the services.prometheus.enableReload
# functionality actually reloads the prometheus service instead of
# restarting it.
specialisation = {
"prometheus-config-change" = {
configuration = {
environment.systemPackages = [ pkgs.yq ];
# This configuration just adds a new prometheus job
# to scrape the node_exporter metrics of the s3 machine.
job_name = "s3-node_exporter";
targets = [ "s3:9100" ];
query =
services.thanos.query = {
http-address = "0.0.0.0:${toString queryPort}";
endpoints = [
"prometheus:${toString grpcPort}"
services.thanos.query-frontend = {
http-address = "0.0.0.0:${toString frontPort}";
query-frontend.downstream-url = "http://127.0.0.1:${toString queryPort}";
store =
environment.systemPackages = with pkgs; [
jq
thanos
services.thanos.store = {
http-address = "0.0.0.0:10902";
sync-block-duration = "1s";
services.thanos.compact = {
http-address = "0.0.0.0:10903";
consistency-delay = "5s";
"localhost:${toString grpcPort}"
s3 =
# Minio requires at least 1GiB of free disk space to run.
virtualisation = {
diskSize = 2 * 1024;
networking.firewall.allowedTCPPorts = [ minioPort ];
services.minio = {
inherit (s3) accessKey secretKey;
environment.systemPackages = [ pkgs.minio-client ];
services.prometheus.exporters.node = {
openFirewall = true;
testScript =
{ nodes, ... }:
# Before starting the other machines we first make sure that our S3 service is online
# and has a bucket added for thanos:
s3.start()
s3.wait_for_unit("minio.service")
s3.wait_for_open_port(${toString minioPort})
s3.succeed(
"mc config host add minio "
+ "http://localhost:${toString minioPort} "
+ "${s3.accessKey} ${s3.secretKey} --api s3v4",
"mc mb minio/thanos-bucket",
)
# Now that s3 has started we can start the other machines:
for machine in prometheus, query, store:
machine.start()
# Check if prometheus responds to requests:
prometheus.wait_for_unit("prometheus.service")
prometheus.wait_for_open_port(${toString queryPort})
prometheus.succeed("curl -sf http://127.0.0.1:${toString queryPort}/metrics")
# Let's test if pushing a metric to the pushgateway succeeds:
prometheus.wait_for_unit("pushgateway.service")
prometheus.succeed(
"echo 'some_metric 3.14' | "
+ "curl -f --data-binary \@- "
+ "http://127.0.0.1:${toString pushgwPort}/metrics/job/some_job"
# Now check whether that metric gets ingested by prometheus.
# Since we'll check for the metric several times on different machines
# we abstract the test using the following function:
# Function to check if the metric "some_metric" has been received and returns the correct value.
def wait_for_metric(machine):
return machine.wait_until_succeeds(
"curl -sf 'http://127.0.0.1:${toString queryPort}/api/v1/query?query=some_metric' | "
+ "jq '.data.result[0].value[1]' | grep '\"3.14\"'"
wait_for_metric(prometheus)
# Let's test if the pushgateway persists metrics to the configured location.
prometheus.wait_until_succeeds("test -e /var/lib/prometheus-pushgateway/metrics")
# Test thanos
prometheus.wait_for_unit("thanos-sidecar.service")
# Test if the Thanos query service can correctly retrieve the metric that was send above.
query.wait_for_unit("thanos-query.service")
wait_for_metric(query)
# Test Thanos query frontend service
query.wait_for_unit("thanos-query-frontend.service")
query.succeed("curl -sS http://localhost:${toString frontPort}/-/healthy")
# Test if the Thanos sidecar has correctly uploaded its TSDB to S3, if the
# Thanos storage service has correctly downloaded it from S3 and if the Thanos
# query service running on $store can correctly retrieve the metric:
store.wait_for_unit("thanos-store.service")
wait_for_metric(store)
store.wait_for_unit("thanos-compact.service")
# Test if the Thanos bucket command is able to retrieve blocks from the S3 bucket
# and check if the blocks have the correct labels:
store.succeed(
"thanos tools bucket ls "
+ "--objstore.config-file=${nodes.store.config.services.thanos.store.objstore.config-file} "
+ "--output=json | "
+ "jq .thanos.labels.some_label | "
+ "grep 'required by thanos'"
'';