2020-04-24 23:36:52 +00:00
|
|
|
import ./make-test-python.nix ({pkgs, lib, ...}:
|
|
|
|
|
|
|
|
let
|
|
|
|
# Knobs shared by every Consul node in this test (servers and client agents).
webUi = true;
retry_interval = "1s";
raft_multiplier = 1;

# Baseline for `services.consul.extraConfig`; each node merges its own
# role-specific keys on top of this with `//`.
defaultExtraConfig = {
  inherit retry_interval;
  performance.raft_multiplier = raft_multiplier;
};
|
|
|
|
|
|
|
|
# Addresses of the Consul servers; `server N` picks the N-th entry (0-based).
allConsensusServerHosts = [ "192.168.1.1" "192.168.1.2" "192.168.1.3" ];
|
|
|
|
|
|
|
|
# Addresses of the Consul client agents; `client N` picks the N-th entry (0-based).
allConsensusClientHosts = [ "192.168.2.1" "192.168.2.2" ];
|
|
|
|
|
|
|
|
# Firewall openings applied to every node.
# See https://www.consul.io/docs/install/ports.html
firewallSettings =
  let
    # Ports that are opened for both TCP and UDP.
    tcpAndUdpPorts = [ 8301 8302 8600 ];
    # Ports that are opened for TCP only.
    tcpOnlyPorts = [ 8500 8300 ];
  in
  {
    allowedTCPPorts = tcpAndUdpPorts ++ tcpOnlyPorts;
    allowedUDPPorts = tcpAndUdpPorts;
  };
|
|
|
|
|
|
|
|
# Builds the NixOS module for the Consul client agent whose address is the
# `index`-th entry (0-based) of `allConsensusClientHosts`.
client = index: { pkgs, ... }:
  let
    address = builtins.elemAt allConsensusClientHosts index;
  in
  {
    # Make the `consul` CLI available for the test script.
    environment.systemPackages = [ pkgs.consul ];

    networking = {
      firewall = firewallSettings;
      # Override priority 0 so that this is the only address set on eth1.
      interfaces.eth1.ipv4.addresses = pkgs.lib.mkOverride 0 [
        { inherit address; prefixLength = 16; }
      ];
    };

    # Permit the unfree `consul` package, and nothing else.
    nixpkgs.config.allowUnfreePredicate = pkg: builtins.elem (lib.getName pkg) [ "consul" ];

    services.consul = {
      enable = true;
      inherit webUi;
      extraConfig = defaultExtraConfig // {
        server = false;
        bind_addr = address;
        # Client agents join the cluster through the servers.
        retry_join = allConsensusServerHosts;
      };
    };
  };
|
|
|
|
|
|
|
|
# Builds the NixOS module for the Consul server whose address is the
# `index`-th entry (0-based) of `allConsensusServerHosts`.
server = index: { pkgs, ... }:
  let
    serverCount = builtins.length allConsensusServerHosts;
    ownHost = builtins.elemAt allConsensusServerHosts index;
    # Servers are already identified by IP, so the host entry is the address.
    address = ownHost;
    # Every server except this one.
    peerHosts = builtins.filter (host: host != ownHost) allConsensusServerHosts;
  in
  {
    networking = {
      firewall = firewallSettings;
      # Override priority 0 so that this is the only address set on eth1.
      interfaces.eth1.ipv4.addresses = pkgs.lib.mkOverride 0 [
        { inherit address; prefixLength = 16; }
      ];
    };

    # Permit the unfree `consul` package, and nothing else.
    nixpkgs.config.allowUnfreePredicate = pkg: builtins.elem (lib.getName pkg) [ "consul" ];

    services.consul =
      assert builtins.elem ownHost allConsensusServerHosts;
      {
        enable = true;
        inherit webUi;
        extraConfig = defaultExtraConfig // {
          server = true;
          bind_addr = address;
          bootstrap_expect = serverCount;
          # Tell Consul that we never intend to drop below this many servers.
          # Ensures to not permanently lose consensus after temporary loss.
          # See https://github.com/hashicorp/consul/issues/8118#issuecomment-645330040
          autopilot.min_quorum = serverCount;
          retry_join =
            # With several servers, a node must not try to join itself and
            # joins only the other servers; a lone server is allowed to
            # self-join.
            # See https://github.com/hashicorp/consul/issues/2868
            if serverCount != 1
            then peerHosts
            else allConsensusServerHosts;
        };
      };
  };
|
|
|
|
in {
|
|
|
|
name = "consul";
|
|
|
|
|
|
|
|
nodes = {
  # Consul servers, one per entry of `allConsensusServerHosts`.
  server1 = server 0;
  server2 = server 1;
  server3 = server 2;

  # Consul client agents, one per entry of `allConsensusClientHosts`.
  client1 = client 0;
  client2 = client 1;
};
|
|
|
|
|
|
|
|
testScript = ''
|
|
|
|
# Machine groups used throughout the script: the Consul servers, and
# every node (servers plus client agents).
servers = [server1, server2, server3]
machines = [*servers, client1, client2]

# Wait for the Consul unit on every node before running any check.
for machine in machines:
    machine.wait_for_unit("consul.service")
|
|
|
|
|
2020-07-18 16:06:22 +00:00
|
|
|
|
|
|
|
def wait_for_healthy_servers():
    """
    Wait until every machine reports all servers as Raft voters.

    On each entry of `machines`, retries `consul operator raft list-peers`
    until the number of output lines containing `true` equals the number
    of entries in `servers`.
    """
    # See https://github.com/hashicorp/consul/issues/8118#issuecomment-645330040
    # for why the `Voter` column of `list-peers` has that info.
    # TODO: The `grep true` relies on the fact that currently in
    #       the output like
    #           # consul operator raft list-peers
    #           Node     ID   Address           State     Voter  RaftProtocol
    #           server3  ...  192.168.1.3:8300  leader    true   3
    #           server2  ...  192.168.1.2:8300  follower  true   3
    #           server1  ...  192.168.1.1:8300  follower  false  3
    #       `Voter` is the only boolean column.
    #       Change this to the more reliable way to be defined by
    #       https://github.com/hashicorp/consul/issues/8118
    #       once that ticket is closed.

    # Derived from `servers` so the check follows the server list instead
    # of repeating its size as a literal.
    expected_voters = len(servers)
    for m in machines:
        m.wait_until_succeeds(
            f"[ $(consul operator raft list-peers | grep true | wc -l) == {expected_voters} ]"
        )
|
|
|
|
|
|
|
|
|
|
|
|
def wait_for_all_machines_alive():
    """
    Wait until every machine lists all machines as `alive`.

    On each entry of `machines`, retries `consul members` until the number
    of `alive` occurrences equals the number of entries in `machines`.

    Note that Serf-"alive" does not mean "Raft"-healthy;
    see `wait_for_healthy_servers()` for that instead.
    """
    # Derived from `machines` so the check follows the machine list instead
    # of repeating its size as a literal.
    expected_alive = len(machines)
    for m in machines:
        m.wait_until_succeeds(
            f"[ $(consul members | grep -o alive | wc -l) == {expected_alive} ]"
        )
|
|
|
|
|
|
|
|
|
|
|
|
# Reach a known-good starting state: first the servers' Raft health,
# then membership of all machines (which also covers the clients).
wait_for_healthy_servers()
wait_for_all_machines_alive()

# Smoke test: a value written via one client is readable via the other.
client1.succeed("consul kv put testkey 42")
client2.succeed("[ $(consul kv get testkey) == 42 ]")
|
|
|
|
|
|
|
|
|
2023-03-27 19:17:25 +00:00
|
|
|
def rolling_restart_test(proper_rolling_procedure=True):
    """
    Tests that the cluster can tolerate failures of any single server,
    following the recommended rolling upgrade procedure from
    https://www.consul.io/docs/upgrading#standard-upgrades.

    Optionally, `proper_rolling_procedure=False` can be given
    to wait only for each server to be back `Healthy`, not `Stable`
    in the Raft consensus, see Consul setting `ServerStabilizationTime` and
    https://github.com/hashicorp/consul/issues/8118#issuecomment-645330040.
    """
    clients = (client1, client2)

    # Choose once how to wait for a restarted server: the full Raft health
    # check for the proper procedure, or (NOT the proper rolling upgrade
    # procedure, see above) only the membership check.
    wait_for_recovery = (
        wait_for_healthy_servers
        if proper_rolling_procedure
        else wait_for_all_machines_alive
    )

    for stopped in servers:
        stopped.block()
        stopped.systemctl("stop consul")

        # Make sure the stopped peer is recognized as being down
        peer_is_down = f"[ $(consul members | grep {stopped.name} | grep -o -E 'failed|left' | wc -l) == 1 ]"
        client1.wait_until_succeeds(peer_is_down)

        # For each client, wait until they have connection again
        # using `kv get -recurse` before issuing commands.
        for client in clients:
            client.wait_until_succeeds("consul kv get -recurse")

        # Do some consul actions while one server is down.
        client1.succeed("consul kv put testkey 43")
        client2.succeed("[ $(consul kv get testkey) == 43 ]")
        client2.succeed("consul kv delete testkey")

        stopped.unblock()
        stopped.systemctl("start consul")

        # Wait for recovery.
        wait_for_recovery()

        # Wait for client connections.
        for client in clients:
            client.wait_until_succeeds("consul kv get -recurse")

        # Do some consul actions with server back up.
        client1.succeed("consul kv put testkey 44")
        client2.succeed("[ $(consul kv get testkey) == 44 ]")
        client2.succeed("consul kv delete testkey")
|
|
|
|
|
|
|
|
|
|
|
|
def all_servers_crash_simultaneously_test():
    """
    Tests that the cluster will eventually come back after all
    servers crash simultaneously.
    """
    # Take all servers down first, without waiting for each stop to finish.
    for crashed in servers:
        crashed.block()
        crashed.systemctl("stop --no-block consul")

    for crashed in servers:
        # --no-block is async, so ensure it has been stopped by now
        crashed.wait_until_fails("systemctl is-active --quiet consul")
        crashed.unblock()
        crashed.systemctl("start consul")

    # Wait for recovery.
    wait_for_healthy_servers()

    # Wait for client connections.
    for client in (client1, client2):
        client.wait_until_succeeds("consul kv get -recurse")

    # Do some consul actions with servers back up.
    client1.succeed("consul kv put testkey 44")
    client2.succeed("[ $(consul kv get testkey) == 44 ]")
    client2.succeed("consul kv delete testkey")
|
2020-07-18 16:06:22 +00:00
|
|
|
|
|
|
|
|
|
|
|
# Run the tests, printing each one's name first so it shows up in the log.

# 1. Rolling restart following the proper procedure.
print("rolling_restart_test()")
rolling_restart_test()

# 2. All servers going down at the same time.
print("all_servers_crash_simultaneously_test()")
all_servers_crash_simultaneously_test()

# 3. Rolling restart again, waiting only for membership after each restart.
print("rolling_restart_test(proper_rolling_procedure=False)")
rolling_restart_test(proper_rolling_procedure=False)
|
2020-04-24 23:36:52 +00:00
|
|
|
'';
|
|
|
|
})
|