depot/ops/nixos/totoro/default.nix

454 lines
15 KiB
Nix
Raw Normal View History

2020-06-28 18:32:52 +00:00
# SPDX-FileCopyrightText: 2020 Luke Granger-Brown <depot@lukegb.com>
#
# SPDX-License-Identifier: Apache-2.0
# NixOS configuration module for the host "totoro".
# `depot` is the monorepo package/ops tree; `secrets` below is pulled from depot.ops.
# NOTE(review): `rebuilder` is accepted as an argument but is not referenced in the
# portion of this file visible here — confirm whether it is still needed.
{ depot, lib, pkgs, rebuilder, config, ... }:
let
# Shorthand for depot.ops.secrets, used throughout for tokens and credentials.
inherit (depot.ops) secrets;
in {
2021-01-06 21:29:33 +00:00
imports = [
../../../third_party/nixpkgs/nixos/modules/installer/scan/not-detected.nix
../lib/client.nix
../lib/whitby-distributed.nix
../lib/twitternuke.nix
2021-01-20 17:55:31 +00:00
../lib/quotes.bfob.gg.nix
2021-01-06 21:29:33 +00:00
];
2020-06-28 18:32:52 +00:00
# Kernel modules made available to the initrd.
boot.initrd.availableKernelModules = [
  "xhci_pci"
  "ahci"
  "nvme"
  "usb_storage"
  "usbhid"
  "sd_mod"
];
# kvm-intel is ordered after module lists contributed by other modules (mkAfter).
boot.kernelModules = lib.mkAfter [ "kvm-intel" ];
# NOTE(review): this turns off CPU vulnerability mitigations — presumably a
# deliberate performance trade-off for this host; confirm it is still wanted.
boot.kernelParams = [ "mitigations=off" ];
# Mount table: every mount point is a ZFS dataset except the vfat /boot partition.
fileSystems =
  let
    # Turn a ZFS dataset name into a fileSystems entry.
    zfsDataset = dataset: {
      fsType = "zfs";
      device = dataset;
    };

    # mount point -> ZFS dataset name
    zfsMounts = builtins.mapAttrs (_mountPoint: zfsDataset) {
      "/" = "zboot/safe/root";
      "/nix" = "zboot/local/nix";
      "/home" = "tank/safe/home";
      "/export" = "tank/safe/export";
      "/srv" = "tank/safe/srv";
      "/srv/pancake" = "tank/safe/srv/pancake";
      "/persist" = "tank/safe/persist";
      "/store" = "tank/local/store";
    };
  in
  zfsMounts // {
    "/boot" = {
      fsType = "vfat";
      device = "/dev/disk/by-uuid/D178-4E19";
    };
  };
# Boot with systemd-boot (EFI) and allow EFI variables to be modified.
boot.loader = {
  systemd-boot.enable = true;
  efi.canTouchEfiVariables = true;
};
2021-04-25 21:44:05 +00:00
# Select PostgreSQL 13 as the package for services.postgresql
# (the service itself is not enabled in this file).
services.postgresql.package = pkgs.postgresql_13;

# Build parallelism and CPU governor; mkDefault leaves room for overrides elsewhere.
nix.maxJobs = lib.mkDefault 8;
powerManagement.cpuFreqGovernor = lib.mkDefault "performance";

# Enable Podman.
virtualisation.podman.enable = true;
2020-06-28 18:32:52 +00:00
# Extra packages.
environment.systemPackages = [
  # secretsync, configured to work out of lukegb's depot checkout and to write the
  # OPS_SECRETS_DEFAULT_NIX variable to ops/secrets/default.nix.
  (depot.nix.pkgs.secretsync.configure {
    workingDir = "/home/lukegb/depot";
    gitlabAccessToken = secrets.deployer.gitlabAccessToken;
    manifestVariable = "SECRETS_MANIFEST";
    variablesToFile.OPS_SECRETS_DEFAULT_NIX = "ops/secrets/default.nix";
  })
];
2020-06-28 18:32:52 +00:00
# Networking!
networking = {
  hostName = "totoro";
  domain = "int.as205479.net";
  hostId = "676c08c4";

  # DHCP is off globally and switched on per interface below.
  useDHCP = false;

  bridges = {
    # External bridge on top of the physical NIC.
    br-ext.interfaces = [ "enp0s31f6" ];
    # Internal bridge with no physical members.
    br-int.interfaces = [ ];
  };

  interfaces = {
    br-ext.useDHCP = true;
    br-int = {
      virtual = true;
      useDHCP = false;
      ipv4.addresses = [
        {
          address = "10.0.0.2";
          prefixLength = 24;
        }
      ];
    };
  };

  firewall = {
    allowedTCPPorts = [
      80 443 # web
      4001 # ipfs
    ];
    allowedUDPPorts = [
      4001 # ipfs
    ];
  };
};
# This host's Tailscale address (custom `my.ip` option, defined outside this file).
my.ip.tailscale = "100.122.86.11";
2020-06-28 22:22:43 +00:00
# Virtualisation: libvirtd, with guests allowed to attach to the listed bridges.
virtualisation.libvirtd.enable = true;
virtualisation.libvirtd.allowedBridges = [
  "virbr0"
  "br-ext"
];
2020-06-28 22:23:43 +00:00
users.users = {
  # Extra packages and libvirt access for lukegb (the base account is defined elsewhere).
  lukegb = {
    packages = [ depot.pkgs.irssi ];
    extraGroups = lib.mkAfter [ "libvirtd" ];
  };

  # System account that owns the Pancake install under /srv/pancake.
  pancake = {
    isSystemUser = true;
    group = "pancake";
    home = "/srv/pancake";
  };

  # nginx is added to the acme group.
  nginx.extraGroups = lib.mkAfter [ "acme" ];
};

# nginx is also a member of the pancake group.
users.groups.pancake.members = [
  "pancake"
  "nginx"
];
2020-06-28 18:32:52 +00:00
# tmpfiles "L" entry: create the symlink /var/lib/export -> /export.
# NOTE(review): presumably this lets services whose state directory is given relative
# to /var/lib (e.g. Prometheus' stateDir "export/monitoring/prometheus" in this file)
# store their data on the /export dataset — confirm.
systemd.tmpfiles.rules = [
"L /var/lib/export - - - - /export"
];
2020-11-17 03:14:04 +00:00
# nginx front-end for the Pancake install at invoices.lukegb.com.
services.nginx = {
  enable = true;
  virtualHosts."invoices.lukegb.com" =
    let
      # Shared location body: rewrite everything to /index.php and hand it to the
      # "pancake" PHP-FPM pool over its unix socket.
      phpHandler = {
        extraConfig = ''
          rewrite ^(.*)$ /index.php break;
          fastcgi_split_path_info ^(.+\.php)(/.+)$;
          fastcgi_index index.php;
          fastcgi_pass unix:${config.services.phpfpm.pools.pancake.socket};
          include ${pkgs.nginx}/conf/fastcgi_params;
          include ${pkgs.nginx}/conf/fastcgi.conf;
        '';
      };
    in
    {
      root = "/srv/pancake/public_html";
      useACMEHost = "invoices.lukegb.com";
      forceSSL = true;

      # Static files first; anything missing (or forbidden) falls through to the router.
      locations."/" = {
        tryFiles = "$uri $uri/ @router";
        index = "index.html index.php";
        extraConfig = ''
          error_page 403 = @router;
          error_page 404 = @router;
        '';
      };

      # URIs ending in a literal ".php", or whose last path segment has no dot
      # (pretty URLs), go to PHP. The dot is escaped so that e.g. "/file.xphp" is
      # no longer treated as a PHP script ("\\." in Nix is "\." in the nginx regex).
      locations."~ (\\.php|\\/[^./]+)$" = phpHandler;
      locations."@router" = phpHandler;
    };
};
# PHP-FPM pool serving Pancake; the socket is owned by the nginx user so nginx can
# connect to it.
services.phpfpm.pools.pancake = {
  user = "pancake";
  group = "pancake";
  phpEnv.PATH = lib.makeBinPath [ pkgs.php ];
  settings = {
    "listen.owner" = config.services.nginx.user;

    # Dynamic process manager sizing.
    pm = "dynamic";
    "pm.max_children" = 32;
    "pm.max_requests" = 500;
    "pm.start_servers" = 2;
    "pm.min_spare_servers" = 2;
    "pm.max_spare_servers" = 5;

    # Send PHP errors and worker output to stderr.
    "php_admin_value[error_log]" = "stderr";
    "php_admin_flag[log_errors]" = true;
    catch_workers_output = true;
  };
};
# MariaDB with a "pancake" database and a same-named user holding full rights on it.
services.mysql = {
  enable = true;
  package = pkgs.mariadb;
  ensureDatabases = [ "pancake" ];
  ensureUsers = [
    {
      name = "pancake";
      ensurePermissions."pancake.*" = "ALL PRIVILEGES";
    }
  ];
};
# ACME certificate for the invoices vhost, validated through Cloudflare DNS.
security.acme =
  let
    invoicesHost = "invoices.lukegb.com";
  in
  {
    acceptTerms = true;
    email = "letsencrypt@lukegb.com";
    certs.${invoicesHost} = {
      domain = invoicesHost;
      dnsProvider = "cloudflare";
      credentialsFile = secrets.cloudflareCredentials;
      # Reload nginx after the certificate run.
      postRun = ''
        systemctl reload nginx
      '';
    };
  };
# Prometheus server, pushgateway, alert rules, Alertmanager and the SNMP exporter.
services.prometheus = {
  enable = true;
  # NOTE(review): a relative path — presumably resolved below /var/lib, where this
  # file's tmpfiles rule symlinks /var/lib/export to /export; confirm.
  stateDir = "export/monitoring/prometheus";

  # Deliver alerts to the Alertmanager configured further down.
  alertmanagers = [
    {
      scheme = "http";
      static_configs = [
        {
          targets = [ "localhost:${toString config.services.prometheus.alertmanager.port}" ];
        }
      ];
    }
  ];

  globalConfig.scrape_interval = "15s";

  # Exporter jobs defined across the depot, followed by host-specific jobs.
  scrapeConfigs = (builtins.attrValues depot.ops.nixos.systemExporters) ++ [
    {
      # Blade enclosure via the SNMP exporter: the device address is moved into the
      # `target` query parameter and `instance` label, and the scrape itself is
      # pointed at the local snmp exporter.
      job_name = "blade-oa/snmp";
      metrics_path = "/snmp";
      params = {
        module = [ "hpe" ];
      };
      static_configs = [
        {
          targets = [ "10.100.1.200" ];
        }
      ];
      relabel_configs = [
        {
          source_labels = [ "__address__" ];
          target_label = "__param_target";
        }
        {
          source_labels = [ "__param_target" ];
          target_label = "instance";
        }
        {
          target_label = "__address__";
          replacement = "totoro:${toString config.services.prometheus.exporters.snmp.port}";
        }
      ];
    }
    {
      job_name = "minotar/minotarproxy";
      scheme = "https";
      static_configs = [
        {
          targets = [ "minotarproxy.lukegb.xyz:443" ];
        }
      ];
    }
    {
      # Federate the Hydra job metrics from the NixOS project's Prometheus.
      job_name = "nixos/prometheus";
      metrics_path = "/prometheus/federate";
      honor_labels = true;
      params = {
        "match[]" = [
          ''hydra_job_failed{current="1"}''
          ''hydra_job_completion_time{current="1"}''
        ];
      };
      scheme = "https";
      static_configs = [
        {
          targets = [ "monitoring.nixos.org:443" ];
        }
      ];
    }
  ];

  pushgateway.enable = true;

  # Alerting rules (Prometheus rule-file YAML).
  # The two Smokeping packet-loss descriptions no longer append "%" after
  # humanizePercentage: that template function already renders the percent sign,
  # so the old text produced e.g. "1.5%%".
  # NOTE(review): the `host=~` selectors in the Smokeping rules use unescaped dots,
  # which match any character; left as-is here — confirm whether literal dots
  # were intended.
  rules = [
    ''
      groups:
      - name: alerting
        rules:
        # Blade power
        - alert: AveragePowerUsageTooHigh
          expr: (sum(avg_over_time(cpqRackPowerSupplyCurPwrOutput{job="blade-oa/snmp"}[10m])) / 230) > 6.5
          labels:
            severity: page
          annotations:
            summary: "Blade: Power Usage Too High (rolling)"
            description: "Power usage of blade system has been too high for last 10 minutes ({{ $value }}). https://grafana.int.lukegb.com/d/g-u3XQ8Gk/blade-power"
        - alert: PowerUsageTooHigh
          expr: (sum(cpqRackPowerSupplyCurPwrOutput{job="blade-oa/snmp"}) / 230) > 6.5
          for: 10m
          labels:
            severity: page
          annotations:
            summary: "Blade: Power Usage Too High"
            description: "Power usage of blade system has been too high for last 10 minutes ({{ $value }}). https://grafana.int.lukegb.com/d/g-u3XQ8Gk/blade-power"
        - alert: BladePowerUsageOutOfBounds
          expr: node_hwmon_power_average_watt{system=~"blade-.*"} > on () group_left() (1.5 * quantile(0.5, node_hwmon_power_average_watt{system=~"blade-.*"}))
          for: 60m
          labels:
            severity: page
          annotations:
            summary: "Blade: Single Blade Power Usage Out of Bounds"
            description: "{{ $labels.system }} has power usage of {{ $value }}, which is out of expected bounds."
        # Systems
        - alert: NodeExporterDown
          expr: up{exporter="node", system=~"(blade-(tuvok|paris|janeway|torres)|kusakabe|marukuru|swann|totoro|clouvider-.*|etheroute-.*)"} < 1
          for: 30m
          labels:
            severity: page
          annotations:
            summary: "Node exporter no longer scrapable"
            description: "{{ $labels.system }} is not reachable from totoro."
        # Alert if the NixOS channels are broken
        - alert: NixOSChannelBad
          expr: hydra_job_failed{} == 1
          for: 30m
          labels:
            severity: email
          annotations:
            summary: "NixOS Channel {{ $labels.channel }} failing"
            description: "The channel {{ $labels.channel }} is failing - see https://hydra.nixos.org/job/{{ $labels.project }}/{{ $labels.jobset }}/tested"
        # Packet loss
        - alert: SmokepingAveragePacketLossHigh
          expr: (avg((rate(smokeping_requests_total{host=~"(([a-z0-9]+.)+[a-z]+|([0-9]+.){3}[0-9]+)"}[5m]) - rate(smokeping_response_duration_seconds_count[5m])) / rate(smokeping_requests_total[5m])) by (system)) >= 0.01
          for: 10m
          labels:
            severity: page
          annotations:
            summary: "Average packet loss from {{ $labels.system }} high"
            description: "The average packet loss from {{ $labels.system }} is {{ $value | humanizePercentage }}, which is too high."
        - alert: SmokepingPacketLossVeryHigh
          expr: ((rate(smokeping_requests_total{host=~"(([a-z0-9]+.)+[a-z]+|([0-9]+.){3}[0-9]+)"}[5m]) - rate(smokeping_response_duration_seconds_count[5m])) / rate(smokeping_requests_total[5m])) >= 0.10
          for: 10m
          labels:
            severity: page
          annotations:
            summary: "Packet loss to {{ $labels.host }} from {{ $labels.system }} high"
            description: "The packet loss from {{ $labels.system }} to {{ $labels.host }} (IP: {{ $labels.ip }}) is very high ({{ $value | humanizePercentage }})."
        # Ping latency
        - alert: Smokeping95LatencyHigh
          expr: histogram_quantile(0.95, sum(rate(smokeping_response_duration_seconds_bucket{host=~"^(1.1.1.1|8.8.8.8)$"}[5m])) by (le, host, system)) > 0.03
          for: 15m
          labels:
            severity: page
          annotations:
            summary: "Ping latency from {{ $labels.system }} to {{ $labels.host }} high"
            description: "The 95th-percentile ping latency from {{ $labels.system }} to {{ $labels.host }} is {{ $value }}."
        # Internet connectivity
        - alert: MaldenRoadInternetConnectivityFailure
          expr: sum(bird_bfd_session_state{state="Up"} * on(instance,name,neighbor_address,system) group_left(device) bird_bfd_session_device) by (instance,neighbor_address,device,state,system) < 1
          labels:
            severity: page
          annotations:
            summary: "Device {{ $labels.device }} on {{ $labels.system }} reports BFD down to neighbour {{ $labels.neighbor_address }}"
            description: "Ruh roh, Raggy"
    ''
  ];

  # Alertmanager: one catch-all receiver that fans out to a local webhook on
  # port 9997 (the address the alertmanager-discord service in this file listens
  # on) and to Pushover.
  alertmanager = {
    enable = true;
    configuration = {
      global = { };
      route = {
        receiver = "default-receiver";
      };
      receivers = [
        {
          name = "default-receiver";
          webhook_configs = [
            {
              url = "http://localhost:9997";
            }
          ];
          pushover_configs = [
            {
              user_key = secrets.pushover.userKey;
              token = secrets.pushover.tokens.alertmanager;
            }
          ];
        }
      ];
    };
  };

  # SNMP exporter used by the blade-oa/snmp scrape job above.
  exporters.snmp = {
    enable = true;
    configurationPath = depot.nix.pkgs.prometheus-snmp-config;
  };
};
2020-12-29 20:08:55 +00:00
# Grafana, with logins taken from headers set by an authenticating proxy
# (the header names reference Pomerium).
# NOTE(review): auth.proxy trusts the X-Pomerium-Claim-* headers while Grafana binds
# to 0.0.0.0 — confirm that port 3000 is only reachable through the proxy.
services.grafana = {
  enable = true;
  addr = "0.0.0.0";
  port = 3000;
  domain = "grafana.int.lukegb.com";
  rootUrl = "https://grafana.int.lukegb.com/";

  # extraOptions wants flat SECTION_KEY names; build them from a nested
  # { section = { key = value; }; } attribute set.
  extraOptions =
    let
      # "auth.proxy" -> "AUTH_PROXY", "cookie-secure" -> "COOKIE_SECURE"
      envName = raw: lib.toUpper (builtins.replaceStrings [ "." "-" ] [ "_" "_" ] raw);

      # One section -> list of { name = "SECTION_KEY"; value = ...; } pairs.
      sectionToPairs = section: options:
        lib.mapAttrsToList (key: value: {
          inherit value;
          name = "${envName section}_${envName key}";
        }) options;

      # All sections -> a single flat attribute set.
      flattenSections = sections:
        builtins.listToAttrs (builtins.concatLists (lib.mapAttrsToList sectionToPairs sections));
    in
    flattenSections {
      "auth.proxy" = {
        enabled = "true";
        header_name = "X-Pomerium-Claim-Email";
        header_property = "email";
        headers = "username:X-Pomerium-Claim-User";
        auto_sign_up = "true";
      };
      security.cookie_secure = "true";
    };
};
# Rebuild Grafana's plugin directory on every start: wipe it, recreate it, then
# symlink each packaged plugin in under its pname. Appended after any preStart
# defined elsewhere (mkAfter).
systemd.services.grafana.preStart =
  let
    grafanaPlugins = depot.pkgs.grafana-plugins;
    pluginDir = "${config.services.grafana.dataDir}/plugins";
    wantedPlugins = [
      grafanaPlugins.grafana-piechart-panel
      grafanaPlugins.grafana-clock-panel
      grafanaPlugins.grafana-worldmap-panel
      grafanaPlugins.grafana-polystat-panel
    ];
    # One "ln -sf" line per plugin (each already ends in a newline).
    linkCommands = lib.concatMapStringsSep "\n"
      (plugin: "ln -sf ${plugin} ${pluginDir}/${plugin.pname}\n")
      wantedPlugins;
  in
  lib.mkAfter ''
    rm -rf ${pluginDir}
    mkdir ${pluginDir}
    ${linkCommands}
  '';
2021-01-15 03:38:43 +00:00
# IPFS daemon with its repository on /store and the experimental filestore enabled.
services.ipfs = {
  enable = true;
  dataDir = "/store/ipfs";
  extraConfig.Experimental.FilestoreEnabled = true;
};
2021-02-23 01:07:33 +00:00
# Webhook bridge that Alertmanager posts to on localhost:9997.
systemd.services.alertmanager-discord =
  let
    # NOTE(review): pkgs.writeText places this file, including the webhook URL, in
    # the Nix store — confirm that is acceptable for this secret.
    discordEnv = pkgs.writeText "discord-secret" ''
      DISCORD_WEBHOOK=${secrets.monitoring.alertmanager.discord.api_url}
    '';
  in
  {
    enable = true;
    wantedBy = [ "multi-user.target" ];
    serviceConfig = {
      ExecStart = "${depot.pkgs.alertmanager-discord}/bin/alertmanager-discord -listen.address 127.0.0.1:9997";
      EnvironmentFile = discordEnv;

      # Run as a transient user with a reduced view of the system.
      DynamicUser = true;
      MountAPIVFS = true;
      PrivateTmp = true;
      PrivateUsers = true;
      ProtectControlGroups = true;
      ProtectKernelModules = true;
      ProtectKernelTunables = true;
    };
  };
2021-07-17 01:45:31 +00:00
# One-shot job that runs the depot's lego.sh with the settings below, to renew the
# certificate for kvm.lukegb.xyz on the Raritan device at RARITAN_IP.
# It is started by the systemd timer of the same name.
systemd.services.sslrenew-raritan = {
  enable = true;
  # Ordering after network-online.target only has an effect if that target is
  # actually part of the transaction, so pull it in with `wants` as well.
  wants = [ "network-online.target" ];
  after = [ "network-online.target" ];
  serviceConfig = {
    Type = "oneshot";
    ExecStart = "${depot.ops.raritan.ssl-renew}/lego.sh";
    # NOTE(review): pkgs.writeText places this file, including the API token and
    # the Raritan credentials, in the Nix store — confirm that is acceptable.
    EnvironmentFile = pkgs.writeText "sslrenew-secret" ''
      CERTIFICATE_DOMAIN=kvm.lukegb.xyz
      LETSENCRYPT_EMAIL=letsencrypt@lukegb.com
      CF_DNS_API_TOKEN=${secrets.cloudflareCredentials.token}
      RARITAN_IP=192.168.1.50
      RARITAN_USERNAME=${secrets.raritan.sslrenew.username}
      RARITAN_PASSWORD=${secrets.raritan.sslrenew.password}
    '';
    # Transient user with a private state directory, which is also the working directory.
    DynamicUser = true;
    StateDirectory = "sslrenew-raritan";
    StateDirectoryMode = "0700";
    WorkingDirectory = "/var/lib/sslrenew-raritan";
  };
};
# Fire the sslrenew-raritan service once a day.
systemd.timers.sslrenew-raritan = {
  enable = true;
  wantedBy = [ "timers.target" ];
  timerConfig.OnCalendar = "daily";
};
2020-06-28 18:32:52 +00:00
# NixOS state version for this host.
# NOTE(review): presumably the release the host was installed with; by NixOS
# convention this is not bumped on upgrades — confirm before changing it.
system.stateVersion = "20.03";
}