# SPDX-FileCopyrightText: 2020 Luke Granger-Brown
#
# SPDX-License-Identifier: Apache-2.0

# NixOS configuration for the host "totoro".
#
# Besides the base system (ZFS filesystems, systemd-boot, bridged networking,
# libvirt/podman) this module configures:
#   * nginx (with the RTMP module) fronting a PHP-FPM pool for
#     invoices.lukegb.com, backed by MariaDB;
#   * Prometheus, Alertmanager, the SNMP exporter and Grafana;
#   * IPFS;
#   * two auxiliary systemd services: alertmanager-discord and a daily
#     certificate renewal job (sslrenew-raritan).
{ depot, lib, pkgs, rebuilder, config, ... }:
let
  inherit (depot.ops) secrets;
in {
  imports = [
    ../../../third_party/nixpkgs/nixos/modules/installer/scan/not-detected.nix
    ../lib/client.nix
    ../lib/whitby-distributed.nix
    ../lib/twitternuke.nix
    ../lib/quotes.bfob.gg.nix
  ];

  boot.initrd.availableKernelModules = [
    "xhci_pci"
    "ahci"
    "nvme"
    "usb_storage"
    "usbhid"
    "sd_mod"
  ];
  boot.kernelModules = lib.mkAfter [ "kvm-intel" ];
  boot.kernelParams = [ "mitigations=off" ];

  fileSystems = let
    # Builds a filesystem entry of type "zfs" for the given device string.
    zfs = device: {
      device = device;
      fsType = "zfs";
    };
  in {
    "/" = zfs "zboot/safe/root";
    "/nix" = zfs "zboot/local/nix";
    "/home" = zfs "tank/safe/home";
    "/export" = zfs "tank/safe/export";
    "/srv" = zfs "tank/safe/srv";
    "/srv/pancake" = zfs "tank/safe/srv/pancake";
    "/persist" = zfs "tank/safe/persist";
    "/store" = zfs "tank/local/store";

    "/boot" = {
      device = "/dev/disk/by-uuid/D178-4E19";
      fsType = "vfat";
    };
  };

  # Use the systemd-boot EFI boot loader.
  boot.loader.systemd-boot.enable = true;
  boot.loader.efi.canTouchEfiVariables = true;
  services.postgresql.package = pkgs.postgresql_13;

  nix.maxJobs = lib.mkDefault 8;
  powerManagement.cpuFreqGovernor = lib.mkDefault "performance";
  virtualisation = {
    podman.enable = true;
  };

  # Extra packages.
  environment.systemPackages = with pkgs; [
    (depot.nix.pkgs.secretsync.configure {
      workingDir = "/home/lukegb/depot";
      gitlabAccessToken = secrets.deployer.gitlabAccessToken;
      manifestVariable = "SECRETS_MANIFEST";
      variablesToFile = {
        "OPS_SECRETS_DEFAULT_NIX" = "ops/secrets/default.nix";
      };
    })
  ];

  # Networking!
  networking = {
    hostName = "totoro"; # Define your hostname.
    domain = "int.as205479.net";
    hostId = "676c08c4";
    useDHCP = false;

    # br-ext bridges the physical NIC and obtains its address via DHCP.
    interfaces.br-ext.useDHCP = true;
    bridges.br-ext.interfaces = [ "enp0s31f6" ];

    # br-int is a bridge with no member interfaces and a static address.
    interfaces.br-int = {
      virtual = true;
      useDHCP = false;
      ipv4.addresses = [{ address = "10.0.0.2"; prefixLength = 24; }];
    };
    bridges.br-int.interfaces = [];

    firewall.allowedTCPPorts = [
      80 443 # web
      4001 # ipfs
    ];
    firewall.allowedUDPPorts = [
      4001 # ipfs
    ];
  };
  my.ip.tailscale = "100.122.86.11";

  # Virtualisation
  virtualisation.libvirtd = {
    enable = true;
    allowedBridges = [ "virbr0" "br-ext" ];
  };

  users.users.lukegb = {
    packages = with depot.pkgs; [ irssi ];
    extraGroups = lib.mkAfter [ "libvirtd" ];
  };
  users.users.pancake = {
    isSystemUser = true;
    group = "pancake";
    home = "/srv/pancake";
  };
  users.users.nginx.extraGroups = lib.mkAfter [ "acme" ];
  users.groups.pancake = {
    members = ["pancake" "nginx"];
  };

  # Symlink /var/lib/export to /export.
  # NOTE(review): presumably this is what makes the relative Prometheus
  # stateDir ("export/monitoring/prometheus") land on the /export dataset --
  # confirm against the NixOS prometheus module.
  systemd.tmpfiles.rules = [
    "L /var/lib/export - - - - /export"
  ];

  services.nginx = {
    enable = true;
    package = pkgs.nginxMainline;
    additionalModules = with pkgs.nginxModules; [
      rtmp
    ];
    appendConfig = ''
      rtmp {
        server {
          listen 1935;
          chunk_size 4000;

          application app {
            live on;
            record off;

            allow publish all;
            allow play all;

            push rtmp://coventry.beam.bfob.gg/beam/thecakeisalie;
          }
        }
      }
    '';
    virtualHosts = {
      "invoices.lukegb.com" = let
        # Shared location body: rewrite every request to /index.php and hand
        # it to the "pancake" PHP-FPM pool over its unix socket.
        fastcgi = {
          extraConfig = ''
            rewrite ^(.*)$ /index.php break;
            fastcgi_split_path_info ^(.+\.php)(/.+)$;
            fastcgi_index index.php;
            fastcgi_pass unix:${config.services.phpfpm.pools.pancake.socket};
            include ${pkgs.nginx}/conf/fastcgi_params;
            include ${pkgs.nginx}/conf/fastcgi.conf;
          '';
        };
      in {
        root = "/srv/pancake/public_html";
        useACMEHost = "invoices.lukegb.com";
        forceSSL = true;
        locations."/" = {
          tryFiles = "$uri $uri/ @router";
          index = "index.html index.php";
          # 403 and 404 responses are re-dispatched to the PHP front
          # controller as well.
          extraConfig = ''
            error_page 403 = @router;
            error_page 404 = @router;
          '';
        };
        locations."~ (.php|\\/[^./]+)$" = fastcgi;
        locations."@router" = fastcgi;
      };
    };
  };

  services.phpfpm = let
    # Pool settings used by the pancake pool below.
    settingsBase = {
      "listen.owner" = config.services.nginx.user;
      "pm" = "dynamic";
      "pm.max_children" = 32;
      "pm.max_requests" = 500;
      "pm.start_servers" = 2;
      "pm.min_spare_servers" = 2;
      "pm.max_spare_servers" = 5;
      "php_admin_value[error_log]" = "stderr";
      "php_admin_flag[log_errors]" = true;
      "catch_workers_output" = true;
    };
  in {
    pools.pancake = {
      user = "pancake";
      group = "pancake";
      settings = settingsBase;
      phpEnv."PATH" = lib.makeBinPath [ pkgs.php ];
    };
  };

  services.mysql = {
    enable = true;
    package = pkgs.mariadb;
    ensureDatabases = ["pancake"];
    ensureUsers = [{
      name = "pancake";
      ensurePermissions = {
        "pancake.*" = "ALL PRIVILEGES";
      };
    }];
  };

  security.acme = {
    acceptTerms = true;
    email = "letsencrypt@lukegb.com";
    certs."invoices.lukegb.com" = {
      domain = "invoices.lukegb.com";
      dnsProvider = "cloudflare";
      credentialsFile = secrets.cloudflareCredentials;
      postRun = ''
        systemctl reload nginx
      '';
    };
  };

  services.prometheus = {
    enable = true;
    stateDir = "export/monitoring/prometheus";
    alertmanagers = [{
      scheme = "http";
      static_configs = [{
        targets = ["localhost:${toString config.services.prometheus.alertmanager.port}"];
      }];
    }];
    globalConfig.scrape_interval = "15s";
    scrapeConfigs = (builtins.attrValues depot.ops.nixos.systemExporters) ++ [{
      # The SNMP exporter on totoro is scraped with the real device passed as
      # the "target" parameter: the relabel rules copy __address__ into
      # __param_target and the "instance" label, then point __address__ at
      # the exporter itself.
      job_name = "blade-oa/snmp";
      metrics_path = "/snmp";
      params = {
        module = ["hpe"];
      };
      static_configs = [{
        targets = ["10.100.1.200"];
      }];
      relabel_configs = [{
        source_labels = ["__address__"];
        target_label = "__param_target";
      } {
        source_labels = ["__param_target"];
        target_label = "instance";
      } {
        target_label = "__address__";
        replacement = "totoro:${toString config.services.prometheus.exporters.snmp.port}";
      }];
    } {
      job_name = "minotar/minotarproxy";
      scheme = "https";
      static_configs = [{
        targets = ["minotarproxy.lukegb.xyz:443"];
      }];
    } {
      # Federate the two hydra_job_* series from monitoring.nixos.org; they
      # feed the NixOSChannelBad alert below.
      job_name = "nixos/prometheus";
      metrics_path = "/prometheus/federate";
      honor_labels = true;
      params = {
        "match[]" = [
          ''hydra_job_failed{current="1"}''
          ''hydra_job_completion_time{current="1"}''
        ];
      };
      scheme = "https";
      static_configs = [{
        targets = ["monitoring.nixos.org:443"];
      }];
    }];
    pushgateway.enable = true;

    # Alerting rules (Prometheus rule-file YAML).
    # The Smokeping descriptions deliberately have no literal "%" after
    # humanizePercentage: that template function already appends one.
    rules = [
      ''
        groups:
        - name: alerting
          rules:
          # Blade power
          - alert: AveragePowerUsageTooHigh
            expr: (sum(avg_over_time(cpqRackPowerSupplyCurPwrOutput{job="blade-oa/snmp"}[10m])) / 230) > 6.5
            labels:
              severity: page
            annotations:
              summary: "Blade: Power Usage Too High (rolling)"
              description: "Power usage of blade system has been too high for last 10 minutes ({{ $value }}). https://grafana.int.lukegb.com/d/g-u3XQ8Gk/blade-power"
          - alert: PowerUsageTooHigh
            expr: (sum(cpqRackPowerSupplyCurPwrOutput{job="blade-oa/snmp"}) / 230) > 6.5
            for: 10m
            labels:
              severity: page
            annotations:
              summary: "Blade: Power Usage Too High"
              description: "Power usage of blade system has been too high for last 10 minutes ({{ $value }}). https://grafana.int.lukegb.com/d/g-u3XQ8Gk/blade-power"
          - alert: BladePowerUsageOutOfBounds
            expr: node_hwmon_power_average_watt{system=~"blade-.*"} > on () group_left() (1.5 * quantile(0.5, node_hwmon_power_average_watt{system=~"blade-.*"}))
            for: 60m
            labels:
              severity: page
            annotations:
              summary: "Blade: Single Blade Power Usage Out of Bounds"
              description: "{{ $labels.system }} has power usage of {{ $value }}, which is out of expected bounds."
          # Systems
          - alert: NodeExporterDown
            expr: up{exporter="node", system=~"(blade-(tuvok|paris|janeway|torres)|kusakabe|marukuru|swann|totoro|clouvider-.*|etheroute-.*)"} < 1
            for: 30m
            labels:
              severity: page
            annotations:
              summary: "Node exporter no longer scrapable"
              description: "{{ $labels.system }} is not reachable from totoro."
          # Alert if the NixOS channels are broken
          - alert: NixOSChannelBad
            expr: hydra_job_failed{} == 1
            for: 30m
            labels:
              severity: email
            annotations:
              summary: "NixOS Channel {{ $labels.channel }} failing"
              description: "The channel {{ $labels.channel }} is failing - see https://hydra.nixos.org/job/{{ $labels.project }}/{{ $labels.jobset }}/tested"
          # Packet loss
          - alert: SmokepingAveragePacketLossHigh
            expr: (avg((rate(smokeping_requests_total{host=~"(([a-z0-9]+.)+[a-z]+|([0-9]+.){3}[0-9]+)"}[5m]) - rate(smokeping_response_duration_seconds_count[5m])) / rate(smokeping_requests_total[5m])) by (system)) >= 0.01
            for: 10m
            labels:
              severity: page
            annotations:
              summary: "Average packet loss from {{ $labels.system }} high"
              description: "The average packet loss from {{ $labels.system }} is {{ $value | humanizePercentage }}, which is too high."
          - alert: SmokepingPacketLossVeryHigh
            expr: ((rate(smokeping_requests_total{host=~"(([a-z0-9]+.)+[a-z]+|([0-9]+.){3}[0-9]+)"}[5m]) - rate(smokeping_response_duration_seconds_count[5m])) / rate(smokeping_requests_total[5m])) >= 0.10
            for: 10m
            labels:
              severity: page
            annotations:
              summary: "Packet loss to {{ $labels.host }} from {{ $labels.system }} high"
              description: "The packet loss from {{ $labels.system }} to {{ $labels.host }} (IP: {{ $labels.ip }}) is very high ({{ $value | humanizePercentage }})."
          # Ping latency
          - alert: Smokeping95LatencyHigh
            expr: histogram_quantile(0.95, sum(rate(smokeping_response_duration_seconds_bucket{host=~"^(1.1.1.1|8.8.8.8)$"}[5m])) by (le, host, system)) > 0.03
            for: 15m
            labels:
              severity: page
            annotations:
              summary: "Ping latency from {{ $labels.system }} to {{ $labels.host }} high"
              description: "The 95th-percentile ping latency from {{ $labels.system }} to {{ $labels.host }} is {{ $value }}."
          # Internet connectivity
          - alert: MaldenRoadInternetConnectivityFailure
            expr: sum(bird_bfd_session_state{state="Up"} * on(instance,name,neighbor_address,system) group_left(device) bird_bfd_session_device) by (instance,neighbor_address,device,state,system) < 1
            for: 15m
            labels:
              severity: page
            annotations:
              summary: "Device {{ $labels.device }} on {{ $labels.system }} reports BFD down to neighbour {{ $labels.neighbor_address }}"
              description: "Ruh roh, Raggy"
      ''
    ];

    alertmanager = {
      enable = true;
      configuration = {
        global = {};
        route = {
          receiver = "default-receiver";
        };
        receivers = [{
          name = "default-receiver";
          # The webhook points at the alertmanager-discord service defined
          # further down (it listens on 127.0.0.1:9997).
          webhook_configs = [{
            url = "http://localhost:9997";
          }];
          pushover_configs = [{
            user_key = secrets.pushover.userKey;
            token = secrets.pushover.tokens.alertmanager;
          }];
        }];
      };
    };

    exporters.snmp = {
      enable = true;
      configurationPath = depot.nix.pkgs.prometheus-snmp-config;
    };
  };

  services.grafana = {
    enable = true;
    addr = "0.0.0.0";
    port = 3000;
    domain = "grafana.int.lukegb.com";
    rootUrl = "https://grafana.int.lukegb.com/";
    extraOptions = let
      # Upper-cases a name and replaces "." and "-" with "_",
      # e.g. "auth.proxy" -> "AUTH_PROXY".
      convertName = name:
        lib.toUpper (builtins.replaceStrings ["." "-"] ["_" "_"] name);
      # Turns one section's options into a list of
      # { name = "<SECTION>_<OPTION>"; value = ...; } pairs.
      convertOptionSection = sectionName:
        lib.mapAttrsToList (name: value: {
          name = "${convertName sectionName}_${convertName name}";
          inherit value;
        });
      # Flattens a { section = { option = value; }; } set into a single
      # attribute set keyed by the converted names.
      convertOptions = opts:
        builtins.listToAttrs
          (builtins.concatLists (lib.mapAttrsToList convertOptionSection opts));
    in convertOptions {
      "auth.proxy" = {
        enabled = "true";
        header_name = "X-Pomerium-Claim-Email";
        header_property = "email";
        headers = "username:X-Pomerium-Claim-User";
        auto_sign_up = "true";
      };
      security.cookie_secure = "true";
    };
  };

  # Recreate the Grafana plugin directory on every start and symlink each
  # packaged plugin into it under its pname.
  systemd.services.grafana.preStart = let
    cfg = config.services.grafana;
    plugins = with depot.pkgs.grafana-plugins; [
      grafana-piechart-panel
      grafana-clock-panel
      grafana-worldmap-panel
      grafana-polystat-panel
    ];
    pluginLines = lib.concatMapStringsSep "\n" (pkg: ''
      ln -sf ${pkg} ${cfg.dataDir}/plugins/${pkg.pname}
    '') plugins;
  in lib.mkAfter ''
    rm -rf ${cfg.dataDir}/plugins
    mkdir ${cfg.dataDir}/plugins

    ${pluginLines}
  '';

  services.ipfs = {
    enable = true;
    dataDir = "/store/ipfs";

    extraConfig = {
      Experimental.FilestoreEnabled = true;
    };
  };

  systemd.services.alertmanager-discord = {
    enable = true;
    wantedBy = [ "multi-user.target" ];
    serviceConfig = {
      ExecStart = "${depot.pkgs.alertmanager-discord}/bin/alertmanager-discord -listen.address 127.0.0.1:9997";
      # NOTE(review): pkgs.writeText produces a Nix store file, so the
      # webhook URL presumably ends up readable by any local user --
      # confirm this is acceptable.
      EnvironmentFile = pkgs.writeText "discord-secret" ''
        DISCORD_WEBHOOK=${secrets.monitoring.alertmanager.discord.api_url}
      '';
      DynamicUser = true;
      MountAPIVFS = true;
      PrivateTmp = true;
      PrivateUsers = true;
      ProtectControlGroups = true;
      ProtectKernelModules = true;
      ProtectKernelTunables = true;
    };
  };

  systemd.services.sslrenew-raritan = {
    enable = true;
    # After= only orders units; Wants= is what actually pulls
    # network-online.target into the transaction, so both are needed.
    wants = [ "network-online.target" ];
    after = [ "network-online.target" ];
    serviceConfig = {
      Type = "oneshot";
      ExecStart = "${depot.ops.raritan.ssl-renew}/lego.sh";
      # NOTE(review): as with discord-secret above, these credentials are
      # written via pkgs.writeText (a Nix store file) -- confirm exposure is
      # acceptable.
      EnvironmentFile = pkgs.writeText "sslrenew-secret" ''
        CERTIFICATE_DOMAIN=kvm.lukegb.xyz
        LETSENCRYPT_EMAIL=letsencrypt@lukegb.com
        CF_DNS_API_TOKEN=${secrets.cloudflareCredentials.token}
        RARITAN_IP=192.168.1.50
        RARITAN_USERNAME=${secrets.raritan.sslrenew.username}
        RARITAN_PASSWORD=${secrets.raritan.sslrenew.password}
      '';
      DynamicUser = true;
      StateDirectory = "sslrenew-raritan";
      StateDirectoryMode = "0700";
      WorkingDirectory = "/var/lib/sslrenew-raritan";
    };
  };
  # Runs the renewal service above once a day.
  systemd.timers.sslrenew-raritan = {
    enable = true;
    wantedBy = [ "timers.target" ];
    timerConfig = {
      OnCalendar = "daily";
    };
  };

  system.stateVersion = "20.03";
}