2021-12-19 01:06:50 +00:00
|
|
|
{ config, pkgs, lib, utils, ... }:
|
2020-04-24 23:36:52 +00:00
|
|
|
|
|
|
|
let
|
|
|
|
# Keep a handle on the module-level `config`: inside the per-service
# submodule below the name `config` is rebound to the service's own
# configuration, so the outer one would otherwise be unreachable there.
toplevelConfig = config;

# Short aliases for the library pieces used throughout this module.
types = lib.types;
mkPathSafeName = utils.systemdUtils.lib.mkPathSafeName;
|
2020-04-24 23:36:52 +00:00
|
|
|
in {
|
|
|
|
options.systemd.services = lib.mkOption {
|
|
|
|
type = types.attrsOf (types.submodule ({ name, config, ... }: {
|
|
|
|
# Switch for the whole feature: the submodule's `config` section, the
# assertions and the generated drop-in package are all guarded by it.
options.confinement.enable = lib.mkOption {
  description = ''
    If set, all the required runtime store paths for this service are
    bind-mounted into a `tmpfs`-based
    {manpage}`chroot(2)`.
  '';
  default = false;
  type = types.bool;
};
|
|
|
|
|
|
|
|
# Selects what seeds the chroot closure: the whole unit (true) or only the
# values of the Exec* options (false, the default).
options.confinement.fullUnit = lib.mkOption {
  description = ''
    Whether to include the full closure of the systemd unit file into the
    chroot, instead of just the dependencies for the executables.

    ::: {.warning}
    While it may be tempting to just enable this option to
    make things work quickly, please be aware that this might add paths
    to the closure of the chroot that you didn't anticipate. It's better
    to use {option}`confinement.packages` to **explicitly** add additional store paths to the
    chroot.
    :::
  '';
  default = false;
  type = types.bool;
};
|
|
|
|
|
|
|
|
# Extra closure roots for the chroot. The submodule's `config` section adds
# its own entries to this list, so user-supplied values are merged with them.
options.confinement.packages = lib.mkOption {
  description = let
    # Render a `serviceConfig` option name as a manual option reference.
    scOptionRef = optName: "{option}`serviceConfig.${optName}`";

    # Comma-separated references for every Exec* option except ExecStart,
    # which is appended separately so the sentence reads "..., X and Y".
    hookRefs = lib.concatStringsSep ", " (map scOptionRef [
      "ExecReload"
      "ExecStartPost"
      "ExecStartPre"
      "ExecStop"
      "ExecStopPost"
    ]);
  in ''
    Additional packages or strings with context to add to the closure of
    the chroot. By default, this includes all the packages from the
    ${hookRefs} and ${scOptionRef "ExecStart"} options. If you want to have all the
    dependencies of this systemd unit, you can use
    {option}`confinement.fullUnit`.

    ::: {.note}
    The store paths listed in {option}`path` are
    **not** included in the closure as
    well as paths from other options except those listed
    above.
    :::
  '';
  default = [];
  type = types.listOf (types.either types.str types.package);
};
|
|
|
|
|
|
|
|
# Program exposed as /bin/sh inside the chroot; `null` disables it.
options.confinement.binSh = lib.mkOption {
  description = ''
    The program to make available as {file}`/bin/sh` inside
    the chroot. If this is set to `null`, no
    {file}`/bin/sh` is provided at all.

    This is useful for some applications, which for example use the
    {manpage}`system(3)` library function to execute commands.
  '';
  example = lib.literalExpression ''"''${pkgs.dash}/bin/dash"'';
  # The default is taken from `environment.binsh` of the module-level
  # configuration (not from the per-service submodule's `config`).
  default = toplevelConfig.environment.binsh;
  defaultText = lib.literalExpression "config.environment.binsh";
  type = types.nullOr types.path;
};
|
|
|
|
|
|
|
|
# Confinement flavour. The submodule's `config` section compares this value
# against "full-apivfs" to pick the defaults of the API-VFS related
# `serviceConfig` settings.
options.confinement.mode = lib.mkOption {
  description = ''
    The value `full-apivfs` (the default) sets up
    private {file}`/dev`, {file}`/proc`,
    {file}`/sys`, {file}`/tmp` and {file}`/var/tmp` file systems
    in a separate user name space.

    If this is set to `chroot-only`, only the file
    system name space is set up along with the call to
    {manpage}`chroot(2)`.

    In all cases, unless `serviceConfig.PrivateTmp=true` is set,
    both {file}`/tmp` and {file}`/var/tmp` paths are added to `InaccessiblePaths=`.
    This is to overcome options like `DynamicUser=true`
    implying `PrivateTmp=true` without letting it being turned off.
    Beware however that giving processes the `CAP_SYS_ADMIN` and `@mount` privileges
    can let them undo the effects of `InaccessiblePaths=`.

    ::: {.note}
    This doesn't cover network namespaces and is solely for
    file system level isolation.
    :::
  '';
  default = "full-apivfs";
  type = types.enum [ "full-apivfs" "chroot-only" ];
};
|
|
|
|
|
|
|
|
# Per-service settings that are applied only when confinement is enabled:
# the sandboxing-related `serviceConfig` values and the default closure
# roots contributed to `confinement.packages`.
config = let
  cfg = config.confinement;

  # Path fragments shared by RuntimeDirectory, RootDirectory and
  # InaccessiblePaths below.
  safeName = mkPathSafeName name;
  runtimeSubdir = "confinement/${safeName}";
  rootDir = "/run/${runtimeSubdir}";

  # true for "full-apivfs", false for "chroot-only"; wrapped in mkDefault
  # so a service can still override each individual setting.
  apivfsDefault = lib.mkDefault (cfg.mode == "full-apivfs");
in lib.mkIf cfg.enable {
  serviceConfig = {
    ReadOnlyPaths = [ "+/" ];
    RuntimeDirectory = [ runtimeSubdir ];
    RootDirectory = rootDir;
    InaccessiblePaths = [ "-+${rootDir}" ];
    PrivateMounts = lib.mkDefault true;
  }
  # https://github.com/NixOS/nixpkgs/issues/14645 is a future attempt to
  # change some of the options below to default to true.
  #
  # In "chroot-only" mode no /dev, /proc and /sys file systems are expected
  # to be available, but a default of e.g. PrivateDevices = true would
  # mount /dev within the chroot.
  #
  # Should those defaults ever flip, the attack surface would silently
  # grow, so every one of these options is set explicitly to true/false
  # depending on the mode.
  // lib.genAttrs [
    "MountAPIVFS"
    "PrivateDevices"
    "PrivateTmp"
    "PrivateUsers"
    "ProtectControlGroups"
    "ProtectKernelModules"
    "ProtectKernelTunables"
  ] (_: apivfsDefault);

  confinement.packages = let
    execOptNames = [
      "ExecReload"
      "ExecStart"
      "ExecStartPost"
      "ExecStartPre"
      "ExecStop"
      "ExecStopPost"
    ];

    # Values of whichever Exec* options are set; single values and
    # (nested) lists alike are collected into one flat list.
    execRoots = lib.concatMap
      (optName: lib.flatten (config.serviceConfig.${optName} or []))
      execOptNames;

    # With `fullUnit`, the sole root is the JSON rendering of the unit's
    # entry in the module-level `systemd.units`.
    unitRoots = [
      (builtins.toJSON toplevelConfig.systemd.units."${name}.service")
    ];

    # The /bin/sh program is always added unless it is disabled via null.
    shellRoots = if cfg.binSh == null then [] else [ cfg.binSh ];
  in (if cfg.fullUnit then unitRoots else execRoots) ++ shellRoots;
};
|
|
|
|
}));
|
|
|
|
};
|
|
|
|
|
|
|
|
# Sanity checks for confined services. Services without
# `confinement.enable` contribute nothing.
config.assertions = lib.concatLists (lib.mapAttrsToList (svcName: svc: let
  # Common prefix for messages about a conflicting `serviceConfig` option.
  conflictPrefix = optName: lib.concatStrings [
    "The 'serviceConfig' option '${optName}' for"
    " service '${svcName}' is enabled in conjunction with"
    " 'confinement.enable'"
  ];
in lib.optional svc.confinement.enable {
  # An unset RootDirectoryStartOnly counts as false and therefore passes.
  assertion = !(svc.serviceConfig.RootDirectoryStartOnly or false);
  message = lib.concatStrings [
    (conflictPrefix "RootDirectoryStartOnly")
    ", but right now systemd"
    " doesn't support restricting bind-mounts to 'ExecStart'."
    " Please either define a separate service or find a way to run"
    " commands other than ExecStart within the chroot."
  ];
}) config.systemd.services);
|
|
|
|
|
|
|
|
# For every service with `confinement.enable`, add a package that ships a
# systemd drop-in at
#   lib/systemd/system/<name>.service.d/confinement.conf
# The drop-in holds one `BindReadOnlyPaths=` line per store path in the
# closure of `confinement.packages`, an optional /bin/sh bind mount and,
# when PrivateTmp is off, `InaccessiblePaths=` entries for /tmp and /var/tmp.
config.systemd.packages = lib.concatLists (lib.mapAttrsToList (name: cfg: let
  # Text file listing every entry of `confinement.packages`, one per line.
  # It is the single root handed to `closureInfo` below.
  rootPaths = let
    contents = lib.concatStringsSep "\n" cfg.confinement.packages;
  in pkgs.writeText "${mkPathSafeName name}-string-contexts.txt" contents;

  chrootPaths = pkgs.runCommand "${mkPathSafeName name}-chroot-paths" {
    closureInfo = pkgs.closureInfo { inherit rootPaths; };
    serviceName = "${name}.service";
    # The helper file above is itself part of the closure; the script skips
    # it so that it does not get bind-mounted.
    excludedPath = rootPaths;
  } ''
    mkdir -p "$out/lib/systemd/system/$serviceName.d"
    serviceFile="$out/lib/systemd/system/$serviceName.d/confinement.conf"

    echo '[Service]' > "$serviceFile"

    # /bin/sh is special here, because the option value could contain a
    # symlink and we need to properly resolve it.
    ${lib.optionalString (cfg.confinement.binSh != null) ''
      binsh=${lib.escapeShellArg cfg.confinement.binSh}
      realprog="$(readlink -e "$binsh")"
      echo "BindReadOnlyPaths=$realprog:/bin/sh" >> "$serviceFile"
    ''}

    # If DynamicUser= is enabled, PrivateTmp=true is implied (and cannot be
    # turned off), so make /tmp and /var/tmp inaccessible unless
    # PrivateTmp=true is explicitly set.
    ${lib.optionalString (!cfg.serviceConfig.PrivateTmp) ''
      echo "InaccessiblePaths=-+/tmp" >> "$serviceFile"
      echo "InaccessiblePaths=-+/var/tmp" >> "$serviceFile"
    ''}

    # `IFS= read -r` takes each line verbatim: no backslash processing and
    # no trimming of leading/trailing whitespace.
    while IFS= read -r storePath; do
      if [ -L "$storePath" ]; then
        # Currently, systemd can't cope with symlinks in Bind(ReadOnly)Paths,
        # so let's just bind-mount the target to that location.
        echo "BindReadOnlyPaths=$(readlink -e "$storePath"):$storePath"
      elif [ "$storePath" != "$excludedPath" ]; then
        echo "BindReadOnlyPaths=$storePath"
      fi
    done < "$closureInfo/store-paths" >> "$serviceFile"
  '';
in lib.optional cfg.confinement.enable chrootPaths) config.systemd.services);
|
|
|
|
}
|