# vllm — high-throughput LLM inference engine, packaged as a nixpkgs
# buildPythonPackage derivation (CUDA or ROCm backend).
{
  # nixpkgs library and build machinery
  lib,
  stdenv,
  python,
  buildPythonPackage,
  pythonRelaxDepsHook,
  fetchFromGitHub,

  # native build tooling
  which,
  ninja,
  cmake,

  # Python build-system requirements
  packaging,
  setuptools,
  wheel,

  # Python runtime dependencies
  torch,
  outlines,
  psutil,
  ray,
  pandas,
  pyarrow,
  sentencepiece,
  numpy,
  transformers,
  xformers,
  fastapi,
  uvicorn,
  pydantic,
  aioprometheus,
  pynvml,
  openai,
  pyzmq,
  tiktoken,
  torchvision,
  py-cpuinfo,
  lm-format-enforcer,
  prometheus-fastapi-instrumentator,
  cupy,
  writeShellScript,

  config,

  # GPU backend selection
  cudaSupport ? config.cudaSupport,
  cudaPackages ? { },

  # Has to be either rocm or cuda, default to the free one
  rocmSupport ? !config.cudaSupport,
  rocmPackages ? { },
  gpuTargets ? [ ],
}@args:
let
  # NVIDIA CUTLASS, pinned and pre-fetched so CMake's FetchContent does not
  # attempt a network download inside the build sandbox (wired up via the
  # FETCHCONTENT_SOURCE_DIR_CUTLASS cmake flag in the derivation below).
  cutlass = fetchFromGitHub {
    owner = "NVIDIA";
    repo = "cutlass";
    rev = "refs/tags/v3.5.0";
    # SRI hash; `hash` (not the legacy `sha256` attribute) for consistency
    # with the `src` fetch of the main derivation.
    hash = "sha256-D/s7eYsa5l/mfx73tE4mnFcTQdYqGmXa9d9TCryw4e4=";
  };
in
buildPythonPackage rec {
  pname = "vllm";
  version = "0.6.2";
  pyproject = true;

  # Build with the CUDA-compatible stdenv when targeting CUDA; otherwise
  # fall back to the stdenv that was passed in as an argument.
  stdenv = if cudaSupport then cudaPackages.backendStdenv else args.stdenv;

  src = fetchFromGitHub {
    owner = "vllm-project";
    # Spelled out literally (rather than `repo = pname;`) per nixpkgs
    # convention, so source-location tooling can parse it.
    repo = "vllm";
    rev = "refs/tags/v${version}";
    hash = "sha256-zUkqAPPhDRdN9rDQ2biCl1B+trV0xIHXub++v9zsQGo=";
  };

  patches = [
    ./0001-setup.py-don-t-ask-for-hipcc-version.patch
    ./0002-setup.py-nix-support-respect-cmakeFlags.patch
  ];

  # Ignore the python version check because it hard-codes minor versions and
  # lags behind `ray`'s python interpreter support
  postPatch = ''
    substituteInPlace CMakeLists.txt \
      --replace-fail \
        'set(PYTHON_SUPPORTED_VERSIONS' \
        'set(PYTHON_SUPPORTED_VERSIONS "${lib.versions.majorMinor python.version}"'
  '';

  nativeBuildInputs = [
    cmake
    ninja
    # NOTE(review): newer nixpkgs adds this hook automatically when
    # `pythonRelaxDeps` is set — confirm against the pinned nixpkgs before
    # removing the explicit entry.
    pythonRelaxDepsHook
    which
  ] ++ lib.optionals rocmSupport [ rocmPackages.hipcc ];

  build-system = [
    packaging
    setuptools
    wheel
  ];

  buildInputs =
    (lib.optionals cudaSupport (
      with cudaPackages;
      [
        cuda_cudart # cuda_runtime.h, -lcudart
        cuda_cccl
        libcusparse # cusparse.h
        libcusolver # cusolverDn.h
        cuda_nvcc
        cuda_nvtx
        libcublas
      ]
    ))
    ++ (lib.optionals rocmSupport (
      with rocmPackages;
      [
        clr
        rocthrust
        rocprim
        hipsparse
        hipblas
      ]
    ));

  dependencies =
    [
      aioprometheus
      fastapi
      lm-format-enforcer
      numpy
      openai
      outlines
      pandas
      prometheus-fastapi-instrumentator
      psutil
      py-cpuinfo
      pyarrow
      pydantic
      pyzmq
      ray
      sentencepiece
      tiktoken
      torch
      torchvision
      transformers
      uvicorn
      xformers
    ]
    ++ uvicorn.optional-dependencies.standard
    ++ aioprometheus.optional-dependencies.starlette
    ++ lib.optionals cudaSupport [
      cupy
      pynvml
    ];

  # vllm's setup.py drives CMake itself; running the cmake configure phase
  # here would conflict, so only pass flags through.
  dontUseCmakeConfigure = true;
  cmakeFlags = [ (lib.cmakeFeature "FETCHCONTENT_SOURCE_DIR_CUTLASS" "${lib.getDev cutlass}") ];

  env =
    lib.optionalAttrs cudaSupport { CUDA_HOME = "${lib.getDev cudaPackages.cuda_nvcc}"; }
    // lib.optionalAttrs rocmSupport {
      # Otherwise it tries to enumerate host supported ROCM gfx archs, and that is not possible due to sandboxing.
      PYTORCH_ROCM_ARCH = lib.strings.concatStringsSep ";" rocmPackages.clr.gpuTargets;
      ROCM_HOME = "${rocmPackages.clr}";
    };

  pythonRelaxDeps = true;

  pythonImportsCheck = [ "vllm" ];

  # Explicit `lib.` references instead of `meta = with lib;` — the broad
  # `with` in meta is discouraged by nixpkgs conventions; evaluation result
  # is identical.
  meta = {
    description = "High-throughput and memory-efficient inference and serving engine for LLMs";
    changelog = "https://github.com/vllm-project/vllm/releases/tag/v${version}";
    homepage = "https://github.com/vllm-project/vllm";
    license = lib.licenses.asl20;
    maintainers = with lib.maintainers; [
      happysalada
      lach
    ];
    # RuntimeError: Unknown runtime environment
    broken = true;
    # broken = !cudaSupport && !rocmSupport;
  };
}