{ lib
, config
, buildPythonPackage
, fetchFromGitHub
, fetchpatch
, addOpenGLRunpath
, setuptools
, pytestCheckHook
, pythonRelaxDepsHook
, cmake
, ninja
, pybind11
, gtest
, zlib
, ncurses
, libxml2
, lit
, llvm
, filelock
, torchWithRocm
, python

, runCommand

, cudaPackages
, cudaSupport ? config.cudaSupport
}:

let
  # Path to the `ptxas` binary taken from cudaPackages.cuda_nvcc.
  # It is symlinked into triton/third_party/cuda/bin by preConfigure and postFixup.
  # Make sure cudaPackages is the right version each update (See python/setup.py)
  ptxas = "${cudaPackages.cuda_nvcc}/bin/ptxas";
in
buildPythonPackage rec {
  pname = "triton";
  version = "2.1.0";
  # Use buildPythonPackage's pyproject build format
  pyproject = true;

  # Upstream sources from GitHub, pinned to the tag "v" + version
  src = fetchFromGitHub {
    owner = "openai";
    repo = pname;
    rev = "v${version}";
    hash = "sha256-8UTUwLH+SriiJnpejdrzz9qIquP2zBp1/uwLdHmv0XQ=";
  };

  patches =
    let
      # Upstream commit fixing an overflow error; applied unconditionally
      overflowFix = fetchpatch {
        url = "https://github.com/openai/triton/commit/52c146f66b79b6079bcd28c55312fc6ea1852519.patch";
        hash = "sha256-098/TCQrzvrBAbQiaVGCMaF3o5Yc3yWDxzwSkzIuAtY=";
      };

      # Local patches that are only applied when building without CUDA
      withoutCuda = [
        ./0000-dont-download-ptxas.patch
        # openai-triton wants to get ptxas version even if ptxas is not
        # used, resulting in ptxas not found error.
        ./0001-ptxas-disable-version-key-for-non-cuda-targets.patch
      ];
    in
    [ overflowFix ] ++ lib.optionals (!cudaSupport) withoutCuda;

  # Tools that run on the build platform while the package is being built
  nativeBuildInputs = [
    setuptools
    # Hook that acts on pythonRemoveDeps (see the list further down)
    pythonRelaxDepsHook
    # pytestCheckHook # Requires torch (circular dependency) and probably needs GPUs
    cmake
    ninja

    # Note for future:
    # These *probably* should go in depsTargetTarget
    # ...but we cannot test cross right now anyway
    # because we only support cudaPackages on x86_64-linux atm
    lit
    llvm
  ];

  # Libraries and headers consumed by the CMake build that setup.py drives
  buildInputs = [
    gtest
    (lib.getDev libxml2) # the `dev` output of libxml2
    ncurses
    pybind11
    zlib
  ];

  # Python dependencies propagated to every consumer of this package
  propagatedBuildInputs = [
    filelock
    # openai-triton uses setuptools at runtime:
    # https://github.com/NixOS/nixpkgs/pull/286763/#discussion_r1480392652
    setuptools
  ];

  # Source tweaks applied after `patches`:
  #  - make setup.py take its CMake arguments from our `cmakeFlags`
  #  - drop the bundled FileCheck and the googletest download
  #  - (CUDA only) extend the flags triton passes to the C compiler so that
  #    `-lcuda` can be resolved from the driver directory or the cudart stubs
  postPatch = let
    # Render a list of flags as they appear inside a Python list literal,
    # i.e. `"a", "b", "c"`.
    # Bash was getting weird without linting, hence the Nix-side quoting.
    quoteFlags = flags: lib.concatMapStringsSep ", " (flag: ''"${flag}"'') flags;

    # Upstream contains [cc, ..., "-lcuda", ...]
    upstreamLinkFlags = quoteFlags [ "-lcuda" ];

    # ...and we replace it with
    # [..., "-lcuda", "-L/run/opengl-driver/lib", "-L$stubs", ...].
    # `driverLink` is only the driver prefix (/run/opengl-driver); the
    # libraries live in its lib/ subdirectory, so `/lib` has to be appended
    # for the -L flag to point at a directory that can contain libcuda.
    nixLinkFlags = quoteFlags [
      "-lcuda"
      "-L${addOpenGLRunpath.driverLink}/lib"
      "-L${cudaPackages.cuda_cudart}/lib/stubs/"
    ];
  in ''
    # Use our `cmakeFlags` instead and avoid downloading dependencies
    substituteInPlace python/setup.py \
      --replace "= get_thirdparty_packages(triton_cache_path)" "= os.environ[\"cmakeFlags\"].split()"

    # Already defined in llvm, when built with -DLLVM_INSTALL_UTILS
    substituteInPlace bin/CMakeLists.txt \
      --replace "add_subdirectory(FileCheck)" ""

    # Don't fetch googletest
    substituteInPlace unittest/CMakeLists.txt \
      --replace "include (\''${CMAKE_CURRENT_SOURCE_DIR}/googletest.cmake)" "" \
      --replace "include(GoogleTest)" "find_package(GTest REQUIRED)"
  '' + lib.optionalString cudaSupport ''
    # Use our linker flags
    substituteInPlace python/triton/common/build.py \
      --replace '${upstreamLinkFlags}' '${nixLinkFlags}'
  '';

  # Prepare the environment for setup.py and switch into ./python.
  # With cudaSupport, CC/CXX are additionally pointed at the compiler of
  # cudaPackages.backendStdenv to avoid GLIBCXX mismatch with other
  # cuda-enabled python packages, and our ptxas is linked into the source tree.
  preConfigure = ''
    # Ensure that the build process uses the requested number of cores
    export MAX_JOBS="$NIX_BUILD_CORES"

    # Upstream's setup.py tries to write cache somewhere in ~/
    export HOME=$(mktemp -d)

    # Upstream's github actions patch setup.cfg to write base-dir. May be redundant
    echo "
    [build_ext]
    base-dir=$PWD" >> python/setup.cfg

    # The rest (including buildPhase) is relative to ./python/
    cd python
  '' + lib.optionalString cudaSupport ''
    export CC=${cudaPackages.backendStdenv.cc}/bin/cc;
    export CXX=${cudaPackages.backendStdenv.cc}/bin/c++;

    # Work around download_and_copy_ptxas()
    mkdir -p $PWD/triton/third_party/cuda/bin
    ln -s ${ptxas} $PWD/triton/third_party/cuda/bin
  '';

  # CMake is run by setup.py instead, so the configure step that the cmake
  # entry in nativeBuildInputs would otherwise contribute is turned off
  dontUseCmakeConfigure = true;

  # Setuptools (?) strips runpath and +x flags from the installed ptxas.
  # Rather than repairing the copy, swap it for a symlink to the real binary.
  postFixup =
    let
      # Location of ptxas inside the installed package
      installedPtxas = "$out/${python.sitePackages}/triton/third_party/cuda/bin/ptxas";
    in
    lib.optionalString cudaSupport ''
      rm -f ${installedPtxas}
      ln -s ${ptxas} ${installedPtxas}
    '';

  # ctest ships with cmake and is executed on the build platform during the
  # check phase, so it is a native check input; plain `checkInputs` are not
  # added to PATH when strictDeps is in effect.
  nativeCheckInputs = [ cmake ]; # ctest
  # Do not run the setuptools-based check; the tests are driven from preCheck
  dontUseSetuptoolsCheck = true;

  # Run the C++ unit tests with ctest, then move to the Python unit tests.
  preCheck = ''
    # build/temp* refers to build_ext.build_temp (looked up in the build logs).
    # Locate it through $NIX_BUILD_TOP/$sourceRoot instead of a hard-coded
    # /build/source, which only exists when the build directory is /build.
    (cd "$NIX_BUILD_TOP/$sourceRoot"/python/build/temp* && ctest)

    # For pytestCheckHook
    cd test/unit
  '';

  # Circular dependency on torch
  # pythonImportsCheck = [
  #   "triton"
  #   "triton.language"
  # ];

  # Ultimately, torch is our test suite:
  passthru.tests =
    let
      # Interpreter environment that has this package installed
      pythonWithTriton = python.withPackages (ps: [ ps.openai-triton ]);
    in
    {
      inherit torchWithRocm;

      # Stand-in for pythonImportsCheck, kept as a separate test so that it
      # stays usable if the circular dependency on torch ever comes back and
      # pythonImportsCheck has to remain commented out.
      import-triton = runCommand "import-triton"
        { nativeBuildInputs = [ pythonWithTriton ]; }
        ''
          python << \EOF
          import triton
          import triton.language
          EOF
          touch "$out"
        '';
    };

  # Requirements dropped from the package's declared dependencies
  # (acted on by pythonRelaxDepsHook in nativeBuildInputs)
  pythonRemoveDeps = [
    # Circular dependency, cf. https://github.com/openai/triton/issues/1374
    "torch"

    # CLI tools without dist-info
    "cmake"
    "lit"
  ];

  # Package metadata; names are qualified explicitly instead of opening `lib`
  meta = {
    description = "Language and compiler for writing highly efficient custom Deep-Learning primitives";
    homepage = "https://github.com/openai/triton";
    platforms = lib.platforms.linux;
    license = lib.licenses.mit;
    maintainers = [
      lib.maintainers.SomeoneSerge
      lib.maintainers.Madouura
    ];
  };
}