depot/third_party/nixpkgs/pkgs/development/python-modules/pyspark/default.nix
Default email 472aeafc57 Project import generated by Copybara.
GitOrigin-RevId: c31898adf5a8ed202ce5bea9f347b1c6871f32d1
2024-10-04 18:56:33 +02:00

70 lines
1.7 KiB
Nix

# Package expression for PySpark ("Python bindings for Apache Spark").
#
# Arguments:
#   lib, buildPythonPackage, fetchPypi, pythonOlder - packaging helpers
#   py4j                                            - always-propagated dependency
#   numpy, pandas, pyarrow                          - only referenced by the
#                                                     optional-dependencies groups
{
  lib,
  buildPythonPackage,
  fetchPypi,
  numpy,
  pandas,
  py4j,
  pyarrow,
  pythonOlder,
}:

buildPythonPackage rec {
  pname = "pyspark";
  version = "3.5.1";
  format = "setuptools";

  disabled = pythonOlder "3.7";

  src = fetchPypi {
    inherit pname version;
    hash = "sha256-3WVp5Uc2Xq3E+Ie/V/FT5NWCpoxLSQ3kddVbmYFmSRA=";
  };

  # pypandoc is broken with pandoc2, so we just lose docs.
  postPatch = ''
    # Drop the 'pypandoc' entry from setup.py.
    sed -i "s/'pypandoc'//" setup.py

    # Turn the exact py4j pin into a lower bound.
    # --replace-fail aborts the build if the pattern is no longer present,
    # instead of silently leaving setup.py unpatched.
    substituteInPlace setup.py \
      --replace-fail py4j== 'py4j>='
  '';

  postFixup = ''
    # The script referenced by $FIND_SPARK_HOME_PYTHON_SCRIPT has been wrapped
    # as a shell script, so execute it directly rather than passing it to
    # $PYSPARK_DRIVER_PYTHON.
    substituteInPlace "$out/bin/find-spark-home" \
      --replace-fail 'export SPARK_HOME=$($PYSPARK_DRIVER_PYTHON "$FIND_SPARK_HOME_PYTHON_SCRIPT")' \
                     'export SPARK_HOME=$("$FIND_SPARK_HOME_PYTHON_SCRIPT")'

    # Patch PYTHONPATH in the pyspark launcher so that it also contains the
    # parent directory of SPARK_HOME, in front of the existing entries.
    substituteInPlace "$out/bin/pyspark" \
      --replace-fail 'export PYTHONPATH="''${SPARK_HOME}/python/:$PYTHONPATH"' \
                     'export PYTHONPATH="''${SPARK_HOME}/..:''${SPARK_HOME}/python/:$PYTHONPATH"'
  '';

  propagatedBuildInputs = [ py4j ];

  # Extra dependency groups; not part of the default closure.
  optional-dependencies = {
    ml = [ numpy ];
    mllib = [ numpy ];
    sql = [
      numpy
      pandas
      pyarrow
    ];
  };

  # Tests assume running spark instance
  doCheck = false;

  # With the test suite disabled, at least check that the module imports.
  pythonImportsCheck = [ "pyspark" ];

  meta = with lib; {
    description = "Python bindings for Apache Spark";
    homepage = "https://github.com/apache/spark/tree/master/python";
    sourceProvenance = with sourceTypes; [
      fromSource
      binaryBytecode
    ];
    license = licenses.asl20;
    maintainers = with maintainers; [ shlevy ];
  };
}