depot/third_party/nixpkgs/pkgs/development/python-modules/unstructured/default.nix
Default email d5f4a57cbf Project import generated by Copybara.
GitOrigin-RevId: ce5e4a6ef2e59d89a971bc434ca8ca222b9c7f5e
2023-08-10 09:59:29 +02:00

143 lines
2.6 KiB
Nix

{ lib
, buildPythonPackage
, fetchFromGitHub
# propagated build inputs
, chardet
, filetype
, lxml
, msg-parser
, nltk
, openpyxl
, pandas
, pdf2image
, pdfminer-six
, pillow
, pypandoc
, python-docx
, python-pptx
, python-magic
, markdown
, requests
, tabulate
, xlrd
# optional-dependencies
, langdetect
, sacremoses
, sentencepiece
, torch
, transformers
, unstructured-inference
, s3fs
, fsspec
, adlfs
# , discord-py
, pygithub
, python-gitlab
, praw
, slack-sdk
, wikipedia
, google-api-python-client
# , gcsfs
, elasticsearch8
, jq
# , dropboxdrivefs
, atlassian-python-api
# test dependencies
, pytestCheckHook
, black
, coverage
, click
, freezegun
# , label-studio-sdk
, mypy
, pytest-cov
, pytest-mock
, vcrpy
, grpcio
}:
let
version = "0.9.1";
optional-dependencies = {
huggingflace = [
langdetect
sacremoses
sentencepiece
torch
transformers
];
local-inference = [ unstructured-inference ];
s3 = [ s3fs fsspec ];
azure = [ adlfs fsspec ];
discord = [ ]; # discord-py
github = [ pygithub ];
gitlab = [ python-gitlab ];
reddit = [ praw ];
slack = [ slack-sdk ];
wikipedia = [ wikipedia ];
google-drive = [ google-api-python-client ];
gcs = []; # gcsfs fsspec
elasticsearch = [ elasticsearch8 jq ];
dropbox = []; # dropboxdrivefs fsspec
confluence = [ atlassian-python-api ];
};
in
buildPythonPackage {
pname = "unstructured";
inherit version;
format = "setuptools";
src = fetchFromGitHub {
owner = "Unstructured-IO";
repo = "unstructured";
rev = "refs/tags/${version}";
hash = "sha256-9O/rZ07vZC0XN5XgevFvWuG8gwyTM+gfn+OqgaIHld8=";
};
propagatedBuildInputs = [
chardet
filetype
lxml
msg-parser
nltk
openpyxl
pandas
pdf2image
pdfminer-six
pillow
pypandoc
python-docx
python-pptx
python-magic
markdown
requests
tabulate
xlrd
];
pythonImportsCheck = [ "unstructured" ];
# test try to download punkt from nltk
# figure out how to make it available to enable the tests
doCheck = false;
nativeCheckInputs = [
pytestCheckHook
black
coverage
click
freezegun
mypy
pytest-cov
pytest-mock
vcrpy
grpcio
];
meta = with lib; {
description = "Open source libraries and APIs to build custom preprocessing pipelines for labeling, training, or production machine learning pipelines";
homepage = "https://github.com/Unstructured-IO/unstructured";
changelog = "https://github.com/Unstructured-IO/unstructured/blob/${version}/CHANGELOG.md";
license = licenses.asl20;
maintainers = with maintainers; [ happysalada ];
};
}