# File-viewer header (not part of the Nix expression): Nix, 197 lines, 3.7 KiB
# Package expression for unstructured-api.
#
# The upstream source tree is copied verbatim into $out/lib and exposed through
# a single wrapper, $out/bin/unstructured-api, which starts uvicorn from a
# dedicated Python environment with the app import string
# "prepline_general.api.app:app".
{
  lib,
  stdenvNoCC,
  fetchFromGitHub,
  python3,
  makeWrapper,
  nix-update-script,
  symlinkJoin,
  nltk-data,
}:
let
  # Python interpreter bundled with every library listed below, plus the
  # `grpc` extra of google-api-core and the `all-docs` extra of unstructured.
  # The wrapper runs uvicorn out of this environment.
  pythonEnv = python3.withPackages (
    packages:
    with packages;
    [
      aiofiles
      annotated-types
      antlr4-python3-runtime
      anyio
      backoff
      beautifulsoup4
      cachetools
      certifi
      cffi
      chardet
      charset-normalizer
      click
      coloredlogs
      contourpy
      cryptography
      cycler
      dataclasses-json
      deprecated
      effdet
      emoji
      et-xmlfile
      eval-type-backport
      fastapi
      filelock
      filetype
      flatbuffers
      fonttools
      fsspec
      google-api-core
      google-auth
      google-cloud-vision
      googleapis-common-protos
      grpcio
      grpcio-status
      h11
      html5lib
      httpcore
      httpx
      huggingface-hub
      humanfriendly
      idna
      iopath
      jinja2
      joblib
      jsonpath
      kiwisolver
      langdetect
      layoutparser
      lxml
      markdown
      markupsafe
      marshmallow
      matplotlib
      mpmath
      mypy-extensions
      nest-asyncio
      networkx
      nltk
      numpy
      olefile
      omegaconf
      onnx
      onnxruntime
      opencv-python
      openpyxl
      packaging
      pandas
      pdf2image
      pdfminer-six
      pdfplumber
      # NOTE(review): the commented-out entries in this list are presumably
      # not available in the Python package set - confirm before enabling.
      # pi-heif
      pikepdf
      pillow
      portalocker
      proto-plus
      protobuf
      psutil
      pyasn1
      pyasn1-modules
      pycocotools
      pycparser
      pycryptodome
      pydantic
      pydantic-core
      pypandoc
      pyparsing
      pypdf
      pypdfium2
      python-dateutil
      python-docx
      # python-iso639
      python-magic
      python-multipart
      # python-oxmsg
      python-pptx
      pytz
      pyyaml
      rapidfuzz
      ratelimit
      regex
      requests
      requests-toolbelt
      rsa
      safetensors
      scipy
      six
      sniffio
      soupsieve
      starlette
      sympy
      timm
      tokenizers
      torch
      torchvision
      tqdm
      transformers
      typing-extensions
      typing-inspect
      tzdata
      unstructured
      # unstructured-client
      unstructured-inference
      # unstructured-pytesseract
      urllib3
      uvicorn
      webencodings
      wrapt
      xlrd
      xlsxwriter
    ]
    ++ google-api-core.optional-dependencies.grpc
    ++ unstructured.optional-dependencies.all-docs
  );

  # Used for the derivation version, the fetched git revision and the
  # changelog URL.
  version = "0.0.82";

  # Single directory merging the two NLTK data sets; the wrapper exports it
  # as NLTK_DATA.
  unstructured_api_nltk_data = symlinkJoin {
    name = "unstructured_api_nltk_data";

    paths = [
      nltk-data.punkt
      nltk-data.averaged_perceptron_tagger
    ];
  };
in
stdenvNoCC.mkDerivation {
  pname = "unstructured-api";
  inherit version;

  src = fetchFromGitHub {
    owner = "Unstructured-IO";
    repo = "unstructured-api";
    rev = version;
    hash = "sha256-mvcARpewqC25x3ZdpM8QB7SjbqGoBL/rtxi90KdKdO8=";
  };

  nativeBuildInputs = [ makeWrapper ];

  # Copy the whole source tree to $out/lib, then generate the launcher:
  #   - NLTK_DATA points at the joined NLTK data directory,
  #   - $out/lib is prepended to PYTHONPATH,
  #   - the app import string is passed to uvicorn as a fixed argument.
  installPhase = ''
    runHook preInstall

    mkdir -p $out $out/bin $out/lib
    cp -r . $out/lib

    makeWrapper ${pythonEnv}/bin/uvicorn $out/bin/unstructured-api \
      --set NLTK_DATA ${unstructured_api_nltk_data} \
      --prefix PYTHONPATH : $out/lib \
      --add-flags "prepline_general.api.app:app"

    runHook postInstall
  '';

  passthru = {
    updateScript = nix-update-script { };
  };

  meta = {
    description = "open-source toolkit designed to make it easy to prepare unstructured data like PDFs, HTML and Word Documents for downstream data science tasks";
    homepage = "https://github.com/Unstructured-IO/unstructured-api";
    changelog = "https://github.com/Unstructured-IO/unstructured-api/releases/tag/${version}";
    license = lib.licenses.asl20;
    maintainers = with lib.maintainers; [ happysalada ];
    # The only executable installed by installPhase.
    mainProgram = "unstructured-api";
  };
}