depot/third_party/nixpkgs/pkgs/development/python-modules/tokenizers/default.nix
Default email 7e47f3658e Project import generated by Copybara.
GitOrigin-RevId: 1925c603f17fc89f4c8f6bf6f631a802ad85d784
2024-09-26 11:04:55 +00:00

156 lines
4.1 KiB
Nix

{
  # General helpers; linkFarm and fetchurl assemble the offline test fixtures
  lib,
  stdenv,
  linkFarm,
  fetchurl,
  buildPythonPackage,
  fetchFromGitHub,
  # Its executable name is forwarded to maturin via maturinBuildFlags
  python,

  # nativeBuildInputs
  pkg-config,
  setuptools-rust,
  rustPlatform,
  cargo,
  rustc,

  # buildInputs (libiconv and Security are only added when the host platform is Darwin)
  openssl,
  libiconv,
  Security,

  # dependencies
  huggingface-hub,
  numpy,

  # tests
  datasets,
  pytestCheckHook,
  requests,
  tiktoken,
}:
let
  # Fixture files the test suite would otherwise try to download.
  # The attribute names are the file names the tests look for and the URLs are
  # the ones they would fetch; both mirror upstream's
  # https://github.com/huggingface/tokenizers/blob/main/bindings/python/tests/utils.py
  #
  # Every entry is pinned with an SRI string passed through `hash`.
  test-data = linkFarm "tokenizers-test-data" {
    "roberta-base-vocab.json" = fetchurl {
      url = "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-vocab.json";
      hash = "sha256-nn9jwtFdZmtS4h0lDS5RO4fJtxPPpph6gu2J5eblBlU=";
    };
    "roberta-base-merges.txt" = fetchurl {
      url = "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-merges.txt";
      hash = "sha256-HOFmR3PFDz4MyIQmGak+3EYkUltyixiKngvjO3cmrcU=";
    };
    "albert-base-v1-tokenizer.json" = fetchurl {
      url = "https://s3.amazonaws.com/models.huggingface.co/bert/albert-base-v1-tokenizer.json";
      hash = "sha256-biqj1cpMaEG8NqUCgXnLTWPBKZMfoY/OOP2zjOxNKsM=";
    };
    "bert-base-uncased-vocab.txt" = fetchurl {
      url = "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt";
      hash = "sha256-B+ztN1zsFE0nyQAkHz4zlHjeyVj5L928VR8pXJkgOKM=";
    };
    "big.txt" = fetchurl {
      url = "https://norvig.com/big.txt";
      hash = "sha256-+gZsfUDw8gGsQUTmUqpiQw5YprOAXscGUPZ42lgE6Hs=";
    };
    "bert-wiki.json" = fetchurl {
      url = "https://s3.amazonaws.com/models.huggingface.co/bert/anthony/doc-pipeline/tokenizer.json";
      hash = "sha256-i533xC8J5CDMNxBjo+p6avIM8UOcui8RmGAmK0GmfBc=";
    };
    "tokenizer-wiki.json" = fetchurl {
      url = "https://s3.amazonaws.com/models.huggingface.co/bert/anthony/doc-quicktour/tokenizer.json";
      hash = "sha256-ipY9d5DR5nxoO6kj7rItueZ9AO5wq9+Nzr6GuEIfIBI=";
    };
    "openai-gpt-vocab.json" = fetchurl {
      url = "https://s3.amazonaws.com/models.huggingface.co/bert/openai-gpt-vocab.json";
      hash = "sha256-/fSbGefeI2hSCR2gm4Sno81eew55kWN2z0X2uBJ7gHg=";
    };
    "openai-gpt-merges.txt" = fetchurl {
      url = "https://s3.amazonaws.com/models.huggingface.co/bert/openai-gpt-merges.txt";
      hash = "sha256-Dqm1GuaVBzzYceA1j3AWMR1nGn/zlj42fVI2Ui8pRyU=";
    };
  };
in
# Python bindings for huggingface/tokenizers, built from the Rust sources with maturin.
buildPythonPackage rec {
  pname = "tokenizers";
  version = "0.20.0";
  pyproject = true;

  src = fetchFromGitHub {
    owner = "huggingface";
    repo = "tokenizers";
    rev = "refs/tags/v${version}";
    hash = "sha256-uuSHsdyx77YQjf1aiz7EJ/X+6RaOgfmjGqHSlMaCWDI=";
  };

  # Build from the bindings/python subdirectory of the fetched source tree
  sourceRoot = "${src.name}/bindings/python";

  # Cargo dependencies are resolved from the lock file stored beside this expression
  cargoDeps = rustPlatform.importCargoLock { lockFile = ./Cargo.lock; };

  # Tell maturin which interpreter to build for
  maturinBuildFlags = [ "--interpreter ${python.executable}" ];

  nativeBuildInputs = [
    pkg-config
    setuptools-rust
    rustPlatform.cargoSetupHook
    rustPlatform.maturinBuildHook
    cargo
    rustc
  ];

  buildInputs =
    [ openssl ]
    # Extra libraries are only added when the host platform is Darwin
    ++ lib.optionals stdenv.hostPlatform.isDarwin [
      libiconv
      Security
    ];

  dependencies = [
    huggingface-hub
    numpy
  ];

  nativeCheckInputs = [
    datasets
    pytestCheckHook
    requests
    tiktoken
  ];

  # Symlink every pre-fetched fixture from test-data into tests/data
  postUnpack = ''
    # Add data files for tests, otherwise tests attempt network access
    mkdir $sourceRoot/tests/data
    ln -s ${test-data}/* $sourceRoot/tests/data/
  '';

  # Point HOME at a fresh temporary directory before the tests run
  preCheck = ''
    export HOME=$(mktemp -d);
  '';

  pythonImportsCheck = [ "tokenizers" ];

  disabledTests = [
    # Download data through the datasets module
    "test_encode_special_tokens"
    "test_splitting"
    "TestTrainFromIterators"
    # These tests require more data
    "test_from_pretrained"
    "test_from_pretrained_revision"
    "test_continuing_prefix_trainer_mistmatch"
  ];

  disabledTestPaths = [
    # Fails with: fixture 'model' not found
    "benches/test_tiktoken.py"
  ];

  meta = {
    description = "Fast State-of-the-Art Tokenizers optimized for Research and Production";
    homepage = "https://github.com/huggingface/tokenizers";
    license = lib.licenses.asl20;
    maintainers = [ lib.maintainers.GaetanLepage ];
    platforms = lib.platforms.unix;
  };
}