2020-07-18 16:06:22 +00:00
|
|
|
{ stdenv
|
|
|
|
, rustPlatform
|
|
|
|
, fetchFromGitHub
|
|
|
|
, fetchurl
|
|
|
|
, maturin
|
|
|
|
, pipInstallHook
|
|
|
|
, pytest
|
|
|
|
, python
|
|
|
|
, requests
|
|
|
|
}:
|
|
|
|
|
|
|
|
let
|
|
|
|
# Vocabulary and merges files needed by the test suite. They are fetched
# as fixed-output derivations and linked into tests/data in postUnpack.
fetchTestData = url: sha256: fetchurl { inherit url sha256; };

robertaVocab = fetchTestData
  "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-vocab.json"
  "0m86wpkfb2gdh9x9i9ng2fvwk1rva4p0s98xw996nrjxs7166zwy";

robertaMerges = fetchTestData
  "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-merges.txt"
  "1idd4rvkpqqbks51i2vjbd928inw7slij9l4r063w3y5fd3ndq8w";

bertVocab = fetchTestData
  "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt"
  "18rq42cmqa8zanydsbzrb34xwy4l6cz1y900r4kls57cbhvyvv07";

openaiVocab = fetchTestData
  "https://s3.amazonaws.com/models.huggingface.co/bert/openai-gpt-vocab.json"
  "0y40gc9bixj5rxv674br1rxmxkd3ly29p80x1596h8yywwcrpx7x";

openaiMerges = fetchTestData
  "https://s3.amazonaws.com/models.huggingface.co/bert/openai-gpt-merges.txt"
  "09a754pm4djjglv3x5pkgwd6f79i2rq8ydg0f7c3q1wmwqdbba8f";
|
|
|
|
in rustPlatform.buildRustPackage rec {
|
|
|
|
# Package name; also reused below as the GitHub repository name.
pname = "tokenizers";
version = "0.8.1";

# Sources come from the upstream GitHub repository, at the tag
# "python-v<version>".
src = fetchFromGitHub {
  owner = "huggingface";
  repo = pname;
  rev = "python-v${version}";
  sha256 = "0sxdwx05hr87j2z32rk4rgwn6a26w9r7m5fgj6ah1sgagiiyxbjw";
};
|
|
|
|
|
2020-08-20 17:08:02 +00:00
|
|
|
# Update the parking_lot dependency so that it builds with recent Rust
# versions, in which asm! was replaced by llvm_asm!:
#
# https://github.com/Amanieu/parking_lot/pull/223
#
# Remove once upstream updates this dependency.
cargoPatches = [ ./update-parking-lot.diff ];

# Hash that validates the vendored Cargo dependencies (see the note in
# postPatch about patching the vendored copy).
cargoSha256 = "0cdkxmj8z2wdspn6r62lqlpvd0sj1z0cmb1zpqaajxvr0b2kjlj8";

# Build from the Python bindings subdirectory of the unpacked source.
sourceRoot = "source/bindings/python";
|
|
|
|
|
|
|
|
# Build-time tools: maturin builds the wheel (see buildPhase) and
# pipInstallHook provides pipInstallPhase (see installPhase).
nativeBuildInputs = [ maturin pipInstallHook ];

# The Python interpreter is propagated to dependents.
propagatedBuildInputs = [ python ];
|
|
|
|
|
|
|
|
# tokenizers uses pyo3, which requires Rust nightly.
# NOTE(review): RUSTC_BOOTSTRAP is presumably set so that the regular
# (non-nightly) compiler accepts nightly-only features -- confirm.
RUSTC_BOOTSTRAP = 1;

# The regular check phase is disabled; the test suite is run from
# installCheckPhase instead.
doCheck = false;
doInstallCheck = true;
|
|
|
|
|
|
|
|
# After unpacking, create tests/data under the source root and symlink
# the pre-fetched vocabulary/merges files into it under the file names
# listed below.
postUnpack = ''
  # Add data files for tests, otherwise tests attempt network access.
  mkdir $sourceRoot/tests/data
  ( cd $sourceRoot/tests/data
  ln -s ${robertaVocab} roberta-base-vocab.json
  ln -s ${robertaMerges} roberta-base-merges.txt
  ln -s ${bertVocab} bert-base-uncased-vocab.txt
  ln -s ${openaiVocab} openai-gpt-vocab.json
  ln -s ${openaiMerges} openai-gpt-merges.txt )
'';
|
|
|
|
|
|
|
|
# Three adjustments are made after patching:
#   1. remove the check_rustc_version() call from the vendored pyo3
#      build script,
#   2. empty the "files" map in pyo3's .cargo-checksum.json, since the
#      edit above invalidates the recorded checksums,
#   3. rename the crate in Cargo.toml from tokenizers-python to
#      tokenizers, which determines the name of the wheel.
postPatch = ''
  # pyo3's build check verifies that Rust is a nightly
  # version. Disable this check.
  substituteInPlace $NIX_BUILD_TOP/$cargoDepsCopy/pyo3/build.rs \
  --replace "check_rustc_version()?;" ""

  # Patching the vendored dependency invalidates the file
  # checksums, so remove them. This should be safe, since
  # this is just a copy of the vendored dependencies and
  # the integrity of the vendored dependencies is validated
  # by cargoSha256.
  sed -r -i 's|"files":\{[^}]+\}|"files":{}|' \
  $NIX_BUILD_TOP/$cargoDepsCopy/pyo3/.cargo-checksum.json

  # Maturin uses the crate name as the wheel name.
  substituteInPlace Cargo.toml \
  --replace "tokenizers-python" "tokenizers"
'';
|
|
|
|
|
|
|
|
# Build the Python wheel with maturin instead of the default build.
# Because this replaces the whole phase body, the pre/post hooks are
# invoked explicitly; otherwise any preBuild/postBuild supplied via
# overrideAttrs or a setup hook would be silently skipped.
buildPhase = ''
  runHook preBuild

  maturin build --release --manylinux off

  runHook postBuild
'';
|
|
|
|
|
|
|
|
# Copy the wheel(s) built by maturin from target/wheels into dist and
# then delegate the installation to pipInstallPhase.
# NOTE(review): pipInstallPhase (from pipInstallHook) appears to run the
# preInstall/postInstall hooks itself, so no runHook calls are added
# here -- confirm against the hook's implementation.
installPhase = ''
  # Put the wheels where the pip install hook can find them.
  install -Dm644 -t dist target/wheels/*.whl
  pipInstallPhase
'';
|
|
|
|
|
|
|
|
# Dependencies needed only for running the test suite.
installCheckInputs = [ pytest requests ];

# Run the upstream test suite with pytest. The phase body is overridden,
# so the pre/post hooks are invoked explicitly to keep
# preInstallCheck/postInstallCheck (e.g. from overrideAttrs) working.
installCheckPhase = ''
  runHook preInstallCheck

  # Append paths, or the binding's tokenizer module will be
  # used, since the test directories have __init__.py
  pytest --import-mode=append

  runHook postInstallCheck
'';
|
|
|
|
|
|
|
|
# Package metadata. The needed attribute sets are pulled in explicitly
# from stdenv.lib rather than through nested `with` scopes.
meta =
  let
    inherit (stdenv.lib) licenses maintainers platforms;
  in
  {
    homepage = "https://github.com/huggingface/tokenizers";
    description = "Fast State-of-the-Art Tokenizers optimized for Research and Production";
    license = licenses.asl20;
    platforms = platforms.unix;
    maintainers = [ maintainers.danieldk ];
  };
|
|
|
|
}
|