Diffstat (limited to 'infra/libkookie/nixpkgs/pkgs/development/python-modules/tokenizers/default.nix')
-rw-r--r--  infra/libkookie/nixpkgs/pkgs/development/python-modules/tokenizers/default.nix | 84 +++++++++++++++++++++++++++++++++++-------------------------------------------------
 1 file changed, 35 insertions(+), 49 deletions(-)
diff --git a/infra/libkookie/nixpkgs/pkgs/development/python-modules/tokenizers/default.nix b/infra/libkookie/nixpkgs/pkgs/development/python-modules/tokenizers/default.nix
index d650f350bd28..4e5aacd6c7ac 100644
--- a/infra/libkookie/nixpkgs/pkgs/development/python-modules/tokenizers/default.nix
+++ b/infra/libkookie/nixpkgs/pkgs/development/python-modules/tokenizers/default.nix
@@ -2,10 +2,12 @@
 , rustPlatform
 , fetchFromGitHub
 , fetchurl
-, maturin
 , pipInstallHook
-, pytest
+, setuptools-rust
+, wheel
+, numpy
 , python
+, pytestCheckHook
 , requests
 }:
@@ -18,10 +20,26 @@ let
     url = "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-merges.txt";
     sha256 = "1idd4rvkpqqbks51i2vjbd928inw7slij9l4r063w3y5fd3ndq8w";
   };
+  albertVocab = fetchurl {
+    url = "https://s3.amazonaws.com/models.huggingface.co/bert/albert-base-v1-tokenizer.json";
+    sha256 = "1hra9pn8rczx7378z88zjclw2qsdrdwq20m56sy42s2crbas6akf";
+  };
   bertVocab = fetchurl {
     url = "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt";
     sha256 = "18rq42cmqa8zanydsbzrb34xwy4l6cz1y900r4kls57cbhvyvv07";
   };
+  norvigBig = fetchurl {
+    url = "https://norvig.com/big.txt";
+    sha256 = "0yz80icdly7na03cfpl0nfk5h3j3cam55rj486n03wph81ynq1ps";
+  };
+  docPipelineTokenizer = fetchurl {
+    url = "https://s3.amazonaws.com/models.huggingface.co/bert/anthony/doc-pipeline/tokenizer.json";
+    hash = "sha256-i533xC8J5CDMNxBjo+p6avIM8UOcui8RmGAmK0GmfBc=";
+  };
+  docQuicktourTokenizer = fetchurl {
+    url = "https://s3.amazonaws.com/models.huggingface.co/bert/anthony/doc-quicktour/tokenizer.json";
+    hash = "sha256-ipY9d5DR5nxoO6kj7rItueZ9AO5wq9+Nzr6GuEIfIBI=";
+  };
   openaiVocab = fetchurl {
     url = "https://s3.amazonaws.com/models.huggingface.co/bert/openai-gpt-vocab.json";
     sha256 = "0y40gc9bixj5rxv674br1rxmxkd3ly29p80x1596h8yywwcrpx7x";
@@ -32,38 +50,34 @@ let
   };
 in rustPlatform.buildRustPackage rec {
   pname = "tokenizers";
-  version = "0.8.1";
+  version = "0.9.4";
 
   src = fetchFromGitHub {
     owner = "huggingface";
     repo = pname;
     rev = "python-v${version}";
-    sha256 = "0sxdwx05hr87j2z32rk4rgwn6a26w9r7m5fgj6ah1sgagiiyxbjw";
+    hash = "sha256-JXoH9yfhMIFg5qDY5zrF6iWb7XKugjMfk1NxSizfaWg=";
   };
 
-  # Update parking_lot to be compatible with recent Rust versions, that
-  # replace asm! by llvm_asm!:
-  #
-  # https://github.com/Amanieu/parking_lot/pull/223
-  #
-  # Remove once upstream updates this dependency.
-  cargoPatches = [ ./update-parking-lot.diff ];
-
-  cargoSha256 = "0cdkxmj8z2wdspn6r62lqlpvd0sj1z0cmb1zpqaajxvr0b2kjlj8";
+  cargoSha256 = "sha256-u9qitrOxJSABs0VjwHUZgmw7VTQXNbp6l8fKKE/RQ7M=";
 
   sourceRoot = "source/bindings/python";
 
   nativeBuildInputs = [
-    maturin
     pipInstallHook
+    setuptools-rust
+    wheel
   ];
 
   propagatedBuildInputs = [
+    numpy
     python
   ];
 
-  # tokenizers uses pyo3, which requires Rust nightly.
-  RUSTC_BOOTSTRAP = 1;
+  installCheckInputs = [
+    pytestCheckHook
+    requests
+  ];
 
   doCheck = false;
   doInstallCheck = true;
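The hunk above also drops RUSTC_BOOTSTRAP = 1, the escape hatch that makes a stable rustc accept nightly-only feature gates, which suggests the pyo3 pulled in by tokenizers 0.9.4 now builds on stable Rust (the pyo3 build.rs patching removed in the next hunk points the same way). For context, the trick itself is nothing more than a derivation attribute exported into the build environment; a hypothetical minimal sketch:

    # Hypothetical sketch: derivation attributes become environment
    # variables, so this one line lets every rustc/cargo invocation in
    # the build use #![feature(...)] gates on a stable toolchain. It
    # opts out of Rust's stability guarantees, so it is a last resort.
    rustPlatform.buildRustPackage {
      pname = "needs-nightly-features";      # placeholder
      version = "0.0.1";                     # placeholder
      src = ./.;                             # placeholder
      cargoSha256 = stdenv.lib.fakeSha256;   # placeholder
      RUSTC_BOOTSTRAP = 1;                   # what the old expression relied on
    }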
@@ -74,51 +88,23 @@
     ( cd $sourceRoot/tests/data
       ln -s ${robertaVocab} roberta-base-vocab.json
       ln -s ${robertaMerges} roberta-base-merges.txt
+      ln -s ${albertVocab} albert-base-v1-tokenizer.json
       ln -s ${bertVocab} bert-base-uncased-vocab.txt
+      ln -s ${docPipelineTokenizer} bert-wiki.json
+      ln -s ${docQuicktourTokenizer} tokenizer-wiki.json
+      ln -s ${norvigBig} big.txt
       ln -s ${openaiVocab} openai-gpt-vocab.json
       ln -s ${openaiMerges} openai-gpt-merges.txt )
   '';
 
-  postPatch = ''
-    # pyo3's build check verifies that Rust is a nightly
-    # version. Disable this check.
-    substituteInPlace $NIX_BUILD_TOP/$cargoDepsCopy/pyo3/build.rs \
-      --replace "check_rustc_version()?;" ""
-
-    # Patching the vendored dependency invalidates the file
-    # checksums, so remove them. This should be safe, since
-    # this is just a copy of the vendored dependencies and
-    # the integrity of the vendored dependencies is validated
-    # by cargoSha256.
-    sed -r -i 's|"files":\{[^}]+\}|"files":{}|' \
-      $NIX_BUILD_TOP/$cargoDepsCopy/pyo3/.cargo-checksum.json
-
-    # Maturin uses the crate name as the wheel name.
-    substituteInPlace Cargo.toml \
-      --replace "tokenizers-python" "tokenizers"
-  '';
-
   buildPhase = ''
-    maturin build --release --manylinux off
+    ${python.interpreter} setup.py bdist_wheel
   '';
 
   installPhase = ''
-    # Put the wheels where the pip install hook can find them.
-    install -Dm644 -t dist target/wheels/*.whl
     pipInstallPhase
   '';
 
-  installCheckInputs = [
-    pytest
-    requests
-  ];
-
-  installCheckPhase = ''
-    # Append paths, or the binding's tokenizer module will be
-    # used, since the test directories have __init__.py
-    pytest --import-mode=append
-  '';
-
   meta = with stdenv.lib; {
     homepage = "https://github.com/huggingface/tokenizers";
     description = "Fast State-of-the-Art Tokenizers optimized for Research and Production";