about summary refs log tree commit diff
path: root/infra/libkookie/nixpkgs/pkgs/development/python-modules/tokenizers
diff options
context:
space:
mode:
Diffstat (limited to 'infra/libkookie/nixpkgs/pkgs/development/python-modules/tokenizers')
-rw-r--r-- infra/libkookie/nixpkgs/pkgs/development/python-modules/tokenizers/default.nix 84
-rw-r--r-- infra/libkookie/nixpkgs/pkgs/development/python-modules/tokenizers/update-parking-lot.diff 63
2 files changed, 35 insertions, 112 deletions
diff --git a/infra/libkookie/nixpkgs/pkgs/development/python-modules/tokenizers/default.nix b/infra/libkookie/nixpkgs/pkgs/development/python-modules/tokenizers/default.nix
index d650f350bd28..4e5aacd6c7ac 100644
--- a/infra/libkookie/nixpkgs/pkgs/development/python-modules/tokenizers/default.nix
+++ b/infra/libkookie/nixpkgs/pkgs/development/python-modules/tokenizers/default.nix
@@ -2,10 +2,12 @@
, rustPlatform
, fetchFromGitHub
, fetchurl
-, maturin
, pipInstallHook
-, pytest
+, setuptools-rust
+, wheel
+, numpy
, python
+, pytestCheckHook
, requests
}:
@@ -18,10 +20,26 @@ let
url = "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-merges.txt";
sha256 = "1idd4rvkpqqbks51i2vjbd928inw7slij9l4r063w3y5fd3ndq8w";
};
+ albertVocab = fetchurl {
+ url = "https://s3.amazonaws.com/models.huggingface.co/bert/albert-base-v1-tokenizer.json";
+ sha256 = "1hra9pn8rczx7378z88zjclw2qsdrdwq20m56sy42s2crbas6akf";
+ };
bertVocab = fetchurl {
url = "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt";
sha256 = "18rq42cmqa8zanydsbzrb34xwy4l6cz1y900r4kls57cbhvyvv07";
};
+ norvigBig = fetchurl {
+ url = "https://norvig.com/big.txt";
+ sha256 = "0yz80icdly7na03cfpl0nfk5h3j3cam55rj486n03wph81ynq1ps";
+ };
+ docPipelineTokenizer = fetchurl {
+ url = "https://s3.amazonaws.com/models.huggingface.co/bert/anthony/doc-pipeline/tokenizer.json";
+ hash = "sha256-i533xC8J5CDMNxBjo+p6avIM8UOcui8RmGAmK0GmfBc=";
+ };
+ docQuicktourTokenizer = fetchurl {
+ url = "https://s3.amazonaws.com/models.huggingface.co/bert/anthony/doc-quicktour/tokenizer.json";
+ hash = "sha256-ipY9d5DR5nxoO6kj7rItueZ9AO5wq9+Nzr6GuEIfIBI=";
+ };
openaiVocab = fetchurl {
url = "https://s3.amazonaws.com/models.huggingface.co/bert/openai-gpt-vocab.json";
sha256 = "0y40gc9bixj5rxv674br1rxmxkd3ly29p80x1596h8yywwcrpx7x";
@@ -32,38 +50,34 @@ let
};
in rustPlatform.buildRustPackage rec {
pname = "tokenizers";
- version = "0.8.1";
+ version = "0.9.4";
src = fetchFromGitHub {
owner = "huggingface";
repo = pname;
rev = "python-v${version}";
- sha256 = "0sxdwx05hr87j2z32rk4rgwn6a26w9r7m5fgj6ah1sgagiiyxbjw";
+ hash = "sha256-JXoH9yfhMIFg5qDY5zrF6iWb7XKugjMfk1NxSizfaWg=";
};
- # Update parking_lot to be compatible with recent Rust versions, that
- # replace asm! by llvm_asm!:
- #
- # https://github.com/Amanieu/parking_lot/pull/223
- #
- # Remove once upstream updates this dependency.
- cargoPatches = [ ./update-parking-lot.diff ];
-
- cargoSha256 = "0cdkxmj8z2wdspn6r62lqlpvd0sj1z0cmb1zpqaajxvr0b2kjlj8";
+ cargoSha256 = "sha256-u9qitrOxJSABs0VjwHUZgmw7VTQXNbp6l8fKKE/RQ7M=";
sourceRoot = "source/bindings/python";
nativeBuildInputs = [
- maturin
pipInstallHook
+ setuptools-rust
+ wheel
];
propagatedBuildInputs = [
+ numpy
python
];
- # tokenizers uses pyo3, which requires Rust nightly.
- RUSTC_BOOTSTRAP = 1;
+ installCheckInputs = [
+ pytestCheckHook
+ requests
+ ];
doCheck = false;
doInstallCheck = true;
@@ -74,51 +88,23 @@ in rustPlatform.buildRustPackage rec {
( cd $sourceRoot/tests/data
ln -s ${robertaVocab} roberta-base-vocab.json
ln -s ${robertaMerges} roberta-base-merges.txt
+ ln -s ${albertVocab} albert-base-v1-tokenizer.json
ln -s ${bertVocab} bert-base-uncased-vocab.txt
+ ln -s ${docPipelineTokenizer} bert-wiki.json
+ ln -s ${docQuicktourTokenizer} tokenizer-wiki.json
+ ln -s ${norvigBig} big.txt
ln -s ${openaiVocab} openai-gpt-vocab.json
ln -s ${openaiMerges} openai-gpt-merges.txt )
'';
- postPatch = ''
- # pyo3's build check verifies that Rust is a nightly
- # version. Disable this check.
- substituteInPlace $NIX_BUILD_TOP/$cargoDepsCopy/pyo3/build.rs \
- --replace "check_rustc_version()?;" ""
-
- # Patching the vendored dependency invalidates the file
- # checksums, so remove them. This should be safe, since
- # this is just a copy of the vendored dependencies and
- # the integrity of the vendored dependencies is validated
- # by cargoSha256.
- sed -r -i 's|"files":\{[^}]+\}|"files":{}|' \
- $NIX_BUILD_TOP/$cargoDepsCopy/pyo3/.cargo-checksum.json
-
- # Maturin uses the crate name as the wheel name.
- substituteInPlace Cargo.toml \
- --replace "tokenizers-python" "tokenizers"
- '';
-
buildPhase = ''
- maturin build --release --manylinux off
+ ${python.interpreter} setup.py bdist_wheel
'';
installPhase = ''
- # Put the wheels where the pip install hook can find them.
- install -Dm644 -t dist target/wheels/*.whl
pipInstallPhase
'';
- installCheckInputs = [
- pytest
- requests
- ];
-
- installCheckPhase = ''
- # Append paths, or the binding's tokenizer module will be
- # used, since the test directories have __init__.py
- pytest --import-mode=append
- '';
-
meta = with stdenv.lib; {
homepage = "https://github.com/huggingface/tokenizers";
description = "Fast State-of-the-Art Tokenizers optimized for Research and Production";
diff --git a/infra/libkookie/nixpkgs/pkgs/development/python-modules/tokenizers/update-parking-lot.diff b/infra/libkookie/nixpkgs/pkgs/development/python-modules/tokenizers/update-parking-lot.diff
deleted file mode 100644
index d8f144465ac9..000000000000
--- a/infra/libkookie/nixpkgs/pkgs/development/python-modules/tokenizers/update-parking-lot.diff
+++ /dev/null
@@ -1,63 +0,0 @@
-diff --git a/bindings/python/Cargo.lock b/bindings/python/Cargo.lock
-index f50db71..ea71817 100644
---- a/Cargo.lock
-+++ b/Cargo.lock
-@@ -269,7 +269,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
-
- [[package]]
- name = "lock_api"
--version = "0.3.3"
-+version = "0.3.4"
- source = "registry+https://github.com/rust-lang/crates.io-index"
- dependencies = [
- "scopeguard 1.1.0 (registry+https://github.com/rust-lang/crates.io-index)",
-@@ -337,16 +337,16 @@ dependencies = [
-
- [[package]]
- name = "parking_lot"
--version = "0.10.0"
-+version = "0.10.2"
- source = "registry+https://github.com/rust-lang/crates.io-index"
- dependencies = [
-- "lock_api 0.3.3 (registry+https://github.com/rust-lang/crates.io-index)",
-- "parking_lot_core 0.7.0 (registry+https://github.com/rust-lang/crates.io-index)",
-+ "lock_api 0.3.4 (registry+https://github.com/rust-lang/crates.io-index)",
-+ "parking_lot_core 0.7.2 (registry+https://github.com/rust-lang/crates.io-index)",
- ]
-
- [[package]]
- name = "parking_lot_core"
--version = "0.7.0"
-+version = "0.7.2"
- source = "registry+https://github.com/rust-lang/crates.io-index"
- dependencies = [
- "cfg-if 0.1.10 (registry+https://github.com/rust-lang/crates.io-index)",
-@@ -409,7 +409,7 @@ dependencies = [
- "inventory 0.1.6 (registry+https://github.com/rust-lang/crates.io-index)",
- "libc 0.2.68 (registry+https://github.com/rust-lang/crates.io-index)",
- "num-traits 0.2.11 (registry+https://github.com/rust-lang/crates.io-index)",
-- "parking_lot 0.10.0 (registry+https://github.com/rust-lang/crates.io-index)",
-+ "parking_lot 0.10.2 (registry+https://github.com/rust-lang/crates.io-index)",
- "paste 0.1.10 (registry+https://github.com/rust-lang/crates.io-index)",
- "pyo3cls 0.9.2 (registry+https://github.com/rust-lang/crates.io-index)",
- "regex 1.3.6 (registry+https://github.com/rust-lang/crates.io-index)",
-@@ -768,7 +768,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
- "checksum itoa 0.4.5 (registry+https://github.com/rust-lang/crates.io-index)" = "b8b7a7c0c47db5545ed3fef7468ee7bb5b74691498139e4b3f6a20685dc6dd8e"
- "checksum lazy_static 1.4.0 (registry+https://github.com/rust-lang/crates.io-index)" = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646"
- "checksum libc 0.2.68 (registry+https://github.com/rust-lang/crates.io-index)" = "dea0c0405123bba743ee3f91f49b1c7cfb684eef0da0a50110f758ccf24cdff0"
--"checksum lock_api 0.3.3 (registry+https://github.com/rust-lang/crates.io-index)" = "79b2de95ecb4691949fea4716ca53cdbcfccb2c612e19644a8bad05edcf9f47b"
-+"checksum lock_api 0.3.4 (registry+https://github.com/rust-lang/crates.io-index)" = "c4da24a77a3d8a6d4862d95f72e6fdb9c09a643ecdb402d754004a557f2bec75"
- "checksum maybe-uninit 2.0.0 (registry+https://github.com/rust-lang/crates.io-index)" = "60302e4db3a61da70c0cb7991976248362f30319e88850c487b9b95bbf059e00"
- "checksum memchr 2.3.3 (registry+https://github.com/rust-lang/crates.io-index)" = "3728d817d99e5ac407411fa471ff9800a778d88a24685968b36824eaf4bee400"
- "checksum memoffset 0.5.4 (registry+https://github.com/rust-lang/crates.io-index)" = "b4fc2c02a7e374099d4ee95a193111f72d2110197fe200272371758f6c3643d8"
-@@ -777,8 +777,8 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
- "checksum number_prefix 0.3.0 (registry+https://github.com/rust-lang/crates.io-index)" = "17b02fc0ff9a9e4b35b3342880f48e896ebf69f2967921fe8646bf5b7125956a"
- "checksum onig 6.0.0 (registry+https://github.com/rust-lang/crates.io-index)" = "bd91ccd8a02fce2f7e8a86655aec67bc6c171e6f8e704118a0e8c4b866a05a8a"
- "checksum onig_sys 69.5.0 (registry+https://github.com/rust-lang/crates.io-index)" = "3814583fad89f3c60ae0701d80e87e1fd3028741723deda72d0d4a0ecf0cb0db"
--"checksum parking_lot 0.10.0 (registry+https://github.com/rust-lang/crates.io-index)" = "92e98c49ab0b7ce5b222f2cc9193fc4efe11c6d0bd4f648e374684a6857b1cfc"
--"checksum parking_lot_core 0.7.0 (registry+https://github.com/rust-lang/crates.io-index)" = "7582838484df45743c8434fbff785e8edf260c28748353d44bc0da32e0ceabf1"
-+"checksum parking_lot 0.10.2 (registry+https://github.com/rust-lang/crates.io-index)" = "d3a704eb390aafdc107b0e392f56a82b668e3a71366993b5340f5833fd62505e"
-+"checksum parking_lot_core 0.7.2 (registry+https://github.com/rust-lang/crates.io-index)" = "d58c7c768d4ba344e3e8d72518ac13e259d7c7ade24167003b8488e10b6740a3"
- "checksum paste 0.1.10 (registry+https://github.com/rust-lang/crates.io-index)" = "ab4fb1930692d1b6a9cfabdde3d06ea0a7d186518e2f4d67660d8970e2fa647a"
- "checksum paste-impl 0.1.10 (registry+https://github.com/rust-lang/crates.io-index)" = "a62486e111e571b1e93b710b61e8f493c0013be39629b714cb166bdb06aa5a8a"
- "checksum pkg-config 0.3.17 (registry+https://github.com/rust-lang/crates.io-index)" = "05da548ad6865900e60eaba7f589cc0783590a92e940c26953ff81ddbab2d677"