aboutsummaryrefslogtreecommitdiff
path: root/pkgs/development/python-modules/tokenizers/default.nix
diff options
context:
space:
mode:
Diffstat (limited to 'pkgs/development/python-modules/tokenizers/default.nix')
-rw-r--r--pkgs/development/python-modules/tokenizers/default.nix16
1 files changed, 13 insertions, 3 deletions
diff --git a/pkgs/development/python-modules/tokenizers/default.nix b/pkgs/development/python-modules/tokenizers/default.nix
index 5b0ba5d0bfa0..4e5aacd6c7ac 100644
--- a/pkgs/development/python-modules/tokenizers/default.nix
+++ b/pkgs/development/python-modules/tokenizers/default.nix
@@ -32,6 +32,14 @@ let
url = "https://norvig.com/big.txt";
sha256 = "0yz80icdly7na03cfpl0nfk5h3j3cam55rj486n03wph81ynq1ps";
};
+ docPipelineTokenizer = fetchurl {
+ url = "https://s3.amazonaws.com/models.huggingface.co/bert/anthony/doc-pipeline/tokenizer.json";
+ hash = "sha256-i533xC8J5CDMNxBjo+p6avIM8UOcui8RmGAmK0GmfBc=";
+ };
+ docQuicktourTokenizer = fetchurl {
+ url = "https://s3.amazonaws.com/models.huggingface.co/bert/anthony/doc-quicktour/tokenizer.json";
+ hash = "sha256-ipY9d5DR5nxoO6kj7rItueZ9AO5wq9+Nzr6GuEIfIBI=";
+ };
openaiVocab = fetchurl {
url = "https://s3.amazonaws.com/models.huggingface.co/bert/openai-gpt-vocab.json";
sha256 = "0y40gc9bixj5rxv674br1rxmxkd3ly29p80x1596h8yywwcrpx7x";
@@ -42,16 +50,16 @@ let
};
in rustPlatform.buildRustPackage rec {
pname = "tokenizers";
- version = "0.9.2";
+ version = "0.9.4";
src = fetchFromGitHub {
owner = "huggingface";
repo = pname;
rev = "python-v${version}";
- sha256 = "0rsm1g5zfq3ygdb3s8v9xqqpgfzvvkc4n5ik3ahy8sw7pyjljb4m";
+ hash = "sha256-JXoH9yfhMIFg5qDY5zrF6iWb7XKugjMfk1NxSizfaWg=";
};
- cargoSha256 = "0yn699dq9hdjh7fyci99ni8mmd5qdhzrsi80grzgf5cch8g38rbi";
+ cargoSha256 = "sha256-u9qitrOxJSABs0VjwHUZgmw7VTQXNbp6l8fKKE/RQ7M=";
sourceRoot = "source/bindings/python";
@@ -82,6 +90,8 @@ in rustPlatform.buildRustPackage rec {
ln -s ${robertaMerges} roberta-base-merges.txt
ln -s ${albertVocab} albert-base-v1-tokenizer.json
ln -s ${bertVocab} bert-base-uncased-vocab.txt
+ ln -s ${docPipelineTokenizer} bert-wiki.json
+ ln -s ${docQuicktourTokenizer} tokenizer-wiki.json
ln -s ${norvigBig} big.txt
ln -s ${openaiVocab} openai-gpt-vocab.json
ln -s ${openaiMerges} openai-gpt-merges.txt )