aboutsummaryrefslogtreecommitdiff
path: root/infra/libkookie/nixpkgs/pkgs/development/python-modules/tokenizers/default.nix
blob: d650f350bd2870f83735def67203a190362ce509 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
{ stdenv
, rustPlatform
, fetchFromGitHub
, fetchurl
, maturin
, pipInstallHook
, pytest
, python
, requests
}:

let
  # Vocabulary and merges files used by the test suite. They are
  # fetched here, pinned by hash, and linked into tests/data in
  # postUnpack so that the tests do not need network access.
  fetchTestData = url: sha256: fetchurl { inherit url sha256; };

  robertaVocab = fetchTestData
    "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-vocab.json"
    "0m86wpkfb2gdh9x9i9ng2fvwk1rva4p0s98xw996nrjxs7166zwy";

  robertaMerges = fetchTestData
    "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-merges.txt"
    "1idd4rvkpqqbks51i2vjbd928inw7slij9l4r063w3y5fd3ndq8w";

  bertVocab = fetchTestData
    "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt"
    "18rq42cmqa8zanydsbzrb34xwy4l6cz1y900r4kls57cbhvyvv07";

  openaiVocab = fetchTestData
    "https://s3.amazonaws.com/models.huggingface.co/bert/openai-gpt-vocab.json"
    "0y40gc9bixj5rxv674br1rxmxkd3ly29p80x1596h8yywwcrpx7x";

  openaiMerges = fetchTestData
    "https://s3.amazonaws.com/models.huggingface.co/bert/openai-gpt-merges.txt"
    "09a754pm4djjglv3x5pkgwd6f79i2rq8ydg0f7c3q1wmwqdbba8f";
in rustPlatform.buildRustPackage rec {
  pname = "tokenizers";
  version = "0.8.1";

  # Source of the package. The revision is derived from `version`
  # as "python-v<version>" (presumably the upstream tag naming for
  # releases of the Python bindings), so bumping `version` also
  # requires updating the hash below.
  src = fetchFromGitHub {
    owner = "huggingface";
    repo = pname;
    rev = "python-v${version}";
    sha256 = "0sxdwx05hr87j2z32rk4rgwn6a26w9r7m5fgj6ah1sgagiiyxbjw";
  };

  # Update parking_lot to be compatible with recent Rust versions,
  # which replaced asm! with llvm_asm!:
  #
  # https://github.com/Amanieu/parking_lot/pull/223
  #
  # Remove once upstream updates this dependency.
  cargoPatches = [ ./update-parking-lot.diff ];

  # Hash of the vendored Cargo dependencies. It must be updated
  # whenever the dependency set changes (e.g. when cargoPatches or
  # the source revision change).
  cargoSha256 = "0cdkxmj8z2wdspn6r62lqlpvd0sj1z0cmb1zpqaajxvr0b2kjlj8";

  # Build from the Python bindings subdirectory of the unpacked source.
  sourceRoot = "source/bindings/python";

  # Build-time tools: maturin is invoked in buildPhase to build the
  # wheel; pipInstallHook presumably provides the pipInstallPhase
  # function that installPhase calls.
  nativeBuildInputs = [
    maturin
    pipInstallHook
  ];

  propagatedBuildInputs = [
    python
  ];

  # tokenizers uses pyo3, which requires Rust nightly.
  # RUSTC_BOOTSTRAP is set so that the build can proceed with the
  # compiler provided by rustPlatform.
  RUSTC_BOOTSTRAP = 1;

  # The regular check phase is disabled; the test suite is run by
  # installCheckPhase instead, after the wheel has been installed.
  doCheck = false;
  doInstallCheck = true;

  # Link the pre-fetched vocabulary/merges files into the test data
  # directory of the unpacked source.
  postUnpack = ''
    # Add data files for tests, otherwise tests attempt network access.
    # -p keeps this working if the directory already exists in the
    # source tree; the path is quoted to be safe against word splitting.
    mkdir -p "$sourceRoot/tests/data"
    ( cd "$sourceRoot/tests/data"
      ln -s ${robertaVocab} roberta-base-vocab.json
      ln -s ${robertaMerges} roberta-base-merges.txt
      ln -s ${bertVocab} bert-base-uncased-vocab.txt
      ln -s ${openaiVocab} openai-gpt-vocab.json
      ln -s ${openaiMerges} openai-gpt-merges.txt )
  '';

  # Three adjustments made after the regular patches are applied:
  #   1. remove the nightly-compiler check from the vendored pyo3
  #      build script;
  #   2. empty the per-file checksum table of the vendored pyo3 so
  #      that the modified build script is accepted;
  #   3. rename the crate in Cargo.toml so the wheel is called
  #      "tokenizers".
  # NOTE(review): $cargoDepsCopy is not defined in this file; it is
  # presumably set by buildRustPackage to the directory holding the
  # writable copy of the vendored dependencies -- confirm.
  postPatch = ''
    # pyo3's build check verifies that Rust is a nightly
    # version. Disable this check.
    substituteInPlace $NIX_BUILD_TOP/$cargoDepsCopy/pyo3/build.rs \
      --replace "check_rustc_version()?;" ""

    # Patching the vendored dependency invalidates the file
    # checksums, so remove them. This should be safe, since
    # this is just a copy of the vendored dependencies and
    # the integrity of the vendored dependencies is validated
    # by cargoSha256.
    sed -r -i 's|"files":\{[^}]+\}|"files":{}|' \
      $NIX_BUILD_TOP/$cargoDepsCopy/pyo3/.cargo-checksum.json

    # Maturin uses the crate name as the wheel name.
    substituteInPlace Cargo.toml \
      --replace "tokenizers-python" "tokenizers"
  '';

  # Build the Python wheel with maturin. Because this replaces the
  # builder's own buildPhase, the preBuild/postBuild hooks are run
  # explicitly so that overrides which set them keep working.
  buildPhase = ''
    runHook preBuild

    maturin build --release --manylinux off

    runHook postBuild
  '';

  # Copy the built wheel(s) from target/wheels into ./dist and then
  # install them via pipInstallPhase.
  # NOTE(review): pipInstallPhase is not defined in this file
  # (presumably it comes from pipInstallHook). Check whether it
  # already runs the preInstall/postInstall hooks before adding
  # explicit runHook calls here, to avoid running them twice.
  installPhase = ''
    # Put the wheels where the pip install hook can find them.
    install -Dm644 -t dist target/wheels/*.whl
    pipInstallPhase
  '';

  # Dependencies that are only needed to run the test suite.
  installCheckInputs = [
    pytest
    requests
  ];

  # Run the test suite against the installed package. The
  # preInstallCheck/postInstallCheck hooks are run explicitly, since
  # a custom installCheckPhase would otherwise skip them.
  installCheckPhase = ''
    runHook preInstallCheck

    # Append paths, or the binding's tokenizer module will be
    # used, since the test directories have __init__.py
    pytest --import-mode=append

    runHook postInstallCheck
  '';

  # Package metadata. Library attributes are referenced by their
  # full path instead of through nested `with` scopes.
  meta = {
    description = "Fast State-of-the-Art Tokenizers optimized for Research and Production";
    homepage = "https://github.com/huggingface/tokenizers";
    license = stdenv.lib.licenses.asl20;
    maintainers = [ stdenv.lib.maintainers.danieldk ];
    platforms = stdenv.lib.platforms.unix;
  };
}