import pytest
import spacy
# Sample passage (from a Recode interview with Sebastian Thrun) used as the
# shared input for all entity / noun-chunk / verb tests below.  NOTE: it
# deliberately contains curly quotes and an apostrophe (’) so the pipeline is
# exercised on real-world punctuation — do not "normalize" these characters.
en_text = (
    "When Sebastian Thrun started working on self-driving cars at "
    "Google in 2007, few people outside of the company took him "
    "seriously. “I can tell you very senior CEOs of major American "
    "car companies would shake my hand and turn away because I wasn’t "
    "worth talking to,” said Thrun, in an interview with Recode earlier "
    "this week.")
@pytest.fixture(scope="session")
def en_core_web_sm():
    """Load the small English spaCy pipeline.

    Returns:
        The loaded ``en_core_web_sm`` Language object.

    ``scope="session"`` makes pytest load the model once for the whole
    test run instead of once per test — model loading is by far the most
    expensive step here, and the pipeline is only read, never mutated,
    by the tests that use it.
    """
    return spacy.load("en_core_web_sm")
@pytest.fixture
def doc_en_core_web_sm(en_core_web_sm):
    """Parse the shared sample text with the loaded pipeline.

    Depends on the ``en_core_web_sm`` fixture; returns the resulting
    processed document, which the tests below inspect for entities,
    noun chunks, and verb lemmas.
    """
    return en_core_web_sm(en_text)
def test_entities(doc_en_core_web_sm):
entities = list(map(lambda e: (e.text, e.label_),
doc_en_core_web_sm.ents))
assert entities == [
('Sebastian Thrun', 'PERSON'),
('Google', 'ORG'), ('2007', 'DATE'),
('American', 'NORP'),
('Thrun', 'ORG'),
('earlier this week', 'DATE')
]
def test_nouns(doc_en_core_web_sm):
assert [
chunk.text for chunk in doc_en_core_web_sm.noun_chunks] == [
'Sebastian Thrun',
'self-driving cars',
'Google',
'few people',
'the company',
'him',
'I',
'you',
'very senior CEOs',
'major American car companies',
'my hand',
'I',
'Thrun',
'an interview',
'Recode']
def test_verbs(doc_en_core_web_sm):
assert [
token.lemma_ for token in doc_en_core_web_sm if token.pos_ == "VERB"] == [
'start',
'work',
'drive',
'take',
'can',
'tell',
'would',
'shake',
'turn',
'talk',
'say']