Mention detection

MentionDetection(base_url, wiki_version)

Bases: MentionDetectionBase

Class responsible for mention detection.

Source code in REL/mention_detection.py
def __init__(self, base_url, wiki_version):
    self.cnt_exact = 0
    self.cnt_partial = 0
    self.cnt_total = 0

    super().__init__(base_url, wiki_version)
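A minimal construction sketch; the data folder and Wikipedia version below are placeholders for whatever REL data is available locally:

from REL.mention_detection import MentionDetection

# Placeholder values: base_url points at the downloaded REL data folder,
# wiki_version names the Wikipedia corpus that data was generated from.
base_url = "/path/to/rel_data"
wiki_version = "wiki_2019"

md = MentionDetection(base_url, wiki_version)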

find_mentions(dataset, tagger=None)

Finds mentions in a set of documents in a batch-wise manner. More specifically, it returns each mention together with its left/right context and a set of candidate entities.

Returns:

    Dictionary with mentions per document, together with the total number of detected mentions.
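For orientation, each per-document entry in that dictionary is a list of mention records shaped like the sketch below (values are illustrative, and the candidate format in particular is a hypothetical [entity, score] pairing):

# One mention record as built by find_mentions (illustrative values):
{
    "mention": "paris",                          # preprocessed surface form
    "context": ("Obama visited", "last week ."), # left/right context
    "candidates": [["Paris", 0.95]],             # hypothetical candidate list
    "gold": ["NONE"],
    "pos": 14,                                   # start offset in the raw text
    "end_pos": 19,
    "sent_idx": 0,                               # sentence index in the document
    "ngram": "Paris",                            # raw n-gram from the sentence
    "sentence": "Obama visited Paris last week.",
    "conf_md": 0.99,                             # NER confidence
    "tag": "LOC",                                # NER tag
}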

Source code in REL/mention_detection.py
def find_mentions(self, dataset, tagger=None):
    """
    Responsible for finding mentions given a set of documents in a batch-wise manner. More specifically,
    it returns the mention, its left/right context and a set of candidates.
    :return: Dictionary with mentions per document.
    """
    if tagger is None:
        raise Exception(
            "No NER tagger is set, but you are attempting to perform Mention Detection.."
        )
    # Verify if Flair, else ngram or custom.
    is_flair = isinstance(tagger, SequenceTagger)
    dataset_sentences_raw, processed_sentences, splits = self.split_text(
        dataset, is_flair
    )
    results = {}
    total_ment = 0
    if is_flair:
        tagger.predict(processed_sentences)
    for i, doc in enumerate(dataset_sentences_raw):
        contents = dataset_sentences_raw[doc]
        raw_text = dataset[doc][0]
        sentences_doc = [v[0] for v in contents.values()]
        sentences = processed_sentences[splits[i] : splits[i + 1]]
        result_doc = []
        cum_sent_length = 0
        offset = 0
        for (idx_sent, (sentence, ground_truth_sentence)), snt in zip(
            contents.items(), sentences
        ):
            # Only include offset if using Flair.
            if is_flair:
                offset = raw_text.find(sentence, cum_sent_length)

            for entity in (
                snt.get_spans("ner")
                if is_flair
                else tagger.predict(snt, processed_sentences)
            ):
                text, start_pos, end_pos, conf, tag = (
                    entity.text,
                    entity.start_position,
                    entity.end_position,
                    entity.score,
                    entity.tag,
                )
                total_ment += 1
                m = self.preprocess_mention(text)
                cands = self.get_candidates(m)
                if len(cands) == 0:
                    continue
                # Re-create ngram as 'text' is at times changed by Flair (e.g. double spaces are removed).
                ngram = sentence[start_pos:end_pos]
                left_ctxt, right_ctxt = self.get_ctxt(
                    start_pos, end_pos, idx_sent, sentence, sentences_doc
                )
                res = {
                    "mention": m,
                    "context": (left_ctxt, right_ctxt),
                    "candidates": cands,
                    "gold": ["NONE"],
                    "pos": start_pos + offset,
                    "sent_idx": idx_sent,
                    "ngram": ngram,
                    "end_pos": end_pos + offset,
                    "sentence": sentence,
                    "conf_md": conf,
                    "tag": tag,
                }
                result_doc.append(res)
            cum_sent_length += len(sentence) + (offset - cum_sent_length)
        results[doc] = result_doc
    return results, total_ment
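A minimal usage sketch with Flair's pretrained NER tagger. The paths and document text are placeholders; the input format, a dict mapping a document id to a (text, spans) tuple, follows split_text further below:

from flair.models import SequenceTagger

from REL.mention_detection import MentionDetection

md = MentionDetection("/path/to/rel_data", "wiki_2019")  # placeholder paths

# Flair's smaller publicly available pretrained English NER model.
tagger_ner = SequenceTagger.load("ner-fast")

# Spans stay empty: the tagger detects mentions itself.
dataset = {"doc_1": ("Obama visited Paris last week.", [])}

mentions, n_mentions = md.find_mentions(dataset, tagger_ner)
for record in mentions["doc_1"]:
    print(record["ngram"], record["pos"], record["end_pos"], record["tag"])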

format_spans(dataset)

Formats pre-given spans into a dataset for the entity disambiguation (ED) step. More specifically, it returns each mention together with its left/right context and a set of candidate entities.

Returns:

    Dictionary with mentions per document, together with the total number of detected mentions.

Source code in REL/mention_detection.py
def format_spans(self, dataset):
    """
    Responsible for formatting given spans into dataset for the ED step. More specifically,
    it returns the mention, its left/right context and a set of candidates.

    :return: Dictionary with mentions per document.
    """

    dataset, _, _ = self.split_text(dataset)
    results = {}
    total_ment = 0

    for doc in dataset:
        contents = dataset[doc]
        sentences_doc = [v[0] for v in contents.values()]

        results_doc = []
        for idx_sent, (sentence, spans) in contents.items():
            for ngram, start_pos, end_pos in spans:
                total_ment += 1

                # end_pos = start_pos + length
                # ngram = text[start_pos:end_pos]
                mention = self.preprocess_mention(ngram)
                left_ctxt, right_ctxt = self.get_ctxt(
                    start_pos, end_pos, idx_sent, sentence, sentences_doc
                )

                chosen_cands = self.get_candidates(mention)
                res = {
                    "mention": mention,
                    "context": (left_ctxt, right_ctxt),
                    "candidates": chosen_cands,
                    "gold": ["NONE"],
                    "pos": start_pos,
                    "sent_idx": idx_sent,
                    "ngram": ngram,
                    "end_pos": end_pos,
                    "sentence": sentence,
                }

                results_doc.append(res)
        results[doc] = results_doc
    return results, total_ment
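A sketch of the pre-given-spans route, assuming the (start, length) span convention that split_text consumes (see text[x[0] : x[0] + x[1]] below); no NER tagger is involved here:

from REL.mention_detection import MentionDetection

md = MentionDetection("/path/to/rel_data", "wiki_2019")  # placeholder paths

text = "Obama visited Paris last week."
spans = [(0, 5), (14, 5)]  # (start, length) pairs for "Obama" and "Paris"

mentions, n_mentions = md.format_spans({"doc_1": (text, spans)})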

split_text(dataset, is_flair=False)

Splits text into sentences, with optional spans per sentence (this format is required for GERBIL usage). Sentence-level input is also required by the default NER tagger, which was found during experiments to achieve higher performance on individual sentences.

Returns:

    Dictionary with sentences and optional given spans per sentence, plus the processed sentences and the cumulative per-document sentence splits.

Source code in REL/mention_detection.py
def split_text(self, dataset, is_flair=False):
    """
    Splits text into sentences with optional spans (format is a requirement for GERBIL usage).
    This behavior is required for the default NER-tagger, which during experiments was experienced
    to achieve higher performance.

    :return: dictionary with sentences and optional given spans per sentence.
    """

    res = {}
    splits = [0]
    processed_sentences = []
    for doc in dataset:
        text, spans = dataset[doc]
        sentences = split_single(text)
        res[doc] = {}

        i = 0
        pos_end = 0  # Added  (issue #49)
        for sent in sentences:
            if len(sent.strip()) == 0:
                continue
            # Match gt to sentence.
            # pos_start = text.find(sent) # Commented out (issue #49)
            pos_start = text.find(sent, pos_end)  # Added  (issue #49)
            pos_end = pos_start + len(sent)

            # ngram, start_pos, end_pos
            spans_sent = [
                [text[x[0] : x[0] + x[1]], x[0], x[0] + x[1]]
                for x in spans
                if pos_start <= x[0] < pos_end
            ]
            res[doc][i] = [sent, spans_sent]
            if len(spans) == 0:
                processed_sentences.append(
                    Sentence(sent, use_tokenizer=True) if is_flair else sent
                )
            i += 1
        splits.append(splits[-1] + i)
    return res, processed_sentences, splits
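A small sketch of the three return values for a two-sentence document, assuming md is a MentionDetection instance as constructed earlier (exact sentence boundaries depend on the splitter):

res, processed_sentences, splits = md.split_text(
    {"doc_1": ("First sentence. Second sentence.", [])}
)

# res:                 {"doc_1": {0: ["First sentence.", []],
#                                 1: ["Second sentence.", []]}}
# processed_sentences: ["First sentence.", "Second sentence."]
#                      (flair Sentence objects instead when is_flair=True)
# splits:              [0, 2]: cumulative sentence counts used by
#                      find_mentions to slice the flat sentence list per document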