Skip to content

Utils

flatten_list_of_lists(list_of_lists)

making inputs to torch.nn.EmbeddingBag

Source code in /home/docs/checkouts/readthedocs.org/user_builds/rel/envs/latest/lib/python3.7/site-packages/REL/utils.py
163
164
165
166
167
168
169
170
def flatten_list_of_lists(list_of_lists):
    """
    Flatten nested lists into inputs for torch.nn.EmbeddingBag.

    :param list_of_lists: list of lists of items (e.g. token ids).
    :return: tuple ``(flatten, offsets)`` where ``flatten`` is one flat list
        with all items concatenated in order, and ``offsets`` is a numpy
        array holding the start index of each sub-list inside ``flatten``.
    """
    # Offsets are the cumulative lengths shifted right by one: the first
    # sub-list starts at 0, each next one where the previous ended.
    lengths = [len(sub) for sub in list_of_lists]
    offsets = np.cumsum([0] + lengths)[:-1]
    # A single comprehension is O(total items); the previous
    # sum(list_of_lists, []) idiom was quadratic in the number of sub-lists.
    flatten = [item for sub in list_of_lists for item in sub]
    return flatten, offsets

is_important_word(s) cached

an important word is not a stopword, is not a number, and is longer than one character

Source code in /home/docs/checkouts/readthedocs.org/user_builds/rel/envs/latest/lib/python3.7/site-packages/REL/utils.py
185
186
187
188
189
190
191
192
193
194
195
196
@lru_cache(maxsize=None)
def is_important_word(s):
    """
    Return True when *s* is an "important" word.

    A word is important when it is not a stopword, is not a number, and is
    longer than one character.

    :param s: word to test (assumed to be a str — TODO confirm callers).
    :return: bool
    """
    if len(s) <= 1 or s.lower() in STOPWORDS:
        return False
    # Keep the try body minimal and the exception specific: only the
    # numeric parse can legitimately fail here (the original used a bare
    # except around the whole body, which hid unrelated errors).
    try:
        float(s)
    except ValueError:
        # Not parseable as a number -> important.
        return True
    return False

preprocess_mention(m, wiki_db)

Responsible for preprocessing a mention and making sure we find a set of matching candidates in our database.

Returns:

  • mention – the preprocessed mention, rewritten so that it matches candidates in the database.

Source code in /home/docs/checkouts/readthedocs.org/user_builds/rel/envs/latest/lib/python3.7/site-packages/REL/utils.py
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
def preprocess_mention(m, wiki_db):
    """
    Responsible for preprocessing a mention and making sure we find a set
    of matching candidates in our database.

    :param m: raw mention string.
    :param wiki_db: database wrapper exposing ``wiki(key, table, column)``;
        assumed to be deterministic per key — TODO confirm.
    :return: mention (possibly rewritten so candidates exist in the db)
    """
    # Track the frequency alongside the candidate mention so each form is
    # looked up at most once (the previous version issued up to six
    # redundant "freq" queries).
    cur_m = modify_uppercase_phrase(m)
    freq_cur_m = wiki_db.wiki(cur_m, "wiki", "freq")
    freq_m = wiki_db.wiki(m, "wiki", "freq")

    if not freq_cur_m:
        # Uppercase normalisation found nothing; fall back to the raw mention.
        cur_m, freq_cur_m = m, freq_m

    if freq_m and freq_m > freq_cur_m:
        # Cases like 'U.S.' are handled badly by modify_uppercase_phrase,
        # so prefer the raw mention when it is the more frequent form.
        cur_m, freq_cur_m = m, freq_m

    # If we cannot find the exact mention in our index, we try our luck to
    # find it in a case insensitive index.
    if not freq_cur_m:
        # cur_m and m both not found, verify if lower-case version can be found.
        find_lower = wiki_db.wiki(m.lower(), "wiki", "lower")
        if find_lower:
            cur_m = find_lower
            freq_cur_m = wiki_db.wiki(cur_m, "wiki", "freq")

    # Try and remove first or last characters (e.g. 'Washington,' to 'Washington')
    # To be error prone, we only try this if no match was found thus far, else
    # this might get in the way of 'U.S.' converting to 'US'.
    # Could do this recursively, interesting to explore in future work.
    if not freq_cur_m:
        # NOTE(review): the '|' characters inside this class are literal
        # alternates, not alternation; pattern kept as-is to preserve behaviour.
        temp = re.sub(r"[\(.|,|!|')]", "", m).strip()
        if wiki_db.wiki(temp, "wiki", "freq"):
            cur_m = temp

    return cur_m

process_results(mentions_dataset, predictions, processed, include_offset=False)

Function that can be used to process the End-to-End results.

Returns:

  • dict – dictionary with the results, keyed by document.

Source code in /home/docs/checkouts/readthedocs.org/user_builds/rel/envs/latest/lib/python3.7/site-packages/REL/utils.py
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
def process_results(
    mentions_dataset,
    predictions,
    processed,
    include_offset=False,
):
    """
    Function that can be used to process the End-to-End results.

    :param mentions_dataset: mapping doc -> list of mention dicts; keys used
        here are ``pos``, ``end_pos``, ``ngram`` and, optionally,
        ``conf_md`` / ``tag``.
    :param predictions: mapping doc -> list of prediction dicts (keys
        ``prediction`` and ``conf_ed``), parallel to the mentions list.
    :param processed: mapping doc -> processed document. Not read anymore;
        kept so the call signature stays unchanged.
    :param include_offset: unused. NOTE(review): looks like it was meant to
        shift ``pos`` by a document offset — currently a no-op; confirm.
    :return: dictionary with results and document as key.
    """
    res = {}
    for doc, ment_doc in mentions_dataset.items():
        if doc not in predictions:
            # No predictions for this document: leave it out of the result.
            continue

        res_doc = []
        for pred, ment in zip(predictions[doc], ment_doc):
            if pred["prediction"] == "NIL":
                # Mention could not be linked to any entity; skip it.
                continue
            mention_length = int(ment["end_pos"] - ment["pos"])
            res_doc.append(
                (
                    ment["pos"],
                    mention_length,
                    ment["ngram"],
                    pred["prediction"],
                    pred["conf_ed"],
                    ment.get("conf_md", 0.0),
                    ment.get("tag", "NULL"),
                )
            )
        res[doc] = res_doc
    return res

split_in_words(inputstr)

This regexp also splits 'AL-NAHAR', which should be a single word into 'AL' and 'NAHAR', resulting in the inability to find a match.

Same with U.S.

Source code in /home/docs/checkouts/readthedocs.org/user_builds/rel/envs/latest/lib/python3.7/site-packages/REL/utils.py
133
134
135
136
137
138
139
140
141
142
143
def split_in_words(inputstr):
    """
    Tokenise *inputstr* into ASCII-folded word tokens.

    The ``\\w+`` pattern also splits 'AL-NAHAR', which should be a single
    word, into 'AL' and 'NAHAR', resulting in the inability to find a
    match. Same with 'U.S.'.

    :param inputstr: text to tokenise.
    :return: list of ASCII-transliterated word tokens.
    """
    # re.findall(r"\w+", ...) returns the same tokens as nltk's
    # RegexpTokenizer(r"\w+").tokenize(...) while avoiding the construction
    # of a tokenizer object on every call.
    return [anyascii_cached(token) for token in re.findall(r"\w+", inputstr)]

split_in_words_mention(inputstr)

This regexp also splits 'AL-NAHAR', which should be a single word into 'AL' and 'NAHAR', resulting in the inability to find a match.

Same with U.S.

Source code in /home/docs/checkouts/readthedocs.org/user_builds/rel/envs/latest/lib/python3.7/site-packages/REL/utils.py
146
147
148
149
150
151
152
153
def split_in_words_mention(inputstr):
    """
    Split *inputstr* on whitespace and ASCII-fold each token.

    Unlike :func:`split_in_words` (which tokenises on ``\\w+``), this plain
    whitespace split keeps forms such as 'AL-NAHAR' and 'U.S.' as single
    tokens. (The previous docstring was copy-pasted from split_in_words and
    incorrectly described regexp splitting.)

    :param inputstr: mention text.
    :return: list of ASCII-transliterated tokens.
    """
    return [anyascii_cached(word) for word in inputstr.split()]