Skip to content

Utils

flatten_list_of_lists(list_of_lists)

making inputs to torch.nn.EmbeddingBag

Source code in /home/docs/checkouts/readthedocs.org/user_builds/rel/envs/latest/lib/python3.7/site-packages/REL/utils.py
163
164
165
166
167
168
169
170
def flatten_list_of_lists(list_of_lists):
    """
    Flatten nested lists into inputs for torch.nn.EmbeddingBag.

    :param list_of_lists: list of lists of items (e.g. token ids).
    :return: tuple ``(flatten, offsets)`` where ``flatten`` is one flat list
        with all items concatenated in order, and ``offsets`` is a numpy
        array holding the start index of each sub-list inside ``flatten``.
    """
    # Offsets are the cumulative lengths shifted right by one: the first
    # sub-list starts at 0, each next one where the previous ended.
    lengths = [len(sub) for sub in list_of_lists]
    offsets = np.cumsum([0] + lengths)[:-1]
    # A single comprehension is O(total items); the previous
    # sum(list_of_lists, []) idiom was quadratic in the number of sub-lists.
    flatten = [item for sub in list_of_lists for item in sub]
    return flatten, offsets

is_important_word(s) cached

an important word is not a stopword, is not a number, and is longer than one character

Source code in /home/docs/checkouts/readthedocs.org/user_builds/rel/envs/latest/lib/python3.7/site-packages/REL/utils.py
185
186
187
188
189
190
191
192
193
194
195
196
@lru_cache(maxsize=None)
def is_important_word(s):
    """
    Return True when *s* is an "important" word.

    A word is important when it is not a stopword, is not a number, and is
    longer than one character.

    :param s: word to test (assumed to be a str — TODO confirm callers).
    :return: bool
    """
    if len(s) <= 1 or s.lower() in STOPWORDS:
        return False
    # Keep the try body minimal and the exception specific: only the
    # numeric parse can legitimately fail here (the original used a bare
    # except around the whole body, which hid unrelated errors).
    try:
        float(s)
    except ValueError:
        # Not parseable as a number -> important.
        return True
    return False

preprocess_mention(m, wiki_db)

Responsible for preprocessing a mention and making sure we find a set of matching candidates in our database.

Returns:

  • mention – the preprocessed mention, rewritten so that it matches candidates in the database.

Source code in /home/docs/checkouts/readthedocs.org/user_builds/rel/envs/latest/lib/python3.7/site-packages/REL/utils.py
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
def preprocess_mention(m, wiki_db):
    """
    Responsible for preprocessing a mention and making sure we find a set
    of matching candidates in our database.

    :param m: raw mention string.
    :param wiki_db: database wrapper exposing ``wiki(key, table, column)``;
        assumed to be deterministic per key — TODO confirm.
    :return: mention (possibly rewritten so candidates exist in the db)
    """
    # Track the frequency alongside the candidate mention so each form is
    # looked up at most once (the previous version issued up to six
    # redundant "freq" queries).
    cur_m = modify_uppercase_phrase(m)
    freq_cur_m = wiki_db.wiki(cur_m, "wiki", "freq")
    freq_m = wiki_db.wiki(m, "wiki", "freq")

    if not freq_cur_m:
        # Uppercase normalisation found nothing; fall back to the raw mention.
        cur_m, freq_cur_m = m, freq_m

    if freq_m and freq_m > freq_cur_m:
        # Cases like 'U.S.' are handled badly by modify_uppercase_phrase,
        # so prefer the raw mention when it is the more frequent form.
        cur_m, freq_cur_m = m, freq_m

    # If we cannot find the exact mention in our index, we try our luck to
    # find it in a case insensitive index.
    if not freq_cur_m:
        # cur_m and m both not found, verify if lower-case version can be found.
        find_lower = wiki_db.wiki(m.lower(), "wiki", "lower")
        if find_lower:
            cur_m = find_lower
            freq_cur_m = wiki_db.wiki(cur_m, "wiki", "freq")

    # Try and remove first or last characters (e.g. 'Washington,' to 'Washington')
    # To be error prone, we only try this if no match was found thus far, else
    # this might get in the way of 'U.S.' converting to 'US'.
    # Could do this recursively, interesting to explore in future work.
    if not freq_cur_m:
        # NOTE(review): the '|' characters inside this class are literal
        # alternates, not alternation; pattern kept as-is to preserve behaviour.
        temp = re.sub(r"[\(.|,|!|')]", "", m).strip()
        if wiki_db.wiki(temp, "wiki", "freq"):
            cur_m = temp

    return cur_m

process_results(mentions_dataset, predictions, processed, include_offset=False)

Function that can be used to process the End-to-End results.

Returns:

  • dict – dictionary with the results, keyed by document.

Source code in /home/docs/checkouts/readthedocs.org/user_builds/rel/envs/latest/lib/python3.7/site-packages/REL/utils.py
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
def process_results(
    mentions_dataset,
    predictions,
    processed,
    include_offset=False,
):
    """
    Function that can be used to process the End-to-End results.

    :param mentions_dataset: mapping doc -> list of mention dicts; keys used
        here are ``pos``, ``end_pos``, ``ngram`` and, optionally,
        ``conf_md`` / ``tag``.
    :param predictions: mapping doc -> list of prediction dicts (keys
        ``prediction`` and ``conf_ed``), parallel to the mentions list.
    :param processed: mapping doc -> processed document. Not read anymore;
        kept so the call signature stays unchanged.
    :param include_offset: unused. NOTE(review): looks like it was meant to
        shift ``pos`` by a document offset — currently a no-op; confirm.
    :return: dictionary with results and document as key.
    """
    res = {}
    for doc, ment_doc in mentions_dataset.items():
        if doc not in predictions:
            # No predictions for this document: leave it out of the result.
            continue

        res_doc = []
        for pred, ment in zip(predictions[doc], ment_doc):
            if pred["prediction"] == "NIL":
                # Mention could not be linked to any entity; skip it.
                continue
            mention_length = int(ment["end_pos"] - ment["pos"])
            res_doc.append(
                (
                    ment["pos"],
                    mention_length,
                    ment["ngram"],
                    pred["prediction"],
                    pred["conf_ed"],
                    ment.get("conf_md", 0.0),
                    ment.get("tag", "NULL"),
                )
            )
        res[doc] = res_doc
    return res

split_in_words(inputstr)

This regexp also splits 'AL-NAHAR', which should be a single word into 'AL' and 'NAHAR', resulting in the inability to find a match.

Same with U.S.

Source code in /home/docs/checkouts/readthedocs.org/user_builds/rel/envs/latest/lib/python3.7/site-packages/REL/utils.py
133
134
135
136
137
138
139
140
141
142
143
def split_in_words(inputstr):
    """
    Tokenise *inputstr* into ASCII-folded word tokens.

    The ``\\w+`` pattern also splits 'AL-NAHAR', which should be a single
    word, into 'AL' and 'NAHAR', resulting in the inability to find a
    match. Same with 'U.S.'.

    :param inputstr: text to tokenise.
    :return: list of ASCII-transliterated word tokens.
    """
    # re.findall(r"\w+", ...) returns the same tokens as nltk's
    # RegexpTokenizer(r"\w+").tokenize(...) while avoiding the construction
    # of a tokenizer object on every call.
    return [anyascii_cached(token) for token in re.findall(r"\w+", inputstr)]

split_in_words_mention(inputstr)

This regexp also splits 'AL-NAHAR', which should be a single word into 'AL' and 'NAHAR', resulting in the inability to find a match.

Same with U.S.

Source code in /home/docs/checkouts/readthedocs.org/user_builds/rel/envs/latest/lib/python3.7/site-packages/REL/utils.py
146
147
148
149
150
151
152
153
def split_in_words_mention(inputstr):
    """
    Split *inputstr* on whitespace and ASCII-fold each token.

    Unlike :func:`split_in_words` (which tokenises on ``\\w+``), this plain
    whitespace split keeps forms such as 'AL-NAHAR' and 'U.S.' as single
    tokens. (The previous docstring was copy-pasted from split_in_words and
    incorrectly described regexp splitting.)

    :param inputstr: mention text.
    :return: list of ASCII-transliterated tokens.
    """
    return [anyascii_cached(word) for word in inputstr.split()]