Skip to content

Training datasets

TrainingEvaluationDatasets(base_url, wiki_version)

Class responsible for loading the training and evaluation datasets for local entity disambiguation (ED).

The datasets are read from the CoNLL data as extracted by https://github.com/dalab/deep-ed/.

Source code in /home/docs/checkouts/readthedocs.org/user_builds/rel/envs/latest/lib/python3.7/site-packages/REL/training_datasets.py
12
13
14
15
16
def __init__(self, base_url, wiki_version):
    """
    Loads the person names and stores the data directory of the given wiki version.

    :param base_url: root data directory; the person names are read from
        ``generic/p_e_m_data/persons.txt`` below it.
    :param wiki_version: name of the sub-directory of ``base_url`` that is stored
        as ``self.base_url``.
    """
    persons_path = os.path.join(base_url, "generic/p_e_m_data/persons.txt")
    self.person_names = self.__load_person_names(persons_path)
    self.base_url = os.path.join(base_url, wiki_version)

__find_coref(ment, mentlist)

Attempts to find coreferences

Returns:

  • –

    coreferences

Source code in /home/docs/checkouts/readthedocs.org/user_builds/rel/envs/latest/lib/python3.7/site-packages/REL/training_datasets.py
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
def __find_coref(self, ment, mentlist):
    """
    Collects the mentions of ``mentlist`` that may be coreferent with ``ment``.

    A mention is kept when its first candidate is a known person name, its
    lower-cased text differs from the lower-cased text of ``ment``, and it contains
    that text as a whole span delimited by spaces or the string boundaries. Only the
    first occurrence of the text inside the mention is checked.

    :return: coreferences
    """

    target = ment["mention"].lower()
    target_len = len(target)

    def is_coref(other):
        # Only mentions whose top candidate is a person can act as antecedents.
        cands = other["candidates"]
        if len(cands) == 0 or cands[0][0] not in self.person_names:
            return False

        text = other["mention"].lower()
        if text == target:
            return False

        first = text.find(target)
        if first == -1:
            return False

        # The match must not start or end in the middle of a word.
        last = first + target_len - 1
        left_ok = first == 0 or text[first - 1] == " "
        right_ok = last == len(text) - 1 or text[last + 1] == " "
        return left_ok and right_ok

    return [other for other in mentlist if is_coref(other)]

__load_person_names(path)

Loads person names to find coreferences.

Returns:

  • –

    set of names.

Source code in /home/docs/checkouts/readthedocs.org/user_builds/rel/envs/latest/lib/python3.7/site-packages/REL/training_datasets.py
61
62
63
64
65
66
67
68
69
70
71
72
def __load_person_names(self, path):
    """
    Loads person names to find coreferences.

    Every line of the UTF-8 file at ``path`` is stripped of surrounding whitespace
    and its spaces are replaced by underscores.

    :return: set of names.
    """

    with open(path, "r", encoding="utf8") as handle:
        return {raw.strip().replace(" ", "_") for raw in handle}

__read_pickle_file(path)

Reads pickle file.

Returns:

  • –

    Dataset

Source code in /home/docs/checkouts/readthedocs.org/user_builds/rel/envs/latest/lib/python3.7/site-packages/REL/training_datasets.py
50
51
52
53
54
55
56
57
58
59
def __read_pickle_file(self, path):
    """
    Reads the pickled object stored at ``path``.

    NOTE: unpickling can execute arbitrary code, so ``path`` must point to a file
    from a trusted source.

    :return: Dataset
    """
    with open(path, "rb") as handle:
        return pickle.load(handle)

load()

Loads respective datasets and processes coreferences.

Returns:

  • –

    Returns training/evaluation datasets.

Source code in /home/docs/checkouts/readthedocs.org/user_builds/rel/envs/latest/lib/python3.7/site-packages/REL/training_datasets.py
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
def load(self):
    """
    Loads every training/evaluation dataset and processes its coreferences.

    Each dataset is read from ``<base_url>/generated/test_train_data/<name>.pkl``
    and then passed to ``with_coref``.

    :return: Returns training/evaluation datasets.
    """
    dataset_names = (
        "aida_train",
        "aida_testA",
        "aida_testB",
        "wned-ace2004",
        "wned-aquaint",
        "wned-clueweb",
        "wned-msnbc",
        "wned-wikipedia",
    )
    # Documents dropped from wned-wikipedia before coreference processing.
    excluded_wikipedia_docs = (
        "Jiří_Třanovský",
        "Jiří_Třanovský Jiří_Třanovský",
    )

    loaded = {}
    for name in dataset_names:
        print(f"Loading {name}")
        pkl_path = os.path.join(
            self.base_url, "generated/test_train_data/", f"{name}.pkl"
        )
        corpus = self.__read_pickle_file(pkl_path)
        loaded[name] = corpus

        if name == "wned-wikipedia":
            for doc_name in excluded_wikipedia_docs:
                corpus.pop(doc_name, None)

        self.with_coref(corpus)

    return loaded

with_coref(dataset)

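Finds the coreferences of every mention in the given dataset and, where any exist, replaces that mention's candidates in place with the averaged candidates of its coreferent mentions.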

Returns:

  • –

    dataset

Source code in /home/docs/checkouts/readthedocs.org/user_builds/rel/envs/latest/lib/python3.7/site-packages/REL/training_datasets.py
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
def with_coref(self, dataset):
    """
    Replaces the candidates of every mention that has coreferent mentions.

    For each mention of each document, ``__find_coref`` is asked for the coreferent
    mentions within the same document. When it returns any, the mention's
    ``"candidates"`` entry is overwritten with the union of their candidates, each
    scored by its summed probability divided by the number of coreferent mentions
    and ordered from highest to lowest score.

    The dataset is modified in place, one mention at a time, so the candidates
    written for an earlier mention are what later lookups in the same document see.

    :param dataset: mapping from document name to the list of mention dicts of that
        document; each mention holds ``"candidates"`` as (candidate, probability)
        pairs.
    :return: dataset
    """

    for content in dataset.values():
        for cur_m in content:
            coref = self.__find_coref(cur_m, content)
            if not coref:
                continue

            # Sum the probability of every candidate over all coreferent mentions.
            cur_cands = {}
            for m in coref:
                for c, p in m["candidates"]:
                    cur_cands[c] = cur_cands.get(c, 0) + p

            n_coref = len(coref)
            averaged = [(c, total / n_coref) for c, total in cur_cands.items()]

            # Ascending sort followed by a reversal is kept on purpose: unlike
            # reverse=True it also reverses the relative order of tied scores.
            cur_m["candidates"] = sorted(averaged, key=lambda x: x[1])[::-1]

    return dataset