Skip to content

Wikipedia yago freq

WikipediaYagoFreq(base_url, wiki_version, wikipedia)

Class responsible for processing Wikipedia dumps. Performs computations to obtain the p(e|m) index and counts overall occurrences of mentions.

Source code in /home/docs/checkouts/readthedocs.org/user_builds/rel/envs/latest/lib/python3.7/site-packages/REL/wikipedia_yago_freq.py
17
18
19
20
21
22
23
24
def __init__(self, base_url, wiki_version, wikipedia):
    """Store configuration and create the empty count/probability indexes.

    :param base_url: root directory that holds the REL data files.
    :param wiki_version: identifier of the Wikipedia dump to work with.
    :param wikipedia: helper object exposing Wikipedia id/name lookup tables.
    """
    self.base_url = base_url
    self.wiki_version = wiki_version
    self.wikipedia = wikipedia

    # Indexes that compute_wiki()/compute_custom() fill in later.
    self.wiki_freq, self.p_e_m, self.mention_freq = {}, {}, {}

__cross_wiki_counts()

Updates mention/entity for Wiki with this additional corpus.

Returns:

  • –
Source code in /home/docs/checkouts/readthedocs.org/user_builds/rel/envs/latest/lib/python3.7/site-packages/REL/wikipedia_yago_freq.py
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
def __cross_wiki_counts(self):
    """
    Updates mention/entity counts for Wiki by merging in the CrossWiki corpus.

    Each line of the crosswikis file holds a mention followed by
    ``wiki_id,freq,...`` entity entries; counts are accumulated into
    ``self.wiki_freq`` and ``self.mention_freq``. Entity ids unknown to the
    current dump are first resolved through the redirect table.

    :return:
    """

    print("Updating counts by merging with CrossWiki")

    crosswiki_path = os.path.join(
        self.base_url, "generic/p_e_m_data/crosswikis_p_e_m.txt"
    )

    with open(crosswiki_path, "r", encoding="utf-8") as f:
        for line in f:
            parts = line.split("\t")
            mention = unquote(parts[0])

            # Skip self-referential Wikipedia mentions.
            if ("Wikipedia" in mention) or ("wikipedia" in mention):
                continue

            if mention not in self.wiki_freq:
                self.wiki_freq[mention] = {}

            # Index 1 is skipped (presumably an aggregate count for the
            # mention — TODO confirm against the file format); entity
            # entries start at index 2.
            for i in range(2, len(parts)):
                ent_str = parts[i].split(",")
                ent_wiki_id = int(ent_str[0])
                freq_ent = int(ent_str[1])

                # Resolve redirected ids to a canonical entity id.
                if (
                    ent_wiki_id
                    not in self.wikipedia.wiki_id_name_map["ent_id_to_name"]
                ):
                    ent_name_re = self.wikipedia.wiki_redirect_id(ent_wiki_id)
                    if (
                        ent_name_re
                        in self.wikipedia.wiki_id_name_map["ent_name_to_id"]
                    ):
                        ent_wiki_id = self.wikipedia.wiki_id_name_map[
                            "ent_name_to_id"
                        ][ent_name_re]

                # Only count entities that exist in the current dump.
                if (
                    ent_wiki_id
                    in self.wikipedia.wiki_id_name_map["ent_id_to_name"]
                ):
                    if mention not in self.mention_freq:
                        self.mention_freq[mention] = 0
                    self.mention_freq[mention] += freq_ent

                    ent_name = self.wikipedia.wiki_id_name_map[
                        "ent_id_to_name"
                    ][ent_wiki_id].replace(" ", "_")
                    if ent_name not in self.wiki_freq[mention]:
                        self.wiki_freq[mention][ent_name] = 0
                    self.wiki_freq[mention][ent_name] += freq_ent

__extract_text_and_hyp(line)

Extracts hyperlinks from given Wikipedia document to obtain mention/entity counts.

Returns:

  • –

    list of mentions/wiki Ids and their respective counts (plus some statistics).

Source code in /home/docs/checkouts/readthedocs.org/user_builds/rel/envs/latest/lib/python3.7/site-packages/REL/wikipedia_yago_freq.py
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
def __extract_text_and_hyp(self, line):
    """
    Extracts hyperlinks from given Wikipedia document to obtain mention/entity counts.

    :param line: single line of anchor-annotated Wikipedia text containing
        ``<a href="entity">mention</a>`` markup.
    :return: list of mentions/wiki Ids and their respective counts (plus some statistics).
    """

    line = unquote(line)
    list_hyp = []
    num_mentions = 0
    # Marker positions over the whole line; only their counts are returned,
    # as sanity statistics for the caller.
    start_entities = [m.start() for m in re.finditer('<a href="', line)]
    end_entities = [m.start() for m in re.finditer('">', line)]
    end_mentions = [m.start() for m in re.finditer("</a>", line)]

    disambiguation_ent_errors = 0
    start_entity = line.find('<a href="')

    # Consume one anchor per iteration by chopping the processed prefix
    # off `line` and re-searching in the remainder.
    while start_entity >= 0:
        line = line[start_entity + len('<a href="') :]
        end_entity = line.find('">')
        end_mention = line.find("</a>")
        # NOTE(review): if '">' or "</a>" is absent, find() yields -1 and the
        # slices below silently produce a malformed mention — presumably the
        # dump is well-formed; confirm upstream.
        mention = line[end_entity + len('">') : end_mention]

        if (
            ("Wikipedia" not in mention)
            and ("wikipedia" not in mention)
            and (len(mention) >= 1)
        ):
            # Valid mention
            entity = line[0:end_entity]
            # Strip a leading Wiktionary namespace prefix, if present.
            find_wikt = entity.find("wikt:")
            entity = entity[len("wikt:") :] if find_wikt == 0 else entity
            entity = self.wikipedia.preprocess_ent_name(entity)

            # Skip "List of ..." pages and section links (contain '#').
            if entity.find("List of ") != 0:
                if "#" not in entity:
                    ent_wiki_id = self.wikipedia.ent_wiki_id_from_name(entity)
                    if ent_wiki_id == -1:
                        # Entity could not be resolved to an id.
                        disambiguation_ent_errors += 1
                    else:
                        num_mentions += 1
                        # "cnt" stores the running mention count at the time
                        # this pair was seen, not a per-pair frequency.
                        list_hyp.append(
                            {
                                "mention": mention,
                                "ent_wikiid": ent_wiki_id,
                                "cnt": num_mentions,
                            }
                        )
        # find new entity
        start_entity = line.find('<a href="')
    return (
        list_hyp,
        disambiguation_ent_errors,
        [len(start_entities), len(end_entities), len(end_mentions)],
    )

__wiki_counts()

Computes mention/entity for a given Wiki dump.

Returns:

  • –
Source code in /home/docs/checkouts/readthedocs.org/user_builds/rel/envs/latest/lib/python3.7/site-packages/REL/wikipedia_yago_freq.py
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
def __wiki_counts(self):
    """
    Computes mention/entity occurrence counts for a given Wiki dump.

    Iterates over every anchor file, skips documents whose id was already
    processed, and accumulates per-mention entity counts into
    ``self.wiki_freq`` and ``self.mention_freq``.

    :return:
    """

    num_lines = 0
    num_valid_hyperlinks = 0
    disambiguation_ent_errors = 0

    print("Calculating Wikipedia mention/entity occurrences")

    # NOTE(review): de-duplication assumes doc ids never decrease across
    # files in listing order — confirm for the anchor-file layout.
    last_processed_id = -1
    exist_id_found = False

    wiki_anchor_files = os.listdir(
        os.path.join(self.base_url, self.wiki_version, "basic_data/anchor_files/")
    )
    for wiki_anchor in wiki_anchor_files:
        wiki_file = os.path.join(
            self.base_url,
            self.wiki_version,
            "basic_data/anchor_files/",
            wiki_anchor,
        )

        with open(wiki_file, "r", encoding="utf-8") as f:
            for line in f:
                num_lines += 1

                if num_lines % 5000000 == 0:
                    print(
                        "Processed {} lines, valid hyperlinks {}".format(
                            num_lines, num_valid_hyperlinks
                        )
                    )
                if '<doc id="' in line:
                    # Header line: extract the numeric doc id located
                    # between 'id="' and the '" url' attribute.
                    # (renamed from `id`, which shadowed the builtin)
                    doc_id = int(line[line.find("id") + 4 : line.find("url") - 2])
                    if doc_id <= last_processed_id:
                        # Already-seen document: skip its body lines too.
                        exist_id_found = True
                        continue
                    else:
                        exist_id_found = False
                        last_processed_id = doc_id
                else:
                    if not exist_id_found:
                        (
                            list_hyp,
                            disambiguation_ent_error,
                            _stats,  # anchor-marker counts; unused here
                        ) = self.__extract_text_and_hyp(line)

                        disambiguation_ent_errors += disambiguation_ent_error

                        for el in list_hyp:
                            mention = el["mention"]
                            ent_wiki_id = el["ent_wikiid"]

                            num_valid_hyperlinks += 1
                            if mention not in self.wiki_freq:
                                self.wiki_freq[mention] = {}

                            # Only count entities known to this dump.
                            if (
                                ent_wiki_id
                                in self.wikipedia.wiki_id_name_map["ent_id_to_name"]
                            ):
                                if mention not in self.mention_freq:
                                    self.mention_freq[mention] = 0
                                self.mention_freq[mention] += 1

                                ent_name = self.wikipedia.wiki_id_name_map[
                                    "ent_id_to_name"
                                ][ent_wiki_id].replace(" ", "_")
                                if ent_name not in self.wiki_freq[mention]:
                                    self.wiki_freq[mention][ent_name] = 0
                                self.wiki_freq[mention][ent_name] += 1

    print(
        "Done computing Wikipedia counts. Num valid hyperlinks = {}".format(
            num_valid_hyperlinks
        )
    )

__yago_counts()

Counts mention/entity occurrences for YAGO.

Returns:

  • –

    frequency index

Source code in /home/docs/checkouts/readthedocs.org/user_builds/rel/envs/latest/lib/python3.7/site-packages/REL/wikipedia_yago_freq.py
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
def __yago_counts(self):
    """
    Counts mention/entity occurrences for YAGO.

    Parses the aida_means.tsv file (mention TAB entity per line) and records
    each mention/entity pair that resolves to a known Wikipedia entity.

    :return: frequency index
    """

    print("Calculating Yago occurrences")
    custom_freq = {}
    aida_path = os.path.join(self.base_url, "generic/p_e_m_data/aida_means.tsv")

    num_lines = 0
    with open(aida_path, "r", encoding="utf-8") as f:
        for line in f:
            num_lines += 1
            if num_lines % 5000000 == 0:
                print("Processed {} lines.".format(num_lines))

            parts = unquote(line.rstrip()).split("\t")
            # First column is the mention wrapped in quote characters.
            mention = parts[0][1:-1].strip()

            # Second column is the entity with HTML entities unescaped.
            ent_name = (
                parts[1].strip().replace("&amp;", "&").replace("&quot;", '"')
            )

            # Convert embedded \uXXXX escapes to their ASCII equivalent;
            # a bare '%' is doubled so later %-formatting stays safe.
            pos = ent_name.find("\\u")
            while pos != -1:
                code = ent_name[pos : pos + 6]
                replacement = unicode2ascii(code)
                if replacement == "%":
                    replacement = "%%"
                ent_name = ent_name.replace(code, replacement)
                pos = ent_name.find("\\u")

            ent_name = self.wikipedia.preprocess_ent_name(ent_name)
            # Record presence (count fixed at 1) for known entities only.
            if ent_name in self.wikipedia.wiki_id_name_map["ent_name_to_id"]:
                entities = custom_freq.setdefault(mention, {})
                entities.setdefault(ent_name.replace(" ", "_"), 1)

    return custom_freq

compute_custom(custom=None)

Computes p(e|m) index for YAGO and combines this index with the Wikipedia p(e|m) index as reported by Ganea et al. in 'Deep Joint Entity Disambiguation with Local Neural Attention'.

Alternatively, users may specify their own custom p(e|m) index by providing mention/entity counts.

Returns:

  • –
Source code in /home/docs/checkouts/readthedocs.org/user_builds/rel/envs/latest/lib/python3.7/site-packages/REL/wikipedia_yago_freq.py
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
def compute_custom(self, custom=None):
    """
    Computes p(e|m) index for YAGO and combines this index with the Wikipedia p(e|m) index as reported
    by Ganea et al. in 'Deep Joint Entity Disambiguation with Local Neural Attention'.

    Alternatively, users may specify their own custom p(e|m) by providing mention/entity counts.

    :param custom: optional mapping ``{mention: {entity: count}}``; when
        omitted, counts are derived from the YAGO corpus.
    :return:
    """
    if custom:
        self.custom_freq = custom
    else:
        self.custom_freq = self.__yago_counts()

    print("Computing p(e|m)")
    for mention in self.custom_freq:
        total = len(self.custom_freq[mention])

        # Assumes uniform distribution, else total will need to be adjusted.
        if mention not in self.mention_freq:
            self.mention_freq[mention] = 0
        self.mention_freq[mention] += 1
        # Uniform probability over this mention's candidates; the stored
        # counts are ignored by design, so iterate keys only.
        cust_ment_ent_temp = {k: 1 / total for k in self.custom_freq[mention]}

        if mention not in self.p_e_m:
            self.p_e_m[mention] = cust_ment_ent_temp
        else:
            # Merge with an existing index entry for this mention.
            for ent_wiki_id in cust_ment_ent_temp:
                prob = cust_ment_ent_temp[ent_wiki_id]
                if ent_wiki_id not in self.p_e_m[mention]:
                    self.p_e_m[mention][ent_wiki_id] = 0.0

                # Assumes addition of p(e|m) as described by authors,
                # capped at 1.0 and rounded to 3 decimals.
                self.p_e_m[mention][ent_wiki_id] = np.round(
                    min(1.0, self.p_e_m[mention][ent_wiki_id] + prob), 3
                )

compute_wiki()

Computes p(e|m) index for a given wiki and crosswikis dump.

Returns:

  • –
Source code in /home/docs/checkouts/readthedocs.org/user_builds/rel/envs/latest/lib/python3.7/site-packages/REL/wikipedia_yago_freq.py
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
def compute_wiki(self):
    """
    Computes p(e|m) index for a given wiki and crosswikis dump.

    Runs the Wikipedia and CrossWiki counting passes, then normalizes each
    mention's (at most 100 most frequent) candidate counts into
    probabilities stored in ``self.p_e_m``.

    :return:
    """

    self.__wiki_counts()
    self.__cross_wiki_counts()

    # Step 1: Calculate p(e|m) for wiki.
    print("Filtering candidates and calculating p(e|m) values for Wikipedia.")
    for mention, ent_counts in self.wiki_freq.items():
        if len(mention) < 1:
            continue

        # Rank candidates by frequency and keep at most the top 100.
        ranked = sorted(ent_counts.items(), key=lambda kv: kv[1], reverse=True)
        top_candidates = ranked[:100]

        total_count = np.sum([freq for _, freq in top_candidates])
        if total_count < 1:
            continue

        self.p_e_m[mention] = {
            ent_name: freq / total_count for ent_name, freq in top_candidates
        }

    # Raw counts are no longer needed once probabilities are computed.
    del self.wiki_freq

store()

Stores results in a sqlite3 database.

Returns:

  • –
Source code in /home/docs/checkouts/readthedocs.org/user_builds/rel/envs/latest/lib/python3.7/site-packages/REL/wikipedia_yago_freq.py
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
def store(self):
    """
    Stores results in a sqlite3 database.

    Writes ``self.p_e_m`` and ``self.mention_freq`` into the 'wiki' table of
    the generated-data database for this wiki version.

    :return:
    """
    print("Please take a break, this will take a while :).")

    save_dir = os.path.join(self.base_url, self.wiki_version, "generated")
    # Schema: serialized p(e|m) blob, 'lower' text column and an integer
    # frequency per mention (column meanings defined by GenericLookup).
    db = GenericLookup(
        "entity_word_embedding",
        save_dir,
        table_name="wiki",
        columns={"p_e_m": "blob", "lower": "text", "freq": "INTEGER"},
    )
    db.load_wiki(self.p_e_m, self.mention_freq, batch_size=50000, reset=True)