# wikiconv.py - from the daedalus/AIscripts repository on GitHub (master branch)
# translation from https://mattmahoney.net/dc/textdata.html

import sys
import re

def spell_digits(text):
    """Return *text* with every ASCII digit replaced by its English name.

    Each name is padded with one space on both sides (``'7'`` becomes
    ``' seven '``), so adjacent digits come out as separate words.  Runs of
    spaces produced this way are left for the caller to squeeze.
    """
    names = ('zero', 'one', 'two', 'three', 'four',
             'five', 'six', 'seven', 'eight', 'nine')
    # Map the code point of each digit character to its padded word; the
    # words contain no digits, so a single pass is all that is needed.
    table = {ord(str(value)): ' ' + name + ' '
             for value, name in enumerate(names)}
    return text.translate(table)

# Ordered markup clean-up rules, applied top to bottom to every kept line.
# The order matters: entities are decoded before the reference/XHTML rules so
# that escaped markup (e.g. "&lt;ref&gt;") is recognised, and link rules run
# before the bare brackets are dropped.  Patterns are compiled once here
# instead of being looked up again for every input line.
_MARKUP_RULES = (
    # XML tags present in the raw line
    (re.compile(r'<.*?>'), ''),
    # Decode the escaped characters; "&amp;" goes first, so a doubly escaped
    # "&amp;lt;" also ends up as "<"
    (re.compile(r'&amp;'), '&'),
    (re.compile(r'&lt;'), '<'),
    (re.compile(r'&gt;'), '>'),
    # References: <ref ...> ... </ref> with no nested "<" in between
    (re.compile(r'<ref[^<]*?</ref>'), ''),
    # Remaining XHTML tags exposed by the decoding above
    (re.compile(r'<[^>]*>'), ''),
    # External links: drop the URL, keep the visible label.  Both http and
    # https are handled so https links do not leak URL fragments as words.
    (re.compile(r'\[https?:[^\]\s]*'), '['),
    # Image link details; the caption is kept
    (re.compile(r'\|thumb', re.IGNORECASE), ''),
    (re.compile(r'\|left', re.IGNORECASE), ''),
    (re.compile(r'\|right', re.IGNORECASE), ''),
    (re.compile(r'\|\d+px', re.IGNORECASE), ''),
    (re.compile(r'\[\[image:[^\[\]]*\|', re.IGNORECASE), ''),
    # Categories: keep only the category name
    (re.compile(r'\[\[category:([^|\]]*)[^\]]*\]\]', re.IGNORECASE), r'[[\1]]'),
    # [[prefix:...]] links with a lowercase prefix (links to other languages)
    (re.compile(r'\[\[[a-z\-]*:[^\]]*\]\]'), ''),
    # Piped wiki links: drop the target, keep the visible text
    (re.compile(r'\[\[[^\|\]]*\|'), '[['),
    # Templates: {{...}} first, then single-brace {...}
    (re.compile(r'\{\{[^}]*\}\}'), ''),
    (re.compile(r'\{[^}]*\}'), ''),
    # Leftover square brackets
    (re.compile(r'[\[\]]'), ''),
    # Any entity that is still encoded becomes a space
    (re.compile(r'&[^;]*;'), ' '),
)

_REDIRECT = re.compile(r'#redirect', re.IGNORECASE)
_NON_LETTER = re.compile(r'[^a-z]')


def _clean_line(line):
    """Reduce one line of wiki text to lowercase a-z words separated by spaces.

    Applies ``_MARKUP_RULES`` in order, lowercases the result, spells out
    digits via ``spell_digits``, turns every character other than a-z into a
    space, and finally collapses whitespace.  Returns an empty string when
    nothing is left.
    """
    for pattern, replacement in _MARKUP_RULES:
        line = pattern.sub(replacement, line)
    line = spell_digits(line.lower())
    line = _NON_LETTER.sub(' ', line)
    # Only letters and spaces remain, so split/join squeezes runs of spaces
    # and trims both ends in one step.
    return ' '.join(line.split())


# True while the current input line belongs to a <text ...> ... </text> span
# that should be emitted.
text_mode = False

# Filter stdin to stdout: one cleaned output line per kept input line.
for line in sys.stdin:
    if '<text ' in line:
        text_mode = True
    # A line mentioning #redirect (any case) switches output off until the
    # next '<text ' is seen.
    if _REDIRECT.search(line):
        text_mode = False
    if not text_mode:
        continue
    # The closing line itself is still cleaned and printed; later lines are
    # skipped.
    if '</text>' in line:
        text_mode = False
    print(_clean_line(line))