-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathwikiconv.py
More file actions
83 lines (63 loc) · 2.71 KB
/
Copy pathwikiconv.py
File metadata and controls
83 lines (63 loc) · 2.71 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
# Python translation of the Perl Wikipedia-XML text filter published at
# https://mattmahoney.net/dc/textdata.html
import sys
import re
# Translation table for str.translate: every ASCII digit maps to its English
# name with one space on each side.
_DIGIT_WORDS = str.maketrans({
    '0': ' zero ',
    '1': ' one ',
    '2': ' two ',
    '3': ' three ',
    '4': ' four ',
    '5': ' five ',
    '6': ' six ',
    '7': ' seven ',
    '8': ' eight ',
    '9': ' nine ',
})


def spell_digits(text):
    """Return *text* with each ASCII digit 0-9 replaced by its English name.

    Every name is wrapped in single spaces (``'7'`` becomes ``' seven '``),
    so adjacent digits come out as separate words. Runs of spaces produced
    this way are not collapsed here. Characters other than the ten ASCII
    digits are returned unchanged.
    """
    return text.translate(_DIGIT_WORDS)
# All patterns are compiled once here because they are applied to every
# input line.
_REDIRECT_RE = re.compile(r'#redirect', re.IGNORECASE)
_XML_TAG_RE = re.compile(r'<.*?>')
_NON_LETTER_RE = re.compile(r'[^a-z]')

# Escaped markup characters that are turned back into the real character
# after the raw XML tags have been removed. '&amp;' is handled first, so a
# doubly escaped sequence such as '&amp;lt;' ends up as '<'.
_ESCAPED_MARKUP = (
    ('&amp;', '&'),
    ('&lt;', '<'),
    ('&gt;', '>'),
)

# (pattern, replacement) pairs applied top to bottom. The order is
# significant: later rules rely on what earlier rules have already removed.
_MARKUP_RULES = (
    # References: <ref ...> ... </ref>
    (re.compile(r'<ref[^<]*?</ref>'), ''),
    # Any remaining XHTML tag
    (re.compile(r'<[^>]*>'), ''),
    # External URLs: drop the address, keep the visible text
    (re.compile(r'\[http:[^\]\s]*'), '['),
    # Image link details: drop layout options and file name, keep the caption
    (re.compile(r'\|thumb', re.IGNORECASE), ''),
    (re.compile(r'\|left', re.IGNORECASE), ''),
    (re.compile(r'\|right', re.IGNORECASE), ''),
    (re.compile(r'\|\d+px', re.IGNORECASE), ''),
    (re.compile(r'\[\[image:[^\[\]]*\|', re.IGNORECASE), ''),
    # Categories: keep only the category name
    (re.compile(r'\[\[category:([^|\]]*)[^\]]*\]\]', re.IGNORECASE), r'[[\1]]'),
    # Links to other languages ([[xx:...]])
    (re.compile(r'\[\[[a-z\-]*:[^\]]*\]\]'), ''),
    # Wiki links: drop the target, keep the visible text
    (re.compile(r'\[\[[^\|\]]*\|'), '[['),
    # Templates: {{...}} first, then single-brace {...}
    (re.compile(r'\{\{[^}]*\}\}'), ''),
    (re.compile(r'\{[^}]*\}'), ''),
    # Leftover square brackets
    (re.compile(r'[\[\]]'), ''),
    # Any entity that is still encoded becomes a space
    (re.compile(r'&[^;]*;'), ' '),
)


def _clean_line(line):
    """Reduce one line of article markup to lowercase words separated by single spaces."""
    # Raw XML tags go first; the escaped markup inside the text is decoded
    # only afterwards so the reference/XHTML rules below can see real tags.
    line = _XML_TAG_RE.sub('', line)
    for escaped, char in _ESCAPED_MARKUP:
        line = line.replace(escaped, char)
    for pattern, replacement in _MARKUP_RULES:
        line = pattern.sub(replacement, line)
    # Lowercase, spell out digits, then turn everything that is not a-z into
    # a space. split()/join collapses the space runs and trims both ends.
    line = spell_digits(line.lower())
    line = _NON_LETTER_RE.sub(' ', line)
    return ' '.join(line.split())


# Filter standard input line by line. Only lines seen while inside a <text>
# element are cleaned and printed; one output line is written per such line.
text_mode = False
for line in sys.stdin:
    if '<text ' in line:
        text_mode = True
    # A redirect marker switches output off until the next '<text ' is seen.
    if _REDIRECT_RE.search(line):
        text_mode = False
    if text_mode:
        # The closing line itself is still processed; output stops after it.
        if '</text>' in line:
            text_mode = False
        print(_clean_line(line))