forked from eleyine/wikicode-renderer
-
Notifications
You must be signed in to change notification settings - Fork 2
/
Wikidict.py
135 lines (121 loc) · 5.17 KB
/
Wikidict.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
import string
import re
class Wikidict:
    '''Constructs a wikicode dictionary from a Google Doc .html file.

    ``self.dict`` maps an HTML element name to a sub-dictionary keyed by CSS
    class name (or by one of the placeholder keys 'noclass', 'default' and
    'append'). Each value is a list:

    * [leftWikicodeTag, rightWikicodeTag] for most elements,
    * [degree] for 'li' classes (see addLi),
    * ['#'] or ['*'] for 'ol' classes (see addOther).
    '''

    # Constructor
    def __init__(self, filename):
        '''Read the stylesheet of ``filename`` and build the lookup table.

        Sets ``self.filename``, ``self.style`` (raw CSS text) and
        ``self.dict`` (the element/class lookup table). The parsed CSS
        dictionary is printed to stdout, as the original implementation did.
        '''
        self.filename = filename
        self.style = self.getStyle()
        # Parse the stylesheet once and reuse the result for both the lookup
        # table and the diagnostic print.
        cssdict = self.getCSSDict()
        self.dict = self.getDict(cssdict)
        print(cssdict)

    # Helper methods
    def getStyle(self):
        '''Return the CSS text embedded in the HTML file.

        The whole file is read (not just its first line), and the contents of
        every ``<style type="text/css">`` element are concatenated. Returns an
        empty string when the file has no such element, so that only the
        built-in element defaults end up in the dictionary.
        '''
        style_regex = re.compile(r'<style type="text/css">(.*?)</style>',
                                 re.DOTALL)
        # 'with' guarantees the handle is closed even if reading fails.
        with open(self.filename, "r") as file_object:
            html = file_object.read()
        return ''.join(style_regex.findall(html))

    def getCSSDict(self):
        '''Create a subdictionary of the form
        {classA:{attribute1:value1,attribute2:value2}, classB:...}

        Only rules whose selector contains ``.c<digits>`` are considered.
        If the same class appears in several rules, the last rule replaces
        the earlier ones.
        '''
        cssdict = {}
        class_regex = re.compile(r'\.(c\d*?)\s*\{(.*?)\}', re.DOTALL)
        for class_content in class_regex.finditer(self.style):
            class_name = class_content.group(1)
            attributes = {}
            # Split declarations on ';' and each declaration on its first
            # ':'. Values may therefore contain spaces, quotes or a minus
            # sign without corrupting the neighbouring declarations.
            for declaration in class_content.group(2).split(';'):
                attr, separator, val = declaration.partition(':')
                attr = attr.strip()
                if not separator or not attr:
                    continue  # empty chunk (e.g. trailing ';') or no ':'
                attributes[attr] = val.strip()
            cssdict[class_name] = attributes
        return cssdict

    def getDict(self, cssdict):
        '''Base dictionary of the form
        {element1:{class1:[format]}, element2:{class1:[format], class2:[format]}}

        Starts from the fixed element defaults below, then adds the
        class-specific entries derived from ``cssdict`` by addLi and addOther.
        '''
        wikidict = {
            'title': {'noclass': ['=', '=\n']},
            'h1': {'noclass': ['\n=', '=\n']},
            'h2': {'noclass': ['\n==', '==\n']},
            'h3': {'noclass': ['\n===', '===\n']},
            'h4': {'noclass': ['\n====', '====\n']},
            'h5': {'noclass': ['\n=====', '=====\n']},
            'p': {'noclass': ['\n', '']},
            'b': {'noclass': ["'''", "'''"]},
            'table': {'noclass': ['\n{| class="wikitable" width="100%"', ' \n|}']},
            # if you don't like these default settings, feel free to change them
            'td': {'default': ['\n|', '']},
            'tr': {'append': ['\n|-', '']},
            'li': {}, 'span': {}, 'ol': {},
        }
        wikidict = self.addLi(wikidict, cssdict)
        wikidict = self.addOther(wikidict, cssdict)
        return wikidict

    ## Element-specific helper functions ##

    @staticmethod
    def _indentSortKey(indent):
        '''Sort key ordering CSS lengths by their leading number.

        Values without a leading number sort after all numeric ones. Units
        are not converted; they only break ties between equal numbers.
        '''
        match = re.match(r'\s*([-+]?(?:\d+\.?\d*|\.\d+))', indent)
        if match is None:
            return (1, 0.0, indent)
        return (0, float(match.group(1)), indent)

    # Sort classes according to 'margin-left' values and associate appropriate number of bullets (*)
    def addLi(self, dict, cssdict):
        '''Fill ``dict['li']`` with ``{class_name: [degree]}`` entries.

        Classes that declare 'margin-left' are grouped by that value and the
        distinct values are ranked in increasing numeric order. The smallest
        indent gets degree 2, the next one 3, and so on (the counter starts
        at 1 and is incremented before use, as in the original code).
        Existing 'li' entries are never overwritten. Returns ``dict``.
        '''
        class_indent_map = {}
        for class_name in cssdict:
            if 'margin-left' in cssdict[class_name]:
                indent = cssdict[class_name]['margin-left']
                class_indent_map.setdefault(indent, []).append(class_name)
        # Assign degree to li class. Sorting numerically on the original
        # strings keeps every sorted value a valid key of class_indent_map
        # (including '0pt') and orders values such as '4.5pt' and '36pt'
        # correctly.
        degree = 1
        for indent in sorted(class_indent_map, key=self._indentSortKey):
            degree = degree + 1
            for class_name in class_indent_map[indent]:
                if class_name not in dict['li']:
                    dict['li'][class_name] = [degree]
        return dict

    # Takes care of other elements: bold, italics, underlined text, type of lists (decimal or not)
    def addOther(self, dict, cssdict):
        '''Fill the 'span', 'ol' and 'td' entries of ``dict`` from ``cssdict``.

        * 'span': bold, italic or underline markup. The first matching style
          wins, so a class that is both bold and italic is registered as bold
          only.
          NOTE(review): confirm whether combined markup is wanted.
        * 'ol': ['#'] for 'list-style-type: decimal', ['*'] for any other
          list style; skipped for classes already registered under 'span'.
        * 'td': a cell prefix carrying the recognised cell attributes. Each
          generated prefix is also printed to stdout, as in the original.
          NOTE(review): CSS properties are emitted as bare cell attributes
          rather than inside style="..." - confirm the renderer expects this.

        Returns ``dict``.
        '''
        # attribute 'width' is ignored because it would produce narrow tables
        # if you want to allow that, just add 'width' in the list below
        possible_td_attributes = ['border-color', 'vertical-align', 'border-width', 'border-style', 'padding']
        for class_name in cssdict:
            attributes = cssdict[class_name]
            if attributes.get('font-weight') == 'bold' and class_name not in dict['span']:
                dict['span'][class_name] = ["'''", "'''"]
            if attributes.get('font-style') == 'italic' and class_name not in dict['span']:
                dict['span'][class_name] = ["''", "''"]
            if attributes.get('text-decoration') == 'underline' and class_name not in dict['span']:
                dict['span'][class_name] = ['<u>', '</u>']
            if 'list-style-type' in attributes and class_name not in dict['span']:
                if attributes['list-style-type'] == 'decimal':
                    dict['ol'][class_name] = ['#']
                else:
                    dict['ol'][class_name] = ['*']
            attr_vals = [attribute + '="' + attributes[attribute] + '"'
                         for attribute in possible_td_attributes
                         if attribute in attributes]
            if attr_vals:
                dict['td'][class_name] = ['\n| ' + ' '.join(attr_vals) + ' | ', '']
                # Single-argument print gives the same output on Python 2 and 3.
                print('C: %s' % (dict['td'][class_name],))
            # if 'height' in cssdict[class_name]:
            #     dict['tr'][class_name] = ['\n! ' + cssdict[class_name]['height'] + ' | ', '']
        return dict
def debug():
    '''Prompt for an HTML filename, build its Wikidict and print the result.'''
    # raw_input only exists on Python 2; on Python 3 input() already returns
    # the typed text unevaluated.
    try:
        read_line = raw_input
    except NameError:
        read_line = input
    filename = read_line('Enter filename: ')
    myDict = Wikidict(filename)
    # Single-argument print gives the same output on Python 2 and 3.
    print(myDict.dict)


if __name__ == '__main__':
    debug()