-
Notifications
You must be signed in to change notification settings - Fork 0
/
px_aux.py
133 lines (107 loc) · 3.67 KB
/
px_aux.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
import os
import pickle
from smart_open import open as _Open
from datetime import datetime
# A dictionary with Stanford POS terms (Penn Treebank tag -> description).
# Currently not used.
POSoptions = {
    "CC": "Coordinating conjunction",
    "CD": "Cardinal number",
    "DT": "Determiner",
    "EX": "Existential there",
    "FW": "Foreign word",
    "IN": "Preposition or subordinating conjunction",
    "JJ": "Adjective",
    "JJR": "Adjective, comparative",
    "JJS": "Adjective, superlative",
    "LS": "List item marker",
    "MD": "Modal",
    "NN": "Noun, singular or mass",
    "NNS": "Noun, plural",
    # The Penn Treebank tag for a singular proper noun is "NNP" (its plural
    # counterpart below is "NNPS").
    "NNP": "Proper noun, singular",
    # Legacy spelling kept so that any existing lookup by "NP" keeps working.
    "NP": "Proper noun, singular",
    "NNPS": "Proper noun, plural",
    "PDT": "Predeterminer",
    "POS": "Possessive ending",
    "PRP": "Personal pronoun",
    "PRP$": "Possessive pronoun",
    "RB": "Adverb",
    "RBR": "Adverb, comparative",
    "RBS": "Adverb, superlative",
    "RP": "Particle",
    "SYM": "Symbol",
    "TO": "to",
    "UH": "Interjection",
    "VB": "Verb, base form",
    "VBD": "Verb, past tense",
    "VBG": "Verb, gerund or present participle",
    "VBN": "Verb, past participle",
    "VBP": "Verb, non-3rd person singular present",
    "VBZ": "Verb, 3rd person singular present",
    "WDT": "Wh-determiner",
    "WP": "Wh-pronoun",
    "WP$": "Possessive wh-pronoun",
    "WRB": "Wh-adverb",
}
# Endpoints of DBpedia (SPARQL queries), DBpedia Spotlight, and Wikidata.
URL_DB = "https://dbpedia.org/sparql"
# Alternative DBpedia Spotlight annotate endpoints, kept commented out for reference.
#URL_DB_SL_annotate = "http://model.dbpedia-spotlight.org/en/annotate" # dbpedia spotlight original server
#URL_DB_SL_annotate = "http://api.dbpedia-spotlight.org/en/annotate" # dbpedia spotlight original server
URL_DB_SL_annotate = "http://gssi.det.uvigo.es:2222/rest/annotate" # dbpedia spotlight mac pro server
URL_WK = "https://query.wikidata.org/sparql"
# Folders and filenames involved in corpus construction.
# Folder with training texts.
TEXTS_FOLDER = './texts/'
# NOTE(review): presumably the folder holding the original, unprocessed texts.
# Unlike TEXTS_FOLDER it has no trailing slash -- confirm that callers join
# paths accordingly.
ORIGINAL_TEXTS_FOLDER = './texts/originales'
# Alternative default training file, kept commented out for reference.
#DEFAULT_TRAINING_TEXTS = "historical_modify.txt"
DEFAULT_TRAINING_TEXTS = "originales.s.w"
# Scripts to recompute the training texts after changing parameters.
SCRIPT_STEP2 = "./module_processCorpus/S2.py"
SCRIPT_STEP3 = "./module_processCorpus/S3.py"
# Variable and function to control whether the program prints log messages
# (change FLAG_MES to True if argument -m is given).
FLAG_MES = False


def Print(*args):
    """Print the given values on one line, but only when FLAG_MES is set.

    Each argument is converted with str() and the results are separated by a
    single space, exactly as the built-in print() does by default.

    Args:
        *args: Any number of values to show.

    Returns:
        None.
    """
    if FLAG_MES:
        # print() already stringifies every argument and joins them with a
        # space, so no manual map/join is needed.
        print(*args)
# To save some ASCII content in a file.
def saveFile(f, content):
    """Write content to the file f, replacing any previous contents.

    Args:
        f: Path (or any location accepted by _Open) of the file to write.
        content: Text to store in the file.

    Returns:
        None.
    """
    # The context manager closes the handle even if write() raises, which the
    # previous explicit open/close sequence did not guarantee.
    with _Open(f, 'w') as out:
        out.write(content)
# Flag controlling whether log() actually writes anything.
FLAG_LOG = False


def log(f, line):
    """Append a timestamped line to the file f, but only when FLAG_LOG is set.

    The written record has the form "<str(datetime.now())>: <line>" followed
    by a newline.

    Args:
        f: Path (or any location accepted by _Open) of the log file.
        line: Text of the message to append.

    Returns:
        None.
    """
    if not FLAG_LOG:
        return
    d = str(datetime.now())
    # The context manager closes the handle even if write() raises.
    with _Open(f, "a") as fd:
        fd.write(d+": "+line+"\n")
# To highlight in a file the entities contained in its '.p' and so generate its '.p.html'.
# type = "s" if filename is '.s', implying that the field @surfaceForm must be highlighted.
# type = "w" if filename is '.w', implying that the field entityName must be highlighted.
def getContentMarked(filename, type):
    """Return the text of filename as HTML with its entities turned into links.

    The entities are read from the pickle file filename + ".p", whose
    "byOffset" entry maps a character offset to an entity dictionary. Each
    entity is rendered as "<a href='<@URI>?lang=en'>name</a>" and every
    newline in the surrounding text is followed by "<br>".

    Args:
        filename: Path of the text file to mark up.
        type: "s" to use each entity's "@surfaceForm" as the link text; any
            other value uses "entityName". (The name shadows the built-in
            but is kept because it is part of the function's interface.)

    Returns:
        The HTML string, or the unmodified file content when the ".p" file
        does not exist (a message is printed in that case).

    Raises:
        KeyError: If the pickle lacks "byOffset" or an entity lacks a
            required field.
    """
    with _Open(filename, 'r') as file:
        content = file.read()

    pfilename = filename+".p"
    if not os.path.isfile(pfilename):
        print("Does not exist "+pfilename)
        return content

    # NOTE(review): pickle.load can execute arbitrary code; the '.p' file must
    # come from a trusted source (presumably produced by this project's own
    # corpus pipeline -- confirm).
    with _Open(pfilename, 'rb') as pfile:
        dics = pickle.load(pfile)
    dicOffsets = dics["byOffset"]

    nameField = "@surfaceForm" if type == "s" else "entityName"
    pieces = []
    currentPosition = 0
    # Iteration follows the insertion order of the dictionary, which is
    # supposed to be the increasing offset order.
    for k, entity in dicOffsets.items():
        # Plain text between the previous entity and this one.
        text = content[currentPosition:int(k)]
        pieces.append(text.replace("\n", "\n<br>"))
        name = entity[nameField]
        pieces.append("<a href='"+entity["@URI"]+"?lang=en'>"+name+"</a>")
        # Skip over both the plain text and the entity name in the source.
        currentPosition += len(text) + len(name)

    # Keep the text that follows the last entity; without this the tail of
    # the document (or the whole document when there are no entities) was lost.
    pieces.append(content[currentPosition:].replace("\n", "\n<br>"))
    return "".join(pieces)