-
Notifications
You must be signed in to change notification settings - Fork 0
/
WordNet.py
123 lines (107 loc) · 4.11 KB
/
WordNet.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#-----------------------------------------------------------------------------------
# Marta Villegas (UAB) VIW project (Visual into Words http://pagines.uab.cat/viw)
#
# Adds WordNet semantic classes to freeling files (for Nouns, Verbs and Adjectives)
#
# This script uses the IULA sparql server: http://lodserver.iula.upf.edu/sparql
#
# No semantic disambiguation is performed (manual checking required...)
#
#-----------------------------------------------------------------------------------
from SPARQLWrapper import SPARQLWrapper, JSON
from collections import defaultdict
import sys
import urllib
import re
# Shared SPARQL client; getSparql() sets the query and return format on it.
sparql = SPARQLWrapper("http://lodserver.iula.upf.edu/sparql")

# Command line: <input-file> <lang>
args = sys.argv
if len(args) < 3:
    # Fail with a readable message instead of a bare IndexError below.
    sys.exit("usage: WordNet.py <input-file> <lang>")

# Read the whole input up front as whitespace-split token lists; the context
# manager closes the input handle, which the original one-liner leaked.
with open(args[1]) as src:
    lines = [line.split() for line in src]

lang = args[2]
# Graph IRI placed after FROM in the query built by getSparql().
iri = "<http://ewn" + lang + ".edu>"

# Output name: the input path minus its last four characters plus "-2.txt"
# (so "foo.txt" -> "foo-2.txt"; assumes a four-character suffix).
outFile = args[1][:-4] + "-2.txt"
# Written to by main(); stays open at module level for that purpose.
f1 = open(outFile, 'w+')
def _as_text(token):
    """Return *token* as text, decoding UTF-8 when it is a byte string."""
    # On Python 2 the tokens read from the input file are byte strings;
    # on Python 3 they are already text and are passed through unchanged.
    if isinstance(token, bytes):
        return token.decode("utf-8")
    return token


def _lookup_class(token, pos):
    """Return the class name of the first ?plus binding for *token*, or None.

    The class name is the segment following the first '#' in the binding's
    value. None is returned when the query yields no bindings or when the
    value contains no '#'.
    """
    bindings = getSparql(_as_text(token), iri, pos)["results"]["bindings"]
    if not bindings:
        return None
    parts = bindings[0]["plus"]["value"].split("#")
    if len(parts) < 2:
        return None
    label = parts[1]
    # Keep the label the same string type as the other columns: on Python 2
    # the JSON result is unicode while the input tokens are byte strings, and
    # joining the two would implicitly ASCII-decode the tokens and fail on
    # any non-ASCII character.
    if not isinstance(label, str):
        label = label.encode("utf-8")
    return label


def _annotation(label, prefix=""):
    """Return the column to append: prefix + label, or "null" for no label."""
    if label is None:
        return "null"
    return prefix + label


def main(lines, lang):
    """Annotate each token line with a semantic class and write it to f1.

    Each element of *lines* is a list of columns; column 1 is used as the
    lookup key and column 2 as the part-of-speech tag. Depending on the tag,
    one extra column is appended to the list (in place) and the columns are
    written tab-separated to the module-level file ``f1``:

    * VM (not VMP)  -> verb class, or "null"
    * VA            -> nothing appended; marks that an auxiliary was seen
    * VMP after VA  -> verb class, or "null"
    * VMP otherwise -> "A-" + verb class, or "null"
    * AQ            -> "A-" + adjective class, or "null"
    * NC            -> class if it is "BodyPart" or "Clothing", else "null"

    Any other line (including lines with fewer than three columns) is
    written unchanged and clears the auxiliary marker.

    Args:
        lines: list of token lists, mutated in place.
        lang: not used by this function; lookups use the module-level
            ``iri``.
    """
    verb = re.compile(r'^VM[^P].*')
    verb_participle = re.compile(r'^VMP.*')
    aux = re.compile(r'^VA.*')
    adj = re.compile(r'^AQ.*')
    noun = re.compile(r'^NC.*')
    kept_noun_classes = ("BodyPart", "Clothing")

    after_aux = False
    for fields in lines:
        # Lines with fewer than three columns have no tag; the empty tag
        # matches none of the patterns (the original indexed column 2 after
        # only checking for two columns and could raise IndexError).
        tag = fields[2] if len(fields) > 2 else ""

        if verb.match(tag):                       # 'common' verb form
            fields.append(_annotation(_lookup_class(fields[1], 'verb')))
        elif aux.match(tag):                      # auxiliary
            after_aux = True
        elif verb_participle.match(tag):
            label = _lookup_class(fields[1], 'verb')
            if after_aux:                         # e.g. 'ha fet'
                fields.append(_annotation(label))
            else:                                 # e.g. 'preocupat'
                fields.append(_annotation(label, "A-"))
        elif adj.match(tag):                      # adjective
            fields.append(_annotation(_lookup_class(fields[1], 'adjective'), "A-"))
        elif noun.match(tag):                     # noun
            label = _lookup_class(fields[1], 'noun')
            # Only the listed classes are kept. A noun with no result also
            # gets "null" so every annotated line has the same number of
            # columns (the original appended nothing in that case).
            if label in kept_noun_classes:
                fields.append(label)
            else:
                fields.append("null")
        else:
            after_aux = False

        f1.write('\t'.join(fields))
        f1.write('\n')
def getSparql(word, iri, pos):
    """Run the ?plus lookup for *word* on the module-level ``sparql`` client.

    Builds a SELECT DISTINCT query over the graph *iri* for entries whose
    lexinfo part of speech is *pos* and whose canonical form has *word* as
    its written representation, and returns the converted JSON result.

    Args:
        word: written representation to look up; escaped before being placed
            inside the single-quoted literal.
        iri: graph reference inserted verbatim after FROM (including the
            angle brackets).
        pos: local name appended to the ``lexinfo:`` prefix, e.g. 'verb'.

    Returns:
        The object returned by ``sparql.query().convert()`` with the return
        format set to JSON.
    """
    head = """
prefix lemon: <http://lemon-model.net/lemon#>
prefix lexinfo: <http://www.lexinfo.net/ontology/2.0/lexinfo#>
SELECT DISTINCT ?plus FROM """
    where = """
WHERE {
?entry lemon:canonicalForm ?form ; lexinfo:partOfSpeech lexinfo:"""
    where1 = """ ; lemon:sense ?sense .
?form lemon:writtenRepresentation '"""
    where2 = """'.
?sense <http://lodserver.iula.upf.edu/euroWordNetMCR/sumo_plus> ?plus .
}"""
    # word comes from the input file: escape the characters that would end or
    # corrupt the single-quoted literal. Backslash goes first so the escapes
    # added afterwards are not themselves doubled.
    literal = (word.replace("\\", "\\\\")
                   .replace("'", "\\'")
                   .replace("\n", "\\n")
                   .replace("\r", "\\r"))
    query = head + iri + where + pos + where1 + literal + where2
    sparql.setQuery(query)
    sparql.setReturnFormat(JSON)
    return sparql.query().convert()
# Run the annotation pass, then always close the output file so buffered
# lines are flushed to disk even if a lookup raises part-way through.
try:
    main(lines, iri)
finally:
    f1.close()