Commit

Merge pull request #3 from avidale/ruwn-2021
Update the RWN data to 2021 version (RuWordNet 2.0)

avidale authored Sep 14, 2022
2 parents bb17f75 + 2d2fb48 commit 380fc88
Showing 34 changed files with 1,945,794 additions and 45 deletions.
26 changes: 26 additions & 0 deletions .github/workflows/test.yaml
@@ -0,0 +1,26 @@
name: Run the tests

on:
  - push
  - pull_request

jobs:
  test:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v2
      - name: Set up Python 3.7
        uses: actions/setup-python@v2
        with:
          python-version: 3.7
      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install flake8 pytest pycodestyle pytest-cov
          if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
      - name: Lint with pycodestyle
        run: |
          pycodestyle . --max-line-length 120
      - name: Test with pytest
        run: |
          pytest
19 changes: 15 additions & 4 deletions README.md
@@ -3,8 +3,8 @@ This is a Python wrapper for the [RuWordNet](https://ruwordnet.ru/en) thesaurus.

This is a Python wrapper for the [RuWordNet](https://ruwordnet.ru/ru) thesaurus.

-At the moment, the early-2020 version is supported: 49713 synsets
-(sense groups) and 130417 senses of words and phrases.
+At the moment, the 2021 version (RuWordNet 2.0) is supported: 59905 synsets
+(sense groups) and 154111 senses of words and phrases.

Similar and related projects:
- https://www.nltk.org/howto/wordnet.html (the original English WordNet)
@@ -39,9 +39,11 @@ RuWordNet is divided into *synsets* --- sets of synonymous
- `classes` / `instances`: classes and instances, e.g. "Смоленск" / "областной центр"
- `premises` / `conclusions`: premises and the possible conclusions from them, e.g.
"прибежать" / "бегать" (supported only for verbs)
-- `causes` / `effects`: causes and effects, e.g. TODO (only for verbs)
+- `causes` / `effects`: causes and effects, e.g. "толпиться" / "переснимать" (only for verbs)
- `pos_synonyms`: synonyms from other parts of speech
- `antonyms`: antonyms
- `related`: other semantic links, e.g. "овощи" / "овощехранилище"
- `ili`: interlingual index, i.e. the corresponding synset in another language (English)

In addition, there are relations between senses (illustrated in the sketch below):
- `words` / `phrases`: which words a phrase is composed of, and which phrases a word occurs in
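
A sketch of navigating these relations (it assumes the `get_senses` accessor documented for this package and the synset attributes listed above; the example word comes from the `related` example):
```python
from ruwordnet import RuWordNet

wn = RuWordNet()
for sense in wn.get_senses('овощи'):
    print(sense.synset.related)   # related synsets, e.g. one containing "овощехранилище"
    print(sense.synset.antonyms)  # antonymous synsets; the list may be empty
```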
@@ -63,6 +65,15 @@ pip install ruwordnet
ruwordnet download
```

At the moment, several versions of the thesaurus exist:
* the 2020 version is stored in the file `ruwordnet/static/ruwordnet.db`
and is available in package versions `ruwordnet>=0.0.2`;
* the 2021 version (RuWordNet 2.0) is stored in the file `ruwordnet/static/ruwordnet-2021.db`
and is the default starting from package version `ruwordnet>=0.0.4`.
This version has a larger vocabulary and adds the synset properties
`related` (an arbitrary link to another synset)
and `ili` (interlingual index, i.e. a link to the English WordNet).

To use the package, create a `RuWordNet` object:
```python
from ruwordnet import RuWordNet
wn = RuWordNet()
```

@@ -73,7 +84,7 @@
specify the path to the file, or pass an open SQLAlchemy session to the constructor:
```python
from ruwordnet import RuWordNet
-wn = RuWordNet(filename_or_session='ruwordnet/static/ruwordnet.db')
+wn = RuWordNet(filename_or_session='ruwordnet/static/ruwordnet-2021.db')
```
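
An open SQLAlchemy session can likewise be constructed explicitly and handed to the same parameter; a minimal sketch (the engine URL simply repeats the default database path above):
```python
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker

from ruwordnet import RuWordNet

# build a session bound to the SQLite database and pass it to the wrapper
engine = create_engine('sqlite:///ruwordnet/static/ruwordnet-2021.db')
session = sessionmaker(bind=engine)()
wn = RuWordNet(filename_or_session=session)
```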

After that you can, for example, look up the synsets that a given word belongs to.
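
A minimal sketch of such a lookup (the `get_synsets` accessor is assumed here from the package's documented usage; the query word is illustrative):
```python
from ruwordnet import RuWordNet

wn = RuWordNet()
print(wn.get_synsets('замок'))  # all synsets that contain the word "замок"
```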
148 changes: 117 additions & 31 deletions conversion.py
@@ -4,21 +4,25 @@
"""
import argparse
import os
from collections import OrderedDict, defaultdict

from tqdm.auto import tqdm
import xmltodict
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker

from ruwordnet.models import Sense, Synset, Base, hypernymy_table, domains_table, meronymy_table, pos_synonymy_table, \
-   antonymy_table, composition_table, entailment_table, cause_table, derivation_table, instances_table
+   antonymy_table, composition_table, entailment_table, cause_table, derivation_table, instances_table, related_table
from ruwordnet.models import WNSynset, WNSense, ili_table


def load_from_xml(root='.', parts='NVA', file_name='ruwordnet/static/ruwordnet.db'):
    dirname = os.path.dirname(file_name)
    if not os.path.exists(dirname):
        os.makedirs(dirname)

    if os.path.exists(file_name):
        os.remove(file_name)
    engine = create_engine(f'sqlite:///{file_name}', echo=False)
    Base.metadata.create_all(engine)

@@ -60,6 +64,44 @@ def load_from_xml(root='.', parts='NVA', file_name='ruwordnet/static/ruwordnet.d

    session.commit()

    # load interlingual index
    fn = os.path.join(root, f'ili.xml')
    if os.path.exists(fn):
        print('creating foreign Wordnet...')
        with open(fn, 'r', encoding='utf-8') as f:
            ili_raw = xmltodict.parse(f.read(), process_namespaces=True)
        pairs_to_insert = set()
        already = set()
        for match in tqdm(ili_raw['ili']['match']):
            wn_synsets = match['wn-synset']
            if isinstance(wn_synsets, OrderedDict):
                wn_synsets = [wn_synsets]
            for wnss in wn_synsets:
                pairs_to_insert.add((match['rwn-synset']['@id'], wnss['@id']))
                if wnss['@id'] in already:
                    continue
                already.add(wnss['@id'])
                lemmas = wnss['lemma']
                if isinstance(lemmas, OrderedDict):
                    lemmas = [lemmas]
                for s in lemmas:
                    if s['@key'] in already:
                        continue
                    already.add(s['@key'])
                    wn_sense = WNSense(name=s['@name'], key=s['@key'], synset_id=wnss['@id'])
                    session.add(wn_sense)
                wn_synset = WNSynset(
                    id=wnss['@id'],
                    definition=wnss['@definition'],
                )
                session.add(wn_synset)
        session.commit()
        print('connecting synsets with foreign Wordnet...')
        conn = engine.connect()
        conn.execute(ili_table.insert(), [dict(ruwn_id=id1, wn_id=id2) for id1, id2 in pairs_to_insert])
    else:
        print('interlingual index does not exist; skipping it!')

    conn = engine.connect()
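    # Note on the OrderedDict checks above: xmltodict returns a single mapping
    # when an element occurs once, but a list of mappings when it repeats:
    #   xmltodict.parse('<r><s a="1"/></r>')['r']['s']            -> one mapping
    #   xmltodict.parse('<r><s a="1"/><s a="2"/></r>')['r']['s']  -> a list of two
    # (recent xmltodict releases return plain dict rather than OrderedDict).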

    # load synset relations
@@ -68,75 +110,119 @@ def load_from_xml(root='.', parts='NVA', file_name='ruwordnet/static/ruwordnet.d
        fn = os.path.join(root, f'synset_relations.{part}.xml')
        with open(fn, 'r', encoding='utf-8') as f:
            relations = xmltodict.parse(f.read(), process_namespaces=True)
+       rel2values = defaultdict(set)
        for relation in tqdm(relations['relations']['relation']):
            parent_id = relation['@parent_id']
            child_id = relation['@child_id']
            # parent = session.query(Synset).filter_by(id=parent_id).first()
            # child = session.query(Synset).filter_by(id=child_id).first()
-           if relation['@name'] == 'hypernym':
-               insert = hypernymy_table.insert().values(hyponym_id=parent_id, hypernym_id=child_id)
-               conn.execute(insert)
-           elif relation['@name'] == 'instance hypernym':
-               insert = instances_table.insert().values(instance_id=parent_id, class_id=child_id)
-               conn.execute(insert)
-           elif relation['@name'] == 'domain':
-               insert = domains_table.insert().values(domain_item_id=parent_id, domain_id=child_id)
-               conn.execute(insert)
-           elif relation['@name'] == 'part holonym':
-               insert = meronymy_table.insert().values(meronym_id=parent_id, holonym_id=child_id)
-               conn.execute(insert)
-           elif relation['@name'] == 'POS-synonymy':
-               insert = pos_synonymy_table.insert().values(left_id=parent_id, right_id=child_id)
-               conn.execute(insert)
+           rel2values[relation['@name']].add((parent_id, child_id))

        # ['hypernym', 'related', 'POS-synonymy', 'hyponym', 'domain', 'part holonym', 'instance hypernym',
        #  'instance hyponym', 'part meronym', 'antonym'])
        # ['hypernym', 'entailment', 'domain', 'POS-synonymy', 'hyponym', 'cause', 'antonym']
        # ['POS-synonymy', 'domain', 'hypernym', 'hyponym', 'antonym']
        # uncovered: related, hyponym, instance hyponym, part meronym
+       for relation_name, pairs in rel2values.items():
+           if relation_name == 'hypernym':
+               conn.execute(
+                   hypernymy_table.insert(),
+                   [dict(hyponym_id=parent_id, hypernym_id=child_id) for parent_id, child_id in pairs]
+               )
+           elif relation_name == 'instance hypernym':
+               conn.execute(
+                   instances_table.insert(),
+                   [dict(instance_id=parent_id, class_id=child_id) for parent_id, child_id in pairs]
+               )
+           elif relation_name == 'domain':
+               conn.execute(
+                   domains_table.insert(),
+                   [dict(domain_item_id=parent_id, domain_id=child_id) for parent_id, child_id in pairs]
+               )
+           elif relation_name == 'part holonym':
+               conn.execute(
+                   meronymy_table.insert(),
+                   [dict(meronym_id=parent_id, holonym_id=child_id) for parent_id, child_id in pairs]
+               )
+           elif relation_name == 'POS-synonymy':
+               conn.execute(
+                   pos_synonymy_table.insert(),
+                   [dict(left_id=parent_id, right_id=child_id) for parent_id, child_id in pairs]
+               )
            # synonyms are already duplicated in the data
            # insert = pos_synonymy_table.insert().values(right_id=parent_id, left_id=child_id)
            # conn.execute(insert)
-           elif relation['@name'] == 'antonym':
-               insert = antonymy_table.insert().values(left_id=parent_id, right_id=child_id)
-               conn.execute(insert)
+           elif relation_name == 'antonym':
+               conn.execute(
+                   antonymy_table.insert(),
+                   [dict(left_id=parent_id, right_id=child_id) for parent_id, child_id in pairs]
+               )
            # antonyms are already duplicated in the data
            # insert = antonymy_table.insert().values(right_id=parent_id, left_id=child_id)
            # conn.execute(insert)
-           elif relation['@name'] == 'entailment':
-               insert = entailment_table.insert().values(premise_id=parent_id, conclusion_id=child_id)
-               conn.execute(insert)
-           elif relation['@name'] == 'cause':
-               insert = cause_table.insert().values(cause_id=parent_id, effect_id=child_id)
-               conn.execute(insert)
+           elif relation_name == 'entailment':
+               conn.execute(
+                   entailment_table.insert(),
+                   [dict(premise_id=parent_id, conclusion_id=child_id) for parent_id, child_id in pairs]
+               )
+           elif relation_name == 'cause':
+               conn.execute(
+                   cause_table.insert(),
+                   [dict(cause_id=parent_id, effect_id=child_id) for parent_id, child_id in pairs]
+               )
+           elif relation_name == 'related':
+               conn.execute(
+                   related_table.insert(),
+                   [dict(left_id=parent_id, right_id=child_id) for parent_id, child_id in pairs]
+               )
+           else:
+               print('unknown relation name', relation_name)
+       print('relation types', rel2values.keys())
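        # Note on the pattern above: collecting (parent_id, child_id) pairs in
        # sets and executing each table.insert() once with a list of dicts lets
        # SQLAlchemy issue a single executemany-style INSERT per relation type,
        # instead of one round-trip per row as in the replaced code; the sets
        # also deduplicate repeated pairs.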

    print('loading phrases')
    fn = os.path.join(root, 'composed_of.xml')
    with open(fn, 'r', encoding='utf-8') as f:
        relations = xmltodict.parse(f.read(), process_namespaces=True)
+   pairs_to_insert = set()
    for relation in tqdm(relations['senses']['sense']):
        phrase_id = relation['@id']
        words = relation['composed_of']['sense']
        if not isinstance(words, list):
            words = [words]
        for word in words:
            word_id = word['@id']
-           insert = composition_table.insert().values(word_id=word_id, phrase_id=phrase_id)
-           conn.execute(insert)
+           pairs_to_insert.add((word_id, phrase_id))
+   conn.execute(
+       composition_table.insert(),
+       [dict(word_id=word_id, phrase_id=phrase_id) for word_id, phrase_id in pairs_to_insert]
+   )

    print('loading derivations')
    fn = os.path.join(root, 'derived_from.xml')
    with open(fn, 'r', encoding='utf-8') as f:
        relations = xmltodict.parse(f.read(), process_namespaces=True)
+   pairs_to_insert = set()
    for relation in tqdm(relations['senses']['sense']):
        source_id = relation['@id']
        derivatives = relation['derived_from']['sense']
        if not isinstance(derivatives, list):
            derivatives = [derivatives]
        for derivative in derivatives:
            derivative_id = derivative['@id']
-           insert = derivation_table.insert().values(source_id=source_id, derivative_id=derivative_id)
-           conn.execute(insert)
+           pairs_to_insert.add((source_id, derivative_id))
+   conn.execute(
+       derivation_table.insert(),
+       [dict(source_id=source_id, derivative_id=derivative_id) for source_id, derivative_id in pairs_to_insert]
+   )
+   print('All loaded successfully!')


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Convert RuWordNet from xml to sqlite')
-   parser.add_argument('-s', '--source', default='data', help='name of the directory with the source xml files')
-   parser.add_argument('-d', '--destination', default='ruwordnet/static/ruwordnet.db',
+   parser.add_argument(
+       '-s', '--source', default='data/rwn-2021', help='name of the directory with the source xml files'
+   )
+   parser.add_argument('-d', '--destination', default='ruwordnet/static/ruwordnet-2021.db',
                        help='destination database filename')
    args = parser.parse_args()
    load_from_xml(root=args.source, file_name=args.destination)
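
For reference, the conversion can also be driven without the CLI; a minimal sketch (it assumes the script is importable as `conversion` from the repository root and mirrors the argparse defaults above):
```python
from conversion import load_from_xml

# equivalent to: python conversion.py -s data/rwn-2021 -d ruwordnet/static/ruwordnet-2021.db
load_from_xml(root='data/rwn-2021', file_name='ruwordnet/static/ruwordnet-2021.db')
```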