Experiences with Doc Retrieval and Sentence Retrieval
pedrojlazevedo committed Mar 26, 2020
1 parent a5959f3 commit 78eac09
Showing 3 changed files with 217 additions and 25 deletions.
33 changes: 18 additions & 15 deletions doc_retrieval.py
@@ -6,6 +6,7 @@
import stringdist
import unicodedata as ud
import clausiepy.clausiepy as clausie
from gensim.parsing.preprocessing import remove_stopwords


def clean_entities(entities):
@@ -17,9 +18,9 @@ def clean_entities(entities):
continue
if entities[i] in entities[j]:
# keep the smaller ones...
ents_to_remove.add(entities[j])
# ents_to_remove.add(entities[j])
# or keep the bigger one...
# ents_to_remove.add(entities[i])
ents_to_remove.add(entities[i])
for ent in ents_to_remove:
entities.remove(ent)

@@ -45,36 +46,38 @@ def get_docs_with_oie(claim, wiki_entities,client):
ents.add(obj.text)
print(ner_spacy)
print(ents)
for ent in ner_spacy:
ents.add(ent.text)

if len(ents) > 5:
if len(ents) > 4:
ents = clean_entities(ents)

ents = list(ents)

for ent in ner_spacy:
_text = ent.text
if not _text in ents:
ents.append( _text)

if "(" in claim:
disambiguation = claim[claim.find("(") + 1:claim.find(")")]
_text += " " + disambiguation
ents.append(_text)

if len(ents) != 0:
_str = ""
for ent in ents:
_str += ent
_str += " "
_str = _str[:-1]
ents.append(_str)

if "film" in claim:
_str += " ( film )"
ents.append(_str)
elif "(" in claim:
disambiguation = claim[claim.find("(") + 1:claim.find(")")]
_str += " " + disambiguation
ents.append(_str)
else:
ents.append(remove_stopwords(claim))
print(ents)
docs, entities = getClosestDocs(wiki_entities, ents)

return docs, entities
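
The gensim `remove_stopwords` import added at the top of the file backs the new fallback branch above: when the claim mentions neither "film" nor a parenthesised disambiguation, the claim itself, minus stopwords, is appended as a lookup string. A tiny illustration, not part of the commit (the claim is invented):

from gensim.parsing.preprocessing import remove_stopwords

# Common English stopwords are dropped, leaving only the content words.
print(remove_stopwords("a view to a kill is an action movie"))  # -> "view kill action movie"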


# getting the 3 closest docs!
# getting the 2 closest docs!
def getClosestDocs(wiki_entities, entities):
entities = list(entities)
for i in range(len(entities)):
@@ -123,7 +126,7 @@ def getClosestDocs(wiki_entities, entities):

selected_docs.append(best_match_1)
selected_docs.append(best_match_2)
selected_docs.append(best_match_3)
# selected_docs.append(best_match_3)
return selected_docs, entities


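Only the tail of `getClosestDocs` is visible above; the change is that the third best match is no longer kept. As a rough sketch of what such a selection step can look like, assuming the ranking relies on the `stringdist` module imported at the top of `doc_retrieval.py` (the helper below is hypothetical, not the repository's code):

import stringdist

def closest_two_docs(wiki_entities, entity):
    # Hypothetical helper: rank candidate wiki page titles by normalized
    # Levenshtein distance to one extracted entity and keep the two best.
    ranked = sorted(
        wiki_entities,
        key=lambda title: stringdist.levenshtein_norm(title.lower(), entity.lower()),
    )
    return ranked[:2]
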
82 changes: 72 additions & 10 deletions run_sentence_selection.py
@@ -3,14 +3,24 @@
import json
from sentence_transformers import SentenceTransformer
import scipy.spatial
from sklearn.metrics.pairwise import cosine_similarity

wiki_split_docs_dir = "data/wiki-pages-split"
relevant_docs_file = "data/dev_concatenation.jsonl"
relevant_sent_file = "data/dev_sentence_selection.jsonl"

relevant_docs_file = jsonlines.open(relevant_docs_file)


# relevant_sent_file = jsonlines.open(relevant_sent_file)

def get_sentence(doc, line_num):
file = codecs.open(wiki_split_docs_dir + "/" + doc + ".json", "r", "utf-8")
try:
file = codecs.open(wiki_split_docs_dir + "/" + doc + ".json", "r", "latin-1")
except:
print("Failed Loading" + str(doc))
return ""

file = json.load(file)
full_lines = file["lines"]
lines = []
@@ -20,35 +30,70 @@ def get_sentence(doc, line_num):
return sentence


def clean_sentence(_sentence):
_sentence = _sentence.replace("-LRB-", "(")
_sentence = _sentence.replace("-RRB-", ")")
_sentence = _sentence.replace("-LSB-", "[")
_sentence = _sentence.replace("-RSB-", "]")
return _sentence
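
The new `clean_sentence` helper maps the bracket tokens used in the wiki dump back to ordinary brackets; for instance (input string invented, uses the helper defined just above):

print(clean_sentence("Savages -LRB- 2012 film -RRB-"))  # -> "Savages ( 2012 film )"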


# model = SentenceTransformer('bert-base-nli-mean-tokens')
embedder = SentenceTransformer('bert-base-wikipedia-sections-mean-tokens')
embedder = SentenceTransformer('bert-large-nli-mean-tokens')
# embedder = SentenceTransformer('bert-base-wikipedia-sections-mean-tokens')

claims = []
for line in relevant_docs_file:
claims.append(line)

# testing
claim_0 = claims[0]
for pair in claim_0['predicted_sentences_ner']:
print(get_sentence(pair[0], pair[1]))
# # testing
# claim_0 = claims[0]
# for pair in claim_0['predicted_sentences_ner']:
# print("\n")
# print(pair[0])
# print(pair[1])
# print(get_sentence(pair[0], pair[1]))

STOP = -1
with jsonlines.open(relevant_sent_file, mode='w') as writer_c:
corpus = []
for claim in claims:
# get all possible sentences
pair_sent_pair = {}

for pair in claim['predicted_sentences_ner']:
sentence = get_sentence(pair[0], pair[1])
corpus.append(sentence)
sentence = clean_sentence(sentence)
title = pair[0].replace("_", " ")

# if not title.lower() in sentence.lower():
# sentence = pair[0] + " " + sentence
pair_sent_pair[sentence] = (pair[0], pair[1])

for pair in claim['predicted_sentences']:
sentence = get_sentence(pair[0], pair[1])
sentence = clean_sentence(sentence)
pair_sent_pair[sentence] = (pair[0], pair[1])

corpus = []
sentence_identifier = []
for key in pair_sent_pair:
corpus.append(key)
sentence_identifier.append(pair_sent_pair[key])

claim['predicted_sentences_bert'] = []

# create embeddings
corpus_embeddings = embedder.encode(corpus)
query_embeddings = embedder.encode(claim['claim'])

# get the n most similar sentences
closest_n = 5
for query, query_embedding in zip(claim, query_embeddings):
for query, query_embedding in zip([claim['claim']], query_embeddings):
distances = scipy.spatial.distance.cdist([query_embedding], corpus_embeddings, "cosine")[0]

# print(query_embedding)
# print(corpus_embeddings)
# print(distances)
# print(scipy.spatial.distance.cdist([query_embedding], corpus_embeddings, "cosine"))
results = zip(range(len(distances)), distances)
results = sorted(results, key=lambda x: x[1])

Expand All @@ -58,3 +103,20 @@ def get_sentence(doc, line_num):

for idx, distance in results[0:closest_n]:
print(corpus[idx].strip(), "(Score: %.4f)" % (1 - distance))
print(sentence_identifier[idx])
# test_1 = query_embedding.reshape(-1, 1)
# test_2 = [query_embedding]
# test_3 = corpus_embeddings[idx]
# test_4 = corpus_embeddings[idx].reshape(-1, 1)
# print(scipy.spatial.distance.cdist([query_embedding],
# [corpus_embeddings[idx]],
# "cosine")[0])
# print(cosine_similarity([query_embedding], [corpus_embeddings[idx]]))

claim['predicted_sentences_bert'].append(sentence_identifier[idx])
writer_c.write(claim)
print(STOP)
if STOP == 0:
break
else:
STOP -= 1
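
The commented-out comparisons above probe one identity: the score printed as `1 - distance` is exactly the cosine similarity that scikit-learn's `cosine_similarity` returns. A self-contained sketch with toy vectors, not part of the commit (the dimensionality is arbitrary and not tied to the embedder):

import numpy as np
import scipy.spatial
from sklearn.metrics.pairwise import cosine_similarity

query_embedding = np.random.rand(768)       # stand-in for an encoded claim
corpus_embeddings = np.random.rand(5, 768)  # stand-ins for encoded sentences

distances = scipy.spatial.distance.cdist([query_embedding], corpus_embeddings, "cosine")[0]
similarities = cosine_similarity([query_embedding], corpus_embeddings)[0]

# 1 - cosine distance equals cosine similarity, up to floating-point error.
assert np.allclose(1 - distances, similarities)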
127 changes: 127 additions & 0 deletions run_sentence_selection_doc.py
@@ -0,0 +1,127 @@
import jsonlines
import codecs
import json
from sentence_transformers import SentenceTransformer
import scipy.spatial
from sklearn.metrics.pairwise import cosine_similarity

wiki_split_docs_dir = "data/wiki-pages-split"
relevant_docs_file = "data/dev_concatenation.jsonl"
relevant_sent_file = "data/dev_sentence_selection_doc.jsonl"

relevant_docs_file = jsonlines.open(relevant_docs_file)


# relevant_sent_file = jsonlines.open(relevant_sent_file)

def get_sentence(doc, line_num):
try:
file = codecs.open(wiki_split_docs_dir + "/" + doc + ".json", "r", "latin-1")
except:
print("Failed Loading" + str(doc))
return ""

file = json.load(file)
full_lines = file["lines"]
lines = []
for line in full_lines:
lines.append(line['content'])
sentence = lines[line_num]
return sentence


def clean_sentence(_sentence):
_sentence = _sentence.replace("-LRB-", "(")
_sentence = _sentence.replace("-RRB-", ")")
_sentence = _sentence.replace("-LSB-", "[")
_sentence = _sentence.replace("-RSB-", "]")
return _sentence


# model = SentenceTransformer('bert-base-nli-mean-tokens')
embedder = SentenceTransformer('bert-large-nli-mean-tokens')
# embedder = SentenceTransformer('bert-base-wikipedia-sections-mean-tokens')

claims = []
for line in relevant_docs_file:
claims.append(line)

# # testing
# claim_0 = claims[0]
# for pair in claim_0['predicted_sentences_ner']:
# print("\n")
# print(pair[0])
# print(pair[1])
# print(get_sentence(pair[0], pair[1]))

STOP = -1
with jsonlines.open(relevant_sent_file, mode='w') as writer_c:
for claim in claims:
# get all possible sentences
corpus = {}
sentence_identifier = {}
for pair in claim['predicted_sentences_ner']:
doc = pair[0]
if not doc in corpus:
corpus[doc] = []
sentence_identifier[doc] = []
sentence = get_sentence(doc, pair[1])
sentence = clean_sentence(sentence)
title = doc.replace("_", " ")

# if not title.lower() in sentence.lower():
# sentence = pair[0] + " " + sentence
corpus[doc].append(sentence)
sentence_identifier[doc].append((doc, pair[1]))

# for pair in claim['predicted_sentences']:
# sentence = get_sentence(pair[0], pair[1])
# sentence = clean_sentence(sentence)
# corpus.add(sentence)
# sentence_identifier.add((pair[0], pair[1]))

claim['predicted_sentences_bert'] = []

# create embeddings

for doc in corpus:
all_sentences = list(corpus[doc])
all_sentences_identifier = list(sentence_identifier[doc])
corpus_embeddings = embedder.encode(all_sentences)
query_embeddings = embedder.encode(claim['claim'])

# get the n most similar sentences
closest_n = 2
for query, query_embedding in zip([claim['claim']], query_embeddings):
distances = scipy.spatial.distance.cdist([query_embedding], corpus_embeddings, "cosine")[0]
# print(query_embedding)
# print(corpus_embeddings)
# print(distances)
# print(scipy.spatial.distance.cdist([query_embedding], corpus_embeddings, "cosine"))
results = zip(range(len(distances)), distances)
results = sorted(results, key=lambda x: x[1])

print("\n\n======================\n\n")
print("Query:", query)
print("\nTop 5 most similar sentences in corpus:")

for idx, distance in results[0:closest_n]:
print(all_sentences[idx].strip(), "(Score: %.4f)" % (1 - distance))
print(all_sentences_identifier[idx])
# test_1 = query_embedding.reshape(-1, 1)
# test_2 = [query_embedding]
# test_3 = corpus_embeddings[idx]
# test_4 = corpus_embeddings[idx].reshape(-1, 1)
# print(scipy.spatial.distance.cdist([query_embedding],
# [corpus_embeddings[idx]],
# "cosine")[0])
# print(cosine_similarity([query_embedding], [corpus_embeddings[idx]]))

claim['predicted_sentences_bert'].append(all_sentences_identifier[idx])
print(claim['predicted_sentences_bert'])
writer_c.write(claim)
print(STOP)
if STOP == 0:
break
else:
STOP -= 1
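
One way to sanity-check the output this script writes, not part of the commit (the file path and field names follow the code above):

import jsonlines

with jsonlines.open("data/dev_sentence_selection_doc.jsonl") as reader:
    for claim in reader:
        # Each entry is a (wiki page, line number) pair kept by the per-document ranking.
        pairs = claim.get("predicted_sentences_bert", [])
        print(claim["claim"], "->", len(pairs), "candidate sentence(s)")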
