Torch 1.0.1 #27

Open · belerico wants to merge 40 commits into anhaidgroup:master from belerico:torch_1.0.1

Commits (40)

fcc33bd  Update to torch>=1.0.1 (belerico, May 18, 2019)
d0ee42a  Update fasttext download URLs and add download with resume (belerico, May 18, 2019)
49b3d58  Import torch (belerico, May 18, 2019)
8bd0c8c  Update travis.yml to clone this repo (belerico, May 18, 2019)
85dbeab  Update test_field.py: no existing files (belerico, May 18, 2019)
573c0e5  Add download_with_resume (belerico, May 19, 2019)
992e498  Fix download_with_resume when Content-Length is not set (belerico, May 19, 2019)
ba987bb  Fix file size check bug when Content-Length is not set (belerico, May 19, 2019)
cb94ef0  Check file size (belerico, May 19, 2019)
9b58cb8  Check header before getting resource (belerico, May 20, 2019)
dac85d1  Add to(device) before (belerico, May 27, 2019)
90bf914  Import tqdm correctly (belerico, May 28, 2019)
f62709e  Send all to device (belerico, May 28, 2019)
46c15e5  Send to device (belerico, May 28, 2019)
44c7ab8  Fix bug in output attributes on predictions (belerico, Jun 15, 2019)
e994d95  Fix bug concatenating predictions with raw table (belerico, Jun 15, 2019)
7e4c77b  Fix concat predictions (belerico, Jun 15, 2019)
3ed4aeb  Merge raw and prod table (belerico, Jun 17, 2019)
6c8edfe  Join index when predicting (belerico, Jun 18, 2019)
6c488ad  Update torch version (belerico, Jul 8, 2019)
f8ee7f6  Change preprocess to work with unlabeled data as pandas df (belerico, Jul 11, 2019)
cf68522  Fix bug loading model on different devices (belerico, Jul 11, 2019)
f27c47e  Change pytorch version (belerico, Jul 19, 2019)
e35c9f7  Back to 1.1.0 (belerico, Jul 19, 2019)
02ed0f6  Update setup.py (belerico, Jul 25, 2019)
00d7b77  Update setup.py (belerico, Jul 25, 2019)
59eeded  Update setup.py (belerico, Jul 25, 2019)
c33328c  Update to fasttext 0.9 (Jul 25, 2019)
b820ba7  Fix typo bug (belerico, Jul 25, 2019)
3918b6c  Add download of common crawl and wiki fasttext binaries (belerico, Nov 10, 2019)
08ab046  Fix typo (belerico, Nov 10, 2019)
1664c3a  Change pytorch version (belerico, Nov 12, 2019)
f79b779  Change logical not to latest version of torch (belerico, Nov 12, 2019)
82de9dc  Merge branch 'torch_1.0.1' of https://github.com/belerico/deepmatcher… (belerico, Nov 12, 2019)
071a99f  Update logical not (belerico, Nov 12, 2019)
4b23fea  Try kmax pooling (belerico, Nov 15, 2019)
a9fef87  Add dataset splits to return tables and set random state (belerico, Dec 20, 2019)
1e0dd5c  Fixed minor on attr_condensors (belerico, Dec 20, 2019)
d26eb41  Fixed minor print (belerico, Dec 20, 2019)
f797ce9  Add split only for train/validation (belerico, Dec 20, 2019)

Files changed

2 changes: 1 addition & 1 deletion .travis.yml
@@ -39,7 +39,7 @@ before_install:

 install:
   - conda install --yes python=$PYTHON_VERSION pip scikit-learn nose
-  - pip install --process-dependency-links git+https://github.com/anhaidgroup/deepmatcher | cat
+  - pip install --process-dependency-links git+https://github.com/belerico/deepmatcher@torch_1.0.1 | cat
   - python -m nltk.downloader perluniprops nonbreaking_prefixes punkt

 script:
3 changes: 3 additions & 0 deletions .vscode/settings.json
@@ -0,0 +1,3 @@
{
"python.pythonPath": "/home/belerico/.local/share/virtualenvs/deepmatcher-hlJg2Q1v/bin/python"
}
14 changes: 14 additions & 0 deletions Pipfile
@@ -0,0 +1,14 @@
[[source]]
name = "pypi"
url = "https://pypi.org/simple"
verify_ssl = true

[dev-packages]
nose = "*"
pylint = "*"

[packages]
deepmatcher = {editable = true,path = "."}

[requires]
python_version = "3.7"
373 changes: 373 additions & 0 deletions Pipfile.lock

Large diffs are not rendered by default.

3 changes: 2 additions & 1 deletion deepmatcher/batch.py
@@ -48,7 +48,8 @@ def __new__(cls, *args, **kwargs):
         if 'word_probs' in train_info.metadata:
             raw_word_probs = train_info.metadata['word_probs'][name]
             word_probs = torch.Tensor(
-                [[raw_word_probs[w] for w in b] for b in data.data])
+                # [[raw_word_probs[w] for w in b] for b in data.data])
+                [[raw_word_probs[w] for w in b] for b in data.data.tolist()])
         if data.is_cuda:
             word_probs = word_probs.cuda()
         pc = None
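
Why this change is needed, in a minimal sketch (illustrative names, not part of the PR): in recent PyTorch releases, iterating over a tensor yields 0-dim tensors rather than Python ints, and those do not hash to the dict's integer keys. Converting the batch to nested Python lists first restores plain int lookups:

import torch

# Hypothetical stand-ins for the names in the diff above.
raw_word_probs = {0: 0.5, 1: 0.3, 2: 0.2}   # word id -> probability
data = torch.tensor([[0, 1], [2, 0]])       # a batch of word ids

# Iterating data directly would yield 0-dim tensors, which hash by
# identity and miss the int keys (KeyError); .tolist() gives plain ints.
word_probs = torch.Tensor(
    [[raw_word_probs[w] for w in row] for row in data.tolist()])
print(word_probs)  # tensor([[0.5000, 0.3000], [0.2000, 0.5000]])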
63 changes: 48 additions & 15 deletions deepmatcher/data/dataset.py
@@ -24,15 +24,16 @@

 logger = logging.getLogger(__name__)


 def split(table,
           path,
           train_prefix,
           validation_prefix,
           test_prefix,
           split_ratio=[0.6, 0.2, 0.2],
           stratified=False,
-          strata_field='label'):
+          strata_field='label',
+          random_state=None):
     """Split a pandas dataframe or CSV file into train / validation / test data sets.

     Args:
@@ -47,8 +48,10 @@ def split(table,
             Default is False.
         strata_field (str): name of the examples Field stratified over.
             Default is 'label' for the conventional label field.
+        random_state (tuple): the random seed used for shuffling;
+            a return value of random.getstate().
     """
-    assert len(split_ratio) == 3
+    assert (isinstance(split_ratio, list) and len(split_ratio) <= 3) or (split_ratio >= 0 and split_ratio <= 1)

     if not isinstance(table, pd.DataFrame):
         table = pd.read_csv(table)
@@ -58,15 +61,29 @@
     examples = list(table.itertuples(index=False))
     fields = [(col, None) for col in list(table)]
     dataset = data.Dataset(examples, fields)
-    train, valid, test = dataset.split(split_ratio, stratified, strata_field)
+    if isinstance(split_ratio, list) and len(split_ratio) == 3:
+        train, valid, test = dataset.split(split_ratio, stratified, strata_field, random_state=random_state)

-    tables = (pd.DataFrame(train.examples), pd.DataFrame(valid.examples),
-              pd.DataFrame(test.examples))
-    prefixes = (train_prefix, validation_prefix, test_prefix)
+        tables = (pd.DataFrame(train.examples), pd.DataFrame(valid.examples),
+                  pd.DataFrame(test.examples))
+        prefixes = (train_prefix, validation_prefix, test_prefix)

-    for i in range(len(tables)):
-        tables[i].columns = table.columns
-        tables[i].to_csv(os.path.join(path, prefixes[i]), index=False)
+        for i in range(len(tables)):
+            tables[i].columns = table.columns
+            if path is not None:
+                tables[i].to_csv(os.path.join(path, prefixes[i]), index=False)
+    else:
+        train, test = dataset.split(split_ratio, stratified, strata_field, random_state=random_state)
+
+        tables = (pd.DataFrame(train.examples), pd.DataFrame(test.examples))
+        prefixes = (train_prefix, test_prefix)
+
+        for i in range(len(tables)):
+            tables[i].columns = table.columns
+            if path is not None:
+                tables[i].to_csv(os.path.join(path, prefixes[i]), index=False)
+
+    return tables
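
A usage sketch of the updated split() (hypothetical file names; the float form follows the new else branch above, which writes only the train and test prefixes):

import random
import deepmatcher as dm

random.seed(42)
state = random.getstate()  # the tuple the new random_state parameter expects

# Three-way split: a 3-element ratio list writes train/valid/test CSVs.
dm.data.split('labeled.csv', 'data', 'train.csv', 'valid.csv', 'test.csv',
              split_ratio=[0.6, 0.2, 0.2], random_state=state)

# Two-way split: a float ratio writes only the train and test prefixes.
dm.data.split('labeled.csv', 'data', 'train.csv', None, 'test.csv',
              split_ratio=0.8, random_state=state)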


class MatchingDataset(data.Dataset):
@@ -203,7 +220,7 @@ def _set_attributes(self):
         self.label_field = self.column_naming['label']
         self.id_field = self.column_naming['id']

-    def compute_metadata(self, pca=False):
+    def compute_metadata(self, pca=False, device=None):
         r"""Computes metadata about the dataset.

         Computes the following metadata about the dataset:
@@ -220,20 +237,28 @@ def compute_metadata(self, pca=False):

         Arguments:
             pca (bool): Whether to compute the ``pc`` metadata.
+            device (str or torch.device): The device on which to compute the dataset metadata.
+                Set to 'cpu' to use the CPU only, even if a GPU is available.
+                If None, uses the first available GPU, or the CPU if no GPUs are available.
+                Defaults to None. This is a keyword-only parameter.
         """
+        if device is None:
+            device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
+
         self.metadata = {}

         # Create an iterator over the entire dataset.
         train_iter = MatchingIterator(
-            self, self, train=False, batch_size=1024, device=-1, sort_in_buckets=False)
+            self, self, train=False, batch_size=1024, sort_in_buckets=False, device=device)
         counter = defaultdict(Counter)

         # For each attribute, find the number of times each word id occurs in the dataset.
         # Note that word ids here also include ``UNK`` tokens, padding tokens, etc.
         for batch in pyprind.prog_bar(train_iter, title='\nBuilding vocabulary'):
             for name in self.all_text_fields:
                 attr_input = getattr(batch, name)
-                counter[name].update(attr_input.data.data.view(-1))
+                counter[name].update(attr_input.data.data.view(-1).tolist())

         word_probs = {}
         totals = {}
@@ -270,7 +295,7 @@

         # Create an iterator over the entire dataset.
         train_iter = MatchingIterator(
-            self, self, train=False, batch_size=1024, device=-1, sort_in_buckets=False)
+            self, self, train=False, batch_size=1024, sort_in_buckets=False, device=device)
         attr_embeddings = defaultdict(list)

         # Run the constructed neural network to compute weighted sequence embeddings
@@ -524,11 +549,19 @@ def splits(cls,
             filter_pred (callable or None): Use only examples for which
                 filter_pred(example) is True, or use all examples if None.
                 Default is None. This is a keyword-only parameter.
+            device (str or torch.device): The device on which to compute the dataset metadata.
+                Set to 'cpu' to use the CPU only, even if a GPU is available.
+                If None, uses the first available GPU, or the CPU if no GPUs are available.
+                Defaults to None. This is a keyword-only parameter.

         Returns:
             Tuple[MatchingDataset]: Datasets for (train, validation, and test) splits in
                 that order, if provided.
         """
+        device = kwargs.pop('device', None)
+        if device is None:
+            device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

         fields_dict = dict(fields)
         state_args = {'train_pca': train_pca}
@@ -578,7 +611,7 @@
         logger.info('Vocab construction time: {}s'.format(after_vocab - after_load))

         if train:
-            datasets[0].compute_metadata(train_pca)
+            datasets[0].compute_metadata(train_pca, device)
         after_metadata = timer()
         logger.info(
             'Metadata computation time: {}s'.format(after_metadata - after_vocab))
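
The default-device idiom this PR threads through compute_metadata, splits, and process is the standard PyTorch pattern; a minimal standalone sketch:

import torch

# Prefer the first CUDA device when available, otherwise fall back to CPU.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

# Tensors and iterators built afterwards can then be placed explicitly.
batch = torch.zeros(4, 8, device=device)
print(batch.device)  # cuda:0 on a GPU machine, cpu otherwise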
92 changes: 80 additions & 12 deletions deepmatcher/data/field.py
@@ -6,19 +6,25 @@
 import nltk
 import six

-import fastText
+import fasttext
 import torch
 from torchtext import data, vocab
 from torchtext.utils import download_from_url

+import os
+import time
+import shutil
+from tqdm import tqdm
+import requests
+
 logger = logging.getLogger(__name__)


 class FastText(vocab.Vectors):

     def __init__(self,
                  suffix='wiki-news-300d-1M.vec.zip',
-                 url_base='https://s3-us-west-1.amazonaws.com/fasttext-vectors/',
+                 url_base='https://dl.fbaipublicfiles.com/fasttext/vectors-english/',
                  **kwargs):
         url = url_base + suffix
         base, ext = os.path.splitext(suffix)
@@ -29,12 +35,12 @@ def __init__(self,
 class FastTextBinary(vocab.Vectors):

     name_base = 'wiki.{}.bin'
-    _direct_en_url = 'https://drive.google.com/uc?export=download&id=1Vih8gAmgBnuYDxfblbT94P6WjB7s1ZSh'
+    _direct_en_url = 'https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.en.zip'

-    def __init__(self, language='en', url_base=None, cache=None):
+    def __init__(self, language='en', url_base=None, cache=None, vectors_type=None):
         """
         Arguments:
-            language: Language of fastText pre-trained embedding model
+            language: Language of fasttext pre-trained embedding model
             cache: directory for cached model
         """
         cache = os.path.expanduser(cache)
@@ -43,24 +49,84 @@ def __init__(self, language='en', url_base=None, cache=None):
             self.destination = os.path.join(cache, 'wiki.' + language + '.bin')
         else:
             if url_base is None:
-                url_base = 'https://s3-us-west-1.amazonaws.com/fasttext-vectors/wiki.{}.zip'
+                url_base = 'https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.{}.zip'
             url = url_base.format(language)
-            self.destination = os.path.join(cache, 'wiki.' + language + '.zip')
-        name = FastTextBinary.name_base.format(language)
+            if vectors_type is None:
+                self.destination = os.path.join(cache, 'wiki.' + language + '.zip')
+            else:
+                self.destination = os.path.join(cache, 'wiki_cc.' + language + '.bin.gz')
+        if vectors_type is None:
+            name = FastTextBinary.name_base.format(language)
+        else:
+            name = 'wiki_cc.{}.bin'.format(language)

         self.cache(name, cache, url=url)

     def __getitem__(self, token):
         return torch.Tensor(self.model.get_word_vector(token))

+    def __download_with_resume(self, url, destination):
+        # Handle file:// URLs first: requests cannot HEAD or GET local
+        # files, so check for them before any HTTP round trip.
+        if url.startswith('file://'):
+            url_no_protocol = url.replace('file://', '', 1)
+            if os.path.exists(url_no_protocol):
+                print('File already exists, no need to download')
+                return
+            else:
+                raise Exception('File not found at %s' % url_no_protocol)
+
+        # Check that the requested url is ok, i.e. 200 <= status_code < 400
+        head = requests.head(url)
+        if not head.ok:
+            head.raise_for_status()
+
+        # Don't download if the file exists
+        if os.path.exists(os.path.expanduser(destination)):
+            print('File already exists, no need to download')
+            return
+
+        tmp_file = destination + '.part'
+        first_byte = os.path.getsize(tmp_file) if os.path.exists(tmp_file) else 0
+        chunk_size = 1024 ** 2  # 1 MB
+        file_mode = 'ab' if first_byte else 'wb'
+
+        # Set headers to resume the download from where we left off
+        headers = {"Range": "bytes=%s-" % first_byte}
+        r = requests.get(url, headers=headers, stream=True)
+        file_size = int(r.headers.get('Content-length', -1))
+        if file_size >= 0:
+            # Content-length set
+            file_size += first_byte
+            total = file_size
+        else:
+            # Content-length not set
+            print('Cannot retrieve Content-length from server')
+            total = None
+
+        print('Download from ' + url)
+        print('Starting download at %.1fMB' % (first_byte / (10 ** 6)))
+        print('File size is %.1fMB' % (file_size / (10 ** 6)))
+
+        with tqdm(initial=first_byte, total=total, unit_scale=True) as pbar:
+            with open(tmp_file, file_mode) as f:
+                for chunk in r.iter_content(chunk_size=chunk_size):
+                    if chunk:  # filter out keep-alive new chunks
+                        f.write(chunk)
+                        pbar.update(len(chunk))
+
+        # Rename the temp download file to the correct name once fully downloaded
+        shutil.move(tmp_file, destination)
+
     def cache(self, name, cache, url=None):
         path = os.path.join(cache, name)
         if not os.path.isfile(path) and url:
             logger.info('Downloading vectors from {}'.format(url))
             if not os.path.exists(cache):
                 os.makedirs(cache)
             if not os.path.isfile(self.destination):
-                download_from_url(url, self.destination)
+                # self.__download_with_resume(url, self.destination)
+                self.__download_with_resume(url, self.destination)
             logger.info('Extracting vectors into {}'.format(cache))
             ext = os.path.splitext(self.destination)[1][1:]
             if ext == 'zip':
@@ -72,7 +138,7 @@ def cache(self, name, cache, url=None):
         if not os.path.isfile(path):
             raise RuntimeError('no vectors found at {}'.format(path))

-        self.model = fastText.load_model(path)
+        self.model = fasttext.load_model(path)
         self.dim = len(self['a'])

@@ -143,7 +209,9 @@ def _get_vector_data(cls, vecs, cache):
         if vec_data is None:
             parts = vec_name.split('.')
             if parts[0] == 'fasttext':
-                if parts[2] == 'bin':
+                if parts[1] == 'cc':
+                    vec_data = FastTextBinary(language=parts[2], cache=cache, url_base='https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.{}.300.bin.gz', vectors_type=parts[1])
+                elif parts[2] == 'bin':
                     vec_data = FastTextBinary(language=parts[1], cache=cache)
                 elif parts[2] == 'vec' and parts[1] == 'wiki':
                     vec_data = FastText(
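
The core of __download_with_resume above is an HTTP Range request sized to whatever partial file is already on disk; a standalone sketch with a hypothetical URL (servers that honor Range reply 206 Partial Content and send only the missing bytes):

import os
import requests

url = 'https://example.com/wiki.en.zip'  # hypothetical
tmp_file = 'wiki.en.zip.part'

# Resume from the size of the partial file, if any.
first_byte = os.path.getsize(tmp_file) if os.path.exists(tmp_file) else 0
headers = {'Range': 'bytes=%d-' % first_byte}

with requests.get(url, headers=headers, stream=True) as r:
    with open(tmp_file, 'ab' if first_byte else 'wb') as f:
        for chunk in r.iter_content(chunk_size=1024 ** 2):  # 1 MB chunks
            if chunk:
                f.write(chunk)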
16 changes: 14 additions & 2 deletions deepmatcher/data/process.py
@@ -1,3 +1,4 @@
+import torch
 import copy
 import io
 import logging
@@ -103,7 +104,8 @@ def process(path,
             left_prefix='left_',
             right_prefix='right_',
             use_magellan_convention=False,
-            pca=True):
+            pca=True,
+            **kwargs):
     """Creates dataset objects for multiple splits of a dataset.

     This involves the following steps (if data cannot be retrieved from the cache):
@@ -174,11 +176,20 @@
             Specifically, set them to be '_id', 'ltable_', and 'rtable_' respectively.
         pca (bool): Whether to compute PCA for each attribute (needed for SIF model).
             Defaults to False.
+        device (str or torch.device): The device on which to compute the dataset metadata.
+            Set to 'cpu' to use the CPU only, even if a GPU is available.
+            If None, uses the first available GPU, or the CPU if no GPUs are available.
+            Defaults to None. This is a keyword-only parameter.

     Returns:
         Tuple[MatchingDataset]: Datasets for (train, validation, and test) splits in that
             order, if provided, or dataset for unlabeled, if provided.
     """
+    device = kwargs.get('device')
+    if device is None:
+        device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
+
     if unlabeled is not None:
         raise ValueError('Parameter "unlabeled" has been deprecated, use '
                          '"deepmatcher.data.process_unlabeled" instead.')
@@ -217,7 +228,8 @@
         cache,
         check_cached_data,
         auto_rebuild_cache,
-        train_pca=pca,
+        train_pca=pca,
+        **kwargs)

     # Save additional information to train dataset.
     datasets[0].ignore_columns = ignore_columns
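
A combined usage sketch of the new knobs (hypothetical paths; assumes deepmatcher's usual process() keyword names): embeddings='fasttext.cc.en' exercises the new common-crawl branch in field.py, and device flows through **kwargs to metadata computation.

import deepmatcher as dm

train, validation, test = dm.data.process(
    path='data',
    train='train.csv',
    validation='valid.csv',
    test='test.csv',
    embeddings='fasttext.cc.en',            # or 'fasttext.<lang>.bin' for wiki binaries
    embeddings_cache_path='~/.vector_cache',
    pca=True,
    device='cpu')                           # or None to auto-select a GPU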