Torch 1.0.1 #27

Open · belerico wants to merge 40 commits into anhaidgroup:master from belerico:torch_1.0.1

Commits (40)

fcc33bd  Update to torch>=1.0.1 (belerico, May 18, 2019)
d0ee42a  Update fasttext download URLs and add download with resume (belerico, May 18, 2019)
49b3d58  Import torch (belerico, May 18, 2019)
8bd0c8c  Update travis.yml to clone this repo (belerico, May 18, 2019)
85dbeab  Update test_field.py: no existing files (belerico, May 18, 2019)
573c0e5  Add download_with_resume (belerico, May 19, 2019)
992e498  Fix download_with_resume when Content-Length is not set (belerico, May 19, 2019)
ba987bb  Fix file size check bug when Content-Length is not set (belerico, May 19, 2019)
cb94ef0  Check file size (belerico, May 19, 2019)
9b58cb8  Check header before getting resource (belerico, May 20, 2019)
dac85d1  Add to(device) before (belerico, May 27, 2019)
90bf914  Import tqdm correctly (belerico, May 28, 2019)
f62709e  Send all to device (belerico, May 28, 2019)
46c15e5  Send to device (belerico, May 28, 2019)
44c7ab8  Fix bug in output attributes on predictions (belerico, Jun 15, 2019)
e994d95  Fix bug concatenating predictions with raw table (belerico, Jun 15, 2019)
7e4c77b  Fix concat predictions (belerico, Jun 15, 2019)
3ed4aeb  Merge raw and prod table (belerico, Jun 17, 2019)
6c8edfe  Join index when predicting (belerico, Jun 18, 2019)
6c488ad  Update torch version (belerico, Jul 8, 2019)
f8ee7f6  Change preprocess to work with unlabeled data as pandas df (belerico, Jul 11, 2019)
cf68522  Fix bug loading model on different devices (belerico, Jul 11, 2019)
f27c47e  Change pytorch version (belerico, Jul 19, 2019)
e35c9f7  Back to 1.1.0 (belerico, Jul 19, 2019)
02ed0f6  Update setup.py (belerico, Jul 25, 2019)
00d7b77  Update setup.py (belerico, Jul 25, 2019)
59eeded  Update setup.py (belerico, Jul 25, 2019)
c33328c  Update to fasttext 0.9 (Jul 25, 2019)
b820ba7  Fix typo bug (belerico, Jul 25, 2019)
3918b6c  Add download of common crawl and wiki fasttext binaries (belerico, Nov 10, 2019)
08ab046  Fix typo (belerico, Nov 10, 2019)
1664c3a  Change pytorch version (belerico, Nov 12, 2019)
f79b779  Change logical not to latest version of torch (belerico, Nov 12, 2019)
82de9dc  Merge branch 'torch_1.0.1' of https://github.com/belerico/deepmatcher… (belerico, Nov 12, 2019)
071a99f  Update logical not (belerico, Nov 12, 2019)
4b23fea  Try kmax pooling (belerico, Nov 15, 2019)
a9fef87  Add dataset splits to return tables and set random state (belerico, Dec 20, 2019)
1e0dd5c  Fixed minor on attr_condensors (belerico, Dec 20, 2019)
d26eb41  Fixed minor print (belerico, Dec 20, 2019)
f797ce9  Add split only for train/validation (belerico, Dec 20, 2019)

Files changed

2 changes: 1 addition & 1 deletion .travis.yml
@@ -39,7 +39,7 @@ before_install:

 install:
   - conda install --yes python=$PYTHON_VERSION pip scikit-learn nose
-  - pip install --process-dependency-links git+https://github.com/anhaidgroup/deepmatcher | cat
+  - pip install --process-dependency-links git+https://github.com/belerico/deepmatcher@torch_1.0.1 | cat
   - python -m nltk.downloader perluniprops nonbreaking_prefixes punkt

 script:
3 changes: 3 additions & 0 deletions .vscode/settings.json
@@ -0,0 +1,3 @@
{
"python.pythonPath": "/home/belerico/.local/share/virtualenvs/deepmatcher-hlJg2Q1v/bin/python"
}
14 changes: 14 additions & 0 deletions Pipfile
@@ -0,0 +1,14 @@
[[source]]
name = "pypi"
url = "https://pypi.org/simple"
verify_ssl = true

[dev-packages]
nose = "*"
pylint = "*"

[packages]
deepmatcher = {editable = true,path = "."}

[requires]
python_version = "3.7"
373 changes: 373 additions & 0 deletions Pipfile.lock

Large diffs are not rendered by default.

3 changes: 2 additions & 1 deletion deepmatcher/batch.py
@@ -48,7 +48,8 @@ def __new__(cls, *args, **kwargs):
         if 'word_probs' in train_info.metadata:
             raw_word_probs = train_info.metadata['word_probs'][name]
             word_probs = torch.Tensor(
-                [[raw_word_probs[w] for w in b] for b in data.data])
+                # [[raw_word_probs[w] for w in b] for b in data.data])
+                [[raw_word_probs[w] for w in b] for b in data.data.tolist()])
         if data.is_cuda:
             word_probs = word_probs.cuda()
         pc = None
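
Why this change is needed, in a minimal sketch (illustrative names, not part of the PR): in recent PyTorch releases, iterating over a tensor yields 0-dim tensors rather than Python ints, and those do not hash to the dict's integer keys. Converting the batch to nested Python lists first restores plain int lookups:

import torch

# Hypothetical stand-ins for the names in the diff above.
raw_word_probs = {0: 0.5, 1: 0.3, 2: 0.2}   # word id -> probability
data = torch.tensor([[0, 1], [2, 0]])       # a batch of word ids

# Iterating data directly would yield 0-dim tensors, which hash by
# identity and miss the int keys (KeyError); .tolist() gives plain ints.
word_probs = torch.Tensor(
    [[raw_word_probs[w] for w in row] for row in data.tolist()])
print(word_probs)  # tensor([[0.5000, 0.3000], [0.2000, 0.5000]])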
63 changes: 48 additions & 15 deletions deepmatcher/data/dataset.py
@@ -24,15 +24,16 @@

 logger = logging.getLogger(__name__)


 def split(table,
           path,
           train_prefix,
           validation_prefix,
           test_prefix,
           split_ratio=[0.6, 0.2, 0.2],
           stratified=False,
-          strata_field='label'):
+          strata_field='label',
+          random_state=None):
     """Split a pandas dataframe or CSV file into train / validation / test data sets.

     Args:
@@ -47,8 +48,10 @@ def split(table,
             Default is False.
         strata_field (str): name of the examples Field stratified over.
             Default is 'label' for the conventional label field.
+        random_state (tuple): the random seed used for shuffling;
+            a return value of random.getstate().
     """
-    assert len(split_ratio) == 3
+    assert (isinstance(split_ratio, list) and len(split_ratio) <= 3) or (split_ratio >= 0 and split_ratio <= 1)

     if not isinstance(table, pd.DataFrame):
         table = pd.read_csv(table)
@@ -58,15 +61,29 @@
     examples = list(table.itertuples(index=False))
     fields = [(col, None) for col in list(table)]
     dataset = data.Dataset(examples, fields)
-    train, valid, test = dataset.split(split_ratio, stratified, strata_field)
+    if isinstance(split_ratio, list) and len(split_ratio) == 3:
+        train, valid, test = dataset.split(split_ratio, stratified, strata_field, random_state=random_state)

-    tables = (pd.DataFrame(train.examples), pd.DataFrame(valid.examples),
-              pd.DataFrame(test.examples))
-    prefixes = (train_prefix, validation_prefix, test_prefix)
+        tables = (pd.DataFrame(train.examples), pd.DataFrame(valid.examples),
+                  pd.DataFrame(test.examples))
+        prefixes = (train_prefix, validation_prefix, test_prefix)

-    for i in range(len(tables)):
-        tables[i].columns = table.columns
-        tables[i].to_csv(os.path.join(path, prefixes[i]), index=False)
+        for i in range(len(tables)):
+            tables[i].columns = table.columns
+            if path is not None:
+                tables[i].to_csv(os.path.join(path, prefixes[i]), index=False)
+    else:
+        train, test = dataset.split(split_ratio, stratified, strata_field, random_state=random_state)
+
+        tables = (pd.DataFrame(train.examples), pd.DataFrame(test.examples))
+        prefixes = (train_prefix, test_prefix)
+
+        for i in range(len(tables)):
+            tables[i].columns = table.columns
+            if path is not None:
+                tables[i].to_csv(os.path.join(path, prefixes[i]), index=False)
+
+    return tables
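
A usage sketch of the updated split() (hypothetical file names; the float form follows the new else branch above, which writes only the train and test prefixes):

import random
import deepmatcher as dm

random.seed(42)
state = random.getstate()  # the tuple the new random_state parameter expects

# Three-way split: a 3-element ratio list writes train/valid/test CSVs.
dm.data.split('labeled.csv', 'data', 'train.csv', 'valid.csv', 'test.csv',
              split_ratio=[0.6, 0.2, 0.2], random_state=state)

# Two-way split: a float ratio writes only the train and test prefixes.
dm.data.split('labeled.csv', 'data', 'train.csv', None, 'test.csv',
              split_ratio=0.8, random_state=state)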


class MatchingDataset(data.Dataset):
@@ -203,7 +220,7 @@ def _set_attributes(self):
         self.label_field = self.column_naming['label']
         self.id_field = self.column_naming['id']

-    def compute_metadata(self, pca=False):
+    def compute_metadata(self, pca=False, device=None):
         r"""Computes metadata about the dataset.

         Computes the following metadata about the dataset:
@@ -220,20 +237,28 @@ def compute_metadata(self, pca=False):

         Arguments:
             pca (bool): Whether to compute the ``pc`` metadata.
+            device (str or torch.device): The device on which to compute the dataset metadata.
+                Set to 'cpu' to use the CPU only, even if a GPU is available.
+                If None, uses the first available GPU, or the CPU if no GPUs are available.
+                Defaults to None. This is a keyword-only parameter.
         """
+        if device is None:
+            device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
+
         self.metadata = {}

         # Create an iterator over the entire dataset.
         train_iter = MatchingIterator(
-            self, self, train=False, batch_size=1024, device=-1, sort_in_buckets=False)
+            self, self, train=False, batch_size=1024, sort_in_buckets=False, device=device)
         counter = defaultdict(Counter)

         # For each attribute, find the number of times each word id occurs in the dataset.
         # Note that word ids here also include ``UNK`` tokens, padding tokens, etc.
         for batch in pyprind.prog_bar(train_iter, title='\nBuilding vocabulary'):
             for name in self.all_text_fields:
                 attr_input = getattr(batch, name)
-                counter[name].update(attr_input.data.data.view(-1))
+                counter[name].update(attr_input.data.data.view(-1).tolist())

         word_probs = {}
         totals = {}
@@ -270,7 +295,7 @@

         # Create an iterator over the entire dataset.
         train_iter = MatchingIterator(
-            self, self, train=False, batch_size=1024, device=-1, sort_in_buckets=False)
+            self, self, train=False, batch_size=1024, sort_in_buckets=False, device=device)
         attr_embeddings = defaultdict(list)

         # Run the constructed neural network to compute weighted sequence embeddings
@@ -524,11 +549,19 @@ def splits(cls,
             filter_pred (callable or None): Use only examples for which
                 filter_pred(example) is True, or use all examples if None.
                 Default is None. This is a keyword-only parameter.
+            device (str or torch.device): The device on which to compute the dataset metadata.
+                Set to 'cpu' to use the CPU only, even if a GPU is available.
+                If None, uses the first available GPU, or the CPU if no GPUs are available.
+                Defaults to None. This is a keyword-only parameter.

         Returns:
             Tuple[MatchingDataset]: Datasets for (train, validation, and test) splits in
                 that order, if provided.
         """
+        device = kwargs.pop('device', None)
+        if device is None:
+            device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

         fields_dict = dict(fields)
         state_args = {'train_pca': train_pca}
@@ -578,7 +611,7 @@
         logger.info('Vocab construction time: {}s'.format(after_vocab - after_load))

         if train:
-            datasets[0].compute_metadata(train_pca)
+            datasets[0].compute_metadata(train_pca, device)
         after_metadata = timer()
         logger.info(
             'Metadata computation time: {}s'.format(after_metadata - after_vocab))
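
The default-device idiom this PR threads through compute_metadata, splits, and process is the standard PyTorch pattern; a minimal standalone sketch:

import torch

# Prefer the first CUDA device when available, otherwise fall back to CPU.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

# Tensors and iterators built afterwards can then be placed explicitly.
batch = torch.zeros(4, 8, device=device)
print(batch.device)  # cuda:0 on a GPU machine, cpu otherwise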
92 changes: 80 additions & 12 deletions deepmatcher/data/field.py
@@ -6,19 +6,25 @@
 import nltk
 import six

-import fastText
+import fasttext
 import torch
 from torchtext import data, vocab
 from torchtext.utils import download_from_url

+import os
+import time
+import shutil
+from tqdm import tqdm
+import requests
+
 logger = logging.getLogger(__name__)


 class FastText(vocab.Vectors):

     def __init__(self,
                  suffix='wiki-news-300d-1M.vec.zip',
-                 url_base='https://s3-us-west-1.amazonaws.com/fasttext-vectors/',
+                 url_base='https://dl.fbaipublicfiles.com/fasttext/vectors-english/',
                  **kwargs):
         url = url_base + suffix
         base, ext = os.path.splitext(suffix)
@@ -29,12 +35,12 @@ def __init__(self,
 class FastTextBinary(vocab.Vectors):

     name_base = 'wiki.{}.bin'
-    _direct_en_url = 'https://drive.google.com/uc?export=download&id=1Vih8gAmgBnuYDxfblbT94P6WjB7s1ZSh'
+    _direct_en_url = 'https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.en.zip'

-    def __init__(self, language='en', url_base=None, cache=None):
+    def __init__(self, language='en', url_base=None, cache=None, vectors_type=None):
         """
         Arguments:
-            language: Language of fastText pre-trained embedding model
+            language: Language of fasttext pre-trained embedding model
             cache: directory for cached model
         """
         cache = os.path.expanduser(cache)
@@ -43,24 +49,84 @@ def __init__(self, language='en', url_base=None, cache=None):
             self.destination = os.path.join(cache, 'wiki.' + language + '.bin')
         else:
             if url_base is None:
-                url_base = 'https://s3-us-west-1.amazonaws.com/fasttext-vectors/wiki.{}.zip'
+                url_base = 'https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.{}.zip'
             url = url_base.format(language)
-            self.destination = os.path.join(cache, 'wiki.' + language + '.zip')
-        name = FastTextBinary.name_base.format(language)
+            if vectors_type is None:
+                self.destination = os.path.join(cache, 'wiki.' + language + '.zip')
+            else:
+                self.destination = os.path.join(cache, 'wiki_cc.' + language + '.bin.gz')
+        if vectors_type is None:
+            name = FastTextBinary.name_base.format(language)
+        else:
+            name = 'wiki_cc.{}.bin'.format(language)

         self.cache(name, cache, url=url)

     def __getitem__(self, token):
         return torch.Tensor(self.model.get_word_vector(token))

+    def __download_with_resume(self, url, destination):
+        # Handle file:// URLs first: requests cannot HEAD or GET local
+        # files, so check for them before any HTTP round trip.
+        if url.startswith('file://'):
+            url_no_protocol = url.replace('file://', '', 1)
+            if os.path.exists(url_no_protocol):
+                print('File already exists, no need to download')
+                return
+            else:
+                raise Exception('File not found at %s' % url_no_protocol)
+
+        # Check that the requested url is ok, i.e. 200 <= status_code < 400
+        head = requests.head(url)
+        if not head.ok:
+            head.raise_for_status()
+
+        # Don't download if the file exists
+        if os.path.exists(os.path.expanduser(destination)):
+            print('File already exists, no need to download')
+            return
+
+        tmp_file = destination + '.part'
+        first_byte = os.path.getsize(tmp_file) if os.path.exists(tmp_file) else 0
+        chunk_size = 1024 ** 2  # 1 MB
+        file_mode = 'ab' if first_byte else 'wb'
+
+        # Set headers to resume the download from where we left off
+        headers = {"Range": "bytes=%s-" % first_byte}
+        r = requests.get(url, headers=headers, stream=True)
+        file_size = int(r.headers.get('Content-length', -1))
+        if file_size >= 0:
+            # Content-length set
+            file_size += first_byte
+            total = file_size
+        else:
+            # Content-length not set
+            print('Cannot retrieve Content-length from server')
+            total = None
+
+        print('Download from ' + url)
+        print('Starting download at %.1fMB' % (first_byte / (10 ** 6)))
+        print('File size is %.1fMB' % (file_size / (10 ** 6)))
+
+        with tqdm(initial=first_byte, total=total, unit_scale=True) as pbar:
+            with open(tmp_file, file_mode) as f:
+                for chunk in r.iter_content(chunk_size=chunk_size):
+                    if chunk:  # filter out keep-alive new chunks
+                        f.write(chunk)
+                        pbar.update(len(chunk))
+
+        # Rename the temp download file to the correct name once fully downloaded
+        shutil.move(tmp_file, destination)
+
     def cache(self, name, cache, url=None):
         path = os.path.join(cache, name)
         if not os.path.isfile(path) and url:
             logger.info('Downloading vectors from {}'.format(url))
             if not os.path.exists(cache):
                 os.makedirs(cache)
             if not os.path.isfile(self.destination):
-                download_from_url(url, self.destination)
+                # self.__download_with_resume(url, self.destination)
+                self.__download_with_resume(url, self.destination)
             logger.info('Extracting vectors into {}'.format(cache))
             ext = os.path.splitext(self.destination)[1][1:]
             if ext == 'zip':
@@ -72,7 +138,7 @@ def cache(self, name, cache, url=None):
         if not os.path.isfile(path):
             raise RuntimeError('no vectors found at {}'.format(path))

-        self.model = fastText.load_model(path)
+        self.model = fasttext.load_model(path)
         self.dim = len(self['a'])

@@ -143,7 +209,9 @@ def _get_vector_data(cls, vecs, cache):
         if vec_data is None:
             parts = vec_name.split('.')
             if parts[0] == 'fasttext':
-                if parts[2] == 'bin':
+                if parts[1] == 'cc':
+                    vec_data = FastTextBinary(language=parts[2], cache=cache, url_base='https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.{}.300.bin.gz', vectors_type=parts[1])
+                elif parts[2] == 'bin':
                     vec_data = FastTextBinary(language=parts[1], cache=cache)
                 elif parts[2] == 'vec' and parts[1] == 'wiki':
                     vec_data = FastText(
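
The core of __download_with_resume above is an HTTP Range request sized to whatever partial file is already on disk; a standalone sketch with a hypothetical URL (servers that honor Range reply 206 Partial Content and send only the missing bytes):

import os
import requests

url = 'https://example.com/wiki.en.zip'  # hypothetical
tmp_file = 'wiki.en.zip.part'

# Resume from the size of the partial file, if any.
first_byte = os.path.getsize(tmp_file) if os.path.exists(tmp_file) else 0
headers = {'Range': 'bytes=%d-' % first_byte}

with requests.get(url, headers=headers, stream=True) as r:
    with open(tmp_file, 'ab' if first_byte else 'wb') as f:
        for chunk in r.iter_content(chunk_size=1024 ** 2):  # 1 MB chunks
            if chunk:
                f.write(chunk)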
16 changes: 14 additions & 2 deletions deepmatcher/data/process.py
@@ -1,3 +1,4 @@
+import torch
 import copy
 import io
 import logging
@@ -103,7 +104,8 @@ def process(path,
             left_prefix='left_',
             right_prefix='right_',
             use_magellan_convention=False,
-            pca=True):
+            pca=True,
+            **kwargs):
     """Creates dataset objects for multiple splits of a dataset.

     This involves the following steps (if data cannot be retrieved from the cache):
@@ -174,11 +176,20 @@
             Specifically, set them to be '_id', 'ltable_', and 'rtable_' respectively.
         pca (bool): Whether to compute PCA for each attribute (needed for SIF model).
             Defaults to False.
+        device (str or torch.device): The device on which to compute the dataset metadata.
+            Set to 'cpu' to use the CPU only, even if a GPU is available.
+            If None, uses the first available GPU, or the CPU if no GPUs are available.
+            Defaults to None. This is a keyword-only parameter.

     Returns:
         Tuple[MatchingDataset]: Datasets for (train, validation, and test) splits in that
             order, if provided, or dataset for unlabeled, if provided.
     """
+    device = kwargs.get('device')
+    if device is None:
+        device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
+
     if unlabeled is not None:
         raise ValueError('Parameter "unlabeled" has been deprecated, use '
                          '"deepmatcher.data.process_unlabeled" instead.')
@@ -217,7 +228,8 @@
         cache,
         check_cached_data,
         auto_rebuild_cache,
-        train_pca=pca,
+        train_pca=pca,
+        **kwargs)

     # Save additional information to train dataset.
     datasets[0].ignore_columns = ignore_columns
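
A combined usage sketch of the new knobs (hypothetical paths; assumes deepmatcher's usual process() keyword names): embeddings='fasttext.cc.en' exercises the new common-crawl branch in field.py, and device flows through **kwargs to metadata computation.

import deepmatcher as dm

train, validation, test = dm.data.process(
    path='data',
    train='train.csv',
    validation='valid.csv',
    test='test.csv',
    embeddings='fasttext.cc.en',            # or 'fasttext.<lang>.bin' for wiki binaries
    embeddings_cache_path='~/.vector_cache',
    pca=True,
    device='cpu')                           # or None to auto-select a GPU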