From 0aac3004628c56089a9d4e92602856ee09a078ab Mon Sep 17 00:00:00 2001
From: "fmilo@entropysource.com" <fmilo@entropysource.com>
Date: Fri, 22 May 2020 06:50:34 +0000
Subject: [PATCH] fix stream logic

---
 deepmatcher/data/__init__.py |  4 ++--
 deepmatcher/data/dataset.py  | 21 ++++++++-------------
 deepmatcher/data/process.py  |  3 ++-
 3 files changed, 12 insertions(+), 16 deletions(-)

diff --git a/deepmatcher/data/__init__.py b/deepmatcher/data/__init__.py
index 286fe96..0e4e6db 100644
--- a/deepmatcher/data/__init__.py
+++ b/deepmatcher/data/__init__.py
@@ -1,10 +1,10 @@
 from .field import MatchingField, reset_vector_cache
 from .dataset import MatchingDataset
 from .iterator import MatchingIterator
-from .process import process, process_unlabeled
+from .process import process, process_unlabeled, process_unlabeled_stream
 from .dataset import split
 
 __all__ = [
-    'MatchingField', 'MatchingDataset', 'MatchingIterator', 'process', 'process_unlabeled', 'split',
+    'MatchingField', 'MatchingDataset', 'MatchingIterator', 'process', 'process_unlabeled', 'process_unlabeled_stream', 'split',
     'reset_vector_cache'
 ]
diff --git a/deepmatcher/data/dataset.py b/deepmatcher/data/dataset.py
index 8609575..0291fdb 100644
--- a/deepmatcher/data/dataset.py
+++ b/deepmatcher/data/dataset.py
@@ -4,7 +4,7 @@
 import logging
 import os
 import pdb
-from collections import Counter, defaultdict
+from collections import Counter, defaultdict, Iterator
 from timeit import default_timer as timer
 
 import pandas as pd
@@ -69,14 +69,14 @@ def split(table,
         tables[i].to_csv(os.path.join(path, prefixes[i]), index=False)
 
 
-class CountingWrapper(object):
+class CountingWrapper(Iterator):
     def __init__(self, stream):
         self.line_count = 0
         self.f = stream
-
-    def read(self, *args, **kwargs):
+
+    def __next__(self):
         self.line_count += 1
-        return self.f.read(*args, **kwargs)
+        return self.f.readline()
 
 
 class MatchingDataset(data.Dataset):
@@ -167,13 +167,7 @@ def __init__(self,
             reader = f
             lines = f.line_count
 
-            next(reader)
-            examples = [
-                make_example(line, fields) for line in pyprind.prog_bar(
-                    reader,
-                    iterations=lines,
-                    title='\nReading and processing data from "' + path + '"')
-            ]
+            examples = [ make_example(line, fields) for line in reader ]
 
             super(MatchingDataset, self).__init__(examples, fields, **kwargs)
         else:
@@ -216,6 +210,7 @@ def _set_attributes(self):
         self.label_field = self.column_naming['label']
         self.id_field = self.column_naming['id']
 
+
     def compute_metadata(self, pca=False):
         r"""Computes metadata about the dataset.
 
@@ -634,7 +629,7 @@ def splits(cls,
                                 cachefile, column_naming, state_args)
             after_cache = timer()
-            logger.info('Cache save time: {time}s', time=after_cache -
+            logger.info('Cache save time: {time}s', time=(after_cache -
                         after_vocab))
 
 
         if train:
diff --git a/deepmatcher/data/process.py b/deepmatcher/data/process.py
index f0ae688..faa3499 100644
--- a/deepmatcher/data/process.py
+++ b/deepmatcher/data/process.py
@@ -248,6 +248,7 @@ def process_unlabeled(path, trained_model, ignore_columns=None):
     with io.open(path, encoding="utf8") as f:
         return process_unlabeled_stream(f, trained_model, ignore_columns)
 
+
 def process_unlabeled_stream(stream, trained_model, ignore_columns=None):
     """Creates a dataset object for an unlabeled dataset.
 
@@ -274,7 +275,7 @@ def process_unlabeled_stream(stream, trained_model, ignore_columns=None):
                                  train_info.tokenize, train_info.include_lengths)
 
     begin = timer()
-    dataset_args = {'fields': fields, 'column_naming': column_naming}
+    dataset_args = {'fields': fields, 'column_naming': column_naming, 'format':'csv'}
     dataset = MatchingDataset(stream=stream, **dataset_args)
 
     # Make sure we have the same attributes.
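
Notes:

- dataset.py: CountingWrapper is switched from a read()-based file proxy to
  a line iterator so MatchingDataset can consume the stream directly.
  Subclassing Iterator supplies __iter__ for free, but collections.Iterator
  is a deprecated alias (removed in Python 3.10); collections.abc.Iterator
  is the portable import. Also, readline() returns '' at EOF instead of
  raising StopIteration, so the wrapper as written never terminates
  iteration once the stream is exhausted. A sketch with both points
  addressed (an assumption about the intended behavior, not part of the
  diff above):

      from collections.abc import Iterator

      class CountingWrapper(Iterator):
          """Yield lines from a text stream, counting how many were read."""

          def __init__(self, stream):
              self.line_count = 0
              self.f = stream

          def __next__(self):
              line = self.f.readline()
              if not line:
                  # readline() returns '' only at EOF; stop instead of
                  # yielding empty strings forever.
                  raise StopIteration
              self.line_count += 1
              return line

- dataset.py: the logger.info('Cache save time: {time}s', time=...) call is
  only reparenthesized here, but if logger is a stdlib logging.Logger it
  will neither interpolate {time} nor accept a time keyword argument;
  logger.info('Cache save time: %ss', after_cache - after_vocab) is the
  stdlib spelling.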
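- dataset.py: the removed next(reader) call previously skipped the CSV
  header row. If the incoming stream still begins with a header, that row
  now has to be consumed before the lines reach make_example (for example
  while the fields are being read from the stream), or it will be parsed as
  a data row.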
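- process.py: process_unlabeled now just opens the path and delegates to
  the new process_unlabeled_stream, so candidates can be scored from any
  readable text stream, and the explicit 'format':'csv' entry is presumably
  needed because the stream code path has no file extension from which to
  infer the example parser. A minimal usage sketch, assuming a trained
  model saved under a placeholder path and an unlabeled CSV with the same
  columns the model was trained on:

      import io

      import deepmatcher as dm

      model = dm.MatchingModel()
      model.load_state('hybrid_model.pth')  # placeholder path

      # Path-based entry point, unchanged for existing callers:
      candidates = dm.data.process_unlabeled('unlabeled.csv', model)

      # New stream-based entry point: any readable text stream works.
      with io.open('unlabeled.csv', encoding='utf8') as f:
          candidates = dm.data.process_unlabeled_stream(f, model)

      predictions = model.run_prediction(candidates)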