diff --git a/__pycache__/utils.cpython-36.pyc b/__pycache__/utils.cpython-36.pyc index fbd2e56..8ef2c5b 100644 Binary files a/__pycache__/utils.cpython-36.pyc and b/__pycache__/utils.cpython-36.pyc differ diff --git a/data/make_persona_file.py b/data/make_persona_file.py index 20ef23b..9bb7655 100644 --- a/data/make_persona_file.py +++ b/data/make_persona_file.py @@ -82,7 +82,7 @@ def preprocess_line(text): ##remove empty start end ##it is possible text may become empty! - text = test.strip() + text = text.strip() return text diff --git a/interact.py b/interact.py index d368204..bf3be98 100644 --- a/interact.py +++ b/interact.py @@ -132,7 +132,24 @@ def run(): logger.info("Sample a personality") dataset = get_dataset(tokenizer, args.dataset_path, args.dataset_cache) personalities = [dialog["personality"] for dataset in dataset.values() for dialog in dataset] + + + logger.info("Tokenize and encode the dataset") + def tokenize(obj): + if isinstance(obj, str): + return tokenizer.convert_tokens_to_ids(tokenizer.tokenize(obj)) + if isinstance(obj, dict): + return dict((n, tokenize(o)) for n, o in obj.items()) + return list(tokenize(o) for o in obj) + + personality = random.choice(personalities) + + ##TALK TO HAL + personality_hal = ["that's true. my name is hal"] + personality = tokenize(personality_hal) + print(personality) + logger.info("Selected personality: %s", tokenizer.decode(chain(*personality))) history = []