[DONT MERGE] simple text classification #36

Open · wants to merge 18 commits into main
3 changes: 3 additions & 0 deletions configs/dataset/imdb.yaml
@@ -0,0 +1,3 @@
_target_: datasets.load_dataset

path: ${original_work_dir}/datasets/imdb
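
At runtime, Hydra turns this config into a plain function call. A minimal sketch of the equivalent Python, assuming ${original_work_dir} resolves to the repository root (the absolute path below is a placeholder for this sketch):

import datasets

# what hydra.utils.instantiate effectively executes for the config above;
# the resolved path is a placeholder, not a real location
dataset = datasets.load_dataset(path="/path/to/repo/datasets/imdb")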
44 changes: 44 additions & 0 deletions configs/experiment/imdb.yaml
@@ -0,0 +1,44 @@
# @package _global_

# to execute this experiment run:
# python train.py experiment=imdb

defaults:
  - override /dataset: imdb
  - override /datamodule: default
  - override /taskmodule: simple_transformer_text_classification
  - override /model: transformer_text_classification
  - override /callbacks: default
  - override /logger: wandb
  - override /trainer: default

# all parameters below will be merged with parameters from the default configurations set above;
# this allows you to overwrite only specified parameters

# the name of the run determines the folder name in logs
name: "imdb/transformer_text_classification"

seed: 12345

trainer:
  min_epochs: 5
  max_epochs: 20
  # gradient_clip_val: 0.5

taskmodule:
  # the texts in imdb are rather short, so we decrease max_length to save resources
  max_length: 128

datamodule:
  batch_size: 32
  # the imdb dataset has no val split, so we use "test" for that
  val_split: test

logger:
  wandb:
    name: "first-run"
    tags:
      - dataset=imdb
      - model=transformer_text_classification
      - task=sentiment_classification
    # save_dir: models/${name}/debug
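
To check how these overrides merge with the defaults without starting a run, the Hydra compose API can be used. A sketch; the primary config name ("train") and the relative config path are assumptions about this repository's layout:

from hydra import compose, initialize

with initialize(config_path="configs"):  # path relative to the calling script
    cfg = compose(config_name="train", overrides=["experiment=imdb"])
    print(cfg.taskmodule.max_length)  # 128, set by this experiment
    print(cfg.datamodule.val_split)   # "test"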
4 changes: 4 additions & 0 deletions configs/model/transformer_text_classification.yaml
@@ -0,0 +1,4 @@
_target_: pytorch_ie.models.TransformerTextClassificationModel

#model_name_or_path: ${transformer_model} # transformer_model is specified in config_rel.yaml
model_name_or_path: bert-base-uncased
4 changes: 4 additions & 0 deletions configs/taskmodule/simple_transformer_text_classification.yaml
@@ -0,0 +1,4 @@
_target_: pytorch_ie.taskmodules.SimpleTransformerTextClassificationTaskModule

#tokenizer_name_or_path: ${transformer_model} # transformer_model is specified in config_rel.yaml
tokenizer_name_or_path: bert-base-uncased
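
Like the model config, this file is materialized via hydra.utils.instantiate in the training pipeline. A minimal sketch; the prepare call is an assumption about how the pytorch-ie dev API populates label_to_id:

import hydra
from omegaconf import OmegaConf

cfg = OmegaConf.create(
    {
        "_target_": "pytorch_ie.taskmodules.SimpleTransformerTextClassificationTaskModule",
        "tokenizer_name_or_path": "bert-base-uncased",
    }
)
taskmodule = hydra.utils.instantiate(cfg)
# taskmodule.prepare(train_documents)  # assumed API: derives label_to_id from the documents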
55 changes: 55 additions & 0 deletions datasets/imdb/imdb.py
@@ -0,0 +1,55 @@
from dataclasses import dataclass

import pytorch_ie.data.builder
from pytorch_ie.annotations import Label
from pytorch_ie.core import AnnotationList, annotation_field
from pytorch_ie.documents import TextDocument

import datasets


class ImdbConfig(datasets.BuilderConfig):
    """BuilderConfig for IMDB."""

    def __init__(self, **kwargs):
        """BuilderConfig for IMDB.

        Args:
            **kwargs: keyword arguments forwarded to super.
        """
        super().__init__(**kwargs)


@dataclass
class ImdbDocument(TextDocument):
    label: AnnotationList[Label] = annotation_field()


class Imdb(pytorch_ie.data.builder.GeneratorBasedBuilder):
    DOCUMENT_TYPE = ImdbDocument

    BASE_DATASET_PATH = "imdb"

    BUILDER_CONFIGS = [
        ImdbConfig(
            name="plain_text",
            version=datasets.Version("1.0.0"),
            description="IMDB sentiment classification dataset",
        ),
    ]

    def _generate_document_kwargs(self, dataset):
        return {"int2str": dataset.features["label"].int2str}

    def _generate_document(self, example, int2str):
        text = example["text"]
        document = ImdbDocument(text=text)
        label_id = example["label"]
        # examples from the "unsupervised" split carry label -1; keep them unlabeled
        if label_id < 0:
            return document

        label = int2str(label_id)
        label_annotation = Label(label=label)
        document.label.append(label_annotation)

        return document
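
A quick way to exercise the builder locally. A sketch: it assumes running from the repository root, and that indexing the loaded dataset yields ImdbDocument instances as _generate_document suggests; the "pos"/"neg" label names come from the underlying HF imdb dataset:

import datasets

dataset = datasets.load_dataset("datasets/imdb", name="plain_text")
doc = dataset["train"][0]
print(type(doc).__name__)                     # expected: ImdbDocument
print(doc.text[:80])                          # raw review text
print([label.label for label in doc.label])   # e.g. ["pos"] or ["neg"]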
7 changes: 4 additions & 3 deletions requirements.txt
@@ -1,7 +1,8 @@
# --------- pytorch-ie --------- #
# pytorch-ie>=0.8.0,<0.9
# we need the latest dev version of pytorch-ie where
# the pipeline works without the parameter "predict_field"
# we need the latest dev version of pytorch-ie
# * where the pipeline works without the parameter "predict_field", and
# * to allow for annotations without a target (the Label annotation)
git+https://github.com/ChristophAlt/pytorch-ie.git

# --------- hydra --------- #
@@ -10,7 +11,7 @@ hydra-colorlog>=1.1.0
hydra-optuna-sweeper>=1.1.0

# --------- loggers --------- #
# wandb
wandb # used in the example experiment
# neptune-client
# mlflow
# comet-ml
7 changes: 6 additions & 1 deletion src/training_pipeline.py
@@ -51,7 +51,12 @@ def train(config: DictConfig) -> Optional[float]:
    # Init pytorch-ie model
    log.info(f"Instantiating model <{config.model._target_}>")
    # NOTE: THE FOLLOWING LINE MAY NEED ADAPTATION WHEN YOU DEFINE YOUR OWN MODELS OR TASKMODULES!
    additional_model_kwargs: Dict[str, Any] = dict(num_classes=len(taskmodule.label_to_id))
    # additional_model_kwargs: Dict[str, Any] = dict(num_classes=len(taskmodule.label_to_id))
    additional_model_kwargs: Dict[str, Any] = dict(
        num_classes=len(taskmodule.label_to_id),
        tokenizer_vocab_size=taskmodule.tokenizer.vocab_size,
        t_total=datamodule.num_train * config["trainer"]["max_epochs"],
    )
    model: PyTorchIEModel = hydra.utils.instantiate(
        config.model, _convert_="partial", **additional_model_kwargs
    )
9 changes: 8 additions & 1 deletion tests/shell/test_basic_commands.py
@@ -12,7 +12,14 @@

def test_fast_dev_run_with_evaluation():
    """Test running for 1 train, val and test batch."""
    command = ["train.py", "++trainer.fast_dev_run=true", "++test=true"]
    command = [
        "train.py",
        "experiment=imdb",
        "logger=wandb",
        "logger.wandb.offline=true",
        "++trainer.fast_dev_run=true",
        "++test=true",
    ]
    run_command(command)

