Merge pull request #20 from ruanchaves/pysentimiento
Pysentimiento features
ruanchaves authored Feb 6, 2022
2 parents 3263cbe + f5a6228 commit 2dc158d
Showing 6 changed files with 509 additions and 24 deletions.
36 changes: 35 additions & 1 deletion docs/EVALUATION.md
@@ -1,5 +1,13 @@
# Evaluation

We provide a detailed evaluation of the accuracy and speed of the `hashformers` framework in comparison with alternative libraries.

Although models based on n-grams such as `ekphrasis` are orders of magnitude faster than `hashformers`, they are remarkably unstable across different datasets.

Research papers on word segmentation usually try to bring the best of both worlds together and combine deep learning with statistical methods. The best speed-accuracy trade-off may therefore lie in building [ranking cascades](https://arxiv.org/abs/2010.06467) (a.k.a. "telescoping"), where `hashformers` serves as a fallback whenever a less time-consuming method scores below a certain confidence threshold.
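
A minimal sketch of such a cascade is shown below. It assumes the `ekphrasis` `Segmenter` API and the `WordSegmenter` interface described in the `hashformers` README; the `confidence` function and its threshold are hypothetical placeholders, not something either library provides.

```python
# Hypothetical telescoping cascade: run the fast n-gram segmenter first and
# fall back to hashformers only when a user-defined confidence score is low.
from ekphrasis.classes.segmenter import Segmenter
from hashformers import WordSegmenter

ngram_segmenter = Segmenter(corpus="twitter")
# Constructor and segment() signatures follow the hashformers README; treat
# them as assumptions and check the version you have installed.
neural_segmenter = WordSegmenter(segmenter_model_name_or_path="distilgpt2")

def confidence(segmentation: str) -> float:
    # Placeholder heuristic, not part of either library: several short words
    # suggest a successful split. Replace with a real confidence measure,
    # e.g. a normalized n-gram language model score.
    words = segmentation.split()
    return len(words) / max(len(word) for word in words)

def segment_hashtag(hashtag: str, threshold: float = 0.5) -> str:
    candidate = ngram_segmenter.segment(hashtag.lstrip("#"))
    if confidence(candidate) >= threshold:
        return candidate
    # Below the threshold, fall back to the slower but more accurate model.
    return neural_segmenter.segment([hashtag])[0]
```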

## Accuracy

<h1 align="center">
<img src="https://raw.githubusercontent.com/ruanchaves/hashformers/master/barplot_evaluation.png" width="512" title="hashformers">
</h1>
@@ -34,4 +42,30 @@ A script to reproduce the evaluation of ekphrasis is available on [scripts/evalu
| | | |
| average (all) | HashtagMaster | 58.35 |
| | ekphrasis | 41.65 |
| |**hashformers**| **68.06**|

## Speed

| model | hashtags/second | accuracy | topk | layers|
|:--------------|:----------------|----------:|-----:|------:|
| ekphrasis | 4405.00 | 44.74 | - | - |
| gpt2-large | 12.04 | 63.86 | 2 | first |
| distilgpt2 | 29.32 | 64.56 | 2 | first |
|**distilgpt2** | **15.00** | **80.48** |**2** |**all**|
| gpt2 | 11.36 | - | 2 | all |
| gpt2 | 3.48 | - | 20 | all |
| gpt2 + bert | 1.38 | 83.68 | 20 | all |

In the table above, we evaluate hashformers under different settings on the Dev-BOUN dataset and compare it with ekphrasis. As ekphrasis relies on n-grams, it is a few orders of magnitude faster than hashformers.

All experiments were performed on Google Colab while connected to a Tesla T4 GPU with 15GB of RAM. We highlight `distilgpt2` at `topk = 2`, which provides the best speed-accuracy trade-off.

* **model**: The name of the model. We evaluate ekphrasis under its default settings and use the reranker only for the SOTA experiment in the bottom row.

* **hashtags/second**: How many hashtags the model can segment per second. For all hashformers experiments, the `batch_size` parameter was adjusted to take up close to 100% of GPU RAM. A side note: even at 100% GPU memory usage we observe only about 60% GPU utilization, so you may get better results with a GPU that has more than 16GB of memory.

* **accuracy**: Accuracy on the Dev-BOUN dataset. We don't evaluate the accuracy of `gpt2`, but we know [from the literature](https://arxiv.org/abs/2112.03213) that it is expected to be between `distilgpt2` (at 80%) and `gpt2 + bert` (the SOTA, at 83%).

* **topk**: the `topk` parameter of the Beamsearch algorithm (passed as the `topk` argument to the `WordSegmenter.segment` method; see the sketch after this list). The `steps` Beamsearch parameter was fixed at its default value of 13 for all hashformers experiments, as it does not have as significant an impact on performance as `topk`.

* **layers**: How many Transformer layers were used for language modeling: either all layers or only the first (bottom) layer.
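
As referenced in the `topk` item above, here is a minimal sketch of how these parameters fit together for the highlighted configuration (`distilgpt2` at `topk = 2`). The constructor arguments, the batch-size parameter name, and the shape of the return value are based on the `hashformers` README and should be treated as assumptions for your installed version.

```python
# Sketch of the highlighted configuration: distilgpt2 at topk = 2.
from hashformers import WordSegmenter

ws = WordSegmenter(
    segmenter_model_name_or_path="distilgpt2",
    segmenter_gpu_batch_size=1000,  # batch size knob; the name is an assumption
)

segmentations = ws.segment(
    ["#weneedanationalpark", "#icecold"],
    topk=2,    # Beamsearch topk, as reported in the table above
    steps=13,  # Beamsearch steps; assumed to be exposed here as well
)
print(segmentations)
```
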
5 changes: 4 additions & 1 deletion setup.py
@@ -10,6 +10,9 @@
package_dir={'': 'src'},
install_requires=[
"mlm-hashformers",
"lm-scorer-hashformers"
"lm-scorer-hashformers",
"twitter-text-python",
"ekphrasis",
"pandas",
]
)
77 changes: 75 additions & 2 deletions src/hashformers/beamsearch/gpt2_lm.py
@@ -1,9 +1,82 @@
from lm_scorer.models.auto import GPT2LMScorer
from typing import * # pylint: disable=wildcard-import,unused-wildcard-import
import torch
from transformers import AutoTokenizer, GPT2LMHeadModel
from transformers.tokenization_utils import BatchEncoding

class PaddedGPT2LMScorer(GPT2LMScorer):
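    # GPT2LMScorer subclass that registers a dedicated <|pad|> token and
    # resizes the embedding layer so that variable-length candidates can be
    # scored together in padded batches.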

def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)

def _build(self, model_name: str, options: Dict[str, Any]) -> None:
super()._build(model_name, options)

# pylint: disable=attribute-defined-outside-init
self.tokenizer = AutoTokenizer.from_pretrained(
model_name, use_fast=True, add_special_tokens=False
)
# Add the pad token to GPT2 dictionary.
# len(tokenizer) = vocab_size + 1
self.tokenizer.add_special_tokens({"additional_special_tokens": ["<|pad|>"]})
self.tokenizer.pad_token = "<|pad|>"

self.model = GPT2LMHeadModel.from_pretrained(model_name)
# We need to resize the embedding layer because we added the pad token.
self.model.resize_token_embeddings(len(self.tokenizer))
self.model.eval()
if "device" in options:
self.model.to(options["device"])

def _tokens_log_prob_for_batch(
self, text: List[str]
) -> List[Tuple[torch.DoubleTensor, torch.LongTensor, List[str]]]:
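        # For each input text, returns the per-token log-probabilities together
        # with the token ids and token strings, excluding padding positions.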
outputs: List[Tuple[torch.DoubleTensor, torch.LongTensor, List[str]]] = []
if len(text) == 0:
return outputs

# TODO: Handle overflowing elements for long sentences
text = list(map(self._add_special_tokens, text))
encoding: BatchEncoding = self.tokenizer.batch_encode_plus(
text, return_tensors="pt", padding=True, truncation=True
)
with torch.no_grad():
ids = encoding["input_ids"].to(self.model.device)
attention_mask = encoding["attention_mask"].to(self.model.device)
nopad_mask = ids != self.tokenizer.pad_token_id
logits: torch.Tensor = self.model(ids, attention_mask=attention_mask)[0]

for sent_index in range(len(text)):
sent_nopad_mask = nopad_mask[sent_index]
# len(tokens) = len(text[sent_index]) + 1
sent_tokens = [
tok
for i, tok in enumerate(encoding.tokens(sent_index))
if sent_nopad_mask[i] and i != 0
]

# sent_ids.shape = [len(text[sent_index]) + 1]
sent_ids = ids[sent_index, sent_nopad_mask][1:]
# logits.shape = [len(text[sent_index]) + 1, vocab_size]
sent_logits = logits[sent_index, sent_nopad_mask][:-1, :]
sent_logits[:, self.tokenizer.pad_token_id] = float("-inf")
# ids_scores.shape = [seq_len + 1]
sent_ids_scores = sent_logits.gather(1, sent_ids.unsqueeze(1)).squeeze(1)
# log_prob.shape = [seq_len + 1]
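            # Equivalent to a log-softmax over the vocabulary, evaluated at
            # the gathered token ids.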
sent_log_probs = sent_ids_scores - sent_logits.logsumexp(1)

sent_log_probs = cast(torch.DoubleTensor, sent_log_probs)
sent_ids = cast(torch.LongTensor, sent_ids)

output = (sent_log_probs, sent_ids, sent_tokens)
outputs.append(output)

return outputs

class GPT2LM(object):
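    # Convenience wrapper that builds a PaddedGPT2LMScorer and uses it to
    # score candidate segmentations.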

def __init__(self, model_name_or_path, device='cuda', gpu_batch_size=20):
        self.scorer = PaddedGPT2LMScorer(model_name_or_path, device=device, batch_size=gpu_batch_size)

def get_probs(self, list_of_candidates):
scores = self.scorer.sentence_score(list_of_candidates, log=True)