facebookresearch · heffernankevin · Dec 7, 2023 · Nov 24, 2023 · Nov 24, 2023 · Nov 24, 2023
diff --git a/tasks/SentimentAnalysis/README.md b/tasks/SentimentAnalysis/README.md
@@ -0,0 +1,32 @@
+# Laser Encoder: Sentiment Analysis
+
+## Overview
+
+This project demonstrates how to use the Laser Encoder tool to create sentence embeddings for sentiment analysis. The Laser Encoder encodes the text data, and a sentiment analysis model is trained on the resulting embeddings to predict the sentiment of the text.
+
+## Getting Started
+
+To run the notebook in Google Colab, simply click the "Open in Colab" button below:
+
+[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/12gQUG7rPJvOVeWQkpMFzMiixqwDIdv4W?usp=sharing)
+
+## Example Usage
+
+- Download Dataset:
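+    Download the sample dataset from the following link: [Sample Dataset](https://www.kaggle.com/datasets/abhi8923shriv/sentiment-analysis-dataset). The notebook reads `train.csv` from `/content/drive/MyDrive/dataset/train.csv`, so either place the file there or update the path in the notebook.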
+
+- Run the Example Notebook:
+    Execute the provided Jupyter notebook, `SentimentAnalysis.ipynb`:
+
+        jupyter notebook SentimentAnalysis.ipynb
+
+
+## Customization
+
+- Modify the model architecture, hyperparameters, and training settings in the notebook cell that builds the neural network model, based on your requirements.
+- Customize the sentiment mapping and the handling of unknown sentiments in the notebook cell that prepares the data.
+
+## Additional Notes
+- Feel free to experiment with different models, embeddings, and hyperparameters to optimize performance.
+- Ensure that the dimensions of the embeddings and the model inputs are compatible (the notebook's model uses `input_shape=(1024,)` for the encoder's sentence embeddings).
+- Adapt the code based on your specific dataset and use case.
diff --git a/tasks/SentimentAnalysis/SentimentAnalysis.ipynb b/tasks/SentimentAnalysis/SentimentAnalysis.ipynb
@@ -0,0 +1,186 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "! pip install laser_encoders\n",
+    "! pip install chardet"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import numpy as np\n",
+    "import pandas as pd\n",
+    "import chardet\n",
+    "from laser_encoders import LaserEncoderPipeline\n",
+    "from sklearn.model_selection import train_test_split\n",
+    "from sklearn.metrics import accuracy_score\n",
+    "from sklearn.linear_model import LogisticRegression\n",
+    "from sklearn.preprocessing import LabelEncoder\n",
+    "from tensorflow.keras.models import Sequential\n",
+    "from tensorflow.keras.layers import Dense\n",
+    "from tqdm import tqdm"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "with open('/content/drive/MyDrive/dataset/train.csv', 'rb') as f:\n",
+    "    result = chardet.detect(f.read())\n",
+    "\n",
+    "# Use the detected encoding when reading the CSV file\n",
+    "data = pd.read_csv('/content/drive/MyDrive/dataset/train.csv', encoding=result['encoding'])\n",
+    "data = data[['sentiment', 'text']]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "print(data.head())\n",
+    "print(data.shape)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "sentiments = []\n",
+    "texts = []\n",
+    "\n",
+    "for index, row in data.iterrows():\n",
+    "    sentiment = row['sentiment'].lower()  # Convert to lowercase for case-insensitivity\n",
+    "    if sentiment == 'neutral':\n",
+    "        sentiments.append(1)\n",
+    "    elif sentiment == 'positive':\n",
+    "        sentiments.append(2)\n",
+    "    elif sentiment == 'negative':\n",
+    "        sentiments.append(3)\n",
+    "    else:\n",
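+    "        # Handle the case where sentiment is not one of the expected values.\n",
+    "        # NOTE: this branch only warns; the row's text is still appended below, so `sentiments` and `texts` fall out of alignment. To skip the row (or handle it differently), add `continue` after the print.\n",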
+    "        print(f\"Warning: Unknown sentiment '{sentiment}' in row {index}\")\n",
+    "\n",
+    "    text = row['text']\n",
+    "    texts.append(text)\n",
+    "\n",
+    "print(len(sentiments))\n",
+    "print(len(texts))\n",
+    "sentiments = sentiments[:300] + sentiments[400:]\n",
+    "texts = texts[:300] + texts[400:]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "label_encoder = LabelEncoder()\n",
+    "encoded_sentiments = label_encoder.fit_transform(sentiments)\n",
+    "\n",
+    "# Split the data into training and testing sets\n",
+    "X_train, X_test, y_train, y_test = train_test_split(texts, encoded_sentiments, test_size=0.2, random_state=42)\n",
+    "\n",
+    "# Initialize the LaserEncoder\n",
+    "encoder = LaserEncoderPipeline(lang=\"eng_Latn\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Initialize empty arrays to store embeddings\n",
+    "X_train_embeddings = []\n",
+    "X_test_embeddings = []\n",
+    "\n",
+    "# Encode sentences line-wise using tqdm for progress visualization\n",
+    "print(\"Encoding training sentences:\")\n",
+    "for sentence in tqdm(X_train):\n",
+    "    embeddings = encoder.encode_sentences([sentence])[0]\n",
+    "    X_train_embeddings.append(embeddings)\n",
+    "\n",
+    "print(\"Encoding testing sentences:\")\n",
+    "for sentence in tqdm(X_test):\n",
+    "    embeddings = encoder.encode_sentences([sentence])[0]\n",
+    "    X_test_embeddings.append(embeddings)\n",
+    "\n",
+    "# Convert lists to numpy arrays\n",
+    "X_train_embeddings = np.array(X_train_embeddings)\n",
+    "X_test_embeddings = np.array(X_test_embeddings)\n",
+    "\n",
+    "# # Encode sentences line-wise\n",
+    "# X_train_embeddings = np.array([encoder.encode_sentences([sentence])[0] for sentence in X_train])\n",
+    "# X_test_embeddings = np.array([encoder.encode_sentences([sentence])[0] for sentence in X_test])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Build a simple neural network model\n",
+    "model = Sequential()\n",
+    "model.add(Dense(64, input_shape=(1024,), activation='relu'))\n",
+    "model.add(Dense(3, activation='softmax'))  # Assuming 3 classes (neutral, positive, negative)\n",
+    "\n",
+    "# Compile the model\n",
+    "model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])\n",
+    "\n",
+    "# Train the model\n",
+    "model.fit(X_train_embeddings, y_train, epochs=20, batch_size=32, validation_split=0.1)\n",
+    "\n",
+    "# Evaluate the model on the test set\n",
+    "accuracy = model.evaluate(X_test_embeddings, y_test)[1]\n",
+    "print(f\"Accuracy: {accuracy * 100:.2f}%\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Now, you can use the trained model to predict the sentiment of user input\n",
+    "user_text = input(\"Enter a text: \")\n",
+    "user_text_embedding = encoder.encode_sentences([user_text])[0]\n",
+    "user_text_embedding = np.reshape(user_text_embedding, (1, -1))\n",
+    "\n",
+    "predicted_sentiment = np.argmax(model.predict(user_text_embedding))\n",
+    "predicted_sentiment_no = label_encoder.inverse_transform([predicted_sentiment])[0]\n",
+    "if predicted_sentiment_no == 1:\n",
+    "  predicted_sentiment_label = 'neutral'\n",
+    "elif predicted_sentiment_no == 2:\n",
+    "  predicted_sentiment_label = 'positive'\n",
+    "else:\n",
+    "  predicted_sentiment_label = 'negative'\n",
+    "\n",
+    "print(f\"Predicted Sentiment: {predicted_sentiment_label}\")"
+   ]
+  }
+ ],
+ "metadata": {
+  "language_info": {
+   "name": "python"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}