Text Summarization Model.
Added Text Summarization Model.
Niketkumardheeryan authored May 21, 2024
2 parents 773958c + 5c2a6a9 commit 1eac21e
Showing 2 changed files with 202 additions and 0 deletions.
143 changes: 143 additions & 0 deletions Text Summarization Model/Model/text_summary.ipynb
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import nltk\n",
"from nltk.tokenize import sent_tokenize\n",
"from nltk.corpus import stopwords\n",
"from nltk.cluster.util import cosine_distance\n",
"import numpy as np\n",
"import networkx as nx\n",
"\n",
"# Download necessary NLTK resources\n",
"nltk.download('punkt')\n",
"nltk.download('stopwords')\n"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"def read_article(text):\n",
"    sentences = sent_tokenize(text)\n",
"    return sentences"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def sentence_similarity(sent1, sent2, stopwords=None):\n",
"    if stopwords is None:\n",
"        stopwords = []\n",
"\n",
"    # Tokenize each sentence into words (the sentences arrive as strings)\n",
"    sent1 = [w.lower() for w in nltk.word_tokenize(sent1)]\n",
"    sent2 = [w.lower() for w in nltk.word_tokenize(sent2)]\n",
"\n",
"    all_words = list(set(sent1 + sent2))\n",
"\n",
"    vector1 = [0] * len(all_words)\n",
"    vector2 = [0] * len(all_words)\n",
"\n",
"    for w in sent1:\n",
"        if w in stopwords:\n",
"            continue\n",
"        vector1[all_words.index(w)] += 1\n",
"\n",
"    for w in sent2:\n",
"        if w in stopwords:\n",
"            continue\n",
"        vector2[all_words.index(w)] += 1\n",
"\n",
"    # Guard against zero vectors (e.g. sentences made up entirely of stop words)\n",
"    if not any(vector1) or not any(vector2):\n",
"        return 0.0\n",
"    return 1 - cosine_distance(vector1, vector2)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def build_similarity_matrix(sentences, stop_words):\n",
"    similarity_matrix = np.zeros((len(sentences), len(sentences)))\n",
"\n",
"    for idx1 in range(len(sentences)):\n",
"        for idx2 in range(len(sentences)):\n",
"            if idx1 == idx2:\n",
"                continue\n",
"            similarity_matrix[idx1][idx2] = sentence_similarity(sentences[idx1], sentences[idx2], stop_words)\n",
"    return similarity_matrix\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def generate_summary(text, threshold=0.2):\n",
"    stop_words = stopwords.words('english')\n",
"    summarize_text = []\n",
"\n",
"    sentences = read_article(text)\n",
"    sentence_similarity_matrix = build_similarity_matrix(sentences, stop_words)\n",
"    sentence_similarity_graph = nx.from_numpy_array(sentence_similarity_matrix)\n",
"    scores = nx.pagerank(sentence_similarity_graph)\n",
"\n",
"    # Sort the sentences by their PageRank scores\n",
"    ranked_sentences = sorted(((scores[i], s) for i, s in enumerate(sentences)), reverse=True)\n",
"\n",
"    # Include sentences in the summary if their score is above the threshold\n",
"    for score, sentence in ranked_sentences:\n",
"        if score > threshold:\n",
"            summarize_text.append(sentence)\n",
"\n",
"    # Combine selected sentences to create the final summary\n",
"    summary = ' '.join(summarize_text)\n",
"    return summary\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Example usage:\n",
"text = \"\"\"A week ago a friend invited a couple of other couples over for dinner. Eventually, the food (but not the wine) was cleared off the table for what turned out to be some fierce Scrabbling. Heeding the strategy of going for the shorter, more valuable word over the longer cheaper word, our final play was “Bon,” which–as luck would have it!–happens to be a Japanese Buddhist festival, and not, as I had originally asserted while laying the tiles on the board, one half of a chocolate-covered cherry treat. Anyway, the strategy worked. My team only lost by 53 points instead of 58.\"\"\"\n",
"\n",
"summary = generate_summary(text)\n",
"print(summary)\n"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.0"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
59 changes: 59 additions & 0 deletions Text Summarization Model/README.md
# Text Summarization Using TextRank Algorithm

## Overview:
This Python script provides a text summarization algorithm based on the TextRank algorithm. It automatically generates summaries for input text by identifying important sentences using graph-based ranking techniques.

## Dependencies:
Ensure you have the following dependencies installed:
- NLTK (Natural Language Toolkit)
- NumPy
- NetworkX

You can install all three using pip:

```
pip install nltk numpy networkx
```

## Usage:
1. Clone the repository to your local machine.
2. Install the required dependencies using the provided command.
3. Use the `generate_summary` function in the `Model/text_summary.ipynb` notebook to generate summaries for your text data.

## How It Works:
1. **Text Preprocessing:**
- The input text is tokenized into sentences using NLTK's `sent_tokenize` function.
- Stop words and punctuation are removed from each sentence.

2. **Sentence Similarity:**
- Cosine similarity is computed between each pair of sentences based on the occurrence of words after preprocessing.
- A similarity matrix is constructed to represent the similarity between sentences.
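
The bag-of-words cosine similarity in step 2 can be sketched in isolation. The snippet below is an illustrative stand-in, not the notebook's exact function: it uses simple whitespace tokenization instead of NLTK's tokenizer, and `bow_cosine` is a name introduced here for the example:

```python
import numpy as np

def bow_cosine(sent1, sent2, stop_words=()):
    # Lowercase, whitespace-tokenize, and drop stop words
    w1 = [w.lower() for w in sent1.split() if w.lower() not in stop_words]
    w2 = [w.lower() for w in sent2.split() if w.lower() not in stop_words]
    vocab = sorted(set(w1) | set(w2))
    # Term-count vectors over the shared vocabulary
    v1 = np.array([w1.count(w) for w in vocab], dtype=float)
    v2 = np.array([w2.count(w) for w in vocab], dtype=float)
    denom = np.linalg.norm(v1) * np.linalg.norm(v2)
    return float(v1 @ v2 / denom) if denom else 0.0

print(bow_cosine("the cat sat", "the cat ran", stop_words={"the"}))  # 0.5
```

With the stop word removed, the two sentences share one of two remaining words each, giving a cosine similarity of 0.5.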

3. **Graph Representation:**
- The similarity matrix is converted into a graph representation, where each node represents a sentence and edges represent the similarity between sentences.

4. **Ranking:**
- The PageRank algorithm, implemented using NetworkX, is applied to the sentence similarity graph to assign importance scores to each sentence.
- Sentences are ranked based on their importance scores.
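
Steps 3 and 4 amount to a couple of NetworkX calls. A minimal sketch on a hypothetical 3x3 similarity matrix (the values here are made up for illustration):

```python
import numpy as np
import networkx as nx

# Toy similarity matrix: symmetric, zero diagonal (a sentence is not compared to itself)
sim = np.array([[0.0, 0.8, 0.1],
                [0.8, 0.0, 0.3],
                [0.1, 0.3, 0.0]])

graph = nx.from_numpy_array(sim)   # nodes are sentence indices, edge weights are similarities
scores = nx.pagerank(graph)        # dict mapping node index to importance score
print(scores)
```

Sentence 1 carries the most total edge weight here, so it receives the highest PageRank score; the scores across all nodes sum to 1.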

5. **Summary Generation:**
- Sentences with importance scores above a specified threshold are selected to form the summary.
- The selected sentences are concatenated to generate the final summary.
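
The selection in step 5 is a plain sort-and-filter. A sketch with hypothetical scores (in practice these come from PageRank):

```python
# Hypothetical PageRank output: sentence index -> score
scores = {0: 0.45, 1: 0.35, 2: 0.20}
sentences = ["First sentence.", "Second sentence.", "Third sentence."]
threshold = 0.25

# Rank sentences by score, keep those above the threshold, and join them
ranked = sorted(((scores[i], s) for i, s in enumerate(sentences)), reverse=True)
summary = ' '.join(s for score, s in ranked if score > threshold)
print(summary)  # First sentence. Second sentence.
```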

## Example:
```python
text = """Dave watched as the forest burned up on the hill, only a few miles from her house. The car had been hastily packed and Marta was inside trying to round up the last of the pets. Dave went through his mental list of the most important papers and documents that they couldn't leave behind. He scolded himself for not having prepared these better in advance and hoped that he had remembered everything that was needed. He continued to wait for Marta to appear with the pets, but she still was nowhere to be seen."""

summary = generate_summary(text)
print(summary)
```

## Contributing:
Contributions are welcome! Feel free to submit bug reports, feature requests, or pull requests.

## License:
This project is licensed under the MIT License. See the LICENSE file for details.
