v 2.5.5

anirudhpnbb · Jun 18, 2021 · 8849aac · 8849aac
1 parent 4fd156f
commit 8849aac
Show file tree

Hide file tree

Showing 28 changed files with 324 additions and 3 deletions.
diff --git a/Pyostie.egg-info/PKG-INFO b/Pyostie.egg-info/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: Pyostie
-Version: 2.5.4
+Version: 2.5.5
 Summary: A python package to OCR data and extract text with insights too.
 Home-page: https://github.com/anirudhpnbb/Pyostie
 Author: Anirudh Palaparthi

diff --git a/Pyostie.egg-info/SOURCES.txt b/Pyostie.egg-info/SOURCES.txt
@@ -8,7 +8,14 @@ Pyostie.egg-info/requires.txt
 Pyostie.egg-info/top_level.txt
 pyostie/__init__.py
 pyostie/convert.py
+pyostie/csv_extract.py
+pyostie/docx.py
+pyostie/excel.py
+pyostie/image.py
 pyostie/insights_ext.py
-pyostie/parsers.py
+pyostie/pdf.py
 pyostie/plots.py
+pyostie/pptx.py
+pyostie/speechtotext.py
+pyostie/text.py
 pyostie/utils.py
diff --git a/build/lib/pyostie/csv_extract.py b/build/lib/pyostie/csv_extract.py
@@ -0,0 +1,28 @@
+import csv
+
+
+class CSVParser:
+    """Flatten a delimited text file into one space-separated string."""
+
+    def __init__(self, filename, delimiter=','):
+        """
+
+        Parameters
+        ----------
+        filename : The file that needs to be processed.
+        delimiter : By default ','. Can be changed if any other delimiter is needed.
+
+        """
+        self.file = filename
+        self.delimiter = delimiter
+
+    def extract_csv(self):
+        """
+
+        Returns
+        -------
+        str : every cell of every row, in reading order, joined by single spaces.
+
+        """
+        # newline='' hands newline handling to the csv module, so line breaks
+        # embedded in quoted fields are returned unchanged.
+        with open(self.file, newline='') as file:
+            rows = csv.reader(file, delimiter=self.delimiter)
+            return ' '.join(' '.join(row) for row in rows)
diff --git a/build/lib/pyostie/docx.py b/build/lib/pyostie/docx.py
@@ -0,0 +1,26 @@
+import docx2txt
+
+
+class DOCXParser:
+    """Thin wrapper around ``docx2txt.process`` for a single document."""
+
+    def __init__(self, filename, img_dir):
+        """
+
+        Parameters
+        ----------
+        filename : Path of the document to process.
+        img_dir : Directory handed to docx2txt as the image output location.
+        """
+        self.img_dir = img_dir
+        self.file = filename
+
+    def extract_docx(self):
+        """
+
+        Returns
+        -------
+        The result of ``docx2txt.process`` for the stored file; images are
+        written in img_dir.
+
+        """
+        return docx2txt.process(self.file, self.img_dir)
diff --git a/build/lib/pyostie/excel.py b/build/lib/pyostie/excel.py
@@ -0,0 +1,29 @@
+import xlrd
+
+
+class XLSXParser:
+    """Turn every row of every sheet in a workbook into one text line."""
+
+    def __init__(self, filename):
+        """
+
+        Parameters
+        ----------
+        filename : Path of the workbook to process.
+        """
+        self.file = filename
+
+    def extract_xlsx(self):
+        """
+
+        Returns
+        -------
+        list of str : one entry per row, sheets visited in index order; each
+        entry is a leading space followed by the row's values joined by spaces.
+        """
+        workbook = xlrd.open_workbook(self.file)
+        lines = []
+        for index, _name in enumerate(workbook.sheet_names()):
+            sheet = workbook.sheet_by_index(index)
+            lines.extend(
+                " " + " ".join(map(str, sheet.row_values(row)))
+                for row in range(sheet.nrows)
+            )
+        return lines
diff --git a/build/lib/pyostie/image.py b/build/lib/pyostie/image.py
@@ -0,0 +1,36 @@
+import pytesseract
+from PIL import Image
+
+
+class ImageParser:
+    """Run pytesseract OCR over a single image file."""
+
+    def __init__(self, filename, tess_path=None):
+        """
+
+        Parameters
+        ----------
+        filename : The file that needs to be processed.
+        tess_path : The path to the tesseract cmd (Only for windows.)
+        """
+        self.file = filename
+        self.path = tess_path
+
+    def extract_image(self):
+        """
+
+        Returns
+        -------
+        list : a single-element list holding the string returned by
+        ``pytesseract.image_to_string`` for the image.
+
+        """
+        # Only override the tesseract command when the caller supplied one.
+        if self.path is not None:
+            pytesseract.pytesseract.tesseract_cmd = self.path
+        # The context manager closes the underlying image file once OCR is done.
+        with Image.open(self.file) as img:
+            text = pytesseract.image_to_string(img)
+        return [text]
diff --git a/build/lib/pyostie/pdf.py b/build/lib/pyostie/pdf.py
@@ -0,0 +1,109 @@
+import PyPDF2
+import pdfplumber
+from pdf2image import convert_from_path
+from pkgutil import find_loader
+
+
+from pyostie.convert import *
+from pyostie.insights_ext import *
+
+# pandas is treated as optional at import time: record whether it is available
+# instead of letting the module fail to import.
+try:
+    import pandas as pd
+except ImportError:
+    pandas_installed = False
+else:
+    pandas_installed = True
+
+
+# Only build the placeholder frame when pandas was imported; referencing ``pd``
+# unconditionally would raise NameError and defeat the guard above.
+a = pd.DataFrame() if pandas_installed else None
+ocr_dict_output = []
+
+
+class PDFParser:
+    """Extract text, and optionally per-page OCR insights, from a PDF file."""
+
+    def __init__(self, filename, insights=False):
+        """
+
+        Parameters
+        ----------
+        filename : The file that needs to be processed.
+        insights : False by default. True if the insights dataframe is needed
+            in addition to the text.
+        """
+        self.file = filename
+        self.insights = insights
+
+    def _read_text(self):
+        """Return ``(page_count, text)`` read with PyPDF2; the file is closed afterwards."""
+        text = ' '
+        with open(self.file, 'rb') as pdf_file:
+            reader = PyPDF2.PdfFileReader(pdf_file)
+            page_count = reader.getNumPages()
+            for index in range(page_count):
+                text = text + reader.getPage(index).extractText()
+        return page_count, text
+
+    def _multipage_insights(self):
+        """Render each page to a JPEG and return one dataframe with a ``page_num`` column."""
+        tempdir = "tempdir"
+        converted_files = os.path.join(tempdir, "converted_files")
+        # Start from a clean scratch directory; a leftover one is discarded.
+        if os.path.isdir(tempdir):
+            shutil.rmtree(tempdir)
+        os.makedirs(converted_files)
+        frames = []
+        try:
+            # Remember the saved paths in page order rather than re-listing the
+            # directory: os.listdir gives no ordering guarantee, which would
+            # attach the wrong page_num to a page.
+            image_paths = []
+            for page_num, image in enumerate(convert_from_path(self.file)):
+                image_path = os.path.join(converted_files, str(page_num) + ".jpg")
+                image.save(image_path, "JPEG")
+                image_paths.append(image_path)
+            for page_num, image_path in enumerate(image_paths):
+                # NOTE(review): ``df`` is not defined in this module's visible
+                # code; presumably it comes from the star imports - confirm.
+                page_df = generate_insights(image_path, df).generate_df()
+                page_df["page_num"] = [page_num] * len(page_df)
+                frames.append(page_df)
+        finally:
+            # Remove the scratch directory even when conversion or OCR fails.
+            shutil.rmtree(tempdir)
+        if not frames:
+            return pd.DataFrame()
+        return pd.concat(frames).reset_index(drop=True)
+
+    def extract_pypdf2(self):
+        """
+
+        Returns
+        -------
+        list : single-element list with the concatenated text of all pages,
+        when ``insights`` is False.
+        tuple : ``(dataframe, list)`` when ``insights`` is True; for PDFs that
+        do not have exactly one page the dataframe carries a 0-based
+        ``page_num`` column.
+
+        """
+        page_count, text = self._read_text()
+        contents = [text]
+        if not self.insights:
+            return contents
+
+        if page_count == 1:
+            __conv = conversion(self.file).convert()
+            # NOTE(review): ``df`` is not defined in this module's visible
+            # code; presumably it comes from the star imports - confirm.
+            __insights = generate_insights(__conv, df).generate_df()
+            remove_files(__conv)
+            return __insights, contents
+
+        return self._multipage_insights(), contents
+
+    def extract_pdfplumber(self):
+        """
+
+        Returns
+        -------
+        list : the result of ``extract_text()`` for each page, in page order.
+        Works as an alternative for PyPDF2.
+        """
+        with pdfplumber.open(self.file) as pdf:
+            return [page.extract_text() for page in pdf.pages]
diff --git a/build/lib/pyostie/pptx.py b/build/lib/pyostie/pptx.py
@@ -0,0 +1,32 @@
+from pptx import Presentation
+
+
+class PPTXParser:
+    """Collect the non-blank paragraph texts of a presentation."""
+
+    def __init__(self, filename):
+        """
+
+        Parameters
+        ----------
+        filename : Path of the presentation to process.
+        """
+        self.file = filename
+
+    def extract_pptx(self):
+        """
+
+        Returns
+        -------
+        list of str : the unstripped text of every paragraph that is not
+        blank, taken from shapes that have a text frame, in slide order.
+        """
+        deck = Presentation(self.file)
+        return [
+            paragraph.text
+            for slide in deck.slides
+            for shape in slide.shapes
+            if shape.has_text_frame
+            for paragraph in shape.text_frame.paragraphs
+            if paragraph.text.strip()
+        ]
diff --git a/build/lib/pyostie/speechtotext.py b/build/lib/pyostie/speechtotext.py
@@ -0,0 +1,34 @@
+import os
+import speech_recognition as sr
+
+from pyostie.utils import *
+
+
+class speech_to_text:
+    """Transcribe an audio file with ``speech_recognition``'s ``recognize_google``."""
+
+    def __init__(self, filename):
+        """
+
+        Parameters
+        ----------
+        filename : The file that needs to be processed.
+        """
+        self.file = filename
+
+    def extract_audio(self):
+        """
+
+        Returns
+        -------
+        list : single-element list with the transcription returned by
+        ``recognize_google`` for mp3, wav files.
+        """
+        # Imported here so the name is guaranteed to be in scope; the module
+        # otherwise only gets it, if at all, through a star import.
+        import shutil
+
+        tempdir = "tempdir"
+        # Tolerate a directory left behind by an earlier failed run instead of
+        # raising FileExistsError.
+        os.makedirs(tempdir, exist_ok=True)
+        try:
+            dst_file = mp3_to_wav(self.file, "tempdir/sample.wav", format="wav")
+            recog = sr.Recognizer()
+            with sr.AudioFile(dst_file) as source:
+                audio = recog.record(source)
+            return [recog.recognize_google(audio)]
+        finally:
+            # Always remove the scratch directory, even when conversion or
+            # recognition raises.
+            shutil.rmtree(tempdir)
diff --git a/build/lib/pyostie/text.py b/build/lib/pyostie/text.py
@@ -0,0 +1,20 @@
+class TXTParser:
+    """Read a plain-text file in one go."""
+
+    def __init__(self, filename):
+        """
+
+        Parameters
+        ----------
+        filename : Path of the text file to process.
+        """
+        self.file = filename
+
+    def extract_txt(self):
+        """
+
+        Returns
+        -------
+        str : the full contents of the file (txt, log or no extension files).
+        """
+        with open(self.file) as handle:
+            contents = handle.read()
+        return contents
diff --git a/dist/Pyostie-2.5.4-py3-none-any.whl b/dist/Pyostie-2.5.4-py3-none-any.whl
diff --git a/dist/Pyostie-2.5.4.tar.gz b/dist/Pyostie-2.5.4.tar.gz
diff --git a/dist/Pyostie-2.5.5-py3-none-any.whl b/dist/Pyostie-2.5.5-py3-none-any.whl
diff --git a/dist/Pyostie-2.5.5.tar.gz b/dist/Pyostie-2.5.5.tar.gz
diff --git a/pyostie/__pycache__/__init__.cpython-36.pyc b/pyostie/__pycache__/__init__.cpython-36.pyc
diff --git a/pyostie/__pycache__/convert.cpython-36.pyc b/pyostie/__pycache__/convert.cpython-36.pyc
diff --git a/pyostie/__pycache__/csv_extract.cpython-36.pyc b/pyostie/__pycache__/csv_extract.cpython-36.pyc
diff --git a/pyostie/__pycache__/docx.cpython-36.pyc b/pyostie/__pycache__/docx.cpython-36.pyc
diff --git a/pyostie/__pycache__/excel.cpython-36.pyc b/pyostie/__pycache__/excel.cpython-36.pyc
diff --git a/pyostie/__pycache__/image.cpython-36.pyc b/pyostie/__pycache__/image.cpython-36.pyc
diff --git a/pyostie/__pycache__/insights_ext.cpython-36.pyc b/pyostie/__pycache__/insights_ext.cpython-36.pyc
diff --git a/pyostie/__pycache__/pdf.cpython-36.pyc b/pyostie/__pycache__/pdf.cpython-36.pyc
diff --git a/pyostie/__pycache__/plots.cpython-36.pyc b/pyostie/__pycache__/plots.cpython-36.pyc
diff --git a/pyostie/__pycache__/pptx.cpython-36.pyc b/pyostie/__pycache__/pptx.cpython-36.pyc
diff --git a/pyostie/__pycache__/speechtotext.cpython-36.pyc b/pyostie/__pycache__/speechtotext.cpython-36.pyc
diff --git a/pyostie/__pycache__/text.cpython-36.pyc b/pyostie/__pycache__/text.cpython-36.pyc
diff --git a/pyostie/__pycache__/utils.cpython-36.pyc b/pyostie/__pycache__/utils.cpython-36.pyc
diff --git a/setup.py b/setup.py
@@ -8,7 +8,7 @@
     name="Pyostie",
 
     # version of the module
-    version="2.5.4",
+    version="2.5.5",
 
     # Name of Author
     author="Anirudh Palaparthi",