Skip to content

Commit

Permalink
v 2.5.5
Browse files Browse the repository at this point in the history
  • Loading branch information
ap1001 committed Jun 18, 2021
1 parent 4fd156f commit 8849aac
Show file tree
Hide file tree
Showing 28 changed files with 324 additions and 3 deletions.
2 changes: 1 addition & 1 deletion Pyostie.egg-info/PKG-INFO
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
Metadata-Version: 2.1
Name: Pyostie
Version: 2.5.4
Version: 2.5.5
Summary: A python package to OCR data and extract text with insights too.
Home-page: https://github.com/anirudhpnbb/Pyostie
Author: Anirudh Palaparthi
Expand Down
9 changes: 8 additions & 1 deletion Pyostie.egg-info/SOURCES.txt
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,14 @@ Pyostie.egg-info/requires.txt
Pyostie.egg-info/top_level.txt
pyostie/__init__.py
pyostie/convert.py
pyostie/csv_extract.py
pyostie/docx.py
pyostie/excel.py
pyostie/image.py
pyostie/insights_ext.py
pyostie/parsers.py
pyostie/pdf.py
pyostie/plots.py
pyostie/pptx.py
pyostie/speechtotext.py
pyostie/text.py
pyostie/utils.py
28 changes: 28 additions & 0 deletions build/lib/pyostie/csv_extract.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
import csv


class CSVParser:
    """Flatten the cells of a delimited text file into one string."""

    def __init__(self, filename, delimiter=","):
        """
        Parameters
        ----------
        filename : The file that needs to be processed.
        delimiter : By default ','. Can be changed if any other delimiter is needed.
        """
        self.file = filename
        self.delimiter = delimiter

    def extract_csv(self):
        """
        Returns
        -------
        A single string holding every cell of the file in reading order,
        with cells (and rows) separated by one space.
        """
        # newline="" hands newline handling to the csv module, so line
        # breaks embedded in quoted fields are parsed correctly.
        with open(self.file, newline="") as file:
            rows = csv.reader(file, delimiter=self.delimiter)
            # The reader is lazy: consume it while the file is still open.
            return ' '.join(' '.join(row) for row in rows)
26 changes: 26 additions & 0 deletions build/lib/pyostie/docx.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
import docx2txt


class DOCXParser:
    """Thin wrapper around ``docx2txt`` for .docx documents."""

    def __init__(self, filename, img_dir):
        """
        Parameters
        ----------
        filename : The file that needs to be processed.
        img_dir : Directory handed to ``docx2txt.process`` as its image
            output location.
        """
        self.img_dir = img_dir
        self.file = filename

    def extract_docx(self):
        """
        Returns
        -------
        Whatever ``docx2txt.process`` returns for the document (the
        extracted text); images are written to ``img_dir``.
        """
        return docx2txt.process(self.file, self.img_dir)
29 changes: 29 additions & 0 deletions build/lib/pyostie/excel.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
import xlrd


class XLSXParser:
    """Read every row of every sheet of a workbook via ``xlrd``."""

    def __init__(self, filename):
        """
        Parameters
        ----------
        filename : The file that needs to be processed.
        """
        self.file = filename

    def extract_xlsx(self):
        """
        Returns
        -------
        A list with one string per row (sheets visited in index order).
        Each string is a leading space followed by the row's cell values,
        converted with ``str`` and joined by single spaces.
        """
        workbook = xlrd.open_workbook(self.file)
        lines = []
        for sheet_idx in range(len(workbook.sheet_names())):
            sheet = workbook.sheet_by_index(sheet_idx)
            lines.extend(
                " " + " ".join(str(cell) for cell in sheet.row_values(row_idx))
                for row_idx in range(sheet.nrows)
            )
        return lines
36 changes: 36 additions & 0 deletions build/lib/pyostie/image.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
import pytesseract
from PIL import Image


class ImageParser:
    """Run Tesseract OCR on an image file through ``pytesseract``."""

    def __init__(self, filename, tess_path=None):
        """
        Parameters
        ----------
        filename : The file that needs to be processed.
        tess_path : The path to the tesseract cmd (Only for windows.)
        """
        self.file = filename
        self.path = tess_path

    def extract_image(self):
        """
        Returns
        -------
        A one-element list containing the text recognised in the image.
        """
        # Only override pytesseract's command when a path was supplied.
        # NOTE: this assigns a module-level setting, so it stays in effect
        # for later pytesseract calls in the same process.
        if self.path is not None:
            pytesseract.pytesseract.tesseract_cmd = self.path
        # The context manager closes the underlying file handle once OCR
        # is done instead of leaving it to the garbage collector.
        with Image.open(self.file) as img:
            text = pytesseract.image_to_string(img)
        return [text]
109 changes: 109 additions & 0 deletions build/lib/pyostie/pdf.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,109 @@
import PyPDF2
import pdfplumber
from pdf2image import convert_from_path
from pkgutil import find_loader


from pyostie.convert import *
from pyostie.insights_ext import *

# pandas is treated as optional: try the import and record the outcome
# instead of probing with the deprecated pkgutil.find_loader. Everything
# that needs ``pd`` at import time lives in the success branch, so the
# module can still be imported when pandas is missing.
try:
    import pandas as pd
except ImportError:
    pandas_installed = False
    a = None
else:
    pandas_installed = True
    a = pd.DataFrame()

ocr_dict_output = []


class PDFParser:
    """Extract text, and optionally per-page OCR insights, from a PDF."""

    def __init__(self, filename, insights=False):
        """
        Parameters
        ----------
        filename : The file that needs to be processed.
        insights : False by default. True if the insights dataframe is
            needed in addition to the text.
        """
        self.file = filename
        self.insights = insights

    def extract_pypdf2(self):
        """
        Returns
        -------
        PDFParser for pdf files.

        ``contents`` is a one-element list holding the text of all pages
        concatenated (prefixed by a single space). When ``insights`` is
        true the result is the tuple ``(dataframe, contents)``; otherwise
        just ``contents``. A document with no pages yields ``None``.
        """
        # Read everything while the handle is open, then close it
        # deterministically instead of leaking the file object.
        with open(self.file, 'rb') as pdf_file:
            reader = PyPDF2.PdfFileReader(pdf_file)
            page_count = reader.getNumPages()
            text = ' ' + ''.join(
                reader.getPage(page_num).extractText()
                for page_num in range(page_count)
            )

        if page_count < 1:
            # Kept from the original behaviour: nothing is returned for
            # an empty document.
            return None

        contents = [text]
        if not self.insights:
            return contents
        if page_count == 1:
            return self._single_page_insights(), contents
        return self._multi_page_insights(), contents

    def _single_page_insights(self):
        """Build the insights dataframe for a one-page PDF."""
        # conversion / generate_insights / remove_files / df are expected
        # to come from the pyostie.convert and pyostie.insights_ext
        # wildcard imports -- NOTE(review): confirm.
        converted = conversion(self.file).convert()
        try:
            return generate_insights(converted, df).generate_df()
        finally:
            # Remove the converted artefacts even if OCR fails.
            remove_files(converted)

    def _multi_page_insights(self):
        """Build one insights dataframe covering every page of the PDF."""
        # os and shutil are not imported explicitly in this module;
        # presumably they arrive through the wildcard imports.
        tempdir = "tempdir"
        converted_dir = os.path.join(tempdir, "converted_files")
        # Start from a clean scratch directory.
        if os.path.isdir(tempdir):
            shutil.rmtree(tempdir)
        os.makedirs(converted_dir)
        try:
            # Save page N as "N.jpg" and remember the paths in page order.
            # The previous code indexed images[val - 1] (storing the last
            # page as 0.jpg) and then relied on os.listdir ordering, so
            # page numbers could be attached to the wrong page.
            page_paths = []
            for page_num, image in enumerate(convert_from_path(self.file)):
                path = os.path.join(converted_dir, str(page_num) + ".jpg")
                image.save(path, "JPEG")
                page_paths.append(path)

            frames = []
            for page_num, path in enumerate(page_paths):
                frame = generate_insights(path, df).generate_df()
                frame["page_num"] = [page_num] * len(frame)
                frames.append(frame)
        finally:
            # Always remove the scratch directory, even on failure.
            shutil.rmtree(tempdir)

        if not frames:
            return pd.DataFrame()
        # drop=True replaces the reset_index() + drop("index", 1) pair,
        # whose positional axis argument is rejected by pandas 2.
        return pd.concat(frames).reset_index(drop=True)

    def extract_pdfplumber(self):
        """
        Returns
        -------
        Works as an alternative for PyPDF2: a list with the result of
        ``extract_text()`` for each page, in page order.
        """
        with pdfplumber.open(self.file) as pdf:
            return [page.extract_text() for page in pdf.pages]
32 changes: 32 additions & 0 deletions build/lib/pyostie/pptx.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
from pptx import Presentation


class PPTXParser:
    """Collect paragraph text from the slides of a .pptx presentation."""

    def __init__(self, filename):
        """
        Parameters
        ----------
        filename : The file that needs to be processed.
        """
        self.file = filename

    def extract_pptx(self):
        """
        Returns
        -------
        A list of paragraph texts, in slide/shape/paragraph order.
        Shapes without a text frame and paragraphs that are empty after
        stripping whitespace are skipped; kept paragraphs are returned
        unstripped.
        """
        deck = Presentation(self.file)
        return [
            para.text
            for slide in deck.slides
            for shape in slide.shapes
            if shape.has_text_frame
            for para in shape.text_frame.paragraphs
            if para.text.strip()
        ]
34 changes: 34 additions & 0 deletions build/lib/pyostie/speechtotext.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
import os
import speech_recognition as sr

from pyostie.utils import *


class speech_to_text:
    """Transcribe an audio file with the ``speech_recognition`` package."""

    def __init__(self, filename):
        """
        Parameters
        ----------
        filename : The file that needs to be processed.
        """
        self.file = filename

    def extract_audio(self):
        """
        Returns
        -------
        speech_to_text for mp3, wav files: a one-element list holding the
        transcription returned by ``recognize_google``.
        """
        # Local import keeps this block self-contained.
        import tempfile

        recog = sr.Recognizer()
        # A unique temporary directory replaces the fixed "tempdir"
        # folder: os.mkdir("tempdir") raised FileExistsError when the
        # folder was already present, and the folder was left behind
        # whenever conversion or recognition failed.
        with tempfile.TemporaryDirectory() as workdir:
            # mp3_to_wav comes from pyostie.utils; presumably it writes a
            # WAV copy to the given path and returns that path -- confirm.
            dst_file = mp3_to_wav(
                self.file, os.path.join(workdir, "sample.wav"), format="wav"
            )
            with sr.AudioFile(dst_file) as source:
                audio = recog.record(source)
                return [recog.recognize_google(audio)]
20 changes: 20 additions & 0 deletions build/lib/pyostie/text.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
class TXTParser:
    """Read a plain-text file in one go."""

    def __init__(self, filename):
        """
        Parameters
        ----------
        filename : The file that needs to be processed.
        """
        self.file = filename

    def extract_txt(self):
        """
        Returns
        -------
        The full contents of the file as a single string (txt, log or
        extension-less files), read in text mode with the platform's
        default encoding.
        """
        with open(self.file) as handle:
            content = handle.read()
        return content
Binary file removed dist/Pyostie-2.5.4-py3-none-any.whl
Binary file not shown.
Binary file removed dist/Pyostie-2.5.4.tar.gz
Binary file not shown.
Binary file added dist/Pyostie-2.5.5-py3-none-any.whl
Binary file not shown.
Binary file added dist/Pyostie-2.5.5.tar.gz
Binary file not shown.
Binary file modified pyostie/__pycache__/__init__.cpython-36.pyc
Binary file not shown.
Binary file modified pyostie/__pycache__/convert.cpython-36.pyc
Binary file not shown.
Binary file modified pyostie/__pycache__/csv_extract.cpython-36.pyc
Binary file not shown.
Binary file modified pyostie/__pycache__/docx.cpython-36.pyc
Binary file not shown.
Binary file modified pyostie/__pycache__/excel.cpython-36.pyc
Binary file not shown.
Binary file modified pyostie/__pycache__/image.cpython-36.pyc
Binary file not shown.
Binary file modified pyostie/__pycache__/insights_ext.cpython-36.pyc
Binary file not shown.
Binary file modified pyostie/__pycache__/pdf.cpython-36.pyc
Binary file not shown.
Binary file modified pyostie/__pycache__/plots.cpython-36.pyc
Binary file not shown.
Binary file modified pyostie/__pycache__/pptx.cpython-36.pyc
Binary file not shown.
Binary file modified pyostie/__pycache__/speechtotext.cpython-36.pyc
Binary file not shown.
Binary file modified pyostie/__pycache__/text.cpython-36.pyc
Binary file not shown.
Binary file modified pyostie/__pycache__/utils.cpython-36.pyc
Binary file not shown.
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
name="Pyostie",

# version of the module
version="2.5.4",
version="2.5.5",

# Name of Author
author="Anirudh Palaparthi",
Expand Down

0 comments on commit 8849aac

Please sign in to comment.