Skip to content

Commit

Permalink
Merge pull request #1 from iitb-research-code/devsb
Browse files Browse the repository at this point in the history
postprocessing for singular word treatment
  • Loading branch information
kasuba-badri-vishal authored Jul 12, 2023
2 parents 2d43e4c + 21b1f11 commit 1ffe8f3
Showing 1 changed file with 51 additions and 0 deletions.
51 changes: 51 additions & 0 deletions experiments/postprocessing_singular_word_treatment.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
import re
from bs4 import BeautifulSoup, Tag


def postprocess(soup):
    """Fold short stray lines of an hOCR tree into the preceding paragraph.

    For every ``<div>`` in ``soup`` whose next ``<span>`` has at most three
    child nodes, the next ``ocr_line`` span is moved directly after the
    previous ``ocr_line`` span, and the ``title`` of the previous ``<div>``
    and of the previous ``ocr_par`` paragraph is rewritten to a bbox that
    encloses both the old bbox and the moved line.  Divs for which any of
    those neighbours is missing, or whose titles cannot be parsed, are
    left untouched.

    Afterwards the children of the first ``<div>`` are scanned and blocks
    that now look empty are removed from the tree.

    Args:
        soup: A parsed BeautifulSoup document (modified in place).

    Returns:
        The same ``soup`` object.
    """
    for soup_div_child in soup.find_all("div"):
        next_span = soup_div_child.find_next("span")
        # len() of a bs4 tag is the number of its direct child nodes; a
        # missing span used to be rejected only via a swallowed TypeError.
        if next_span is None or len(next_span) > 3:
            continue

        # A plain string passed as ``attrs`` is matched against the CSS class.
        prev_div = soup_div_child.find_previous("div")
        prev_p = soup_div_child.find_previous("p", attrs="ocr_par")
        prev_span = soup_div_child.find_previous("span", attrs="ocr_line")
        curr_span = soup_div_child.find_next("span", attrs="ocr_line")
        neighbours = (prev_div, prev_p, prev_span, curr_span)
        if any(node is None for node in neighbours):
            continue

        try:
            curr_box = _title_bbox(curr_span)
            div_box = _title_bbox(prev_div)
            par_box = _title_bbox(prev_p)
            # Move the line before touching any title so that a refused
            # insertion cannot leave widened bboxes without a merged line.
            prev_span.insert_after(curr_span)
        except (AttributeError, KeyError, TypeError, ValueError):
            # Missing or malformed ``title``: best effort, skip this div.
            continue

        # NOTE: only the bbox is written back; anything that followed the
        # first ";" in the old title is dropped, as before.
        prev_div.attrs["title"] = _enclosing_bbox_title(curr_box, div_box)
        prev_p.attrs["title"] = _enclosing_bbox_title(curr_box, par_box)

    # removing the leftover div class
    first_div = soup.div
    if first_div is None:
        # No <div> at all: nothing to clean up (iterating None would raise).
        return soup

    for soup_div in first_div:
        following = soup_div.next_element
        if following is None:
            # Last node of the document: there is nothing after it.
            continue
        for el in following:
            # Iterating a text node yields plain characters, never Tags.
            if isinstance(el, Tag) and len(list(el.children)) < 3:
                el.parent.decompose()
                # ``following`` (el's parent) has just been destroyed, so
                # its remaining children must not be inspected any more.
                break

    return soup


def _title_bbox(tag):
    """Return the ``(x1, y1, x2, y2)`` ints of the bbox in ``tag``'s title."""
    bbox_field = tag.attrs["title"].split(";")[0]
    x1, y1, x2, y2 = map(int, bbox_field.split(" ")[1:])
    return x1, y1, x2, y2


def _enclosing_bbox_title(box, other):
    """Return a ``bbox x1 y1 x2 y2`` title string enclosing both boxes."""
    x1 = min(box[0], other[0])
    y1 = min(box[1], other[1])
    x2 = max(box[2], other[2])
    y2 = max(box[3], other[3])
    return f"bbox {x1} {y1} {x2} {y2}"


def hocr_correction(hocr):
    """Parse hOCR markup and run ``postprocess`` on the resulting tree.

    Args:
        hocr: Markup passed to ``BeautifulSoup`` using the ``html.parser``
            backend.

    Returns:
        Whatever ``postprocess`` returns for the parsed document.
    """
    return postprocess(BeautifulSoup(hocr, "html.parser"))

0 comments on commit 1ffe8f3

Please sign in to comment.