Skip to content

Commit

Permalink
added and improved algorithm to get 100% scraping accuracy
Browse files Browse the repository at this point in the history
even if ocr generates false positives

added and improved both desktop(windows) and colab versions
  • Loading branch information
alloc7260 committed Sep 3, 2022
1 parent 2d1ad05 commit d36ab5a
Show file tree
Hide file tree
Showing 40 changed files with 774 additions and 211 deletions.
347 changes: 347 additions & 0 deletions gturesults.in/automated_result_downloder_gtu_results.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,347 @@
{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"private_outputs": true,
"provenance": [],
"collapsed_sections": []
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
},
"language_info": {
"name": "python"
}
},
"cells": [
{
"cell_type": "markdown",
"source": [
"install selenium\n",
"and\n",
"chrome driver (also define path)"
],
"metadata": {
"id": "hQnMeGfcky0D"
}
},
{
"cell_type": "code",
"source": [
"!pip install selenium\n",
"!apt-get update # to update ubuntu to correctly run apt install\n",
"!apt install chromium-chromedriver\n",
"!cp /usr/lib/chromium-browser/chromedriver /usr/bin\n",
"import sys\n",
"sys.path.insert(0,'/usr/lib/chromium-browser/chromedriver')"
],
"metadata": {
"id": "BGKqk2CXky0E"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"source": [
"install tesseract"
],
"metadata": {
"id": "JKV47yZbky0C"
}
},
{
"cell_type": "code",
"source": [
"!sudo apt install tesseract-ocr\n",
"!pip install pytesseract"
],
"metadata": {
"id": "X47h-_HQky0D"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"source": [
"restart runtime"
],
"metadata": {
"id": "MBT77b1SBIWq"
}
},
{
"cell_type": "code",
"source": [
"import os\n",
"os.kill(os.getpid(), 9)"
],
"metadata": {
"id": "LQu1QserAnKc"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"source": [
"import dependancies"
],
"metadata": {
"id": "-l3BwLvnky0E"
}
},
{
"cell_type": "code",
"source": [
"from selenium import webdriver\n",
"from selenium.webdriver.common.by import By\n",
"from selenium.webdriver.common.keys import Keys\n",
"from selenium.webdriver.support.select import Select\n",
"from selenium.webdriver.chrome.service import Service\n",
"\n",
"import cv2\n",
"from PIL import Image, ImageCms, ImageFilter\n",
"import pytesseract"
],
"metadata": {
"id": "wkot4dh0ky0E"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"source": [
"helper functions"
],
"metadata": {
"id": "tmMP9JI0ky0F"
}
},
{
"cell_type": "code",
"source": [
"def step1():\n",
"\t# open webpage\n",
"\tdriver.get(URL)\n",
"\n",
"\t# save captcha\n",
"\timdata = driver.find_element(By.ID,\"imgCaptcha\")\n",
"\twith open(path, 'wb') as file:\n",
"\t file.write(imdata.screenshot_as_png)\n",
"\n",
"def step2():\n",
"\t# convert to inverted mask and save img_temp\n",
"\tim = cv2.imread(path)\n",
"\tgray = cv2.cvtColor(im, cv2.COLOR_BGR2GRAY)\n",
"\tthresh = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)[1]\n",
"\thorizontal_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (25, 1))\n",
"\tMask = cv2.morphologyEx(thresh, cv2.MORPH_OPEN,horizontal_kernel, iterations=2)\n",
"\t#Mask = cv2.bitwise_not(Mask)\n",
"\tcv2.imwrite(\"old.png\", Mask)\n",
"\n",
"\t# open img_temp and reinvert mask\n",
"\timg = Image.open(\"old.png\")\n",
"\timg = img.convert(\"RGBA\")\n",
"\tdatas = img.getdata()\n",
"\tnewData = []\n",
"\tfor item in datas:\n",
"\t if item[0] == 0 and item[1] == 0 and item[2] == 0:\n",
"\t newData.append((255, 255, 255, 0))\n",
"\t else:\n",
"\t newData.append(item)\n",
"\timg.putdata(newData)\n",
"\n",
"\t# paste mask on img and save new_temp_img\n",
"\tbackground = Image.open(path)\n",
"\tbackground = background.convert(\"RGBA\")\n",
"\tbackground.paste(img,mask=img)\n",
"\tbackground.save(\"new.png\",\"PNG\")\n",
"\n",
"def step3(im): # solve captcha\n",
" im = Image.open(im) # open last saved img\n",
" im = im.crop((5,5,115,35)) # crop it\n",
" # conver image to extractable form elements (deffer captcha styles)\n",
" rgb = ImageCms.createProfile(colorSpace='sRGB')\n",
" lab = ImageCms.createProfile(colorSpace='LAB')\n",
" transform = ImageCms.buildTransform(inputProfile=rgb, outputProfile=lab, inMode='RGB', outMode='LAB')\n",
" lab_im = ImageCms.applyTransform(im=im, transform=transform)\n",
" l, a, b = lab_im.split()\n",
" im=l # select an element which is most extractable\n",
" im = im.filter(ImageFilter.MinFilter(3)) # filter it\n",
" result = pytesseract.image_to_string(im) # send it to ocr and save results to a variable\n",
" l=[]\n",
" l.append(result.strip())\n",
" if l[0]==\" \" or l[0]==\"\" : # if result will be empty then it will do above steps again untill it gets the result\n",
" step1()\n",
" step2()\n",
" l[0]=step3(\"new.png\")\n",
" return l[0] # return final result (maybe right or wrong)\n",
"\n",
"def step4(enroll,fname,ans): # print webpage\n",
"\t# site automation \n",
"\tsel = Select(driver.find_element(By.ID,\"ddlbatch\")) # focus on select element\n",
"\tsel.select_by_value(exam) # select element by giving id (specific for a exam)\n",
"\tenr = driver.find_element(By.ID,\"txtenroll\") # get enrollment no. text box\n",
"\tcaptex = driver.find_element(By.ID,\"CodeNumberTextBox\") # get captcha text box\n",
"\tenr.send_keys(enroll) # send (type) given enrollment number to text box\n",
"\tcaptex.send_keys(ans) # send (type) extracted captcha text to text box\n",
"\tcaptex.send_keys(Keys.RETURN) # return (ENTER)\n",
"\n",
"\tere = driver.find_element(By.ID,\"lblmsg\").text\n",
"\tif ere == \"ERROR: Incorrect captcha code, try again.\" : \n",
"\t\treturn \"err\"\n",
"\tif ere == \"Your request count is reached to maximum limit, Please try again later.\" : \n",
"\t\treturn \"reqover\"\n",
"\n",
"\t# resize window for full view screenshot\n",
"\tdriver.set_window_size(1000,1000) \n",
"\tdriver.find_element(By.TAG_NAME,\"body\").screenshot(fname) # take a screenshot and save it to given filename\n",
"\n",
"def loop():\n",
"\t# just a loop through different enrollment numbers\n",
"\tglobal counter\n",
"\tmynewlist = []\n",
"\tfor i in mylist :\n",
"\t\tenroll = \"{}\".format(i)\n",
"\t\tfname = \"./outputs/{}.png\".format(i)\n",
"\t\tstep1()\n",
"\t\tstep2()\n",
"\t\tans=step3(\"new.png\")\n",
"\t\tnr=step4(enroll,fname,ans)\n",
"\t\tif nr == \"err\" :\n",
"\t\t\tmynewlist.append(enroll)\n",
"\t\telif nr == \"reqover\" :\n",
"\t\t\tprint(\"Change the SERVER!\")\n",
"\t\t\tbreak\n",
"\t\telse :\n",
"\t\t\tcounter += 1\n",
"\t\t\tprint(f\"{counter}/{tc} {int(counter*100/tc)}%\")\n",
"\treturn mynewlist"
],
"metadata": {
"id": "2ROK21SYky0F"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"source": [
"make outputs directory for collecting outputs"
],
"metadata": {
"id": "WtxZCyDj8mrc"
}
},
{
"cell_type": "code",
"source": [
"!mkdir /content/outputs"
],
"metadata": {
"id": "Z4kbPXeh8a3d"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"source": [
"main function"
],
"metadata": {
"id": "R2TKjx9_ky0G"
}
},
{
"cell_type": "code",
"source": [
"# initiate webdriver and configure options\n",
"chrome_options = webdriver.ChromeOptions()\n",
"chrome_options.add_argument('--headless')\n",
"chrome_options.add_argument('--no-sandbox')\n",
"chrome_options.add_argument('--disable-dev-shm-usage')\n",
"chrome_options.add_argument(\"--incognito\")\n",
"\n",
"# initiate chromedriver service\n",
"ser = Service(\"chromedriver\")\n",
"# start webdriver\n",
"driver = webdriver.Chrome(service=ser,options=chrome_options)\n",
"\n",
"# define url and filename for download captcha_temp\n",
"URL = \"https://www.gturesults.in/\"\n",
"path=\"cap.jpg\"\n",
"# define exam\n",
"exam = \"3361$S2022$2022-08-25$current$0\"\n",
" \n",
"# take enrollment no. input from excel file\n",
"# import pandas as pd\n",
"# df = pd.read_excel('g.xlsx')\n",
"# mylist = df['Enroll No.'].tolist()\n",
"#------------------- OR --------------------\n",
"# we can define our own list here\n",
"#mylist = [190280111010]\n",
"#------------------- OR --------------------\n",
"# we can define our range here\n",
"mylist = range(190280111001,190280111020+1)\n",
"\n",
"counter = 0\n",
"tc = len(mylist)\n",
"\n",
"while 1:\n",
"\tmynewlist=loop()\n",
"\tif len(mynewlist) != 0:\n",
"\t\tmylist = mynewlist\n",
"\telse:\n",
"\t\tbreak\n",
"\n",
"# closing webinstance\n",
"driver.quit()\n",
"\n",
"# removing unwanted files \n",
"!rm cap.jpg\n",
"!rm new.png\n",
"!rm old.png"
],
"metadata": {
"id": "TGsoejuHky0G"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"source": [
"download folder\n"
],
"metadata": {
"id": "obUWTMAVziTB"
}
},
{
"cell_type": "code",
"source": [
"!zip -r /content/outputs.zip /content/outputs"
],
"metadata": {
"id": "ujWFCgxDo7JY"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"source": [
"👈 download <font color='yellow'>outputs.zip</font> from left side bar by double clicking it"
],
"metadata": {
"id": "qgkULANw3ZQ2"
}
}
]
}
Binary file modified gturesults.in/outputs/190280105031.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file modified gturesults.in/outputs/190280106068.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file modified gturesults.in/outputs/190280111005.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file modified gturesults.in/outputs/190280111006.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file modified gturesults.in/outputs/190280111009.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file modified gturesults.in/outputs/190280111010.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file modified gturesults.in/outputs/190280111011.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file modified gturesults.in/outputs/190280111018.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file modified gturesults.in/outputs/190280111029.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file modified gturesults.in/outputs/190280111031.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file modified gturesults.in/outputs/190280111032.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file modified gturesults.in/outputs/190280111033.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file modified gturesults.in/outputs/190280111036.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file modified gturesults.in/outputs/190280111038.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file modified gturesults.in/outputs/190280111043.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file modified gturesults.in/outputs/190320105007.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file modified gturesults.in/outputs/190320107013.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file modified gturesults.in/outputs/190320107506.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file modified gturesults.in/outputs/190320107521.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file modified gturesults.in/outputs/190320107528.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file modified gturesults.in/outputs/190320116006.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file modified gturesults.in/outputs/191260107004.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading

0 comments on commit d36ab5a

Please sign in to comment.