-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
added and improved algorithm to get 100% scraping accuracy
even if ocr generates false positives added and improved both desktop(windows) and colab versions
- Loading branch information
Showing
40 changed files
with
774 additions
and
211 deletions.
There are no files selected for viewing
347 changes: 347 additions & 0 deletions
347
gturesults.in/automated_result_downloder_gtu_results.ipynb
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,347 @@ | ||
{ | ||
"nbformat": 4, | ||
"nbformat_minor": 0, | ||
"metadata": { | ||
"colab": { | ||
"private_outputs": true, | ||
"provenance": [], | ||
"collapsed_sections": [] | ||
}, | ||
"kernelspec": { | ||
"name": "python3", | ||
"display_name": "Python 3" | ||
}, | ||
"language_info": { | ||
"name": "python" | ||
} | ||
}, | ||
"cells": [ | ||
{ | ||
"cell_type": "markdown", | ||
"source": [ | ||
"install selenium\n", | ||
"and\n", | ||
"chrome driver (also define path)" | ||
], | ||
"metadata": { | ||
"id": "hQnMeGfcky0D" | ||
} | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"source": [ | ||
"!pip install selenium\n", | ||
"!apt-get update # to update ubuntu to correctly run apt install\n", | ||
"!apt install chromium-chromedriver\n", | ||
"!cp /usr/lib/chromium-browser/chromedriver /usr/bin\n", | ||
"import sys\n", | ||
"sys.path.insert(0,'/usr/lib/chromium-browser/chromedriver')" | ||
], | ||
"metadata": { | ||
"id": "BGKqk2CXky0E" | ||
}, | ||
"execution_count": null, | ||
"outputs": [] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"source": [ | ||
"install tesseract" | ||
], | ||
"metadata": { | ||
"id": "JKV47yZbky0C" | ||
} | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"source": [ | ||
"!sudo apt install tesseract-ocr\n", | ||
"!pip install pytesseract" | ||
], | ||
"metadata": { | ||
"id": "X47h-_HQky0D" | ||
}, | ||
"execution_count": null, | ||
"outputs": [] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"source": [ | ||
"restart runtime" | ||
], | ||
"metadata": { | ||
"id": "MBT77b1SBIWq" | ||
} | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"source": [ | ||
"import os\n", | ||
"os.kill(os.getpid(), 9)" | ||
], | ||
"metadata": { | ||
"id": "LQu1QserAnKc" | ||
}, | ||
"execution_count": null, | ||
"outputs": [] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"source": [ | ||
"import dependancies" | ||
], | ||
"metadata": { | ||
"id": "-l3BwLvnky0E" | ||
} | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"source": [ | ||
"from selenium import webdriver\n", | ||
"from selenium.webdriver.common.by import By\n", | ||
"from selenium.webdriver.common.keys import Keys\n", | ||
"from selenium.webdriver.support.select import Select\n", | ||
"from selenium.webdriver.chrome.service import Service\n", | ||
"\n", | ||
"import cv2\n", | ||
"from PIL import Image, ImageCms, ImageFilter\n", | ||
"import pytesseract" | ||
], | ||
"metadata": { | ||
"id": "wkot4dh0ky0E" | ||
}, | ||
"execution_count": null, | ||
"outputs": [] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"source": [ | ||
"helper functions" | ||
], | ||
"metadata": { | ||
"id": "tmMP9JI0ky0F" | ||
} | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"source": [ | ||
"def step1():\n", | ||
"\t# open webpage\n", | ||
"\tdriver.get(URL)\n", | ||
"\n", | ||
"\t# save captcha\n", | ||
"\timdata = driver.find_element(By.ID,\"imgCaptcha\")\n", | ||
"\twith open(path, 'wb') as file:\n", | ||
"\t file.write(imdata.screenshot_as_png)\n", | ||
"\n", | ||
"def step2():\n", | ||
"\t# convert to inverted mask and save img_temp\n", | ||
"\tim = cv2.imread(path)\n", | ||
"\tgray = cv2.cvtColor(im, cv2.COLOR_BGR2GRAY)\n", | ||
"\tthresh = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)[1]\n", | ||
"\thorizontal_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (25, 1))\n", | ||
"\tMask = cv2.morphologyEx(thresh, cv2.MORPH_OPEN,horizontal_kernel, iterations=2)\n", | ||
"\t#Mask = cv2.bitwise_not(Mask)\n", | ||
"\tcv2.imwrite(\"old.png\", Mask)\n", | ||
"\n", | ||
"\t# open img_temp and reinvert mask\n", | ||
"\timg = Image.open(\"old.png\")\n", | ||
"\timg = img.convert(\"RGBA\")\n", | ||
"\tdatas = img.getdata()\n", | ||
"\tnewData = []\n", | ||
"\tfor item in datas:\n", | ||
"\t if item[0] == 0 and item[1] == 0 and item[2] == 0:\n", | ||
"\t newData.append((255, 255, 255, 0))\n", | ||
"\t else:\n", | ||
"\t newData.append(item)\n", | ||
"\timg.putdata(newData)\n", | ||
"\n", | ||
"\t# paste mask on img and save new_temp_img\n", | ||
"\tbackground = Image.open(path)\n", | ||
"\tbackground = background.convert(\"RGBA\")\n", | ||
"\tbackground.paste(img,mask=img)\n", | ||
"\tbackground.save(\"new.png\",\"PNG\")\n", | ||
"\n", | ||
"def step3(im): # solve captcha\n", | ||
" im = Image.open(im) # open last saved img\n", | ||
" im = im.crop((5,5,115,35)) # crop it\n", | ||
" # conver image to extractable form elements (deffer captcha styles)\n", | ||
" rgb = ImageCms.createProfile(colorSpace='sRGB')\n", | ||
" lab = ImageCms.createProfile(colorSpace='LAB')\n", | ||
" transform = ImageCms.buildTransform(inputProfile=rgb, outputProfile=lab, inMode='RGB', outMode='LAB')\n", | ||
" lab_im = ImageCms.applyTransform(im=im, transform=transform)\n", | ||
" l, a, b = lab_im.split()\n", | ||
" im=l # select an element which is most extractable\n", | ||
" im = im.filter(ImageFilter.MinFilter(3)) # filter it\n", | ||
" result = pytesseract.image_to_string(im) # send it to ocr and save results to a variable\n", | ||
" l=[]\n", | ||
" l.append(result.strip())\n", | ||
" if l[0]==\" \" or l[0]==\"\" : # if result will be empty then it will do above steps again untill it gets the result\n", | ||
" step1()\n", | ||
" step2()\n", | ||
" l[0]=step3(\"new.png\")\n", | ||
" return l[0] # return final result (maybe right or wrong)\n", | ||
"\n", | ||
"def step4(enroll,fname,ans): # print webpage\n", | ||
"\t# site automation \n", | ||
"\tsel = Select(driver.find_element(By.ID,\"ddlbatch\")) # focus on select element\n", | ||
"\tsel.select_by_value(exam) # select element by giving id (specific for a exam)\n", | ||
"\tenr = driver.find_element(By.ID,\"txtenroll\") # get enrollment no. text box\n", | ||
"\tcaptex = driver.find_element(By.ID,\"CodeNumberTextBox\") # get captcha text box\n", | ||
"\tenr.send_keys(enroll) # send (type) given enrollment number to text box\n", | ||
"\tcaptex.send_keys(ans) # send (type) extracted captcha text to text box\n", | ||
"\tcaptex.send_keys(Keys.RETURN) # return (ENTER)\n", | ||
"\n", | ||
"\tere = driver.find_element(By.ID,\"lblmsg\").text\n", | ||
"\tif ere == \"ERROR: Incorrect captcha code, try again.\" : \n", | ||
"\t\treturn \"err\"\n", | ||
"\tif ere == \"Your request count is reached to maximum limit, Please try again later.\" : \n", | ||
"\t\treturn \"reqover\"\n", | ||
"\n", | ||
"\t# resize window for full view screenshot\n", | ||
"\tdriver.set_window_size(1000,1000) \n", | ||
"\tdriver.find_element(By.TAG_NAME,\"body\").screenshot(fname) # take a screenshot and save it to given filename\n", | ||
"\n", | ||
"def loop():\n", | ||
"\t# just a loop through different enrollment numbers\n", | ||
"\tglobal counter\n", | ||
"\tmynewlist = []\n", | ||
"\tfor i in mylist :\n", | ||
"\t\tenroll = \"{}\".format(i)\n", | ||
"\t\tfname = \"./outputs/{}.png\".format(i)\n", | ||
"\t\tstep1()\n", | ||
"\t\tstep2()\n", | ||
"\t\tans=step3(\"new.png\")\n", | ||
"\t\tnr=step4(enroll,fname,ans)\n", | ||
"\t\tif nr == \"err\" :\n", | ||
"\t\t\tmynewlist.append(enroll)\n", | ||
"\t\telif nr == \"reqover\" :\n", | ||
"\t\t\tprint(\"Change the SERVER!\")\n", | ||
"\t\t\tbreak\n", | ||
"\t\telse :\n", | ||
"\t\t\tcounter += 1\n", | ||
"\t\t\tprint(f\"{counter}/{tc} {int(counter*100/tc)}%\")\n", | ||
"\treturn mynewlist" | ||
], | ||
"metadata": { | ||
"id": "2ROK21SYky0F" | ||
}, | ||
"execution_count": null, | ||
"outputs": [] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"source": [ | ||
"make outputs directory for collecting outputs" | ||
], | ||
"metadata": { | ||
"id": "WtxZCyDj8mrc" | ||
} | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"source": [ | ||
"!mkdir /content/outputs" | ||
], | ||
"metadata": { | ||
"id": "Z4kbPXeh8a3d" | ||
}, | ||
"execution_count": null, | ||
"outputs": [] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"source": [ | ||
"main function" | ||
], | ||
"metadata": { | ||
"id": "R2TKjx9_ky0G" | ||
} | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"source": [ | ||
"# initiate webdriver and configure options\n", | ||
"chrome_options = webdriver.ChromeOptions()\n", | ||
"chrome_options.add_argument('--headless')\n", | ||
"chrome_options.add_argument('--no-sandbox')\n", | ||
"chrome_options.add_argument('--disable-dev-shm-usage')\n", | ||
"chrome_options.add_argument(\"--incognito\")\n", | ||
"\n", | ||
"# initiate chromedriver service\n", | ||
"ser = Service(\"chromedriver\")\n", | ||
"# start webdriver\n", | ||
"driver = webdriver.Chrome(service=ser,options=chrome_options)\n", | ||
"\n", | ||
"# define url and filename for download captcha_temp\n", | ||
"URL = \"https://www.gturesults.in/\"\n", | ||
"path=\"cap.jpg\"\n", | ||
"# define exam\n", | ||
"exam = \"3361$S2022$2022-08-25$current$0\"\n", | ||
" \n", | ||
"# take enrollment no. input from excel file\n", | ||
"# import pandas as pd\n", | ||
"# df = pd.read_excel('g.xlsx')\n", | ||
"# mylist = df['Enroll No.'].tolist()\n", | ||
"#------------------- OR --------------------\n", | ||
"# we can define our own list here\n", | ||
"#mylist = [190280111010]\n", | ||
"#------------------- OR --------------------\n", | ||
"# we can define our range here\n", | ||
"mylist = range(190280111001,190280111020+1)\n", | ||
"\n", | ||
"counter = 0\n", | ||
"tc = len(mylist)\n", | ||
"\n", | ||
"while 1:\n", | ||
"\tmynewlist=loop()\n", | ||
"\tif len(mynewlist) != 0:\n", | ||
"\t\tmylist = mynewlist\n", | ||
"\telse:\n", | ||
"\t\tbreak\n", | ||
"\n", | ||
"# closing webinstance\n", | ||
"driver.quit()\n", | ||
"\n", | ||
"# removing unwanted files \n", | ||
"!rm cap.jpg\n", | ||
"!rm new.png\n", | ||
"!rm old.png" | ||
], | ||
"metadata": { | ||
"id": "TGsoejuHky0G" | ||
}, | ||
"execution_count": null, | ||
"outputs": [] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"source": [ | ||
"download folder\n" | ||
], | ||
"metadata": { | ||
"id": "obUWTMAVziTB" | ||
} | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"source": [ | ||
"!zip -r /content/outputs.zip /content/outputs" | ||
], | ||
"metadata": { | ||
"id": "ujWFCgxDo7JY" | ||
}, | ||
"execution_count": null, | ||
"outputs": [] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"source": [ | ||
"👈 download <font color='yellow'>outputs.zip</font> from left side bar by double clicking it" | ||
], | ||
"metadata": { | ||
"id": "qgkULANw3ZQ2" | ||
} | ||
} | ||
] | ||
} |
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Oops, something went wrong.