Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

glassdoor scraping.ipynb has been modified and updated as the glassdo… #4

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
271 changes: 270 additions & 1 deletion .ipynb_checkpoints/glassdoor scraping-checkpoint.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -400,6 +400,275 @@
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#!/usr/bin/env python3\n",
"# -*- coding: utf-8 -*-\n",
"\"\"\"\n",
"original author: Ömer Sakarya , Oct 15, 2019\n",
"git : https://github.com/arapfaik/scraping-glassdoor-selenium\n",
"original tutorial: https://towardsdatascience.com/selenium-tutorial-scraping-glassdoor-com-in-10-minutes-3d0915c6d905\n",
"\n",
"tutorial followed(youtube: KenJee): https://www.youtube.com/watch?v=GmW4F6MHqqs&list=PL2zq7klxX5ASFejJj80ob9ZAnBHdz5O1t\n",
"\n",
"Disclaimer: I do not own the copyright to this code; it was written as\n",
"            shown on the YouTube channel mentioned above.\n",
"            P.S.: the code has been modified to match the updated structure of\n",
"            the website for web scraping. There are fields/data that I was\n",
"            unable to find; for readability the old lines of code are left\n",
"            commented out and the updated code is added underneath. Please\n",
"            note that I have changed the column and file names to suit my\n",
"            needs, so if you copy-paste this code, check for mismatches in\n",
"            the file and data-column names used in the tutorial.\n",
"\"\"\"\n",
"from selenium.common.exceptions import NoSuchElementException, ElementClickInterceptedException\n",
"from selenium import webdriver\n",
"import time\n",
"import pandas as pd\n",
"\n",
"def get_jobs(keyword, num_jobs, verbose,path,slp_time):\n",
" \n",
" '''Gathers jobs as a dataframe, scraped from Glassdoor'''\n",
" \n",
" #Initializing the webdriver\n",
" options = webdriver.ChromeOptions()\n",
" \n",
" #Uncomment the line below if you'd like to scrape without a new Chrome window every time.\n",
" #options.add_argument('headless')\n",
" \n",
" #Change the path to where chromedriver is in your home folder.\n",
" # driver = webdriver.Chrome(executable_path=\"/Users/omersakarya/Documents/GitHub/scraping-glassdoor-selenium/chromedriver\", options=options)\n",
" # path = \"ChromeDriver/chromedriver\" # I have made a folder:\"ChromeDriver\" and put file\"chromedriver.exe\"inside this folder. \n",
" # ^^^Folder^^^/^^^this is .exe file\n",
" driver = webdriver.Chrome(executable_path=path, options=options)\n",
" driver.set_window_size(1120, 1000)\n",
" \n",
" # url = 'https://www.glassdoor.com/Job/jobs.htm?sc.keyword='+ keyword +'&includeNoSalaryJobs=false&radius=100'\n",
" url = 'https://www.glassdoor.com/Job/jobs.htm?sc.keyword=\"' + keyword + '\"&locT=C&locId=1147401&locKeyword=San%20Francisco,%20CA&jobType=all&fromAge=-1&minSalary=0&includeNoSalaryJobs=false&radius=100&cityId=-1&minRating=0.0&industryId=-1&sgocId=-1&seniorityType=all&companyId=-1&employerSizes=0&applicationType=0&remoteWorkType=0'\n",
" driver.get(url)\n",
" jobs = []\n",
"\n",
" while len(jobs) < num_jobs: #If true, should be still looking for new jobs.\n",
"\n",
" #Let the page load. Change this number based on your internet speed.\n",
" #Or, wait until the webpage is loaded, instead of hardcoding it.\n",
" time.sleep(slp_time)\n",
"\n",
" #Test for the \"Sign Up\" prompt and get rid of it.\n",
" # try:\n",
" # driver.find_element_by_class_name(\"selected\").click()\n",
" # except ElementClickInterceptedException:\n",
" # pass\n",
"\n",
" # time.sleep(.1)\n",
"\n",
" try:\n",
" driver.find_element_by_class_name(\"ModalStyle__xBtn___29PT9\").click() #clicking to the X.\n",
" except NoSuchElementException:\n",
" pass\n",
" \n",
" # found_popup = False \n",
" currentJoblist = 0\n",
" \n",
" \n",
" if not (len(jobs) >= num_jobs):\n",
" listButtonsCount = len(driver.find_elements_by_xpath('//*[@id=\"MainCol\"]//div[1]//ul//li[@data-test=\"jobListing\"]'))\n",
" print(\"&&& job butons:\" +str(listButtonsCount))\n",
" #Going through each job in this page\n",
" # job_buttons = driver.find_elements_by_class_name(\"jl\") #jl for Job Listing. These are the buttons we're going to click.\n",
" job_buttons = driver.find_elements_by_xpath('.//*[@id=\"MainCol\"]//a[@class=\"jobLink\"]') #jl for Job Listing. These are the buttons we're going to click.\n",
" \n",
" for job_button in job_buttons: \n",
" \n",
" print(\"Progress: {}\".format(\"\" + str(len(jobs)) + \"/\" + str(num_jobs)))\n",
" if len(jobs) >= num_jobs:\n",
" break\n",
" \n",
" \n",
" job_button.click() #You might \n",
" \n",
" time.sleep(4)\n",
" \n",
" #___________ code to kill the sign-up pop-up after it render on screen\n",
" # if not found_popup:\n",
" try:\n",
" driver.find_element_by_css_selector('[alt=\"Close\"]').click()\n",
" # print(\"&&& line 89\")\n",
" # found_popup = True\n",
" except NoSuchElementException:\n",
" # print(\"&&& line 92\")\n",
" pass\n",
" \n",
" # __________\n",
" \n",
" \n",
" collected_successfully = False\n",
" \n",
" while not collected_successfully:\n",
" try:\n",
" # company_name = driver.find_element_by_xpath('.//div[@class=\"employerName\"]').text\n",
" company_name = driver.find_element_by_xpath('//*[@id=\"MainCol\"]//li['+ str(currentJoblist + 1) +']//div[2]//a//span').text\n",
" \n",
" # location = driver.find_element_by_xpath('.//div[@class=\"location\"]').text\n",
" location = driver.find_element_by_xpath('//*[@id=\"MainCol\"]//li['+ str(currentJoblist + 1) +']//div[2]//div[2]/span').text\n",
" \n",
" # job_title = driver.find_element_by_xpath('.//div[contains(@class, \"title\")]').text\n",
" job_title = driver.find_element_by_xpath('//*[@id=\"MainCol\"]//li['+ str(currentJoblist + 1) +']//a[@data-test=\"job-link\"]').text\n",
" \n",
" job_description = driver.find_element_by_xpath('.//div[@class=\"jobDescriptionContent desc\"]').text\n",
" \n",
" # job_function is an additional information not included in previous code\n",
" job_function = driver.find_element_by_xpath('//*[@id=\"JDCol\"]//strong[text()[1]=\"Job Function\"]//following-sibling::*').text\n",
" \n",
" collected_successfully = True\n",
" except:\n",
" # print(\"&&& line 67\")\n",
" # collected_successfully=True\n",
" time.sleep(5)\n",
" \n",
" try:\n",
" # salary_estimate = driver.find_element_by_xpath('.//span[@class=\"gray small salary\"]').text\n",
" salary_estimate = driver.find_element_by_xpath('//*[@id=\"JDCol\"]//span[@data-test=\"detailSalary\"]').text\n",
" except NoSuchElementException:\n",
" salary_estimate = -1 #You need to set a \"not found value. It's important.\"\n",
" \n",
" try:\n",
" # rating = driver.find_element_by_xpath('.//span[@class=\"rating\"]').text\n",
" rating = driver.find_element_by_xpath('//*[@id=\"JDCol\"]//span[@data-test=\"detailRating\"]').text\n",
" except NoSuchElementException:\n",
" rating = -1 #You need to set a \"not found value. It's important.\"\n",
" \n",
" # #Printing for debugging\n",
" if verbose:\n",
" print(\"Job Title: {}\".format(job_title))\n",
" print(\"Salary Estimate: {}\".format(salary_estimate))\n",
" print(\"Job Description: {}\".format(job_description[:500]))\n",
" print(\"Rating: {}\".format(rating))\n",
" print(\"Company Name: {}\".format(company_name))\n",
" print(\"Location: {}\".format(location))\n",
" print(\"Job Function: {}\".format(job_function))\n",
" \n",
" #Going to the Company tab...\n",
" #clicking on this:\n",
" #<div class=\"tab\" data-tab-type=\"overview\"><span>Company</span></div>\n",
" time.sleep(1)\n",
" try:\n",
" # driver.find_element_by_xpath('.//div[@class=\"tab\" and @data-tab-type=\"overview\"]').click()\n",
" driver.find_element_by_xpath('.//div[@id=\"SerpFixedHeader\"]//span[text()=\"Company\"]').click()\n",
" \n",
" # try:\n",
" # #<div class=\"infoEntity\">\n",
" # # <label>Headquarters</label>\n",
" # # <span class=\"value\">San Francisco, CA</span>\n",
" # #</div>\n",
" # headquarters = driver.find_element_by_xpath('.//div[@class=\"infoEntity\"]//label[text()=\"Headquarters\"]//following-sibling::*').text\n",
" # ^^^^^^^^^^ couldn't abel to find \"headquarters\"\n",
" # except NoSuchElementException:\n",
" # headquarters = -1\n",
" \n",
" try:\n",
" # size = driver.find_element_by_xpath('.//div[@class=\"infoEntity\"]//label[text()=\"Size\"]//following-sibling::*').text\n",
" size = driver.find_element_by_xpath('.//div[@id=\"EmpBasicInfo\"]//span[text()=\"Size\"]//following-sibling::*').text\n",
" except NoSuchElementException:\n",
" size = -1\n",
" \n",
" try:\n",
" # founded = driver.find_element_by_xpath('.//div[@class=\"infoEntity\"]//label[text()=\"Founded\"]//following-sibling::*').text\n",
" founded = driver.find_element_by_xpath('.//div[@id=\"EmpBasicInfo\"]//span[text()=\"Founded\"]//following-sibling::*').text\n",
" except NoSuchElementException:\n",
" founded = -1\n",
" \n",
" try:\n",
" # type_of_ownership = driver.find_element_by_xpath('.//div[@class=\"infoEntity\"]//label[text()=\"Type\"]//following-sibling::*').text\n",
" type_of_ownership = driver.find_element_by_xpath('.//div[@id=\"EmpBasicInfo\"]//span[text()=\"Type\"]//following-sibling::*').text\n",
" except NoSuchElementException:\n",
" type_of_ownership = -1\n",
" \n",
" try:\n",
" # industry = driver.find_element_by_xpath('.//div[@class=\"infoEntity\"]//label[text()=\"Industry\"]//following-sibling::*').text\n",
" industry = driver.find_element_by_xpath('.//div[@id=\"EmpBasicInfo\"]//span[text()=\"Industry\"]//following-sibling::*').text\n",
" except NoSuchElementException:\n",
" industry = -1\n",
" \n",
" try:\n",
" # sector = driver.find_element_by_xpath('.//div[@class=\"infoEntity\"]//label[text()=\"Sector\"]//following-sibling::*').text\n",
" sector = driver.find_element_by_xpath('.//div[@id=\"EmpBasicInfo\"]//span[text()=\"Sector\"]//following-sibling::*').text\n",
" except NoSuchElementException:\n",
" sector = -1\n",
" \n",
" try:\n",
" # revenue = driver.find_element_by_xpath('.//div[@class=\"infoEntity\"]//label[text()=\"Revenue\"]//following-sibling::*').text\n",
" revenue = driver.find_element_by_xpath('.//div[@id=\"EmpBasicInfo\"]//span[text()=\"Revenue\"]//following-sibling::*').text\n",
" except NoSuchElementException:\n",
" revenue = -1\n",
" \n",
" # try:\n",
" # competitors = driver.find_element_by_xpath('.//div[@class=\"infoEntity\"]//label[text()=\"Competitors\"]//following-sibling::*').text\n",
" # ^^^^^^^^^^^ couldn't able to find \"competitors\"\n",
" # except NoSuchElementException:\n",
" # competitors = -1\n",
" \n",
" except NoSuchElementException: #Rarely, some job postings do not have the \"Company\" tab.\n",
" # headquarters = -1\n",
" size = -1\n",
" founded = -1\n",
" type_of_ownership = -1\n",
" industry = -1\n",
" sector = -1\n",
" revenue = -1\n",
" # competitors = -1\n",
" \n",
" \n",
" if verbose:\n",
" \n",
" print(\"Size: {}\".format(size))\n",
" print(\"Founded: {}\".format(founded))\n",
" print(\"Type of Ownership: {}\".format(type_of_ownership))\n",
" print(\"Industry: {}\".format(industry))\n",
" print(\"Sector: {}\".format(sector))\n",
" print(\"Revenue: {}\".format(revenue))\n",
" # print(\"Headquarters: {}\".format(headquarters))\n",
" # print(\"Competitors: {}\".format(competitors))\n",
" print(\"@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@\")\n",
" \n",
" jobs.append({\"Job Title\" : job_title, \n",
" \"Salary Estimate\" : salary_estimate,\n",
" \"Job Function\" : job_function,\n",
" \"Job Description\" : job_description, \n",
" \"Company Name\" : company_name,\n",
" \"Rating\" : rating, \n",
" \"Location\" : location,\n",
" \"Size\" : size,\n",
" \"Founded\" : founded,\n",
" \"Type of ownership\" : type_of_ownership,\n",
" \"Industry\" : industry,\n",
" \"Sector\" : sector,\n",
" \"Revenue\" : revenue})\n",
" # \"Headquarters\" : headquarters,\n",
" # \"Competitors\" : competitors})\n",
" # ^^^^^^^^ couldn't able to find \"Headquarters\" and \"Competitors\"\n",
" #add job to jobs\n",
" \n",
" currentJoblist=currentJoblist+1 # increasing the count of the list of buttons clicked and saved\n",
" \n",
" if not (currentJoblist < listButtonsCount): # to check the list last button and to go to next page\n",
" currentJoblist = 0 # resetting the button list count for new page button's list\n",
" break\n",
" #Clicking on the \"next page\" button\n",
" try: \n",
" # driver.find_element_by_xpath('.//li[@class=\"next\"]//a').click()\n",
" driver.find_element_by_xpath('//*[@id=\"FooterPageNav\"]//a[@data-test=\"pagination-next\"]').click()\n",
" \n",
" except NoSuchElementException:\n",
" print(\"Scraping terminated before reaching target number of jobs. Needed {}, got {}.\".format(num_jobs, len(jobs)))\n",
" break\n",
"\n",
" return pd.DataFrame(jobs) #This line converts the dictionary object into a pandas DataFrame."
]
}
],
"metadata": {
Expand All @@ -418,7 +687,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.0"
"version": "3.8.3"
}
},
"nbformat": 4,
Expand Down
Loading