From 2a0d6677e02d867c0c9ad64be8e63131c58f6f27 Mon Sep 17 00:00:00 2001 From: wizRocks Date: Sun, 28 Mar 2021 23:10:52 +1100 Subject: [PATCH] glassdoor scraping.ipynb has been modified and updated as the glassdoor.com website restructured front-end layout --- .../glassdoor scraping-checkpoint.ipynb | 271 +++++++++++++++++- glassdoor scraping.ipynb | 271 +++++++++++++++++- 2 files changed, 540 insertions(+), 2 deletions(-) diff --git a/.ipynb_checkpoints/glassdoor scraping-checkpoint.ipynb b/.ipynb_checkpoints/glassdoor scraping-checkpoint.ipynb index 82b7c11..36973b3 100644 --- a/.ipynb_checkpoints/glassdoor scraping-checkpoint.ipynb +++ b/.ipynb_checkpoints/glassdoor scraping-checkpoint.ipynb @@ -400,6 +400,275 @@ "metadata": {}, "outputs": [], "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#!/usr/bin/env python3\n", + "# -*- coding: utf-8 -*-\n", + "\"\"\"\n", + "original author: Ömer Sakarya , Oct 15, 2019\n", + "git : https://github.com/arapfaik/scraping-glassdoor-selenium\n", + "original tutorial: https://towardsdatascience.com/selenium-tutorial-scraping-glassdoor-com-in-10-minutes-3d0915c6d905\n", + "\n", + "tutorial followed(youtube: KenJee): https://www.youtube.com/watch?v=GmW4F6MHqqs&list=PL2zq7klxX5ASFejJj80ob9ZAnBHdz5O1t\n", + "\n", + "Disclaimer: I don't own the copyrights of the code , It was written and coded as\n", + " followed on the youtube channel mentioned above\n", + " P.S: the code has been modified according to the updated structure of the website for \n", + " webscraping, there are fields/data that I couldn't able to find, for reading\n", + " purposes the old lines of code is commented and updated code added underneath\n", + " for better understanding.please take a note that I have changed the names of the column and files according to my need\n", + " , if you are copy pasting this code you have to look for syntax errors in names of files and\n", + " data-columns that are used in tutorial.\n", + "\"\"\"\n", + "from selenium.common.exceptions import NoSuchElementException, ElementClickInterceptedException\n", + "from selenium import webdriver\n", + "import time\n", + "import pandas as pd\n", + "\n", + "def get_jobs(keyword, num_jobs, verbose,path,slp_time):\n", + " \n", + " '''Gathers jobs as a dataframe, scraped from Glassdoor'''\n", + " \n", + " #Initializing the webdriver\n", + " options = webdriver.ChromeOptions()\n", + " \n", + " #Uncomment the line below if you'd like to scrape without a new Chrome window every time.\n", + " #options.add_argument('headless')\n", + " \n", + " #Change the path to where chromedriver is in your home folder.\n", + " # driver = webdriver.Chrome(executable_path=\"/Users/omersakarya/Documents/GitHub/scraping-glassdoor-selenium/chromedriver\", options=options)\n", + " # path = \"ChromeDriver/chromedriver\" # I have made a folder:\"ChromeDriver\" and put file\"chromedriver.exe\"inside this folder. 
\n", + " # ^^^Folder^^^/^^^this is .exe file\n", + " driver = webdriver.Chrome(executable_path=path, options=options)\n", + " driver.set_window_size(1120, 1000)\n", + " \n", + " # url = 'https://www.glassdoor.com/Job/jobs.htm?sc.keyword='+ keyword +'&includeNoSalaryJobs=false&radius=100'\n", + " url = 'https://www.glassdoor.com/Job/jobs.htm?sc.keyword=\"' + keyword + '\"&locT=C&locId=1147401&locKeyword=San%20Francisco,%20CA&jobType=all&fromAge=-1&minSalary=0&includeNoSalaryJobs=false&radius=100&cityId=-1&minRating=0.0&industryId=-1&sgocId=-1&seniorityType=all&companyId=-1&employerSizes=0&applicationType=0&remoteWorkType=0'\n", + " driver.get(url)\n", + " jobs = []\n", + "\n", + " while len(jobs) < num_jobs: #If true, should be still looking for new jobs.\n", + "\n", + " #Let the page load. Change this number based on your internet speed.\n", + " #Or, wait until the webpage is loaded, instead of hardcoding it.\n", + " time.sleep(slp_time)\n", + "\n", + " #Test for the \"Sign Up\" prompt and get rid of it.\n", + " # try:\n", + " # driver.find_element_by_class_name(\"selected\").click()\n", + " # except ElementClickInterceptedException:\n", + " # pass\n", + "\n", + " # time.sleep(.1)\n", + "\n", + " try:\n", + " driver.find_element_by_class_name(\"ModalStyle__xBtn___29PT9\").click() #clicking to the X.\n", + " except NoSuchElementException:\n", + " pass\n", + " \n", + " # found_popup = False \n", + " currentJoblist = 0\n", + " \n", + " \n", + " if not (len(jobs) >= num_jobs):\n", + " listButtonsCount = len(driver.find_elements_by_xpath('//*[@id=\"MainCol\"]//div[1]//ul//li[@data-test=\"jobListing\"]'))\n", + " print(\"&&& job butons:\" +str(listButtonsCount))\n", + " #Going through each job in this page\n", + " # job_buttons = driver.find_elements_by_class_name(\"jl\") #jl for Job Listing. These are the buttons we're going to click.\n", + " job_buttons = driver.find_elements_by_xpath('.//*[@id=\"MainCol\"]//a[@class=\"jobLink\"]') #jl for Job Listing. 
These are the buttons we're going to click.\n", + " \n", + " for job_button in job_buttons: \n", + " \n", + " print(\"Progress: {}\".format(\"\" + str(len(jobs)) + \"/\" + str(num_jobs)))\n", + " if len(jobs) >= num_jobs:\n", + " break\n", + " \n", + " \n", + " job_button.click() #You might \n", + " \n", + " time.sleep(4)\n", + " \n", + " #___________ code to kill the sign-up pop-up after it render on screen\n", + " # if not found_popup:\n", + " try:\n", + " driver.find_element_by_css_selector('[alt=\"Close\"]').click()\n", + " # print(\"&&& line 89\")\n", + " # found_popup = True\n", + " except NoSuchElementException:\n", + " # print(\"&&& line 92\")\n", + " pass\n", + " \n", + " # __________\n", + " \n", + " \n", + " collected_successfully = False\n", + " \n", + " while not collected_successfully:\n", + " try:\n", + " # company_name = driver.find_element_by_xpath('.//div[@class=\"employerName\"]').text\n", + " company_name = driver.find_element_by_xpath('//*[@id=\"MainCol\"]//li['+ str(currentJoblist + 1) +']//div[2]//a//span').text\n", + " \n", + " # location = driver.find_element_by_xpath('.//div[@class=\"location\"]').text\n", + " location = driver.find_element_by_xpath('//*[@id=\"MainCol\"]//li['+ str(currentJoblist + 1) +']//div[2]//div[2]/span').text\n", + " \n", + " # job_title = driver.find_element_by_xpath('.//div[contains(@class, \"title\")]').text\n", + " job_title = driver.find_element_by_xpath('//*[@id=\"MainCol\"]//li['+ str(currentJoblist + 1) +']//a[@data-test=\"job-link\"]').text\n", + " \n", + " job_description = driver.find_element_by_xpath('.//div[@class=\"jobDescriptionContent desc\"]').text\n", + " \n", + " # job_function is an additional information not included in previous code\n", + " job_function = driver.find_element_by_xpath('//*[@id=\"JDCol\"]//strong[text()[1]=\"Job Function\"]//following-sibling::*').text\n", + " \n", + " collected_successfully = True\n", + " except:\n", + " # print(\"&&& line 67\")\n", + " # collected_successfully=True\n", + " time.sleep(5)\n", + " \n", + " try:\n", + " # salary_estimate = driver.find_element_by_xpath('.//span[@class=\"gray small salary\"]').text\n", + " salary_estimate = driver.find_element_by_xpath('//*[@id=\"JDCol\"]//span[@data-test=\"detailSalary\"]').text\n", + " except NoSuchElementException:\n", + " salary_estimate = -1 #You need to set a \"not found value. It's important.\"\n", + " \n", + " try:\n", + " # rating = driver.find_element_by_xpath('.//span[@class=\"rating\"]').text\n", + " rating = driver.find_element_by_xpath('//*[@id=\"JDCol\"]//span[@data-test=\"detailRating\"]').text\n", + " except NoSuchElementException:\n", + " rating = -1 #You need to set a \"not found value. It's important.\"\n", + " \n", + " # #Printing for debugging\n", + " if verbose:\n", + " print(\"Job Title: {}\".format(job_title))\n", + " print(\"Salary Estimate: {}\".format(salary_estimate))\n", + " print(\"Job Description: {}\".format(job_description[:500]))\n", + " print(\"Rating: {}\".format(rating))\n", + " print(\"Company Name: {}\".format(company_name))\n", + " print(\"Location: {}\".format(location))\n", + " print(\"Job Function: {}\".format(job_function))\n", + " \n", + " #Going to the Company tab...\n", + " #clicking on this:\n", + " #
Company
\n", + " time.sleep(1)\n", + " try:\n", + " # driver.find_element_by_xpath('.//div[@class=\"tab\" and @data-tab-type=\"overview\"]').click()\n", + " driver.find_element_by_xpath('.//div[@id=\"SerpFixedHeader\"]//span[text()=\"Company\"]').click()\n", + " \n", + " # try:\n", + " # #
\n", + " # # \n", + " # # San Francisco, CA\n", + " # #
\n", + " # headquarters = driver.find_element_by_xpath('.//div[@class=\"infoEntity\"]//label[text()=\"Headquarters\"]//following-sibling::*').text\n", + " # ^^^^^^^^^^ couldn't abel to find \"headquarters\"\n", + " # except NoSuchElementException:\n", + " # headquarters = -1\n", + " \n", + " try:\n", + " # size = driver.find_element_by_xpath('.//div[@class=\"infoEntity\"]//label[text()=\"Size\"]//following-sibling::*').text\n", + " size = driver.find_element_by_xpath('.//div[@id=\"EmpBasicInfo\"]//span[text()=\"Size\"]//following-sibling::*').text\n", + " except NoSuchElementException:\n", + " size = -1\n", + " \n", + " try:\n", + " # founded = driver.find_element_by_xpath('.//div[@class=\"infoEntity\"]//label[text()=\"Founded\"]//following-sibling::*').text\n", + " founded = driver.find_element_by_xpath('.//div[@id=\"EmpBasicInfo\"]//span[text()=\"Founded\"]//following-sibling::*').text\n", + " except NoSuchElementException:\n", + " founded = -1\n", + " \n", + " try:\n", + " # type_of_ownership = driver.find_element_by_xpath('.//div[@class=\"infoEntity\"]//label[text()=\"Type\"]//following-sibling::*').text\n", + " type_of_ownership = driver.find_element_by_xpath('.//div[@id=\"EmpBasicInfo\"]//span[text()=\"Type\"]//following-sibling::*').text\n", + " except NoSuchElementException:\n", + " type_of_ownership = -1\n", + " \n", + " try:\n", + " # industry = driver.find_element_by_xpath('.//div[@class=\"infoEntity\"]//label[text()=\"Industry\"]//following-sibling::*').text\n", + " industry = driver.find_element_by_xpath('.//div[@id=\"EmpBasicInfo\"]//span[text()=\"Industry\"]//following-sibling::*').text\n", + " except NoSuchElementException:\n", + " industry = -1\n", + " \n", + " try:\n", + " # sector = driver.find_element_by_xpath('.//div[@class=\"infoEntity\"]//label[text()=\"Sector\"]//following-sibling::*').text\n", + " sector = driver.find_element_by_xpath('.//div[@id=\"EmpBasicInfo\"]//span[text()=\"Sector\"]//following-sibling::*').text\n", + " except NoSuchElementException:\n", + " sector = -1\n", + " \n", + " try:\n", + " # revenue = driver.find_element_by_xpath('.//div[@class=\"infoEntity\"]//label[text()=\"Revenue\"]//following-sibling::*').text\n", + " revenue = driver.find_element_by_xpath('.//div[@id=\"EmpBasicInfo\"]//span[text()=\"Revenue\"]//following-sibling::*').text\n", + " except NoSuchElementException:\n", + " revenue = -1\n", + " \n", + " # try:\n", + " # competitors = driver.find_element_by_xpath('.//div[@class=\"infoEntity\"]//label[text()=\"Competitors\"]//following-sibling::*').text\n", + " # ^^^^^^^^^^^ couldn't able to find \"competitors\"\n", + " # except NoSuchElementException:\n", + " # competitors = -1\n", + " \n", + " except NoSuchElementException: #Rarely, some job postings do not have the \"Company\" tab.\n", + " # headquarters = -1\n", + " size = -1\n", + " founded = -1\n", + " type_of_ownership = -1\n", + " industry = -1\n", + " sector = -1\n", + " revenue = -1\n", + " # competitors = -1\n", + " \n", + " \n", + " if verbose:\n", + " \n", + " print(\"Size: {}\".format(size))\n", + " print(\"Founded: {}\".format(founded))\n", + " print(\"Type of Ownership: {}\".format(type_of_ownership))\n", + " print(\"Industry: {}\".format(industry))\n", + " print(\"Sector: {}\".format(sector))\n", + " print(\"Revenue: {}\".format(revenue))\n", + " # print(\"Headquarters: {}\".format(headquarters))\n", + " # print(\"Competitors: {}\".format(competitors))\n", + " print(\"@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@\")\n", + " \n", + " jobs.append({\"Job Title\" : 
job_title, \n", + " \"Salary Estimate\" : salary_estimate,\n", + " \"Job Function\" : job_function,\n", + " \"Job Description\" : job_description, \n", + " \"Company Name\" : company_name,\n", + " \"Rating\" : rating, \n", + " \"Location\" : location,\n", + " \"Size\" : size,\n", + " \"Founded\" : founded,\n", + " \"Type of ownership\" : type_of_ownership,\n", + " \"Industry\" : industry,\n", + " \"Sector\" : sector,\n", + " \"Revenue\" : revenue})\n", + " # \"Headquarters\" : headquarters,\n", + " # \"Competitors\" : competitors})\n", + " # ^^^^^^^^ couldn't able to find \"Headquarters\" and \"Competitors\"\n", + " #add job to jobs\n", + " \n", + " currentJoblist=currentJoblist+1 # increasing the count of the list of buttons clicked and saved\n", + " \n", + " if not (currentJoblist < listButtonsCount): # to check the list last button and to go to next page\n", + " currentJoblist = 0 # resetting the button list count for new page button's list\n", + " break\n", + " #Clicking on the \"next page\" button\n", + " try: \n", + " # driver.find_element_by_xpath('.//li[@class=\"next\"]//a').click()\n", + " driver.find_element_by_xpath('//*[@id=\"FooterPageNav\"]//a[@data-test=\"pagination-next\"]').click()\n", + " \n", + " except NoSuchElementException:\n", + " print(\"Scraping terminated before reaching target number of jobs. Needed {}, got {}.\".format(num_jobs, len(jobs)))\n", + " break\n", + "\n", + " return pd.DataFrame(jobs) #This line converts the dictionary object into a pandas DataFrame." + ] } ], "metadata": { @@ -418,7 +687,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.7.0" + "version": "3.8.3" } }, "nbformat": 4, diff --git a/glassdoor scraping.ipynb b/glassdoor scraping.ipynb index 82b7c11..36973b3 100644 --- a/glassdoor scraping.ipynb +++ b/glassdoor scraping.ipynb @@ -400,6 +400,275 @@ "metadata": {}, "outputs": [], "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#!/usr/bin/env python3\n", + "# -*- coding: utf-8 -*-\n", + "\"\"\"\n", + "original author: Ömer Sakarya , Oct 15, 2019\n", + "git : https://github.com/arapfaik/scraping-glassdoor-selenium\n", + "original tutorial: https://towardsdatascience.com/selenium-tutorial-scraping-glassdoor-com-in-10-minutes-3d0915c6d905\n", + "\n", + "tutorial followed(youtube: KenJee): https://www.youtube.com/watch?v=GmW4F6MHqqs&list=PL2zq7klxX5ASFejJj80ob9ZAnBHdz5O1t\n", + "\n", + "Disclaimer: I don't own the copyrights of the code , It was written and coded as\n", + " followed on the youtube channel mentioned above\n", + " P.S: the code has been modified according to the updated structure of the website for \n", + " webscraping, there are fields/data that I couldn't able to find, for reading\n", + " purposes the old lines of code is commented and updated code added underneath\n", + " for better understanding.please take a note that I have changed the names of the column and files according to my need\n", + " , if you are copy pasting this code you have to look for syntax errors in names of files and\n", + " data-columns that are used in tutorial.\n", + "\"\"\"\n", + "from selenium.common.exceptions import NoSuchElementException, ElementClickInterceptedException\n", + "from selenium import webdriver\n", + "import time\n", + "import pandas as pd\n", + "\n", + "def get_jobs(keyword, num_jobs, verbose,path,slp_time):\n", + " \n", + " '''Gathers jobs as a dataframe, scraped from Glassdoor'''\n", + " \n", + " #Initializing 
the webdriver\n", + " options = webdriver.ChromeOptions()\n", + " \n", + " #Uncomment the line below if you'd like to scrape without a new Chrome window every time.\n", + " #options.add_argument('headless')\n", + " \n", + " #Change the path to where chromedriver is in your home folder.\n", + " # driver = webdriver.Chrome(executable_path=\"/Users/omersakarya/Documents/GitHub/scraping-glassdoor-selenium/chromedriver\", options=options)\n", + " # path = \"ChromeDriver/chromedriver\" # I have made a folder:\"ChromeDriver\" and put file\"chromedriver.exe\"inside this folder. \n", + " # ^^^Folder^^^/^^^this is .exe file\n", + " driver = webdriver.Chrome(executable_path=path, options=options)\n", + " driver.set_window_size(1120, 1000)\n", + " \n", + " # url = 'https://www.glassdoor.com/Job/jobs.htm?sc.keyword='+ keyword +'&includeNoSalaryJobs=false&radius=100'\n", + " url = 'https://www.glassdoor.com/Job/jobs.htm?sc.keyword=\"' + keyword + '\"&locT=C&locId=1147401&locKeyword=San%20Francisco,%20CA&jobType=all&fromAge=-1&minSalary=0&includeNoSalaryJobs=false&radius=100&cityId=-1&minRating=0.0&industryId=-1&sgocId=-1&seniorityType=all&companyId=-1&employerSizes=0&applicationType=0&remoteWorkType=0'\n", + " driver.get(url)\n", + " jobs = []\n", + "\n", + " while len(jobs) < num_jobs: #If true, should be still looking for new jobs.\n", + "\n", + " #Let the page load. Change this number based on your internet speed.\n", + " #Or, wait until the webpage is loaded, instead of hardcoding it.\n", + " time.sleep(slp_time)\n", + "\n", + " #Test for the \"Sign Up\" prompt and get rid of it.\n", + " # try:\n", + " # driver.find_element_by_class_name(\"selected\").click()\n", + " # except ElementClickInterceptedException:\n", + " # pass\n", + "\n", + " # time.sleep(.1)\n", + "\n", + " try:\n", + " driver.find_element_by_class_name(\"ModalStyle__xBtn___29PT9\").click() #clicking to the X.\n", + " except NoSuchElementException:\n", + " pass\n", + " \n", + " # found_popup = False \n", + " currentJoblist = 0\n", + " \n", + " \n", + " if not (len(jobs) >= num_jobs):\n", + " listButtonsCount = len(driver.find_elements_by_xpath('//*[@id=\"MainCol\"]//div[1]//ul//li[@data-test=\"jobListing\"]'))\n", + " print(\"&&& job butons:\" +str(listButtonsCount))\n", + " #Going through each job in this page\n", + " # job_buttons = driver.find_elements_by_class_name(\"jl\") #jl for Job Listing. These are the buttons we're going to click.\n", + " job_buttons = driver.find_elements_by_xpath('.//*[@id=\"MainCol\"]//a[@class=\"jobLink\"]') #jl for Job Listing. 
These are the buttons we're going to click.\n", + " \n", + " for job_button in job_buttons: \n", + " \n", + " print(\"Progress: {}\".format(\"\" + str(len(jobs)) + \"/\" + str(num_jobs)))\n", + " if len(jobs) >= num_jobs:\n", + " break\n", + " \n", + " \n", + " job_button.click() #You might \n", + " \n", + " time.sleep(4)\n", + " \n", + " #___________ code to kill the sign-up pop-up after it render on screen\n", + " # if not found_popup:\n", + " try:\n", + " driver.find_element_by_css_selector('[alt=\"Close\"]').click()\n", + " # print(\"&&& line 89\")\n", + " # found_popup = True\n", + " except NoSuchElementException:\n", + " # print(\"&&& line 92\")\n", + " pass\n", + " \n", + " # __________\n", + " \n", + " \n", + " collected_successfully = False\n", + " \n", + " while not collected_successfully:\n", + " try:\n", + " # company_name = driver.find_element_by_xpath('.//div[@class=\"employerName\"]').text\n", + " company_name = driver.find_element_by_xpath('//*[@id=\"MainCol\"]//li['+ str(currentJoblist + 1) +']//div[2]//a//span').text\n", + " \n", + " # location = driver.find_element_by_xpath('.//div[@class=\"location\"]').text\n", + " location = driver.find_element_by_xpath('//*[@id=\"MainCol\"]//li['+ str(currentJoblist + 1) +']//div[2]//div[2]/span').text\n", + " \n", + " # job_title = driver.find_element_by_xpath('.//div[contains(@class, \"title\")]').text\n", + " job_title = driver.find_element_by_xpath('//*[@id=\"MainCol\"]//li['+ str(currentJoblist + 1) +']//a[@data-test=\"job-link\"]').text\n", + " \n", + " job_description = driver.find_element_by_xpath('.//div[@class=\"jobDescriptionContent desc\"]').text\n", + " \n", + " # job_function is an additional information not included in previous code\n", + " job_function = driver.find_element_by_xpath('//*[@id=\"JDCol\"]//strong[text()[1]=\"Job Function\"]//following-sibling::*').text\n", + " \n", + " collected_successfully = True\n", + " except:\n", + " # print(\"&&& line 67\")\n", + " # collected_successfully=True\n", + " time.sleep(5)\n", + " \n", + " try:\n", + " # salary_estimate = driver.find_element_by_xpath('.//span[@class=\"gray small salary\"]').text\n", + " salary_estimate = driver.find_element_by_xpath('//*[@id=\"JDCol\"]//span[@data-test=\"detailSalary\"]').text\n", + " except NoSuchElementException:\n", + " salary_estimate = -1 #You need to set a \"not found value. It's important.\"\n", + " \n", + " try:\n", + " # rating = driver.find_element_by_xpath('.//span[@class=\"rating\"]').text\n", + " rating = driver.find_element_by_xpath('//*[@id=\"JDCol\"]//span[@data-test=\"detailRating\"]').text\n", + " except NoSuchElementException:\n", + " rating = -1 #You need to set a \"not found value. It's important.\"\n", + " \n", + " # #Printing for debugging\n", + " if verbose:\n", + " print(\"Job Title: {}\".format(job_title))\n", + " print(\"Salary Estimate: {}\".format(salary_estimate))\n", + " print(\"Job Description: {}\".format(job_description[:500]))\n", + " print(\"Rating: {}\".format(rating))\n", + " print(\"Company Name: {}\".format(company_name))\n", + " print(\"Location: {}\".format(location))\n", + " print(\"Job Function: {}\".format(job_function))\n", + " \n", + " #Going to the Company tab...\n", + " #clicking on this:\n", + " #
Company
\n", + " time.sleep(1)\n", + " try:\n", + " # driver.find_element_by_xpath('.//div[@class=\"tab\" and @data-tab-type=\"overview\"]').click()\n", + " driver.find_element_by_xpath('.//div[@id=\"SerpFixedHeader\"]//span[text()=\"Company\"]').click()\n", + " \n", + " # try:\n", + " # #
\n", + " # # \n", + " # # San Francisco, CA\n", + " # #
\n", + " # headquarters = driver.find_element_by_xpath('.//div[@class=\"infoEntity\"]//label[text()=\"Headquarters\"]//following-sibling::*').text\n", + " # ^^^^^^^^^^ couldn't abel to find \"headquarters\"\n", + " # except NoSuchElementException:\n", + " # headquarters = -1\n", + " \n", + " try:\n", + " # size = driver.find_element_by_xpath('.//div[@class=\"infoEntity\"]//label[text()=\"Size\"]//following-sibling::*').text\n", + " size = driver.find_element_by_xpath('.//div[@id=\"EmpBasicInfo\"]//span[text()=\"Size\"]//following-sibling::*').text\n", + " except NoSuchElementException:\n", + " size = -1\n", + " \n", + " try:\n", + " # founded = driver.find_element_by_xpath('.//div[@class=\"infoEntity\"]//label[text()=\"Founded\"]//following-sibling::*').text\n", + " founded = driver.find_element_by_xpath('.//div[@id=\"EmpBasicInfo\"]//span[text()=\"Founded\"]//following-sibling::*').text\n", + " except NoSuchElementException:\n", + " founded = -1\n", + " \n", + " try:\n", + " # type_of_ownership = driver.find_element_by_xpath('.//div[@class=\"infoEntity\"]//label[text()=\"Type\"]//following-sibling::*').text\n", + " type_of_ownership = driver.find_element_by_xpath('.//div[@id=\"EmpBasicInfo\"]//span[text()=\"Type\"]//following-sibling::*').text\n", + " except NoSuchElementException:\n", + " type_of_ownership = -1\n", + " \n", + " try:\n", + " # industry = driver.find_element_by_xpath('.//div[@class=\"infoEntity\"]//label[text()=\"Industry\"]//following-sibling::*').text\n", + " industry = driver.find_element_by_xpath('.//div[@id=\"EmpBasicInfo\"]//span[text()=\"Industry\"]//following-sibling::*').text\n", + " except NoSuchElementException:\n", + " industry = -1\n", + " \n", + " try:\n", + " # sector = driver.find_element_by_xpath('.//div[@class=\"infoEntity\"]//label[text()=\"Sector\"]//following-sibling::*').text\n", + " sector = driver.find_element_by_xpath('.//div[@id=\"EmpBasicInfo\"]//span[text()=\"Sector\"]//following-sibling::*').text\n", + " except NoSuchElementException:\n", + " sector = -1\n", + " \n", + " try:\n", + " # revenue = driver.find_element_by_xpath('.//div[@class=\"infoEntity\"]//label[text()=\"Revenue\"]//following-sibling::*').text\n", + " revenue = driver.find_element_by_xpath('.//div[@id=\"EmpBasicInfo\"]//span[text()=\"Revenue\"]//following-sibling::*').text\n", + " except NoSuchElementException:\n", + " revenue = -1\n", + " \n", + " # try:\n", + " # competitors = driver.find_element_by_xpath('.//div[@class=\"infoEntity\"]//label[text()=\"Competitors\"]//following-sibling::*').text\n", + " # ^^^^^^^^^^^ couldn't able to find \"competitors\"\n", + " # except NoSuchElementException:\n", + " # competitors = -1\n", + " \n", + " except NoSuchElementException: #Rarely, some job postings do not have the \"Company\" tab.\n", + " # headquarters = -1\n", + " size = -1\n", + " founded = -1\n", + " type_of_ownership = -1\n", + " industry = -1\n", + " sector = -1\n", + " revenue = -1\n", + " # competitors = -1\n", + " \n", + " \n", + " if verbose:\n", + " \n", + " print(\"Size: {}\".format(size))\n", + " print(\"Founded: {}\".format(founded))\n", + " print(\"Type of Ownership: {}\".format(type_of_ownership))\n", + " print(\"Industry: {}\".format(industry))\n", + " print(\"Sector: {}\".format(sector))\n", + " print(\"Revenue: {}\".format(revenue))\n", + " # print(\"Headquarters: {}\".format(headquarters))\n", + " # print(\"Competitors: {}\".format(competitors))\n", + " print(\"@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@\")\n", + " \n", + " jobs.append({\"Job Title\" : 
job_title, \n", + " \"Salary Estimate\" : salary_estimate,\n", + " \"Job Function\" : job_function,\n", + " \"Job Description\" : job_description, \n", + " \"Company Name\" : company_name,\n", + " \"Rating\" : rating, \n", + " \"Location\" : location,\n", + " \"Size\" : size,\n", + " \"Founded\" : founded,\n", + " \"Type of ownership\" : type_of_ownership,\n", + " \"Industry\" : industry,\n", + " \"Sector\" : sector,\n", + " \"Revenue\" : revenue})\n", + " # \"Headquarters\" : headquarters,\n", + " # \"Competitors\" : competitors})\n", + " # ^^^^^^^^ couldn't able to find \"Headquarters\" and \"Competitors\"\n", + " #add job to jobs\n", + " \n", + " currentJoblist=currentJoblist+1 # increasing the count of the list of buttons clicked and saved\n", + " \n", + " if not (currentJoblist < listButtonsCount): # to check the list last button and to go to next page\n", + " currentJoblist = 0 # resetting the button list count for new page button's list\n", + " break\n", + " #Clicking on the \"next page\" button\n", + " try: \n", + " # driver.find_element_by_xpath('.//li[@class=\"next\"]//a').click()\n", + " driver.find_element_by_xpath('//*[@id=\"FooterPageNav\"]//a[@data-test=\"pagination-next\"]').click()\n", + " \n", + " except NoSuchElementException:\n", + " print(\"Scraping terminated before reaching target number of jobs. Needed {}, got {}.\".format(num_jobs, len(jobs)))\n", + " break\n", + "\n", + " return pd.DataFrame(jobs) #This line converts the dictionary object into a pandas DataFrame." + ] } ], "metadata": { @@ -418,7 +687,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.7.0" + "version": "3.8.3" } }, "nbformat": 4,
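
For reference, a minimal sketch of how the patched get_jobs() could be driven from another notebook cell; the keyword, job count, chromedriver path, and sleep time are illustrative assumptions, not values taken from the patch:

# Hypothetical driver cell (not part of the patch); all argument values below are assumptions.
df = get_jobs(keyword='data scientist',
              num_jobs=25,
              verbose=True,
              path='ChromeDriver/chromedriver.exe',  # folder/exe layout described in the patch comments
              slp_time=10)
df.to_csv('glassdoor_jobs.csv', index=False)  # get_jobs returns a pandas DataFrame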