Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

glassdoor scraping.ipynb has been modified and updated as the glassdo… #4

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
271 changes: 270 additions & 1 deletion .ipynb_checkpoints/glassdoor scraping-checkpoint.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -400,6 +400,275 @@
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#!/usr/bin/env python3\n",
"# -*- coding: utf-8 -*-\n",
"\"\"\"\n",
"original author: Ömer Sakarya , Oct 15, 2019\n",
"git : https://github.com/arapfaik/scraping-glassdoor-selenium\n",
"original tutorial: https://towardsdatascience.com/selenium-tutorial-scraping-glassdoor-com-in-10-minutes-3d0915c6d905\n",
"\n",
"tutorial followed(youtube: KenJee): https://www.youtube.com/watch?v=GmW4F6MHqqs&list=PL2zq7klxX5ASFejJj80ob9ZAnBHdz5O1t\n",
"\n",
"Disclaimer: I do not own the copyright to this code; it was written as\n",
"            shown on the YouTube channel mentioned above.\n",
"            P.S.: the code has been modified to match the updated structure of\n",
"            the website for web scraping. There are fields/data that I was\n",
"            unable to find; for readability the old lines of code are left\n",
"            commented out and the updated code is added underneath. Please\n",
"            note that I have changed the column and file names to suit my\n",
"            needs, so if you copy-paste this code, check for mismatches in\n",
"            the file and data-column names used in the tutorial.\n",
"\"\"\"\n",
"from selenium.common.exceptions import NoSuchElementException, ElementClickInterceptedException\n",
"from selenium import webdriver\n",
"import time\n",
"import pandas as pd\n",
"\n",
"def get_jobs(keyword, num_jobs, verbose,path,slp_time):\n",
" \n",
" '''Gathers jobs as a dataframe, scraped from Glassdoor'''\n",
" \n",
" #Initializing the webdriver\n",
" options = webdriver.ChromeOptions()\n",
" \n",
" #Uncomment the line below if you'd like to scrape without a new Chrome window every time.\n",
" #options.add_argument('headless')\n",
" \n",
" #Change the path to where chromedriver is in your home folder.\n",
" # driver = webdriver.Chrome(executable_path=\"/Users/omersakarya/Documents/GitHub/scraping-glassdoor-selenium/chromedriver\", options=options)\n",
" # path = \"ChromeDriver/chromedriver\" # I have made a folder:\"ChromeDriver\" and put file\"chromedriver.exe\"inside this folder. \n",
" # ^^^Folder^^^/^^^this is .exe file\n",
" driver = webdriver.Chrome(executable_path=path, options=options)\n",
" driver.set_window_size(1120, 1000)\n",
" \n",
" # url = 'https://www.glassdoor.com/Job/jobs.htm?sc.keyword='+ keyword +'&includeNoSalaryJobs=false&radius=100'\n",
" url = 'https://www.glassdoor.com/Job/jobs.htm?sc.keyword=\"' + keyword + '\"&locT=C&locId=1147401&locKeyword=San%20Francisco,%20CA&jobType=all&fromAge=-1&minSalary=0&includeNoSalaryJobs=false&radius=100&cityId=-1&minRating=0.0&industryId=-1&sgocId=-1&seniorityType=all&companyId=-1&employerSizes=0&applicationType=0&remoteWorkType=0'\n",
" driver.get(url)\n",
" jobs = []\n",
"\n",
" while len(jobs) < num_jobs: #If true, should be still looking for new jobs.\n",
"\n",
" #Let the page load. Change this number based on your internet speed.\n",
" #Or, wait until the webpage is loaded, instead of hardcoding it.\n",
" time.sleep(slp_time)\n",
"\n",
" #Test for the \"Sign Up\" prompt and get rid of it.\n",
" # try:\n",
" # driver.find_element_by_class_name(\"selected\").click()\n",
" # except ElementClickInterceptedException:\n",
" # pass\n",
"\n",
" # time.sleep(.1)\n",
"\n",
" try:\n",
" driver.find_element_by_class_name(\"ModalStyle__xBtn___29PT9\").click() #clicking to the X.\n",
" except NoSuchElementException:\n",
" pass\n",
" \n",
" # found_popup = False \n",
" currentJoblist = 0\n",
" \n",
" \n",
" if not (len(jobs) >= num_jobs):\n",
" listButtonsCount = len(driver.find_elements_by_xpath('//*[@id=\"MainCol\"]//div[1]//ul//li[@data-test=\"jobListing\"]'))\n",
" print(\"&&& job butons:\" +str(listButtonsCount))\n",
" #Going through each job in this page\n",
" # job_buttons = driver.find_elements_by_class_name(\"jl\") #jl for Job Listing. These are the buttons we're going to click.\n",
" job_buttons = driver.find_elements_by_xpath('.//*[@id=\"MainCol\"]//a[@class=\"jobLink\"]') #jl for Job Listing. These are the buttons we're going to click.\n",
" \n",
" for job_button in job_buttons: \n",
" \n",
" print(\"Progress: {}\".format(\"\" + str(len(jobs)) + \"/\" + str(num_jobs)))\n",
" if len(jobs) >= num_jobs:\n",
" break\n",
" \n",
" \n",
" job_button.click() #You might \n",
" \n",
" time.sleep(4)\n",
" \n",
" #___________ code to kill the sign-up pop-up after it render on screen\n",
" # if not found_popup:\n",
" try:\n",
" driver.find_element_by_css_selector('[alt=\"Close\"]').click()\n",
" # print(\"&&& line 89\")\n",
" # found_popup = True\n",
" except NoSuchElementException:\n",
" # print(\"&&& line 92\")\n",
" pass\n",
" \n",
" # __________\n",
" \n",
" \n",
" collected_successfully = False\n",
" \n",
" while not collected_successfully:\n",
" try:\n",
" # company_name = driver.find_element_by_xpath('.//div[@class=\"employerName\"]').text\n",
" company_name = driver.find_element_by_xpath('//*[@id=\"MainCol\"]//li['+ str(currentJoblist + 1) +']//div[2]//a//span').text\n",
" \n",
" # location = driver.find_element_by_xpath('.//div[@class=\"location\"]').text\n",
" location = driver.find_element_by_xpath('//*[@id=\"MainCol\"]//li['+ str(currentJoblist + 1) +']//div[2]//div[2]/span').text\n",
" \n",
" # job_title = driver.find_element_by_xpath('.//div[contains(@class, \"title\")]').text\n",
" job_title = driver.find_element_by_xpath('//*[@id=\"MainCol\"]//li['+ str(currentJoblist + 1) +']//a[@data-test=\"job-link\"]').text\n",
" \n",
" job_description = driver.find_element_by_xpath('.//div[@class=\"jobDescriptionContent desc\"]').text\n",
" \n",
" # job_function is an additional information not included in previous code\n",
" job_function = driver.find_element_by_xpath('//*[@id=\"JDCol\"]//strong[text()[1]=\"Job Function\"]//following-sibling::*').text\n",
" \n",
" collected_successfully = True\n",
" except:\n",
" # print(\"&&& line 67\")\n",
" # collected_successfully=True\n",
" time.sleep(5)\n",
" \n",
" try:\n",
" # salary_estimate = driver.find_element_by_xpath('.//span[@class=\"gray small salary\"]').text\n",
" salary_estimate = driver.find_element_by_xpath('//*[@id=\"JDCol\"]//span[@data-test=\"detailSalary\"]').text\n",
" except NoSuchElementException:\n",
" salary_estimate = -1 #You need to set a \"not found value. It's important.\"\n",
" \n",
" try:\n",
" # rating = driver.find_element_by_xpath('.//span[@class=\"rating\"]').text\n",
" rating = driver.find_element_by_xpath('//*[@id=\"JDCol\"]//span[@data-test=\"detailRating\"]').text\n",
" except NoSuchElementException:\n",
" rating = -1 #You need to set a \"not found value. It's important.\"\n",
" \n",
" # #Printing for debugging\n",
" if verbose:\n",
" print(\"Job Title: {}\".format(job_title))\n",
" print(\"Salary Estimate: {}\".format(salary_estimate))\n",
" print(\"Job Description: {}\".format(job_description[:500]))\n",
" print(\"Rating: {}\".format(rating))\n",
" print(\"Company Name: {}\".format(company_name))\n",
" print(\"Location: {}\".format(location))\n",
" print(\"Job Function: {}\".format(job_function))\n",
" \n",
" #Going to the Company tab...\n",
" #clicking on this:\n",
" #<div class=\"tab\" data-tab-type=\"overview\"><span>Company</span></div>\n",
" time.sleep(1)\n",
" try:\n",
" # driver.find_element_by_xpath('.//div[@class=\"tab\" and @data-tab-type=\"overview\"]').click()\n",
" driver.find_element_by_xpath('.//div[@id=\"SerpFixedHeader\"]//span[text()=\"Company\"]').click()\n",
" \n",
" # try:\n",
" # #<div class=\"infoEntity\">\n",
" # # <label>Headquarters</label>\n",
" # # <span class=\"value\">San Francisco, CA</span>\n",
" # #</div>\n",
" # headquarters = driver.find_element_by_xpath('.//div[@class=\"infoEntity\"]//label[text()=\"Headquarters\"]//following-sibling::*').text\n",
" # ^^^^^^^^^^ couldn't abel to find \"headquarters\"\n",
" # except NoSuchElementException:\n",
" # headquarters = -1\n",
" \n",
" try:\n",
" # size = driver.find_element_by_xpath('.//div[@class=\"infoEntity\"]//label[text()=\"Size\"]//following-sibling::*').text\n",
" size = driver.find_element_by_xpath('.//div[@id=\"EmpBasicInfo\"]//span[text()=\"Size\"]//following-sibling::*').text\n",
" except NoSuchElementException:\n",
" size = -1\n",
" \n",
" try:\n",
" # founded = driver.find_element_by_xpath('.//div[@class=\"infoEntity\"]//label[text()=\"Founded\"]//following-sibling::*').text\n",
" founded = driver.find_element_by_xpath('.//div[@id=\"EmpBasicInfo\"]//span[text()=\"Founded\"]//following-sibling::*').text\n",
" except NoSuchElementException:\n",
" founded = -1\n",
" \n",
" try:\n",
" # type_of_ownership = driver.find_element_by_xpath('.//div[@class=\"infoEntity\"]//label[text()=\"Type\"]//following-sibling::*').text\n",
" type_of_ownership = driver.find_element_by_xpath('.//div[@id=\"EmpBasicInfo\"]//span[text()=\"Type\"]//following-sibling::*').text\n",
" except NoSuchElementException:\n",
" type_of_ownership = -1\n",
" \n",
" try:\n",
" # industry = driver.find_element_by_xpath('.//div[@class=\"infoEntity\"]//label[text()=\"Industry\"]//following-sibling::*').text\n",
" industry = driver.find_element_by_xpath('.//div[@id=\"EmpBasicInfo\"]//span[text()=\"Industry\"]//following-sibling::*').text\n",
" except NoSuchElementException:\n",
" industry = -1\n",
" \n",
" try:\n",
" # sector = driver.find_element_by_xpath('.//div[@class=\"infoEntity\"]//label[text()=\"Sector\"]//following-sibling::*').text\n",
" sector = driver.find_element_by_xpath('.//div[@id=\"EmpBasicInfo\"]//span[text()=\"Sector\"]//following-sibling::*').text\n",
" except NoSuchElementException:\n",
" sector = -1\n",
" \n",
" try:\n",
" # revenue = driver.find_element_by_xpath('.//div[@class=\"infoEntity\"]//label[text()=\"Revenue\"]//following-sibling::*').text\n",
" revenue = driver.find_element_by_xpath('.//div[@id=\"EmpBasicInfo\"]//span[text()=\"Revenue\"]//following-sibling::*').text\n",
" except NoSuchElementException:\n",
" revenue = -1\n",
" \n",
" # try:\n",
" # competitors = driver.find_element_by_xpath('.//div[@class=\"infoEntity\"]//label[text()=\"Competitors\"]//following-sibling::*').text\n",
" # ^^^^^^^^^^^ couldn't able to find \"competitors\"\n",
" # except NoSuchElementException:\n",
" # competitors = -1\n",
" \n",
" except NoSuchElementException: #Rarely, some job postings do not have the \"Company\" tab.\n",
" # headquarters = -1\n",
" size = -1\n",
" founded = -1\n",
" type_of_ownership = -1\n",
" industry = -1\n",
" sector = -1\n",
" revenue = -1\n",
" # competitors = -1\n",
" \n",
" \n",
" if verbose:\n",
" \n",
" print(\"Size: {}\".format(size))\n",
" print(\"Founded: {}\".format(founded))\n",
" print(\"Type of Ownership: {}\".format(type_of_ownership))\n",
" print(\"Industry: {}\".format(industry))\n",
" print(\"Sector: {}\".format(sector))\n",
" print(\"Revenue: {}\".format(revenue))\n",
" # print(\"Headquarters: {}\".format(headquarters))\n",
" # print(\"Competitors: {}\".format(competitors))\n",
" print(\"@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@\")\n",
" \n",
" jobs.append({\"Job Title\" : job_title, \n",
" \"Salary Estimate\" : salary_estimate,\n",
" \"Job Function\" : job_function,\n",
" \"Job Description\" : job_description, \n",
" \"Company Name\" : company_name,\n",
" \"Rating\" : rating, \n",
" \"Location\" : location,\n",
" \"Size\" : size,\n",
" \"Founded\" : founded,\n",
" \"Type of ownership\" : type_of_ownership,\n",
" \"Industry\" : industry,\n",
" \"Sector\" : sector,\n",
" \"Revenue\" : revenue})\n",
" # \"Headquarters\" : headquarters,\n",
" # \"Competitors\" : competitors})\n",
" # ^^^^^^^^ couldn't able to find \"Headquarters\" and \"Competitors\"\n",
" #add job to jobs\n",
" \n",
" currentJoblist=currentJoblist+1 # increasing the count of the list of buttons clicked and saved\n",
" \n",
" if not (currentJoblist < listButtonsCount): # to check the list last button and to go to next page\n",
" currentJoblist = 0 # resetting the button list count for new page button's list\n",
" break\n",
" #Clicking on the \"next page\" button\n",
" try: \n",
" # driver.find_element_by_xpath('.//li[@class=\"next\"]//a').click()\n",
" driver.find_element_by_xpath('//*[@id=\"FooterPageNav\"]//a[@data-test=\"pagination-next\"]').click()\n",
" \n",
" except NoSuchElementException:\n",
" print(\"Scraping terminated before reaching target number of jobs. Needed {}, got {}.\".format(num_jobs, len(jobs)))\n",
" break\n",
"\n",
" return pd.DataFrame(jobs) #This line converts the dictionary object into a pandas DataFrame."
]
}
],
"metadata": {
Expand All @@ -418,7 +687,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.0"
"version": "3.8.3"
}
},
"nbformat": 4,
Expand Down
Loading