-
Notifications
You must be signed in to change notification settings - Fork 4
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Used only one session, dynamically fetched the page count, and added a function to get staff by department
- Loading branch information
Digvijay Narayan
authored and
Digvijay Narayan
committed
Apr 23, 2024
1 parent
51cab00
commit df00203
Showing
2 changed files
with
141 additions
and
106 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,136 +1,172 @@ | ||
import time | ||
# from selenium import webdriver | ||
# from selenium.webdriver.chrome.options import Options | ||
# from selenium.webdriver.common.by import By | ||
# from selenium.webdriver.support.ui import WebDriverWait | ||
# from selenium.webdriver.support import expected_conditions as EC | ||
from requests_html import HTMLSession | ||
import requests | ||
from bs4 import BeautifulSoup | ||
from ..models.staff import Staff | ||
from ..models.staff import Staff | ||
|
||
|
||
class StaffPageHandler:
    """Scraper for https://staff.pes.edu that turns profile pages into Staff models."""

    @staticmethod
    def get_staff_details() -> list:
        """Scrape every staff profile linked from the A-Z listing.

        The number of listing pages is discovered dynamically from the
        pagination widget, and a single HTTP session is reused for all
        requests.

        Returns:
            list: one ``Staff`` object per profile found.

        Raises:
            ConnectionError: if any page cannot be fetched or parsed.
        """
        # Create the session before the try-block so the finally-clause can
        # never hit an unbound name if construction itself fails.
        session = HTMLSession()
        try:
            base_url = "https://staff.pes.edu/atoz/"
            response = session.get(base_url)
            if response.status_code != 200:
                raise ConnectionError(f"Failed to fetch URL: {base_url}")

            soup = BeautifulSoup(response.text, "html.parser")
            # The pagination end-marker span carries the last page number,
            # so the page count is not hard-coded.
            last_page_span = soup.find("span", {"aria-hidden": "true"})
            last_page_number = int(last_page_span.get_text())

            pesu_staff_list = []
            for page_num in range(1, last_page_number + 1):
                print("Scraping page:", page_num)
                page_url = f"{base_url}?page={page_num}"
                # Fetch each listing page exactly once (the previous version
                # issued the same GET twice per iteration).
                response = session.get(page_url)
                if response.status_code != 200:
                    raise ConnectionError(f"Failed to fetch URL: {page_url}")
                soup = BeautifulSoup(response.text, "html.parser")

                for staff_div in soup.find_all("div", class_="staff-profile"):
                    anchor_tag = staff_div.find("a", class_="geodir-category-img_item")
                    if anchor_tag:
                        # Profile hrefs are site-relative ("/name"); strip the
                        # leading slash and join with the host.
                        request_path = "https://staff.pes.edu/" + anchor_tag["href"][1:]
                        pesu_staff = StaffPageHandler.get_details_from_url(
                            request_path, session
                        )
                        pesu_staff_list.append(pesu_staff)

            return pesu_staff_list

        except Exception as e:
            print(f"Error occurred: {e}")
            raise ConnectionError("Unable to fetch staff data.")
        finally:
            session.close()

    @staticmethod
    def _section_items(soup, heading):
        """Collect the <p> texts from the ``ul.ul-item-left`` list that follows
        every <h3> whose text equals *heading* (e.g. "Education", "Experience")."""
        details = []
        for h3 in soup.find_all("h3"):
            if h3.get_text(strip=True) != heading:
                continue
            item_list = h3.find_next("ul", class_="ul-item-left")
            if not item_list:
                continue
            for item in item_list.find_all("li"):
                p = item.find("p")
                # Guard: an <li> without a <p> would otherwise raise
                # AttributeError and abort the whole scrape.
                if p:
                    details.append(p.text.strip())
        return details

    @staticmethod
    def get_details_from_url(url, session):
        """Parse one staff profile page into a ``Staff`` object.

        Args:
            url: absolute URL of the profile page.
            session: the shared HTTP session to fetch with.

        Raises:
            ConnectionError: if the page does not respond with HTTP 200.
        """
        response = session.get(url)
        if response.status_code != 200:
            raise ConnectionError(f"Failed to fetch URL: {url}")
        soup = BeautifulSoup(response.text, "html.parser")

        # Name: first <h4> on the page.
        name_tag = soup.find("h4")
        name = name_tag.text.strip() if name_tag else None

        # Domains taught: list items inside the "teaching" tab.
        teaching_items = soup.select(
            "#tab-teaching .bookings-item-content ul.ul-item-left li"
        )
        domains = [item.text.strip() for item in teaching_items]

        # Designation: first <h5>, whitespace-normalized. Guarded so a page
        # without an <h5> yields None instead of crashing.
        designation_tag = soup.find("h5")
        designation = (
            " ".join(designation_tag.text.split()) if designation_tag else None
        )

        professor_education = StaffPageHandler._section_items(soup, "Education")
        professor_experience = StaffPageHandler._section_items(soup, "Experience")

        # Email: the anchor whose href AND visible text both mention pes.edu.
        # None (not an empty list) when no such anchor exists.
        email = None
        for tag in soup.find_all("a"):
            if "pes.edu" in tag.get("href", "") and "pes.edu" in tag.get_text():
                email = tag.get_text()
                break

        # Department: first contact card. Note the site's own class name is
        # spelled "contat-card".
        department = None
        contact_cards = soup.find_all("li", class_="contat-card")
        if contact_cards:
            department_paragraph = contact_cards[0].find("p")
            if department_paragraph:
                department = department_paragraph.get_text(strip=True)

        # Campus: second contact card, absent on some profiles.
        campus = None
        if len(contact_cards) > 1:
            campus_paragraph = contact_cards[1].find("p")
            if campus_paragraph:
                campus = campus_paragraph.get_text(strip=True)

        # Responsibilities: every <p> inside the responsibilities tab, if any.
        responsibilities = []
        responsibilities_div = soup.find("div", id="tab-responsibilities")
        if responsibilities_div is not None:
            responsibilities = [p.text for p in responsibilities_div.find_all("p")]

        return Staff(
            name=name,
            designation=designation,
            education=professor_education,
            experience=professor_experience,
            department=department,
            campus=campus,
            domains=domains,
            mail=email,
            responsibilities=responsibilities,
        )

    @staticmethod
    def get_staff(department=None, designation=None):
        """Return all staff, optionally filtered by department and/or designation.

        Args:
            department: exact department name to keep, or None for all.
            designation: exact designation to keep, or None for all.
        """
        filtered_staff = StaffPageHandler.get_staff_details()
        if department:
            filtered_staff = [
                staff for staff in filtered_staff if staff.department == department
            ]
        if designation:
            filtered_staff = [
                staff for staff in filtered_staff if staff.designation == designation
            ]
        return filtered_staff
|
||
# def main(): | ||
# #usage | ||
# cse_staff = StaffPageHandler.get_staff(department="Computer Science") | ||
# for staff_member in cse_staff: | ||
# print(staff_member.name) | ||
|
||
|
||
# if __name__ == "__main__": | ||
# main() |