Skip to content

Commit

Permalink
Used only one session, dynamically getting the page number, and added a function to…
Browse files Browse the repository at this point in the history
… get staff by dept
  • Loading branch information
Digvijay Narayan authored and Digvijay Narayan committed Apr 23, 2024
1 parent 51cab00 commit df00203
Show file tree
Hide file tree
Showing 2 changed files with 141 additions and 106 deletions.
13 changes: 6 additions & 7 deletions pesuacademy/models/staff.py
Original file line number Diff line number Diff line change
@@ -1,19 +1,18 @@
from typing import Optional



class Staff:
def __init__(
self,
name: str,
designation: str,
designation: str,
campus: str,
department: str,
mail : str,
domains: Optional[list]=None,
responsibilities: Optional[list]=None,
education: Optional[list]=None,
experience: Optional[list]=None,
mail: str,
domains: Optional[list] = None,
responsibilities: Optional[list] = None,
education: Optional[list] = None,
experience: Optional[list] = None,
):
self.name = name
self.designation = designation
Expand Down
234 changes: 135 additions & 99 deletions pesuacademy/pages/staff.py
Original file line number Diff line number Diff line change
@@ -1,136 +1,172 @@
import time
# from selenium import webdriver
# from selenium.webdriver.chrome.options import Options
# from selenium.webdriver.common.by import By
# from selenium.webdriver.support.ui import WebDriverWait
# from selenium.webdriver.support import expected_conditions as EC
from requests_html import HTMLSession
import requests
from bs4 import BeautifulSoup
from ..models.staff import Staff
from ..models.staff import Staff


class StaffPageHandler:
    """Scrapes staff profiles from https://staff.pes.edu and builds Staff objects."""

    @staticmethod
    def get_staff_details() -> list:
        """Scrape every staff profile from the A-Z listing.

        Walks the paginated listing (page count is read from the pagination
        widget rather than hard-coded), follows each profile link, and returns
        a list of Staff objects.

        Raises:
            ConnectionError: if any page fetch fails or parsing errors occur.
        """
        base_url = "https://staff.pes.edu/atoz/"
        # Create the session before the try block would risk leaking it on a
        # later error; create it as None first so the finally-close is safe
        # even if HTMLSession() itself raises.
        session = None
        try:
            session = HTMLSession()
            response = session.get(base_url)
            if response.status_code != 200:
                raise ConnectionError(f"Failed to fetch URL: {base_url}")

            soup = BeautifulSoup(response.text, "html.parser")
            # The last page number is taken from the pagination end marker,
            # so new pages are picked up automatically.
            last_page_span = soup.find("span", {"aria-hidden": "true"})
            last_page_number = int(last_page_span.get_text())

            pesu_staff_list = []
            for page_num in range(1, last_page_number + 1):
                staff_url = f"{base_url}?page={page_num}"
                # Fetch each listing page exactly once (the original fetched
                # it twice back to back).
                response = session.get(staff_url)
                if response.status_code != 200:
                    raise ConnectionError(f"Failed to fetch URL: {staff_url}")
                soup = BeautifulSoup(response.text, "html.parser")

                staff_divs = soup.find_all("div", class_="staff-profile")
                for staff_div in staff_divs:
                    anchor_tag = staff_div.find(
                        "a", class_="geodir-category-img_item"
                    )
                    if not anchor_tag:
                        continue
                    # Profile hrefs are site-relative (leading "/"); join them
                    # onto the staff host.
                    base_url_single_staff = "https://staff.pes.edu/"
                    profile_href = anchor_tag["href"]
                    request_path = base_url_single_staff + profile_href[1:]
                    pesu_staff = StaffPageHandler.get_details_from_url(
                        request_path, session
                    )
                    pesu_staff_list.append(pesu_staff)

            return pesu_staff_list

        except Exception as e:
            # Surface scraping failures uniformly, but keep the original
            # traceback chained for debugging.
            raise ConnectionError("Unable to fetch staff data.") from e
        finally:
            if session is not None:
                session.close()

    @staticmethod
    def get_details_from_url(url, session):
        """Parse a single staff profile page into a Staff object.

        Args:
            url: absolute URL of the staff member's profile page.
            session: an open HTMLSession used for the request.

        Raises:
            ConnectionError: if the profile page cannot be fetched.
        """
        response = session.get(url)
        if response.status_code != 200:
            raise ConnectionError(f"Failed to fetch URL: {url}")
        soup = BeautifulSoup(response.text, "html.parser")

        # Name: first <h4> on the page.
        name_tag = soup.find("h4")
        name = name_tag.text.strip() if name_tag else None

        # Teaching domains: list items inside the "teaching" tab.
        teaching_items = soup.select(
            "#tab-teaching .bookings-item-content ul.ul-item-left li"
        )
        domains = [item.text.strip() for item in teaching_items]

        # Designation: first <h5>, with internal whitespace collapsed.
        designation = soup.find("h5")
        designation = " ".join(designation.text.split())

        # Education and experience share the same page structure: an <h3>
        # heading followed by a <ul class="ul-item-left"> of <li><p> entries.
        def _section_entries(heading_text):
            entries = []
            for h3 in soup.find_all("h3"):
                if h3.get_text(strip=True) != heading_text:
                    continue
                item_list = h3.find_next("ul", class_="ul-item-left")
                if item_list:
                    entries.extend(
                        item.find("p").text.strip()
                        for item in item_list.find_all("li")
                    )
            return entries

        professor_education = _section_entries("Education")
        professor_experience = _section_entries("Experience")

        # Email: an anchor whose href AND text both mention pes.edu.
        all_a_tags = soup.find_all("a")
        email_tags = [
            tag
            for tag in all_a_tags
            if "pes.edu" in tag.get("href", "") and "pes.edu" in tag.get_text()
        ]
        # Use None (not an empty list) when no address was found.
        email = email_tags[0].get_text() if email_tags else None

        # Department: first contact card ("contat-card" is the site's own
        # class-name typo — do not correct it).
        department_element = soup.find("li", class_="contat-card")
        department_paragraph = department_element.find("p")
        department = department_paragraph.get_text(strip=True)

        # Campus: second contact card, absent on some profiles.
        campus = None
        try:
            campus_element = soup.find_all("li", class_="contat-card")[1]
            campus_paragraph = campus_element.find("p")
            if campus_paragraph:
                campus = campus_paragraph.get_text(strip=True)
        except IndexError:
            campus = None

        # Responsibilities: every <p> inside the responsibilities tab, if any.
        responsibilities = []
        responsibilities_div = soup.find("div", id="tab-responsibilities")
        if responsibilities_div is not None:
            responsibilities = [
                p.text for p in responsibilities_div.find_all("p")
            ]

        return Staff(
            name=name,
            designation=designation,
            education=professor_education,
            experience=professor_experience,
            department=department,
            campus=campus,
            domains=domains,
            mail=email,
            responsibilities=responsibilities,
        )

    @staticmethod
    def get_staff(department=None, designation=None):
        """Return all staff, optionally filtered by department and/or designation.

        Args:
            department: exact department name to match, or None for all.
            designation: exact designation to match, or None for all.
        """
        filtered_staff = StaffPageHandler.get_staff_details()

        if department:
            filtered_staff = [
                staff for staff in filtered_staff if staff.department == department
            ]

        if designation:
            filtered_staff = [
                staff for staff in filtered_staff if staff.designation == designation
            ]

        return filtered_staff

# def main():
# #usage
# cse_staff = StaffPageHandler.get_staff(department="Computer Science")
# for staff_member in cse_staff:
# print(staff_member.name)


# if __name__ == "__main__":
# main()

0 comments on commit df00203

Please sign in to comment.