Skip to content

Commit

Permalink
Add DownloadPass movie,scene,performer byURL script (#2114)
Browse files Browse the repository at this point in the history
* downloadpass sceneByURL xPathScraper

* downloadpass movie,scene,performer byURL script

* remove extraneous type import

* fix performer url
  • Loading branch information
WellDoneLiz authored Nov 21, 2024
1 parent d62c060 commit c8e9609
Show file tree
Hide file tree
Showing 2 changed files with 264 additions and 0 deletions.
233 changes: 233 additions & 0 deletions scrapers/DownloadPass/DownloadPass.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,233 @@
import json
import re
import sys
from datetime import datetime as dt

from py_common import log
from py_common.deps import ensure_requirements
from py_common.types import (
ScrapedMovie,
ScrapedPerformer,
ScrapedScene,
ScrapedStudio,
ScrapedTag,
)
from py_common.util import scraper_args

ensure_requirements("requests", "lxml")
import requests # noqa: E402
from lxml import html # noqa: E402

BASE_URL = "https://www.downloadpass.com"


def get_html_from_url(url: str, session: requests.Session) -> html.HtmlElement | None:
try:
res = session.get(url)
res.raise_for_status()
except Exception as ex:
log.error(f"Error getting URL: {ex}")
else:
return html.fromstring(res.text)


def get_first_elem_text(root: html.HtmlElement, expression: str) -> str | None:
try:
el = root.xpath(expression)[0]
except IndexError:
log.warning(f"Element not found: {expression}")
return None
if isinstance(el, html.HtmlElement):
return el.text
return el


def urls_from_style(style_text: str) -> list[str]:
return re.findall(r"(https?://\S+)\)", style_text)


def movie_from_url(url: str) -> ScrapedMovie:
def get_table_text(tree: html.HtmlElement, header: str):
return get_first_elem_text(tree, ".//li/span[text()='" + header + "']/parent::li/text()")

sess = requests.Session()

url = url.replace("heatwavepass.com", "downloadpass.com")
tree = get_html_from_url(url, sess)
if tree is None:
sys.exit(1)

try:
info_el = tree.xpath("//div[contains(@class, 'dvd-info')]")[0]
except IndexError:
log.warning("Info element not found")
sys.exit(1)

group = ScrapedMovie()

# Name
name_text = get_first_elem_text(info_el, "./h1")
if name_text:
group["name"] = name_text
# Date
date_text = get_table_text(info_el, "Added")
if date_text:
group["date"] = dt.strptime(date_text, "%b %d, %Y").strftime("%Y-%m-%d")
# Duration
duration_text = get_table_text(info_el, "Duration")
if duration_text:
duration = 0
duration_parts = re.findall(r"(\d+[h|m|s])", duration_text)
for part in duration_parts:
t_abbr = part[-1]
t_val = int(part[:-1])
match t_abbr:
case "s":
t_mult = 1
case "m":
t_mult = 60
case "h":
t_mult = 60 * 60
duration = duration + t_val * t_mult
if duration:
group["duration"] = str(duration)
# Covers
cover_map = {"front_image": "cover-front", "back_image": "cover-back"}
for k, v in cover_map.items():
cover_style_text = get_first_elem_text(info_el, f".//div[@id='{v}']/@style")
try:
cover_url = urls_from_style(cover_style_text)[0]
except (TypeError, IndexError):
pass
else:
group[k] = cover_url
# URL
group_url_text = get_first_elem_text(tree, "//link[@rel='canonical']/@href")
if group_url_text:
group["url"] = group_url_text

return group


def scene_from_url(url: str) -> ScrapedScene:
sess = requests.Session()

tree = get_html_from_url(url, sess)
if tree is None:
sys.exit(1)

scene = ScrapedScene()

# Details
try:
scene["details"] = tree.xpath("//meta[@name='description']/@content")[0]
except IndexError:
log.error("Details element not found.")

# URL
try:
scene["url"] = tree.xpath("//link[@rel='canonical']/@href")[0]
except IndexError:
log.error("Canonical link element not found.")

try:
player_el = tree.xpath("//div[@id='player_page']")[0]
except IndexError:
log.error("Player element not found.")
else:
# Title
title_el = get_first_elem_text(player_el, ".//h1[@class='title']")
if title_el:
scene["title"] = title_el
# Image
image_text = get_first_elem_text(player_el, ".//div[@id='promo-shots']/div[1]/@style")
if image_text:
image_url = urls_from_style(image_text)[0]
# Use the first thumbnail image name as scene image name. Naming convention happens to align.
scene["image"] = re.sub(r"(.+)/images/(.+)/crop/\d+x\d+/(.+)", r"\1/sc/\2/\3", image_url)
# Performers
performer_els = player_el.xpath(".//div[contains(@class, 'starItem')]")
performers = []
for p_el in performer_els:
p_name = get_first_elem_text(p_el, "./div[@class='name']/a")
if p_name:
performer = ScrapedPerformer(name=p_name)
p_url = get_first_elem_text(p_el, "./div[@class='name']/a/@href")
if p_url:
performer["urls"] = [BASE_URL + p_url]
performers.append(performer)
scene["performers"] = performers

try:
info_el = tree.xpath("//div[@id='info_container']")[0]
except IndexError:
log.error("Info element not found.")
else:
# Date
date_text = get_first_elem_text(info_el, ".//span[text()='Added']/parent::p/text()")
if date_text:
scene["date"] = dt.strptime(date_text.replace("Added", "").strip(), "%B %d, %Y").strftime("%Y-%m-%d")
# Groups
group_title = get_first_elem_text(info_el, ".//span[text()='DVD Title']/parent::p/a")
if group_title:
group = ScrapedMovie(name=group_title)
group_url = get_first_elem_text(tree, "//div[@id='right']/div[@class='dvd']/h4/a/@href")
if group_url:
group["url"] = BASE_URL + group_url
scene["movies"] = [group]
# Tags
tag_els = info_el.xpath(".//span[text()='Tags']/parent::p/a")
if tag_els:
scene["tags"] = [ScrapedTag(name=t.text) for t in tag_els]

# Studio
studio_url = get_first_elem_text(tree, "//div[@id='right']/div[@class='dvd']/h4/a/@href")
if studio_url:
studio_tree = get_html_from_url(BASE_URL + studio_url, sess)
if studio_tree is not None:
studio_name = get_first_elem_text(
studio_tree,
"//div[@id='content']//div[contains(@class, 'dvd-info')]//li/span[text()='Studio']/parent::li/a",
)
if studio_name:
scene["studio"] = ScrapedStudio(name=studio_name)

return scene


def performer_from_url(url: str) -> ScrapedPerformer:
sess = requests.Session()
tree = get_html_from_url(url, sess)
if tree is None:
sys.exit(1)

try:
bio_elem = tree.xpath("//div[contains(@class, 'pornstar-bio')]")[0]
except IndexError:
log.error("Bio element not found.")
sys.exit(1)

p_id = re.findall(r".+?(\d+)", url)[0]
performer = ScrapedPerformer(
name=get_first_elem_text(bio_elem, "./h1"),
image=f"https://images.downloadpass.com/images/headshots/{p_id[:1]}/{p_id}/crop/300.jpg",
)

return performer


if __name__ == "__main__":
op, args = scraper_args()
result = None
match op, args:
case "movie-by-url", {"url": url} if url:
result = movie_from_url(url)
case "scene-by-url", {"url": url} if url:
result = scene_from_url(url)
case "performer-by-url", {"url": url} if url:
result = performer_from_url(url)
case _:
log.error(f"Not Implemented: Operation: {op}, arguments: {json.dumps(args)}")
sys.exit(1)

print(json.dumps(result))
31 changes: 31 additions & 0 deletions scrapers/DownloadPass/DownloadPass.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
name: DownloadPass

movieByURL:
- action: script
url:
- downloadpass.com/dvds/
- heatwavepass.com/dvds/
script:
- python3
- DownloadPass.py
- movie-by-url
sceneByURL:
- action: script
url:
- downloadpass.com/scenes/
- heatwavepass.com/scenes/
script:
- python3
- DownloadPass.py
- scene-by-url
performerByURL:
- action: script
url:
- downloadpass.com/pornstars/
- heatwavepass.com/pornstars/
script:
- python3
- DownloadPass.py
- performer-by-url

# Last Updated November 21, 2024

0 comments on commit c8e9609

Please sign in to comment.