Skip to content

Commit

Permalink
April 2022 MP3 download issue and URL path fixes
Browse files Browse the repository at this point in the history
  • Loading branch information
GatorQue committed Apr 16, 2022
1 parent caff617 commit 1f8883f
Show file tree
Hide file tree
Showing 2 changed files with 23 additions and 6 deletions.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ This script is ideal for:
- *Anyone* who wishes to study from the conference talks but doesn't have a reliable internet connection.

## What can it do?
This script will allow you to download the LDS General Conference talks in mp3 form that are available at https://www.churchofjesuschrist.org/general-conference.
This script will allow you to download the LDS General Conference talks in mp3 form that are available at https://www.churchofjesuschrist.org/study/general-conference.
It will create *playlists* as `*.m3u` files to allow you to play an *entire session*.
It will also create playlists for *speakers* and *topics*.
This will not only work with the default English versions, but also for *every other language* for which audio files are available.
Expand Down
27 changes: 22 additions & 5 deletions gen_conf_downloader.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
"""

import argparse
import base64
import datetime
import glob
import io
Expand Down Expand Up @@ -35,17 +36,20 @@
TalkByTopic = namedtuple('TalkByTopic', 'link speaker title topic')

LDS_ORG_URL = 'https://www.churchofjesuschrist.org'
ALL_CONFERENCES_URL = f'{LDS_ORG_URL}/general-conference/conferences'
ALL_TOPICS_URL = f'{LDS_ORG_URL}/general-conference/topics'
ALL_CONFERENCES_URL = f'{LDS_ORG_URL}/study/general-conference'
ALL_TOPICS_URL = f'{LDS_ORG_URL}/study/general-conference/topics'

# Regex patterns for the conference index pages. They are raw strings so the
# regex escapes (\d, \?) are not treated as (invalid) Python string escapes.

# Captures (data-clang value, link text) from a language-selection anchor.
GET_LANGS_REGEX = r'data-lang=".*?" data-clang="(.*?)">(.*?)</a>'
# Captures (href, "Month YYYY" label) from an anchor wrapping an image and a span.
CONFERENCES_REGEX = r'<a[^>]*href="([^"]*)"[^>]*><div[^>]*><img[^>]*></div><span[^>]*>([A-Z][a-z]* \d{4})</span></a>'
# Captures (href, "YYYY?YYYY" label); the unescaped '.' accepts any single
# separator character between the two 4-digit numbers.
CONFERENCE_GROUPS_REGEX = r'<a[^>]*href="([^"]*)"[^>]*><div[^>]*><img[^>]*></div><span[^>]*>(\d{4}.\d{4})</span></a>'
# Captures the two back-to-back 4-digit numbers that precede '?lang=' in a link.
CONFERENCE_GROUPS_RANGE_REGEX = r'.*/(\d{4})(\d{4})\?lang=.*'
# Captures (4-digit year, 2-digit month) from a link ending in 'YYYY/MM?lang=...'.
CONFERENCE_LINK_YEAR_MONTH_REGEX = r'.*(\d{4})/(\d{2})\?lang=.*'

# Regex patterns for locating a talk's MP3 link. They are raw strings so the
# regex escapes (\(, \., \?) are not treated as (invalid) Python string escapes.

# Captures the double-quoted payload assigned to window.__INITIAL_STATE__
# inside a <script> element.
SCRIPT_BASE64_REGEX = r'<script>window.__INITIAL_STATE__[^"]*"([^"]*)";</script>'
# Captures the href of the anchor whose text starts with "This Page (MP3)".
MP3_DOWNLOAD_REGEX = r'<a[^>]*href="([^"]*)"[^>]*>This Page \(MP3\).*?</a>'
# Captures the '<name>.mp3' file name that precedes '?lang=' in a download URL.
MP3_FILENAME_REGEX = r'.*/(.*\.mp3)\?lang=.*'
# Same pattern as MP3_FILENAME_REGEX under a more specific name.
MP3_DOWNLOAD_FILENAME_REGEX = r'.*/(.*\.mp3)\?lang=.*'
# Captures the mediaUrl value of a JSON object whose variant is "audio".
MP3_MEDIAURL_REGEX = r'{"mediaUrl":"([^"]*)","variant":"audio"}'
# Captures the '<name>.mp3' file name from a URL with no language query suffix.
MP3_MEDIAURL_FILENAME_REGEX = r'.*/(.*\.mp3)'

# Captures (href, span text, inner markup of the <ul> that follows) for an
# anchor immediately followed by a list.
# NOTE(review): presumably one match per conference session, with the <ul>
# holding that session's talks -- confirm against the live page markup.
SESSIONS_REGEX = '<a[^>]*href="([^"]*)"[^>]*><div[^>]*><p><span[^>]*>([^<]*)</span></p></div></a><ul[^>]*>(.*?)</ul>'
# Captures (href, span text, text of the following <p>) for an anchor.
# NOTE(review): presumably (talk link, title, speaker) -- confirm against callers.
SESSION_TALKS_REGEX = '<a[^>]*href="([^"]*)"[^>]*><div[^>]*><p><span[^>]*>([^<]*)</span></p><p[^>]*>([^<]*)</p></div></a>'
Expand Down Expand Up @@ -302,10 +306,23 @@ def get_all_talks_by_topic(args):
def get_audio(args, talk):
link_html = get_html(args, f'{LDS_ORG_URL}{decode(talk.link)}')
mp3_link = re.search(MP3_DOWNLOAD_REGEX, link_html)
if not mp3_link:
# In April 2022 the MP3 link became buried in base64 encoded script section
match = re.search(SCRIPT_BASE64_REGEX, link_html)
if mp3_link:
# Extract and reuse the filename from the MP3 URL (exclude language)
mp3_file = re.match(MP3_DOWNLOAD_FILENAME_REGEX, mp3_link.group(1))
elif not mp3_link and not match:
return
elif not mp3_link and match:
# MP3 link is probably in the base64 encoded script section
script_data = str(base64.b64decode(match.group(1)))
# Search for JSON object containing mediaUrl key and value
mp3_link = re.search(MP3_MEDIAURL_REGEX, script_data)
if not mp3_link:
return
# Extract and reuse the filename from the MP3 URL
mp3_file = re.match(MP3_MEDIAURL_FILENAME_REGEX, mp3_link.group(1))

mp3_file = re.match(MP3_FILENAME_REGEX, mp3_link.group(1))
if not mp3_file:
return

Expand Down

0 comments on commit 1f8883f

Please sign in to comment.