class TorahDownloadsExtractor(Extractor):
    """Extract audio content from TorahDownloads.com.

    This extractor handles URLs from torahdownloads.com and extracts MP3 download
    links, along with their associated titles, from various locations in the page.
    """

    name: str = "TorahDownloads"
    homepage: str = "https://torahdownloads.com"

    EXAMPLES = [  # noqa: RUF012
        ExtractionExample(
            name="main_page",
            url="https://torahdownloads.com/shiur-23156.html",
            download_url="https://torahcdn.net/tdn/23156.mp3",
            title="Acharei Mos - Maavir Sedra of Pesukim - Rabbi Dovid Grossman - TD23156",
            file_format="audio/mp3",
            valid=True,
        ),
        ExtractionExample(
            name="intro_to_prayer",
            url="https://torahdownloads.com/shiur-13655.html",
            download_url="https://torahcdn.net/tdn/13655.mp3",
            title="Intro To Prayer - Rabbi Mordechai Becher - TD13655",
            file_format="audio/mp3",
            valid=True,
        ),
        ExtractionExample(
            name="invalid_link",
            url="https://torahdownloads.com/shiur-00000.html",
            download_url="",
            title="",
            file_format="",
            valid=False,
        ),
    ]

    # URL pattern for TorahDownloads.com pages
    URL_PATTERN = re.compile(r"https?://(?:www\.)?torahdownloads\.com/")

    # Pattern to find the download URL in script tags
    SCRIPT_URL_PATTERN = re.compile(r"(?:audioUrl|audio_url|url)\s*:\s*['\"]([^'\"]+\.mp3)['\"]", flags=re.IGNORECASE)

    # Fallback pattern: any bare .mp3 URL appearing in the raw HTML
    RAW_URL_PATTERN = re.compile(r"https?://[^\"'\s]+\.mp3")

    @property
    def url_patterns(self) -> list[Pattern]:
        """Return the URL pattern(s) that this extractor can handle.

        Returns:
            list[Pattern]: List of compiled regex patterns matching TorahDownloads.com URLs
        """
        return [self.URL_PATTERN]

    def _extract_title(self, soup: BeautifulSoup) -> str | None:
        """Extract the title from the page using various selectors.

        Args:
            soup: BeautifulSoup object of the page

        Returns:
            str | None: The extracted title, or None if not found
        """
        # Try finding the title in the Details section
        if details := soup.find("div", string="Details"):  # noqa: SIM102
            if length_text := details.find_next(string=lambda text: text and "Length:" in text):  # noqa: SIM102
                # Take the nearest non-empty text node before the Length field
                if title_node := length_text.find_previous(
                    string=lambda text: text and text.strip() and "Details" not in text
                ):
                    return title_node.strip()

        # Try finding the title in the breadcrumb/navigation area
        if nav_title := soup.find("div", class_="nav-title"):
            return nav_title.get_text().strip()

        # Fall back to the first standalone text that looks like a title
        for text in soup.stripped_strings:
            text = text.strip()
            # Skip common non-title text
            if (
                text
                and len(text) > 3
                and "Length:" not in text
                and "Details" not in text
                and "Source" not in text
                and "Speaker" not in text
                and "Category" not in text
                and "Language" not in text
            ):
                return text

        return None

    def extract(self, url: str) -> Extraction:
        """Extract the download URL and title from a TorahDownloads.com page.

        Args:
            url: The TorahDownloads.com URL to extract from

        Returns:
            Extraction: Object containing the download URL, title, file format, and file name

        Raises:
            NetworkError: If there are network-related issues fetching the page
            DownloadURLError: If no download URL can be found on the page
        """
        try:
            response = requests.get(url, timeout=30, headers={"User-Agent": "torah-dl/1.0"})
            response.raise_for_status()
        except requests.RequestException as e:
            raise NetworkError(str(e)) from e  # pragma: no cover

        # Parse the page content; keep the decoded HTML for the regex fallback below
        soup = BeautifulSoup(response.content, "html.parser")
        html = response.text

        # Extract the title first since we'll need it for all cases
        title = self._extract_title(soup)
        download_url = None

        # Try finding an audio element or direct .mp3 link first
        media_selector = 'audio source[src*=".mp3"], audio[src*=".mp3"], a[href*=".mp3"]'
        if audio_element := soup.select_one(media_selector):
            download_url = audio_element.get("src") or audio_element.get("href")

        # Try finding a download link with various patterns
        if not download_url:
            download_selector = 'a[href*="/download/"], a[href*="getfile"], a[href*="audio"]'
            if download_link := soup.select_one(download_selector):
                download_url = download_link.get("href")

        # Try finding the audio URL in script tags
        if not download_url:
            for script in soup.find_all("script"):
                content = script.string or ""
                if media_url_match := self.SCRIPT_URL_PATTERN.search(content):
                    download_url = media_url_match.group(1)
                    break

        # Finally, search the raw HTML for any .mp3 URL
        if not download_url and (media_url_match := self.RAW_URL_PATTERN.search(html)):
            download_url = media_url_match.group(0)

        if download_url:
            file_name = download_url.split("/")[-1]
            return Extraction(download_url=download_url, title=title, file_format="audio/mp3", file_name=file_name)

        raise DownloadURLError()
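
# Usage sketch (illustrative only): a minimal, hedged example of invoking this
# extractor directly. It assumes TorahDownloadsExtractor is used as defined
# above and that Extraction exposes the download_url, title, and file_name
# fields set in extract(); the project's real CLI/entry point may differ.
if __name__ == "__main__":  # pragma: no cover
    extractor = TorahDownloadsExtractor()
    # URL taken from the "main_page" entry in EXAMPLES above
    result = extractor.extract("https://torahdownloads.com/shiur-23156.html")
    print(result.title)         # e.g. "Acharei Mos - ... - Rabbi Dovid Grossman - TD23156"
    print(result.download_url)  # e.g. "https://torahcdn.net/tdn/23156.mp3"
    print(result.file_name)     # e.g. "23156.mp3"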