Skip to content

torahdownloads

TorahDownloadsExtractor

Bases: Extractor

Extract audio content from TorahDownloads.com.

This extractor handles URLs from torahdownloads.com and extracts MP3 download links along with their associated titles from various locations in the page.

Source code in src/torah_dl/core/extractors/torahdownloads.py
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
class TorahDownloadsExtractor(Extractor):
    """Extract audio content from TorahDownloads.com.

    This extractor handles URLs from torahdownloads.com and extracts MP3 download
    links along with their associated titles from various locations in the page.
    """

    name: str = "TorahDownloads"
    homepage: str = "https://torahdownloads.com"

    EXAMPLES = [  # noqa: RUF012
        ExtractionExample(
            name="main_page",
            url="https://torahdownloads.com/shiur-23156.html",
            download_url="https://torahcdn.net/tdn/23156.mp3",
            title="Acharei Mos  - Maavir Sedra of Pesukim - Rabbi Dovid Grossman - TD23156",
            file_format="audio/mp3",
            valid=True,
        ),
        ExtractionExample(
            name="intro_to_prayer",
            url="https://torahdownloads.com/shiur-13655.html",
            download_url="https://torahcdn.net/tdn/13655.mp3",
            title="Intro To Prayer - Rabbi Mordechai Becher - TD13655",
            file_format="audio/mp3",
            valid=True,
        ),
        ExtractionExample(
            name="invalid_link",
            url="https://torahdownloads.com/shiur-00000.html",
            download_url="",
            title="",
            file_format="",
            valid=False,
        ),
    ]

    # URL pattern for TorahDownloads.com pages
    URL_PATTERN = re.compile(r"https?://(?:www\.)?torahdownloads\.com/")

    # Pattern to find download URL in script tags (e.g. `audioUrl: "….mp3"`)
    SCRIPT_URL_PATTERN = re.compile(r"(?:audioUrl|audio_url|url)\s*:\s*['\"]([^'\"]+\.mp3)['\"]", flags=re.IGNORECASE)
    # Last-resort pattern: any absolute http(s) URL ending in .mp3
    RAW_URL_PATTERN = re.compile(r"https?://[^\"'\s]+\.mp3")

    # Substrings that mark a text node as page chrome rather than a title
    # (used only by the last-resort scan in `_extract_title`).
    _NON_TITLE_MARKERS = ("Length:", "Details", "Source", "Speaker", "Category", "Language")  # noqa: RUF012

    @property
    def url_patterns(self) -> list[Pattern]:
        """Return the URL pattern(s) that this extractor can handle.

        Returns:
            list[Pattern]: List of compiled regex patterns matching TorahDownloads.com URLs
        """
        return [self.URL_PATTERN]

    def _extract_title(self, soup: BeautifulSoup) -> str | None:
        """Extract the title from the page using various selectors.

        Three strategies are tried in order; the first one that yields a
        value wins:

        1. The last non-empty text node preceding the "Length:" text that
           follows a ``<div>`` whose string is exactly "Details".
        2. The text of a ``<div class="nav-title">``.
        3. The first text node in the document longer than three characters
           that contains none of the known label words.

        Args:
            soup: BeautifulSoup object of the page

        Returns:
            str | None: The extracted title or None if not found
        """
        # Try finding the title in the Details section
        if details := soup.find("div", string="Details"):  # noqa: SIM102
            if length_text := details.find_next(string=lambda text: text and "Length:" in text):  # noqa: SIM102
                # Walk backwards from "Length:" to the nearest non-blank text
                # node that is not the "Details" heading itself.
                if title_node := length_text.find_previous(
                    string=lambda text: text and text.strip() and "Details" not in text
                ):
                    return title_node.strip()

        # Try finding the title in the breadcrumb/navigation area
        if nav_title := soup.find("div", class_="nav-title"):
            return nav_title.get_text().strip()

        # Try finding any standalone text that looks like a title
        for raw_text in soup.stripped_strings:
            candidate = raw_text.strip()
            # Skip short fragments and common non-title labels
            if len(candidate) > 3 and not any(marker in candidate for marker in self._NON_TITLE_MARKERS):
                return candidate

        return None

    def extract(self, url: str) -> Extraction:
        """Extract download URL and title from a TorahDownloads.com page.

        The download URL is looked up, in order, in: audio/anchor elements
        whose ``src``/``href`` contains ".mp3"; anchors whose ``href`` looks
        like a download link; inline ``<script>`` contents; and finally the
        decoded page text as a whole.

        Args:
            url: The TorahDownloads.com URL to extract from

        Returns:
            Extraction: Object containing the download URL, title, file
            format ("audio/mp3") and file name (last path segment of the
            download URL)

        Raises:
            NetworkError: If the HTTP request fails or returns an error status
            DownloadURLError: If no download URL can be found in the page
        """
        try:
            response = requests.get(url, timeout=30, headers={"User-Agent": "torah-dl/1.0"})
            response.raise_for_status()
        except requests.RequestException as e:
            raise NetworkError(str(e)) from e  # pragma: no cover

        # Parse the page content
        soup = BeautifulSoup(response.content, "html.parser")

        # Extract title first since we'll need it for all cases
        title = self._extract_title(soup)
        download_url = None

        # Try finding audio element first
        media_selector = 'audio source[src*=".mp3"], audio[src*=".mp3"], a[href*=".mp3"]'
        if audio_element := soup.select_one(media_selector):
            download_url = audio_element.get("src") or audio_element.get("href")

        # Try finding download link with various patterns
        if not download_url:
            download_selector = 'a[href*="/download/"], a[href*="getfile"], a[href*="audio"]'
            if download_link := soup.select_one(download_selector):
                download_url = download_link.get("href")

        # Try finding audio URL in script tags
        if not download_url:
            for script in soup.find_all("script"):
                content = script.string or ""
                if media_url_match := self.SCRIPT_URL_PATTERN.search(content):
                    download_url = media_url_match.group(1)
                    break

        # Try finding in the raw HTML for any mp3 URLs.
        # Search the *decoded* text: str() on a bytes body would give its
        # repr ("b'...'"), in which line breaks become literal backslash
        # escapes, so the pattern's `\s` boundary could no longer end a URL.
        if not download_url and (media_url_match := self.RAW_URL_PATTERN.search(response.text)):
            download_url = media_url_match.group(0)

        if download_url:
            file_name = download_url.split("/")[-1]
            return Extraction(download_url=download_url, title=title, file_format="audio/mp3", file_name=file_name)

        raise DownloadURLError()

url_patterns property

url_patterns: list[Pattern]

Return the URL pattern(s) that this extractor can handle.

Returns:

Type Description
list[Pattern]

List[Pattern]: List of compiled regex patterns matching TorahDownloads.com URLs

extract

extract(url: str) -> Extraction

Extract download URL and title from a TorahDownloads.com page.

Parameters:

Name Type Description Default
url str

The TorahDownloads.com URL to extract from

required

Returns:

Name Type Description
Extraction Extraction

Object containing the download URL and title

Raises:

Type Description
DownloadURLError

If no download URL can be found in the page

NetworkError

If the HTTP request fails or returns an error status

Source code in src/torah_dl/core/extractors/torahdownloads.py
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
def extract(self, url: str) -> Extraction:
    """Extract download URL and title from a TorahDownloads.com page.

    The download URL is looked up, in order, in: audio/anchor elements whose
    ``src``/``href`` contains ".mp3"; anchors whose ``href`` looks like a
    download link; inline ``<script>`` contents; and finally the decoded
    page text as a whole.

    Args:
        url: The TorahDownloads.com URL to extract from

    Returns:
        Extraction: Object containing the download URL, title, file format
        ("audio/mp3") and file name (last path segment of the download URL)

    Raises:
        NetworkError: If the HTTP request fails or returns an error status
        DownloadURLError: If no download URL can be found in the page
    """
    try:
        response = requests.get(url, timeout=30, headers={"User-Agent": "torah-dl/1.0"})
        response.raise_for_status()
    except requests.RequestException as e:
        raise NetworkError(str(e)) from e  # pragma: no cover

    # Parse the page content
    soup = BeautifulSoup(response.content, "html.parser")

    # Extract title first since we'll need it for all cases
    title = self._extract_title(soup)
    download_url = None

    # Try finding audio element first
    media_selector = 'audio source[src*=".mp3"], audio[src*=".mp3"], a[href*=".mp3"]'
    if audio_element := soup.select_one(media_selector):
        download_url = audio_element.get("src") or audio_element.get("href")

    # Try finding download link with various patterns
    if not download_url:
        download_selector = 'a[href*="/download/"], a[href*="getfile"], a[href*="audio"]'
        if download_link := soup.select_one(download_selector):
            download_url = download_link.get("href")

    # Try finding audio URL in script tags
    if not download_url:
        for script in soup.find_all("script"):
            content = script.string or ""
            if media_url_match := self.SCRIPT_URL_PATTERN.search(content):
                download_url = media_url_match.group(1)
                break

    # Try finding in the raw HTML for any mp3 URLs.
    # Search the *decoded* text: str() on a bytes body would give its repr
    # ("b'...'"), in which line breaks become literal backslash escapes, so
    # the pattern's `\s` boundary could no longer end a URL.
    if not download_url and (media_url_match := self.RAW_URL_PATTERN.search(response.text)):
        download_url = media_url_match.group(0)

    if download_url:
        file_name = download_url.split("/")[-1]
        return Extraction(download_url=download_url, title=title, file_format="audio/mp3", file_name=file_name)

    raise DownloadURLError()