Skip to content

naaleh

NaalehExtractor

Bases: Extractor

Extract audio content from Naaleh.com.

This extractor handles URLs from naaleh.com and uses the JWPlayer media key found on the page to build the audio download link, which it returns together with the recording's title.

Source code in src/torah_dl/core/extractors/naaleh.py
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
class NaalehExtractor(Extractor):
    """Extractor for audio recordings published on Naaleh.com.

    Given a naaleh.com page URL that carries a ``post_id`` query parameter,
    the extractor reads the JWPlayer media key and the post title from the
    page markup and assembles the site's file-downloader link from them.
    """

    name: str = "Naaleh"
    homepage: str = "https://naaleh.com"

    EXAMPLES = [  # noqa: RUF012
        ExtractionExample(
            name="main_page",
            url="https://www.naaleh.com/torah_library/?post_id=34538",
            download_url="https://www.naaleh.com/file_downloader/?file_url=https://cdn.jwplayer.com/videos/Md9qaTch.m4a&title=Unlocking%20the%20order%20of%20Seder%20Night.mp3",
            title="Unlocking the order of Seder Night",
            file_format="audio/mp3",
            valid=True,
        ),
        ExtractionExample(
            name="invalid_link",
            url="https://www.naaleh.com/torah_library/?post_id=00000",
            download_url="",
            title="",
            file_format="",
            valid=False,
        ),
    ]

    # Matches any http(s) URL on naaleh.com, with or without the "www." prefix.
    URL_PATTERN = re.compile(r"https?://(?:www\.)?naaleh\.com/")

    @property
    @override
    def url_patterns(self) -> list[Pattern[str]]:
        """Compiled regex patterns for the URLs this extractor accepts.

        Returns:
            A single-element list holding ``URL_PATTERN``.
        """
        return [self.URL_PATTERN]

    @override
    def extract(self, url: str) -> Extraction:
        """Resolve a Naaleh.com page URL to its audio download link and title.

        Args:
            url: Naaleh.com page URL; it must carry a ``post_id`` query parameter.

        Returns:
            An ``Extraction`` with the download URL, the title, the
            ``audio/mp3`` format and a ``<title>.mp3`` file name.

        Raises:
            NetworkError: If the page request fails or returns an error status.
            DownloadURLError: If ``post_id`` is absent from the URL, or no
                element on the page provides both a media key and a title
                for that post.
        """
        # The post id ties the URL to the matching element in the page markup;
        # a missing parameter falls back to "" and is rejected right away.
        post_id = parse_qs(urlparse(url).query).get("post_id", [""])[0]
        if not post_id:
            raise DownloadURLError()

        try:
            page = requests.get(url, timeout=30, headers={"User-Agent": "torah-dl/1.0"})
            page.raise_for_status()
        except requests.RequestException as err:
            raise NetworkError(str(err)) from err

        document = BeautifulSoup(page.content, "html.parser")

        # Look for the element that carries a JWPlayer media key for this post.
        candidate = document.find(attrs={"data-jwplayer-media-key": True, "data-post-id": post_id})
        if not (candidate and isinstance(candidate, Tag)):
            raise DownloadURLError()

        media_key = cast(str, candidate.get("data-jwplayer-media-key"))
        title = cast(str, candidate.get("data-post-title"))
        if not (media_key and title):
            raise DownloadURLError()

        # The site's downloader takes the JWPlayer CDN address of the audio
        # plus a percent-encoded file name to serve it under.
        file_name = f"{title}.mp3"
        jwplayer_url = f"https://cdn.jwplayer.com/videos/{media_key}.m4a"
        encoded_title = quote(file_name)
        download_url = f"https://www.naaleh.com/file_downloader/?file_url={jwplayer_url}&title={encoded_title}"

        return Extraction(
            download_url=download_url,
            title=title,
            file_format="audio/mp3",
            file_name=file_name,
        )

url_patterns property

url_patterns: list[Pattern[str]]

Return the URL pattern(s) that this extractor can handle.

Returns:

Type Description
list[Pattern[str]]

List of compiled regex patterns matching Naaleh.com URLs

extract

extract(url: str) -> Extraction

Extract download URL and title from a Naaleh.com page.

Parameters:

Name Type Description Default
url str

The Naaleh.com URL to extract from

required

Returns:

Name Type Description
Extraction Extraction

Object containing the download URL and title

Raises:

Type Description
NetworkError

If there are network-related issues

DownloadURLError

If the download URL cannot be found

Source code in src/torah_dl/core/extractors/naaleh.py
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
@override
def extract(self, url: str) -> Extraction:
    """Resolve a Naaleh.com page URL to its audio download link and title.

    Args:
        url: Naaleh.com page URL; it must carry a ``post_id`` query parameter.

    Returns:
        An ``Extraction`` with the download URL, the title, the
        ``audio/mp3`` format and a ``<title>.mp3`` file name.

    Raises:
        NetworkError: If the page request fails or returns an error status.
        DownloadURLError: If ``post_id`` is absent from the URL, or no
            element on the page provides both a media key and a title
            for that post.
    """
    # The post id ties the URL to the matching element in the page markup;
    # a missing parameter falls back to "" and is rejected right away.
    post_id = parse_qs(urlparse(url).query).get("post_id", [""])[0]
    if not post_id:
        raise DownloadURLError()

    try:
        page = requests.get(url, timeout=30, headers={"User-Agent": "torah-dl/1.0"})
        page.raise_for_status()
    except requests.RequestException as err:
        raise NetworkError(str(err)) from err

    document = BeautifulSoup(page.content, "html.parser")

    # Look for the element that carries a JWPlayer media key for this post.
    candidate = document.find(attrs={"data-jwplayer-media-key": True, "data-post-id": post_id})
    if not (candidate and isinstance(candidate, Tag)):
        raise DownloadURLError()

    media_key = cast(str, candidate.get("data-jwplayer-media-key"))
    title = cast(str, candidate.get("data-post-title"))
    if not (media_key and title):
        raise DownloadURLError()

    # The site's downloader takes the JWPlayer CDN address of the audio
    # plus a percent-encoded file name to serve it under.
    file_name = f"{title}.mp3"
    jwplayer_url = f"https://cdn.jwplayer.com/videos/{media_key}.m4a"
    encoded_title = quote(file_name)
    download_url = f"https://www.naaleh.com/file_downloader/?file_url={jwplayer_url}&title={encoded_title}"

    return Extraction(
        download_url=download_url,
        title=title,
        file_format="audio/mp3",
        file_name=file_name,
    )