73 lines
2.6 KiB
Python
73 lines
2.6 KiB
Python
import os
|
|
import requests
|
|
from requests.exceptions import HTTPError
|
|
from exceptions import TazDownloadFormatException
|
|
from exceptions import TazDownloadError
|
|
from bs4 import BeautifulSoup
|
|
|
|
dir_path = os.path.dirname(os.path.realpath(__file__)) + '/'
|
|
|
|
|
|
class TazDownloader:
|
|
download_formats = ["pdf", "epub", "epubt", "html", "ascii", "mobi", "mobit"]
|
|
BASE_URL = "https://dl.taz.de/"
|
|
HEADERS = {"User-agent": 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) '
|
|
'Chrome/79.0.3945.130 Safari/537.36'}
|
|
|
|
def __init__(self, taz_id: str, password: str, download_format: str = "pdf"):
|
|
"""
|
|
:param taz_id:
|
|
:param password:
|
|
:param download_format:
|
|
"""
|
|
self.taz_id = taz_id
|
|
self.password = password
|
|
if download_format in self.download_formats:
|
|
self.download_url = self.BASE_URL + download_format
|
|
else:
|
|
raise TazDownloadFormatException
|
|
|
|
def scrape_newspaper(self) -> list:
|
|
"""
|
|
Scrapes the newspaper available for download from https://dl.taz.de/
|
|
:return: a list of file names (str)
|
|
"""
|
|
page = requests.get(self.download_url, headers=self.HEADERS)
|
|
soup = BeautifulSoup(page.content, 'html.parser')
|
|
return [n['value'] for n in soup.find("select").find_all("option")]
|
|
|
|
def download_newspaper(self, taz: str, download_folder: str = dir_path + 'tmp/'):
|
|
"""
|
|
Downloads a newspaper from dl.taz.de and stores it in /tmp
|
|
"""
|
|
|
|
# Check if folder exists
|
|
try:
|
|
if not os.path.isdir(dir_path):
|
|
os.mkdirs(dir_path)
|
|
except Exception as e:
|
|
raise TazDownloadError(f"Could find or create \"{dir_path}\":\n{e}")
|
|
|
|
# download taz
|
|
try:
|
|
with requests.get(
|
|
self.download_url,
|
|
stream=True,
|
|
headers=self.HEADERS,
|
|
params={
|
|
'name': self.taz_id,
|
|
'password': self.password,
|
|
'id': taz,
|
|
'Laden': '+Laden+',
|
|
}
|
|
) as r:
|
|
# write response to file
|
|
with open(download_folder + taz, "wb") as f:
|
|
for chunk in r.iter_content(chunk_size=8192):
|
|
f.write(chunk)
|
|
return True
|
|
except HTTPError as http_e:
|
|
raise TazDownloadError(f"Could not download taz:\n{http_e}")
|
|
except Exception as e:
|
|
raise TazDownloadError(f"Something went wrong:\n{e}")
|