208 lines
7.5 KiB
Python
208 lines
7.5 KiB
Python
import argparse
|
|
import os
|
|
|
|
import filetype
|
|
import requests
|
|
from bs4 import BeautifulSoup
|
|
from envyaml import EnvYAML
|
|
from requests.exceptions import HTTPError
|
|
|
|
from exceptions import TazDownloadFormatException, TazConfigurationError, TazDownloadError
|
|
|
|
# Directory that contains this file (symlinks resolved); used as the base for files located next to the script.
dir_path = os.path.split(os.path.realpath(__file__))[0]
|
|
|
|
|
|
class TazConfiguration:
    """
    This class represents the configuration that is needed to run the program.

    On initialization it tries to load the configuration from the config.yaml and from the command line
    arguments. A value passed as command line argument takes precedence over the same setting in config.yaml.
    """

    # List of tuples that each define a single configuration that can be set either in the config.yaml or by
    # passing it as an argument.
    # Tuple element 0: configuration name
    # Tuple element 1: is it required?
    CONFIGURATIONS = [
        ('id', True),
        ('password', True),
        ('download_format', False),
        ('download_folder', False),
        ('nextcloud_webdav_url', False),
        ('nextcloud_webdav_password', False),
        ('limit_requests', False),
        ('log_level', False),
    ]

    def __init__(self):
        """
        Load the configuration.

        Errors raised while loading are deliberately not handled here, so the caller sees why the
        configuration could not be loaded.
        """
        self._config = {}
        self._load_config()

    def _load_config(self):
        """
        Read config.yaml (with the .env file next to it), parse the command line and merge both.

        :raises Exception: if config.yaml could not be read
        :raises TazConfigurationError: if a required configuration is set in neither source
        """
        # Try to load config.yaml
        try:
            conf_yaml = EnvYAML(os.path.join(dir_path, 'config.yaml'), os.path.join(dir_path, '.env'))
        except Exception as e:
            # Chain the original exception so its traceback is not lost
            raise Exception(f"Something went wrong when reading config.yaml.\n{e}") from e

        # Get console arguments and merge them with the settings from config.yaml
        self._merge_config(self._parse_arguments(), conf_yaml)

    def _merge_config(self, console_args, conf_yaml):
        """
        Fill the configuration by preferring console arguments over settings in config.yaml.

        :param console_args: the parsed command line arguments (attribute access by configuration name)
        :param conf_yaml: the loaded config.yaml (must support .get(name, default))
        :raises TazConfigurationError: if a required configuration is set in neither source
        """
        for conf, required in self.CONFIGURATIONS:
            console_value = getattr(console_args, conf, None)
            if console_value is not None:
                self._config[conf] = console_value
                continue

            yaml_value = conf_yaml.get(conf, None)
            if yaml_value is not None:
                self._config[conf] = yaml_value
            elif required:
                raise TazConfigurationError(conf)

    def _parse_arguments(self, argv=None):
        """
        Parse command line arguments.

        :param argv: list of argument strings to parse; if None, the arguments of the running process are used
        :return: the parsed arguments; every argument that was not passed is None
        """
        argparser = argparse.ArgumentParser(
            description='Download taz e-paper',
        )
        argparser.add_argument(
            '-i',
            '--id',
            action='store',
            type=str,
            help='Your taz-ID',
        )
        argparser.add_argument(
            '-p',
            '--password',
            action='store',
            type=str,
            help='Your password',
        )
        argparser.add_argument(
            '-f',
            '--download-format',
            action='store',
            type=str,
            choices=['pdf', 'epub', 'epubt', 'html', 'ascii', 'mobi', 'mobit'],
            help='The e-paper format',
        )
        argparser.add_argument(
            '-d',
            '--download_folder',
            action='store',
            type=str,
            help='The path to a folder where the e-paper should be stored',
        )
        argparser.add_argument(
            '--nextcloud_webdav_url',
            action='store',
            type=str,
            help='The url of a Nextcloud webdav',
        )
        argparser.add_argument(
            '--nextcloud_webdav_password',
            action='store',
            type=str,
            help='The webdav password',
        )
        argparser.add_argument(
            '-l',
            '--limit-requests',
            action='store_true',
            # None instead of False, so an omitted flag does not override the setting in config.yaml
            default=None,
            help='Only query website for available newspaper if tomorrow\'s newspaper has not already been downloaded',
        )
        argparser.add_argument(
            '--log_level',
            action='store',
            choices=['notset', 'debug', 'info', 'warning', 'error', 'critical'],
            help='Set the log level',
        )
        return argparser.parse_args(argv)

    def get_config(self) -> dict:
        """
        :return: the loaded configuration as a dict that maps configuration names to their values
        """
        return self._config
|
|
|
|
|
|
class TazDownloader:
    """
    Scrapes the newspaper editions available on https://dl.taz.de/ and downloads them.
    """

    # Formats that can be appended to BASE_URL to get the download page of that format
    download_formats = ["pdf", "epub", "epubt", "html", "ascii", "mobi", "mobit"]
    BASE_URL = "https://dl.taz.de/"
    HEADERS = {"User-agent": 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) '
                             'Chrome/79.0.3945.130 Safari/537.36'}

    def __init__(self, taz_id: str, password: str, download_format: str = "pdf"):
        """
        :param taz_id: the taz-ID used to log in
        :param password: the password that belongs to the taz-ID
        :param download_format: one of download_formats
        :raises TazDownloadFormatException: if download_format is not one of download_formats
        """
        self.taz_id = taz_id
        self.password = password
        if download_format not in self.download_formats:
            raise TazDownloadFormatException(download_format)
        self.download_url = self.BASE_URL + download_format

    def scrape_newspaper(self) -> list:
        """
        Scrapes the newspaper available for download from https://dl.taz.de/

        :return: a list of file names (str)
        :raises TazDownloadError: if the page responds with an http error code or does not contain the
            list of editions
        """
        try:
            page = requests.get(self.download_url, headers=self.HEADERS)
            # requests only raises an HTTPError for 4xx/5xx responses if it is asked to
            page.raise_for_status()
        except HTTPError as http_e:
            raise TazDownloadError(f"Could not scrape available newspaper editions:\n{http_e}") from http_e

        soup = BeautifulSoup(page.content, 'html.parser')
        select = soup.find("select")
        if select is None:
            raise TazDownloadError(
                "Could not scrape available newspaper editions:\nNo <select> element found on the download page."
            )
        # Options without a value attribute can not be downloaded, so they are skipped
        return [option['value'] for option in select.find_all("option") if option.has_attr('value')]

    def download_newspaper(self, taz: str, download_folder: str = os.path.join(dir_path, 'tmp')):
        """
        Downloads a newspaper from dl.taz.de and stores it in the download folder (tmp folder by default)

        :param taz: the file name of the edition, as returned by scrape_newspaper()
        :param download_folder: the folder the file is stored in; it is created if it does not exist
        :return: True if the newspaper was downloaded
        :raises TazDownloadError: if the folder can not be created, the server responds with an http error
            code or the downloaded file is not a pdf file
        """

        # Make sure the folder exists
        try:
            os.makedirs(download_folder, exist_ok=True)
        except Exception as e:
            raise TazDownloadError(f"Could not find or create \"{download_folder}\":\n{e}") from e

        target_path = os.path.join(download_folder, taz)

        # download taz
        try:
            with requests.get(
                    self.download_url,
                    stream=True,
                    headers=self.HEADERS,
                    params={
                        'username': self.taz_id,
                        'password': self.password,
                        'id': taz,
                        'Laden': '+Laden+',
                    }
            ) as r:
                # Fail before anything is written to disk if the server responds with an http error code
                r.raise_for_status()
                # write response to file
                with open(target_path, "wb") as f:
                    for chunk in r.iter_content(chunk_size=8192):
                        f.write(chunk)
        except HTTPError as http_e:
            raise TazDownloadError(http_e) from http_e

        # Unfortunately, the taz website does not respond with a http error code if the credentials are wrong.
        # So we have to check if the response is a pdf file or the html page with an error message.
        # NOTE(review): only pdf passes this check, so downloads in the other formats listed in
        # download_formats appear to be rejected here - confirm whether that is intended.
        kind = filetype.guess(target_path)
        if kind is None or kind.mime != 'application/pdf':
            # Try to get the error message from the html file to put it in the log
            try:
                error_displayed_on_page = self._read_error_message(target_path)
            finally:
                # Never keep the invalid file, even if reading the error message failed
                os.remove(target_path)
            if error_displayed_on_page:
                raise TazDownloadError(error_displayed_on_page)
            raise TazDownloadError(f"The file downloaded for \"{taz}\" is not a pdf file.")
        return True

    @staticmethod
    def _read_error_message(path: str) -> str:
        """
        Returns the text of the first <p class="error"> element of the file, or '' if there is none.
        """
        # Read bytes and let BeautifulSoup decode them: the file may be binary, so decoding it as text could fail
        with open(path, 'rb') as f:
            soup = BeautifulSoup(f.read(), 'html.parser')
        error_paragraph = soup.find('p', class_='error')
        return error_paragraph.text if error_paragraph is not None else ''
|