Merge branch 'dev' into main

This commit is contained in:
Marc Koch 2021-09-12 15:26:20 +02:00
commit 76c24041e0
2 changed files with 52 additions and 29 deletions

41
main.py
View File

@ -9,23 +9,25 @@ from models import TazDownloader, TazConfiguration
from exceptions import TazConfigurationError, TazDownloadError, TazDownloadFormatException from exceptions import TazConfigurationError, TazDownloadError, TazDownloadFormatException
# Get directory # Get directory
dir_path = os.path.dirname(os.path.realpath(__file__)) + '/' dir_path = os.path.dirname(os.path.realpath(__file__))
def main(config: dict): def main(config: dict):
# Get german date for tomorrow # Get german date for tomorrow
tomorrow = (datetime.now(pytz.timezone('Europe/Berlin')) + timedelta(1)).strftime('%Y_%m_%d') tomorrow = (datetime.now(pytz.timezone('Europe/Berlin')) + timedelta(1)).strftime('%Y_%m_%d')
# Define tmp/ folder
tmp_folder = os.path.join(dir_path, 'tmp')
# Set log level # Set log level
try: try:
logging.getLogger().setLevel(config['log_level'].upper()) logging.getLogger().setLevel(config['log_level'].upper())
except ValueError as e: except ValueError as e:
logging.error(f"Could not set log level.\n{e}", exc_info=True) logging.error(f"Could not set log level.\n {e}")
# Read download history from csv file # Read download history from csv file
try: try:
df = pd.read_csv(dir_path + 'download_history.csv', header=0) df = pd.read_csv(os.path.join(dir_path, 'download_history.csv'), header=0)
except FileNotFoundError: except FileNotFoundError:
# In case, there isn't yet a csv file, create data frame with headers # In case, there isn't yet a csv file, create data frame with headers
df = pd.DataFrame( df = pd.DataFrame(
@ -42,14 +44,13 @@ def main(config: dict):
logging.info('Tomorrow\'s newspaper was already downloaded. Execution canceled.') logging.info('Tomorrow\'s newspaper was already downloaded. Execution canceled.')
sys.exit(0) sys.exit(0)
except Exception as e: except Exception as e:
logging.error(f"Could not check whether tomorrow's newspaper has already been downloaded.\n{e}", logging.error(f"Could not check whether tomorrow's newspaper has already been downloaded.\n {e}")
exc_info=True)
# Instantiate downloader object # Instantiate downloader object
try: try:
taz_dl = TazDownloader(config['id'], config['password'], config['download_format']) taz_dl = TazDownloader(config['id'], config['password'], config['download_format'])
except TazDownloadFormatException as e: except TazDownloadFormatException as e:
logging.error(e, exc_info=True) logging.error(e)
sys.exit(1) sys.exit(1)
try: try:
@ -62,17 +63,17 @@ def main(config: dict):
# Find newspaper which are not already downloaded # Find newspaper which are not already downloaded
newspaper_to_download = [n for n in newspaper_available if n not in df.file.values] newspaper_to_download = [n for n in newspaper_available if n not in df.file.values]
except TazDownloadError as e: except TazDownloadError as e:
logging.error(e, exc_info=True) logging.error(e)
sys.exit(1) sys.exit(1)
# Download newspaper # Download newspaper
newspaper_downloaded = [] newspaper_downloaded = []
for n in newspaper_to_download: for n in newspaper_to_download:
try: try:
if taz_dl.download_newspaper(n): if taz_dl.download_newspaper(n, tmp_folder):
newspaper_downloaded.append(n) newspaper_downloaded.append(n)
except Exception as e: except Exception as e:
logging.error(f"Could not download {n}\n{e}", exc_info=True) logging.error(f"Could not download {n}\n {e}")
# Add downloaded newspaper to download_history.csv # Add downloaded newspaper to download_history.csv
try: try:
@ -85,26 +86,34 @@ def main(config: dict):
) )
df = df.append(df_tmp, ignore_index=True) df = df.append(df_tmp, ignore_index=True)
df.sort_values(by='file', ascending=False, inplace=True) df.sort_values(by='file', ascending=False, inplace=True)
df.to_csv(dir_path + 'download_history.csv', index=False) df.to_csv(os.path.join(dir_path, 'download_history.csv'), index=False)
except Exception as e: except Exception as e:
logging.error(f"Could not update download_history.csv\n{e}", exc_info=True) logging.error(f"Could not update download_history.csv\n {e}")
# Move downloaded file to download folder # Move downloaded file to download folder
newspaper_downloaded_string = "\n ".join(newspaper_downloaded)
if os.path.isdir(config['download_folder']): if os.path.isdir(config['download_folder']):
download_folder = \ download_folder = \
config['download_folder'] if config['download_folder'].endswith('/') else config['download_folder'] + "/" config['download_folder'] \
if config['download_folder'].endswith(os.path.sep) \
else config['download_folder'] + os.path.sep
for n in newspaper_downloaded: for n in newspaper_downloaded:
try: try:
shutil.move(dir_path + 'tmp/' + n, download_folder) shutil.move(os.path.join(tmp_folder, n), download_folder)
except Exception as e: except Exception as e:
logging.error(f"Could not move file to download folder \"{download_folder}\"\n{e}", exc_info=True) logging.error(f"Could not move {n} to download folder \"{download_folder}\"\n {e}")
if newspaper_downloaded:
logging.info(f"Downloaded\n {newspaper_downloaded_string}\n to {config['download_folder']}")
else:
logging.error(f"{config['download_folder']} does not exists.\n {newspaper_downloaded_string}"
f"\n downloaded to {tmp_folder}")
if __name__ == '__main__': if __name__ == '__main__':
# Set up logging # Set up logging
logging.basicConfig( logging.basicConfig(
filename=dir_path + 'tazPlease.log', filename=os.path.join(dir_path, 'tazPlease.log'),
level=logging.ERROR, level=logging.ERROR,
format='%(asctime)s - %(message)s' format='%(asctime)s - %(message)s'
) )

View File

@ -1,13 +1,13 @@
import os import os
import requests import requests
from requests.exceptions import HTTPError from requests.exceptions import HTTPError
from exceptions import TazDownloadFormatException from exceptions import TazDownloadFormatException, TazConfigurationError, TazDownloadError
from exceptions import TazDownloadError
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
from envyaml import EnvYAML from envyaml import EnvYAML
import argparse import argparse
import filetype
dir_path = os.path.dirname(os.path.realpath(__file__)) + '/' dir_path = os.path.dirname(os.path.realpath(__file__))
class TazConfiguration: class TazConfiguration:
@ -43,7 +43,7 @@ class TazConfiguration:
def _load_config(self): def _load_config(self):
# Try to load config.yaml # Try to load config.yaml
try: try:
conf_yaml = EnvYAML(dir_path + 'config.yaml', dir_path + '.env') conf_yaml = EnvYAML(os.path.join(dir_path, 'config.yaml'), os.path.join(dir_path, '.env'))
except Exception as e: except Exception as e:
raise Exception(f"Something went wrong when reading config.yaml.\n{e}") raise Exception(f"Something went wrong when reading config.yaml.\n{e}")
@ -141,17 +141,17 @@ class TazDownloader:
except HTTPError as http_e: except HTTPError as http_e:
raise TazDownloadError(f"Could not scrape available newspaper editions:\n{http_e}") raise TazDownloadError(f"Could not scrape available newspaper editions:\n{http_e}")
def download_newspaper(self, taz: str, download_folder: str = dir_path + 'tmp/'): def download_newspaper(self, taz: str, download_folder: str = os.path.join(dir_path, 'tmp')):
""" """
Downloads a newspaper from dl.taz.de and stores it in tmp/ Downloads a newspaper from dl.taz.de and stores it in tmp folder
""" """
# Check if folder exists # Check if folder exists
try: try:
if not os.path.isdir(dir_path): if not os.path.isdir(download_folder):
os.mkdirs(dir_path) os.makedirs(download_folder)
except Exception as e: except Exception as e:
raise TazDownloadError(f"Could find or create \"{dir_path}\":\n{e}") raise TazDownloadError(f"Could find or create \"{download_folder}\":\n{e}")
# download taz # download taz
try: try:
@ -167,11 +167,25 @@ class TazDownloader:
} }
) as r: ) as r:
# write response to file # write response to file
with open(download_folder + taz, "wb") as f: with open(os.path.join(download_folder, taz), "wb") as f:
for chunk in r.iter_content(chunk_size=8192): for chunk in r.iter_content(chunk_size=8192):
f.write(chunk) f.write(chunk)
# Unfortunately, the taz website does not respond with an http error code if the credentials are wrong.
# So we have to check if the response is a pdf file or the html page with an error message.
try:
if filetype.guess(os.path.join(download_folder, taz)).mime != 'application/pdf':
raise TazDownloadError()
except (AttributeError, TazDownloadError) as e:
# Try to get the error message from the html file to put it in the log
with open(os.path.join(download_folder, taz), 'r') as f:
soup = BeautifulSoup(f.read(), 'html.parser')
error_displayed_on_page = soup.find('p', class_='error').text
if error_displayed_on_page:
os.remove(os.path.join(download_folder, taz))
raise TazDownloadError(error_displayed_on_page)
else:
os.remove(os.path.join(download_folder, taz))
raise TazDownloadError(e)
return True return True
except HTTPError as http_e: except HTTPError as http_e:
raise TazDownloadError(f"Could not download taz:\n{http_e}") raise TazDownloadError(http_e)
except Exception as e:
raise TazDownloadError(f"Something went wrong:\n{e}")