diff --git a/.gitignore b/.gitignore
index 555d79b..5412ad2 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,6 +1,8 @@
 download_history.csv
 tazPlease.log
 config.yaml
+tmp/
+*.lock
 
 # Byte-compiled / optimized / DLL files
 __pycache__/
diff --git a/example_config.yaml b/example_config.yaml
index 1eadcee..e5ac1e2 100644
--- a/example_config.yaml
+++ b/example_config.yaml
@@ -1,9 +1,20 @@
-taz:
-  taz_id: ${TAZ_ID}
-  taz_password: ${TAZ_PASSWORD}
-  dowload_format: "pdf" # Valid formats are: pdf, epub, epubt, html, ascii, mobi, mobit
+# Your taz id (gets read from .env file)
+id: ${TAZ_ID}
 
+# Your taz password (gets read from .env file)
+password: ${TAZ_PASSWORD}
+
+# In which format do you want to download your newspaper?
+# Valid formats are: pdf, epub, epubt, html, ascii, mobi, mobit
+download_format: "pdf"
+
+# Where should the downloaded files be stored?
 download_folder: "/path/to/download/folder"
 
-logging:
-  log_level: "info"
+# Use a lock file that indicates whether tomorrow's newspaper has already been downloaded to limit the number of times
+# the taz.de website is queried for new editions
+use_lock_file: True
+
+# Set the log level.
+# Valid levels are: notset, debug, info, warning, error, critical
+log_level: "info"
diff --git a/exceptions.py b/exceptions.py
index 007d3da..e145d1a 100644
--- a/exceptions.py
+++ b/exceptions.py
@@ -12,3 +12,15 @@ class TazDownloadError(Exception):
 
     def __inti__(self, format: str):
         self.format = format
+
+
+class TazConfigurationError(Exception):
+    """Raised when a required configuration is defined neither in the config.yaml nor as an argument."""
+
+    def __init__(self, misconfiguration: str):
+        # Must be spelled __init__: a misspelled initializer is never called, which leaves
+        # self.misconfiguration unset and makes __str__ fail with an AttributeError.
+        super().__init__(misconfiguration)
+        self.misconfiguration = misconfiguration
+
+    def __str__(self):
+        return f"\"{self.misconfiguration}\" must be defined either in the config.yaml or by passing it as an argument."
diff --git a/main.py b/main.py
index 4d4703f..79b82bc 100644
--- a/main.py
+++ b/main.py
@@ -1,92 +1,141 @@
 import sys
 import os
-import datetime
+from datetime import datetime, timedelta
+import pytz
 import logging
 import shutil
-from envyaml import EnvYAML
-from models import TazDownloader
 import pandas as pd
+from models import TazDownloader, TazConfiguration
+from exceptions import TazConfigurationError, TazDownloadError, TazDownloadFormatException
 
+# Get directory
 dir_path = os.path.dirname(os.path.realpath(__file__)) + '/'
 
-# Set up logging
-logging.basicConfig(
-    filename=dir_path + 'tazPlease.log',
-    level=logging.ERROR,
-    format='%(asctime)s - %(message)s'
-)
 
-# Load configuration
-try:
-    config = EnvYAML(dir_path + 'config.yaml', dir_path + '.env')
-except Exception:
-    logging.error('Could not load config.yaml', exc_info=True)
-    sys.exit(1)
+def main(config: dict):
+    """Download all taz editions that are not in the download history yet and move them to the download folder."""
 
-# Set log level
-try:
-    logging.getLogger().setLevel(config['logging']['log_level'].upper())
-except ValueError as e:
-    logging.error(f"Could not set log level. \n{e}", exc_info=True)
+    # Get German date for tomorrow
+    tomorrow = (datetime.now(pytz.timezone('Europe/Berlin')) + timedelta(1)).strftime('%Y_%m_%d')
 
-# Read download history from csv file
-try:
-    df = pd.read_csv(dir_path + 'download_history.csv', header=0)
-except FileNotFoundError:
-    # In case, there isn't yet a csv file, create data frame with headers
-    df = pd.DataFrame(
-        columns=[
-            'file',
-            'download_timestamp',
-        ]
+    # Set log level ('log_level' is optional; without it the ERROR level set by basicConfig is kept)
+    try:
+        logging.getLogger().setLevel(config.get('log_level', 'error').upper())
+    except ValueError as e:
+        logging.error(f"Could not set log level.\n{e}", exc_info=True)
+
+    # If the optional 'use_lock_file' configuration is set, check if lockfile exists for tomorrow's newspaper
+    if config.get('use_lock_file'):
+        try:
+            lock_files = [entry for entry in os.listdir(dir_path) if os.path.isfile(dir_path + entry) and entry.endswith('.lock')]
+            # Delete all lock files that do not refer to tomorrow's date
+            for file in lock_files:
+                if not file.startswith('.' + tomorrow):
+                    os.remove(dir_path + file)
+            # If there is a lock file for tomorrow, exit the program
+            for file in lock_files:
+                if file.startswith('.' + tomorrow):
+                    logging.info('Tomorrow\'s newspaper was already downloaded. Execution canceled.')
+                    sys.exit(0)
+        except Exception as e:
+            logging.error(f"Could not check for lock files.\n{e}", exc_info=True)
+
+    # Read download history from csv file
+    try:
+        df = pd.read_csv(dir_path + 'download_history.csv', header=0)
+    except FileNotFoundError:
+        # In case, there isn't yet a csv file, create data frame with headers
+        df = pd.DataFrame(
+            columns=[
+                'file',
+                'download_timestamp',
+            ]
+        )
+
+    # Instantiate downloader object ('download_format' is optional and defaults to pdf)
+    try:
+        taz_dl = TazDownloader(config['id'], config['password'], config.get('download_format', 'pdf'))
+    except TazDownloadFormatException as e:
+        logging.error(e, exc_info=True)
+        sys.exit(1)
+
+    try:
+        # Get newspapers available for download
+        newspaper_available = taz_dl.scrape_newspaper()
+
+        # Remove outdated newspaper from download_history.csv
+        df.drop([index for index, row in df.iterrows() if row.file not in newspaper_available], inplace=True)
+
+        # Find newspaper which are not already downloaded
+        newspaper_to_download = [n for n in newspaper_available if n not in df.file.values]
+    except TazDownloadError as e:
+        logging.error(e, exc_info=True)
+        sys.exit(1)
+
+    # Download newspaper
+    newspaper_downloaded = []
+    for n in newspaper_to_download:
+        try:
+            if taz_dl.download_newspaper(n):
+                newspaper_downloaded.append(n)
+        except Exception as e:
+            logging.error(f"Could not download {n}\n{e}", exc_info=True)
+
+    # Create lock file for tomorrow
+    if config.get('use_lock_file'):
+        lock_file = '.' + tomorrow + '.lock'
+        try:
+            if any(n.startswith('taz_' + tomorrow) for n in newspaper_downloaded):
+                # open() instead of os.mknod(): portable and no error if the lock file already exists
+                open(dir_path + lock_file, 'a').close()
+        except Exception as e:
+            logging.error(f"Could not create lock file \"{lock_file}\"\n{e}", exc_info=True)
+
+    # Add downloaded newspaper to download_history.csv
+    try:
+        for n in newspaper_downloaded:
+            df_tmp = pd.DataFrame(
+                {
+                    'file': [n],
+                    'download_timestamp': [datetime.strftime(datetime.now(), '%Y-%m-%d %H:%M:%S')],
+                }
+            )
+            df = pd.concat([df, df_tmp], ignore_index=True)
+        df.sort_values(by='file', ascending=False, inplace=True)
+        df.to_csv(dir_path + 'download_history.csv', index=False)
+    except Exception as e:
+        logging.error(f"Could not update download_history.csv\n{e}", exc_info=True)
+
+    # Move downloaded file to download folder
+    if os.path.isdir(config['download_folder']):
+        download_folder = \
+            config['download_folder'] if config['download_folder'].endswith('/') else config['download_folder'] + "/"
+        for n in newspaper_downloaded:
+            try:
+                shutil.move(dir_path + 'tmp/' + n, download_folder)
+            except Exception as e:
+                logging.error(f"Could not move file to download folder \"{download_folder}\"\n{e}", exc_info=True)
+
+
+if __name__ == '__main__':
+
+    # Set up logging
+    logging.basicConfig(
+        filename=dir_path + 'tazPlease.log',
+        level=logging.ERROR,
+        format='%(asctime)s - %(message)s'
     )
 
-# Instantiate downloader object
-taz_dl = TazDownloader(config['taz']['taz_id'], config['taz']['taz_password'])
-
-try:
-    # Get newspapers available for download
-    newspaper_available = taz_dl.scrape_newspaper()
-
-    # Remove outdated newspaper from download_history.csv
-    df.drop([index for index, row in df.iterrows() if row.file not in newspaper_available], inplace=True)
-
-    # Find newspaper which are not already downloaded
-    newspaper_to_download = [n for n in newspaper_available if n not in df.file.values]
-except Exception as e:
-    logging.error(f"Could get available newspaper from website\n{e}", exc_info=True)
-    sys.exit(1)
-
-# Download newspaper
-newspaper_downloaded = []
-for n in newspaper_to_download:
+    # Load configuration
     try:
-        if taz_dl.download_newspaper(n):
-            newspaper_downloaded.append(n)
-    except Exception as e:
-        logging.error(f"Could not download {n}\n{e}", exc_info=True)
+        configuration = TazConfiguration().get_config()
+    except TazConfigurationError as tce:
+        print(tce)
+        sys.exit(1)
+    except Exception as exception:
+        print(exception)
+        sys.exit(1)
 
-# Add downloaded newspaper to download_history.csv
-try:
-    for n in newspaper_downloaded:
-        df_tmp = pd.DataFrame(
-            {
-                'file': [n],
-                'download_timestamp': [datetime.datetime.strftime(datetime.datetime.now(), '%Y-%m-%d %H:%M:%S')],
-            }
-        )
-        df = df.append(df_tmp, ignore_index=True)
-    df.sort_values(by='download_timestamp', ascending=False, inplace=True)
-    df.to_csv(dir_path + 'download_history.csv', index=False)
-except Exception as e:
-    logging.error(f"Could not update download_history.csv\n{e}", exc_info=True)
-
-# Move downloaded file to download folder
-if os.path.isdir(config['download_folder']):
-    download_folder = \
-        config['download_folder'] if config['download_folder'].endswith('/') else config['download_folder'] + "/"
-    for n in newspaper_downloaded:
-        try:
-            shutil.move(dir_path + 'tmp/' + n, download_folder)
-        except Exception as e:
-            logging.error(f"Could not move file to download folder \"{download_folder}\"\n{e}", exc_info=True)
+    # Execute main function
+    if configuration:
+        main(configuration)
diff --git a/models.py b/models.py
index f978ccd..3396363 100644
--- a/models.py
+++ b/models.py
@@ -4,10 +4,111 @@ from requests.exceptions import HTTPError
 from exceptions import TazDownloadFormatException
 from exceptions import TazDownloadError
+from exceptions import TazConfigurationError
 from bs4 import BeautifulSoup
+from envyaml import EnvYAML
+import argparse
 
 dir_path = os.path.dirname(os.path.realpath(__file__)) + '/'
 
 
+class TazConfiguration:
+    """
+    This class represents the configuration that is needed to run the program.
+    On initialization it tries to load the configuration from either the config.yaml or from the arguments passed.
+    """
+
+    # List of tuples that each defines a single configuration that can be set either in the config.yaml or by passing
+    # it as an argument.
+    # CONFIGURATIONS[0]: configuration name
+    # CONFIGURATIONS[1]: is it required?
+    CONFIGURATIONS = [
+        ('id', True),
+        ('password', True),
+        ('download_format', False),
+        ('download_folder', True),
+        ('use_lock_file', False),
+        ('log_level', False),
+    ]
+
+    def __init__(self):
+        self._config = {}
+
+        # Load the configuration; errors propagate to the caller unchanged
+        self._load_config()
+
+    def _load_config(self):
+        """
+        Fill self._config from the console arguments and the config.yaml, preferring console arguments.
+        Raises TazConfigurationError if a required configuration is set in neither of them.
+        """
+        # Try to load config.yaml
+        try:
+            conf_yaml = EnvYAML(dir_path + 'config.yaml', dir_path + '.env')
+        except Exception as e:
+            raise Exception(f"Something went wrong when reading config.yaml.\n{e}") from e
+
+        # Get console arguments
+        console_args = self._parse_arguments()
+
+        # Set configurations by preferring console arguments over settings in config.yaml
+        for conf, required in self.CONFIGURATIONS:
+            if conf in console_args and getattr(console_args, conf) is not None:
+                self._config[conf] = getattr(console_args, conf)
+            elif conf_yaml.get(conf, None) is not None:
+                self._config[conf] = conf_yaml[conf]
+            elif required:
+                raise TazConfigurationError(conf)
+
+    def _parse_arguments(self):
+        """
+        Parse command line arguments and return the resulting argparse namespace.
+        """
+        argparser = argparse.ArgumentParser(
+            description='Download taz e-paper'
+        )
+        argparser.add_argument(
+            '-i',
+            '--id',
+            action='store',
+            type=str,
+        )
+        argparser.add_argument(
+            '-p',
+            '--password',
+            action='store',
+            type=str,
+        )
+        argparser.add_argument(
+            '-f',
+            '--download-format',
+            action='store',
+            type=str,
+            choices=['pdf', 'epub', 'epubt', 'html', 'ascii', 'mobi', 'mobit'],
+        )
+        argparser.add_argument(
+            '-d',
+            '--download_folder',
+            action='store',
+            type=str,
+        )
+        argparser.add_argument(
+            '-l',
+            '--use_lock_file',
+            action='store_true',
+            default=None
+        )
+        argparser.add_argument(
+            '--log_level',
+            action='store',
+            choices=['notset', 'debug', 'info', 'warning', 'error', 'critical'],
+        )
+        return argparser.parse_args()
+
+    def get_config(self) -> dict:
+        """Return the loaded configuration as a dictionary."""
+        return self._config
+
+
 class TazDownloader:
     download_formats = ["pdf", "epub", "epubt", "html", "ascii", "mobi", "mobit"]
     BASE_URL = "https://dl.taz.de/"
@@ -15,30 +116,32 @@ class TazDownloader:
                'Chrome/79.0.3945.130 Safari/537.36'}
 
     def __init__(self, taz_id: str, password: str, download_format: str = "pdf"):
-        """
-        :param taz_id:
-        :param password:
-        :param download_format:
-        """
         self.taz_id = taz_id
         self.password = password
         if download_format in self.download_formats:
             self.download_url = self.BASE_URL + download_format
         else:
-            raise TazDownloadFormatException
+            raise TazDownloadFormatException(download_format)
 
     def scrape_newspaper(self) -> list:
         """
         Scrapes the newspaper available for download from https://dl.taz.de/
         :return: a list of file names (str)
+        :raises TazDownloadError: if the page cannot be requested or does not contain the expected markup
         """
-        page = requests.get(self.download_url, headers=self.HEADERS)
-        soup = BeautifulSoup(page.content, 'html.parser')
-        return [n['value'] for n in soup.find("select").find_all("option")]
+        try:
+            page = requests.get(self.download_url, headers=self.HEADERS)
+            # requests only raises HTTPError on request; without this call 4xx/5xx responses pass silently
+            page.raise_for_status()
+            soup = BeautifulSoup(page.content, 'html.parser')
+            return [n['value'] for n in soup.find("select").find_all("option")]
+        except (requests.exceptions.RequestException, AttributeError, KeyError) as scrape_e:
+            # AttributeError/KeyError: the page did not contain the expected <select>/<option value=...> markup
+            raise TazDownloadError(f"Could not scrape available newspaper editions:\n{scrape_e}") from scrape_e
 
     def download_newspaper(self, taz: str, download_folder: str = dir_path + 'tmp/'):
         """
-        Downloads a newspaper from dl.taz.de and stores it in /tmp
+        Downloads a newspaper from dl.taz.de and stores it in tmp/
         """
 
         # Check if folder exists