✨️ implement lock files and console arguments

- Use a lock file that indicates whether tomorrow's newspaper has already been downloaded, to limit the number of times the taz.de website is queried for new editions
- Allow the program to be configured with command line arguments
2021-09-05 23:42:15 +02:00 · 2021-09-05 23:42:15 +02:00 · 6e8d9d9ef8
parent 6286798e6c
commit 6e8d9d9ef8
5 changed files with 262 additions and 93 deletions
--- a/.gitignore
+++ b/.gitignore
@ -1,6 +1,8 @@
 download_history.csv
 tazPlease.log
 config.yaml
 tmp/
 *.lock
 # Byte-compiled / optimized / DLL files
 __pycache__/
--- a/example_config.yaml
+++ b/example_config.yaml
@ -1,9 +1,20 @@
-taz:
+# Your taz id (gets read from .env file)
-  taz_id: ${TAZ_ID}
+id: ${TAZ_ID}
  taz_password: ${TAZ_PASSWORD}
  dowload_format: "pdf" # Valid formats are: pdf, epub, epubt, html, ascii, mobi, mobit
 # Your taz password (gets read from .env file)
 password: ${TAZ_PASSWORD}
 # In which format do you want to download your newspaper?
 # Valid formats are: pdf, epub, epubt, html, ascii, mobi, mobit
 download_format: "pdf"
 # Where should the downloaded files be stored?
 download_folder: "/path/to/download/folder"
-logging:
+# Use a lock file that indicates whether tomorrow's newspaper has already been downloaded to limit the number of times
-  log_level: "info"
+# the taz.de website is queried for new editions
 use_lock_file: True
 # Set the log level.
 # Valid formats are: notset, debug, info, warning, error, critical
 log_level: "info"
--- a/exceptions.py
+++ b/exceptions.py
@ -12,3 +12,12 @@ class TazDownloadError(Exception):
    def __inti__(self, format: str):
        self.format = format
 class TazConfigurationError(Exception):
    """
    Raised when a required setting is defined neither in the config.yaml nor as a command line argument.
    """

    def __init__(self, misconfiguration: str):
        """
        :param misconfiguration: name of the setting that is missing
        """
        # This constructor used to be spelled "__inti__" and therefore never ran: "misconfiguration" stayed
        # unset and __str__ failed with an AttributeError instead of producing the message below.
        super().__init__(misconfiguration)
        self.misconfiguration = misconfiguration

    def __str__(self):
        return f"\"{self.misconfiguration}\" must be defined either in the config.yaml or by passing it as an argument."
--- a/main.py
+++ b/main.py
@ -1,92 +1,140 @@
 import sys
 import os
-import datetime
+from datetime import datetime, timedelta
 import pytz
 import logging
 import shutil
 from envyaml import EnvYAML
 from models import TazDownloader
 import pandas as pd
 from models import TazDownloader, TazConfiguration
 from exceptions import TazConfigurationError, TazDownloadError, TazDownloadFormatException
 # Get directory
 dir_path = os.path.dirname(os.path.realpath(__file__)) + '/'
 # Set up logging
 logging.basicConfig(
    filename=dir_path + 'tazPlease.log',
    level=logging.ERROR,
    format='%(asctime)s - %(message)s'
 )
-# Load configuration
def main(config: dict):
    """
    Download every newspaper edition that is offered for download but not yet listed in download_history.csv.

    Steps: set the log level, optionally stop early if a lock file for tomorrow's edition exists, scrape the
    available editions, download the missing ones, create tomorrow's lock file, update download_history.csv and
    move the downloaded files from tmp/ to the download folder.

    The process exits with code 0 when tomorrow's lock file is found and with code 1 when the downloader cannot
    be created or the available editions cannot be scraped.

    :param config: the settings; 'id', 'password' and 'download_folder' are read unconditionally,
        'download_format', 'use_lock_file' and 'log_level' are optional
    """
    # Get german date for tomorrow
    tomorrow = (datetime.now(pytz.timezone('Europe/Berlin')) + timedelta(days=1)).strftime('%Y_%m_%d')
    lock_file = '.' + tomorrow + '.lock'
    # 'use_lock_file' is an optional setting, so it may be missing from the dict altogether
    use_lock_file = bool(config.get('use_lock_file', False))

    # Set log level. It is optional as well; without it the level chosen when logging was set up stays active.
    log_level = config.get('log_level')
    if log_level is not None:
        try:
            logging.getLogger().setLevel(log_level.upper())
        except ValueError as e:
            logging.error(f"Could not set log level.\n{e}", exc_info=True)

    # If 'use_lock_file' configuration is set, check if lockfile exists for tomorrow's newspaper
    if use_lock_file:
        try:
            # os.listdir() returns bare names, so they have to be joined with dir_path before testing them.
            # Testing the bare name would look in the current working directory instead.
            lock_files = [
                entry for entry in os.listdir(dir_path)
                if entry.endswith('.lock') and os.path.isfile(dir_path + entry)
            ]
            # Delete all lock files that do not refer to tomorrow's date
            already_downloaded = False
            for file in lock_files:
                if file.startswith('.' + tomorrow):
                    already_downloaded = True
                else:
                    os.remove(dir_path + file)
            # If there is a lock file for tomorrow, exit the program
            # (SystemExit is not an Exception subclass, so the handler below does not swallow it)
            if already_downloaded:
                logging.info('Tomorrow\'s newspaper was already downloaded. Execution canceled.')
                sys.exit(0)
        except Exception as e:
            logging.error(f"Could not check for lock files.\n{e}", exc_info=True)

    # Read download history from csv file
    try:
        df = pd.read_csv(dir_path + 'download_history.csv', header=0)
    except FileNotFoundError:
        # In case, there isn't yet a csv file, create data frame with headers
        df = pd.DataFrame(columns=['file', 'download_timestamp'])

    # Instantiate downloader object ("pdf" is also the default of TazDownloader itself)
    try:
        taz_dl = TazDownloader(config['id'], config['password'], config.get('download_format', 'pdf'))
    except TazDownloadFormatException as e:
        logging.error(e, exc_info=True)
        sys.exit(1)

    try:
        # Get newspapers available for download
        newspaper_available = taz_dl.scrape_newspaper()
    except TazDownloadError as e:
        logging.error(e, exc_info=True)
        sys.exit(1)
    # Remove outdated newspaper from download_history.csv
    df.drop(df.index[~df['file'].isin(newspaper_available)], inplace=True)
    # Find newspaper which are not already downloaded
    known_files = set(df['file'])
    newspaper_to_download = [n for n in newspaper_available if n not in known_files]

    # Download newspaper; a failing edition is logged and does not stop the remaining downloads
    newspaper_downloaded = []
    for n in newspaper_to_download:
        try:
            if taz_dl.download_newspaper(n):
                newspaper_downloaded.append(n)
        except Exception as e:
            logging.error(f"Could not download {n}\n{e}", exc_info=True)

    # Create lock file for tomorrow, once, if tomorrow's edition is among the downloads
    if use_lock_file and any(n.startswith('taz_' + tomorrow) for n in newspaper_downloaded):
        try:
            # Append mode creates the file when it is missing and does not fail when it exists already.
            # Unlike os.mknod() it is available on every platform and needs no special privileges.
            with open(dir_path + lock_file, 'a'):
                pass
        except OSError as e:
            logging.error(f"Could not create lock file \"{lock_file}\"\n{e}", exc_info=True)

    # Add downloaded newspaper to download_history.csv
    try:
        if newspaper_downloaded:
            df_new = pd.DataFrame(
                {
                    'file': newspaper_downloaded,
                    'download_timestamp': [
                        datetime.now().strftime('%Y-%m-%d %H:%M:%S') for _ in newspaper_downloaded
                    ],
                }
            )
            # DataFrame.append() no longer exists in pandas 2; concat() works in old and new versions
            df = pd.concat([df, df_new], ignore_index=True)
        df.sort_values(by='file', ascending=False, inplace=True)
        df.to_csv(dir_path + 'download_history.csv', index=False)
    except Exception as e:
        logging.error(f"Could not update download_history.csv\n{e}", exc_info=True)

    # Move downloaded file to download folder
    if os.path.isdir(config['download_folder']):
        download_folder = \
            config['download_folder'] if config['download_folder'].endswith('/') else config['download_folder'] + "/"
        for n in newspaper_downloaded:
            try:
                shutil.move(dir_path + 'tmp/' + n, download_folder)
            except Exception as e:
                logging.error(f"Could not move file to download folder \"{download_folder}\"\n{e}", exc_info=True)
    else:
        # Do not stay silent: otherwise the files remain in tmp/ without any hint why
        logging.error(f"Download folder \"{config['download_folder']}\" does not exist. Files were not moved.")
 if __name__ == '__main__':
    # Configure logging first: errors go to tazPlease.log next to this script
    logging.basicConfig(
        format='%(asctime)s - %(message)s',
        level=logging.ERROR,
        filename=dir_path + 'tazPlease.log',
    )
    # Load configuration. Every failure, a TazConfigurationError included, is printed and ends the run
    # with exit code 1.
    try:
        configuration = TazConfiguration().get_config()
    except Exception as error:
        print(error)
        sys.exit(1)
    # Execute main function (skipped when the configuration is empty)
    if configuration:
        main(configuration)
--- a/models.py
+++ b/models.py
@ -4,10 +4,111 @@ from requests.exceptions import HTTPError
 from exceptions import TazDownloadFormatException
 from exceptions import TazDownloadError
 from bs4 import BeautifulSoup
 from envyaml import EnvYAML
 import argparse
 dir_path = os.path.dirname(os.path.realpath(__file__)) + '/'
 class TazConfiguration:
    """
    This class represents the configuration that is needed to run the program.
    On initialization it tries to load the configuration from the config.yaml and from the arguments passed.
    An argument takes precedence over the same setting in the config.yaml.
    """

    # List of tuples that each define a single setting, which can be set either in the config.yaml or by
    # passing it as an argument.
    # tuple[0]: configuration name
    # tuple[1]: is it required?
    CONFIGURATIONS = [
        ('id', True),
        ('password', True),
        ('download_format', False),
        ('download_folder', True),
        ('use_lock_file', False),
        ('log_level', False),
    ]

    def __init__(self, args=None):
        """
        :param args: optional list of argument strings to parse instead of the command line of the process
        :raises TazConfigurationError: if a required setting is defined nowhere
        :raises Exception: if the config.yaml cannot be read
        """
        self._config = {}
        # Errors are passed on to the caller unchanged
        self._load_config(args)

    def _load_config(self, args=None):
        """
        Fill the configuration from the config.yaml and the arguments.

        :param args: optional list of argument strings, see __init__
        """
        # The module-level imports only bring in TazDownloadFormatException and TazDownloadError from
        # exceptions, so the error raised below is imported here to avoid a NameError.
        from exceptions import TazConfigurationError

        # Try to load config.yaml
        try:
            conf_yaml = EnvYAML(dir_path + 'config.yaml', dir_path + '.env')
        except Exception as e:
            # Keep the original error attached as the cause
            raise Exception(f"Something went wrong when reading config.yaml.\n{e}") from e

        # Get console arguments
        console_args = self._parse_arguments(args)

        # Set configurations by preferring console arguments over settings in config.yaml
        for conf, required in self.CONFIGURATIONS:
            console_value = getattr(console_args, conf, None)
            if console_value is not None:
                self._config[conf] = console_value
            elif conf_yaml.get(conf, None) is not None:
                self._config[conf] = conf_yaml[conf]
            elif required:
                raise TazConfigurationError(conf)

    def _parse_arguments(self, args=None):
        """
        Parse command line arguments.

        :param args: optional list of argument strings; None makes argparse read the command line of the process
        :return: the parsed namespace; every option that was not passed is None
        """
        argparser = argparse.ArgumentParser(
            description='Download taz e-paper'
        )
        argparser.add_argument(
            '-i',
            '--id',
            action='store',
            type=str,
            help='your taz id',
        )
        argparser.add_argument(
            '-p',
            '--password',
            action='store',
            type=str,
            help='your taz password',
        )
        argparser.add_argument(
            '-f',
            '--download-format',
            action='store',
            type=str,
            choices=['pdf', 'epub', 'epubt', 'html', 'ascii', 'mobi', 'mobit'],
            help='format in which the newspaper is downloaded',
        )
        argparser.add_argument(
            '-d',
            '--download_folder',
            action='store',
            type=str,
            help='folder in which the downloaded files are stored',
        )
        # default=None (instead of False) lets _load_config tell "not passed" apart from a value
        argparser.add_argument(
            '-l',
            '--use_lock_file',
            action='store_true',
            default=None,
            help='use a lock file to limit how often the website is queried',
        )
        argparser.add_argument(
            '--log_level',
            action='store',
            choices=['notset', 'debug', 'info', 'warning', 'error', 'critical'],
            help='log level',
        )
        return argparser.parse_args(args)

    def get_config(self) -> dict:
        """
        :return: the loaded settings, keyed by configuration name
        """
        return self._config
 class TazDownloader:
    download_formats = ["pdf", "epub", "epubt", "html", "ascii", "mobi", "mobit"]
    BASE_URL = "https://dl.taz.de/"
@ -15,30 +116,28 @@ class TazDownloader:
                             'Chrome/79.0.3945.130 Safari/537.36'}
    def __init__(self, taz_id: str, password: str, download_format: str = "pdf"):
        """
        :param taz_id: stored on the instance as taz_id
        :param password: stored on the instance as password
        :param download_format: has to be one of download_formats; appended to BASE_URL to form download_url
        :raises TazDownloadFormatException: if download_format is not one of download_formats
        """
        self.taz_id = taz_id
        self.password = password
        # Reject unknown formats first, then build the URL
        if download_format not in self.download_formats:
            raise TazDownloadFormatException(download_format)
        self.download_url = f"{self.BASE_URL}{download_format}"
    def scrape_newspaper(self) -> list:
        """
        Scrapes the newspaper available for download from https://dl.taz.de/

        :return: a list of file names (str)
        :raises TazDownloadError: if the page cannot be fetched or contains no list of editions
        """
        try:
            page = requests.get(self.download_url, headers=self.HEADERS)
            # requests.get() does not raise on 4xx/5xx responses by itself; without this call the
            # HTTPError case below could never occur
            page.raise_for_status()
        except requests.exceptions.RequestException as request_e:
            # RequestException is the base class of HTTPError and also covers connection errors
            raise TazDownloadError(f"Could not scrape available newspaper editions:\n{request_e}") from request_e

        soup = BeautifulSoup(page.content, 'html.parser')
        select = soup.find("select")
        if select is None:
            # find() returns None when the page has no <select>; report it as a download error rather
            # than failing with an AttributeError
            raise TazDownloadError("Could not scrape available newspaper editions:\nno <select> element found")
        # Options without a value attribute carry no file name and are skipped
        return [option['value'] for option in select.find_all("option") if option.has_attr('value')]
    def download_newspaper(self, taz: str, download_folder: str = dir_path + 'tmp/'):
        """
-        Downloads a newspaper from dl.taz.de and stores it in /tmp
+        Downloads a newspaper from dl.taz.de and stores it in tmp/
        """
        # Check if folder exists