Implement lock files and console arguments

- Use a lock file that indicates whether tomorrow's newspaper has already been downloaded to limit the number of times the taz.de website is queried for new editions
- Allow every setting to be passed as a command line argument; arguments take precedence over the values in config.yaml
This commit is contained in:
Marc Koch 2021-09-05 23:42:15 +02:00
parent 6286798e6c
commit 6e8d9d9ef8
5 changed files with 262 additions and 93 deletions

2
.gitignore vendored
View File

@ -1,6 +1,8 @@
download_history.csv
tazPlease.log
config.yaml
tmp/
*.lock
# Byte-compiled / optimized / DLL files
__pycache__/

View File

@ -1,9 +1,20 @@
taz:
taz_id: ${TAZ_ID}
taz_password: ${TAZ_PASSWORD}
dowload_format: "pdf" # Valid formats are: pdf, epub, epubt, html, ascii, mobi, mobit
# Your taz id (gets read from .env file)
id: ${TAZ_ID}
# Your taz password (gets read from .env file)
password: ${TAZ_PASSWORD}
# In which format do you want to download your newspaper?
# Valid formats are: pdf, epub, epubt, html, ascii, mobi, mobit
download_format: "pdf"
# Where should the downloaded files be stored?
download_folder: "/path/to/download/folder"
logging:
log_level: "info"
# Use a lock file that indicates whether tomorrow's newspaper has already been downloaded to limit the number of times
# the taz.de website is queried for new editions
use_lock_file: True
# Set the log level.
# Valid formats are: notset, debug, info, warning, error, critical
log_level: "info"

View File

@ -12,3 +12,12 @@ class TazDownloadError(Exception):
def __inti__(self, format: str):
self.format = format
class TazConfigurationError(Exception):
    """
    Raised when a required setting is defined neither in the config.yaml nor
    as a command line argument.

    :param misconfiguration: name of the setting that is missing
    """

    def __init__(self, misconfiguration: str):
        # The constructor has to be spelled __init__; otherwise it is never
        # called and __str__ fails because the attribute below is missing.
        super().__init__(misconfiguration)
        self.misconfiguration = misconfiguration

    def __str__(self):
        return f"\"{self.misconfiguration}\" must be defined either in the config.yaml or by passing it as an argument."

124
main.py
View File

@ -1,38 +1,48 @@
import sys
import os
import datetime
from datetime import datetime, timedelta
import pytz
import logging
import shutil
from envyaml import EnvYAML
from models import TazDownloader
import pandas as pd
from models import TazDownloader, TazConfiguration
from exceptions import TazConfigurationError, TazDownloadError, TazDownloadFormatException
# Get directory
dir_path = os.path.dirname(os.path.realpath(__file__)) + '/'
# Set up logging
logging.basicConfig(
filename=dir_path + 'tazPlease.log',
level=logging.ERROR,
format='%(asctime)s - %(message)s'
)
# Load configuration
try:
config = EnvYAML(dir_path + 'config.yaml', dir_path + '.env')
except Exception:
logging.error('Could not load config.yaml', exc_info=True)
sys.exit(1)
def main(config: dict):
# Set log level
try:
logging.getLogger().setLevel(config['logging']['log_level'].upper())
except ValueError as e:
logging.error(f"Could not set log level. \n{e}", exc_info=True)
# Get german date for tomorrow
tomorrow = (datetime.now(pytz.timezone('Europe/Berlin')) + timedelta(1)).strftime('%Y_%m_%d')
# Read download history from csv file
try:
# Set log level
try:
logging.getLogger().setLevel(config['log_level'].upper())
except ValueError as e:
logging.error(f"Could not set log level.\n{e}", exc_info=True)
# If 'use_lock_file' configuration is set, check if lockfile exists for tomorrow's newspaper
if config['use_lock_file']:
try:
lock_files = [entry for entry in os.listdir(dir_path) if os.path.isfile(entry) and entry.endswith('.lock')]
# Delete all lock files that do not refer to tomorrow's date
for file in lock_files:
if not file.startswith('.' + tomorrow):
os.remove(dir_path + file)
# If there is a lock file for tomorrow, exit the program
for file in lock_files:
if file.startswith('.' + tomorrow):
logging.info('Tomorrow\'s newspaper was already downloaded. Execution canceled.')
sys.exit(0)
except Exception as e:
logging.error(f"Could not check for lock files.\n{e}", exc_info=True)
# Read download history from csv file
try:
df = pd.read_csv(dir_path + 'download_history.csv', header=0)
except FileNotFoundError:
except FileNotFoundError:
# In case, there isn't yet a csv file, create data frame with headers
df = pd.DataFrame(
columns=[
@ -41,10 +51,14 @@ except FileNotFoundError:
]
)
# Instantiate downloader object
taz_dl = TazDownloader(config['taz']['taz_id'], config['taz']['taz_password'])
# Instantiate downloader object
try:
taz_dl = TazDownloader(config['id'], config['password'], config['download_format'])
except TazDownloadFormatException as e:
logging.error(e, exc_info=True)
sys.exit(1)
try:
try:
# Get newspapers available for download
newspaper_available = taz_dl.scrape_newspaper()
@ -53,36 +67,46 @@ try:
# Find newspaper which are not already downloaded
newspaper_to_download = [n for n in newspaper_available if n not in df.file.values]
except Exception as e:
logging.error(f"Could get available newspaper from website\n{e}", exc_info=True)
except TazDownloadError as e:
logging.error(e, exc_info=True)
sys.exit(1)
# Download newspaper
newspaper_downloaded = []
for n in newspaper_to_download:
# Download newspaper
newspaper_downloaded = []
for n in newspaper_to_download:
try:
if taz_dl.download_newspaper(n):
newspaper_downloaded.append(n)
except Exception as e:
logging.error(f"Could not download {n}\n{e}", exc_info=True)
# Add downloaded newspaper to download_history.csv
try:
# Create lock file for tomorrow
if config['use_lock_file']:
try:
lock_file = '.' + tomorrow + '.lock'
for n in newspaper_downloaded:
if n.startswith('taz_' + tomorrow):
os.mknod(dir_path + lock_file)
except Exception as e:
logging.error(f"Could not download create lock file \"{lock_file}\"\n{e}", exc_info=True)
# Add downloaded newspaper to download_history.csv
try:
for n in newspaper_downloaded:
df_tmp = pd.DataFrame(
{
'file': [n],
'download_timestamp': [datetime.datetime.strftime(datetime.datetime.now(), '%Y-%m-%d %H:%M:%S')],
'download_timestamp': [datetime.strftime(datetime.now(), '%Y-%m-%d %H:%M:%S')],
}
)
df = df.append(df_tmp, ignore_index=True)
df.sort_values(by='download_timestamp', ascending=False, inplace=True)
df.sort_values(by='file', ascending=False, inplace=True)
df.to_csv(dir_path + 'download_history.csv', index=False)
except Exception as e:
except Exception as e:
logging.error(f"Could not update download_history.csv\n{e}", exc_info=True)
# Move downloaded file to download folder
if os.path.isdir(config['download_folder']):
# Move downloaded file to download folder
if os.path.isdir(config['download_folder']):
download_folder = \
config['download_folder'] if config['download_folder'].endswith('/') else config['download_folder'] + "/"
for n in newspaper_downloaded:
@ -90,3 +114,27 @@ if os.path.isdir(config['download_folder']):
shutil.move(dir_path + 'tmp/' + n, download_folder)
except Exception as e:
logging.error(f"Could not move file to download folder \"{download_folder}\"\n{e}", exc_info=True)
if __name__ == '__main__':
    # Write log records to tazPlease.log next to this script, starting at
    # ERROR level
    log_settings = {
        'filename': dir_path + 'tazPlease.log',
        'level': logging.ERROR,
        'format': '%(asctime)s - %(message)s',
    }
    logging.basicConfig(**log_settings)

    # Build the configuration; any failure (TazConfigurationError is an
    # Exception as well) is printed and ends the program with exit code 1
    try:
        configuration = TazConfiguration().get_config()
    except Exception as load_error:
        print(load_error)
        sys.exit(1)

    # Only run when a non-empty configuration was produced
    if configuration:
        main(configuration)

113
models.py
View File

@ -4,10 +4,111 @@ from requests.exceptions import HTTPError
from exceptions import TazDownloadFormatException
from exceptions import TazDownloadError
from bs4 import BeautifulSoup
from envyaml import EnvYAML
import argparse
dir_path = os.path.dirname(os.path.realpath(__file__)) + '/'
class TazConfiguration:
    """
    This class represents the configuration that is needed to run the program.

    On initialization it tries to load the configuration from the config.yaml
    (with variables taken from the .env file) and from the command line
    arguments. A value passed as an argument takes precedence over the value
    in the config.yaml.
    """

    # List of tuples that each define a single configuration that can be set
    # either in the config.yaml or by passing it as an argument.
    # tuple[0]: configuration name
    # tuple[1]: is it required?
    CONFIGURATIONS = [
        ('id', True),
        ('password', True),
        ('download_format', False),
        ('download_folder', True),
        ('use_lock_file', False),
        ('log_level', False),
    ]

    # Fallback values for optional configurations that are set nowhere, so
    # that the resulting dict always contains every optional key.
    DEFAULTS = {
        'download_format': 'pdf',
        'use_lock_file': False,
        'log_level': 'error',
    }

    def __init__(self):
        """
        Load the configuration.

        :raises TazConfigurationError: if a required configuration is missing
        :raises Exception: if the config.yaml cannot be read
        """
        self._config = {}
        self._load_config()

    def _load_config(self):
        """
        Read the config.yaml, parse the command line and merge both sources
        into ``self._config``.
        """
        # Try to load config.yaml
        try:
            conf_yaml = EnvYAML(dir_path + 'config.yaml', dir_path + '.env')
        except Exception as e:
            # Chain the original error so its traceback is not lost
            raise Exception(f"Something went wrong when reading config.yaml.\n{e}") from e

        # Get console arguments
        console_args = self._parse_arguments()

        self._config = self._merge_sources(console_args, conf_yaml)

    @classmethod
    def _merge_sources(cls, console_args, conf_yaml) -> dict:
        """
        Merge both configuration sources, preferring console arguments.

        :param console_args: object exposing the parsed arguments as attributes
        :param conf_yaml: mapping-like object offering ``get(key, default)``
        :return: dict with one entry per configuration that could be resolved
        :raises TazConfigurationError: if a required configuration is missing
        """
        merged = {}
        for conf, required in cls.CONFIGURATIONS:
            cli_value = getattr(console_args, conf, None)
            if cli_value is not None:
                merged[conf] = cli_value
                continue

            yaml_value = conf_yaml.get(conf, None)
            if yaml_value is not None:
                merged[conf] = yaml_value
            elif required:
                # Local import: the visible module header only imports the
                # download exceptions from the exceptions module.
                # NOTE(review): move to the module header if preferred.
                from exceptions import TazConfigurationError
                raise TazConfigurationError(conf)
            elif conf in cls.DEFAULTS:
                merged[conf] = cls.DEFAULTS[conf]
        return merged

    def _parse_arguments(self):
        """
        Parse command line arguments.

        :return: the namespace produced by argparse; options that were not
            passed are ``None``
        """
        argparser = argparse.ArgumentParser(
            description='Download taz e-paper'
        )
        argparser.add_argument(
            '-i',
            '--id',
            action='store',
            type=str,
            help='your taz id',
        )
        argparser.add_argument(
            '-p',
            '--password',
            action='store',
            type=str,
            help='your taz password',
        )
        # The underscore spelling is accepted as well to match the other
        # long options; the hyphenated spelling keeps working.
        argparser.add_argument(
            '-f',
            '--download-format',
            '--download_format',
            dest='download_format',
            action='store',
            type=str,
            choices=['pdf', 'epub', 'epubt', 'html', 'ascii', 'mobi', 'mobit'],
            help='format in which the newspaper is downloaded',
        )
        argparser.add_argument(
            '-d',
            '--download_folder',
            action='store',
            type=str,
            help='folder in which the downloaded files are stored',
        )
        # default=None (instead of False) lets a value from the config.yaml
        # apply when the flag is not passed
        argparser.add_argument(
            '-l',
            '--use_lock_file',
            action='store_true',
            default=None,
            help='use a lock file to limit the number of queries for new editions',
        )
        argparser.add_argument(
            '--log_level',
            action='store',
            choices=['notset', 'debug', 'info', 'warning', 'error', 'critical'],
            help='log level',
        )
        return argparser.parse_args()

    def get_config(self) -> dict:
        """
        :return: the merged configuration as a dict keyed by configuration name
        """
        return self._config
class TazDownloader:
download_formats = ["pdf", "epub", "epubt", "html", "ascii", "mobi", "mobit"]
BASE_URL = "https://dl.taz.de/"
@ -15,30 +116,28 @@ class TazDownloader:
'Chrome/79.0.3945.130 Safari/537.36'}
def __init__(self, taz_id: str, password: str, download_format: str = "pdf"):
"""
:param taz_id:
:param password:
:param download_format:
"""
self.taz_id = taz_id
self.password = password
if download_format in self.download_formats:
self.download_url = self.BASE_URL + download_format
else:
raise TazDownloadFormatException
raise TazDownloadFormatException(download_format)
def scrape_newspaper(self) -> list:
    """
    Scrapes the newspaper available for download from ``self.download_url``.

    :return: a list of file names (str), taken from the ``value`` attribute
        of every ``<option>`` inside the first ``<select>`` of the page
    :raises TazDownloadError: if the request fails, the server answers with
        an HTTP error status or the page contains no ``<select>`` element
    """
    try:
        page = requests.get(self.download_url, headers=self.HEADERS)
        # requests does not raise for 4xx/5xx responses on its own
        page.raise_for_status()
    except requests.exceptions.RequestException as request_e:
        # RequestException also covers HTTPError, connection errors and timeouts
        raise TazDownloadError(f"Could not scrape available newspaper editions:\n{request_e}") from request_e

    soup = BeautifulSoup(page.content, 'html.parser')
    selection = soup.find("select")
    if selection is None:
        # find() returns None when the element is absent
        raise TazDownloadError("Could not scrape available newspaper editions:\nno <select> element found")
    return [n['value'] for n in selection.find_all("option")]
def download_newspaper(self, taz: str, download_folder: str = dir_path + 'tmp/'):
"""
Downloads a newspaper from dl.taz.de and stores it in /tmp
Downloads a newspaper from dl.taz.de and stores it in tmp/
"""
# Check if folder exists