Implement lock files and console arguments

- Use a lock file that indicates whether tomorrow's newspaper has already been downloaded to limit the number of times the taz.de website is queried for new editions
- use the program with command line arguments
This commit is contained in:
Marc Koch 2021-09-05 23:42:15 +02:00
parent 6286798e6c
commit 6e8d9d9ef8
5 changed files with 262 additions and 93 deletions

2
.gitignore vendored
View File

@ -1,6 +1,8 @@
download_history.csv download_history.csv
tazPlease.log tazPlease.log
config.yaml config.yaml
tmp/
*.lock
# Byte-compiled / optimized / DLL files # Byte-compiled / optimized / DLL files
__pycache__/ __pycache__/

View File

@ -1,9 +1,20 @@
taz: # Your taz id (gets read from .env file)
taz_id: ${TAZ_ID} id: ${TAZ_ID}
taz_password: ${TAZ_PASSWORD}
dowload_format: "pdf" # Valid formats are: pdf, epub, epubt, html, ascii, mobi, mobit
# Your taz password (gets read from .env file)
password: ${TAZ_PASSWORD}
# In which format do you want to download your newspaper?
# Valid formats are: pdf, epub, epubt, html, ascii, mobi, mobit
download_format: "pdf"
# Where should the downloaded files be stored?
download_folder: "/path/to/download/folder" download_folder: "/path/to/download/folder"
logging: # Use a lock file that indicates whether tomorrow's newspaper has already been downloaded to limit the number of times
log_level: "info" # the taz.de website is queried for new editions
use_lock_file: True
# Set the log level.
# Valid formats are: notset, debug, info, warning, error, critical
log_level: "info"

View File

@ -12,3 +12,12 @@ class TazDownloadError(Exception):
def __inti__(self, format: str): def __inti__(self, format: str):
self.format = format self.format = format
class TazConfigurationError(Exception):
    """
    Raised when a required configuration value was neither set in the
    config.yaml nor passed as a command line argument.

    :param misconfiguration: name of the setting that is missing
    """

    def __init__(self, misconfiguration: str):
        # The constructor used to be misspelled "__inti__", so it never ran and
        # __str__ failed with an AttributeError instead of printing the message.
        super().__init__(misconfiguration)
        self.misconfiguration = misconfiguration

    def __str__(self):
        return f"\"{self.misconfiguration}\" must be defined either in the config.yaml or by passing it as an argument."

202
main.py
View File

@ -1,92 +1,140 @@
import sys import sys
import os import os
import datetime from datetime import datetime, timedelta
import pytz
import logging import logging
import shutil import shutil
from envyaml import EnvYAML
from models import TazDownloader
import pandas as pd import pandas as pd
from models import TazDownloader, TazConfiguration
from exceptions import TazConfigurationError, TazDownloadError, TazDownloadFormatException
# Get directory
dir_path = os.path.dirname(os.path.realpath(__file__)) + '/' dir_path = os.path.dirname(os.path.realpath(__file__)) + '/'
# Set up logging
logging.basicConfig(
filename=dir_path + 'tazPlease.log',
level=logging.ERROR,
format='%(asctime)s - %(message)s'
)
def main(config: dict):
    """
    Download all newspaper editions that are available but not yet downloaded
    and move them to the configured download folder.

    :param config: configuration dictionary; 'id', 'password' and 'download_folder' are read unconditionally,
                   'download_format', 'use_lock_file' and 'log_level' are optional
    """
    # Get german date for tomorrow
    tomorrow = (datetime.now(pytz.timezone('Europe/Berlin')) + timedelta(1)).strftime('%Y_%m_%d')
    lock_file = '.' + tomorrow + '.lock'

    # Set log level (optional setting: keep the level set by basicConfig if it is missing)
    log_level = config.get('log_level')
    if log_level is not None:
        try:
            logging.getLogger().setLevel(log_level.upper())
        except ValueError as e:
            logging.error(f"Could not set log level.\n{e}", exc_info=True)

    # If 'use_lock_file' configuration is set, check if lockfile exists for tomorrow's newspaper
    use_lock_file = bool(config.get('use_lock_file'))
    if use_lock_file:
        already_downloaded = False
        try:
            # os.listdir() returns bare names, so they have to be joined with dir_path before testing them;
            # otherwise the check silently depends on the current working directory.
            lock_files = [
                entry for entry in os.listdir(dir_path)
                if entry.endswith('.lock') and os.path.isfile(dir_path + entry)
            ]

            # Delete all lock files that do not refer to tomorrow's date
            for file in lock_files:
                if not file.startswith('.' + tomorrow):
                    os.remove(dir_path + file)

            already_downloaded = any(file.startswith('.' + tomorrow) for file in lock_files)
        except Exception as e:
            logging.error(f"Could not check for lock files.\n{e}", exc_info=True)

        # If there is a lock file for tomorrow, exit the program
        if already_downloaded:
            logging.info('Tomorrow\'s newspaper was already downloaded. Execution canceled.')
            sys.exit(0)

    # Read download history from csv file
    try:
        df = pd.read_csv(dir_path + 'download_history.csv', header=0)
    except FileNotFoundError:
        # In case, there isn't yet a csv file, create data frame with headers
        df = pd.DataFrame(
            columns=[
                'file',
                'download_timestamp',
            ]
        )

    # Instantiate downloader object ("pdf" is also the default format of TazDownloader)
    try:
        taz_dl = TazDownloader(config['id'], config['password'], config.get('download_format', 'pdf'))
    except TazDownloadFormatException as e:
        logging.error(e, exc_info=True)
        sys.exit(1)

    try:
        # Get newspapers available for download
        newspaper_available = taz_dl.scrape_newspaper()

        # Remove outdated newspaper from download_history.csv
        df.drop(df.index[~df['file'].isin(newspaper_available)], inplace=True)

        # Find newspaper which are not already downloaded
        already_in_history = set(df['file'])
        newspaper_to_download = [n for n in newspaper_available if n not in already_in_history]
    except TazDownloadError as e:
        logging.error(e, exc_info=True)
        sys.exit(1)

    # Download newspaper
    newspaper_downloaded = []
    for n in newspaper_to_download:
        try:
            if taz_dl.download_newspaper(n):
                newspaper_downloaded.append(n)
        except Exception as e:
            logging.error(f"Could not download {n}\n{e}", exc_info=True)

    # Create lock file for tomorrow
    if use_lock_file and any(n.startswith('taz_' + tomorrow) for n in newspaper_downloaded):
        try:
            # open(..., 'a') instead of os.mknod(): mknod is Unix-only, may require elevated privileges
            # and fails if the file already exists.
            with open(dir_path + lock_file, 'a'):
                pass
        except Exception as e:
            logging.error(f"Could not create lock file \"{lock_file}\"\n{e}", exc_info=True)

    # Add downloaded newspaper to download_history.csv
    try:
        if newspaper_downloaded:
            # DataFrame.append() was removed in pandas 2.0, so the new rows are added with pd.concat()
            df_new = pd.DataFrame(
                {
                    'file': newspaper_downloaded,
                    'download_timestamp': [
                        datetime.strftime(datetime.now(), '%Y-%m-%d %H:%M:%S') for _ in newspaper_downloaded
                    ],
                }
            )
            df = pd.concat([df, df_new], ignore_index=True)
        df.sort_values(by='file', ascending=False, inplace=True)
        df.to_csv(dir_path + 'download_history.csv', index=False)
    except Exception as e:
        logging.error(f"Could not update download_history.csv\n{e}", exc_info=True)

    # Move downloaded file to download folder
    if os.path.isdir(config['download_folder']):
        download_folder = \
            config['download_folder'] if config['download_folder'].endswith('/') else config['download_folder'] + "/"
        for n in newspaper_downloaded:
            try:
                shutil.move(dir_path + 'tmp/' + n, download_folder)
            except Exception as e:
                logging.error(f"Could not move file to download folder \"{download_folder}\"\n{e}", exc_info=True)
    else:
        # Do not fail silently: the downloaded files stay in tmp/ in this case
        logging.error(f"Download folder \"{config['download_folder']}\" does not exist.")
if __name__ == '__main__':
    # Set up logging: write to tazPlease.log next to this script, errors only by default
    logging.basicConfig(
        level=logging.ERROR,
        format='%(asctime)s - %(message)s',
        filename=dir_path + 'tazPlease.log',
    )

    # Load configuration. TazConfigurationError is a subclass of Exception and was
    # handled exactly like every other error, so a single handler covers both cases.
    try:
        configuration = TazConfiguration().get_config()
    except Exception as error:
        print(error)
        sys.exit(1)

    # Execute main function
    if configuration:
        main(configuration)

119
models.py
View File

@ -4,10 +4,111 @@ from requests.exceptions import HTTPError
from exceptions import TazDownloadFormatException from exceptions import TazDownloadFormatException
from exceptions import TazDownloadError from exceptions import TazDownloadError
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
from envyaml import EnvYAML
import argparse
dir_path = os.path.dirname(os.path.realpath(__file__)) + '/' dir_path = os.path.dirname(os.path.realpath(__file__)) + '/'
class TazConfiguration:
    """
    This class represents the configuration that is needed to run the program.
    On initialization it tries to load the configuration from the config.yaml and from the arguments passed.
    Command line arguments take precedence over settings in the config.yaml.
    """

    # List of tuples that each defines a single configuration that can be set either in the config.yaml or by
    # passing it as an argument.
    #   CONFIGURATIONS[n][0]: configuration name
    #   CONFIGURATIONS[n][1]: is it required?
    CONFIGURATIONS = [
        ('id', True),
        ('password', True),
        ('download_format', False),
        ('download_folder', True),
        ('use_lock_file', False),
        ('log_level', False),
    ]

    def __init__(self):
        """
        :raises TazConfigurationError: if a required configuration is missing
        :raises Exception: if the config.yaml could not be read
        """
        self._config = {}
        # Errors are deliberately not caught here; the caller decides how to report them.
        self._load_config()

    def _load_config(self):
        """
        Fill self._config from the config.yaml and the command line arguments.
        Optional configurations that are not set anywhere are left out of the dictionary.
        """
        # NOTE(review): imported locally because the module imports visible in review only pull the download
        # exceptions from exceptions.py; hoist to the top of the file if it is already imported there.
        from exceptions import TazConfigurationError

        # Try to load config.yaml
        try:
            conf_yaml = EnvYAML(dir_path + 'config.yaml', dir_path + '.env')
        except Exception as e:
            # Chain the original error so the traceback shows the actual cause
            raise Exception(f"Something went wrong when reading config.yaml.\n{e}") from e

        # Get console arguments
        console_args = self._parse_arguments()

        # Set configurations by preferring console arguments over settings in config.yaml
        for conf, required in self.CONFIGURATIONS:
            console_value = getattr(console_args, conf, None)
            yaml_value = conf_yaml.get(conf, None)
            if console_value is not None:
                self._config[conf] = console_value
            elif yaml_value is not None:
                self._config[conf] = yaml_value
            elif required:
                raise TazConfigurationError(conf)

    def _parse_arguments(self):
        """
        Parse command line arguments.

        :return: the argparse namespace; arguments that were not passed are None
        """
        argparser = argparse.ArgumentParser(
            description='Download taz e-paper'
        )
        argparser.add_argument(
            '-i',
            '--id',
            action='store',
            type=str,
            help='your taz id',
        )
        argparser.add_argument(
            '-p',
            '--password',
            action='store',
            type=str,
            help='your taz password',
        )
        argparser.add_argument(
            '-f',
            '--download-format',
            # Also accept the underscore spelling used by all other options
            '--download_format',
            dest='download_format',
            action='store',
            type=str,
            choices=['pdf', 'epub', 'epubt', 'html', 'ascii', 'mobi', 'mobit'],
            help='format in which the newspaper is downloaded',
        )
        argparser.add_argument(
            '-d',
            '--download_folder',
            action='store',
            type=str,
            help='folder in which the downloaded files are stored',
        )
        argparser.add_argument(
            '-l',
            '--use_lock_file',
            action='store_true',
            # None instead of False, so that an omitted flag does not override the config.yaml
            default=None,
            help='use a lock file to limit the number of queries to the taz.de website',
        )
        argparser.add_argument(
            '--log_level',
            action='store',
            choices=['notset', 'debug', 'info', 'warning', 'error', 'critical'],
            help='log level',
        )
        return argparser.parse_args()

    def get_config(self) -> dict:
        """
        :return: the loaded configuration as a dictionary
        """
        return self._config
class TazDownloader: class TazDownloader:
download_formats = ["pdf", "epub", "epubt", "html", "ascii", "mobi", "mobit"] download_formats = ["pdf", "epub", "epubt", "html", "ascii", "mobi", "mobit"]
BASE_URL = "https://dl.taz.de/" BASE_URL = "https://dl.taz.de/"
@ -15,30 +116,28 @@ class TazDownloader:
'Chrome/79.0.3945.130 Safari/537.36'} 'Chrome/79.0.3945.130 Safari/537.36'}
def __init__(self, taz_id: str, password: str, download_format: str = "pdf"): def __init__(self, taz_id: str, password: str, download_format: str = "pdf"):
"""
:param taz_id:
:param password:
:param download_format:
"""
self.taz_id = taz_id self.taz_id = taz_id
self.password = password self.password = password
if download_format in self.download_formats: if download_format in self.download_formats:
self.download_url = self.BASE_URL + download_format self.download_url = self.BASE_URL + download_format
else: else:
raise TazDownloadFormatException raise TazDownloadFormatException(download_format)
def scrape_newspaper(self) -> list: def scrape_newspaper(self) -> list:
""" """
Scrapes the newspaper available for download from https://dl.taz.de/ Scrapes the newspaper available for download from https://dl.taz.de/
:return: a list of file names (str) :return: a list of file names (str)
""" """
page = requests.get(self.download_url, headers=self.HEADERS) try:
soup = BeautifulSoup(page.content, 'html.parser') page = requests.get(self.download_url, headers=self.HEADERS)
return [n['value'] for n in soup.find("select").find_all("option")] soup = BeautifulSoup(page.content, 'html.parser')
return [n['value'] for n in soup.find("select").find_all("option")]
except HTTPError as http_e:
raise TazDownloadError(f"Could not scrape available newspaper editions:\n{http_e}")
def download_newspaper(self, taz: str, download_folder: str = dir_path + 'tmp/'): def download_newspaper(self, taz: str, download_folder: str = dir_path + 'tmp/'):
""" """
Downloads a newspaper from dl.taz.de and stores it in /tmp Downloads a newspaper from dl.taz.de and stores it in tmp/
""" """
# Check if folder exists # Check if folder exists