✨️ implement lock files and console arguments
- Use a lock file that indicates whether tomorrow's newspaper has already been downloaded to limit the number of times the taz.de website is queried for new editions - use the program with command line arguments
This commit is contained in:
parent
6286798e6c
commit
6e8d9d9ef8
|
|
@ -1,6 +1,8 @@
|
|||
download_history.csv
|
||||
tazPlease.log
|
||||
config.yaml
|
||||
tmp/
|
||||
*.lock
|
||||
|
||||
# Byte-compiled / optimized / DLL files
|
||||
__pycache__/
|
||||
|
|
|
|||
|
|
@ -1,9 +1,20 @@
|
|||
taz:
|
||||
taz_id: ${TAZ_ID}
|
||||
taz_password: ${TAZ_PASSWORD}
|
||||
dowload_format: "pdf" # Valid formats are: pdf, epub, epubt, html, ascii, mobi, mobit
|
||||
# Your taz id (gets read from .env file)
|
||||
id: ${TAZ_ID}
|
||||
|
||||
# Your taz password (gets read from .env file)
|
||||
password: ${TAZ_PASSWORD}
|
||||
|
||||
# In which format do you want to download your newspaper?
|
||||
# Valid formats are: pdf, epub, epubt, html, ascii, mobi, mobit
|
||||
download_format: "pdf"
|
||||
|
||||
# Where should the downloaded files be stored?
|
||||
download_folder: "/path/to/download/folder"
|
||||
|
||||
logging:
|
||||
log_level: "info"
|
||||
# Use a lock file that indicates whether tomorrow's newspaper has already been downloaded to limit the number of times
|
||||
# the taz.de website is queried for new editions
|
||||
use_lock_file: True
|
||||
|
||||
# Set the log level.
|
||||
# Valid levels are: notset, debug, info, warning, error, critical
|
||||
log_level: "info"
|
||||
|
|
|
|||
|
|
@ -12,3 +12,12 @@ class TazDownloadError(Exception):
|
|||
|
||||
def __init__(self, format: str):
    """
    Remember the value the exception was raised with.

    The method was previously spelled "__inti__", so Python never called it and
    the "format" attribute was never set on the instance.

    :param format: value passed by the code that raises the exception
    """
    self.format = format
|
||||
|
||||
|
||||
class TazConfigurationError(Exception):
    """
    Raised when a required configuration is defined neither in the config.yaml
    nor as a console argument.
    """

    def __init__(self, misconfiguration: str):
        """
        :param misconfiguration: name of the configuration that is missing
        """
        # Was spelled "__inti__" before, which left self.misconfiguration unset
        # and made __str__ fail with an AttributeError.
        super().__init__(misconfiguration)
        self.misconfiguration = misconfiguration

    def __str__(self):
        return f"\"{self.misconfiguration}\" must be defined either in the config.yaml or by passing it as an argument."
|
||||
|
|
|
|||
202
main.py
202
main.py
|
|
@ -1,92 +1,140 @@
|
|||
import sys
|
||||
import os
|
||||
import datetime
|
||||
from datetime import datetime, timedelta
|
||||
import pytz
|
||||
import logging
|
||||
import shutil
|
||||
from envyaml import EnvYAML
|
||||
from models import TazDownloader
|
||||
import pandas as pd
|
||||
from models import TazDownloader, TazConfiguration
|
||||
from exceptions import TazConfigurationError, TazDownloadError, TazDownloadFormatException
|
||||
|
||||
# Get directory
|
||||
dir_path = os.path.dirname(os.path.realpath(__file__)) + '/'
|
||||
|
||||
# Set up logging
|
||||
logging.basicConfig(
|
||||
filename=dir_path + 'tazPlease.log',
|
||||
level=logging.ERROR,
|
||||
format='%(asctime)s - %(message)s'
|
||||
)
|
||||
|
||||
# Load configuration
|
||||
try:
|
||||
config = EnvYAML(dir_path + 'config.yaml', dir_path + '.env')
|
||||
except Exception:
|
||||
logging.error('Could not load config.yaml', exc_info=True)
|
||||
sys.exit(1)
|
||||
def main(config: dict):
    """
    Download every taz edition that is offered for download but not yet listed
    in download_history.csv, and move the files to the download folder.

    :param config: dictionary of settings. "id", "password" and
        "download_folder" are read directly. "download_format", "use_lock_file"
        and "log_level" are optional and fall back to "pdf", False and the
        already configured log level.
    """

    # Get german date for tomorrow (formatted like the date in the edition file names)
    tomorrow = (datetime.now(pytz.timezone('Europe/Berlin')) + timedelta(1)).strftime('%Y_%m_%d')

    # Set log level; if none is configured the level of the root logger is left untouched
    log_level = config.get('log_level')
    if log_level:
        try:
            logging.getLogger().setLevel(str(log_level).upper())
        except ValueError as e:
            logging.error(f"Could not set log level.\n{e}", exc_info=True)

    # If 'use_lock_file' configuration is set, check if lockfile exists for tomorrow's newspaper
    use_lock_file = bool(config.get('use_lock_file', False))
    if use_lock_file:
        try:
            # os.listdir() returns bare names, so they have to be joined with dir_path
            # before testing them; otherwise the check runs against the working directory.
            lock_files = [
                entry for entry in os.listdir(dir_path)
                if entry.endswith('.lock') and os.path.isfile(os.path.join(dir_path, entry))
            ]
            # Delete all lock files that do not refer to tomorrow's date
            for entry in lock_files:
                if not entry.startswith('.' + tomorrow):
                    os.remove(os.path.join(dir_path, entry))
            # If there is a lock file for tomorrow, exit the program
            if any(entry.startswith('.' + tomorrow) for entry in lock_files):
                logging.info('Tomorrow\'s newspaper was already downloaded. Execution canceled.')
                # SystemExit is not an Exception subclass, so the handler below lets it pass
                sys.exit(0)
        except Exception as e:
            logging.error(f"Could not check for lock files.\n{e}", exc_info=True)

    # Read download history from csv file
    try:
        df = pd.read_csv(dir_path + 'download_history.csv', header=0)
    except FileNotFoundError:
        # In case, there isn't yet a csv file, create data frame with headers
        df = pd.DataFrame(columns=['file', 'download_timestamp'])

    # Instantiate downloader object
    try:
        taz_dl = TazDownloader(config['id'], config['password'], config.get('download_format', 'pdf'))
    except TazDownloadFormatException as e:
        logging.error(e, exc_info=True)
        sys.exit(1)

    try:
        # Get newspapers available for download
        newspaper_available = taz_dl.scrape_newspaper()

        # Remove outdated newspaper from download_history.csv
        df = df[df['file'].isin(newspaper_available)]

        # Find newspaper which are not already downloaded
        already_downloaded = set(df['file'])
        newspaper_to_download = [n for n in newspaper_available if n not in already_downloaded]
    except TazDownloadError as e:
        logging.error(e, exc_info=True)
        sys.exit(1)

    # Download newspaper; a failing edition is logged and does not stop the others
    newspaper_downloaded = []
    for n in newspaper_to_download:
        try:
            if taz_dl.download_newspaper(n):
                newspaper_downloaded.append(n)
        except Exception as e:
            logging.error(f"Could not download {n}\n{e}", exc_info=True)

    # Create lock file for tomorrow once tomorrow's edition has been downloaded
    if use_lock_file and any(n.startswith('taz_' + tomorrow) for n in newspaper_downloaded):
        lock_file = '.' + tomorrow + '.lock'
        try:
            # Opening in append mode creates the file if needed, works on every
            # platform and does not fail when the file already exists (unlike os.mknod).
            with open(os.path.join(dir_path, lock_file), 'a'):
                pass
        except OSError as e:
            logging.error(f"Could not create lock file \"{lock_file}\"\n{e}", exc_info=True)

    # Add downloaded newspaper to download_history.csv
    try:
        if newspaper_downloaded:
            timestamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
            df_new = pd.DataFrame(
                {
                    'file': newspaper_downloaded,
                    'download_timestamp': [timestamp] * len(newspaper_downloaded),
                }
            )
            # DataFrame.append() was removed in pandas 2.0; concat works on old and new versions
            df = pd.concat([df, df_new], ignore_index=True)
        df = df.sort_values(by='file', ascending=False)
        # Written even when nothing was downloaded so that dropped outdated rows are persisted
        df.to_csv(dir_path + 'download_history.csv', index=False)
    except Exception as e:
        logging.error(f"Could not update download_history.csv\n{e}", exc_info=True)

    # Move downloaded file to download folder
    if os.path.isdir(config['download_folder']):
        download_folder = \
            config['download_folder'] if config['download_folder'].endswith('/') else config['download_folder'] + "/"
        for n in newspaper_downloaded:
            try:
                shutil.move(dir_path + 'tmp/' + n, download_folder)
            except Exception as e:
                logging.error(f"Could not move file to download folder \"{download_folder}\"\n{e}", exc_info=True)
    elif newspaper_downloaded:
        # The files stay in tmp/; make that visible instead of skipping silently
        logging.error(f"Download folder \"{config['download_folder']}\" does not exist. Downloaded files were not moved.")
|
||||
|
||||
|
||||
if __name__ == '__main__':

    # Write log records to tazPlease.log in the directory of this script
    logging.basicConfig(
        format='%(asctime)s - %(message)s',
        level=logging.ERROR,
        filename=dir_path + 'tazPlease.log',
    )

    # Build the configuration. TazConfigurationError is an Exception subclass and was
    # handled exactly like every other exception, so a single handler is sufficient:
    # print the problem and exit with status 1.
    try:
        configuration = TazConfiguration().get_config()
    except Exception as config_error:
        print(config_error)
        sys.exit(1)

    # Run the download only when a non-empty configuration is available
    if configuration:
        main(configuration)
|
||||
|
|
|
|||
119
models.py
119
models.py
|
|
@ -4,10 +4,111 @@ from requests.exceptions import HTTPError
|
|||
from exceptions import TazDownloadFormatException
|
||||
from exceptions import TazDownloadError
|
||||
from bs4 import BeautifulSoup
|
||||
from envyaml import EnvYAML
|
||||
import argparse
|
||||
|
||||
dir_path = os.path.dirname(os.path.realpath(__file__)) + '/'
|
||||
|
||||
|
||||
class TazConfiguration:
    """
    This class represents the configuration that is needed to run the program.
    On initialization it tries to load the configuration from the config.yaml and from the console
    arguments passed. Console arguments take precedence over the config.yaml.
    """

    # List of tuples that each define a single configuration that can be set either in the config.yaml
    # or by passing it as an argument.
    # CONFIGURATIONS[n][0]: configuration name
    # CONFIGURATIONS[n][1]: is it required?
    CONFIGURATIONS = [
        ('id', True),
        ('password', True),
        ('download_format', False),
        ('download_folder', True),
        ('use_lock_file', False),
        ('log_level', False),
    ]

    # Values used for optional configurations that are set nowhere, so that every key of
    # CONFIGURATIONS is present in the resulting dictionary.
    # NOTE(review): 'error' is meant to match the level passed to logging.basicConfig() in main.py - confirm.
    DEFAULTS = {
        'download_format': 'pdf',
        'use_lock_file': False,
        'log_level': 'error',
    }

    def __init__(self):
        """
        Load the configuration. Exceptions raised while loading are passed on to the caller.
        """
        self._config = {}
        self._load_config()

    def _load_config(self):
        """
        Read the console arguments and the config.yaml and store the merged result.

        :raises Exception: if config.yaml (or the .env file next to it) cannot be read
        :raises TazConfigurationError: if a required configuration is missing
        """
        # Parse the console arguments first so that "--help" and usage errors are reported
        # even when config.yaml cannot be read.
        console_args = self._parse_arguments()

        # Try to load config.yaml
        try:
            conf_yaml = EnvYAML(dir_path + 'config.yaml', dir_path + '.env')
        except Exception as e:
            # Keep the original exception as the cause for debugging
            raise Exception(f"Something went wrong when reading config.yaml.\n{e}") from e

        self._config = self._merge(console_args, conf_yaml)

    @classmethod
    def _merge(cls, console_args, conf_yaml) -> dict:
        """
        Set configurations by preferring console arguments over settings in config.yaml.

        :param console_args: object carrying the parsed console arguments as attributes
        :param conf_yaml: mapping-like object offering get(); holds the settings of config.yaml
        :return: dictionary with one entry per configuration that is set or has a default
        :raises TazConfigurationError: if a required configuration is set in neither source
        """
        merged = {}
        for conf, required in cls.CONFIGURATIONS:
            arg_value = getattr(console_args, conf, None)
            if arg_value is not None:
                merged[conf] = arg_value
                continue

            yaml_value = conf_yaml.get(conf, None)
            if yaml_value is not None:
                merged[conf] = yaml_value
            elif required:
                # Imported here because the exception is not part of the imports visible at the
                # top of this module.
                from exceptions import TazConfigurationError
                raise TazConfigurationError(conf)
            elif conf in cls.DEFAULTS:
                merged[conf] = cls.DEFAULTS[conf]
        return merged

    def _parse_arguments(self, argv=None):
        """
        Parse command line arguments.

        :param argv: list of argument strings; None makes argparse read sys.argv[1:]
        :return: argparse.Namespace in which every argument that was not passed is None
        """
        argparser = argparse.ArgumentParser(
            description='Download taz e-paper'
        )
        argparser.add_argument(
            '-i',
            '--id',
            action='store',
            type=str,
            help='your taz id',
        )
        argparser.add_argument(
            '-p',
            '--password',
            action='store',
            type=str,
            help='your taz password',
        )
        # Both spellings are accepted: the hyphenated one that existed before and the
        # underscore one that matches the remaining options.
        argparser.add_argument(
            '-f',
            '--download-format',
            '--download_format',
            dest='download_format',
            action='store',
            type=str,
            choices=['pdf', 'epub', 'epubt', 'html', 'ascii', 'mobi', 'mobit'],
            help='format in which the newspaper is downloaded',
        )
        argparser.add_argument(
            '-d',
            '--download_folder',
            action='store',
            type=str,
            help='folder in which the downloaded files are stored',
        )
        # default=None (instead of False) lets a missing flag fall through to config.yaml
        argparser.add_argument(
            '-l',
            '--use_lock_file',
            action='store_true',
            default=None,
            help='skip the run if a lock file for tomorrow\'s newspaper exists',
        )
        argparser.add_argument(
            '--log_level',
            action='store',
            type=str,
            choices=['notset', 'debug', 'info', 'warning', 'error', 'critical'],
            help='log level',
        )
        return argparser.parse_args(argv)

    def get_config(self) -> dict:
        """
        :return: the loaded configuration
        """
        return self._config
|
||||
|
||||
|
||||
class TazDownloader:
|
||||
download_formats = ["pdf", "epub", "epubt", "html", "ascii", "mobi", "mobit"]
|
||||
BASE_URL = "https://dl.taz.de/"
|
||||
|
|
@ -15,30 +116,28 @@ class TazDownloader:
|
|||
'Chrome/79.0.3945.130 Safari/537.36'}
|
||||
|
||||
def __init__(self, taz_id: str, password: str, download_format: str = "pdf"):
    """
    Store the credentials and derive the download URL for the requested format.

    :param taz_id: taz id, stored on the instance
    :param password: taz password, stored on the instance
    :param download_format: must be one of self.download_formats; it is appended to BASE_URL
    :raises TazDownloadFormatException: if download_format is not a known format
    """
    self.taz_id = taz_id
    self.password = password

    # Reject unknown formats before building the URL
    if download_format not in self.download_formats:
        raise TazDownloadFormatException(download_format)
    self.download_url = self.BASE_URL + download_format
|
||||
|
||||
def scrape_newspaper(self) -> list:
    """
    Scrapes the newspaper available for download from https://dl.taz.de/

    :return: a list of file names (str)
    :raises TazDownloadError: if the page cannot be fetched or contains no list of editions
    """
    try:
        page = requests.get(self.download_url, headers=self.HEADERS)
        # requests.get() does not raise for 4xx/5xx responses on its own;
        # without this call the handler below would never see an HTTPError.
        page.raise_for_status()
    except requests.exceptions.RequestException as req_e:
        # RequestException is the base class of HTTPError and also covers
        # connection problems and timeouts.
        raise TazDownloadError(f"Could not scrape available newspaper editions:\n{req_e}") from req_e

    soup = BeautifulSoup(page.content, 'html.parser')
    select = soup.find("select")
    if select is None:
        # find() returns None when the page has no <select>; report that as a
        # download error instead of failing with an AttributeError.
        raise TazDownloadError("Could not scrape available newspaper editions:\nno list of editions found on page")
    return [n['value'] for n in select.find_all("option")]
|
||||
|
||||
def download_newspaper(self, taz: str, download_folder: str = dir_path + 'tmp/'):
|
||||
"""
|
||||
Downloads a newspaper from dl.taz.de and stores it in /tmp
|
||||
Downloads a newspaper from dl.taz.de and stores it in tmp/
|
||||
"""
|
||||
|
||||
# Check if folder exists
|
||||
|
|
|
|||
Loading…
Reference in New Issue