✨️ implement lock files and console arguments
- Use a lock file that indicates whether tomorrow's newspaper has already been downloaded, to limit the number of times the taz.de website is queried for new editions
- Allow the program to be run with command line arguments
This commit is contained in:
parent
6286798e6c
commit
6e8d9d9ef8
|
|
@ -1,6 +1,8 @@
|
||||||
download_history.csv
|
download_history.csv
|
||||||
tazPlease.log
|
tazPlease.log
|
||||||
config.yaml
|
config.yaml
|
||||||
|
tmp/
|
||||||
|
*.lock
|
||||||
|
|
||||||
# Byte-compiled / optimized / DLL files
|
# Byte-compiled / optimized / DLL files
|
||||||
__pycache__/
|
__pycache__/
|
||||||
|
|
|
||||||
|
|
@ -1,9 +1,20 @@
|
||||||
taz:
|
# Your taz id (gets read from .env file)
|
||||||
taz_id: ${TAZ_ID}
|
id: ${TAZ_ID}
|
||||||
taz_password: ${TAZ_PASSWORD}
|
|
||||||
dowload_format: "pdf" # Valid formats are: pdf, epub, epubt, html, ascii, mobi, mobit
|
|
||||||
|
|
||||||
|
# Your taz password (gets read from .env file)
|
||||||
|
password: ${TAZ_PASSWORD}
|
||||||
|
|
||||||
|
# In which format do you want to download your newspaper?
|
||||||
|
# Valid formats are: pdf, epub, epubt, html, ascii, mobi, mobit
|
||||||
|
download_format: "pdf"
|
||||||
|
|
||||||
|
# Where should the downloaded files be stored?
|
||||||
download_folder: "/path/to/download/folder"
|
download_folder: "/path/to/download/folder"
|
||||||
|
|
||||||
logging:
|
# Use a lock file that indicates whether tomorrow's newspaper has already been downloaded to limit the number of times
|
||||||
|
# the taz.de website is queried for new editions
|
||||||
|
use_lock_file: True
|
||||||
|
|
||||||
|
# Set the log level.
|
||||||
|
# Valid formats are: notset, debug, info, warning, error, critical
|
||||||
log_level: "info"
|
log_level: "info"
|
||||||
|
|
|
||||||
|
|
@ -12,3 +12,12 @@ class TazDownloadError(Exception):
|
||||||
|
|
||||||
def __inti__(self, format: str):
|
def __inti__(self, format: str):
|
||||||
self.format = format
|
self.format = format
|
||||||
|
|
||||||
|
|
||||||
|
class TazConfigurationError(Exception):
    """
    Raised when a required configuration is defined neither in the config.yaml nor as a command line argument.
    """

    def __init__(self, misconfiguration: str):
        """
        :param misconfiguration: name of the configuration that has not been set
        """
        # The constructor used to be misspelled "__inti__": it was never executed, the attribute below was never
        # set and __str__ failed with an AttributeError instead of producing the message.
        super().__init__(misconfiguration)
        self.misconfiguration = misconfiguration

    def __str__(self):
        return f"\"{self.misconfiguration}\" must be defined either in the config.yaml or by passing it as an argument."
|
||||||
|
|
|
||||||
90
main.py
90
main.py
|
|
@ -1,34 +1,44 @@
|
||||||
import sys
|
import sys
|
||||||
import os
|
import os
|
||||||
import datetime
|
from datetime import datetime, timedelta
|
||||||
|
import pytz
|
||||||
import logging
|
import logging
|
||||||
import shutil
|
import shutil
|
||||||
from envyaml import EnvYAML
|
|
||||||
from models import TazDownloader
|
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
|
from models import TazDownloader, TazConfiguration
|
||||||
|
from exceptions import TazConfigurationError, TazDownloadError, TazDownloadFormatException
|
||||||
|
|
||||||
|
# Get directory
|
||||||
dir_path = os.path.dirname(os.path.realpath(__file__)) + '/'
|
dir_path = os.path.dirname(os.path.realpath(__file__)) + '/'
|
||||||
|
|
||||||
# Set up logging
|
|
||||||
logging.basicConfig(
|
|
||||||
filename=dir_path + 'tazPlease.log',
|
|
||||||
level=logging.ERROR,
|
|
||||||
format='%(asctime)s - %(message)s'
|
|
||||||
)
|
|
||||||
|
|
||||||
# Load configuration
|
def main(config: dict):
|
||||||
try:
|
|
||||||
config = EnvYAML(dir_path + 'config.yaml', dir_path + '.env')
|
# Get german date for tomorrow
|
||||||
except Exception:
|
tomorrow = (datetime.now(pytz.timezone('Europe/Berlin')) + timedelta(1)).strftime('%Y_%m_%d')
|
||||||
logging.error('Could not load config.yaml', exc_info=True)
|
|
||||||
sys.exit(1)
|
|
||||||
|
|
||||||
# Set log level
|
# Set log level
|
||||||
try:
|
try:
|
||||||
logging.getLogger().setLevel(config['logging']['log_level'].upper())
|
logging.getLogger().setLevel(config['log_level'].upper())
|
||||||
except ValueError as e:
|
except ValueError as e:
|
||||||
logging.error(f"Could not set log level.\n{e}", exc_info=True)
|
logging.error(f"Could not set log level.\n{e}", exc_info=True)
|
||||||
|
|
||||||
|
# If 'use_lock_file' configuration is set, check if lockfile exists for tomorrow's newspaper
|
||||||
|
if config['use_lock_file']:
|
||||||
|
try:
|
||||||
|
lock_files = [entry for entry in os.listdir(dir_path) if os.path.isfile(entry) and entry.endswith('.lock')]
|
||||||
|
# Delete all lock files that do not refer to tomorrow's date
|
||||||
|
for file in lock_files:
|
||||||
|
if not file.startswith('.' + tomorrow):
|
||||||
|
os.remove(dir_path + file)
|
||||||
|
# If there is a lock file for tomorrow, exit the program
|
||||||
|
for file in lock_files:
|
||||||
|
if file.startswith('.' + tomorrow):
|
||||||
|
logging.info('Tomorrow\'s newspaper was already downloaded. Execution canceled.')
|
||||||
|
sys.exit(0)
|
||||||
|
except Exception as e:
|
||||||
|
logging.error(f"Could not check for lock files.\n{e}", exc_info=True)
|
||||||
|
|
||||||
# Read download history from csv file
|
# Read download history from csv file
|
||||||
try:
|
try:
|
||||||
df = pd.read_csv(dir_path + 'download_history.csv', header=0)
|
df = pd.read_csv(dir_path + 'download_history.csv', header=0)
|
||||||
|
|
@ -42,7 +52,11 @@ except FileNotFoundError:
|
||||||
)
|
)
|
||||||
|
|
||||||
# Instantiate downloader object
|
# Instantiate downloader object
|
||||||
taz_dl = TazDownloader(config['taz']['taz_id'], config['taz']['taz_password'])
|
try:
|
||||||
|
taz_dl = TazDownloader(config['id'], config['password'], config['download_format'])
|
||||||
|
except TazDownloadFormatException as e:
|
||||||
|
logging.error(e, exc_info=True)
|
||||||
|
sys.exit(1)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
# Get newspapers available for download
|
# Get newspapers available for download
|
||||||
|
|
@ -53,8 +67,8 @@ try:
|
||||||
|
|
||||||
# Find newspaper which are not already downloaded
|
# Find newspaper which are not already downloaded
|
||||||
newspaper_to_download = [n for n in newspaper_available if n not in df.file.values]
|
newspaper_to_download = [n for n in newspaper_available if n not in df.file.values]
|
||||||
except Exception as e:
|
except TazDownloadError as e:
|
||||||
logging.error(f"Could get available newspaper from website\n{e}", exc_info=True)
|
logging.error(e, exc_info=True)
|
||||||
sys.exit(1)
|
sys.exit(1)
|
||||||
|
|
||||||
# Download newspaper
|
# Download newspaper
|
||||||
|
|
@ -66,17 +80,27 @@ for n in newspaper_to_download:
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logging.error(f"Could not download {n}\n{e}", exc_info=True)
|
logging.error(f"Could not download {n}\n{e}", exc_info=True)
|
||||||
|
|
||||||
|
# Create lock file for tomorrow
|
||||||
|
if config['use_lock_file']:
|
||||||
|
try:
|
||||||
|
lock_file = '.' + tomorrow + '.lock'
|
||||||
|
for n in newspaper_downloaded:
|
||||||
|
if n.startswith('taz_' + tomorrow):
|
||||||
|
os.mknod(dir_path + lock_file)
|
||||||
|
except Exception as e:
|
||||||
|
logging.error(f"Could not download create lock file \"{lock_file}\"\n{e}", exc_info=True)
|
||||||
|
|
||||||
# Add downloaded newspaper to download_history.csv
|
# Add downloaded newspaper to download_history.csv
|
||||||
try:
|
try:
|
||||||
for n in newspaper_downloaded:
|
for n in newspaper_downloaded:
|
||||||
df_tmp = pd.DataFrame(
|
df_tmp = pd.DataFrame(
|
||||||
{
|
{
|
||||||
'file': [n],
|
'file': [n],
|
||||||
'download_timestamp': [datetime.datetime.strftime(datetime.datetime.now(), '%Y-%m-%d %H:%M:%S')],
|
'download_timestamp': [datetime.strftime(datetime.now(), '%Y-%m-%d %H:%M:%S')],
|
||||||
}
|
}
|
||||||
)
|
)
|
||||||
df = df.append(df_tmp, ignore_index=True)
|
df = df.append(df_tmp, ignore_index=True)
|
||||||
df.sort_values(by='download_timestamp', ascending=False, inplace=True)
|
df.sort_values(by='file', ascending=False, inplace=True)
|
||||||
df.to_csv(dir_path + 'download_history.csv', index=False)
|
df.to_csv(dir_path + 'download_history.csv', index=False)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logging.error(f"Could not update download_history.csv\n{e}", exc_info=True)
|
logging.error(f"Could not update download_history.csv\n{e}", exc_info=True)
|
||||||
|
|
@ -90,3 +114,27 @@ if os.path.isdir(config['download_folder']):
|
||||||
shutil.move(dir_path + 'tmp/' + n, download_folder)
|
shutil.move(dir_path + 'tmp/' + n, download_folder)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logging.error(f"Could not move file to download folder \"{download_folder}\"\n{e}", exc_info=True)
|
logging.error(f"Could not move file to download folder \"{download_folder}\"\n{e}", exc_info=True)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == '__main__':

    # Configure the root logger: only errors, written to tazPlease.log in the directory of this script
    logging.basicConfig(
        format='%(asctime)s - %(message)s',
        level=logging.ERROR,
        filename=dir_path + 'tazPlease.log',
    )

    # Build the configuration from the config.yaml and the command line arguments
    try:
        configuration = TazConfiguration().get_config()
    except Exception as exception:
        # TazConfigurationError is an Exception as well: whatever went wrong, print it and stop with exit code 1
        print(exception)
        sys.exit(1)

    # Start the download run only if a non-empty configuration has been loaded
    if configuration:
        main(configuration)
|
||||||
|
|
|
||||||
113
models.py
113
models.py
|
|
@ -4,10 +4,111 @@ from requests.exceptions import HTTPError
|
||||||
from exceptions import TazDownloadFormatException
|
from exceptions import TazDownloadFormatException
|
||||||
from exceptions import TazDownloadError
|
from exceptions import TazDownloadError
|
||||||
from bs4 import BeautifulSoup
|
from bs4 import BeautifulSoup
|
||||||
|
from envyaml import EnvYAML
|
||||||
|
import argparse
|
||||||
|
|
||||||
dir_path = os.path.dirname(os.path.realpath(__file__)) + '/'
|
dir_path = os.path.dirname(os.path.realpath(__file__)) + '/'
|
||||||
|
|
||||||
|
|
||||||
|
class TazConfiguration:
    """
    This class represents the configuration that is needed to run the program.
    On initialization it tries to load the configuration from the config.yaml and from the command line arguments
    passed. A command line argument takes precedence over the same setting in the config.yaml.
    """

    # List of tuples that each define a single configuration that can be set either in the config.yaml or by passing
    # it as an argument.
    # CONFIGURATIONS[i][0]: configuration name
    # CONFIGURATIONS[i][1]: is it required?
    CONFIGURATIONS = [
        ('id', True),
        ('password', True),
        ('download_format', False),
        ('download_folder', True),
        ('use_lock_file', False),
        ('log_level', False),
    ]

    # Fallback values for the optional configurations. The program reads these keys unconditionally, so they must
    # always be present in the resulting dict; without a fallback an unset optional setting ended in a KeyError.
    # 'pdf' is the default format of TazDownloader, 'error' is the level the logging is initialised with.
    DEFAULTS = {
        'download_format': 'pdf',
        'use_lock_file': False,
        'log_level': 'error',
    }

    def __init__(self):
        """
        Load the configuration.

        :raises TazConfigurationError: if a required configuration is set nowhere
        :raises Exception: if the config.yaml can not be read
        """
        self._config = {}
        self._load_config()

    def _load_config(self):
        """
        Read the config.yaml and the command line arguments and store the merged result in self._config.
        """
        # Try to load config.yaml (environment variables are read from the .env file)
        try:
            conf_yaml = EnvYAML(dir_path + 'config.yaml', dir_path + '.env')
        except Exception as e:
            # Keep the original error attached as the cause of the more descriptive one
            raise Exception(f"Something went wrong when reading config.yaml.\n{e}") from e

        # Get console arguments
        console_args = self._parse_arguments()

        self._config = self._merge_sources(console_args, conf_yaml)

    def _merge_sources(self, console_args, conf_yaml) -> dict:
        """
        Merge both configuration sources by preferring console arguments over settings in the config.yaml.

        :param console_args: namespace returned by the argument parser
        :param conf_yaml: mapping with the settings read from the config.yaml
        :return: dict with one entry per known configuration
        :raises TazConfigurationError: if a required configuration is set in neither source
        """
        merged = {}
        for conf, required in self.CONFIGURATIONS:
            console_value = getattr(console_args, conf, None)
            yaml_value = conf_yaml.get(conf, None)
            if console_value is not None:
                merged[conf] = console_value
            elif yaml_value is not None:
                merged[conf] = yaml_value
            elif required:
                # Imported here because the module level imports do not provide this exception
                from exceptions import TazConfigurationError
                raise TazConfigurationError(conf)
            elif conf in self.DEFAULTS:
                merged[conf] = self.DEFAULTS[conf]
        return merged

    def _parse_arguments(self, argv=None) -> argparse.Namespace:
        """
        Parse command line arguments.

        Every option that consists of several words is accepted with underscores as well as with hyphens.

        :param argv: list of argument strings to parse; if None the arguments of the running program are used
        :return: namespace with one attribute per configuration, None for every option that was not passed
        """
        argparser = argparse.ArgumentParser(
            description='Download taz e-paper'
        )
        argparser.add_argument(
            '-i',
            '--id',
            action='store',
            type=str,
            help='your taz id',
        )
        argparser.add_argument(
            '-p',
            '--password',
            action='store',
            type=str,
            help='your taz password',
        )
        argparser.add_argument(
            '-f',
            '--download-format',
            '--download_format',
            dest='download_format',
            action='store',
            type=str,
            choices=['pdf', 'epub', 'epubt', 'html', 'ascii', 'mobi', 'mobit'],
            help='format in which the newspaper is downloaded',
        )
        argparser.add_argument(
            '-d',
            '--download_folder',
            '--download-folder',
            dest='download_folder',
            action='store',
            type=str,
            help='folder in which the downloaded files are stored',
        )
        argparser.add_argument(
            '-l',
            '--use_lock_file',
            '--use-lock-file',
            dest='use_lock_file',
            action='store_true',
            # None instead of False, so that an omitted flag does not override the config.yaml
            default=None,
            help='use a lock file to limit the number of requests to the taz.de website',
        )
        argparser.add_argument(
            '--log_level',
            '--log-level',
            dest='log_level',
            action='store',
            choices=['notset', 'debug', 'info', 'warning', 'error', 'critical'],
            help='log level of the program',
        )
        return argparser.parse_args(argv)

    def get_config(self) -> dict:
        """
        :return: the loaded configuration
        """
        return self._config
|
||||||
|
|
||||||
|
|
||||||
class TazDownloader:
|
class TazDownloader:
|
||||||
download_formats = ["pdf", "epub", "epubt", "html", "ascii", "mobi", "mobit"]
|
download_formats = ["pdf", "epub", "epubt", "html", "ascii", "mobi", "mobit"]
|
||||||
BASE_URL = "https://dl.taz.de/"
|
BASE_URL = "https://dl.taz.de/"
|
||||||
|
|
@ -15,30 +116,28 @@ class TazDownloader:
|
||||||
'Chrome/79.0.3945.130 Safari/537.36'}
|
'Chrome/79.0.3945.130 Safari/537.36'}
|
||||||
|
|
||||||
def __init__(self, taz_id: str, password: str, download_format: str = "pdf"):
|
def __init__(self, taz_id: str, password: str, download_format: str = "pdf"):
|
||||||
"""
|
|
||||||
:param taz_id:
|
|
||||||
:param password:
|
|
||||||
:param download_format:
|
|
||||||
"""
|
|
||||||
self.taz_id = taz_id
|
self.taz_id = taz_id
|
||||||
self.password = password
|
self.password = password
|
||||||
if download_format in self.download_formats:
|
if download_format in self.download_formats:
|
||||||
self.download_url = self.BASE_URL + download_format
|
self.download_url = self.BASE_URL + download_format
|
||||||
else:
|
else:
|
||||||
raise TazDownloadFormatException
|
raise TazDownloadFormatException(download_format)
|
||||||
|
|
||||||
def scrape_newspaper(self) -> list:
|
def scrape_newspaper(self) -> list:
|
||||||
"""
|
"""
|
||||||
Scrapes the newspaper available for download from https://dl.taz.de/
|
Scrapes the newspaper available for download from https://dl.taz.de/
|
||||||
:return: a list of file names (str)
|
:return: a list of file names (str)
|
||||||
"""
|
"""
|
||||||
|
try:
|
||||||
page = requests.get(self.download_url, headers=self.HEADERS)
|
page = requests.get(self.download_url, headers=self.HEADERS)
|
||||||
soup = BeautifulSoup(page.content, 'html.parser')
|
soup = BeautifulSoup(page.content, 'html.parser')
|
||||||
return [n['value'] for n in soup.find("select").find_all("option")]
|
return [n['value'] for n in soup.find("select").find_all("option")]
|
||||||
|
except HTTPError as http_e:
|
||||||
|
raise TazDownloadError(f"Could not scrape available newspaper editions:\n{http_e}")
|
||||||
|
|
||||||
def download_newspaper(self, taz: str, download_folder: str = dir_path + 'tmp/'):
|
def download_newspaper(self, taz: str, download_folder: str = dir_path + 'tmp/'):
|
||||||
"""
|
"""
|
||||||
Downloads a newspaper from dl.taz.de and stores it in /tmp
|
Downloads a newspaper from dl.taz.de and stores it in tmp/
|
||||||
"""
|
"""
|
||||||
|
|
||||||
# Check if folder exists
|
# Check if folder exists
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue