141 lines
5.0 KiB
Python
141 lines
5.0 KiB
Python
import sys
|
|
import os
|
|
from datetime import datetime, timedelta
|
|
import pytz
|
|
import logging
|
|
import shutil
|
|
import pandas as pd
|
|
from models import TazDownloader, TazConfiguration
|
|
from exceptions import TazConfigurationError, TazDownloadError, TazDownloadFormatException
|
|
|
|
# Absolute directory of this script (symlinks resolved), with a trailing slash
# so that file names can be appended to it directly.
dir_path = f"{os.path.dirname(os.path.realpath(__file__))}/"
|
|
|
|
|
|
def main(config: dict):
    """Download every available newspaper that has not been downloaded yet.

    Steps, in order:
      1. Set the root logger level from ``config['log_level']``.
      2. If ``config['use_lock_file']`` is truthy, delete stale ``.lock`` files
         in the script directory and exit with status 0 when a lock file for
         tomorrow's date (Europe/Berlin) already exists.
      3. Load ``download_history.csv`` from the script directory (or start
         with an empty history when the file does not exist).
      4. Ask the downloader which newspapers are available, forget history
         rows that are no longer available, and download the missing ones.
      5. Optionally create tomorrow's lock file, write the updated history,
         and move the downloaded files from ``tmp/`` to
         ``config['download_folder']``.

    Args:
        config: Configuration mapping. The keys read here are ``log_level``,
            ``use_lock_file``, ``id``, ``password``, ``download_format`` and
            ``download_folder``.

    Raises:
        SystemExit: With code 0 when tomorrow's newspaper is already locked,
            with code 1 when the downloader cannot be created
            (``TazDownloadFormatException``) or scraping fails
            (``TazDownloadError``).
    """
    # Get german date for tomorrow
    tomorrow = (datetime.now(pytz.timezone('Europe/Berlin')) + timedelta(1)).strftime('%Y_%m_%d')
    lock_prefix = '.' + tomorrow

    # Set log level
    try:
        logging.getLogger().setLevel(config['log_level'].upper())
    except (ValueError, TypeError, AttributeError) as e:
        # ValueError: unknown level name; TypeError/AttributeError: the
        # configured value is not a string. Keep the current level either way.
        logging.error(f"Could not set log level.\n{e}", exc_info=True)

    # If 'use_lock_file' configuration is set, check if lockfile exists for tomorrow's newspaper
    if config['use_lock_file']:
        try:
            # os.listdir() returns bare names, so they have to be joined with
            # dir_path before the isfile() check; otherwise the check is made
            # relative to the current working directory.
            lock_files = [
                entry for entry in os.listdir(dir_path)
                if entry.endswith('.lock') and os.path.isfile(os.path.join(dir_path, entry))
            ]
            # Delete all lock files that do not refer to tomorrow's date
            for file in lock_files:
                if not file.startswith(lock_prefix):
                    os.remove(os.path.join(dir_path, file))
            # If there is a lock file for tomorrow, exit the program
            # (SystemExit is not an Exception subclass, so it passes the handler below.)
            if any(file.startswith(lock_prefix) for file in lock_files):
                logging.info('Tomorrow\'s newspaper was already downloaded. Execution canceled.')
                sys.exit(0)
        except Exception as e:
            logging.error(f"Could not check for lock files.\n{e}", exc_info=True)

    # Read download history from csv file
    try:
        df = pd.read_csv(dir_path + 'download_history.csv', header=0)
    except FileNotFoundError:
        # In case, there isn't yet a csv file, create data frame with headers
        df = pd.DataFrame(
            columns=[
                'file',
                'download_timestamp',
            ]
        )

    # Instantiate downloader object
    try:
        taz_dl = TazDownloader(config['id'], config['password'], config['download_format'])
    except TazDownloadFormatException as e:
        logging.error(e, exc_info=True)
        sys.exit(1)

    try:
        # Get newspapers available for download
        newspaper_available = taz_dl.scrape_newspaper()

        # Remove outdated newspaper from download_history.csv
        df = df[df['file'].isin(list(newspaper_available))].copy()

        # Find newspaper which are not already downloaded
        already_downloaded = set(df['file'])
        newspaper_to_download = [n for n in newspaper_available if n not in already_downloaded]
    except TazDownloadError as e:
        logging.error(e, exc_info=True)
        sys.exit(1)

    # Download newspaper; a failure for one file must not stop the others
    newspaper_downloaded = []
    for n in newspaper_to_download:
        try:
            if taz_dl.download_newspaper(n):
                newspaper_downloaded.append(n)
        except Exception as e:
            logging.error(f"Could not download {n}\n{e}", exc_info=True)

    # Create lock file for tomorrow
    if config['use_lock_file']:
        lock_file = lock_prefix + '.lock'
        try:
            # Create the (empty) lock file once, no matter how many of the
            # downloaded files belong to tomorrow's issue. open(..., 'a') is
            # portable and does not fail when the file already exists.
            if any(n.startswith('taz_' + tomorrow) for n in newspaper_downloaded):
                with open(os.path.join(dir_path, lock_file), 'a'):
                    pass
        except Exception as e:
            logging.error(f"Could not download create lock file \"{lock_file}\"\n{e}", exc_info=True)

    # Add downloaded newspaper to download_history.csv
    try:
        if newspaper_downloaded:
            df_new = pd.DataFrame(
                {
                    'file': newspaper_downloaded,
                    'download_timestamp': [
                        datetime.now().strftime('%Y-%m-%d %H:%M:%S') for _ in newspaper_downloaded
                    ],
                }
            )
            # DataFrame.append() was removed in pandas 2.0; concat is the replacement.
            df = pd.concat([df, df_new], ignore_index=True)
        df = df.sort_values(by='file', ascending=False)
        df.to_csv(dir_path + 'download_history.csv', index=False)
    except Exception as e:
        logging.error(f"Could not update download_history.csv\n{e}", exc_info=True)

    # Move downloaded file to download folder
    if os.path.isdir(config['download_folder']):
        download_folder = \
            config['download_folder'] if config['download_folder'].endswith('/') else config['download_folder'] + "/"
        for n in newspaper_downloaded:
            try:
                shutil.move(dir_path + 'tmp/' + n, download_folder)
            except Exception as e:
                logging.error(f"Could not move file to download folder \"{download_folder}\"\n{e}", exc_info=True)
    elif newspaper_downloaded:
        # Do not fail silently: the files were downloaded but stay in tmp/.
        logging.error(
            f"Download folder \"{config['download_folder']}\" is not a directory. "
            f"Downloaded files were left in \"{dir_path}tmp/\"."
        )
|
|
|
|
|
|
if __name__ == '__main__':

    # Log to 'tazPlease.log' next to this script; only errors are recorded
    # until main() applies the configured log level.
    logging.basicConfig(
        filename=f"{dir_path}tazPlease.log",
        format='%(asctime)s - %(message)s',
        level=logging.ERROR,
    )

    # Load the configuration; any failure is printed and ends the program
    # with exit status 1.
    try:
        configuration = TazConfiguration().get_config()
    except (TazConfigurationError, Exception) as config_error:
        print(config_error)
        sys.exit(1)

    # Run the download only when a non-empty configuration was returned
    if configuration:
        main(configuration)
|