134 lines
4.7 KiB
Python
134 lines
4.7 KiB
Python
import sys
|
|
import os
|
|
from datetime import datetime, timedelta
|
|
import pytz
|
|
import logging
|
|
import shutil
|
|
import pandas as pd
|
|
from models import TazDownloader, TazConfiguration
|
|
from exceptions import TazConfigurationError, TazDownloadError, TazDownloadFormatException
|
|
|
|
# Folder that holds this script (symlinks resolved); the log file, the
# download history and the tmp/ folder are all located relative to it
dir_path = os.path.split(os.path.realpath(__file__))[0]
|
|
|
|
|
|
def main(config: dict) -> None:
    """Download all newly available newspaper files and record them.

    The function
      1. applies the configured log level,
      2. loads ``download_history.csv`` from the script directory,
      3. optionally stops early if tomorrow's edition is already recorded,
      4. asks the downloader which files are available, forgets history
         entries that are no longer available and downloads the missing ones
         into ``tmp/``,
      5. writes the updated history and moves the files to the configured
         download folder.

    Args:
        config: Settings mapping. The keys read here are ``log_level``,
            ``limit_requests``, ``id``, ``password``, ``download_format``
            and ``download_folder``.

    Raises:
        SystemExit: With status 0 when ``limit_requests`` is set and the
            history already contains tomorrow's edition; with status 1 when
            the downloader rejects the download format or scraping the
            available newspaper fails.
    """
    history_path = os.path.join(dir_path, 'download_history.csv')

    # Get german date for tomorrow
    tomorrow = (datetime.now(pytz.timezone('Europe/Berlin')) + timedelta(days=1)).strftime('%Y_%m_%d')

    # Define tmp/ folder
    tmp_folder = os.path.join(dir_path, 'tmp')

    # Set log level
    try:
        logging.getLogger().setLevel(config['log_level'].upper())
    except ValueError as err:
        logging.error(f"Could not set log level.\n {err}")

    # Read download history from csv file
    try:
        df = pd.read_csv(history_path, header=0)
    except (FileNotFoundError, pd.errors.EmptyDataError):
        # No history yet (or a zero-byte file left behind by an interrupted
        # write): start with an empty data frame that only has the headers
        df = pd.DataFrame(columns=['file', 'download_timestamp'])

    # If the 'limit_requests' argument is specified, check whether tomorrow's
    # newspaper has already been downloaded
    if config['limit_requests']:
        try:
            # Plain substring match; na=False keeps missing file names from
            # counting as a hit
            if df['file'].str.contains(tomorrow, regex=False, na=False).any():
                logging.info('Tomorrow\'s newspaper was already downloaded. Execution canceled.')
                sys.exit(0)
        except Exception as err:
            # SystemExit is not an Exception subclass, so the exit above
            # is not intercepted here
            logging.error(f"Could not check whether tomorrow's newspaper has already been downloaded.\n {err}")

    # Instantiate downloader object
    try:
        taz_dl = TazDownloader(config['id'], config['password'], config['download_format'])
    except TazDownloadFormatException as err:
        logging.error(err)
        sys.exit(1)

    try:
        # Get newspaper available for download
        newspaper_available = taz_dl.scrape_newspaper()

        # Remove outdated newspaper from download_history.csv
        outdated = [idx for idx, name in df['file'].items() if name not in newspaper_available]
        df.drop(outdated, inplace=True)

        # Find newspaper which are not already downloaded
        newspaper_to_download = [n for n in newspaper_available if n not in df['file'].values]
    except TazDownloadError as err:
        logging.error(err)
        sys.exit(1)

    # Download newspaper; a failure for one file must not stop the others
    newspaper_downloaded = []
    for n in newspaper_to_download:
        try:
            if taz_dl.download_newspaper(n, tmp_folder):
                newspaper_downloaded.append(n)
        except Exception as err:
            logging.error(f"Could not download {n}\n {err}")

    # Add downloaded newspaper to download_history.csv
    try:
        if newspaper_downloaded:
            timestamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
            new_rows = pd.DataFrame(
                {
                    'file': newspaper_downloaded,
                    'download_timestamp': [timestamp] * len(newspaper_downloaded),
                }
            )
            # DataFrame.append was removed in pandas 2.0; concat is the
            # supported way to add rows and works on older versions as well
            df = pd.concat([df, new_rows], ignore_index=True)
        df.sort_values(by='file', ascending=False, inplace=True)
        df.to_csv(history_path, index=False)
    except Exception as err:
        logging.error(f"Could not update download_history.csv\n {err}")

    # Move downloaded file to download folder
    newspaper_downloaded_string = "\n ".join(newspaper_downloaded)
    if os.path.isdir(config['download_folder']):
        # Make sure the target ends with a path separator
        download_folder = config['download_folder']
        if not download_folder.endswith(os.path.sep):
            download_folder += os.path.sep
        for n in newspaper_downloaded:
            try:
                shutil.move(os.path.join(tmp_folder, n), download_folder)
            except Exception as err:
                logging.error(f"Could not move {n} to download folder \"{download_folder}\"\n {err}")
        if newspaper_downloaded:
            logging.info(f"Downloaded\n {newspaper_downloaded_string}\n to {config['download_folder']}")
    else:
        # The files stay in tmp/ so that nothing that was downloaded is lost
        logging.error(f"{config['download_folder']} does not exists.\n {newspaper_downloaded_string}"
                      f"\n downloaded to {tmp_folder}")
|
|
|
|
|
|
if __name__ == '__main__':
    # Log to tazPlease.log in the script directory; only errors are recorded
    # unless main() changes the level according to the configuration
    logging.basicConfig(
        format='%(asctime)s - %(message)s',
        level=logging.ERROR,
        filename=os.path.join(dir_path, 'tazPlease.log'),
    )

    # Load configuration; any failure is printed and ends the script with
    # exit status 1
    try:
        configuration = TazConfiguration().get_config()
    except (TazConfigurationError, Exception) as config_error:
        print(config_error)
        sys.exit(1)

    # Execute main function only if a non-empty configuration was returned
    if configuration:
        main(configuration)
|