tazPlease/main.py

93 lines
3.0 KiB
Python

import sys
import os
import datetime
import logging
import shutil
from envyaml import EnvYAML
from models import TazDownloader
import pandas as pd
# Directory that contains this script, with a trailing slash so that file
# names can be appended to it directly.
dir_path = os.path.dirname(os.path.realpath(__file__)) + '/'

# Set up logging: records go to tazPlease.log next to this script. The level
# starts at ERROR; every record is prefixed with its timestamp.
logging.basicConfig(
    level=logging.ERROR,
    format='%(asctime)s - %(message)s',
    filename=f'{dir_path}tazPlease.log',
)
# Load configuration from config.yaml; the .env file next to it is passed to
# EnvYAML as second argument. Any failure is logged with its traceback and
# ends the script with exit status 1.
try:
    config = EnvYAML(f'{dir_path}config.yaml', f'{dir_path}.env')
except Exception:
    # logging.exception logs at ERROR level and appends the traceback.
    logging.exception('Could not load config.yaml')
    raise SystemExit(1)
# Set log level from config['logging']['log_level'].
# This step is not fatal: when the entry is missing or does not name a valid
# level, the problem is logged and the previously configured level stays in
# effect.
try:
    log_level = config['logging']['log_level']
    # Level names are upper case in the logging module ("DEBUG", "INFO", ...).
    # A numeric level is passed through unchanged; setLevel accepts both.
    if isinstance(log_level, str):
        log_level = log_level.upper()
    logging.getLogger().setLevel(log_level)
except (KeyError, TypeError, ValueError) as e:
    # KeyError:   the 'logging' section or the 'log_level' entry is missing
    # TypeError:  the 'logging' entry is not a mapping, or the level is
    #             neither an int nor a string
    # ValueError: the string is not a known level name
    logging.error(f"Could not set log level. \n{e}", exc_info=True)
# Read download history from the csv file next to this script
history_file = f'{dir_path}download_history.csv'
try:
    df = pd.read_csv(history_file, header=0)
except FileNotFoundError:
    # No history file yet: start with an empty data frame that only carries
    # the two column headers.
    df = pd.DataFrame(columns=['file', 'download_timestamp'])
# Instantiate downloader object with the credentials from the configuration
taz_dl = TazDownloader(config['taz']['taz_id'], config['taz']['taz_password'])

# Determine which newspapers still have to be downloaded. Any failure in this
# step is logged and ends the script with exit status 1.
try:
    # Get newspapers available for download
    newspaper_available = taz_dl.scrape_newspaper()

    # Remove outdated newspaper from the download history. DataFrame.drop
    # removes rows by index label, so collect the labels of the rows whose
    # file is no longer offered (not the file names themselves).
    outdated_rows = [
        row_label
        for row_label, file_name in df['file'].items()
        if file_name not in newspaper_available
    ]
    df.drop(outdated_rows, inplace=True)

    # Find newspaper which are not already downloaded
    # NOTE(review): assumes the file names are hashable (strings) - confirm
    # against TazDownloader.scrape_newspaper.
    already_downloaded = set(df['file'])
    newspaper_to_download = [n for n in newspaper_available if n not in already_downloaded]
except Exception as e:
    logging.error(f"Could not get available newspaper from website\n{e}", exc_info=True)
    sys.exit(1)
# Download every pending newspaper. A newspaper is recorded as downloaded only
# when download_newspaper returns a truthy value; a failure is logged with its
# traceback and the loop carries on with the next newspaper.
newspaper_downloaded = []
for newspaper in newspaper_to_download:
    try:
        if taz_dl.download_newspaper(newspaper):
            newspaper_downloaded.append(newspaper)
    except Exception as e:
        # logging.exception logs at ERROR level and appends the traceback.
        logging.exception(f"Could not download {newspaper}\n{e}")
# Add downloaded newspaper to download_history.csv. The history is written
# back even when nothing new was downloaded, so rows removed from the data
# frame earlier in the script are persisted as well. A failure is logged and
# the script continues.
try:
    if newspaper_downloaded:
        # One timestamp for the whole batch; all rows are added at once.
        timestamp = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        df_new = pd.DataFrame(
            {
                'file': newspaper_downloaded,
                'download_timestamp': [timestamp] * len(newspaper_downloaded),
            }
        )
        # DataFrame.append was removed in pandas 2.0; pd.concat is the
        # supported way to add rows.
        df = pd.concat([df, df_new], ignore_index=True)
    # Newest downloads first
    df = df.sort_values(by='download_timestamp', ascending=False)
    df.to_csv(dir_path + 'download_history.csv', index=False)
except Exception as e:
    logging.error(f"Could not update download_history.csv\n{e}", exc_info=True)
# Move downloaded files from the tmp folder next to this script into the
# configured download folder. Nothing is moved when the configured path is not
# an existing directory.
configured_folder = config['download_folder']
if os.path.isdir(configured_folder):
    # Make sure the folder ends with a slash so the file name can be appended.
    if configured_folder.endswith('/'):
        download_folder = configured_folder
    else:
        download_folder = configured_folder + "/"
    for newspaper in newspaper_downloaded:
        try:
            shutil.move(dir_path + 'tmp/' + newspaper, download_folder + newspaper)
        except Exception as e:
            # A failed move is logged with its traceback; the remaining files
            # are still processed.
            logging.exception(f"Could not move file to download folder \"{download_folder}\"\n{e}")