93 lines
3.0 KiB
Python
93 lines
3.0 KiB
Python
import sys
|
|
import os
|
|
import datetime
|
|
import logging
|
|
import shutil
|
|
from envyaml import EnvYAML
|
|
from models import TazDownloader
|
|
import pandas as pd
|
|
|
|
# Absolute directory containing this script (symlinks resolved), with a
# trailing slash so that file names can simply be appended to it.
dir_path = os.path.split(os.path.realpath(__file__))[0] + '/'

# Set up logging
# Messages go to tazPlease.log next to this script. The root logger starts
# at ERROR level.
logging.basicConfig(
    format='%(asctime)s - %(message)s',
    level=logging.ERROR,
    filename=dir_path + 'tazPlease.log',
)
|
|
|
|
# Load configuration
# config.yaml and .env are both read from the script's directory via EnvYAML.
# Nothing below can run without a config, so any failure is logged with its
# traceback and the script exits with status 1.
config_file = dir_path + 'config.yaml'
env_file = dir_path + '.env'
try:
    config = EnvYAML(config_file, env_file)
except Exception:
    # logging.exception logs at ERROR level and attaches the traceback.
    logging.exception('Could not load config.yaml')
    sys.exit(1)
|
|
|
|
# Set log level
# The level name is taken from config['logging']['log_level'] and upper-cased
# before being handed to the root logger. This step is best-effort: on any
# problem the error is logged and the previously configured level stays active.
try:
    logging.getLogger().setLevel(config['logging']['log_level'].upper())
except (KeyError, AttributeError, TypeError, ValueError) as e:
    # KeyError: the 'logging' section or the 'log_level' key is missing.
    # AttributeError / TypeError: the configured value is not a string
    # (or the 'logging' section is not a mapping).
    # ValueError: the string is not a level name known to the logging module.
    logging.error(f"Could not set log level. \n{e}", exc_info=True)
|
|
|
|
# Read download history from csv file
# The history lives in download_history.csv next to this script; its first
# line is the header row.
history_columns = ['file', 'download_timestamp']
try:
    df = pd.read_csv(dir_path + 'download_history.csv', header=0)
except FileNotFoundError:
    # No history file yet: start with an empty data frame that already
    # carries the expected column headers.
    df = pd.DataFrame(columns=history_columns)
|
|
|
|
# Instantiate downloader object
# The id and password are read from the 'taz' section of the config.
taz_credentials = config['taz']
taz_dl = TazDownloader(
    taz_credentials['taz_id'],
    taz_credentials['taz_password'],
)
|
|
|
|
# Work out which newspapers still have to be downloaded. Any failure here
# (scraping or history bookkeeping) is logged and ends the script with
# status 1, because there is nothing sensible to download afterwards.
try:
    # Get newspapers available for download
    newspaper_available = taz_dl.scrape_newspaper()
    # Set copy for fast membership tests. Entries are file names (they are
    # used as path components when the files are moved later on).
    available_files = set(newspaper_available)

    # Remove outdated newspaper from download_history.csv, i.e. history rows
    # whose file is no longer offered. Done in place with a boolean mask
    # instead of iterating over the rows one by one.
    outdated = ~df['file'].isin(available_files)
    df.drop(df.index[outdated], inplace=True)

    # Find newspaper which are not already downloaded, keeping the order in
    # which they were scraped.
    downloaded_files = set(df['file'])
    newspaper_to_download = [n for n in newspaper_available if n not in downloaded_files]
except Exception as e:
    logging.error(f"Could not get available newspaper from website\n{e}", exc_info=True)
    sys.exit(1)
|
|
|
|
# Download newspaper
# Each file is attempted independently: a failure is logged and the loop
# moves on to the next file. Only files for which download_newspaper()
# returns a truthy value are collected in newspaper_downloaded.
newspaper_downloaded = []
for newspaper in newspaper_to_download:
    try:
        succeeded = taz_dl.download_newspaper(newspaper)
    except Exception as err:
        logging.error(f"Could not download {newspaper}\n{err}", exc_info=True)
        continue
    if succeeded:
        newspaper_downloaded.append(newspaper)
|
|
|
|
# Add downloaded newspaper to download_history.csv
# Best-effort: any failure is logged and the script carries on, so that the
# downloaded files are still moved to the download folder afterwards.
try:
    if newspaper_downloaded:
        # DataFrame.append was removed in pandas 2.0, so all new rows are
        # built in one frame and concatenated once. The whole batch shares
        # one timestamp, taken when the history is updated.
        timestamp = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        df_new = pd.DataFrame(
            {
                'file': list(newspaper_downloaded),
                'download_timestamp': [timestamp] * len(newspaper_downloaded),
            }
        )
        # Skip an empty history frame so concat does not have to guess
        # dtypes from a frame without rows.
        frames = [frame for frame in (df, df_new) if not frame.empty]
        df = pd.concat(frames, ignore_index=True)
    # Newest downloads first, then persist the history next to this script.
    df.sort_values(by='download_timestamp', ascending=False, inplace=True)
    df.to_csv(dir_path + 'download_history.csv', index=False)
except Exception as e:
    logging.error(f"Could not update download_history.csv\n{e}", exc_info=True)
|
|
|
|
# Move downloaded file to download folder
# Files are taken from the tmp/ folder next to this script. Each move is
# attempted independently; a failure is logged and the loop continues.
configured_folder = config['download_folder']
if os.path.isdir(configured_folder):
    # Make sure the target path ends with a slash.
    download_folder = configured_folder if configured_folder.endswith('/') else configured_folder + "/"
    for n in newspaper_downloaded:
        try:
            shutil.move(dir_path + 'tmp/' + n, download_folder)
        except Exception as e:
            logging.error(f"Could not move file to download folder \"{download_folder}\"\n{e}", exc_info=True)
elif newspaper_downloaded:
    # The files were downloaded but cannot be delivered. Say so instead of
    # silently leaving them behind in tmp/.
    logging.error(
        f"Download folder \"{configured_folder}\" is not an existing directory; "
        f"downloaded files remain in \"{dir_path}tmp/\""
    )
|