Merge branch 'dev' into main
This commit is contained in:
commit
76c24041e0
41
main.py
41
main.py
|
|
@ -9,23 +9,25 @@ from models import TazDownloader, TazConfiguration
|
|||
from exceptions import TazConfigurationError, TazDownloadError, TazDownloadFormatException
|
||||
|
||||
# Get directory
|
||||
dir_path = os.path.dirname(os.path.realpath(__file__)) + '/'
|
||||
dir_path = os.path.dirname(os.path.realpath(__file__))
|
||||
|
||||
|
||||
def main(config: dict):
|
||||
|
||||
# Get tomorrow's date in the German time zone (Europe/Berlin)
|
||||
tomorrow = (datetime.now(pytz.timezone('Europe/Berlin')) + timedelta(1)).strftime('%Y_%m_%d')
|
||||
|
||||
# Define tmp/ folder
|
||||
tmp_folder = os.path.join(dir_path, 'tmp')
|
||||
|
||||
# Set log level
|
||||
try:
|
||||
logging.getLogger().setLevel(config['log_level'].upper())
|
||||
except ValueError as e:
|
||||
logging.error(f"Could not set log level.\n{e}", exc_info=True)
|
||||
logging.error(f"Could not set log level.\n {e}")
|
||||
|
||||
# Read download history from csv file
|
||||
try:
|
||||
df = pd.read_csv(dir_path + 'download_history.csv', header=0)
|
||||
df = pd.read_csv(os.path.join(dir_path, 'download_history.csv'), header=0)
|
||||
except FileNotFoundError:
|
||||
# If there is no csv file yet, create a data frame with headers
|
||||
df = pd.DataFrame(
|
||||
|
|
@ -42,14 +44,13 @@ def main(config: dict):
|
|||
logging.info('Tomorrow\'s newspaper was already downloaded. Execution canceled.')
|
||||
sys.exit(0)
|
||||
except Exception as e:
|
||||
logging.error(f"Could not check whether tomorrow's newspaper has already been downloaded.\n{e}",
|
||||
exc_info=True)
|
||||
logging.error(f"Could not check whether tomorrow's newspaper has already been downloaded.\n {e}")
|
||||
|
||||
# Instantiate downloader object
|
||||
try:
|
||||
taz_dl = TazDownloader(config['id'], config['password'], config['download_format'])
|
||||
except TazDownloadFormatException as e:
|
||||
logging.error(e, exc_info=True)
|
||||
logging.error(e)
|
||||
sys.exit(1)
|
||||
|
||||
try:
|
||||
|
|
@ -62,17 +63,17 @@ def main(config: dict):
|
|||
# Find newspapers which have not been downloaded yet
|
||||
newspaper_to_download = [n for n in newspaper_available if n not in df.file.values]
|
||||
except TazDownloadError as e:
|
||||
logging.error(e, exc_info=True)
|
||||
logging.error(e)
|
||||
sys.exit(1)
|
||||
|
||||
# Download newspaper
|
||||
newspaper_downloaded = []
|
||||
for n in newspaper_to_download:
|
||||
try:
|
||||
if taz_dl.download_newspaper(n):
|
||||
if taz_dl.download_newspaper(n, tmp_folder):
|
||||
newspaper_downloaded.append(n)
|
||||
except Exception as e:
|
||||
logging.error(f"Could not download {n}\n{e}", exc_info=True)
|
||||
logging.error(f"Could not download {n}\n {e}")
|
||||
|
||||
# Add downloaded newspaper to download_history.csv
|
||||
try:
|
||||
|
|
@ -85,26 +86,34 @@ def main(config: dict):
|
|||
)
|
||||
df = df.append(df_tmp, ignore_index=True)
|
||||
df.sort_values(by='file', ascending=False, inplace=True)
|
||||
df.to_csv(dir_path + 'download_history.csv', index=False)
|
||||
df.to_csv(os.path.join(dir_path, 'download_history.csv'), index=False)
|
||||
except Exception as e:
|
||||
logging.error(f"Could not update download_history.csv\n{e}", exc_info=True)
|
||||
logging.error(f"Could not update download_history.csv\n {e}")
|
||||
|
||||
# Move downloaded file to download folder
|
||||
newspaper_downloaded_string = "\n ".join(newspaper_downloaded)
|
||||
if os.path.isdir(config['download_folder']):
|
||||
download_folder = \
|
||||
config['download_folder'] if config['download_folder'].endswith('/') else config['download_folder'] + "/"
|
||||
config['download_folder'] \
|
||||
if config['download_folder'].endswith(os.path.sep) \
|
||||
else config['download_folder'] + os.path.sep
|
||||
for n in newspaper_downloaded:
|
||||
try:
|
||||
shutil.move(dir_path + 'tmp/' + n, download_folder)
|
||||
shutil.move(os.path.join(tmp_folder, n), download_folder)
|
||||
except Exception as e:
|
||||
logging.error(f"Could not move file to download folder \"{download_folder}\"\n{e}", exc_info=True)
|
||||
logging.error(f"Could not move {n} to download folder \"{download_folder}\"\n {e}")
|
||||
if newspaper_downloaded:
|
||||
logging.info(f"Downloaded\n {newspaper_downloaded_string}\n to {config['download_folder']}")
|
||||
else:
|
||||
logging.error(f"{config['download_folder']} does not exists.\n {newspaper_downloaded_string}"
|
||||
f"\n downloaded to {tmp_folder}")
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
|
||||
# Set up logging
|
||||
logging.basicConfig(
|
||||
filename=dir_path + 'tazPlease.log',
|
||||
filename=os.path.join(dir_path, 'tazPlease.log'),
|
||||
level=logging.ERROR,
|
||||
format='%(asctime)s - %(message)s'
|
||||
)
|
||||
|
|
|
|||
40
models.py
40
models.py
|
|
@ -1,13 +1,13 @@
|
|||
import os
|
||||
import requests
|
||||
from requests.exceptions import HTTPError
|
||||
from exceptions import TazDownloadFormatException
|
||||
from exceptions import TazDownloadError
|
||||
from exceptions import TazDownloadFormatException, TazConfigurationError, TazDownloadError
|
||||
from bs4 import BeautifulSoup
|
||||
from envyaml import EnvYAML
|
||||
import argparse
|
||||
import filetype
|
||||
|
||||
dir_path = os.path.dirname(os.path.realpath(__file__)) + '/'
|
||||
dir_path = os.path.dirname(os.path.realpath(__file__))
|
||||
|
||||
|
||||
class TazConfiguration:
|
||||
|
|
@ -43,7 +43,7 @@ class TazConfiguration:
|
|||
def _load_config(self):
|
||||
# Try to load config.yaml
|
||||
try:
|
||||
conf_yaml = EnvYAML(dir_path + 'config.yaml', dir_path + '.env')
|
||||
conf_yaml = EnvYAML(os.path.join(dir_path, 'config.yaml'), os.path.join(dir_path, '.env'))
|
||||
except Exception as e:
|
||||
raise Exception(f"Something went wrong when reading config.yaml.\n{e}")
|
||||
|
||||
|
|
@ -141,17 +141,17 @@ class TazDownloader:
|
|||
except HTTPError as http_e:
|
||||
raise TazDownloadError(f"Could not scrape available newspaper editions:\n{http_e}")
|
||||
|
||||
def download_newspaper(self, taz: str, download_folder: str = dir_path + 'tmp/'):
|
||||
def download_newspaper(self, taz: str, download_folder: str = os.path.join(dir_path, 'tmp')):
|
||||
"""
|
||||
Downloads a newspaper from dl.taz.de and stores it in tmp/
|
||||
Downloads a newspaper from dl.taz.de and stores it in download_folder (the tmp folder by default)
|
||||
"""
|
||||
|
||||
# Check if folder exists
|
||||
try:
|
||||
if not os.path.isdir(dir_path):
|
||||
os.mkdirs(dir_path)
|
||||
if not os.path.isdir(download_folder):
|
||||
os.makedirs(download_folder)
|
||||
except Exception as e:
|
||||
raise TazDownloadError(f"Could find or create \"{dir_path}\":\n{e}")
|
||||
raise TazDownloadError(f"Could find or create \"{download_folder}\":\n{e}")
|
||||
|
||||
# download taz
|
||||
try:
|
||||
|
|
@ -167,11 +167,25 @@ class TazDownloader:
|
|||
}
|
||||
) as r:
|
||||
# write response to file
|
||||
with open(download_folder + taz, "wb") as f:
|
||||
with open(os.path.join(download_folder, taz), "wb") as f:
|
||||
for chunk in r.iter_content(chunk_size=8192):
|
||||
f.write(chunk)
|
||||
# Unfortunately, the taz website does not respond with an http error code if the credentials are wrong.
|
||||
# So we have to check if the response is a pdf file or the html page with an error message.
|
||||
try:
|
||||
if filetype.guess(os.path.join(download_folder, taz)).mime != 'application/pdf':
|
||||
raise TazDownloadError()
|
||||
except (AttributeError, TazDownloadError) as e:
|
||||
# Try to get the error message from the html file to put it in the log
|
||||
with open(os.path.join(download_folder, taz), 'r') as f:
|
||||
soup = BeautifulSoup(f.read(), 'html.parser')
|
||||
error_displayed_on_page = soup.find('p', class_='error').text
|
||||
if error_displayed_on_page:
|
||||
os.remove(os.path.join(download_folder, taz))
|
||||
raise TazDownloadError(error_displayed_on_page)
|
||||
else:
|
||||
os.remove(os.path.join(download_folder, taz))
|
||||
raise TazDownloadError(e)
|
||||
return True
|
||||
except HTTPError as http_e:
|
||||
raise TazDownloadError(f"Could not download taz:\n{http_e}")
|
||||
except Exception as e:
|
||||
raise TazDownloadError(f"Something went wrong:\n{e}")
|
||||
raise TazDownloadError(http_e)
|
||||
|
|
|
|||
Loading…
Reference in New Issue