From 70e5d574c5d20ded6e1d80802bb4b5da871cd515 Mon Sep 17 00:00:00 2001 From: Marc Michalsky Date: Sun, 12 Sep 2021 15:19:56 +0200 Subject: [PATCH 1/3] =?UTF-8?q?=E2=9C=A8=20use=20os.path.join()=20instead?= =?UTF-8?q?=20of=20string=20concatenation=20to=20join=20paths?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This change should make the program compatible with other operating systems. --- main.py | 20 ++++++++++++-------- models.py | 16 ++++++++-------- 2 files changed, 20 insertions(+), 16 deletions(-) diff --git a/main.py b/main.py index 3992cfc..dffa08b 100644 --- a/main.py +++ b/main.py @@ -9,14 +9,16 @@ from models import TazDownloader, TazConfiguration from exceptions import TazConfigurationError, TazDownloadError, TazDownloadFormatException # Get directory -dir_path = os.path.dirname(os.path.realpath(__file__)) + '/' +dir_path = os.path.dirname(os.path.realpath(__file__)) def main(config: dict): - # Get german date for tomorrow tomorrow = (datetime.now(pytz.timezone('Europe/Berlin')) + timedelta(1)).strftime('%Y_%m_%d') + # Define tmp/ folder + tmp_folder = os.path.join(dir_path, 'tmp') + # Set log level try: logging.getLogger().setLevel(config['log_level'].upper()) @@ -25,7 +27,7 @@ def main(config: dict): # Read download history from csv file try: - df = pd.read_csv(dir_path + 'download_history.csv', header=0) + df = pd.read_csv(os.path.join(dir_path, 'download_history.csv'), header=0) except FileNotFoundError: # In case, there isn't yet a csv file, create data frame with headers df = pd.DataFrame( @@ -69,7 +71,7 @@ def main(config: dict): newspaper_downloaded = [] for n in newspaper_to_download: try: - if taz_dl.download_newspaper(n): + if taz_dl.download_newspaper(n, tmp_folder): newspaper_downloaded.append(n) except Exception as e: logging.error(f"Could not download {n}\n{e}", exc_info=True) @@ -85,17 +87,19 @@ def main(config: dict): ) df = df.append(df_tmp, ignore_index=True) df.sort_values(by='file', ascending=False, inplace=True) - df.to_csv(dir_path + 'download_history.csv', index=False) + df.to_csv(os.path.join(dir_path, 'download_history.csv'), index=False) except Exception as e: logging.error(f"Could not update download_history.csv\n{e}", exc_info=True) # Move downloaded file to download folder if os.path.isdir(config['download_folder']): download_folder = \ - config['download_folder'] if config['download_folder'].endswith('/') else config['download_folder'] + "/" + config['download_folder'] \ + if config['download_folder'].endswith(os.path.sep) \ + else config['download_folder'] + os.path.sep for n in newspaper_downloaded: try: - shutil.move(dir_path + 'tmp/' + n, download_folder) + shutil.move(os.path.join(tmp_folder, n), download_folder) except Exception as e: logging.error(f"Could not move file to download folder \"{download_folder}\"\n{e}", exc_info=True) @@ -104,7 +108,7 @@ if __name__ == '__main__': # Set up logging logging.basicConfig( - filename=dir_path + 'tazPlease.log', + filename=os.path.join(dir_path, 'tazPlease.log'), level=logging.ERROR, format='%(asctime)s - %(message)s' ) diff --git a/models.py b/models.py index ffb1571..81bd11c 100644 --- a/models.py +++ b/models.py @@ -7,7 +7,7 @@ from bs4 import BeautifulSoup from envyaml import EnvYAML import argparse -dir_path = os.path.dirname(os.path.realpath(__file__)) + '/' +dir_path = os.path.dirname(os.path.realpath(__file__)) class TazConfiguration: @@ -43,7 +43,7 @@ class TazConfiguration: def _load_config(self): # Try to load config.yaml try: - conf_yaml = EnvYAML(dir_path + 'config.yaml', dir_path + '.env') + conf_yaml = EnvYAML(os.path.join(dir_path, 'config.yaml'), os.path.join(dir_path, '.env')) except Exception as e: raise Exception(f"Something went wrong when reading config.yaml.\n{e}") @@ -141,17 +141,17 @@ class TazDownloader: except HTTPError as http_e: raise TazDownloadError(f"Could not scrape available newspaper editions:\n{http_e}") - def download_newspaper(self, taz: str, download_folder: str = dir_path + 'tmp/'): + def download_newspaper(self, taz: str, download_folder: str = os.path.join(dir_path, 'tmp')): """ - Downloads a newspaper from dl.taz.de and stores it in tmp/ + Downloads a newspaper from dl.taz.de and stores it in tmp folder """ # Check if folder exists try: - if not os.path.isdir(dir_path): - os.mkdirs(dir_path) + if not os.path.isdir(download_folder): + os.makedirs(download_folder) except Exception as e: - raise TazDownloadError(f"Could find or create \"{dir_path}\":\n{e}") + raise TazDownloadError(f"Could find or create \"{download_folder}\":\n{e}") # download taz try: @@ -167,7 +167,7 @@ class TazDownloader: } ) as r: # write response to file - with open(download_folder + taz, "wb") as f: + with open(os.path.join(download_folder, taz), "wb") as f: for chunk in r.iter_content(chunk_size=8192): f.write(chunk) return True From e1380d81c2d20218219d594321b45887581a7d70 Mon Sep 17 00:00:00 2001 From: Marc Michalsky Date: Sun, 12 Sep 2021 15:23:01 +0200 Subject: [PATCH 2/3] =?UTF-8?q?=E2=9C=A8=20add=20error=20handling=20for=20?= =?UTF-8?q?wrong=20credentials?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Unfortunately, the taz website does not respond with an http error code if the credentials are wrong. So we have to check if the response is a pdf file or the html page with an error message. --- models.py | 24 +++++++++++++++++++----- 1 file changed, 19 insertions(+), 5 deletions(-) diff --git a/models.py b/models.py index 81bd11c..8e24ef5 100644 --- a/models.py +++ b/models.py @@ -1,11 +1,11 @@ import os import requests from requests.exceptions import HTTPError -from exceptions import TazDownloadFormatException -from exceptions import TazDownloadError +from exceptions import TazDownloadFormatException, TazConfigurationError, TazDownloadError from bs4 import BeautifulSoup from envyaml import EnvYAML import argparse +import filetype dir_path = os.path.dirname(os.path.realpath(__file__)) @@ -170,8 +170,22 @@ class TazDownloader: with open(os.path.join(download_folder, taz), "wb") as f: for chunk in r.iter_content(chunk_size=8192): f.write(chunk) + # Unfortunately, the taz website does not respond with an http error code if the credentials are wrong. + # So we have to check if the response is a pdf file or the html page with an error message. + try: + if filetype.guess(os.path.join(download_folder, taz)).mime != 'application/pdf': + raise TazDownloadError() + except (AttributeError, TazDownloadError) as e: + # Try to get the error message from the html file to put it in the log + with open(os.path.join(download_folder, taz), 'r') as f: + soup = BeautifulSoup(f.read(), 'html.parser') + error_displayed_on_page = soup.find('p', class_='error').text + if error_displayed_on_page: + os.remove(os.path.join(download_folder, taz)) + raise TazDownloadError(error_displayed_on_page) + else: + os.remove(os.path.join(download_folder, taz)) + raise TazDownloadError(e) return True except HTTPError as http_e: - raise TazDownloadError(f"Could not download taz:\n{http_e}") - except Exception as e: - raise TazDownloadError(f"Something went wrong:\n{e}") + raise TazDownloadError(http_e) From fec991792b321dab30e9fa7891fd2c557fb7de23 Mon Sep 17 00:00:00 2001 From: Marc Michalsky Date: Sun, 12 Sep 2021 15:25:58 +0200 Subject: [PATCH 3/3] =?UTF-8?q?=F0=9F=92=84nicer=20log=20messages?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- main.py | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/main.py b/main.py index dffa08b..a0011ce 100644 --- a/main.py +++ b/main.py @@ -23,7 +23,7 @@ def main(config: dict): try: logging.getLogger().setLevel(config['log_level'].upper()) except ValueError as e: - logging.error(f"Could not set log level.\n{e}", exc_info=True) + logging.error(f"Could not set log level.\n {e}") # Read download history from csv file try: @@ -44,14 +44,13 @@ def main(config: dict): logging.info('Tomorrow\'s newspaper was already downloaded. Execution canceled.') sys.exit(0) except Exception as e: - logging.error(f"Could not check whether tomorrow's newspaper has already been downloaded.\n{e}", - exc_info=True) + logging.error(f"Could not check whether tomorrow's newspaper has already been downloaded.\n {e}") # Instantiate downloader object try: taz_dl = TazDownloader(config['id'], config['password'], config['download_format']) except TazDownloadFormatException as e: - logging.error(e, exc_info=True) + logging.error(e) sys.exit(1) try: @@ -64,7 +63,7 @@ def main(config: dict): # Find newspaper which are not already downloaded newspaper_to_download = [n for n in newspaper_available if n not in df.file.values] except TazDownloadError as e: - logging.error(e, exc_info=True) + logging.error(e) sys.exit(1) # Download newspaper @@ -74,7 +73,7 @@ def main(config: dict): if taz_dl.download_newspaper(n, tmp_folder): newspaper_downloaded.append(n) except Exception as e: - logging.error(f"Could not download {n}\n{e}", exc_info=True) + logging.error(f"Could not download {n}\n {e}") # Add downloaded newspaper to download_history.csv try: @@ -89,9 +88,10 @@ def main(config: dict): df.sort_values(by='file', ascending=False, inplace=True) df.to_csv(os.path.join(dir_path, 'download_history.csv'), index=False) except Exception as e: - logging.error(f"Could not update download_history.csv\n{e}", exc_info=True) + logging.error(f"Could not update download_history.csv\n {e}") # Move downloaded file to download folder + newspaper_downloaded_string = "\n ".join(newspaper_downloaded) if os.path.isdir(config['download_folder']): download_folder = \ config['download_folder'] \ @@ -101,7 +101,12 @@ def main(config: dict): try: shutil.move(os.path.join(tmp_folder, n), download_folder) except Exception as e: - logging.error(f"Could not move file to download folder \"{download_folder}\"\n{e}", exc_info=True) + logging.error(f"Could not move {n} to download folder \"{download_folder}\"\n {e}") + if newspaper_downloaded: + logging.info(f"Downloaded\n {newspaper_downloaded_string}\n to {config['download_folder']}") + else: + logging.error(f"{config['download_folder']} does not exists.\n {newspaper_downloaded_string}" + f"\n downloaded to {tmp_folder}") if __name__ == '__main__':