From 6b7dcda8d78f3ffd86e9a843872a50b14d395154 Mon Sep 17 00:00:00 2001 From: Marc Michalsky Date: Sun, 15 Aug 2021 20:00:25 +0200 Subject: [PATCH] =?UTF-8?q?=F0=9F=8E=89=20Initial=20commit?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .example_env | 2 + .gitignore | 134 ++++++++++++++++++++++++++++++++++++++++++++ LICENSE.md | 22 ++++++++ README.md | 1 + example_config.yaml | 9 +++ exceptions.py | 14 +++++ main.py | 92 ++++++++++++++++++++++++++++++ models.py | 65 +++++++++++++++++++++ requirements.txt | 4 ++ 9 files changed, 343 insertions(+) create mode 100644 .example_env create mode 100644 .gitignore create mode 100644 LICENSE.md create mode 100644 README.md create mode 100644 example_config.yaml create mode 100644 exceptions.py create mode 100644 main.py create mode 100644 models.py create mode 100644 requirements.txt diff --git a/.example_env b/.example_env new file mode 100644 index 0000000..9e7733f --- /dev/null +++ b/.example_env @@ -0,0 +1,2 @@ +TAZ_ID="my@taz.id" +TAZ_PASSWORD="secret_password" diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..555d79b --- /dev/null +++ b/.gitignore @@ -0,0 +1,134 @@ +download_history.csv +tazPlease.log +config.yaml + +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +pip-wheel-metadata/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +#, +Usually these files are written by a python script from a template +#, +before PyInstaller builds the exe, so as to inject date/other infos into it. 
+*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +.hypothesis/ +.pytest_cache/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +.python-version + +# pipenv +#, + According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +#, + However, in case of collaboration, if having platform-specific dependencies or dependencies +#, + having no cross-platform support, pipenv may install dependencies that don't work, or not +#, + install all needed dependencies. +#Pipfile.lock + +# celery beat schedule file +celerybeat-schedule + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ diff --git a/LICENSE.md b/LICENSE.md new file mode 100644 index 0000000..9d7c04b --- /dev/null +++ b/LICENSE.md @@ -0,0 +1,22 @@ + +The MIT License (MIT) + +Copyright (c) 2021 Marc Michalsky + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright 
class TazDownloadFormatException(Exception):
    """Raised when a requested download format is not one of the valid ones.

    :param format: the rejected format string. Optional, so that a bare
        ``raise TazDownloadFormatException`` keeps working.
    """

    # NOTE: ``format`` shadows the builtin of the same name; the parameter
    # name is kept as declared for compatibility.
    def __init__(self, format: str = None):
        # Forward the value to Exception only when there is one, so that
        # ``args`` stays empty for a bare raise.
        if format is None:
            super().__init__()
        else:
            super().__init__(format)
        self.format = format

    def __str__(self):
        valid = "Valid formats are: pdf, epub, epubt, html, ascii, mobi, mobit"
        if self.format is None:
            # No offending value was supplied; still list the valid formats.
            return f"Not a valid download format.\n{valid}"
        return f"\"{self.format}\" is not a valid download format.\n{valid}"


class TazDownloadError(Exception):
    """Raised when something goes wrong while downloading.

    :param format: the error message. It is available both through
        ``str(error)`` and as ``error.format``.
    """

    def __init__(self, format: str = None):
        # Pass the message on to Exception so that str(error) and the
        # logged traceback actually show it.
        if format is None:
            super().__init__()
        else:
            super().__init__(format)
        self.format = format
\n{e}", exc_info=True) + +# Read download history from csv file +try: + df = pd.read_csv(dir_path + 'download_history.csv', header=0) +except FileNotFoundError: + # In case, there isn't yet a csv file, create data frame with headers + df = pd.DataFrame( + columns=[ + 'file', + 'download_timestamp', + ] + ) + +# Instantiate downloader object +taz_dl = TazDownloader(config['taz']['taz_id'], config['taz']['taz_password']) + +try: + # Get newspapers available for download + newspaper_available = taz_dl.scrape_newspaper() + + # Remove outdated newspaper from download_history.csv + df.drop([f.index for f in df['file'] if f not in newspaper_available], inplace=True) + + # Find newspaper which are not already downloaded + newspaper_to_download = [n for n in newspaper_available if n not in df.file.values] +except Exception as e: + logging.error(f"Could get available newspaper from website\n{e}", exc_info=True) + sys.exit(1) + +# Download newspaper +newspaper_downloaded = [] +for n in newspaper_to_download: + try: + if taz_dl.download_newspaper(n): + newspaper_downloaded.append(n) + except Exception as e: + logging.error(f"Could not download {n}\n{e}", exc_info=True) + +# Add downloaded newspaper to download_history.csv +try: + for n in newspaper_downloaded: + df_tmp = pd.DataFrame( + { + 'file': [n], + 'download_timestamp': [datetime.datetime.strftime(datetime.datetime.now(), '%Y-%m-%d %H:%M:%S')], + } + ) + df = df.append(df_tmp, ignore_index=True) + df.sort_values(by='download_timestamp', ascending=False, inplace=True) + df.to_csv(dir_path + 'download_history.csv', index=False) +except Exception as e: + logging.error(f"Could not update download_history.csv\n{e}", exc_info=True) + +# Move downloaded file to download folder +if os.path.isdir(config['download_folder']): + download_folder = \ + config['download_folder'] if config['download_folder'].endswith('/') else config['download_folder'] + "/" + for n in newspaper_downloaded: + try: + shutil.move(dir_path + 'tmp/' + n, 
# Absolute directory of this module, with a trailing slash. Downloads are
# staged in the "tmp/" folder below it.
dir_path = os.path.dirname(os.path.realpath(__file__)) + '/'


class TazDownloader:
    """Client for the taz download page at https://dl.taz.de/.

    Lists the issues available for download and downloads single issues.
    The download format is fixed when the object is created.
    """

    # Formats accepted by __init__; each one maps to BASE_URL + format.
    download_formats = ["pdf", "epub", "epubt", "html", "ascii", "mobi", "mobit"]
    BASE_URL = "https://dl.taz.de/"
    HEADERS = {"User-agent": 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) '
                             'Chrome/79.0.3945.130 Safari/537.36'}
    # Seconds to wait for the server before giving up. Without a timeout
    # requests may block forever on an unresponsive server.
    TIMEOUT = 30

    def __init__(self, taz_id: str, password: str, download_format: str = "pdf"):
        """
        :param taz_id: login name sent as the ``name`` request parameter
        :param password: password sent as the ``password`` request parameter
        :param download_format: one of ``download_formats``
        :raises TazDownloadFormatException: if download_format is not valid
        """
        if download_format not in self.download_formats:
            # Hand over the rejected value so the error message can name it.
            raise TazDownloadFormatException(download_format)
        self.taz_id = taz_id
        self.password = password
        self.download_url = self.BASE_URL + download_format

    def scrape_newspaper(self) -> list:
        """
        Scrapes the newspaper available for download from https://dl.taz.de/
        :return: a list of file names (str)
        :raises requests.exceptions.HTTPError: if the page request fails
        :raises TazDownloadError: if the page contains no <select> element
        """
        page = requests.get(self.download_url, headers=self.HEADERS, timeout=self.TIMEOUT)
        # Do not try to parse an error page as if it were the issue list.
        page.raise_for_status()
        soup = BeautifulSoup(page.content, 'html.parser')
        select = soup.find("select")
        if select is None:
            raise TazDownloadError(f"No list of newspaper found at {self.download_url}")
        return [option['value'] for option in select.find_all("option")]

    def download_newspaper(self, taz: str, download_folder: str = dir_path + 'tmp/'):
        """
        Downloads a newspaper from dl.taz.de and stores it in download_folder
        (by default the "tmp/" folder next to this module).

        :param taz: file name of the issue, as returned by scrape_newspaper
        :param download_folder: folder the file is written to; it is created
            if it does not exist yet
        :return: True once the file has been written completely
        :raises TazDownloadError: if the request or writing the file fails
        """
        # os.path.join works with and without a trailing slash on the folder.
        target = os.path.join(download_folder, taz)

        try:
            os.makedirs(download_folder, exist_ok=True)
            with requests.get(
                    self.download_url,
                    stream=True,
                    headers=self.HEADERS,
                    timeout=self.TIMEOUT,
                    params={
                        'name': self.taz_id,
                        'password': self.password,
                        'id': taz,
                        'Laden': '+Laden+',
                    }
            ) as r:
                # Turn 4xx/5xx answers into HTTPError instead of saving the
                # error body under the name of the newspaper.
                # NOTE(review): if the server answers a failed login with
                # HTTP 200, that page would still be stored - confirm.
                r.raise_for_status()
                # write response to file
                with open(target, "wb") as f:
                    for chunk in r.iter_content(chunk_size=8192):
                        f.write(chunk)
            return True
        except HTTPError as http_e:
            raise TazDownloadError(f"Could not download taz:\n{http_e}") from http_e
        except Exception as e:
            raise TazDownloadError(f"Something went wrong:\n{e}") from e