🎉 Initial commit

2021-08-15 20:00:25 +02:00 · 2021-08-15 20:00:25 +02:00 · 6b7dcda8d7
commit 6b7dcda8d7
9 changed files with 343 additions and 0 deletions
--- a/.example_env
+++ b/.example_env
@ -0,0 +1,2 @@
+TAZ_ID="my@taz.id"
+TAZ_PASSWORD="secret_password"
--- a/.gitignore
+++ b/.gitignore
@ -0,0 +1,134 @@
+download_history.csv
+tazPlease.log
+config.yaml
+
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+pip-wheel-metadata/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+.hypothesis/
+.pytest_cache/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+.python-version
+
+# pipenv
+#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+#   However, in case of collaboration, if having platform-specific dependencies or dependencies
+#   having no cross-platform support, pipenv may install dependencies that don't work, or not
+#   install all needed dependencies.
+#Pipfile.lock
+
+# celery beat schedule file
+celerybeat-schedule
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
--- a/LICENSE.md
+++ b/LICENSE.md
@ -0,0 +1,22 @@
+
+The MIT License (MIT)
+
+Copyright (c) 2021 Marc Michalsky
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
--- a/README.md
+++ b/README.md
@ -0,0 +1 @@
+tazPlease
--- a/example_config.yaml
+++ b/example_config.yaml
@ -0,0 +1,9 @@
+taz:
+  taz_id: ${TAZ_ID}
+  taz_password: ${TAZ_PASSWORD}
+  download_format: "pdf" # Valid formats are: pdf, epub, epubt, html, ascii, mobi, mobit
+
+download_folder: "/path/to/download/folder"
+
+logging:
+  log_level: "info"
--- a/exceptions.py
+++ b/exceptions.py
@ -0,0 +1,14 @@
+class TazDownloadFormatException(Exception):
+    """
+    Raised when an unsupported download format is requested.
+
+    The rejected value is kept in ``format`` and named in the error message.
+    """
+
+    def __init__(self, download_format: str = ""):
+        """
+        :param download_format: the rejected format; optional so that a bare
+                                ``raise TazDownloadFormatException`` keeps working
+        """
+        super().__init__(download_format)
+        self.format = download_format
+
+    def __str__(self):
+        return f"\"{self.format}\" is not a valid download format." \
+               f"\nValid formats are: pdf, epub, epubt, html, ascii, mobi, mobit"
+
+
+class TazDownloadError(Exception):
+    """
+    Raised when a download fails; carries a human-readable description.
+    """
+
+    def __init__(self, message: str = ""):
+        """
+        :param message: description of what went wrong (also returned by ``str()``)
+        """
+        super().__init__(message)
+        self.message = message
--- a/main.py
+++ b/main.py
@ -0,0 +1,92 @@
+import sys
+import os
+import datetime
+import logging
+import shutil
+from envyaml import EnvYAML
+from models import TazDownloader
+import pandas as pd
+
+# Directory containing this script (with trailing slash); the log file, config.yaml,
+# .env and download_history.csv are all addressed relative to it.
+dir_path = os.path.dirname(os.path.realpath(__file__)) + '/'
+
+# Set up logging. Start at ERROR so that problems while loading the
+# configuration are recorded before the configured level is known.
+logging.basicConfig(
+    filename=dir_path + 'tazPlease.log',
+    level=logging.ERROR,
+    format='%(asctime)s - %(message)s'
+)
+
+# Load configuration (config.yaml plus the variables from .env)
+try:
+    config = EnvYAML(dir_path + 'config.yaml', dir_path + '.env')
+except Exception:
+    # Nothing useful can be done without a configuration
+    logging.error('Could not load config.yaml', exc_info=True)
+    sys.exit(1)
+
+# Set log level
+try:
+    logging.getLogger().setLevel(config['logging']['log_level'].upper())
+except (KeyError, TypeError, AttributeError, ValueError) as e:
+    # A missing "logging" section, a non-string value or an unknown level name
+    # must not abort the run: keep the ERROR level configured above.
+    logging.error(f"Could not set log level. \n{e}", exc_info=True)
+
+# Read download history from csv file
+try:
+    df = pd.read_csv(dir_path + 'download_history.csv', header=0)
+except FileNotFoundError:
+    # In case, there isn't yet a csv file, create data frame with headers
+    df = pd.DataFrame(
+        columns=[
+            'file',
+            'download_timestamp',
+        ]
+    )
+
+# Instantiate downloader object
+try:
+    taz_config = config['taz']
+    # Honour the configured format. The misspelled key "dowload_format" is
+    # accepted as well because it may appear in existing config files.
+    download_format = taz_config.get('download_format', taz_config.get('dowload_format', 'pdf'))
+    taz_dl = TazDownloader(taz_config['taz_id'], taz_config['taz_password'], download_format)
+except Exception as e:
+    logging.error(f"Could not set up downloader\n{e}", exc_info=True)
+    sys.exit(1)
+
+try:
+    # Get newspapers available for download
+    newspaper_available = taz_dl.scrape_newspaper()
+
+    # Remove outdated newspaper from download_history.csv:
+    # drop every row whose file is no longer offered for download
+    df.drop(df[~df['file'].isin(newspaper_available)].index, inplace=True)
+
+    # Find newspaper which are not already downloaded
+    already_downloaded = set(df['file'])
+    newspaper_to_download = [n for n in newspaper_available if n not in already_downloaded]
+except Exception as e:
+    logging.error(f"Could not get available newspaper from website\n{e}", exc_info=True)
+    sys.exit(1)
+
+# Download newspaper; a failure for one issue must not stop the remaining ones
+newspaper_downloaded = []
+for n in newspaper_to_download:
+    try:
+        if taz_dl.download_newspaper(n):
+            newspaper_downloaded.append(n)
+    except Exception as e:
+        logging.error(f"Could not download {n}\n{e}", exc_info=True)
+
+# Add downloaded newspaper to download_history.csv
+try:
+    if newspaper_downloaded:
+        # One timestamp for the whole run; all rows are added with a single
+        # concat instead of the deprecated DataFrame.append in a loop.
+        timestamp = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
+        df_new = pd.DataFrame(
+            {
+                'file': newspaper_downloaded,
+                'download_timestamp': [timestamp] * len(newspaper_downloaded),
+            }
+        )
+        df = pd.concat([df, df_new], ignore_index=True)
+    # Written even when nothing was downloaded, so that pruned entries are persisted
+    df.sort_values(by='download_timestamp', ascending=False, inplace=True)
+    df.to_csv(dir_path + 'download_history.csv', index=False)
+except Exception as e:
+    logging.error(f"Could not update download_history.csv\n{e}", exc_info=True)
+
+# Move downloaded file to download folder
+download_folder = config['download_folder']
+if os.path.isdir(download_folder):
+    if not download_folder.endswith('/'):
+        download_folder += "/"
+    for n in newspaper_downloaded:
+        try:
+            shutil.move(dir_path + 'tmp/' + n, download_folder + n)
+        except Exception as e:
+            logging.error(f"Could not move file to download folder \"{download_folder}\"\n{e}", exc_info=True)
+elif newspaper_downloaded:
+    # The files stay in tmp/; make the misconfiguration visible instead of silently skipping the move
+    logging.error(f"Download folder \"{download_folder}\" does not exist. Downloaded files remain in \"{dir_path}tmp/\"")
--- a/models.py
+++ b/models.py
@ -0,0 +1,65 @@
+import os
+import requests
+from requests.exceptions import HTTPError
+from exceptions import TazDownloadFormatException
+from exceptions import TazDownloadError
+from bs4 import BeautifulSoup
+
+# Directory containing this module (with trailing slash); base of the default download folder
+dir_path = os.path.dirname(os.path.realpath(__file__)) + '/'
+
+
+class TazDownloader:
+    """
+    Lists and downloads newspapers from https://dl.taz.de/ in one fixed download format.
+    """
+    download_formats = ["pdf", "epub", "epubt", "html", "ascii", "mobi", "mobit"]
+    BASE_URL = "https://dl.taz.de/"
+    HEADERS = {"User-agent": 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) '
+                             'Chrome/79.0.3945.130 Safari/537.36'}
+    # Seconds to wait for the server; without a timeout requests can block indefinitely
+    TIMEOUT = 60
+
+    def __init__(self, taz_id: str, password: str, download_format: str = "pdf"):
+        """
+        :param taz_id: login name, sent as ``name`` with every download request
+        :param password: password sent with every download request
+        :param download_format: one of ``download_formats``; appended to BASE_URL
+        :raises TazDownloadFormatException: if ``download_format`` is not in ``download_formats``
+        """
+        self.taz_id = taz_id
+        self.password = password
+        if download_format not in self.download_formats:
+            # Hand over the rejected value so the exception can name it
+            raise TazDownloadFormatException(download_format)
+        self.download_url = self.BASE_URL + download_format
+
+    def scrape_newspaper(self) -> list:
+        """
+        Scrapes the newspaper available for download from https://dl.taz.de/
+        :return: a list of file names (str)
+        :raises requests.exceptions.HTTPError: if the server answers with an error status
+        :raises TazDownloadError: if the page contains no <select> element
+        """
+        page = requests.get(self.download_url, headers=self.HEADERS, timeout=self.TIMEOUT)
+        # Do not try to parse an error page
+        page.raise_for_status()
+        soup = BeautifulSoup(page.content, 'html.parser')
+        select = soup.find("select")
+        if select is None:
+            raise TazDownloadError(f"No list of newspapers found at {self.download_url}")
+        return [n['value'] for n in select.find_all("option")]
+
+    def download_newspaper(self, taz: str, download_folder: str = dir_path + 'tmp/'):
+        """
+        Downloads a newspaper from dl.taz.de and stores it in ``download_folder``.
+        :param taz: file name of the newspaper; sent as ``id`` and used as local file name
+        :param download_folder: target directory (default: tmp/ next to this module), created if missing
+        :return: True once the file has been written
+        :raises TazDownloadError: if the request or writing the file fails
+        """
+
+        # download taz
+        try:
+            # The folder is not created anywhere else, so make sure it exists
+            os.makedirs(download_folder, exist_ok=True)
+            # NOTE(review): the credentials are sent as URL query parameters of a GET request
+            with requests.get(
+                    self.download_url,
+                    stream=True,
+                    headers=self.HEADERS,
+                    params={
+                        'name': self.taz_id,
+                        'password': self.password,
+                        'id': taz,
+                        'Laden': '+Laden+',
+                    },
+                    timeout=self.TIMEOUT,
+            ) as r:
+                # Turn HTTP error statuses into HTTPError instead of saving the error page
+                r.raise_for_status()
+                # write response to file
+                with open(os.path.join(download_folder, taz), "wb") as f:
+                    for chunk in r.iter_content(chunk_size=8192):
+                        f.write(chunk)
+            return True
+        except HTTPError as http_e:
+            raise TazDownloadError(f"Could not download taz:\n{http_e}") from http_e
+        except Exception as e:
+            raise TazDownloadError(f"Something went wrong:\n{e}") from e
--- a/requirements.txt
+++ b/requirements.txt
@ -0,0 +1,4 @@
+pandas~=1.3.2
+envyaml~=1.8.210417
+requests~=2.26.0
+beautifulsoup4~=4.9.3