🎉 Initial commit

This commit is contained in:
Marc Koch 2021-08-15 20:00:25 +02:00
commit 6b7dcda8d7
9 changed files with 343 additions and 0 deletions

2
.example_env Normal file
View File

@ -0,0 +1,2 @@
TAZ_ID="my@taz.id"
TAZ_PASSWORD="secret_password"

134
.gitignore vendored Normal file
View File

@ -0,0 +1,134 @@
download_history.csv
tazPlease.log
config.yaml
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
pip-wheel-metadata/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
#  Usually these files are written by a python script from a template
#  before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/
.pytest_cache/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
target/
# Jupyter Notebook
.ipynb_checkpoints
# IPython
profile_default/
ipython_config.py
# pyenv
.python-version
# pipenv
#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
#   However, in case of collaboration, if having platform-specific dependencies or dependencies
#   having no cross-platform support, pipenv may install dependencies that don't work, or not
#   install all needed dependencies.
#Pipfile.lock
# celery beat schedule file
celerybeat-schedule
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
.dmypy.json
dmypy.json
# Pyre type checker
.pyre/

22
LICENSE.md Normal file
View File

@ -0,0 +1,22 @@
The MIT License (MIT)
Copyright (c) 2021 Marc Michalsky
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

1
README.md Normal file
View File

@ -0,0 +1 @@
tazPlease

9
example_config.yaml Normal file
View File

@ -0,0 +1,9 @@
taz:
taz_id: ${TAZ_ID}
taz_password: ${TAZ_PASSWORD}
download_format: "pdf" # Valid formats are: pdf, epub, epubt, html, ascii, mobi, mobit
download_folder: "/path/to/download/folder"
logging:
log_level: "info"

14
exceptions.py Normal file
View File

@ -0,0 +1,14 @@
class TazDownloadFormatException(Exception):
    """Raised when an unsupported taz download format is requested.

    :param format: the rejected download format. It is stored on
        ``self.format`` and embedded in the message produced by ``str()``.
        Defaults to an empty string so the exception can also be raised
        without arguments.
    """

    def __init__(self, format: str = ""):
        # The constructor used to be misspelled ``__inti__`` and therefore
        # never ran: ``self.format`` stayed unset and ``__str__`` crashed
        # with an AttributeError instead of describing the problem.
        super().__init__(format)
        self.format = format

    def __str__(self):
        return f"\"{self.format}\" is not a valid download format." \
               f"\nValid formats are: pdf, epub, epubt, html, ascii, mobi, mobit"
class TazDownloadError(Exception):
    """Raised when scraping or downloading a newspaper fails.

    :param format: the error description. The parameter keeps its original
        name; the value is stored on ``self.format`` and is also passed to
        ``Exception`` so that ``str()`` returns it.
    """

    def __init__(self, format: str = ""):
        # Previously misspelled ``__inti__``, so this initializer was never
        # called and ``self.format`` did not exist.
        super().__init__(format)
        self.format = format

92
main.py Normal file
View File

@ -0,0 +1,92 @@
import sys
import os
import datetime
import logging
import shutil
from envyaml import EnvYAML
from models import TazDownloader
import pandas as pd
# Absolute directory of this script, with a trailing slash so file names can
# simply be appended.
dir_path = os.path.dirname(os.path.realpath(__file__)) + '/'

# Set up logging: start at ERROR so configuration problems are recorded even
# before the configured level is known.
logging.basicConfig(
    filename=dir_path + 'tazPlease.log',
    level=logging.ERROR,
    format='%(asctime)s - %(message)s'
)

# Load configuration; without it nothing else can run, so bail out.
try:
    config = EnvYAML(dir_path + 'config.yaml', dir_path + '.env')
except Exception:
    logging.error('Could not load config.yaml', exc_info=True)
    sys.exit(1)

# Set log level. A missing or malformed "logging.log_level" entry must not
# abort the run: log the problem and keep the ERROR level set above.
# setLevel() raises ValueError for an unknown level name; the other
# exception types cover a missing key or a non-string value.
try:
    logging.getLogger().setLevel(config['logging']['log_level'].upper())
except (KeyError, AttributeError, TypeError, ValueError) as e:
    logging.error(f"Could not set log level. \n{e}", exc_info=True)
# Read download history from csv file
try:
    df = pd.read_csv(dir_path + 'download_history.csv', header=0)
except (FileNotFoundError, pd.errors.EmptyDataError):
    # In case there isn't yet a (non-empty) csv file, create a data frame
    # that only has the headers.
    df = pd.DataFrame(
        columns=[
            'file',
            'download_timestamp',
        ]
    )

# Instantiate downloader object
taz_dl = TazDownloader(config['taz']['taz_id'], config['taz']['taz_password'])

try:
    # Get newspapers available for download
    newspaper_available = taz_dl.scrape_newspaper()

    # Remove outdated newspaper from the download history. The previous code
    # collected ``f.index`` -- the bound ``str.index`` method of each file
    # name -- instead of row labels, so ``drop`` failed as soon as there was
    # anything to remove. Filtering with a boolean mask keeps exactly the
    # rows whose file is still offered.
    df = df[df['file'].isin(newspaper_available)].reset_index(drop=True)

    # Find newspaper which are not already downloaded
    already_downloaded = set(df['file'])
    newspaper_to_download = [n for n in newspaper_available if n not in already_downloaded]
except Exception as e:
    logging.error(f"Could not get available newspaper from website\n{e}", exc_info=True)
    sys.exit(1)
# Download newspaper. A failure for one issue is logged and must not stop
# the remaining downloads.
newspaper_downloaded = []
for n in newspaper_to_download:
    try:
        if taz_dl.download_newspaper(n):
            newspaper_downloaded.append(n)
    except Exception as e:
        logging.error(f"Could not download {n}\n{e}", exc_info=True)

# Add downloaded newspaper to download_history.csv
try:
    if newspaper_downloaded:
        # Build all new rows at once and concatenate a single time;
        # DataFrame.append is deprecated and copies the frame on every call.
        timestamp = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        df_new = pd.DataFrame(
            {
                'file': newspaper_downloaded,
                'download_timestamp': [timestamp] * len(newspaper_downloaded),
            }
        )
        df = pd.concat([df, df_new], ignore_index=True)
    # The file is written even when nothing was downloaded, so that entries
    # pruned from the history earlier are persisted.
    df.sort_values(by='download_timestamp', ascending=False, inplace=True)
    df.to_csv(dir_path + 'download_history.csv', index=False)
except Exception as e:
    logging.error(f"Could not update download_history.csv\n{e}", exc_info=True)
# Move downloaded file to download folder
download_folder = config['download_folder']
if os.path.isdir(download_folder):
    # Normalize to a trailing slash so the file name can simply be appended.
    if not download_folder.endswith('/'):
        download_folder = download_folder + "/"
    for n in newspaper_downloaded:
        try:
            shutil.move(dir_path + 'tmp/' + n, download_folder + n)
        except Exception as e:
            logging.error(f"Could not move file to download folder \"{download_folder}\"\n{e}", exc_info=True)
elif newspaper_downloaded:
    # Previously this case was skipped silently: the files stayed in tmp/
    # without any hint in the log, although the history already lists them
    # as downloaded.
    logging.error(
        f"Download folder \"{download_folder}\" does not exist. "
        f"Downloaded files were left in \"{dir_path}tmp/\""
    )

65
models.py Normal file
View File

@ -0,0 +1,65 @@
import os
import requests
from requests.exceptions import HTTPError
from exceptions import TazDownloadFormatException
from exceptions import TazDownloadError
from bs4 import BeautifulSoup
# Absolute directory of this module, with a trailing slash.
dir_path = os.path.dirname(os.path.realpath(__file__)) + '/'


class TazDownloader:
    """Scrapes and downloads newspaper issues from ``BASE_URL``.

    The download format chosen at construction time determines the URL that
    is used both for listing and for downloading issues.
    """

    # Formats accepted by the constructor.
    download_formats = ["pdf", "epub", "epubt", "html", "ascii", "mobi", "mobit"]

    BASE_URL = "https://dl.taz.de/"

    HEADERS = {"User-agent": 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) '
                             'Chrome/79.0.3945.130 Safari/537.36'}

    # Seconds to wait for the server before giving up; without a timeout a
    # stalled connection would block the caller indefinitely.
    TIMEOUT = 60

    def __init__(self, taz_id: str, password: str, download_format: str = "pdf"):
        """
        :param taz_id: login name sent with every download request
        :param password: password sent with every download request
        :param download_format: one of ``download_formats``
        :raises TazDownloadFormatException: if ``download_format`` is not
            one of ``download_formats``
        """
        self.taz_id = taz_id
        self.password = password

        if download_format not in self.download_formats:
            # Pass the offending value so the exception message can name it.
            raise TazDownloadFormatException(download_format)
        self.download_url = self.BASE_URL + download_format

    def scrape_newspaper(self) -> list:
        """
        Scrapes the newspaper available for download from ``download_url``.

        :return: a list of file names (str), taken from the ``value``
            attribute of the options of the first ``<select>`` on the page
        :raises requests.exceptions.HTTPError: if the server answers with an
            error status
        :raises TazDownloadError: if the page contains no ``<select>``
        """
        page = requests.get(self.download_url, headers=self.HEADERS, timeout=self.TIMEOUT)
        # Do not parse an error page as if it were the issue list.
        page.raise_for_status()

        soup = BeautifulSoup(page.content, 'html.parser')
        select = soup.find("select")
        if select is None:
            raise TazDownloadError(f"No newspaper selection found at {self.download_url}")
        return [n['value'] for n in select.find_all("option") if n.has_attr('value')]

    def download_newspaper(self, taz: str, download_folder: str = dir_path + 'tmp/'):
        """
        Downloads a newspaper from ``download_url`` and stores it in
        ``download_folder`` (by default the ``tmp/`` folder next to this
        module). The folder is created if it does not exist.

        :param taz: file name of the issue, as returned by
            ``scrape_newspaper``
        :param download_folder: folder the file is written to
        :return: True when the file was written
        :raises TazDownloadError: if the request or writing the file fails
        """
        try:
            os.makedirs(download_folder, exist_ok=True)
            # NOTE(review): the credentials travel as URL query parameters;
            # confirm whether the site also accepts them in a POST body.
            with requests.get(
                    self.download_url,
                    stream=True,
                    headers=self.HEADERS,
                    timeout=self.TIMEOUT,
                    params={
                        'name': self.taz_id,
                        'password': self.password,
                        'id': taz,
                        'Laden': '+Laden+',
                    }
            ) as r:
                # Without this check an error response would be written to
                # disk as if it were the newspaper, and the HTTPError
                # handler below could never trigger.
                r.raise_for_status()
                # write response to file
                with open(os.path.join(download_folder, taz), "wb") as f:
                    for chunk in r.iter_content(chunk_size=8192):
                        f.write(chunk)
            return True
        except HTTPError as http_e:
            raise TazDownloadError(f"Could not download taz:\n{http_e}") from http_e
        except Exception as e:
            raise TazDownloadError(f"Something went wrong:\n{e}") from e

4
requirements.txt Normal file
View File

@ -0,0 +1,4 @@
pandas~=1.3.2
envyaml~=1.8.210417
requests~=2.26.0
beautifulsoup4~=4.9.3