🎉 Initial commit
This commit is contained in:
commit
6b7dcda8d7
|
|
@ -0,0 +1,2 @@
|
||||||
|
TAZ_ID="my@taz.id"
|
||||||
|
TAZ_PASSWORD="secret_password"
|
||||||
|
|
@ -0,0 +1,134 @@
|
||||||
|
download_history.csv
|
||||||
|
tazPlease.log
|
||||||
|
config.yaml
|
||||||
|
|
||||||
|
# Byte-compiled / optimized / DLL files
|
||||||
|
__pycache__/
|
||||||
|
*.py[cod]
|
||||||
|
*$py.class
|
||||||
|
|
||||||
|
# C extensions
|
||||||
|
*.so
|
||||||
|
|
||||||
|
# Distribution / packaging
|
||||||
|
.Python
|
||||||
|
build/
|
||||||
|
develop-eggs/
|
||||||
|
dist/
|
||||||
|
downloads/
|
||||||
|
eggs/
|
||||||
|
.eggs/
|
||||||
|
lib/
|
||||||
|
lib64/
|
||||||
|
parts/
|
||||||
|
sdist/
|
||||||
|
var/
|
||||||
|
wheels/
|
||||||
|
pip-wheel-metadata/
|
||||||
|
share/python-wheels/
|
||||||
|
*.egg-info/
|
||||||
|
.installed.cfg
|
||||||
|
*.egg
|
||||||
|
MANIFEST
|
||||||
|
|
||||||
|
# PyInstaller
|
||||||
|
#  Usually these files are written by a python script from a template
|
||||||
|
#  before PyInstaller builds the exe, so as to inject date/other infos into it.
|
||||||
|
*.manifest
|
||||||
|
*.spec
|
||||||
|
|
||||||
|
# Installer logs
|
||||||
|
pip-log.txt
|
||||||
|
pip-delete-this-directory.txt
|
||||||
|
|
||||||
|
# Unit test / coverage reports
|
||||||
|
htmlcov/
|
||||||
|
.tox/
|
||||||
|
.nox/
|
||||||
|
.coverage
|
||||||
|
.coverage.*
|
||||||
|
.cache
|
||||||
|
nosetests.xml
|
||||||
|
coverage.xml
|
||||||
|
*.cover
|
||||||
|
.hypothesis/
|
||||||
|
.pytest_cache/
|
||||||
|
|
||||||
|
# Translations
|
||||||
|
*.mo
|
||||||
|
*.pot
|
||||||
|
|
||||||
|
# Django stuff:
|
||||||
|
*.log
|
||||||
|
local_settings.py
|
||||||
|
db.sqlite3
|
||||||
|
db.sqlite3-journal
|
||||||
|
|
||||||
|
# Flask stuff:
|
||||||
|
instance/
|
||||||
|
.webassets-cache
|
||||||
|
|
||||||
|
# Scrapy stuff:
|
||||||
|
.scrapy
|
||||||
|
|
||||||
|
# Sphinx documentation
|
||||||
|
docs/_build/
|
||||||
|
|
||||||
|
# PyBuilder
|
||||||
|
target/
|
||||||
|
|
||||||
|
# Jupyter Notebook
|
||||||
|
.ipynb_checkpoints
|
||||||
|
|
||||||
|
# IPython
|
||||||
|
profile_default/
|
||||||
|
ipython_config.py
|
||||||
|
|
||||||
|
# pyenv
|
||||||
|
.python-version
|
||||||
|
|
||||||
|
# pipenv
|
||||||
|
#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
|
||||||
|
#   However, in case of collaboration, if having platform-specific dependencies or dependencies
|
||||||
|
#   having no cross-platform support, pipenv may install dependencies that don't work, or not
|
||||||
|
#   install all needed dependencies.
|
||||||
|
#Pipfile.lock
|
||||||
|
|
||||||
|
# celery beat schedule file
|
||||||
|
celerybeat-schedule
|
||||||
|
|
||||||
|
# SageMath parsed files
|
||||||
|
*.sage.py
|
||||||
|
|
||||||
|
# Environments
|
||||||
|
.env
|
||||||
|
.venv
|
||||||
|
env/
|
||||||
|
venv/
|
||||||
|
ENV/
|
||||||
|
env.bak/
|
||||||
|
venv.bak/
|
||||||
|
|
||||||
|
# Spyder project settings
|
||||||
|
.spyderproject
|
||||||
|
.spyproject
|
||||||
|
|
||||||
|
# Rope project settings
|
||||||
|
.ropeproject
|
||||||
|
|
||||||
|
# mkdocs documentation
|
||||||
|
/site
|
||||||
|
|
||||||
|
# mypy
|
||||||
|
.mypy_cache/
|
||||||
|
.dmypy.json
|
||||||
|
dmypy.json
|
||||||
|
|
||||||
|
# Pyre type checker
|
||||||
|
.pyre/
|
||||||
|
|
@ -0,0 +1,22 @@
|
||||||
|
|
||||||
|
The MIT License (MIT)
|
||||||
|
|
||||||
|
Copyright (c) 2021 Marc Michalsky
|
||||||
|
|
||||||
|
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||||
|
of this software and associated documentation files (the "Software"), to deal
|
||||||
|
in the Software without restriction, including without limitation the rights
|
||||||
|
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||||
|
copies of the Software, and to permit persons to whom the Software is
|
||||||
|
furnished to do so, subject to the following conditions:
|
||||||
|
|
||||||
|
The above copyright notice and this permission notice shall be included in all
|
||||||
|
copies or substantial portions of the Software.
|
||||||
|
|
||||||
|
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||||
|
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||||
|
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||||
|
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||||
|
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||||
|
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||||
|
SOFTWARE.
|
||||||
|
|
@ -0,0 +1,9 @@
|
||||||
|
taz:
|
||||||
|
taz_id: ${TAZ_ID}
|
||||||
|
taz_password: ${TAZ_PASSWORD}
|
||||||
|
download_format: "pdf" # Valid formats are: pdf, epub, epubt, html, ascii, mobi, mobit
|
||||||
|
|
||||||
|
download_folder: "/path/to/download/folder"
|
||||||
|
|
||||||
|
logging:
|
||||||
|
log_level: "info"
|
||||||
|
|
@ -0,0 +1,14 @@
|
||||||
|
class TazDownloadFormatException(Exception):
    """Raised when an unsupported download format is requested.

    :param format: the rejected format string. Defaults to an empty string so
        the exception can also be raised without arguments.
    """

    def __init__(self, format: str = ""):
        # This method used to be spelled ``__inti__`` and therefore never ran,
        # which left ``self.format`` undefined when ``__str__`` was called.
        super().__init__(format)
        self.format = format

    def __str__(self):
        """Return a message naming the rejected format and the valid ones."""
        return f"\"{self.format}\" is not a valid download format." \
               f"\nValid formats are: pdf, epub, epubt, html, ascii, mobi, mobit"
|
||||||
|
|
||||||
|
|
||||||
|
class TazDownloadError(Exception):
    """Raised when a newspaper could not be downloaded.

    :param format: text describing the failure; it is stored on the instance
        as ``format`` and is also what ``str(error)`` returns.
    """

    def __init__(self, format: str = ""):
        # This method used to be spelled ``__inti__`` and therefore never ran,
        # so the ``format`` attribute was never set.
        super().__init__(format)
        self.format = format
|
||||||
|
|
@ -0,0 +1,92 @@
|
||||||
|
import sys
|
||||||
|
import os
|
||||||
|
import datetime
|
||||||
|
import logging
|
||||||
|
import shutil
|
||||||
|
from envyaml import EnvYAML
|
||||||
|
from models import TazDownloader
|
||||||
|
import pandas as pd
|
||||||
|
|
||||||
|
# Absolute directory of this script (with trailing slash). The log file, the
# configuration, the download history and the tmp/ folder are all addressed
# relative to it.
dir_path = os.path.dirname(os.path.realpath(__file__)) + '/'

# Set up logging
logging.basicConfig(
    filename=dir_path + 'tazPlease.log',
    level=logging.ERROR,
    format='%(asctime)s - %(message)s'
)

# Load configuration
try:
    config = EnvYAML(dir_path + 'config.yaml', dir_path + '.env')
except Exception:
    logging.error('Could not load config.yaml', exc_info=True)
    sys.exit(1)

# Set log level; on failure keep the ERROR level configured above
try:
    logging.getLogger().setLevel(config['logging']['log_level'].upper())
except (KeyError, TypeError, ValueError) as e:
    logging.error(f"Could not set log level. \n{e}", exc_info=True)

# Read download history from csv file
try:
    df = pd.read_csv(dir_path + 'download_history.csv', header=0)
except FileNotFoundError:
    # If there is no csv file yet, start with an empty history
    df = pd.DataFrame(columns=['file', 'download_timestamp'])

# Instantiate downloader object
taz_dl = TazDownloader(config['taz']['taz_id'], config['taz']['taz_password'])

try:
    # Get newspapers available for download
    newspaper_available = taz_dl.scrape_newspaper()

    # Remove outdated newspapers from the download history.
    # The rows are selected through a boolean mask; the former code collected
    # ``str.index`` bound methods instead of row labels, which made the drop
    # fail as soon as the history contained an outdated file.
    outdated = ~df['file'].isin(newspaper_available)
    df.drop(df.index[outdated], inplace=True)

    # Find newspapers which are not already downloaded
    already_downloaded = set(df['file'])
    newspaper_to_download = [n for n in newspaper_available if n not in already_downloaded]
except Exception as e:
    logging.error(f"Could not get available newspaper from website\n{e}", exc_info=True)
    sys.exit(1)

# Download newspapers; a failure for one file must not stop the others
newspaper_downloaded = []
for n in newspaper_to_download:
    try:
        if taz_dl.download_newspaper(n):
            newspaper_downloaded.append(n)
    except Exception as e:
        logging.error(f"Could not download {n}\n{e}", exc_info=True)

# Add downloaded newspapers to download_history.csv
try:
    if newspaper_downloaded:
        df_new = pd.DataFrame(
            {
                'file': newspaper_downloaded,
                'download_timestamp': [
                    datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
                    for _ in newspaper_downloaded
                ],
            }
        )
        # pd.concat replaces DataFrame.append, which is deprecated in newer pandas
        df = pd.concat([df, df_new], ignore_index=True)
    df.sort_values(by='download_timestamp', ascending=False, inplace=True)
    df.to_csv(dir_path + 'download_history.csv', index=False)
except Exception as e:
    logging.error(f"Could not update download_history.csv\n{e}", exc_info=True)

# Move downloaded files to the download folder
target_folder = config['download_folder']
if os.path.isdir(target_folder):
    download_folder = target_folder if target_folder.endswith('/') else target_folder + "/"
    for n in newspaper_downloaded:
        try:
            shutil.move(dir_path + 'tmp/' + n, download_folder + n)
        except Exception as e:
            logging.error(f"Could not move file to download folder \"{download_folder}\"\n{e}", exc_info=True)
else:
    # Previously this case was skipped silently and the files stayed in tmp/
    logging.error(f"Download folder \"{target_folder}\" does not exist; downloaded files were not moved")
|
||||||
|
|
@ -0,0 +1,65 @@
|
||||||
|
import os
|
||||||
|
import requests
|
||||||
|
from requests.exceptions import HTTPError
|
||||||
|
from exceptions import TazDownloadFormatException
|
||||||
|
from exceptions import TazDownloadError
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
|
|
||||||
|
# Absolute directory containing this module, with a trailing slash; used to build the default download folder below.
dir_path = os.path.dirname(os.path.realpath(__file__)) + '/'
|
||||||
|
|
||||||
|
|
||||||
|
class TazDownloader:
    """Client for the taz download page at https://dl.taz.de/.

    Lists the newspaper files offered for download and downloads single files
    using the given taz credentials.
    """

    # Supported formats; the format name is appended to BASE_URL to build the download URL.
    download_formats = ["pdf", "epub", "epubt", "html", "ascii", "mobi", "mobit"]
    BASE_URL = "https://dl.taz.de/"
    HEADERS = {"User-agent": 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) '
                             'Chrome/79.0.3945.130 Safari/537.36'}
    # Seconds to wait for the server before giving up; without a timeout
    # requests may block forever on an unresponsive connection.
    TIMEOUT = 30

    def __init__(self, taz_id: str, password: str, download_format: str = "pdf"):
        """
        :param taz_id: taz account id, sent as ``name`` with each download request
        :param password: password belonging to ``taz_id``
        :param download_format: one of ``download_formats``
        :raises TazDownloadFormatException: if ``download_format`` is not supported
        """
        if download_format not in self.download_formats:
            # Pass the rejected value along so the error can name it
            raise TazDownloadFormatException(download_format)
        self.taz_id = taz_id
        self.password = password
        self.download_url = self.BASE_URL + download_format

    def scrape_newspaper(self) -> list:
        """
        Scrapes the newspaper available for download from https://dl.taz.de/

        :return: a list of file names (str)
        :raises requests.exceptions.HTTPError: if the page answers with an error status
        :raises TazDownloadError: if the page contains no ``<select>`` element
        """
        page = requests.get(self.download_url, headers=self.HEADERS, timeout=self.TIMEOUT)
        # Fail early instead of parsing an error page
        page.raise_for_status()
        soup = BeautifulSoup(page.content, 'html.parser')
        select = soup.find("select")
        if select is None:
            raise TazDownloadError(f"No newspaper selection found on {self.download_url}")
        # Skip options that carry no value attribute
        return [option['value'] for option in select.find_all("option") if option.has_attr('value')]

    def download_newspaper(self, taz: str, download_folder: str = dir_path + 'tmp/') -> bool:
        """
        Downloads a newspaper from dl.taz.de and stores it in ``download_folder``
        (by default the ``tmp/`` folder next to this module).

        :param taz: file name of the newspaper, as returned by ``scrape_newspaper``
        :param download_folder: target folder; ``taz`` is appended to it as-is,
            so it should end with a path separator
        :return: True once the file has been written
        :raises TazDownloadError: if the request or writing the file fails
        """
        try:
            # The target folder is not guaranteed to exist yet
            if download_folder:
                os.makedirs(download_folder, exist_ok=True)

            # NOTE(review): the credentials are sent as URL query parameters;
            # confirm whether the server also accepts them in a request body.
            with requests.get(
                self.download_url,
                stream=True,
                headers=self.HEADERS,
                timeout=self.TIMEOUT,
                params={
                    'name': self.taz_id,
                    'password': self.password,
                    'id': taz,
                    'Laden': '+Laden+',
                }
            ) as r:
                # Without this check an HTTP error page would be stored as the newspaper
                r.raise_for_status()
                # write response to file
                with open(download_folder + taz, "wb") as f:
                    for chunk in r.iter_content(chunk_size=8192):
                        f.write(chunk)
            return True
        except HTTPError as http_e:
            raise TazDownloadError(f"Could not download taz:\n{http_e}") from http_e
        except Exception as e:
            raise TazDownloadError(f"Something went wrong:\n{e}") from e
|
||||||
|
|
@ -0,0 +1,4 @@
|
||||||
|
pandas~=1.3.2
|
||||||
|
envyaml~=1.8.210417
|
||||||
|
requests~=2.26.0
|
||||||
|
beautifulsoup4~=4.9.3
|
||||||
Loading…
Reference in New Issue