🎉 Initial commit
This commit is contained in:
commit
6b7dcda8d7
|
|
@ -0,0 +1,2 @@
|
|||
TAZ_ID="my@taz.id"
|
||||
TAZ_PASSWORD="secret_password"
|
||||
|
|
@ -0,0 +1,134 @@
|
|||
download_history.csv
|
||||
tazPlease.log
|
||||
config.yaml
|
||||
|
||||
# Byte-compiled / optimized / DLL files
|
||||
__pycache__/
|
||||
*.py[cod]
|
||||
*$py.class
|
||||
|
||||
# C extensions
|
||||
*.so
|
||||
|
||||
# Distribution / packaging
|
||||
.Python
|
||||
build/
|
||||
develop-eggs/
|
||||
dist/
|
||||
downloads/
|
||||
eggs/
|
||||
.eggs/
|
||||
lib/
|
||||
lib64/
|
||||
parts/
|
||||
sdist/
|
||||
var/
|
||||
wheels/
|
||||
pip-wheel-metadata/
|
||||
share/python-wheels/
|
||||
*.egg-info/
|
||||
.installed.cfg
|
||||
*.egg
|
||||
MANIFEST
|
||||
|
||||
# PyInstaller
|
||||
#  Usually these files are written by a python script from a template
#  before PyInstaller builds the exe, so as to inject date/other infos into it.
|
||||
*.manifest
|
||||
*.spec
|
||||
|
||||
# Installer logs
|
||||
pip-log.txt
|
||||
pip-delete-this-directory.txt
|
||||
|
||||
# Unit test / coverage reports
|
||||
htmlcov/
|
||||
.tox/
|
||||
.nox/
|
||||
.coverage
|
||||
.coverage.*
|
||||
.cache
|
||||
nosetests.xml
|
||||
coverage.xml
|
||||
*.cover
|
||||
.hypothesis/
|
||||
.pytest_cache/
|
||||
|
||||
# Translations
|
||||
*.mo
|
||||
*.pot
|
||||
|
||||
# Django stuff:
|
||||
*.log
|
||||
local_settings.py
|
||||
db.sqlite3
|
||||
db.sqlite3-journal
|
||||
|
||||
# Flask stuff:
|
||||
instance/
|
||||
.webassets-cache
|
||||
|
||||
# Scrapy stuff:
|
||||
.scrapy
|
||||
|
||||
# Sphinx documentation
|
||||
docs/_build/
|
||||
|
||||
# PyBuilder
|
||||
target/
|
||||
|
||||
# Jupyter Notebook
|
||||
.ipynb_checkpoints
|
||||
|
||||
# IPython
|
||||
profile_default/
|
||||
ipython_config.py
|
||||
|
||||
# pyenv
|
||||
.python-version
|
||||
|
||||
# pipenv
|
||||
#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
#   However, in case of collaboration, if having platform-specific dependencies or dependencies
#   having no cross-platform support, pipenv may install dependencies that don't work, or not
#   install all needed dependencies.
|
||||
#Pipfile.lock
|
||||
|
||||
# celery beat schedule file
|
||||
celerybeat-schedule
|
||||
|
||||
# SageMath parsed files
|
||||
*.sage.py
|
||||
|
||||
# Environments
|
||||
.env
|
||||
.venv
|
||||
env/
|
||||
venv/
|
||||
ENV/
|
||||
env.bak/
|
||||
venv.bak/
|
||||
|
||||
# Spyder project settings
|
||||
.spyderproject
|
||||
.spyproject
|
||||
|
||||
# Rope project settings
|
||||
.ropeproject
|
||||
|
||||
# mkdocs documentation
|
||||
/site
|
||||
|
||||
# mypy
|
||||
.mypy_cache/
|
||||
.dmypy.json
|
||||
dmypy.json
|
||||
|
||||
# Pyre type checker
|
||||
.pyre/
|
||||
|
|
@ -0,0 +1,22 @@
|
|||
|
||||
The MIT License (MIT)
|
||||
|
||||
Copyright (c) 2021 Marc Michalsky
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in all
|
||||
copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
SOFTWARE.
|
||||
|
|
@ -0,0 +1,9 @@
|
|||
taz:
|
||||
taz_id: ${TAZ_ID}
|
||||
taz_password: ${TAZ_PASSWORD}
|
||||
download_format: "pdf" # Valid formats are: pdf, epub, epubt, html, ascii, mobi, mobit
|
||||
|
||||
download_folder: "/path/to/download/folder"
|
||||
|
||||
logging:
|
||||
log_level: "info"
|
||||
|
|
@ -0,0 +1,14 @@
|
|||
class TazDownloadFormatException(Exception):
    """Raised when an unsupported download format is requested.

    :param format: the rejected format string; it is interpolated into the
        text returned by ``str()``. Defaults to an empty string so that a bare
        ``raise TazDownloadFormatException`` keeps working.
    """

    def __init__(self, format: str = ""):
        # Forward the value to Exception so ``args`` is populated as usual.
        super().__init__(format)
        self.format = format

    def __str__(self):
        return f'"{self.format}" is not a valid download format.' \
               f"\nValid formats are: pdf, epub, epubt, html, ascii, mobi, mobit"
|
||||
|
||||
|
||||
class TazDownloadError(Exception):
    """Raised when downloading a newspaper fails.

    :param format: text describing the failure. It is stored on the instance
        as ``format`` and also forwarded to ``Exception`` so that ``str()``
        returns it. Defaults to an empty string.
    """

    def __init__(self, format: str = ""):
        # Forward the value to Exception so ``args`` and ``str()`` behave normally.
        super().__init__(format)
        self.format = format
|
||||
|
|
@ -0,0 +1,92 @@
|
|||
import sys
|
||||
import os
|
||||
import datetime
|
||||
import logging
|
||||
import shutil
|
||||
from envyaml import EnvYAML
|
||||
from models import TazDownloader
|
||||
import pandas as pd
|
||||
|
||||
# Directory that contains this script, with a trailing slash so that file
# names can be appended to it directly.
dir_path = f"{os.path.dirname(os.path.realpath(__file__))}/"

# Set up logging: records go to tazPlease.log next to the script, with ERROR
# as the starting threshold.
logging.basicConfig(
    format='%(asctime)s - %(message)s',
    level=logging.ERROR,
    filename=f"{dir_path}tazPlease.log",
)
|
||||
|
||||
# Load configuration from config.yaml, handing EnvYAML the .env file that
# sits next to it as the second argument.
try:
    config = EnvYAML(f"{dir_path}config.yaml", f"{dir_path}.env")
except Exception:
    # Without a configuration nothing else can run: log the traceback and stop.
    logging.exception('Could not load config.yaml')
    sys.exit(1)
|
||||
|
||||
# Set log level: switch the root logger to the level named in the
# configuration (upper-cased before it is handed to setLevel).
try:
    root_logger = logging.getLogger()
    root_logger.setLevel(config['logging']['log_level'].upper())
except ValueError as e:
    # A ValueError here is logged and the script carries on with the
    # threshold that was already in place.
    logging.exception(f"Could not set log level. \n{e}")
|
||||
|
||||
# Read download history from csv file (first row holds the column names).
try:
    df = pd.read_csv(f"{dir_path}download_history.csv", header=0)
except FileNotFoundError:
    # No history file yet: start from an empty frame that already has the
    # two expected columns.
    df = pd.DataFrame(columns=['file', 'download_timestamp'])
|
||||
|
||||
# Instantiate downloader object.
# NOTE(review): only the credentials are passed, so the downloader falls back
# to its own default format; the download format entry in config.yaml is not
# read here.
taz_dl = TazDownloader(config['taz']['taz_id'], config['taz']['taz_password'])

try:
    # Get newspapers available for download
    newspaper_available = taz_dl.scrape_newspaper()

    # Remove outdated newspaper from the download history: drop every row
    # whose file is no longer offered. The rows are selected with a boolean
    # mask so that real index labels are passed to drop().
    outdated = ~df['file'].isin(newspaper_available)
    df.drop(df[outdated].index, inplace=True)

    # Find newspaper which are not already downloaded
    already_downloaded = set(df['file'])
    newspaper_to_download = [n for n in newspaper_available if n not in already_downloaded]
except Exception as e:
    # Without the list of available issues there is nothing to do.
    logging.error(f"Could not get available newspaper from website\n{e}", exc_info=True)
    sys.exit(1)
|
||||
|
||||
# Download newspaper: try every pending issue and remember the ones that
# succeeded. A failure is logged and the loop moves on to the next issue.
newspaper_downloaded = []
for n in newspaper_to_download:
    try:
        succeeded = taz_dl.download_newspaper(n)
    except Exception as e:
        logging.error(f"Could not download {n}\n{e}", exc_info=True)
    else:
        if succeeded:
            newspaper_downloaded.append(n)
|
||||
|
||||
# Add downloaded newspaper to download_history.csv
try:
    if newspaper_downloaded:
        # Build all new rows in one frame and concatenate once; this replaces
        # the per-row DataFrame.append, which is deprecated in newer pandas.
        recorded_at = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        df_new = pd.DataFrame(
            {
                'file': list(newspaper_downloaded),
                'download_timestamp': [recorded_at] * len(newspaper_downloaded),
            }
        )
        df = pd.concat([df, df_new], ignore_index=True)

    # Newest entries first, then write the history back next to the script.
    df.sort_values(by='download_timestamp', ascending=False, inplace=True)
    df.to_csv(dir_path + 'download_history.csv', index=False)
except Exception as e:
    logging.error(f"Could not update download_history.csv\n{e}", exc_info=True)
|
||||
|
||||
# Move downloaded file to download folder
download_folder = config['download_folder']
if os.path.isdir(download_folder):
    # Target paths are built by concatenation, so make sure the folder ends
    # with a slash.
    if not download_folder.endswith('/'):
        download_folder += "/"
    for n in newspaper_downloaded:
        try:
            shutil.move(dir_path + 'tmp/' + n, download_folder + n)
        except Exception as e:
            # One file failing to move must not stop the remaining ones.
            logging.error(f"Could not move file to download folder \"{download_folder}\"\n{e}", exc_info=True)
else:
    # Report the misconfiguration instead of skipping the move silently.
    logging.error(f"Download folder \"{download_folder}\" does not exist or is not a directory; "
                  f"downloaded files were not moved")
|
||||
|
|
@ -0,0 +1,65 @@
|
|||
import os
|
||||
import requests
|
||||
from requests.exceptions import HTTPError
|
||||
from exceptions import TazDownloadFormatException
|
||||
from exceptions import TazDownloadError
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
# Directory that contains this module, with a trailing slash so that file
# names can be appended to it directly.
dir_path = f"{os.path.dirname(os.path.realpath(__file__))}/"
|
||||
|
||||
|
||||
class TazDownloader:
    """Lists and downloads newspaper files from https://dl.taz.de/.

    The download URL is ``BASE_URL`` followed by the chosen format, and every
    request is sent with the browser-like ``HEADERS``.
    """

    # Formats accepted by the constructor.
    download_formats = ["pdf", "epub", "epubt", "html", "ascii", "mobi", "mobit"]
    BASE_URL = "https://dl.taz.de/"
    HEADERS = {"User-agent": 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) '
                             'Chrome/79.0.3945.130 Safari/537.36'}
    # Seconds to wait for the server before giving up, so that a stalled
    # connection cannot block the caller forever.
    REQUEST_TIMEOUT = 60

    def __init__(self, taz_id: str, password: str, download_format: str = "pdf"):
        """
        :param taz_id: login name sent with every download request
        :param password: password sent with every download request
        :param download_format: one of ``download_formats``
        :raises TazDownloadFormatException: if ``download_format`` is not supported
        """
        self.taz_id = taz_id
        self.password = password
        if download_format not in self.download_formats:
            # Hand the rejected value to the exception so it can be reported.
            raise TazDownloadFormatException(download_format)
        self.download_url = self.BASE_URL + download_format

    def scrape_newspaper(self) -> list:
        """
        Scrapes the newspaper available for download from https://dl.taz.de/

        :return: a list of file names (str), taken from the ``value`` attribute
            of every ``<option>`` in the first ``<select>`` of the page
        :raises requests.exceptions.HTTPError: if the server answers with an
            error status
        :raises TazDownloadError: if the page contains no ``<select>`` element
        """
        page = requests.get(self.download_url, headers=self.HEADERS, timeout=self.REQUEST_TIMEOUT)
        # Do not try to parse an error page as if it were the download form.
        page.raise_for_status()
        soup = BeautifulSoup(page.content, 'html.parser')
        select = soup.find("select")
        if select is None:
            raise TazDownloadError(f"No newspaper selection found on {self.download_url}")
        return [n['value'] for n in select.find_all("option")]

    def download_newspaper(self, taz: str, download_folder: str = dir_path + 'tmp/'):
        """
        Downloads a newspaper from dl.taz.de and stores it in ``download_folder``
        (by default the ``tmp/`` folder next to this module).

        :param taz: file name of the newspaper, as returned by ``scrape_newspaper``
        :param download_folder: target folder; the file name is appended to it
            directly, so it must end with a path separator
        :return: ``True`` once the file has been written
        :raises TazDownloadError: if the request fails or the file cannot be written
        """
        try:
            # The target folder may not exist yet (e.g. on the first run).
            os.makedirs(download_folder, exist_ok=True)

            # download taz
            # NOTE(review): the credentials travel as URL query parameters of
            # a GET request - confirm this is what the server expects.
            with requests.get(
                self.download_url,
                stream=True,
                headers=self.HEADERS,
                params={
                    'name': self.taz_id,
                    'password': self.password,
                    'id': taz,
                    'Laden': '+Laden+',
                },
                timeout=self.REQUEST_TIMEOUT,
            ) as r:
                # Turn HTTP error statuses into HTTPError instead of saving
                # the error page under the newspaper's file name.
                r.raise_for_status()
                # write response to file
                with open(download_folder + taz, "wb") as f:
                    for chunk in r.iter_content(chunk_size=8192):
                        f.write(chunk)
            return True
        except HTTPError as http_e:
            raise TazDownloadError(f"Could not download taz:\n{http_e}") from http_e
        except Exception as e:
            raise TazDownloadError(f"Something went wrong:\n{e}") from e
|
||||
|
|
@ -0,0 +1,4 @@
|
|||
pandas~=1.3.2
|
||||
envyaml~=1.8.210417
|
||||
requests~=2.26.0
|
||||
beautifulsoup4~=4.9.3
|
||||
Loading…
Reference in New Issue