Compare commits

...

10 commits

Author SHA1 Message Date
mkrieger
bb06a072b0 Ignore scraper-data folders 2026-03-10 11:43:40 +01:00
mkrieger
b7c2fde21b push 2026-03-10 11:40:13 +01:00
mkrieger
210173c4b4 Update 2026-03-10 11:36:43 +01:00
mkrieger
df8c2313a9 Initial commit 2026-03-10 11:33:18 +01:00
mkrieger
387bc056b9 upload angepasst 2024-11-14 11:03:00 +01:00
mkrieger
b3001de67b updated Adressliste_vorlage.csv 2024-11-14 10:44:20 +01:00
mkrieger
d12ac4189f added template and instructions to upload-page 2024-11-14 10:41:50 +01:00
mkrieger
81e906c01d added logo to login-page 2024-11-14 10:30:13 +01:00
mkrieger
6d24007b56 added logo to login-page 2024-11-14 10:26:24 +01:00
mkrieger
6b057fb941 webcrawler v1.0 2024-11-14 10:20:42 +01:00
34 changed files with 3318 additions and 243 deletions

163
.gitignore vendored Normal file
View file

@ -0,0 +1,163 @@
### Flask ###
instance/*
!instance/.gitignore
.webassets-cache
.env
### Flask.Python Stack ###
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal
# Flask stuff:
instance/
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
.pybuilder/
target/
# Jupyter Notebook
.ipynb_checkpoints
# IPython
profile_default/
ipython_config.py
# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version
# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock
# poetry
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock
# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
# in version control.
# https://pdm.fming.dev/#use-with-ide
.pdm.toml
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/
# Celery stuff
celerybeat-schedule
celerybeat.pid
# SageMath parsed files
*.sage.py
# Environments
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
.dmypy.json
dmypy.json
# Pyre type checker
.pyre/
# pytype static type analyzer
.pytype/
# Cython debug symbols
cython_debug/
# Uploads / Results
uploads/
results/
scraper-data-*/

View file

@ -7,12 +7,14 @@ WORKDIR /app
# Abhängigkeiten installieren # Abhängigkeiten installieren
COPY requirements.txt requirements.txt COPY requirements.txt requirements.txt
RUN pip install --no-cache-dir -r requirements.txt RUN pip install --no-cache-dir -r requirements.txt
RUN apt update
RUN apt install curl -y
# App-Dateien kopieren # App-Dateien kopieren
COPY . . COPY . .
# Flask Umgebungsvariable setzen
ENV FLASK_APP=app ENV FLASK_APP=app
ENV FLASK_ENV=production
# Flask starten EXPOSE 5000
CMD ["flask", "run", "--host=0.0.0.0", "--port=5000"] CMD ["flask", "run", "--host=0.0.0.0", "--port=5000"]

View file

@ -1,52 +1,88 @@
import os import os
from flask import Flask, redirect, url_for, request from flask import Flask, redirect, url_for, request, current_app
from flask_sqlalchemy import SQLAlchemy from flask_sqlalchemy import SQLAlchemy
from flask_login import LoginManager, current_user from flask_login import LoginManager, current_user
from .models import db, User from flask_migrate import Migrate
from sqlalchemy import text
# Konfiguration für Upload- und Ergebnis-Ordner # ✅ Docker-Pfade
UPLOAD_FOLDER = '/app/uploads' UPLOAD_FOLDER = '/app/uploads'
RESULT_FOLDER = '/app/results' RESULT_FOLDER = '/app/results'
db = SQLAlchemy()
login_manager = LoginManager()
migrate = Migrate()
def _run_migrations(app):
"""Fehlende DB-Spalten automatisch hinzufügen übersteht jeden Neustart"""
migrations = [
("job", "result_filename_raw", "VARCHAR(150)"),
("job", "scraper_job_id", "VARCHAR(255)"),
("user", "is_admin", "BOOLEAN DEFAULT 0"),
]
with app.app_context():
for table, column, col_type in migrations:
try:
db.session.execute(text(f"ALTER TABLE {table} ADD COLUMN {column} {col_type}"))
db.session.commit()
print(f"✅ Migration: {table}.{column} hinzugefügt")
except Exception:
db.session.rollback()
def create_app(): def create_app():
app = Flask(__name__) app = Flask(__name__)
# 🔑 Configs
app.config['SECRET_KEY'] = '008e7369b075886d5f494c8813efdfb17155da6af12b3fe8ee' app.config['SECRET_KEY'] = '008e7369b075886d5f494c8813efdfb17155da6af12b3fe8ee'
app.config['SQLALCHEMY_DATABASE_URI'] = 'sqlite:///users.db' app.config['SQLALCHEMY_DATABASE_URI'] = 'sqlite:///users.db'
app.config['SQLALCHEMY_TRACK_MODIFICATIONS'] = False
app.config['UPLOAD_FOLDER'] = UPLOAD_FOLDER app.config['UPLOAD_FOLDER'] = UPLOAD_FOLDER
app.config['RESULT_FOLDER'] = RESULT_FOLDER app.config['RESULT_FOLDER'] = RESULT_FOLDER
app.config['ALLOW_USER_SIGNUP'] = False
# DB + Tools
db.init_app(app) db.init_app(app)
migrate.init_app(app, db)
# Flask-Login Setup
login_manager = LoginManager()
login_manager.login_view = 'auth.login'
login_manager.init_app(app) login_manager.init_app(app)
login_manager.login_view = 'auth.login'
# User Loader
@login_manager.user_loader @login_manager.user_loader
def load_user(user_id): def load_user(user_id):
from .models import User
return User.query.get(int(user_id)) return User.query.get(int(user_id))
# Umleitung nicht authentifizierter Benutzer, statische Dateien und bestimmte Routen ausnehmen # Protected Routes
@app.before_request @app.before_request
def require_login(): def require_login():
allowed_routes = ['auth.login', 'auth.signup'] allowed = ['auth.login', 'auth.signup', 'static']
if (not current_user.is_authenticated and
# Prüfen, ob der Benutzer authentifiziert ist oder eine erlaubte Route anfragt request.endpoint not in allowed and
if (not current_user.is_authenticated not request.path.startswith('/static')):
and request.endpoint not in allowed_routes
and not request.path.startswith('/static/')):
return redirect(url_for('auth.login')) return redirect(url_for('auth.login'))
# Erstellen Sie die Ordner, falls sie noch nicht existieren # Ordner
os.makedirs(app.config['UPLOAD_FOLDER'], exist_ok=True) os.makedirs(app.config['UPLOAD_FOLDER'], exist_ok=True)
os.makedirs(app.config['RESULT_FOLDER'], exist_ok=True) os.makedirs(app.config['RESULT_FOLDER'], exist_ok=True)
# Registrieren der Routen # Routes
from . import routes from . import routes
app.register_blueprint(routes.bp) app.register_blueprint(routes.bp)
# Erstellen der Tabellen in der Datenbank # Index Redirect
@app.route('/')
def index():
return redirect(url_for('auth.job_status'))
# DB Tables + Auto-Migration
with app.app_context(): with app.app_context():
db.create_all() db.create_all()
_run_migrations(app)
return app return app
if __name__ == '__main__':
app = create_app()
app.run(host='0.0.0.0', port=5000, debug=False)

68
app/init.py.bak Normal file
View file

@ -0,0 +1,68 @@
import os
from flask import Flask, redirect, url_for, request, current_app
from flask_sqlalchemy import SQLAlchemy
from flask_login import LoginManager, current_user
from flask_migrate import Migrate
# ✅ Docker-Pfade
UPLOAD_FOLDER = '/app/uploads'
RESULT_FOLDER = '/app/results'
db = SQLAlchemy()
login_manager = LoginManager()
migrate = Migrate()
def create_app():
app = Flask(__name__)
# 🔑 Configs
app.config['SECRET_KEY'] = '008e7369b075886d5f494c8813efdfb17155da6af12b3fe8ee'
app.config['SQLALCHEMY_DATABASE_URI'] = 'sqlite:///users.db'
app.config['SQLALCHEMY_TRACK_MODIFICATIONS'] = False
app.config['UPLOAD_FOLDER'] = UPLOAD_FOLDER
app.config['RESULT_FOLDER'] = RESULT_FOLDER
app.config['ALLOW_USER_SIGNUP'] = True # ✅ Aktiviert!
# DB + Tools
db.init_app(app)
migrate.init_app(app, db)
login_manager.init_app(app)
login_manager.login_view = 'auth.login'
# User Loader
@login_manager.user_loader
def load_user(user_id):
from .models import User
return User.query.get(int(user_id))
# Protected Routes
@app.before_request
def require_login():
allowed = ['auth.login', 'auth.signup', 'static']
if (not current_user.is_authenticated and
request.endpoint not in allowed and
not request.path.startswith('/static')):
return redirect(url_for('auth.login'))
# Ordner
os.makedirs(app.config['UPLOAD_FOLDER'], exist_ok=True)
os.makedirs(app.config['RESULT_FOLDER'], exist_ok=True)
# Routes
from . import routes
app.register_blueprint(routes.bp)
# Index Redirect
@app.route('/')
def index():
return redirect(url_for('auth.job_status'))
# DB Tables
with app.app_context():
db.create_all()
return app
if __name__ == '__main__':
app = create_app()
app.run(host='0.0.0.0', port=5000, debug=False)

View file

@ -1,20 +1,25 @@
from flask_sqlalchemy import SQLAlchemy
from flask_login import UserMixin from flask_login import UserMixin
from datetime import datetime from datetime import datetime
from . import db
db = SQLAlchemy()
class User(UserMixin, db.Model): class User(UserMixin, db.Model):
id = db.Column(db.Integer, primary_key=True) id = db.Column(db.Integer, primary_key=True)
username = db.Column(db.String(150), unique=True, nullable=False) username = db.Column(db.String(150), unique=True, nullable=False)
password = db.Column(db.String(150), nullable=False) password = db.Column(db.String(150), nullable=False)
is_admin = db.Column(db.Boolean, default=False)
class Job(db.Model): class Job(db.Model):
id = db.Column(db.Integer, primary_key=True) id = db.Column(db.Integer, primary_key=True)
user_id = db.Column(db.Integer, db.ForeignKey('user.id'), nullable=False) user_id = db.Column(db.Integer, db.ForeignKey('user.id'), nullable=False)
filename = db.Column(db.String(150), nullable=False) filename = db.Column(db.String(150), nullable=False)
status = db.Column(db.String(50), default="Pending") # Status: Pending, In Progress, Completed status = db.Column(db.String(50), default="Pending")
created_at = db.Column(db.DateTime, default=datetime.utcnow) created_at = db.Column(db.DateTime, default=datetime.utcnow)
result_filename = db.Column(db.String(150), nullable=True) result_filename = db.Column(db.String(150), nullable=True)
result_filename_raw = db.Column(db.String(150), nullable=True)
user = db.relationship('User', backref=db.backref('jobs', lazy=True)) user = db.relationship('User', backref=db.backref('jobs', lazy=True))
class AppConfig(db.Model):
id = db.Column(db.Integer, primary_key=True)
key = db.Column(db.String(100), unique=True, nullable=False)
value = db.Column(db.String(100), nullable=False, default='false')

223
app/routes.orig Normal file
View file

@ -0,0 +1,223 @@
import time
import csv
import os
import threading
from flask import Blueprint, request, redirect, url_for, flash, render_template, send_file, current_app
from flask_login import login_user, logout_user, login_required, current_user
from werkzeug.utils import secure_filename
from werkzeug.security import generate_password_hash, check_password_hash
from .models import db, User, Job
from .webcrawler import process_file # Importiere die Funktion für das Webscraping
UPLOAD_FOLDER = 'uploads'
RESULT_FOLDER = 'results'
# Blueprint für auth erstellen
bp = Blueprint('auth', __name__)
@bp.route('/login', methods=['GET', 'POST'])
def login():
if request.method == 'POST':
username = request.form['username']
password = request.form['password']
user = User.query.filter_by(username=username).first()
if user and check_password_hash(user.password, password):
login_user(user)
return redirect(url_for('auth.job_status'))
flash('Login fehlgeschlagen. Überprüfen Sie Benutzername und Passwort.')
return render_template('login.html')
@bp.route('/signup', methods=['GET', 'POST'])
def signup():
if not current_app.config['ALLOW_USER_SIGNUP']:
flash("Registrierung ist derzeit deaktiviert.")
return redirect(url_for('auth.login'))
if request.method == 'POST':
username = request.form['username']
password = generate_password_hash(request.form['password'], method='sha256')
new_user = User(username=username, password=password)
db.session.add(new_user)
db.session.commit()
flash('Benutzer erfolgreich erstellt! Sie können sich jetzt einloggen.')
return redirect(url_for('auth.login'))
return render_template('signup.html')
@bp.route('/logout')
@login_required
def logout():
logout_user()
return redirect(url_for('auth.login'))
@bp.route('/jobs')
@login_required
def job_status():
jobs = Job.query.filter_by(user_id=current_user.id).all()
return render_template('jobs.html', jobs=jobs)
@bp.route('/upload', methods=['GET', 'POST'])
@login_required
def upload():
if request.method == 'POST':
file = request.files['file']
filename = secure_filename(file.filename)
# Überprüfen, ob eine Datei mit dem gleichen Namen bereits existiert
file_path = os.path.join(current_app.config['UPLOAD_FOLDER'], filename)
if os.path.exists(file_path):
# Wenn eine Datei mit dem gleichen Namen existiert, einen Zeitstempel hinzufügen
name, ext = os.path.splitext(filename)
timestamp = time.strftime("%Y%m%d-%H%M%S") # Zeitstempel im Format JahrMonatTag-StundenMinutenSekunden
filename = f"{name}_{timestamp}{ext}"
file_path = os.path.join(current_app.config['UPLOAD_FOLDER'], filename)
flash(f"Eine Datei mit gleichem Namen existierte bereits. Die Datei wurde als '{filename}' gespeichert.")
# Speichern der Datei
file.save(file_path)
flash('Datei erfolgreich hochgeladen und Job gestartet')
# Neuen Job erstellen
new_job = Job(user_id=current_user.id, filename=filename, status="Pending")
db.session.add(new_job)
db.session.commit()
# Debugging-Ausgabe zur Überprüfung der Thread-Erstellung
print(f"Starte Scraping-Thread für Job-ID: {new_job.id}")
# Starten des Scraping im Hintergrund-Thread und Übergeben des aktuellen Anwendungskontexts
thread = threading.Thread(target=process_file, args=(filename, new_job.id, current_app._get_current_object()))
thread.start()
# Debugging-Ausgabe, nachdem der Thread gestartet wurde
print(f"Thread für Job {new_job.id} erfolgreich gestartet.")
return redirect(url_for('auth.job_status'))
return render_template('upload.html')
@bp.route('/download/<int:job_id>', methods=['GET'])
@login_required
def download_result(job_id):
job = Job.query.get_or_404(job_id)
print(f"Job ID: {job.id} - User ID: {job.user_id} - Current User ID: {current_user.id}")
# Überprüfen, ob der Job dem aktuellen Benutzer gehört
if job.user_id != current_user.id:
flash("Sie haben keine Berechtigung, dieses Ergebnis herunterzuladen.")
return redirect(url_for('auth.job_status'))
# Überprüfen, ob das Ergebnis vorhanden ist
if not job.result_filename:
flash("Das Ergebnis ist noch nicht verfügbar.")
return redirect(url_for('auth.job_status'))
# Überprüfen, ob die Datei im angegebenen Pfad existiert
result_path = os.path.join(current_app.config['RESULT_FOLDER'], job.result_filename)
print(f"Versuche, Datei herunterzuladen von: {result_path}")
if os.path.exists(result_path):
print("Datei existiert und wird zum Download bereitgestellt.")
return send_file(result_path, as_attachment=True)
else:
print("Datei nicht gefunden. Ergebnisverzeichnis oder Pfad prüfen.")
flash("Ergebnisdatei nicht gefunden.")
return redirect(url_for('auth.job_status'))
@bp.route('/delete_job/<int:job_id>', methods=['POST'])
@login_required
def delete_job(job_id):
job = Job.query.get_or_404(job_id)
if job.user_id != current_user.id:
flash("Sie haben keine Berechtigung, diesen Job zu löschen.")
return redirect(url_for('auth.job_status'))
# Löschen der Upload-Datei
upload_path = os.path.join(current_app.config['UPLOAD_FOLDER'], job.filename)
if os.path.exists(upload_path):
os.remove(upload_path)
print(f"Upload-Datei gelöscht: {upload_path}")
else:
print(f"Upload-Datei nicht gefunden: {upload_path}")
# Löschen der Results-Datei, falls vorhanden
if job.result_filename:
result_path = os.path.join(current_app.config['RESULT_FOLDER'], job.result_filename)
print(f"Versuche Ergebnisdatei zu löschen: {result_path}")
if os.path.exists(result_path):
try:
os.remove(result_path)
print(f"Ergebnisdatei gelöscht: {result_path}")
except Exception as e:
print(f"Fehler beim Löschen der Ergebnisdatei: {e}")
else:
print(f"Ergebnisdatei nicht gefunden im Pfad: {result_path}")
# Job aus der Datenbank löschen
db.session.delete(job)
db.session.commit()
flash("Job erfolgreich gelöscht.")
return redirect(url_for('auth.job_status'))
@bp.route('/admin', methods=['GET'])
@login_required
def admin_panel():
if not current_user.is_admin:
flash("Keine Berechtigung.")
return redirect(url_for('auth.job_status'))
users = User.query.all()
return render_template('admin_panel.html', users=users)
@bp.route('/admin/create_user', methods=['POST'])
@login_required
def create_user():
if not current_user.is_admin:
flash("Keine Berechtigung.")
return redirect(url_for('auth.admin_panel'))
username = request.form['username']
password = request.form['password']
is_admin = 'is_admin' in request.form # Checkbox für Adminrechte
hashed_password = generate_password_hash(password, method='sha256')
new_user = User(username=username, password=hashed_password, is_admin=is_admin)
db.session.add(new_user)
db.session.commit()
flash(f"Benutzer {username} wurde erstellt.")
return redirect(url_for('auth.admin_panel'))
@bp.route('/admin/reset_password/<int:user_id>', methods=['POST'])
@login_required
def reset_password(user_id):
if not current_user.is_admin:
flash("Keine Berechtigung.")
return redirect(url_for('auth.admin_panel'))
user = User.query.get_or_404(user_id)
new_password = request.form['new_password']
user.password = generate_password_hash(new_password, method='sha256')
db.session.commit()
flash(f"Passwort für Benutzer {user.username} wurde zurückgesetzt.")
return redirect(url_for('auth.admin_panel'))
@bp.route('/admin/delete_user/<int:user_id>', methods=['POST'])
@login_required
def delete_user(user_id):
if not current_user.is_admin:
flash("Keine Berechtigung.")
return redirect(url_for('auth.admin_panel'))
user = User.query.get_or_404(user_id)
if user.is_admin:
flash("Administratoren können nicht gelöscht werden.")
return redirect(url_for('auth.admin_panel'))
db.session.delete(user)
db.session.commit()
flash(f"Benutzer {user.username} wurde gelöscht.")
return redirect(url_for('auth.admin_panel'))

View file

@ -1,17 +1,16 @@
import csv import time
import os import os
import threading import threading
from flask import Blueprint, request, redirect, url_for, flash, render_template, send_file, current_app from flask import Blueprint, request, redirect, url_for, flash, render_template, send_file, jsonify, current_app
from flask_login import login_user, logout_user, login_required, current_user from flask_login import login_user, logout_user, login_required, current_user
from werkzeug.utils import secure_filename from werkzeug.utils import secure_filename
from werkzeug.security import generate_password_hash, check_password_hash from werkzeug.security import generate_password_hash, check_password_hash
from .models import db, User, Job from .models import db, User, Job, AppConfig
from .webcrawler import process_file # Importiere die Funktion für das Webscraping from .webcrawler import process_file
UPLOAD_FOLDER = 'uploads' UPLOAD_FOLDER = '/app/uploads'
RESULT_FOLDER = os.path.join(os.path.abspath(os.path.dirname(__file__)), 'results') RESULT_FOLDER = '/app/results'
# Blueprint für auth erstellen
bp = Blueprint('auth', __name__) bp = Blueprint('auth', __name__)
@bp.route('/login', methods=['GET', 'POST']) @bp.route('/login', methods=['GET', 'POST'])
@ -28,13 +27,18 @@ def login():
@bp.route('/signup', methods=['GET', 'POST']) @bp.route('/signup', methods=['GET', 'POST'])
def signup(): def signup():
cfg = AppConfig.query.filter_by(key='allow_signup').first()
if not cfg or cfg.value != 'true':
flash("Registrierung ist derzeit deaktiviert.")
return redirect(url_for('auth.login'))
if request.method == 'POST': if request.method == 'POST':
username = request.form['username'] username = request.form['username']
password = generate_password_hash(request.form['password'], method='sha256') password = generate_password_hash(request.form['password']) # ✅ Fix
new_user = User(username=username, password=password) new_user = User(username=username, password=password)
db.session.add(new_user) db.session.add(new_user)
db.session.commit() db.session.commit()
flash('Benutzer erfolgreich erstellt! Sie können sich jetzt einloggen.') flash('Benutzer erfolgreich erstellt!')
return redirect(url_for('auth.login')) return redirect(url_for('auth.login'))
return render_template('signup.html') return render_template('signup.html')
@ -47,102 +51,203 @@ def logout():
@bp.route('/jobs') @bp.route('/jobs')
@login_required @login_required
def job_status(): def job_status():
jobs = Job.query.filter_by(user_id=current_user.id).all() jobs = Job.query.filter_by(user_id=current_user.id).order_by(Job.created_at.desc()).all()
return render_template('jobs.html', jobs=jobs) return render_template('jobs.html', jobs=jobs)
# Hochladen und Verarbeiten der Datei im Hintergrund
@bp.route('/upload', methods=['GET', 'POST']) @bp.route('/upload', methods=['GET', 'POST'])
@login_required @login_required
def upload(): def upload():
if request.method == 'POST': if request.method == 'POST':
file = request.files['file'] if 'file' not in request.files:
filename = secure_filename(file.filename) flash('Keine Datei ausgewählt.')
if not filename.endswith('.csv'):
flash('Bitte eine CSV-Datei hochladen')
return redirect(url_for('auth.upload')) return redirect(url_for('auth.upload'))
file_path = os.path.join(UPLOAD_FOLDER, filename) file = request.files['file']
file.save(file_path) if not file or file.filename == '':
flash('Datei erfolgreich hochgeladen und Job gestartet') flash('Keine gültige Datei.')
return redirect(url_for('auth.upload'))
# Neuen Job erstellen filename = secure_filename(file.filename)
new_job = Job(user_id=current_user.id, filename=filename, status="Pending") name, ext = os.path.splitext(filename)
timestamp = time.strftime("%Y%m%d_%H%M%S")
unique_filename = f"{name}_{timestamp}{ext}" if os.path.exists(os.path.join(UPLOAD_FOLDER, filename)) else filename
filepath = os.path.join(UPLOAD_FOLDER, unique_filename)
file.save(filepath)
print(f"💾 UPLOAD: {filepath}")
new_job = Job(
user_id=current_user.id,
filename=unique_filename,
status="Pending"
)
db.session.add(new_job) db.session.add(new_job)
db.session.commit() db.session.commit()
print(f"🆕 JOB #{new_job.id} für User {current_user.id}")
# Debugging-Ausgabe zur Überprüfung der Thread-Erstellung thread = threading.Thread(
print(f"Starte Scraping-Thread für Job-ID: {new_job.id}") target=process_file,
args=(unique_filename, new_job.id, current_app._get_current_object())
# Starten des Scraping im Hintergrund-Thread und Übergeben des aktuellen Anwendungskontexts )
thread = threading.Thread(target=process_file, args=(filename, new_job.id, current_app._get_current_object())) thread.daemon = True
thread.start() thread.start()
print(f"🔄 THREAD STARTED Job {new_job.id}")
# Debugging-Ausgabe, nachdem der Thread gestartet wurde flash(f'"{unique_filename}" → Job #{new_job.id} läuft!')
print(f"Thread für Job {new_job.id} erfolgreich gestartet.")
return redirect(url_for('auth.job_status')) return redirect(url_for('auth.job_status'))
return render_template('upload.html') return render_template('upload.html')
@bp.route('/download/<int:job_id>', methods=['GET']) @bp.route('/download/<int:job_id>')
@login_required @login_required
def download_result(job_id): def download_result(job_id):
job = Job.query.get_or_404(job_id) job = Job.query.filter_by(id=job_id, user_id=current_user.id).first_or_404()
print(f"Job ID: {job.id} - User ID: {job.user_id} - Current User ID: {current_user.id}")
# Überprüfen, ob der Job dem aktuellen Benutzer gehört if not job.result_filename or not job.status.startswith(''):
if job.user_id != current_user.id: flash('Ergebnis nicht bereit.')
flash("Sie haben keine Berechtigung, dieses Ergebnis herunterzuladen.")
return redirect(url_for('auth.job_status')) return redirect(url_for('auth.job_status'))
# Überprüfen, ob das Ergebnis vorhanden ist result_path = os.path.join(RESULT_FOLDER, job.result_filename)
if not job.result_filename:
flash("Das Ergebnis ist noch nicht verfügbar.")
return redirect(url_for('auth.job_status'))
# Überprüfen, ob die Datei im angegebenen Pfad existiert
result_path = os.path.join(current_app.config['RESULT_FOLDER'], job.result_filename)
print(f"Versuche, Datei herunterzuladen von: {result_path}")
if os.path.exists(result_path): if os.path.exists(result_path):
print("Datei existiert und wird zum Download bereitgestellt.")
return send_file(result_path, as_attachment=True) return send_file(result_path, as_attachment=True)
else: flash('Datei fehlt.')
print("Datei nicht gefunden. Ergebnisverzeichnis oder Pfad prüfen.")
flash("Ergebnisdatei nicht gefunden.")
return redirect(url_for('auth.job_status')) return redirect(url_for('auth.job_status'))
@bp.route('/download_raw/<int:job_id>')
@login_required
def download_result_raw(job_id):
job = Job.query.filter_by(id=job_id, user_id=current_user.id).first_or_404()
if not job.result_filename_raw:
flash('Rohdaten nicht verfügbar.')
return redirect(url_for('auth.job_status'))
result_path = os.path.join(RESULT_FOLDER, job.result_filename_raw)
if os.path.exists(result_path):
return send_file(result_path, as_attachment=True)
flash('Datei fehlt.')
return redirect(url_for('auth.job_status'))
@bp.route('/delete_job/<int:job_id>', methods=['POST']) @bp.route('/delete_job/<int:job_id>', methods=['POST'])
@login_required @login_required
def delete_job(job_id): def delete_job(job_id):
job = Job.query.get_or_404(job_id) job = Job.query.filter_by(id=job_id, user_id=current_user.id).first_or_404()
if job.user_id != current_user.id:
flash("Sie haben keine Berechtigung, diesen Job zu löschen.")
return redirect(url_for('auth.job_status'))
# Löschen der Upload-Datei
upload_path = os.path.join(UPLOAD_FOLDER, job.filename) upload_path = os.path.join(UPLOAD_FOLDER, job.filename)
if os.path.exists(upload_path): if os.path.exists(upload_path):
os.remove(upload_path) os.remove(upload_path)
print(f"Upload-Datei gelöscht: {upload_path}")
else:
print(f"Upload-Datei nicht gefunden: {upload_path}")
# Löschen der Results-Datei, falls vorhanden
if job.result_filename: if job.result_filename:
result_path = os.path.join(RESULT_FOLDER, job.result_filename) result_path = os.path.join(RESULT_FOLDER, job.result_filename)
if os.path.exists(result_path): if os.path.exists(result_path):
try:
os.remove(result_path) os.remove(result_path)
print(f"Ergebnisdatei gelöscht: {result_path}")
except Exception as e:
print(f"Fehler beim Löschen der Ergebnisdatei: {e}")
else:
print(f"Ergebnisdatei nicht gefunden: {result_path}")
# Job aus der Datenbank löschen if job.result_filename_raw: # ✅ Raw auch löschen
raw_path = os.path.join(RESULT_FOLDER, job.result_filename_raw)
if os.path.exists(raw_path):
os.remove(raw_path)
db.session.delete(job) db.session.delete(job)
db.session.commit() db.session.commit()
flash("Job erfolgreich gelöscht.") flash('Job gelöscht.')
return redirect(url_for('auth.job_status')) return redirect(url_for('auth.job_status'))
@bp.route('/job_status/<int:job_id>')
@login_required
def job_status_api(job_id):
job = Job.query.filter_by(id=job_id, user_id=current_user.id).first()
if not job:
return jsonify({'error': 'Not found'}), 404
return jsonify({
'id': job.id,
'status': job.status,
'result_filename': job.result_filename,
'result_filename_raw': getattr(job, 'result_filename_raw', None),
'scraper_job_id': getattr(job, 'scraper_job_id', None)
})
@bp.route('/resume_job/<int:job_id>', methods=['POST'])
@login_required
def resume_job(job_id):
job = Job.query.filter_by(id=job_id, user_id=current_user.id).first_or_404()
thread = threading.Thread(
target=process_file,
args=(job.filename, job.id, current_app._get_current_object())
)
thread.daemon = True
thread.start()
flash(f'Job #{job_id} wird fortgesetzt...')
return redirect(url_for('auth.job_status'))
# ── ADMIN ──────────────────────────────────────────
@bp.route('/admin', methods=['GET'])
@login_required
def admin_panel():
if not current_user.is_admin:
flash("Keine Berechtigung.")
return redirect(url_for('auth.job_status'))
users = User.query.all()
cfg = AppConfig.query.filter_by(key='allow_signup').first()
signup_allowed = cfg and cfg.value == 'true'
return render_template('admin_panel.html', users=users, signup_allowed=signup_allowed)
@bp.route('/admin/create_user', methods=['POST'])
@login_required
def create_user():
if not current_user.is_admin:
return redirect(url_for('auth.admin_panel'))
username = request.form['username']
password = generate_password_hash(request.form['password']) # ✅ Fix
is_admin = 'is_admin' in request.form
new_user = User(username=username, password=password, is_admin=is_admin)
db.session.add(new_user)
db.session.commit()
flash(f'{username} erstellt.')
return redirect(url_for('auth.admin_panel'))
@bp.route('/admin/reset_password/<int:user_id>', methods=['POST'])
@login_required
def reset_password(user_id):
if not current_user.is_admin:
return redirect(url_for('auth.admin_panel'))
user = User.query.get_or_404(user_id)
new_password = request.form['new_password']
user.password = generate_password_hash(new_password) # ✅ Fix
db.session.commit()
flash(f'Passwort {user.username} zurückgesetzt.')
return redirect(url_for('auth.admin_panel'))
@bp.route('/admin/delete_user/<int:user_id>', methods=['POST'])
@login_required
def delete_user(user_id):
if not current_user.is_admin:
return redirect(url_for('auth.admin_panel'))
user = User.query.get_or_404(user_id)
if user.is_admin:
flash('Admin nicht löschbar.')
return redirect(url_for('auth.admin_panel'))
db.session.delete(user)
db.session.commit()
flash(f'{user.username} gelöscht.')
return redirect(url_for('auth.admin_panel'))
@bp.route('/admin/toggle_signup', methods=['POST'])
@login_required
def toggle_signup():
if not current_user.is_admin:
return redirect(url_for('auth.admin_panel'))
cfg = AppConfig.query.filter_by(key='allow_signup').first()
if not cfg:
cfg = AppConfig(key='allow_signup', value='true')
db.session.add(cfg)
else:
cfg.value = 'false' if cfg.value == 'true' else 'true'
db.session.commit()
state = '✅ aktiviert' if cfg.value == 'true' else '🔒 deaktiviert'
flash(f'Registrierung {state}.')
return redirect(url_for('auth.admin_panel'))

View file

@ -0,0 +1,6 @@
PLZ;Stadt;Straße;Hausnummer;Zusatz
53175;Bonn;Godesberger Str.;27;
50667;Köln;Hohenzollernring;52;A
10115;Berlin;Chausseestraße;125;
80331;München;Marienplatz;1;B
20095;Hamburg;Ostwall;5;
1 PLZ Stadt Straße Hausnummer Zusatz
2 53175 Bonn Godesberger Str. 27
3 50667 Köln Hohenzollernring 52 A
4 10115 Berlin Chausseestraße 125
5 80331 München Marienplatz 1 B
6 20095 Hamburg Ostwall 5

View file

@ -164,3 +164,186 @@ tr:nth-child(even) td {
.delete-btn:hover { .delete-btn:hover {
background-color: #e60000; background-color: #e60000;
} }
/* Flash-Badge Styling */
.flash-badge {
position: fixed;
top: 20px;
right: 20px;
background-color: #f44336; /* Material Design Rot */
color: white;
padding: 12px 24px;
border-radius: 8px;
font-family: 'Roboto', sans-serif;
font-weight: 500;
box-shadow: 0px 4px 8px rgba(0, 0, 0, 0.2);
z-index: 1000;
opacity: 0;
transform: translateY(-20px);
transition: opacity 0.4s ease, transform 0.4s ease;
}
/* Einblend-Animation */
.flash-badge.show {
opacity: 1;
transform: translateY(0);
}
/* Ausblend-Animation */
.flash-badge.hide {
opacity: 0;
transform: translateY(-20px);
}
.admin-panel {
max-width: 800px;
margin: 2em auto;
padding: 2em;
background: white;
border-radius: 8px;
box-shadow: 0 4px 12px rgba(0, 0, 0, 0.1);
}
.admin-panel h2 {
font-weight: 500;
color: #1d1d1f;
margin-bottom: 1em;
}
.user-table {
width: 100%;
border-collapse: collapse;
margin-bottom: 2em;
}
.user-table th, .user-table td {
padding: 0.75em;
text-align: left;
border: 1px solid #d1d1d6;
}
.user-table th {
background-color: #f1f1f1;
color: #333;
}
.user-table td {
background-color: white;
}
.user-table tr:nth-child(even) td {
background-color: #f9f9f9;
}
.reset-btn, .delete-btn, .create-btn {
padding: 0.5em 1em;
font-size: 0.9em;
font-weight: 500;
border: none;
border-radius: 4px;
cursor: pointer;
transition: background-color 0.2s ease-in-out;
}
.reset-btn {
background-color: #4caf50;
color: white;
}
.reset-btn:hover {
background-color: #388e3c;
}
.delete-btn {
background-color: #f44336;
color: white;
}
.delete-btn:hover {
background-color: #d32f2f;
}
.create-btn {
background-color: #007aff;
color: white;
padding: 0.75em;
margin-top: 1em;
display: block;
width: 100%;
font-size: 1em;
}
.create-btn:hover {
background-color: #005bb5;
}
.create-user-form {
margin-top: 1.5em;
}
.create-user-form input[type="text"],
.create-user-form input[type="password"] {
width: 100%;
padding: 0.75em;
margin-bottom: 1em;
border: 1px solid #d1d1d6;
border-radius: 8px;
}
.create-user-form label {
font-size: 0.9em;
color: #6e6e73;
display: block;
margin-bottom: 1em;
}
.logo-container {
text-align: center;
margin-bottom: 1em;
}
.logo-container img {
max-width: 100%;
height: auto;
width: 150px; /* Standardbreite für das Logo */
transition: width 0.3s ease;
}
@media (max-width: 768px) {
.logo-container img {
width: 120px; /* Kleinere Breite auf kleineren Bildschirmen */
}
}
@media (max-width: 480px) {
.logo-container img {
width: 100px; /* Noch kleinere Breite auf sehr kleinen Bildschirmen */
}
}
.template-info {
background-color: #f9f9f9;
padding: 1em;
border-radius: 8px;
margin-bottom: 1.5em;
text-align: center;
}
.template-info p {
margin: 0.5em 0;
}
.template-download {
display: inline-block;
padding: 0.5em 1em;
margin-top: 0.5em;
background-color: #007aff;
color: white;
border-radius: 4px;
text-decoration: none;
transition: background-color 0.2s ease;
}
.template-download:hover {
background-color: #005bb5;
}

View file

@ -0,0 +1,78 @@
{% extends "base.html" %}
{% block content %}
<div class="admin-panel">
<h2>Benutzerverwaltung</h2>
<!-- Tabelle für Benutzerverwaltung -->
<table class="user-table">
<thead>
<tr>
<th>ID</th>
<th>Benutzername</th>
<th>Admin</th>
<th>Aktionen</th>
</tr>
</thead>
<tbody>
{% for user in users %}
<tr>
<td>{{ user.id }}</td>
<td>{{ user.username }}</td>
<td>{{ 'Ja' if user.is_admin else 'Nein' }}</td>
<td>
<form action="{{ url_for('auth.reset_password', user_id=user.id) }}" method="post" style="display:inline;">
<input type="text" name="new_password" placeholder="Neues Passwort" required>
<button type="submit" class="reset-btn">Passwort zurücksetzen</button>
</form>
{% if not user.is_admin %}
<form action="{{ url_for('auth.delete_user', user_id=user.id) }}" method="post" style="display:inline;">
<button type="submit" class="delete-btn">Benutzer löschen</button>
</form>
{% endif %}
</td>
</tr>
{% endfor %}
</tbody>
</table>
<!-- Formular zum Erstellen neuer Benutzer -->
<h3>Neuen Benutzer erstellen</h3>
<form action="{{ url_for('auth.create_user') }}" method="post" class="create-user-form">
<input type="text" name="username" placeholder="Benutzername" required>
<input type="password" name="password" placeholder="Passwort" required>
<label>
<input type="checkbox" name="is_admin"> Admin
</label>
<button type="submit" class="create-btn">Benutzer erstellen</button>
</form>
</div>
<div class="config-box">
<h3>⚙️ Einstellungen</h3>
<form action="{{ url_for('auth.toggle_signup') }}" method="POST">
<div class="toggle-row">
<span>Benutzer-Registrierung:</span>
{% if signup_allowed %}
<span class="badge badge-green">✅ Aktiv</span>
<button type="submit" class="btn-danger">🔒 Deaktivieren</button>
{% else %}
<span class="badge badge-red">🔒 Deaktiviert</span>
<button type="submit" class="btn-success">✅ Aktivieren</button>
{% endif %}
</div>
</form>
</div>
<style>
.config-box { background: #f8f9fa; border: 1px solid #ddd; border-radius: 6px; padding: 16px; margin-bottom: 24px; }
.toggle-row { display: flex; align-items: center; gap: 12px; }
.badge { padding: 3px 10px; border-radius: 12px; font-size: 0.85em; font-weight: bold; }
.badge-green { background: #d4edda; color: #155724; }
.badge-red { background: #f8d7da; color: #721c24; }
.btn-danger { background: #e74c3c; color: white; border: none; padding: 6px 14px; border-radius: 4px; cursor: pointer; }
.btn-success { background: #27ae60; color: white; border: none; padding: 6px 14px; border-radius: 4px; cursor: pointer; }
.btn-danger:hover { background: #c0392b; }
.btn-success:hover { background: #1e8449; }
</style>
{% endblock %}

View file

@ -14,14 +14,50 @@
<ul> <ul>
<li><a href="{{ url_for('auth.job_status') }}">Jobs</a></li> <li><a href="{{ url_for('auth.job_status') }}">Jobs</a></li>
<li><a href="{{ url_for('auth.upload') }}">Upload</a></li> <li><a href="{{ url_for('auth.upload') }}">Upload</a></li>
{% if current_user.is_admin %}
<li><a href="{{ url_for('auth.admin_panel') }}">Admin</a></li> <!-- Admin-Bereich Link -->
{% endif %}
<li><a href="{{ url_for('auth.logout') }}">Logout</a></li> <li><a href="{{ url_for('auth.logout') }}">Logout</a></li>
</ul> </ul>
</nav> </nav>
</header> </header>
{% endif %} {% endif %}
<!-- Flash-Nachrichten -->
{% with messages = get_flashed_messages() %}
{% if messages %}
<div id="flash-badge-container">
{% for message in messages %}
<div class="flash-badge">{{ message }}</div>
{% endfor %}
</div>
{% endif %}
{% endwith %}
<div class="{% if request.endpoint in ['auth.login', 'auth.signup'] %}form-container{% else %}container{% endif %}"> <div class="{% if request.endpoint in ['auth.login', 'auth.signup'] %}form-container{% else %}container{% endif %}">
{% block content %}{% endblock %} {% block content %}{% endblock %}
</div> </div>
<!-- JavaScript für Ein- und Ausblendanimation des Flash-Badges -->
<script>
document.addEventListener("DOMContentLoaded", function() {
var flashBadges = document.querySelectorAll('.flash-badge');
flashBadges.forEach(function(badge) {
// Einblendung mit Verzögerung
setTimeout(function() {
badge.classList.add('show');
}, 100);
// Ausblendung nach 5 Sekunden und Entfernen aus dem DOM
setTimeout(function() {
badge.classList.remove('show');
badge.classList.add('hide');
setTimeout(function() {
badge.remove();
}, 400); // Zeit für die Ausblendanimation
}, 5000);
});
});
</script>
</body> </body>
</html> </html>

121
app/templates/jobs.bck Normal file
View file

@ -0,0 +1,121 @@
{% extends "base.html" %}
{% block content %}
<div class="table-container">
<h2>Ihre Aufträge</h2>
<table id="jobs-table">
<thead>
<tr>
<th>Dateiname</th>
<th>Status</th>
<th>Erstellt am</th>
<th>Ergebnis</th>
<th>Aktionen</th>
</tr>
</thead>
<tbody>
{% for job in jobs %}
<tr id="job-row-{{ job.id }}">
<td>{{ job.filename }}</td>
<td id="status-{{ job.id }}" class="job-status">{{ job.status }}</td>
<td>{{ job.created_at.strftime('%Y-%m-%d %H:%M:%S') }}</td>
<td id="result-{{ job.id }}">
{% if job.result_filename and 'Failed' not in job.status %}
<a href="{{ url_for('auth.download_result', job_id=job.id) }}" class="dl-btn">
🎯 Gefiltert
</a>
{% if job.result_filename_raw %}
&nbsp;
<a href="{{ url_for('auth.download_result_raw', job_id=job.id) }}" class="dl-btn dl-btn-raw">
📋 Alle
</a>
{% endif %}
{% elif 'Failed' in job.status %}
<span class="status-failed">❌ {{ job.result_filename or 'Fehler' }}</span>
{% else %}
<span class="status-pending">⏳ Noch nicht verfügbar</span>
{% endif %}
</td>
<td>
<form action="{{ url_for('auth.delete_job', job_id=job.id) }}" method="POST" style="display:inline;">
<button type="submit" class="delete-btn">🗑️ Löschen</button>
</form>
</td>
</tr>
{% endfor %}
</tbody>
</table>
</div>
<style>
.job-status { font-weight: bold; }
.status-failed { color: #e74c3c; font-weight: bold; }
.status-pending { color: #888; }
.status-completed { color: #27ae60; }
.dl-btn {
display: inline-block;
padding: 4px 10px;
border-radius: 4px;
text-decoration: none;
font-size: 0.85em;
font-weight: bold;
background: #27ae60;
color: #fff;
margin: 2px 1px;
transition: background 0.2s;
}
.dl-btn:hover { background: #1e8449; }
.dl-btn-raw { background: #2980b9; }
.dl-btn-raw:hover { background: #1a5e8a; }
</style>
<script>
document.addEventListener('DOMContentLoaded', function () {
document.querySelectorAll('.job-status').forEach(function (cell) {
const jobId = cell.id.split('-')[1];
const status = cell.textContent.trim();
if (!status.includes('✅') && !status.includes('Failed')) {
pollJob(jobId);
}
});
});
function renderResult(resultCell, data) {
const hasFailed = data.status.includes('Failed');
const hasFiltered = data.result_filename && !hasFailed;
const hasRaw = data.result_filename_raw && !hasFailed;
if (hasFiltered) {
let html = `<a href="/download/${data.id}" class="dl-btn">🎯 Gefiltert</a>`;
if (hasRaw) {
html += ` <a href="/download_raw/${data.id}" class="dl-btn dl-btn-raw">📋 Alle</a>`;
}
resultCell.innerHTML = html;
} else if (hasFailed) {
resultCell.innerHTML = `<span class="status-failed">❌ ${data.result_filename || 'Fehler'}</span>`;
} else {
resultCell.innerHTML = `<span class="status-pending">⏳ Noch nicht verfügbar</span>`;
}
}
function pollJob(jobId) {
fetch(`/job_status/${jobId}`)
.then(r => r.json())
.then(data => {
const statusCell = document.getElementById(`status-${jobId}`);
const resultCell = document.getElementById(`result-${jobId}`);
statusCell.textContent = data.status;
renderResult(resultCell, data);
// Weiter pollen wenn noch nicht fertig
const done = data.status.includes('✅') || data.status.includes('Failed');
if (!done) {
setTimeout(() => pollJob(jobId), 5000);
}
})
.catch(() => setTimeout(() => pollJob(jobId), 10000));
}
</script>
{% endblock %}

View file

@ -15,20 +15,38 @@
</thead> </thead>
<tbody> <tbody>
{% for job in jobs %} {% for job in jobs %}
<tr> <tr id="job-row-{{ job.id }}">
<td>{{ job.filename }}</td> <td>{{ job.filename }}</td>
<td class="job-status">{{ job.status }}</td> <td id="status-{{ job.id }}" class="job-status">
{{ job.status }}
</td>
<td>{{ job.created_at.strftime('%Y-%m-%d %H:%M:%S') }}</td> <td>{{ job.created_at.strftime('%Y-%m-%d %H:%M:%S') }}</td>
<td> <td id="result-{{ job.id }}">
{% if job.status == "Completed" %} {% if job.result_filename and 'Failed' not in job.status %}
<a href="{{ url_for('auth.download_result', job_id=job.id) }}">Download</a> <a href="{{ url_for('auth.download_result', job_id=job.id) }}" class="dl-btn">
🎯 Gefiltert
</a>
{% if job.result_filename_raw %}
&nbsp;
<a href="{{ url_for('auth.download_result_raw', job_id=job.id) }}" class="dl-btn dl-btn-raw">
📋 Alle
</a>
{% endif %}
{% elif 'Failed' in job.status %}
<span class="status-failed">❌ {{ job.result_filename or 'Fehler' }}</span>
{% else %} {% else %}
Noch nicht verfügbar <span class="status-pending"> Noch nicht verfügbar</span>
{% endif %} {% endif %}
</td> </td>
<td> <td>
{% if 'Failed' in job.status %}
<!-- 🆕 Resume Button -->
<form action="{{ url_for('auth.resume_job', job_id=job.id) }}" method="POST" style="display:inline;">
<button type="submit" class="btn-resume">▶️ Resume</button>
</form>
{% endif %}
<form action="{{ url_for('auth.delete_job', job_id=job.id) }}" method="POST" style="display:inline;"> <form action="{{ url_for('auth.delete_job', job_id=job.id) }}" method="POST" style="display:inline;">
<button type="submit" class="delete-btn">Löschen</button> <button type="submit" class="delete-btn">🗑️ Löschen</button>
</form> </form>
</td> </td>
</tr> </tr>
@ -37,25 +55,101 @@
</table> </table>
</div> </div>
<style>
.job-status { font-weight: bold; }
.status-failed { color: #e74c3c; font-weight: bold; }
.status-pending { color: #888; }
.eta-badge { display: inline-block; background: #eaf4ff; color: #1a6fa8;
border-radius: 10px; padding: 2px 8px; font-size: 0.82em;
font-weight: bold; margin-left: 6px; }
.dl-btn { display: inline-block; padding: 4px 10px; border-radius: 4px;
text-decoration: none; font-size: 0.85em; font-weight: bold;
background: #27ae60; color: #fff; margin: 2px 1px; transition: background 0.2s; }
.dl-btn:hover { background: #1e8449; }
.dl-btn-raw { background: #2980b9; }
.dl-btn-raw:hover { background: #1a5e8a; }
.btn-resume { background: #e67e22; color: white; border: none;
padding: 4px 10px; border-radius: 4px; cursor: pointer;
font-size: 0.85em; font-weight: bold; margin-right: 4px; }
.btn-resume:hover { background: #ca6f1e; }
</style>
<script> <script>
// Periodische Aktualisierung des Jobstatus // ETA Badge aus Status-String parsen
setInterval(function() { function parseStatus(status) {
fetch('{{ url_for("auth.job_status") }}') const parts = status.split('|');
.then(response => response.text()) if (parts.length === 2) {
.then(html => { return `<span>${parts[0].trim()}</span>
const parser = new DOMParser(); <span class="eta-badge">${parts[1].trim()}</span>`;
const doc = parser.parseFromString(html, 'text/html'); }
const newRows = doc.querySelectorAll('#jobs-table tbody tr'); return status;
const currentRows = document.querySelectorAll('#jobs-table tbody tr'); }
newRows.forEach((newRow, index) => { function renderResult(resultCell, data) {
const newStatus = newRow.querySelector('.job-status').textContent; const hasFailed = data.status.includes('Failed') || data.status.includes('❌');
currentRows[index].querySelector('.job-status').textContent = newStatus; const hasFiltered = data.result_filename && !hasFailed;
const hasRaw = data.result_filename_raw && !hasFailed;
const newResult = newRow.querySelector('td:nth-child(4)').innerHTML; if (hasFiltered) {
currentRows[index].querySelector('td:nth-child(4)').innerHTML = newResult; let html = `<a href="/download/${data.id}" class="dl-btn">🎯 Gefiltert</a>`;
if (hasRaw) {
html += ` <a href="/download_raw/${data.id}" class="dl-btn dl-btn-raw">📋 Alle</a>`;
}
resultCell.innerHTML = html;
} else if (hasFailed) {
resultCell.innerHTML = `<span class="status-failed">❌ ${data.result_filename || 'Fehler'}</span>`;
} else {
resultCell.innerHTML = `<span class="status-pending">⏳ Noch nicht verfügbar</span>`;
}
}
function renderActions(row, data) {
const actionsCell = row.querySelector('td:last-child');
const hasFailed = data.status.includes('Failed');
let html = '';
if (hasFailed) {
html += `<form action="/resume_job/${data.id}" method="POST" style="display:inline;">
<button type="submit" class="btn-resume">▶️ Resume</button>
</form>`;
}
html += `<form action="/delete_job/${data.id}" method="POST" style="display:inline;">
<button type="submit" class="delete-btn">🗑️ Löschen</button>
</form>`;
actionsCell.innerHTML = html;
}
function pollJob(jobId) {
fetch(`/job_status/${jobId}`)
.then(r => r.json())
.then(data => {
const statusCell = document.getElementById(`status-${jobId}`);
const resultCell = document.getElementById(`result-${jobId}`);
const row = document.getElementById(`job-row-${jobId}`);
statusCell.innerHTML = parseStatus(data.status);
renderResult(resultCell, data);
renderActions(row, data);
const done = data.status.includes('✅') || data.status.includes('Failed') || data.status.includes('❌');
if (!done) {
setTimeout(() => pollJob(jobId), 5000);
}
})
.catch(() => setTimeout(() => pollJob(jobId), 10000));
}
document.addEventListener('DOMContentLoaded', function () {
document.querySelectorAll('.job-status').forEach(function (cell) {
const jobId = cell.id.split('-')[1];
const status = cell.textContent.trim();
cell.innerHTML = parseStatus(status);
if (!status.includes('✅') && !status.includes('Failed') && !status.includes('❌')) {
pollJob(jobId);
}
}); });
}); });
}, 5000); // Aktualisierung alle 5 Sekunden
</script> </script>
{% endblock %} {% endblock %}

61
app/templates/jobs.orig Normal file
View file

@ -0,0 +1,61 @@
{% extends "base.html" %}
{% block content %}
<div class="table-container">
<h2>Ihre Aufträge</h2>
<table id="jobs-table">
<thead>
<tr>
<th>Dateiname</th>
<th>Status</th>
<th>Erstellt am</th>
<th>Ergebnis</th>
<th>Aktionen</th>
</tr>
</thead>
<tbody>
{% for job in jobs %}
<tr>
<td>{{ job.filename }}</td>
<td class="job-status">{{ job.status }}</td>
<td>{{ job.created_at.strftime('%Y-%m-%d %H:%M:%S') }}</td>
<td>
{% if job.status == "Completed" %}
<a href="{{ url_for('auth.download_result', job_id=job.id) }}">Download</a>
{% else %}
Noch nicht verfügbar
{% endif %}
</td>
<td>
<form action="{{ url_for('auth.delete_job', job_id=job.id) }}" method="POST" style="display:inline;">
<button type="submit" class="delete-btn">Löschen</button>
</form>
</td>
</tr>
{% endfor %}
</tbody>
</table>
</div>
<script>
// Periodische Aktualisierung des Jobstatus
setInterval(function() {
fetch('{{ url_for("auth.job_status") }}')
.then(response => response.text())
.then(html => {
const parser = new DOMParser();
const doc = parser.parseFromString(html, 'text/html');
const newRows = doc.querySelectorAll('#jobs-table tbody tr');
const currentRows = document.querySelectorAll('#jobs-table tbody tr');
newRows.forEach((newRow, index) => {
const newStatus = newRow.querySelector('.job-status').textContent;
currentRows[index].querySelector('.job-status').textContent = newStatus;
const newResult = newRow.querySelector('td:nth-child(4)').innerHTML;
currentRows[index].querySelector('td:nth-child(4)').innerHTML = newResult;
});
});
}, 5000); // Aktualisierung alle 5 Sekunden
</script>
{% endblock %}

File diff suppressed because one or more lines are too long

View file

@ -1,11 +1,22 @@
{% extends "base.html" %} {% extends "base.html" %}
{% block content %} {% block content %}
<div class="form-container"> <div class="form-container">
<h2>Datei hochladen</h2> <h2>Datei hochladen</h2>
<form method="POST" enctype="multipart/form-data">
<label for="file">CSV-Datei:</label> <!-- Instructions and download link for the template -->
<div class="template-info">
<p>Bitte verwenden Sie die folgende Vorlage für Ihre Adressliste:</p>
<a href="{{ url_for('static', filename='Adressliste_vorlage.csv') }}" download="Adressliste_vorlage.csv" class="template-download">Adressliste Vorlage herunterladen</a>
<p>Die Vorlage enthält die folgenden Spalten: <strong>PLZ, Stadt, Straße, Hausnummer, Zusatz</strong>. Bitte verwenden Sie das Format für eine reibungslose Verarbeitung.</p>
</div>
<form action="{{ url_for('auth.upload') }}" method="post" enctype="multipart/form-data">
<label for="file">Datei auswählen:</label>
<input type="file" id="file" name="file" accept=".csv" required> <input type="file" id="file" name="file" accept=".csv" required>
<button type="submit">Upload</button> <button type="submit">Datei hochladen</button>
</form> </form>
</div> </div>
</body>
</html>
{% endblock %} {% endblock %}

316
app/webcrawler.bck02032026 Normal file
View file

@ -0,0 +1,316 @@
import os
import re
import pandas as pd
import requests
import time
import random
from io import StringIO
from app.models import db, Job
print("🆕 MODERN webcrawler LOADED!")
UPLOAD_FOLDER = '/app/uploads'
RESULT_FOLDER = '/app/results'
SCRAPER_URL = "http://gmaps-scraper:8080"
OUTPUT_COLS = ['title', 'category', 'address', 'open_hours', 'website', 'phone', 'link']
# ──────────────────────────────────────────────
# Hilfsfunktionen
# ──────────────────────────────────────────────
def get_batch_size(total_rows):
if total_rows < 50: return 10
elif total_rows < 200: return 10
elif total_rows < 500: return 5
else: return 5
def get_delay(total_rows):
if total_rows < 50: return (5, 10)
elif total_rows < 200: return (10, 20)
else: return (20, 40)
def is_blocked(data):
text = str(data).lower()
blocked = any(kw in text for kw in ['captcha', 'blocked', 'rate limit', 'too many', '429'])
if blocked:
print(f"🚫 BLOCKED: {str(data)[:100]}")
return blocked
def fix_encoding(text):
"""Kaputte ISO→UTF8 Zeichen reparieren (z.B. Industriestraße → Industriestraße)"""
if not isinstance(text, str):
return text
try:
return text.encode('latin-1').decode('utf-8')
except (UnicodeEncodeError, UnicodeDecodeError):
return text
def build_input_addresses(df):
"""Normalisierte Adressen aus Input-CSV für Abgleich"""
addresses = set()
for _, row in df.iterrows():
plz = str(row.get('PLZ', '')).strip()
stadt = str(row.get('Stadt', '')).strip()
str_ = str(row.get('Straße', '')).strip()
nr = str(row.get('Hausnummer', '')).strip()
zusatz = str(row.get('Zusatz', '')).strip()
full = f"{str_} {nr} {zusatz} {plz} {stadt}".lower().strip()
full = ' '.join(full.split())
addresses.add(full)
return addresses
def normalize_address(addr):
"""Output-Adresse normalisieren für Abgleich"""
if not isinstance(addr, str):
return ''
addr = fix_encoding(addr)
return ' '.join(addr.lower().strip().split())
def address_in_input(result_addr, input_addresses):
"""Prüft ob PLZ + Straßenname aus Result im Input vorkommen"""
norm = normalize_address(result_addr)
for inp_addr in input_addresses:
plz_match = re.search(r'\b\d{5}\b', inp_addr)
if plz_match:
plz = plz_match.group()
if plz in norm:
street = inp_addr.split()[0] if inp_addr else ''
if len(street) > 3 and street[:4].lower() in norm:
return True
return False
# ──────────────────────────────────────────────
# CSV Nachbearbeitung (apply_filter umschaltbar)
# ──────────────────────────────────────────────
def process_result_csv(raw_bytes, input_df, apply_filter=True):
"""
Raw CSV → bereinigt:
- Nur OUTPUT_COLS
- Encoding fix
- Optional: Input/Output Abgleich + Duplikate
"""
try:
content = raw_bytes.decode('utf-8', errors='replace')
df_out = pd.read_csv(StringIO(content))
print(f"📄 Raw result: {df_out.shape} | Columns: {list(df_out.columns)[:8]}")
# Spalten filtern
available = [c for c in OUTPUT_COLS if c in df_out.columns]
missing = [c for c in OUTPUT_COLS if c not in df_out.columns]
if missing:
print(f"⚠️ Fehlende Spalten: {missing}")
df_out = df_out[available]
# 🔤 Encoding fix
for col in df_out.columns:
df_out[col] = df_out[col].apply(fix_encoding)
print(f"🔤 Encoding fix: done")
if apply_filter:
# 📍 Input/Output Abgleich
input_addresses = build_input_addresses(input_df)
before = len(df_out)
df_out = df_out[
df_out['address'].apply(
lambda a: address_in_input(a, input_addresses)
)
]
print(f"📍 Adress-Filter: {before} → {len(df_out)} Zeilen")
# 🔁 Duplikate entfernen (immer, auch bei Raw)
before_dedup = len(df_out)
df_out = df_out.drop_duplicates(subset=['title', 'address'], keep='first')
print(f"🔁 Duplikate: {before_dedup} → {len(df_out)} Zeilen")
# Leere Titel entfernen
df_out = df_out.dropna(subset=['title'], how='all')
df_out = df_out[df_out['title'].str.strip().astype(bool)]
print(f"✅ Final ({'gefiltert' if apply_filter else 'alle'}): {df_out.shape}")
return df_out
except Exception as e:
print(f"💥 process_result_csv: {e}")
import traceback
traceback.print_exc()
return None
# ──────────────────────────────────────────────
# Haupt-Worker
# ──────────────────────────────────────────────
def process_file(filename, job_id, app):
print(f"🎯 {filename} Job#{job_id} START!")
with app.app_context():
job = Job.query.get(job_id)
if not job:
print("❌ Job missing")
return
try:
# 1⃣ CSV Parse
job.status = "📊 parsing CSV"
db.session.commit()
filepath = os.path.join(UPLOAD_FOLDER, filename)
print(f"📁 {filepath} | {os.path.getsize(filepath)}b")
df_input = pd.read_csv(filepath, sep=';', encoding='ISO-8859-1')
print(f"📊 {df_input.shape} | Columns: {list(df_input.columns)}")
queries = []
for _, row in df_input.iterrows():
parts = [
str(row.get('PLZ', '')).strip(),
str(row.get('Stadt', '')).strip(),
str(row.get('Straße', '')).strip(),
str(row.get('Hausnummer', '')).strip(),
str(row.get('Zusatz', '')).strip(),
]
q = f"Firmen {' '.join(p for p in parts if p and p != 'nan')}".strip()
if len(q) > 10:
queries.append(q)
total = len(queries)
print(f"🔍 {total} Queries | Samples: {queries[:3]}")
if not queries:
raise ValueError("Keine gültigen Adressen in CSV")
# 2⃣ Batch + Delay
batch_size = get_batch_size(total)
delay_min, delay_max = get_delay(total)
batch = queries[:batch_size]
pre_delay = random.uniform(delay_min, delay_max)
print(f"📦 Batch {len(batch)}/{total} | 😴 {pre_delay:.1f}s Delay")
time.sleep(pre_delay)
# 3⃣ API Call
job.status = "📤 sending to scraper"
db.session.commit()
payload = {
"name": f"{filename.replace('.csv','')}-{job_id}",
"keywords": batch,
"lang": "de",
"depth": 1,
"zoom": 17,
"radius": 50,
"max_time": 60,
"fast_mode": False
}
print(f"🌐 POST {SCRAPER_URL}/api/v1/jobs | {payload['name']}")
resp = requests.post(f"{SCRAPER_URL}/api/v1/jobs", json=payload, timeout=30)
print(f"📤 {resp.status_code}: {resp.text[:300]}")
if is_blocked(resp.text):
raise ValueError("🚫 IP geblockt! Proxy konfigurieren.")
if resp.status_code != 201:
raise ValueError(f"API {resp.status_code}: {resp.text[:200]}")
# 4⃣ Polling
scraper_id = resp.json()['id']
job.scraper_job_id = scraper_id
job.status = "⏳ scraping"
db.session.commit()
print(f"✅ Scraper Job: {scraper_id}")
for i in range(1, 61): # Max 10min
try:
r = requests.get(
f"{SCRAPER_URL}/api/v1/jobs/{scraper_id}",
timeout=10
)
data = r.json()
status = data.get('Status', data.get('status', '?'))
print(f"⏳ {i}/60: {status}")
if is_blocked(data):
raise ValueError("🚫 IP geblockt während scraping!")
if status in ('ok', 'completed', 'scraped'):
dl = requests.get(
f"{SCRAPER_URL}/api/v1/jobs/{scraper_id}/download",
timeout=60
)
if dl.status_code != 200:
raise ValueError(f"Download {dl.status_code}")
if is_blocked(dl.text[:200]):
raise ValueError("🚫 IP geblockt beim Download!")
# 5⃣ Nachbearbeitung → zwei Versionen
job.status = "🔧 processing result"
db.session.commit()
base = filename.replace('.csv', '')
os.makedirs(RESULT_FOLDER, exist_ok=True)
# ── Version A: Gefiltert (Adressabgleich + Deduplizierung) ──
df_filtered = process_result_csv(dl.content, df_input, apply_filter=True)
outname_filtered = f"results_{base}_filtered.csv"
outpath_filtered = os.path.join(RESULT_FOLDER, outname_filtered)
if df_filtered is not None and len(df_filtered) > 0:
df_filtered.to_csv(
outpath_filtered, index=False,
encoding='utf-8-sig', sep=';'
)
print(f"🎯 Filtered: {outname_filtered} → {len(df_filtered)} Firmen")
else:
print("⚠️ Keine Treffer nach Filter leere Datei wird erstellt")
pd.DataFrame(columns=OUTPUT_COLS).to_csv(
outpath_filtered, index=False,
encoding='utf-8-sig', sep=';'
)
# ── Version B: Alle (nur Spalten + Encoding, kein Filter) ──
df_raw = process_result_csv(dl.content, df_input, apply_filter=False)
outname_raw = f"results_{base}_all.csv"
outpath_raw = os.path.join(RESULT_FOLDER, outname_raw)
if df_raw is not None:
df_raw.to_csv(
outpath_raw, index=False,
encoding='utf-8-sig', sep=';'
)
print(f"📋 All: {outname_raw} → {len(df_raw)} Firmen")
else:
print("⚠️ df_raw None Rohinhalt wird gespeichert")
with open(outpath_raw, 'wb') as f:
f.write(dl.content)
# ── DB speichern ──
job.status = "✅ Fertig"
job.result_filename = outname_filtered # 🎯 Gefiltert
job.result_filename_raw = outname_raw # 📋 Alle
db.session.commit()
print(f"🎉 Beide Dateien gespeichert!")
break
elif status in ('failed', 'cancelled', 'error'):
raise ValueError(f"Scraper: {status}")
except requests.RequestException as e:
print(f"⚠️ Poll {i}: {e}")
time.sleep(random.uniform(8, 15))
else:
raise ValueError("Timeout nach 10min")
except Exception as e:
job.status = "Failed"
job.result_filename = str(e)
print(f"💥 ERROR: {e}")
import traceback
traceback.print_exc()
db.session.commit()
print(f"✅ DONE! Status: {job.status}\n")

275
app/webcrawler.bck04032026 Normal file
View file

@ -0,0 +1,275 @@
import os
import re
import pandas as pd
import requests
import time
import random
from io import StringIO
from app.models import db, Job
print("🆕 MODERN webcrawler LOADED! BATCHED + PROXY")
UPLOAD_FOLDER = '/app/uploads'
RESULT_FOLDER = '/app/results'
SCRAPER_URL = "http://gmaps-scraper:8080"
OUTPUT_COLS = ['title', 'category', 'address', 'open_hours', 'website', 'phone', 'link']
PROXY_URL = "http://bitlleuv-rotate:s5hzse6hz74b@p.webshare.io:80"
API_PROXIES = {"http": PROXY_URL, "https": PROXY_URL}
# ──────────────────────────────────────────────
# Hilfsfunktionen
# ──────────────────────────────────────────────
def is_blocked(data):
text = str(data).lower()
blocked = any(kw in text for kw in ['captcha', 'blocked', 'rate limit', 'too many', '429'])
if blocked:
print(f"🚫 BLOCKED: {str(data)[:100]}")
return blocked
def fix_encoding(text):
if not isinstance(text, str):
return text
try:
return text.encode('latin-1').decode('utf-8')
except (UnicodeEncodeError, UnicodeDecodeError):
return text
def build_input_addresses(df):
addresses = set()
for _, row in df.iterrows():
plz = str(row.get('PLZ', '')).strip()
stadt = str(row.get('Stadt', '')).strip()
str_ = str(row.get('Straße', '')).strip()
nr = str(row.get('Hausnummer', '')).strip()
zusatz = str(row.get('Zusatz', '')).strip()
full = f"{str_} {nr} {zusatz} {plz} {stadt}".lower().strip()
full = ' '.join(full.split())
addresses.add(full)
return addresses
def normalize_address(addr):
if not isinstance(addr, str):
return ''
addr = fix_encoding(addr)
return ' '.join(addr.lower().strip().split())
def address_in_input(result_addr, input_addresses):
norm = normalize_address(result_addr)
for inp_addr in input_addresses:
plz_match = re.search(r'\b\d{5}\b', inp_addr)
if plz_match:
plz = plz_match.group()
if plz in norm:
street = inp_addr.split()[0] if inp_addr else ''
if len(street) > 3 and street[:4].lower() in norm:
return True
return False
# ──────────────────────────────────────────────
# CSV Nachbearbeitung
# ──────────────────────────────────────────────
def process_result_csv(raw_bytes, input_df, apply_filter=True):
try:
content = raw_bytes.decode('utf-8', errors='replace')
df_out = pd.read_csv(StringIO(content))
print(f"📄 Raw result: {df_out.shape}")
available = [c for c in OUTPUT_COLS if c in df_out.columns]
df_out = df_out[available]
for col in df_out.columns:
df_out[col] = df_out[col].apply(fix_encoding)
if apply_filter:
input_addresses = build_input_addresses(input_df)
before = len(df_out)
df_out = df_out[
df_out['address'].apply(
lambda a: address_in_input(a, input_addresses)
)
]
print(f"📍 Filter: {before} → {len(df_out)}")
df_out = df_out.drop_duplicates(subset=['title', 'address'], keep='first')
df_out = df_out.dropna(subset=['title'], how='all')
df_out = df_out[df_out['title'].str.strip().astype(bool)]
print(f"✅ Final ({'gefiltert' if apply_filter else 'alle'}): {df_out.shape}")
return df_out
except Exception as e:
print(f"💥 process_result_csv: {e}")
return None
# ──────────────────────────────────────────────
# HAUPT-WORKER
# ──────────────────────────────────────────────
def process_file(filename, job_id, app):
print(f"🎯 {filename} Job#{job_id} START!")
with app.app_context():
job = Job.query.get(job_id)
if not job:
print("❌ Job missing")
return
try:
#Parse + ALLE Queries
job.status = "📊 parsing CSV"
db.session.commit()
filepath = os.path.join(UPLOAD_FOLDER, filename)
print(f"📁 {filepath} | {os.path.getsize(filepath)}b")
df_input = pd.read_csv(filepath, sep=';', encoding='ISO-8859-1')
print(f"📊 {df_input.shape}")
queries = []
for _, row in df_input.iterrows():
parts = [
str(row.get('PLZ', '')).strip(),
str(row.get('Stadt', '')).strip(),
str(row.get('Straße', '')).strip(),
str(row.get('Hausnummer', '')).strip(),
str(row.get('Zusatz', '')).strip(),
]
q = f"Firmen {' '.join(p for p in parts if p and p != 'nan')}".strip()
if len(q) > 10:
queries.append(q)
total_queries = len(queries)
print(f"🔍 {total_queries} Queries | Samples: {queries[:3]}")
if total_queries == 0:
raise ValueError("Keine gültigen Adressen")
#BATCHED Processing
BATCH_SIZE = 10 # Erhöht: 5 → 10 (paid proxy)
BATCH_DELAY_MIN, BATCH_DELAY_MAX = 10, 20 # Reduziert: 30-60s → 10-20s (paid proxy)
batches = (total_queries + BATCH_SIZE - 1) // BATCH_SIZE
print(f"📦 {batches} Batches à {BATCH_SIZE} | ETA: ~{batches*15//60:.0f}h")
all_results_filtered = []
all_results_raw = []
job.status = f"🔄 Batch 1/{batches}"
db.session.commit()
for batch_idx in range(batches):
batch_start = batch_idx * BATCH_SIZE
batch_end = min(batch_start + BATCH_SIZE, total_queries)
batch_queries = queries[batch_start:batch_end]
print(f"\n🔄 BATCH {batch_idx+1}/{batches} ({batch_start+1}-{batch_end}/{total_queries})")
#Random Delay
delay = random.uniform(BATCH_DELAY_MIN, BATCH_DELAY_MAX)
print(f"😴 Delay: {delay:.0f}s | Proxy: {PROXY_URL}")
time.sleep(delay)
#API Call
payload = {
"name": f"{filename.replace('.csv','')}-{job_id}-B{batch_idx+1:03d}",
"keywords": batch_queries,
"lang": "de",
"depth": 1,
"zoom": 17,
"radius": 50,
"max_time": 60, # Reduziert: 120 → 60 (paid proxy schneller)
"fast_mode": False,
"proxies": [PROXY_URL]
}
try:
resp = requests.post(
f"{SCRAPER_URL}/api/v1/jobs",
json=payload,
timeout=45
)
print(f"📤 {resp.status_code}")
if is_blocked(resp.text):
print("🚫 Batch übersprungen (blocked)")
continue
if resp.status_code != 201:
print(f"⚠️ Batch {batch_idx+1} fehlgeschlagen: {resp.text[:100]}")
continue
scraper_id = resp.json()['id']
print(f"✅ Scraper: {scraper_id}")
for poll_i in range(1, 61): # Reduziert: 121 → 61 (max_time 60s)
r = requests.get(
f"{SCRAPER_URL}/api/v1/jobs/{scraper_id}",
timeout=15
)
data = r.json()
status = data.get('Status', data.get('status', '?'))
if status in ('ok', 'completed', 'scraped'):
dl = requests.get(
f"{SCRAPER_URL}/api/v1/jobs/{scraper_id}/download",
timeout=90
)
if dl.status_code == 200:
df_filtered = process_result_csv(dl.content, df_input, True)
df_raw = process_result_csv(dl.content, df_input, False)
if df_filtered is not None:
all_results_filtered.append(df_filtered)
all_results_raw.append(df_raw)
print(f"📊 Batch {batch_idx+1}: {len(df_filtered)} filtered")
break
elif status in ('failed', 'error'):
print(f"💥 Batch {batch_idx+1}: {status}")
break
time.sleep(random.uniform(5, 10)) # Reduziert: 10-20s → 5-10s (paid proxy)
except Exception as e:
print(f"💥 Batch {batch_idx+1}: {e}")
job.status = f"🔄 Batch {batch_idx+2}/{batches}"
db.session.commit()
#MERGE & SAVE
job.status = "🔧 merging results"
db.session.commit()
base = filename.replace('.csv', '')
os.makedirs(RESULT_FOLDER, exist_ok=True)
if all_results_filtered:
df_final_filtered = pd.concat(all_results_filtered, ignore_index=True)
df_final_filtered = df_final_filtered.drop_duplicates(subset=['title', 'address'])
out_filtered = f"results_{base}_filtered.csv"
df_final_filtered.to_csv(
os.path.join(RESULT_FOLDER, out_filtered),
index=False, encoding='utf-8-sig', sep=';'
)
if all_results_raw:
df_final_raw = pd.concat(all_results_raw, ignore_index=True)
out_raw = f"results_{base}_all.csv"
df_final_raw.to_csv(
os.path.join(RESULT_FOLDER, out_raw),
index=False, encoding='utf-8-sig', sep=';'
)
job.result_filename = out_filtered
job.result_filename_raw = out_raw
job.status = f"✅ Fertig: {len(df_final_filtered)} Firmen"
else:
job.status = "❌ Keine Ergebnisse"
db.session.commit()
print(f"🎉 Job {job_id} komplett!")
except Exception as e:
job.status = f"Failed: {str(e)[:50]}"
print(f"💥 FATAL: {e}")
import traceback
traceback.print_exc()
db.session.commit()
print(f"✅ DONE! Status: {job.status}")

View file

@ -0,0 +1,429 @@
import os
import re
import unicodedata
import json
import pandas as pd
import requests
import time
import random
from io import StringIO
from app.models import db, Job
print("🆕 MODERN webcrawler LOADED! BATCHED + PROXY + RESUME + ETA + 2x SCRAPER")
UPLOAD_FOLDER = '/app/uploads'
RESULT_FOLDER = '/app/results'
# 2x Scraper abwechselnd genutzt
SCRAPER_URLS = [
"http://gmaps-scraper-1:8080",
"http://gmaps-scraper-2:8080",
]
OUTPUT_COLS = ['title', 'category', 'address', 'open_hours', 'website', 'phone', 'link']
PROXY_URL = "http://bitlleuv-rotate:s5hzse6hz74b@p.webshare.io:80"
API_PROXIES = {"http": PROXY_URL, "https": PROXY_URL}
# ──────────────────────────────────────────────
# Tuning
# ──────────────────────────────────────────────
BATCH_SIZE = 30 # Keywords pro Scraper-Job
BATCH_DELAY_MIN = 3 # Sekunden Pause zwischen Batches (min)
BATCH_DELAY_MAX = 6 # Sekunden Pause zwischen Batches (max)
MAX_TIME = 60 # Sekunden die der Scraper pro Batch hat
POLL_MAX = 90 # Max. Poll-Versuche pro Batch
POLL_DELAY_MIN = 2 # Sekunden zwischen Polls (min)
POLL_DELAY_MAX = 5 # Sekunden zwischen Polls (max)
STUCK_THRESHOLD = 8 # Polls auf 'pending' bis Auto-Restart
MAX_RETRIES = 2 # Wiederholversuche pro Batch bei Fehler
# ──────────────────────────────────────────────
# Hilfsfunktionen
# ──────────────────────────────────────────────
def is_blocked(data):
text = str(data).lower()
blocked = any(kw in text for kw in ['captcha', 'blocked', 'rate limit', 'too many', '429'])
if blocked:
print(f"🚫 BLOCKED: {str(data)[:100]}")
return blocked
def fix_encoding(text):
if not isinstance(text, str):
return text
try:
return text.encode('latin-1').decode('utf-8')
except (UnicodeEncodeError, UnicodeDecodeError):
return text
# Fix 1: Sonderzeichen in Queries bereinigen
def clean_query(q):
"""Steuerzeichen + fehlerhafte Bytes entfernen für saubere Google Maps URLs"""
q = ''.join(c for c in q if unicodedata.category(c) != 'Cc')
q = ' '.join(q.split())
return q.strip()
def build_input_addresses(df):
addresses = set()
for _, row in df.iterrows():
plz = str(row.get('PLZ', '')).strip()
stadt = str(row.get('Stadt', '')).strip()
str_ = str(row.get('Straße', '')).strip()
nr = str(row.get('Hausnummer', '')).strip()
zusatz = str(row.get('Zusatz', '')).strip()
full = f"{str_} {nr} {zusatz} {plz} {stadt}".lower().strip()
full = ' '.join(full.split())
addresses.add(full)
return addresses
def normalize_address(addr):
if not isinstance(addr, str):
return ''
addr = fix_encoding(addr)
return ' '.join(addr.lower().strip().split())
def address_in_input(result_addr, input_addresses):
norm = normalize_address(result_addr)
for inp_addr in input_addresses:
plz_match = re.search(r'\b\d{5}\b', inp_addr)
if plz_match:
plz = plz_match.group()
if plz in norm:
street = inp_addr.split()[0] if inp_addr else ''
if len(street) > 3 and street[:4].lower() in norm:
return True
return False
def format_eta(seconds):
"""Sekunden → lesbares ETA-Format"""
if seconds < 60:
return f"{int(seconds)}s"
h, rem = divmod(int(seconds), 3600)
m = rem // 60
return f"{h}h {m:02d}min" if h > 0 else f"{m}min"
# ──────────────────────────────────────────────
# Fix 3: Scraper-Neustart bei Inactivity
# ──────────────────────────────────────────────
def restart_scraper(scraper_url):
"""Den betroffenen Scraper-Container neu starten"""
try:
import subprocess
# Container-Name aus URL ableiten: http://gmaps-scraper-1:8080 → gmaps-scraper-1
container = scraper_url.split("//")[1].split(":")[0]
print(f"🔄 Starte {container} neu...")
subprocess.run(["docker", "restart", container], timeout=30, capture_output=True)
print(f"✅ {container} neu gestartet warte 15s...")
time.sleep(15)
return True
except Exception as e:
print(f"⚠️ Scraper-Neustart fehlgeschlagen: {e}")
return False
# ──────────────────────────────────────────────
# Resume: Progress-File Hilfsfunktionen
# ──────────────────────────────────────────────
def get_progress_path(job_id):
return os.path.join(RESULT_FOLDER, f"progress_{job_id}.json")
def get_partial_path(job_id, suffix):
return os.path.join(RESULT_FOLDER, f"partial_{job_id}_{suffix}.csv")
def load_progress(job_id):
"""Gespeicherten Fortschritt laden (falls vorhanden)"""
path = get_progress_path(job_id)
if os.path.exists(path):
with open(path, 'r') as f:
data = json.load(f)
print(f"🔁 RESUME: ab Batch {data['last_completed_batch'] + 1}/{data['total_batches']}")
return data
return None
def save_progress(job_id, last_completed_batch, total_batches):
"""Fortschritt nach jedem Batch speichern"""
path = get_progress_path(job_id)
with open(path, 'w') as f:
json.dump({"last_completed_batch": last_completed_batch, "total_batches": total_batches}, f)
def append_partial(job_id, df_filtered, df_raw):
"""Batch-Ergebnis an Partial-CSV anhängen"""
for suffix, df in [('filtered', df_filtered), ('raw', df_raw)]:
if df is None:
continue
path = get_partial_path(job_id, suffix)
header = not os.path.exists(path)
df.to_csv(path, mode='a', index=False, header=header, encoding='utf-8-sig', sep=';')
def load_partial(job_id):
"""Bestehende Partial-CSVs laden"""
results_filtered, results_raw = [], []
for suffix, lst in [('filtered', results_filtered), ('raw', results_raw)]:
path = get_partial_path(job_id, suffix)
if os.path.exists(path):
try:
df = pd.read_csv(path, sep=';', encoding='utf-8-sig')
lst.append(df)
print(f"📂 Partial {suffix}: {len(df)} Zeilen geladen")
except Exception as e:
print(f"⚠️ Partial {suffix} Ladefehler: {e}")
return results_filtered, results_raw
def cleanup_progress(job_id):
"""Progress + Partial-Files nach Abschluss löschen"""
for path in [
get_progress_path(job_id),
get_partial_path(job_id, 'filtered'),
get_partial_path(job_id, 'raw'),
]:
if os.path.exists(path):
os.remove(path)
# ──────────────────────────────────────────────
# CSV Nachbearbeitung
# ──────────────────────────────────────────────
def process_result_csv(raw_bytes, input_df, apply_filter=True):
try:
content = raw_bytes.decode('utf-8', errors='replace')
df_out = pd.read_csv(StringIO(content))
print(f"📄 Raw result: {df_out.shape}")
available = [c for c in OUTPUT_COLS if c in df_out.columns]
df_out = df_out[available]
for col in df_out.columns:
df_out[col] = df_out[col].apply(fix_encoding)
if apply_filter:
input_addresses = build_input_addresses(input_df)
before = len(df_out)
df_out = df_out[
df_out['address'].apply(lambda a: address_in_input(a, input_addresses))
]
print(f"📍 Filter: {before} → {len(df_out)}")
df_out = df_out.drop_duplicates(subset=['title', 'address'], keep='first')
df_out = df_out.dropna(subset=['title'], how='all')
df_out = df_out[df_out['title'].str.strip().astype(bool)]
print(f"✅ Final ({'gefiltert' if apply_filter else 'alle'}): {df_out.shape}")
return df_out
except Exception as e:
print(f"💥 process_result_csv: {e}")
return None
# ──────────────────────────────────────────────
# HAUPT-WORKER
# ──────────────────────────────────────────────
def process_file(filename, job_id, app):
print(f"🎯 {filename} Job#{job_id} START!")
with app.app_context():
job = Job.query.get(job_id)
if not job:
print("❌ Job missing")
return
try:
#Parse + ALLE Queries
job.status = "📊 parsing CSV"
db.session.commit()
filepath = os.path.join(UPLOAD_FOLDER, filename)
print(f"📁 {filepath} | {os.path.getsize(filepath)}b")
df_input = pd.read_csv(filepath, sep=';', encoding='ISO-8859-1')
print(f"📊 {df_input.shape}")
queries = []
for _, row in df_input.iterrows():
parts = [
str(row.get('PLZ', '')).strip(),
str(row.get('Stadt', '')).strip(),
str(row.get('Straße', '')).strip(),
str(row.get('Hausnummer', '')).strip(),
str(row.get('Zusatz', '')).strip(),
]
q = f"Firmen {' '.join(p for p in parts if p and p != 'nan')}".strip()
q = clean_query(q) # Fix 1: Sonderzeichen bereinigen
if len(q) > 10:
queries.append(q)
total_queries = len(queries)
print(f"🔍 {total_queries} Queries | Samples: {queries[:3]}")
if total_queries == 0:
raise ValueError("Keine gültigen Adressen")
#BATCHED Processing
batches = (total_queries + BATCH_SIZE - 1) // BATCH_SIZE
# Resume: Fortschritt laden falls vorhanden
os.makedirs(RESULT_FOLDER, exist_ok=True)
progress = load_progress(job_id)
start_batch = progress['last_completed_batch'] + 1 if progress else 0
all_results_filtered, all_results_raw = load_partial(job_id) if progress else ([], [])
eta_initial = format_eta((batches - start_batch) * ((BATCH_DELAY_MAX + MAX_TIME) / 2))
print(f"📦 {batches} Batches à {BATCH_SIZE} | 2x Scraper | Start: {start_batch} | ETA: ~{eta_initial}")
job_start_time = time.time()
job.status = f"🔄 Batch {start_batch+1}/{batches} | ⏱️ ~{eta_initial}"
db.session.commit()
for batch_idx in range(start_batch, batches):
batch_start = batch_idx * BATCH_SIZE
batch_end = min(batch_start + BATCH_SIZE, total_queries)
batch_queries = queries[batch_start:batch_end]
# 2x Scraper: abwechselnd nutzen
scraper_url = SCRAPER_URLS[batch_idx % len(SCRAPER_URLS)]
print(f"\n🔄 BATCH {batch_idx+1}/{batches} ({batch_start+1}-{batch_end}/{total_queries}) → {scraper_url}")
#Random Delay
delay = random.uniform(BATCH_DELAY_MIN, BATCH_DELAY_MAX)
print(f"😴 Delay: {delay:.0f}s")
time.sleep(delay)
#API Call
payload = {
"name": f"{filename.replace('.csv','')}-{job_id}-B{batch_idx+1:03d}",
"keywords": batch_queries,
"lang": "de",
"depth": 1,
"zoom": 15,
"radius": 50,
"max_time": MAX_TIME,
"fast_mode": False,
"proxies": [PROXY_URL]
}
batch_success = False
# Fix 2: Retry-Logik bei Scraper-Fehler
for attempt in range(1, MAX_RETRIES + 1):
try:
resp = requests.post(f"{scraper_url}/api/v1/jobs", json=payload, timeout=45)
print(f"📤 {resp.status_code} (Versuch {attempt} | {scraper_url})")
if is_blocked(resp.text):
print("🚫 Batch übersprungen (blocked)")
break
if resp.status_code != 201:
print(f"⚠️ Batch {batch_idx+1} fehlgeschlagen: {resp.text[:100]}")
if attempt < MAX_RETRIES:
time.sleep(10)
continue
scraper_id = resp.json()['id']
print(f"✅ Scraper: {scraper_id}")
stuck_counter = 0
for poll_i in range(1, POLL_MAX + 1):
r = requests.get(f"{scraper_url}/api/v1/jobs/{scraper_id}", timeout=15)
data = r.json()
status = data.get('Status', data.get('status', '?'))
print(f"⏳ Poll {poll_i}: {status}")
# Fix 4: Auto-Recovery bei Pending-Stuck
if status == 'pending':
stuck_counter += 1
if stuck_counter >= STUCK_THRESHOLD:
print(f"⚠️ Job {scraper_id} hängt abbrechen + Neustart")
requests.delete(f"{scraper_url}/api/v1/jobs/{scraper_id}", timeout=10)
restart_scraper(scraper_url) # Fix 3: Nur betroffenen Scraper neu starten
break
else:
stuck_counter = 0
if status in ('ok', 'completed', 'scraped'):
dl = requests.get(f"{scraper_url}/api/v1/jobs/{scraper_id}/download", timeout=90)
if dl.status_code == 200:
df_filtered = process_result_csv(dl.content, df_input, True)
df_raw = process_result_csv(dl.content, df_input, False)
if df_filtered is not None:
all_results_filtered.append(df_filtered)
all_results_raw.append(df_raw)
append_partial(job_id, df_filtered, df_raw) # Resume: sofort speichern
print(f"📊 Batch {batch_idx+1}: {len(df_filtered)} filtered")
batch_success = True
break
# Fix 2: Scraper-Fehler → Retry
elif status in ('failed', 'error'):
print(f"💥 Batch {batch_idx+1}: {status} (Versuch {attempt})")
if attempt < MAX_RETRIES:
time.sleep(10)
break
time.sleep(random.uniform(POLL_DELAY_MIN, POLL_DELAY_MAX))
if batch_success:
break
except Exception as e:
print(f"💥 Batch {batch_idx+1} Versuch {attempt}: {e}")
if attempt < MAX_RETRIES:
time.sleep(10)
# Resume: Fortschritt nach jedem Batch speichern
save_progress(job_id, batch_idx, batches)
# ETA berechnen
elapsed = time.time() - job_start_time
done_so_far = batch_idx - start_batch + 1
if done_so_far > 0:
avg_per_batch = elapsed / done_so_far
remaining = (batches - batch_idx - 1) * avg_per_batch
eta_str = format_eta(remaining)
else:
eta_str = "?"
job.status = f"🔄 Batch {batch_idx+2}/{batches} | ⏱️ ~{eta_str}"
db.session.commit()
#MERGE & SAVE
job.status = "🔧 merging results"
db.session.commit()
base = filename.replace('.csv', '')
if all_results_filtered:
df_final_filtered = pd.concat(all_results_filtered, ignore_index=True)
df_final_filtered = df_final_filtered.drop_duplicates(subset=['title', 'address'])
out_filtered = f"results_{base}_filtered.csv"
df_final_filtered.to_csv(
os.path.join(RESULT_FOLDER, out_filtered),
index=False, encoding='utf-8-sig', sep=';'
)
if all_results_raw:
df_final_raw = pd.concat(all_results_raw, ignore_index=True)
out_raw = f"results_{base}_all.csv"
df_final_raw.to_csv(
os.path.join(RESULT_FOLDER, out_raw),
index=False, encoding='utf-8-sig', sep=';'
)
job.result_filename = out_filtered
job.result_filename_raw = out_raw
job.status = f"✅ Fertig: {len(df_final_filtered)} Firmen"
# Resume: Cleanup nach Abschluss
cleanup_progress(job_id)
else:
job.status = "❌ Keine Ergebnisse"
db.session.commit()
print(f"🎉 Job {job_id} komplett!")
except Exception as e:
job.status = f"Failed: {str(e)[:50]}"
print(f"💥 FATAL: {e}")
import traceback
traceback.print_exc()
db.session.commit()
print(f"✅ DONE! Status: {job.status}")

138
app/webcrawler.orig Normal file
View file

@ -0,0 +1,138 @@
import csv
import os
import requests
from .models import db, Job
from flask import current_app
UPLOAD_FOLDER = 'uploads'
RESULT_FOLDER = 'results'
API_KEY = 'AIzaSyAIf0yXJTwo87VMWLBtq2m2LqE-OaPGbzw'
processed_companies = set()
def get_geocode(address):
url = f"https://maps.googleapis.com/maps/api/geocode/json"
params = {'address': address, 'key': API_KEY}
try:
response = requests.get(url, params=params, timeout=5)
if response.status_code == 200:
data = response.json()
if data['status'] == 'OK':
location = data['results'][0]['geometry']['location']
return location['lat'], location['lng']
except requests.RequestException as e:
print(f"Geocode API Fehler für {address}: {e}")
return None, None
def get_nearby_places(lat, lng):
places_url = f"https://maps.googleapis.com/maps/api/place/nearbysearch/json"
params = {
'location': f"{lat},{lng}",
'radius': 10,
'type': 'point_of_interest',
'key': API_KEY
}
try:
response = requests.get(places_url, params=params, timeout=5)
if response.status_code == 200:
return response.json().get('results', [])
except requests.RequestException as e:
print(f"Nearby Places API Fehler für Standort {lat},{lng}: {e}")
return []
def get_place_details(place_id):
details_url = f"https://maps.googleapis.com/maps/api/place/details/json"
params = {
'place_id': place_id,
'fields': 'formatted_phone_number,website',
'key': API_KEY
}
try:
response = requests.get(details_url, params=params, timeout=5)
if response.status_code == 200:
result = response.json().get('result', {})
return result.get('formatted_phone_number', 'N/A'), result.get('website', 'N/A')
except requests.RequestException as e:
print(f"Place Details API Fehler für Place ID {place_id}: {e}")
return 'N/A', 'N/A'
def process_file(filename, job_id, app):
with app.app_context():
filepath = os.path.join(UPLOAD_FOLDER, filename)
results = []
job = Job.query.get(job_id)
if not job:
print("Job wurde abgebrochen.")
return
job.status = "In Progress"
db.session.commit()
with open(filepath, newline='', encoding='ISO-8859-1') as csvfile:
reader = csv.DictReader(csvfile, delimiter=';')
headers = reader.fieldnames
if not all(field in headers for field in ['PLZ', 'Straße', 'Hausnummer']):
print("CSV-Datei enthält nicht alle notwendigen Spalten.")
job.status = "Failed"
db.session.commit()
return
for row in reader:
plz = row.get('PLZ', '').strip()
city = row.get('Stadt', row.get('Bezirk', '')).strip()
street = row.get('Straße', '').strip()
house_number = row.get('Hausnummer', '').strip()
additional = row.get('Zusatz', '').strip()
if not all([plz, city, street, house_number]):
continue
full_address = f"{street} {house_number} {additional}, {plz} {city}"
lat, lng = get_geocode(full_address)
if lat is None or lng is None:
continue
nearby_places = get_nearby_places(lat, lng)
for place in nearby_places:
company_name = place['name']
if company_name in processed_companies:
continue
processed_companies.add(company_name)
company_address = place.get('vicinity', 'N/A').split(',')[0]
place_id = place.get('place_id')
company_phone, company_website = get_place_details(place_id) if place_id else ('N/A', 'N/A')
results.append({
'PLZ': plz,
'Stadt': city,
'Straße': street,
'Hausnummer': house_number,
'Zusatz': additional,
'Company Name': company_name,
'Company Address': company_address,
'Company Phone': company_phone,
'Company Website': company_website
})
if results:
result_file = f"results_{os.path.splitext(filename)[0]}.csv"
result_path = os.path.join(RESULT_FOLDER, result_file)
with open(result_path, 'w', newline='', encoding='utf-8-sig') as csvfile:
writer = csv.DictWriter(csvfile, fieldnames=[
'PLZ', 'Stadt', 'Straße', 'Hausnummer', 'Zusatz',
'Company Name', 'Company Address', 'Company Phone', 'Company Website'
])
writer.writeheader()
writer.writerows(results)
job.status = "Completed"
job.result_filename = result_file
db.session.commit()
else:
job.status = "Failed"
db.session.commit()

View file

@ -1,128 +1,487 @@
import csv
import os import os
import re
import unicodedata
import json
import threading
import pandas as pd
import requests import requests
from .models import db, Job import time
from flask import current_app import random
from io import StringIO
from concurrent.futures import ThreadPoolExecutor, as_completed
from app.models import db, Job
UPLOAD_FOLDER = 'uploads' print("🆕 MODERN webcrawler LOADED! BATCHED + PROXY + RESUME + ETA + 4x SCRAPER CHUNK-PARALLEL")
RESULT_FOLDER = 'results'
API_KEY = 'AIzaSyAIf0yXJTwo87VMWLBtq2m2LqE-OaPGbzw'
def get_place_details(street, city_zip): UPLOAD_FOLDER = '/app/uploads'
address = f"{street}, {city_zip}" RESULT_FOLDER = '/app/results'
url = f"https://maps.googleapis.com/maps/api/place/textsearch/json"
params = {'query': address, 'key': API_KEY}
results = [] SCRAPER_URLS = [
"http://gmaps-scraper-1:8080",
"http://gmaps-scraper-2:8080",
"http://gmaps-scraper-3:8080",
"http://gmaps-scraper-4:8080",
]
OUTPUT_COLS = ['title', 'category', 'address', 'open_hours', 'website', 'phone', 'link']
PROXY_URL = "http://bitlleuv-rotate:s5hzse6hz74b@p.webshare.io:80"
API_PROXIES = {"http": PROXY_URL, "https": PROXY_URL}
_job_semaphore = threading.Semaphore(1)
# ──────────────────────────────────────────────
# Tuning
# ──────────────────────────────────────────────
BATCH_SIZE = 30 # Keywords pro Scraper-Job
BATCH_DELAY_MIN = 3 # Sekunden Pause zwischen Chunks (min)
BATCH_DELAY_MAX = 6 # Sekunden Pause zwischen Chunks (max)
MAX_TIME = 60 # Sekunden die der Scraper pro Batch hat
POLL_MAX = 90 # Max. Poll-Versuche pro Batch
POLL_DELAY_MIN = 2 # Sekunden zwischen Polls (min)
POLL_DELAY_MAX = 5 # Sekunden zwischen Polls (max)
STUCK_TIMEOUT = 300 # Sekunden bis Scraper-Neustart (5 Min)
MAX_RETRIES = 2 # Wiederholversuche pro Batch bei Fehler
PARALLEL_WORKERS = len(SCRAPER_URLS)
_partial_lock = threading.Lock()
# ──────────────────────────────────────────────
# Hilfsfunktionen
# ──────────────────────────────────────────────
def is_blocked(data):
text = str(data).lower()
blocked = any(kw in text for kw in ['captcha', 'blocked', 'rate limit', 'too many', '429'])
if blocked:
print(f"🚫 BLOCKED: {str(data)[:100]}")
return blocked
def fix_encoding(text):
if not isinstance(text, str):
return text
try: try:
response = requests.get(url, params=params, timeout=5) return text.encode('latin-1').decode('utf-8')
if response.status_code == 200: except (UnicodeEncodeError, UnicodeDecodeError):
data = response.json() return text
print(f"API Response Data for {address}: {data}")
for place in data.get('results', []): def clean_query(q):
name = place.get('name', 'N/A') q = ''.join(c for c in q if unicodedata.category(c) != 'Cc')
place_id = place.get('place_id') q = ' '.join(q.split())
formatted_address = place.get('formatted_address', 'N/A') return q.strip()
# Zweite Anfrage für detailliertere Informationen def build_input_addresses(df):
phone, website = 'N/A', 'N/A' addresses = set()
if place_id: for _, row in df.iterrows():
details_url = f"https://maps.googleapis.com/maps/api/place/details/json" plz = str(row.get('PLZ', '')).strip()
details_params = { stadt = str(row.get('Stadt', '')).strip()
'place_id': place_id, str_ = str(row.get('Straße', '')).strip()
'fields': 'formatted_phone_number,website', nr = str(row.get('Hausnummer', '')).strip()
'key': API_KEY zusatz = str(row.get('Zusatz', '')).strip()
full = f"{str_} {nr} {zusatz} {plz} {stadt}".lower().strip()
full = ' '.join(full.split())
addresses.add(full)
return addresses
def normalize_address(addr):
if not isinstance(addr, str):
return ''
addr = fix_encoding(addr)
return ' '.join(addr.lower().strip().split())
def address_in_input(result_addr, input_addresses):
norm = normalize_address(result_addr)
for inp_addr in input_addresses:
plz_match = re.search(r'\b\d{5}\b', inp_addr)
if not plz_match:
continue
plz = plz_match.group()
if plz not in norm:
continue
parts = inp_addr.split()
street = parts[0] if parts else ''
if len(street) < 4 or street[:5].lower() not in norm:
continue
hausnr = parts[1] if len(parts) > 1 else ''
if hausnr and not re.search(rf'\b{re.escape(hausnr)}\b', norm):
continue
return True
return False
def format_eta(seconds):
if seconds < 60:
return f"{int(seconds)}s"
h, rem = divmod(int(seconds), 3600)
m = rem // 60
return f"{h}h {m:02d}min" if h > 0 else f"{m}min"
# ──────────────────────────────────────────────
# Scraper-Job Cleanup
# ──────────────────────────────────────────────
def _cleanup_scraper_job(scraper_url, scraper_id):
"""Scraper-Job immer aufräumen wenn wir ihn nicht mehr brauchen"""
try:
requests.delete(f"{scraper_url}/api/v1/jobs/{scraper_id}", timeout=10)
print(f"🗑️ Scraper-Job {scraper_id} gelöscht")
except Exception as e:
print(f"⚠️ Cleanup fehlgeschlagen: {e}")
# ──────────────────────────────────────────────
# Scraper-Neustart via Docker SDK
# ──────────────────────────────────────────────
def restart_scraper(scraper_url):
try:
import docker
container_name = scraper_url.split("//")[1].split(":")[0]
print(f"🔄 Starte {container_name} neu...")
client = docker.from_env()
container = client.containers.get(container_name)
container.restart()
print(f"{container_name} neu gestartet warte 15s...")
time.sleep(15)
return True
except Exception as e:
print(f"⚠️ Scraper-Neustart fehlgeschlagen: {e}")
return False
# ──────────────────────────────────────────────
# Resume: Progress-File Hilfsfunktionen
# ──────────────────────────────────────────────
def get_progress_path(job_id):
return os.path.join(RESULT_FOLDER, f"progress_{job_id}.json")
def get_partial_path(job_id, suffix):
return os.path.join(RESULT_FOLDER, f"partial_{job_id}_{suffix}.csv")
def load_progress(job_id):
path = get_progress_path(job_id)
if os.path.exists(path):
with open(path, 'r') as f:
data = json.load(f)
print(f"🔁 RESUME: ab Batch {data['last_completed_batch'] + 1}/{data['total_batches']}")
return data
return None
def save_progress(job_id, last_completed_batch, total_batches):
path = get_progress_path(job_id)
with open(path, 'w') as f:
json.dump({"last_completed_batch": last_completed_batch, "total_batches": total_batches}, f)
def append_partial(job_id, df_filtered, df_raw):
with _partial_lock:
for suffix, df in [('filtered', df_filtered), ('raw', df_raw)]:
if df is None:
continue
path = get_partial_path(job_id, suffix)
header = not os.path.exists(path)
df.to_csv(path, mode='a', index=False, header=header, encoding='utf-8-sig', sep=';')
def load_partial(job_id):
results_filtered, results_raw = [], []
for suffix, lst in [('filtered', results_filtered), ('raw', results_raw)]:
path = get_partial_path(job_id, suffix)
if os.path.exists(path):
try:
df = pd.read_csv(path, sep=';', encoding='utf-8-sig')
lst.append(df)
print(f"📂 Partial {suffix}: {len(df)} Zeilen geladen")
except Exception as e:
print(f"⚠️ Partial {suffix} Ladefehler: {e}")
return results_filtered, results_raw
def cleanup_progress(job_id):
for path in [
get_progress_path(job_id),
get_partial_path(job_id, 'filtered'),
get_partial_path(job_id, 'raw'),
]:
if os.path.exists(path):
os.remove(path)
# ──────────────────────────────────────────────
# CSV Nachbearbeitung
# ──────────────────────────────────────────────
def process_result_csv(raw_bytes, input_df, apply_filter=True):
try:
content = raw_bytes.decode('utf-8', errors='replace')
df_out = pd.read_csv(StringIO(content))
print(f"📄 Raw result: {df_out.shape}")
available = [c for c in OUTPUT_COLS if c in df_out.columns]
df_out = df_out[available]
for col in df_out.columns:
df_out[col] = df_out[col].apply(fix_encoding)
if apply_filter:
input_addresses = build_input_addresses(input_df)
before = len(df_out)
df_out = df_out[
df_out['address'].apply(lambda a: address_in_input(a, input_addresses))
]
print(f"📍 Filter: {before}{len(df_out)}")
df_out = df_out.drop_duplicates(subset=['title', 'address'], keep='first')
df_out = df_out.dropna(subset=['title'], how='all')
df_out = df_out[df_out['title'].str.strip().astype(bool)]
print(f"✅ Final ({'gefiltert' if apply_filter else 'alle'}): {df_out.shape}")
return df_out
except Exception as e:
print(f"💥 process_result_csv: {e}")
return None
# ──────────────────────────────────────────────
# Parallel: Einzelnen Batch verarbeiten
# ──────────────────────────────────────────────
def process_batch(batch_idx, batch_queries, scraper_url, filename, job_id, df_input):
payload = {
"name": f"{filename.replace('.csv','')}-{job_id}-B{batch_idx+1:03d}",
"keywords": batch_queries,
"lang": "de",
"depth": 1,
"zoom": 17,
"radius": 100,
"max_time": MAX_TIME,
"fast_mode": False,
"proxies": [PROXY_URL]
} }
details_response = requests.get(details_url, params=details_params, timeout=5)
if details_response.status_code == 200:
details_data = details_response.json().get('result', {})
phone = details_data.get('formatted_phone_number', 'N/A')
website = details_data.get('website', 'N/A')
# Speichern nur, wenn Name und Telefonnummer vorhanden sind for attempt in range(1, MAX_RETRIES + 1):
if name != 'N/A' and phone != 'N/A': scraper_id = None
results.append({ try:
'Name': name, resp = requests.post(f"{scraper_url}/api/v1/jobs", json=payload, timeout=45)
'Address': formatted_address, print(f"📤 Batch {batch_idx+1}{scraper_url} | {resp.status_code} (Versuch {attempt})")
'Phone': phone,
'Website': website
})
else:
print(f"Fehler beim Abrufen der URL: {url} - Statuscode: {response.status_code}")
except requests.exceptions.RequestException as e:
print(f"Anfragefehler für {url}: {e}")
return results if is_blocked(resp.text):
print(f"🚫 Batch {batch_idx+1} blocked")
return None, None
if resp.status_code != 201:
print(f"⚠️ Batch {batch_idx+1} fehlgeschlagen: {resp.text[:100]}")
if attempt < MAX_RETRIES:
time.sleep(10)
continue
scraper_id = resp.json()['id']
print(f"✅ Batch {batch_idx+1} Scraper-ID: {scraper_id}")
batch_start_time = time.time()
for poll_i in range(1, POLL_MAX + 1):
r = requests.get(f"{scraper_url}/api/v1/jobs/{scraper_id}", timeout=15)
data = r.json()
status = data.get('Status', data.get('status', '?'))
elapsed = time.time() - batch_start_time
print(f"⏳ Batch {batch_idx+1} Poll {poll_i}: {status} | {int(elapsed)}s")
if status == 'pending' and elapsed > STUCK_TIMEOUT:
print(f"⚠️ Batch {batch_idx+1} hängt seit {int(elapsed)}s Neustart {scraper_url}")
_cleanup_scraper_job(scraper_url, scraper_id)
scraper_id = None
restart_scraper(scraper_url)
break
if status in ('ok', 'completed', 'scraped'):
dl = requests.get(f"{scraper_url}/api/v1/jobs/{scraper_id}/download", timeout=90)
scraper_id = None
if dl.status_code == 200:
df_filtered = process_result_csv(dl.content, df_input, True)
df_raw = process_result_csv(dl.content, df_input, False)
print(f"📊 Batch {batch_idx+1}: {len(df_filtered) if df_filtered is not None else 0} filtered")
return df_filtered, df_raw
return None, None
elif status in ('failed', 'error'):
print(f"💥 Batch {batch_idx+1}: {status} (Versuch {attempt})")
_cleanup_scraper_job(scraper_url, scraper_id)
scraper_id = None
if attempt < MAX_RETRIES:
time.sleep(10)
break
time.sleep(random.uniform(POLL_DELAY_MIN, POLL_DELAY_MAX))
except Exception as e:
print(f"💥 Batch {batch_idx+1} Versuch {attempt}: {e}")
if scraper_id:
_cleanup_scraper_job(scraper_url, scraper_id)
scraper_id = None
if attempt < MAX_RETRIES:
time.sleep(10)
return None, None
# ──────────────────────────────────────────────
# HAUPT-WORKER
# ──────────────────────────────────────────────
def process_file(filename, job_id, app): def process_file(filename, job_id, app):
with app.app_context(): with app.app_context():
print(f"Starte Prozess für Job-ID: {job_id}")
filepath = os.path.join(UPLOAD_FOLDER, filename)
results = []
job = Job.query.get(job_id) job = Job.query.get(job_id)
if not job: if job:
print("Job wurde abgebrochen, bevor er starten konnte.") job.status = "⏳ Wartet auf anderen Job..."
return
job.status = "In Progress"
db.session.commit() db.session.commit()
with open(filepath, newline='', encoding='ISO-8859-1') as csvfile: with _job_semaphore:
reader = csv.DictReader(csvfile, delimiter=';') print(f"🎯 {filename} Job#{job_id} START!")
rows = list(reader)
total_rows = len(rows)
print(f"Insgesamt zu verarbeitende Zeilen: {total_rows}")
for index, row in enumerate(rows): with app.app_context():
# Job-Verfügbarkeit erneut prüfen
job = Job.query.get(job_id) job = Job.query.get(job_id)
if not job: if not job:
print("Job wurde abgebrochen.") print("❌ Job missing")
return return
# Vollständige Adresse erstellen
street = f"{row.get('Straße', '')} {row.get('Hausnummer', '')}".strip()
city_zip = f"{row.get('PLZ', '')} {row.get('Stadt', '')}".strip()
print(f"Verarbeite Adresse: {street}, {city_zip}")
address_results = get_place_details(street, city_zip)
for result in address_results:
# Ergebnisse nur speichern, wenn Name und Telefonnummer vorhanden sind
if result['Name'] != 'N/A' and result['Phone'] != 'N/A':
result.update({
'PLZ': row.get('PLZ', ''),
'Stadt': row.get('Stadt', ''),
'Straße': row.get('Straße', ''),
'Hausnummer': row.get('Hausnummer', ''),
'Zusatz': row.get('Zusatz', '')
})
results.append(result)
# Results-Dateiname basierend auf dem Upload-Dateinamen
result_file = f"results_{filename}"
result_path = os.path.join(RESULT_FOLDER, result_file)
# Prüfen und erstellen des Ergebnisverzeichnisses
if not os.path.exists(RESULT_FOLDER):
os.makedirs(RESULT_FOLDER)
print(f"Erstelle Ergebnisverzeichnis: {RESULT_FOLDER}")
try: try:
if results: # Nur speichern, wenn Ergebnisse vorhanden sind job.status = "📊 parsing CSV"
with open(result_path, 'w', newline='', encoding='utf-8-sig') as csvfile:
writer = csv.DictWriter(csvfile, fieldnames=['Name', 'Address', 'Phone', 'Website', 'PLZ', 'Stadt', 'Straße', 'Hausnummer', 'Zusatz'])
writer.writeheader()
writer.writerows(results)
print(f"Ergebnisdatei erfolgreich gespeichert unter: {result_path}")
job.status = "Completed"
job.result_filename = result_file
db.session.commit() db.session.commit()
else:
print("Keine relevanten Ergebnisse zum Speichern vorhanden. Markiere den Job als 'Failed'.") filepath = os.path.join(UPLOAD_FOLDER, filename)
job.status = "Failed" print(f"📁 {filepath} | {os.path.getsize(filepath)}b")
df_input = pd.read_csv(filepath, sep=';', encoding='ISO-8859-1')
print(f"📊 {df_input.shape}")
queries = []
for _, row in df_input.iterrows():
parts = [
str(row.get('PLZ', '')).strip(),
str(row.get('Stadt', '')).strip(),
str(row.get('Straße', '')).strip(),
str(row.get('Hausnummer', '')).strip(),
str(row.get('Zusatz', '')).strip(),
]
q = f"Firmen {' '.join(p for p in parts if p and p != 'nan')}".strip()
q = clean_query(q)
if len(q) > 10:
queries.append(q)
total_queries = len(queries)
print(f"🔍 {total_queries} Queries | Samples: {queries[:3]}")
if total_queries == 0:
raise ValueError("Keine gültigen Adressen")
batches = (total_queries + BATCH_SIZE - 1) // BATCH_SIZE
os.makedirs(RESULT_FOLDER, exist_ok=True)
progress = load_progress(job_id)
start_batch = progress['last_completed_batch'] + 1 if progress else 0
all_results_filtered, all_results_raw = load_partial(job_id) if progress else ([], [])
eta_initial = format_eta((batches - start_batch) * ((BATCH_DELAY_MAX + MAX_TIME) / 2) / PARALLEL_WORKERS)
print(f"📦 {batches} Batches à {BATCH_SIZE} | {PARALLEL_WORKERS}x parallel (Chunk) | Start: {start_batch} | ETA: ~{eta_initial}")
job_start_time = time.time()
job.status = f"🔄 Batch {start_batch+1}/{batches} | ⏱️ ~{eta_initial}"
db.session.commit() db.session.commit()
completed_count = 0
batch_indices = list(range(start_batch, batches))
chunks = [
batch_indices[i:i + PARALLEL_WORKERS]
for i in range(0, len(batch_indices), PARALLEL_WORKERS)
]
with ThreadPoolExecutor(max_workers=PARALLEL_WORKERS) as executor:
for chunk_idx, chunk in enumerate(chunks):
futures = {}
for batch_idx in chunk:
batch_start_q = batch_idx * BATCH_SIZE
batch_end_q = min(batch_start_q + BATCH_SIZE, total_queries)
batch_queries = queries[batch_start_q:batch_end_q]
scraper_url = SCRAPER_URLS[batch_idx % len(SCRAPER_URLS)]
print(f"\n🚀 Chunk {chunk_idx+1} | Batch {batch_idx+1}/{batches}{scraper_url}")
time.sleep(random.uniform(1, 2))
future = executor.submit(
process_batch,
batch_idx, batch_queries, scraper_url,
filename, job_id, df_input
)
futures[future] = batch_idx
for future in as_completed(futures):
batch_idx = futures[future]
completed_count += 1
try:
df_filtered, df_raw = future.result()
if df_filtered is not None:
all_results_filtered.append(df_filtered)
all_results_raw.append(df_raw)
append_partial(job_id, df_filtered, df_raw)
except Exception as e: except Exception as e:
print(f"Fehler beim Schreiben der Ergebnisdatei: {e}") print(f"💥 Batch {batch_idx+1} Exception: {e}")
job.status = "Failed"
save_progress(job_id, batch_idx, batches)
elapsed = time.time() - job_start_time
if completed_count > 0:
avg_per_batch = elapsed / completed_count
remaining = (batches - start_batch - completed_count) * avg_per_batch / PARALLEL_WORKERS
eta_str = format_eta(remaining)
else:
eta_str = "?"
job.status = f"🔄 {completed_count}/{batches - start_batch} fertig | ⏱️ ~{eta_str}"
db.session.commit() db.session.commit()
if chunk_idx < len(chunks) - 1:
delay = random.uniform(BATCH_DELAY_MIN, BATCH_DELAY_MAX)
print(f"⏸️ Chunk {chunk_idx+1} fertig warte {delay:.1f}s...")
time.sleep(delay)
# ── MERGE & SAVE ──
job.status = "🔧 merging results"
db.session.commit()
base = filename.replace('.csv', '')
if all_results_filtered:
df_final_filtered = pd.concat(all_results_filtered, ignore_index=True)
df_final_filtered = df_final_filtered.drop_duplicates(subset=['title', 'address'])
out_filtered = f"results_{base}_filtered.csv"
df_final_filtered.to_csv(
os.path.join(RESULT_FOLDER, out_filtered),
index=False, encoding='utf-8-sig', sep=';'
)
out_raw = None
if all_results_raw:
df_final_raw = pd.concat(all_results_raw, ignore_index=True)
out_raw = f"results_{base}_all.csv"
df_final_raw.to_csv(
os.path.join(RESULT_FOLDER, out_raw),
index=False, encoding='utf-8-sig', sep=';'
)
job.result_filename = out_filtered
job.result_filename_raw = out_raw
job.status = f"✅ Fertig: {len(df_final_filtered)} Firmen"
cleanup_progress(job_id)
else:
job.status = "❌ Keine Ergebnisse"
db.session.commit()
print(f"🎉 Job {job_id} komplett!")
except Exception as e:
job.status = f"Failed: {str(e)[:50]}"
print(f"💥 FATAL: {e}")
import traceback
traceback.print_exc()
db.session.commit()
print(f"✅ DONE! Status: {job.status}")

21
delete-crawl-jobs.py Normal file
View file

@ -0,0 +1,21 @@
import requests
import time
base_url = "http://localhost:5001/api/v1/jobs"
response = requests.get(base_url)
jobs = response.json() # Direkt Array
print(f"{len(jobs)} Jobs gefunden.")
deleted = 0
for job in jobs:
job_id = job["ID"]
del_res = requests.delete(f"{base_url}/{job_id}")
if del_res.status_code in [200, 204]:
print(f"{job_id}")
deleted += 1
else:
print(f"{job_id}: {del_res.status_code}")
time.sleep(0.1)
print(f"{deleted}/{len(jobs)} gelöscht.")

View file

@ -1,4 +1,4 @@
version: '3' version: '3.8'
services: services:
web: web:
build: . build: .
@ -6,6 +6,114 @@ services:
- "5000:5000" - "5000:5000"
environment: environment:
- FLASK_APP=app - FLASK_APP=app
command: flask run --host=0.0.0.0 --port=5000 - FLASK_ENV=production
- PYTHONUNBUFFERED=1
volumes: volumes:
- .:/app - ./app:/app/app
- ./uploads:/app/uploads
- ./results:/app/results
- ./instance:/app/instance
- /var/run/docker.sock:/var/run/docker.sock
depends_on:
- gmaps-scraper-1
- gmaps-scraper-2
- gmaps-scraper-3
- gmaps-scraper-4
restart: always
networks:
- scraper-net
gmaps-scraper-1:
image: gosom/google-maps-scraper:latest
container_name: gmaps-scraper-1
environment:
- PLAYWRIGHT_BROWSERS_PATH=/ms-playwright
ports:
- "5001:8080"
volumes:
- ./scraper-data-1:/gmapsdata
command:
- "-web"
- "-data-folder=/gmapsdata"
restart: always
healthcheck:
test: ["CMD-SHELL", "wget -qO- http://localhost:8080/api/v1/jobs || exit 1"]
interval: 30s
timeout: 10s
retries: 3
start_period: 15s
networks:
- scraper-net
gmaps-scraper-2:
image: gosom/google-maps-scraper:latest
container_name: gmaps-scraper-2
environment:
- PLAYWRIGHT_BROWSERS_PATH=/ms-playwright
ports:
- "5002:8080"
volumes:
- ./scraper-data-2:/gmapsdata
command:
- "-web"
- "-data-folder=/gmapsdata"
restart: always
healthcheck:
test: ["CMD-SHELL", "wget -qO- http://localhost:8080/api/v1/jobs || exit 1"]
interval: 30s
timeout: 10s
retries: 3
start_period: 15s
networks:
- scraper-net
gmaps-scraper-3:
image: gosom/google-maps-scraper:latest
container_name: gmaps-scraper-3
environment:
- PLAYWRIGHT_BROWSERS_PATH=/ms-playwright
ports:
- "5003:8080"
volumes:
- ./scraper-data-3:/gmapsdata
command:
- "-web"
- "-data-folder=/gmapsdata"
restart: always
healthcheck:
test: ["CMD-SHELL", "wget -qO- http://localhost:8080/api/v1/jobs || exit 1"]
interval: 30s
timeout: 10s
retries: 3
start_period: 15s
networks:
- scraper-net
gmaps-scraper-4:
image: gosom/google-maps-scraper:latest
container_name: gmaps-scraper-4
environment:
- PLAYWRIGHT_BROWSERS_PATH=/ms-playwright
ports:
- "5004:8080"
volumes:
- ./scraper-data-4:/gmapsdata
command:
- "-web"
- "-data-folder=/gmapsdata"
restart: always
healthcheck:
test: ["CMD-SHELL", "wget -qO- http://localhost:8080/api/v1/jobs || exit 1"]
interval: 30s
timeout: 10s
retries: 3
start_period: 15s
networks:
- scraper-net
networks:
scraper-net:
driver: bridge

Binary file not shown.

1
migrations/README Normal file
View file

@ -0,0 +1 @@
Single-database configuration for Flask.

50
migrations/alembic.ini Normal file
View file

@ -0,0 +1,50 @@
# A generic, single database configuration.
[alembic]
# template used to generate migration files
# file_template = %%(rev)s_%%(slug)s
# set to 'true' to run the environment during
# the 'revision' command, regardless of autogenerate
# revision_environment = false
# Logging configuration
[loggers]
keys = root,sqlalchemy,alembic,flask_migrate
[handlers]
keys = console
[formatters]
keys = generic
[logger_root]
level = WARN
handlers = console
qualname =
[logger_sqlalchemy]
level = WARN
handlers =
qualname = sqlalchemy.engine
[logger_alembic]
level = INFO
handlers =
qualname = alembic
[logger_flask_migrate]
level = INFO
handlers =
qualname = flask_migrate
[handler_console]
class = StreamHandler
args = (sys.stderr,)
level = NOTSET
formatter = generic
[formatter_generic]
format = %(levelname)-5.5s [%(name)s] %(message)s
datefmt = %H:%M:%S

113
migrations/env.py Normal file
View file

@ -0,0 +1,113 @@
import logging
from logging.config import fileConfig
from flask import current_app
from alembic import context
# this is the Alembic Config object, which provides
# access to the values within the .ini file in use.
config = context.config
# Interpret the config file for Python logging.
# This line sets up loggers basically.
fileConfig(config.config_file_name)
logger = logging.getLogger('alembic.env')
def get_engine():
try:
# this works with Flask-SQLAlchemy<3 and Alchemical
return current_app.extensions['migrate'].db.get_engine()
except (TypeError, AttributeError):
# this works with Flask-SQLAlchemy>=3
return current_app.extensions['migrate'].db.engine
def get_engine_url():
try:
return get_engine().url.render_as_string(hide_password=False).replace(
'%', '%%')
except AttributeError:
return str(get_engine().url).replace('%', '%%')
# add your model's MetaData object here
# for 'autogenerate' support
# from myapp import mymodel
# target_metadata = mymodel.Base.metadata
config.set_main_option('sqlalchemy.url', get_engine_url())
target_db = current_app.extensions['migrate'].db
# other values from the config, defined by the needs of env.py,
# can be acquired:
# my_important_option = config.get_main_option("my_important_option")
# ... etc.
def get_metadata():
if hasattr(target_db, 'metadatas'):
return target_db.metadatas[None]
return target_db.metadata
def run_migrations_offline():
"""Run migrations in 'offline' mode.
This configures the context with just a URL
and not an Engine, though an Engine is acceptable
here as well. By skipping the Engine creation
we don't even need a DBAPI to be available.
Calls to context.execute() here emit the given string to the
script output.
"""
url = config.get_main_option("sqlalchemy.url")
context.configure(
url=url, target_metadata=get_metadata(), literal_binds=True
)
with context.begin_transaction():
context.run_migrations()
def run_migrations_online():
"""Run migrations in 'online' mode.
In this scenario we need to create an Engine
and associate a connection with the context.
"""
# this callback is used to prevent an auto-migration from being generated
# when there are no changes to the schema
# reference: http://alembic.zzzcomputing.com/en/latest/cookbook.html
def process_revision_directives(context, revision, directives):
if getattr(config.cmd_opts, 'autogenerate', False):
script = directives[0]
if script.upgrade_ops.is_empty():
directives[:] = []
logger.info('No changes in schema detected.')
conf_args = current_app.extensions['migrate'].configure_args
if conf_args.get("process_revision_directives") is None:
conf_args["process_revision_directives"] = process_revision_directives
connectable = get_engine()
with connectable.connect() as connection:
context.configure(
connection=connection,
target_metadata=get_metadata(),
**conf_args
)
with context.begin_transaction():
context.run_migrations()
if context.is_offline_mode():
run_migrations_offline()
else:
run_migrations_online()

24
migrations/script.py.mako Normal file
View file

@ -0,0 +1,24 @@
"""${message}
Revision ID: ${up_revision}
Revises: ${down_revision | comma,n}
Create Date: ${create_date}
"""
from alembic import op
import sqlalchemy as sa
${imports if imports else ""}
# revision identifiers, used by Alembic.
revision = ${repr(up_revision)}
down_revision = ${repr(down_revision)}
branch_labels = ${repr(branch_labels)}
depends_on = ${repr(depends_on)}
def upgrade():
${upgrades if upgrades else "pass"}
def downgrade():
${downgrades if downgrades else "pass"}

View file

@ -0,0 +1,45 @@
"""Add is_admin column to User model
Revision ID: 10331d61a25d
Revises:
Create Date: 2024-11-14 08:36:27.125841
"""
from alembic import op
import sqlalchemy as sa
# revision identifiers, used by Alembic.
revision = '10331d61a25d'
down_revision = None
branch_labels = None
depends_on = None
def upgrade():
# ### commands auto generated by Alembic - please adjust! ###
op.drop_table('user')
op.drop_table('job')
# ### end Alembic commands ###
def downgrade():
# ### commands auto generated by Alembic - please adjust! ###
op.create_table('job',
sa.Column('id', sa.INTEGER(), nullable=False),
sa.Column('user_id', sa.INTEGER(), nullable=False),
sa.Column('filename', sa.VARCHAR(length=150), nullable=False),
sa.Column('status', sa.VARCHAR(length=50), nullable=True),
sa.Column('created_at', sa.DATETIME(), nullable=True),
sa.Column('result_filename', sa.VARCHAR(length=150), nullable=True),
sa.ForeignKeyConstraint(['user_id'], ['user.id'], ),
sa.PrimaryKeyConstraint('id')
)
op.create_table('user',
sa.Column('id', sa.INTEGER(), nullable=False),
sa.Column('username', sa.VARCHAR(length=150), nullable=False),
sa.Column('password', sa.VARCHAR(length=150), nullable=False),
sa.PrimaryKeyConstraint('id'),
sa.UniqueConstraint('username')
)
# ### end Alembic commands ###

View file

@ -1,7 +1,8 @@
Flask==2.2.5 flask
Flask-Login==0.6.2 flask-sqlalchemy
Flask-SQLAlchemy==3.0.3 flask-login
Werkzeug==2.2.2 flask-migrate
pandas pandas
requests requests
beautifulsoup4 werkzeug
docker