Compare commits

..

No commits in common. "bb06a072b0a65b11b67c7eeb92394757065902bd" and "008e2bc274594179d5f0aeb651dff7e9166c1205" have entirely different histories.

34 changed files with 243 additions and 3318 deletions

163
.gitignore vendored
View file

@ -1,163 +0,0 @@
### Flask ###
instance/*
!instance/.gitignore
.webassets-cache
.env
### Flask.Python Stack ###
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal
# Flask stuff:
instance/
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
.pybuilder/
target/
# Jupyter Notebook
.ipynb_checkpoints
# IPython
profile_default/
ipython_config.py
# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version
# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock
# poetry
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock
# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
# in version control.
# https://pdm.fming.dev/#use-with-ide
.pdm.toml
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/
# Celery stuff
celerybeat-schedule
celerybeat.pid
# SageMath parsed files
*.sage.py
# Environments
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
.dmypy.json
dmypy.json
# Pyre type checker
.pyre/
# pytype static type analyzer
.pytype/
# Cython debug symbols
cython_debug/
# Uploads / Results
uploads/
results/
scraper-data-*/

View file

@ -7,14 +7,12 @@ WORKDIR /app
# Abhängigkeiten installieren
COPY requirements.txt requirements.txt
RUN pip install --no-cache-dir -r requirements.txt
RUN apt update
RUN apt install curl -y
# App-Dateien kopieren
COPY . .
# Flask Umgebungsvariable setzen
ENV FLASK_APP=app
ENV FLASK_ENV=production
EXPOSE 5000
# Flask starten
CMD ["flask", "run", "--host=0.0.0.0", "--port=5000"]

View file

@ -1,88 +1,52 @@
import os
from flask import Flask, redirect, url_for, request, current_app
from flask import Flask, redirect, url_for, request
from flask_sqlalchemy import SQLAlchemy
from flask_login import LoginManager, current_user
from flask_migrate import Migrate
from sqlalchemy import text
from .models import db, User
# ✅ Docker-Pfade
# Konfiguration für Upload- und Ergebnis-Ordner
UPLOAD_FOLDER = '/app/uploads'
RESULT_FOLDER = '/app/results'
db = SQLAlchemy()
login_manager = LoginManager()
migrate = Migrate()
def _run_migrations(app):
"""Fehlende DB-Spalten automatisch hinzufügen übersteht jeden Neustart"""
migrations = [
("job", "result_filename_raw", "VARCHAR(150)"),
("job", "scraper_job_id", "VARCHAR(255)"),
("user", "is_admin", "BOOLEAN DEFAULT 0"),
]
with app.app_context():
for table, column, col_type in migrations:
try:
db.session.execute(text(f"ALTER TABLE {table} ADD COLUMN {column} {col_type}"))
db.session.commit()
print(f"✅ Migration: {table}.{column} hinzugefügt")
except Exception:
db.session.rollback()
def create_app():
app = Flask(__name__)
# 🔑 Configs
app.config['SECRET_KEY'] = '008e7369b075886d5f494c8813efdfb17155da6af12b3fe8ee'
app.config['SQLALCHEMY_DATABASE_URI'] = 'sqlite:///users.db'
app.config['SQLALCHEMY_TRACK_MODIFICATIONS'] = False
app.config['UPLOAD_FOLDER'] = UPLOAD_FOLDER
app.config['RESULT_FOLDER'] = RESULT_FOLDER
app.config['ALLOW_USER_SIGNUP'] = False
# DB + Tools
db.init_app(app)
migrate.init_app(app, db)
login_manager.init_app(app)
login_manager.login_view = 'auth.login'
# User Loader
# Flask-Login Setup
login_manager = LoginManager()
login_manager.login_view = 'auth.login'
login_manager.init_app(app)
@login_manager.user_loader
def load_user(user_id):
from .models import User
return User.query.get(int(user_id))
# Protected Routes
# Umleitung nicht authentifizierter Benutzer, statische Dateien und bestimmte Routen ausnehmen
@app.before_request
def require_login():
allowed = ['auth.login', 'auth.signup', 'static']
if (not current_user.is_authenticated and
request.endpoint not in allowed and
not request.path.startswith('/static')):
allowed_routes = ['auth.login', 'auth.signup']
# Prüfen, ob der Benutzer authentifiziert ist oder eine erlaubte Route anfragt
if (not current_user.is_authenticated
and request.endpoint not in allowed_routes
and not request.path.startswith('/static/')):
return redirect(url_for('auth.login'))
# Ordner
# Erstellen Sie die Ordner, falls sie noch nicht existieren
os.makedirs(app.config['UPLOAD_FOLDER'], exist_ok=True)
os.makedirs(app.config['RESULT_FOLDER'], exist_ok=True)
# Routes
# Registrieren der Routen
from . import routes
app.register_blueprint(routes.bp)
# Index Redirect
@app.route('/')
def index():
return redirect(url_for('auth.job_status'))
# DB Tables + Auto-Migration
# Erstellen der Tabellen in der Datenbank
with app.app_context():
db.create_all()
_run_migrations(app)
return app
if __name__ == '__main__':
app = create_app()
app.run(host='0.0.0.0', port=5000, debug=False)

View file

@ -1,68 +0,0 @@
import os
from flask import Flask, redirect, url_for, request, current_app
from flask_sqlalchemy import SQLAlchemy
from flask_login import LoginManager, current_user
from flask_migrate import Migrate
# ✅ Docker-Pfade
UPLOAD_FOLDER = '/app/uploads'
RESULT_FOLDER = '/app/results'
db = SQLAlchemy()
login_manager = LoginManager()
migrate = Migrate()
def create_app():
app = Flask(__name__)
# 🔑 Configs
app.config['SECRET_KEY'] = '008e7369b075886d5f494c8813efdfb17155da6af12b3fe8ee'
app.config['SQLALCHEMY_DATABASE_URI'] = 'sqlite:///users.db'
app.config['SQLALCHEMY_TRACK_MODIFICATIONS'] = False
app.config['UPLOAD_FOLDER'] = UPLOAD_FOLDER
app.config['RESULT_FOLDER'] = RESULT_FOLDER
app.config['ALLOW_USER_SIGNUP'] = True # ✅ Aktiviert!
# DB + Tools
db.init_app(app)
migrate.init_app(app, db)
login_manager.init_app(app)
login_manager.login_view = 'auth.login'
# User Loader
@login_manager.user_loader
def load_user(user_id):
from .models import User
return User.query.get(int(user_id))
# Protected Routes
@app.before_request
def require_login():
allowed = ['auth.login', 'auth.signup', 'static']
if (not current_user.is_authenticated and
request.endpoint not in allowed and
not request.path.startswith('/static')):
return redirect(url_for('auth.login'))
# Ordner
os.makedirs(app.config['UPLOAD_FOLDER'], exist_ok=True)
os.makedirs(app.config['RESULT_FOLDER'], exist_ok=True)
# Routes
from . import routes
app.register_blueprint(routes.bp)
# Index Redirect
@app.route('/')
def index():
return redirect(url_for('auth.job_status'))
# DB Tables
with app.app_context():
db.create_all()
return app
if __name__ == '__main__':
app = create_app()
app.run(host='0.0.0.0', port=5000, debug=False)

View file

@ -1,25 +1,20 @@
from flask_sqlalchemy import SQLAlchemy
from flask_login import UserMixin
from datetime import datetime
from . import db
db = SQLAlchemy()
class User(UserMixin, db.Model):
id = db.Column(db.Integer, primary_key=True)
username = db.Column(db.String(150), unique=True, nullable=False)
password = db.Column(db.String(150), nullable=False)
is_admin = db.Column(db.Boolean, default=False)
class Job(db.Model):
id = db.Column(db.Integer, primary_key=True)
user_id = db.Column(db.Integer, db.ForeignKey('user.id'), nullable=False)
filename = db.Column(db.String(150), nullable=False)
status = db.Column(db.String(50), default="Pending")
status = db.Column(db.String(50), default="Pending") # Status: Pending, In Progress, Completed
created_at = db.Column(db.DateTime, default=datetime.utcnow)
result_filename = db.Column(db.String(150), nullable=True)
result_filename_raw = db.Column(db.String(150), nullable=True)
user = db.relationship('User', backref=db.backref('jobs', lazy=True))
class AppConfig(db.Model):
id = db.Column(db.Integer, primary_key=True)
key = db.Column(db.String(100), unique=True, nullable=False)
value = db.Column(db.String(100), nullable=False, default='false')

View file

@ -1,223 +0,0 @@
import time
import csv
import os
import threading
from flask import Blueprint, request, redirect, url_for, flash, render_template, send_file, current_app
from flask_login import login_user, logout_user, login_required, current_user
from werkzeug.utils import secure_filename
from werkzeug.security import generate_password_hash, check_password_hash
from .models import db, User, Job
from .webcrawler import process_file # Importiere die Funktion für das Webscraping
UPLOAD_FOLDER = 'uploads'
RESULT_FOLDER = 'results'
# Blueprint für auth erstellen
bp = Blueprint('auth', __name__)
@bp.route('/login', methods=['GET', 'POST'])
def login():
if request.method == 'POST':
username = request.form['username']
password = request.form['password']
user = User.query.filter_by(username=username).first()
if user and check_password_hash(user.password, password):
login_user(user)
return redirect(url_for('auth.job_status'))
flash('Login fehlgeschlagen. Überprüfen Sie Benutzername und Passwort.')
return render_template('login.html')
@bp.route('/signup', methods=['GET', 'POST'])
def signup():
if not current_app.config['ALLOW_USER_SIGNUP']:
flash("Registrierung ist derzeit deaktiviert.")
return redirect(url_for('auth.login'))
if request.method == 'POST':
username = request.form['username']
password = generate_password_hash(request.form['password'], method='sha256')
new_user = User(username=username, password=password)
db.session.add(new_user)
db.session.commit()
flash('Benutzer erfolgreich erstellt! Sie können sich jetzt einloggen.')
return redirect(url_for('auth.login'))
return render_template('signup.html')
@bp.route('/logout')
@login_required
def logout():
logout_user()
return redirect(url_for('auth.login'))
@bp.route('/jobs')
@login_required
def job_status():
jobs = Job.query.filter_by(user_id=current_user.id).all()
return render_template('jobs.html', jobs=jobs)
@bp.route('/upload', methods=['GET', 'POST'])
@login_required
def upload():
if request.method == 'POST':
file = request.files['file']
filename = secure_filename(file.filename)
# Überprüfen, ob eine Datei mit dem gleichen Namen bereits existiert
file_path = os.path.join(current_app.config['UPLOAD_FOLDER'], filename)
if os.path.exists(file_path):
# Wenn eine Datei mit dem gleichen Namen existiert, einen Zeitstempel hinzufügen
name, ext = os.path.splitext(filename)
timestamp = time.strftime("%Y%m%d-%H%M%S") # Zeitstempel im Format JahrMonatTag-StundenMinutenSekunden
filename = f"{name}_{timestamp}{ext}"
file_path = os.path.join(current_app.config['UPLOAD_FOLDER'], filename)
flash(f"Eine Datei mit gleichem Namen existierte bereits. Die Datei wurde als '{filename}' gespeichert.")
# Speichern der Datei
file.save(file_path)
flash('Datei erfolgreich hochgeladen und Job gestartet')
# Neuen Job erstellen
new_job = Job(user_id=current_user.id, filename=filename, status="Pending")
db.session.add(new_job)
db.session.commit()
# Debugging-Ausgabe zur Überprüfung der Thread-Erstellung
print(f"Starte Scraping-Thread für Job-ID: {new_job.id}")
# Starten des Scraping im Hintergrund-Thread und Übergeben des aktuellen Anwendungskontexts
thread = threading.Thread(target=process_file, args=(filename, new_job.id, current_app._get_current_object()))
thread.start()
# Debugging-Ausgabe, nachdem der Thread gestartet wurde
print(f"Thread für Job {new_job.id} erfolgreich gestartet.")
return redirect(url_for('auth.job_status'))
return render_template('upload.html')
@bp.route('/download/<int:job_id>', methods=['GET'])
@login_required
def download_result(job_id):
job = Job.query.get_or_404(job_id)
print(f"Job ID: {job.id} - User ID: {job.user_id} - Current User ID: {current_user.id}")
# Überprüfen, ob der Job dem aktuellen Benutzer gehört
if job.user_id != current_user.id:
flash("Sie haben keine Berechtigung, dieses Ergebnis herunterzuladen.")
return redirect(url_for('auth.job_status'))
# Überprüfen, ob das Ergebnis vorhanden ist
if not job.result_filename:
flash("Das Ergebnis ist noch nicht verfügbar.")
return redirect(url_for('auth.job_status'))
# Überprüfen, ob die Datei im angegebenen Pfad existiert
result_path = os.path.join(current_app.config['RESULT_FOLDER'], job.result_filename)
print(f"Versuche, Datei herunterzuladen von: {result_path}")
if os.path.exists(result_path):
print("Datei existiert und wird zum Download bereitgestellt.")
return send_file(result_path, as_attachment=True)
else:
print("Datei nicht gefunden. Ergebnisverzeichnis oder Pfad prüfen.")
flash("Ergebnisdatei nicht gefunden.")
return redirect(url_for('auth.job_status'))
@bp.route('/delete_job/<int:job_id>', methods=['POST'])
@login_required
def delete_job(job_id):
job = Job.query.get_or_404(job_id)
if job.user_id != current_user.id:
flash("Sie haben keine Berechtigung, diesen Job zu löschen.")
return redirect(url_for('auth.job_status'))
# Löschen der Upload-Datei
upload_path = os.path.join(current_app.config['UPLOAD_FOLDER'], job.filename)
if os.path.exists(upload_path):
os.remove(upload_path)
print(f"Upload-Datei gelöscht: {upload_path}")
else:
print(f"Upload-Datei nicht gefunden: {upload_path}")
# Löschen der Results-Datei, falls vorhanden
if job.result_filename:
result_path = os.path.join(current_app.config['RESULT_FOLDER'], job.result_filename)
print(f"Versuche Ergebnisdatei zu löschen: {result_path}")
if os.path.exists(result_path):
try:
os.remove(result_path)
print(f"Ergebnisdatei gelöscht: {result_path}")
except Exception as e:
print(f"Fehler beim Löschen der Ergebnisdatei: {e}")
else:
print(f"Ergebnisdatei nicht gefunden im Pfad: {result_path}")
# Job aus der Datenbank löschen
db.session.delete(job)
db.session.commit()
flash("Job erfolgreich gelöscht.")
return redirect(url_for('auth.job_status'))
@bp.route('/admin', methods=['GET'])
@login_required
def admin_panel():
if not current_user.is_admin:
flash("Keine Berechtigung.")
return redirect(url_for('auth.job_status'))
users = User.query.all()
return render_template('admin_panel.html', users=users)
@bp.route('/admin/create_user', methods=['POST'])
@login_required
def create_user():
if not current_user.is_admin:
flash("Keine Berechtigung.")
return redirect(url_for('auth.admin_panel'))
username = request.form['username']
password = request.form['password']
is_admin = 'is_admin' in request.form # Checkbox für Adminrechte
hashed_password = generate_password_hash(password, method='sha256')
new_user = User(username=username, password=hashed_password, is_admin=is_admin)
db.session.add(new_user)
db.session.commit()
flash(f"Benutzer {username} wurde erstellt.")
return redirect(url_for('auth.admin_panel'))
@bp.route('/admin/reset_password/<int:user_id>', methods=['POST'])
@login_required
def reset_password(user_id):
if not current_user.is_admin:
flash("Keine Berechtigung.")
return redirect(url_for('auth.admin_panel'))
user = User.query.get_or_404(user_id)
new_password = request.form['new_password']
user.password = generate_password_hash(new_password, method='sha256')
db.session.commit()
flash(f"Passwort für Benutzer {user.username} wurde zurückgesetzt.")
return redirect(url_for('auth.admin_panel'))
@bp.route('/admin/delete_user/<int:user_id>', methods=['POST'])
@login_required
def delete_user(user_id):
if not current_user.is_admin:
flash("Keine Berechtigung.")
return redirect(url_for('auth.admin_panel'))
user = User.query.get_or_404(user_id)
if user.is_admin:
flash("Administratoren können nicht gelöscht werden.")
return redirect(url_for('auth.admin_panel'))
db.session.delete(user)
db.session.commit()
flash(f"Benutzer {user.username} wurde gelöscht.")
return redirect(url_for('auth.admin_panel'))

View file

@ -1,16 +1,17 @@
import time
import csv
import os
import threading
from flask import Blueprint, request, redirect, url_for, flash, render_template, send_file, jsonify, current_app
from flask import Blueprint, request, redirect, url_for, flash, render_template, send_file, current_app
from flask_login import login_user, logout_user, login_required, current_user
from werkzeug.utils import secure_filename
from werkzeug.security import generate_password_hash, check_password_hash
from .models import db, User, Job, AppConfig
from .webcrawler import process_file
from .models import db, User, Job
from .webcrawler import process_file # Importiere die Funktion für das Webscraping
UPLOAD_FOLDER = '/app/uploads'
RESULT_FOLDER = '/app/results'
UPLOAD_FOLDER = 'uploads'
RESULT_FOLDER = os.path.join(os.path.abspath(os.path.dirname(__file__)), 'results')
# Blueprint für auth erstellen
bp = Blueprint('auth', __name__)
@bp.route('/login', methods=['GET', 'POST'])
@ -27,18 +28,13 @@ def login():
@bp.route('/signup', methods=['GET', 'POST'])
def signup():
cfg = AppConfig.query.filter_by(key='allow_signup').first()
if not cfg or cfg.value != 'true':
flash("Registrierung ist derzeit deaktiviert.")
return redirect(url_for('auth.login'))
if request.method == 'POST':
username = request.form['username']
password = generate_password_hash(request.form['password']) # ✅ Fix
password = generate_password_hash(request.form['password'], method='sha256')
new_user = User(username=username, password=password)
db.session.add(new_user)
db.session.commit()
flash('Benutzer erfolgreich erstellt!')
flash('Benutzer erfolgreich erstellt! Sie können sich jetzt einloggen.')
return redirect(url_for('auth.login'))
return render_template('signup.html')
@ -51,203 +47,102 @@ def logout():
@bp.route('/jobs')
@login_required
def job_status():
jobs = Job.query.filter_by(user_id=current_user.id).order_by(Job.created_at.desc()).all()
jobs = Job.query.filter_by(user_id=current_user.id).all()
return render_template('jobs.html', jobs=jobs)
# Hochladen und Verarbeiten der Datei im Hintergrund
@bp.route('/upload', methods=['GET', 'POST'])
@login_required
def upload():
if request.method == 'POST':
if 'file' not in request.files:
flash('Keine Datei ausgewählt.')
return redirect(url_for('auth.upload'))
file = request.files['file']
if not file or file.filename == '':
flash('Keine gültige Datei.')
filename = secure_filename(file.filename)
if not filename.endswith('.csv'):
flash('Bitte eine CSV-Datei hochladen')
return redirect(url_for('auth.upload'))
filename = secure_filename(file.filename)
name, ext = os.path.splitext(filename)
file_path = os.path.join(UPLOAD_FOLDER, filename)
file.save(file_path)
flash('Datei erfolgreich hochgeladen und Job gestartet')
timestamp = time.strftime("%Y%m%d_%H%M%S")
unique_filename = f"{name}_{timestamp}{ext}" if os.path.exists(os.path.join(UPLOAD_FOLDER, filename)) else filename
filepath = os.path.join(UPLOAD_FOLDER, unique_filename)
file.save(filepath)
print(f"💾 UPLOAD: {filepath}")
new_job = Job(
user_id=current_user.id,
filename=unique_filename,
status="Pending"
)
# Neuen Job erstellen
new_job = Job(user_id=current_user.id, filename=filename, status="Pending")
db.session.add(new_job)
db.session.commit()
print(f"🆕 JOB #{new_job.id} für User {current_user.id}")
thread = threading.Thread(
target=process_file,
args=(unique_filename, new_job.id, current_app._get_current_object())
)
thread.daemon = True
# Debugging-Ausgabe zur Überprüfung der Thread-Erstellung
print(f"Starte Scraping-Thread für Job-ID: {new_job.id}")
# Starten des Scraping im Hintergrund-Thread und Übergeben des aktuellen Anwendungskontexts
thread = threading.Thread(target=process_file, args=(filename, new_job.id, current_app._get_current_object()))
thread.start()
print(f"🔄 THREAD STARTED Job {new_job.id}")
# Debugging-Ausgabe, nachdem der Thread gestartet wurde
print(f"Thread für Job {new_job.id} erfolgreich gestartet.")
flash(f'"{unique_filename}" → Job #{new_job.id} läuft!')
return redirect(url_for('auth.job_status'))
return render_template('upload.html')
@bp.route('/download/<int:job_id>')
@bp.route('/download/<int:job_id>', methods=['GET'])
@login_required
def download_result(job_id):
job = Job.query.filter_by(id=job_id, user_id=current_user.id).first_or_404()
job = Job.query.get_or_404(job_id)
print(f"Job ID: {job.id} - User ID: {job.user_id} - Current User ID: {current_user.id}")
if not job.result_filename or not job.status.startswith(''):
flash('Ergebnis nicht bereit.')
# Überprüfen, ob der Job dem aktuellen Benutzer gehört
if job.user_id != current_user.id:
flash("Sie haben keine Berechtigung, dieses Ergebnis herunterzuladen.")
return redirect(url_for('auth.job_status'))
result_path = os.path.join(RESULT_FOLDER, job.result_filename)
if os.path.exists(result_path):
return send_file(result_path, as_attachment=True)
flash('Datei fehlt.')
return redirect(url_for('auth.job_status'))
@bp.route('/download_raw/<int:job_id>')
@login_required
def download_result_raw(job_id):
job = Job.query.filter_by(id=job_id, user_id=current_user.id).first_or_404()
if not job.result_filename_raw:
flash('Rohdaten nicht verfügbar.')
# Überprüfen, ob das Ergebnis vorhanden ist
if not job.result_filename:
flash("Das Ergebnis ist noch nicht verfügbar.")
return redirect(url_for('auth.job_status'))
result_path = os.path.join(RESULT_FOLDER, job.result_filename_raw)
# Überprüfen, ob die Datei im angegebenen Pfad existiert
result_path = os.path.join(current_app.config['RESULT_FOLDER'], job.result_filename)
print(f"Versuche, Datei herunterzuladen von: {result_path}")
if os.path.exists(result_path):
print("Datei existiert und wird zum Download bereitgestellt.")
return send_file(result_path, as_attachment=True)
flash('Datei fehlt.')
return redirect(url_for('auth.job_status'))
else:
print("Datei nicht gefunden. Ergebnisverzeichnis oder Pfad prüfen.")
flash("Ergebnisdatei nicht gefunden.")
return redirect(url_for('auth.job_status'))
@bp.route('/delete_job/<int:job_id>', methods=['POST'])
@login_required
def delete_job(job_id):
job = Job.query.filter_by(id=job_id, user_id=current_user.id).first_or_404()
job = Job.query.get_or_404(job_id)
if job.user_id != current_user.id:
flash("Sie haben keine Berechtigung, diesen Job zu löschen.")
return redirect(url_for('auth.job_status'))
# Löschen der Upload-Datei
upload_path = os.path.join(UPLOAD_FOLDER, job.filename)
if os.path.exists(upload_path):
os.remove(upload_path)
print(f"Upload-Datei gelöscht: {upload_path}")
else:
print(f"Upload-Datei nicht gefunden: {upload_path}")
# Löschen der Results-Datei, falls vorhanden
if job.result_filename:
result_path = os.path.join(RESULT_FOLDER, job.result_filename)
if os.path.exists(result_path):
os.remove(result_path)
if job.result_filename_raw: # ✅ Raw auch löschen
raw_path = os.path.join(RESULT_FOLDER, job.result_filename_raw)
if os.path.exists(raw_path):
os.remove(raw_path)
try:
os.remove(result_path)
print(f"Ergebnisdatei gelöscht: {result_path}")
except Exception as e:
print(f"Fehler beim Löschen der Ergebnisdatei: {e}")
else:
print(f"Ergebnisdatei nicht gefunden: {result_path}")
# Job aus der Datenbank löschen
db.session.delete(job)
db.session.commit()
flash('Job gelöscht.')
flash("Job erfolgreich gelöscht.")
return redirect(url_for('auth.job_status'))
@bp.route('/job_status/<int:job_id>')
@login_required
def job_status_api(job_id):
job = Job.query.filter_by(id=job_id, user_id=current_user.id).first()
if not job:
return jsonify({'error': 'Not found'}), 404
return jsonify({
'id': job.id,
'status': job.status,
'result_filename': job.result_filename,
'result_filename_raw': getattr(job, 'result_filename_raw', None),
'scraper_job_id': getattr(job, 'scraper_job_id', None)
})
@bp.route('/resume_job/<int:job_id>', methods=['POST'])
@login_required
def resume_job(job_id):
job = Job.query.filter_by(id=job_id, user_id=current_user.id).first_or_404()
thread = threading.Thread(
target=process_file,
args=(job.filename, job.id, current_app._get_current_object())
)
thread.daemon = True
thread.start()
flash(f'Job #{job_id} wird fortgesetzt...')
return redirect(url_for('auth.job_status'))
# ── ADMIN ──────────────────────────────────────────
@bp.route('/admin', methods=['GET'])
@login_required
def admin_panel():
if not current_user.is_admin:
flash("Keine Berechtigung.")
return redirect(url_for('auth.job_status'))
users = User.query.all()
cfg = AppConfig.query.filter_by(key='allow_signup').first()
signup_allowed = cfg and cfg.value == 'true'
return render_template('admin_panel.html', users=users, signup_allowed=signup_allowed)
@bp.route('/admin/create_user', methods=['POST'])
@login_required
def create_user():
if not current_user.is_admin:
return redirect(url_for('auth.admin_panel'))
username = request.form['username']
password = generate_password_hash(request.form['password']) # ✅ Fix
is_admin = 'is_admin' in request.form
new_user = User(username=username, password=password, is_admin=is_admin)
db.session.add(new_user)
db.session.commit()
flash(f'{username} erstellt.')
return redirect(url_for('auth.admin_panel'))
@bp.route('/admin/reset_password/<int:user_id>', methods=['POST'])
@login_required
def reset_password(user_id):
if not current_user.is_admin:
return redirect(url_for('auth.admin_panel'))
user = User.query.get_or_404(user_id)
new_password = request.form['new_password']
user.password = generate_password_hash(new_password) # ✅ Fix
db.session.commit()
flash(f'Passwort {user.username} zurückgesetzt.')
return redirect(url_for('auth.admin_panel'))
@bp.route('/admin/delete_user/<int:user_id>', methods=['POST'])
@login_required
def delete_user(user_id):
if not current_user.is_admin:
return redirect(url_for('auth.admin_panel'))
user = User.query.get_or_404(user_id)
if user.is_admin:
flash('Admin nicht löschbar.')
return redirect(url_for('auth.admin_panel'))
db.session.delete(user)
db.session.commit()
flash(f'{user.username} gelöscht.')
return redirect(url_for('auth.admin_panel'))
@bp.route('/admin/toggle_signup', methods=['POST'])
@login_required
def toggle_signup():
if not current_user.is_admin:
return redirect(url_for('auth.admin_panel'))
cfg = AppConfig.query.filter_by(key='allow_signup').first()
if not cfg:
cfg = AppConfig(key='allow_signup', value='true')
db.session.add(cfg)
else:
cfg.value = 'false' if cfg.value == 'true' else 'true'
db.session.commit()
state = '✅ aktiviert' if cfg.value == 'true' else '🔒 deaktiviert'
flash(f'Registrierung {state}.')
return redirect(url_for('auth.admin_panel'))

View file

@ -1,6 +0,0 @@
PLZ;Stadt;Straße;Hausnummer;Zusatz
53175;Bonn;Godesberger Str.;27;
50667;Köln;Hohenzollernring;52;A
10115;Berlin;Chausseestraße;125;
80331;München;Marienplatz;1;B
20095;Hamburg;Ostwall;5;
1 PLZ Stadt Straße Hausnummer Zusatz
2 53175 Bonn Godesberger Str. 27
3 50667 Köln Hohenzollernring 52 A
4 10115 Berlin Chausseestraße 125
5 80331 München Marienplatz 1 B
6 20095 Hamburg Ostwall 5

View file

@ -164,186 +164,3 @@ tr:nth-child(even) td {
.delete-btn:hover {
background-color: #e60000;
}
/* Flash-Badge Styling */
.flash-badge {
position: fixed;
top: 20px;
right: 20px;
background-color: #f44336; /* Material Design Rot */
color: white;
padding: 12px 24px;
border-radius: 8px;
font-family: 'Roboto', sans-serif;
font-weight: 500;
box-shadow: 0px 4px 8px rgba(0, 0, 0, 0.2);
z-index: 1000;
opacity: 0;
transform: translateY(-20px);
transition: opacity 0.4s ease, transform 0.4s ease;
}
/* Einblend-Animation */
.flash-badge.show {
opacity: 1;
transform: translateY(0);
}
/* Ausblend-Animation */
.flash-badge.hide {
opacity: 0;
transform: translateY(-20px);
}
.admin-panel {
max-width: 800px;
margin: 2em auto;
padding: 2em;
background: white;
border-radius: 8px;
box-shadow: 0 4px 12px rgba(0, 0, 0, 0.1);
}
.admin-panel h2 {
font-weight: 500;
color: #1d1d1f;
margin-bottom: 1em;
}
.user-table {
width: 100%;
border-collapse: collapse;
margin-bottom: 2em;
}
.user-table th, .user-table td {
padding: 0.75em;
text-align: left;
border: 1px solid #d1d1d6;
}
.user-table th {
background-color: #f1f1f1;
color: #333;
}
.user-table td {
background-color: white;
}
.user-table tr:nth-child(even) td {
background-color: #f9f9f9;
}
.reset-btn, .delete-btn, .create-btn {
padding: 0.5em 1em;
font-size: 0.9em;
font-weight: 500;
border: none;
border-radius: 4px;
cursor: pointer;
transition: background-color 0.2s ease-in-out;
}
.reset-btn {
background-color: #4caf50;
color: white;
}
.reset-btn:hover {
background-color: #388e3c;
}
.delete-btn {
background-color: #f44336;
color: white;
}
.delete-btn:hover {
background-color: #d32f2f;
}
.create-btn {
background-color: #007aff;
color: white;
padding: 0.75em;
margin-top: 1em;
display: block;
width: 100%;
font-size: 1em;
}
.create-btn:hover {
background-color: #005bb5;
}
.create-user-form {
margin-top: 1.5em;
}
.create-user-form input[type="text"],
.create-user-form input[type="password"] {
width: 100%;
padding: 0.75em;
margin-bottom: 1em;
border: 1px solid #d1d1d6;
border-radius: 8px;
}
.create-user-form label {
font-size: 0.9em;
color: #6e6e73;
display: block;
margin-bottom: 1em;
}
.logo-container {
text-align: center;
margin-bottom: 1em;
}
.logo-container img {
max-width: 100%;
height: auto;
width: 150px; /* Standardbreite für das Logo */
transition: width 0.3s ease;
}
@media (max-width: 768px) {
.logo-container img {
width: 120px; /* Kleinere Breite auf kleineren Bildschirmen */
}
}
@media (max-width: 480px) {
.logo-container img {
width: 100px; /* Noch kleinere Breite auf sehr kleinen Bildschirmen */
}
}
.template-info {
background-color: #f9f9f9;
padding: 1em;
border-radius: 8px;
margin-bottom: 1.5em;
text-align: center;
}
.template-info p {
margin: 0.5em 0;
}
.template-download {
display: inline-block;
padding: 0.5em 1em;
margin-top: 0.5em;
background-color: #007aff;
color: white;
border-radius: 4px;
text-decoration: none;
transition: background-color 0.2s ease;
}
.template-download:hover {
background-color: #005bb5;
}

View file

@ -1,78 +0,0 @@
{% extends "base.html" %}
{% block content %}
<div class="admin-panel">
<h2>Benutzerverwaltung</h2>
<!-- Tabelle für Benutzerverwaltung -->
<table class="user-table">
<thead>
<tr>
<th>ID</th>
<th>Benutzername</th>
<th>Admin</th>
<th>Aktionen</th>
</tr>
</thead>
<tbody>
{% for user in users %}
<tr>
<td>{{ user.id }}</td>
<td>{{ user.username }}</td>
<td>{{ 'Ja' if user.is_admin else 'Nein' }}</td>
<td>
<form action="{{ url_for('auth.reset_password', user_id=user.id) }}" method="post" style="display:inline;">
<input type="text" name="new_password" placeholder="Neues Passwort" required>
<button type="submit" class="reset-btn">Passwort zurücksetzen</button>
</form>
{% if not user.is_admin %}
<form action="{{ url_for('auth.delete_user', user_id=user.id) }}" method="post" style="display:inline;">
<button type="submit" class="delete-btn">Benutzer löschen</button>
</form>
{% endif %}
</td>
</tr>
{% endfor %}
</tbody>
</table>
<!-- Formular zum Erstellen neuer Benutzer -->
<h3>Neuen Benutzer erstellen</h3>
<form action="{{ url_for('auth.create_user') }}" method="post" class="create-user-form">
<input type="text" name="username" placeholder="Benutzername" required>
<input type="password" name="password" placeholder="Passwort" required>
<label>
<input type="checkbox" name="is_admin"> Admin
</label>
<button type="submit" class="create-btn">Benutzer erstellen</button>
</form>
</div>
<div class="config-box">
<h3>⚙️ Einstellungen</h3>
<form action="{{ url_for('auth.toggle_signup') }}" method="POST">
<div class="toggle-row">
<span>Benutzer-Registrierung:</span>
{% if signup_allowed %}
<span class="badge badge-green">✅ Aktiv</span>
<button type="submit" class="btn-danger">🔒 Deaktivieren</button>
{% else %}
<span class="badge badge-red">🔒 Deaktiviert</span>
<button type="submit" class="btn-success">✅ Aktivieren</button>
{% endif %}
</div>
</form>
</div>
<style>
.config-box { background: #f8f9fa; border: 1px solid #ddd; border-radius: 6px; padding: 16px; margin-bottom: 24px; }
.toggle-row { display: flex; align-items: center; gap: 12px; }
.badge { padding: 3px 10px; border-radius: 12px; font-size: 0.85em; font-weight: bold; }
.badge-green { background: #d4edda; color: #155724; }
.badge-red { background: #f8d7da; color: #721c24; }
.btn-danger { background: #e74c3c; color: white; border: none; padding: 6px 14px; border-radius: 4px; cursor: pointer; }
.btn-success { background: #27ae60; color: white; border: none; padding: 6px 14px; border-radius: 4px; cursor: pointer; }
.btn-danger:hover { background: #c0392b; }
.btn-success:hover { background: #1e8449; }
</style>
{% endblock %}

View file

@ -14,50 +14,14 @@
<ul>
<li><a href="{{ url_for('auth.job_status') }}">Jobs</a></li>
<li><a href="{{ url_for('auth.upload') }}">Upload</a></li>
{% if current_user.is_admin %}
<li><a href="{{ url_for('auth.admin_panel') }}">Admin</a></li> <!-- Admin-Bereich Link -->
{% endif %}
<li><a href="{{ url_for('auth.logout') }}">Logout</a></li>
</ul>
</nav>
</header>
{% endif %}
<!-- Flash-Nachrichten -->
{% with messages = get_flashed_messages() %}
{% if messages %}
<div id="flash-badge-container">
{% for message in messages %}
<div class="flash-badge">{{ message }}</div>
{% endfor %}
</div>
{% endif %}
{% endwith %}
<div class="{% if request.endpoint in ['auth.login', 'auth.signup'] %}form-container{% else %}container{% endif %}">
{% block content %}{% endblock %}
</div>
<!-- JavaScript für Ein- und Ausblendanimation des Flash-Badges -->
<script>
document.addEventListener("DOMContentLoaded", function() {
var flashBadges = document.querySelectorAll('.flash-badge');
flashBadges.forEach(function(badge) {
// Einblendung mit Verzögerung
setTimeout(function() {
badge.classList.add('show');
}, 100);
// Ausblendung nach 5 Sekunden und Entfernen aus dem DOM
setTimeout(function() {
badge.classList.remove('show');
badge.classList.add('hide');
setTimeout(function() {
badge.remove();
}, 400); // Zeit für die Ausblendanimation
}, 5000);
});
});
</script>
</body>
</html>

View file

@ -1,121 +0,0 @@
{% extends "base.html" %}
{% block content %}
<div class="table-container">
<h2>Ihre Aufträge</h2>
<table id="jobs-table">
<thead>
<tr>
<th>Dateiname</th>
<th>Status</th>
<th>Erstellt am</th>
<th>Ergebnis</th>
<th>Aktionen</th>
</tr>
</thead>
<tbody>
{% for job in jobs %}
<tr id="job-row-{{ job.id }}">
<td>{{ job.filename }}</td>
<td id="status-{{ job.id }}" class="job-status">{{ job.status }}</td>
<td>{{ job.created_at.strftime('%Y-%m-%d %H:%M:%S') }}</td>
<td id="result-{{ job.id }}">
{% if job.result_filename and 'Failed' not in job.status %}
<a href="{{ url_for('auth.download_result', job_id=job.id) }}" class="dl-btn">
🎯 Gefiltert
</a>
{% if job.result_filename_raw %}
&nbsp;
<a href="{{ url_for('auth.download_result_raw', job_id=job.id) }}" class="dl-btn dl-btn-raw">
📋 Alle
</a>
{% endif %}
{% elif 'Failed' in job.status %}
<span class="status-failed">❌ {{ job.result_filename or 'Fehler' }}</span>
{% else %}
<span class="status-pending">⏳ Noch nicht verfügbar</span>
{% endif %}
</td>
<td>
<form action="{{ url_for('auth.delete_job', job_id=job.id) }}" method="POST" style="display:inline;">
<button type="submit" class="delete-btn">🗑️ Löschen</button>
</form>
</td>
</tr>
{% endfor %}
</tbody>
</table>
</div>
<style>
.job-status { font-weight: bold; }
.status-failed { color: #e74c3c; font-weight: bold; }
.status-pending { color: #888; }
.status-completed { color: #27ae60; }
.dl-btn {
display: inline-block;
padding: 4px 10px;
border-radius: 4px;
text-decoration: none;
font-size: 0.85em;
font-weight: bold;
background: #27ae60;
color: #fff;
margin: 2px 1px;
transition: background 0.2s;
}
.dl-btn:hover { background: #1e8449; }
.dl-btn-raw { background: #2980b9; }
.dl-btn-raw:hover { background: #1a5e8a; }
</style>
<script>
document.addEventListener('DOMContentLoaded', function () {
document.querySelectorAll('.job-status').forEach(function (cell) {
const jobId = cell.id.split('-')[1];
const status = cell.textContent.trim();
if (!status.includes('✅') && !status.includes('Failed')) {
pollJob(jobId);
}
});
});
function renderResult(resultCell, data) {
const hasFailed = data.status.includes('Failed');
const hasFiltered = data.result_filename && !hasFailed;
const hasRaw = data.result_filename_raw && !hasFailed;
if (hasFiltered) {
let html = `<a href="/download/${data.id}" class="dl-btn">🎯 Gefiltert</a>`;
if (hasRaw) {
html += ` <a href="/download_raw/${data.id}" class="dl-btn dl-btn-raw">📋 Alle</a>`;
}
resultCell.innerHTML = html;
} else if (hasFailed) {
resultCell.innerHTML = `<span class="status-failed">❌ ${data.result_filename || 'Fehler'}</span>`;
} else {
resultCell.innerHTML = `<span class="status-pending">⏳ Noch nicht verfügbar</span>`;
}
}
function pollJob(jobId) {
fetch(`/job_status/${jobId}`)
.then(r => r.json())
.then(data => {
const statusCell = document.getElementById(`status-${jobId}`);
const resultCell = document.getElementById(`result-${jobId}`);
statusCell.textContent = data.status;
renderResult(resultCell, data);
// Weiter pollen wenn noch nicht fertig
const done = data.status.includes('✅') || data.status.includes('Failed');
if (!done) {
setTimeout(() => pollJob(jobId), 5000);
}
})
.catch(() => setTimeout(() => pollJob(jobId), 10000));
}
</script>
{% endblock %}

View file

@ -15,38 +15,20 @@
</thead>
<tbody>
{% for job in jobs %}
<tr id="job-row-{{ job.id }}">
<tr>
<td>{{ job.filename }}</td>
<td id="status-{{ job.id }}" class="job-status">
{{ job.status }}
</td>
<td class="job-status">{{ job.status }}</td>
<td>{{ job.created_at.strftime('%Y-%m-%d %H:%M:%S') }}</td>
<td id="result-{{ job.id }}">
{% if job.result_filename and 'Failed' not in job.status %}
<a href="{{ url_for('auth.download_result', job_id=job.id) }}" class="dl-btn">
🎯 Gefiltert
</a>
{% if job.result_filename_raw %}
&nbsp;
<a href="{{ url_for('auth.download_result_raw', job_id=job.id) }}" class="dl-btn dl-btn-raw">
📋 Alle
</a>
{% endif %}
{% elif 'Failed' in job.status %}
<span class="status-failed">❌ {{ job.result_filename or 'Fehler' }}</span>
<td>
{% if job.status == "Completed" %}
<a href="{{ url_for('auth.download_result', job_id=job.id) }}">Download</a>
{% else %}
<span class="status-pending"> Noch nicht verfügbar</span>
Noch nicht verfügbar
{% endif %}
</td>
<td>
{% if 'Failed' in job.status %}
<!-- 🆕 Resume Button -->
<form action="{{ url_for('auth.resume_job', job_id=job.id) }}" method="POST" style="display:inline;">
<button type="submit" class="btn-resume">▶️ Resume</button>
</form>
{% endif %}
<form action="{{ url_for('auth.delete_job', job_id=job.id) }}" method="POST" style="display:inline;">
<button type="submit" class="delete-btn">🗑️ Löschen</button>
<button type="submit" class="delete-btn">Löschen</button>
</form>
</td>
</tr>
@ -55,101 +37,25 @@
</table>
</div>
<style>
.job-status { font-weight: bold; }
.status-failed { color: #e74c3c; font-weight: bold; }
.status-pending { color: #888; }
.eta-badge { display: inline-block; background: #eaf4ff; color: #1a6fa8;
border-radius: 10px; padding: 2px 8px; font-size: 0.82em;
font-weight: bold; margin-left: 6px; }
.dl-btn { display: inline-block; padding: 4px 10px; border-radius: 4px;
text-decoration: none; font-size: 0.85em; font-weight: bold;
background: #27ae60; color: #fff; margin: 2px 1px; transition: background 0.2s; }
.dl-btn:hover { background: #1e8449; }
.dl-btn-raw { background: #2980b9; }
.dl-btn-raw:hover { background: #1a5e8a; }
.btn-resume { background: #e67e22; color: white; border: none;
padding: 4px 10px; border-radius: 4px; cursor: pointer;
font-size: 0.85em; font-weight: bold; margin-right: 4px; }
.btn-resume:hover { background: #ca6f1e; }
</style>
<script>
// ETA Badge aus Status-String parsen
function parseStatus(status) {
const parts = status.split('|');
if (parts.length === 2) {
return `<span>${parts[0].trim()}</span>
<span class="eta-badge">${parts[1].trim()}</span>`;
}
return status;
}
// Periodische Aktualisierung des Jobstatus
setInterval(function() {
fetch('{{ url_for("auth.job_status") }}')
.then(response => response.text())
.then(html => {
const parser = new DOMParser();
const doc = parser.parseFromString(html, 'text/html');
const newRows = doc.querySelectorAll('#jobs-table tbody tr');
const currentRows = document.querySelectorAll('#jobs-table tbody tr');
function renderResult(resultCell, data) {
const hasFailed = data.status.includes('Failed') || data.status.includes('❌');
const hasFiltered = data.result_filename && !hasFailed;
const hasRaw = data.result_filename_raw && !hasFailed;
newRows.forEach((newRow, index) => {
const newStatus = newRow.querySelector('.job-status').textContent;
currentRows[index].querySelector('.job-status').textContent = newStatus;
if (hasFiltered) {
let html = `<a href="/download/${data.id}" class="dl-btn">🎯 Gefiltert</a>`;
if (hasRaw) {
html += ` <a href="/download_raw/${data.id}" class="dl-btn dl-btn-raw">📋 Alle</a>`;
}
resultCell.innerHTML = html;
} else if (hasFailed) {
resultCell.innerHTML = `<span class="status-failed">❌ ${data.result_filename || 'Fehler'}</span>`;
} else {
resultCell.innerHTML = `<span class="status-pending">⏳ Noch nicht verfügbar</span>`;
}
}
function renderActions(row, data) {
const actionsCell = row.querySelector('td:last-child');
const hasFailed = data.status.includes('Failed');
let html = '';
if (hasFailed) {
html += `<form action="/resume_job/${data.id}" method="POST" style="display:inline;">
<button type="submit" class="btn-resume">▶️ Resume</button>
</form>`;
}
html += `<form action="/delete_job/${data.id}" method="POST" style="display:inline;">
<button type="submit" class="delete-btn">🗑️ Löschen</button>
</form>`;
actionsCell.innerHTML = html;
}
function pollJob(jobId) {
fetch(`/job_status/${jobId}`)
.then(r => r.json())
.then(data => {
const statusCell = document.getElementById(`status-${jobId}`);
const resultCell = document.getElementById(`result-${jobId}`);
const row = document.getElementById(`job-row-${jobId}`);
statusCell.innerHTML = parseStatus(data.status);
renderResult(resultCell, data);
renderActions(row, data);
const done = data.status.includes('✅') || data.status.includes('Failed') || data.status.includes('❌');
if (!done) {
setTimeout(() => pollJob(jobId), 5000);
}
})
.catch(() => setTimeout(() => pollJob(jobId), 10000));
}
document.addEventListener('DOMContentLoaded', function () {
document.querySelectorAll('.job-status').forEach(function (cell) {
const jobId = cell.id.split('-')[1];
const status = cell.textContent.trim();
cell.innerHTML = parseStatus(status);
if (!status.includes('✅') && !status.includes('Failed') && !status.includes('❌')) {
pollJob(jobId);
}
});
});
const newResult = newRow.querySelector('td:nth-child(4)').innerHTML;
currentRows[index].querySelector('td:nth-child(4)').innerHTML = newResult;
});
});
}, 5000); // Aktualisierung alle 5 Sekunden
</script>
{% endblock %}

View file

@ -1,61 +0,0 @@
{% extends "base.html" %}
{% block content %}
<div class="table-container">
<h2>Ihre Aufträge</h2>
<table id="jobs-table">
<thead>
<tr>
<th>Dateiname</th>
<th>Status</th>
<th>Erstellt am</th>
<th>Ergebnis</th>
<th>Aktionen</th>
</tr>
</thead>
<tbody>
{% for job in jobs %}
<tr>
<td>{{ job.filename }}</td>
<td class="job-status">{{ job.status }}</td>
<td>{{ job.created_at.strftime('%Y-%m-%d %H:%M:%S') }}</td>
<td>
{% if job.status == "Completed" %}
<a href="{{ url_for('auth.download_result', job_id=job.id) }}">Download</a>
{% else %}
Noch nicht verfügbar
{% endif %}
</td>
<td>
<form action="{{ url_for('auth.delete_job', job_id=job.id) }}" method="POST" style="display:inline;">
<button type="submit" class="delete-btn">Löschen</button>
</form>
</td>
</tr>
{% endfor %}
</tbody>
</table>
</div>
<script>
// Periodische Aktualisierung des Jobstatus
setInterval(function() {
fetch('{{ url_for("auth.job_status") }}')
.then(response => response.text())
.then(html => {
const parser = new DOMParser();
const doc = parser.parseFromString(html, 'text/html');
const newRows = doc.querySelectorAll('#jobs-table tbody tr');
const currentRows = document.querySelectorAll('#jobs-table tbody tr');
newRows.forEach((newRow, index) => {
const newStatus = newRow.querySelector('.job-status').textContent;
currentRows[index].querySelector('.job-status').textContent = newStatus;
const newResult = newRow.querySelector('td:nth-child(4)').innerHTML;
currentRows[index].querySelector('td:nth-child(4)').innerHTML = newResult;
});
});
}, 5000); // Aktualisierung alle 5 Sekunden
</script>
{% endblock %}

File diff suppressed because one or more lines are too long

View file

@ -1,22 +1,11 @@
{% extends "base.html" %}
{% block content %}
<div class="form-container">
<h2>Datei hochladen</h2>
<!-- Instructions and download link for the template -->
<div class="template-info">
<p>Bitte verwenden Sie die folgende Vorlage für Ihre Adressliste:</p>
<a href="{{ url_for('static', filename='Adressliste_vorlage.csv') }}" download="Adressliste_vorlage.csv" class="template-download">Adressliste Vorlage herunterladen</a>
<p>Die Vorlage enthält die folgenden Spalten: <strong>PLZ, Stadt, Straße, Hausnummer, Zusatz</strong>. Bitte verwenden Sie das Format für eine reibungslose Verarbeitung.</p>
</div>
<form action="{{ url_for('auth.upload') }}" method="post" enctype="multipart/form-data">
<label for="file">Datei auswählen:</label>
<input type="file" id="file" name="file" accept=".csv" required>
<button type="submit">Datei hochladen</button>
</form>
</div>
</body>
</html>
<div class="form-container">
<h2>Datei hochladen</h2>
<form method="POST" enctype="multipart/form-data">
<label for="file">CSV-Datei:</label>
<input type="file" id="file" name="file" accept=".csv" required>
<button type="submit">Upload</button>
</form>
</div>
{% endblock %}

View file

@ -1,316 +0,0 @@
import os
import re
import pandas as pd
import requests
import time
import random
from io import StringIO
from app.models import db, Job
print("🆕 MODERN webcrawler LOADED!")
UPLOAD_FOLDER = '/app/uploads'
RESULT_FOLDER = '/app/results'
SCRAPER_URL = "http://gmaps-scraper:8080"
OUTPUT_COLS = ['title', 'category', 'address', 'open_hours', 'website', 'phone', 'link']
# ──────────────────────────────────────────────
# Hilfsfunktionen
# ──────────────────────────────────────────────
def get_batch_size(total_rows):
if total_rows < 50: return 10
elif total_rows < 200: return 10
elif total_rows < 500: return 5
else: return 5
def get_delay(total_rows):
if total_rows < 50: return (5, 10)
elif total_rows < 200: return (10, 20)
else: return (20, 40)
def is_blocked(data):
text = str(data).lower()
blocked = any(kw in text for kw in ['captcha', 'blocked', 'rate limit', 'too many', '429'])
if blocked:
print(f"🚫 BLOCKED: {str(data)[:100]}")
return blocked
def fix_encoding(text):
"""Kaputte ISO→UTF8 Zeichen reparieren (z.B. Industriestraße → Industriestraße)"""
if not isinstance(text, str):
return text
try:
return text.encode('latin-1').decode('utf-8')
except (UnicodeEncodeError, UnicodeDecodeError):
return text
def build_input_addresses(df):
"""Normalisierte Adressen aus Input-CSV für Abgleich"""
addresses = set()
for _, row in df.iterrows():
plz = str(row.get('PLZ', '')).strip()
stadt = str(row.get('Stadt', '')).strip()
str_ = str(row.get('Straße', '')).strip()
nr = str(row.get('Hausnummer', '')).strip()
zusatz = str(row.get('Zusatz', '')).strip()
full = f"{str_} {nr} {zusatz} {plz} {stadt}".lower().strip()
full = ' '.join(full.split())
addresses.add(full)
return addresses
def normalize_address(addr):
"""Output-Adresse normalisieren für Abgleich"""
if not isinstance(addr, str):
return ''
addr = fix_encoding(addr)
return ' '.join(addr.lower().strip().split())
def address_in_input(result_addr, input_addresses):
"""Prüft ob PLZ + Straßenname aus Result im Input vorkommen"""
norm = normalize_address(result_addr)
for inp_addr in input_addresses:
plz_match = re.search(r'\b\d{5}\b', inp_addr)
if plz_match:
plz = plz_match.group()
if plz in norm:
street = inp_addr.split()[0] if inp_addr else ''
if len(street) > 3 and street[:4].lower() in norm:
return True
return False
# ──────────────────────────────────────────────
# CSV Nachbearbeitung (apply_filter umschaltbar)
# ──────────────────────────────────────────────
def process_result_csv(raw_bytes, input_df, apply_filter=True):
"""
Raw CSV → bereinigt:
- Nur OUTPUT_COLS
- Encoding fix
- Optional: Input/Output Abgleich + Duplikate
"""
try:
content = raw_bytes.decode('utf-8', errors='replace')
df_out = pd.read_csv(StringIO(content))
print(f"📄 Raw result: {df_out.shape} | Columns: {list(df_out.columns)[:8]}")
# Spalten filtern
available = [c for c in OUTPUT_COLS if c in df_out.columns]
missing = [c for c in OUTPUT_COLS if c not in df_out.columns]
if missing:
print(f"⚠️ Fehlende Spalten: {missing}")
df_out = df_out[available]
# 🔤 Encoding fix
for col in df_out.columns:
df_out[col] = df_out[col].apply(fix_encoding)
print(f"🔤 Encoding fix: done")
if apply_filter:
# 📍 Input/Output Abgleich
input_addresses = build_input_addresses(input_df)
before = len(df_out)
df_out = df_out[
df_out['address'].apply(
lambda a: address_in_input(a, input_addresses)
)
]
print(f"📍 Adress-Filter: {before} → {len(df_out)} Zeilen")
# 🔁 Duplikate entfernen (immer, auch bei Raw)
before_dedup = len(df_out)
df_out = df_out.drop_duplicates(subset=['title', 'address'], keep='first')
print(f"🔁 Duplikate: {before_dedup} → {len(df_out)} Zeilen")
# Leere Titel entfernen
df_out = df_out.dropna(subset=['title'], how='all')
df_out = df_out[df_out['title'].str.strip().astype(bool)]
print(f"✅ Final ({'gefiltert' if apply_filter else 'alle'}): {df_out.shape}")
return df_out
except Exception as e:
print(f"💥 process_result_csv: {e}")
import traceback
traceback.print_exc()
return None
# ──────────────────────────────────────────────
# Haupt-Worker
# ──────────────────────────────────────────────
def process_file(filename, job_id, app):
print(f"🎯 {filename} Job#{job_id} START!")
with app.app_context():
job = Job.query.get(job_id)
if not job:
print("❌ Job missing")
return
try:
# 1⃣ CSV Parse
job.status = "📊 parsing CSV"
db.session.commit()
filepath = os.path.join(UPLOAD_FOLDER, filename)
print(f"📁 {filepath} | {os.path.getsize(filepath)}b")
df_input = pd.read_csv(filepath, sep=';', encoding='ISO-8859-1')
print(f"📊 {df_input.shape} | Columns: {list(df_input.columns)}")
queries = []
for _, row in df_input.iterrows():
parts = [
str(row.get('PLZ', '')).strip(),
str(row.get('Stadt', '')).strip(),
str(row.get('Straße', '')).strip(),
str(row.get('Hausnummer', '')).strip(),
str(row.get('Zusatz', '')).strip(),
]
q = f"Firmen {' '.join(p for p in parts if p and p != 'nan')}".strip()
if len(q) > 10:
queries.append(q)
total = len(queries)
print(f"🔍 {total} Queries | Samples: {queries[:3]}")
if not queries:
raise ValueError("Keine gültigen Adressen in CSV")
# 2⃣ Batch + Delay
batch_size = get_batch_size(total)
delay_min, delay_max = get_delay(total)
batch = queries[:batch_size]
pre_delay = random.uniform(delay_min, delay_max)
print(f"📦 Batch {len(batch)}/{total} | 😴 {pre_delay:.1f}s Delay")
time.sleep(pre_delay)
# 3⃣ API Call
job.status = "📤 sending to scraper"
db.session.commit()
payload = {
"name": f"{filename.replace('.csv','')}-{job_id}",
"keywords": batch,
"lang": "de",
"depth": 1,
"zoom": 17,
"radius": 50,
"max_time": 60,
"fast_mode": False
}
print(f"🌐 POST {SCRAPER_URL}/api/v1/jobs | {payload['name']}")
resp = requests.post(f"{SCRAPER_URL}/api/v1/jobs", json=payload, timeout=30)
print(f"📤 {resp.status_code}: {resp.text[:300]}")
if is_blocked(resp.text):
raise ValueError("🚫 IP geblockt! Proxy konfigurieren.")
if resp.status_code != 201:
raise ValueError(f"API {resp.status_code}: {resp.text[:200]}")
# 4⃣ Polling
scraper_id = resp.json()['id']
job.scraper_job_id = scraper_id
job.status = "⏳ scraping"
db.session.commit()
print(f"✅ Scraper Job: {scraper_id}")
for i in range(1, 61): # Max 10min
try:
r = requests.get(
f"{SCRAPER_URL}/api/v1/jobs/{scraper_id}",
timeout=10
)
data = r.json()
status = data.get('Status', data.get('status', '?'))
print(f"⏳ {i}/60: {status}")
if is_blocked(data):
raise ValueError("🚫 IP geblockt während scraping!")
if status in ('ok', 'completed', 'scraped'):
dl = requests.get(
f"{SCRAPER_URL}/api/v1/jobs/{scraper_id}/download",
timeout=60
)
if dl.status_code != 200:
raise ValueError(f"Download {dl.status_code}")
if is_blocked(dl.text[:200]):
raise ValueError("🚫 IP geblockt beim Download!")
# 5⃣ Nachbearbeitung → zwei Versionen
job.status = "🔧 processing result"
db.session.commit()
base = filename.replace('.csv', '')
os.makedirs(RESULT_FOLDER, exist_ok=True)
# ── Version A: Gefiltert (Adressabgleich + Deduplizierung) ──
df_filtered = process_result_csv(dl.content, df_input, apply_filter=True)
outname_filtered = f"results_{base}_filtered.csv"
outpath_filtered = os.path.join(RESULT_FOLDER, outname_filtered)
if df_filtered is not None and len(df_filtered) > 0:
df_filtered.to_csv(
outpath_filtered, index=False,
encoding='utf-8-sig', sep=';'
)
print(f"🎯 Filtered: {outname_filtered} → {len(df_filtered)} Firmen")
else:
print("⚠️ Keine Treffer nach Filter leere Datei wird erstellt")
pd.DataFrame(columns=OUTPUT_COLS).to_csv(
outpath_filtered, index=False,
encoding='utf-8-sig', sep=';'
)
# ── Version B: Alle (nur Spalten + Encoding, kein Filter) ──
df_raw = process_result_csv(dl.content, df_input, apply_filter=False)
outname_raw = f"results_{base}_all.csv"
outpath_raw = os.path.join(RESULT_FOLDER, outname_raw)
if df_raw is not None:
df_raw.to_csv(
outpath_raw, index=False,
encoding='utf-8-sig', sep=';'
)
print(f"📋 All: {outname_raw} → {len(df_raw)} Firmen")
else:
print("⚠️ df_raw None Rohinhalt wird gespeichert")
with open(outpath_raw, 'wb') as f:
f.write(dl.content)
# ── DB speichern ──
job.status = "✅ Fertig"
job.result_filename = outname_filtered # 🎯 Gefiltert
job.result_filename_raw = outname_raw # 📋 Alle
db.session.commit()
print(f"🎉 Beide Dateien gespeichert!")
break
elif status in ('failed', 'cancelled', 'error'):
raise ValueError(f"Scraper: {status}")
except requests.RequestException as e:
print(f"⚠️ Poll {i}: {e}")
time.sleep(random.uniform(8, 15))
else:
raise ValueError("Timeout nach 10min")
except Exception as e:
job.status = "Failed"
job.result_filename = str(e)
print(f"💥 ERROR: {e}")
import traceback
traceback.print_exc()
db.session.commit()
print(f"✅ DONE! Status: {job.status}\n")

View file

@ -1,275 +0,0 @@
import os
import re
import pandas as pd
import requests
import time
import random
from io import StringIO
from app.models import db, Job
print("🆕 MODERN webcrawler LOADED! BATCHED + PROXY")
UPLOAD_FOLDER = '/app/uploads'
RESULT_FOLDER = '/app/results'
SCRAPER_URL = "http://gmaps-scraper:8080"
OUTPUT_COLS = ['title', 'category', 'address', 'open_hours', 'website', 'phone', 'link']
PROXY_URL = "http://bitlleuv-rotate:s5hzse6hz74b@p.webshare.io:80"
API_PROXIES = {"http": PROXY_URL, "https": PROXY_URL}
# ──────────────────────────────────────────────
# Hilfsfunktionen
# ──────────────────────────────────────────────
def is_blocked(data):
text = str(data).lower()
blocked = any(kw in text for kw in ['captcha', 'blocked', 'rate limit', 'too many', '429'])
if blocked:
print(f"🚫 BLOCKED: {str(data)[:100]}")
return blocked
def fix_encoding(text):
if not isinstance(text, str):
return text
try:
return text.encode('latin-1').decode('utf-8')
except (UnicodeEncodeError, UnicodeDecodeError):
return text
def build_input_addresses(df):
addresses = set()
for _, row in df.iterrows():
plz = str(row.get('PLZ', '')).strip()
stadt = str(row.get('Stadt', '')).strip()
str_ = str(row.get('Straße', '')).strip()
nr = str(row.get('Hausnummer', '')).strip()
zusatz = str(row.get('Zusatz', '')).strip()
full = f"{str_} {nr} {zusatz} {plz} {stadt}".lower().strip()
full = ' '.join(full.split())
addresses.add(full)
return addresses
def normalize_address(addr):
if not isinstance(addr, str):
return ''
addr = fix_encoding(addr)
return ' '.join(addr.lower().strip().split())
def address_in_input(result_addr, input_addresses):
norm = normalize_address(result_addr)
for inp_addr in input_addresses:
plz_match = re.search(r'\b\d{5}\b', inp_addr)
if plz_match:
plz = plz_match.group()
if plz in norm:
street = inp_addr.split()[0] if inp_addr else ''
if len(street) > 3 and street[:4].lower() in norm:
return True
return False
# ──────────────────────────────────────────────
# CSV Nachbearbeitung
# ──────────────────────────────────────────────
def process_result_csv(raw_bytes, input_df, apply_filter=True):
try:
content = raw_bytes.decode('utf-8', errors='replace')
df_out = pd.read_csv(StringIO(content))
print(f"📄 Raw result: {df_out.shape}")
available = [c for c in OUTPUT_COLS if c in df_out.columns]
df_out = df_out[available]
for col in df_out.columns:
df_out[col] = df_out[col].apply(fix_encoding)
if apply_filter:
input_addresses = build_input_addresses(input_df)
before = len(df_out)
df_out = df_out[
df_out['address'].apply(
lambda a: address_in_input(a, input_addresses)
)
]
print(f"📍 Filter: {before} → {len(df_out)}")
df_out = df_out.drop_duplicates(subset=['title', 'address'], keep='first')
df_out = df_out.dropna(subset=['title'], how='all')
df_out = df_out[df_out['title'].str.strip().astype(bool)]
print(f"✅ Final ({'gefiltert' if apply_filter else 'alle'}): {df_out.shape}")
return df_out
except Exception as e:
print(f"💥 process_result_csv: {e}")
return None
# ──────────────────────────────────────────────
# HAUPT-WORKER
# ──────────────────────────────────────────────
def process_file(filename, job_id, app):
print(f"🎯 {filename} Job#{job_id} START!")
with app.app_context():
job = Job.query.get(job_id)
if not job:
print("❌ Job missing")
return
try:
#Parse + ALLE Queries
job.status = "📊 parsing CSV"
db.session.commit()
filepath = os.path.join(UPLOAD_FOLDER, filename)
print(f"📁 {filepath} | {os.path.getsize(filepath)}b")
df_input = pd.read_csv(filepath, sep=';', encoding='ISO-8859-1')
print(f"📊 {df_input.shape}")
queries = []
for _, row in df_input.iterrows():
parts = [
str(row.get('PLZ', '')).strip(),
str(row.get('Stadt', '')).strip(),
str(row.get('Straße', '')).strip(),
str(row.get('Hausnummer', '')).strip(),
str(row.get('Zusatz', '')).strip(),
]
q = f"Firmen {' '.join(p for p in parts if p and p != 'nan')}".strip()
if len(q) > 10:
queries.append(q)
total_queries = len(queries)
print(f"🔍 {total_queries} Queries | Samples: {queries[:3]}")
if total_queries == 0:
raise ValueError("Keine gültigen Adressen")
#BATCHED Processing
BATCH_SIZE = 10 # Erhöht: 5 → 10 (paid proxy)
BATCH_DELAY_MIN, BATCH_DELAY_MAX = 10, 20 # Reduziert: 30-60s → 10-20s (paid proxy)
batches = (total_queries + BATCH_SIZE - 1) // BATCH_SIZE
print(f"📦 {batches} Batches à {BATCH_SIZE} | ETA: ~{batches*15//60:.0f}h")
all_results_filtered = []
all_results_raw = []
job.status = f"🔄 Batch 1/{batches}"
db.session.commit()
for batch_idx in range(batches):
batch_start = batch_idx * BATCH_SIZE
batch_end = min(batch_start + BATCH_SIZE, total_queries)
batch_queries = queries[batch_start:batch_end]
print(f"\n🔄 BATCH {batch_idx+1}/{batches} ({batch_start+1}-{batch_end}/{total_queries})")
#Random Delay
delay = random.uniform(BATCH_DELAY_MIN, BATCH_DELAY_MAX)
print(f"😴 Delay: {delay:.0f}s | Proxy: {PROXY_URL}")
time.sleep(delay)
#API Call
payload = {
"name": f"{filename.replace('.csv','')}-{job_id}-B{batch_idx+1:03d}",
"keywords": batch_queries,
"lang": "de",
"depth": 1,
"zoom": 17,
"radius": 50,
"max_time": 60, # Reduziert: 120 → 60 (paid proxy schneller)
"fast_mode": False,
"proxies": [PROXY_URL]
}
try:
resp = requests.post(
f"{SCRAPER_URL}/api/v1/jobs",
json=payload,
timeout=45
)
print(f"📤 {resp.status_code}")
if is_blocked(resp.text):
print("🚫 Batch übersprungen (blocked)")
continue
if resp.status_code != 201:
print(f"⚠️ Batch {batch_idx+1} fehlgeschlagen: {resp.text[:100]}")
continue
scraper_id = resp.json()['id']
print(f"✅ Scraper: {scraper_id}")
for poll_i in range(1, 61): # Reduziert: 121 → 61 (max_time 60s)
r = requests.get(
f"{SCRAPER_URL}/api/v1/jobs/{scraper_id}",
timeout=15
)
data = r.json()
status = data.get('Status', data.get('status', '?'))
if status in ('ok', 'completed', 'scraped'):
dl = requests.get(
f"{SCRAPER_URL}/api/v1/jobs/{scraper_id}/download",
timeout=90
)
if dl.status_code == 200:
df_filtered = process_result_csv(dl.content, df_input, True)
df_raw = process_result_csv(dl.content, df_input, False)
if df_filtered is not None:
all_results_filtered.append(df_filtered)
all_results_raw.append(df_raw)
print(f"📊 Batch {batch_idx+1}: {len(df_filtered)} filtered")
break
elif status in ('failed', 'error'):
print(f"💥 Batch {batch_idx+1}: {status}")
break
time.sleep(random.uniform(5, 10)) # Reduziert: 10-20s → 5-10s (paid proxy)
except Exception as e:
print(f"💥 Batch {batch_idx+1}: {e}")
job.status = f"🔄 Batch {batch_idx+2}/{batches}"
db.session.commit()
#MERGE & SAVE
job.status = "🔧 merging results"
db.session.commit()
base = filename.replace('.csv', '')
os.makedirs(RESULT_FOLDER, exist_ok=True)
if all_results_filtered:
df_final_filtered = pd.concat(all_results_filtered, ignore_index=True)
df_final_filtered = df_final_filtered.drop_duplicates(subset=['title', 'address'])
out_filtered = f"results_{base}_filtered.csv"
df_final_filtered.to_csv(
os.path.join(RESULT_FOLDER, out_filtered),
index=False, encoding='utf-8-sig', sep=';'
)
if all_results_raw:
df_final_raw = pd.concat(all_results_raw, ignore_index=True)
out_raw = f"results_{base}_all.csv"
df_final_raw.to_csv(
os.path.join(RESULT_FOLDER, out_raw),
index=False, encoding='utf-8-sig', sep=';'
)
job.result_filename = out_filtered
job.result_filename_raw = out_raw
job.status = f"✅ Fertig: {len(df_final_filtered)} Firmen"
else:
job.status = "❌ Keine Ergebnisse"
db.session.commit()
print(f"🎉 Job {job_id} komplett!")
except Exception as e:
job.status = f"Failed: {str(e)[:50]}"
print(f"💥 FATAL: {e}")
import traceback
traceback.print_exc()
db.session.commit()
print(f"✅ DONE! Status: {job.status}")

View file

@ -1,429 +0,0 @@
import os
import re
import unicodedata
import json
import pandas as pd
import requests
import time
import random
from io import StringIO
from app.models import db, Job
print("🆕 MODERN webcrawler LOADED! BATCHED + PROXY + RESUME + ETA + 2x SCRAPER")
UPLOAD_FOLDER = '/app/uploads'
RESULT_FOLDER = '/app/results'
# 2x Scraper abwechselnd genutzt
SCRAPER_URLS = [
"http://gmaps-scraper-1:8080",
"http://gmaps-scraper-2:8080",
]
OUTPUT_COLS = ['title', 'category', 'address', 'open_hours', 'website', 'phone', 'link']
PROXY_URL = "http://bitlleuv-rotate:s5hzse6hz74b@p.webshare.io:80"
API_PROXIES = {"http": PROXY_URL, "https": PROXY_URL}
# ──────────────────────────────────────────────
# Tuning
# ──────────────────────────────────────────────
BATCH_SIZE = 30 # Keywords pro Scraper-Job
BATCH_DELAY_MIN = 3 # Sekunden Pause zwischen Batches (min)
BATCH_DELAY_MAX = 6 # Sekunden Pause zwischen Batches (max)
MAX_TIME = 60 # Sekunden die der Scraper pro Batch hat
POLL_MAX = 90 # Max. Poll-Versuche pro Batch
POLL_DELAY_MIN = 2 # Sekunden zwischen Polls (min)
POLL_DELAY_MAX = 5 # Sekunden zwischen Polls (max)
STUCK_THRESHOLD = 8 # Polls auf 'pending' bis Auto-Restart
MAX_RETRIES = 2 # Wiederholversuche pro Batch bei Fehler
# ──────────────────────────────────────────────
# Hilfsfunktionen
# ──────────────────────────────────────────────
def is_blocked(data):
text = str(data).lower()
blocked = any(kw in text for kw in ['captcha', 'blocked', 'rate limit', 'too many', '429'])
if blocked:
print(f"🚫 BLOCKED: {str(data)[:100]}")
return blocked
def fix_encoding(text):
if not isinstance(text, str):
return text
try:
return text.encode('latin-1').decode('utf-8')
except (UnicodeEncodeError, UnicodeDecodeError):
return text
# Fix 1: Sonderzeichen in Queries bereinigen
def clean_query(q):
"""Steuerzeichen + fehlerhafte Bytes entfernen für saubere Google Maps URLs"""
q = ''.join(c for c in q if unicodedata.category(c) != 'Cc')
q = ' '.join(q.split())
return q.strip()
def build_input_addresses(df):
addresses = set()
for _, row in df.iterrows():
plz = str(row.get('PLZ', '')).strip()
stadt = str(row.get('Stadt', '')).strip()
str_ = str(row.get('Straße', '')).strip()
nr = str(row.get('Hausnummer', '')).strip()
zusatz = str(row.get('Zusatz', '')).strip()
full = f"{str_} {nr} {zusatz} {plz} {stadt}".lower().strip()
full = ' '.join(full.split())
addresses.add(full)
return addresses
def normalize_address(addr):
if not isinstance(addr, str):
return ''
addr = fix_encoding(addr)
return ' '.join(addr.lower().strip().split())
def address_in_input(result_addr, input_addresses):
norm = normalize_address(result_addr)
for inp_addr in input_addresses:
plz_match = re.search(r'\b\d{5}\b', inp_addr)
if plz_match:
plz = plz_match.group()
if plz in norm:
street = inp_addr.split()[0] if inp_addr else ''
if len(street) > 3 and street[:4].lower() in norm:
return True
return False
def format_eta(seconds):
"""Sekunden → lesbares ETA-Format"""
if seconds < 60:
return f"{int(seconds)}s"
h, rem = divmod(int(seconds), 3600)
m = rem // 60
return f"{h}h {m:02d}min" if h > 0 else f"{m}min"
# ──────────────────────────────────────────────
# Fix 3: Scraper-Neustart bei Inactivity
# ──────────────────────────────────────────────
def restart_scraper(scraper_url):
"""Den betroffenen Scraper-Container neu starten"""
try:
import subprocess
# Container-Name aus URL ableiten: http://gmaps-scraper-1:8080 → gmaps-scraper-1
container = scraper_url.split("//")[1].split(":")[0]
print(f"🔄 Starte {container} neu...")
subprocess.run(["docker", "restart", container], timeout=30, capture_output=True)
print(f"✅ {container} neu gestartet warte 15s...")
time.sleep(15)
return True
except Exception as e:
print(f"⚠️ Scraper-Neustart fehlgeschlagen: {e}")
return False
# ──────────────────────────────────────────────
# Resume: Progress-File Hilfsfunktionen
# ──────────────────────────────────────────────
def get_progress_path(job_id):
return os.path.join(RESULT_FOLDER, f"progress_{job_id}.json")
def get_partial_path(job_id, suffix):
return os.path.join(RESULT_FOLDER, f"partial_{job_id}_{suffix}.csv")
def load_progress(job_id):
"""Gespeicherten Fortschritt laden (falls vorhanden)"""
path = get_progress_path(job_id)
if os.path.exists(path):
with open(path, 'r') as f:
data = json.load(f)
print(f"🔁 RESUME: ab Batch {data['last_completed_batch'] + 1}/{data['total_batches']}")
return data
return None
def save_progress(job_id, last_completed_batch, total_batches):
"""Fortschritt nach jedem Batch speichern"""
path = get_progress_path(job_id)
with open(path, 'w') as f:
json.dump({"last_completed_batch": last_completed_batch, "total_batches": total_batches}, f)
def append_partial(job_id, df_filtered, df_raw):
"""Batch-Ergebnis an Partial-CSV anhängen"""
for suffix, df in [('filtered', df_filtered), ('raw', df_raw)]:
if df is None:
continue
path = get_partial_path(job_id, suffix)
header = not os.path.exists(path)
df.to_csv(path, mode='a', index=False, header=header, encoding='utf-8-sig', sep=';')
def load_partial(job_id):
"""Bestehende Partial-CSVs laden"""
results_filtered, results_raw = [], []
for suffix, lst in [('filtered', results_filtered), ('raw', results_raw)]:
path = get_partial_path(job_id, suffix)
if os.path.exists(path):
try:
df = pd.read_csv(path, sep=';', encoding='utf-8-sig')
lst.append(df)
print(f"📂 Partial {suffix}: {len(df)} Zeilen geladen")
except Exception as e:
print(f"⚠️ Partial {suffix} Ladefehler: {e}")
return results_filtered, results_raw
def cleanup_progress(job_id):
"""Progress + Partial-Files nach Abschluss löschen"""
for path in [
get_progress_path(job_id),
get_partial_path(job_id, 'filtered'),
get_partial_path(job_id, 'raw'),
]:
if os.path.exists(path):
os.remove(path)
# ──────────────────────────────────────────────
# CSV Nachbearbeitung
# ──────────────────────────────────────────────
def process_result_csv(raw_bytes, input_df, apply_filter=True):
try:
content = raw_bytes.decode('utf-8', errors='replace')
df_out = pd.read_csv(StringIO(content))
print(f"📄 Raw result: {df_out.shape}")
available = [c for c in OUTPUT_COLS if c in df_out.columns]
df_out = df_out[available]
for col in df_out.columns:
df_out[col] = df_out[col].apply(fix_encoding)
if apply_filter:
input_addresses = build_input_addresses(input_df)
before = len(df_out)
df_out = df_out[
df_out['address'].apply(lambda a: address_in_input(a, input_addresses))
]
print(f"📍 Filter: {before} → {len(df_out)}")
df_out = df_out.drop_duplicates(subset=['title', 'address'], keep='first')
df_out = df_out.dropna(subset=['title'], how='all')
df_out = df_out[df_out['title'].str.strip().astype(bool)]
print(f"✅ Final ({'gefiltert' if apply_filter else 'alle'}): {df_out.shape}")
return df_out
except Exception as e:
print(f"💥 process_result_csv: {e}")
return None
# ──────────────────────────────────────────────
# HAUPT-WORKER
# ──────────────────────────────────────────────
def process_file(filename, job_id, app):
print(f"🎯 {filename} Job#{job_id} START!")
with app.app_context():
job = Job.query.get(job_id)
if not job:
print("❌ Job missing")
return
try:
#Parse + ALLE Queries
job.status = "📊 parsing CSV"
db.session.commit()
filepath = os.path.join(UPLOAD_FOLDER, filename)
print(f"📁 {filepath} | {os.path.getsize(filepath)}b")
df_input = pd.read_csv(filepath, sep=';', encoding='ISO-8859-1')
print(f"📊 {df_input.shape}")
queries = []
for _, row in df_input.iterrows():
parts = [
str(row.get('PLZ', '')).strip(),
str(row.get('Stadt', '')).strip(),
str(row.get('Straße', '')).strip(),
str(row.get('Hausnummer', '')).strip(),
str(row.get('Zusatz', '')).strip(),
]
q = f"Firmen {' '.join(p for p in parts if p and p != 'nan')}".strip()
q = clean_query(q) # Fix 1: Sonderzeichen bereinigen
if len(q) > 10:
queries.append(q)
total_queries = len(queries)
print(f"🔍 {total_queries} Queries | Samples: {queries[:3]}")
if total_queries == 0:
raise ValueError("Keine gültigen Adressen")
#BATCHED Processing
batches = (total_queries + BATCH_SIZE - 1) // BATCH_SIZE
# Resume: Fortschritt laden falls vorhanden
os.makedirs(RESULT_FOLDER, exist_ok=True)
progress = load_progress(job_id)
start_batch = progress['last_completed_batch'] + 1 if progress else 0
all_results_filtered, all_results_raw = load_partial(job_id) if progress else ([], [])
eta_initial = format_eta((batches - start_batch) * ((BATCH_DELAY_MAX + MAX_TIME) / 2))
print(f"📦 {batches} Batches à {BATCH_SIZE} | 2x Scraper | Start: {start_batch} | ETA: ~{eta_initial}")
job_start_time = time.time()
job.status = f"🔄 Batch {start_batch+1}/{batches} | ⏱️ ~{eta_initial}"
db.session.commit()
for batch_idx in range(start_batch, batches):
batch_start = batch_idx * BATCH_SIZE
batch_end = min(batch_start + BATCH_SIZE, total_queries)
batch_queries = queries[batch_start:batch_end]
# 2x Scraper: abwechselnd nutzen
scraper_url = SCRAPER_URLS[batch_idx % len(SCRAPER_URLS)]
print(f"\n🔄 BATCH {batch_idx+1}/{batches} ({batch_start+1}-{batch_end}/{total_queries}) → {scraper_url}")
#Random Delay
delay = random.uniform(BATCH_DELAY_MIN, BATCH_DELAY_MAX)
print(f"😴 Delay: {delay:.0f}s")
time.sleep(delay)
#API Call
payload = {
"name": f"{filename.replace('.csv','')}-{job_id}-B{batch_idx+1:03d}",
"keywords": batch_queries,
"lang": "de",
"depth": 1,
"zoom": 15,
"radius": 50,
"max_time": MAX_TIME,
"fast_mode": False,
"proxies": [PROXY_URL]
}
batch_success = False
# Fix 2: Retry-Logik bei Scraper-Fehler
for attempt in range(1, MAX_RETRIES + 1):
try:
resp = requests.post(f"{scraper_url}/api/v1/jobs", json=payload, timeout=45)
print(f"📤 {resp.status_code} (Versuch {attempt} | {scraper_url})")
if is_blocked(resp.text):
print("🚫 Batch übersprungen (blocked)")
break
if resp.status_code != 201:
print(f"⚠️ Batch {batch_idx+1} fehlgeschlagen: {resp.text[:100]}")
if attempt < MAX_RETRIES:
time.sleep(10)
continue
scraper_id = resp.json()['id']
print(f"✅ Scraper: {scraper_id}")
stuck_counter = 0
for poll_i in range(1, POLL_MAX + 1):
r = requests.get(f"{scraper_url}/api/v1/jobs/{scraper_id}", timeout=15)
data = r.json()
status = data.get('Status', data.get('status', '?'))
print(f"⏳ Poll {poll_i}: {status}")
# Fix 4: Auto-Recovery bei Pending-Stuck
if status == 'pending':
stuck_counter += 1
if stuck_counter >= STUCK_THRESHOLD:
print(f"⚠️ Job {scraper_id} hängt abbrechen + Neustart")
requests.delete(f"{scraper_url}/api/v1/jobs/{scraper_id}", timeout=10)
restart_scraper(scraper_url) # Fix 3: Nur betroffenen Scraper neu starten
break
else:
stuck_counter = 0
if status in ('ok', 'completed', 'scraped'):
dl = requests.get(f"{scraper_url}/api/v1/jobs/{scraper_id}/download", timeout=90)
if dl.status_code == 200:
df_filtered = process_result_csv(dl.content, df_input, True)
df_raw = process_result_csv(dl.content, df_input, False)
if df_filtered is not None:
all_results_filtered.append(df_filtered)
all_results_raw.append(df_raw)
append_partial(job_id, df_filtered, df_raw) # Resume: sofort speichern
print(f"📊 Batch {batch_idx+1}: {len(df_filtered)} filtered")
batch_success = True
break
# Fix 2: Scraper-Fehler → Retry
elif status in ('failed', 'error'):
print(f"💥 Batch {batch_idx+1}: {status} (Versuch {attempt})")
if attempt < MAX_RETRIES:
time.sleep(10)
break
time.sleep(random.uniform(POLL_DELAY_MIN, POLL_DELAY_MAX))
if batch_success:
break
except Exception as e:
print(f"💥 Batch {batch_idx+1} Versuch {attempt}: {e}")
if attempt < MAX_RETRIES:
time.sleep(10)
# Resume: Fortschritt nach jedem Batch speichern
save_progress(job_id, batch_idx, batches)
# ETA berechnen
elapsed = time.time() - job_start_time
done_so_far = batch_idx - start_batch + 1
if done_so_far > 0:
avg_per_batch = elapsed / done_so_far
remaining = (batches - batch_idx - 1) * avg_per_batch
eta_str = format_eta(remaining)
else:
eta_str = "?"
job.status = f"🔄 Batch {batch_idx+2}/{batches} | ⏱️ ~{eta_str}"
db.session.commit()
#MERGE & SAVE
job.status = "🔧 merging results"
db.session.commit()
base = filename.replace('.csv', '')
if all_results_filtered:
df_final_filtered = pd.concat(all_results_filtered, ignore_index=True)
df_final_filtered = df_final_filtered.drop_duplicates(subset=['title', 'address'])
out_filtered = f"results_{base}_filtered.csv"
df_final_filtered.to_csv(
os.path.join(RESULT_FOLDER, out_filtered),
index=False, encoding='utf-8-sig', sep=';'
)
if all_results_raw:
df_final_raw = pd.concat(all_results_raw, ignore_index=True)
out_raw = f"results_{base}_all.csv"
df_final_raw.to_csv(
os.path.join(RESULT_FOLDER, out_raw),
index=False, encoding='utf-8-sig', sep=';'
)
job.result_filename = out_filtered
job.result_filename_raw = out_raw
job.status = f"✅ Fertig: {len(df_final_filtered)} Firmen"
# Resume: Cleanup nach Abschluss
cleanup_progress(job_id)
else:
job.status = "❌ Keine Ergebnisse"
db.session.commit()
print(f"🎉 Job {job_id} komplett!")
except Exception as e:
job.status = f"Failed: {str(e)[:50]}"
print(f"💥 FATAL: {e}")
import traceback
traceback.print_exc()
db.session.commit()
print(f"✅ DONE! Status: {job.status}")

View file

@ -1,138 +0,0 @@
import csv
import os
import requests
from .models import db, Job
from flask import current_app
UPLOAD_FOLDER = 'uploads'
RESULT_FOLDER = 'results'
API_KEY = 'AIzaSyAIf0yXJTwo87VMWLBtq2m2LqE-OaPGbzw'
processed_companies = set()
def get_geocode(address):
url = f"https://maps.googleapis.com/maps/api/geocode/json"
params = {'address': address, 'key': API_KEY}
try:
response = requests.get(url, params=params, timeout=5)
if response.status_code == 200:
data = response.json()
if data['status'] == 'OK':
location = data['results'][0]['geometry']['location']
return location['lat'], location['lng']
except requests.RequestException as e:
print(f"Geocode API Fehler für {address}: {e}")
return None, None
def get_nearby_places(lat, lng):
places_url = f"https://maps.googleapis.com/maps/api/place/nearbysearch/json"
params = {
'location': f"{lat},{lng}",
'radius': 10,
'type': 'point_of_interest',
'key': API_KEY
}
try:
response = requests.get(places_url, params=params, timeout=5)
if response.status_code == 200:
return response.json().get('results', [])
except requests.RequestException as e:
print(f"Nearby Places API Fehler für Standort {lat},{lng}: {e}")
return []
def get_place_details(place_id):
details_url = f"https://maps.googleapis.com/maps/api/place/details/json"
params = {
'place_id': place_id,
'fields': 'formatted_phone_number,website',
'key': API_KEY
}
try:
response = requests.get(details_url, params=params, timeout=5)
if response.status_code == 200:
result = response.json().get('result', {})
return result.get('formatted_phone_number', 'N/A'), result.get('website', 'N/A')
except requests.RequestException as e:
print(f"Place Details API Fehler für Place ID {place_id}: {e}")
return 'N/A', 'N/A'
def process_file(filename, job_id, app):
with app.app_context():
filepath = os.path.join(UPLOAD_FOLDER, filename)
results = []
job = Job.query.get(job_id)
if not job:
print("Job wurde abgebrochen.")
return
job.status = "In Progress"
db.session.commit()
with open(filepath, newline='', encoding='ISO-8859-1') as csvfile:
reader = csv.DictReader(csvfile, delimiter=';')
headers = reader.fieldnames
if not all(field in headers for field in ['PLZ', 'Straße', 'Hausnummer']):
print("CSV-Datei enthält nicht alle notwendigen Spalten.")
job.status = "Failed"
db.session.commit()
return
for row in reader:
plz = row.get('PLZ', '').strip()
city = row.get('Stadt', row.get('Bezirk', '')).strip()
street = row.get('Straße', '').strip()
house_number = row.get('Hausnummer', '').strip()
additional = row.get('Zusatz', '').strip()
if not all([plz, city, street, house_number]):
continue
full_address = f"{street} {house_number} {additional}, {plz} {city}"
lat, lng = get_geocode(full_address)
if lat is None or lng is None:
continue
nearby_places = get_nearby_places(lat, lng)
for place in nearby_places:
company_name = place['name']
if company_name in processed_companies:
continue
processed_companies.add(company_name)
company_address = place.get('vicinity', 'N/A').split(',')[0]
place_id = place.get('place_id')
company_phone, company_website = get_place_details(place_id) if place_id else ('N/A', 'N/A')
results.append({
'PLZ': plz,
'Stadt': city,
'Straße': street,
'Hausnummer': house_number,
'Zusatz': additional,
'Company Name': company_name,
'Company Address': company_address,
'Company Phone': company_phone,
'Company Website': company_website
})
if results:
result_file = f"results_{os.path.splitext(filename)[0]}.csv"
result_path = os.path.join(RESULT_FOLDER, result_file)
with open(result_path, 'w', newline='', encoding='utf-8-sig') as csvfile:
writer = csv.DictWriter(csvfile, fieldnames=[
'PLZ', 'Stadt', 'Straße', 'Hausnummer', 'Zusatz',
'Company Name', 'Company Address', 'Company Phone', 'Company Website'
])
writer.writeheader()
writer.writerows(results)
job.status = "Completed"
job.result_filename = result_file
db.session.commit()
else:
job.status = "Failed"
db.session.commit()

View file

@ -1,487 +1,128 @@
import csv
import os
import re
import unicodedata
import json
import threading
import pandas as pd
import requests
import time
import random
from io import StringIO
from concurrent.futures import ThreadPoolExecutor, as_completed
from app.models import db, Job
from .models import db, Job
from flask import current_app
print("🆕 MODERN webcrawler LOADED! BATCHED + PROXY + RESUME + ETA + 4x SCRAPER CHUNK-PARALLEL")
UPLOAD_FOLDER = 'uploads'
RESULT_FOLDER = 'results'
API_KEY = 'AIzaSyAIf0yXJTwo87VMWLBtq2m2LqE-OaPGbzw'
UPLOAD_FOLDER = '/app/uploads'
RESULT_FOLDER = '/app/results'
def get_place_details(street, city_zip):
address = f"{street}, {city_zip}"
url = f"https://maps.googleapis.com/maps/api/place/textsearch/json"
params = {'query': address, 'key': API_KEY}
SCRAPER_URLS = [
"http://gmaps-scraper-1:8080",
"http://gmaps-scraper-2:8080",
"http://gmaps-scraper-3:8080",
"http://gmaps-scraper-4:8080",
]
OUTPUT_COLS = ['title', 'category', 'address', 'open_hours', 'website', 'phone', 'link']
PROXY_URL = "http://bitlleuv-rotate:s5hzse6hz74b@p.webshare.io:80"
API_PROXIES = {"http": PROXY_URL, "https": PROXY_URL}
_job_semaphore = threading.Semaphore(1)
# ──────────────────────────────────────────────
# Tuning
# ──────────────────────────────────────────────
BATCH_SIZE = 30 # Keywords pro Scraper-Job
BATCH_DELAY_MIN = 3 # Sekunden Pause zwischen Chunks (min)
BATCH_DELAY_MAX = 6 # Sekunden Pause zwischen Chunks (max)
MAX_TIME = 60 # Sekunden die der Scraper pro Batch hat
POLL_MAX = 90 # Max. Poll-Versuche pro Batch
POLL_DELAY_MIN = 2 # Sekunden zwischen Polls (min)
POLL_DELAY_MAX = 5 # Sekunden zwischen Polls (max)
STUCK_TIMEOUT = 300 # Sekunden bis Scraper-Neustart (5 Min)
MAX_RETRIES = 2 # Wiederholversuche pro Batch bei Fehler
PARALLEL_WORKERS = len(SCRAPER_URLS)
_partial_lock = threading.Lock()
# ──────────────────────────────────────────────
# Hilfsfunktionen
# ──────────────────────────────────────────────
def is_blocked(data):
text = str(data).lower()
blocked = any(kw in text for kw in ['captcha', 'blocked', 'rate limit', 'too many', '429'])
if blocked:
print(f"🚫 BLOCKED: {str(data)[:100]}")
return blocked
def fix_encoding(text):
if not isinstance(text, str):
return text
results = []
try:
return text.encode('latin-1').decode('utf-8')
except (UnicodeEncodeError, UnicodeDecodeError):
return text
response = requests.get(url, params=params, timeout=5)
if response.status_code == 200:
data = response.json()
print(f"API Response Data for {address}: {data}")
def clean_query(q):
q = ''.join(c for c in q if unicodedata.category(c) != 'Cc')
q = ' '.join(q.split())
return q.strip()
for place in data.get('results', []):
name = place.get('name', 'N/A')
place_id = place.get('place_id')
formatted_address = place.get('formatted_address', 'N/A')
def build_input_addresses(df):
addresses = set()
for _, row in df.iterrows():
plz = str(row.get('PLZ', '')).strip()
stadt = str(row.get('Stadt', '')).strip()
str_ = str(row.get('Straße', '')).strip()
nr = str(row.get('Hausnummer', '')).strip()
zusatz = str(row.get('Zusatz', '')).strip()
full = f"{str_} {nr} {zusatz} {plz} {stadt}".lower().strip()
full = ' '.join(full.split())
addresses.add(full)
return addresses
# Zweite Anfrage für detailliertere Informationen
phone, website = 'N/A', 'N/A'
if place_id:
details_url = f"https://maps.googleapis.com/maps/api/place/details/json"
details_params = {
'place_id': place_id,
'fields': 'formatted_phone_number,website',
'key': API_KEY
}
details_response = requests.get(details_url, params=details_params, timeout=5)
if details_response.status_code == 200:
details_data = details_response.json().get('result', {})
phone = details_data.get('formatted_phone_number', 'N/A')
website = details_data.get('website', 'N/A')
def normalize_address(addr):
if not isinstance(addr, str):
return ''
addr = fix_encoding(addr)
return ' '.join(addr.lower().strip().split())
# Speichern nur, wenn Name und Telefonnummer vorhanden sind
if name != 'N/A' and phone != 'N/A':
results.append({
'Name': name,
'Address': formatted_address,
'Phone': phone,
'Website': website
})
else:
print(f"Fehler beim Abrufen der URL: {url} - Statuscode: {response.status_code}")
except requests.exceptions.RequestException as e:
print(f"Anfragefehler für {url}: {e}")
def address_in_input(result_addr, input_addresses):
norm = normalize_address(result_addr)
for inp_addr in input_addresses:
plz_match = re.search(r'\b\d{5}\b', inp_addr)
if not plz_match:
continue
plz = plz_match.group()
if plz not in norm:
continue
parts = inp_addr.split()
street = parts[0] if parts else ''
if len(street) < 4 or street[:5].lower() not in norm:
continue
hausnr = parts[1] if len(parts) > 1 else ''
if hausnr and not re.search(rf'\b{re.escape(hausnr)}\b', norm):
continue
return True
return False
def format_eta(seconds):
if seconds < 60:
return f"{int(seconds)}s"
h, rem = divmod(int(seconds), 3600)
m = rem // 60
return f"{h}h {m:02d}min" if h > 0 else f"{m}min"
# ──────────────────────────────────────────────
# Scraper-Job Cleanup
# ──────────────────────────────────────────────
def _cleanup_scraper_job(scraper_url, scraper_id):
"""Scraper-Job immer aufräumen wenn wir ihn nicht mehr brauchen"""
try:
requests.delete(f"{scraper_url}/api/v1/jobs/{scraper_id}", timeout=10)
print(f"🗑️ Scraper-Job {scraper_id} gelöscht")
except Exception as e:
print(f"⚠️ Cleanup fehlgeschlagen: {e}")
# ──────────────────────────────────────────────
# Scraper-Neustart via Docker SDK
# ──────────────────────────────────────────────
def restart_scraper(scraper_url):
try:
import docker
container_name = scraper_url.split("//")[1].split(":")[0]
print(f"🔄 Starte {container_name} neu...")
client = docker.from_env()
container = client.containers.get(container_name)
container.restart()
print(f"{container_name} neu gestartet warte 15s...")
time.sleep(15)
return True
except Exception as e:
print(f"⚠️ Scraper-Neustart fehlgeschlagen: {e}")
return False
# ──────────────────────────────────────────────
# Resume: Progress-File Hilfsfunktionen
# ──────────────────────────────────────────────
def get_progress_path(job_id):
return os.path.join(RESULT_FOLDER, f"progress_{job_id}.json")
def get_partial_path(job_id, suffix):
return os.path.join(RESULT_FOLDER, f"partial_{job_id}_{suffix}.csv")
def load_progress(job_id):
path = get_progress_path(job_id)
if os.path.exists(path):
with open(path, 'r') as f:
data = json.load(f)
print(f"🔁 RESUME: ab Batch {data['last_completed_batch'] + 1}/{data['total_batches']}")
return data
return None
def save_progress(job_id, last_completed_batch, total_batches):
path = get_progress_path(job_id)
with open(path, 'w') as f:
json.dump({"last_completed_batch": last_completed_batch, "total_batches": total_batches}, f)
def append_partial(job_id, df_filtered, df_raw):
with _partial_lock:
for suffix, df in [('filtered', df_filtered), ('raw', df_raw)]:
if df is None:
continue
path = get_partial_path(job_id, suffix)
header = not os.path.exists(path)
df.to_csv(path, mode='a', index=False, header=header, encoding='utf-8-sig', sep=';')
def load_partial(job_id):
results_filtered, results_raw = [], []
for suffix, lst in [('filtered', results_filtered), ('raw', results_raw)]:
path = get_partial_path(job_id, suffix)
if os.path.exists(path):
try:
df = pd.read_csv(path, sep=';', encoding='utf-8-sig')
lst.append(df)
print(f"📂 Partial {suffix}: {len(df)} Zeilen geladen")
except Exception as e:
print(f"⚠️ Partial {suffix} Ladefehler: {e}")
return results_filtered, results_raw
def cleanup_progress(job_id):
for path in [
get_progress_path(job_id),
get_partial_path(job_id, 'filtered'),
get_partial_path(job_id, 'raw'),
]:
if os.path.exists(path):
os.remove(path)
# ──────────────────────────────────────────────
# CSV Nachbearbeitung
# ──────────────────────────────────────────────
def process_result_csv(raw_bytes, input_df, apply_filter=True):
try:
content = raw_bytes.decode('utf-8', errors='replace')
df_out = pd.read_csv(StringIO(content))
print(f"📄 Raw result: {df_out.shape}")
available = [c for c in OUTPUT_COLS if c in df_out.columns]
df_out = df_out[available]
for col in df_out.columns:
df_out[col] = df_out[col].apply(fix_encoding)
if apply_filter:
input_addresses = build_input_addresses(input_df)
before = len(df_out)
df_out = df_out[
df_out['address'].apply(lambda a: address_in_input(a, input_addresses))
]
print(f"📍 Filter: {before}{len(df_out)}")
df_out = df_out.drop_duplicates(subset=['title', 'address'], keep='first')
df_out = df_out.dropna(subset=['title'], how='all')
df_out = df_out[df_out['title'].str.strip().astype(bool)]
print(f"✅ Final ({'gefiltert' if apply_filter else 'alle'}): {df_out.shape}")
return df_out
except Exception as e:
print(f"💥 process_result_csv: {e}")
return None
# ──────────────────────────────────────────────
# Parallel: Einzelnen Batch verarbeiten
# ──────────────────────────────────────────────
def process_batch(batch_idx, batch_queries, scraper_url, filename, job_id, df_input):
payload = {
"name": f"{filename.replace('.csv','')}-{job_id}-B{batch_idx+1:03d}",
"keywords": batch_queries,
"lang": "de",
"depth": 1,
"zoom": 17,
"radius": 100,
"max_time": MAX_TIME,
"fast_mode": False,
"proxies": [PROXY_URL]
}
for attempt in range(1, MAX_RETRIES + 1):
scraper_id = None
try:
resp = requests.post(f"{scraper_url}/api/v1/jobs", json=payload, timeout=45)
print(f"📤 Batch {batch_idx+1}{scraper_url} | {resp.status_code} (Versuch {attempt})")
if is_blocked(resp.text):
print(f"🚫 Batch {batch_idx+1} blocked")
return None, None
if resp.status_code != 201:
print(f"⚠️ Batch {batch_idx+1} fehlgeschlagen: {resp.text[:100]}")
if attempt < MAX_RETRIES:
time.sleep(10)
continue
scraper_id = resp.json()['id']
print(f"✅ Batch {batch_idx+1} Scraper-ID: {scraper_id}")
batch_start_time = time.time()
for poll_i in range(1, POLL_MAX + 1):
r = requests.get(f"{scraper_url}/api/v1/jobs/{scraper_id}", timeout=15)
data = r.json()
status = data.get('Status', data.get('status', '?'))
elapsed = time.time() - batch_start_time
print(f"⏳ Batch {batch_idx+1} Poll {poll_i}: {status} | {int(elapsed)}s")
if status == 'pending' and elapsed > STUCK_TIMEOUT:
print(f"⚠️ Batch {batch_idx+1} hängt seit {int(elapsed)}s Neustart {scraper_url}")
_cleanup_scraper_job(scraper_url, scraper_id)
scraper_id = None
restart_scraper(scraper_url)
break
if status in ('ok', 'completed', 'scraped'):
dl = requests.get(f"{scraper_url}/api/v1/jobs/{scraper_id}/download", timeout=90)
scraper_id = None
if dl.status_code == 200:
df_filtered = process_result_csv(dl.content, df_input, True)
df_raw = process_result_csv(dl.content, df_input, False)
print(f"📊 Batch {batch_idx+1}: {len(df_filtered) if df_filtered is not None else 0} filtered")
return df_filtered, df_raw
return None, None
elif status in ('failed', 'error'):
print(f"💥 Batch {batch_idx+1}: {status} (Versuch {attempt})")
_cleanup_scraper_job(scraper_url, scraper_id)
scraper_id = None
if attempt < MAX_RETRIES:
time.sleep(10)
break
time.sleep(random.uniform(POLL_DELAY_MIN, POLL_DELAY_MAX))
except Exception as e:
print(f"💥 Batch {batch_idx+1} Versuch {attempt}: {e}")
if scraper_id:
_cleanup_scraper_job(scraper_url, scraper_id)
scraper_id = None
if attempt < MAX_RETRIES:
time.sleep(10)
return None, None
# ──────────────────────────────────────────────
# HAUPT-WORKER
# ──────────────────────────────────────────────
return results
def process_file(filename, job_id, app):
with app.app_context():
print(f"Starte Prozess für Job-ID: {job_id}")
filepath = os.path.join(UPLOAD_FOLDER, filename)
results = []
job = Job.query.get(job_id)
if job:
job.status = "⏳ Wartet auf anderen Job..."
if not job:
print("Job wurde abgebrochen, bevor er starten konnte.")
return
job.status = "In Progress"
db.session.commit()
with open(filepath, newline='', encoding='ISO-8859-1') as csvfile:
reader = csv.DictReader(csvfile, delimiter=';')
rows = list(reader)
total_rows = len(rows)
print(f"Insgesamt zu verarbeitende Zeilen: {total_rows}")
for index, row in enumerate(rows):
# Job-Verfügbarkeit erneut prüfen
job = Job.query.get(job_id)
if not job:
print("Job wurde abgebrochen.")
return
# Vollständige Adresse erstellen
street = f"{row.get('Straße', '')} {row.get('Hausnummer', '')}".strip()
city_zip = f"{row.get('PLZ', '')} {row.get('Stadt', '')}".strip()
print(f"Verarbeite Adresse: {street}, {city_zip}")
address_results = get_place_details(street, city_zip)
for result in address_results:
# Ergebnisse nur speichern, wenn Name und Telefonnummer vorhanden sind
if result['Name'] != 'N/A' and result['Phone'] != 'N/A':
result.update({
'PLZ': row.get('PLZ', ''),
'Stadt': row.get('Stadt', ''),
'Straße': row.get('Straße', ''),
'Hausnummer': row.get('Hausnummer', ''),
'Zusatz': row.get('Zusatz', '')
})
results.append(result)
# Results-Dateiname basierend auf dem Upload-Dateinamen
result_file = f"results_{filename}"
result_path = os.path.join(RESULT_FOLDER, result_file)
# Prüfen und erstellen des Ergebnisverzeichnisses
if not os.path.exists(RESULT_FOLDER):
os.makedirs(RESULT_FOLDER)
print(f"Erstelle Ergebnisverzeichnis: {RESULT_FOLDER}")
try:
if results: # Nur speichern, wenn Ergebnisse vorhanden sind
with open(result_path, 'w', newline='', encoding='utf-8-sig') as csvfile:
writer = csv.DictWriter(csvfile, fieldnames=['Name', 'Address', 'Phone', 'Website', 'PLZ', 'Stadt', 'Straße', 'Hausnummer', 'Zusatz'])
writer.writeheader()
writer.writerows(results)
print(f"Ergebnisdatei erfolgreich gespeichert unter: {result_path}")
job.status = "Completed"
job.result_filename = result_file
db.session.commit()
else:
print("Keine relevanten Ergebnisse zum Speichern vorhanden. Markiere den Job als 'Failed'.")
job.status = "Failed"
db.session.commit()
except Exception as e:
print(f"Fehler beim Schreiben der Ergebnisdatei: {e}")
job.status = "Failed"
db.session.commit()
with _job_semaphore:
print(f"🎯 {filename} Job#{job_id} START!")
with app.app_context():
job = Job.query.get(job_id)
if not job:
print("❌ Job missing")
return
try:
job.status = "📊 parsing CSV"
db.session.commit()
filepath = os.path.join(UPLOAD_FOLDER, filename)
print(f"📁 {filepath} | {os.path.getsize(filepath)}b")
df_input = pd.read_csv(filepath, sep=';', encoding='ISO-8859-1')
print(f"📊 {df_input.shape}")
queries = []
for _, row in df_input.iterrows():
parts = [
str(row.get('PLZ', '')).strip(),
str(row.get('Stadt', '')).strip(),
str(row.get('Straße', '')).strip(),
str(row.get('Hausnummer', '')).strip(),
str(row.get('Zusatz', '')).strip(),
]
q = f"Firmen {' '.join(p for p in parts if p and p != 'nan')}".strip()
q = clean_query(q)
if len(q) > 10:
queries.append(q)
total_queries = len(queries)
print(f"🔍 {total_queries} Queries | Samples: {queries[:3]}")
if total_queries == 0:
raise ValueError("Keine gültigen Adressen")
batches = (total_queries + BATCH_SIZE - 1) // BATCH_SIZE
os.makedirs(RESULT_FOLDER, exist_ok=True)
progress = load_progress(job_id)
start_batch = progress['last_completed_batch'] + 1 if progress else 0
all_results_filtered, all_results_raw = load_partial(job_id) if progress else ([], [])
eta_initial = format_eta((batches - start_batch) * ((BATCH_DELAY_MAX + MAX_TIME) / 2) / PARALLEL_WORKERS)
print(f"📦 {batches} Batches à {BATCH_SIZE} | {PARALLEL_WORKERS}x parallel (Chunk) | Start: {start_batch} | ETA: ~{eta_initial}")
job_start_time = time.time()
job.status = f"🔄 Batch {start_batch+1}/{batches} | ⏱️ ~{eta_initial}"
db.session.commit()
completed_count = 0
batch_indices = list(range(start_batch, batches))
chunks = [
batch_indices[i:i + PARALLEL_WORKERS]
for i in range(0, len(batch_indices), PARALLEL_WORKERS)
]
with ThreadPoolExecutor(max_workers=PARALLEL_WORKERS) as executor:
for chunk_idx, chunk in enumerate(chunks):
futures = {}
for batch_idx in chunk:
batch_start_q = batch_idx * BATCH_SIZE
batch_end_q = min(batch_start_q + BATCH_SIZE, total_queries)
batch_queries = queries[batch_start_q:batch_end_q]
scraper_url = SCRAPER_URLS[batch_idx % len(SCRAPER_URLS)]
print(f"\n🚀 Chunk {chunk_idx+1} | Batch {batch_idx+1}/{batches}{scraper_url}")
time.sleep(random.uniform(1, 2))
future = executor.submit(
process_batch,
batch_idx, batch_queries, scraper_url,
filename, job_id, df_input
)
futures[future] = batch_idx
for future in as_completed(futures):
batch_idx = futures[future]
completed_count += 1
try:
df_filtered, df_raw = future.result()
if df_filtered is not None:
all_results_filtered.append(df_filtered)
all_results_raw.append(df_raw)
append_partial(job_id, df_filtered, df_raw)
except Exception as e:
print(f"💥 Batch {batch_idx+1} Exception: {e}")
save_progress(job_id, batch_idx, batches)
elapsed = time.time() - job_start_time
if completed_count > 0:
avg_per_batch = elapsed / completed_count
remaining = (batches - start_batch - completed_count) * avg_per_batch / PARALLEL_WORKERS
eta_str = format_eta(remaining)
else:
eta_str = "?"
job.status = f"🔄 {completed_count}/{batches - start_batch} fertig | ⏱️ ~{eta_str}"
db.session.commit()
if chunk_idx < len(chunks) - 1:
delay = random.uniform(BATCH_DELAY_MIN, BATCH_DELAY_MAX)
print(f"⏸️ Chunk {chunk_idx+1} fertig warte {delay:.1f}s...")
time.sleep(delay)
# ── MERGE & SAVE ──
job.status = "🔧 merging results"
db.session.commit()
base = filename.replace('.csv', '')
if all_results_filtered:
df_final_filtered = pd.concat(all_results_filtered, ignore_index=True)
df_final_filtered = df_final_filtered.drop_duplicates(subset=['title', 'address'])
out_filtered = f"results_{base}_filtered.csv"
df_final_filtered.to_csv(
os.path.join(RESULT_FOLDER, out_filtered),
index=False, encoding='utf-8-sig', sep=';'
)
out_raw = None
if all_results_raw:
df_final_raw = pd.concat(all_results_raw, ignore_index=True)
out_raw = f"results_{base}_all.csv"
df_final_raw.to_csv(
os.path.join(RESULT_FOLDER, out_raw),
index=False, encoding='utf-8-sig', sep=';'
)
job.result_filename = out_filtered
job.result_filename_raw = out_raw
job.status = f"✅ Fertig: {len(df_final_filtered)} Firmen"
cleanup_progress(job_id)
else:
job.status = "❌ Keine Ergebnisse"
db.session.commit()
print(f"🎉 Job {job_id} komplett!")
except Exception as e:
job.status = f"Failed: {str(e)[:50]}"
print(f"💥 FATAL: {e}")
import traceback
traceback.print_exc()
db.session.commit()
print(f"✅ DONE! Status: {job.status}")

View file

@ -1,21 +0,0 @@
import requests
import time
base_url = "http://localhost:5001/api/v1/jobs"
response = requests.get(base_url)
jobs = response.json() # Direkt Array
print(f"{len(jobs)} Jobs gefunden.")
deleted = 0
for job in jobs:
job_id = job["ID"]
del_res = requests.delete(f"{base_url}/{job_id}")
if del_res.status_code in [200, 204]:
print(f"{job_id}")
deleted += 1
else:
print(f"{job_id}: {del_res.status_code}")
time.sleep(0.1)
print(f"{deleted}/{len(jobs)} gelöscht.")

View file

@ -1,4 +1,4 @@
version: '3.8'
version: '3'
services:
web:
build: .
@ -6,114 +6,6 @@ services:
- "5000:5000"
environment:
- FLASK_APP=app
- FLASK_ENV=production
- PYTHONUNBUFFERED=1
command: flask run --host=0.0.0.0 --port=5000
volumes:
- ./app:/app/app
- ./uploads:/app/uploads
- ./results:/app/results
- ./instance:/app/instance
- /var/run/docker.sock:/var/run/docker.sock
depends_on:
- gmaps-scraper-1
- gmaps-scraper-2
- gmaps-scraper-3
- gmaps-scraper-4
restart: always
networks:
- scraper-net
gmaps-scraper-1:
image: gosom/google-maps-scraper:latest
container_name: gmaps-scraper-1
environment:
- PLAYWRIGHT_BROWSERS_PATH=/ms-playwright
ports:
- "5001:8080"
volumes:
- ./scraper-data-1:/gmapsdata
command:
- "-web"
- "-data-folder=/gmapsdata"
restart: always
healthcheck:
test: ["CMD-SHELL", "wget -qO- http://localhost:8080/api/v1/jobs || exit 1"]
interval: 30s
timeout: 10s
retries: 3
start_period: 15s
networks:
- scraper-net
gmaps-scraper-2:
image: gosom/google-maps-scraper:latest
container_name: gmaps-scraper-2
environment:
- PLAYWRIGHT_BROWSERS_PATH=/ms-playwright
ports:
- "5002:8080"
volumes:
- ./scraper-data-2:/gmapsdata
command:
- "-web"
- "-data-folder=/gmapsdata"
restart: always
healthcheck:
test: ["CMD-SHELL", "wget -qO- http://localhost:8080/api/v1/jobs || exit 1"]
interval: 30s
timeout: 10s
retries: 3
start_period: 15s
networks:
- scraper-net
gmaps-scraper-3:
image: gosom/google-maps-scraper:latest
container_name: gmaps-scraper-3
environment:
- PLAYWRIGHT_BROWSERS_PATH=/ms-playwright
ports:
- "5003:8080"
volumes:
- ./scraper-data-3:/gmapsdata
command:
- "-web"
- "-data-folder=/gmapsdata"
restart: always
healthcheck:
test: ["CMD-SHELL", "wget -qO- http://localhost:8080/api/v1/jobs || exit 1"]
interval: 30s
timeout: 10s
retries: 3
start_period: 15s
networks:
- scraper-net
gmaps-scraper-4:
image: gosom/google-maps-scraper:latest
container_name: gmaps-scraper-4
environment:
- PLAYWRIGHT_BROWSERS_PATH=/ms-playwright
ports:
- "5004:8080"
volumes:
- ./scraper-data-4:/gmapsdata
command:
- "-web"
- "-data-folder=/gmapsdata"
restart: always
healthcheck:
test: ["CMD-SHELL", "wget -qO- http://localhost:8080/api/v1/jobs || exit 1"]
interval: 30s
timeout: 10s
retries: 3
start_period: 15s
networks:
- scraper-net
networks:
scraper-net:
driver: bridge
- .:/app

Binary file not shown.

View file

@ -1 +0,0 @@
Single-database configuration for Flask.

View file

@ -1,50 +0,0 @@
# A generic, single database configuration.
[alembic]
# template used to generate migration files
# file_template = %%(rev)s_%%(slug)s
# set to 'true' to run the environment during
# the 'revision' command, regardless of autogenerate
# revision_environment = false
# Logging configuration
[loggers]
keys = root,sqlalchemy,alembic,flask_migrate
[handlers]
keys = console
[formatters]
keys = generic
[logger_root]
level = WARN
handlers = console
qualname =
[logger_sqlalchemy]
level = WARN
handlers =
qualname = sqlalchemy.engine
[logger_alembic]
level = INFO
handlers =
qualname = alembic
[logger_flask_migrate]
level = INFO
handlers =
qualname = flask_migrate
[handler_console]
class = StreamHandler
args = (sys.stderr,)
level = NOTSET
formatter = generic
[formatter_generic]
format = %(levelname)-5.5s [%(name)s] %(message)s
datefmt = %H:%M:%S

View file

@ -1,113 +0,0 @@
import logging
from logging.config import fileConfig
from flask import current_app
from alembic import context
# this is the Alembic Config object, which provides
# access to the values within the .ini file in use.
config = context.config
# Interpret the config file for Python logging.
# This line sets up loggers basically.
fileConfig(config.config_file_name)
logger = logging.getLogger('alembic.env')
def get_engine():
try:
# this works with Flask-SQLAlchemy<3 and Alchemical
return current_app.extensions['migrate'].db.get_engine()
except (TypeError, AttributeError):
# this works with Flask-SQLAlchemy>=3
return current_app.extensions['migrate'].db.engine
def get_engine_url():
try:
return get_engine().url.render_as_string(hide_password=False).replace(
'%', '%%')
except AttributeError:
return str(get_engine().url).replace('%', '%%')
# add your model's MetaData object here
# for 'autogenerate' support
# from myapp import mymodel
# target_metadata = mymodel.Base.metadata
config.set_main_option('sqlalchemy.url', get_engine_url())
target_db = current_app.extensions['migrate'].db
# other values from the config, defined by the needs of env.py,
# can be acquired:
# my_important_option = config.get_main_option("my_important_option")
# ... etc.
def get_metadata():
if hasattr(target_db, 'metadatas'):
return target_db.metadatas[None]
return target_db.metadata
def run_migrations_offline():
"""Run migrations in 'offline' mode.
This configures the context with just a URL
and not an Engine, though an Engine is acceptable
here as well. By skipping the Engine creation
we don't even need a DBAPI to be available.
Calls to context.execute() here emit the given string to the
script output.
"""
url = config.get_main_option("sqlalchemy.url")
context.configure(
url=url, target_metadata=get_metadata(), literal_binds=True
)
with context.begin_transaction():
context.run_migrations()
def run_migrations_online():
"""Run migrations in 'online' mode.
In this scenario we need to create an Engine
and associate a connection with the context.
"""
# this callback is used to prevent an auto-migration from being generated
# when there are no changes to the schema
# reference: http://alembic.zzzcomputing.com/en/latest/cookbook.html
def process_revision_directives(context, revision, directives):
if getattr(config.cmd_opts, 'autogenerate', False):
script = directives[0]
if script.upgrade_ops.is_empty():
directives[:] = []
logger.info('No changes in schema detected.')
conf_args = current_app.extensions['migrate'].configure_args
if conf_args.get("process_revision_directives") is None:
conf_args["process_revision_directives"] = process_revision_directives
connectable = get_engine()
with connectable.connect() as connection:
context.configure(
connection=connection,
target_metadata=get_metadata(),
**conf_args
)
with context.begin_transaction():
context.run_migrations()
if context.is_offline_mode():
run_migrations_offline()
else:
run_migrations_online()

View file

@ -1,24 +0,0 @@
"""${message}
Revision ID: ${up_revision}
Revises: ${down_revision | comma,n}
Create Date: ${create_date}
"""
from alembic import op
import sqlalchemy as sa
${imports if imports else ""}
# revision identifiers, used by Alembic.
revision = ${repr(up_revision)}
down_revision = ${repr(down_revision)}
branch_labels = ${repr(branch_labels)}
depends_on = ${repr(depends_on)}
def upgrade():
${upgrades if upgrades else "pass"}
def downgrade():
${downgrades if downgrades else "pass"}

View file

@ -1,45 +0,0 @@
"""Add is_admin column to User model
Revision ID: 10331d61a25d
Revises:
Create Date: 2024-11-14 08:36:27.125841
"""
from alembic import op
import sqlalchemy as sa
# revision identifiers, used by Alembic.
revision = '10331d61a25d'
down_revision = None
branch_labels = None
depends_on = None
def upgrade():
# ### commands auto generated by Alembic - please adjust! ###
op.drop_table('user')
op.drop_table('job')
# ### end Alembic commands ###
def downgrade():
# ### commands auto generated by Alembic - please adjust! ###
op.create_table('job',
sa.Column('id', sa.INTEGER(), nullable=False),
sa.Column('user_id', sa.INTEGER(), nullable=False),
sa.Column('filename', sa.VARCHAR(length=150), nullable=False),
sa.Column('status', sa.VARCHAR(length=50), nullable=True),
sa.Column('created_at', sa.DATETIME(), nullable=True),
sa.Column('result_filename', sa.VARCHAR(length=150), nullable=True),
sa.ForeignKeyConstraint(['user_id'], ['user.id'], ),
sa.PrimaryKeyConstraint('id')
)
op.create_table('user',
sa.Column('id', sa.INTEGER(), nullable=False),
sa.Column('username', sa.VARCHAR(length=150), nullable=False),
sa.Column('password', sa.VARCHAR(length=150), nullable=False),
sa.PrimaryKeyConstraint('id'),
sa.UniqueConstraint('username')
)
# ### end Alembic commands ###

View file

@ -1,8 +1,7 @@
flask
flask-sqlalchemy
flask-login
flask-migrate
Flask==2.2.5
Flask-Login==0.6.2
Flask-SQLAlchemy==3.0.3
Werkzeug==2.2.2
pandas
requests
werkzeug
docker
beautifulsoup4