Compare commits
No commits in common. "bb06a072b0a65b11b67c7eeb92394757065902bd" and "008e2bc274594179d5f0aeb651dff7e9166c1205" have entirely different histories.
bb06a072b0
...
008e2bc274
34 changed files with 243 additions and 3318 deletions
163
.gitignore
vendored
163
.gitignore
vendored
|
|
@ -1,163 +0,0 @@
|
|||
### Flask ###
|
||||
instance/*
|
||||
!instance/.gitignore
|
||||
.webassets-cache
|
||||
.env
|
||||
|
||||
### Flask.Python Stack ###
|
||||
# Byte-compiled / optimized / DLL files
|
||||
__pycache__/
|
||||
*.py[cod]
|
||||
*$py.class
|
||||
|
||||
# C extensions
|
||||
*.so
|
||||
|
||||
# Distribution / packaging
|
||||
.Python
|
||||
build/
|
||||
develop-eggs/
|
||||
dist/
|
||||
downloads/
|
||||
eggs/
|
||||
.eggs/
|
||||
lib/
|
||||
lib64/
|
||||
parts/
|
||||
sdist/
|
||||
var/
|
||||
wheels/
|
||||
share/python-wheels/
|
||||
*.egg-info/
|
||||
.installed.cfg
|
||||
*.egg
|
||||
MANIFEST
|
||||
|
||||
# PyInstaller
|
||||
# Usually these files are written by a python script from a template
|
||||
# before PyInstaller builds the exe, so as to inject date/other infos into it.
|
||||
*.manifest
|
||||
*.spec
|
||||
|
||||
# Installer logs
|
||||
pip-log.txt
|
||||
pip-delete-this-directory.txt
|
||||
|
||||
# Unit test / coverage reports
|
||||
htmlcov/
|
||||
.tox/
|
||||
.nox/
|
||||
.coverage
|
||||
.coverage.*
|
||||
.cache
|
||||
nosetests.xml
|
||||
coverage.xml
|
||||
*.cover
|
||||
*.py,cover
|
||||
.hypothesis/
|
||||
.pytest_cache/
|
||||
cover/
|
||||
|
||||
# Translations
|
||||
*.mo
|
||||
*.pot
|
||||
|
||||
# Django stuff:
|
||||
*.log
|
||||
local_settings.py
|
||||
db.sqlite3
|
||||
db.sqlite3-journal
|
||||
|
||||
# Flask stuff:
|
||||
instance/
|
||||
|
||||
# Scrapy stuff:
|
||||
.scrapy
|
||||
|
||||
# Sphinx documentation
|
||||
docs/_build/
|
||||
|
||||
# PyBuilder
|
||||
.pybuilder/
|
||||
target/
|
||||
|
||||
# Jupyter Notebook
|
||||
.ipynb_checkpoints
|
||||
|
||||
# IPython
|
||||
profile_default/
|
||||
ipython_config.py
|
||||
|
||||
# pyenv
|
||||
# For a library or package, you might want to ignore these files since the code is
|
||||
# intended to run in multiple environments; otherwise, check them in:
|
||||
# .python-version
|
||||
|
||||
# pipenv
|
||||
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
|
||||
# However, in case of collaboration, if having platform-specific dependencies or dependencies
|
||||
# having no cross-platform support, pipenv may install dependencies that don't work, or not
|
||||
# install all needed dependencies.
|
||||
#Pipfile.lock
|
||||
|
||||
# poetry
|
||||
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
|
||||
# This is especially recommended for binary packages to ensure reproducibility, and is more
|
||||
# commonly ignored for libraries.
|
||||
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
|
||||
#poetry.lock
|
||||
|
||||
# pdm
|
||||
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
|
||||
#pdm.lock
|
||||
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
|
||||
# in version control.
|
||||
# https://pdm.fming.dev/#use-with-ide
|
||||
.pdm.toml
|
||||
|
||||
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
|
||||
__pypackages__/
|
||||
|
||||
# Celery stuff
|
||||
celerybeat-schedule
|
||||
celerybeat.pid
|
||||
|
||||
# SageMath parsed files
|
||||
*.sage.py
|
||||
|
||||
# Environments
|
||||
.venv
|
||||
env/
|
||||
venv/
|
||||
ENV/
|
||||
env.bak/
|
||||
venv.bak/
|
||||
|
||||
# Spyder project settings
|
||||
.spyderproject
|
||||
.spyproject
|
||||
|
||||
# Rope project settings
|
||||
.ropeproject
|
||||
|
||||
# mkdocs documentation
|
||||
/site
|
||||
|
||||
# mypy
|
||||
.mypy_cache/
|
||||
.dmypy.json
|
||||
dmypy.json
|
||||
|
||||
# Pyre type checker
|
||||
.pyre/
|
||||
|
||||
# pytype static type analyzer
|
||||
.pytype/
|
||||
|
||||
# Cython debug symbols
|
||||
cython_debug/
|
||||
|
||||
# Uploads / Results
|
||||
uploads/
|
||||
results/
|
||||
scraper-data-*/
|
||||
|
|
@ -7,14 +7,12 @@ WORKDIR /app
|
|||
# Abhängigkeiten installieren
|
||||
COPY requirements.txt requirements.txt
|
||||
RUN pip install --no-cache-dir -r requirements.txt
|
||||
RUN apt update
|
||||
RUN apt install curl -y
|
||||
|
||||
# App-Dateien kopieren
|
||||
COPY . .
|
||||
|
||||
# Flask Umgebungsvariable setzen
|
||||
ENV FLASK_APP=app
|
||||
ENV FLASK_ENV=production
|
||||
|
||||
EXPOSE 5000
|
||||
# Flask starten
|
||||
CMD ["flask", "run", "--host=0.0.0.0", "--port=5000"]
|
||||
|
|
|
|||
|
|
@ -1,88 +1,52 @@
|
|||
import os
|
||||
from flask import Flask, redirect, url_for, request, current_app
|
||||
from flask import Flask, redirect, url_for, request
|
||||
from flask_sqlalchemy import SQLAlchemy
|
||||
from flask_login import LoginManager, current_user
|
||||
from flask_migrate import Migrate
|
||||
from sqlalchemy import text
|
||||
from .models import db, User
|
||||
|
||||
# ✅ Docker-Pfade
|
||||
# Konfiguration für Upload- und Ergebnis-Ordner
|
||||
UPLOAD_FOLDER = '/app/uploads'
|
||||
RESULT_FOLDER = '/app/results'
|
||||
|
||||
db = SQLAlchemy()
|
||||
login_manager = LoginManager()
|
||||
migrate = Migrate()
|
||||
|
||||
|
||||
def _run_migrations(app):
|
||||
"""Fehlende DB-Spalten automatisch hinzufügen – übersteht jeden Neustart"""
|
||||
migrations = [
|
||||
("job", "result_filename_raw", "VARCHAR(150)"),
|
||||
("job", "scraper_job_id", "VARCHAR(255)"),
|
||||
("user", "is_admin", "BOOLEAN DEFAULT 0"),
|
||||
]
|
||||
with app.app_context():
|
||||
for table, column, col_type in migrations:
|
||||
try:
|
||||
db.session.execute(text(f"ALTER TABLE {table} ADD COLUMN {column} {col_type}"))
|
||||
db.session.commit()
|
||||
print(f"✅ Migration: {table}.{column} hinzugefügt")
|
||||
except Exception:
|
||||
db.session.rollback()
|
||||
|
||||
|
||||
def create_app():
|
||||
app = Flask(__name__)
|
||||
|
||||
# 🔑 Configs
|
||||
app.config['SECRET_KEY'] = '008e7369b075886d5f494c8813efdfb17155da6af12b3fe8ee'
|
||||
app.config['SQLALCHEMY_DATABASE_URI'] = 'sqlite:///users.db'
|
||||
app.config['SQLALCHEMY_TRACK_MODIFICATIONS'] = False
|
||||
app.config['UPLOAD_FOLDER'] = UPLOAD_FOLDER
|
||||
app.config['RESULT_FOLDER'] = RESULT_FOLDER
|
||||
app.config['ALLOW_USER_SIGNUP'] = False
|
||||
|
||||
# DB + Tools
|
||||
db.init_app(app)
|
||||
migrate.init_app(app, db)
|
||||
login_manager.init_app(app)
|
||||
login_manager.login_view = 'auth.login'
|
||||
|
||||
# User Loader
|
||||
# Flask-Login Setup
|
||||
login_manager = LoginManager()
|
||||
login_manager.login_view = 'auth.login'
|
||||
login_manager.init_app(app)
|
||||
|
||||
@login_manager.user_loader
|
||||
def load_user(user_id):
|
||||
from .models import User
|
||||
return User.query.get(int(user_id))
|
||||
|
||||
# Protected Routes
|
||||
# Umleitung nicht authentifizierter Benutzer, statische Dateien und bestimmte Routen ausnehmen
|
||||
@app.before_request
|
||||
def require_login():
|
||||
allowed = ['auth.login', 'auth.signup', 'static']
|
||||
if (not current_user.is_authenticated and
|
||||
request.endpoint not in allowed and
|
||||
not request.path.startswith('/static')):
|
||||
allowed_routes = ['auth.login', 'auth.signup']
|
||||
|
||||
# Prüfen, ob der Benutzer authentifiziert ist oder eine erlaubte Route anfragt
|
||||
if (not current_user.is_authenticated
|
||||
and request.endpoint not in allowed_routes
|
||||
and not request.path.startswith('/static/')):
|
||||
return redirect(url_for('auth.login'))
|
||||
|
||||
# Ordner
|
||||
# Erstellen Sie die Ordner, falls sie noch nicht existieren
|
||||
os.makedirs(app.config['UPLOAD_FOLDER'], exist_ok=True)
|
||||
os.makedirs(app.config['RESULT_FOLDER'], exist_ok=True)
|
||||
|
||||
# Routes
|
||||
# Registrieren der Routen
|
||||
from . import routes
|
||||
app.register_blueprint(routes.bp)
|
||||
|
||||
# Index Redirect
|
||||
@app.route('/')
|
||||
def index():
|
||||
return redirect(url_for('auth.job_status'))
|
||||
|
||||
# DB Tables + Auto-Migration
|
||||
# Erstellen der Tabellen in der Datenbank
|
||||
with app.app_context():
|
||||
db.create_all()
|
||||
_run_migrations(app)
|
||||
|
||||
return app
|
||||
|
||||
if __name__ == '__main__':
|
||||
app = create_app()
|
||||
app.run(host='0.0.0.0', port=5000, debug=False)
|
||||
|
|
|
|||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
|
|
@ -1,68 +0,0 @@
|
|||
import os
|
||||
from flask import Flask, redirect, url_for, request, current_app
|
||||
from flask_sqlalchemy import SQLAlchemy
|
||||
from flask_login import LoginManager, current_user
|
||||
from flask_migrate import Migrate
|
||||
|
||||
# ✅ Docker-Pfade
|
||||
UPLOAD_FOLDER = '/app/uploads'
|
||||
RESULT_FOLDER = '/app/results'
|
||||
|
||||
db = SQLAlchemy()
|
||||
login_manager = LoginManager()
|
||||
migrate = Migrate()
|
||||
|
||||
def create_app():
|
||||
app = Flask(__name__)
|
||||
|
||||
# 🔑 Configs
|
||||
app.config['SECRET_KEY'] = '008e7369b075886d5f494c8813efdfb17155da6af12b3fe8ee'
|
||||
app.config['SQLALCHEMY_DATABASE_URI'] = 'sqlite:///users.db'
|
||||
app.config['SQLALCHEMY_TRACK_MODIFICATIONS'] = False
|
||||
app.config['UPLOAD_FOLDER'] = UPLOAD_FOLDER
|
||||
app.config['RESULT_FOLDER'] = RESULT_FOLDER
|
||||
app.config['ALLOW_USER_SIGNUP'] = True # ✅ Aktiviert!
|
||||
|
||||
# DB + Tools
|
||||
db.init_app(app)
|
||||
migrate.init_app(app, db)
|
||||
login_manager.init_app(app)
|
||||
login_manager.login_view = 'auth.login'
|
||||
|
||||
# User Loader
|
||||
@login_manager.user_loader
|
||||
def load_user(user_id):
|
||||
from .models import User
|
||||
return User.query.get(int(user_id))
|
||||
|
||||
# Protected Routes
|
||||
@app.before_request
|
||||
def require_login():
|
||||
allowed = ['auth.login', 'auth.signup', 'static']
|
||||
if (not current_user.is_authenticated and
|
||||
request.endpoint not in allowed and
|
||||
not request.path.startswith('/static')):
|
||||
return redirect(url_for('auth.login'))
|
||||
|
||||
# Ordner
|
||||
os.makedirs(app.config['UPLOAD_FOLDER'], exist_ok=True)
|
||||
os.makedirs(app.config['RESULT_FOLDER'], exist_ok=True)
|
||||
|
||||
# Routes
|
||||
from . import routes
|
||||
app.register_blueprint(routes.bp)
|
||||
|
||||
# Index Redirect
|
||||
@app.route('/')
|
||||
def index():
|
||||
return redirect(url_for('auth.job_status'))
|
||||
|
||||
# DB Tables
|
||||
with app.app_context():
|
||||
db.create_all()
|
||||
|
||||
return app
|
||||
|
||||
if __name__ == '__main__':
|
||||
app = create_app()
|
||||
app.run(host='0.0.0.0', port=5000, debug=False)
|
||||
|
|
@ -1,25 +1,20 @@
|
|||
from flask_sqlalchemy import SQLAlchemy
|
||||
from flask_login import UserMixin
|
||||
from datetime import datetime
|
||||
from . import db
|
||||
|
||||
db = SQLAlchemy()
|
||||
|
||||
class User(UserMixin, db.Model):
|
||||
id = db.Column(db.Integer, primary_key=True)
|
||||
username = db.Column(db.String(150), unique=True, nullable=False)
|
||||
password = db.Column(db.String(150), nullable=False)
|
||||
is_admin = db.Column(db.Boolean, default=False)
|
||||
|
||||
class Job(db.Model):
|
||||
id = db.Column(db.Integer, primary_key=True)
|
||||
user_id = db.Column(db.Integer, db.ForeignKey('user.id'), nullable=False)
|
||||
filename = db.Column(db.String(150), nullable=False)
|
||||
status = db.Column(db.String(50), default="Pending")
|
||||
status = db.Column(db.String(50), default="Pending") # Status: Pending, In Progress, Completed
|
||||
created_at = db.Column(db.DateTime, default=datetime.utcnow)
|
||||
result_filename = db.Column(db.String(150), nullable=True)
|
||||
result_filename_raw = db.Column(db.String(150), nullable=True)
|
||||
|
||||
|
||||
user = db.relationship('User', backref=db.backref('jobs', lazy=True))
|
||||
|
||||
class AppConfig(db.Model):
|
||||
id = db.Column(db.Integer, primary_key=True)
|
||||
key = db.Column(db.String(100), unique=True, nullable=False)
|
||||
value = db.Column(db.String(100), nullable=False, default='false')
|
||||
|
|
|
|||
223
app/routes.orig
223
app/routes.orig
|
|
@ -1,223 +0,0 @@
|
|||
import time
|
||||
import csv
|
||||
import os
|
||||
import threading
|
||||
from flask import Blueprint, request, redirect, url_for, flash, render_template, send_file, current_app
|
||||
from flask_login import login_user, logout_user, login_required, current_user
|
||||
from werkzeug.utils import secure_filename
|
||||
from werkzeug.security import generate_password_hash, check_password_hash
|
||||
from .models import db, User, Job
|
||||
from .webcrawler import process_file # Importiere die Funktion für das Webscraping
|
||||
|
||||
UPLOAD_FOLDER = 'uploads'
|
||||
RESULT_FOLDER = 'results'
|
||||
|
||||
# Blueprint für auth erstellen
|
||||
bp = Blueprint('auth', __name__)
|
||||
|
||||
@bp.route('/login', methods=['GET', 'POST'])
|
||||
def login():
|
||||
if request.method == 'POST':
|
||||
username = request.form['username']
|
||||
password = request.form['password']
|
||||
user = User.query.filter_by(username=username).first()
|
||||
if user and check_password_hash(user.password, password):
|
||||
login_user(user)
|
||||
return redirect(url_for('auth.job_status'))
|
||||
flash('Login fehlgeschlagen. Überprüfen Sie Benutzername und Passwort.')
|
||||
return render_template('login.html')
|
||||
|
||||
@bp.route('/signup', methods=['GET', 'POST'])
|
||||
def signup():
|
||||
if not current_app.config['ALLOW_USER_SIGNUP']:
|
||||
flash("Registrierung ist derzeit deaktiviert.")
|
||||
return redirect(url_for('auth.login'))
|
||||
|
||||
if request.method == 'POST':
|
||||
username = request.form['username']
|
||||
password = generate_password_hash(request.form['password'], method='sha256')
|
||||
new_user = User(username=username, password=password)
|
||||
db.session.add(new_user)
|
||||
db.session.commit()
|
||||
flash('Benutzer erfolgreich erstellt! Sie können sich jetzt einloggen.')
|
||||
return redirect(url_for('auth.login'))
|
||||
|
||||
return render_template('signup.html')
|
||||
|
||||
@bp.route('/logout')
|
||||
@login_required
|
||||
def logout():
|
||||
logout_user()
|
||||
return redirect(url_for('auth.login'))
|
||||
|
||||
@bp.route('/jobs')
|
||||
@login_required
|
||||
def job_status():
|
||||
jobs = Job.query.filter_by(user_id=current_user.id).all()
|
||||
return render_template('jobs.html', jobs=jobs)
|
||||
|
||||
@bp.route('/upload', methods=['GET', 'POST'])
|
||||
@login_required
|
||||
def upload():
|
||||
if request.method == 'POST':
|
||||
file = request.files['file']
|
||||
filename = secure_filename(file.filename)
|
||||
|
||||
# Überprüfen, ob eine Datei mit dem gleichen Namen bereits existiert
|
||||
file_path = os.path.join(current_app.config['UPLOAD_FOLDER'], filename)
|
||||
if os.path.exists(file_path):
|
||||
# Wenn eine Datei mit dem gleichen Namen existiert, einen Zeitstempel hinzufügen
|
||||
name, ext = os.path.splitext(filename)
|
||||
timestamp = time.strftime("%Y%m%d-%H%M%S") # Zeitstempel im Format JahrMonatTag-StundenMinutenSekunden
|
||||
filename = f"{name}_{timestamp}{ext}"
|
||||
file_path = os.path.join(current_app.config['UPLOAD_FOLDER'], filename)
|
||||
flash(f"Eine Datei mit gleichem Namen existierte bereits. Die Datei wurde als '{filename}' gespeichert.")
|
||||
|
||||
# Speichern der Datei
|
||||
file.save(file_path)
|
||||
flash('Datei erfolgreich hochgeladen und Job gestartet')
|
||||
|
||||
# Neuen Job erstellen
|
||||
new_job = Job(user_id=current_user.id, filename=filename, status="Pending")
|
||||
db.session.add(new_job)
|
||||
db.session.commit()
|
||||
|
||||
# Debugging-Ausgabe zur Überprüfung der Thread-Erstellung
|
||||
print(f"Starte Scraping-Thread für Job-ID: {new_job.id}")
|
||||
|
||||
# Starten des Scraping im Hintergrund-Thread und Übergeben des aktuellen Anwendungskontexts
|
||||
thread = threading.Thread(target=process_file, args=(filename, new_job.id, current_app._get_current_object()))
|
||||
thread.start()
|
||||
|
||||
# Debugging-Ausgabe, nachdem der Thread gestartet wurde
|
||||
print(f"Thread für Job {new_job.id} erfolgreich gestartet.")
|
||||
|
||||
return redirect(url_for('auth.job_status'))
|
||||
|
||||
return render_template('upload.html')
|
||||
|
||||
@bp.route('/download/<int:job_id>', methods=['GET'])
|
||||
@login_required
|
||||
def download_result(job_id):
|
||||
job = Job.query.get_or_404(job_id)
|
||||
print(f"Job ID: {job.id} - User ID: {job.user_id} - Current User ID: {current_user.id}")
|
||||
|
||||
# Überprüfen, ob der Job dem aktuellen Benutzer gehört
|
||||
if job.user_id != current_user.id:
|
||||
flash("Sie haben keine Berechtigung, dieses Ergebnis herunterzuladen.")
|
||||
return redirect(url_for('auth.job_status'))
|
||||
|
||||
# Überprüfen, ob das Ergebnis vorhanden ist
|
||||
if not job.result_filename:
|
||||
flash("Das Ergebnis ist noch nicht verfügbar.")
|
||||
return redirect(url_for('auth.job_status'))
|
||||
|
||||
# Überprüfen, ob die Datei im angegebenen Pfad existiert
|
||||
result_path = os.path.join(current_app.config['RESULT_FOLDER'], job.result_filename)
|
||||
print(f"Versuche, Datei herunterzuladen von: {result_path}")
|
||||
|
||||
if os.path.exists(result_path):
|
||||
print("Datei existiert und wird zum Download bereitgestellt.")
|
||||
return send_file(result_path, as_attachment=True)
|
||||
else:
|
||||
print("Datei nicht gefunden. Ergebnisverzeichnis oder Pfad prüfen.")
|
||||
flash("Ergebnisdatei nicht gefunden.")
|
||||
return redirect(url_for('auth.job_status'))
|
||||
|
||||
|
||||
@bp.route('/delete_job/<int:job_id>', methods=['POST'])
|
||||
@login_required
|
||||
def delete_job(job_id):
|
||||
job = Job.query.get_or_404(job_id)
|
||||
if job.user_id != current_user.id:
|
||||
flash("Sie haben keine Berechtigung, diesen Job zu löschen.")
|
||||
return redirect(url_for('auth.job_status'))
|
||||
|
||||
# Löschen der Upload-Datei
|
||||
upload_path = os.path.join(current_app.config['UPLOAD_FOLDER'], job.filename)
|
||||
if os.path.exists(upload_path):
|
||||
os.remove(upload_path)
|
||||
print(f"Upload-Datei gelöscht: {upload_path}")
|
||||
else:
|
||||
print(f"Upload-Datei nicht gefunden: {upload_path}")
|
||||
|
||||
# Löschen der Results-Datei, falls vorhanden
|
||||
if job.result_filename:
|
||||
result_path = os.path.join(current_app.config['RESULT_FOLDER'], job.result_filename)
|
||||
print(f"Versuche Ergebnisdatei zu löschen: {result_path}")
|
||||
|
||||
if os.path.exists(result_path):
|
||||
try:
|
||||
os.remove(result_path)
|
||||
print(f"Ergebnisdatei gelöscht: {result_path}")
|
||||
except Exception as e:
|
||||
print(f"Fehler beim Löschen der Ergebnisdatei: {e}")
|
||||
else:
|
||||
print(f"Ergebnisdatei nicht gefunden im Pfad: {result_path}")
|
||||
|
||||
# Job aus der Datenbank löschen
|
||||
db.session.delete(job)
|
||||
db.session.commit()
|
||||
flash("Job erfolgreich gelöscht.")
|
||||
return redirect(url_for('auth.job_status'))
|
||||
|
||||
@bp.route('/admin', methods=['GET'])
|
||||
@login_required
|
||||
def admin_panel():
|
||||
if not current_user.is_admin:
|
||||
flash("Keine Berechtigung.")
|
||||
return redirect(url_for('auth.job_status'))
|
||||
|
||||
users = User.query.all()
|
||||
return render_template('admin_panel.html', users=users)
|
||||
|
||||
@bp.route('/admin/create_user', methods=['POST'])
|
||||
@login_required
|
||||
def create_user():
|
||||
if not current_user.is_admin:
|
||||
flash("Keine Berechtigung.")
|
||||
return redirect(url_for('auth.admin_panel'))
|
||||
|
||||
username = request.form['username']
|
||||
password = request.form['password']
|
||||
is_admin = 'is_admin' in request.form # Checkbox für Adminrechte
|
||||
|
||||
hashed_password = generate_password_hash(password, method='sha256')
|
||||
new_user = User(username=username, password=hashed_password, is_admin=is_admin)
|
||||
db.session.add(new_user)
|
||||
db.session.commit()
|
||||
|
||||
flash(f"Benutzer {username} wurde erstellt.")
|
||||
return redirect(url_for('auth.admin_panel'))
|
||||
|
||||
@bp.route('/admin/reset_password/<int:user_id>', methods=['POST'])
|
||||
@login_required
|
||||
def reset_password(user_id):
|
||||
if not current_user.is_admin:
|
||||
flash("Keine Berechtigung.")
|
||||
return redirect(url_for('auth.admin_panel'))
|
||||
|
||||
user = User.query.get_or_404(user_id)
|
||||
new_password = request.form['new_password']
|
||||
user.password = generate_password_hash(new_password, method='sha256')
|
||||
db.session.commit()
|
||||
|
||||
flash(f"Passwort für Benutzer {user.username} wurde zurückgesetzt.")
|
||||
return redirect(url_for('auth.admin_panel'))
|
||||
|
||||
@bp.route('/admin/delete_user/<int:user_id>', methods=['POST'])
|
||||
@login_required
|
||||
def delete_user(user_id):
|
||||
if not current_user.is_admin:
|
||||
flash("Keine Berechtigung.")
|
||||
return redirect(url_for('auth.admin_panel'))
|
||||
|
||||
user = User.query.get_or_404(user_id)
|
||||
if user.is_admin:
|
||||
flash("Administratoren können nicht gelöscht werden.")
|
||||
return redirect(url_for('auth.admin_panel'))
|
||||
|
||||
db.session.delete(user)
|
||||
db.session.commit()
|
||||
flash(f"Benutzer {user.username} wurde gelöscht.")
|
||||
return redirect(url_for('auth.admin_panel'))
|
||||
233
app/routes.py
233
app/routes.py
|
|
@ -1,16 +1,17 @@
|
|||
import time
|
||||
import csv
|
||||
import os
|
||||
import threading
|
||||
from flask import Blueprint, request, redirect, url_for, flash, render_template, send_file, jsonify, current_app
|
||||
from flask import Blueprint, request, redirect, url_for, flash, render_template, send_file, current_app
|
||||
from flask_login import login_user, logout_user, login_required, current_user
|
||||
from werkzeug.utils import secure_filename
|
||||
from werkzeug.security import generate_password_hash, check_password_hash
|
||||
from .models import db, User, Job, AppConfig
|
||||
from .webcrawler import process_file
|
||||
from .models import db, User, Job
|
||||
from .webcrawler import process_file # Importiere die Funktion für das Webscraping
|
||||
|
||||
UPLOAD_FOLDER = '/app/uploads'
|
||||
RESULT_FOLDER = '/app/results'
|
||||
UPLOAD_FOLDER = 'uploads'
|
||||
RESULT_FOLDER = os.path.join(os.path.abspath(os.path.dirname(__file__)), 'results')
|
||||
|
||||
# Blueprint für auth erstellen
|
||||
bp = Blueprint('auth', __name__)
|
||||
|
||||
@bp.route('/login', methods=['GET', 'POST'])
|
||||
|
|
@ -27,18 +28,13 @@ def login():
|
|||
|
||||
@bp.route('/signup', methods=['GET', 'POST'])
|
||||
def signup():
|
||||
cfg = AppConfig.query.filter_by(key='allow_signup').first()
|
||||
if not cfg or cfg.value != 'true':
|
||||
flash("Registrierung ist derzeit deaktiviert.")
|
||||
return redirect(url_for('auth.login'))
|
||||
|
||||
if request.method == 'POST':
|
||||
username = request.form['username']
|
||||
password = generate_password_hash(request.form['password']) # ✅ Fix
|
||||
password = generate_password_hash(request.form['password'], method='sha256')
|
||||
new_user = User(username=username, password=password)
|
||||
db.session.add(new_user)
|
||||
db.session.commit()
|
||||
flash('Benutzer erfolgreich erstellt!')
|
||||
flash('Benutzer erfolgreich erstellt! Sie können sich jetzt einloggen.')
|
||||
return redirect(url_for('auth.login'))
|
||||
return render_template('signup.html')
|
||||
|
||||
|
|
@ -51,203 +47,102 @@ def logout():
|
|||
@bp.route('/jobs')
|
||||
@login_required
|
||||
def job_status():
|
||||
jobs = Job.query.filter_by(user_id=current_user.id).order_by(Job.created_at.desc()).all()
|
||||
jobs = Job.query.filter_by(user_id=current_user.id).all()
|
||||
return render_template('jobs.html', jobs=jobs)
|
||||
|
||||
# Hochladen und Verarbeiten der Datei im Hintergrund
|
||||
@bp.route('/upload', methods=['GET', 'POST'])
|
||||
@login_required
|
||||
def upload():
|
||||
if request.method == 'POST':
|
||||
if 'file' not in request.files:
|
||||
flash('Keine Datei ausgewählt.')
|
||||
return redirect(url_for('auth.upload'))
|
||||
|
||||
file = request.files['file']
|
||||
if not file or file.filename == '':
|
||||
flash('Keine gültige Datei.')
|
||||
filename = secure_filename(file.filename)
|
||||
if not filename.endswith('.csv'):
|
||||
flash('Bitte eine CSV-Datei hochladen')
|
||||
return redirect(url_for('auth.upload'))
|
||||
|
||||
filename = secure_filename(file.filename)
|
||||
name, ext = os.path.splitext(filename)
|
||||
file_path = os.path.join(UPLOAD_FOLDER, filename)
|
||||
file.save(file_path)
|
||||
flash('Datei erfolgreich hochgeladen und Job gestartet')
|
||||
|
||||
timestamp = time.strftime("%Y%m%d_%H%M%S")
|
||||
unique_filename = f"{name}_{timestamp}{ext}" if os.path.exists(os.path.join(UPLOAD_FOLDER, filename)) else filename
|
||||
|
||||
filepath = os.path.join(UPLOAD_FOLDER, unique_filename)
|
||||
file.save(filepath)
|
||||
print(f"💾 UPLOAD: {filepath}")
|
||||
|
||||
new_job = Job(
|
||||
user_id=current_user.id,
|
||||
filename=unique_filename,
|
||||
status="Pending"
|
||||
)
|
||||
# Neuen Job erstellen
|
||||
new_job = Job(user_id=current_user.id, filename=filename, status="Pending")
|
||||
db.session.add(new_job)
|
||||
db.session.commit()
|
||||
print(f"🆕 JOB #{new_job.id} für User {current_user.id}")
|
||||
|
||||
thread = threading.Thread(
|
||||
target=process_file,
|
||||
args=(unique_filename, new_job.id, current_app._get_current_object())
|
||||
)
|
||||
thread.daemon = True
|
||||
# Debugging-Ausgabe zur Überprüfung der Thread-Erstellung
|
||||
print(f"Starte Scraping-Thread für Job-ID: {new_job.id}")
|
||||
|
||||
# Starten des Scraping im Hintergrund-Thread und Übergeben des aktuellen Anwendungskontexts
|
||||
thread = threading.Thread(target=process_file, args=(filename, new_job.id, current_app._get_current_object()))
|
||||
thread.start()
|
||||
print(f"🔄 THREAD STARTED Job {new_job.id}")
|
||||
|
||||
# Debugging-Ausgabe, nachdem der Thread gestartet wurde
|
||||
print(f"Thread für Job {new_job.id} erfolgreich gestartet.")
|
||||
|
||||
flash(f'"{unique_filename}" → Job #{new_job.id} läuft!')
|
||||
return redirect(url_for('auth.job_status'))
|
||||
|
||||
return render_template('upload.html')
|
||||
|
||||
@bp.route('/download/<int:job_id>')
|
||||
@bp.route('/download/<int:job_id>', methods=['GET'])
|
||||
@login_required
|
||||
def download_result(job_id):
|
||||
job = Job.query.filter_by(id=job_id, user_id=current_user.id).first_or_404()
|
||||
job = Job.query.get_or_404(job_id)
|
||||
print(f"Job ID: {job.id} - User ID: {job.user_id} - Current User ID: {current_user.id}")
|
||||
|
||||
if not job.result_filename or not job.status.startswith('✅'):
|
||||
flash('Ergebnis nicht bereit.')
|
||||
# Überprüfen, ob der Job dem aktuellen Benutzer gehört
|
||||
if job.user_id != current_user.id:
|
||||
flash("Sie haben keine Berechtigung, dieses Ergebnis herunterzuladen.")
|
||||
return redirect(url_for('auth.job_status'))
|
||||
|
||||
result_path = os.path.join(RESULT_FOLDER, job.result_filename)
|
||||
if os.path.exists(result_path):
|
||||
return send_file(result_path, as_attachment=True)
|
||||
flash('Datei fehlt.')
|
||||
return redirect(url_for('auth.job_status'))
|
||||
|
||||
@bp.route('/download_raw/<int:job_id>')
|
||||
@login_required
|
||||
def download_result_raw(job_id):
|
||||
job = Job.query.filter_by(id=job_id, user_id=current_user.id).first_or_404()
|
||||
|
||||
if not job.result_filename_raw:
|
||||
flash('Rohdaten nicht verfügbar.')
|
||||
# Überprüfen, ob das Ergebnis vorhanden ist
|
||||
if not job.result_filename:
|
||||
flash("Das Ergebnis ist noch nicht verfügbar.")
|
||||
return redirect(url_for('auth.job_status'))
|
||||
|
||||
result_path = os.path.join(RESULT_FOLDER, job.result_filename_raw)
|
||||
# Überprüfen, ob die Datei im angegebenen Pfad existiert
|
||||
result_path = os.path.join(current_app.config['RESULT_FOLDER'], job.result_filename)
|
||||
print(f"Versuche, Datei herunterzuladen von: {result_path}")
|
||||
|
||||
if os.path.exists(result_path):
|
||||
print("Datei existiert und wird zum Download bereitgestellt.")
|
||||
return send_file(result_path, as_attachment=True)
|
||||
flash('Datei fehlt.')
|
||||
return redirect(url_for('auth.job_status'))
|
||||
else:
|
||||
print("Datei nicht gefunden. Ergebnisverzeichnis oder Pfad prüfen.")
|
||||
flash("Ergebnisdatei nicht gefunden.")
|
||||
return redirect(url_for('auth.job_status'))
|
||||
|
||||
|
||||
@bp.route('/delete_job/<int:job_id>', methods=['POST'])
|
||||
@login_required
|
||||
def delete_job(job_id):
|
||||
job = Job.query.filter_by(id=job_id, user_id=current_user.id).first_or_404()
|
||||
job = Job.query.get_or_404(job_id)
|
||||
if job.user_id != current_user.id:
|
||||
flash("Sie haben keine Berechtigung, diesen Job zu löschen.")
|
||||
return redirect(url_for('auth.job_status'))
|
||||
|
||||
# Löschen der Upload-Datei
|
||||
upload_path = os.path.join(UPLOAD_FOLDER, job.filename)
|
||||
if os.path.exists(upload_path):
|
||||
os.remove(upload_path)
|
||||
print(f"Upload-Datei gelöscht: {upload_path}")
|
||||
else:
|
||||
print(f"Upload-Datei nicht gefunden: {upload_path}")
|
||||
|
||||
# Löschen der Results-Datei, falls vorhanden
|
||||
if job.result_filename:
|
||||
result_path = os.path.join(RESULT_FOLDER, job.result_filename)
|
||||
if os.path.exists(result_path):
|
||||
os.remove(result_path)
|
||||
|
||||
if job.result_filename_raw: # ✅ Raw auch löschen
|
||||
raw_path = os.path.join(RESULT_FOLDER, job.result_filename_raw)
|
||||
if os.path.exists(raw_path):
|
||||
os.remove(raw_path)
|
||||
try:
|
||||
os.remove(result_path)
|
||||
print(f"Ergebnisdatei gelöscht: {result_path}")
|
||||
except Exception as e:
|
||||
print(f"Fehler beim Löschen der Ergebnisdatei: {e}")
|
||||
else:
|
||||
print(f"Ergebnisdatei nicht gefunden: {result_path}")
|
||||
|
||||
# Job aus der Datenbank löschen
|
||||
db.session.delete(job)
|
||||
db.session.commit()
|
||||
flash('Job gelöscht.')
|
||||
flash("Job erfolgreich gelöscht.")
|
||||
return redirect(url_for('auth.job_status'))
|
||||
|
||||
@bp.route('/job_status/<int:job_id>')
|
||||
@login_required
|
||||
def job_status_api(job_id):
|
||||
job = Job.query.filter_by(id=job_id, user_id=current_user.id).first()
|
||||
if not job:
|
||||
return jsonify({'error': 'Not found'}), 404
|
||||
return jsonify({
|
||||
'id': job.id,
|
||||
'status': job.status,
|
||||
'result_filename': job.result_filename,
|
||||
'result_filename_raw': getattr(job, 'result_filename_raw', None),
|
||||
'scraper_job_id': getattr(job, 'scraper_job_id', None)
|
||||
})
|
||||
|
||||
@bp.route('/resume_job/<int:job_id>', methods=['POST'])
|
||||
@login_required
|
||||
def resume_job(job_id):
|
||||
job = Job.query.filter_by(id=job_id, user_id=current_user.id).first_or_404()
|
||||
|
||||
thread = threading.Thread(
|
||||
target=process_file,
|
||||
args=(job.filename, job.id, current_app._get_current_object())
|
||||
)
|
||||
thread.daemon = True
|
||||
thread.start()
|
||||
flash(f'Job #{job_id} wird fortgesetzt...')
|
||||
return redirect(url_for('auth.job_status'))
|
||||
|
||||
|
||||
# ── ADMIN ──────────────────────────────────────────
|
||||
@bp.route('/admin', methods=['GET'])
|
||||
@login_required
|
||||
def admin_panel():
|
||||
if not current_user.is_admin:
|
||||
flash("Keine Berechtigung.")
|
||||
return redirect(url_for('auth.job_status'))
|
||||
users = User.query.all()
|
||||
cfg = AppConfig.query.filter_by(key='allow_signup').first()
|
||||
signup_allowed = cfg and cfg.value == 'true'
|
||||
return render_template('admin_panel.html', users=users, signup_allowed=signup_allowed)
|
||||
|
||||
@bp.route('/admin/create_user', methods=['POST'])
|
||||
@login_required
|
||||
def create_user():
|
||||
if not current_user.is_admin:
|
||||
return redirect(url_for('auth.admin_panel'))
|
||||
username = request.form['username']
|
||||
password = generate_password_hash(request.form['password']) # ✅ Fix
|
||||
is_admin = 'is_admin' in request.form
|
||||
new_user = User(username=username, password=password, is_admin=is_admin)
|
||||
db.session.add(new_user)
|
||||
db.session.commit()
|
||||
flash(f'{username} erstellt.')
|
||||
return redirect(url_for('auth.admin_panel'))
|
||||
|
||||
@bp.route('/admin/reset_password/<int:user_id>', methods=['POST'])
|
||||
@login_required
|
||||
def reset_password(user_id):
|
||||
if not current_user.is_admin:
|
||||
return redirect(url_for('auth.admin_panel'))
|
||||
user = User.query.get_or_404(user_id)
|
||||
new_password = request.form['new_password']
|
||||
user.password = generate_password_hash(new_password) # ✅ Fix
|
||||
db.session.commit()
|
||||
flash(f'Passwort {user.username} zurückgesetzt.')
|
||||
return redirect(url_for('auth.admin_panel'))
|
||||
|
||||
@bp.route('/admin/delete_user/<int:user_id>', methods=['POST'])
|
||||
@login_required
|
||||
def delete_user(user_id):
|
||||
if not current_user.is_admin:
|
||||
return redirect(url_for('auth.admin_panel'))
|
||||
user = User.query.get_or_404(user_id)
|
||||
if user.is_admin:
|
||||
flash('Admin nicht löschbar.')
|
||||
return redirect(url_for('auth.admin_panel'))
|
||||
db.session.delete(user)
|
||||
db.session.commit()
|
||||
flash(f'{user.username} gelöscht.')
|
||||
return redirect(url_for('auth.admin_panel'))
|
||||
|
||||
@bp.route('/admin/toggle_signup', methods=['POST'])
|
||||
@login_required
|
||||
def toggle_signup():
|
||||
if not current_user.is_admin:
|
||||
return redirect(url_for('auth.admin_panel'))
|
||||
|
||||
cfg = AppConfig.query.filter_by(key='allow_signup').first()
|
||||
if not cfg:
|
||||
cfg = AppConfig(key='allow_signup', value='true')
|
||||
db.session.add(cfg)
|
||||
else:
|
||||
cfg.value = 'false' if cfg.value == 'true' else 'true'
|
||||
|
||||
db.session.commit()
|
||||
state = '✅ aktiviert' if cfg.value == 'true' else '🔒 deaktiviert'
|
||||
flash(f'Registrierung {state}.')
|
||||
return redirect(url_for('auth.admin_panel'))
|
||||
|
|
|
|||
|
|
@ -1,6 +0,0 @@
|
|||
PLZ;Stadt;Straße;Hausnummer;Zusatz
|
||||
53175;Bonn;Godesberger Str.;27;
|
||||
50667;Köln;Hohenzollernring;52;A
|
||||
10115;Berlin;Chausseestraße;125;
|
||||
80331;München;Marienplatz;1;B
|
||||
20095;Hamburg;Ostwall;5;
|
||||
|
|
|
@ -164,186 +164,3 @@ tr:nth-child(even) td {
|
|||
.delete-btn:hover {
|
||||
background-color: #e60000;
|
||||
}
|
||||
|
||||
/* Flash-Badge Styling */
|
||||
.flash-badge {
|
||||
position: fixed;
|
||||
top: 20px;
|
||||
right: 20px;
|
||||
background-color: #f44336; /* Material Design Rot */
|
||||
color: white;
|
||||
padding: 12px 24px;
|
||||
border-radius: 8px;
|
||||
font-family: 'Roboto', sans-serif;
|
||||
font-weight: 500;
|
||||
box-shadow: 0px 4px 8px rgba(0, 0, 0, 0.2);
|
||||
z-index: 1000;
|
||||
opacity: 0;
|
||||
transform: translateY(-20px);
|
||||
transition: opacity 0.4s ease, transform 0.4s ease;
|
||||
}
|
||||
|
||||
/* Einblend-Animation */
|
||||
.flash-badge.show {
|
||||
opacity: 1;
|
||||
transform: translateY(0);
|
||||
}
|
||||
|
||||
/* Ausblend-Animation */
|
||||
.flash-badge.hide {
|
||||
opacity: 0;
|
||||
transform: translateY(-20px);
|
||||
}
|
||||
|
||||
.admin-panel {
|
||||
max-width: 800px;
|
||||
margin: 2em auto;
|
||||
padding: 2em;
|
||||
background: white;
|
||||
border-radius: 8px;
|
||||
box-shadow: 0 4px 12px rgba(0, 0, 0, 0.1);
|
||||
}
|
||||
|
||||
.admin-panel h2 {
|
||||
font-weight: 500;
|
||||
color: #1d1d1f;
|
||||
margin-bottom: 1em;
|
||||
}
|
||||
|
||||
.user-table {
|
||||
width: 100%;
|
||||
border-collapse: collapse;
|
||||
margin-bottom: 2em;
|
||||
}
|
||||
|
||||
.user-table th, .user-table td {
|
||||
padding: 0.75em;
|
||||
text-align: left;
|
||||
border: 1px solid #d1d1d6;
|
||||
}
|
||||
|
||||
.user-table th {
|
||||
background-color: #f1f1f1;
|
||||
color: #333;
|
||||
}
|
||||
|
||||
.user-table td {
|
||||
background-color: white;
|
||||
}
|
||||
|
||||
.user-table tr:nth-child(even) td {
|
||||
background-color: #f9f9f9;
|
||||
}
|
||||
|
||||
.reset-btn, .delete-btn, .create-btn {
|
||||
padding: 0.5em 1em;
|
||||
font-size: 0.9em;
|
||||
font-weight: 500;
|
||||
border: none;
|
||||
border-radius: 4px;
|
||||
cursor: pointer;
|
||||
transition: background-color 0.2s ease-in-out;
|
||||
}
|
||||
|
||||
.reset-btn {
|
||||
background-color: #4caf50;
|
||||
color: white;
|
||||
}
|
||||
|
||||
.reset-btn:hover {
|
||||
background-color: #388e3c;
|
||||
}
|
||||
|
||||
.delete-btn {
|
||||
background-color: #f44336;
|
||||
color: white;
|
||||
}
|
||||
|
||||
.delete-btn:hover {
|
||||
background-color: #d32f2f;
|
||||
}
|
||||
|
||||
.create-btn {
|
||||
background-color: #007aff;
|
||||
color: white;
|
||||
padding: 0.75em;
|
||||
margin-top: 1em;
|
||||
display: block;
|
||||
width: 100%;
|
||||
font-size: 1em;
|
||||
}
|
||||
|
||||
.create-btn:hover {
|
||||
background-color: #005bb5;
|
||||
}
|
||||
|
||||
.create-user-form {
|
||||
margin-top: 1.5em;
|
||||
}
|
||||
|
||||
.create-user-form input[type="text"],
|
||||
.create-user-form input[type="password"] {
|
||||
width: 100%;
|
||||
padding: 0.75em;
|
||||
margin-bottom: 1em;
|
||||
border: 1px solid #d1d1d6;
|
||||
border-radius: 8px;
|
||||
}
|
||||
|
||||
.create-user-form label {
|
||||
font-size: 0.9em;
|
||||
color: #6e6e73;
|
||||
display: block;
|
||||
margin-bottom: 1em;
|
||||
}
|
||||
|
||||
.logo-container {
|
||||
text-align: center;
|
||||
margin-bottom: 1em;
|
||||
}
|
||||
|
||||
.logo-container img {
|
||||
max-width: 100%;
|
||||
height: auto;
|
||||
width: 150px; /* Standardbreite für das Logo */
|
||||
transition: width 0.3s ease;
|
||||
}
|
||||
|
||||
@media (max-width: 768px) {
|
||||
.logo-container img {
|
||||
width: 120px; /* Kleinere Breite auf kleineren Bildschirmen */
|
||||
}
|
||||
}
|
||||
|
||||
@media (max-width: 480px) {
|
||||
.logo-container img {
|
||||
width: 100px; /* Noch kleinere Breite auf sehr kleinen Bildschirmen */
|
||||
}
|
||||
}
|
||||
|
||||
.template-info {
|
||||
background-color: #f9f9f9;
|
||||
padding: 1em;
|
||||
border-radius: 8px;
|
||||
margin-bottom: 1.5em;
|
||||
text-align: center;
|
||||
}
|
||||
|
||||
.template-info p {
|
||||
margin: 0.5em 0;
|
||||
}
|
||||
|
||||
.template-download {
|
||||
display: inline-block;
|
||||
padding: 0.5em 1em;
|
||||
margin-top: 0.5em;
|
||||
background-color: #007aff;
|
||||
color: white;
|
||||
border-radius: 4px;
|
||||
text-decoration: none;
|
||||
transition: background-color 0.2s ease;
|
||||
}
|
||||
|
||||
.template-download:hover {
|
||||
background-color: #005bb5;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1,78 +0,0 @@
|
|||
{% extends "base.html" %}
|
||||
|
||||
{% block content %}
|
||||
<div class="admin-panel">
|
||||
<h2>Benutzerverwaltung</h2>
|
||||
|
||||
<!-- Tabelle für Benutzerverwaltung -->
|
||||
<table class="user-table">
|
||||
<thead>
|
||||
<tr>
|
||||
<th>ID</th>
|
||||
<th>Benutzername</th>
|
||||
<th>Admin</th>
|
||||
<th>Aktionen</th>
|
||||
</tr>
|
||||
</thead>
|
||||
<tbody>
|
||||
{% for user in users %}
|
||||
<tr>
|
||||
<td>{{ user.id }}</td>
|
||||
<td>{{ user.username }}</td>
|
||||
<td>{{ 'Ja' if user.is_admin else 'Nein' }}</td>
|
||||
<td>
|
||||
<form action="{{ url_for('auth.reset_password', user_id=user.id) }}" method="post" style="display:inline;">
|
||||
<input type="text" name="new_password" placeholder="Neues Passwort" required>
|
||||
<button type="submit" class="reset-btn">Passwort zurücksetzen</button>
|
||||
</form>
|
||||
{% if not user.is_admin %}
|
||||
<form action="{{ url_for('auth.delete_user', user_id=user.id) }}" method="post" style="display:inline;">
|
||||
<button type="submit" class="delete-btn">Benutzer löschen</button>
|
||||
</form>
|
||||
{% endif %}
|
||||
</td>
|
||||
</tr>
|
||||
{% endfor %}
|
||||
</tbody>
|
||||
</table>
|
||||
|
||||
<!-- Formular zum Erstellen neuer Benutzer -->
|
||||
<h3>Neuen Benutzer erstellen</h3>
|
||||
<form action="{{ url_for('auth.create_user') }}" method="post" class="create-user-form">
|
||||
<input type="text" name="username" placeholder="Benutzername" required>
|
||||
<input type="password" name="password" placeholder="Passwort" required>
|
||||
<label>
|
||||
<input type="checkbox" name="is_admin"> Admin
|
||||
</label>
|
||||
<button type="submit" class="create-btn">Benutzer erstellen</button>
|
||||
</form>
|
||||
</div>
|
||||
|
||||
<div class="config-box">
|
||||
<h3>⚙️ Einstellungen</h3>
|
||||
<form action="{{ url_for('auth.toggle_signup') }}" method="POST">
|
||||
<div class="toggle-row">
|
||||
<span>Benutzer-Registrierung:</span>
|
||||
{% if signup_allowed %}
|
||||
<span class="badge badge-green">✅ Aktiv</span>
|
||||
<button type="submit" class="btn-danger">🔒 Deaktivieren</button>
|
||||
{% else %}
|
||||
<span class="badge badge-red">🔒 Deaktiviert</span>
|
||||
<button type="submit" class="btn-success">✅ Aktivieren</button>
|
||||
{% endif %}
|
||||
</div>
|
||||
</form>
|
||||
</div>
|
||||
|
||||
<style>
|
||||
.config-box { background: #f8f9fa; border: 1px solid #ddd; border-radius: 6px; padding: 16px; margin-bottom: 24px; }
|
||||
.toggle-row { display: flex; align-items: center; gap: 12px; }
|
||||
.badge { padding: 3px 10px; border-radius: 12px; font-size: 0.85em; font-weight: bold; }
|
||||
.badge-green { background: #d4edda; color: #155724; }
|
||||
.badge-red { background: #f8d7da; color: #721c24; }
|
||||
.btn-danger { background: #e74c3c; color: white; border: none; padding: 6px 14px; border-radius: 4px; cursor: pointer; }
|
||||
.btn-success { background: #27ae60; color: white; border: none; padding: 6px 14px; border-radius: 4px; cursor: pointer; }
|
||||
.btn-danger:hover { background: #c0392b; }
|
||||
.btn-success:hover { background: #1e8449; }
|
||||
</style>
|
||||
{% endblock %}
|
||||
|
|
@ -14,50 +14,14 @@
|
|||
<ul>
|
||||
<li><a href="{{ url_for('auth.job_status') }}">Jobs</a></li>
|
||||
<li><a href="{{ url_for('auth.upload') }}">Upload</a></li>
|
||||
{% if current_user.is_admin %}
|
||||
<li><a href="{{ url_for('auth.admin_panel') }}">Admin</a></li> <!-- Admin-Bereich Link -->
|
||||
{% endif %}
|
||||
<li><a href="{{ url_for('auth.logout') }}">Logout</a></li>
|
||||
</ul>
|
||||
</nav>
|
||||
</header>
|
||||
{% endif %}
|
||||
|
||||
<!-- Flash-Nachrichten -->
|
||||
{% with messages = get_flashed_messages() %}
|
||||
{% if messages %}
|
||||
<div id="flash-badge-container">
|
||||
{% for message in messages %}
|
||||
<div class="flash-badge">{{ message }}</div>
|
||||
{% endfor %}
|
||||
</div>
|
||||
{% endif %}
|
||||
{% endwith %}
|
||||
|
||||
|
||||
<div class="{% if request.endpoint in ['auth.login', 'auth.signup'] %}form-container{% else %}container{% endif %}">
|
||||
{% block content %}{% endblock %}
|
||||
</div>
|
||||
|
||||
<!-- JavaScript für Ein- und Ausblendanimation des Flash-Badges -->
|
||||
<script>
|
||||
document.addEventListener("DOMContentLoaded", function() {
|
||||
var flashBadges = document.querySelectorAll('.flash-badge');
|
||||
flashBadges.forEach(function(badge) {
|
||||
// Einblendung mit Verzögerung
|
||||
setTimeout(function() {
|
||||
badge.classList.add('show');
|
||||
}, 100);
|
||||
|
||||
// Ausblendung nach 5 Sekunden und Entfernen aus dem DOM
|
||||
setTimeout(function() {
|
||||
badge.classList.remove('show');
|
||||
badge.classList.add('hide');
|
||||
setTimeout(function() {
|
||||
badge.remove();
|
||||
}, 400); // Zeit für die Ausblendanimation
|
||||
}, 5000);
|
||||
});
|
||||
});
|
||||
</script>
|
||||
</body>
|
||||
</html>
|
||||
|
|
|
|||
|
|
@ -1,121 +0,0 @@
|
|||
{% extends "base.html" %}
|
||||
|
||||
{% block content %}
|
||||
<div class="table-container">
|
||||
<h2>Ihre Aufträge</h2>
|
||||
<table id="jobs-table">
|
||||
<thead>
|
||||
<tr>
|
||||
<th>Dateiname</th>
|
||||
<th>Status</th>
|
||||
<th>Erstellt am</th>
|
||||
<th>Ergebnis</th>
|
||||
<th>Aktionen</th>
|
||||
</tr>
|
||||
</thead>
|
||||
<tbody>
|
||||
{% for job in jobs %}
|
||||
<tr id="job-row-{{ job.id }}">
|
||||
<td>{{ job.filename }}</td>
|
||||
<td id="status-{{ job.id }}" class="job-status">{{ job.status }}</td>
|
||||
<td>{{ job.created_at.strftime('%Y-%m-%d %H:%M:%S') }}</td>
|
||||
<td id="result-{{ job.id }}">
|
||||
{% if job.result_filename and 'Failed' not in job.status %}
|
||||
<a href="{{ url_for('auth.download_result', job_id=job.id) }}" class="dl-btn">
|
||||
🎯 Gefiltert
|
||||
</a>
|
||||
{% if job.result_filename_raw %}
|
||||
|
||||
<a href="{{ url_for('auth.download_result_raw', job_id=job.id) }}" class="dl-btn dl-btn-raw">
|
||||
📋 Alle
|
||||
</a>
|
||||
{% endif %}
|
||||
{% elif 'Failed' in job.status %}
|
||||
<span class="status-failed">❌ {{ job.result_filename or 'Fehler' }}</span>
|
||||
{% else %}
|
||||
<span class="status-pending">⏳ Noch nicht verfügbar</span>
|
||||
{% endif %}
|
||||
</td>
|
||||
<td>
|
||||
<form action="{{ url_for('auth.delete_job', job_id=job.id) }}" method="POST" style="display:inline;">
|
||||
<button type="submit" class="delete-btn">🗑️ Löschen</button>
|
||||
</form>
|
||||
</td>
|
||||
</tr>
|
||||
{% endfor %}
|
||||
</tbody>
|
||||
</table>
|
||||
</div>
|
||||
|
||||
<style>
|
||||
.job-status { font-weight: bold; }
|
||||
.status-failed { color: #e74c3c; font-weight: bold; }
|
||||
.status-pending { color: #888; }
|
||||
.status-completed { color: #27ae60; }
|
||||
|
||||
.dl-btn {
|
||||
display: inline-block;
|
||||
padding: 4px 10px;
|
||||
border-radius: 4px;
|
||||
text-decoration: none;
|
||||
font-size: 0.85em;
|
||||
font-weight: bold;
|
||||
background: #27ae60;
|
||||
color: #fff;
|
||||
margin: 2px 1px;
|
||||
transition: background 0.2s;
|
||||
}
|
||||
.dl-btn:hover { background: #1e8449; }
|
||||
.dl-btn-raw { background: #2980b9; }
|
||||
.dl-btn-raw:hover { background: #1a5e8a; }
|
||||
</style>
|
||||
|
||||
<script>
|
||||
document.addEventListener('DOMContentLoaded', function () {
|
||||
document.querySelectorAll('.job-status').forEach(function (cell) {
|
||||
const jobId = cell.id.split('-')[1];
|
||||
const status = cell.textContent.trim();
|
||||
if (!status.includes('✅') && !status.includes('Failed')) {
|
||||
pollJob(jobId);
|
||||
}
|
||||
});
|
||||
});
|
||||
|
||||
function renderResult(resultCell, data) {
|
||||
const hasFailed = data.status.includes('Failed');
|
||||
const hasFiltered = data.result_filename && !hasFailed;
|
||||
const hasRaw = data.result_filename_raw && !hasFailed;
|
||||
|
||||
if (hasFiltered) {
|
||||
let html = `<a href="/download/${data.id}" class="dl-btn">🎯 Gefiltert</a>`;
|
||||
if (hasRaw) {
|
||||
html += ` <a href="/download_raw/${data.id}" class="dl-btn dl-btn-raw">📋 Alle</a>`;
|
||||
}
|
||||
resultCell.innerHTML = html;
|
||||
} else if (hasFailed) {
|
||||
resultCell.innerHTML = `<span class="status-failed">❌ ${data.result_filename || 'Fehler'}</span>`;
|
||||
} else {
|
||||
resultCell.innerHTML = `<span class="status-pending">⏳ Noch nicht verfügbar</span>`;
|
||||
}
|
||||
}
|
||||
|
||||
function pollJob(jobId) {
|
||||
fetch(`/job_status/${jobId}`)
|
||||
.then(r => r.json())
|
||||
.then(data => {
|
||||
const statusCell = document.getElementById(`status-${jobId}`);
|
||||
const resultCell = document.getElementById(`result-${jobId}`);
|
||||
|
||||
statusCell.textContent = data.status;
|
||||
renderResult(resultCell, data);
|
||||
|
||||
// Weiter pollen wenn noch nicht fertig
|
||||
const done = data.status.includes('✅') || data.status.includes('Failed');
|
||||
if (!done) {
|
||||
setTimeout(() => pollJob(jobId), 5000);
|
||||
}
|
||||
})
|
||||
.catch(() => setTimeout(() => pollJob(jobId), 10000));
|
||||
}
|
||||
</script>
|
||||
{% endblock %}
|
||||
|
|
@ -15,38 +15,20 @@
|
|||
</thead>
|
||||
<tbody>
|
||||
{% for job in jobs %}
|
||||
<tr id="job-row-{{ job.id }}">
|
||||
<tr>
|
||||
<td>{{ job.filename }}</td>
|
||||
<td id="status-{{ job.id }}" class="job-status">
|
||||
{{ job.status }}
|
||||
</td>
|
||||
<td class="job-status">{{ job.status }}</td>
|
||||
<td>{{ job.created_at.strftime('%Y-%m-%d %H:%M:%S') }}</td>
|
||||
<td id="result-{{ job.id }}">
|
||||
{% if job.result_filename and 'Failed' not in job.status %}
|
||||
<a href="{{ url_for('auth.download_result', job_id=job.id) }}" class="dl-btn">
|
||||
🎯 Gefiltert
|
||||
</a>
|
||||
{% if job.result_filename_raw %}
|
||||
|
||||
<a href="{{ url_for('auth.download_result_raw', job_id=job.id) }}" class="dl-btn dl-btn-raw">
|
||||
📋 Alle
|
||||
</a>
|
||||
{% endif %}
|
||||
{% elif 'Failed' in job.status %}
|
||||
<span class="status-failed">❌ {{ job.result_filename or 'Fehler' }}</span>
|
||||
<td>
|
||||
{% if job.status == "Completed" %}
|
||||
<a href="{{ url_for('auth.download_result', job_id=job.id) }}">Download</a>
|
||||
{% else %}
|
||||
<span class="status-pending">⏳ Noch nicht verfügbar</span>
|
||||
Noch nicht verfügbar
|
||||
{% endif %}
|
||||
</td>
|
||||
<td>
|
||||
{% if 'Failed' in job.status %}
|
||||
<!-- 🆕 Resume Button -->
|
||||
<form action="{{ url_for('auth.resume_job', job_id=job.id) }}" method="POST" style="display:inline;">
|
||||
<button type="submit" class="btn-resume">▶️ Resume</button>
|
||||
</form>
|
||||
{% endif %}
|
||||
<form action="{{ url_for('auth.delete_job', job_id=job.id) }}" method="POST" style="display:inline;">
|
||||
<button type="submit" class="delete-btn">🗑️ Löschen</button>
|
||||
<button type="submit" class="delete-btn">Löschen</button>
|
||||
</form>
|
||||
</td>
|
||||
</tr>
|
||||
|
|
@ -55,101 +37,25 @@
|
|||
</table>
|
||||
</div>
|
||||
|
||||
<style>
|
||||
.job-status { font-weight: bold; }
|
||||
.status-failed { color: #e74c3c; font-weight: bold; }
|
||||
.status-pending { color: #888; }
|
||||
|
||||
.eta-badge { display: inline-block; background: #eaf4ff; color: #1a6fa8;
|
||||
border-radius: 10px; padding: 2px 8px; font-size: 0.82em;
|
||||
font-weight: bold; margin-left: 6px; }
|
||||
|
||||
.dl-btn { display: inline-block; padding: 4px 10px; border-radius: 4px;
|
||||
text-decoration: none; font-size: 0.85em; font-weight: bold;
|
||||
background: #27ae60; color: #fff; margin: 2px 1px; transition: background 0.2s; }
|
||||
.dl-btn:hover { background: #1e8449; }
|
||||
.dl-btn-raw { background: #2980b9; }
|
||||
.dl-btn-raw:hover { background: #1a5e8a; }
|
||||
|
||||
.btn-resume { background: #e67e22; color: white; border: none;
|
||||
padding: 4px 10px; border-radius: 4px; cursor: pointer;
|
||||
font-size: 0.85em; font-weight: bold; margin-right: 4px; }
|
||||
.btn-resume:hover { background: #ca6f1e; }
|
||||
</style>
|
||||
|
||||
<script>
|
||||
// ETA Badge aus Status-String parsen
|
||||
function parseStatus(status) {
|
||||
const parts = status.split('|');
|
||||
if (parts.length === 2) {
|
||||
return `<span>${parts[0].trim()}</span>
|
||||
<span class="eta-badge">${parts[1].trim()}</span>`;
|
||||
}
|
||||
return status;
|
||||
}
|
||||
// Periodische Aktualisierung des Jobstatus
|
||||
setInterval(function() {
|
||||
fetch('{{ url_for("auth.job_status") }}')
|
||||
.then(response => response.text())
|
||||
.then(html => {
|
||||
const parser = new DOMParser();
|
||||
const doc = parser.parseFromString(html, 'text/html');
|
||||
const newRows = doc.querySelectorAll('#jobs-table tbody tr');
|
||||
const currentRows = document.querySelectorAll('#jobs-table tbody tr');
|
||||
|
||||
function renderResult(resultCell, data) {
|
||||
const hasFailed = data.status.includes('Failed') || data.status.includes('❌');
|
||||
const hasFiltered = data.result_filename && !hasFailed;
|
||||
const hasRaw = data.result_filename_raw && !hasFailed;
|
||||
newRows.forEach((newRow, index) => {
|
||||
const newStatus = newRow.querySelector('.job-status').textContent;
|
||||
currentRows[index].querySelector('.job-status').textContent = newStatus;
|
||||
|
||||
if (hasFiltered) {
|
||||
let html = `<a href="/download/${data.id}" class="dl-btn">🎯 Gefiltert</a>`;
|
||||
if (hasRaw) {
|
||||
html += ` <a href="/download_raw/${data.id}" class="dl-btn dl-btn-raw">📋 Alle</a>`;
|
||||
}
|
||||
resultCell.innerHTML = html;
|
||||
} else if (hasFailed) {
|
||||
resultCell.innerHTML = `<span class="status-failed">❌ ${data.result_filename || 'Fehler'}</span>`;
|
||||
} else {
|
||||
resultCell.innerHTML = `<span class="status-pending">⏳ Noch nicht verfügbar</span>`;
|
||||
}
|
||||
}
|
||||
|
||||
function renderActions(row, data) {
|
||||
const actionsCell = row.querySelector('td:last-child');
|
||||
const hasFailed = data.status.includes('Failed');
|
||||
let html = '';
|
||||
if (hasFailed) {
|
||||
html += `<form action="/resume_job/${data.id}" method="POST" style="display:inline;">
|
||||
<button type="submit" class="btn-resume">▶️ Resume</button>
|
||||
</form>`;
|
||||
}
|
||||
html += `<form action="/delete_job/${data.id}" method="POST" style="display:inline;">
|
||||
<button type="submit" class="delete-btn">🗑️ Löschen</button>
|
||||
</form>`;
|
||||
actionsCell.innerHTML = html;
|
||||
}
|
||||
|
||||
function pollJob(jobId) {
|
||||
fetch(`/job_status/${jobId}`)
|
||||
.then(r => r.json())
|
||||
.then(data => {
|
||||
const statusCell = document.getElementById(`status-${jobId}`);
|
||||
const resultCell = document.getElementById(`result-${jobId}`);
|
||||
const row = document.getElementById(`job-row-${jobId}`);
|
||||
|
||||
statusCell.innerHTML = parseStatus(data.status);
|
||||
renderResult(resultCell, data);
|
||||
renderActions(row, data);
|
||||
|
||||
const done = data.status.includes('✅') || data.status.includes('Failed') || data.status.includes('❌');
|
||||
if (!done) {
|
||||
setTimeout(() => pollJob(jobId), 5000);
|
||||
}
|
||||
})
|
||||
.catch(() => setTimeout(() => pollJob(jobId), 10000));
|
||||
}
|
||||
|
||||
document.addEventListener('DOMContentLoaded', function () {
|
||||
document.querySelectorAll('.job-status').forEach(function (cell) {
|
||||
const jobId = cell.id.split('-')[1];
|
||||
const status = cell.textContent.trim();
|
||||
cell.innerHTML = parseStatus(status);
|
||||
if (!status.includes('✅') && !status.includes('Failed') && !status.includes('❌')) {
|
||||
pollJob(jobId);
|
||||
}
|
||||
});
|
||||
});
|
||||
const newResult = newRow.querySelector('td:nth-child(4)').innerHTML;
|
||||
currentRows[index].querySelector('td:nth-child(4)').innerHTML = newResult;
|
||||
});
|
||||
});
|
||||
}, 5000); // Aktualisierung alle 5 Sekunden
|
||||
</script>
|
||||
{% endblock %}
|
||||
|
|
|
|||
|
|
@ -1,61 +0,0 @@
|
|||
{% extends "base.html" %}
|
||||
|
||||
{% block content %}
|
||||
<div class="table-container">
|
||||
<h2>Ihre Aufträge</h2>
|
||||
<table id="jobs-table">
|
||||
<thead>
|
||||
<tr>
|
||||
<th>Dateiname</th>
|
||||
<th>Status</th>
|
||||
<th>Erstellt am</th>
|
||||
<th>Ergebnis</th>
|
||||
<th>Aktionen</th>
|
||||
</tr>
|
||||
</thead>
|
||||
<tbody>
|
||||
{% for job in jobs %}
|
||||
<tr>
|
||||
<td>{{ job.filename }}</td>
|
||||
<td class="job-status">{{ job.status }}</td>
|
||||
<td>{{ job.created_at.strftime('%Y-%m-%d %H:%M:%S') }}</td>
|
||||
<td>
|
||||
{% if job.status == "Completed" %}
|
||||
<a href="{{ url_for('auth.download_result', job_id=job.id) }}">Download</a>
|
||||
{% else %}
|
||||
Noch nicht verfügbar
|
||||
{% endif %}
|
||||
</td>
|
||||
<td>
|
||||
<form action="{{ url_for('auth.delete_job', job_id=job.id) }}" method="POST" style="display:inline;">
|
||||
<button type="submit" class="delete-btn">Löschen</button>
|
||||
</form>
|
||||
</td>
|
||||
</tr>
|
||||
{% endfor %}
|
||||
</tbody>
|
||||
</table>
|
||||
</div>
|
||||
|
||||
<script>
|
||||
// Periodische Aktualisierung des Jobstatus
|
||||
setInterval(function() {
|
||||
fetch('{{ url_for("auth.job_status") }}')
|
||||
.then(response => response.text())
|
||||
.then(html => {
|
||||
const parser = new DOMParser();
|
||||
const doc = parser.parseFromString(html, 'text/html');
|
||||
const newRows = doc.querySelectorAll('#jobs-table tbody tr');
|
||||
const currentRows = document.querySelectorAll('#jobs-table tbody tr');
|
||||
|
||||
newRows.forEach((newRow, index) => {
|
||||
const newStatus = newRow.querySelector('.job-status').textContent;
|
||||
currentRows[index].querySelector('.job-status').textContent = newStatus;
|
||||
|
||||
const newResult = newRow.querySelector('td:nth-child(4)').innerHTML;
|
||||
currentRows[index].querySelector('td:nth-child(4)').innerHTML = newResult;
|
||||
});
|
||||
});
|
||||
}, 5000); // Aktualisierung alle 5 Sekunden
|
||||
</script>
|
||||
{% endblock %}
|
||||
File diff suppressed because one or more lines are too long
|
|
@ -1,22 +1,11 @@
|
|||
{% extends "base.html" %}
|
||||
|
||||
{% block content %}
|
||||
<div class="form-container">
|
||||
<h2>Datei hochladen</h2>
|
||||
|
||||
<!-- Instructions and download link for the template -->
|
||||
<div class="template-info">
|
||||
<p>Bitte verwenden Sie die folgende Vorlage für Ihre Adressliste:</p>
|
||||
<a href="{{ url_for('static', filename='Adressliste_vorlage.csv') }}" download="Adressliste_vorlage.csv" class="template-download">Adressliste Vorlage herunterladen</a>
|
||||
<p>Die Vorlage enthält die folgenden Spalten: <strong>PLZ, Stadt, Straße, Hausnummer, Zusatz</strong>. Bitte verwenden Sie das Format für eine reibungslose Verarbeitung.</p>
|
||||
</div>
|
||||
|
||||
<form action="{{ url_for('auth.upload') }}" method="post" enctype="multipart/form-data">
|
||||
<label for="file">Datei auswählen:</label>
|
||||
<input type="file" id="file" name="file" accept=".csv" required>
|
||||
<button type="submit">Datei hochladen</button>
|
||||
</form>
|
||||
</div>
|
||||
</body>
|
||||
</html>
|
||||
<div class="form-container">
|
||||
<h2>Datei hochladen</h2>
|
||||
<form method="POST" enctype="multipart/form-data">
|
||||
<label for="file">CSV-Datei:</label>
|
||||
<input type="file" id="file" name="file" accept=".csv" required>
|
||||
<button type="submit">Upload</button>
|
||||
</form>
|
||||
</div>
|
||||
{% endblock %}
|
||||
|
|
|
|||
|
|
@ -1,316 +0,0 @@
|
|||
import os
|
||||
import re
|
||||
import pandas as pd
|
||||
import requests
|
||||
import time
|
||||
import random
|
||||
from io import StringIO
|
||||
from app.models import db, Job
|
||||
|
||||
print("🆕 MODERN webcrawler LOADED!")
|
||||
|
||||
UPLOAD_FOLDER = '/app/uploads'
|
||||
RESULT_FOLDER = '/app/results'
|
||||
SCRAPER_URL = "http://gmaps-scraper:8080"
|
||||
|
||||
OUTPUT_COLS = ['title', 'category', 'address', 'open_hours', 'website', 'phone', 'link']
|
||||
|
||||
|
||||
# ──────────────────────────────────────────────
|
||||
# Hilfsfunktionen
|
||||
# ──────────────────────────────────────────────
|
||||
|
||||
def get_batch_size(total_rows):
|
||||
if total_rows < 50: return 10
|
||||
elif total_rows < 200: return 10
|
||||
elif total_rows < 500: return 5
|
||||
else: return 5
|
||||
|
||||
def get_delay(total_rows):
|
||||
if total_rows < 50: return (5, 10)
|
||||
elif total_rows < 200: return (10, 20)
|
||||
else: return (20, 40)
|
||||
|
||||
def is_blocked(data):
|
||||
text = str(data).lower()
|
||||
blocked = any(kw in text for kw in ['captcha', 'blocked', 'rate limit', 'too many', '429'])
|
||||
if blocked:
|
||||
print(f"🚫 BLOCKED: {str(data)[:100]}")
|
||||
return blocked
|
||||
|
||||
def fix_encoding(text):
|
||||
"""Kaputte ISO→UTF8 Zeichen reparieren (z.B. Industriestraße → Industriestraße)"""
|
||||
if not isinstance(text, str):
|
||||
return text
|
||||
try:
|
||||
return text.encode('latin-1').decode('utf-8')
|
||||
except (UnicodeEncodeError, UnicodeDecodeError):
|
||||
return text
|
||||
|
||||
def build_input_addresses(df):
|
||||
"""Normalisierte Adressen aus Input-CSV für Abgleich"""
|
||||
addresses = set()
|
||||
for _, row in df.iterrows():
|
||||
plz = str(row.get('PLZ', '')).strip()
|
||||
stadt = str(row.get('Stadt', '')).strip()
|
||||
str_ = str(row.get('Straße', '')).strip()
|
||||
nr = str(row.get('Hausnummer', '')).strip()
|
||||
zusatz = str(row.get('Zusatz', '')).strip()
|
||||
|
||||
full = f"{str_} {nr} {zusatz} {plz} {stadt}".lower().strip()
|
||||
full = ' '.join(full.split())
|
||||
addresses.add(full)
|
||||
return addresses
|
||||
|
||||
def normalize_address(addr):
|
||||
"""Output-Adresse normalisieren für Abgleich"""
|
||||
if not isinstance(addr, str):
|
||||
return ''
|
||||
addr = fix_encoding(addr)
|
||||
return ' '.join(addr.lower().strip().split())
|
||||
|
||||
def address_in_input(result_addr, input_addresses):
|
||||
"""Prüft ob PLZ + Straßenname aus Result im Input vorkommen"""
|
||||
norm = normalize_address(result_addr)
|
||||
for inp_addr in input_addresses:
|
||||
plz_match = re.search(r'\b\d{5}\b', inp_addr)
|
||||
if plz_match:
|
||||
plz = plz_match.group()
|
||||
if plz in norm:
|
||||
street = inp_addr.split()[0] if inp_addr else ''
|
||||
if len(street) > 3 and street[:4].lower() in norm:
|
||||
return True
|
||||
return False
|
||||
|
||||
|
||||
# ──────────────────────────────────────────────
|
||||
# CSV Nachbearbeitung (apply_filter umschaltbar)
|
||||
# ──────────────────────────────────────────────
|
||||
|
||||
def process_result_csv(raw_bytes, input_df, apply_filter=True):
|
||||
"""
|
||||
Raw CSV → bereinigt:
|
||||
- Nur OUTPUT_COLS
|
||||
- Encoding fix
|
||||
- Optional: Input/Output Abgleich + Duplikate
|
||||
"""
|
||||
try:
|
||||
content = raw_bytes.decode('utf-8', errors='replace')
|
||||
df_out = pd.read_csv(StringIO(content))
|
||||
print(f"📄 Raw result: {df_out.shape} | Columns: {list(df_out.columns)[:8]}")
|
||||
|
||||
# Spalten filtern
|
||||
available = [c for c in OUTPUT_COLS if c in df_out.columns]
|
||||
missing = [c for c in OUTPUT_COLS if c not in df_out.columns]
|
||||
if missing:
|
||||
print(f"⚠️ Fehlende Spalten: {missing}")
|
||||
df_out = df_out[available]
|
||||
|
||||
# 🔤 Encoding fix
|
||||
for col in df_out.columns:
|
||||
df_out[col] = df_out[col].apply(fix_encoding)
|
||||
print(f"🔤 Encoding fix: done")
|
||||
|
||||
if apply_filter:
|
||||
# 📍 Input/Output Abgleich
|
||||
input_addresses = build_input_addresses(input_df)
|
||||
before = len(df_out)
|
||||
df_out = df_out[
|
||||
df_out['address'].apply(
|
||||
lambda a: address_in_input(a, input_addresses)
|
||||
)
|
||||
]
|
||||
print(f"📍 Adress-Filter: {before} → {len(df_out)} Zeilen")
|
||||
|
||||
# 🔁 Duplikate entfernen (immer, auch bei Raw)
|
||||
before_dedup = len(df_out)
|
||||
df_out = df_out.drop_duplicates(subset=['title', 'address'], keep='first')
|
||||
print(f"🔁 Duplikate: {before_dedup} → {len(df_out)} Zeilen")
|
||||
|
||||
# Leere Titel entfernen
|
||||
df_out = df_out.dropna(subset=['title'], how='all')
|
||||
df_out = df_out[df_out['title'].str.strip().astype(bool)]
|
||||
|
||||
print(f"✅ Final ({'gefiltert' if apply_filter else 'alle'}): {df_out.shape}")
|
||||
return df_out
|
||||
|
||||
except Exception as e:
|
||||
print(f"💥 process_result_csv: {e}")
|
||||
import traceback
|
||||
traceback.print_exc()
|
||||
return None
|
||||
|
||||
|
||||
# ──────────────────────────────────────────────
|
||||
# Haupt-Worker
|
||||
# ──────────────────────────────────────────────
|
||||
|
||||
def process_file(filename, job_id, app):
|
||||
print(f"🎯 {filename} Job#{job_id} START!")
|
||||
|
||||
with app.app_context():
|
||||
job = Job.query.get(job_id)
|
||||
if not job:
|
||||
print("❌ Job missing")
|
||||
return
|
||||
|
||||
try:
|
||||
# 1️⃣ CSV Parse
|
||||
job.status = "📊 parsing CSV"
|
||||
db.session.commit()
|
||||
|
||||
filepath = os.path.join(UPLOAD_FOLDER, filename)
|
||||
print(f"📁 {filepath} | {os.path.getsize(filepath)}b")
|
||||
|
||||
df_input = pd.read_csv(filepath, sep=';', encoding='ISO-8859-1')
|
||||
print(f"📊 {df_input.shape} | Columns: {list(df_input.columns)}")
|
||||
|
||||
queries = []
|
||||
for _, row in df_input.iterrows():
|
||||
parts = [
|
||||
str(row.get('PLZ', '')).strip(),
|
||||
str(row.get('Stadt', '')).strip(),
|
||||
str(row.get('Straße', '')).strip(),
|
||||
str(row.get('Hausnummer', '')).strip(),
|
||||
str(row.get('Zusatz', '')).strip(),
|
||||
]
|
||||
q = f"Firmen {' '.join(p for p in parts if p and p != 'nan')}".strip()
|
||||
if len(q) > 10:
|
||||
queries.append(q)
|
||||
|
||||
total = len(queries)
|
||||
print(f"🔍 {total} Queries | Samples: {queries[:3]}")
|
||||
if not queries:
|
||||
raise ValueError("Keine gültigen Adressen in CSV")
|
||||
|
||||
# 2️⃣ Batch + Delay
|
||||
batch_size = get_batch_size(total)
|
||||
delay_min, delay_max = get_delay(total)
|
||||
batch = queries[:batch_size]
|
||||
pre_delay = random.uniform(delay_min, delay_max)
|
||||
print(f"📦 Batch {len(batch)}/{total} | 😴 {pre_delay:.1f}s Delay")
|
||||
time.sleep(pre_delay)
|
||||
|
||||
# 3️⃣ API Call
|
||||
job.status = "📤 sending to scraper"
|
||||
db.session.commit()
|
||||
|
||||
payload = {
|
||||
"name": f"{filename.replace('.csv','')}-{job_id}",
|
||||
"keywords": batch,
|
||||
"lang": "de",
|
||||
"depth": 1,
|
||||
"zoom": 17,
|
||||
"radius": 50,
|
||||
"max_time": 60,
|
||||
"fast_mode": False
|
||||
}
|
||||
|
||||
print(f"🌐 POST {SCRAPER_URL}/api/v1/jobs | {payload['name']}")
|
||||
resp = requests.post(f"{SCRAPER_URL}/api/v1/jobs", json=payload, timeout=30)
|
||||
print(f"📤 {resp.status_code}: {resp.text[:300]}")
|
||||
|
||||
if is_blocked(resp.text):
|
||||
raise ValueError("🚫 IP geblockt! Proxy konfigurieren.")
|
||||
if resp.status_code != 201:
|
||||
raise ValueError(f"API {resp.status_code}: {resp.text[:200]}")
|
||||
|
||||
# 4️⃣ Polling
|
||||
scraper_id = resp.json()['id']
|
||||
job.scraper_job_id = scraper_id
|
||||
job.status = "⏳ scraping"
|
||||
db.session.commit()
|
||||
print(f"✅ Scraper Job: {scraper_id}")
|
||||
|
||||
for i in range(1, 61): # Max 10min
|
||||
try:
|
||||
r = requests.get(
|
||||
f"{SCRAPER_URL}/api/v1/jobs/{scraper_id}",
|
||||
timeout=10
|
||||
)
|
||||
data = r.json()
|
||||
status = data.get('Status', data.get('status', '?'))
|
||||
print(f"⏳ {i}/60: {status}")
|
||||
|
||||
if is_blocked(data):
|
||||
raise ValueError("🚫 IP geblockt während scraping!")
|
||||
|
||||
if status in ('ok', 'completed', 'scraped'):
|
||||
dl = requests.get(
|
||||
f"{SCRAPER_URL}/api/v1/jobs/{scraper_id}/download",
|
||||
timeout=60
|
||||
)
|
||||
if dl.status_code != 200:
|
||||
raise ValueError(f"Download {dl.status_code}")
|
||||
if is_blocked(dl.text[:200]):
|
||||
raise ValueError("🚫 IP geblockt beim Download!")
|
||||
|
||||
# 5️⃣ Nachbearbeitung → zwei Versionen
|
||||
job.status = "🔧 processing result"
|
||||
db.session.commit()
|
||||
|
||||
base = filename.replace('.csv', '')
|
||||
os.makedirs(RESULT_FOLDER, exist_ok=True)
|
||||
|
||||
# ── Version A: Gefiltert (Adressabgleich + Deduplizierung) ──
|
||||
df_filtered = process_result_csv(dl.content, df_input, apply_filter=True)
|
||||
outname_filtered = f"results_{base}_filtered.csv"
|
||||
outpath_filtered = os.path.join(RESULT_FOLDER, outname_filtered)
|
||||
|
||||
if df_filtered is not None and len(df_filtered) > 0:
|
||||
df_filtered.to_csv(
|
||||
outpath_filtered, index=False,
|
||||
encoding='utf-8-sig', sep=';'
|
||||
)
|
||||
print(f"🎯 Filtered: {outname_filtered} → {len(df_filtered)} Firmen")
|
||||
else:
|
||||
print("⚠️ Keine Treffer nach Filter – leere Datei wird erstellt")
|
||||
pd.DataFrame(columns=OUTPUT_COLS).to_csv(
|
||||
outpath_filtered, index=False,
|
||||
encoding='utf-8-sig', sep=';'
|
||||
)
|
||||
|
||||
# ── Version B: Alle (nur Spalten + Encoding, kein Filter) ──
|
||||
df_raw = process_result_csv(dl.content, df_input, apply_filter=False)
|
||||
outname_raw = f"results_{base}_all.csv"
|
||||
outpath_raw = os.path.join(RESULT_FOLDER, outname_raw)
|
||||
|
||||
if df_raw is not None:
|
||||
df_raw.to_csv(
|
||||
outpath_raw, index=False,
|
||||
encoding='utf-8-sig', sep=';'
|
||||
)
|
||||
print(f"📋 All: {outname_raw} → {len(df_raw)} Firmen")
|
||||
else:
|
||||
print("⚠️ df_raw None – Rohinhalt wird gespeichert")
|
||||
with open(outpath_raw, 'wb') as f:
|
||||
f.write(dl.content)
|
||||
|
||||
# ── DB speichern ──
|
||||
job.status = "✅ Fertig"
|
||||
job.result_filename = outname_filtered # 🎯 Gefiltert
|
||||
job.result_filename_raw = outname_raw # 📋 Alle
|
||||
db.session.commit()
|
||||
print(f"🎉 Beide Dateien gespeichert!")
|
||||
break
|
||||
|
||||
elif status in ('failed', 'cancelled', 'error'):
|
||||
raise ValueError(f"Scraper: {status}")
|
||||
|
||||
except requests.RequestException as e:
|
||||
print(f"⚠️ Poll {i}: {e}")
|
||||
|
||||
time.sleep(random.uniform(8, 15))
|
||||
|
||||
else:
|
||||
raise ValueError("Timeout nach 10min")
|
||||
|
||||
except Exception as e:
|
||||
job.status = "Failed"
|
||||
job.result_filename = str(e)
|
||||
print(f"💥 ERROR: {e}")
|
||||
import traceback
|
||||
traceback.print_exc()
|
||||
|
||||
db.session.commit()
|
||||
print(f"✅ DONE! Status: {job.status}\n")
|
||||
|
|
@ -1,275 +0,0 @@
|
|||
import os
|
||||
import re
|
||||
import pandas as pd
|
||||
import requests
|
||||
import time
|
||||
import random
|
||||
from io import StringIO
|
||||
from app.models import db, Job
|
||||
|
||||
print("🆕 MODERN webcrawler LOADED! – BATCHED + PROXY")
|
||||
|
||||
UPLOAD_FOLDER = '/app/uploads'
|
||||
RESULT_FOLDER = '/app/results'
|
||||
SCRAPER_URL = "http://gmaps-scraper:8080"
|
||||
|
||||
OUTPUT_COLS = ['title', 'category', 'address', 'open_hours', 'website', 'phone', 'link']
|
||||
|
||||
PROXY_URL = "http://bitlleuv-rotate:s5hzse6hz74b@p.webshare.io:80"
|
||||
API_PROXIES = {"http": PROXY_URL, "https": PROXY_URL}
|
||||
|
||||
# ──────────────────────────────────────────────
|
||||
# Hilfsfunktionen
|
||||
# ──────────────────────────────────────────────
|
||||
|
||||
def is_blocked(data):
|
||||
text = str(data).lower()
|
||||
blocked = any(kw in text for kw in ['captcha', 'blocked', 'rate limit', 'too many', '429'])
|
||||
if blocked:
|
||||
print(f"🚫 BLOCKED: {str(data)[:100]}")
|
||||
return blocked
|
||||
|
||||
def fix_encoding(text):
|
||||
if not isinstance(text, str):
|
||||
return text
|
||||
try:
|
||||
return text.encode('latin-1').decode('utf-8')
|
||||
except (UnicodeEncodeError, UnicodeDecodeError):
|
||||
return text
|
||||
|
||||
def build_input_addresses(df):
|
||||
addresses = set()
|
||||
for _, row in df.iterrows():
|
||||
plz = str(row.get('PLZ', '')).strip()
|
||||
stadt = str(row.get('Stadt', '')).strip()
|
||||
str_ = str(row.get('Straße', '')).strip()
|
||||
nr = str(row.get('Hausnummer', '')).strip()
|
||||
zusatz = str(row.get('Zusatz', '')).strip()
|
||||
full = f"{str_} {nr} {zusatz} {plz} {stadt}".lower().strip()
|
||||
full = ' '.join(full.split())
|
||||
addresses.add(full)
|
||||
return addresses
|
||||
|
||||
def normalize_address(addr):
|
||||
if not isinstance(addr, str):
|
||||
return ''
|
||||
addr = fix_encoding(addr)
|
||||
return ' '.join(addr.lower().strip().split())
|
||||
|
||||
def address_in_input(result_addr, input_addresses):
|
||||
norm = normalize_address(result_addr)
|
||||
for inp_addr in input_addresses:
|
||||
plz_match = re.search(r'\b\d{5}\b', inp_addr)
|
||||
if plz_match:
|
||||
plz = plz_match.group()
|
||||
if plz in norm:
|
||||
street = inp_addr.split()[0] if inp_addr else ''
|
||||
if len(street) > 3 and street[:4].lower() in norm:
|
||||
return True
|
||||
return False
|
||||
|
||||
# ──────────────────────────────────────────────
|
||||
# CSV Nachbearbeitung
|
||||
# ──────────────────────────────────────────────
|
||||
|
||||
def process_result_csv(raw_bytes, input_df, apply_filter=True):
|
||||
try:
|
||||
content = raw_bytes.decode('utf-8', errors='replace')
|
||||
df_out = pd.read_csv(StringIO(content))
|
||||
print(f"📄 Raw result: {df_out.shape}")
|
||||
|
||||
available = [c for c in OUTPUT_COLS if c in df_out.columns]
|
||||
df_out = df_out[available]
|
||||
|
||||
for col in df_out.columns:
|
||||
df_out[col] = df_out[col].apply(fix_encoding)
|
||||
|
||||
if apply_filter:
|
||||
input_addresses = build_input_addresses(input_df)
|
||||
before = len(df_out)
|
||||
df_out = df_out[
|
||||
df_out['address'].apply(
|
||||
lambda a: address_in_input(a, input_addresses)
|
||||
)
|
||||
]
|
||||
print(f"📍 Filter: {before} → {len(df_out)}")
|
||||
|
||||
df_out = df_out.drop_duplicates(subset=['title', 'address'], keep='first')
|
||||
df_out = df_out.dropna(subset=['title'], how='all')
|
||||
df_out = df_out[df_out['title'].str.strip().astype(bool)]
|
||||
|
||||
print(f"✅ Final ({'gefiltert' if apply_filter else 'alle'}): {df_out.shape}")
|
||||
return df_out
|
||||
except Exception as e:
|
||||
print(f"💥 process_result_csv: {e}")
|
||||
return None
|
||||
|
||||
# ──────────────────────────────────────────────
|
||||
# HAUPT-WORKER
|
||||
# ──────────────────────────────────────────────
|
||||
|
||||
def process_file(filename, job_id, app):
|
||||
print(f"🎯 {filename} Job#{job_id} START!")
|
||||
|
||||
with app.app_context():
|
||||
job = Job.query.get(job_id)
|
||||
if not job:
|
||||
print("❌ Job missing")
|
||||
return
|
||||
|
||||
try:
|
||||
#Parse + ALLE Queries
|
||||
job.status = "📊 parsing CSV"
|
||||
db.session.commit()
|
||||
|
||||
filepath = os.path.join(UPLOAD_FOLDER, filename)
|
||||
print(f"📁 {filepath} | {os.path.getsize(filepath)}b")
|
||||
|
||||
df_input = pd.read_csv(filepath, sep=';', encoding='ISO-8859-1')
|
||||
print(f"📊 {df_input.shape}")
|
||||
|
||||
queries = []
|
||||
for _, row in df_input.iterrows():
|
||||
parts = [
|
||||
str(row.get('PLZ', '')).strip(),
|
||||
str(row.get('Stadt', '')).strip(),
|
||||
str(row.get('Straße', '')).strip(),
|
||||
str(row.get('Hausnummer', '')).strip(),
|
||||
str(row.get('Zusatz', '')).strip(),
|
||||
]
|
||||
q = f"Firmen {' '.join(p for p in parts if p and p != 'nan')}".strip()
|
||||
if len(q) > 10:
|
||||
queries.append(q)
|
||||
|
||||
total_queries = len(queries)
|
||||
print(f"🔍 {total_queries} Queries | Samples: {queries[:3]}")
|
||||
if total_queries == 0:
|
||||
raise ValueError("Keine gültigen Adressen")
|
||||
|
||||
#BATCHED Processing
|
||||
BATCH_SIZE = 10 # Erhöht: 5 → 10 (paid proxy)
|
||||
BATCH_DELAY_MIN, BATCH_DELAY_MAX = 10, 20 # Reduziert: 30-60s → 10-20s (paid proxy)
|
||||
batches = (total_queries + BATCH_SIZE - 1) // BATCH_SIZE
|
||||
print(f"📦 {batches} Batches à {BATCH_SIZE} | ETA: ~{batches*15//60:.0f}h")
|
||||
|
||||
all_results_filtered = []
|
||||
all_results_raw = []
|
||||
job.status = f"🔄 Batch 1/{batches}"
|
||||
db.session.commit()
|
||||
|
||||
for batch_idx in range(batches):
|
||||
batch_start = batch_idx * BATCH_SIZE
|
||||
batch_end = min(batch_start + BATCH_SIZE, total_queries)
|
||||
batch_queries = queries[batch_start:batch_end]
|
||||
print(f"\n🔄 BATCH {batch_idx+1}/{batches} ({batch_start+1}-{batch_end}/{total_queries})")
|
||||
|
||||
#Random Delay
|
||||
delay = random.uniform(BATCH_DELAY_MIN, BATCH_DELAY_MAX)
|
||||
print(f"😴 Delay: {delay:.0f}s | Proxy: {PROXY_URL}")
|
||||
time.sleep(delay)
|
||||
|
||||
#API Call
|
||||
payload = {
|
||||
"name": f"{filename.replace('.csv','')}-{job_id}-B{batch_idx+1:03d}",
|
||||
"keywords": batch_queries,
|
||||
"lang": "de",
|
||||
"depth": 1,
|
||||
"zoom": 17,
|
||||
"radius": 50,
|
||||
"max_time": 60, # Reduziert: 120 → 60 (paid proxy schneller)
|
||||
"fast_mode": False,
|
||||
"proxies": [PROXY_URL]
|
||||
}
|
||||
|
||||
try:
|
||||
resp = requests.post(
|
||||
f"{SCRAPER_URL}/api/v1/jobs",
|
||||
json=payload,
|
||||
timeout=45
|
||||
)
|
||||
print(f"📤 {resp.status_code}")
|
||||
if is_blocked(resp.text):
|
||||
print("🚫 Batch übersprungen (blocked)")
|
||||
continue
|
||||
if resp.status_code != 201:
|
||||
print(f"⚠️ Batch {batch_idx+1} fehlgeschlagen: {resp.text[:100]}")
|
||||
continue
|
||||
|
||||
scraper_id = resp.json()['id']
|
||||
print(f"✅ Scraper: {scraper_id}")
|
||||
|
||||
for poll_i in range(1, 61): # Reduziert: 121 → 61 (max_time 60s)
|
||||
r = requests.get(
|
||||
f"{SCRAPER_URL}/api/v1/jobs/{scraper_id}",
|
||||
timeout=15
|
||||
)
|
||||
data = r.json()
|
||||
status = data.get('Status', data.get('status', '?'))
|
||||
|
||||
if status in ('ok', 'completed', 'scraped'):
|
||||
dl = requests.get(
|
||||
f"{SCRAPER_URL}/api/v1/jobs/{scraper_id}/download",
|
||||
timeout=90
|
||||
)
|
||||
if dl.status_code == 200:
|
||||
df_filtered = process_result_csv(dl.content, df_input, True)
|
||||
df_raw = process_result_csv(dl.content, df_input, False)
|
||||
if df_filtered is not None:
|
||||
all_results_filtered.append(df_filtered)
|
||||
all_results_raw.append(df_raw)
|
||||
print(f"📊 Batch {batch_idx+1}: {len(df_filtered)} filtered")
|
||||
break
|
||||
elif status in ('failed', 'error'):
|
||||
print(f"💥 Batch {batch_idx+1}: {status}")
|
||||
break
|
||||
|
||||
time.sleep(random.uniform(5, 10)) # Reduziert: 10-20s → 5-10s (paid proxy)
|
||||
|
||||
except Exception as e:
|
||||
print(f"💥 Batch {batch_idx+1}: {e}")
|
||||
|
||||
job.status = f"🔄 Batch {batch_idx+2}/{batches}"
|
||||
db.session.commit()
|
||||
|
||||
#MERGE & SAVE
|
||||
job.status = "🔧 merging results"
|
||||
db.session.commit()
|
||||
|
||||
base = filename.replace('.csv', '')
|
||||
os.makedirs(RESULT_FOLDER, exist_ok=True)
|
||||
|
||||
if all_results_filtered:
|
||||
df_final_filtered = pd.concat(all_results_filtered, ignore_index=True)
|
||||
df_final_filtered = df_final_filtered.drop_duplicates(subset=['title', 'address'])
|
||||
|
||||
out_filtered = f"results_{base}_filtered.csv"
|
||||
df_final_filtered.to_csv(
|
||||
os.path.join(RESULT_FOLDER, out_filtered),
|
||||
index=False, encoding='utf-8-sig', sep=';'
|
||||
)
|
||||
|
||||
if all_results_raw:
|
||||
df_final_raw = pd.concat(all_results_raw, ignore_index=True)
|
||||
out_raw = f"results_{base}_all.csv"
|
||||
df_final_raw.to_csv(
|
||||
os.path.join(RESULT_FOLDER, out_raw),
|
||||
index=False, encoding='utf-8-sig', sep=';'
|
||||
)
|
||||
|
||||
job.result_filename = out_filtered
|
||||
job.result_filename_raw = out_raw
|
||||
job.status = f"✅ Fertig: {len(df_final_filtered)} Firmen"
|
||||
else:
|
||||
job.status = "❌ Keine Ergebnisse"
|
||||
|
||||
db.session.commit()
|
||||
print(f"🎉 Job {job_id} komplett!")
|
||||
|
||||
except Exception as e:
|
||||
job.status = f"Failed: {str(e)[:50]}"
|
||||
print(f"💥 FATAL: {e}")
|
||||
import traceback
|
||||
traceback.print_exc()
|
||||
db.session.commit()
|
||||
|
||||
print(f"✅ DONE! Status: {job.status}")
|
||||
|
|
@ -1,429 +0,0 @@
|
|||
import os
|
||||
import re
|
||||
import unicodedata
|
||||
import json
|
||||
import pandas as pd
|
||||
import requests
|
||||
import time
|
||||
import random
|
||||
from io import StringIO
|
||||
from app.models import db, Job
|
||||
|
||||
print("🆕 MODERN webcrawler LOADED! – BATCHED + PROXY + RESUME + ETA + 2x SCRAPER")
|
||||
|
||||
UPLOAD_FOLDER = '/app/uploads'
|
||||
RESULT_FOLDER = '/app/results'
|
||||
|
||||
# 2x Scraper – abwechselnd genutzt
|
||||
SCRAPER_URLS = [
|
||||
"http://gmaps-scraper-1:8080",
|
||||
"http://gmaps-scraper-2:8080",
|
||||
]
|
||||
|
||||
OUTPUT_COLS = ['title', 'category', 'address', 'open_hours', 'website', 'phone', 'link']
|
||||
|
||||
PROXY_URL = "http://bitlleuv-rotate:s5hzse6hz74b@p.webshare.io:80"
|
||||
API_PROXIES = {"http": PROXY_URL, "https": PROXY_URL}
|
||||
|
||||
# ──────────────────────────────────────────────
|
||||
# Tuning
|
||||
# ──────────────────────────────────────────────
|
||||
BATCH_SIZE = 30 # Keywords pro Scraper-Job
|
||||
BATCH_DELAY_MIN = 3 # Sekunden Pause zwischen Batches (min)
|
||||
BATCH_DELAY_MAX = 6 # Sekunden Pause zwischen Batches (max)
|
||||
MAX_TIME = 60 # Sekunden die der Scraper pro Batch hat
|
||||
POLL_MAX = 90 # Max. Poll-Versuche pro Batch
|
||||
POLL_DELAY_MIN = 2 # Sekunden zwischen Polls (min)
|
||||
POLL_DELAY_MAX = 5 # Sekunden zwischen Polls (max)
|
||||
STUCK_THRESHOLD = 8 # Polls auf 'pending' bis Auto-Restart
|
||||
MAX_RETRIES = 2 # Wiederholversuche pro Batch bei Fehler
|
||||
|
||||
# ──────────────────────────────────────────────
|
||||
# Hilfsfunktionen
|
||||
# ──────────────────────────────────────────────
|
||||
|
||||
def is_blocked(data):
|
||||
text = str(data).lower()
|
||||
blocked = any(kw in text for kw in ['captcha', 'blocked', 'rate limit', 'too many', '429'])
|
||||
if blocked:
|
||||
print(f"🚫 BLOCKED: {str(data)[:100]}")
|
||||
return blocked
|
||||
|
||||
def fix_encoding(text):
|
||||
if not isinstance(text, str):
|
||||
return text
|
||||
try:
|
||||
return text.encode('latin-1').decode('utf-8')
|
||||
except (UnicodeEncodeError, UnicodeDecodeError):
|
||||
return text
|
||||
|
||||
# Fix 1: Sonderzeichen in Queries bereinigen
|
||||
def clean_query(q):
|
||||
"""Steuerzeichen + fehlerhafte Bytes entfernen für saubere Google Maps URLs"""
|
||||
q = ''.join(c for c in q if unicodedata.category(c) != 'Cc')
|
||||
q = ' '.join(q.split())
|
||||
return q.strip()
|
||||
|
||||
def build_input_addresses(df):
|
||||
addresses = set()
|
||||
for _, row in df.iterrows():
|
||||
plz = str(row.get('PLZ', '')).strip()
|
||||
stadt = str(row.get('Stadt', '')).strip()
|
||||
str_ = str(row.get('Straße', '')).strip()
|
||||
nr = str(row.get('Hausnummer', '')).strip()
|
||||
zusatz = str(row.get('Zusatz', '')).strip()
|
||||
full = f"{str_} {nr} {zusatz} {plz} {stadt}".lower().strip()
|
||||
full = ' '.join(full.split())
|
||||
addresses.add(full)
|
||||
return addresses
|
||||
|
||||
def normalize_address(addr):
|
||||
if not isinstance(addr, str):
|
||||
return ''
|
||||
addr = fix_encoding(addr)
|
||||
return ' '.join(addr.lower().strip().split())
|
||||
|
||||
def address_in_input(result_addr, input_addresses):
|
||||
norm = normalize_address(result_addr)
|
||||
for inp_addr in input_addresses:
|
||||
plz_match = re.search(r'\b\d{5}\b', inp_addr)
|
||||
if plz_match:
|
||||
plz = plz_match.group()
|
||||
if plz in norm:
|
||||
street = inp_addr.split()[0] if inp_addr else ''
|
||||
if len(street) > 3 and street[:4].lower() in norm:
|
||||
return True
|
||||
return False
|
||||
|
||||
def format_eta(seconds):
|
||||
"""Sekunden → lesbares ETA-Format"""
|
||||
if seconds < 60:
|
||||
return f"{int(seconds)}s"
|
||||
h, rem = divmod(int(seconds), 3600)
|
||||
m = rem // 60
|
||||
return f"{h}h {m:02d}min" if h > 0 else f"{m}min"
|
||||
|
||||
# ──────────────────────────────────────────────
|
||||
# Fix 3: Scraper-Neustart bei Inactivity
|
||||
# ──────────────────────────────────────────────
|
||||
|
||||
def restart_scraper(scraper_url):
|
||||
"""Den betroffenen Scraper-Container neu starten"""
|
||||
try:
|
||||
import subprocess
|
||||
# Container-Name aus URL ableiten: http://gmaps-scraper-1:8080 → gmaps-scraper-1
|
||||
container = scraper_url.split("//")[1].split(":")[0]
|
||||
print(f"🔄 Starte {container} neu...")
|
||||
subprocess.run(["docker", "restart", container], timeout=30, capture_output=True)
|
||||
print(f"✅ {container} neu gestartet – warte 15s...")
|
||||
time.sleep(15)
|
||||
return True
|
||||
except Exception as e:
|
||||
print(f"⚠️ Scraper-Neustart fehlgeschlagen: {e}")
|
||||
return False
|
||||
|
||||
# ──────────────────────────────────────────────
|
||||
# Resume: Progress-File Hilfsfunktionen
|
||||
# ──────────────────────────────────────────────
|
||||
|
||||
def get_progress_path(job_id):
|
||||
return os.path.join(RESULT_FOLDER, f"progress_{job_id}.json")
|
||||
|
||||
def get_partial_path(job_id, suffix):
|
||||
return os.path.join(RESULT_FOLDER, f"partial_{job_id}_{suffix}.csv")
|
||||
|
||||
def load_progress(job_id):
|
||||
"""Gespeicherten Fortschritt laden (falls vorhanden)"""
|
||||
path = get_progress_path(job_id)
|
||||
if os.path.exists(path):
|
||||
with open(path, 'r') as f:
|
||||
data = json.load(f)
|
||||
print(f"🔁 RESUME: ab Batch {data['last_completed_batch'] + 1}/{data['total_batches']}")
|
||||
return data
|
||||
return None
|
||||
|
||||
def save_progress(job_id, last_completed_batch, total_batches):
|
||||
"""Fortschritt nach jedem Batch speichern"""
|
||||
path = get_progress_path(job_id)
|
||||
with open(path, 'w') as f:
|
||||
json.dump({"last_completed_batch": last_completed_batch, "total_batches": total_batches}, f)
|
||||
|
||||
def append_partial(job_id, df_filtered, df_raw):
|
||||
"""Batch-Ergebnis an Partial-CSV anhängen"""
|
||||
for suffix, df in [('filtered', df_filtered), ('raw', df_raw)]:
|
||||
if df is None:
|
||||
continue
|
||||
path = get_partial_path(job_id, suffix)
|
||||
header = not os.path.exists(path)
|
||||
df.to_csv(path, mode='a', index=False, header=header, encoding='utf-8-sig', sep=';')
|
||||
|
||||
def load_partial(job_id):
|
||||
"""Bestehende Partial-CSVs laden"""
|
||||
results_filtered, results_raw = [], []
|
||||
for suffix, lst in [('filtered', results_filtered), ('raw', results_raw)]:
|
||||
path = get_partial_path(job_id, suffix)
|
||||
if os.path.exists(path):
|
||||
try:
|
||||
df = pd.read_csv(path, sep=';', encoding='utf-8-sig')
|
||||
lst.append(df)
|
||||
print(f"📂 Partial {suffix}: {len(df)} Zeilen geladen")
|
||||
except Exception as e:
|
||||
print(f"⚠️ Partial {suffix} Ladefehler: {e}")
|
||||
return results_filtered, results_raw
|
||||
|
||||
def cleanup_progress(job_id):
|
||||
"""Progress + Partial-Files nach Abschluss löschen"""
|
||||
for path in [
|
||||
get_progress_path(job_id),
|
||||
get_partial_path(job_id, 'filtered'),
|
||||
get_partial_path(job_id, 'raw'),
|
||||
]:
|
||||
if os.path.exists(path):
|
||||
os.remove(path)
|
||||
|
||||
# ──────────────────────────────────────────────
|
||||
# CSV Nachbearbeitung
|
||||
# ──────────────────────────────────────────────
|
||||
|
||||
def process_result_csv(raw_bytes, input_df, apply_filter=True):
|
||||
try:
|
||||
content = raw_bytes.decode('utf-8', errors='replace')
|
||||
df_out = pd.read_csv(StringIO(content))
|
||||
print(f"📄 Raw result: {df_out.shape}")
|
||||
|
||||
available = [c for c in OUTPUT_COLS if c in df_out.columns]
|
||||
df_out = df_out[available]
|
||||
|
||||
for col in df_out.columns:
|
||||
df_out[col] = df_out[col].apply(fix_encoding)
|
||||
|
||||
if apply_filter:
|
||||
input_addresses = build_input_addresses(input_df)
|
||||
before = len(df_out)
|
||||
df_out = df_out[
|
||||
df_out['address'].apply(lambda a: address_in_input(a, input_addresses))
|
||||
]
|
||||
print(f"📍 Filter: {before} → {len(df_out)}")
|
||||
|
||||
df_out = df_out.drop_duplicates(subset=['title', 'address'], keep='first')
|
||||
df_out = df_out.dropna(subset=['title'], how='all')
|
||||
df_out = df_out[df_out['title'].str.strip().astype(bool)]
|
||||
|
||||
print(f"✅ Final ({'gefiltert' if apply_filter else 'alle'}): {df_out.shape}")
|
||||
return df_out
|
||||
except Exception as e:
|
||||
print(f"💥 process_result_csv: {e}")
|
||||
return None
|
||||
|
||||
# ──────────────────────────────────────────────
|
||||
# HAUPT-WORKER
|
||||
# ──────────────────────────────────────────────
|
||||
|
||||
def process_file(filename, job_id, app):
|
||||
print(f"🎯 {filename} Job#{job_id} START!")
|
||||
|
||||
with app.app_context():
|
||||
job = Job.query.get(job_id)
|
||||
if not job:
|
||||
print("❌ Job missing")
|
||||
return
|
||||
|
||||
try:
|
||||
#Parse + ALLE Queries
|
||||
job.status = "📊 parsing CSV"
|
||||
db.session.commit()
|
||||
|
||||
filepath = os.path.join(UPLOAD_FOLDER, filename)
|
||||
print(f"📁 {filepath} | {os.path.getsize(filepath)}b")
|
||||
|
||||
df_input = pd.read_csv(filepath, sep=';', encoding='ISO-8859-1')
|
||||
print(f"📊 {df_input.shape}")
|
||||
|
||||
queries = []
|
||||
for _, row in df_input.iterrows():
|
||||
parts = [
|
||||
str(row.get('PLZ', '')).strip(),
|
||||
str(row.get('Stadt', '')).strip(),
|
||||
str(row.get('Straße', '')).strip(),
|
||||
str(row.get('Hausnummer', '')).strip(),
|
||||
str(row.get('Zusatz', '')).strip(),
|
||||
]
|
||||
q = f"Firmen {' '.join(p for p in parts if p and p != 'nan')}".strip()
|
||||
q = clean_query(q) # Fix 1: Sonderzeichen bereinigen
|
||||
if len(q) > 10:
|
||||
queries.append(q)
|
||||
|
||||
total_queries = len(queries)
|
||||
print(f"🔍 {total_queries} Queries | Samples: {queries[:3]}")
|
||||
if total_queries == 0:
|
||||
raise ValueError("Keine gültigen Adressen")
|
||||
|
||||
#BATCHED Processing
|
||||
batches = (total_queries + BATCH_SIZE - 1) // BATCH_SIZE
|
||||
|
||||
# Resume: Fortschritt laden falls vorhanden
|
||||
os.makedirs(RESULT_FOLDER, exist_ok=True)
|
||||
progress = load_progress(job_id)
|
||||
start_batch = progress['last_completed_batch'] + 1 if progress else 0
|
||||
all_results_filtered, all_results_raw = load_partial(job_id) if progress else ([], [])
|
||||
|
||||
eta_initial = format_eta((batches - start_batch) * ((BATCH_DELAY_MAX + MAX_TIME) / 2))
|
||||
print(f"📦 {batches} Batches à {BATCH_SIZE} | 2x Scraper | Start: {start_batch} | ETA: ~{eta_initial}")
|
||||
job_start_time = time.time()
|
||||
job.status = f"🔄 Batch {start_batch+1}/{batches} | ⏱️ ~{eta_initial}"
|
||||
db.session.commit()
|
||||
|
||||
for batch_idx in range(start_batch, batches):
|
||||
batch_start = batch_idx * BATCH_SIZE
|
||||
batch_end = min(batch_start + BATCH_SIZE, total_queries)
|
||||
batch_queries = queries[batch_start:batch_end]
|
||||
|
||||
# 2x Scraper: abwechselnd nutzen
|
||||
scraper_url = SCRAPER_URLS[batch_idx % len(SCRAPER_URLS)]
|
||||
print(f"\n🔄 BATCH {batch_idx+1}/{batches} ({batch_start+1}-{batch_end}/{total_queries}) → {scraper_url}")
|
||||
|
||||
#Random Delay
|
||||
delay = random.uniform(BATCH_DELAY_MIN, BATCH_DELAY_MAX)
|
||||
print(f"😴 Delay: {delay:.0f}s")
|
||||
time.sleep(delay)
|
||||
|
||||
#API Call
|
||||
payload = {
|
||||
"name": f"{filename.replace('.csv','')}-{job_id}-B{batch_idx+1:03d}",
|
||||
"keywords": batch_queries,
|
||||
"lang": "de",
|
||||
"depth": 1,
|
||||
"zoom": 15,
|
||||
"radius": 50,
|
||||
"max_time": MAX_TIME,
|
||||
"fast_mode": False,
|
||||
"proxies": [PROXY_URL]
|
||||
}
|
||||
|
||||
batch_success = False
|
||||
# Fix 2: Retry-Logik bei Scraper-Fehler
|
||||
for attempt in range(1, MAX_RETRIES + 1):
|
||||
try:
|
||||
resp = requests.post(f"{scraper_url}/api/v1/jobs", json=payload, timeout=45)
|
||||
print(f"📤 {resp.status_code} (Versuch {attempt} | {scraper_url})")
|
||||
|
||||
if is_blocked(resp.text):
|
||||
print("🚫 Batch übersprungen (blocked)")
|
||||
break
|
||||
if resp.status_code != 201:
|
||||
print(f"⚠️ Batch {batch_idx+1} fehlgeschlagen: {resp.text[:100]}")
|
||||
if attempt < MAX_RETRIES:
|
||||
time.sleep(10)
|
||||
continue
|
||||
|
||||
scraper_id = resp.json()['id']
|
||||
print(f"✅ Scraper: {scraper_id}")
|
||||
|
||||
stuck_counter = 0
|
||||
for poll_i in range(1, POLL_MAX + 1):
|
||||
r = requests.get(f"{scraper_url}/api/v1/jobs/{scraper_id}", timeout=15)
|
||||
data = r.json()
|
||||
status = data.get('Status', data.get('status', '?'))
|
||||
print(f"⏳ Poll {poll_i}: {status}")
|
||||
|
||||
# Fix 4: Auto-Recovery bei Pending-Stuck
|
||||
if status == 'pending':
|
||||
stuck_counter += 1
|
||||
if stuck_counter >= STUCK_THRESHOLD:
|
||||
print(f"⚠️ Job {scraper_id} hängt – abbrechen + Neustart")
|
||||
requests.delete(f"{scraper_url}/api/v1/jobs/{scraper_id}", timeout=10)
|
||||
restart_scraper(scraper_url) # Fix 3: Nur betroffenen Scraper neu starten
|
||||
break
|
||||
else:
|
||||
stuck_counter = 0
|
||||
|
||||
if status in ('ok', 'completed', 'scraped'):
|
||||
dl = requests.get(f"{scraper_url}/api/v1/jobs/{scraper_id}/download", timeout=90)
|
||||
if dl.status_code == 200:
|
||||
df_filtered = process_result_csv(dl.content, df_input, True)
|
||||
df_raw = process_result_csv(dl.content, df_input, False)
|
||||
if df_filtered is not None:
|
||||
all_results_filtered.append(df_filtered)
|
||||
all_results_raw.append(df_raw)
|
||||
append_partial(job_id, df_filtered, df_raw) # Resume: sofort speichern
|
||||
print(f"📊 Batch {batch_idx+1}: {len(df_filtered)} filtered")
|
||||
batch_success = True
|
||||
break
|
||||
|
||||
# Fix 2: Scraper-Fehler → Retry
|
||||
elif status in ('failed', 'error'):
|
||||
print(f"💥 Batch {batch_idx+1}: {status} (Versuch {attempt})")
|
||||
if attempt < MAX_RETRIES:
|
||||
time.sleep(10)
|
||||
break
|
||||
|
||||
time.sleep(random.uniform(POLL_DELAY_MIN, POLL_DELAY_MAX))
|
||||
|
||||
if batch_success:
|
||||
break
|
||||
|
||||
except Exception as e:
|
||||
print(f"💥 Batch {batch_idx+1} Versuch {attempt}: {e}")
|
||||
if attempt < MAX_RETRIES:
|
||||
time.sleep(10)
|
||||
|
||||
# Resume: Fortschritt nach jedem Batch speichern
|
||||
save_progress(job_id, batch_idx, batches)
|
||||
|
||||
# ETA berechnen
|
||||
elapsed = time.time() - job_start_time
|
||||
done_so_far = batch_idx - start_batch + 1
|
||||
if done_so_far > 0:
|
||||
avg_per_batch = elapsed / done_so_far
|
||||
remaining = (batches - batch_idx - 1) * avg_per_batch
|
||||
eta_str = format_eta(remaining)
|
||||
else:
|
||||
eta_str = "?"
|
||||
|
||||
job.status = f"🔄 Batch {batch_idx+2}/{batches} | ⏱️ ~{eta_str}"
|
||||
db.session.commit()
|
||||
|
||||
#MERGE & SAVE
|
||||
job.status = "🔧 merging results"
|
||||
db.session.commit()
|
||||
|
||||
base = filename.replace('.csv', '')
|
||||
|
||||
if all_results_filtered:
|
||||
df_final_filtered = pd.concat(all_results_filtered, ignore_index=True)
|
||||
df_final_filtered = df_final_filtered.drop_duplicates(subset=['title', 'address'])
|
||||
|
||||
out_filtered = f"results_{base}_filtered.csv"
|
||||
df_final_filtered.to_csv(
|
||||
os.path.join(RESULT_FOLDER, out_filtered),
|
||||
index=False, encoding='utf-8-sig', sep=';'
|
||||
)
|
||||
|
||||
if all_results_raw:
|
||||
df_final_raw = pd.concat(all_results_raw, ignore_index=True)
|
||||
out_raw = f"results_{base}_all.csv"
|
||||
df_final_raw.to_csv(
|
||||
os.path.join(RESULT_FOLDER, out_raw),
|
||||
index=False, encoding='utf-8-sig', sep=';'
|
||||
)
|
||||
|
||||
job.result_filename = out_filtered
|
||||
job.result_filename_raw = out_raw
|
||||
job.status = f"✅ Fertig: {len(df_final_filtered)} Firmen"
|
||||
|
||||
# Resume: Cleanup nach Abschluss
|
||||
cleanup_progress(job_id)
|
||||
else:
|
||||
job.status = "❌ Keine Ergebnisse"
|
||||
|
||||
db.session.commit()
|
||||
print(f"🎉 Job {job_id} komplett!")
|
||||
|
||||
except Exception as e:
|
||||
job.status = f"Failed: {str(e)[:50]}"
|
||||
print(f"💥 FATAL: {e}")
|
||||
import traceback
|
||||
traceback.print_exc()
|
||||
db.session.commit()
|
||||
|
||||
print(f"✅ DONE! Status: {job.status}")
|
||||
|
|
@ -1,138 +0,0 @@
|
|||
import csv
|
||||
import os
|
||||
import requests
|
||||
from .models import db, Job
|
||||
from flask import current_app
|
||||
|
||||
UPLOAD_FOLDER = 'uploads'
|
||||
RESULT_FOLDER = 'results'
|
||||
|
||||
API_KEY = 'AIzaSyAIf0yXJTwo87VMWLBtq2m2LqE-OaPGbzw'
|
||||
|
||||
processed_companies = set()
|
||||
|
||||
def get_geocode(address):
|
||||
url = f"https://maps.googleapis.com/maps/api/geocode/json"
|
||||
params = {'address': address, 'key': API_KEY}
|
||||
|
||||
try:
|
||||
response = requests.get(url, params=params, timeout=5)
|
||||
if response.status_code == 200:
|
||||
data = response.json()
|
||||
if data['status'] == 'OK':
|
||||
location = data['results'][0]['geometry']['location']
|
||||
return location['lat'], location['lng']
|
||||
except requests.RequestException as e:
|
||||
print(f"Geocode API Fehler für {address}: {e}")
|
||||
return None, None
|
||||
|
||||
def get_nearby_places(lat, lng):
|
||||
places_url = f"https://maps.googleapis.com/maps/api/place/nearbysearch/json"
|
||||
params = {
|
||||
'location': f"{lat},{lng}",
|
||||
'radius': 10,
|
||||
'type': 'point_of_interest',
|
||||
'key': API_KEY
|
||||
}
|
||||
|
||||
try:
|
||||
response = requests.get(places_url, params=params, timeout=5)
|
||||
if response.status_code == 200:
|
||||
return response.json().get('results', [])
|
||||
except requests.RequestException as e:
|
||||
print(f"Nearby Places API Fehler für Standort {lat},{lng}: {e}")
|
||||
return []
|
||||
|
||||
def get_place_details(place_id):
|
||||
details_url = f"https://maps.googleapis.com/maps/api/place/details/json"
|
||||
params = {
|
||||
'place_id': place_id,
|
||||
'fields': 'formatted_phone_number,website',
|
||||
'key': API_KEY
|
||||
}
|
||||
|
||||
try:
|
||||
response = requests.get(details_url, params=params, timeout=5)
|
||||
if response.status_code == 200:
|
||||
result = response.json().get('result', {})
|
||||
return result.get('formatted_phone_number', 'N/A'), result.get('website', 'N/A')
|
||||
except requests.RequestException as e:
|
||||
print(f"Place Details API Fehler für Place ID {place_id}: {e}")
|
||||
return 'N/A', 'N/A'
|
||||
|
||||
def process_file(filename, job_id, app):
|
||||
with app.app_context():
|
||||
filepath = os.path.join(UPLOAD_FOLDER, filename)
|
||||
results = []
|
||||
|
||||
job = Job.query.get(job_id)
|
||||
if not job:
|
||||
print("Job wurde abgebrochen.")
|
||||
return
|
||||
job.status = "In Progress"
|
||||
db.session.commit()
|
||||
|
||||
with open(filepath, newline='', encoding='ISO-8859-1') as csvfile:
|
||||
reader = csv.DictReader(csvfile, delimiter=';')
|
||||
headers = reader.fieldnames
|
||||
|
||||
if not all(field in headers for field in ['PLZ', 'Straße', 'Hausnummer']):
|
||||
print("CSV-Datei enthält nicht alle notwendigen Spalten.")
|
||||
job.status = "Failed"
|
||||
db.session.commit()
|
||||
return
|
||||
|
||||
for row in reader:
|
||||
plz = row.get('PLZ', '').strip()
|
||||
city = row.get('Stadt', row.get('Bezirk', '')).strip()
|
||||
street = row.get('Straße', '').strip()
|
||||
house_number = row.get('Hausnummer', '').strip()
|
||||
additional = row.get('Zusatz', '').strip()
|
||||
|
||||
if not all([plz, city, street, house_number]):
|
||||
continue
|
||||
|
||||
full_address = f"{street} {house_number} {additional}, {plz} {city}"
|
||||
lat, lng = get_geocode(full_address)
|
||||
if lat is None or lng is None:
|
||||
continue
|
||||
|
||||
nearby_places = get_nearby_places(lat, lng)
|
||||
for place in nearby_places:
|
||||
company_name = place['name']
|
||||
if company_name in processed_companies:
|
||||
continue
|
||||
|
||||
processed_companies.add(company_name)
|
||||
company_address = place.get('vicinity', 'N/A').split(',')[0]
|
||||
place_id = place.get('place_id')
|
||||
company_phone, company_website = get_place_details(place_id) if place_id else ('N/A', 'N/A')
|
||||
|
||||
results.append({
|
||||
'PLZ': plz,
|
||||
'Stadt': city,
|
||||
'Straße': street,
|
||||
'Hausnummer': house_number,
|
||||
'Zusatz': additional,
|
||||
'Company Name': company_name,
|
||||
'Company Address': company_address,
|
||||
'Company Phone': company_phone,
|
||||
'Company Website': company_website
|
||||
})
|
||||
|
||||
if results:
|
||||
result_file = f"results_{os.path.splitext(filename)[0]}.csv"
|
||||
result_path = os.path.join(RESULT_FOLDER, result_file)
|
||||
with open(result_path, 'w', newline='', encoding='utf-8-sig') as csvfile:
|
||||
writer = csv.DictWriter(csvfile, fieldnames=[
|
||||
'PLZ', 'Stadt', 'Straße', 'Hausnummer', 'Zusatz',
|
||||
'Company Name', 'Company Address', 'Company Phone', 'Company Website'
|
||||
])
|
||||
writer.writeheader()
|
||||
writer.writerows(results)
|
||||
job.status = "Completed"
|
||||
job.result_filename = result_file
|
||||
db.session.commit()
|
||||
else:
|
||||
job.status = "Failed"
|
||||
db.session.commit()
|
||||
|
|
@ -1,487 +1,128 @@
|
|||
import csv
|
||||
import os
|
||||
import re
|
||||
import unicodedata
|
||||
import json
|
||||
import threading
|
||||
import pandas as pd
|
||||
import requests
|
||||
import time
|
||||
import random
|
||||
from io import StringIO
|
||||
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||||
from app.models import db, Job
|
||||
from .models import db, Job
|
||||
from flask import current_app
|
||||
|
||||
print("🆕 MODERN webcrawler LOADED! – BATCHED + PROXY + RESUME + ETA + 4x SCRAPER CHUNK-PARALLEL")
|
||||
UPLOAD_FOLDER = 'uploads'
|
||||
RESULT_FOLDER = 'results'
|
||||
API_KEY = 'AIzaSyAIf0yXJTwo87VMWLBtq2m2LqE-OaPGbzw'
|
||||
|
||||
UPLOAD_FOLDER = '/app/uploads'
|
||||
RESULT_FOLDER = '/app/results'
|
||||
def get_place_details(street, city_zip):
|
||||
address = f"{street}, {city_zip}"
|
||||
url = f"https://maps.googleapis.com/maps/api/place/textsearch/json"
|
||||
params = {'query': address, 'key': API_KEY}
|
||||
|
||||
SCRAPER_URLS = [
|
||||
"http://gmaps-scraper-1:8080",
|
||||
"http://gmaps-scraper-2:8080",
|
||||
"http://gmaps-scraper-3:8080",
|
||||
"http://gmaps-scraper-4:8080",
|
||||
]
|
||||
|
||||
OUTPUT_COLS = ['title', 'category', 'address', 'open_hours', 'website', 'phone', 'link']
|
||||
|
||||
PROXY_URL = "http://bitlleuv-rotate:s5hzse6hz74b@p.webshare.io:80"
|
||||
API_PROXIES = {"http": PROXY_URL, "https": PROXY_URL}
|
||||
|
||||
_job_semaphore = threading.Semaphore(1)
|
||||
|
||||
# ──────────────────────────────────────────────
|
||||
# Tuning
|
||||
# ──────────────────────────────────────────────
|
||||
BATCH_SIZE = 30 # Keywords pro Scraper-Job
|
||||
BATCH_DELAY_MIN = 3 # Sekunden Pause zwischen Chunks (min)
|
||||
BATCH_DELAY_MAX = 6 # Sekunden Pause zwischen Chunks (max)
|
||||
MAX_TIME = 60 # Sekunden die der Scraper pro Batch hat
|
||||
POLL_MAX = 90 # Max. Poll-Versuche pro Batch
|
||||
POLL_DELAY_MIN = 2 # Sekunden zwischen Polls (min)
|
||||
POLL_DELAY_MAX = 5 # Sekunden zwischen Polls (max)
|
||||
STUCK_TIMEOUT = 300 # Sekunden bis Scraper-Neustart (5 Min)
|
||||
MAX_RETRIES = 2 # Wiederholversuche pro Batch bei Fehler
|
||||
PARALLEL_WORKERS = len(SCRAPER_URLS)
|
||||
|
||||
_partial_lock = threading.Lock()
|
||||
|
||||
# ──────────────────────────────────────────────
|
||||
# Hilfsfunktionen
|
||||
# ──────────────────────────────────────────────
|
||||
|
||||
def is_blocked(data):
|
||||
text = str(data).lower()
|
||||
blocked = any(kw in text for kw in ['captcha', 'blocked', 'rate limit', 'too many', '429'])
|
||||
if blocked:
|
||||
print(f"🚫 BLOCKED: {str(data)[:100]}")
|
||||
return blocked
|
||||
|
||||
def fix_encoding(text):
|
||||
if not isinstance(text, str):
|
||||
return text
|
||||
results = []
|
||||
try:
|
||||
return text.encode('latin-1').decode('utf-8')
|
||||
except (UnicodeEncodeError, UnicodeDecodeError):
|
||||
return text
|
||||
response = requests.get(url, params=params, timeout=5)
|
||||
if response.status_code == 200:
|
||||
data = response.json()
|
||||
print(f"API Response Data for {address}: {data}")
|
||||
|
||||
def clean_query(q):
|
||||
q = ''.join(c for c in q if unicodedata.category(c) != 'Cc')
|
||||
q = ' '.join(q.split())
|
||||
return q.strip()
|
||||
for place in data.get('results', []):
|
||||
name = place.get('name', 'N/A')
|
||||
place_id = place.get('place_id')
|
||||
formatted_address = place.get('formatted_address', 'N/A')
|
||||
|
||||
def build_input_addresses(df):
|
||||
addresses = set()
|
||||
for _, row in df.iterrows():
|
||||
plz = str(row.get('PLZ', '')).strip()
|
||||
stadt = str(row.get('Stadt', '')).strip()
|
||||
str_ = str(row.get('Straße', '')).strip()
|
||||
nr = str(row.get('Hausnummer', '')).strip()
|
||||
zusatz = str(row.get('Zusatz', '')).strip()
|
||||
full = f"{str_} {nr} {zusatz} {plz} {stadt}".lower().strip()
|
||||
full = ' '.join(full.split())
|
||||
addresses.add(full)
|
||||
return addresses
|
||||
# Zweite Anfrage für detailliertere Informationen
|
||||
phone, website = 'N/A', 'N/A'
|
||||
if place_id:
|
||||
details_url = f"https://maps.googleapis.com/maps/api/place/details/json"
|
||||
details_params = {
|
||||
'place_id': place_id,
|
||||
'fields': 'formatted_phone_number,website',
|
||||
'key': API_KEY
|
||||
}
|
||||
details_response = requests.get(details_url, params=details_params, timeout=5)
|
||||
if details_response.status_code == 200:
|
||||
details_data = details_response.json().get('result', {})
|
||||
phone = details_data.get('formatted_phone_number', 'N/A')
|
||||
website = details_data.get('website', 'N/A')
|
||||
|
||||
def normalize_address(addr):
|
||||
if not isinstance(addr, str):
|
||||
return ''
|
||||
addr = fix_encoding(addr)
|
||||
return ' '.join(addr.lower().strip().split())
|
||||
# Speichern nur, wenn Name und Telefonnummer vorhanden sind
|
||||
if name != 'N/A' and phone != 'N/A':
|
||||
results.append({
|
||||
'Name': name,
|
||||
'Address': formatted_address,
|
||||
'Phone': phone,
|
||||
'Website': website
|
||||
})
|
||||
else:
|
||||
print(f"Fehler beim Abrufen der URL: {url} - Statuscode: {response.status_code}")
|
||||
except requests.exceptions.RequestException as e:
|
||||
print(f"Anfragefehler für {url}: {e}")
|
||||
|
||||
def address_in_input(result_addr, input_addresses):
|
||||
norm = normalize_address(result_addr)
|
||||
for inp_addr in input_addresses:
|
||||
plz_match = re.search(r'\b\d{5}\b', inp_addr)
|
||||
if not plz_match:
|
||||
continue
|
||||
plz = plz_match.group()
|
||||
if plz not in norm:
|
||||
continue
|
||||
|
||||
parts = inp_addr.split()
|
||||
|
||||
street = parts[0] if parts else ''
|
||||
if len(street) < 4 or street[:5].lower() not in norm:
|
||||
continue
|
||||
|
||||
hausnr = parts[1] if len(parts) > 1 else ''
|
||||
if hausnr and not re.search(rf'\b{re.escape(hausnr)}\b', norm):
|
||||
continue
|
||||
|
||||
return True
|
||||
return False
|
||||
|
||||
def format_eta(seconds):
|
||||
if seconds < 60:
|
||||
return f"{int(seconds)}s"
|
||||
h, rem = divmod(int(seconds), 3600)
|
||||
m = rem // 60
|
||||
return f"{h}h {m:02d}min" if h > 0 else f"{m}min"
|
||||
|
||||
# ──────────────────────────────────────────────
|
||||
# Scraper-Job Cleanup
|
||||
# ──────────────────────────────────────────────
|
||||
|
||||
def _cleanup_scraper_job(scraper_url, scraper_id):
|
||||
"""Scraper-Job immer aufräumen wenn wir ihn nicht mehr brauchen"""
|
||||
try:
|
||||
requests.delete(f"{scraper_url}/api/v1/jobs/{scraper_id}", timeout=10)
|
||||
print(f"🗑️ Scraper-Job {scraper_id} gelöscht")
|
||||
except Exception as e:
|
||||
print(f"⚠️ Cleanup fehlgeschlagen: {e}")
|
||||
|
||||
# ──────────────────────────────────────────────
|
||||
# Scraper-Neustart via Docker SDK
|
||||
# ──────────────────────────────────────────────
|
||||
|
||||
def restart_scraper(scraper_url):
|
||||
try:
|
||||
import docker
|
||||
container_name = scraper_url.split("//")[1].split(":")[0]
|
||||
print(f"🔄 Starte {container_name} neu...")
|
||||
client = docker.from_env()
|
||||
container = client.containers.get(container_name)
|
||||
container.restart()
|
||||
print(f"✅ {container_name} neu gestartet – warte 15s...")
|
||||
time.sleep(15)
|
||||
return True
|
||||
except Exception as e:
|
||||
print(f"⚠️ Scraper-Neustart fehlgeschlagen: {e}")
|
||||
return False
|
||||
|
||||
# ──────────────────────────────────────────────
|
||||
# Resume: Progress-File Hilfsfunktionen
|
||||
# ──────────────────────────────────────────────
|
||||
|
||||
def get_progress_path(job_id):
|
||||
return os.path.join(RESULT_FOLDER, f"progress_{job_id}.json")
|
||||
|
||||
def get_partial_path(job_id, suffix):
|
||||
return os.path.join(RESULT_FOLDER, f"partial_{job_id}_{suffix}.csv")
|
||||
|
||||
def load_progress(job_id):
|
||||
path = get_progress_path(job_id)
|
||||
if os.path.exists(path):
|
||||
with open(path, 'r') as f:
|
||||
data = json.load(f)
|
||||
print(f"🔁 RESUME: ab Batch {data['last_completed_batch'] + 1}/{data['total_batches']}")
|
||||
return data
|
||||
return None
|
||||
|
||||
def save_progress(job_id, last_completed_batch, total_batches):
|
||||
path = get_progress_path(job_id)
|
||||
with open(path, 'w') as f:
|
||||
json.dump({"last_completed_batch": last_completed_batch, "total_batches": total_batches}, f)
|
||||
|
||||
def append_partial(job_id, df_filtered, df_raw):
|
||||
with _partial_lock:
|
||||
for suffix, df in [('filtered', df_filtered), ('raw', df_raw)]:
|
||||
if df is None:
|
||||
continue
|
||||
path = get_partial_path(job_id, suffix)
|
||||
header = not os.path.exists(path)
|
||||
df.to_csv(path, mode='a', index=False, header=header, encoding='utf-8-sig', sep=';')
|
||||
|
||||
def load_partial(job_id):
|
||||
results_filtered, results_raw = [], []
|
||||
for suffix, lst in [('filtered', results_filtered), ('raw', results_raw)]:
|
||||
path = get_partial_path(job_id, suffix)
|
||||
if os.path.exists(path):
|
||||
try:
|
||||
df = pd.read_csv(path, sep=';', encoding='utf-8-sig')
|
||||
lst.append(df)
|
||||
print(f"📂 Partial {suffix}: {len(df)} Zeilen geladen")
|
||||
except Exception as e:
|
||||
print(f"⚠️ Partial {suffix} Ladefehler: {e}")
|
||||
return results_filtered, results_raw
|
||||
|
||||
def cleanup_progress(job_id):
|
||||
for path in [
|
||||
get_progress_path(job_id),
|
||||
get_partial_path(job_id, 'filtered'),
|
||||
get_partial_path(job_id, 'raw'),
|
||||
]:
|
||||
if os.path.exists(path):
|
||||
os.remove(path)
|
||||
|
||||
# ──────────────────────────────────────────────
|
||||
# CSV Nachbearbeitung
|
||||
# ──────────────────────────────────────────────
|
||||
|
||||
def process_result_csv(raw_bytes, input_df, apply_filter=True):
|
||||
try:
|
||||
content = raw_bytes.decode('utf-8', errors='replace')
|
||||
df_out = pd.read_csv(StringIO(content))
|
||||
print(f"📄 Raw result: {df_out.shape}")
|
||||
|
||||
available = [c for c in OUTPUT_COLS if c in df_out.columns]
|
||||
df_out = df_out[available]
|
||||
|
||||
for col in df_out.columns:
|
||||
df_out[col] = df_out[col].apply(fix_encoding)
|
||||
|
||||
if apply_filter:
|
||||
input_addresses = build_input_addresses(input_df)
|
||||
before = len(df_out)
|
||||
df_out = df_out[
|
||||
df_out['address'].apply(lambda a: address_in_input(a, input_addresses))
|
||||
]
|
||||
print(f"📍 Filter: {before} → {len(df_out)}")
|
||||
|
||||
df_out = df_out.drop_duplicates(subset=['title', 'address'], keep='first')
|
||||
df_out = df_out.dropna(subset=['title'], how='all')
|
||||
df_out = df_out[df_out['title'].str.strip().astype(bool)]
|
||||
|
||||
print(f"✅ Final ({'gefiltert' if apply_filter else 'alle'}): {df_out.shape}")
|
||||
return df_out
|
||||
except Exception as e:
|
||||
print(f"💥 process_result_csv: {e}")
|
||||
return None
|
||||
|
||||
# ──────────────────────────────────────────────
|
||||
# Parallel: Einzelnen Batch verarbeiten
|
||||
# ──────────────────────────────────────────────
|
||||
|
||||
def process_batch(batch_idx, batch_queries, scraper_url, filename, job_id, df_input):
|
||||
payload = {
|
||||
"name": f"{filename.replace('.csv','')}-{job_id}-B{batch_idx+1:03d}",
|
||||
"keywords": batch_queries,
|
||||
"lang": "de",
|
||||
"depth": 1,
|
||||
"zoom": 17,
|
||||
"radius": 100,
|
||||
"max_time": MAX_TIME,
|
||||
"fast_mode": False,
|
||||
"proxies": [PROXY_URL]
|
||||
}
|
||||
|
||||
for attempt in range(1, MAX_RETRIES + 1):
|
||||
scraper_id = None
|
||||
try:
|
||||
resp = requests.post(f"{scraper_url}/api/v1/jobs", json=payload, timeout=45)
|
||||
print(f"📤 Batch {batch_idx+1} → {scraper_url} | {resp.status_code} (Versuch {attempt})")
|
||||
|
||||
if is_blocked(resp.text):
|
||||
print(f"🚫 Batch {batch_idx+1} blocked")
|
||||
return None, None
|
||||
|
||||
if resp.status_code != 201:
|
||||
print(f"⚠️ Batch {batch_idx+1} fehlgeschlagen: {resp.text[:100]}")
|
||||
if attempt < MAX_RETRIES:
|
||||
time.sleep(10)
|
||||
continue
|
||||
|
||||
scraper_id = resp.json()['id']
|
||||
print(f"✅ Batch {batch_idx+1} Scraper-ID: {scraper_id}")
|
||||
|
||||
batch_start_time = time.time()
|
||||
|
||||
for poll_i in range(1, POLL_MAX + 1):
|
||||
r = requests.get(f"{scraper_url}/api/v1/jobs/{scraper_id}", timeout=15)
|
||||
data = r.json()
|
||||
status = data.get('Status', data.get('status', '?'))
|
||||
elapsed = time.time() - batch_start_time
|
||||
print(f"⏳ Batch {batch_idx+1} Poll {poll_i}: {status} | {int(elapsed)}s")
|
||||
|
||||
if status == 'pending' and elapsed > STUCK_TIMEOUT:
|
||||
print(f"⚠️ Batch {batch_idx+1} hängt seit {int(elapsed)}s – Neustart {scraper_url}")
|
||||
_cleanup_scraper_job(scraper_url, scraper_id)
|
||||
scraper_id = None
|
||||
restart_scraper(scraper_url)
|
||||
break
|
||||
|
||||
if status in ('ok', 'completed', 'scraped'):
|
||||
dl = requests.get(f"{scraper_url}/api/v1/jobs/{scraper_id}/download", timeout=90)
|
||||
scraper_id = None
|
||||
if dl.status_code == 200:
|
||||
df_filtered = process_result_csv(dl.content, df_input, True)
|
||||
df_raw = process_result_csv(dl.content, df_input, False)
|
||||
print(f"📊 Batch {batch_idx+1}: {len(df_filtered) if df_filtered is not None else 0} filtered")
|
||||
return df_filtered, df_raw
|
||||
return None, None
|
||||
|
||||
elif status in ('failed', 'error'):
|
||||
print(f"💥 Batch {batch_idx+1}: {status} (Versuch {attempt})")
|
||||
_cleanup_scraper_job(scraper_url, scraper_id)
|
||||
scraper_id = None
|
||||
if attempt < MAX_RETRIES:
|
||||
time.sleep(10)
|
||||
break
|
||||
|
||||
time.sleep(random.uniform(POLL_DELAY_MIN, POLL_DELAY_MAX))
|
||||
|
||||
except Exception as e:
|
||||
print(f"💥 Batch {batch_idx+1} Versuch {attempt}: {e}")
|
||||
if scraper_id:
|
||||
_cleanup_scraper_job(scraper_url, scraper_id)
|
||||
scraper_id = None
|
||||
if attempt < MAX_RETRIES:
|
||||
time.sleep(10)
|
||||
|
||||
return None, None
|
||||
|
||||
# ──────────────────────────────────────────────
|
||||
# HAUPT-WORKER
|
||||
# ──────────────────────────────────────────────
|
||||
return results
|
||||
|
||||
def process_file(filename, job_id, app):
|
||||
with app.app_context():
|
||||
print(f"Starte Prozess für Job-ID: {job_id}")
|
||||
filepath = os.path.join(UPLOAD_FOLDER, filename)
|
||||
results = []
|
||||
|
||||
job = Job.query.get(job_id)
|
||||
if job:
|
||||
job.status = "⏳ Wartet auf anderen Job..."
|
||||
if not job:
|
||||
print("Job wurde abgebrochen, bevor er starten konnte.")
|
||||
return
|
||||
job.status = "In Progress"
|
||||
db.session.commit()
|
||||
|
||||
with open(filepath, newline='', encoding='ISO-8859-1') as csvfile:
|
||||
reader = csv.DictReader(csvfile, delimiter=';')
|
||||
rows = list(reader)
|
||||
total_rows = len(rows)
|
||||
print(f"Insgesamt zu verarbeitende Zeilen: {total_rows}")
|
||||
|
||||
for index, row in enumerate(rows):
|
||||
# Job-Verfügbarkeit erneut prüfen
|
||||
job = Job.query.get(job_id)
|
||||
if not job:
|
||||
print("Job wurde abgebrochen.")
|
||||
return
|
||||
|
||||
# Vollständige Adresse erstellen
|
||||
street = f"{row.get('Straße', '')} {row.get('Hausnummer', '')}".strip()
|
||||
city_zip = f"{row.get('PLZ', '')} {row.get('Stadt', '')}".strip()
|
||||
print(f"Verarbeite Adresse: {street}, {city_zip}")
|
||||
address_results = get_place_details(street, city_zip)
|
||||
|
||||
for result in address_results:
|
||||
# Ergebnisse nur speichern, wenn Name und Telefonnummer vorhanden sind
|
||||
if result['Name'] != 'N/A' and result['Phone'] != 'N/A':
|
||||
result.update({
|
||||
'PLZ': row.get('PLZ', ''),
|
||||
'Stadt': row.get('Stadt', ''),
|
||||
'Straße': row.get('Straße', ''),
|
||||
'Hausnummer': row.get('Hausnummer', ''),
|
||||
'Zusatz': row.get('Zusatz', '')
|
||||
})
|
||||
results.append(result)
|
||||
|
||||
# Results-Dateiname basierend auf dem Upload-Dateinamen
|
||||
result_file = f"results_{filename}"
|
||||
result_path = os.path.join(RESULT_FOLDER, result_file)
|
||||
|
||||
# Prüfen und erstellen des Ergebnisverzeichnisses
|
||||
if not os.path.exists(RESULT_FOLDER):
|
||||
os.makedirs(RESULT_FOLDER)
|
||||
print(f"Erstelle Ergebnisverzeichnis: {RESULT_FOLDER}")
|
||||
|
||||
try:
|
||||
if results: # Nur speichern, wenn Ergebnisse vorhanden sind
|
||||
with open(result_path, 'w', newline='', encoding='utf-8-sig') as csvfile:
|
||||
writer = csv.DictWriter(csvfile, fieldnames=['Name', 'Address', 'Phone', 'Website', 'PLZ', 'Stadt', 'Straße', 'Hausnummer', 'Zusatz'])
|
||||
writer.writeheader()
|
||||
writer.writerows(results)
|
||||
print(f"Ergebnisdatei erfolgreich gespeichert unter: {result_path}")
|
||||
job.status = "Completed"
|
||||
job.result_filename = result_file
|
||||
db.session.commit()
|
||||
else:
|
||||
print("Keine relevanten Ergebnisse zum Speichern vorhanden. Markiere den Job als 'Failed'.")
|
||||
job.status = "Failed"
|
||||
db.session.commit()
|
||||
except Exception as e:
|
||||
print(f"Fehler beim Schreiben der Ergebnisdatei: {e}")
|
||||
job.status = "Failed"
|
||||
db.session.commit()
|
||||
|
||||
with _job_semaphore:
|
||||
print(f"🎯 {filename} Job#{job_id} START!")
|
||||
|
||||
with app.app_context():
|
||||
job = Job.query.get(job_id)
|
||||
if not job:
|
||||
print("❌ Job missing")
|
||||
return
|
||||
|
||||
try:
|
||||
job.status = "📊 parsing CSV"
|
||||
db.session.commit()
|
||||
|
||||
filepath = os.path.join(UPLOAD_FOLDER, filename)
|
||||
print(f"📁 {filepath} | {os.path.getsize(filepath)}b")
|
||||
|
||||
df_input = pd.read_csv(filepath, sep=';', encoding='ISO-8859-1')
|
||||
print(f"📊 {df_input.shape}")
|
||||
|
||||
queries = []
|
||||
for _, row in df_input.iterrows():
|
||||
parts = [
|
||||
str(row.get('PLZ', '')).strip(),
|
||||
str(row.get('Stadt', '')).strip(),
|
||||
str(row.get('Straße', '')).strip(),
|
||||
str(row.get('Hausnummer', '')).strip(),
|
||||
str(row.get('Zusatz', '')).strip(),
|
||||
]
|
||||
q = f"Firmen {' '.join(p for p in parts if p and p != 'nan')}".strip()
|
||||
q = clean_query(q)
|
||||
if len(q) > 10:
|
||||
queries.append(q)
|
||||
|
||||
total_queries = len(queries)
|
||||
print(f"🔍 {total_queries} Queries | Samples: {queries[:3]}")
|
||||
if total_queries == 0:
|
||||
raise ValueError("Keine gültigen Adressen")
|
||||
|
||||
batches = (total_queries + BATCH_SIZE - 1) // BATCH_SIZE
|
||||
|
||||
os.makedirs(RESULT_FOLDER, exist_ok=True)
|
||||
progress = load_progress(job_id)
|
||||
start_batch = progress['last_completed_batch'] + 1 if progress else 0
|
||||
all_results_filtered, all_results_raw = load_partial(job_id) if progress else ([], [])
|
||||
|
||||
eta_initial = format_eta((batches - start_batch) * ((BATCH_DELAY_MAX + MAX_TIME) / 2) / PARALLEL_WORKERS)
|
||||
print(f"📦 {batches} Batches à {BATCH_SIZE} | {PARALLEL_WORKERS}x parallel (Chunk) | Start: {start_batch} | ETA: ~{eta_initial}")
|
||||
job_start_time = time.time()
|
||||
job.status = f"🔄 Batch {start_batch+1}/{batches} | ⏱️ ~{eta_initial}"
|
||||
db.session.commit()
|
||||
|
||||
completed_count = 0
|
||||
|
||||
batch_indices = list(range(start_batch, batches))
|
||||
chunks = [
|
||||
batch_indices[i:i + PARALLEL_WORKERS]
|
||||
for i in range(0, len(batch_indices), PARALLEL_WORKERS)
|
||||
]
|
||||
|
||||
with ThreadPoolExecutor(max_workers=PARALLEL_WORKERS) as executor:
|
||||
for chunk_idx, chunk in enumerate(chunks):
|
||||
futures = {}
|
||||
|
||||
for batch_idx in chunk:
|
||||
batch_start_q = batch_idx * BATCH_SIZE
|
||||
batch_end_q = min(batch_start_q + BATCH_SIZE, total_queries)
|
||||
batch_queries = queries[batch_start_q:batch_end_q]
|
||||
scraper_url = SCRAPER_URLS[batch_idx % len(SCRAPER_URLS)]
|
||||
|
||||
print(f"\n🚀 Chunk {chunk_idx+1} | Batch {batch_idx+1}/{batches} → {scraper_url}")
|
||||
time.sleep(random.uniform(1, 2))
|
||||
|
||||
future = executor.submit(
|
||||
process_batch,
|
||||
batch_idx, batch_queries, scraper_url,
|
||||
filename, job_id, df_input
|
||||
)
|
||||
futures[future] = batch_idx
|
||||
|
||||
for future in as_completed(futures):
|
||||
batch_idx = futures[future]
|
||||
completed_count += 1
|
||||
try:
|
||||
df_filtered, df_raw = future.result()
|
||||
if df_filtered is not None:
|
||||
all_results_filtered.append(df_filtered)
|
||||
all_results_raw.append(df_raw)
|
||||
append_partial(job_id, df_filtered, df_raw)
|
||||
except Exception as e:
|
||||
print(f"💥 Batch {batch_idx+1} Exception: {e}")
|
||||
|
||||
save_progress(job_id, batch_idx, batches)
|
||||
|
||||
elapsed = time.time() - job_start_time
|
||||
if completed_count > 0:
|
||||
avg_per_batch = elapsed / completed_count
|
||||
remaining = (batches - start_batch - completed_count) * avg_per_batch / PARALLEL_WORKERS
|
||||
eta_str = format_eta(remaining)
|
||||
else:
|
||||
eta_str = "?"
|
||||
|
||||
job.status = f"🔄 {completed_count}/{batches - start_batch} fertig | ⏱️ ~{eta_str}"
|
||||
db.session.commit()
|
||||
|
||||
if chunk_idx < len(chunks) - 1:
|
||||
delay = random.uniform(BATCH_DELAY_MIN, BATCH_DELAY_MAX)
|
||||
print(f"⏸️ Chunk {chunk_idx+1} fertig – warte {delay:.1f}s...")
|
||||
time.sleep(delay)
|
||||
|
||||
# ── MERGE & SAVE ──
|
||||
job.status = "🔧 merging results"
|
||||
db.session.commit()
|
||||
|
||||
base = filename.replace('.csv', '')
|
||||
|
||||
if all_results_filtered:
|
||||
df_final_filtered = pd.concat(all_results_filtered, ignore_index=True)
|
||||
df_final_filtered = df_final_filtered.drop_duplicates(subset=['title', 'address'])
|
||||
|
||||
out_filtered = f"results_{base}_filtered.csv"
|
||||
df_final_filtered.to_csv(
|
||||
os.path.join(RESULT_FOLDER, out_filtered),
|
||||
index=False, encoding='utf-8-sig', sep=';'
|
||||
)
|
||||
|
||||
out_raw = None
|
||||
if all_results_raw:
|
||||
df_final_raw = pd.concat(all_results_raw, ignore_index=True)
|
||||
out_raw = f"results_{base}_all.csv"
|
||||
df_final_raw.to_csv(
|
||||
os.path.join(RESULT_FOLDER, out_raw),
|
||||
index=False, encoding='utf-8-sig', sep=';'
|
||||
)
|
||||
|
||||
job.result_filename = out_filtered
|
||||
job.result_filename_raw = out_raw
|
||||
job.status = f"✅ Fertig: {len(df_final_filtered)} Firmen"
|
||||
|
||||
cleanup_progress(job_id)
|
||||
else:
|
||||
job.status = "❌ Keine Ergebnisse"
|
||||
|
||||
db.session.commit()
|
||||
print(f"🎉 Job {job_id} komplett!")
|
||||
|
||||
except Exception as e:
|
||||
job.status = f"Failed: {str(e)[:50]}"
|
||||
print(f"💥 FATAL: {e}")
|
||||
import traceback
|
||||
traceback.print_exc()
|
||||
db.session.commit()
|
||||
|
||||
print(f"✅ DONE! Status: {job.status}")
|
||||
|
|
|
|||
|
|
@ -1,21 +0,0 @@
|
|||
import requests
|
||||
import time
|
||||
|
||||
base_url = "http://localhost:5001/api/v1/jobs"
|
||||
|
||||
response = requests.get(base_url)
|
||||
jobs = response.json() # Direkt Array
|
||||
print(f"{len(jobs)} Jobs gefunden.")
|
||||
|
||||
deleted = 0
|
||||
for job in jobs:
|
||||
job_id = job["ID"]
|
||||
del_res = requests.delete(f"{base_url}/{job_id}")
|
||||
if del_res.status_code in [200, 204]:
|
||||
print(f"✓ {job_id}")
|
||||
deleted += 1
|
||||
else:
|
||||
print(f"✗ {job_id}: {del_res.status_code}")
|
||||
time.sleep(0.1)
|
||||
|
||||
print(f"{deleted}/{len(jobs)} gelöscht.")
|
||||
|
|
@ -1,4 +1,4 @@
|
|||
version: '3.8'
|
||||
version: '3'
|
||||
services:
|
||||
web:
|
||||
build: .
|
||||
|
|
@ -6,114 +6,6 @@ services:
|
|||
- "5000:5000"
|
||||
environment:
|
||||
- FLASK_APP=app
|
||||
- FLASK_ENV=production
|
||||
- PYTHONUNBUFFERED=1
|
||||
command: flask run --host=0.0.0.0 --port=5000
|
||||
volumes:
|
||||
- ./app:/app/app
|
||||
- ./uploads:/app/uploads
|
||||
- ./results:/app/results
|
||||
- ./instance:/app/instance
|
||||
- /var/run/docker.sock:/var/run/docker.sock
|
||||
depends_on:
|
||||
- gmaps-scraper-1
|
||||
- gmaps-scraper-2
|
||||
- gmaps-scraper-3
|
||||
- gmaps-scraper-4
|
||||
restart: always
|
||||
networks:
|
||||
- scraper-net
|
||||
|
||||
gmaps-scraper-1:
|
||||
image: gosom/google-maps-scraper:latest
|
||||
container_name: gmaps-scraper-1
|
||||
environment:
|
||||
- PLAYWRIGHT_BROWSERS_PATH=/ms-playwright
|
||||
ports:
|
||||
- "5001:8080"
|
||||
volumes:
|
||||
- ./scraper-data-1:/gmapsdata
|
||||
command:
|
||||
- "-web"
|
||||
- "-data-folder=/gmapsdata"
|
||||
restart: always
|
||||
healthcheck:
|
||||
test: ["CMD-SHELL", "wget -qO- http://localhost:8080/api/v1/jobs || exit 1"]
|
||||
interval: 30s
|
||||
timeout: 10s
|
||||
retries: 3
|
||||
start_period: 15s
|
||||
networks:
|
||||
- scraper-net
|
||||
|
||||
|
||||
gmaps-scraper-2:
|
||||
image: gosom/google-maps-scraper:latest
|
||||
container_name: gmaps-scraper-2
|
||||
environment:
|
||||
- PLAYWRIGHT_BROWSERS_PATH=/ms-playwright
|
||||
ports:
|
||||
- "5002:8080"
|
||||
volumes:
|
||||
- ./scraper-data-2:/gmapsdata
|
||||
command:
|
||||
- "-web"
|
||||
- "-data-folder=/gmapsdata"
|
||||
restart: always
|
||||
healthcheck:
|
||||
test: ["CMD-SHELL", "wget -qO- http://localhost:8080/api/v1/jobs || exit 1"]
|
||||
interval: 30s
|
||||
timeout: 10s
|
||||
retries: 3
|
||||
start_period: 15s
|
||||
networks:
|
||||
- scraper-net
|
||||
|
||||
gmaps-scraper-3:
|
||||
image: gosom/google-maps-scraper:latest
|
||||
container_name: gmaps-scraper-3
|
||||
environment:
|
||||
- PLAYWRIGHT_BROWSERS_PATH=/ms-playwright
|
||||
ports:
|
||||
- "5003:8080"
|
||||
volumes:
|
||||
- ./scraper-data-3:/gmapsdata
|
||||
command:
|
||||
- "-web"
|
||||
- "-data-folder=/gmapsdata"
|
||||
restart: always
|
||||
healthcheck:
|
||||
test: ["CMD-SHELL", "wget -qO- http://localhost:8080/api/v1/jobs || exit 1"]
|
||||
interval: 30s
|
||||
timeout: 10s
|
||||
retries: 3
|
||||
start_period: 15s
|
||||
networks:
|
||||
- scraper-net
|
||||
|
||||
gmaps-scraper-4:
|
||||
image: gosom/google-maps-scraper:latest
|
||||
container_name: gmaps-scraper-4
|
||||
environment:
|
||||
- PLAYWRIGHT_BROWSERS_PATH=/ms-playwright
|
||||
ports:
|
||||
- "5004:8080"
|
||||
volumes:
|
||||
- ./scraper-data-4:/gmapsdata
|
||||
command:
|
||||
- "-web"
|
||||
- "-data-folder=/gmapsdata"
|
||||
restart: always
|
||||
healthcheck:
|
||||
test: ["CMD-SHELL", "wget -qO- http://localhost:8080/api/v1/jobs || exit 1"]
|
||||
interval: 30s
|
||||
timeout: 10s
|
||||
retries: 3
|
||||
start_period: 15s
|
||||
networks:
|
||||
- scraper-net
|
||||
|
||||
|
||||
|
||||
networks:
|
||||
scraper-net:
|
||||
driver: bridge
|
||||
- .:/app
|
||||
|
|
|
|||
Binary file not shown.
|
|
@ -1 +0,0 @@
|
|||
Single-database configuration for Flask.
|
||||
|
|
@ -1,50 +0,0 @@
|
|||
# A generic, single database configuration.
|
||||
|
||||
[alembic]
|
||||
# template used to generate migration files
|
||||
# file_template = %%(rev)s_%%(slug)s
|
||||
|
||||
# set to 'true' to run the environment during
|
||||
# the 'revision' command, regardless of autogenerate
|
||||
# revision_environment = false
|
||||
|
||||
|
||||
# Logging configuration
|
||||
[loggers]
|
||||
keys = root,sqlalchemy,alembic,flask_migrate
|
||||
|
||||
[handlers]
|
||||
keys = console
|
||||
|
||||
[formatters]
|
||||
keys = generic
|
||||
|
||||
[logger_root]
|
||||
level = WARN
|
||||
handlers = console
|
||||
qualname =
|
||||
|
||||
[logger_sqlalchemy]
|
||||
level = WARN
|
||||
handlers =
|
||||
qualname = sqlalchemy.engine
|
||||
|
||||
[logger_alembic]
|
||||
level = INFO
|
||||
handlers =
|
||||
qualname = alembic
|
||||
|
||||
[logger_flask_migrate]
|
||||
level = INFO
|
||||
handlers =
|
||||
qualname = flask_migrate
|
||||
|
||||
[handler_console]
|
||||
class = StreamHandler
|
||||
args = (sys.stderr,)
|
||||
level = NOTSET
|
||||
formatter = generic
|
||||
|
||||
[formatter_generic]
|
||||
format = %(levelname)-5.5s [%(name)s] %(message)s
|
||||
datefmt = %H:%M:%S
|
||||
|
|
@ -1,113 +0,0 @@
|
|||
import logging
|
||||
from logging.config import fileConfig
|
||||
|
||||
from flask import current_app
|
||||
|
||||
from alembic import context
|
||||
|
||||
# this is the Alembic Config object, which provides
|
||||
# access to the values within the .ini file in use.
|
||||
config = context.config
|
||||
|
||||
# Interpret the config file for Python logging.
|
||||
# This line sets up loggers basically.
|
||||
fileConfig(config.config_file_name)
|
||||
logger = logging.getLogger('alembic.env')
|
||||
|
||||
|
||||
def get_engine():
|
||||
try:
|
||||
# this works with Flask-SQLAlchemy<3 and Alchemical
|
||||
return current_app.extensions['migrate'].db.get_engine()
|
||||
except (TypeError, AttributeError):
|
||||
# this works with Flask-SQLAlchemy>=3
|
||||
return current_app.extensions['migrate'].db.engine
|
||||
|
||||
|
||||
def get_engine_url():
|
||||
try:
|
||||
return get_engine().url.render_as_string(hide_password=False).replace(
|
||||
'%', '%%')
|
||||
except AttributeError:
|
||||
return str(get_engine().url).replace('%', '%%')
|
||||
|
||||
|
||||
# add your model's MetaData object here
|
||||
# for 'autogenerate' support
|
||||
# from myapp import mymodel
|
||||
# target_metadata = mymodel.Base.metadata
|
||||
config.set_main_option('sqlalchemy.url', get_engine_url())
|
||||
target_db = current_app.extensions['migrate'].db
|
||||
|
||||
# other values from the config, defined by the needs of env.py,
|
||||
# can be acquired:
|
||||
# my_important_option = config.get_main_option("my_important_option")
|
||||
# ... etc.
|
||||
|
||||
|
||||
def get_metadata():
|
||||
if hasattr(target_db, 'metadatas'):
|
||||
return target_db.metadatas[None]
|
||||
return target_db.metadata
|
||||
|
||||
|
||||
def run_migrations_offline():
|
||||
"""Run migrations in 'offline' mode.
|
||||
|
||||
This configures the context with just a URL
|
||||
and not an Engine, though an Engine is acceptable
|
||||
here as well. By skipping the Engine creation
|
||||
we don't even need a DBAPI to be available.
|
||||
|
||||
Calls to context.execute() here emit the given string to the
|
||||
script output.
|
||||
|
||||
"""
|
||||
url = config.get_main_option("sqlalchemy.url")
|
||||
context.configure(
|
||||
url=url, target_metadata=get_metadata(), literal_binds=True
|
||||
)
|
||||
|
||||
with context.begin_transaction():
|
||||
context.run_migrations()
|
||||
|
||||
|
||||
def run_migrations_online():
|
||||
"""Run migrations in 'online' mode.
|
||||
|
||||
In this scenario we need to create an Engine
|
||||
and associate a connection with the context.
|
||||
|
||||
"""
|
||||
|
||||
# this callback is used to prevent an auto-migration from being generated
|
||||
# when there are no changes to the schema
|
||||
# reference: http://alembic.zzzcomputing.com/en/latest/cookbook.html
|
||||
def process_revision_directives(context, revision, directives):
|
||||
if getattr(config.cmd_opts, 'autogenerate', False):
|
||||
script = directives[0]
|
||||
if script.upgrade_ops.is_empty():
|
||||
directives[:] = []
|
||||
logger.info('No changes in schema detected.')
|
||||
|
||||
conf_args = current_app.extensions['migrate'].configure_args
|
||||
if conf_args.get("process_revision_directives") is None:
|
||||
conf_args["process_revision_directives"] = process_revision_directives
|
||||
|
||||
connectable = get_engine()
|
||||
|
||||
with connectable.connect() as connection:
|
||||
context.configure(
|
||||
connection=connection,
|
||||
target_metadata=get_metadata(),
|
||||
**conf_args
|
||||
)
|
||||
|
||||
with context.begin_transaction():
|
||||
context.run_migrations()
|
||||
|
||||
|
||||
if context.is_offline_mode():
|
||||
run_migrations_offline()
|
||||
else:
|
||||
run_migrations_online()
|
||||
|
|
@ -1,24 +0,0 @@
|
|||
"""${message}
|
||||
|
||||
Revision ID: ${up_revision}
|
||||
Revises: ${down_revision | comma,n}
|
||||
Create Date: ${create_date}
|
||||
|
||||
"""
|
||||
from alembic import op
|
||||
import sqlalchemy as sa
|
||||
${imports if imports else ""}
|
||||
|
||||
# revision identifiers, used by Alembic.
|
||||
revision = ${repr(up_revision)}
|
||||
down_revision = ${repr(down_revision)}
|
||||
branch_labels = ${repr(branch_labels)}
|
||||
depends_on = ${repr(depends_on)}
|
||||
|
||||
|
||||
def upgrade():
|
||||
${upgrades if upgrades else "pass"}
|
||||
|
||||
|
||||
def downgrade():
|
||||
${downgrades if downgrades else "pass"}
|
||||
|
|
@ -1,45 +0,0 @@
|
|||
"""Add is_admin column to User model
|
||||
|
||||
Revision ID: 10331d61a25d
|
||||
Revises:
|
||||
Create Date: 2024-11-14 08:36:27.125841
|
||||
|
||||
"""
|
||||
from alembic import op
|
||||
import sqlalchemy as sa
|
||||
|
||||
|
||||
# revision identifiers, used by Alembic.
|
||||
revision = '10331d61a25d'
|
||||
down_revision = None
|
||||
branch_labels = None
|
||||
depends_on = None
|
||||
|
||||
|
||||
def upgrade():
|
||||
# ### commands auto generated by Alembic - please adjust! ###
|
||||
op.drop_table('user')
|
||||
op.drop_table('job')
|
||||
# ### end Alembic commands ###
|
||||
|
||||
|
||||
def downgrade():
|
||||
# ### commands auto generated by Alembic - please adjust! ###
|
||||
op.create_table('job',
|
||||
sa.Column('id', sa.INTEGER(), nullable=False),
|
||||
sa.Column('user_id', sa.INTEGER(), nullable=False),
|
||||
sa.Column('filename', sa.VARCHAR(length=150), nullable=False),
|
||||
sa.Column('status', sa.VARCHAR(length=50), nullable=True),
|
||||
sa.Column('created_at', sa.DATETIME(), nullable=True),
|
||||
sa.Column('result_filename', sa.VARCHAR(length=150), nullable=True),
|
||||
sa.ForeignKeyConstraint(['user_id'], ['user.id'], ),
|
||||
sa.PrimaryKeyConstraint('id')
|
||||
)
|
||||
op.create_table('user',
|
||||
sa.Column('id', sa.INTEGER(), nullable=False),
|
||||
sa.Column('username', sa.VARCHAR(length=150), nullable=False),
|
||||
sa.Column('password', sa.VARCHAR(length=150), nullable=False),
|
||||
sa.PrimaryKeyConstraint('id'),
|
||||
sa.UniqueConstraint('username')
|
||||
)
|
||||
# ### end Alembic commands ###
|
||||
|
|
@ -1,8 +1,7 @@
|
|||
flask
|
||||
flask-sqlalchemy
|
||||
flask-login
|
||||
flask-migrate
|
||||
Flask==2.2.5
|
||||
Flask-Login==0.6.2
|
||||
Flask-SQLAlchemy==3.0.3
|
||||
Werkzeug==2.2.2
|
||||
pandas
|
||||
requests
|
||||
werkzeug
|
||||
docker
|
||||
beautifulsoup4
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue