Initial commit

2026-03-10 11:33:18 +01:00 · 2026-03-10 11:33:18 +01:00 · df8c2313a9
commit df8c2313a9
parent 387bc056b9
275 changed files with 12939 additions and 263 deletions
--- a/6
+++ b/6
@ -7,12 +7,14 @@ WORKDIR /app
 # Abhängigkeiten installieren
 COPY requirements.txt requirements.txt
 RUN pip install --no-cache-dir -r requirements.txt
 RUN apt update
 RUN apt install curl -y
 # App-Dateien kopieren
 COPY . .
 # Flask Umgebungsvariable setzen
 ENV FLASK_APP=app
 ENV FLASK_ENV=production
-# Flask starten
+EXPOSE 5000
 CMD ["flask", "run", "--host=0.0.0.0", "--port=5000"]
--- a/app/init.py
+++ b/app/init.py
@ -1,56 +1,88 @@
 import os
-from flask import Flask, redirect, url_for, request
+from flask import Flask, redirect, url_for, request, current_app
 from flask_sqlalchemy import SQLAlchemy
 from flask_login import LoginManager, current_user
 from flask_migrate import Migrate
 from sqlalchemy import text
-# Konfiguration für Upload- und Ergebnis-Ordner
+# ✅ Docker-Pfade
 UPLOAD_FOLDER = '/app/uploads'
 RESULT_FOLDER = '/app/results'
 db = SQLAlchemy()
 login_manager = LoginManager()
 migrate = Migrate()
 def _run_migrations(app):
    """Add missing database columns so existing databases survive upgrades.

    Each entry of ``migrations`` is ``(table, column, column_type)``. A column
    is only added when the database inspector does not already report it, so
    the expected "column already exists" case no longer relies on swallowing
    an exception. The function stays best-effort: a migration that fails is
    rolled back and reported, and the remaining migrations still run.

    Args:
        app: Flask application whose application context is used for the
            database access.
    """
    # Local import: only this function needs the inspector.
    from sqlalchemy import inspect

    migrations = [
        ("job",  "result_filename_raw", "VARCHAR(150)"),
        ("job",  "scraper_job_id",      "VARCHAR(255)"),
        ("user", "is_admin",            "BOOLEAN DEFAULT 0"),
    ]
    with app.app_context():
        for table, column, col_type in migrations:
            try:
                # A fresh inspector per step avoids reading cached column lists
                # after a previous ALTER TABLE on the same table.
                existing = {col["name"] for col in inspect(db.engine).get_columns(table)}
                if column in existing:
                    continue
                db.session.execute(text(f"ALTER TABLE {table} ADD COLUMN {column} {col_type}"))
                db.session.commit()
                print(f"✅ Migration: {table}.{column} hinzugefügt")
            except Exception as exc:
                # Keep start-up alive, but make the failure visible.
                db.session.rollback()
                print(f"⚠️ Migration {table}.{column} fehlgeschlagen: {exc}")
 def create_app():
    app = Flask(__name__)
    # 🔑 Configs
    app.config['SECRET_KEY'] = '008e7369b075886d5f494c8813efdfb17155da6af12b3fe8ee'
    app.config['SQLALCHEMY_DATABASE_URI'] = 'sqlite:///users.db'
    app.config['SQLALCHEMY_TRACK_MODIFICATIONS'] = False
    app.config['UPLOAD_FOLDER'] = UPLOAD_FOLDER
    app.config['RESULT_FOLDER'] = RESULT_FOLDER
    app.config['ALLOW_USER_SIGNUP'] = False
    # DB + Tools
    db.init_app(app)
    migrate.init_app(app, db)
    # Flask-Login Setup
    login_manager = LoginManager()
    login_manager.login_view = 'auth.login'
    login_manager.init_app(app)
    login_manager.login_view = 'auth.login'
    # User Loader
    @login_manager.user_loader
    def load_user(user_id):
        from .models import User
        return User.query.get(int(user_id))
-    # Umleitung nicht authentifizierter Benutzer, statische Dateien und bestimmte Routen ausnehmen
+    # Protected Routes
    @app.before_request
    def require_login():
-        allowed_routes = ['auth.login', 'auth.signup']
+        allowed = ['auth.login', 'auth.signup', 'static']
-        if (not current_user.is_authenticated
+        if (not current_user.is_authenticated and
-                and request.endpoint not in allowed_routes
+            request.endpoint not in allowed and
-                and not request.path.startswith('/static/')):
+            not request.path.startswith('/static')):
            return redirect(url_for('auth.login'))
-    # Erstellen Sie die Ordner, falls sie noch nicht existieren
+    # Ordner
    os.makedirs(app.config['UPLOAD_FOLDER'], exist_ok=True)
    os.makedirs(app.config['RESULT_FOLDER'], exist_ok=True)
-    # Registrieren der Routen
+    # Routes
    from . import routes
    app.register_blueprint(routes.bp)
-    # Erstellen der Tabellen in der Datenbank
+    # Index Redirect
    @app.route('/')
    def index():
        return redirect(url_for('auth.job_status'))
    # DB Tables + Auto-Migration
    with app.app_context():
        db.create_all()
        _run_migrations(app)
    return app
 if __name__ == '__main__':
    app = create_app()
    app.run(host='0.0.0.0', port=5000, debug=False)
--- a/app/pycache/init.cpython-310.pyc
+++ b/app/pycache/init.cpython-310.pyc
--- a/app/pycache/models.cpython-310.pyc
+++ b/app/pycache/models.cpython-310.pyc
--- a/app/pycache/routes.cpython-310.pyc
+++ b/app/pycache/routes.cpython-310.pyc
--- a/app/pycache/webcrawler.cpython-310.pyc
+++ b/app/pycache/webcrawler.cpython-310.pyc
--- a/app/init.py.bak
+++ b/app/init.py.bak
@ -0,0 +1,68 @@
 import os
 from flask import Flask, redirect, url_for, request, current_app
 from flask_sqlalchemy import SQLAlchemy
 from flask_login import LoginManager, current_user
 from flask_migrate import Migrate
 # ✅ Docker-Pfade
 UPLOAD_FOLDER = '/app/uploads'
 RESULT_FOLDER = '/app/results'
 db = SQLAlchemy()
 login_manager = LoginManager()
 migrate = Migrate()
 def create_app():
    """Build and configure the Flask application.

    Sets the configuration values, initialises the database, migration and
    login extensions, installs a ``before_request`` hook that redirects
    unauthenticated visitors to the login page, creates the upload/result
    folders, registers the routes blueprint and the index redirect, and
    creates the database tables.

    Returns:
        The configured Flask application.
    """
    app = Flask(__name__)
    # Configuration. The secret key can be overridden through the SECRET_KEY
    # environment variable; the previous literal stays as the fallback so
    # existing deployments and their sessions keep working.
    app.config['SECRET_KEY'] = os.environ.get(
        'SECRET_KEY', '008e7369b075886d5f494c8813efdfb17155da6af12b3fe8ee'
    )
    app.config['SQLALCHEMY_DATABASE_URI'] = 'sqlite:///users.db'
    app.config['SQLALCHEMY_TRACK_MODIFICATIONS'] = False
    app.config['UPLOAD_FOLDER'] = UPLOAD_FOLDER
    app.config['RESULT_FOLDER'] = RESULT_FOLDER
    app.config['ALLOW_USER_SIGNUP'] = True
    # DB + tools
    db.init_app(app)
    migrate.init_app(app, db)
    login_manager.init_app(app)
    login_manager.login_view = 'auth.login'
    # User loader
    @login_manager.user_loader
    def load_user(user_id):
        """Load the user for the id stored in the session."""
        from .models import User
        return User.query.get(int(user_id))
    # Protected routes
    @app.before_request
    def require_login():
        """Redirect unauthenticated requests to the login page."""
        allowed = ['auth.login', 'auth.signup', 'static']
        # Match '/static/' with the trailing slash so unrelated paths that
        # merely begin with "/static" are not exempted from the login check.
        if (not current_user.is_authenticated and
            request.endpoint not in allowed and
            not request.path.startswith('/static/')):
            return redirect(url_for('auth.login'))
    # Folders
    os.makedirs(app.config['UPLOAD_FOLDER'], exist_ok=True)
    os.makedirs(app.config['RESULT_FOLDER'], exist_ok=True)
    # Routes
    from . import routes
    app.register_blueprint(routes.bp)
    # Index redirect
    @app.route('/')
    def index():
        """Send the root URL to the job overview."""
        return redirect(url_for('auth.job_status'))
    # DB tables
    with app.app_context():
        db.create_all()
    return app
 if __name__ == '__main__':
    app = create_app()
    app.run(host='0.0.0.0', port=5000, debug=False)
--- a/app/models.py
+++ b/app/models.py
@ -15,5 +15,11 @@ class Job(db.Model):
    status = db.Column(db.String(50), default="Pending")
    created_at = db.Column(db.DateTime, default=datetime.utcnow)
    result_filename = db.Column(db.String(150), nullable=True)
    result_filename_raw = db.Column(db.String(150), nullable=True)
    user = db.relationship('User', backref=db.backref('jobs', lazy=True))
 class AppConfig(db.Model):
    """Key/value settings row stored in the database."""
    # Integer primary key.
    id    = db.Column(db.Integer, primary_key=True)
    # Setting name; required and unique.
    key   = db.Column(db.String(100), unique=True, nullable=False)
    # Setting value stored as a string; required, defaults to 'false'.
    value = db.Column(db.String(100), nullable=False, default='false')
--- a/app/routes.orig
+++ b/app/routes.orig
@ -0,0 +1,223 @@
 import time
 import csv
 import os
 import threading
 from flask import Blueprint, request, redirect, url_for, flash, render_template, send_file, current_app
 from flask_login import login_user, logout_user, login_required, current_user
 from werkzeug.utils import secure_filename
 from werkzeug.security import generate_password_hash, check_password_hash
 from .models import db, User, Job
 from .webcrawler import process_file  # Importiere die Funktion für das Webscraping
 UPLOAD_FOLDER = 'uploads'
 RESULT_FOLDER = 'results'
 # Blueprint für auth erstellen
 bp = Blueprint('auth', __name__)
@bp.route('/login', methods=['GET', 'POST'])
 def login():
    if request.method == 'POST':
        username = request.form['username']
        password = request.form['password']
        user = User.query.filter_by(username=username).first()
        if user and check_password_hash(user.password, password):
            login_user(user)
            return redirect(url_for('auth.job_status'))
        flash('Login fehlgeschlagen. Überprüfen Sie Benutzername und Passwort.')
    return render_template('login.html')
@bp.route('/signup', methods=['GET', 'POST'])
 def signup():
    if not current_app.config['ALLOW_USER_SIGNUP']:
        flash("Registrierung ist derzeit deaktiviert.")
        return redirect(url_for('auth.login'))
    if request.method == 'POST':
        username = request.form['username']
        password = generate_password_hash(request.form['password'], method='sha256')
        new_user = User(username=username, password=password)
        db.session.add(new_user)
        db.session.commit()
        flash('Benutzer erfolgreich erstellt! Sie können sich jetzt einloggen.')
        return redirect(url_for('auth.login'))
    return render_template('signup.html')
@bp.route('/logout')
@login_required
 def logout():
    logout_user()
    return redirect(url_for('auth.login'))
@bp.route('/jobs')
@login_required
 def job_status():
    jobs = Job.query.filter_by(user_id=current_user.id).all()
    return render_template('jobs.html', jobs=jobs)
@bp.route('/upload', methods=['GET', 'POST'])
@login_required
 def upload():
    if request.method == 'POST':
        file = request.files['file']
        filename = secure_filename(file.filename)
        # Überprüfen, ob eine Datei mit dem gleichen Namen bereits existiert
        file_path = os.path.join(current_app.config['UPLOAD_FOLDER'], filename)
        if os.path.exists(file_path):
            # Wenn eine Datei mit dem gleichen Namen existiert, einen Zeitstempel hinzufügen
            name, ext = os.path.splitext(filename)
            timestamp = time.strftime("%Y%m%d-%H%M%S")  # Zeitstempel im Format JahrMonatTag-StundenMinutenSekunden
            filename = f"{name}_{timestamp}{ext}"
            file_path = os.path.join(current_app.config['UPLOAD_FOLDER'], filename)
            flash(f"Eine Datei mit gleichem Namen existierte bereits. Die Datei wurde als '{filename}' gespeichert.")
        # Speichern der Datei
        file.save(file_path)
        flash('Datei erfolgreich hochgeladen und Job gestartet')
        # Neuen Job erstellen
        new_job = Job(user_id=current_user.id, filename=filename, status="Pending")
        db.session.add(new_job)
        db.session.commit()
        # Debugging-Ausgabe zur Überprüfung der Thread-Erstellung
        print(f"Starte Scraping-Thread für Job-ID: {new_job.id}")
        # Starten des Scraping im Hintergrund-Thread und Übergeben des aktuellen Anwendungskontexts
        thread = threading.Thread(target=process_file, args=(filename, new_job.id, current_app._get_current_object()))
        thread.start()
        # Debugging-Ausgabe, nachdem der Thread gestartet wurde
        print(f"Thread für Job {new_job.id} erfolgreich gestartet.")
        return redirect(url_for('auth.job_status'))
    return render_template('upload.html')
@bp.route('/download/<int:job_id>', methods=['GET'])
@login_required
 def download_result(job_id):
    job = Job.query.get_or_404(job_id)
    print(f"Job ID: {job.id} - User ID: {job.user_id} - Current User ID: {current_user.id}")
    # Überprüfen, ob der Job dem aktuellen Benutzer gehört
    if job.user_id != current_user.id:
        flash("Sie haben keine Berechtigung, dieses Ergebnis herunterzuladen.")
        return redirect(url_for('auth.job_status'))
    # Überprüfen, ob das Ergebnis vorhanden ist
    if not job.result_filename:
        flash("Das Ergebnis ist noch nicht verfügbar.")
        return redirect(url_for('auth.job_status'))
    # Überprüfen, ob die Datei im angegebenen Pfad existiert
    result_path = os.path.join(current_app.config['RESULT_FOLDER'], job.result_filename)
    print(f"Versuche, Datei herunterzuladen von: {result_path}")
    if os.path.exists(result_path):
        print("Datei existiert und wird zum Download bereitgestellt.")
        return send_file(result_path, as_attachment=True)
    else:
        print("Datei nicht gefunden. Ergebnisverzeichnis oder Pfad prüfen.")
        flash("Ergebnisdatei nicht gefunden.")
        return redirect(url_for('auth.job_status'))
@bp.route('/delete_job/<int:job_id>', methods=['POST'])
@login_required
 def delete_job(job_id):
    job = Job.query.get_or_404(job_id)
    if job.user_id != current_user.id:
        flash("Sie haben keine Berechtigung, diesen Job zu löschen.")
        return redirect(url_for('auth.job_status'))
    # Löschen der Upload-Datei
    upload_path = os.path.join(current_app.config['UPLOAD_FOLDER'], job.filename)
    if os.path.exists(upload_path):
        os.remove(upload_path)
        print(f"Upload-Datei gelöscht: {upload_path}")
    else:
        print(f"Upload-Datei nicht gefunden: {upload_path}")
    # Löschen der Results-Datei, falls vorhanden
    if job.result_filename:
        result_path = os.path.join(current_app.config['RESULT_FOLDER'], job.result_filename)
        print(f"Versuche Ergebnisdatei zu löschen: {result_path}")
        if os.path.exists(result_path):
            try:
                os.remove(result_path)
                print(f"Ergebnisdatei gelöscht: {result_path}")
            except Exception as e:
                print(f"Fehler beim Löschen der Ergebnisdatei: {e}")
        else:
            print(f"Ergebnisdatei nicht gefunden im Pfad: {result_path}")
    # Job aus der Datenbank löschen
    db.session.delete(job)
    db.session.commit()
    flash("Job erfolgreich gelöscht.")
    return redirect(url_for('auth.job_status'))
@bp.route('/admin', methods=['GET'])
@login_required
 def admin_panel():
    if not current_user.is_admin:
        flash("Keine Berechtigung.")
        return redirect(url_for('auth.job_status'))
    users = User.query.all()
    return render_template('admin_panel.html', users=users)
@bp.route('/admin/create_user', methods=['POST'])
@login_required
 def create_user():
    if not current_user.is_admin:
        flash("Keine Berechtigung.")
        return redirect(url_for('auth.admin_panel'))
    username = request.form['username']
    password = request.form['password']
    is_admin = 'is_admin' in request.form  # Checkbox für Adminrechte
    hashed_password = generate_password_hash(password, method='sha256')
    new_user = User(username=username, password=hashed_password, is_admin=is_admin)
    db.session.add(new_user)
    db.session.commit()
    flash(f"Benutzer {username} wurde erstellt.")
    return redirect(url_for('auth.admin_panel'))
@bp.route('/admin/reset_password/<int:user_id>', methods=['POST'])
@login_required
 def reset_password(user_id):
    if not current_user.is_admin:
        flash("Keine Berechtigung.")
        return redirect(url_for('auth.admin_panel'))
    user = User.query.get_or_404(user_id)
    new_password = request.form['new_password']
    user.password = generate_password_hash(new_password, method='sha256')
    db.session.commit()
    flash(f"Passwort für Benutzer {user.username} wurde zurückgesetzt.")
    return redirect(url_for('auth.admin_panel'))
@bp.route('/admin/delete_user/<int:user_id>', methods=['POST'])
@login_required
 def delete_user(user_id):
    if not current_user.is_admin:
        flash("Keine Berechtigung.")
        return redirect(url_for('auth.admin_panel'))
    user = User.query.get_or_404(user_id)
    if user.is_admin:
        flash("Administratoren können nicht gelöscht werden.")
        return redirect(url_for('auth.admin_panel'))
    db.session.delete(user)
    db.session.commit()
    flash(f"Benutzer {user.username} wurde gelöscht.")
    return redirect(url_for('auth.admin_panel'))
--- a/app/routes.py
+++ b/app/routes.py
@ -1,18 +1,16 @@
 import time
 import csv
 import os
 import threading
-from flask import Blueprint, request, redirect, url_for, flash, render_template, send_file, current_app
+from flask import Blueprint, request, redirect, url_for, flash, render_template, send_file, jsonify, current_app
 from flask_login import login_user, logout_user, login_required, current_user
 from werkzeug.utils import secure_filename
 from werkzeug.security import generate_password_hash, check_password_hash
-from .models import db, User, Job
+from .models import db, User, Job, AppConfig
-from .webcrawler import process_file  # Importiere die Funktion für das Webscraping
+from .webcrawler import process_file
-UPLOAD_FOLDER = 'uploads'
+UPLOAD_FOLDER = '/app/uploads'
-RESULT_FOLDER = 'results'
+RESULT_FOLDER = '/app/results'
 # Blueprint für auth erstellen
 bp = Blueprint('auth', __name__)
@bp.route('/login', methods=['GET', 'POST'])
@ -29,19 +27,19 @@ def login():
@bp.route('/signup', methods=['GET', 'POST'])
 def signup():
-    if not current_app.config['ALLOW_USER_SIGNUP']:
+    cfg = AppConfig.query.filter_by(key='allow_signup').first()
    if not cfg or cfg.value != 'true':
        flash("Registrierung ist derzeit deaktiviert.")
        return redirect(url_for('auth.login'))
    if request.method == 'POST':
        username = request.form['username']
-        password = generate_password_hash(request.form['password'], method='sha256')
+        password = generate_password_hash(request.form['password'])  # ✅ Fix
        new_user = User(username=username, password=password)
        db.session.add(new_user)
        db.session.commit()
-        flash('Benutzer erfolgreich erstellt! Sie können sich jetzt einloggen.')
+        flash('Benutzer erfolgreich erstellt!')
        return redirect(url_for('auth.login'))
    return render_template('signup.html')
@bp.route('/logout')
@ -53,171 +51,203 @@ def logout():
@bp.route('/jobs')
@login_required
 def job_status():
-    jobs = Job.query.filter_by(user_id=current_user.id).all()
+    jobs = Job.query.filter_by(user_id=current_user.id).order_by(Job.created_at.desc()).all()
    return render_template('jobs.html', jobs=jobs)
@bp.route('/upload', methods=['GET', 'POST'])
@login_required
 def upload():
    if request.method == 'POST':
-        file = request.files['file']
+        if 'file' not in request.files:
-        filename = secure_filename(file.filename)
+            flash('Keine Datei ausgewählt.')
-        
+            return redirect(url_for('auth.upload'))
        # Überprüfen, ob eine Datei mit dem gleichen Namen bereits existiert
        file_path = os.path.join(current_app.config['UPLOAD_FOLDER'], filename)
        if os.path.exists(file_path):
            # Wenn eine Datei mit dem gleichen Namen existiert, einen Zeitstempel hinzufügen
            name, ext = os.path.splitext(filename)
            timestamp = time.strftime("%Y%m%d-%H%M%S")  # Zeitstempel im Format JahrMonatTag-StundenMinutenSekunden
            filename = f"{name}_{timestamp}{ext}"
            file_path = os.path.join(current_app.config['UPLOAD_FOLDER'], filename)
            flash(f"Eine Datei mit gleichem Namen existierte bereits. Die Datei wurde als '{filename}' gespeichert.")
        # Speichern der Datei
        file.save(file_path)
        flash('Datei erfolgreich hochgeladen und Job gestartet')
-        # Neuen Job erstellen
+        file = request.files['file']
-        new_job = Job(user_id=current_user.id, filename=filename, status="Pending")
+        if not file or file.filename == '':
            flash('Keine gültige Datei.')
            return redirect(url_for('auth.upload'))
        filename = secure_filename(file.filename)
        name, ext = os.path.splitext(filename)
        timestamp = time.strftime("%Y%m%d_%H%M%S")
        unique_filename = f"{name}_{timestamp}{ext}" if os.path.exists(os.path.join(UPLOAD_FOLDER, filename)) else filename
        filepath = os.path.join(UPLOAD_FOLDER, unique_filename)
        file.save(filepath)
        print(f"💾 UPLOAD: {filepath}")
        new_job = Job(
            user_id=current_user.id,
            filename=unique_filename,
            status="Pending"
        )
        db.session.add(new_job)
        db.session.commit()
        print(f"🆕 JOB #{new_job.id} für User {current_user.id}")
-        # Debugging-Ausgabe zur Überprüfung der Thread-Erstellung
+        thread = threading.Thread(
-        print(f"Starte Scraping-Thread für Job-ID: {new_job.id}")
+            target=process_file,
-
+            args=(unique_filename, new_job.id, current_app._get_current_object())
-        # Starten des Scraping im Hintergrund-Thread und Übergeben des aktuellen Anwendungskontexts
+        )
-        thread = threading.Thread(target=process_file, args=(filename, new_job.id, current_app._get_current_object()))
+        thread.daemon = True
        thread.start()
        print(f"🔄 THREAD STARTED Job {new_job.id}")
-        # Debugging-Ausgabe, nachdem der Thread gestartet wurde
+        flash(f'"{unique_filename}" → Job #{new_job.id} läuft!')
        print(f"Thread für Job {new_job.id} erfolgreich gestartet.")
        return redirect(url_for('auth.job_status'))
    return render_template('upload.html')
-@bp.route('/download/<int:job_id>', methods=['GET'])
+@bp.route('/download/<int:job_id>')
@login_required
 def download_result(job_id):
-    job = Job.query.get_or_404(job_id)
+    job = Job.query.filter_by(id=job_id, user_id=current_user.id).first_or_404()
    print(f"Job ID: {job.id} - User ID: {job.user_id} - Current User ID: {current_user.id}")
-    # Überprüfen, ob der Job dem aktuellen Benutzer gehört
+    if not job.result_filename or not job.status.startswith('✅'):
-    if job.user_id != current_user.id:
+        flash('Ergebnis nicht bereit.')
        flash("Sie haben keine Berechtigung, dieses Ergebnis herunterzuladen.")
        return redirect(url_for('auth.job_status'))
-    # Überprüfen, ob das Ergebnis vorhanden ist
+    result_path = os.path.join(RESULT_FOLDER, job.result_filename)
    if not job.result_filename:
        flash("Das Ergebnis ist noch nicht verfügbar.")
        return redirect(url_for('auth.job_status'))
    # Überprüfen, ob die Datei im angegebenen Pfad existiert
    result_path = os.path.join(current_app.config['RESULT_FOLDER'], job.result_filename)
    print(f"Versuche, Datei herunterzuladen von: {result_path}")
    if os.path.exists(result_path):
        print("Datei existiert und wird zum Download bereitgestellt.")
        return send_file(result_path, as_attachment=True)
-    else:
+    flash('Datei fehlt.')
-        print("Datei nicht gefunden. Ergebnisverzeichnis oder Pfad prüfen.")
+    return redirect(url_for('auth.job_status'))
-        flash("Ergebnisdatei nicht gefunden.")
+
@bp.route('/download_raw/<int:job_id>')
@login_required
 def download_result_raw(job_id):
    job = Job.query.filter_by(id=job_id, user_id=current_user.id).first_or_404()
    if not job.result_filename_raw:
        flash('Rohdaten nicht verfügbar.')
        return redirect(url_for('auth.job_status'))
    result_path = os.path.join(RESULT_FOLDER, job.result_filename_raw)
    if os.path.exists(result_path):
        return send_file(result_path, as_attachment=True)
    flash('Datei fehlt.')
    return redirect(url_for('auth.job_status'))
@bp.route('/delete_job/<int:job_id>', methods=['POST'])
@login_required
 def delete_job(job_id):
-    job = Job.query.get_or_404(job_id)
+    job = Job.query.filter_by(id=job_id, user_id=current_user.id).first_or_404()
    if job.user_id != current_user.id:
        flash("Sie haben keine Berechtigung, diesen Job zu löschen.")
        return redirect(url_for('auth.job_status'))
-    # Löschen der Upload-Datei
+    upload_path = os.path.join(UPLOAD_FOLDER, job.filename)
    upload_path = os.path.join(current_app.config['UPLOAD_FOLDER'], job.filename)
    if os.path.exists(upload_path):
        os.remove(upload_path)
        print(f"Upload-Datei gelöscht: {upload_path}")
    else:
        print(f"Upload-Datei nicht gefunden: {upload_path}")
    # Löschen der Results-Datei, falls vorhanden
    if job.result_filename:
-        result_path = os.path.join(current_app.config['RESULT_FOLDER'], job.result_filename)
+        result_path = os.path.join(RESULT_FOLDER, job.result_filename)
        print(f"Versuche Ergebnisdatei zu löschen: {result_path}")
        if os.path.exists(result_path):
-            try:
+            os.remove(result_path)
-                os.remove(result_path)
+
-                print(f"Ergebnisdatei gelöscht: {result_path}")
+    if job.result_filename_raw:                                              # ✅ Raw auch löschen
-            except Exception as e:
+        raw_path = os.path.join(RESULT_FOLDER, job.result_filename_raw)
-                print(f"Fehler beim Löschen der Ergebnisdatei: {e}")
+        if os.path.exists(raw_path):
-        else:
+            os.remove(raw_path)
            print(f"Ergebnisdatei nicht gefunden im Pfad: {result_path}")
    # Job aus der Datenbank löschen
    db.session.delete(job)
    db.session.commit()
-    flash("Job erfolgreich gelöscht.")
+    flash('Job gelöscht.')
    return redirect(url_for('auth.job_status'))
@bp.route('/job_status/<int:job_id>')
@login_required
 def job_status_api(job_id):
    job = Job.query.filter_by(id=job_id, user_id=current_user.id).first()
    if not job:
        return jsonify({'error': 'Not found'}), 404
    return jsonify({
        'id':                  job.id,
        'status':              job.status,
        'result_filename':     job.result_filename,
        'result_filename_raw': getattr(job, 'result_filename_raw', None),
        'scraper_job_id':      getattr(job, 'scraper_job_id', None)
    })
@bp.route('/resume_job/<int:job_id>', methods=['POST'])
@login_required
 def resume_job(job_id):
    job = Job.query.filter_by(id=job_id, user_id=current_user.id).first_or_404()
    thread = threading.Thread(
        target=process_file,
        args=(job.filename, job.id, current_app._get_current_object())
    )
    thread.daemon = True
    thread.start()
    flash(f'Job #{job_id} wird fortgesetzt...')
    return redirect(url_for('auth.job_status'))
 # ── ADMIN ──────────────────────────────────────────
@bp.route('/admin', methods=['GET'])
@login_required
 def admin_panel():
    if not current_user.is_admin:
        flash("Keine Berechtigung.")
        return redirect(url_for('auth.job_status'))
    users = User.query.all()
-    return render_template('admin_panel.html', users=users)
+    cfg = AppConfig.query.filter_by(key='allow_signup').first()
    signup_allowed = cfg and cfg.value == 'true'
    return render_template('admin_panel.html', users=users, signup_allowed=signup_allowed)
@bp.route('/admin/create_user', methods=['POST'])
@login_required
 def create_user():
    if not current_user.is_admin:
        flash("Keine Berechtigung.")
        return redirect(url_for('auth.admin_panel'))
    username = request.form['username']
-    password = request.form['password']
+    password = generate_password_hash(request.form['password'])              # ✅ Fix
-    is_admin = 'is_admin' in request.form  # Checkbox für Adminrechte
+    is_admin = 'is_admin' in request.form
-
+    new_user = User(username=username, password=password, is_admin=is_admin)
    hashed_password = generate_password_hash(password, method='sha256')
    new_user = User(username=username, password=hashed_password, is_admin=is_admin)
    db.session.add(new_user)
    db.session.commit()
-
+    flash(f'{username} erstellt.')
    flash(f"Benutzer {username} wurde erstellt.")
    return redirect(url_for('auth.admin_panel'))
@bp.route('/admin/reset_password/<int:user_id>', methods=['POST'])
@login_required
 def reset_password(user_id):
    if not current_user.is_admin:
        flash("Keine Berechtigung.")
        return redirect(url_for('auth.admin_panel'))
    user = User.query.get_or_404(user_id)
    new_password = request.form['new_password']
-    user.password = generate_password_hash(new_password, method='sha256')
+    user.password = generate_password_hash(new_password)                     # ✅ Fix
    db.session.commit()
-
+    flash(f'Passwort {user.username} zurückgesetzt.')
    flash(f"Passwort für Benutzer {user.username} wurde zurückgesetzt.")
    return redirect(url_for('auth.admin_panel'))
@bp.route('/admin/delete_user/<int:user_id>', methods=['POST'])
@login_required
 def delete_user(user_id):
    if not current_user.is_admin:
        flash("Keine Berechtigung.")
        return redirect(url_for('auth.admin_panel'))
    user = User.query.get_or_404(user_id)
    if user.is_admin:
-        flash("Administratoren können nicht gelöscht werden.")
+        flash('Admin nicht löschbar.')
        return redirect(url_for('auth.admin_panel'))
    db.session.delete(user)
    db.session.commit()
-    flash(f"Benutzer {user.username} wurde gelöscht.")
+    flash(f'{user.username} gelöscht.')
    return redirect(url_for('auth.admin_panel'))
@bp.route('/admin/toggle_signup', methods=['POST'])
@login_required
 def toggle_signup():
    if not current_user.is_admin:
        return redirect(url_for('auth.admin_panel'))
    cfg = AppConfig.query.filter_by(key='allow_signup').first()
    if not cfg:
        cfg = AppConfig(key='allow_signup', value='true')
        db.session.add(cfg)
    else:
        cfg.value = 'false' if cfg.value == 'true' else 'true'
    db.session.commit()
    state = '✅ aktiviert' if cfg.value == 'true' else '🔒 deaktiviert'
    flash(f'Registrierung {state}.')
    return redirect(url_for('auth.admin_panel'))
--- a/app/templates/admin_panel.html
+++ b/app/templates/admin_panel.html
@ -47,4 +47,32 @@
        <button type="submit" class="create-btn">Benutzer erstellen</button>
    </form>
 </div>
 <div class="config-box">
    <h3>⚙️ Einstellungen</h3>
    <form action="{{ url_for('auth.toggle_signup') }}" method="POST">
        <div class="toggle-row">
            <span>Benutzer-Registrierung:</span>
            {% if signup_allowed %}
                <span class="badge badge-green">✅ Aktiv</span>
                <button type="submit" class="btn-danger">🔒 Deaktivieren</button>
            {% else %}
                <span class="badge badge-red">🔒 Deaktiviert</span>
                <button type="submit" class="btn-success">✅ Aktivieren</button>
            {% endif %}
        </div>
    </form>
 </div>
 <style>
 .config-box      { background: #f8f9fa; border: 1px solid #ddd; border-radius: 6px; padding: 16px; margin-bottom: 24px; }
 .toggle-row      { display: flex; align-items: center; gap: 12px; }
 .badge           { padding: 3px 10px; border-radius: 12px; font-size: 0.85em; font-weight: bold; }
 .badge-green     { background: #d4edda; color: #155724; }
 .badge-red       { background: #f8d7da; color: #721c24; }
 .btn-danger      { background: #e74c3c; color: white; border: none; padding: 6px 14px; border-radius: 4px; cursor: pointer; }
 .btn-success     { background: #27ae60; color: white; border: none; padding: 6px 14px; border-radius: 4px; cursor: pointer; }
 .btn-danger:hover  { background: #c0392b; }
 .btn-success:hover { background: #1e8449; }
 </style>
 {% endblock %}
--- a/app/templates/jobs.bck
+++ b/app/templates/jobs.bck
@ -0,0 +1,121 @@
 {% extends "base.html" %}
 {% block content %}
 <div class="table-container">
    <h2>Ihre Aufträge</h2>
    <table id="jobs-table">
        <thead>
            <tr>
                <th>Dateiname</th>
                <th>Status</th>
                <th>Erstellt am</th>
                <th>Ergebnis</th>
                <th>Aktionen</th>
            </tr>
        </thead>
        <tbody>
            {% for job in jobs %}
            <tr id="job-row-{{ job.id }}">
                <td>{{ job.filename }}</td>
                <td id="status-{{ job.id }}" class="job-status">{{ job.status }}</td>
                <td>{{ job.created_at.strftime('%Y-%m-%d %H:%M:%S') }}</td>
                <td id="result-{{ job.id }}">
                    {% if job.result_filename and 'Failed' not in job.status %}
                        <a href="{{ url_for('auth.download_result', job_id=job.id) }}" class="dl-btn">
                            🎯 Gefiltert
                        </a>
                        {% if job.result_filename_raw %}
                        &nbsp;
                        <a href="{{ url_for('auth.download_result_raw', job_id=job.id) }}" class="dl-btn dl-btn-raw">
                            📋 Alle
                        </a>
                        {% endif %}
                    {% elif 'Failed' in job.status %}
                        <span class="status-failed">❌ {{ job.result_filename or 'Fehler' }}</span>
                    {% else %}
                        <span class="status-pending">⏳ Noch nicht verfügbar</span>
                    {% endif %}
                </td>
                <td>
                    <form action="{{ url_for('auth.delete_job', job_id=job.id) }}" method="POST" style="display:inline;">
                        <button type="submit" class="delete-btn">🗑️ Löschen</button>
                    </form>
                </td>
            </tr>
            {% endfor %}
        </tbody>
    </table>
 </div>
 <style>
 .job-status    { font-weight: bold; }
 .status-failed  { color: #e74c3c; font-weight: bold; }
 .status-pending { color: #888; }
 .status-completed { color: #27ae60; }
 .dl-btn {
    display: inline-block;
    padding: 4px 10px;
    border-radius: 4px;
    text-decoration: none;
    font-size: 0.85em;
    font-weight: bold;
    background: #27ae60;
    color: #fff;
    margin: 2px 1px;
    transition: background 0.2s;
 }
 .dl-btn:hover       { background: #1e8449; }
 .dl-btn-raw         { background: #2980b9; }
 .dl-btn-raw:hover   { background: #1a5e8a; }
 </style>
 <script>
 document.addEventListener('DOMContentLoaded', () => {
    // Once the page is ready, start polling every job whose rendered
    // status is not terminal yet (neither "✅" nor "Failed").
    for (const cell of document.querySelectorAll('.job-status')) {
        // Cell ids have the form "status-<jobId>".
        const [, jobId] = cell.id.split('-');
        const status = cell.textContent.trim();
        const isTerminal = status.includes('✅') || status.includes('Failed');
        if (!isTerminal) {
            pollJob(jobId);
        }
    }
 });
 /**
  * Render the "Ergebnis" cell of a job row from a job-status payload.
  *
  * - finished job with a result file: download link(s)
  * - failed job: the error text carried in result_filename (or "Fehler")
  * - otherwise: a "not yet available" placeholder
  *
  * @param {{innerHTML: string}} resultCell  Cell whose innerHTML is replaced.
  * @param {{id: *, status: string, result_filename: ?string, result_filename_raw: ?string}} data
  */
 function renderResult(resultCell, data) {
    // Values from the payload end up in innerHTML, so escape them first.
    // For failed jobs result_filename is free text shown after "❌"; the
    // server-side template escapes it, and the client must do the same.
    const escapeHtml = (value) => String(value).replace(
        /[&<>"']/g,
        (ch) => ({ '&': '&amp;', '<': '&lt;', '>': '&gt;', '"': '&quot;', "'": '&#39;' }[ch])
    );
    const hasFailed   = data.status.includes('Failed');
    const hasFiltered = data.result_filename && !hasFailed;
    const hasRaw      = data.result_filename_raw && !hasFailed;
    if (hasFiltered) {
        // The id becomes part of a URL path; encode it instead of trusting it.
        const id = encodeURIComponent(data.id);
        let html = `<a href="/download/${id}" class="dl-btn">🎯 Gefiltert</a>`;
        if (hasRaw) {
            html += ` <a href="/download_raw/${id}" class="dl-btn dl-btn-raw">📋 Alle</a>`;
        }
        resultCell.innerHTML = html;
    } else if (hasFailed) {
        resultCell.innerHTML = `<span class="status-failed">❌ ${escapeHtml(data.result_filename || 'Fehler')}</span>`;
    } else {
        resultCell.innerHTML = `<span class="status-pending">⏳ Noch nicht verfügbar</span>`;
    }
 }
 function pollJob(jobId) {
    // Fetch the job's current state, refresh its status/result cells and
    // schedule the next poll until the status is terminal.
    const pollAgainIn = (ms) => setTimeout(() => pollJob(jobId), ms);

    const applyUpdate = (data) => {
        const statusCell = document.getElementById(`status-${jobId}`);
        const resultCell = document.getElementById(`result-${jobId}`);
        statusCell.textContent = data.status;
        renderResult(resultCell, data);
        // Keep polling while the job has neither finished nor failed.
        const stillRunning = !data.status.includes('✅') && !data.status.includes('Failed');
        if (stillRunning) {
            pollAgainIn(5000);
        }
    };

    fetch(`/job_status/${jobId}`)
        .then((response) => response.json())
        .then(applyUpdate)
        // Any error (network, bad JSON, missing cell) retries after 10 s.
        .catch(() => pollAgainIn(10000));
 }
 </script>
 {% endblock %}
--- a/app/templates/jobs.html
+++ b/app/templates/jobs.html
@ -15,20 +15,38 @@
        </thead>
        <tbody>
            {% for job in jobs %}
-            <tr>
+            <tr id="job-row-{{ job.id }}">
                <td>{{ job.filename }}</td>
-                <td class="job-status">{{ job.status }}</td>
+                <td id="status-{{ job.id }}" class="job-status">
                    {{ job.status }}
                </td>
                <td>{{ job.created_at.strftime('%Y-%m-%d %H:%M:%S') }}</td>
-                <td>
+                <td id="result-{{ job.id }}">
-                    {% if job.status == "Completed" %}
+                    {% if job.result_filename and 'Failed' not in job.status %}
-                        <a href="{{ url_for('auth.download_result', job_id=job.id) }}">Download</a>
+                        <a href="{{ url_for('auth.download_result', job_id=job.id) }}" class="dl-btn">
                            🎯 Gefiltert
                        </a>
                        {% if job.result_filename_raw %}
                        &nbsp;
                        <a href="{{ url_for('auth.download_result_raw', job_id=job.id) }}" class="dl-btn dl-btn-raw">
                            📋 Alle
                        </a>
                        {% endif %}
                    {% elif 'Failed' in job.status %}
                        <span class="status-failed">❌ {{ job.result_filename or 'Fehler' }}</span>
                    {% else %}
-                        Noch nicht verfügbar
+                        <span class="status-pending">⏳ Noch nicht verfügbar</span>
                    {% endif %}
                </td>
                <td>
                    {% if 'Failed' in job.status %}
                    <!-- 🆕 Resume Button -->
                    <form action="{{ url_for('auth.resume_job', job_id=job.id) }}" method="POST" style="display:inline;">
                        <button type="submit" class="btn-resume">▶️ Resume</button>
                    </form>
                    {% endif %}
                    <form action="{{ url_for('auth.delete_job', job_id=job.id) }}" method="POST" style="display:inline;">
-                        <button type="submit" class="delete-btn">Löschen</button>
+                        <button type="submit" class="delete-btn">🗑️ Löschen</button>
                    </form>
                </td>
            </tr>
@ -37,25 +55,101 @@
    </table>
 </div>
 <style>
 .job-status     { font-weight: bold; }
 .status-failed  { color: #e74c3c; font-weight: bold; }
 .status-pending { color: #888; }
 .eta-badge      { display: inline-block; background: #eaf4ff; color: #1a6fa8;
                  border-radius: 10px; padding: 2px 8px; font-size: 0.82em;
                  font-weight: bold; margin-left: 6px; }
 .dl-btn         { display: inline-block; padding: 4px 10px; border-radius: 4px;
                  text-decoration: none; font-size: 0.85em; font-weight: bold;
                  background: #27ae60; color: #fff; margin: 2px 1px; transition: background 0.2s; }
 .dl-btn:hover   { background: #1e8449; }
 .dl-btn-raw     { background: #2980b9; }
 .dl-btn-raw:hover { background: #1a5e8a; }
 .btn-resume     { background: #e67e22; color: white; border: none;
                  padding: 4px 10px; border-radius: 4px; cursor: pointer;
                  font-size: 0.85em; font-weight: bold; margin-right: 4px; }
 .btn-resume:hover { background: #ca6f1e; }
 </style>
 <script>
-    // Periodische Aktualisierung des Jobstatus
+// ETA Badge aus Status-String parsen
-    setInterval(function() {
+function parseStatus(status) {
-        fetch('{{ url_for("auth.job_status") }}')
+    const parts = status.split('|');
-            .then(response => response.text())
+    if (parts.length === 2) {
-            .then(html => {
+        return `<span>${parts[0].trim()}</span>
-                const parser = new DOMParser();
+                <span class="eta-badge">${parts[1].trim()}</span>`;
-                const doc = parser.parseFromString(html, 'text/html');
+    }
-                const newRows = doc.querySelectorAll('#jobs-table tbody tr');
+    return status;
-                const currentRows = document.querySelectorAll('#jobs-table tbody tr');
+}
-                newRows.forEach((newRow, index) => {
+function renderResult(resultCell, data) {
-                    const newStatus = newRow.querySelector('.job-status').textContent;
+    const hasFailed   = data.status.includes('Failed') || data.status.includes('❌');
-                    currentRows[index].querySelector('.job-status').textContent = newStatus;
+    const hasFiltered = data.result_filename && !hasFailed;
    const hasRaw      = data.result_filename_raw && !hasFailed;
-                    const newResult = newRow.querySelector('td:nth-child(4)').innerHTML;
+    if (hasFiltered) {
-                    currentRows[index].querySelector('td:nth-child(4)').innerHTML = newResult;
+        let html = `<a href="/download/${data.id}" class="dl-btn">🎯 Gefiltert</a>`;
-                });
+        if (hasRaw) {
-            });
+            html += ` <a href="/download_raw/${data.id}" class="dl-btn dl-btn-raw">📋 Alle</a>`;
-    }, 5000); // Aktualisierung alle 5 Sekunden
+        }
        resultCell.innerHTML = html;
    } else if (hasFailed) {
        resultCell.innerHTML = `<span class="status-failed">❌ ${data.result_filename || 'Fehler'}</span>`;
    } else {
        resultCell.innerHTML = `<span class="status-pending">⏳ Noch nicht verfügbar</span>`;
    }
 }
 function renderActions(row, data) {
    // Rebuild the last cell of the row: a Resume form when the status
    // contains "Failed", always followed by the delete form.
    const actionsCell = row.querySelector('td:last-child');
    const forms = [];
    if (data.status.includes('Failed')) {
        forms.push(`<form action="/resume_job/${data.id}" method="POST" style="display:inline;">
                    <button type="submit" class="btn-resume">▶️ Resume</button>
                 </form>`);
    }
    forms.push(`<form action="/delete_job/${data.id}" method="POST" style="display:inline;">
                <button type="submit" class="delete-btn">🗑️ Löschen</button>
             </form>`);
    actionsCell.innerHTML = forms.join('');
 }
 function pollJob(jobId) {
    // Fetch the job's current state, refresh its status, result and
    // action cells, and schedule the next poll until the job is done.
    const pollAgainIn = (ms) => setTimeout(() => pollJob(jobId), ms);

    const applyUpdate = (data) => {
        const statusCell = document.getElementById(`status-${jobId}`);
        const resultCell = document.getElementById(`result-${jobId}`);
        const row        = document.getElementById(`job-row-${jobId}`);
        statusCell.innerHTML = parseStatus(data.status);
        renderResult(resultCell, data);
        renderActions(row, data);
        // Terminal states are marked by "✅", "Failed" or "❌" in the status.
        const finished = ['✅', 'Failed', '❌'].some((marker) => data.status.includes(marker));
        if (!finished) {
            pollAgainIn(5000);
        }
    };

    fetch(`/job_status/${jobId}`)
        .then((response) => response.json())
        .then(applyUpdate)
        // Any error (network, bad JSON, missing cell) retries after 10 s.
        .catch(() => pollAgainIn(10000));
 }
 document.addEventListener('DOMContentLoaded', () => {
    // Re-render each server-side status through parseStatus and start
    // polling the jobs that are not in a terminal state yet.
    for (const cell of document.querySelectorAll('.job-status')) {
        // Cell ids have the form "status-<jobId>".
        const [, jobId] = cell.id.split('-');
        const status = cell.textContent.trim();
        cell.innerHTML = parseStatus(status);
        const finished = status.includes('✅') || status.includes('Failed') || status.includes('❌');
        if (!finished) {
            pollJob(jobId);
        }
    }
 });
 </script>
 {% endblock %}
--- a/app/templates/jobs.orig
+++ b/app/templates/jobs.orig
@ -0,0 +1,61 @@
 {% extends "base.html" %}
 {% block content %}
 <div class="table-container">
    <h2>Ihre Aufträge</h2>
    <table id="jobs-table">
        <thead>
            <tr>
                <th>Dateiname</th>
                <th>Status</th>
                <th>Erstellt am</th>
                <th>Ergebnis</th>
                <th>Aktionen</th>
            </tr>
        </thead>
        <tbody>
            {% for job in jobs %}
            <tr>
                <td>{{ job.filename }}</td>
                <td class="job-status">{{ job.status }}</td>
                <td>{{ job.created_at.strftime('%Y-%m-%d %H:%M:%S') }}</td>
                <td>
                    {% if job.status == "Completed" %}
                        <a href="{{ url_for('auth.download_result', job_id=job.id) }}">Download</a>
                    {% else %}
                        Noch nicht verfügbar
                    {% endif %}
                </td>
                <td>
                    <form action="{{ url_for('auth.delete_job', job_id=job.id) }}" method="POST" style="display:inline;">
                        <button type="submit" class="delete-btn">Löschen</button>
                    </form>
                </td>
            </tr>
            {% endfor %}
        </tbody>
    </table>
 </div>
 <script>
    // Periodically refresh the job status: re-fetch the page rendered by
    // the job_status endpoint and copy status + result cells into this page.
    setInterval(function() {
        fetch('{{ url_for("auth.job_status") }}')
            .then(response => response.text())
            .then(html => {
                // Parse the returned HTML so its table rows can be queried.
                const parser = new DOMParser();
                const doc = parser.parseFromString(html, 'text/html');
                const newRows = doc.querySelectorAll('#jobs-table tbody tr');
                const currentRows = document.querySelectorAll('#jobs-table tbody tr');
                // Rows are matched purely by position (index), not by job id.
                newRows.forEach((newRow, index) => {
                    const newStatus = newRow.querySelector('.job-status').textContent;
                    currentRows[index].querySelector('.job-status').textContent = newStatus;
                    // The 4th column is the "Ergebnis" (result) cell.
                    const newResult = newRow.querySelector('td:nth-child(4)').innerHTML;
                    currentRows[index].querySelector('td:nth-child(4)').innerHTML = newResult;
                });
            });
    }, 5000); // refresh every 5 seconds
 </script>
 {% endblock %}
--- a/app/webcrawler.bck02032026
+++ b/app/webcrawler.bck02032026
@ -0,0 +1,316 @@
 import os
 import re
 import pandas as pd
 import requests
 import time
 import random
 from io import StringIO
 from app.models import db, Job
 print("🆕 MODERN webcrawler LOADED!")
 UPLOAD_FOLDER = '/app/uploads'
 RESULT_FOLDER = '/app/results'
 SCRAPER_URL = "http://gmaps-scraper:8080"
 OUTPUT_COLS = ['title', 'category', 'address', 'open_hours', 'website', 'phone', 'link']
 # ──────────────────────────────────────────────
 # Hilfsfunktionen
 # ──────────────────────────────────────────────
 def get_batch_size(total_rows):
    """Return how many queries go into one scraper batch.

    Inputs with fewer than 200 rows use batches of 10; larger inputs use
    batches of 5.
    """
    return 10 if total_rows < 200 else 5
 def get_delay(total_rows):
    """Return the (min, max) delay range in seconds for an input size.

    Fewer than 50 rows -> (5, 10), fewer than 200 rows -> (10, 20),
    anything larger -> (20, 40).
    """
    for upper_bound, delay_range in ((50, (5, 10)), (200, (10, 20))):
        if total_rows < upper_bound:
            return delay_range
    return (20, 40)
 def is_blocked(data):
    """Heuristically detect an anti-bot or rate-limit response.

    ``data`` may be any object; it is stringified and lower-cased before
    matching. Returns True, after printing the first 100 characters, when
    one of the block indicators is present, otherwise False.
    """
    text = str(data).lower()
    phrases = ('captcha', 'blocked', 'rate limit', 'too many')
    # Match the status code 429 only as a standalone number. A plain
    # substring test also fires on job IDs or phone numbers that merely
    # contain the digits "429".
    blocked = (
        any(kw in text for kw in phrases)
        or re.search(r'\b429\b', text) is not None
    )
    if blocked:
        print(f"🚫 BLOCKED: {str(data)[:100]}")
    return blocked
 def fix_encoding(text):
    """Repair UTF-8 text that was mis-decoded as a single-byte charset.

    Example: ``IndustriestraÃŸe`` -> ``Industriestraße``.

    Non-string values are returned unchanged, and so are strings that do
    not round-trip to valid UTF-8 (i.e. text that was not mojibake).
    """
    if not isinstance(text, str):
        return text
    # Latin-1 is tried first (the original behaviour). cp1252 is the
    # fallback because mojibake characters such as "Ÿ" or "€" do not exist
    # in Latin-1, so the Latin-1 round trip alone cannot repair them.
    for codec in ('latin-1', 'cp1252'):
        try:
            return text.encode(codec).decode('utf-8')
        except (UnicodeEncodeError, UnicodeDecodeError):
            continue
    return text
 def build_input_addresses(df):
    """Collect one normalized address string per row of the input CSV.

    Each entry is "<Straße> <Hausnummer> <Zusatz> <PLZ> <Stadt>",
    lower-cased and with runs of whitespace collapsed to single spaces.
    A missing column contributes an empty string. Returns a set.
    """
    field_order = ('Straße', 'Hausnummer', 'Zusatz', 'PLZ', 'Stadt')
    return {
        ' '.join(
            ' '.join(str(record.get(field, '')) for field in field_order)
            .lower()
            .split()
        )
        for _, record in df.iterrows()
    }
 def normalize_address(addr):
    """Normalize a result address for comparison with the input addresses.

    Non-string values yield ''. Strings are passed through fix_encoding,
    lower-cased, and have their whitespace collapsed to single spaces.
    """
    if isinstance(addr, str):
        repaired = fix_encoding(addr)
        return ' '.join(repaired.lower().split())
    return ''
 def address_in_input(result_addr, input_addresses):
    """Check whether a result address matches any of the input addresses.

    An input address matches when its first 5-digit number (the postal
    code) occurs in the normalized result address and its first token is
    longer than three characters with its first four characters also
    occurring there.
    """
    candidate = normalize_address(result_addr)

    def _matches(known_addr):
        found = re.search(r'\b\d{5}\b', known_addr)
        if found is None or found.group() not in candidate:
            return False
        # The first token is used as the street name.
        first_token = known_addr.split()[0] if known_addr else ''
        return len(first_token) > 3 and first_token[:4].lower() in candidate

    return any(_matches(known) for known in input_addresses)
 # ──────────────────────────────────────────────
 # CSV Nachbearbeitung (apply_filter umschaltbar)
 # ──────────────────────────────────────────────
 def process_result_csv(raw_bytes, input_df, apply_filter=True):
    """
    Turn the raw scraper CSV into a cleaned DataFrame.

    Steps, in order:
    - keep only the OUTPUT_COLS that are present
    - repair the encoding of every cell via fix_encoding
    - optionally (apply_filter) keep only rows whose address matches the input
    - drop duplicate (title, address) rows and rows with an empty title

    Args:
        raw_bytes: CSV content as bytes; decoded as UTF-8 with replacement.
        input_df: DataFrame of the uploaded input; only used when filtering.
        apply_filter: when True, apply the input/output address matching.

    Returns:
        The cleaned DataFrame, or None if any step raised (the error is
        printed together with a traceback).
    """
    try:
        content = raw_bytes.decode('utf-8', errors='replace')
        df_out = pd.read_csv(StringIO(content))
        print(f"📄 Raw result: {df_out.shape} | Columns: {list(df_out.columns)[:8]}")
        # Restrict to the expected output columns
        available = [c for c in OUTPUT_COLS if c in df_out.columns]
        missing   = [c for c in OUTPUT_COLS if c not in df_out.columns]
        if missing:
            print(f"⚠️ Fehlende Spalten: {missing}")
        df_out = df_out[available]
        # 🔤 Encoding fix
        for col in df_out.columns:
            df_out[col] = df_out[col].apply(fix_encoding)
        print(f"🔤 Encoding fix: done")
        if apply_filter:
            # 📍 Match result addresses against the input addresses
            input_addresses = build_input_addresses(input_df)
            before = len(df_out)
            df_out = df_out[
                df_out['address'].apply(
                    lambda a: address_in_input(a, input_addresses)
                )
            ]
            print(f"📍 Adress-Filter: {before} → {len(df_out)} Zeilen")
        # 🔁 Remove duplicates (always, also for the unfiltered version)
        # NOTE(review): raises KeyError (-> returns None) if 'title' or
        # 'address' is not among the available columns.
        before_dedup = len(df_out)
        df_out = df_out.drop_duplicates(subset=['title', 'address'], keep='first')
        print(f"🔁 Duplikate: {before_dedup} → {len(df_out)} Zeilen")
        # Remove rows with a missing or whitespace-only title
        df_out = df_out.dropna(subset=['title'], how='all')
        df_out = df_out[df_out['title'].str.strip().astype(bool)]
        print(f"✅ Final ({'gefiltert' if apply_filter else 'alle'}): {df_out.shape}")
        return df_out
    except Exception as e:
        # Best effort: report the failure and signal it to the caller via None.
        print(f"💥 process_result_csv: {e}")
        import traceback
        traceback.print_exc()
        return None
 # ──────────────────────────────────────────────
 # Haupt-Worker
 # ──────────────────────────────────────────────
 def process_file(filename, job_id, app):
    """Run one scraping job for an uploaded CSV and store the results.

    Reads the CSV from UPLOAD_FOLDER, builds search queries from its address
    columns, submits a batch to the scraper service, polls until the scraper
    job finishes, then writes a filtered and an unfiltered result CSV to
    RESULT_FOLDER. Progress and outcome are written to the Job row.

    Args:
        filename: name of the uploaded file inside UPLOAD_FOLDER.
        job_id: primary key of the Job row to update.
        app: object providing ``app_context()`` (presumably the Flask app).

    Returns:
        None. Any exception is caught; the job is then marked "Failed" and
        the error message is stored in ``job.result_filename``.
    """
    print(f"🎯 {filename} Job#{job_id} START!")
    with app.app_context():
        job = Job.query.get(job_id)
        if not job:
            print("❌ Job missing")
            return
        try:
            # 1️⃣ CSV Parse
            job.status = "📊 parsing CSV"
            db.session.commit()
            filepath = os.path.join(UPLOAD_FOLDER, filename)
            print(f"📁 {filepath} | {os.path.getsize(filepath)}b")
            df_input = pd.read_csv(filepath, sep=';', encoding='ISO-8859-1')
            print(f"📊 {df_input.shape} | Columns: {list(df_input.columns)}")
            queries = []
            for _, row in df_input.iterrows():
                parts = [
                    str(row.get('PLZ', '')).strip(),
                    str(row.get('Stadt', '')).strip(),
                    str(row.get('Straße', '')).strip(),
                    str(row.get('Hausnummer', '')).strip(),
                    str(row.get('Zusatz', '')).strip(),
                ]
                # Skip empty parts and stringified NaN values ('nan').
                q = f"Firmen {' '.join(p for p in parts if p and p != 'nan')}".strip()
                # Queries of 10 characters or fewer are discarded.
                if len(q) > 10:
                    queries.append(q)
            total = len(queries)
            print(f"🔍 {total} Queries | Samples: {queries[:3]}")
            if not queries:
                raise ValueError("Keine gültigen Adressen in CSV")
            # 2️⃣ Batch + Delay
            batch_size = get_batch_size(total)
            delay_min, delay_max = get_delay(total)
            # NOTE(review): only the first batch_size queries are ever sent;
            # the remaining queries are not submitted anywhere in this function.
            batch = queries[:batch_size]
            pre_delay = random.uniform(delay_min, delay_max)
            print(f"📦 Batch {len(batch)}/{total} | 😴 {pre_delay:.1f}s Delay")
            time.sleep(pre_delay)
            # 3️⃣ API Call
            job.status = "📤 sending to scraper"
            db.session.commit()
            payload = {
                "name": f"{filename.replace('.csv','')}-{job_id}",
                "keywords": batch,
                "lang": "de",
                "depth": 1,
                "zoom": 17,
                "radius": 50,
                "max_time": 60,
                "fast_mode": False
            }
            print(f"🌐 POST {SCRAPER_URL}/api/v1/jobs | {payload['name']}")
            resp = requests.post(f"{SCRAPER_URL}/api/v1/jobs", json=payload, timeout=30)
            print(f"📤 {resp.status_code}: {resp.text[:300]}")
            if is_blocked(resp.text):
                raise ValueError("🚫 IP geblockt! Proxy konfigurieren.")
            if resp.status_code != 201:
                raise ValueError(f"API {resp.status_code}: {resp.text[:200]}")
            # 4️⃣ Polling
            scraper_id = resp.json()['id']
            job.scraper_job_id = scraper_id
            job.status = "⏳ scraping"
            db.session.commit()
            print(f"✅ Scraper Job: {scraper_id}")
            for i in range(1, 61):  # up to 60 polls, each followed by an 8-15 s sleep
                try:
                    r = requests.get(
                        f"{SCRAPER_URL}/api/v1/jobs/{scraper_id}",
                        timeout=10
                    )
                    data = r.json()
                    # The status key is looked up in both capitalizations.
                    status = data.get('Status', data.get('status', '?'))
                    print(f"⏳ {i}/60: {status}")
                    if is_blocked(data):
                        raise ValueError("🚫 IP geblockt während scraping!")
                    if status in ('ok', 'completed', 'scraped'):
                        dl = requests.get(
                            f"{SCRAPER_URL}/api/v1/jobs/{scraper_id}/download",
                            timeout=60
                        )
                        if dl.status_code != 200:
                            raise ValueError(f"Download {dl.status_code}")
                        if is_blocked(dl.text[:200]):
                            raise ValueError("🚫 IP geblockt beim Download!")
                        # 5️⃣ Post-processing → two versions
                        job.status = "🔧 processing result"
                        db.session.commit()
                        base = filename.replace('.csv', '')
                        os.makedirs(RESULT_FOLDER, exist_ok=True)
                        # ── Version A: filtered (address matching + deduplication) ──
                        df_filtered = process_result_csv(dl.content, df_input, apply_filter=True)
                        outname_filtered = f"results_{base}_filtered.csv"
                        outpath_filtered = os.path.join(RESULT_FOLDER, outname_filtered)
                        if df_filtered is not None and len(df_filtered) > 0:
                            df_filtered.to_csv(
                                outpath_filtered, index=False,
                                encoding='utf-8-sig', sep=';'
                            )
                            print(f"🎯 Filtered: {outname_filtered} → {len(df_filtered)} Firmen")
                        else:
                            # No matches (or processing failed): write a header-only file.
                            print("⚠️ Keine Treffer nach Filter – leere Datei wird erstellt")
                            pd.DataFrame(columns=OUTPUT_COLS).to_csv(
                                outpath_filtered, index=False,
                                encoding='utf-8-sig', sep=';'
                            )
                        # ── Version B: all rows (no address filter) ──
                        df_raw = process_result_csv(dl.content, df_input, apply_filter=False)
                        outname_raw = f"results_{base}_all.csv"
                        outpath_raw = os.path.join(RESULT_FOLDER, outname_raw)
                        if df_raw is not None:
                            df_raw.to_csv(
                                outpath_raw, index=False,
                                encoding='utf-8-sig', sep=';'
                            )
                            print(f"📋 All: {outname_raw} → {len(df_raw)} Firmen")
                        else:
                            # Processing failed: fall back to the unmodified download.
                            print("⚠️ df_raw None – Rohinhalt wird gespeichert")
                            with open(outpath_raw, 'wb') as f:
                                f.write(dl.content)
                        # ── Persist to the database ──
                        job.status = "✅ Fertig"
                        job.result_filename     = outname_filtered   # 🎯 filtered
                        job.result_filename_raw = outname_raw        # 📋 all
                        db.session.commit()
                        print(f"🎉 Beide Dateien gespeichert!")
                        break
                    elif status in ('failed', 'cancelled', 'error'):
                        raise ValueError(f"Scraper: {status}")
                except requests.RequestException as e:
                    # Request errors while polling are logged and the poll is retried.
                    print(f"⚠️ Poll {i}: {e}")
                time.sleep(random.uniform(8, 15))
            else:
                # for/else: reached only when no poll hit `break`.
                raise ValueError("Timeout nach 10min")
        except Exception as e:
            # On failure the error text is stored in result_filename.
            job.status = "Failed"
            job.result_filename = str(e)
            print(f"💥 ERROR: {e}")
            import traceback
            traceback.print_exc()
        db.session.commit()
        print(f"✅ DONE! Status: {job.status}\n")
--- a/app/webcrawler.bck04032026
+++ b/app/webcrawler.bck04032026
@ -0,0 +1,275 @@
 import os
 import re
 import pandas as pd
 import requests
 import time
 import random
 from io import StringIO
 from app.models import db, Job
 print("🆕 MODERN webcrawler LOADED! – BATCHED + PROXY")
 UPLOAD_FOLDER = '/app/uploads'
 RESULT_FOLDER = '/app/results'
 SCRAPER_URL = "http://gmaps-scraper:8080"
 OUTPUT_COLS = ['title', 'category', 'address', 'open_hours', 'website', 'phone', 'link']
 # SECURITY(review): proxy credentials are committed in plain text here.
 # Allow overriding via the SCRAPER_PROXY_URL environment variable; the
 # literal remains only as a backward-compatible default and should be
 # rotated and removed from the repository.
 PROXY_URL = os.environ.get(
    "SCRAPER_PROXY_URL",
    "http://bitlleuv-rotate:s5hzse6hz74b@p.webshare.io:80",
 )
 # Proxy mapping in the {"http": ..., "https": ...} form used by requests.
 API_PROXIES = {"http": PROXY_URL, "https": PROXY_URL}
 # ──────────────────────────────────────────────
 # Hilfsfunktionen
 # ──────────────────────────────────────────────
 def is_blocked(data):
    """Heuristically detect an anti-bot or rate-limit response.

    ``data`` may be any object; it is stringified and lower-cased before
    matching. Returns True, after printing the first 100 characters, when
    a block indicator is present, otherwise False.
    """
    text = str(data).lower()
    # "429" must be a standalone number: a plain substring test also fires
    # on IDs or phone numbers that merely contain those digits.
    blocked = (
        any(kw in text for kw in ('captcha', 'blocked', 'rate limit', 'too many'))
        or bool(re.search(r'\b429\b', text))
    )
    if blocked:
        print(f"🚫 BLOCKED: {str(data)[:100]}")
    return blocked
 def fix_encoding(text):
    """Repair UTF-8 text that was mis-decoded as a single-byte charset.

    Non-string values are returned unchanged, and so are strings that do
    not round-trip to valid UTF-8 (i.e. text that was not mojibake).
    """
    if not isinstance(text, str):
        return text
    # Latin-1 first (the original behaviour), then cp1252: mojibake such
    # as "ÃŸ" for "ß" contains characters that Latin-1 cannot encode, so
    # the Latin-1 round trip alone leaves it unrepaired.
    for codec in ('latin-1', 'cp1252'):
        try:
            return text.encode(codec).decode('utf-8')
        except (UnicodeEncodeError, UnicodeDecodeError):
            continue
    return text
 def build_input_addresses(df):
    """Build the set of normalized addresses found in the input CSV.

    Per row the fields Straße, Hausnummer, Zusatz, PLZ and Stadt are joined
    in that order, lower-cased, and whitespace-collapsed. A missing column
    contributes an empty string.
    """
    columns = ['Straße', 'Hausnummer', 'Zusatz', 'PLZ', 'Stadt']
    normalized = set()
    for _, record in df.iterrows():
        joined = ' '.join(str(record.get(name, '')).strip() for name in columns)
        normalized.add(' '.join(joined.lower().split()))
    return normalized
 def normalize_address(addr):
    """Return a comparable form of a result address.

    Strings are passed through fix_encoding, lower-cased, and have their
    whitespace collapsed to single spaces; any non-string value yields ''.
    """
    return ' '.join(fix_encoding(addr).lower().split()) if isinstance(addr, str) else ''
 def address_in_input(result_addr, input_addresses):
    """Report whether a result address matches any input address.

    An input address matches when its first 5-digit number (the postal
    code) occurs in the normalized result address and its first token is
    longer than three characters with its first four characters also
    occurring there.
    """
    target = normalize_address(result_addr)
    plz_pattern = re.compile(r'\b\d{5}\b')
    for entry in input_addresses:
        hit = plz_pattern.search(entry)
        if hit is None:
            continue
        if hit.group() not in target:
            continue
        # The first token is used as the street name.
        head = entry.split()[0] if entry else ''
        if len(head) > 3 and head[:4].lower() in target:
            return True
    return False
 # ──────────────────────────────────────────────
 # CSV Nachbearbeitung
 # ──────────────────────────────────────────────
 def process_result_csv(raw_bytes, input_df, apply_filter=True):
    """Turn the raw scraper CSV into a cleaned DataFrame.

    Keeps only the OUTPUT_COLS that are present, repairs cell encoding via
    fix_encoding, optionally keeps only rows whose address matches the
    input, then drops duplicate (title, address) rows and rows with an
    empty title.

    Returns the cleaned DataFrame, or None if any step raised (the error
    is printed).
    """
    try:
        table = pd.read_csv(StringIO(raw_bytes.decode('utf-8', errors='replace')))
        print(f"📄 Raw result: {table.shape}")
        table = table[[name for name in OUTPUT_COLS if name in table.columns]]
        for name in table.columns:
            table[name] = table[name].apply(fix_encoding)
        if apply_filter:
            known_addresses = build_input_addresses(input_df)
            rows_before = len(table)
            keep_mask = table['address'].apply(
                lambda value: address_in_input(value, known_addresses)
            )
            table = table[keep_mask]
            print(f"📍 Filter: {rows_before} → {len(table)}")
        table = table.drop_duplicates(subset=['title', 'address'], keep='first')
        table = table.dropna(subset=['title'], how='all')
        # Drop rows whose title is only whitespace.
        has_title = table['title'].str.strip().astype(bool)
        table = table[has_title]
        label = 'gefiltert' if apply_filter else 'alle'
        print(f"✅ Final ({label}): {table.shape}")
        return table
    except Exception as exc:
        # Best effort: report the failure and signal it to the caller via None.
        print(f"💥 process_result_csv: {exc}")
        return None
 # ──────────────────────────────────────────────
 # HAUPT-WORKER
 # ──────────────────────────────────────────────
 def process_file(filename, job_id, app):
    print(f"🎯 {filename} Job#{job_id} START!")
    with app.app_context():
        job = Job.query.get(job_id)
        if not job:
            print("❌ Job missing")
            return
        try:
            #Parse + ALLE Queries
            job.status = "📊 parsing CSV"
            db.session.commit()
            filepath = os.path.join(UPLOAD_FOLDER, filename)
            print(f"📁 {filepath} | {os.path.getsize(filepath)}b")
            df_input = pd.read_csv(filepath, sep=';', encoding='ISO-8859-1')
            print(f"📊 {df_input.shape}")
            queries = []
            for _, row in df_input.iterrows():
                parts = [
                    str(row.get('PLZ', '')).strip(),
                    str(row.get('Stadt', '')).strip(),
                    str(row.get('Straße', '')).strip(),
                    str(row.get('Hausnummer', '')).strip(),
                    str(row.get('Zusatz', '')).strip(),
                ]
                q = f"Firmen {' '.join(p for p in parts if p and p != 'nan')}".strip()
                if len(q) > 10:
                    queries.append(q)
            total_queries = len(queries)
            print(f"🔍 {total_queries} Queries | Samples: {queries[:3]}")
            if total_queries == 0:
                raise ValueError("Keine gültigen Adressen")
            #BATCHED Processing
            BATCH_SIZE = 10                              # Erhöht: 5 → 10 (paid proxy)
            BATCH_DELAY_MIN, BATCH_DELAY_MAX = 10, 20   # Reduziert: 30-60s → 10-20s (paid proxy)
            batches = (total_queries + BATCH_SIZE - 1) // BATCH_SIZE
            print(f"📦 {batches} Batches à {BATCH_SIZE} | ETA: ~{batches*15//60:.0f}h")
            all_results_filtered = []
            all_results_raw = []
            job.status = f"🔄 Batch 1/{batches}"
            db.session.commit()
            for batch_idx in range(batches):
                batch_start = batch_idx * BATCH_SIZE
                batch_end = min(batch_start + BATCH_SIZE, total_queries)
                batch_queries = queries[batch_start:batch_end]
                print(f"\n🔄 BATCH {batch_idx+1}/{batches} ({batch_start+1}-{batch_end}/{total_queries})")
                #Random Delay
                delay = random.uniform(BATCH_DELAY_MIN, BATCH_DELAY_MAX)
                print(f"😴 Delay: {delay:.0f}s | Proxy: {PROXY_URL}")
                time.sleep(delay)
                #API Call
                payload = {
                    "name": f"{filename.replace('.csv','')}-{job_id}-B{batch_idx+1:03d}",
                    "keywords": batch_queries,
                    "lang": "de",
                    "depth": 1,
                    "zoom": 17,
                    "radius": 50,
                    "max_time": 60,       # Reduziert: 120 → 60 (paid proxy schneller)
                    "fast_mode": False,
                    "proxies": [PROXY_URL]
                }
                try:
                    resp = requests.post(
                        f"{SCRAPER_URL}/api/v1/jobs",
                        json=payload,
                        timeout=45
                    )
                    print(f"📤 {resp.status_code}")
                    if is_blocked(resp.text):
                        print("🚫 Batch übersprungen (blocked)")
                        continue
                    if resp.status_code != 201:
                        print(f"⚠️ Batch {batch_idx+1} fehlgeschlagen: {resp.text[:100]}")
                        continue
                    scraper_id = resp.json()['id']
                    print(f"✅ Scraper: {scraper_id}")
                    for poll_i in range(1, 61):           # Reduziert: 121 → 61 (max_time 60s)
                        r = requests.get(
                            f"{SCRAPER_URL}/api/v1/jobs/{scraper_id}",
                            timeout=15
                        )
                        data = r.json()
                        status = data.get('Status', data.get('status', '?'))
                        if status in ('ok', 'completed', 'scraped'):
                            dl = requests.get(
                                f"{SCRAPER_URL}/api/v1/jobs/{scraper_id}/download",
                                timeout=90
                            )
                            if dl.status_code == 200:
                                df_filtered = process_result_csv(dl.content, df_input, True)
                                df_raw = process_result_csv(dl.content, df_input, False)
                                if df_filtered is not None:
                                    all_results_filtered.append(df_filtered)
                                    all_results_raw.append(df_raw)
                                    print(f"📊 Batch {batch_idx+1}: {len(df_filtered)} filtered")
                            break
                        elif status in ('failed', 'error'):
                            print(f"💥 Batch {batch_idx+1}: {status}")
                            break
                        time.sleep(random.uniform(5, 10))  # Reduziert: 10-20s → 5-10s (paid proxy)
                except Exception as e:
                    print(f"💥 Batch {batch_idx+1}: {e}")
                job.status = f"🔄 Batch {batch_idx+2}/{batches}"
                db.session.commit()
            #MERGE & SAVE
            job.status = "🔧 merging results"
            db.session.commit()
            base = filename.replace('.csv', '')
            os.makedirs(RESULT_FOLDER, exist_ok=True)
            if all_results_filtered:
                df_final_filtered = pd.concat(all_results_filtered, ignore_index=True)
                df_final_filtered = df_final_filtered.drop_duplicates(subset=['title', 'address'])
                out_filtered = f"results_{base}_filtered.csv"
                df_final_filtered.to_csv(
                    os.path.join(RESULT_FOLDER, out_filtered),
                    index=False, encoding='utf-8-sig', sep=';'
                )
                if all_results_raw:
                    df_final_raw = pd.concat(all_results_raw, ignore_index=True)
                    out_raw = f"results_{base}_all.csv"
                    df_final_raw.to_csv(
                        os.path.join(RESULT_FOLDER, out_raw),
                        index=False, encoding='utf-8-sig', sep=';'
                    )
                job.result_filename = out_filtered
                job.result_filename_raw = out_raw
                job.status = f"✅ Fertig: {len(df_final_filtered)} Firmen"
            else:
                job.status = "❌ Keine Ergebnisse"
            db.session.commit()
            print(f"🎉 Job {job_id} komplett!")
        except Exception as e:
            job.status = f"Failed: {str(e)[:50]}"
            print(f"💥 FATAL: {e}")
            import traceback
            traceback.print_exc()
            db.session.commit()
        print(f"✅ DONE! Status: {job.status}")
--- a/app/webcrawler.bck04032026_2
+++ b/app/webcrawler.bck04032026_2
@ -0,0 +1,429 @@
 import os
 import re
 import unicodedata
 import json
 import pandas as pd
 import requests
 import time
 import random
 from io import StringIO
 from app.models import db, Job
 print("🆕 MODERN webcrawler LOADED! – BATCHED + PROXY + RESUME + ETA + 2x SCRAPER")
 # Folder the uploaded input CSVs are read from.
 UPLOAD_FOLDER = '/app/uploads'
 # Folder that receives result CSVs as well as progress and partial files.
 RESULT_FOLDER = '/app/results'
 # Two scraper instances; process_file alternates between them per batch.
 SCRAPER_URLS = [
    "http://gmaps-scraper-1:8080",
    "http://gmaps-scraper-2:8080",
 ]
 # Columns kept from the scraper's CSV output (those that are present).
 OUTPUT_COLS = ['title', 'category', 'address', 'open_hours', 'website', 'phone', 'link']
 # Proxy handed to the scraper in every job payload.
 # NOTE(review): the URL embeds proxy credentials in source — consider loading it from configuration.
 PROXY_URL = "http://bitlleuv-rotate:s5hzse6hz74b@p.webshare.io:80"
 # requests-style proxies mapping; not referenced by the code visible in this module — TODO confirm it is still needed.
 API_PROXIES = {"http": PROXY_URL, "https": PROXY_URL}
 # ──────────────────────────────────────────────
 # Tuning
 # ──────────────────────────────────────────────
 BATCH_SIZE        = 30    # Keywords per scraper job
 BATCH_DELAY_MIN   = 3     # Pause before each batch in seconds (min)
 BATCH_DELAY_MAX   = 6     # Pause before each batch in seconds (max)
 MAX_TIME          = 60    # Seconds the scraper gets per batch (sent as "max_time")
 POLL_MAX          = 90    # Max. poll attempts per batch
 POLL_DELAY_MIN    = 2     # Seconds between polls (min)
 POLL_DELAY_MAX    = 5     # Seconds between polls (max)
 STUCK_THRESHOLD   = 8     # Consecutive 'pending' polls before the scraper is restarted
 MAX_RETRIES       = 2     # Attempts per batch (the retry loop runs 1..MAX_RETRIES)
 # ──────────────────────────────────────────────
 # Hilfsfunktionen
 # ──────────────────────────────────────────────
 def is_blocked(data):
    """Return True when a scraper response looks like a block or rate limit.
    The payload is stringified and lower-cased, then searched for the phrases
    'captcha', 'blocked', 'rate limit' and 'too many' and for the status code
    429. The code only counts as a standalone token, so an identifier that
    merely contains the digits 429 (for example a job id) is not mistaken for
    a block. A hit is logged with the first 100 characters of the payload.
    """
    raw = str(data)
    text = raw.lower()
    phrases = ('captcha', 'blocked', 'rate limit', 'too many')
    blocked = (
        any(kw in text for kw in phrases)
        or re.search(r'\b429\b', text) is not None
    )
    if blocked:
        print(f"🚫 BLOCKED: {raw[:100]}")
    return blocked
 def fix_encoding(text):
    """Repair UTF-8 text that was wrongly decoded as Latin-1.
    Strings are re-encoded as Latin-1 and decoded as UTF-8; when either step
    fails the string is returned unchanged. Non-string values pass through.
    """
    if isinstance(text, str):
        try:
            text = text.encode('latin-1').decode('utf-8')
        except (UnicodeEncodeError, UnicodeDecodeError):
            # Not mojibake (or not representable) – keep the original text.
            pass
    return text
 # Fix 1: Sonderzeichen in Queries bereinigen
 def clean_query(q):
    """Strip control characters from a query and normalise its whitespace.
    Whitespace control characters (tab, newline, ...) are treated as word
    separators instead of being deleted, so "12345\\tBerlin" becomes
    "12345 Berlin" rather than "12345Berlin". All other characters of Unicode
    category Cc are removed. Runs of whitespace collapse to a single space and
    leading/trailing whitespace is dropped.
    """
    kept = (c for c in q if c.isspace() or unicodedata.category(c) != 'Cc')
    # str.split() without arguments splits on any whitespace and trims both ends.
    return ' '.join(''.join(kept).split())
 def build_input_addresses(df):
    """Collect one normalised address string per row of the input frame.
    Each row yields "<Straße> <Hausnummer> <Zusatz> <PLZ> <Stadt>" in lower
    case with whitespace collapsed; missing columns count as empty strings.
    Returns the addresses as a set.
    """
    columns = ('Straße', 'Hausnummer', 'Zusatz', 'PLZ', 'Stadt')
    collected = set()
    for _, record in df.iterrows():
        fields = [str(record.get(col, '')).strip() for col in columns]
        joined = ' '.join(fields).lower().strip()
        collected.add(' '.join(joined.split()))
    return collected
 def normalize_address(addr):
    """Return ``addr`` encoding-repaired, lower-cased and whitespace-collapsed.
    Anything that is not a string normalises to the empty string.
    """
    if isinstance(addr, str):
        # split() without arguments also discards leading/trailing whitespace.
        tokens = fix_encoding(addr).lower().split()
        return ' '.join(tokens)
    return ''
 def address_in_input(result_addr, input_addresses):
    """Tell whether a scraped address matches one of the input addresses.
    A match needs the input's five-digit postal code to occur in the
    normalised result address and, in addition, the first four characters of
    the input's first token (when that token is longer than three characters).
    """
    candidate = normalize_address(result_addr)
    for known in input_addresses:
        found = re.search(r'\b\d{5}\b', known)
        if found is None or found.group() not in candidate:
            continue
        first_token = known.split()[0] if known else ''
        if len(first_token) > 3 and first_token[:4].lower() in candidate:
            return True
    return False
 def format_eta(seconds):
    """Render a duration in seconds as a short human-readable ETA.
    Below one minute the result is "<n>s", below one hour "<m>min",
    otherwise "<h>h <mm>min". Fractions are truncated.
    """
    if seconds < 60:
        return f"{int(seconds)}s"
    total_minutes = int(seconds) // 60
    hours, minutes = divmod(total_minutes, 60)
    if hours > 0:
        return f"{hours}h {minutes:02d}min"
    return f"{minutes}min"
 # ──────────────────────────────────────────────
 # Fix 3: Scraper-Neustart bei Inactivity
 # ──────────────────────────────────────────────
 def restart_scraper(scraper_url):
    """Restart the Docker container that backs ``scraper_url``.
    The container name is derived from the host part of the URL
    (http://gmaps-scraper-1:8080 → gmaps-scraper-1) and restarted through the
    ``docker`` CLI. After a successful restart the function waits 15 seconds
    before returning.
    Returns True only when ``docker restart`` exited with status 0. Returns
    False when the command failed, timed out, could not be launched, or the
    URL could not be parsed. Never raises.
    """
    try:
        import subprocess
        container = scraper_url.split("//")[1].split(":")[0]
        print(f"🔄 Starte {container} neu...")
        result = subprocess.run(["docker", "restart", container], timeout=30, capture_output=True)
        if result.returncode != 0:
            # A non-zero exit means the container was not restarted; report it
            # instead of claiming success and waiting for nothing.
            detail = result.stderr.decode('utf-8', errors='replace').strip()
            print(f"⚠️ Scraper-Neustart fehlgeschlagen: exit {result.returncode} {detail[:200]}")
            return False
        print(f"✅ {container} neu gestartet – warte 15s...")
        time.sleep(15)
        return True
    except Exception as e:
        print(f"⚠️ Scraper-Neustart fehlgeschlagen: {e}")
        return False
 # ──────────────────────────────────────────────
 # Resume: Progress-File Hilfsfunktionen
 # ──────────────────────────────────────────────
 def get_progress_path(job_id):
    """Return the path of the JSON progress file for ``job_id`` in RESULT_FOLDER."""
    progress_name = f"progress_{job_id}.json"
    return os.path.join(RESULT_FOLDER, progress_name)
 def get_partial_path(job_id, suffix):
    """Return the path of the partial-result CSV for ``job_id`` and ``suffix`` in RESULT_FOLDER."""
    partial_name = f"partial_{job_id}_{suffix}.csv"
    return os.path.join(RESULT_FOLDER, partial_name)
 def load_progress(job_id):
    """Load the saved batch progress of a job, if there is any.
    Returns the decoded dict (keys 'last_completed_batch' and 'total_batches')
    or None when no progress file exists. An unreadable or malformed progress
    file is reported and treated like a missing one, so a damaged file makes
    the job start over instead of failing on every resume attempt.
    """
    path = get_progress_path(job_id)
    if not os.path.exists(path):
        return None
    try:
        with open(path, 'r') as f:
            data = json.load(f)
        next_batch = data['last_completed_batch'] + 1
        total = data['total_batches']
    except (OSError, ValueError, KeyError, TypeError) as e:
        # ValueError covers json.JSONDecodeError (truncated/corrupt file).
        print(f"⚠️ Progress-Datei unlesbar – starte von vorn: {e}")
        return None
    print(f"🔁 RESUME: ab Batch {next_batch}/{total}")
    return data
 def save_progress(job_id, last_completed_batch, total_batches):
    """Persist the index of the last finished batch for ``job_id``.
    The JSON is written to a temporary sibling file first and then moved over
    the real progress file with ``os.replace``, so an interruption during the
    write cannot leave a truncated progress file behind.
    """
    path = get_progress_path(job_id)
    tmp_path = f"{path}.tmp"
    with open(tmp_path, 'w') as f:
        json.dump({"last_completed_batch": last_completed_batch, "total_batches": total_batches}, f)
    os.replace(tmp_path, path)
 def append_partial(job_id, df_filtered, df_raw):
    """Append one batch result to the job's partial CSV files.
    ``df_filtered`` goes to the 'filtered' partial and ``df_raw`` to the 'raw'
    partial; a frame that is None is skipped. The header row is written only
    when the target file does not exist yet.
    """
    frames = {'filtered': df_filtered, 'raw': df_raw}
    for suffix, frame in frames.items():
        if frame is not None:
            target = get_partial_path(job_id, suffix)
            write_header = not os.path.exists(target)
            frame.to_csv(target, mode='a', index=False, header=write_header, encoding='utf-8-sig', sep=';')
 def load_partial(job_id):
    """Load the job's existing partial CSV files.
    Returns ``(filtered_frames, raw_frames)``; each list holds at most one
    DataFrame and is empty when the file is missing or cannot be read (a read
    error is logged, not raised).
    """
    loaded = {'filtered': [], 'raw': []}
    for suffix, bucket in loaded.items():
        path = get_partial_path(job_id, suffix)
        if not os.path.exists(path):
            continue
        try:
            frame = pd.read_csv(path, sep=';', encoding='utf-8-sig')
            bucket.append(frame)
            print(f"📂 Partial {suffix}: {len(frame)} Zeilen geladen")
        except Exception as e:
            print(f"⚠️ Partial {suffix} Ladefehler: {e}")
    return loaded['filtered'], loaded['raw']
 def cleanup_progress(job_id):
    """Delete the job's progress file and both partial CSVs, where present."""
    leftovers = [get_progress_path(job_id)]
    leftovers += [get_partial_path(job_id, suffix) for suffix in ('filtered', 'raw')]
    for leftover in leftovers:
        if os.path.exists(leftover):
            os.remove(leftover)
 # ──────────────────────────────────────────────
 # CSV Nachbearbeitung
 # ──────────────────────────────────────────────
 def process_result_csv(raw_bytes, input_df, apply_filter=True):
    """Turn a downloaded scraper CSV into a cleaned DataFrame.
    The bytes are decoded as UTF-8 (invalid bytes replaced), reduced to the
    OUTPUT_COLS that are present and passed through fix_encoding. With
    ``apply_filter`` only rows whose address matches an address of
    ``input_df`` are kept. Duplicates on (title, address) and rows without a
    non-blank title are dropped.
    Returns the resulting DataFrame, or None when any step raises (the error
    is logged).
    """
    try:
        decoded = raw_bytes.decode('utf-8', errors='replace')
        frame = pd.read_csv(StringIO(decoded))
        print(f"📄 Raw result: {frame.shape}")
        frame = frame[[name for name in OUTPUT_COLS if name in frame.columns]]
        for name in frame.columns:
            frame[name] = frame[name].apply(fix_encoding)
        if apply_filter:
            known_addresses = build_input_addresses(input_df)
            rows_before = len(frame)
            keep_mask = frame['address'].apply(lambda value: address_in_input(value, known_addresses))
            frame = frame[keep_mask]
            print(f"📍 Filter: {rows_before} → {len(frame)}")
        frame = frame.drop_duplicates(subset=['title', 'address'], keep='first')
        frame = frame.dropna(subset=['title'], how='all')
        has_title = frame['title'].str.strip().astype(bool)
        frame = frame[has_title]
        label = 'gefiltert' if apply_filter else 'alle'
        print(f"✅ Final ({label}): {frame.shape}")
        return frame
    except Exception as e:
        print(f"💥 process_result_csv: {e}")
        return None
 # ──────────────────────────────────────────────
 # HAUPT-WORKER
 # ──────────────────────────────────────────────
 def process_file(filename, job_id, app):
    """Run one scraping job for an uploaded CSV and store the merged results.
    Works inside ``app.app_context()`` and reports progress via ``job.status``:
    1. Read UPLOAD_FOLDER/filename (';'-separated, ISO-8859-1) and build one
       "Firmen <PLZ> <Stadt> <Straße> <Hausnummer> <Zusatz>" query per row.
    2. Split the queries into batches of BATCH_SIZE; when a progress file
       exists for ``job_id``, continue after the last recorded batch and
       reload the partial results.
    3. Per batch: pick a scraper from SCRAPER_URLS in rotation, create a
       scraper job, poll it, download and post-process the CSV. Up to
       MAX_RETRIES attempts are made; a job stuck on 'pending' is deleted and
       its scraper restarted. Progress is saved after every batch.
    4. Merge all batch results, write the filtered and the unfiltered CSV to
       RESULT_FOLDER and store their file names on the job.
    Args:
        filename: Name of the uploaded CSV inside UPLOAD_FOLDER.
        job_id: Primary key of the Job row to update.
        app: Flask application used to open the application context.
    Returns:
        None. Errors are caught, logged and written to ``job.status``.
    """
    print(f"🎯 {filename} Job#{job_id} START!")
    with app.app_context():
        job = Job.query.get(job_id)
        if not job:
            print("❌ Job missing")
            return
        try:
            # Parse the upload and build all queries
            job.status = "📊 parsing CSV"
            db.session.commit()
            filepath = os.path.join(UPLOAD_FOLDER, filename)
            print(f"📁 {filepath} | {os.path.getsize(filepath)}b")
            df_input = pd.read_csv(filepath, sep=';', encoding='ISO-8859-1')
            print(f"📊 {df_input.shape}")
            queries = []
            for _, row in df_input.iterrows():
                parts = [
                    str(row.get('PLZ', '')).strip(),
                    str(row.get('Stadt', '')).strip(),
                    str(row.get('Straße', '')).strip(),
                    str(row.get('Hausnummer', '')).strip(),
                    str(row.get('Zusatz', '')).strip(),
                ]
                # 'nan' is the string form of a missing cell
                q = f"Firmen {' '.join(p for p in parts if p and p != 'nan')}".strip()
                q = clean_query(q)  # strip control characters
                if len(q) > 10:
                    queries.append(q)
            total_queries = len(queries)
            print(f"🔍 {total_queries} Queries | Samples: {queries[:3]}")
            if total_queries == 0:
                raise ValueError("Keine gültigen Adressen")
            # Batched processing
            batches = (total_queries + BATCH_SIZE - 1) // BATCH_SIZE
            # Resume: load saved progress if there is any
            os.makedirs(RESULT_FOLDER, exist_ok=True)
            progress = load_progress(job_id)
            start_batch = progress['last_completed_batch'] + 1 if progress else 0
            all_results_filtered, all_results_raw = load_partial(job_id) if progress else ([], [])
            eta_initial = format_eta((batches - start_batch) * ((BATCH_DELAY_MAX + MAX_TIME) / 2))
            print(f"📦 {batches} Batches à {BATCH_SIZE} | 2x Scraper | Start: {start_batch} | ETA: ~{eta_initial}")
            job_start_time = time.time()
            job.status = f"🔄 Batch {start_batch+1}/{batches} | ⏱️ ~{eta_initial}"
            db.session.commit()
            for batch_idx in range(start_batch, batches):
                batch_start = batch_idx * BATCH_SIZE
                batch_end = min(batch_start + BATCH_SIZE, total_queries)
                batch_queries = queries[batch_start:batch_end]
                # Alternate between the configured scrapers
                scraper_url = SCRAPER_URLS[batch_idx % len(SCRAPER_URLS)]
                print(f"\n🔄 BATCH {batch_idx+1}/{batches} ({batch_start+1}-{batch_end}/{total_queries}) → {scraper_url}")
                # Random delay before the batch
                delay = random.uniform(BATCH_DELAY_MIN, BATCH_DELAY_MAX)
                print(f"😴 Delay: {delay:.0f}s")
                time.sleep(delay)
                # Scraper job definition
                payload = {
                    "name": f"{filename.replace('.csv','')}-{job_id}-B{batch_idx+1:03d}",
                    "keywords": batch_queries,
                    "lang": "de",
                    "depth": 1,
                    "zoom": 15,
                    "radius": 50,
                    "max_time": MAX_TIME,
                    "fast_mode": False,
                    "proxies": [PROXY_URL]
                }
                batch_success = False
                # Retry loop for scraper errors
                for attempt in range(1, MAX_RETRIES + 1):
                    try:
                        resp = requests.post(f"{scraper_url}/api/v1/jobs", json=payload, timeout=45)
                        print(f"📤 {resp.status_code} (Versuch {attempt} | {scraper_url})")
                        if is_blocked(resp.text):
                            # A block is not retried; the batch is skipped.
                            print("🚫 Batch übersprungen (blocked)")
                            break
                        if resp.status_code != 201:
                            print(f"⚠️ Batch {batch_idx+1} fehlgeschlagen: {resp.text[:100]}")
                            if attempt < MAX_RETRIES:
                                time.sleep(10)
                            continue
                        scraper_id = resp.json()['id']
                        print(f"✅ Scraper: {scraper_id}")
                        stuck_counter = 0
                        for poll_i in range(1, POLL_MAX + 1):
                            r = requests.get(f"{scraper_url}/api/v1/jobs/{scraper_id}", timeout=15)
                            data = r.json()
                            status = data.get('Status', data.get('status', '?'))
                            print(f"⏳ Poll {poll_i}: {status}")
                            # Auto-recovery when the job stays on 'pending'
                            if status == 'pending':
                                stuck_counter += 1
                                if stuck_counter >= STUCK_THRESHOLD:
                                    print(f"⚠️ Job {scraper_id} hängt – abbrechen + Neustart")
                                    requests.delete(f"{scraper_url}/api/v1/jobs/{scraper_id}", timeout=10)
                                    restart_scraper(scraper_url)  # restart only the affected scraper
                                    break
                            else:
                                stuck_counter = 0
                            if status in ('ok', 'completed', 'scraped'):
                                dl = requests.get(f"{scraper_url}/api/v1/jobs/{scraper_id}/download", timeout=90)
                                if dl.status_code == 200:
                                    df_filtered = process_result_csv(dl.content, df_input, True)
                                    df_raw = process_result_csv(dl.content, df_input, False)
                                    if df_filtered is not None:
                                        all_results_filtered.append(df_filtered)
                                        all_results_raw.append(df_raw)
                                        append_partial(job_id, df_filtered, df_raw)  # persist right away for resume
                                        print(f"📊 Batch {batch_idx+1}: {len(df_filtered)} filtered")
                                batch_success = True
                                break
                            # Scraper reported an error → next attempt
                            elif status in ('failed', 'error'):
                                print(f"💥 Batch {batch_idx+1}: {status} (Versuch {attempt})")
                                if attempt < MAX_RETRIES:
                                    time.sleep(10)
                                break
                            time.sleep(random.uniform(POLL_DELAY_MIN, POLL_DELAY_MAX))
                        if batch_success:
                            break
                    except Exception as e:
                        print(f"💥 Batch {batch_idx+1} Versuch {attempt}: {e}")
                        if attempt < MAX_RETRIES:
                            time.sleep(10)
                # Resume: record the batch as done (also when it produced nothing)
                save_progress(job_id, batch_idx, batches)
                # ETA from the average duration of the batches run so far;
                # done_so_far is always >= 1 here.
                elapsed = time.time() - job_start_time
                done_so_far = batch_idx - start_batch + 1
                avg_per_batch = elapsed / done_so_far
                remaining = (batches - batch_idx - 1) * avg_per_batch
                eta_str = format_eta(remaining)
                # Clamp so the last batch does not announce "Batch N+1/N".
                job.status = f"🔄 Batch {min(batch_idx+2, batches)}/{batches} | ⏱️ ~{eta_str}"
                db.session.commit()
            # Merge & save
            job.status = "🔧 merging results"
            db.session.commit()
            base = filename.replace('.csv', '')
            if all_results_filtered:
                df_final_filtered = pd.concat(all_results_filtered, ignore_index=True)
                df_final_filtered = df_final_filtered.drop_duplicates(subset=['title', 'address'])
                out_filtered = f"results_{base}_filtered.csv"
                df_final_filtered.to_csv(
                    os.path.join(RESULT_FOLDER, out_filtered),
                    index=False, encoding='utf-8-sig', sep=';'
                )
                # Stays None when only filtered results exist (e.g. the raw
                # partial could not be reloaded on resume).
                out_raw = None
                if all_results_raw:
                    df_final_raw = pd.concat(all_results_raw, ignore_index=True)
                    out_raw = f"results_{base}_all.csv"
                    df_final_raw.to_csv(
                        os.path.join(RESULT_FOLDER, out_raw),
                        index=False, encoding='utf-8-sig', sep=';'
                    )
                job.result_filename = out_filtered
                job.result_filename_raw = out_raw
                job.status = f"✅ Fertig: {len(df_final_filtered)} Firmen"
                # Resume: remove progress and partial files after completion
                cleanup_progress(job_id)
            else:
                job.status = "❌ Keine Ergebnisse"
            db.session.commit()
            print(f"🎉 Job {job_id} komplett!")
        except Exception as e:
            job.status = f"Failed: {str(e)[:50]}"
            print(f"💥 FATAL: {e}")
            import traceback
            traceback.print_exc()
            db.session.commit()
        print(f"✅ DONE! Status: {job.status}")
--- a/app/webcrawler.orig
+++ b/app/webcrawler.orig
@ -0,0 +1,138 @@
 import csv
 import os
 import requests
 from .models import db, Job
 from flask import current_app
 # Folders for the uploaded input CSVs and the generated result CSVs (relative paths).
 UPLOAD_FOLDER = 'uploads'
 RESULT_FOLDER = 'results'
 # Key sent as the 'key' parameter of every Google Maps API request below.
 # NOTE(review): the key is hard-coded in source — consider loading it from configuration.
 API_KEY = 'AIzaSyAIf0yXJTwo87VMWLBtq2m2LqE-OaPGbzw'
 # Company names already written to a result; module-level, so it is shared by
 # every process_file call in this process.
 processed_companies = set()
 def get_geocode(address):
    """Look up the coordinates of ``address`` with the Google Geocoding API.
    Returns ``(lat, lng)`` of the first result, or ``(None, None)`` when the
    HTTP status is not 200, the API status is not 'OK', or the request fails
    (request errors are logged).
    """
    endpoint = "https://maps.googleapis.com/maps/api/geocode/json"
    query = dict(address=address, key=API_KEY)
    try:
        reply = requests.get(endpoint, params=query, timeout=5)
        if reply.status_code == 200:
            body = reply.json()
            if body['status'] == 'OK':
                point = body['results'][0]['geometry']['location']
                return point['lat'], point['lng']
    except requests.RequestException as e:
        print(f"Geocode API Fehler für {address}: {e}")
    return None, None
 def get_nearby_places(lat, lng):
    """Query the Google Places Nearby Search around ``lat``/``lng``.
    Searches for 'point_of_interest' entries with radius 10 and returns the
    'results' list of the response, or an empty list when the HTTP status is
    not 200 or the request fails (request errors are logged).
    """
    endpoint = "https://maps.googleapis.com/maps/api/place/nearbysearch/json"
    query = dict(
        location=f"{lat},{lng}",
        radius=10,
        type='point_of_interest',
        key=API_KEY,
    )
    try:
        reply = requests.get(endpoint, params=query, timeout=5)
        if reply.status_code == 200:
            return reply.json().get('results', [])
    except requests.RequestException as e:
        print(f"Nearby Places API Fehler für Standort {lat},{lng}: {e}")
    return []
 def get_place_details(place_id):
    """Fetch phone number and website of a place from the Place Details API.
    Returns ``(phone, website)``; each value falls back to 'N/A' when the
    field is missing, the HTTP status is not 200, or the request fails
    (request errors are logged).
    """
    endpoint = "https://maps.googleapis.com/maps/api/place/details/json"
    query = dict(
        place_id=place_id,
        fields='formatted_phone_number,website',
        key=API_KEY,
    )
    try:
        reply = requests.get(endpoint, params=query, timeout=5)
        if reply.status_code == 200:
            details = reply.json().get('result', {})
            phone = details.get('formatted_phone_number', 'N/A')
            website = details.get('website', 'N/A')
            return phone, website
    except requests.RequestException as e:
        print(f"Place Details API Fehler für Place ID {place_id}: {e}")
    return 'N/A', 'N/A'
 def process_file(filename, job_id, app):
    """Geocode every address of an uploaded CSV and collect nearby companies.
    Inside ``app.app_context()`` the job is set to "In Progress", the file
    UPLOAD_FOLDER/filename (';'-separated, ISO-8859-1) is read row by row,
    each complete address is geocoded and the places found nearby are written
    to RESULT_FOLDER/results_<name>.csv. The job ends as "Completed" (with
    ``result_filename`` set) or "Failed".
    Compared to a plain read loop this version tolerates an empty file (no
    header), short rows (missing cells arrive as None) and a missing result
    folder, and marks the job "Failed" when an unexpected error occurs instead
    of leaving it at "In Progress".
    Args:
        filename: Name of the uploaded CSV inside UPLOAD_FOLDER.
        job_id: Primary key of the Job row to update.
        app: Flask application used to open the application context.
    """
    with app.app_context():
        filepath = os.path.join(UPLOAD_FOLDER, filename)
        results = []
        job = Job.query.get(job_id)
        if not job:
            print("Job wurde abgebrochen.")
            return
        job.status = "In Progress"
        db.session.commit()
        try:
            with open(filepath, newline='', encoding='ISO-8859-1') as csvfile:
                reader = csv.DictReader(csvfile, delimiter=';')
                # fieldnames is None when the file is empty
                headers = reader.fieldnames or []
                if not all(field in headers for field in ['PLZ', 'Straße', 'Hausnummer']):
                    print("CSV-Datei enthält nicht alle notwendigen Spalten.")
                    job.status = "Failed"
                    db.session.commit()
                    return
                for row in reader:
                    # DictReader fills cells missing from a short row with None
                    plz = (row.get('PLZ') or '').strip()
                    city = (row.get('Stadt', row.get('Bezirk', '')) or '').strip()
                    street = (row.get('Straße') or '').strip()
                    house_number = (row.get('Hausnummer') or '').strip()
                    additional = (row.get('Zusatz') or '').strip()
                    if not all([plz, city, street, house_number]):
                        continue
                    full_address = f"{street} {house_number} {additional}, {plz} {city}"
                    lat, lng = get_geocode(full_address)
                    if lat is None or lng is None:
                        continue
                    nearby_places = get_nearby_places(lat, lng)
                    for place in nearby_places:
                        company_name = place.get('name')
                        # Skip entries without a name and names already emitted
                        if not company_name or company_name in processed_companies:
                            continue
                        processed_companies.add(company_name)
                        company_address = place.get('vicinity', 'N/A').split(',')[0]
                        place_id = place.get('place_id')
                        company_phone, company_website = get_place_details(place_id) if place_id else ('N/A', 'N/A')
                        results.append({
                            'PLZ': plz,
                            'Stadt': city,
                            'Straße': street,
                            'Hausnummer': house_number,
                            'Zusatz': additional,
                            'Company Name': company_name,
                            'Company Address': company_address,
                            'Company Phone': company_phone,
                            'Company Website': company_website
                        })
            if results:
                result_file = f"results_{os.path.splitext(filename)[0]}.csv"
                os.makedirs(RESULT_FOLDER, exist_ok=True)
                result_path = os.path.join(RESULT_FOLDER, result_file)
                with open(result_path, 'w', newline='', encoding='utf-8-sig') as csvfile:
                    writer = csv.DictWriter(csvfile, fieldnames=[
                        'PLZ', 'Stadt', 'Straße', 'Hausnummer', 'Zusatz',
                        'Company Name', 'Company Address', 'Company Phone', 'Company Website'
                    ])
                    writer.writeheader()
                    writer.writerows(results)
                job.status = "Completed"
                job.result_filename = result_file
                db.session.commit()
            else:
                job.status = "Failed"
                db.session.commit()
        except Exception as e:
            # Do not leave the job at "In Progress" when something unexpected happens
            print(f"Verarbeitung von {filename} fehlgeschlagen: {e}")
            job.status = "Failed"
            db.session.commit()
--- a/app/webcrawler.py
+++ b/app/webcrawler.py
@ -1,138 +1,487 @@
 import csv
 import os
 import re
 import unicodedata
 import json
 import threading
 import pandas as pd
 import requests
-from .models import db, Job
+import time
-from flask import current_app
+import random
 from io import StringIO
 from concurrent.futures import ThreadPoolExecutor, as_completed
 from app.models import db, Job
-UPLOAD_FOLDER = 'uploads'
+print("🆕 MODERN webcrawler LOADED! – BATCHED + PROXY + RESUME + ETA + 4x SCRAPER CHUNK-PARALLEL")
 RESULT_FOLDER = 'results'
-API_KEY = 'AIzaSyAIf0yXJTwo87VMWLBtq2m2LqE-OaPGbzw'
+UPLOAD_FOLDER = '/app/uploads'
 RESULT_FOLDER = '/app/results'
-processed_companies = set()
+SCRAPER_URLS = [
    "http://gmaps-scraper-1:8080",
    "http://gmaps-scraper-2:8080",
    "http://gmaps-scraper-3:8080",
    "http://gmaps-scraper-4:8080",
 ]
-def get_geocode(address):
+OUTPUT_COLS = ['title', 'category', 'address', 'open_hours', 'website', 'phone', 'link']
    url = f"https://maps.googleapis.com/maps/api/geocode/json"
    params = {'address': address, 'key': API_KEY}
 PROXY_URL = "http://bitlleuv-rotate:s5hzse6hz74b@p.webshare.io:80"
 API_PROXIES = {"http": PROXY_URL, "https": PROXY_URL}
 _job_semaphore = threading.Semaphore(1)
 # ──────────────────────────────────────────────
 # Tuning
 # ──────────────────────────────────────────────
 BATCH_SIZE        = 30    # keywords per scraper job
 BATCH_DELAY_MIN   = 3     # seconds to pause between chunks (min)
 BATCH_DELAY_MAX   = 6     # seconds to pause between chunks (max)
 MAX_TIME          = 60    # seconds the scraper gets per batch (sent as "max_time")
 POLL_MAX          = 90    # max. poll attempts per batch
 POLL_DELAY_MIN    = 2     # seconds between polls (min)
 POLL_DELAY_MAX    = 5     # seconds between polls (max)
 STUCK_TIMEOUT     = 300   # seconds in 'pending' before the scraper is restarted (5 min)
 MAX_RETRIES       = 2     # attempts per batch: process_batch loops range(1, MAX_RETRIES + 1)
 # One worker thread per configured scraper instance.
 PARALLEL_WORKERS  = len(SCRAPER_URLS)
 # Held by append_partial while it appends to the partial result CSVs.
 _partial_lock = threading.Lock()
 # ──────────────────────────────────────────────
 # Hilfsfunktionen
 # ──────────────────────────────────────────────
 def is_blocked(data):
    """Return True when a response looks like a block or a rate limit.

    ``data`` is converted with ``str()`` and lower-cased, then searched for
    the marker phrases 'captcha', 'blocked', 'rate limit' and 'too many'.
    The HTTP status 429 only counts when it stands alone as a number: a
    plain substring test also fired on IDs or other digit/hex runs that
    merely contain "429" (for example "...-4291-..."), which made healthy
    responses look blocked.

    A positive result is logged together with the first 100 characters of
    ``data``.
    """
    haystack = str(data).lower()
    phrases = ('captcha', 'blocked', 'rate limit', 'too many')
    blocked = (
        any(phrase in haystack for phrase in phrases)
        # \b keeps "4291" or "a429f" from being read as the status code.
        or re.search(r'\b429\b', haystack) is not None
    )
    if blocked:
        print(f"🚫 BLOCKED: {str(data)[:100]}")
    return blocked
 def fix_encoding(text):
    if not isinstance(text, str):
        return text
    try:
-        response = requests.get(url, params=params, timeout=5)
+        return text.encode('latin-1').decode('utf-8')
-        if response.status_code == 200:
+    except (UnicodeEncodeError, UnicodeDecodeError):
-            data = response.json()
+        return text
-            if data['status'] == 'OK':
+
-                location = data['results'][0]['geometry']['location']
+def clean_query(q):
-                return location['lat'], location['lng']
+    q = ''.join(c for c in q if unicodedata.category(c) != 'Cc')
-    except requests.RequestException as e:
+    q = ' '.join(q.split())
-        print(f"Geocode API Fehler für {address}: {e}")
+    return q.strip()
 def build_input_addresses(df):
    """Collect one normalized address string per row of the input table.

    For every row the columns 'Straße', 'Hausnummer', 'Zusatz', 'PLZ' and
    'Stadt' (missing columns count as empty) are stringified, stripped and
    joined in that order, lower-cased, and runs of whitespace are collapsed
    to single spaces. Returns the resulting strings as a set.
    """
    column_order = ('Straße', 'Hausnummer', 'Zusatz', 'PLZ', 'Stadt')
    normalized = set()
    for _, record in df.iterrows():
        cells = [str(record.get(column, '')).strip() for column in column_order]
        # split()/join both trims the ends and squeezes the gaps left by empty cells.
        normalized.add(' '.join(' '.join(cells).lower().split()))
    return normalized
 def normalize_address(addr):
    """Return ``addr`` encoding-repaired, lower-cased and whitespace-collapsed.

    Anything that is not a ``str`` (for example a NaN cell) yields ''.
    """
    if isinstance(addr, str):
        repaired = fix_encoding(addr)
        tokens = repaired.lower().strip().split()
        return ' '.join(tokens)
    return ''
 def address_in_input(result_addr, input_addresses):
    """Tell whether a scraped address belongs to one of the input addresses.

    ``result_addr`` is normalized once. An input address matches when all of
    the following hold:

    * it contains a standalone 5-digit number (the postal code) and that
      number occurs in the normalized result address,
    * its first token has at least 4 characters and the first 5 characters
      of that token occur in the result address,
    * its second token, if there is one, occurs in the result address as a
      whole word.

    Returns True on the first matching input address, otherwise False.
    """
    haystack = normalize_address(result_addr)

    def matches(candidate):
        zip_hit = re.search(r'\b\d{5}\b', candidate)
        if zip_hit is None or zip_hit.group() not in haystack:
            return False
        tokens = candidate.split()
        first_token = tokens[0] if tokens else ''
        if len(first_token) < 4 or first_token[:5].lower() not in haystack:
            return False
        second_token = tokens[1] if len(tokens) > 1 else ''
        if not second_token:
            return True
        return re.search(rf'\b{re.escape(second_token)}\b', haystack) is not None

    return any(matches(candidate) for candidate in input_addresses)
 def format_eta(seconds):
    """Render a duration in seconds as a short human-readable string.

    Below one minute the result is "<n>s", below one hour "<m>min", and from
    one hour on "<h>h <mm>min" with zero-padded minutes. Fractions of a
    second and leftover seconds are truncated.
    """
    whole = int(seconds)
    if seconds < 60:
        return f"{whole}s"
    hours, leftover = divmod(whole, 3600)
    minutes = leftover // 60
    if hours > 0:
        return f"{hours}h {minutes:02d}min"
    return f"{minutes}min"
 # ──────────────────────────────────────────────
 # Scraper-Job Cleanup
 # ──────────────────────────────────────────────
 def _cleanup_scraper_job(scraper_url, scraper_id):
    """Delete a job on a scraper instance once it is no longer needed.

    Sends ``DELETE {scraper_url}/api/v1/jobs/{scraper_id}`` with a 10 second
    timeout. This is best effort: nothing is raised, and both request errors
    and error statuses are only logged, so a failed cleanup never aborts the
    batch that triggered it.
    """
    try:
        resp = requests.delete(f"{scraper_url}/api/v1/jobs/{scraper_id}", timeout=10)
        if resp.ok:
            print(f"🗑️ Scraper-Job {scraper_id} gelöscht")
        else:
            # Any response used to be reported as a successful delete, even 404/500.
            print(f"⚠️ Cleanup fehlgeschlagen: HTTP {resp.status_code}")
    except Exception as e:
        print(f"⚠️ Cleanup fehlgeschlagen: {e}")
 # ──────────────────────────────────────────────
 # Scraper-Neustart via Docker SDK
 # ──────────────────────────────────────────────
 def restart_scraper(scraper_url):
    """Restart the container that serves ``scraper_url`` through the Docker SDK.

    The container name is taken from the URL: the text between "//" and the
    first ":" after it. After a successful restart the function sleeps 15
    seconds before returning True. Any exception (including a missing
    ``docker`` package, since the import happens inside the try) is logged
    and turned into False.
    """
    try:
        import docker
        host_and_port = scraper_url.split("//")[1]
        container_name = host_and_port.split(":")[0]
        print(f"🔄 Starte {container_name} neu...")
        docker_client = docker.from_env()
        target = docker_client.containers.get(container_name)
        target.restart()
        print(f"✅ {container_name} neu gestartet – warte 15s...")
        time.sleep(15)
    except Exception as e:
        print(f"⚠️ Scraper-Neustart fehlgeschlagen: {e}")
        return False
    return True
 # ──────────────────────────────────────────────
 # Resume: Progress-File Hilfsfunktionen
 # ──────────────────────────────────────────────
 def get_progress_path(job_id):
    """Return the path of the JSON resume file of ``job_id`` in RESULT_FOLDER."""
    progress_name = f"progress_{job_id}.json"
    return os.path.join(RESULT_FOLDER, progress_name)
 def get_partial_path(job_id, suffix):
    """Return the path of the partial result CSV of ``job_id`` for ``suffix``.

    Callers in this file pass 'filtered' or 'raw' as ``suffix``.
    """
    partial_name = f"partial_{job_id}_{suffix}.csv"
    return os.path.join(RESULT_FOLDER, partial_name)
 def load_progress(job_id):
    """Load the saved resume state of ``job_id``.

    Returns the parsed JSON object from the progress file, or None when no
    progress file exists. A found state is announced with a RESUME log line
    built from its 'last_completed_batch' and 'total_batches' entries.
    """
    progress_file = get_progress_path(job_id)
    if not os.path.exists(progress_file):
        return None
    with open(progress_file, 'r') as handle:
        state = json.load(handle)
    print(f"🔁 RESUME: ab Batch {state['last_completed_batch'] + 1}/{state['total_batches']}")
    return state
 def save_progress(job_id, last_completed_batch, total_batches):
    """Persist the resume state of ``job_id`` to its progress file.

    Writes a JSON object with the keys 'last_completed_batch' and
    'total_batches'. The data is written to a temporary sibling file first
    and then moved into place with ``os.replace``, so a crash or restart in
    the middle of the write cannot leave a truncated progress file behind
    that the next resume would fail to parse.
    """
    path = get_progress_path(job_id)
    tmp_path = f"{path}.tmp"
    with open(tmp_path, 'w') as f:
        json.dump({"last_completed_batch": last_completed_batch, "total_batches": total_batches}, f)
    # Swap in the complete file; readers see either the old or the new state.
    os.replace(tmp_path, path)
 def append_partial(job_id, df_filtered, df_raw):
    """Append one batch's results to the partial CSV files of ``job_id``.

    ``df_filtered`` goes to the 'filtered' file and ``df_raw`` to the 'raw'
    file; a frame that is None is skipped. The header row is written only
    when the target file does not exist yet. The whole operation runs under
    ``_partial_lock``.
    """
    frames_by_kind = (('filtered', df_filtered), ('raw', df_raw))
    with _partial_lock:
        for kind, frame in frames_by_kind:
            if frame is not None:
                target = get_partial_path(job_id, kind)
                needs_header = not os.path.exists(target)
                frame.to_csv(target, mode='a', index=False, header=needs_header, encoding='utf-8-sig', sep=';')
 def load_partial(job_id):
    """Read back the partial result CSVs written for ``job_id``.

    Returns ``(filtered_frames, raw_frames)``, two lists that each hold the
    loaded DataFrame of the matching partial file, or stay empty when that
    file is missing or cannot be read. Read errors are logged, not raised.
    """
    loaded = {'filtered': [], 'raw': []}
    for kind, frames in loaded.items():
        csv_path = get_partial_path(job_id, kind)
        if not os.path.exists(csv_path):
            continue
        try:
            frame = pd.read_csv(csv_path, sep=';', encoding='utf-8-sig')
            frames.append(frame)
            print(f"📂 Partial {kind}: {len(frame)} Zeilen geladen")
        except Exception as e:
            print(f"⚠️ Partial {kind} Ladefehler: {e}")
    return loaded['filtered'], loaded['raw']
 def cleanup_progress(job_id):
    """Remove the progress file and both partial CSVs of ``job_id`` if present."""
    leftovers = (
        get_progress_path(job_id),
        get_partial_path(job_id, 'filtered'),
        get_partial_path(job_id, 'raw'),
    )
    for leftover in leftovers:
        if not os.path.exists(leftover):
            continue
        os.remove(leftover)
 # ──────────────────────────────────────────────
 # CSV Nachbearbeitung
 # ──────────────────────────────────────────────
 def process_result_csv(raw_bytes, input_df, apply_filter=True):
    """Turn a downloaded scraper CSV into a cleaned DataFrame.

    Steps, in order:

    1. decode ``raw_bytes`` as UTF-8 (undecodable bytes are replaced) and
       parse the text as CSV,
    2. keep only the columns listed in OUTPUT_COLS that are present,
    3. run ``fix_encoding`` over every cell,
    4. if ``apply_filter`` is true, keep only rows whose 'address' matches
       an address built from ``input_df``,
    5. drop duplicate (title, address) pairs, keeping the first,
    6. drop rows whose title is missing or blank.

    Returns the resulting DataFrame. Any exception along the way is logged
    and turned into a return value of None.
    """
    try:
        decoded = raw_bytes.decode('utf-8', errors='replace')
        table = pd.read_csv(StringIO(decoded))
        print(f"📄 Raw result: {table.shape}")
        wanted = [name for name in OUTPUT_COLS if name in table.columns]
        table = table[wanted]
        for name in table.columns:
            table[name] = table[name].apply(fix_encoding)
        if apply_filter:
            known_addresses = build_input_addresses(input_df)
            rows_before = len(table)
            keep = table['address'].apply(lambda a: address_in_input(a, known_addresses))
            table = table[keep]
            print(f"📍 Filter: {rows_before} → {len(table)}")
        table = table.drop_duplicates(subset=['title', 'address'], keep='first')
        table = table.dropna(subset=['title'], how='all')
        has_title = table['title'].str.strip().astype(bool)
        table = table[has_title]
        label = 'gefiltert' if apply_filter else 'alle'
        print(f"✅ Final ({label}): {table.shape}")
        return table
    except Exception as e:
        print(f"💥 process_result_csv: {e}")
        return None
 # ──────────────────────────────────────────────
 # Parallel: Einzelnen Batch verarbeiten
 # ──────────────────────────────────────────────
 def process_batch(batch_idx, batch_queries, scraper_url, filename, job_id, df_input):
    """Run one batch of queries on one scraper instance and fetch its result.

    Creates a job on ``scraper_url``, polls it until it reaches a terminal
    status, downloads the result CSV and post-processes it twice (filtered
    and unfiltered). Up to MAX_RETRIES attempts are made.

    Args:
        batch_idx: Zero-based batch index; shown as ``batch_idx + 1`` in the
            job name and in log lines.
        batch_queries: Query strings sent to the scraper as "keywords".
        scraper_url: Base URL of the scraper instance to use.
        filename: Name of the uploaded file; used (without '.csv') in the
            scraper job name.
        job_id: ID of the application job; used in the scraper job name.
        df_input: Input table handed to ``process_result_csv``.

    Returns:
        ``(df_filtered, df_raw)`` after a successful download; either element
        is None if ``process_result_csv`` failed for it. ``(None, None)``
        when the response looks blocked, the download does not return 200,
        or all attempts are used up.
    """
    payload = {
        "name": f"{filename.replace('.csv','')}-{job_id}-B{batch_idx+1:03d}",
        "keywords": batch_queries,
        "lang": "de",
        "depth": 1,
        "zoom": 17,
        "radius": 100,
        "max_time": MAX_TIME,
        "fast_mode": False,
        "proxies": [PROXY_URL]
    }
    for attempt in range(1, MAX_RETRIES + 1):
        # Tracks the scraper job that still needs cleanup if this attempt fails.
        scraper_id = None
        try:
            resp = requests.post(f"{scraper_url}/api/v1/jobs", json=payload, timeout=45)
            print(f"📤 Batch {batch_idx+1} → {scraper_url} | {resp.status_code} (Versuch {attempt})")
            # A blocked response ends the batch immediately; no retry is made.
            if is_blocked(resp.text):
                print(f"🚫 Batch {batch_idx+1} blocked")
                return None, None
            # Anything but 201 counts as a failed job creation: wait, then retry.
            if resp.status_code != 201:
                print(f"⚠️ Batch {batch_idx+1} fehlgeschlagen: {resp.text[:100]}")
                if attempt < MAX_RETRIES:
                    time.sleep(10)
                continue
            scraper_id = resp.json()['id']
            print(f"✅ Batch {batch_idx+1} Scraper-ID: {scraper_id}")
            batch_start_time = time.time()
            # Poll the job. If POLL_MAX polls pass without a terminal status the
            # loop simply ends and the next attempt starts; the scraper job is
            # not cleaned up on that path.
            for poll_i in range(1, POLL_MAX + 1):
                r = requests.get(f"{scraper_url}/api/v1/jobs/{scraper_id}", timeout=15)
                data = r.json()
                # The status is read from 'Status' first, then from 'status'.
                status = data.get('Status', data.get('status', '?'))
                elapsed = time.time() - batch_start_time
                print(f"⏳ Batch {batch_idx+1} Poll {poll_i}: {status} | {int(elapsed)}s")
                # Still pending after STUCK_TIMEOUT: delete the job, restart the
                # scraper container and move on to the next attempt.
                if status == 'pending' and elapsed > STUCK_TIMEOUT:
                    print(f"⚠️ Batch {batch_idx+1} hängt seit {int(elapsed)}s – Neustart {scraper_url}")
                    _cleanup_scraper_job(scraper_url, scraper_id)
                    scraper_id = None
                    restart_scraper(scraper_url)
                    break
                if status in ('ok', 'completed', 'scraped'):
                    dl = requests.get(f"{scraper_url}/api/v1/jobs/{scraper_id}/download", timeout=90)
                    # Clear the id so the except handler below will not delete the
                    # job; the finished job is not deleted on this path.
                    scraper_id = None 
                    if dl.status_code == 200:
                        # The same CSV is processed twice: with and without the address filter.
                        df_filtered = process_result_csv(dl.content, df_input, True)
                        df_raw      = process_result_csv(dl.content, df_input, False)
                        print(f"📊 Batch {batch_idx+1}: {len(df_filtered) if df_filtered is not None else 0} filtered")
                        return df_filtered, df_raw
                    return None, None
                elif status in ('failed', 'error'):
                    print(f"💥 Batch {batch_idx+1}: {status} (Versuch {attempt})")
                    _cleanup_scraper_job(scraper_url, scraper_id)
                    scraper_id = None
                    if attempt < MAX_RETRIES:
                        time.sleep(10)
                    break
                # Randomized pause between two polls.
                time.sleep(random.uniform(POLL_DELAY_MIN, POLL_DELAY_MAX))
        except Exception as e:
            print(f"💥 Batch {batch_idx+1} Versuch {attempt}: {e}")
            # Delete a job that was created before the error happened.
            if scraper_id:
                _cleanup_scraper_job(scraper_url, scraper_id)
                scraper_id = None
            if attempt < MAX_RETRIES:
                time.sleep(10)
    # All attempts used up without a downloadable result.
    return None, None
-def get_nearby_places(lat, lng):
+# ──────────────────────────────────────────────
-    places_url = f"https://maps.googleapis.com/maps/api/place/nearbysearch/json"
+# HAUPT-WORKER
-    params = {
+# ──────────────────────────────────────────────
        'location': f"{lat},{lng}",
        'radius': 10,
        'type': 'point_of_interest',
        'key': API_KEY
    }
    try:
        response = requests.get(places_url, params=params, timeout=5)
        if response.status_code == 200:
            return response.json().get('results', [])
    except requests.RequestException as e:
        print(f"Nearby Places API Fehler für Standort {lat},{lng}: {e}")
    return []
 def get_place_details(place_id):
    """Look up phone number and website of a place via the Place Details API.

    Returns ``(phone, website)``. Each value falls back to 'N/A' when the
    field is missing; ``('N/A', 'N/A')`` is returned when the response
    status is not 200 or the request raises ``requests.RequestException``
    (which is logged).
    """
    fallback = ('N/A', 'N/A')
    endpoint = "https://maps.googleapis.com/maps/api/place/details/json"
    query = {
        'place_id': place_id,
        'fields': 'formatted_phone_number,website',
        'key': API_KEY
    }
    try:
        reply = requests.get(endpoint, params=query, timeout=5)
        if reply.status_code != 200:
            return fallback
        details = reply.json().get('result', {})
        phone = details.get('formatted_phone_number', 'N/A')
        website = details.get('website', 'N/A')
        return phone, website
    except requests.RequestException as e:
        print(f"Place Details API Fehler für Place ID {place_id}: {e}")
    return fallback
 def process_file(filename, job_id, app):
    with app.app_context():
        filepath = os.path.join(UPLOAD_FOLDER, filename)
        results = []
        job = Job.query.get(job_id)
-        if not job:
+        if job:
-            print("Job wurde abgebrochen.")
+            job.status = "⏳ Wartet auf anderen Job..."
-            return
+            db.session.commit()
        job.status = "In Progress"
        db.session.commit()
-        with open(filepath, newline='', encoding='ISO-8859-1') as csvfile:
+    with _job_semaphore:
-            reader = csv.DictReader(csvfile, delimiter=';')
+        print(f"🎯 {filename} Job#{job_id} START!")
            headers = reader.fieldnames
-            if not all(field in headers for field in ['PLZ', 'Straße', 'Hausnummer']):
+        with app.app_context():
-                print("CSV-Datei enthält nicht alle notwendigen Spalten.")
+            job = Job.query.get(job_id)
-                job.status = "Failed"
+            if not job:
-                db.session.commit()
+                print("❌ Job missing")
                return
-            for row in reader:
+            try:
-                plz = row.get('PLZ', '').strip()
+                job.status = "📊 parsing CSV"
-                city = row.get('Stadt', row.get('Bezirk', '')).strip()
+                db.session.commit()
                street = row.get('Straße', '').strip()
                house_number = row.get('Hausnummer', '').strip()
                additional = row.get('Zusatz', '').strip()
-                if not all([plz, city, street, house_number]):
+                filepath = os.path.join(UPLOAD_FOLDER, filename)
-                    continue
+                print(f"📁 {filepath} | {os.path.getsize(filepath)}b")
-                full_address = f"{street} {house_number} {additional}, {plz} {city}"
+                df_input = pd.read_csv(filepath, sep=';', encoding='ISO-8859-1')
-                lat, lng = get_geocode(full_address)
+                print(f"📊 {df_input.shape}")
                if lat is None or lng is None:
                    continue
-                nearby_places = get_nearby_places(lat, lng)
+                queries = []
-                for place in nearby_places:
+                for _, row in df_input.iterrows():
-                    company_name = place['name']
+                    parts = [
-                    if company_name in processed_companies:
+                        str(row.get('PLZ', '')).strip(),
-                        continue
+                        str(row.get('Stadt', '')).strip(),
                        str(row.get('Straße', '')).strip(),
                        str(row.get('Hausnummer', '')).strip(),
                        str(row.get('Zusatz', '')).strip(),
                    ]
                    q = f"Firmen {' '.join(p for p in parts if p and p != 'nan')}".strip()
                    q = clean_query(q)
                    if len(q) > 10:
                        queries.append(q)
-                    processed_companies.add(company_name)
+                total_queries = len(queries)
-                    company_address = place.get('vicinity', 'N/A').split(',')[0]
+                print(f"🔍 {total_queries} Queries | Samples: {queries[:3]}")
-                    place_id = place.get('place_id')
+                if total_queries == 0:
-                    company_phone, company_website = get_place_details(place_id) if place_id else ('N/A', 'N/A')
+                    raise ValueError("Keine gültigen Adressen")
-                    results.append({
+                batches = (total_queries + BATCH_SIZE - 1) // BATCH_SIZE
                        'PLZ': plz,
                        'Stadt': city,
                        'Straße': street,
                        'Hausnummer': house_number,
                        'Zusatz': additional,
                        'Company Name': company_name,
                        'Company Address': company_address,
                        'Company Phone': company_phone,
                        'Company Website': company_website
                    })
-        if results:
+                os.makedirs(RESULT_FOLDER, exist_ok=True)
-            result_file = f"results_{os.path.splitext(filename)[0]}.csv"
+                progress = load_progress(job_id)
-            result_path = os.path.join(RESULT_FOLDER, result_file)
+                start_batch = progress['last_completed_batch'] + 1 if progress else 0
-            with open(result_path, 'w', newline='', encoding='utf-8-sig') as csvfile:
+                all_results_filtered, all_results_raw = load_partial(job_id) if progress else ([], [])
-                writer = csv.DictWriter(csvfile, fieldnames=[
+
-                    'PLZ', 'Stadt', 'Straße', 'Hausnummer', 'Zusatz',
+                eta_initial = format_eta((batches - start_batch) * ((BATCH_DELAY_MAX + MAX_TIME) / 2) / PARALLEL_WORKERS)
-                    'Company Name', 'Company Address', 'Company Phone', 'Company Website'
+                print(f"📦 {batches} Batches à {BATCH_SIZE} | {PARALLEL_WORKERS}x parallel (Chunk) | Start: {start_batch} | ETA: ~{eta_initial}")
-                ])
+                job_start_time = time.time()
-                writer.writeheader()
+                job.status = f"🔄 Batch {start_batch+1}/{batches} | ⏱️ ~{eta_initial}"
-                writer.writerows(results)
+                db.session.commit()
-            job.status = "Completed"
+
-            job.result_filename = result_file
+                completed_count = 0
-            db.session.commit()
+
-        else:
+                batch_indices = list(range(start_batch, batches))
-            job.status = "Failed"
+                chunks = [
-            db.session.commit()
+                    batch_indices[i:i + PARALLEL_WORKERS]
                    for i in range(0, len(batch_indices), PARALLEL_WORKERS)
                ]
                with ThreadPoolExecutor(max_workers=PARALLEL_WORKERS) as executor:
                    for chunk_idx, chunk in enumerate(chunks):
                        futures = {}
                        for batch_idx in chunk:
                            batch_start_q = batch_idx * BATCH_SIZE
                            batch_end_q   = min(batch_start_q + BATCH_SIZE, total_queries)
                            batch_queries = queries[batch_start_q:batch_end_q]
                            scraper_url   = SCRAPER_URLS[batch_idx % len(SCRAPER_URLS)]
                            print(f"\n🚀 Chunk {chunk_idx+1} | Batch {batch_idx+1}/{batches} → {scraper_url}")
                            time.sleep(random.uniform(1, 2))
                            future = executor.submit(
                                process_batch,
                                batch_idx, batch_queries, scraper_url,
                                filename, job_id, df_input
                            )
                            futures[future] = batch_idx
                        for future in as_completed(futures):
                            batch_idx = futures[future]
                            completed_count += 1
                            try:
                                df_filtered, df_raw = future.result()
                                if df_filtered is not None:
                                    all_results_filtered.append(df_filtered)
                                    all_results_raw.append(df_raw)
                                    append_partial(job_id, df_filtered, df_raw)
                            except Exception as e:
                                print(f"💥 Batch {batch_idx+1} Exception: {e}")
                            save_progress(job_id, batch_idx, batches)
                            elapsed = time.time() - job_start_time
                            if completed_count > 0:
                                avg_per_batch = elapsed / completed_count
                                remaining = (batches - start_batch - completed_count) * avg_per_batch / PARALLEL_WORKERS
                                eta_str = format_eta(remaining)
                            else:
                                eta_str = "?"
                            job.status = f"🔄 {completed_count}/{batches - start_batch} fertig | ⏱️ ~{eta_str}"
                            db.session.commit()
                        if chunk_idx < len(chunks) - 1:
                            delay = random.uniform(BATCH_DELAY_MIN, BATCH_DELAY_MAX)
                            print(f"⏸️ Chunk {chunk_idx+1} fertig – warte {delay:.1f}s...")
                            time.sleep(delay)
                # ── MERGE & SAVE ──
                job.status = "🔧 merging results"
                db.session.commit()
                base = filename.replace('.csv', '')
                if all_results_filtered:
                    df_final_filtered = pd.concat(all_results_filtered, ignore_index=True)
                    df_final_filtered = df_final_filtered.drop_duplicates(subset=['title', 'address'])
                    out_filtered = f"results_{base}_filtered.csv"
                    df_final_filtered.to_csv(
                        os.path.join(RESULT_FOLDER, out_filtered),
                        index=False, encoding='utf-8-sig', sep=';'
                    )
                    out_raw = None
                    if all_results_raw:
                        df_final_raw = pd.concat(all_results_raw, ignore_index=True)
                        out_raw = f"results_{base}_all.csv"
                        df_final_raw.to_csv(
                            os.path.join(RESULT_FOLDER, out_raw),
                            index=False, encoding='utf-8-sig', sep=';'
                        )
                    job.result_filename     = out_filtered
                    job.result_filename_raw = out_raw
                    job.status = f"✅ Fertig: {len(df_final_filtered)} Firmen"
                    cleanup_progress(job_id)
                else:
                    job.status = "❌ Keine Ergebnisse"
                db.session.commit()
                print(f"🎉 Job {job_id} komplett!")
            except Exception as e:
                job.status = f"Failed: {str(e)[:50]}"
                print(f"💥 FATAL: {e}")
                import traceback
                traceback.print_exc()
                db.session.commit()
            print(f"✅ DONE! Status: {job.status}")
--- a/delete-crawl-jobs.py
+++ b/delete-crawl-jobs.py
@ -0,0 +1,21 @@
 # Maintenance script: list all jobs of the scraper API at base_url and
 # delete them one by one, printing the outcome per job and a final count.
 import requests
 import time
 base_url = "http://localhost:5001/api/v1/jobs"
 # requests applies no timeout by default, so an unresponsive scraper would
 # make the script hang forever.
 REQUEST_TIMEOUT = 10  # seconds
 response = requests.get(base_url, timeout=REQUEST_TIMEOUT)
 # Stop on an error response instead of treating its body as the job list.
 response.raise_for_status()
 jobs = response.json()  # the body is the job array itself
 print(f"{len(jobs)} Jobs gefunden.")
 deleted = 0
 for job in jobs:
    job_id = job["ID"]
    del_res = requests.delete(f"{base_url}/{job_id}", timeout=REQUEST_TIMEOUT)
    if del_res.status_code in [200, 204]:
        print(f"✓ {job_id}")
        deleted += 1
    else:
        print(f"✗ {job_id}: {del_res.status_code}")
    # Short pause between deletes.
    time.sleep(0.1)
 print(f"{deleted}/{len(jobs)} gelöscht.")
--- a/docker-compose.yml
+++ b/docker-compose.yml
@ -1,4 +1,4 @@
-version: '3'
+version: '3.8'
 services:
  web:
    build: .
@ -6,6 +6,114 @@ services:
      - "5000:5000"
    environment:
      - FLASK_APP=app
-    command: flask run --host=0.0.0.0 --port=5000
+      - FLASK_ENV=production
      - PYTHONUNBUFFERED=1
    volumes:
-      - .:/app
+      - ./app:/app/app
      - ./uploads:/app/uploads
      - ./results:/app/results
      - ./instance:/app/instance
      - /var/run/docker.sock:/var/run/docker.sock
    depends_on:
      - gmaps-scraper-1
      - gmaps-scraper-2
      - gmaps-scraper-3
      - gmaps-scraper-4
    restart: always
    networks:
      - scraper-net
  gmaps-scraper-1:
    image: gosom/google-maps-scraper:latest
    container_name: gmaps-scraper-1
    environment:
      - PLAYWRIGHT_BROWSERS_PATH=/ms-playwright
    ports:
      - "5001:8080"
    volumes:
      - ./scraper-data-1:/gmapsdata
    command:
      - "-web"
      - "-data-folder=/gmapsdata"
    restart: always
    healthcheck:
      test: ["CMD-SHELL", "wget -qO- http://localhost:8080/api/v1/jobs || exit 1"]
      interval: 30s
      timeout: 10s
      retries: 3
      start_period: 15s
    networks:
      - scraper-net
  gmaps-scraper-2:
    image: gosom/google-maps-scraper:latest
    container_name: gmaps-scraper-2
    environment:
      - PLAYWRIGHT_BROWSERS_PATH=/ms-playwright
    ports:
      - "5002:8080"
    volumes:
      - ./scraper-data-2:/gmapsdata
    command:
      - "-web"
      - "-data-folder=/gmapsdata"
    restart: always
    healthcheck:
      test: ["CMD-SHELL", "wget -qO- http://localhost:8080/api/v1/jobs || exit 1"]
      interval: 30s
      timeout: 10s
      retries: 3
      start_period: 15s
    networks:
      - scraper-net
  gmaps-scraper-3:
    image: gosom/google-maps-scraper:latest
    container_name: gmaps-scraper-3
    environment:
      - PLAYWRIGHT_BROWSERS_PATH=/ms-playwright
    ports:
      - "5003:8080"
    volumes:
      - ./scraper-data-3:/gmapsdata
    command:
      - "-web"
      - "-data-folder=/gmapsdata"
    restart: always
    healthcheck:
      test: ["CMD-SHELL", "wget -qO- http://localhost:8080/api/v1/jobs || exit 1"]
      interval: 30s
      timeout: 10s
      retries: 3
      start_period: 15s
    networks:
      - scraper-net
  gmaps-scraper-4:
    image: gosom/google-maps-scraper:latest
    container_name: gmaps-scraper-4
    environment:
      - PLAYWRIGHT_BROWSERS_PATH=/ms-playwright
    ports:
      - "5004:8080"
    volumes:
      - ./scraper-data-4:/gmapsdata
    command:
      - "-web"
      - "-data-folder=/gmapsdata"
    restart: always
    healthcheck:
      test: ["CMD-SHELL", "wget -qO- http://localhost:8080/api/v1/jobs || exit 1"]
      interval: 30s
      timeout: 10s
      retries: 3
      start_period: 15s
    networks:
      - scraper-net
 networks:
  scraper-net:
    driver: bridge
--- a/instance/users.db
+++ b/instance/users.db
--- a/requirements.txt
+++ b/requirements.txt
@ -1,8 +1,8 @@
-Flask==2.2.5
+flask
-Flask-Login==0.6.2
+flask-sqlalchemy
-Flask-SQLAlchemy==3.0.3
+flask-login
-Werkzeug==2.2.2
+flask-migrate
 pandas
 requests
-beautifulsoup4
+werkzeug
-Flask-Migrate
+docker
--- a/scraper-data-1/02af4949-431c-4736-beec-8ef7bc859c9d.csv
+++ b/scraper-data-1/02af4949-431c-4736-beec-8ef7bc859c9d.csv
--- a/scraper-data-1/072fe9f8-ce9d-4af5-a0aa-bde70349d5ba.csv
+++ b/scraper-data-1/072fe9f8-ce9d-4af5-a0aa-bde70349d5ba.csv
--- a/scraper-data-1/0b7932ea-4588-44bb-9b08-a69e95ef5d28.csv
+++ b/scraper-data-1/0b7932ea-4588-44bb-9b08-a69e95ef5d28.csv
--- a/scraper-data-1/0d9a6e99-a20c-4301-add9-00211dcc5fa3.csv
+++ b/scraper-data-1/0d9a6e99-a20c-4301-add9-00211dcc5fa3.csv
--- a/scraper-data-1/197953d0-9719-4fb7-a8a1-3a8a9e6994b0.csv
+++ b/scraper-data-1/197953d0-9719-4fb7-a8a1-3a8a9e6994b0.csv
--- a/scraper-data-1/1ebb427f-3308-4637-be28-1c0032a2107d.csv
+++ b/scraper-data-1/1ebb427f-3308-4637-be28-1c0032a2107d.csv
--- a/scraper-data-1/245440fd-eb76-4ae1-9278-bf416a1351a0.csv
+++ b/scraper-data-1/245440fd-eb76-4ae1-9278-bf416a1351a0.csv
--- a/scraper-data-1/26b82d8c-c109-48a4-8b06-f98638d565c3.csv
+++ b/scraper-data-1/26b82d8c-c109-48a4-8b06-f98638d565c3.csv
--- a/scraper-data-1/28b024e8-74db-429a-88c2-c6e8d314bf33.csv
+++ b/scraper-data-1/28b024e8-74db-429a-88c2-c6e8d314bf33.csv
--- a/scraper-data-1/2af0ecab-6f95-48d4-b65b-7cef52d3fe73.csv
+++ b/scraper-data-1/2af0ecab-6f95-48d4-b65b-7cef52d3fe73.csv
--- a/scraper-data-1/2b17ff19-48dc-44d8-b747-eb751c2c28ad.csv
+++ b/scraper-data-1/2b17ff19-48dc-44d8-b747-eb751c2c28ad.csv
--- a/scraper-data-1/2c3dce9e-a011-42ca-9310-34b0bb481fdf.csv
+++ b/scraper-data-1/2c3dce9e-a011-42ca-9310-34b0bb481fdf.csv
--- a/scraper-data-1/2efb0985-cac3-454a-9018-e27f98476df2.csv
+++ b/scraper-data-1/2efb0985-cac3-454a-9018-e27f98476df2.csv
--- a/scraper-data-1/332988ef-754f-4a11-b948-38c1ca463987.csv
+++ b/scraper-data-1/332988ef-754f-4a11-b948-38c1ca463987.csv
--- a/scraper-data-1/39cc6d71-3567-46b9-883c-0cb4fed755a1.csv
+++ b/scraper-data-1/39cc6d71-3567-46b9-883c-0cb4fed755a1.csv
--- a/scraper-data-1/3f53b4d3-61c4-478f-aee7-0b1524b3e480.csv
+++ b/scraper-data-1/3f53b4d3-61c4-478f-aee7-0b1524b3e480.csv
--- a/scraper-data-1/4586b26a-6c85-4109-9d7f-bdcae264ab25.csv
+++ b/scraper-data-1/4586b26a-6c85-4109-9d7f-bdcae264ab25.csv
--- a/scraper-data-1/4612826f-e088-4ebd-8de2-523fe801dd2b.csv
+++ b/scraper-data-1/4612826f-e088-4ebd-8de2-523fe801dd2b.csv
--- a/scraper-data-1/485abd75-8b7d-44a6-92f3-6fad9828a288.csv
+++ b/scraper-data-1/485abd75-8b7d-44a6-92f3-6fad9828a288.csv
--- a/scraper-data-1/4a4acb8f-be75-4328-b011-5dd6f633271e.csv
+++ b/scraper-data-1/4a4acb8f-be75-4328-b011-5dd6f633271e.csv
--- a/scraper-data-1/5610de98-4347-49cd-a480-03afa1c9ee15.csv
+++ b/scraper-data-1/5610de98-4347-49cd-a480-03afa1c9ee15.csv
--- a/scraper-data-1/581a5fce-910b-4da7-89f0-4ae4abb9d48c.csv
+++ b/scraper-data-1/581a5fce-910b-4da7-89f0-4ae4abb9d48c.csv
--- a/scraper-data-1/5943bc28-1757-4474-9e15-504a35fa90ac.csv
+++ b/scraper-data-1/5943bc28-1757-4474-9e15-504a35fa90ac.csv
--- a/scraper-data-1/59bfb43f-0fd5-48ba-b8cc-aab842330d3d.csv
+++ b/scraper-data-1/59bfb43f-0fd5-48ba-b8cc-aab842330d3d.csv
--- a/scraper-data-1/5e590d3f-8fc6-4cca-b111-bcd5b7694a47.csv
+++ b/scraper-data-1/5e590d3f-8fc6-4cca-b111-bcd5b7694a47.csv
--- a/scraper-data-1/5f2179d9-32f0-4fbb-9d30-056ffb74d559.csv
+++ b/scraper-data-1/5f2179d9-32f0-4fbb-9d30-056ffb74d559.csv
--- a/scraper-data-1/60919958-7169-48c2-9079-34f32ed6065d.csv
+++ b/scraper-data-1/60919958-7169-48c2-9079-34f32ed6065d.csv
--- a/scraper-data-1/6512e64f-6321-4b9c-8072-3f8260e13454.csv
+++ b/scraper-data-1/6512e64f-6321-4b9c-8072-3f8260e13454.csv
--- a/scraper-data-1/6c322761-2f3e-415f-829d-4d65dc3567b1.csv
+++ b/scraper-data-1/6c322761-2f3e-415f-829d-4d65dc3567b1.csv
--- a/scraper-data-1/6ca5f31c-d372-45f4-8948-d2844eb9305f.csv
+++ b/scraper-data-1/6ca5f31c-d372-45f4-8948-d2844eb9305f.csv
--- a/scraper-data-1/6ef54a15-b1cc-4d5c-83f6-9960ceb113a0.csv
+++ b/scraper-data-1/6ef54a15-b1cc-4d5c-83f6-9960ceb113a0.csv
--- a/scraper-data-1/6fe123f2-7247-42e3-a876-bd86f1a3191d.csv
+++ b/scraper-data-1/6fe123f2-7247-42e3-a876-bd86f1a3191d.csv
--- a/scraper-data-1/7be0bb2d-b0fa-4629-bb53-ffa4eb152292.csv
+++ b/scraper-data-1/7be0bb2d-b0fa-4629-bb53-ffa4eb152292.csv
--- a/scraper-data-1/7ecc5496-afc5-4e8f-9c2f-8f06e81fa8d5.csv
+++ b/scraper-data-1/7ecc5496-afc5-4e8f-9c2f-8f06e81fa8d5.csv
--- a/scraper-data-1/8a08068b-de23-4612-9a1c-d07dcc4d8a3f.csv
+++ b/scraper-data-1/8a08068b-de23-4612-9a1c-d07dcc4d8a3f.csv
--- a/scraper-data-1/90249f15-69e4-4174-bee1-8dd4658b73e3.csv
+++ b/scraper-data-1/90249f15-69e4-4174-bee1-8dd4658b73e3.csv
--- a/scraper-data-1/9567b79b-b2c2-4590-8cbc-b17e4e41c0a6.csv
+++ b/scraper-data-1/9567b79b-b2c2-4590-8cbc-b17e4e41c0a6.csv
--- a/scraper-data-1/96ad0ca1-95c9-4e28-8b12-16a1ad0d10a1.csv
+++ b/scraper-data-1/96ad0ca1-95c9-4e28-8b12-16a1ad0d10a1.csv
--- a/scraper-data-1/9ede5b47-e75d-4cef-b125-29f66981c3ce.csv
+++ b/scraper-data-1/9ede5b47-e75d-4cef-b125-29f66981c3ce.csv
--- a/scraper-data-1/a14661d3-e928-44db-b3b1-dbad7c87b9e6.csv
+++ b/scraper-data-1/a14661d3-e928-44db-b3b1-dbad7c87b9e6.csv
--- a/scraper-data-1/a5c2212f-e428-43ab-9242-2329890a18d8.csv
+++ b/scraper-data-1/a5c2212f-e428-43ab-9242-2329890a18d8.csv
--- a/scraper-data-1/a9e79597-e967-48b2-bbd1-55cea9d516c6.csv
+++ b/scraper-data-1/a9e79597-e967-48b2-bbd1-55cea9d516c6.csv
--- a/scraper-data-1/aaf5e97d-7dab-4d5e-8ecb-ffc834101e83.csv
+++ b/scraper-data-1/aaf5e97d-7dab-4d5e-8ecb-ffc834101e83.csv
--- a/scraper-data-1/b2b1e231-d153-476d-80ff-de60225b700e.csv
+++ b/scraper-data-1/b2b1e231-d153-476d-80ff-de60225b700e.csv
--- a/scraper-data-1/ba511b6d-4530-4a5a-a2dc-6a242c26e307.csv
+++ b/scraper-data-1/ba511b6d-4530-4a5a-a2dc-6a242c26e307.csv
--- a/scraper-data-1/bc9f63c0-8069-4ad2-9f41-a34fbdfd68ca.csv
+++ b/scraper-data-1/bc9f63c0-8069-4ad2-9f41-a34fbdfd68ca.csv
--- a/scraper-data-1/c0c28d24-5898-4118-9035-1bbfe5a4ffd8.csv
+++ b/scraper-data-1/c0c28d24-5898-4118-9035-1bbfe5a4ffd8.csv
--- a/scraper-data-1/c0e9b36a-0269-4666-b860-5f56c62b4e8d.csv
+++ b/scraper-data-1/c0e9b36a-0269-4666-b860-5f56c62b4e8d.csv
--- a/scraper-data-1/cb8e6aa2-a886-43c8-b089-44ffbf198f4a.csv
+++ b/scraper-data-1/cb8e6aa2-a886-43c8-b089-44ffbf198f4a.csv
--- a/scraper-data-1/cc7fdc27-4523-421d-93a2-c2688633d7a2.csv
+++ b/scraper-data-1/cc7fdc27-4523-421d-93a2-c2688633d7a2.csv
--- a/scraper-data-1/d019440e-43a3-4790-bb39-b66d2fbb9486.csv
+++ b/scraper-data-1/d019440e-43a3-4790-bb39-b66d2fbb9486.csv
--- a/scraper-data-1/d025a229-c945-4c62-a11b-548fdea678d6.csv
+++ b/scraper-data-1/d025a229-c945-4c62-a11b-548fdea678d6.csv
--- a/scraper-data-1/d1f47306-7e62-4870-b7b0-b17266513b64.csv
+++ b/scraper-data-1/d1f47306-7e62-4870-b7b0-b17266513b64.csv
--- a/scraper-data-1/d228bcd4-10f1-4d9d-ac61-a749c429cd7b.csv
+++ b/scraper-data-1/d228bcd4-10f1-4d9d-ac61-a749c429cd7b.csv
--- a/scraper-data-1/d58b61a0-842a-42d1-8eb2-8ea691298796.csv
+++ b/scraper-data-1/d58b61a0-842a-42d1-8eb2-8ea691298796.csv
--- a/scraper-data-1/d5912fef-1aa8-45ff-a4ee-0048baab63c8.csv
+++ b/scraper-data-1/d5912fef-1aa8-45ff-a4ee-0048baab63c8.csv
--- a/scraper-data-1/d6371de0-d9b1-4ecd-9e4a-a795df718857.csv
+++ b/scraper-data-1/d6371de0-d9b1-4ecd-9e4a-a795df718857.csv
--- a/scraper-data-1/d67b4b2e-cd69-416e-9675-294e4a1bf7b7.csv
+++ b/scraper-data-1/d67b4b2e-cd69-416e-9675-294e4a1bf7b7.csv
--- a/scraper-data-1/d842cfb6-be7d-4690-b3e4-6fcb7d262019.csv
+++ b/scraper-data-1/d842cfb6-be7d-4690-b3e4-6fcb7d262019.csv
--- a/scraper-data-1/e274bc34-5df0-41fb-8491-2280e6d72ab7.csv
+++ b/scraper-data-1/e274bc34-5df0-41fb-8491-2280e6d72ab7.csv
--- a/scraper-data-1/e49938e1-b7ac-4aed-9706-01806c38dbf0.csv
+++ b/scraper-data-1/e49938e1-b7ac-4aed-9706-01806c38dbf0.csv
--- a/scraper-data-1/e67bce34-5c93-4355-90b7-cdf18175b869.csv
+++ b/scraper-data-1/e67bce34-5c93-4355-90b7-cdf18175b869.csv
--- a/scraper-data-1/ea482d12-613c-4cb5-8297-c4156ab3f305.csv
+++ b/scraper-data-1/ea482d12-613c-4cb5-8297-c4156ab3f305.csv
--- a/scraper-data-1/ebc83e46-38e2-4297-8e27-f77fdb3bb9a9.csv
+++ b/scraper-data-1/ebc83e46-38e2-4297-8e27-f77fdb3bb9a9.csv
--- a/scraper-data-1/fa49de61-ddcc-4117-9753-60134537c237.csv
+++ b/scraper-data-1/fa49de61-ddcc-4117-9753-60134537c237.csv
--- a/scraper-data-1/jobs.db
+++ b/scraper-data-1/jobs.db
--- a/scraper-data-1/jobs.db-shm
+++ b/scraper-data-1/jobs.db-shm
--- a/scraper-data-1/jobs.db-wal
+++ b/scraper-data-1/jobs.db-wal
--- a/scraper-data-2/0190015c-b2d5-4423-9831-783612514cc1.csv
+++ b/scraper-data-2/0190015c-b2d5-4423-9831-783612514cc1.csv
--- a/scraper-data-2/03100699-9292-4952-ab27-0b502755623e.csv
+++ b/scraper-data-2/03100699-9292-4952-ab27-0b502755623e.csv
--- a/scraper-data-2/073062a2-ca90-4fde-8fa6-ff3c9ed762aa.csv
+++ b/scraper-data-2/073062a2-ca90-4fde-8fa6-ff3c9ed762aa.csv
--- a/scraper-data-2/08476dc0-386f-493d-a006-2dacc9c3a969.csv
+++ b/scraper-data-2/08476dc0-386f-493d-a006-2dacc9c3a969.csv
--- a/scraper-data-2/0b76010e-7005-4f71-a98d-8f87b335fc08.csv
+++ b/scraper-data-2/0b76010e-7005-4f71-a98d-8f87b335fc08.csv
--- a/scraper-data-2/0ce4e8b0-9ab2-4f10-98b8-0ad99ff15daf.csv
+++ b/scraper-data-2/0ce4e8b0-9ab2-4f10-98b8-0ad99ff15daf.csv
--- a/scraper-data-2/15c99b61-c169-432c-bb16-c514c65e6d1e.csv
+++ b/scraper-data-2/15c99b61-c169-432c-bb16-c514c65e6d1e.csv
--- a/scraper-data-2/1611afca-30d1-4dcd-984c-772c5de32fb3.csv
+++ b/scraper-data-2/1611afca-30d1-4dcd-984c-772c5de32fb3.csv
--- a/scraper-data-2/1e06930f-a2a8-4020-9f94-3580b6e51d00.csv
+++ b/scraper-data-2/1e06930f-a2a8-4020-9f94-3580b6e51d00.csv
--- a/Show more
+++ b/Show more