Initial commit

This commit is contained in:
mkrieger 2026-03-10 11:33:18 +01:00
parent 387bc056b9
commit df8c2313a9
275 changed files with 12939 additions and 263 deletions

View file

@ -7,12 +7,14 @@ WORKDIR /app
# Abhängigkeiten installieren
COPY requirements.txt requirements.txt
RUN pip install --no-cache-dir -r requirements.txt
RUN apt update
RUN apt install curl -y
# App-Dateien kopieren
COPY . .
# Flask Umgebungsvariable setzen
ENV FLASK_APP=app
ENV FLASK_ENV=production
# Flask starten
EXPOSE 5000
CMD ["flask", "run", "--host=0.0.0.0", "--port=5000"]

View file

@ -1,56 +1,88 @@
import os
from flask import Flask, redirect, url_for, request
from flask import Flask, redirect, url_for, request, current_app
from flask_sqlalchemy import SQLAlchemy
from flask_login import LoginManager, current_user
from flask_migrate import Migrate
from sqlalchemy import text
# Konfiguration für Upload- und Ergebnis-Ordner
# ✅ Docker-Pfade
UPLOAD_FOLDER = '/app/uploads'
RESULT_FOLDER = '/app/results'
db = SQLAlchemy()
login_manager = LoginManager()
migrate = Migrate()
def _run_migrations(app):
"""Fehlende DB-Spalten automatisch hinzufügen übersteht jeden Neustart"""
migrations = [
("job", "result_filename_raw", "VARCHAR(150)"),
("job", "scraper_job_id", "VARCHAR(255)"),
("user", "is_admin", "BOOLEAN DEFAULT 0"),
]
with app.app_context():
for table, column, col_type in migrations:
try:
db.session.execute(text(f"ALTER TABLE {table} ADD COLUMN {column} {col_type}"))
db.session.commit()
print(f"✅ Migration: {table}.{column} hinzugefügt")
except Exception:
db.session.rollback()
def create_app():
app = Flask(__name__)
# 🔑 Configs
app.config['SECRET_KEY'] = '008e7369b075886d5f494c8813efdfb17155da6af12b3fe8ee'
app.config['SQLALCHEMY_DATABASE_URI'] = 'sqlite:///users.db'
app.config['SQLALCHEMY_TRACK_MODIFICATIONS'] = False
app.config['UPLOAD_FOLDER'] = UPLOAD_FOLDER
app.config['RESULT_FOLDER'] = RESULT_FOLDER
app.config['ALLOW_USER_SIGNUP'] = False
# DB + Tools
db.init_app(app)
migrate.init_app(app, db)
# Flask-Login Setup
login_manager = LoginManager()
login_manager.login_view = 'auth.login'
login_manager.init_app(app)
login_manager.login_view = 'auth.login'
# User Loader
@login_manager.user_loader
def load_user(user_id):
from .models import User
return User.query.get(int(user_id))
# Umleitung nicht authentifizierter Benutzer, statische Dateien und bestimmte Routen ausnehmen
# Protected Routes
@app.before_request
def require_login():
allowed_routes = ['auth.login', 'auth.signup']
if (not current_user.is_authenticated
and request.endpoint not in allowed_routes
and not request.path.startswith('/static/')):
allowed = ['auth.login', 'auth.signup', 'static']
if (not current_user.is_authenticated and
request.endpoint not in allowed and
not request.path.startswith('/static')):
return redirect(url_for('auth.login'))
# Erstellen Sie die Ordner, falls sie noch nicht existieren
# Ordner
os.makedirs(app.config['UPLOAD_FOLDER'], exist_ok=True)
os.makedirs(app.config['RESULT_FOLDER'], exist_ok=True)
# Registrieren der Routen
# Routes
from . import routes
app.register_blueprint(routes.bp)
# Erstellen der Tabellen in der Datenbank
# Index Redirect
@app.route('/')
def index():
return redirect(url_for('auth.job_status'))
# DB Tables + Auto-Migration
with app.app_context():
db.create_all()
_run_migrations(app)
return app
if __name__ == '__main__':
app = create_app()
app.run(host='0.0.0.0', port=5000, debug=False)

68
app/init.py.bak Normal file
View file

@ -0,0 +1,68 @@
import os
from flask import Flask, redirect, url_for, request, current_app
from flask_sqlalchemy import SQLAlchemy
from flask_login import LoginManager, current_user
from flask_migrate import Migrate
# ✅ Docker-Pfade
UPLOAD_FOLDER = '/app/uploads'
RESULT_FOLDER = '/app/results'
db = SQLAlchemy()
login_manager = LoginManager()
migrate = Migrate()
def create_app():
app = Flask(__name__)
# 🔑 Configs
app.config['SECRET_KEY'] = '008e7369b075886d5f494c8813efdfb17155da6af12b3fe8ee'
app.config['SQLALCHEMY_DATABASE_URI'] = 'sqlite:///users.db'
app.config['SQLALCHEMY_TRACK_MODIFICATIONS'] = False
app.config['UPLOAD_FOLDER'] = UPLOAD_FOLDER
app.config['RESULT_FOLDER'] = RESULT_FOLDER
app.config['ALLOW_USER_SIGNUP'] = True # ✅ Aktiviert!
# DB + Tools
db.init_app(app)
migrate.init_app(app, db)
login_manager.init_app(app)
login_manager.login_view = 'auth.login'
# User Loader
@login_manager.user_loader
def load_user(user_id):
from .models import User
return User.query.get(int(user_id))
# Protected Routes
@app.before_request
def require_login():
allowed = ['auth.login', 'auth.signup', 'static']
if (not current_user.is_authenticated and
request.endpoint not in allowed and
not request.path.startswith('/static')):
return redirect(url_for('auth.login'))
# Ordner
os.makedirs(app.config['UPLOAD_FOLDER'], exist_ok=True)
os.makedirs(app.config['RESULT_FOLDER'], exist_ok=True)
# Routes
from . import routes
app.register_blueprint(routes.bp)
# Index Redirect
@app.route('/')
def index():
return redirect(url_for('auth.job_status'))
# DB Tables
with app.app_context():
db.create_all()
return app
if __name__ == '__main__':
app = create_app()
app.run(host='0.0.0.0', port=5000, debug=False)

View file

@ -15,5 +15,11 @@ class Job(db.Model):
status = db.Column(db.String(50), default="Pending")
created_at = db.Column(db.DateTime, default=datetime.utcnow)
result_filename = db.Column(db.String(150), nullable=True)
result_filename_raw = db.Column(db.String(150), nullable=True)
user = db.relationship('User', backref=db.backref('jobs', lazy=True))
class AppConfig(db.Model):
id = db.Column(db.Integer, primary_key=True)
key = db.Column(db.String(100), unique=True, nullable=False)
value = db.Column(db.String(100), nullable=False, default='false')

223
app/routes.orig Normal file
View file

@ -0,0 +1,223 @@
import time
import csv
import os
import threading
from flask import Blueprint, request, redirect, url_for, flash, render_template, send_file, current_app
from flask_login import login_user, logout_user, login_required, current_user
from werkzeug.utils import secure_filename
from werkzeug.security import generate_password_hash, check_password_hash
from .models import db, User, Job
from .webcrawler import process_file # Importiere die Funktion für das Webscraping
UPLOAD_FOLDER = 'uploads'
RESULT_FOLDER = 'results'
# Blueprint für auth erstellen
bp = Blueprint('auth', __name__)
@bp.route('/login', methods=['GET', 'POST'])
def login():
if request.method == 'POST':
username = request.form['username']
password = request.form['password']
user = User.query.filter_by(username=username).first()
if user and check_password_hash(user.password, password):
login_user(user)
return redirect(url_for('auth.job_status'))
flash('Login fehlgeschlagen. Überprüfen Sie Benutzername und Passwort.')
return render_template('login.html')
@bp.route('/signup', methods=['GET', 'POST'])
def signup():
if not current_app.config['ALLOW_USER_SIGNUP']:
flash("Registrierung ist derzeit deaktiviert.")
return redirect(url_for('auth.login'))
if request.method == 'POST':
username = request.form['username']
password = generate_password_hash(request.form['password'], method='sha256')
new_user = User(username=username, password=password)
db.session.add(new_user)
db.session.commit()
flash('Benutzer erfolgreich erstellt! Sie können sich jetzt einloggen.')
return redirect(url_for('auth.login'))
return render_template('signup.html')
@bp.route('/logout')
@login_required
def logout():
logout_user()
return redirect(url_for('auth.login'))
@bp.route('/jobs')
@login_required
def job_status():
jobs = Job.query.filter_by(user_id=current_user.id).all()
return render_template('jobs.html', jobs=jobs)
@bp.route('/upload', methods=['GET', 'POST'])
@login_required
def upload():
if request.method == 'POST':
file = request.files['file']
filename = secure_filename(file.filename)
# Überprüfen, ob eine Datei mit dem gleichen Namen bereits existiert
file_path = os.path.join(current_app.config['UPLOAD_FOLDER'], filename)
if os.path.exists(file_path):
# Wenn eine Datei mit dem gleichen Namen existiert, einen Zeitstempel hinzufügen
name, ext = os.path.splitext(filename)
timestamp = time.strftime("%Y%m%d-%H%M%S") # Zeitstempel im Format JahrMonatTag-StundenMinutenSekunden
filename = f"{name}_{timestamp}{ext}"
file_path = os.path.join(current_app.config['UPLOAD_FOLDER'], filename)
flash(f"Eine Datei mit gleichem Namen existierte bereits. Die Datei wurde als '{filename}' gespeichert.")
# Speichern der Datei
file.save(file_path)
flash('Datei erfolgreich hochgeladen und Job gestartet')
# Neuen Job erstellen
new_job = Job(user_id=current_user.id, filename=filename, status="Pending")
db.session.add(new_job)
db.session.commit()
# Debugging-Ausgabe zur Überprüfung der Thread-Erstellung
print(f"Starte Scraping-Thread für Job-ID: {new_job.id}")
# Starten des Scraping im Hintergrund-Thread und Übergeben des aktuellen Anwendungskontexts
thread = threading.Thread(target=process_file, args=(filename, new_job.id, current_app._get_current_object()))
thread.start()
# Debugging-Ausgabe, nachdem der Thread gestartet wurde
print(f"Thread für Job {new_job.id} erfolgreich gestartet.")
return redirect(url_for('auth.job_status'))
return render_template('upload.html')
@bp.route('/download/<int:job_id>', methods=['GET'])
@login_required
def download_result(job_id):
job = Job.query.get_or_404(job_id)
print(f"Job ID: {job.id} - User ID: {job.user_id} - Current User ID: {current_user.id}")
# Überprüfen, ob der Job dem aktuellen Benutzer gehört
if job.user_id != current_user.id:
flash("Sie haben keine Berechtigung, dieses Ergebnis herunterzuladen.")
return redirect(url_for('auth.job_status'))
# Überprüfen, ob das Ergebnis vorhanden ist
if not job.result_filename:
flash("Das Ergebnis ist noch nicht verfügbar.")
return redirect(url_for('auth.job_status'))
# Überprüfen, ob die Datei im angegebenen Pfad existiert
result_path = os.path.join(current_app.config['RESULT_FOLDER'], job.result_filename)
print(f"Versuche, Datei herunterzuladen von: {result_path}")
if os.path.exists(result_path):
print("Datei existiert und wird zum Download bereitgestellt.")
return send_file(result_path, as_attachment=True)
else:
print("Datei nicht gefunden. Ergebnisverzeichnis oder Pfad prüfen.")
flash("Ergebnisdatei nicht gefunden.")
return redirect(url_for('auth.job_status'))
@bp.route('/delete_job/<int:job_id>', methods=['POST'])
@login_required
def delete_job(job_id):
job = Job.query.get_or_404(job_id)
if job.user_id != current_user.id:
flash("Sie haben keine Berechtigung, diesen Job zu löschen.")
return redirect(url_for('auth.job_status'))
# Löschen der Upload-Datei
upload_path = os.path.join(current_app.config['UPLOAD_FOLDER'], job.filename)
if os.path.exists(upload_path):
os.remove(upload_path)
print(f"Upload-Datei gelöscht: {upload_path}")
else:
print(f"Upload-Datei nicht gefunden: {upload_path}")
# Löschen der Results-Datei, falls vorhanden
if job.result_filename:
result_path = os.path.join(current_app.config['RESULT_FOLDER'], job.result_filename)
print(f"Versuche Ergebnisdatei zu löschen: {result_path}")
if os.path.exists(result_path):
try:
os.remove(result_path)
print(f"Ergebnisdatei gelöscht: {result_path}")
except Exception as e:
print(f"Fehler beim Löschen der Ergebnisdatei: {e}")
else:
print(f"Ergebnisdatei nicht gefunden im Pfad: {result_path}")
# Job aus der Datenbank löschen
db.session.delete(job)
db.session.commit()
flash("Job erfolgreich gelöscht.")
return redirect(url_for('auth.job_status'))
@bp.route('/admin', methods=['GET'])
@login_required
def admin_panel():
if not current_user.is_admin:
flash("Keine Berechtigung.")
return redirect(url_for('auth.job_status'))
users = User.query.all()
return render_template('admin_panel.html', users=users)
@bp.route('/admin/create_user', methods=['POST'])
@login_required
def create_user():
if not current_user.is_admin:
flash("Keine Berechtigung.")
return redirect(url_for('auth.admin_panel'))
username = request.form['username']
password = request.form['password']
is_admin = 'is_admin' in request.form # Checkbox für Adminrechte
hashed_password = generate_password_hash(password, method='sha256')
new_user = User(username=username, password=hashed_password, is_admin=is_admin)
db.session.add(new_user)
db.session.commit()
flash(f"Benutzer {username} wurde erstellt.")
return redirect(url_for('auth.admin_panel'))
@bp.route('/admin/reset_password/<int:user_id>', methods=['POST'])
@login_required
def reset_password(user_id):
if not current_user.is_admin:
flash("Keine Berechtigung.")
return redirect(url_for('auth.admin_panel'))
user = User.query.get_or_404(user_id)
new_password = request.form['new_password']
user.password = generate_password_hash(new_password, method='sha256')
db.session.commit()
flash(f"Passwort für Benutzer {user.username} wurde zurückgesetzt.")
return redirect(url_for('auth.admin_panel'))
@bp.route('/admin/delete_user/<int:user_id>', methods=['POST'])
@login_required
def delete_user(user_id):
if not current_user.is_admin:
flash("Keine Berechtigung.")
return redirect(url_for('auth.admin_panel'))
user = User.query.get_or_404(user_id)
if user.is_admin:
flash("Administratoren können nicht gelöscht werden.")
return redirect(url_for('auth.admin_panel'))
db.session.delete(user)
db.session.commit()
flash(f"Benutzer {user.username} wurde gelöscht.")
return redirect(url_for('auth.admin_panel'))

View file

@ -1,18 +1,16 @@
import time
import csv
import os
import threading
from flask import Blueprint, request, redirect, url_for, flash, render_template, send_file, current_app
from flask import Blueprint, request, redirect, url_for, flash, render_template, send_file, jsonify, current_app
from flask_login import login_user, logout_user, login_required, current_user
from werkzeug.utils import secure_filename
from werkzeug.security import generate_password_hash, check_password_hash
from .models import db, User, Job
from .webcrawler import process_file # Importiere die Funktion für das Webscraping
from .models import db, User, Job, AppConfig
from .webcrawler import process_file
UPLOAD_FOLDER = 'uploads'
RESULT_FOLDER = 'results'
UPLOAD_FOLDER = '/app/uploads'
RESULT_FOLDER = '/app/results'
# Blueprint für auth erstellen
bp = Blueprint('auth', __name__)
@bp.route('/login', methods=['GET', 'POST'])
@ -29,19 +27,19 @@ def login():
@bp.route('/signup', methods=['GET', 'POST'])
def signup():
if not current_app.config['ALLOW_USER_SIGNUP']:
cfg = AppConfig.query.filter_by(key='allow_signup').first()
if not cfg or cfg.value != 'true':
flash("Registrierung ist derzeit deaktiviert.")
return redirect(url_for('auth.login'))
if request.method == 'POST':
username = request.form['username']
password = generate_password_hash(request.form['password'], method='sha256')
password = generate_password_hash(request.form['password']) # ✅ Fix
new_user = User(username=username, password=password)
db.session.add(new_user)
db.session.commit()
flash('Benutzer erfolgreich erstellt! Sie können sich jetzt einloggen.')
flash('Benutzer erfolgreich erstellt!')
return redirect(url_for('auth.login'))
return render_template('signup.html')
@bp.route('/logout')
@ -53,171 +51,203 @@ def logout():
@bp.route('/jobs')
@login_required
def job_status():
jobs = Job.query.filter_by(user_id=current_user.id).all()
jobs = Job.query.filter_by(user_id=current_user.id).order_by(Job.created_at.desc()).all()
return render_template('jobs.html', jobs=jobs)
@bp.route('/upload', methods=['GET', 'POST'])
@login_required
def upload():
if request.method == 'POST':
file = request.files['file']
filename = secure_filename(file.filename)
# Überprüfen, ob eine Datei mit dem gleichen Namen bereits existiert
file_path = os.path.join(current_app.config['UPLOAD_FOLDER'], filename)
if os.path.exists(file_path):
# Wenn eine Datei mit dem gleichen Namen existiert, einen Zeitstempel hinzufügen
name, ext = os.path.splitext(filename)
timestamp = time.strftime("%Y%m%d-%H%M%S") # Zeitstempel im Format JahrMonatTag-StundenMinutenSekunden
filename = f"{name}_{timestamp}{ext}"
file_path = os.path.join(current_app.config['UPLOAD_FOLDER'], filename)
flash(f"Eine Datei mit gleichem Namen existierte bereits. Die Datei wurde als '{filename}' gespeichert.")
# Speichern der Datei
file.save(file_path)
flash('Datei erfolgreich hochgeladen und Job gestartet')
if 'file' not in request.files:
flash('Keine Datei ausgewählt.')
return redirect(url_for('auth.upload'))
# Neuen Job erstellen
new_job = Job(user_id=current_user.id, filename=filename, status="Pending")
file = request.files['file']
if not file or file.filename == '':
flash('Keine gültige Datei.')
return redirect(url_for('auth.upload'))
filename = secure_filename(file.filename)
name, ext = os.path.splitext(filename)
timestamp = time.strftime("%Y%m%d_%H%M%S")
unique_filename = f"{name}_{timestamp}{ext}" if os.path.exists(os.path.join(UPLOAD_FOLDER, filename)) else filename
filepath = os.path.join(UPLOAD_FOLDER, unique_filename)
file.save(filepath)
print(f"💾 UPLOAD: {filepath}")
new_job = Job(
user_id=current_user.id,
filename=unique_filename,
status="Pending"
)
db.session.add(new_job)
db.session.commit()
print(f"🆕 JOB #{new_job.id} für User {current_user.id}")
# Debugging-Ausgabe zur Überprüfung der Thread-Erstellung
print(f"Starte Scraping-Thread für Job-ID: {new_job.id}")
# Starten des Scraping im Hintergrund-Thread und Übergeben des aktuellen Anwendungskontexts
thread = threading.Thread(target=process_file, args=(filename, new_job.id, current_app._get_current_object()))
thread = threading.Thread(
target=process_file,
args=(unique_filename, new_job.id, current_app._get_current_object())
)
thread.daemon = True
thread.start()
print(f"🔄 THREAD STARTED Job {new_job.id}")
# Debugging-Ausgabe, nachdem der Thread gestartet wurde
print(f"Thread für Job {new_job.id} erfolgreich gestartet.")
flash(f'"{unique_filename}" → Job #{new_job.id} läuft!')
return redirect(url_for('auth.job_status'))
return render_template('upload.html')
@bp.route('/download/<int:job_id>', methods=['GET'])
@bp.route('/download/<int:job_id>')
@login_required
def download_result(job_id):
job = Job.query.get_or_404(job_id)
print(f"Job ID: {job.id} - User ID: {job.user_id} - Current User ID: {current_user.id}")
job = Job.query.filter_by(id=job_id, user_id=current_user.id).first_or_404()
# Überprüfen, ob der Job dem aktuellen Benutzer gehört
if job.user_id != current_user.id:
flash("Sie haben keine Berechtigung, dieses Ergebnis herunterzuladen.")
if not job.result_filename or not job.status.startswith(''):
flash('Ergebnis nicht bereit.')
return redirect(url_for('auth.job_status'))
# Überprüfen, ob das Ergebnis vorhanden ist
if not job.result_filename:
flash("Das Ergebnis ist noch nicht verfügbar.")
return redirect(url_for('auth.job_status'))
# Überprüfen, ob die Datei im angegebenen Pfad existiert
result_path = os.path.join(current_app.config['RESULT_FOLDER'], job.result_filename)
print(f"Versuche, Datei herunterzuladen von: {result_path}")
result_path = os.path.join(RESULT_FOLDER, job.result_filename)
if os.path.exists(result_path):
print("Datei existiert und wird zum Download bereitgestellt.")
return send_file(result_path, as_attachment=True)
else:
print("Datei nicht gefunden. Ergebnisverzeichnis oder Pfad prüfen.")
flash("Ergebnisdatei nicht gefunden.")
flash('Datei fehlt.')
return redirect(url_for('auth.job_status'))
@bp.route('/download_raw/<int:job_id>')
@login_required
def download_result_raw(job_id):
job = Job.query.filter_by(id=job_id, user_id=current_user.id).first_or_404()
if not job.result_filename_raw:
flash('Rohdaten nicht verfügbar.')
return redirect(url_for('auth.job_status'))
result_path = os.path.join(RESULT_FOLDER, job.result_filename_raw)
if os.path.exists(result_path):
return send_file(result_path, as_attachment=True)
flash('Datei fehlt.')
return redirect(url_for('auth.job_status'))
@bp.route('/delete_job/<int:job_id>', methods=['POST'])
@login_required
def delete_job(job_id):
job = Job.query.get_or_404(job_id)
if job.user_id != current_user.id:
flash("Sie haben keine Berechtigung, diesen Job zu löschen.")
return redirect(url_for('auth.job_status'))
job = Job.query.filter_by(id=job_id, user_id=current_user.id).first_or_404()
# Löschen der Upload-Datei
upload_path = os.path.join(current_app.config['UPLOAD_FOLDER'], job.filename)
upload_path = os.path.join(UPLOAD_FOLDER, job.filename)
if os.path.exists(upload_path):
os.remove(upload_path)
print(f"Upload-Datei gelöscht: {upload_path}")
else:
print(f"Upload-Datei nicht gefunden: {upload_path}")
# Löschen der Results-Datei, falls vorhanden
if job.result_filename:
result_path = os.path.join(current_app.config['RESULT_FOLDER'], job.result_filename)
print(f"Versuche Ergebnisdatei zu löschen: {result_path}")
result_path = os.path.join(RESULT_FOLDER, job.result_filename)
if os.path.exists(result_path):
try:
os.remove(result_path)
print(f"Ergebnisdatei gelöscht: {result_path}")
except Exception as e:
print(f"Fehler beim Löschen der Ergebnisdatei: {e}")
else:
print(f"Ergebnisdatei nicht gefunden im Pfad: {result_path}")
os.remove(result_path)
if job.result_filename_raw: # ✅ Raw auch löschen
raw_path = os.path.join(RESULT_FOLDER, job.result_filename_raw)
if os.path.exists(raw_path):
os.remove(raw_path)
# Job aus der Datenbank löschen
db.session.delete(job)
db.session.commit()
flash("Job erfolgreich gelöscht.")
flash('Job gelöscht.')
return redirect(url_for('auth.job_status'))
@bp.route('/job_status/<int:job_id>')
@login_required
def job_status_api(job_id):
job = Job.query.filter_by(id=job_id, user_id=current_user.id).first()
if not job:
return jsonify({'error': 'Not found'}), 404
return jsonify({
'id': job.id,
'status': job.status,
'result_filename': job.result_filename,
'result_filename_raw': getattr(job, 'result_filename_raw', None),
'scraper_job_id': getattr(job, 'scraper_job_id', None)
})
@bp.route('/resume_job/<int:job_id>', methods=['POST'])
@login_required
def resume_job(job_id):
job = Job.query.filter_by(id=job_id, user_id=current_user.id).first_or_404()
thread = threading.Thread(
target=process_file,
args=(job.filename, job.id, current_app._get_current_object())
)
thread.daemon = True
thread.start()
flash(f'Job #{job_id} wird fortgesetzt...')
return redirect(url_for('auth.job_status'))
# ── ADMIN ──────────────────────────────────────────
@bp.route('/admin', methods=['GET'])
@login_required
def admin_panel():
if not current_user.is_admin:
flash("Keine Berechtigung.")
return redirect(url_for('auth.job_status'))
users = User.query.all()
return render_template('admin_panel.html', users=users)
cfg = AppConfig.query.filter_by(key='allow_signup').first()
signup_allowed = cfg and cfg.value == 'true'
return render_template('admin_panel.html', users=users, signup_allowed=signup_allowed)
@bp.route('/admin/create_user', methods=['POST'])
@login_required
def create_user():
if not current_user.is_admin:
flash("Keine Berechtigung.")
return redirect(url_for('auth.admin_panel'))
username = request.form['username']
password = request.form['password']
is_admin = 'is_admin' in request.form # Checkbox für Adminrechte
hashed_password = generate_password_hash(password, method='sha256')
new_user = User(username=username, password=hashed_password, is_admin=is_admin)
password = generate_password_hash(request.form['password']) # ✅ Fix
is_admin = 'is_admin' in request.form
new_user = User(username=username, password=password, is_admin=is_admin)
db.session.add(new_user)
db.session.commit()
flash(f"Benutzer {username} wurde erstellt.")
flash(f'{username} erstellt.')
return redirect(url_for('auth.admin_panel'))
@bp.route('/admin/reset_password/<int:user_id>', methods=['POST'])
@login_required
def reset_password(user_id):
if not current_user.is_admin:
flash("Keine Berechtigung.")
return redirect(url_for('auth.admin_panel'))
user = User.query.get_or_404(user_id)
new_password = request.form['new_password']
user.password = generate_password_hash(new_password, method='sha256')
user.password = generate_password_hash(new_password) # ✅ Fix
db.session.commit()
flash(f"Passwort für Benutzer {user.username} wurde zurückgesetzt.")
flash(f'Passwort {user.username} zurückgesetzt.')
return redirect(url_for('auth.admin_panel'))
@bp.route('/admin/delete_user/<int:user_id>', methods=['POST'])
@login_required
def delete_user(user_id):
if not current_user.is_admin:
flash("Keine Berechtigung.")
return redirect(url_for('auth.admin_panel'))
user = User.query.get_or_404(user_id)
if user.is_admin:
flash("Administratoren können nicht gelöscht werden.")
flash('Admin nicht löschbar.')
return redirect(url_for('auth.admin_panel'))
db.session.delete(user)
db.session.commit()
flash(f"Benutzer {user.username} wurde gelöscht.")
flash(f'{user.username} gelöscht.')
return redirect(url_for('auth.admin_panel'))
@bp.route('/admin/toggle_signup', methods=['POST'])
@login_required
def toggle_signup():
if not current_user.is_admin:
return redirect(url_for('auth.admin_panel'))
cfg = AppConfig.query.filter_by(key='allow_signup').first()
if not cfg:
cfg = AppConfig(key='allow_signup', value='true')
db.session.add(cfg)
else:
cfg.value = 'false' if cfg.value == 'true' else 'true'
db.session.commit()
state = '✅ aktiviert' if cfg.value == 'true' else '🔒 deaktiviert'
flash(f'Registrierung {state}.')
return redirect(url_for('auth.admin_panel'))

View file

@ -47,4 +47,32 @@
<button type="submit" class="create-btn">Benutzer erstellen</button>
</form>
</div>
<div class="config-box">
<h3>⚙️ Einstellungen</h3>
<form action="{{ url_for('auth.toggle_signup') }}" method="POST">
<div class="toggle-row">
<span>Benutzer-Registrierung:</span>
{% if signup_allowed %}
<span class="badge badge-green">✅ Aktiv</span>
<button type="submit" class="btn-danger">🔒 Deaktivieren</button>
{% else %}
<span class="badge badge-red">🔒 Deaktiviert</span>
<button type="submit" class="btn-success">✅ Aktivieren</button>
{% endif %}
</div>
</form>
</div>
<style>
.config-box { background: #f8f9fa; border: 1px solid #ddd; border-radius: 6px; padding: 16px; margin-bottom: 24px; }
.toggle-row { display: flex; align-items: center; gap: 12px; }
.badge { padding: 3px 10px; border-radius: 12px; font-size: 0.85em; font-weight: bold; }
.badge-green { background: #d4edda; color: #155724; }
.badge-red { background: #f8d7da; color: #721c24; }
.btn-danger { background: #e74c3c; color: white; border: none; padding: 6px 14px; border-radius: 4px; cursor: pointer; }
.btn-success { background: #27ae60; color: white; border: none; padding: 6px 14px; border-radius: 4px; cursor: pointer; }
.btn-danger:hover { background: #c0392b; }
.btn-success:hover { background: #1e8449; }
</style>
{% endblock %}

121
app/templates/jobs.bck Normal file
View file

@ -0,0 +1,121 @@
{% extends "base.html" %}
{% block content %}
<div class="table-container">
<h2>Ihre Aufträge</h2>
<table id="jobs-table">
<thead>
<tr>
<th>Dateiname</th>
<th>Status</th>
<th>Erstellt am</th>
<th>Ergebnis</th>
<th>Aktionen</th>
</tr>
</thead>
<tbody>
{% for job in jobs %}
<tr id="job-row-{{ job.id }}">
<td>{{ job.filename }}</td>
<td id="status-{{ job.id }}" class="job-status">{{ job.status }}</td>
<td>{{ job.created_at.strftime('%Y-%m-%d %H:%M:%S') }}</td>
<td id="result-{{ job.id }}">
{% if job.result_filename and 'Failed' not in job.status %}
<a href="{{ url_for('auth.download_result', job_id=job.id) }}" class="dl-btn">
🎯 Gefiltert
</a>
{% if job.result_filename_raw %}
&nbsp;
<a href="{{ url_for('auth.download_result_raw', job_id=job.id) }}" class="dl-btn dl-btn-raw">
📋 Alle
</a>
{% endif %}
{% elif 'Failed' in job.status %}
<span class="status-failed">❌ {{ job.result_filename or 'Fehler' }}</span>
{% else %}
<span class="status-pending">⏳ Noch nicht verfügbar</span>
{% endif %}
</td>
<td>
<form action="{{ url_for('auth.delete_job', job_id=job.id) }}" method="POST" style="display:inline;">
<button type="submit" class="delete-btn">🗑️ Löschen</button>
</form>
</td>
</tr>
{% endfor %}
</tbody>
</table>
</div>
<style>
.job-status { font-weight: bold; }
.status-failed { color: #e74c3c; font-weight: bold; }
.status-pending { color: #888; }
.status-completed { color: #27ae60; }
.dl-btn {
display: inline-block;
padding: 4px 10px;
border-radius: 4px;
text-decoration: none;
font-size: 0.85em;
font-weight: bold;
background: #27ae60;
color: #fff;
margin: 2px 1px;
transition: background 0.2s;
}
.dl-btn:hover { background: #1e8449; }
.dl-btn-raw { background: #2980b9; }
.dl-btn-raw:hover { background: #1a5e8a; }
</style>
<script>
document.addEventListener('DOMContentLoaded', function () {
document.querySelectorAll('.job-status').forEach(function (cell) {
const jobId = cell.id.split('-')[1];
const status = cell.textContent.trim();
if (!status.includes('✅') && !status.includes('Failed')) {
pollJob(jobId);
}
});
});
function renderResult(resultCell, data) {
const hasFailed = data.status.includes('Failed');
const hasFiltered = data.result_filename && !hasFailed;
const hasRaw = data.result_filename_raw && !hasFailed;
if (hasFiltered) {
let html = `<a href="/download/${data.id}" class="dl-btn">🎯 Gefiltert</a>`;
if (hasRaw) {
html += ` <a href="/download_raw/${data.id}" class="dl-btn dl-btn-raw">📋 Alle</a>`;
}
resultCell.innerHTML = html;
} else if (hasFailed) {
resultCell.innerHTML = `<span class="status-failed">❌ ${data.result_filename || 'Fehler'}</span>`;
} else {
resultCell.innerHTML = `<span class="status-pending">⏳ Noch nicht verfügbar</span>`;
}
}
function pollJob(jobId) {
fetch(`/job_status/${jobId}`)
.then(r => r.json())
.then(data => {
const statusCell = document.getElementById(`status-${jobId}`);
const resultCell = document.getElementById(`result-${jobId}`);
statusCell.textContent = data.status;
renderResult(resultCell, data);
// Weiter pollen wenn noch nicht fertig
const done = data.status.includes('✅') || data.status.includes('Failed');
if (!done) {
setTimeout(() => pollJob(jobId), 5000);
}
})
.catch(() => setTimeout(() => pollJob(jobId), 10000));
}
</script>
{% endblock %}

View file

@ -15,20 +15,38 @@
</thead>
<tbody>
{% for job in jobs %}
<tr>
<tr id="job-row-{{ job.id }}">
<td>{{ job.filename }}</td>
<td class="job-status">{{ job.status }}</td>
<td id="status-{{ job.id }}" class="job-status">
{{ job.status }}
</td>
<td>{{ job.created_at.strftime('%Y-%m-%d %H:%M:%S') }}</td>
<td>
{% if job.status == "Completed" %}
<a href="{{ url_for('auth.download_result', job_id=job.id) }}">Download</a>
<td id="result-{{ job.id }}">
{% if job.result_filename and 'Failed' not in job.status %}
<a href="{{ url_for('auth.download_result', job_id=job.id) }}" class="dl-btn">
🎯 Gefiltert
</a>
{% if job.result_filename_raw %}
&nbsp;
<a href="{{ url_for('auth.download_result_raw', job_id=job.id) }}" class="dl-btn dl-btn-raw">
📋 Alle
</a>
{% endif %}
{% elif 'Failed' in job.status %}
<span class="status-failed">❌ {{ job.result_filename or 'Fehler' }}</span>
{% else %}
Noch nicht verfügbar
<span class="status-pending"> Noch nicht verfügbar</span>
{% endif %}
</td>
<td>
{% if 'Failed' in job.status %}
<!-- 🆕 Resume Button -->
<form action="{{ url_for('auth.resume_job', job_id=job.id) }}" method="POST" style="display:inline;">
<button type="submit" class="btn-resume">▶️ Resume</button>
</form>
{% endif %}
<form action="{{ url_for('auth.delete_job', job_id=job.id) }}" method="POST" style="display:inline;">
<button type="submit" class="delete-btn">Löschen</button>
<button type="submit" class="delete-btn">🗑️ Löschen</button>
</form>
</td>
</tr>
@ -37,25 +55,101 @@
</table>
</div>
<style>
.job-status { font-weight: bold; }
.status-failed { color: #e74c3c; font-weight: bold; }
.status-pending { color: #888; }
.eta-badge { display: inline-block; background: #eaf4ff; color: #1a6fa8;
border-radius: 10px; padding: 2px 8px; font-size: 0.82em;
font-weight: bold; margin-left: 6px; }
.dl-btn { display: inline-block; padding: 4px 10px; border-radius: 4px;
text-decoration: none; font-size: 0.85em; font-weight: bold;
background: #27ae60; color: #fff; margin: 2px 1px; transition: background 0.2s; }
.dl-btn:hover { background: #1e8449; }
.dl-btn-raw { background: #2980b9; }
.dl-btn-raw:hover { background: #1a5e8a; }
.btn-resume { background: #e67e22; color: white; border: none;
padding: 4px 10px; border-radius: 4px; cursor: pointer;
font-size: 0.85em; font-weight: bold; margin-right: 4px; }
.btn-resume:hover { background: #ca6f1e; }
</style>
<script>
// Periodische Aktualisierung des Jobstatus
setInterval(function() {
fetch('{{ url_for("auth.job_status") }}')
.then(response => response.text())
.then(html => {
const parser = new DOMParser();
const doc = parser.parseFromString(html, 'text/html');
const newRows = doc.querySelectorAll('#jobs-table tbody tr');
const currentRows = document.querySelectorAll('#jobs-table tbody tr');
// ETA Badge aus Status-String parsen
function parseStatus(status) {
const parts = status.split('|');
if (parts.length === 2) {
return `<span>${parts[0].trim()}</span>
<span class="eta-badge">${parts[1].trim()}</span>`;
}
return status;
}
newRows.forEach((newRow, index) => {
const newStatus = newRow.querySelector('.job-status').textContent;
currentRows[index].querySelector('.job-status').textContent = newStatus;
function renderResult(resultCell, data) {
const hasFailed = data.status.includes('Failed') || data.status.includes('❌');
const hasFiltered = data.result_filename && !hasFailed;
const hasRaw = data.result_filename_raw && !hasFailed;
const newResult = newRow.querySelector('td:nth-child(4)').innerHTML;
currentRows[index].querySelector('td:nth-child(4)').innerHTML = newResult;
});
});
}, 5000); // Aktualisierung alle 5 Sekunden
if (hasFiltered) {
let html = `<a href="/download/${data.id}" class="dl-btn">🎯 Gefiltert</a>`;
if (hasRaw) {
html += ` <a href="/download_raw/${data.id}" class="dl-btn dl-btn-raw">📋 Alle</a>`;
}
resultCell.innerHTML = html;
} else if (hasFailed) {
resultCell.innerHTML = `<span class="status-failed">❌ ${data.result_filename || 'Fehler'}</span>`;
} else {
resultCell.innerHTML = `<span class="status-pending">⏳ Noch nicht verfügbar</span>`;
}
}
function renderActions(row, data) {
const actionsCell = row.querySelector('td:last-child');
const hasFailed = data.status.includes('Failed');
let html = '';
if (hasFailed) {
html += `<form action="/resume_job/${data.id}" method="POST" style="display:inline;">
<button type="submit" class="btn-resume">▶️ Resume</button>
</form>`;
}
html += `<form action="/delete_job/${data.id}" method="POST" style="display:inline;">
<button type="submit" class="delete-btn">🗑️ Löschen</button>
</form>`;
actionsCell.innerHTML = html;
}
function pollJob(jobId) {
fetch(`/job_status/${jobId}`)
.then(r => r.json())
.then(data => {
const statusCell = document.getElementById(`status-${jobId}`);
const resultCell = document.getElementById(`result-${jobId}`);
const row = document.getElementById(`job-row-${jobId}`);
statusCell.innerHTML = parseStatus(data.status);
renderResult(resultCell, data);
renderActions(row, data);
const done = data.status.includes('✅') || data.status.includes('Failed') || data.status.includes('❌');
if (!done) {
setTimeout(() => pollJob(jobId), 5000);
}
})
.catch(() => setTimeout(() => pollJob(jobId), 10000));
}
document.addEventListener('DOMContentLoaded', function () {
document.querySelectorAll('.job-status').forEach(function (cell) {
const jobId = cell.id.split('-')[1];
const status = cell.textContent.trim();
cell.innerHTML = parseStatus(status);
if (!status.includes('✅') && !status.includes('Failed') && !status.includes('❌')) {
pollJob(jobId);
}
});
});
</script>
{% endblock %}

61
app/templates/jobs.orig Normal file
View file

@ -0,0 +1,61 @@
{% extends "base.html" %}
{% block content %}
<div class="table-container">
<h2>Ihre Aufträge</h2>
<table id="jobs-table">
<thead>
<tr>
<th>Dateiname</th>
<th>Status</th>
<th>Erstellt am</th>
<th>Ergebnis</th>
<th>Aktionen</th>
</tr>
</thead>
<tbody>
{% for job in jobs %}
<tr>
<td>{{ job.filename }}</td>
<td class="job-status">{{ job.status }}</td>
<td>{{ job.created_at.strftime('%Y-%m-%d %H:%M:%S') }}</td>
<td>
{% if job.status == "Completed" %}
<a href="{{ url_for('auth.download_result', job_id=job.id) }}">Download</a>
{% else %}
Noch nicht verfügbar
{% endif %}
</td>
<td>
<form action="{{ url_for('auth.delete_job', job_id=job.id) }}" method="POST" style="display:inline;">
<button type="submit" class="delete-btn">Löschen</button>
</form>
</td>
</tr>
{% endfor %}
</tbody>
</table>
</div>
<script>
// Periodische Aktualisierung des Jobstatus
setInterval(function() {
fetch('{{ url_for("auth.job_status") }}')
.then(response => response.text())
.then(html => {
const parser = new DOMParser();
const doc = parser.parseFromString(html, 'text/html');
const newRows = doc.querySelectorAll('#jobs-table tbody tr');
const currentRows = document.querySelectorAll('#jobs-table tbody tr');
newRows.forEach((newRow, index) => {
const newStatus = newRow.querySelector('.job-status').textContent;
currentRows[index].querySelector('.job-status').textContent = newStatus;
const newResult = newRow.querySelector('td:nth-child(4)').innerHTML;
currentRows[index].querySelector('td:nth-child(4)').innerHTML = newResult;
});
});
}, 5000); // Aktualisierung alle 5 Sekunden
</script>
{% endblock %}

316
app/webcrawler.bck02032026 Normal file
View file

@ -0,0 +1,316 @@
import os
import re
import pandas as pd
import requests
import time
import random
from io import StringIO
from app.models import db, Job
print("🆕 MODERN webcrawler LOADED!")
UPLOAD_FOLDER = '/app/uploads'
RESULT_FOLDER = '/app/results'
SCRAPER_URL = "http://gmaps-scraper:8080"
OUTPUT_COLS = ['title', 'category', 'address', 'open_hours', 'website', 'phone', 'link']
# ──────────────────────────────────────────────
# Hilfsfunktionen
# ──────────────────────────────────────────────
def get_batch_size(total_rows):
if total_rows < 50: return 10
elif total_rows < 200: return 10
elif total_rows < 500: return 5
else: return 5
def get_delay(total_rows):
if total_rows < 50: return (5, 10)
elif total_rows < 200: return (10, 20)
else: return (20, 40)
def is_blocked(data):
text = str(data).lower()
blocked = any(kw in text for kw in ['captcha', 'blocked', 'rate limit', 'too many', '429'])
if blocked:
print(f"🚫 BLOCKED: {str(data)[:100]}")
return blocked
def fix_encoding(text):
"""Kaputte ISO→UTF8 Zeichen reparieren (z.B. Industriestraße → Industriestraße)"""
if not isinstance(text, str):
return text
try:
return text.encode('latin-1').decode('utf-8')
except (UnicodeEncodeError, UnicodeDecodeError):
return text
def build_input_addresses(df):
"""Normalisierte Adressen aus Input-CSV für Abgleich"""
addresses = set()
for _, row in df.iterrows():
plz = str(row.get('PLZ', '')).strip()
stadt = str(row.get('Stadt', '')).strip()
str_ = str(row.get('Straße', '')).strip()
nr = str(row.get('Hausnummer', '')).strip()
zusatz = str(row.get('Zusatz', '')).strip()
full = f"{str_} {nr} {zusatz} {plz} {stadt}".lower().strip()
full = ' '.join(full.split())
addresses.add(full)
return addresses
def normalize_address(addr):
"""Output-Adresse normalisieren für Abgleich"""
if not isinstance(addr, str):
return ''
addr = fix_encoding(addr)
return ' '.join(addr.lower().strip().split())
def address_in_input(result_addr, input_addresses):
"""Prüft ob PLZ + Straßenname aus Result im Input vorkommen"""
norm = normalize_address(result_addr)
for inp_addr in input_addresses:
plz_match = re.search(r'\b\d{5}\b', inp_addr)
if plz_match:
plz = plz_match.group()
if plz in norm:
street = inp_addr.split()[0] if inp_addr else ''
if len(street) > 3 and street[:4].lower() in norm:
return True
return False
# ──────────────────────────────────────────────
# CSV Nachbearbeitung (apply_filter umschaltbar)
# ──────────────────────────────────────────────
def process_result_csv(raw_bytes, input_df, apply_filter=True):
"""
Raw CSV → bereinigt:
- Nur OUTPUT_COLS
- Encoding fix
- Optional: Input/Output Abgleich + Duplikate
"""
try:
content = raw_bytes.decode('utf-8', errors='replace')
df_out = pd.read_csv(StringIO(content))
print(f"📄 Raw result: {df_out.shape} | Columns: {list(df_out.columns)[:8]}")
# Spalten filtern
available = [c for c in OUTPUT_COLS if c in df_out.columns]
missing = [c for c in OUTPUT_COLS if c not in df_out.columns]
if missing:
print(f"⚠️ Fehlende Spalten: {missing}")
df_out = df_out[available]
# 🔤 Encoding fix
for col in df_out.columns:
df_out[col] = df_out[col].apply(fix_encoding)
print(f"🔤 Encoding fix: done")
if apply_filter:
# 📍 Input/Output Abgleich
input_addresses = build_input_addresses(input_df)
before = len(df_out)
df_out = df_out[
df_out['address'].apply(
lambda a: address_in_input(a, input_addresses)
)
]
print(f"📍 Adress-Filter: {before} → {len(df_out)} Zeilen")
# 🔁 Duplikate entfernen (immer, auch bei Raw)
before_dedup = len(df_out)
df_out = df_out.drop_duplicates(subset=['title', 'address'], keep='first')
print(f"🔁 Duplikate: {before_dedup} → {len(df_out)} Zeilen")
# Leere Titel entfernen
df_out = df_out.dropna(subset=['title'], how='all')
df_out = df_out[df_out['title'].str.strip().astype(bool)]
print(f"✅ Final ({'gefiltert' if apply_filter else 'alle'}): {df_out.shape}")
return df_out
except Exception as e:
print(f"💥 process_result_csv: {e}")
import traceback
traceback.print_exc()
return None
# ──────────────────────────────────────────────
# Haupt-Worker
# ──────────────────────────────────────────────
def process_file(filename, job_id, app):
print(f"🎯 {filename} Job#{job_id} START!")
with app.app_context():
job = Job.query.get(job_id)
if not job:
print("❌ Job missing")
return
try:
# 1⃣ CSV Parse
job.status = "📊 parsing CSV"
db.session.commit()
filepath = os.path.join(UPLOAD_FOLDER, filename)
print(f"📁 {filepath} | {os.path.getsize(filepath)}b")
df_input = pd.read_csv(filepath, sep=';', encoding='ISO-8859-1')
print(f"📊 {df_input.shape} | Columns: {list(df_input.columns)}")
queries = []
for _, row in df_input.iterrows():
parts = [
str(row.get('PLZ', '')).strip(),
str(row.get('Stadt', '')).strip(),
str(row.get('Straße', '')).strip(),
str(row.get('Hausnummer', '')).strip(),
str(row.get('Zusatz', '')).strip(),
]
q = f"Firmen {' '.join(p for p in parts if p and p != 'nan')}".strip()
if len(q) > 10:
queries.append(q)
total = len(queries)
print(f"🔍 {total} Queries | Samples: {queries[:3]}")
if not queries:
raise ValueError("Keine gültigen Adressen in CSV")
# 2⃣ Batch + Delay
batch_size = get_batch_size(total)
delay_min, delay_max = get_delay(total)
batch = queries[:batch_size]
pre_delay = random.uniform(delay_min, delay_max)
print(f"📦 Batch {len(batch)}/{total} | 😴 {pre_delay:.1f}s Delay")
time.sleep(pre_delay)
# 3⃣ API Call
job.status = "📤 sending to scraper"
db.session.commit()
payload = {
"name": f"{filename.replace('.csv','')}-{job_id}",
"keywords": batch,
"lang": "de",
"depth": 1,
"zoom": 17,
"radius": 50,
"max_time": 60,
"fast_mode": False
}
print(f"🌐 POST {SCRAPER_URL}/api/v1/jobs | {payload['name']}")
resp = requests.post(f"{SCRAPER_URL}/api/v1/jobs", json=payload, timeout=30)
print(f"📤 {resp.status_code}: {resp.text[:300]}")
if is_blocked(resp.text):
raise ValueError("🚫 IP geblockt! Proxy konfigurieren.")
if resp.status_code != 201:
raise ValueError(f"API {resp.status_code}: {resp.text[:200]}")
# 4⃣ Polling
scraper_id = resp.json()['id']
job.scraper_job_id = scraper_id
job.status = "⏳ scraping"
db.session.commit()
print(f"✅ Scraper Job: {scraper_id}")
for i in range(1, 61): # Max 10min
try:
r = requests.get(
f"{SCRAPER_URL}/api/v1/jobs/{scraper_id}",
timeout=10
)
data = r.json()
status = data.get('Status', data.get('status', '?'))
print(f"⏳ {i}/60: {status}")
if is_blocked(data):
raise ValueError("🚫 IP geblockt während scraping!")
if status in ('ok', 'completed', 'scraped'):
dl = requests.get(
f"{SCRAPER_URL}/api/v1/jobs/{scraper_id}/download",
timeout=60
)
if dl.status_code != 200:
raise ValueError(f"Download {dl.status_code}")
if is_blocked(dl.text[:200]):
raise ValueError("🚫 IP geblockt beim Download!")
# 5⃣ Nachbearbeitung → zwei Versionen
job.status = "🔧 processing result"
db.session.commit()
base = filename.replace('.csv', '')
os.makedirs(RESULT_FOLDER, exist_ok=True)
# ── Version A: Gefiltert (Adressabgleich + Deduplizierung) ──
df_filtered = process_result_csv(dl.content, df_input, apply_filter=True)
outname_filtered = f"results_{base}_filtered.csv"
outpath_filtered = os.path.join(RESULT_FOLDER, outname_filtered)
if df_filtered is not None and len(df_filtered) > 0:
df_filtered.to_csv(
outpath_filtered, index=False,
encoding='utf-8-sig', sep=';'
)
print(f"🎯 Filtered: {outname_filtered} → {len(df_filtered)} Firmen")
else:
print("⚠️ Keine Treffer nach Filter leere Datei wird erstellt")
pd.DataFrame(columns=OUTPUT_COLS).to_csv(
outpath_filtered, index=False,
encoding='utf-8-sig', sep=';'
)
# ── Version B: Alle (nur Spalten + Encoding, kein Filter) ──
df_raw = process_result_csv(dl.content, df_input, apply_filter=False)
outname_raw = f"results_{base}_all.csv"
outpath_raw = os.path.join(RESULT_FOLDER, outname_raw)
if df_raw is not None:
df_raw.to_csv(
outpath_raw, index=False,
encoding='utf-8-sig', sep=';'
)
print(f"📋 All: {outname_raw} → {len(df_raw)} Firmen")
else:
print("⚠️ df_raw None Rohinhalt wird gespeichert")
with open(outpath_raw, 'wb') as f:
f.write(dl.content)
# ── DB speichern ──
job.status = "✅ Fertig"
job.result_filename = outname_filtered # 🎯 Gefiltert
job.result_filename_raw = outname_raw # 📋 Alle
db.session.commit()
print(f"🎉 Beide Dateien gespeichert!")
break
elif status in ('failed', 'cancelled', 'error'):
raise ValueError(f"Scraper: {status}")
except requests.RequestException as e:
print(f"⚠️ Poll {i}: {e}")
time.sleep(random.uniform(8, 15))
else:
raise ValueError("Timeout nach 10min")
except Exception as e:
job.status = "Failed"
job.result_filename = str(e)
print(f"💥 ERROR: {e}")
import traceback
traceback.print_exc()
db.session.commit()
print(f"✅ DONE! Status: {job.status}\n")

275
app/webcrawler.bck04032026 Normal file
View file

@ -0,0 +1,275 @@
import os
import re
import pandas as pd
import requests
import time
import random
from io import StringIO
from app.models import db, Job
print("🆕 MODERN webcrawler LOADED! BATCHED + PROXY")
UPLOAD_FOLDER = '/app/uploads'
RESULT_FOLDER = '/app/results'
SCRAPER_URL = "http://gmaps-scraper:8080"
OUTPUT_COLS = ['title', 'category', 'address', 'open_hours', 'website', 'phone', 'link']
PROXY_URL = "http://bitlleuv-rotate:s5hzse6hz74b@p.webshare.io:80"
API_PROXIES = {"http": PROXY_URL, "https": PROXY_URL}
# ──────────────────────────────────────────────
# Hilfsfunktionen
# ──────────────────────────────────────────────
def is_blocked(data):
text = str(data).lower()
blocked = any(kw in text for kw in ['captcha', 'blocked', 'rate limit', 'too many', '429'])
if blocked:
print(f"🚫 BLOCKED: {str(data)[:100]}")
return blocked
def fix_encoding(text):
if not isinstance(text, str):
return text
try:
return text.encode('latin-1').decode('utf-8')
except (UnicodeEncodeError, UnicodeDecodeError):
return text
def build_input_addresses(df):
addresses = set()
for _, row in df.iterrows():
plz = str(row.get('PLZ', '')).strip()
stadt = str(row.get('Stadt', '')).strip()
str_ = str(row.get('Straße', '')).strip()
nr = str(row.get('Hausnummer', '')).strip()
zusatz = str(row.get('Zusatz', '')).strip()
full = f"{str_} {nr} {zusatz} {plz} {stadt}".lower().strip()
full = ' '.join(full.split())
addresses.add(full)
return addresses
def normalize_address(addr):
if not isinstance(addr, str):
return ''
addr = fix_encoding(addr)
return ' '.join(addr.lower().strip().split())
def address_in_input(result_addr, input_addresses):
norm = normalize_address(result_addr)
for inp_addr in input_addresses:
plz_match = re.search(r'\b\d{5}\b', inp_addr)
if plz_match:
plz = plz_match.group()
if plz in norm:
street = inp_addr.split()[0] if inp_addr else ''
if len(street) > 3 and street[:4].lower() in norm:
return True
return False
# ──────────────────────────────────────────────
# CSV Nachbearbeitung
# ──────────────────────────────────────────────
def process_result_csv(raw_bytes, input_df, apply_filter=True):
try:
content = raw_bytes.decode('utf-8', errors='replace')
df_out = pd.read_csv(StringIO(content))
print(f"📄 Raw result: {df_out.shape}")
available = [c for c in OUTPUT_COLS if c in df_out.columns]
df_out = df_out[available]
for col in df_out.columns:
df_out[col] = df_out[col].apply(fix_encoding)
if apply_filter:
input_addresses = build_input_addresses(input_df)
before = len(df_out)
df_out = df_out[
df_out['address'].apply(
lambda a: address_in_input(a, input_addresses)
)
]
print(f"📍 Filter: {before} → {len(df_out)}")
df_out = df_out.drop_duplicates(subset=['title', 'address'], keep='first')
df_out = df_out.dropna(subset=['title'], how='all')
df_out = df_out[df_out['title'].str.strip().astype(bool)]
print(f"✅ Final ({'gefiltert' if apply_filter else 'alle'}): {df_out.shape}")
return df_out
except Exception as e:
print(f"💥 process_result_csv: {e}")
return None
# ──────────────────────────────────────────────
# HAUPT-WORKER
# ──────────────────────────────────────────────
def process_file(filename, job_id, app):
print(f"🎯 {filename} Job#{job_id} START!")
with app.app_context():
job = Job.query.get(job_id)
if not job:
print("❌ Job missing")
return
try:
#Parse + ALLE Queries
job.status = "📊 parsing CSV"
db.session.commit()
filepath = os.path.join(UPLOAD_FOLDER, filename)
print(f"📁 {filepath} | {os.path.getsize(filepath)}b")
df_input = pd.read_csv(filepath, sep=';', encoding='ISO-8859-1')
print(f"📊 {df_input.shape}")
queries = []
for _, row in df_input.iterrows():
parts = [
str(row.get('PLZ', '')).strip(),
str(row.get('Stadt', '')).strip(),
str(row.get('Straße', '')).strip(),
str(row.get('Hausnummer', '')).strip(),
str(row.get('Zusatz', '')).strip(),
]
q = f"Firmen {' '.join(p for p in parts if p and p != 'nan')}".strip()
if len(q) > 10:
queries.append(q)
total_queries = len(queries)
print(f"🔍 {total_queries} Queries | Samples: {queries[:3]}")
if total_queries == 0:
raise ValueError("Keine gültigen Adressen")
#BATCHED Processing
BATCH_SIZE = 10 # Erhöht: 5 → 10 (paid proxy)
BATCH_DELAY_MIN, BATCH_DELAY_MAX = 10, 20 # Reduziert: 30-60s → 10-20s (paid proxy)
batches = (total_queries + BATCH_SIZE - 1) // BATCH_SIZE
print(f"📦 {batches} Batches à {BATCH_SIZE} | ETA: ~{batches*15//60:.0f}h")
all_results_filtered = []
all_results_raw = []
job.status = f"🔄 Batch 1/{batches}"
db.session.commit()
for batch_idx in range(batches):
batch_start = batch_idx * BATCH_SIZE
batch_end = min(batch_start + BATCH_SIZE, total_queries)
batch_queries = queries[batch_start:batch_end]
print(f"\n🔄 BATCH {batch_idx+1}/{batches} ({batch_start+1}-{batch_end}/{total_queries})")
#Random Delay
delay = random.uniform(BATCH_DELAY_MIN, BATCH_DELAY_MAX)
print(f"😴 Delay: {delay:.0f}s | Proxy: {PROXY_URL}")
time.sleep(delay)
#API Call
payload = {
"name": f"{filename.replace('.csv','')}-{job_id}-B{batch_idx+1:03d}",
"keywords": batch_queries,
"lang": "de",
"depth": 1,
"zoom": 17,
"radius": 50,
"max_time": 60, # Reduziert: 120 → 60 (paid proxy schneller)
"fast_mode": False,
"proxies": [PROXY_URL]
}
try:
resp = requests.post(
f"{SCRAPER_URL}/api/v1/jobs",
json=payload,
timeout=45
)
print(f"📤 {resp.status_code}")
if is_blocked(resp.text):
print("🚫 Batch übersprungen (blocked)")
continue
if resp.status_code != 201:
print(f"⚠️ Batch {batch_idx+1} fehlgeschlagen: {resp.text[:100]}")
continue
scraper_id = resp.json()['id']
print(f"✅ Scraper: {scraper_id}")
for poll_i in range(1, 61): # Reduziert: 121 → 61 (max_time 60s)
r = requests.get(
f"{SCRAPER_URL}/api/v1/jobs/{scraper_id}",
timeout=15
)
data = r.json()
status = data.get('Status', data.get('status', '?'))
if status in ('ok', 'completed', 'scraped'):
dl = requests.get(
f"{SCRAPER_URL}/api/v1/jobs/{scraper_id}/download",
timeout=90
)
if dl.status_code == 200:
df_filtered = process_result_csv(dl.content, df_input, True)
df_raw = process_result_csv(dl.content, df_input, False)
if df_filtered is not None:
all_results_filtered.append(df_filtered)
all_results_raw.append(df_raw)
print(f"📊 Batch {batch_idx+1}: {len(df_filtered)} filtered")
break
elif status in ('failed', 'error'):
print(f"💥 Batch {batch_idx+1}: {status}")
break
time.sleep(random.uniform(5, 10)) # Reduziert: 10-20s → 5-10s (paid proxy)
except Exception as e:
print(f"💥 Batch {batch_idx+1}: {e}")
job.status = f"🔄 Batch {batch_idx+2}/{batches}"
db.session.commit()
#MERGE & SAVE
job.status = "🔧 merging results"
db.session.commit()
base = filename.replace('.csv', '')
os.makedirs(RESULT_FOLDER, exist_ok=True)
if all_results_filtered:
df_final_filtered = pd.concat(all_results_filtered, ignore_index=True)
df_final_filtered = df_final_filtered.drop_duplicates(subset=['title', 'address'])
out_filtered = f"results_{base}_filtered.csv"
df_final_filtered.to_csv(
os.path.join(RESULT_FOLDER, out_filtered),
index=False, encoding='utf-8-sig', sep=';'
)
if all_results_raw:
df_final_raw = pd.concat(all_results_raw, ignore_index=True)
out_raw = f"results_{base}_all.csv"
df_final_raw.to_csv(
os.path.join(RESULT_FOLDER, out_raw),
index=False, encoding='utf-8-sig', sep=';'
)
job.result_filename = out_filtered
job.result_filename_raw = out_raw
job.status = f"✅ Fertig: {len(df_final_filtered)} Firmen"
else:
job.status = "❌ Keine Ergebnisse"
db.session.commit()
print(f"🎉 Job {job_id} komplett!")
except Exception as e:
job.status = f"Failed: {str(e)[:50]}"
print(f"💥 FATAL: {e}")
import traceback
traceback.print_exc()
db.session.commit()
print(f"✅ DONE! Status: {job.status}")

View file

@ -0,0 +1,429 @@
import os
import re
import unicodedata
import json
import pandas as pd
import requests
import time
import random
from io import StringIO
from app.models import db, Job
print("🆕 MODERN webcrawler LOADED! BATCHED + PROXY + RESUME + ETA + 2x SCRAPER")
UPLOAD_FOLDER = '/app/uploads'
RESULT_FOLDER = '/app/results'
# 2x Scraper abwechselnd genutzt
SCRAPER_URLS = [
"http://gmaps-scraper-1:8080",
"http://gmaps-scraper-2:8080",
]
OUTPUT_COLS = ['title', 'category', 'address', 'open_hours', 'website', 'phone', 'link']
PROXY_URL = "http://bitlleuv-rotate:s5hzse6hz74b@p.webshare.io:80"
API_PROXIES = {"http": PROXY_URL, "https": PROXY_URL}
# ──────────────────────────────────────────────
# Tuning
# ──────────────────────────────────────────────
BATCH_SIZE = 30 # Keywords pro Scraper-Job
BATCH_DELAY_MIN = 3 # Sekunden Pause zwischen Batches (min)
BATCH_DELAY_MAX = 6 # Sekunden Pause zwischen Batches (max)
MAX_TIME = 60 # Sekunden die der Scraper pro Batch hat
POLL_MAX = 90 # Max. Poll-Versuche pro Batch
POLL_DELAY_MIN = 2 # Sekunden zwischen Polls (min)
POLL_DELAY_MAX = 5 # Sekunden zwischen Polls (max)
STUCK_THRESHOLD = 8 # Polls auf 'pending' bis Auto-Restart
MAX_RETRIES = 2 # Wiederholversuche pro Batch bei Fehler
# ──────────────────────────────────────────────
# Hilfsfunktionen
# ──────────────────────────────────────────────
def is_blocked(data):
text = str(data).lower()
blocked = any(kw in text for kw in ['captcha', 'blocked', 'rate limit', 'too many', '429'])
if blocked:
print(f"🚫 BLOCKED: {str(data)[:100]}")
return blocked
def fix_encoding(text):
if not isinstance(text, str):
return text
try:
return text.encode('latin-1').decode('utf-8')
except (UnicodeEncodeError, UnicodeDecodeError):
return text
# Fix 1: Sonderzeichen in Queries bereinigen
def clean_query(q):
"""Steuerzeichen + fehlerhafte Bytes entfernen für saubere Google Maps URLs"""
q = ''.join(c for c in q if unicodedata.category(c) != 'Cc')
q = ' '.join(q.split())
return q.strip()
def build_input_addresses(df):
addresses = set()
for _, row in df.iterrows():
plz = str(row.get('PLZ', '')).strip()
stadt = str(row.get('Stadt', '')).strip()
str_ = str(row.get('Straße', '')).strip()
nr = str(row.get('Hausnummer', '')).strip()
zusatz = str(row.get('Zusatz', '')).strip()
full = f"{str_} {nr} {zusatz} {plz} {stadt}".lower().strip()
full = ' '.join(full.split())
addresses.add(full)
return addresses
def normalize_address(addr):
if not isinstance(addr, str):
return ''
addr = fix_encoding(addr)
return ' '.join(addr.lower().strip().split())
def address_in_input(result_addr, input_addresses):
norm = normalize_address(result_addr)
for inp_addr in input_addresses:
plz_match = re.search(r'\b\d{5}\b', inp_addr)
if plz_match:
plz = plz_match.group()
if plz in norm:
street = inp_addr.split()[0] if inp_addr else ''
if len(street) > 3 and street[:4].lower() in norm:
return True
return False
def format_eta(seconds):
"""Sekunden → lesbares ETA-Format"""
if seconds < 60:
return f"{int(seconds)}s"
h, rem = divmod(int(seconds), 3600)
m = rem // 60
return f"{h}h {m:02d}min" if h > 0 else f"{m}min"
# ──────────────────────────────────────────────
# Fix 3: Scraper-Neustart bei Inactivity
# ──────────────────────────────────────────────
def restart_scraper(scraper_url):
"""Den betroffenen Scraper-Container neu starten"""
try:
import subprocess
# Container-Name aus URL ableiten: http://gmaps-scraper-1:8080 → gmaps-scraper-1
container = scraper_url.split("//")[1].split(":")[0]
print(f"🔄 Starte {container} neu...")
subprocess.run(["docker", "restart", container], timeout=30, capture_output=True)
print(f"✅ {container} neu gestartet warte 15s...")
time.sleep(15)
return True
except Exception as e:
print(f"⚠️ Scraper-Neustart fehlgeschlagen: {e}")
return False
# ──────────────────────────────────────────────
# Resume: Progress-File Hilfsfunktionen
# ──────────────────────────────────────────────
def get_progress_path(job_id):
return os.path.join(RESULT_FOLDER, f"progress_{job_id}.json")
def get_partial_path(job_id, suffix):
return os.path.join(RESULT_FOLDER, f"partial_{job_id}_{suffix}.csv")
def load_progress(job_id):
"""Gespeicherten Fortschritt laden (falls vorhanden)"""
path = get_progress_path(job_id)
if os.path.exists(path):
with open(path, 'r') as f:
data = json.load(f)
print(f"🔁 RESUME: ab Batch {data['last_completed_batch'] + 1}/{data['total_batches']}")
return data
return None
def save_progress(job_id, last_completed_batch, total_batches):
"""Fortschritt nach jedem Batch speichern"""
path = get_progress_path(job_id)
with open(path, 'w') as f:
json.dump({"last_completed_batch": last_completed_batch, "total_batches": total_batches}, f)
def append_partial(job_id, df_filtered, df_raw):
"""Batch-Ergebnis an Partial-CSV anhängen"""
for suffix, df in [('filtered', df_filtered), ('raw', df_raw)]:
if df is None:
continue
path = get_partial_path(job_id, suffix)
header = not os.path.exists(path)
df.to_csv(path, mode='a', index=False, header=header, encoding='utf-8-sig', sep=';')
def load_partial(job_id):
"""Bestehende Partial-CSVs laden"""
results_filtered, results_raw = [], []
for suffix, lst in [('filtered', results_filtered), ('raw', results_raw)]:
path = get_partial_path(job_id, suffix)
if os.path.exists(path):
try:
df = pd.read_csv(path, sep=';', encoding='utf-8-sig')
lst.append(df)
print(f"📂 Partial {suffix}: {len(df)} Zeilen geladen")
except Exception as e:
print(f"⚠️ Partial {suffix} Ladefehler: {e}")
return results_filtered, results_raw
def cleanup_progress(job_id):
"""Progress + Partial-Files nach Abschluss löschen"""
for path in [
get_progress_path(job_id),
get_partial_path(job_id, 'filtered'),
get_partial_path(job_id, 'raw'),
]:
if os.path.exists(path):
os.remove(path)
# ──────────────────────────────────────────────
# CSV Nachbearbeitung
# ──────────────────────────────────────────────
def process_result_csv(raw_bytes, input_df, apply_filter=True):
try:
content = raw_bytes.decode('utf-8', errors='replace')
df_out = pd.read_csv(StringIO(content))
print(f"📄 Raw result: {df_out.shape}")
available = [c for c in OUTPUT_COLS if c in df_out.columns]
df_out = df_out[available]
for col in df_out.columns:
df_out[col] = df_out[col].apply(fix_encoding)
if apply_filter:
input_addresses = build_input_addresses(input_df)
before = len(df_out)
df_out = df_out[
df_out['address'].apply(lambda a: address_in_input(a, input_addresses))
]
print(f"📍 Filter: {before} → {len(df_out)}")
df_out = df_out.drop_duplicates(subset=['title', 'address'], keep='first')
df_out = df_out.dropna(subset=['title'], how='all')
df_out = df_out[df_out['title'].str.strip().astype(bool)]
print(f"✅ Final ({'gefiltert' if apply_filter else 'alle'}): {df_out.shape}")
return df_out
except Exception as e:
print(f"💥 process_result_csv: {e}")
return None
# ──────────────────────────────────────────────
# HAUPT-WORKER
# ──────────────────────────────────────────────
def process_file(filename, job_id, app):
print(f"🎯 {filename} Job#{job_id} START!")
with app.app_context():
job = Job.query.get(job_id)
if not job:
print("❌ Job missing")
return
try:
#Parse + ALLE Queries
job.status = "📊 parsing CSV"
db.session.commit()
filepath = os.path.join(UPLOAD_FOLDER, filename)
print(f"📁 {filepath} | {os.path.getsize(filepath)}b")
df_input = pd.read_csv(filepath, sep=';', encoding='ISO-8859-1')
print(f"📊 {df_input.shape}")
queries = []
for _, row in df_input.iterrows():
parts = [
str(row.get('PLZ', '')).strip(),
str(row.get('Stadt', '')).strip(),
str(row.get('Straße', '')).strip(),
str(row.get('Hausnummer', '')).strip(),
str(row.get('Zusatz', '')).strip(),
]
q = f"Firmen {' '.join(p for p in parts if p and p != 'nan')}".strip()
q = clean_query(q) # Fix 1: Sonderzeichen bereinigen
if len(q) > 10:
queries.append(q)
total_queries = len(queries)
print(f"🔍 {total_queries} Queries | Samples: {queries[:3]}")
if total_queries == 0:
raise ValueError("Keine gültigen Adressen")
#BATCHED Processing
batches = (total_queries + BATCH_SIZE - 1) // BATCH_SIZE
# Resume: Fortschritt laden falls vorhanden
os.makedirs(RESULT_FOLDER, exist_ok=True)
progress = load_progress(job_id)
start_batch = progress['last_completed_batch'] + 1 if progress else 0
all_results_filtered, all_results_raw = load_partial(job_id) if progress else ([], [])
eta_initial = format_eta((batches - start_batch) * ((BATCH_DELAY_MAX + MAX_TIME) / 2))
print(f"📦 {batches} Batches à {BATCH_SIZE} | 2x Scraper | Start: {start_batch} | ETA: ~{eta_initial}")
job_start_time = time.time()
job.status = f"🔄 Batch {start_batch+1}/{batches} | ⏱️ ~{eta_initial}"
db.session.commit()
for batch_idx in range(start_batch, batches):
batch_start = batch_idx * BATCH_SIZE
batch_end = min(batch_start + BATCH_SIZE, total_queries)
batch_queries = queries[batch_start:batch_end]
# 2x Scraper: abwechselnd nutzen
scraper_url = SCRAPER_URLS[batch_idx % len(SCRAPER_URLS)]
print(f"\n🔄 BATCH {batch_idx+1}/{batches} ({batch_start+1}-{batch_end}/{total_queries}) → {scraper_url}")
#Random Delay
delay = random.uniform(BATCH_DELAY_MIN, BATCH_DELAY_MAX)
print(f"😴 Delay: {delay:.0f}s")
time.sleep(delay)
#API Call
payload = {
"name": f"{filename.replace('.csv','')}-{job_id}-B{batch_idx+1:03d}",
"keywords": batch_queries,
"lang": "de",
"depth": 1,
"zoom": 15,
"radius": 50,
"max_time": MAX_TIME,
"fast_mode": False,
"proxies": [PROXY_URL]
}
batch_success = False
# Fix 2: Retry-Logik bei Scraper-Fehler
for attempt in range(1, MAX_RETRIES + 1):
try:
resp = requests.post(f"{scraper_url}/api/v1/jobs", json=payload, timeout=45)
print(f"📤 {resp.status_code} (Versuch {attempt} | {scraper_url})")
if is_blocked(resp.text):
print("🚫 Batch übersprungen (blocked)")
break
if resp.status_code != 201:
print(f"⚠️ Batch {batch_idx+1} fehlgeschlagen: {resp.text[:100]}")
if attempt < MAX_RETRIES:
time.sleep(10)
continue
scraper_id = resp.json()['id']
print(f"✅ Scraper: {scraper_id}")
stuck_counter = 0
for poll_i in range(1, POLL_MAX + 1):
r = requests.get(f"{scraper_url}/api/v1/jobs/{scraper_id}", timeout=15)
data = r.json()
status = data.get('Status', data.get('status', '?'))
print(f"⏳ Poll {poll_i}: {status}")
# Fix 4: Auto-Recovery bei Pending-Stuck
if status == 'pending':
stuck_counter += 1
if stuck_counter >= STUCK_THRESHOLD:
print(f"⚠️ Job {scraper_id} hängt abbrechen + Neustart")
requests.delete(f"{scraper_url}/api/v1/jobs/{scraper_id}", timeout=10)
restart_scraper(scraper_url) # Fix 3: Nur betroffenen Scraper neu starten
break
else:
stuck_counter = 0
if status in ('ok', 'completed', 'scraped'):
dl = requests.get(f"{scraper_url}/api/v1/jobs/{scraper_id}/download", timeout=90)
if dl.status_code == 200:
df_filtered = process_result_csv(dl.content, df_input, True)
df_raw = process_result_csv(dl.content, df_input, False)
if df_filtered is not None:
all_results_filtered.append(df_filtered)
all_results_raw.append(df_raw)
append_partial(job_id, df_filtered, df_raw) # Resume: sofort speichern
print(f"📊 Batch {batch_idx+1}: {len(df_filtered)} filtered")
batch_success = True
break
# Fix 2: Scraper-Fehler → Retry
elif status in ('failed', 'error'):
print(f"💥 Batch {batch_idx+1}: {status} (Versuch {attempt})")
if attempt < MAX_RETRIES:
time.sleep(10)
break
time.sleep(random.uniform(POLL_DELAY_MIN, POLL_DELAY_MAX))
if batch_success:
break
except Exception as e:
print(f"💥 Batch {batch_idx+1} Versuch {attempt}: {e}")
if attempt < MAX_RETRIES:
time.sleep(10)
# Resume: Fortschritt nach jedem Batch speichern
save_progress(job_id, batch_idx, batches)
# ETA berechnen
elapsed = time.time() - job_start_time
done_so_far = batch_idx - start_batch + 1
if done_so_far > 0:
avg_per_batch = elapsed / done_so_far
remaining = (batches - batch_idx - 1) * avg_per_batch
eta_str = format_eta(remaining)
else:
eta_str = "?"
job.status = f"🔄 Batch {batch_idx+2}/{batches} | ⏱️ ~{eta_str}"
db.session.commit()
#MERGE & SAVE
job.status = "🔧 merging results"
db.session.commit()
base = filename.replace('.csv', '')
if all_results_filtered:
df_final_filtered = pd.concat(all_results_filtered, ignore_index=True)
df_final_filtered = df_final_filtered.drop_duplicates(subset=['title', 'address'])
out_filtered = f"results_{base}_filtered.csv"
df_final_filtered.to_csv(
os.path.join(RESULT_FOLDER, out_filtered),
index=False, encoding='utf-8-sig', sep=';'
)
if all_results_raw:
df_final_raw = pd.concat(all_results_raw, ignore_index=True)
out_raw = f"results_{base}_all.csv"
df_final_raw.to_csv(
os.path.join(RESULT_FOLDER, out_raw),
index=False, encoding='utf-8-sig', sep=';'
)
job.result_filename = out_filtered
job.result_filename_raw = out_raw
job.status = f"✅ Fertig: {len(df_final_filtered)} Firmen"
# Resume: Cleanup nach Abschluss
cleanup_progress(job_id)
else:
job.status = "❌ Keine Ergebnisse"
db.session.commit()
print(f"🎉 Job {job_id} komplett!")
except Exception as e:
job.status = f"Failed: {str(e)[:50]}"
print(f"💥 FATAL: {e}")
import traceback
traceback.print_exc()
db.session.commit()
print(f"✅ DONE! Status: {job.status}")

138
app/webcrawler.orig Normal file
View file

@ -0,0 +1,138 @@
import csv
import os
import requests
from .models import db, Job
from flask import current_app
UPLOAD_FOLDER = 'uploads'
RESULT_FOLDER = 'results'
API_KEY = 'AIzaSyAIf0yXJTwo87VMWLBtq2m2LqE-OaPGbzw'
processed_companies = set()
def get_geocode(address):
url = f"https://maps.googleapis.com/maps/api/geocode/json"
params = {'address': address, 'key': API_KEY}
try:
response = requests.get(url, params=params, timeout=5)
if response.status_code == 200:
data = response.json()
if data['status'] == 'OK':
location = data['results'][0]['geometry']['location']
return location['lat'], location['lng']
except requests.RequestException as e:
print(f"Geocode API Fehler für {address}: {e}")
return None, None
def get_nearby_places(lat, lng):
places_url = f"https://maps.googleapis.com/maps/api/place/nearbysearch/json"
params = {
'location': f"{lat},{lng}",
'radius': 10,
'type': 'point_of_interest',
'key': API_KEY
}
try:
response = requests.get(places_url, params=params, timeout=5)
if response.status_code == 200:
return response.json().get('results', [])
except requests.RequestException as e:
print(f"Nearby Places API Fehler für Standort {lat},{lng}: {e}")
return []
def get_place_details(place_id):
details_url = f"https://maps.googleapis.com/maps/api/place/details/json"
params = {
'place_id': place_id,
'fields': 'formatted_phone_number,website',
'key': API_KEY
}
try:
response = requests.get(details_url, params=params, timeout=5)
if response.status_code == 200:
result = response.json().get('result', {})
return result.get('formatted_phone_number', 'N/A'), result.get('website', 'N/A')
except requests.RequestException as e:
print(f"Place Details API Fehler für Place ID {place_id}: {e}")
return 'N/A', 'N/A'
def process_file(filename, job_id, app):
with app.app_context():
filepath = os.path.join(UPLOAD_FOLDER, filename)
results = []
job = Job.query.get(job_id)
if not job:
print("Job wurde abgebrochen.")
return
job.status = "In Progress"
db.session.commit()
with open(filepath, newline='', encoding='ISO-8859-1') as csvfile:
reader = csv.DictReader(csvfile, delimiter=';')
headers = reader.fieldnames
if not all(field in headers for field in ['PLZ', 'Straße', 'Hausnummer']):
print("CSV-Datei enthält nicht alle notwendigen Spalten.")
job.status = "Failed"
db.session.commit()
return
for row in reader:
plz = row.get('PLZ', '').strip()
city = row.get('Stadt', row.get('Bezirk', '')).strip()
street = row.get('Straße', '').strip()
house_number = row.get('Hausnummer', '').strip()
additional = row.get('Zusatz', '').strip()
if not all([plz, city, street, house_number]):
continue
full_address = f"{street} {house_number} {additional}, {plz} {city}"
lat, lng = get_geocode(full_address)
if lat is None or lng is None:
continue
nearby_places = get_nearby_places(lat, lng)
for place in nearby_places:
company_name = place['name']
if company_name in processed_companies:
continue
processed_companies.add(company_name)
company_address = place.get('vicinity', 'N/A').split(',')[0]
place_id = place.get('place_id')
company_phone, company_website = get_place_details(place_id) if place_id else ('N/A', 'N/A')
results.append({
'PLZ': plz,
'Stadt': city,
'Straße': street,
'Hausnummer': house_number,
'Zusatz': additional,
'Company Name': company_name,
'Company Address': company_address,
'Company Phone': company_phone,
'Company Website': company_website
})
if results:
result_file = f"results_{os.path.splitext(filename)[0]}.csv"
result_path = os.path.join(RESULT_FOLDER, result_file)
with open(result_path, 'w', newline='', encoding='utf-8-sig') as csvfile:
writer = csv.DictWriter(csvfile, fieldnames=[
'PLZ', 'Stadt', 'Straße', 'Hausnummer', 'Zusatz',
'Company Name', 'Company Address', 'Company Phone', 'Company Website'
])
writer.writeheader()
writer.writerows(results)
job.status = "Completed"
job.result_filename = result_file
db.session.commit()
else:
job.status = "Failed"
db.session.commit()

View file

@ -1,138 +1,487 @@
import csv
import os
import re
import unicodedata
import json
import threading
import pandas as pd
import requests
from .models import db, Job
from flask import current_app
import time
import random
from io import StringIO
from concurrent.futures import ThreadPoolExecutor, as_completed
from app.models import db, Job
UPLOAD_FOLDER = 'uploads'
RESULT_FOLDER = 'results'
print("🆕 MODERN webcrawler LOADED! BATCHED + PROXY + RESUME + ETA + 4x SCRAPER CHUNK-PARALLEL")
API_KEY = 'AIzaSyAIf0yXJTwo87VMWLBtq2m2LqE-OaPGbzw'
UPLOAD_FOLDER = '/app/uploads'
RESULT_FOLDER = '/app/results'
processed_companies = set()
SCRAPER_URLS = [
"http://gmaps-scraper-1:8080",
"http://gmaps-scraper-2:8080",
"http://gmaps-scraper-3:8080",
"http://gmaps-scraper-4:8080",
]
def get_geocode(address):
url = f"https://maps.googleapis.com/maps/api/geocode/json"
params = {'address': address, 'key': API_KEY}
OUTPUT_COLS = ['title', 'category', 'address', 'open_hours', 'website', 'phone', 'link']
PROXY_URL = "http://bitlleuv-rotate:s5hzse6hz74b@p.webshare.io:80"
API_PROXIES = {"http": PROXY_URL, "https": PROXY_URL}
_job_semaphore = threading.Semaphore(1)
# ──────────────────────────────────────────────
# Tuning
# ──────────────────────────────────────────────
BATCH_SIZE = 30 # Keywords pro Scraper-Job
BATCH_DELAY_MIN = 3 # Sekunden Pause zwischen Chunks (min)
BATCH_DELAY_MAX = 6 # Sekunden Pause zwischen Chunks (max)
MAX_TIME = 60 # Sekunden die der Scraper pro Batch hat
POLL_MAX = 90 # Max. Poll-Versuche pro Batch
POLL_DELAY_MIN = 2 # Sekunden zwischen Polls (min)
POLL_DELAY_MAX = 5 # Sekunden zwischen Polls (max)
STUCK_TIMEOUT = 300 # Sekunden bis Scraper-Neustart (5 Min)
MAX_RETRIES = 2 # Wiederholversuche pro Batch bei Fehler
PARALLEL_WORKERS = len(SCRAPER_URLS)
_partial_lock = threading.Lock()
# ──────────────────────────────────────────────
# Hilfsfunktionen
# ──────────────────────────────────────────────
def is_blocked(data):
text = str(data).lower()
blocked = any(kw in text for kw in ['captcha', 'blocked', 'rate limit', 'too many', '429'])
if blocked:
print(f"🚫 BLOCKED: {str(data)[:100]}")
return blocked
def fix_encoding(text):
if not isinstance(text, str):
return text
try:
response = requests.get(url, params=params, timeout=5)
if response.status_code == 200:
data = response.json()
if data['status'] == 'OK':
location = data['results'][0]['geometry']['location']
return location['lat'], location['lng']
except requests.RequestException as e:
print(f"Geocode API Fehler für {address}: {e}")
return text.encode('latin-1').decode('utf-8')
except (UnicodeEncodeError, UnicodeDecodeError):
return text
def clean_query(q):
q = ''.join(c for c in q if unicodedata.category(c) != 'Cc')
q = ' '.join(q.split())
return q.strip()
def build_input_addresses(df):
addresses = set()
for _, row in df.iterrows():
plz = str(row.get('PLZ', '')).strip()
stadt = str(row.get('Stadt', '')).strip()
str_ = str(row.get('Straße', '')).strip()
nr = str(row.get('Hausnummer', '')).strip()
zusatz = str(row.get('Zusatz', '')).strip()
full = f"{str_} {nr} {zusatz} {plz} {stadt}".lower().strip()
full = ' '.join(full.split())
addresses.add(full)
return addresses
def normalize_address(addr):
if not isinstance(addr, str):
return ''
addr = fix_encoding(addr)
return ' '.join(addr.lower().strip().split())
def address_in_input(result_addr, input_addresses):
norm = normalize_address(result_addr)
for inp_addr in input_addresses:
plz_match = re.search(r'\b\d{5}\b', inp_addr)
if not plz_match:
continue
plz = plz_match.group()
if plz not in norm:
continue
parts = inp_addr.split()
street = parts[0] if parts else ''
if len(street) < 4 or street[:5].lower() not in norm:
continue
hausnr = parts[1] if len(parts) > 1 else ''
if hausnr and not re.search(rf'\b{re.escape(hausnr)}\b', norm):
continue
return True
return False
def format_eta(seconds):
if seconds < 60:
return f"{int(seconds)}s"
h, rem = divmod(int(seconds), 3600)
m = rem // 60
return f"{h}h {m:02d}min" if h > 0 else f"{m}min"
# ──────────────────────────────────────────────
# Scraper-Job Cleanup
# ──────────────────────────────────────────────
def _cleanup_scraper_job(scraper_url, scraper_id):
"""Scraper-Job immer aufräumen wenn wir ihn nicht mehr brauchen"""
try:
requests.delete(f"{scraper_url}/api/v1/jobs/{scraper_id}", timeout=10)
print(f"🗑️ Scraper-Job {scraper_id} gelöscht")
except Exception as e:
print(f"⚠️ Cleanup fehlgeschlagen: {e}")
# ──────────────────────────────────────────────
# Scraper-Neustart via Docker SDK
# ──────────────────────────────────────────────
def restart_scraper(scraper_url):
try:
import docker
container_name = scraper_url.split("//")[1].split(":")[0]
print(f"🔄 Starte {container_name} neu...")
client = docker.from_env()
container = client.containers.get(container_name)
container.restart()
print(f"{container_name} neu gestartet warte 15s...")
time.sleep(15)
return True
except Exception as e:
print(f"⚠️ Scraper-Neustart fehlgeschlagen: {e}")
return False
# ──────────────────────────────────────────────
# Resume: Progress-File Hilfsfunktionen
# ──────────────────────────────────────────────
def get_progress_path(job_id):
return os.path.join(RESULT_FOLDER, f"progress_{job_id}.json")
def get_partial_path(job_id, suffix):
return os.path.join(RESULT_FOLDER, f"partial_{job_id}_{suffix}.csv")
def load_progress(job_id):
path = get_progress_path(job_id)
if os.path.exists(path):
with open(path, 'r') as f:
data = json.load(f)
print(f"🔁 RESUME: ab Batch {data['last_completed_batch'] + 1}/{data['total_batches']}")
return data
return None
def save_progress(job_id, last_completed_batch, total_batches):
path = get_progress_path(job_id)
with open(path, 'w') as f:
json.dump({"last_completed_batch": last_completed_batch, "total_batches": total_batches}, f)
def append_partial(job_id, df_filtered, df_raw):
with _partial_lock:
for suffix, df in [('filtered', df_filtered), ('raw', df_raw)]:
if df is None:
continue
path = get_partial_path(job_id, suffix)
header = not os.path.exists(path)
df.to_csv(path, mode='a', index=False, header=header, encoding='utf-8-sig', sep=';')
def load_partial(job_id):
results_filtered, results_raw = [], []
for suffix, lst in [('filtered', results_filtered), ('raw', results_raw)]:
path = get_partial_path(job_id, suffix)
if os.path.exists(path):
try:
df = pd.read_csv(path, sep=';', encoding='utf-8-sig')
lst.append(df)
print(f"📂 Partial {suffix}: {len(df)} Zeilen geladen")
except Exception as e:
print(f"⚠️ Partial {suffix} Ladefehler: {e}")
return results_filtered, results_raw
def cleanup_progress(job_id):
for path in [
get_progress_path(job_id),
get_partial_path(job_id, 'filtered'),
get_partial_path(job_id, 'raw'),
]:
if os.path.exists(path):
os.remove(path)
# ──────────────────────────────────────────────
# CSV Nachbearbeitung
# ──────────────────────────────────────────────
def process_result_csv(raw_bytes, input_df, apply_filter=True):
try:
content = raw_bytes.decode('utf-8', errors='replace')
df_out = pd.read_csv(StringIO(content))
print(f"📄 Raw result: {df_out.shape}")
available = [c for c in OUTPUT_COLS if c in df_out.columns]
df_out = df_out[available]
for col in df_out.columns:
df_out[col] = df_out[col].apply(fix_encoding)
if apply_filter:
input_addresses = build_input_addresses(input_df)
before = len(df_out)
df_out = df_out[
df_out['address'].apply(lambda a: address_in_input(a, input_addresses))
]
print(f"📍 Filter: {before}{len(df_out)}")
df_out = df_out.drop_duplicates(subset=['title', 'address'], keep='first')
df_out = df_out.dropna(subset=['title'], how='all')
df_out = df_out[df_out['title'].str.strip().astype(bool)]
print(f"✅ Final ({'gefiltert' if apply_filter else 'alle'}): {df_out.shape}")
return df_out
except Exception as e:
print(f"💥 process_result_csv: {e}")
return None
# ──────────────────────────────────────────────
# Parallel: Einzelnen Batch verarbeiten
# ──────────────────────────────────────────────
def process_batch(batch_idx, batch_queries, scraper_url, filename, job_id, df_input):
payload = {
"name": f"{filename.replace('.csv','')}-{job_id}-B{batch_idx+1:03d}",
"keywords": batch_queries,
"lang": "de",
"depth": 1,
"zoom": 17,
"radius": 100,
"max_time": MAX_TIME,
"fast_mode": False,
"proxies": [PROXY_URL]
}
for attempt in range(1, MAX_RETRIES + 1):
scraper_id = None
try:
resp = requests.post(f"{scraper_url}/api/v1/jobs", json=payload, timeout=45)
print(f"📤 Batch {batch_idx+1}{scraper_url} | {resp.status_code} (Versuch {attempt})")
if is_blocked(resp.text):
print(f"🚫 Batch {batch_idx+1} blocked")
return None, None
if resp.status_code != 201:
print(f"⚠️ Batch {batch_idx+1} fehlgeschlagen: {resp.text[:100]}")
if attempt < MAX_RETRIES:
time.sleep(10)
continue
scraper_id = resp.json()['id']
print(f"✅ Batch {batch_idx+1} Scraper-ID: {scraper_id}")
batch_start_time = time.time()
for poll_i in range(1, POLL_MAX + 1):
r = requests.get(f"{scraper_url}/api/v1/jobs/{scraper_id}", timeout=15)
data = r.json()
status = data.get('Status', data.get('status', '?'))
elapsed = time.time() - batch_start_time
print(f"⏳ Batch {batch_idx+1} Poll {poll_i}: {status} | {int(elapsed)}s")
if status == 'pending' and elapsed > STUCK_TIMEOUT:
print(f"⚠️ Batch {batch_idx+1} hängt seit {int(elapsed)}s Neustart {scraper_url}")
_cleanup_scraper_job(scraper_url, scraper_id)
scraper_id = None
restart_scraper(scraper_url)
break
if status in ('ok', 'completed', 'scraped'):
dl = requests.get(f"{scraper_url}/api/v1/jobs/{scraper_id}/download", timeout=90)
scraper_id = None
if dl.status_code == 200:
df_filtered = process_result_csv(dl.content, df_input, True)
df_raw = process_result_csv(dl.content, df_input, False)
print(f"📊 Batch {batch_idx+1}: {len(df_filtered) if df_filtered is not None else 0} filtered")
return df_filtered, df_raw
return None, None
elif status in ('failed', 'error'):
print(f"💥 Batch {batch_idx+1}: {status} (Versuch {attempt})")
_cleanup_scraper_job(scraper_url, scraper_id)
scraper_id = None
if attempt < MAX_RETRIES:
time.sleep(10)
break
time.sleep(random.uniform(POLL_DELAY_MIN, POLL_DELAY_MAX))
except Exception as e:
print(f"💥 Batch {batch_idx+1} Versuch {attempt}: {e}")
if scraper_id:
_cleanup_scraper_job(scraper_url, scraper_id)
scraper_id = None
if attempt < MAX_RETRIES:
time.sleep(10)
return None, None
def get_nearby_places(lat, lng):
places_url = f"https://maps.googleapis.com/maps/api/place/nearbysearch/json"
params = {
'location': f"{lat},{lng}",
'radius': 10,
'type': 'point_of_interest',
'key': API_KEY
}
try:
response = requests.get(places_url, params=params, timeout=5)
if response.status_code == 200:
return response.json().get('results', [])
except requests.RequestException as e:
print(f"Nearby Places API Fehler für Standort {lat},{lng}: {e}")
return []
def get_place_details(place_id):
details_url = f"https://maps.googleapis.com/maps/api/place/details/json"
params = {
'place_id': place_id,
'fields': 'formatted_phone_number,website',
'key': API_KEY
}
try:
response = requests.get(details_url, params=params, timeout=5)
if response.status_code == 200:
result = response.json().get('result', {})
return result.get('formatted_phone_number', 'N/A'), result.get('website', 'N/A')
except requests.RequestException as e:
print(f"Place Details API Fehler für Place ID {place_id}: {e}")
return 'N/A', 'N/A'
# ──────────────────────────────────────────────
# HAUPT-WORKER
# ──────────────────────────────────────────────
def process_file(filename, job_id, app):
with app.app_context():
filepath = os.path.join(UPLOAD_FOLDER, filename)
results = []
job = Job.query.get(job_id)
if not job:
print("Job wurde abgebrochen.")
return
job.status = "In Progress"
db.session.commit()
if job:
job.status = "⏳ Wartet auf anderen Job..."
db.session.commit()
with open(filepath, newline='', encoding='ISO-8859-1') as csvfile:
reader = csv.DictReader(csvfile, delimiter=';')
headers = reader.fieldnames
with _job_semaphore:
print(f"🎯 {filename} Job#{job_id} START!")
if not all(field in headers for field in ['PLZ', 'Straße', 'Hausnummer']):
print("CSV-Datei enthält nicht alle notwendigen Spalten.")
job.status = "Failed"
db.session.commit()
with app.app_context():
job = Job.query.get(job_id)
if not job:
print("❌ Job missing")
return
for row in reader:
plz = row.get('PLZ', '').strip()
city = row.get('Stadt', row.get('Bezirk', '')).strip()
street = row.get('Straße', '').strip()
house_number = row.get('Hausnummer', '').strip()
additional = row.get('Zusatz', '').strip()
try:
job.status = "📊 parsing CSV"
db.session.commit()
if not all([plz, city, street, house_number]):
continue
filepath = os.path.join(UPLOAD_FOLDER, filename)
print(f"📁 {filepath} | {os.path.getsize(filepath)}b")
full_address = f"{street} {house_number} {additional}, {plz} {city}"
lat, lng = get_geocode(full_address)
if lat is None or lng is None:
continue
df_input = pd.read_csv(filepath, sep=';', encoding='ISO-8859-1')
print(f"📊 {df_input.shape}")
nearby_places = get_nearby_places(lat, lng)
for place in nearby_places:
company_name = place['name']
if company_name in processed_companies:
continue
queries = []
for _, row in df_input.iterrows():
parts = [
str(row.get('PLZ', '')).strip(),
str(row.get('Stadt', '')).strip(),
str(row.get('Straße', '')).strip(),
str(row.get('Hausnummer', '')).strip(),
str(row.get('Zusatz', '')).strip(),
]
q = f"Firmen {' '.join(p for p in parts if p and p != 'nan')}".strip()
q = clean_query(q)
if len(q) > 10:
queries.append(q)
processed_companies.add(company_name)
company_address = place.get('vicinity', 'N/A').split(',')[0]
place_id = place.get('place_id')
company_phone, company_website = get_place_details(place_id) if place_id else ('N/A', 'N/A')
total_queries = len(queries)
print(f"🔍 {total_queries} Queries | Samples: {queries[:3]}")
if total_queries == 0:
raise ValueError("Keine gültigen Adressen")
results.append({
'PLZ': plz,
'Stadt': city,
'Straße': street,
'Hausnummer': house_number,
'Zusatz': additional,
'Company Name': company_name,
'Company Address': company_address,
'Company Phone': company_phone,
'Company Website': company_website
})
batches = (total_queries + BATCH_SIZE - 1) // BATCH_SIZE
if results:
result_file = f"results_{os.path.splitext(filename)[0]}.csv"
result_path = os.path.join(RESULT_FOLDER, result_file)
with open(result_path, 'w', newline='', encoding='utf-8-sig') as csvfile:
writer = csv.DictWriter(csvfile, fieldnames=[
'PLZ', 'Stadt', 'Straße', 'Hausnummer', 'Zusatz',
'Company Name', 'Company Address', 'Company Phone', 'Company Website'
])
writer.writeheader()
writer.writerows(results)
job.status = "Completed"
job.result_filename = result_file
db.session.commit()
else:
job.status = "Failed"
db.session.commit()
os.makedirs(RESULT_FOLDER, exist_ok=True)
progress = load_progress(job_id)
start_batch = progress['last_completed_batch'] + 1 if progress else 0
all_results_filtered, all_results_raw = load_partial(job_id) if progress else ([], [])
eta_initial = format_eta((batches - start_batch) * ((BATCH_DELAY_MAX + MAX_TIME) / 2) / PARALLEL_WORKERS)
print(f"📦 {batches} Batches à {BATCH_SIZE} | {PARALLEL_WORKERS}x parallel (Chunk) | Start: {start_batch} | ETA: ~{eta_initial}")
job_start_time = time.time()
job.status = f"🔄 Batch {start_batch+1}/{batches} | ⏱️ ~{eta_initial}"
db.session.commit()
completed_count = 0
batch_indices = list(range(start_batch, batches))
chunks = [
batch_indices[i:i + PARALLEL_WORKERS]
for i in range(0, len(batch_indices), PARALLEL_WORKERS)
]
with ThreadPoolExecutor(max_workers=PARALLEL_WORKERS) as executor:
for chunk_idx, chunk in enumerate(chunks):
futures = {}
for batch_idx in chunk:
batch_start_q = batch_idx * BATCH_SIZE
batch_end_q = min(batch_start_q + BATCH_SIZE, total_queries)
batch_queries = queries[batch_start_q:batch_end_q]
scraper_url = SCRAPER_URLS[batch_idx % len(SCRAPER_URLS)]
print(f"\n🚀 Chunk {chunk_idx+1} | Batch {batch_idx+1}/{batches}{scraper_url}")
time.sleep(random.uniform(1, 2))
future = executor.submit(
process_batch,
batch_idx, batch_queries, scraper_url,
filename, job_id, df_input
)
futures[future] = batch_idx
for future in as_completed(futures):
batch_idx = futures[future]
completed_count += 1
try:
df_filtered, df_raw = future.result()
if df_filtered is not None:
all_results_filtered.append(df_filtered)
all_results_raw.append(df_raw)
append_partial(job_id, df_filtered, df_raw)
except Exception as e:
print(f"💥 Batch {batch_idx+1} Exception: {e}")
save_progress(job_id, batch_idx, batches)
elapsed = time.time() - job_start_time
if completed_count > 0:
avg_per_batch = elapsed / completed_count
remaining = (batches - start_batch - completed_count) * avg_per_batch / PARALLEL_WORKERS
eta_str = format_eta(remaining)
else:
eta_str = "?"
job.status = f"🔄 {completed_count}/{batches - start_batch} fertig | ⏱️ ~{eta_str}"
db.session.commit()
if chunk_idx < len(chunks) - 1:
delay = random.uniform(BATCH_DELAY_MIN, BATCH_DELAY_MAX)
print(f"⏸️ Chunk {chunk_idx+1} fertig warte {delay:.1f}s...")
time.sleep(delay)
# ── MERGE & SAVE ──
job.status = "🔧 merging results"
db.session.commit()
base = filename.replace('.csv', '')
if all_results_filtered:
df_final_filtered = pd.concat(all_results_filtered, ignore_index=True)
df_final_filtered = df_final_filtered.drop_duplicates(subset=['title', 'address'])
out_filtered = f"results_{base}_filtered.csv"
df_final_filtered.to_csv(
os.path.join(RESULT_FOLDER, out_filtered),
index=False, encoding='utf-8-sig', sep=';'
)
out_raw = None
if all_results_raw:
df_final_raw = pd.concat(all_results_raw, ignore_index=True)
out_raw = f"results_{base}_all.csv"
df_final_raw.to_csv(
os.path.join(RESULT_FOLDER, out_raw),
index=False, encoding='utf-8-sig', sep=';'
)
job.result_filename = out_filtered
job.result_filename_raw = out_raw
job.status = f"✅ Fertig: {len(df_final_filtered)} Firmen"
cleanup_progress(job_id)
else:
job.status = "❌ Keine Ergebnisse"
db.session.commit()
print(f"🎉 Job {job_id} komplett!")
except Exception as e:
job.status = f"Failed: {str(e)[:50]}"
print(f"💥 FATAL: {e}")
import traceback
traceback.print_exc()
db.session.commit()
print(f"✅ DONE! Status: {job.status}")

21
delete-crawl-jobs.py Normal file
View file

@ -0,0 +1,21 @@
import requests
import time
base_url = "http://localhost:5001/api/v1/jobs"
response = requests.get(base_url)
jobs = response.json() # Direkt Array
print(f"{len(jobs)} Jobs gefunden.")
deleted = 0
for job in jobs:
job_id = job["ID"]
del_res = requests.delete(f"{base_url}/{job_id}")
if del_res.status_code in [200, 204]:
print(f"{job_id}")
deleted += 1
else:
print(f"{job_id}: {del_res.status_code}")
time.sleep(0.1)
print(f"{deleted}/{len(jobs)} gelöscht.")

View file

@ -1,4 +1,4 @@
version: '3'
version: '3.8'
services:
web:
build: .
@ -6,6 +6,114 @@ services:
- "5000:5000"
environment:
- FLASK_APP=app
command: flask run --host=0.0.0.0 --port=5000
- FLASK_ENV=production
- PYTHONUNBUFFERED=1
volumes:
- .:/app
- ./app:/app/app
- ./uploads:/app/uploads
- ./results:/app/results
- ./instance:/app/instance
- /var/run/docker.sock:/var/run/docker.sock
depends_on:
- gmaps-scraper-1
- gmaps-scraper-2
- gmaps-scraper-3
- gmaps-scraper-4
restart: always
networks:
- scraper-net
gmaps-scraper-1:
image: gosom/google-maps-scraper:latest
container_name: gmaps-scraper-1
environment:
- PLAYWRIGHT_BROWSERS_PATH=/ms-playwright
ports:
- "5001:8080"
volumes:
- ./scraper-data-1:/gmapsdata
command:
- "-web"
- "-data-folder=/gmapsdata"
restart: always
healthcheck:
test: ["CMD-SHELL", "wget -qO- http://localhost:8080/api/v1/jobs || exit 1"]
interval: 30s
timeout: 10s
retries: 3
start_period: 15s
networks:
- scraper-net
gmaps-scraper-2:
image: gosom/google-maps-scraper:latest
container_name: gmaps-scraper-2
environment:
- PLAYWRIGHT_BROWSERS_PATH=/ms-playwright
ports:
- "5002:8080"
volumes:
- ./scraper-data-2:/gmapsdata
command:
- "-web"
- "-data-folder=/gmapsdata"
restart: always
healthcheck:
test: ["CMD-SHELL", "wget -qO- http://localhost:8080/api/v1/jobs || exit 1"]
interval: 30s
timeout: 10s
retries: 3
start_period: 15s
networks:
- scraper-net
gmaps-scraper-3:
image: gosom/google-maps-scraper:latest
container_name: gmaps-scraper-3
environment:
- PLAYWRIGHT_BROWSERS_PATH=/ms-playwright
ports:
- "5003:8080"
volumes:
- ./scraper-data-3:/gmapsdata
command:
- "-web"
- "-data-folder=/gmapsdata"
restart: always
healthcheck:
test: ["CMD-SHELL", "wget -qO- http://localhost:8080/api/v1/jobs || exit 1"]
interval: 30s
timeout: 10s
retries: 3
start_period: 15s
networks:
- scraper-net
gmaps-scraper-4:
image: gosom/google-maps-scraper:latest
container_name: gmaps-scraper-4
environment:
- PLAYWRIGHT_BROWSERS_PATH=/ms-playwright
ports:
- "5004:8080"
volumes:
- ./scraper-data-4:/gmapsdata
command:
- "-web"
- "-data-folder=/gmapsdata"
restart: always
healthcheck:
test: ["CMD-SHELL", "wget -qO- http://localhost:8080/api/v1/jobs || exit 1"]
interval: 30s
timeout: 10s
retries: 3
start_period: 15s
networks:
- scraper-net
networks:
scraper-net:
driver: bridge

Binary file not shown.

View file

@ -1,8 +1,8 @@
Flask==2.2.5
Flask-Login==0.6.2
Flask-SQLAlchemy==3.0.3
Werkzeug==2.2.2
flask
flask-sqlalchemy
flask-login
flask-migrate
pandas
requests
beautifulsoup4
Flask-Migrate
werkzeug
docker

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

BIN
scraper-data-1/jobs.db Normal file

Binary file not shown.

BIN
scraper-data-1/jobs.db-shm Normal file

Binary file not shown.

BIN
scraper-data-1/jobs.db-wal Normal file

Binary file not shown.

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

Some files were not shown because too many files have changed in this diff Show more