Initial commit
This commit is contained in:
parent
387bc056b9
commit
df8c2313a9
275 changed files with 12939 additions and 263 deletions
|
|
@ -7,12 +7,14 @@ WORKDIR /app
|
||||||
# Abhängigkeiten installieren
|
# Abhängigkeiten installieren
|
||||||
COPY requirements.txt requirements.txt
|
COPY requirements.txt requirements.txt
|
||||||
RUN pip install --no-cache-dir -r requirements.txt
|
RUN pip install --no-cache-dir -r requirements.txt
|
||||||
|
RUN apt update
|
||||||
|
RUN apt install curl -y
|
||||||
|
|
||||||
# App-Dateien kopieren
|
# App-Dateien kopieren
|
||||||
COPY . .
|
COPY . .
|
||||||
|
|
||||||
# Flask Umgebungsvariable setzen
|
|
||||||
ENV FLASK_APP=app
|
ENV FLASK_APP=app
|
||||||
|
ENV FLASK_ENV=production
|
||||||
|
|
||||||
# Flask starten
|
EXPOSE 5000
|
||||||
CMD ["flask", "run", "--host=0.0.0.0", "--port=5000"]
|
CMD ["flask", "run", "--host=0.0.0.0", "--port=5000"]
|
||||||
|
|
|
||||||
|
|
@ -1,56 +1,88 @@
|
||||||
import os
|
import os
|
||||||
from flask import Flask, redirect, url_for, request
|
from flask import Flask, redirect, url_for, request, current_app
|
||||||
from flask_sqlalchemy import SQLAlchemy
|
from flask_sqlalchemy import SQLAlchemy
|
||||||
from flask_login import LoginManager, current_user
|
from flask_login import LoginManager, current_user
|
||||||
from flask_migrate import Migrate
|
from flask_migrate import Migrate
|
||||||
|
from sqlalchemy import text
|
||||||
|
|
||||||
# Konfiguration für Upload- und Ergebnis-Ordner
|
# ✅ Docker-Pfade
|
||||||
UPLOAD_FOLDER = '/app/uploads'
|
UPLOAD_FOLDER = '/app/uploads'
|
||||||
RESULT_FOLDER = '/app/results'
|
RESULT_FOLDER = '/app/results'
|
||||||
|
|
||||||
db = SQLAlchemy()
|
db = SQLAlchemy()
|
||||||
|
login_manager = LoginManager()
|
||||||
migrate = Migrate()
|
migrate = Migrate()
|
||||||
|
|
||||||
|
|
||||||
|
def _run_migrations(app):
    """Add columns that are missing from an already existing database.

    Issues one ``ALTER TABLE ... ADD COLUMN`` per entry in ``migrations``.
    The function is meant to be run on every start-up: when a column already
    exists the statement fails, the session is rolled back and the next entry
    is tried.  Any other failure is tolerated as well (start-up continues),
    but it is reported instead of being swallowed silently.

    Args:
        app: The Flask application whose app context is used for the session.
    """
    # (table, column, column type).  These are constants defined right here,
    # never user input, which is why formatting them into the SQL is safe.
    migrations = [
        ("job", "result_filename_raw", "VARCHAR(150)"),
        ("job", "scraper_job_id", "VARCHAR(255)"),
        ("user", "is_admin", "BOOLEAN DEFAULT 0"),
    ]
    with app.app_context():
        for table, column, col_type in migrations:
            try:
                db.session.execute(text(f"ALTER TABLE {table} ADD COLUMN {column} {col_type}"))
                db.session.commit()
                print(f"✅ Migration: {table}.{column} hinzugefügt")
            except Exception as exc:  # best effort: start-up must not crash here
                db.session.rollback()
                # SQLite reports an existing column as "duplicate column name";
                # that is the expected case and stays quiet.  Everything else
                # (locked database, missing table, typo in the type, ...) is
                # surfaced so it can be noticed.
                if "duplicate column" not in str(exc).lower():
                    print(f"⚠️ Migration: {table}.{column} fehlgeschlagen: {exc}")
|
||||||
|
|
||||||
|
|
||||||
def create_app():
|
def create_app():
|
||||||
app = Flask(__name__)
|
app = Flask(__name__)
|
||||||
|
|
||||||
|
# 🔑 Configs
|
||||||
app.config['SECRET_KEY'] = '008e7369b075886d5f494c8813efdfb17155da6af12b3fe8ee'
|
app.config['SECRET_KEY'] = '008e7369b075886d5f494c8813efdfb17155da6af12b3fe8ee'
|
||||||
app.config['SQLALCHEMY_DATABASE_URI'] = 'sqlite:///users.db'
|
app.config['SQLALCHEMY_DATABASE_URI'] = 'sqlite:///users.db'
|
||||||
|
app.config['SQLALCHEMY_TRACK_MODIFICATIONS'] = False
|
||||||
app.config['UPLOAD_FOLDER'] = UPLOAD_FOLDER
|
app.config['UPLOAD_FOLDER'] = UPLOAD_FOLDER
|
||||||
app.config['RESULT_FOLDER'] = RESULT_FOLDER
|
app.config['RESULT_FOLDER'] = RESULT_FOLDER
|
||||||
app.config['ALLOW_USER_SIGNUP'] = False
|
app.config['ALLOW_USER_SIGNUP'] = False
|
||||||
|
|
||||||
|
# DB + Tools
|
||||||
db.init_app(app)
|
db.init_app(app)
|
||||||
migrate.init_app(app, db)
|
migrate.init_app(app, db)
|
||||||
|
|
||||||
# Flask-Login Setup
|
|
||||||
login_manager = LoginManager()
|
|
||||||
login_manager.login_view = 'auth.login'
|
|
||||||
login_manager.init_app(app)
|
login_manager.init_app(app)
|
||||||
|
login_manager.login_view = 'auth.login'
|
||||||
|
|
||||||
|
# User Loader
|
||||||
@login_manager.user_loader
|
@login_manager.user_loader
|
||||||
def load_user(user_id):
|
def load_user(user_id):
|
||||||
from .models import User
|
from .models import User
|
||||||
return User.query.get(int(user_id))
|
return User.query.get(int(user_id))
|
||||||
|
|
||||||
# Umleitung nicht authentifizierter Benutzer, statische Dateien und bestimmte Routen ausnehmen
|
# Protected Routes
|
||||||
@app.before_request
|
@app.before_request
|
||||||
def require_login():
|
def require_login():
|
||||||
allowed_routes = ['auth.login', 'auth.signup']
|
allowed = ['auth.login', 'auth.signup', 'static']
|
||||||
if (not current_user.is_authenticated
|
if (not current_user.is_authenticated and
|
||||||
and request.endpoint not in allowed_routes
|
request.endpoint not in allowed and
|
||||||
and not request.path.startswith('/static/')):
|
not request.path.startswith('/static')):
|
||||||
return redirect(url_for('auth.login'))
|
return redirect(url_for('auth.login'))
|
||||||
|
|
||||||
# Erstellen Sie die Ordner, falls sie noch nicht existieren
|
# Ordner
|
||||||
os.makedirs(app.config['UPLOAD_FOLDER'], exist_ok=True)
|
os.makedirs(app.config['UPLOAD_FOLDER'], exist_ok=True)
|
||||||
os.makedirs(app.config['RESULT_FOLDER'], exist_ok=True)
|
os.makedirs(app.config['RESULT_FOLDER'], exist_ok=True)
|
||||||
|
|
||||||
# Registrieren der Routen
|
# Routes
|
||||||
from . import routes
|
from . import routes
|
||||||
app.register_blueprint(routes.bp)
|
app.register_blueprint(routes.bp)
|
||||||
|
|
||||||
# Erstellen der Tabellen in der Datenbank
|
# Index Redirect
|
||||||
|
@app.route('/')
|
||||||
|
def index():
|
||||||
|
return redirect(url_for('auth.job_status'))
|
||||||
|
|
||||||
|
# DB Tables + Auto-Migration
|
||||||
with app.app_context():
|
with app.app_context():
|
||||||
db.create_all()
|
db.create_all()
|
||||||
|
_run_migrations(app)
|
||||||
|
|
||||||
return app
|
return app
|
||||||
|
|
||||||
|
if __name__ == '__main__':
|
||||||
|
app = create_app()
|
||||||
|
app.run(host='0.0.0.0', port=5000, debug=False)
|
||||||
|
|
|
||||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
68
app/init.py.bak
Normal file
68
app/init.py.bak
Normal file
|
|
@ -0,0 +1,68 @@
|
||||||
|
import os
|
||||||
|
from flask import Flask, redirect, url_for, request, current_app
|
||||||
|
from flask_sqlalchemy import SQLAlchemy
|
||||||
|
from flask_login import LoginManager, current_user
|
||||||
|
from flask_migrate import Migrate
|
||||||
|
|
||||||
|
# ✅ Docker-Pfade
|
||||||
|
UPLOAD_FOLDER = '/app/uploads'
|
||||||
|
RESULT_FOLDER = '/app/results'
|
||||||
|
|
||||||
|
db = SQLAlchemy()
|
||||||
|
login_manager = LoginManager()
|
||||||
|
migrate = Migrate()
|
||||||
|
|
||||||
|
def create_app():
    """Create and configure the Flask application.

    Sets the configuration values, binds the SQLAlchemy, Flask-Migrate and
    Flask-Login extensions, installs a global "login required" guard, makes
    sure the upload and result folders exist, registers the blueprint from
    ``routes`` and creates the database tables.

    Returns:
        The configured ``Flask`` application instance.
    """
    app = Flask(__name__)

    # Configuration.
    # The secret key can be supplied through the SECRET_KEY environment
    # variable; the literal is kept only as a fallback so that existing
    # deployments (and their sessions) keep working unchanged.
    app.config['SECRET_KEY'] = os.environ.get(
        'SECRET_KEY', '008e7369b075886d5f494c8813efdfb17155da6af12b3fe8ee'
    )
    app.config['SQLALCHEMY_DATABASE_URI'] = 'sqlite:///users.db'
    app.config['SQLALCHEMY_TRACK_MODIFICATIONS'] = False
    app.config['UPLOAD_FOLDER'] = UPLOAD_FOLDER
    app.config['RESULT_FOLDER'] = RESULT_FOLDER
    app.config['ALLOW_USER_SIGNUP'] = True  # sign-up enabled

    # Database and tooling.
    db.init_app(app)
    migrate.init_app(app, db)
    login_manager.init_app(app)
    login_manager.login_view = 'auth.login'

    # User loader: maps the id stored in the session back to a User row.
    @login_manager.user_loader
    def load_user(user_id):
        # Imported here rather than at module level (the model module is only
        # needed once a user is actually loaded).
        from .models import User
        return User.query.get(int(user_id))

    # Protected routes: everything except login, sign-up and static files
    # requires an authenticated user.
    @app.before_request
    def require_login():
        allowed = ['auth.login', 'auth.signup', 'static']
        # The path check uses '/static/' with the trailing slash so that an
        # unrelated route such as '/statistics' is not exempted by accident.
        if (not current_user.is_authenticated and
                request.endpoint not in allowed and
                not request.path.startswith('/static/')):
            return redirect(url_for('auth.login'))

    # Folders for uploads and results.
    os.makedirs(app.config['UPLOAD_FOLDER'], exist_ok=True)
    os.makedirs(app.config['RESULT_FOLDER'], exist_ok=True)

    # Routes.
    from . import routes
    app.register_blueprint(routes.bp)

    # The site root redirects to the job overview.
    @app.route('/')
    def index():
        return redirect(url_for('auth.job_status'))

    # Database tables.
    with app.app_context():
        db.create_all()

    return app
|
||||||
|
|
||||||
|
if __name__ == '__main__':
|
||||||
|
app = create_app()
|
||||||
|
app.run(host='0.0.0.0', port=5000, debug=False)
|
||||||
|
|
@ -15,5 +15,11 @@ class Job(db.Model):
|
||||||
status = db.Column(db.String(50), default="Pending")
|
status = db.Column(db.String(50), default="Pending")
|
||||||
created_at = db.Column(db.DateTime, default=datetime.utcnow)
|
created_at = db.Column(db.DateTime, default=datetime.utcnow)
|
||||||
result_filename = db.Column(db.String(150), nullable=True)
|
result_filename = db.Column(db.String(150), nullable=True)
|
||||||
|
result_filename_raw = db.Column(db.String(150), nullable=True)
|
||||||
|
|
||||||
user = db.relationship('User', backref=db.backref('jobs', lazy=True))
|
user = db.relationship('User', backref=db.backref('jobs', lazy=True))
|
||||||
|
|
||||||
|
class AppConfig(db.Model):
    """Key/value row holding an application setting in the database."""

    # Surrogate primary key.
    id = db.Column(db.Integer, primary_key=True)
    # Name of the setting; required and unique.
    key = db.Column(db.String(100), nullable=False, unique=True)
    # Value of the setting, stored as a string; defaults to 'false'.
    value = db.Column(db.String(100), default='false', nullable=False)
|
||||||
|
|
|
||||||
223
app/routes.orig
Normal file
223
app/routes.orig
Normal file
|
|
@ -0,0 +1,223 @@
|
||||||
|
import time
|
||||||
|
import csv
|
||||||
|
import os
|
||||||
|
import threading
|
||||||
|
from flask import Blueprint, request, redirect, url_for, flash, render_template, send_file, current_app
|
||||||
|
from flask_login import login_user, logout_user, login_required, current_user
|
||||||
|
from werkzeug.utils import secure_filename
|
||||||
|
from werkzeug.security import generate_password_hash, check_password_hash
|
||||||
|
from .models import db, User, Job
|
||||||
|
from .webcrawler import process_file # Importiere die Funktion für das Webscraping
|
||||||
|
|
||||||
|
UPLOAD_FOLDER = 'uploads'
|
||||||
|
RESULT_FOLDER = 'results'
|
||||||
|
|
||||||
|
# Blueprint für auth erstellen
|
||||||
|
bp = Blueprint('auth', __name__)
|
||||||
|
|
||||||
|
@bp.route('/login', methods=['GET', 'POST'])
def login():
    """Show the login form and, on POST, authenticate the user.

    A successful login redirects to the job overview; a failed one flashes an
    error message and renders the form again.
    """
    if request.method != 'POST':
        return render_template('login.html')

    form = request.form
    username = form['username']
    password = form['password']

    account = User.query.filter_by(username=username).first()
    if account and check_password_hash(account.password, password):
        login_user(account)
        return redirect(url_for('auth.job_status'))

    flash('Login fehlgeschlagen. Überprüfen Sie Benutzername und Passwort.')
    return render_template('login.html')
|
||||||
|
|
||||||
|
@bp.route('/signup', methods=['GET', 'POST'])
def signup():
    """Show the sign-up form and, on POST, create a new user.

    When the ``ALLOW_USER_SIGNUP`` configuration value is falsy the request
    is redirected to the login page with a notice instead.
    """
    if not current_app.config['ALLOW_USER_SIGNUP']:
        flash("Registrierung ist derzeit deaktiviert.")
        return redirect(url_for('auth.login'))

    if request.method == 'POST':
        username = request.form['username']
        # Use Werkzeug's default hashing method.  The bare 'sha256' method
        # name used previously is rejected by current Werkzeug releases.
        password = generate_password_hash(request.form['password'])
        new_user = User(username=username, password=password)
        db.session.add(new_user)
        db.session.commit()
        flash('Benutzer erfolgreich erstellt! Sie können sich jetzt einloggen.')
        return redirect(url_for('auth.login'))

    return render_template('signup.html')
|
||||||
|
|
||||||
|
@bp.route('/logout')
@login_required
def logout():
    """Log the current user out and redirect to the login page."""
    logout_user()
    login_url = url_for('auth.login')
    return redirect(login_url)
|
||||||
|
|
||||||
|
@bp.route('/jobs')
@login_required
def job_status():
    """Render the job overview containing only the current user's jobs."""
    own_jobs = Job.query.filter_by(user_id=current_user.id)
    return render_template('jobs.html', jobs=own_jobs.all())
|
||||||
|
|
||||||
|
@bp.route('/upload', methods=['GET', 'POST'])
@login_required
def upload():
    """Show the upload form and, on POST, store the file and start a job.

    The uploaded file is saved into the configured upload folder (with a
    timestamp suffix when the name is already taken), a ``Job`` row is
    created for the current user and ``process_file`` is started in a
    background thread.  Requests without a usable file are sent back to the
    form with a message instead of creating an empty job.
    """
    if request.method == 'POST':
        # A form submitted without the field, or with no file chosen, must
        # not create a job.
        file = request.files.get('file')
        if file is None or not file.filename:
            flash('Keine Datei ausgewählt.')
            return redirect(url_for('auth.upload'))

        # secure_filename() can reduce a hostile name to an empty string.
        filename = secure_filename(file.filename)
        if not filename:
            flash('Keine gültige Datei.')
            return redirect(url_for('auth.upload'))

        # Avoid overwriting an existing upload with the same name by
        # appending a timestamp (YearMonthDay-HoursMinutesSeconds).
        file_path = os.path.join(current_app.config['UPLOAD_FOLDER'], filename)
        if os.path.exists(file_path):
            name, ext = os.path.splitext(filename)
            timestamp = time.strftime("%Y%m%d-%H%M%S")
            filename = f"{name}_{timestamp}{ext}"
            file_path = os.path.join(current_app.config['UPLOAD_FOLDER'], filename)
            flash(f"Eine Datei mit gleichem Namen existierte bereits. Die Datei wurde als '{filename}' gespeichert.")

        # Store the file.
        file.save(file_path)
        flash('Datei erfolgreich hochgeladen und Job gestartet')

        # Create the job record.
        new_job = Job(user_id=current_user.id, filename=filename, status="Pending")
        db.session.add(new_job)
        db.session.commit()

        print(f"Starte Scraping-Thread für Job-ID: {new_job.id}")

        # Run the scraping in a background thread.  The real application
        # object (not the context-bound proxy) is handed over so the worker
        # can use it outside this request.
        thread = threading.Thread(
            target=process_file,
            args=(filename, new_job.id, current_app._get_current_object()),
        )
        thread.start()

        print(f"Thread für Job {new_job.id} erfolgreich gestartet.")

        return redirect(url_for('auth.job_status'))

    return render_template('upload.html')
|
||||||
|
|
||||||
|
@bp.route('/download/<int:job_id>', methods=['GET'])
@login_required
def download_result(job_id):
    """Send the result file of a job to its owner as an attachment.

    Responds with 404 when the job does not exist.  When the job belongs to
    another user, has no result yet, or the result file is missing on disk,
    a message is flashed and the user is redirected to the job overview.
    """
    job = Job.query.get_or_404(job_id)
    print(f"Job ID: {job.id} - User ID: {job.user_id} - Current User ID: {current_user.id}")

    # Work out whether the request has to be refused, and why.
    refusal = None
    if job.user_id != current_user.id:
        refusal = "Sie haben keine Berechtigung, dieses Ergebnis herunterzuladen."
    elif not job.result_filename:
        refusal = "Das Ergebnis ist noch nicht verfügbar."
    if refusal is not None:
        flash(refusal)
        return redirect(url_for('auth.job_status'))

    # Resolve the file inside the configured result folder.
    result_dir = current_app.config['RESULT_FOLDER']
    target = os.path.join(result_dir, job.result_filename)
    print(f"Versuche, Datei herunterzuladen von: {target}")

    if not os.path.exists(target):
        print("Datei nicht gefunden. Ergebnisverzeichnis oder Pfad prüfen.")
        flash("Ergebnisdatei nicht gefunden.")
        return redirect(url_for('auth.job_status'))

    print("Datei existiert und wird zum Download bereitgestellt.")
    return send_file(target, as_attachment=True)
|
||||||
|
|
||||||
|
|
||||||
|
@bp.route('/delete_job/<int:job_id>', methods=['POST'])
@login_required
def delete_job(job_id):
    """Delete a job of the current user together with its files.

    Responds with 404 when the job does not exist and refuses (flash +
    redirect) when it belongs to another user.  Removing the upload and the
    result file is best effort: a filesystem error is printed and does not
    prevent the database row from being deleted.
    """
    job = Job.query.get_or_404(job_id)
    if job.user_id != current_user.id:
        flash("Sie haben keine Berechtigung, diesen Job zu löschen.")
        return redirect(url_for('auth.job_status'))

    # Remove the uploaded file.  Guarded the same way as the result file
    # below, so that e.g. a permission error cannot abort the request before
    # the job row is removed.
    upload_path = os.path.join(current_app.config['UPLOAD_FOLDER'], job.filename)
    if os.path.exists(upload_path):
        try:
            os.remove(upload_path)
            print(f"Upload-Datei gelöscht: {upload_path}")
        except OSError as e:
            print(f"Fehler beim Löschen der Upload-Datei: {e}")
    else:
        print(f"Upload-Datei nicht gefunden: {upload_path}")

    # Remove the result file, if the job produced one.
    if job.result_filename:
        result_path = os.path.join(current_app.config['RESULT_FOLDER'], job.result_filename)
        print(f"Versuche Ergebnisdatei zu löschen: {result_path}")

        if os.path.exists(result_path):
            try:
                os.remove(result_path)
                print(f"Ergebnisdatei gelöscht: {result_path}")
            except OSError as e:
                print(f"Fehler beim Löschen der Ergebnisdatei: {e}")
        else:
            print(f"Ergebnisdatei nicht gefunden im Pfad: {result_path}")

    # Remove the job from the database.
    db.session.delete(job)
    db.session.commit()
    flash("Job erfolgreich gelöscht.")
    return redirect(url_for('auth.job_status'))
|
||||||
|
|
||||||
|
@bp.route('/admin', methods=['GET'])
@login_required
def admin_panel():
    """Render the admin panel with all users; non-admins are turned away."""
    if current_user.is_admin:
        all_users = User.query.all()
        return render_template('admin_panel.html', users=all_users)

    flash("Keine Berechtigung.")
    return redirect(url_for('auth.job_status'))
|
||||||
|
|
||||||
|
@bp.route('/admin/create_user', methods=['POST'])
@login_required
def create_user():
    """Create a user from the admin form (admins only).

    Reads ``username`` and ``password`` from the form; the presence of the
    ``is_admin`` field (a checkbox) grants admin rights to the new user.
    """
    if not current_user.is_admin:
        flash("Keine Berechtigung.")
        return redirect(url_for('auth.admin_panel'))

    username = request.form['username']
    password = request.form['password']
    is_admin = 'is_admin' in request.form  # checkbox for admin rights

    # Use Werkzeug's default hashing method.  The bare 'sha256' method name
    # used previously is rejected by current Werkzeug releases.
    hashed_password = generate_password_hash(password)
    new_user = User(username=username, password=hashed_password, is_admin=is_admin)
    db.session.add(new_user)
    db.session.commit()

    flash(f"Benutzer {username} wurde erstellt.")
    return redirect(url_for('auth.admin_panel'))
|
||||||
|
|
||||||
|
@bp.route('/admin/reset_password/<int:user_id>', methods=['POST'])
@login_required
def reset_password(user_id):
    """Set a new password for the given user (admins only).

    Responds with 404 when the user does not exist.  The new password is
    read from the ``new_password`` form field.
    """
    if not current_user.is_admin:
        flash("Keine Berechtigung.")
        return redirect(url_for('auth.admin_panel'))

    user = User.query.get_or_404(user_id)
    new_password = request.form['new_password']
    # Use Werkzeug's default hashing method.  The bare 'sha256' method name
    # used previously is rejected by current Werkzeug releases.
    user.password = generate_password_hash(new_password)
    db.session.commit()

    flash(f"Passwort für Benutzer {user.username} wurde zurückgesetzt.")
    return redirect(url_for('auth.admin_panel'))
|
||||||
|
|
||||||
|
@bp.route('/admin/delete_user/<int:user_id>', methods=['POST'])
@login_required
def delete_user(user_id):
    """Delete a non-admin user (admins only).

    Responds with 404 when the user does not exist.  Admin accounts are
    never deleted through this route.
    """
    if not current_user.is_admin:
        flash("Keine Berechtigung.")
        return redirect(url_for('auth.admin_panel'))

    user = User.query.get_or_404(user_id)
    if user.is_admin:
        flash("Administratoren können nicht gelöscht werden.")
        return redirect(url_for('auth.admin_panel'))

    # Read the name before the row is deleted so the confirmation message
    # does not depend on the state of a deleted instance after the commit.
    username = user.username
    db.session.delete(user)
    db.session.commit()
    flash(f"Benutzer {username} wurde gelöscht.")
    return redirect(url_for('auth.admin_panel'))
|
||||||
230
app/routes.py
230
app/routes.py
|
|
@ -1,18 +1,16 @@
|
||||||
import time
|
import time
|
||||||
import csv
|
|
||||||
import os
|
import os
|
||||||
import threading
|
import threading
|
||||||
from flask import Blueprint, request, redirect, url_for, flash, render_template, send_file, current_app
|
from flask import Blueprint, request, redirect, url_for, flash, render_template, send_file, jsonify, current_app
|
||||||
from flask_login import login_user, logout_user, login_required, current_user
|
from flask_login import login_user, logout_user, login_required, current_user
|
||||||
from werkzeug.utils import secure_filename
|
from werkzeug.utils import secure_filename
|
||||||
from werkzeug.security import generate_password_hash, check_password_hash
|
from werkzeug.security import generate_password_hash, check_password_hash
|
||||||
from .models import db, User, Job
|
from .models import db, User, Job, AppConfig
|
||||||
from .webcrawler import process_file # Importiere die Funktion für das Webscraping
|
from .webcrawler import process_file
|
||||||
|
|
||||||
UPLOAD_FOLDER = 'uploads'
|
UPLOAD_FOLDER = '/app/uploads'
|
||||||
RESULT_FOLDER = 'results'
|
RESULT_FOLDER = '/app/results'
|
||||||
|
|
||||||
# Blueprint für auth erstellen
|
|
||||||
bp = Blueprint('auth', __name__)
|
bp = Blueprint('auth', __name__)
|
||||||
|
|
||||||
@bp.route('/login', methods=['GET', 'POST'])
|
@bp.route('/login', methods=['GET', 'POST'])
|
||||||
|
|
@ -29,19 +27,19 @@ def login():
|
||||||
|
|
||||||
@bp.route('/signup', methods=['GET', 'POST'])
|
@bp.route('/signup', methods=['GET', 'POST'])
|
||||||
def signup():
|
def signup():
|
||||||
if not current_app.config['ALLOW_USER_SIGNUP']:
|
cfg = AppConfig.query.filter_by(key='allow_signup').first()
|
||||||
|
if not cfg or cfg.value != 'true':
|
||||||
flash("Registrierung ist derzeit deaktiviert.")
|
flash("Registrierung ist derzeit deaktiviert.")
|
||||||
return redirect(url_for('auth.login'))
|
return redirect(url_for('auth.login'))
|
||||||
|
|
||||||
if request.method == 'POST':
|
if request.method == 'POST':
|
||||||
username = request.form['username']
|
username = request.form['username']
|
||||||
password = generate_password_hash(request.form['password'], method='sha256')
|
password = generate_password_hash(request.form['password']) # ✅ Fix
|
||||||
new_user = User(username=username, password=password)
|
new_user = User(username=username, password=password)
|
||||||
db.session.add(new_user)
|
db.session.add(new_user)
|
||||||
db.session.commit()
|
db.session.commit()
|
||||||
flash('Benutzer erfolgreich erstellt! Sie können sich jetzt einloggen.')
|
flash('Benutzer erfolgreich erstellt!')
|
||||||
return redirect(url_for('auth.login'))
|
return redirect(url_for('auth.login'))
|
||||||
|
|
||||||
return render_template('signup.html')
|
return render_template('signup.html')
|
||||||
|
|
||||||
@bp.route('/logout')
|
@bp.route('/logout')
|
||||||
|
|
@ -53,171 +51,203 @@ def logout():
|
||||||
@bp.route('/jobs')
|
@bp.route('/jobs')
|
||||||
@login_required
|
@login_required
|
||||||
def job_status():
|
def job_status():
|
||||||
jobs = Job.query.filter_by(user_id=current_user.id).all()
|
jobs = Job.query.filter_by(user_id=current_user.id).order_by(Job.created_at.desc()).all()
|
||||||
return render_template('jobs.html', jobs=jobs)
|
return render_template('jobs.html', jobs=jobs)
|
||||||
|
|
||||||
@bp.route('/upload', methods=['GET', 'POST'])
|
@bp.route('/upload', methods=['GET', 'POST'])
|
||||||
@login_required
|
@login_required
|
||||||
def upload():
|
def upload():
|
||||||
if request.method == 'POST':
|
if request.method == 'POST':
|
||||||
file = request.files['file']
|
if 'file' not in request.files:
|
||||||
filename = secure_filename(file.filename)
|
flash('Keine Datei ausgewählt.')
|
||||||
|
return redirect(url_for('auth.upload'))
|
||||||
# Überprüfen, ob eine Datei mit dem gleichen Namen bereits existiert
|
|
||||||
file_path = os.path.join(current_app.config['UPLOAD_FOLDER'], filename)
|
|
||||||
if os.path.exists(file_path):
|
|
||||||
# Wenn eine Datei mit dem gleichen Namen existiert, einen Zeitstempel hinzufügen
|
|
||||||
name, ext = os.path.splitext(filename)
|
|
||||||
timestamp = time.strftime("%Y%m%d-%H%M%S") # Zeitstempel im Format JahrMonatTag-StundenMinutenSekunden
|
|
||||||
filename = f"{name}_{timestamp}{ext}"
|
|
||||||
file_path = os.path.join(current_app.config['UPLOAD_FOLDER'], filename)
|
|
||||||
flash(f"Eine Datei mit gleichem Namen existierte bereits. Die Datei wurde als '{filename}' gespeichert.")
|
|
||||||
|
|
||||||
# Speichern der Datei
|
|
||||||
file.save(file_path)
|
|
||||||
flash('Datei erfolgreich hochgeladen und Job gestartet')
|
|
||||||
|
|
||||||
# Neuen Job erstellen
|
file = request.files['file']
|
||||||
new_job = Job(user_id=current_user.id, filename=filename, status="Pending")
|
if not file or file.filename == '':
|
||||||
|
flash('Keine gültige Datei.')
|
||||||
|
return redirect(url_for('auth.upload'))
|
||||||
|
|
||||||
|
filename = secure_filename(file.filename)
|
||||||
|
name, ext = os.path.splitext(filename)
|
||||||
|
|
||||||
|
timestamp = time.strftime("%Y%m%d_%H%M%S")
|
||||||
|
unique_filename = f"{name}_{timestamp}{ext}" if os.path.exists(os.path.join(UPLOAD_FOLDER, filename)) else filename
|
||||||
|
|
||||||
|
filepath = os.path.join(UPLOAD_FOLDER, unique_filename)
|
||||||
|
file.save(filepath)
|
||||||
|
print(f"💾 UPLOAD: {filepath}")
|
||||||
|
|
||||||
|
new_job = Job(
|
||||||
|
user_id=current_user.id,
|
||||||
|
filename=unique_filename,
|
||||||
|
status="Pending"
|
||||||
|
)
|
||||||
db.session.add(new_job)
|
db.session.add(new_job)
|
||||||
db.session.commit()
|
db.session.commit()
|
||||||
|
print(f"🆕 JOB #{new_job.id} für User {current_user.id}")
|
||||||
|
|
||||||
# Debugging-Ausgabe zur Überprüfung der Thread-Erstellung
|
thread = threading.Thread(
|
||||||
print(f"Starte Scraping-Thread für Job-ID: {new_job.id}")
|
target=process_file,
|
||||||
|
args=(unique_filename, new_job.id, current_app._get_current_object())
|
||||||
# Starten des Scraping im Hintergrund-Thread und Übergeben des aktuellen Anwendungskontexts
|
)
|
||||||
thread = threading.Thread(target=process_file, args=(filename, new_job.id, current_app._get_current_object()))
|
thread.daemon = True
|
||||||
thread.start()
|
thread.start()
|
||||||
|
print(f"🔄 THREAD STARTED Job {new_job.id}")
|
||||||
|
|
||||||
# Debugging-Ausgabe, nachdem der Thread gestartet wurde
|
flash(f'"{unique_filename}" → Job #{new_job.id} läuft!')
|
||||||
print(f"Thread für Job {new_job.id} erfolgreich gestartet.")
|
|
||||||
|
|
||||||
return redirect(url_for('auth.job_status'))
|
return redirect(url_for('auth.job_status'))
|
||||||
|
|
||||||
return render_template('upload.html')
|
return render_template('upload.html')
|
||||||
|
|
||||||
@bp.route('/download/<int:job_id>', methods=['GET'])
|
@bp.route('/download/<int:job_id>')
|
||||||
@login_required
|
@login_required
|
||||||
def download_result(job_id):
|
def download_result(job_id):
|
||||||
job = Job.query.get_or_404(job_id)
|
job = Job.query.filter_by(id=job_id, user_id=current_user.id).first_or_404()
|
||||||
print(f"Job ID: {job.id} - User ID: {job.user_id} - Current User ID: {current_user.id}")
|
|
||||||
|
|
||||||
# Überprüfen, ob der Job dem aktuellen Benutzer gehört
|
if not job.result_filename or not job.status.startswith('✅'):
|
||||||
if job.user_id != current_user.id:
|
flash('Ergebnis nicht bereit.')
|
||||||
flash("Sie haben keine Berechtigung, dieses Ergebnis herunterzuladen.")
|
|
||||||
return redirect(url_for('auth.job_status'))
|
return redirect(url_for('auth.job_status'))
|
||||||
|
|
||||||
# Überprüfen, ob das Ergebnis vorhanden ist
|
result_path = os.path.join(RESULT_FOLDER, job.result_filename)
|
||||||
if not job.result_filename:
|
|
||||||
flash("Das Ergebnis ist noch nicht verfügbar.")
|
|
||||||
return redirect(url_for('auth.job_status'))
|
|
||||||
|
|
||||||
# Überprüfen, ob die Datei im angegebenen Pfad existiert
|
|
||||||
result_path = os.path.join(current_app.config['RESULT_FOLDER'], job.result_filename)
|
|
||||||
print(f"Versuche, Datei herunterzuladen von: {result_path}")
|
|
||||||
|
|
||||||
if os.path.exists(result_path):
|
if os.path.exists(result_path):
|
||||||
print("Datei existiert und wird zum Download bereitgestellt.")
|
|
||||||
return send_file(result_path, as_attachment=True)
|
return send_file(result_path, as_attachment=True)
|
||||||
else:
|
flash('Datei fehlt.')
|
||||||
print("Datei nicht gefunden. Ergebnisverzeichnis oder Pfad prüfen.")
|
return redirect(url_for('auth.job_status'))
|
||||||
flash("Ergebnisdatei nicht gefunden.")
|
|
||||||
|
@bp.route('/download_raw/<int:job_id>')
|
||||||
|
@login_required
|
||||||
|
def download_result_raw(job_id):
|
||||||
|
job = Job.query.filter_by(id=job_id, user_id=current_user.id).first_or_404()
|
||||||
|
|
||||||
|
if not job.result_filename_raw:
|
||||||
|
flash('Rohdaten nicht verfügbar.')
|
||||||
return redirect(url_for('auth.job_status'))
|
return redirect(url_for('auth.job_status'))
|
||||||
|
|
||||||
|
result_path = os.path.join(RESULT_FOLDER, job.result_filename_raw)
|
||||||
|
if os.path.exists(result_path):
|
||||||
|
return send_file(result_path, as_attachment=True)
|
||||||
|
flash('Datei fehlt.')
|
||||||
|
return redirect(url_for('auth.job_status'))
|
||||||
|
|
||||||
@bp.route('/delete_job/<int:job_id>', methods=['POST'])
|
@bp.route('/delete_job/<int:job_id>', methods=['POST'])
|
||||||
@login_required
|
@login_required
|
||||||
def delete_job(job_id):
|
def delete_job(job_id):
|
||||||
job = Job.query.get_or_404(job_id)
|
job = Job.query.filter_by(id=job_id, user_id=current_user.id).first_or_404()
|
||||||
if job.user_id != current_user.id:
|
|
||||||
flash("Sie haben keine Berechtigung, diesen Job zu löschen.")
|
|
||||||
return redirect(url_for('auth.job_status'))
|
|
||||||
|
|
||||||
# Löschen der Upload-Datei
|
upload_path = os.path.join(UPLOAD_FOLDER, job.filename)
|
||||||
upload_path = os.path.join(current_app.config['UPLOAD_FOLDER'], job.filename)
|
|
||||||
if os.path.exists(upload_path):
|
if os.path.exists(upload_path):
|
||||||
os.remove(upload_path)
|
os.remove(upload_path)
|
||||||
print(f"Upload-Datei gelöscht: {upload_path}")
|
|
||||||
else:
|
|
||||||
print(f"Upload-Datei nicht gefunden: {upload_path}")
|
|
||||||
|
|
||||||
# Löschen der Results-Datei, falls vorhanden
|
|
||||||
if job.result_filename:
|
if job.result_filename:
|
||||||
result_path = os.path.join(current_app.config['RESULT_FOLDER'], job.result_filename)
|
result_path = os.path.join(RESULT_FOLDER, job.result_filename)
|
||||||
print(f"Versuche Ergebnisdatei zu löschen: {result_path}")
|
|
||||||
|
|
||||||
if os.path.exists(result_path):
|
if os.path.exists(result_path):
|
||||||
try:
|
os.remove(result_path)
|
||||||
os.remove(result_path)
|
|
||||||
print(f"Ergebnisdatei gelöscht: {result_path}")
|
if job.result_filename_raw: # ✅ Raw auch löschen
|
||||||
except Exception as e:
|
raw_path = os.path.join(RESULT_FOLDER, job.result_filename_raw)
|
||||||
print(f"Fehler beim Löschen der Ergebnisdatei: {e}")
|
if os.path.exists(raw_path):
|
||||||
else:
|
os.remove(raw_path)
|
||||||
print(f"Ergebnisdatei nicht gefunden im Pfad: {result_path}")
|
|
||||||
|
|
||||||
# Job aus der Datenbank löschen
|
|
||||||
db.session.delete(job)
|
db.session.delete(job)
|
||||||
db.session.commit()
|
db.session.commit()
|
||||||
flash("Job erfolgreich gelöscht.")
|
flash('Job gelöscht.')
|
||||||
return redirect(url_for('auth.job_status'))
|
return redirect(url_for('auth.job_status'))
|
||||||
|
|
||||||
|
@bp.route('/job_status/<int:job_id>')
@login_required
def job_status_api(job_id):
    """Return the state of one of the current user's jobs as JSON.

    Responds with a JSON error and status 404 when no job with this id
    belongs to the current user.
    """
    record = Job.query.filter_by(id=job_id, user_id=current_user.id).first()
    if not record:
        return jsonify({'error': 'Not found'}), 404

    payload = {
        'id': record.id,
        'status': record.status,
        'result_filename': record.result_filename,
    }
    # These two attributes are read defensively and fall back to None when
    # the object does not have them.
    for optional in ('result_filename_raw', 'scraper_job_id'):
        payload[optional] = getattr(record, optional, None)
    return jsonify(payload)
|
||||||
|
|
||||||
|
@bp.route('/resume_job/<int:job_id>', methods=['POST'])
@login_required
def resume_job(job_id):
    """Start ``process_file`` again for one of the current user's jobs.

    Responds with 404 when no job with this id belongs to the current user.
    The work runs in a daemon background thread; the request itself returns
    immediately with a redirect to the job overview.
    """
    job = Job.query.filter_by(id=job_id, user_id=current_user.id).first_or_404()

    # The real application object (not the context-bound proxy) is passed on
    # so the worker can use it outside this request.
    worker_args = (job.filename, job.id, current_app._get_current_object())
    worker = threading.Thread(target=process_file, args=worker_args, daemon=True)
    worker.start()

    flash(f'Job #{job_id} wird fortgesetzt...')
    return redirect(url_for('auth.job_status'))
|
||||||
|
|
||||||
|
|
||||||
|
# ── ADMIN ──────────────────────────────────────────
|
||||||
@bp.route('/admin', methods=['GET'])
|
@bp.route('/admin', methods=['GET'])
|
||||||
@login_required
|
@login_required
|
||||||
def admin_panel():
|
def admin_panel():
|
||||||
if not current_user.is_admin:
|
if not current_user.is_admin:
|
||||||
flash("Keine Berechtigung.")
|
flash("Keine Berechtigung.")
|
||||||
return redirect(url_for('auth.job_status'))
|
return redirect(url_for('auth.job_status'))
|
||||||
|
|
||||||
users = User.query.all()
|
users = User.query.all()
|
||||||
return render_template('admin_panel.html', users=users)
|
cfg = AppConfig.query.filter_by(key='allow_signup').first()
|
||||||
|
signup_allowed = cfg and cfg.value == 'true'
|
||||||
|
return render_template('admin_panel.html', users=users, signup_allowed=signup_allowed)
|
||||||
|
|
||||||
@bp.route('/admin/create_user', methods=['POST'])
|
@bp.route('/admin/create_user', methods=['POST'])
|
||||||
@login_required
|
@login_required
|
||||||
def create_user():
|
def create_user():
|
||||||
if not current_user.is_admin:
|
if not current_user.is_admin:
|
||||||
flash("Keine Berechtigung.")
|
|
||||||
return redirect(url_for('auth.admin_panel'))
|
return redirect(url_for('auth.admin_panel'))
|
||||||
|
|
||||||
username = request.form['username']
|
username = request.form['username']
|
||||||
password = request.form['password']
|
password = generate_password_hash(request.form['password']) # ✅ Fix
|
||||||
is_admin = 'is_admin' in request.form # Checkbox für Adminrechte
|
is_admin = 'is_admin' in request.form
|
||||||
|
new_user = User(username=username, password=password, is_admin=is_admin)
|
||||||
hashed_password = generate_password_hash(password, method='sha256')
|
|
||||||
new_user = User(username=username, password=hashed_password, is_admin=is_admin)
|
|
||||||
db.session.add(new_user)
|
db.session.add(new_user)
|
||||||
db.session.commit()
|
db.session.commit()
|
||||||
|
flash(f'{username} erstellt.')
|
||||||
flash(f"Benutzer {username} wurde erstellt.")
|
|
||||||
return redirect(url_for('auth.admin_panel'))
|
return redirect(url_for('auth.admin_panel'))
|
||||||
|
|
||||||
@bp.route('/admin/reset_password/<int:user_id>', methods=['POST'])
|
@bp.route('/admin/reset_password/<int:user_id>', methods=['POST'])
|
||||||
@login_required
|
@login_required
|
||||||
def reset_password(user_id):
|
def reset_password(user_id):
|
||||||
if not current_user.is_admin:
|
if not current_user.is_admin:
|
||||||
flash("Keine Berechtigung.")
|
|
||||||
return redirect(url_for('auth.admin_panel'))
|
return redirect(url_for('auth.admin_panel'))
|
||||||
|
|
||||||
user = User.query.get_or_404(user_id)
|
user = User.query.get_or_404(user_id)
|
||||||
new_password = request.form['new_password']
|
new_password = request.form['new_password']
|
||||||
user.password = generate_password_hash(new_password, method='sha256')
|
user.password = generate_password_hash(new_password) # ✅ Fix
|
||||||
db.session.commit()
|
db.session.commit()
|
||||||
|
flash(f'Passwort {user.username} zurückgesetzt.')
|
||||||
flash(f"Passwort für Benutzer {user.username} wurde zurückgesetzt.")
|
|
||||||
return redirect(url_for('auth.admin_panel'))
|
return redirect(url_for('auth.admin_panel'))
|
||||||
|
|
||||||
@bp.route('/admin/delete_user/<int:user_id>', methods=['POST'])
|
@bp.route('/admin/delete_user/<int:user_id>', methods=['POST'])
|
||||||
@login_required
|
@login_required
|
||||||
def delete_user(user_id):
|
def delete_user(user_id):
|
||||||
if not current_user.is_admin:
|
if not current_user.is_admin:
|
||||||
flash("Keine Berechtigung.")
|
|
||||||
return redirect(url_for('auth.admin_panel'))
|
return redirect(url_for('auth.admin_panel'))
|
||||||
|
|
||||||
user = User.query.get_or_404(user_id)
|
user = User.query.get_or_404(user_id)
|
||||||
if user.is_admin:
|
if user.is_admin:
|
||||||
flash("Administratoren können nicht gelöscht werden.")
|
flash('Admin nicht löschbar.')
|
||||||
return redirect(url_for('auth.admin_panel'))
|
return redirect(url_for('auth.admin_panel'))
|
||||||
|
|
||||||
db.session.delete(user)
|
db.session.delete(user)
|
||||||
db.session.commit()
|
db.session.commit()
|
||||||
flash(f"Benutzer {user.username} wurde gelöscht.")
|
flash(f'{user.username} gelöscht.')
|
||||||
|
return redirect(url_for('auth.admin_panel'))
|
||||||
|
|
||||||
|
@bp.route('/admin/toggle_signup', methods=['POST'])
@login_required
def toggle_signup():
    """Flip the ``allow_signup`` setting and report the new state.

    Non-admin users are redirected to the admin panel without any change.
    If the setting row does not exist yet it is created with value ``'true'``;
    otherwise ``'true'`` becomes ``'false'`` and any other value becomes
    ``'true'``.
    """
    if not current_user.is_admin:
        return redirect(url_for('auth.admin_panel'))

    setting = AppConfig.query.filter_by(key='allow_signup').first()
    if setting:
        setting.value = 'true' if setting.value != 'true' else 'false'
    else:
        setting = AppConfig(key='allow_signup', value='true')
        db.session.add(setting)

    db.session.commit()

    if setting.value == 'true':
        state = '✅ aktiviert'
    else:
        state = '🔒 deaktiviert'
    flash(f'Registrierung {state}.')
    return redirect(url_for('auth.admin_panel'))
|
||||||
|
|
|
||||||
|
|
@ -47,4 +47,32 @@
|
||||||
<button type="submit" class="create-btn">Benutzer erstellen</button>
|
<button type="submit" class="create-btn">Benutzer erstellen</button>
|
||||||
</form>
|
</form>
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
|
<div class="config-box">
|
||||||
|
<h3>⚙️ Einstellungen</h3>
|
||||||
|
<form action="{{ url_for('auth.toggle_signup') }}" method="POST">
|
||||||
|
<div class="toggle-row">
|
||||||
|
<span>Benutzer-Registrierung:</span>
|
||||||
|
{% if signup_allowed %}
|
||||||
|
<span class="badge badge-green">✅ Aktiv</span>
|
||||||
|
<button type="submit" class="btn-danger">🔒 Deaktivieren</button>
|
||||||
|
{% else %}
|
||||||
|
<span class="badge badge-red">🔒 Deaktiviert</span>
|
||||||
|
<button type="submit" class="btn-success">✅ Aktivieren</button>
|
||||||
|
{% endif %}
|
||||||
|
</div>
|
||||||
|
</form>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<style>
|
||||||
|
.config-box { background: #f8f9fa; border: 1px solid #ddd; border-radius: 6px; padding: 16px; margin-bottom: 24px; }
|
||||||
|
.toggle-row { display: flex; align-items: center; gap: 12px; }
|
||||||
|
.badge { padding: 3px 10px; border-radius: 12px; font-size: 0.85em; font-weight: bold; }
|
||||||
|
.badge-green { background: #d4edda; color: #155724; }
|
||||||
|
.badge-red { background: #f8d7da; color: #721c24; }
|
||||||
|
.btn-danger { background: #e74c3c; color: white; border: none; padding: 6px 14px; border-radius: 4px; cursor: pointer; }
|
||||||
|
.btn-success { background: #27ae60; color: white; border: none; padding: 6px 14px; border-radius: 4px; cursor: pointer; }
|
||||||
|
.btn-danger:hover { background: #c0392b; }
|
||||||
|
.btn-success:hover { background: #1e8449; }
|
||||||
|
</style>
|
||||||
{% endblock %}
|
{% endblock %}
|
||||||
|
|
|
||||||
121
app/templates/jobs.bck
Normal file
121
app/templates/jobs.bck
Normal file
|
|
@ -0,0 +1,121 @@
|
||||||
|
{% extends "base.html" %}
|
||||||
|
|
||||||
|
{% block content %}
|
||||||
|
<div class="table-container">
|
||||||
|
<h2>Ihre Aufträge</h2>
|
||||||
|
<table id="jobs-table">
|
||||||
|
<thead>
|
||||||
|
<tr>
|
||||||
|
<th>Dateiname</th>
|
||||||
|
<th>Status</th>
|
||||||
|
<th>Erstellt am</th>
|
||||||
|
<th>Ergebnis</th>
|
||||||
|
<th>Aktionen</th>
|
||||||
|
</tr>
|
||||||
|
</thead>
|
||||||
|
<tbody>
|
||||||
|
{% for job in jobs %}
|
||||||
|
<tr id="job-row-{{ job.id }}">
|
||||||
|
<td>{{ job.filename }}</td>
|
||||||
|
<td id="status-{{ job.id }}" class="job-status">{{ job.status }}</td>
|
||||||
|
<td>{{ job.created_at.strftime('%Y-%m-%d %H:%M:%S') }}</td>
|
||||||
|
<td id="result-{{ job.id }}">
|
||||||
|
{% if job.result_filename and 'Failed' not in job.status %}
|
||||||
|
<a href="{{ url_for('auth.download_result', job_id=job.id) }}" class="dl-btn">
|
||||||
|
🎯 Gefiltert
|
||||||
|
</a>
|
||||||
|
{% if job.result_filename_raw %}
|
||||||
|
|
||||||
|
<a href="{{ url_for('auth.download_result_raw', job_id=job.id) }}" class="dl-btn dl-btn-raw">
|
||||||
|
📋 Alle
|
||||||
|
</a>
|
||||||
|
{% endif %}
|
||||||
|
{% elif 'Failed' in job.status %}
|
||||||
|
<span class="status-failed">❌ {{ job.result_filename or 'Fehler' }}</span>
|
||||||
|
{% else %}
|
||||||
|
<span class="status-pending">⏳ Noch nicht verfügbar</span>
|
||||||
|
{% endif %}
|
||||||
|
</td>
|
||||||
|
<td>
|
||||||
|
<form action="{{ url_for('auth.delete_job', job_id=job.id) }}" method="POST" style="display:inline;">
|
||||||
|
<button type="submit" class="delete-btn">🗑️ Löschen</button>
|
||||||
|
</form>
|
||||||
|
</td>
|
||||||
|
</tr>
|
||||||
|
{% endfor %}
|
||||||
|
</tbody>
|
||||||
|
</table>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<style>
|
||||||
|
.job-status { font-weight: bold; }
|
||||||
|
.status-failed { color: #e74c3c; font-weight: bold; }
|
||||||
|
.status-pending { color: #888; }
|
||||||
|
.status-completed { color: #27ae60; }
|
||||||
|
|
||||||
|
.dl-btn {
|
||||||
|
display: inline-block;
|
||||||
|
padding: 4px 10px;
|
||||||
|
border-radius: 4px;
|
||||||
|
text-decoration: none;
|
||||||
|
font-size: 0.85em;
|
||||||
|
font-weight: bold;
|
||||||
|
background: #27ae60;
|
||||||
|
color: #fff;
|
||||||
|
margin: 2px 1px;
|
||||||
|
transition: background 0.2s;
|
||||||
|
}
|
||||||
|
.dl-btn:hover { background: #1e8449; }
|
||||||
|
.dl-btn-raw { background: #2980b9; }
|
||||||
|
.dl-btn-raw:hover { background: #1a5e8a; }
|
||||||
|
</style>
|
||||||
|
|
||||||
|
<script>
|
||||||
|
// Once the DOM is ready, start polling every job whose rendered status
// is neither finished ('✅') nor failed ('Failed').
document.addEventListener('DOMContentLoaded', () => {
    for (const cell of document.querySelectorAll('.job-status')) {
        // Cell ids have the form "status-<jobId>".
        const [, jobId] = cell.id.split('-');
        const text = cell.textContent.trim();
        const finished = text.includes('✅') || text.includes('Failed');
        if (finished) {
            continue;
        }
        pollJob(jobId);
    }
});
|
||||||
|
|
||||||
|
/**
 * Render the "result" table cell for a job from its polling payload.
 *
 * - result file present and job not failed: download link(s)
 * - job failed: the stored message (or 'Fehler') as escaped text
 * - otherwise: a "not available yet" placeholder
 *
 * @param {{innerHTML: string}} resultCell  Cell whose innerHTML is replaced.
 * @param {{id: *, status: string, result_filename: ?string, result_filename_raw: ?string}} data
 */
function renderResult(resultCell, data) {
    // For failed jobs result_filename carries a free-form error message, so
    // it must be escaped before it is interpolated into innerHTML.
    const escapeHtml = (value) => String(value)
        .replace(/&/g, '&amp;')
        .replace(/</g, '&lt;')
        .replace(/>/g, '&gt;')
        .replace(/"/g, '&quot;')
        .replace(/'/g, '&#39;');

    // Tolerate a payload without a status instead of throwing.
    const status = String(data.status || '');
    const hasFailed = status.includes('Failed');
    const hasFiltered = data.result_filename && !hasFailed;
    const hasRaw = data.result_filename_raw && !hasFailed;
    const jobId = encodeURIComponent(data.id);

    if (hasFiltered) {
        let html = `<a href="/download/${jobId}" class="dl-btn">🎯 Gefiltert</a>`;
        if (hasRaw) {
            html += ` <a href="/download_raw/${jobId}" class="dl-btn dl-btn-raw">📋 Alle</a>`;
        }
        resultCell.innerHTML = html;
    } else if (hasFailed) {
        const message = escapeHtml(data.result_filename || 'Fehler');
        resultCell.innerHTML = `<span class="status-failed">❌ ${message}</span>`;
    } else {
        resultCell.innerHTML = `<span class="status-pending">⏳ Noch nicht verfügbar</span>`;
    }
}
|
||||||
|
|
||||||
|
/**
 * Poll /job_status/<jobId> and refresh the job's status and result cells.
 *
 * Polls again after 5 s while the job is neither finished ('✅') nor failed
 * ('Failed'). Transient errors (network failure, non-2xx response, invalid
 * JSON) are retried after 10 s. Polling stops when the server answers 404
 * (job no longer exists) or when the row is no longer in the DOM, so a
 * deleted job does not cause an endless retry loop.
 *
 * @param {string|number} jobId
 */
function pollJob(jobId) {
    fetch(`/job_status/${jobId}`)
        .then(r => {
            if (r.status === 404) {
                return null;  // job is gone: nothing left to poll
            }
            if (!r.ok) {
                throw new Error(`HTTP ${r.status}`);  // retried by .catch below
            }
            return r.json();
        })
        .then(data => {
            if (data === null) {
                return;
            }
            if (typeof data.status !== 'string') {
                throw new Error('unexpected payload');
            }

            const statusCell = document.getElementById(`status-${jobId}`);
            const resultCell = document.getElementById(`result-${jobId}`);
            if (!statusCell || !resultCell) {
                return;  // row was removed from the page
            }

            statusCell.textContent = data.status;
            renderResult(resultCell, data);

            // Keep polling until the job reaches a final state.
            const done = data.status.includes('✅') || data.status.includes('Failed');
            if (!done) {
                setTimeout(() => pollJob(jobId), 5000);
            }
        })
        .catch(() => setTimeout(() => pollJob(jobId), 10000));
}
|
||||||
|
</script>
|
||||||
|
{% endblock %}
|
||||||
|
|
@ -15,20 +15,38 @@
|
||||||
</thead>
|
</thead>
|
||||||
<tbody>
|
<tbody>
|
||||||
{% for job in jobs %}
|
{% for job in jobs %}
|
||||||
<tr>
|
<tr id="job-row-{{ job.id }}">
|
||||||
<td>{{ job.filename }}</td>
|
<td>{{ job.filename }}</td>
|
||||||
<td class="job-status">{{ job.status }}</td>
|
<td id="status-{{ job.id }}" class="job-status">
|
||||||
|
{{ job.status }}
|
||||||
|
</td>
|
||||||
<td>{{ job.created_at.strftime('%Y-%m-%d %H:%M:%S') }}</td>
|
<td>{{ job.created_at.strftime('%Y-%m-%d %H:%M:%S') }}</td>
|
||||||
<td>
|
<td id="result-{{ job.id }}">
|
||||||
{% if job.status == "Completed" %}
|
{% if job.result_filename and 'Failed' not in job.status %}
|
||||||
<a href="{{ url_for('auth.download_result', job_id=job.id) }}">Download</a>
|
<a href="{{ url_for('auth.download_result', job_id=job.id) }}" class="dl-btn">
|
||||||
|
🎯 Gefiltert
|
||||||
|
</a>
|
||||||
|
{% if job.result_filename_raw %}
|
||||||
|
|
||||||
|
<a href="{{ url_for('auth.download_result_raw', job_id=job.id) }}" class="dl-btn dl-btn-raw">
|
||||||
|
📋 Alle
|
||||||
|
</a>
|
||||||
|
{% endif %}
|
||||||
|
{% elif 'Failed' in job.status %}
|
||||||
|
<span class="status-failed">❌ {{ job.result_filename or 'Fehler' }}</span>
|
||||||
{% else %}
|
{% else %}
|
||||||
Noch nicht verfügbar
|
<span class="status-pending">⏳ Noch nicht verfügbar</span>
|
||||||
{% endif %}
|
{% endif %}
|
||||||
</td>
|
</td>
|
||||||
<td>
|
<td>
|
||||||
|
{% if 'Failed' in job.status %}
|
||||||
|
<!-- 🆕 Resume Button -->
|
||||||
|
<form action="{{ url_for('auth.resume_job', job_id=job.id) }}" method="POST" style="display:inline;">
|
||||||
|
<button type="submit" class="btn-resume">▶️ Resume</button>
|
||||||
|
</form>
|
||||||
|
{% endif %}
|
||||||
<form action="{{ url_for('auth.delete_job', job_id=job.id) }}" method="POST" style="display:inline;">
|
<form action="{{ url_for('auth.delete_job', job_id=job.id) }}" method="POST" style="display:inline;">
|
||||||
<button type="submit" class="delete-btn">Löschen</button>
|
<button type="submit" class="delete-btn">🗑️ Löschen</button>
|
||||||
</form>
|
</form>
|
||||||
</td>
|
</td>
|
||||||
</tr>
|
</tr>
|
||||||
|
|
@ -37,25 +55,101 @@
|
||||||
</table>
|
</table>
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
|
<style>
|
||||||
|
.job-status { font-weight: bold; }
|
||||||
|
.status-failed { color: #e74c3c; font-weight: bold; }
|
||||||
|
.status-pending { color: #888; }
|
||||||
|
|
||||||
|
.eta-badge { display: inline-block; background: #eaf4ff; color: #1a6fa8;
|
||||||
|
border-radius: 10px; padding: 2px 8px; font-size: 0.82em;
|
||||||
|
font-weight: bold; margin-left: 6px; }
|
||||||
|
|
||||||
|
.dl-btn { display: inline-block; padding: 4px 10px; border-radius: 4px;
|
||||||
|
text-decoration: none; font-size: 0.85em; font-weight: bold;
|
||||||
|
background: #27ae60; color: #fff; margin: 2px 1px; transition: background 0.2s; }
|
||||||
|
.dl-btn:hover { background: #1e8449; }
|
||||||
|
.dl-btn-raw { background: #2980b9; }
|
||||||
|
.dl-btn-raw:hover { background: #1a5e8a; }
|
||||||
|
|
||||||
|
.btn-resume { background: #e67e22; color: white; border: none;
|
||||||
|
padding: 4px 10px; border-radius: 4px; cursor: pointer;
|
||||||
|
font-size: 0.85em; font-weight: bold; margin-right: 4px; }
|
||||||
|
.btn-resume:hover { background: #ca6f1e; }
|
||||||
|
</style>
|
||||||
|
|
||||||
<script>
|
<script>
|
||||||
// Periodische Aktualisierung des Jobstatus
|
// ETA Badge aus Status-String parsen
|
||||||
setInterval(function() {
|
function parseStatus(status) {
|
||||||
fetch('{{ url_for("auth.job_status") }}')
|
const parts = status.split('|');
|
||||||
.then(response => response.text())
|
if (parts.length === 2) {
|
||||||
.then(html => {
|
return `<span>${parts[0].trim()}</span>
|
||||||
const parser = new DOMParser();
|
<span class="eta-badge">${parts[1].trim()}</span>`;
|
||||||
const doc = parser.parseFromString(html, 'text/html');
|
}
|
||||||
const newRows = doc.querySelectorAll('#jobs-table tbody tr');
|
return status;
|
||||||
const currentRows = document.querySelectorAll('#jobs-table tbody tr');
|
}
|
||||||
|
|
||||||
newRows.forEach((newRow, index) => {
|
function renderResult(resultCell, data) {
|
||||||
const newStatus = newRow.querySelector('.job-status').textContent;
|
const hasFailed = data.status.includes('Failed') || data.status.includes('❌');
|
||||||
currentRows[index].querySelector('.job-status').textContent = newStatus;
|
const hasFiltered = data.result_filename && !hasFailed;
|
||||||
|
const hasRaw = data.result_filename_raw && !hasFailed;
|
||||||
|
|
||||||
const newResult = newRow.querySelector('td:nth-child(4)').innerHTML;
|
if (hasFiltered) {
|
||||||
currentRows[index].querySelector('td:nth-child(4)').innerHTML = newResult;
|
let html = `<a href="/download/${data.id}" class="dl-btn">🎯 Gefiltert</a>`;
|
||||||
});
|
if (hasRaw) {
|
||||||
});
|
html += ` <a href="/download_raw/${data.id}" class="dl-btn dl-btn-raw">📋 Alle</a>`;
|
||||||
}, 5000); // Aktualisierung alle 5 Sekunden
|
}
|
||||||
|
resultCell.innerHTML = html;
|
||||||
|
} else if (hasFailed) {
|
||||||
|
resultCell.innerHTML = `<span class="status-failed">❌ ${data.result_filename || 'Fehler'}</span>`;
|
||||||
|
} else {
|
||||||
|
resultCell.innerHTML = `<span class="status-pending">⏳ Noch nicht verfügbar</span>`;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
 * Rebuild the action buttons in the last cell of a job row.
 *
 * A "Resume" form is emitted only when the status contains 'Failed';
 * the "Löschen" form is always emitted.
 *
 * @param {Element} row   Table row of the job.
 * @param {{id: *, status: string}} data  Polling payload.
 */
function renderActions(row, data) {
    const target = row.querySelector('td:last-child');
    const forms = [];

    if (data.status.includes('Failed')) {
        forms.push([
            `<form action="/resume_job/${data.id}" method="POST" style="display:inline;">`,
            '<button type="submit" class="btn-resume">▶️ Resume</button>',
            '</form>',
        ].join('\n'));
    }

    forms.push([
        `<form action="/delete_job/${data.id}" method="POST" style="display:inline;">`,
        '<button type="submit" class="delete-btn">🗑️ Löschen</button>',
        '</form>',
    ].join('\n'));

    target.innerHTML = forms.join('');
}
|
||||||
|
|
||||||
|
/**
 * Poll /job_status/<jobId> and refresh the job's status, result and action cells.
 *
 * Polls again after 5 s while the status contains none of '✅', 'Failed'
 * or '❌'. Transient errors (network failure, non-2xx response, invalid
 * JSON) are retried after 10 s. Polling stops when the server answers 404
 * (job no longer exists) or when the row is no longer in the DOM, so a
 * deleted job does not cause an endless retry loop.
 *
 * @param {string|number} jobId
 */
function pollJob(jobId) {
    fetch(`/job_status/${jobId}`)
        .then(r => {
            if (r.status === 404) {
                return null;  // job is gone: nothing left to poll
            }
            if (!r.ok) {
                throw new Error(`HTTP ${r.status}`);  // retried by .catch below
            }
            return r.json();
        })
        .then(data => {
            if (data === null) {
                return;
            }
            if (typeof data.status !== 'string') {
                throw new Error('unexpected payload');
            }

            const statusCell = document.getElementById(`status-${jobId}`);
            const resultCell = document.getElementById(`result-${jobId}`);
            const row = document.getElementById(`job-row-${jobId}`);
            if (!statusCell || !resultCell || !row) {
                return;  // row was removed from the page
            }

            statusCell.innerHTML = parseStatus(data.status);
            renderResult(resultCell, data);
            renderActions(row, data);

            const done = data.status.includes('✅') || data.status.includes('Failed') || data.status.includes('❌');
            if (!done) {
                setTimeout(() => pollJob(jobId), 5000);
            }
        })
        .catch(() => setTimeout(() => pollJob(jobId), 10000));
}
|
||||||
|
|
||||||
|
// Once the DOM is ready, re-render every status cell through parseStatus
// and start polling the jobs that have not reached a final state yet.
document.addEventListener('DOMContentLoaded', () => {
    for (const cell of document.querySelectorAll('.job-status')) {
        // Cell ids have the form "status-<jobId>".
        const [, jobId] = cell.id.split('-');
        const text = cell.textContent.trim();
        cell.innerHTML = parseStatus(text);

        const finished = ['✅', 'Failed', '❌'].some((marker) => text.includes(marker));
        if (finished) {
            continue;
        }
        pollJob(jobId);
    }
});
|
||||||
</script>
|
</script>
|
||||||
{% endblock %}
|
{% endblock %}
|
||||||
|
|
|
||||||
61
app/templates/jobs.orig
Normal file
61
app/templates/jobs.orig
Normal file
|
|
@ -0,0 +1,61 @@
|
||||||
|
{% extends "base.html" %}
|
||||||
|
|
||||||
|
{% block content %}
|
||||||
|
<div class="table-container">
|
||||||
|
<h2>Ihre Aufträge</h2>
|
||||||
|
<table id="jobs-table">
|
||||||
|
<thead>
|
||||||
|
<tr>
|
||||||
|
<th>Dateiname</th>
|
||||||
|
<th>Status</th>
|
||||||
|
<th>Erstellt am</th>
|
||||||
|
<th>Ergebnis</th>
|
||||||
|
<th>Aktionen</th>
|
||||||
|
</tr>
|
||||||
|
</thead>
|
||||||
|
<tbody>
|
||||||
|
{% for job in jobs %}
|
||||||
|
<tr>
|
||||||
|
<td>{{ job.filename }}</td>
|
||||||
|
<td class="job-status">{{ job.status }}</td>
|
||||||
|
<td>{{ job.created_at.strftime('%Y-%m-%d %H:%M:%S') }}</td>
|
||||||
|
<td>
|
||||||
|
{% if job.status == "Completed" %}
|
||||||
|
<a href="{{ url_for('auth.download_result', job_id=job.id) }}">Download</a>
|
||||||
|
{% else %}
|
||||||
|
Noch nicht verfügbar
|
||||||
|
{% endif %}
|
||||||
|
</td>
|
||||||
|
<td>
|
||||||
|
<form action="{{ url_for('auth.delete_job', job_id=job.id) }}" method="POST" style="display:inline;">
|
||||||
|
<button type="submit" class="delete-btn">Löschen</button>
|
||||||
|
</form>
|
||||||
|
</td>
|
||||||
|
</tr>
|
||||||
|
{% endfor %}
|
||||||
|
</tbody>
|
||||||
|
</table>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<script>
|
||||||
|
// Periodische Aktualisierung des Jobstatus
|
||||||
|
setInterval(function() {
|
||||||
|
fetch('{{ url_for("auth.job_status") }}')
|
||||||
|
.then(response => response.text())
|
||||||
|
.then(html => {
|
||||||
|
const parser = new DOMParser();
|
||||||
|
const doc = parser.parseFromString(html, 'text/html');
|
||||||
|
const newRows = doc.querySelectorAll('#jobs-table tbody tr');
|
||||||
|
const currentRows = document.querySelectorAll('#jobs-table tbody tr');
|
||||||
|
|
||||||
|
newRows.forEach((newRow, index) => {
|
||||||
|
const newStatus = newRow.querySelector('.job-status').textContent;
|
||||||
|
currentRows[index].querySelector('.job-status').textContent = newStatus;
|
||||||
|
|
||||||
|
const newResult = newRow.querySelector('td:nth-child(4)').innerHTML;
|
||||||
|
currentRows[index].querySelector('td:nth-child(4)').innerHTML = newResult;
|
||||||
|
});
|
||||||
|
});
|
||||||
|
}, 5000); // Aktualisierung alle 5 Sekunden
|
||||||
|
</script>
|
||||||
|
{% endblock %}
|
||||||
316
app/webcrawler.bck02032026
Normal file
316
app/webcrawler.bck02032026
Normal file
|
|
@ -0,0 +1,316 @@
|
||||||
|
import os
|
||||||
|
import re
|
||||||
|
import pandas as pd
|
||||||
|
import requests
|
||||||
|
import time
|
||||||
|
import random
|
||||||
|
from io import StringIO
|
||||||
|
from app.models import db, Job
|
||||||
|
|
||||||
|
print("🆕 MODERN webcrawler LOADED!")
|
||||||
|
|
||||||
|
UPLOAD_FOLDER = '/app/uploads'
|
||||||
|
RESULT_FOLDER = '/app/results'
|
||||||
|
SCRAPER_URL = "http://gmaps-scraper:8080"
|
||||||
|
|
||||||
|
OUTPUT_COLS = ['title', 'category', 'address', 'open_hours', 'website', 'phone', 'link']
|
||||||
|
|
||||||
|
|
||||||
|
# ──────────────────────────────────────────────
|
||||||
|
# Hilfsfunktionen
|
||||||
|
# ──────────────────────────────────────────────
|
||||||
|
|
||||||
|
def get_batch_size(total_rows):
    """Return how many queries go into one scraper batch.

    Inputs with fewer than 200 rows use a batch size of 10, everything
    larger uses 5.
    """
    return 10 if total_rows < 200 else 5
|
||||||
|
|
||||||
|
def get_delay(total_rows):
    """Return the ``(min, max)`` delay range in seconds for an input size.

    Fewer than 50 rows -> (5, 10); fewer than 200 rows -> (10, 20);
    otherwise (20, 40).
    """
    tiers = ((50, (5, 10)), (200, (10, 20)))
    for upper_bound, delay_range in tiers:
        if total_rows < upper_bound:
            return delay_range
    return (20, 40)
|
||||||
|
|
||||||
|
# Markers that indicate the scraper was blocked or rate limited. "429" only
# counts as a stand-alone token, so digits inside phone numbers, IDs or hashes
# of a CSV/JSON payload no longer trigger a false positive.
_BLOCK_PATTERN = re.compile(
    r"captcha|blocked|rate limit|too many|(?<![0-9a-z])429(?![0-9a-z])"
)


def is_blocked(data):
    """Return True when *data* looks like a block / rate-limit response.

    *data* may be any object; its ``str()`` form is searched
    case-insensitively for the markers in ``_BLOCK_PATTERN``. A hit is
    logged together with the first 100 characters of the payload.
    """
    text = str(data).lower()
    blocked = _BLOCK_PATTERN.search(text) is not None
    if blocked:
        print(f"🚫 BLOCKED: {str(data)[:100]}")
    return blocked
|
||||||
|
|
||||||
|
def fix_encoding(text):
    """Repair UTF-8 text that was mis-decoded as a single-byte code page.

    Example: ``"IndustriestraÃŸe"`` becomes ``"Industriestraße"``.

    The text is re-encoded with Latin-1 and, if that is not possible, with
    Windows-1252, then decoded as UTF-8. Windows-1252 is needed because bytes
    0x80-0x9F (e.g. the second byte of "ß") decode to characters such as "Ÿ"
    that Latin-1 cannot encode. Text that is already correct does not survive
    the round trip and is returned unchanged, as are non-string values.
    """
    if not isinstance(text, str):
        return text
    for codec in ('latin-1', 'cp1252'):
        try:
            return text.encode(codec).decode('utf-8')
        except (UnicodeEncodeError, UnicodeDecodeError):
            continue
    return text
|
||||||
|
|
||||||
|
def _clean_cell(value):
    """Return a CSV cell as stripped text; missing values become ''."""
    if value is None:
        return ''
    if isinstance(value, float):
        if value != value:  # NaN is the only value not equal to itself
            return ''
        if value.is_integer():
            value = int(value)  # 12345.0 -> 12345
    text = str(value).strip()
    return '' if text.lower() in ('nan', '<na>') else text


def build_input_addresses(df):
    """Build the set of normalised addresses from the input CSV for matching.

    Each row yields ``"<Straße> <Hausnummer> <Zusatz> <PLZ> <Stadt>"`` in lower
    case with single spaces. Missing columns and empty/NaN cells are skipped
    instead of leaving a literal ``nan`` token in the address. A purely
    numeric PLZ is zero-padded to five digits, because a numerically parsed
    column turns ``01067`` into ``1067``, which the 5-digit lookup in
    ``address_in_input`` would never find.

    Args:
        df: DataFrame that may contain the columns ``PLZ``, ``Stadt``,
            ``Straße``, ``Hausnummer`` and ``Zusatz``.

    Returns:
        A set of normalised address strings (one per distinct row).
    """
    addresses = set()
    for _, row in df.iterrows():
        plz = _clean_cell(row.get('PLZ', ''))
        if plz.isdigit():
            plz = plz.zfill(5)

        parts = [
            _clean_cell(row.get('Straße', '')),
            _clean_cell(row.get('Hausnummer', '')),
            _clean_cell(row.get('Zusatz', '')),
            plz,
            _clean_cell(row.get('Stadt', '')),
        ]
        full = ' '.join(p for p in parts if p).lower()
        # Collapse whitespace that was inside individual cells.
        addresses.add(' '.join(full.split()))
    return addresses
|
||||||
|
|
||||||
|
def normalize_address(addr):
    """Normalise a result address for comparison with the input addresses.

    Strings are passed through ``fix_encoding``, lower-cased and reduced to
    single spaces; any non-string value yields ``''``.
    """
    if isinstance(addr, str):
        repaired = fix_encoding(addr)
        tokens = repaired.lower().strip().split()
        return ' '.join(tokens)
    return ''
|
||||||
|
|
||||||
|
def address_in_input(result_addr, input_addresses):
    """Check whether a result address matches any input address.

    An input address matches when its first 5-digit number (the PLZ) occurs
    in the normalised result address and its first token is longer than three
    characters and that token's first four characters occur there as well.
    """
    haystack = normalize_address(result_addr)
    for candidate in input_addresses:
        found = re.search(r'\b\d{5}\b', candidate)
        if not found or found.group() not in haystack:
            continue
        # Input addresses start with the street name.
        first_token = candidate.split()[0] if candidate else ''
        if len(first_token) > 3 and first_token[:4].lower() in haystack:
            return True
    return False
|
||||||
|
|
||||||
|
|
||||||
|
# ──────────────────────────────────────────────
|
||||||
|
# CSV Nachbearbeitung (apply_filter umschaltbar)
|
||||||
|
# ──────────────────────────────────────────────
|
||||||
|
|
||||||
|
def process_result_csv(raw_bytes, input_df, apply_filter=True):
    """Turn the scraper's raw CSV bytes into a cleaned DataFrame.

    Steps: keep only the ``OUTPUT_COLS`` that are present, repair encoding in
    every cell, optionally keep only rows whose address matches the input
    CSV, drop duplicate ``(title, address)`` pairs and drop rows without a
    title.

    Args:
        raw_bytes: CSV content as downloaded from the scraper.
        input_df: The uploaded input DataFrame (used for the address filter).
        apply_filter: When True, rows are restricted to input addresses.

    Returns:
        The cleaned DataFrame, or None if any step raised (the error and
        traceback are printed).
    """
    try:
        frame = pd.read_csv(StringIO(raw_bytes.decode('utf-8', errors='replace')))
        print(f"📄 Raw result: {frame.shape} | Columns: {list(frame.columns)[:8]}")

        # Split the wanted columns into those delivered and those missing.
        present, absent = [], []
        for name in OUTPUT_COLS:
            (present if name in frame.columns else absent).append(name)
        if absent:
            print(f"⚠️ Fehlende Spalten: {absent}")
        frame = frame[present]

        # Encoding repair, cell by cell.
        for name in frame.columns:
            frame[name] = frame[name].apply(fix_encoding)
        print(f"🔤 Encoding fix: done")

        if apply_filter:
            # Keep only rows whose address belongs to the input CSV.
            known = build_input_addresses(input_df)
            rows_before = len(frame)
            keep = frame['address'].apply(lambda a: address_in_input(a, known))
            frame = frame[keep]
            print(f"📍 Adress-Filter: {rows_before} → {len(frame)} Zeilen")

        # Duplicates are removed in both modes.
        rows_before = len(frame)
        frame = frame.drop_duplicates(subset=['title', 'address'], keep='first')
        print(f"🔁 Duplikate: {rows_before} → {len(frame)} Zeilen")

        # Drop rows whose title is missing or blank.
        frame = frame.dropna(subset=['title'], how='all')
        has_title = frame['title'].str.strip().astype(bool)
        frame = frame[has_title]

        print(f"✅ Final ({'gefiltert' if apply_filter else 'alle'}): {frame.shape}")
        return frame

    except Exception as e:
        print(f"💥 process_result_csv: {e}")
        import traceback
        traceback.print_exc()
        return None
|
||||||
|
|
||||||
|
|
||||||
|
# ──────────────────────────────────────────────
|
||||||
|
# Haupt-Worker
|
||||||
|
# ──────────────────────────────────────────────
|
||||||
|
|
||||||
|
def process_file(filename, job_id, app):
    """Background worker: run one uploaded CSV through the scraper service.

    Inside ``app``'s application context the function loads the job, builds
    search queries from the CSV, submits the first batch to the scraper API,
    polls until the scraper job finishes, writes a filtered and an unfiltered
    result CSV to ``RESULT_FOLDER`` and records progress in ``job.status``.

    Any exception sets ``job.status`` to "Failed" and stores the exception
    text in ``job.result_filename``.

    Args:
        filename: Name of the uploaded file inside ``UPLOAD_FOLDER``.
        job_id: Primary key of the ``Job`` row to update.
        app: Flask application object used to open an app context.
    """
    print(f"🎯 {filename} Job#{job_id} START!")

    with app.app_context():
        job = Job.query.get(job_id)
        if not job:
            print("❌ Job missing")
            return

        try:
            # 1) Parse the uploaded CSV
            job.status = "📊 parsing CSV"
            db.session.commit()

            filepath = os.path.join(UPLOAD_FOLDER, filename)
            print(f"📁 {filepath} | {os.path.getsize(filepath)}b")

            # The upload is read as semicolon-separated ISO-8859-1.
            df_input = pd.read_csv(filepath, sep=';', encoding='ISO-8859-1')
            print(f"📊 {df_input.shape} | Columns: {list(df_input.columns)}")

            # One query per row: "Firmen <PLZ> <Stadt> <Straße> <Hausnummer> <Zusatz>",
            # skipping empty cells and cells that stringify to 'nan'.
            queries = []
            for _, row in df_input.iterrows():
                parts = [
                    str(row.get('PLZ', '')).strip(),
                    str(row.get('Stadt', '')).strip(),
                    str(row.get('Straße', '')).strip(),
                    str(row.get('Hausnummer', '')).strip(),
                    str(row.get('Zusatz', '')).strip(),
                ]
                q = f"Firmen {' '.join(p for p in parts if p and p != 'nan')}".strip()
                # Queries of 10 characters or fewer are discarded.
                if len(q) > 10:
                    queries.append(q)

            total = len(queries)
            print(f"🔍 {total} Queries | Samples: {queries[:3]}")
            if not queries:
                raise ValueError("Keine gültigen Adressen in CSV")

            # 2) Batch + delay
            batch_size = get_batch_size(total)
            delay_min, delay_max = get_delay(total)
            # NOTE: only the first `batch_size` queries are sent; the rest of
            # `queries` is not used further in this function.
            batch = queries[:batch_size]
            pre_delay = random.uniform(delay_min, delay_max)
            print(f"📦 Batch {len(batch)}/{total} | 😴 {pre_delay:.1f}s Delay")
            time.sleep(pre_delay)

            # 3) Submit the job to the scraper API
            job.status = "📤 sending to scraper"
            db.session.commit()

            payload = {
                "name": f"{filename.replace('.csv','')}-{job_id}",
                "keywords": batch,
                "lang": "de",
                "depth": 1,
                "zoom": 17,
                "radius": 50,
                "max_time": 60,
                "fast_mode": False
            }

            print(f"🌐 POST {SCRAPER_URL}/api/v1/jobs | {payload['name']}")
            resp = requests.post(f"{SCRAPER_URL}/api/v1/jobs", json=payload, timeout=30)
            print(f"📤 {resp.status_code}: {resp.text[:300]}")

            if is_blocked(resp.text):
                raise ValueError("🚫 IP geblockt! Proxy konfigurieren.")
            # Anything other than 201 is treated as a failed submission.
            if resp.status_code != 201:
                raise ValueError(f"API {resp.status_code}: {resp.text[:200]}")

            # 4) Poll the scraper job
            scraper_id = resp.json()['id']
            job.scraper_job_id = scraper_id
            job.status = "⏳ scraping"
            db.session.commit()
            print(f"✅ Scraper Job: {scraper_id}")

            # At most 60 polls with 8-15 s sleep between them. The `else`
            # branch of this loop runs only if no poll reached `break`.
            for i in range(1, 61):  # Max 10min
                try:
                    r = requests.get(
                        f"{SCRAPER_URL}/api/v1/jobs/{scraper_id}",
                        timeout=10
                    )
                    data = r.json()
                    # The status is looked up under 'Status' first, then 'status'.
                    status = data.get('Status', data.get('status', '?'))
                    print(f"⏳ {i}/60: {status}")

                    if is_blocked(data):
                        raise ValueError("🚫 IP geblockt während scraping!")

                    if status in ('ok', 'completed', 'scraped'):
                        dl = requests.get(
                            f"{SCRAPER_URL}/api/v1/jobs/{scraper_id}/download",
                            timeout=60
                        )
                        if dl.status_code != 200:
                            raise ValueError(f"Download {dl.status_code}")
                        if is_blocked(dl.text[:200]):
                            raise ValueError("🚫 IP geblockt beim Download!")

                        # 5) Post-processing -> two output versions
                        job.status = "🔧 processing result"
                        db.session.commit()

                        base = filename.replace('.csv', '')
                        os.makedirs(RESULT_FOLDER, exist_ok=True)

                        # -- Version A: filtered (address matching) --
                        df_filtered = process_result_csv(dl.content, df_input, apply_filter=True)
                        outname_filtered = f"results_{base}_filtered.csv"
                        outpath_filtered = os.path.join(RESULT_FOLDER, outname_filtered)

                        if df_filtered is not None and len(df_filtered) > 0:
                            df_filtered.to_csv(
                                outpath_filtered, index=False,
                                encoding='utf-8-sig', sep=';'
                            )
                            print(f"🎯 Filtered: {outname_filtered} → {len(df_filtered)} Firmen")
                        else:
                            # No rows left (or processing failed): write a
                            # header-only file so the download still exists.
                            print("⚠️ Keine Treffer nach Filter – leere Datei wird erstellt")
                            pd.DataFrame(columns=OUTPUT_COLS).to_csv(
                                outpath_filtered, index=False,
                                encoding='utf-8-sig', sep=';'
                            )

                        # -- Version B: all rows (no address filter) --
                        df_raw = process_result_csv(dl.content, df_input, apply_filter=False)
                        outname_raw = f"results_{base}_all.csv"
                        outpath_raw = os.path.join(RESULT_FOLDER, outname_raw)

                        if df_raw is not None:
                            df_raw.to_csv(
                                outpath_raw, index=False,
                                encoding='utf-8-sig', sep=';'
                            )
                            print(f"📋 All: {outname_raw} → {len(df_raw)} Firmen")
                        else:
                            # Processing failed: store the downloaded bytes as they are.
                            print("⚠️ df_raw None – Rohinhalt wird gespeichert")
                            with open(outpath_raw, 'wb') as f:
                                f.write(dl.content)

                        # -- Persist the result in the database --
                        job.status = "✅ Fertig"
                        job.result_filename = outname_filtered  # filtered version
                        job.result_filename_raw = outname_raw  # unfiltered version
                        db.session.commit()
                        print(f"🎉 Beide Dateien gespeichert!")
                        break

                    elif status in ('failed', 'cancelled', 'error'):
                        raise ValueError(f"Scraper: {status}")

                except requests.RequestException as e:
                    # Request errors during a poll are logged and the loop
                    # continues; ValueError raised above is not caught here
                    # and reaches the outer handler.
                    print(f"⚠️ Poll {i}: {e}")

                time.sleep(random.uniform(8, 15))

            else:
                raise ValueError("Timeout nach 10min")

        except Exception as e:
            # The error text is stored in result_filename for failed jobs.
            job.status = "Failed"
            job.result_filename = str(e)
            print(f"💥 ERROR: {e}")
            import traceback
            traceback.print_exc()

        db.session.commit()
        print(f"✅ DONE! Status: {job.status}\n")
|
||||||
275
app/webcrawler.bck04032026
Normal file
275
app/webcrawler.bck04032026
Normal file
|
|
@ -0,0 +1,275 @@
|
||||||
|
import os
|
||||||
|
import re
|
||||||
|
import pandas as pd
|
||||||
|
import requests
|
||||||
|
import time
|
||||||
|
import random
|
||||||
|
from io import StringIO
|
||||||
|
from app.models import db, Job
|
||||||
|
|
||||||
|
print("🆕 MODERN webcrawler LOADED! – BATCHED + PROXY")

# Folders for uploaded input CSVs and generated result CSVs.
UPLOAD_FOLDER = '/app/uploads'
RESULT_FOLDER = '/app/results'
# Base URL of the scraper service that process_file() submits jobs to.
SCRAPER_URL = "http://gmaps-scraper:8080"

# Columns kept from the scraper's result CSV, in output order.
OUTPUT_COLS = ['title', 'category', 'address', 'open_hours', 'website', 'phone', 'link']

# Proxy URL (including credentials) that is passed to the scraper.
# It can be overridden with the PROXY_URL environment variable; the literal is
# kept only as a backward-compatible fallback.
# NOTE(review): credentials committed to source control should be rotated and
# this fallback removed once every deployment sets PROXY_URL.
PROXY_URL = os.environ.get(
    "PROXY_URL",
    "http://bitlleuv-rotate:s5hzse6hz74b@p.webshare.io:80",
)
API_PROXIES = {"http": PROXY_URL, "https": PROXY_URL}
|
||||||
|
|
||||||
|
# ──────────────────────────────────────────────
|
||||||
|
# Hilfsfunktionen
|
||||||
|
# ──────────────────────────────────────────────
|
||||||
|
|
||||||
|
def is_blocked(data):
    """Return True when a response looks like a block or rate-limit message.

    ``data`` is stringified and lower-cased, then searched for the marker
    phrases below. The status code ``429`` only counts when it stands alone
    as a number, so identifiers that merely contain those digits (for example
    a job id such as ``1e429c0a``) are not mistaken for a rate limit.
    A hit is also logged with the first 100 characters of ``data``.
    """
    text = str(data).lower()
    phrases = ('captcha', 'blocked', 'rate limit', 'too many')
    blocked = (
        any(kw in text for kw in phrases)
        or re.search(r'\b429\b', text) is not None
    )
    if blocked:
        print(f"🚫 BLOCKED: {str(data)[:100]}")
    return blocked
|
||||||
|
|
||||||
|
def fix_encoding(text):
    """Repair text whose UTF-8 bytes were decoded as Latin-1.

    Non-string values are returned untouched. A string is re-encoded as
    Latin-1 and decoded as UTF-8; if either step fails the original string
    is returned unchanged.
    """
    if isinstance(text, str):
        try:
            repaired = text.encode('latin-1').decode('utf-8')
        except (UnicodeEncodeError, UnicodeDecodeError):
            repaired = text
        return repaired
    return text
|
||||||
|
|
||||||
|
def build_input_addresses(df):
    """Build the set of normalized address strings from the input frame.

    Each row contributes one lower-cased string made of the columns
    Straße, Hausnummer, Zusatz, PLZ and Stadt (in that order), with runs of
    whitespace collapsed to single spaces. A missing column contributes an
    empty string; cell values are converted with ``str()`` as-is.
    """
    column_order = ('Straße', 'Hausnummer', 'Zusatz', 'PLZ', 'Stadt')
    return {
        ' '.join(
            ' '.join(str(row.get(column, '')).strip() for column in column_order)
            .lower()
            .split()
        )
        for _, row in df.iterrows()
    }
|
||||||
|
|
||||||
|
def normalize_address(addr):
    """Return ``addr`` encoding-repaired, lower-cased and whitespace-collapsed.

    Anything that is not a string normalizes to the empty string.
    """
    if isinstance(addr, str):
        repaired = fix_encoding(addr)
        return ' '.join(repaired.lower().strip().split())
    return ''
|
||||||
|
|
||||||
|
def address_in_input(result_addr, input_addresses):
    """Tell whether a scraped address matches any of the input addresses.

    An input address matches when its first five-digit number (the postal
    code) occurs in the normalized result address and the first four
    characters of its first token also occur there. First tokens of three
    characters or fewer never match.
    """
    haystack = normalize_address(result_addr)

    def _matches(candidate):
        found = re.search(r'\b\d{5}\b', candidate)
        if found is None or found.group() not in haystack:
            return False
        first_token = candidate.split()[0] if candidate else ''
        return len(first_token) > 3 and first_token[:4].lower() in haystack

    return any(_matches(candidate) for candidate in input_addresses)
|
||||||
|
|
||||||
|
# ──────────────────────────────────────────────
|
||||||
|
# CSV Nachbearbeitung
|
||||||
|
# ──────────────────────────────────────────────
|
||||||
|
|
||||||
|
def process_result_csv(raw_bytes, input_df, apply_filter=True):
    """Turn the scraper's raw CSV bytes into a cleaned DataFrame.

    Steps: decode as UTF-8 (undecodable bytes are replaced), keep only the
    OUTPUT_COLS that are present, repair mojibake in every cell, optionally
    keep only rows whose address matches an address from ``input_df``, then
    drop duplicate (title, address) pairs and rows with a missing or blank
    title.

    Returns the cleaned frame, or None when any step raises (the error is
    printed).
    """
    try:
        frame = pd.read_csv(StringIO(raw_bytes.decode('utf-8', errors='replace')))
        print(f"📄 Raw result: {frame.shape}")

        frame = frame[[name for name in OUTPUT_COLS if name in frame.columns]]
        for name in frame.columns:
            frame[name] = frame[name].apply(fix_encoding)

        if apply_filter:
            known_addresses = build_input_addresses(input_df)
            rows_before = len(frame)
            keep = frame['address'].apply(
                lambda value: address_in_input(value, known_addresses)
            )
            frame = frame[keep]
            print(f"📍 Filter: {rows_before} → {len(frame)}")

        frame = frame.drop_duplicates(subset=['title', 'address'], keep='first')
        frame = frame.dropna(subset=['title'], how='all')
        has_title = frame['title'].str.strip().astype(bool)
        frame = frame[has_title]

        print(f"✅ Final ({'gefiltert' if apply_filter else 'alle'}): {frame.shape}")
        return frame
    except Exception as exc:
        print(f"💥 process_result_csv: {exc}")
        return None
|
||||||
|
|
||||||
|
# ──────────────────────────────────────────────
|
||||||
|
# HAUPT-WORKER
|
||||||
|
# ──────────────────────────────────────────────
|
||||||
|
|
||||||
|
def process_file(filename, job_id, app):
    """Run one scrape job for an uploaded CSV and store the merged results.

    Inside ``app``'s application context the function loads the Job row,
    builds one search query per input row, submits the queries to the
    scraper service in batches, polls each batch until it finishes, and
    finally writes a filtered and an unfiltered result CSV to RESULT_FOLDER.
    Progress and the final outcome are written to ``job.status``.

    Args:
        filename: Name of the uploaded CSV inside UPLOAD_FOLDER
            (``;``-separated, ISO-8859-1).
        job_id: Primary key of the Job row to update.
        app: Flask application used to open the application context.

    Any exception is caught, printed with a traceback, and recorded as
    ``Failed: ...`` in ``job.status``; nothing is raised to the caller.
    """
    print(f"🎯 {filename} Job#{job_id} START!")

    with app.app_context():
        job = Job.query.get(job_id)
        if not job:
            print("❌ Job missing")
            return

        try:
            # Parse the upload and build one query per address row.
            job.status = "📊 parsing CSV"
            db.session.commit()

            filepath = os.path.join(UPLOAD_FOLDER, filename)
            print(f"📁 {filepath} | {os.path.getsize(filepath)}b")

            df_input = pd.read_csv(filepath, sep=';', encoding='ISO-8859-1')
            print(f"📊 {df_input.shape}")

            queries = []
            for _, row in df_input.iterrows():
                parts = [
                    str(row.get('PLZ', '')).strip(),
                    str(row.get('Stadt', '')).strip(),
                    str(row.get('Straße', '')).strip(),
                    str(row.get('Hausnummer', '')).strip(),
                    str(row.get('Zusatz', '')).strip(),
                ]
                # str() turns missing cells into 'nan'; drop those and blanks.
                q = f"Firmen {' '.join(p for p in parts if p and p != 'nan')}".strip()
                # Skip queries that are too short to contain an address.
                if len(q) > 10:
                    queries.append(q)

            total_queries = len(queries)
            print(f"🔍 {total_queries} Queries | Samples: {queries[:3]}")
            if total_queries == 0:
                raise ValueError("Keine gültigen Adressen")

            # Batched processing.
            BATCH_SIZE = 10  # raised from 5 to 10 (paid proxy)
            BATCH_DELAY_MIN, BATCH_DELAY_MAX = 10, 20  # lowered from 30-60s (paid proxy)
            batches = (total_queries + BATCH_SIZE - 1) // BATCH_SIZE
            # batches * 15 s average delay / 60 is a number of minutes, so it
            # is labelled "min" (it used to be printed as hours).
            print(f"📦 {batches} Batches à {BATCH_SIZE} | ETA: ~{batches*15//60}min")

            all_results_filtered = []
            all_results_raw = []
            job.status = f"🔄 Batch 1/{batches}"
            db.session.commit()

            for batch_idx in range(batches):
                batch_start = batch_idx * BATCH_SIZE
                batch_end = min(batch_start + BATCH_SIZE, total_queries)
                batch_queries = queries[batch_start:batch_end]
                print(f"\n🔄 BATCH {batch_idx+1}/{batches} ({batch_start+1}-{batch_end}/{total_queries})")

                # Random pause between batches. The proxy URL is deliberately
                # not logged because it contains the proxy credentials.
                delay = random.uniform(BATCH_DELAY_MIN, BATCH_DELAY_MAX)
                print(f"😴 Delay: {delay:.0f}s")
                time.sleep(delay)

                # Job description sent to the scraper API.
                payload = {
                    "name": f"{filename.replace('.csv','')}-{job_id}-B{batch_idx+1:03d}",
                    "keywords": batch_queries,
                    "lang": "de",
                    "depth": 1,
                    "zoom": 17,
                    "radius": 50,
                    "max_time": 60,  # lowered from 120 (paid proxy is faster)
                    "fast_mode": False,
                    "proxies": [PROXY_URL]
                }

                try:
                    resp = requests.post(
                        f"{SCRAPER_URL}/api/v1/jobs",
                        json=payload,
                        timeout=45
                    )
                    print(f"📤 {resp.status_code}")
                    if is_blocked(resp.text):
                        print("🚫 Batch übersprungen (blocked)")
                        continue
                    if resp.status_code != 201:
                        print(f"⚠️ Batch {batch_idx+1} fehlgeschlagen: {resp.text[:100]}")
                        continue

                    scraper_id = resp.json()['id']
                    print(f"✅ Scraper: {scraper_id}")

                    # Poll until the scraper reports a final state (at most 60 polls).
                    for _ in range(1, 61):
                        r = requests.get(
                            f"{SCRAPER_URL}/api/v1/jobs/{scraper_id}",
                            timeout=15
                        )
                        data = r.json()
                        # The status key is looked up in both spellings.
                        status = data.get('Status', data.get('status', '?'))

                        if status in ('ok', 'completed', 'scraped'):
                            dl = requests.get(
                                f"{SCRAPER_URL}/api/v1/jobs/{scraper_id}/download",
                                timeout=90
                            )
                            if dl.status_code == 200:
                                df_filtered = process_result_csv(dl.content, df_input, True)
                                df_raw = process_result_csv(dl.content, df_input, False)
                                if df_filtered is not None:
                                    all_results_filtered.append(df_filtered)
                                    all_results_raw.append(df_raw)
                                    print(f"📊 Batch {batch_idx+1}: {len(df_filtered)} filtered")
                            break
                        elif status in ('failed', 'error'):
                            print(f"💥 Batch {batch_idx+1}: {status}")
                            break

                        time.sleep(random.uniform(5, 10))  # lowered from 10-20s (paid proxy)

                except Exception as e:
                    # One failing batch must not abort the whole job.
                    print(f"💥 Batch {batch_idx+1}: {e}")

                # Announce the next batch, clamped so the last batch does not
                # show "N+1/N".
                job.status = f"🔄 Batch {min(batch_idx+2, batches)}/{batches}"
                db.session.commit()

            # Merge and save.
            job.status = "🔧 merging results"
            db.session.commit()

            base = filename.replace('.csv', '')
            os.makedirs(RESULT_FOLDER, exist_ok=True)

            if all_results_filtered:
                df_final_filtered = pd.concat(all_results_filtered, ignore_index=True)
                df_final_filtered = df_final_filtered.drop_duplicates(subset=['title', 'address'])

                out_filtered = f"results_{base}_filtered.csv"
                df_final_filtered.to_csv(
                    os.path.join(RESULT_FOLDER, out_filtered),
                    index=False, encoding='utf-8-sig', sep=';'
                )

                if all_results_raw:
                    df_final_raw = pd.concat(all_results_raw, ignore_index=True)
                    out_raw = f"results_{base}_all.csv"
                    df_final_raw.to_csv(
                        os.path.join(RESULT_FOLDER, out_raw),
                        index=False, encoding='utf-8-sig', sep=';'
                    )

                job.result_filename = out_filtered
                job.result_filename_raw = out_raw
                job.status = f"✅ Fertig: {len(df_final_filtered)} Firmen"
            else:
                job.status = "❌ Keine Ergebnisse"

            db.session.commit()
            print(f"🎉 Job {job_id} komplett!")

        except Exception as e:
            job.status = f"Failed: {str(e)[:50]}"
            print(f"💥 FATAL: {e}")
            import traceback
            traceback.print_exc()
            db.session.commit()

        print(f"✅ DONE! Status: {job.status}")
|
||||||
429
app/webcrawler.bck04032026_2
Normal file
429
app/webcrawler.bck04032026_2
Normal file
|
|
@ -0,0 +1,429 @@
|
||||||
|
import os
|
||||||
|
import re
|
||||||
|
import unicodedata
|
||||||
|
import json
|
||||||
|
import pandas as pd
|
||||||
|
import requests
|
||||||
|
import time
|
||||||
|
import random
|
||||||
|
from io import StringIO
|
||||||
|
from app.models import db, Job
|
||||||
|
|
||||||
|
print("🆕 MODERN webcrawler LOADED! – BATCHED + PROXY + RESUME + ETA + 2x SCRAPER")

# Folders for uploaded input CSVs and generated result CSVs.
UPLOAD_FOLDER = '/app/uploads'
RESULT_FOLDER = '/app/results'

# Two scraper instances, used alternately (batch index modulo list length).
SCRAPER_URLS = [
    "http://gmaps-scraper-1:8080",
    "http://gmaps-scraper-2:8080",
]

# Columns kept from the scraper's result CSV, in output order.
OUTPUT_COLS = ['title', 'category', 'address', 'open_hours', 'website', 'phone', 'link']

# Proxy URL (including credentials) that is passed to the scraper.
# It can be overridden with the PROXY_URL environment variable; the literal is
# kept only as a backward-compatible fallback.
# NOTE(review): credentials committed to source control should be rotated and
# this fallback removed once every deployment sets PROXY_URL.
PROXY_URL = os.environ.get(
    "PROXY_URL",
    "http://bitlleuv-rotate:s5hzse6hz74b@p.webshare.io:80",
)
API_PROXIES = {"http": PROXY_URL, "https": PROXY_URL}
|
||||||
|
|
||||||
|
# ──────────────────────────────────────────────
|
||||||
|
# Tuning
|
||||||
|
# ──────────────────────────────────────────────
|
||||||
|
BATCH_SIZE = 30 # keywords per scraper job
BATCH_DELAY_MIN = 3 # seconds to pause between batches (min)
BATCH_DELAY_MAX = 6 # seconds to pause between batches (max)
MAX_TIME = 60 # seconds the scraper gets per batch
POLL_MAX = 90 # max. poll attempts per batch
POLL_DELAY_MIN = 2 # seconds between polls (min)
POLL_DELAY_MAX = 5 # seconds between polls (max)
STUCK_THRESHOLD = 8 # consecutive 'pending' polls before the scraper is restarted
MAX_RETRIES = 2 # attempts per batch (range(1, MAX_RETRIES + 1) in process_file)
|
||||||
|
|
||||||
|
# ──────────────────────────────────────────────
|
||||||
|
# Hilfsfunktionen
|
||||||
|
# ──────────────────────────────────────────────
|
||||||
|
|
||||||
|
def is_blocked(data):
    """Return True when a response looks like a block or rate-limit message.

    ``data`` is stringified and lower-cased, then searched for the marker
    phrases below. The status code ``429`` only counts when it stands alone
    as a number, so identifiers that merely contain those digits (for example
    a job id such as ``1e429c0a``) are not mistaken for a rate limit.
    A hit is also logged with the first 100 characters of ``data``.
    """
    text = str(data).lower()
    phrases = ('captcha', 'blocked', 'rate limit', 'too many')
    blocked = (
        any(kw in text for kw in phrases)
        or re.search(r'\b429\b', text) is not None
    )
    if blocked:
        print(f"🚫 BLOCKED: {str(data)[:100]}")
    return blocked
|
||||||
|
|
||||||
|
def fix_encoding(text):
    """Repair text whose UTF-8 bytes were decoded as Latin-1.

    Non-string values are returned untouched. A string is re-encoded as
    Latin-1 and decoded as UTF-8; if either step fails the original string
    is returned unchanged.
    """
    if isinstance(text, str):
        try:
            repaired = text.encode('latin-1').decode('utf-8')
        except (UnicodeEncodeError, UnicodeDecodeError):
            repaired = text
        return repaired
    return text
|
||||||
|
|
||||||
|
# Fix 1: Sonderzeichen in Queries bereinigen
|
||||||
|
def clean_query(q):
    """Strip control characters from a search query and collapse whitespace.

    The query is first split on whitespace, so tabs and line breaks act as
    word separators instead of being deleted (which would glue neighbouring
    words together). Remaining control characters (Unicode category ``Cc``)
    are then removed from each word, and the non-empty words are joined with
    single spaces.
    """
    words = (
        ''.join(c for c in word if unicodedata.category(c) != 'Cc')
        for word in q.split()
    )
    return ' '.join(word for word in words if word)
|
||||||
|
|
||||||
|
def build_input_addresses(df):
    """Build the set of normalized address strings from the input frame.

    Each row contributes one lower-cased string made of the columns
    Straße, Hausnummer, Zusatz, PLZ and Stadt (in that order), with runs of
    whitespace collapsed to single spaces. A missing column contributes an
    empty string; cell values are converted with ``str()`` as-is.
    """
    column_order = ('Straße', 'Hausnummer', 'Zusatz', 'PLZ', 'Stadt')
    return {
        ' '.join(
            ' '.join(str(row.get(column, '')).strip() for column in column_order)
            .lower()
            .split()
        )
        for _, row in df.iterrows()
    }
|
||||||
|
|
||||||
|
def normalize_address(addr):
    """Return ``addr`` encoding-repaired, lower-cased and whitespace-collapsed.

    Anything that is not a string normalizes to the empty string.
    """
    if isinstance(addr, str):
        repaired = fix_encoding(addr)
        return ' '.join(repaired.lower().strip().split())
    return ''
|
||||||
|
|
||||||
|
def address_in_input(result_addr, input_addresses):
    """Tell whether a scraped address matches any of the input addresses.

    An input address matches when its first five-digit number (the postal
    code) occurs in the normalized result address and the first four
    characters of its first token also occur there. First tokens of three
    characters or fewer never match.
    """
    haystack = normalize_address(result_addr)

    def _matches(candidate):
        found = re.search(r'\b\d{5}\b', candidate)
        if found is None or found.group() not in haystack:
            return False
        first_token = candidate.split()[0] if candidate else ''
        return len(first_token) > 3 and first_token[:4].lower() in haystack

    return any(_matches(candidate) for candidate in input_addresses)
|
||||||
|
|
||||||
|
def format_eta(seconds):
    """Format a duration in seconds as a short ETA string.

    Below one minute the result is ``"<n>s"``; below one hour ``"<m>min"``;
    otherwise ``"<h>h <mm>min"``. Fractions of a second are truncated.
    """
    if seconds < 60:
        return f"{int(seconds)}s"
    hours, minutes = divmod(int(seconds) // 60, 60)
    if hours > 0:
        return f"{hours}h {minutes:02d}min"
    return f"{minutes}min"
|
||||||
|
|
||||||
|
# ──────────────────────────────────────────────
|
||||||
|
# Fix 3: Scraper-Neustart bei Inactivity
|
||||||
|
# ──────────────────────────────────────────────
|
||||||
|
|
||||||
|
def restart_scraper(scraper_url):
    """Restart the Docker container that serves ``scraper_url``.

    The container name is taken from the URL's host part
    (``http://gmaps-scraper-1:8080`` -> ``gmaps-scraper-1``) and restarted
    with ``docker restart``. After a successful restart the function waits
    15 seconds before returning.

    Returns:
        True when ``docker restart`` exited with code 0, otherwise False
        (non-zero exit code, timeout, missing docker binary, malformed URL).
        Failures are printed, never raised.
    """
    try:
        import subprocess
        # Derive the container name from the URL: http://gmaps-scraper-1:8080 -> gmaps-scraper-1
        container = scraper_url.split("//")[1].split(":")[0]
        print(f"🔄 Starte {container} neu...")
        result = subprocess.run(["docker", "restart", container], timeout=30, capture_output=True)
        # A non-zero exit code means the restart did not happen; report it
        # instead of claiming success and waiting.
        if result.returncode != 0:
            detail = result.stderr.decode('utf-8', errors='replace').strip()
            print(f"⚠️ Scraper-Neustart fehlgeschlagen: {detail or result.returncode}")
            return False
        print(f"✅ {container} neu gestartet – warte 15s...")
        time.sleep(15)
        return True
    except Exception as e:
        print(f"⚠️ Scraper-Neustart fehlgeschlagen: {e}")
        return False
|
||||||
|
|
||||||
|
# ──────────────────────────────────────────────
|
||||||
|
# Resume: Progress-File Hilfsfunktionen
|
||||||
|
# ──────────────────────────────────────────────
|
||||||
|
|
||||||
|
def get_progress_path(job_id):
    """Return the path of the JSON progress file for ``job_id`` in RESULT_FOLDER."""
    file_name = f"progress_{job_id}.json"
    return os.path.join(RESULT_FOLDER, file_name)
|
||||||
|
|
||||||
|
def get_partial_path(job_id, suffix):
    """Return the path of the partial-result CSV for ``job_id`` and ``suffix``."""
    file_name = f"partial_{job_id}_{suffix}.csv"
    return os.path.join(RESULT_FOLDER, file_name)
|
||||||
|
|
||||||
|
def load_progress(job_id):
    """Load the saved progress for ``job_id``.

    Returns the parsed JSON content of the progress file, or None when no
    progress file exists. The resume point is printed when a file is found.
    """
    progress_file = get_progress_path(job_id)
    if not os.path.exists(progress_file):
        return None
    with open(progress_file, 'r') as handle:
        state = json.load(handle)
    print(f"🔁 RESUME: ab Batch {state['last_completed_batch'] + 1}/{state['total_batches']}")
    return state
|
||||||
|
|
||||||
|
def save_progress(job_id, last_completed_batch, total_batches):
    """Persist the index of the last processed batch for ``job_id``.

    The JSON is written to a temporary sibling file first and then moved
    over the real progress file with ``os.replace``, so an interruption
    during the write cannot leave a truncated progress file behind.
    """
    path = get_progress_path(job_id)
    tmp_path = f"{path}.tmp"
    with open(tmp_path, 'w') as f:
        json.dump({"last_completed_batch": last_completed_batch, "total_batches": total_batches}, f)
    os.replace(tmp_path, path)
|
||||||
|
|
||||||
|
def append_partial(job_id, df_filtered, df_raw):
    """Append one batch's frames to the job's partial CSV files.

    ``df_filtered`` goes to the 'filtered' partial file and ``df_raw`` to the
    'raw' one; a frame that is None is skipped. The header row is written
    only when the target file does not exist yet.
    """
    frames_by_suffix = {'filtered': df_filtered, 'raw': df_raw}
    for suffix, frame in frames_by_suffix.items():
        if frame is not None:
            target = get_partial_path(job_id, suffix)
            write_header = not os.path.exists(target)
            frame.to_csv(
                target,
                mode='a',
                index=False,
                header=write_header,
                encoding='utf-8-sig',
                sep=';',
            )
|
||||||
|
|
||||||
|
def load_partial(job_id):
    """Load the partial CSV files that already exist for ``job_id``.

    Returns a tuple ``(filtered_frames, raw_frames)``. Each list holds at
    most one DataFrame and is empty when the corresponding file is missing
    or cannot be read (read errors are printed).
    """
    loaded = {'filtered': [], 'raw': []}
    for suffix, bucket in loaded.items():
        partial_file = get_partial_path(job_id, suffix)
        if not os.path.exists(partial_file):
            continue
        try:
            frame = pd.read_csv(partial_file, sep=';', encoding='utf-8-sig')
            bucket.append(frame)
            print(f"📂 Partial {suffix}: {len(frame)} Zeilen geladen")
        except Exception as exc:
            print(f"⚠️ Partial {suffix} Ladefehler: {exc}")
    return loaded['filtered'], loaded['raw']
|
||||||
|
|
||||||
|
def cleanup_progress(job_id):
    """Delete the progress file and both partial CSVs of ``job_id`` if present."""
    leftovers = [get_progress_path(job_id)]
    leftovers += [get_partial_path(job_id, suffix) for suffix in ('filtered', 'raw')]
    for leftover in leftovers:
        if os.path.exists(leftover):
            os.remove(leftover)
|
||||||
|
|
||||||
|
# ──────────────────────────────────────────────
|
||||||
|
# CSV Nachbearbeitung
|
||||||
|
# ──────────────────────────────────────────────
|
||||||
|
|
||||||
|
def process_result_csv(raw_bytes, input_df, apply_filter=True):
    """Turn the scraper's raw CSV bytes into a cleaned DataFrame.

    Steps: decode as UTF-8 (undecodable bytes are replaced), keep only the
    OUTPUT_COLS that are present, repair mojibake in every cell, optionally
    keep only rows whose address matches an address from ``input_df``, then
    drop duplicate (title, address) pairs and rows with a missing or blank
    title.

    Returns the cleaned frame, or None when any step raises (the error is
    printed).
    """
    try:
        frame = pd.read_csv(StringIO(raw_bytes.decode('utf-8', errors='replace')))
        print(f"📄 Raw result: {frame.shape}")

        frame = frame[[name for name in OUTPUT_COLS if name in frame.columns]]
        for name in frame.columns:
            frame[name] = frame[name].apply(fix_encoding)

        if apply_filter:
            known_addresses = build_input_addresses(input_df)
            rows_before = len(frame)
            keep = frame['address'].apply(
                lambda value: address_in_input(value, known_addresses)
            )
            frame = frame[keep]
            print(f"📍 Filter: {rows_before} → {len(frame)}")

        frame = frame.drop_duplicates(subset=['title', 'address'], keep='first')
        frame = frame.dropna(subset=['title'], how='all')
        has_title = frame['title'].str.strip().astype(bool)
        frame = frame[has_title]

        print(f"✅ Final ({'gefiltert' if apply_filter else 'alle'}): {frame.shape}")
        return frame
    except Exception as exc:
        print(f"💥 process_result_csv: {exc}")
        return None
|
||||||
|
|
||||||
|
# ──────────────────────────────────────────────
|
||||||
|
# HAUPT-WORKER
|
||||||
|
# ──────────────────────────────────────────────
|
||||||
|
|
||||||
|
def process_file(filename, job_id, app):
    """Run one resumable scrape job for an uploaded CSV and store the results.

    Inside ``app``'s application context the function loads the Job row,
    builds one cleaned search query per input row, and submits the queries in
    batches of BATCH_SIZE, alternating between the entries of SCRAPER_URLS.
    Each batch is attempted up to MAX_RETRIES times; after every batch the
    progress file and partial CSVs are updated so a later run with the same
    ``job_id`` continues after the last recorded batch. Finally the filtered
    and unfiltered results are merged and written to RESULT_FOLDER.

    Args:
        filename: Name of the uploaded CSV inside UPLOAD_FOLDER
            (``;``-separated, ISO-8859-1).
        job_id: Primary key of the Job row to update; also keys the
            progress/partial files.
        app: Flask application used to open the application context.

    Any exception is caught, printed with a traceback, and recorded as
    ``Failed: ...`` in ``job.status``; nothing is raised to the caller.
    """
    print(f"🎯 {filename} Job#{job_id} START!")

    with app.app_context():
        job = Job.query.get(job_id)
        if not job:
            print("❌ Job missing")
            return

        try:
            # Parse the upload and build ALL queries.
            job.status = "📊 parsing CSV"
            db.session.commit()

            filepath = os.path.join(UPLOAD_FOLDER, filename)
            print(f"📁 {filepath} | {os.path.getsize(filepath)}b")

            df_input = pd.read_csv(filepath, sep=';', encoding='ISO-8859-1')
            print(f"📊 {df_input.shape}")

            queries = []
            for _, row in df_input.iterrows():
                parts = [
                    str(row.get('PLZ', '')).strip(),
                    str(row.get('Stadt', '')).strip(),
                    str(row.get('Straße', '')).strip(),
                    str(row.get('Hausnummer', '')).strip(),
                    str(row.get('Zusatz', '')).strip(),
                ]
                # str() turns missing cells into 'nan'; drop those and blanks.
                q = f"Firmen {' '.join(p for p in parts if p and p != 'nan')}".strip()
                q = clean_query(q) # Fix 1: strip special/control characters
                # Skip queries that are too short to contain an address.
                if len(q) > 10:
                    queries.append(q)

            total_queries = len(queries)
            print(f"🔍 {total_queries} Queries | Samples: {queries[:3]}")
            if total_queries == 0:
                raise ValueError("Keine gültigen Adressen")

            # Batched processing (ceiling division).
            batches = (total_queries + BATCH_SIZE - 1) // BATCH_SIZE

            # Resume: load saved progress if there is any.
            os.makedirs(RESULT_FOLDER, exist_ok=True)
            progress = load_progress(job_id)
            start_batch = progress['last_completed_batch'] + 1 if progress else 0
            all_results_filtered, all_results_raw = load_partial(job_id) if progress else ([], [])

            # Rough initial estimate: (BATCH_DELAY_MAX + MAX_TIME) / 2 seconds per batch.
            eta_initial = format_eta((batches - start_batch) * ((BATCH_DELAY_MAX + MAX_TIME) / 2))
            print(f"📦 {batches} Batches à {BATCH_SIZE} | 2x Scraper | Start: {start_batch} | ETA: ~{eta_initial}")
            job_start_time = time.time()
            job.status = f"🔄 Batch {start_batch+1}/{batches} | ⏱️ ~{eta_initial}"
            db.session.commit()

            for batch_idx in range(start_batch, batches):
                batch_start = batch_idx * BATCH_SIZE
                batch_end = min(batch_start + BATCH_SIZE, total_queries)
                batch_queries = queries[batch_start:batch_end]

                # Two scrapers: use them alternately.
                scraper_url = SCRAPER_URLS[batch_idx % len(SCRAPER_URLS)]
                print(f"\n🔄 BATCH {batch_idx+1}/{batches} ({batch_start+1}-{batch_end}/{total_queries}) → {scraper_url}")

                # Random pause between batches.
                delay = random.uniform(BATCH_DELAY_MIN, BATCH_DELAY_MAX)
                print(f"😴 Delay: {delay:.0f}s")
                time.sleep(delay)

                # Job description sent to the scraper API.
                payload = {
                    "name": f"{filename.replace('.csv','')}-{job_id}-B{batch_idx+1:03d}",
                    "keywords": batch_queries,
                    "lang": "de",
                    "depth": 1,
                    "zoom": 15,
                    "radius": 50,
                    "max_time": MAX_TIME,
                    "fast_mode": False,
                    "proxies": [PROXY_URL]
                }

                batch_success = False
                # Fix 2: retry logic on scraper errors.
                for attempt in range(1, MAX_RETRIES + 1):
                    try:
                        resp = requests.post(f"{scraper_url}/api/v1/jobs", json=payload, timeout=45)
                        print(f"📤 {resp.status_code} (Versuch {attempt} | {scraper_url})")

                        # A blocked response ends the retries for this batch.
                        if is_blocked(resp.text):
                            print("🚫 Batch übersprungen (blocked)")
                            break
                        if resp.status_code != 201:
                            print(f"⚠️ Batch {batch_idx+1} fehlgeschlagen: {resp.text[:100]}")
                            if attempt < MAX_RETRIES:
                                time.sleep(10)
                            continue

                        scraper_id = resp.json()['id']
                        print(f"✅ Scraper: {scraper_id}")

                        stuck_counter = 0
                        for poll_i in range(1, POLL_MAX + 1):
                            r = requests.get(f"{scraper_url}/api/v1/jobs/{scraper_id}", timeout=15)
                            data = r.json()
                            # The status key is looked up in both spellings.
                            status = data.get('Status', data.get('status', '?'))
                            print(f"⏳ Poll {poll_i}: {status}")

                            # Fix 4: auto-recovery when the job is stuck on 'pending'.
                            if status == 'pending':
                                stuck_counter += 1
                                if stuck_counter >= STUCK_THRESHOLD:
                                    print(f"⚠️ Job {scraper_id} hängt – abbrechen + Neustart")
                                    requests.delete(f"{scraper_url}/api/v1/jobs/{scraper_id}", timeout=10)
                                    restart_scraper(scraper_url) # Fix 3: restart only the affected scraper
                                    break
                            else:
                                stuck_counter = 0

                            if status in ('ok', 'completed', 'scraped'):
                                dl = requests.get(f"{scraper_url}/api/v1/jobs/{scraper_id}/download", timeout=90)
                                if dl.status_code == 200:
                                    df_filtered = process_result_csv(dl.content, df_input, True)
                                    df_raw = process_result_csv(dl.content, df_input, False)
                                    if df_filtered is not None:
                                        all_results_filtered.append(df_filtered)
                                        all_results_raw.append(df_raw)
                                        append_partial(job_id, df_filtered, df_raw) # Resume: persist immediately
                                        print(f"📊 Batch {batch_idx+1}: {len(df_filtered)} filtered")
                                        batch_success = True
                                break

                            # Fix 2: scraper error -> leave the poll loop so the next attempt runs.
                            elif status in ('failed', 'error'):
                                print(f"💥 Batch {batch_idx+1}: {status} (Versuch {attempt})")
                                if attempt < MAX_RETRIES:
                                    time.sleep(10)
                                break

                            time.sleep(random.uniform(POLL_DELAY_MIN, POLL_DELAY_MAX))

                        if batch_success:
                            break

                    except Exception as e:
                        print(f"💥 Batch {batch_idx+1} Versuch {attempt}: {e}")
                        if attempt < MAX_RETRIES:
                            time.sleep(10)

                # Resume: save progress after every batch.
                # NOTE(review): this runs whether or not the batch succeeded, so a
                # failed batch is not repeated on resume — confirm this is intended.
                save_progress(job_id, batch_idx, batches)

                # Compute the ETA from the average duration of the batches run so far.
                elapsed = time.time() - job_start_time
                done_so_far = batch_idx - start_batch + 1
                if done_so_far > 0:
                    avg_per_batch = elapsed / done_so_far
                    remaining = (batches - batch_idx - 1) * avg_per_batch
                    eta_str = format_eta(remaining)
                else:
                    eta_str = "?"

                # NOTE(review): after the last batch this reads "Batch N+1/N" until
                # the "merging results" status below replaces it.
                job.status = f"🔄 Batch {batch_idx+2}/{batches} | ⏱️ ~{eta_str}"
                db.session.commit()

            # Merge and save.
            job.status = "🔧 merging results"
            db.session.commit()

            base = filename.replace('.csv', '')

            if all_results_filtered:
                df_final_filtered = pd.concat(all_results_filtered, ignore_index=True)
                df_final_filtered = df_final_filtered.drop_duplicates(subset=['title', 'address'])

                out_filtered = f"results_{base}_filtered.csv"
                df_final_filtered.to_csv(
                    os.path.join(RESULT_FOLDER, out_filtered),
                    index=False, encoding='utf-8-sig', sep=';'
                )

                if all_results_raw:
                    df_final_raw = pd.concat(all_results_raw, ignore_index=True)
                    out_raw = f"results_{base}_all.csv"
                    df_final_raw.to_csv(
                        os.path.join(RESULT_FOLDER, out_raw),
                        index=False, encoding='utf-8-sig', sep=';'
                    )

                job.result_filename = out_filtered
                job.result_filename_raw = out_raw
                job.status = f"✅ Fertig: {len(df_final_filtered)} Firmen"

                # Resume: clean up after completion.
                # NOTE(review): progress/partial files are only removed on this
                # success path; they remain after "Keine Ergebnisse" or a failure.
                cleanup_progress(job_id)
            else:
                job.status = "❌ Keine Ergebnisse"

            db.session.commit()
            print(f"🎉 Job {job_id} komplett!")

        except Exception as e:
            job.status = f"Failed: {str(e)[:50]}"
            print(f"💥 FATAL: {e}")
            import traceback
            traceback.print_exc()
            db.session.commit()

        print(f"✅ DONE! Status: {job.status}")
|
||||||
138
app/webcrawler.orig
Normal file
138
app/webcrawler.orig
Normal file
|
|
@ -0,0 +1,138 @@
|
||||||
|
import csv
|
||||||
|
import os
|
||||||
|
import requests
|
||||||
|
from .models import db, Job
|
||||||
|
from flask import current_app
|
||||||
|
|
||||||
|
# Folders (relative to the working directory) for input and result CSVs.
UPLOAD_FOLDER = 'uploads'
RESULT_FOLDER = 'results'

# Google Maps API key used by the geocode / places / details requests.
# It can be overridden with the GOOGLE_MAPS_API_KEY environment variable; the
# literal is kept only as a backward-compatible fallback.
# NOTE(review): a key committed to source control should be rotated and this
# fallback removed once deployments set the environment variable.
API_KEY = os.environ.get('GOOGLE_MAPS_API_KEY', 'AIzaSyAIf0yXJTwo87VMWLBtq2m2LqE-OaPGbzw')

# Company names already emitted; module-level, so shared by all jobs in this process.
processed_companies = set()
|
||||||
|
|
||||||
|
def get_geocode(address):
    """Look up the coordinates of ``address`` via the Google Geocoding API.

    Returns ``(lat, lng)`` of the first result when the HTTP status is 200
    and the API status is 'OK'; otherwise ``(None, None)``. Request errors
    are printed and also yield ``(None, None)``.
    """
    endpoint = "https://maps.googleapis.com/maps/api/geocode/json"
    query = {'address': address, 'key': API_KEY}
    not_found = (None, None)

    try:
        reply = requests.get(endpoint, params=query, timeout=5)
        if reply.status_code != 200:
            return not_found
        body = reply.json()
        if body['status'] != 'OK':
            return not_found
        coords = body['results'][0]['geometry']['location']
        return coords['lat'], coords['lng']
    except requests.RequestException as err:
        print(f"Geocode API Fehler für {address}: {err}")
    return not_found
|
||||||
|
|
||||||
|
def get_nearby_places(lat, lng):
    """Fetch points of interest within a radius of 10 around ``lat``/``lng``.

    Returns the ``results`` list of the Places Nearby Search response when
    the HTTP status is 200, otherwise an empty list. Request errors are
    printed and also yield an empty list.
    """
    endpoint = "https://maps.googleapis.com/maps/api/place/nearbysearch/json"
    query = {
        'location': f"{lat},{lng}",
        'radius': 10,
        'type': 'point_of_interest',
        'key': API_KEY
    }

    try:
        reply = requests.get(endpoint, params=query, timeout=5)
        if reply.status_code != 200:
            return []
        return reply.json().get('results', [])
    except requests.RequestException as err:
        print(f"Nearby Places API Fehler für Standort {lat},{lng}: {err}")
    return []
|
||||||
|
|
||||||
|
def get_place_details(place_id):
    """Fetch phone number and website of a place via the Place Details API.

    Returns ``(phone, website)``; each value falls back to ``'N/A'`` when it
    is missing from the response. A non-200 reply or a request error (which
    is printed) yields ``('N/A', 'N/A')``.
    """
    endpoint = "https://maps.googleapis.com/maps/api/place/details/json"
    query = {
        'place_id': place_id,
        'fields': 'formatted_phone_number,website',
        'key': API_KEY
    }
    fallback = ('N/A', 'N/A')

    try:
        reply = requests.get(endpoint, params=query, timeout=5)
        if reply.status_code != 200:
            return fallback
        details = reply.json().get('result', {})
        phone = details.get('formatted_phone_number', 'N/A')
        website = details.get('website', 'N/A')
        return phone, website
    except requests.RequestException as err:
        print(f"Place Details API Fehler für Place ID {place_id}: {err}")
    return fallback
|
||||||
|
|
||||||
|
def process_file(filename, job_id, app):
    """Enrich an uploaded address CSV with nearby companies and store the result.

    Reads ``UPLOAD_FOLDER/filename`` (``;``-separated, ISO-8859-1), geocodes
    every complete address row, looks up nearby places and their contact
    details, and writes ``results_<name>.csv`` into ``RESULT_FOLDER``. The
    job's status moves to "In Progress" and then to "Completed" or "Failed".

    Companies are de-duplicated by name within this one run; the set is local
    so that one job can no longer suppress companies in a later job.

    Args:
        filename: Name of the uploaded CSV inside ``UPLOAD_FOLDER``.
        job_id: Primary key of the ``Job`` row tracking this run.
        app: Flask application whose app context is used for DB access.

    Returns:
        None. Outcomes are reported through the job row and the result file.
    """
    required_columns = ('PLZ', 'Straße', 'Hausnummer')
    result_columns = [
        'PLZ', 'Stadt', 'Straße', 'Hausnummer', 'Zusatz',
        'Company Name', 'Company Address', 'Company Phone', 'Company Website',
    ]

    def _cell(row, key):
        # DictReader yields None for cells missing from a short row.
        return (row.get(key) or '').strip()

    with app.app_context():
        job = Job.query.get(job_id)
        if not job:
            print("Job wurde abgebrochen.")
            return
        job.status = "In Progress"
        db.session.commit()

        filepath = os.path.join(UPLOAD_FOLDER, filename)
        results = []
        seen_companies = set()

        try:
            with open(filepath, newline='', encoding='ISO-8859-1') as csvfile:
                reader = csv.DictReader(csvfile, delimiter=';')
                headers = reader.fieldnames or []  # None for an empty file

                if not all(field in headers for field in required_columns):
                    print("CSV-Datei enthält nicht alle notwendigen Spalten.")
                    job.status = "Failed"
                    db.session.commit()
                    return

                for row in reader:
                    plz = _cell(row, 'PLZ')
                    # 'Bezirk' is only consulted when there is no 'Stadt' column.
                    city = _cell(row, 'Stadt' if 'Stadt' in row else 'Bezirk')
                    street = _cell(row, 'Straße')
                    house_number = _cell(row, 'Hausnummer')
                    additional = _cell(row, 'Zusatz')

                    if not all([plz, city, street, house_number]):
                        continue

                    full_address = f"{street} {house_number} {additional}, {plz} {city}"
                    lat, lng = get_geocode(full_address)
                    if lat is None or lng is None:
                        continue

                    for place in get_nearby_places(lat, lng):
                        company_name = place.get('name')
                        if not company_name or company_name in seen_companies:
                            continue
                        seen_companies.add(company_name)

                        company_address = place.get('vicinity', 'N/A').split(',')[0]
                        place_id = place.get('place_id')
                        if place_id:
                            company_phone, company_website = get_place_details(place_id)
                        else:
                            company_phone, company_website = 'N/A', 'N/A'

                        results.append({
                            'PLZ': plz,
                            'Stadt': city,
                            'Straße': street,
                            'Hausnummer': house_number,
                            'Zusatz': additional,
                            'Company Name': company_name,
                            'Company Address': company_address,
                            'Company Phone': company_phone,
                            'Company Website': company_website,
                        })

            if results:
                result_file = f"results_{os.path.splitext(filename)[0]}.csv"
                result_path = os.path.join(RESULT_FOLDER, result_file)
                with open(result_path, 'w', newline='', encoding='utf-8-sig') as csvfile:
                    writer = csv.DictWriter(csvfile, fieldnames=result_columns)
                    writer.writeheader()
                    writer.writerows(results)
                job.status = "Completed"
                job.result_filename = result_file
            else:
                job.status = "Failed"
            db.session.commit()
        except Exception as e:
            # Without this the job would stay "In Progress" forever after an
            # unreadable file, a malformed row or a write error.
            print(f"Verarbeitung fehlgeschlagen: {e}")
            db.session.rollback()
            job.status = "Failed"
            db.session.commit()
|
||||||
|
|
@ -1,138 +1,487 @@
|
||||||
import csv
|
|
||||||
import os
|
import os
|
||||||
|
import re
|
||||||
|
import unicodedata
|
||||||
|
import json
|
||||||
|
import threading
|
||||||
|
import pandas as pd
|
||||||
import requests
|
import requests
|
||||||
from .models import db, Job
|
import time
|
||||||
from flask import current_app
|
import random
|
||||||
|
from io import StringIO
|
||||||
|
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||||||
|
from app.models import db, Job
|
||||||
|
|
||||||
UPLOAD_FOLDER = 'uploads'
|
print("🆕 MODERN webcrawler LOADED! – BATCHED + PROXY + RESUME + ETA + 4x SCRAPER CHUNK-PARALLEL")
|
||||||
RESULT_FOLDER = 'results'
|
|
||||||
|
|
||||||
API_KEY = 'AIzaSyAIf0yXJTwo87VMWLBtq2m2LqE-OaPGbzw'
|
UPLOAD_FOLDER = '/app/uploads'
|
||||||
|
RESULT_FOLDER = '/app/results'
|
||||||
|
|
||||||
processed_companies = set()
|
SCRAPER_URLS = [
|
||||||
|
"http://gmaps-scraper-1:8080",
|
||||||
|
"http://gmaps-scraper-2:8080",
|
||||||
|
"http://gmaps-scraper-3:8080",
|
||||||
|
"http://gmaps-scraper-4:8080",
|
||||||
|
]
|
||||||
|
|
||||||
def get_geocode(address):
|
OUTPUT_COLS = ['title', 'category', 'address', 'open_hours', 'website', 'phone', 'link']
|
||||||
url = f"https://maps.googleapis.com/maps/api/geocode/json"
|
|
||||||
params = {'address': address, 'key': API_KEY}
|
|
||||||
|
|
||||||
|
# SECURITY: these proxy credentials are committed to source control. They
# should be rotated and supplied via the SCRAPER_PROXY_URL environment
# variable; the literal is kept only as a backward-compatible fallback.
PROXY_URL = os.environ.get(
    "SCRAPER_PROXY_URL",
    "http://bitlleuv-rotate:s5hzse6hz74b@p.webshare.io:80",
)
# requests-style proxy mapping: the same proxy for both schemes.
API_PROXIES = {"http": PROXY_URL, "https": PROXY_URL}
|
||||||
|
|
||||||
|
_job_semaphore = threading.Semaphore(1)
|
||||||
|
|
||||||
|
# ──────────────────────────────────────────────
|
||||||
|
# Tuning
|
||||||
|
# ──────────────────────────────────────────────
|
||||||
|
BATCH_SIZE = 30 # Keywords pro Scraper-Job
|
||||||
|
BATCH_DELAY_MIN = 3 # Sekunden Pause zwischen Chunks (min)
|
||||||
|
BATCH_DELAY_MAX = 6 # Sekunden Pause zwischen Chunks (max)
|
||||||
|
MAX_TIME = 60 # Sekunden die der Scraper pro Batch hat
|
||||||
|
POLL_MAX = 90 # Max. Poll-Versuche pro Batch
|
||||||
|
POLL_DELAY_MIN = 2 # Sekunden zwischen Polls (min)
|
||||||
|
POLL_DELAY_MAX = 5 # Sekunden zwischen Polls (max)
|
||||||
|
STUCK_TIMEOUT = 300 # Sekunden bis Scraper-Neustart (5 Min)
|
||||||
|
MAX_RETRIES = 2 # Wiederholversuche pro Batch bei Fehler
|
||||||
|
PARALLEL_WORKERS = len(SCRAPER_URLS)
|
||||||
|
|
||||||
|
_partial_lock = threading.Lock()
|
||||||
|
|
||||||
|
# ──────────────────────────────────────────────
|
||||||
|
# Hilfsfunktionen
|
||||||
|
# ──────────────────────────────────────────────
|
||||||
|
|
||||||
|
def is_blocked(data):
    """Heuristically decide whether a response indicates blocking/rate limiting.

    The stringified, lower-cased ``data`` is searched for the phrases
    'captcha', 'blocked', 'rate limit' and 'too many', and for the status
    code 429 as a stand-alone number. The number must not be embedded in a
    longer alphanumeric token, so identifiers such as UUIDs that merely
    contain the digits "429" no longer count as blocked.

    Args:
        data: Any object; it is converted with ``str()`` before matching.

    Returns:
        True when a blocking indicator is found (a notice is printed),
        otherwise False.
    """
    text = str(data).lower()
    phrases = ('captcha', 'blocked', 'rate limit', 'too many')
    has_phrase = any(kw in text for kw in phrases)
    has_429 = re.search(r'(?<![0-9a-z])429(?![0-9a-z])', text) is not None
    blocked = has_phrase or has_429
    if blocked:
        print(f"🚫 BLOCKED: {str(data)[:100]}")
    return blocked
|
||||||
|
|
||||||
|
def fix_encoding(text):
|
||||||
|
if not isinstance(text, str):
|
||||||
|
return text
|
||||||
try:
|
try:
|
||||||
response = requests.get(url, params=params, timeout=5)
|
return text.encode('latin-1').decode('utf-8')
|
||||||
if response.status_code == 200:
|
except (UnicodeEncodeError, UnicodeDecodeError):
|
||||||
data = response.json()
|
return text
|
||||||
if data['status'] == 'OK':
|
|
||||||
location = data['results'][0]['geometry']['location']
|
def clean_query(q):
|
||||||
return location['lat'], location['lng']
|
q = ''.join(c for c in q if unicodedata.category(c) != 'Cc')
|
||||||
except requests.RequestException as e:
|
q = ' '.join(q.split())
|
||||||
print(f"Geocode API Fehler für {address}: {e}")
|
return q.strip()
|
||||||
|
|
||||||
|
def build_input_addresses(df):
    """Build the set of normalised input addresses used for result filtering.

    Each row contributes one lower-cased, whitespace-collapsed string of the
    form ``"<Straße> <Hausnummer> <Zusatz> <PLZ> <Stadt>"``. Missing cells are
    left out instead of appearing as the text "nan", numbers that were parsed
    as floats lose their ".0" tail, and numeric PLZ values shorter than five
    digits are zero-padded (a leading zero is lost when the column is parsed
    as a number).

    Args:
        df: DataFrame holding the columns 'PLZ', 'Stadt', 'Straße',
            'Hausnummer' and 'Zusatz'; absent columns are treated as empty.

    Returns:
        A set of address strings; rows without any content are skipped.
    """
    addresses = set()
    for _, row in df.iterrows():
        plz = _address_cell(row.get('PLZ', ''))
        if plz.isdigit() and len(plz) < 5:
            plz = plz.zfill(5)
        stadt = _address_cell(row.get('Stadt', ''))
        street = _address_cell(row.get('Straße', ''))
        nr = _address_cell(row.get('Hausnummer', ''))
        zusatz = _address_cell(row.get('Zusatz', ''))

        full = ' '.join(f"{street} {nr} {zusatz} {plz} {stadt}".lower().split())
        if full:
            addresses.add(full)
    return addresses


def _address_cell(value):
    """Render one cell as text: '' for missing values, no float '.0' suffix."""
    if value is None or (not isinstance(value, str) and pd.isna(value)):
        return ''
    text = str(value).strip()
    if re.fullmatch(r'\d+\.0', text):
        text = text[:-2]
    return text
|
||||||
|
|
||||||
|
def normalize_address(addr):
    """Return ``addr`` encoding-repaired, lower-cased and whitespace-collapsed.

    Anything that is not a string yields the empty string.
    """
    if isinstance(addr, str):
        repaired = fix_encoding(addr)
        tokens = repaired.lower().strip().split()
        return ' '.join(tokens)
    return ''
|
||||||
|
|
||||||
|
def address_in_input(result_addr, input_addresses):
    """Check whether a scraped address corresponds to one of the input addresses.

    An input address matches when all of the following hold against the
    normalised result address:

    * its five-digit PLZ occurs in the result,
    * the first five characters of its street name occur in the result
      (street names shorter than four characters are never matched),
    * its house number, if it has one, occurs as a stand-alone token.

    The street is everything before the first token that starts with a digit,
    so multi-word street names such as "am markt" or "berliner straße" are
    handled; that first digit-led token is the house number.

    Args:
        result_addr: Address string from the scraper output (non-strings never
            match).
        input_addresses: Iterable of strings laid out as
            ``"<street> <number> <extra> <plz> <city>"``.

    Returns:
        True if at least one input address matches, otherwise False.
    """
    norm = normalize_address(result_addr)
    if not norm:
        return False

    for inp_addr in input_addresses:
        plz_match = re.search(r'\b\d{5}\b', inp_addr)
        if not plz_match:
            continue
        if plz_match.group() not in norm:
            continue

        # Street and house number live in front of the PLZ. "nan" is what a
        # missing cell turns into when it is stringified upstream.
        tokens = [t for t in inp_addr[:plz_match.start()].split() if t != 'nan']
        nr_idx = next((i for i, tok in enumerate(tokens) if tok[0].isdigit()), None)
        if nr_idx is None:
            street, hausnr = ' '.join(tokens), ''
        else:
            street, hausnr = ' '.join(tokens[:nr_idx]), tokens[nr_idx]

        if len(street) < 4 or street[:5].lower() not in norm:
            continue

        if hausnr:
            if re.fullmatch(r'\d+\.0', hausnr):
                hausnr = hausnr[:-2]  # number that was parsed as a float
            # Lookarounds instead of \b so that numbers ending in punctuation
            # ("17.") still match as whole tokens.
            if not re.search(rf'(?<!\w){re.escape(hausnr)}(?!\w)', norm):
                continue

        return True
    return False
|
||||||
|
|
||||||
|
def format_eta(seconds):
    """Format a duration in seconds as a short human-readable ETA.

    Args:
        seconds: Remaining time in seconds (int or float). Negative values,
            which can result from stale resume data, are treated as zero.

    Returns:
        ``"<n>s"`` below one minute, ``"<m>min"`` below one hour, otherwise
        ``"<h>h <mm>min"``. Fractions are truncated.
    """
    total = max(0, int(seconds))
    if total < 60:
        return f"{total}s"
    hours, rest = divmod(total, 3600)
    minutes = rest // 60
    if hours > 0:
        return f"{hours}h {minutes:02d}min"
    return f"{minutes}min"
|
||||||
|
|
||||||
|
# ──────────────────────────────────────────────
|
||||||
|
# Scraper-Job Cleanup
|
||||||
|
# ──────────────────────────────────────────────
|
||||||
|
|
||||||
|
def _cleanup_scraper_job(scraper_url, scraper_id):
    """Delete a job on a scraper instance once it is no longer needed.

    Best effort: failures are only logged, never raised. Success is reported
    only when the scraper answers with a non-error HTTP status.

    Args:
        scraper_url: Base URL of the scraper instance.
        scraper_id: Identifier of the job to delete.
    """
    try:
        resp = requests.delete(f"{scraper_url}/api/v1/jobs/{scraper_id}", timeout=10)
        if resp.ok:
            print(f"🗑️ Scraper-Job {scraper_id} gelöscht")
        else:
            print(f"⚠️ Cleanup fehlgeschlagen: HTTP {resp.status_code}")
    except Exception as e:
        print(f"⚠️ Cleanup fehlgeschlagen: {e}")
|
||||||
|
|
||||||
|
# ──────────────────────────────────────────────
|
||||||
|
# Scraper-Neustart via Docker SDK
|
||||||
|
# ──────────────────────────────────────────────
|
||||||
|
|
||||||
|
def restart_scraper(scraper_url):
    """Restart the Docker container that backs a scraper URL.

    The container name is taken from the host part of ``scraper_url``. After
    a successful restart the function sleeps 15 seconds before returning.

    Returns:
        True when the restart call succeeded, False on any error.
    """
    try:
        import docker

        host_and_port = scraper_url.split("//")[1]
        container_name = host_and_port.split(":")[0]
        print(f"🔄 Starte {container_name} neu...")
        docker.from_env().containers.get(container_name).restart()
        print(f"✅ {container_name} neu gestartet – warte 15s...")
        time.sleep(15)
    except Exception as e:
        print(f"⚠️ Scraper-Neustart fehlgeschlagen: {e}")
        return False
    return True
|
||||||
|
|
||||||
|
# ──────────────────────────────────────────────
|
||||||
|
# Resume: Progress-File Hilfsfunktionen
|
||||||
|
# ──────────────────────────────────────────────
|
||||||
|
|
||||||
|
def get_progress_path(job_id):
    """Return the path of the JSON progress file for ``job_id`` in RESULT_FOLDER."""
    progress_name = f"progress_{job_id}.json"
    return os.path.join(RESULT_FOLDER, progress_name)
|
||||||
|
|
||||||
|
def get_partial_path(job_id, suffix):
    """Return the path of the partial-result CSV for ``job_id`` and ``suffix``."""
    partial_name = f"partial_{job_id}_{suffix}.csv"
    return os.path.join(RESULT_FOLDER, partial_name)
|
||||||
|
|
||||||
|
def load_progress(job_id):
    """Load the saved resume state of a job, if there is a usable one.

    Args:
        job_id: Job whose progress file should be read.

    Returns:
        The decoded progress dict (with 'last_completed_batch' and
        'total_batches'), or None when no progress file exists or the file
        cannot be read or interpreted. An unusable file is logged and
        treated as "no progress" rather than aborting the job.
    """
    path = get_progress_path(job_id)
    try:
        with open(path, 'r', encoding='utf-8') as f:
            data = json.load(f)
        resume_from = data['last_completed_batch'] + 1
        total = data['total_batches']
    except FileNotFoundError:
        return None
    except (OSError, ValueError, KeyError, TypeError) as e:
        # Truncated or foreign file, e.g. after a crash in the middle of a write.
        print(f"⚠️ Progress-Datei unlesbar ({path}): {e}")
        return None
    print(f"🔁 RESUME: ab Batch {resume_from}/{total}")
    return data
|
||||||
|
|
||||||
|
def save_progress(job_id, last_completed_batch, total_batches):
    """Persist the resume state of a job.

    The state is written to a temporary sibling file and then moved over the
    real progress file, so a crash in the middle of the write cannot leave a
    truncated JSON file behind.

    Args:
        job_id: Job the progress belongs to.
        last_completed_batch: Index of the batch recorded as completed.
        total_batches: Total number of batches of the job.
    """
    path = get_progress_path(job_id)
    tmp_path = f"{path}.tmp"
    state = {"last_completed_batch": last_completed_batch, "total_batches": total_batches}
    with open(tmp_path, 'w', encoding='utf-8') as f:
        json.dump(state, f)
    os.replace(tmp_path, path)
|
||||||
|
|
||||||
|
def append_partial(job_id, df_filtered, df_raw):
    """Append a batch's filtered and raw frames to the job's partial CSV files.

    Runs under ``_partial_lock``. A frame that is None is skipped; the header
    row is written only when the target file does not exist yet.
    """
    frames = {'filtered': df_filtered, 'raw': df_raw}
    with _partial_lock:
        for suffix, frame in frames.items():
            if frame is None:
                continue
            target = get_partial_path(job_id, suffix)
            write_header = not os.path.exists(target)
            frame.to_csv(
                target,
                mode='a',
                index=False,
                header=write_header,
                encoding='utf-8-sig',
                sep=';',
            )
|
||||||
|
|
||||||
|
def load_partial(job_id):
    """Load previously saved partial results of a job.

    Returns:
        ``(filtered_frames, raw_frames)``: two lists, each holding at most
        one DataFrame read from the corresponding partial CSV. A missing or
        unreadable file leaves its list empty (read errors are logged).
    """
    loaded = {'filtered': [], 'raw': []}
    for suffix, frames in loaded.items():
        path = get_partial_path(job_id, suffix)
        if not os.path.exists(path):
            continue
        try:
            df = pd.read_csv(path, sep=';', encoding='utf-8-sig')
            frames.append(df)
            print(f"📂 Partial {suffix}: {len(df)} Zeilen geladen")
        except Exception as e:
            print(f"⚠️ Partial {suffix} Ladefehler: {e}")
    return loaded['filtered'], loaded['raw']
|
||||||
|
|
||||||
|
def cleanup_progress(job_id):
    """Remove the progress file and both partial CSV files of a job, if present."""
    stale_paths = [get_progress_path(job_id)]
    stale_paths.extend(get_partial_path(job_id, suffix) for suffix in ('filtered', 'raw'))
    for stale in stale_paths:
        if os.path.exists(stale):
            os.remove(stale)
|
||||||
|
|
||||||
|
# ──────────────────────────────────────────────
|
||||||
|
# CSV Nachbearbeitung
|
||||||
|
# ──────────────────────────────────────────────
|
||||||
|
|
||||||
|
def process_result_csv(raw_bytes, input_df, apply_filter=True):
    """Turn a scraper CSV download into a cleaned result DataFrame.

    Steps: decode as UTF-8 (undecodable bytes replaced), keep only the
    columns listed in OUTPUT_COLS, repair mis-encoded text, optionally keep
    only rows whose address matches an input address, drop duplicate
    (title, address) pairs and rows without a title.

    Args:
        raw_bytes: Raw CSV content as downloaded from the scraper.
        input_df: Input address DataFrame used for the optional filter.
        apply_filter: When True, restrict the rows to input addresses.

    Returns:
        The cleaned DataFrame, or None if any step fails (the error is
        printed; this is deliberate best-effort handling per batch).
    """
    try:
        content = raw_bytes.decode('utf-8', errors='replace')
        df_out = pd.read_csv(StringIO(content))
        print(f"📄 Raw result: {df_out.shape}")

        available = [c for c in OUTPUT_COLS if c in df_out.columns]
        # Explicit copy: the columns are reassigned below, which on a plain
        # selection triggers pandas' SettingWithCopy handling.
        df_out = df_out[available].copy()

        for col in df_out.columns:
            df_out[col] = df_out[col].apply(fix_encoding)

        if apply_filter:
            input_addresses = build_input_addresses(input_df)
            before = len(df_out)
            keep = df_out['address'].apply(lambda a: address_in_input(a, input_addresses))
            df_out = df_out[keep]
            print(f"📍 Filter: {before} → {len(df_out)}")

        df_out = df_out.drop_duplicates(subset=['title', 'address'], keep='first')
        df_out = df_out.dropna(subset=['title'], how='all')
        # astype(str) keeps the .str accessor usable when titles are not
        # stored as strings (e.g. purely numeric titles).
        df_out = df_out[df_out['title'].astype(str).str.strip().astype(bool)]

        print(f"✅ Final ({'gefiltert' if apply_filter else 'alle'}): {df_out.shape}")
        return df_out
    except Exception as e:
        print(f"💥 process_result_csv: {e}")
        return None
|
||||||
|
|
||||||
|
# ──────────────────────────────────────────────
|
||||||
|
# Parallel: Einzelnen Batch verarbeiten
|
||||||
|
# ──────────────────────────────────────────────
|
||||||
|
|
||||||
|
def process_batch(batch_idx, batch_queries, scraper_url, filename, job_id, df_input):
|
||||||
|
payload = {
|
||||||
|
"name": f"{filename.replace('.csv','')}-{job_id}-B{batch_idx+1:03d}",
|
||||||
|
"keywords": batch_queries,
|
||||||
|
"lang": "de",
|
||||||
|
"depth": 1,
|
||||||
|
"zoom": 17,
|
||||||
|
"radius": 100,
|
||||||
|
"max_time": MAX_TIME,
|
||||||
|
"fast_mode": False,
|
||||||
|
"proxies": [PROXY_URL]
|
||||||
|
}
|
||||||
|
|
||||||
|
for attempt in range(1, MAX_RETRIES + 1):
|
||||||
|
scraper_id = None
|
||||||
|
try:
|
||||||
|
resp = requests.post(f"{scraper_url}/api/v1/jobs", json=payload, timeout=45)
|
||||||
|
print(f"📤 Batch {batch_idx+1} → {scraper_url} | {resp.status_code} (Versuch {attempt})")
|
||||||
|
|
||||||
|
if is_blocked(resp.text):
|
||||||
|
print(f"🚫 Batch {batch_idx+1} blocked")
|
||||||
|
return None, None
|
||||||
|
|
||||||
|
if resp.status_code != 201:
|
||||||
|
print(f"⚠️ Batch {batch_idx+1} fehlgeschlagen: {resp.text[:100]}")
|
||||||
|
if attempt < MAX_RETRIES:
|
||||||
|
time.sleep(10)
|
||||||
|
continue
|
||||||
|
|
||||||
|
scraper_id = resp.json()['id']
|
||||||
|
print(f"✅ Batch {batch_idx+1} Scraper-ID: {scraper_id}")
|
||||||
|
|
||||||
|
batch_start_time = time.time()
|
||||||
|
|
||||||
|
for poll_i in range(1, POLL_MAX + 1):
|
||||||
|
r = requests.get(f"{scraper_url}/api/v1/jobs/{scraper_id}", timeout=15)
|
||||||
|
data = r.json()
|
||||||
|
status = data.get('Status', data.get('status', '?'))
|
||||||
|
elapsed = time.time() - batch_start_time
|
||||||
|
print(f"⏳ Batch {batch_idx+1} Poll {poll_i}: {status} | {int(elapsed)}s")
|
||||||
|
|
||||||
|
if status == 'pending' and elapsed > STUCK_TIMEOUT:
|
||||||
|
print(f"⚠️ Batch {batch_idx+1} hängt seit {int(elapsed)}s – Neustart {scraper_url}")
|
||||||
|
_cleanup_scraper_job(scraper_url, scraper_id)
|
||||||
|
scraper_id = None
|
||||||
|
restart_scraper(scraper_url)
|
||||||
|
break
|
||||||
|
|
||||||
|
if status in ('ok', 'completed', 'scraped'):
|
||||||
|
dl = requests.get(f"{scraper_url}/api/v1/jobs/{scraper_id}/download", timeout=90)
|
||||||
|
scraper_id = None
|
||||||
|
if dl.status_code == 200:
|
||||||
|
df_filtered = process_result_csv(dl.content, df_input, True)
|
||||||
|
df_raw = process_result_csv(dl.content, df_input, False)
|
||||||
|
print(f"📊 Batch {batch_idx+1}: {len(df_filtered) if df_filtered is not None else 0} filtered")
|
||||||
|
return df_filtered, df_raw
|
||||||
|
return None, None
|
||||||
|
|
||||||
|
elif status in ('failed', 'error'):
|
||||||
|
print(f"💥 Batch {batch_idx+1}: {status} (Versuch {attempt})")
|
||||||
|
_cleanup_scraper_job(scraper_url, scraper_id)
|
||||||
|
scraper_id = None
|
||||||
|
if attempt < MAX_RETRIES:
|
||||||
|
time.sleep(10)
|
||||||
|
break
|
||||||
|
|
||||||
|
time.sleep(random.uniform(POLL_DELAY_MIN, POLL_DELAY_MAX))
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
print(f"💥 Batch {batch_idx+1} Versuch {attempt}: {e}")
|
||||||
|
if scraper_id:
|
||||||
|
_cleanup_scraper_job(scraper_url, scraper_id)
|
||||||
|
scraper_id = None
|
||||||
|
if attempt < MAX_RETRIES:
|
||||||
|
time.sleep(10)
|
||||||
|
|
||||||
return None, None
|
return None, None
|
||||||
|
|
||||||
def get_nearby_places(lat, lng):
|
# ──────────────────────────────────────────────
|
||||||
places_url = f"https://maps.googleapis.com/maps/api/place/nearbysearch/json"
|
# HAUPT-WORKER
|
||||||
params = {
|
# ──────────────────────────────────────────────
|
||||||
'location': f"{lat},{lng}",
|
|
||||||
'radius': 10,
|
|
||||||
'type': 'point_of_interest',
|
|
||||||
'key': API_KEY
|
|
||||||
}
|
|
||||||
|
|
||||||
try:
|
|
||||||
response = requests.get(places_url, params=params, timeout=5)
|
|
||||||
if response.status_code == 200:
|
|
||||||
return response.json().get('results', [])
|
|
||||||
except requests.RequestException as e:
|
|
||||||
print(f"Nearby Places API Fehler für Standort {lat},{lng}: {e}")
|
|
||||||
return []
|
|
||||||
|
|
||||||
def get_place_details(place_id):
|
|
||||||
details_url = f"https://maps.googleapis.com/maps/api/place/details/json"
|
|
||||||
params = {
|
|
||||||
'place_id': place_id,
|
|
||||||
'fields': 'formatted_phone_number,website',
|
|
||||||
'key': API_KEY
|
|
||||||
}
|
|
||||||
|
|
||||||
try:
|
|
||||||
response = requests.get(details_url, params=params, timeout=5)
|
|
||||||
if response.status_code == 200:
|
|
||||||
result = response.json().get('result', {})
|
|
||||||
return result.get('formatted_phone_number', 'N/A'), result.get('website', 'N/A')
|
|
||||||
except requests.RequestException as e:
|
|
||||||
print(f"Place Details API Fehler für Place ID {place_id}: {e}")
|
|
||||||
return 'N/A', 'N/A'
|
|
||||||
|
|
||||||
def process_file(filename, job_id, app):
|
def process_file(filename, job_id, app):
|
||||||
with app.app_context():
|
with app.app_context():
|
||||||
filepath = os.path.join(UPLOAD_FOLDER, filename)
|
|
||||||
results = []
|
|
||||||
|
|
||||||
job = Job.query.get(job_id)
|
job = Job.query.get(job_id)
|
||||||
if not job:
|
if job:
|
||||||
print("Job wurde abgebrochen.")
|
job.status = "⏳ Wartet auf anderen Job..."
|
||||||
return
|
db.session.commit()
|
||||||
job.status = "In Progress"
|
|
||||||
db.session.commit()
|
|
||||||
|
|
||||||
with open(filepath, newline='', encoding='ISO-8859-1') as csvfile:
|
with _job_semaphore:
|
||||||
reader = csv.DictReader(csvfile, delimiter=';')
|
print(f"🎯 {filename} Job#{job_id} START!")
|
||||||
headers = reader.fieldnames
|
|
||||||
|
|
||||||
if not all(field in headers for field in ['PLZ', 'Straße', 'Hausnummer']):
|
with app.app_context():
|
||||||
print("CSV-Datei enthält nicht alle notwendigen Spalten.")
|
job = Job.query.get(job_id)
|
||||||
job.status = "Failed"
|
if not job:
|
||||||
db.session.commit()
|
print("❌ Job missing")
|
||||||
return
|
return
|
||||||
|
|
||||||
for row in reader:
|
try:
|
||||||
plz = row.get('PLZ', '').strip()
|
job.status = "📊 parsing CSV"
|
||||||
city = row.get('Stadt', row.get('Bezirk', '')).strip()
|
db.session.commit()
|
||||||
street = row.get('Straße', '').strip()
|
|
||||||
house_number = row.get('Hausnummer', '').strip()
|
|
||||||
additional = row.get('Zusatz', '').strip()
|
|
||||||
|
|
||||||
if not all([plz, city, street, house_number]):
|
filepath = os.path.join(UPLOAD_FOLDER, filename)
|
||||||
continue
|
print(f"📁 {filepath} | {os.path.getsize(filepath)}b")
|
||||||
|
|
||||||
full_address = f"{street} {house_number} {additional}, {plz} {city}"
|
df_input = pd.read_csv(filepath, sep=';', encoding='ISO-8859-1')
|
||||||
lat, lng = get_geocode(full_address)
|
print(f"📊 {df_input.shape}")
|
||||||
if lat is None or lng is None:
|
|
||||||
continue
|
|
||||||
|
|
||||||
nearby_places = get_nearby_places(lat, lng)
|
queries = []
|
||||||
for place in nearby_places:
|
for _, row in df_input.iterrows():
|
||||||
company_name = place['name']
|
parts = [
|
||||||
if company_name in processed_companies:
|
str(row.get('PLZ', '')).strip(),
|
||||||
continue
|
str(row.get('Stadt', '')).strip(),
|
||||||
|
str(row.get('Straße', '')).strip(),
|
||||||
|
str(row.get('Hausnummer', '')).strip(),
|
||||||
|
str(row.get('Zusatz', '')).strip(),
|
||||||
|
]
|
||||||
|
q = f"Firmen {' '.join(p for p in parts if p and p != 'nan')}".strip()
|
||||||
|
q = clean_query(q)
|
||||||
|
if len(q) > 10:
|
||||||
|
queries.append(q)
|
||||||
|
|
||||||
processed_companies.add(company_name)
|
total_queries = len(queries)
|
||||||
company_address = place.get('vicinity', 'N/A').split(',')[0]
|
print(f"🔍 {total_queries} Queries | Samples: {queries[:3]}")
|
||||||
place_id = place.get('place_id')
|
if total_queries == 0:
|
||||||
company_phone, company_website = get_place_details(place_id) if place_id else ('N/A', 'N/A')
|
raise ValueError("Keine gültigen Adressen")
|
||||||
|
|
||||||
results.append({
|
batches = (total_queries + BATCH_SIZE - 1) // BATCH_SIZE
|
||||||
'PLZ': plz,
|
|
||||||
'Stadt': city,
|
|
||||||
'Straße': street,
|
|
||||||
'Hausnummer': house_number,
|
|
||||||
'Zusatz': additional,
|
|
||||||
'Company Name': company_name,
|
|
||||||
'Company Address': company_address,
|
|
||||||
'Company Phone': company_phone,
|
|
||||||
'Company Website': company_website
|
|
||||||
})
|
|
||||||
|
|
||||||
if results:
|
os.makedirs(RESULT_FOLDER, exist_ok=True)
|
||||||
result_file = f"results_{os.path.splitext(filename)[0]}.csv"
|
progress = load_progress(job_id)
|
||||||
result_path = os.path.join(RESULT_FOLDER, result_file)
|
start_batch = progress['last_completed_batch'] + 1 if progress else 0
|
||||||
with open(result_path, 'w', newline='', encoding='utf-8-sig') as csvfile:
|
all_results_filtered, all_results_raw = load_partial(job_id) if progress else ([], [])
|
||||||
writer = csv.DictWriter(csvfile, fieldnames=[
|
|
||||||
'PLZ', 'Stadt', 'Straße', 'Hausnummer', 'Zusatz',
|
eta_initial = format_eta((batches - start_batch) * ((BATCH_DELAY_MAX + MAX_TIME) / 2) / PARALLEL_WORKERS)
|
||||||
'Company Name', 'Company Address', 'Company Phone', 'Company Website'
|
print(f"📦 {batches} Batches à {BATCH_SIZE} | {PARALLEL_WORKERS}x parallel (Chunk) | Start: {start_batch} | ETA: ~{eta_initial}")
|
||||||
])
|
job_start_time = time.time()
|
||||||
writer.writeheader()
|
job.status = f"🔄 Batch {start_batch+1}/{batches} | ⏱️ ~{eta_initial}"
|
||||||
writer.writerows(results)
|
db.session.commit()
|
||||||
job.status = "Completed"
|
|
||||||
job.result_filename = result_file
|
completed_count = 0
|
||||||
db.session.commit()
|
|
||||||
else:
|
batch_indices = list(range(start_batch, batches))
|
||||||
job.status = "Failed"
|
chunks = [
|
||||||
db.session.commit()
|
batch_indices[i:i + PARALLEL_WORKERS]
|
||||||
|
for i in range(0, len(batch_indices), PARALLEL_WORKERS)
|
||||||
|
]
|
||||||
|
|
||||||
|
with ThreadPoolExecutor(max_workers=PARALLEL_WORKERS) as executor:
|
||||||
|
for chunk_idx, chunk in enumerate(chunks):
|
||||||
|
futures = {}
|
||||||
|
|
||||||
|
for batch_idx in chunk:
|
||||||
|
batch_start_q = batch_idx * BATCH_SIZE
|
||||||
|
batch_end_q = min(batch_start_q + BATCH_SIZE, total_queries)
|
||||||
|
batch_queries = queries[batch_start_q:batch_end_q]
|
||||||
|
scraper_url = SCRAPER_URLS[batch_idx % len(SCRAPER_URLS)]
|
||||||
|
|
||||||
|
print(f"\n🚀 Chunk {chunk_idx+1} | Batch {batch_idx+1}/{batches} → {scraper_url}")
|
||||||
|
time.sleep(random.uniform(1, 2))
|
||||||
|
|
||||||
|
future = executor.submit(
|
||||||
|
process_batch,
|
||||||
|
batch_idx, batch_queries, scraper_url,
|
||||||
|
filename, job_id, df_input
|
||||||
|
)
|
||||||
|
futures[future] = batch_idx
|
||||||
|
|
||||||
|
for future in as_completed(futures):
|
||||||
|
batch_idx = futures[future]
|
||||||
|
completed_count += 1
|
||||||
|
try:
|
||||||
|
df_filtered, df_raw = future.result()
|
||||||
|
if df_filtered is not None:
|
||||||
|
all_results_filtered.append(df_filtered)
|
||||||
|
all_results_raw.append(df_raw)
|
||||||
|
append_partial(job_id, df_filtered, df_raw)
|
||||||
|
except Exception as e:
|
||||||
|
print(f"💥 Batch {batch_idx+1} Exception: {e}")
|
||||||
|
|
||||||
|
save_progress(job_id, batch_idx, batches)
|
||||||
|
|
||||||
|
elapsed = time.time() - job_start_time
|
||||||
|
if completed_count > 0:
|
||||||
|
avg_per_batch = elapsed / completed_count
|
||||||
|
remaining = (batches - start_batch - completed_count) * avg_per_batch / PARALLEL_WORKERS
|
||||||
|
eta_str = format_eta(remaining)
|
||||||
|
else:
|
||||||
|
eta_str = "?"
|
||||||
|
|
||||||
|
job.status = f"🔄 {completed_count}/{batches - start_batch} fertig | ⏱️ ~{eta_str}"
|
||||||
|
db.session.commit()
|
||||||
|
|
||||||
|
if chunk_idx < len(chunks) - 1:
|
||||||
|
delay = random.uniform(BATCH_DELAY_MIN, BATCH_DELAY_MAX)
|
||||||
|
print(f"⏸️ Chunk {chunk_idx+1} fertig – warte {delay:.1f}s...")
|
||||||
|
time.sleep(delay)
|
||||||
|
|
||||||
|
# ── MERGE & SAVE ──
|
||||||
|
job.status = "🔧 merging results"
|
||||||
|
db.session.commit()
|
||||||
|
|
||||||
|
base = filename.replace('.csv', '')
|
||||||
|
|
||||||
|
if all_results_filtered:
|
||||||
|
df_final_filtered = pd.concat(all_results_filtered, ignore_index=True)
|
||||||
|
df_final_filtered = df_final_filtered.drop_duplicates(subset=['title', 'address'])
|
||||||
|
|
||||||
|
out_filtered = f"results_{base}_filtered.csv"
|
||||||
|
df_final_filtered.to_csv(
|
||||||
|
os.path.join(RESULT_FOLDER, out_filtered),
|
||||||
|
index=False, encoding='utf-8-sig', sep=';'
|
||||||
|
)
|
||||||
|
|
||||||
|
out_raw = None
|
||||||
|
if all_results_raw:
|
||||||
|
df_final_raw = pd.concat(all_results_raw, ignore_index=True)
|
||||||
|
out_raw = f"results_{base}_all.csv"
|
||||||
|
df_final_raw.to_csv(
|
||||||
|
os.path.join(RESULT_FOLDER, out_raw),
|
||||||
|
index=False, encoding='utf-8-sig', sep=';'
|
||||||
|
)
|
||||||
|
|
||||||
|
job.result_filename = out_filtered
|
||||||
|
job.result_filename_raw = out_raw
|
||||||
|
job.status = f"✅ Fertig: {len(df_final_filtered)} Firmen"
|
||||||
|
|
||||||
|
cleanup_progress(job_id)
|
||||||
|
else:
|
||||||
|
job.status = "❌ Keine Ergebnisse"
|
||||||
|
|
||||||
|
db.session.commit()
|
||||||
|
print(f"🎉 Job {job_id} komplett!")
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
job.status = f"Failed: {str(e)[:50]}"
|
||||||
|
print(f"💥 FATAL: {e}")
|
||||||
|
import traceback
|
||||||
|
traceback.print_exc()
|
||||||
|
db.session.commit()
|
||||||
|
|
||||||
|
print(f"✅ DONE! Status: {job.status}")
|
||||||
|
|
|
||||||
21
delete-crawl-jobs.py
Normal file
21
delete-crawl-jobs.py
Normal file
|
|
@ -0,0 +1,21 @@
|
||||||
|
"""Delete every job known to the scraper instance listening on localhost:5001."""
import requests
import time

base_url = "http://localhost:5001/api/v1/jobs"
# Without a timeout a hung scraper would block this script indefinitely.
REQUEST_TIMEOUT = 10

response = requests.get(base_url, timeout=REQUEST_TIMEOUT)
response.raise_for_status()  # do not try to parse an error page as a job list
# The endpoint answers with a bare JSON array; ``or []`` covers a null body.
jobs = response.json() or []
print(f"{len(jobs)} Jobs gefunden.")

deleted = 0
for job in jobs:
    job_id = job["ID"]
    del_res = requests.delete(f"{base_url}/{job_id}", timeout=REQUEST_TIMEOUT)
    if del_res.status_code in (200, 204):
        print(f"✓ {job_id}")
        deleted += 1
    else:
        print(f"✗ {job_id}: {del_res.status_code}")
    time.sleep(0.1)  # small pause between deletions

print(f"{deleted}/{len(jobs)} gelöscht.")
|
||||||
|
|
@ -1,4 +1,4 @@
|
||||||
version: '3'
|
version: '3.8'
|
||||||
services:
|
services:
|
||||||
web:
|
web:
|
||||||
build: .
|
build: .
|
||||||
|
|
@ -6,6 +6,114 @@ services:
|
||||||
- "5000:5000"
|
- "5000:5000"
|
||||||
environment:
|
environment:
|
||||||
- FLASK_APP=app
|
- FLASK_APP=app
|
||||||
command: flask run --host=0.0.0.0 --port=5000
|
- FLASK_ENV=production
|
||||||
|
- PYTHONUNBUFFERED=1
|
||||||
volumes:
|
volumes:
|
||||||
- .:/app
|
- ./app:/app/app
|
||||||
|
- ./uploads:/app/uploads
|
||||||
|
- ./results:/app/results
|
||||||
|
- ./instance:/app/instance
|
||||||
|
- /var/run/docker.sock:/var/run/docker.sock
|
||||||
|
depends_on:
|
||||||
|
- gmaps-scraper-1
|
||||||
|
- gmaps-scraper-2
|
||||||
|
- gmaps-scraper-3
|
||||||
|
- gmaps-scraper-4
|
||||||
|
restart: always
|
||||||
|
networks:
|
||||||
|
- scraper-net
|
||||||
|
|
||||||
|
gmaps-scraper-1:
|
||||||
|
image: gosom/google-maps-scraper:latest
|
||||||
|
container_name: gmaps-scraper-1
|
||||||
|
environment:
|
||||||
|
- PLAYWRIGHT_BROWSERS_PATH=/ms-playwright
|
||||||
|
ports:
|
||||||
|
- "5001:8080"
|
||||||
|
volumes:
|
||||||
|
- ./scraper-data-1:/gmapsdata
|
||||||
|
command:
|
||||||
|
- "-web"
|
||||||
|
- "-data-folder=/gmapsdata"
|
||||||
|
restart: always
|
||||||
|
healthcheck:
|
||||||
|
test: ["CMD-SHELL", "wget -qO- http://localhost:8080/api/v1/jobs || exit 1"]
|
||||||
|
interval: 30s
|
||||||
|
timeout: 10s
|
||||||
|
retries: 3
|
||||||
|
start_period: 15s
|
||||||
|
networks:
|
||||||
|
- scraper-net
|
||||||
|
|
||||||
|
|
||||||
|
gmaps-scraper-2:
|
||||||
|
image: gosom/google-maps-scraper:latest
|
||||||
|
container_name: gmaps-scraper-2
|
||||||
|
environment:
|
||||||
|
- PLAYWRIGHT_BROWSERS_PATH=/ms-playwright
|
||||||
|
ports:
|
||||||
|
- "5002:8080"
|
||||||
|
volumes:
|
||||||
|
- ./scraper-data-2:/gmapsdata
|
||||||
|
command:
|
||||||
|
- "-web"
|
||||||
|
- "-data-folder=/gmapsdata"
|
||||||
|
restart: always
|
||||||
|
healthcheck:
|
||||||
|
test: ["CMD-SHELL", "wget -qO- http://localhost:8080/api/v1/jobs || exit 1"]
|
||||||
|
interval: 30s
|
||||||
|
timeout: 10s
|
||||||
|
retries: 3
|
||||||
|
start_period: 15s
|
||||||
|
networks:
|
||||||
|
- scraper-net
|
||||||
|
|
||||||
|
gmaps-scraper-3:
|
||||||
|
image: gosom/google-maps-scraper:latest
|
||||||
|
container_name: gmaps-scraper-3
|
||||||
|
environment:
|
||||||
|
- PLAYWRIGHT_BROWSERS_PATH=/ms-playwright
|
||||||
|
ports:
|
||||||
|
- "5003:8080"
|
||||||
|
volumes:
|
||||||
|
- ./scraper-data-3:/gmapsdata
|
||||||
|
command:
|
||||||
|
- "-web"
|
||||||
|
- "-data-folder=/gmapsdata"
|
||||||
|
restart: always
|
||||||
|
healthcheck:
|
||||||
|
test: ["CMD-SHELL", "wget -qO- http://localhost:8080/api/v1/jobs || exit 1"]
|
||||||
|
interval: 30s
|
||||||
|
timeout: 10s
|
||||||
|
retries: 3
|
||||||
|
start_period: 15s
|
||||||
|
networks:
|
||||||
|
- scraper-net
|
||||||
|
|
||||||
|
gmaps-scraper-4:
|
||||||
|
image: gosom/google-maps-scraper:latest
|
||||||
|
container_name: gmaps-scraper-4
|
||||||
|
environment:
|
||||||
|
- PLAYWRIGHT_BROWSERS_PATH=/ms-playwright
|
||||||
|
ports:
|
||||||
|
- "5004:8080"
|
||||||
|
volumes:
|
||||||
|
- ./scraper-data-4:/gmapsdata
|
||||||
|
command:
|
||||||
|
- "-web"
|
||||||
|
- "-data-folder=/gmapsdata"
|
||||||
|
restart: always
|
||||||
|
healthcheck:
|
||||||
|
test: ["CMD-SHELL", "wget -qO- http://localhost:8080/api/v1/jobs || exit 1"]
|
||||||
|
interval: 30s
|
||||||
|
timeout: 10s
|
||||||
|
retries: 3
|
||||||
|
start_period: 15s
|
||||||
|
networks:
|
||||||
|
- scraper-net
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
networks:
|
||||||
|
scraper-net:
|
||||||
|
driver: bridge
|
||||||
|
|
|
||||||
Binary file not shown.
|
|
@ -1,8 +1,8 @@
|
||||||
Flask==2.2.5
|
flask
|
||||||
Flask-Login==0.6.2
|
flask-sqlalchemy
|
||||||
Flask-SQLAlchemy==3.0.3
|
flask-login
|
||||||
Werkzeug==2.2.2
|
flask-migrate
|
||||||
pandas
|
pandas
|
||||||
requests
|
requests
|
||||||
beautifulsoup4
|
werkzeug
|
||||||
Flask-Migrate
|
docker
|
||||||
|
|
|
||||||
29
scraper-data-1/02af4949-431c-4736-beec-8ef7bc859c9d.csv
Normal file
29
scraper-data-1/02af4949-431c-4736-beec-8ef7bc859c9d.csv
Normal file
File diff suppressed because one or more lines are too long
49
scraper-data-1/072fe9f8-ce9d-4af5-a0aa-bde70349d5ba.csv
Normal file
49
scraper-data-1/072fe9f8-ce9d-4af5-a0aa-bde70349d5ba.csv
Normal file
File diff suppressed because one or more lines are too long
37
scraper-data-1/0b7932ea-4588-44bb-9b08-a69e95ef5d28.csv
Normal file
37
scraper-data-1/0b7932ea-4588-44bb-9b08-a69e95ef5d28.csv
Normal file
File diff suppressed because one or more lines are too long
50
scraper-data-1/0d9a6e99-a20c-4301-add9-00211dcc5fa3.csv
Normal file
50
scraper-data-1/0d9a6e99-a20c-4301-add9-00211dcc5fa3.csv
Normal file
File diff suppressed because one or more lines are too long
41
scraper-data-1/197953d0-9719-4fb7-a8a1-3a8a9e6994b0.csv
Normal file
41
scraper-data-1/197953d0-9719-4fb7-a8a1-3a8a9e6994b0.csv
Normal file
File diff suppressed because one or more lines are too long
44
scraper-data-1/1ebb427f-3308-4637-be28-1c0032a2107d.csv
Normal file
44
scraper-data-1/1ebb427f-3308-4637-be28-1c0032a2107d.csv
Normal file
File diff suppressed because one or more lines are too long
39
scraper-data-1/245440fd-eb76-4ae1-9278-bf416a1351a0.csv
Normal file
39
scraper-data-1/245440fd-eb76-4ae1-9278-bf416a1351a0.csv
Normal file
File diff suppressed because one or more lines are too long
44
scraper-data-1/26b82d8c-c109-48a4-8b06-f98638d565c3.csv
Normal file
44
scraper-data-1/26b82d8c-c109-48a4-8b06-f98638d565c3.csv
Normal file
File diff suppressed because one or more lines are too long
25
scraper-data-1/28b024e8-74db-429a-88c2-c6e8d314bf33.csv
Normal file
25
scraper-data-1/28b024e8-74db-429a-88c2-c6e8d314bf33.csv
Normal file
File diff suppressed because one or more lines are too long
12
scraper-data-1/2af0ecab-6f95-48d4-b65b-7cef52d3fe73.csv
Normal file
12
scraper-data-1/2af0ecab-6f95-48d4-b65b-7cef52d3fe73.csv
Normal file
File diff suppressed because one or more lines are too long
126
scraper-data-1/2b17ff19-48dc-44d8-b747-eb751c2c28ad.csv
Normal file
126
scraper-data-1/2b17ff19-48dc-44d8-b747-eb751c2c28ad.csv
Normal file
File diff suppressed because one or more lines are too long
35
scraper-data-1/2c3dce9e-a011-42ca-9310-34b0bb481fdf.csv
Normal file
35
scraper-data-1/2c3dce9e-a011-42ca-9310-34b0bb481fdf.csv
Normal file
File diff suppressed because one or more lines are too long
46
scraper-data-1/2efb0985-cac3-454a-9018-e27f98476df2.csv
Normal file
46
scraper-data-1/2efb0985-cac3-454a-9018-e27f98476df2.csv
Normal file
File diff suppressed because one or more lines are too long
47
scraper-data-1/332988ef-754f-4a11-b948-38c1ca463987.csv
Normal file
47
scraper-data-1/332988ef-754f-4a11-b948-38c1ca463987.csv
Normal file
File diff suppressed because one or more lines are too long
28
scraper-data-1/39cc6d71-3567-46b9-883c-0cb4fed755a1.csv
Normal file
28
scraper-data-1/39cc6d71-3567-46b9-883c-0cb4fed755a1.csv
Normal file
File diff suppressed because one or more lines are too long
13
scraper-data-1/3f53b4d3-61c4-478f-aee7-0b1524b3e480.csv
Normal file
13
scraper-data-1/3f53b4d3-61c4-478f-aee7-0b1524b3e480.csv
Normal file
File diff suppressed because one or more lines are too long
40
scraper-data-1/4586b26a-6c85-4109-9d7f-bdcae264ab25.csv
Normal file
40
scraper-data-1/4586b26a-6c85-4109-9d7f-bdcae264ab25.csv
Normal file
File diff suppressed because one or more lines are too long
42
scraper-data-1/4612826f-e088-4ebd-8de2-523fe801dd2b.csv
Normal file
42
scraper-data-1/4612826f-e088-4ebd-8de2-523fe801dd2b.csv
Normal file
File diff suppressed because one or more lines are too long
51
scraper-data-1/485abd75-8b7d-44a6-92f3-6fad9828a288.csv
Normal file
51
scraper-data-1/485abd75-8b7d-44a6-92f3-6fad9828a288.csv
Normal file
File diff suppressed because one or more lines are too long
26
scraper-data-1/4a4acb8f-be75-4328-b011-5dd6f633271e.csv
Normal file
26
scraper-data-1/4a4acb8f-be75-4328-b011-5dd6f633271e.csv
Normal file
File diff suppressed because one or more lines are too long
35
scraper-data-1/5610de98-4347-49cd-a480-03afa1c9ee15.csv
Normal file
35
scraper-data-1/5610de98-4347-49cd-a480-03afa1c9ee15.csv
Normal file
File diff suppressed because one or more lines are too long
77
scraper-data-1/581a5fce-910b-4da7-89f0-4ae4abb9d48c.csv
Normal file
77
scraper-data-1/581a5fce-910b-4da7-89f0-4ae4abb9d48c.csv
Normal file
File diff suppressed because one or more lines are too long
49
scraper-data-1/5943bc28-1757-4474-9e15-504a35fa90ac.csv
Normal file
49
scraper-data-1/5943bc28-1757-4474-9e15-504a35fa90ac.csv
Normal file
File diff suppressed because one or more lines are too long
60
scraper-data-1/59bfb43f-0fd5-48ba-b8cc-aab842330d3d.csv
Normal file
60
scraper-data-1/59bfb43f-0fd5-48ba-b8cc-aab842330d3d.csv
Normal file
File diff suppressed because one or more lines are too long
47
scraper-data-1/5e590d3f-8fc6-4cca-b111-bcd5b7694a47.csv
Normal file
47
scraper-data-1/5e590d3f-8fc6-4cca-b111-bcd5b7694a47.csv
Normal file
File diff suppressed because one or more lines are too long
38
scraper-data-1/5f2179d9-32f0-4fbb-9d30-056ffb74d559.csv
Normal file
38
scraper-data-1/5f2179d9-32f0-4fbb-9d30-056ffb74d559.csv
Normal file
File diff suppressed because one or more lines are too long
39
scraper-data-1/60919958-7169-48c2-9079-34f32ed6065d.csv
Normal file
39
scraper-data-1/60919958-7169-48c2-9079-34f32ed6065d.csv
Normal file
File diff suppressed because one or more lines are too long
48
scraper-data-1/6512e64f-6321-4b9c-8072-3f8260e13454.csv
Normal file
48
scraper-data-1/6512e64f-6321-4b9c-8072-3f8260e13454.csv
Normal file
File diff suppressed because one or more lines are too long
50
scraper-data-1/6c322761-2f3e-415f-829d-4d65dc3567b1.csv
Normal file
50
scraper-data-1/6c322761-2f3e-415f-829d-4d65dc3567b1.csv
Normal file
File diff suppressed because one or more lines are too long
33
scraper-data-1/6ca5f31c-d372-45f4-8948-d2844eb9305f.csv
Normal file
33
scraper-data-1/6ca5f31c-d372-45f4-8948-d2844eb9305f.csv
Normal file
File diff suppressed because one or more lines are too long
0
scraper-data-1/6ef54a15-b1cc-4d5c-83f6-9960ceb113a0.csv
Normal file
0
scraper-data-1/6ef54a15-b1cc-4d5c-83f6-9960ceb113a0.csv
Normal file
|
|
75
scraper-data-1/6fe123f2-7247-42e3-a876-bd86f1a3191d.csv
Normal file
75
scraper-data-1/6fe123f2-7247-42e3-a876-bd86f1a3191d.csv
Normal file
File diff suppressed because one or more lines are too long
24
scraper-data-1/7be0bb2d-b0fa-4629-bb53-ffa4eb152292.csv
Normal file
24
scraper-data-1/7be0bb2d-b0fa-4629-bb53-ffa4eb152292.csv
Normal file
File diff suppressed because one or more lines are too long
43
scraper-data-1/7ecc5496-afc5-4e8f-9c2f-8f06e81fa8d5.csv
Normal file
43
scraper-data-1/7ecc5496-afc5-4e8f-9c2f-8f06e81fa8d5.csv
Normal file
File diff suppressed because one or more lines are too long
59
scraper-data-1/8a08068b-de23-4612-9a1c-d07dcc4d8a3f.csv
Normal file
59
scraper-data-1/8a08068b-de23-4612-9a1c-d07dcc4d8a3f.csv
Normal file
File diff suppressed because one or more lines are too long
37
scraper-data-1/90249f15-69e4-4174-bee1-8dd4658b73e3.csv
Normal file
37
scraper-data-1/90249f15-69e4-4174-bee1-8dd4658b73e3.csv
Normal file
File diff suppressed because one or more lines are too long
74
scraper-data-1/9567b79b-b2c2-4590-8cbc-b17e4e41c0a6.csv
Normal file
74
scraper-data-1/9567b79b-b2c2-4590-8cbc-b17e4e41c0a6.csv
Normal file
File diff suppressed because one or more lines are too long
40
scraper-data-1/96ad0ca1-95c9-4e28-8b12-16a1ad0d10a1.csv
Normal file
40
scraper-data-1/96ad0ca1-95c9-4e28-8b12-16a1ad0d10a1.csv
Normal file
File diff suppressed because one or more lines are too long
25
scraper-data-1/9ede5b47-e75d-4cef-b125-29f66981c3ce.csv
Normal file
25
scraper-data-1/9ede5b47-e75d-4cef-b125-29f66981c3ce.csv
Normal file
File diff suppressed because one or more lines are too long
43
scraper-data-1/a14661d3-e928-44db-b3b1-dbad7c87b9e6.csv
Normal file
43
scraper-data-1/a14661d3-e928-44db-b3b1-dbad7c87b9e6.csv
Normal file
File diff suppressed because one or more lines are too long
33
scraper-data-1/a5c2212f-e428-43ab-9242-2329890a18d8.csv
Normal file
33
scraper-data-1/a5c2212f-e428-43ab-9242-2329890a18d8.csv
Normal file
File diff suppressed because one or more lines are too long
18
scraper-data-1/a9e79597-e967-48b2-bbd1-55cea9d516c6.csv
Normal file
18
scraper-data-1/a9e79597-e967-48b2-bbd1-55cea9d516c6.csv
Normal file
File diff suppressed because one or more lines are too long
43
scraper-data-1/aaf5e97d-7dab-4d5e-8ecb-ffc834101e83.csv
Normal file
43
scraper-data-1/aaf5e97d-7dab-4d5e-8ecb-ffc834101e83.csv
Normal file
File diff suppressed because one or more lines are too long
48
scraper-data-1/b2b1e231-d153-476d-80ff-de60225b700e.csv
Normal file
48
scraper-data-1/b2b1e231-d153-476d-80ff-de60225b700e.csv
Normal file
File diff suppressed because one or more lines are too long
117
scraper-data-1/ba511b6d-4530-4a5a-a2dc-6a242c26e307.csv
Normal file
117
scraper-data-1/ba511b6d-4530-4a5a-a2dc-6a242c26e307.csv
Normal file
File diff suppressed because one or more lines are too long
45
scraper-data-1/bc9f63c0-8069-4ad2-9f41-a34fbdfd68ca.csv
Normal file
45
scraper-data-1/bc9f63c0-8069-4ad2-9f41-a34fbdfd68ca.csv
Normal file
File diff suppressed because one or more lines are too long
46
scraper-data-1/c0c28d24-5898-4118-9035-1bbfe5a4ffd8.csv
Normal file
46
scraper-data-1/c0c28d24-5898-4118-9035-1bbfe5a4ffd8.csv
Normal file
File diff suppressed because one or more lines are too long
41
scraper-data-1/c0e9b36a-0269-4666-b860-5f56c62b4e8d.csv
Normal file
41
scraper-data-1/c0e9b36a-0269-4666-b860-5f56c62b4e8d.csv
Normal file
File diff suppressed because one or more lines are too long
52
scraper-data-1/cb8e6aa2-a886-43c8-b089-44ffbf198f4a.csv
Normal file
52
scraper-data-1/cb8e6aa2-a886-43c8-b089-44ffbf198f4a.csv
Normal file
File diff suppressed because one or more lines are too long
59
scraper-data-1/cc7fdc27-4523-421d-93a2-c2688633d7a2.csv
Normal file
59
scraper-data-1/cc7fdc27-4523-421d-93a2-c2688633d7a2.csv
Normal file
File diff suppressed because one or more lines are too long
2
scraper-data-1/d019440e-43a3-4790-bb39-b66d2fbb9486.csv
Normal file
2
scraper-data-1/d019440e-43a3-4790-bb39-b66d2fbb9486.csv
Normal file
File diff suppressed because one or more lines are too long
32
scraper-data-1/d025a229-c945-4c62-a11b-548fdea678d6.csv
Normal file
32
scraper-data-1/d025a229-c945-4c62-a11b-548fdea678d6.csv
Normal file
File diff suppressed because one or more lines are too long
47
scraper-data-1/d1f47306-7e62-4870-b7b0-b17266513b64.csv
Normal file
47
scraper-data-1/d1f47306-7e62-4870-b7b0-b17266513b64.csv
Normal file
File diff suppressed because one or more lines are too long
19
scraper-data-1/d228bcd4-10f1-4d9d-ac61-a749c429cd7b.csv
Normal file
19
scraper-data-1/d228bcd4-10f1-4d9d-ac61-a749c429cd7b.csv
Normal file
File diff suppressed because one or more lines are too long
32
scraper-data-1/d58b61a0-842a-42d1-8eb2-8ea691298796.csv
Normal file
32
scraper-data-1/d58b61a0-842a-42d1-8eb2-8ea691298796.csv
Normal file
File diff suppressed because one or more lines are too long
55
scraper-data-1/d5912fef-1aa8-45ff-a4ee-0048baab63c8.csv
Normal file
55
scraper-data-1/d5912fef-1aa8-45ff-a4ee-0048baab63c8.csv
Normal file
File diff suppressed because one or more lines are too long
31
scraper-data-1/d6371de0-d9b1-4ecd-9e4a-a795df718857.csv
Normal file
31
scraper-data-1/d6371de0-d9b1-4ecd-9e4a-a795df718857.csv
Normal file
File diff suppressed because one or more lines are too long
79
scraper-data-1/d67b4b2e-cd69-416e-9675-294e4a1bf7b7.csv
Normal file
79
scraper-data-1/d67b4b2e-cd69-416e-9675-294e4a1bf7b7.csv
Normal file
File diff suppressed because one or more lines are too long
30
scraper-data-1/d842cfb6-be7d-4690-b3e4-6fcb7d262019.csv
Normal file
30
scraper-data-1/d842cfb6-be7d-4690-b3e4-6fcb7d262019.csv
Normal file
File diff suppressed because one or more lines are too long
62
scraper-data-1/e274bc34-5df0-41fb-8491-2280e6d72ab7.csv
Normal file
62
scraper-data-1/e274bc34-5df0-41fb-8491-2280e6d72ab7.csv
Normal file
File diff suppressed because one or more lines are too long
50
scraper-data-1/e49938e1-b7ac-4aed-9706-01806c38dbf0.csv
Normal file
50
scraper-data-1/e49938e1-b7ac-4aed-9706-01806c38dbf0.csv
Normal file
File diff suppressed because one or more lines are too long
35
scraper-data-1/e67bce34-5c93-4355-90b7-cdf18175b869.csv
Normal file
35
scraper-data-1/e67bce34-5c93-4355-90b7-cdf18175b869.csv
Normal file
File diff suppressed because one or more lines are too long
26
scraper-data-1/ea482d12-613c-4cb5-8297-c4156ab3f305.csv
Normal file
26
scraper-data-1/ea482d12-613c-4cb5-8297-c4156ab3f305.csv
Normal file
File diff suppressed because one or more lines are too long
29
scraper-data-1/ebc83e46-38e2-4297-8e27-f77fdb3bb9a9.csv
Normal file
29
scraper-data-1/ebc83e46-38e2-4297-8e27-f77fdb3bb9a9.csv
Normal file
File diff suppressed because one or more lines are too long
49
scraper-data-1/fa49de61-ddcc-4117-9753-60134537c237.csv
Normal file
49
scraper-data-1/fa49de61-ddcc-4117-9753-60134537c237.csv
Normal file
File diff suppressed because one or more lines are too long
BIN
scraper-data-1/jobs.db
Normal file
BIN
scraper-data-1/jobs.db
Normal file
Binary file not shown.
BIN
scraper-data-1/jobs.db-shm
Normal file
BIN
scraper-data-1/jobs.db-shm
Normal file
Binary file not shown.
BIN
scraper-data-1/jobs.db-wal
Normal file
BIN
scraper-data-1/jobs.db-wal
Normal file
Binary file not shown.
48
scraper-data-2/0190015c-b2d5-4423-9831-783612514cc1.csv
Normal file
48
scraper-data-2/0190015c-b2d5-4423-9831-783612514cc1.csv
Normal file
File diff suppressed because one or more lines are too long
40
scraper-data-2/03100699-9292-4952-ab27-0b502755623e.csv
Normal file
40
scraper-data-2/03100699-9292-4952-ab27-0b502755623e.csv
Normal file
File diff suppressed because one or more lines are too long
25
scraper-data-2/073062a2-ca90-4fde-8fa6-ff3c9ed762aa.csv
Normal file
25
scraper-data-2/073062a2-ca90-4fde-8fa6-ff3c9ed762aa.csv
Normal file
File diff suppressed because one or more lines are too long
55
scraper-data-2/08476dc0-386f-493d-a006-2dacc9c3a969.csv
Normal file
55
scraper-data-2/08476dc0-386f-493d-a006-2dacc9c3a969.csv
Normal file
File diff suppressed because one or more lines are too long
21
scraper-data-2/0b76010e-7005-4f71-a98d-8f87b335fc08.csv
Normal file
21
scraper-data-2/0b76010e-7005-4f71-a98d-8f87b335fc08.csv
Normal file
File diff suppressed because one or more lines are too long
46
scraper-data-2/0ce4e8b0-9ab2-4f10-98b8-0ad99ff15daf.csv
Normal file
46
scraper-data-2/0ce4e8b0-9ab2-4f10-98b8-0ad99ff15daf.csv
Normal file
File diff suppressed because one or more lines are too long
36
scraper-data-2/15c99b61-c169-432c-bb16-c514c65e6d1e.csv
Normal file
36
scraper-data-2/15c99b61-c169-432c-bb16-c514c65e6d1e.csv
Normal file
File diff suppressed because one or more lines are too long
46
scraper-data-2/1611afca-30d1-4dcd-984c-772c5de32fb3.csv
Normal file
46
scraper-data-2/1611afca-30d1-4dcd-984c-772c5de32fb3.csv
Normal file
File diff suppressed because one or more lines are too long
44
scraper-data-2/1e06930f-a2a8-4020-9f94-3580b6e51d00.csv
Normal file
44
scraper-data-2/1e06930f-a2a8-4020-9f94-3580b6e51d00.csv
Normal file
File diff suppressed because one or more lines are too long
Some files were not shown because too many files have changed in this diff Show more
Loading…
Add table
Add a link
Reference in a new issue