cleanup
This commit is contained in:
parent
bb06a072b0
commit
355130a2d9
6 changed files with 0 additions and 1449 deletions
|
|
@ -1,68 +0,0 @@
|
||||||
import os
|
|
||||||
from flask import Flask, redirect, url_for, request, current_app
|
|
||||||
from flask_sqlalchemy import SQLAlchemy
|
|
||||||
from flask_login import LoginManager, current_user
|
|
||||||
from flask_migrate import Migrate
|
|
||||||
|
|
||||||
# ✅ Docker-Pfade
|
|
||||||
UPLOAD_FOLDER = '/app/uploads'
|
|
||||||
RESULT_FOLDER = '/app/results'
|
|
||||||
|
|
||||||
db = SQLAlchemy()
|
|
||||||
login_manager = LoginManager()
|
|
||||||
migrate = Migrate()
|
|
||||||
|
|
||||||
def create_app():
|
|
||||||
app = Flask(__name__)
|
|
||||||
|
|
||||||
# 🔑 Configs
|
|
||||||
app.config['SECRET_KEY'] = '008e7369b075886d5f494c8813efdfb17155da6af12b3fe8ee'
|
|
||||||
app.config['SQLALCHEMY_DATABASE_URI'] = 'sqlite:///users.db'
|
|
||||||
app.config['SQLALCHEMY_TRACK_MODIFICATIONS'] = False
|
|
||||||
app.config['UPLOAD_FOLDER'] = UPLOAD_FOLDER
|
|
||||||
app.config['RESULT_FOLDER'] = RESULT_FOLDER
|
|
||||||
app.config['ALLOW_USER_SIGNUP'] = True # ✅ Aktiviert!
|
|
||||||
|
|
||||||
# DB + Tools
|
|
||||||
db.init_app(app)
|
|
||||||
migrate.init_app(app, db)
|
|
||||||
login_manager.init_app(app)
|
|
||||||
login_manager.login_view = 'auth.login'
|
|
||||||
|
|
||||||
# User Loader
|
|
||||||
@login_manager.user_loader
|
|
||||||
def load_user(user_id):
|
|
||||||
from .models import User
|
|
||||||
return User.query.get(int(user_id))
|
|
||||||
|
|
||||||
# Protected Routes
|
|
||||||
@app.before_request
|
|
||||||
def require_login():
|
|
||||||
allowed = ['auth.login', 'auth.signup', 'static']
|
|
||||||
if (not current_user.is_authenticated and
|
|
||||||
request.endpoint not in allowed and
|
|
||||||
not request.path.startswith('/static')):
|
|
||||||
return redirect(url_for('auth.login'))
|
|
||||||
|
|
||||||
# Ordner
|
|
||||||
os.makedirs(app.config['UPLOAD_FOLDER'], exist_ok=True)
|
|
||||||
os.makedirs(app.config['RESULT_FOLDER'], exist_ok=True)
|
|
||||||
|
|
||||||
# Routes
|
|
||||||
from . import routes
|
|
||||||
app.register_blueprint(routes.bp)
|
|
||||||
|
|
||||||
# Index Redirect
|
|
||||||
@app.route('/')
|
|
||||||
def index():
|
|
||||||
return redirect(url_for('auth.job_status'))
|
|
||||||
|
|
||||||
# DB Tables
|
|
||||||
with app.app_context():
|
|
||||||
db.create_all()
|
|
||||||
|
|
||||||
return app
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
|
||||||
app = create_app()
|
|
||||||
app.run(host='0.0.0.0', port=5000, debug=False)
|
|
||||||
223
app/routes.orig
223
app/routes.orig
|
|
@ -1,223 +0,0 @@
|
||||||
import time
|
|
||||||
import csv
|
|
||||||
import os
|
|
||||||
import threading
|
|
||||||
from flask import Blueprint, request, redirect, url_for, flash, render_template, send_file, current_app
|
|
||||||
from flask_login import login_user, logout_user, login_required, current_user
|
|
||||||
from werkzeug.utils import secure_filename
|
|
||||||
from werkzeug.security import generate_password_hash, check_password_hash
|
|
||||||
from .models import db, User, Job
|
|
||||||
from .webcrawler import process_file # Importiere die Funktion für das Webscraping
|
|
||||||
|
|
||||||
UPLOAD_FOLDER = 'uploads'
|
|
||||||
RESULT_FOLDER = 'results'
|
|
||||||
|
|
||||||
# Blueprint für auth erstellen
|
|
||||||
bp = Blueprint('auth', __name__)
|
|
||||||
|
|
||||||
@bp.route('/login', methods=['GET', 'POST'])
|
|
||||||
def login():
|
|
||||||
if request.method == 'POST':
|
|
||||||
username = request.form['username']
|
|
||||||
password = request.form['password']
|
|
||||||
user = User.query.filter_by(username=username).first()
|
|
||||||
if user and check_password_hash(user.password, password):
|
|
||||||
login_user(user)
|
|
||||||
return redirect(url_for('auth.job_status'))
|
|
||||||
flash('Login fehlgeschlagen. Überprüfen Sie Benutzername und Passwort.')
|
|
||||||
return render_template('login.html')
|
|
||||||
|
|
||||||
@bp.route('/signup', methods=['GET', 'POST'])
|
|
||||||
def signup():
|
|
||||||
if not current_app.config['ALLOW_USER_SIGNUP']:
|
|
||||||
flash("Registrierung ist derzeit deaktiviert.")
|
|
||||||
return redirect(url_for('auth.login'))
|
|
||||||
|
|
||||||
if request.method == 'POST':
|
|
||||||
username = request.form['username']
|
|
||||||
password = generate_password_hash(request.form['password'], method='sha256')
|
|
||||||
new_user = User(username=username, password=password)
|
|
||||||
db.session.add(new_user)
|
|
||||||
db.session.commit()
|
|
||||||
flash('Benutzer erfolgreich erstellt! Sie können sich jetzt einloggen.')
|
|
||||||
return redirect(url_for('auth.login'))
|
|
||||||
|
|
||||||
return render_template('signup.html')
|
|
||||||
|
|
||||||
@bp.route('/logout')
|
|
||||||
@login_required
|
|
||||||
def logout():
|
|
||||||
logout_user()
|
|
||||||
return redirect(url_for('auth.login'))
|
|
||||||
|
|
||||||
@bp.route('/jobs')
|
|
||||||
@login_required
|
|
||||||
def job_status():
|
|
||||||
jobs = Job.query.filter_by(user_id=current_user.id).all()
|
|
||||||
return render_template('jobs.html', jobs=jobs)
|
|
||||||
|
|
||||||
@bp.route('/upload', methods=['GET', 'POST'])
|
|
||||||
@login_required
|
|
||||||
def upload():
|
|
||||||
if request.method == 'POST':
|
|
||||||
file = request.files['file']
|
|
||||||
filename = secure_filename(file.filename)
|
|
||||||
|
|
||||||
# Überprüfen, ob eine Datei mit dem gleichen Namen bereits existiert
|
|
||||||
file_path = os.path.join(current_app.config['UPLOAD_FOLDER'], filename)
|
|
||||||
if os.path.exists(file_path):
|
|
||||||
# Wenn eine Datei mit dem gleichen Namen existiert, einen Zeitstempel hinzufügen
|
|
||||||
name, ext = os.path.splitext(filename)
|
|
||||||
timestamp = time.strftime("%Y%m%d-%H%M%S") # Zeitstempel im Format JahrMonatTag-StundenMinutenSekunden
|
|
||||||
filename = f"{name}_{timestamp}{ext}"
|
|
||||||
file_path = os.path.join(current_app.config['UPLOAD_FOLDER'], filename)
|
|
||||||
flash(f"Eine Datei mit gleichem Namen existierte bereits. Die Datei wurde als '{filename}' gespeichert.")
|
|
||||||
|
|
||||||
# Speichern der Datei
|
|
||||||
file.save(file_path)
|
|
||||||
flash('Datei erfolgreich hochgeladen und Job gestartet')
|
|
||||||
|
|
||||||
# Neuen Job erstellen
|
|
||||||
new_job = Job(user_id=current_user.id, filename=filename, status="Pending")
|
|
||||||
db.session.add(new_job)
|
|
||||||
db.session.commit()
|
|
||||||
|
|
||||||
# Debugging-Ausgabe zur Überprüfung der Thread-Erstellung
|
|
||||||
print(f"Starte Scraping-Thread für Job-ID: {new_job.id}")
|
|
||||||
|
|
||||||
# Starten des Scraping im Hintergrund-Thread und Übergeben des aktuellen Anwendungskontexts
|
|
||||||
thread = threading.Thread(target=process_file, args=(filename, new_job.id, current_app._get_current_object()))
|
|
||||||
thread.start()
|
|
||||||
|
|
||||||
# Debugging-Ausgabe, nachdem der Thread gestartet wurde
|
|
||||||
print(f"Thread für Job {new_job.id} erfolgreich gestartet.")
|
|
||||||
|
|
||||||
return redirect(url_for('auth.job_status'))
|
|
||||||
|
|
||||||
return render_template('upload.html')
|
|
||||||
|
|
||||||
@bp.route('/download/<int:job_id>', methods=['GET'])
|
|
||||||
@login_required
|
|
||||||
def download_result(job_id):
|
|
||||||
job = Job.query.get_or_404(job_id)
|
|
||||||
print(f"Job ID: {job.id} - User ID: {job.user_id} - Current User ID: {current_user.id}")
|
|
||||||
|
|
||||||
# Überprüfen, ob der Job dem aktuellen Benutzer gehört
|
|
||||||
if job.user_id != current_user.id:
|
|
||||||
flash("Sie haben keine Berechtigung, dieses Ergebnis herunterzuladen.")
|
|
||||||
return redirect(url_for('auth.job_status'))
|
|
||||||
|
|
||||||
# Überprüfen, ob das Ergebnis vorhanden ist
|
|
||||||
if not job.result_filename:
|
|
||||||
flash("Das Ergebnis ist noch nicht verfügbar.")
|
|
||||||
return redirect(url_for('auth.job_status'))
|
|
||||||
|
|
||||||
# Überprüfen, ob die Datei im angegebenen Pfad existiert
|
|
||||||
result_path = os.path.join(current_app.config['RESULT_FOLDER'], job.result_filename)
|
|
||||||
print(f"Versuche, Datei herunterzuladen von: {result_path}")
|
|
||||||
|
|
||||||
if os.path.exists(result_path):
|
|
||||||
print("Datei existiert und wird zum Download bereitgestellt.")
|
|
||||||
return send_file(result_path, as_attachment=True)
|
|
||||||
else:
|
|
||||||
print("Datei nicht gefunden. Ergebnisverzeichnis oder Pfad prüfen.")
|
|
||||||
flash("Ergebnisdatei nicht gefunden.")
|
|
||||||
return redirect(url_for('auth.job_status'))
|
|
||||||
|
|
||||||
|
|
||||||
@bp.route('/delete_job/<int:job_id>', methods=['POST'])
|
|
||||||
@login_required
|
|
||||||
def delete_job(job_id):
|
|
||||||
job = Job.query.get_or_404(job_id)
|
|
||||||
if job.user_id != current_user.id:
|
|
||||||
flash("Sie haben keine Berechtigung, diesen Job zu löschen.")
|
|
||||||
return redirect(url_for('auth.job_status'))
|
|
||||||
|
|
||||||
# Löschen der Upload-Datei
|
|
||||||
upload_path = os.path.join(current_app.config['UPLOAD_FOLDER'], job.filename)
|
|
||||||
if os.path.exists(upload_path):
|
|
||||||
os.remove(upload_path)
|
|
||||||
print(f"Upload-Datei gelöscht: {upload_path}")
|
|
||||||
else:
|
|
||||||
print(f"Upload-Datei nicht gefunden: {upload_path}")
|
|
||||||
|
|
||||||
# Löschen der Results-Datei, falls vorhanden
|
|
||||||
if job.result_filename:
|
|
||||||
result_path = os.path.join(current_app.config['RESULT_FOLDER'], job.result_filename)
|
|
||||||
print(f"Versuche Ergebnisdatei zu löschen: {result_path}")
|
|
||||||
|
|
||||||
if os.path.exists(result_path):
|
|
||||||
try:
|
|
||||||
os.remove(result_path)
|
|
||||||
print(f"Ergebnisdatei gelöscht: {result_path}")
|
|
||||||
except Exception as e:
|
|
||||||
print(f"Fehler beim Löschen der Ergebnisdatei: {e}")
|
|
||||||
else:
|
|
||||||
print(f"Ergebnisdatei nicht gefunden im Pfad: {result_path}")
|
|
||||||
|
|
||||||
# Job aus der Datenbank löschen
|
|
||||||
db.session.delete(job)
|
|
||||||
db.session.commit()
|
|
||||||
flash("Job erfolgreich gelöscht.")
|
|
||||||
return redirect(url_for('auth.job_status'))
|
|
||||||
|
|
||||||
@bp.route('/admin', methods=['GET'])
|
|
||||||
@login_required
|
|
||||||
def admin_panel():
|
|
||||||
if not current_user.is_admin:
|
|
||||||
flash("Keine Berechtigung.")
|
|
||||||
return redirect(url_for('auth.job_status'))
|
|
||||||
|
|
||||||
users = User.query.all()
|
|
||||||
return render_template('admin_panel.html', users=users)
|
|
||||||
|
|
||||||
@bp.route('/admin/create_user', methods=['POST'])
|
|
||||||
@login_required
|
|
||||||
def create_user():
|
|
||||||
if not current_user.is_admin:
|
|
||||||
flash("Keine Berechtigung.")
|
|
||||||
return redirect(url_for('auth.admin_panel'))
|
|
||||||
|
|
||||||
username = request.form['username']
|
|
||||||
password = request.form['password']
|
|
||||||
is_admin = 'is_admin' in request.form # Checkbox für Adminrechte
|
|
||||||
|
|
||||||
hashed_password = generate_password_hash(password, method='sha256')
|
|
||||||
new_user = User(username=username, password=hashed_password, is_admin=is_admin)
|
|
||||||
db.session.add(new_user)
|
|
||||||
db.session.commit()
|
|
||||||
|
|
||||||
flash(f"Benutzer {username} wurde erstellt.")
|
|
||||||
return redirect(url_for('auth.admin_panel'))
|
|
||||||
|
|
||||||
@bp.route('/admin/reset_password/<int:user_id>', methods=['POST'])
|
|
||||||
@login_required
|
|
||||||
def reset_password(user_id):
|
|
||||||
if not current_user.is_admin:
|
|
||||||
flash("Keine Berechtigung.")
|
|
||||||
return redirect(url_for('auth.admin_panel'))
|
|
||||||
|
|
||||||
user = User.query.get_or_404(user_id)
|
|
||||||
new_password = request.form['new_password']
|
|
||||||
user.password = generate_password_hash(new_password, method='sha256')
|
|
||||||
db.session.commit()
|
|
||||||
|
|
||||||
flash(f"Passwort für Benutzer {user.username} wurde zurückgesetzt.")
|
|
||||||
return redirect(url_for('auth.admin_panel'))
|
|
||||||
|
|
||||||
@bp.route('/admin/delete_user/<int:user_id>', methods=['POST'])
|
|
||||||
@login_required
|
|
||||||
def delete_user(user_id):
|
|
||||||
if not current_user.is_admin:
|
|
||||||
flash("Keine Berechtigung.")
|
|
||||||
return redirect(url_for('auth.admin_panel'))
|
|
||||||
|
|
||||||
user = User.query.get_or_404(user_id)
|
|
||||||
if user.is_admin:
|
|
||||||
flash("Administratoren können nicht gelöscht werden.")
|
|
||||||
return redirect(url_for('auth.admin_panel'))
|
|
||||||
|
|
||||||
db.session.delete(user)
|
|
||||||
db.session.commit()
|
|
||||||
flash(f"Benutzer {user.username} wurde gelöscht.")
|
|
||||||
return redirect(url_for('auth.admin_panel'))
|
|
||||||
|
|
@ -1,316 +0,0 @@
|
||||||
import os
|
|
||||||
import re
|
|
||||||
import pandas as pd
|
|
||||||
import requests
|
|
||||||
import time
|
|
||||||
import random
|
|
||||||
from io import StringIO
|
|
||||||
from app.models import db, Job
|
|
||||||
|
|
||||||
print("🆕 MODERN webcrawler LOADED!")
|
|
||||||
|
|
||||||
UPLOAD_FOLDER = '/app/uploads'
|
|
||||||
RESULT_FOLDER = '/app/results'
|
|
||||||
SCRAPER_URL = "http://gmaps-scraper:8080"
|
|
||||||
|
|
||||||
OUTPUT_COLS = ['title', 'category', 'address', 'open_hours', 'website', 'phone', 'link']
|
|
||||||
|
|
||||||
|
|
||||||
# ──────────────────────────────────────────────
|
|
||||||
# Hilfsfunktionen
|
|
||||||
# ──────────────────────────────────────────────
|
|
||||||
|
|
||||||
def get_batch_size(total_rows):
|
|
||||||
if total_rows < 50: return 10
|
|
||||||
elif total_rows < 200: return 10
|
|
||||||
elif total_rows < 500: return 5
|
|
||||||
else: return 5
|
|
||||||
|
|
||||||
def get_delay(total_rows):
|
|
||||||
if total_rows < 50: return (5, 10)
|
|
||||||
elif total_rows < 200: return (10, 20)
|
|
||||||
else: return (20, 40)
|
|
||||||
|
|
||||||
def is_blocked(data):
|
|
||||||
text = str(data).lower()
|
|
||||||
blocked = any(kw in text for kw in ['captcha', 'blocked', 'rate limit', 'too many', '429'])
|
|
||||||
if blocked:
|
|
||||||
print(f"🚫 BLOCKED: {str(data)[:100]}")
|
|
||||||
return blocked
|
|
||||||
|
|
||||||
def fix_encoding(text):
|
|
||||||
"""Kaputte ISO→UTF8 Zeichen reparieren (z.B. Industriestraße → Industriestraße)"""
|
|
||||||
if not isinstance(text, str):
|
|
||||||
return text
|
|
||||||
try:
|
|
||||||
return text.encode('latin-1').decode('utf-8')
|
|
||||||
except (UnicodeEncodeError, UnicodeDecodeError):
|
|
||||||
return text
|
|
||||||
|
|
||||||
def build_input_addresses(df):
|
|
||||||
"""Normalisierte Adressen aus Input-CSV für Abgleich"""
|
|
||||||
addresses = set()
|
|
||||||
for _, row in df.iterrows():
|
|
||||||
plz = str(row.get('PLZ', '')).strip()
|
|
||||||
stadt = str(row.get('Stadt', '')).strip()
|
|
||||||
str_ = str(row.get('Straße', '')).strip()
|
|
||||||
nr = str(row.get('Hausnummer', '')).strip()
|
|
||||||
zusatz = str(row.get('Zusatz', '')).strip()
|
|
||||||
|
|
||||||
full = f"{str_} {nr} {zusatz} {plz} {stadt}".lower().strip()
|
|
||||||
full = ' '.join(full.split())
|
|
||||||
addresses.add(full)
|
|
||||||
return addresses
|
|
||||||
|
|
||||||
def normalize_address(addr):
|
|
||||||
"""Output-Adresse normalisieren für Abgleich"""
|
|
||||||
if not isinstance(addr, str):
|
|
||||||
return ''
|
|
||||||
addr = fix_encoding(addr)
|
|
||||||
return ' '.join(addr.lower().strip().split())
|
|
||||||
|
|
||||||
def address_in_input(result_addr, input_addresses):
|
|
||||||
"""Prüft ob PLZ + Straßenname aus Result im Input vorkommen"""
|
|
||||||
norm = normalize_address(result_addr)
|
|
||||||
for inp_addr in input_addresses:
|
|
||||||
plz_match = re.search(r'\b\d{5}\b', inp_addr)
|
|
||||||
if plz_match:
|
|
||||||
plz = plz_match.group()
|
|
||||||
if plz in norm:
|
|
||||||
street = inp_addr.split()[0] if inp_addr else ''
|
|
||||||
if len(street) > 3 and street[:4].lower() in norm:
|
|
||||||
return True
|
|
||||||
return False
|
|
||||||
|
|
||||||
|
|
||||||
# ──────────────────────────────────────────────
|
|
||||||
# CSV Nachbearbeitung (apply_filter umschaltbar)
|
|
||||||
# ──────────────────────────────────────────────
|
|
||||||
|
|
||||||
def process_result_csv(raw_bytes, input_df, apply_filter=True):
|
|
||||||
"""
|
|
||||||
Raw CSV → bereinigt:
|
|
||||||
- Nur OUTPUT_COLS
|
|
||||||
- Encoding fix
|
|
||||||
- Optional: Input/Output Abgleich + Duplikate
|
|
||||||
"""
|
|
||||||
try:
|
|
||||||
content = raw_bytes.decode('utf-8', errors='replace')
|
|
||||||
df_out = pd.read_csv(StringIO(content))
|
|
||||||
print(f"📄 Raw result: {df_out.shape} | Columns: {list(df_out.columns)[:8]}")
|
|
||||||
|
|
||||||
# Spalten filtern
|
|
||||||
available = [c for c in OUTPUT_COLS if c in df_out.columns]
|
|
||||||
missing = [c for c in OUTPUT_COLS if c not in df_out.columns]
|
|
||||||
if missing:
|
|
||||||
print(f"⚠️ Fehlende Spalten: {missing}")
|
|
||||||
df_out = df_out[available]
|
|
||||||
|
|
||||||
# 🔤 Encoding fix
|
|
||||||
for col in df_out.columns:
|
|
||||||
df_out[col] = df_out[col].apply(fix_encoding)
|
|
||||||
print(f"🔤 Encoding fix: done")
|
|
||||||
|
|
||||||
if apply_filter:
|
|
||||||
# 📍 Input/Output Abgleich
|
|
||||||
input_addresses = build_input_addresses(input_df)
|
|
||||||
before = len(df_out)
|
|
||||||
df_out = df_out[
|
|
||||||
df_out['address'].apply(
|
|
||||||
lambda a: address_in_input(a, input_addresses)
|
|
||||||
)
|
|
||||||
]
|
|
||||||
print(f"📍 Adress-Filter: {before} → {len(df_out)} Zeilen")
|
|
||||||
|
|
||||||
# 🔁 Duplikate entfernen (immer, auch bei Raw)
|
|
||||||
before_dedup = len(df_out)
|
|
||||||
df_out = df_out.drop_duplicates(subset=['title', 'address'], keep='first')
|
|
||||||
print(f"🔁 Duplikate: {before_dedup} → {len(df_out)} Zeilen")
|
|
||||||
|
|
||||||
# Leere Titel entfernen
|
|
||||||
df_out = df_out.dropna(subset=['title'], how='all')
|
|
||||||
df_out = df_out[df_out['title'].str.strip().astype(bool)]
|
|
||||||
|
|
||||||
print(f"✅ Final ({'gefiltert' if apply_filter else 'alle'}): {df_out.shape}")
|
|
||||||
return df_out
|
|
||||||
|
|
||||||
except Exception as e:
|
|
||||||
print(f"💥 process_result_csv: {e}")
|
|
||||||
import traceback
|
|
||||||
traceback.print_exc()
|
|
||||||
return None
|
|
||||||
|
|
||||||
|
|
||||||
# ──────────────────────────────────────────────
|
|
||||||
# Haupt-Worker
|
|
||||||
# ──────────────────────────────────────────────
|
|
||||||
|
|
||||||
def process_file(filename, job_id, app):
|
|
||||||
print(f"🎯 {filename} Job#{job_id} START!")
|
|
||||||
|
|
||||||
with app.app_context():
|
|
||||||
job = Job.query.get(job_id)
|
|
||||||
if not job:
|
|
||||||
print("❌ Job missing")
|
|
||||||
return
|
|
||||||
|
|
||||||
try:
|
|
||||||
# 1️⃣ CSV Parse
|
|
||||||
job.status = "📊 parsing CSV"
|
|
||||||
db.session.commit()
|
|
||||||
|
|
||||||
filepath = os.path.join(UPLOAD_FOLDER, filename)
|
|
||||||
print(f"📁 {filepath} | {os.path.getsize(filepath)}b")
|
|
||||||
|
|
||||||
df_input = pd.read_csv(filepath, sep=';', encoding='ISO-8859-1')
|
|
||||||
print(f"📊 {df_input.shape} | Columns: {list(df_input.columns)}")
|
|
||||||
|
|
||||||
queries = []
|
|
||||||
for _, row in df_input.iterrows():
|
|
||||||
parts = [
|
|
||||||
str(row.get('PLZ', '')).strip(),
|
|
||||||
str(row.get('Stadt', '')).strip(),
|
|
||||||
str(row.get('Straße', '')).strip(),
|
|
||||||
str(row.get('Hausnummer', '')).strip(),
|
|
||||||
str(row.get('Zusatz', '')).strip(),
|
|
||||||
]
|
|
||||||
q = f"Firmen {' '.join(p for p in parts if p and p != 'nan')}".strip()
|
|
||||||
if len(q) > 10:
|
|
||||||
queries.append(q)
|
|
||||||
|
|
||||||
total = len(queries)
|
|
||||||
print(f"🔍 {total} Queries | Samples: {queries[:3]}")
|
|
||||||
if not queries:
|
|
||||||
raise ValueError("Keine gültigen Adressen in CSV")
|
|
||||||
|
|
||||||
# 2️⃣ Batch + Delay
|
|
||||||
batch_size = get_batch_size(total)
|
|
||||||
delay_min, delay_max = get_delay(total)
|
|
||||||
batch = queries[:batch_size]
|
|
||||||
pre_delay = random.uniform(delay_min, delay_max)
|
|
||||||
print(f"📦 Batch {len(batch)}/{total} | 😴 {pre_delay:.1f}s Delay")
|
|
||||||
time.sleep(pre_delay)
|
|
||||||
|
|
||||||
# 3️⃣ API Call
|
|
||||||
job.status = "📤 sending to scraper"
|
|
||||||
db.session.commit()
|
|
||||||
|
|
||||||
payload = {
|
|
||||||
"name": f"{filename.replace('.csv','')}-{job_id}",
|
|
||||||
"keywords": batch,
|
|
||||||
"lang": "de",
|
|
||||||
"depth": 1,
|
|
||||||
"zoom": 17,
|
|
||||||
"radius": 50,
|
|
||||||
"max_time": 60,
|
|
||||||
"fast_mode": False
|
|
||||||
}
|
|
||||||
|
|
||||||
print(f"🌐 POST {SCRAPER_URL}/api/v1/jobs | {payload['name']}")
|
|
||||||
resp = requests.post(f"{SCRAPER_URL}/api/v1/jobs", json=payload, timeout=30)
|
|
||||||
print(f"📤 {resp.status_code}: {resp.text[:300]}")
|
|
||||||
|
|
||||||
if is_blocked(resp.text):
|
|
||||||
raise ValueError("🚫 IP geblockt! Proxy konfigurieren.")
|
|
||||||
if resp.status_code != 201:
|
|
||||||
raise ValueError(f"API {resp.status_code}: {resp.text[:200]}")
|
|
||||||
|
|
||||||
# 4️⃣ Polling
|
|
||||||
scraper_id = resp.json()['id']
|
|
||||||
job.scraper_job_id = scraper_id
|
|
||||||
job.status = "⏳ scraping"
|
|
||||||
db.session.commit()
|
|
||||||
print(f"✅ Scraper Job: {scraper_id}")
|
|
||||||
|
|
||||||
for i in range(1, 61): # Max 10min
|
|
||||||
try:
|
|
||||||
r = requests.get(
|
|
||||||
f"{SCRAPER_URL}/api/v1/jobs/{scraper_id}",
|
|
||||||
timeout=10
|
|
||||||
)
|
|
||||||
data = r.json()
|
|
||||||
status = data.get('Status', data.get('status', '?'))
|
|
||||||
print(f"⏳ {i}/60: {status}")
|
|
||||||
|
|
||||||
if is_blocked(data):
|
|
||||||
raise ValueError("🚫 IP geblockt während scraping!")
|
|
||||||
|
|
||||||
if status in ('ok', 'completed', 'scraped'):
|
|
||||||
dl = requests.get(
|
|
||||||
f"{SCRAPER_URL}/api/v1/jobs/{scraper_id}/download",
|
|
||||||
timeout=60
|
|
||||||
)
|
|
||||||
if dl.status_code != 200:
|
|
||||||
raise ValueError(f"Download {dl.status_code}")
|
|
||||||
if is_blocked(dl.text[:200]):
|
|
||||||
raise ValueError("🚫 IP geblockt beim Download!")
|
|
||||||
|
|
||||||
# 5️⃣ Nachbearbeitung → zwei Versionen
|
|
||||||
job.status = "🔧 processing result"
|
|
||||||
db.session.commit()
|
|
||||||
|
|
||||||
base = filename.replace('.csv', '')
|
|
||||||
os.makedirs(RESULT_FOLDER, exist_ok=True)
|
|
||||||
|
|
||||||
# ── Version A: Gefiltert (Adressabgleich + Deduplizierung) ──
|
|
||||||
df_filtered = process_result_csv(dl.content, df_input, apply_filter=True)
|
|
||||||
outname_filtered = f"results_{base}_filtered.csv"
|
|
||||||
outpath_filtered = os.path.join(RESULT_FOLDER, outname_filtered)
|
|
||||||
|
|
||||||
if df_filtered is not None and len(df_filtered) > 0:
|
|
||||||
df_filtered.to_csv(
|
|
||||||
outpath_filtered, index=False,
|
|
||||||
encoding='utf-8-sig', sep=';'
|
|
||||||
)
|
|
||||||
print(f"🎯 Filtered: {outname_filtered} → {len(df_filtered)} Firmen")
|
|
||||||
else:
|
|
||||||
print("⚠️ Keine Treffer nach Filter – leere Datei wird erstellt")
|
|
||||||
pd.DataFrame(columns=OUTPUT_COLS).to_csv(
|
|
||||||
outpath_filtered, index=False,
|
|
||||||
encoding='utf-8-sig', sep=';'
|
|
||||||
)
|
|
||||||
|
|
||||||
# ── Version B: Alle (nur Spalten + Encoding, kein Filter) ──
|
|
||||||
df_raw = process_result_csv(dl.content, df_input, apply_filter=False)
|
|
||||||
outname_raw = f"results_{base}_all.csv"
|
|
||||||
outpath_raw = os.path.join(RESULT_FOLDER, outname_raw)
|
|
||||||
|
|
||||||
if df_raw is not None:
|
|
||||||
df_raw.to_csv(
|
|
||||||
outpath_raw, index=False,
|
|
||||||
encoding='utf-8-sig', sep=';'
|
|
||||||
)
|
|
||||||
print(f"📋 All: {outname_raw} → {len(df_raw)} Firmen")
|
|
||||||
else:
|
|
||||||
print("⚠️ df_raw None – Rohinhalt wird gespeichert")
|
|
||||||
with open(outpath_raw, 'wb') as f:
|
|
||||||
f.write(dl.content)
|
|
||||||
|
|
||||||
# ── DB speichern ──
|
|
||||||
job.status = "✅ Fertig"
|
|
||||||
job.result_filename = outname_filtered # 🎯 Gefiltert
|
|
||||||
job.result_filename_raw = outname_raw # 📋 Alle
|
|
||||||
db.session.commit()
|
|
||||||
print(f"🎉 Beide Dateien gespeichert!")
|
|
||||||
break
|
|
||||||
|
|
||||||
elif status in ('failed', 'cancelled', 'error'):
|
|
||||||
raise ValueError(f"Scraper: {status}")
|
|
||||||
|
|
||||||
except requests.RequestException as e:
|
|
||||||
print(f"⚠️ Poll {i}: {e}")
|
|
||||||
|
|
||||||
time.sleep(random.uniform(8, 15))
|
|
||||||
|
|
||||||
else:
|
|
||||||
raise ValueError("Timeout nach 10min")
|
|
||||||
|
|
||||||
except Exception as e:
|
|
||||||
job.status = "Failed"
|
|
||||||
job.result_filename = str(e)
|
|
||||||
print(f"💥 ERROR: {e}")
|
|
||||||
import traceback
|
|
||||||
traceback.print_exc()
|
|
||||||
|
|
||||||
db.session.commit()
|
|
||||||
print(f"✅ DONE! Status: {job.status}\n")
|
|
||||||
|
|
@ -1,275 +0,0 @@
|
||||||
import os
|
|
||||||
import re
|
|
||||||
import pandas as pd
|
|
||||||
import requests
|
|
||||||
import time
|
|
||||||
import random
|
|
||||||
from io import StringIO
|
|
||||||
from app.models import db, Job
|
|
||||||
|
|
||||||
print("🆕 MODERN webcrawler LOADED! – BATCHED + PROXY")
|
|
||||||
|
|
||||||
UPLOAD_FOLDER = '/app/uploads'
|
|
||||||
RESULT_FOLDER = '/app/results'
|
|
||||||
SCRAPER_URL = "http://gmaps-scraper:8080"
|
|
||||||
|
|
||||||
OUTPUT_COLS = ['title', 'category', 'address', 'open_hours', 'website', 'phone', 'link']
|
|
||||||
|
|
||||||
PROXY_URL = "http://bitlleuv-rotate:s5hzse6hz74b@p.webshare.io:80"
|
|
||||||
API_PROXIES = {"http": PROXY_URL, "https": PROXY_URL}
|
|
||||||
|
|
||||||
# ──────────────────────────────────────────────
|
|
||||||
# Hilfsfunktionen
|
|
||||||
# ──────────────────────────────────────────────
|
|
||||||
|
|
||||||
def is_blocked(data):
|
|
||||||
text = str(data).lower()
|
|
||||||
blocked = any(kw in text for kw in ['captcha', 'blocked', 'rate limit', 'too many', '429'])
|
|
||||||
if blocked:
|
|
||||||
print(f"🚫 BLOCKED: {str(data)[:100]}")
|
|
||||||
return blocked
|
|
||||||
|
|
||||||
def fix_encoding(text):
|
|
||||||
if not isinstance(text, str):
|
|
||||||
return text
|
|
||||||
try:
|
|
||||||
return text.encode('latin-1').decode('utf-8')
|
|
||||||
except (UnicodeEncodeError, UnicodeDecodeError):
|
|
||||||
return text
|
|
||||||
|
|
||||||
def build_input_addresses(df):
|
|
||||||
addresses = set()
|
|
||||||
for _, row in df.iterrows():
|
|
||||||
plz = str(row.get('PLZ', '')).strip()
|
|
||||||
stadt = str(row.get('Stadt', '')).strip()
|
|
||||||
str_ = str(row.get('Straße', '')).strip()
|
|
||||||
nr = str(row.get('Hausnummer', '')).strip()
|
|
||||||
zusatz = str(row.get('Zusatz', '')).strip()
|
|
||||||
full = f"{str_} {nr} {zusatz} {plz} {stadt}".lower().strip()
|
|
||||||
full = ' '.join(full.split())
|
|
||||||
addresses.add(full)
|
|
||||||
return addresses
|
|
||||||
|
|
||||||
def normalize_address(addr):
|
|
||||||
if not isinstance(addr, str):
|
|
||||||
return ''
|
|
||||||
addr = fix_encoding(addr)
|
|
||||||
return ' '.join(addr.lower().strip().split())
|
|
||||||
|
|
||||||
def address_in_input(result_addr, input_addresses):
|
|
||||||
norm = normalize_address(result_addr)
|
|
||||||
for inp_addr in input_addresses:
|
|
||||||
plz_match = re.search(r'\b\d{5}\b', inp_addr)
|
|
||||||
if plz_match:
|
|
||||||
plz = plz_match.group()
|
|
||||||
if plz in norm:
|
|
||||||
street = inp_addr.split()[0] if inp_addr else ''
|
|
||||||
if len(street) > 3 and street[:4].lower() in norm:
|
|
||||||
return True
|
|
||||||
return False
|
|
||||||
|
|
||||||
# ──────────────────────────────────────────────
|
|
||||||
# CSV Nachbearbeitung
|
|
||||||
# ──────────────────────────────────────────────
|
|
||||||
|
|
||||||
def process_result_csv(raw_bytes, input_df, apply_filter=True):
|
|
||||||
try:
|
|
||||||
content = raw_bytes.decode('utf-8', errors='replace')
|
|
||||||
df_out = pd.read_csv(StringIO(content))
|
|
||||||
print(f"📄 Raw result: {df_out.shape}")
|
|
||||||
|
|
||||||
available = [c for c in OUTPUT_COLS if c in df_out.columns]
|
|
||||||
df_out = df_out[available]
|
|
||||||
|
|
||||||
for col in df_out.columns:
|
|
||||||
df_out[col] = df_out[col].apply(fix_encoding)
|
|
||||||
|
|
||||||
if apply_filter:
|
|
||||||
input_addresses = build_input_addresses(input_df)
|
|
||||||
before = len(df_out)
|
|
||||||
df_out = df_out[
|
|
||||||
df_out['address'].apply(
|
|
||||||
lambda a: address_in_input(a, input_addresses)
|
|
||||||
)
|
|
||||||
]
|
|
||||||
print(f"📍 Filter: {before} → {len(df_out)}")
|
|
||||||
|
|
||||||
df_out = df_out.drop_duplicates(subset=['title', 'address'], keep='first')
|
|
||||||
df_out = df_out.dropna(subset=['title'], how='all')
|
|
||||||
df_out = df_out[df_out['title'].str.strip().astype(bool)]
|
|
||||||
|
|
||||||
print(f"✅ Final ({'gefiltert' if apply_filter else 'alle'}): {df_out.shape}")
|
|
||||||
return df_out
|
|
||||||
except Exception as e:
|
|
||||||
print(f"💥 process_result_csv: {e}")
|
|
||||||
return None
|
|
||||||
|
|
||||||
# ──────────────────────────────────────────────
|
|
||||||
# HAUPT-WORKER
|
|
||||||
# ──────────────────────────────────────────────
|
|
||||||
|
|
||||||
def process_file(filename, job_id, app):
|
|
||||||
print(f"🎯 {filename} Job#{job_id} START!")
|
|
||||||
|
|
||||||
with app.app_context():
|
|
||||||
job = Job.query.get(job_id)
|
|
||||||
if not job:
|
|
||||||
print("❌ Job missing")
|
|
||||||
return
|
|
||||||
|
|
||||||
try:
|
|
||||||
#Parse + ALLE Queries
|
|
||||||
job.status = "📊 parsing CSV"
|
|
||||||
db.session.commit()
|
|
||||||
|
|
||||||
filepath = os.path.join(UPLOAD_FOLDER, filename)
|
|
||||||
print(f"📁 {filepath} | {os.path.getsize(filepath)}b")
|
|
||||||
|
|
||||||
df_input = pd.read_csv(filepath, sep=';', encoding='ISO-8859-1')
|
|
||||||
print(f"📊 {df_input.shape}")
|
|
||||||
|
|
||||||
queries = []
|
|
||||||
for _, row in df_input.iterrows():
|
|
||||||
parts = [
|
|
||||||
str(row.get('PLZ', '')).strip(),
|
|
||||||
str(row.get('Stadt', '')).strip(),
|
|
||||||
str(row.get('Straße', '')).strip(),
|
|
||||||
str(row.get('Hausnummer', '')).strip(),
|
|
||||||
str(row.get('Zusatz', '')).strip(),
|
|
||||||
]
|
|
||||||
q = f"Firmen {' '.join(p for p in parts if p and p != 'nan')}".strip()
|
|
||||||
if len(q) > 10:
|
|
||||||
queries.append(q)
|
|
||||||
|
|
||||||
total_queries = len(queries)
|
|
||||||
print(f"🔍 {total_queries} Queries | Samples: {queries[:3]}")
|
|
||||||
if total_queries == 0:
|
|
||||||
raise ValueError("Keine gültigen Adressen")
|
|
||||||
|
|
||||||
#BATCHED Processing
|
|
||||||
BATCH_SIZE = 10 # Erhöht: 5 → 10 (paid proxy)
|
|
||||||
BATCH_DELAY_MIN, BATCH_DELAY_MAX = 10, 20 # Reduziert: 30-60s → 10-20s (paid proxy)
|
|
||||||
batches = (total_queries + BATCH_SIZE - 1) // BATCH_SIZE
|
|
||||||
print(f"📦 {batches} Batches à {BATCH_SIZE} | ETA: ~{batches*15//60:.0f}h")
|
|
||||||
|
|
||||||
all_results_filtered = []
|
|
||||||
all_results_raw = []
|
|
||||||
job.status = f"🔄 Batch 1/{batches}"
|
|
||||||
db.session.commit()
|
|
||||||
|
|
||||||
for batch_idx in range(batches):
|
|
||||||
batch_start = batch_idx * BATCH_SIZE
|
|
||||||
batch_end = min(batch_start + BATCH_SIZE, total_queries)
|
|
||||||
batch_queries = queries[batch_start:batch_end]
|
|
||||||
print(f"\n🔄 BATCH {batch_idx+1}/{batches} ({batch_start+1}-{batch_end}/{total_queries})")
|
|
||||||
|
|
||||||
#Random Delay
|
|
||||||
delay = random.uniform(BATCH_DELAY_MIN, BATCH_DELAY_MAX)
|
|
||||||
print(f"😴 Delay: {delay:.0f}s | Proxy: {PROXY_URL}")
|
|
||||||
time.sleep(delay)
|
|
||||||
|
|
||||||
#API Call
|
|
||||||
payload = {
|
|
||||||
"name": f"{filename.replace('.csv','')}-{job_id}-B{batch_idx+1:03d}",
|
|
||||||
"keywords": batch_queries,
|
|
||||||
"lang": "de",
|
|
||||||
"depth": 1,
|
|
||||||
"zoom": 17,
|
|
||||||
"radius": 50,
|
|
||||||
"max_time": 60, # Reduziert: 120 → 60 (paid proxy schneller)
|
|
||||||
"fast_mode": False,
|
|
||||||
"proxies": [PROXY_URL]
|
|
||||||
}
|
|
||||||
|
|
||||||
try:
|
|
||||||
resp = requests.post(
|
|
||||||
f"{SCRAPER_URL}/api/v1/jobs",
|
|
||||||
json=payload,
|
|
||||||
timeout=45
|
|
||||||
)
|
|
||||||
print(f"📤 {resp.status_code}")
|
|
||||||
if is_blocked(resp.text):
|
|
||||||
print("🚫 Batch übersprungen (blocked)")
|
|
||||||
continue
|
|
||||||
if resp.status_code != 201:
|
|
||||||
print(f"⚠️ Batch {batch_idx+1} fehlgeschlagen: {resp.text[:100]}")
|
|
||||||
continue
|
|
||||||
|
|
||||||
scraper_id = resp.json()['id']
|
|
||||||
print(f"✅ Scraper: {scraper_id}")
|
|
||||||
|
|
||||||
for poll_i in range(1, 61): # Reduziert: 121 → 61 (max_time 60s)
|
|
||||||
r = requests.get(
|
|
||||||
f"{SCRAPER_URL}/api/v1/jobs/{scraper_id}",
|
|
||||||
timeout=15
|
|
||||||
)
|
|
||||||
data = r.json()
|
|
||||||
status = data.get('Status', data.get('status', '?'))
|
|
||||||
|
|
||||||
if status in ('ok', 'completed', 'scraped'):
|
|
||||||
dl = requests.get(
|
|
||||||
f"{SCRAPER_URL}/api/v1/jobs/{scraper_id}/download",
|
|
||||||
timeout=90
|
|
||||||
)
|
|
||||||
if dl.status_code == 200:
|
|
||||||
df_filtered = process_result_csv(dl.content, df_input, True)
|
|
||||||
df_raw = process_result_csv(dl.content, df_input, False)
|
|
||||||
if df_filtered is not None:
|
|
||||||
all_results_filtered.append(df_filtered)
|
|
||||||
all_results_raw.append(df_raw)
|
|
||||||
print(f"📊 Batch {batch_idx+1}: {len(df_filtered)} filtered")
|
|
||||||
break
|
|
||||||
elif status in ('failed', 'error'):
|
|
||||||
print(f"💥 Batch {batch_idx+1}: {status}")
|
|
||||||
break
|
|
||||||
|
|
||||||
time.sleep(random.uniform(5, 10)) # Reduziert: 10-20s → 5-10s (paid proxy)
|
|
||||||
|
|
||||||
except Exception as e:
|
|
||||||
print(f"💥 Batch {batch_idx+1}: {e}")
|
|
||||||
|
|
||||||
job.status = f"🔄 Batch {batch_idx+2}/{batches}"
|
|
||||||
db.session.commit()
|
|
||||||
|
|
||||||
#MERGE & SAVE
|
|
||||||
job.status = "🔧 merging results"
|
|
||||||
db.session.commit()
|
|
||||||
|
|
||||||
base = filename.replace('.csv', '')
|
|
||||||
os.makedirs(RESULT_FOLDER, exist_ok=True)
|
|
||||||
|
|
||||||
if all_results_filtered:
|
|
||||||
df_final_filtered = pd.concat(all_results_filtered, ignore_index=True)
|
|
||||||
df_final_filtered = df_final_filtered.drop_duplicates(subset=['title', 'address'])
|
|
||||||
|
|
||||||
out_filtered = f"results_{base}_filtered.csv"
|
|
||||||
df_final_filtered.to_csv(
|
|
||||||
os.path.join(RESULT_FOLDER, out_filtered),
|
|
||||||
index=False, encoding='utf-8-sig', sep=';'
|
|
||||||
)
|
|
||||||
|
|
||||||
if all_results_raw:
|
|
||||||
df_final_raw = pd.concat(all_results_raw, ignore_index=True)
|
|
||||||
out_raw = f"results_{base}_all.csv"
|
|
||||||
df_final_raw.to_csv(
|
|
||||||
os.path.join(RESULT_FOLDER, out_raw),
|
|
||||||
index=False, encoding='utf-8-sig', sep=';'
|
|
||||||
)
|
|
||||||
|
|
||||||
job.result_filename = out_filtered
|
|
||||||
job.result_filename_raw = out_raw
|
|
||||||
job.status = f"✅ Fertig: {len(df_final_filtered)} Firmen"
|
|
||||||
else:
|
|
||||||
job.status = "❌ Keine Ergebnisse"
|
|
||||||
|
|
||||||
db.session.commit()
|
|
||||||
print(f"🎉 Job {job_id} komplett!")
|
|
||||||
|
|
||||||
except Exception as e:
|
|
||||||
job.status = f"Failed: {str(e)[:50]}"
|
|
||||||
print(f"💥 FATAL: {e}")
|
|
||||||
import traceback
|
|
||||||
traceback.print_exc()
|
|
||||||
db.session.commit()
|
|
||||||
|
|
||||||
print(f"✅ DONE! Status: {job.status}")
|
|
||||||
|
|
@ -1,429 +0,0 @@
|
||||||
import os
|
|
||||||
import re
|
|
||||||
import unicodedata
|
|
||||||
import json
|
|
||||||
import pandas as pd
|
|
||||||
import requests
|
|
||||||
import time
|
|
||||||
import random
|
|
||||||
from io import StringIO
|
|
||||||
from app.models import db, Job
|
|
||||||
|
|
||||||
print("🆕 MODERN webcrawler LOADED! – BATCHED + PROXY + RESUME + ETA + 2x SCRAPER")
|
|
||||||
|
|
||||||
UPLOAD_FOLDER = '/app/uploads'
|
|
||||||
RESULT_FOLDER = '/app/results'
|
|
||||||
|
|
||||||
# 2x Scraper – abwechselnd genutzt
|
|
||||||
SCRAPER_URLS = [
|
|
||||||
"http://gmaps-scraper-1:8080",
|
|
||||||
"http://gmaps-scraper-2:8080",
|
|
||||||
]
|
|
||||||
|
|
||||||
OUTPUT_COLS = ['title', 'category', 'address', 'open_hours', 'website', 'phone', 'link']
|
|
||||||
|
|
||||||
PROXY_URL = "http://bitlleuv-rotate:s5hzse6hz74b@p.webshare.io:80"
|
|
||||||
API_PROXIES = {"http": PROXY_URL, "https": PROXY_URL}
|
|
||||||
|
|
||||||
# ──────────────────────────────────────────────
|
|
||||||
# Tuning
|
|
||||||
# ──────────────────────────────────────────────
|
|
||||||
BATCH_SIZE = 30 # Keywords pro Scraper-Job
|
|
||||||
BATCH_DELAY_MIN = 3 # Sekunden Pause zwischen Batches (min)
|
|
||||||
BATCH_DELAY_MAX = 6 # Sekunden Pause zwischen Batches (max)
|
|
||||||
MAX_TIME = 60 # Sekunden die der Scraper pro Batch hat
|
|
||||||
POLL_MAX = 90 # Max. Poll-Versuche pro Batch
|
|
||||||
POLL_DELAY_MIN = 2 # Sekunden zwischen Polls (min)
|
|
||||||
POLL_DELAY_MAX = 5 # Sekunden zwischen Polls (max)
|
|
||||||
STUCK_THRESHOLD = 8 # Polls auf 'pending' bis Auto-Restart
|
|
||||||
MAX_RETRIES = 2 # Wiederholversuche pro Batch bei Fehler
|
|
||||||
|
|
||||||
# ──────────────────────────────────────────────
|
|
||||||
# Hilfsfunktionen
|
|
||||||
# ──────────────────────────────────────────────
|
|
||||||
|
|
||||||
def is_blocked(data):
|
|
||||||
text = str(data).lower()
|
|
||||||
blocked = any(kw in text for kw in ['captcha', 'blocked', 'rate limit', 'too many', '429'])
|
|
||||||
if blocked:
|
|
||||||
print(f"🚫 BLOCKED: {str(data)[:100]}")
|
|
||||||
return blocked
|
|
||||||
|
|
||||||
def fix_encoding(text):
|
|
||||||
if not isinstance(text, str):
|
|
||||||
return text
|
|
||||||
try:
|
|
||||||
return text.encode('latin-1').decode('utf-8')
|
|
||||||
except (UnicodeEncodeError, UnicodeDecodeError):
|
|
||||||
return text
|
|
||||||
|
|
||||||
# Fix 1: Sonderzeichen in Queries bereinigen
|
|
||||||
def clean_query(q):
|
|
||||||
"""Steuerzeichen + fehlerhafte Bytes entfernen für saubere Google Maps URLs"""
|
|
||||||
q = ''.join(c for c in q if unicodedata.category(c) != 'Cc')
|
|
||||||
q = ' '.join(q.split())
|
|
||||||
return q.strip()
|
|
||||||
|
|
||||||
def build_input_addresses(df):
|
|
||||||
addresses = set()
|
|
||||||
for _, row in df.iterrows():
|
|
||||||
plz = str(row.get('PLZ', '')).strip()
|
|
||||||
stadt = str(row.get('Stadt', '')).strip()
|
|
||||||
str_ = str(row.get('Straße', '')).strip()
|
|
||||||
nr = str(row.get('Hausnummer', '')).strip()
|
|
||||||
zusatz = str(row.get('Zusatz', '')).strip()
|
|
||||||
full = f"{str_} {nr} {zusatz} {plz} {stadt}".lower().strip()
|
|
||||||
full = ' '.join(full.split())
|
|
||||||
addresses.add(full)
|
|
||||||
return addresses
|
|
||||||
|
|
||||||
def normalize_address(addr):
|
|
||||||
if not isinstance(addr, str):
|
|
||||||
return ''
|
|
||||||
addr = fix_encoding(addr)
|
|
||||||
return ' '.join(addr.lower().strip().split())
|
|
||||||
|
|
||||||
def address_in_input(result_addr, input_addresses):
|
|
||||||
norm = normalize_address(result_addr)
|
|
||||||
for inp_addr in input_addresses:
|
|
||||||
plz_match = re.search(r'\b\d{5}\b', inp_addr)
|
|
||||||
if plz_match:
|
|
||||||
plz = plz_match.group()
|
|
||||||
if plz in norm:
|
|
||||||
street = inp_addr.split()[0] if inp_addr else ''
|
|
||||||
if len(street) > 3 and street[:4].lower() in norm:
|
|
||||||
return True
|
|
||||||
return False
|
|
||||||
|
|
||||||
def format_eta(seconds):
|
|
||||||
"""Sekunden → lesbares ETA-Format"""
|
|
||||||
if seconds < 60:
|
|
||||||
return f"{int(seconds)}s"
|
|
||||||
h, rem = divmod(int(seconds), 3600)
|
|
||||||
m = rem // 60
|
|
||||||
return f"{h}h {m:02d}min" if h > 0 else f"{m}min"
|
|
||||||
|
|
||||||
# ──────────────────────────────────────────────
|
|
||||||
# Fix 3: Scraper-Neustart bei Inactivity
|
|
||||||
# ──────────────────────────────────────────────
|
|
||||||
|
|
||||||
def restart_scraper(scraper_url):
|
|
||||||
"""Den betroffenen Scraper-Container neu starten"""
|
|
||||||
try:
|
|
||||||
import subprocess
|
|
||||||
# Container-Name aus URL ableiten: http://gmaps-scraper-1:8080 → gmaps-scraper-1
|
|
||||||
container = scraper_url.split("//")[1].split(":")[0]
|
|
||||||
print(f"🔄 Starte {container} neu...")
|
|
||||||
subprocess.run(["docker", "restart", container], timeout=30, capture_output=True)
|
|
||||||
print(f"✅ {container} neu gestartet – warte 15s...")
|
|
||||||
time.sleep(15)
|
|
||||||
return True
|
|
||||||
except Exception as e:
|
|
||||||
print(f"⚠️ Scraper-Neustart fehlgeschlagen: {e}")
|
|
||||||
return False
|
|
||||||
|
|
||||||
# ──────────────────────────────────────────────
|
|
||||||
# Resume: Progress-File Hilfsfunktionen
|
|
||||||
# ──────────────────────────────────────────────
|
|
||||||
|
|
||||||
def get_progress_path(job_id):
|
|
||||||
return os.path.join(RESULT_FOLDER, f"progress_{job_id}.json")
|
|
||||||
|
|
||||||
def get_partial_path(job_id, suffix):
|
|
||||||
return os.path.join(RESULT_FOLDER, f"partial_{job_id}_{suffix}.csv")
|
|
||||||
|
|
||||||
def load_progress(job_id):
|
|
||||||
"""Gespeicherten Fortschritt laden (falls vorhanden)"""
|
|
||||||
path = get_progress_path(job_id)
|
|
||||||
if os.path.exists(path):
|
|
||||||
with open(path, 'r') as f:
|
|
||||||
data = json.load(f)
|
|
||||||
print(f"🔁 RESUME: ab Batch {data['last_completed_batch'] + 1}/{data['total_batches']}")
|
|
||||||
return data
|
|
||||||
return None
|
|
||||||
|
|
||||||
def save_progress(job_id, last_completed_batch, total_batches):
|
|
||||||
"""Fortschritt nach jedem Batch speichern"""
|
|
||||||
path = get_progress_path(job_id)
|
|
||||||
with open(path, 'w') as f:
|
|
||||||
json.dump({"last_completed_batch": last_completed_batch, "total_batches": total_batches}, f)
|
|
||||||
|
|
||||||
def append_partial(job_id, df_filtered, df_raw):
|
|
||||||
"""Batch-Ergebnis an Partial-CSV anhängen"""
|
|
||||||
for suffix, df in [('filtered', df_filtered), ('raw', df_raw)]:
|
|
||||||
if df is None:
|
|
||||||
continue
|
|
||||||
path = get_partial_path(job_id, suffix)
|
|
||||||
header = not os.path.exists(path)
|
|
||||||
df.to_csv(path, mode='a', index=False, header=header, encoding='utf-8-sig', sep=';')
|
|
||||||
|
|
||||||
def load_partial(job_id):
|
|
||||||
"""Bestehende Partial-CSVs laden"""
|
|
||||||
results_filtered, results_raw = [], []
|
|
||||||
for suffix, lst in [('filtered', results_filtered), ('raw', results_raw)]:
|
|
||||||
path = get_partial_path(job_id, suffix)
|
|
||||||
if os.path.exists(path):
|
|
||||||
try:
|
|
||||||
df = pd.read_csv(path, sep=';', encoding='utf-8-sig')
|
|
||||||
lst.append(df)
|
|
||||||
print(f"📂 Partial {suffix}: {len(df)} Zeilen geladen")
|
|
||||||
except Exception as e:
|
|
||||||
print(f"⚠️ Partial {suffix} Ladefehler: {e}")
|
|
||||||
return results_filtered, results_raw
|
|
||||||
|
|
||||||
def cleanup_progress(job_id):
|
|
||||||
"""Progress + Partial-Files nach Abschluss löschen"""
|
|
||||||
for path in [
|
|
||||||
get_progress_path(job_id),
|
|
||||||
get_partial_path(job_id, 'filtered'),
|
|
||||||
get_partial_path(job_id, 'raw'),
|
|
||||||
]:
|
|
||||||
if os.path.exists(path):
|
|
||||||
os.remove(path)
|
|
||||||
|
|
||||||
# ──────────────────────────────────────────────
|
|
||||||
# CSV Nachbearbeitung
|
|
||||||
# ──────────────────────────────────────────────
|
|
||||||
|
|
||||||
def process_result_csv(raw_bytes, input_df, apply_filter=True):
|
|
||||||
try:
|
|
||||||
content = raw_bytes.decode('utf-8', errors='replace')
|
|
||||||
df_out = pd.read_csv(StringIO(content))
|
|
||||||
print(f"📄 Raw result: {df_out.shape}")
|
|
||||||
|
|
||||||
available = [c for c in OUTPUT_COLS if c in df_out.columns]
|
|
||||||
df_out = df_out[available]
|
|
||||||
|
|
||||||
for col in df_out.columns:
|
|
||||||
df_out[col] = df_out[col].apply(fix_encoding)
|
|
||||||
|
|
||||||
if apply_filter:
|
|
||||||
input_addresses = build_input_addresses(input_df)
|
|
||||||
before = len(df_out)
|
|
||||||
df_out = df_out[
|
|
||||||
df_out['address'].apply(lambda a: address_in_input(a, input_addresses))
|
|
||||||
]
|
|
||||||
print(f"📍 Filter: {before} → {len(df_out)}")
|
|
||||||
|
|
||||||
df_out = df_out.drop_duplicates(subset=['title', 'address'], keep='first')
|
|
||||||
df_out = df_out.dropna(subset=['title'], how='all')
|
|
||||||
df_out = df_out[df_out['title'].str.strip().astype(bool)]
|
|
||||||
|
|
||||||
print(f"✅ Final ({'gefiltert' if apply_filter else 'alle'}): {df_out.shape}")
|
|
||||||
return df_out
|
|
||||||
except Exception as e:
|
|
||||||
print(f"💥 process_result_csv: {e}")
|
|
||||||
return None
|
|
||||||
|
|
||||||
# ──────────────────────────────────────────────
|
|
||||||
# HAUPT-WORKER
|
|
||||||
# ──────────────────────────────────────────────
|
|
||||||
|
|
||||||
def process_file(filename, job_id, app):
|
|
||||||
print(f"🎯 {filename} Job#{job_id} START!")
|
|
||||||
|
|
||||||
with app.app_context():
|
|
||||||
job = Job.query.get(job_id)
|
|
||||||
if not job:
|
|
||||||
print("❌ Job missing")
|
|
||||||
return
|
|
||||||
|
|
||||||
try:
|
|
||||||
#Parse + ALLE Queries
|
|
||||||
job.status = "📊 parsing CSV"
|
|
||||||
db.session.commit()
|
|
||||||
|
|
||||||
filepath = os.path.join(UPLOAD_FOLDER, filename)
|
|
||||||
print(f"📁 {filepath} | {os.path.getsize(filepath)}b")
|
|
||||||
|
|
||||||
df_input = pd.read_csv(filepath, sep=';', encoding='ISO-8859-1')
|
|
||||||
print(f"📊 {df_input.shape}")
|
|
||||||
|
|
||||||
queries = []
|
|
||||||
for _, row in df_input.iterrows():
|
|
||||||
parts = [
|
|
||||||
str(row.get('PLZ', '')).strip(),
|
|
||||||
str(row.get('Stadt', '')).strip(),
|
|
||||||
str(row.get('Straße', '')).strip(),
|
|
||||||
str(row.get('Hausnummer', '')).strip(),
|
|
||||||
str(row.get('Zusatz', '')).strip(),
|
|
||||||
]
|
|
||||||
q = f"Firmen {' '.join(p for p in parts if p and p != 'nan')}".strip()
|
|
||||||
q = clean_query(q) # Fix 1: Sonderzeichen bereinigen
|
|
||||||
if len(q) > 10:
|
|
||||||
queries.append(q)
|
|
||||||
|
|
||||||
total_queries = len(queries)
|
|
||||||
print(f"🔍 {total_queries} Queries | Samples: {queries[:3]}")
|
|
||||||
if total_queries == 0:
|
|
||||||
raise ValueError("Keine gültigen Adressen")
|
|
||||||
|
|
||||||
#BATCHED Processing
|
|
||||||
batches = (total_queries + BATCH_SIZE - 1) // BATCH_SIZE
|
|
||||||
|
|
||||||
# Resume: Fortschritt laden falls vorhanden
|
|
||||||
os.makedirs(RESULT_FOLDER, exist_ok=True)
|
|
||||||
progress = load_progress(job_id)
|
|
||||||
start_batch = progress['last_completed_batch'] + 1 if progress else 0
|
|
||||||
all_results_filtered, all_results_raw = load_partial(job_id) if progress else ([], [])
|
|
||||||
|
|
||||||
eta_initial = format_eta((batches - start_batch) * ((BATCH_DELAY_MAX + MAX_TIME) / 2))
|
|
||||||
print(f"📦 {batches} Batches à {BATCH_SIZE} | 2x Scraper | Start: {start_batch} | ETA: ~{eta_initial}")
|
|
||||||
job_start_time = time.time()
|
|
||||||
job.status = f"🔄 Batch {start_batch+1}/{batches} | ⏱️ ~{eta_initial}"
|
|
||||||
db.session.commit()
|
|
||||||
|
|
||||||
for batch_idx in range(start_batch, batches):
|
|
||||||
batch_start = batch_idx * BATCH_SIZE
|
|
||||||
batch_end = min(batch_start + BATCH_SIZE, total_queries)
|
|
||||||
batch_queries = queries[batch_start:batch_end]
|
|
||||||
|
|
||||||
# 2x Scraper: abwechselnd nutzen
|
|
||||||
scraper_url = SCRAPER_URLS[batch_idx % len(SCRAPER_URLS)]
|
|
||||||
print(f"\n🔄 BATCH {batch_idx+1}/{batches} ({batch_start+1}-{batch_end}/{total_queries}) → {scraper_url}")
|
|
||||||
|
|
||||||
#Random Delay
|
|
||||||
delay = random.uniform(BATCH_DELAY_MIN, BATCH_DELAY_MAX)
|
|
||||||
print(f"😴 Delay: {delay:.0f}s")
|
|
||||||
time.sleep(delay)
|
|
||||||
|
|
||||||
#API Call
|
|
||||||
payload = {
|
|
||||||
"name": f"{filename.replace('.csv','')}-{job_id}-B{batch_idx+1:03d}",
|
|
||||||
"keywords": batch_queries,
|
|
||||||
"lang": "de",
|
|
||||||
"depth": 1,
|
|
||||||
"zoom": 15,
|
|
||||||
"radius": 50,
|
|
||||||
"max_time": MAX_TIME,
|
|
||||||
"fast_mode": False,
|
|
||||||
"proxies": [PROXY_URL]
|
|
||||||
}
|
|
||||||
|
|
||||||
batch_success = False
|
|
||||||
# Fix 2: Retry-Logik bei Scraper-Fehler
|
|
||||||
for attempt in range(1, MAX_RETRIES + 1):
|
|
||||||
try:
|
|
||||||
resp = requests.post(f"{scraper_url}/api/v1/jobs", json=payload, timeout=45)
|
|
||||||
print(f"📤 {resp.status_code} (Versuch {attempt} | {scraper_url})")
|
|
||||||
|
|
||||||
if is_blocked(resp.text):
|
|
||||||
print("🚫 Batch übersprungen (blocked)")
|
|
||||||
break
|
|
||||||
if resp.status_code != 201:
|
|
||||||
print(f"⚠️ Batch {batch_idx+1} fehlgeschlagen: {resp.text[:100]}")
|
|
||||||
if attempt < MAX_RETRIES:
|
|
||||||
time.sleep(10)
|
|
||||||
continue
|
|
||||||
|
|
||||||
scraper_id = resp.json()['id']
|
|
||||||
print(f"✅ Scraper: {scraper_id}")
|
|
||||||
|
|
||||||
stuck_counter = 0
|
|
||||||
for poll_i in range(1, POLL_MAX + 1):
|
|
||||||
r = requests.get(f"{scraper_url}/api/v1/jobs/{scraper_id}", timeout=15)
|
|
||||||
data = r.json()
|
|
||||||
status = data.get('Status', data.get('status', '?'))
|
|
||||||
print(f"⏳ Poll {poll_i}: {status}")
|
|
||||||
|
|
||||||
# Fix 4: Auto-Recovery bei Pending-Stuck
|
|
||||||
if status == 'pending':
|
|
||||||
stuck_counter += 1
|
|
||||||
if stuck_counter >= STUCK_THRESHOLD:
|
|
||||||
print(f"⚠️ Job {scraper_id} hängt – abbrechen + Neustart")
|
|
||||||
requests.delete(f"{scraper_url}/api/v1/jobs/{scraper_id}", timeout=10)
|
|
||||||
restart_scraper(scraper_url) # Fix 3: Nur betroffenen Scraper neu starten
|
|
||||||
break
|
|
||||||
else:
|
|
||||||
stuck_counter = 0
|
|
||||||
|
|
||||||
if status in ('ok', 'completed', 'scraped'):
|
|
||||||
dl = requests.get(f"{scraper_url}/api/v1/jobs/{scraper_id}/download", timeout=90)
|
|
||||||
if dl.status_code == 200:
|
|
||||||
df_filtered = process_result_csv(dl.content, df_input, True)
|
|
||||||
df_raw = process_result_csv(dl.content, df_input, False)
|
|
||||||
if df_filtered is not None:
|
|
||||||
all_results_filtered.append(df_filtered)
|
|
||||||
all_results_raw.append(df_raw)
|
|
||||||
append_partial(job_id, df_filtered, df_raw) # Resume: sofort speichern
|
|
||||||
print(f"📊 Batch {batch_idx+1}: {len(df_filtered)} filtered")
|
|
||||||
batch_success = True
|
|
||||||
break
|
|
||||||
|
|
||||||
# Fix 2: Scraper-Fehler → Retry
|
|
||||||
elif status in ('failed', 'error'):
|
|
||||||
print(f"💥 Batch {batch_idx+1}: {status} (Versuch {attempt})")
|
|
||||||
if attempt < MAX_RETRIES:
|
|
||||||
time.sleep(10)
|
|
||||||
break
|
|
||||||
|
|
||||||
time.sleep(random.uniform(POLL_DELAY_MIN, POLL_DELAY_MAX))
|
|
||||||
|
|
||||||
if batch_success:
|
|
||||||
break
|
|
||||||
|
|
||||||
except Exception as e:
|
|
||||||
print(f"💥 Batch {batch_idx+1} Versuch {attempt}: {e}")
|
|
||||||
if attempt < MAX_RETRIES:
|
|
||||||
time.sleep(10)
|
|
||||||
|
|
||||||
# Resume: Fortschritt nach jedem Batch speichern
|
|
||||||
save_progress(job_id, batch_idx, batches)
|
|
||||||
|
|
||||||
# ETA berechnen
|
|
||||||
elapsed = time.time() - job_start_time
|
|
||||||
done_so_far = batch_idx - start_batch + 1
|
|
||||||
if done_so_far > 0:
|
|
||||||
avg_per_batch = elapsed / done_so_far
|
|
||||||
remaining = (batches - batch_idx - 1) * avg_per_batch
|
|
||||||
eta_str = format_eta(remaining)
|
|
||||||
else:
|
|
||||||
eta_str = "?"
|
|
||||||
|
|
||||||
job.status = f"🔄 Batch {batch_idx+2}/{batches} | ⏱️ ~{eta_str}"
|
|
||||||
db.session.commit()
|
|
||||||
|
|
||||||
#MERGE & SAVE
|
|
||||||
job.status = "🔧 merging results"
|
|
||||||
db.session.commit()
|
|
||||||
|
|
||||||
base = filename.replace('.csv', '')
|
|
||||||
|
|
||||||
if all_results_filtered:
|
|
||||||
df_final_filtered = pd.concat(all_results_filtered, ignore_index=True)
|
|
||||||
df_final_filtered = df_final_filtered.drop_duplicates(subset=['title', 'address'])
|
|
||||||
|
|
||||||
out_filtered = f"results_{base}_filtered.csv"
|
|
||||||
df_final_filtered.to_csv(
|
|
||||||
os.path.join(RESULT_FOLDER, out_filtered),
|
|
||||||
index=False, encoding='utf-8-sig', sep=';'
|
|
||||||
)
|
|
||||||
|
|
||||||
if all_results_raw:
|
|
||||||
df_final_raw = pd.concat(all_results_raw, ignore_index=True)
|
|
||||||
out_raw = f"results_{base}_all.csv"
|
|
||||||
df_final_raw.to_csv(
|
|
||||||
os.path.join(RESULT_FOLDER, out_raw),
|
|
||||||
index=False, encoding='utf-8-sig', sep=';'
|
|
||||||
)
|
|
||||||
|
|
||||||
job.result_filename = out_filtered
|
|
||||||
job.result_filename_raw = out_raw
|
|
||||||
job.status = f"✅ Fertig: {len(df_final_filtered)} Firmen"
|
|
||||||
|
|
||||||
# Resume: Cleanup nach Abschluss
|
|
||||||
cleanup_progress(job_id)
|
|
||||||
else:
|
|
||||||
job.status = "❌ Keine Ergebnisse"
|
|
||||||
|
|
||||||
db.session.commit()
|
|
||||||
print(f"🎉 Job {job_id} komplett!")
|
|
||||||
|
|
||||||
except Exception as e:
|
|
||||||
job.status = f"Failed: {str(e)[:50]}"
|
|
||||||
print(f"💥 FATAL: {e}")
|
|
||||||
import traceback
|
|
||||||
traceback.print_exc()
|
|
||||||
db.session.commit()
|
|
||||||
|
|
||||||
print(f"✅ DONE! Status: {job.status}")
|
|
||||||
|
|
@ -1,138 +0,0 @@
|
||||||
import csv
|
|
||||||
import os
|
|
||||||
import requests
|
|
||||||
from .models import db, Job
|
|
||||||
from flask import current_app
|
|
||||||
|
|
||||||
UPLOAD_FOLDER = 'uploads'
|
|
||||||
RESULT_FOLDER = 'results'
|
|
||||||
|
|
||||||
API_KEY = 'AIzaSyAIf0yXJTwo87VMWLBtq2m2LqE-OaPGbzw'
|
|
||||||
|
|
||||||
processed_companies = set()
|
|
||||||
|
|
||||||
def get_geocode(address):
|
|
||||||
url = f"https://maps.googleapis.com/maps/api/geocode/json"
|
|
||||||
params = {'address': address, 'key': API_KEY}
|
|
||||||
|
|
||||||
try:
|
|
||||||
response = requests.get(url, params=params, timeout=5)
|
|
||||||
if response.status_code == 200:
|
|
||||||
data = response.json()
|
|
||||||
if data['status'] == 'OK':
|
|
||||||
location = data['results'][0]['geometry']['location']
|
|
||||||
return location['lat'], location['lng']
|
|
||||||
except requests.RequestException as e:
|
|
||||||
print(f"Geocode API Fehler für {address}: {e}")
|
|
||||||
return None, None
|
|
||||||
|
|
||||||
def get_nearby_places(lat, lng):
|
|
||||||
places_url = f"https://maps.googleapis.com/maps/api/place/nearbysearch/json"
|
|
||||||
params = {
|
|
||||||
'location': f"{lat},{lng}",
|
|
||||||
'radius': 10,
|
|
||||||
'type': 'point_of_interest',
|
|
||||||
'key': API_KEY
|
|
||||||
}
|
|
||||||
|
|
||||||
try:
|
|
||||||
response = requests.get(places_url, params=params, timeout=5)
|
|
||||||
if response.status_code == 200:
|
|
||||||
return response.json().get('results', [])
|
|
||||||
except requests.RequestException as e:
|
|
||||||
print(f"Nearby Places API Fehler für Standort {lat},{lng}: {e}")
|
|
||||||
return []
|
|
||||||
|
|
||||||
def get_place_details(place_id):
|
|
||||||
details_url = f"https://maps.googleapis.com/maps/api/place/details/json"
|
|
||||||
params = {
|
|
||||||
'place_id': place_id,
|
|
||||||
'fields': 'formatted_phone_number,website',
|
|
||||||
'key': API_KEY
|
|
||||||
}
|
|
||||||
|
|
||||||
try:
|
|
||||||
response = requests.get(details_url, params=params, timeout=5)
|
|
||||||
if response.status_code == 200:
|
|
||||||
result = response.json().get('result', {})
|
|
||||||
return result.get('formatted_phone_number', 'N/A'), result.get('website', 'N/A')
|
|
||||||
except requests.RequestException as e:
|
|
||||||
print(f"Place Details API Fehler für Place ID {place_id}: {e}")
|
|
||||||
return 'N/A', 'N/A'
|
|
||||||
|
|
||||||
def process_file(filename, job_id, app):
|
|
||||||
with app.app_context():
|
|
||||||
filepath = os.path.join(UPLOAD_FOLDER, filename)
|
|
||||||
results = []
|
|
||||||
|
|
||||||
job = Job.query.get(job_id)
|
|
||||||
if not job:
|
|
||||||
print("Job wurde abgebrochen.")
|
|
||||||
return
|
|
||||||
job.status = "In Progress"
|
|
||||||
db.session.commit()
|
|
||||||
|
|
||||||
with open(filepath, newline='', encoding='ISO-8859-1') as csvfile:
|
|
||||||
reader = csv.DictReader(csvfile, delimiter=';')
|
|
||||||
headers = reader.fieldnames
|
|
||||||
|
|
||||||
if not all(field in headers for field in ['PLZ', 'Straße', 'Hausnummer']):
|
|
||||||
print("CSV-Datei enthält nicht alle notwendigen Spalten.")
|
|
||||||
job.status = "Failed"
|
|
||||||
db.session.commit()
|
|
||||||
return
|
|
||||||
|
|
||||||
for row in reader:
|
|
||||||
plz = row.get('PLZ', '').strip()
|
|
||||||
city = row.get('Stadt', row.get('Bezirk', '')).strip()
|
|
||||||
street = row.get('Straße', '').strip()
|
|
||||||
house_number = row.get('Hausnummer', '').strip()
|
|
||||||
additional = row.get('Zusatz', '').strip()
|
|
||||||
|
|
||||||
if not all([plz, city, street, house_number]):
|
|
||||||
continue
|
|
||||||
|
|
||||||
full_address = f"{street} {house_number} {additional}, {plz} {city}"
|
|
||||||
lat, lng = get_geocode(full_address)
|
|
||||||
if lat is None or lng is None:
|
|
||||||
continue
|
|
||||||
|
|
||||||
nearby_places = get_nearby_places(lat, lng)
|
|
||||||
for place in nearby_places:
|
|
||||||
company_name = place['name']
|
|
||||||
if company_name in processed_companies:
|
|
||||||
continue
|
|
||||||
|
|
||||||
processed_companies.add(company_name)
|
|
||||||
company_address = place.get('vicinity', 'N/A').split(',')[0]
|
|
||||||
place_id = place.get('place_id')
|
|
||||||
company_phone, company_website = get_place_details(place_id) if place_id else ('N/A', 'N/A')
|
|
||||||
|
|
||||||
results.append({
|
|
||||||
'PLZ': plz,
|
|
||||||
'Stadt': city,
|
|
||||||
'Straße': street,
|
|
||||||
'Hausnummer': house_number,
|
|
||||||
'Zusatz': additional,
|
|
||||||
'Company Name': company_name,
|
|
||||||
'Company Address': company_address,
|
|
||||||
'Company Phone': company_phone,
|
|
||||||
'Company Website': company_website
|
|
||||||
})
|
|
||||||
|
|
||||||
if results:
|
|
||||||
result_file = f"results_{os.path.splitext(filename)[0]}.csv"
|
|
||||||
result_path = os.path.join(RESULT_FOLDER, result_file)
|
|
||||||
with open(result_path, 'w', newline='', encoding='utf-8-sig') as csvfile:
|
|
||||||
writer = csv.DictWriter(csvfile, fieldnames=[
|
|
||||||
'PLZ', 'Stadt', 'Straße', 'Hausnummer', 'Zusatz',
|
|
||||||
'Company Name', 'Company Address', 'Company Phone', 'Company Website'
|
|
||||||
])
|
|
||||||
writer.writeheader()
|
|
||||||
writer.writerows(results)
|
|
||||||
job.status = "Completed"
|
|
||||||
job.result_filename = result_file
|
|
||||||
db.session.commit()
|
|
||||||
else:
|
|
||||||
job.status = "Failed"
|
|
||||||
db.session.commit()
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue