From 355130a2d993ce7f13210f66389c47dafd27689e Mon Sep 17 00:00:00 2001 From: mkrieger Date: Tue, 10 Mar 2026 11:46:23 +0100 Subject: [PATCH] cleanup --- app/init.py.bak | 68 ------ app/routes.orig | 223 ------------------ app/webcrawler.bck02032026 | 316 -------------------------- app/webcrawler.bck04032026 | 275 ---------------------- app/webcrawler.bck04032026_2 | 429 ----------------------------------- app/webcrawler.orig | 138 ----------- 6 files changed, 1449 deletions(-) delete mode 100644 app/init.py.bak delete mode 100644 app/routes.orig delete mode 100644 app/webcrawler.bck02032026 delete mode 100644 app/webcrawler.bck04032026 delete mode 100644 app/webcrawler.bck04032026_2 delete mode 100644 app/webcrawler.orig diff --git a/app/init.py.bak b/app/init.py.bak deleted file mode 100644 index 7e77ff6..0000000 --- a/app/init.py.bak +++ /dev/null @@ -1,68 +0,0 @@ -import os -from flask import Flask, redirect, url_for, request, current_app -from flask_sqlalchemy import SQLAlchemy -from flask_login import LoginManager, current_user -from flask_migrate import Migrate - -# βœ… Docker-Pfade -UPLOAD_FOLDER = '/app/uploads' -RESULT_FOLDER = '/app/results' - -db = SQLAlchemy() -login_manager = LoginManager() -migrate = Migrate() - -def create_app(): - app = Flask(__name__) - - # πŸ”‘ Configs - app.config['SECRET_KEY'] = '008e7369b075886d5f494c8813efdfb17155da6af12b3fe8ee' - app.config['SQLALCHEMY_DATABASE_URI'] = 'sqlite:///users.db' - app.config['SQLALCHEMY_TRACK_MODIFICATIONS'] = False - app.config['UPLOAD_FOLDER'] = UPLOAD_FOLDER - app.config['RESULT_FOLDER'] = RESULT_FOLDER - app.config['ALLOW_USER_SIGNUP'] = True # βœ… Aktiviert! - - # DB + Tools - db.init_app(app) - migrate.init_app(app, db) - login_manager.init_app(app) - login_manager.login_view = 'auth.login' - - # User Loader - @login_manager.user_loader - def load_user(user_id): - from .models import User - return User.query.get(int(user_id)) - - # Protected Routes - @app.before_request - def require_login(): - allowed = ['auth.login', 'auth.signup', 'static'] - if (not current_user.is_authenticated and - request.endpoint not in allowed and - not request.path.startswith('/static')): - return redirect(url_for('auth.login')) - - # Ordner - os.makedirs(app.config['UPLOAD_FOLDER'], exist_ok=True) - os.makedirs(app.config['RESULT_FOLDER'], exist_ok=True) - - # Routes - from . import routes - app.register_blueprint(routes.bp) - - # Index Redirect - @app.route('/') - def index(): - return redirect(url_for('auth.job_status')) - - # DB Tables - with app.app_context(): - db.create_all() - - return app - -if __name__ == '__main__': - app = create_app() - app.run(host='0.0.0.0', port=5000, debug=False) diff --git a/app/routes.orig b/app/routes.orig deleted file mode 100644 index 378517f..0000000 --- a/app/routes.orig +++ /dev/null @@ -1,223 +0,0 @@ -import time -import csv -import os -import threading -from flask import Blueprint, request, redirect, url_for, flash, render_template, send_file, current_app -from flask_login import login_user, logout_user, login_required, current_user -from werkzeug.utils import secure_filename -from werkzeug.security import generate_password_hash, check_password_hash -from .models import db, User, Job -from .webcrawler import process_file # Importiere die Funktion fΓΌr das Webscraping - -UPLOAD_FOLDER = 'uploads' -RESULT_FOLDER = 'results' - -# Blueprint fΓΌr auth erstellen -bp = Blueprint('auth', __name__) - -@bp.route('/login', methods=['GET', 'POST']) -def login(): - if request.method == 'POST': - username = request.form['username'] - password = request.form['password'] - user = User.query.filter_by(username=username).first() - if user and check_password_hash(user.password, password): - login_user(user) - return redirect(url_for('auth.job_status')) - flash('Login fehlgeschlagen. ÜberprΓΌfen Sie Benutzername und Passwort.') - return render_template('login.html') - -@bp.route('/signup', methods=['GET', 'POST']) -def signup(): - if not current_app.config['ALLOW_USER_SIGNUP']: - flash("Registrierung ist derzeit deaktiviert.") - return redirect(url_for('auth.login')) - - if request.method == 'POST': - username = request.form['username'] - password = generate_password_hash(request.form['password'], method='sha256') - new_user = User(username=username, password=password) - db.session.add(new_user) - db.session.commit() - flash('Benutzer erfolgreich erstellt! Sie kΓΆnnen sich jetzt einloggen.') - return redirect(url_for('auth.login')) - - return render_template('signup.html') - -@bp.route('/logout') -@login_required -def logout(): - logout_user() - return redirect(url_for('auth.login')) - -@bp.route('/jobs') -@login_required -def job_status(): - jobs = Job.query.filter_by(user_id=current_user.id).all() - return render_template('jobs.html', jobs=jobs) - -@bp.route('/upload', methods=['GET', 'POST']) -@login_required -def upload(): - if request.method == 'POST': - file = request.files['file'] - filename = secure_filename(file.filename) - - # ÜberprΓΌfen, ob eine Datei mit dem gleichen Namen bereits existiert - file_path = os.path.join(current_app.config['UPLOAD_FOLDER'], filename) - if os.path.exists(file_path): - # Wenn eine Datei mit dem gleichen Namen existiert, einen Zeitstempel hinzufΓΌgen - name, ext = os.path.splitext(filename) - timestamp = time.strftime("%Y%m%d-%H%M%S") # Zeitstempel im Format JahrMonatTag-StundenMinutenSekunden - filename = f"{name}_{timestamp}{ext}" - file_path = os.path.join(current_app.config['UPLOAD_FOLDER'], filename) - flash(f"Eine Datei mit gleichem Namen existierte bereits. Die Datei wurde als '{filename}' gespeichert.") - - # Speichern der Datei - file.save(file_path) - flash('Datei erfolgreich hochgeladen und Job gestartet') - - # Neuen Job erstellen - new_job = Job(user_id=current_user.id, filename=filename, status="Pending") - db.session.add(new_job) - db.session.commit() - - # Debugging-Ausgabe zur ÜberprΓΌfung der Thread-Erstellung - print(f"Starte Scraping-Thread fΓΌr Job-ID: {new_job.id}") - - # Starten des Scraping im Hintergrund-Thread und Übergeben des aktuellen Anwendungskontexts - thread = threading.Thread(target=process_file, args=(filename, new_job.id, current_app._get_current_object())) - thread.start() - - # Debugging-Ausgabe, nachdem der Thread gestartet wurde - print(f"Thread fΓΌr Job {new_job.id} erfolgreich gestartet.") - - return redirect(url_for('auth.job_status')) - - return render_template('upload.html') - -@bp.route('/download/', methods=['GET']) -@login_required -def download_result(job_id): - job = Job.query.get_or_404(job_id) - print(f"Job ID: {job.id} - User ID: {job.user_id} - Current User ID: {current_user.id}") - - # ÜberprΓΌfen, ob der Job dem aktuellen Benutzer gehΓΆrt - if job.user_id != current_user.id: - flash("Sie haben keine Berechtigung, dieses Ergebnis herunterzuladen.") - return redirect(url_for('auth.job_status')) - - # ÜberprΓΌfen, ob das Ergebnis vorhanden ist - if not job.result_filename: - flash("Das Ergebnis ist noch nicht verfΓΌgbar.") - return redirect(url_for('auth.job_status')) - - # ÜberprΓΌfen, ob die Datei im angegebenen Pfad existiert - result_path = os.path.join(current_app.config['RESULT_FOLDER'], job.result_filename) - print(f"Versuche, Datei herunterzuladen von: {result_path}") - - if os.path.exists(result_path): - print("Datei existiert und wird zum Download bereitgestellt.") - return send_file(result_path, as_attachment=True) - else: - print("Datei nicht gefunden. Ergebnisverzeichnis oder Pfad prΓΌfen.") - flash("Ergebnisdatei nicht gefunden.") - return redirect(url_for('auth.job_status')) - - -@bp.route('/delete_job/', methods=['POST']) -@login_required -def delete_job(job_id): - job = Job.query.get_or_404(job_id) - if job.user_id != current_user.id: - flash("Sie haben keine Berechtigung, diesen Job zu lΓΆschen.") - return redirect(url_for('auth.job_status')) - - # LΓΆschen der Upload-Datei - upload_path = os.path.join(current_app.config['UPLOAD_FOLDER'], job.filename) - if os.path.exists(upload_path): - os.remove(upload_path) - print(f"Upload-Datei gelΓΆscht: {upload_path}") - else: - print(f"Upload-Datei nicht gefunden: {upload_path}") - - # LΓΆschen der Results-Datei, falls vorhanden - if job.result_filename: - result_path = os.path.join(current_app.config['RESULT_FOLDER'], job.result_filename) - print(f"Versuche Ergebnisdatei zu lΓΆschen: {result_path}") - - if os.path.exists(result_path): - try: - os.remove(result_path) - print(f"Ergebnisdatei gelΓΆscht: {result_path}") - except Exception as e: - print(f"Fehler beim LΓΆschen der Ergebnisdatei: {e}") - else: - print(f"Ergebnisdatei nicht gefunden im Pfad: {result_path}") - - # Job aus der Datenbank lΓΆschen - db.session.delete(job) - db.session.commit() - flash("Job erfolgreich gelΓΆscht.") - return redirect(url_for('auth.job_status')) - -@bp.route('/admin', methods=['GET']) -@login_required -def admin_panel(): - if not current_user.is_admin: - flash("Keine Berechtigung.") - return redirect(url_for('auth.job_status')) - - users = User.query.all() - return render_template('admin_panel.html', users=users) - -@bp.route('/admin/create_user', methods=['POST']) -@login_required -def create_user(): - if not current_user.is_admin: - flash("Keine Berechtigung.") - return redirect(url_for('auth.admin_panel')) - - username = request.form['username'] - password = request.form['password'] - is_admin = 'is_admin' in request.form # Checkbox fΓΌr Adminrechte - - hashed_password = generate_password_hash(password, method='sha256') - new_user = User(username=username, password=hashed_password, is_admin=is_admin) - db.session.add(new_user) - db.session.commit() - - flash(f"Benutzer {username} wurde erstellt.") - return redirect(url_for('auth.admin_panel')) - -@bp.route('/admin/reset_password/', methods=['POST']) -@login_required -def reset_password(user_id): - if not current_user.is_admin: - flash("Keine Berechtigung.") - return redirect(url_for('auth.admin_panel')) - - user = User.query.get_or_404(user_id) - new_password = request.form['new_password'] - user.password = generate_password_hash(new_password, method='sha256') - db.session.commit() - - flash(f"Passwort fΓΌr Benutzer {user.username} wurde zurΓΌckgesetzt.") - return redirect(url_for('auth.admin_panel')) - -@bp.route('/admin/delete_user/', methods=['POST']) -@login_required -def delete_user(user_id): - if not current_user.is_admin: - flash("Keine Berechtigung.") - return redirect(url_for('auth.admin_panel')) - - user = User.query.get_or_404(user_id) - if user.is_admin: - flash("Administratoren kΓΆnnen nicht gelΓΆscht werden.") - return redirect(url_for('auth.admin_panel')) - - db.session.delete(user) - db.session.commit() - flash(f"Benutzer {user.username} wurde gelΓΆscht.") - return redirect(url_for('auth.admin_panel')) diff --git a/app/webcrawler.bck02032026 b/app/webcrawler.bck02032026 deleted file mode 100644 index 32be52b..0000000 --- a/app/webcrawler.bck02032026 +++ /dev/null @@ -1,316 +0,0 @@ -import os -import re -import pandas as pd -import requests -import time -import random -from io import StringIO -from app.models import db, Job - -print("πŸ†• MODERN webcrawler LOADED!") - -UPLOAD_FOLDER = '/app/uploads' -RESULT_FOLDER = '/app/results' -SCRAPER_URL = "http://gmaps-scraper:8080" - -OUTPUT_COLS = ['title', 'category', 'address', 'open_hours', 'website', 'phone', 'link'] - - -# ────────────────────────────────────────────── -# Hilfsfunktionen -# ────────────────────────────────────────────── - -def get_batch_size(total_rows): - if total_rows < 50: return 10 - elif total_rows < 200: return 10 - elif total_rows < 500: return 5 - else: return 5 - -def get_delay(total_rows): - if total_rows < 50: return (5, 10) - elif total_rows < 200: return (10, 20) - else: return (20, 40) - -def is_blocked(data): - text = str(data).lower() - blocked = any(kw in text for kw in ['captcha', 'blocked', 'rate limit', 'too many', '429']) - if blocked: - print(f"🚫 BLOCKED: {str(data)[:100]}") - return blocked - -def fix_encoding(text): - """Kaputte ISOβ†’UTF8 Zeichen reparieren (z.B. IndustriestraΓƒΕΈe β†’ Industriestraße)""" - if not isinstance(text, str): - return text - try: - return text.encode('latin-1').decode('utf-8') - except (UnicodeEncodeError, UnicodeDecodeError): - return text - -def build_input_addresses(df): - """Normalisierte Adressen aus Input-CSV fΓΌr Abgleich""" - addresses = set() - for _, row in df.iterrows(): - plz = str(row.get('PLZ', '')).strip() - stadt = str(row.get('Stadt', '')).strip() - str_ = str(row.get('Straße', '')).strip() - nr = str(row.get('Hausnummer', '')).strip() - zusatz = str(row.get('Zusatz', '')).strip() - - full = f"{str_} {nr} {zusatz} {plz} {stadt}".lower().strip() - full = ' '.join(full.split()) - addresses.add(full) - return addresses - -def normalize_address(addr): - """Output-Adresse normalisieren fΓΌr Abgleich""" - if not isinstance(addr, str): - return '' - addr = fix_encoding(addr) - return ' '.join(addr.lower().strip().split()) - -def address_in_input(result_addr, input_addresses): - """PrΓΌft ob PLZ + Straßenname aus Result im Input vorkommen""" - norm = normalize_address(result_addr) - for inp_addr in input_addresses: - plz_match = re.search(r'\b\d{5}\b', inp_addr) - if plz_match: - plz = plz_match.group() - if plz in norm: - street = inp_addr.split()[0] if inp_addr else '' - if len(street) > 3 and street[:4].lower() in norm: - return True - return False - - -# ────────────────────────────────────────────── -# CSV Nachbearbeitung (apply_filter umschaltbar) -# ────────────────────────────────────────────── - -def process_result_csv(raw_bytes, input_df, apply_filter=True): - """ - Raw CSV β†’ bereinigt: - - Nur OUTPUT_COLS - - Encoding fix - - Optional: Input/Output Abgleich + Duplikate - """ - try: - content = raw_bytes.decode('utf-8', errors='replace') - df_out = pd.read_csv(StringIO(content)) - print(f"πŸ“„ Raw result: {df_out.shape} | Columns: {list(df_out.columns)[:8]}") - - # Spalten filtern - available = [c for c in OUTPUT_COLS if c in df_out.columns] - missing = [c for c in OUTPUT_COLS if c not in df_out.columns] - if missing: - print(f"⚠️ Fehlende Spalten: {missing}") - df_out = df_out[available] - - # πŸ”€ Encoding fix - for col in df_out.columns: - df_out[col] = df_out[col].apply(fix_encoding) - print(f"πŸ”€ Encoding fix: done") - - if apply_filter: - # πŸ“ Input/Output Abgleich - input_addresses = build_input_addresses(input_df) - before = len(df_out) - df_out = df_out[ - df_out['address'].apply( - lambda a: address_in_input(a, input_addresses) - ) - ] - print(f"πŸ“ Adress-Filter: {before} β†’ {len(df_out)} Zeilen") - - # πŸ” Duplikate entfernen (immer, auch bei Raw) - before_dedup = len(df_out) - df_out = df_out.drop_duplicates(subset=['title', 'address'], keep='first') - print(f"πŸ” Duplikate: {before_dedup} β†’ {len(df_out)} Zeilen") - - # Leere Titel entfernen - df_out = df_out.dropna(subset=['title'], how='all') - df_out = df_out[df_out['title'].str.strip().astype(bool)] - - print(f"βœ… Final ({'gefiltert' if apply_filter else 'alle'}): {df_out.shape}") - return df_out - - except Exception as e: - print(f"πŸ’₯ process_result_csv: {e}") - import traceback - traceback.print_exc() - return None - - -# ────────────────────────────────────────────── -# Haupt-Worker -# ────────────────────────────────────────────── - -def process_file(filename, job_id, app): - print(f"🎯 {filename} Job#{job_id} START!") - - with app.app_context(): - job = Job.query.get(job_id) - if not job: - print("❌ Job missing") - return - - try: - # 1️⃣ CSV Parse - job.status = "πŸ“Š parsing CSV" - db.session.commit() - - filepath = os.path.join(UPLOAD_FOLDER, filename) - print(f"πŸ“ {filepath} | {os.path.getsize(filepath)}b") - - df_input = pd.read_csv(filepath, sep=';', encoding='ISO-8859-1') - print(f"πŸ“Š {df_input.shape} | Columns: {list(df_input.columns)}") - - queries = [] - for _, row in df_input.iterrows(): - parts = [ - str(row.get('PLZ', '')).strip(), - str(row.get('Stadt', '')).strip(), - str(row.get('Straße', '')).strip(), - str(row.get('Hausnummer', '')).strip(), - str(row.get('Zusatz', '')).strip(), - ] - q = f"Firmen {' '.join(p for p in parts if p and p != 'nan')}".strip() - if len(q) > 10: - queries.append(q) - - total = len(queries) - print(f"πŸ” {total} Queries | Samples: {queries[:3]}") - if not queries: - raise ValueError("Keine gΓΌltigen Adressen in CSV") - - # 2️⃣ Batch + Delay - batch_size = get_batch_size(total) - delay_min, delay_max = get_delay(total) - batch = queries[:batch_size] - pre_delay = random.uniform(delay_min, delay_max) - print(f"πŸ“¦ Batch {len(batch)}/{total} | 😴 {pre_delay:.1f}s Delay") - time.sleep(pre_delay) - - # 3️⃣ API Call - job.status = "πŸ“€ sending to scraper" - db.session.commit() - - payload = { - "name": f"{filename.replace('.csv','')}-{job_id}", - "keywords": batch, - "lang": "de", - "depth": 1, - "zoom": 17, - "radius": 50, - "max_time": 60, - "fast_mode": False - } - - print(f"🌐 POST {SCRAPER_URL}/api/v1/jobs | {payload['name']}") - resp = requests.post(f"{SCRAPER_URL}/api/v1/jobs", json=payload, timeout=30) - print(f"πŸ“€ {resp.status_code}: {resp.text[:300]}") - - if is_blocked(resp.text): - raise ValueError("🚫 IP geblockt! Proxy konfigurieren.") - if resp.status_code != 201: - raise ValueError(f"API {resp.status_code}: {resp.text[:200]}") - - # 4️⃣ Polling - scraper_id = resp.json()['id'] - job.scraper_job_id = scraper_id - job.status = "⏳ scraping" - db.session.commit() - print(f"βœ… Scraper Job: {scraper_id}") - - for i in range(1, 61): # Max 10min - try: - r = requests.get( - f"{SCRAPER_URL}/api/v1/jobs/{scraper_id}", - timeout=10 - ) - data = r.json() - status = data.get('Status', data.get('status', '?')) - print(f"⏳ {i}/60: {status}") - - if is_blocked(data): - raise ValueError("🚫 IP geblockt wΓ€hrend scraping!") - - if status in ('ok', 'completed', 'scraped'): - dl = requests.get( - f"{SCRAPER_URL}/api/v1/jobs/{scraper_id}/download", - timeout=60 - ) - if dl.status_code != 200: - raise ValueError(f"Download {dl.status_code}") - if is_blocked(dl.text[:200]): - raise ValueError("🚫 IP geblockt beim Download!") - - # 5️⃣ Nachbearbeitung β†’ zwei Versionen - job.status = "πŸ”§ processing result" - db.session.commit() - - base = filename.replace('.csv', '') - os.makedirs(RESULT_FOLDER, exist_ok=True) - - # ── Version A: Gefiltert (Adressabgleich + Deduplizierung) ── - df_filtered = process_result_csv(dl.content, df_input, apply_filter=True) - outname_filtered = f"results_{base}_filtered.csv" - outpath_filtered = os.path.join(RESULT_FOLDER, outname_filtered) - - if df_filtered is not None and len(df_filtered) > 0: - df_filtered.to_csv( - outpath_filtered, index=False, - encoding='utf-8-sig', sep=';' - ) - print(f"🎯 Filtered: {outname_filtered} β†’ {len(df_filtered)} Firmen") - else: - print("⚠️ Keine Treffer nach Filter – leere Datei wird erstellt") - pd.DataFrame(columns=OUTPUT_COLS).to_csv( - outpath_filtered, index=False, - encoding='utf-8-sig', sep=';' - ) - - # ── Version B: Alle (nur Spalten + Encoding, kein Filter) ── - df_raw = process_result_csv(dl.content, df_input, apply_filter=False) - outname_raw = f"results_{base}_all.csv" - outpath_raw = os.path.join(RESULT_FOLDER, outname_raw) - - if df_raw is not None: - df_raw.to_csv( - outpath_raw, index=False, - encoding='utf-8-sig', sep=';' - ) - print(f"πŸ“‹ All: {outname_raw} β†’ {len(df_raw)} Firmen") - else: - print("⚠️ df_raw None – Rohinhalt wird gespeichert") - with open(outpath_raw, 'wb') as f: - f.write(dl.content) - - # ── DB speichern ── - job.status = "βœ… Fertig" - job.result_filename = outname_filtered # 🎯 Gefiltert - job.result_filename_raw = outname_raw # πŸ“‹ Alle - db.session.commit() - print(f"πŸŽ‰ Beide Dateien gespeichert!") - break - - elif status in ('failed', 'cancelled', 'error'): - raise ValueError(f"Scraper: {status}") - - except requests.RequestException as e: - print(f"⚠️ Poll {i}: {e}") - - time.sleep(random.uniform(8, 15)) - - else: - raise ValueError("Timeout nach 10min") - - except Exception as e: - job.status = "Failed" - job.result_filename = str(e) - print(f"πŸ’₯ ERROR: {e}") - import traceback - traceback.print_exc() - - db.session.commit() - print(f"βœ… DONE! Status: {job.status}\n") diff --git a/app/webcrawler.bck04032026 b/app/webcrawler.bck04032026 deleted file mode 100644 index 8860ad6..0000000 --- a/app/webcrawler.bck04032026 +++ /dev/null @@ -1,275 +0,0 @@ -import os -import re -import pandas as pd -import requests -import time -import random -from io import StringIO -from app.models import db, Job - -print("πŸ†• MODERN webcrawler LOADED! – BATCHED + PROXY") - -UPLOAD_FOLDER = '/app/uploads' -RESULT_FOLDER = '/app/results' -SCRAPER_URL = "http://gmaps-scraper:8080" - -OUTPUT_COLS = ['title', 'category', 'address', 'open_hours', 'website', 'phone', 'link'] - -PROXY_URL = "http://bitlleuv-rotate:s5hzse6hz74b@p.webshare.io:80" -API_PROXIES = {"http": PROXY_URL, "https": PROXY_URL} - -# ────────────────────────────────────────────── -# Hilfsfunktionen -# ────────────────────────────────────────────── - -def is_blocked(data): - text = str(data).lower() - blocked = any(kw in text for kw in ['captcha', 'blocked', 'rate limit', 'too many', '429']) - if blocked: - print(f"🚫 BLOCKED: {str(data)[:100]}") - return blocked - -def fix_encoding(text): - if not isinstance(text, str): - return text - try: - return text.encode('latin-1').decode('utf-8') - except (UnicodeEncodeError, UnicodeDecodeError): - return text - -def build_input_addresses(df): - addresses = set() - for _, row in df.iterrows(): - plz = str(row.get('PLZ', '')).strip() - stadt = str(row.get('Stadt', '')).strip() - str_ = str(row.get('Straße', '')).strip() - nr = str(row.get('Hausnummer', '')).strip() - zusatz = str(row.get('Zusatz', '')).strip() - full = f"{str_} {nr} {zusatz} {plz} {stadt}".lower().strip() - full = ' '.join(full.split()) - addresses.add(full) - return addresses - -def normalize_address(addr): - if not isinstance(addr, str): - return '' - addr = fix_encoding(addr) - return ' '.join(addr.lower().strip().split()) - -def address_in_input(result_addr, input_addresses): - norm = normalize_address(result_addr) - for inp_addr in input_addresses: - plz_match = re.search(r'\b\d{5}\b', inp_addr) - if plz_match: - plz = plz_match.group() - if plz in norm: - street = inp_addr.split()[0] if inp_addr else '' - if len(street) > 3 and street[:4].lower() in norm: - return True - return False - -# ────────────────────────────────────────────── -# CSV Nachbearbeitung -# ────────────────────────────────────────────── - -def process_result_csv(raw_bytes, input_df, apply_filter=True): - try: - content = raw_bytes.decode('utf-8', errors='replace') - df_out = pd.read_csv(StringIO(content)) - print(f"πŸ“„ Raw result: {df_out.shape}") - - available = [c for c in OUTPUT_COLS if c in df_out.columns] - df_out = df_out[available] - - for col in df_out.columns: - df_out[col] = df_out[col].apply(fix_encoding) - - if apply_filter: - input_addresses = build_input_addresses(input_df) - before = len(df_out) - df_out = df_out[ - df_out['address'].apply( - lambda a: address_in_input(a, input_addresses) - ) - ] - print(f"πŸ“ Filter: {before} β†’ {len(df_out)}") - - df_out = df_out.drop_duplicates(subset=['title', 'address'], keep='first') - df_out = df_out.dropna(subset=['title'], how='all') - df_out = df_out[df_out['title'].str.strip().astype(bool)] - - print(f"βœ… Final ({'gefiltert' if apply_filter else 'alle'}): {df_out.shape}") - return df_out - except Exception as e: - print(f"πŸ’₯ process_result_csv: {e}") - return None - -# ────────────────────────────────────────────── -# HAUPT-WORKER -# ────────────────────────────────────────────── - -def process_file(filename, job_id, app): - print(f"🎯 {filename} Job#{job_id} START!") - - with app.app_context(): - job = Job.query.get(job_id) - if not job: - print("❌ Job missing") - return - - try: - #Parse + ALLE Queries - job.status = "πŸ“Š parsing CSV" - db.session.commit() - - filepath = os.path.join(UPLOAD_FOLDER, filename) - print(f"πŸ“ {filepath} | {os.path.getsize(filepath)}b") - - df_input = pd.read_csv(filepath, sep=';', encoding='ISO-8859-1') - print(f"πŸ“Š {df_input.shape}") - - queries = [] - for _, row in df_input.iterrows(): - parts = [ - str(row.get('PLZ', '')).strip(), - str(row.get('Stadt', '')).strip(), - str(row.get('Straße', '')).strip(), - str(row.get('Hausnummer', '')).strip(), - str(row.get('Zusatz', '')).strip(), - ] - q = f"Firmen {' '.join(p for p in parts if p and p != 'nan')}".strip() - if len(q) > 10: - queries.append(q) - - total_queries = len(queries) - print(f"πŸ” {total_queries} Queries | Samples: {queries[:3]}") - if total_queries == 0: - raise ValueError("Keine gΓΌltigen Adressen") - - #BATCHED Processing - BATCH_SIZE = 10 # ErhΓΆht: 5 β†’ 10 (paid proxy) - BATCH_DELAY_MIN, BATCH_DELAY_MAX = 10, 20 # Reduziert: 30-60s β†’ 10-20s (paid proxy) - batches = (total_queries + BATCH_SIZE - 1) // BATCH_SIZE - print(f"πŸ“¦ {batches} Batches Γ  {BATCH_SIZE} | ETA: ~{batches*15//60:.0f}h") - - all_results_filtered = [] - all_results_raw = [] - job.status = f"πŸ”„ Batch 1/{batches}" - db.session.commit() - - for batch_idx in range(batches): - batch_start = batch_idx * BATCH_SIZE - batch_end = min(batch_start + BATCH_SIZE, total_queries) - batch_queries = queries[batch_start:batch_end] - print(f"\nπŸ”„ BATCH {batch_idx+1}/{batches} ({batch_start+1}-{batch_end}/{total_queries})") - - #Random Delay - delay = random.uniform(BATCH_DELAY_MIN, BATCH_DELAY_MAX) - print(f"😴 Delay: {delay:.0f}s | Proxy: {PROXY_URL}") - time.sleep(delay) - - #API Call - payload = { - "name": f"{filename.replace('.csv','')}-{job_id}-B{batch_idx+1:03d}", - "keywords": batch_queries, - "lang": "de", - "depth": 1, - "zoom": 17, - "radius": 50, - "max_time": 60, # Reduziert: 120 β†’ 60 (paid proxy schneller) - "fast_mode": False, - "proxies": [PROXY_URL] - } - - try: - resp = requests.post( - f"{SCRAPER_URL}/api/v1/jobs", - json=payload, - timeout=45 - ) - print(f"πŸ“€ {resp.status_code}") - if is_blocked(resp.text): - print("🚫 Batch ΓΌbersprungen (blocked)") - continue - if resp.status_code != 201: - print(f"⚠️ Batch {batch_idx+1} fehlgeschlagen: {resp.text[:100]}") - continue - - scraper_id = resp.json()['id'] - print(f"βœ… Scraper: {scraper_id}") - - for poll_i in range(1, 61): # Reduziert: 121 β†’ 61 (max_time 60s) - r = requests.get( - f"{SCRAPER_URL}/api/v1/jobs/{scraper_id}", - timeout=15 - ) - data = r.json() - status = data.get('Status', data.get('status', '?')) - - if status in ('ok', 'completed', 'scraped'): - dl = requests.get( - f"{SCRAPER_URL}/api/v1/jobs/{scraper_id}/download", - timeout=90 - ) - if dl.status_code == 200: - df_filtered = process_result_csv(dl.content, df_input, True) - df_raw = process_result_csv(dl.content, df_input, False) - if df_filtered is not None: - all_results_filtered.append(df_filtered) - all_results_raw.append(df_raw) - print(f"πŸ“Š Batch {batch_idx+1}: {len(df_filtered)} filtered") - break - elif status in ('failed', 'error'): - print(f"πŸ’₯ Batch {batch_idx+1}: {status}") - break - - time.sleep(random.uniform(5, 10)) # Reduziert: 10-20s β†’ 5-10s (paid proxy) - - except Exception as e: - print(f"πŸ’₯ Batch {batch_idx+1}: {e}") - - job.status = f"πŸ”„ Batch {batch_idx+2}/{batches}" - db.session.commit() - - #MERGE & SAVE - job.status = "πŸ”§ merging results" - db.session.commit() - - base = filename.replace('.csv', '') - os.makedirs(RESULT_FOLDER, exist_ok=True) - - if all_results_filtered: - df_final_filtered = pd.concat(all_results_filtered, ignore_index=True) - df_final_filtered = df_final_filtered.drop_duplicates(subset=['title', 'address']) - - out_filtered = f"results_{base}_filtered.csv" - df_final_filtered.to_csv( - os.path.join(RESULT_FOLDER, out_filtered), - index=False, encoding='utf-8-sig', sep=';' - ) - - if all_results_raw: - df_final_raw = pd.concat(all_results_raw, ignore_index=True) - out_raw = f"results_{base}_all.csv" - df_final_raw.to_csv( - os.path.join(RESULT_FOLDER, out_raw), - index=False, encoding='utf-8-sig', sep=';' - ) - - job.result_filename = out_filtered - job.result_filename_raw = out_raw - job.status = f"βœ… Fertig: {len(df_final_filtered)} Firmen" - else: - job.status = "❌ Keine Ergebnisse" - - db.session.commit() - print(f"πŸŽ‰ Job {job_id} komplett!") - - except Exception as e: - job.status = f"Failed: {str(e)[:50]}" - print(f"πŸ’₯ FATAL: {e}") - import traceback - traceback.print_exc() - db.session.commit() - - print(f"βœ… DONE! Status: {job.status}") diff --git a/app/webcrawler.bck04032026_2 b/app/webcrawler.bck04032026_2 deleted file mode 100644 index 4a30b38..0000000 --- a/app/webcrawler.bck04032026_2 +++ /dev/null @@ -1,429 +0,0 @@ -import os -import re -import unicodedata -import json -import pandas as pd -import requests -import time -import random -from io import StringIO -from app.models import db, Job - -print("πŸ†• MODERN webcrawler LOADED! – BATCHED + PROXY + RESUME + ETA + 2x SCRAPER") - -UPLOAD_FOLDER = '/app/uploads' -RESULT_FOLDER = '/app/results' - -# 2x Scraper – abwechselnd genutzt -SCRAPER_URLS = [ - "http://gmaps-scraper-1:8080", - "http://gmaps-scraper-2:8080", -] - -OUTPUT_COLS = ['title', 'category', 'address', 'open_hours', 'website', 'phone', 'link'] - -PROXY_URL = "http://bitlleuv-rotate:s5hzse6hz74b@p.webshare.io:80" -API_PROXIES = {"http": PROXY_URL, "https": PROXY_URL} - -# ────────────────────────────────────────────── -# Tuning -# ────────────────────────────────────────────── -BATCH_SIZE = 30 # Keywords pro Scraper-Job -BATCH_DELAY_MIN = 3 # Sekunden Pause zwischen Batches (min) -BATCH_DELAY_MAX = 6 # Sekunden Pause zwischen Batches (max) -MAX_TIME = 60 # Sekunden die der Scraper pro Batch hat -POLL_MAX = 90 # Max. Poll-Versuche pro Batch -POLL_DELAY_MIN = 2 # Sekunden zwischen Polls (min) -POLL_DELAY_MAX = 5 # Sekunden zwischen Polls (max) -STUCK_THRESHOLD = 8 # Polls auf 'pending' bis Auto-Restart -MAX_RETRIES = 2 # Wiederholversuche pro Batch bei Fehler - -# ────────────────────────────────────────────── -# Hilfsfunktionen -# ────────────────────────────────────────────── - -def is_blocked(data): - text = str(data).lower() - blocked = any(kw in text for kw in ['captcha', 'blocked', 'rate limit', 'too many', '429']) - if blocked: - print(f"🚫 BLOCKED: {str(data)[:100]}") - return blocked - -def fix_encoding(text): - if not isinstance(text, str): - return text - try: - return text.encode('latin-1').decode('utf-8') - except (UnicodeEncodeError, UnicodeDecodeError): - return text - -# Fix 1: Sonderzeichen in Queries bereinigen -def clean_query(q): - """Steuerzeichen + fehlerhafte Bytes entfernen fΓΌr saubere Google Maps URLs""" - q = ''.join(c for c in q if unicodedata.category(c) != 'Cc') - q = ' '.join(q.split()) - return q.strip() - -def build_input_addresses(df): - addresses = set() - for _, row in df.iterrows(): - plz = str(row.get('PLZ', '')).strip() - stadt = str(row.get('Stadt', '')).strip() - str_ = str(row.get('Straße', '')).strip() - nr = str(row.get('Hausnummer', '')).strip() - zusatz = str(row.get('Zusatz', '')).strip() - full = f"{str_} {nr} {zusatz} {plz} {stadt}".lower().strip() - full = ' '.join(full.split()) - addresses.add(full) - return addresses - -def normalize_address(addr): - if not isinstance(addr, str): - return '' - addr = fix_encoding(addr) - return ' '.join(addr.lower().strip().split()) - -def address_in_input(result_addr, input_addresses): - norm = normalize_address(result_addr) - for inp_addr in input_addresses: - plz_match = re.search(r'\b\d{5}\b', inp_addr) - if plz_match: - plz = plz_match.group() - if plz in norm: - street = inp_addr.split()[0] if inp_addr else '' - if len(street) > 3 and street[:4].lower() in norm: - return True - return False - -def format_eta(seconds): - """Sekunden β†’ lesbares ETA-Format""" - if seconds < 60: - return f"{int(seconds)}s" - h, rem = divmod(int(seconds), 3600) - m = rem // 60 - return f"{h}h {m:02d}min" if h > 0 else f"{m}min" - -# ────────────────────────────────────────────── -# Fix 3: Scraper-Neustart bei Inactivity -# ────────────────────────────────────────────── - -def restart_scraper(scraper_url): - """Den betroffenen Scraper-Container neu starten""" - try: - import subprocess - # Container-Name aus URL ableiten: http://gmaps-scraper-1:8080 β†’ gmaps-scraper-1 - container = scraper_url.split("//")[1].split(":")[0] - print(f"πŸ”„ Starte {container} neu...") - subprocess.run(["docker", "restart", container], timeout=30, capture_output=True) - print(f"βœ… {container} neu gestartet – warte 15s...") - time.sleep(15) - return True - except Exception as e: - print(f"⚠️ Scraper-Neustart fehlgeschlagen: {e}") - return False - -# ────────────────────────────────────────────── -# Resume: Progress-File Hilfsfunktionen -# ────────────────────────────────────────────── - -def get_progress_path(job_id): - return os.path.join(RESULT_FOLDER, f"progress_{job_id}.json") - -def get_partial_path(job_id, suffix): - return os.path.join(RESULT_FOLDER, f"partial_{job_id}_{suffix}.csv") - -def load_progress(job_id): - """Gespeicherten Fortschritt laden (falls vorhanden)""" - path = get_progress_path(job_id) - if os.path.exists(path): - with open(path, 'r') as f: - data = json.load(f) - print(f"πŸ” RESUME: ab Batch {data['last_completed_batch'] + 1}/{data['total_batches']}") - return data - return None - -def save_progress(job_id, last_completed_batch, total_batches): - """Fortschritt nach jedem Batch speichern""" - path = get_progress_path(job_id) - with open(path, 'w') as f: - json.dump({"last_completed_batch": last_completed_batch, "total_batches": total_batches}, f) - -def append_partial(job_id, df_filtered, df_raw): - """Batch-Ergebnis an Partial-CSV anhΓ€ngen""" - for suffix, df in [('filtered', df_filtered), ('raw', df_raw)]: - if df is None: - continue - path = get_partial_path(job_id, suffix) - header = not os.path.exists(path) - df.to_csv(path, mode='a', index=False, header=header, encoding='utf-8-sig', sep=';') - -def load_partial(job_id): - """Bestehende Partial-CSVs laden""" - results_filtered, results_raw = [], [] - for suffix, lst in [('filtered', results_filtered), ('raw', results_raw)]: - path = get_partial_path(job_id, suffix) - if os.path.exists(path): - try: - df = pd.read_csv(path, sep=';', encoding='utf-8-sig') - lst.append(df) - print(f"πŸ“‚ Partial {suffix}: {len(df)} Zeilen geladen") - except Exception as e: - print(f"⚠️ Partial {suffix} Ladefehler: {e}") - return results_filtered, results_raw - -def cleanup_progress(job_id): - """Progress + Partial-Files nach Abschluss lΓΆschen""" - for path in [ - get_progress_path(job_id), - get_partial_path(job_id, 'filtered'), - get_partial_path(job_id, 'raw'), - ]: - if os.path.exists(path): - os.remove(path) - -# ────────────────────────────────────────────── -# CSV Nachbearbeitung -# ────────────────────────────────────────────── - -def process_result_csv(raw_bytes, input_df, apply_filter=True): - try: - content = raw_bytes.decode('utf-8', errors='replace') - df_out = pd.read_csv(StringIO(content)) - print(f"πŸ“„ Raw result: {df_out.shape}") - - available = [c for c in OUTPUT_COLS if c in df_out.columns] - df_out = df_out[available] - - for col in df_out.columns: - df_out[col] = df_out[col].apply(fix_encoding) - - if apply_filter: - input_addresses = build_input_addresses(input_df) - before = len(df_out) - df_out = df_out[ - df_out['address'].apply(lambda a: address_in_input(a, input_addresses)) - ] - print(f"πŸ“ Filter: {before} β†’ {len(df_out)}") - - df_out = df_out.drop_duplicates(subset=['title', 'address'], keep='first') - df_out = df_out.dropna(subset=['title'], how='all') - df_out = df_out[df_out['title'].str.strip().astype(bool)] - - print(f"βœ… Final ({'gefiltert' if apply_filter else 'alle'}): {df_out.shape}") - return df_out - except Exception as e: - print(f"πŸ’₯ process_result_csv: {e}") - return None - -# ────────────────────────────────────────────── -# HAUPT-WORKER -# ────────────────────────────────────────────── - -def process_file(filename, job_id, app): - print(f"🎯 {filename} Job#{job_id} START!") - - with app.app_context(): - job = Job.query.get(job_id) - if not job: - print("❌ Job missing") - return - - try: - #Parse + ALLE Queries - job.status = "πŸ“Š parsing CSV" - db.session.commit() - - filepath = os.path.join(UPLOAD_FOLDER, filename) - print(f"πŸ“ {filepath} | {os.path.getsize(filepath)}b") - - df_input = pd.read_csv(filepath, sep=';', encoding='ISO-8859-1') - print(f"πŸ“Š {df_input.shape}") - - queries = [] - for _, row in df_input.iterrows(): - parts = [ - str(row.get('PLZ', '')).strip(), - str(row.get('Stadt', '')).strip(), - str(row.get('Straße', '')).strip(), - str(row.get('Hausnummer', '')).strip(), - str(row.get('Zusatz', '')).strip(), - ] - q = f"Firmen {' '.join(p for p in parts if p and p != 'nan')}".strip() - q = clean_query(q) # Fix 1: Sonderzeichen bereinigen - if len(q) > 10: - queries.append(q) - - total_queries = len(queries) - print(f"πŸ” {total_queries} Queries | Samples: {queries[:3]}") - if total_queries == 0: - raise ValueError("Keine gΓΌltigen Adressen") - - #BATCHED Processing - batches = (total_queries + BATCH_SIZE - 1) // BATCH_SIZE - - # Resume: Fortschritt laden falls vorhanden - os.makedirs(RESULT_FOLDER, exist_ok=True) - progress = load_progress(job_id) - start_batch = progress['last_completed_batch'] + 1 if progress else 0 - all_results_filtered, all_results_raw = load_partial(job_id) if progress else ([], []) - - eta_initial = format_eta((batches - start_batch) * ((BATCH_DELAY_MAX + MAX_TIME) / 2)) - print(f"πŸ“¦ {batches} Batches Γ  {BATCH_SIZE} | 2x Scraper | Start: {start_batch} | ETA: ~{eta_initial}") - job_start_time = time.time() - job.status = f"πŸ”„ Batch {start_batch+1}/{batches} | ⏱️ ~{eta_initial}" - db.session.commit() - - for batch_idx in range(start_batch, batches): - batch_start = batch_idx * BATCH_SIZE - batch_end = min(batch_start + BATCH_SIZE, total_queries) - batch_queries = queries[batch_start:batch_end] - - # 2x Scraper: abwechselnd nutzen - scraper_url = SCRAPER_URLS[batch_idx % len(SCRAPER_URLS)] - print(f"\nπŸ”„ BATCH {batch_idx+1}/{batches} ({batch_start+1}-{batch_end}/{total_queries}) β†’ {scraper_url}") - - #Random Delay - delay = random.uniform(BATCH_DELAY_MIN, BATCH_DELAY_MAX) - print(f"😴 Delay: {delay:.0f}s") - time.sleep(delay) - - #API Call - payload = { - "name": f"{filename.replace('.csv','')}-{job_id}-B{batch_idx+1:03d}", - "keywords": batch_queries, - "lang": "de", - "depth": 1, - "zoom": 15, - "radius": 50, - "max_time": MAX_TIME, - "fast_mode": False, - "proxies": [PROXY_URL] - } - - batch_success = False - # Fix 2: Retry-Logik bei Scraper-Fehler - for attempt in range(1, MAX_RETRIES + 1): - try: - resp = requests.post(f"{scraper_url}/api/v1/jobs", json=payload, timeout=45) - print(f"πŸ“€ {resp.status_code} (Versuch {attempt} | {scraper_url})") - - if is_blocked(resp.text): - print("🚫 Batch ΓΌbersprungen (blocked)") - break - if resp.status_code != 201: - print(f"⚠️ Batch {batch_idx+1} fehlgeschlagen: {resp.text[:100]}") - if attempt < MAX_RETRIES: - time.sleep(10) - continue - - scraper_id = resp.json()['id'] - print(f"βœ… Scraper: {scraper_id}") - - stuck_counter = 0 - for poll_i in range(1, POLL_MAX + 1): - r = requests.get(f"{scraper_url}/api/v1/jobs/{scraper_id}", timeout=15) - data = r.json() - status = data.get('Status', data.get('status', '?')) - print(f"⏳ Poll {poll_i}: {status}") - - # Fix 4: Auto-Recovery bei Pending-Stuck - if status == 'pending': - stuck_counter += 1 - if stuck_counter >= STUCK_THRESHOLD: - print(f"⚠️ Job {scraper_id} hΓ€ngt – abbrechen + Neustart") - requests.delete(f"{scraper_url}/api/v1/jobs/{scraper_id}", timeout=10) - restart_scraper(scraper_url) # Fix 3: Nur betroffenen Scraper neu starten - break - else: - stuck_counter = 0 - - if status in ('ok', 'completed', 'scraped'): - dl = requests.get(f"{scraper_url}/api/v1/jobs/{scraper_id}/download", timeout=90) - if dl.status_code == 200: - df_filtered = process_result_csv(dl.content, df_input, True) - df_raw = process_result_csv(dl.content, df_input, False) - if df_filtered is not None: - all_results_filtered.append(df_filtered) - all_results_raw.append(df_raw) - append_partial(job_id, df_filtered, df_raw) # Resume: sofort speichern - print(f"πŸ“Š Batch {batch_idx+1}: {len(df_filtered)} filtered") - batch_success = True - break - - # Fix 2: Scraper-Fehler β†’ Retry - elif status in ('failed', 'error'): - print(f"πŸ’₯ Batch {batch_idx+1}: {status} (Versuch {attempt})") - if attempt < MAX_RETRIES: - time.sleep(10) - break - - time.sleep(random.uniform(POLL_DELAY_MIN, POLL_DELAY_MAX)) - - if batch_success: - break - - except Exception as e: - print(f"πŸ’₯ Batch {batch_idx+1} Versuch {attempt}: {e}") - if attempt < MAX_RETRIES: - time.sleep(10) - - # Resume: Fortschritt nach jedem Batch speichern - save_progress(job_id, batch_idx, batches) - - # ETA berechnen - elapsed = time.time() - job_start_time - done_so_far = batch_idx - start_batch + 1 - if done_so_far > 0: - avg_per_batch = elapsed / done_so_far - remaining = (batches - batch_idx - 1) * avg_per_batch - eta_str = format_eta(remaining) - else: - eta_str = "?" - - job.status = f"πŸ”„ Batch {batch_idx+2}/{batches} | ⏱️ ~{eta_str}" - db.session.commit() - - #MERGE & SAVE - job.status = "πŸ”§ merging results" - db.session.commit() - - base = filename.replace('.csv', '') - - if all_results_filtered: - df_final_filtered = pd.concat(all_results_filtered, ignore_index=True) - df_final_filtered = df_final_filtered.drop_duplicates(subset=['title', 'address']) - - out_filtered = f"results_{base}_filtered.csv" - df_final_filtered.to_csv( - os.path.join(RESULT_FOLDER, out_filtered), - index=False, encoding='utf-8-sig', sep=';' - ) - - if all_results_raw: - df_final_raw = pd.concat(all_results_raw, ignore_index=True) - out_raw = f"results_{base}_all.csv" - df_final_raw.to_csv( - os.path.join(RESULT_FOLDER, out_raw), - index=False, encoding='utf-8-sig', sep=';' - ) - - job.result_filename = out_filtered - job.result_filename_raw = out_raw - job.status = f"βœ… Fertig: {len(df_final_filtered)} Firmen" - - # Resume: Cleanup nach Abschluss - cleanup_progress(job_id) - else: - job.status = "❌ Keine Ergebnisse" - - db.session.commit() - print(f"πŸŽ‰ Job {job_id} komplett!") - - except Exception as e: - job.status = f"Failed: {str(e)[:50]}" - print(f"πŸ’₯ FATAL: {e}") - import traceback - traceback.print_exc() - db.session.commit() - - print(f"βœ… DONE! Status: {job.status}") diff --git a/app/webcrawler.orig b/app/webcrawler.orig deleted file mode 100644 index f73f061..0000000 --- a/app/webcrawler.orig +++ /dev/null @@ -1,138 +0,0 @@ -import csv -import os -import requests -from .models import db, Job -from flask import current_app - -UPLOAD_FOLDER = 'uploads' -RESULT_FOLDER = 'results' - -API_KEY = 'AIzaSyAIf0yXJTwo87VMWLBtq2m2LqE-OaPGbzw' - -processed_companies = set() - -def get_geocode(address): - url = f"https://maps.googleapis.com/maps/api/geocode/json" - params = {'address': address, 'key': API_KEY} - - try: - response = requests.get(url, params=params, timeout=5) - if response.status_code == 200: - data = response.json() - if data['status'] == 'OK': - location = data['results'][0]['geometry']['location'] - return location['lat'], location['lng'] - except requests.RequestException as e: - print(f"Geocode API Fehler fΓΌr {address}: {e}") - return None, None - -def get_nearby_places(lat, lng): - places_url = f"https://maps.googleapis.com/maps/api/place/nearbysearch/json" - params = { - 'location': f"{lat},{lng}", - 'radius': 10, - 'type': 'point_of_interest', - 'key': API_KEY - } - - try: - response = requests.get(places_url, params=params, timeout=5) - if response.status_code == 200: - return response.json().get('results', []) - except requests.RequestException as e: - print(f"Nearby Places API Fehler fΓΌr Standort {lat},{lng}: {e}") - return [] - -def get_place_details(place_id): - details_url = f"https://maps.googleapis.com/maps/api/place/details/json" - params = { - 'place_id': place_id, - 'fields': 'formatted_phone_number,website', - 'key': API_KEY - } - - try: - response = requests.get(details_url, params=params, timeout=5) - if response.status_code == 200: - result = response.json().get('result', {}) - return result.get('formatted_phone_number', 'N/A'), result.get('website', 'N/A') - except requests.RequestException as e: - print(f"Place Details API Fehler fΓΌr Place ID {place_id}: {e}") - return 'N/A', 'N/A' - -def process_file(filename, job_id, app): - with app.app_context(): - filepath = os.path.join(UPLOAD_FOLDER, filename) - results = [] - - job = Job.query.get(job_id) - if not job: - print("Job wurde abgebrochen.") - return - job.status = "In Progress" - db.session.commit() - - with open(filepath, newline='', encoding='ISO-8859-1') as csvfile: - reader = csv.DictReader(csvfile, delimiter=';') - headers = reader.fieldnames - - if not all(field in headers for field in ['PLZ', 'Straße', 'Hausnummer']): - print("CSV-Datei enthΓ€lt nicht alle notwendigen Spalten.") - job.status = "Failed" - db.session.commit() - return - - for row in reader: - plz = row.get('PLZ', '').strip() - city = row.get('Stadt', row.get('Bezirk', '')).strip() - street = row.get('Straße', '').strip() - house_number = row.get('Hausnummer', '').strip() - additional = row.get('Zusatz', '').strip() - - if not all([plz, city, street, house_number]): - continue - - full_address = f"{street} {house_number} {additional}, {plz} {city}" - lat, lng = get_geocode(full_address) - if lat is None or lng is None: - continue - - nearby_places = get_nearby_places(lat, lng) - for place in nearby_places: - company_name = place['name'] - if company_name in processed_companies: - continue - - processed_companies.add(company_name) - company_address = place.get('vicinity', 'N/A').split(',')[0] - place_id = place.get('place_id') - company_phone, company_website = get_place_details(place_id) if place_id else ('N/A', 'N/A') - - results.append({ - 'PLZ': plz, - 'Stadt': city, - 'Straße': street, - 'Hausnummer': house_number, - 'Zusatz': additional, - 'Company Name': company_name, - 'Company Address': company_address, - 'Company Phone': company_phone, - 'Company Website': company_website - }) - - if results: - result_file = f"results_{os.path.splitext(filename)[0]}.csv" - result_path = os.path.join(RESULT_FOLDER, result_file) - with open(result_path, 'w', newline='', encoding='utf-8-sig') as csvfile: - writer = csv.DictWriter(csvfile, fieldnames=[ - 'PLZ', 'Stadt', 'Straße', 'Hausnummer', 'Zusatz', - 'Company Name', 'Company Address', 'Company Phone', 'Company Website' - ]) - writer.writeheader() - writer.writerows(results) - job.status = "Completed" - job.result_filename = result_file - db.session.commit() - else: - job.status = "Failed" - db.session.commit()