Initial commit

2026-03-10 11:33:18 +01:00 · 2026-03-10 11:33:18 +01:00 · df8c2313a9
commit df8c2313a9
parent 387bc056b9
275 changed files with 12939 additions and 263 deletions
--- a/app/webcrawler.py
+++ b/app/webcrawler.py
@ -1,138 +1,487 @@
-import csv
 import os
+import re
+import unicodedata
+import json
+import threading
+import pandas as pd
 import requests
-from .models import db, Job
-from flask import current_app
+import time
+import random
+from io import StringIO
+from concurrent.futures import ThreadPoolExecutor, as_completed
+from app.models import db, Job

-UPLOAD_FOLDER = 'uploads'
-RESULT_FOLDER = 'results'
+print("🆕 MODERN webcrawler LOADED! – BATCHED + PROXY + RESUME + ETA + 4x SCRAPER CHUNK-PARALLEL")

-API_KEY = 'AIzaSyAIf0yXJTwo87VMWLBtq2m2LqE-OaPGbzw'
+UPLOAD_FOLDER = '/app/uploads'
+RESULT_FOLDER = '/app/results'

-processed_companies = set()
+SCRAPER_URLS = [
+    "http://gmaps-scraper-1:8080",
+    "http://gmaps-scraper-2:8080",
+    "http://gmaps-scraper-3:8080",
+    "http://gmaps-scraper-4:8080",
+]

-def get_geocode(address):
-    url = f"https://maps.googleapis.com/maps/api/geocode/json"
-    params = {'address': address, 'key': API_KEY}
+OUTPUT_COLS = ['title', 'category', 'address', 'open_hours', 'website', 'phone', 'link']

+PROXY_URL = "http://bitlleuv-rotate:s5hzse6hz74b@p.webshare.io:80"
+API_PROXIES = {"http": PROXY_URL, "https": PROXY_URL}
+
+_job_semaphore = threading.Semaphore(1)
+
+# ──────────────────────────────────────────────
+# Tuning
+# ──────────────────────────────────────────────
+BATCH_SIZE        = 30    # Keywords pro Scraper-Job
+BATCH_DELAY_MIN   = 3     # Sekunden Pause zwischen Chunks (min)
+BATCH_DELAY_MAX   = 6     # Sekunden Pause zwischen Chunks (max)
+MAX_TIME          = 60    # Sekunden die der Scraper pro Batch hat
+POLL_MAX          = 90    # Max. Poll-Versuche pro Batch
+POLL_DELAY_MIN    = 2     # Sekunden zwischen Polls (min)
+POLL_DELAY_MAX    = 5     # Sekunden zwischen Polls (max)
+STUCK_TIMEOUT     = 300   # Sekunden bis Scraper-Neustart (5 Min)
+MAX_RETRIES       = 2     # Wiederholversuche pro Batch bei Fehler
+PARALLEL_WORKERS  = len(SCRAPER_URLS)
+
+_partial_lock = threading.Lock()
+
+# ──────────────────────────────────────────────
+# Hilfsfunktionen
+# ──────────────────────────────────────────────
+
+def is_blocked(data):
+    text = str(data).lower()
+    blocked = any(kw in text for kw in ['captcha', 'blocked', 'rate limit', 'too many', '429'])
+    if blocked:
+        print(f"🚫 BLOCKED: {str(data)[:100]}")
+    return blocked
+
+def fix_encoding(text):
+    if not isinstance(text, str):
+        return text
    try:
-        response = requests.get(url, params=params, timeout=5)
-        if response.status_code == 200:
-            data = response.json()
-            if data['status'] == 'OK':
-                location = data['results'][0]['geometry']['location']
-                return location['lat'], location['lng']
-    except requests.RequestException as e:
-        print(f"Geocode API Fehler für {address}: {e}")
+        return text.encode('latin-1').decode('utf-8')
+    except (UnicodeEncodeError, UnicodeDecodeError):
+        return text
+
+def clean_query(q):
+    q = ''.join(c for c in q if unicodedata.category(c) != 'Cc')
+    q = ' '.join(q.split())
+    return q.strip()
+
+def build_input_addresses(df):
+    addresses = set()
+    for _, row in df.iterrows():
+        plz    = str(row.get('PLZ', '')).strip()
+        stadt  = str(row.get('Stadt', '')).strip()
+        str_   = str(row.get('Straße', '')).strip()
+        nr     = str(row.get('Hausnummer', '')).strip()
+        zusatz = str(row.get('Zusatz', '')).strip()
+        full = f"{str_} {nr} {zusatz} {plz} {stadt}".lower().strip()
+        full = ' '.join(full.split())
+        addresses.add(full)
+    return addresses
+
+def normalize_address(addr):
+    if not isinstance(addr, str):
+        return ''
+    addr = fix_encoding(addr)
+    return ' '.join(addr.lower().strip().split())
+
+def address_in_input(result_addr, input_addresses):
+    norm = normalize_address(result_addr)
+    for inp_addr in input_addresses:
+        plz_match = re.search(r'\b\d{5}\b', inp_addr)
+        if not plz_match:
+            continue
+        plz = plz_match.group()
+        if plz not in norm:
+            continue
+
+        parts = inp_addr.split()
+
+        street = parts[0] if parts else ''
+        if len(street) < 4 or street[:5].lower() not in norm:
+            continue
+
+        hausnr = parts[1] if len(parts) > 1 else ''
+        if hausnr and not re.search(rf'\b{re.escape(hausnr)}\b', norm):
+            continue
+
+        return True
+    return False
+
+def format_eta(seconds):
+    if seconds < 60:
+        return f"{int(seconds)}s"
+    h, rem = divmod(int(seconds), 3600)
+    m = rem // 60
+    return f"{h}h {m:02d}min" if h > 0 else f"{m}min"
+
+# ──────────────────────────────────────────────
+# Scraper-Job Cleanup
+# ──────────────────────────────────────────────
+
+def _cleanup_scraper_job(scraper_url, scraper_id):
+    """Scraper-Job immer aufräumen wenn wir ihn nicht mehr brauchen"""
+    try:
+        requests.delete(f"{scraper_url}/api/v1/jobs/{scraper_id}", timeout=10)
+        print(f"🗑️ Scraper-Job {scraper_id} gelöscht")
+    except Exception as e:
+        print(f"⚠️ Cleanup fehlgeschlagen: {e}")
+
+# ──────────────────────────────────────────────
+# Scraper-Neustart via Docker SDK
+# ──────────────────────────────────────────────
+
+def restart_scraper(scraper_url):
+    try:
+        import docker
+        container_name = scraper_url.split("//")[1].split(":")[0]
+        print(f"🔄 Starte {container_name} neu...")
+        client = docker.from_env()
+        container = client.containers.get(container_name)
+        container.restart()
+        print(f"✅ {container_name} neu gestartet – warte 15s...")
+        time.sleep(15)
+        return True
+    except Exception as e:
+        print(f"⚠️ Scraper-Neustart fehlgeschlagen: {e}")
+        return False
+
+# ──────────────────────────────────────────────
+# Resume: Progress-File Hilfsfunktionen
+# ──────────────────────────────────────────────
+
+def get_progress_path(job_id):
+    return os.path.join(RESULT_FOLDER, f"progress_{job_id}.json")
+
+def get_partial_path(job_id, suffix):
+    return os.path.join(RESULT_FOLDER, f"partial_{job_id}_{suffix}.csv")
+
+def load_progress(job_id):
+    path = get_progress_path(job_id)
+    if os.path.exists(path):
+        with open(path, 'r') as f:
+            data = json.load(f)
+        print(f"🔁 RESUME: ab Batch {data['last_completed_batch'] + 1}/{data['total_batches']}")
+        return data
+    return None
+
+def save_progress(job_id, last_completed_batch, total_batches):
+    path = get_progress_path(job_id)
+    with open(path, 'w') as f:
+        json.dump({"last_completed_batch": last_completed_batch, "total_batches": total_batches}, f)
+
+def append_partial(job_id, df_filtered, df_raw):
+    with _partial_lock:
+        for suffix, df in [('filtered', df_filtered), ('raw', df_raw)]:
+            if df is None:
+                continue
+            path = get_partial_path(job_id, suffix)
+            header = not os.path.exists(path)
+            df.to_csv(path, mode='a', index=False, header=header, encoding='utf-8-sig', sep=';')
+
+def load_partial(job_id):
+    results_filtered, results_raw = [], []
+    for suffix, lst in [('filtered', results_filtered), ('raw', results_raw)]:
+        path = get_partial_path(job_id, suffix)
+        if os.path.exists(path):
+            try:
+                df = pd.read_csv(path, sep=';', encoding='utf-8-sig')
+                lst.append(df)
+                print(f"📂 Partial {suffix}: {len(df)} Zeilen geladen")
+            except Exception as e:
+                print(f"⚠️ Partial {suffix} Ladefehler: {e}")
+    return results_filtered, results_raw
+
+def cleanup_progress(job_id):
+    for path in [
+        get_progress_path(job_id),
+        get_partial_path(job_id, 'filtered'),
+        get_partial_path(job_id, 'raw'),
+    ]:
+        if os.path.exists(path):
+            os.remove(path)
+
+# ──────────────────────────────────────────────
+# CSV Nachbearbeitung
+# ──────────────────────────────────────────────
+
+def process_result_csv(raw_bytes, input_df, apply_filter=True):
+    try:
+        content = raw_bytes.decode('utf-8', errors='replace')
+        df_out = pd.read_csv(StringIO(content))
+        print(f"📄 Raw result: {df_out.shape}")
+
+        available = [c for c in OUTPUT_COLS if c in df_out.columns]
+        df_out = df_out[available]
+
+        for col in df_out.columns:
+            df_out[col] = df_out[col].apply(fix_encoding)
+
+        if apply_filter:
+            input_addresses = build_input_addresses(input_df)
+            before = len(df_out)
+            df_out = df_out[
+                df_out['address'].apply(lambda a: address_in_input(a, input_addresses))
+            ]
+            print(f"📍 Filter: {before} → {len(df_out)}")
+
+        df_out = df_out.drop_duplicates(subset=['title', 'address'], keep='first')
+        df_out = df_out.dropna(subset=['title'], how='all')
+        df_out = df_out[df_out['title'].str.strip().astype(bool)]
+
+        print(f"✅ Final ({'gefiltert' if apply_filter else 'alle'}): {df_out.shape}")
+        return df_out
+    except Exception as e:
+        print(f"💥 process_result_csv: {e}")
+        return None
+
+# ──────────────────────────────────────────────
+# Parallel: Einzelnen Batch verarbeiten
+# ──────────────────────────────────────────────
+
+def process_batch(batch_idx, batch_queries, scraper_url, filename, job_id, df_input):
+    payload = {
+        "name": f"{filename.replace('.csv','')}-{job_id}-B{batch_idx+1:03d}",
+        "keywords": batch_queries,
+        "lang": "de",
+        "depth": 1,
+        "zoom": 17,
+        "radius": 100,
+        "max_time": MAX_TIME,
+        "fast_mode": False,
+        "proxies": [PROXY_URL]
+    }
+
+    for attempt in range(1, MAX_RETRIES + 1):
+        scraper_id = None
+        try:
+            resp = requests.post(f"{scraper_url}/api/v1/jobs", json=payload, timeout=45)
+            print(f"📤 Batch {batch_idx+1} → {scraper_url} | {resp.status_code} (Versuch {attempt})")
+
+            if is_blocked(resp.text):
+                print(f"🚫 Batch {batch_idx+1} blocked")
+                return None, None
+
+            if resp.status_code != 201:
+                print(f"⚠️ Batch {batch_idx+1} fehlgeschlagen: {resp.text[:100]}")
+                if attempt < MAX_RETRIES:
+                    time.sleep(10)
+                continue
+
+            scraper_id = resp.json()['id']
+            print(f"✅ Batch {batch_idx+1} Scraper-ID: {scraper_id}")
+
+            batch_start_time = time.time()
+
+            for poll_i in range(1, POLL_MAX + 1):
+                r = requests.get(f"{scraper_url}/api/v1/jobs/{scraper_id}", timeout=15)
+                data = r.json()
+                status = data.get('Status', data.get('status', '?'))
+                elapsed = time.time() - batch_start_time
+                print(f"⏳ Batch {batch_idx+1} Poll {poll_i}: {status} | {int(elapsed)}s")
+
+                if status == 'pending' and elapsed > STUCK_TIMEOUT:
+                    print(f"⚠️ Batch {batch_idx+1} hängt seit {int(elapsed)}s – Neustart {scraper_url}")
+                    _cleanup_scraper_job(scraper_url, scraper_id)
+                    scraper_id = None
+                    restart_scraper(scraper_url)
+                    break
+
+                if status in ('ok', 'completed', 'scraped'):
+                    dl = requests.get(f"{scraper_url}/api/v1/jobs/{scraper_id}/download", timeout=90)
+                    scraper_id = None 
+                    if dl.status_code == 200:
+                        df_filtered = process_result_csv(dl.content, df_input, True)
+                        df_raw      = process_result_csv(dl.content, df_input, False)
+                        print(f"📊 Batch {batch_idx+1}: {len(df_filtered) if df_filtered is not None else 0} filtered")
+                        return df_filtered, df_raw
+                    return None, None
+
+                elif status in ('failed', 'error'):
+                    print(f"💥 Batch {batch_idx+1}: {status} (Versuch {attempt})")
+                    _cleanup_scraper_job(scraper_url, scraper_id)
+                    scraper_id = None
+                    if attempt < MAX_RETRIES:
+                        time.sleep(10)
+                    break
+
+                time.sleep(random.uniform(POLL_DELAY_MIN, POLL_DELAY_MAX))
+
+        except Exception as e:
+            print(f"💥 Batch {batch_idx+1} Versuch {attempt}: {e}")
+            if scraper_id:
+                _cleanup_scraper_job(scraper_url, scraper_id)
+                scraper_id = None
+            if attempt < MAX_RETRIES:
+                time.sleep(10)
+
    return None, None

-def get_nearby_places(lat, lng):
-    places_url = f"https://maps.googleapis.com/maps/api/place/nearbysearch/json"
-    params = {
-        'location': f"{lat},{lng}",
-        'radius': 10,
-        'type': 'point_of_interest',
-        'key': API_KEY
-    }
-
-    try:
-        response = requests.get(places_url, params=params, timeout=5)
-        if response.status_code == 200:
-            return response.json().get('results', [])
-    except requests.RequestException as e:
-        print(f"Nearby Places API Fehler für Standort {lat},{lng}: {e}")
-    return []
-
-def get_place_details(place_id):
-    details_url = f"https://maps.googleapis.com/maps/api/place/details/json"
-    params = {
-        'place_id': place_id,
-        'fields': 'formatted_phone_number,website',
-        'key': API_KEY
-    }
-
-    try:
-        response = requests.get(details_url, params=params, timeout=5)
-        if response.status_code == 200:
-            result = response.json().get('result', {})
-            return result.get('formatted_phone_number', 'N/A'), result.get('website', 'N/A')
-    except requests.RequestException as e:
-        print(f"Place Details API Fehler für Place ID {place_id}: {e}")
-    return 'N/A', 'N/A'
+# ──────────────────────────────────────────────
+# HAUPT-WORKER
+# ──────────────────────────────────────────────

 def process_file(filename, job_id, app):
    with app.app_context():
-        filepath = os.path.join(UPLOAD_FOLDER, filename)
-        results = []
-
        job = Job.query.get(job_id)
-        if not job:
-            print("Job wurde abgebrochen.")
-            return
-        job.status = "In Progress"
-        db.session.commit()
+        if job:
+            job.status = "⏳ Wartet auf anderen Job..."
+            db.session.commit()

-        with open(filepath, newline='', encoding='ISO-8859-1') as csvfile:
-            reader = csv.DictReader(csvfile, delimiter=';')
-            headers = reader.fieldnames
+    with _job_semaphore:
+        print(f"🎯 {filename} Job#{job_id} START!")

-            if not all(field in headers for field in ['PLZ', 'Straße', 'Hausnummer']):
-                print("CSV-Datei enthält nicht alle notwendigen Spalten.")
-                job.status = "Failed"
-                db.session.commit()
+        with app.app_context():
+            job = Job.query.get(job_id)
+            if not job:
+                print("❌ Job missing")
                return

-            for row in reader:
-                plz = row.get('PLZ', '').strip()
-                city = row.get('Stadt', row.get('Bezirk', '')).strip()
-                street = row.get('Straße', '').strip()
-                house_number = row.get('Hausnummer', '').strip()
-                additional = row.get('Zusatz', '').strip()
+            try:
+                job.status = "📊 parsing CSV"
+                db.session.commit()

-                if not all([plz, city, street, house_number]):
-                    continue
+                filepath = os.path.join(UPLOAD_FOLDER, filename)
+                print(f"📁 {filepath} | {os.path.getsize(filepath)}b")

-                full_address = f"{street} {house_number} {additional}, {plz} {city}"
-                lat, lng = get_geocode(full_address)
-                if lat is None or lng is None:
-                    continue
+                df_input = pd.read_csv(filepath, sep=';', encoding='ISO-8859-1')
+                print(f"📊 {df_input.shape}")

-                nearby_places = get_nearby_places(lat, lng)
-                for place in nearby_places:
-                    company_name = place['name']
-                    if company_name in processed_companies:
-                        continue
+                queries = []
+                for _, row in df_input.iterrows():
+                    parts = [
+                        str(row.get('PLZ', '')).strip(),
+                        str(row.get('Stadt', '')).strip(),
+                        str(row.get('Straße', '')).strip(),
+                        str(row.get('Hausnummer', '')).strip(),
+                        str(row.get('Zusatz', '')).strip(),
+                    ]
+                    q = f"Firmen {' '.join(p for p in parts if p and p != 'nan')}".strip()
+                    q = clean_query(q)
+                    if len(q) > 10:
+                        queries.append(q)

-                    processed_companies.add(company_name)
-                    company_address = place.get('vicinity', 'N/A').split(',')[0]
-                    place_id = place.get('place_id')
-                    company_phone, company_website = get_place_details(place_id) if place_id else ('N/A', 'N/A')
+                total_queries = len(queries)
+                print(f"🔍 {total_queries} Queries | Samples: {queries[:3]}")
+                if total_queries == 0:
+                    raise ValueError("Keine gültigen Adressen")

-                    results.append({
-                        'PLZ': plz,
-                        'Stadt': city,
-                        'Straße': street,
-                        'Hausnummer': house_number,
-                        'Zusatz': additional,
-                        'Company Name': company_name,
-                        'Company Address': company_address,
-                        'Company Phone': company_phone,
-                        'Company Website': company_website
-                    })
+                batches = (total_queries + BATCH_SIZE - 1) // BATCH_SIZE

-        if results:
-            result_file = f"results_{os.path.splitext(filename)[0]}.csv"
-            result_path = os.path.join(RESULT_FOLDER, result_file)
-            with open(result_path, 'w', newline='', encoding='utf-8-sig') as csvfile:
-                writer = csv.DictWriter(csvfile, fieldnames=[
-                    'PLZ', 'Stadt', 'Straße', 'Hausnummer', 'Zusatz',
-                    'Company Name', 'Company Address', 'Company Phone', 'Company Website'
-                ])
-                writer.writeheader()
-                writer.writerows(results)
-            job.status = "Completed"
-            job.result_filename = result_file
-            db.session.commit()
-        else:
-            job.status = "Failed"
-            db.session.commit()
+                os.makedirs(RESULT_FOLDER, exist_ok=True)
+                progress = load_progress(job_id)
+                start_batch = progress['last_completed_batch'] + 1 if progress else 0
+                all_results_filtered, all_results_raw = load_partial(job_id) if progress else ([], [])
+
+                eta_initial = format_eta((batches - start_batch) * ((BATCH_DELAY_MAX + MAX_TIME) / 2) / PARALLEL_WORKERS)
+                print(f"📦 {batches} Batches à {BATCH_SIZE} | {PARALLEL_WORKERS}x parallel (Chunk) | Start: {start_batch} | ETA: ~{eta_initial}")
+                job_start_time = time.time()
+                job.status = f"🔄 Batch {start_batch+1}/{batches} | ⏱️ ~{eta_initial}"
+                db.session.commit()
+
+                completed_count = 0
+
+                batch_indices = list(range(start_batch, batches))
+                chunks = [
+                    batch_indices[i:i + PARALLEL_WORKERS]
+                    for i in range(0, len(batch_indices), PARALLEL_WORKERS)
+                ]
+
+                with ThreadPoolExecutor(max_workers=PARALLEL_WORKERS) as executor:
+                    for chunk_idx, chunk in enumerate(chunks):
+                        futures = {}
+
+                        for batch_idx in chunk:
+                            batch_start_q = batch_idx * BATCH_SIZE
+                            batch_end_q   = min(batch_start_q + BATCH_SIZE, total_queries)
+                            batch_queries = queries[batch_start_q:batch_end_q]
+                            scraper_url   = SCRAPER_URLS[batch_idx % len(SCRAPER_URLS)]
+
+                            print(f"\n🚀 Chunk {chunk_idx+1} | Batch {batch_idx+1}/{batches} → {scraper_url}")
+                            time.sleep(random.uniform(1, 2))
+
+                            future = executor.submit(
+                                process_batch,
+                                batch_idx, batch_queries, scraper_url,
+                                filename, job_id, df_input
+                            )
+                            futures[future] = batch_idx
+
+                        for future in as_completed(futures):
+                            batch_idx = futures[future]
+                            completed_count += 1
+                            try:
+                                df_filtered, df_raw = future.result()
+                                if df_filtered is not None:
+                                    all_results_filtered.append(df_filtered)
+                                    all_results_raw.append(df_raw)
+                                    append_partial(job_id, df_filtered, df_raw)
+                            except Exception as e:
+                                print(f"💥 Batch {batch_idx+1} Exception: {e}")
+
+                            save_progress(job_id, batch_idx, batches)
+
+                            elapsed = time.time() - job_start_time
+                            if completed_count > 0:
+                                avg_per_batch = elapsed / completed_count
+                                remaining = (batches - start_batch - completed_count) * avg_per_batch / PARALLEL_WORKERS
+                                eta_str = format_eta(remaining)
+                            else:
+                                eta_str = "?"
+
+                            job.status = f"🔄 {completed_count}/{batches - start_batch} fertig | ⏱️ ~{eta_str}"
+                            db.session.commit()
+
+                        if chunk_idx < len(chunks) - 1:
+                            delay = random.uniform(BATCH_DELAY_MIN, BATCH_DELAY_MAX)
+                            print(f"⏸️ Chunk {chunk_idx+1} fertig – warte {delay:.1f}s...")
+                            time.sleep(delay)
+
+                # ── MERGE & SAVE ──
+                job.status = "🔧 merging results"
+                db.session.commit()
+
+                base = filename.replace('.csv', '')
+
+                if all_results_filtered:
+                    df_final_filtered = pd.concat(all_results_filtered, ignore_index=True)
+                    df_final_filtered = df_final_filtered.drop_duplicates(subset=['title', 'address'])
+
+                    out_filtered = f"results_{base}_filtered.csv"
+                    df_final_filtered.to_csv(
+                        os.path.join(RESULT_FOLDER, out_filtered),
+                        index=False, encoding='utf-8-sig', sep=';'
+                    )
+
+                    out_raw = None
+                    if all_results_raw:
+                        df_final_raw = pd.concat(all_results_raw, ignore_index=True)
+                        out_raw = f"results_{base}_all.csv"
+                        df_final_raw.to_csv(
+                            os.path.join(RESULT_FOLDER, out_raw),
+                            index=False, encoding='utf-8-sig', sep=';'
+                        )
+
+                    job.result_filename     = out_filtered
+                    job.result_filename_raw = out_raw
+                    job.status = f"✅ Fertig: {len(df_final_filtered)} Firmen"
+
+                    cleanup_progress(job_id)
+                else:
+                    job.status = "❌ Keine Ergebnisse"
+
+                db.session.commit()
+                print(f"🎉 Job {job_id} komplett!")
+
+            except Exception as e:
+                job.status = f"Failed: {str(e)[:50]}"
+                print(f"💥 FATAL: {e}")
+                import traceback
+                traceback.print_exc()
+                db.session.commit()
+
+            print(f"✅ DONE! Status: {job.status}")