Initial commit

2026-03-10 11:33:18 +01:00 · 2026-03-10 11:33:18 +01:00 · df8c2313a9
commit df8c2313a9
parent 387bc056b9
275 changed files with 12939 additions and 263 deletions
--- a/app/webcrawler.bck04032026
+++ b/app/webcrawler.bck04032026
@@ -0,0 +1,275 @@
+import os
+import re
+import pandas as pd
+import requests
+import time
+import random
+from io import StringIO
+from app.models import db, Job
+
+print("🆕 MODERN webcrawler LOADED! – BATCHED + PROXY")
+
+# Directory process_file() reads the uploaded input CSVs from.
+UPLOAD_FOLDER = '/app/uploads'
+# Directory process_file() writes the merged result CSVs to.
+RESULT_FOLDER = '/app/results'
+# Base URL of the scraper service's HTTP API (jobs are created under /api/v1/jobs).
+SCRAPER_URL = "http://gmaps-scraper:8080"
+
+# Columns kept from the scraper's result CSV, in output order.
+OUTPUT_COLS = ['title', 'category', 'address', 'open_hours', 'website', 'phone', 'link']
+
+# Proxy handed to the scraper in every job payload.
+# SECURITY NOTE(review): the default below embeds credentials in source
+# control. It is kept only so existing deployments keep working; set the
+# PROXY_URL environment variable instead, then rotate this secret and remove
+# the literal.
+PROXY_URL = os.environ.get(
+    "PROXY_URL",
+    "http://bitlleuv-rotate:s5hzse6hz74b@p.webshare.io:80",
+)
+# Proxy mapping in the shape `requests` expects for its ``proxies=`` argument.
+API_PROXIES = {"http": PROXY_URL, "https": PROXY_URL}
+
+# ──────────────────────────────────────────────
+# Hilfsfunktionen
+# ──────────────────────────────────────────────
+
+def is_blocked(data):
+    """Return True when *data* looks like a block / rate-limit response.
+
+    *data* is stringified and searched case-insensitively for the phrases
+    'captcha', 'blocked', 'rate limit' and 'too many', and for the HTTP
+    status code 429. When a marker is found, the first 100 characters of the
+    response are printed.
+    """
+    text = str(data).lower()
+    has_phrase = any(
+        kw in text for kw in ('captcha', 'blocked', 'rate limit', 'too many')
+    )
+    # Match 429 only as a standalone token. A plain substring test also fires
+    # on digit runs inside job IDs or phone numbers and would make a
+    # successful response look blocked.
+    has_status = re.search(r'\b429\b', text) is not None
+    blocked = has_phrase or has_status
+    if blocked:
+        print(f"🚫 BLOCKED: {str(data)[:100]}")
+    return blocked
+
+def fix_encoding(text):
+    """Repair UTF-8 text that was mis-decoded as Latin-1 (mojibake).
+
+    Non-string values are returned untouched. Strings that cannot be
+    round-tripped through Latin-1 -> UTF-8 are returned unchanged as well.
+    """
+    if isinstance(text, str):
+        try:
+            repaired = text.encode('latin-1').decode('utf-8')
+        except (UnicodeEncodeError, UnicodeDecodeError):
+            # Either the text has characters outside Latin-1 or its bytes are
+            # not valid UTF-8, so it is not mojibake; keep it as it is.
+            repaired = text
+        return repaired
+    return text
+
+def build_input_addresses(df):
+    """Collect the normalised address string of every row in *df*.
+
+    Each row becomes "<Straße> <Hausnummer> <Zusatz> <PLZ> <Stadt>",
+    lower-cased and with runs of whitespace collapsed. Missing columns
+    contribute an empty string. Returns a set, so duplicate rows collapse.
+    """
+    column_order = ('Straße', 'Hausnummer', 'Zusatz', 'PLZ', 'Stadt')
+
+    def _cell(row, column):
+        # Stringify first so numeric cells (e.g. PLZ parsed as int) work too.
+        return str(row.get(column, '')).strip()
+
+    collected = set()
+    for _, row in df.iterrows():
+        joined = ' '.join(_cell(row, column) for column in column_order)
+        # split()/join collapses inner whitespace and trims both ends.
+        collected.add(' '.join(joined.lower().split()))
+    return collected
+
+def normalize_address(addr):
+    """Return *addr* encoding-repaired, lower-cased and whitespace-collapsed.
+
+    Anything that is not a string (e.g. a NaN cell) yields ''.
+    """
+    if isinstance(addr, str):
+        # split() already discards leading/trailing whitespace.
+        tokens = fix_encoding(addr).lower().split()
+        return ' '.join(tokens)
+    return ''
+
+def address_in_input(result_addr, input_addresses):
+    """Tell whether a scraped address loosely matches any input address.
+
+    An input address matches when its first 5-digit number (the PLZ) occurs
+    in the normalised result address and its first token (the street) is
+    longer than 3 characters and that token's first 4 characters occur there
+    too.
+    """
+    norm = normalize_address(result_addr)
+
+    def _matches(candidate):
+        found = re.search(r'\b\d{5}\b', candidate)
+        if found is None or found.group() not in norm:
+            return False
+        # A candidate that contains a PLZ is non-empty, so token 0 exists.
+        street = candidate.split()[0]
+        return len(street) > 3 and street[:4].lower() in norm
+
+    return any(_matches(candidate) for candidate in input_addresses)
+
+# ──────────────────────────────────────────────
+# CSV Nachbearbeitung
+# ──────────────────────────────────────────────
+
+def process_result_csv(raw_bytes, input_df, apply_filter=True):
+    """Parse, clean and optionally filter one scraper result CSV.
+
+    Args:
+        raw_bytes: CSV payload as downloaded from the scraper.
+        input_df: the job's input addresses; read only when *apply_filter*
+            is true.
+        apply_filter: keep only rows whose address matches an input address.
+
+    Returns:
+        A DataFrame restricted to the OUTPUT_COLS present in the CSV,
+        de-duplicated on (title, address) and without empty titles, or None
+        when the CSV cannot be processed. Errors are printed, never raised.
+    """
+    try:
+        content = raw_bytes.decode('utf-8', errors='replace')
+        df_out = pd.read_csv(StringIO(content))
+        print(f"📄 Raw result: {df_out.shape}")
+
+        # Both columns are needed below; report their absence explicitly
+        # instead of failing later with an opaque KeyError.
+        missing = [c for c in ('title', 'address') if c not in df_out.columns]
+        if missing:
+            print(f"💥 process_result_csv: missing column(s) {missing}")
+            return None
+
+        available = [c for c in OUTPUT_COLS if c in df_out.columns]
+        # Work on an independent copy so the column assignments below do not
+        # target a slice of the parsed frame.
+        df_out = df_out[available].copy()
+
+        for col in available:
+            df_out[col] = df_out[col].apply(fix_encoding)
+
+        if apply_filter:
+            input_addresses = build_input_addresses(input_df)
+            before = len(df_out)
+            # Force a boolean mask so that filtering a zero-row result still
+            # selects rows rather than being read as a list of column labels.
+            mask = df_out['address'].apply(
+                lambda a: address_in_input(a, input_addresses)
+            ).astype(bool)
+            df_out = df_out[mask]
+            print(f"📍 Filter: {before} → {len(df_out)}")
+
+        df_out = df_out.drop_duplicates(subset=['title', 'address'], keep='first')
+        df_out = df_out.dropna(subset=['title'], how='all')
+        # astype(str) keeps a purely numeric title column from breaking .str
+        df_out = df_out[df_out['title'].astype(str).str.strip().astype(bool)]
+
+        print(f"✅ Final ({'gefiltert' if apply_filter else 'alle'}): {df_out.shape}")
+        return df_out
+    except Exception as e:
+        # Deliberate best-effort boundary: callers treat None as "skip batch".
+        print(f"💥 process_result_csv: {e}")
+        return None
+
+# ──────────────────────────────────────────────
+# HAUPT-WORKER
+# ──────────────────────────────────────────────
+
+def _build_queries(df_input):
+    """Build one "Firmen <address parts>" search query per usable input row.
+
+    Empty cells and 'nan' placeholders are left out; queries of 10 characters
+    or fewer are dropped.
+    """
+    columns = ('PLZ', 'Stadt', 'Straße', 'Hausnummer', 'Zusatz')
+    queries = []
+    for _, row in df_input.iterrows():
+        parts = [str(row.get(col, '')).strip() for col in columns]
+        q = f"Firmen {' '.join(p for p in parts if p and p != 'nan')}".strip()
+        if len(q) > 10:
+            queries.append(q)
+    return queries
+
+
+def _mask_proxy(url):
+    """Return *url* with any "user:password@" part hidden, for logging."""
+    return re.sub(r'//[^/@]+@', '//***@', url)
+
+
+def _scrape_batch(payload, df_input, batch_no):
+    """Submit one scraper job, poll it and return its processed results.
+
+    Returns a ``(df_filtered, df_raw)`` tuple on success (``df_raw`` may be
+    None), or None when the batch was blocked, rejected, failed, timed out or
+    produced no usable CSV. Network errors propagate to the caller.
+    """
+    resp = requests.post(f"{SCRAPER_URL}/api/v1/jobs", json=payload, timeout=45)
+    print(f"📤 {resp.status_code}")
+    if is_blocked(resp.text):
+        print("🚫 Batch übersprungen (blocked)")
+        return None
+    if resp.status_code != 201:
+        print(f"⚠️ Batch {batch_no} fehlgeschlagen: {resp.text[:100]}")
+        return None
+
+    scraper_id = resp.json()['id']
+    print(f"✅ Scraper: {scraper_id}")
+
+    # Up to 60 polls, 5-10 s apart.
+    for _ in range(1, 61):
+        r = requests.get(f"{SCRAPER_URL}/api/v1/jobs/{scraper_id}", timeout=15)
+        data = r.json()
+        # The status is read from either a capitalised or a lower-case key.
+        status = data.get('Status', data.get('status', '?'))
+
+        if status in ('ok', 'completed', 'scraped'):
+            dl = requests.get(
+                f"{SCRAPER_URL}/api/v1/jobs/{scraper_id}/download",
+                timeout=90
+            )
+            if dl.status_code != 200:
+                print(f"⚠️ Batch {batch_no}: Download fehlgeschlagen ({dl.status_code})")
+                return None
+            df_filtered = process_result_csv(dl.content, df_input, True)
+            df_raw = process_result_csv(dl.content, df_input, False)
+            if df_filtered is None:
+                return None
+            print(f"📊 Batch {batch_no}: {len(df_filtered)} filtered")
+            return df_filtered, df_raw
+        if status in ('failed', 'error'):
+            print(f"💥 Batch {batch_no}: {status}")
+            return None
+
+        time.sleep(random.uniform(5, 10))
+
+    # The poll budget ran out without a terminal status.
+    print(f"⏱️ Batch {batch_no}: Timeout")
+    return None
+
+
+def process_file(filename, job_id, app):
+    """Run the scraping job for one uploaded address CSV.
+
+    Reads ``UPLOAD_FOLDER/filename`` (';'-separated, ISO-8859-1), turns every
+    row into a search query, sends the queries to the scraper service in
+    batches, merges the per-batch results and writes
+    ``results_<base>_filtered.csv`` and ``results_<base>_all.csv`` to
+    RESULT_FOLDER. Progress and the final outcome are stored on the Job row
+    (``status``, ``result_filename``, ``result_filename_raw``).
+
+    Args:
+        filename: name of the uploaded CSV inside UPLOAD_FOLDER.
+        job_id: primary key of the Job row to update.
+        app: application object whose ``app_context()`` is entered for the
+            database work.
+
+    Returns:
+        None. Failures are recorded in ``job.status`` instead of being raised.
+    """
+    print(f"🎯 {filename} Job#{job_id} START!")
+
+    with app.app_context():
+        job = Job.query.get(job_id)
+        if not job:
+            print("❌ Job missing")
+            return
+
+        try:
+            # Parse the upload and build all queries
+            job.status = "📊 parsing CSV"
+            db.session.commit()
+
+            filepath = os.path.join(UPLOAD_FOLDER, filename)
+            print(f"📁 {filepath} | {os.path.getsize(filepath)}b")
+
+            df_input = pd.read_csv(filepath, sep=';', encoding='ISO-8859-1')
+            print(f"📊 {df_input.shape}")
+
+            queries = _build_queries(df_input)
+            total_queries = len(queries)
+            print(f"🔍 {total_queries} Queries | Samples: {queries[:3]}")
+            if total_queries == 0:
+                raise ValueError("Keine gültigen Adressen")
+
+            # Batched processing
+            BATCH_SIZE = 10                              # raised 5 -> 10 (paid proxy)
+            BATCH_DELAY_MIN, BATCH_DELAY_MAX = 10, 20   # lowered 30-60s -> 10-20s (paid proxy)
+            batches = (total_queries + BATCH_SIZE - 1) // BATCH_SIZE
+            # The estimate covers only the inter-batch delays (seconds / 60),
+            # so it is reported in minutes.
+            avg_delay = (BATCH_DELAY_MIN + BATCH_DELAY_MAX) / 2
+            print(f"📦 {batches} Batches à {BATCH_SIZE} | ETA: ~{batches * avg_delay / 60:.0f}min")
+
+            all_results_filtered = []
+            all_results_raw = []
+            job.status = f"🔄 Batch 1/{batches}"
+            db.session.commit()
+
+            for batch_idx in range(batches):
+                batch_no = batch_idx + 1
+                batch_start = batch_idx * BATCH_SIZE
+                batch_end = min(batch_start + BATCH_SIZE, total_queries)
+                batch_queries = queries[batch_start:batch_end]
+                print(f"\n🔄 BATCH {batch_no}/{batches} ({batch_start+1}-{batch_end}/{total_queries})")
+
+                # Random delay before every batch
+                delay = random.uniform(BATCH_DELAY_MIN, BATCH_DELAY_MAX)
+                # Never write the proxy credentials to the log.
+                print(f"😴 Delay: {delay:.0f}s | Proxy: {_mask_proxy(PROXY_URL)}")
+                time.sleep(delay)
+
+                payload = {
+                    "name": f"{filename.replace('.csv','')}-{job_id}-B{batch_no:03d}",
+                    "keywords": batch_queries,
+                    "lang": "de",
+                    "depth": 1,
+                    "zoom": 17,
+                    "radius": 50,
+                    "max_time": 60,       # lowered 120 -> 60 (paid proxy is faster)
+                    "fast_mode": False,
+                    "proxies": [PROXY_URL]
+                }
+
+                try:
+                    outcome = _scrape_batch(payload, df_input, batch_no)
+                except Exception as e:
+                    # Best effort: one failing batch must not abort the job.
+                    print(f"💥 Batch {batch_no}: {e}")
+                    outcome = None
+
+                if outcome is not None:
+                    df_filtered, df_raw = outcome
+                    all_results_filtered.append(df_filtered)
+                    if df_raw is not None:
+                        all_results_raw.append(df_raw)
+
+                # Advance the status after every batch, including skipped
+                # ones, but never past the last batch.
+                if batch_no < batches:
+                    job.status = f"🔄 Batch {batch_no + 1}/{batches}"
+                    db.session.commit()
+
+            # Merge and save
+            job.status = "🔧 merging results"
+            db.session.commit()
+
+            base = filename.replace('.csv', '')
+            os.makedirs(RESULT_FOLDER, exist_ok=True)
+
+            if all_results_filtered:
+                df_final_filtered = pd.concat(all_results_filtered, ignore_index=True)
+                df_final_filtered = df_final_filtered.drop_duplicates(subset=['title', 'address'])
+
+                out_filtered = f"results_{base}_filtered.csv"
+                df_final_filtered.to_csv(
+                    os.path.join(RESULT_FOLDER, out_filtered),
+                    index=False, encoding='utf-8-sig', sep=';'
+                )
+                job.result_filename = out_filtered
+
+                # Record the raw file name only if that file was written.
+                if all_results_raw:
+                    df_final_raw = pd.concat(all_results_raw, ignore_index=True)
+                    out_raw = f"results_{base}_all.csv"
+                    df_final_raw.to_csv(
+                        os.path.join(RESULT_FOLDER, out_raw),
+                        index=False, encoding='utf-8-sig', sep=';'
+                    )
+                    job.result_filename_raw = out_raw
+
+                job.status = f"✅ Fertig: {len(df_final_filtered)} Firmen"
+            else:
+                job.status = "❌ Keine Ergebnisse"
+
+            db.session.commit()
+            print(f"🎉 Job {job_id} komplett!")
+
+        except Exception as e:
+            job.status = f"Failed: {str(e)[:50]}"
+            print(f"💥 FATAL: {e}")
+            import traceback
+            traceback.print_exc()
+            db.session.commit()
+
+        print(f"✅ DONE! Status: {job.status}")