heko_webcrawler/app/webcrawler.bck02032026
2026-03-10 11:33:18 +01:00

316 lines
12 KiB
Text
Raw Blame History

This file contains invisible Unicode characters

This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import os
import re
import pandas as pd
import requests
import time
import random
from io import StringIO
from app.models import db, Job
print("🆕 MODERN webcrawler LOADED!")
UPLOAD_FOLDER = '/app/uploads'
RESULT_FOLDER = '/app/results'
SCRAPER_URL = "http://gmaps-scraper:8080"
OUTPUT_COLS = ['title', 'category', 'address', 'open_hours', 'website', 'phone', 'link']
# ──────────────────────────────────────────────
# Hilfsfunktionen
# ──────────────────────────────────────────────
def get_batch_size(total_rows):
if total_rows < 50: return 10
elif total_rows < 200: return 10
elif total_rows < 500: return 5
else: return 5
def get_delay(total_rows):
if total_rows < 50: return (5, 10)
elif total_rows < 200: return (10, 20)
else: return (20, 40)
def is_blocked(data):
text = str(data).lower()
blocked = any(kw in text for kw in ['captcha', 'blocked', 'rate limit', 'too many', '429'])
if blocked:
print(f"🚫 BLOCKED: {str(data)[:100]}")
return blocked
def fix_encoding(text):
"""Kaputte ISO→UTF8 Zeichen reparieren (z.B. Industriestraße → Industriestraße)"""
if not isinstance(text, str):
return text
try:
return text.encode('latin-1').decode('utf-8')
except (UnicodeEncodeError, UnicodeDecodeError):
return text
def build_input_addresses(df):
"""Normalisierte Adressen aus Input-CSV für Abgleich"""
addresses = set()
for _, row in df.iterrows():
plz = str(row.get('PLZ', '')).strip()
stadt = str(row.get('Stadt', '')).strip()
str_ = str(row.get('Straße', '')).strip()
nr = str(row.get('Hausnummer', '')).strip()
zusatz = str(row.get('Zusatz', '')).strip()
full = f"{str_} {nr} {zusatz} {plz} {stadt}".lower().strip()
full = ' '.join(full.split())
addresses.add(full)
return addresses
def normalize_address(addr):
"""Output-Adresse normalisieren für Abgleich"""
if not isinstance(addr, str):
return ''
addr = fix_encoding(addr)
return ' '.join(addr.lower().strip().split())
def address_in_input(result_addr, input_addresses):
"""Prüft ob PLZ + Straßenname aus Result im Input vorkommen"""
norm = normalize_address(result_addr)
for inp_addr in input_addresses:
plz_match = re.search(r'\b\d{5}\b', inp_addr)
if plz_match:
plz = plz_match.group()
if plz in norm:
street = inp_addr.split()[0] if inp_addr else ''
if len(street) > 3 and street[:4].lower() in norm:
return True
return False
# ──────────────────────────────────────────────
# CSV Nachbearbeitung (apply_filter umschaltbar)
# ──────────────────────────────────────────────
def process_result_csv(raw_bytes, input_df, apply_filter=True):
"""
Raw CSV → bereinigt:
- Nur OUTPUT_COLS
- Encoding fix
- Optional: Input/Output Abgleich + Duplikate
"""
try:
content = raw_bytes.decode('utf-8', errors='replace')
df_out = pd.read_csv(StringIO(content))
print(f"📄 Raw result: {df_out.shape} | Columns: {list(df_out.columns)[:8]}")
# Spalten filtern
available = [c for c in OUTPUT_COLS if c in df_out.columns]
missing = [c for c in OUTPUT_COLS if c not in df_out.columns]
if missing:
print(f"⚠️ Fehlende Spalten: {missing}")
df_out = df_out[available]
# 🔤 Encoding fix
for col in df_out.columns:
df_out[col] = df_out[col].apply(fix_encoding)
print(f"🔤 Encoding fix: done")
if apply_filter:
# 📍 Input/Output Abgleich
input_addresses = build_input_addresses(input_df)
before = len(df_out)
df_out = df_out[
df_out['address'].apply(
lambda a: address_in_input(a, input_addresses)
)
]
print(f"📍 Adress-Filter: {before} → {len(df_out)} Zeilen")
# 🔁 Duplikate entfernen (immer, auch bei Raw)
before_dedup = len(df_out)
df_out = df_out.drop_duplicates(subset=['title', 'address'], keep='first')
print(f"🔁 Duplikate: {before_dedup} → {len(df_out)} Zeilen")
# Leere Titel entfernen
df_out = df_out.dropna(subset=['title'], how='all')
df_out = df_out[df_out['title'].str.strip().astype(bool)]
print(f"✅ Final ({'gefiltert' if apply_filter else 'alle'}): {df_out.shape}")
return df_out
except Exception as e:
print(f"💥 process_result_csv: {e}")
import traceback
traceback.print_exc()
return None
# ──────────────────────────────────────────────
# Haupt-Worker
# ──────────────────────────────────────────────
def process_file(filename, job_id, app):
print(f"🎯 {filename} Job#{job_id} START!")
with app.app_context():
job = Job.query.get(job_id)
if not job:
print("❌ Job missing")
return
try:
# 1⃣ CSV Parse
job.status = "📊 parsing CSV"
db.session.commit()
filepath = os.path.join(UPLOAD_FOLDER, filename)
print(f"📁 {filepath} | {os.path.getsize(filepath)}b")
df_input = pd.read_csv(filepath, sep=';', encoding='ISO-8859-1')
print(f"📊 {df_input.shape} | Columns: {list(df_input.columns)}")
queries = []
for _, row in df_input.iterrows():
parts = [
str(row.get('PLZ', '')).strip(),
str(row.get('Stadt', '')).strip(),
str(row.get('Straße', '')).strip(),
str(row.get('Hausnummer', '')).strip(),
str(row.get('Zusatz', '')).strip(),
]
q = f"Firmen {' '.join(p for p in parts if p and p != 'nan')}".strip()
if len(q) > 10:
queries.append(q)
total = len(queries)
print(f"🔍 {total} Queries | Samples: {queries[:3]}")
if not queries:
raise ValueError("Keine gültigen Adressen in CSV")
# 2⃣ Batch + Delay
batch_size = get_batch_size(total)
delay_min, delay_max = get_delay(total)
batch = queries[:batch_size]
pre_delay = random.uniform(delay_min, delay_max)
print(f"📦 Batch {len(batch)}/{total} | 😴 {pre_delay:.1f}s Delay")
time.sleep(pre_delay)
# 3⃣ API Call
job.status = "📤 sending to scraper"
db.session.commit()
payload = {
"name": f"{filename.replace('.csv','')}-{job_id}",
"keywords": batch,
"lang": "de",
"depth": 1,
"zoom": 17,
"radius": 50,
"max_time": 60,
"fast_mode": False
}
print(f"🌐 POST {SCRAPER_URL}/api/v1/jobs | {payload['name']}")
resp = requests.post(f"{SCRAPER_URL}/api/v1/jobs", json=payload, timeout=30)
print(f"📤 {resp.status_code}: {resp.text[:300]}")
if is_blocked(resp.text):
raise ValueError("🚫 IP geblockt! Proxy konfigurieren.")
if resp.status_code != 201:
raise ValueError(f"API {resp.status_code}: {resp.text[:200]}")
# 4⃣ Polling
scraper_id = resp.json()['id']
job.scraper_job_id = scraper_id
job.status = "⏳ scraping"
db.session.commit()
print(f"✅ Scraper Job: {scraper_id}")
for i in range(1, 61): # Max 10min
try:
r = requests.get(
f"{SCRAPER_URL}/api/v1/jobs/{scraper_id}",
timeout=10
)
data = r.json()
status = data.get('Status', data.get('status', '?'))
print(f"⏳ {i}/60: {status}")
if is_blocked(data):
raise ValueError("🚫 IP geblockt während scraping!")
if status in ('ok', 'completed', 'scraped'):
dl = requests.get(
f"{SCRAPER_URL}/api/v1/jobs/{scraper_id}/download",
timeout=60
)
if dl.status_code != 200:
raise ValueError(f"Download {dl.status_code}")
if is_blocked(dl.text[:200]):
raise ValueError("🚫 IP geblockt beim Download!")
# 5⃣ Nachbearbeitung → zwei Versionen
job.status = "🔧 processing result"
db.session.commit()
base = filename.replace('.csv', '')
os.makedirs(RESULT_FOLDER, exist_ok=True)
# ── Version A: Gefiltert (Adressabgleich + Deduplizierung) ──
df_filtered = process_result_csv(dl.content, df_input, apply_filter=True)
outname_filtered = f"results_{base}_filtered.csv"
outpath_filtered = os.path.join(RESULT_FOLDER, outname_filtered)
if df_filtered is not None and len(df_filtered) > 0:
df_filtered.to_csv(
outpath_filtered, index=False,
encoding='utf-8-sig', sep=';'
)
print(f"🎯 Filtered: {outname_filtered} → {len(df_filtered)} Firmen")
else:
print("⚠️ Keine Treffer nach Filter leere Datei wird erstellt")
pd.DataFrame(columns=OUTPUT_COLS).to_csv(
outpath_filtered, index=False,
encoding='utf-8-sig', sep=';'
)
# ── Version B: Alle (nur Spalten + Encoding, kein Filter) ──
df_raw = process_result_csv(dl.content, df_input, apply_filter=False)
outname_raw = f"results_{base}_all.csv"
outpath_raw = os.path.join(RESULT_FOLDER, outname_raw)
if df_raw is not None:
df_raw.to_csv(
outpath_raw, index=False,
encoding='utf-8-sig', sep=';'
)
print(f"📋 All: {outname_raw} → {len(df_raw)} Firmen")
else:
print("⚠️ df_raw None Rohinhalt wird gespeichert")
with open(outpath_raw, 'wb') as f:
f.write(dl.content)
# ── DB speichern ──
job.status = "✅ Fertig"
job.result_filename = outname_filtered # 🎯 Gefiltert
job.result_filename_raw = outname_raw # 📋 Alle
db.session.commit()
print(f"🎉 Beide Dateien gespeichert!")
break
elif status in ('failed', 'cancelled', 'error'):
raise ValueError(f"Scraper: {status}")
except requests.RequestException as e:
print(f"⚠️ Poll {i}: {e}")
time.sleep(random.uniform(8, 15))
else:
raise ValueError("Timeout nach 10min")
except Exception as e:
job.status = "Failed"
job.result_filename = str(e)
print(f"💥 ERROR: {e}")
import traceback
traceback.print_exc()
db.session.commit()
print(f"✅ DONE! Status: {job.status}\n")