275 lines
11 KiB
Text
275 lines
11 KiB
Text
import os
|
||
import re
|
||
import pandas as pd
|
||
import requests
|
||
import time
|
||
import random
|
||
from io import StringIO
|
||
from app.models import db, Job
|
||
|
||
# Import-time marker so container logs show which crawler build is active.
print("🆕 MODERN webcrawler LOADED! – BATCHED + PROXY")

# Directory that process_file reads uploaded input CSVs from.
UPLOAD_FOLDER = '/app/uploads'
# Directory that process_file writes merged result CSVs to.
RESULT_FOLDER = '/app/results'
# Base URL of the scraper service's HTTP API.
SCRAPER_URL = "http://gmaps-scraper:8080"

# Columns kept from the scraper's result CSV, in output order.
OUTPUT_COLS = ['title', 'category', 'address', 'open_hours', 'website', 'phone', 'link']

# SECURITY NOTE(review): the fallback below embeds proxy credentials in source
# control. Set WEBCRAWLER_PROXY_URL in the environment instead, then rotate
# this credential and remove the hard-coded default.
PROXY_URL = os.environ.get(
    "WEBCRAWLER_PROXY_URL",
    "http://bitlleuv-rotate:s5hzse6hz74b@p.webshare.io:80",
)
# Proxy mapping in the shape `requests` expects for its `proxies=` argument.
API_PROXIES = {"http": PROXY_URL, "https": PROXY_URL}
|
||
|
||
# ──────────────────────────────────────────────
|
||
# Hilfsfunktionen
|
||
# ──────────────────────────────────────────────
|
||
|
||
def is_blocked(data):
    """Return True if *data* looks like a captcha / rate-limit response.

    *data* is stringified and lower-cased, then searched for the block
    phrases or for a standalone ``429``. When a match is found, the first
    100 characters of the stringified payload are printed.

    Args:
        data: Any object; only its ``str()`` form is inspected.

    Returns:
        bool: True when a block indicator was found.
    """
    text = str(data).lower()
    keyword_hit = any(
        kw in text for kw in ('captcha', 'blocked', 'rate limit', 'too many')
    )
    # '429' must not be glued to other digits/letters; a plain substring test
    # also fires on identifiers such as "7c1d4297-..." that merely contain it.
    status_hit = re.search(r'(?<![0-9a-z])429(?![0-9a-z])', text) is not None
    blocked = keyword_hit or status_hit
    if blocked:
        print(f"🚫 BLOCKED: {str(data)[:100]}")
    return blocked
|
||
|
||
def fix_encoding(text):
    """Undo UTF-8-decoded-as-Latin-1 mojibake where possible.

    Non-string values are returned untouched. A string is re-encoded as
    Latin-1 and decoded as UTF-8; if either step fails the original string
    is returned unchanged.
    """
    if isinstance(text, str):
        try:
            raw = text.encode('latin-1')
            return raw.decode('utf-8')
        except (UnicodeEncodeError, UnicodeDecodeError):
            # Round trip not possible: fall through and hand back the input.
            pass
    return text
|
||
|
||
def build_input_addresses(df):
    """Collect the distinct normalised address strings found in *df*.

    For every row the fields Straße, Hausnummer, Zusatz, PLZ and Stadt (a
    missing column counts as an empty string) are stringified, stripped,
    joined with spaces, lower-cased and whitespace-collapsed.

    Returns:
        set[str]: One entry per distinct normalised address.
    """
    field_order = ('Straße', 'Hausnummer', 'Zusatz', 'PLZ', 'Stadt')

    def _row_to_address(row):
        pieces = [str(row.get(field, '')).strip() for field in field_order]
        joined = ' '.join(pieces).lower().strip()
        # split()/join collapses the gaps left by empty fields.
        return ' '.join(joined.split())

    return {_row_to_address(row) for _, row in df.iterrows()}
|
||
|
||
def normalize_address(addr):
    """Return *addr* encoding-repaired, lower-cased and whitespace-collapsed.

    Anything that is not a string yields the empty string.
    """
    if isinstance(addr, str):
        repaired = fix_encoding(addr)
        tokens = repaired.lower().strip().split()
        return ' '.join(tokens)
    return ''
|
||
|
||
def address_in_input(result_addr, input_addresses):
    """Heuristically decide whether *result_addr* belongs to an input address.

    The result address is normalised first. It matches an input address when
    that address contains a standalone five-digit number which also occurs in
    the normalised result, and the input's first token is longer than three
    characters and its first four characters occur in the normalised result.

    Args:
        result_addr: Address value from a scraper result row.
        input_addresses: Iterable of address strings to compare against.

    Returns:
        bool: True on the first matching input address, otherwise False.
    """
    haystack = normalize_address(result_addr)
    for candidate in input_addresses:
        found = re.search(r'\b\d{5}\b', candidate)
        if not found:
            continue
        if found.group() not in haystack:
            continue
        # The first whitespace-separated token is used as the street name.
        first_token = candidate.split()[0] if candidate else ''
        if len(first_token) > 3 and first_token[:4].lower() in haystack:
            return True
    return False
|
||
|
||
# ──────────────────────────────────────────────
|
||
# CSV Nachbearbeitung
|
||
# ──────────────────────────────────────────────
|
||
|
||
def process_result_csv(raw_bytes, input_df, apply_filter=True):
    """Parse a scraper result CSV into a cleaned, de-duplicated DataFrame.

    Steps: decode the bytes as UTF-8 (undecodable bytes are replaced), keep
    only the OUTPUT_COLS that are present, run fix_encoding over every cell,
    optionally keep only rows whose address matches an input address, drop
    duplicate (title, address) pairs and drop rows with a missing or blank
    title.

    Args:
        raw_bytes: CSV payload as bytes.
        input_df: Input address table; only read when *apply_filter* is true.
        apply_filter: When true, keep only rows accepted by address_in_input.

    Returns:
        pandas.DataFrame on success, or None if the payload cannot be parsed
        or lacks the 'title' / 'address' columns (the reason is printed).
    """
    try:
        content = raw_bytes.decode('utf-8', errors='replace')
        df_out = pd.read_csv(StringIO(content))
        print(f"📄 Raw result: {df_out.shape}")

        # Both columns are needed by the de-duplication below; fail with a
        # readable message instead of a bare KeyError.
        missing = [c for c in ('title', 'address') if c not in df_out.columns]
        if missing:
            print(f"💥 process_result_csv: missing columns {missing}")
            return None

        available = [c for c in OUTPUT_COLS if c in df_out.columns]
        # .copy() so the column assignments below write to an independent
        # frame rather than a slice of the parsed one.
        df_out = df_out[available].copy()

        for col in df_out.columns:
            df_out[col] = df_out[col].apply(fix_encoding)

        if apply_filter:
            input_addresses = build_input_addresses(input_df)
            before = len(df_out)
            # astype(bool) keeps this a boolean row mask even when the frame
            # is empty and apply() yields an object-dtype Series.
            keep = df_out['address'].apply(
                lambda a: address_in_input(a, input_addresses)
            ).astype(bool)
            df_out = df_out[keep]
            print(f"📍 Filter: {before} → {len(df_out)}")

        df_out = df_out.drop_duplicates(subset=['title', 'address'], keep='first')
        df_out = df_out.dropna(subset=['title'], how='all')
        # astype(str) so .str also works when every title parsed as a number.
        df_out = df_out[df_out['title'].astype(str).str.strip().astype(bool)]

        print(f"✅ Final ({'gefiltert' if apply_filter else 'alle'}): {df_out.shape}")
        return df_out
    except Exception as e:
        # Best-effort boundary: callers treat None as "no usable result".
        print(f"💥 process_result_csv: {e}")
        return None
|
||
|
||
# ──────────────────────────────────────────────
|
||
# HAUPT-WORKER
|
||
# ──────────────────────────────────────────────
|
||
|
||
def _build_queries(df_input):
    """Build one 'Firmen <address>' search query per usable input row."""
    fields = ('PLZ', 'Stadt', 'Straße', 'Hausnummer', 'Zusatz')
    queries = []
    for _, row in df_input.iterrows():
        parts = [str(row.get(field, '')).strip() for field in fields]
        # Missing cells stringify to 'nan'; leave them out of the query.
        q = f"Firmen {' '.join(p for p in parts if p and p != 'nan')}".strip()
        if len(q) > 10:
            queries.append(q)
    return queries


def _run_batch(payload, df_input, batch_no):
    """Submit one scraper job, poll it and return its processed results.

    Returns:
        (df_filtered, df_raw) on success, otherwise None (reason is printed).
        Network/JSON errors propagate to the caller.
    """
    resp = requests.post(f"{SCRAPER_URL}/api/v1/jobs", json=payload, timeout=45)
    print(f"📤 {resp.status_code}")
    if resp.status_code != 201:
        # Only inspect the body of failed responses: a successful body carries
        # a job id that could accidentally contain a block keyword.
        if is_blocked(resp.text):
            print("🚫 Batch übersprungen (blocked)")
        else:
            print(f"⚠️ Batch {batch_no} fehlgeschlagen: {resp.text[:100]}")
        return None

    scraper_id = resp.json()['id']
    print(f"✅ Scraper: {scraper_id}")

    for _ in range(60):
        r = requests.get(f"{SCRAPER_URL}/api/v1/jobs/{scraper_id}", timeout=15)
        data = r.json()
        # The status key is looked up under both capitalisations.
        status = data.get('Status', data.get('status', '?'))

        if status in ('ok', 'completed', 'scraped'):
            dl = requests.get(
                f"{SCRAPER_URL}/api/v1/jobs/{scraper_id}/download",
                timeout=90,
            )
            if dl.status_code != 200:
                print(f"⚠️ Batch {batch_no}: Download fehlgeschlagen ({dl.status_code})")
                return None
            df_filtered = process_result_csv(dl.content, df_input, True)
            df_raw = process_result_csv(dl.content, df_input, False)
            if df_filtered is None or df_raw is None:
                return None
            print(f"📊 Batch {batch_no}: {len(df_filtered)} filtered")
            return df_filtered, df_raw

        if status in ('failed', 'error'):
            print(f"💥 Batch {batch_no}: {status}")
            return None

        time.sleep(random.uniform(5, 10))

    print(f"⏰ Batch {batch_no}: Timeout beim Polling")
    return None


def process_file(filename, job_id, app):
    """Run the scraping job for one uploaded CSV and store the results.

    Reads ``UPLOAD_FOLDER/filename`` (semicolon-separated, ISO-8859-1),
    builds search queries, sends them to the scraper service in batches with
    a random delay before each batch, merges the per-batch results and writes
    ``results_<base>_filtered.csv`` and ``results_<base>_all.csv`` into
    RESULT_FOLDER. Progress and the final outcome are written to the Job
    row's ``status``; any exception ends up there as ``Failed: ...`` instead
    of propagating.

    Args:
        filename: Name of the uploaded CSV inside UPLOAD_FOLDER.
        job_id: Primary key of the Job row to update.
        app: Application object providing ``app_context()``.
    """
    print(f"🎯 {filename} Job#{job_id} START!")

    with app.app_context():
        job = Job.query.get(job_id)
        if not job:
            print("❌ Job missing")
            return

        try:
            # --- Parse input and build all queries ---
            job.status = "📊 parsing CSV"
            db.session.commit()

            filepath = os.path.join(UPLOAD_FOLDER, filename)
            print(f"📁 {filepath} | {os.path.getsize(filepath)}b")

            # dtype=str keeps postal codes / house numbers verbatim: no lost
            # leading zeros and no "12345.0" when a column has empty cells.
            df_input = pd.read_csv(
                filepath, sep=';', encoding='ISO-8859-1', dtype=str
            )
            print(f"📊 {df_input.shape}")

            queries = _build_queries(df_input)
            total_queries = len(queries)
            print(f"🔍 {total_queries} Queries | Samples: {queries[:3]}")
            if total_queries == 0:
                raise ValueError("Keine gültigen Adressen")

            # --- Batched processing ---
            BATCH_SIZE = 10
            BATCH_DELAY_MIN, BATCH_DELAY_MAX = 10, 20  # seconds
            batches = (total_queries + BATCH_SIZE - 1) // BATCH_SIZE
            # Estimate covers only the inter-batch delays, so it is a floor.
            eta_min = batches * (BATCH_DELAY_MIN + BATCH_DELAY_MAX) / 2 / 60
            print(f"📦 {batches} Batches à {BATCH_SIZE} | ETA: ~{eta_min:.0f}min")

            # Never log the proxy credentials; keep only the host part.
            proxy_label = PROXY_URL.rsplit('@', 1)[-1]

            all_results_filtered = []
            all_results_raw = []

            for batch_idx in range(batches):
                # Updated at the top of the loop so skipped/failed batches
                # still advance the counter and it never exceeds `batches`.
                job.status = f"🔄 Batch {batch_idx+1}/{batches}"
                db.session.commit()

                batch_start = batch_idx * BATCH_SIZE
                batch_end = min(batch_start + BATCH_SIZE, total_queries)
                batch_queries = queries[batch_start:batch_end]
                print(f"\n🔄 BATCH {batch_idx+1}/{batches} ({batch_start+1}-{batch_end}/{total_queries})")

                # Random delay before every batch.
                delay = random.uniform(BATCH_DELAY_MIN, BATCH_DELAY_MAX)
                print(f"😴 Delay: {delay:.0f}s | Proxy: {proxy_label}")
                time.sleep(delay)

                payload = {
                    "name": f"{filename.replace('.csv','')}-{job_id}-B{batch_idx+1:03d}",
                    "keywords": batch_queries,
                    "lang": "de",
                    "depth": 1,
                    "zoom": 17,
                    "radius": 50,
                    "max_time": 60,
                    "fast_mode": False,
                    "proxies": [PROXY_URL],
                }

                try:
                    outcome = _run_batch(payload, df_input, batch_idx + 1)
                except Exception as e:
                    # One failing batch must not abort the whole job.
                    print(f"💥 Batch {batch_idx+1}: {e}")
                    outcome = None

                if outcome is not None:
                    df_filtered, df_raw = outcome
                    all_results_filtered.append(df_filtered)
                    all_results_raw.append(df_raw)

            # --- Merge & save ---
            job.status = "🔧 merging results"
            db.session.commit()

            base = filename.replace('.csv', '')
            os.makedirs(RESULT_FOLDER, exist_ok=True)

            # Both lists are always appended together, so one check suffices.
            if all_results_filtered:
                df_final_filtered = pd.concat(all_results_filtered, ignore_index=True)
                df_final_filtered = df_final_filtered.drop_duplicates(subset=['title', 'address'])
                out_filtered = f"results_{base}_filtered.csv"
                df_final_filtered.to_csv(
                    os.path.join(RESULT_FOLDER, out_filtered),
                    index=False, encoding='utf-8-sig', sep=';'
                )

                df_final_raw = pd.concat(all_results_raw, ignore_index=True)
                out_raw = f"results_{base}_all.csv"
                df_final_raw.to_csv(
                    os.path.join(RESULT_FOLDER, out_raw),
                    index=False, encoding='utf-8-sig', sep=';'
                )

                job.result_filename = out_filtered
                job.result_filename_raw = out_raw
                job.status = f"✅ Fertig: {len(df_final_filtered)} Firmen"
            else:
                job.status = "❌ Keine Ergebnisse"

            db.session.commit()
            print(f"🎉 Job {job_id} komplett!")

        except Exception as e:
            # Reset the session first so a failed flush/commit cannot block
            # the status write below.
            db.session.rollback()
            job.status = f"Failed: {str(e)[:50]}"
            print(f"💥 FATAL: {e}")
            import traceback
            traceback.print_exc()
            db.session.commit()

        print(f"✅ DONE! Status: {job.status}")
|