import os
import random
import re
import time
from io import StringIO

import pandas as pd
import requests

from app.models import db, Job

print("🆕 MODERN webcrawler LOADED! – BATCHED + PROXY")

# Directories shared with the web app (container mounts).
UPLOAD_FOLDER = '/app/uploads'
RESULT_FOLDER = '/app/results'

# Internal gmaps-scraper service endpoint (docker-compose hostname).
SCRAPER_URL = "http://gmaps-scraper:8080"

# Columns kept from the scraper's CSV output, in this order.
OUTPUT_COLS = ['title', 'category', 'address', 'open_hours', 'website', 'phone', 'link']

# SECURITY NOTE(review): proxy credentials are hard-coded and also appear in
# log output further down -- they should be moved to an environment variable
# or secret store and redacted from logs.
PROXY_URL = "http://bitlleuv-rotate:s5hzse6hz74b@p.webshare.io:80"
API_PROXIES = {"http": PROXY_URL, "https": PROXY_URL}


# ──────────────────────────────────────────────
# Helper functions
# ──────────────────────────────────────────────
def is_blocked(data):
    """Return True if *data* looks like a captcha / rate-limit response.

    The check is a simple case-insensitive keyword scan over the stringified
    payload; a hit is logged with a 100-char preview.
    """
    text = str(data).lower()
    blocked = any(kw in text for kw in ['captcha', 'blocked', 'rate limit', 'too many', '429'])
    if blocked:
        print(f"🚫 BLOCKED: {str(data)[:100]}")
    return blocked


def fix_encoding(text):
    """Repair latin-1/utf-8 mojibake in *text*; non-strings pass through.

    A string that was UTF-8 bytes mis-decoded as latin-1 is round-tripped
    back; anything that cannot be round-tripped is returned unchanged.
    """
    if not isinstance(text, str):
        return text
    try:
        return text.encode('latin-1').decode('utf-8')
    except (UnicodeEncodeError, UnicodeDecodeError):
        return text


def build_input_addresses(df):
    """Build a set of normalized (lowercase, single-spaced) full addresses.

    Reads the German-named columns PLZ/Stadt/Straße/Hausnummer/Zusatz from
    the input DataFrame.

    FIX: missing cells stringify to 'nan' via str(); those tokens are now
    skipped, consistent with the query builder in process_file (which
    filters p != 'nan'). Previously addresses could contain literal 'nan'
    words, degrading the street-prefix matching in address_in_input.
    """
    addresses = set()
    for _, row in df.iterrows():
        parts = [
            str(row.get('Straße', '')).strip(),
            str(row.get('Hausnummer', '')).strip(),
            str(row.get('Zusatz', '')).strip(),
            str(row.get('PLZ', '')).strip(),
            str(row.get('Stadt', '')).strip(),
        ]
        full = ' '.join(p for p in parts if p and p.lower() != 'nan').lower()
        addresses.add(full)
    return addresses


def normalize_address(addr):
    """Return *addr* mojibake-fixed, lowercased and single-spaced ('' if not a str)."""
    if not isinstance(addr, str):
        return ''
    addr = fix_encoding(addr)
    return ' '.join(addr.lower().strip().split())


def address_in_input(result_addr, input_addresses):
    """Heuristically decide whether a scraped address matches any input address.

    A match requires the input address's 5-digit PLZ to appear in the
    normalized result address AND the first 4 characters of the input's
    first token (the street name) to appear as a substring as well.
    Returns False when no input address matches.
    """
    norm = normalize_address(result_addr)
    for inp_addr in input_addresses:
        plz_match = re.search(r'\b\d{5}\b', inp_addr)
        if plz_match:
            plz = plz_match.group()
            if plz in norm:
                street = inp_addr.split()[0] if inp_addr else ''
                if len(street) > 3 and street[:4].lower() in norm:
                    return True
    return False
# ──────────────────────────────────────────────
# CSV post-processing
# ──────────────────────────────────────────────
def process_result_csv(raw_bytes, input_df, apply_filter=True):
    """Parse a scraper result CSV and clean it up.

    Args:
        raw_bytes: raw CSV bytes downloaded from the scraper service.
        input_df: original input DataFrame, used to build the address filter.
        apply_filter: when True keep only rows whose address matches an
            input address (see address_in_input); when False keep all rows.

    Returns:
        Cleaned pandas DataFrame, or None on any parsing/processing error
        (errors are logged, never raised, so one bad batch is skipped).
    """
    try:
        content = raw_bytes.decode('utf-8', errors='replace')
        df_out = pd.read_csv(StringIO(content))
        print(f"📄 Raw result: {df_out.shape}")

        # Keep only the known output columns that actually exist.
        available = [c for c in OUTPUT_COLS if c in df_out.columns]
        df_out = df_out[available]
        for col in df_out.columns:
            df_out[col] = df_out[col].apply(fix_encoding)

        # FIX: guard against a missing 'address' column — previously this
        # raised KeyError and the entire batch result was discarded.
        if apply_filter and 'address' in df_out.columns:
            input_addresses = build_input_addresses(input_df)
            before = len(df_out)
            df_out = df_out[
                df_out['address'].apply(
                    lambda a: address_in_input(a, input_addresses)
                )
            ]
            print(f"📍 Filter: {before} → {len(df_out)}")

        # FIX: dedupe/clean only on the key columns that are present
        # (previously a hard-coded subset raised if a column was missing).
        dedup_cols = [c for c in ('title', 'address') if c in df_out.columns]
        if dedup_cols:
            df_out = df_out.drop_duplicates(subset=dedup_cols, keep='first')
        if 'title' in df_out.columns:
            df_out = df_out.dropna(subset=['title'], how='all')
            # astype(str) makes the emptiness test safe for non-string titles.
            df_out = df_out[df_out['title'].astype(str).str.strip().astype(bool)]

        print(f"✅ Final ({'gefiltert' if apply_filter else 'alle'}): {df_out.shape}")
        return df_out
    except Exception as e:
        print(f"💥 process_result_csv: {e}")
        return None


# ──────────────────────────────────────────────
# MAIN WORKER
# ──────────────────────────────────────────────
def process_file(filename, job_id, app):
    """Run the full scrape pipeline for one uploaded CSV file.

    Reads the input CSV from UPLOAD_FOLDER, builds one query per row, sends
    them to the gmaps-scraper service in batches (with random delays and a
    rotating proxy), post-processes each batch, then merges and writes a
    filtered and an unfiltered result CSV to RESULT_FOLDER. Progress and
    errors are persisted on the Job row; this function never raises.

    Args:
        filename: name of the uploaded CSV inside UPLOAD_FOLDER.
        job_id: primary key of the Job row to update.
        app: Flask app, needed for the DB app context.
    """
    print(f"🎯 (unknown) Job#{job_id} START!")
    with app.app_context():
        job = Job.query.get(job_id)
        if not job:
            print("❌ Job missing")
            return
        try:
            # ---- Parse input + build ALL queries -------------------------
            job.status = "📊 parsing CSV"
            db.session.commit()
            filepath = os.path.join(UPLOAD_FOLDER, filename)
            print(f"📁 {filepath} | {os.path.getsize(filepath)}b")
            df_input = pd.read_csv(filepath, sep=';', encoding='ISO-8859-1')
            print(f"📊 {df_input.shape}")

            queries = []
            for _, row in df_input.iterrows():
                parts = [
                    str(row.get('PLZ', '')).strip(),
                    str(row.get('Stadt', '')).strip(),
                    str(row.get('Straße', '')).strip(),
                    str(row.get('Hausnummer', '')).strip(),
                    str(row.get('Zusatz', '')).strip(),
                ]
                # Missing cells stringify to 'nan' — skip them.
                q = f"Firmen {' '.join(p for p in parts if p and p != 'nan')}".strip()
                if len(q) > 10:  # drop near-empty queries
                    queries.append(q)
            total_queries = len(queries)
            print(f"🔍 {total_queries} Queries | Samples: {queries[:3]}")
            if total_queries == 0:
                raise ValueError("Keine gültigen Adressen")

            # ---- BATCHED processing --------------------------------------
            BATCH_SIZE = 10                          # raised 5 → 10 (paid proxy)
            BATCH_DELAY_MIN, BATCH_DELAY_MAX = 10, 20  # lowered 30-60s → 10-20s (paid proxy)
            batches = (total_queries + BATCH_SIZE - 1) // BATCH_SIZE
            # NOTE(review): rough estimate only — assumes ~15 min per batch.
            print(f"📦 {batches} Batches à {BATCH_SIZE} | ETA: ~{batches*15//60:.0f}h")

            all_results_filtered = []
            all_results_raw = []
            job.status = f"🔄 Batch 1/{batches}"
            db.session.commit()

            for batch_idx in range(batches):
                batch_start = batch_idx * BATCH_SIZE
                batch_end = min(batch_start + BATCH_SIZE, total_queries)
                batch_queries = queries[batch_start:batch_end]
                print(f"\n🔄 BATCH {batch_idx+1}/{batches} ({batch_start+1}-{batch_end}/{total_queries})")

                # Random delay between batches to avoid rate limiting.
                delay = random.uniform(BATCH_DELAY_MIN, BATCH_DELAY_MAX)
                # SECURITY NOTE(review): this logs the proxy URL including
                # credentials — should be redacted.
                print(f"😴 Delay: {delay:.0f}s | Proxy: {PROXY_URL}")
                time.sleep(delay)

                # ---- Submit batch to the scraper API ---------------------
                payload = {
                    "name": f"{filename.replace('.csv','')}-{job_id}-B{batch_idx+1:03d}",
                    "keywords": batch_queries,
                    "lang": "de",
                    "depth": 1,
                    "zoom": 17,
                    "radius": 50,
                    "max_time": 60,   # lowered 120 → 60 (paid proxy is faster)
                    "fast_mode": False,
                    "proxies": [PROXY_URL]
                }
                try:
                    resp = requests.post(
                        f"{SCRAPER_URL}/api/v1/jobs",
                        json=payload, timeout=45
                    )
                    print(f"📤 {resp.status_code}")
                    if is_blocked(resp.text):
                        print("🚫 Batch übersprungen (blocked)")
                        continue
                    if resp.status_code != 201:
                        print(f"⚠️ Batch {batch_idx+1} fehlgeschlagen: {resp.text[:100]}")
                        continue
                    scraper_id = resp.json()['id']
                    print(f"✅ Scraper: {scraper_id}")

                    # ---- Poll until done, then download result -----------
                    for poll_i in range(1, 61):  # lowered 121 → 61 (max_time 60s)
                        r = requests.get(
                            f"{SCRAPER_URL}/api/v1/jobs/{scraper_id}",
                            timeout=15
                        )
                        data = r.json()
                        # Scraper versions differ in status-key casing.
                        status = data.get('Status', data.get('status', '?'))
                        if status in ('ok', 'completed', 'scraped'):
                            dl = requests.get(
                                f"{SCRAPER_URL}/api/v1/jobs/{scraper_id}/download",
                                timeout=90
                            )
                            if dl.status_code == 200:
                                df_filtered = process_result_csv(dl.content, df_input, True)
                                df_raw = process_result_csv(dl.content, df_input, False)
                                if df_filtered is not None:
                                    all_results_filtered.append(df_filtered)
                                    # FIX: never append None (pd.concat would
                                    # raise at merge time); fall back to the
                                    # filtered frame if raw parsing failed.
                                    all_results_raw.append(
                                        df_raw if df_raw is not None else df_filtered
                                    )
                                    print(f"📊 Batch {batch_idx+1}: {len(df_filtered)} filtered")
                            break
                        elif status in ('failed', 'error'):
                            print(f"💥 Batch {batch_idx+1}: {status}")
                            break
                        time.sleep(random.uniform(5, 10))  # lowered 10-20s → 5-10s (paid proxy)
                except Exception as e:
                    # One failed batch must not kill the whole job.
                    print(f"💥 Batch {batch_idx+1}: {e}")

                job.status = f"🔄 Batch {batch_idx+2}/{batches}"
                db.session.commit()

            # ---- MERGE & SAVE --------------------------------------------
            job.status = "🔧 merging results"
            db.session.commit()
            base = filename.replace('.csv', '')
            os.makedirs(RESULT_FOLDER, exist_ok=True)

            if all_results_filtered:
                df_final_filtered = pd.concat(all_results_filtered, ignore_index=True)
                df_final_filtered = df_final_filtered.drop_duplicates(subset=['title', 'address'])
                out_filtered = f"results_{base}_filtered.csv"
                df_final_filtered.to_csv(
                    os.path.join(RESULT_FOLDER, out_filtered),
                    index=False, encoding='utf-8-sig', sep=';'
                )
                if all_results_raw:
                    df_final_raw = pd.concat(all_results_raw, ignore_index=True)
                    out_raw = f"results_{base}_all.csv"
                    df_final_raw.to_csv(
                        os.path.join(RESULT_FOLDER, out_raw),
                        index=False, encoding='utf-8-sig', sep=';'
                    )
                    job.result_filename = out_filtered
                    job.result_filename_raw = out_raw
                job.status = f"✅ Fertig: {len(df_final_filtered)} Firmen"
            else:
                job.status = "❌ Keine Ergebnisse"
            db.session.commit()
            print(f"🎉 Job {job_id} komplett!")
        except Exception as e:
            job.status = f"Failed: {str(e)[:50]}"
            print(f"💥 FATAL: {e}")
            import traceback
            traceback.print_exc()
            db.session.commit()
        print(f"✅ DONE! Status: {job.status}")