# heko_webcrawler/app/webcrawler.py
# Last modified: 2024-11-14 10:20:42 +01:00
import csv
import os
import requests
from .models import db, Job
from flask import current_app
UPLOAD_FOLDER = 'uploads'
RESULT_FOLDER = 'results'
API_KEY = 'AIzaSyAIf0yXJTwo87VMWLBtq2m2LqE-OaPGbzw'
processed_companies = set()
def get_geocode(address):
url = f"https://maps.googleapis.com/maps/api/geocode/json"
params = {'address': address, 'key': API_KEY}
try:
response = requests.get(url, params=params, timeout=5)
if response.status_code == 200:
data = response.json()
if data['status'] == 'OK':
location = data['results'][0]['geometry']['location']
return location['lat'], location['lng']
except requests.RequestException as e:
print(f"Geocode API Fehler für {address}: {e}")
return None, None
def get_nearby_places(lat, lng):
places_url = f"https://maps.googleapis.com/maps/api/place/nearbysearch/json"
params = {
'location': f"{lat},{lng}",
'radius': 10,
'type': 'point_of_interest',
'key': API_KEY
}
try:
response = requests.get(places_url, params=params, timeout=5)
if response.status_code == 200:
return response.json().get('results', [])
except requests.RequestException as e:
print(f"Nearby Places API Fehler für Standort {lat},{lng}: {e}")
return []
def get_place_details(place_id):
details_url = f"https://maps.googleapis.com/maps/api/place/details/json"
params = {
'place_id': place_id,
'fields': 'formatted_phone_number,website',
'key': API_KEY
}
try:
response = requests.get(details_url, params=params, timeout=5)
if response.status_code == 200:
result = response.json().get('result', {})
return result.get('formatted_phone_number', 'N/A'), result.get('website', 'N/A')
except requests.RequestException as e:
print(f"Place Details API Fehler für Place ID {place_id}: {e}")
return 'N/A', 'N/A'
def process_file(filename, job_id, app):
with app.app_context():
filepath = os.path.join(UPLOAD_FOLDER, filename)
results = []
job = Job.query.get(job_id)
if not job:
print("Job wurde abgebrochen.")
return
job.status = "In Progress"
db.session.commit()
with open(filepath, newline='', encoding='ISO-8859-1') as csvfile:
reader = csv.DictReader(csvfile, delimiter=';')
headers = reader.fieldnames
if not all(field in headers for field in ['PLZ', 'Straße', 'Hausnummer']):
print("CSV-Datei enthält nicht alle notwendigen Spalten.")
job.status = "Failed"
db.session.commit()
return
for row in reader:
plz = row.get('PLZ', '').strip()
city = row.get('Stadt', row.get('Bezirk', '')).strip()
street = row.get('Straße', '').strip()
house_number = row.get('Hausnummer', '').strip()
additional = row.get('Zusatz', '').strip()
if not all([plz, city, street, house_number]):
continue
full_address = f"{street} {house_number} {additional}, {plz} {city}"
lat, lng = get_geocode(full_address)
if lat is None or lng is None:
continue
nearby_places = get_nearby_places(lat, lng)
for place in nearby_places:
company_name = place['name']
if company_name in processed_companies:
continue
processed_companies.add(company_name)
company_address = place.get('vicinity', 'N/A').split(',')[0]
place_id = place.get('place_id')
company_phone, company_website = get_place_details(place_id) if place_id else ('N/A', 'N/A')
results.append({
'PLZ': plz,
'Stadt': city,
'Straße': street,
'Hausnummer': house_number,
'Zusatz': additional,
'Company Name': company_name,
'Company Address': company_address,
'Company Phone': company_phone,
'Company Website': company_website
})
if results:
result_file = f"results_{os.path.splitext(filename)[0]}.csv"
result_path = os.path.join(RESULT_FOLDER, result_file)
with open(result_path, 'w', newline='', encoding='utf-8-sig') as csvfile:
writer = csv.DictWriter(csvfile, fieldnames=[
'PLZ', 'Stadt', 'Straße', 'Hausnummer', 'Zusatz',
'Company Name', 'Company Address', 'Company Phone', 'Company Website'
])
writer.writeheader()
writer.writerows(results)
job.status = "Completed"
job.result_filename = result_file
db.session.commit()
else:
job.status = "Failed"
db.session.commit()