webcrawler v1.0
This commit is contained in:
parent
008e2bc274
commit
6b057fb941
19 changed files with 814 additions and 112 deletions
|
|
@ -6,123 +6,133 @@ from flask import current_app
|
|||
|
||||
# Directory layout used by the upload/processing pipeline.
UPLOAD_FOLDER = 'uploads'
RESULT_FOLDER = 'results'

# Google Maps Platform API key.
# SECURITY(review): this key was committed in plain text and must be treated as
# leaked — rotate it in the Google Cloud console. The environment variable takes
# precedence; the literal remains only as a backward-compatible fallback.
API_KEY = os.environ.get('GOOGLE_MAPS_API_KEY', 'AIzaSyAIf0yXJTwo87VMWLBtq2m2LqE-OaPGbzw')
||||
def get_place_details(street, city_zip):
|
||||
address = f"{street}, {city_zip}"
|
||||
url = f"https://maps.googleapis.com/maps/api/place/textsearch/json"
|
||||
params = {'query': address, 'key': API_KEY}
|
||||
# Module-level de-duplication set of company names already written to results.
# NOTE(review): this is global mutable state that lives for the whole process —
# a company found in one job is silently skipped in every later job handled by
# the same worker process. Confirm that cross-job de-duplication is intended.
processed_companies = set()
||||
def get_geocode(address):
    """Resolve a postal address to a (lat, lng) pair via the Google Geocoding API.

    Args:
        address: Free-form address string to geocode.

    Returns:
        (lat, lng) floats on success, or (None, None) on any failure —
        network error, non-200 HTTP response, or a non-'OK' API status
        (e.g. ZERO_RESULTS, OVER_QUERY_LIMIT).
    """
    url = "https://maps.googleapis.com/maps/api/geocode/json"
    params = {'address': address, 'key': API_KEY}
    try:
        response = requests.get(url, params=params, timeout=5)
        if response.status_code == 200:
            data = response.json()
            print(f"API Response Data for {address}: {data}")
            if data['status'] == 'OK':
                # The first result is the API's best match for the query.
                location = data['results'][0]['geometry']['location']
                return location['lat'], location['lng']
    except requests.RequestException as e:
        print(f"Geocode API Fehler für {address}: {e}")
    # Single fall-through failure path keeps the caller contract simple.
    return None, None
||||
def get_nearby_places(lat, lng):
    """Query the Google Places Nearby Search API around a coordinate.

    Args:
        lat: Latitude of the search center.
        lng: Longitude of the search center.

    Returns:
        The raw 'results' list from the API response, or [] on any failure
        (network error or non-200 HTTP response).
    """
    places_url = "https://maps.googleapis.com/maps/api/place/nearbysearch/json"
    params = {
        'location': f"{lat},{lng}",
        # NOTE(review): 'radius' is in meters, so 10 only matches places
        # essentially at the geocoded point itself — confirm this is intended
        # and not meant to be e.g. 1000.
        'radius': 10,
        'type': 'point_of_interest',
        'key': API_KEY
    }
    try:
        response = requests.get(places_url, params=params, timeout=5)
        if response.status_code == 200:
            return response.json().get('results', [])
    except requests.RequestException as e:
        print(f"Nearby Places API Fehler für Standort {lat},{lng}: {e}")
    # Empty list on failure so callers can iterate unconditionally.
    return []
||||
def get_place_details(place_id):
    """Fetch phone number and website for one place via the Place Details API.

    Args:
        place_id: Google Places identifier of the place to look up.

    Returns:
        A (phone, website) tuple; each element is the string 'N/A' when the
        field is absent or the request fails.
    """
    details_url = "https://maps.googleapis.com/maps/api/place/details/json"
    params = {
        'place_id': place_id,
        # Request only the two fields this crawler actually stores.
        'fields': 'formatted_phone_number,website',
        'key': API_KEY
    }
    try:
        response = requests.get(details_url, params=params, timeout=5)
        if response.status_code == 200:
            result = response.json().get('result', {})
            return result.get('formatted_phone_number', 'N/A'), result.get('website', 'N/A')
    except requests.RequestException as e:
        print(f"Place Details API Fehler für Place ID {place_id}: {e}")
    return 'N/A', 'N/A'
||||
def process_file(filename, job_id, app):
    """Background worker: crawl companies near every address in an uploaded CSV.

    Reads UPLOAD_FOLDER/<filename> (';'-delimited, ISO-8859-1 encoded), geocodes
    each complete address row, collects de-duplicated nearby companies with phone
    and website, and writes them to RESULT_FOLDER as a UTF-8 (BOM) CSV.

    Job lifecycle in the DB: "In Progress" -> "Completed" (results written, and
    job.result_filename set) or "Failed" (no results). Returns early without a
    status change when the Job row has been deleted (treated as cancellation).

    Args:
        filename: Name of the uploaded CSV inside UPLOAD_FOLDER.
        job_id:   Primary key of the Job row tracking this run.
        app:      Flask application, needed for the DB app context.
    """
    with app.app_context():
        print(f"Starte Prozess für Job-ID: {job_id}")
        filepath = os.path.join(UPLOAD_FOLDER, filename)
        results = []

        job = Job.query.get(job_id)
        if not job:
            print("Job wurde abgebrochen, bevor er starten konnte.")
            return
        job.status = "In Progress"
        db.session.commit()

        with open(filepath, newline='', encoding='ISO-8859-1') as csvfile:
            reader = csv.DictReader(csvfile, delimiter=';')
            for row in reader:
                # Re-check the job each row so deleting the Job row cancels
                # a long-running import mid-file.
                job = Job.query.get(job_id)
                if not job:
                    print("Job wurde abgebrochen.")
                    return

                plz = row.get('PLZ', '').strip()
                # Fall back to a 'Bezirk' column when the file has no 'Stadt'.
                city = row.get('Stadt', row.get('Bezirk', '')).strip()
                street = row.get('Straße', '').strip()
                house_number = row.get('Hausnummer', '').strip()
                additional = row.get('Zusatz', '').strip()

                # Skip rows missing any mandatory address component.
                if not all([plz, city, street, house_number]):
                    continue

                full_address = f"{street} {house_number} {additional}, {plz} {city}"
                lat, lng = get_geocode(full_address)
                if lat is None or lng is None:
                    continue

                for place in get_nearby_places(lat, lng):
                    company_name = place['name']
                    # Module-global de-dup: each company is reported at most once.
                    if company_name in processed_companies:
                        continue
                    processed_companies.add(company_name)

                    # 'vicinity' is "street, locality" — keep the street part.
                    company_address = place.get('vicinity', 'N/A').split(',')[0]
                    place_id = place.get('place_id')
                    company_phone, company_website = get_place_details(place_id) if place_id else ('N/A', 'N/A')

                    results.append({
                        'PLZ': plz,
                        'Stadt': city,
                        'Straße': street,
                        'Hausnummer': house_number,
                        'Zusatz': additional,
                        'Company Name': company_name,
                        'Company Address': company_address,
                        'Company Phone': company_phone,
                        'Company Website': company_website
                    })

        if results:
            # Fix: ensure the output directory exists before writing; the
            # rewritten save path had lost this check.
            os.makedirs(RESULT_FOLDER, exist_ok=True)
            result_file = f"results_{os.path.splitext(filename)[0]}.csv"
            result_path = os.path.join(RESULT_FOLDER, result_file)
            with open(result_path, 'w', newline='', encoding='utf-8-sig') as csvfile:
                writer = csv.DictWriter(csvfile, fieldnames=[
                    'PLZ', 'Stadt', 'Straße', 'Hausnummer', 'Zusatz',
                    'Company Name', 'Company Address', 'Company Phone', 'Company Website'
                ])
                writer.writeheader()
                writer.writerows(results)
            job.status = "Completed"
            job.result_filename = result_file
            db.session.commit()
        else:
            # No complete rows, or every lookup failed / was a duplicate.
            job.status = "Failed"
            db.session.commit()
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue