webcrawler v1.0

This commit is contained in:
mkrieger 2024-11-14 10:20:42 +01:00
parent 008e2bc274
commit 6b057fb941
19 changed files with 814 additions and 112 deletions

View file

@ -2,20 +2,25 @@ import os
from flask import Flask, redirect, url_for, request
from flask_sqlalchemy import SQLAlchemy
from flask_login import LoginManager, current_user
from .models import db, User
from flask_migrate import Migrate
# Konfiguration für Upload- und Ergebnis-Ordner
UPLOAD_FOLDER = '/app/uploads'
RESULT_FOLDER = '/app/results'
db = SQLAlchemy()
migrate = Migrate()
def create_app():
app = Flask(__name__)
app.config['SECRET_KEY'] = '008e7369b075886d5f494c8813efdfb17155da6af12b3fe8ee'
app.config['SQLALCHEMY_DATABASE_URI'] = 'sqlite:///users.db'
app.config['UPLOAD_FOLDER'] = UPLOAD_FOLDER
app.config['RESULT_FOLDER'] = RESULT_FOLDER
app.config['ALLOW_USER_SIGNUP'] = False
db.init_app(app)
migrate.init_app(app, db)
# Flask-Login Setup
login_manager = LoginManager()
@ -24,16 +29,15 @@ def create_app():
@login_manager.user_loader
def load_user(user_id):
from .models import User
return User.query.get(int(user_id))
# Umleitung nicht authentifizierter Benutzer, statische Dateien und bestimmte Routen ausnehmen
@app.before_request
def require_login():
allowed_routes = ['auth.login', 'auth.signup']
# Prüfen, ob der Benutzer authentifiziert ist oder eine erlaubte Route anfragt
if (not current_user.is_authenticated
and request.endpoint not in allowed_routes
if (not current_user.is_authenticated
and request.endpoint not in allowed_routes
and not request.path.startswith('/static/')):
return redirect(url_for('auth.login'))

View file

@ -1,20 +1,19 @@
from flask_sqlalchemy import SQLAlchemy
from flask_login import UserMixin
from datetime import datetime
db = SQLAlchemy()
from . import db
class User(UserMixin, db.Model):
    """Account row used for login; mixes in flask_login's UserMixin."""

    id = db.Column(db.Integer, primary_key=True)
    # Login name; required and unique across all accounts.
    username = db.Column(db.String(150), unique=True, nullable=False)
    # Stores the output of generate_password_hash(), never the plain-text password.
    password = db.Column(db.String(150), nullable=False)
    # Checked by the /admin routes; accounts are non-admin unless set otherwise.
    is_admin = db.Column(db.Boolean, default=False)
class Job(db.Model):
    """One uploaded CSV file together with the state of its processing run."""

    id = db.Column(db.Integer, primary_key=True)
    # Owning account; a job cannot exist without a user.
    user_id = db.Column(db.Integer, db.ForeignKey('user.id'), nullable=False)
    # Name under which the uploaded file was stored.
    filename = db.Column(db.String(150), nullable=False)
    # Lifecycle state. Values written by the application:
    # "Pending" (default), "In Progress", "Completed", "Failed".
    # Defined exactly once; a second assignment would silently replace the first.
    status = db.Column(db.String(50), default="Pending")
    created_at = db.Column(db.DateTime, default=datetime.utcnow)
    # Name of the generated result file; stays NULL until a run produced output.
    result_filename = db.Column(db.String(150), nullable=True)

    # Exposes job.user and, through the backref, user.jobs.
    user = db.relationship('User', backref=db.backref('jobs', lazy=True))

View file

@ -1,3 +1,4 @@
import time
import csv
import os
import threading
@ -9,7 +10,7 @@ from .models import db, User, Job
from .webcrawler import process_file # Importiere die Funktion für das Webscraping
UPLOAD_FOLDER = 'uploads'
RESULT_FOLDER = os.path.join(os.path.abspath(os.path.dirname(__file__)), 'results')
RESULT_FOLDER = 'results'
# Blueprint für auth erstellen
bp = Blueprint('auth', __name__)
@ -28,6 +29,10 @@ def login():
@bp.route('/signup', methods=['GET', 'POST'])
def signup():
if not current_app.config['ALLOW_USER_SIGNUP']:
flash("Registrierung ist derzeit deaktiviert.")
return redirect(url_for('auth.login'))
if request.method == 'POST':
username = request.form['username']
password = generate_password_hash(request.form['password'], method='sha256')
@ -36,6 +41,7 @@ def signup():
db.session.commit()
flash('Benutzer erfolgreich erstellt! Sie können sich jetzt einloggen.')
return redirect(url_for('auth.login'))
return render_template('signup.html')
@bp.route('/logout')
@ -50,18 +56,24 @@ def job_status():
jobs = Job.query.filter_by(user_id=current_user.id).all()
return render_template('jobs.html', jobs=jobs)
# Hochladen und Verarbeiten der Datei im Hintergrund
@bp.route('/upload', methods=['GET', 'POST'])
@login_required
def upload():
if request.method == 'POST':
file = request.files['file']
filename = secure_filename(file.filename)
if not filename.endswith('.csv'):
flash('Bitte eine CSV-Datei hochladen')
return redirect(url_for('auth.upload'))
file_path = os.path.join(UPLOAD_FOLDER, filename)
# Überprüfen, ob eine Datei mit dem gleichen Namen bereits existiert
file_path = os.path.join(current_app.config['UPLOAD_FOLDER'], filename)
if os.path.exists(file_path):
# Wenn eine Datei mit dem gleichen Namen existiert, einen Zeitstempel hinzufügen
name, ext = os.path.splitext(filename)
timestamp = time.strftime("%Y%m%d-%H%M%S") # Zeitstempel im Format JahrMonatTag-StundenMinutenSekunden
filename = f"{name}_{timestamp}{ext}"
file_path = os.path.join(current_app.config['UPLOAD_FOLDER'], filename)
flash(f"Eine Datei mit gleichem Namen existierte bereits. Die Datei wurde als '{filename}' gespeichert.")
# Speichern der Datei
file.save(file_path)
flash('Datei erfolgreich hochgeladen und Job gestartet')
@ -76,7 +88,7 @@ def upload():
# Starten des Scraping im Hintergrund-Thread und Übergeben des aktuellen Anwendungskontexts
thread = threading.Thread(target=process_file, args=(filename, new_job.id, current_app._get_current_object()))
thread.start()
# Debugging-Ausgabe, nachdem der Thread gestartet wurde
print(f"Thread für Job {new_job.id} erfolgreich gestartet.")
@ -122,7 +134,7 @@ def delete_job(job_id):
return redirect(url_for('auth.job_status'))
# Löschen der Upload-Datei
upload_path = os.path.join(UPLOAD_FOLDER, job.filename)
upload_path = os.path.join(current_app.config['UPLOAD_FOLDER'], job.filename)
if os.path.exists(upload_path):
os.remove(upload_path)
print(f"Upload-Datei gelöscht: {upload_path}")
@ -131,7 +143,9 @@ def delete_job(job_id):
# Löschen der Results-Datei, falls vorhanden
if job.result_filename:
result_path = os.path.join(RESULT_FOLDER, job.result_filename)
result_path = os.path.join(current_app.config['RESULT_FOLDER'], job.result_filename)
print(f"Versuche Ergebnisdatei zu löschen: {result_path}")
if os.path.exists(result_path):
try:
os.remove(result_path)
@ -139,10 +153,71 @@ def delete_job(job_id):
except Exception as e:
print(f"Fehler beim Löschen der Ergebnisdatei: {e}")
else:
print(f"Ergebnisdatei nicht gefunden: {result_path}")
print(f"Ergebnisdatei nicht gefunden im Pfad: {result_path}")
# Job aus der Datenbank löschen
db.session.delete(job)
db.session.commit()
flash("Job erfolgreich gelöscht.")
return redirect(url_for('auth.job_status'))
@bp.route('/admin', methods=['GET'])
@login_required
def admin_panel():
    """Show the user-management page.

    Administrators get ``admin_panel.html`` rendered with every user;
    anyone else is sent back to the job list with a flash message.
    """
    if current_user.is_admin:
        return render_template('admin_panel.html', users=User.query.all())

    flash("Keine Berechtigung.")
    return redirect(url_for('auth.job_status'))
@bp.route('/admin/create_user', methods=['POST'])
@login_required
def create_user():
    """Create an account from the admin panel form.

    Reads ``username``, ``password`` and the optional ``is_admin`` checkbox
    from the POSTed form. Only administrators may create users. Blank
    credentials and already-taken usernames are rejected with a flash
    message instead of failing at commit time.
    """
    if not current_user.is_admin:
        flash("Keine Berechtigung.")
        # Go straight to the job list: admin_panel would only flash the
        # same message a second time before redirecting there anyway.
        return redirect(url_for('auth.job_status'))

    username = request.form['username']
    password = request.form['password']
    is_admin = 'is_admin' in request.form  # checkbox is only present when ticked

    if not username.strip() or not password:
        flash("Benutzername und Passwort dürfen nicht leer sein.")
        return redirect(url_for('auth.admin_panel'))

    # username is declared unique; check first so a duplicate does not
    # surface as an unhandled integrity error on commit.
    if User.query.filter_by(username=username).first() is not None:
        flash(f"Benutzer {username} existiert bereits.")
        return redirect(url_for('auth.admin_panel'))

    # Plain 'sha256' is a deprecated hash method in Werkzeug and is rejected
    # by current releases; PBKDF2 is the supported salted/iterated variant.
    hashed_password = generate_password_hash(password, method='pbkdf2:sha256')

    new_user = User(username=username, password=hashed_password, is_admin=is_admin)
    db.session.add(new_user)
    db.session.commit()

    flash(f"Benutzer {username} wurde erstellt.")
    return redirect(url_for('auth.admin_panel'))
@bp.route('/admin/reset_password/<int:user_id>', methods=['POST'])
@login_required
def reset_password(user_id):
    """Set a new password for the given user (administrators only).

    The new password is taken from the ``new_password`` form field.
    Responds with 404 when no user has the given id.
    """
    if not current_user.is_admin:
        flash("Keine Berechtigung.")
        # Go straight to the job list: admin_panel would only flash the
        # same message a second time before redirecting there anyway.
        return redirect(url_for('auth.job_status'))

    user = User.query.get_or_404(user_id)
    new_password = request.form['new_password']

    # The form marks the field as required, but that is only enforced by
    # the browser; never store a hash of the empty string.
    if not new_password:
        flash("Das neue Passwort darf nicht leer sein.")
        return redirect(url_for('auth.admin_panel'))

    # Plain 'sha256' is a deprecated hash method in Werkzeug and is rejected
    # by current releases; PBKDF2 is the supported salted/iterated variant.
    user.password = generate_password_hash(new_password, method='pbkdf2:sha256')
    db.session.commit()

    flash(f"Passwort für Benutzer {user.username} wurde zurückgesetzt.")
    return redirect(url_for('auth.admin_panel'))
@bp.route('/admin/delete_user/<int:user_id>', methods=['POST'])
@login_required
def delete_user(user_id):
    """Delete a non-admin user together with that user's jobs and job files.

    Only administrators may delete users, and administrator accounts
    themselves cannot be deleted. Responds with 404 for an unknown id.
    """
    if not current_user.is_admin:
        flash("Keine Berechtigung.")
        # Go straight to the job list: admin_panel would only flash the
        # same message a second time before redirecting there anyway.
        return redirect(url_for('auth.job_status'))

    user = User.query.get_or_404(user_id)
    if user.is_admin:
        flash("Administratoren können nicht gelöscht werden.")
        return redirect(url_for('auth.admin_panel'))

    # Remember the name now; the row is gone once the commit went through.
    username = user.username

    # Job.user_id is NOT NULL, so a user who still owns jobs cannot simply
    # be removed. Delete the jobs in the same transaction and collect their
    # files for cleanup afterwards.
    stale_files = []
    for job in list(user.jobs):
        stale_files.append(os.path.join(current_app.config['UPLOAD_FOLDER'], job.filename))
        if job.result_filename:
            stale_files.append(os.path.join(current_app.config['RESULT_FOLDER'], job.result_filename))
        db.session.delete(job)

    db.session.delete(user)
    db.session.commit()

    # Files are removed only after the database change succeeded; this is
    # best effort, a missing or locked file must not fail the request.
    for path in stale_files:
        try:
            os.remove(path)
        except FileNotFoundError:
            pass
        except OSError as e:
            print(f"Fehler beim Löschen der Datei {path}: {e}")

    flash(f"Benutzer {username} wurde gelöscht.")
    return redirect(url_for('auth.admin_panel'))

View file

@ -164,3 +164,135 @@ tr:nth-child(even) td {
.delete-btn:hover {
background-color: #e60000;
}
/* Flash badge styling */

/* The container is pinned to the top-right corner and stacks its badges
   vertically, so several messages flashed in one request do not overlap. */
#flash-badge-container {
    position: fixed;
    top: 20px;
    right: 20px;
    z-index: 1000;
    display: flex;
    flex-direction: column;
    align-items: flex-end;
    gap: 10px;
}

/* A badge used on its own (outside the container) keeps its fixed position. */
.flash-badge {
    position: fixed;
    top: 20px;
    right: 20px;
    background-color: #f44336; /* Material Design red */
    color: white;
    padding: 12px 24px;
    border-radius: 8px;
    font-family: 'Roboto', sans-serif;
    font-weight: 500;
    box-shadow: 0px 4px 8px rgba(0, 0, 0, 0.2);
    z-index: 1000;
    opacity: 0;
    transform: translateY(-20px);
    transition: opacity 0.4s ease, transform 0.4s ease;
}

/* Inside the container the badge takes part in normal flow and is
   positioned by the container instead of by its own coordinates. */
#flash-badge-container .flash-badge {
    position: static;
}

/* Fade-in state */
.flash-badge.show {
    opacity: 1;
    transform: translateY(0);
}

/* Fade-out state */
.flash-badge.hide {
    opacity: 0;
    transform: translateY(-20px);
}
/* Admin panel: centered white card holding the user table and the create form */
.admin-panel {
max-width: 800px;
margin: 2em auto;
padding: 2em;
background: white;
border-radius: 8px;
box-shadow: 0 4px 12px rgba(0, 0, 0, 0.1);
}
.admin-panel h2 {
font-weight: 500;
color: #1d1d1f;
margin-bottom: 1em;
}
/* User table */
.user-table {
width: 100%;
border-collapse: collapse;
margin-bottom: 2em;
}
.user-table th, .user-table td {
padding: 0.75em;
text-align: left;
border: 1px solid #d1d1d6;
}
.user-table th {
background-color: #f1f1f1;
color: #333;
}
.user-table td {
background-color: white;
}
/* Zebra striping for every second row */
.user-table tr:nth-child(even) td {
background-color: #f9f9f9;
}
/* Shared look of the three action buttons */
.reset-btn, .delete-btn, .create-btn {
padding: 0.5em 1em;
font-size: 0.9em;
font-weight: 500;
border: none;
border-radius: 4px;
cursor: pointer;
transition: background-color 0.2s ease-in-out;
}
.reset-btn {
background-color: #4caf50;
color: white;
}
.reset-btn:hover {
background-color: #388e3c;
}
/* NOTE: these .delete-btn rules come later in the file than the existing
   .delete-btn:hover rule and have the same specificity, so they win for
   every delete button that uses this stylesheet, not only in the admin panel. */
.delete-btn {
background-color: #f44336;
color: white;
}
.delete-btn:hover {
background-color: #d32f2f;
}
/* Full-width submit button; overrides the shared padding and font size above */
.create-btn {
background-color: #007aff;
color: white;
padding: 0.75em;
margin-top: 1em;
display: block;
width: 100%;
font-size: 1em;
}
.create-btn:hover {
background-color: #005bb5;
}
/* "Create user" form */
.create-user-form {
margin-top: 1.5em;
}
.create-user-form input[type="text"],
.create-user-form input[type="password"] {
width: 100%;
padding: 0.75em;
margin-bottom: 1em;
border: 1px solid #d1d1d6;
border-radius: 8px;
}
.create-user-form label {
font-size: 0.9em;
color: #6e6e73;
display: block;
margin-bottom: 1em;
}

View file

@ -0,0 +1,50 @@
{% extends "base.html" %}
{% block content %}
{# Admin user management page. Expects `users` in the template context. #}
<div class="admin-panel">
<h2>Benutzerverwaltung</h2>
<!-- User management table -->
<table class="user-table">
<thead>
<tr>
<th>ID</th>
<th>Benutzername</th>
<th>Admin</th>
<th>Aktionen</th>
</tr>
</thead>
<tbody>
{% for user in users %}
<tr>
<td>{{ user.id }}</td>
<td>{{ user.username }}</td>
<td>{{ 'Ja' if user.is_admin else 'Nein' }}</td>
<td>
{# Per-row password reset; posts the new_password field to auth.reset_password #}
<form action="{{ url_for('auth.reset_password', user_id=user.id) }}" method="post" style="display:inline;">
<input type="text" name="new_password" placeholder="Neues Passwort" required>
<button type="submit" class="reset-btn">Passwort zurücksetzen</button>
</form>
{# No delete button for administrators: auth.delete_user refuses to delete them #}
{% if not user.is_admin %}
<form action="{{ url_for('auth.delete_user', user_id=user.id) }}" method="post" style="display:inline;">
<button type="submit" class="delete-btn">Benutzer löschen</button>
</form>
{% endif %}
</td>
</tr>
{% endfor %}
</tbody>
</table>
<!-- Form for creating new users -->
<h3>Neuen Benutzer erstellen</h3>
<form action="{{ url_for('auth.create_user') }}" method="post" class="create-user-form">
<input type="text" name="username" placeholder="Benutzername" required>
<input type="password" name="password" placeholder="Passwort" required>
<label>
{# The server only checks whether the is_admin field is present in the form #}
<input type="checkbox" name="is_admin"> Admin
</label>
<button type="submit" class="create-btn">Benutzer erstellen</button>
</form>
</div>
{% endblock %}

View file

@ -14,14 +14,50 @@
<ul>
<li><a href="{{ url_for('auth.job_status') }}">Jobs</a></li>
<li><a href="{{ url_for('auth.upload') }}">Upload</a></li>
{% if current_user.is_admin %}
<li><a href="{{ url_for('auth.admin_panel') }}">Admin</a></li> <!-- Admin-Bereich Link -->
{% endif %}
<li><a href="{{ url_for('auth.logout') }}">Logout</a></li>
</ul>
</nav>
</header>
{% endif %}
<!-- Flash-Nachrichten -->
{% with messages = get_flashed_messages() %}
{% if messages %}
<div id="flash-badge-container">
{% for message in messages %}
<div class="flash-badge">{{ message }}</div>
{% endfor %}
</div>
{% endif %}
{% endwith %}
<div class="{% if request.endpoint in ['auth.login', 'auth.signup'] %}form-container{% else %}container{% endif %}">
{% block content %}{% endblock %}
</div>
<!-- JavaScript für Ein- und Ausblendanimation des Flash-Badges -->
<script>
document.addEventListener("DOMContentLoaded", function() {
    // Fade a badge in by switching on its "show" state.
    function revealBadge(badge) {
        badge.classList.add('show');
    }

    // Fade a badge out, then drop it from the DOM once the
    // fade-out animation has had time to finish.
    function dismissBadge(badge) {
        badge.classList.remove('show');
        badge.classList.add('hide');
        setTimeout(function() {
            badge.remove();
        }, 400);
    }

    var badges = document.querySelectorAll('.flash-badge');
    for (var i = 0; i < badges.length; i++) {
        // Show shortly after load, hide again after 5 seconds.
        setTimeout(revealBadge, 100, badges[i]);
        setTimeout(dismissBadge, 5000, badges[i]);
    }
});
</script>
</body>
</html>

View file

@ -6,123 +6,133 @@ from flask import current_app
# Folders used by this module for uploaded input files and generated result
# files. Both are relative paths, so they resolve against the process's
# working directory.
# NOTE(review): the upload view stores files under app.config['UPLOAD_FOLDER'];
# confirm that both locations resolve to the same directory in deployment.
UPLOAD_FOLDER = 'uploads'
RESULT_FOLDER = 'results'

# Google Maps Platform key sent with every API request made by this module.
# SECURITY: a key committed to source control must be treated as leaked.
# Rotate it and provide the replacement through the GOOGLE_MAPS_API_KEY
# environment variable. The literal below is kept only as a fallback so that
# existing deployments keep working until that has happened.
API_KEY = os.environ.get(
    'GOOGLE_MAPS_API_KEY',
    'AIzaSyAIf0yXJTwo87VMWLBtq2m2LqE-OaPGbzw',
)
def get_place_details(street, city_zip):
address = f"{street}, {city_zip}"
url = f"https://maps.googleapis.com/maps/api/place/textsearch/json"
params = {'query': address, 'key': API_KEY}
processed_companies = set()
def get_geocode(address):
url = f"https://maps.googleapis.com/maps/api/geocode/json"
params = {'address': address, 'key': API_KEY}
results = []
try:
response = requests.get(url, params=params, timeout=5)
if response.status_code == 200:
data = response.json()
print(f"API Response Data for {address}: {data}")
if data['status'] == 'OK':
location = data['results'][0]['geometry']['location']
return location['lat'], location['lng']
except requests.RequestException as e:
print(f"Geocode API Fehler für {address}: {e}")
return None, None
for place in data.get('results', []):
name = place.get('name', 'N/A')
place_id = place.get('place_id')
formatted_address = place.get('formatted_address', 'N/A')
def get_nearby_places(lat, lng):
places_url = f"https://maps.googleapis.com/maps/api/place/nearbysearch/json"
params = {
'location': f"{lat},{lng}",
'radius': 10,
'type': 'point_of_interest',
'key': API_KEY
}
# Zweite Anfrage für detailliertere Informationen
phone, website = 'N/A', 'N/A'
if place_id:
details_url = f"https://maps.googleapis.com/maps/api/place/details/json"
details_params = {
'place_id': place_id,
'fields': 'formatted_phone_number,website',
'key': API_KEY
}
details_response = requests.get(details_url, params=details_params, timeout=5)
if details_response.status_code == 200:
details_data = details_response.json().get('result', {})
phone = details_data.get('formatted_phone_number', 'N/A')
website = details_data.get('website', 'N/A')
try:
response = requests.get(places_url, params=params, timeout=5)
if response.status_code == 200:
return response.json().get('results', [])
except requests.RequestException as e:
print(f"Nearby Places API Fehler für Standort {lat},{lng}: {e}")
return []
# Speichern nur, wenn Name und Telefonnummer vorhanden sind
if name != 'N/A' and phone != 'N/A':
results.append({
'Name': name,
'Address': formatted_address,
'Phone': phone,
'Website': website
})
else:
print(f"Fehler beim Abrufen der URL: {url} - Statuscode: {response.status_code}")
except requests.exceptions.RequestException as e:
print(f"Anfragefehler für {url}: {e}")
def get_place_details(place_id):
details_url = f"https://maps.googleapis.com/maps/api/place/details/json"
params = {
'place_id': place_id,
'fields': 'formatted_phone_number,website',
'key': API_KEY
}
return results
try:
response = requests.get(details_url, params=params, timeout=5)
if response.status_code == 200:
result = response.json().get('result', {})
return result.get('formatted_phone_number', 'N/A'), result.get('website', 'N/A')
except requests.RequestException as e:
print(f"Place Details API Fehler für Place ID {place_id}: {e}")
return 'N/A', 'N/A'
def process_file(filename, job_id, app):
with app.app_context():
print(f"Starte Prozess für Job-ID: {job_id}")
filepath = os.path.join(UPLOAD_FOLDER, filename)
results = []
job = Job.query.get(job_id)
if not job:
print("Job wurde abgebrochen, bevor er starten konnte.")
print("Job wurde abgebrochen.")
return
job.status = "In Progress"
db.session.commit()
with open(filepath, newline='', encoding='ISO-8859-1') as csvfile:
reader = csv.DictReader(csvfile, delimiter=';')
rows = list(reader)
total_rows = len(rows)
print(f"Insgesamt zu verarbeitende Zeilen: {total_rows}")
headers = reader.fieldnames
for index, row in enumerate(rows):
# Job-Verfügbarkeit erneut prüfen
job = Job.query.get(job_id)
if not job:
print("Job wurde abgebrochen.")
return
# Vollständige Adresse erstellen
street = f"{row.get('Straße', '')} {row.get('Hausnummer', '')}".strip()
city_zip = f"{row.get('PLZ', '')} {row.get('Stadt', '')}".strip()
print(f"Verarbeite Adresse: {street}, {city_zip}")
address_results = get_place_details(street, city_zip)
for result in address_results:
# Ergebnisse nur speichern, wenn Name und Telefonnummer vorhanden sind
if result['Name'] != 'N/A' and result['Phone'] != 'N/A':
result.update({
'PLZ': row.get('PLZ', ''),
'Stadt': row.get('Stadt', ''),
'Straße': row.get('Straße', ''),
'Hausnummer': row.get('Hausnummer', ''),
'Zusatz': row.get('Zusatz', '')
})
results.append(result)
# Results-Dateiname basierend auf dem Upload-Dateinamen
result_file = f"results_{filename}"
result_path = os.path.join(RESULT_FOLDER, result_file)
# Prüfen und erstellen des Ergebnisverzeichnisses
if not os.path.exists(RESULT_FOLDER):
os.makedirs(RESULT_FOLDER)
print(f"Erstelle Ergebnisverzeichnis: {RESULT_FOLDER}")
try:
if results: # Nur speichern, wenn Ergebnisse vorhanden sind
with open(result_path, 'w', newline='', encoding='utf-8-sig') as csvfile:
writer = csv.DictWriter(csvfile, fieldnames=['Name', 'Address', 'Phone', 'Website', 'PLZ', 'Stadt', 'Straße', 'Hausnummer', 'Zusatz'])
writer.writeheader()
writer.writerows(results)
print(f"Ergebnisdatei erfolgreich gespeichert unter: {result_path}")
job.status = "Completed"
job.result_filename = result_file
db.session.commit()
else:
print("Keine relevanten Ergebnisse zum Speichern vorhanden. Markiere den Job als 'Failed'.")
if not all(field in headers for field in ['PLZ', 'Straße', 'Hausnummer']):
print("CSV-Datei enthält nicht alle notwendigen Spalten.")
job.status = "Failed"
db.session.commit()
except Exception as e:
print(f"Fehler beim Schreiben der Ergebnisdatei: {e}")
return
for row in reader:
plz = row.get('PLZ', '').strip()
city = row.get('Stadt', row.get('Bezirk', '')).strip()
street = row.get('Straße', '').strip()
house_number = row.get('Hausnummer', '').strip()
additional = row.get('Zusatz', '').strip()
if not all([plz, city, street, house_number]):
continue
full_address = f"{street} {house_number} {additional}, {plz} {city}"
lat, lng = get_geocode(full_address)
if lat is None or lng is None:
continue
nearby_places = get_nearby_places(lat, lng)
for place in nearby_places:
company_name = place['name']
if company_name in processed_companies:
continue
processed_companies.add(company_name)
company_address = place.get('vicinity', 'N/A').split(',')[0]
place_id = place.get('place_id')
company_phone, company_website = get_place_details(place_id) if place_id else ('N/A', 'N/A')
results.append({
'PLZ': plz,
'Stadt': city,
'Straße': street,
'Hausnummer': house_number,
'Zusatz': additional,
'Company Name': company_name,
'Company Address': company_address,
'Company Phone': company_phone,
'Company Website': company_website
})
if results:
result_file = f"results_{os.path.splitext(filename)[0]}.csv"
result_path = os.path.join(RESULT_FOLDER, result_file)
with open(result_path, 'w', newline='', encoding='utf-8-sig') as csvfile:
writer = csv.DictWriter(csvfile, fieldnames=[
'PLZ', 'Stadt', 'Straße', 'Hausnummer', 'Zusatz',
'Company Name', 'Company Address', 'Company Phone', 'Company Website'
])
writer.writeheader()
writer.writerows(results)
job.status = "Completed"
job.result_filename = result_file
db.session.commit()
else:
job.status = "Failed"
db.session.commit()