From 6320e80b5d1106a3d3832d74f2ef39e53db15d08 Mon Sep 17 00:00:00 2001
From: bpmcdevitt
Date: Thu, 17 Apr 2025 11:10:37 -0500
Subject: [PATCH] adjust script to group correctly

---
Review notes (this section is ignored by `git am`):
- The original patch also changed line 1 from `import requests` to
  `oimport requests`, a stray keystroke that makes the script fail with a
  SyntaxError. That hunk has been dropped; the diffstat below reflects it.
- The post-image blob id on the `index` line predates that fix.
- Line breaks and indentation were reconstructed from a whitespace-collapsed
  copy. Whitespace-only lines were placed from the hunk line counts; if they
  carry trailing spaces in the target file, apply with
  `git apply --ignore-whitespace`.

 madison_police_incident_report_getter.py | 44 +++++++++++-------------
 1 file changed, 21 insertions(+), 23 deletions(-)

diff --git a/madison_police_incident_report_getter.py b/madison_police_incident_report_getter.py
index a13eed8..7c7ab49 100755
--- a/madison_police_incident_report_getter.py
+++ b/madison_police_incident_report_getter.py
@@ -90,28 +90,24 @@ def extract_incident_data(html_content):
 
     return incident_data
 
+def get_date_from_published(published_str):
+    """Extract date from published string and format as YYYY-MM-DD."""
+    try:
+        # Parse the published date string
+        date_obj = datetime.datetime.strptime(published_str, '%a, %d %b %Y %H:%M:%S %Z')
+        return date_obj.strftime('%Y-%m-%d')
+    except Exception as e:
+        print(f"Error parsing published date: {e}")
+        # Fallback to current date
+        return datetime.datetime.now().strftime('%Y-%m-%d')
+
 def get_incident_file_path(incident_data):
-    """Determine the file path for an incident based on its data."""
-    # Extract date from published field or Incident Date field
-    date_str = None
+    """Determine the file path for an incident based on its published date."""
+    # Always use the published date for directory grouping
     if 'published' in incident_data:
-        try:
-            date_obj = datetime.datetime.strptime(incident_data['published'], '%a, %d %b %Y %H:%M:%S %Z')
-            date_str = date_obj.strftime('%Y-%m-%d')
-        except Exception:
-            pass
-
-    if not date_str and 'Incident Date' in incident_data:
-        try:
-            date_match = re.search(r'(\d{1,2}/\d{1,2}/\d{4})', incident_data['Incident Date'])
-            if date_match:
-                date_obj = datetime.datetime.strptime(date_match.group(1), '%m/%d/%Y')
-                date_str = date_obj.strftime('%Y-%m-%d')
-        except Exception:
-            pass
-
-    # Use current date as fallback
-    if not date_str:
+        date_str = get_date_from_published(incident_data['published'])
+    else:
+        # Fallback to current date if no published date
         date_str = datetime.datetime.now().strftime('%Y-%m-%d')
 
     # Create safe case number for filename
@@ -191,11 +187,13 @@ def save_incidents(incidents):
         print("No incidents to save")
         return
 
-    # Group incidents by date
+    # Group incidents by published date
     incidents_by_date = {}
     for incident in incidents:
-        file_path = get_incident_file_path(incident)
-        date_str = file_path.parent.name
+        if 'published' in incident:
+            date_str = get_date_from_published(incident['published'])
+        else:
+            date_str = datetime.datetime.now().strftime('%Y-%m-%d')
 
         if date_str not in incidents_by_date:
             incidents_by_date[date_str] = []