adjust script to group correctly

This commit is contained in:
Brendan McDevitt 2025-04-17 11:10:37 -05:00
parent c319f0de2b
commit 6320e80b5d

View file

@ -1,4 +1,4 @@
import requests
import requests
import feedparser
from bs4 import BeautifulSoup
import os
@ -90,28 +90,24 @@ def extract_incident_data(html_content):
return incident_data
def get_date_from_published(published_str):
    """Extract the date from a feed ``published`` string as YYYY-MM-DD.

    The value is parsed as an RFC 2822 style date such as
    ``'Thu, 17 Apr 2025 11:10:37 GMT'`` or
    ``'Thu, 17 Apr 2025 11:10:37 -0500'``. The calendar date is taken
    exactly as written in the string; it is not converted to another
    time zone.

    Args:
        published_str: The published timestamp text of a feed entry.

    Returns:
        The date formatted as ``YYYY-MM-DD``. If the string cannot be
        parsed, the error is printed and the current local date is
        returned instead.
    """
    # Imported locally so this function does not depend on a new
    # module-level import being present.
    from email.utils import parsedate_to_datetime

    try:
        # strptime's %Z only recognises UTC/GMT and the local zone names,
        # so numeric offsets like "+0000" used to fall through to the
        # current-date fallback. The RFC 2822 parser accepts both forms
        # and does not depend on the process locale.
        date_obj = parsedate_to_datetime(published_str)
    except (TypeError, ValueError, AttributeError) as e:
        print(f"Error parsing published date: {e}")
        # Fallback to current date
        return datetime.datetime.now().strftime('%Y-%m-%d')
    return date_obj.strftime('%Y-%m-%d')
def get_incident_file_path(incident_data):
"""Determine the file path for an incident based on its data."""
# Extract date from published field or Incident Date field
date_str = None
"""Determine the file path for an incident based on its published date."""
# Always use the published date for directory grouping
if 'published' in incident_data:
try:
date_obj = datetime.datetime.strptime(incident_data['published'], '%a, %d %b %Y %H:%M:%S %Z')
date_str = date_obj.strftime('%Y-%m-%d')
except Exception:
pass
if not date_str and 'Incident Date' in incident_data:
try:
date_match = re.search(r'(\d{1,2}/\d{1,2}/\d{4})', incident_data['Incident Date'])
if date_match:
date_obj = datetime.datetime.strptime(date_match.group(1), '%m/%d/%Y')
date_str = date_obj.strftime('%Y-%m-%d')
except Exception:
pass
# Use current date as fallback
if not date_str:
date_str = get_date_from_published(incident_data['published'])
else:
# Fallback to current date if no published date
date_str = datetime.datetime.now().strftime('%Y-%m-%d')
# Create safe case number for filename
@ -191,11 +187,13 @@ def save_incidents(incidents):
print("No incidents to save")
return
# Group incidents by date
# Group incidents by published date
incidents_by_date = {}
for incident in incidents:
file_path = get_incident_file_path(incident)
date_str = file_path.parent.name
if 'published' in incident:
date_str = get_date_from_published(incident['published'])
else:
date_str = datetime.datetime.now().strftime('%Y-%m-%d')
if date_str not in incidents_by_date:
incidents_by_date[date_str] = []