adjust script to group correctly
This commit is contained in:
parent
c319f0de2b
commit
6320e80b5d
1 changed files with 22 additions and 24 deletions
|
@ -1,4 +1,4 @@
|
|||
import requests
|
||||
oimport requests
|
||||
import feedparser
|
||||
from bs4 import BeautifulSoup
|
||||
import os
|
||||
|
@ -90,28 +90,24 @@ def extract_incident_data(html_content):
|
|||
|
||||
return incident_data
|
||||
|
||||
def get_date_from_published(published_str):
    """Return the calendar date of a ``published`` string as YYYY-MM-DD.

    The value is parsed as an RFC 822/2822 style date, e.g.
    ``'Mon, 01 Jan 2024 12:00:00 GMT'``. Both named zones (``GMT``, ``UTC``)
    and numeric offsets (``+0000``, ``-0500``) are accepted. The date is
    taken as written in the string; it is not converted to another zone.

    Args:
        published_str: The published date text to parse.

    Returns:
        The date formatted as ``YYYY-MM-DD``. If the value cannot be parsed,
        the error is printed and the current local date is returned instead,
        so callers always receive a usable grouping key.
    """
    # Local import: keeps this function self-contained without touching the
    # module-level import block.
    from email.utils import parsedate_to_datetime

    try:
        if not isinstance(published_str, str):
            raise TypeError(
                f"expected a date string, got {type(published_str).__name__}"
            )
        # strptime's %Z only recognises UTC/GMT/local zone names, which made
        # dates with numeric offsets fall through to "today". The RFC 2822
        # parser handles both spellings.
        date_obj = parsedate_to_datetime(published_str)
    except (TypeError, ValueError, IndexError) as e:
        # Older Python versions raise TypeError for unparseable text, newer
        # ones raise ValueError; either way this is a best-effort fallback.
        print(f"Error parsing published date: {e}")
        # Fallback to current date
        return datetime.datetime.now().strftime('%Y-%m-%d')

    return date_obj.strftime('%Y-%m-%d')
|
||||
|
||||
def get_incident_file_path(incident_data):
|
||||
"""Determine the file path for an incident based on its data."""
|
||||
# Extract date from published field or Incident Date field
|
||||
date_str = None
|
||||
"""Determine the file path for an incident based on its published date."""
|
||||
# Always use the published date for directory grouping
|
||||
if 'published' in incident_data:
|
||||
try:
|
||||
date_obj = datetime.datetime.strptime(incident_data['published'], '%a, %d %b %Y %H:%M:%S %Z')
|
||||
date_str = date_obj.strftime('%Y-%m-%d')
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
if not date_str and 'Incident Date' in incident_data:
|
||||
try:
|
||||
date_match = re.search(r'(\d{1,2}/\d{1,2}/\d{4})', incident_data['Incident Date'])
|
||||
if date_match:
|
||||
date_obj = datetime.datetime.strptime(date_match.group(1), '%m/%d/%Y')
|
||||
date_str = date_obj.strftime('%Y-%m-%d')
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
# Use current date as fallback
|
||||
if not date_str:
|
||||
date_str = get_date_from_published(incident_data['published'])
|
||||
else:
|
||||
# Fallback to current date if no published date
|
||||
date_str = datetime.datetime.now().strftime('%Y-%m-%d')
|
||||
|
||||
# Create safe case number for filename
|
||||
|
@ -191,11 +187,13 @@ def save_incidents(incidents):
|
|||
print("No incidents to save")
|
||||
return
|
||||
|
||||
# Group incidents by date
|
||||
# Group incidents by published date
|
||||
incidents_by_date = {}
|
||||
for incident in incidents:
|
||||
file_path = get_incident_file_path(incident)
|
||||
date_str = file_path.parent.name
|
||||
if 'published' in incident:
|
||||
date_str = get_date_from_published(incident['published'])
|
||||
else:
|
||||
date_str = datetime.datetime.now().strftime('%Y-%m-%d')
|
||||
|
||||
if date_str not in incidents_by_date:
|
||||
incidents_by_date[date_str] = []
|
||||
|
|
Loading…
Add table
Reference in a new issue