Milton Crash Data Analysis
[1]:
import pandas as pd
import folium
import geopandas as gpd
from shapely.geometry import Point
import matplotlib.pyplot as plt
import contextily as ctx
import milton_maps as mm
import seaborn as sns
import logging
from milton_maps.process_crash_data import *
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("notebook")
# Suppress warnings in notebook output
import warnings
warnings.filterwarnings('ignore')
/Users/alexhasha/Library/Caches/pypoetry/virtualenvs/milton-maps-gfMaDXEA-py3.11/lib/python3.11/site-packages/geopandas/_compat.py:123: UserWarning: The Shapely GEOS version (3.10.3-CAPI-1.16.1) is incompatible with the GEOS version PyGEOS was compiled with (3.10.4-CAPI-1.16.2). Conversions between both will be slow.
warnings.warn(
[2]:
milton_boundaries = get_milton_boundaries()
[3]:
crash_geodf = get_crash_data()
INFO:process_crash_data:Found 2173 records missing coordinates, and will be dropped
INFO:process_crash_data:Found 555 records outside Milton were dropped by the geoclip.
[4]:
crash_geodf.columns
[4]:
Index(['Crash_Number', 'City_Town_Name', 'Crash_Date', 'Crash_Time',
'Crash_Severity', 'Maximum_Injury_Severity_Reported',
'Number_of_Vehicles', 'Total_Nonfatal_Injuries', 'Total_Fatal_Injuries',
'Manner_of_Collision', 'Vehicle_Action_Prior_to_Crash',
'Vehicle_Travel_Directions', 'Most_Harmful_Events',
'Vehicle_Configuration', 'Road_Surface_Condition', 'Ambient_Light',
'Weather_Condition', 'At_Roadway_Intersection',
'Distance_From_Nearest_Roadway_Intersection',
'Distance_From_Nearest_Milemarker', 'Distance_From_Nearest_Exit',
'Distance_From_Nearest_Landmark', 'Non_Motorist_Type', 'X_Cooordinate',
'Y_Cooordinate', 'Crash_DateTime', 'year', 'severity', 'geometry'],
dtype='object')
Visualize crash locations to help verify filtering logic
[5]:
fig, ax = plt.subplots(1, figsize=(8,8))
# Tighten the bounding box around the data locations
ax = mm.plot_map(crash_geodf,
column="Maximum_Injury_Severity_Reported",
categorical=True,
markersize=30,
alpha=0.75,
ax=ax,
)
# Move legend to the bottom right
ax.get_legend().set_bbox_to_anchor((.75, 0.5))
ctx.add_basemap(ax, crs="EPSG:26986")
Question: How to identify accidents along Randolph Avenue?
Unsuccessfull approach: filtering on tabular metadata
[6]:
crash_geodf["At_Roadway_Intersection"].value_counts()
[6]:
At_Roadway_Intersection
RANDOLPH AVENUE / CHICKATAWBUT ROAD 81
GRANITE AVENUE / SQUANTUM STREET 59
BROOK ROAD / CENTRE STREET 40
GRANITE AVENUE / ADAMS STREET 31
RANDOLPH AVENUE / BROOK ROAD 28
..
BLUE HILL AVENUE / BLUE HILL AVENUE / BRUSH HILL ROAD 0
RANDOLPH AVENUE / REEDSDALE ROAD Rte / Randolph Ave Rte 28 N / Reedsdale Rd Rte 0
Rte 138 N / BLUE HILL AVENUE 0
RANDOLPH AVENUE / REEDSDALE ROAD Rte / Randolph Ave Rte N / Reedsdale Rd Rte 0
/ 0
Name: count, Length: 2204, dtype: int64
[7]:
crash_geodf.loc[
crash_geodf["At_Roadway_Intersection"].str.lower().str.contains("randolph").fillna(False),
"At_Roadway_Intersection"
].value_counts()
[7]:
At_Roadway_Intersection
RANDOLPH AVENUE / CHICKATAWBUT ROAD 81
RANDOLPH AVENUE / BROOK ROAD 28
RANDOLPH AVENUE Rte 28 N / CHICKATAWBUT ROAD 26
RANDOLPH AVENUE / REEDSDALE ROAD 23
RANDOLPH AVENUE Rte 28 / CHICKATAWBUT ROAD 22
..
BROOK ROAD / FAIRFAX ROAD 0
BROOK ROAD / DUDLEY LANE 0
BROOK ROAD / CHURCHILL LANE 0
BROOK ROAD / CENTRE STREET / CENTRE ST / BROOK RD 0
\ / GRANITE AVENUE / GRANITE AVENUE 0
Name: count, Length: 2204, dtype: int64
[8]:
crash_geodf.shape
[8]:
(11602, 29)
[9]:
# Filter to crashes that occured on randolph using the pattern above.
randolph_crashes = crash_geodf.loc[crash_geodf["At_Roadway_Intersection"].str.lower().str.contains("randolph").fillna(False), :]
randolph_crashes.shape
[9]:
(805, 29)
[10]:
non_randolph_crashes = crash_geodf.loc[~crash_geodf["At_Roadway_Intersection"].str.lower().str.contains("randolph").fillna(False), :]
non_randolph_crashes.shape
[10]:
(10797, 29)
[11]:
# Plot randolph crashses in red and non-randolph crashes in blue
fig, ax = plt.subplots(1, figsize=(20,20))
non_randolph_crashes.plot(
ax=ax,
color="blue",
markersize=30,
)
randolph_crashes.plot(
ax=ax,
color="red",
markersize=20,
)
#Set plot title to "We cannot use the tabular data as a filter due to missing values"
#increase the size of the title to 20
ax.set_title("We cannot use the tabular data as a filter due to missing values", fontsize=20)
plt.show()
Successful approach: create bounding box from MassDOT road geometries.
[12]:
randolph_ave = get_randolph_ave_shape()
[13]:
intersection_crashes, upstream_crashes, randolph_ave_crashes = randolph_ave_upstream_vs_intersection(crash_geodf, randolph_ave)
other_crashes = crash_geodf.loc[~crash_geodf.index.isin(set(randolph_ave_crashes.index))]
INFO:process_crash_data:Intersection crashes shape: (413, 29)
INFO:process_crash_data:Upstream crashes shape: (767, 29)
[14]:
# Plot randolph crashses in red and non-randolph crashes in blue
fig, ax = plt.subplots(1, figsize=(7,7))
other_crashes.plot(
ax=ax,
color="blue",
markersize=10,
)
upstream_crashes.plot(
ax=ax,
color="yellow",
markersize=20,
)
intersection_crashes.plot(
ax=ax,
color="red",
markersize=20,
)
randolph_ave.plot(
ax=ax,
color="green",
)
#Set plot title to "We cannot use the tabular data as a filter due to missing values"
#increase the size of the title to 20
ax.set_title("We distinguish between intersection and upstream crashes with another buffer filter", fontsize=12)
ctx.add_basemap(ax=ax, crs="EPSG:26986")
plt.show()
Make an interactive map of Randolph Ave crashes with popups giving more details on the crash
[30]:
def popup_text(row):
result = f"""
Direction of Travel: {row['Vehicle_Travel_Directions']}<br>
Year: {row['year']}<br>
Time of Day: {row['Crash_Time']}<br>
Severity: {row['severity']}<br>
Where: {row['where']}<br>
"""
return result
[31]:
randolph_ave_crashes["severity"].value_counts(ascending=True)
[31]:
severity
Fatal Injury 7
Major Injury 36
Unknown 77
Minor Injury 454
No Injury 606
Name: count, dtype: int64
[32]:
# Create an interactive Folium Map with the crash points plotted and a popup with the direction of travel, time of day, and severity of the crash. Color the points by severity of the crash
m = folium.Map(location=[42.224225, -71.070639], zoom_start=18)
# Create a version of randolph_ave_crashes that uses a CRS in latitude and longitude
randolph_ave_crashes_latlon = randolph_ave_crashes.to_crs("EPSG:4326")
injury_crashes = randolph_ave_crashes_latlon.loc[
randolph_ave_crashes_latlon["severity"].isin(
["Fatal Injury", "Major Injury", "Minor Injury"]), :]
colormap = {
"Fatal Injury": "red",
"Major Injury": "yellow",
"Minor Injury": "blue",
}
for _, row in injury_crashes.sort_values(by="severity", ascending=False).iterrows():
popup_content = folium.Popup(popup_text(row), max_width=300) # Adjust max_width as needed
folium.CircleMarker(
location=[row["geometry"].y, row["geometry"].x],
radius=5,
popup=popup_content,
color=colormap[row["severity"]],
fill=True,
fill_color="yellow",
fill_opacity=0.5,
).add_to(m)
display(m)
Make this Notebook Trusted to load map: File -> Trust Notebook
Create a structured dataframe of intersection vs upstream crashes
[ ]:
[18]:
intersection_crashes["where"] = "intersection"
upstream_crashes["where"] = "upstream"
randolph_ave_crashes = gpd.GeoDataFrame( pd.concat([intersection_crashes, upstream_crashes], ignore_index=True) )
[19]:
# Use seaborn to plot number of crashes by year colored by "where"
sns.set_theme(style="whitegrid")
sns.set_palette(["#2d4059", "#ffd460"])
ax = sns.countplot(x="year", hue="where", data=randolph_ave_crashes)
ax.set_title("Crashes by year")
ax.set_ylabel("Number of crashes")
ax.set_xlabel("Year")
# Rotate x axis labels 45 degrees
plt.xticks(rotation=45)
leg = ax.legend()
leg.set_title("Crash Location")
leg.set_bbox_to_anchor((1.02, .6))
plt.show()
[20]:
# Apply the following color palette to the plot below:
# intersection: #2d4059
# upstream: #ffd460
sns.set_theme(style="whitegrid")
sns.set_palette(["#2d4059", "#ffd460"])
_data = randolph_ave_crashes.loc[randolph_ave_crashes["severity"].isin(["Major Injury", "Fatal Injury"])]
ax = sns.countplot(x="year", hue="where", data=_data)
ax.set_title("Major Crashes by year")
ax.set_ylabel("Number of crashes")
ax.set_xlabel("Year")
# Rotate x axis labels 45 degrees
plt.xticks(rotation=45)
leg = ax.legend()
leg.set_title("Crash Location")
leg.set_bbox_to_anchor((1.02, .6))
plt.show()
[21]:
# Use seaborn to plot number of crashes by severity, colored by "where", over the last 4 years
sns.set_theme(style="whitegrid")
ax = sns.countplot(x="severity", hue="where", data=randolph_ave_crashes[randolph_ave_crashes.year >= 2015])
[22]:
randolph_ave_crashes[randolph_ave_crashes.year >= 2010].groupby(["severity", "where"]).size()
[22]:
severity where
Fatal Injury intersection 2
upstream 4
Major Injury intersection 5
upstream 16
Minor Injury intersection 130
upstream 171
No Injury intersection 135
upstream 250
Unknown intersection 15
upstream 27
dtype: int64
[23]:
randolph_ave_crashes.columns
[23]:
Index(['Crash_Number', 'City_Town_Name', 'Crash_Date', 'Crash_Time',
'Crash_Severity', 'Maximum_Injury_Severity_Reported',
'Number_of_Vehicles', 'Total_Nonfatal_Injuries', 'Total_Fatal_Injuries',
'Manner_of_Collision', 'Vehicle_Action_Prior_to_Crash',
'Vehicle_Travel_Directions', 'Most_Harmful_Events',
'Vehicle_Configuration', 'Road_Surface_Condition', 'Ambient_Light',
'Weather_Condition', 'At_Roadway_Intersection',
'Distance_From_Nearest_Roadway_Intersection',
'Distance_From_Nearest_Milemarker', 'Distance_From_Nearest_Exit',
'Distance_From_Nearest_Landmark', 'Non_Motorist_Type', 'X_Cooordinate',
'Y_Cooordinate', 'Crash_DateTime', 'year', 'severity', 'geometry',
'where'],
dtype='object')
[24]:
randolph_ave_crashes.loc[
randolph_ave_crashes.severity == "Fatal Injury",
["Crash_DateTime", "where", "Crash_Date", "Crash_Time", "Manner_of_Collision", "Number_of_Vehicles", "Total_Fatal_Injuries", "Ambient_Light", "Weather_Condition"]
].sort_values(by="Crash_DateTime", ascending=False).drop(columns=["Crash_DateTime"])
[24]:
where | Crash_Date | Crash_Time | Manner_of_Collision | Number_of_Vehicles | Total_Fatal_Injuries | Ambient_Light | Weather_Condition | |
---|---|---|---|---|---|---|---|---|
3 | intersection | 15-Jul-2022 | 10:30 PM | Not reported | 2 | 1 | Not reported | Not Reported |
870 | upstream | 31-Aug-2017 | 8:42 AM | Angle | 2 | 1 | Daylight | Clear |
772 | upstream | 25-Apr-2017 | 4:03 PM | Head-on | 2 | 1 | Daylight | Rain/Cloudy |
204 | intersection | 31-May-2014 | 1:49 AM | Single vehicle crash | 1 | 1 | Dark - lighted roadway | Clear/Clear |
678 | upstream | 28-Nov-2013 | 7:55 AM | Single vehicle crash | 1 | 1 | Daylight | Clear |
731 | upstream | 01-Nov-2012 | 6:22 PM | Single vehicle crash | 1 | 1 | Dark - lighted roadway | Clear/Unknown |
919 | upstream | 15-Feb-2005 | 12:05 PM | Sideswipe, opposite direction | 2 | 1 | Daylight | Clear |
[25]:
# From 2014 to 2016 there were 54 crashes at this intersection. Of those 54, 28 involved a fatality or injury.
intersection_crashes.loc[
(intersection_crashes.year >= 2014) &
(intersection_crashes.year <= 2016)].shape[0]
[25]:
(64, 30)
[26]:
intersection_crashes.loc[
(intersection_crashes.year >= 2014) &
(intersection_crashes.year <= 2016), "severity"].value_counts()
[26]:
severity
Minor Injury 30
No Injury 30
Unknown 3
Fatal Injury 1
Name: count, dtype: int64
Scratch
[ ]:
[ ]:
[27]:
# Expand the axes of this plot to match the range of data in crash_geodf2
fig, ax = plt.subplots(1, figsize=(20,20))
randolph_ave_crashes.loc[
(randolph_ave_crashes.year >= 2010) & (randolph_ave_crashes.severity.isin(["Fatal Injury", "Major Injury"])),
].plot(
ax=ax,
column="severity",
categorical=True,
markersize=30,
alpha=0.5,
cmap = "flag"
)
# x and y range in crash_geodf
xmin = crash_geodf2.geometry.x.min()
xmax = crash_geodf2.geometry.x.max()
ymin = crash_geodf2.geometry.y.min()
ymax = crash_geodf2.geometry.y.max()
ax.set_xlim(xmin, xmax)
ax.set_ylim(ymin, ymax)
ctx.add_basemap(ax, crs="EPSG:26986")
plt.show()
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
/Users/alexhasha/repos/milton_maps/notebooks/crash_analysis.ipynb Cell 38 line 1
<a href='vscode-notebook-cell:/Users/alexhasha/repos/milton_maps/notebooks/crash_analysis.ipynb#Z1450sZmlsZQ%3D%3D?line=2'>3</a> randolph_ave_crashes.loc[
<a href='vscode-notebook-cell:/Users/alexhasha/repos/milton_maps/notebooks/crash_analysis.ipynb#Z1450sZmlsZQ%3D%3D?line=3'>4</a> (randolph_ave_crashes.year >= 2010) & (randolph_ave_crashes.severity.isin(["Fatal Injury", "Major Injury"])),
<a href='vscode-notebook-cell:/Users/alexhasha/repos/milton_maps/notebooks/crash_analysis.ipynb#Z1450sZmlsZQ%3D%3D?line=4'>5</a> ].plot(
(...)
<a href='vscode-notebook-cell:/Users/alexhasha/repos/milton_maps/notebooks/crash_analysis.ipynb#Z1450sZmlsZQ%3D%3D?line=10'>11</a> cmap = "flag"
<a href='vscode-notebook-cell:/Users/alexhasha/repos/milton_maps/notebooks/crash_analysis.ipynb#Z1450sZmlsZQ%3D%3D?line=11'>12</a> )
<a href='vscode-notebook-cell:/Users/alexhasha/repos/milton_maps/notebooks/crash_analysis.ipynb#Z1450sZmlsZQ%3D%3D?line=12'>13</a> # x and y range in crash_geodf
---> <a href='vscode-notebook-cell:/Users/alexhasha/repos/milton_maps/notebooks/crash_analysis.ipynb#Z1450sZmlsZQ%3D%3D?line=13'>14</a> xmin = crash_geodf2.geometry.x.min()
<a href='vscode-notebook-cell:/Users/alexhasha/repos/milton_maps/notebooks/crash_analysis.ipynb#Z1450sZmlsZQ%3D%3D?line=14'>15</a> xmax = crash_geodf2.geometry.x.max()
<a href='vscode-notebook-cell:/Users/alexhasha/repos/milton_maps/notebooks/crash_analysis.ipynb#Z1450sZmlsZQ%3D%3D?line=15'>16</a> ymin = crash_geodf2.geometry.y.min()
NameError: name 'crash_geodf2' is not defined
[ ]:
import numpy as np
# Create a new dataframe for crashes from 2010 and with relevant 'severity' levels
filtered_crashes = randolph_ave_crashes.loc[
(randolph_ave_crashes.year >= 2010)
]
# Convert geometry points to a NumPy array (needed for scikit-learn)
coords = filtered_crashes.geometry.apply(lambda p: [p.x, p.y]).to_list()
coords = np.array(coords)
[ ]:
min(coords[:,0])
[ ]:
from scipy.spatial import KDTree
import numpy as np
import matplotlib.pyplot as plt
# Example coords array; replace with your actual data
# coords = np.array([[x1, y1], [x2, y2], ...])
# Create a KDTree for efficient nearest neighbor search
kdtree = KDTree(coords)
# Define your curve parameterization here
# For example, let's assume a simple linear parameterization along the x-axis:
curve_y = np.linspace(min(coords[:,1]), max(coords[:,1]), 100) # 100 equidistant points
# Initialize an array to hold counts of nearest points to each interval center
counts = np.zeros(curve.shape)
# Calculate the nearest crash point for each point along the curve
for i, x in enumerate(curve):
# Query the KDTree for the nearest point
distance, index = kdtree.query([x, 0]) # replace 0 with the corresponding y value on the curve
# Increment the count for the nearest point
counts[i] += 1
# Now 'counts' contains the number of times each point along the curve was the nearest point to a crash
# To visualize, let's plot the curve and color it based on 'counts'
plt.scatter(curve, np.zeros(curve.shape), c=counts, cmap='hot')
plt.colorbar(label='Number of nearest crashes')
plt.show()
[ ]:
from shapely.ops import linemerge
# Combine the LineStrings into a single MultiLineString
multi_line = linemerge(randolph_ave.geometry.tolist())
[ ]:
# Number of points you want along the curve
num_points = 100
# Calculate the length of the merged line
line_length = multi_line.length
# Calculate equally spaced distances along the line
distances = np.linspace(0, line_length, num_points)
# Create a list to hold the equally spaced points
equally_spaced_points = []
for distance in distances:
# Use the `interpolate` method to find the point at each distance
point = multi_line.interpolate(distance)
equally_spaced_points.append(point)
[ ]:
equally_spaced_points
[ ]:
from scipy.spatial import KDTree
import numpy as np
# Create a KDTree from the equally spaced points
equally_spaced_coords = [(point.x, point.y) for point in equally_spaced_points]
kdtree = KDTree(equally_spaced_coords)
# Initialize an array to hold counts of nearest road points for each crash
counts = np.zeros(len(equally_spaced_points))
# Query KDTree for each crash point
crash_coords = filtered_crashes.geometry.apply(lambda p: (p.x, p.y)).tolist()
for coord in crash_coords:
distance, index = kdtree.query(coord)
counts[index] += 1
# Now 'counts' contains the number of times each equally spaced point along the road
# is the nearest point to a crash.
# You can visualize this data as needed.
[ ]:
counts
[ ]:
np.log10(counts+1)
[ ]:
# Scatter plot coords on a map, with markersize scaled by counts
fig, ax = plt.subplots(1, figsize=(20,20))
ax.scatter(
x=[p.x for p in equally_spaced_points],
y=[p.y for p in equally_spaced_points],
s=np.log(counts+1)*30,
alpha=0.5,
color="red",
)
# Set equal aspect ratio
ax.set_aspect('equal', 'box')
[ ]:
index
[ ]:
crash_geodf['crash_time'] = pd.to_datetime(crash_geodf['Crash Date'] + " " + crash_geodf['Crash Time'])
[ ]:
crash_geodf.info()
[ ]:
roads_df = gpd.read_file("../data/raw/MassDOT_Roads_SHP/EOTMAJROADS_RTE_MAJOR.shp")
[ ]:
ax = roads_df[roads_df.RT_NUMBER=="28"].plot()
ctx.add_basemap(ax, crs="EPSG:26986")
[ ]:
rt28 = roads_df[roads_df.RT_NUMBER=="28"]
milton_rt_28 = gpd.clip(rt28, milton_boundaries)
ax = milton_rt_28.plot()
ctx.add_basemap(ax, crs="EPSG:26986")
[ ]:
[ ]:
df_nearest_major_road = gpd.sjoin_nearest(crash_geodf, roads_df, how="left", distance_col="distance_to_major_road")
[ ]:
df_nearest_major_road.info()
[ ]:
rt_28_crashes = df_nearest_major_road[(df_nearest_major_road.RT_NUMBER=="28") & (df_nearest_major_road.distance_to_major_road < 10.0)]
rt_28_crashes
[ ]:
ax = rt_28_crashes.plot(column="severity")
ctx.add_basemap(ax, crs="EPSG:26986")
[ ]:
rt_28_crashes['month_year'] = rt_28_crashes['crash_time'].dt.to_period('M')
[ ]:
df = rt_28_crashes.groupby(["month_year", "severity"])['RMV Crash Number'].count().reset_index()
df
[ ]:
#df.columns
accidents_over_time = df.pivot(index='month_year', columns='severity', values='RMV Crash Number').fillna(0)
[ ]:
accidents_over_time.plot.area()
[ ]:
milton_roads_df = gpd.read_file("../data/raw/MassDOT_Roads_SHP/EOTROADS_ARC.shp", mask=milton_boundaries)
ax = milton_roads_df.plot()
ctx.add_basemap(ax,crs="EPSG:26986")
[ ]:
milton_roads_df.head()
[ ]:
chickatawbut_road = milton_roads_df[milton_roads_df.STREETNAME=="CHICKATAWBUT ROAD"].dissolve()
chickatawbut_road.plot()
[ ]:
chickatawbut_road.dissolve().plot()
[ ]:
randolph_ave = milton_roads_df[milton_roads_df.STREETNAME=="RANDOLPH AVENUE"].dissolve()
randolph_ave.plot()
[ ]:
pd.concat([randolph_ave, chickatawbut_road]).plot()
[ ]:
randolph_chickatawbut_intsct = randolph_ave.geometry.intersection(chickatawbut_road.geometry).buffer(50)
ax = pd.concat([randolph_ave, chickatawbut_road]).plot()
ax = randolph_chickatawbut_intsct.plot(ax=ax)
ctx.add_basemap(ax, crs="EPSG:26986")
[ ]:
ax = randolph_chickatawbut_intsct.plot()
ctx.add_basemap(ax, crs="EPSG:26986", zoom=12)
[ ]:
randolph_chickatawbut_intsct
[ ]:
193.18/647
[ ]: