In [26]:
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
In [3]:
# Read the cloudburst event table from disk, then list its column labels
# (the bare last expression is what the cell displays).
DATA_PATH = '.data.csv'
df = pd.read_csv(DATA_PATH)
df.columns
Out[3]:
Index(['event_id', 'date', 'province', 'district', 'latitude', 'longitude',
'peak_rainfall_mm_per_hr', 'duration_min', 'area_km2', 'deaths',
'injuries', 'notes'],
dtype='object')
In [10]:
# Preview the first rows of the frame. The saved output also shows `year` and
# `month` because this cell was last executed after those columns were added
# further down (compare the execution counts).
df.head()
Out[10]:
| event_id | date | province | district | latitude | longitude | peak_rainfall_mm_per_hr | duration_min | area_km2 | deaths | injuries | notes | year | month | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2010-01-10_143 | 2010-01-10 | Punjab | Multan | 36.4547 | 68.6467 | 364.8 | 60 | 23.8 | 6 | 0 | Cloudburst in Multan, Punjab; short-duration e... | 2010 | 1 |
| 1 | 2010-01-15_117 | 2010-01-15 | Khyber Pakhtunkhwa | Kohat | 28.2284 | 69.7835 | 100.8 | 45 | 58.2 | 0 | 0 | Cloudburst in Kohat, Khyber Pakhtunkhwa; short... | 2010 | 1 |
| 2 | 2010-02-10_093 | 2010-02-10 | Punjab | Multan | 34.2131 | 67.3667 | 148.8 | 120 | 11.7 | 1 | 1 | Cloudburst in Multan, Punjab; short-duration e... | 2010 | 2 |
| 3 | 2010-04-07_118 | 2010-04-07 | Islamabad | Islamabad | 27.1652 | 66.6173 | 327.8 | 90 | 22.5 | 0 | 1 | Cloudburst in Islamabad, Islamabad; short-dura... | 2010 | 4 |
| 4 | 2010-04-27_087 | 2010-04-27 | Punjab | Multan | 29.3843 | 66.5263 | 192.1 | 180 | 29.6 | 0 | 1 | Cloudburst in Multan, Punjab; short-duration e... | 2010 | 4 |
Temporal Analysis: Are cloudbursts becoming more frequent?¶
In [5]:
# Convert the `date` column to pandas datetimes so the `.dt` accessor works on it.
parsed_dates = pd.to_datetime(df['date'])
df['date'] = parsed_dates
In [6]:
# Derive calendar year and month from the parsed dates for the groupings below.
date_parts = df['date'].dt
df['year'], df['month'] = date_parts.year, date_parts.month
In [21]:
def get_season(month):
    """Return the season label for a calendar month number.

    June-September map to 'Summer', December-February to 'Winter',
    March-May to 'Spring' and October-November to 'before winter'.
    Any value outside 1-12 yields 'Other'.
    """
    # Checked in order; the month groups do not overlap.
    season_months = (
        ('Summer', (6, 7, 8, 9)),
        ('Winter', (12, 1, 2)),
        ('Spring', (3, 4, 5)),
        ('before winter', (10, 11)),
    )
    for label, months in season_months:
        if month in months:
            return label
    return 'Other'
In [22]:
# Label every event with its season, derived element-wise from the month number.
df['season'] = df['month'].map(get_season)
In [86]:
# Number of events (rows) per year, per month and per season.
events_by_year, events_by_month, events_by_season = (
    df.groupby(key).size() for key in ('year', 'month', 'season')
)
Trend Analysis¶
In [14]:
# Line plot of the yearly event counts, to eyeball any trend in frequency.
fig, ax = plt.subplots(figsize=(10, 6))
events_by_year.plot(kind='line', marker='o', color='#4e79a7', linewidth=2, ax=ax)
ax.set_title('Cloudburst Events by Year', fontsize=14)
ax.set_xlabel('Year', fontsize=12)
ax.set_ylabel('Number of Events', fontsize=12)
ax.grid(True)
fig.tight_layout()
plt.show()
Seasonal Analysis¶
In [15]:
# Bar chart of event counts per calendar month.
MONTH_LABELS = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
                'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']

# Bars are drawn at positions 0..n-1 in index order, so the 12 fixed labels only
# line up if every month 1..12 is present. Reindex so that a month with no
# events becomes an explicit zero-height bar instead of shifting later labels.
monthly_counts = events_by_month.reindex(range(1, 13), fill_value=0)

fig, ax = plt.subplots(figsize=(10, 6))
monthly_counts.plot(kind='bar', color='#f28e2b', edgecolor='black', ax=ax)
ax.set_title('Cloudburst Events by Month', fontsize=14)
ax.set_xlabel('Month', fontsize=12)
ax.set_ylabel('Number of Events', fontsize=12)
ax.set_xticks(range(12))
ax.set_xticklabels(MONTH_LABELS, rotation=45)
ax.grid(axis='y')
fig.tight_layout()
plt.show()
In [25]:
# Bar chart of event counts per season label (bars follow the index order of
# `events_by_season`).
fig, ax = plt.subplots(figsize=(10, 6))
events_by_season.plot(kind='bar', color='#f28e2b', edgecolor='black', ax=ax)
ax.set_title('Cloudburst Events by Season', fontsize=14)
ax.set_xlabel('Season', fontsize=12)
ax.set_ylabel('Number of Events', fontsize=12)
ax.grid(axis='y')
fig.tight_layout()
plt.show()
In [85]:
# Per-year summary (mean, median, min, max) of both event duration and peak
# rainfall intensity. The aggregation yields two-level column labels; flatten
# them to "<column>_<stat>" so later cells can use plain string keys.
yearly_metrics = df.groupby('year')[['duration_min', 'peak_rainfall_mm_per_hr']]
rainfall_stats_by_year = yearly_metrics.agg(['mean', 'median', 'min', 'max'])
rainfall_stats_by_year.columns = [
    f'{column}_{stat}' for column, stat in rainfall_stats_by_year.columns
]
rainfall_stats_by_year.head()
Out[85]:
| duration_min_mean | duration_min_median | duration_min_min | duration_min_max | peak_rainfall_mm_per_hr_mean | peak_rainfall_mm_per_hr_median | peak_rainfall_mm_per_hr_min | peak_rainfall_mm_per_hr_max | |
|---|---|---|---|---|---|---|---|---|
| year | ||||||||
| 2010 | 84.230769 | 90.0 | 30 | 180 | 223.269231 | 214.30 | 100.8 | 364.8 |
| 2011 | 82.500000 | 75.0 | 30 | 180 | 242.875000 | 254.80 | 110.5 | 368.6 |
| 2012 | 84.375000 | 52.5 | 30 | 180 | 212.150000 | 198.25 | 50.0 | 322.0 |
| 2013 | 121.250000 | 120.0 | 45 | 180 | 209.483333 | 199.35 | 109.9 | 334.3 |
| 2014 | 87.500000 | 90.0 | 30 | 180 | 197.933333 | 196.65 | 60.8 | 365.8 |
In [74]:
# Yearly mean and median of each metric, one stacked panel per metric.
# Each spec: (mean column, mean label, mean colour,
#             median column, median label, median colour, title, y-axis label)
yearly_panels = [
    ('duration_min_mean', 'Mean Duration', '#4e79a7',
     'duration_min_median', 'Median Duration', '#76b7b2',
     'Cloudburst Duration by Year', 'Duration (Minutes)'),
    ('peak_rainfall_mm_per_hr_mean', 'Mean Peak Rainfall', '#f28e2b',
     'peak_rainfall_mm_per_hr_median', 'Median Peak Rainfall', '#e15759',
     'Peak Rainfall Intensity by Year', 'Peak Rainfall (mm/hr)'),
]

fig, axes = plt.subplots(2, 1, figsize=(12, 8))
years = rainfall_stats_by_year.index
for ax, spec in zip(axes, yearly_panels):
    (mean_col, mean_label, mean_color,
     median_col, median_label, median_color, title, ylabel) = spec
    ax.plot(years, rainfall_stats_by_year[mean_col],
            marker='o', color=mean_color, label=mean_label)
    ax.plot(years, rainfall_stats_by_year[median_col],
            marker='s', color=median_color, label=median_label)
    ax.set_title(title, fontsize=14)
    ax.set_xlabel('Year', fontsize=12)
    ax.set_ylabel(ylabel, fontsize=12)
    ax.legend()
    ax.grid(True)
fig.tight_layout()
plt.show()
In [77]:
# Total deaths and injuries per year, attached to the yearly rainfall summary
# (the join aligns the two frames on their shared `year` index).
impact_stats_by_year = df.groupby('year').agg({'deaths': 'sum', 'injuries': 'sum'})
combined_stats = rainfall_stats_by_year.join(impact_stats_by_year, how='left')
In [83]:
def _plot_metric_with_impact(position, metric_col, metric_label, metric_color, title):
    """Draw one yearly metric against yearly deaths and injuries.

    The metric from `combined_stats` goes on the left y-axis of the subplot at
    `position` (in a 2x1 grid of the current figure); deaths and injuries share
    a twin right y-axis. Returns the (primary, twin) axes pair.
    """
    years = combined_stats.index

    ax = plt.subplot(2, 1, position)
    ax.plot(years, combined_stats[metric_col], marker='o', color=metric_color, label=metric_label)
    ax.set_xlabel('Year', fontsize=12)
    ax.set_ylabel(metric_label, fontsize=12, color=metric_color)
    ax.tick_params(axis='y', labelcolor=metric_color)
    ax.grid(True)

    # Casualty counts live on their own scale, hence the twin axis.
    twin = ax.twinx()
    twin.plot(years, combined_stats['deaths'], marker='s', color='#e15759', label='Deaths')
    twin.plot(years, combined_stats['injuries'], marker='^', color='#76b7b2', label='Injuries')
    twin.set_ylabel('Deaths / Injuries', fontsize=12, color='#e15759')
    twin.tick_params(axis='y', labelcolor='#e15759')
    ax.set_title(title, fontsize=14)

    # One legend covering the lines from both axes.
    handles, labels = ax.get_legend_handles_labels()
    twin_handles, twin_labels = twin.get_legend_handles_labels()
    twin.legend(handles + twin_handles, labels + twin_labels, loc='upper left')
    return ax, twin


plt.figure(figsize=(12, 10))
ax1, ax1_twin = _plot_metric_with_impact(
    1, 'duration_min_mean', 'Mean Duration (min)', '#4e79a7',
    'Mean Cloudburst Duration vs. Deaths and Injuries by Year')
ax2, ax2_twin = _plot_metric_with_impact(
    2, 'peak_rainfall_mm_per_hr_mean', 'Mean Peak Rainfall (mm/hr)', '#f28e2b',
    'Mean Peak Rainfall vs. Deaths and Injuries by Year')
plt.tight_layout()
plt.show()
In [91]:
# Per-province event counts (non-null `event_id` values) and casualty totals.
events_by_province = df.groupby('province')['event_id'].count()
deaths_injury_province = df.groupby('province')[['deaths', 'injuries']].sum()
# Display the casualty totals; without this trailing expression the cell ends on
# an assignment and renders no output.
deaths_injury_province
Out[91]:
| deaths | injuries | |
|---|---|---|
| province | ||
| Balochistan | 16 | 40 |
| Gilgit-Baltistan | 17 | 52 |
| Islamabad | 21 | 33 |
| Khyber Pakhtunkhwa | 22 | 47 |
| Punjab | 16 | 36 |
| Sindh | 22 | 49 |
In [92]:
# One frame per province holding casualty totals plus the event count.
combined_province_stats = deaths_injury_province.join(events_by_province.rename('events'))

# Three stacked bar charts, one per metric: (column, colour, title, y-axis label).
province_panels = [
    ('events', '#4e79a7', 'Number of Cloudburst Events by Province', 'Number of Events'),
    ('deaths', '#e15759', 'Total Deaths by Province', 'Total Deaths'),
    ('injuries', '#76b7b2', 'Total Injuries by Province', 'Total Injuries'),
]

fig, axes = plt.subplots(3, 1, figsize=(12, 10))
for ax, (column, color, title, ylabel) in zip(axes, province_panels):
    combined_province_stats[column].plot(kind='bar', color=color, edgecolor='black', ax=ax)
    ax.set_title(title, fontsize=14)
    ax.set_xlabel('Province', fontsize=12)
    ax.set_ylabel(ylabel, fontsize=12)
    # Slant the province names so the long ones do not collide.
    plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
    ax.grid(axis='y')
fig.tight_layout()
plt.show()
In [93]:
# Grouped bar chart comparing events, deaths and injuries per province.
# DataFrame.plot opens its own figure unless it is handed an Axes, which left
# the figure created up front empty and the requested size unused. Create the
# figure and axes together and draw into them explicitly.
fig, ax = plt.subplots(figsize=(12, 8))
combined_province_stats[['events', 'deaths', 'injuries']].plot(
    kind='bar',
    color=['#4e79a7', '#e15759', '#76b7b2'],
    edgecolor='black',
    ax=ax,
)
ax.set_title('Cloudburst Events, Deaths, and Injuries by Province', fontsize=14)
ax.set_xlabel('Province', fontsize=12)
ax.set_ylabel('Count', fontsize=12)
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
ax.legend(['Events', 'Deaths', 'Injuries'])
ax.grid(axis='y')
fig.tight_layout()
plt.show()
<Figure size 1200x800 with 0 Axes>
In [94]:
# Side-by-side scatter plots of event count against each casualty total, one
# labelled point per province: (column, colour, title, y-axis label).
scatter_panels = [
    ('deaths', '#e15759', 'Cloudburst Events vs. Deaths by Province', 'Total Deaths'),
    ('injuries', '#76b7b2', 'Cloudburst Events vs. Injuries by Province', 'Total Injuries'),
]

fig, axes = plt.subplots(1, 2, figsize=(12, 6))
event_counts = combined_province_stats['events']
for ax, (column, color, title, ylabel) in zip(axes, scatter_panels):
    totals = combined_province_stats[column]
    ax.scatter(event_counts, totals, color=color, s=100)
    # Annotate each point with its province name.
    for province, x, y in zip(combined_province_stats.index, event_counts, totals):
        ax.text(x, y, province, fontsize=9)
    ax.set_title(title, fontsize=14)
    ax.set_xlabel('Number of Events', fontsize=12)
    ax.set_ylabel(ylabel, fontsize=12)
    ax.grid(True)
fig.tight_layout()
plt.show()
In [ ]:
In [97]:
# Normalize the data to compare on the same scale: each metric is divided by
# its own maximum across provinces, so the top province per metric scores 1.0.
metric_columns = ['events', 'deaths', 'injuries']
normalized_stats = combined_province_stats.assign(
    **{
        column: combined_province_stats[column] / combined_province_stats[column].max()
        for column in metric_columns
    }
)
normalized_stats
Out[97]:
| deaths | injuries | events | |
|---|---|---|---|
| province | |||
| Balochistan | 0.727273 | 0.769231 | 1.000000 |
| Gilgit-Baltistan | 0.772727 | 1.000000 | 0.862069 |
| Islamabad | 0.954545 | 0.634615 | 0.758621 |
| Khyber Pakhtunkhwa | 1.000000 | 0.903846 | 0.862069 |
| Punjab | 0.727273 | 0.692308 | 0.758621 |
| Sindh | 1.000000 | 0.942308 | 0.931034 |
In [96]:
# Heatmap of the max-scaled metrics: rows are provinces, columns are metrics,
# and each cell is annotated with its normalized value.
fig, ax = plt.subplots(figsize=(10, 6))
sns.heatmap(
    normalized_stats[['events', 'deaths', 'injuries']],
    annot=True,
    cmap='YlOrRd',
    cbar_kws={'label': 'Normalized Value'},
    ax=ax,
)
ax.set_title('Normalized Cloudburst Events, Deaths, and Injuries by Province', fontsize=14)
ax.set_xlabel('Metric', fontsize=12)
ax.set_ylabel('Province', fontsize=12)
fig.tight_layout()
plt.show()
In [ ]:
In [ ]:
In [ ]: