import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import geopandas as gpd
import os
import seaborn as sns

# Load a filtered FDIC dataset and USA map shapefile
banks = pd.read_csv('/content/drive/MyDrive/filtered_institutions.csv', encoding='utf-8')

usmap = gpd.read_file("/content/drive/MyDrive/tl_2022_us_state/tl_2022_us_state.shp")
usmap = usmap.to_crs("EPSG:5070")


non_continental = ['HI','VI','MP','GU','AK','AS','PR']
usa = usmap
for n in non_continental:
    usa = usa[usa.STUSPS != n]

usa.boundary.plot()
plt.show()


# Select relevant columns from banks dataframe
bank_counts = banks['STNAME'].value_counts()
bank_counts = bank_counts.reset_index()
bank_counts.rename(columns={'index': 'STNAME', 'STNAME': 'COUNT'}, inplace=True)

# Merge the US map with the DataFrame containing the counts of banks
merged_geo_data = usa.merge(bank_counts, left_on='NAME', right_on='STNAME', how='left')

# Plot the US map with the counts of banks as a choropleth map
fig, ax = plt.subplots(1, 1, figsize=(16, 8))
merged_geo_data.plot(column='COUNT', cmap='Oranges', linewidth=0.8, ax=ax, edgecolor='0.8', legend=True)

area_threshold = merged_geo_data['geometry'].area.quantile(0.10)

for idx, row in merged_geo_data.iterrows():
    # Only add text for states larger than the area threshold
    if row.geometry.area > area_threshold:
      centroid = row.geometry.centroid
      plt.text(centroid.x, centroid.y, row['STUSPS'], ha='center', va='center', fontsize=6)

ax.set_title('Number of ever registered Banks by State')

plt.show()


# Calculate the count of active and inactive institutions
active_count = banks['ACTIVE'].sum()
inactive_count = len(banks) - active_count

# Create a donut-like chart using matplotlib
plt.figure(figsize=(5, 5))
labels = ['Active', 'Inactive']
sizes = [active_count, inactive_count]
colors = ['#467AB9', '#B98546']

# Function to format label with absolute numbers and percentages
def func(pct, sizes):
    absolute = int(pct/100.*sum(sizes))
    return f"{absolute:d}\n({pct:.1f}%)"

# Plotting the pie chart
plt.pie(sizes, labels=labels, colors=colors, startangle=90, autopct=lambda pct: func(pct, sizes))

# Draw a circle at the center to make it a donut chart
centre_circle = plt.Circle((0,0),0.70,fc='white')
fig = plt.gcf()
fig.gca().add_artist(centre_circle)

# Equal aspect ratio ensures that pie is drawn as a circle
plt.title('Comparison of active vs. inactive Banks')
plt.axis('equal')
plt.show()


# Filter out only 'ACTIVE' and 'STNAME' columns (1 means that bank is active)
banks_filtered = banks[['STNAME', 'ACTIVE']]

# Aggregate the number of active and inactive banks by state
bank_status_counts = banks_filtered.groupby(['STNAME', 'ACTIVE']).size().unstack(fill_value=0).reset_index()

# Merge the data
merged_geo_data = usa.merge(bank_status_counts, left_on='NAME', right_on='STNAME', how='left')

# Determine an area threshold
area_threshold = merged_geo_data['geometry'].area.quantile(0.25)  # Adjust the quantile as needed

# Plotting, corrected for integer column access with state abbreviations for larger states
fig, ax = plt.subplots(1, 2, figsize=(20, 10), sharex=True, sharey=True)

# Plot active banks
merged_geo_data.plot(column=1, cmap='Greens', linewidth=0.8, ax=ax[0], edgecolor='0.8', legend=True)
ax[0].set_title('Active Banks by State')
for idx, row in merged_geo_data.iterrows():
    if row.geometry.area > area_threshold:
        centroid = row.geometry.centroid
        ax[0].text(centroid.x, centroid.y, row['STUSPS'], ha='center', va='center', fontsize=8)
ax[0].axis('off')

# Plot inactive banks
merged_geo_data.plot(column=0, cmap='Reds', linewidth=0.8, ax=ax[1], edgecolor='0.8', legend=True)
ax[1].set_title('Inactive Banks by State')
for idx, row in merged_geo_data.iterrows():
    if row.geometry.area > area_threshold:
        centroid = row.geometry.centroid
        ax[1].text(centroid.x, centroid.y, row['STUSPS'], ha='center', va='center', fontsize=8)
ax[1].axis('off')

plt.suptitle('Comparison of active vs. inactive Banks by State', fontsize=16)
plt.show()


# Calculate the ratio of active to inactive banks for each state
bank_status_counts['Inactive/Active Ratio'] = bank_status_counts[0] / bank_status_counts[1]

# Replace any potential infinite values with NaN in case of 0 active banks
bank_status_counts.replace([np.inf, -np.inf], np.nan, inplace=True)

# Merge the geographic data with the bank status counts, including the new ratio
merged_geo_data_ratio = usa.merge(bank_status_counts, left_on='NAME', right_on='STNAME', how='left')

# Plotting the ratio of active to inactive banks by state with state abbreviations for larger states only
fig, ax = plt.subplots(1, 1, figsize=(16, 8))
merged_geo_data_ratio.plot(column='Inactive/Active Ratio', cmap='coolwarm', linewidth=0.8, ax=ax, edgecolor='0.8', legend=True)

# Determine an area threshold below which the state abbreviation will not be displayed
area_threshold = merged_geo_data_ratio['geometry'].area.quantile(0.10)

for idx, row in merged_geo_data_ratio.iterrows():
    # Only add text for states larger than the area threshold
    if row.geometry.area > area_threshold:
        centroid = row.geometry.centroid
        plt.text(centroid.x, centroid.y, row['STUSPS'], ha='center', va='center', fontsize=8)

ax.set_title('Ratio of inactive to active Banks by State')
ax.axis('off')
plt.show()


banks_filtered = banks.dropna(subset=['ROA', 'ROE', 'ASSET', 'STNAME', 'BKCLASS']).copy()

bank_class_dict = {
    'N': 'National Bank',
    'NM': 'State Bank, Non-Fed Member',
    'OI': 'Foreign Bank, Insured',
    'SB': 'Fed Savings Bank',
    'SI': 'State Savings Bank',
    'SL': 'State Savings & Loan',
    'SM': 'State Bank, Fed Member',
    'NC': 'Noninsured Commercial',
    'NS': 'Noninsured Savings Bank',
    'CU': 'Credit Union'
}

# Update BKCLASS with short descriptions
banks_filtered.loc[:, 'BKCLASS'] = banks_filtered['BKCLASS'].map(bank_class_dict)

# Calculate the mean ROA and ROE for each state and sort them for consistent order
mean_values = banks_filtered.groupby('STNAME')[['ROA', 'ROE']].mean().reset_index()
mean_values_sorted = mean_values.sort_values(by='ROA', ascending=False).head(40)

# Create a figure to hold the dashboard layout
dashboard_fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(20, 15))

# Average ROA by State Bar Plot
sns.barplot(y='STNAME', x='ROA', data=mean_values_sorted, ax=ax1)
ax1.set_title('Average ROA by State - Top 40')

# Average ROE by State Bar Plot
sns.barplot(y='STNAME', x='ROE', data=mean_values_sorted, ax=ax2)
ax2.set_title('Average ROE by State - Top 40')

# Create the barplot for Total Assets by State for the top 40 states
total_assets_by_state = banks_filtered.groupby('STNAME')['ASSET'].sum().reset_index()
top_states_by_assets = total_assets_by_state.sort_values(by='ASSET', ascending=False).head(40)
sns.barplot(data=top_states_by_assets, x='ASSET', y='STNAME', ax=ax3)
ax3.set_title('Total Assets by State - Top 40')
ax3.set_xscale('log')

# Bank Classification Types
sns.countplot(data=banks_filtered, y='BKCLASS', ax=ax4, order=banks_filtered['BKCLASS'].value_counts().index)
ax4.set_title('Bank Classification Types')

# Adjust layout spacing and show the dashboard
dashboard_fig.tight_layout(pad=3.0)
plt.show()


# Aggregate the number of banks by state and class
bank_class_counts = banks.groupby(['STNAME', 'BKCLASS']).size().reset_index(name='Counts')

# Pivot the data to have bank classes as columns and states as rows
pivot_bank_class_counts = bank_class_counts.pivot(index='STNAME', columns='BKCLASS', values='Counts').fillna(0)

# Merge the geographic data with the pivoted bank class counts
merged_geo_data_classes = usa.merge(pivot_bank_class_counts, left_on='NAME', right_index=True, how='left').fillna(0)

# Choose a bank class to visualize
bank_class_to_plot = 'NM'

# Build a plot
fig, ax = plt.subplots(1, 1, figsize=(16, 8))
merged_geo_data_classes.plot(column=bank_class_to_plot, cmap='Oranges', linewidth=0.8, ax=ax, edgecolor='0.8', legend=True)
area_threshold = merged_geo_data_classes['geometry'].area.quantile(0.10)

# Add state abbreviations without the area threshold condition for simplicity
for idx, row in merged_geo_data_classes.iterrows():
    # Only add text for states larger than the area threshold
    if row.geometry.area > area_threshold:
      centroid = row.geometry.centroid
      plt.text(centroid.x, centroid.y, row['STUSPS'], ha='center', va='center', fontsize=6)

ax.set_title('Concentration of commercial Banks (Fed Non-Members) by State')
ax.axis('off')
plt.show()

Apply Geopandas and Seaborn to FDIC Bank Data Visualizations¶

Let's exclude the following territories from the analysis:¶

Now we can display U.S. map of 49 states with boundaries.¶

Examine the number of banks ever registered in each state.¶

I'd like to plot a simple pie chart to see how many banks are still active in the country.¶

Compare number of active and inactive banks on the paired plots.¶

Not very representative. Seems like it's better to calculate a ratio between these categories.¶

Let's look at a couple of financial metrics in the form of a non-interactive dashboard¶

The most frequent type is a state bank which is not a Fed member. Time to plot all of them on the map.¶