import pandas as pd
import matplotlib.pyplot as plt
import sqlite3

encounters = pd.read_csv("encounters.csv")
payers = pd.read_csv("payers.csv")
patients = pd.read_csv("patients.csv")
procedures = pd.read_csv("procedures.csv")

conn = sqlite3.connect("hospital_db")
c = conn.cursor()
patients.to_sql("patients", conn, if_exists="replace", index=False)
encounters.to_sql("encounters", conn, if_exists="replace", index=False)
payers.to_sql("payers", conn, if_exists="replace", index=False)
procedures.to_sql("procedures", conn, if_exists="replace", index=False)

47701

patients.shape

(974, 20)

patients.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 974 entries, 0 to 973
Data columns (total 20 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Id          974 non-null    object 
 1   BIRTHDATE   974 non-null    object 
 2   DEATHDATE   154 non-null    object 
 3   PREFIX      974 non-null    object 
 4   FIRST       974 non-null    object 
 5   LAST        974 non-null    object 
 6   SUFFIX      21 non-null     object 
 7   MAIDEN      386 non-null    object 
 8   MARITAL     973 non-null    object 
 9   RACE        974 non-null    object 
 10  ETHNICITY   974 non-null    object 
 11  GENDER      974 non-null    object 
 12  BIRTHPLACE  974 non-null    object 
 13  ADDRESS     974 non-null    object 
 14  CITY        974 non-null    object 
 15  STATE       974 non-null    object 
 16  COUNTY      974 non-null    object 
 17  ZIP         832 non-null    float64
 18  LAT         974 non-null    float64
 19  LON         974 non-null    float64
dtypes: float64(3), object(17)
memory usage: 152.3+ KB

patients["GENDER"].value_counts()

GENDER
M    494
F    480
Name: count, dtype: int64

patients["ETHNICITY"].value_counts()

ETHNICITY
nonhispanic    783
hispanic       191
Name: count, dtype: int64

patients["RACE"].value_counts()

RACE
white       680
black       163
asian        91
other        16
hawaiian     13
native       11
Name: count, dtype: int64

cols = patients[["GENDER", "RACE", "ETHNICITY"]]
for c in cols.columns:
    plt.title("Plot of "+ c, fontsize=15)
    plt.hist(cols[c],bins = 5)
    plt.show()

encounters.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27891 entries, 0 to 27890
Data columns (total 14 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Id                   27891 non-null  object 
 1   START                27891 non-null  object 
 2   STOP                 27891 non-null  object 
 3   PATIENT              27891 non-null  object 
 4   ORGANIZATION         27891 non-null  object 
 5   PAYER                27891 non-null  object 
 6   ENCOUNTERCLASS       27891 non-null  object 
 7   CODE                 27891 non-null  int64  
 8   DESCRIPTION          27891 non-null  object 
 9   BASE_ENCOUNTER_COST  27891 non-null  float64
 10  TOTAL_CLAIM_COST     27891 non-null  float64
 11  PAYER_COVERAGE       27891 non-null  float64
 12  REASONCODE           8350 non-null   float64
 13  REASONDESCRIPTION    8350 non-null   object 
dtypes: float64(4), int64(1), object(9)
memory usage: 3.0+ MB

encounters["ENCOUNTERCLASS"].hist()

<Axes: >

encounters["START"] = pd.to_datetime(encounters["START"])
# Group encounters by month start and plot 
encounters.set_index("START").resample("MS").size().plot(title="Monthly Encounter Volume")
plt.show()

monthly = (
    encounters
    .set_index("START")[["ENCOUNTERCLASS"]]
    .groupby("ENCOUNTERCLASS")
    .resample("MS")
    .size()
    .unstack(0)
)

monthly.plot(figsize=(12,6), title="Monthly Encounter Volume by Type")
plt.show()

C:\Users\wayne\AppData\Local\Temp\ipykernel_39596\3896481572.py:6: FutureWarning: DataFrameGroupBy.resample operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.
  .size()

encounters["PAYER_COVERAGE"].mean()

np.float64(1114.9656516438995)

encounters["TOTAL_CLAIM_COST"].mean()

np.float64(3639.6821741780504)

plt.figure(figsize=(7,6))
plt.title("Total Claim Cost by Encounter Type")
plt.bar(encounters["ENCOUNTERCLASS"],encounters["TOTAL_CLAIM_COST"])
plt.xlabel("Encounter Class Type", fontsize=13)
plt.ylabel("Claim Costs ($)", fontsize=13)
plt.show()

sql = """
SELECT DESCRIPTION, COUNT(*) as total_procedures, ROUND(AVG(BASE_COST),2) as Avg_cost
FROM procedures
GROUP BY DESCRIPTION
ORDER BY total_procedures DESC
LIMIT 10
"""
res = c.execute(sql)
for r in res:
    print(r)

('Assessment of health and social care needs (procedure)', 4596, 431.0)
('Hospice care (regime/therapy)', 4098, 431.0)
('Depression screening using Patient Health Questionnaire Two-Item score (procedure)', 3614, 431.0)
('Depression screening (procedure)', 3614, 431.0)
('Assessment of substance use (procedure)', 2906, 431.0)
('Renal dialysis (procedure)', 2746, 1004.09)
('Assessment using Morse Fall Scale (procedure)', 2422, 431.0)
('Assessment of anxiety (procedure)', 2288, 431.0)
('Medication Reconciliation (procedure)', 2284, 509.12)
('Screening for drug abuse (procedure)', 1484, 431.0)

sql = """
SELECT DESCRIPTION, COUNT(*) as total_procedures, ROUND(AVG(BASE_COST),2) as Avg_cost
FROM procedures
GROUP BY DESCRIPTION
ORDER BY Avg_cost DESC
LIMIT 10
"""
res = c.execute(sql)
for r in res:
    print(r)

('Admit to ICU (procedure)', 5, 206260.4)
('Coronary artery bypass grafting', 9, 47085.89)
('Lumpectomy of breast (procedure)', 5, 29353.0)
('Hemodialysis (procedure)', 27, 29299.56)
('Insertion of biventricular implantable cardioverter defibrillator', 4, 27201.0)
('Electrical cardioversion', 1383, 25903.11)
('Partial resection of colon', 7, 25229.29)
('Fine needle aspiration biopsy of lung (procedure)', 1, 23141.0)
('Percutaneous mechanical thrombectomy of portal vein using fluoroscopic guidance', 57, 20228.04)
('Percutaneous coronary intervention', 9, 19728.0)

sql = """
SELECT p.NAME, ROUND(AVG(e.TOTAL_CLAIM_COST),2) as avg_claim_cost
FROM encounters as e
JOIN payers as p 
on e.PAYER = p.Id
GROUP BY PAYER
ORDER BY avg_claim_cost DESC
"""
res = c.execute(sql)
for r in res:
    print(r)

('Medicaid', 6205.22)
('NO_INSURANCE', 5593.2)
('Anthem', 4236.81)
('Humana', 3269.3)
('Blue Cross Blue Shield', 3245.58)
('Cigna Health', 2996.95)
('UnitedHealthcare', 2848.34)
('Aetna', 2767.05)
('Medicare', 2167.55)
('Dual Eligible', 1696.19)

Executive Summary¶

Prepare Dataset¶

Patient Demographics¶

Demographics Graphs¶

Utilization¶

Cost / Payer Analysis¶

The top 10 most frequent procedures and their avergae cost¶

The most exepnsive prodecures and the number of occurences¶

The average total claim cost for encounters by payer¶