Commit

Add files via upload
DTMhango authored Feb 7, 2024
1 parent 62a852c commit 7908a0a
Showing 12 changed files with 366 additions and 0 deletions.
Binary file added Daniel_3.jpg
Binary file added Gralix Circle.ico
Binary file not shown.
120 changes: 120 additions & 0 deletions Home.py
@@ -0,0 +1,120 @@
import pandas as pd
from data_validation import clean_dataframe, clean_os_dataframe
import streamlit as st
import chainladder as cl
from datetime import datetime
from streamlit_extras.app_logo import add_logo

st.set_page_config(layout='wide', page_title='Gralix Actuarial Reserving Interface', page_icon='Gralix Circle.ico')

# Re-assign each session value to its own key so widget state survives page
# switches in this multipage app; the 'upload' key is skipped because a
# file_uploader's state cannot be re-assigned programmatically.
for k, v in st.session_state.items():
    if k != 'upload':
        st.session_state[k] = v


def logo():
    add_logo("rsz_gralix2.png", height=150)


logo()


@st.cache_data
def convert_df(data_frame):
    # Cache the conversion to prevent computation on every rerun
    return data_frame.to_csv().encode('utf-8')


@st.cache_data
def read_data(data):
    return pd.read_csv(data)


@st.cache_data
def find_index(file_list, match_string):
    # Return the position of the first uploaded file whose name contains
    # match_string; falls through to None when nothing matches.
    for i, file in enumerate(file_list):
        file_name = file.name
        if match_string in file_name:
            return i


# Data to be uploaded - Add to session state
files = ['claims', 'case', 'premium']

for file in files:
    if file not in st.session_state:
        st.session_state[file] = None

min_date = datetime(2000, 1, 1)
max_date = datetime(2030, 12, 31)


uploaded_files = st.sidebar.file_uploader(label='Upload Files',
                                          help='Upload the separate files containing Paid/Incurred Claims Data, '
                                               'Case Reserves Data and Premium Data. Files MUST be ".csv"',
                                          accept_multiple_files=True, key='upload')

if len(st.session_state['upload']) > 0:

    try:
        claims = st.session_state['upload'][find_index(st.session_state['upload'], 'Claims Data')]
        claims_df = read_data(claims)
        clean_df, error_df = clean_dataframe(claims_df)
        st.session_state['claims'] = clean_df
        st.session_state['errors'] = error_df
    except (TypeError, ValueError):
        # find_index returns None when no file name matches, which raises
        # TypeError on indexing; skip quietly until the file is uploaded.
        pass

    try:
        case = st.session_state['upload'][find_index(st.session_state['upload'], 'Outstanding')]
        case_res_df = read_data(case)
        case_df, case_error = clean_os_dataframe(case_res_df)
        for col in case_df.columns:
            if 'Unnamed' in col:
                case_df = case_df.drop(col, axis=1)
        st.session_state['case'] = case_df

    except (TypeError, ValueError):
        pass

    try:
        premium = st.session_state['upload'][find_index(st.session_state['upload'], 'Premium')]
        premium_df = read_data(premium)
        premium_df = premium_df.dropna()
        premium_df = premium_df.set_index(keys='Year', drop=True)

        st.session_state['premium'] = premium_df

    except (TypeError, ValueError):
        pass

claims_data = st.session_state.get('claims')
errors_df = st.session_state.get('errors')
case_data = st.session_state.get('case')
premium_data = st.session_state.get('premium')

if claims_data is not None:
    st.markdown("**USABLE DATA: Clean File**")
    st.dataframe(claims_data)
    clean_csv = convert_df(claims_data)
    download = st.sidebar.download_button(label="Download Clean File",
                                          data=clean_csv,
                                          file_name='Clean.csv',
                                          mime='text/csv')
if errors_df is not None:
    st.markdown("**UNUSABLE DATA: Error File**")
    st.dataframe(errors_df)
    errors_csv = convert_df(errors_df)
    download = st.sidebar.download_button(label="Download Errors File",
                                          data=errors_csv,
                                          file_name='Errors.csv',
                                          mime='text/csv')

if case_data is not None:
    st.markdown("**CASE RESERVES DATA**")
    st.dataframe(case_data)

if premium_data is not None:
    st.markdown("**PREMIUM DATA**")
    st.dataframe(premium_data)
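
Note: Home.py routes each upload by substring match on the uploaded file's name ('Claims Data', 'Outstanding', 'Premium'). A minimal, self-contained sketch of that convention; the SimpleNamespace stand-ins and file names below are illustrative assumptions in place of Streamlit's UploadedFile objects:

from types import SimpleNamespace

# Only the .name attribute matters for routing; these names are made up.
uploads = [SimpleNamespace(name='Motor Claims Data 2023.csv'),
           SimpleNamespace(name='Motor Outstanding 2023.csv'),
           SimpleNamespace(name='Motor Premium 2023.csv')]

def find_index(file_list, match_string):
    for i, file in enumerate(file_list):
        if match_string in file.name:
            return i  # falls through to None when nothing matches

print(find_index(uploads, 'Premium'))  # -> 2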

Binary file added IMG_1.png
Binary file added IMG_8311.jpg
84 changes: 84 additions & 0 deletions cl_module.py
@@ -0,0 +1,84 @@
import pandas as pd
import chainladder as cl
import warnings

warnings.filterwarnings(action='ignore', category=UserWarning)

def create_triangles(data, start_date, end_date):
    dff = data[(data['LOSS DATE'] >= start_date) & (data['PAID DATE'] <= end_date)]
    triangles = cl.Triangle(
        data=dff,
        origin='LOSS DATE',
        development='PAID DATE',
        columns=['GROSS AMOUNT', 'NET AMOUNT'],
        index=['MAIN CLASS', 'ADDITIONAL SEGMENTATION'],
        cumulative=False
    )
    return triangles


def create_os_triangles(data, start_date, end_date):
    dff = data[(data['LOSS DATE'] >= start_date) & (data['OUTSTANDING DATE'] <= end_date)]
    triangles = cl.Triangle(
        data=dff,
        origin='LOSS DATE',
        development='OUTSTANDING DATE',
        columns=['GROSS AMOUNT', 'NET AMOUNT'],
        index=['MAIN CLASS', 'ADDITIONAL SEGMENTATION'],
        cumulative=False
    )
    return triangles


def tri_size(triangle):
    # Number of origin periods (third axis of a chainladder Triangle)
    return triangle.shape[2]


def pure_ibnr(ibnr_triangle, os_triangle):
    total = []
    dates = []
    zeroized = []
    if tri_size(ibnr_triangle) == tri_size(os_triangle):
        for i in range(tri_size(ibnr_triangle)):
            # Label each origin period according to the triangle's grain
            if ibnr_triangle.origin_grain == 'Y':
                dates.append(ibnr_triangle.origin[i].strftime('%F'))
            elif ibnr_triangle.origin_grain == 'Q':
                dates.append(ibnr_triangle.origin[i].strftime('%F-Q%q'))
            elif ibnr_triangle.origin_grain == 'M':
                dates.append(ibnr_triangle.origin[i].strftime('%F-%m'))
            else:
                dates.append(ibnr_triangle.origin[i].strftime('%F-Q%q'))

            # NaN != NaN, so this detects a missing outstanding amount:
            # with no case reserve, the pure IBNR is the full IBNR.
            if os_triangle.iat[0, 0, i, 0] != os_triangle.iat[0, 0, i, 0]:
                emerging = ibnr_triangle.iat[0, 0, i, 0]
            else:
                emerging = ibnr_triangle.iat[0, 0, i, 0] - os_triangle.iat[0, 0, i, 0]

            total.append(emerging)
            # The zeroized view floors negative (and NaN) pure IBNR at zero
            if emerging >= 0:
                zeroized.append(emerging)
            elif emerging < 0:
                zeroized.append(0)
            elif emerging != emerging:
                zeroized.append(0)

    pure_ibnr_dict = {'Pure IBNR': total}
    pure_ibnr_df = pd.DataFrame(pure_ibnr_dict, index=dates)

    total_row_pure = pd.DataFrame(
        {'Pure IBNR': [pure_ibnr_df['Pure IBNR'].sum()]}, index=['TOTAL'])
    pure_ibnr_df = pd.concat([pure_ibnr_df, total_row_pure])

    zeroized_dict = {'Zeroized Pure IBNR': zeroized}
    zeroized_df = pd.DataFrame(zeroized_dict, index=dates)

    total_row_zeroized = pd.DataFrame(
        {'Zeroized Pure IBNR': [zeroized_df['Zeroized Pure IBNR'].sum()]}, index=['TOTAL'])
    zeroized_df = pd.concat([zeroized_df, total_row_zeroized])

    return pure_ibnr_df, zeroized_df
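
Note: pure_ibnr consumes an IBNR triangle and an outstanding (case reserves) triangle with matching origin counts, nets the case reserves off the IBNR per origin period, and also returns a zeroized view floored at zero. A sketch of how the IBNR input could be produced with the chainladder package, using sample data rather than the app's actual wiring:

import chainladder as cl

paid = cl.load_sample('raa')        # cumulative sample loss triangle
model = cl.Chainladder().fit(paid)  # basic chain-ladder development model
ibnr = model.ibnr_                  # IBNR = ultimate less latest diagonal
print(ibnr.sum())
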
155 changes: 155 additions & 0 deletions data_validation.py
@@ -0,0 +1,155 @@
import re
import io
import pandas as pd
import base64
from datetime import datetime, timedelta
# import streamlit as st


def parse_content(contents, filename):
    # Expects a base64 data-URL string ("<content type>,<base64 payload>")
    content_type, content_string = contents.split(',')
    decoded = base64.b64decode(content_string)
    if '.csv' in filename:
        df = pd.read_csv(io.StringIO(decoded.decode('utf-8')))
        return df.to_dict('records')
    elif '.xls' in filename:
        df = pd.read_excel(io.BytesIO(decoded))
        return df.to_dict('records')


def decode_image(image_file):
    encoded = base64.b64encode(open(image_file, 'rb').read())
    return f"data:image/png;base64,{encoded.decode()}"



# def clean_dates(df: pd.DataFrame, old_column: str, new_column: str) -> pd.DataFrame:
# """Cleans a date column with mixed types and unifies format. Creates a new column for the dates"""
# df[new_column] = pd.to_datetime(df[old_column], errors="coerce", dayfirst=True, format="%d/%m/%Y") # try date coercion
# # Coerce date if given in day count format
# mask = pd.to_numeric(df[old_column], errors="coerce").notna()
# df.loc[mask, new_column] = pd.to_datetime(df[old_column][mask].astype(float), errors="coerce", unit="D", origin="1899-12-30")
# return df

def clean_dates(date_str):
    """Normalise a date string of unknown format to '%Y-%m-%d'."""
    original_value = date_str

    day_first_formats = [
        # Day-first
        '%d-%m-%Y',
        '%d/%m/%Y',
        '%d-%m-%Y %H:%M:%S',
        '%d/%m/%Y %H:%M:%S',
        '%d-%m-%y',
        '%d-%m-%y %H:%M:%S',
        '%d/%m/%y',
        '%d/%m/%y %H:%M:%S'
    ]

    month_first_formats = [
        '%m-%d-%Y',
        '%m/%d/%Y',
        '%m-%d-%y',
        '%m-%d-%Y %H:%M:%S',
        '%m/%d/%Y %H:%M:%S',
        '%m-%d-%y %H:%M:%S',
    ]

    other_formats = [
        '%Y-%m-%d',
        '%Y-%m-%d %H:%M:%S',
        '%d-%b-%y',
        '%d-%b-%Y',
        '%m/%d/%y',
        '%m/%d/%Y',
        '%B %d %Y',
        '%B %d %Y %H:%M:%S'
    ]

    try:
        # Try to parse an Excel-style serial date first
        excel_date = float(date_str)
        date = datetime(1899, 12, 30) + timedelta(days=excel_date)
        return date.strftime('%Y-%m-%d')
    except ValueError:
        pass

    for date_format in day_first_formats:
        try:
            return datetime.strptime(date_str, date_format).strftime('%Y-%m-%d')
        except ValueError:
            pass

    for date_format in month_first_formats:
        try:
            return datetime.strptime(date_str, date_format).strftime('%Y-%m-%d')
        except ValueError:
            pass

    for date_format in other_formats:
        try:
            return datetime.strptime(date_str, date_format).strftime('%Y-%m-%d')
        except ValueError:
            pass

    # Nothing matched: hand back the raw value so it is flagged downstream
    return original_value


def clean_amounts(amount_str):
    if amount_str == 'nan' or amount_str.strip() == '':
        return 0.0
    try:
        if re.match(r'^\(.+\d\)$', amount_str):  # amounts in parentheses are negative
            return float('-' + re.sub(r'[^\d./-]', '', amount_str))
        else:
            amount = re.sub(r'[^\d./-]', '', amount_str)  # keep digits, periods and hyphens
            try:
                return float(amount)  # convert amount to float
            except ValueError:
                return amount_str
    except ValueError:
        return amount_str


# @st.cache_data
def clean_dataframe(my_df):
    for col in my_df.columns:
        if 'Unnamed' in col:
            my_df = my_df.drop(col, axis=1)
    for col in my_df.columns:
        my_df[col] = my_df[col].astype(str)
        if 'DATE' in col:
            my_df[col] = my_df[col].apply(clean_dates)
        if 'AMOUNT' in col:
            my_df[col] = my_df[col].apply(clean_amounts)
    date_cols = my_df.filter(like='DATE')
    amount_cols = my_df.filter(like='AMOUNT')
    # Flag rows with impossible chronology, unparseable dates or unparseable amounts
    inconsistent_date = my_df[(my_df['LOSS DATE'] > my_df['PAID DATE']) | (my_df['LOSS DATE'] > my_df['REPORTED DATE'])]
    bad_dates = my_df[date_cols.apply(pd.to_datetime, errors='coerce', dayfirst=False).isna().any(axis=1)]
    bad_amounts = my_df[amount_cols.apply(pd.to_numeric, errors='coerce').isna().any(axis=1)]
    error_df = pd.concat([bad_dates, bad_amounts, inconsistent_date])
    error_df['REFERENCE'] = error_df.index + 2  # source-file row number (header + 1-based index)
    error_df = error_df.drop_duplicates().sort_index()
    clean_df = my_df.drop(error_df.index, axis=0).reset_index(drop=True)
    return clean_df, error_df

# @st.cache_data
def clean_os_dataframe(my_df):
    for col in my_df.columns:
        if 'Unnamed' in col:
            my_df = my_df.drop(col, axis=1)
    for col in my_df.columns:
        my_df[col] = my_df[col].astype(str)
        if 'DATE' in col:
            my_df[col] = my_df[col].apply(clean_dates)
        if 'AMOUNT' in col:
            my_df[col] = my_df[col].apply(clean_amounts)
    date_cols = my_df.filter(like='DATE')
    amount_cols = my_df.filter(like='AMOUNT')
    inconsistent_date = my_df[(my_df['LOSS DATE'] > my_df['OUTSTANDING DATE']) | (my_df['LOSS DATE'] > my_df['REPORTED DATE'])]
    bad_dates = my_df[date_cols.apply(pd.to_datetime, errors='coerce', dayfirst=False).isna().any(axis=1)]
    bad_amounts = my_df[amount_cols.apply(pd.to_numeric, errors='coerce').isna().any(axis=1)]
    error_df = pd.concat([bad_dates, bad_amounts, inconsistent_date])
    error_df['REFERENCE'] = error_df.index + 2
    error_df = error_df.drop_duplicates().sort_index()
    clean_df = my_df.drop(error_df.index, axis=0).reset_index(drop=True)
    return clean_df, error_df
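
Note: a minimal usage sketch for clean_dataframe; the toy rows below are made up for illustration. Amounts like '1,250.50' and '(300)' normalise to 1250.5 and -300.0, dates like '01/02/2023' to '2023-02-01', while the second row's unparseable date lands it in the error file with REFERENCE pointing at its source-file row:

import pandas as pd
from data_validation import clean_dataframe

raw = pd.DataFrame({
    'LOSS DATE':     ['01/02/2023', 'not a date'],
    'REPORTED DATE': ['03/02/2023', '04/02/2023'],
    'PAID DATE':     ['10/02/2023', '11/02/2023'],
    'GROSS AMOUNT':  ['1,250.50',   '(300)'],
})

clean_df, error_df = clean_dataframe(raw)
print(clean_df)   # one surviving row, dates in %Y-%m-%d, amounts numeric
print(error_df)   # the bad row, with REFERENCE = 3 (header row + 1-based index)
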
Binary file added requirements.txt
Binary file not shown.
Binary file added rsz_1gralix3.png
Binary file added rsz_gralix2.ico
Binary file not shown.
Binary file added rsz_gralix2.png
7 changes: 7 additions & 0 deletions run_app.py
@@ -0,0 +1,7 @@
import sys
from streamlit.web import cli as stcli


if __name__ == '__main__':
    sys.argv = ["streamlit", "run", "C:/Users/mhang/Documents/IBNR APP/Home.py"]
    sys.exit(stcli.main())
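
Note: run_app.py simply hands a hard-coded script path to the Streamlit CLI; from the directory containing Home.py, the equivalent shell command is:

streamlit run Home.py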
