Python Parses Excel Table To DataFrame

Scenario

A broker statement is an excel file containing different section of data in tabular format. I’m interested to load the data to dataframe for analysis and execute a query into risk database for other interesting analysis tasks.

Technical breakdown

  • xlwings – a python library to access excel sessions.
  • sqlalchemy – a python library to access oracle.

xlwings

The steps of processing a statement, includes: open an excel, locate the table in interest, load the table to dataframe, process the dataframe, and close the excel.

#given an entire column, trying to get the cell that contains the text.
#e.g. cell=find(sheet.range("B:B"),"(reports of P/L)")
def find_cell(myrange, what):
    retval = None
    for cell in myrange:
        if cell.value==what:
            retval = cell
            break
    return retval

#locate the sheet for futures.
#1. QFII: the table is in sheet RQFII
#2. normal: if there's no QFII details, then it should be a domestic account, usually the first page.
def find_sheet(book):
    retval = None
    for sheet in book.sheets:
        if "RQFII" in sheet.name and "Future" in sheet.name:
            retval = sheet
            break
    if retval is None:
        retval=book.sheets[0]
    return retval

############################################################
#CLOSE OUT DETAILS
#load closeout details section from the broker statement has different format.
#each broker has it's own reader to extract the table into dataframe.
#steps to load
#a. load the workbook 
#b. search the table heading to locate the row, with row_num
#c. select the row_num+1 and expand selection
#d. return dataframe from selected range. 
def closedetails(indir,infile,broker):
    stmtfile="{}{}".format(indir,infile)
    with xw.App() as app:
        #load workbook 
        print("loading... {}".format(stmtfile))
        book = xw.books.open(stmtfile)
        #identify the correct sheet, QFII 
        sheet=find_sheet(book)
        #find the row.
        name_range=m_tbl_column[broker]
        name_table=m_tbl_name[broker]
        table_cell = find_cell(sheet.range(name_range), name_table)
        row_num = table_cell.row
        print(f"file={stmtfile}, broker={broker}, range={name_range}, title={name_table}, row={row_num}")
        #loadthe range into dataframe
        rng_closeout=sheet['A'+str(row_num+1)].expand()
        df_closeout=rng_closeout.options(pd.DataFrame).value
        print("closeout total rows: {}".format(len(df_closeout)))
        #check the open trade date; and we only want to re-upload trades before 20Nov.
        name_opendate=m_tbl_opendate[broker]
        df_closeout['OpenDate']=pd.to_datetime(df_closeout[name_opendate],format='%Y%m%d')
        #now export the file to excel
        df_closeout_pre=df_closeout[df_closeout.OpenDate<'2023-11-20'] #under old structure
        df_closeout_pos=df_closeout[df_closeout.OpenDate>='2023-11-20']#under package structure
        print("closeout involves trade before 20Nov: count={},opencount={}".format(len(df_closeout_pre),len(set(df_closeout_pre[m_tbl_opennumber[broker]]))))
        print("closeout invovles trade after  20Nov: count={},opencount={}".format(len(df_closeout_pos),len(set(df_closeout_pos[m_tbl_opennumber[broker]]))))
        if len(df_closeout_pre)>0:
            print(f"indir:{indir}")
            print(f"infile:{infile}")
            df_closeout_pre.to_excel('{}_RELOAD_{}'.format(indir,infile))
            return df_closeout_pre

sqlalchemy

sql_exec=sql_open_template.format(broker,stdate,account,account,",".join(["'ECN{}'".format(x) for x in list(set(opens))]))

export_fn_sql=f'{broker}_{stdate}_{entity}.sql'
print("generating sql file {} for analysis...".format(export_fn_sql))
with open(export_fn_sql,'w') as outfile:
    outfile.write(sql_exec)


print("executing oracle query...")
#read oracle data into dataframe 
# with cx_Oracle.connect(username,username,dsn=conn_string) as conn:
# df_open_trades=pd.read_sql(sql=sql_opens,con=conn)
engine=sqlalchemy.create_engine(f"oracle+cx_oracle://{username}:{username}@{hostname}:{port}/?service_name={servicename}",arraysize=1000)
df_open_trades=pd.read_sql(sql_exec,engine)
print("from statement, count={}, loaded={}\n".format(len(opens),len(df_open_trades)))

if exportfile:
    fn_export_tradecount=f'tradecount_{context_dt}.xlsx'
    print("exporting file: {} ...".format(fn_export_tradecount))
    df_open_trades.to_excel(fn_export_tradecount,index=False)

print("oracle returns count={}".format(len(df_open_trades)))