Scenario
A broker statement is an excel file containing different section of data in tabular format. I’m interested to load the data to dataframe for analysis and execute a query into risk database for other interesting analysis tasks.
Technical breakdown
- xlwings – a python library to access excel sessions.
- sqlalchemy – a python library to access oracle.
xlwings
The steps of processing a statement, includes: open an excel, locate the table in interest, load the table to dataframe, process the dataframe, and close the excel.
#given an entire column, trying to get the cell that contains the text.
#e.g. cell=find(sheet.range("B:B"),"(reports of P/L)")
def find_cell(myrange, what):
retval = None
for cell in myrange:
if cell.value==what:
retval = cell
break
return retval
#locate the sheet for futures.
#1. QFII: the table is in sheet RQFII
#2. normal: if there's no QFII details, then it should be a domestic account, usually the first page.
def find_sheet(book):
retval = None
for sheet in book.sheets:
if "RQFII" in sheet.name and "Future" in sheet.name:
retval = sheet
break
if retval is None:
retval=book.sheets[0]
return retval
############################################################
#CLOSE OUT DETAILS
#load closeout details section from the broker statement has different format.
#each broker has it's own reader to extract the table into dataframe.
#steps to load
#a. load the workbook
#b. search the table heading to locate the row, with row_num
#c. select the row_num+1 and expand selection
#d. return dataframe from selected range.
def closedetails(indir,infile,broker):
stmtfile="{}{}".format(indir,infile)
with xw.App() as app:
#load workbook
print("loading... {}".format(stmtfile))
book = xw.books.open(stmtfile)
#identify the correct sheet, QFII
sheet=find_sheet(book)
#find the row.
name_range=m_tbl_column[broker]
name_table=m_tbl_name[broker]
table_cell = find_cell(sheet.range(name_range), name_table)
row_num = table_cell.row
print(f"file={stmtfile}, broker={broker}, range={name_range}, title={name_table}, row={row_num}")
#loadthe range into dataframe
rng_closeout=sheet['A'+str(row_num+1)].expand()
df_closeout=rng_closeout.options(pd.DataFrame).value
print("closeout total rows: {}".format(len(df_closeout)))
#check the open trade date; and we only want to re-upload trades before 20Nov.
name_opendate=m_tbl_opendate[broker]
df_closeout['OpenDate']=pd.to_datetime(df_closeout[name_opendate],format='%Y%m%d')
#now export the file to excel
df_closeout_pre=df_closeout[df_closeout.OpenDate<'2023-11-20'] #under old structure
df_closeout_pos=df_closeout[df_closeout.OpenDate>='2023-11-20']#under package structure
print("closeout involves trade before 20Nov: count={},opencount={}".format(len(df_closeout_pre),len(set(df_closeout_pre[m_tbl_opennumber[broker]]))))
print("closeout invovles trade after 20Nov: count={},opencount={}".format(len(df_closeout_pos),len(set(df_closeout_pos[m_tbl_opennumber[broker]]))))
if len(df_closeout_pre)>0:
print(f"indir:{indir}")
print(f"infile:{infile}")
df_closeout_pre.to_excel('{}_RELOAD_{}'.format(indir,infile))
return df_closeout_pre
sqlalchemy
sql_exec=sql_open_template.format(broker,stdate,account,account,",".join(["'ECN{}'".format(x) for x in list(set(opens))]))
export_fn_sql=f'{broker}_{stdate}_{entity}.sql'
print("generating sql file {} for analysis...".format(export_fn_sql))
with open(export_fn_sql,'w') as outfile:
outfile.write(sql_exec)
print("executing oracle query...")
#read oracle data into dataframe
# with cx_Oracle.connect(username,username,dsn=conn_string) as conn:
# df_open_trades=pd.read_sql(sql=sql_opens,con=conn)
engine=sqlalchemy.create_engine(f"oracle+cx_oracle://{username}:{username}@{hostname}:{port}/?service_name={servicename}",arraysize=1000)
df_open_trades=pd.read_sql(sql_exec,engine)
print("from statement, count={}, loaded={}\n".format(len(opens),len(df_open_trades)))
if exportfile:
fn_export_tradecount=f'tradecount_{context_dt}.xlsx'
print("exporting file: {} ...".format(fn_export_tradecount))
df_open_trades.to_excel(fn_export_tradecount,index=False)
print("oracle returns count={}".format(len(df_open_trades)))