forked from robcarver17/python-uk-trading-tax-calculator
-
Notifications
You must be signed in to change notification settings - Fork 6
/
shredIBfiles.py
456 lines (321 loc) · 13.4 KB
/
shredIBfiles.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
"""
Python UK trading tax calculator
Copyright (C) 2015 Robert Carver
You may copy, modify and redistribute this file as allowed in the license agreement
but you must retain this header
See README.txt
"""
"""
Shred the reports produced by IB
This section is for global variables that define how the data is stored in the .html files
HEADER_NAMES indicates the asset classes which are specified in the table.
CURRENCIES contains all the currencies that you trade.
TRADES_LOC and POSITIONS_LOC indicate where in the trade and activity report
respectively the tables of trades and current positions are held. They
shouldn't change too frequently.
"""
## Start of formatting globals
ASSETS=['Stocks', 'Futures', 'Forex', 'CFD', 'Equity and Index Options', 'Warrants']
CURRENCIES=['GBP', 'JPY' ,'EUR', 'KRW', 'AUD', 'CHF', 'USD', 'HKD', 'SEK', 'CAD']
## End of formatting globals
## Imports
import datetime
import sys
import pandas as pd
from bs4 import BeautifulSoup
from trades import Trade
from tradelist import TradeList
from positions import Position, PositionList
def _row_class(row):
"""
Returns the class of a row or "" if no class. Used to find summary rows
"""
if len(row.attrs)>0:
dattrs=dict(row.attrs)
if 'class' in dattrs:
return dattrs['class']
return ""
def _parse_html_table(rows):
"""
Get data from rows
"""
results = []
headerrow=None
for row in rows:
table_headers = row.findAll('th')
if table_headers:
"""
We've got headers. Note the first set of headers we find will be used to name our data frame
The rest ignored
"""
if headerrow is None:
## Its the first set of headers
##We add a column for the row class
headerlist=[str(headers.getText()) for headers in table_headers]+['Class']
##The terms notional value and proceeds are used depending on the asset class;
##consistently use notional value
headerlist=["Notional Value" if x=="Proceeds" else x for x in headerlist]
headerlist=["Tax" if x=="Fee" else x for x in headerlist]
headerlist=["Trade Date" if x=="Trade Date/Time" else x for x in headerlist]
headerrow=headerlist
table_data = row.findAll('td')
if table_data:
"""
Add some normal data, including the class of the row
"""
rowclass=_row_class(row)
results.append([str(data.getText()) for data in table_data]+[rowclass])
return (headerrow, results)
def _html_row(row, clength):
"""
row is a list of length, probably with unicode strings in it
We return a list of normal strings, padded out with Nones to clength
"""
assert len(row)<=clength
strings_row=[str(x) for x in row]
strings_row += [''] * (clength - len(strings_row))
return strings_row
def _html_table_to_pddataframe(headerrow, table_data):
"""
Returns a pandas data frame from a list table_data, where headerrow are column names
"""
## Column names ... getting rid of any unicode
colnames=[str(x) for x in headerrow]
clength=len(colnames)
## Pad out the rows so the same length and all strings
new_data=[_html_row(row, clength) for row in table_data]
## Create a dict
results=dict([(colnames[idx], [row[idx] for row in new_data]) for idx in range(clength)])
main_table=pd.DataFrame(results)
return main_table
def _check_ignore_row(row, colref="Acct ID"):
"""
Returns TRUE if rows don't contain real data or positions
"""
if "Total" in row[colref]:
"""
It's a sub total row
"""
return True
if not (row['Class']=='summaryRow' or row['Class']=='row-summary' or 'row-summary' in row['Class']):
"""
It's Granualar detail, can ignore
"""
return True
return False
def _select_and_clean_pd_dataframe(main_table, selection_idx, colref="Acct ID"):
"""
Remove 'dirty' rows from a dataframe, i.e. not real data
"""
if len(selection_idx)==0:
return None
pd_df=main_table.iloc[selection_idx,:]
dirty_rows=[rowidx for rowidx in range(len(pd_df.index)) if
_check_ignore_row(pd_df.iloc[rowidx], colref)]
pd_df=pd_df.drop(pd_df.index[dirty_rows])
if len(pd_df.index)==0:
return None
return pd_df
def _check_index_row(row, colref="Acct ID"):
"""
Index rows are empty except for the contents of colref. Returns bool
"""
restorfow=row.drop(colref)
ans=list(restorfow.values)
otherrowsempty=all([len(x)==0 for x in ans])
return otherrowsempty
def _check_blank_row(row):
return all([len(x)==0 for x in row])
def _get_index_row(row, colref):
return row[colref]
def _get_all_longnames_assets(table, colref="Acct ID"):
"""
Returns the list of asset classes in this file as tuple (shortname, longname)
"""
hrows=table[colref]
headers=[]
for x in hrows:
for shortname in ASSETS:
if shortname in x and "Total" not in x:
headers.append((shortname, x))
return list(set(headers))
def _get_all_currencies(table, colref="Acct ID"):
"""
Returns the list of currencies in this file
"""
hrows=table[colref]
ccys=[x for x in hrows if x in CURRENCIES]
return list(set(ccys))
def _parse_pandas_df(main_table, colref="Acct ID"):
"""
Turns a pandas df into a recursive version
Returns a dict (key names are asset classes)
Elements in dict are also dicts (key names are currencies)
Elements in those dicts are pd data frames
Also deletes superflous rows
"""
assetspacked=_get_all_longnames_assets(main_table, colref)
assetlongnames=[x[1] for x in assetspacked]
assetshortnames=[x[0] for x in assetspacked]
currencies=_get_all_currencies(main_table, colref)
## Create an empty recursive structure
## Each entry contains a list of row indices
results=dict([(hname, dict([(ccy, []) for ccy in currencies])) for hname in assetshortnames])
## Loop through populating the recursive structure, adding row names to it
rowidx=0
current_header=None
current_currency=None
total_len=len(main_table.index)
while rowidx<total_len:
row=main_table.iloc[rowidx]
if _check_blank_row(row):
rowidx=rowidx+1
continue
if _check_index_row(row, colref):
## It's an index row, i.e. it contains either an asset class or a currency
## Return the name of the index (asset class or currency
indexentry=_get_index_row(row, colref)
if indexentry in assetlongnames:
## It's an asset class. Since these are at a higher level than FX we reset the currency
current_header=[shortname for shortname, longname in assetspacked
if longname==indexentry][0]
current_currency=None
elif indexentry in currencies:
## It's a currency.
current_currency=indexentry
else:
print("This is weird %s" % indexentry)
raise Exception("Unrecognised header")
else:
## not an index, populating the table
if current_header is None or current_currency is None:
## This will happen if we have extraenous rows before the headers
raise Exception("Found data before eithier asset class or currency was set")
else:
## Add the row index to the right part of the dict
results[current_header][current_currency].append(rowidx)
## next row
rowidx=rowidx+1
## Create a dict of dicts of dataframes, with the appropriate subindex, cleaned up
df_results=dict([(assetname, dict([
(ccy, _select_and_clean_pd_dataframe(main_table, results[assetname][ccy], colref) )
for ccy in currencies])) for assetname in assetshortnames])
return df_results
def _collapse_recursive_dict(df_results):
"""
Convert the df_results back to a dataframe
df_results will be a two level dict with dataframes inside. We add the dict keys as extra columns
"""
all_results=[]
assets=list(df_results.keys())
for assetname in assets:
df_subresults=df_results[assetname]
currencies=list(df_subresults.keys())
for ccy in currencies:
df_subsub=df_subresults[ccy]
if df_subsub is None:
## Empty dict. It happens
continue
## Create extra columns for sub dataframe
df_subsub["AssetClass"]=[assetname]*len(df_subsub.index)
df_subsub["Currency"]=[ccy]*len(df_subsub.index)
## Add the sub dataframe to the list of dataframes
all_results.append(df_subsub)
all_results=pd.concat(all_results)
return all_results
def _parse_trade_date(tradedate):
try:
return datetime.datetime.strptime(tradedate, "%Y-%m-%d, %H:%M:%S")
except:
return datetime.datetime.strptime(tradedate, "%Y-%m-%d")
def _read_ib_html(fname, table_ref):
"""
Reads a single table from an .html file fname produced by IB reports, and returns a pandas dataframe
table_ref gives position of table in .html stack
"""
## Open the file
with open(fname,'r') as file_handle:
soup = BeautifulSoup(file_handle.read(), features="html.parser")
if len(soup)==0:
raise Exception("Empty or non existent html file %s" % fname)
## Find the right table and extract the rows
tables=soup.find_all('table')
table=tables[table_ref]
table_rows = table.find_all('tr')
## Process the rows from html into lists
(headerrow, table_data) = _parse_html_table(table_rows)
## Convert to pandas dataframe
main_table=_html_table_to_pddataframe(headerrow, table_data)
return main_table
def _from_positions_row_to_position(row):
"""
Convert a row into a Position object
"""
quantity=float(row.Quantity.replace(',',''))
this_position=Position(Code=row.Symbol.replace('+',''), Position=quantity)
return this_position
def _from_pddf_to_positions_object(all_results):
"""
Converts a pandas data frame to a list of positions
"""
plist=PositionList([_from_positions_row_to_position(all_results.iloc[idx])
for idx in range(len(all_results.index))])
return plist
def _from_trades_row_to_trade(row):
"""
Convert a row of trades into a trade object
"""
## IB has negative for buys, and positive for sales (i.e. cashflow method)
value=float(row['Notional Value'].replace(',',''))
quantity=float(row.Quantity.replace(',',''))
## Note that taxes and commissions are reported as negative (cashflow)
## Value is negative for buys and positive for sells, which is fine
## quantities are already signed
this_trade=Trade(Code=row.Symbol, Currency=row.Currency, Price=float(row.Price.replace(',','')),
Tax=abs(float(row.Tax.replace(',',''))),
Commission=abs(float(row.Comm.replace(',',''))),
Date=_parse_trade_date(row['Trade Date']), SignQuantity=quantity,
Quantity=abs(quantity), Value=value, AssetClass=row.AssetClass)
return this_trade
def _from_pddf_to_trades_object(all_results):
"""
Converts a pandas data frame to a list of trades
"""
tlist=TradeList([_from_trades_row_to_trade(all_results.iloc[idx])
for idx in range(len(all_results.index))])
return tlist
def get_ib_trades(fname, trades_loc=1):
"""
Reads an .html file output by interactive brokers
Returns a trade_list object
To get the file log in to Account manager... Reports.... trade confirmations....
Save the resulting report as trades.html (or whatever)
You'll need the report for the current financial year, plus
"""
print("Getting trades from %s" % fname)
main_table=_read_ib_html(fname, table_ref=trades_loc)
## Convert to a recursive dict of dicts, whilst doing some cleaning
df_results=_parse_pandas_df(main_table)
## Go back to a single data frame with extra columns added
all_results=_collapse_recursive_dict(df_results)
## Finally convert to a list of trades
return _from_pddf_to_trades_object(all_results)
def get_ib_positions(fname, positions_loc=2):
"""
Reads an .html file output by interactive brokers to get positions
These are used to check consistency with trades
positions_loc is 7 for old html files, 12 for more recent ones from IB.
Returns a positionList object
To get the file log in to Account manager... Reports.... activity report....
Save the resulting report as positions.html (or whatever)
Make sure the position report is run up to the same point as your trades
"""
print("Getting positions from %s" % fname)
main_table=_read_ib_html(fname, table_ref=positions_loc)
print("Processing positions")
## Convert to a recursive dict of dicts, whilst doing some cleaning
df_results=_parse_pandas_df(main_table, "Symbol")
## Collapse it back to a single df with extra columns
all_results=_collapse_recursive_dict(df_results)
## Finally convert to a list of positions
return _from_pddf_to_positions_object(all_results)