-
Notifications
You must be signed in to change notification settings - Fork 0
/
playLoader.cpp
595 lines (549 loc) · 25.9 KB
/
playLoader.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
/* This file is part of NFLdecisionTree. It creates decision trees to classify
situations within NFL football games, and displays the plays historically
called in those situations given a set of opponents.
Copyright (C) 2013 Ezra Erb
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License version 3 as published
by the Free Software Foundation.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
I'd appreciate a note if you find this program useful or make
updates. Please contact me through LinkedIn (my profile also has
a link to the code depository)
*/
#include<string>
#include<vector>
#include<fstream>
#include<iostream>
#include<sstream>
#include<algorithm>
#include"singlePlay.h" // Needed by playIndexSet.h
#include"playIndexSet.h" // Needed by dataStore.h
#include"playStats.h" // Needed by dataStore.h
#include"dataStore.h"
#include"playLoader.h"
#include"baseException.h"
using std::string;
using std::ifstream;
using std::cerr;
using std::endl;
using std::stringstream;
using std::find;
/* This class loads plays from .csv files into the in-memory database.
Statistical techniques are hard to apply to football, because the
number of variables a team faces is so large. Modern NFL teams deal
with the issue through their Quality Control departments. Any NFL team
can request film on any game ever played back to the early 1960s (when
NFL Films started). This department selects games that will be the most
useful to study and breaks down the film into individual plays.
Selection of games to study is one of the blackest arts of football
coaching. Most select previous games against a given opponent, teams
similar to their team against that opponent, and their team
against teams similar to their opponent. The class builds the database
using this approach. The caller specifies the similar teams.
WARNING: This class is still pretty simplistic compared to real Quality
Control, which will lead to skewed results. Many go much further than
just teams, getting film containing key players or teams where a key
coach has worked previously. They also select games from different
team/season combinations while this routine gets all seasons for a given
matchup */
// Constructor. Records the directory that loadSingleSeason() reads the
// season data files from. The play file stream is default constructed,
// so no file is open until a season is loaded
PlayLoader::PlayLoader(const string& directory)
    : _playFile(),
      _directory(directory)
{
    // Nothing else to do; both members are set up by the initializers above
}
// Destructor. Closes the play file if a load left it open
PlayLoader::~PlayLoader()
{
    // Nothing to release when no file is open
    if (!_playFile.is_open())
        return;
    _playFile.close();
}
// Loads plays for the wanted teams for one season into the data store
/* WARNING: Does NOT generate index data!
   thisTeam:      team whose offensive plays are wanted
   otherTeam:     opponent whose defense is wanted
   thisSimiliar:  teams considered similar to thisTeam
   otherSimiliar: teams considered similar to otherTeam
   seasonYear:    season to load; selects the file XXXX_nfl_pbp_data.csv
   dataStore:     receives every play that processPlay() accepts
   Throws BaseException if the data file can not be opened */
void PlayLoader::loadSingleSeason(const string& thisTeam, const string& otherTeam,
                                  const vector<string>& thisSimiliar,
                                  const vector<string>& otherSimiliar,
                                  unsigned short seasonYear,
                                  DataStore& dataStore)
{
    // Release any file left over from a previous load
    if (_playFile.is_open())
        _playFile.close();
    /* The previous load ran the stream to end of file. Older libraries do not
       reset the EOF/fail flags when a stream is reopened, so clear them here
       or the first read of the new file could fail */
    _playFile.clear();

    /* Assemble the file name. Format is XXXX_nfl_pbp_data.csv, where XXXX is the year.
       The passed directory does not include the separator needed before the
       filename, so add it. Windows uses a backslash (escaped as '\\' in C++
       source); everywhere else a backslash is an ordinary filename character
       and the separator is a forward slash */
#ifdef _WIN32
    const char pathSeparator = '\\';
#else
    const char pathSeparator = '/';
#endif
    stringstream fullFileName;
    fullFileName << _directory << pathSeparator << seasonYear << "_nfl_pbp_data.csv";
    const string fileName(fullFileName.str());

    _playFile.open(fileName.c_str());
    if (!_playFile.is_open()) {
        // Report the full file path, to catch the error where the directory is wrong
        stringstream errorMessage;
        errorMessage << "Error, could not open data file " << fileName;
        throw BaseException(__FILE__, __LINE__, errorMessage.str().c_str());
    }

    /* Some texts recommend always putting file reads in a try...catch block, to clean up
       properly. This code doesn't bother because the destructor will handle the file,
       and the lack of an index creation call on the data store ensures consistent (and empty)
       results on an exception */
    string playText;
    // First line is a header. Read it to burn it
    getline(_playFile, playText);

    unsigned short sackCount = 0; // Number of sacks processed
    /* Loop on the result of the read itself. Testing eof() before reading
       processes one bogus empty line after the last play, and never
       terminates if the stream fails for a reason other than end of file */
    while (getline(_playFile, playText)) {
        // Tolerate files with Windows line endings read on other platforms
        if (!playText.empty() && (playText[playText.size() - 1] == '\r'))
            playText.erase(playText.size() - 1);
        // Blank lines (typically a trailing newline at end of file) are not plays
        if (playText.empty())
            continue;
        processPlay(playText, thisTeam, otherTeam, thisSimiliar, otherSimiliar, sackCount, dataStore);
    }
    _playFile.close();
}
// Process a single play from a data file
void PlayLoader::processPlay(const string& playString, const string& thisTeam, const string& otherTeam,
const vector<string>& thisSimiliar, const vector<string>& otherSimiliar,
unsigned short& sackCount, DataStore& dataStore)
{
/* Play data is organized in the following fields:
gameid,qtr,min,sec,off,def,down,togo,ydline,description,offscore,defscore,season
They are extracted by searching for the commas */
unsigned int pos;
unsigned int prevPos;
// First category is a game ID, burn it
pos = playString.find_first_of(',');
// Second category is quarter, burn it
pos = playString.find_first_of(',', pos + 1);
// Third cagetory is minues. Extract it
prevPos = pos + 1; // Move off the comma
pos = playString.find_first_of(',', prevPos);
if (pos == string::npos) { // Indicates badly formed input
cerr << "Improperly formatted input: " << playString << endl;
return;
}
unsigned short minutes = (unsigned short)extractNumeric(playString, prevPos, pos);
// Fourth category is seconds. Burn it
pos = playString.find_first_of(',', pos + 1);
// Fifth category is offence, extract it
prevPos = pos + 1; // Move off the comma
pos = playString.find_first_of(',', prevPos);
if (pos == string::npos) { // Indicates badly formed input
cerr << "Improperly formatted input: " << playString << endl;
return;
}
string offense(playString, prevPos, pos - prevPos);
// Sixth category is defence, extract it
prevPos = pos + 1; // Move off the comma
pos = playString.find_first_of(',', prevPos);
if (pos == string::npos) { // Indicates badly formed input
cerr << "Improperly formatted input: " << playString << endl;
return;
}
string defense(playString, prevPos, pos - prevPos);
// Seventh category is down, extract it
prevPos = pos + 1; // Move off the comma
pos = playString.find_first_of(',', prevPos);
if (pos == string::npos) { // Indicates badly formed input
cerr << "Improperly formatted input: " << playString << endl;
return;
}
/* If the down is missing, this play is a non-down play like kickoffs
and extra point attempts. The database deliberately ignores these */
if (pos == prevPos)
return;
unsigned short down = (unsigned short)extractNumeric(playString, prevPos, pos);
/* At this point, have enough information to determine whether this play is wanted
or not. It must fall within one of three categories
1. Offense matches wanted offense and defense matches wanted offense
2. Offense matches wanted offense and defense falls in list of similiar
defenses to wanted defense
3. Offense falls in list similiar offenses to wanted offense and defense
matches wanted defense
WARNING: This process is sensitive to both whitespace and capitalization, because
its more efficient for the caller to get this right that for this code to deal with
matching it. The current set of data files requires ALL CAPS and no whitespace */
bool haveMatch = false;
if (offense == thisTeam) {
if (defense == otherTeam)
haveMatch = true;
else
haveMatch = (find(otherSimiliar.begin(), otherSimiliar.end(), defense) != otherSimiliar.end());
} // Offense matches the wanted team
else if (defense == otherTeam)
haveMatch = (find(thisSimiliar.begin(), thisSimiliar.end(), offense) != thisSimiliar.end());
else
haveMatch = false;
if (!haveMatch)
return; // Not a wanted play
/* If get to here, want the play. Extract remaining data snd insert into the data store.
Some of it requires a tricky search of the description field */
// Eighth category is distance needed, extract it.
/* NOTE: If this is a non-down play, the distance won't be set either. They
are rejected above, so missing the distance here is an error */
prevPos = pos + 1; // Move off the comma
pos = playString.find_first_of(',', prevPos);
if ((pos == string::npos) || (pos == prevPos)) {
// Indicates badly formed input
cerr << "Improperly formatted input: " << playString << endl;
return;
}
unsigned short distanceNeeded = (unsigned short)extractNumeric(playString, prevPos, pos);
// Ninth category is position on the field, extract it.
prevPos = pos + 1; // Move off the comma
pos = playString.find_first_of(',', prevPos);
if (pos == string::npos) { // Indicates badly formed input
cerr << "Improperly formatted input: " << playString << endl;
return;
}
unsigned short yardLine = (unsigned short)extractNumeric(playString, prevPos, pos);
// Tenth category is play description. This needs further processing. Extract it here
prevPos = pos + 1; // Move off the comma
pos = playString.find_first_of(',', prevPos);
if (pos == string::npos) { // Indicates badly formed input
cerr << "Improperly formatted input: " << playString << endl;
return;
}
string description(playString, prevPos, pos - prevPos);
// Eleventh category is offence current score, extract it.
prevPos = pos + 1; // Move off the comma
pos = playString.find_first_of(',', prevPos);
if (pos == string::npos) { // Indicates badly formed input
cerr << "Improperly formatted input: " << playString << endl;
return;
}
unsigned short ownScore = (unsigned short)extractNumeric(playString, prevPos, pos);
// Twelveth category is defense current score, extract it.
prevPos = pos + 1; // Move off the comma
pos = playString.find_first_of(',', prevPos);
if (pos == string::npos) { // Indicates badly formed input
cerr << "Improperly formatted input: " << playString << endl;
return;
}
unsigned short oppScore = (unsigned short)extractNumeric(playString, prevPos, pos);
/* To get play type, yardage gained, and turnover, need to parse the description.
Thankfully, it has a standard format */
SinglePlay::PlayType playType = SinglePlay::punt;
short distanceGained = 0;
bool turnedOver = false;
unsigned int wordLoc;
bool havePlay = false;
// ' pass ' or ' passed ' indicates a pass play
wordLoc = description.find(string(" pass "));
if (wordLoc == string::npos)
wordLoc = description.find(string(" passed "));
if (wordLoc != string::npos) {
wordLoc += 5; // Move to next word
if (description[wordLoc] != ' ')
wordLoc += 2; // Are sitting on 'ed '
wordLoc++; // Move off the space
// If next word is 'incomplete ', no yards and no turnover
// SUBTLE NOTE: compare() returns 0 if they match
bool haveIncomplete = false;
if (description.compare(wordLoc, 11, string("incomplete ")) == 0) {
haveIncomplete = true;
wordLoc += 11;
} // Incomplete pass
// Next word should be either 'short ' or 'deep '.
bool haveDeep = (description.compare(wordLoc, 5, string("deep ")) == 0);
if (haveDeep)
// Move off the word
wordLoc += 5;
else {
/* Unfortunately, not all pass descriptions include the distance. These are
treated as short. The effect here is that 'short' should only be skipped
if it is there */
if (description.compare(wordLoc, 6, string("short ")) == 0)
wordLoc+=6;
}
// Next word will be 'left', 'right', or 'middle '
if (description.compare(wordLoc, 5, string("left ")) == 0) {
if (haveDeep)
playType = SinglePlay::pass_deep_left;
else
playType = SinglePlay::pass_short_left;
} // Left pass
else if (description.compare(wordLoc, 6, string("right ")) == 0) {
if (haveDeep)
playType = SinglePlay::pass_deep_right;
else
playType = SinglePlay::pass_short_right;
} // Left pass
else {
/* Unfortunately, not all pass descriptions include the direction either.
They are treated as passes to the middle, so they will end up here
whether the word 'middle' exists or not */
if (haveDeep)
playType = SinglePlay::pass_deep_middle;
else
playType = SinglePlay::pass_short_middle;
} // Right pass
if (haveIncomplete) {
distanceGained = 0;
turnedOver = false;
}
else {
/* If the description has 'INTERCEPTION', the play is listed as an interception for
no yards */
if (description.find(string("INTERCEPT"), wordLoc) != string::npos) {
distanceGained = 0;
turnedOver = true;
}
else
extractPlayYardageTurnover(description, wordLoc, distanceGained, turnedOver);
} // Not an incomplete pass
havePlay = true;
} // Pass play
/* Running plays are listed by specifying a player and a direction. Thankfully, the directions
are unique to running plays! Need to find each one individually */
if (!havePlay) {
wordLoc = description.find(string(" left end "));
if (wordLoc == string::npos)
wordLoc = description.find(string(" left guard "));
if (wordLoc == string::npos)
wordLoc = description.find(string(" left tackle "));
if (wordLoc != string::npos) {
playType = SinglePlay::run_left;
extractPlayYardageTurnover(description, wordLoc, distanceGained, turnedOver);
havePlay = true;
} // Run left
} // No play yet
if (!havePlay) {
wordLoc = description.find(string(" right end "));
if (wordLoc == string::npos)
wordLoc = description.find(string(" right guard "));
if (wordLoc == string::npos)
wordLoc = description.find(string(" right tackle "));
if (wordLoc != string::npos) {
playType = SinglePlay::run_right;
extractPlayYardageTurnover(description, wordLoc, distanceGained, turnedOver);
havePlay = true;
} // Run left
} // No play yet
if (!havePlay) {
/* Some rush plays have ' rushed ' with no direction. A few more have 'scrambled'
for quarterback scrambles. Treat these as up the middle */
wordLoc = description.find(string(" up the middle "));
if (wordLoc == string::npos)
wordLoc = description.find(string(" rushed "));
if (wordLoc == string::npos)
wordLoc = description.find(string(" scrambles "));
if (wordLoc != string::npos) {
playType = SinglePlay::run_middle;
extractPlayYardageTurnover(description, wordLoc, distanceGained, turnedOver);
havePlay = true;
} // Run left
} // No play yet
if (!havePlay) {
/* Quarter back sacks appear as ' sacked '. In practice they are almost always
busted pass plays. The type of pass play is unknown, so they are evenly divided
between the pass play types. Note that a sack has yardage and can fumble as well */
wordLoc = description.find(string(" sacked "));
if (wordLoc != string::npos) {
wordLoc += 8; // Move to next word
switch (sackCount % 6) {
case 0:
playType = SinglePlay::pass_short_left;
break;
case 1:
playType = SinglePlay::pass_short_middle;
break;
case 2:
playType = SinglePlay::pass_short_right;
break;
case 3:
playType = SinglePlay::pass_deep_left;
break;
case 4:
playType = SinglePlay::pass_deep_middle;
break;
case 5:
playType = SinglePlay::pass_deep_right;
break;
} // Switch on number of processed sacks
sackCount++;
extractPlayYardageTurnover(description, wordLoc, distanceGained, turnedOver);
havePlay = true;
} // Quarterback sack
} // Play not found so far
if (!havePlay) {
// ' punts ' or ' punted ' indicates a successful punt, followed by the yardage
wordLoc = description.find(string(" punts "));
if (wordLoc == string::npos)
wordLoc = description.find(string(" punted "));
if (wordLoc != string::npos) {
playType = SinglePlay::punt;
wordLoc += 6; // Skip word
if (description[wordLoc] != ' ')
wordLoc++; // On 'd '
wordLoc++; // Move off space
unsigned int nextWordLoc = description.find(' ', wordLoc);
distanceGained = extractNumeric(description, wordLoc, nextWordLoc);
turnedOver = false; // Successful punts are not considered turnovers
havePlay = true;
} // Punt
} // Play not found so far
if (!havePlay) {
// ' field goal ' indicates a field goal attempt.
wordLoc = description.find(string(" field goal "));
if (wordLoc != string::npos) {
/* If the next words are 'is GOOD', it succeeded. The yardage is found BEFORE
the word 'yard' before the signal words */
playType = SinglePlay::field_goal;
if (description.compare(wordLoc + 12, 7, string("is GOOD")) == 0) {
wordLoc = description.rfind(' ', wordLoc - 1); // Skip over 'yard'
unsigned int prevWordLoc = description.rfind(' ', wordLoc - 1);
distanceGained = extractNumeric(description, prevWordLoc + 1, wordLoc);
}
else
distanceGained = 0;
/* Field goal attempts that result in turnovers appear differently, so none
of these resulted in a turnover (turnover on downs doesn't count) */
turnedOver = false;
havePlay = true;
} // Field goal attempt
} // No play so far
if (!havePlay) {
/* The phrase 'FUMBLES (Aborted)' means the quaterback fumbled the ball without
being sacked. In practice they are almost always busted pass plays.
The type of pass play is unknown, so they are evenly divided between the pass
play types. */
wordLoc = description.find(string(" FUMBLES (Aborted) "));
if (wordLoc != string::npos) {
switch (sackCount % 6) {
case 0:
playType = SinglePlay::pass_short_left;
break;
case 1:
playType = SinglePlay::pass_short_middle;
break;
case 2:
playType = SinglePlay::pass_short_right;
break;
case 3:
playType = SinglePlay::pass_deep_left;
break;
case 4:
playType = SinglePlay::pass_deep_middle;
break;
case 5:
playType = SinglePlay::pass_deep_right;
break;
} // Switch on number of processed sacks
sackCount++;
distanceGained = 0;
turnedOver = true;
havePlay = true;
} // Quarterback sack
} // Play not found so far
if (!havePlay) {
/* The phrase ' Aborted. ' indicates a few types of busted plays. All three involve
turning the ball over */
if (description.find(string(" Aborted. ")) != string::npos) {
if (description.find(string("Punt")) != string::npos)
playType = SinglePlay::punt;
else if (description.find(string("Field Goal")) != string::npos)
playType = SinglePlay::field_goal;
else
// Bad handoff on a running play
playType = SinglePlay::run_middle;
distanceGained = 0;
turnedOver = true;
havePlay = true;
} // Busted play
} // Play not found so far
if (!havePlay) {
// ' punt is BLOCKED ' indicates an unsuccessful punt. This is treated as a turnover with no yardage
wordLoc = description.find(string(" punt is BLOCKED ")); // Note carefully the spaces on either end
if (wordLoc != string::npos) {
playType = SinglePlay::punt;
distanceGained = 0;
turnedOver = true; // Unsuccessful punts are treated as turnovers
havePlay = true;
} // Punt
} // Play not found so far
/* If get to here without a play, have yet another possibility. Some run plays do not have a
direction specified, so they are listed [name] to [team] [location] for [yards]. Assume anything
with this pattern is a running play, unless it contains 'kneels', indicating a kneel down */
if (!havePlay) {
if (description.find(string(" kneels ")) == string::npos) {
wordLoc = description.find(string(" to "));
wordLoc += 4; // Skip 'to'
wordLoc = description.find(' ', wordLoc + 1); // Skip team name in location
if (wordLoc != string::npos)
wordLoc = description.find(' ', wordLoc + 1); // Skip yardage in location
if (wordLoc != string::npos)
if (description.compare(wordLoc, 5, string(" for ")) == 0) {
playType = SinglePlay::run_middle;
extractPlayYardageTurnover(description, wordLoc, distanceGained, turnedOver);
havePlay = true;
} // Word pattern indicates a running play
} // Does NOT indicate a kneel down
} // Haven't found a play yet
/* If get to here, some running plays that fail to gain yardage are listed as ' lost ' by
itself followed by yardage */
if (!havePlay) {
wordLoc = description.find(string(" lost "));
if (wordLoc != string::npos) {
playType = SinglePlay::run_middle;
wordLoc += 6;
unsigned int nextWordLoc = description.find(' ', wordLoc);
distanceGained = extractNumeric(description, wordLoc, nextWordLoc);
distanceGained *= -1; // Play had negative yardage
turnedOver = false;
havePlay = true;
} // Have negative running play
} // No play yet
// If found a play at this point, insert it in the data store
if (havePlay)
dataStore.insertPlay(playType, down, distanceNeeded, yardLine, minutes,
ownScore, oppScore, distanceGained, turnedOver);
else {
/* Certain things indicate non-plays. Check for them. If the descrition
does not match any of them, output it as an unknown play type
1. Penalties after a play get their own play line
2. Kneel downs at the end of the game are ignored; their use is obvious
3. Spikes to stop the clock are ignored; their use is pretty obvious
4. Video reviews get their own line
5. Some kickoffs mistakenly have a down listed
*/
if ((description.find(string("PENALTY")) == string::npos) &&
(description.find(string("penalized")) == string::npos) &&
(description.find(string("kneels")) == string::npos) &&
(description.find(string("spiked")) == string::npos) &&
(description.find(string("kicked")) == string::npos) &&
(description.find(string(" play under review ")) == string::npos))
cerr << "UNKNOWN PLAY TYPE: " << playString << endl;
} // Not a known play type
}
// Finds the yardage achieved from a play, and whether the ball was fumbled
/* description:    text of the play description
   pos:            offset in the description at which to start the search
   distanceGained: receives the yards gained; negative for a loss, and zero for
                   'no gain' or when the description has no yardage
   turnedOver:     receives true if 'FUMBLE' appears after the yardage (or
                   anywhere after pos when there is no yardage) */
void PlayLoader::extractPlayYardageTurnover(const string& description, unsigned int pos,
                                            short& distanceGained, bool& turnedOver)
{
    distanceGained = 0;

    // Yards gained always appears as ' for XXX yards'. Search for the ' for ' to find it
    /* SUBTLE NOTE: Positions must be string::size_type so a failed search can be
       detected. Adding to a failed result wraps around to the start of the
       description and extracts garbage */
    const string::size_type forLoc = description.find(" for ", pos);
    if (forLoc == string::npos) {
        // No yardage listed. The ball can still have been fumbled
        turnedOver = (description.find("FUMBLE", pos) != string::npos);
        return;
    }
    string::size_type wordLoc = forLoc + 5; // Skip over string

    /* Plays of zero yards are listed as 'no gain'. Need to check for 'no '
       SUBTLE NOTE: The compared length must include the trailing space. Comparing
       only the characters up to the next space tests 'no' against 'no ', which
       never matches */
    const bool noGain = (description.compare(wordLoc, 3, "no ") == 0);

    /* Just to confuse things, some descriptions use 'a loss of' to indicate
       negative yardage */
    bool flipSign = false;
    if (!noGain && (description.compare(wordLoc, 10, "a loss of ") == 0)) {
        flipSign = true;
        wordLoc += 10; // Move to the yardage
    } // Loss flagged in words

    const string::size_type nextWordLoc = description.find(' ', wordLoc);
    if (!noGain) {
        // The yardage runs to the next space, or to the end of the description
        const string::size_type yardageEnd =
            (nextWordLoc == string::npos) ? description.size() : nextWordLoc;
        if (wordLoc < yardageEnd) {
            distanceGained = static_cast<short>(extractNumeric(description, wordLoc, yardageEnd));
            if (flipSign)
                distanceGained = static_cast<short>(-distanceGained);
        } // Have yardage text to convert
    } // Play gained or lost yards

    // If the description contains 'FUMBLE', the ball carrier turned it over
    turnedOver = (description.find("FUMBLE", nextWordLoc) != string::npos);
}