-
Notifications
You must be signed in to change notification settings - Fork 0
/
combine.py
73 lines (54 loc) · 3 KB
/
combine.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
import os
import pandas as pd
# Define directories for source data
eod_data_dir = 'data/eod_data/'
historical_data_dir = 'data/historical_data/'
combined_file = 'data/combined.csv'
# Function to process files and extract 'Date' and 'Adjusted Close' columns
def process_files(directory, date_col_name, adjusted_close_col_name):
all_data = pd.DataFrame()
for filename in os.listdir(directory):
if filename.endswith('.csv'):
file_path = os.path.join(directory, filename)
stock_symbol = filename.split('_')[0] # Assumes filename starts with stock symbol (e.g., AAPL_eod_data.csv)
try:
# Read the CSV file
df = pd.read_csv(file_path)
# Check if the required columns exist
if date_col_name not in df.columns or adjusted_close_col_name not in df.columns:
print(f"Warning: Required columns not found in {filename}. Skipping this file.")
continue
# Rename columns to a standard format
df.rename(columns={date_col_name: 'Date', adjusted_close_col_name: stock_symbol}, inplace=True)
df = df[['Date', stock_symbol]] # Keep only 'Date' and adjusted close
# Append data for this stock to the combined dataframe
if all_data.empty:
all_data = df
else:
all_data = pd.merge(all_data, df, on='Date', how='outer') # Merge on 'Date'
except Exception as e:
print(f"Error processing file {filename}: {e}")
continue
return all_data
# Process EOD data (adjusted close is 'adjusted_close', date is 'date')
eod_data = process_files(eod_data_dir, 'date', 'adjusted_close')
# Process historical data (adjusted close is 'Adj Close', date is 'Date')
historical_data = process_files(historical_data_dir, 'Date', 'Adj Close')
# Combine both datasets
combined_df = pd.concat([eod_data, historical_data]).drop_duplicates(subset=['Date']).reset_index(drop=True)
# Convert 'Date' to pandas datetime format
combined_df['Date'] = pd.to_datetime(combined_df['Date'], format='%Y-%m-%d', errors='coerce')
# Drop rows with invalid dates
combined_df.dropna(subset=['Date'], inplace=True)
# Sort by 'Date'
combined_df.sort_values(by='Date', inplace=True)
# Find the earliest date in the combined dataset
earliest_date = combined_df['Date'].min()
# Add a 'Day' column starting from 1 (for the earliest date found)
combined_df['Day'] = (combined_df['Date'] - earliest_date).dt.days + 1
# Reorder columns to have 'Day', 'Date', and then the stock data
combined_df['Date'] = combined_df['Date'].dt.strftime('%d/%m/%Y') # Format Date as DD/MM/YYYY
combined_df = combined_df[['Day', 'Date'] + [col for col in combined_df.columns if col not in ['Day', 'Date']]]
# Save the combined dataset to a CSV file
combined_df.to_csv(combined_file, index=False)
print(f"Combined dataset created and saved to {combined_file}, with earliest date as {earliest_date.date()}.")