Source code for src.get_data

"""
Utility functions for acquiring financial data

@author: Duncan Mazza
"""

import pandas_datareader.data as web
from pandas import DataFrame, read_csv, errors
import os
import numpy as np
import requests
from datetime import datetime

# Time-of-day suffix that ``str()`` appends to a midnight ``datetime``
# (e.g. ``"2020-01-10 00:00:00"``); removed when building cache file names.
ZERO_TIME = " 00:00:00"


class Company:
    """
    Daily stock data for a single ticker, plus helpers to fetch, cache, trim and transform it.

    Instance attributes:

    * ``ticker`` -- ticker symbol; the special value ``"dummy"`` selects synthetic data
    * ``start_date`` / ``end_date`` -- requested date range; overwritten by :meth:`populate_dataframe`
      with the first / last value of the ``"Date"`` column when they differ
    * ``start_date_changed`` / ``end_date_changed`` -- set to ``True`` when the corresponding date was
      overwritten
    * ``data_frame`` -- the loaded data, with a ``"Date"`` column and a ``"Close"`` column
    * ``cache_bool`` -- whether downloaded data is written to the ``.cache/`` folder
    """

    def __init__(
        self,
        ticker: str,
        start_date: datetime,
        end_date: datetime,
        call_populate_dataframe: bool = True,
        cache_bool: bool = True,
    ):
        """
        Stores the query parameters and, optionally, loads the data immediately.

        :param ticker: ticker string of the company (``"dummy"`` produces synthetic data instead of a query)
        :param start_date: first date of the requested data record
        :param end_date: last date of the requested data record
        :param call_populate_dataframe: if true, :meth:`populate_dataframe` is called before returning
        :param cache_bool: if true, data downloaded by :meth:`return_data` is saved to the ``.cache/`` folder
        """
        self.start_date = start_date
        self.end_date = end_date
        self.ticker = ticker
        self.data_frame: DataFrame = DataFrame()
        self.cache_bool = cache_bool
        self.end_date_changed = False
        self.start_date_changed = False
        if call_populate_dataframe:
            self.populate_dataframe()

    def populate_dataframe(self):
        r"""
        Populates :attr:`data_frame` with stock data acquired using pandas_datareader.data. View more information
        `here <https://pandas-datareader.readthedocs.io/en/latest/remote_data.html>`__.

        Modifies :attr:`start_date`, :attr:`start_date_changed`, :attr:`end_date`, and :attr:`end_date_changed` if
        :attr:`start_date` and/or :attr:`end_date` are different than the actual start and end dates in
        :attr:`data_frame` such that :attr:`start_date` and :attr:`end_date` equal the actual start and end dates in
        :attr:`data_frame` (and :attr:`start_date_changed` and :attr:`end_date_changed` reflect whether
        :attr:`start_date` and :attr:`end_date` were changed respectively).
        """
        if self.ticker == "dummy":
            self.data_frame = self.return_dummy_data()
        else:
            self.data_frame = self.return_data()

        # A freshly loaded frame is indexed 0..len-1, so label 0 is the first row.
        data_frame_start_date = self.data_frame["Date"][0]
        data_frame_end_date = self.data_frame["Date"][self.data_frame.last_valid_index()]

        if data_frame_start_date != self.start_date:
            self.start_date = data_frame_start_date
            self.start_date_changed = True
        if data_frame_end_date != self.end_date:
            self.end_date = data_frame_end_date
            self.end_date_changed = True

    @staticmethod
    def _date_label(date) -> str:
        """Returns ``str(date)`` with a trailing midnight time component (``ZERO_TIME``) removed."""
        label = str(date)
        # ``label.strip(ZERO_TIME)`` would treat ZERO_TIME as a *set* of characters and also eat the
        # trailing zero of the day ("2020-01-10" -> "2020-01-1"); remove the exact suffix instead.
        if label.endswith(ZERO_TIME):
            label = label[: -len(ZERO_TIME)]
        return label

    def return_data(self, ticker: str = None, start_date: datetime = None, end_date: datetime = None) -> DataFrame:
        """
        Returns the DataFrame containing the financial data for the prescribed company. This function will pull the
        data from the Yahoo API built into :ref:`pandas_datareader` if it has not been cached and will then cache the
        data, or it will read the data from the cached ``csv`` file. The cached files are named with the ticker,
        start date, and end dates that specify the API query, and exist in the ``.cache/`` folder located under the
        current working directory.

        The returned frame always has a ``"Date"`` column of datetimes and a 0-based integer index, regardless of
        whether it came from the cache or from the internet.

        :param ticker: ticker string for the company whose data will be retrieved (defaults to :attr:`ticker`)
        :param start_date: start date for the data record (defaults to :attr:`start_date`)
        :param end_date: end date for the data record (defaults to :attr:`end_date`)
        :return: DataFrame of financial data
        :raises requests.exceptions.SSLError: re-raised from the download if it fails with this error
        :raises requests.exceptions.ConnectionError: re-raised from the download if it fails with this error
        """
        if start_date is None:
            start_date = self.start_date
        if end_date is None:
            end_date = self.end_date
        if ticker is None:
            ticker = self.ticker

        start_date_str = self._date_label(start_date)
        end_date_str = self._date_label(end_date)
        rel_file_path = os.path.join(".cache", "&".join([ticker, start_date_str, end_date_str])) + ".csv"
        abs_file_path = os.path.join(os.getcwd(), rel_file_path)

        if os.path.exists(abs_file_path):
            try:
                # Parse "Date" so cached data has the same dtype as freshly downloaded data; otherwise the
                # column holds strings that never compare equal to datetime objects.
                data_frame = read_csv(abs_file_path, parse_dates=["Date"])
            except errors.ParserError:
                # Unreadable cache file: report it and fall through to downloading the data again.
                print("Could not load data for {} from {} to {} from .cache/ folder (although the path exists"
                      .format(ticker, start_date, end_date))
            else:
                print(" > Loaded data requested for {} from {} to {} from '.cache/' folder".format(
                    ticker, start_date_str, end_date_str))
                return data_frame

        try:
            data_frame = web.get_data_yahoo(ticker, start_date, end_date)
        except requests.exceptions.SSLError:
            print("ERROR: A 'requests.exceptions.SSLError' was raised, which may be indicative of a lack of "
                  "internet connection; try again after verifying that you have a successful internet "
                  "connection.")
            raise  # bare raise keeps the original exception, message and traceback
        except requests.exceptions.ConnectionError:
            print("ERROR: A 'requests.exceptions.ConnectionError' was raised, which may be indicative of a "
                  "lack of internet connection; try again after verifying that you have a successful "
                  "internet connection.")
            raise
        print(" > Loaded data requested for {} from {} to {} from internet".format(
            ticker, start_date_str, end_date_str))

        if self.cache_bool:
            # Cached before the index is replaced, so the timestamp index is written as the "Date" column.
            self.cache(rel_file_path, data_frame)

        # loading the dataframe from the internet as opposed to from the csv cache results in a different handling
        # of the timestamp index, where the timestamp index is converted to a "Date" column when cached.
        # Consequently, a "Date" column needs to be inserted
        data_frame.insert(0, "Date", data_frame.index)
        data_frame.index = np.arange(0, len(data_frame), 1)
        return data_frame

    def return_dummy_data(self) -> DataFrame:
        """
        Creates linear stock data as dummy data for testing a model

        :return: DataFrame with 200 rows whose ``"Date"`` and ``"Close"`` columns both count from 0 to 199
        """
        return DataFrame({"Date": list(range(200)), "Close": list(range(200))})

    def _position_of_date(self, date) -> int:
        """Returns the row position (not the index label) of the first ``"Date"`` entry equal to ``date``."""
        matches = np.flatnonzero((self.data_frame["Date"] == date).values)
        if matches.size == 0:
            raise IndexError("date {} not found in the 'Date' column of data_frame".format(date))
        return int(matches[0])

    def revise_start_date(self, new_start_date: datetime):
        """
        Modifies :attr:`data_frame` such that the starting date of the data is equal to ``new_start_date`` (all
        prior data is deleted). The index labels of the remaining rows are left untouched.

        :param new_start_date: a datetime object of the new start date for :attr:`data_frame` (where
            ``new_start_date`` exists and is unique in ``self.data_frame["Date"]``)
        :raises IndexError: if ``new_start_date`` is not present in ``self.data_frame["Date"]``
        """
        # Slice by position: after an earlier trim the index labels no longer equal row positions.
        position = self._position_of_date(new_start_date)
        self.data_frame = self.data_frame.iloc[position:]
        self.start_date = new_start_date

    def revise_end_date(self, new_end_date: datetime):
        """
        Modifies :attr:`data_frame` such that the last date of the data is equal to ``new_end_date`` (all
        following data is deleted). The index labels of the remaining rows are left untouched.

        :param new_end_date: a datetime object of the new end date for :attr:`data_frame` (where
            ``new_end_date`` exists and is unique in ``self.data_frame["Date"]``)
        :raises IndexError: if ``new_end_date`` is not present in ``self.data_frame["Date"]``
        """
        position = self._position_of_date(new_end_date)
        self.data_frame = self.data_frame.iloc[:position + 1]  # add 1 so that the last date is included
        self.end_date = new_end_date

    def cache(self, file_path: str, data_frame: DataFrame = None):
        """
        Saves a DataFrame as a ``.csv`` file, creating the destination folder if it does not exist.

        :param file_path: path to save the :ref:`DataFrame` to (``.csv`` is appended if missing); if not an
            absolute path, then it is used as a path relative to the current working directory.
        :param data_frame: DataFrame to save (if not specified, will use :attr:`data_frame` (attribute))
        """
        if not file_path.endswith(".csv"):
            file_path += ".csv"
        if not os.path.isabs(file_path):
            file_path = os.path.join(os.getcwd(), file_path)
        # file_path is absolute here, so its parent directory is never the empty string.
        os.makedirs(os.path.dirname(file_path), exist_ok=True)
        if data_frame is None:
            data_frame = self.data_frame
        data_frame.to_csv(file_path)

    def return_numpy_array_of_company_daily_stock_close(self) -> np.ndarray:
        """
        Returns a numpy array of the "Close" column of :attr:`data_frame`, calling :meth:`populate_dataframe`
        first if :attr:`data_frame` is empty.

        :return: numpy array of closing stock prices indexed by day
        """
        if self.data_frame.empty:
            self.populate_dataframe()
        return np.array(self.data_frame["Close"])

    def return_numpy_array_of_company_daily_stock_percent_change(self, rolling_avg_length: int = 4) -> np.ndarray:
        """
        Converts the numpy array of the closing stock data (acquired by calling
        :meth:`return_numpy_array_of_company_daily_stock_close`) into an array of day-over-day change expressed
        as a fraction of the previous day's close (``0.1`` means +10%). Adds a value of 0 at the beginning of the
        array to maintain sequence length.

        :param rolling_avg_length: currently ignored -- the rolling-average filtering (see
            :meth:`moving_average`) is disabled; the parameter is kept so existing callers keep working
        :return: numpy array of the same length as the array generated by
            :meth:`return_numpy_array_of_company_daily_stock_close`
        """
        daily_stock_data = self.return_numpy_array_of_company_daily_stock_close()
        start_array: np.ndarray = daily_stock_data[:-1]
        end_array: np.ndarray = daily_stock_data[1:]
        percent_change = (end_array - start_array) / start_array
        return np.concatenate((np.array([0]), percent_change), axis=0)

    def get_date_at_index(self, i):
        """
        Returns the entry of the ``"Date"`` column at row position ``i``, as the raw NumPy value stored in the
        column.

        :param i: positional index to return the date of
        """
        return self.data_frame["Date"].iloc[[i]].values[0]

    @staticmethod
    def moving_average(a, n, padding: bool = True):
        r"""
        Calculates the moving average of a one-dimensional numpy array ``a``, capable of utilizing a padding of
        length ``n - 1`` at the beginning of the pre-filtered data so that the length is not truncated; the padding
        is populated with the same value as the first value of the sequence.

        :param a: one-dimensional numpy array
        :param n: length of rolling average filter (must be at least 1)
        :param padding: boolean for whether to utilize padding
        :return: filtered array (same length as ``a`` with padding, ``len(a) - n + 1`` without)
        :raises ValueError: if ``n`` is smaller than 1
        """
        if n < 1:
            raise ValueError("n must be a positive integer, got {}".format(n))
        if padding:
            a = np.concatenate((np.ones((n - 1,)) * a[0], a), axis=0)
        # Difference of running sums taken n apart gives the sum of every length-n window.
        ret = np.cumsum(a, dtype=float)
        ret[n:] = ret[n:] - ret[:-n]
        return ret[n - 1:] / n

    def reconstruct_stock_from_percent_change(self, percent_change_vec: np.ndarray, initial_condition_index: int):
        """
        Reconstruct the stock prices from fractional day-over-day changes, starting from a known closing price.

        :param percent_change_vec: vector of changes, each a fraction of the previous price (``0.1`` means +10%)
        :param initial_condition_index: row position in :attr:`data_frame` whose ``"Close"`` value is used as the
            initial price
        :return: numpy array one element longer than ``percent_change_vec``; element 0 is the initial price
        """
        # Scalar lookup; a one-element Series would rely on deprecated implicit scalar conversion.
        initial_price = self.data_frame["Close"].iloc[initial_condition_index]
        growth_factors = 1 + np.asarray(percent_change_vec, dtype=float)
        # Running product of [p0, 1 + c0, 1 + c1, ...] yields p0, p0 * (1 + c0), p0 * (1 + c0) * (1 + c1), ...
        return np.cumprod(np.concatenate(([initial_price], growth_factors)), dtype=float)