431 lines
19 KiB
Python
from typing import Any, List
|
|
import pandas as pd
|
|
import datetime as dt
|
|
from Database import Database
|
|
from shared_utilities import query_satisfied, query_uptodate, unix_time_millis, timeframe_to_minutes
|
|
import logging
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
class DataCache:
    """
    Fetches and manages data limits and optimizes memory storage.
    Handles connections and operations for the given exchanges.

    Example usage:
    --------------
    db = DataCache(exchanges=some_exchanges_object)
    """

    # Disable during production for improved performance.
    TYPECHECKING_ENABLED = True

    # Sentinel value used to signal that a lookup produced no records.
    NO_RECORDS_FOUND = float('nan')

    def __init__(self, exchanges):
        """
        Initializes the DataCache class.

        :param exchanges: The exchanges object handling communication with connected exchanges.
        """
        self.exchanges = exchanges   # gateway object for the connected exchanges
        self.db = Database()         # persistent storage backend
        self.cached_data = {}        # key -> cached payload (typically a DataFrame)
        self.max_tables = 50         # soft limit on the number of tables kept
        self.max_records = 1000      # soft limit on records per table
|
|
|
|
def cache_exists(self, key: str) -> bool:
|
|
"""
|
|
Checks if a cache exists for the given key.
|
|
|
|
:param key: The access key.
|
|
:return: True if cache exists, False otherwise.
|
|
"""
|
|
return key in self.cached_data
|
|
|
|
def update_candle_cache(self, more_records: pd.DataFrame, key: str) -> None:
|
|
"""
|
|
Adds records to existing cache.
|
|
|
|
:param more_records: The new records to be added.
|
|
:param key: The access key.
|
|
:return: None.
|
|
"""
|
|
records = pd.concat([more_records, self.get_cache(key)], axis=0, ignore_index=True)
|
|
records = records.drop_duplicates(subset="open_time", keep='first')
|
|
records = records.sort_values(by='open_time').reset_index(drop=True)
|
|
self.set_cache(data=records, key=key)
|
|
|
|
def set_cache(self, data: Any, key: str, do_not_overwrite: bool = False) -> None:
|
|
"""
|
|
Creates a new cache key and inserts data.
|
|
|
|
:param data: The records to insert into cache.
|
|
:param key: The index key for the data.
|
|
:param do_not_overwrite: Flag to prevent overwriting existing data.
|
|
:return: None
|
|
"""
|
|
if do_not_overwrite and key in self.cached_data:
|
|
return
|
|
self.cached_data[key] = data
|
|
|
|
def update_cached_dict(self, cache_key: str, dict_key: str, data: Any) -> None:
|
|
"""
|
|
Updates a dictionary stored in cache.
|
|
|
|
:param data: The data to insert into cache.
|
|
:param cache_key: The cache index key for the dictionary.
|
|
:param dict_key: The dictionary key for the data.
|
|
:return: None
|
|
"""
|
|
self.cached_data[cache_key].update({dict_key: data})
|
|
|
|
def get_cache(self, key: str) -> Any:
|
|
"""
|
|
Returns data indexed by key.
|
|
|
|
:param key: The index key for the data.
|
|
:return: Any|None - The requested data or None on key error.
|
|
"""
|
|
if key not in self.cached_data:
|
|
logger.warning(f"The requested cache key({key}) doesn't exist!")
|
|
return None
|
|
return self.cached_data[key]
|
|
|
|
def improved_get_records_since(self, key: str, start_datetime: dt.datetime, record_length: int,
|
|
ex_details: List[str]) -> pd.DataFrame:
|
|
"""
|
|
Fetches records since the specified start datetime.
|
|
|
|
:param key: The cache key.
|
|
:param start_datetime: The start datetime to fetch records from.
|
|
:param record_length: The required number of records.
|
|
:param ex_details: Exchange details.
|
|
:return: DataFrame containing the records.
|
|
"""
|
|
try:
|
|
target = 'cache'
|
|
args = {
|
|
'key': key,
|
|
'start_datetime': start_datetime,
|
|
'end_datetime': dt.datetime.utcnow(),
|
|
'record_length': record_length,
|
|
'ex_details': ex_details
|
|
}
|
|
|
|
df = self.get_or_fetch_from(target=target, **args)
|
|
except Exception as e:
|
|
logger.error(f"An error occurred: {str(e)}")
|
|
raise
|
|
|
|
def get_or_fetch_from(self, target: str, **kwargs) -> pd.DataFrame:
|
|
key = kwargs.get('key')
|
|
start_datetime = kwargs.get('start_datetime')
|
|
end_datetime = kwargs.get('start_datetime')
|
|
record_length = kwargs.get('record_length')
|
|
ex_details = kwargs.get('ex_details')
|
|
|
|
if self.TYPECHECKING_ENABLED:
|
|
# Type checking
|
|
if not isinstance(key, str):
|
|
raise TypeError("key must be a string")
|
|
if not isinstance(start_datetime, dt.datetime):
|
|
raise TypeError("start_datetime must be a datetime object")
|
|
if not isinstance(end_datetime, dt.datetime):
|
|
raise TypeError("end_datetime must be a datetime object")
|
|
if not isinstance(record_length, int):
|
|
raise TypeError("record_length must be an integer")
|
|
if not isinstance(ex_details, list) or not all(isinstance(i, str) for i in ex_details):
|
|
raise TypeError("ex_details must be a list of strings")
|
|
|
|
# Ensure all required arguments are provided
|
|
if not all([key, start_datetime, record_length, ex_details]):
|
|
raise ValueError("Missing required arguments")
|
|
|
|
def get_from_cache():
|
|
return pd.DataFrame
|
|
|
|
def get_from_database():
|
|
return pd.DataFrame
|
|
|
|
def get_from_server():
|
|
return pd.DataFrame
|
|
|
|
def data_complete(data, **kwargs) -> bool:
|
|
"""Check if a dataframe completely satisfied a request."""
|
|
sd = kwargs.get('start_datetime')
|
|
ed = kwargs.get('start_datetime')
|
|
rl = kwargs.get('record_length')
|
|
|
|
is_complete = True
|
|
return is_complete
|
|
|
|
request_criteria = {
|
|
'start_datetime': start_datetime,
|
|
'end_datetime': end_datetime,
|
|
'record_length': record_length,
|
|
}
|
|
|
|
if target == 'cache':
|
|
result = get_from_cache()
|
|
if data_complete(result, **request_criteria):
|
|
return result
|
|
else:
|
|
self.get_or_fetch_from('database', **kwargs)
|
|
elif target == 'database':
|
|
result = get_from_database()
|
|
if data_complete(result, **request_criteria):
|
|
return result
|
|
else:
|
|
self.get_or_fetch_from('server', **kwargs)
|
|
elif target == 'server':
|
|
result = get_from_server()
|
|
if data_complete(result, **request_criteria):
|
|
return result
|
|
else:
|
|
logger.error('Unable to fetch the requested data.')
|
|
else:
|
|
raise ValueError(f'Not a valid target: {target}')
|
|
|
|
    def get_records_since(self, key: str, start_datetime: dt.datetime, record_length: int,
                          ex_details: List[str]) -> pd.DataFrame:
        """
        Fetches records since the specified start datetime.

        Flow: load from cache (or seed the cache from the DB), then top up
        the cached frame at both ends — older records if the cache doesn't
        reach back to ``start_datetime``, newer records if it isn't up to
        date — and finally return the cached rows at/after the start time.

        :param key: The cache key.
        :param start_datetime: The start datetime to fetch records from.
        :param record_length: The required number of records.
        :param ex_details: Exchange details.
        :return: DataFrame containing the records.
        :raises Exception: Re-raises any error encountered, after logging it.
        """
        try:
            end_datetime = dt.datetime.utcnow()

            # Prefer the in-memory copy; otherwise seed it from the database.
            if self.cache_exists(key=key):
                logger.debug('Getting records from cache.')
                records = self.get_cache(key)
            else:
                logger.debug(
                    f'Records not in cache. Requesting from DB: starting at: {start_datetime} to: {end_datetime}')
                records = self.get_records_since_from_db(table_name=key, st=start_datetime, et=end_datetime,
                                                         rl=record_length, ex_details=ex_details)
                logger.debug(f'Got {len(records.index)} records from DB.')
                self.set_cache(data=records, key=key)

            # query_satisfied reports how far back the data reaches:
            # NaN appears to mean "no usable records at all" — TODO confirm
            # against shared_utilities.
            first_timestamp = query_satisfied(start_datetime=start_datetime, records=records,
                                              r_length_min=record_length)
            if pd.isna(first_timestamp):
                logger.debug('No records found to satisfy the query, continuing to fetch more records.')
                additional_records = self.get_records_since_from_db(table_name=key, st=start_datetime, et=end_datetime,
                                                                    rl=record_length, ex_details=ex_details)
            elif first_timestamp is not None:
                # Cache starts too late: backfill from start_datetime up to
                # the earliest timestamp already on record (seconds epoch
                # assumed — utcfromtimestamp).
                end_time = dt.datetime.utcfromtimestamp(first_timestamp)
                logger.debug(f'Requesting additional records from {start_datetime} to {end_time}')
                additional_records = self.get_records_since_from_db(table_name=key, st=start_datetime, et=end_time,
                                                                    rl=record_length, ex_details=ex_details)
            # NOTE(review): both branches above bind additional_records
            # (pd.isna(None) is True, so a None result takes the first
            # branch); if query_satisfied could return another sentinel this
            # would raise NameError — confirm.
            if not additional_records.empty:
                logger.debug(f'Got {len(additional_records.index)} additional records from DB.')
                self.update_candle_cache(additional_records, key)

            # Forward-fill: a non-None value means the cache's newest record
            # is stale; fetch from that point to now.
            last_timestamp = query_uptodate(records=records, r_length_min=record_length)
            if last_timestamp is not None:
                start_time = dt.datetime.utcfromtimestamp(last_timestamp)
                logger.debug(f'Requesting additional records from {start_time} to {end_datetime}')
                additional_records = self.get_records_since_from_db(table_name=key, st=start_time, et=end_datetime,
                                                                    rl=record_length, ex_details=ex_details)
                logger.debug(f'Got {len(additional_records.index)} additional records from DB.')
                if not additional_records.empty:
                    self.update_candle_cache(additional_records, key)

            # open_time is compared in epoch milliseconds — TODO confirm the
            # stored column unit matches unix_time_millis.
            _timestamp = unix_time_millis(start_datetime)
            logger.debug(f"Start timestamp in human-readable form: {dt.datetime.utcfromtimestamp(_timestamp / 1000.0)}")

            result = self.get_cache(key).query('open_time >= @_timestamp')
            return result

        except Exception as e:
            logger.error(f"An error occurred: {str(e)}")
            raise
|
|
|
|
    def get_records_since_from_db(self, table_name: str, st: dt.datetime, et: dt.datetime, rl: float,
                                  ex_details: List[str]) -> pd.DataFrame:
        """
        Fetches records from the database since the specified start datetime.

        If the table doesn't exist yet it is created and filled from the
        exchange; otherwise existing rows are read and then topped up from
        the exchange at either end (older history, newer candles) as needed.

        :param table_name: The name of the table in the database.
        :param st: The start datetime to fetch records from.
        :param et: The end datetime to fetch records until.
        :param rl: The required number of records.
        :param ex_details: Exchange details.
        :return: DataFrame containing the records.
        """

        def add_data(data: pd.DataFrame, tn: str, start_t: dt.datetime, end_t: dt.datetime) -> pd.DataFrame:
            # Download the missing range from the exchange (also persisting
            # it to the DB via _populate_db) and merge it into `data`.
            new_records = self._populate_db(table_name=tn, start_time=start_t, end_time=end_t, ex_details=ex_details)
            logger.debug(f'Got {len(new_records.index)} records from exchange_name')
            if not new_records.empty:
                data = pd.concat([data, new_records], axis=0, ignore_index=True)
                # Existing rows win on duplicate open_time (keep='first').
                # NOTE(review): result is not re-sorted here — confirm callers
                # don't rely on open_time ordering at this point.
                data = data.drop_duplicates(subset="open_time", keep='first')
            return data

        if self.db.table_exists(table_name=table_name):
            logger.debug('Table existed retrieving records from DB')
            logger.debug(f'Requesting from {st} to {et}')
            records = self.db.get_timestamped_records(table_name=table_name, timestamp_field='open_time', st=st, et=et)
            logger.debug(f'Got {len(records.index)} records from db')
        else:
            # ex_details[2] is the exchange name (see _populate_db unpacking).
            logger.debug(f"Table didn't exist fetching from {ex_details[2]}")
            # Expected record count: span in minutes divided by rl.
            # NOTE(review): elsewhere the span is divided by the timeframe in
            # minutes, so this estimate assumes rl is the timeframe — confirm.
            temp = (((unix_time_millis(et) - unix_time_millis(st)) / 1000) / 60) / rl
            logger.debug(f'Requesting from {st} to {et}, Should be {temp} records')
            records = self._populate_db(table_name=table_name, start_time=st, end_time=et, ex_details=ex_details)
            logger.debug(f'Got {len(records.index)} records from {ex_details[2]}')

        # Backfill when the data doesn't reach back to `st`. NaN from
        # query_satisfied appears to mean "nothing usable" — TODO confirm.
        first_timestamp = query_satisfied(start_datetime=st, records=records, r_length_min=rl)
        if pd.isna(first_timestamp):
            logger.debug('No records found to satisfy the query, continuing to fetch more records.')
            records = add_data(data=records, tn=table_name, start_t=st, end_t=et)
        elif first_timestamp:
            logger.debug(f'Records did not go far enough back. Requesting from {ex_details[2]}')
            logger.debug(f'First ts on record is: {first_timestamp}')
            # first_timestamp is treated as a seconds-based epoch here.
            end_time = dt.datetime.utcfromtimestamp(first_timestamp)
            logger.debug(f'Requesting from {st} to {end_time}')
            records = add_data(data=records, tn=table_name, start_t=st, end_t=end_time)

        # Forward-fill when the newest record is stale.
        last_timestamp = query_uptodate(records=records, r_length_min=rl)
        if last_timestamp:
            logger.debug(f'Records were not updated. Requesting from {ex_details[2]}.')
            logger.debug(f'The last record on file is: {last_timestamp}')
            start_time = dt.datetime.utcfromtimestamp(last_timestamp)
            logger.debug(f'Requesting from {start_time} to {et}')
            records = add_data(data=records, tn=table_name, start_t=start_time, end_t=et)

        return records
|
|
|
|
def _populate_db(self, table_name: str, start_time: dt.datetime, ex_details: List[str],
|
|
end_time: dt.datetime = None) -> pd.DataFrame:
|
|
"""
|
|
Populates a database table with records from the exchange.
|
|
|
|
:param table_name: Name of the table in the database.
|
|
:param start_time: Start time to fetch the records from.
|
|
:param end_time: End time to fetch the records until (optional).
|
|
:param ex_details: Exchange details [symbol, interval, exchange_name, user_name].
|
|
:return: DataFrame of the data downloaded.
|
|
"""
|
|
if end_time is None:
|
|
end_time = dt.datetime.utcnow()
|
|
sym, inter, ex, un = ex_details
|
|
records = self._fetch_candles_from_exchange(symbol=sym, interval=inter, exchange_name=ex, user_name=un,
|
|
start_datetime=start_time, end_datetime=end_time)
|
|
if not records.empty:
|
|
self.db.insert_candles_into_db(records, table_name=table_name, symbol=sym, exchange_name=ex)
|
|
else:
|
|
logger.debug(f'No records inserted {records}')
|
|
return records
|
|
|
|
    def _fetch_candles_from_exchange(self, symbol: str, interval: str, exchange_name: str, user_name: str,
                                     start_datetime: dt.datetime = None,
                                     end_datetime: dt.datetime = None) -> pd.DataFrame:
        """
        Fetches and returns all candles from the specified market, timeframe, and exchange.

        :param symbol: Symbol of the market.
        :param interval: Timeframe in the format '<int><alpha>' (e.g., '15m', '4h').
        :param exchange_name: Name of the exchange.
        :param user_name: Name of the user.
        :param start_datetime: Start datetime for fetching data (optional, defaults to 2017-01-01).
        :param end_datetime: End datetime for fetching data (optional, defaults to now UTC).
        :return: DataFrame of candle data.
        :raises ValueError: If start is after end, or if open_time values are
                            not in epoch milliseconds.
        """

        def fill_data_holes(records: pd.DataFrame, interval: str) -> pd.DataFrame:
            """
            Fills gaps in the data by replicating the last known data point for the missing periods.

            NOTE(review): the implementation copies the row *after* the gap
            (``row.copy()``), not the last known one as this docstring says —
            confirm which behavior is intended.

            :param records: DataFrame containing the original records.
            :param interval: Interval of the data (e.g., '1m', '5m').
            :return: DataFrame with gaps filled.
            """
            time_span = timeframe_to_minutes(interval)
            last_timestamp = None
            filled_records = []

            logger.info(f"Starting to fill data holes for interval: {interval}")

            for index, row in records.iterrows():
                # open_time is treated as an epoch-milliseconds value below.
                time_stamp = row['open_time']

                # First row: nothing to compare against yet.
                if last_timestamp is None:
                    last_timestamp = time_stamp
                    filled_records.append(row)
                    logger.debug(f"First timestamp: {time_stamp}")
                    continue

                delta_ms = time_stamp - last_timestamp
                delta_minutes = (delta_ms / 1000) / 60

                logger.debug(f"Timestamp: {time_stamp}, Delta minutes: {delta_minutes}")

                # A gap wider than one interval means candles are missing.
                if delta_minutes > time_span:
                    num_missing_rec = int(delta_minutes / time_span)
                    step = int(delta_ms / num_missing_rec)
                    logger.debug(f"Gap detected. Filling {num_missing_rec} records with step: {step}")

                    # Synthesize evenly spaced rows between the two real ones.
                    for ts in range(int(last_timestamp) + step, int(time_stamp), step):
                        new_row = row.copy()
                        new_row['open_time'] = ts
                        filled_records.append(new_row)
                        logger.debug(f"Filled timestamp: {ts}")

                filled_records.append(row)
                last_timestamp = time_stamp

            logger.info("Data holes filled successfully.")
            return pd.DataFrame(filled_records)

        if start_datetime is None:
            start_datetime = dt.datetime(year=2017, month=1, day=1)

        if end_datetime is None:
            end_datetime = dt.datetime.utcnow()

        if start_datetime > end_datetime:
            raise ValueError("Invalid start and end parameters: start_datetime must be before end_datetime.")

        exchange = self.exchanges.get_exchange(ename=exchange_name, uname=user_name)

        # Expected count = span in minutes / timeframe in minutes.
        expected_records = (((unix_time_millis(end_datetime) - unix_time_millis(
            start_datetime)) / 1000) / 60) / timeframe_to_minutes(interval)
        logger.info(
            f'Fetching historical data from {start_datetime} to {end_datetime}. Expected records: {expected_records}')

        # NOTE(review): equal start/end is mapped to an open-ended request —
        # presumably the exchange API treats end=None as "until now"; confirm.
        if start_datetime == end_datetime:
            end_datetime = None

        candles = exchange.get_historical_klines(symbol=symbol, interval=interval, start_dt=start_datetime,
                                                 end_dt=end_datetime)
        num_rec_records = len(candles.index)

        logger.info(f'{num_rec_records} candles retrieved from the exchange.')

        open_times = candles.open_time
        min_open_time = open_times.min()
        max_open_time = open_times.max()

        # Sanity check: values below 1e10 would be epoch seconds, not ms.
        if min_open_time < 1e10:
            raise ValueError('Records are not in milliseconds')

        # Convert ms -> s for the span estimate below.
        max_open_time /= 1000
        min_open_time /= 1000

        estimated_num_records = ((max_open_time - min_open_time) / 60) / timeframe_to_minutes(interval)

        logger.info(f'Estimated number of records: {estimated_num_records}')

        # Fewer rows than the timestamp span implies means there are holes.
        if num_rec_records < estimated_num_records:
            logger.info('Detected gaps in the data, attempting to fill missing records.')
            candles = fill_data_holes(candles, interval)

        return candles
|