Time to read this post: 10 mins

Background

There is not much information on the internet in how Yahoo daily closing prices are calculated and how they compare to the intraday data. At the NYSE daily closing price is determine by an closing auction. In this article we make an attempt to compare real data. The goal ist to compare:

1) The day end prices of historical intraday data such as 1m, 5m and 30m data to the daily closing prices 2) Compare live intraday data with the historical intraday data

Imports and Settings

import warnings
warnings.filterwarnings('ignore')

import os
import pandas as pd
import numpy as np

import time

import plotly.graph_objects as go
from plotly.subplots import make_subplots

import datetime
from datetime import timedelta

import yfinance as yf

pd.set_option('display.width', 1000)
pd.set_option('display.float_format', lambda x: '%.3f' % x)

%load_ext autoreload
%autoreload 2
%cd ..

data_from_date  = datetime.datetime.now() - pd.offsets.Day(29) # Yahoo allows to download last 30 days only
data_end_date   = datetime.datetime.now() - pd.offsets.BDay(2)
The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
/home/martin/GitHub

from alpaca_trade_api.rest import REST, TimeFrame

os.environ['APCA_API_BASE_URL']     = 'https://paper-api.alpaca.markets'
os.environ['APCA_API_KEY_ID']       = ''
os.environ['APCA_API_SECRET_KEY']   = ''

alpaca_api = REST()
account = alpaca_api.get_account()

Compare historical Daily Closing Prices with Intraday Prices

Define the tickers you want to compare.


tickers = ['CL', 'CSCO', 'FAST', 'HOLX', 'HSY', 'INTU', 'NKE', 'NTAP', 'TER', 'TSCO', 'TXN', 'WAT', 'YUM']

Define metric to use for comparison.

metric = 'adj_close'

Download Prices

df_dates = yf.download([tickers[0]], interval='1D', start=data_from_date, end=data_end_date, progress=False)
dates = df_dates.index
df_data = pd.DataFrame(index=dates, columns=['d_y', '1m_y', '5m_y', '30m_y', 'vwap_y', '1m_a'], dtype=np.float32)
df_data.index.names = ['date']

df_diffs = pd.DataFrame(columns=['%_1m_y', '%_5m_y', '%_30m_y', '%_vwap_y', '%_1m_a'], dtype=np.float32)

for ticker in tickers:
    
    for date in dates:

        start_date = (date.date() - pd.offsets.BDay(1)).strftime('%Y-%m-%d')
        end_date   = (date.date() + pd.offsets.BDay(1)).strftime('%Y-%m-%d')

        df_y_daily = yf.download([ticker], interval='1D', start=date, end=end_date, progress=False)
        df_y_daily.columns = map(str.lower, df_y_daily.columns)
        df_y_daily.rename(columns={'adj close': 'adj_close'}, inplace=True)
        df_y_daily.index.names = ['date']
        df_y_daily = df_y_daily.filter(regex='^' + date.date().strftime('%Y-%m-%d'), axis=0)
        df_y_daily.sort_index(inplace=True)
        df_y_daily.sort_index(inplace=True, axis=1)
        
        df_y_1m = yf.download([ticker], interval='1m', start=date, end=end_date, progress=False)
        df_y_1m.columns = map(str.lower, df_y_1m.columns)
        df_y_1m.rename(columns={'adj close': 'adj_close'}, inplace=True)
        df_y_1m.index.names = ['date']
        df_y_1m = df_y_1m.filter(regex='^' + date.date().strftime('%Y-%m-%d'), axis=0)
        df_y_1m.sort_index(inplace=True)
        df_y_1m.sort_index(inplace=True, axis=1)

        df_y_5m = yf.download([ticker], interval='5m', start=date, end=end_date, progress=False)
        df_y_5m.columns = map(str.lower, df_y_5m.columns)
        df_y_5m.rename(columns={'adj close': 'adj_close'}, inplace=True)
        df_y_5m.index.names = ['date']
        df_y_5m = df_y_5m.filter(regex='^' + date.date().strftime('%Y-%m-%d'), axis=0)
        df_y_5m.sort_index(inplace=True)
        df_y_5m.sort_index(inplace=True, axis=1)

        df_y_30m = yf.download([ticker], interval='30m', start=date, end=end_date, progress=False)
        df_y_30m.columns = map(str.lower, df_y_30m.columns)
        df_y_30m.rename(columns={'adj close': 'adj_close'}, inplace=True)
        df_y_30m.index.names = ['date']
        df_y_30m = df_y_30m.filter(regex='^' + date.date().strftime('%Y-%m-%d'), axis=0)
        df_y_30m.sort_index(inplace=True)
        df_y_30m.sort_index(inplace=True, axis=1)

        df_a_1m_not_adj = alpaca_api.get_bars(ticker, '1min', start_date, end_date, adjustment='raw').df
        df_a_1m_not_adj.drop(df_a_1m_not_adj.columns.difference(['close']), 1, inplace=True)
        df_a_1m = alpaca_api.get_bars(ticker, '1min', start_date, end_date, adjustment='all').df
        df_a_1m.rename(columns={'close': 'adj_close'}, inplace=True)
        df_a_1m = df_a_1m.merge(df_a_1m_not_adj, how='left', left_index=True, right_index=True).sort_index()
        df_a_1m = df_a_1m.filter(regex='^' + date.date().strftime('%Y-%m-%d'), axis=0)
        df_a_1m.sort_index(inplace=True)
        df_a_1m.sort_index(inplace=True, axis=1)
        calendar = alpaca_api.get_calendar(start=start_date, end=end_date)[0]
        df_a_1m = df_a_1m.tz_convert(tz='America/New_York').between_time(calendar.open, datetime.time(15, 59, 0))

        # compute vwap over last 30 minutes with 1m bars
        q       = df_y_1m.volume[-30:]
        p       = df_y_1m[metric][-30:]
        vwap    = (p * q).cumsum() / q.cumsum()

        df_data.loc[date] = [df_y_daily.iloc[-1].adj_close, df_y_1m.iloc[-1][metric], df_y_5m.iloc[-1][metric], df_y_30m.iloc[-1][metric], vwap[-1], df_a_1m.iloc[-1].adj_close]

    # compute percentage change
    df_diff = pd.DataFrame(index=dates, columns=df_diffs.columns, dtype=np.float32)
    nb_of_columns = len(df_diff.columns)
    
    multi_1 = np.broadcast_to(((np.ones(len(dates), dtype=np.float32)*100) / df_data.values[:,0].squeeze()).T, (nb_of_columns, len(dates))).T
    multi_2 = np.broadcast_to(df_data.values[:,0].squeeze(), (nb_of_columns, len(dates))).T - df_data.values[:,1:]

    df_diff.iloc[:] = np.round(multi_1 * multi_2, decimals=3)

    df_diffs = pd.concat([df_diffs, df_diff])

Analyse Result

The table below shows the differences between daily closing price and intraday data in percentage from adjusted closing price.

df_diffs.describe()
%_1m_y %_5m_y %_30m_y %_vwap_y %_1m_a
count 234.000 234.000 234.000 234.000 234.000
mean -0.016 -0.016 -0.016 0.029 -0.004
std 0.063 0.063 0.063 0.185 0.022
min -0.525 -0.525 -0.525 -0.653 -0.080
25% -0.020 -0.020 -0.020 -0.088 -0.015
50% -0.003 -0.003 -0.003 0.020 0.000
75% 0.000 0.000 0.000 0.158 0.004
max 0.088 0.088 0.088 0.470 0.100

Conclusions

We can make the following conclusions:

  • Worst price difference is -0.73% to daily intraday over last 30 trading days
  • Alpaca standard deviation is better with 0.036 compared to Yahoo with 0.13
  • VWAP with 1m bars between 15:30 and 16:00 is worse
  • Overall, price difference appear not to large.

Compare Live and Historical Intraday Data

In a first step, we download live data at around 15:45 NY Time on a specific date in this case 2022-09-21.

# define the date when data was downloaded
test_date = pd.Timestamp.today()
arrays = [[], []]
tuples = list(zip(*arrays))

multi_index = pd.MultiIndex.from_tuples(tuples, names=["date", "ticker"])
df_y_1m_live = pd.DataFrame(index=multi_index, columns=['open', 'high', 'low', 'adj_close', 'close', 'volume'], dtype=np.float32)
start_date  = test_date.date()
end_date    = (test_date.date() + pd.offsets.BDay(1)).strftime('%Y-%m-%d')

for ticker in tickers:

    df_rows = yf.download([ticker], interval='1m', start=start_date, end=end_date, progress=False)
    df_rows.columns = map(str.lower, df_rows.columns)
    df_rows.rename(columns={'adj close': 'adj_close'}, inplace=True)
    df_rows.index.names = ['date']
    df_rows.sort_index(inplace=True)
    df_rows.sort_index(inplace=True, axis=1)

    for time, prices in df_rows.iterrows():

        idx = pd.IndexSlice
        df_y_1m_live.loc[idx[time, ticker], 'open']      = prices.open
        df_y_1m_live.loc[idx[time, ticker], 'high']      = prices.high
        df_y_1m_live.loc[idx[time, ticker], 'low']       = prices.low
        df_y_1m_live.loc[idx[time, ticker], 'adj_close'] = prices.adj_close
        df_y_1m_live.loc[idx[time, ticker], 'close']     = prices.close
        df_y_1m_live.loc[idx[time, ticker], 'volume']    = prices.volume
            

Data is stored to Jupyter Notebook. Afterwards we have to wait for one day to continue to download same day as historical dataset.

%store df_y_1m_live test_date

On 2022-09-22, restore previous data.

# load previously stored data
%store -r
# define how many lines to compare
nb_of_lines = 10
from IPython.core.display import HTML

for ticker in tickers:

    df_live_ticker = df_y_1m_live.filter(regex=ticker, axis=0).reset_index().drop('ticker', axis=1).set_index('date').sort_index(axis=1)

    # download historical data for same date
    start_date = (test_date.date() - pd.offsets.BDay(1)).strftime('%Y-%m-%d')
    end_date   = (test_date.date() + pd.offsets.BDay(1)).strftime('%Y-%m-%d')

    df_y_1m = yf.download([ticker], interval='1m', start=start_date, end=end_date, progress=False)
    df_y_1m.columns = map(str.lower, df_y_1m.columns)
    df_y_1m.rename(columns={'adj close': 'adj_close'}, inplace=True)
    df_y_1m.index.names = ['date']
    df_y_1m = df_y_1m.filter(regex='^' + test_date.date().strftime('%Y-%m-%d'), axis=0)
    df_y_1m.sort_index(inplace=True)
    df_y_1m.sort_index(inplace=True, axis=1)
    
    # combine live and historical data where time of bars match
    merged = df_live_ticker.merge(df_y_1m, how='left', left_index=True, right_index=True).sort_index().dropna()

    print(ticker)

    # compute percentage change
    multi_1 = merged.iloc[:, 0:6].values
    multi_2 = merged.iloc[:, 6:12].values

    diff_merged = pd.DataFrame( 100 / multi_1 * (multi_1-multi_2), 
                                columns=['%_' + elem.replace('_x', '') for elem in merged.columns[0:6]],
                                index=merged.index).dropna()

    diff_merged = diff_merged.loc[~(diff_merged==0).all(axis=1)]
    display(HTML(diff_merged.describe().to_html()))
    
ADBE
%_adj_close %_close %_high %_low %_open %_volume
count 13.000 13.000 13.000 13.000 13.000 13.000
mean -0.007 -0.007 -0.000 0.009 0.012 -inf
std 0.023 0.023 0.001 0.031 0.041 NaN
min -0.085 -0.085 -0.003 0.000 -0.032 -inf
25% 0.000 0.000 0.000 0.000 0.000 -4.334
50% 0.000 0.000 0.000 0.000 0.000 0.024
75% 0.000 0.000 0.000 0.000 0.000 1.969
max 0.000 0.000 0.000 0.112 0.129 12.038
ADSK
%_adj_close %_close %_high %_low %_open %_volume
count 13.000 13.000 13.000 13.000 13.000 13.000
mean -0.006 -0.006 -0.034 0.019 -0.021 -inf
std 0.054 0.054 0.064 0.029 0.090 NaN
min -0.161 -0.161 -0.221 0.000 -0.311 -inf
25% 0.000 0.000 -0.040 0.000 0.000 -3.942
50% 0.000 0.000 0.000 0.000 0.000 -0.273
75% 0.005 0.005 0.000 0.048 0.000 3.098
max 0.065 0.065 0.000 0.065 0.070 24.126
ALGN
%_adj_close %_close %_high %_low %_open %_volume
count 13.000 13.000 13.000 13.000 13.000 13.000
mean -0.000 -0.000 -0.027 0.015 0.002 -inf
std 0.008 0.008 0.062 0.044 0.074 NaN
min -0.013 -0.013 -0.197 0.000 -0.197 -inf
25% 0.000 0.000 -0.013 0.000 0.000 -8.261
50% 0.000 0.000 0.000 0.000 0.000 0.000
75% 0.000 0.000 0.000 0.000 0.000 10.543
max 0.021 0.021 0.000 0.158 0.158 16.262
AMD
%_adj_close %_close %_high %_low %_open %_volume
count 15.000 15.000 15.000 15.000 15.000 15.000
mean -0.001 -0.001 -0.005 0.000 -0.004 -inf
std 0.004 0.004 0.010 0.001 0.007 NaN
min -0.013 -0.013 -0.039 0.000 -0.024 -inf
25% 0.000 0.000 -0.004 0.000 -0.004 -0.055
50% 0.000 0.000 0.000 0.000 0.000 -0.000
75% 0.000 0.000 0.000 0.000 0.000 0.022
max 0.000 0.000 0.000 0.005 0.000 0.328
AZO
%_adj_close %_close %_high %_low %_open %_volume
count 6.000 6.000 6.000 6.000 6.000 6.000
mean 0.026 0.026 -0.003 0.040 0.003 -inf
std 0.063 0.063 0.007 0.062 0.007 NaN
min 0.000 0.000 -0.017 0.000 -0.000 -inf
25% 0.000 0.000 0.000 0.000 0.000 -43.097
50% 0.000 0.000 0.000 0.009 0.000 18.149
75% 0.000 0.000 0.000 0.057 0.000 42.710
max 0.154 0.154 0.000 0.154 0.017 73.614
CL
%_adj_close %_close %_high %_low %_open %_volume
count 15.000 15.000 15.000 15.000 15.000 15.000
mean -0.002 -0.002 -0.012 0.022 0.016 -inf
std 0.012 0.012 0.014 0.030 0.026 NaN
min -0.026 -0.026 -0.039 0.000 -0.026 -inf
25% 0.000 0.000 -0.026 0.000 0.000 -14.383
50% 0.000 0.000 -0.007 0.007 0.000 -1.913
75% 0.000 0.000 0.000 0.033 0.026 6.030
max 0.013 0.013 0.000 0.092 0.072 26.771
CTXS
%_adj_close %_close %_high %_low %_open %_volume
count 14.000 14.000 14.000 14.000 14.000 14.000
mean 0.003 0.003 -0.002 0.002 -0.001 -inf
std 0.006 0.006 0.004 0.004 0.005 NaN
min 0.000 0.000 -0.010 0.000 -0.010 -inf
25% 0.000 0.000 -0.004 0.000 -0.004 -27.647
50% 0.000 0.000 0.000 0.000 0.000 -2.104
75% 0.007 0.007 0.000 0.005 0.000 4.733
max 0.014 0.014 0.000 0.010 0.010 9.932
HD
%_adj_close %_close %_high %_low %_open %_volume
count 15.000 15.000 15.000 15.000 15.000 15.000
mean 0.001 0.001 -0.011 0.018 0.013 -inf
std 0.022 0.022 0.014 0.027 0.056 NaN
min -0.032 -0.032 -0.036 0.000 -0.061 -inf
25% -0.002 -0.002 -0.020 0.000 0.000 -7.097
50% 0.000 0.000 0.000 0.000 0.000 0.530
75% 0.002 0.002 0.000 0.032 0.012 4.019
max 0.062 0.062 0.000 0.077 0.155 25.711
KLAC
%_adj_close %_close %_high %_low %_open %_volume
count 14.000 14.000 14.000 14.000 14.000 14.000
mean -0.019 -0.019 -0.035 0.008 -0.003 -inf
std 0.080 0.080 0.055 0.015 0.052 NaN
min -0.198 -0.198 -0.198 0.000 -0.115 -inf
25% -0.049 -0.049 -0.055 0.000 0.000 -0.805
50% 0.000 0.000 -0.006 0.000 0.000 0.000
75% 0.000 0.000 0.000 0.013 0.000 3.720
max 0.119 0.119 0.000 0.044 0.085 14.324
LRCX
%_adj_close %_close %_high %_low %_open %_volume
count 14.000 14.000 14.000 14.000 14.000 14.000
mean -0.023 -0.023 -0.040 0.004 -0.023 -inf
std 0.046 0.046 0.066 0.010 0.041 NaN
min -0.115 -0.115 -0.187 0.000 -0.128 -inf
25% 0.000 0.000 -0.081 0.000 -0.018 -5.022
50% 0.000 0.000 0.000 0.000 0.000 -0.100
75% 0.000 0.000 0.000 0.000 0.000 0.092
max 0.002 0.002 0.000 0.034 0.000 8.986
MAS
%_adj_close %_close %_high %_low %_open %_volume
count 16.000 16.000 16.000 16.000 16.000 16.000
mean -0.017 -0.017 -0.024 0.022 0.016 -inf
std 0.044 0.044 0.025 0.028 0.040 NaN
min -0.144 -0.144 -0.082 0.000 -0.062 -inf
25% -0.026 -0.026 -0.041 0.000 0.000 -20.361
50% 0.000 0.000 -0.020 0.015 0.000 1.822
75% 0.000 0.000 0.000 0.033 0.021 8.208
max 0.049 0.049 0.000 0.082 0.102 26.394
ORLY
%_adj_close %_close %_high %_low %_open %_volume
count 7.000 7.000 7.000 7.000 7.000 7.000
mean 0.021 0.021 0.000 0.053 0.030 -inf
std 0.047 0.047 0.000 0.054 0.052 NaN
min 0.000 0.000 0.000 0.000 0.000 -inf
25% 0.000 0.000 0.000 0.011 0.000 -6.326
50% 0.000 0.000 0.000 0.024 0.000 0.052
75% 0.011 0.011 0.000 0.102 0.042 12.820
max 0.126 0.126 0.000 0.122 0.122 41.216
ROK
%_adj_close %_close %_high %_low %_open %_volume
count 14.000 14.000 14.000 14.000 14.000 14.000
mean 0.011 0.011 -0.018 0.022 -0.024 -inf
std 0.033 0.033 0.031 0.038 0.052 NaN
min -0.008 -0.008 -0.087 0.000 -0.108 -inf
25% 0.000 0.000 -0.018 0.000 -0.077 -9.114
50% 0.000 0.000 0.000 0.000 0.000 -0.971
75% 0.000 0.000 0.000 0.026 0.000 4.999
max 0.105 0.105 0.000 0.105 0.065 29.213
TER
%_adj_close %_close %_high %_low %_open %_volume
count 12.000 12.000 12.000 12.000 12.000 12.000
mean 0.006 0.006 -0.009 0.026 0.007 -inf
std 0.039 0.039 0.015 0.069 0.079 NaN
min -0.048 -0.048 -0.036 0.000 -0.084 -inf
25% -0.007 -0.007 -0.019 0.000 0.000 -3.719
50% 0.000 0.000 0.000 0.000 0.000 0.000
75% 0.010 0.010 0.000 0.007 0.000 1.190
max 0.095 0.095 0.000 0.239 0.239 10.465
TXN
%_adj_close %_close %_high %_low %_open %_volume
count 15.000 15.000 15.000 15.000 15.000 15.000
mean -0.019 -0.019 -0.017 0.010 0.007 -inf
std 0.041 0.041 0.039 0.013 0.016 NaN
min -0.153 -0.153 -0.153 0.000 -0.006 -inf
25% -0.028 -0.028 -0.019 0.000 0.000 -5.138
50% 0.000 0.000 0.000 0.003 0.000 -0.778
75% 0.000 0.000 0.000 0.016 0.009 1.969
max 0.021 0.021 0.000 0.042 0.054 16.226
VRSN
%_adj_close %_close %_high %_low %_open %_volume
count 16.000 16.000 16.000 16.000 16.000 16.000
mean 0.006 0.006 -0.011 0.045 0.019 -inf
std 0.056 0.056 0.026 0.081 0.058 NaN
min -0.100 -0.100 -0.089 0.000 -0.017 -inf
25% 0.000 0.000 -0.006 0.000 0.000 -7.556
50% 0.000 0.000 0.000 0.000 0.000 0.000
75% 0.019 0.019 0.000 0.044 0.001 2.175
max 0.150 0.150 0.000 0.233 0.219 33.636
WAT
%_adj_close %_close %_high %_low %_open %_volume
count 11.000 11.000 11.000 11.000 11.000 11.000
mean 0.011 0.011 -0.006 0.013 -0.005 -inf
std 0.038 0.038 0.020 0.037 0.021 NaN
min -0.003 -0.003 -0.066 0.000 -0.066 -inf
25% 0.000 0.000 0.000 0.000 0.000 -259.686
50% 0.000 0.000 0.000 0.000 0.000 -4.217
75% 0.000 0.000 0.000 0.000 0.000 18.359
max 0.125 0.125 0.000 0.122 0.016 46.950
YUM
%_adj_close %_close %_high %_low %_open %_volume
count 15.000 15.000 15.000 15.000 15.000 15.000
mean -0.007 -0.007 -0.009 0.010 0.003 -inf
std 0.020 0.020 0.013 0.019 0.019 NaN
min -0.069 -0.069 -0.035 0.000 -0.030 -inf
25% 0.000 0.000 -0.017 0.000 0.000 -3.364
50% 0.000 0.000 0.000 0.000 0.000 -0.299
75% 0.000 0.000 0.000 0.013 0.004 1.714
max 0.004 0.004 0.000 0.052 0.061 9.637

Conclusions

Based on the above data we can conclude:

  • After market closing, historical intraday data is slightly different than during live session
  • 15 out of around 300 one-minute bars have a price difference of less than 0.2%
  • Overall, price differences exist do to adjustments after market closing, but are small