InĀ [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns # so we can beautify the plots
import statsmodels.api as sm 
from  statsmodels.tsa.arima.model import ARIMA
from sklearn.metrics import mean_squared_error


data = pd.read_csv('../FoodPrices/FoodImports/Prices-PivotTable7.csv')
data = data.reset_index()
data.drop(columns=['level_0'], inplace=True)
data.columns =  ['Food'] + list(data.columns[1:])
data.index = data['Food']
data.drop(columns=['Food', '1999'], inplace=True)
data.drop('Commodity', axis=0, inplace=True)
data = data.iloc[:, [0, 1,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2]]
time_series = data.loc[:,'2000':'2023' ].copy()
# time_series.drop('Live meat animals', axis=0, inplace=True)

# time_series=time_series.astype(int)
for i in list(time_series.columns):
    time_series[i] = time_series[i].apply(lambda x: int(str(x).replace(',', '')) )    
types = { str(i).strip(','):int for i in set(time_series.columns) }

percentages = time_series[:13].copy()
prices = time_series[13:-1].copy()

time_series = time_series.astype(types) 
time_series
Out[1]:
2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 ... 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023
Food
Beverages -2 -1 6 -1 3 7 4 4 3 -4 ... 1 -1 -2 1 3 0 -5 4 2 -2
Cocoa and chocolate -15 10 24 21 -9 -1 1 11 26 0 ... 14 0 2 -7 0 4 2 1 9 15
Coffee, tea, and spices -9 -25 0 12 7 21 6 13 15 -5 ... 7 1 -6 7 -5 -6 4 13 28 -2
Dairy 0 -1 -1 8 14 -2 3 9 26 -10 ... 3 -14 -9 2 11 0 1 15 5 8
Fish and shellfish 9 -5 -5 0 0 3 5 4 5 -7 ... 9 -9 2 8 1 0 -5 17 5 -8
Fruits -1 3 2 6 5 12 7 10 9 0 ... 6 2 4 4 3 5 -1 11 11 2
Grains 9 -3 19 23 4 5 -1 6 18 -1 ... 12 4 2 -1 8 11 4 14 16 -2
Live meat animals 11 5 -9 -33 -17 46 18 5 -6 1 ... 32 -13 -21 -4 4 11 -8 -7 13 13
Meats 7 7 -3 6 17 3 1 2 9 -11 ... 11 -1 -8 2 6 6 2 15 1 -1
Nuts -11 -18 -1 19 29 7 -2 -1 19 -10 ... 6 9 2 8 4 -6 -9 -1 0 -9
Sugar and candy 2 7 10 4 -2 -9 -3 12 -7 10 ... -1 -1 2 7 -2 -3 -4 20 12 7
Vegetable oils -14 -13 15 33 11 1 -4 14 32 -23 ... -8 7 0 7 -1 -8 5 33 25 -11
Vegetables 5 2 -3 8 8 3 5 1 5 -3 ... -4 2 2 -2 1 3 7 -1 6 7
Beverages 1310 1296 1380 1362 1402 1499 1565 1635 1684 1612 ... 1857 1830 1801 1824 1887 1889 1794 1869 1915 1882
Cocoa and chocolate 1406 1550 1922 2331 2127 2106 2120 2357 2961 2973 ... 3649 3636 3720 3447 3433 3570 3629 3676 3998 4594
Coffee, tea, and spices 1995 1501 1501 1675 1789 2161 2298 2589 2966 2831 ... 3929 3950 3717 3983 3780 3553 3702 4188 5378 5295
Dairy 3104 3078 3058 3313 3781 3695 3823 4174 5273 4739 ... 6233 5335 4864 4979 5524 5539 5607 6435 6746 7299
Fish and shellfish 5775 5506 5249 5225 5247 5392 5668 5881 6182 5757 ... 8185 7416 7542 8134 8193 8230 7799 9100 9549 8811
Fruits 608 624 639 676 713 798 857 941 1024 1027 ... 1218 1242 1296 1352 1395 1464 1447 1607 1790 1817
Grains 415 404 479 588 611 641 632 670 789 780 ... 887 921 935 925 1003 1110 1155 1320 1525 1491
Live meat animals 218 228 208 138 115 168 198 208 196 198 ... 416 362 287 274 285 316 292 272 308 350
Meats 2426 2605 2520 2668 3109 3201 3222 3297 3585 3191 ... 5197 5143 4706 4803 5080 5385 5504 6351 6389 6335
Nuts 2589 2122 2106 2503 3221 3436 3374 3351 3980 3591 ... 4994 5455 5583 6057 6312 5946 5389 5347 5348 4873
Sugar and candy 739 789 867 901 886 808 787 884 825 903 ... 1061 1047 1069 1147 1124 1092 1051 1265 1414 1517
Vegetable oils 548 475 547 729 812 823 788 895 1177 911 ... 977 1048 1052 1128 1113 1025 1074 1434 1787 1598
Vegetables 794 811 789 856 920 950 1001 1008 1062 1028 ... 1136 1154 1172 1145 1150 1188 1270 1261 1334 1431
NaN 21918 20957 21319 23070 24804 25775 26375 27979 31856 29476 ... 39830 38521 37715 39231 40309 40325 39704 44263 47612 47313

27 rows Ɨ 24 columns

InĀ [1]:
ax = percentages.T.plot( use_index=True, figsize=(15,7), title='Percentages')
ax.legend(fontsize=7)  # Opciones como 'small', 'medium', 'large', o un valor numƩrico
plt.show()
No description has been provided for this image
InĀ [1]:
corr = percentages.T.corr()
plt.figure(figsize = (8,6))
sns.heatmap(corr,
            xticklabels=corr.columns.values,
            yticklabels=corr.columns.values,
            annot=True, fmt='.2f', linewidths=.30)
plt.title('Correlation of growth (percentages per food)', y =1.05,  size=15)
pos, textvals = plt.yticks()
# plt.yticks(pos,('GLD(USD)','SPX(USD)','BARR(USD)','SLV(USD)'), 
#     rotation=0, fontsize="10", va="center")
No description has been provided for this image
InĀ [1]:
ax = prices.T.plot( use_index=True, figsize=(15,7), title='Percentages')
ax.legend(fontsize=7)  # Opciones como 'small', 'medium', 'large', o un valor numƩrico
plt.show()
No description has been provided for this image
InĀ [1]:
corr = prices.T.corr()
plt.figure(figsize = (8,6))
sns.heatmap(corr,
            xticklabels=corr.columns.values,
            yticklabels=corr.columns.values,
            annot=True, fmt='.2f', linewidths=.30)
plt.title('Correlation of growth (prices per food)', y =1.05,  size=15)
pos, textvals = plt.yticks()
No description has been provided for this image
InĀ [1]:
print(corr['Fish and shellfish'].sort_values(ascending =False), '\n')
Food
Fish and shellfish         1.000000
Meats                      0.968106
Fruits                     0.950271
Grains                     0.905124
Beverages                  0.895815
Vegetables                 0.891814
Coffee, tea, and spices    0.883066
Sugar and candy            0.877104
Nuts                       0.862888
Dairy                      0.859711
Cocoa and chocolate        0.852803
Vegetable oils             0.820204
Live meat animals          0.800145
Name: Fish and shellfish, dtype: float64 
InĀ [1]:
'''
function result = johansen(x,p,k)
% PURPOSE: perform Johansen cointegration tests
% -------------------------------------------------------
% USAGE: result = johansen(x,p,k)
% where:      x = input matrix of time-series in levels, (nobs x m)
%             p = order of time polynomial in the null-hypothesis
%                 p = -1, no deterministic part
%                 p =  0, for constant term
%                 p =  1, for constant plus time-trend
%                 p >  1, for higher order polynomial
%             k = number of lagged difference terms used when
%                 computing the estimator
% -------------------------------------------------------
% RETURNS: a results structure:
%          result.eig  = eigenvalues  (m x 1)
%          result.evec = eigenvectors (m x m), where first
%                        r columns are normalized coint vectors
%          result.lr1  = likelihood ratio trace statistic for r=0 to m-1
%                        (m x 1) vector
%          result.lr2  = maximum eigenvalue statistic for r=0 to m-1
%                        (m x 1) vector
%          result.cvt  = critical values for trace statistic
%                        (m x 3) vector [90% 95% 99%]
%          result.cvm  = critical values for max eigen value statistic
%                        (m x 3) vector [90% 95% 99%]
%          result.ind  = index of co-integrating variables ordered by
%                        size of the eigenvalues from large to small
% -------------------------------------------------------
% NOTE: c_sja(), c_sjt() provide critical values generated using
%       a method of MacKinnon (1994, 1996).
%       critical values are available for n<=12 and -1 <= p <= 1,
%       zeros are returned for other cases.
% -------------------------------------------------------
% SEE ALSO: prt_coint, a function that prints results
% -------------------------------------------------------
% References: Johansen (1988), 'Statistical Analysis of Co-integration
% vectors', Journal of Economic Dynamics and Control, 12, pp. 231-254.
% MacKinnon, Haug, Michelis (1996) 'Numerical distribution
% functions of likelihood ratio tests for cointegration',
% Queen's University Institute for Economic Research Discussion paper.
% (see also: MacKinnon's JBES 1994 article
% -------------------------------------------------------

% written by:
% James P. LeSage, Dept of Economics
% University of Toledo
% 2801 W. Bancroft St,
% Toledo, OH 43606
% jlesage@spatial-econometrics.com

% ****************************************************************
% NOTE: Adina Enache provided some bug fixes and corrections that
%       she notes below in comments. 4/10/2000
% ****************************************************************
'''

import numpy as np
from numpy import zeros, ones, flipud, log
from numpy.linalg import inv, eig, cholesky as chol
from statsmodels.regression.linear_model import OLS


tdiff = np.diff

class Holder(object):
    pass

def rows(x):
    return x.shape[0]

def trimr(x, front, end):
    if end > 0:
        return x[front:-end]
    else:
        return x[front:]

import statsmodels.tsa.tsatools as tsat
mlag = tsat.lagmat

def mlag_(x, maxlag):
    '''return all lags up to maxlag
    '''
    return x[:-lag]

def lag(x, lag):
    return x[:-lag]

def detrend(y, order):
    if order == -1:
        return y
    return OLS(y, np.vander(np.linspace(-1, 1, len(y)), order + 1)).fit().resid

def resid(y, x):
    r = y - np.dot(x, np.dot(np.linalg.pinv(x), y))
    return r




def coint_johansen(x, p, k, print_on_console=True):

    #    % error checking on inputs
    #    if (nargin ~= 3)
    #     error('Wrong # of inputs to johansen')
    #    end
    nobs, m = x.shape

    # why this?  f is detrend transformed series, p is detrend data
    if (p > -1):
        f = 0
    else:
        f = p

    x = detrend(x, p)
    dx = tdiff(x, 1, axis=0)
    # dx    = trimr(dx,1,0)
    z = mlag(dx, k)  # [k-1:]
#    print z.shape
    z = trimr(z, k, 0)
    z = detrend(z, f)
#    print dx.shape
    dx = trimr(dx, k, 0)

    dx = detrend(dx, f)
    # r0t   = dx - z*(z\dx)
    r0t = resid(dx, z)  # diff on lagged diffs
    # lx = trimr(lag(x,k),k,0)
    lx = lag(x, k)
    lx = trimr(lx, 1, 0)
    dx = detrend(lx, f)
#    print 'rkt', dx.shape, z.shape
    # rkt   = dx - z*(z\dx)
    rkt = resid(dx, z)  # level on lagged diffs
    skk = np.dot(rkt.T, rkt) / rows(rkt)
    sk0 = np.dot(rkt.T, r0t) / rows(rkt)
    s00 = np.dot(r0t.T, r0t) / rows(r0t)
    sig = np.dot(sk0, np.dot(inv(s00), (sk0.T)))
    tmp = inv(skk)
    # du, au = eig(np.dot(tmp, sig))
    au, du = eig(np.dot(tmp, sig))  # au is eval, du is evec
    # orig = np.dot(tmp, sig)

    # % Normalize the eigen vectors such that (du'skk*du) = I
    temp = inv(chol(np.dot(du.T, np.dot(skk, du))))
    dt = np.dot(du, temp)


    # JP: the next part can be done much  easier

    # %      NOTE: At this point, the eigenvectors are aligned by column. To
    # %            physically move the column elements using the MATLAB sort,
    # %            take the transpose to put the eigenvectors across the row

    # dt = transpose(dt)

    # % sort eigenvalues and vectors

    # au, auind = np.sort(diag(au))
    auind = np.argsort(au)
    # a = flipud(au)
    aind = flipud(auind)
    a = au[aind]
    # d = dt[aind,:]
    d = dt[:, aind]

    # %NOTE: The eigenvectors have been sorted by row based on auind and moved to array "d".
    # %      Put the eigenvectors back in column format after the sort by taking the
    # %      transpose of "d". Since the eigenvectors have been physically moved, there is
    # %      no need for aind at all. To preserve existing programming, aind is reset back to
    # %      1, 2, 3, ....

    # d  =  transpose(d)
    # test = np.dot(transpose(d), np.dot(skk, d))

    # %EXPLANATION:  The MATLAB sort function sorts from low to high. The flip realigns
    # %auind to go from the largest to the smallest eigenvalue (now aind). The original procedure
    # %physically moved the rows of dt (to d) based on the alignment in aind and then used
    # %aind as a column index to address the eigenvectors from high to low. This is a double
    # %sort. If you wanted to extract the eigenvector corresponding to the largest eigenvalue by,
    # %using aind as a reference, you would get the correct eigenvector, but with sorted
    # %coefficients and, therefore, any follow-on calculation would seem to be in error.
    # %If alternative programming methods are used to evaluate the eigenvalues, e.g. Frame method
    # %followed by a root extraction on the characteristic equation, then the roots can be
    # %quickly sorted. One by one, the corresponding eigenvectors can be generated. The resultant
    # %array can be operated on using the Cholesky transformation, which enables a unit
    # %diagonalization of skk. But nowhere along the way are the coefficients within the
    # %eigenvector array ever changed. The final value of the "beta" array using either method
    # %should be the same.


    # % Compute the trace and max eigenvalue statistics */
    lr1 = zeros(m)
    lr2 = zeros(m)
    cvm = zeros((m, 3))
    cvt = zeros((m, 3))
    iota = ones(m)
    t, junk = rkt.shape
    for i in range(0, m):
        tmp = trimr(log(iota - a), i , 0)
        lr1[i] = -t * np.sum(tmp, 0)  # columnsum ?
        # tmp = np.log(1-a)
        # lr1[i] = -t * np.sum(tmp[i:])
        lr2[i] = -t * log(1 - a[i])
        cvm[i, :] = c_sja(m - i, p)
        cvt[i, :] = c_sjt(m - i, p)
        aind[i] = i
    # end

    result = Holder()
    # % set up results structure
    # estimation results, residuals
    result.rkt = rkt
    result.r0t = r0t
    result.eig = a
    result.evec = d  # transposed compared to matlab ?
    result.lr1 = lr1
    result.lr2 = lr2
    result.cvt = cvt
    result.cvm = cvm
    result.ind = aind
    result.meth = 'johansen'

    if print_on_console == True:
        print ('--------------------------------------------------')
        print ('--> Trace Statistics')
        print ('variable statistic Crit-90% Crit-95%  Crit-99%')
        for i in range(len(result.lr1)):
            print ('r =', i, '\t', round(result.lr1[i], 4), result.cvt[i, 0], result.cvt[i, 1], result.cvt[i, 2])
        print ('--------------------------------------------------')
        print ('--> Eigen Statistics')
        print ('variable statistic Crit-90% Crit-95%  Crit-99%')
        for i in range(len(result.lr2)):
            print ('r =', i, '\t', round(result.lr2[i], 4), result.cvm[i, 0], result.cvm[i, 1], result.cvm[i, 2])
        print ('--------------------------------------------------')
        print ('eigenvectors:\n', result.evec)
        print ('--------------------------------------------------')
        print ('eigenvalues:\n', result.eig)
        print ('--------------------------------------------------')


    return result

def c_sjt(n, p):

# PURPOSE: find critical values for Johansen trace statistic
# ------------------------------------------------------------
# USAGE:  jc = c_sjt(n,p)
# where:    n = dimension of the VAR system
#               NOTE: routine doesn't work for n > 12
#           p = order of time polynomial in the null-hypothesis
#                 p = -1, no deterministic part
#                 p =  0, for constant term
#                 p =  1, for constant plus time-trend
#                 p >  1  returns no critical values
# ------------------------------------------------------------
# RETURNS: a (3x1) vector of percentiles for the trace
#          statistic for [90# 95# 99#]
# ------------------------------------------------------------
# NOTES: for n > 12, the function returns a (3x1) vector of zeros.
#        The values returned by the function were generated using
#        a method described in MacKinnon (1996), using his FORTRAN
#        program johdist.f
# ------------------------------------------------------------
# SEE ALSO: johansen()
# ------------------------------------------------------------
# # References: MacKinnon, Haug, Michelis (1996) 'Numerical distribution
# functions of likelihood ratio tests for cointegration',
# Queen's University Institute for Economic Research Discussion paper.
# -------------------------------------------------------

# written by:
# James P. LeSage, Dept of Economics
# University of Toledo
# 2801 W. Bancroft St,
# Toledo, OH 43606
# jlesage@spatial-econometrics.com
#
# Ported to Python by Javier Garcia
# javier.macro.trader@gmail.com

# these are the values from Johansen's 1995 book
# for comparison to the MacKinnon values
# jcp0 = [ 2.98   4.14   7.02
#        10.35  12.21  16.16
#        21.58  24.08  29.19
#        36.58  39.71  46.00
#        55.54  59.24  66.71
#        78.30  86.36  91.12
#       104.93 109.93 119.58
#       135.16 140.74 151.70
#       169.30 175.47 187.82
#       207.21 214.07 226.95
#       248.77 256.23 270.47
#       293.83 301.95 318.14];




    jcp0 = ((2.9762, 4.1296, 6.9406),
            (10.4741, 12.3212, 16.3640),
            (21.7781, 24.2761, 29.5147),
            (37.0339, 40.1749, 46.5716),
            (56.2839, 60.0627, 67.6367),
            (79.5329, 83.9383, 92.7136),
            (106.7351, 111.7797, 121.7375),
            (137.9954, 143.6691, 154.7977),
            (173.2292, 179.5199, 191.8122),
            (212.4721, 219.4051, 232.8291),
            (255.6732, 263.2603, 277.9962),
            (302.9054, 311.1288, 326.9716))


    jcp1 = ((2.7055, 3.8415, 6.6349),
            (13.4294, 15.4943, 19.9349),
            (27.0669, 29.7961, 35.4628),
            (44.4929, 47.8545, 54.6815),
            (65.8202, 69.8189, 77.8202),
            (91.1090, 95.7542, 104.9637),
            (120.3673, 125.6185, 135.9825),
            (153.6341, 159.5290, 171.0905),
            (190.8714, 197.3772, 210.0366),
            (232.1030, 239.2468, 253.2526),
            (277.3740, 285.1402, 300.2821),
            (326.5354, 334.9795, 351.2150))

    jcp2 = ((2.7055, 3.8415, 6.6349),
            (16.1619, 18.3985, 23.1485),
            (32.0645, 35.0116, 41.0815),
            (51.6492, 55.2459, 62.5202),
            (75.1027, 79.3422, 87.7748),
            (102.4674, 107.3429, 116.9829),
            (133.7852, 139.2780, 150.0778),
            (169.0618, 175.1584, 187.1891),
            (208.3582, 215.1268, 228.2226),
            (251.6293, 259.0267, 273.3838),
            (298.8836, 306.8988, 322.4264),
            (350.1125, 358.7190, 375.3203))



    if (p > 1) or (p < -1):
        jc = (0, 0, 0)
    elif (n > 12) or (n < 1):
        jc = (0, 0, 0)
    elif p == -1:
        jc = jcp0[n - 1]
    elif p == 0:
        jc = jcp1[n - 1]
    elif p == 1:
        jc = jcp2[n - 1]



    return jc

def c_sja(n, p):

# PURPOSE: find critical values for Johansen maximum eigenvalue statistic
# ------------------------------------------------------------
# USAGE:  jc = c_sja(n,p)
# where:    n = dimension of the VAR system
#           p = order of time polynomial in the null-hypothesis
#                 p = -1, no deterministic part
#                 p =  0, for constant term
#                 p =  1, for constant plus time-trend
#                 p >  1  returns no critical values
# ------------------------------------------------------------
# RETURNS: a (3x1) vector of percentiles for the maximum eigenvalue
#          statistic for: [90# 95# 99#]
# ------------------------------------------------------------
# NOTES: for n > 12, the function returns a (3x1) vector of zeros.
#        The values returned by the function were generated using
#        a method described in MacKinnon (1996), using his FORTRAN
#        program johdist.f
# ------------------------------------------------------------
# SEE ALSO: johansen()
# ------------------------------------------------------------
# References: MacKinnon, Haug, Michelis (1996) 'Numerical distribution
# functions of likelihood ratio tests for cointegration',
# Queen's University Institute for Economic Research Discussion paper.
# -------------------------------------------------------

# written by:
# James P. LeSage, Dept of Economics
# University of Toledo
# 2801 W. Bancroft St,
# Toledo, OH 43606
# jlesage@spatial-econometrics.com
# Ported to Python by Javier Garcia
# javier.macro.trader@gmail.com


    jcp0 = ((2.9762, 4.1296, 6.9406),
            (9.4748, 11.2246, 15.0923),
            (15.7175, 17.7961, 22.2519),
            (21.8370, 24.1592, 29.0609),
            (27.9160, 30.4428, 35.7359),
            (33.9271, 36.6301, 42.2333),
            (39.9085, 42.7679, 48.6606),
            (45.8930, 48.8795, 55.0335),
            (51.8528, 54.9629, 61.3449),
            (57.7954, 61.0404, 67.6415),
            (63.7248, 67.0756, 73.8856),
            (69.6513, 73.0946, 80.0937))

    jcp1 = ((2.7055, 3.8415, 6.6349),
            (12.2971, 14.2639, 18.5200),
            (18.8928, 21.1314, 25.8650),
            (25.1236, 27.5858, 32.7172),
            (31.2379, 33.8777, 39.3693),
            (37.2786, 40.0763, 45.8662),
            (43.2947, 46.2299, 52.3069),
            (49.2855, 52.3622, 58.6634),
            (55.2412, 58.4332, 64.9960),
            (61.2041, 64.5040, 71.2525),
            (67.1307, 70.5392, 77.4877),
            (73.0563, 76.5734, 83.7105))

    jcp2 = ((2.7055, 3.8415, 6.6349),
            (15.0006, 17.1481, 21.7465),
            (21.8731, 24.2522, 29.2631),
            (28.2398, 30.8151, 36.1930),
            (34.4202, 37.1646, 42.8612),
            (40.5244, 43.4183, 49.4095),
            (46.5583, 49.5875, 55.8171),
            (52.5858, 55.7302, 62.1741),
            (58.5316, 61.8051, 68.5030),
            (64.5292, 67.9040, 74.7434),
            (70.4630, 73.9355, 81.0678),
            (76.4081, 79.9878, 87.2395))


    if (p > 1) or (p < -1):
        jc = (0, 0, 0)
    elif (n > 12) or (n < 1):
        jc = (0, 0, 0)
    elif p == -1:
        jc = jcp0[n - 1]
    elif p == 0:
        jc = jcp1[n - 1]
    elif p == 1:
        jc = jcp2[n - 1]


    return jc
InĀ [1]:
# print(percentages.T)
fish = percentages.T['Fish and shellfish']
meat = percentages.T['Meats']
# fish = percentages['Fish and Shellfish']

df= pd.DataFrame({'x':fish, 'y':meat})
print(coint_johansen(df,0,2))
--------------------------------------------------
--> Trace Statistics
variable statistic Crit-90% Crit-95%  Crit-99%
r = 0 	 20.6184 13.4294 15.4943 19.9349
r = 1 	 10.0099 2.7055 3.8415 6.6349
--------------------------------------------------
--> Eigen Statistics
variable statistic Crit-90% Crit-95%  Crit-99%
r = 0 	 10.6085 12.2971 14.2639 18.52
r = 1 	 10.0099 2.7055 3.8415 6.6349
--------------------------------------------------
eigenvectors:
 [[ 0.40500768 -0.12827445]
 [-0.04000889  0.32300068]]
--------------------------------------------------
eigenvalues:
 [0.39659618 0.37914764]
--------------------------------------------------
<__main__.Holder object at 0x17c1915b0>
InĀ [1]:
from statsmodels.tsa.api import VAR
import numpy as np

data = np.column_stack((percentages.T['Fish and shellfish'], percentages.T['Meats']))
model = VAR(data)
results = model.select_order(maxlags=2)
print(results.summary())
 VAR Order Selection (* highlights the minimums) 
=================================================
      AIC         BIC         FPE         HQIC   
-------------------------------------------------
0      7.559*      7.658*      1918.*      7.582*
1       7.868       8.166       2623.       7.939
2       7.709       8.205       2265.       7.826
-------------------------------------------------
InĀ [1]:
from statsmodels.tsa.stattools import adfuller

result = adfuller(percentages.T['Fish and shellfish'], autolag='AIC', regression='ct')  # 'ct' incluye tendencia y constante
print('ADF Statistic: %f' % result[0])
print('p-value: %f' % result[1])
print('Critical Values:', result[4])
ADF Statistic: -1.900203
p-value: 0.654564
Critical Values: {'1%': -4.7993511224489795, '5%': -3.7867277551020404, '10%': -3.339917201166181}
InĀ [1]:
result = adfuller(percentages.T['Meats'], autolag='AIC', regression='ct')  # 'ct' incluye tendencia y constante
print('ADF Statistic: %f' % result[0])
print('p-value: %f' % result[1])
print('Critical Values:', result[4])
ADF Statistic: -1.420752
p-value: 0.854717
Critical Values: {'1%': -4.7284062962962965, '5%': -3.7567874814814814, '10%': -3.323498888888889}
InĀ [1]:
prices = prices.T
n_obs=5
X_train, X_test = prices[0:-n_obs], prices[-n_obs:]
print(X_train.shape, X_test.shape)
(19, 13) (5, 13)
InĀ [1]:
X_train_transformed = X_train.diff(5).dropna()
X_train_transformed.head()
Out[1]:
Food Beverages Cocoa and chocolate Coffee, tea, and spices Dairy Fish and shellfish Fruits Grains Live meat animals Meats Nuts Sugar and candy Vegetable oils Vegetables
2005 189.0 700.0 166.0 591.0 -383.0 190.0 226.0 -50.0 775.0 847.0 69.0 275.0 156.0
2006 269.0 570.0 797.0 745.0 162.0 233.0 228.0 -30.0 617.0 1252.0 -2.0 313.0 190.0
2007 255.0 435.0 1088.0 1116.0 632.0 302.0 191.0 0.0 777.0 1245.0 17.0 348.0 219.0
2008 322.0 630.0 1291.0 1960.0 957.0 348.0 201.0 58.0 917.0 1477.0 -76.0 448.0 206.0
2009 210.0 846.0 1042.0 958.0 510.0 314.0 169.0 83.0 82.0 370.0 17.0 99.0 108.0
InĀ [1]:
def augmented_dickey_fuller_statistics(time_series):
    result = adfuller(time_series.values)
    print('ADF Statistic: %f' % result[0])
    print('p-value: %f' % result[1])
    print('Critical Values:')
    for key, value in result[4].items():
        print('\t%s: %.3f' % (key, value))
        
print('Fish and shellfish')
augmented_dickey_fuller_statistics(X_train_transformed['Fish and shellfish'])
print('Meats')
augmented_dickey_fuller_statistics(X_train_transformed['Meats'])
Fish and shellfish
ADF Statistic: -6.360993
p-value: 0.000000
Critical Values:
	1%: -4.665
	5%: -3.367
	10%: -2.803
Meats
ADF Statistic: -6.240211
p-value: 0.000000
Critical Values:
	1%: -4.473
	5%: -3.290
	10%: -2.772
InĀ [1]:
fig, axes = plt.subplots(nrows=13, ncols=1, dpi=120, figsize=(20,20))
for i, ax in enumerate(axes.flatten()):
    d = X_train_transformed[X_train_transformed.columns[i]]
    ax.plot(d, color='red', linewidth=1)
    ax.set_title(X_train_transformed.columns[i])
    ax.xaxis.set_ticks_position('none')
    ax.yaxis.set_ticks_position('none')
    ax.spines['top'].set_alpha(0)
    ax.tick_params(labelsize=8)
plt.show();
No description has been provided for this image
InĀ [1]:
import warnings
warnings.filterwarnings("ignore")

from statsmodels.tsa.stattools import grangercausalitytests
maxlag=2

test = 'ssr-chi2test'

def grangers_causality_matrix(data, variables, test = 'ssr_chi2test', verbose=False):

    dataset = pd.DataFrame(np.zeros((len(variables), len(variables))), columns=variables, index=variables)

    for c in dataset.columns:
        for r in dataset.index:
            test_result = grangercausalitytests(data[[r,c]], maxlag=maxlag, verbose=False)
            p_values = [round(test_result[i+1][0][test][1],4) for i in range(maxlag)]
            if verbose: print(f'Y = {r}, X = {c}, P Values = {p_values}')

            min_p_value = np.min(p_values)
            dataset.loc[r,c] = min_p_value

    dataset.columns = [var + '_x' for var in variables]

    dataset.index = [var + '_y' for var in variables]

    return dataset

grangers_causality_matrix(X_train_transformed, variables = X_train_transformed.columns)  
Out[1]:
Beverages_x Cocoa and chocolate_x Coffee, tea, and spices_x Dairy_x Fish and shellfish_x Fruits_x Grains_x Live meat animals_x Meats_x Nuts_x Sugar and candy_x Vegetable oils_x Vegetables_x
Beverages_y 1.0000 0.5765 0.5883 0.7069 0.0697 0.3895 0.0107 0.0022 0.5504 0.1550 0.4285 0.0517 0.0001
Cocoa and chocolate_y 0.1315 1.0000 0.7261 0.7097 0.1625 0.0051 0.2845 0.6987 0.0062 0.0469 0.0027 0.0773 0.1342
Coffee, tea, and spices_y 0.8125 0.0025 1.0000 0.0002 0.0001 0.1224 0.2658 0.0877 0.0000 0.0000 0.0025 0.3979 0.2401
Dairy_y 0.2110 0.4110 0.2958 1.0000 0.0001 0.0566 0.0124 0.0370 0.0000 0.0035 0.0105 0.0008 0.2261
Fish and shellfish_y 0.2071 0.2314 0.0003 0.1118 1.0000 0.0000 0.0000 0.3280 0.5601 0.0100 0.0383 0.0003 0.9227
Fruits_y 0.5736 0.1748 0.0037 0.0326 0.0000 1.0000 0.0766 0.0348 0.0094 0.0013 0.0236 0.1179 0.4397
Grains_y 0.8397 0.0910 0.0997 0.0087 0.0003 0.1969 1.0000 0.0746 0.0000 0.0000 0.0000 0.0244 0.8059
Live meat animals_y 0.4517 0.0000 0.0027 0.0078 0.1395 0.0826 0.0010 1.0000 0.2697 0.2174 0.0315 0.0044 0.0232
Meats_y 0.5374 0.1572 0.1424 0.1466 0.5105 0.0039 0.0064 0.4408 1.0000 0.0154 0.0000 0.0000 0.6843
Nuts_y 0.1332 0.5392 0.3208 0.4353 0.3407 0.2447 0.1729 0.5501 0.4589 1.0000 0.4526 0.2568 0.0042
Sugar and candy_y 0.5833 0.0062 0.1466 0.3671 0.3485 0.0008 0.3706 0.7546 0.0001 0.1145 1.0000 0.1378 0.1477
Vegetable oils_y 0.7107 0.1224 0.0863 0.0080 0.0002 0.2491 0.0149 0.0932 0.0000 0.0000 0.0006 1.0000 0.1579
Vegetables_y 0.1456 0.1154 0.8462 0.6777 0.0493 0.4786 0.1630 0.2082 0.2810 0.0670 0.0695 0.4688 1.0000
InĀ [1]:
def RMSEfromResid(X):
    summ = 0
    for i in X:
        summ+=i**2
    return((summ/len(X))**0.5)

#ARIMA Models selection
date = prices.index
X = prices['Fish and shellfish']
size = int(len(X)*0.8)
train, test = X[0:size], X[size:len(X)]
date_test = date[size:]
def evaluate_arima_model(X, model_order):
    model_arima = ARIMA(X, order=model_order).fit()
    AIC = model_arima.aic
    BIC = model_arima.bic
    LLF = model_arima.llf
    RMSE = RMSEfromResid(model_arima.resid)
    return([AIC, BIC, LLF, RMSE])

import warnings
warnings.filterwarnings("ignore")
# evaluate combinations of p, d and q values for an ARIMA model
p_values = [0,1,2,3]
d_values = [1]
q_values = [0,1,2]
data = list()
for p in p_values:
    for d in d_values:
        for q in q_values:
            order = (p,d,q)
            try:
                [AIC, BIC, LLF, RMSE] = evaluate_arima_model(train, order)
                data.append([order,AIC, BIC, LLF, RMSE])
            except:
                continue
ARIMA_Models = pd.DataFrame(data,columns=['ARIMA', 'AIC', 'BIC', 'Maximum Log-Likelihood', 'RMSE'], )

evaluate_arima_model(X, order)
ARIMA_Models.sort_values(by=['RMSE'])
Out[1]:
ARIMA AIC BIC Maximum Log-Likelihood RMSE
8 (2, 1, 2) 270.587037 275.038896 -130.293518 1363.043833
11 (3, 1, 2) 274.065900 279.408130 -131.032950 1366.886123
10 (3, 1, 1) 272.869982 277.321841 -131.434991 1369.164377
9 (3, 1, 0) 271.187671 274.749158 -131.593836 1370.052106
7 (2, 1, 1) 272.952058 276.513545 -132.476029 1374.767361
5 (1, 1, 2) 274.092719 277.654206 -133.046359 1378.400631
2 (0, 1, 2) 272.773406 275.444522 -133.386703 1380.469946
6 (2, 1, 0) 273.275097 275.946213 -133.637549 1382.221260
4 (1, 1, 1) 274.201868 276.872984 -134.100934 1385.405436
1 (0, 1, 1) 272.242552 274.023296 -134.121276 1385.541755
3 (1, 1, 0) 272.267045 274.047789 -134.133523 1385.628640
0 (0, 1, 0) 270.312584 271.202955 -134.156292 1385.783666
InĀ [1]:
#ARIMA Prediction
history = [x for x in train]
predictions = list()
data=list()
#len_test = len(test)
len_test= len(test)
for t in range(len_test):
    model_arima = ARIMA(endog = history, order=(2, 1, 1)).fit()
    output = model_arima.forecast()
    yhat = output[0]
    predictions.append(yhat)
    obs = test[t]
    history.append(obs)
    data.append([date_test[t], obs, yhat])
    
RMSE = (mean_squared_error(test, predictions))**0.5
arima_results = pd.DataFrame(data,columns=['Period','Actual Price', 'Predicted Price'],dtype=float)
print('Test RMSE: %.3f' % RMSE)
Test RMSE: 678.449
InĀ [1]:
def mape(y_true, y_pred):
    return np.mean(np.abs((y_pred - y_true) / y_true)) * 100

print('The Mean Absolute Percentage Error is: %.3f' % mape(np.array(test), predictions),'%.')
The Mean Absolute Percentage Error is: 5.814 %.
InĀ [1]:
# plot
plt.rcParams['figure.figsize'] = (20,10)
plt.plot(date_test, test, color='Blue', label='ACTUAL', marker='x')
plt.plot(date_test, predictions, color='green', label='PREDICTED', marker='x')
plt.legend(loc='upper right')
plt.show()
arimax_pred = predictions
arimax_RMSE = RMSE
No description has been provided for this image
InĀ [1]:
# ARIMAX Train-Test-Split:
prices['diffGLD'] = prices['Grains'].diff(2)
prices['diffSPX'] = prices['Fruits'].diff(2)
date = prices.index
prices['SPX_lag']=prices['diffSPX'].shift()
prices.dropna(inplace=True)
GLD_end = prices['Fish and shellfish']
SPX_ex = prices['SPX_lag']
m = len(GLD_end)
size = int(len(GLD_end)*0.8)
train, test = GLD_end[0:size], GLD_end[size:m]
ex_train, ex_test = SPX_ex[0:size], SPX_ex[size:m]
date_test = date[size:]

def evaluate_arimax_model(y, X, model_order):
    model_arimax = ARIMA(endog = y, exog=X, order=model_order).fit()
    AIC = model_arimax.aic
    BIC = model_arimax.bic
    LLF = model_arimax.llf
    RMSE = RMSEfromResid(model_arimax.resid)
    return([AIC, BIC, LLF, RMSE])

warnings.filterwarnings("ignore")
p_values = [0,1,2,3]
d_values = [1]
q_values = [0,1,2]
data = list()
for p in p_values:
    for d in d_values:
        for q in q_values:
            order = (p,d,q)
            try:
                [AIC, BIC, LLF, RMSE] = evaluate_arimax_model(train, ex_train, order)
                data.append([order,AIC, BIC, LLF, RMSE])
            except:
                continue

ARIMAX_Models = pd.DataFrame(data,columns=['ARIMAX', 'AIC', 'BIC', 'Maximum Log-Likelihood', 'RMSE'],)

evaluate_arimax_model(train, ex_train, order)
ARIMAX_Models.sort_values(by=['RMSE'])
Out[1]:
ARIMAX AIC BIC Maximum Log-Likelihood RMSE
11 (3, 1, 2) 218.301251 223.257602 -102.150625 1372.818837
8 (2, 1, 2) 220.415719 224.664020 -104.207860 1380.516992
10 (3, 1, 1) 218.167171 222.415472 -103.083586 1380.824335
5 (1, 1, 2) 219.419039 222.959290 -104.709519 1385.875074
3 (1, 1, 0) 218.128485 220.252635 -106.064242 1386.475153
1 (0, 1, 1) 217.674498 219.798649 -105.837249 1386.804141
4 (1, 1, 1) 219.149332 221.981533 -105.574666 1387.183808
7 (2, 1, 1) 221.148566 224.688817 -105.574283 1387.250861
6 (2, 1, 0) 219.454878 222.287079 -105.727439 1387.466107
9 (3, 1, 0) 220.539377 224.079628 -105.269689 1387.524419
2 (0, 1, 2) 219.153126 221.985327 -105.576563 1387.652048
0 (0, 1, 0) 221.542287 222.958387 -108.771144 1389.613361
InĀ [1]:
# For predicting from grid search cv
history = [x for x in train]
his_u = ex_train
predictions = list()
data=list()
test_index = list()
for t in range(len(ex_test)):
    model_arimax = ARIMA(endog = history,exog=his_u, order=(2,1,1)).fit()
    output = model_arimax.forecast(steps=1, exog=ex_test.iloc[[t]])
    yhat = output[0]
    predictions.append(yhat)
    history.append(test[t])
    test_index.append(t)
    his_u = list(ex_train.values).append(ex_test.iloc[test_index])
    data.append([date_test[t], test[t], yhat])


RMSE = (mean_squared_error(test, predictions))**0.5
arima_results = pd.DataFrame(data,columns=['Period','Actual Price', 'Predicted Price'])
print('Test RMSE: %.3f' % RMSE)
print('The Mean Absolute Percentage Error is: %.3f' % mape(np.array(test), predictions),'%.')
Test RMSE: 600.740
The Mean Absolute Percentage Error is: 4.687 %.
InĀ [1]:
# plot our calculations above to compare prediction to actual trend
plt.rcParams['figure.figsize'] = (20,10)
plt.plot(date_test[3:], test, color='Blue', label='ACTUAL', marker='x')
plt.plot(date_test[3:], predictions, color='green', label='PREDICTED', marker='x')
plt.legend(loc='upper right')
plt.show()
arimax_pred = predictions
arimax_RMSE = RMSE
No description has been provided for this image
InĀ [1]: