6626070
2997924

MATH05, Covariance and correlation

Back to the previous pageStatistics
List of posts to read before reading this article


Contents


Sample (co)variance


Variance

import numpy as np

def sample_variance(rv):
    s_var = 0
    for i in range(rv.shape[0]):
        s_var = s_var + (rv[i]-rv.mean())**2
    s_var = s_var/(rv.shape[0]-1)
    return s_var

np.random.seed(2019)
rv = np.random.RandomState(2019)
rv = rv.normal(10,100,(100,1))

sample_variance(rv)
10066.80475325
np.var
np.var(rv, ddof=1)
10066.80475325




Covariance

import numpy as np

def sample_covariance(rv):
    Cov = 0
    for i in range(rv.shape[0]):
        Cov = Cov + (rv[i,0]-rv[:,0].mean())*(rv[i,1]-rv[:,1].mean())
    Cov = Cov/(rv.shape[0]-1)
    return Cov

np.random.seed(2019)
rv = np.random.RandomState(2019)
rv = rv.normal(10,10,(1000,2))

sample_covariance(rv)
-0.06329806730517594
np.cov
np.cov(rv[:,0],rv[:,1])
array([[ 9.99728326e+01, -6.32980673e-02],
       [-6.32980673e-02,  1.00853519e+02]])





Sample standard deviation(correlation coefficient)


Standard deviation

import numpy as np

def sample_std(rv):
    s_std = 0
    for i in range(rv.shape[0]):
        s_std = s_std + (rv[i]-rv.mean())**2
    s_std = np.sqrt(s_std/(rv.shape[0]-1))
    return s_std
    
np.random.seed(2019)
rv = np.random.RandomState(2019)
rv = rv.normal(10,100,(100,1))

sample_std(rv)
100.33346776
np.std
np.std(rv, ddof=1)
100.33346776




Correlation coefficient

import numpy as np

def s_xx(rv):
    s_std = 0
    for i in range(rv.shape[0]):
        s_std = s_std + (rv[i,0]-rv[:,0].mean())**2
    s_std = s_std/(rv.shape[0]-1)
    return s_std

def s_yy(rv):
    s_std = 0
    for i in range(rv.shape[0]):
        s_std = s_std + (rv[i,1]-rv[:,1].mean())**2
    s_std = s_std/(rv.shape[0]-1)
    return s_std

def s_xy(rv):
    s_std = 0
    for i in range(rv.shape[0]):
        s_std = s_std + (rv[i,0]-rv[:,0].mean())*(rv[i,1]-rv[:,1].mean())
    s_std = s_std/(rv.shape[0]-1)
    return s_std

def correlation(rv):
    return s_xy(rv)/np.sqrt(s_xx(rv)*s_yy(rv))


np.random.seed(2019)
rv = np.random.RandomState(2019)
rv = rv.normal(10,10,(10000,2))
correlation(rv)
-0.0008369647036742862
np.corrcoef
np.corrcoef(rv[:,0], rv[:,1])
array([[ 1.00000000e+00, -8.36964704e-04],
       [-8.36964704e-04,  1.00000000e+00]])

stats.pearsonr
from scipy import stats

stats.pearsonr(rv[:,0], rv[:,1])
(-0.0008369647036742988, 0.933306070697321)





Real case : covariance and correlation

covariance

from sklearn.datasets import load_iris
from scipy import stats

X = load_iris().data
x1 = X[:, 0]  # 꽃받침의 길이
x2 = X[:, 1]  # 꽃받침의 폭
x3 = X[:, 2]  # 꽃잎의 길이
x4 = X[:, 3]  # 꽃잎의 폭

stats.pearsonr(x1, x3)[0]
0.8717537758865832




correlation

import numpy as np
import matplotlib.pyplot as plt

np.random.seed(0)
corrs = [1, 0.7, 0.3, 0, -0.3, -0.7, -1]
plt.figure(figsize=(len(corrs), 2))
for i, r in enumerate(corrs):
    x, y = np.random.multivariate_normal([0, 0], [[1, r], [r, 1]], 1000).T
    plt.subplot(1, len(corrs), i + 1)
    plt.plot(x, y, 'ro', ms=1)
    plt.axis('equal')
    plt.xticks([])
    plt.yticks([])
    plt.title(r"$\rho$={}".format(r))

plt.suptitle("scatter plot about correlation", y=1.1)
plt.tight_layout()
plt.show()

download (15)


import numpy as np
import matplotlib.pyplot as plt

np.random.seed(1)
slope = [1, 0.7, 0.3, 0, -0.3, -0.7, -1]
plt.figure(figsize=(len(slope), 2))
for i, s in enumerate(slope):
    plt.subplot(1, len(slope), i + 1)
    x, y = np.random.multivariate_normal([0, 0], [[1, 1], [1, 1]], 100).T
    y2 = s * y
    plt.plot(x, y2, 'ro', ms=1)
    plt.axis('equal')
    plt.xticks([])
    plt.yticks([])
    if s > 0:
        plt.title(r"$\rho$=1")
    if s < 0:
        plt.title(r"$\rho$=-1")

plt.suptitle("correlation and slope are independent", y=1.1)
plt.tight_layout()
plt.show()

download (17)



non-linear correlation

import numpy as np
import matplotlib.pyplot as plt
from scipy import stats

n = 500
np.random.seed(1)

plt.subplot(221)
x1 = np.random.uniform(-1, 1, n)
y1 = 2*x1**2 + np.random.uniform(-0.1, 0.1, n)
plt.scatter(x1, y1)
r1 = stats.pearsonr(x1, y1)[0]
plt.title(r"non-linear correlation 1: r={:3.1f}".format(r1))

plt.subplot(222)
x2 = np.random.uniform(-1, 1, n)
y2 = 4*(x2**2-0.5)**2 + 0.1 * np.random.uniform(-1, 1, n)
plt.scatter(x2, y2)
r2 = stats.pearsonr(x2, y2)[0]
plt.title(r"non-linear correlation 2: r={:3.1f}".format(r2))

plt.subplot(223)
x3 = np.random.uniform(-1, 1, n)
y3 = np.cos(x3 * np.pi) + np.random.uniform(0, 1/8, n)
x3 = np.sin(x3 * np.pi) + np.random.uniform(0, 1/8, n)
plt.scatter(x3, y3)
r3 = stats.pearsonr(x3, y3)[0]
plt.title(r"non-linear correlation 3: r={:3.1f}".format(r3))

plt.subplot(224)
x4 = np.random.uniform(-1, 1, n)
y4 = (x4**2 + np.random.uniform(0, 0.1, n)) * \
    np.array([-1, 1])[np.random.random_integers(0, 1, size=n)]
plt.scatter(x4, y4)
r4 = stats.pearsonr(x4, y4)[0]
plt.title(r"non-linear correlation 4: r={:3.1f}".format(r4))


plt.tight_layout()
plt.show()

download (18)



Frank Anscombe data

import statsmodels.api as sm

data = sm.datasets.get_rdataset("anscombe")
df = data.data
df[["x1", "y1", "x2", "y2", "x3", "y3", "x4", "y4"]]
	x1	y1	x2	y2	x3	y3	x4	y4
0	10	8.04	10	9.14	10	7.46	8	6.58
1	8	6.95	8	8.14	8	6.77	8	5.76
2	13	7.58	13	8.74	13	12.74	8	7.71
3	9	8.81	9	8.77	9	7.11	8	8.84
4	11	8.33	11	9.26	11	7.81	8	8.47
5	14	9.96	14	8.10	14	8.84	8	7.04
6	6	7.24	6	6.13	6	6.08	8	5.25
7	4	4.26	4	3.10	4	5.39	19	12.50
8	12	10.84	12	9.13	12	8.15	8	5.56
9	7	4.82	7	7.26	7	6.42	8	7.91
10	5	5.68	5	4.74	5	5.73	8	6.89


import matplotlib.pyplot as plt
import seaborn as sns

plt.subplot(221)
sns.regplot(x="x1", y="y1", data=df)
plt.subplot(222)
sns.regplot(x="x2", y="y2", data=df)
plt.subplot(223)
sns.regplot(x="x3", y="y3", data=df)
plt.subplot(224)
sns.regplot(x="x4", y="y4", data=df)
plt.tight_layout()
plt.subplots_adjust(top=0.9)
plt.suptitle("Frank Anscombe data")
plt.show()

download (19)



List of posts followed by this article


Reference


OUTPUT