MATH05, Covariance and correlation

Back to the previous page ｜ Statistics
List of posts to read before reading this article

Sample (co)variance
Sample standard deviation(correlation coefficient)
Real case : covariance and correlation
non-linear correlation
Frank Anscombe data

Sample (co)variance

$q_{jk}=\frac{1}{N-1}\sum_{i=1}^{N}\left( x_{ij}-\bar{x}_j \right) \left( x_{ik}-\bar{x}_k \right),$
Variance

import numpy as np

def sample_variance(rv):
    s_var = 0
    for i in range(rv.shape[0]):
        s_var = s_var + (rv[i]-rv.mean())**2
    s_var = s_var/(rv.shape[0]-1)
    return s_var

np.random.seed(2019)
rv = np.random.RandomState(2019)
rv = rv.normal(10,100,(100,1))

sample_variance(rv)

10066.80475325

np.var

np.var(rv, ddof=1)

10066.80475325

Covariance

import numpy as np

def sample_covariance(rv):
    Cov = 0
    for i in range(rv.shape[0]):
        Cov = Cov + (rv[i,0]-rv[:,0].mean())*(rv[i,1]-rv[:,1].mean())
    Cov = Cov/(rv.shape[0]-1)
    return Cov

np.random.seed(2019)
rv = np.random.RandomState(2019)
rv = rv.normal(10,10,(1000,2))

sample_covariance(rv)

-0.06329806730517594

np.cov

np.cov(rv[:,0],rv[:,1])

array([[ 9.99728326e+01, -6.32980673e-02],
       [-6.32980673e-02,  1.00853519e+02]])

Sample standard deviation(correlation coefficient)

$r_{xy}\quad {\overset {\underset {\mathrm {def} }{}}{=}}\quad {\frac {\sum \limits _{i=1}^{n}(x_{i}-{\bar {x}})(y_{i}-{\bar {y}})}{(n-1)s_{x}s_{y}}}={\frac {\sum \limits _{i=1}^{n}(x_{i}-{\bar {x}})(y_{i}-{\bar {y}})}{\sqrt {\sum \limits _{i=1}^{n}(x_{i}-{\bar {x}})^{2}\sum \limits _{i=1}^{n}(y_{i}-{\bar {y}})^{2}}}},$
Standard deviation

import numpy as np

def sample_std(rv):
    s_std = 0
    for i in range(rv.shape[0]):
        s_std = s_std + (rv[i]-rv.mean())**2
    s_std = np.sqrt(s_std/(rv.shape[0]-1))
    return s_std
    
np.random.seed(2019)
rv = np.random.RandomState(2019)
rv = rv.normal(10,100,(100,1))

sample_std(rv)

100.33346776

np.std

np.std(rv, ddof=1)

100.33346776

Correlation coefficient

import numpy as np

def s_xx(rv):
    s_std = 0
    for i in range(rv.shape[0]):
        s_std = s_std + (rv[i,0]-rv[:,0].mean())**2
    s_std = s_std/(rv.shape[0]-1)
    return s_std

def s_yy(rv):
    s_std = 0
    for i in range(rv.shape[0]):
        s_std = s_std + (rv[i,1]-rv[:,1].mean())**2
    s_std = s_std/(rv.shape[0]-1)
    return s_std

def s_xy(rv):
    s_std = 0
    for i in range(rv.shape[0]):
        s_std = s_std + (rv[i,0]-rv[:,0].mean())*(rv[i,1]-rv[:,1].mean())
    s_std = s_std/(rv.shape[0]-1)
    return s_std

def correlation(rv):
    return s_xy(rv)/np.sqrt(s_xx(rv)*s_yy(rv))


np.random.seed(2019)
rv = np.random.RandomState(2019)
rv = rv.normal(10,10,(10000,2))
correlation(rv)

-0.0008369647036742862

np.corrcoef

np.corrcoef(rv[:,0], rv[:,1])

array([[ 1.00000000e+00, -8.36964704e-04],
       [-8.36964704e-04,  1.00000000e+00]])

stats.pearsonr

from scipy import stats

stats.pearsonr(rv[:,0], rv[:,1])

(-0.0008369647036742988, 0.933306070697321)

Real case : covariance and correlation

covariance $\operatorname {Cov} (X,Y)=\operatorname {E} \left((X-\mu )(Y-\nu )\right)\,$

from sklearn.datasets import load_iris
from scipy import stats

X = load_iris().data
x1 = X[:, 0]  # 꽃받침의 길이
x2 = X[:, 1]  # 꽃받침의 폭
x3 = X[:, 2]  # 꽃잎의 길이
x4 = X[:, 3]  # 꽃잎의 폭

stats.pearsonr(x1, x3)[0]

0.8717537758865832

correlation $\rho _{X,Y}=\operatorname {corr} (X,Y)={\operatorname {cov} (X,Y) \over \sigma _{X}\sigma _{Y}}={\operatorname {E} [(X-\mu _{X})(Y-\mu _{Y})] \over \sigma _{X}\sigma _{Y}}$

import numpy as np
import matplotlib.pyplot as plt

np.random.seed(0)
corrs = [1, 0.7, 0.3, 0, -0.3, -0.7, -1]
plt.figure(figsize=(len(corrs), 2))
for i, r in enumerate(corrs):
    x, y = np.random.multivariate_normal([0, 0], [[1, r], [r, 1]], 1000).T
    plt.subplot(1, len(corrs), i + 1)
    plt.plot(x, y, 'ro', ms=1)
    plt.axis('equal')
    plt.xticks([])
    plt.yticks([])
    plt.title(r"$\rho$={}".format(r))

plt.suptitle("scatter plot about correlation", y=1.1)
plt.tight_layout()
plt.show()

download (15)

import numpy as np
import matplotlib.pyplot as plt

np.random.seed(1)
slope = [1, 0.7, 0.3, 0, -0.3, -0.7, -1]
plt.figure(figsize=(len(slope), 2))
for i, s in enumerate(slope):
    plt.subplot(1, len(slope), i + 1)
    x, y = np.random.multivariate_normal([0, 0], [[1, 1], [1, 1]], 100).T
    y2 = s * y
    plt.plot(x, y2, 'ro', ms=1)
    plt.axis('equal')
    plt.xticks([])
    plt.yticks([])
    if s > 0:
        plt.title(r"$\rho$=1")
    if s < 0:
        plt.title(r"$\rho$=-1")

plt.suptitle("correlation and slope are independent", y=1.1)
plt.tight_layout()
plt.show()

download (17)

non-linear correlation

import numpy as np
import matplotlib.pyplot as plt
from scipy import stats

n = 500
np.random.seed(1)

plt.subplot(221)
x1 = np.random.uniform(-1, 1, n)
y1 = 2*x1**2 + np.random.uniform(-0.1, 0.1, n)
plt.scatter(x1, y1)
r1 = stats.pearsonr(x1, y1)[0]
plt.title(r"non-linear correlation 1: r={:3.1f}".format(r1))

plt.subplot(222)
x2 = np.random.uniform(-1, 1, n)
y2 = 4*(x2**2-0.5)**2 + 0.1 * np.random.uniform(-1, 1, n)
plt.scatter(x2, y2)
r2 = stats.pearsonr(x2, y2)[0]
plt.title(r"non-linear correlation 2: r={:3.1f}".format(r2))

plt.subplot(223)
x3 = np.random.uniform(-1, 1, n)
y3 = np.cos(x3 * np.pi) + np.random.uniform(0, 1/8, n)
x3 = np.sin(x3 * np.pi) + np.random.uniform(0, 1/8, n)
plt.scatter(x3, y3)
r3 = stats.pearsonr(x3, y3)[0]
plt.title(r"non-linear correlation 3: r={:3.1f}".format(r3))

plt.subplot(224)
x4 = np.random.uniform(-1, 1, n)
y4 = (x4**2 + np.random.uniform(0, 0.1, n)) * \
    np.array([-1, 1])[np.random.random_integers(0, 1, size=n)]
plt.scatter(x4, y4)
r4 = stats.pearsonr(x4, y4)[0]
plt.title(r"non-linear correlation 4: r={:3.1f}".format(r4))


plt.tight_layout()
plt.show()

download (18)

Frank Anscombe data

import statsmodels.api as sm

data = sm.datasets.get_rdataset("anscombe")
df = data.data
df[["x1", "y1", "x2", "y2", "x3", "y3", "x4", "y4"]]

	x1	y1	x2	y2	x3	y3	x4	y4
10	8.04	10	9.14	10	7.46	8	6.58
8	6.95	8	8.14	8	6.77	8	5.76
13	7.58	13	8.74	13	12.74	8	7.71
9	8.81	9	8.77	9	7.11	8	8.84
11	8.33	11	9.26	11	7.81	8	8.47
14	9.96	14	8.10	14	8.84	8	7.04
6	7.24	6	6.13	6	6.08	8	5.25
4	4.26	4	3.10	4	5.39	19	12.50
12	10.84	12	9.13	12	8.15	8	5.56
7	4.82	7	7.26	7	6.42	8	7.91
5	5.68	5	4.74	5	5.73	8	6.89

import matplotlib.pyplot as plt
import seaborn as sns

plt.subplot(221)
sns.regplot(x="x1", y="y1", data=df)
plt.subplot(222)
sns.regplot(x="x2", y="y2", data=df)
plt.subplot(223)
sns.regplot(x="x3", y="y3", data=df)
plt.subplot(224)
sns.regplot(x="x4", y="y4", data=df)
plt.tight_layout()
plt.subplots_adjust(top=0.9)
plt.suptitle("Frank Anscombe data")
plt.show()

download (19)

List of posts followed by this article

Reference

OUTPUT

6626070
2997924

MATH05, Covariance and correlation

Contents

Sample (co)variance

Sample standard deviation(correlation coefficient)

Real case : covariance and correlation

non-linear correlation

Frank Anscombe data

6626070 2997924

MATH05, Covariance and correlation

Contents

Sample (co)variance

Sample standard deviation(correlation coefficient)

Real case : covariance and correlation

non-linear correlation

Frank Anscombe data

6626070
2997924