MATH05, Covariance and correlation
Contents
- Sample (co)variance
- Sample standard deviation (correlation coefficient)
- Real case: covariance and correlation
- Non-linear correlation
- Frank Anscombe data
Sample (co)variance
Variance
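The unbiased sample variance divides the sum of squared deviations from the sample mean by n-1:

$$ s^2 = \frac{1}{n-1}\sum_{i=1}^{n}(x_i - \bar{x})^2 $$

The loop below computes this definition directly.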
import numpy as np

def sample_variance(rv):
    # unbiased sample variance: sum of squared deviations divided by n-1
    s_var = 0
    for i in range(rv.shape[0]):
        s_var = s_var + (rv[i] - rv.mean())**2
    s_var = s_var/(rv.shape[0] - 1)
    return s_var
np.random.seed(2019)
rv = np.random.RandomState(2019)
rv = rv.normal(10,100,(100,1))
sample_variance(rv)
10066.80475325
np.var
np.var(rv, ddof=1)
10066.80475325
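With ddof=1, np.var divides by n-1 rather than n, so it matches the unbiased estimator implemented above.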
Covariance
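The sample covariance of two paired columns x and y replaces the squared deviation with a cross product of deviations:

$$ s_{xy} = \frac{1}{n-1}\sum_{i=1}^{n}(x_i - \bar{x})(y_i - \bar{y}) $$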
import numpy as np
def sample_covariance(rv):
    Cov = 0
    for i in range(rv.shape[0]):
        Cov = Cov + (rv[i, 0] - rv[:, 0].mean())*(rv[i, 1] - rv[:, 1].mean())
    Cov = Cov/(rv.shape[0] - 1)
    return Cov
np.random.seed(2019)
rv = np.random.RandomState(2019)
rv = rv.normal(10,10,(1000,2))
sample_covariance(rv)
-0.06329806730517594
np.cov
np.cov(rv[:,0],rv[:,1])
array([[ 9.99728326e+01, -6.32980673e-02],
[-6.32980673e-02, 1.00853519e+02]])
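np.cov returns the full covariance matrix: the diagonal entries are the sample variances of the two columns, and the off-diagonal entry is the covariance obtained above.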
Sample standard deviation (correlation coefficient)
Standard deviation
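The sample standard deviation is simply the square root of the sample variance:

$$ s = \sqrt{\frac{1}{n-1}\sum_{i=1}^{n}(x_i - \bar{x})^2} $$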
import numpy as np
def sample_std(rv):
    s_std = 0
    for i in range(rv.shape[0]):
        s_std = s_std + (rv[i] - rv.mean())**2
    s_std = np.sqrt(s_std/(rv.shape[0] - 1))
    return s_std
np.random.seed(2019)
rv = np.random.RandomState(2019)
rv = rv.normal(10,100,(100,1))
sample_std(rv)
100.33346776
np.std
np.std(rv, ddof=1)
100.33346776
Correlation coefficient
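The Pearson correlation coefficient normalizes the covariance by the standard deviations of both variables, so its value always lies between -1 and 1:

$$ r = \frac{s_{xy}}{\sqrt{s_{xx}\,s_{yy}}} $$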
import numpy as np
def s_xx(rv):
    # sample variance of the first column
    s_std = 0
    for i in range(rv.shape[0]):
        s_std = s_std + (rv[i, 0] - rv[:, 0].mean())**2
    s_std = s_std/(rv.shape[0] - 1)
    return s_std

def s_yy(rv):
    # sample variance of the second column
    s_std = 0
    for i in range(rv.shape[0]):
        s_std = s_std + (rv[i, 1] - rv[:, 1].mean())**2
    s_std = s_std/(rv.shape[0] - 1)
    return s_std

def s_xy(rv):
    # sample covariance between the two columns
    s_std = 0
    for i in range(rv.shape[0]):
        s_std = s_std + (rv[i, 0] - rv[:, 0].mean())*(rv[i, 1] - rv[:, 1].mean())
    s_std = s_std/(rv.shape[0] - 1)
    return s_std

def correlation(rv):
    # Pearson correlation coefficient
    return s_xy(rv)/np.sqrt(s_xx(rv)*s_yy(rv))
np.random.seed(2019)
rv = np.random.RandomState(2019)
rv = rv.normal(10,10,(10000,2))
correlation(rv)
-0.0008369647036742862
np.corrcoef
np.corrcoef(rv[:,0], rv[:,1])
array([[ 1.00000000e+00, -8.36964704e-04],
[-8.36964704e-04, 1.00000000e+00]])
stats.pearsonr
from scipy import stats
stats.pearsonr(rv[:,0], rv[:,1])
(-0.0008369647036742988, 0.933306070697321)
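stats.pearsonr returns two values: the correlation coefficient and the two-sided p-value for the null hypothesis of zero correlation. The large p-value here is consistent with the two columns being generated independently.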
Real case: covariance and correlation
Correlation in the iris data
from sklearn.datasets import load_iris
from scipy import stats
X = load_iris().data
x1 = X[:, 0]  # sepal length
x2 = X[:, 1]  # sepal width
x3 = X[:, 2]  # petal length
x4 = X[:, 3]  # petal width
stats.pearsonr(x1, x3)[0]
0.8717537758865832
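For the covariance itself (rather than the correlation), the same two features can be passed to np.cov; a minimal sketch, with the output omitted:

import numpy as np
np.cov(x1, x3)[0, 1]  # sample covariance between sepal length and petal length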
Correlation
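The scatter plots below draw samples from bivariate normal distributions with different correlation coefficients ρ: the closer |ρ| is to 1, the more tightly the points cluster around a straight line, and the sign of ρ sets the direction of that line.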
import numpy as np
import matplotlib.pyplot as plt
np.random.seed(0)
corrs = [1, 0.7, 0.3, 0, -0.3, -0.7, -1]
plt.figure(figsize=(len(corrs), 2))
for i, r in enumerate(corrs):
    x, y = np.random.multivariate_normal([0, 0], [[1, r], [r, 1]], 1000).T
    plt.subplot(1, len(corrs), i + 1)
    plt.plot(x, y, 'ro', ms=1)
    plt.axis('equal')
    plt.xticks([])
    plt.yticks([])
    plt.title(r"$\rho$={}".format(r))
plt.suptitle("scatter plot about correlation", y=1.1)
plt.tight_layout()
plt.show()
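The next figure rescales perfectly correlated data by different slopes. As long as the slope is non-zero, ρ stays at ±1: the correlation coefficient measures how linear the relationship is, not how steep it is.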
import numpy as np
import matplotlib.pyplot as plt
np.random.seed(1)
slope = [1, 0.7, 0.3, 0, -0.3, -0.7, -1]
plt.figure(figsize=(len(slope), 2))
for i, s in enumerate(slope):
    plt.subplot(1, len(slope), i + 1)
    x, y = np.random.multivariate_normal([0, 0], [[1, 1], [1, 1]], 100).T
    y2 = s * y
    plt.plot(x, y2, 'ro', ms=1)
    plt.axis('equal')
    plt.xticks([])
    plt.yticks([])
    if s > 0:
        plt.title(r"$\rho$=1")
    if s < 0:
        plt.title(r"$\rho$=-1")
plt.suptitle("correlation and slope are independent", y=1.1)
plt.tight_layout()
plt.show()
Non-linear correlation
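The Pearson coefficient only measures linear association. In the four examples below the two variables are clearly dependent, yet the sample correlation comes out close to zero because the relationships are non-linear (a parabola, a W-shaped curve, a circle, and two mirrored parabolas).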
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats
n = 500
np.random.seed(1)
plt.subplot(221)
x1 = np.random.uniform(-1, 1, n)
y1 = 2*x1**2 + np.random.uniform(-0.1, 0.1, n)
plt.scatter(x1, y1)
r1 = stats.pearsonr(x1, y1)[0]
plt.title(r"non-linear correlation 1: r={:3.1f}".format(r1))
plt.subplot(222)
x2 = np.random.uniform(-1, 1, n)
y2 = 4*(x2**2-0.5)**2 + 0.1 * np.random.uniform(-1, 1, n)
plt.scatter(x2, y2)
r2 = stats.pearsonr(x2, y2)[0]
plt.title(r"non-linear correlation 2: r={:3.1f}".format(r2))
plt.subplot(223)
x3 = np.random.uniform(-1, 1, n)
y3 = np.cos(x3 * np.pi) + np.random.uniform(0, 1/8, n)
x3 = np.sin(x3 * np.pi) + np.random.uniform(0, 1/8, n)
plt.scatter(x3, y3)
r3 = stats.pearsonr(x3, y3)[0]
plt.title(r"non-linear correlation 3: r={:3.1f}".format(r3))
plt.subplot(224)
x4 = np.random.uniform(-1, 1, n)
y4 = (x4**2 + np.random.uniform(0, 0.1, n)) * \
    np.array([-1, 1])[np.random.randint(0, 2, size=n)]
plt.scatter(x4, y4)
r4 = stats.pearsonr(x4, y4)[0]
plt.title(r"non-linear correlation 4: r={:3.1f}".format(r4))
plt.tight_layout()
plt.show()
Frank Anscombe data
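Anscombe's quartet is four small data sets constructed to have nearly identical summary statistics, including means, variances, correlations, and least-squares regression lines, while looking completely different when plotted. It is the classic argument for always visualizing data rather than relying on numerical summaries alone.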
import statsmodels.api as sm
data = sm.datasets.get_rdataset("anscombe")
df = data.data
df[["x1", "y1", "x2", "y2", "x3", "y3", "x4", "y4"]]
x1 y1 x2 y2 x3 y3 x4 y4
0 10 8.04 10 9.14 10 7.46 8 6.58
1 8 6.95 8 8.14 8 6.77 8 5.76
2 13 7.58 13 8.74 13 12.74 8 7.71
3 9 8.81 9 8.77 9 7.11 8 8.84
4 11 8.33 11 9.26 11 7.81 8 8.47
5 14 9.96 14 8.10 14 8.84 8 7.04
6 6 7.24 6 6.13 6 6.08 8 5.25
7 4 4.26 4 3.10 4 5.39 19 12.50
8 12 10.84 12 9.13 12 8.15 8 5.56
9 7 4.82 7 7.26 7 6.42 8 7.91
10 5 5.68 5 4.74 5 5.73 8 6.89
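A quick check confirms that the four pairs have almost the same Pearson correlation (all roughly 0.82), even though the scatter plots below are clearly different; a minimal sketch:

from scipy import stats
# correlation coefficient of each (x_i, y_i) pair in the quartet
[round(stats.pearsonr(df[f"x{i}"], df[f"y{i}"])[0], 3) for i in range(1, 5)]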
import matplotlib.pyplot as plt
import seaborn as sns
plt.subplot(221)
sns.regplot(x="x1", y="y1", data=df)
plt.subplot(222)
sns.regplot(x="x2", y="y2", data=df)
plt.subplot(223)
sns.regplot(x="x3", y="y3", data=df)
plt.subplot(224)
sns.regplot(x="x4", y="y4", data=df)
plt.tight_layout()
plt.subplots_adjust(top=0.9)
plt.suptitle("Frank Anscombe data")
plt.show()