MATH05, Probability
Back to the previous page | Statistics
List of posts to read before reading this article
Contents
- Discrete : Uni-variate random variable
- Continuous: Uni-variate random variable
- Mass : Multi-variate random variable
- Density : Multi-variate random variable
- Independent
Discrete : Uni-variate random variable
Continuous: Uni-variate random variable
Normal distribution
from scipy import stats
X = stats.norm(1,.5)
Descript statistic
X.rvs(10)
array([0.5903325 , 1.29429924, 1.00703009, 1.21073729, 1.51287354,
0.76989052, 0.96913931, 2.03268324, 0.65025789, 0.15307278])
X.mean()
1
X.median()
1.0
X.std()
0.5
X.var()
0.25
[X.moment(n) for n in range(5)]
[1.0, 1.0, 1.25, 1.75, 2.6875]
X.stats()
# stats.norm.stats(loc=1, scale=0.5)
# stats.norm(loc=1, scale=0.5).stats()
(array(1.), array(0.25))
X.pdf([0, 1, 2])
array([0.10798193, 0.79788456, 0.10798193])
X.cdf([0, 1, 2])
array([0.02275013, 0.5 , 0.97724987])
X.interval(0.95)
(0.020018007729972975, 1.979981992270027)
X.interval(0.99)
(-0.2879146517744502, 2.28791465177445)
Visualization
from scipy import stats
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style="whitegrid")
def plot_rv_distribution(X, axes=None):
"""Plot the PDF or PMF, CDF, SF and PPF of a given random variable"""
if axes is None:
fig, axes = plt.subplots(1, 3, figsize=(12, 3))
x_min_999, x_max_999 = X.interval(0.999)
x999 = np.linspace(x_min_999, x_max_999, 1000)
x_min_95, x_max_95 = X.interval(0.95)
x95 = np.linspace(x_min_95, x_max_95, 1000)
if hasattr(X.dist, "pdf"):
axes[0].plot(x999, X.pdf(x999), label="PDF")
axes[0].fill_between(x95, X.pdf(x95), alpha=0.25)
else:
# discrete random variables do not have a pdf method, instead we use pmf:
x999_int = np.unique(x999.astype(int))
axes[0].bar(x999_int, X.pmf(x999_int), label="PMF")
axes[1].plot(x999, X.cdf(x999), label="CDF")
axes[1].plot(x999, X.sf(x999), label="SF")
axes[2].plot(x999, X.ppf(x999), label="PPF")
for ax in axes:
ax.legend()
fig, axes = plt.subplots(3, 3, figsize=(12, 9))
X = stats.norm()
plot_rv_distribution(X, axes=axes[0, :])
axes[0, 0].set_ylabel("Normal dist.")
X = stats.f(2, 50)
plot_rv_distribution(X, axes=axes[1, :])
axes[1, 0].set_ylabel("F dist.")
X = stats.poisson(5)
plot_rv_distribution(X, axes=axes[2, :])
axes[2, 0].set_ylabel("Poisson dist.")
from scipy import stats
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style="whitegrid")
def plot_dist_samples(X, X_samples, title=None, ax=None):
""" Plot the PDF and histogram of samples of a continuous random variable"""
if ax is None:
fig, ax = plt.subplots(1, 1, figsize=(8, 4))
x_lim = X.interval(.99)
x = np.linspace(*x_lim, num=100)
ax.plot(x, X.pdf(x), label="PDF", lw=3)
ax.hist(X_samples, label="samples", normed=1, bins=75)
ax.set_xlim(*x_lim)
ax.legend()
if title:
ax.set_title(title)
return ax
fig, axes = plt.subplots(1, 3, figsize=(12, 3))
N = 2000
# Student's t distribution
X = stats.t(7.0)
plot_dist_samples(X, X.rvs(N), "Student's t dist.", ax=axes[0])
# The chisquared distribution
X = stats.chi2(5.0)
plot_dist_samples(X, X.rvs(N), r"$\chi^2$ dist.", ax=axes[1])
# The exponential distribution
X = stats.expon(0.5)
plot_dist_samples(X, X.rvs(N), "exponential dist.", ax=axes[2])
Chi-square distribution
from scipy import stats
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style="whitegrid")
X = stats.chi2(df=5)
X_samples = X.rvs(500)
df, loc, scale = stats.chi2.fit(X_samples)
Y = stats.chi2(df=df, loc=loc, scale=scale)
fig, axes = plt.subplots(1, 2, figsize=(12, 4))
x_lim = X.interval(.99)
x = np.linspace(*x_lim, num=100)
axes[0].plot(x, X.pdf(x), label="original")
axes[0].plot(x, Y.pdf(x), label="recreated")
axes[0].legend()
axes[1].plot(x, X.pdf(x) - Y.pdf(x), label="error")
axes[1].legend()
Mass : Multi-variate random variable
joint probability mass function
Dataset
import pandas as pd
grades = ["A", "B", "C", "D", "E", "F"]
scores = pd.DataFrame(
[[1, 2, 1, 0, 0, 0],
[0, 2, 3, 1, 0, 0],
[0, 4, 7, 4, 1, 0],
[0, 1, 4, 5, 4, 0],
[0, 0, 1, 3, 2, 0],
[0, 0, 0, 1, 2, 1]],
columns=grades, index=grades)
scores.index.name = "Y"
scores.columns.name = "X"
scores
X A B C D E F
Y
A 1 2 1 0 0 0
B 0 2 3 1 0 0
C 0 4 7 4 1 0
D 0 1 4 5 4 0
E 0 0 1 3 2 0
F 0 0 0 1 2 1
joint probability mass function
pmf = scores / scores.values.sum()
pmf
X A B C D E F
Y
A 0.02 0.04 0.02 0.00 0.00 0.00
B 0.00 0.04 0.06 0.02 0.00 0.00
C 0.00 0.08 0.14 0.08 0.02 0.00
D 0.00 0.02 0.08 0.10 0.08 0.00
E 0.00 0.00 0.02 0.06 0.04 0.00
F 0.00 0.00 0.00 0.02 0.04 0.02
Visualization
import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt
sns.heatmap(pmf, cmap=mpl.cm.bone_r, annot=True,
xticklabels=['A', 'B', 'C', 'D', 'E', 'F'],
yticklabels=['A', 'B', 'C', 'D', 'E', 'F'])
plt.title("joint probability density function p(x,y)")
plt.tight_layout()
plt.show()
marginal probability mass function
Dataset
import pandas as pd
import numpy as np
grades = ["A", "B", "C", "D", "E", "F"]
scores = pd.DataFrame(
[[1, 2, 1, 0, 0, 0],
[0, 2, 3, 1, 0, 0],
[0, 4, 7, 4, 1, 0],
[0, 1, 4, 5, 4, 0],
[0, 0, 1, 3, 2, 0],
[0, 0, 0, 1, 2, 1]],
columns=grades, index=grades)
scores.index.name = "Y"
scores.columns.name = "X"
scores
X A B C D E F
Y
A 1 2 1 0 0 0
B 0 2 3 1 0 0
C 0 4 7 4 1 0
D 0 1 4 5 4 0
E 0 0 1 3 2 0
F 0 0 0 1 2 1
marginal probability mass function
pmf = scores / scores.values.sum()
pmf_marginal_x = pmf.sum(axis=0)
pmf_marginal_y = pmf.sum(axis=1)
pmf_marginal_x
#pmf_marginal_x[np.newaxis, :]
X
A 0.02
B 0.18
C 0.32
D 0.28
E 0.18
F 0.02
dtype: float64
pmf_marginal_y
#pmf_marginal_y[:, np.newaxis]
Y
A 0.08
B 0.12
C 0.32
D 0.28
E 0.12
F 0.08
dtype: float64
conditional probability mass function
Dataset
import pandas as pd
import numpy as np
grades = ["A", "B", "C", "D", "E", "F"]
scores = pd.DataFrame(
[[1, 2, 1, 0, 0, 0],
[0, 2, 3, 1, 0, 0],
[0, 4, 7, 4, 1, 0],
[0, 1, 4, 5, 4, 0],
[0, 0, 1, 3, 2, 0],
[0, 0, 0, 1, 2, 1]],
columns=grades, index=grades)
scores.index.name = "Y"
scores.columns.name = "X"
scores
X A B C D E F
Y
A 1 2 1 0 0 0
B 0 2 3 1 0 0
C 0 4 7 4 1 0
D 0 1 4 5 4 0
E 0 0 1 3 2 0
F 0 0 0 1 2 1
conditional probability mass function
pmf = scores / scores.values.sum()
pmf_marginal_x = pmf.sum(axis=0)
pmf_marginal_y = pmf.sum(axis=1)
def conditional_x(y):
return pmf.iloc[y-1, :]/pmf_marginal_y[y-1]
def conditional_y(x):
return pmf.iloc[:, x-1]/pmf_marginal_x[x-1]
for i in range(1, pmf.shape[0]+1):
print("conditional_x(y=%d)\n"%(i),conditional_x(i), "\n")
OUTPUT
conditional_x(y=1)
X
A 0.25
B 0.50
C 0.25
D 0.00
E 0.00
F 0.00
Name: A, dtype: float64
conditional_x(y=2)
X
A 0.000000
B 0.333333
C 0.500000
D 0.166667
E 0.000000
F 0.000000
Name: B, dtype: float64
conditional_x(y=3)
X
A 0.0000
B 0.2500
C 0.4375
D 0.2500
E 0.0625
F 0.0000
Name: C, dtype: float64
conditional_x(y=4)
X
A 0.000000
B 0.071429
C 0.285714
D 0.357143
E 0.285714
F 0.000000
Name: D, dtype: float64
conditional_x(y=5)
X
A 0.000000
B 0.000000
C 0.166667
D 0.500000
E 0.333333
F 0.000000
Name: E, dtype: float64
conditional_x(y=6)
X
A 0.00
B 0.00
C 0.00
D 0.25
E 0.50
F 0.25
Name: F, dtype: float64
Visualization
given y, cross section of joint probability mass function
import string
import matplotlib.pyplot as plt
pmf = scores / scores.values.sum()
x = np.arange(6)
for i, y in enumerate(string.ascii_uppercase[:6]):
ax = plt.subplot(6, 1, i + 1)
ax.tick_params(labelleft=False)
plt.bar(x, conditional_x(i+1))
plt.ylabel("p(x, y=%s)/p(x)"%y, rotation=0, labelpad=40)
plt.ylim(0, 1)
plt.xticks(range(6), ['A', 'B', 'C', 'D', 'E', 'F'])
plt.suptitle("given y and $p(x)=\sum_{y} p(x,y)$, conditional probability mass function(x)", x=0.55 ,y=1.09)
plt.tight_layout()
plt.show()
for i in range(1, pmf.shape[1]+1):
print("conditional_y(x=%d)\n"%(i),conditional_y(i), "\n")
OUTPUT
conditional_y(x=1)
Y
A 1.0
B 0.0
C 0.0
D 0.0
E 0.0
F 0.0
Name: A, dtype: float64
conditional_y(x=2)
Y
A 0.222222
B 0.222222
C 0.444444
D 0.111111
E 0.000000
F 0.000000
Name: B, dtype: float64
conditional_y(x=3)
Y
A 0.0625
B 0.1875
C 0.4375
D 0.2500
E 0.0625
F 0.0000
Name: C, dtype: float64
conditional_y(x=4)
Y
A 0.000000
B 0.071429
C 0.285714
D 0.357143
E 0.214286
F 0.071429
Name: D, dtype: float64
conditional_y(x=5)
Y
A 0.000000
B 0.000000
C 0.111111
D 0.444444
E 0.222222
F 0.222222
Name: E, dtype: float64
conditional_y(x=6)
Y
A 0.0
B 0.0
C 0.0
D 0.0
E 0.0
F 1.0
Name: F, dtype: float64
Visualization
given x, cross section of joint probability mass function
import string
import matplotlib.pyplot as plt
pmf = scores / scores.values.sum()
x = np.arange(6)
for i, y in enumerate(string.ascii_uppercase[:6]):
ax = plt.subplot(6, 1, i + 1)
ax.tick_params(labelleft=False)
plt.bar(x, conditional_y(i+1))
plt.ylabel("p(x=%s, y)/p(y)"%y, rotation=0, labelpad=40)
plt.ylim(0, 1)
plt.xticks(range(6), ['A', 'B', 'C', 'D', 'E', 'F'])
plt.suptitle("given x and $p(y)=\sum_{x} p(x,y)$, conditional probability mass function(y)", x=0.55 ,y=1.09)
plt.tight_layout()
plt.show()
Density : Multi-variate random variable
joint probability density function
from scipy import stats
import matplotlib.pyplot as plt
# x:weight, y:height
mu = [70, 170]
cov = [[150, 140], [140, 300]]
rv = stats.multivariate_normal(mu, cov)
xx = np.linspace(20, 120, 100)
yy = np.linspace(100, 250, 100)
XX, YY = np.meshgrid(xx, yy)
ZZ = rv.pdf(np.dstack([XX, YY]))
plt.contour(XX, YY, ZZ)
plt.xlabel("x")
plt.ylabel("y")
plt.title("joint probability density function p(x,y)")
plt.show()
marginal probability density function
from matplotlib.ticker import NullFormatter
from matplotlib import transforms
from scipy.integrate import simps # 심슨법칙(Simpson's rule)을 사용한 적분 계산
xx = np.linspace(20, 120, 100)
yy = np.linspace(100, 250, 100)
XX, YY = np.meshgrid(xx, yy)
ZZ = rv.pdf(np.dstack([XX, YY]))
fx = [simps(Z, yy) for Z in ZZ.T]
fy = [simps(Z, xx) for Z in ZZ]
plt.figure(figsize=(6, 6))
left, width = 0.1, 0.65
bottom, height = 0.1, 0.65
bottom_h = left_h = left + width + 0.05
rect1 = [left, bottom, width, height]
rect2 = [left, bottom_h, width, 0.2]
rect3 = [left_h, bottom, 0.2, height]
ax1 = plt.axes(rect1)
ax2 = plt.axes(rect2)
ax3 = plt.axes(rect3)
ax2.xaxis.set_major_formatter(NullFormatter())
ax3.yaxis.set_major_formatter(NullFormatter())
ax1.contour(XX, YY, ZZ)
ax1.set_title("joint probability density function $p_{XY}(x, y)$")
ax1.set_xlabel("x")
ax1.set_ylabel("y")
ax2.plot(xx, fx)
ax2.set_title("marginal probability \n density function $p_X(x)$")
base = ax3.transData
rot = transforms.Affine2D().rotate_deg(-90)
plt.plot(-yy, fy, transform=rot + base)
plt.title("marginal probability \n density function $p_Y(y)$")
ax1.set_xlim(38, 102)
ax1.set_ylim(120, 220)
ax2.set_xlim(38, 102)
ax3.set_xlim(0, 0.025)
ax3.set_ylim(120, 220)
plt.show()
conditional probability density function
Cross section of joint probability density function
from matplotlib.collections import PolyCollection
from matplotlib import colors as mcolors
from mpl_toolkits.mplot3d import Axes3D
import matplotlib.pyplot as plt
import numpy as np
xx = np.linspace(20, 120, 100)
yy = np.linspace(100, 250, 16)
XX, YY = np.meshgrid(xx, yy)
ZZ = rv.pdf(np.dstack([XX, YY]))
fig = plt.figure(dpi=150)
ax = fig.gca(projection='3d')
xs = np.hstack([0, xx, 0])
zs = np.zeros_like(xs)
verts = []
for i, y in enumerate(yy):
zs[1:-1] = ZZ[i]
verts.append(list(zip(xx, zs)))
poly = PolyCollection(verts)
poly.set_alpha(0.5)
ax.add_collection3d(poly, zs=yy, zdir='y')
ax.set_xlabel('x')
ax.set_ylabel('y')
ax.set_xlim(20, 120)
ax.set_ylim(100, 250)
ax.set_zlim3d(0, 0.0007)
ax.view_init(20, -50)
plt.title("cross section of joint probability density function")
plt.show()
from scipy.integrate import simps # 심슨법칙(Simpson's rule)을 사용한 적분 계산
import matplotlib.pyplot as plt
import numpy as np
mag = 10 # 확대 비율
xx = np.linspace(20, 120, 100)
yy = np.linspace(100, 250, 16)
XX, YY = np.meshgrid(xx, yy)
ZZ = rv.pdf(np.dstack([XX, YY]))
plt.figure(figsize=(8, 6))
for i, j in enumerate(range(9, 4, -1)):
ax = plt.subplot(5, 1, i + 1)
ax.tick_params(labelleft=False)
plt.plot(xx, ZZ[j, :] * mag, 'r--', lw=2, label="cross section of joint probability density function")
marginal = simps(ZZ[j, :], xx)
plt.plot(xx, ZZ[j, :] / marginal, 'b-', lw=2, label="conditional probability density function")
plt.ylim(0, 0.05)
ax.xaxis.set_ticklabels([])
plt.ylabel("p(x, y={:.0f})".format(yy[j]), rotation=0, labelpad=40)
if i == 0:
plt.legend(loc=2)
plt.xlabel("x")
plt.tight_layout()
plt.show()
Independent
independent two variable
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
pmf1 = np.array([[1, 2, 4, 2, 1],
[2, 4, 8, 4, 2],
[4, 8, 16, 8, 4],
[2, 4, 8, 4, 2],
[1, 2, 4, 2, 1]])
pmf1 = pmf1/pmf1.sum()
pmf1_marginal_x = np.round(pmf1.sum(axis=0), 2)
pmf1_marginal_y = np.round(pmf1.sum(axis=1), 2)
pmf1x = pmf1_marginal_x * pmf1_marginal_y[:, np.newaxis]
plt.subplot(121)
sns.heatmap(pmf1, cmap=mpl.cm.bone_r, annot=True, square=True, linewidth=1, linecolor="k",
cbar=False, xticklabels=pmf1_marginal_x, yticklabels=pmf1_marginal_y)
plt.title("independent two variable - \n joint probability mass function")
plt.subplot(122)
pmf1x = pmf1_marginal_x * pmf1_marginal_y[:, np.newaxis]
sns.heatmap(pmf1x, cmap=mpl.cm.bone_r, annot=True, square=True, linewidth=1, linecolor="k",
cbar=False, xticklabels=pmf1_marginal_x, yticklabels=pmf1_marginal_y)
plt.title("two variable - the product of \n joint probability mass function")
plt.tight_layout()
plt.show()
dependent two variable
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
pmf2 = np.array([[0, 0, 0, 5, 5],
[0, 5, 5, 5, 5],
[0, 5, 30, 5, 0],
[5, 5, 5, 5, 0],
[5, 5, 0, 0, 0]])
pmf2 = pmf2/pmf2.sum()
pmf2_marginal_x = np.round(pmf2.sum(axis=0), 2)
pmf2_marginal_y = np.round(pmf2.sum(axis=1), 2)
plt.subplot(121)
sns.heatmap(pmf2, cmap=mpl.cm.bone_r, annot=True, square=True, linewidth=1, linecolor="k",
cbar=False, xticklabels=pmf2_marginal_x, yticklabels=pmf2_marginal_y)
plt.title("dependent two variable - \n joint probability mass function")
plt.subplot(122)
pmf2x = pmf2_marginal_x * pmf2_marginal_y[:, np.newaxis]
sns.heatmap(pmf2x, cmap=mpl.cm.bone_r, annot=True, square=True, linewidth=1, linecolor="k",
cbar=False, xticklabels=pmf2_marginal_x, yticklabels=pmf2_marginal_y)
plt.title("two variable - the product of \n joint probability mass function")
plt.tight_layout()
plt.show()
List of posts followed by this article
Reference