6626070
2997924

AI01, Datasets for research

Back to the previous pagepage management
List of datasets for machine-learning research


Contents


From sklearn

scikit-learn datasets

from sklearn.datasets import load_diabetes
loaded_dataset = load_diabetes()

print(loaded_dataset.data.shape)
print(loaded_dataset.target.shape)
(442, 10)
(442,)
Split dataset
from sklearn.model_selection import train_test_split

x = loaded_dataset.data
y = loaded_dataset.target
x_train_all, x_test, y_train_all, y_test = train_test_split(x, y, test_size=0.2, random_state=42)
x_train, x_val, y_train, y_val = train_test_split(x_train_all, y_train_all, test_size=0.2, random_state=42)

x_train_mean = np.mean(x_train, axis=0)
x_train_std = np.std(x_train, axis=0)
x_train_scaled = (x_train - x_train_mean)/x_train_std

x_val_mean = np.mean(x_val, axis=0)
x_val_std = np.std(x_val, axis=0)
x_val_scaled = (x_val - x_val_mean)/x_val_std

x_test_mean = np.mean(x_test, axis=0)
x_test_std = np.std(x_test, axis=0)
x_test_scaled = (x_test - x_test_mean)/x_test_std

Visualization



from sklearn.datasets import load_boston
loaded_dataset = load_boston()

print(loaded_dataset.data.shape)
print(loaded_dataset.target.shape)
(506, 13)
(506,)
Split dataset
from sklearn.model_selection import train_test_split

x = loaded_dataset.data
y = loaded_dataset.target
x_train_all, x_test, y_train_all, y_test = train_test_split(x, y, test_size=0.2, random_state=42)
x_train, x_val, y_train, y_val = train_test_split(x_train_all, y_train_all, test_size=0.2, random_state=42)

x_train_mean = np.mean(x_train, axis=0)
x_train_std = np.std(x_train, axis=0)
x_train_scaled = (x_train - x_train_mean)/x_train_std

x_val_mean = np.mean(x_val, axis=0)
x_val_std = np.std(x_val, axis=0)
x_val_scaled = (x_val - x_val_mean)/x_val_std

x_test_mean = np.mean(x_test, axis=0)
x_test_std = np.std(x_test, axis=0)
x_test_scaled = (x_test - x_test_mean)/x_test_std

Visualization



from sklearn.datasets import load_breast_cancer
loaded_dataset = load_breast_cancer()

print(loaded_dataset.data.shape)
print(loaded_dataset.target.shape)
(569, 30)
(569,)
Split dataset
from sklearn.model_selection import train_test_split

x = loaded_dataset.data
y = loaded_dataset.target
x_train_all, x_test, y_train_all, y_test = train_test_split(x, y, test_size=0.2, random_state=42)
x_train, x_val, y_train, y_val = train_test_split(x_train_all, y_train_all, test_size=0.2, random_state=42)

x_train_mean = np.mean(x_train, axis=0)
x_train_std = np.std(x_train, axis=0)
x_train_scaled = (x_train - x_train_mean)/x_train_std

x_val_mean = np.mean(x_val, axis=0)
x_val_std = np.std(x_val, axis=0)
x_val_scaled = (x_val - x_val_mean)/x_val_std

x_test_mean = np.mean(x_test, axis=0)
x_test_std = np.std(x_test, axis=0)
x_test_scaled = (x_test - x_test_mean)/x_test_std

Visualization
import matplotlib.pyplot as plt

fig, axes = plt.subplots(3,3, figsize=(10,10))
for i in range(3):
    for j in range(3):
        axes[i, j].scatter(loaded_dataset.data[:,3*i+j], loaded_dataset.target)
        axes[i, j].set_title("%d"%(3*i+j))
plt.tight_layout()
plt.show()

download



from sklearn.datasets import load_digits
loaded_dataset = load_digits()

print(loaded_dataset.data.shape)
print(loaded_dataset.target.shape)
(1797, 64)
(1797,)
Split dataset
from sklearn.model_selection import train_test_split

x = loaded_dataset.data
y = loaded_dataset.target
x_train_all, x_test, y_train_all, y_test = train_test_split(x, y, test_size=0.2, random_state=42)
x_train, x_val, y_train, y_val = train_test_split(x_train_all, y_train_all, test_size=0.2, random_state=42)

x_train_mean = np.mean(x_train, axis=0)
x_train_std = np.std(x_train, axis=0)
x_train_scaled = (x_train - x_train_mean)/x_train_std

x_val_mean = np.mean(x_val, axis=0)
x_val_std = np.std(x_val, axis=0)
x_val_scaled = (x_val - x_val_mean)/x_val_std

x_test_mean = np.mean(x_test, axis=0)
x_test_std = np.std(x_test, axis=0)
x_test_scaled = (x_test - x_test_mean)/x_test_std

Visualization



from sklearn.datasets import load_iris
loaded_dataset = load_iris()

print(loaded_dataset.data.shape)
print(loaded_dataset.target.shape)
(150, 4)
(150,)
Split dataset
from sklearn.model_selection import train_test_split

x = loaded_dataset.data
y = loaded_dataset.target
x_train_all, x_test, y_train_all, y_test = train_test_split(x, y, test_size=0.2, random_state=42)
x_train, x_val, y_train, y_val = train_test_split(x_train_all, y_train_all, test_size=0.2, random_state=42)

x_train_mean = np.mean(x_train, axis=0)
x_train_std = np.std(x_train, axis=0)
x_train_scaled = (x_train - x_train_mean)/x_train_std

x_val_mean = np.mean(x_val, axis=0)
x_val_std = np.std(x_val, axis=0)
x_val_scaled = (x_val - x_val_mean)/x_val_std

x_test_mean = np.mean(x_test, axis=0)
x_test_std = np.std(x_test, axis=0)
x_test_scaled = (x_test - x_test_mean)/x_test_std

Visualization



from sklearn.datasets import load_linnerud
loaded_dataset = load_linnerud()

print(loaded_dataset.data.shape)
print(loaded_dataset.target.shape)
(20, 3)
(20, 3)
Split dataset
from sklearn.model_selection import train_test_split

x = loaded_dataset.data
y = loaded_dataset.target
x_train_all, x_test, y_train_all, y_test = train_test_split(x, y, test_size=0.2, random_state=42)
x_train, x_val, y_train, y_val = train_test_split(x_train_all, y_train_all, test_size=0.2, random_state=42)

x_train_mean = np.mean(x_train, axis=0)
x_train_std = np.std(x_train, axis=0)
x_train_scaled = (x_train - x_train_mean)/x_train_std

x_val_mean = np.mean(x_val, axis=0)
x_val_std = np.std(x_val, axis=0)
x_val_scaled = (x_val - x_val_mean)/x_val_std

x_test_mean = np.mean(x_test, axis=0)
x_test_std = np.std(x_test, axis=0)
x_test_scaled = (x_test - x_test_mean)/x_test_std

Visualization



from sklearn.datasets import load_wine
loaded_dataset = load_wine()

print(loaded_dataset.data.shape)
print(loaded_dataset.target.shape)
(178, 13)
(178,)
Split dataset
from sklearn.model_selection import train_test_split

x = loaded_dataset.data
y = loaded_dataset.target
x_train_all, x_test, y_train_all, y_test = train_test_split(x, y, test_size=0.2, random_state=42)
x_train, x_val, y_train, y_val = train_test_split(x_train_all, y_train_all, test_size=0.2, random_state=42)

x_train_mean = np.mean(x_train, axis=0)
x_train_std = np.std(x_train, axis=0)
x_train_scaled = (x_train - x_train_mean)/x_train_std

x_val_mean = np.mean(x_val, axis=0)
x_val_std = np.std(x_val, axis=0)
x_val_scaled = (x_val - x_val_mean)/x_val_std

x_test_mean = np.mean(x_test, axis=0)
x_test_std = np.std(x_test, axis=0)
x_test_scaled = (x_test - x_test_mean)/x_test_std

Visualization





From keras

from keras.datasets import mnist





Reference