AI01, Datasets for research
Back to the previous page|page management
List of datasets for machine-learning research
Contents
From sklearn
# Load scikit-learn's bundled diabetes dataset and report the shapes of the
# feature matrix and the target vector, one per line.
from sklearn.datasets import load_diabetes

loaded_dataset = load_diabetes()
print(loaded_dataset.data.shape, loaded_dataset.target.shape, sep="\n")
(442, 10)
(442,)
Split dataset
# Hold out 20% of the samples as a test set, then 20% of what remains as a
# validation set (fixed random_state for reproducible splits), and
# standardize the features.
import numpy as np
from sklearn.model_selection import train_test_split

x = loaded_dataset.data
y = loaded_dataset.target
x_train_all, x_test, y_train_all, y_test = train_test_split(x, y, test_size=0.2, random_state=42)
x_train, x_val, y_train, y_val = train_test_split(x_train_all, y_train_all, test_size=0.2, random_state=42)

# The per-feature mean/std are estimated on the training split only and then
# reused for validation and test data. Scaling each split with its own
# statistics would put the three sets on different scales and leak
# evaluation-set information into preprocessing.
x_train_mean = np.mean(x_train, axis=0)
x_train_std = np.std(x_train, axis=0)
x_train_scaled = (x_train - x_train_mean) / x_train_std
x_val_scaled = (x_val - x_train_mean) / x_train_std
x_test_scaled = (x_test - x_train_mean) / x_train_std
Visualization
# Load the Boston housing data and print the shapes of features and targets.
# ``load_boston`` was deprecated in scikit-learn 1.0 and removed in 1.2, where
# importing it raises ImportError. In that case the arrays are rebuilt from
# the original CMU StatLib file, following the recipe given in scikit-learn's
# removal notice (requires network access and pandas).
# NOTE: scikit-learn discourages this dataset for ethical reasons; consider
# fetch_california_housing() for new work.
try:
    from sklearn.datasets import load_boston
except ImportError:
    import numpy as np
    import pandas as pd
    from sklearn.utils import Bunch

    # After a 22-line header, every record is wrapped over two physical
    # lines: 11 values on the first, 2 features + the target on the second.
    raw_df = pd.read_csv(
        "http://lib.stat.cmu.edu/datasets/boston",
        sep=r"\s+",
        skiprows=22,
        header=None,
    )
    loaded_dataset = Bunch(
        data=np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]]),
        target=raw_df.values[1::2, 2],
    )
else:
    loaded_dataset = load_boston()
print(loaded_dataset.data.shape)
print(loaded_dataset.target.shape)
(506, 13)
(506,)
Split dataset
# Hold out 20% of the samples as a test set, then 20% of what remains as a
# validation set (fixed random_state for reproducible splits), and
# standardize the features.
import numpy as np
from sklearn.model_selection import train_test_split

x = loaded_dataset.data
y = loaded_dataset.target
x_train_all, x_test, y_train_all, y_test = train_test_split(x, y, test_size=0.2, random_state=42)
x_train, x_val, y_train, y_val = train_test_split(x_train_all, y_train_all, test_size=0.2, random_state=42)

# The per-feature mean/std are estimated on the training split only and then
# reused for validation and test data. Scaling each split with its own
# statistics would put the three sets on different scales and leak
# evaluation-set information into preprocessing.
x_train_mean = np.mean(x_train, axis=0)
x_train_std = np.std(x_train, axis=0)
x_train_scaled = (x_train - x_train_mean) / x_train_std
x_val_scaled = (x_val - x_train_mean) / x_train_std
x_test_scaled = (x_test - x_train_mean) / x_train_std
Visualization
# Load scikit-learn's bundled breast-cancer dataset and report the shapes of
# the feature matrix and the target vector, one per line.
from sklearn.datasets import load_breast_cancer

loaded_dataset = load_breast_cancer()
print(loaded_dataset.data.shape, loaded_dataset.target.shape, sep="\n")
(569, 30)
(569,)
Split dataset
# Hold out 20% of the samples as a test set, then 20% of what remains as a
# validation set (fixed random_state for reproducible splits), and
# standardize the features.
import numpy as np
from sklearn.model_selection import train_test_split

x = loaded_dataset.data
y = loaded_dataset.target
x_train_all, x_test, y_train_all, y_test = train_test_split(x, y, test_size=0.2, random_state=42)
x_train, x_val, y_train, y_val = train_test_split(x_train_all, y_train_all, test_size=0.2, random_state=42)

# The per-feature mean/std are estimated on the training split only and then
# reused for validation and test data. Scaling each split with its own
# statistics would put the three sets on different scales and leak
# evaluation-set information into preprocessing.
x_train_mean = np.mean(x_train, axis=0)
x_train_std = np.std(x_train, axis=0)
x_train_scaled = (x_train - x_train_mean) / x_train_std
x_val_scaled = (x_val - x_train_mean) / x_train_std
x_test_scaled = (x_test - x_train_mean) / x_train_std
Visualization
# Scatter-plot each of the first nine feature columns against the target in a
# 3x3 grid of subplots; every panel is titled with its feature column index.
import matplotlib.pyplot as plt

fig, axes = plt.subplots(3, 3, figsize=(10, 10))
# axes.flat walks the grid row by row, so the running index equals 3*row + col.
for feature_idx, ax in enumerate(axes.flat):
    ax.scatter(loaded_dataset.data[:, feature_idx], loaded_dataset.target)
    ax.set_title("%d" % feature_idx)
# Adjust spacing once, after all panels are drawn, so titles do not overlap.
plt.tight_layout()
plt.show()
# Load scikit-learn's bundled handwritten-digits dataset and report the shapes
# of the feature matrix and the target vector, one per line.
from sklearn.datasets import load_digits

loaded_dataset = load_digits()
print(loaded_dataset.data.shape, loaded_dataset.target.shape, sep="\n")
(1797, 64)
(1797,)
Split dataset
# Hold out 20% of the samples as a test set, then 20% of what remains as a
# validation set (fixed random_state for reproducible splits), and
# standardize the features.
import numpy as np
from sklearn.model_selection import train_test_split

x = loaded_dataset.data
y = loaded_dataset.target
x_train_all, x_test, y_train_all, y_test = train_test_split(x, y, test_size=0.2, random_state=42)
x_train, x_val, y_train, y_val = train_test_split(x_train_all, y_train_all, test_size=0.2, random_state=42)

# The per-feature mean/std are estimated on the training split only and then
# reused for validation and test data. Scaling each split with its own
# statistics would put the three sets on different scales and leak
# evaluation-set information into preprocessing.
x_train_mean = np.mean(x_train, axis=0)
x_train_std = np.std(x_train, axis=0)

# A column with zero variance in the training split (e.g. a pixel that is
# blank in every image) has std == 0 and would turn the division into
# NaN/inf; dividing by 1 instead leaves such columns merely centred.
safe_std = np.where(x_train_std == 0, 1.0, x_train_std)
x_train_scaled = (x_train - x_train_mean) / safe_std
x_val_scaled = (x_val - x_train_mean) / safe_std
x_test_scaled = (x_test - x_train_mean) / safe_std
Visualization
# Load scikit-learn's bundled iris dataset and report the shapes of the
# feature matrix and the target vector, one per line.
from sklearn.datasets import load_iris

loaded_dataset = load_iris()
print(loaded_dataset.data.shape, loaded_dataset.target.shape, sep="\n")
(150, 4)
(150,)
Split dataset
# Hold out 20% of the samples as a test set, then 20% of what remains as a
# validation set (fixed random_state for reproducible splits), and
# standardize the features.
import numpy as np
from sklearn.model_selection import train_test_split

x = loaded_dataset.data
y = loaded_dataset.target
x_train_all, x_test, y_train_all, y_test = train_test_split(x, y, test_size=0.2, random_state=42)
x_train, x_val, y_train, y_val = train_test_split(x_train_all, y_train_all, test_size=0.2, random_state=42)

# The per-feature mean/std are estimated on the training split only and then
# reused for validation and test data. Scaling each split with its own
# statistics would put the three sets on different scales and leak
# evaluation-set information into preprocessing.
x_train_mean = np.mean(x_train, axis=0)
x_train_std = np.std(x_train, axis=0)
x_train_scaled = (x_train - x_train_mean) / x_train_std
x_val_scaled = (x_val - x_train_mean) / x_train_std
x_test_scaled = (x_test - x_train_mean) / x_train_std
Visualization
# Load scikit-learn's bundled Linnerud dataset and report the shapes of the
# feature matrix and the target array, one per line.
from sklearn.datasets import load_linnerud

loaded_dataset = load_linnerud()
print(loaded_dataset.data.shape, loaded_dataset.target.shape, sep="\n")
(20, 3)
(20, 3)
Split dataset
# Hold out 20% of the samples as a test set, then 20% of what remains as a
# validation set (fixed random_state for reproducible splits), and
# standardize the features.
import numpy as np
from sklearn.model_selection import train_test_split

x = loaded_dataset.data
y = loaded_dataset.target
x_train_all, x_test, y_train_all, y_test = train_test_split(x, y, test_size=0.2, random_state=42)
x_train, x_val, y_train, y_val = train_test_split(x_train_all, y_train_all, test_size=0.2, random_state=42)

# The per-feature mean/std are estimated on the training split only and then
# reused for validation and test data. Scaling each split with its own
# statistics would put the three sets on different scales and leak
# evaluation-set information into preprocessing.
x_train_mean = np.mean(x_train, axis=0)
x_train_std = np.std(x_train, axis=0)
x_train_scaled = (x_train - x_train_mean) / x_train_std
x_val_scaled = (x_val - x_train_mean) / x_train_std
x_test_scaled = (x_test - x_train_mean) / x_train_std
Visualization
# Load scikit-learn's bundled wine dataset and report the shapes of the
# feature matrix and the target vector, one per line.
from sklearn.datasets import load_wine

loaded_dataset = load_wine()
print(loaded_dataset.data.shape, loaded_dataset.target.shape, sep="\n")
(178, 13)
(178,)
Split dataset
# Hold out 20% of the samples as a test set, then 20% of what remains as a
# validation set (fixed random_state for reproducible splits), and
# standardize the features.
import numpy as np
from sklearn.model_selection import train_test_split

x = loaded_dataset.data
y = loaded_dataset.target
x_train_all, x_test, y_train_all, y_test = train_test_split(x, y, test_size=0.2, random_state=42)
x_train, x_val, y_train, y_val = train_test_split(x_train_all, y_train_all, test_size=0.2, random_state=42)

# The per-feature mean/std are estimated on the training split only and then
# reused for validation and test data. Scaling each split with its own
# statistics would put the three sets on different scales and leak
# evaluation-set information into preprocessing.
x_train_mean = np.mean(x_train, axis=0)
x_train_std = np.std(x_train, axis=0)
x_train_scaled = (x_train - x_train_mean) / x_train_std
x_val_scaled = (x_val - x_train_mean) / x_train_std
x_test_scaled = (x_test - x_train_mean) / x_train_std
Visualization
From keras
from keras.datasets import mnist
Reference
- Kaggle
- US Government Open Data
- Indian Government Open Data
- Amazon Web Service Datasets
- Google Dataset Search
- UCI ML Repository
- Open Data portal
- KOSIS (Korean Statistical Information Service)