Scikit-learn
In [ ]:
#ScikitLearn
#Iris dataset classification
In [ ]:
#Loading an external dataset: now consider the case where we want to load an external dataset.
#For this purpose, we can use the pandas library to easily load and manipulate the dataset.
In [ ]:
#This is the format of the file. ### Does scikit-learn always identify the last column as the target? ### (No: with a CSV you pick the columns yourself; see the pandas sketch below.)
'''
Id SepalLengthCm SepalWidthCm PetalLengthCm PetalWidthCm Species
0 1 5.1 3.5 1.4 0.2 Iris-setosa
1 2 4.9 3.0 1.4 0.2 Iris-setosa
2 3 4.7 3.2 1.3 0.2 Iris-setosa
3 4 4.6 3.1 1.5 0.2 Iris-setosa
'''
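In [ ]:
# A minimal sketch of loading a CSV like the one above with pandas. The
# filename 'Iris.csv' is a hypothetical local path. Note that scikit-learn
# does NOT automatically take the last column as the target: with a CSV you
# select the feature and target columns yourself.
import pandas as pd
df = pd.read_csv('Iris.csv')
X_csv = df[['SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm']].values
y_csv = df['Species'].values
print(df.head())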
In [ ]:
import pandas as pd
In [8]:
# 1.- INITIALIZE
# load the iris dataset as an example
from sklearn.datasets import load_iris
iris = load_iris()
# store the feature matrix (X) and response vector (y)
X = iris.data    # How does it know what is data and what is target? Does it all come in the same table? Yes:
y = iris.target  # load_iris() returns a Bunch whose .data and .target attributes are already separated.
# store the feature and target names
feature_names = iris.feature_names
target_names = iris.target_names
# printing features and target names of our dataset
print("Feature names:", feature_names)
print("Target names:", target_names)
# X and y are numpy arrays
print("\nType of X is:", type(X))
In [12]:
'''
import pandas as pd
# pd.read_csv expects a file path, not the Bunch object, so build a
# DataFrame from the already-loaded data instead:
df = pd.DataFrame(iris.data, columns=iris.feature_names)
print(df.head())
'''
In [4]:
print("\nFirst 5 rows of X:\n", X[:5])
In [ ]:
### CLASSIFICATION METHODS (SUPERVISED LEARNING) ###
In [ ]:
# Classification using the k-NN method #
In [17]:
from sklearn import neighbors
knn = neighbors.KNeighborsClassifier(n_neighbors=5)  # other defaults: weights='uniform', algorithm='auto', leaf_size=30, p=2, metric='minkowski', metric_params=None, n_jobs=None
knn.fit(X, y)
print(iris.target_names[knn.predict([[3, 5, 4, 2]])])  # OK, here I am predicting a single point;
# but how do I predict a whole set? Can I pass it a list? (see the sketch below)
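In [ ]:
# Answer to the question above: predict() accepts a 2-D array-like of
# samples, so a whole set can be predicted in one call. A minimal sketch
# (the sample values below are made up):
samples = [[3, 5, 4, 2],
           [5, 4, 3, 2]]
print(iris.target_names[knn.predict(samples)])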
In [ ]:
# Classification using the GaussianNB method #
In [19]:
from sklearn.naive_bayes import GaussianNB
# Initialize our classifier
gnb = GaussianNB()
# Train our classifier --- FIT
model = gnb.fit(X,y)
# Make predictions
#preds = gnb.predict(test) --- PREDS
preds = gnb.predict([[3, 5, 4, 2]])
print(preds)
print(iris.target_names[preds])
In [ ]:
#More about classifiers:
#https://scikit-learn.org/stable/auto_examples/classification/plot_classifier_comparison.html
'''
#Classifiers:
classifiers = [
KNeighborsClassifier(3), #Number of neighbors to consider
SVC(kernel="linear", C=0.025),
SVC(gamma=2, C=1),
GaussianProcessClassifier(1.0 * RBF(1.0)),
DecisionTreeClassifier(max_depth=5),
RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1),
MLPClassifier(alpha=1),
AdaBoostClassifier(),
GaussianNB(),
QuadraticDiscriminantAnalysis()]
'''
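In [ ]:
# A minimal sketch running two of the classifiers listed above on the iris
# data with 5-fold cross-validation (assumes X and y from the earlier cells):
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
for clf in [KNeighborsClassifier(3), GaussianNB()]:
    scores = cross_val_score(clf, X, y, cv=5)
    print(type(clf).__name__, "mean accuracy: %.3f" % scores.mean())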
In [ ]:
'''
To summarize:
1.- Split the dataset into two pieces: a training set and a testing set.
2.- Train the model on the training set.
3.- Test the model on the testing set, and evaluate how well our model did.
'''
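In [ ]:
# A minimal sketch of the three steps above (assumes X, y and knn from the
# earlier cells; the 70/30 split ratio is an arbitrary choice):
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
# 1.- Split
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.3, random_state=0)
# 2.- Train
knn.fit(X_tr, y_tr)
# 3.- Test and evaluate
print("Accuracy:", accuracy_score(y_te, knn.predict(X_te)))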
In [ ]:
### CLUSTERING METHODS - UNSUPERVISED LEARNING ###
In [21]:
# https://www.youtube.com/watch?v=rNHKCKXZde8
from sklearn import cluster
kmeans = cluster.KMeans(n_clusters=3)
kmeans.fit(X)  # unsupervised: the target y is not needed for fitting
preds = kmeans.predict([[3, 5, 4, 2]])
print(preds)
# Caution: k-means cluster indices are arbitrary, so they need not match the
# order of iris.target_names.
print(iris.target_names[preds])
In [ ]:
'''
CLUSTERING METHODS
k-means
affinity propagation
mean-shift
spectral clustering
ward hierarchical clustering
agglomerative clustering
DBSCAN
Gaussian mixtures
Birch
'''
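In [ ]:
# Because k-means cluster indices are arbitrary, a permutation-invariant
# score such as the adjusted Rand index is a common way to compare the
# clusters against the true iris classes. A minimal sketch (assumes y and
# the fitted kmeans from the cell above):
from sklearn.metrics import adjusted_rand_score
print("Adjusted Rand index:", adjusted_rand_score(y, kmeans.labels_))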
In [ ]:
### REGRESSION -- Supervised Learning
#Linear
#Quadratic (see the sketch below)
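In [ ]:
# "Quadratic" regression can be done with the same LinearRegression estimator
# by first expanding the features to degree-2 polynomials. A minimal sketch
# on made-up 1-D data (the quadratic target below is hypothetical):
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
x = np.linspace(-3, 3, 50).reshape(-1, 1)
t = 2 * x[:, 0] ** 2 - x[:, 0] + 1          # hypothetical quadratic target
x_poly = PolynomialFeatures(degree=2).fit_transform(x)  # columns: 1, x, x^2
model = LinearRegression().fit(x_poly, t)
print("Coefficients:", model.coef_, "Intercept:", model.intercept_)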
In [24]:
#Linear regression https://scikit-learn.org/stable/auto_examples/linear_model/plot_ols.html
import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets, linear_model
from sklearn.metrics import mean_squared_error, r2_score
# Load the diabetes dataset
diabetes = datasets.load_diabetes()
# Use only one feature
diabetes_X = diabetes.data[:, np.newaxis, 2]
# Split the data into training/testing sets
diabetes_X_train = diabetes_X[:-20]
diabetes_X_test = diabetes_X[-20:]
# Split the targets into training/testing sets
diabetes_y_train = diabetes.target[:-20]
diabetes_y_test = diabetes.target[-20:]
# Create linear regression object
regr = linear_model.LinearRegression()
# Train the model using the training sets
regr.fit(diabetes_X_train, diabetes_y_train)
# Make predictions using the testing set
diabetes_y_pred = regr.predict(diabetes_X_test)
# The coefficients
print('Coefficients: \n', regr.coef_)
# The mean squared error
print("Mean squared error: %.2f"
% mean_squared_error(diabetes_y_test, diabetes_y_pred))
# Explained variance score: 1 is perfect prediction
print('Variance score: %.2f' % r2_score(diabetes_y_test, diabetes_y_pred))
# Plot outputs
plt.scatter(diabetes_X_test, diabetes_y_test, color='black')
plt.plot(diabetes_X_test, diabetes_y_pred, color='blue', linewidth=3)
plt.xticks(())
plt.yticks(())
plt.show()
In [29]:
#Can I do the same for the iris dataset? Not meaningfully: the target is not continuous (it is a classification problem instead).
#Linear regression https://scikit-learn.org/stable/auto_examples/linear_model/plot_ols.html
import sys
X = iris.data
y = iris.target
# Use only one feature
X_r = iris.data[:, np.newaxis, 0] #X_reduced
# Split the data into training/testing sets
X_train = X_r[:-20]  # all but the last 20 samples
X_test = X_r[-20:]   # the last 20 samples
# Split the targets into training/testing sets; targets == y, the iris classes (not the diabetes targets!)
y_train = y[:-20]
y_test = y[-20:]
print('train', X_train.shape)
print('ytrain', y_train.shape)
print('test', X_test.shape)
print('ytest', y_test.shape)
#sys.exit()
# Create linear regression object
regr = linear_model.LinearRegression()
# Train the model using the training sets
regr.fit(X_train, y_train)  # train using my training values
# Make predictions using the testing set
y_pred = regr.predict(X_test) #Prediction using my test values
# The coefficients
print('Coefficients: \n', regr.coef_)
# The mean squared error
print("Mean squared error: %.2f"
% mean_squared_error(diabetes_y_test, diabetes_y_pred))
# Explained variance score: 1 is perfect prediction
print('Variance score: %.2f' % r2_score(diabetes_y_test, diabetes_y_pred))
# Plot outputs
plt.scatter(diabetes_X_test, diabetes_y_test, color='black')
plt.plot(diabetes_X_test, diabetes_y_pred, color='blue', linewidth=3)
plt.xticks(())
plt.yticks(())
plt.show()
In [ ]:
# What happens if X and y don't have the same number of rows? fit() raises a ValueError (see the sketch below).
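In [ ]:
# A minimal sketch of the mismatch case: scikit-learn validates input shapes,
# so fit() raises a ValueError when X and y have different numbers of rows
# (here 130 vs. 120; assumes X_r and y from the cell above):
try:
    linear_model.LinearRegression().fit(X_r[:-20], y[:-30])
except ValueError as e:
    print("ValueError:", e)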
In [30]:
### PCA Principal Component Analysis ###
#https://scikit-learn.org/stable/auto_examples/decomposition/plot_pca_iris.html
'''
Principal component analysis (PCA) is a statistical procedure
that uses an orthogonal transformation to convert a set of observations
of possibly correlated variables (entities each of which takes on
various numerical values) into a set of values of linearly uncorrelated
variables called principal components
'''
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from sklearn import decomposition
from sklearn import datasets
np.random.seed(5)
centers = [[1, 1], [-1, -1], [1, -1]]
iris = datasets.load_iris()
X = iris.data
y = iris.target
fig = plt.figure(1, figsize=(4, 3))
plt.clf()
ax = fig.add_subplot(111, projection='3d')
ax.view_init(elev=48, azim=134)
plt.cla()
pca = decomposition.PCA(n_components=3)
pca.fit(X)
X = pca.transform(X)
for name, label in [('Setosa', 0), ('Versicolour', 1), ('Virginica', 2)]:
    ax.text3D(X[y == label, 0].mean(),
              X[y == label, 1].mean() + 1.5,
              X[y == label, 2].mean(), name,
              horizontalalignment='center',
              bbox=dict(alpha=.5, edgecolor='w', facecolor='w'))
# Reorder the labels to have colors matching the cluster results
y = np.choose(y, [1, 2, 0]).astype(float)
ax.scatter(X[:, 0], X[:, 1], X[:, 2], c=y, cmap=plt.cm.nipy_spectral,
           edgecolor='k')
ax.set_xticks([])
ax.set_yticks([])
ax.set_zticks([])
plt.show()
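In [ ]:
# How much variance does each principal component capture? The fitted PCA
# object exposes this directly. A minimal sketch (assumes pca from above):
print("Explained variance ratio:", pca.explained_variance_ratio_)
print("Total:", pca.explained_variance_ratio_.sum())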