
Scikit-learn

In [ ]:
#ScikitLearn
#Iris dataset classification
In [ ]:
#Loading an external dataset: now consider the case where we want to load an external dataset.
#For this purpose, we can use the pandas library to easily load and manipulate the dataset.
In [ ]:
#This is the format of the file. ### Does scikit-learn always identify the last column as the target? No: you select the target column yourself (see the sketch after this file preview) ###
'''
Id  SepalLengthCm  SepalWidthCm  PetalLengthCm  PetalWidthCm      Species
0    1            5.1           3.5            1.4           0.2  Iris-setosa
1    2            4.9           3.0            1.4           0.2  Iris-setosa
2    3            4.7           3.2            1.3           0.2  Iris-setosa
3    4            4.6           3.1            1.5           0.2  Iris-setosa
'''
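In [ ]:
# A minimal sketch of loading the CSV previewed above with pandas (the filename
# 'Iris.csv' is an assumption). scikit-learn does not pick a target column for you:
# you slice the feature columns and the target column out of the DataFrame explicitly.
import pandas as pd

df = pd.read_csv('Iris.csv')                          # hypothetical path to the file above
X_csv = df[['SepalLengthCm', 'SepalWidthCm',
            'PetalLengthCm', 'PetalWidthCm']].values  # feature matrix
y_csv = df['Species'].values                          # target vector, chosen explicitly
print(df.head())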
In [ ]:
import pandas as pd
In [8]:
# 1.- INITIALIZE
# load the iris dataset as an example 
from sklearn.datasets import load_iris
iris = load_iris()

# store the feature matrix (X) and response vector (y) 

X = iris.data    # how does it know what is data and what is target? load_iris() returns a Bunch object
y = iris.target  # with separate .data and .target attributes, so the target is not inferred from the last column


# store the feature and target names 
feature_names = iris.feature_names
target_names = iris.target_names

# printing the target vector (class of each sample), then the feature and target names
print(y)
print("Feature names:", feature_names)
print("Target names:", target_names)

# X and y are numpy arrays 
print("\nType of X is:", type(X))
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2]
('Feature names:', ['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)'])
('Target names:', array(['setosa', 'versicolor', 'virginica'], dtype='|S10'))
('\nType of X is:', <type 'numpy.ndarray'>)
In [12]:
'''

import pandas as pd
#df = pd.DataFrame(iris)
df=pd.read_csv(iris)
print df.head()
'''
Out[12]:
'\nimport pandas as pd\n#df = pd.DataFrame(iris)\ndf=pd.read_csv(iris)\nprint df.head()\n'
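In [ ]:
# The commented-out attempt above would not work: pd.read_csv() expects a file path,
# while load_iris() returns a Bunch object. A sketch of building a DataFrame
# directly from the Bunch instead:
import pandas as pd

df_iris = pd.DataFrame(iris.data, columns=iris.feature_names)
df_iris['species'] = iris.target_names[iris.target]   # map the 0/1/2 codes to names
print(df_iris.head())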
In [4]:
print("\nFirst 5 rows of X:\n", X[:5])
('\nFirst 5 rows of X:\n', array([[5.1, 3.5, 1.4, 0.2],
       [4.9, 3. , 1.4, 0.2],
       [4.7, 3.2, 1.3, 0.2],
       [4.6, 3.1, 1.5, 0.2],
       [5. , 3.6, 1.4, 0.2]]))
In [ ]:
###    CLASSIFICATION METHODS   (SUPERVISED LEARNING)  ###
In [ ]:
#     Classification using the Knn method     #
In [17]:
from sklearn import neighbors
knn = neighbors.KNeighborsClassifier(n_neighbors=5) #, weights='uniform', algorithm='auto', leaf_size=30, p=2, metric='minkowski', metric_params=None, n_jobs=None, **kwargs)
knn.fit(X,y)
print iris.target_names[knn.predict([[3,5,4,2]])]   # OK, here I am predicting a single point,
# but how do I predict a whole set? Can I pass it a list? (see the sketch below)
['versicolor']
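In [ ]:
# Answer to the question above: predict() takes a 2D array-like with one row per
# sample, so a list of lists (or the whole test matrix) works. A quick sketch:
samples = [[3, 5, 4, 2],
           [5.1, 3.5, 1.4, 0.2]]          # two new points, one per row
preds = knn.predict(samples)              # one predicted class per row
print(iris.target_names[preds])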
In [ ]:
#    Classification using the GaussianNB method #
In [19]:
from sklearn.naive_bayes import GaussianNB
# Initialize our classifier
gnb = GaussianNB()

# Train our classifier --- FIT
model = gnb.fit(X,y)

# Make predictions
#preds = gnb.predict(test) --- PREDS
preds = gnb.predict([[3,5,4,2]])
print preds
print iris.target_names[preds]
[2]
['virginica']
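In [ ]:
# Besides a hard class label, GaussianNB can also return class probabilities via
# predict_proba(); a quick check for the same point:
probs = gnb.predict_proba([[3, 5, 4, 2]])   # one row of class probabilities per sample
print(probs)                                # columns follow the order of gnb.classes_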
In [ ]:
#More about classifier:
#https://scikit-learn.org/stable/auto_examples/classification/plot_classifier_comparison.html

'''
#Classifiers:
classifiers = [
    KNeighborsClassifier(3),    #Number of neighbors to consider
    SVC(kernel="linear", C=0.025),
    SVC(gamma=2, C=1),
    GaussianProcessClassifier(1.0 * RBF(1.0)),
    DecisionTreeClassifier(max_depth=5),
    RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1),
    MLPClassifier(alpha=1),
    AdaBoostClassifier(),
    GaussianNB(),
    QuadraticDiscriminantAnalysis()]
'''
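In [ ]:
# A rough sketch of comparing a few of the classifiers listed above on iris with
# 5-fold cross-validation (hyperparameters taken from the list, not tuned):
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB

for clf in [KNeighborsClassifier(3),
            DecisionTreeClassifier(max_depth=5),
            GaussianNB()]:
    scores = cross_val_score(clf, X, y, cv=5)   # accuracy on each of the 5 folds
    print("%s: mean accuracy %.3f" % (clf.__class__.__name__, scores.mean()))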
In [ ]:
'''

To summarize:

1.- Split the dataset into two pieces: a training set and a testing set.
2.- Train the model on the training set.
3.- Test the model on the testing set, and evaluate how well our model did. 
'''
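In [ ]:
# A sketch of the three steps summarized above, on iris with the KNN classifier
# (test_size=0.3 and random_state=1 are arbitrary choices):
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# 1.- split the dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)
# 2.- train on the training set
knn.fit(X_train, y_train)
# 3.- test on the testing set and evaluate
y_pred = knn.predict(X_test)
print("Accuracy: %.3f" % accuracy_score(y_test, y_pred))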
In [ ]:
###  CLUSTERING METHODS  - UNSUPERVISED LEARNING ###
In [21]:
# https://www.youtube.com/watch?v=rNHKCKXZde8
from sklearn import cluster
kmeans=cluster.KMeans(n_clusters=3)
kmeans.fit(X)   # unsupervised: the labels y are not needed for fitting
preds = kmeans.predict([[3,5,4,2]])   # predict with the fitted k-means model
print preds
# Note: k-means returns an arbitrary cluster index (0, 1 or 2), which does not map
# directly onto iris.target_names.
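In [ ]:
# Since cluster indices are arbitrary, comparing them with the true species needs a
# label-agnostic score; a sketch with the adjusted Rand index:
from sklearn.metrics import adjusted_rand_score
print("Adjusted Rand index: %.3f" % adjusted_rand_score(y, kmeans.labels_))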
In [ ]:
'''
CLUSTERING METHODS
k-means
affinity propagation
mean-shift
spectral clustering
Ward hierarchical clustering
agglomerative clustering
DBSCAN
Gaussian mixtures
BIRCH
'''
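In [ ]:
# Two of the methods listed above applied to the same iris features; both expose
# fit_predict() and neither uses the labels y (eps for DBSCAN is an arbitrary guess):
from sklearn.cluster import AgglomerativeClustering, DBSCAN

agg_labels = AgglomerativeClustering(n_clusters=3).fit_predict(X)
db_labels = DBSCAN(eps=0.8, min_samples=5).fit_predict(X)   # -1 marks noise points
print(agg_labels[:10])
print(db_labels[:10])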
In [ ]:
### REGRESSION -- Supervised Learning
#Linear
#Quadratic (see the polynomial sketch after the linear example below)
In [24]:
#Linear regression   https://scikit-learn.org/stable/auto_examples/linear_model/plot_ols.html

import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets, linear_model
from sklearn.metrics import mean_squared_error, r2_score

# Load the diabetes dataset
diabetes = datasets.load_diabetes()


# Use only one feature
diabetes_X = diabetes.data[:, np.newaxis, 2]

# Split the data into training/testing sets
diabetes_X_train = diabetes_X[:-20]
diabetes_X_test = diabetes_X[-20:]

# Split the targets into training/testing sets
diabetes_y_train = diabetes.target[:-20]
diabetes_y_test = diabetes.target[-20:]

# Create linear regression object
regr = linear_model.LinearRegression()

# Train the model using the training sets
regr.fit(diabetes_X_train, diabetes_y_train)

# Make predictions using the testing set
diabetes_y_pred = regr.predict(diabetes_X_test)

# The coefficients
print('Coefficients: \n', regr.coef_)
# The mean squared error
print("Mean squared error: %.2f"
      % mean_squared_error(diabetes_y_test, diabetes_y_pred))
# Explained variance score: 1 is perfect prediction
print('Variance score: %.2f' % r2_score(diabetes_y_test, diabetes_y_pred))

# Plot outputs
plt.scatter(diabetes_X_test, diabetes_y_test,  color='black')
plt.plot(diabetes_X_test, diabetes_y_pred, color='blue', linewidth=3)

plt.xticks(())
plt.yticks(())

plt.show()
('Coefficients: \n', array([938.23786125]))
Mean squared error: 2548.07
Variance score: 0.47
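In [ ]:
# The header above also mentions quadratic regression; a sketch of the same diabetes
# example with a degree-2 polynomial fit via PolynomialFeatures + LinearRegression:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline

quad = make_pipeline(PolynomialFeatures(degree=2), linear_model.LinearRegression())
quad.fit(diabetes_X_train, diabetes_y_train)
quad_pred = quad.predict(diabetes_X_test)
print("Quadratic mean squared error: %.2f" % mean_squared_error(diabetes_y_test, quad_pred))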
In [29]:
#Can I do the same for the iris dataset? Not really: its target is not continuous (it is a classification problem, not a regression one)
#Linear regression   https://scikit-learn.org/stable/auto_examples/linear_model/plot_ols.html
import sys

X = iris.data
y = iris.target

# Use only one feature
X_r = iris.data[:, np.newaxis, 0]  #X_reduced

# Split the data into training/testing sets
X_train = X_r[:-20]  # all but the last 20 samples
X_test = X_r[-20:]   # the last 20 samples

# Split the targets into training/testing sets; careful: these are still taken from
# the diabetes dataset, not from iris, so X and y end up with different numbers of rows
y_train = diabetes.target[:-20]
y_test = diabetes.target[-20:]

print 'train', X_train.shape
print 'ytrain', y_train.shape
print 'test', X_test.shape
print 'ytest', y_test.shape


#sys.exit()

# Create linear regression object
regr = linear_model.LinearRegression()

# Train the model using the training sets
regr.fit(X_train, y_train)  # fit on the training data

# Make predictions using the testing set
y_pred = regr.predict(X_test)  #Prediction using my test values

# The coefficients
print('Coefficients: \n', regr.coef_)
# The mean squared error
print("Mean squared error: %.2f"
      % mean_squared_error(diabetes_y_test, diabetes_y_pred))
# Explained variance score: 1 is perfect prediction
print('Variance score: %.2f' % r2_score(diabetes_y_test, diabetes_y_pred))

# Plot outputs
plt.scatter(diabetes_X_test, diabetes_y_test,  color='black')
plt.plot(diabetes_X_test, diabetes_y_pred, color='blue', linewidth=3)

plt.xticks(())
plt.yticks(())

plt.show()
train (130, 1)
ytrain (422,)
test (20, 1)
ytest (20,)
An exception has occurred, use %tb to see the full traceback.

SystemExit
In [ ]:
# What happens if X and y don't have the same number of rows? fit() raises a ValueError
# about inconsistent numbers of samples (demonstrated in the next cell).
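In [ ]:
# A sketch that triggers the error on purpose: 130 samples in X but 140 targets
# (the slice sizes are chosen arbitrarily for illustration):
try:
    linear_model.LinearRegression().fit(X_r[:-20], iris.target[:-10])
except ValueError as e:
    print(e)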
In [30]:
### PCA Principal Component Analysis ###
#https://scikit-learn.org/stable/auto_examples/decomposition/plot_pca_iris.html
'''
Principal component analysis (PCA) is a statistical procedure 
that uses an orthogonal transformation to convert a set of observations 
of possibly correlated variables (entities each of which takes on 
various numerical values) into a set of values of linearly uncorrelated 
variables called principal components
'''
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D


from sklearn import decomposition
from sklearn import datasets

np.random.seed(5)

centers = [[1, 1], [-1, -1], [1, -1]]   # not used below
iris = datasets.load_iris()
X = iris.data
y = iris.target

fig = plt.figure(1, figsize=(4, 3))
plt.clf()
ax = Axes3D(fig, rect=[0, 0, .95, 1], elev=48, azim=134)

plt.cla()
pca = decomposition.PCA(n_components=3)   # keep the three leading principal components
pca.fit(X)
X = pca.transform(X)                      # project the 4-D iris features onto those components

for name, label in [('Setosa', 0), ('Versicolour', 1), ('Virginica', 2)]:
    ax.text3D(X[y == label, 0].mean(),
              X[y == label, 1].mean() + 1.5,
              X[y == label, 2].mean(), name,
              horizontalalignment='center',
              bbox=dict(alpha=.5, edgecolor='w', facecolor='w'))
# Reorder the labels to have colors matching the cluster results
y = np.choose(y, [1, 2, 0]).astype(np.float)
ax.scatter(X[:, 0], X[:, 1], X[:, 2], c=y, cmap=plt.cm.nipy_spectral,
           edgecolor='k')

ax.w_xaxis.set_ticklabels([])
ax.w_yaxis.set_ticklabels([])
ax.w_zaxis.set_ticklabels([])

plt.show()
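In [ ]:
# How much of the original variance each principal component retains can be read from
# explained_variance_ratio_ on the fitted PCA object:
print(pca.explained_variance_ratio_)          # one fraction per component
print(pca.explained_variance_ratio_.sum())    # total variance kept by the 3 components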