19. Matplotlib

The iris data set

Famous data set in pattern recognition

150 observations, 4 features each
- Sepal length
- Sepal width
- Petal length
- Petal width
3 species: setosa, versicolor, virginica

reference: http://scikit-learn.org/stable/datasets/index.html

Load Dataset

import matplotlib.pyplot as plt
import numpy as np
from sklearn import datasets
import pandas as pd
iris = datasets.load_iris()  # use the data set offered by sklearn

# Make a DataFrame with one column per measured feature
iris_DF = pd.DataFrame(iris.data, columns=iris.feature_names)
iris_DF.head()

# Add the species column.
# Map each row's integer class code (iris.target) to its species name
# (iris.target_names) instead of hard-coding row ranges, so the labels stay
# correct regardless of how the rows are ordered or indexed.
iris_DF["species"] = iris.target_names[iris.target]
iris_DF

# Count how many observations belong to each species
iris_DF.groupby(by="species").size()
# Descriptive summary of the species labels
iris_DF.species.describe()

# EDA
# Select the column at positional index 1 (i.e. the second column) with an
# iloc slice -- presumably sepal width, given the feature order listed above.
second_col = iris_DF.iloc[:, 1]
# Print its summary statistics
print(second_col.describe())
# Draw a box plot of the same column
second_col.plot.box()
# Render the figure
plt.show()

# Box plots and quantiles across all columns
## Columns are selected by passing an array of column labels
## Quantile-based EDA
# Print per-column counts, then the 5th, 25th, 75th and 95th percentiles
kind = iris_DF.columns
q = [0.05, 0.25, 0.75, 0.95]
print(iris_DF[kind].count())
# `kind` also contains the string-valued "species" column. Quantiles are only
# defined for numeric data, and pandas >= 2.0 raises TypeError on such a
# column unless numeric_only=True is passed.
print(iris_DF[kind].quantile(q, numeric_only=True))
# Generate a box plot from the selected columns
iris_DF[kind].plot(kind='box')
plt.show()

# Scatter plots
# Petal length against petal width
iris_DF.plot.scatter(x="petal length (cm)", y="petal width (cm)")
# Petal length against sepal length, with a logarithmic x-axis
iris_DF.plot.scatter(x="petal length (cm)", y="sepal length (cm)", logx=True)
# Render the figures
plt.show()

Visual EDA: all data

# Histogram of the value distribution over the entire data set.
# The figure is created first so the requested size and resolution apply to
# the histogram itself; calling plt.figure() after plotting would only open a
# second, empty figure.
fig, ax = plt.subplots(figsize=(18, 12), dpi=600)
iris_DF.plot(kind='hist', bins=50, range=(0, 8), alpha=0.3,
             edgecolor='black', ax=ax)
ax.set_title('Entire iris data set')
ax.set_xlabel('[cm]')
plt.show()

# Look at a single species
# visualize setosa data
# Select only the rows labelled "setosa"; without this the name `setosa`
# is undefined and the plot call below fails with NameError.
setosa = iris_DF[iris_DF["species"] == "setosa"]
setosa.plot(kind="hist", bins=50, range=(0, 8), alpha=0.3,
            edgecolor="black")
plt.title("setosa data set")
plt.xlabel("[cm]")
plt.show()