
A beginner's machine-learning exercise with sklearn

liuhua - 2023/12/3 17:11:21

Covers downloading and loading the dataset, splitting it into training and test sets, visualizing the data, cleaning the data (attribute selection, handling missing values, converting categorical values to numbers, scaling numeric features), basic model training (linear regression, decision tree, random forest), model evaluation (cross-validation), and hyperparameter tuning (grid search, randomized search).

The code is a bit messy, but it runs.

import os
import tarfile
import pandas as pd
import urllib.request  # the six.moves shim is only needed on Python 2

DOWNLOAD_ROOT="https://raw.githubusercontent.com/ageron/handson-ml/master/"
HOUSING_PATH="datasets/housing"
HOUSING_URL=DOWNLOAD_ROOT+HOUSING_PATH+"/housing.tgz"

def fetch_housing_data(housing_url=HOUSING_URL,housing_path=HOUSING_PATH):
    if not os.path.isdir(housing_path):
        os.makedirs(housing_path)
    tgz_path=os.path.join(housing_path,"housing.tgz")
    urllib.request.urlretrieve(housing_url,tgz_path)
    print("downloaded!")
    housing_tgz=tarfile.open(tgz_path)
    housing_tgz.extractall(path=housing_path)
    housing_tgz.close()
    print("extracted!")
def load_housing_data(housing_path=HOUSING_PATH):
    csv_path=os.path.join(housing_path,"housing.csv")
    return pd.read_csv(csv_path)
housing=load_housing_data()
#housing.describe()
#housing.info()

#import matplotlib.pyplot as plt
#housing.hist(bins=50,figsize=(20,15))
#plt.show()
import numpy as np
import hashlib
def test_set_check(identifier,test_ratio,hash):
    # keep a row in the test set when the last byte of its id's hash falls below test_ratio*256
    return hash(np.int64(identifier)).digest()[-1]<256*test_ratio
def split_train_test_by_id(data,test_ratio,id_column,hash=hashlib.md5):
    ids=data[id_column]
    in_test_set=ids.apply(lambda id_:test_set_check(id_,test_ratio,hash))
    return data.loc[~in_test_set],data.loc[in_test_set]
housing_with_id=housing.reset_index()
# train/test split keyed on an id derived from longitude and latitude
housing_with_id["id"]=housing["longitude"]*1000+housing["latitude"]
train_set,test_set=split_train_test_by_id(housing_with_id,0.2,"id")
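# (Added sketch:) the point of the hash-based split is stability: the same id
# always hashes to the same bucket, so re-running the split returns the
# identical test set even after new rows are appended to the data.
train_set2,test_set2=split_train_test_by_id(housing_with_id,0.2,"id")
assert test_set.index.equals(test_set2.index)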
#train test split with sklearn
from sklearn.model_selection import train_test_split
train_set,test_set=train_test_split(housing,test_size=0.2,random_state=42)
print(len(train_set),"train +",len(test_set),"test")
housing["income_cat"]=np.ceil(housing["median_income"]/1.5)
housing["income_cat"].where(housing["income_cat"]<5,5.0,inplace=True)
from sklearn.model_selection import StratifiedShuffleSplit
split=StratifiedShuffleSplit(n_splits=1,test_size=0.2,random_state=42)
for train_index,test_index in split.split(housing,housing["income_cat"]):
    strat_train_set=housing.loc[train_index]
    strat_test_set=housing.loc[test_index]
housing["income_cat"].value_counts()/len(housing)
strat_train_set["income_cat"].value_counts()/len(strat_train_set)
strat_test_set["income_cat"].value_counts()/len(strat_test_set)
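# (Added, illustrative:) the three expressions above discard their results when
# run as a script; this prints the proportions side by side to show that the
# stratified test set mirrors the overall income_cat distribution.
compare=pd.DataFrame({
    "overall":housing["income_cat"].value_counts(normalize=True),
    "stratified":strat_test_set["income_cat"].value_counts(normalize=True),
}).sort_index()
print(compare)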
#remove the income_cat attribute
for set_ in (strat_train_set,strat_test_set):  # avoid shadowing the built-in name `set`
    set_.drop(["income_cat"],axis=1,inplace=True)
#visualize the data
import matplotlib.pyplot as plt
housing=strat_train_set.copy()
housing.plot(kind="scatter",x="longitude",y="latitude",alpha=0.4,
             s=housing["population"]/100,label="population",c="median_house_value",
             cmap=plt.get_cmap("jet"),colorbar=True)
plt.legend()
plt.show()
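# (Added, illustrative:) a quick look at linear correlations with the target;
# numeric_only=True skips the ocean_proximity text column (pandas >= 1.5).
corr_matrix=housing.corr(numeric_only=True)
print(corr_matrix["median_house_value"].sort_values(ascending=False))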

# separate the predictors and the labels
housing=strat_train_set.drop("median_house_value",axis=1)
housing_labels=strat_train_set["median_house_value"].copy()
# manual alternative:
# median=housing["total_bedrooms"].median()
# housing["total_bedrooms"].fillna(median,inplace=True)
from sklearn.impute import SimpleImputer
imputer=SimpleImputer(strategy="median")
housing_num=housing.drop("ocean_proximity",axis=1)
imputer.fit(housing_num)
#imputer.statistics_
X=imputer.transform(housing_num)
housing_tr=pd.DataFrame(X,columns=housing_num.columns)
housing_tr.describe()
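# (Added check:) the fitted imputer stored each column's median, identical to
# computing the medians directly.
print(imputer.statistics_)
print(housing_num.median().values)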

#Label Encoder
from sklearn.preprocessing import LabelEncoder
encoder=LabelEncoder()
housing_cat=housing["ocean_proximity"]
housing_cat_encoded=encoder.fit_transform(housing_cat)
housing_cat_encoded
from sklearn.preprocessing import OneHotEncoder
encoder=OneHotEncoder()
housing_cat_1hot=encoder.fit_transform(housing_cat_encoded.reshape(-1,1))
housing_cat_1hot.toarray()
from sklearn.preprocessing import LabelBinarizer
encoder=LabelBinarizer()
housing_cat_1hot=encoder.fit_transform(housing_cat)
housing_cat_1hot
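# (Added note, not in the original:) in scikit-learn 0.20+, OneHotEncoder
# accepts string categories directly, so the LabelEncoder round-trip above is
# unnecessary:
cat_encoder=OneHotEncoder()
housing_cat_1hot_direct=cat_encoder.fit_transform(housing[["ocean_proximity"]])
print(cat_encoder.categories_)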

# full preprocessing pipeline
from sklearn.base import BaseEstimator,TransformerMixin
# transformer that picks the given columns out of a DataFrame as a NumPy array
class DataFrameSelector(BaseEstimator,TransformerMixin):
    def __init__(self,attribute_names):
        self.attribute_names=attribute_names
    def fit(self,X,y=None):
        return self
    def transform(self,X):
        return X[self.attribute_names].values

# wrapper: Pipeline passes (X,y) to fit_transform, but LabelBinarizer only accepts (y),
# so it cannot be used in a Pipeline without this shim
class MyLabelBinarizer(TransformerMixin):
    def __init__(self,*args,**kwargs):
        self.encoder=LabelBinarizer(*args,**kwargs)
    def fit(self,X,y=0):
        self.encoder.fit(X)
        return self
    def transform(self,X,y=0):
        return self.encoder.transform(X)
    

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import FeatureUnion

num_attribs=list(housing_num)
cat_attribs=["ocean_proximity"]

num_pipeline=Pipeline([
    ("selector",DataFrameSelector(num_attribs)),
    ("imputer",SimpleImputer(strategy="median")),
    ("std_scaler",StandardScaler())
])
cat_pipeline=Pipeline([
    ("selector",DataFrameSelector(cat_attribs)),
    ("label_binarizer",MyLabelBinarizer())
])
full_pipeline=FeatureUnion(transformer_list=[
    ("num_pipeline",num_pipeline),
    ("cat_pipeline",cat_pipeline)
])
housing_prepared=full_pipeline.fit_transform(housing)
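# (Alternative sketch, added:) modern scikit-learn replaces the custom
# DataFrameSelector/MyLabelBinarizer + FeatureUnion pattern with
# ColumnTransformer, which handles DataFrames and column names natively.
from sklearn.compose import ColumnTransformer
full_pipeline_ct=ColumnTransformer([
    ("num",Pipeline([("imputer",SimpleImputer(strategy="median")),
                     ("std_scaler",StandardScaler())]),num_attribs),
    ("cat",OneHotEncoder(),cat_attribs),
])
# housing_prepared_ct=full_pipeline_ct.fit_transform(housing)  # same shape as housing_prepared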

# train a model
from sklearn.linear_model import LinearRegression
import joblib  # sklearn.externals.joblib was removed in scikit-learn 0.23
lin_reg=LinearRegression()
lin_reg.fit(housing_prepared,housing_labels)
# try a prediction on a few instances
some_data=housing.iloc[:5]
some_data_labels=housing_labels.iloc[:5]
some_data_prepared=full_pipeline.transform(some_data)
print("Predictions\t",lin_reg.predict(some_data_prepared))
print("Labels:\t",list(some_data_labels))
# save the model
joblib.dump(lin_reg,"my_lin_reg_model.pkl")
'''
# compute the RMSE on the training set
from sklearn.metrics import mean_squared_error
housing_prediction=lin_reg.predict(housing_prepared)
lin_mse=mean_squared_error(housing_prediction,housing_labels)
lin_rmse=np.sqrt(lin_mse)
lin_rmse
'''
def display_scores(scores):
    print("Scores:",scores)
    print("Mean:",scores.mean())
    print("Standard Deviation:",scores.std())

# train a decision-tree model

from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error
tree_reg=DecisionTreeRegressor()
tree_reg.fit(housing_prepared,housing_labels)
joblib.dump(tree_reg,"my_tree_reg_model.pkl") # save
tree_reg_loaded=joblib.load("my_tree_reg_model.pkl") # load the saved model
housing_prediction=tree_reg.predict(housing_prepared)
tree_mse=mean_squared_error(housing_prediction,housing_labels)
tree_rmse=np.sqrt(tree_mse)
print("tree_rmse",tree_rmse)

# evaluate the decision-tree model with cross-validation
from sklearn.model_selection import cross_val_score
scores=cross_val_score(tree_reg,housing_prepared,housing_labels,scoring="neg_mean_squared_error",cv=10)
rmse_scores=np.sqrt(-scores)
display_scores(rmse_scores)
# evaluate the linear-regression model with cross-validation
lin_scores=cross_val_score(lin_reg,housing_prepared,housing_labels,scoring="neg_mean_squared_error",cv=10)
lin_rmse_scores=np.sqrt(-lin_scores)
display_scores(lin_rmse_scores)
# evaluate a random-forest model with cross-validation
from sklearn.ensemble import RandomForestRegressor
forest_reg=RandomForestRegressor()
forest_scores=cross_val_score(forest_reg,housing_prepared,housing_labels,scoring="neg_mean_squared_error",cv=10)
forest_rmse_scores=np.sqrt(-forest_scores)
display_scores(forest_rmse_scores)
# use grid search to find the best random-forest hyperparameters
# (this block is quoted out to save time; run it once so forest_reg_model_best.pkl exists)
'''
from sklearn.model_selection import GridSearchCV
param_grid=[
    {"n_estimators":[3,10,30],"max_features":[2,4,6,8]},
    {'bootstrap':[False],'n_estimators':[3,10],'max_features':[2,3,4]}
]
grid_search=GridSearchCV(forest_reg,param_grid,cv=5,scoring="neg_mean_squared_error")
grid_search.fit(housing_prepared,housing_labels)
grid_search.best_params_
grid_search.best_estimator_
cvres=grid_search.cv_results_
for mean_score,params in zip(cvres["mean_test_score"],cvres["params"]):
    print(mean_score,params)
grid_search.best_estimator_.feature_importances_
joblib.dump(grid_search.best_estimator_,"forest_reg_model_best.pkl") # save the best model
'''
# test the best model on the test set (forest_reg_model_best.pkl is written by the grid-search block above)
final_model=joblib.load("forest_reg_model_best.pkl")
x_test=strat_test_set.drop("median_house_value",axis=1)
y_test=strat_test_set["median_house_value"].copy()
x_test_prepared=full_pipeline.transform(x_test)
x_test_prediction=final_model.predict(x_test_prepared)
final_mse=mean_squared_error(x_test_prediction,y_test)
final_rmse=np.sqrt(final_mse)
print("final_rmse:",final_rmse)

# use grid search to find the best SVR (support-vector regression) hyperparameters
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVR
param_grid=[
    {"kernel":["linear"],"C":[10,1000,2000]},
    {"kernel":["rbf"],"C":[10,1000,2000],"gamma":[0.01,0.1,0.2,0.4]} # gamma must be positive; 0 is rejected by scikit-learn
]
svr=SVR()
# a deliberately tiny grid to keep the search fast
param_grid_rand=[
    {"kernel":["linear"],"C":[*range(10,1000,2000)]}, # range(10,1000,2000) yields just [10]
    {"kernel":["rbf"],"C":[*range(10,1000,2000)],"gamma":[0.1]} # range(0,1,2) would yield gamma=0, which is invalid
]
grid_search=GridSearchCV(svr,param_grid_rand,cv=5,scoring="neg_mean_squared_error")
grid_search.fit(housing_prepared,housing_labels)
cvres=grid_search.cv_results_
for mean_score,params in zip(cvres["mean_test_score"],cvres["params"]):
    print(mean_score,params)
final_model=grid_search.best_estimator_
x_test_prediction=final_model.predict(x_test_prepared)
svr_final_mse=mean_squared_error(x_test_prediction,y_test)
svr_final_rmse=np.sqrt(svr_final_mse)
print("best_estimator:",grid_search.best_estimator_)
print("svr_final_rmse:",svr_final_rmse)
# use randomized search to find the best SVR model
from sklearn.model_selection import RandomizedSearchCV
# param_grid spans only 15 combinations (3 linear + 12 rbf), so cap n_iter accordingly
random_search=RandomizedSearchCV(svr,param_grid,n_iter=15,scoring="neg_mean_squared_error")
random_search.fit(housing_prepared,housing_labels)
cvres=random_search.cv_results_
for mean_score,params in zip(cvres["mean_test_score"],cvres["params"]):
    print(mean_score,params)
final_model=random_search.best_estimator_
x_test_prediction=final_model.predict(x_test_prepared)
svr_final_mse=mean_squared_error(x_test_prediction,y_test)
svr_final_rmse=np.sqrt(svr_final_mse)
print("rand_best_estimator:",final_model)
print("rand_svr_final_rmse:",svr_final_rmse)

